dsp - CMSIS DSP Library from CMSIS 2.0. See http://www.…

Users » simon » Code » dsp

CMSIS DSP Library from CMSIS 2.0. See http://www.onarm.com/cmsis/ for full details

Dependents: K22F_DSP_Matrix_least_square BNO055-ELEC3810 1BNO055 ECE4180Project--Slave2 ... more

src/Cortex-M4-M3/FilteringFunctions/arm_conv_partial_q31.c@0:1014af42efd9, 2011-03-10 (annotated)

Committer:: simon
Date:: Thu Mar 10 15:07:50 2011 +0000
Revision:: 0:1014af42efd9

Who changed what in which revision?

User	Revision	Line number	New contents of line
simon	0:1014af42efd9	1	/* ----------------------------------------------------------------------
simon	0:1014af42efd9	2	* Copyright (C) 2010 ARM Limited. All rights reserved.
simon	0:1014af42efd9	3	*
simon	0:1014af42efd9	4	* $Date: 29. November 2010
simon	0:1014af42efd9	5	* $Revision: V1.0.3
simon	0:1014af42efd9	6	*
simon	0:1014af42efd9	7	* Project: CMSIS DSP Library
simon	0:1014af42efd9	8	* Title: arm_conv_partial_q31.c
simon	0:1014af42efd9	9	*
simon	0:1014af42efd9	10	* Description: Q31 Partial convolution.
simon	0:1014af42efd9	11	*
simon	0:1014af42efd9	12	* Target Processor: Cortex-M4/Cortex-M3
simon	0:1014af42efd9	13	*
simon	0:1014af42efd9	14	* Version 1.0.3 2010/11/29
simon	0:1014af42efd9	15	* Re-organized the CMSIS folders and updated documentation.
simon	0:1014af42efd9	16	*
simon	0:1014af42efd9	17	* Version 1.0.2 2010/11/11
simon	0:1014af42efd9	18	* Documentation updated.
simon	0:1014af42efd9	19	*
simon	0:1014af42efd9	20	* Version 1.0.1 2010/10/05
simon	0:1014af42efd9	21	* Production release and review comments incorporated.
simon	0:1014af42efd9	22	*
simon	0:1014af42efd9	23	* Version 1.0.0 2010/09/20
simon	0:1014af42efd9	24	* Production release and review comments incorporated
simon	0:1014af42efd9	25	*
simon	0:1014af42efd9	26	* Version 0.0.7 2010/06/10
simon	0:1014af42efd9	27	* Misra-C changes done
simon	0:1014af42efd9	28	*
simon	0:1014af42efd9	29	* -------------------------------------------------------------------- */
simon	0:1014af42efd9	30
simon	0:1014af42efd9	31	#include "arm_math.h"
simon	0:1014af42efd9	32
simon	0:1014af42efd9	33	/**
simon	0:1014af42efd9	34	* @ingroup groupFilters
simon	0:1014af42efd9	35	*/
simon	0:1014af42efd9	36
simon	0:1014af42efd9	37	/**
simon	0:1014af42efd9	38	* @addtogroup PartialConv
simon	0:1014af42efd9	39	* @{
simon	0:1014af42efd9	40	*/
simon	0:1014af42efd9	41
simon	0:1014af42efd9	42	/**
simon	0:1014af42efd9	43	* @brief Partial convolution of Q31 sequences.
simon	0:1014af42efd9	44	* @param[in] *pSrcA points to the first input sequence.
simon	0:1014af42efd9	45	* @param[in] srcALen length of the first input sequence.
simon	0:1014af42efd9	46	* @param[in] *pSrcB points to the second input sequence.
simon	0:1014af42efd9	47	* @param[in] srcBLen length of the second input sequence.
simon	0:1014af42efd9	48	* @param[out] *pDst points to the location where the output result is written.
simon	0:1014af42efd9	49	* @param[in] firstIndex is the first output sample to start with.
simon	0:1014af42efd9	50	* @param[in] numPoints is the number of output points to be computed.
simon	0:1014af42efd9	51	* @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].
simon	0:1014af42efd9	52	*
simon	0:1014af42efd9	53	* See <code>arm_conv_partial_fast_q31()</code> for a faster but less precise implementation of this function.
simon	0:1014af42efd9	54	*/
simon	0:1014af42efd9	55
simon	0:1014af42efd9	56	arm_status arm_conv_partial_q31(
simon	0:1014af42efd9	57	q31_t * pSrcA,
simon	0:1014af42efd9	58	uint32_t srcALen,
simon	0:1014af42efd9	59	q31_t * pSrcB,
simon	0:1014af42efd9	60	uint32_t srcBLen,
simon	0:1014af42efd9	61	q31_t * pDst,
simon	0:1014af42efd9	62	uint32_t firstIndex,
simon	0:1014af42efd9	63	uint32_t numPoints)
simon	0:1014af42efd9	64	{
simon	0:1014af42efd9	65	q31_t pIn1; / inputA pointer */
simon	0:1014af42efd9	66	q31_t pIn2; / inputB pointer */
simon	0:1014af42efd9	67	q31_t pOut = pDst; / output pointer */
simon	0:1014af42efd9	68	q31_t px; / Intermediate inputA pointer */
simon	0:1014af42efd9	69	q31_t py; / Intermediate inputB pointer */
simon	0:1014af42efd9	70	q31_t pSrc1, pSrc2; /* Intermediate pointers */
simon	0:1014af42efd9	71	q63_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
simon	0:1014af42efd9	72	q31_t x0, x1, x2, x3, c0;
simon	0:1014af42efd9	73	uint32_t j, k, count, check, blkCnt;
simon	0:1014af42efd9	74	int32_t blockSize1, blockSize2, blockSize3; /* loop counter */
simon	0:1014af42efd9	75	arm_status status; /* status of Partial convolution */
simon	0:1014af42efd9	76
simon	0:1014af42efd9	77
simon	0:1014af42efd9	78	/* Check for range of output samples to be calculated */
simon	0:1014af42efd9	79	if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
simon	0:1014af42efd9	80	{
simon	0:1014af42efd9	81	/* Set status as ARM_MATH_ARGUMENT_ERROR */
simon	0:1014af42efd9	82	status = ARM_MATH_ARGUMENT_ERROR;
simon	0:1014af42efd9	83	}
simon	0:1014af42efd9	84	else
simon	0:1014af42efd9	85	{
simon	0:1014af42efd9	86
simon	0:1014af42efd9	87	/* The algorithm implementation is based on the lengths of the inputs. */
simon	0:1014af42efd9	88	/* srcB is always made to slide across srcA. */
simon	0:1014af42efd9	89	/* So srcBLen is always considered as shorter or equal to srcALen */
simon	0:1014af42efd9	90	if(srcALen >= srcBLen)
simon	0:1014af42efd9	91	{
simon	0:1014af42efd9	92	/* Initialization of inputA pointer */
simon	0:1014af42efd9	93	pIn1 = pSrcA;
simon	0:1014af42efd9	94
simon	0:1014af42efd9	95	/* Initialization of inputB pointer */
simon	0:1014af42efd9	96	pIn2 = pSrcB;
simon	0:1014af42efd9	97	}
simon	0:1014af42efd9	98	else
simon	0:1014af42efd9	99	{
simon	0:1014af42efd9	100	/* Initialization of inputA pointer */
simon	0:1014af42efd9	101	pIn1 = pSrcB;
simon	0:1014af42efd9	102
simon	0:1014af42efd9	103	/* Initialization of inputB pointer */
simon	0:1014af42efd9	104	pIn2 = pSrcA;
simon	0:1014af42efd9	105
simon	0:1014af42efd9	106	/* srcBLen is always considered as shorter or equal to srcALen */
simon	0:1014af42efd9	107	j = srcBLen;
simon	0:1014af42efd9	108	srcBLen = srcALen;
simon	0:1014af42efd9	109	srcALen = j;
simon	0:1014af42efd9	110	}
simon	0:1014af42efd9	111
simon	0:1014af42efd9	112	/* Conditions to check which loopCounter holds
simon	0:1014af42efd9	113	* the first and last indices of the output samples to be calculated. */
simon	0:1014af42efd9	114	check = firstIndex + numPoints;
simon	0:1014af42efd9	115	blockSize3 = ((int32_t) check - (int32_t) srcALen);
simon	0:1014af42efd9	116	blockSize3 = (blockSize3 > 0) ? blockSize3 : 0;
simon	0:1014af42efd9	117	blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex);
simon	0:1014af42efd9	118	blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 :
simon	0:1014af42efd9	119	(int32_t) numPoints) : 0;
simon	0:1014af42efd9	120	blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) +
simon	0:1014af42efd9	121	(int32_t) firstIndex);
simon	0:1014af42efd9	122	blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
simon	0:1014af42efd9	123
simon	0:1014af42efd9	124	/* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
simon	0:1014af42efd9	125	/* The function is internally
simon	0:1014af42efd9	126	* divided into three stages according to the number of multiplications that has to be
simon	0:1014af42efd9	127	* taken place between inputA samples and inputB samples. In the first stage of the
simon	0:1014af42efd9	128	* algorithm, the multiplications increase by one for every iteration.
simon	0:1014af42efd9	129	* In the second stage of the algorithm, srcBLen number of multiplications are done.
simon	0:1014af42efd9	130	* In the third stage of the algorithm, the multiplications decrease by one
simon	0:1014af42efd9	131	* for every iteration. */
simon	0:1014af42efd9	132
simon	0:1014af42efd9	133	/* Set the output pointer to point to the firstIndex
simon	0:1014af42efd9	134	* of the output sample to be calculated. */
simon	0:1014af42efd9	135	pOut = pDst + firstIndex;
simon	0:1014af42efd9	136
simon	0:1014af42efd9	137	/* --------------------------
simon	0:1014af42efd9	138	* Initializations of stage1
simon	0:1014af42efd9	139	* -------------------------*/
simon	0:1014af42efd9	140
simon	0:1014af42efd9	141	/* sum = x[0] * y[0]
simon	0:1014af42efd9	142	* sum = x[0] * y[1] + x[1] * y[0]
simon	0:1014af42efd9	143	* ....
simon	0:1014af42efd9	144	* sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
simon	0:1014af42efd9	145	*/
simon	0:1014af42efd9	146
simon	0:1014af42efd9	147	/* In this stage the MAC operations are increased by 1 for every iteration.
simon	0:1014af42efd9	148	The count variable holds the number of MAC operations performed.
simon	0:1014af42efd9	149	Since the partial convolution starts from firstIndex
simon	0:1014af42efd9	150	Number of Macs to be performed is firstIndex + 1 */
simon	0:1014af42efd9	151	count = 1u + firstIndex;
simon	0:1014af42efd9	152
simon	0:1014af42efd9	153	/* Working pointer of inputA */
simon	0:1014af42efd9	154	px = pIn1;
simon	0:1014af42efd9	155
simon	0:1014af42efd9	156	/* Working pointer of inputB */
simon	0:1014af42efd9	157	pSrc2 = pIn2 + firstIndex;
simon	0:1014af42efd9	158	py = pSrc2;
simon	0:1014af42efd9	159
simon	0:1014af42efd9	160	/* ------------------------
simon	0:1014af42efd9	161	* Stage1 process
simon	0:1014af42efd9	162	* ----------------------*/
simon	0:1014af42efd9	163
simon	0:1014af42efd9	164	/* The first loop starts here */
simon	0:1014af42efd9	165	while(blockSize1 > 0)
simon	0:1014af42efd9	166	{
simon	0:1014af42efd9	167	/* Accumulator is made zero for every iteration */
simon	0:1014af42efd9	168	sum = 0;
simon	0:1014af42efd9	169
simon	0:1014af42efd9	170	/* Apply loop unrolling and compute 4 MACs simultaneously. */
simon	0:1014af42efd9	171	k = count >> 2u;
simon	0:1014af42efd9	172
simon	0:1014af42efd9	173	/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
simon	0:1014af42efd9	174	** a second loop below computes MACs for the remaining 1 to 3 samples. */
simon	0:1014af42efd9	175	while(k > 0u)
simon	0:1014af42efd9	176	{
simon	0:1014af42efd9	177	/* x[0] * y[srcBLen - 1] */
simon	0:1014af42efd9	178	sum += (q63_t) * px++ * (*py--);
simon	0:1014af42efd9	179	/* x[1] * y[srcBLen - 2] */
simon	0:1014af42efd9	180	sum += (q63_t) * px++ * (*py--);
simon	0:1014af42efd9	181	/* x[2] * y[srcBLen - 3] */
simon	0:1014af42efd9	182	sum += (q63_t) * px++ * (*py--);
simon	0:1014af42efd9	183	/* x[3] * y[srcBLen - 4] */
simon	0:1014af42efd9	184	sum += (q63_t) * px++ * (*py--);
simon	0:1014af42efd9	185
simon	0:1014af42efd9	186	/* Decrement the loop counter */
simon	0:1014af42efd9	187	k--;
simon	0:1014af42efd9	188	}
simon	0:1014af42efd9	189
simon	0:1014af42efd9	190	/* If the count is not a multiple of 4, compute any remaining MACs here.
simon	0:1014af42efd9	191	** No loop unrolling is used. */
simon	0:1014af42efd9	192	k = count % 0x4u;
simon	0:1014af42efd9	193
simon	0:1014af42efd9	194	while(k > 0u)
simon	0:1014af42efd9	195	{
simon	0:1014af42efd9	196	/* Perform the multiply-accumulate */
simon	0:1014af42efd9	197	sum += (q63_t) * px++ * (*py--);
simon	0:1014af42efd9	198
simon	0:1014af42efd9	199	/* Decrement the loop counter */
simon	0:1014af42efd9	200	k--;
simon	0:1014af42efd9	201	}
simon	0:1014af42efd9	202
simon	0:1014af42efd9	203	/* Store the result in the accumulator in the destination buffer. */
simon	0:1014af42efd9	204	*pOut++ = (q31_t) (sum >> 31);
simon	0:1014af42efd9	205
simon	0:1014af42efd9	206	/* Update the inputA and inputB pointers for next MAC calculation */
simon	0:1014af42efd9	207	py = ++pSrc2;
simon	0:1014af42efd9	208	px = pIn1;
simon	0:1014af42efd9	209
simon	0:1014af42efd9	210	/* Increment the MAC count */
simon	0:1014af42efd9	211	count++;
simon	0:1014af42efd9	212
simon	0:1014af42efd9	213	/* Decrement the loop counter */
simon	0:1014af42efd9	214	blockSize1--;
simon	0:1014af42efd9	215	}
simon	0:1014af42efd9	216
simon	0:1014af42efd9	217	/* --------------------------
simon	0:1014af42efd9	218	* Initializations of stage2
simon	0:1014af42efd9	219	* ------------------------*/
simon	0:1014af42efd9	220
simon	0:1014af42efd9	221	/* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
simon	0:1014af42efd9	222	* sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
simon	0:1014af42efd9	223	* ....
simon	0:1014af42efd9	224	* sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
simon	0:1014af42efd9	225	*/
simon	0:1014af42efd9	226
simon	0:1014af42efd9	227	/* Working pointer of inputA */
simon	0:1014af42efd9	228	px = pIn1;
simon	0:1014af42efd9	229
simon	0:1014af42efd9	230	/* Working pointer of inputB */
simon	0:1014af42efd9	231	pSrc2 = pIn2 + (srcBLen - 1u);
simon	0:1014af42efd9	232	py = pSrc2;
simon	0:1014af42efd9	233
simon	0:1014af42efd9	234	/* count is index by which the pointer pIn1 to be incremented */
simon	0:1014af42efd9	235	count = 1u;
simon	0:1014af42efd9	236
simon	0:1014af42efd9	237	/* -------------------
simon	0:1014af42efd9	238	* Stage2 process
simon	0:1014af42efd9	239	* ------------------*/
simon	0:1014af42efd9	240
simon	0:1014af42efd9	241	/* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
simon	0:1014af42efd9	242	* So, to loop unroll over blockSize2,
simon	0:1014af42efd9	243	* srcBLen should be greater than or equal to 4 */
simon	0:1014af42efd9	244	if(srcBLen >= 4u)
simon	0:1014af42efd9	245	{
simon	0:1014af42efd9	246	/* Loop unroll over blockSize2 */
simon	0:1014af42efd9	247	blkCnt = ((uint32_t) blockSize2 >> 2u);
simon	0:1014af42efd9	248
simon	0:1014af42efd9	249	while(blkCnt > 0u)
simon	0:1014af42efd9	250	{
simon	0:1014af42efd9	251	/* Set all accumulators to zero */
simon	0:1014af42efd9	252	acc0 = 0;
simon	0:1014af42efd9	253	acc1 = 0;
simon	0:1014af42efd9	254	acc2 = 0;
simon	0:1014af42efd9	255	acc3 = 0;
simon	0:1014af42efd9	256
simon	0:1014af42efd9	257	/* read x[0], x[1], x[2] samples */
simon	0:1014af42efd9	258	x0 = *(px++);
simon	0:1014af42efd9	259	x1 = *(px++);
simon	0:1014af42efd9	260	x2 = *(px++);
simon	0:1014af42efd9	261
simon	0:1014af42efd9	262	/* Apply loop unrolling and compute 4 MACs simultaneously. */
simon	0:1014af42efd9	263	k = srcBLen >> 2u;
simon	0:1014af42efd9	264
simon	0:1014af42efd9	265	/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
simon	0:1014af42efd9	266	** a second loop below computes MACs for the remaining 1 to 3 samples. */
simon	0:1014af42efd9	267	do
simon	0:1014af42efd9	268	{
simon	0:1014af42efd9	269	/* Read y[srcBLen - 1] sample */
simon	0:1014af42efd9	270	c0 = *(py--);
simon	0:1014af42efd9	271
simon	0:1014af42efd9	272	/* Read x[3] sample */
simon	0:1014af42efd9	273	x3 = *(px++);
simon	0:1014af42efd9	274
simon	0:1014af42efd9	275	/* Perform the multiply-accumulates */
simon	0:1014af42efd9	276	/* acc0 += x[0] * y[srcBLen - 1] */
simon	0:1014af42efd9	277	acc0 += (q63_t) x0 *c0;
simon	0:1014af42efd9	278	/* acc1 += x[1] * y[srcBLen - 1] */
simon	0:1014af42efd9	279	acc1 += (q63_t) x1 *c0;
simon	0:1014af42efd9	280	/* acc2 += x[2] * y[srcBLen - 1] */
simon	0:1014af42efd9	281	acc2 += (q63_t) x2 *c0;
simon	0:1014af42efd9	282	/* acc3 += x[3] * y[srcBLen - 1] */
simon	0:1014af42efd9	283	acc3 += (q63_t) x3 *c0;
simon	0:1014af42efd9	284
simon	0:1014af42efd9	285	/* Read y[srcBLen - 2] sample */
simon	0:1014af42efd9	286	c0 = *(py--);
simon	0:1014af42efd9	287
simon	0:1014af42efd9	288	/* Read x[4] sample */
simon	0:1014af42efd9	289	x0 = *(px++);
simon	0:1014af42efd9	290
simon	0:1014af42efd9	291	/* Perform the multiply-accumulate */
simon	0:1014af42efd9	292	/* acc0 += x[1] * y[srcBLen - 2] */
simon	0:1014af42efd9	293	acc0 += (q63_t) x1 *c0;
simon	0:1014af42efd9	294	/* acc1 += x[2] * y[srcBLen - 2] */
simon	0:1014af42efd9	295	acc1 += (q63_t) x2 *c0;
simon	0:1014af42efd9	296	/* acc2 += x[3] * y[srcBLen - 2] */
simon	0:1014af42efd9	297	acc2 += (q63_t) x3 *c0;
simon	0:1014af42efd9	298	/* acc3 += x[4] * y[srcBLen - 2] */
simon	0:1014af42efd9	299	acc3 += (q63_t) x0 *c0;
simon	0:1014af42efd9	300
simon	0:1014af42efd9	301	/* Read y[srcBLen - 3] sample */
simon	0:1014af42efd9	302	c0 = *(py--);
simon	0:1014af42efd9	303
simon	0:1014af42efd9	304	/* Read x[5] sample */
simon	0:1014af42efd9	305	x1 = *(px++);
simon	0:1014af42efd9	306
simon	0:1014af42efd9	307	/* Perform the multiply-accumulates */
simon	0:1014af42efd9	308	/* acc0 += x[2] * y[srcBLen - 3] */
simon	0:1014af42efd9	309	acc0 += (q63_t) x2 *c0;
simon	0:1014af42efd9	310	/* acc1 += x[3] * y[srcBLen - 2] */
simon	0:1014af42efd9	311	acc1 += (q63_t) x3 *c0;
simon	0:1014af42efd9	312	/* acc2 += x[4] * y[srcBLen - 2] */
simon	0:1014af42efd9	313	acc2 += (q63_t) x0 *c0;
simon	0:1014af42efd9	314	/* acc3 += x[5] * y[srcBLen - 2] */
simon	0:1014af42efd9	315	acc3 += (q63_t) x1 *c0;
simon	0:1014af42efd9	316
simon	0:1014af42efd9	317	/* Read y[srcBLen - 4] sample */
simon	0:1014af42efd9	318	c0 = *(py--);
simon	0:1014af42efd9	319
simon	0:1014af42efd9	320	/* Read x[6] sample */
simon	0:1014af42efd9	321	x2 = *(px++);
simon	0:1014af42efd9	322
simon	0:1014af42efd9	323	/* Perform the multiply-accumulates */
simon	0:1014af42efd9	324	/* acc0 += x[3] * y[srcBLen - 4] */
simon	0:1014af42efd9	325	acc0 += (q63_t) x3 *c0;
simon	0:1014af42efd9	326	/* acc1 += x[4] * y[srcBLen - 4] */
simon	0:1014af42efd9	327	acc1 += (q63_t) x0 *c0;
simon	0:1014af42efd9	328	/* acc2 += x[5] * y[srcBLen - 4] */
simon	0:1014af42efd9	329	acc2 += (q63_t) x1 *c0;
simon	0:1014af42efd9	330	/* acc3 += x[6] * y[srcBLen - 4] */
simon	0:1014af42efd9	331	acc3 += (q63_t) x2 *c0;
simon	0:1014af42efd9	332
simon	0:1014af42efd9	333	} while(--k);
simon	0:1014af42efd9	334
simon	0:1014af42efd9	335	/* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
simon	0:1014af42efd9	336	** No loop unrolling is used. */
simon	0:1014af42efd9	337	k = srcBLen % 0x4u;
simon	0:1014af42efd9	338
simon	0:1014af42efd9	339	while(k > 0u)
simon	0:1014af42efd9	340	{
simon	0:1014af42efd9	341	/* Read y[srcBLen - 5] sample */
simon	0:1014af42efd9	342	c0 = *(py--);
simon	0:1014af42efd9	343
simon	0:1014af42efd9	344	/* Read x[7] sample */
simon	0:1014af42efd9	345	x3 = *(px++);
simon	0:1014af42efd9	346
simon	0:1014af42efd9	347	/* Perform the multiply-accumulates */
simon	0:1014af42efd9	348	/* acc0 += x[4] * y[srcBLen - 5] */
simon	0:1014af42efd9	349	acc0 += (q63_t) x0 *c0;
simon	0:1014af42efd9	350	/* acc1 += x[5] * y[srcBLen - 5] */
simon	0:1014af42efd9	351	acc1 += (q63_t) x1 *c0;
simon	0:1014af42efd9	352	/* acc2 += x[6] * y[srcBLen - 5] */
simon	0:1014af42efd9	353	acc2 += (q63_t) x2 *c0;
simon	0:1014af42efd9	354	/* acc3 += x[7] * y[srcBLen - 5] */
simon	0:1014af42efd9	355	acc3 += (q63_t) x3 *c0;
simon	0:1014af42efd9	356
simon	0:1014af42efd9	357	/* Reuse the present samples for the next MAC */
simon	0:1014af42efd9	358	x0 = x1;
simon	0:1014af42efd9	359	x1 = x2;
simon	0:1014af42efd9	360	x2 = x3;
simon	0:1014af42efd9	361
simon	0:1014af42efd9	362	/* Decrement the loop counter */
simon	0:1014af42efd9	363	k--;
simon	0:1014af42efd9	364	}
simon	0:1014af42efd9	365
simon	0:1014af42efd9	366	/* Store the result in the accumulator in the destination buffer. */
simon	0:1014af42efd9	367	*pOut++ = (q31_t) (acc0 >> 31);
simon	0:1014af42efd9	368	*pOut++ = (q31_t) (acc1 >> 31);
simon	0:1014af42efd9	369	*pOut++ = (q31_t) (acc2 >> 31);
simon	0:1014af42efd9	370	*pOut++ = (q31_t) (acc3 >> 31);
simon	0:1014af42efd9	371
simon	0:1014af42efd9	372	/* Update the inputA and inputB pointers for next MAC calculation */
simon	0:1014af42efd9	373	px = pIn1 + (count * 4u);
simon	0:1014af42efd9	374	py = pSrc2;
simon	0:1014af42efd9	375
simon	0:1014af42efd9	376	/* Increment the pointer pIn1 index, count by 1 */
simon	0:1014af42efd9	377	count++;
simon	0:1014af42efd9	378
simon	0:1014af42efd9	379	/* Decrement the loop counter */
simon	0:1014af42efd9	380	blkCnt--;
simon	0:1014af42efd9	381	}
simon	0:1014af42efd9	382
simon	0:1014af42efd9	383	/* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
simon	0:1014af42efd9	384	** No loop unrolling is used. */
simon	0:1014af42efd9	385	blkCnt = (uint32_t) blockSize2 % 0x4u;
simon	0:1014af42efd9	386
simon	0:1014af42efd9	387	while(blkCnt > 0u)
simon	0:1014af42efd9	388	{
simon	0:1014af42efd9	389	/* Accumulator is made zero for every iteration */
simon	0:1014af42efd9	390	sum = 0;
simon	0:1014af42efd9	391
simon	0:1014af42efd9	392	/* Apply loop unrolling and compute 4 MACs simultaneously. */
simon	0:1014af42efd9	393	k = srcBLen >> 2u;
simon	0:1014af42efd9	394
simon	0:1014af42efd9	395	/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
simon	0:1014af42efd9	396	** a second loop below computes MACs for the remaining 1 to 3 samples. */
simon	0:1014af42efd9	397	while(k > 0u)
simon	0:1014af42efd9	398	{
simon	0:1014af42efd9	399	/* Perform the multiply-accumulates */
simon	0:1014af42efd9	400	sum += (q63_t) * px++ * (*py--);
simon	0:1014af42efd9	401	sum += (q63_t) * px++ * (*py--);
simon	0:1014af42efd9	402	sum += (q63_t) * px++ * (*py--);
simon	0:1014af42efd9	403	sum += (q63_t) * px++ * (*py--);
simon	0:1014af42efd9	404
simon	0:1014af42efd9	405	/* Decrement the loop counter */
simon	0:1014af42efd9	406	k--;
simon	0:1014af42efd9	407	}
simon	0:1014af42efd9	408
simon	0:1014af42efd9	409	/* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
simon	0:1014af42efd9	410	** No loop unrolling is used. */
simon	0:1014af42efd9	411	k = srcBLen % 0x4u;
simon	0:1014af42efd9	412
simon	0:1014af42efd9	413	while(k > 0u)
simon	0:1014af42efd9	414	{
simon	0:1014af42efd9	415	/* Perform the multiply-accumulate */
simon	0:1014af42efd9	416	sum += (q63_t) * px++ * (*py--);
simon	0:1014af42efd9	417
simon	0:1014af42efd9	418	/* Decrement the loop counter */
simon	0:1014af42efd9	419	k--;
simon	0:1014af42efd9	420	}
simon	0:1014af42efd9	421
simon	0:1014af42efd9	422	/* Store the result in the accumulator in the destination buffer. */
simon	0:1014af42efd9	423	*pOut++ = (q31_t) (sum >> 31);
simon	0:1014af42efd9	424
simon	0:1014af42efd9	425	/* Update the inputA and inputB pointers for next MAC calculation */
simon	0:1014af42efd9	426	px = pIn1 + count;
simon	0:1014af42efd9	427	py = pSrc2;
simon	0:1014af42efd9	428
simon	0:1014af42efd9	429	/* Increment the MAC count */
simon	0:1014af42efd9	430	count++;
simon	0:1014af42efd9	431
simon	0:1014af42efd9	432	/* Decrement the loop counter */
simon	0:1014af42efd9	433	blkCnt--;
simon	0:1014af42efd9	434	}
simon	0:1014af42efd9	435	}
simon	0:1014af42efd9	436	else
simon	0:1014af42efd9	437	{
simon	0:1014af42efd9	438	/* If the srcBLen is not a multiple of 4,
simon	0:1014af42efd9	439	* the blockSize2 loop cannot be unrolled by 4 */
simon	0:1014af42efd9	440	blkCnt = (uint32_t) blockSize2;
simon	0:1014af42efd9	441
simon	0:1014af42efd9	442	while(blkCnt > 0u)
simon	0:1014af42efd9	443	{
simon	0:1014af42efd9	444	/* Accumulator is made zero for every iteration */
simon	0:1014af42efd9	445	sum = 0;
simon	0:1014af42efd9	446
simon	0:1014af42efd9	447	/* srcBLen number of MACS should be performed */
simon	0:1014af42efd9	448	k = srcBLen;
simon	0:1014af42efd9	449
simon	0:1014af42efd9	450	while(k > 0u)
simon	0:1014af42efd9	451	{
simon	0:1014af42efd9	452	/* Perform the multiply-accumulate */
simon	0:1014af42efd9	453	sum += (q63_t) * px++ * (*py--);
simon	0:1014af42efd9	454
simon	0:1014af42efd9	455	/* Decrement the loop counter */
simon	0:1014af42efd9	456	k--;
simon	0:1014af42efd9	457	}
simon	0:1014af42efd9	458
simon	0:1014af42efd9	459	/* Store the result in the accumulator in the destination buffer. */
simon	0:1014af42efd9	460	*pOut++ = (q31_t) (sum >> 31);
simon	0:1014af42efd9	461
simon	0:1014af42efd9	462	/* Update the inputA and inputB pointers for next MAC calculation */
simon	0:1014af42efd9	463	px = pIn1 + count;
simon	0:1014af42efd9	464	py = pSrc2;
simon	0:1014af42efd9	465
simon	0:1014af42efd9	466	/* Increment the MAC count */
simon	0:1014af42efd9	467	count++;
simon	0:1014af42efd9	468
simon	0:1014af42efd9	469	/* Decrement the loop counter */
simon	0:1014af42efd9	470	blkCnt--;
simon	0:1014af42efd9	471	}
simon	0:1014af42efd9	472	}
simon	0:1014af42efd9	473
simon	0:1014af42efd9	474
simon	0:1014af42efd9	475	/* --------------------------
simon	0:1014af42efd9	476	* Initializations of stage3
simon	0:1014af42efd9	477	* -------------------------*/
simon	0:1014af42efd9	478
simon	0:1014af42efd9	479	/* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
simon	0:1014af42efd9	480	* sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
simon	0:1014af42efd9	481	* ....
simon	0:1014af42efd9	482	* sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
simon	0:1014af42efd9	483	* sum += x[srcALen-1] * y[srcBLen-1]
simon	0:1014af42efd9	484	*/
simon	0:1014af42efd9	485
simon	0:1014af42efd9	486	/* In this stage the MAC operations are decreased by 1 for every iteration.
simon	0:1014af42efd9	487	The blockSize3 variable holds the number of MAC operations performed */
simon	0:1014af42efd9	488	count = srcBLen - 1u;
simon	0:1014af42efd9	489
simon	0:1014af42efd9	490	/* Working pointer of inputA */
simon	0:1014af42efd9	491	pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
simon	0:1014af42efd9	492	px = pSrc1;
simon	0:1014af42efd9	493
simon	0:1014af42efd9	494	/* Working pointer of inputB */
simon	0:1014af42efd9	495	pSrc2 = pIn2 + (srcBLen - 1u);
simon	0:1014af42efd9	496	py = pSrc2;
simon	0:1014af42efd9	497
simon	0:1014af42efd9	498	/* -------------------
simon	0:1014af42efd9	499	* Stage3 process
simon	0:1014af42efd9	500	* ------------------*/
simon	0:1014af42efd9	501
simon	0:1014af42efd9	502	while(blockSize3 > 0)
simon	0:1014af42efd9	503	{
simon	0:1014af42efd9	504	/* Accumulator is made zero for every iteration */
simon	0:1014af42efd9	505	sum = 0;
simon	0:1014af42efd9	506
simon	0:1014af42efd9	507	/* Apply loop unrolling and compute 4 MACs simultaneously. */
simon	0:1014af42efd9	508	k = count >> 2u;
simon	0:1014af42efd9	509
simon	0:1014af42efd9	510	/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
simon	0:1014af42efd9	511	** a second loop below computes MACs for the remaining 1 to 3 samples. */
simon	0:1014af42efd9	512	while(k > 0u)
simon	0:1014af42efd9	513	{
simon	0:1014af42efd9	514	sum += (q63_t) * px++ * (*py--);
simon	0:1014af42efd9	515	sum += (q63_t) * px++ * (*py--);
simon	0:1014af42efd9	516	sum += (q63_t) * px++ * (*py--);
simon	0:1014af42efd9	517	sum += (q63_t) * px++ * (*py--);
simon	0:1014af42efd9	518
simon	0:1014af42efd9	519	/* Decrement the loop counter */
simon	0:1014af42efd9	520	k--;
simon	0:1014af42efd9	521	}
simon	0:1014af42efd9	522
simon	0:1014af42efd9	523	/* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
simon	0:1014af42efd9	524	** No loop unrolling is used. */
simon	0:1014af42efd9	525	k = count % 0x4u;
simon	0:1014af42efd9	526
simon	0:1014af42efd9	527	while(k > 0u)
simon	0:1014af42efd9	528	{
simon	0:1014af42efd9	529	/* Perform the multiply-accumulate */
simon	0:1014af42efd9	530	sum += (q63_t) * px++ * (*py--);
simon	0:1014af42efd9	531
simon	0:1014af42efd9	532	/* Decrement the loop counter */
simon	0:1014af42efd9	533	k--;
simon	0:1014af42efd9	534	}
simon	0:1014af42efd9	535
simon	0:1014af42efd9	536	/* Store the result in the accumulator in the destination buffer. */
simon	0:1014af42efd9	537	*pOut++ = (q31_t) (sum >> 31);
simon	0:1014af42efd9	538
simon	0:1014af42efd9	539	/* Update the inputA and inputB pointers for next MAC calculation */
simon	0:1014af42efd9	540	px = ++pSrc1;
simon	0:1014af42efd9	541	py = pSrc2;
simon	0:1014af42efd9	542
simon	0:1014af42efd9	543	/* Decrement the MAC count */
simon	0:1014af42efd9	544	count--;
simon	0:1014af42efd9	545
simon	0:1014af42efd9	546	/* Decrement the loop counter */
simon	0:1014af42efd9	547	blockSize3--;
simon	0:1014af42efd9	548
simon	0:1014af42efd9	549	}
simon	0:1014af42efd9	550
simon	0:1014af42efd9	551	/* set status as ARM_MATH_SUCCESS */
simon	0:1014af42efd9	552	status = ARM_MATH_SUCCESS;
simon	0:1014af42efd9	553	}
simon	0:1014af42efd9	554
simon	0:1014af42efd9	555	/* Return to application */
simon	0:1014af42efd9	556	return (status);
simon	0:1014af42efd9	557
simon	0:1014af42efd9	558	}
simon	0:1014af42efd9	559
simon	0:1014af42efd9	560	/**
simon	0:1014af42efd9	561	* @} end of PartialConv group
simon	0:1014af42efd9	562	*/

Repository toolbox

Export to desktop IDE

Repository details

Type:	Library
Created:	10 Mar 2011
Imports:	907
Forks:	1
Commits:	3
Dependents:	5
Dependencies:	0
Followers:	35

src/Cortex-M4-M3/FilteringFunctions/arm_conv_partial_q31.c@0:1014af42efd9, 2011-03-10 (annotated)

Who changed what in which revision?

Repository toolbox

Repository details

Important Information for this Arm website

Access Warning