dsp - CMSIS DSP Library from CMSIS 2.0. See http://www.…

Users » simon » Code » dsp

CMSIS DSP Library from CMSIS 2.0. See http://www.onarm.com/cmsis/ for full details

Dependents: K22F_DSP_Matrix_least_square BNO055-ELEC3810 1BNO055 ECE4180Project--Slave2 ... more

src/Cortex-M4-M3/FilteringFunctions/arm_correlate_q15.c@0:1014af42efd9, 2011-03-10 (annotated)

Committer:: simon
Date:: Thu Mar 10 15:07:50 2011 +0000
Revision:: 0:1014af42efd9

Who changed what in which revision?

User	Revision	Line number	New contents of line
simon	0:1014af42efd9	1	/* ----------------------------------------------------------------------
simon	0:1014af42efd9	2	* Copyright (C) 2010 ARM Limited. All rights reserved.
simon	0:1014af42efd9	3	*
simon	0:1014af42efd9	4	* $Date: 29. November 2010
simon	0:1014af42efd9	5	* $Revision: V1.0.3
simon	0:1014af42efd9	6	*
simon	0:1014af42efd9	7	* Project: CMSIS DSP Library
simon	0:1014af42efd9	8	* Title: arm_correlate_q15.c
simon	0:1014af42efd9	9	*
simon	0:1014af42efd9	10	* Description: Q15 Correlation.
simon	0:1014af42efd9	11	*
simon	0:1014af42efd9	12	* Target Processor: Cortex-M4/Cortex-M3
simon	0:1014af42efd9	13	*
simon	0:1014af42efd9	14	* Version 1.0.3 2010/11/29
simon	0:1014af42efd9	15	* Re-organized the CMSIS folders and updated documentation.
simon	0:1014af42efd9	16	*
simon	0:1014af42efd9	17	* Version 1.0.2 2010/11/11
simon	0:1014af42efd9	18	* Documentation updated.
simon	0:1014af42efd9	19	*
simon	0:1014af42efd9	20	* Version 1.0.1 2010/10/05
simon	0:1014af42efd9	21	* Production release and review comments incorporated.
simon	0:1014af42efd9	22	*
simon	0:1014af42efd9	23	* Version 1.0.0 2010/09/20
simon	0:1014af42efd9	24	* Production release and review comments incorporated
simon	0:1014af42efd9	25	*
simon	0:1014af42efd9	26	* Version 0.0.7 2010/06/10
simon	0:1014af42efd9	27	* Misra-C changes done
simon	0:1014af42efd9	28	*
simon	0:1014af42efd9	29	* -------------------------------------------------------------------- */
simon	0:1014af42efd9	30
simon	0:1014af42efd9	31	#include "arm_math.h"
simon	0:1014af42efd9	32
simon	0:1014af42efd9	33	/**
simon	0:1014af42efd9	34	* @ingroup groupFilters
simon	0:1014af42efd9	35	*/
simon	0:1014af42efd9	36
simon	0:1014af42efd9	37	/**
simon	0:1014af42efd9	38	* @addtogroup Corr
simon	0:1014af42efd9	39	* @{
simon	0:1014af42efd9	40	*/
simon	0:1014af42efd9	41
simon	0:1014af42efd9	42	/**
simon	0:1014af42efd9	43	* @brief Correlation of Q15 sequences
simon	0:1014af42efd9	44	* @param[in] *pSrcA points to the first input sequence.
simon	0:1014af42efd9	45	* @param[in] srcALen length of the first input sequence.
simon	0:1014af42efd9	46	* @param[in] *pSrcB points to the second input sequence.
simon	0:1014af42efd9	47	* @param[in] srcBLen length of the second input sequence.
simon	0:1014af42efd9	48	* @param[out] *pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1.
simon	0:1014af42efd9	49	* @return none.
simon	0:1014af42efd9	50	*
simon	0:1014af42efd9	51	* @details
simon	0:1014af42efd9	52	* <b>Scaling and Overflow Behavior:</b>
simon	0:1014af42efd9	53	*
simon	0:1014af42efd9	54	* \par
simon	0:1014af42efd9	55	* The function is implemented using a 64-bit internal accumulator.
simon	0:1014af42efd9	56	* Both inputs are in 1.15 format and multiplications yield a 2.30 result.
simon	0:1014af42efd9	57	* The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
simon	0:1014af42efd9	58	* This approach provides 33 guard bits and there is no risk of overflow.
simon	0:1014af42efd9	59	* The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format.
simon	0:1014af42efd9	60	*
simon	0:1014af42efd9	61	* \par
simon	0:1014af42efd9	62	* Refer to <code>arm_correlate_fast_q15()</code> for a faster but less precise version of this function.
simon	0:1014af42efd9	63	*/
simon	0:1014af42efd9	64
simon	0:1014af42efd9	65	void arm_correlate_q15(
simon	0:1014af42efd9	66	  q15_t * pSrcA,
simon	0:1014af42efd9	67	  uint32_t srcALen,
simon	0:1014af42efd9	68	  q15_t * pSrcB,
simon	0:1014af42efd9	69	  uint32_t srcBLen,
simon	0:1014af42efd9	70	  q15_t * pDst)
simon	0:1014af42efd9	71	{
simon	0:1014af42efd9	72	  q15_t *pIn1; /* inputA pointer */
simon	0:1014af42efd9	73	  q15_t *pIn2; /* inputB pointer */
simon	0:1014af42efd9	74	  q15_t *pOut = pDst; /* output pointer */
simon	0:1014af42efd9	75	  q63_t sum, acc0, acc1, acc2, acc3; /* Accumulators */
simon	0:1014af42efd9	76	  q15_t *px; /* Intermediate inputA pointer */
simon	0:1014af42efd9	77	  q15_t *py; /* Intermediate inputB pointer */
simon	0:1014af42efd9	78	  q15_t *pSrc1; /* Intermediate pointers */
simon	0:1014af42efd9	79	  q31_t x0, x1, x2, x3, c0; /* temporary variables for holding input and coefficient values */
simon	0:1014af42efd9	80	  uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3; /* loop counter */
simon	0:1014af42efd9	81	  int32_t inc = 1; /* Destination address modifier */
simon	0:1014af42efd9	82	  q31_t *pb; /* 32 bit pointer for inputB buffer */
simon	0:1014af42efd9	83
simon	0:1014af42efd9	84
simon	0:1014af42efd9	85	  /* The algorithm implementation is based on the lengths of the inputs. */
simon	0:1014af42efd9	86	  /* srcB is always made to slide across srcA. */
simon	0:1014af42efd9	87	  /* So srcBLen is always considered as shorter or equal to srcALen */
simon	0:1014af42efd9	88	  /* But CORR(x, y) is reverse of CORR(y, x) */
simon	0:1014af42efd9	89	  /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
simon	0:1014af42efd9	90	  /* and the destination pointer modifier, inc is set to -1 */
simon	0:1014af42efd9	91	  /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
simon	0:1014af42efd9	92	  /* But to improve the performance,
simon	0:1014af42efd9	93	   * we include zeroes in the output instead of zero padding either of the inputs */
simon	0:1014af42efd9	94	  /* If srcALen > srcBLen,
simon	0:1014af42efd9	95	   * (srcALen - srcBLen) zeroes are written at the start of the output buffer */
simon	0:1014af42efd9	96	  /* If srcALen < srcBLen, the last (srcBLen - srcALen) output samples are NOT written here;
simon	0:1014af42efd9	97	   * for them to read as zero the caller must clear pDst before calling this function */
simon	0:1014af42efd9	98	  if(srcALen >= srcBLen)
simon	0:1014af42efd9	99	  {
simon	0:1014af42efd9	100	    /* Initialization of inputA pointer */
simon	0:1014af42efd9	101	    pIn1 = (pSrcA);
simon	0:1014af42efd9	102
simon	0:1014af42efd9	103	    /* Initialization of inputB pointer */
simon	0:1014af42efd9	104	    pIn2 = (pSrcB);
simon	0:1014af42efd9	105
simon	0:1014af42efd9	106	    /* Number of output samples is calculated */
simon	0:1014af42efd9	107	    outBlockSize = (2u * srcALen) - 1u;
simon	0:1014af42efd9	108
simon	0:1014af42efd9	109	    /* When srcALen > srcBLen, zero padding would have to be done to srcB
simon	0:1014af42efd9	110	     * to make their lengths equal.
simon	0:1014af42efd9	111	     * Instead, (outBlockSize - (srcALen + srcBLen - 1))
simon	0:1014af42efd9	112	     * leading output samples are set to zero */
simon	0:1014af42efd9	113	    j = outBlockSize - (srcALen + (srcBLen - 1u));
simon	0:1014af42efd9	114
simon	0:1014af42efd9	115	    while(j > 0u)
simon	0:1014af42efd9	116	    {
simon	0:1014af42efd9	117	      /* Zero is stored in the destination buffer */
simon	0:1014af42efd9	118	      *pOut++ = 0;
simon	0:1014af42efd9	119
simon	0:1014af42efd9	120	      /* Decrement the loop counter */
simon	0:1014af42efd9	121	      j--;
simon	0:1014af42efd9	122	    }
simon	0:1014af42efd9	123
simon	0:1014af42efd9	124	  }
simon	0:1014af42efd9	125	  else
simon	0:1014af42efd9	126	  {
simon	0:1014af42efd9	127	    /* Initialization of inputA pointer */
simon	0:1014af42efd9	128	    pIn1 = (pSrcB);
simon	0:1014af42efd9	129
simon	0:1014af42efd9	130	    /* Initialization of inputB pointer */
simon	0:1014af42efd9	131	    pIn2 = (pSrcA);
simon	0:1014af42efd9	132
simon	0:1014af42efd9	133	    /* Swap the lengths so that srcBLen is the shorter (or equal) one from here on */
simon	0:1014af42efd9	134	    j = srcBLen;
simon	0:1014af42efd9	135	    srcBLen = srcALen;
simon	0:1014af42efd9	136	    srcALen = j;
simon	0:1014af42efd9	137
simon	0:1014af42efd9	138	    /* CORR(x, y) = Reverse order(CORR(y, x)) */
simon	0:1014af42efd9	139	    /* Hence set the destination pointer to point to the last computed output sample */
simon	0:1014af42efd9	140	    pOut = pDst + ((srcALen + srcBLen) - 2u);
simon	0:1014af42efd9	141
simon	0:1014af42efd9	142	    /* Destination address modifier is set to -1 */
simon	0:1014af42efd9	143	    inc = -1;
simon	0:1014af42efd9	144
simon	0:1014af42efd9	145	  }
simon	0:1014af42efd9	146
simon	0:1014af42efd9	147	  /* The function is internally
simon	0:1014af42efd9	148	   * divided into three parts according to the number of multiplications that have to
simon	0:1014af42efd9	149	   * take place between inputA samples and inputB samples. In the first part of the
simon	0:1014af42efd9	150	   * algorithm, the multiplications increase by one for every iteration.
simon	0:1014af42efd9	151	   * In the second part of the algorithm, srcBLen number of multiplications are done.
simon	0:1014af42efd9	152	   * In the third part of the algorithm, the multiplications decrease by one
simon	0:1014af42efd9	153	   * for every iteration. */
simon	0:1014af42efd9	154	  /* The algorithm is implemented in three stages.
simon	0:1014af42efd9	155	   * The loop counters of each stage are initialized here. */
simon	0:1014af42efd9	156	  blockSize1 = srcBLen - 1u;
simon	0:1014af42efd9	157	  blockSize2 = srcALen - (srcBLen - 1u);
simon	0:1014af42efd9	158	  blockSize3 = blockSize1;
simon	0:1014af42efd9	159
simon	0:1014af42efd9	160	  /* --------------------------
simon	0:1014af42efd9	161	   * Initializations of stage1
simon	0:1014af42efd9	162	   * -------------------------*/
simon	0:1014af42efd9	163
simon	0:1014af42efd9	164	  /* sum = x[0] * y[srcBlen - 1]
simon	0:1014af42efd9	165	   * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1]
simon	0:1014af42efd9	166	   * ....
simon	0:1014af42efd9	167	   * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
simon	0:1014af42efd9	168	   */
simon	0:1014af42efd9	169
simon	0:1014af42efd9	170	  /* In this stage the MAC operations are increased by 1 for every iteration.
simon	0:1014af42efd9	171	     The count variable holds the number of MAC operations performed */
simon	0:1014af42efd9	172	  count = 1u;
simon	0:1014af42efd9	173
simon	0:1014af42efd9	174	  /* Working pointer of inputA */
simon	0:1014af42efd9	175	  px = pIn1;
simon	0:1014af42efd9	176
simon	0:1014af42efd9	177	  /* Working pointer of inputB */
simon	0:1014af42efd9	178	  pSrc1 = pIn2 + (srcBLen - 1u);
simon	0:1014af42efd9	179	  py = pSrc1;
simon	0:1014af42efd9	180
simon	0:1014af42efd9	181	  /* ------------------------
simon	0:1014af42efd9	182	   * Stage1 process
simon	0:1014af42efd9	183	   * ----------------------*/
simon	0:1014af42efd9	184
simon	0:1014af42efd9	185	  /* The first loop starts here */
simon	0:1014af42efd9	186	  while(blockSize1 > 0u)
simon	0:1014af42efd9	187	  {
simon	0:1014af42efd9	188	    /* Accumulator is made zero for every iteration */
simon	0:1014af42efd9	189	    sum = 0;
simon	0:1014af42efd9	190
simon	0:1014af42efd9	191	    /* Apply loop unrolling and compute 4 MACs simultaneously. */
simon	0:1014af42efd9	192	    k = count >> 2;
simon	0:1014af42efd9	193
simon	0:1014af42efd9	194	    /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
simon	0:1014af42efd9	195	     ** a second loop below computes MACs for the remaining 1 to 3 samples. */
simon	0:1014af42efd9	196	    while(k > 0u)
simon	0:1014af42efd9	197	    {
simon	0:1014af42efd9	198	      /* x[0] * y[srcBLen - 4] , x[1] * y[srcBLen - 3] */
simon	0:1014af42efd9	199	      sum = __SMLALD(*__SIMD32(px)++, *__SIMD32(py)++, sum);
simon	0:1014af42efd9	200	      /* x[3] * y[srcBLen - 1] , x[2] * y[srcBLen - 2] */
simon	0:1014af42efd9	201	      sum = __SMLALD(*__SIMD32(px)++, *__SIMD32(py)++, sum);
simon	0:1014af42efd9	202
simon	0:1014af42efd9	203	      /* Decrement the loop counter */
simon	0:1014af42efd9	204	      k--;
simon	0:1014af42efd9	205	    }
simon	0:1014af42efd9	206
simon	0:1014af42efd9	207	    /* If the count is not a multiple of 4, compute any remaining MACs here.
simon	0:1014af42efd9	208	     ** No loop unrolling is used. */
simon	0:1014af42efd9	209	    k = count % 0x4u;
simon	0:1014af42efd9	210
simon	0:1014af42efd9	211	    while(k > 0u)
simon	0:1014af42efd9	212	    {
simon	0:1014af42efd9	213	      /* Single-sample MAC, x[0] * y[srcBLen - 1]. A plain 64-bit multiply-accumulate is used because */
simon	0:1014af42efd9	214	      /* a dual MAC on sign-extended samples would also multiply their upper halfwords (+1 if both < 0) */
simon	0:1014af42efd9	215	      sum += ((q63_t) * px++ * *py++);
simon	0:1014af42efd9	216
simon	0:1014af42efd9	217	      /* Decrement the loop counter */
simon	0:1014af42efd9	218	      k--;
simon	0:1014af42efd9	219	    }
simon	0:1014af42efd9	220
simon	0:1014af42efd9	221	    /* Store the result in the accumulator in the destination buffer. */
simon	0:1014af42efd9	222	    *pOut = (q15_t) (__SSAT((sum >> 15), 16));
simon	0:1014af42efd9	223	    /* Destination pointer is updated according to the address modifier, inc */
simon	0:1014af42efd9	224	    pOut += inc;
simon	0:1014af42efd9	225
simon	0:1014af42efd9	226	    /* Update the inputA and inputB pointers for next MAC calculation */
simon	0:1014af42efd9	227	    py = pSrc1 - count;
simon	0:1014af42efd9	228	    px = pIn1;
simon	0:1014af42efd9	229
simon	0:1014af42efd9	230	    /* Increment the MAC count */
simon	0:1014af42efd9	231	    count++;
simon	0:1014af42efd9	232
simon	0:1014af42efd9	233	    /* Decrement the loop counter */
simon	0:1014af42efd9	234	    blockSize1--;
simon	0:1014af42efd9	235	  }
simon	0:1014af42efd9	236
simon	0:1014af42efd9	237	  /* --------------------------
simon	0:1014af42efd9	238	   * Initializations of stage2
simon	0:1014af42efd9	239	   * ------------------------*/
simon	0:1014af42efd9	240
simon	0:1014af42efd9	241	  /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
simon	0:1014af42efd9	242	   * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
simon	0:1014af42efd9	243	   * ....
simon	0:1014af42efd9	244	   * sum = x[srcALen-srcBLen] * y[0] + x[srcALen-srcBLen+1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
simon	0:1014af42efd9	245	   */
simon	0:1014af42efd9	246
simon	0:1014af42efd9	247	  /* Working pointer of inputA */
simon	0:1014af42efd9	248	  px = pIn1;
simon	0:1014af42efd9	249
simon	0:1014af42efd9	250	  /* Working pointer of inputB */
simon	0:1014af42efd9	251	  py = pIn2;
simon	0:1014af42efd9	252
simon	0:1014af42efd9	253	  /* Initialize inputB pointer of type q31 */
simon	0:1014af42efd9	254	  pb = (q31_t *) (py);
simon	0:1014af42efd9	255
simon	0:1014af42efd9	256	  /* count is the offset from pIn1 at which the next output's inputA window starts */
simon	0:1014af42efd9	257	  count = 0u;
simon	0:1014af42efd9	258
simon	0:1014af42efd9	259	  /* -------------------
simon	0:1014af42efd9	260	   * Stage2 process
simon	0:1014af42efd9	261	   * ------------------*/
simon	0:1014af42efd9	262
simon	0:1014af42efd9	263	  /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
simon	0:1014af42efd9	264	   * So, to loop unroll over blockSize2,
simon	0:1014af42efd9	265	   * srcBLen should be greater than or equal to 4, to loop unroll the srcBLen loop */
simon	0:1014af42efd9	266	  if(srcBLen >= 4u)
simon	0:1014af42efd9	267	  {
simon	0:1014af42efd9	268	    /* Loop unroll over blockSize2, by 4 */
simon	0:1014af42efd9	269	    blkCnt = blockSize2 >> 2u;
simon	0:1014af42efd9	270
simon	0:1014af42efd9	271	    while(blkCnt > 0u)
simon	0:1014af42efd9	272	    {
simon	0:1014af42efd9	273	      /* Set all accumulators to zero */
simon	0:1014af42efd9	274	      acc0 = 0;
simon	0:1014af42efd9	275	      acc1 = 0;
simon	0:1014af42efd9	276	      acc2 = 0;
simon	0:1014af42efd9	277	      acc3 = 0;
simon	0:1014af42efd9	278
simon	0:1014af42efd9	279	      /* read x[0], x[1] samples (32-bit load; px advances by one 16-bit sample) */
simon	0:1014af42efd9	280	      x0 = *(q31_t *) (px++);
simon	0:1014af42efd9	281	      /* read x[1], x[2] samples */
simon	0:1014af42efd9	282	      x1 = *(q31_t *) (px++);
simon	0:1014af42efd9	283
simon	0:1014af42efd9	284	      /* Apply loop unrolling and compute 4 MACs simultaneously. */
simon	0:1014af42efd9	285	      k = srcBLen >> 2u;
simon	0:1014af42efd9	286
simon	0:1014af42efd9	287	      /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
simon	0:1014af42efd9	288	       ** a second loop below computes MACs for the remaining 1 to 3 samples. */
simon	0:1014af42efd9	289	      do
simon	0:1014af42efd9	290	      {
simon	0:1014af42efd9	291	        /* Read the first two inputB samples using SIMD:
simon	0:1014af42efd9	292	         * y[0] and y[1] */
simon	0:1014af42efd9	293	        c0 = *(pb++);
simon	0:1014af42efd9	294
simon	0:1014af42efd9	295	        /* acc0 += x[0] * y[0] + x[1] * y[1] */
simon	0:1014af42efd9	296	        acc0 = __SMLALD(x0, c0, acc0);
simon	0:1014af42efd9	297
simon	0:1014af42efd9	298	        /* acc1 += x[1] * y[0] + x[2] * y[1] */
simon	0:1014af42efd9	299	        acc1 = __SMLALD(x1, c0, acc1);
simon	0:1014af42efd9	300
simon	0:1014af42efd9	301	        /* Read x[2], x[3] */
simon	0:1014af42efd9	302	        x2 = *(q31_t *) (px++);
simon	0:1014af42efd9	303
simon	0:1014af42efd9	304	        /* Read x[3], x[4] */
simon	0:1014af42efd9	305	        x3 = *(q31_t *) (px++);
simon	0:1014af42efd9	306
simon	0:1014af42efd9	307	        /* acc2 += x[2] * y[0] + x[3] * y[1] */
simon	0:1014af42efd9	308	        acc2 = __SMLALD(x2, c0, acc2);
simon	0:1014af42efd9	309
simon	0:1014af42efd9	310	        /* acc3 += x[3] * y[0] + x[4] * y[1] */
simon	0:1014af42efd9	311	        acc3 = __SMLALD(x3, c0, acc3);
simon	0:1014af42efd9	312
simon	0:1014af42efd9	313	        /* Read y[2] and y[3] */
simon	0:1014af42efd9	314	        c0 = *(pb++);
simon	0:1014af42efd9	315
simon	0:1014af42efd9	316	        /* acc0 += x[2] * y[2] + x[3] * y[3] */
simon	0:1014af42efd9	317	        acc0 = __SMLALD(x2, c0, acc0);
simon	0:1014af42efd9	318
simon	0:1014af42efd9	319	        /* acc1 += x[3] * y[2] + x[4] * y[3] */
simon	0:1014af42efd9	320	        acc1 = __SMLALD(x3, c0, acc1);
simon	0:1014af42efd9	321
simon	0:1014af42efd9	322	        /* Read x[4], x[5] */
simon	0:1014af42efd9	323	        x0 = *(q31_t *) (px++);
simon	0:1014af42efd9	324
simon	0:1014af42efd9	325	        /* Read x[5], x[6] */
simon	0:1014af42efd9	326	        x1 = *(q31_t *) (px++);
simon	0:1014af42efd9	327
simon	0:1014af42efd9	328	        /* acc2 += x[4] * y[2] + x[5] * y[3] */
simon	0:1014af42efd9	329	        acc2 = __SMLALD(x0, c0, acc2);
simon	0:1014af42efd9	330
simon	0:1014af42efd9	331	        /* acc3 += x[5] * y[2] + x[6] * y[3] */
simon	0:1014af42efd9	332	        acc3 = __SMLALD(x1, c0, acc3);
simon	0:1014af42efd9	333
simon	0:1014af42efd9	334	      } while(--k);
simon	0:1014af42efd9	335
simon	0:1014af42efd9	336	      /* For the next MAC operations, SIMD is not used
simon	0:1014af42efd9	337	       * So, the 16 bit pointer of inputB, py is updated */
simon	0:1014af42efd9	338	      py = (q15_t *) (pb);
simon	0:1014af42efd9	339
simon	0:1014af42efd9	340	      /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
simon	0:1014af42efd9	341	       ** No loop unrolling is used. */
simon	0:1014af42efd9	342	      k = srcBLen % 0x4u;
simon	0:1014af42efd9	343
simon	0:1014af42efd9	344	      if(k == 1u)
simon	0:1014af42efd9	345	      {
simon	0:1014af42efd9	346	        /* Read y[4] into the lower halfword; the upper halfword is cleared */
simon	0:1014af42efd9	347	        c0 = *py;
simon	0:1014af42efd9	348	        c0 = c0 & 0x0000FFFF;
simon	0:1014af42efd9	349
simon	0:1014af42efd9	350	        /* Read x[6], x[7] */
simon	0:1014af42efd9	351	        x3 = *(q31_t *) px++;
simon	0:1014af42efd9	352
simon	0:1014af42efd9	353	        /* Perform the multiply-accumulates (the X form pairs the upper x halfword with y[4]) */
simon	0:1014af42efd9	354	        acc0 = __SMLALD(x0, c0, acc0);
simon	0:1014af42efd9	355	        acc1 = __SMLALD(x1, c0, acc1);
simon	0:1014af42efd9	356	        acc2 = __SMLALDX(x1, c0, acc2);
simon	0:1014af42efd9	357	        acc3 = __SMLALDX(x3, c0, acc3);
simon	0:1014af42efd9	358	      }
simon	0:1014af42efd9	359
simon	0:1014af42efd9	360	      if(k == 2u)
simon	0:1014af42efd9	361	      {
simon	0:1014af42efd9	362	        /* Read y[4], y[5] */
simon	0:1014af42efd9	363	        c0 = *(pb);
simon	0:1014af42efd9	364
simon	0:1014af42efd9	365	        /* Read x[6], x[7] */
simon	0:1014af42efd9	366	        x3 = *(q31_t *) px++;
simon	0:1014af42efd9	367
simon	0:1014af42efd9	368	        /* Read x[7], x[8] */
simon	0:1014af42efd9	369	        x2 = *(q31_t *) px++;
simon	0:1014af42efd9	370
simon	0:1014af42efd9	371	        /* Perform the multiply-accumulates */
simon	0:1014af42efd9	372	        acc0 = __SMLALD(x0, c0, acc0);
simon	0:1014af42efd9	373	        acc1 = __SMLALD(x1, c0, acc1);
simon	0:1014af42efd9	374	        acc2 = __SMLALD(x3, c0, acc2);
simon	0:1014af42efd9	375	        acc3 = __SMLALD(x2, c0, acc3);
simon	0:1014af42efd9	376	      }
simon	0:1014af42efd9	377
simon	0:1014af42efd9	378	      if(k == 3u)
simon	0:1014af42efd9	379	      {
simon	0:1014af42efd9	380	        /* Read y[4], y[5] */
simon	0:1014af42efd9	381	        c0 = *pb++;
simon	0:1014af42efd9	382
simon	0:1014af42efd9	383	        /* Read x[6], x[7] */
simon	0:1014af42efd9	384	        x3 = *(q31_t *) px++;
simon	0:1014af42efd9	385
simon	0:1014af42efd9	386	        /* Read x[7], x[8] */
simon	0:1014af42efd9	387	        x2 = *(q31_t *) px++;
simon	0:1014af42efd9	388
simon	0:1014af42efd9	389	        /* Perform the multiply-accumulates */
simon	0:1014af42efd9	390	        acc0 = __SMLALD(x0, c0, acc0);
simon	0:1014af42efd9	391	        acc1 = __SMLALD(x1, c0, acc1);
simon	0:1014af42efd9	392	        acc2 = __SMLALD(x3, c0, acc2);
simon	0:1014af42efd9	393	        acc3 = __SMLALD(x2, c0, acc3);
simon	0:1014af42efd9	394
simon	0:1014af42efd9	395	        /* Read y[6] as one 16-bit sample: it is the last inputB element, a 32-bit load would run past it */
simon	0:1014af42efd9	396	        c0 = *((q15_t *) pb);
simon	0:1014af42efd9	397	        c0 = c0 & 0x0000FFFF;
simon	0:1014af42efd9	398
simon	0:1014af42efd9	399	        /* Read x[8], x[9] */
simon	0:1014af42efd9	400	        x3 = *(q31_t *) px++;
simon	0:1014af42efd9	401
simon	0:1014af42efd9	402	        /* Perform the multiply-accumulates */
simon	0:1014af42efd9	403	        acc0 = __SMLALDX(x1, c0, acc0);
simon	0:1014af42efd9	404	        acc1 = __SMLALD(x2, c0, acc1);
simon	0:1014af42efd9	405	        acc2 = __SMLALDX(x2, c0, acc2);
simon	0:1014af42efd9	406	        acc3 = __SMLALDX(x3, c0, acc3);
simon	0:1014af42efd9	407	      }
simon	0:1014af42efd9	408
simon	0:1014af42efd9	409	      /* Store the result in the accumulator in the destination buffer. */
simon	0:1014af42efd9	410	      *pOut = (q15_t) (__SSAT(acc0 >> 15, 16));
simon	0:1014af42efd9	411	      /* Destination pointer is updated according to the address modifier, inc */
simon	0:1014af42efd9	412	      pOut += inc;
simon	0:1014af42efd9	413
simon	0:1014af42efd9	414	      *pOut = (q15_t) (__SSAT(acc1 >> 15, 16));
simon	0:1014af42efd9	415	      pOut += inc;
simon	0:1014af42efd9	416
simon	0:1014af42efd9	417	      *pOut = (q15_t) (__SSAT(acc2 >> 15, 16));
simon	0:1014af42efd9	418	      pOut += inc;
simon	0:1014af42efd9	419
simon	0:1014af42efd9	420	      *pOut = (q15_t) (__SSAT(acc3 >> 15, 16));
simon	0:1014af42efd9	421	      pOut += inc;
simon	0:1014af42efd9	422
simon	0:1014af42efd9	423	      /* Increment the count by 4 as 4 output values are computed */
simon	0:1014af42efd9	424	      count += 4u;
simon	0:1014af42efd9	425
simon	0:1014af42efd9	426	      /* Update the inputA and inputB pointers for next MAC calculation */
simon	0:1014af42efd9	427	      px = pIn1 + count;
simon	0:1014af42efd9	428	      py = pIn2;
simon	0:1014af42efd9	429	      pb = (q31_t *) (py);
simon	0:1014af42efd9	430
simon	0:1014af42efd9	431
simon	0:1014af42efd9	432	      /* Decrement the loop counter */
simon	0:1014af42efd9	433	      blkCnt--;
simon	0:1014af42efd9	434	    }
simon	0:1014af42efd9	435
simon	0:1014af42efd9	436	    /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
simon	0:1014af42efd9	437	     ** No loop unrolling is used. */
simon	0:1014af42efd9	438	    blkCnt = blockSize2 % 0x4u;
simon	0:1014af42efd9	439
simon	0:1014af42efd9	440	    while(blkCnt > 0u)
simon	0:1014af42efd9	441	    {
simon	0:1014af42efd9	442	      /* Accumulator is made zero for every iteration */
simon	0:1014af42efd9	443	      sum = 0;
simon	0:1014af42efd9	444
simon	0:1014af42efd9	445	      /* Apply loop unrolling and compute 4 MACs simultaneously. */
simon	0:1014af42efd9	446	      k = srcBLen >> 2u;
simon	0:1014af42efd9	447
simon	0:1014af42efd9	448	      /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
simon	0:1014af42efd9	449	       ** a second loop below computes MACs for the remaining 1 to 3 samples. */
simon	0:1014af42efd9	450	      while(k > 0u)
simon	0:1014af42efd9	451	      {
simon	0:1014af42efd9	452	        /* Perform the multiply-accumulates */
simon	0:1014af42efd9	453	        sum += ((q63_t) * px++ * *py++);
simon	0:1014af42efd9	454	        sum += ((q63_t) * px++ * *py++);
simon	0:1014af42efd9	455	        sum += ((q63_t) * px++ * *py++);
simon	0:1014af42efd9	456	        sum += ((q63_t) * px++ * *py++);
simon	0:1014af42efd9	457
simon	0:1014af42efd9	458	        /* Decrement the loop counter */
simon	0:1014af42efd9	459	        k--;
simon	0:1014af42efd9	460	      }
simon	0:1014af42efd9	461
simon	0:1014af42efd9	462	      /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
simon	0:1014af42efd9	463	       ** No loop unrolling is used. */
simon	0:1014af42efd9	464	      k = srcBLen % 0x4u;
simon	0:1014af42efd9	465
simon	0:1014af42efd9	466	      while(k > 0u)
simon	0:1014af42efd9	467	      {
simon	0:1014af42efd9	468	        /* Perform the multiply-accumulates */
simon	0:1014af42efd9	469	        sum += ((q63_t) * px++ * *py++);
simon	0:1014af42efd9	470
simon	0:1014af42efd9	471	        /* Decrement the loop counter */
simon	0:1014af42efd9	472	        k--;
simon	0:1014af42efd9	473	      }
simon	0:1014af42efd9	474
simon	0:1014af42efd9	475	      /* Store the result in the accumulator in the destination buffer. */
simon	0:1014af42efd9	476	      *pOut = (q15_t) (__SSAT(sum >> 15, 16));
simon	0:1014af42efd9	477	      /* Destination pointer is updated according to the address modifier, inc */
simon	0:1014af42efd9	478	      pOut += inc;
simon	0:1014af42efd9	479
simon	0:1014af42efd9	480	      /* Increment count by 1, as one output value is computed */
simon	0:1014af42efd9	481	      count++;
simon	0:1014af42efd9	482
simon	0:1014af42efd9	483	      /* Update the inputA and inputB pointers for next MAC calculation */
simon	0:1014af42efd9	484	      px = pIn1 + count;
simon	0:1014af42efd9	485	      py = pIn2;
simon	0:1014af42efd9	486
simon	0:1014af42efd9	487	      /* Decrement the loop counter */
simon	0:1014af42efd9	488	      blkCnt--;
simon	0:1014af42efd9	489	    }
simon	0:1014af42efd9	490	  }
simon	0:1014af42efd9	491	  else
simon	0:1014af42efd9	492	  {
simon	0:1014af42efd9	493	    /* srcBLen is less than 4 here,
simon	0:1014af42efd9	494	     * so the blockSize2 loop is not unrolled by 4 */
simon	0:1014af42efd9	495	    blkCnt = blockSize2;
simon	0:1014af42efd9	496
simon	0:1014af42efd9	497	    while(blkCnt > 0u)
simon	0:1014af42efd9	498	    {
simon	0:1014af42efd9	499	      /* Accumulator is made zero for every iteration */
simon	0:1014af42efd9	500	      sum = 0;
simon	0:1014af42efd9	501
simon	0:1014af42efd9	502	      /* Loop over srcBLen */
simon	0:1014af42efd9	503	      k = srcBLen;
simon	0:1014af42efd9	504
simon	0:1014af42efd9	505	      while(k > 0u)
simon	0:1014af42efd9	506	      {
simon	0:1014af42efd9	507	        /* Perform the multiply-accumulate */
simon	0:1014af42efd9	508	        sum += ((q63_t) * px++ * *py++);
simon	0:1014af42efd9	509
simon	0:1014af42efd9	510	        /* Decrement the loop counter */
simon	0:1014af42efd9	511	        k--;
simon	0:1014af42efd9	512	      }
simon	0:1014af42efd9	513
simon	0:1014af42efd9	514	      /* Store the result in the accumulator in the destination buffer. */
simon	0:1014af42efd9	515	      *pOut = (q15_t) (__SSAT(sum >> 15, 16));
simon	0:1014af42efd9	516	      /* Destination pointer is updated according to the address modifier, inc */
simon	0:1014af42efd9	517	      pOut += inc;
simon	0:1014af42efd9	518
simon	0:1014af42efd9	519	      /* Advance the inputA window offset by one output sample */
simon	0:1014af42efd9	520	      count++;
simon	0:1014af42efd9	521
simon	0:1014af42efd9	522	      /* Update the inputA and inputB pointers for next MAC calculation */
simon	0:1014af42efd9	523	      px = pIn1 + count;
simon	0:1014af42efd9	524	      py = pIn2;
simon	0:1014af42efd9	525
simon	0:1014af42efd9	526	      /* Decrement the loop counter */
simon	0:1014af42efd9	527	      blkCnt--;
simon	0:1014af42efd9	528	    }
simon	0:1014af42efd9	529	  }
simon	0:1014af42efd9	530
simon	0:1014af42efd9	531	  /* --------------------------
simon	0:1014af42efd9	532	   * Initializations of stage3
simon	0:1014af42efd9	533	   * -------------------------*/
simon	0:1014af42efd9	534
simon	0:1014af42efd9	535	  /* sum = x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-2]
simon	0:1014af42efd9	536	   * sum = x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-3]
simon	0:1014af42efd9	537	   * ....
simon	0:1014af42efd9	538	   * sum = x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
simon	0:1014af42efd9	539	   * sum = x[srcALen-1] * y[0]
simon	0:1014af42efd9	540	   */
simon	0:1014af42efd9	541
simon	0:1014af42efd9	542	  /* In this stage the MAC operations are decreased by 1 for every iteration.
simon	0:1014af42efd9	543	     The count variable holds the number of MAC operations performed */
simon	0:1014af42efd9	544	  count = srcBLen - 1u;
simon	0:1014af42efd9	545
simon	0:1014af42efd9	546	  /* Working pointer of inputA */
simon	0:1014af42efd9	547	  pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
simon	0:1014af42efd9	548	  px = pSrc1;
simon	0:1014af42efd9	549
simon	0:1014af42efd9	550	  /* Working pointer of inputB */
simon	0:1014af42efd9	551	  py = pIn2;
simon	0:1014af42efd9	552
simon	0:1014af42efd9	553	  /* -------------------
simon	0:1014af42efd9	554	   * Stage3 process
simon	0:1014af42efd9	555	   * ------------------*/
simon	0:1014af42efd9	556
simon	0:1014af42efd9	557	  while(blockSize3 > 0u)
simon	0:1014af42efd9	558	  {
simon	0:1014af42efd9	559	    /* Accumulator is made zero for every iteration */
simon	0:1014af42efd9	560	    sum = 0;
simon	0:1014af42efd9	561
simon	0:1014af42efd9	562	    /* Apply loop unrolling and compute 4 MACs simultaneously. */
simon	0:1014af42efd9	563	    k = count >> 2u;
simon	0:1014af42efd9	564
simon	0:1014af42efd9	565	    /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
simon	0:1014af42efd9	566	     ** a second loop below computes MACs for the remaining 1 to 3 samples. */
simon	0:1014af42efd9	567	    while(k > 0u)
simon	0:1014af42efd9	568	    {
simon	0:1014af42efd9	569	      /* Perform the multiply-accumulates */
simon	0:1014af42efd9	570	      /* sum += x[srcALen - srcBLen + 1] * y[0] , sum += x[srcALen - srcBLen + 2] * y[1] */
simon	0:1014af42efd9	571	      sum = __SMLALD(*__SIMD32(px)++, *__SIMD32(py)++, sum);
simon	0:1014af42efd9	572	      /* sum += x[srcALen - srcBLen + 3] * y[2] , sum += x[srcALen - srcBLen + 4] * y[3] */
simon	0:1014af42efd9	573	      sum = __SMLALD(*__SIMD32(px)++, *__SIMD32(py)++, sum);
simon	0:1014af42efd9	574
simon	0:1014af42efd9	575	      /* Decrement the loop counter */
simon	0:1014af42efd9	576	      k--;
simon	0:1014af42efd9	577	    }
simon	0:1014af42efd9	578
simon	0:1014af42efd9	579	    /* If the count is not a multiple of 4, compute any remaining MACs here.
simon	0:1014af42efd9	580	     ** No loop unrolling is used. */
simon	0:1014af42efd9	581	    k = count % 0x4u;
simon	0:1014af42efd9	582
simon	0:1014af42efd9	583	    while(k > 0u)
simon	0:1014af42efd9	584	    {
simon	0:1014af42efd9	585	      /* Single-sample MAC done as a plain 64-bit multiply-accumulate (no dual MAC on sign-extended samples) */
simon	0:1014af42efd9	586	      sum += ((q63_t) * px++ * *py++);
simon	0:1014af42efd9	587
simon	0:1014af42efd9	588	      /* Decrement the loop counter */
simon	0:1014af42efd9	589	      k--;
simon	0:1014af42efd9	590	    }
simon	0:1014af42efd9	591
simon	0:1014af42efd9	592	    /* Store the result in the accumulator in the destination buffer. */
simon	0:1014af42efd9	593	    *pOut = (q15_t) (__SSAT((sum >> 15), 16));
simon	0:1014af42efd9	594	    /* Destination pointer is updated according to the address modifier, inc */
simon	0:1014af42efd9	595	    pOut += inc;
simon	0:1014af42efd9	596
simon	0:1014af42efd9	597	    /* Update the inputA and inputB pointers for next MAC calculation */
simon	0:1014af42efd9	598	    px = ++pSrc1;
simon	0:1014af42efd9	599	    py = pIn2;
simon	0:1014af42efd9	600
simon	0:1014af42efd9	601	    /* Decrement the MAC count */
simon	0:1014af42efd9	602	    count--;
simon	0:1014af42efd9	603
simon	0:1014af42efd9	604	    /* Decrement the loop counter */
simon	0:1014af42efd9	605	    blockSize3--;
simon	0:1014af42efd9	606	  }
simon	0:1014af42efd9	607
simon	0:1014af42efd9	608	}
simon	0:1014af42efd9	609
simon	0:1014af42efd9	610	/**
simon	0:1014af42efd9	611	* @} end of Corr group
simon	0:1014af42efd9	612	*/

Repository toolbox

Export to desktop IDE

Repository details

Type:	Library
Created:	10 Mar 2011
Imports:	907
Forks:	1
Commits:	3
Dependents:	5
Dependencies:	0
Followers:	35

src/Cortex-M4-M3/FilteringFunctions/arm_correlate_q15.c@0:1014af42efd9, 2011-03-10 (annotated)

Who changed what in which revision?

Repository toolbox

Repository details

Important Information for this Arm website

Access Warning