dsp - CMSIS DSP Library from CMSIS 2.0. See http://www.…

Users » simon » Code » dsp

CMSIS DSP Library from CMSIS 2.0. See http://www.onarm.com/cmsis/ for full details

Dependents: K22F_DSP_Matrix_least_square BNO055-ELEC3810 1BNO055 ECE4180Project--Slave2 ... more

src/Cortex-M4-M3/FilteringFunctions/arm_conv_partial_q15.c@0:1014af42efd9, 2011-03-10 (annotated)

Committer:: simon
Date:: Thu Mar 10 15:07:50 2011 +0000
Revision:: 0:1014af42efd9

Who changed what in which revision?

User	Revision	Line number	New contents of line
simon	0:1014af42efd9	1	/* ----------------------------------------------------------------------
simon	0:1014af42efd9	2	* Copyright (C) 2010 ARM Limited. All rights reserved.
simon	0:1014af42efd9	3	*
simon	0:1014af42efd9	4	* $Date: 29. November 2010
simon	0:1014af42efd9	5	* $Revision: V1.0.3
simon	0:1014af42efd9	6	*
simon	0:1014af42efd9	7	* Project: CMSIS DSP Library
simon	0:1014af42efd9	8	* Title: arm_conv_partial_q15.c
simon	0:1014af42efd9	9	*
simon	0:1014af42efd9	10	* Description: Q15 Partial convolution.
simon	0:1014af42efd9	11	*
simon	0:1014af42efd9	12	* Target Processor: Cortex-M4/Cortex-M3
simon	0:1014af42efd9	13	*
simon	0:1014af42efd9	14	* Version 1.0.3 2010/11/29
simon	0:1014af42efd9	15	* Re-organized the CMSIS folders and updated documentation.
simon	0:1014af42efd9	16	*
simon	0:1014af42efd9	17	* Version 1.0.2 2010/11/11
simon	0:1014af42efd9	18	* Documentation updated.
simon	0:1014af42efd9	19	*
simon	0:1014af42efd9	20	* Version 1.0.1 2010/10/05
simon	0:1014af42efd9	21	* Production release and review comments incorporated.
simon	0:1014af42efd9	22	*
simon	0:1014af42efd9	23	* Version 1.0.0 2010/09/20
simon	0:1014af42efd9	24	* Production release and review comments incorporated
simon	0:1014af42efd9	25	*
simon	0:1014af42efd9	26	* Version 0.0.7 2010/06/10
simon	0:1014af42efd9	27	* Misra-C changes done
simon	0:1014af42efd9	28	*
simon	0:1014af42efd9	29	* -------------------------------------------------------------------- */
simon	0:1014af42efd9	30
simon	0:1014af42efd9	31	#include "arm_math.h"
simon	0:1014af42efd9	32
simon	0:1014af42efd9	33	/**
simon	0:1014af42efd9	34	* @ingroup groupFilters
simon	0:1014af42efd9	35	*/
simon	0:1014af42efd9	36
simon	0:1014af42efd9	37	/**
simon	0:1014af42efd9	38	* @addtogroup PartialConv
simon	0:1014af42efd9	39	* @{
simon	0:1014af42efd9	40	*/
simon	0:1014af42efd9	41
simon	0:1014af42efd9	42	/**
simon	0:1014af42efd9	43	* @brief Partial convolution of Q15 sequences.
simon	0:1014af42efd9	44	* @param[in] *pSrcA points to the first input sequence.
simon	0:1014af42efd9	45	* @param[in] srcALen length of the first input sequence.
simon	0:1014af42efd9	46	* @param[in] *pSrcB points to the second input sequence.
simon	0:1014af42efd9	47	* @param[in] srcBLen length of the second input sequence.
simon	0:1014af42efd9	48	* @param[out] *pDst points to the location where the output result is written.
simon	0:1014af42efd9	49	* @param[in] firstIndex is the first output sample to start with.
simon	0:1014af42efd9	50	* @param[in] numPoints is the number of output points to be computed.
simon	0:1014af42efd9	51	* @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].
simon	0:1014af42efd9	52	*
simon	0:1014af42efd9	53	* Refer to <code>arm_conv_partial_fast_q15()</code> for a faster but less precise version of this function.
simon	0:1014af42efd9	54	*/
simon	0:1014af42efd9	55
simon	0:1014af42efd9	56
simon	0:1014af42efd9	57	arm_status arm_conv_partial_q15(
simon	0:1014af42efd9	58	q15_t * pSrcA,
simon	0:1014af42efd9	59	uint32_t srcALen,
simon	0:1014af42efd9	60	q15_t * pSrcB,
simon	0:1014af42efd9	61	uint32_t srcBLen,
simon	0:1014af42efd9	62	q15_t * pDst,
simon	0:1014af42efd9	63	uint32_t firstIndex,
simon	0:1014af42efd9	64	uint32_t numPoints)
simon	0:1014af42efd9	65	{
simon	0:1014af42efd9	66	q15_t pIn1; / inputA pointer */
simon	0:1014af42efd9	67	q15_t pIn2; / inputB pointer */
simon	0:1014af42efd9	68	q15_t pOut = pDst; / output pointer */
simon	0:1014af42efd9	69	q63_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
simon	0:1014af42efd9	70	q15_t px; / Intermediate inputA pointer */
simon	0:1014af42efd9	71	q15_t py; / Intermediate inputB pointer */
simon	0:1014af42efd9	72	q15_t pSrc1, pSrc2; /* Intermediate pointers */
simon	0:1014af42efd9	73	q31_t x0, x1, x2, x3, c0; /* Temporary input variables */
simon	0:1014af42efd9	74	uint32_t j, k, count, check, blkCnt;
simon	0:1014af42efd9	75	int32_t blockSize1, blockSize2, blockSize3; /* loop counter */
simon	0:1014af42efd9	76	arm_status status; /* status of Partial convolution */
simon	0:1014af42efd9	77	q31_t pb; / 32 bit pointer for inputB buffer */
simon	0:1014af42efd9	78
simon	0:1014af42efd9	79	/* Check for range of output samples to be calculated */
simon	0:1014af42efd9	80	if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
simon	0:1014af42efd9	81	{
simon	0:1014af42efd9	82	/* Set status as ARM_MATH_ARGUMENT_ERROR */
simon	0:1014af42efd9	83	status = ARM_MATH_ARGUMENT_ERROR;
simon	0:1014af42efd9	84	}
simon	0:1014af42efd9	85	else
simon	0:1014af42efd9	86	{
simon	0:1014af42efd9	87
simon	0:1014af42efd9	88	/* The algorithm implementation is based on the lengths of the inputs. */
simon	0:1014af42efd9	89	/* srcB is always made to slide across srcA. */
simon	0:1014af42efd9	90	/* So srcBLen is always considered as shorter or equal to srcALen */
simon	0:1014af42efd9	91	if(srcALen >= srcBLen)
simon	0:1014af42efd9	92	{
simon	0:1014af42efd9	93	/* Initialization of inputA pointer */
simon	0:1014af42efd9	94	pIn1 = pSrcA;
simon	0:1014af42efd9	95
simon	0:1014af42efd9	96	/* Initialization of inputB pointer */
simon	0:1014af42efd9	97	pIn2 = pSrcB;
simon	0:1014af42efd9	98	}
simon	0:1014af42efd9	99	else
simon	0:1014af42efd9	100	{
simon	0:1014af42efd9	101	/* Initialization of inputA pointer */
simon	0:1014af42efd9	102	pIn1 = pSrcB;
simon	0:1014af42efd9	103
simon	0:1014af42efd9	104	/* Initialization of inputB pointer */
simon	0:1014af42efd9	105	pIn2 = pSrcA;
simon	0:1014af42efd9	106
simon	0:1014af42efd9	107	/* srcBLen is always considered as shorter or equal to srcALen */
simon	0:1014af42efd9	108	j = srcBLen;
simon	0:1014af42efd9	109	srcBLen = srcALen;
simon	0:1014af42efd9	110	srcALen = j;
simon	0:1014af42efd9	111	}
simon	0:1014af42efd9	112
simon	0:1014af42efd9	113	/* Conditions to check which loopCounter holds
simon	0:1014af42efd9	114	* the first and last indices of the output samples to be calculated. */
simon	0:1014af42efd9	115	check = firstIndex + numPoints;
simon	0:1014af42efd9	116	blockSize3 = ((int32_t) check - (int32_t) srcALen);
simon	0:1014af42efd9	117	blockSize3 = (blockSize3 > 0) ? blockSize3 : 0;
simon	0:1014af42efd9	118	blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex);
simon	0:1014af42efd9	119	blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 :
simon	0:1014af42efd9	120	(int32_t) numPoints) : 0;
simon	0:1014af42efd9	121	blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) +
simon	0:1014af42efd9	122	(int32_t) firstIndex);
simon	0:1014af42efd9	123	blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
simon	0:1014af42efd9	124
simon	0:1014af42efd9	125	/* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
simon	0:1014af42efd9	126	/* The function is internally
simon	0:1014af42efd9	127	* divided into three stages according to the number of multiplications that has to be
simon	0:1014af42efd9	128	* taken place between inputA samples and inputB samples. In the first stage of the
simon	0:1014af42efd9	129	* algorithm, the multiplications increase by one for every iteration.
simon	0:1014af42efd9	130	* In the second stage of the algorithm, srcBLen number of multiplications are done.
simon	0:1014af42efd9	131	* In the third stage of the algorithm, the multiplications decrease by one
simon	0:1014af42efd9	132	* for every iteration. */
simon	0:1014af42efd9	133
simon	0:1014af42efd9	134	/* Set the output pointer to point to the firstIndex
simon	0:1014af42efd9	135	* of the output sample to be calculated. */
simon	0:1014af42efd9	136	pOut = pDst + firstIndex;
simon	0:1014af42efd9	137
simon	0:1014af42efd9	138	/* --------------------------
simon	0:1014af42efd9	139	* Initializations of stage1
simon	0:1014af42efd9	140	* -------------------------*/
simon	0:1014af42efd9	141
simon	0:1014af42efd9	142	/* sum = x[0] * y[0]
simon	0:1014af42efd9	143	* sum = x[0] * y[1] + x[1] * y[0]
simon	0:1014af42efd9	144	* ....
simon	0:1014af42efd9	145	* sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
simon	0:1014af42efd9	146	*/
simon	0:1014af42efd9	147
simon	0:1014af42efd9	148	/* In this stage the MAC operations are increased by 1 for every iteration.
simon	0:1014af42efd9	149	The count variable holds the number of MAC operations performed.
simon	0:1014af42efd9	150	Since the partial convolution starts from firstIndex
simon	0:1014af42efd9	151	Number of Macs to be performed is firstIndex + 1 */
simon	0:1014af42efd9	152	count = 1u + firstIndex;
simon	0:1014af42efd9	153
simon	0:1014af42efd9	154	/* Working pointer of inputA */
simon	0:1014af42efd9	155	px = pIn1;
simon	0:1014af42efd9	156
simon	0:1014af42efd9	157	/* Working pointer of inputB */
simon	0:1014af42efd9	158	pSrc2 = pIn2 + firstIndex;
simon	0:1014af42efd9	159	py = pSrc2;
simon	0:1014af42efd9	160
simon	0:1014af42efd9	161	/* ------------------------
simon	0:1014af42efd9	162	* Stage1 process
simon	0:1014af42efd9	163	* ----------------------*/
simon	0:1014af42efd9	164
simon	0:1014af42efd9	165	/* For loop unrolling by 4, this stage is divided into two. */
simon	0:1014af42efd9	166	/* First part of this stage computes the MAC operations less than 4 */
simon	0:1014af42efd9	167	/* Second part of this stage computes the MAC operations greater than or equal to 4 */
simon	0:1014af42efd9	168
simon	0:1014af42efd9	169	/* The first part of the stage starts here */
simon	0:1014af42efd9	170	while((count < 4u) && (blockSize1 > 0))
simon	0:1014af42efd9	171	{
simon	0:1014af42efd9	172	/* Accumulator is made zero for every iteration */
simon	0:1014af42efd9	173	sum = 0;
simon	0:1014af42efd9	174
simon	0:1014af42efd9	175	/* Loop over number of MAC operations between
simon	0:1014af42efd9	176	* inputA samples and inputB samples */
simon	0:1014af42efd9	177	k = count;
simon	0:1014af42efd9	178
simon	0:1014af42efd9	179	while(k > 0u)
simon	0:1014af42efd9	180	{
simon	0:1014af42efd9	181	/* Perform the multiply-accumulates */
simon	0:1014af42efd9	182	sum = __SMLALD(px++, py--, sum);
simon	0:1014af42efd9	183
simon	0:1014af42efd9	184	/* Decrement the loop counter */
simon	0:1014af42efd9	185	k--;
simon	0:1014af42efd9	186	}
simon	0:1014af42efd9	187
simon	0:1014af42efd9	188	/* Store the result in the accumulator in the destination buffer. */
simon	0:1014af42efd9	189	*pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
simon	0:1014af42efd9	190
simon	0:1014af42efd9	191	/* Update the inputA and inputB pointers for next MAC calculation */
simon	0:1014af42efd9	192	py = ++pSrc2;
simon	0:1014af42efd9	193	px = pIn1;
simon	0:1014af42efd9	194
simon	0:1014af42efd9	195	/* Increment the MAC count */
simon	0:1014af42efd9	196	count++;
simon	0:1014af42efd9	197
simon	0:1014af42efd9	198	/* Decrement the loop counter */
simon	0:1014af42efd9	199	blockSize1--;
simon	0:1014af42efd9	200	}
simon	0:1014af42efd9	201
simon	0:1014af42efd9	202	/* The second part of the stage starts here */
simon	0:1014af42efd9	203	/* The internal loop, over count, is unrolled by 4 */
simon	0:1014af42efd9	204	/* To, read the last two inputB samples using SIMD:
simon	0:1014af42efd9	205	* y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */
simon	0:1014af42efd9	206	py = py - 1;
simon	0:1014af42efd9	207
simon	0:1014af42efd9	208	while(blockSize1 > 0)
simon	0:1014af42efd9	209	{
simon	0:1014af42efd9	210	/* Accumulator is made zero for every iteration */
simon	0:1014af42efd9	211	sum = 0;
simon	0:1014af42efd9	212
simon	0:1014af42efd9	213	/* Apply loop unrolling and compute 4 MACs simultaneously. */
simon	0:1014af42efd9	214	k = count >> 2u;
simon	0:1014af42efd9	215
simon	0:1014af42efd9	216	/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
simon	0:1014af42efd9	217	** a second loop below computes MACs for the remaining 1 to 3 samples. */
simon	0:1014af42efd9	218	while(k > 0u)
simon	0:1014af42efd9	219	{
simon	0:1014af42efd9	220	/* Perform the multiply-accumulates */
simon	0:1014af42efd9	221	/* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */
simon	0:1014af42efd9	222	sum = __SMLALDX(__SIMD32(px)++, __SIMD32(py)--, sum);
simon	0:1014af42efd9	223	/* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */
simon	0:1014af42efd9	224	sum = __SMLALDX(__SIMD32(px)++, __SIMD32(py)--, sum);
simon	0:1014af42efd9	225
simon	0:1014af42efd9	226	/* Decrement the loop counter */
simon	0:1014af42efd9	227	k--;
simon	0:1014af42efd9	228	}
simon	0:1014af42efd9	229
simon	0:1014af42efd9	230	/* For the next MAC operations, the pointer py is used without SIMD
simon	0:1014af42efd9	231	* So, py is incremented by 1 */
simon	0:1014af42efd9	232	py = py + 1u;
simon	0:1014af42efd9	233
simon	0:1014af42efd9	234	/* If the count is not a multiple of 4, compute any remaining MACs here.
simon	0:1014af42efd9	235	** No loop unrolling is used. */
simon	0:1014af42efd9	236	k = count % 0x4u;
simon	0:1014af42efd9	237
simon	0:1014af42efd9	238	while(k > 0u)
simon	0:1014af42efd9	239	{
simon	0:1014af42efd9	240	/* Perform the multiply-accumulates */
simon	0:1014af42efd9	241	sum = __SMLALD(px++, py--, sum);
simon	0:1014af42efd9	242
simon	0:1014af42efd9	243	/* Decrement the loop counter */
simon	0:1014af42efd9	244	k--;
simon	0:1014af42efd9	245	}
simon	0:1014af42efd9	246
simon	0:1014af42efd9	247	/* Store the result in the accumulator in the destination buffer. */
simon	0:1014af42efd9	248	*pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
simon	0:1014af42efd9	249
simon	0:1014af42efd9	250	/* Update the inputA and inputB pointers for next MAC calculation */
simon	0:1014af42efd9	251	py = ++pSrc2 - 1u;
simon	0:1014af42efd9	252	px = pIn1;
simon	0:1014af42efd9	253
simon	0:1014af42efd9	254	/* Increment the MAC count */
simon	0:1014af42efd9	255	count++;
simon	0:1014af42efd9	256
simon	0:1014af42efd9	257	/* Decrement the loop counter */
simon	0:1014af42efd9	258	blockSize1--;
simon	0:1014af42efd9	259	}
simon	0:1014af42efd9	260
simon	0:1014af42efd9	261	/* --------------------------
simon	0:1014af42efd9	262	* Initializations of stage2
simon	0:1014af42efd9	263	* ------------------------*/
simon	0:1014af42efd9	264
simon	0:1014af42efd9	265	/* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
simon	0:1014af42efd9	266	* sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
simon	0:1014af42efd9	267	* ....
simon	0:1014af42efd9	268	* sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
simon	0:1014af42efd9	269	*/
simon	0:1014af42efd9	270
simon	0:1014af42efd9	271	/* Working pointer of inputA */
simon	0:1014af42efd9	272	px = pIn1;
simon	0:1014af42efd9	273
simon	0:1014af42efd9	274	/* Working pointer of inputB */
simon	0:1014af42efd9	275	pSrc2 = pIn2 + (srcBLen - 1u);
simon	0:1014af42efd9	276	py = pSrc2;
simon	0:1014af42efd9	277
simon	0:1014af42efd9	278	/* Initialize inputB pointer of type q31 */
simon	0:1014af42efd9	279	pb = (q31_t *) (py - 1u);
simon	0:1014af42efd9	280
simon	0:1014af42efd9	281	/* count is the index by which the pointer pIn1 to be incremented */
simon	0:1014af42efd9	282	count = 1u;
simon	0:1014af42efd9	283
simon	0:1014af42efd9	284
simon	0:1014af42efd9	285	/* --------------------
simon	0:1014af42efd9	286	* Stage2 process
simon	0:1014af42efd9	287	* -------------------*/
simon	0:1014af42efd9	288
simon	0:1014af42efd9	289	/* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
simon	0:1014af42efd9	290	* So, to loop unroll over blockSize2,
simon	0:1014af42efd9	291	* srcBLen should be greater than or equal to 4 */
simon	0:1014af42efd9	292	if(srcBLen >= 4u)
simon	0:1014af42efd9	293	{
simon	0:1014af42efd9	294	/* Loop unroll over blockSize2, by 4 */
simon	0:1014af42efd9	295	blkCnt = ((uint32_t) blockSize2 >> 2u);
simon	0:1014af42efd9	296
simon	0:1014af42efd9	297	while(blkCnt > 0u)
simon	0:1014af42efd9	298	{
simon	0:1014af42efd9	299	/* Set all accumulators to zero */
simon	0:1014af42efd9	300	acc0 = 0;
simon	0:1014af42efd9	301	acc1 = 0;
simon	0:1014af42efd9	302	acc2 = 0;
simon	0:1014af42efd9	303	acc3 = 0;
simon	0:1014af42efd9	304
simon	0:1014af42efd9	305
simon	0:1014af42efd9	306	/* read x[0], x[1] samples */
simon	0:1014af42efd9	307	x0 = (q31_t ) (px++);
simon	0:1014af42efd9	308	/* read x[1], x[2] samples */
simon	0:1014af42efd9	309	x1 = (q31_t ) (px++);
simon	0:1014af42efd9	310
simon	0:1014af42efd9	311
simon	0:1014af42efd9	312	/* Apply loop unrolling and compute 4 MACs simultaneously. */
simon	0:1014af42efd9	313	k = srcBLen >> 2u;
simon	0:1014af42efd9	314
simon	0:1014af42efd9	315	/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
simon	0:1014af42efd9	316	** a second loop below computes MACs for the remaining 1 to 3 samples. */
simon	0:1014af42efd9	317	do
simon	0:1014af42efd9	318	{
simon	0:1014af42efd9	319	/* Read the last two inputB samples using SIMD:
simon	0:1014af42efd9	320	* y[srcBLen - 1] and y[srcBLen - 2] */
simon	0:1014af42efd9	321	c0 = *(pb--);
simon	0:1014af42efd9	322
simon	0:1014af42efd9	323	/* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
simon	0:1014af42efd9	324	acc0 = __SMLALDX(x0, c0, acc0);
simon	0:1014af42efd9	325
simon	0:1014af42efd9	326	/* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
simon	0:1014af42efd9	327	acc1 = __SMLALDX(x1, c0, acc1);
simon	0:1014af42efd9	328
simon	0:1014af42efd9	329	/* Read x[2], x[3] */
simon	0:1014af42efd9	330	x2 = (q31_t ) (px++);
simon	0:1014af42efd9	331
simon	0:1014af42efd9	332	/* Read x[3], x[4] */
simon	0:1014af42efd9	333	x3 = (q31_t ) (px++);
simon	0:1014af42efd9	334
simon	0:1014af42efd9	335	/* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
simon	0:1014af42efd9	336	acc2 = __SMLALDX(x2, c0, acc2);
simon	0:1014af42efd9	337
simon	0:1014af42efd9	338	/* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
simon	0:1014af42efd9	339	acc3 = __SMLALDX(x3, c0, acc3);
simon	0:1014af42efd9	340
simon	0:1014af42efd9	341	/* Read y[srcBLen - 3] and y[srcBLen - 4] */
simon	0:1014af42efd9	342	c0 = *(pb--);
simon	0:1014af42efd9	343
simon	0:1014af42efd9	344	/* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
simon	0:1014af42efd9	345	acc0 = __SMLALDX(x2, c0, acc0);
simon	0:1014af42efd9	346
simon	0:1014af42efd9	347	/* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
simon	0:1014af42efd9	348	acc1 = __SMLALDX(x3, c0, acc1);
simon	0:1014af42efd9	349
simon	0:1014af42efd9	350	/* Read x[4], x[5] */
simon	0:1014af42efd9	351	x0 = (q31_t ) (px++);
simon	0:1014af42efd9	352
simon	0:1014af42efd9	353	/* Read x[5], x[6] */
simon	0:1014af42efd9	354	x1 = (q31_t ) (px++);
simon	0:1014af42efd9	355
simon	0:1014af42efd9	356	/* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
simon	0:1014af42efd9	357	acc2 = __SMLALDX(x0, c0, acc2);
simon	0:1014af42efd9	358
simon	0:1014af42efd9	359	/* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
simon	0:1014af42efd9	360	acc3 = __SMLALDX(x1, c0, acc3);
simon	0:1014af42efd9	361
simon	0:1014af42efd9	362	} while(--k);
simon	0:1014af42efd9	363
simon	0:1014af42efd9	364	/* For the next MAC operations, SIMD is not used
simon	0:1014af42efd9	365	* So, the 16 bit pointer if inputB, py is updated */
simon	0:1014af42efd9	366	py = (q15_t *) pb;
simon	0:1014af42efd9	367	py = py + 1;
simon	0:1014af42efd9	368
simon	0:1014af42efd9	369	/* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
simon	0:1014af42efd9	370	** No loop unrolling is used. */
simon	0:1014af42efd9	371	k = srcBLen % 0x4u;
simon	0:1014af42efd9	372
simon	0:1014af42efd9	373	if(k == 1u)
simon	0:1014af42efd9	374	{
simon	0:1014af42efd9	375	/* Read y[srcBLen - 5] */
simon	0:1014af42efd9	376	c0 = *(py);
simon	0:1014af42efd9	377
simon	0:1014af42efd9	378	/* Read x[7] */
simon	0:1014af42efd9	379	x3 = (q31_t ) px++;
simon	0:1014af42efd9	380
simon	0:1014af42efd9	381	/* Perform the multiply-accumulates */
simon	0:1014af42efd9	382	acc0 = __SMLALD(x0, c0, acc0);
simon	0:1014af42efd9	383	acc1 = __SMLALD(x1, c0, acc1);
simon	0:1014af42efd9	384	acc2 = __SMLALDX(x1, c0, acc2);
simon	0:1014af42efd9	385	acc3 = __SMLALDX(x3, c0, acc3);
simon	0:1014af42efd9	386	}
simon	0:1014af42efd9	387
simon	0:1014af42efd9	388	if(k == 2u)
simon	0:1014af42efd9	389	{
simon	0:1014af42efd9	390	/* Read y[srcBLen - 5], y[srcBLen - 6] */
simon	0:1014af42efd9	391	c0 = *(pb);
simon	0:1014af42efd9	392
simon	0:1014af42efd9	393	/* Read x[7], x[8] */
simon	0:1014af42efd9	394	x3 = (q31_t ) px++;
simon	0:1014af42efd9	395
simon	0:1014af42efd9	396	/* Read x[9] */
simon	0:1014af42efd9	397	x2 = (q31_t ) px++;
simon	0:1014af42efd9	398
simon	0:1014af42efd9	399	/* Perform the multiply-accumulates */
simon	0:1014af42efd9	400	acc0 = __SMLALDX(x0, c0, acc0);
simon	0:1014af42efd9	401	acc1 = __SMLALDX(x1, c0, acc1);
simon	0:1014af42efd9	402	acc2 = __SMLALDX(x3, c0, acc2);
simon	0:1014af42efd9	403	acc3 = __SMLALDX(x2, c0, acc3);
simon	0:1014af42efd9	404	}
simon	0:1014af42efd9	405
simon	0:1014af42efd9	406	if(k == 3u)
simon	0:1014af42efd9	407	{
simon	0:1014af42efd9	408	/* Read y[srcBLen - 5], y[srcBLen - 6] */
simon	0:1014af42efd9	409	c0 = *pb--;
simon	0:1014af42efd9	410
simon	0:1014af42efd9	411	/* Read x[7], x[8] */
simon	0:1014af42efd9	412	x3 = (q31_t ) px++;
simon	0:1014af42efd9	413
simon	0:1014af42efd9	414	/* Read x[9] */
simon	0:1014af42efd9	415	x2 = (q31_t ) px++;
simon	0:1014af42efd9	416
simon	0:1014af42efd9	417	/* Perform the multiply-accumulates */
simon	0:1014af42efd9	418	acc0 = __SMLALDX(x0, c0, acc0);
simon	0:1014af42efd9	419	acc1 = __SMLALDX(x1, c0, acc1);
simon	0:1014af42efd9	420	acc2 = __SMLALDX(x3, c0, acc2);
simon	0:1014af42efd9	421	acc3 = __SMLALDX(x2, c0, acc3);
simon	0:1014af42efd9	422
simon	0:1014af42efd9	423	/* Read y[srcBLen - 7] */
simon	0:1014af42efd9	424	c0 = (q15_t) (*pb >> 16);
simon	0:1014af42efd9	425
simon	0:1014af42efd9	426	/* Read x[10] */
simon	0:1014af42efd9	427	x3 = (q31_t ) px++;
simon	0:1014af42efd9	428
simon	0:1014af42efd9	429	/* Perform the multiply-accumulates */
simon	0:1014af42efd9	430	acc0 = __SMLALDX(x1, c0, acc0);
simon	0:1014af42efd9	431	acc1 = __SMLALD(x2, c0, acc1);
simon	0:1014af42efd9	432	acc2 = __SMLALDX(x2, c0, acc2);
simon	0:1014af42efd9	433	acc3 = __SMLALDX(x3, c0, acc3);
simon	0:1014af42efd9	434	}
simon	0:1014af42efd9	435
simon	0:1014af42efd9	436	/* Store the results in the accumulators in the destination buffer. */
simon	0:1014af42efd9	437	*__SIMD32(pOut)++ =
simon	0:1014af42efd9	438	__PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16);
simon	0:1014af42efd9	439	*__SIMD32(pOut)++ =
simon	0:1014af42efd9	440	__PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16);
simon	0:1014af42efd9	441
simon	0:1014af42efd9	442	/* Update the inputA and inputB pointers for next MAC calculation */
simon	0:1014af42efd9	443	px = pIn1 + (count * 4u);
simon	0:1014af42efd9	444	py = pSrc2;
simon	0:1014af42efd9	445	pb = (q31_t *) (py - 1);
simon	0:1014af42efd9	446
simon	0:1014af42efd9	447	/* Increment the pointer pIn1 index, count by 1 */
simon	0:1014af42efd9	448	count++;
simon	0:1014af42efd9	449
simon	0:1014af42efd9	450	/* Decrement the loop counter */
simon	0:1014af42efd9	451	blkCnt--;
simon	0:1014af42efd9	452	}
simon	0:1014af42efd9	453
simon	0:1014af42efd9	454	/* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
simon	0:1014af42efd9	455	** No loop unrolling is used. */
simon	0:1014af42efd9	456	blkCnt = (uint32_t) blockSize2 % 0x4u;
simon	0:1014af42efd9	457
simon	0:1014af42efd9	458	while(blkCnt > 0u)
simon	0:1014af42efd9	459	{
simon	0:1014af42efd9	460	/* Accumulator is made zero for every iteration */
simon	0:1014af42efd9	461	sum = 0;
simon	0:1014af42efd9	462
simon	0:1014af42efd9	463	/* Apply loop unrolling and compute 4 MACs simultaneously. */
simon	0:1014af42efd9	464	k = srcBLen >> 2u;
simon	0:1014af42efd9	465
simon	0:1014af42efd9	466	/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
simon	0:1014af42efd9	467	** a second loop below computes MACs for the remaining 1 to 3 samples. */
simon	0:1014af42efd9	468	while(k > 0u)
simon	0:1014af42efd9	469	{
simon	0:1014af42efd9	470	/* Perform the multiply-accumulates */
simon	0:1014af42efd9	471	sum += (q63_t) ((q31_t) * px++ * *py--);
simon	0:1014af42efd9	472	sum += (q63_t) ((q31_t) * px++ * *py--);
simon	0:1014af42efd9	473	sum += (q63_t) ((q31_t) * px++ * *py--);
simon	0:1014af42efd9	474	sum += (q63_t) ((q31_t) * px++ * *py--);
simon	0:1014af42efd9	475
simon	0:1014af42efd9	476	/* Decrement the loop counter */
simon	0:1014af42efd9	477	k--;
simon	0:1014af42efd9	478	}
simon	0:1014af42efd9	479
simon	0:1014af42efd9	480	/* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
simon	0:1014af42efd9	481	** No loop unrolling is used. */
simon	0:1014af42efd9	482	k = srcBLen % 0x4u;
simon	0:1014af42efd9	483
simon	0:1014af42efd9	484	while(k > 0u)
simon	0:1014af42efd9	485	{
simon	0:1014af42efd9	486	/* Perform the multiply-accumulates */
simon	0:1014af42efd9	487	sum += (q63_t) ((q31_t) * px++ * *py--);
simon	0:1014af42efd9	488
simon	0:1014af42efd9	489	/* Decrement the loop counter */
simon	0:1014af42efd9	490	k--;
simon	0:1014af42efd9	491	}
simon	0:1014af42efd9	492
simon	0:1014af42efd9	493	/* Store the result in the accumulator in the destination buffer. */
simon	0:1014af42efd9	494	*pOut++ = (q15_t) (__SSAT(sum >> 15, 16));
simon	0:1014af42efd9	495
simon	0:1014af42efd9	496	/* Update the inputA and inputB pointers for next MAC calculation */
simon	0:1014af42efd9	497	px = pIn1 + count;
simon	0:1014af42efd9	498	py = pSrc2;
simon	0:1014af42efd9	499
simon	0:1014af42efd9	500	/* Increment the pointer pIn1 index, count by 1 */
simon	0:1014af42efd9	501	count++;
simon	0:1014af42efd9	502
simon	0:1014af42efd9	503	/* Decrement the loop counter */
simon	0:1014af42efd9	504	blkCnt--;
simon	0:1014af42efd9	505	}
simon	0:1014af42efd9	506	}
simon	0:1014af42efd9	507	else
simon	0:1014af42efd9	508	{
simon	0:1014af42efd9	509	/* If the srcBLen is not a multiple of 4,
simon	0:1014af42efd9	510	* the blockSize2 loop cannot be unrolled by 4 */
simon	0:1014af42efd9	511	blkCnt = (uint32_t) blockSize2;
simon	0:1014af42efd9	512
simon	0:1014af42efd9	513	while(blkCnt > 0u)
simon	0:1014af42efd9	514	{
simon	0:1014af42efd9	515	/* Accumulator is made zero for every iteration */
simon	0:1014af42efd9	516	sum = 0;
simon	0:1014af42efd9	517
simon	0:1014af42efd9	518	/* srcBLen number of MACS should be performed */
simon	0:1014af42efd9	519	k = srcBLen;
simon	0:1014af42efd9	520
simon	0:1014af42efd9	521	while(k > 0u)
simon	0:1014af42efd9	522	{
simon	0:1014af42efd9	523	/* Perform the multiply-accumulate */
simon	0:1014af42efd9	524	sum += (q63_t) ((q31_t) * px++ * *py--);
simon	0:1014af42efd9	525
simon	0:1014af42efd9	526	/* Decrement the loop counter */
simon	0:1014af42efd9	527	k--;
simon	0:1014af42efd9	528	}
simon	0:1014af42efd9	529
simon	0:1014af42efd9	530	/* Store the result in the accumulator in the destination buffer. */
simon	0:1014af42efd9	531	*pOut++ = (q15_t) (__SSAT(sum >> 15, 16));
simon	0:1014af42efd9	532
simon	0:1014af42efd9	533	/* Update the inputA and inputB pointers for next MAC calculation */
simon	0:1014af42efd9	534	px = pIn1 + count;
simon	0:1014af42efd9	535	py = pSrc2;
simon	0:1014af42efd9	536
simon	0:1014af42efd9	537	/* Increment the MAC count */
simon	0:1014af42efd9	538	count++;
simon	0:1014af42efd9	539
simon	0:1014af42efd9	540	/* Decrement the loop counter */
simon	0:1014af42efd9	541	blkCnt--;
simon	0:1014af42efd9	542	}
simon	0:1014af42efd9	543	}
simon	0:1014af42efd9	544
simon	0:1014af42efd9	545
simon	0:1014af42efd9	546	/* --------------------------
simon	0:1014af42efd9	547	* Initializations of stage3
simon	0:1014af42efd9	548	* -------------------------*/
simon	0:1014af42efd9	549
simon	0:1014af42efd9	550	/* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
simon	0:1014af42efd9	551	* sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
simon	0:1014af42efd9	552	* ....
simon	0:1014af42efd9	553	* sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
simon	0:1014af42efd9	554	* sum += x[srcALen-1] * y[srcBLen-1]
simon	0:1014af42efd9	555	*/
simon	0:1014af42efd9	556
simon	0:1014af42efd9	557	/* In this stage the MAC operations are decreased by 1 for every iteration.
simon	0:1014af42efd9	558	The count variable holds the number of MAC operations performed */
simon	0:1014af42efd9	559	count = srcBLen - 1u;
simon	0:1014af42efd9	560
simon	0:1014af42efd9	561	/* Working pointer of inputA */
simon	0:1014af42efd9	562	pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
simon	0:1014af42efd9	563	px = pSrc1;
simon	0:1014af42efd9	564
simon	0:1014af42efd9	565	/* Working pointer of inputB */
simon	0:1014af42efd9	566	pSrc2 = pIn2 + (srcBLen - 1u);
simon	0:1014af42efd9	567	pIn2 = pSrc2 - 1u;
simon	0:1014af42efd9	568	py = pIn2;
simon	0:1014af42efd9	569
simon	0:1014af42efd9	570	/* -------------------
simon	0:1014af42efd9	571	* Stage3 process
simon	0:1014af42efd9	572	* ------------------*/
simon	0:1014af42efd9	573
simon	0:1014af42efd9	574	/* For loop unrolling by 4, this stage is divided into two. */
simon	0:1014af42efd9	575	/* First part of this stage computes the MAC operations greater than 4 */
simon	0:1014af42efd9	576	/* Second part of this stage computes the MAC operations less than or equal to 4 */
simon	0:1014af42efd9	577
simon	0:1014af42efd9	578	/* The first part of the stage starts here */
simon	0:1014af42efd9	579	j = count >> 2u;
simon	0:1014af42efd9	580
simon	0:1014af42efd9	581	while((j > 0u) && (blockSize3 > 0))
simon	0:1014af42efd9	582	{
simon	0:1014af42efd9	583	/* Accumulator is made zero for every iteration */
simon	0:1014af42efd9	584	sum = 0;
simon	0:1014af42efd9	585
simon	0:1014af42efd9	586	/* Apply loop unrolling and compute 4 MACs simultaneously. */
simon	0:1014af42efd9	587	k = count >> 2u;
simon	0:1014af42efd9	588
simon	0:1014af42efd9	589	/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
simon	0:1014af42efd9	590	** a second loop below computes MACs for the remaining 1 to 3 samples. */
simon	0:1014af42efd9	591	while(k > 0u)
simon	0:1014af42efd9	592	{
simon	0:1014af42efd9	593	/* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied
simon	0:1014af42efd9	594	* with y[srcBLen - 1], y[srcBLen - 2] respectively */
simon	0:1014af42efd9	595	sum = __SMLALDX(__SIMD32(px)++, __SIMD32(py)--, sum);
simon	0:1014af42efd9	596	/* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied
simon	0:1014af42efd9	597	* with y[srcBLen - 3], y[srcBLen - 4] respectively */
simon	0:1014af42efd9	598	sum = __SMLALDX(__SIMD32(px)++, __SIMD32(py)--, sum);
simon	0:1014af42efd9	599
simon	0:1014af42efd9	600	/* Decrement the loop counter */
simon	0:1014af42efd9	601	k--;
simon	0:1014af42efd9	602	}
simon	0:1014af42efd9	603
simon	0:1014af42efd9	604	/* For the next MAC operations, the pointer py is used without SIMD
simon	0:1014af42efd9	605	* So, py is incremented by 1 */
simon	0:1014af42efd9	606	py = py + 1u;
simon	0:1014af42efd9	607
simon	0:1014af42efd9	608	/* If the count is not a multiple of 4, compute any remaining MACs here.
simon	0:1014af42efd9	609	** No loop unrolling is used. */
simon	0:1014af42efd9	610	k = count % 0x4u;
simon	0:1014af42efd9	611
simon	0:1014af42efd9	612	while(k > 0u)
simon	0:1014af42efd9	613	{
simon	0:1014af42efd9	614	/* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */
simon	0:1014af42efd9	615	sum = __SMLALD(px++, py--, sum);
simon	0:1014af42efd9	616
simon	0:1014af42efd9	617	/* Decrement the loop counter */
simon	0:1014af42efd9	618	k--;
simon	0:1014af42efd9	619	}
simon	0:1014af42efd9	620
simon	0:1014af42efd9	621	/* Store the result in the accumulator in the destination buffer. */
simon	0:1014af42efd9	622	*pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
simon	0:1014af42efd9	623
simon	0:1014af42efd9	624	/* Update the inputA and inputB pointers for next MAC calculation */
simon	0:1014af42efd9	625	px = ++pSrc1;
simon	0:1014af42efd9	626	py = pIn2;
simon	0:1014af42efd9	627
simon	0:1014af42efd9	628	/* Decrement the MAC count */
simon	0:1014af42efd9	629	count--;
simon	0:1014af42efd9	630
simon	0:1014af42efd9	631	/* Decrement the loop counter */
simon	0:1014af42efd9	632	blockSize3--;
simon	0:1014af42efd9	633
simon	0:1014af42efd9	634	j--;
simon	0:1014af42efd9	635	}
simon	0:1014af42efd9	636
simon	0:1014af42efd9	637	/* The second part of the stage starts here */
simon	0:1014af42efd9	638	/* SIMD is not used for the next MAC operations,
simon	0:1014af42efd9	639	* so pointer py is updated to read only one sample at a time */
simon	0:1014af42efd9	640	py = py + 1u;
simon	0:1014af42efd9	641
simon	0:1014af42efd9	642	while(blockSize3 > 0)
simon	0:1014af42efd9	643	{
simon	0:1014af42efd9	644	/* Accumulator is made zero for every iteration */
simon	0:1014af42efd9	645	sum = 0;
simon	0:1014af42efd9	646
simon	0:1014af42efd9	647	/* Apply loop unrolling and compute 4 MACs simultaneously. */
simon	0:1014af42efd9	648	k = count;
simon	0:1014af42efd9	649
simon	0:1014af42efd9	650	while(k > 0u)
simon	0:1014af42efd9	651	{
simon	0:1014af42efd9	652	/* Perform the multiply-accumulates */
simon	0:1014af42efd9	653	/* sum += x[srcALen-1] * y[srcBLen-1] */
simon	0:1014af42efd9	654	sum = __SMLALD(px++, py--, sum);
simon	0:1014af42efd9	655
simon	0:1014af42efd9	656	/* Decrement the loop counter */
simon	0:1014af42efd9	657	k--;
simon	0:1014af42efd9	658	}
simon	0:1014af42efd9	659
simon	0:1014af42efd9	660	/* Store the result in the accumulator in the destination buffer. */
simon	0:1014af42efd9	661	*pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
simon	0:1014af42efd9	662
simon	0:1014af42efd9	663	/* Update the inputA and inputB pointers for next MAC calculation */
simon	0:1014af42efd9	664	px = ++pSrc1;
simon	0:1014af42efd9	665	py = pSrc2;
simon	0:1014af42efd9	666
simon	0:1014af42efd9	667	/* Decrement the MAC count */
simon	0:1014af42efd9	668	count--;
simon	0:1014af42efd9	669
simon	0:1014af42efd9	670	/* Decrement the loop counter */
simon	0:1014af42efd9	671	blockSize3--;
simon	0:1014af42efd9	672	}
simon	0:1014af42efd9	673
simon	0:1014af42efd9	674	/* set status as ARM_MATH_SUCCESS */
simon	0:1014af42efd9	675	status = ARM_MATH_SUCCESS;
simon	0:1014af42efd9	676	}
simon	0:1014af42efd9	677
simon	0:1014af42efd9	678	/* Return to application */
simon	0:1014af42efd9	679	return (status);
simon	0:1014af42efd9	680
simon	0:1014af42efd9	681	}
simon	0:1014af42efd9	682
simon	0:1014af42efd9	683	/**
simon	0:1014af42efd9	684	* @} end of PartialConv group
simon	0:1014af42efd9	685	*/

Repository toolbox

Export to desktop IDE

Repository details

Type:	Library
Created:	10 Mar 2011
Imports:	907
Forks:	1
Commits:	3
Dependents:	5
Dependencies:	0
Followers:	35

src/Cortex-M4-M3/FilteringFunctions/arm_conv_partial_q15.c@0:1014af42efd9, 2011-03-10 (annotated)

Who changed what in which revision?

Repository toolbox

Repository details

Important Information for this Arm website

Access Warning