dsp - CMSIS DSP Library from CMSIS 2.0. See http://www.onarm.com/cmsis/ for full details

Users » simon » Code » dsp

CMSIS DSP Library from CMSIS 2.0. See http://www.onarm.com/cmsis/ for full details

Dependents: K22F_DSP_Matrix_least_square BNO055-ELEC3810 1BNO055 ECE4180Project--Slave2 ... more

src/Cortex-M4-M3/FilteringFunctions/arm_conv_partial_q7.c@0:1014af42efd9, 2011-03-10 (annotated)

Committer: simon
Date: Thu Mar 10 15:07:50 2011 +0000
Revision: 0:1014af42efd9

Who changed what in which revision?

User	Revision	Line number	New contents of line
simon	0:1014af42efd9	1	/* ----------------------------------------------------------------------
simon	0:1014af42efd9	2	* Copyright (C) 2010 ARM Limited. All rights reserved.
simon	0:1014af42efd9	3	*
simon	0:1014af42efd9	4	* $Date: 29. November 2010
simon	0:1014af42efd9	5	* $Revision: V1.0.3
simon	0:1014af42efd9	6	*
simon	0:1014af42efd9	7	* Project: CMSIS DSP Library
simon	0:1014af42efd9	8	* Title: arm_conv_partial_q7.c
simon	0:1014af42efd9	9	*
simon	0:1014af42efd9	10	* Description: Q7 Partial convolution.
simon	0:1014af42efd9	11	*
simon	0:1014af42efd9	12	* Target Processor: Cortex-M4/Cortex-M3
simon	0:1014af42efd9	13	*
simon	0:1014af42efd9	14	* Version 1.0.3 2010/11/29
simon	0:1014af42efd9	15	* Re-organized the CMSIS folders and updated documentation.
simon	0:1014af42efd9	16	*
simon	0:1014af42efd9	17	* Version 1.0.2 2010/11/11
simon	0:1014af42efd9	18	* Documentation updated.
simon	0:1014af42efd9	19	*
simon	0:1014af42efd9	20	* Version 1.0.1 2010/10/05
simon	0:1014af42efd9	21	* Production release and review comments incorporated.
simon	0:1014af42efd9	22	*
simon	0:1014af42efd9	23	* Version 1.0.0 2010/09/20
simon	0:1014af42efd9	24	* Production release and review comments incorporated
simon	0:1014af42efd9	25	*
simon	0:1014af42efd9	26	* Version 0.0.7 2010/06/10
simon	0:1014af42efd9	27	* Misra-C changes done
simon	0:1014af42efd9	28	*
simon	0:1014af42efd9	29	* -------------------------------------------------------------------- */
simon	0:1014af42efd9	30
simon	0:1014af42efd9	31	#include "arm_math.h"
simon	0:1014af42efd9	32
simon	0:1014af42efd9	33	/**
simon	0:1014af42efd9	34	* @ingroup groupFilters
simon	0:1014af42efd9	35	*/
simon	0:1014af42efd9	36
simon	0:1014af42efd9	37	/**
simon	0:1014af42efd9	38	* @addtogroup PartialConv
simon	0:1014af42efd9	39	* @{
simon	0:1014af42efd9	40	*/
simon	0:1014af42efd9	41
simon	0:1014af42efd9	42	/**
simon	0:1014af42efd9	43	* @brief Partial convolution of Q7 sequences
simon	0:1014af42efd9	44	* @param[in] *pSrcA points to the first input sequence.
simon	0:1014af42efd9	45	* @param[in] srcALen length of the first input sequence.
simon	0:1014af42efd9	46	* @param[in] *pSrcB points to the second input sequence.
simon	0:1014af42efd9	47	* @param[in] srcBLen length of the second input sequence.
simon	0:1014af42efd9	48	* @param[out] *pDst points to the location where the output result is written.
simon	0:1014af42efd9	49	* @param[in] firstIndex is the first output sample to start with.
simon	0:1014af42efd9	50	* @param[in] numPoints is the number of output points to be computed.
simon	0:1014af42efd9	51	* @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].
simon	0:1014af42efd9	52	*
simon	0:1014af42efd9	53	*/
simon	0:1014af42efd9	54
simon	0:1014af42efd9	55	arm_status arm_conv_partial_q7(
simon	0:1014af42efd9	56	q7_t * pSrcA,
simon	0:1014af42efd9	57	uint32_t srcALen,
simon	0:1014af42efd9	58	q7_t * pSrcB,
simon	0:1014af42efd9	59	uint32_t srcBLen,
simon	0:1014af42efd9	60	q7_t * pDst,
simon	0:1014af42efd9	61	uint32_t firstIndex,
simon	0:1014af42efd9	62	uint32_t numPoints)
simon	0:1014af42efd9	63	{
simon	0:1014af42efd9	64	q7_t pIn1; / inputA pointer */
simon	0:1014af42efd9	65	q7_t pIn2; / inputB pointer */
simon	0:1014af42efd9	66	q7_t pOut = pDst; / output pointer */
simon	0:1014af42efd9	67	q7_t px; / Intermediate inputA pointer */
simon	0:1014af42efd9	68	q7_t py; / Intermediate inputB pointer */
simon	0:1014af42efd9	69	q7_t pSrc1, pSrc2; /* Intermediate pointers */
simon	0:1014af42efd9	70	q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
simon	0:1014af42efd9	71	q31_t input1, input2;
simon	0:1014af42efd9	72	q15_t in1, in2;
simon	0:1014af42efd9	73	q7_t x0, x1, x2, x3, c0, c1;
simon	0:1014af42efd9	74	uint32_t j, k, count, check, blkCnt;
simon	0:1014af42efd9	75	int32_t blockSize1, blockSize2, blockSize3; /* loop counter */
simon	0:1014af42efd9	76	arm_status status;
simon	0:1014af42efd9	77
simon	0:1014af42efd9	78
simon	0:1014af42efd9	79	/* Check for range of output samples to be calculated */
simon	0:1014af42efd9	80	if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
simon	0:1014af42efd9	81	{
simon	0:1014af42efd9	82	/* Set status as ARM_MATH_ARGUMENT_ERROR */
simon	0:1014af42efd9	83	status = ARM_MATH_ARGUMENT_ERROR;
simon	0:1014af42efd9	84	}
simon	0:1014af42efd9	85	else
simon	0:1014af42efd9	86	{
simon	0:1014af42efd9	87
simon	0:1014af42efd9	88	/* The algorithm implementation is based on the lengths of the inputs. */
simon	0:1014af42efd9	89	/* srcB is always made to slide across srcA. */
simon	0:1014af42efd9	90	/* So srcBLen is always considered as shorter or equal to srcALen */
simon	0:1014af42efd9	91	if(srcALen >= srcBLen)
simon	0:1014af42efd9	92	{
simon	0:1014af42efd9	93	/* Initialization of inputA pointer */
simon	0:1014af42efd9	94	pIn1 = pSrcA;
simon	0:1014af42efd9	95
simon	0:1014af42efd9	96	/* Initialization of inputB pointer */
simon	0:1014af42efd9	97	pIn2 = pSrcB;
simon	0:1014af42efd9	98	}
simon	0:1014af42efd9	99	else
simon	0:1014af42efd9	100	{
simon	0:1014af42efd9	101	/* Initialization of inputA pointer */
simon	0:1014af42efd9	102	pIn1 = pSrcB;
simon	0:1014af42efd9	103
simon	0:1014af42efd9	104	/* Initialization of inputB pointer */
simon	0:1014af42efd9	105	pIn2 = pSrcA;
simon	0:1014af42efd9	106
simon	0:1014af42efd9	107	/* srcBLen is always considered as shorter or equal to srcALen */
simon	0:1014af42efd9	108	j = srcBLen;
simon	0:1014af42efd9	109	srcBLen = srcALen;
simon	0:1014af42efd9	110	srcALen = j;
simon	0:1014af42efd9	111	}
simon	0:1014af42efd9	112
simon	0:1014af42efd9	113	/* Conditions to check which loopCounter holds
simon	0:1014af42efd9	114	* the first and last indices of the output samples to be calculated. */
simon	0:1014af42efd9	115	check = firstIndex + numPoints;
simon	0:1014af42efd9	116	blockSize3 = ((int32_t) check - (int32_t) srcALen);
simon	0:1014af42efd9	117	blockSize3 = (blockSize3 > 0) ? blockSize3 : 0;
simon	0:1014af42efd9	118	blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex);
simon	0:1014af42efd9	119	blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 :
simon	0:1014af42efd9	120	(int32_t) numPoints) : 0;
simon	0:1014af42efd9	121	blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) +
simon	0:1014af42efd9	122	(int32_t) firstIndex);
simon	0:1014af42efd9	123	blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
simon	0:1014af42efd9	124
simon	0:1014af42efd9	125	/* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
simon	0:1014af42efd9	126	/* The function is internally
simon	0:1014af42efd9	127	* divided into three stages according to the number of multiplications that has to be
simon	0:1014af42efd9	128	* taken place between inputA samples and inputB samples. In the first stage of the
simon	0:1014af42efd9	129	* algorithm, the multiplications increase by one for every iteration.
simon	0:1014af42efd9	130	* In the second stage of the algorithm, srcBLen number of multiplications are done.
simon	0:1014af42efd9	131	* In the third stage of the algorithm, the multiplications decrease by one
simon	0:1014af42efd9	132	* for every iteration. */
simon	0:1014af42efd9	133
simon	0:1014af42efd9	134	/* Set the output pointer to point to the firstIndex
simon	0:1014af42efd9	135	* of the output sample to be calculated. */
simon	0:1014af42efd9	136	pOut = pDst + firstIndex;
simon	0:1014af42efd9	137
simon	0:1014af42efd9	138	/* --------------------------
simon	0:1014af42efd9	139	* Initializations of stage1
simon	0:1014af42efd9	140	* -------------------------*/
simon	0:1014af42efd9	141
simon	0:1014af42efd9	142	/* sum = x[0] * y[0]
simon	0:1014af42efd9	143	* sum = x[0] * y[1] + x[1] * y[0]
simon	0:1014af42efd9	144	* ....
simon	0:1014af42efd9	145	* sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
simon	0:1014af42efd9	146	*/
simon	0:1014af42efd9	147
simon	0:1014af42efd9	148	/* In this stage the MAC operations are increased by 1 for every iteration.
simon	0:1014af42efd9	149	The count variable holds the number of MAC operations performed.
simon	0:1014af42efd9	150	Since the partial convolution starts from from firstIndex
simon	0:1014af42efd9	151	Number of Macs to be performed is firstIndex + 1 */
simon	0:1014af42efd9	152	count = 1u + firstIndex;
simon	0:1014af42efd9	153
simon	0:1014af42efd9	154	/* Working pointer of inputA */
simon	0:1014af42efd9	155	px = pIn1;
simon	0:1014af42efd9	156
simon	0:1014af42efd9	157	/* Working pointer of inputB */
simon	0:1014af42efd9	158	pSrc2 = pIn2 + firstIndex;
simon	0:1014af42efd9	159	py = pSrc2;
simon	0:1014af42efd9	160
simon	0:1014af42efd9	161	/* ------------------------
simon	0:1014af42efd9	162	* Stage1 process
simon	0:1014af42efd9	163	* ----------------------*/
simon	0:1014af42efd9	164
simon	0:1014af42efd9	165	/* The first stage starts here */
simon	0:1014af42efd9	166	while(blockSize1 > 0)
simon	0:1014af42efd9	167	{
simon	0:1014af42efd9	168	/* Accumulator is made zero for every iteration */
simon	0:1014af42efd9	169	sum = 0;
simon	0:1014af42efd9	170
simon	0:1014af42efd9	171	/* Apply loop unrolling and compute 4 MACs simultaneously. */
simon	0:1014af42efd9	172	k = count >> 2u;
simon	0:1014af42efd9	173
simon	0:1014af42efd9	174	/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
simon	0:1014af42efd9	175	** a second loop below computes MACs for the remaining 1 to 3 samples. */
simon	0:1014af42efd9	176	while(k > 0u)
simon	0:1014af42efd9	177	{
simon	0:1014af42efd9	178	/* x[0] , x[1] */
simon	0:1014af42efd9	179	in1 = (q15_t) * px++;
simon	0:1014af42efd9	180	in2 = (q15_t) * px++;
simon	0:1014af42efd9	181	input1 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16);
simon	0:1014af42efd9	182
simon	0:1014af42efd9	183	/* y[srcBLen - 1] , y[srcBLen - 2] */
simon	0:1014af42efd9	184	in1 = (q15_t) * py--;
simon	0:1014af42efd9	185	in2 = (q15_t) * py--;
simon	0:1014af42efd9	186	input2 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16);
simon	0:1014af42efd9	187
simon	0:1014af42efd9	188	/* x[0] * y[srcBLen - 1] */
simon	0:1014af42efd9	189	/* x[1] * y[srcBLen - 2] */
simon	0:1014af42efd9	190	sum = __SMLAD(input1, input2, sum);
simon	0:1014af42efd9	191
simon	0:1014af42efd9	192	/* x[2] , x[3] */
simon	0:1014af42efd9	193	in1 = (q15_t) * px++;
simon	0:1014af42efd9	194	in2 = (q15_t) * px++;
simon	0:1014af42efd9	195	input1 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16);
simon	0:1014af42efd9	196
simon	0:1014af42efd9	197	/* y[srcBLen - 3] , y[srcBLen - 4] */
simon	0:1014af42efd9	198	in1 = (q15_t) * py--;
simon	0:1014af42efd9	199	in2 = (q15_t) * py--;
simon	0:1014af42efd9	200	input2 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16);
simon	0:1014af42efd9	201
simon	0:1014af42efd9	202	/* x[2] * y[srcBLen - 3] */
simon	0:1014af42efd9	203	/* x[3] * y[srcBLen - 4] */
simon	0:1014af42efd9	204	sum = __SMLAD(input1, input2, sum);
simon	0:1014af42efd9	205
simon	0:1014af42efd9	206	/* Decrement the loop counter */
simon	0:1014af42efd9	207	k--;
simon	0:1014af42efd9	208	}
simon	0:1014af42efd9	209
simon	0:1014af42efd9	210	/* If the count is not a multiple of 4, compute any remaining MACs here.
simon	0:1014af42efd9	211	** No loop unrolling is used. */
simon	0:1014af42efd9	212	k = count % 0x4u;
simon	0:1014af42efd9	213
simon	0:1014af42efd9	214	while(k > 0u)
simon	0:1014af42efd9	215	{
simon	0:1014af42efd9	216	/* Perform the multiply-accumulates */
simon	0:1014af42efd9	217	sum += ((q31_t) * px++ * *py--);
simon	0:1014af42efd9	218
simon	0:1014af42efd9	219	/* Decrement the loop counter */
simon	0:1014af42efd9	220	k--;
simon	0:1014af42efd9	221	}
simon	0:1014af42efd9	222
simon	0:1014af42efd9	223	/* Store the result in the accumulator in the destination buffer. */
simon	0:1014af42efd9	224	*pOut++ = (q7_t) (__SSAT(sum >> 7, 8));
simon	0:1014af42efd9	225
simon	0:1014af42efd9	226	/* Update the inputA and inputB pointers for next MAC calculation */
simon	0:1014af42efd9	227	py = ++pSrc2;
simon	0:1014af42efd9	228	px = pIn1;
simon	0:1014af42efd9	229
simon	0:1014af42efd9	230	/* Increment the MAC count */
simon	0:1014af42efd9	231	count++;
simon	0:1014af42efd9	232
simon	0:1014af42efd9	233	/* Decrement the loop counter */
simon	0:1014af42efd9	234	blockSize1--;
simon	0:1014af42efd9	235	}
simon	0:1014af42efd9	236
simon	0:1014af42efd9	237	/* --------------------------
simon	0:1014af42efd9	238	* Initializations of stage2
simon	0:1014af42efd9	239	* ------------------------*/
simon	0:1014af42efd9	240
simon	0:1014af42efd9	241	/* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
simon	0:1014af42efd9	242	* sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
simon	0:1014af42efd9	243	* ....
simon	0:1014af42efd9	244	* sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
simon	0:1014af42efd9	245	*/
simon	0:1014af42efd9	246
simon	0:1014af42efd9	247	/* Working pointer of inputA */
simon	0:1014af42efd9	248	px = pIn1;
simon	0:1014af42efd9	249
simon	0:1014af42efd9	250	/* Working pointer of inputB */
simon	0:1014af42efd9	251	pSrc2 = pIn2 + (srcBLen - 1u);
simon	0:1014af42efd9	252	py = pSrc2;
simon	0:1014af42efd9	253
simon	0:1014af42efd9	254	/* count is index by which the pointer pIn1 to be incremented */
simon	0:1014af42efd9	255	count = 1u;
simon	0:1014af42efd9	256
simon	0:1014af42efd9	257	/* -------------------
simon	0:1014af42efd9	258	* Stage2 process
simon	0:1014af42efd9	259	* ------------------*/
simon	0:1014af42efd9	260
simon	0:1014af42efd9	261	/* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
simon	0:1014af42efd9	262	* So, to loop unroll over blockSize2,
simon	0:1014af42efd9	263	* srcBLen should be greater than or equal to 4 */
simon	0:1014af42efd9	264	if(srcBLen >= 4u)
simon	0:1014af42efd9	265	{
simon	0:1014af42efd9	266	/* Loop unroll over blockSize2, by 4 */
simon	0:1014af42efd9	267	blkCnt = ((uint32_t) blockSize2 >> 2u);
simon	0:1014af42efd9	268
simon	0:1014af42efd9	269	while(blkCnt > 0u)
simon	0:1014af42efd9	270	{
simon	0:1014af42efd9	271	/* Set all accumulators to zero */
simon	0:1014af42efd9	272	acc0 = 0;
simon	0:1014af42efd9	273	acc1 = 0;
simon	0:1014af42efd9	274	acc2 = 0;
simon	0:1014af42efd9	275	acc3 = 0;
simon	0:1014af42efd9	276
simon	0:1014af42efd9	277	/* read x[0], x[1], x[2] samples */
simon	0:1014af42efd9	278	x0 = *(px++);
simon	0:1014af42efd9	279	x1 = *(px++);
simon	0:1014af42efd9	280	x2 = *(px++);
simon	0:1014af42efd9	281
simon	0:1014af42efd9	282	/* Apply loop unrolling and compute 4 MACs simultaneously. */
simon	0:1014af42efd9	283	k = srcBLen >> 2u;
simon	0:1014af42efd9	284
simon	0:1014af42efd9	285	/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
simon	0:1014af42efd9	286	** a second loop below computes MACs for the remaining 1 to 3 samples. */
simon	0:1014af42efd9	287	do
simon	0:1014af42efd9	288	{
simon	0:1014af42efd9	289	/* Read y[srcBLen - 1] sample */
simon	0:1014af42efd9	290	c0 = *(py--);
simon	0:1014af42efd9	291	/* Read y[srcBLen - 2] sample */
simon	0:1014af42efd9	292	c1 = *(py--);
simon	0:1014af42efd9	293
simon	0:1014af42efd9	294	/* Read x[3] sample */
simon	0:1014af42efd9	295	x3 = *(px++);
simon	0:1014af42efd9	296
simon	0:1014af42efd9	297	/* x[0] and x[1] are packed */
simon	0:1014af42efd9	298	in1 = (q15_t) x0;
simon	0:1014af42efd9	299	in2 = (q15_t) x1;
simon	0:1014af42efd9	300
simon	0:1014af42efd9	301	input1 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16);
simon	0:1014af42efd9	302
simon	0:1014af42efd9	303	/* y[srcBLen - 1] and y[srcBLen - 2] are packed */
simon	0:1014af42efd9	304	in1 = (q15_t) c0;
simon	0:1014af42efd9	305	in2 = (q15_t) c1;
simon	0:1014af42efd9	306
simon	0:1014af42efd9	307	input2 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16);
simon	0:1014af42efd9	308
simon	0:1014af42efd9	309	/* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
simon	0:1014af42efd9	310	acc0 = __SMLAD(input1, input2, acc0);
simon	0:1014af42efd9	311
simon	0:1014af42efd9	312	/* x[1] and x[2] are packed */
simon	0:1014af42efd9	313	in1 = (q15_t) x1;
simon	0:1014af42efd9	314	in2 = (q15_t) x2;
simon	0:1014af42efd9	315
simon	0:1014af42efd9	316	input1 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16);
simon	0:1014af42efd9	317
simon	0:1014af42efd9	318	/* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
simon	0:1014af42efd9	319	acc1 = __SMLAD(input1, input2, acc1);
simon	0:1014af42efd9	320
simon	0:1014af42efd9	321	/* x[2] and x[3] are packed */
simon	0:1014af42efd9	322	in1 = (q15_t) x2;
simon	0:1014af42efd9	323	in2 = (q15_t) x3;
simon	0:1014af42efd9	324
simon	0:1014af42efd9	325	input1 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16);
simon	0:1014af42efd9	326
simon	0:1014af42efd9	327	/* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
simon	0:1014af42efd9	328	acc2 = __SMLAD(input1, input2, acc2);
simon	0:1014af42efd9	329
simon	0:1014af42efd9	330	/* Read x[4] sample */
simon	0:1014af42efd9	331	x0 = *(px++);
simon	0:1014af42efd9	332
simon	0:1014af42efd9	333	/* x[3] and x[4] are packed */
simon	0:1014af42efd9	334	in1 = (q15_t) x3;
simon	0:1014af42efd9	335	in2 = (q15_t) x0;
simon	0:1014af42efd9	336
simon	0:1014af42efd9	337	input1 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16);
simon	0:1014af42efd9	338
simon	0:1014af42efd9	339	/* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
simon	0:1014af42efd9	340	acc3 = __SMLAD(input1, input2, acc3);
simon	0:1014af42efd9	341
simon	0:1014af42efd9	342	/* Read y[srcBLen - 3] sample */
simon	0:1014af42efd9	343	c0 = *(py--);
simon	0:1014af42efd9	344	/* Read y[srcBLen - 4] sample */
simon	0:1014af42efd9	345	c1 = *(py--);
simon	0:1014af42efd9	346
simon	0:1014af42efd9	347	/* Read x[5] sample */
simon	0:1014af42efd9	348	x1 = *(px++);
simon	0:1014af42efd9	349
simon	0:1014af42efd9	350	/* x[2] and x[3] are packed */
simon	0:1014af42efd9	351	in1 = (q15_t) x2;
simon	0:1014af42efd9	352	in2 = (q15_t) x3;
simon	0:1014af42efd9	353
simon	0:1014af42efd9	354	input1 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16);
simon	0:1014af42efd9	355
simon	0:1014af42efd9	356	/* y[srcBLen - 3] and y[srcBLen - 4] are packed */
simon	0:1014af42efd9	357	in1 = (q15_t) c0;
simon	0:1014af42efd9	358	in2 = (q15_t) c1;
simon	0:1014af42efd9	359
simon	0:1014af42efd9	360	input2 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16);
simon	0:1014af42efd9	361
simon	0:1014af42efd9	362	/* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
simon	0:1014af42efd9	363	acc0 = __SMLAD(input1, input2, acc0);
simon	0:1014af42efd9	364
simon	0:1014af42efd9	365	/* x[3] and x[4] are packed */
simon	0:1014af42efd9	366	in1 = (q15_t) x3;
simon	0:1014af42efd9	367	in2 = (q15_t) x0;
simon	0:1014af42efd9	368
simon	0:1014af42efd9	369	input1 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16);
simon	0:1014af42efd9	370
simon	0:1014af42efd9	371	/* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
simon	0:1014af42efd9	372	acc1 = __SMLAD(input1, input2, acc1);
simon	0:1014af42efd9	373
simon	0:1014af42efd9	374	/* x[4] and x[5] are packed */
simon	0:1014af42efd9	375	in1 = (q15_t) x0;
simon	0:1014af42efd9	376	in2 = (q15_t) x1;
simon	0:1014af42efd9	377
simon	0:1014af42efd9	378	input1 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16);
simon	0:1014af42efd9	379
simon	0:1014af42efd9	380	/* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
simon	0:1014af42efd9	381	acc2 = __SMLAD(input1, input2, acc2);
simon	0:1014af42efd9	382
simon	0:1014af42efd9	383	/* Read x[6] sample */
simon	0:1014af42efd9	384	x2 = *(px++);
simon	0:1014af42efd9	385
simon	0:1014af42efd9	386	/* x[5] and x[6] are packed */
simon	0:1014af42efd9	387	in1 = (q15_t) x1;
simon	0:1014af42efd9	388	in2 = (q15_t) x2;
simon	0:1014af42efd9	389
simon	0:1014af42efd9	390	input1 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16);
simon	0:1014af42efd9	391
simon	0:1014af42efd9	392	/* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
simon	0:1014af42efd9	393	acc3 = __SMLAD(input1, input2, acc3);
simon	0:1014af42efd9	394
simon	0:1014af42efd9	395	} while(--k);
simon	0:1014af42efd9	396
simon	0:1014af42efd9	397	/* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
simon	0:1014af42efd9	398	** No loop unrolling is used. */
simon	0:1014af42efd9	399	k = srcBLen % 0x4u;
simon	0:1014af42efd9	400
simon	0:1014af42efd9	401	while(k > 0u)
simon	0:1014af42efd9	402	{
simon	0:1014af42efd9	403	/* Read y[srcBLen - 5] sample */
simon	0:1014af42efd9	404	c0 = *(py--);
simon	0:1014af42efd9	405
simon	0:1014af42efd9	406	/* Read x[7] sample */
simon	0:1014af42efd9	407	x3 = *(px++);
simon	0:1014af42efd9	408
simon	0:1014af42efd9	409	/* Perform the multiply-accumulates */
simon	0:1014af42efd9	410	/* acc0 += x[4] * y[srcBLen - 5] */
simon	0:1014af42efd9	411	acc0 += ((q31_t) x0 * c0);
simon	0:1014af42efd9	412	/* acc1 += x[5] * y[srcBLen - 5] */
simon	0:1014af42efd9	413	acc1 += ((q31_t) x1 * c0);
simon	0:1014af42efd9	414	/* acc2 += x[6] * y[srcBLen - 5] */
simon	0:1014af42efd9	415	acc2 += ((q31_t) x2 * c0);
simon	0:1014af42efd9	416	/* acc3 += x[7] * y[srcBLen - 5] */
simon	0:1014af42efd9	417	acc3 += ((q31_t) x3 * c0);
simon	0:1014af42efd9	418
simon	0:1014af42efd9	419	/* Reuse the present samples for the next MAC */
simon	0:1014af42efd9	420	x0 = x1;
simon	0:1014af42efd9	421	x1 = x2;
simon	0:1014af42efd9	422	x2 = x3;
simon	0:1014af42efd9	423
simon	0:1014af42efd9	424	/* Decrement the loop counter */
simon	0:1014af42efd9	425	k--;
simon	0:1014af42efd9	426	}
simon	0:1014af42efd9	427
simon	0:1014af42efd9	428	/* Store the result in the accumulator in the destination buffer. */
simon	0:1014af42efd9	429	*pOut++ = (q7_t) (__SSAT(acc0 >> 7, 8));
simon	0:1014af42efd9	430	*pOut++ = (q7_t) (__SSAT(acc1 >> 7, 8));
simon	0:1014af42efd9	431	*pOut++ = (q7_t) (__SSAT(acc2 >> 7, 8));
simon	0:1014af42efd9	432	*pOut++ = (q7_t) (__SSAT(acc3 >> 7, 8));
simon	0:1014af42efd9	433
simon	0:1014af42efd9	434	/* Update the inputA and inputB pointers for next MAC calculation */
simon	0:1014af42efd9	435	px = pIn1 + count * 4u;
simon	0:1014af42efd9	436	py = pSrc2;
simon	0:1014af42efd9	437
simon	0:1014af42efd9	438	/* Increment the pointer pIn1 index, count by 1 */
simon	0:1014af42efd9	439	count++;
simon	0:1014af42efd9	440
simon	0:1014af42efd9	441	/* Decrement the loop counter */
simon	0:1014af42efd9	442	blkCnt--;
simon	0:1014af42efd9	443	}
simon	0:1014af42efd9	444
simon	0:1014af42efd9	445	/* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
simon	0:1014af42efd9	446	** No loop unrolling is used. */
simon	0:1014af42efd9	447	blkCnt = (uint32_t) blockSize2 % 0x4u;
simon	0:1014af42efd9	448
simon	0:1014af42efd9	449	while(blkCnt > 0u)
simon	0:1014af42efd9	450	{
simon	0:1014af42efd9	451	/* Accumulator is made zero for every iteration */
simon	0:1014af42efd9	452	sum = 0;
simon	0:1014af42efd9	453
simon	0:1014af42efd9	454	/* Apply loop unrolling and compute 4 MACs simultaneously. */
simon	0:1014af42efd9	455	k = srcBLen >> 2u;
simon	0:1014af42efd9	456
simon	0:1014af42efd9	457	/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
simon	0:1014af42efd9	458	** a second loop below computes MACs for the remaining 1 to 3 samples. */
simon	0:1014af42efd9	459	while(k > 0u)
simon	0:1014af42efd9	460	{
simon	0:1014af42efd9	461
simon	0:1014af42efd9	462	/* Reading two inputs of SrcA buffer and packing */
simon	0:1014af42efd9	463	in1 = (q15_t) * px++;
simon	0:1014af42efd9	464	in2 = (q15_t) * px++;
simon	0:1014af42efd9	465	input1 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16);
simon	0:1014af42efd9	466
simon	0:1014af42efd9	467	/* Reading two inputs of SrcB buffer and packing */
simon	0:1014af42efd9	468	in1 = (q15_t) * py--;
simon	0:1014af42efd9	469	in2 = (q15_t) * py--;
simon	0:1014af42efd9	470	input2 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16);
simon	0:1014af42efd9	471
simon	0:1014af42efd9	472	/* Perform the multiply-accumulates */
simon	0:1014af42efd9	473	sum = __SMLAD(input1, input2, sum);
simon	0:1014af42efd9	474
simon	0:1014af42efd9	475	/* Reading two inputs of SrcA buffer and packing */
simon	0:1014af42efd9	476	in1 = (q15_t) * px++;
simon	0:1014af42efd9	477	in2 = (q15_t) * px++;
simon	0:1014af42efd9	478	input1 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16);
simon	0:1014af42efd9	479
simon	0:1014af42efd9	480	/* Reading two inputs of SrcB buffer and packing */
simon	0:1014af42efd9	481	in1 = (q15_t) * py--;
simon	0:1014af42efd9	482	in2 = (q15_t) * py--;
simon	0:1014af42efd9	483	input2 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16);
simon	0:1014af42efd9	484
simon	0:1014af42efd9	485	/* Perform the multiply-accumulates */
simon	0:1014af42efd9	486	sum = __SMLAD(input1, input2, sum);
simon	0:1014af42efd9	487
simon	0:1014af42efd9	488	/* Decrement the loop counter */
simon	0:1014af42efd9	489	k--;
simon	0:1014af42efd9	490	}
simon	0:1014af42efd9	491
simon	0:1014af42efd9	492	/* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
simon	0:1014af42efd9	493	** No loop unrolling is used. */
simon	0:1014af42efd9	494	k = srcBLen % 0x4u;
simon	0:1014af42efd9	495
simon	0:1014af42efd9	496	while(k > 0u)
simon	0:1014af42efd9	497	{
simon	0:1014af42efd9	498	/* Perform the multiply-accumulates */
simon	0:1014af42efd9	499	sum += ((q31_t) * px++ * *py--);
simon	0:1014af42efd9	500
simon	0:1014af42efd9	501	/* Decrement the loop counter */
simon	0:1014af42efd9	502	k--;
simon	0:1014af42efd9	503	}
simon	0:1014af42efd9	504
simon	0:1014af42efd9	505	/* Store the result in the accumulator in the destination buffer. */
simon	0:1014af42efd9	506	*pOut++ = (q7_t) (__SSAT(sum >> 7, 8));
simon	0:1014af42efd9	507
simon	0:1014af42efd9	508	/* Update the inputA and inputB pointers for next MAC calculation */
simon	0:1014af42efd9	509	px = pIn1 + count;
simon	0:1014af42efd9	510	py = pSrc2;
simon	0:1014af42efd9	511
simon	0:1014af42efd9	512	/* Increment the pointer pIn1 index, count by 1 */
simon	0:1014af42efd9	513	count++;
simon	0:1014af42efd9	514
simon	0:1014af42efd9	515	/* Decrement the loop counter */
simon	0:1014af42efd9	516	blkCnt--;
simon	0:1014af42efd9	517	}
simon	0:1014af42efd9	518	}
simon	0:1014af42efd9	519	else
simon	0:1014af42efd9	520	{
simon	0:1014af42efd9	521	/* If the srcBLen is not a multiple of 4,
simon	0:1014af42efd9	522	* the blockSize2 loop cannot be unrolled by 4 */
simon	0:1014af42efd9	523	blkCnt = (uint32_t) blockSize2;
simon	0:1014af42efd9	524
simon	0:1014af42efd9	525	while(blkCnt > 0u)
simon	0:1014af42efd9	526	{
simon	0:1014af42efd9	527	/* Accumulator is made zero for every iteration */
simon	0:1014af42efd9	528	sum = 0;
simon	0:1014af42efd9	529
simon	0:1014af42efd9	530	/* srcBLen number of MACS should be performed */
simon	0:1014af42efd9	531	k = srcBLen;
simon	0:1014af42efd9	532
simon	0:1014af42efd9	533	while(k > 0u)
simon	0:1014af42efd9	534	{
simon	0:1014af42efd9	535	/* Perform the multiply-accumulate */
simon	0:1014af42efd9	536	sum += ((q31_t) * px++ * *py--);
simon	0:1014af42efd9	537
simon	0:1014af42efd9	538	/* Decrement the loop counter */
simon	0:1014af42efd9	539	k--;
simon	0:1014af42efd9	540	}
simon	0:1014af42efd9	541
simon	0:1014af42efd9	542	/* Store the result in the accumulator in the destination buffer. */
simon	0:1014af42efd9	543	*pOut++ = (q7_t) (__SSAT(sum >> 7, 8));
simon	0:1014af42efd9	544
simon	0:1014af42efd9	545	/* Update the inputA and inputB pointers for next MAC calculation */
simon	0:1014af42efd9	546	px = pIn1 + count;
simon	0:1014af42efd9	547	py = pSrc2;
simon	0:1014af42efd9	548
simon	0:1014af42efd9	549	/* Increment the MAC count */
simon	0:1014af42efd9	550	count++;
simon	0:1014af42efd9	551
simon	0:1014af42efd9	552	/* Decrement the loop counter */
simon	0:1014af42efd9	553	blkCnt--;
simon	0:1014af42efd9	554	}
simon	0:1014af42efd9	555	}
simon	0:1014af42efd9	556
simon	0:1014af42efd9	557
simon	0:1014af42efd9	558	/* --------------------------
simon	0:1014af42efd9	559	* Initializations of stage3
simon	0:1014af42efd9	560	* -------------------------*/
simon	0:1014af42efd9	561
simon	0:1014af42efd9	562	/* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
simon	0:1014af42efd9	563	* sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
simon	0:1014af42efd9	564	* ....
simon	0:1014af42efd9	565	* sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
simon	0:1014af42efd9	566	* sum += x[srcALen-1] * y[srcBLen-1]
simon	0:1014af42efd9	567	*/
simon	0:1014af42efd9	568
simon	0:1014af42efd9	569	/* In this stage the MAC operations are decreased by 1 for every iteration.
simon	0:1014af42efd9	570	The count variable holds the number of MAC operations performed */
simon	0:1014af42efd9	571	count = srcBLen - 1u;
simon	0:1014af42efd9	572
simon	0:1014af42efd9	573	/* Working pointer of inputA */
simon	0:1014af42efd9	574	pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
simon	0:1014af42efd9	575	px = pSrc1;
simon	0:1014af42efd9	576
simon	0:1014af42efd9	577	/* Working pointer of inputB */
simon	0:1014af42efd9	578	pSrc2 = pIn2 + (srcBLen - 1u);
simon	0:1014af42efd9	579	py = pSrc2;
simon	0:1014af42efd9	580
simon	0:1014af42efd9	581	/* -------------------
simon	0:1014af42efd9	582	* Stage3 process
simon	0:1014af42efd9	583	* ------------------*/
simon	0:1014af42efd9	584
simon	0:1014af42efd9	585	while(blockSize3 > 0)
simon	0:1014af42efd9	586	{
simon	0:1014af42efd9	587	/* Accumulator is made zero for every iteration */
simon	0:1014af42efd9	588	sum = 0;
simon	0:1014af42efd9	589
simon	0:1014af42efd9	590	/* Apply loop unrolling and compute 4 MACs simultaneously. */
simon	0:1014af42efd9	591	k = count >> 2u;
simon	0:1014af42efd9	592
simon	0:1014af42efd9	593	/* First part of the processing with loop unrolling. Compute 4 MACs at a time.
simon	0:1014af42efd9	594	** a second loop below computes MACs for the remaining 1 to 3 samples. */
simon	0:1014af42efd9	595	while(k > 0u)
simon	0:1014af42efd9	596	{
simon	0:1014af42efd9	597	/* Reading two inputs, x[srcALen - srcBLen + 1] and x[srcALen - srcBLen + 2] of SrcA buffer and packing */
simon	0:1014af42efd9	598	in1 = (q15_t) * px++;
simon	0:1014af42efd9	599	in2 = (q15_t) * px++;
simon	0:1014af42efd9	600	input1 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16);
simon	0:1014af42efd9	601
simon	0:1014af42efd9	602	/* Reading two inputs, y[srcBLen - 1] and y[srcBLen - 2] of SrcB buffer and packing */
simon	0:1014af42efd9	603	in1 = (q15_t) * py--;
simon	0:1014af42efd9	604	in2 = (q15_t) * py--;
simon	0:1014af42efd9	605	input2 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16);
simon	0:1014af42efd9	606
simon	0:1014af42efd9	607	/* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
simon	0:1014af42efd9	608	/* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
simon	0:1014af42efd9	609	sum = __SMLAD(input1, input2, sum);
simon	0:1014af42efd9	610
simon	0:1014af42efd9	611	/* Reading two inputs, x[srcALen - srcBLen + 3] and x[srcALen - srcBLen + 4] of SrcA buffer and packing */
simon	0:1014af42efd9	612	in1 = (q15_t) * px++;
simon	0:1014af42efd9	613	in2 = (q15_t) * px++;
simon	0:1014af42efd9	614	input1 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16);
simon	0:1014af42efd9	615
simon	0:1014af42efd9	616	/* Reading two inputs, y[srcBLen - 3] and y[srcBLen - 4] of SrcB buffer and packing */
simon	0:1014af42efd9	617	in1 = (q15_t) * py--;
simon	0:1014af42efd9	618	in2 = (q15_t) * py--;
simon	0:1014af42efd9	619	input2 = ((q31_t) in1 & 0x0000FFFF) \| ((q31_t) in2 << 16);
simon	0:1014af42efd9	620
simon	0:1014af42efd9	621	/* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
simon	0:1014af42efd9	622	/* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
simon	0:1014af42efd9	623	sum = __SMLAD(input1, input2, sum);
simon	0:1014af42efd9	624
simon	0:1014af42efd9	625	/* Decrement the loop counter */
simon	0:1014af42efd9	626	k--;
simon	0:1014af42efd9	627	}
simon	0:1014af42efd9	628
simon	0:1014af42efd9	629	/* If the count is not a multiple of 4, compute any remaining MACs here.
simon	0:1014af42efd9	630	** No loop unrolling is used. */
simon	0:1014af42efd9	631	k = count % 0x4u;
simon	0:1014af42efd9	632
simon	0:1014af42efd9	633	while(k > 0u)
simon	0:1014af42efd9	634	{
simon	0:1014af42efd9	635	/* Perform the multiply-accumulates */
simon	0:1014af42efd9	636	/* sum += x[srcALen-1] * y[srcBLen-1] */
simon	0:1014af42efd9	637	sum += ((q31_t) * px++ * *py--);
simon	0:1014af42efd9	638
simon	0:1014af42efd9	639	/* Decrement the loop counter */
simon	0:1014af42efd9	640	k--;
simon	0:1014af42efd9	641	}
simon	0:1014af42efd9	642
simon	0:1014af42efd9	643	/* Store the result in the accumulator in the destination buffer. */
simon	0:1014af42efd9	644	*pOut++ = (q7_t) (__SSAT(sum >> 7, 8));
simon	0:1014af42efd9	645
simon	0:1014af42efd9	646	/* Update the inputA and inputB pointers for next MAC calculation */
simon	0:1014af42efd9	647	px = ++pSrc1;
simon	0:1014af42efd9	648	py = pSrc2;
simon	0:1014af42efd9	649
simon	0:1014af42efd9	650	/* Decrement the MAC count */
simon	0:1014af42efd9	651	count--;
simon	0:1014af42efd9	652
simon	0:1014af42efd9	653	/* Decrement the loop counter */
simon	0:1014af42efd9	654	blockSize3--;
simon	0:1014af42efd9	655
simon	0:1014af42efd9	656	}
simon	0:1014af42efd9	657
simon	0:1014af42efd9	658	/* set status as ARM_MATH_SUCCESS */
simon	0:1014af42efd9	659	status = ARM_MATH_SUCCESS;
simon	0:1014af42efd9	660	}
simon	0:1014af42efd9	661
simon	0:1014af42efd9	662	/* Return to application */
simon	0:1014af42efd9	663	return (status);
simon	0:1014af42efd9	664
simon	0:1014af42efd9	665	}
simon	0:1014af42efd9	666
simon	0:1014af42efd9	667	/**
simon	0:1014af42efd9	668	* @} end of PartialConv group
simon	0:1014af42efd9	669	*/

Repository toolbox

Export to desktop IDE

Repository details

Type:	Library
Created:	10 Mar 2011
Imports:	907
Forks:	1
Commits:	3
Dependents:	5
Dependencies:	0
Followers:	35

src/Cortex-M4-M3/FilteringFunctions/arm_conv_partial_q7.c@0:1014af42efd9, 2011-03-10 (annotated)

Who changed what in which revision?

Repository toolbox

Repository details

Important Information for this Arm website

Access Warning