CMSIS DSP Library from CMSIS 2.0. See http://www.onarm.com/cmsis/ for full details

Dependents:   K22F_DSP_Matrix_least_square BNO055-ELEC3810 1BNO055 ECE4180Project--Slave2 ... more

Committer:
simon
Date:
Thu Mar 10 15:07:50 2011 +0000
Revision:
0:1014af42efd9

        

Who changed what in which revision?

User | Revision | Line number | New contents of line
simon 0:1014af42efd9 1 /* ----------------------------------------------------------------------
simon 0:1014af42efd9 2 * Copyright (C) 2010 ARM Limited. All rights reserved.
simon 0:1014af42efd9 3 *
simon 0:1014af42efd9 4 * $Date: 29. November 2010
simon 0:1014af42efd9 5 * $Revision: V1.0.3
simon 0:1014af42efd9 6 *
simon 0:1014af42efd9 7 * Project: CMSIS DSP Library
simon 0:1014af42efd9 8 * Title: arm_conv_partial_q7.c
simon 0:1014af42efd9 9 *
simon 0:1014af42efd9 10 * Description: Q7 Partial convolution.
simon 0:1014af42efd9 11 *
simon 0:1014af42efd9 12 * Target Processor: Cortex-M4/Cortex-M3
simon 0:1014af42efd9 13 *
simon 0:1014af42efd9 14 * Version 1.0.3 2010/11/29
simon 0:1014af42efd9 15 * Re-organized the CMSIS folders and updated documentation.
simon 0:1014af42efd9 16 *
simon 0:1014af42efd9 17 * Version 1.0.2 2010/11/11
simon 0:1014af42efd9 18 * Documentation updated.
simon 0:1014af42efd9 19 *
simon 0:1014af42efd9 20 * Version 1.0.1 2010/10/05
simon 0:1014af42efd9 21 * Production release and review comments incorporated.
simon 0:1014af42efd9 22 *
simon 0:1014af42efd9 23 * Version 1.0.0 2010/09/20
simon 0:1014af42efd9 24 * Production release and review comments incorporated
simon 0:1014af42efd9 25 *
simon 0:1014af42efd9 26 * Version 0.0.7 2010/06/10
simon 0:1014af42efd9 27 * Misra-C changes done
simon 0:1014af42efd9 28 *
simon 0:1014af42efd9 29 * -------------------------------------------------------------------- */
simon 0:1014af42efd9 30
simon 0:1014af42efd9 31 #include "arm_math.h"
simon 0:1014af42efd9 32
simon 0:1014af42efd9 33 /**
simon 0:1014af42efd9 34 * @ingroup groupFilters
simon 0:1014af42efd9 35 */
simon 0:1014af42efd9 36
simon 0:1014af42efd9 37 /**
simon 0:1014af42efd9 38 * @addtogroup PartialConv
simon 0:1014af42efd9 39 * @{
simon 0:1014af42efd9 40 */
simon 0:1014af42efd9 41
simon 0:1014af42efd9 42 /**
simon 0:1014af42efd9 43 * @brief Partial convolution of Q7 sequences
simon 0:1014af42efd9 44 * @param[in] *pSrcA points to the first input sequence.
simon 0:1014af42efd9 45 * @param[in] srcALen length of the first input sequence.
simon 0:1014af42efd9 46 * @param[in] *pSrcB points to the second input sequence.
simon 0:1014af42efd9 47 * @param[in] srcBLen length of the second input sequence.
simon 0:1014af42efd9 48 * @param[out] *pDst points to the location where the output result is written.
simon 0:1014af42efd9 49 * @param[in] firstIndex is the first output sample to start with.
simon 0:1014af42efd9 50 * @param[in] numPoints is the number of output points to be computed.
simon 0:1014af42efd9 51 * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].
simon 0:1014af42efd9 52 *
simon 0:1014af42efd9 53 */
simon 0:1014af42efd9 54
simon 0:1014af42efd9 55 arm_status arm_conv_partial_q7(
simon 0:1014af42efd9 56 q7_t * pSrcA,
simon 0:1014af42efd9 57 uint32_t srcALen,
simon 0:1014af42efd9 58 q7_t * pSrcB,
simon 0:1014af42efd9 59 uint32_t srcBLen,
simon 0:1014af42efd9 60 q7_t * pDst,
simon 0:1014af42efd9 61 uint32_t firstIndex,
simon 0:1014af42efd9 62 uint32_t numPoints)
simon 0:1014af42efd9 63 {
simon 0:1014af42efd9 64 q7_t *pIn1; /* inputA pointer */
simon 0:1014af42efd9 65 q7_t *pIn2; /* inputB pointer */
simon 0:1014af42efd9 66 q7_t *pOut = pDst; /* output pointer */
simon 0:1014af42efd9 67 q7_t *px; /* Intermediate inputA pointer */
simon 0:1014af42efd9 68 q7_t *py; /* Intermediate inputB pointer */
simon 0:1014af42efd9 69 q7_t *pSrc1, *pSrc2; /* Intermediate pointers */
simon 0:1014af42efd9 70 q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
simon 0:1014af42efd9 71 q31_t input1, input2;
simon 0:1014af42efd9 72 q15_t in1, in2;
simon 0:1014af42efd9 73 q7_t x0, x1, x2, x3, c0, c1;
simon 0:1014af42efd9 74 uint32_t j, k, count, check, blkCnt;
simon 0:1014af42efd9 75 int32_t blockSize1, blockSize2, blockSize3; /* loop counter */
simon 0:1014af42efd9 76 arm_status status;
simon 0:1014af42efd9 77
simon 0:1014af42efd9 78
simon 0:1014af42efd9 79 /* Check for range of output samples to be calculated */
simon 0:1014af42efd9 80 if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
simon 0:1014af42efd9 81 {
simon 0:1014af42efd9 82 /* Set status as ARM_MATH_ARGUMENT_ERROR */
simon 0:1014af42efd9 83 status = ARM_MATH_ARGUMENT_ERROR;
simon 0:1014af42efd9 84 }
simon 0:1014af42efd9 85 else
simon 0:1014af42efd9 86 {
simon 0:1014af42efd9 87
simon 0:1014af42efd9 88 /* The algorithm implementation is based on the lengths of the inputs. */
simon 0:1014af42efd9 89 /* srcB is always made to slide across srcA. */
simon 0:1014af42efd9 90 /* So srcBLen is always considered as shorter or equal to srcALen */
simon 0:1014af42efd9 91 if(srcALen >= srcBLen)
simon 0:1014af42efd9 92 {
simon 0:1014af42efd9 93 /* Initialization of inputA pointer */
simon 0:1014af42efd9 94 pIn1 = pSrcA;
simon 0:1014af42efd9 95
simon 0:1014af42efd9 96 /* Initialization of inputB pointer */
simon 0:1014af42efd9 97 pIn2 = pSrcB;
simon 0:1014af42efd9 98 }
simon 0:1014af42efd9 99 else
simon 0:1014af42efd9 100 {
simon 0:1014af42efd9 101 /* Initialization of inputA pointer */
simon 0:1014af42efd9 102 pIn1 = pSrcB;
simon 0:1014af42efd9 103
simon 0:1014af42efd9 104 /* Initialization of inputB pointer */
simon 0:1014af42efd9 105 pIn2 = pSrcA;
simon 0:1014af42efd9 106
simon 0:1014af42efd9 107 /* srcBLen is always considered as shorter or equal to srcALen */
simon 0:1014af42efd9 108 j = srcBLen;
simon 0:1014af42efd9 109 srcBLen = srcALen;
simon 0:1014af42efd9 110 srcALen = j;
simon 0:1014af42efd9 111 }
simon 0:1014af42efd9 112
simon 0:1014af42efd9 113 /* Conditions to check which loopCounter holds
simon 0:1014af42efd9 114 * the first and last indices of the output samples to be calculated. */
simon 0:1014af42efd9 115 check = firstIndex + numPoints;
simon 0:1014af42efd9 116 blockSize3 = ((int32_t) check - (int32_t) srcALen);
simon 0:1014af42efd9 117 blockSize3 = (blockSize3 > 0) ? blockSize3 : 0;
simon 0:1014af42efd9 118 blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex);
simon 0:1014af42efd9 119 blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 :
simon 0:1014af42efd9 120 (int32_t) numPoints) : 0;
simon 0:1014af42efd9 121 blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) +
simon 0:1014af42efd9 122 (int32_t) firstIndex);
simon 0:1014af42efd9 123 blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
simon 0:1014af42efd9 124
simon 0:1014af42efd9 125 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
simon 0:1014af42efd9 126 /* The function is internally
simon 0:1014af42efd9 127 * divided into three stages according to the number of multiplications that has to be
simon 0:1014af42efd9 128 * taken place between inputA samples and inputB samples. In the first stage of the
simon 0:1014af42efd9 129 * algorithm, the multiplications increase by one for every iteration.
simon 0:1014af42efd9 130 * In the second stage of the algorithm, srcBLen number of multiplications are done.
simon 0:1014af42efd9 131 * In the third stage of the algorithm, the multiplications decrease by one
simon 0:1014af42efd9 132 * for every iteration. */
simon 0:1014af42efd9 133
simon 0:1014af42efd9 134 /* Set the output pointer to point to the firstIndex
simon 0:1014af42efd9 135 * of the output sample to be calculated. */
simon 0:1014af42efd9 136 pOut = pDst + firstIndex;
simon 0:1014af42efd9 137
simon 0:1014af42efd9 138 /* --------------------------
simon 0:1014af42efd9 139 * Initializations of stage1
simon 0:1014af42efd9 140 * -------------------------*/
simon 0:1014af42efd9 141
simon 0:1014af42efd9 142 /* sum = x[0] * y[0]
simon 0:1014af42efd9 143 * sum = x[0] * y[1] + x[1] * y[0]
simon 0:1014af42efd9 144 * ....
simon 0:1014af42efd9 145 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
simon 0:1014af42efd9 146 */
simon 0:1014af42efd9 147
simon 0:1014af42efd9 148 /* In this stage the MAC operations are increased by 1 for every iteration.
simon 0:1014af42efd9 149 The count variable holds the number of MAC operations performed.
simon 0:1014af42efd9 150 Since the partial convolution starts from from firstIndex
simon 0:1014af42efd9 151 Number of Macs to be performed is firstIndex + 1 */
simon 0:1014af42efd9 152 count = 1u + firstIndex;
simon 0:1014af42efd9 153
simon 0:1014af42efd9 154 /* Working pointer of inputA */
simon 0:1014af42efd9 155 px = pIn1;
simon 0:1014af42efd9 156
simon 0:1014af42efd9 157 /* Working pointer of inputB */
simon 0:1014af42efd9 158 pSrc2 = pIn2 + firstIndex;
simon 0:1014af42efd9 159 py = pSrc2;
simon 0:1014af42efd9 160
simon 0:1014af42efd9 161 /* ------------------------
simon 0:1014af42efd9 162 * Stage1 process
simon 0:1014af42efd9 163 * ----------------------*/
simon 0:1014af42efd9 164
simon 0:1014af42efd9 165 /* The first stage starts here */
simon 0:1014af42efd9 166 while(blockSize1 > 0)
simon 0:1014af42efd9 167 {
simon 0:1014af42efd9 168 /* Accumulator is made zero for every iteration */
simon 0:1014af42efd9 169 sum = 0;
simon 0:1014af42efd9 170
simon 0:1014af42efd9 171 /* Apply loop unrolling and compute 4 MACs simultaneously. */
simon 0:1014af42efd9 172 k = count >> 2u;
simon 0:1014af42efd9 173
simon 0:1014af42efd9 174 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
simon 0:1014af42efd9 175 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
simon 0:1014af42efd9 176 while(k > 0u)
simon 0:1014af42efd9 177 {
simon 0:1014af42efd9 178 /* x[0] , x[1] */
simon 0:1014af42efd9 179 in1 = (q15_t) * px++;
simon 0:1014af42efd9 180 in2 = (q15_t) * px++;
simon 0:1014af42efd9 181 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
simon 0:1014af42efd9 182
simon 0:1014af42efd9 183 /* y[srcBLen - 1] , y[srcBLen - 2] */
simon 0:1014af42efd9 184 in1 = (q15_t) * py--;
simon 0:1014af42efd9 185 in2 = (q15_t) * py--;
simon 0:1014af42efd9 186 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
simon 0:1014af42efd9 187
simon 0:1014af42efd9 188 /* x[0] * y[srcBLen - 1] */
simon 0:1014af42efd9 189 /* x[1] * y[srcBLen - 2] */
simon 0:1014af42efd9 190 sum = __SMLAD(input1, input2, sum);
simon 0:1014af42efd9 191
simon 0:1014af42efd9 192 /* x[2] , x[3] */
simon 0:1014af42efd9 193 in1 = (q15_t) * px++;
simon 0:1014af42efd9 194 in2 = (q15_t) * px++;
simon 0:1014af42efd9 195 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
simon 0:1014af42efd9 196
simon 0:1014af42efd9 197 /* y[srcBLen - 3] , y[srcBLen - 4] */
simon 0:1014af42efd9 198 in1 = (q15_t) * py--;
simon 0:1014af42efd9 199 in2 = (q15_t) * py--;
simon 0:1014af42efd9 200 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
simon 0:1014af42efd9 201
simon 0:1014af42efd9 202 /* x[2] * y[srcBLen - 3] */
simon 0:1014af42efd9 203 /* x[3] * y[srcBLen - 4] */
simon 0:1014af42efd9 204 sum = __SMLAD(input1, input2, sum);
simon 0:1014af42efd9 205
simon 0:1014af42efd9 206 /* Decrement the loop counter */
simon 0:1014af42efd9 207 k--;
simon 0:1014af42efd9 208 }
simon 0:1014af42efd9 209
simon 0:1014af42efd9 210 /* If the count is not a multiple of 4, compute any remaining MACs here.
simon 0:1014af42efd9 211 ** No loop unrolling is used. */
simon 0:1014af42efd9 212 k = count % 0x4u;
simon 0:1014af42efd9 213
simon 0:1014af42efd9 214 while(k > 0u)
simon 0:1014af42efd9 215 {
simon 0:1014af42efd9 216 /* Perform the multiply-accumulates */
simon 0:1014af42efd9 217 sum += ((q31_t) * px++ * *py--);
simon 0:1014af42efd9 218
simon 0:1014af42efd9 219 /* Decrement the loop counter */
simon 0:1014af42efd9 220 k--;
simon 0:1014af42efd9 221 }
simon 0:1014af42efd9 222
simon 0:1014af42efd9 223 /* Store the result in the accumulator in the destination buffer. */
simon 0:1014af42efd9 224 *pOut++ = (q7_t) (__SSAT(sum >> 7, 8));
simon 0:1014af42efd9 225
simon 0:1014af42efd9 226 /* Update the inputA and inputB pointers for next MAC calculation */
simon 0:1014af42efd9 227 py = ++pSrc2;
simon 0:1014af42efd9 228 px = pIn1;
simon 0:1014af42efd9 229
simon 0:1014af42efd9 230 /* Increment the MAC count */
simon 0:1014af42efd9 231 count++;
simon 0:1014af42efd9 232
simon 0:1014af42efd9 233 /* Decrement the loop counter */
simon 0:1014af42efd9 234 blockSize1--;
simon 0:1014af42efd9 235 }
simon 0:1014af42efd9 236
simon 0:1014af42efd9 237 /* --------------------------
simon 0:1014af42efd9 238 * Initializations of stage2
simon 0:1014af42efd9 239 * ------------------------*/
simon 0:1014af42efd9 240
simon 0:1014af42efd9 241 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
simon 0:1014af42efd9 242 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
simon 0:1014af42efd9 243 * ....
simon 0:1014af42efd9 244 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
simon 0:1014af42efd9 245 */
simon 0:1014af42efd9 246
simon 0:1014af42efd9 247 /* Working pointer of inputA */
simon 0:1014af42efd9 248 px = pIn1;
simon 0:1014af42efd9 249
simon 0:1014af42efd9 250 /* Working pointer of inputB */
simon 0:1014af42efd9 251 pSrc2 = pIn2 + (srcBLen - 1u);
simon 0:1014af42efd9 252 py = pSrc2;
simon 0:1014af42efd9 253
simon 0:1014af42efd9 254 /* count is index by which the pointer pIn1 to be incremented */
simon 0:1014af42efd9 255 count = 1u;
simon 0:1014af42efd9 256
simon 0:1014af42efd9 257 /* -------------------
simon 0:1014af42efd9 258 * Stage2 process
simon 0:1014af42efd9 259 * ------------------*/
simon 0:1014af42efd9 260
simon 0:1014af42efd9 261 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
simon 0:1014af42efd9 262 * So, to loop unroll over blockSize2,
simon 0:1014af42efd9 263 * srcBLen should be greater than or equal to 4 */
simon 0:1014af42efd9 264 if(srcBLen >= 4u)
simon 0:1014af42efd9 265 {
simon 0:1014af42efd9 266 /* Loop unroll over blockSize2, by 4 */
simon 0:1014af42efd9 267 blkCnt = ((uint32_t) blockSize2 >> 2u);
simon 0:1014af42efd9 268
simon 0:1014af42efd9 269 while(blkCnt > 0u)
simon 0:1014af42efd9 270 {
simon 0:1014af42efd9 271 /* Set all accumulators to zero */
simon 0:1014af42efd9 272 acc0 = 0;
simon 0:1014af42efd9 273 acc1 = 0;
simon 0:1014af42efd9 274 acc2 = 0;
simon 0:1014af42efd9 275 acc3 = 0;
simon 0:1014af42efd9 276
simon 0:1014af42efd9 277 /* read x[0], x[1], x[2] samples */
simon 0:1014af42efd9 278 x0 = *(px++);
simon 0:1014af42efd9 279 x1 = *(px++);
simon 0:1014af42efd9 280 x2 = *(px++);
simon 0:1014af42efd9 281
simon 0:1014af42efd9 282 /* Apply loop unrolling and compute 4 MACs simultaneously. */
simon 0:1014af42efd9 283 k = srcBLen >> 2u;
simon 0:1014af42efd9 284
simon 0:1014af42efd9 285 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
simon 0:1014af42efd9 286 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
simon 0:1014af42efd9 287 do
simon 0:1014af42efd9 288 {
simon 0:1014af42efd9 289 /* Read y[srcBLen - 1] sample */
simon 0:1014af42efd9 290 c0 = *(py--);
simon 0:1014af42efd9 291 /* Read y[srcBLen - 2] sample */
simon 0:1014af42efd9 292 c1 = *(py--);
simon 0:1014af42efd9 293
simon 0:1014af42efd9 294 /* Read x[3] sample */
simon 0:1014af42efd9 295 x3 = *(px++);
simon 0:1014af42efd9 296
simon 0:1014af42efd9 297 /* x[0] and x[1] are packed */
simon 0:1014af42efd9 298 in1 = (q15_t) x0;
simon 0:1014af42efd9 299 in2 = (q15_t) x1;
simon 0:1014af42efd9 300
simon 0:1014af42efd9 301 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
simon 0:1014af42efd9 302
simon 0:1014af42efd9 303 /* y[srcBLen - 1] and y[srcBLen - 2] are packed */
simon 0:1014af42efd9 304 in1 = (q15_t) c0;
simon 0:1014af42efd9 305 in2 = (q15_t) c1;
simon 0:1014af42efd9 306
simon 0:1014af42efd9 307 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
simon 0:1014af42efd9 308
simon 0:1014af42efd9 309 /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
simon 0:1014af42efd9 310 acc0 = __SMLAD(input1, input2, acc0);
simon 0:1014af42efd9 311
simon 0:1014af42efd9 312 /* x[1] and x[2] are packed */
simon 0:1014af42efd9 313 in1 = (q15_t) x1;
simon 0:1014af42efd9 314 in2 = (q15_t) x2;
simon 0:1014af42efd9 315
simon 0:1014af42efd9 316 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
simon 0:1014af42efd9 317
simon 0:1014af42efd9 318 /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
simon 0:1014af42efd9 319 acc1 = __SMLAD(input1, input2, acc1);
simon 0:1014af42efd9 320
simon 0:1014af42efd9 321 /* x[2] and x[3] are packed */
simon 0:1014af42efd9 322 in1 = (q15_t) x2;
simon 0:1014af42efd9 323 in2 = (q15_t) x3;
simon 0:1014af42efd9 324
simon 0:1014af42efd9 325 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
simon 0:1014af42efd9 326
simon 0:1014af42efd9 327 /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
simon 0:1014af42efd9 328 acc2 = __SMLAD(input1, input2, acc2);
simon 0:1014af42efd9 329
simon 0:1014af42efd9 330 /* Read x[4] sample */
simon 0:1014af42efd9 331 x0 = *(px++);
simon 0:1014af42efd9 332
simon 0:1014af42efd9 333 /* x[3] and x[4] are packed */
simon 0:1014af42efd9 334 in1 = (q15_t) x3;
simon 0:1014af42efd9 335 in2 = (q15_t) x0;
simon 0:1014af42efd9 336
simon 0:1014af42efd9 337 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
simon 0:1014af42efd9 338
simon 0:1014af42efd9 339 /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
simon 0:1014af42efd9 340 acc3 = __SMLAD(input1, input2, acc3);
simon 0:1014af42efd9 341
simon 0:1014af42efd9 342 /* Read y[srcBLen - 3] sample */
simon 0:1014af42efd9 343 c0 = *(py--);
simon 0:1014af42efd9 344 /* Read y[srcBLen - 4] sample */
simon 0:1014af42efd9 345 c1 = *(py--);
simon 0:1014af42efd9 346
simon 0:1014af42efd9 347 /* Read x[5] sample */
simon 0:1014af42efd9 348 x1 = *(px++);
simon 0:1014af42efd9 349
simon 0:1014af42efd9 350 /* x[2] and x[3] are packed */
simon 0:1014af42efd9 351 in1 = (q15_t) x2;
simon 0:1014af42efd9 352 in2 = (q15_t) x3;
simon 0:1014af42efd9 353
simon 0:1014af42efd9 354 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
simon 0:1014af42efd9 355
simon 0:1014af42efd9 356 /* y[srcBLen - 3] and y[srcBLen - 4] are packed */
simon 0:1014af42efd9 357 in1 = (q15_t) c0;
simon 0:1014af42efd9 358 in2 = (q15_t) c1;
simon 0:1014af42efd9 359
simon 0:1014af42efd9 360 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
simon 0:1014af42efd9 361
simon 0:1014af42efd9 362 /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
simon 0:1014af42efd9 363 acc0 = __SMLAD(input1, input2, acc0);
simon 0:1014af42efd9 364
simon 0:1014af42efd9 365 /* x[3] and x[4] are packed */
simon 0:1014af42efd9 366 in1 = (q15_t) x3;
simon 0:1014af42efd9 367 in2 = (q15_t) x0;
simon 0:1014af42efd9 368
simon 0:1014af42efd9 369 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
simon 0:1014af42efd9 370
simon 0:1014af42efd9 371 /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
simon 0:1014af42efd9 372 acc1 = __SMLAD(input1, input2, acc1);
simon 0:1014af42efd9 373
simon 0:1014af42efd9 374 /* x[4] and x[5] are packed */
simon 0:1014af42efd9 375 in1 = (q15_t) x0;
simon 0:1014af42efd9 376 in2 = (q15_t) x1;
simon 0:1014af42efd9 377
simon 0:1014af42efd9 378 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
simon 0:1014af42efd9 379
simon 0:1014af42efd9 380 /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
simon 0:1014af42efd9 381 acc2 = __SMLAD(input1, input2, acc2);
simon 0:1014af42efd9 382
simon 0:1014af42efd9 383 /* Read x[6] sample */
simon 0:1014af42efd9 384 x2 = *(px++);
simon 0:1014af42efd9 385
simon 0:1014af42efd9 386 /* x[5] and x[6] are packed */
simon 0:1014af42efd9 387 in1 = (q15_t) x1;
simon 0:1014af42efd9 388 in2 = (q15_t) x2;
simon 0:1014af42efd9 389
simon 0:1014af42efd9 390 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
simon 0:1014af42efd9 391
simon 0:1014af42efd9 392 /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
simon 0:1014af42efd9 393 acc3 = __SMLAD(input1, input2, acc3);
simon 0:1014af42efd9 394
simon 0:1014af42efd9 395 } while(--k);
simon 0:1014af42efd9 396
simon 0:1014af42efd9 397 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
simon 0:1014af42efd9 398 ** No loop unrolling is used. */
simon 0:1014af42efd9 399 k = srcBLen % 0x4u;
simon 0:1014af42efd9 400
simon 0:1014af42efd9 401 while(k > 0u)
simon 0:1014af42efd9 402 {
simon 0:1014af42efd9 403 /* Read y[srcBLen - 5] sample */
simon 0:1014af42efd9 404 c0 = *(py--);
simon 0:1014af42efd9 405
simon 0:1014af42efd9 406 /* Read x[7] sample */
simon 0:1014af42efd9 407 x3 = *(px++);
simon 0:1014af42efd9 408
simon 0:1014af42efd9 409 /* Perform the multiply-accumulates */
simon 0:1014af42efd9 410 /* acc0 += x[4] * y[srcBLen - 5] */
simon 0:1014af42efd9 411 acc0 += ((q31_t) x0 * c0);
simon 0:1014af42efd9 412 /* acc1 += x[5] * y[srcBLen - 5] */
simon 0:1014af42efd9 413 acc1 += ((q31_t) x1 * c0);
simon 0:1014af42efd9 414 /* acc2 += x[6] * y[srcBLen - 5] */
simon 0:1014af42efd9 415 acc2 += ((q31_t) x2 * c0);
simon 0:1014af42efd9 416 /* acc3 += x[7] * y[srcBLen - 5] */
simon 0:1014af42efd9 417 acc3 += ((q31_t) x3 * c0);
simon 0:1014af42efd9 418
simon 0:1014af42efd9 419 /* Reuse the present samples for the next MAC */
simon 0:1014af42efd9 420 x0 = x1;
simon 0:1014af42efd9 421 x1 = x2;
simon 0:1014af42efd9 422 x2 = x3;
simon 0:1014af42efd9 423
simon 0:1014af42efd9 424 /* Decrement the loop counter */
simon 0:1014af42efd9 425 k--;
simon 0:1014af42efd9 426 }
simon 0:1014af42efd9 427
simon 0:1014af42efd9 428 /* Store the result in the accumulator in the destination buffer. */
simon 0:1014af42efd9 429 *pOut++ = (q7_t) (__SSAT(acc0 >> 7, 8));
simon 0:1014af42efd9 430 *pOut++ = (q7_t) (__SSAT(acc1 >> 7, 8));
simon 0:1014af42efd9 431 *pOut++ = (q7_t) (__SSAT(acc2 >> 7, 8));
simon 0:1014af42efd9 432 *pOut++ = (q7_t) (__SSAT(acc3 >> 7, 8));
simon 0:1014af42efd9 433
simon 0:1014af42efd9 434 /* Update the inputA and inputB pointers for next MAC calculation */
simon 0:1014af42efd9 435 px = pIn1 + count * 4u;
simon 0:1014af42efd9 436 py = pSrc2;
simon 0:1014af42efd9 437
simon 0:1014af42efd9 438 /* Increment the pointer pIn1 index, count by 1 */
simon 0:1014af42efd9 439 count++;
simon 0:1014af42efd9 440
simon 0:1014af42efd9 441 /* Decrement the loop counter */
simon 0:1014af42efd9 442 blkCnt--;
simon 0:1014af42efd9 443 }
simon 0:1014af42efd9 444
simon 0:1014af42efd9 445 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
simon 0:1014af42efd9 446 ** No loop unrolling is used. */
simon 0:1014af42efd9 447 blkCnt = (uint32_t) blockSize2 % 0x4u;
simon 0:1014af42efd9 448
simon 0:1014af42efd9 449 while(blkCnt > 0u)
simon 0:1014af42efd9 450 {
simon 0:1014af42efd9 451 /* Accumulator is made zero for every iteration */
simon 0:1014af42efd9 452 sum = 0;
simon 0:1014af42efd9 453
simon 0:1014af42efd9 454 /* Apply loop unrolling and compute 4 MACs simultaneously. */
simon 0:1014af42efd9 455 k = srcBLen >> 2u;
simon 0:1014af42efd9 456
simon 0:1014af42efd9 457 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
simon 0:1014af42efd9 458 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
simon 0:1014af42efd9 459 while(k > 0u)
simon 0:1014af42efd9 460 {
simon 0:1014af42efd9 461
simon 0:1014af42efd9 462 /* Reading two inputs of SrcA buffer and packing */
simon 0:1014af42efd9 463 in1 = (q15_t) * px++;
simon 0:1014af42efd9 464 in2 = (q15_t) * px++;
simon 0:1014af42efd9 465 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
simon 0:1014af42efd9 466
simon 0:1014af42efd9 467 /* Reading two inputs of SrcB buffer and packing */
simon 0:1014af42efd9 468 in1 = (q15_t) * py--;
simon 0:1014af42efd9 469 in2 = (q15_t) * py--;
simon 0:1014af42efd9 470 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
simon 0:1014af42efd9 471
simon 0:1014af42efd9 472 /* Perform the multiply-accumulates */
simon 0:1014af42efd9 473 sum = __SMLAD(input1, input2, sum);
simon 0:1014af42efd9 474
simon 0:1014af42efd9 475 /* Reading two inputs of SrcA buffer and packing */
simon 0:1014af42efd9 476 in1 = (q15_t) * px++;
simon 0:1014af42efd9 477 in2 = (q15_t) * px++;
simon 0:1014af42efd9 478 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
simon 0:1014af42efd9 479
simon 0:1014af42efd9 480 /* Reading two inputs of SrcB buffer and packing */
simon 0:1014af42efd9 481 in1 = (q15_t) * py--;
simon 0:1014af42efd9 482 in2 = (q15_t) * py--;
simon 0:1014af42efd9 483 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
simon 0:1014af42efd9 484
simon 0:1014af42efd9 485 /* Perform the multiply-accumulates */
simon 0:1014af42efd9 486 sum = __SMLAD(input1, input2, sum);
simon 0:1014af42efd9 487
simon 0:1014af42efd9 488 /* Decrement the loop counter */
simon 0:1014af42efd9 489 k--;
simon 0:1014af42efd9 490 }
simon 0:1014af42efd9 491
simon 0:1014af42efd9 492 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
simon 0:1014af42efd9 493 ** No loop unrolling is used. */
simon 0:1014af42efd9 494 k = srcBLen % 0x4u;
simon 0:1014af42efd9 495
simon 0:1014af42efd9 496 while(k > 0u)
simon 0:1014af42efd9 497 {
simon 0:1014af42efd9 498 /* Perform the multiply-accumulates */
simon 0:1014af42efd9 499 sum += ((q31_t) * px++ * *py--);
simon 0:1014af42efd9 500
simon 0:1014af42efd9 501 /* Decrement the loop counter */
simon 0:1014af42efd9 502 k--;
simon 0:1014af42efd9 503 }
simon 0:1014af42efd9 504
simon 0:1014af42efd9 505 /* Store the result in the accumulator in the destination buffer. */
simon 0:1014af42efd9 506 *pOut++ = (q7_t) (__SSAT(sum >> 7, 8));
simon 0:1014af42efd9 507
simon 0:1014af42efd9 508 /* Update the inputA and inputB pointers for next MAC calculation */
simon 0:1014af42efd9 509 px = pIn1 + count;
simon 0:1014af42efd9 510 py = pSrc2;
simon 0:1014af42efd9 511
simon 0:1014af42efd9 512 /* Increment the pointer pIn1 index, count by 1 */
simon 0:1014af42efd9 513 count++;
simon 0:1014af42efd9 514
simon 0:1014af42efd9 515 /* Decrement the loop counter */
simon 0:1014af42efd9 516 blkCnt--;
simon 0:1014af42efd9 517 }
simon 0:1014af42efd9 518 }
simon 0:1014af42efd9 519 else
simon 0:1014af42efd9 520 {
simon 0:1014af42efd9 521 /* If the srcBLen is not a multiple of 4,
simon 0:1014af42efd9 522 * the blockSize2 loop cannot be unrolled by 4 */
simon 0:1014af42efd9 523 blkCnt = (uint32_t) blockSize2;
simon 0:1014af42efd9 524
simon 0:1014af42efd9 525 while(blkCnt > 0u)
simon 0:1014af42efd9 526 {
simon 0:1014af42efd9 527 /* Accumulator is made zero for every iteration */
simon 0:1014af42efd9 528 sum = 0;
simon 0:1014af42efd9 529
simon 0:1014af42efd9 530 /* srcBLen number of MACS should be performed */
simon 0:1014af42efd9 531 k = srcBLen;
simon 0:1014af42efd9 532
simon 0:1014af42efd9 533 while(k > 0u)
simon 0:1014af42efd9 534 {
simon 0:1014af42efd9 535 /* Perform the multiply-accumulate */
simon 0:1014af42efd9 536 sum += ((q31_t) * px++ * *py--);
simon 0:1014af42efd9 537
simon 0:1014af42efd9 538 /* Decrement the loop counter */
simon 0:1014af42efd9 539 k--;
simon 0:1014af42efd9 540 }
simon 0:1014af42efd9 541
simon 0:1014af42efd9 542 /* Store the result in the accumulator in the destination buffer. */
simon 0:1014af42efd9 543 *pOut++ = (q7_t) (__SSAT(sum >> 7, 8));
simon 0:1014af42efd9 544
simon 0:1014af42efd9 545 /* Update the inputA and inputB pointers for next MAC calculation */
simon 0:1014af42efd9 546 px = pIn1 + count;
simon 0:1014af42efd9 547 py = pSrc2;
simon 0:1014af42efd9 548
simon 0:1014af42efd9 549 /* Increment the MAC count */
simon 0:1014af42efd9 550 count++;
simon 0:1014af42efd9 551
simon 0:1014af42efd9 552 /* Decrement the loop counter */
simon 0:1014af42efd9 553 blkCnt--;
simon 0:1014af42efd9 554 }
simon 0:1014af42efd9 555 }
simon 0:1014af42efd9 556
simon 0:1014af42efd9 557
simon 0:1014af42efd9 558 /* --------------------------
simon 0:1014af42efd9 559 * Initializations of stage3
simon 0:1014af42efd9 560 * -------------------------*/
simon 0:1014af42efd9 561
simon 0:1014af42efd9 562 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
simon 0:1014af42efd9 563 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
simon 0:1014af42efd9 564 * ....
simon 0:1014af42efd9 565 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
simon 0:1014af42efd9 566 * sum += x[srcALen-1] * y[srcBLen-1]
simon 0:1014af42efd9 567 */
simon 0:1014af42efd9 568
simon 0:1014af42efd9 569 /* In this stage the MAC operations are decreased by 1 for every iteration.
simon 0:1014af42efd9 570 The count variable holds the number of MAC operations performed */
simon 0:1014af42efd9 571 count = srcBLen - 1u;
simon 0:1014af42efd9 572
simon 0:1014af42efd9 573 /* Working pointer of inputA */
simon 0:1014af42efd9 574 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
simon 0:1014af42efd9 575 px = pSrc1;
simon 0:1014af42efd9 576
simon 0:1014af42efd9 577 /* Working pointer of inputB */
simon 0:1014af42efd9 578 pSrc2 = pIn2 + (srcBLen - 1u);
simon 0:1014af42efd9 579 py = pSrc2;
simon 0:1014af42efd9 580
simon 0:1014af42efd9 581 /* -------------------
simon 0:1014af42efd9 582 * Stage3 process
simon 0:1014af42efd9 583 * ------------------*/
simon 0:1014af42efd9 584
simon 0:1014af42efd9 585 while(blockSize3 > 0)
simon 0:1014af42efd9 586 {
simon 0:1014af42efd9 587 /* Accumulator is made zero for every iteration */
simon 0:1014af42efd9 588 sum = 0;
simon 0:1014af42efd9 589
simon 0:1014af42efd9 590 /* Apply loop unrolling and compute 4 MACs simultaneously. */
simon 0:1014af42efd9 591 k = count >> 2u;
simon 0:1014af42efd9 592
simon 0:1014af42efd9 593 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
simon 0:1014af42efd9 594 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
simon 0:1014af42efd9 595 while(k > 0u)
simon 0:1014af42efd9 596 {
simon 0:1014af42efd9 597 /* Reading two inputs, x[srcALen - srcBLen + 1] and x[srcALen - srcBLen + 2] of SrcA buffer and packing */
simon 0:1014af42efd9 598 in1 = (q15_t) * px++;
simon 0:1014af42efd9 599 in2 = (q15_t) * px++;
simon 0:1014af42efd9 600 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
simon 0:1014af42efd9 601
simon 0:1014af42efd9 602 /* Reading two inputs, y[srcBLen - 1] and y[srcBLen - 2] of SrcB buffer and packing */
simon 0:1014af42efd9 603 in1 = (q15_t) * py--;
simon 0:1014af42efd9 604 in2 = (q15_t) * py--;
simon 0:1014af42efd9 605 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
simon 0:1014af42efd9 606
simon 0:1014af42efd9 607 /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
simon 0:1014af42efd9 608 /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
simon 0:1014af42efd9 609 sum = __SMLAD(input1, input2, sum);
simon 0:1014af42efd9 610
simon 0:1014af42efd9 611 /* Reading two inputs, x[srcALen - srcBLen + 3] and x[srcALen - srcBLen + 4] of SrcA buffer and packing */
simon 0:1014af42efd9 612 in1 = (q15_t) * px++;
simon 0:1014af42efd9 613 in2 = (q15_t) * px++;
simon 0:1014af42efd9 614 input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
simon 0:1014af42efd9 615
simon 0:1014af42efd9 616 /* Reading two inputs, y[srcBLen - 3] and y[srcBLen - 4] of SrcB buffer and packing */
simon 0:1014af42efd9 617 in1 = (q15_t) * py--;
simon 0:1014af42efd9 618 in2 = (q15_t) * py--;
simon 0:1014af42efd9 619 input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
simon 0:1014af42efd9 620
simon 0:1014af42efd9 621 /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
simon 0:1014af42efd9 622 /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
simon 0:1014af42efd9 623 sum = __SMLAD(input1, input2, sum);
simon 0:1014af42efd9 624
simon 0:1014af42efd9 625 /* Decrement the loop counter */
simon 0:1014af42efd9 626 k--;
simon 0:1014af42efd9 627 }
simon 0:1014af42efd9 628
simon 0:1014af42efd9 629 /* If the count is not a multiple of 4, compute any remaining MACs here.
simon 0:1014af42efd9 630 ** No loop unrolling is used. */
simon 0:1014af42efd9 631 k = count % 0x4u;
simon 0:1014af42efd9 632
simon 0:1014af42efd9 633 while(k > 0u)
simon 0:1014af42efd9 634 {
simon 0:1014af42efd9 635 /* Perform the multiply-accumulates */
simon 0:1014af42efd9 636 /* sum += x[srcALen-1] * y[srcBLen-1] */
simon 0:1014af42efd9 637 sum += ((q31_t) * px++ * *py--);
simon 0:1014af42efd9 638
simon 0:1014af42efd9 639 /* Decrement the loop counter */
simon 0:1014af42efd9 640 k--;
simon 0:1014af42efd9 641 }
simon 0:1014af42efd9 642
simon 0:1014af42efd9 643 /* Store the result in the accumulator in the destination buffer. */
simon 0:1014af42efd9 644 *pOut++ = (q7_t) (__SSAT(sum >> 7, 8));
simon 0:1014af42efd9 645
simon 0:1014af42efd9 646 /* Update the inputA and inputB pointers for next MAC calculation */
simon 0:1014af42efd9 647 px = ++pSrc1;
simon 0:1014af42efd9 648 py = pSrc2;
simon 0:1014af42efd9 649
simon 0:1014af42efd9 650 /* Decrement the MAC count */
simon 0:1014af42efd9 651 count--;
simon 0:1014af42efd9 652
simon 0:1014af42efd9 653 /* Decrement the loop counter */
simon 0:1014af42efd9 654 blockSize3--;
simon 0:1014af42efd9 655
simon 0:1014af42efd9 656 }
simon 0:1014af42efd9 657
simon 0:1014af42efd9 658 /* set status as ARM_MATH_SUCCESS */
simon 0:1014af42efd9 659 status = ARM_MATH_SUCCESS;
simon 0:1014af42efd9 660 }
simon 0:1014af42efd9 661
simon 0:1014af42efd9 662 /* Return to application */
simon 0:1014af42efd9 663 return (status);
simon 0:1014af42efd9 664
simon 0:1014af42efd9 665 }
simon 0:1014af42efd9 666
simon 0:1014af42efd9 667 /**
simon 0:1014af42efd9 668 * @} end of PartialConv group
simon 0:1014af42efd9 669 */