CMSIS DSP library

Dependents:   KL25Z_FFT_Demo Hat_Board_v5_1 KL25Z_FFT_Demo_tony KL25Z_FFT_Demo_tony ... more

Fork of mbed-dsp by mbed official

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_conv_partial_q7.c Source File

arm_conv_partial_q7.c

00001 /* ----------------------------------------------------------------------   
00002 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.   
00003 *   
00004 * $Date:        17. January 2013
00005 * $Revision:    V1.4.1
00006 *   
00007 * Project:      CMSIS DSP Library   
00008 * Title:        arm_conv_partial_q7.c   
00009 *   
00010 * Description:  Partial convolution of Q7 sequences.   
00011 *   
00012 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
00013 *  
00014 * Redistribution and use in source and binary forms, with or without 
00015 * modification, are permitted provided that the following conditions
00016 * are met:
00017 *   - Redistributions of source code must retain the above copyright
00018 *     notice, this list of conditions and the following disclaimer.
00019 *   - Redistributions in binary form must reproduce the above copyright
00020 *     notice, this list of conditions and the following disclaimer in
00021 *     the documentation and/or other materials provided with the 
00022 *     distribution.
00023 *   - Neither the name of ARM LIMITED nor the names of its contributors
00024 *     may be used to endorse or promote products derived from this
00025 *     software without specific prior written permission.
00026 *
00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
00038 * POSSIBILITY OF SUCH DAMAGE.    
00039 * -------------------------------------------------------------------- */
00040 
00041 #include "arm_math.h"
00042 
00043 /**   
00044  * @ingroup groupFilters   
00045  */
00046 
00047 /**   
00048  * @addtogroup PartialConv   
00049  * @{   
00050  */
00051 
00052 /**   
00053  * @brief Partial convolution of Q7 sequences.   
00054  * @param[in]       *pSrcA points to the first input sequence.   
00055  * @param[in]       srcALen length of the first input sequence.   
00056  * @param[in]       *pSrcB points to the second input sequence.   
00057  * @param[in]       srcBLen length of the second input sequence.   
00058  * @param[out]      *pDst points to the location where the output result is written.   
00059  * @param[in]       firstIndex is the first output sample to start with.   
00060  * @param[in]       numPoints is the number of output points to be computed.   
00061  * @return  Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].   
00062  *  
00063  * \par    
00064  * Refer the function <code>arm_conv_partial_opt_q7()</code> for a faster implementation of this function.
00065  *  
00066  */
00067 
00068 arm_status arm_conv_partial_q7(
00069   q7_t * pSrcA,
00070   uint32_t srcALen,
00071   q7_t * pSrcB,
00072   uint32_t srcBLen,
00073   q7_t * pDst,
00074   uint32_t firstIndex,
00075   uint32_t numPoints)
00076 {
00077 
00078 
00079 #ifndef ARM_MATH_CM0_FAMILY
00080 
00081   /* Run the below code for Cortex-M4 and Cortex-M3 */
00082 
00083   q7_t *pIn1;                                    /* inputA pointer */
00084   q7_t *pIn2;                                    /* inputB pointer */
00085   q7_t *pOut = pDst;                             /* output pointer */
00086   q7_t *px;                                      /* Intermediate inputA pointer */
00087   q7_t *py;                                      /* Intermediate inputB pointer */
00088   q7_t *pSrc1, *pSrc2;                           /* Intermediate pointers */
00089   q31_t sum, acc0, acc1, acc2, acc3;             /* Accumulator */
00090   q31_t input1, input2;
00091   q15_t in1, in2;
00092   q7_t x0, x1, x2, x3, c0, c1;
00093   uint32_t j, k, count, check, blkCnt;
00094   int32_t blockSize1, blockSize2, blockSize3;    /* loop counter */
00095   arm_status status;
00096 
00097 
00098   /* Check for range of output samples to be calculated */
00099   if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
00100   {
00101     /* Set status as ARM_MATH_ARGUMENT_ERROR */
00102     status = ARM_MATH_ARGUMENT_ERROR;
00103   }
00104   else
00105   {
00106 
00107     /* The algorithm implementation is based on the lengths of the inputs. */
00108     /* srcB is always made to slide across srcA. */
00109     /* So srcBLen is always considered as shorter or equal to srcALen */
00110     if(srcALen >= srcBLen)
00111     {
00112       /* Initialization of inputA pointer */
00113       pIn1 = pSrcA;
00114 
00115       /* Initialization of inputB pointer */
00116       pIn2 = pSrcB;
00117     }
00118     else
00119     {
00120       /* Initialization of inputA pointer */
00121       pIn1 = pSrcB;
00122 
00123       /* Initialization of inputB pointer */
00124       pIn2 = pSrcA;
00125 
00126       /* srcBLen is always considered as shorter or equal to srcALen */
00127       j = srcBLen;
00128       srcBLen = srcALen;
00129       srcALen = j;
00130     }
00131 
00132     /* Conditions to check which loopCounter holds   
00133      * the first and last indices of the output samples to be calculated. */
00134     check = firstIndex + numPoints;
00135     blockSize3 = ((int32_t) check - (int32_t) srcALen);
00136     blockSize3 = (blockSize3 > 0) ? blockSize3 : 0;
00137     blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex);
00138     blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 :
00139                                      (int32_t) numPoints) : 0;
00140     blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) +
00141                                     (int32_t) firstIndex);
00142     blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
00143 
00144     /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
00145     /* The function is internally   
00146      * divided into three stages according to the number of multiplications that has to be   
00147      * taken place between inputA samples and inputB samples. In the first stage of the   
00148      * algorithm, the multiplications increase by one for every iteration.   
00149      * In the second stage of the algorithm, srcBLen number of multiplications are done.   
00150      * In the third stage of the algorithm, the multiplications decrease by one   
00151      * for every iteration. */
00152 
00153     /* Set the output pointer to point to the firstIndex   
00154      * of the output sample to be calculated. */
00155     pOut = pDst + firstIndex;
00156 
00157     /* --------------------------   
00158      * Initializations of stage1   
00159      * -------------------------*/
00160 
00161     /* sum = x[0] * y[0]   
00162      * sum = x[0] * y[1] + x[1] * y[0]   
00163      * ....   
00164      * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]   
00165      */
00166 
00167     /* In this stage the MAC operations are increased by 1 for every iteration.   
00168        The count variable holds the number of MAC operations performed.   
00169        Since the partial convolution starts from from firstIndex   
00170        Number of Macs to be performed is firstIndex + 1 */
00171     count = 1u + firstIndex;
00172 
00173     /* Working pointer of inputA */
00174     px = pIn1;
00175 
00176     /* Working pointer of inputB */
00177     pSrc2 = pIn2 + firstIndex;
00178     py = pSrc2;
00179 
00180     /* ------------------------   
00181      * Stage1 process   
00182      * ----------------------*/
00183 
00184     /* The first stage starts here */
00185     while(blockSize1 > 0)
00186     {
00187       /* Accumulator is made zero for every iteration */
00188       sum = 0;
00189 
00190       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00191       k = count >> 2u;
00192 
00193       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.   
00194        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00195       while(k > 0u)
00196       {
00197         /* x[0] , x[1] */
00198         in1 = (q15_t) * px++;
00199         in2 = (q15_t) * px++;
00200         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00201 
00202         /* y[srcBLen - 1] , y[srcBLen - 2] */
00203         in1 = (q15_t) * py--;
00204         in2 = (q15_t) * py--;
00205         input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00206 
00207         /* x[0] * y[srcBLen - 1] */
00208         /* x[1] * y[srcBLen - 2] */
00209         sum = __SMLAD(input1, input2, sum);
00210 
00211         /* x[2] , x[3] */
00212         in1 = (q15_t) * px++;
00213         in2 = (q15_t) * px++;
00214         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00215 
00216         /* y[srcBLen - 3] , y[srcBLen - 4] */
00217         in1 = (q15_t) * py--;
00218         in2 = (q15_t) * py--;
00219         input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00220 
00221         /* x[2] * y[srcBLen - 3] */
00222         /* x[3] * y[srcBLen - 4] */
00223         sum = __SMLAD(input1, input2, sum);
00224 
00225         /* Decrement the loop counter */
00226         k--;
00227       }
00228 
00229       /* If the count is not a multiple of 4, compute any remaining MACs here.   
00230        ** No loop unrolling is used. */
00231       k = count % 0x4u;
00232 
00233       while(k > 0u)
00234       {
00235         /* Perform the multiply-accumulates */
00236         sum += ((q31_t) * px++ * *py--);
00237 
00238         /* Decrement the loop counter */
00239         k--;
00240       }
00241 
00242       /* Store the result in the accumulator in the destination buffer. */
00243       *pOut++ = (q7_t) (__SSAT(sum >> 7, 8));
00244 
00245       /* Update the inputA and inputB pointers for next MAC calculation */
00246       py = ++pSrc2;
00247       px = pIn1;
00248 
00249       /* Increment the MAC count */
00250       count++;
00251 
00252       /* Decrement the loop counter */
00253       blockSize1--;
00254     }
00255 
00256     /* --------------------------   
00257      * Initializations of stage2   
00258      * ------------------------*/
00259 
00260     /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]   
00261      * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]   
00262      * ....   
00263      * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]   
00264      */
00265 
00266     /* Working pointer of inputA */
00267     px = pIn1;
00268 
00269     /* Working pointer of inputB */
00270     pSrc2 = pIn2 + (srcBLen - 1u);
00271     py = pSrc2;
00272 
00273     /* count is index by which the pointer pIn1 to be incremented */
00274     count = 0u;
00275 
00276     /* -------------------   
00277      * Stage2 process   
00278      * ------------------*/
00279 
00280     /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.   
00281      * So, to loop unroll over blockSize2,   
00282      * srcBLen should be greater than or equal to 4 */
00283     if(srcBLen >= 4u)
00284     {
00285       /* Loop unroll over blockSize2, by 4 */
00286       blkCnt = ((uint32_t) blockSize2 >> 2u);
00287 
00288       while(blkCnt > 0u)
00289       {
00290         /* Set all accumulators to zero */
00291         acc0 = 0;
00292         acc1 = 0;
00293         acc2 = 0;
00294         acc3 = 0;
00295 
00296         /* read x[0], x[1], x[2] samples */
00297         x0 = *(px++);
00298         x1 = *(px++);
00299         x2 = *(px++);
00300 
00301         /* Apply loop unrolling and compute 4 MACs simultaneously. */
00302         k = srcBLen >> 2u;
00303 
00304         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.   
00305          ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00306         do
00307         {
00308           /* Read y[srcBLen - 1] sample */
00309           c0 = *(py--);
00310           /* Read y[srcBLen - 2] sample */
00311           c1 = *(py--);
00312 
00313           /* Read x[3] sample */
00314           x3 = *(px++);
00315 
00316           /* x[0] and x[1] are packed */
00317           in1 = (q15_t) x0;
00318           in2 = (q15_t) x1;
00319 
00320           input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00321 
00322           /* y[srcBLen - 1]   and y[srcBLen - 2] are packed */
00323           in1 = (q15_t) c0;
00324           in2 = (q15_t) c1;
00325 
00326           input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00327 
00328           /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2]  */
00329           acc0 = __SMLAD(input1, input2, acc0);
00330 
00331           /* x[1] and x[2] are packed */
00332           in1 = (q15_t) x1;
00333           in2 = (q15_t) x2;
00334 
00335           input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00336 
00337           /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2]  */
00338           acc1 = __SMLAD(input1, input2, acc1);
00339 
00340           /* x[2] and x[3] are packed */
00341           in1 = (q15_t) x2;
00342           in2 = (q15_t) x3;
00343 
00344           input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00345 
00346           /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2]  */
00347           acc2 = __SMLAD(input1, input2, acc2);
00348 
00349           /* Read x[4] sample */
00350           x0 = *(px++);
00351 
00352           /* x[3] and x[4] are packed */
00353           in1 = (q15_t) x3;
00354           in2 = (q15_t) x0;
00355 
00356           input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00357 
00358           /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2]  */
00359           acc3 = __SMLAD(input1, input2, acc3);
00360 
00361           /* Read y[srcBLen - 3] sample */
00362           c0 = *(py--);
00363           /* Read y[srcBLen - 4] sample */
00364           c1 = *(py--);
00365 
00366           /* Read x[5] sample */
00367           x1 = *(px++);
00368 
00369           /* x[2] and x[3] are packed */
00370           in1 = (q15_t) x2;
00371           in2 = (q15_t) x3;
00372 
00373           input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00374 
00375           /* y[srcBLen - 3] and y[srcBLen - 4] are packed */
00376           in1 = (q15_t) c0;
00377           in2 = (q15_t) c1;
00378 
00379           input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00380 
00381           /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4]  */
00382           acc0 = __SMLAD(input1, input2, acc0);
00383 
00384           /* x[3] and x[4] are packed */
00385           in1 = (q15_t) x3;
00386           in2 = (q15_t) x0;
00387 
00388           input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00389 
00390           /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4]  */
00391           acc1 = __SMLAD(input1, input2, acc1);
00392 
00393           /* x[4] and x[5] are packed */
00394           in1 = (q15_t) x0;
00395           in2 = (q15_t) x1;
00396 
00397           input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00398 
00399           /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4]  */
00400           acc2 = __SMLAD(input1, input2, acc2);
00401 
00402           /* Read x[6] sample */
00403           x2 = *(px++);
00404 
00405           /* x[5] and x[6] are packed */
00406           in1 = (q15_t) x1;
00407           in2 = (q15_t) x2;
00408 
00409           input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00410 
00411           /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4]  */
00412           acc3 = __SMLAD(input1, input2, acc3);
00413 
00414         } while(--k);
00415 
00416         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.   
00417          ** No loop unrolling is used. */
00418         k = srcBLen % 0x4u;
00419 
00420         while(k > 0u)
00421         {
00422           /* Read y[srcBLen - 5] sample */
00423           c0 = *(py--);
00424 
00425           /* Read x[7] sample */
00426           x3 = *(px++);
00427 
00428           /* Perform the multiply-accumulates */
00429           /* acc0 +=  x[4] * y[srcBLen - 5] */
00430           acc0 += ((q31_t) x0 * c0);
00431           /* acc1 +=  x[5] * y[srcBLen - 5] */
00432           acc1 += ((q31_t) x1 * c0);
00433           /* acc2 +=  x[6] * y[srcBLen - 5] */
00434           acc2 += ((q31_t) x2 * c0);
00435           /* acc3 +=  x[7] * y[srcBLen - 5] */
00436           acc3 += ((q31_t) x3 * c0);
00437 
00438           /* Reuse the present samples for the next MAC */
00439           x0 = x1;
00440           x1 = x2;
00441           x2 = x3;
00442 
00443           /* Decrement the loop counter */
00444           k--;
00445         }
00446 
00447         /* Store the result in the accumulator in the destination buffer. */
00448         *pOut++ = (q7_t) (__SSAT(acc0 >> 7, 8));
00449         *pOut++ = (q7_t) (__SSAT(acc1 >> 7, 8));
00450         *pOut++ = (q7_t) (__SSAT(acc2 >> 7, 8));
00451         *pOut++ = (q7_t) (__SSAT(acc3 >> 7, 8));
00452 
00453         /* Increment the pointer pIn1 index, count by 4 */
00454         count += 4u;
00455 
00456         /* Update the inputA and inputB pointers for next MAC calculation */
00457         px = pIn1 + count;
00458         py = pSrc2;
00459 
00460 
00461         /* Decrement the loop counter */
00462         blkCnt--;
00463       }
00464 
00465       /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.   
00466        ** No loop unrolling is used. */
00467       blkCnt = (uint32_t) blockSize2 % 0x4u;
00468 
00469       while(blkCnt > 0u)
00470       {
00471         /* Accumulator is made zero for every iteration */
00472         sum = 0;
00473 
00474         /* Apply loop unrolling and compute 4 MACs simultaneously. */
00475         k = srcBLen >> 2u;
00476 
00477         /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.   
00478          ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00479         while(k > 0u)
00480         {
00481 
00482           /* Reading two inputs of SrcA buffer and packing */
00483           in1 = (q15_t) * px++;
00484           in2 = (q15_t) * px++;
00485           input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00486 
00487           /* Reading two inputs of SrcB buffer and packing */
00488           in1 = (q15_t) * py--;
00489           in2 = (q15_t) * py--;
00490           input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00491 
00492           /* Perform the multiply-accumulates */
00493           sum = __SMLAD(input1, input2, sum);
00494 
00495           /* Reading two inputs of SrcA buffer and packing */
00496           in1 = (q15_t) * px++;
00497           in2 = (q15_t) * px++;
00498           input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00499 
00500           /* Reading two inputs of SrcB buffer and packing */
00501           in1 = (q15_t) * py--;
00502           in2 = (q15_t) * py--;
00503           input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00504 
00505           /* Perform the multiply-accumulates */
00506           sum = __SMLAD(input1, input2, sum);
00507 
00508           /* Decrement the loop counter */
00509           k--;
00510         }
00511 
00512         /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.   
00513          ** No loop unrolling is used. */
00514         k = srcBLen % 0x4u;
00515 
00516         while(k > 0u)
00517         {
00518           /* Perform the multiply-accumulates */
00519           sum += ((q31_t) * px++ * *py--);
00520 
00521           /* Decrement the loop counter */
00522           k--;
00523         }
00524 
00525         /* Store the result in the accumulator in the destination buffer. */
00526         *pOut++ = (q7_t) (__SSAT(sum >> 7, 8));
00527 
00528         /* Increment the pointer pIn1 index, count by 1 */
00529         count++;
00530 
00531         /* Update the inputA and inputB pointers for next MAC calculation */
00532         px = pIn1 + count;
00533         py = pSrc2; 
00534 
00535         /* Decrement the loop counter */
00536         blkCnt--;
00537       }
00538     }
00539     else
00540     {
00541       /* If the srcBLen is not a multiple of 4,   
00542        * the blockSize2 loop cannot be unrolled by 4 */
00543       blkCnt = (uint32_t) blockSize2;
00544 
00545       while(blkCnt > 0u)
00546       {
00547         /* Accumulator is made zero for every iteration */
00548         sum = 0;
00549 
00550         /* srcBLen number of MACS should be performed */
00551         k = srcBLen;
00552 
00553         while(k > 0u)
00554         {
00555           /* Perform the multiply-accumulate */
00556           sum += ((q31_t) * px++ * *py--);
00557 
00558           /* Decrement the loop counter */
00559           k--;
00560         }
00561 
00562         /* Store the result in the accumulator in the destination buffer. */
00563         *pOut++ = (q7_t) (__SSAT(sum >> 7, 8));
00564 
00565         /* Increment the MAC count */
00566         count++;
00567 
00568         /* Update the inputA and inputB pointers for next MAC calculation */
00569         px = pIn1 + count;
00570         py = pSrc2;
00571 
00572         /* Decrement the loop counter */
00573         blkCnt--;
00574       }
00575     }
00576 
00577 
00578     /* --------------------------   
00579      * Initializations of stage3   
00580      * -------------------------*/
00581 
00582     /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]   
00583      * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]   
00584      * ....   
00585      * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]   
00586      * sum +=  x[srcALen-1] * y[srcBLen-1]   
00587      */
00588 
00589     /* In this stage the MAC operations are decreased by 1 for every iteration.   
00590        The count variable holds the number of MAC operations performed */
00591     count = srcBLen - 1u;
00592 
00593     /* Working pointer of inputA */
00594     pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
00595     px = pSrc1;
00596 
00597     /* Working pointer of inputB */
00598     pSrc2 = pIn2 + (srcBLen - 1u);
00599     py = pSrc2;
00600 
00601     /* -------------------   
00602      * Stage3 process   
00603      * ------------------*/
00604 
00605     while(blockSize3 > 0)
00606     {
00607       /* Accumulator is made zero for every iteration */
00608       sum = 0;
00609 
00610       /* Apply loop unrolling and compute 4 MACs simultaneously. */
00611       k = count >> 2u;
00612 
00613       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.   
00614        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
00615       while(k > 0u)
00616       {
00617         /* Reading two inputs, x[srcALen - srcBLen + 1] and x[srcALen - srcBLen + 2] of SrcA buffer and packing */
00618         in1 = (q15_t) * px++;
00619         in2 = (q15_t) * px++;
00620         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00621 
00622         /* Reading two inputs, y[srcBLen - 1] and y[srcBLen - 2] of SrcB buffer and packing */
00623         in1 = (q15_t) * py--;
00624         in2 = (q15_t) * py--;
00625         input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00626 
00627         /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
00628         /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
00629         sum = __SMLAD(input1, input2, sum);
00630 
00631         /* Reading two inputs, x[srcALen - srcBLen + 3] and x[srcALen - srcBLen + 4] of SrcA buffer and packing */
00632         in1 = (q15_t) * px++;
00633         in2 = (q15_t) * px++;
00634         input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00635 
00636         /* Reading two inputs, y[srcBLen - 3] and y[srcBLen - 4] of SrcB buffer and packing */
00637         in1 = (q15_t) * py--;
00638         in2 = (q15_t) * py--;
00639         input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
00640 
00641         /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
00642         /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
00643         sum = __SMLAD(input1, input2, sum);
00644 
00645         /* Decrement the loop counter */
00646         k--;
00647       }
00648 
00649       /* If the count is not a multiple of 4, compute any remaining MACs here.   
00650        ** No loop unrolling is used. */
00651       k = count % 0x4u;
00652 
00653       while(k > 0u)
00654       {
00655         /* Perform the multiply-accumulates */
00656         /* sum +=  x[srcALen-1] * y[srcBLen-1] */
00657         sum += ((q31_t) * px++ * *py--);
00658 
00659         /* Decrement the loop counter */
00660         k--;
00661       }
00662 
00663       /* Store the result in the accumulator in the destination buffer. */
00664       *pOut++ = (q7_t) (__SSAT(sum >> 7, 8));
00665 
00666       /* Update the inputA and inputB pointers for next MAC calculation */
00667       px = ++pSrc1;
00668       py = pSrc2;
00669 
00670       /* Decrement the MAC count */
00671       count--;
00672 
00673       /* Decrement the loop counter */
00674       blockSize3--;
00675 
00676     }
00677 
00678     /* set status as ARM_MATH_SUCCESS */
00679     status = ARM_MATH_SUCCESS;
00680   }
00681 
00682   /* Return to application */
00683   return (status);
00684 
00685 #else
00686 
00687   /* Run the below code for Cortex-M0 */
00688 
00689   q7_t *pIn1 = pSrcA;                            /* inputA pointer */
00690   q7_t *pIn2 = pSrcB;                            /* inputB pointer */
00691   q31_t sum;                                     /* Accumulator */
00692   uint32_t i, j;                                 /* loop counters */
00693   arm_status status;                             /* status of Partial convolution */
00694 
00695   /* Check for range of output samples to be calculated */
00696   if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
00697   {
00698     /* Set status as ARM_ARGUMENT_ERROR */
00699     status = ARM_MATH_ARGUMENT_ERROR;
00700   }
00701   else
00702   {
00703     /* Loop to calculate convolution for output length number of values */
00704     for (i = firstIndex; i <= (firstIndex + numPoints - 1); i++)
00705     {
00706       /* Initialize sum with zero to carry on MAC operations */
00707       sum = 0;
00708 
00709       /* Loop to perform MAC operations according to convolution equation */
00710       for (j = 0; j <= i; j++)
00711       {
00712         /* Check the array limitations */
00713         if(((i - j) < srcBLen) && (j < srcALen))
00714         {
00715           /* z[i] += x[i-j] * y[j] */
00716           sum += ((q15_t) pIn1[j] * (pIn2[i - j]));
00717         }
00718       }
00719 
00720       /* Store the output in the destination buffer */
00721       pDst[i] = (q7_t) __SSAT((sum >> 7u), 8u);
00722     }
00723     /* set status as ARM_SUCCESS as there are no argument errors */
00724     status = ARM_MATH_SUCCESS;
00725   }
00726   return (status);
00727 
00728 #endif /*  #ifndef ARM_MATH_CM0_FAMILY */
00729 
00730 }
00731 
00732 /**   
00733  * @} end of PartialConv group   
00734  */