CMSIS DSP library

Dependents:   KL25Z_FFT_Demo Hat_Board_v5_1 KL25Z_FFT_Demo_tony KL25Z_FFT_Demo_tony ... more

Fork of mbed-dsp by mbed official

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers arm_conv_partial_opt_q7.c Source File

arm_conv_partial_opt_q7.c

00001 /* ----------------------------------------------------------------------    
00002 * Copyright (C) 2010-2013 ARM Limited. All rights reserved.    
00003 *    
00004 * $Date:        17. January 2013
00005 * $Revision:    V1.4.1
00006 *    
00007 * Project:      CMSIS DSP Library    
00008 * Title:        arm_conv_partial_opt_q7.c    
00009 *    
00010 * Description:  Partial convolution of Q7 sequences.    
00011 *    
00012 * Target Processor: Cortex-M4/Cortex-M3
00013 *  
00014 * Redistribution and use in source and binary forms, with or without 
00015 * modification, are permitted provided that the following conditions
00016 * are met:
00017 *   - Redistributions of source code must retain the above copyright
00018 *     notice, this list of conditions and the following disclaimer.
00019 *   - Redistributions in binary form must reproduce the above copyright
00020 *     notice, this list of conditions and the following disclaimer in
00021 *     the documentation and/or other materials provided with the 
00022 *     distribution.
00023 *   - Neither the name of ARM LIMITED nor the names of its contributors
00024 *     may be used to endorse or promote products derived from this
00025 *     software without specific prior written permission.
00026 *
00027 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
00028 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
00029 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
00030 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
00031 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
00032 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
00033 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
00034 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
00035 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00036 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
00037 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
00038 * POSSIBILITY OF SUCH DAMAGE.  
00039 * -------------------------------------------------------------------- */
00040 
00041 #include "arm_math.h"
00042 
00043 /**    
00044  * @ingroup groupFilters    
00045  */
00046 
00047 /**    
00048  * @addtogroup PartialConv    
00049  * @{    
00050  */
00051 
00052 /**    
00053  * @brief Partial convolution of Q7 sequences.    
00054  * @param[in]       *pSrcA points to the first input sequence.    
00055  * @param[in]       srcALen length of the first input sequence.    
00056  * @param[in]       *pSrcB points to the second input sequence.    
00057  * @param[in]       srcBLen length of the second input sequence.    
00058  * @param[out]      *pDst points to the location where the output result is written.    
00059  * @param[in]       firstIndex is the first output sample to start with.    
00060  * @param[in]       numPoints is the number of output points to be computed.    
00061  * @param[in]      *pScratch1 points to scratch buffer (of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.    
00062  * @param[in]      *pScratch2 points to scratch buffer (of type q15_t) of size min(srcALen, srcBLen).    
00063  * @return  Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].    
00064  *    
00065  * \par Restrictions    
00066  *  If the silicon does not support unaligned memory access, enable the macro UNALIGNED_SUPPORT_DISABLE.    
00067  *  In that case the input, output, scratch1 and scratch2 buffers should be 32-bit aligned.   
00068  * 
00069  *
00070  * 
00071  */
00072 
00073 
00074 #ifndef UNALIGNED_SUPPORT_DISABLE
00075 
00076 arm_status arm_conv_partial_opt_q7(
00077   q7_t * pSrcA,
00078   uint32_t srcALen,
00079   q7_t * pSrcB,
00080   uint32_t srcBLen,
00081   q7_t * pDst,
00082   uint32_t firstIndex,
00083   uint32_t numPoints,
00084   q15_t * pScratch1,
00085   q15_t * pScratch2)
00086 {
00087 
00088   q15_t *pScr2, *pScr1;                          /* Intermediate pointers for scratch pointers */
00089   q15_t x4;                                      /* Temporary input variable */
00090   q7_t *pIn1, *pIn2;                             /* inputA and inputB pointer */
00091   uint32_t j, k, blkCnt, tapCnt;                 /* loop counter */
00092   q7_t *px;                                      /* Temporary input1 pointer */
00093   q15_t *py;                                     /* Temporary input2 pointer */
00094   q31_t acc0, acc1, acc2, acc3;                  /* Accumulator */
00095   q31_t x1, x2, x3, y1;                          /* Temporary input variables */
00096   arm_status status;
00097   q7_t *pOut = pDst;                             /* output pointer */
00098   q7_t out0, out1, out2, out3;                   /* temporary variables */
00099 
00100   /* Check for range of output samples to be calculated */
00101   if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
00102   {
00103     /* Set status as ARM_MATH_ARGUMENT_ERROR */
00104     status = ARM_MATH_ARGUMENT_ERROR;
00105   }
00106   else
00107   {
00108 
00109     /* The algorithm implementation is based on the lengths of the inputs. */
00110     /* srcB is always made to slide across srcA. */
00111     /* So srcBLen is always considered as shorter or equal to srcALen */
00112     if(srcALen >= srcBLen)
00113     {
00114       /* Initialization of inputA pointer */
00115       pIn1 = pSrcA;
00116 
00117       /* Initialization of inputB pointer */
00118       pIn2 = pSrcB;
00119     }
00120     else
00121     {
00122       /* Initialization of inputA pointer */
00123       pIn1 = pSrcB;
00124 
00125       /* Initialization of inputB pointer */
00126       pIn2 = pSrcA;
00127 
00128       /* srcBLen is always considered as shorter or equal to srcALen */
00129       j = srcBLen;
00130       srcBLen = srcALen;
00131       srcALen = j;
00132     }
00133 
00134     /* pointer to take end of scratch2 buffer */
00135     pScr2 = pScratch2;
00136 
00137     /* points to smaller length sequence */
00138     px = pIn2 + srcBLen - 1;
00139 
00140     /* Apply loop unrolling and do 4 Copies simultaneously. */
00141     k = srcBLen >> 2u;
00142 
00143     /* First part of the processing with loop unrolling copies 4 data points at a time.       
00144      ** a second loop below copies for the remaining 1 to 3 samples. */
00145     while(k > 0u)
00146     {
00147       /* copy second buffer in reversal manner */
00148       x4 = (q15_t) * px--;
00149       *pScr2++ = x4;
00150       x4 = (q15_t) * px--;
00151       *pScr2++ = x4;
00152       x4 = (q15_t) * px--;
00153       *pScr2++ = x4;
00154       x4 = (q15_t) * px--;
00155       *pScr2++ = x4;
00156 
00157       /* Decrement the loop counter */
00158       k--;
00159     }
00160 
00161     /* If the count is not a multiple of 4, copy remaining samples here.       
00162      ** No loop unrolling is used. */
00163     k = srcBLen % 0x4u;
00164 
00165     while(k > 0u)
00166     {
00167       /* copy second buffer in reversal manner for remaining samples */
00168       x4 = (q15_t) * px--;
00169       *pScr2++ = x4;
00170 
00171       /* Decrement the loop counter */
00172       k--;
00173     }
00174 
00175     /* Initialze temporary scratch pointer */
00176     pScr1 = pScratch1;
00177 
00178     /* Fill (srcBLen - 1u) zeros in scratch buffer */
00179     arm_fill_q15(0, pScr1, (srcBLen - 1u));
00180 
00181     /* Update temporary scratch pointer */
00182     pScr1 += (srcBLen - 1u);
00183 
00184     /* Copy (srcALen) samples in scratch buffer */
00185     /* Apply loop unrolling and do 4 Copies simultaneously. */
00186     k = srcALen >> 2u;
00187 
00188     /* First part of the processing with loop unrolling copies 4 data points at a time.       
00189      ** a second loop below copies for the remaining 1 to 3 samples. */
00190     while(k > 0u)
00191     {
00192       /* copy second buffer in reversal manner */
00193       x4 = (q15_t) * pIn1++;
00194       *pScr1++ = x4;
00195       x4 = (q15_t) * pIn1++;
00196       *pScr1++ = x4;
00197       x4 = (q15_t) * pIn1++;
00198       *pScr1++ = x4;
00199       x4 = (q15_t) * pIn1++;
00200       *pScr1++ = x4;
00201 
00202       /* Decrement the loop counter */
00203       k--;
00204     }
00205 
00206     /* If the count is not a multiple of 4, copy remaining samples here.       
00207      ** No loop unrolling is used. */
00208     k = srcALen % 0x4u;
00209 
00210     while(k > 0u)
00211     {
00212       /* copy second buffer in reversal manner for remaining samples */
00213       x4 = (q15_t) * pIn1++;
00214       *pScr1++ = x4;
00215 
00216       /* Decrement the loop counter */
00217       k--;
00218     }
00219 
00220     /* Fill (srcBLen - 1u) zeros at end of scratch buffer */
00221     arm_fill_q15(0, pScr1, (srcBLen - 1u));
00222 
00223     /* Update pointer */
00224     pScr1 += (srcBLen - 1u);
00225 
00226 
00227     /* Temporary pointer for scratch2 */
00228     py = pScratch2;
00229 
00230     /* Initialization of pIn2 pointer */
00231     pIn2 = (q7_t *) py;
00232 
00233     pScr2 = py;
00234 
00235     pOut = pDst + firstIndex;
00236 
00237     pScratch1 += firstIndex;
00238 
00239     /* Actual convolution process starts here */
00240     blkCnt = (numPoints) >> 2;
00241 
00242 
00243     while(blkCnt > 0)
00244     {
00245       /* Initialze temporary scratch pointer as scratch1 */
00246       pScr1 = pScratch1;
00247 
00248       /* Clear Accumlators */
00249       acc0 = 0;
00250       acc1 = 0;
00251       acc2 = 0;
00252       acc3 = 0;
00253 
00254       /* Read two samples from scratch1 buffer */
00255       x1 = *__SIMD32(pScr1)++;
00256 
00257       /* Read next two samples from scratch1 buffer */
00258       x2 = *__SIMD32(pScr1)++;
00259 
00260       tapCnt = (srcBLen) >> 2u;
00261 
00262       while(tapCnt > 0u)
00263       {
00264 
00265         /* Read four samples from smaller buffer */
00266         y1 = _SIMD32_OFFSET(pScr2);
00267 
00268         /* multiply and accumlate */
00269         acc0 = __SMLAD(x1, y1, acc0);
00270         acc2 = __SMLAD(x2, y1, acc2);
00271 
00272         /* pack input data */
00273 #ifndef ARM_MATH_BIG_ENDIAN
00274         x3 = __PKHBT(x2, x1, 0);
00275 #else
00276         x3 = __PKHBT(x1, x2, 0);
00277 #endif
00278 
00279         /* multiply and accumlate */
00280         acc1 = __SMLADX(x3, y1, acc1);
00281 
00282         /* Read next two samples from scratch1 buffer */
00283         x1 = *__SIMD32(pScr1)++;
00284 
00285         /* pack input data */
00286 #ifndef ARM_MATH_BIG_ENDIAN
00287         x3 = __PKHBT(x1, x2, 0);
00288 #else
00289         x3 = __PKHBT(x2, x1, 0);
00290 #endif
00291 
00292         acc3 = __SMLADX(x3, y1, acc3);
00293 
00294         /* Read four samples from smaller buffer */
00295         y1 = _SIMD32_OFFSET(pScr2 + 2u);
00296 
00297         acc0 = __SMLAD(x2, y1, acc0);
00298 
00299         acc2 = __SMLAD(x1, y1, acc2);
00300 
00301         acc1 = __SMLADX(x3, y1, acc1);
00302 
00303         x2 = *__SIMD32(pScr1)++;
00304 
00305 #ifndef ARM_MATH_BIG_ENDIAN
00306         x3 = __PKHBT(x2, x1, 0);
00307 #else
00308         x3 = __PKHBT(x1, x2, 0);
00309 #endif
00310 
00311         acc3 = __SMLADX(x3, y1, acc3);
00312 
00313         pScr2 += 4u;
00314 
00315 
00316         /* Decrement the loop counter */
00317         tapCnt--;
00318       }
00319 
00320 
00321 
00322       /* Update scratch pointer for remaining samples of smaller length sequence */
00323       pScr1 -= 4u;
00324 
00325 
00326       /* apply same above for remaining samples of smaller length sequence */
00327       tapCnt = (srcBLen) & 3u;
00328 
00329       while(tapCnt > 0u)
00330       {
00331 
00332         /* accumlate the results */
00333         acc0 += (*pScr1++ * *pScr2);
00334         acc1 += (*pScr1++ * *pScr2);
00335         acc2 += (*pScr1++ * *pScr2);
00336         acc3 += (*pScr1++ * *pScr2++);
00337 
00338         pScr1 -= 3u;
00339 
00340         /* Decrement the loop counter */
00341         tapCnt--;
00342       }
00343 
00344       blkCnt--;
00345 
00346       /* Store the result in the accumulator in the destination buffer. */
00347       out0 = (q7_t) (__SSAT(acc0 >> 7u, 8));
00348       out1 = (q7_t) (__SSAT(acc1 >> 7u, 8));
00349       out2 = (q7_t) (__SSAT(acc2 >> 7u, 8));
00350       out3 = (q7_t) (__SSAT(acc3 >> 7u, 8));
00351 
00352       *__SIMD32(pOut)++ = __PACKq7(out0, out1, out2, out3);
00353 
00354       /* Initialization of inputB pointer */
00355       pScr2 = py;
00356 
00357       pScratch1 += 4u;
00358 
00359     }
00360 
00361     blkCnt = (numPoints) & 0x3;
00362 
00363     /* Calculate convolution for remaining samples of Bigger length sequence */
00364     while(blkCnt > 0)
00365     {
00366       /* Initialze temporary scratch pointer as scratch1 */
00367       pScr1 = pScratch1;
00368 
00369       /* Clear Accumlators */
00370       acc0 = 0;
00371 
00372       tapCnt = (srcBLen) >> 1u;
00373 
00374       while(tapCnt > 0u)
00375       {
00376 
00377         /* Read next two samples from scratch1 buffer */
00378         x1 = *__SIMD32(pScr1)++;
00379 
00380         /* Read two samples from smaller buffer */
00381         y1 = *__SIMD32(pScr2)++;
00382 
00383         acc0 = __SMLAD(x1, y1, acc0);
00384 
00385         /* Decrement the loop counter */
00386         tapCnt--;
00387       }
00388 
00389       tapCnt = (srcBLen) & 1u;
00390 
00391       /* apply same above for remaining samples of smaller length sequence */
00392       while(tapCnt > 0u)
00393       {
00394 
00395         /* accumlate the results */
00396         acc0 += (*pScr1++ * *pScr2++);
00397 
00398         /* Decrement the loop counter */
00399         tapCnt--;
00400       }
00401 
00402       blkCnt--;
00403 
00404       /* Store the result in the accumulator in the destination buffer. */
00405       *pOut++ = (q7_t) (__SSAT(acc0 >> 7u, 8));
00406 
00407       /* Initialization of inputB pointer */
00408       pScr2 = py;
00409 
00410       pScratch1 += 1u;
00411 
00412     }
00413 
00414     /* set status as ARM_MATH_SUCCESS */
00415     status = ARM_MATH_SUCCESS;
00416 
00417 
00418   }
00419 
00420   return (status);
00421 
00422 }
00423 
00424 #else
00425 
00426 arm_status arm_conv_partial_opt_q7(
00427   q7_t * pSrcA,
00428   uint32_t srcALen,
00429   q7_t * pSrcB,
00430   uint32_t srcBLen,
00431   q7_t * pDst,
00432   uint32_t firstIndex,
00433   uint32_t numPoints,
00434   q15_t * pScratch1,
00435   q15_t * pScratch2)
00436 {
00437 
00438   q15_t *pScr2, *pScr1;                          /* Intermediate pointers for scratch pointers */
00439   q15_t x4;                                      /* Temporary input variable */
00440   q7_t *pIn1, *pIn2;                             /* inputA and inputB pointer */
00441   uint32_t j, k, blkCnt, tapCnt;                 /* loop counter */
00442   q7_t *px;                                      /* Temporary input1 pointer */
00443   q15_t *py;                                     /* Temporary input2 pointer */
00444   q31_t acc0, acc1, acc2, acc3;                  /* Accumulator */
00445   arm_status status;
00446   q7_t *pOut = pDst;                             /* output pointer */
00447   q15_t x10, x11, x20, x21;                      /* Temporary input variables */
00448   q15_t y10, y11;                                /* Temporary input variables */
00449   q7_t out0, out1, out2, out3;                   /* temporary variables */
00450 
00451   /* Check for range of output samples to be calculated */
00452   if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
00453   {
00454     /* Set status as ARM_MATH_ARGUMENT_ERROR */
00455     status = ARM_MATH_ARGUMENT_ERROR;
00456   }
00457   else
00458   {
00459 
00460     /* The algorithm implementation is based on the lengths of the inputs. */
00461     /* srcB is always made to slide across srcA. */
00462     /* So srcBLen is always considered as shorter or equal to srcALen */
00463     if(srcALen >= srcBLen)
00464     {
00465       /* Initialization of inputA pointer */
00466       pIn1 = pSrcA;
00467 
00468       /* Initialization of inputB pointer */
00469       pIn2 = pSrcB;
00470     }
00471     else
00472     {
00473       /* Initialization of inputA pointer */
00474       pIn1 = pSrcB;
00475 
00476       /* Initialization of inputB pointer */
00477       pIn2 = pSrcA;
00478 
00479       /* srcBLen is always considered as shorter or equal to srcALen */
00480       j = srcBLen;
00481       srcBLen = srcALen;
00482       srcALen = j;
00483     }
00484 
00485     /* pointer to take end of scratch2 buffer */
00486     pScr2 = pScratch2;
00487 
00488     /* points to smaller length sequence */
00489     px = pIn2 + srcBLen - 1;
00490 
00491     /* Apply loop unrolling and do 4 Copies simultaneously. */
00492     k = srcBLen >> 2u;
00493 
00494     /* First part of the processing with loop unrolling copies 4 data points at a time.       
00495      ** a second loop below copies for the remaining 1 to 3 samples. */
00496     while(k > 0u)
00497     {
00498       /* copy second buffer in reversal manner */
00499       x4 = (q15_t) * px--;
00500       *pScr2++ = x4;
00501       x4 = (q15_t) * px--;
00502       *pScr2++ = x4;
00503       x4 = (q15_t) * px--;
00504       *pScr2++ = x4;
00505       x4 = (q15_t) * px--;
00506       *pScr2++ = x4;
00507 
00508       /* Decrement the loop counter */
00509       k--;
00510     }
00511 
00512     /* If the count is not a multiple of 4, copy remaining samples here.       
00513      ** No loop unrolling is used. */
00514     k = srcBLen % 0x4u;
00515 
00516     while(k > 0u)
00517     {
00518       /* copy second buffer in reversal manner for remaining samples */
00519       x4 = (q15_t) * px--;
00520       *pScr2++ = x4;
00521 
00522       /* Decrement the loop counter */
00523       k--;
00524     }
00525 
00526     /* Initialze temporary scratch pointer */
00527     pScr1 = pScratch1;
00528 
00529     /* Fill (srcBLen - 1u) zeros in scratch buffer */
00530     arm_fill_q15(0, pScr1, (srcBLen - 1u));
00531 
00532     /* Update temporary scratch pointer */
00533     pScr1 += (srcBLen - 1u);
00534 
00535     /* Copy (srcALen) samples in scratch buffer */
00536     /* Apply loop unrolling and do 4 Copies simultaneously. */
00537     k = srcALen >> 2u;
00538 
00539     /* First part of the processing with loop unrolling copies 4 data points at a time.       
00540      ** a second loop below copies for the remaining 1 to 3 samples. */
00541     while(k > 0u)
00542     {
00543       /* copy second buffer in reversal manner */
00544       x4 = (q15_t) * pIn1++;
00545       *pScr1++ = x4;
00546       x4 = (q15_t) * pIn1++;
00547       *pScr1++ = x4;
00548       x4 = (q15_t) * pIn1++;
00549       *pScr1++ = x4;
00550       x4 = (q15_t) * pIn1++;
00551       *pScr1++ = x4;
00552 
00553       /* Decrement the loop counter */
00554       k--;
00555     }
00556 
00557     /* If the count is not a multiple of 4, copy remaining samples here.       
00558      ** No loop unrolling is used. */
00559     k = srcALen % 0x4u;
00560 
00561     while(k > 0u)
00562     {
00563       /* copy second buffer in reversal manner for remaining samples */
00564       x4 = (q15_t) * pIn1++;
00565       *pScr1++ = x4;
00566 
00567       /* Decrement the loop counter */
00568       k--;
00569     }
00570 
00571     /* Apply loop unrolling and do 4 Copies simultaneously. */
00572     k = (srcBLen - 1u) >> 2u;
00573 
00574     /* First part of the processing with loop unrolling copies 4 data points at a time.       
00575      ** a second loop below copies for the remaining 1 to 3 samples. */
00576     while(k > 0u)
00577     {
00578       /* copy second buffer in reversal manner */
00579       *pScr1++ = 0;
00580       *pScr1++ = 0;
00581       *pScr1++ = 0;
00582       *pScr1++ = 0;
00583 
00584       /* Decrement the loop counter */
00585       k--;
00586     }
00587 
00588     /* If the count is not a multiple of 4, copy remaining samples here.       
00589      ** No loop unrolling is used. */
00590     k = (srcBLen - 1u) % 0x4u;
00591 
00592     while(k > 0u)
00593     {
00594       /* copy second buffer in reversal manner for remaining samples */
00595       *pScr1++ = 0;
00596 
00597       /* Decrement the loop counter */
00598       k--;
00599     }
00600 
00601 
00602     /* Temporary pointer for scratch2 */
00603     py = pScratch2;
00604 
00605     /* Initialization of pIn2 pointer */
00606     pIn2 = (q7_t *) py;
00607 
00608     pScr2 = py;
00609 
00610     pOut = pDst + firstIndex;
00611 
00612     pScratch1 += firstIndex;
00613 
00614     /* Actual convolution process starts here */
00615     blkCnt = (numPoints) >> 2;
00616 
00617 
00618     while(blkCnt > 0)
00619     {
00620       /* Initialze temporary scratch pointer as scratch1 */
00621       pScr1 = pScratch1;
00622 
00623       /* Clear Accumlators */
00624       acc0 = 0;
00625       acc1 = 0;
00626       acc2 = 0;
00627       acc3 = 0;
00628 
00629       /* Read two samples from scratch1 buffer */
00630       x10 = *pScr1++;
00631       x11 = *pScr1++;
00632 
00633       /* Read next two samples from scratch1 buffer */
00634       x20 = *pScr1++;
00635       x21 = *pScr1++;
00636 
00637       tapCnt = (srcBLen) >> 2u;
00638 
00639       while(tapCnt > 0u)
00640       {
00641 
00642         /* Read four samples from smaller buffer */
00643         y10 = *pScr2;
00644         y11 = *(pScr2 + 1u);
00645 
00646         /* multiply and accumlate */
00647         acc0 += (q31_t) x10 *y10;
00648         acc0 += (q31_t) x11 *y11;
00649         acc2 += (q31_t) x20 *y10;
00650         acc2 += (q31_t) x21 *y11;
00651 
00652 
00653         acc1 += (q31_t) x11 *y10;
00654         acc1 += (q31_t) x20 *y11;
00655 
00656         /* Read next two samples from scratch1 buffer */
00657         x10 = *pScr1;
00658         x11 = *(pScr1 + 1u);
00659 
00660         /* multiply and accumlate */
00661         acc3 += (q31_t) x21 *y10;
00662         acc3 += (q31_t) x10 *y11;
00663 
00664         /* Read next two samples from scratch2 buffer */
00665         y10 = *(pScr2 + 2u);
00666         y11 = *(pScr2 + 3u);
00667 
00668         /* multiply and accumlate */
00669         acc0 += (q31_t) x20 *y10;
00670         acc0 += (q31_t) x21 *y11;
00671         acc2 += (q31_t) x10 *y10;
00672         acc2 += (q31_t) x11 *y11;
00673         acc1 += (q31_t) x21 *y10;
00674         acc1 += (q31_t) x10 *y11;
00675 
00676         /* Read next two samples from scratch1 buffer */
00677         x20 = *(pScr1 + 2);
00678         x21 = *(pScr1 + 3);
00679 
00680         /* multiply and accumlate */
00681         acc3 += (q31_t) x11 *y10;
00682         acc3 += (q31_t) x20 *y11;
00683 
00684         /* update scratch pointers */
00685 
00686         pScr1 += 4u;
00687         pScr2 += 4u;
00688 
00689         /* Decrement the loop counter */
00690         tapCnt--;
00691       }
00692 
00693 
00694 
00695       /* Update scratch pointer for remaining samples of smaller length sequence */
00696       pScr1 -= 4u;
00697 
00698 
00699       /* apply same above for remaining samples of smaller length sequence */
00700       tapCnt = (srcBLen) & 3u;
00701 
00702       while(tapCnt > 0u)
00703       {
00704 
00705         /* accumlate the results */
00706         acc0 += (*pScr1++ * *pScr2);
00707         acc1 += (*pScr1++ * *pScr2);
00708         acc2 += (*pScr1++ * *pScr2);
00709         acc3 += (*pScr1++ * *pScr2++);
00710 
00711         pScr1 -= 3u;
00712 
00713         /* Decrement the loop counter */
00714         tapCnt--;
00715       }
00716 
00717       blkCnt--;
00718 
00719       /* Store the result in the accumulator in the destination buffer. */
00720       out0 = (q7_t) (__SSAT(acc0 >> 7u, 8));
00721       out1 = (q7_t) (__SSAT(acc1 >> 7u, 8));
00722       out2 = (q7_t) (__SSAT(acc2 >> 7u, 8));
00723       out3 = (q7_t) (__SSAT(acc3 >> 7u, 8));
00724 
00725 
00726       *__SIMD32(pOut)++ = __PACKq7(out0, out1, out2, out3);
00727 
00728       /* Initialization of inputB pointer */
00729       pScr2 = py;
00730 
00731       pScratch1 += 4u;
00732 
00733     }
00734 
00735     blkCnt = (numPoints) & 0x3;
00736 
00737     /* Calculate convolution for remaining samples of Bigger length sequence */
00738     while(blkCnt > 0)
00739     {
00740       /* Initialze temporary scratch pointer as scratch1 */
00741       pScr1 = pScratch1;
00742 
00743       /* Clear Accumlators */
00744       acc0 = 0;
00745 
00746       tapCnt = (srcBLen) >> 1u;
00747 
00748       while(tapCnt > 0u)
00749       {
00750 
00751         /* Read next two samples from scratch1 buffer */
00752         x10 = *pScr1++;
00753         x11 = *pScr1++;
00754 
00755         /* Read two samples from smaller buffer */
00756         y10 = *pScr2++;
00757         y11 = *pScr2++;
00758 
00759         /* multiply and accumlate */
00760         acc0 += (q31_t) x10 *y10;
00761         acc0 += (q31_t) x11 *y11;
00762 
00763         /* Decrement the loop counter */
00764         tapCnt--;
00765       }
00766 
00767       tapCnt = (srcBLen) & 1u;
00768 
00769       /* apply same above for remaining samples of smaller length sequence */
00770       while(tapCnt > 0u)
00771       {
00772 
00773         /* accumlate the results */
00774         acc0 += (*pScr1++ * *pScr2++);
00775 
00776         /* Decrement the loop counter */
00777         tapCnt--;
00778       }
00779 
00780       blkCnt--;
00781 
00782       /* Store the result in the accumulator in the destination buffer. */
00783       *pOut++ = (q7_t) (__SSAT(acc0 >> 7u, 8));
00784 
00785       /* Initialization of inputB pointer */
00786       pScr2 = py;
00787 
00788       pScratch1 += 1u;
00789 
00790     }
00791 
00792     /* set status as ARM_MATH_SUCCESS */
00793     status = ARM_MATH_SUCCESS;
00794 
00795   }
00796 
00797   return (status);
00798 
00799 }
00800 
00801 #endif  /*  #ifndef UNALIGNED_SUPPORT_DISABLE   */
00802 
00803 
00804 
00805 /**    
00806  * @} end of PartialConv group    
00807  */