STM32F769IDiscovery  1.00
uDANTE Audio Networking with STM32F7 DISCO board
arm_conv_fast_q15.c
Go to the documentation of this file.
1 /* ----------------------------------------------------------------------
2 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.
3 *
4 * $Date: 19. March 2015
5 * $Revision: V.1.4.5
6 *
7 * Project: CMSIS DSP Library
8 * Title: arm_conv_fast_q15.c
9 *
10 * Description: Fast Q15 Convolution.
11 *
12 * Target Processor: Cortex-M4/Cortex-M3
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 * - Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * - Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in
21 * the documentation and/or other materials provided with the
22 * distribution.
23 * - Neither the name of ARM LIMITED nor the names of its contributors
24 * may be used to endorse or promote products derived from this
25 * software without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 * POSSIBILITY OF SUCH DAMAGE.
39 * -------------------------------------------------------------------- */
40 
41 #include "arm_math.h"
42 
78  q15_t * pSrcA,
79  uint32_t srcALen,
80  q15_t * pSrcB,
81  uint32_t srcBLen,
82  q15_t * pDst)
83 {
84 #ifndef UNALIGNED_SUPPORT_DISABLE
85  q15_t *pIn1; /* inputA pointer */
86  q15_t *pIn2; /* inputB pointer */
87  q15_t *pOut = pDst; /* output pointer */
88  q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
89  q15_t *px; /* Intermediate inputA pointer */
90  q15_t *py; /* Intermediate inputB pointer */
91  q15_t *pSrc1, *pSrc2; /* Intermediate pointers */
92  q31_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */
93  uint32_t blockSize1, blockSize2, blockSize3, j, k, count, blkCnt; /* loop counter */
94 
95  /* The algorithm implementation is based on the lengths of the inputs. */
96  /* srcB is always made to slide across srcA. */
97  /* So srcBLen is always considered as shorter or equal to srcALen */
98  if(srcALen >= srcBLen)
99  {
100  /* Initialization of inputA pointer */
101  pIn1 = pSrcA;
102 
103  /* Initialization of inputB pointer */
104  pIn2 = pSrcB;
105  }
106  else
107  {
108  /* Initialization of inputA pointer */
109  pIn1 = pSrcB;
110 
111  /* Initialization of inputB pointer */
112  pIn2 = pSrcA;
113 
114  /* srcBLen is always considered as shorter or equal to srcALen */
115  j = srcBLen;
116  srcBLen = srcALen;
117  srcALen = j;
118  }
119 
120  /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
121  /* The function is internally
122  * divided into three stages according to the number of multiplications that has to be
123  * taken place between inputA samples and inputB samples. In the first stage of the
124  * algorithm, the multiplications increase by one for every iteration.
125  * In the second stage of the algorithm, srcBLen number of multiplications are done.
126  * In the third stage of the algorithm, the multiplications decrease by one
127  * for every iteration. */
128 
129  /* The algorithm is implemented in three stages.
130  The loop counters of each stage are initialized here. */
131  blockSize1 = srcBLen - 1u;
132  blockSize2 = srcALen - (srcBLen - 1u);
133  blockSize3 = blockSize1;
134 
135  /* --------------------------
136  * Initializations of stage1
137  * -------------------------*/
138 
139  /* sum = x[0] * y[0]
140  * sum = x[0] * y[1] + x[1] * y[0]
141  * ....
142  * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
143  */
144 
145  /* In this stage the MAC operations are increased by 1 for every iteration.
146  The count variable holds the number of MAC operations performed */
147  count = 1u;
148 
149  /* Working pointer of inputA */
150  px = pIn1;
151 
152  /* Working pointer of inputB */
153  py = pIn2;
154 
155 
156  /* ------------------------
157  * Stage1 process
158  * ----------------------*/
159 
160  /* For loop unrolling by 4, this stage is divided into two. */
161  /* First part of this stage computes the MAC operations less than 4 */
162  /* Second part of this stage computes the MAC operations greater than or equal to 4 */
163 
164  /* The first part of the stage starts here */
165  while((count < 4u) && (blockSize1 > 0u))
166  {
167  /* Accumulator is made zero for every iteration */
168  sum = 0;
169 
170  /* Loop over number of MAC operations between
171  * inputA samples and inputB samples */
172  k = count;
173 
174  while(k > 0u)
175  {
176  /* Perform the multiply-accumulates */
177  sum = __SMLAD(*px++, *py--, sum);
178 
179  /* Decrement the loop counter */
180  k--;
181  }
182 
183  /* Store the result in the accumulator in the destination buffer. */
184  *pOut++ = (q15_t) (sum >> 15);
185 
186  /* Update the inputA and inputB pointers for next MAC calculation */
187  py = pIn2 + count;
188  px = pIn1;
189 
190  /* Increment the MAC count */
191  count++;
192 
193  /* Decrement the loop counter */
194  blockSize1--;
195  }
196 
197  /* The second part of the stage starts here */
198  /* The internal loop, over count, is unrolled by 4 */
199  /* To read two adjacent inputB samples with a single SIMD access,
200  * y[count - 2] and y[count - 1], py is decremented by 1 */
201  py = py - 1;
202 
203  while(blockSize1 > 0u)
204  {
205  /* Accumulator is made zero for every iteration */
206  sum = 0;
207 
208  /* Apply loop unrolling and compute 4 MACs simultaneously. */
209  k = count >> 2u;
210 
211  /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
212  ** a second loop below computes MACs for the remaining 1 to 3 samples. */
213  while(k > 0u)
214  {
215  /* Perform the multiply-accumulates */
216  /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */
217  sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
218  /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */
219  sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
220 
221  /* Decrement the loop counter */
222  k--;
223  }
224 
225  /* For the next MAC operations, the pointer py is used without SIMD
226  * So, py is incremented by 1 */
227  py = py + 1u;
228 
229  /* If the count is not a multiple of 4, compute any remaining MACs here.
230  ** No loop unrolling is used. */
231  k = count % 0x4u;
232 
233  while(k > 0u)
234  {
235  /* Perform the multiply-accumulates */
236  sum = __SMLAD(*px++, *py--, sum);
237 
238  /* Decrement the loop counter */
239  k--;
240  }
241 
242  /* Store the result in the accumulator in the destination buffer. */
243  *pOut++ = (q15_t) (sum >> 15);
244 
245  /* Update the inputA and inputB pointers for next MAC calculation */
246  py = pIn2 + (count - 1u);
247  px = pIn1;
248 
249  /* Increment the MAC count */
250  count++;
251 
252  /* Decrement the loop counter */
253  blockSize1--;
254  }
255 
256  /* --------------------------
257  * Initializations of stage2
258  * ------------------------*/
259 
260  /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
261  * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
262  * ....
263  * sum = x[srcALen-srcBLen] * y[srcBLen-1] + x[srcALen-srcBLen+1] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
264  */
265 
266  /* Working pointer of inputA */
267  px = pIn1;
268 
269  /* Working pointer of inputB */
270  pSrc2 = pIn2 + (srcBLen - 1u);
271  py = pSrc2;
272 
273  /* count is the index by which the pointer pIn1 to be incremented */
274  count = 0u;
275 
276 
277  /* --------------------
278  * Stage2 process
279  * -------------------*/
280 
281  /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
282  * So, to loop unroll over blockSize2,
283  * srcBLen should be greater than or equal to 4 */
284  if(srcBLen >= 4u)
285  {
286  /* Loop unroll over blockSize2, by 4 */
287  blkCnt = blockSize2 >> 2u;
288 
289  while(blkCnt > 0u)
290  {
291  py = py - 1u;
292 
293  /* Set all accumulators to zero */
294  acc0 = 0;
295  acc1 = 0;
296  acc2 = 0;
297  acc3 = 0;
298 
299 
300  /* read x[0], x[1] samples */
301  x0 = *__SIMD32(px);
302  /* read x[1], x[2] samples */
303  x1 = _SIMD32_OFFSET(px+1);
304  px+= 2u;
305 
306 
307  /* Apply loop unrolling and compute 4 MACs simultaneously. */
308  k = srcBLen >> 2u;
309 
310  /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
311  ** a second loop below computes MACs for the remaining 1 to 3 samples. */
312  do
313  {
314  /* Read the last two inputB samples using SIMD:
315  * y[srcBLen - 1] and y[srcBLen - 2] */
316  c0 = *__SIMD32(py)--;
317 
318  /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
319  acc0 = __SMLADX(x0, c0, acc0);
320 
321  /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
322  acc1 = __SMLADX(x1, c0, acc1);
323 
324  /* Read x[2], x[3] */
325  x2 = *__SIMD32(px);
326 
327  /* Read x[3], x[4] */
328  x3 = _SIMD32_OFFSET(px+1);
329 
330  /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
331  acc2 = __SMLADX(x2, c0, acc2);
332 
333  /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
334  acc3 = __SMLADX(x3, c0, acc3);
335 
336  /* Read y[srcBLen - 3] and y[srcBLen - 4] */
337  c0 = *__SIMD32(py)--;
338 
339  /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
340  acc0 = __SMLADX(x2, c0, acc0);
341 
342  /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
343  acc1 = __SMLADX(x3, c0, acc1);
344 
345  /* Read x[4], x[5] */
346  x0 = _SIMD32_OFFSET(px+2);
347 
348  /* Read x[5], x[6] */
349  x1 = _SIMD32_OFFSET(px+3);
350  px += 4u;
351 
352  /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
353  acc2 = __SMLADX(x0, c0, acc2);
354 
355  /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
356  acc3 = __SMLADX(x1, c0, acc3);
357 
358  } while(--k);
359 
360  /* For the next MAC operations, SIMD is not used
361  * So, the 16 bit pointer of inputB, py is updated */
362 
363  /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
364  ** No loop unrolling is used. */
365  k = srcBLen % 0x4u;
366 
367  if(k == 1u)
368  {
369  /* Read y[srcBLen - 5] */
370  c0 = *(py+1);
371 
372 #ifdef ARM_MATH_BIG_ENDIAN
373 
374  c0 = c0 << 16u;
375 
376 #else
377 
378  c0 = c0 & 0x0000FFFF;
379 
380 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */
381 
382  /* Read x[7] */
383  x3 = *__SIMD32(px);
384  px++;
385 
386  /* Perform the multiply-accumulates */
387  acc0 = __SMLAD(x0, c0, acc0);
388  acc1 = __SMLAD(x1, c0, acc1);
389  acc2 = __SMLADX(x1, c0, acc2);
390  acc3 = __SMLADX(x3, c0, acc3);
391  }
392 
393  if(k == 2u)
394  {
395  /* Read y[srcBLen - 5], y[srcBLen - 6] */
396  c0 = _SIMD32_OFFSET(py);
397 
398  /* Read x[7], x[8] */
399  x3 = *__SIMD32(px);
400 
401  /* Read x[9] */
402  x2 = _SIMD32_OFFSET(px+1);
403  px += 2u;
404 
405  /* Perform the multiply-accumulates */
406  acc0 = __SMLADX(x0, c0, acc0);
407  acc1 = __SMLADX(x1, c0, acc1);
408  acc2 = __SMLADX(x3, c0, acc2);
409  acc3 = __SMLADX(x2, c0, acc3);
410  }
411 
412  if(k == 3u)
413  {
414  /* Read y[srcBLen - 5], y[srcBLen - 6] */
415  c0 = _SIMD32_OFFSET(py);
416 
417  /* Read x[7], x[8] */
418  x3 = *__SIMD32(px);
419 
420  /* Read x[9] */
421  x2 = _SIMD32_OFFSET(px+1);
422 
423  /* Perform the multiply-accumulates */
424  acc0 = __SMLADX(x0, c0, acc0);
425  acc1 = __SMLADX(x1, c0, acc1);
426  acc2 = __SMLADX(x3, c0, acc2);
427  acc3 = __SMLADX(x2, c0, acc3);
428 
429  /* Read y[srcBLen - 7] */
430  c0 = *(py-1);
431 #ifdef ARM_MATH_BIG_ENDIAN
432 
433  c0 = c0 << 16u;
434 #else
435 
436  c0 = c0 & 0x0000FFFF;
437 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */
438 
439  /* Read x[10] */
440  x3 = _SIMD32_OFFSET(px+2);
441  px += 3u;
442 
443  /* Perform the multiply-accumulates */
444  acc0 = __SMLADX(x1, c0, acc0);
445  acc1 = __SMLAD(x2, c0, acc1);
446  acc2 = __SMLADX(x2, c0, acc2);
447  acc3 = __SMLADX(x3, c0, acc3);
448  }
449 
450  /* Store the results in the accumulators in the destination buffer. */
451 #ifndef ARM_MATH_BIG_ENDIAN
452 
453  *__SIMD32(pOut)++ = __PKHBT((acc0 >> 15), (acc1 >> 15), 16);
454  *__SIMD32(pOut)++ = __PKHBT((acc2 >> 15), (acc3 >> 15), 16);
455 
456 #else
457 
458  *__SIMD32(pOut)++ = __PKHBT((acc1 >> 15), (acc0 >> 15), 16);
459  *__SIMD32(pOut)++ = __PKHBT((acc3 >> 15), (acc2 >> 15), 16);
460 
461 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
462 
463  /* Increment the pointer pIn1 index, count by 4 */
464  count += 4u;
465 
466  /* Update the inputA and inputB pointers for next MAC calculation */
467  px = pIn1 + count;
468  py = pSrc2;
469 
470  /* Decrement the loop counter */
471  blkCnt--;
472  }
473 
474  /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
475  ** No loop unrolling is used. */
476  blkCnt = blockSize2 % 0x4u;
477 
478  while(blkCnt > 0u)
479  {
480  /* Accumulator is made zero for every iteration */
481  sum = 0;
482 
483  /* Apply loop unrolling and compute 4 MACs simultaneously. */
484  k = srcBLen >> 2u;
485 
486  /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
487  ** a second loop below computes MACs for the remaining 1 to 3 samples. */
488  while(k > 0u)
489  {
490  /* Perform the multiply-accumulates */
491  sum += ((q31_t) * px++ * *py--);
492  sum += ((q31_t) * px++ * *py--);
493  sum += ((q31_t) * px++ * *py--);
494  sum += ((q31_t) * px++ * *py--);
495 
496  /* Decrement the loop counter */
497  k--;
498  }
499 
500  /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
501  ** No loop unrolling is used. */
502  k = srcBLen % 0x4u;
503 
504  while(k > 0u)
505  {
506  /* Perform the multiply-accumulates */
507  sum += ((q31_t) * px++ * *py--);
508 
509  /* Decrement the loop counter */
510  k--;
511  }
512 
513  /* Store the result in the accumulator in the destination buffer. */
514  *pOut++ = (q15_t) (sum >> 15);
515 
516  /* Increment the pointer pIn1 index, count by 1 */
517  count++;
518 
519  /* Update the inputA and inputB pointers for next MAC calculation */
520  px = pIn1 + count;
521  py = pSrc2;
522 
523  /* Decrement the loop counter */
524  blkCnt--;
525  }
526  }
527  else
528  {
529  /* If srcBLen is less than 4,
530  * the blockSize2 loop is not unrolled by 4 */
531  blkCnt = blockSize2;
532 
533  while(blkCnt > 0u)
534  {
535  /* Accumulator is made zero for every iteration */
536  sum = 0;
537 
538  /* srcBLen number of MACS should be performed */
539  k = srcBLen;
540 
541  while(k > 0u)
542  {
543  /* Perform the multiply-accumulate */
544  sum += ((q31_t) * px++ * *py--);
545 
546  /* Decrement the loop counter */
547  k--;
548  }
549 
550  /* Store the result in the accumulator in the destination buffer. */
551  *pOut++ = (q15_t) (sum >> 15);
552 
553  /* Increment the MAC count */
554  count++;
555 
556  /* Update the inputA and inputB pointers for next MAC calculation */
557  px = pIn1 + count;
558  py = pSrc2;
559 
560  /* Decrement the loop counter */
561  blkCnt--;
562  }
563  }
564 
565 
566  /* --------------------------
567  * Initializations of stage3
568  * -------------------------*/
569 
570  /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
571  * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
572  * ....
573  * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
574  * sum += x[srcALen-1] * y[srcBLen-1]
575  */
576 
577  /* In this stage the MAC operations are decreased by 1 for every iteration.
578  The blockSize3 variable holds the number of MAC operations performed */
579 
580  /* Working pointer of inputA */
581  pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
582  px = pSrc1;
583 
584  /* Working pointer of inputB */
585  pSrc2 = pIn2 + (srcBLen - 1u);
586  pIn2 = pSrc2 - 1u;
587  py = pIn2;
588 
589  /* -------------------
590  * Stage3 process
591  * ------------------*/
592 
593  /* For loop unrolling by 4, this stage is divided into two. */
594  /* First part of this stage computes the MAC operations greater than 4 */
595  /* Second part of this stage computes the MAC operations less than or equal to 4 */
596 
597  /* The first part of the stage starts here */
598  j = blockSize3 >> 2u;
599 
600  while((j > 0u) && (blockSize3 > 0u))
601  {
602  /* Accumulator is made zero for every iteration */
603  sum = 0;
604 
605  /* Apply loop unrolling and compute 4 MACs simultaneously. */
606  k = blockSize3 >> 2u;
607 
608  /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
609  ** a second loop below computes MACs for the remaining 1 to 3 samples. */
610  while(k > 0u)
611  {
612  /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied
613  * with y[srcBLen - 1], y[srcBLen - 2] respectively */
614  sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
615  /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied
616  * with y[srcBLen - 3], y[srcBLen - 4] respectively */
617  sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
618 
619  /* Decrement the loop counter */
620  k--;
621  }
622 
623  /* For the next MAC operations, the pointer py is used without SIMD
624  * So, py is incremented by 1 */
625  py = py + 1u;
626 
627  /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
628  ** No loop unrolling is used. */
629  k = blockSize3 % 0x4u;
630 
631  while(k > 0u)
632  {
633  /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */
634  sum = __SMLAD(*px++, *py--, sum);
635 
636  /* Decrement the loop counter */
637  k--;
638  }
639 
640  /* Store the result in the accumulator in the destination buffer. */
641  *pOut++ = (q15_t) (sum >> 15);
642 
643  /* Update the inputA and inputB pointers for next MAC calculation */
644  px = ++pSrc1;
645  py = pIn2;
646 
647  /* Decrement the loop counter */
648  blockSize3--;
649 
650  j--;
651  }
652 
653  /* The second part of the stage starts here */
654  /* SIMD is not used for the next MAC operations,
655  * so pointer py is updated to read only one sample at a time */
656  py = py + 1u;
657 
658  while(blockSize3 > 0u)
659  {
660  /* Accumulator is made zero for every iteration */
661  sum = 0;
662 
663  /* blockSize3 MACs remain for this output; no loop unrolling is used. */
664  k = blockSize3;
665 
666  while(k > 0u)
667  {
668  /* Perform the multiply-accumulates */
669  /* sum += x[srcALen-1] * y[srcBLen-1] */
670  sum = __SMLAD(*px++, *py--, sum);
671 
672  /* Decrement the loop counter */
673  k--;
674  }
675 
676  /* Store the result in the accumulator in the destination buffer. */
677  *pOut++ = (q15_t) (sum >> 15);
678 
679  /* Update the inputA and inputB pointers for next MAC calculation */
680  px = ++pSrc1;
681  py = pSrc2;
682 
683  /* Decrement the loop counter */
684  blockSize3--;
685  }
686 
687 #else
688  q15_t *pIn1; /* inputA pointer */
689  q15_t *pIn2; /* inputB pointer */
690  q15_t *pOut = pDst; /* output pointer */
691  q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
692  q15_t *px; /* Intermediate inputA pointer */
693  q15_t *py; /* Intermediate inputB pointer */
694  q15_t *pSrc1, *pSrc2; /* Intermediate pointers */
695  q31_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */
696  uint32_t blockSize1, blockSize2, blockSize3, j, k, count, blkCnt; /* loop counter */
697  q15_t a, b;
698 
699  /* The algorithm implementation is based on the lengths of the inputs. */
700  /* srcB is always made to slide across srcA. */
701  /* So srcBLen is always considered as shorter or equal to srcALen */
702  if(srcALen >= srcBLen)
703  {
704  /* Initialization of inputA pointer */
705  pIn1 = pSrcA;
706 
707  /* Initialization of inputB pointer */
708  pIn2 = pSrcB;
709  }
710  else
711  {
712  /* Initialization of inputA pointer */
713  pIn1 = pSrcB;
714 
715  /* Initialization of inputB pointer */
716  pIn2 = pSrcA;
717 
718  /* srcBLen is always considered as shorter or equal to srcALen */
719  j = srcBLen;
720  srcBLen = srcALen;
721  srcALen = j;
722  }
723 
724  /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
725  /* The function is internally
726  * divided into three stages according to the number of multiplications that has to be
727  * taken place between inputA samples and inputB samples. In the first stage of the
728  * algorithm, the multiplications increase by one for every iteration.
729  * In the second stage of the algorithm, srcBLen number of multiplications are done.
730  * In the third stage of the algorithm, the multiplications decrease by one
731  * for every iteration. */
732 
733  /* The algorithm is implemented in three stages.
734  The loop counters of each stage are initialized here. */
735  blockSize1 = srcBLen - 1u;
736  blockSize2 = srcALen - (srcBLen - 1u);
737  blockSize3 = blockSize1;
738 
739  /* --------------------------
740  * Initializations of stage1
741  * -------------------------*/
742 
743  /* sum = x[0] * y[0]
744  * sum = x[0] * y[1] + x[1] * y[0]
745  * ....
746  * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
747  */
748 
749  /* In this stage the MAC operations are increased by 1 for every iteration.
750  The count variable holds the number of MAC operations performed */
751  count = 1u;
752 
753  /* Working pointer of inputA */
754  px = pIn1;
755 
756  /* Working pointer of inputB */
757  py = pIn2;
758 
759 
760  /* ------------------------
761  * Stage1 process
762  * ----------------------*/
763 
764  /* For loop unrolling by 4, this stage is divided into two. */
765  /* First part of this stage computes the MAC operations less than 4 */
766  /* Second part of this stage computes the MAC operations greater than or equal to 4 */
767 
768  /* The first part of the stage starts here */
769  while((count < 4u) && (blockSize1 > 0u))
770  {
771  /* Accumulator is made zero for every iteration */
772  sum = 0;
773 
774  /* Loop over number of MAC operations between
775  * inputA samples and inputB samples */
776  k = count;
777 
778  while(k > 0u)
779  {
780  /* Perform the multiply-accumulates */
781  sum += ((q31_t) * px++ * *py--);
782 
783  /* Decrement the loop counter */
784  k--;
785  }
786 
787  /* Store the result in the accumulator in the destination buffer. */
788  *pOut++ = (q15_t) (sum >> 15);
789 
790  /* Update the inputA and inputB pointers for next MAC calculation */
791  py = pIn2 + count;
792  px = pIn1;
793 
794  /* Increment the MAC count */
795  count++;
796 
797  /* Decrement the loop counter */
798  blockSize1--;
799  }
800 
801  /* The second part of the stage starts here */
802  /* The internal loop, over count, is unrolled by 4 */
803  /* To, read the last two inputB samples using SIMD:
804  * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */
805  py = py - 1;
806 
807  while(blockSize1 > 0u)
808  {
809  /* Accumulator is made zero for every iteration */
810  sum = 0;
811 
812  /* Apply loop unrolling and compute 4 MACs simultaneously. */
813  k = count >> 2u;
814 
815  /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
816  ** a second loop below computes MACs for the remaining 1 to 3 samples. */
817  py++;
818 
819  while(k > 0u)
820  {
821  /* Perform the multiply-accumulates */
822  sum += ((q31_t) * px++ * *py--);
823  sum += ((q31_t) * px++ * *py--);
824  sum += ((q31_t) * px++ * *py--);
825  sum += ((q31_t) * px++ * *py--);
826 
827  /* Decrement the loop counter */
828  k--;
829  }
830 
831  /* If the count is not a multiple of 4, compute any remaining MACs here.
832  ** No loop unrolling is used. */
833  k = count % 0x4u;
834 
835  while(k > 0u)
836  {
837  /* Perform the multiply-accumulates */
838  sum += ((q31_t) * px++ * *py--);
839 
840  /* Decrement the loop counter */
841  k--;
842  }
843 
844  /* Store the result in the accumulator in the destination buffer. */
845  *pOut++ = (q15_t) (sum >> 15);
846 
847  /* Update the inputA and inputB pointers for next MAC calculation */
848  py = pIn2 + (count - 1u);
849  px = pIn1;
850 
851  /* Increment the MAC count */
852  count++;
853 
854  /* Decrement the loop counter */
855  blockSize1--;
856  }
857 
858  /* --------------------------
859  * Initializations of stage2
860  * ------------------------*/
861 
862  /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
863  * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
864  * ....
865  * sum = x[srcALen-srcBLen] * y[srcBLen-1] + x[srcALen-srcBLen+1] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
866  */
867 
868  /* Working pointer of inputA */
869  px = pIn1;
870 
871  /* Working pointer of inputB */
872  pSrc2 = pIn2 + (srcBLen - 1u);
873  py = pSrc2;
874 
875  /* count is the index by which the pointer pIn1 to be incremented */
876  count = 0u;
877 
878 
879  /* --------------------
880  * Stage2 process
881  * -------------------*/
882 
883  /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
884  * So, to loop unroll over blockSize2,
885  * srcBLen should be greater than or equal to 4 */
886  if(srcBLen >= 4u)
887  {
888  /* Loop unroll over blockSize2, by 4 */
889  blkCnt = blockSize2 >> 2u;
890 
891  while(blkCnt > 0u)
892  {
893  py = py - 1u;
894 
895  /* Set all accumulators to zero */
896  acc0 = 0;
897  acc1 = 0;
898  acc2 = 0;
899  acc3 = 0;
900 
901  /* read x[0], x[1] samples */
902  a = *px++;
903  b = *px++;
904 
905 #ifndef ARM_MATH_BIG_ENDIAN
906 
907  x0 = __PKHBT(a, b, 16);
908  a = *px;
909  x1 = __PKHBT(b, a, 16);
910 
911 #else
912 
913  x0 = __PKHBT(b, a, 16);
914  a = *px;
915  x1 = __PKHBT(a, b, 16);
916 
917 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
918 
919  /* Apply loop unrolling and compute 4 MACs simultaneously. */
920  k = srcBLen >> 2u;
921 
922  /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
923  ** a second loop below computes MACs for the remaining 1 to 3 samples. */
924  do
925  {
926  /* Read the last two inputB samples using SIMD:
927  * y[srcBLen - 1] and y[srcBLen - 2] */
928  a = *py;
929  b = *(py+1);
930  py -= 2;
931 
932 #ifndef ARM_MATH_BIG_ENDIAN
933 
934  c0 = __PKHBT(a, b, 16);
935 
936 #else
937 
938  c0 = __PKHBT(b, a, 16);;
939 
940 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
941 
942  /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
943  acc0 = __SMLADX(x0, c0, acc0);
944 
945  /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
946  acc1 = __SMLADX(x1, c0, acc1);
947 
948  a = *px;
949  b = *(px + 1);
950 
951 #ifndef ARM_MATH_BIG_ENDIAN
952 
953  x2 = __PKHBT(a, b, 16);
954  a = *(px + 2);
955  x3 = __PKHBT(b, a, 16);
956 
957 #else
958 
959  x2 = __PKHBT(b, a, 16);
960  a = *(px + 2);
961  x3 = __PKHBT(a, b, 16);
962 
963 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
964 
965  /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
966  acc2 = __SMLADX(x2, c0, acc2);
967 
968  /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
969  acc3 = __SMLADX(x3, c0, acc3);
970 
971  /* Read y[srcBLen - 3] and y[srcBLen - 4] */
972  a = *py;
973  b = *(py+1);
974  py -= 2;
975 
976 #ifndef ARM_MATH_BIG_ENDIAN
977 
978  c0 = __PKHBT(a, b, 16);
979 
980 #else
981 
982  c0 = __PKHBT(b, a, 16);;
983 
984 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
985 
986  /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
987  acc0 = __SMLADX(x2, c0, acc0);
988 
989  /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
990  acc1 = __SMLADX(x3, c0, acc1);
991 
992  /* Read x[4], x[5], x[6] */
993  a = *(px + 2);
994  b = *(px + 3);
995 
996 #ifndef ARM_MATH_BIG_ENDIAN
997 
998  x0 = __PKHBT(a, b, 16);
999  a = *(px + 4);
1000  x1 = __PKHBT(b, a, 16);
1001 
1002 #else
1003 
1004  x0 = __PKHBT(b, a, 16);
1005  a = *(px + 4);
1006  x1 = __PKHBT(a, b, 16);
1007 
1008 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1009 
1010  px += 4u;
1011 
1012  /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
1013  acc2 = __SMLADX(x0, c0, acc2);
1014 
1015  /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
1016  acc3 = __SMLADX(x1, c0, acc3);
1017 
1018  } while(--k);
1019 
1020  /* For the next MAC operations, SIMD is not used
1021  * So, the 16 bit pointer of inputB, py is updated */
1022 
1023  /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
1024  ** No loop unrolling is used. */
1025  k = srcBLen % 0x4u;
1026 
1027  if(k == 1u)
1028  {
1029  /* Read y[srcBLen - 5] */
1030  c0 = *(py+1);
1031 
1032 #ifdef ARM_MATH_BIG_ENDIAN
1033 
1034  c0 = c0 << 16u;
1035 
1036 #else
1037 
1038  c0 = c0 & 0x0000FFFF;
1039 
1040 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */
1041 
1042  /* Read x[7] */
1043  a = *px;
1044  b = *(px+1);
1045  px++;
1046 
1047 #ifndef ARM_MATH_BIG_ENDIAN
1048 
1049  x3 = __PKHBT(a, b, 16);
1050 
1051 #else
1052 
1053  x3 = __PKHBT(b, a, 16);;
1054 
1055 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1056 
1057 
1058  /* Perform the multiply-accumulates */
1059  acc0 = __SMLAD(x0, c0, acc0);
1060  acc1 = __SMLAD(x1, c0, acc1);
1061  acc2 = __SMLADX(x1, c0, acc2);
1062  acc3 = __SMLADX(x3, c0, acc3);
1063  }
1064 
1065  if(k == 2u)
1066  {
1067  /* Read y[srcBLen - 5], y[srcBLen - 6] */
1068  a = *py;
1069  b = *(py+1);
1070 
1071 #ifndef ARM_MATH_BIG_ENDIAN
1072 
1073  c0 = __PKHBT(a, b, 16);
1074 
1075 #else
1076 
1077  c0 = __PKHBT(b, a, 16);;
1078 
1079 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1080 
1081  /* Read x[7], x[8], x[9] */
1082  a = *px;
1083  b = *(px + 1);
1084 
1085 #ifndef ARM_MATH_BIG_ENDIAN
1086 
1087  x3 = __PKHBT(a, b, 16);
1088  a = *(px + 2);
1089  x2 = __PKHBT(b, a, 16);
1090 
1091 #else
1092 
1093  x3 = __PKHBT(b, a, 16);
1094  a = *(px + 2);
1095  x2 = __PKHBT(a, b, 16);
1096 
1097 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1098  px += 2u;
1099 
1100  /* Perform the multiply-accumulates */
1101  acc0 = __SMLADX(x0, c0, acc0);
1102  acc1 = __SMLADX(x1, c0, acc1);
1103  acc2 = __SMLADX(x3, c0, acc2);
1104  acc3 = __SMLADX(x2, c0, acc3);
1105  }
1106 
1107  if(k == 3u)
1108  {
1109  /* Read y[srcBLen - 5], y[srcBLen - 6] */
1110  a = *py;
1111  b = *(py+1);
1112 
1113 #ifndef ARM_MATH_BIG_ENDIAN
1114 
1115  c0 = __PKHBT(a, b, 16);
1116 
1117 #else
1118 
1119  c0 = __PKHBT(b, a, 16);;
1120 
1121 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1122 
1123  /* Read x[7], x[8], x[9] */
1124  a = *px;
1125  b = *(px + 1);
1126 
1127 #ifndef ARM_MATH_BIG_ENDIAN
1128 
1129  x3 = __PKHBT(a, b, 16);
1130  a = *(px + 2);
1131  x2 = __PKHBT(b, a, 16);
1132 
1133 #else
1134 
1135  x3 = __PKHBT(b, a, 16);
1136  a = *(px + 2);
1137  x2 = __PKHBT(a, b, 16);
1138 
1139 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1140 
1141  /* Perform the multiply-accumulates */
1142  acc0 = __SMLADX(x0, c0, acc0);
1143  acc1 = __SMLADX(x1, c0, acc1);
1144  acc2 = __SMLADX(x3, c0, acc2);
1145  acc3 = __SMLADX(x2, c0, acc3);
1146 
1147  /* Read y[srcBLen - 7] */
1148  c0 = *(py-1);
1149 #ifdef ARM_MATH_BIG_ENDIAN
1150 
1151  c0 = c0 << 16u;
1152 #else
1153 
1154  c0 = c0 & 0x0000FFFF;
1155 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */
1156 
1157  /* Read x[10] */
1158  a = *(px+2);
1159  b = *(px+3);
1160 
1161 #ifndef ARM_MATH_BIG_ENDIAN
1162 
1163  x3 = __PKHBT(a, b, 16);
1164 
1165 #else
1166 
1167  x3 = __PKHBT(b, a, 16);;
1168 
1169 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1170 
1171  px += 3u;
1172 
1173  /* Perform the multiply-accumulates */
1174  acc0 = __SMLADX(x1, c0, acc0);
1175  acc1 = __SMLAD(x2, c0, acc1);
1176  acc2 = __SMLADX(x2, c0, acc2);
1177  acc3 = __SMLADX(x3, c0, acc3);
1178  }
1179 
1180  /* Store the results in the accumulators in the destination buffer. */
1181  *pOut++ = (q15_t)(acc0 >> 15);
1182  *pOut++ = (q15_t)(acc1 >> 15);
1183  *pOut++ = (q15_t)(acc2 >> 15);
1184  *pOut++ = (q15_t)(acc3 >> 15);
1185 
1186  /* Increment the pointer pIn1 index, count by 4 */
1187  count += 4u;
1188 
1189  /* Update the inputA and inputB pointers for next MAC calculation */
1190  px = pIn1 + count;
1191  py = pSrc2;
1192 
1193  /* Decrement the loop counter */
1194  blkCnt--;
1195  }
1196 
1197  /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
1198  ** No loop unrolling is used. */
1199  blkCnt = blockSize2 % 0x4u;
1200 
1201  while(blkCnt > 0u)
1202  {
1203  /* Accumulator is made zero for every iteration */
1204  sum = 0;
1205 
1206  /* Apply loop unrolling and compute 4 MACs simultaneously. */
1207  k = srcBLen >> 2u;
1208 
1209  /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
1210  ** a second loop below computes MACs for the remaining 1 to 3 samples. */
1211  while(k > 0u)
1212  {
1213  /* Perform the multiply-accumulates */
1214  sum += ((q31_t) * px++ * *py--);
1215  sum += ((q31_t) * px++ * *py--);
1216  sum += ((q31_t) * px++ * *py--);
1217  sum += ((q31_t) * px++ * *py--);
1218 
1219  /* Decrement the loop counter */
1220  k--;
1221  }
1222 
1223  /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
1224  ** No loop unrolling is used. */
1225  k = srcBLen % 0x4u;
1226 
1227  while(k > 0u)
1228  {
1229  /* Perform the multiply-accumulates */
1230  sum += ((q31_t) * px++ * *py--);
1231 
1232  /* Decrement the loop counter */
1233  k--;
1234  }
1235 
1236  /* Store the result in the accumulator in the destination buffer. */
1237  *pOut++ = (q15_t) (sum >> 15);
1238 
1239  /* Increment the pointer pIn1 index, count by 1 */
1240  count++;
1241 
1242  /* Update the inputA and inputB pointers for next MAC calculation */
1243  px = pIn1 + count;
1244  py = pSrc2;
1245 
1246  /* Decrement the loop counter */
1247  blkCnt--;
1248  }
1249  }
1250  else
1251  {
1252  /* If the srcBLen is not a multiple of 4,
1253  * the blockSize2 loop cannot be unrolled by 4 */
1254  blkCnt = blockSize2;
1255 
1256  while(blkCnt > 0u)
1257  {
1258  /* Accumulator is made zero for every iteration */
1259  sum = 0;
1260 
1261  /* srcBLen number of MACS should be performed */
1262  k = srcBLen;
1263 
1264  while(k > 0u)
1265  {
1266  /* Perform the multiply-accumulate */
1267  sum += ((q31_t) * px++ * *py--);
1268 
1269  /* Decrement the loop counter */
1270  k--;
1271  }
1272 
1273  /* Store the result in the accumulator in the destination buffer. */
1274  *pOut++ = (q15_t) (sum >> 15);
1275 
1276  /* Increment the MAC count */
1277  count++;
1278 
1279  /* Update the inputA and inputB pointers for next MAC calculation */
1280  px = pIn1 + count;
1281  py = pSrc2;
1282 
1283  /* Decrement the loop counter */
1284  blkCnt--;
1285  }
1286  }
1287 
1288 
1289  /* --------------------------
1290  * Initializations of stage3
1291  * -------------------------*/
1292 
1293  /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
1294  * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
1295  * ....
1296  * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
1297  * sum += x[srcALen-1] * y[srcBLen-1]
1298  */
1299 
1300  /* In this stage the MAC operations are decreased by 1 for every iteration.
1301  The blockSize3 variable holds the number of MAC operations performed */
1302 
1303  /* Working pointer of inputA */
1304  pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
1305  px = pSrc1;
1306 
1307  /* Working pointer of inputB */
1308  pSrc2 = pIn2 + (srcBLen - 1u);
1309  pIn2 = pSrc2 - 1u;
1310  py = pIn2;
1311 
1312  /* -------------------
1313  * Stage3 process
1314  * ------------------*/
1315 
1316  /* For loop unrolling by 4, this stage is divided into two. */
1317  /* First part of this stage computes the MAC operations greater than 4 */
1318  /* Second part of this stage computes the MAC operations less than or equal to 4 */
1319 
1320  /* The first part of the stage starts here */
1321  j = blockSize3 >> 2u;
1322 
1323  while((j > 0u) && (blockSize3 > 0u))
1324  {
1325  /* Accumulator is made zero for every iteration */
1326  sum = 0;
1327 
1328  /* Apply loop unrolling and compute 4 MACs simultaneously. */
1329  k = blockSize3 >> 2u;
1330 
1331  /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
1332  ** a second loop below computes MACs for the remaining 1 to 3 samples. */
1333  py++;
1334 
1335  while(k > 0u)
1336  {
1337  sum += ((q31_t) * px++ * *py--);
1338  sum += ((q31_t) * px++ * *py--);
1339  sum += ((q31_t) * px++ * *py--);
1340  sum += ((q31_t) * px++ * *py--);
1341  /* Decrement the loop counter */
1342  k--;
1343  }
1344 
1345  /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
1346  ** No loop unrolling is used. */
1347  k = blockSize3 % 0x4u;
1348 
1349  while(k > 0u)
1350  {
1351  /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */
1352  sum += ((q31_t) * px++ * *py--);
1353 
1354  /* Decrement the loop counter */
1355  k--;
1356  }
1357 
1358  /* Store the result in the accumulator in the destination buffer. */
1359  *pOut++ = (q15_t) (sum >> 15);
1360 
1361  /* Update the inputA and inputB pointers for next MAC calculation */
1362  px = ++pSrc1;
1363  py = pIn2;
1364 
1365  /* Decrement the loop counter */
1366  blockSize3--;
1367 
1368  j--;
1369  }
1370 
1371  /* The second part of the stage starts here */
1372  /* SIMD is not used for the next MAC operations,
1373  * so pointer py is updated to read only one sample at a time */
1374  py = py + 1u;
1375 
1376  while(blockSize3 > 0u)
1377  {
1378  /* Accumulator is made zero for every iteration */
1379  sum = 0;
1380 
1381  /* blockSize3 MACs are needed for this output sample; no loop unrolling is used. */
1382  k = blockSize3;
1383 
1384  while(k > 0u)
1385  {
1386  /* Perform the multiply-accumulates */
1387  /* sum += x[srcALen-1] * y[srcBLen-1] */
1388  sum += ((q31_t) * px++ * *py--);
1389 
1390  /* Decrement the loop counter */
1391  k--;
1392  }
1393 
1394  /* Store the result in the accumulator in the destination buffer. */
1395  *pOut++ = (q15_t) (sum >> 15);
1396 
1397  /* Update the inputA and inputB pointers for next MAC calculation */
1398  px = ++pSrc1;
1399  py = pSrc2;
1400 
1401  /* Decrement the loop counter */
1402  blockSize3--;
1403  }
1404 
1405 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
1406 }
1407 
void arm_conv_fast_q15(q15_t *pSrcA, uint32_t srcALen, q15_t *pSrcB, uint32_t srcBLen, q15_t *pDst)
Convolution of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4.
int16_t q15_t
16-bit fractional data type in 1.15 format.
Definition: arm_math.h:392
#define __SIMD32(addr)
definition to read/write two 16 bit values.
Definition: arm_math.h:445
#define _SIMD32_OFFSET(addr)
Definition: arm_math.h:447
int32_t q31_t
32-bit fractional data type in 1.31 format.
Definition: arm_math.h:397