STM32F769IDiscovery  1.00
uDANTE Audio Networking with STM32F7 DISCO board
arm_conv_partial_q15.c
Go to the documentation of this file.
1 /* ----------------------------------------------------------------------
2 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.
3 *
4 * $Date: 19. March 2015
5 * $Revision: V.1.4.5
6 *
7 * Project: CMSIS DSP Library
8 * Title: arm_conv_partial_q15.c
9 *
10 * Description: Partial convolution of Q15 sequences.
11 *
12 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 * - Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * - Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in
21 * the documentation and/or other materials provided with the
22 * distribution.
23 * - Neither the name of ARM LIMITED nor the names of its contributors
24 * may be used to endorse or promote products derived from this
25 * software without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 * POSSIBILITY OF SUCH DAMAGE.
39 * -------------------------------------------------------------------- */
40 
41 #include "arm_math.h"
42 
72  q15_t * pSrcA,
73  uint32_t srcALen,
74  q15_t * pSrcB,
75  uint32_t srcBLen,
76  q15_t * pDst,
77  uint32_t firstIndex,
78  uint32_t numPoints)
79 {
80 
81 #if (defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE)
82 
83  /* Run the below code for Cortex-M4 and Cortex-M3 */
84 
85  q15_t *pIn1; /* inputA pointer */
86  q15_t *pIn2; /* inputB pointer */
87  q15_t *pOut = pDst; /* output pointer */
88  q63_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
89  q15_t *px; /* Intermediate inputA pointer */
90  q15_t *py; /* Intermediate inputB pointer */
91  q15_t *pSrc1, *pSrc2; /* Intermediate pointers */
92  q31_t x0, x1, x2, x3, c0; /* Temporary input variables */
93  uint32_t j, k, count, check, blkCnt;
94  int32_t blockSize1, blockSize2, blockSize3; /* loop counter */
95  arm_status status; /* status of Partial convolution */
96 
97  /* Check for range of output samples to be calculated */
98  if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
99  {
100  /* Set status as ARM_MATH_ARGUMENT_ERROR */
101  status = ARM_MATH_ARGUMENT_ERROR;
102  }
103  else
104  {
105 
106  /* The algorithm implementation is based on the lengths of the inputs. */
107  /* srcB is always made to slide across srcA. */
108  /* So srcBLen is always considered as shorter or equal to srcALen */
109  if(srcALen >= srcBLen)
110  {
111  /* Initialization of inputA pointer */
112  pIn1 = pSrcA;
113 
114  /* Initialization of inputB pointer */
115  pIn2 = pSrcB;
116  }
117  else
118  {
119  /* Initialization of inputA pointer */
120  pIn1 = pSrcB;
121 
122  /* Initialization of inputB pointer */
123  pIn2 = pSrcA;
124 
125  /* srcBLen is always considered as shorter or equal to srcALen */
126  j = srcBLen;
127  srcBLen = srcALen;
128  srcALen = j;
129  }
130 
131  /* Conditions to check which loopCounter holds
132  * the first and last indices of the output samples to be calculated. */
133  check = firstIndex + numPoints;
134  blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0;
135  blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3;
136  blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex);
137  blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 :
138  (int32_t) numPoints) : 0;
139  blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) +
140  (int32_t) firstIndex);
141  blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
142 
143  /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
144  /* The function is internally
145  * divided into three stages according to the number of multiplications that have to
146  * take place between inputA samples and inputB samples. In the first stage of the
147  * algorithm, the multiplications increase by one for every iteration.
148  * In the second stage of the algorithm, srcBLen number of multiplications are done.
149  * In the third stage of the algorithm, the multiplications decrease by one
150  * for every iteration. */
151 
152  /* Set the output pointer to point to the firstIndex
153  * of the output sample to be calculated. */
154  pOut = pDst + firstIndex;
155 
156  /* --------------------------
157  * Initializations of stage1
158  * -------------------------*/
159 
160  /* sum = x[0] * y[0]
161  * sum = x[0] * y[1] + x[1] * y[0]
162  * ....
163  * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
164  */
165 
166  /* In this stage the MAC operations are increased by 1 for every iteration.
167  The count variable holds the number of MAC operations performed.
168  Since the partial convolution starts from firstIndex
169  Number of Macs to be performed is firstIndex + 1 */
170  count = 1u + firstIndex;
171 
172  /* Working pointer of inputA */
173  px = pIn1;
174 
175  /* Working pointer of inputB */
176  pSrc2 = pIn2 + firstIndex;
177  py = pSrc2;
178 
179  /* ------------------------
180  * Stage1 process
181  * ----------------------*/
182 
183  /* For loop unrolling by 4, this stage is divided into two. */
184  /* First part of this stage computes the MAC operations less than 4 */
185  /* Second part of this stage computes the MAC operations greater than or equal to 4 */
186 
187  /* The first part of the stage starts here */
188  while((count < 4u) && (blockSize1 > 0))
189  {
190  /* Accumulator is made zero for every iteration */
191  sum = 0;
192 
193  /* Loop over number of MAC operations between
194  * inputA samples and inputB samples */
195  k = count;
196 
197  while(k > 0u)
198  {
199  /* Perform the multiply-accumulates */
200  sum = __SMLALD(*px++, *py--, sum);
201 
202  /* Decrement the loop counter */
203  k--;
204  }
205 
206  /* Store the result in the accumulator in the destination buffer. */
207  *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
208 
209  /* Update the inputA and inputB pointers for next MAC calculation */
210  py = ++pSrc2;
211  px = pIn1;
212 
213  /* Increment the MAC count */
214  count++;
215 
216  /* Decrement the loop counter */
217  blockSize1--;
218  }
219 
220  /* The second part of the stage starts here */
221  /* The internal loop, over count, is unrolled by 4 */
222  /* To read the last two inputB samples of the current window using SIMD:
223  * y[count-1] and y[count-2], py is decremented by 1 */
224  py = py - 1;
225 
226  while(blockSize1 > 0)
227  {
228  /* Accumulator is made zero for every iteration */
229  sum = 0;
230 
231  /* Apply loop unrolling and compute 4 MACs simultaneously. */
232  k = count >> 2u;
233 
234  /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
235  ** a second loop below computes MACs for the remaining 1 to 3 samples. */
236  while(k > 0u)
237  {
238  /* Perform the multiply-accumulates */
239  /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */
240  sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
241  /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */
242  sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
243 
244  /* Decrement the loop counter */
245  k--;
246  }
247 
248  /* For the next MAC operations, the pointer py is used without SIMD
249  * So, py is incremented by 1 */
250  py = py + 1u;
251 
252  /* If the count is not a multiple of 4, compute any remaining MACs here.
253  ** No loop unrolling is used. */
254  k = count % 0x4u;
255 
256  while(k > 0u)
257  {
258  /* Perform the multiply-accumulates */
259  sum = __SMLALD(*px++, *py--, sum);
260 
261  /* Decrement the loop counter */
262  k--;
263  }
264 
265  /* Store the result in the accumulator in the destination buffer. */
266  *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
267 
268  /* Update the inputA and inputB pointers for next MAC calculation */
269  py = ++pSrc2 - 1u;
270  px = pIn1;
271 
272  /* Increment the MAC count */
273  count++;
274 
275  /* Decrement the loop counter */
276  blockSize1--;
277  }
278 
279  /* --------------------------
280  * Initializations of stage2
281  * ------------------------*/
282 
283  /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
284  * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
285  * ....
286  * sum = x[srcALen-srcBLen] * y[srcBLen-1] + x[srcALen-srcBLen+1] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
287  */
288 
289  /* Working pointer of inputA */
290  if((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
291  {
292  px = pIn1 + firstIndex - srcBLen + 1;
293  }
294  else
295  {
296  px = pIn1;
297  }
298 
299  /* Working pointer of inputB */
300  pSrc2 = pIn2 + (srcBLen - 1u);
301  py = pSrc2;
302 
303  /* count is the index by which the pointer pIn1 to be incremented */
304  count = 0u;
305 
306 
307  /* --------------------
308  * Stage2 process
309  * -------------------*/
310 
311  /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
312  * So, to loop unroll over blockSize2,
313  * srcBLen should be greater than or equal to 4 */
314  if(srcBLen >= 4u)
315  {
316  /* Loop unroll over blockSize2, by 4 */
317  blkCnt = blockSize2 >> 2u;
318 
319  while(blkCnt > 0u)
320  {
321  py = py - 1u;
322 
323  /* Set all accumulators to zero */
324  acc0 = 0;
325  acc1 = 0;
326  acc2 = 0;
327  acc3 = 0;
328 
329 
330  /* read x[0], x[1] samples */
331  x0 = *__SIMD32(px);
332  /* read x[1], x[2] samples */
333  x1 = _SIMD32_OFFSET(px+1);
334  px+= 2u;
335 
336 
337  /* Apply loop unrolling and compute 4 MACs simultaneously. */
338  k = srcBLen >> 2u;
339 
340  /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
341  ** a second loop below computes MACs for the remaining 1 to 3 samples. */
342  do
343  {
344  /* Read the last two inputB samples using SIMD:
345  * y[srcBLen - 1] and y[srcBLen - 2] */
346  c0 = *__SIMD32(py)--;
347 
348  /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
349  acc0 = __SMLALDX(x0, c0, acc0);
350 
351  /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
352  acc1 = __SMLALDX(x1, c0, acc1);
353 
354  /* Read x[2], x[3] */
355  x2 = *__SIMD32(px);
356 
357  /* Read x[3], x[4] */
358  x3 = _SIMD32_OFFSET(px+1);
359 
360  /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
361  acc2 = __SMLALDX(x2, c0, acc2);
362 
363  /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
364  acc3 = __SMLALDX(x3, c0, acc3);
365 
366  /* Read y[srcBLen - 3] and y[srcBLen - 4] */
367  c0 = *__SIMD32(py)--;
368 
369  /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
370  acc0 = __SMLALDX(x2, c0, acc0);
371 
372  /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
373  acc1 = __SMLALDX(x3, c0, acc1);
374 
375  /* Read x[4], x[5] */
376  x0 = _SIMD32_OFFSET(px+2);
377 
378  /* Read x[5], x[6] */
379  x1 = _SIMD32_OFFSET(px+3);
380  px += 4u;
381 
382  /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
383  acc2 = __SMLALDX(x0, c0, acc2);
384 
385  /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
386  acc3 = __SMLALDX(x1, c0, acc3);
387 
388  } while(--k);
389 
390  /* For the next MAC operations, SIMD is not used
391  * So, the 16 bit pointer of inputB, py is updated */
392 
393  /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
394  ** No loop unrolling is used. */
395  k = srcBLen % 0x4u;
396 
397  if(k == 1u)
398  {
399  /* Read y[srcBLen - 5] */
400  c0 = *(py+1);
401 
402 #ifdef ARM_MATH_BIG_ENDIAN
403 
404  c0 = c0 << 16u;
405 
406 #else
407 
408  c0 = c0 & 0x0000FFFF;
409 
410 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */
411 
412  /* Read x[7] */
413  x3 = *__SIMD32(px);
414  px++;
415 
416  /* Perform the multiply-accumulates */
417  acc0 = __SMLALD(x0, c0, acc0);
418  acc1 = __SMLALD(x1, c0, acc1);
419  acc2 = __SMLALDX(x1, c0, acc2);
420  acc3 = __SMLALDX(x3, c0, acc3);
421  }
422 
423  if(k == 2u)
424  {
425  /* Read y[srcBLen - 5], y[srcBLen - 6] */
426  c0 = _SIMD32_OFFSET(py);
427 
428  /* Read x[7], x[8] */
429  x3 = *__SIMD32(px);
430 
431  /* Read x[9] */
432  x2 = _SIMD32_OFFSET(px+1);
433  px += 2u;
434 
435  /* Perform the multiply-accumulates */
436  acc0 = __SMLALDX(x0, c0, acc0);
437  acc1 = __SMLALDX(x1, c0, acc1);
438  acc2 = __SMLALDX(x3, c0, acc2);
439  acc3 = __SMLALDX(x2, c0, acc3);
440  }
441 
442  if(k == 3u)
443  {
444  /* Read y[srcBLen - 5], y[srcBLen - 6] */
445  c0 = _SIMD32_OFFSET(py);
446 
447  /* Read x[7], x[8] */
448  x3 = *__SIMD32(px);
449 
450  /* Read x[9] */
451  x2 = _SIMD32_OFFSET(px+1);
452 
453  /* Perform the multiply-accumulates */
454  acc0 = __SMLALDX(x0, c0, acc0);
455  acc1 = __SMLALDX(x1, c0, acc1);
456  acc2 = __SMLALDX(x3, c0, acc2);
457  acc3 = __SMLALDX(x2, c0, acc3);
458 
459  c0 = *(py-1);
460 
461 #ifdef ARM_MATH_BIG_ENDIAN
462 
463  c0 = c0 << 16u;
464 #else
465 
466  c0 = c0 & 0x0000FFFF;
467 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */
468 
469  /* Read x[10] */
470  x3 = _SIMD32_OFFSET(px+2);
471  px += 3u;
472 
473  /* Perform the multiply-accumulates */
474  acc0 = __SMLALDX(x1, c0, acc0);
475  acc1 = __SMLALD(x2, c0, acc1);
476  acc2 = __SMLALDX(x2, c0, acc2);
477  acc3 = __SMLALDX(x3, c0, acc3);
478  }
479 
480 
481  /* Store the results in the accumulators in the destination buffer. */
482 
483 #ifndef ARM_MATH_BIG_ENDIAN
484 
485  *__SIMD32(pOut)++ =
486  __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16);
487  *__SIMD32(pOut)++ =
488  __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16);
489 
490 #else
491 
492  *__SIMD32(pOut)++ =
493  __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16);
494  *__SIMD32(pOut)++ =
495  __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16);
496 
497 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
498 
499  /* Increment the pointer pIn1 index, count by 4 */
500  count += 4u;
501 
502  /* Update the inputA and inputB pointers for next MAC calculation */
503  px = pIn1 + count;
504  py = pSrc2;
505 
506  /* Decrement the loop counter */
507  blkCnt--;
508  }
509 
510  /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
511  ** No loop unrolling is used. */
512  blkCnt = (uint32_t) blockSize2 % 0x4u;
513 
514  while(blkCnt > 0u)
515  {
516  /* Accumulator is made zero for every iteration */
517  sum = 0;
518 
519  /* Apply loop unrolling and compute 4 MACs simultaneously. */
520  k = srcBLen >> 2u;
521 
522  /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
523  ** a second loop below computes MACs for the remaining 1 to 3 samples. */
524  while(k > 0u)
525  {
526  /* Perform the multiply-accumulates */
527  sum += (q63_t) ((q31_t) * px++ * *py--);
528  sum += (q63_t) ((q31_t) * px++ * *py--);
529  sum += (q63_t) ((q31_t) * px++ * *py--);
530  sum += (q63_t) ((q31_t) * px++ * *py--);
531 
532  /* Decrement the loop counter */
533  k--;
534  }
535 
536  /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
537  ** No loop unrolling is used. */
538  k = srcBLen % 0x4u;
539 
540  while(k > 0u)
541  {
542  /* Perform the multiply-accumulates */
543  sum += (q63_t) ((q31_t) * px++ * *py--);
544 
545  /* Decrement the loop counter */
546  k--;
547  }
548 
549  /* Store the result in the accumulator in the destination buffer. */
550  *pOut++ = (q15_t) (__SSAT(sum >> 15, 16));
551 
552  /* Increment the pointer pIn1 index, count by 1 */
553  count++;
554 
555  /* Update the inputA and inputB pointers for next MAC calculation */
556  px = pIn1 + count;
557  py = pSrc2;
558 
559  /* Decrement the loop counter */
560  blkCnt--;
561  }
562  }
563  else
564  {
565  /* If the srcBLen is less than 4,
566  * the blockSize2 loop cannot be unrolled by 4 */
567  blkCnt = (uint32_t) blockSize2;
568 
569  while(blkCnt > 0u)
570  {
571  /* Accumulator is made zero for every iteration */
572  sum = 0;
573 
574  /* srcBLen number of MACS should be performed */
575  k = srcBLen;
576 
577  while(k > 0u)
578  {
579  /* Perform the multiply-accumulate */
580  sum += (q63_t) ((q31_t) * px++ * *py--);
581 
582  /* Decrement the loop counter */
583  k--;
584  }
585 
586  /* Store the result in the accumulator in the destination buffer. */
587  *pOut++ = (q15_t) (__SSAT(sum >> 15, 16));
588 
589  /* Increment the MAC count */
590  count++;
591 
592  /* Update the inputA and inputB pointers for next MAC calculation */
593  px = pIn1 + count;
594  py = pSrc2;
595 
596  /* Decrement the loop counter */
597  blkCnt--;
598  }
599  }
600 
601 
602  /* --------------------------
603  * Initializations of stage3
604  * -------------------------*/
605 
606  /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
607  * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
608  * ....
609  * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
610  * sum += x[srcALen-1] * y[srcBLen-1]
611  */
612 
613  /* In this stage the MAC operations are decreased by 1 for every iteration.
614  The count variable holds the number of MAC operations performed */
615  count = srcBLen - 1u;
616 
617  /* Working pointer of inputA */
618  pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
619  px = pSrc1;
620 
621  /* Working pointer of inputB */
622  pSrc2 = pIn2 + (srcBLen - 1u);
623  pIn2 = pSrc2 - 1u;
624  py = pIn2;
625 
626  /* -------------------
627  * Stage3 process
628  * ------------------*/
629 
630  /* For loop unrolling by 4, this stage is divided into two. */
631  /* First part of this stage computes the MAC operations greater than 4 */
632  /* Second part of this stage computes the MAC operations less than or equal to 4 */
633 
634  /* The first part of the stage starts here */
635  j = count >> 2u;
636 
637  while((j > 0u) && (blockSize3 > 0))
638  {
639  /* Accumulator is made zero for every iteration */
640  sum = 0;
641 
642  /* Apply loop unrolling and compute 4 MACs simultaneously. */
643  k = count >> 2u;
644 
645  /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
646  ** a second loop below computes MACs for the remaining 1 to 3 samples. */
647  while(k > 0u)
648  {
649  /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied
650  * with y[srcBLen - 1], y[srcBLen - 2] respectively */
651  sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
652  /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied
653  * with y[srcBLen - 3], y[srcBLen - 4] respectively */
654  sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
655 
656  /* Decrement the loop counter */
657  k--;
658  }
659 
660  /* For the next MAC operations, the pointer py is used without SIMD
661  * So, py is incremented by 1 */
662  py = py + 1u;
663 
664  /* If the count is not a multiple of 4, compute any remaining MACs here.
665  ** No loop unrolling is used. */
666  k = count % 0x4u;
667 
668  while(k > 0u)
669  {
670  /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */
671  sum = __SMLALD(*px++, *py--, sum);
672 
673  /* Decrement the loop counter */
674  k--;
675  }
676 
677  /* Store the result in the accumulator in the destination buffer. */
678  *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
679 
680  /* Update the inputA and inputB pointers for next MAC calculation */
681  px = ++pSrc1;
682  py = pIn2;
683 
684  /* Decrement the MAC count */
685  count--;
686 
687  /* Decrement the loop counter */
688  blockSize3--;
689 
690  j--;
691  }
692 
693  /* The second part of the stage starts here */
694  /* SIMD is not used for the next MAC operations,
695  * so pointer py is updated to read only one sample at a time */
696  py = py + 1u;
697 
698  while(blockSize3 > 0)
699  {
700  /* Accumulator is made zero for every iteration */
701  sum = 0;
702 
703  /* Loop over the remaining count MAC operations; no loop unrolling is used. */
704  k = count;
705 
706  while(k > 0u)
707  {
708  /* Perform the multiply-accumulates */
709  /* sum += x[srcALen-1] * y[srcBLen-1] */
710  sum = __SMLALD(*px++, *py--, sum);
711 
712  /* Decrement the loop counter */
713  k--;
714  }
715 
716  /* Store the result in the accumulator in the destination buffer. */
717  *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
718 
719  /* Update the inputA and inputB pointers for next MAC calculation */
720  px = ++pSrc1;
721  py = pSrc2;
722 
723  /* Decrement the MAC count */
724  count--;
725 
726  /* Decrement the loop counter */
727  blockSize3--;
728  }
729 
730  /* set status as ARM_MATH_SUCCESS */
731  status = ARM_MATH_SUCCESS;
732  }
733 
734  /* Return to application */
735  return (status);
736 
737 #else
738 
739  /* Run the below code for Cortex-M0 */
740 
741  q15_t *pIn1 = pSrcA; /* inputA pointer */
742  q15_t *pIn2 = pSrcB; /* inputB pointer */
743  q63_t sum; /* Accumulator */
744  uint32_t i, j; /* loop counters */
745  arm_status status; /* status of Partial convolution */
746 
747  /* Check for range of output samples to be calculated */
748  if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
749  {
750  /* Set status as ARM_MATH_ARGUMENT_ERROR */
751  status = ARM_MATH_ARGUMENT_ERROR;
752  }
753  else
754  {
755  /* Loop to calculate convolution for output length number of values */
756  for (i = firstIndex; i <= (firstIndex + numPoints - 1); i++)
757  {
758  /* Initialize sum with zero to carry on MAC operations */
759  sum = 0;
760 
761  /* Loop to perform MAC operations according to convolution equation */
762  for (j = 0; j <= i; j++)
763  {
764  /* Check the array limitations */
765  if(((i - j) < srcBLen) && (j < srcALen))
766  {
767  /* z[i] += x[j] * y[i-j] */
768  sum += ((q31_t) pIn1[j] * (pIn2[i - j]));
769  }
770  }
771 
772  /* Store the output in the destination buffer */
773  pDst[i] = (q15_t) __SSAT((sum >> 15u), 16u);
774  }
775  /* set status as ARM_MATH_SUCCESS as there are no argument errors */
776  status = ARM_MATH_SUCCESS;
777  }
778  return (status);
779 
780 #endif /* #if (defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE) */
781 
782 }
783 
int64_t q63_t
64-bit fractional data type in 1.63 format.
Definition: arm_math.h:402
int16_t q15_t
16-bit fractional data type in 1.15 format.
Definition: arm_math.h:392
#define __SIMD32(addr)
definition to read/write two 16 bit values.
Definition: arm_math.h:445
#define _SIMD32_OFFSET(addr)
Definition: arm_math.h:447
int32_t q31_t
32-bit fractional data type in 1.31 format.
Definition: arm_math.h:397
arm_status arm_conv_partial_q15(q15_t *pSrcA, uint32_t srcALen, q15_t *pSrcB, uint32_t srcBLen, q15_t *pDst, uint32_t firstIndex, uint32_t numPoints)
Partial convolution of Q15 sequences.
arm_status
Error status returned by some functions in the library.
Definition: arm_math.h:373