STM32F769IDiscovery  1.00
uDANTE Audio Networking with STM32F7 DISCO board
arm_conv_partial_fast_q15.c
Go to the documentation of this file.
1 /* ----------------------------------------------------------------------
2 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.
3 *
4 * $Date: 19. March 2015
5 * $Revision: V.1.4.5
6 *
7 * Project: CMSIS DSP Library
8 * Title: arm_conv_partial_fast_q15.c
9 *
10 * Description: Fast Q15 Partial convolution.
11 *
12 * Target Processor: Cortex-M4/Cortex-M3
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 * - Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * - Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in
21 * the documentation and/or other materials provided with the
22 * distribution.
23 * - Neither the name of ARM LIMITED nor the names of its contributors
24 * may be used to endorse or promote products derived from this
25 * software without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 * POSSIBILITY OF SUCH DAMAGE.
39 * -------------------------------------------------------------------- */
40 
41 #include "arm_math.h"
42 
68  q15_t * pSrcA,
69  uint32_t srcALen,
70  q15_t * pSrcB,
71  uint32_t srcBLen,
72  q15_t * pDst,
73  uint32_t firstIndex,
74  uint32_t numPoints)
75 {
76 #ifndef UNALIGNED_SUPPORT_DISABLE
77 
78  q15_t *pIn1; /* inputA pointer */
79  q15_t *pIn2; /* inputB pointer */
80  q15_t *pOut = pDst; /* output pointer */
81  q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
82  q15_t *px; /* Intermediate inputA pointer */
83  q15_t *py; /* Intermediate inputB pointer */
84  q15_t *pSrc1, *pSrc2; /* Intermediate pointers */
85  q31_t x0, x1, x2, x3, c0;
86  uint32_t j, k, count, check, blkCnt;
87  int32_t blockSize1, blockSize2, blockSize3; /* loop counters */
88  arm_status status; /* status of Partial convolution */
89 
90  /* Check for range of output samples to be calculated */
91  if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
92  {
93  /* Set status as ARM_MATH_ARGUMENT_ERROR */
94  status = ARM_MATH_ARGUMENT_ERROR;
95  }
96  else
97  {
98 
99  /* The algorithm implementation is based on the lengths of the inputs. */
100  /* srcB is always made to slide across srcA. */
101  /* So srcBLen is always considered as shorter or equal to srcALen */
102  if(srcALen >=srcBLen)
103  {
104  /* Initialization of inputA pointer */
105  pIn1 = pSrcA;
106 
107  /* Initialization of inputB pointer */
108  pIn2 = pSrcB;
109  }
110  else
111  {
112  /* Initialization of inputA pointer */
113  pIn1 = pSrcB;
114 
115  /* Initialization of inputB pointer */
116  pIn2 = pSrcA;
117 
118  /* srcBLen is always considered as shorter or equal to srcALen */
119  j = srcBLen;
120  srcBLen = srcALen;
121  srcALen = j;
122  }
123 
124  /* Conditions to check which loopCounter holds
125  * the first and last indices of the output samples to be calculated. */
126  check = firstIndex + numPoints;
127  blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0;
128  blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3;
129  blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex);
130  blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 :
131  (int32_t) numPoints) : 0;
132  blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) +
133  (int32_t) firstIndex);
134  blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
135 
136  /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
137  /* The function is internally
138  * divided into three stages according to the number of multiplications that has to be
139  * taken place between inputA samples and inputB samples. In the first stage of the
140  * algorithm, the multiplications increase by one for every iteration.
141  * In the second stage of the algorithm, srcBLen number of multiplications are done.
142  * In the third stage of the algorithm, the multiplications decrease by one
143  * for every iteration. */
144 
145  /* Set the output pointer to point to the firstIndex
146  * of the output sample to be calculated. */
147  pOut = pDst + firstIndex;
148 
149  /* --------------------------
150  * Initializations of stage1
151  * -------------------------*/
152 
153  /* sum = x[0] * y[0]
154  * sum = x[0] * y[1] + x[1] * y[0]
155  * ....
156  * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
157  */
158 
159  /* In this stage the MAC operations are increased by 1 for every iteration.
160  The count variable holds the number of MAC operations performed.
161  Since the partial convolution starts from firstIndex
162  Number of Macs to be performed is firstIndex + 1 */
163  count = 1u + firstIndex;
164 
165  /* Working pointer of inputA */
166  px = pIn1;
167 
168  /* Working pointer of inputB */
169  pSrc2 = pIn2 + firstIndex;
170  py = pSrc2;
171 
172  /* ------------------------
173  * Stage1 process
174  * ----------------------*/
175 
176  /* For loop unrolling by 4, this stage is divided into two. */
177  /* First part of this stage computes the MAC operations less than 4 */
178  /* Second part of this stage computes the MAC operations greater than or equal to 4 */
179 
180  /* The first part of the stage starts here */
181  while((count < 4u) && (blockSize1 > 0))
182  {
183  /* Accumulator is made zero for every iteration */
184  sum = 0;
185 
186  /* Loop over number of MAC operations between
187  * inputA samples and inputB samples */
188  k = count;
189 
190  while(k > 0u)
191  {
192  /* Perform the multiply-accumulates */
193  sum = __SMLAD(*px++, *py--, sum);
194 
195  /* Decrement the loop counter */
196  k--;
197  }
198 
199  /* Store the result in the accumulator in the destination buffer. */
200  *pOut++ = (q15_t) (sum >> 15);
201 
202  /* Update the inputA and inputB pointers for next MAC calculation */
203  py = ++pSrc2;
204  px = pIn1;
205 
206  /* Increment the MAC count */
207  count++;
208 
209  /* Decrement the loop counter */
210  blockSize1--;
211  }
212 
213  /* The second part of the stage starts here */
214  /* The internal loop, over count, is unrolled by 4 */
215 /* To read the last two inputB samples of the current window using SIMD:
216 * y[count - 1] and y[count - 2], py is decremented by 1 */
217  py = py - 1;
218 
219  while(blockSize1 > 0)
220  {
221  /* Accumulator is made zero for every iteration */
222  sum = 0;
223 
224  /* Apply loop unrolling and compute 4 MACs simultaneously. */
225  k = count >> 2u;
226 
227  /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
228  ** a second loop below computes MACs for the remaining 1 to 3 samples. */
229  while(k > 0u)
230  {
231  /* Perform the multiply-accumulates */
232  /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */
233  sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
234  /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */
235  sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
236 
237  /* Decrement the loop counter */
238  k--;
239  }
240 
241  /* For the next MAC operations, the pointer py is used without SIMD
242  * So, py is incremented by 1 */
243  py = py + 1u;
244 
245  /* If the count is not a multiple of 4, compute any remaining MACs here.
246  ** No loop unrolling is used. */
247  k = count % 0x4u;
248 
249  while(k > 0u)
250  {
251  /* Perform the multiply-accumulates */
252  sum = __SMLAD(*px++, *py--, sum);
253 
254  /* Decrement the loop counter */
255  k--;
256  }
257 
258  /* Store the result in the accumulator in the destination buffer. */
259  *pOut++ = (q15_t) (sum >> 15);
260 
261  /* Update the inputA and inputB pointers for next MAC calculation */
262  py = ++pSrc2 - 1u;
263  px = pIn1;
264 
265  /* Increment the MAC count */
266  count++;
267 
268  /* Decrement the loop counter */
269  blockSize1--;
270  }
271 
272  /* --------------------------
273  * Initializations of stage2
274  * ------------------------*/
275 
276  /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
277  * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
278  * ....
279 * sum = x[srcALen-srcBLen] * y[srcBLen-1] + x[srcALen-srcBLen+1] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
280  */
281 
282  /* Working pointer of inputA */
283  if((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
284  {
285  px = pIn1 + firstIndex - srcBLen + 1;
286  }
287  else
288  {
289  px = pIn1;
290  }
291 
292  /* Working pointer of inputB */
293  pSrc2 = pIn2 + (srcBLen - 1u);
294  py = pSrc2;
295 
296  /* count is the index by which the pointer pIn1 to be incremented */
297  count = 0u;
298 
299 
300  /* --------------------
301  * Stage2 process
302  * -------------------*/
303 
304  /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
305  * So, to loop unroll over blockSize2,
306  * srcBLen should be greater than or equal to 4 */
307  if(srcBLen >= 4u)
308  {
309  /* Loop unroll over blockSize2, by 4 */
310  blkCnt = ((uint32_t) blockSize2 >> 2u);
311 
312  while(blkCnt > 0u)
313  {
314  py = py - 1u;
315 
316  /* Set all accumulators to zero */
317  acc0 = 0;
318  acc1 = 0;
319  acc2 = 0;
320  acc3 = 0;
321 
322 
323  /* read x[0], x[1] samples */
324  x0 = *__SIMD32(px);
325  /* read x[1], x[2] samples */
326  x1 = _SIMD32_OFFSET(px+1);
327  px+= 2u;
328 
329 
330  /* Apply loop unrolling and compute 4 MACs simultaneously. */
331  k = srcBLen >> 2u;
332 
333  /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
334  ** a second loop below computes MACs for the remaining 1 to 3 samples. */
335  do
336  {
337  /* Read the last two inputB samples using SIMD:
338  * y[srcBLen - 1] and y[srcBLen - 2] */
339  c0 = *__SIMD32(py)--;
340 
341  /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
342  acc0 = __SMLADX(x0, c0, acc0);
343 
344  /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
345  acc1 = __SMLADX(x1, c0, acc1);
346 
347  /* Read x[2], x[3] */
348  x2 = *__SIMD32(px);
349 
350  /* Read x[3], x[4] */
351  x3 = _SIMD32_OFFSET(px+1);
352 
353  /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
354  acc2 = __SMLADX(x2, c0, acc2);
355 
356  /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
357  acc3 = __SMLADX(x3, c0, acc3);
358 
359  /* Read y[srcBLen - 3] and y[srcBLen - 4] */
360  c0 = *__SIMD32(py)--;
361 
362  /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
363  acc0 = __SMLADX(x2, c0, acc0);
364 
365  /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
366  acc1 = __SMLADX(x3, c0, acc1);
367 
368  /* Read x[4], x[5] */
369  x0 = _SIMD32_OFFSET(px+2);
370 
371  /* Read x[5], x[6] */
372  x1 = _SIMD32_OFFSET(px+3);
373  px += 4u;
374 
375  /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
376  acc2 = __SMLADX(x0, c0, acc2);
377 
378  /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
379  acc3 = __SMLADX(x1, c0, acc3);
380 
381  } while(--k);
382 
383  /* For the next MAC operations, SIMD is not used
384 * So, the 16 bit pointer of inputB, py is updated */
385 
386  /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
387  ** No loop unrolling is used. */
388  k = srcBLen % 0x4u;
389 
390  if(k == 1u)
391  {
392  /* Read y[srcBLen - 5] */
393  c0 = *(py+1);
394 #ifdef ARM_MATH_BIG_ENDIAN
395 
396  c0 = c0 << 16u;
397 
398 #else
399 
400  c0 = c0 & 0x0000FFFF;
401 
402 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */
403 
404  /* Read x[7] */
405  x3 = *__SIMD32(px);
406  px++;
407 
408  /* Perform the multiply-accumulates */
409  acc0 = __SMLAD(x0, c0, acc0);
410  acc1 = __SMLAD(x1, c0, acc1);
411  acc2 = __SMLADX(x1, c0, acc2);
412  acc3 = __SMLADX(x3, c0, acc3);
413  }
414 
415  if(k == 2u)
416  {
417  /* Read y[srcBLen - 5], y[srcBLen - 6] */
418  c0 = _SIMD32_OFFSET(py);
419 
420  /* Read x[7], x[8] */
421  x3 = *__SIMD32(px);
422 
423  /* Read x[9] */
424  x2 = _SIMD32_OFFSET(px+1);
425  px += 2u;
426 
427  /* Perform the multiply-accumulates */
428  acc0 = __SMLADX(x0, c0, acc0);
429  acc1 = __SMLADX(x1, c0, acc1);
430  acc2 = __SMLADX(x3, c0, acc2);
431  acc3 = __SMLADX(x2, c0, acc3);
432  }
433 
434  if(k == 3u)
435  {
436  /* Read y[srcBLen - 5], y[srcBLen - 6] */
437  c0 = _SIMD32_OFFSET(py);
438 
439  /* Read x[7], x[8] */
440  x3 = *__SIMD32(px);
441 
442  /* Read x[9] */
443  x2 = _SIMD32_OFFSET(px+1);
444 
445  /* Perform the multiply-accumulates */
446  acc0 = __SMLADX(x0, c0, acc0);
447  acc1 = __SMLADX(x1, c0, acc1);
448  acc2 = __SMLADX(x3, c0, acc2);
449  acc3 = __SMLADX(x2, c0, acc3);
450 
451  c0 = *(py-1);
452 #ifdef ARM_MATH_BIG_ENDIAN
453 
454  c0 = c0 << 16u;
455 #else
456 
457  c0 = c0 & 0x0000FFFF;
458 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */
459 
460  /* Read x[10] */
461  x3 = _SIMD32_OFFSET(px+2);
462  px += 3u;
463 
464  /* Perform the multiply-accumulates */
465  acc0 = __SMLADX(x1, c0, acc0);
466  acc1 = __SMLAD(x2, c0, acc1);
467  acc2 = __SMLADX(x2, c0, acc2);
468  acc3 = __SMLADX(x3, c0, acc3);
469  }
470 
471  /* Store the results in the accumulators in the destination buffer. */
472 #ifndef ARM_MATH_BIG_ENDIAN
473 
474  *__SIMD32(pOut)++ = __PKHBT(acc0 >> 15, acc1 >> 15, 16);
475  *__SIMD32(pOut)++ = __PKHBT(acc2 >> 15, acc3 >> 15, 16);
476 
477 #else
478 
479  *__SIMD32(pOut)++ = __PKHBT(acc1 >> 15, acc0 >> 15, 16);
480  *__SIMD32(pOut)++ = __PKHBT(acc3 >> 15, acc2 >> 15, 16);
481 
482 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
483 
484  /* Increment the pointer pIn1 index, count by 4 */
485  count += 4u;
486 
487  /* Update the inputA and inputB pointers for next MAC calculation */
488  px = pIn1 + count;
489  py = pSrc2;
490 
491  /* Decrement the loop counter */
492  blkCnt--;
493  }
494 
495  /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
496  ** No loop unrolling is used. */
497  blkCnt = (uint32_t) blockSize2 % 0x4u;
498 
499  while(blkCnt > 0u)
500  {
501  /* Accumulator is made zero for every iteration */
502  sum = 0;
503 
504  /* Apply loop unrolling and compute 4 MACs simultaneously. */
505  k = srcBLen >> 2u;
506 
507  /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
508  ** a second loop below computes MACs for the remaining 1 to 3 samples. */
509  while(k > 0u)
510  {
511  /* Perform the multiply-accumulates */
512  sum += ((q31_t) * px++ * *py--);
513  sum += ((q31_t) * px++ * *py--);
514  sum += ((q31_t) * px++ * *py--);
515  sum += ((q31_t) * px++ * *py--);
516 
517  /* Decrement the loop counter */
518  k--;
519  }
520 
521  /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
522  ** No loop unrolling is used. */
523  k = srcBLen % 0x4u;
524 
525  while(k > 0u)
526  {
527  /* Perform the multiply-accumulates */
528  sum += ((q31_t) * px++ * *py--);
529 
530  /* Decrement the loop counter */
531  k--;
532  }
533 
534  /* Store the result in the accumulator in the destination buffer. */
535  *pOut++ = (q15_t) (sum >> 15);
536 
537  /* Increment the pointer pIn1 index, count by 1 */
538  count++;
539 
540  /* Update the inputA and inputB pointers for next MAC calculation */
541  px = pIn1 + count;
542  py = pSrc2;
543 
544  /* Decrement the loop counter */
545  blkCnt--;
546  }
547  }
548  else
549  {
550 /* If srcBLen is less than 4,
551 * the blockSize2 loop cannot be unrolled by 4 */
552  blkCnt = (uint32_t) blockSize2;
553 
554  while(blkCnt > 0u)
555  {
556  /* Accumulator is made zero for every iteration */
557  sum = 0;
558 
559  /* srcBLen number of MACS should be performed */
560  k = srcBLen;
561 
562  while(k > 0u)
563  {
564  /* Perform the multiply-accumulate */
565  sum += ((q31_t) * px++ * *py--);
566 
567  /* Decrement the loop counter */
568  k--;
569  }
570 
571  /* Store the result in the accumulator in the destination buffer. */
572  *pOut++ = (q15_t) (sum >> 15);
573 
574  /* Increment the MAC count */
575  count++;
576 
577  /* Update the inputA and inputB pointers for next MAC calculation */
578  px = pIn1 + count;
579  py = pSrc2;
580 
581  /* Decrement the loop counter */
582  blkCnt--;
583  }
584  }
585 
586 
587  /* --------------------------
588  * Initializations of stage3
589  * -------------------------*/
590 
591  /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
592  * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
593  * ....
594  * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
595  * sum += x[srcALen-1] * y[srcBLen-1]
596  */
597 
598  /* In this stage the MAC operations are decreased by 1 for every iteration.
599  The count variable holds the number of MAC operations performed */
600  count = srcBLen - 1u;
601 
602  /* Working pointer of inputA */
603  pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
604  px = pSrc1;
605 
606  /* Working pointer of inputB */
607  pSrc2 = pIn2 + (srcBLen - 1u);
608  pIn2 = pSrc2 - 1u;
609  py = pIn2;
610 
611  /* -------------------
612  * Stage3 process
613  * ------------------*/
614 
615  /* For loop unrolling by 4, this stage is divided into two. */
616  /* First part of this stage computes the MAC operations greater than 4 */
617  /* Second part of this stage computes the MAC operations less than or equal to 4 */
618 
619  /* The first part of the stage starts here */
620  j = count >> 2u;
621 
622  while((j > 0u) && (blockSize3 > 0))
623  {
624  /* Accumulator is made zero for every iteration */
625  sum = 0;
626 
627  /* Apply loop unrolling and compute 4 MACs simultaneously. */
628  k = count >> 2u;
629 
630  /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
631  ** a second loop below computes MACs for the remaining 1 to 3 samples. */
632  while(k > 0u)
633  {
634  /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied
635  * with y[srcBLen - 1], y[srcBLen - 2] respectively */
636  sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
637  /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied
638  * with y[srcBLen - 3], y[srcBLen - 4] respectively */
639  sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
640 
641  /* Decrement the loop counter */
642  k--;
643  }
644 
645  /* For the next MAC operations, the pointer py is used without SIMD
646  * So, py is incremented by 1 */
647  py = py + 1u;
648 
649  /* If the count is not a multiple of 4, compute any remaining MACs here.
650  ** No loop unrolling is used. */
651  k = count % 0x4u;
652 
653  while(k > 0u)
654  {
655  /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */
656  sum = __SMLAD(*px++, *py--, sum);
657 
658  /* Decrement the loop counter */
659  k--;
660  }
661 
662  /* Store the result in the accumulator in the destination buffer. */
663  *pOut++ = (q15_t) (sum >> 15);
664 
665  /* Update the inputA and inputB pointers for next MAC calculation */
666  px = ++pSrc1;
667  py = pIn2;
668 
669  /* Decrement the MAC count */
670  count--;
671 
672  /* Decrement the loop counter */
673  blockSize3--;
674 
675  j--;
676  }
677 
678  /* The second part of the stage starts here */
679  /* SIMD is not used for the next MAC operations,
680  * so pointer py is updated to read only one sample at a time */
681  py = py + 1u;
682 
683  while(blockSize3 > 0)
684  {
685  /* Accumulator is made zero for every iteration */
686  sum = 0;
687 
688 /* Loop over the remaining count MAC operations; no loop unrolling is used here. */
689  k = count;
690 
691  while(k > 0u)
692  {
693  /* Perform the multiply-accumulates */
694  /* sum += x[srcALen-1] * y[srcBLen-1] */
695  sum = __SMLAD(*px++, *py--, sum);
696 
697  /* Decrement the loop counter */
698  k--;
699  }
700 
701  /* Store the result in the accumulator in the destination buffer. */
702  *pOut++ = (q15_t) (sum >> 15);
703 
704  /* Update the inputA and inputB pointers for next MAC calculation */
705  px = ++pSrc1;
706  py = pSrc2;
707 
708  /* Decrement the MAC count */
709  count--;
710 
711  /* Decrement the loop counter */
712  blockSize3--;
713  }
714 
715  /* set status as ARM_MATH_SUCCESS */
716  status = ARM_MATH_SUCCESS;
717  }
718 
719  /* Return to application */
720  return (status);
721 
722 #else
723 
724  q15_t *pIn1; /* inputA pointer */
725  q15_t *pIn2; /* inputB pointer */
726  q15_t *pOut = pDst; /* output pointer */
727  q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
728  q15_t *px; /* Intermediate inputA pointer */
729  q15_t *py; /* Intermediate inputB pointer */
730  q15_t *pSrc1, *pSrc2; /* Intermediate pointers */
731  q31_t x0, x1, x2, x3, c0;
732  uint32_t j, k, count, check, blkCnt;
733  int32_t blockSize1, blockSize2, blockSize3; /* loop counters */
734  arm_status status; /* status of Partial convolution */
735  q15_t a, b;
736 
737  /* Check for range of output samples to be calculated */
738  if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
739  {
740  /* Set status as ARM_MATH_ARGUMENT_ERROR */
741  status = ARM_MATH_ARGUMENT_ERROR;
742  }
743  else
744  {
745 
746  /* The algorithm implementation is based on the lengths of the inputs. */
747  /* srcB is always made to slide across srcA. */
748  /* So srcBLen is always considered as shorter or equal to srcALen */
749  if(srcALen >=srcBLen)
750  {
751  /* Initialization of inputA pointer */
752  pIn1 = pSrcA;
753 
754  /* Initialization of inputB pointer */
755  pIn2 = pSrcB;
756  }
757  else
758  {
759  /* Initialization of inputA pointer */
760  pIn1 = pSrcB;
761 
762  /* Initialization of inputB pointer */
763  pIn2 = pSrcA;
764 
765  /* srcBLen is always considered as shorter or equal to srcALen */
766  j = srcBLen;
767  srcBLen = srcALen;
768  srcALen = j;
769  }
770 
771  /* Conditions to check which loopCounter holds
772  * the first and last indices of the output samples to be calculated. */
773  check = firstIndex + numPoints;
774  blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0;
775  blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3;
776  blockSize1 = ((int32_t) srcBLen - 1) - (int32_t) firstIndex;
777  blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 :
778  (int32_t) numPoints) : 0;
779  blockSize2 = ((int32_t) check - blockSize3) -
780  (blockSize1 + (int32_t) firstIndex);
781  blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
782 
783  /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
784  /* The function is internally
785  * divided into three stages according to the number of multiplications that has to be
786  * taken place between inputA samples and inputB samples. In the first stage of the
787  * algorithm, the multiplications increase by one for every iteration.
788  * In the second stage of the algorithm, srcBLen number of multiplications are done.
789  * In the third stage of the algorithm, the multiplications decrease by one
790  * for every iteration. */
791 
792  /* Set the output pointer to point to the firstIndex
793  * of the output sample to be calculated. */
794  pOut = pDst + firstIndex;
795 
796  /* --------------------------
797  * Initializations of stage1
798  * -------------------------*/
799 
800  /* sum = x[0] * y[0]
801  * sum = x[0] * y[1] + x[1] * y[0]
802  * ....
803  * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
804  */
805 
806  /* In this stage the MAC operations are increased by 1 for every iteration.
807  The count variable holds the number of MAC operations performed.
808  Since the partial convolution starts from firstIndex
809  Number of Macs to be performed is firstIndex + 1 */
810  count = 1u + firstIndex;
811 
812  /* Working pointer of inputA */
813  px = pIn1;
814 
815  /* Working pointer of inputB */
816  pSrc2 = pIn2 + firstIndex;
817  py = pSrc2;
818 
819  /* ------------------------
820  * Stage1 process
821  * ----------------------*/
822 
823  /* For loop unrolling by 4, this stage is divided into two. */
824  /* First part of this stage computes the MAC operations less than 4 */
825  /* Second part of this stage computes the MAC operations greater than or equal to 4 */
826 
827  /* The first part of the stage starts here */
828  while((count < 4u) && (blockSize1 > 0))
829  {
830  /* Accumulator is made zero for every iteration */
831  sum = 0;
832 
833  /* Loop over number of MAC operations between
834  * inputA samples and inputB samples */
835  k = count;
836 
837  while(k > 0u)
838  {
839  /* Perform the multiply-accumulates */
840  sum += ((q31_t) * px++ * *py--);
841 
842  /* Decrement the loop counter */
843  k--;
844  }
845 
846  /* Store the result in the accumulator in the destination buffer. */
847  *pOut++ = (q15_t) (sum >> 15);
848 
849  /* Update the inputA and inputB pointers for next MAC calculation */
850  py = ++pSrc2;
851  px = pIn1;
852 
853  /* Increment the MAC count */
854  count++;
855 
856  /* Decrement the loop counter */
857  blockSize1--;
858  }
859 
860  /* The second part of the stage starts here */
861  /* The internal loop, over count, is unrolled by 4 */
862  /* To, read the last two inputB samples using SIMD:
863  * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */
864  py = py - 1;
865 
866  while(blockSize1 > 0)
867  {
868  /* Accumulator is made zero for every iteration */
869  sum = 0;
870 
871  /* Apply loop unrolling and compute 4 MACs simultaneously. */
872  k = count >> 2u;
873 
874  /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
875  ** a second loop below computes MACs for the remaining 1 to 3 samples. */
876  py++;
877 
878  while(k > 0u)
879  {
880  /* Perform the multiply-accumulates */
881  sum += ((q31_t) * px++ * *py--);
882  sum += ((q31_t) * px++ * *py--);
883  sum += ((q31_t) * px++ * *py--);
884  sum += ((q31_t) * px++ * *py--);
885 
886  /* Decrement the loop counter */
887  k--;
888  }
889 
890  /* If the count is not a multiple of 4, compute any remaining MACs here.
891  ** No loop unrolling is used. */
892  k = count % 0x4u;
893 
894  while(k > 0u)
895  {
896  /* Perform the multiply-accumulates */
897  sum += ((q31_t) * px++ * *py--);
898 
899  /* Decrement the loop counter */
900  k--;
901  }
902 
903  /* Store the result in the accumulator in the destination buffer. */
904  *pOut++ = (q15_t) (sum >> 15);
905 
906  /* Update the inputA and inputB pointers for next MAC calculation */
907  py = ++pSrc2 - 1u;
908  px = pIn1;
909 
910  /* Increment the MAC count */
911  count++;
912 
913  /* Decrement the loop counter */
914  blockSize1--;
915  }
916 
917  /* --------------------------
918  * Initializations of stage2
919  * ------------------------*/
920 
921  /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
922  * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
923  * ....
924 * sum = x[srcALen-srcBLen] * y[srcBLen-1] + x[srcALen-srcBLen+1] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
925  */
926 
927  /* Working pointer of inputA */
928  if((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
929  {
930  px = pIn1 + firstIndex - srcBLen + 1;
931  }
932  else
933  {
934  px = pIn1;
935  }
936 
937  /* Working pointer of inputB */
938  pSrc2 = pIn2 + (srcBLen - 1u);
939  py = pSrc2;
940 
941  /* count is the index by which the pointer pIn1 to be incremented */
942  count = 0u;
943 
944 
945  /* --------------------
946  * Stage2 process
947  * -------------------*/
948 
949  /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
950  * So, to loop unroll over blockSize2,
951  * srcBLen should be greater than or equal to 4 */
952  if(srcBLen >= 4u)
953  {
954  /* Loop unroll over blockSize2, by 4 */
955  blkCnt = ((uint32_t) blockSize2 >> 2u);
956 
957  while(blkCnt > 0u)
958  {
959  py = py - 1u;
960 
961  /* Set all accumulators to zero */
962  acc0 = 0;
963  acc1 = 0;
964  acc2 = 0;
965  acc3 = 0;
966 
967  /* read x[0], x[1] samples */
968  a = *px++;
969  b = *px++;
970 
971 #ifndef ARM_MATH_BIG_ENDIAN
972 
973  x0 = __PKHBT(a, b, 16);
974  a = *px;
975  x1 = __PKHBT(b, a, 16);
976 
977 #else
978 
979  x0 = __PKHBT(b, a, 16);
980  a = *px;
981  x1 = __PKHBT(a, b, 16);
982 
983 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
984 
985  /* Apply loop unrolling and compute 4 MACs simultaneously. */
986  k = srcBLen >> 2u;
987 
988  /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
989  ** a second loop below computes MACs for the remaining 1 to 3 samples. */
990  do
991  {
992  /* Read the last two inputB samples using SIMD:
993  * y[srcBLen - 1] and y[srcBLen - 2] */
994  a = *py;
995  b = *(py+1);
996  py -= 2;
997 
998 #ifndef ARM_MATH_BIG_ENDIAN
999 
1000  c0 = __PKHBT(a, b, 16);
1001 
1002 #else
1003 
1004  c0 = __PKHBT(b, a, 16);;
1005 
1006 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1007 
1008  /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
1009  acc0 = __SMLADX(x0, c0, acc0);
1010 
1011  /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
1012  acc1 = __SMLADX(x1, c0, acc1);
1013 
1014  a = *px;
1015  b = *(px + 1);
1016 
1017 #ifndef ARM_MATH_BIG_ENDIAN
1018 
1019  x2 = __PKHBT(a, b, 16);
1020  a = *(px + 2);
1021  x3 = __PKHBT(b, a, 16);
1022 
1023 #else
1024 
1025  x2 = __PKHBT(b, a, 16);
1026  a = *(px + 2);
1027  x3 = __PKHBT(a, b, 16);
1028 
1029 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1030 
1031  /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
1032  acc2 = __SMLADX(x2, c0, acc2);
1033 
1034  /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
1035  acc3 = __SMLADX(x3, c0, acc3);
1036 
1037  /* Read y[srcBLen - 3] and y[srcBLen - 4] */
1038  a = *py;
1039  b = *(py+1);
1040  py -= 2;
1041 
1042 #ifndef ARM_MATH_BIG_ENDIAN
1043 
1044  c0 = __PKHBT(a, b, 16);
1045 
1046 #else
1047 
1048  c0 = __PKHBT(b, a, 16);;
1049 
1050 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1051 
1052  /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
1053  acc0 = __SMLADX(x2, c0, acc0);
1054 
1055  /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
1056  acc1 = __SMLADX(x3, c0, acc1);
1057 
1058  /* Read x[4], x[5], x[6] */
1059  a = *(px + 2);
1060  b = *(px + 3);
1061 
1062 #ifndef ARM_MATH_BIG_ENDIAN
1063 
1064  x0 = __PKHBT(a, b, 16);
1065  a = *(px + 4);
1066  x1 = __PKHBT(b, a, 16);
1067 
1068 #else
1069 
1070  x0 = __PKHBT(b, a, 16);
1071  a = *(px + 4);
1072  x1 = __PKHBT(a, b, 16);
1073 
1074 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1075 
1076  px += 4u;
1077 
1078  /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
1079  acc2 = __SMLADX(x0, c0, acc2);
1080 
1081  /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
1082  acc3 = __SMLADX(x1, c0, acc3);
1083 
1084  } while(--k);
1085 
1086  /* For the next MAC operations, SIMD is not used.
1087  * So, the 16-bit pointer of inputB, py, is updated */
1088 
1089  /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
1090  ** No loop unrolling is used. */
1091  k = srcBLen % 0x4u;
1092 
1093  if(k == 1u)
1094  {
1095  /* Read y[srcBLen - 5] */
1096  c0 = *(py+1);
1097 
1098 #ifdef ARM_MATH_BIG_ENDIAN
1099 
1100  c0 = c0 << 16u;
1101 
1102 #else
1103 
1104  c0 = c0 & 0x0000FFFF;
1105 
1106 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */
1107 
1108  /* Read x[7] */
1109  a = *px;
1110  b = *(px+1);
1111  px++;
1112 
1113 #ifndef ARM_MATH_BIG_ENDIAN
1114 
1115  x3 = __PKHBT(a, b, 16);
1116 
1117 #else
1118 
1119  x3 = __PKHBT(b, a, 16);;
1120 
1121 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1122 
1123 
1124  /* Perform the multiply-accumulates */
1125  acc0 = __SMLAD(x0, c0, acc0);
1126  acc1 = __SMLAD(x1, c0, acc1);
1127  acc2 = __SMLADX(x1, c0, acc2);
1128  acc3 = __SMLADX(x3, c0, acc3);
1129  }
1130 
1131  if(k == 2u)
1132  {
1133  /* Read y[srcBLen - 5], y[srcBLen - 6] */
1134  a = *py;
1135  b = *(py+1);
1136 
1137 #ifndef ARM_MATH_BIG_ENDIAN
1138 
1139  c0 = __PKHBT(a, b, 16);
1140 
1141 #else
1142 
1143  c0 = __PKHBT(b, a, 16);;
1144 
1145 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1146 
1147  /* Read x[7], x[8], x[9] */
1148  a = *px;
1149  b = *(px + 1);
1150 
1151 #ifndef ARM_MATH_BIG_ENDIAN
1152 
1153  x3 = __PKHBT(a, b, 16);
1154  a = *(px + 2);
1155  x2 = __PKHBT(b, a, 16);
1156 
1157 #else
1158 
1159  x3 = __PKHBT(b, a, 16);
1160  a = *(px + 2);
1161  x2 = __PKHBT(a, b, 16);
1162 
1163 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1164  px += 2u;
1165 
1166  /* Perform the multiply-accumulates */
1167  acc0 = __SMLADX(x0, c0, acc0);
1168  acc1 = __SMLADX(x1, c0, acc1);
1169  acc2 = __SMLADX(x3, c0, acc2);
1170  acc3 = __SMLADX(x2, c0, acc3);
1171  }
1172 
1173  if(k == 3u)
1174  {
1175  /* Read y[srcBLen - 5], y[srcBLen - 6] */
1176  a = *py;
1177  b = *(py+1);
1178 
1179 #ifndef ARM_MATH_BIG_ENDIAN
1180 
1181  c0 = __PKHBT(a, b, 16);
1182 
1183 #else
1184 
1185  c0 = __PKHBT(b, a, 16);;
1186 
1187 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1188 
1189  /* Read x[7], x[8], x[9] */
1190  a = *px;
1191  b = *(px + 1);
1192 
1193 #ifndef ARM_MATH_BIG_ENDIAN
1194 
1195  x3 = __PKHBT(a, b, 16);
1196  a = *(px + 2);
1197  x2 = __PKHBT(b, a, 16);
1198 
1199 #else
1200 
1201  x3 = __PKHBT(b, a, 16);
1202  a = *(px + 2);
1203  x2 = __PKHBT(a, b, 16);
1204 
1205 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1206 
1207  /* Perform the multiply-accumulates */
1208  acc0 = __SMLADX(x0, c0, acc0);
1209  acc1 = __SMLADX(x1, c0, acc1);
1210  acc2 = __SMLADX(x3, c0, acc2);
1211  acc3 = __SMLADX(x2, c0, acc3);
1212 
1213  /* Read y[srcBLen - 7] */
1214  c0 = *(py-1);
1215 #ifdef ARM_MATH_BIG_ENDIAN
1216 
1217  c0 = c0 << 16u;
1218 #else
1219 
1220  c0 = c0 & 0x0000FFFF;
1221 #endif /* #ifdef ARM_MATH_BIG_ENDIAN */
1222 
1223  /* Read x[10] */
1224  a = *(px+2);
1225  b = *(px+3);
1226 
1227 #ifndef ARM_MATH_BIG_ENDIAN
1228 
1229  x3 = __PKHBT(a, b, 16);
1230 
1231 #else
1232 
1233  x3 = __PKHBT(b, a, 16);;
1234 
1235 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1236 
1237  px += 3u;
1238 
1239  /* Perform the multiply-accumulates */
1240  acc0 = __SMLADX(x1, c0, acc0);
1241  acc1 = __SMLAD(x2, c0, acc1);
1242  acc2 = __SMLADX(x2, c0, acc2);
1243  acc3 = __SMLADX(x3, c0, acc3);
1244  }
1245 
1246  /* Store the results in the accumulators in the destination buffer. */
1247  *pOut++ = (q15_t)(acc0 >> 15);
1248  *pOut++ = (q15_t)(acc1 >> 15);
1249  *pOut++ = (q15_t)(acc2 >> 15);
1250  *pOut++ = (q15_t)(acc3 >> 15);
1251 
1252  /* Increment the pointer pIn1 index, count by 4 */
1253  count += 4u;
1254 
1255  /* Update the inputA and inputB pointers for next MAC calculation */
1256  px = pIn1 + count;
1257  py = pSrc2;
1258 
1259  /* Decrement the loop counter */
1260  blkCnt--;
1261  }
1262 
1263  /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
1264  ** No loop unrolling is used. */
1265  blkCnt = (uint32_t) blockSize2 % 0x4u;
1266 
1267  while(blkCnt > 0u)
1268  {
1269  /* Accumulator is made zero for every iteration */
1270  sum = 0;
1271 
1272  /* Apply loop unrolling and compute 4 MACs simultaneously. */
1273  k = srcBLen >> 2u;
1274 
1275  /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
1276  ** a second loop below computes MACs for the remaining 1 to 3 samples. */
1277  while(k > 0u)
1278  {
1279  /* Perform the multiply-accumulates */
1280  sum += ((q31_t) * px++ * *py--);
1281  sum += ((q31_t) * px++ * *py--);
1282  sum += ((q31_t) * px++ * *py--);
1283  sum += ((q31_t) * px++ * *py--);
1284 
1285  /* Decrement the loop counter */
1286  k--;
1287  }
1288 
1289  /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
1290  ** No loop unrolling is used. */
1291  k = srcBLen % 0x4u;
1292 
1293  while(k > 0u)
1294  {
1295  /* Perform the multiply-accumulates */
1296  sum += ((q31_t) * px++ * *py--);
1297 
1298  /* Decrement the loop counter */
1299  k--;
1300  }
1301 
1302  /* Store the result in the accumulator in the destination buffer. */
1303  *pOut++ = (q15_t) (sum >> 15);
1304 
1305  /* Increment the pointer pIn1 index, count by 1 */
1306  count++;
1307 
1308  /* Update the inputA and inputB pointers for next MAC calculation */
1309  px = pIn1 + count;
1310  py = pSrc2;
1311 
1312  /* Decrement the loop counter */
1313  blkCnt--;
1314  }
1315  }
1316  else
1317  {
1318  /* If the srcBLen is not a multiple of 4,
1319  * the blockSize2 loop cannot be unrolled by 4 */
1320  blkCnt = (uint32_t) blockSize2;
1321 
1322  while(blkCnt > 0u)
1323  {
1324  /* Accumulator is made zero for every iteration */
1325  sum = 0;
1326 
1327  /* srcBLen number of MACS should be performed */
1328  k = srcBLen;
1329 
1330  while(k > 0u)
1331  {
1332  /* Perform the multiply-accumulate */
1333  sum += ((q31_t) * px++ * *py--);
1334 
1335  /* Decrement the loop counter */
1336  k--;
1337  }
1338 
1339  /* Store the result in the accumulator in the destination buffer. */
1340  *pOut++ = (q15_t) (sum >> 15);
1341 
1342  /* Increment the MAC count */
1343  count++;
1344 
1345  /* Update the inputA and inputB pointers for next MAC calculation */
1346  px = pIn1 + count;
1347  py = pSrc2;
1348 
1349  /* Decrement the loop counter */
1350  blkCnt--;
1351  }
1352  }
1353 
1354 
1355  /* --------------------------
1356  * Initializations of stage3
1357  * -------------------------*/
1358 
1359  /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
1360  * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
1361  * ....
1362  * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
1363  * sum += x[srcALen-1] * y[srcBLen-1]
1364  */
1365 
1366  /* In this stage the MAC operations are decreased by 1 for every iteration.
1367  The count variable holds the number of MAC operations performed */
1368  count = srcBLen - 1u;
1369 
1370  /* Working pointer of inputA */
1371  pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
1372  px = pSrc1;
1373 
1374  /* Working pointer of inputB */
1375  pSrc2 = pIn2 + (srcBLen - 1u);
1376  pIn2 = pSrc2 - 1u;
1377  py = pIn2;
1378 
1379  /* -------------------
1380  * Stage3 process
1381  * ------------------*/
1382 
1383  /* For loop unrolling by 4, this stage is divided into two. */
1384  /* First part of this stage computes the MAC operations greater than 4 */
1385  /* Second part of this stage computes the MAC operations less than or equal to 4 */
1386 
1387  /* The first part of the stage starts here */
1388  j = count >> 2u;
1389 
1390  while((j > 0u) && (blockSize3 > 0))
1391  {
1392  /* Accumulator is made zero for every iteration */
1393  sum = 0;
1394 
1395  /* Apply loop unrolling and compute 4 MACs simultaneously. */
1396  k = count >> 2u;
1397 
1398  /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
1399  ** a second loop below computes MACs for the remaining 1 to 3 samples. */
1400  py++;
1401 
1402  while(k > 0u)
1403  {
1404  /* Perform the multiply-accumulates */
1405  sum += ((q31_t) * px++ * *py--);
1406  sum += ((q31_t) * px++ * *py--);
1407  sum += ((q31_t) * px++ * *py--);
1408  sum += ((q31_t) * px++ * *py--);
1409  /* Decrement the loop counter */
1410  k--;
1411  }
1412 
1413 
1414  /* If the count is not a multiple of 4, compute any remaining MACs here.
1415  ** No loop unrolling is used. */
1416  k = count % 0x4u;
1417 
1418  while(k > 0u)
1419  {
1420  /* Perform the multiply-accumulates */
1421  sum += ((q31_t) * px++ * *py--);
1422 
1423  /* Decrement the loop counter */
1424  k--;
1425  }
1426 
1427  /* Store the result in the accumulator in the destination buffer. */
1428  *pOut++ = (q15_t) (sum >> 15);
1429 
1430  /* Update the inputA and inputB pointers for next MAC calculation */
1431  px = ++pSrc1;
1432  py = pIn2;
1433 
1434  /* Decrement the MAC count */
1435  count--;
1436 
1437  /* Decrement the loop counter */
1438  blockSize3--;
1439 
1440  j--;
1441  }
1442 
1443  /* The second part of the stage starts here */
1444  /* SIMD is not used for the next MAC operations,
1445  * so pointer py is updated to read only one sample at a time */
1446  py = py + 1u;
1447 
1448  while(blockSize3 > 0)
1449  {
1450  /* Accumulator is made zero for every iteration */
1451  sum = 0;
1452 
1453  /* count number of MACs are performed one at a time; no loop unrolling is used here. */
1454  k = count;
1455 
1456  while(k > 0u)
1457  {
1458  /* Perform the multiply-accumulates */
1459  /* sum += x[srcALen-1] * y[srcBLen-1] */
1460  sum += ((q31_t) * px++ * *py--);
1461 
1462  /* Decrement the loop counter */
1463  k--;
1464  }
1465 
1466  /* Store the result in the accumulator in the destination buffer. */
1467  *pOut++ = (q15_t) (sum >> 15);
1468 
1469  /* Update the inputA and inputB pointers for next MAC calculation */
1470  px = ++pSrc1;
1471  py = pSrc2;
1472 
1473  /* Decrement the MAC count */
1474  count--;
1475 
1476  /* Decrement the loop counter */
1477  blockSize3--;
1478  }
1479 
1480  /* set status as ARM_MATH_SUCCESS */
1481  status = ARM_MATH_SUCCESS;
1482  }
1483 
1484  /* Return to application */
1485  return (status);
1486 
1487 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
1488 }
1489 
arm_status arm_conv_partial_fast_q15(q15_t *pSrcA, uint32_t srcALen, q15_t *pSrcB, uint32_t srcBLen, q15_t *pDst, uint32_t firstIndex, uint32_t numPoints)
Partial convolution of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4.
int16_t q15_t
16-bit fractional data type in 1.15 format.
Definition: arm_math.h:392
#define __SIMD32(addr)
definition to read/write two 16 bit values.
Definition: arm_math.h:445
#define _SIMD32_OFFSET(addr)
Definition: arm_math.h:447
int32_t q31_t
32-bit fractional data type in 1.31 format.
Definition: arm_math.h:397
arm_status
Error status returned by some functions in the library.
Definition: arm_math.h:373