STM32F769IDiscovery  1.00
uDANTE Audio Networking with STM32F7 DISCO board
arm_conv_partial_fast_q31.c
Go to the documentation of this file.
1 /* ----------------------------------------------------------------------
2 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.
3 *
4 * $Date: 19. March 2015
5 * $Revision: V.1.4.5
6 *
7 * Project: CMSIS DSP Library
8 * Title: arm_conv_partial_fast_q31.c
9 *
10 * Description: Fast Q31 Partial convolution.
11 *
12 * Target Processor: Cortex-M4/Cortex-M3
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 * - Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * - Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in
21 * the documentation and/or other materials provided with the
22 * distribution.
23 * - Neither the name of ARM LIMITED nor the names of its contributors
24 * may be used to endorse or promote products derived from this
25 * software without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 * POSSIBILITY OF SUCH DAMAGE.
39 * -------------------------------------------------------------------- */
40 
41 #include "arm_math.h"
42 
68  q31_t * pSrcA,
69  uint32_t srcALen,
70  q31_t * pSrcB,
71  uint32_t srcBLen,
72  q31_t * pDst,
73  uint32_t firstIndex,
74  uint32_t numPoints)
75 {
76  q31_t *pIn1; /* inputA pointer */
77  q31_t *pIn2; /* inputB pointer */
78  q31_t *pOut = pDst; /* output pointer */
79  q31_t *px; /* Intermediate inputA pointer */
80  q31_t *py; /* Intermediate inputB pointer */
81  q31_t *pSrc1, *pSrc2; /* Intermediate pointers */
82  q31_t sum, acc0, acc1, acc2, acc3; /* Accumulators */
83  q31_t x0, x1, x2, x3, c0;
84  uint32_t j, k, count, check, blkCnt;
85  int32_t blockSize1, blockSize2, blockSize3; /* loop counters */
86  arm_status status; /* status of Partial convolution */
87 
88 
89  /* Check for range of output samples to be calculated */
90  if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
91  {
92  /* Set status as ARM_MATH_ARGUMENT_ERROR */
93  status = ARM_MATH_ARGUMENT_ERROR;
94  }
95  else
96  {
97 
98  /* The algorithm implementation is based on the lengths of the inputs. */
99  /* srcB is always made to slide across srcA. */
100  /* So srcBLen is always considered as shorter or equal to srcALen */
101  if(srcALen >= srcBLen)
102  {
103  /* Initialization of inputA pointer */
104  pIn1 = pSrcA;
105 
106  /* Initialization of inputB pointer */
107  pIn2 = pSrcB;
108  }
109  else
110  {
111  /* Initialization of inputA pointer */
112  pIn1 = pSrcB;
113 
114  /* Initialization of inputB pointer */
115  pIn2 = pSrcA;
116 
117  /* srcBLen is always considered as shorter or equal to srcALen */
118  j = srcBLen;
119  srcBLen = srcALen;
120  srcALen = j;
121  }
122 
123  /* Conditions to check which loopCounter holds
124  * the first and last indices of the output samples to be calculated. */
125  check = firstIndex + numPoints;
126  blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0;
127  blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3;
128  blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex);
129  blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 :
130  (int32_t) numPoints) : 0;
131  blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) +
132  (int32_t) firstIndex);
133  blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
134 
135  /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
136  /* The function is internally
137  * divided into three stages according to the number of multiplications that has to be
138  * taken place between inputA samples and inputB samples. In the first stage of the
139  * algorithm, the multiplications increase by one for every iteration.
140  * In the second stage of the algorithm, srcBLen number of multiplications are done.
141  * In the third stage of the algorithm, the multiplications decrease by one
142  * for every iteration. */
143 
144  /* Set the output pointer to point to the firstIndex
145  * of the output sample to be calculated. */
146  pOut = pDst + firstIndex;
147 
148  /* --------------------------
149  * Initializations of stage1
150  * -------------------------*/
151 
152  /* sum = x[0] * y[0]
153  * sum = x[0] * y[1] + x[1] * y[0]
154  * ....
155  * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
156  */
157 
158  /* In this stage the MAC operations are increased by 1 for every iteration.
159  The count variable holds the number of MAC operations performed.
160  Since the partial convolution starts from firstIndex
161  Number of Macs to be performed is firstIndex + 1 */
162  count = 1u + firstIndex;
163 
164  /* Working pointer of inputA */
165  px = pIn1;
166 
167  /* Working pointer of inputB */
168  pSrc2 = pIn2 + firstIndex;
169  py = pSrc2;
170 
171  /* ------------------------
172  * Stage1 process
173  * ----------------------*/
174 
175  /* The first loop starts here */
176  while(blockSize1 > 0)
177  {
178  /* Accumulator is made zero for every iteration */
179  sum = 0;
180 
181  /* Apply loop unrolling and compute 4 MACs simultaneously. */
182  k = count >> 2u;
183 
184  /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
185  ** a second loop below computes MACs for the remaining 1 to 3 samples. */
186  while(k > 0u)
187  {
188  /* x[0] * y[srcBLen - 1] */
189  sum = (q31_t) ((((q63_t) sum << 32) +
190  ((q63_t) * px++ * (*py--))) >> 32);
191 
192  /* x[1] * y[srcBLen - 2] */
193  sum = (q31_t) ((((q63_t) sum << 32) +
194  ((q63_t) * px++ * (*py--))) >> 32);
195 
196  /* x[2] * y[srcBLen - 3] */
197  sum = (q31_t) ((((q63_t) sum << 32) +
198  ((q63_t) * px++ * (*py--))) >> 32);
199 
200  /* x[3] * y[srcBLen - 4] */
201  sum = (q31_t) ((((q63_t) sum << 32) +
202  ((q63_t) * px++ * (*py--))) >> 32);
203 
204  /* Decrement the loop counter */
205  k--;
206  }
207 
208  /* If the count is not a multiple of 4, compute any remaining MACs here.
209  ** No loop unrolling is used. */
210  k = count % 0x4u;
211 
212  while(k > 0u)
213  {
214  /* Perform the multiply-accumulates */
215  sum = (q31_t) ((((q63_t) sum << 32) +
216  ((q63_t) * px++ * (*py--))) >> 32);
217 
218  /* Decrement the loop counter */
219  k--;
220  }
221 
222  /* Store the result in the accumulator in the destination buffer. */
223  *pOut++ = sum << 1;
224 
225  /* Update the inputA and inputB pointers for next MAC calculation */
226  py = ++pSrc2;
227  px = pIn1;
228 
229  /* Increment the MAC count */
230  count++;
231 
232  /* Decrement the loop counter */
233  blockSize1--;
234  }
235 
236  /* --------------------------
237  * Initializations of stage2
238  * ------------------------*/
239 
240  /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
241  * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
242  * ....
243  * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
244  */
245 
246  /* Working pointer of inputA */
247  if((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
248  {
249  px = pIn1 + firstIndex - srcBLen + 1;
250  }
251  else
252  {
253  px = pIn1;
254  }
255 
256  /* Working pointer of inputB */
257  pSrc2 = pIn2 + (srcBLen - 1u);
258  py = pSrc2;
259 
260  /* count is index by which the pointer pIn1 to be incremented */
261  count = 0u;
262 
263  /* -------------------
264  * Stage2 process
265  * ------------------*/
266 
267  /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
268  * So, to loop unroll over blockSize2,
269  * srcBLen should be greater than or equal to 4 */
270  if(srcBLen >= 4u)
271  {
272  /* Loop unroll over blockSize2 */
273  blkCnt = ((uint32_t) blockSize2 >> 2u);
274 
275  while(blkCnt > 0u)
276  {
277  /* Set all accumulators to zero */
278  acc0 = 0;
279  acc1 = 0;
280  acc2 = 0;
281  acc3 = 0;
282 
283  /* read x[0], x[1], x[2] samples */
284  x0 = *(px++);
285  x1 = *(px++);
286  x2 = *(px++);
287 
288  /* Apply loop unrolling and compute 4 MACs simultaneously. */
289  k = srcBLen >> 2u;
290 
291  /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
292  ** a second loop below computes MACs for the remaining 1 to 3 samples. */
293  do
294  {
295  /* Read y[srcBLen - 1] sample */
296  c0 = *(py--);
297 
298  /* Read x[3] sample */
299  x3 = *(px++);
300 
301  /* Perform the multiply-accumulate */
302  /* acc0 += x[0] * y[srcBLen - 1] */
303  acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
304 
305  /* acc1 += x[1] * y[srcBLen - 1] */
306  acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
307 
308  /* acc2 += x[2] * y[srcBLen - 1] */
309  acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
310 
311  /* acc3 += x[3] * y[srcBLen - 1] */
312  acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
313 
314  /* Read y[srcBLen - 2] sample */
315  c0 = *(py--);
316 
317  /* Read x[4] sample */
318  x0 = *(px++);
319 
320  /* Perform the multiply-accumulate */
321  /* acc0 += x[1] * y[srcBLen - 2] */
322  acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x1 * c0)) >> 32);
323  /* acc1 += x[2] * y[srcBLen - 2] */
324  acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x2 * c0)) >> 32);
325  /* acc2 += x[3] * y[srcBLen - 2] */
326  acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x3 * c0)) >> 32);
327  /* acc3 += x[4] * y[srcBLen - 2] */
328  acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x0 * c0)) >> 32);
329 
330  /* Read y[srcBLen - 3] sample */
331  c0 = *(py--);
332 
333  /* Read x[5] sample */
334  x1 = *(px++);
335 
336  /* Perform the multiply-accumulates */
337  /* acc0 += x[2] * y[srcBLen - 3] */
338  acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x2 * c0)) >> 32);
339  /* acc1 += x[3] * y[srcBLen - 2] */
340  acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x3 * c0)) >> 32);
341  /* acc2 += x[4] * y[srcBLen - 2] */
342  acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x0 * c0)) >> 32);
343  /* acc3 += x[5] * y[srcBLen - 2] */
344  acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x1 * c0)) >> 32);
345 
346  /* Read y[srcBLen - 4] sample */
347  c0 = *(py--);
348 
349  /* Read x[6] sample */
350  x2 = *(px++);
351 
352  /* Perform the multiply-accumulates */
353  /* acc0 += x[3] * y[srcBLen - 4] */
354  acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x3 * c0)) >> 32);
355  /* acc1 += x[4] * y[srcBLen - 4] */
356  acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x0 * c0)) >> 32);
357  /* acc2 += x[5] * y[srcBLen - 4] */
358  acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x1 * c0)) >> 32);
359  /* acc3 += x[6] * y[srcBLen - 4] */
360  acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x2 * c0)) >> 32);
361 
362 
363  } while(--k);
364 
365  /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
366  ** No loop unrolling is used. */
367  k = srcBLen % 0x4u;
368 
369  while(k > 0u)
370  {
371  /* Read y[srcBLen - 5] sample */
372  c0 = *(py--);
373 
374  /* Read x[7] sample */
375  x3 = *(px++);
376 
377  /* Perform the multiply-accumulates */
378  /* acc0 += x[4] * y[srcBLen - 5] */
379  acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
380  /* acc1 += x[5] * y[srcBLen - 5] */
381  acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
382  /* acc2 += x[6] * y[srcBLen - 5] */
383  acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
384  /* acc3 += x[7] * y[srcBLen - 5] */
385  acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
386 
387  /* Reuse the present samples for the next MAC */
388  x0 = x1;
389  x1 = x2;
390  x2 = x3;
391 
392  /* Decrement the loop counter */
393  k--;
394  }
395 
396  /* Store the result in the accumulator in the destination buffer. */
397  *pOut++ = (q31_t) (acc0 << 1);
398  *pOut++ = (q31_t) (acc1 << 1);
399  *pOut++ = (q31_t) (acc2 << 1);
400  *pOut++ = (q31_t) (acc3 << 1);
401 
402  /* Increment the pointer pIn1 index, count by 4 */
403  count += 4u;
404 
405  /* Update the inputA and inputB pointers for next MAC calculation */
406  px = pIn1 + count;
407  py = pSrc2;
408 
409  /* Decrement the loop counter */
410  blkCnt--;
411  }
412 
413  /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
414  ** No loop unrolling is used. */
415  blkCnt = (uint32_t) blockSize2 % 0x4u;
416 
417  while(blkCnt > 0u)
418  {
419  /* Accumulator is made zero for every iteration */
420  sum = 0;
421 
422  /* Apply loop unrolling and compute 4 MACs simultaneously. */
423  k = srcBLen >> 2u;
424 
425  /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
426  ** a second loop below computes MACs for the remaining 1 to 3 samples. */
427  while(k > 0u)
428  {
429  /* Perform the multiply-accumulates */
430  sum = (q31_t) ((((q63_t) sum << 32) +
431  ((q63_t) * px++ * (*py--))) >> 32);
432  sum = (q31_t) ((((q63_t) sum << 32) +
433  ((q63_t) * px++ * (*py--))) >> 32);
434  sum = (q31_t) ((((q63_t) sum << 32) +
435  ((q63_t) * px++ * (*py--))) >> 32);
436  sum = (q31_t) ((((q63_t) sum << 32) +
437  ((q63_t) * px++ * (*py--))) >> 32);
438 
439  /* Decrement the loop counter */
440  k--;
441  }
442 
443  /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
444  ** No loop unrolling is used. */
445  k = srcBLen % 0x4u;
446 
447  while(k > 0u)
448  {
449  /* Perform the multiply-accumulate */
450  sum = (q31_t) ((((q63_t) sum << 32) +
451  ((q63_t) * px++ * (*py--))) >> 32);
452 
453  /* Decrement the loop counter */
454  k--;
455  }
456 
457  /* Store the result in the accumulator in the destination buffer. */
458  *pOut++ = sum << 1;
459 
460  /* Increment the MAC count */
461  count++;
462 
463  /* Update the inputA and inputB pointers for next MAC calculation */
464  px = pIn1 + count;
465  py = pSrc2;
466 
467  /* Decrement the loop counter */
468  blkCnt--;
469  }
470  }
471  else
472  {
473  /* If the srcBLen is not a multiple of 4,
474  * the blockSize2 loop cannot be unrolled by 4 */
475  blkCnt = (uint32_t) blockSize2;
476 
477  while(blkCnt > 0u)
478  {
479  /* Accumulator is made zero for every iteration */
480  sum = 0;
481 
482  /* srcBLen number of MACS should be performed */
483  k = srcBLen;
484 
485  while(k > 0u)
486  {
487  /* Perform the multiply-accumulate */
488  sum = (q31_t) ((((q63_t) sum << 32) +
489  ((q63_t) * px++ * (*py--))) >> 32);
490 
491  /* Decrement the loop counter */
492  k--;
493  }
494 
495  /* Store the result in the accumulator in the destination buffer. */
496  *pOut++ = sum << 1;
497 
498  /* Increment the MAC count */
499  count++;
500 
501  /* Update the inputA and inputB pointers for next MAC calculation */
502  px = pIn1 + count;
503  py = pSrc2;
504 
505  /* Decrement the loop counter */
506  blkCnt--;
507  }
508  }
509 
510 
511  /* --------------------------
512  * Initializations of stage3
513  * -------------------------*/
514 
515  /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
516  * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
517  * ....
518  * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
519  * sum += x[srcALen-1] * y[srcBLen-1]
520  */
521 
522  /* In this stage the MAC operations are decreased by 1 for every iteration.
523  The count variable holds the number of MAC operations performed */
524  count = srcBLen - 1u;
525 
526  /* Working pointer of inputA */
527  pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
528  px = pSrc1;
529 
530  /* Working pointer of inputB */
531  pSrc2 = pIn2 + (srcBLen - 1u);
532  py = pSrc2;
533 
534  /* -------------------
535  * Stage3 process
536  * ------------------*/
537 
538  while(blockSize3 > 0)
539  {
540  /* Accumulator is made zero for every iteration */
541  sum = 0;
542 
543  /* Apply loop unrolling and compute 4 MACs simultaneously. */
544  k = count >> 2u;
545 
546  /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
547  ** a second loop below computes MACs for the remaining 1 to 3 samples. */
548  while(k > 0u)
549  {
550  /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
551  sum = (q31_t) ((((q63_t) sum << 32) +
552  ((q63_t) * px++ * (*py--))) >> 32);
553 
554  /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
555  sum = (q31_t) ((((q63_t) sum << 32) +
556  ((q63_t) * px++ * (*py--))) >> 32);
557 
558  /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
559  sum = (q31_t) ((((q63_t) sum << 32) +
560  ((q63_t) * px++ * (*py--))) >> 32);
561 
562  /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
563  sum = (q31_t) ((((q63_t) sum << 32) +
564  ((q63_t) * px++ * (*py--))) >> 32);
565 
566  /* Decrement the loop counter */
567  k--;
568  }
569 
570  /* If the count is not a multiple of 4, compute any remaining MACs here.
571  ** No loop unrolling is used. */
572  k = count % 0x4u;
573 
574  while(k > 0u)
575  {
576  /* Perform the multiply-accumulates */
577  /* sum += x[srcALen-1] * y[srcBLen-1] */
578  sum = (q31_t) ((((q63_t) sum << 32) +
579  ((q63_t) * px++ * (*py--))) >> 32);
580 
581  /* Decrement the loop counter */
582  k--;
583  }
584 
585  /* Store the result in the accumulator in the destination buffer. */
586  *pOut++ = sum << 1;
587 
588  /* Update the inputA and inputB pointers for next MAC calculation */
589  px = ++pSrc1;
590  py = pSrc2;
591 
592  /* Decrement the MAC count */
593  count--;
594 
595  /* Decrement the loop counter */
596  blockSize3--;
597 
598  }
599 
600  /* set status as ARM_MATH_SUCCESS */
601  status = ARM_MATH_SUCCESS;
602  }
603 
604  /* Return to application */
605  return (status);
606 
607 }
608 
int64_t q63_t
64-bit fractional data type in 1.63 format.
Definition: arm_math.h:402
arm_status arm_conv_partial_fast_q31(q31_t *pSrcA, uint32_t srcALen, q31_t *pSrcB, uint32_t srcBLen, q31_t *pDst, uint32_t firstIndex, uint32_t numPoints)
Partial convolution of Q31 sequences (fast version) for Cortex-M3 and Cortex-M4.
int32_t q31_t
32-bit fractional data type in 1.31 format.
Definition: arm_math.h:397
arm_status
Error status returned by some functions in the library.
Definition: arm_math.h:373