STM32F769IDiscovery  1.00
uDANTE Audio Networking with STM32F7 DISCO board
arm_conv_fast_q31.c
Go to the documentation of this file.
1 /* ----------------------------------------------------------------------
2 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.
3 *
4 * $Date: 19. March 2015
5 * $Revision: V.1.4.5
6 *
7 * Project: CMSIS DSP Library
8 * Title: arm_conv_fast_q31.c
9 *
10 * Description: Q31 Convolution (fast version).
11 *
12 * Target Processor: Cortex-M4/Cortex-M3
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 * - Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * - Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in
21 * the documentation and/or other materials provided with the
22 * distribution.
23 * - Neither the name of ARM LIMITED nor the names of its contributors
24 * may be used to endorse or promote products derived from this
25 * software without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 * POSSIBILITY OF SUCH DAMAGE.
39 * -------------------------------------------------------------------- */
40 
41 #include "arm_math.h"
42 
80  q31_t * pSrcA,
81  uint32_t srcALen,
82  q31_t * pSrcB,
83  uint32_t srcBLen,
84  q31_t * pDst)
85 {
86  q31_t *pIn1; /* inputA pointer */
87  q31_t *pIn2; /* inputB pointer */
88  q31_t *pOut = pDst; /* output pointer */
89  q31_t *px; /* Intermediate inputA pointer */
90  q31_t *py; /* Intermediate inputB pointer */
91  q31_t *pSrc1, *pSrc2; /* Intermediate pointers */
92  q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
93  q31_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */
94  uint32_t j, k, count, blkCnt, blockSize1, blockSize2, blockSize3; /* loop counter */
95 
96  /* The algorithm implementation is based on the lengths of the inputs. */
97  /* srcB is always made to slide across srcA. */
98  /* So srcBLen is always considered as shorter or equal to srcALen */
99  if(srcALen >= srcBLen)
100  {
101  /* Initialization of inputA pointer */
102  pIn1 = pSrcA;
103 
104  /* Initialization of inputB pointer */
105  pIn2 = pSrcB;
106  }
107  else
108  {
109  /* Initialization of inputA pointer */
110  pIn1 = pSrcB;
111 
112  /* Initialization of inputB pointer */
113  pIn2 = pSrcA;
114 
115  /* srcBLen is always considered as shorter or equal to srcALen */
116  j = srcBLen;
117  srcBLen = srcALen;
118  srcALen = j;
119  }
120 
121  /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
122  /* The function is internally
123  * divided into three stages according to the number of multiplications that has to be
124  * taken place between inputA samples and inputB samples. In the first stage of the
125  * algorithm, the multiplications increase by one for every iteration.
126  * In the second stage of the algorithm, srcBLen number of multiplications are done.
127  * In the third stage of the algorithm, the multiplications decrease by one
128  * for every iteration. */
129 
130  /* The algorithm is implemented in three stages.
131  The loop counters of each stage is initiated here. */
132  blockSize1 = srcBLen - 1u;
133  blockSize2 = srcALen - (srcBLen - 1u);
134  blockSize3 = blockSize1;
135 
136  /* --------------------------
137  * Initializations of stage1
138  * -------------------------*/
139 
140  /* sum = x[0] * y[0]
141  * sum = x[0] * y[1] + x[1] * y[0]
142  * ....
143  * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
144  */
145 
146  /* In this stage the MAC operations are increased by 1 for every iteration.
147  The count variable holds the number of MAC operations performed */
148  count = 1u;
149 
150  /* Working pointer of inputA */
151  px = pIn1;
152 
153  /* Working pointer of inputB */
154  py = pIn2;
155 
156 
157  /* ------------------------
158  * Stage1 process
159  * ----------------------*/
160 
161  /* The first stage starts here */
162  while(blockSize1 > 0u)
163  {
164  /* Accumulator is made zero for every iteration */
165  sum = 0;
166 
167  /* Apply loop unrolling and compute 4 MACs simultaneously. */
168  k = count >> 2u;
169 
170  /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
171  ** a second loop below computes MACs for the remaining 1 to 3 samples. */
172  while(k > 0u)
173  {
174  /* x[0] * y[srcBLen - 1] */
175  sum = (q31_t) ((((q63_t) sum << 32) +
176  ((q63_t) * px++ * (*py--))) >> 32);
177 
178  /* x[1] * y[srcBLen - 2] */
179  sum = (q31_t) ((((q63_t) sum << 32) +
180  ((q63_t) * px++ * (*py--))) >> 32);
181 
182  /* x[2] * y[srcBLen - 3] */
183  sum = (q31_t) ((((q63_t) sum << 32) +
184  ((q63_t) * px++ * (*py--))) >> 32);
185 
186  /* x[3] * y[srcBLen - 4] */
187  sum = (q31_t) ((((q63_t) sum << 32) +
188  ((q63_t) * px++ * (*py--))) >> 32);
189 
190  /* Decrement the loop counter */
191  k--;
192  }
193 
194  /* If the count is not a multiple of 4, compute any remaining MACs here.
195  ** No loop unrolling is used. */
196  k = count % 0x4u;
197 
198  while(k > 0u)
199  {
200  /* Perform the multiply-accumulate */
201  sum = (q31_t) ((((q63_t) sum << 32) +
202  ((q63_t) * px++ * (*py--))) >> 32);
203 
204  /* Decrement the loop counter */
205  k--;
206  }
207 
208  /* Store the result in the accumulator in the destination buffer. */
209  *pOut++ = sum << 1;
210 
211  /* Update the inputA and inputB pointers for next MAC calculation */
212  py = pIn2 + count;
213  px = pIn1;
214 
215  /* Increment the MAC count */
216  count++;
217 
218  /* Decrement the loop counter */
219  blockSize1--;
220  }
221 
222  /* --------------------------
223  * Initializations of stage2
224  * ------------------------*/
225 
226  /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
227  * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
228  * ....
229  * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
230  */
231 
232  /* Working pointer of inputA */
233  px = pIn1;
234 
235  /* Working pointer of inputB */
236  pSrc2 = pIn2 + (srcBLen - 1u);
237  py = pSrc2;
238 
239  /* count is index by which the pointer pIn1 to be incremented */
240  count = 0u;
241 
242  /* -------------------
243  * Stage2 process
244  * ------------------*/
245 
246  /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
247  * So, to loop unroll over blockSize2,
248  * srcBLen should be greater than or equal to 4 */
249  if(srcBLen >= 4u)
250  {
251  /* Loop unroll over blockSize2, by 4 */
252  blkCnt = blockSize2 >> 2u;
253 
254  while(blkCnt > 0u)
255  {
256  /* Set all accumulators to zero */
257  acc0 = 0;
258  acc1 = 0;
259  acc2 = 0;
260  acc3 = 0;
261 
262  /* read x[0], x[1], x[2] samples */
263  x0 = *(px++);
264  x1 = *(px++);
265  x2 = *(px++);
266 
267  /* Apply loop unrolling and compute 4 MACs simultaneously. */
268  k = srcBLen >> 2u;
269 
270  /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
271  ** a second loop below computes MACs for the remaining 1 to 3 samples. */
272  do
273  {
274  /* Read y[srcBLen - 1] sample */
275  c0 = *(py--);
276 
277  /* Read x[3] sample */
278  x3 = *(px++);
279 
280  /* Perform the multiply-accumulates */
281  /* acc0 += x[0] * y[srcBLen - 1] */
282  acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
283 
284  /* acc1 += x[1] * y[srcBLen - 1] */
285  acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
286 
287  /* acc2 += x[2] * y[srcBLen - 1] */
288  acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
289 
290  /* acc3 += x[3] * y[srcBLen - 1] */
291  acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
292 
293  /* Read y[srcBLen - 2] sample */
294  c0 = *(py--);
295 
296  /* Read x[4] sample */
297  x0 = *(px++);
298 
299  /* Perform the multiply-accumulate */
300  /* acc0 += x[1] * y[srcBLen - 2] */
301  acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x1 * c0)) >> 32);
302  /* acc1 += x[2] * y[srcBLen - 2] */
303  acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x2 * c0)) >> 32);
304  /* acc2 += x[3] * y[srcBLen - 2] */
305  acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x3 * c0)) >> 32);
306  /* acc3 += x[4] * y[srcBLen - 2] */
307  acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x0 * c0)) >> 32);
308 
309  /* Read y[srcBLen - 3] sample */
310  c0 = *(py--);
311 
312  /* Read x[5] sample */
313  x1 = *(px++);
314 
315  /* Perform the multiply-accumulates */
316  /* acc0 += x[2] * y[srcBLen - 3] */
317  acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x2 * c0)) >> 32);
318  /* acc1 += x[3] * y[srcBLen - 3] */
319  acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x3 * c0)) >> 32);
320  /* acc2 += x[4] * y[srcBLen - 3] */
321  acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x0 * c0)) >> 32);
322  /* acc3 += x[5] * y[srcBLen - 3] */
323  acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x1 * c0)) >> 32);
324 
325  /* Read y[srcBLen - 4] sample */
326  c0 = *(py--);
327 
328  /* Read x[6] sample */
329  x2 = *(px++);
330 
331  /* Perform the multiply-accumulates */
332  /* acc0 += x[3] * y[srcBLen - 4] */
333  acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x3 * c0)) >> 32);
334  /* acc1 += x[4] * y[srcBLen - 4] */
335  acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x0 * c0)) >> 32);
336  /* acc2 += x[5] * y[srcBLen - 4] */
337  acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x1 * c0)) >> 32);
338  /* acc3 += x[6] * y[srcBLen - 4] */
339  acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x2 * c0)) >> 32);
340 
341 
342  } while(--k);
343 
344  /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
345  ** No loop unrolling is used. */
346  k = srcBLen % 0x4u;
347 
348  while(k > 0u)
349  {
350  /* Read y[srcBLen - 5] sample */
351  c0 = *(py--);
352 
353  /* Read x[7] sample */
354  x3 = *(px++);
355 
356  /* Perform the multiply-accumulates */
357  /* acc0 += x[4] * y[srcBLen - 5] */
358  acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
359  /* acc1 += x[5] * y[srcBLen - 5] */
360  acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
361  /* acc2 += x[6] * y[srcBLen - 5] */
362  acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
363  /* acc3 += x[7] * y[srcBLen - 5] */
364  acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
365 
366  /* Reuse the present samples for the next MAC */
367  x0 = x1;
368  x1 = x2;
369  x2 = x3;
370 
371  /* Decrement the loop counter */
372  k--;
373  }
374 
375  /* Store the results in the accumulators in the destination buffer. */
376  *pOut++ = (q31_t) (acc0 << 1);
377  *pOut++ = (q31_t) (acc1 << 1);
378  *pOut++ = (q31_t) (acc2 << 1);
379  *pOut++ = (q31_t) (acc3 << 1);
380 
381  /* Increment the pointer pIn1 index, count by 4 */
382  count += 4u;
383 
384  /* Update the inputA and inputB pointers for next MAC calculation */
385  px = pIn1 + count;
386  py = pSrc2;
387 
388  /* Decrement the loop counter */
389  blkCnt--;
390  }
391 
392  /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
393  ** No loop unrolling is used. */
394  blkCnt = blockSize2 % 0x4u;
395 
396  while(blkCnt > 0u)
397  {
398  /* Accumulator is made zero for every iteration */
399  sum = 0;
400 
401  /* Apply loop unrolling and compute 4 MACs simultaneously. */
402  k = srcBLen >> 2u;
403 
404  /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
405  ** a second loop below computes MACs for the remaining 1 to 3 samples. */
406  while(k > 0u)
407  {
408  /* Perform the multiply-accumulates */
409  sum = (q31_t) ((((q63_t) sum << 32) +
410  ((q63_t) * px++ * (*py--))) >> 32);
411  sum = (q31_t) ((((q63_t) sum << 32) +
412  ((q63_t) * px++ * (*py--))) >> 32);
413  sum = (q31_t) ((((q63_t) sum << 32) +
414  ((q63_t) * px++ * (*py--))) >> 32);
415  sum = (q31_t) ((((q63_t) sum << 32) +
416  ((q63_t) * px++ * (*py--))) >> 32);
417 
418  /* Decrement the loop counter */
419  k--;
420  }
421 
422  /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
423  ** No loop unrolling is used. */
424  k = srcBLen % 0x4u;
425 
426  while(k > 0u)
427  {
428  /* Perform the multiply-accumulate */
429  sum = (q31_t) ((((q63_t) sum << 32) +
430  ((q63_t) * px++ * (*py--))) >> 32);
431 
432  /* Decrement the loop counter */
433  k--;
434  }
435 
436  /* Store the result in the accumulator in the destination buffer. */
437  *pOut++ = sum << 1;
438 
439  /* Increment the MAC count */
440  count++;
441 
442  /* Update the inputA and inputB pointers for next MAC calculation */
443  px = pIn1 + count;
444  py = pSrc2;
445 
446  /* Decrement the loop counter */
447  blkCnt--;
448  }
449  }
450  else
451  {
452  /* If the srcBLen is not a multiple of 4,
453  * the blockSize2 loop cannot be unrolled by 4 */
454  blkCnt = blockSize2;
455 
456  while(blkCnt > 0u)
457  {
458  /* Accumulator is made zero for every iteration */
459  sum = 0;
460 
461  /* srcBLen number of MACS should be performed */
462  k = srcBLen;
463 
464  while(k > 0u)
465  {
466  /* Perform the multiply-accumulate */
467  sum = (q31_t) ((((q63_t) sum << 32) +
468  ((q63_t) * px++ * (*py--))) >> 32);
469 
470  /* Decrement the loop counter */
471  k--;
472  }
473 
474  /* Store the result in the accumulator in the destination buffer. */
475  *pOut++ = sum << 1;
476 
477  /* Increment the MAC count */
478  count++;
479 
480  /* Update the inputA and inputB pointers for next MAC calculation */
481  px = pIn1 + count;
482  py = pSrc2;
483 
484  /* Decrement the loop counter */
485  blkCnt--;
486  }
487  }
488 
489 
490  /* --------------------------
491  * Initializations of stage3
492  * -------------------------*/
493 
494  /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
495  * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
496  * ....
497  * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
498  * sum += x[srcALen-1] * y[srcBLen-1]
499  */
500 
501  /* In this stage the MAC operations are decreased by 1 for every iteration.
502  The blockSize3 variable holds the number of MAC operations performed */
503 
504  /* Working pointer of inputA */
505  pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
506  px = pSrc1;
507 
508  /* Working pointer of inputB */
509  pSrc2 = pIn2 + (srcBLen - 1u);
510  py = pSrc2;
511 
512  /* -------------------
513  * Stage3 process
514  * ------------------*/
515 
516  while(blockSize3 > 0u)
517  {
518  /* Accumulator is made zero for every iteration */
519  sum = 0;
520 
521  /* Apply loop unrolling and compute 4 MACs simultaneously. */
522  k = blockSize3 >> 2u;
523 
524  /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
525  ** a second loop below computes MACs for the remaining 1 to 3 samples. */
526  while(k > 0u)
527  {
528  /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
529  sum = (q31_t) ((((q63_t) sum << 32) +
530  ((q63_t) * px++ * (*py--))) >> 32);
531 
532  /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
533  sum = (q31_t) ((((q63_t) sum << 32) +
534  ((q63_t) * px++ * (*py--))) >> 32);
535 
536  /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
537  sum = (q31_t) ((((q63_t) sum << 32) +
538  ((q63_t) * px++ * (*py--))) >> 32);
539 
540  /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
541  sum = (q31_t) ((((q63_t) sum << 32) +
542  ((q63_t) * px++ * (*py--))) >> 32);
543 
544  /* Decrement the loop counter */
545  k--;
546  }
547 
548  /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
549  ** No loop unrolling is used. */
550  k = blockSize3 % 0x4u;
551 
552  while(k > 0u)
553  {
554  /* Perform the multiply-accumulate */
555  sum = (q31_t) ((((q63_t) sum << 32) +
556  ((q63_t) * px++ * (*py--))) >> 32);
557 
558  /* Decrement the loop counter */
559  k--;
560  }
561 
562  /* Store the result in the accumulator in the destination buffer. */
563  *pOut++ = sum << 1;
564 
565  /* Update the inputA and inputB pointers for next MAC calculation */
566  px = ++pSrc1;
567  py = pSrc2;
568 
569  /* Decrement the loop counter */
570  blockSize3--;
571  }
572 
573 }
574 
int64_t q63_t
64-bit fractional data type in 1.63 format.
Definition: arm_math.h:402
void arm_conv_fast_q31(q31_t *pSrcA, uint32_t srcALen, q31_t *pSrcB, uint32_t srcBLen, q31_t *pDst)
Convolution of Q31 sequences (fast version) for Cortex-M3 and Cortex-M4.
int32_t q31_t
32-bit fractional data type in 1.31 format.
Definition: arm_math.h:397