STM32F769IDiscovery  1.00
uDANTE Audio Networking with STM32F7 DISCO board
arm_conv_f32.c
Go to the documentation of this file.
1 /* ----------------------------------------------------------------------------
2 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.
3 *
4 * $Date: 19. March 2015
5 * $Revision: V.1.4.5
6 *
7 * Project: CMSIS DSP Library
8 * Title: arm_conv_f32.c
9 *
10 * Description: Convolution of floating-point sequences.
11 *
12 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 * - Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * - Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in
21 * the documentation and/or other materials provided with the
22 * distribution.
23 * - Neither the name of ARM LIMITED nor the names of its contributors
24 * may be used to endorse or promote products derived from this
25 * software without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 * POSSIBILITY OF SUCH DAMAGE.
39 * -------------------------------------------------------------------------- */
40 
41 #include "arm_math.h"
42 
125  float32_t * pSrcA,
126  uint32_t srcALen,
127  float32_t * pSrcB,
128  uint32_t srcBLen,
129  float32_t * pDst)
130 {
131 
132 
133 #ifndef ARM_MATH_CM0_FAMILY
134 
135  /* Run the below code for Cortex-M4 and Cortex-M3 */
136 
137  float32_t *pIn1; /* inputA pointer */
138  float32_t *pIn2; /* inputB pointer */
139  float32_t *pOut = pDst; /* output pointer */
140  float32_t *px; /* Intermediate inputA pointer */
141  float32_t *py; /* Intermediate inputB pointer */
142  float32_t *pSrc1, *pSrc2; /* Intermediate pointers */
143  float32_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
144  float32_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */
145  uint32_t j, k, count, blkCnt, blockSize1, blockSize2, blockSize3; /* loop counters */
146 
147  /* The algorithm implementation is based on the lengths of the inputs. */
148  /* srcB is always made to slide across srcA. */
149  /* So srcBLen is always considered as shorter or equal to srcALen */
150  if(srcALen >= srcBLen)
151  {
152  /* Initialization of inputA pointer */
153  pIn1 = pSrcA;
154 
155  /* Initialization of inputB pointer */
156  pIn2 = pSrcB;
157  }
158  else
159  {
160  /* Initialization of inputA pointer */
161  pIn1 = pSrcB;
162 
163  /* Initialization of inputB pointer */
164  pIn2 = pSrcA;
165 
166  /* srcBLen is always considered as shorter or equal to srcALen */
167  j = srcBLen;
168  srcBLen = srcALen;
169  srcALen = j;
170  }
171 
172  /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
173  /* The function is internally
174  * divided into three stages according to the number of multiplications that have to
175  * take place between inputA samples and inputB samples. In the first stage of the
176  * algorithm, the multiplications increase by one for every iteration.
177  * In the second stage of the algorithm, srcBLen number of multiplications are done.
178  * In the third stage of the algorithm, the multiplications decrease by one
179  * for every iteration. */
180 
181  /* The algorithm is implemented in three stages.
182  The loop counters of each stage are initialized here. */
183  blockSize1 = srcBLen - 1u;
184  blockSize2 = srcALen - (srcBLen - 1u);
185  blockSize3 = blockSize1;
186 
187  /* --------------------------
188  * initializations of stage1
189  * -------------------------*/
190 
191  /* sum = x[0] * y[0]
192  * sum = x[0] * y[1] + x[1] * y[0]
193  * ....
194  * sum = x[0] * y[srcBLen - 2] + x[1] * y[srcBLen - 3] +...+ x[srcBLen - 2] * y[0]
195  */
196 
197  /* In this stage the MAC operations are increased by 1 for every iteration.
198  The count variable holds the number of MAC operations performed */
199  count = 1u;
200 
201  /* Working pointer of inputA */
202  px = pIn1;
203 
204  /* Working pointer of inputB */
205  py = pIn2;
206 
207 
208  /* ------------------------
209  * Stage1 process
210  * ----------------------*/
211 
212  /* The first stage starts here */
213  while(blockSize1 > 0u)
214  {
215  /* Accumulator is made zero for every iteration */
216  sum = 0.0f;
217 
218  /* Apply loop unrolling and compute 4 MACs simultaneously. */
219  k = count >> 2u;
220 
221  /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
222  ** a second loop below computes MACs for the remaining 1 to 3 samples. */
223  while(k > 0u)
224  {
225  /* x[0] * y[srcBLen - 1] */
226  sum += *px++ * *py--;
227 
228  /* x[1] * y[srcBLen - 2] */
229  sum += *px++ * *py--;
230 
231  /* x[2] * y[srcBLen - 3] */
232  sum += *px++ * *py--;
233 
234  /* x[3] * y[srcBLen - 4] */
235  sum += *px++ * *py--;
236 
237  /* Decrement the loop counter */
238  k--;
239  }
240 
241  /* If the count is not a multiple of 4, compute any remaining MACs here.
242  ** No loop unrolling is used. */
243  k = count % 0x4u;
244 
245  while(k > 0u)
246  {
247  /* Perform the multiply-accumulate */
248  sum += *px++ * *py--;
249 
250  /* Decrement the loop counter */
251  k--;
252  }
253 
254  /* Store the result in the accumulator in the destination buffer. */
255  *pOut++ = sum;
256 
257  /* Update the inputA and inputB pointers for next MAC calculation */
258  py = pIn2 + count;
259  px = pIn1;
260 
261  /* Increment the MAC count */
262  count++;
263 
264  /* Decrement the loop counter */
265  blockSize1--;
266  }
267 
268  /* --------------------------
269  * Initializations of stage2
270  * ------------------------*/
271 
272  /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
273  * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
274  * ....
275  * sum = x[srcALen-srcBLen] * y[srcBLen-1] + x[srcALen-srcBLen+1] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
276  */
277 
278  /* Working pointer of inputA */
279  px = pIn1;
280 
281  /* Working pointer of inputB */
282  pSrc2 = pIn2 + (srcBLen - 1u);
283  py = pSrc2;
284 
285  /* count is the index by which the pointer pIn1 is to be incremented */
286  count = 0u;
287 
288  /* -------------------
289  * Stage2 process
290  * ------------------*/
291 
292  /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
293  * So, to loop unroll over blockSize2,
294  * srcBLen should be greater than or equal to 4 */
295  if(srcBLen >= 4u)
296  {
297  /* Loop unroll over blockSize2, by 4 */
298  blkCnt = blockSize2 >> 2u;
299 
300  while(blkCnt > 0u)
301  {
302  /* Set all accumulators to zero */
303  acc0 = 0.0f;
304  acc1 = 0.0f;
305  acc2 = 0.0f;
306  acc3 = 0.0f;
307 
308  /* read x[0], x[1], x[2] samples */
309  x0 = *(px++);
310  x1 = *(px++);
311  x2 = *(px++);
312 
313  /* Apply loop unrolling and compute 4 MACs simultaneously. */
314  k = srcBLen >> 2u;
315 
316  /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
317  ** a second loop below computes MACs for the remaining 1 to 3 samples. */
318  do
319  {
320  /* Read y[srcBLen - 1] sample */
321  c0 = *(py--);
322 
323  /* Read x[3] sample */
324  x3 = *(px);
325 
326  /* Perform the multiply-accumulate */
327  /* acc0 += x[0] * y[srcBLen - 1] */
328  acc0 += x0 * c0;
329 
330  /* acc1 += x[1] * y[srcBLen - 1] */
331  acc1 += x1 * c0;
332 
333  /* acc2 += x[2] * y[srcBLen - 1] */
334  acc2 += x2 * c0;
335 
336  /* acc3 += x[3] * y[srcBLen - 1] */
337  acc3 += x3 * c0;
338 
339  /* Read y[srcBLen - 2] sample */
340  c0 = *(py--);
341 
342  /* Read x[4] sample */
343  x0 = *(px + 1u);
344 
345  /* Perform the multiply-accumulate */
346  /* acc0 += x[1] * y[srcBLen - 2] */
347  acc0 += x1 * c0;
348  /* acc1 += x[2] * y[srcBLen - 2] */
349  acc1 += x2 * c0;
350  /* acc2 += x[3] * y[srcBLen - 2] */
351  acc2 += x3 * c0;
352  /* acc3 += x[4] * y[srcBLen - 2] */
353  acc3 += x0 * c0;
354 
355  /* Read y[srcBLen - 3] sample */
356  c0 = *(py--);
357 
358  /* Read x[5] sample */
359  x1 = *(px + 2u);
360 
361  /* Perform the multiply-accumulates */
362  /* acc0 += x[2] * y[srcBLen - 3] */
363  acc0 += x2 * c0;
364  /* acc1 += x[3] * y[srcBLen - 3] */
365  acc1 += x3 * c0;
366  /* acc2 += x[4] * y[srcBLen - 3] */
367  acc2 += x0 * c0;
368  /* acc3 += x[5] * y[srcBLen - 3] */
369  acc3 += x1 * c0;
370 
371  /* Read y[srcBLen - 4] sample */
372  c0 = *(py--);
373 
374  /* Read x[6] sample */
375  x2 = *(px + 3u);
376  px += 4u;
377 
378  /* Perform the multiply-accumulates */
379  /* acc0 += x[3] * y[srcBLen - 4] */
380  acc0 += x3 * c0;
381  /* acc1 += x[4] * y[srcBLen - 4] */
382  acc1 += x0 * c0;
383  /* acc2 += x[5] * y[srcBLen - 4] */
384  acc2 += x1 * c0;
385  /* acc3 += x[6] * y[srcBLen - 4] */
386  acc3 += x2 * c0;
387 
388 
389  } while(--k);
390 
391  /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
392  ** No loop unrolling is used. */
393  k = srcBLen % 0x4u;
394 
395  while(k > 0u)
396  {
397  /* Read y[srcBLen - 5] sample */
398  c0 = *(py--);
399 
400  /* Read x[7] sample */
401  x3 = *(px++);
402 
403  /* Perform the multiply-accumulates */
404  /* acc0 += x[4] * y[srcBLen - 5] */
405  acc0 += x0 * c0;
406  /* acc1 += x[5] * y[srcBLen - 5] */
407  acc1 += x1 * c0;
408  /* acc2 += x[6] * y[srcBLen - 5] */
409  acc2 += x2 * c0;
410  /* acc3 += x[7] * y[srcBLen - 5] */
411  acc3 += x3 * c0;
412 
413  /* Reuse the present samples for the next MAC */
414  x0 = x1;
415  x1 = x2;
416  x2 = x3;
417 
418  /* Decrement the loop counter */
419  k--;
420  }
421 
422  /* Store the result in the accumulator in the destination buffer. */
423  *pOut++ = acc0;
424  *pOut++ = acc1;
425  *pOut++ = acc2;
426  *pOut++ = acc3;
427 
428  /* Increment the pointer pIn1 index, count by 4 */
429  count += 4u;
430 
431  /* Update the inputA and inputB pointers for next MAC calculation */
432  px = pIn1 + count;
433  py = pSrc2;
434 
435 
436  /* Decrement the loop counter */
437  blkCnt--;
438  }
439 
440 
441  /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
442  ** No loop unrolling is used. */
443  blkCnt = blockSize2 % 0x4u;
444 
445  while(blkCnt > 0u)
446  {
447  /* Accumulator is made zero for every iteration */
448  sum = 0.0f;
449 
450  /* Apply loop unrolling and compute 4 MACs simultaneously. */
451  k = srcBLen >> 2u;
452 
453  /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
454  ** a second loop below computes MACs for the remaining 1 to 3 samples. */
455  while(k > 0u)
456  {
457  /* Perform the multiply-accumulates */
458  sum += *px++ * *py--;
459  sum += *px++ * *py--;
460  sum += *px++ * *py--;
461  sum += *px++ * *py--;
462 
463  /* Decrement the loop counter */
464  k--;
465  }
466 
467  /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
468  ** No loop unrolling is used. */
469  k = srcBLen % 0x4u;
470 
471  while(k > 0u)
472  {
473  /* Perform the multiply-accumulate */
474  sum += *px++ * *py--;
475 
476  /* Decrement the loop counter */
477  k--;
478  }
479 
480  /* Store the result in the accumulator in the destination buffer. */
481  *pOut++ = sum;
482 
483  /* Increment the pointer pIn1 index, count by 1 */
484  count++;
485 
486  /* Update the inputA and inputB pointers for next MAC calculation */
487  px = pIn1 + count;
488  py = pSrc2;
489 
490  /* Decrement the loop counter */
491  blkCnt--;
492  }
493  }
494  else
495  {
496  /* If srcBLen is less than 4,
497  * the blockSize2 loop cannot be unrolled by 4 */
498  blkCnt = blockSize2;
499 
500  while(blkCnt > 0u)
501  {
502  /* Accumulator is made zero for every iteration */
503  sum = 0.0f;
504 
505  /* srcBLen number of MACS should be performed */
506  k = srcBLen;
507 
508  while(k > 0u)
509  {
510  /* Perform the multiply-accumulate */
511  sum += *px++ * *py--;
512 
513  /* Decrement the loop counter */
514  k--;
515  }
516 
517  /* Store the result in the accumulator in the destination buffer. */
518  *pOut++ = sum;
519 
520  /* Increment the pointer pIn1 index, count by 1 */
521  count++;
522 
523  /* Update the inputA and inputB pointers for next MAC calculation */
524  px = pIn1 + count;
525  py = pSrc2;
526 
527  /* Decrement the loop counter */
528  blkCnt--;
529  }
530  }
531 
532 
533  /* --------------------------
534  * Initializations of stage3
535  * -------------------------*/
536 
537  /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
538  * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
539  * ....
540  * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
541  * sum += x[srcALen-1] * y[srcBLen-1]
542  */
543 
544  /* In this stage the MAC operations are decreased by 1 for every iteration.
545  The blockSize3 variable holds the number of MAC operations performed */
546 
547  /* Working pointer of inputA */
548  pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
549  px = pSrc1;
550 
551  /* Working pointer of inputB */
552  pSrc2 = pIn2 + (srcBLen - 1u);
553  py = pSrc2;
554 
555  /* -------------------
556  * Stage3 process
557  * ------------------*/
558 
559  while(blockSize3 > 0u)
560  {
561  /* Accumulator is made zero for every iteration */
562  sum = 0.0f;
563 
564  /* Apply loop unrolling and compute 4 MACs simultaneously. */
565  k = blockSize3 >> 2u;
566 
567  /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
568  ** a second loop below computes MACs for the remaining 1 to 3 samples. */
569  while(k > 0u)
570  {
571  /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
572  sum += *px++ * *py--;
573 
574  /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
575  sum += *px++ * *py--;
576 
577  /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
578  sum += *px++ * *py--;
579 
580  /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
581  sum += *px++ * *py--;
582 
583  /* Decrement the loop counter */
584  k--;
585  }
586 
587  /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
588  ** No loop unrolling is used. */
589  k = blockSize3 % 0x4u;
590 
591  while(k > 0u)
592  {
593  /* Perform the multiply-accumulates */
594  /* sum += x[srcALen-1] * y[srcBLen-1] */
595  sum += *px++ * *py--;
596 
597  /* Decrement the loop counter */
598  k--;
599  }
600 
601  /* Store the result in the accumulator in the destination buffer. */
602  *pOut++ = sum;
603 
604  /* Update the inputA and inputB pointers for next MAC calculation */
605  px = ++pSrc1;
606  py = pSrc2;
607 
608  /* Decrement the loop counter */
609  blockSize3--;
610  }
611 
612 #else
613 
614  /* Run the below code for Cortex-M0 */
615 
616  float32_t *pIn1 = pSrcA; /* inputA pointer */
617  float32_t *pIn2 = pSrcB; /* inputB pointer */
618  float32_t sum; /* Accumulator */
619  uint32_t i, j; /* loop counters */
620 
621  /* Loop to calculate convolution for output length number of times */
622  for (i = 0u; i < ((srcALen + srcBLen) - 1u); i++)
623  {
624  /* Initialize sum with zero to carry out MAC operations */
625  sum = 0.0f;
626 
627  /* Loop to perform MAC operations according to convolution equation */
628  for (j = 0u; j <= i; j++)
629  {
630  /* Check the array limitations */
631  if((((i - j) < srcBLen) && (j < srcALen)))
632  {
633  /* z[i] += x[j] * y[i-j] */
634  sum += pIn1[j] * pIn2[i - j];
635  }
636  }
637  /* Store the output in the destination buffer */
638  pDst[i] = sum;
639  }
640 
641 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
642 
643 }
644 
float float32_t
32-bit floating-point type definition.
Definition: arm_math.h:407
void arm_conv_f32(float32_t *pSrcA, uint32_t srcALen, float32_t *pSrcB, uint32_t srcBLen, float32_t *pDst)
Convolution of floating-point sequences.
Definition: arm_conv_f32.c:124