STM32F769IDiscovery  1.00
uDANTE Audio Networking with STM32F7 DISCO board
arm_conv_partial_f32.c
Go to the documentation of this file.
1 /* ----------------------------------------------------------------------------
2 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.
3 *
4 * $Date: 19. March 2015
5 * $Revision: V.1.4.5
6 *
7 * Project: CMSIS DSP Library
8 * Title: arm_conv_partial_f32.c
9 *
10 * Description: Partial convolution of floating-point sequences.
11 *
12 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 * - Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * - Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in
21 * the documentation and/or other materials provided with the
22 * distribution.
23 * - Neither the name of ARM LIMITED nor the names of its contributors
24 * may be used to endorse or promote products derived from this
25 * software without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 * POSSIBILITY OF SUCH DAMAGE.
39 * -------------------------------------------------------------------------- */
40 
41 #include "arm_math.h"
42 
96  float32_t * pSrcA,
97  uint32_t srcALen,
98  float32_t * pSrcB,
99  uint32_t srcBLen,
100  float32_t * pDst,
101  uint32_t firstIndex,
102  uint32_t numPoints)
103 {
104 
105 
106 #ifndef ARM_MATH_CM0_FAMILY
107 
108  /* Run the below code for Cortex-M4 and Cortex-M3 */
109 
110  float32_t *pIn1 = pSrcA; /* inputA pointer */
111  float32_t *pIn2 = pSrcB; /* inputB pointer */
112  float32_t *pOut = pDst; /* output pointer */
113  float32_t *px; /* Intermediate inputA pointer */
114  float32_t *py; /* Intermediate inputB pointer */
115  float32_t *pSrc1, *pSrc2; /* Intermediate pointers */
116  float32_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
117  float32_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */
118  uint32_t j, k, count = 0u, blkCnt, check;
119  int32_t blockSize1, blockSize2, blockSize3; /* loop counters */
120  arm_status status; /* status of Partial convolution */
121 
122 
123  /* Check for range of output samples to be calculated */
124  if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
125  {
126  /* Set status as ARM_MATH_ARGUMENT_ERROR */
127  status = ARM_MATH_ARGUMENT_ERROR;
128  }
129  else
130  {
131 
132  /* The algorithm implementation is based on the lengths of the inputs. */
133  /* srcB is always made to slide across srcA. */
134  /* So srcBLen is always considered as shorter or equal to srcALen */
135  if(srcALen >= srcBLen)
136  {
137  /* Initialization of inputA pointer */
138  pIn1 = pSrcA;
139 
140  /* Initialization of inputB pointer */
141  pIn2 = pSrcB;
142  }
143  else
144  {
145  /* Initialization of inputA pointer */
146  pIn1 = pSrcB;
147 
148  /* Initialization of inputB pointer */
149  pIn2 = pSrcA;
150 
151  /* srcBLen is always considered as shorter or equal to srcALen */
152  j = srcBLen;
153  srcBLen = srcALen;
154  srcALen = j;
155  }
156 
157  /* Conditions to check which loopCounter holds
158  * the first and last indices of the output samples to be calculated. */
159  check = firstIndex + numPoints;
160  blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0;
161  blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3;
162  blockSize1 = ((int32_t) srcBLen - 1) - (int32_t) firstIndex;
163  blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 :
164  (int32_t) numPoints) : 0;
165  blockSize2 = ((int32_t) check - blockSize3) -
166  (blockSize1 + (int32_t) firstIndex);
167  blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
168 
169  /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
170  /* The function is internally
171  * divided into three stages according to the number of multiplications that have to
172  * take place between inputA samples and inputB samples. In the first stage of the
173  * algorithm, the number of multiplications increases by one for every iteration.
174  * In the second stage of the algorithm, srcBLen multiplications are done per output.
175  * In the third stage of the algorithm, the number of multiplications decreases by one
176  * for every iteration. */
177 
178  /* Set the output pointer to point to the firstIndex
179  * of the output sample to be calculated. */
180  pOut = pDst + firstIndex;
181 
182  /* --------------------------
183  * Initializations of stage1
184  * -------------------------*/
185 
186  /* sum = x[0] * y[0]
187  * sum = x[0] * y[1] + x[1] * y[0]
188  * ....
189  * sum = x[0] * y[srcBLen - 2] + x[1] * y[srcBLen - 3] +...+ x[srcBLen - 2] * y[0]
190  */
191 
192  /* In this stage the MAC operations are increased by 1 for every iteration.
193  The count variable holds the number of MAC operations performed.
194  Since the partial convolution starts from firstIndex,
195  the number of MACs to be performed for the first output is firstIndex + 1. */
196  count = 1u + firstIndex;
197 
198  /* Working pointer of inputA */
199  px = pIn1;
200 
201  /* Working pointer of inputB */
202  pSrc1 = pIn2 + firstIndex;
203  py = pSrc1;
204 
205  /* ------------------------
206  * Stage1 process
207  * ----------------------*/
208 
209  /* The first stage starts here */
210  while(blockSize1 > 0)
211  {
212  /* Accumulator is made zero for every iteration */
213  sum = 0.0f;
214 
215  /* Apply loop unrolling and compute 4 MACs simultaneously. */
216  k = count >> 2u;
217 
218  /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
219  ** a second loop below computes MACs for the remaining 1 to 3 samples. */
220  while(k > 0u)
221  {
222  /* x[0] * y[srcBLen - 1] */
223  sum += *px++ * *py--;
224 
225  /* x[1] * y[srcBLen - 2] */
226  sum += *px++ * *py--;
227 
228  /* x[2] * y[srcBLen - 3] */
229  sum += *px++ * *py--;
230 
231  /* x[3] * y[srcBLen - 4] */
232  sum += *px++ * *py--;
233 
234  /* Decrement the loop counter */
235  k--;
236  }
237 
238  /* If the count is not a multiple of 4, compute any remaining MACs here.
239  ** No loop unrolling is used. */
240  k = count % 0x4u;
241 
242  while(k > 0u)
243  {
244  /* Perform the multiply-accumulates */
245  sum += *px++ * *py--;
246 
247  /* Decrement the loop counter */
248  k--;
249  }
250 
251  /* Store the result in the accumulator in the destination buffer. */
252  *pOut++ = sum;
253 
254  /* Update the inputA and inputB pointers for next MAC calculation */
255  py = ++pSrc1;
256  px = pIn1;
257 
258  /* Increment the MAC count */
259  count++;
260 
261  /* Decrement the loop counter */
262  blockSize1--;
263  }
264 
265  /* --------------------------
266  * Initializations of stage2
267  * ------------------------*/
268 
269  /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
270  * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
271  * ....
272  * sum = x[srcALen-srcBLen] * y[srcBLen-1] + x[srcALen-srcBLen+1] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
273  */
274 
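275  /* Working pointer of inputA. NOTE(review): the (firstIndex - srcBLen + 1) offset set here only affects the first stage-2 pass; the loops below reset px to pIn1 + count, so verify the outputs when firstIndex >= srcBLen. */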
276  if((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
277  {
278  px = pIn1 + firstIndex - srcBLen + 1;
279  }
280  else
281  {
282  px = pIn1;
283  }
284 
285  /* Working pointer of inputB */
286  pSrc2 = pIn2 + (srcBLen - 1u);
287  py = pSrc2;
288 
289  /* count is the offset added to pIn1 to position px for the next output sample */
290  count = 0u;
291 
292  /* -------------------
293  * Stage2 process
294  * ------------------*/
295 
296  /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
297  * So, to loop unroll over blockSize2,
298  * srcBLen should be greater than or equal to 4 */
299  if(srcBLen >= 4u)
300  {
301  /* Loop unroll over blockSize2, by 4 */
302  blkCnt = ((uint32_t) blockSize2 >> 2u);
303 
304  while(blkCnt > 0u)
305  {
306  /* Set all accumulators to zero */
307  acc0 = 0.0f;
308  acc1 = 0.0f;
309  acc2 = 0.0f;
310  acc3 = 0.0f;
311 
312  /* read x[0], x[1], x[2] samples */
313  x0 = *(px++);
314  x1 = *(px++);
315  x2 = *(px++);
316 
317  /* Apply loop unrolling and compute 4 MACs simultaneously. */
318  k = srcBLen >> 2u;
319 
320  /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
321  ** a second loop below computes MACs for the remaining 1 to 3 samples. */
322  do
323  {
324  /* Read y[srcBLen - 1] sample */
325  c0 = *(py--);
326 
327  /* Read x[3] sample */
328  x3 = *(px++);
329 
330  /* Perform the multiply-accumulate */
331  /* acc0 += x[0] * y[srcBLen - 1] */
332  acc0 += x0 * c0;
333 
334  /* acc1 += x[1] * y[srcBLen - 1] */
335  acc1 += x1 * c0;
336 
337  /* acc2 += x[2] * y[srcBLen - 1] */
338  acc2 += x2 * c0;
339 
340  /* acc3 += x[3] * y[srcBLen - 1] */
341  acc3 += x3 * c0;
342 
343  /* Read y[srcBLen - 2] sample */
344  c0 = *(py--);
345 
346  /* Read x[4] sample */
347  x0 = *(px++);
348 
349  /* Perform the multiply-accumulate */
350  /* acc0 += x[1] * y[srcBLen - 2] */
351  acc0 += x1 * c0;
352  /* acc1 += x[2] * y[srcBLen - 2] */
353  acc1 += x2 * c0;
354  /* acc2 += x[3] * y[srcBLen - 2] */
355  acc2 += x3 * c0;
356  /* acc3 += x[4] * y[srcBLen - 2] */
357  acc3 += x0 * c0;
358 
359  /* Read y[srcBLen - 3] sample */
360  c0 = *(py--);
361 
362  /* Read x[5] sample */
363  x1 = *(px++);
364 
365  /* Perform the multiply-accumulates */
366  /* acc0 += x[2] * y[srcBLen - 3] */
367  acc0 += x2 * c0;
368  /* acc1 += x[3] * y[srcBLen - 3] */
369  acc1 += x3 * c0;
370  /* acc2 += x[4] * y[srcBLen - 3] */
371  acc2 += x0 * c0;
372  /* acc3 += x[5] * y[srcBLen - 3] */
373  acc3 += x1 * c0;
374 
375  /* Read y[srcBLen - 4] sample */
376  c0 = *(py--);
377 
378  /* Read x[6] sample */
379  x2 = *(px++);
380 
381  /* Perform the multiply-accumulates */
382  /* acc0 += x[3] * y[srcBLen - 4] */
383  acc0 += x3 * c0;
384  /* acc1 += x[4] * y[srcBLen - 4] */
385  acc1 += x0 * c0;
386  /* acc2 += x[5] * y[srcBLen - 4] */
387  acc2 += x1 * c0;
388  /* acc3 += x[6] * y[srcBLen - 4] */
389  acc3 += x2 * c0;
390 
391 
392  } while(--k);
393 
394  /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
395  ** No loop unrolling is used. */
396  k = srcBLen % 0x4u;
397 
398  while(k > 0u)
399  {
400  /* Read y[srcBLen - 5] sample */
401  c0 = *(py--);
402 
403  /* Read x[7] sample */
404  x3 = *(px++);
405 
406  /* Perform the multiply-accumulates */
407  /* acc0 += x[4] * y[srcBLen - 5] */
408  acc0 += x0 * c0;
409  /* acc1 += x[5] * y[srcBLen - 5] */
410  acc1 += x1 * c0;
411  /* acc2 += x[6] * y[srcBLen - 5] */
412  acc2 += x2 * c0;
413  /* acc3 += x[7] * y[srcBLen - 5] */
414  acc3 += x3 * c0;
415 
416  /* Reuse the present samples for the next MAC */
417  x0 = x1;
418  x1 = x2;
419  x2 = x3;
420 
421  /* Decrement the loop counter */
422  k--;
423  }
424 
425  /* Store the result in the accumulator in the destination buffer. */
426  *pOut++ = acc0;
427  *pOut++ = acc1;
428  *pOut++ = acc2;
429  *pOut++ = acc3;
430 
431  /* Increment the pointer pIn1 index, count by 4 */
432  count += 4u;
433 
434  /* Update the inputA and inputB pointers for next MAC calculation */
435  px = pIn1 + count;
436  py = pSrc2;
437 
438  /* Decrement the loop counter */
439  blkCnt--;
440  }
441 
442  /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
443  ** No loop unrolling is used. */
444  blkCnt = (uint32_t) blockSize2 % 0x4u;
445 
446  while(blkCnt > 0u)
447  {
448  /* Accumulator is made zero for every iteration */
449  sum = 0.0f;
450 
451  /* Apply loop unrolling and compute 4 MACs simultaneously. */
452  k = srcBLen >> 2u;
453 
454  /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
455  ** a second loop below computes MACs for the remaining 1 to 3 samples. */
456  while(k > 0u)
457  {
458  /* Perform the multiply-accumulates */
459  sum += *px++ * *py--;
460  sum += *px++ * *py--;
461  sum += *px++ * *py--;
462  sum += *px++ * *py--;
463 
464  /* Decrement the loop counter */
465  k--;
466  }
467 
468  /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
469  ** No loop unrolling is used. */
470  k = srcBLen % 0x4u;
471 
472  while(k > 0u)
473  {
474  /* Perform the multiply-accumulate */
475  sum += *px++ * *py--;
476 
477  /* Decrement the loop counter */
478  k--;
479  }
480 
481  /* Store the result in the accumulator in the destination buffer. */
482  *pOut++ = sum;
483 
484  /* Increment the pointer pIn1 index, count by 1 */
485  count++;
486 
487  /* Update the inputA and inputB pointers for next MAC calculation */
488  px = pIn1 + count;
489  py = pSrc2;
490 
491  /* Decrement the loop counter */
492  blkCnt--;
493  }
494  }
495  else
496  {
497  /* If srcBLen is less than 4,
498  * the blockSize2 loop is not unrolled by 4 */
499  blkCnt = (uint32_t) blockSize2;
500 
501  while(blkCnt > 0u)
502  {
503  /* Accumulator is made zero for every iteration */
504  sum = 0.0f;
505 
506  /* srcBLen number of MACS should be performed */
507  k = srcBLen;
508 
509  while(k > 0u)
510  {
511  /* Perform the multiply-accumulate */
512  sum += *px++ * *py--;
513 
514  /* Decrement the loop counter */
515  k--;
516  }
517 
518  /* Store the result in the accumulator in the destination buffer. */
519  *pOut++ = sum;
520 
521  /* Increment the pointer pIn1 index, count by 1 */
522  count++;
523 
524  /* Update the inputA and inputB pointers for next MAC calculation */
525  px = pIn1 + count;
526  py = pSrc2;
527 
528  /* Decrement the loop counter */
529  blkCnt--;
530  }
531  }
532 
533 
534  /* --------------------------
535  * Initializations of stage3
536  * -------------------------*/
537 
538  /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
539  * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
540  * ....
541  * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
542  * sum += x[srcALen-1] * y[srcBLen-1]
543  */
544 
545  /* In this stage the MAC operations are decreased by 1 for every iteration.
546  The count variable holds the number of MAC operations performed */
547  count = srcBLen - 1u;
548 
549  /* Working pointer of inputA */
550  pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
551  px = pSrc1;
552 
553  /* Working pointer of inputB */
554  pSrc2 = pIn2 + (srcBLen - 1u);
555  py = pSrc2;
556 
557  while(blockSize3 > 0)
558  {
559  /* Accumulator is made zero for every iteration */
560  sum = 0.0f;
561 
562  /* Apply loop unrolling and compute 4 MACs simultaneously. */
563  k = count >> 2u;
564 
565  /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
566  ** a second loop below computes MACs for the remaining 1 to 3 samples. */
567  while(k > 0u)
568  {
569  /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
570  sum += *px++ * *py--;
571 
572  /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
573  sum += *px++ * *py--;
574 
575  /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
576  sum += *px++ * *py--;
577 
578  /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
579  sum += *px++ * *py--;
580 
581  /* Decrement the loop counter */
582  k--;
583  }
584 
585  /* If the count is not a multiple of 4, compute any remaining MACs here.
586  ** No loop unrolling is used. */
587  k = count % 0x4u;
588 
589  while(k > 0u)
590  {
591  /* Perform the multiply-accumulates */
592  /* sum += x[srcALen-1] * y[srcBLen-1] */
593  sum += *px++ * *py--;
594 
595  /* Decrement the loop counter */
596  k--;
597  }
598 
599  /* Store the result in the accumulator in the destination buffer. */
600  *pOut++ = sum;
601 
602  /* Update the inputA and inputB pointers for next MAC calculation */
603  px = ++pSrc1;
604  py = pSrc2;
605 
606  /* Decrement the MAC count */
607  count--;
608 
609  /* Decrement the loop counter */
610  blockSize3--;
611 
612  }
613 
614  /* set status as ARM_MATH_SUCCESS */
615  status = ARM_MATH_SUCCESS;
616  }
617 
618  /* Return to application */
619  return (status);
620 
621 #else
622 
623  /* Run the below code for Cortex-M0 */
624 
625  float32_t *pIn1 = pSrcA; /* inputA pointer */
626  float32_t *pIn2 = pSrcB; /* inputB pointer */
627  float32_t sum; /* Accumulator */
628  uint32_t i, j; /* loop counters */
629  arm_status status; /* status of Partial convolution */
630 
631  /* Check for range of output samples to be calculated */
632  if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
633  {
634  /* Set status as ARM_MATH_ARGUMENT_ERROR */
635  status = ARM_MATH_ARGUMENT_ERROR;
636  }
637  else
638  {
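639  /* Loop to calculate convolution for numPoints output values. NOTE(review): the bound firstIndex + numPoints - 1 is unsigned and wraps to UINT32_MAX when firstIndex and numPoints are both 0 - confirm callers never pass numPoints == 0. */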
640  for (i = firstIndex; i <= (firstIndex + numPoints - 1); i++)
641  {
642  /* Initialize sum with zero to carry on MAC operations */
643  sum = 0.0f;
644 
645  /* Loop to perform MAC operations according to convolution equation */
646  for (j = 0u; j <= i; j++)
647  {
648  /* Check the array limitations for inputs */
649  if((((i - j) < srcBLen) && (j < srcALen)))
650  {
651  /* z[i] += x[j] * y[i-j] */
652  sum += pIn1[j] * pIn2[i - j];
653  }
654  }
655  /* Store the output in the destination buffer */
656  pDst[i] = sum;
657  }
658  /* set status as ARM_MATH_SUCCESS as there are no argument errors */
659  status = ARM_MATH_SUCCESS;
660  }
661  return (status);
662 
663 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
664 
665 }
666 
float float32_t
32-bit floating-point type definition.
Definition: arm_math.h:407
arm_status arm_conv_partial_f32(float32_t *pSrcA, uint32_t srcALen, float32_t *pSrcB, uint32_t srcBLen, float32_t *pDst, uint32_t firstIndex, uint32_t numPoints)
Partial convolution of floating-point sequences.
arm_status
Error status returned by some functions in the library.
Definition: arm_math.h:373