STM32F769IDiscovery  1.00
uDANTE Audio Networking with STM32F7 DISCO board
arm_correlate_f32.c
Go to the documentation of this file.
1 /* ----------------------------------------------------------------------------
2 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.
3 *
4 * $Date: 19. March 2015
5 * $Revision: V.1.4.5
6 *
7 * Project: CMSIS DSP Library
8 * Title: arm_correlate_f32.c
9 *
10 * Description: Correlation of floating-point sequences.
11 *
12 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 * - Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * - Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in
21 * the documentation and/or other materials provided with the
22 * distribution.
23 * - Neither the name of ARM LIMITED nor the names of its contributors
24 * may be used to endorse or promote products derived from this
25 * software without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 * POSSIBILITY OF SUCH DAMAGE.
39 * -------------------------------------------------------------------------- */
40 
41 #include "arm_math.h"
42 
117  float32_t * pSrcA,
118  uint32_t srcALen,
119  float32_t * pSrcB,
120  uint32_t srcBLen,
121  float32_t * pDst)
122 {
123 
124 
125 #ifndef ARM_MATH_CM0_FAMILY
126 
127  /* Run the below code for Cortex-M4 and Cortex-M3 */
128 
129  float32_t *pIn1; /* inputA pointer */
130  float32_t *pIn2; /* inputB pointer */
131  float32_t *pOut = pDst; /* output pointer */
132  float32_t *px; /* Intermediate inputA pointer */
133  float32_t *py; /* Intermediate inputB pointer */
134  float32_t *pSrc1; /* Intermediate pointers */
135  float32_t sum, acc0, acc1, acc2, acc3; /* Accumulators */
136  float32_t x0, x1, x2, x3, c0; /* temporary variables for holding input and coefficient values */
137  uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3; /* loop counters */
138  int32_t inc = 1; /* Destination address modifier */
139 
140 
141  /* The algorithm implementation is based on the lengths of the inputs. */
142  /* srcB is always made to slide across srcA. */
143  /* So srcBLen is always considered as shorter or equal to srcALen */
144  /* But CORR(x, y) is reverse of CORR(y, x) */
145  /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
146  /* and the destination pointer modifier, inc is set to -1 */
147  /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
148  /* But to improve the performance,
149  * we assume zeroes in the output instead of zero padding either of the the inputs*/
150  /* If srcALen > srcBLen,
151  * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */
152  /* If srcALen < srcBLen,
153  * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */
154  if(srcALen >= srcBLen)
155  {
156  /* Initialization of inputA pointer */
157  pIn1 = pSrcA;
158 
159  /* Initialization of inputB pointer */
160  pIn2 = pSrcB;
161 
162  /* Number of output samples is calculated */
163  outBlockSize = (2u * srcALen) - 1u;
164 
165  /* When srcALen > srcBLen, zero padding has to be done to srcB
166  * to make their lengths equal.
167  * Instead, (outBlockSize - (srcALen + srcBLen - 1))
168  * number of output samples are made zero */
169  j = outBlockSize - (srcALen + (srcBLen - 1u));
170 
171  /* Updating the pointer position to non zero value */
172  pOut += j;
173 
174  //while(j > 0u)
175  //{
176  // /* Zero is stored in the destination buffer */
177  // *pOut++ = 0.0f;
178 
179  // /* Decrement the loop counter */
180  // j--;
181  //}
182 
183  }
184  else
185  {
186  /* Initialization of inputA pointer */
187  pIn1 = pSrcB;
188 
189  /* Initialization of inputB pointer */
190  pIn2 = pSrcA;
191 
192  /* srcBLen is always considered as shorter or equal to srcALen */
193  j = srcBLen;
194  srcBLen = srcALen;
195  srcALen = j;
196 
197  /* CORR(x, y) = Reverse order(CORR(y, x)) */
198  /* Hence set the destination pointer to point to the last output sample */
199  pOut = pDst + ((srcALen + srcBLen) - 2u);
200 
201  /* Destination address modifier is set to -1 */
202  inc = -1;
203 
204  }
205 
206  /* The function is internally
207  * divided into three parts according to the number of multiplications that has to be
208  * taken place between inputA samples and inputB samples. In the first part of the
209  * algorithm, the multiplications increase by one for every iteration.
210  * In the second part of the algorithm, srcBLen number of multiplications are done.
211  * In the third part of the algorithm, the multiplications decrease by one
212  * for every iteration.*/
213  /* The algorithm is implemented in three stages.
214  * The loop counters of each stage is initiated here. */
215  blockSize1 = srcBLen - 1u;
216  blockSize2 = srcALen - (srcBLen - 1u);
217  blockSize3 = blockSize1;
218 
219  /* --------------------------
220  * Initializations of stage1
221  * -------------------------*/
222 
223  /* sum = x[0] * y[srcBlen - 1]
224  * sum = x[0] * y[srcBlen-2] + x[1] * y[srcBlen - 1]
225  * ....
226  * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
227  */
228 
229  /* In this stage the MAC operations are increased by 1 for every iteration.
230  The count variable holds the number of MAC operations performed */
231  count = 1u;
232 
233  /* Working pointer of inputA */
234  px = pIn1;
235 
236  /* Working pointer of inputB */
237  pSrc1 = pIn2 + (srcBLen - 1u);
238  py = pSrc1;
239 
240  /* ------------------------
241  * Stage1 process
242  * ----------------------*/
243 
244  /* The first stage starts here */
245  while(blockSize1 > 0u)
246  {
247  /* Accumulator is made zero for every iteration */
248  sum = 0.0f;
249 
250  /* Apply loop unrolling and compute 4 MACs simultaneously. */
251  k = count >> 2u;
252 
253  /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
254  ** a second loop below computes MACs for the remaining 1 to 3 samples. */
255  while(k > 0u)
256  {
257  /* x[0] * y[srcBLen - 4] */
258  sum += *px++ * *py++;
259  /* x[1] * y[srcBLen - 3] */
260  sum += *px++ * *py++;
261  /* x[2] * y[srcBLen - 2] */
262  sum += *px++ * *py++;
263  /* x[3] * y[srcBLen - 1] */
264  sum += *px++ * *py++;
265 
266  /* Decrement the loop counter */
267  k--;
268  }
269 
270  /* If the count is not a multiple of 4, compute any remaining MACs here.
271  ** No loop unrolling is used. */
272  k = count % 0x4u;
273 
274  while(k > 0u)
275  {
276  /* Perform the multiply-accumulate */
277  /* x[0] * y[srcBLen - 1] */
278  sum += *px++ * *py++;
279 
280  /* Decrement the loop counter */
281  k--;
282  }
283 
284  /* Store the result in the accumulator in the destination buffer. */
285  *pOut = sum;
286  /* Destination pointer is updated according to the address modifier, inc */
287  pOut += inc;
288 
289  /* Update the inputA and inputB pointers for next MAC calculation */
290  py = pSrc1 - count;
291  px = pIn1;
292 
293  /* Increment the MAC count */
294  count++;
295 
296  /* Decrement the loop counter */
297  blockSize1--;
298  }
299 
300  /* --------------------------
301  * Initializations of stage2
302  * ------------------------*/
303 
304  /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
305  * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
306  * ....
307  * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
308  */
309 
310  /* Working pointer of inputA */
311  px = pIn1;
312 
313  /* Working pointer of inputB */
314  py = pIn2;
315 
316  /* count is index by which the pointer pIn1 to be incremented */
317  count = 0u;
318 
319  /* -------------------
320  * Stage2 process
321  * ------------------*/
322 
323  /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
324  * So, to loop unroll over blockSize2,
325  * srcBLen should be greater than or equal to 4, to loop unroll the srcBLen loop */
326  if(srcBLen >= 4u)
327  {
328  /* Loop unroll over blockSize2, by 4 */
329  blkCnt = blockSize2 >> 2u;
330 
331  while(blkCnt > 0u)
332  {
333  /* Set all accumulators to zero */
334  acc0 = 0.0f;
335  acc1 = 0.0f;
336  acc2 = 0.0f;
337  acc3 = 0.0f;
338 
339  /* read x[0], x[1], x[2] samples */
340  x0 = *(px++);
341  x1 = *(px++);
342  x2 = *(px++);
343 
344  /* Apply loop unrolling and compute 4 MACs simultaneously. */
345  k = srcBLen >> 2u;
346 
347  /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
348  ** a second loop below computes MACs for the remaining 1 to 3 samples. */
349  do
350  {
351  /* Read y[0] sample */
352  c0 = *(py++);
353 
354  /* Read x[3] sample */
355  x3 = *(px++);
356 
357  /* Perform the multiply-accumulate */
358  /* acc0 += x[0] * y[0] */
359  acc0 += x0 * c0;
360  /* acc1 += x[1] * y[0] */
361  acc1 += x1 * c0;
362  /* acc2 += x[2] * y[0] */
363  acc2 += x2 * c0;
364  /* acc3 += x[3] * y[0] */
365  acc3 += x3 * c0;
366 
367  /* Read y[1] sample */
368  c0 = *(py++);
369 
370  /* Read x[4] sample */
371  x0 = *(px++);
372 
373  /* Perform the multiply-accumulate */
374  /* acc0 += x[1] * y[1] */
375  acc0 += x1 * c0;
376  /* acc1 += x[2] * y[1] */
377  acc1 += x2 * c0;
378  /* acc2 += x[3] * y[1] */
379  acc2 += x3 * c0;
380  /* acc3 += x[4] * y[1] */
381  acc3 += x0 * c0;
382 
383  /* Read y[2] sample */
384  c0 = *(py++);
385 
386  /* Read x[5] sample */
387  x1 = *(px++);
388 
389  /* Perform the multiply-accumulates */
390  /* acc0 += x[2] * y[2] */
391  acc0 += x2 * c0;
392  /* acc1 += x[3] * y[2] */
393  acc1 += x3 * c0;
394  /* acc2 += x[4] * y[2] */
395  acc2 += x0 * c0;
396  /* acc3 += x[5] * y[2] */
397  acc3 += x1 * c0;
398 
399  /* Read y[3] sample */
400  c0 = *(py++);
401 
402  /* Read x[6] sample */
403  x2 = *(px++);
404 
405  /* Perform the multiply-accumulates */
406  /* acc0 += x[3] * y[3] */
407  acc0 += x3 * c0;
408  /* acc1 += x[4] * y[3] */
409  acc1 += x0 * c0;
410  /* acc2 += x[5] * y[3] */
411  acc2 += x1 * c0;
412  /* acc3 += x[6] * y[3] */
413  acc3 += x2 * c0;
414 
415 
416  } while(--k);
417 
418  /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
419  ** No loop unrolling is used. */
420  k = srcBLen % 0x4u;
421 
422  while(k > 0u)
423  {
424  /* Read y[4] sample */
425  c0 = *(py++);
426 
427  /* Read x[7] sample */
428  x3 = *(px++);
429 
430  /* Perform the multiply-accumulates */
431  /* acc0 += x[4] * y[4] */
432  acc0 += x0 * c0;
433  /* acc1 += x[5] * y[4] */
434  acc1 += x1 * c0;
435  /* acc2 += x[6] * y[4] */
436  acc2 += x2 * c0;
437  /* acc3 += x[7] * y[4] */
438  acc3 += x3 * c0;
439 
440  /* Reuse the present samples for the next MAC */
441  x0 = x1;
442  x1 = x2;
443  x2 = x3;
444 
445  /* Decrement the loop counter */
446  k--;
447  }
448 
449  /* Store the result in the accumulator in the destination buffer. */
450  *pOut = acc0;
451  /* Destination pointer is updated according to the address modifier, inc */
452  pOut += inc;
453 
454  *pOut = acc1;
455  pOut += inc;
456 
457  *pOut = acc2;
458  pOut += inc;
459 
460  *pOut = acc3;
461  pOut += inc;
462 
463  /* Increment the pointer pIn1 index, count by 4 */
464  count += 4u;
465 
466  /* Update the inputA and inputB pointers for next MAC calculation */
467  px = pIn1 + count;
468  py = pIn2;
469 
470  /* Decrement the loop counter */
471  blkCnt--;
472  }
473 
474  /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
475  ** No loop unrolling is used. */
476  blkCnt = blockSize2 % 0x4u;
477 
478  while(blkCnt > 0u)
479  {
480  /* Accumulator is made zero for every iteration */
481  sum = 0.0f;
482 
483  /* Apply loop unrolling and compute 4 MACs simultaneously. */
484  k = srcBLen >> 2u;
485 
486  /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
487  ** a second loop below computes MACs for the remaining 1 to 3 samples. */
488  while(k > 0u)
489  {
490  /* Perform the multiply-accumulates */
491  sum += *px++ * *py++;
492  sum += *px++ * *py++;
493  sum += *px++ * *py++;
494  sum += *px++ * *py++;
495 
496  /* Decrement the loop counter */
497  k--;
498  }
499 
500  /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
501  ** No loop unrolling is used. */
502  k = srcBLen % 0x4u;
503 
504  while(k > 0u)
505  {
506  /* Perform the multiply-accumulate */
507  sum += *px++ * *py++;
508 
509  /* Decrement the loop counter */
510  k--;
511  }
512 
513  /* Store the result in the accumulator in the destination buffer. */
514  *pOut = sum;
515  /* Destination pointer is updated according to the address modifier, inc */
516  pOut += inc;
517 
518  /* Increment the pointer pIn1 index, count by 1 */
519  count++;
520 
521  /* Update the inputA and inputB pointers for next MAC calculation */
522  px = pIn1 + count;
523  py = pIn2;
524 
525  /* Decrement the loop counter */
526  blkCnt--;
527  }
528  }
529  else
530  {
531  /* If the srcBLen is not a multiple of 4,
532  * the blockSize2 loop cannot be unrolled by 4 */
533  blkCnt = blockSize2;
534 
535  while(blkCnt > 0u)
536  {
537  /* Accumulator is made zero for every iteration */
538  sum = 0.0f;
539 
540  /* Loop over srcBLen */
541  k = srcBLen;
542 
543  while(k > 0u)
544  {
545  /* Perform the multiply-accumulate */
546  sum += *px++ * *py++;
547 
548  /* Decrement the loop counter */
549  k--;
550  }
551 
552  /* Store the result in the accumulator in the destination buffer. */
553  *pOut = sum;
554  /* Destination pointer is updated according to the address modifier, inc */
555  pOut += inc;
556 
557  /* Increment the pointer pIn1 index, count by 1 */
558  count++;
559 
560  /* Update the inputA and inputB pointers for next MAC calculation */
561  px = pIn1 + count;
562  py = pIn2;
563 
564  /* Decrement the loop counter */
565  blkCnt--;
566  }
567  }
568 
569  /* --------------------------
570  * Initializations of stage3
571  * -------------------------*/
572 
573  /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
574  * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
575  * ....
576  * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
577  * sum += x[srcALen-1] * y[0]
578  */
579 
580  /* In this stage the MAC operations are decreased by 1 for every iteration.
581  The count variable holds the number of MAC operations performed */
582  count = srcBLen - 1u;
583 
584  /* Working pointer of inputA */
585  pSrc1 = pIn1 + (srcALen - (srcBLen - 1u));
586  px = pSrc1;
587 
588  /* Working pointer of inputB */
589  py = pIn2;
590 
591  /* -------------------
592  * Stage3 process
593  * ------------------*/
594 
595  while(blockSize3 > 0u)
596  {
597  /* Accumulator is made zero for every iteration */
598  sum = 0.0f;
599 
600  /* Apply loop unrolling and compute 4 MACs simultaneously. */
601  k = count >> 2u;
602 
603  /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
604  ** a second loop below computes MACs for the remaining 1 to 3 samples. */
605  while(k > 0u)
606  {
607  /* Perform the multiply-accumulates */
608  /* sum += x[srcALen - srcBLen + 4] * y[3] */
609  sum += *px++ * *py++;
610  /* sum += x[srcALen - srcBLen + 3] * y[2] */
611  sum += *px++ * *py++;
612  /* sum += x[srcALen - srcBLen + 2] * y[1] */
613  sum += *px++ * *py++;
614  /* sum += x[srcALen - srcBLen + 1] * y[0] */
615  sum += *px++ * *py++;
616 
617  /* Decrement the loop counter */
618  k--;
619  }
620 
621  /* If the count is not a multiple of 4, compute any remaining MACs here.
622  ** No loop unrolling is used. */
623  k = count % 0x4u;
624 
625  while(k > 0u)
626  {
627  /* Perform the multiply-accumulates */
628  sum += *px++ * *py++;
629 
630  /* Decrement the loop counter */
631  k--;
632  }
633 
634  /* Store the result in the accumulator in the destination buffer. */
635  *pOut = sum;
636  /* Destination pointer is updated according to the address modifier, inc */
637  pOut += inc;
638 
639  /* Update the inputA and inputB pointers for next MAC calculation */
640  px = ++pSrc1;
641  py = pIn2;
642 
643  /* Decrement the MAC count */
644  count--;
645 
646  /* Decrement the loop counter */
647  blockSize3--;
648  }
649 
650 #else
651 
652  /* Run the below code for Cortex-M0 */
653 
654  float32_t *pIn1 = pSrcA; /* inputA pointer */
655  float32_t *pIn2 = pSrcB + (srcBLen - 1u); /* inputB pointer */
656  float32_t sum; /* Accumulator */
657  uint32_t i = 0u, j; /* loop counters */
658  uint32_t inv = 0u; /* Reverse order flag */
659  uint32_t tot = 0u; /* Length */
660 
661  /* The algorithm implementation is based on the lengths of the inputs. */
662  /* srcB is always made to slide across srcA. */
663  /* So srcBLen is always considered as shorter or equal to srcALen */
664  /* But CORR(x, y) is reverse of CORR(y, x) */
665  /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
666  /* and a varaible, inv is set to 1 */
667  /* If lengths are not equal then zero pad has to be done to make the two
668  * inputs of same length. But to improve the performance, we assume zeroes
669  * in the output instead of zero padding either of the the inputs*/
670  /* If srcALen > srcBLen, (srcALen - srcBLen) zeroes has to included in the
671  * starting of the output buffer */
672  /* If srcALen < srcBLen, (srcALen - srcBLen) zeroes has to included in the
673  * ending of the output buffer */
674  /* Once the zero padding is done the remaining of the output is calcualted
675  * using convolution but with the shorter signal time shifted. */
676 
677  /* Calculate the length of the remaining sequence */
678  tot = ((srcALen + srcBLen) - 2u);
679 
680  if(srcALen > srcBLen)
681  {
682  /* Calculating the number of zeros to be padded to the output */
683  j = srcALen - srcBLen;
684 
685  /* Initialise the pointer after zero padding */
686  pDst += j;
687  }
688 
689  else if(srcALen < srcBLen)
690  {
691  /* Initialization to inputB pointer */
692  pIn1 = pSrcB;
693 
694  /* Initialization to the end of inputA pointer */
695  pIn2 = pSrcA + (srcALen - 1u);
696 
697  /* Initialisation of the pointer after zero padding */
698  pDst = pDst + tot;
699 
700  /* Swapping the lengths */
701  j = srcALen;
702  srcALen = srcBLen;
703  srcBLen = j;
704 
705  /* Setting the reverse flag */
706  inv = 1;
707 
708  }
709 
710  /* Loop to calculate convolution for output length number of times */
711  for (i = 0u; i <= tot; i++)
712  {
713  /* Initialize sum with zero to carry on MAC operations */
714  sum = 0.0f;
715 
716  /* Loop to perform MAC operations according to convolution equation */
717  for (j = 0u; j <= i; j++)
718  {
719  /* Check the array limitations */
720  if((((i - j) < srcBLen) && (j < srcALen)))
721  {
722  /* z[i] += x[i-j] * y[j] */
723  sum += pIn1[j] * pIn2[-((int32_t) i - j)];
724  }
725  }
726  /* Store the output in the destination buffer */
727  if(inv == 1)
728  *pDst-- = sum;
729  else
730  *pDst++ = sum;
731  }
732 
733 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
734 
735 }
736 
float float32_t
32-bit floating-point type definition.
Definition: arm_math.h:407
void arm_correlate_f32(float32_t *pSrcA, uint32_t srcALen, float32_t *pSrcB, uint32_t srcBLen, float32_t *pDst)
Correlation of floating-point sequences.