STM32F769IDiscovery  1.00
uDANTE Audio Networking with STM32F7 DISCO board
arm_correlate_q31.c
Go to the documentation of this file.
1 /* ----------------------------------------------------------------------
2 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.
3 *
4 * $Date: 19. March 2015
5 * $Revision: V.1.4.5
6 *
7 * Project: CMSIS DSP Library
8 * Title: arm_correlate_q31.c
9 *
10 * Description: Correlation of Q31 sequences.
11 *
12 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 * - Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * - Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in
21 * the documentation and/or other materials provided with the
22 * distribution.
23 * - Neither the name of ARM LIMITED nor the names of its contributors
24 * may be used to endorse or promote products derived from this
25 * software without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 * POSSIBILITY OF SUCH DAMAGE.
39 * -------------------------------------------------------------------- */
40 
41 #include "arm_math.h"
42 
79  q31_t * pSrcA,
80  uint32_t srcALen,
81  q31_t * pSrcB,
82  uint32_t srcBLen,
83  q31_t * pDst)
84 {
85 
86 #ifndef ARM_MATH_CM0_FAMILY
87 
88  /* Run the below code for Cortex-M4 and Cortex-M3 */
89 
90  q31_t *pIn1; /* inputA pointer */
91  q31_t *pIn2; /* inputB pointer */
92  q31_t *pOut = pDst; /* output pointer */
93  q31_t *px; /* Intermediate inputA pointer */
94  q31_t *py; /* Intermediate inputB pointer */
95  q31_t *pSrc1; /* Intermediate pointers */
96  q63_t sum, acc0, acc1, acc2; /* Accumulators */
97  q31_t x0, x1, x2, c0; /* temporary variables for holding input and coefficient values */
98  uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3; /* loop counter */
99  int32_t inc = 1; /* Destination address modifier */
100 
101 
102  /* The algorithm implementation is based on the lengths of the inputs. */
103  /* srcB is always made to slide across srcA. */
104  /* So srcBLen is always considered as shorter or equal to srcALen */
105  /* But CORR(x, y) is reverse of CORR(y, x) */
106  /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
107  /* and the destination pointer modifier, inc is set to -1 */
108  /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
109  /* But to improve the performance,
110  * we include zeroes in the output instead of zero padding either of the the inputs*/
111  /* If srcALen > srcBLen,
112  * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */
113  /* If srcALen < srcBLen,
114  * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */
115  if(srcALen >= srcBLen)
116  {
117  /* Initialization of inputA pointer */
118  pIn1 = (pSrcA);
119 
120  /* Initialization of inputB pointer */
121  pIn2 = (pSrcB);
122 
123  /* Number of output samples is calculated */
124  outBlockSize = (2u * srcALen) - 1u;
125 
126  /* When srcALen > srcBLen, zero padding is done to srcB
127  * to make their lengths equal.
128  * Instead, (outBlockSize - (srcALen + srcBLen - 1))
129  * number of output samples are made zero */
130  j = outBlockSize - (srcALen + (srcBLen - 1u));
131 
132  /* Updating the pointer position to non zero value */
133  pOut += j;
134 
135  }
136  else
137  {
138  /* Initialization of inputA pointer */
139  pIn1 = (pSrcB);
140 
141  /* Initialization of inputB pointer */
142  pIn2 = (pSrcA);
143 
144  /* srcBLen is always considered as shorter or equal to srcALen */
145  j = srcBLen;
146  srcBLen = srcALen;
147  srcALen = j;
148 
149  /* CORR(x, y) = Reverse order(CORR(y, x)) */
150  /* Hence set the destination pointer to point to the last output sample */
151  pOut = pDst + ((srcALen + srcBLen) - 2u);
152 
153  /* Destination address modifier is set to -1 */
154  inc = -1;
155 
156  }
157 
158  /* The function is internally
159  * divided into three parts according to the number of multiplications that has to be
160  * taken place between inputA samples and inputB samples. In the first part of the
161  * algorithm, the multiplications increase by one for every iteration.
162  * In the second part of the algorithm, srcBLen number of multiplications are done.
163  * In the third part of the algorithm, the multiplications decrease by one
164  * for every iteration.*/
165  /* The algorithm is implemented in three stages.
166  * The loop counters of each stage is initiated here. */
167  blockSize1 = srcBLen - 1u;
168  blockSize2 = srcALen - (srcBLen - 1u);
169  blockSize3 = blockSize1;
170 
171  /* --------------------------
172  * Initializations of stage1
173  * -------------------------*/
174 
175  /* sum = x[0] * y[srcBlen - 1]
176  * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1]
177  * ....
178  * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
179  */
180 
181  /* In this stage the MAC operations are increased by 1 for every iteration.
182  The count variable holds the number of MAC operations performed */
183  count = 1u;
184 
185  /* Working pointer of inputA */
186  px = pIn1;
187 
188  /* Working pointer of inputB */
189  pSrc1 = pIn2 + (srcBLen - 1u);
190  py = pSrc1;
191 
192  /* ------------------------
193  * Stage1 process
194  * ----------------------*/
195 
196  /* The first stage starts here */
197  while(blockSize1 > 0u)
198  {
199  /* Accumulator is made zero for every iteration */
200  sum = 0;
201 
202  /* Apply loop unrolling and compute 4 MACs simultaneously. */
203  k = count >> 2;
204 
205  /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
206  ** a second loop below computes MACs for the remaining 1 to 3 samples. */
207  while(k > 0u)
208  {
209  /* x[0] * y[srcBLen - 4] */
210  sum += (q63_t) * px++ * (*py++);
211  /* x[1] * y[srcBLen - 3] */
212  sum += (q63_t) * px++ * (*py++);
213  /* x[2] * y[srcBLen - 2] */
214  sum += (q63_t) * px++ * (*py++);
215  /* x[3] * y[srcBLen - 1] */
216  sum += (q63_t) * px++ * (*py++);
217 
218  /* Decrement the loop counter */
219  k--;
220  }
221 
222  /* If the count is not a multiple of 4, compute any remaining MACs here.
223  ** No loop unrolling is used. */
224  k = count % 0x4u;
225 
226  while(k > 0u)
227  {
228  /* Perform the multiply-accumulates */
229  /* x[0] * y[srcBLen - 1] */
230  sum += (q63_t) * px++ * (*py++);
231 
232  /* Decrement the loop counter */
233  k--;
234  }
235 
236  /* Store the result in the accumulator in the destination buffer. */
237  *pOut = (q31_t) (sum >> 31);
238  /* Destination pointer is updated according to the address modifier, inc */
239  pOut += inc;
240 
241  /* Update the inputA and inputB pointers for next MAC calculation */
242  py = pSrc1 - count;
243  px = pIn1;
244 
245  /* Increment the MAC count */
246  count++;
247 
248  /* Decrement the loop counter */
249  blockSize1--;
250  }
251 
252  /* --------------------------
253  * Initializations of stage2
254  * ------------------------*/
255 
256  /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
257  * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
258  * ....
259  * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
260  */
261 
262  /* Working pointer of inputA */
263  px = pIn1;
264 
265  /* Working pointer of inputB */
266  py = pIn2;
267 
268  /* count is index by which the pointer pIn1 to be incremented */
269  count = 0u;
270 
271  /* -------------------
272  * Stage2 process
273  * ------------------*/
274 
275  /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
276  * So, to loop unroll over blockSize2,
277  * srcBLen should be greater than or equal to 4 */
278  if(srcBLen >= 4u)
279  {
280  /* Loop unroll by 3 */
281  blkCnt = blockSize2 / 3;
282 
283  while(blkCnt > 0u)
284  {
285  /* Set all accumulators to zero */
286  acc0 = 0;
287  acc1 = 0;
288  acc2 = 0;
289 
290  /* read x[0], x[1] samples */
291  x0 = *(px++);
292  x1 = *(px++);
293 
294  /* Apply loop unrolling and compute 3 MACs simultaneously. */
295  k = srcBLen / 3;
296 
297  /* First part of the processing with loop unrolling. Compute 3 MACs at a time.
298  ** a second loop below computes MACs for the remaining 1 to 2 samples. */
299  do
300  {
301  /* Read y[0] sample */
302  c0 = *(py);
303 
304  /* Read x[2] sample */
305  x2 = *(px);
306 
307  /* Perform the multiply-accumulate */
308  /* acc0 += x[0] * y[0] */
309  acc0 += ((q63_t) x0 * c0);
310  /* acc1 += x[1] * y[0] */
311  acc1 += ((q63_t) x1 * c0);
312  /* acc2 += x[2] * y[0] */
313  acc2 += ((q63_t) x2 * c0);
314 
315  /* Read y[1] sample */
316  c0 = *(py + 1u);
317 
318  /* Read x[3] sample */
319  x0 = *(px + 1u);
320 
321  /* Perform the multiply-accumulates */
322  /* acc0 += x[1] * y[1] */
323  acc0 += ((q63_t) x1 * c0);
324  /* acc1 += x[2] * y[1] */
325  acc1 += ((q63_t) x2 * c0);
326  /* acc2 += x[3] * y[1] */
327  acc2 += ((q63_t) x0 * c0);
328 
329  /* Read y[2] sample */
330  c0 = *(py + 2u);
331 
332  /* Read x[4] sample */
333  x1 = *(px + 2u);
334 
335  /* Perform the multiply-accumulates */
336  /* acc0 += x[2] * y[2] */
337  acc0 += ((q63_t) x2 * c0);
338  /* acc1 += x[3] * y[2] */
339  acc1 += ((q63_t) x0 * c0);
340  /* acc2 += x[4] * y[2] */
341  acc2 += ((q63_t) x1 * c0);
342 
343  /* update scratch pointers */
344  px += 3u;
345  py += 3u;
346 
347  } while(--k);
348 
349  /* If the srcBLen is not a multiple of 3, compute any remaining MACs here.
350  ** No loop unrolling is used. */
351  k = srcBLen - (3 * (srcBLen / 3));
352 
353  while(k > 0u)
354  {
355  /* Read y[4] sample */
356  c0 = *(py++);
357 
358  /* Read x[7] sample */
359  x2 = *(px++);
360 
361  /* Perform the multiply-accumulates */
362  /* acc0 += x[4] * y[4] */
363  acc0 += ((q63_t) x0 * c0);
364  /* acc1 += x[5] * y[4] */
365  acc1 += ((q63_t) x1 * c0);
366  /* acc2 += x[6] * y[4] */
367  acc2 += ((q63_t) x2 * c0);
368 
369  /* Reuse the present samples for the next MAC */
370  x0 = x1;
371  x1 = x2;
372 
373  /* Decrement the loop counter */
374  k--;
375  }
376 
377  /* Store the result in the accumulator in the destination buffer. */
378  *pOut = (q31_t) (acc0 >> 31);
379  /* Destination pointer is updated according to the address modifier, inc */
380  pOut += inc;
381 
382  *pOut = (q31_t) (acc1 >> 31);
383  pOut += inc;
384 
385  *pOut = (q31_t) (acc2 >> 31);
386  pOut += inc;
387 
388  /* Increment the pointer pIn1 index, count by 3 */
389  count += 3u;
390 
391  /* Update the inputA and inputB pointers for next MAC calculation */
392  px = pIn1 + count;
393  py = pIn2;
394 
395 
396  /* Decrement the loop counter */
397  blkCnt--;
398  }
399 
400  /* If the blockSize2 is not a multiple of 3, compute any remaining output samples here.
401  ** No loop unrolling is used. */
402  blkCnt = blockSize2 - 3 * (blockSize2 / 3);
403 
404  while(blkCnt > 0u)
405  {
406  /* Accumulator is made zero for every iteration */
407  sum = 0;
408 
409  /* Apply loop unrolling and compute 4 MACs simultaneously. */
410  k = srcBLen >> 2u;
411 
412  /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
413  ** a second loop below computes MACs for the remaining 1 to 3 samples. */
414  while(k > 0u)
415  {
416  /* Perform the multiply-accumulates */
417  sum += (q63_t) * px++ * (*py++);
418  sum += (q63_t) * px++ * (*py++);
419  sum += (q63_t) * px++ * (*py++);
420  sum += (q63_t) * px++ * (*py++);
421 
422  /* Decrement the loop counter */
423  k--;
424  }
425 
426  /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
427  ** No loop unrolling is used. */
428  k = srcBLen % 0x4u;
429 
430  while(k > 0u)
431  {
432  /* Perform the multiply-accumulate */
433  sum += (q63_t) * px++ * (*py++);
434 
435  /* Decrement the loop counter */
436  k--;
437  }
438 
439  /* Store the result in the accumulator in the destination buffer. */
440  *pOut = (q31_t) (sum >> 31);
441  /* Destination pointer is updated according to the address modifier, inc */
442  pOut += inc;
443 
444  /* Increment the MAC count */
445  count++;
446 
447  /* Update the inputA and inputB pointers for next MAC calculation */
448  px = pIn1 + count;
449  py = pIn2;
450 
451  /* Decrement the loop counter */
452  blkCnt--;
453  }
454  }
455  else
456  {
457  /* If the srcBLen is not a multiple of 4,
458  * the blockSize2 loop cannot be unrolled by 4 */
459  blkCnt = blockSize2;
460 
461  while(blkCnt > 0u)
462  {
463  /* Accumulator is made zero for every iteration */
464  sum = 0;
465 
466  /* Loop over srcBLen */
467  k = srcBLen;
468 
469  while(k > 0u)
470  {
471  /* Perform the multiply-accumulate */
472  sum += (q63_t) * px++ * (*py++);
473 
474  /* Decrement the loop counter */
475  k--;
476  }
477 
478  /* Store the result in the accumulator in the destination buffer. */
479  *pOut = (q31_t) (sum >> 31);
480  /* Destination pointer is updated according to the address modifier, inc */
481  pOut += inc;
482 
483  /* Increment the MAC count */
484  count++;
485 
486  /* Update the inputA and inputB pointers for next MAC calculation */
487  px = pIn1 + count;
488  py = pIn2;
489 
490  /* Decrement the loop counter */
491  blkCnt--;
492  }
493  }
494 
495  /* --------------------------
496  * Initializations of stage3
497  * -------------------------*/
498 
499  /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
500  * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
501  * ....
502  * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
503  * sum += x[srcALen-1] * y[0]
504  */
505 
506  /* In this stage the MAC operations are decreased by 1 for every iteration.
507  The count variable holds the number of MAC operations performed */
508  count = srcBLen - 1u;
509 
510  /* Working pointer of inputA */
511  pSrc1 = pIn1 + (srcALen - (srcBLen - 1u));
512  px = pSrc1;
513 
514  /* Working pointer of inputB */
515  py = pIn2;
516 
517  /* -------------------
518  * Stage3 process
519  * ------------------*/
520 
521  while(blockSize3 > 0u)
522  {
523  /* Accumulator is made zero for every iteration */
524  sum = 0;
525 
526  /* Apply loop unrolling and compute 4 MACs simultaneously. */
527  k = count >> 2u;
528 
529  /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
530  ** a second loop below computes MACs for the remaining 1 to 3 samples. */
531  while(k > 0u)
532  {
533  /* Perform the multiply-accumulates */
534  /* sum += x[srcALen - srcBLen + 4] * y[3] */
535  sum += (q63_t) * px++ * (*py++);
536  /* sum += x[srcALen - srcBLen + 3] * y[2] */
537  sum += (q63_t) * px++ * (*py++);
538  /* sum += x[srcALen - srcBLen + 2] * y[1] */
539  sum += (q63_t) * px++ * (*py++);
540  /* sum += x[srcALen - srcBLen + 1] * y[0] */
541  sum += (q63_t) * px++ * (*py++);
542 
543  /* Decrement the loop counter */
544  k--;
545  }
546 
547  /* If the count is not a multiple of 4, compute any remaining MACs here.
548  ** No loop unrolling is used. */
549  k = count % 0x4u;
550 
551  while(k > 0u)
552  {
553  /* Perform the multiply-accumulates */
554  sum += (q63_t) * px++ * (*py++);
555 
556  /* Decrement the loop counter */
557  k--;
558  }
559 
560  /* Store the result in the accumulator in the destination buffer. */
561  *pOut = (q31_t) (sum >> 31);
562  /* Destination pointer is updated according to the address modifier, inc */
563  pOut += inc;
564 
565  /* Update the inputA and inputB pointers for next MAC calculation */
566  px = ++pSrc1;
567  py = pIn2;
568 
569  /* Decrement the MAC count */
570  count--;
571 
572  /* Decrement the loop counter */
573  blockSize3--;
574  }
575 
576 #else
577 
578  /* Run the below code for Cortex-M0 */
579 
580  q31_t *pIn1 = pSrcA; /* inputA pointer */
581  q31_t *pIn2 = pSrcB + (srcBLen - 1u); /* inputB pointer */
582  q63_t sum; /* Accumulators */
583  uint32_t i = 0u, j; /* loop counters */
584  uint32_t inv = 0u; /* Reverse order flag */
585  uint32_t tot = 0u; /* Length */
586 
587  /* The algorithm implementation is based on the lengths of the inputs. */
588  /* srcB is always made to slide across srcA. */
589  /* So srcBLen is always considered as shorter or equal to srcALen */
590  /* But CORR(x, y) is reverse of CORR(y, x) */
591  /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
592  /* and a varaible, inv is set to 1 */
593  /* If lengths are not equal then zero pad has to be done to make the two
594  * inputs of same length. But to improve the performance, we include zeroes
595  * in the output instead of zero padding either of the the inputs*/
596  /* If srcALen > srcBLen, (srcALen - srcBLen) zeroes has to included in the
597  * starting of the output buffer */
598  /* If srcALen < srcBLen, (srcALen - srcBLen) zeroes has to included in the
599  * ending of the output buffer */
600  /* Once the zero padding is done the remaining of the output is calcualted
601  * using correlation but with the shorter signal time shifted. */
602 
603  /* Calculate the length of the remaining sequence */
604  tot = ((srcALen + srcBLen) - 2u);
605 
606  if(srcALen > srcBLen)
607  {
608  /* Calculating the number of zeros to be padded to the output */
609  j = srcALen - srcBLen;
610 
611  /* Initialise the pointer after zero padding */
612  pDst += j;
613  }
614 
615  else if(srcALen < srcBLen)
616  {
617  /* Initialization to inputB pointer */
618  pIn1 = pSrcB;
619 
620  /* Initialization to the end of inputA pointer */
621  pIn2 = pSrcA + (srcALen - 1u);
622 
623  /* Initialisation of the pointer after zero padding */
624  pDst = pDst + tot;
625 
626  /* Swapping the lengths */
627  j = srcALen;
628  srcALen = srcBLen;
629  srcBLen = j;
630 
631  /* Setting the reverse flag */
632  inv = 1;
633 
634  }
635 
636  /* Loop to calculate correlation for output length number of times */
637  for (i = 0u; i <= tot; i++)
638  {
639  /* Initialize sum with zero to carry on MAC operations */
640  sum = 0;
641 
642  /* Loop to perform MAC operations according to correlation equation */
643  for (j = 0u; j <= i; j++)
644  {
645  /* Check the array limitations */
646  if((((i - j) < srcBLen) && (j < srcALen)))
647  {
648  /* z[i] += x[i-j] * y[j] */
649  sum += ((q63_t) pIn1[j] * pIn2[-((int32_t) i - j)]);
650  }
651  }
652  /* Store the output in the destination buffer */
653  if(inv == 1)
654  *pDst-- = (q31_t) (sum >> 31u);
655  else
656  *pDst++ = (q31_t) (sum >> 31u);
657  }
658 
659 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
660 
661 }
662 
int64_t q63_t
64-bit fractional data type in 1.63 format.
Definition: arm_math.h:402
void arm_correlate_q31(q31_t *pSrcA, uint32_t srcALen, q31_t *pSrcB, uint32_t srcBLen, q31_t *pDst)
Correlation of Q31 sequences.
int32_t q31_t
32-bit fractional data type in 1.31 format.
Definition: arm_math.h:397