STM32F769IDiscovery  1.00
uDANTE Audio Networking with STM32F7 DISCO board
arm_correlate_fast_q31.c
Go to the documentation of this file.
1 /* ----------------------------------------------------------------------
2 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.
3 *
4 * $Date: 19. March 2015
5 * $Revision: V.1.4.5
6 *
7 * Project: CMSIS DSP Library
8 * Title: arm_correlate_fast_q31.c
9 *
10 * Description: Fast Q31 Correlation.
11 *
12 * Target Processor: Cortex-M4/Cortex-M3
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 * - Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * - Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in
21 * the documentation and/or other materials provided with the
22 * distribution.
23 * - Neither the name of ARM LIMITED nor the names of its contributors
24 * may be used to endorse or promote products derived from this
25 * software without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 * POSSIBILITY OF SUCH DAMAGE.
39 * -------------------------------------------------------------------- */
40 
41 #include "arm_math.h"
42 
82  q31_t * pSrcA,
83  uint32_t srcALen,
84  q31_t * pSrcB,
85  uint32_t srcBLen,
86  q31_t * pDst)
87 {
88  q31_t *pIn1; /* inputA pointer */
89  q31_t *pIn2; /* inputB pointer */
90  q31_t *pOut = pDst; /* output pointer */
91  q31_t *px; /* Intermediate inputA pointer */
92  q31_t *py; /* Intermediate inputB pointer */
93  q31_t *pSrc1; /* Intermediate pointers */
94  q31_t sum, acc0, acc1, acc2, acc3; /* Accumulators */
95  q31_t x0, x1, x2, x3, c0; /* temporary variables for holding input and coefficient values */
96  uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3; /* loop counter */
97  int32_t inc = 1; /* Destination address modifier */
98 
99 
100  /* The algorithm implementation is based on the lengths of the inputs. */
101  /* srcB is always made to slide across srcA. */
102  /* So srcBLen is always considered as shorter or equal to srcALen */
103  if(srcALen >= srcBLen)
104  {
105  /* Initialization of inputA pointer */
106  pIn1 = (pSrcA);
107 
108  /* Initialization of inputB pointer */
109  pIn2 = (pSrcB);
110 
111  /* Number of output samples is calculated */
112  outBlockSize = (2u * srcALen) - 1u;
113 
114  /* When srcALen > srcBLen, zero padding is done to srcB
115  * to make their lengths equal.
116  * Instead, (outBlockSize - (srcALen + srcBLen - 1))
117  * number of output samples are made zero */
118  j = outBlockSize - (srcALen + (srcBLen - 1u));
119 
120  /* Updating the pointer position to non zero value */
121  pOut += j;
122 
123  }
124  else
125  {
126  /* Initialization of inputA pointer */
127  pIn1 = (pSrcB);
128 
129  /* Initialization of inputB pointer */
130  pIn2 = (pSrcA);
131 
132  /* srcBLen is always considered as shorter or equal to srcALen */
133  j = srcBLen;
134  srcBLen = srcALen;
135  srcALen = j;
136 
137  /* CORR(x, y) = Reverse order(CORR(y, x)) */
138  /* Hence set the destination pointer to point to the last output sample */
139  pOut = pDst + ((srcALen + srcBLen) - 2u);
140 
141  /* Destination address modifier is set to -1 */
142  inc = -1;
143 
144  }
145 
146  /* The function is internally
147  * divided into three parts according to the number of multiplications that has to be
148  * taken place between inputA samples and inputB samples. In the first part of the
149  * algorithm, the multiplications increase by one for every iteration.
150  * In the second part of the algorithm, srcBLen number of multiplications are done.
151  * In the third part of the algorithm, the multiplications decrease by one
152  * for every iteration.*/
153  /* The algorithm is implemented in three stages.
154  * The loop counters of each stage is initiated here. */
155  blockSize1 = srcBLen - 1u;
156  blockSize2 = srcALen - (srcBLen - 1u);
157  blockSize3 = blockSize1;
158 
159  /* --------------------------
160  * Initializations of stage1
161  * -------------------------*/
162 
163  /* sum = x[0] * y[srcBlen - 1]
164  * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1]
165  * ....
166  * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
167  */
168 
169  /* In this stage the MAC operations are increased by 1 for every iteration.
170  The count variable holds the number of MAC operations performed */
171  count = 1u;
172 
173  /* Working pointer of inputA */
174  px = pIn1;
175 
176  /* Working pointer of inputB */
177  pSrc1 = pIn2 + (srcBLen - 1u);
178  py = pSrc1;
179 
180  /* ------------------------
181  * Stage1 process
182  * ----------------------*/
183 
184  /* The first stage starts here */
185  while(blockSize1 > 0u)
186  {
187  /* Accumulator is made zero for every iteration */
188  sum = 0;
189 
190  /* Apply loop unrolling and compute 4 MACs simultaneously. */
191  k = count >> 2;
192 
193  /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
194  ** a second loop below computes MACs for the remaining 1 to 3 samples. */
195  while(k > 0u)
196  {
197  /* x[0] * y[srcBLen - 4] */
198  sum = (q31_t) ((((q63_t) sum << 32) +
199  ((q63_t) * px++ * (*py++))) >> 32);
200  /* x[1] * y[srcBLen - 3] */
201  sum = (q31_t) ((((q63_t) sum << 32) +
202  ((q63_t) * px++ * (*py++))) >> 32);
203  /* x[2] * y[srcBLen - 2] */
204  sum = (q31_t) ((((q63_t) sum << 32) +
205  ((q63_t) * px++ * (*py++))) >> 32);
206  /* x[3] * y[srcBLen - 1] */
207  sum = (q31_t) ((((q63_t) sum << 32) +
208  ((q63_t) * px++ * (*py++))) >> 32);
209 
210  /* Decrement the loop counter */
211  k--;
212  }
213 
214  /* If the count is not a multiple of 4, compute any remaining MACs here.
215  ** No loop unrolling is used. */
216  k = count % 0x4u;
217 
218  while(k > 0u)
219  {
220  /* Perform the multiply-accumulates */
221  /* x[0] * y[srcBLen - 1] */
222  sum = (q31_t) ((((q63_t) sum << 32) +
223  ((q63_t) * px++ * (*py++))) >> 32);
224 
225  /* Decrement the loop counter */
226  k--;
227  }
228 
229  /* Store the result in the accumulator in the destination buffer. */
230  *pOut = sum << 1;
231  /* Destination pointer is updated according to the address modifier, inc */
232  pOut += inc;
233 
234  /* Update the inputA and inputB pointers for next MAC calculation */
235  py = pSrc1 - count;
236  px = pIn1;
237 
238  /* Increment the MAC count */
239  count++;
240 
241  /* Decrement the loop counter */
242  blockSize1--;
243  }
244 
245  /* --------------------------
246  * Initializations of stage2
247  * ------------------------*/
248 
249  /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
250  * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
251  * ....
252  * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
253  */
254 
255  /* Working pointer of inputA */
256  px = pIn1;
257 
258  /* Working pointer of inputB */
259  py = pIn2;
260 
261  /* count is index by which the pointer pIn1 to be incremented */
262  count = 0u;
263 
264  /* -------------------
265  * Stage2 process
266  * ------------------*/
267 
268  /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
269  * So, to loop unroll over blockSize2,
270  * srcBLen should be greater than or equal to 4 */
271  if(srcBLen >= 4u)
272  {
273  /* Loop unroll over blockSize2, by 4 */
274  blkCnt = blockSize2 >> 2u;
275 
276  while(blkCnt > 0u)
277  {
278  /* Set all accumulators to zero */
279  acc0 = 0;
280  acc1 = 0;
281  acc2 = 0;
282  acc3 = 0;
283 
284  /* read x[0], x[1], x[2] samples */
285  x0 = *(px++);
286  x1 = *(px++);
287  x2 = *(px++);
288 
289  /* Apply loop unrolling and compute 4 MACs simultaneously. */
290  k = srcBLen >> 2u;
291 
292  /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
293  ** a second loop below computes MACs for the remaining 1 to 3 samples. */
294  do
295  {
296  /* Read y[0] sample */
297  c0 = *(py++);
298 
299  /* Read x[3] sample */
300  x3 = *(px++);
301 
302  /* Perform the multiply-accumulate */
303  /* acc0 += x[0] * y[0] */
304  acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
305  /* acc1 += x[1] * y[0] */
306  acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
307  /* acc2 += x[2] * y[0] */
308  acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
309  /* acc3 += x[3] * y[0] */
310  acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
311 
312  /* Read y[1] sample */
313  c0 = *(py++);
314 
315  /* Read x[4] sample */
316  x0 = *(px++);
317 
318  /* Perform the multiply-accumulates */
319  /* acc0 += x[1] * y[1] */
320  acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x1 * c0)) >> 32);
321  /* acc1 += x[2] * y[1] */
322  acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x2 * c0)) >> 32);
323  /* acc2 += x[3] * y[1] */
324  acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x3 * c0)) >> 32);
325  /* acc3 += x[4] * y[1] */
326  acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x0 * c0)) >> 32);
327 
328  /* Read y[2] sample */
329  c0 = *(py++);
330 
331  /* Read x[5] sample */
332  x1 = *(px++);
333 
334  /* Perform the multiply-accumulates */
335  /* acc0 += x[2] * y[2] */
336  acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x2 * c0)) >> 32);
337  /* acc1 += x[3] * y[2] */
338  acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x3 * c0)) >> 32);
339  /* acc2 += x[4] * y[2] */
340  acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x0 * c0)) >> 32);
341  /* acc3 += x[5] * y[2] */
342  acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x1 * c0)) >> 32);
343 
344  /* Read y[3] sample */
345  c0 = *(py++);
346 
347  /* Read x[6] sample */
348  x2 = *(px++);
349 
350  /* Perform the multiply-accumulates */
351  /* acc0 += x[3] * y[3] */
352  acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x3 * c0)) >> 32);
353  /* acc1 += x[4] * y[3] */
354  acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x0 * c0)) >> 32);
355  /* acc2 += x[5] * y[3] */
356  acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x1 * c0)) >> 32);
357  /* acc3 += x[6] * y[3] */
358  acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x2 * c0)) >> 32);
359 
360 
361  } while(--k);
362 
363  /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
364  ** No loop unrolling is used. */
365  k = srcBLen % 0x4u;
366 
367  while(k > 0u)
368  {
369  /* Read y[4] sample */
370  c0 = *(py++);
371 
372  /* Read x[7] sample */
373  x3 = *(px++);
374 
375  /* Perform the multiply-accumulates */
376  /* acc0 += x[4] * y[4] */
377  acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
378  /* acc1 += x[5] * y[4] */
379  acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
380  /* acc2 += x[6] * y[4] */
381  acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
382  /* acc3 += x[7] * y[4] */
383  acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
384 
385  /* Reuse the present samples for the next MAC */
386  x0 = x1;
387  x1 = x2;
388  x2 = x3;
389 
390  /* Decrement the loop counter */
391  k--;
392  }
393 
394  /* Store the result in the accumulator in the destination buffer. */
395  *pOut = (q31_t) (acc0 << 1);
396  /* Destination pointer is updated according to the address modifier, inc */
397  pOut += inc;
398 
399  *pOut = (q31_t) (acc1 << 1);
400  pOut += inc;
401 
402  *pOut = (q31_t) (acc2 << 1);
403  pOut += inc;
404 
405  *pOut = (q31_t) (acc3 << 1);
406  pOut += inc;
407 
408  /* Increment the pointer pIn1 index, count by 4 */
409  count += 4u;
410 
411  /* Update the inputA and inputB pointers for next MAC calculation */
412  px = pIn1 + count;
413  py = pIn2;
414 
415 
416  /* Decrement the loop counter */
417  blkCnt--;
418  }
419 
420  /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
421  ** No loop unrolling is used. */
422  blkCnt = blockSize2 % 0x4u;
423 
424  while(blkCnt > 0u)
425  {
426  /* Accumulator is made zero for every iteration */
427  sum = 0;
428 
429  /* Apply loop unrolling and compute 4 MACs simultaneously. */
430  k = srcBLen >> 2u;
431 
432  /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
433  ** a second loop below computes MACs for the remaining 1 to 3 samples. */
434  while(k > 0u)
435  {
436  /* Perform the multiply-accumulates */
437  sum = (q31_t) ((((q63_t) sum << 32) +
438  ((q63_t) * px++ * (*py++))) >> 32);
439  sum = (q31_t) ((((q63_t) sum << 32) +
440  ((q63_t) * px++ * (*py++))) >> 32);
441  sum = (q31_t) ((((q63_t) sum << 32) +
442  ((q63_t) * px++ * (*py++))) >> 32);
443  sum = (q31_t) ((((q63_t) sum << 32) +
444  ((q63_t) * px++ * (*py++))) >> 32);
445 
446  /* Decrement the loop counter */
447  k--;
448  }
449 
450  /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
451  ** No loop unrolling is used. */
452  k = srcBLen % 0x4u;
453 
454  while(k > 0u)
455  {
456  /* Perform the multiply-accumulate */
457  sum = (q31_t) ((((q63_t) sum << 32) +
458  ((q63_t) * px++ * (*py++))) >> 32);
459 
460  /* Decrement the loop counter */
461  k--;
462  }
463 
464  /* Store the result in the accumulator in the destination buffer. */
465  *pOut = sum << 1;
466  /* Destination pointer is updated according to the address modifier, inc */
467  pOut += inc;
468 
469  /* Increment the MAC count */
470  count++;
471 
472  /* Update the inputA and inputB pointers for next MAC calculation */
473  px = pIn1 + count;
474  py = pIn2;
475 
476 
477  /* Decrement the loop counter */
478  blkCnt--;
479  }
480  }
481  else
482  {
483  /* If the srcBLen is not a multiple of 4,
484  * the blockSize2 loop cannot be unrolled by 4 */
485  blkCnt = blockSize2;
486 
487  while(blkCnt > 0u)
488  {
489  /* Accumulator is made zero for every iteration */
490  sum = 0;
491 
492  /* Loop over srcBLen */
493  k = srcBLen;
494 
495  while(k > 0u)
496  {
497  /* Perform the multiply-accumulate */
498  sum = (q31_t) ((((q63_t) sum << 32) +
499  ((q63_t) * px++ * (*py++))) >> 32);
500 
501  /* Decrement the loop counter */
502  k--;
503  }
504 
505  /* Store the result in the accumulator in the destination buffer. */
506  *pOut = sum << 1;
507  /* Destination pointer is updated according to the address modifier, inc */
508  pOut += inc;
509 
510  /* Increment the MAC count */
511  count++;
512 
513  /* Update the inputA and inputB pointers for next MAC calculation */
514  px = pIn1 + count;
515  py = pIn2;
516 
517  /* Decrement the loop counter */
518  blkCnt--;
519  }
520  }
521 
522  /* --------------------------
523  * Initializations of stage3
524  * -------------------------*/
525 
526  /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
527  * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
528  * ....
529  * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
530  * sum += x[srcALen-1] * y[0]
531  */
532 
533  /* In this stage the MAC operations are decreased by 1 for every iteration.
534  The count variable holds the number of MAC operations performed */
535  count = srcBLen - 1u;
536 
537  /* Working pointer of inputA */
538  pSrc1 = ((pIn1 + srcALen) - srcBLen) + 1u;
539  px = pSrc1;
540 
541  /* Working pointer of inputB */
542  py = pIn2;
543 
544  /* -------------------
545  * Stage3 process
546  * ------------------*/
547 
548  while(blockSize3 > 0u)
549  {
550  /* Accumulator is made zero for every iteration */
551  sum = 0;
552 
553  /* Apply loop unrolling and compute 4 MACs simultaneously. */
554  k = count >> 2u;
555 
556  /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
557  ** a second loop below computes MACs for the remaining 1 to 3 samples. */
558  while(k > 0u)
559  {
560  /* Perform the multiply-accumulates */
561  /* sum += x[srcALen - srcBLen + 4] * y[3] */
562  sum = (q31_t) ((((q63_t) sum << 32) +
563  ((q63_t) * px++ * (*py++))) >> 32);
564  /* sum += x[srcALen - srcBLen + 3] * y[2] */
565  sum = (q31_t) ((((q63_t) sum << 32) +
566  ((q63_t) * px++ * (*py++))) >> 32);
567  /* sum += x[srcALen - srcBLen + 2] * y[1] */
568  sum = (q31_t) ((((q63_t) sum << 32) +
569  ((q63_t) * px++ * (*py++))) >> 32);
570  /* sum += x[srcALen - srcBLen + 1] * y[0] */
571  sum = (q31_t) ((((q63_t) sum << 32) +
572  ((q63_t) * px++ * (*py++))) >> 32);
573 
574  /* Decrement the loop counter */
575  k--;
576  }
577 
578  /* If the count is not a multiple of 4, compute any remaining MACs here.
579  ** No loop unrolling is used. */
580  k = count % 0x4u;
581 
582  while(k > 0u)
583  {
584  /* Perform the multiply-accumulates */
585  sum = (q31_t) ((((q63_t) sum << 32) +
586  ((q63_t) * px++ * (*py++))) >> 32);
587 
588  /* Decrement the loop counter */
589  k--;
590  }
591 
592  /* Store the result in the accumulator in the destination buffer. */
593  *pOut = sum << 1;
594  /* Destination pointer is updated according to the address modifier, inc */
595  pOut += inc;
596 
597  /* Update the inputA and inputB pointers for next MAC calculation */
598  px = ++pSrc1;
599  py = pIn2;
600 
601  /* Decrement the MAC count */
602  count--;
603 
604  /* Decrement the loop counter */
605  blockSize3--;
606  }
607 
608 }
609 
int64_t q63_t
64-bit fractional data type in 1.63 format.
Definition: arm_math.h:402
int32_t q31_t
32-bit fractional data type in 1.31 format.
Definition: arm_math.h:397
void arm_correlate_fast_q31(q31_t *pSrcA, uint32_t srcALen, q31_t *pSrcB, uint32_t srcBLen, q31_t *pDst)
Correlation of Q31 sequences (fast version) for Cortex-M3 and Cortex-M4.