STM32F769IDiscovery  1.00
uDANTE Audio Networking with STM32F7 DISCO board
arm_cfft_radix4_q15.c
Go to the documentation of this file.
1 /* ----------------------------------------------------------------------
2 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.
3 *
4 * $Date: 19. March 2015
5 * $Revision: V.1.4.5
6 *
7 * Project: CMSIS DSP Library
8 * Title: arm_cfft_radix4_q15.c
9 *
10 * Description: This file contains the Radix-4 FFT & IFFT function definitions and
11 * in-place bit reversal using a bit reversal table
12 *
13 * Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
14 *
15 * Redistribution and use in source and binary forms, with or without
16 * modification, are permitted provided that the following conditions
17 * are met:
18 * - Redistributions of source code must retain the above copyright
19 * notice, this list of conditions and the following disclaimer.
20 * - Redistributions in binary form must reproduce the above copyright
21 * notice, this list of conditions and the following disclaimer in
22 * the documentation and/or other materials provided with the
23 * distribution.
24 * - Neither the name of ARM LIMITED nor the names of its contributors
25 * may be used to endorse or promote products derived from this
26 * software without specific prior written permission.
27 *
28 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
29 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
30 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
31 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
32 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
33 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
34 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
35 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
36 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
37 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
38 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
39 * POSSIBILITY OF SUCH DAMAGE.
40 * -------------------------------------------------------------------- */
41 
42 #include "arm_math.h"
43 
44 
46  q15_t * pSrc16,
47  uint32_t fftLen,
48  q15_t * pCoef16,
49  uint32_t twidCoefModifier);
50 
52  q15_t * pSrc16,
53  uint32_t fftLen,
54  q15_t * pCoef16,
55  uint32_t twidCoefModifier);
56 
58  q15_t * pSrc,
59  uint32_t fftLen,
60  uint16_t bitRevFactor,
61  uint16_t * pBitRevTab);
62 
93  q15_t * pSrc)
94 {
95  if(S->ifftFlag == 1u)
96  {
97  /* Complex IFFT radix-4 */
99  S->twidCoefModifier);
100  }
101  else
102  {
103  /* Complex FFT radix-4 */
105  S->twidCoefModifier);
106  }
107 
108  if(S->bitReverseFlag == 1u)
109  {
110  /* Bit Reversal */
112  }
113 
114 }
115 
120 /*
121 * Radix-4 FFT algorithm used is :
122 *
123 * Input real and imaginary data:
124 * x(n) = xa + j * ya
125 * x(n+N/4 ) = xb + j * yb
126 * x(n+N/2 ) = xc + j * yc
127 * x(n+3N/4) = xd + j * yd
128 *
129 *
130 * Output real and imaginary data:
131 * x(4r) = xa'+ j * ya'
132 * x(4r+1) = xb'+ j * yb'
133 * x(4r+2) = xc'+ j * yc'
134 * x(4r+3) = xd'+ j * yd'
135 *
136 *
137 * Twiddle factors for radix-4 FFT:
138 * Wn = co1 + j * (- si1)
139 * W2n = co2 + j * (- si2)
140 * W3n = co3 + j * (- si3)
141 
142 * The real and imaginary output values for the radix-4 butterfly are
143 * xa' = xa + xb + xc + xd
144 * ya' = ya + yb + yc + yd
145 * xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1)
146 * yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1)
147 * xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2)
148 * yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2)
149 * xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3)
150 * yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3)
151 *
152 */
153 
164  q15_t * pSrc16,
165  uint32_t fftLen,
166  q15_t * pCoef16,
167  uint32_t twidCoefModifier)
168 {
169 
170 #ifndef ARM_MATH_CM0_FAMILY
171 
172  /* Run the below code for Cortex-M4 and Cortex-M3 */
173 
174  q31_t R, S, T, U;
175  q31_t C1, C2, C3, out1, out2;
176  uint32_t n1, n2, ic, i0, j, k;
177 
178  q15_t *ptr1;
179  q15_t *pSi0;
180  q15_t *pSi1;
181  q15_t *pSi2;
182  q15_t *pSi3;
183 
184  q31_t xaya, xbyb, xcyc, xdyd;
185 
186  /* Total process is divided into three stages */
187 
188  /* process first stage, middle stages, & last stage */
189 
190  /* Initializations for the first stage */
191  n2 = fftLen;
192  n1 = n2;
193 
194  /* n2 = fftLen/4 */
195  n2 >>= 2u;
196 
197  /* Index for twiddle coefficient */
198  ic = 0u;
199 
200  /* Index for input read and output write */
201  j = n2;
202 
203  pSi0 = pSrc16;
204  pSi1 = pSi0 + 2 * n2;
205  pSi2 = pSi1 + 2 * n2;
206  pSi3 = pSi2 + 2 * n2;
207 
208  /* Input is in 1.15(q15) format */
209 
210  /* start of first stage process */
211  do
212  {
213  /* Butterfly implementation */
214 
215  /* Reading i0, i0+fftLen/2 inputs */
216  /* Read ya (real), xa(imag) input */
217  T = _SIMD32_OFFSET(pSi0);
218  T = __SHADD16(T, 0); // this is just a SIMD arithmetic shift right by 1
219  T = __SHADD16(T, 0); // it turns out doing this twice is 2 cycles, the alternative takes 3 cycles
220  //in = ((int16_t) (T & 0xFFFF)) >> 2; // alternative code that takes 3 cycles
221  //T = ((T >> 2) & 0xFFFF0000) | (in & 0xFFFF);
222 
223  /* Read yc (real), xc(imag) input */
224  S = _SIMD32_OFFSET(pSi2);
225  S = __SHADD16(S, 0);
226  S = __SHADD16(S, 0);
227 
228  /* R = packed((ya + yc), (xa + xc) ) */
229  R = __QADD16(T, S);
230 
231  /* S = packed((ya - yc), (xa - xc) ) */
232  S = __QSUB16(T, S);
233 
234  /* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
235  /* Read yb (real), xb(imag) input */
236  T = _SIMD32_OFFSET(pSi1);
237  T = __SHADD16(T, 0);
238  T = __SHADD16(T, 0);
239 
240  /* Read yd (real), xd(imag) input */
241  U = _SIMD32_OFFSET(pSi3);
242  U = __SHADD16(U, 0);
243  U = __SHADD16(U, 0);
244 
245  /* T = packed((yb + yd), (xb + xd) ) */
246  T = __QADD16(T, U);
247 
248  /* writing the butterfly processed i0 sample */
249  /* xa' = xa + xb + xc + xd */
250  /* ya' = ya + yb + yc + yd */
251  _SIMD32_OFFSET(pSi0) = __SHADD16(R, T);
252  pSi0 += 2;
253 
254  /* R = packed((ya + yc) - (yb + yd), (xa + xc)- (xb + xd)) */
255  R = __QSUB16(R, T);
256 
257  /* co2 & si2 are read from SIMD Coefficient pointer */
258  C2 = _SIMD32_OFFSET(pCoef16 + (4u * ic));
259 
260 #ifndef ARM_MATH_BIG_ENDIAN
261 
262  /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
263  out1 = __SMUAD(C2, R) >> 16u;
264  /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
265  out2 = __SMUSDX(C2, R);
266 
267 #else
268 
269  /* xc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
270  out1 = __SMUSDX(R, C2) >> 16u;
271  /* yc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
272  out2 = __SMUAD(C2, R);
273 
274 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
275 
276  /* Reading i0+fftLen/4 */
277  /* T = packed(yb, xb) */
278  T = _SIMD32_OFFSET(pSi1);
279  T = __SHADD16(T, 0);
280  T = __SHADD16(T, 0);
281 
282  /* writing the butterfly processed i0 + fftLen/4 sample */
283  /* writing output(xc', yc') in little endian format */
284  _SIMD32_OFFSET(pSi1) =
285  (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
286  pSi1 += 2;
287 
288  /* Butterfly calculations */
289  /* U = packed(yd, xd) */
290  U = _SIMD32_OFFSET(pSi3);
291  U = __SHADD16(U, 0);
292  U = __SHADD16(U, 0);
293 
294  /* T = packed(yb-yd, xb-xd) */
295  T = __QSUB16(T, U);
296 
297 #ifndef ARM_MATH_BIG_ENDIAN
298 
299  /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
300  R = __QASX(S, T);
301  /* S = packed((ya-yc) - (xb- xd), (xa-xc) + (yb-yd)) */
302  S = __QSAX(S, T);
303 
304 #else
305 
306  /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
307  R = __QSAX(S, T);
308  /* S = packed((ya-yc) - (xb- xd), (xa-xc) + (yb-yd)) */
309  S = __QASX(S, T);
310 
311 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
312 
313  /* co1 & si1 are read from SIMD Coefficient pointer */
314  C1 = _SIMD32_OFFSET(pCoef16 + (2u * ic));
315  /* Butterfly process for the i0+fftLen/2 sample */
316 
317 #ifndef ARM_MATH_BIG_ENDIAN
318 
319  /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
320  out1 = __SMUAD(C1, S) >> 16u;
321  /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
322  out2 = __SMUSDX(C1, S);
323 
324 #else
325 
326  /* xb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
327  out1 = __SMUSDX(S, C1) >> 16u;
328  /* yb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
329  out2 = __SMUAD(C1, S);
330 
331 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
332 
333  /* writing output(xb', yb') in little endian format */
334  _SIMD32_OFFSET(pSi2) =
335  ((out2) & 0xFFFF0000) | ((out1) & 0x0000FFFF);
336  pSi2 += 2;
337 
338 
339  /* co3 & si3 are read from SIMD Coefficient pointer */
340  C3 = _SIMD32_OFFSET(pCoef16 + (6u * ic));
341  /* Butterfly process for the i0+3fftLen/4 sample */
342 
343 #ifndef ARM_MATH_BIG_ENDIAN
344 
345  /* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
346  out1 = __SMUAD(C3, R) >> 16u;
347  /* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
348  out2 = __SMUSDX(C3, R);
349 
350 #else
351 
352  /* xd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
353  out1 = __SMUSDX(R, C3) >> 16u;
354  /* yd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
355  out2 = __SMUAD(C3, R);
356 
357 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
358 
359  /* writing output(xd', yd') in little endian format */
360  _SIMD32_OFFSET(pSi3) =
361  ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
362  pSi3 += 2;
363 
364  /* Twiddle coefficients index modifier */
365  ic = ic + twidCoefModifier;
366 
367  } while(--j);
368  /* data is in 4.11(q11) format */
369 
370  /* end of first stage process */
371 
372 
373  /* start of middle stage process */
374 
375  /* Twiddle coefficients index modifier */
376  twidCoefModifier <<= 2u;
377 
378  /* Calculation of Middle stage */
379  for (k = fftLen / 4u; k > 4u; k >>= 2u)
380  {
381  /* Initializations for the middle stage */
382  n1 = n2;
383  n2 >>= 2u;
384  ic = 0u;
385 
386  for (j = 0u; j <= (n2 - 1u); j++)
387  {
388  /* index calculation for the coefficients */
389  C1 = _SIMD32_OFFSET(pCoef16 + (2u * ic));
390  C2 = _SIMD32_OFFSET(pCoef16 + (4u * ic));
391  C3 = _SIMD32_OFFSET(pCoef16 + (6u * ic));
392 
393  /* Twiddle coefficients index modifier */
394  ic = ic + twidCoefModifier;
395 
396  pSi0 = pSrc16 + 2 * j;
397  pSi1 = pSi0 + 2 * n2;
398  pSi2 = pSi1 + 2 * n2;
399  pSi3 = pSi2 + 2 * n2;
400 
401  /* Butterfly implementation */
402  for (i0 = j; i0 < fftLen; i0 += n1)
403  {
404  /* Reading i0, i0+fftLen/2 inputs */
405  /* Read ya (real), xa(imag) input */
406  T = _SIMD32_OFFSET(pSi0);
407 
408  /* Read yc (real), xc(imag) input */
409  S = _SIMD32_OFFSET(pSi2);
410 
411  /* R = packed( (ya + yc), (xa + xc)) */
412  R = __QADD16(T, S);
413 
414  /* S = packed((ya - yc), (xa - xc)) */
415  S = __QSUB16(T, S);
416 
417  /* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
418  /* Read yb (real), xb(imag) input */
419  T = _SIMD32_OFFSET(pSi1);
420 
421  /* Read yd (real), xd(imag) input */
422  U = _SIMD32_OFFSET(pSi3);
423 
424  /* T = packed( (yb + yd), (xb + xd)) */
425  T = __QADD16(T, U);
426 
427  /* writing the butterfly processed i0 sample */
428 
429  /* xa' = xa + xb + xc + xd */
430  /* ya' = ya + yb + yc + yd */
431  out1 = __SHADD16(R, T);
432  out1 = __SHADD16(out1, 0);
433  _SIMD32_OFFSET(pSi0) = out1;
434  pSi0 += 2 * n1;
435 
436  /* R = packed( (ya + yc) - (yb + yd), (xa + xc) - (xb + xd)) */
437  R = __SHSUB16(R, T);
438 
439 #ifndef ARM_MATH_BIG_ENDIAN
440 
441  /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
442  out1 = __SMUAD(C2, R) >> 16u;
443 
444  /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
445  out2 = __SMUSDX(C2, R);
446 
447 #else
448 
449  /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
450  out1 = __SMUSDX(R, C2) >> 16u;
451 
452  /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
453  out2 = __SMUAD(C2, R);
454 
455 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
456 
457  /* Reading i0+3fftLen/4 */
458  /* Read yb (real), xb(imag) input */
459  T = _SIMD32_OFFSET(pSi1);
460 
461  /* writing the butterfly processed i0 + fftLen/4 sample */
462  /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
463  /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
464  _SIMD32_OFFSET(pSi1) =
465  ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
466  pSi1 += 2 * n1;
467 
468  /* Butterfly calculations */
469 
470  /* Read yd (real), xd(imag) input */
471  U = _SIMD32_OFFSET(pSi3);
472 
473  /* T = packed(yb-yd, xb-xd) */
474  T = __QSUB16(T, U);
475 
476 #ifndef ARM_MATH_BIG_ENDIAN
477 
478  /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
479  R = __SHASX(S, T);
480 
481  /* S = packed((ya-yc) - (xb- xd), (xa-xc) + (yb-yd)) */
482  S = __SHSAX(S, T);
483 
484 
485  /* Butterfly process for the i0+fftLen/2 sample */
486  out1 = __SMUAD(C1, S) >> 16u;
487  out2 = __SMUSDX(C1, S);
488 
489 #else
490 
491  /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
492  R = __SHSAX(S, T);
493 
494  /* S = packed((ya-yc) - (xb- xd), (xa-xc) + (yb-yd)) */
495  S = __SHASX(S, T);
496 
497 
498  /* Butterfly process for the i0+fftLen/2 sample */
499  out1 = __SMUSDX(S, C1) >> 16u;
500  out2 = __SMUAD(C1, S);
501 
502 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
503 
504  /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
505  /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
506  _SIMD32_OFFSET(pSi2) =
507  ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
508  pSi2 += 2 * n1;
509 
510  /* Butterfly process for the i0+3fftLen/4 sample */
511 
512 #ifndef ARM_MATH_BIG_ENDIAN
513 
514  out1 = __SMUAD(C3, R) >> 16u;
515  out2 = __SMUSDX(C3, R);
516 
517 #else
518 
519  out1 = __SMUSDX(R, C3) >> 16u;
520  out2 = __SMUAD(C3, R);
521 
522 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
523 
524  /* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
525  /* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
526  _SIMD32_OFFSET(pSi3) =
527  ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
528  pSi3 += 2 * n1;
529  }
530  }
531  /* Twiddle coefficients index modifier */
532  twidCoefModifier <<= 2u;
533  }
534  /* end of middle stage process */
535 
536 
537  /* data is in 10.6(q6) format for the 1024 point */
538  /* data is in 8.8(q8) format for the 256 point */
539  /* data is in 6.10(q10) format for the 64 point */
540  /* data is in 4.12(q12) format for the 16 point */
541 
542  /* Initializations for the last stage */
543  j = fftLen >> 2;
544 
545  ptr1 = &pSrc16[0];
546 
547  /* start of last stage process */
548 
549  /* Butterfly implementation */
550  do
551  {
552  /* Read xa (real), ya(imag) input */
553  xaya = *__SIMD32(ptr1)++;
554 
555  /* Read xb (real), yb(imag) input */
556  xbyb = *__SIMD32(ptr1)++;
557 
558  /* Read xc (real), yc(imag) input */
559  xcyc = *__SIMD32(ptr1)++;
560 
561  /* Read xd (real), yd(imag) input */
562  xdyd = *__SIMD32(ptr1)++;
563 
564  /* R = packed((ya + yc), (xa + xc)) */
565  R = __QADD16(xaya, xcyc);
566 
567  /* T = packed((yb + yd), (xb + xd)) */
568  T = __QADD16(xbyb, xdyd);
569 
570  /* pointer updation for writing */
571  ptr1 = ptr1 - 8u;
572 
573 
574  /* xa' = xa + xb + xc + xd */
575  /* ya' = ya + yb + yc + yd */
576  *__SIMD32(ptr1)++ = __SHADD16(R, T);
577 
578  /* T = packed((yb + yd), (xb + xd)) */
579  T = __QADD16(xbyb, xdyd);
580 
581  /* xc' = (xa-xb+xc-xd) */
582  /* yc' = (ya-yb+yc-yd) */
583  *__SIMD32(ptr1)++ = __SHSUB16(R, T);
584 
585  /* S = packed((ya - yc), (xa - xc)) */
586  S = __QSUB16(xaya, xcyc);
587 
588  /* Read yd (real), xd(imag) input */
589  /* T = packed( (yb - yd), (xb - xd)) */
590  U = __QSUB16(xbyb, xdyd);
591 
592 #ifndef ARM_MATH_BIG_ENDIAN
593 
594  /* xb' = (xa+yb-xc-yd) */
595  /* yb' = (ya-xb-yc+xd) */
596  *__SIMD32(ptr1)++ = __SHSAX(S, U);
597 
598 
599  /* xd' = (xa-yb-xc+yd) */
600  /* yd' = (ya+xb-yc-xd) */
601  *__SIMD32(ptr1)++ = __SHASX(S, U);
602 
603 #else
604 
605  /* xb' = (xa+yb-xc-yd) */
606  /* yb' = (ya-xb-yc+xd) */
607  *__SIMD32(ptr1)++ = __SHASX(S, U);
608 
609 
610  /* xd' = (xa-yb-xc+yd) */
611  /* yd' = (ya+xb-yc-xd) */
612  *__SIMD32(ptr1)++ = __SHSAX(S, U);
613 
614 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
615 
616  } while(--j);
617 
618  /* end of last stage process */
619 
620  /* output is in 11.5(q5) format for the 1024 point */
621  /* output is in 9.7(q7) format for the 256 point */
622  /* output is in 7.9(q9) format for the 64 point */
623  /* output is in 5.11(q11) format for the 16 point */
624 
625 
626 #else
627 
628  /* Run the below code for Cortex-M0 */
629 
630  q15_t R0, R1, S0, S1, T0, T1, U0, U1;
631  q15_t Co1, Si1, Co2, Si2, Co3, Si3, out1, out2;
632  uint32_t n1, n2, ic, i0, i1, i2, i3, j, k;
633 
634  /* Total process is divided into three stages */
635 
636  /* process first stage, middle stages, & last stage */
637 
638  /* Initializations for the first stage */
639  n2 = fftLen;
640  n1 = n2;
641 
642  /* n2 = fftLen/4 */
643  n2 >>= 2u;
644 
645  /* Index for twiddle coefficient */
646  ic = 0u;
647 
648  /* Index for input read and output write */
649  i0 = 0u;
650  j = n2;
651 
652  /* Input is in 1.15(q15) format */
653 
654  /* start of first stage process */
655  do
656  {
657  /* Butterfly implementation */
658 
659  /* index calculation for the input as, */
660  /* pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
661  i1 = i0 + n2;
662  i2 = i1 + n2;
663  i3 = i2 + n2;
664 
665  /* Reading i0, i0+fftLen/2 inputs */
666 
667  /* input is down scale by 4 to avoid overflow */
668  /* Read ya (real), xa(imag) input */
669  T0 = pSrc16[i0 * 2u] >> 2u;
670  T1 = pSrc16[(i0 * 2u) + 1u] >> 2u;
671 
672  /* input is down scale by 4 to avoid overflow */
673  /* Read yc (real), xc(imag) input */
674  S0 = pSrc16[i2 * 2u] >> 2u;
675  S1 = pSrc16[(i2 * 2u) + 1u] >> 2u;
676 
677  /* R0 = (ya + yc) */
678  R0 = __SSAT(T0 + S0, 16u);
679  /* R1 = (xa + xc) */
680  R1 = __SSAT(T1 + S1, 16u);
681 
682  /* S0 = (ya - yc) */
683  S0 = __SSAT(T0 - S0, 16);
684  /* S1 = (xa - xc) */
685  S1 = __SSAT(T1 - S1, 16);
686 
687  /* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
688  /* input is down scale by 4 to avoid overflow */
689  /* Read yb (real), xb(imag) input */
690  T0 = pSrc16[i1 * 2u] >> 2u;
691  T1 = pSrc16[(i1 * 2u) + 1u] >> 2u;
692 
693  /* input is down scale by 4 to avoid overflow */
694  /* Read yd (real), xd(imag) input */
695  U0 = pSrc16[i3 * 2u] >> 2u;
696  U1 = pSrc16[(i3 * 2u) + 1] >> 2u;
697 
698  /* T0 = (yb + yd) */
699  T0 = __SSAT(T0 + U0, 16u);
700  /* T1 = (xb + xd) */
701  T1 = __SSAT(T1 + U1, 16u);
702 
703  /* writing the butterfly processed i0 sample */
704  /* ya' = ya + yb + yc + yd */
705  /* xa' = xa + xb + xc + xd */
706  pSrc16[i0 * 2u] = (R0 >> 1u) + (T0 >> 1u);
707  pSrc16[(i0 * 2u) + 1u] = (R1 >> 1u) + (T1 >> 1u);
708 
709  /* R0 = (ya + yc) - (yb + yd) */
710  /* R1 = (xa + xc) - (xb + xd) */
711  R0 = __SSAT(R0 - T0, 16u);
712  R1 = __SSAT(R1 - T1, 16u);
713 
714  /* co2 & si2 are read from Coefficient pointer */
715  Co2 = pCoef16[2u * ic * 2u];
716  Si2 = pCoef16[(2u * ic * 2u) + 1];
717 
718  /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
719  out1 = (q15_t) ((Co2 * R0 + Si2 * R1) >> 16u);
720  /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
721  out2 = (q15_t) ((-Si2 * R0 + Co2 * R1) >> 16u);
722 
723  /* Reading i0+fftLen/4 */
724  /* input is down scale by 4 to avoid overflow */
725  /* T0 = yb, T1 = xb */
726  T0 = pSrc16[i1 * 2u] >> 2;
727  T1 = pSrc16[(i1 * 2u) + 1] >> 2;
728 
729  /* writing the butterfly processed i0 + fftLen/4 sample */
730  /* writing output(xc', yc') in little endian format */
731  pSrc16[i1 * 2u] = out1;
732  pSrc16[(i1 * 2u) + 1] = out2;
733 
734  /* Butterfly calculations */
735  /* input is down scale by 4 to avoid overflow */
736  /* U0 = yd, U1 = xd */
737  U0 = pSrc16[i3 * 2u] >> 2;
738  U1 = pSrc16[(i3 * 2u) + 1] >> 2;
739  /* T0 = yb-yd */
740  T0 = __SSAT(T0 - U0, 16);
741  /* T1 = xb-xd */
742  T1 = __SSAT(T1 - U1, 16);
743 
744  /* R1 = (ya-yc) + (xb- xd), R0 = (xa-xc) - (yb-yd)) */
745  R0 = (q15_t) __SSAT((q31_t) (S0 - T1), 16);
746  R1 = (q15_t) __SSAT((q31_t) (S1 + T0), 16);
747 
748  /* S1 = (ya-yc) - (xb- xd), S0 = (xa-xc) + (yb-yd)) */
749  S0 = (q15_t) __SSAT(((q31_t) S0 + T1), 16u);
750  S1 = (q15_t) __SSAT(((q31_t) S1 - T0), 16u);
751 
752  /* co1 & si1 are read from Coefficient pointer */
753  Co1 = pCoef16[ic * 2u];
754  Si1 = pCoef16[(ic * 2u) + 1];
755  /* Butterfly process for the i0+fftLen/2 sample */
756  /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
757  out1 = (q15_t) ((Si1 * S1 + Co1 * S0) >> 16);
758  /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
759  out2 = (q15_t) ((-Si1 * S0 + Co1 * S1) >> 16);
760 
761  /* writing output(xb', yb') in little endian format */
762  pSrc16[i2 * 2u] = out1;
763  pSrc16[(i2 * 2u) + 1] = out2;
764 
765  /* Co3 & si3 are read from Coefficient pointer */
766  Co3 = pCoef16[3u * (ic * 2u)];
767  Si3 = pCoef16[(3u * (ic * 2u)) + 1];
768  /* Butterfly process for the i0+3fftLen/4 sample */
769  /* xd' = (xa-yb-xc+yd)* Co3 + (ya+xb-yc-xd)* (si3) */
770  out1 = (q15_t) ((Si3 * R1 + Co3 * R0) >> 16u);
771  /* yd' = (ya+xb-yc-xd)* Co3 - (xa-yb-xc+yd)* (si3) */
772  out2 = (q15_t) ((-Si3 * R0 + Co3 * R1) >> 16u);
773  /* writing output(xd', yd') in little endian format */
774  pSrc16[i3 * 2u] = out1;
775  pSrc16[(i3 * 2u) + 1] = out2;
776 
777  /* Twiddle coefficients index modifier */
778  ic = ic + twidCoefModifier;
779 
780  /* Updating input index */
781  i0 = i0 + 1u;
782 
783  } while(--j);
784  /* data is in 4.11(q11) format */
785 
786  /* end of first stage process */
787 
788 
789  /* start of middle stage process */
790 
791  /* Twiddle coefficients index modifier */
792  twidCoefModifier <<= 2u;
793 
794  /* Calculation of Middle stage */
795  for (k = fftLen / 4u; k > 4u; k >>= 2u)
796  {
797  /* Initializations for the middle stage */
798  n1 = n2;
799  n2 >>= 2u;
800  ic = 0u;
801 
802  for (j = 0u; j <= (n2 - 1u); j++)
803  {
804  /* index calculation for the coefficients */
805  Co1 = pCoef16[ic * 2u];
806  Si1 = pCoef16[(ic * 2u) + 1u];
807  Co2 = pCoef16[2u * (ic * 2u)];
808  Si2 = pCoef16[(2u * (ic * 2u)) + 1u];
809  Co3 = pCoef16[3u * (ic * 2u)];
810  Si3 = pCoef16[(3u * (ic * 2u)) + 1u];
811 
812  /* Twiddle coefficients index modifier */
813  ic = ic + twidCoefModifier;
814 
815  /* Butterfly implementation */
816  for (i0 = j; i0 < fftLen; i0 += n1)
817  {
818  /* index calculation for the input as, */
819  /* pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
820  i1 = i0 + n2;
821  i2 = i1 + n2;
822  i3 = i2 + n2;
823 
824  /* Reading i0, i0+fftLen/2 inputs */
825  /* Read ya (real), xa(imag) input */
826  T0 = pSrc16[i0 * 2u];
827  T1 = pSrc16[(i0 * 2u) + 1u];
828 
829  /* Read yc (real), xc(imag) input */
830  S0 = pSrc16[i2 * 2u];
831  S1 = pSrc16[(i2 * 2u) + 1u];
832 
833  /* R0 = (ya + yc), R1 = (xa + xc) */
834  R0 = __SSAT(T0 + S0, 16);
835  R1 = __SSAT(T1 + S1, 16);
836 
837  /* S0 = (ya - yc), S1 =(xa - xc) */
838  S0 = __SSAT(T0 - S0, 16);
839  S1 = __SSAT(T1 - S1, 16);
840 
841  /* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
842  /* Read yb (real), xb(imag) input */
843  T0 = pSrc16[i1 * 2u];
844  T1 = pSrc16[(i1 * 2u) + 1u];
845 
846  /* Read yd (real), xd(imag) input */
847  U0 = pSrc16[i3 * 2u];
848  U1 = pSrc16[(i3 * 2u) + 1u];
849 
850 
851  /* T0 = (yb + yd), T1 = (xb + xd) */
852  T0 = __SSAT(T0 + U0, 16);
853  T1 = __SSAT(T1 + U1, 16);
854 
855  /* writing the butterfly processed i0 sample */
856 
857  /* xa' = xa + xb + xc + xd */
858  /* ya' = ya + yb + yc + yd */
859  out1 = ((R0 >> 1u) + (T0 >> 1u)) >> 1u;
860  out2 = ((R1 >> 1u) + (T1 >> 1u)) >> 1u;
861 
862  pSrc16[i0 * 2u] = out1;
863  pSrc16[(2u * i0) + 1u] = out2;
864 
865  /* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc) - (xb + xd) */
866  R0 = (R0 >> 1u) - (T0 >> 1u);
867  R1 = (R1 >> 1u) - (T1 >> 1u);
868 
869  /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
870  out1 = (q15_t) ((Co2 * R0 + Si2 * R1) >> 16u);
871 
872  /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
873  out2 = (q15_t) ((-Si2 * R0 + Co2 * R1) >> 16u);
874 
875  /* Reading i0+3fftLen/4 */
876  /* Read yb (real), xb(imag) input */
877  T0 = pSrc16[i1 * 2u];
878  T1 = pSrc16[(i1 * 2u) + 1u];
879 
880  /* writing the butterfly processed i0 + fftLen/4 sample */
881  /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
882  /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
883  pSrc16[i1 * 2u] = out1;
884  pSrc16[(i1 * 2u) + 1u] = out2;
885 
886  /* Butterfly calculations */
887 
888  /* Read yd (real), xd(imag) input */
889  U0 = pSrc16[i3 * 2u];
890  U1 = pSrc16[(i3 * 2u) + 1u];
891 
892  /* T0 = yb-yd, T1 = xb-xd */
893  T0 = __SSAT(T0 - U0, 16);
894  T1 = __SSAT(T1 - U1, 16);
895 
896  /* R0 = (ya-yc) + (xb- xd), R1 = (xa-xc) - (yb-yd)) */
897  R0 = (S0 >> 1u) - (T1 >> 1u);
898  R1 = (S1 >> 1u) + (T0 >> 1u);
899 
900  /* S0 = (ya-yc) - (xb- xd), S1 = (xa-xc) + (yb-yd)) */
901  S0 = (S0 >> 1u) + (T1 >> 1u);
902  S1 = (S1 >> 1u) - (T0 >> 1u);
903 
904  /* Butterfly process for the i0+fftLen/2 sample */
905  out1 = (q15_t) ((Co1 * S0 + Si1 * S1) >> 16u);
906 
907  out2 = (q15_t) ((-Si1 * S0 + Co1 * S1) >> 16u);
908 
909  /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
910  /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
911  pSrc16[i2 * 2u] = out1;
912  pSrc16[(i2 * 2u) + 1u] = out2;
913 
914  /* Butterfly process for the i0+3fftLen/4 sample */
915  out1 = (q15_t) ((Si3 * R1 + Co3 * R0) >> 16u);
916 
917  out2 = (q15_t) ((-Si3 * R0 + Co3 * R1) >> 16u);
918  /* xd' = (xa-yb-xc+yd)* Co3 + (ya+xb-yc-xd)* (si3) */
919  /* yd' = (ya+xb-yc-xd)* Co3 - (xa-yb-xc+yd)* (si3) */
920  pSrc16[i3 * 2u] = out1;
921  pSrc16[(i3 * 2u) + 1u] = out2;
922  }
923  }
924  /* Twiddle coefficients index modifier */
925  twidCoefModifier <<= 2u;
926  }
927  /* end of middle stage process */
928 
929 
930  /* data is in 10.6(q6) format for the 1024 point */
931  /* data is in 8.8(q8) format for the 256 point */
932  /* data is in 6.10(q10) format for the 64 point */
933  /* data is in 4.12(q12) format for the 16 point */
934 
935  /* Initializations for the last stage */
936  n1 = n2;
937  n2 >>= 2u;
938 
939  /* start of last stage process */
940 
941  /* Butterfly implementation */
942  for (i0 = 0u; i0 <= (fftLen - n1); i0 += n1)
943  {
944  /* index calculation for the input as, */
945  /* pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
946  i1 = i0 + n2;
947  i2 = i1 + n2;
948  i3 = i2 + n2;
949 
950  /* Reading i0, i0+fftLen/2 inputs */
951  /* Read ya (real), xa(imag) input */
952  T0 = pSrc16[i0 * 2u];
953  T1 = pSrc16[(i0 * 2u) + 1u];
954 
955  /* Read yc (real), xc(imag) input */
956  S0 = pSrc16[i2 * 2u];
957  S1 = pSrc16[(i2 * 2u) + 1u];
958 
959  /* R0 = (ya + yc), R1 = (xa + xc) */
960  R0 = __SSAT(T0 + S0, 16u);
961  R1 = __SSAT(T1 + S1, 16u);
962 
963  /* S0 = (ya - yc), S1 = (xa - xc) */
964  S0 = __SSAT(T0 - S0, 16u);
965  S1 = __SSAT(T1 - S1, 16u);
966 
967  /* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
968  /* Read yb (real), xb(imag) input */
969  T0 = pSrc16[i1 * 2u];
970  T1 = pSrc16[(i1 * 2u) + 1u];
971  /* Read yd (real), xd(imag) input */
972  U0 = pSrc16[i3 * 2u];
973  U1 = pSrc16[(i3 * 2u) + 1u];
974 
975  /* T0 = (yb + yd), T1 = (xb + xd)) */
976  T0 = __SSAT(T0 + U0, 16u);
977  T1 = __SSAT(T1 + U1, 16u);
978 
979  /* writing the butterfly processed i0 sample */
980  /* xa' = xa + xb + xc + xd */
981  /* ya' = ya + yb + yc + yd */
982  pSrc16[i0 * 2u] = (R0 >> 1u) + (T0 >> 1u);
983  pSrc16[(i0 * 2u) + 1u] = (R1 >> 1u) + (T1 >> 1u);
984 
985  /* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc) - (xb + xd) */
986  R0 = (R0 >> 1u) - (T0 >> 1u);
987  R1 = (R1 >> 1u) - (T1 >> 1u);
988  /* Read yb (real), xb(imag) input */
989  T0 = pSrc16[i1 * 2u];
990  T1 = pSrc16[(i1 * 2u) + 1u];
991 
992  /* writing the butterfly processed i0 + fftLen/4 sample */
993  /* xc' = (xa-xb+xc-xd) */
994  /* yc' = (ya-yb+yc-yd) */
995  pSrc16[i1 * 2u] = R0;
996  pSrc16[(i1 * 2u) + 1u] = R1;
997 
998  /* Read yd (real), xd(imag) input */
999  U0 = pSrc16[i3 * 2u];
1000  U1 = pSrc16[(i3 * 2u) + 1u];
1001  /* T0 = (yb - yd), T1 = (xb - xd) */
1002  T0 = __SSAT(T0 - U0, 16u);
1003  T1 = __SSAT(T1 - U1, 16u);
1004 
1005  /* writing the butterfly processed i0 + fftLen/2 sample */
1006  /* xb' = (xa+yb-xc-yd) */
1007  /* yb' = (ya-xb-yc+xd) */
1008  pSrc16[i2 * 2u] = (S0 >> 1u) + (T1 >> 1u);
1009  pSrc16[(i2 * 2u) + 1u] = (S1 >> 1u) - (T0 >> 1u);
1010 
1011  /* writing the butterfly processed i0 + 3fftLen/4 sample */
1012  /* xd' = (xa-yb-xc+yd) */
1013  /* yd' = (ya+xb-yc-xd) */
1014  pSrc16[i3 * 2u] = (S0 >> 1u) - (T1 >> 1u);
1015  pSrc16[(i3 * 2u) + 1u] = (S1 >> 1u) + (T0 >> 1u);
1016 
1017  }
1018 
1019  /* end of last stage process */
1020 
1021  /* output is in 11.5(q5) format for the 1024 point */
1022  /* output is in 9.7(q7) format for the 256 point */
1023  /* output is in 7.9(q9) format for the 64 point */
1024  /* output is in 5.11(q11) format for the 16 point */
1025 
1026 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
1027 
1028 }
1029 
1030 
1040 /*
1041 * Radix-4 IFFT algorithm used is :
1042 *
1043 * CIFFT uses same twiddle coefficients as CFFT function
1044 * x[k] = x[n] + (j)^k * x[n + fftLen/4] + (-1)^k * x[n+fftLen/2] + (-j)^k * x[n+3*fftLen/4]
1045 *
1046 *
1047 * IFFT is implemented with following changes in equations from FFT
1048 *
1049 * Input real and imaginary data:
1050 * x(n) = xa + j * ya
1051 * x(n+N/4 ) = xb + j * yb
1052 * x(n+N/2 ) = xc + j * yc
1053 * x(n+3N/4) = xd + j * yd
1054 *
1055 *
1056 * Output real and imaginary data:
1057 * x(4r) = xa'+ j * ya'
1058 * x(4r+1) = xb'+ j * yb'
1059 * x(4r+2) = xc'+ j * yc'
1060 * x(4r+3) = xd'+ j * yd'
1061 *
1062 *
1063 * Twiddle factors for radix-4 IFFT:
1064 * Wn = co1 + j * (si1)
1065 * W2n = co2 + j * (si2)
1066 * W3n = co3 + j * (si3)
1067 
1068 * The real and imaginary output values for the radix-4 butterfly are
1069 * xa' = xa + xb + xc + xd
1070 * ya' = ya + yb + yc + yd
1071 * xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1)
1072 * yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1)
1073 * xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2)
1074 * yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2)
1075 * xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3)
1076 * yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3)
1077 *
1078 */
1079 
1081  q15_t * pSrc16,
1082  uint32_t fftLen,
1083  q15_t * pCoef16,
1084  uint32_t twidCoefModifier)
1085 {
1086  /* In-place radix-4 butterfly passes over pSrc16 (interleaved q15 pairs: even index = x/real, odd index = y/imag). fftLen is the number of complex points, pCoef16 holds interleaved (co, si) twiddle pairs, and twidCoefModifier is the twiddle-table stride, multiplied by 4 after the first stage and after each middle stage. */
1087 #ifndef ARM_MATH_CM0_FAMILY
1088 
1089  /* Run the below code for Cortex-M4 and Cortex-M3 */
1090 
1091  q31_t R, S, T, U;
1092  q31_t C1, C2, C3, out1, out2;
1093  uint32_t n1, n2, ic, i0, j, k;
1094 
1095  q15_t *ptr1;
1096  q15_t *pSi0;
1097  q15_t *pSi1;
1098  q15_t *pSi2;
1099  q15_t *pSi3;
1100 
1101  q31_t xaya, xbyb, xcyc, xdyd;
1102 
1103  /* Total process is divided into three stages */
1104 
1105  /* process first stage, middle stages, & last stage */
1106 
1107  /* Initializations for the first stage */
1108  n2 = fftLen;
1109  n1 = n2;
1110 
1111  /* n2 = fftLen/4 */
1112  n2 >>= 2u;
1113 
1114  /* Index for twiddle coefficient */
1115  ic = 0u;
1116 
1117  /* Number of first-stage butterflies (loop counter) */
1118  j = n2;
1119 
1120  pSi0 = pSrc16;
1121  pSi1 = pSi0 + 2 * n2;
1122  pSi2 = pSi1 + 2 * n2;
1123  pSi3 = pSi2 + 2 * n2;
1124 
1125  /* Input is in 1.15(q15) format */
1126 
1127  /* start of first stage process */
1128  do
1129  {
1130  /* Butterfly implementation */
1131 
1132  /* Reading i0, i0+fftLen/2 inputs */
1133  /* Read xa (real), ya (imag) input; the two halving adds with 0 presumably scale both halfwords by 1/4, like the >> 2u in the Cortex-M0 path */
1134  T = _SIMD32_OFFSET(pSi0);
1135  T = __SHADD16(T, 0);
1136  T = __SHADD16(T, 0);
1137 
1138  /* Read xc (real), yc (imag) input */
1139  S = _SIMD32_OFFSET(pSi2);
1140  S = __SHADD16(S, 0);
1141  S = __SHADD16(S, 0);
1142 
1143  /* R = packed((ya + yc), (xa + xc) ) */
1144  R = __QADD16(T, S);
1145 
1146  /* S = packed((ya - yc), (xa - xc) ) */
1147  S = __QSUB16(T, S);
1148 
1149  /* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
1150  /* Read xb (real), yb (imag) input */
1151  T = _SIMD32_OFFSET(pSi1);
1152  T = __SHADD16(T, 0);
1153  T = __SHADD16(T, 0);
1154 
1155  /* Read xd (real), yd (imag) input */
1156  U = _SIMD32_OFFSET(pSi3);
1157  U = __SHADD16(U, 0);
1158  U = __SHADD16(U, 0);
1159 
1160  /* T = packed((yb + yd), (xb + xd) ) */
1161  T = __QADD16(T, U);
1162 
1163  /* writing the butterfly processed i0 sample */
1164  /* xa' = xa + xb + xc + xd */
1165  /* ya' = ya + yb + yc + yd */
1166  _SIMD32_OFFSET(pSi0) = __SHADD16(R, T);
1167  pSi0 += 2;
1168 
1169  /* R = packed((ya + yc) - (yb + yd), (xa + xc)- (xb + xd)) */
1170  R = __QSUB16(R, T);
1171 
1172  /* co2 & si2 are read from SIMD Coefficient pointer */
1173  C2 = _SIMD32_OFFSET(pCoef16 + (4u * ic));
1174 
1175 #ifndef ARM_MATH_BIG_ENDIAN
1176 
1177  /* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
1178  out1 = __SMUSD(C2, R) >> 16u;
1179  /* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
1180  out2 = __SMUADX(C2, R);
1181 
1182 #else
1183 
1184  /* big-endian: presumably yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) - TODO confirm halfword order */
1185  out1 = __SMUADX(C2, R) >> 16u;
1186  /* big-endian: presumably xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) - TODO confirm halfword order */
1187  out2 = __SMUSD(__QSUB16(0, C2), R);
1188 
1189 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1190 
1191  /* Re-reading i0+fftLen/4 before it is overwritten below */
1192  /* T = packed(yb, xb) */
1193  T = _SIMD32_OFFSET(pSi1);
1194  T = __SHADD16(T, 0);
1195  T = __SHADD16(T, 0);
1196 
1197  /* writing the butterfly processed i0 + fftLen/4 sample */
1198  /* writing output(xc', yc'): top half of out2 and low half of out1 packed into one word */
1199  _SIMD32_OFFSET(pSi1) =
1200  (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
1201  pSi1 += 2;
1202 
1203  /* Butterfly calculations */
1204  /* U = packed(yd, xd) */
1205  U = _SIMD32_OFFSET(pSi3);
1206  U = __SHADD16(U, 0);
1207  U = __SHADD16(U, 0);
1208 
1209  /* T = packed(yb-yd, xb-xd) */
1210  T = __QSUB16(T, U);
1211 
1212 #ifndef ARM_MATH_BIG_ENDIAN
1213 
1214  /* R = packed((ya-yc) - (xb- xd) , (xa-xc) + (yb-yd)) */
1215  R = __QSAX(S, T);
1216  /* S = packed((ya-yc) + (xb- xd), (xa-xc) - (yb-yd)) */
1217  S = __QASX(S, T);
1218 
1219 #else
1220 
1221  /* R: presumably the same pair as the little-endian R, (ya-yc) - (xb- xd) and (xa-xc) + (yb-yd), with halfwords swapped - TODO confirm */
1222  R = __QASX(S, T);
1223  /* S: presumably the same pair as the little-endian S, (ya-yc) + (xb- xd) and (xa-xc) - (yb-yd), with halfwords swapped - TODO confirm */
1224  S = __QSAX(S, T);
1225 
1226 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1227 
1228  /* co1 & si1 are read from SIMD Coefficient pointer */
1229  C1 = _SIMD32_OFFSET(pCoef16 + (2u * ic));
1230  /* Butterfly process for the i0+fftLen/2 sample */
1231 
1232 #ifndef ARM_MATH_BIG_ENDIAN
1233 
1234  /* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */
1235  out1 = __SMUSD(C1, S) >> 16u;
1236  /* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */
1237  out2 = __SMUADX(C1, S);
1238 
1239 #else
1240 
1241  /* big-endian: presumably yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) - TODO confirm halfword order */
1242  out1 = __SMUADX(C1, S) >> 16u;
1243  /* big-endian: presumably xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) - TODO confirm halfword order */
1244  out2 = __SMUSD(__QSUB16(0, C1), S);
1245 
1246 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1247 
1248  /* writing output(xb', yb') to the i0+fftLen/2 location */
1249  _SIMD32_OFFSET(pSi2) =
1250  ((out2) & 0xFFFF0000) | ((out1) & 0x0000FFFF);
1251  pSi2 += 2;
1252 
1253 
1254  /* co3 & si3 are read from SIMD Coefficient pointer */
1255  C3 = _SIMD32_OFFSET(pCoef16 + (6u * ic));
1256  /* Butterfly process for the i0+3fftLen/4 sample */
1257 
1258 #ifndef ARM_MATH_BIG_ENDIAN
1259 
1260  /* xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3) */
1261  out1 = __SMUSD(C3, R) >> 16u;
1262  /* yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3) */
1263  out2 = __SMUADX(C3, R);
1264 
1265 #else
1266 
1267  /* big-endian: presumably yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3) - TODO confirm halfword order */
1268  out1 = __SMUADX(C3, R) >> 16u;
1269  /* big-endian: presumably xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3) - TODO confirm halfword order */
1270  out2 = __SMUSD(__QSUB16(0, C3), R);
1271 
1272 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1273 
1274  /* writing output(xd', yd') to the i0+3fftLen/4 location */
1275  _SIMD32_OFFSET(pSi3) =
1276  ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
1277  pSi3 += 2;
1278 
1279  /* Twiddle coefficients index modifier */
1280  ic = ic + twidCoefModifier;
1281 
1282  } while(--j);
1283  /* data is in 4.11(q11) format */
1284 
1285  /* end of first stage process */
1286 
1287 
1288  /* start of middle stage process */
1289 
1290  /* Twiddle coefficients index modifier */
1291  twidCoefModifier <<= 2u;
1292 
1293  /* Calculation of Middle stage */
1294  for (k = fftLen / 4u; k > 4u; k >>= 2u)
1295  {
1296  /* Initializations for the middle stage */
1297  n1 = n2;
1298  n2 >>= 2u;
1299  ic = 0u;
1300 
1301  for (j = 0u; j <= (n2 - 1u); j++)
1302  {
1303  /* index calculation for the coefficients */
1304  C1 = _SIMD32_OFFSET(pCoef16 + (2u * ic));
1305  C2 = _SIMD32_OFFSET(pCoef16 + (4u * ic));
1306  C3 = _SIMD32_OFFSET(pCoef16 + (6u * ic));
1307 
1308  /* Twiddle coefficients index modifier */
1309  ic = ic + twidCoefModifier;
1310 
1311  pSi0 = pSrc16 + 2 * j;
1312  pSi1 = pSi0 + 2 * n2;
1313  pSi2 = pSi1 + 2 * n2;
1314  pSi3 = pSi2 + 2 * n2;
1315 
1316  /* Butterfly implementation */
1317  for (i0 = j; i0 < fftLen; i0 += n1)
1318  {
1319  /* Reading the pSi0 and pSi2 inputs (n2 complex samples apart per leg) */
1320  /* Read xa (real), ya (imag) input */
1321  T = _SIMD32_OFFSET(pSi0);
1322 
1323  /* Read xc (real), yc (imag) input */
1324  S = _SIMD32_OFFSET(pSi2);
1325 
1326  /* R = packed( (ya + yc), (xa + xc)) */
1327  R = __QADD16(T, S);
1328 
1329  /* S = packed((ya - yc), (xa - xc)) */
1330  S = __QSUB16(T, S);
1331 
1332  /* Reading the pSi1 and pSi3 inputs */
1333  /* Read xb (real), yb (imag) input */
1334  T = _SIMD32_OFFSET(pSi1);
1335 
1336  /* Read xd (real), yd (imag) input */
1337  U = _SIMD32_OFFSET(pSi3);
1338 
1339  /* T = packed( (yb + yd), (xb + xd)) */
1340  T = __QADD16(T, U);
1341 
1342  /* writing the butterfly processed i0 sample */
1343 
1344  /* xa' = xa + xb + xc + xd */
1345  /* ya' = ya + yb + yc + yd */
1346  out1 = __SHADD16(R, T);
1347  out1 = __SHADD16(out1, 0);
1348  _SIMD32_OFFSET(pSi0) = out1;
1349  pSi0 += 2 * n1;
1350 
1351  /* R = packed( (ya + yc) - (yb + yd), (xa + xc) - (xb + xd)) */
1352  R = __SHSUB16(R, T);
1353 
1354 #ifndef ARM_MATH_BIG_ENDIAN
1355 
1356  /* (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
1357  out1 = __SMUSD(C2, R) >> 16u;
1358 
1359  /* (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
1360  out2 = __SMUADX(C2, R);
1361 
1362 #else
1363 
1364  /* big-endian: presumably (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) - TODO confirm halfword order */
1365  out1 = __SMUADX(R, C2) >> 16u;
1366 
1367  /* big-endian: presumably (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) - TODO confirm halfword order */
1368  out2 = __SMUSD(__QSUB16(0, C2), R);
1369 
1370 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1371 
1372  /* Re-reading the pSi1 sample before it is overwritten below */
1373  /* Read xb (real), yb (imag) input */
1374  T = _SIMD32_OFFSET(pSi1);
1375 
1376  /* writing the butterfly processed pSi1 sample */
1377  /* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
1378  /* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
1379  _SIMD32_OFFSET(pSi1) =
1380  ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
1381  pSi1 += 2 * n1;
1382 
1383  /* Butterfly calculations */
1384 
1385  /* Read xd (real), yd (imag) input */
1386  U = _SIMD32_OFFSET(pSi3);
1387 
1388  /* T = packed(yb-yd, xb-xd) */
1389  T = __QSUB16(T, U);
1390 
1391 #ifndef ARM_MATH_BIG_ENDIAN
1392 
1393  /* R = packed((ya-yc) - (xb- xd) , (xa-xc) + (yb-yd)) */
1394  R = __SHSAX(S, T);
1395 
1396  /* S = packed((ya-yc) + (xb- xd), (xa-xc) - (yb-yd)) */
1397  S = __SHASX(S, T);
1398 
1399 
1400  /* Butterfly process for the pSi2 sample */
1401  out1 = __SMUSD(C1, S) >> 16u;
1402  out2 = __SMUADX(C1, S);
1403 
1404 #else
1405 
1406  /* R: presumably the same pair as the little-endian R, (ya-yc) - (xb- xd) and (xa-xc) + (yb-yd), with halfwords swapped - TODO confirm */
1407  R = __SHASX(S, T);
1408 
1409  /* S: presumably the same pair as the little-endian S, (ya-yc) + (xb- xd) and (xa-xc) - (yb-yd), with halfwords swapped - TODO confirm */
1410  S = __SHSAX(S, T);
1411 
1412 
1413  /* Butterfly process for the pSi2 sample */
1414  out1 = __SMUADX(S, C1) >> 16u;
1415  out2 = __SMUSD(__QSUB16(0, C1), S);
1416 
1417 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1418 
1419  /* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */
1420  /* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */
1421  _SIMD32_OFFSET(pSi2) =
1422  ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
1423  pSi2 += 2 * n1;
1424 
1425  /* Butterfly process for the pSi3 sample */
1426 
1427 #ifndef ARM_MATH_BIG_ENDIAN
1428 
1429  out1 = __SMUSD(C3, R) >> 16u;
1430  out2 = __SMUADX(C3, R);
1431 
1432 #else
1433 
1434  out1 = __SMUADX(C3, R) >> 16u;
1435  out2 = __SMUSD(__QSUB16(0, C3), R);
1436 
1437 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1438 
1439  /* xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3) */
1440  /* yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3) */
1441  _SIMD32_OFFSET(pSi3) =
1442  ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
1443  pSi3 += 2 * n1;
1444  }
1445  }
1446  /* Twiddle coefficients index modifier */
1447  twidCoefModifier <<= 2u;
1448  }
1449  /* end of middle stage process */
1450 
1451  /* data is in 10.6(q6) format for the 1024 point */
1452  /* data is in 8.8(q8) format for the 256 point */
1453  /* data is in 6.10(q10) format for the 64 point */
1454  /* data is in 4.12(q12) format for the 16 point */
1455 
1456  /* Initializations for the last stage */
1457  j = fftLen >> 2;
1458 
1459  ptr1 = &pSrc16[0];
1460 
1461  /* start of last stage process */
1462 
1463  /* Butterfly implementation (no twiddle multiplication in this stage) */
1464  do
1465  {
1466  /* Read xa (real), ya(imag) input */
1467  xaya = *__SIMD32(ptr1)++;
1468 
1469  /* Read xb (real), yb(imag) input */
1470  xbyb = *__SIMD32(ptr1)++;
1471 
1472  /* Read xc (real), yc(imag) input */
1473  xcyc = *__SIMD32(ptr1)++;
1474 
1475  /* Read xd (real), yd(imag) input */
1476  xdyd = *__SIMD32(ptr1)++;
1477 
1478  /* R = packed((ya + yc), (xa + xc)) */
1479  R = __QADD16(xaya, xcyc);
1480 
1481  /* T = packed((yb + yd), (xb + xd)) */
1482  T = __QADD16(xbyb, xdyd);
1483 
1484  /* rewind by 8 q15 values (presumably the four samples just read) so the results overwrite them */
1485  ptr1 = ptr1 - 8u;
1486 
1487 
1488  /* xa' = xa + xb + xc + xd */
1489  /* ya' = ya + yb + yc + yd */
1490  *__SIMD32(ptr1)++ = __SHADD16(R, T);
1491 
1492  /* T = packed((yb + yd), (xb + xd)) (recomputed; same operands as above) */
1493  T = __QADD16(xbyb, xdyd);
1494 
1495  /* xc' = (xa-xb+xc-xd) */
1496  /* yc' = (ya-yb+yc-yd) */
1497  *__SIMD32(ptr1)++ = __SHSUB16(R, T);
1498 
1499  /* S = packed((ya - yc), (xa - xc)) */
1500  S = __QSUB16(xaya, xcyc);
1501 
1502  /* difference of the b and d samples read above */
1503  /* U = packed( (yb - yd), (xb - xd)) */
1504  U = __QSUB16(xbyb, xdyd);
1505 
1506 #ifndef ARM_MATH_BIG_ENDIAN
1507 
1508  /* xb' = (xa-yb-xc+yd) */
1509  /* yb' = (ya+xb-yc-xd) */
1510  *__SIMD32(ptr1)++ = __SHASX(S, U);
1511 
1512 
1513  /* xd' = (xa+yb-xc-yd) */
1514  /* yd' = (ya-xb-yc+xd) */
1515  *__SIMD32(ptr1)++ = __SHSAX(S, U);
1516 
1517 #else
1518 
1519  /* xb' = (xa-yb-xc+yd) */
1520  /* yb' = (ya+xb-yc-xd) */
1521  *__SIMD32(ptr1)++ = __SHSAX(S, U);
1522 
1523 
1524  /* xd' = (xa+yb-xc-yd) */
1525  /* yd' = (ya-xb-yc+xd) */
1526  *__SIMD32(ptr1)++ = __SHASX(S, U);
1527 
1528 
1529 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1530 
1531  } while(--j);
1532 
1533  /* end of last stage process */
1534 
1535  /* output is in 11.5(q5) format for the 1024 point */
1536  /* output is in 9.7(q7) format for the 256 point */
1537  /* output is in 7.9(q9) format for the 64 point */
1538  /* output is in 5.11(q11) format for the 16 point */
1539 
1540 
1541 #else
1542 
1543  /* Run the below code for Cortex-M0 */
1544 
1545  q15_t R0, R1, S0, S1, T0, T1, U0, U1;
1546  q15_t Co1, Si1, Co2, Si2, Co3, Si3, out1, out2;
1547  uint32_t n1, n2, ic, i0, i1, i2, i3, j, k;
1548 
1549  /* Total process is divided into three stages */
1550 
1551  /* process first stage, middle stages, & last stage */
1552 
1553  /* Initializations for the first stage */
1554  n2 = fftLen;
1555  n1 = n2;
1556 
1557  /* n2 = fftLen/4 */
1558  n2 >>= 2u;
1559 
1560  /* Index for twiddle coefficient */
1561  ic = 0u;
1562 
1563  /* Index for input read and output write */
1564  i0 = 0u;
1565 
1566  j = n2;
1567 
1568  /* Input is in 1.15(q15) format */
1569 
1570  /* Start of first stage process */
1571  do
1572  {
1573  /* Butterfly implementation */
1574 
1575  /* index calculation for the input as, */
1576  /* pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
1577  i1 = i0 + n2;
1578  i2 = i1 + n2;
1579  i3 = i2 + n2;
1580 
1581  /* Reading i0, i0+fftLen/2 inputs */
1582  /* input is down scale by 4 to avoid overflow */
1583  /* Read xa (real), ya (imag) input */
1584  T0 = pSrc16[i0 * 2u] >> 2u;
1585  T1 = pSrc16[(i0 * 2u) + 1u] >> 2u;
1586  /* input is down scale by 4 to avoid overflow */
1587  /* Read xc (real), yc (imag) input */
1588  S0 = pSrc16[i2 * 2u] >> 2u;
1589  S1 = pSrc16[(i2 * 2u) + 1u] >> 2u;
1590 
1591  /* R0 = (xa + xc), R1 = (ya + yc) */
1592  R0 = __SSAT(T0 + S0, 16u);
1593  R1 = __SSAT(T1 + S1, 16u);
1594  /* S0 = (xa - xc), S1 = (ya - yc) */
1595  S0 = __SSAT(T0 - S0, 16u);
1596  S1 = __SSAT(T1 - S1, 16u);
1597 
1598  /* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
1599  /* input is down scale by 4 to avoid overflow */
1600  /* Read xb (real), yb (imag) input */
1601  T0 = pSrc16[i1 * 2u] >> 2u;
1602  T1 = pSrc16[(i1 * 2u) + 1u] >> 2u;
1603  /* Read xd (real), yd (imag) input */
1604  /* input is down scale by 4 to avoid overflow */
1605  U0 = pSrc16[i3 * 2u] >> 2u;
1606  U1 = pSrc16[(i3 * 2u) + 1u] >> 2u;
1607 
1608  /* T0 = (xb + xd), T1 = (yb + yd) */
1609  T0 = __SSAT(T0 + U0, 16u);
1610  T1 = __SSAT(T1 + U1, 16u);
1611 
1612  /* writing the butterfly processed i0 sample */
1613  /* xa' = xa + xb + xc + xd */
1614  /* ya' = ya + yb + yc + yd */
1615  pSrc16[i0 * 2u] = (R0 >> 1u) + (T0 >> 1u);
1616  pSrc16[(i0 * 2u) + 1u] = (R1 >> 1u) + (T1 >> 1u);
1617 
1618  /* R0 = (xa + xc) - (xb + xd), R1 = (ya + yc) - (yb + yd) */
1619  R0 = __SSAT(R0 - T0, 16u);
1620  R1 = __SSAT(R1 - T1, 16u);
1621  /* co2 & si2 are read from Coefficient pointer */
1622  Co2 = pCoef16[2u * ic * 2u];
1623  Si2 = pCoef16[(2u * ic * 2u) + 1u];
1624  /* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
1625  out1 = (q15_t) ((Co2 * R0 - Si2 * R1) >> 16u);
1626  /* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
1627  out2 = (q15_t) ((Si2 * R0 + Co2 * R1) >> 16u);
1628 
1629  /* Re-reading i0+fftLen/4 before it is overwritten below */
1630  /* input is down scale by 4 to avoid overflow */
1631  /* T0 = xb, T1 = yb */
1632  T0 = pSrc16[i1 * 2u] >> 2u;
1633  T1 = pSrc16[(i1 * 2u) + 1u] >> 2u;
1634 
1635  /* writing the butterfly processed i0 + fftLen/4 sample */
1636  /* writing output(xc', yc'): real part first, then imaginary part */
1637  pSrc16[i1 * 2u] = out1;
1638  pSrc16[(i1 * 2u) + 1u] = out2;
1639 
1640  /* Butterfly calculations */
1641  /* input is down scale by 4 to avoid overflow */
1642  /* U0 = xd, U1 = yd */
1643  U0 = pSrc16[i3 * 2u] >> 2u;
1644  U1 = pSrc16[(i3 * 2u) + 1u] >> 2u;
1645 
1646  /* T0 = xb-xd, T1 = yb-yd */
1647  T0 = __SSAT(T0 - U0, 16u);
1648  T1 = __SSAT(T1 - U1, 16u);
1649  /* R0 = (xa-xc) + (yb-yd) , R1 = (ya-yc) - (xb- xd) */
1650  R0 = (q15_t) __SSAT((q31_t) (S0 + T1), 16);
1651  R1 = (q15_t) __SSAT((q31_t) (S1 - T0), 16);
1652  /* S0 = (xa-xc) - (yb-yd), S1 = (ya-yc) + (xb- xd) */
1653  S0 = (q15_t) __SSAT((q31_t) (S0 - T1), 16);
1654  S1 = (q15_t) __SSAT((q31_t) (S1 + T0), 16);
1655 
1656  /* co1 & si1 are read from Coefficient pointer */
1657  Co1 = pCoef16[ic * 2u];
1658  Si1 = pCoef16[(ic * 2u) + 1u];
1659  /* Butterfly process for the i0+fftLen/2 sample */
1660  /* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */
1661  out1 = (q15_t) ((Co1 * S0 - Si1 * S1) >> 16u);
1662  /* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */
1663  out2 = (q15_t) ((Si1 * S0 + Co1 * S1) >> 16u);
1664  /* writing output(xb', yb'): real part first, then imaginary part */
1665  pSrc16[i2 * 2u] = out1;
1666  pSrc16[(i2 * 2u) + 1u] = out2;
1667 
1668  /* Co3 & si3 are read from Coefficient pointer */
1669  Co3 = pCoef16[3u * ic * 2u];
1670  Si3 = pCoef16[(3u * ic * 2u) + 1u];
1671  /* Butterfly process for the i0+3fftLen/4 sample */
1672  /* xd' = (xa+yb-xc-yd)* Co3 - (ya-xb-yc+xd)* (si3) */
1673  out1 = (q15_t) ((Co3 * R0 - Si3 * R1) >> 16u);
1674  /* yd' = (ya-xb-yc+xd)* Co3 + (xa+yb-xc-yd)* (si3) */
1675  out2 = (q15_t) ((Si3 * R0 + Co3 * R1) >> 16u);
1676  /* writing output(xd', yd'): real part first, then imaginary part */
1677  pSrc16[i3 * 2u] = out1;
1678  pSrc16[(i3 * 2u) + 1u] = out2;
1679 
1680  /* Twiddle coefficients index modifier */
1681  ic = ic + twidCoefModifier;
1682 
1683  /* Updating input index */
1684  i0 = i0 + 1u;
1685 
1686  } while(--j);
1687 
1688  /* End of first stage process */
1689 
1690  /* data is in 4.11(q11) format */
1691 
1692 
1693  /* Start of Middle stage process */
1694 
1695  /* Twiddle coefficients index modifier */
1696  twidCoefModifier <<= 2u;
1697 
1698  /* Calculation of Middle stage */
1699  for (k = fftLen / 4u; k > 4u; k >>= 2u)
1700  {
1701  /* Initializations for the middle stage */
1702  n1 = n2;
1703  n2 >>= 2u;
1704  ic = 0u;
1705 
1706  for (j = 0u; j <= (n2 - 1u); j++)
1707  {
1708  /* index calculation for the coefficients */
1709  Co1 = pCoef16[ic * 2u];
1710  Si1 = pCoef16[(ic * 2u) + 1u];
1711  Co2 = pCoef16[2u * ic * 2u];
1712  Si2 = pCoef16[2u * ic * 2u + 1u];
1713  Co3 = pCoef16[3u * ic * 2u];
1714  Si3 = pCoef16[(3u * ic * 2u) + 1u];
1715 
1716  /* Twiddle coefficients index modifier */
1717  ic = ic + twidCoefModifier;
1718 
1719  /* Butterfly implementation */
1720  for (i0 = j; i0 < fftLen; i0 += n1)
1721  {
1722  /* index calculation for the input as, */
1723  /* pSrc16[i0 + 0], pSrc16[i0 + n2], pSrc16[i0 + 2*n2], pSrc16[i0 + 3*n2] */
1724  i1 = i0 + n2;
1725  i2 = i1 + n2;
1726  i3 = i2 + n2;
1727 
1728  /* Reading i0, i2 inputs */
1729  /* Read xa (real), ya (imag) input */
1730  T0 = pSrc16[i0 * 2u];
1731  T1 = pSrc16[(i0 * 2u) + 1u];
1732 
1733  /* Read xc (real), yc (imag) input */
1734  S0 = pSrc16[i2 * 2u];
1735  S1 = pSrc16[(i2 * 2u) + 1u];
1736 
1737 
1738  /* R0 = (xa + xc), R1 = (ya + yc) */
1739  R0 = __SSAT(T0 + S0, 16u);
1740  R1 = __SSAT(T1 + S1, 16u);
1741  /* S0 = (xa - xc), S1 = (ya - yc) */
1742  S0 = __SSAT(T0 - S0, 16u);
1743  S1 = __SSAT(T1 - S1, 16u);
1744 
1745  /* Reading i1, i3 inputs */
1746  /* Read xb (real), yb (imag) input */
1747  T0 = pSrc16[i1 * 2u];
1748  T1 = pSrc16[(i1 * 2u) + 1u];
1749 
1750  /* Read xd (real), yd (imag) input */
1751  U0 = pSrc16[i3 * 2u];
1752  U1 = pSrc16[(i3 * 2u) + 1u];
1753 
1754  /* T0 = (xb + xd), T1 = (yb + yd) */
1755  T0 = __SSAT(T0 + U0, 16u);
1756  T1 = __SSAT(T1 + U1, 16u);
1757 
1758  /* writing the butterfly processed i0 sample */
1759  /* xa' = xa + xb + xc + xd */
1760  /* ya' = ya + yb + yc + yd */
1761  pSrc16[i0 * 2u] = ((R0 >> 1u) + (T0 >> 1u)) >> 1u;
1762  pSrc16[(i0 * 2u) + 1u] = ((R1 >> 1u) + (T1 >> 1u)) >> 1u;
1763 
1764  /* R0 = (xa + xc) - (xb + xd), R1 = (ya + yc) - (yb + yd), each operand halved first */
1765  R0 = (R0 >> 1u) - (T0 >> 1u);
1766  R1 = (R1 >> 1u) - (T1 >> 1u);
1767 
1768  /* (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
1769  out1 = (q15_t) ((Co2 * R0 - Si2 * R1) >> 16);
1770  /* (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
1771  out2 = (q15_t) ((Si2 * R0 + Co2 * R1) >> 16);
1772 
1773  /* Re-reading the i1 sample before it is overwritten below */
1774  /* Read xb (real), yb (imag) input */
1775  T0 = pSrc16[i1 * 2u];
1776  T1 = pSrc16[(i1 * 2u) + 1u];
1777 
1778  /* writing the butterfly processed i1 sample */
1779  /* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
1780  /* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
1781  pSrc16[i1 * 2u] = out1;
1782  pSrc16[(i1 * 2u) + 1u] = out2;
1783 
1784  /* Butterfly calculations */
1785  /* Read xd (real), yd (imag) input */
1786  U0 = pSrc16[i3 * 2u];
1787  U1 = pSrc16[(i3 * 2u) + 1u];
1788 
1789  /* T0 = xb-xd, T1 = yb-yd */
1790  T0 = __SSAT(T0 - U0, 16u);
1791  T1 = __SSAT(T1 - U1, 16u);
1792 
1793  /* R0 = (xa-xc) + (yb-yd) , R1 = (ya-yc) - (xb- xd), each operand halved first */
1794  R0 = (S0 >> 1u) + (T1 >> 1u);
1795  R1 = (S1 >> 1u) - (T0 >> 1u);
1796 
1797  /* S0 = (xa-xc) - (yb-yd), S1 = (ya-yc) + (xb- xd), each operand halved first */
1798  S0 = (S0 >> 1u) - (T1 >> 1u);
1799  S1 = (S1 >> 1u) + (T0 >> 1u);
1800 
1801  /* Butterfly process for the i2 sample */
1802  out1 = (q15_t) ((Co1 * S0 - Si1 * S1) >> 16u);
1803  out2 = (q15_t) ((Si1 * S0 + Co1 * S1) >> 16u);
1804  /* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */
1805  /* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */
1806  pSrc16[i2 * 2u] = out1;
1807  pSrc16[(i2 * 2u) + 1u] = out2;
1808 
1809  /* Butterfly process for the i3 sample */
1810  out1 = (q15_t) ((Co3 * R0 - Si3 * R1) >> 16u);
1811 
1812  out2 = (q15_t) ((Si3 * R0 + Co3 * R1) >> 16u);
1813  /* xd' = (xa+yb-xc-yd)* Co3 - (ya-xb-yc+xd)* (si3) */
1814  /* yd' = (ya-xb-yc+xd)* Co3 + (xa+yb-xc-yd)* (si3) */
1815  pSrc16[i3 * 2u] = out1;
1816  pSrc16[(i3 * 2u) + 1u] = out2;
1817 
1818 
1819  }
1820  }
1821  /* Twiddle coefficients index modifier */
1822  twidCoefModifier <<= 2u;
1823  }
1824  /* End of Middle stages process */
1825 
1826 
1827  /* data is in 10.6(q6) format for the 1024 point */
1828  /* data is in 8.8(q8) format for the 256 point */
1829  /* data is in 6.10(q10) format for the 64 point */
1830  /* data is in 4.12(q12) format for the 16 point */
1831 
1832  /* start of last stage process */
1833 
1834 
1835  /* Initializations for the last stage */
1836  n1 = n2;
1837  n2 >>= 2u;
1838 
1839  /* Butterfly implementation (no twiddle multiplication in this stage) */
1840  for (i0 = 0u; i0 <= (fftLen - n1); i0 += n1)
1841  {
1842  /* index calculation for the input as, */
1843  /* pSrc16[i0 + 0], pSrc16[i0 + n2], pSrc16[i0 + 2*n2], pSrc16[i0 + 3*n2] */
1844  i1 = i0 + n2;
1845  i2 = i1 + n2;
1846  i3 = i2 + n2;
1847 
1848  /* Reading i0, i2 inputs */
1849  /* Read xa (real), ya (imag) input */
1850  T0 = pSrc16[i0 * 2u];
1851  T1 = pSrc16[(i0 * 2u) + 1u];
1852  /* Read xc (real), yc (imag) input */
1853  S0 = pSrc16[i2 * 2u];
1854  S1 = pSrc16[(i2 * 2u) + 1u];
1855 
1856  /* R0 = (xa + xc), R1 = (ya + yc) */
1857  R0 = __SSAT(T0 + S0, 16u);
1858  R1 = __SSAT(T1 + S1, 16u);
1859  /* S0 = (xa - xc), S1 = (ya - yc) */
1860  S0 = __SSAT(T0 - S0, 16u);
1861  S1 = __SSAT(T1 - S1, 16u);
1862 
1863  /* Reading i1, i3 inputs */
1864  /* Read xb (real), yb (imag) input */
1865  T0 = pSrc16[i1 * 2u];
1866  T1 = pSrc16[(i1 * 2u) + 1u];
1867  /* Read xd (real), yd (imag) input */
1868  U0 = pSrc16[i3 * 2u];
1869  U1 = pSrc16[(i3 * 2u) + 1u];
1870 
1871  /* T0 = (xb + xd), T1 = (yb + yd) */
1872  T0 = __SSAT(T0 + U0, 16u);
1873  T1 = __SSAT(T1 + U1, 16u);
1874 
1875  /* writing the butterfly processed i0 sample */
1876  /* xa' = xa + xb + xc + xd */
1877  /* ya' = ya + yb + yc + yd */
1878  pSrc16[i0 * 2u] = (R0 >> 1u) + (T0 >> 1u);
1879  pSrc16[(i0 * 2u) + 1u] = (R1 >> 1u) + (T1 >> 1u);
1880 
1881  /* R0 = (xa + xc) - (xb + xd), R1 = (ya + yc) - (yb + yd), each operand halved first */
1882  R0 = (R0 >> 1u) - (T0 >> 1u);
1883  R1 = (R1 >> 1u) - (T1 >> 1u);
1884 
1885  /* Re-read xb (real), yb (imag) before the i1 sample is overwritten below */
1886  T0 = pSrc16[i1 * 2u];
1887  T1 = pSrc16[(i1 * 2u) + 1u];
1888 
1889  /* writing the butterfly processed i1 sample */
1890  /* xc' = (xa-xb+xc-xd) */
1891  /* yc' = (ya-yb+yc-yd) */
1892  pSrc16[i1 * 2u] = R0;
1893  pSrc16[(i1 * 2u) + 1u] = R1;
1894 
1895  /* Read xd (real), yd (imag) input */
1896  U0 = pSrc16[i3 * 2u];
1897  U1 = pSrc16[(i3 * 2u) + 1u];
1898  /* T0 = (xb - xd), T1 = (yb - yd) */
1899  T0 = __SSAT(T0 - U0, 16u);
1900  T1 = __SSAT(T1 - U1, 16u);
1901 
1902  /* writing the butterfly processed i2 sample */
1903  /* xb' = (xa-yb-xc+yd) */
1904  /* yb' = (ya+xb-yc-xd) */
1905  pSrc16[i2 * 2u] = (S0 >> 1u) - (T1 >> 1u);
1906  pSrc16[(i2 * 2u) + 1u] = (S1 >> 1u) + (T0 >> 1u);
1907 
1908 
1909  /* writing the butterfly processed i3 sample */
1910  /* xd' = (xa+yb-xc-yd) */
1911  /* yd' = (ya-xb-yc+xd) */
1912  pSrc16[i3 * 2u] = (S0 >> 1u) + (T1 >> 1u);
1913  pSrc16[(i3 * 2u) + 1u] = (S1 >> 1u) - (T0 >> 1u);
1914  }
1915  /* end of last stage process */
1916 
1917  /* output is in 11.5(q5) format for the 1024 point */
1918  /* output is in 9.7(q7) format for the 256 point */
1919  /* output is in 7.9(q9) format for the 64 point */
1920  /* output is in 5.11(q11) format for the 16 point */
1921 
1922 #endif /* #ifndef ARM_MATH_CM0_FAMILY */
1923 
1924 }
void arm_cfft_radix4_q15(const arm_cfft_radix4_instance_q15 *S, q15_t *pSrc)
Processing function for the Q15 CFFT/CIFFT.
void arm_radix4_butterfly_q15(q15_t *pSrc16, uint32_t fftLen, q15_t *pCoef16, uint32_t twidCoefModifier)
Core function for the Q15 CFFT butterfly process.
void arm_bitreversal_q15(q15_t *pSrc, uint32_t fftLen, uint16_t bitRevFactor, uint16_t *pBitRevTab)
int16_t q15_t
16-bit fractional data type in 1.15 format.
Definition: arm_math.h:392
#define __SIMD32(addr)
definition to read/write two 16 bit values.
Definition: arm_math.h:445
#define _SIMD32_OFFSET(addr)
Definition: arm_math.h:447
int32_t q31_t
32-bit fractional data type in 1.31 format.
Definition: arm_math.h:397
Instance structure for the Q15 CFFT/CIFFT function.
Definition: arm_math.h:1975
void arm_radix4_butterfly_inverse_q15(q15_t *pSrc16, uint32_t fftLen, q15_t *pCoef16, uint32_t twidCoefModifier)
Core function for the Q15 CIFFT butterfly process.