STM32F769IDiscovery  1.00
uDANTE Audio Networking with STM32F7 DISCO board
arm_mat_mult_fast_q15.c
Go to the documentation of this file.
1 /* ----------------------------------------------------------------------
2 * Copyright (C) 2010-2014 ARM Limited. All rights reserved.
3 *
4 * $Date: 19. March 2015
5 * $Revision: V.1.4.5
6 *
7 * Project: CMSIS DSP Library
8 * Title: arm_mat_mult_fast_q15.c
9 *
10 * Description: Q15 matrix multiplication (fast variant)
11 *
12 * Target Processor: Cortex-M4/Cortex-M3
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 * - Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * - Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in
21 * the documentation and/or other materials provided with the
22 * distribution.
23 * - Neither the name of ARM LIMITED nor the names of its contributors
24 * may be used to endorse or promote products derived from this
25 * software without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
30 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
31 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
34 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
35 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
37 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38 * POSSIBILITY OF SUCH DAMAGE.
39 * -------------------------------------------------------------------- */
40 
41 #include "arm_math.h"
42 
86  const arm_matrix_instance_q15 * pSrcA,
87  const arm_matrix_instance_q15 * pSrcB,
89  q15_t * pState)
90 {
91  q31_t sum; /* accumulator */
92  q15_t *pSrcBT = pState; /* input data matrix pointer for transpose */
93  q15_t *pInA = pSrcA->pData; /* input data matrix pointer A of Q15 type */
94  q15_t *pInB = pSrcB->pData; /* input data matrix pointer B of Q15 type */
95  q15_t *px; /* Temporary output data matrix pointer */
96  uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
97  uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
98  uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
99  uint16_t numRowsB = pSrcB->numRows; /* number of rows of input matrix B */
100  uint16_t col, i = 0u, row = numRowsB, colCnt; /* loop counters */
101  arm_status status; /* status of matrix multiplication */
102 
103 #ifndef UNALIGNED_SUPPORT_DISABLE
104 
105  q31_t in; /* Temporary variable to hold the input value */
106  q31_t inA1, inA2, inB1, inB2;
107 
108 #else
109 
110  q15_t in; /* Temporary variable to hold the input value */
111  q15_t inA1, inA2, inB1, inB2;
112 
113 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
114 
115 #ifdef ARM_MATH_MATRIX_CHECK
116  /* Check for matrix mismatch condition */
117  if((pSrcA->numCols != pSrcB->numRows) ||
118  (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
119  {
120  /* Set status as ARM_MATH_SIZE_MISMATCH */
121  status = ARM_MATH_SIZE_MISMATCH;
122  }
123  else
124 #endif
125  {
126  /* Matrix transpose */
127  do
128  {
129  /* Apply loop unrolling and exchange the columns with row elements */
130  col = numColsB >> 2;
131 
132  /* The pointer px is set to starting address of the column being processed */
133  px = pSrcBT + i;
134 
135  /* First part of the processing with loop unrolling. Copy 4 elements at a time.
136  ** A second loop below copies the remaining 1 to 3 elements. */
137  while(col > 0u)
138  {
139 #ifndef UNALIGNED_SUPPORT_DISABLE
140  /* Read two elements from the row */
141  in = *__SIMD32(pInB)++;
142 
143  /* Unpack and store one element in the destination */
144 #ifndef ARM_MATH_BIG_ENDIAN
145 
146  *px = (q15_t) in;
147 
148 #else
149 
150  *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
151 
152 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
153 
154  /* Update the pointer px to point to the next row of the transposed matrix */
155  px += numRowsB;
156 
157  /* Unpack and store the second element in the destination */
158 #ifndef ARM_MATH_BIG_ENDIAN
159 
160  *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
161 
162 #else
163 
164  *px = (q15_t) in;
165 
166 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
167 
168  /* Update the pointer px to point to the next row of the transposed matrix */
169  px += numRowsB;
170 
171  /* Read two elements from the row */
172  in = *__SIMD32(pInB)++;
173 
174  /* Unpack and store one element in the destination */
175 #ifndef ARM_MATH_BIG_ENDIAN
176 
177  *px = (q15_t) in;
178 
179 #else
180 
181  *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
182 
183 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
184 
185  /* Update the pointer px to point to the next row of the transposed matrix */
186  px += numRowsB;
187 
188  /* Unpack and store the second element in the destination */
189 
190 #ifndef ARM_MATH_BIG_ENDIAN
191 
192  *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
193 
194 #else
195 
196  *px = (q15_t) in;
197 
198 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
199 
200 #else
201 
202  /* Read one element from the row */
203  in = *pInB++;
204 
205  /* Store one element in the destination */
206  *px = in;
207 
208  /* Update the pointer px to point to the next row of the transposed matrix */
209  px += numRowsB;
210 
211  /* Read one element from the row */
212  in = *pInB++;
213 
214  /* Store one element in the destination */
215  *px = in;
216 
217  /* Update the pointer px to point to the next row of the transposed matrix */
218  px += numRowsB;
219 
220  /* Read one element from the row */
221  in = *pInB++;
222 
223  /* Store one element in the destination */
224  *px = in;
225 
226  /* Update the pointer px to point to the next row of the transposed matrix */
227  px += numRowsB;
228 
229  /* Read one element from the row */
230  in = *pInB++;
231 
232  /* Store one element in the destination */
233  *px = in;
234 
235 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
236 
237  /* Update the pointer px to point to the next row of the transposed matrix */
238  px += numRowsB;
239 
240  /* Decrement the column loop counter */
241  col--;
242  }
243 
244  /* If the number of columns of pSrcB is not a multiple of 4, copy the remaining 1 to 3 elements here.
245  ** No loop unrolling is used. */
246  col = numColsB % 0x4u;
247 
248  while(col > 0u)
249  {
250  /* Read and store the input element in the destination */
251  *px = *pInB++;
252 
253  /* Update the pointer px to point to the next row of the transposed matrix */
254  px += numRowsB;
255 
256  /* Decrement the column loop counter */
257  col--;
258  }
259 
260  i++;
261 
262  /* Decrement the row loop counter */
263  row--;
264 
265  } while(row > 0u);
266 
267  /* Reset the variables for the usage in the following multiplication process */
268  row = numRowsA;
269  i = 0u;
270  px = pDst->pData;
271 
272  /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
273  /* row loop */
274  do
275  {
276  /* For every row wise process, the column loop counter is to be initiated */
277  col = numColsB;
278 
279  /* For every row wise process, the pInB pointer is set
280  ** to the starting address of the transposed pSrcB data */
281  pInB = pSrcBT;
282 
283  /* column loop */
284  do
285  {
286  /* Set the variable sum, that acts as accumulator, to zero */
287  sum = 0;
288 
289  /* Apply loop unrolling and compute 2 MACs simultaneously. */
290  colCnt = numColsA >> 2;
291 
292  /* Initialize the pointer pInA to point to the starting address of the row of pSrcA being processed */
293  pInA = pSrcA->pData + i;
294 
295  /* matrix multiplication */
296  while(colCnt > 0u)
297  {
298  /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
299 #ifndef UNALIGNED_SUPPORT_DISABLE
300 
301  inA1 = *__SIMD32(pInA)++;
302  inB1 = *__SIMD32(pInB)++;
303  inA2 = *__SIMD32(pInA)++;
304  inB2 = *__SIMD32(pInB)++;
305 
306  sum = __SMLAD(inA1, inB1, sum);
307  sum = __SMLAD(inA2, inB2, sum);
308 
309 #else
310 
311  inA1 = *pInA++;
312  inB1 = *pInB++;
313  inA2 = *pInA++;
314  sum += inA1 * inB1;
315  inB2 = *pInB++;
316 
317  inA1 = *pInA++;
318  inB1 = *pInB++;
319  sum += inA2 * inB2;
320  inA2 = *pInA++;
321  inB2 = *pInB++;
322 
323  sum += inA1 * inB1;
324  sum += inA2 * inB2;
325 
326 #endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
327 
328  /* Decrement the loop counter */
329  colCnt--;
330  }
331 
332  /* Process the remaining 1 to 3 column samples (numColsA % 4) */
333  colCnt = numColsA % 0x4u;
334 
335  while(colCnt > 0u)
336  {
337  /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
338  sum += (q31_t) (*pInA++) * (*pInB++);
339 
340  colCnt--;
341  }
342 
343  /* Shift the accumulator right by 15 bits and store it as q15_t (truncating cast, no saturation is performed) */
344  *px = (q15_t) (sum >> 15);
345  px++;
346 
347  /* Decrement the column loop counter */
348  col--;
349 
350  } while(col > 0u);
351 
352  i = i + numColsA;
353 
354  /* Decrement the row loop counter */
355  row--;
356 
357  } while(row > 0u);
358 
359  /* set status as ARM_MATH_SUCCESS */
360  status = ARM_MATH_SUCCESS;
361  }
362 
363  /* Return to application */
364  return (status);
365 }
366 
int16_t q15_t
16-bit fractional data type in 1.15 format.
Definition: arm_math.h:392
arm_status arm_mat_mult_fast_q15(const arm_matrix_instance_q15 *pSrcA, const arm_matrix_instance_q15 *pSrcB, arm_matrix_instance_q15 *pDst, q15_t *pState)
Q15 matrix multiplication (fast variant) for Cortex-M3 and Cortex-M4.
#define __SIMD32(addr)
definition to read/write two 16 bit values.
Definition: arm_math.h:445
Instance structure for the Q15 matrix structure.
Definition: arm_math.h:1390
int32_t q31_t
32-bit fractional data type in 1.31 format.
Definition: arm_math.h:397
arm_status
Error status returned by some functions in the library.
Definition: arm_math.h:373