Vector Optimized Library of Kernels 3.1.2
Architecture-tuned implementations of math kernels
 
volk_32fc_s32f_atan2_32f.h
/* -*- c++ -*- */
/*
 * Copyright 2012, 2014 Free Software Foundation, Inc.
 * Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com>
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

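/*!
 * \b Overview
 *
 * Computes the phase atan2(imag, real) of each complex input sample and
 * scales the result by 1 / normalizeFactor.
 *
 * A minimal usage sketch via the public dispatcher (assumes the standard VOLK
 * helpers volk_get_alignment, volk_malloc and volk_free; buffer contents and
 * the chosen scale are illustrative only):
 *
 * \code
 *   unsigned int N = 1024;
 *   size_t alignment = volk_get_alignment();
 *   lv_32fc_t* in = (lv_32fc_t*)volk_malloc(N * sizeof(lv_32fc_t), alignment);
 *   float* out = (float*)volk_malloc(N * sizeof(float), alignment);
 *
 *   // ... fill in[] with complex samples ...
 *
 *   // scale by pi so the output phase lies in [-1, 1]
 *   volk_32fc_s32f_atan2_32f(out, in, 3.14159265358979f, N);
 *
 *   volk_free(in);
 *   volk_free(out);
 * \endcode
 */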
#ifndef INCLUDED_volk_32fc_s32f_atan2_32f_a_H
#define INCLUDED_volk_32fc_s32f_atan2_32f_a_H

#include <math.h>

#ifdef LV_HAVE_GENERIC
static inline void volk_32fc_s32f_atan2_32f_generic(float* outputVector,
                                                    const lv_32fc_t* inputVector,
                                                    const float normalizeFactor,
                                                    unsigned int num_points)
{
    float* outPtr = outputVector;
    const float* inPtr = (float*)inputVector;
    const float invNormalizeFactor = 1.f / normalizeFactor;
    unsigned int number = 0;
    for (; number < num_points; number++) {
        const float real = *inPtr++;
        const float imag = *inPtr++;
        *outPtr++ = atan2f(imag, real) * invNormalizeFactor;
    }
}
#endif /* LV_HAVE_GENERIC */

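/* The _polynomial variant below runs the same per-point loop as the generic
 * kernel, but computes the angle with volk_atan2() from volk_common.h, a
 * polynomial approximation that is typically faster (and slightly less
 * accurate) than libm's atan2f. */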
#ifdef LV_HAVE_GENERIC
#include <volk/volk_common.h>
static inline void volk_32fc_s32f_atan2_32f_polynomial(float* outputVector,
                                                       const lv_32fc_t* inputVector,
                                                       const float normalizeFactor,
                                                       unsigned int num_points)
{
    float* outPtr = outputVector;
    const float* inPtr = (float*)inputVector;
    const float invNormalizeFactor = 1.f / normalizeFactor;
    unsigned int number = 0;
    for (; number < num_points; number++) {
        const float x = *inPtr++;
        const float y = *inPtr++;
        *outPtr++ = volk_atan2(y, x) * invNormalizeFactor;
    }
}
#endif /* LV_HAVE_GENERIC */

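/* The AVX2 kernels below vectorize atan2(y, x) eight points at a time.
 * Per lane they:
 *   1. form the reduced argument t = x/y when |y| > |x|, else t = y/x,
 *      so that |t| <= 1 (0/0 lanes are forced to 0 via the NaN mask);
 *   2. approximate atan(t) with a polynomial (_m256_arctan_poly_*);
 *   3. when the arguments were swapped, recover the angle via
 *      atan(z) = copysign(pi/2, z) - atan(1/z);
 *   4. add copysign(pi, y) when x < 0 to land in the correct quadrant;
 *   5. scale by 1 / normalizeFactor.
 * The remaining (num_points % 8) samples are handled by the scalar
 * polynomial kernel above. */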
#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>
#include <volk/volk_avx2_fma_intrinsics.h>
static inline void volk_32fc_s32f_atan2_32f_a_avx2_fma(float* outputVector,
                                                       const lv_32fc_t* complexVector,
                                                       const float normalizeFactor,
                                                       unsigned int num_points)
{
    const float* in = (float*)complexVector;
    float* out = (float*)outputVector;

    const float invNormalizeFactor = 1.f / normalizeFactor;
    const __m256 vinvNormalizeFactor = _mm256_set1_ps(invNormalizeFactor);
    const __m256 pi = _mm256_set1_ps(0x1.921fb6p1f);
    const __m256 pi_2 = _mm256_set1_ps(0x1.921fb6p0f);
    const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
    const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
    const __m256 zero = _mm256_setzero_ps();

    unsigned int number = 0;
    unsigned int eighth_points = num_points / 8;
    for (; number < eighth_points; number++) {
        __m256 z1 = _mm256_load_ps(in);
        in += 8;
        __m256 z2 = _mm256_load_ps(in);
        in += 8;

        __m256 x = _mm256_real(z1, z2);
        __m256 y = _mm256_imag(z1, z2);

        __m256 swap_mask = _mm256_cmp_ps(
            _mm256_and_ps(y, abs_mask), _mm256_and_ps(x, abs_mask), _CMP_GT_OS);
        __m256 input = _mm256_div_ps(_mm256_blendv_ps(y, x, swap_mask),
                                     _mm256_blendv_ps(x, y, swap_mask));
        __m256 nan_mask = _mm256_cmp_ps(input, input, _CMP_UNORD_Q);
        input = _mm256_blendv_ps(input, zero, nan_mask);
        __m256 result = _m256_arctan_poly_avx2_fma(input);

        input =
            _mm256_sub_ps(_mm256_or_ps(pi_2, _mm256_and_ps(input, sign_mask)), result);
        result = _mm256_blendv_ps(result, input, swap_mask);

        __m256 x_sign_mask =
            _mm256_castsi256_ps(_mm256_srai_epi32(_mm256_castps_si256(x), 31));

        result = _mm256_add_ps(
            _mm256_and_ps(_mm256_xor_ps(pi, _mm256_and_ps(sign_mask, y)), x_sign_mask),
            result);
        result = _mm256_mul_ps(result, vinvNormalizeFactor);

        _mm256_store_ps(out, result);
        out += 8;
    }

    number = eighth_points * 8;
    volk_32fc_s32f_atan2_32f_polynomial(
        out, (lv_32fc_t*)in, normalizeFactor, num_points - number);
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */

#if LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>
static inline void volk_32fc_s32f_atan2_32f_a_avx2(float* outputVector,
                                                   const lv_32fc_t* complexVector,
                                                   const float normalizeFactor,
                                                   unsigned int num_points)
{
    const float* in = (float*)complexVector;
    float* out = (float*)outputVector;

    const float invNormalizeFactor = 1.f / normalizeFactor;
    const __m256 vinvNormalizeFactor = _mm256_set1_ps(invNormalizeFactor);
    const __m256 pi = _mm256_set1_ps(0x1.921fb6p1f);
    const __m256 pi_2 = _mm256_set1_ps(0x1.921fb6p0f);
    const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
    const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
    const __m256 zero = _mm256_setzero_ps();

    unsigned int number = 0;
    unsigned int eighth_points = num_points / 8;
    for (; number < eighth_points; number++) {
        __m256 z1 = _mm256_load_ps(in);
        in += 8;
        __m256 z2 = _mm256_load_ps(in);
        in += 8;

        __m256 x = _mm256_real(z1, z2);
        __m256 y = _mm256_imag(z1, z2);

        __m256 swap_mask = _mm256_cmp_ps(
            _mm256_and_ps(y, abs_mask), _mm256_and_ps(x, abs_mask), _CMP_GT_OS);
        __m256 input = _mm256_div_ps(_mm256_blendv_ps(y, x, swap_mask),
                                     _mm256_blendv_ps(x, y, swap_mask));
        __m256 nan_mask = _mm256_cmp_ps(input, input, _CMP_UNORD_Q);
        input = _mm256_blendv_ps(input, zero, nan_mask);
        __m256 result = _m256_arctan_poly_avx(input);

        input =
            _mm256_sub_ps(_mm256_or_ps(pi_2, _mm256_and_ps(input, sign_mask)), result);
        result = _mm256_blendv_ps(result, input, swap_mask);

        __m256 x_sign_mask =
            _mm256_castsi256_ps(_mm256_srai_epi32(_mm256_castps_si256(x), 31));

        result = _mm256_add_ps(
            _mm256_and_ps(_mm256_xor_ps(pi, _mm256_and_ps(sign_mask, y)), x_sign_mask),
            result);
        result = _mm256_mul_ps(result, vinvNormalizeFactor);

        _mm256_store_ps(out, result);
        out += 8;
    }

    number = eighth_points * 8;
    volk_32fc_s32f_atan2_32f_polynomial(
        out, (lv_32fc_t*)in, normalizeFactor, num_points - number);
}
#endif /* LV_HAVE_AVX2 for aligned */
#endif /* INCLUDED_volk_32fc_s32f_atan2_32f_a_H */

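/* The unaligned (_u_) protokernels below are identical to the aligned (_a_)
 * ones above, except that they use _mm256_loadu_ps / _mm256_storeu_ps and so
 * do not require the buffers to be 32-byte aligned. */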
#ifndef INCLUDED_volk_32fc_s32f_atan2_32f_u_H
#define INCLUDED_volk_32fc_s32f_atan2_32f_u_H

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>
#include <volk/volk_avx2_fma_intrinsics.h>
static inline void volk_32fc_s32f_atan2_32f_u_avx2_fma(float* outputVector,
                                                       const lv_32fc_t* complexVector,
                                                       const float normalizeFactor,
                                                       unsigned int num_points)
{
    const float* in = (float*)complexVector;
    float* out = (float*)outputVector;

    const float invNormalizeFactor = 1.f / normalizeFactor;
    const __m256 vinvNormalizeFactor = _mm256_set1_ps(invNormalizeFactor);
    const __m256 pi = _mm256_set1_ps(0x1.921fb6p1f);
    const __m256 pi_2 = _mm256_set1_ps(0x1.921fb6p0f);
    const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
    const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
    const __m256 zero = _mm256_setzero_ps();

    unsigned int number = 0;
    unsigned int eighth_points = num_points / 8;
    for (; number < eighth_points; number++) {
        __m256 z1 = _mm256_loadu_ps(in);
        in += 8;
        __m256 z2 = _mm256_loadu_ps(in);
        in += 8;

        __m256 x = _mm256_real(z1, z2);
        __m256 y = _mm256_imag(z1, z2);

        __m256 swap_mask = _mm256_cmp_ps(
            _mm256_and_ps(y, abs_mask), _mm256_and_ps(x, abs_mask), _CMP_GT_OS);
        __m256 input = _mm256_div_ps(_mm256_blendv_ps(y, x, swap_mask),
                                     _mm256_blendv_ps(x, y, swap_mask));
        __m256 nan_mask = _mm256_cmp_ps(input, input, _CMP_UNORD_Q);
        input = _mm256_blendv_ps(input, zero, nan_mask);
        __m256 result = _m256_arctan_poly_avx2_fma(input);

        input =
            _mm256_sub_ps(_mm256_or_ps(pi_2, _mm256_and_ps(input, sign_mask)), result);
        result = _mm256_blendv_ps(result, input, swap_mask);

        __m256 x_sign_mask =
            _mm256_castsi256_ps(_mm256_srai_epi32(_mm256_castps_si256(x), 31));

        result = _mm256_add_ps(
            _mm256_and_ps(_mm256_xor_ps(pi, _mm256_and_ps(sign_mask, y)), x_sign_mask),
            result);
        result = _mm256_mul_ps(result, vinvNormalizeFactor);

        _mm256_storeu_ps(out, result);
        out += 8;
    }

    number = eighth_points * 8;
    volk_32fc_s32f_atan2_32f_polynomial(
        out, (lv_32fc_t*)in, normalizeFactor, num_points - number);
}
#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */

#if LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>
static inline void volk_32fc_s32f_atan2_32f_u_avx2(float* outputVector,
                                                   const lv_32fc_t* complexVector,
                                                   const float normalizeFactor,
                                                   unsigned int num_points)
{
    const float* in = (float*)complexVector;
    float* out = (float*)outputVector;

    const float invNormalizeFactor = 1.f / normalizeFactor;
    const __m256 vinvNormalizeFactor = _mm256_set1_ps(invNormalizeFactor);
    const __m256 pi = _mm256_set1_ps(0x1.921fb6p1f);
    const __m256 pi_2 = _mm256_set1_ps(0x1.921fb6p0f);
    const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
    const __m256 sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
    const __m256 zero = _mm256_setzero_ps();

    unsigned int number = 0;
    unsigned int eighth_points = num_points / 8;
    for (; number < eighth_points; number++) {
        __m256 z1 = _mm256_loadu_ps(in);
        in += 8;
        __m256 z2 = _mm256_loadu_ps(in);
        in += 8;

        __m256 x = _mm256_real(z1, z2);
        __m256 y = _mm256_imag(z1, z2);

        __m256 swap_mask = _mm256_cmp_ps(
            _mm256_and_ps(y, abs_mask), _mm256_and_ps(x, abs_mask), _CMP_GT_OS);
        __m256 input = _mm256_div_ps(_mm256_blendv_ps(y, x, swap_mask),
                                     _mm256_blendv_ps(x, y, swap_mask));
        __m256 nan_mask = _mm256_cmp_ps(input, input, _CMP_UNORD_Q);
        input = _mm256_blendv_ps(input, zero, nan_mask);
        __m256 result = _m256_arctan_poly_avx(input);

        input =
            _mm256_sub_ps(_mm256_or_ps(pi_2, _mm256_and_ps(input, sign_mask)), result);
        result = _mm256_blendv_ps(result, input, swap_mask);

        __m256 x_sign_mask =
            _mm256_castsi256_ps(_mm256_srai_epi32(_mm256_castps_si256(x), 31));

        result = _mm256_add_ps(
            _mm256_and_ps(_mm256_xor_ps(pi, _mm256_and_ps(sign_mask, y)), x_sign_mask),
            result);
        result = _mm256_mul_ps(result, vinvNormalizeFactor);

        _mm256_storeu_ps(out, result);
        out += 8;
    }

    number = eighth_points * 8;
    volk_32fc_s32f_atan2_32f_polynomial(
        out, (lv_32fc_t*)in, normalizeFactor, num_points - number);
}
#endif /* LV_HAVE_AVX2 for unaligned */

#endif /* INCLUDED_volk_32fc_s32f_atan2_32f_u_H */