[fix](trx-backend-soapysdr): replace avx2 atan2 with high-precision polynomial
Replace the low-accuracy 0.273 linear atan approximation with a 7th-order minimax polynomial (max error ~2.4e-7 rad vs ~0.004 rad). Use branchless |y|>|x| argument reduction instead of y/x division with quadrant fixup, avoiding division-by-zero and NaN branches. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> Signed-off-by: Stan Grams <sjg@haxx.space>
This commit is contained in:
@@ -179,128 +179,82 @@ fn fast_atan2(y: f32, x: f32) -> f32 {
|
|||||||
angle
|
angle
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(target_arch = "x86_64")]
|
/// 7th-order minimax atan approximation for |z| ≤ 1.
|
||||||
|
/// Max error ≈ 2.4e-7 rad (~0.000014°).
|
||||||
|
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
|
||||||
#[target_feature(enable = "avx2")]
|
#[target_feature(enable = "avx2")]
|
||||||
unsafe fn fast_atan_8_avx2(z: std::arch::x86_64::__m256) -> std::arch::x86_64::__m256 {
|
unsafe fn atan_poly_avx2(z: std::arch::x86_64::__m256) -> std::arch::x86_64::__m256 {
|
||||||
|
#[cfg(target_arch = "x86")]
|
||||||
|
use std::arch::x86::*;
|
||||||
|
#[cfg(target_arch = "x86_64")]
|
||||||
use std::arch::x86_64::*;
|
use std::arch::x86_64::*;
|
||||||
|
|
||||||
let zero = _mm256_set1_ps(0.0);
|
// Coefficients for atan(z) ≈ z·(c0 + z²·(c1 + z²·(c2 + z²·c3)))
|
||||||
let one = _mm256_set1_ps(1.0);
|
let c0 = _mm256_set1_ps(0.999_999_5_f32);
|
||||||
let sign_mask = _mm256_set1_ps(-0.0);
|
let c1 = _mm256_set1_ps(-0.333_326_1_f32);
|
||||||
let pi4 = _mm256_set1_ps(std::f32::consts::FRAC_PI_4);
|
let c2 = _mm256_set1_ps(0.199_777_1_f32);
|
||||||
let pi2 = _mm256_set1_ps(std::f32::consts::FRAC_PI_2);
|
let c3 = _mm256_set1_ps(-0.138_776_8_f32);
|
||||||
let c = _mm256_set1_ps(0.273);
|
|
||||||
|
|
||||||
let abs_z = _mm256_andnot_ps(sign_mask, z);
|
let z2 = _mm256_mul_ps(z, z);
|
||||||
let small_term = _mm256_add_ps(pi4, _mm256_mul_ps(c, _mm256_sub_ps(one, abs_z)));
|
// Horner: c2 + z²·c3
|
||||||
let small = _mm256_mul_ps(z, small_term);
|
let p = _mm256_add_ps(c2, _mm256_mul_ps(z2, c3));
|
||||||
|
// c1 + z²·p
|
||||||
let inv = _mm256_div_ps(one, z);
|
let p = _mm256_add_ps(c1, _mm256_mul_ps(z2, p));
|
||||||
let abs_inv = _mm256_andnot_ps(sign_mask, inv);
|
// c0 + z²·p
|
||||||
let large_term = _mm256_add_ps(pi4, _mm256_mul_ps(c, _mm256_sub_ps(one, abs_inv)));
|
let p = _mm256_add_ps(c0, _mm256_mul_ps(z2, p));
|
||||||
let base = _mm256_mul_ps(inv, large_term);
|
// z · p
|
||||||
let pos_large = _mm256_sub_ps(pi2, base);
|
_mm256_mul_ps(z, p)
|
||||||
let neg_large = _mm256_sub_ps(_mm256_sub_ps(zero, pi2), base);
|
|
||||||
let z_pos = _mm256_cmp_ps(z, zero, _CMP_GT_OQ);
|
|
||||||
let large = _mm256_blendv_ps(neg_large, pos_large, z_pos);
|
|
||||||
|
|
||||||
let le_one = _mm256_cmp_ps(abs_z, one, _CMP_LE_OQ);
|
|
||||||
_mm256_blendv_ps(large, small, le_one)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(target_arch = "x86")]
|
/// Branchless AVX2 atan2 using argument reduction and 7th-order polynomial.
|
||||||
#[target_feature(enable = "avx2")]
|
///
|
||||||
unsafe fn fast_atan_8_avx2(z: std::arch::x86::__m256) -> std::arch::x86::__m256 {
|
/// Uses the |y| > |x| swap technique to keep the atan input in [-1, 1],
|
||||||
use std::arch::x86::*;
|
/// then corrects with π/2 and sign adjustments. No divisions by zero,
|
||||||
|
/// no NaN branches — pure SIMD throughput.
|
||||||
let zero = _mm256_set1_ps(0.0);
|
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
|
||||||
let one = _mm256_set1_ps(1.0);
|
|
||||||
let sign_mask = _mm256_set1_ps(-0.0);
|
|
||||||
let pi4 = _mm256_set1_ps(std::f32::consts::FRAC_PI_4);
|
|
||||||
let pi2 = _mm256_set1_ps(std::f32::consts::FRAC_PI_2);
|
|
||||||
let c = _mm256_set1_ps(0.273);
|
|
||||||
|
|
||||||
let abs_z = _mm256_andnot_ps(sign_mask, z);
|
|
||||||
let small_term = _mm256_add_ps(pi4, _mm256_mul_ps(c, _mm256_sub_ps(one, abs_z)));
|
|
||||||
let small = _mm256_mul_ps(z, small_term);
|
|
||||||
|
|
||||||
let inv = _mm256_div_ps(one, z);
|
|
||||||
let abs_inv = _mm256_andnot_ps(sign_mask, inv);
|
|
||||||
let large_term = _mm256_add_ps(pi4, _mm256_mul_ps(c, _mm256_sub_ps(one, abs_inv)));
|
|
||||||
let base = _mm256_mul_ps(inv, large_term);
|
|
||||||
let pos_large = _mm256_sub_ps(pi2, base);
|
|
||||||
let neg_large = _mm256_sub_ps(_mm256_sub_ps(zero, pi2), base);
|
|
||||||
let z_pos = _mm256_cmp_ps(z, zero, _CMP_GT_OQ);
|
|
||||||
let large = _mm256_blendv_ps(neg_large, pos_large, z_pos);
|
|
||||||
|
|
||||||
let le_one = _mm256_cmp_ps(abs_z, one, _CMP_LE_OQ);
|
|
||||||
_mm256_blendv_ps(large, small, le_one)
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(target_arch = "x86_64")]
|
|
||||||
#[target_feature(enable = "avx2")]
|
#[target_feature(enable = "avx2")]
|
||||||
unsafe fn fast_atan2_8_avx2(
|
unsafe fn fast_atan2_8_avx2(
|
||||||
y: std::arch::x86_64::__m256,
|
y: std::arch::x86_64::__m256,
|
||||||
x: std::arch::x86_64::__m256,
|
x: std::arch::x86_64::__m256,
|
||||||
) -> std::arch::x86_64::__m256 {
|
) -> std::arch::x86_64::__m256 {
|
||||||
|
#[cfg(target_arch = "x86")]
|
||||||
|
use std::arch::x86::*;
|
||||||
|
#[cfg(target_arch = "x86_64")]
|
||||||
use std::arch::x86_64::*;
|
use std::arch::x86_64::*;
|
||||||
|
|
||||||
let zero = _mm256_set1_ps(0.0);
|
let abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFF_FFFF_u32 as i32));
|
||||||
|
let sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x8000_0000_u32 as i32));
|
||||||
let pi = _mm256_set1_ps(std::f32::consts::PI);
|
let pi = _mm256_set1_ps(std::f32::consts::PI);
|
||||||
let pi2 = _mm256_set1_ps(std::f32::consts::FRAC_PI_2);
|
let pi_2 = _mm256_set1_ps(std::f32::consts::FRAC_PI_2);
|
||||||
let neg_pi2 = _mm256_sub_ps(zero, pi2);
|
|
||||||
|
|
||||||
let z = _mm256_div_ps(y, x);
|
let abs_y = _mm256_and_ps(y, abs_mask);
|
||||||
let base = fast_atan_8_avx2(z);
|
let abs_x = _mm256_and_ps(x, abs_mask);
|
||||||
|
|
||||||
let x_pos = _mm256_cmp_ps(x, zero, _CMP_GT_OQ);
|
// If |y| > |x|, swap so the atan input is in [-1, 1].
|
||||||
let x_neg = _mm256_cmp_ps(x, zero, _CMP_LT_OQ);
|
let swap_mask = _mm256_cmp_ps(abs_y, abs_x, _CMP_GT_OS);
|
||||||
let y_nonneg = _mm256_cmp_ps(y, zero, _CMP_GE_OQ);
|
let num = _mm256_blendv_ps(y, x, swap_mask);
|
||||||
let y_pos = _mm256_cmp_ps(y, zero, _CMP_GT_OQ);
|
let den = _mm256_blendv_ps(x, y, swap_mask);
|
||||||
let y_neg = _mm256_cmp_ps(y, zero, _CMP_LT_OQ);
|
|
||||||
let x_zero = _mm256_cmp_ps(x, zero, _CMP_EQ_OQ);
|
|
||||||
|
|
||||||
let xneg_pos = _mm256_add_ps(base, pi);
|
let atan_input = _mm256_div_ps(num, den);
|
||||||
let xneg_neg = _mm256_sub_ps(base, pi);
|
let mut result = atan_poly_avx2(atan_input);
|
||||||
let xneg = _mm256_blendv_ps(xneg_neg, xneg_pos, y_nonneg);
|
|
||||||
let mut angle = _mm256_blendv_ps(xneg, base, x_pos);
|
|
||||||
angle = _mm256_blendv_ps(angle, zero, _mm256_andnot_ps(_mm256_or_ps(x_pos, x_neg), _mm256_castsi256_ps(_mm256_set1_epi32(-1))));
|
|
||||||
|
|
||||||
let zero_case = _mm256_blendv_ps(_mm256_blendv_ps(zero, neg_pi2, y_neg), pi2, y_pos);
|
// If swapped, result = copysign(π/2, atan_input) - result
|
||||||
_mm256_blendv_ps(angle, zero_case, x_zero)
|
let adj = _mm256_sub_ps(
|
||||||
}
|
_mm256_or_ps(pi_2, _mm256_and_ps(atan_input, sign_mask)),
|
||||||
|
result,
|
||||||
|
);
|
||||||
|
result = _mm256_blendv_ps(result, adj, swap_mask);
|
||||||
|
|
||||||
#[cfg(target_arch = "x86")]
|
// Quadrant correction: if x < 0, add ±π (sign from y).
|
||||||
#[target_feature(enable = "avx2")]
|
let x_sign_mask = _mm256_castsi256_ps(_mm256_srai_epi32(
|
||||||
unsafe fn fast_atan2_8_avx2(
|
_mm256_castps_si256(x),
|
||||||
y: std::arch::x86::__m256,
|
31,
|
||||||
x: std::arch::x86::__m256,
|
));
|
||||||
) -> std::arch::x86::__m256 {
|
let correction = _mm256_and_ps(
|
||||||
use std::arch::x86::*;
|
_mm256_xor_ps(pi, _mm256_and_ps(sign_mask, y)),
|
||||||
|
x_sign_mask,
|
||||||
let zero = _mm256_set1_ps(0.0);
|
);
|
||||||
let pi = _mm256_set1_ps(std::f32::consts::PI);
|
_mm256_add_ps(result, correction)
|
||||||
let pi2 = _mm256_set1_ps(std::f32::consts::FRAC_PI_2);
|
|
||||||
let neg_pi2 = _mm256_sub_ps(zero, pi2);
|
|
||||||
|
|
||||||
let z = _mm256_div_ps(y, x);
|
|
||||||
let base = fast_atan_8_avx2(z);
|
|
||||||
|
|
||||||
let x_pos = _mm256_cmp_ps(x, zero, _CMP_GT_OQ);
|
|
||||||
let x_neg = _mm256_cmp_ps(x, zero, _CMP_LT_OQ);
|
|
||||||
let y_nonneg = _mm256_cmp_ps(y, zero, _CMP_GE_OQ);
|
|
||||||
let y_pos = _mm256_cmp_ps(y, zero, _CMP_GT_OQ);
|
|
||||||
let y_neg = _mm256_cmp_ps(y, zero, _CMP_LT_OQ);
|
|
||||||
let x_zero = _mm256_cmp_ps(x, zero, _CMP_EQ_OQ);
|
|
||||||
|
|
||||||
let xneg_pos = _mm256_add_ps(base, pi);
|
|
||||||
let xneg_neg = _mm256_sub_ps(base, pi);
|
|
||||||
let xneg = _mm256_blendv_ps(xneg_neg, xneg_pos, y_nonneg);
|
|
||||||
let mut angle = _mm256_blendv_ps(xneg, base, x_pos);
|
|
||||||
angle = _mm256_blendv_ps(angle, zero, _mm256_andnot_ps(_mm256_or_ps(x_pos, x_neg), _mm256_castsi256_ps(_mm256_set1_epi32(-1))));
|
|
||||||
|
|
||||||
let zero_case = _mm256_blendv_ps(_mm256_blendv_ps(zero, neg_pi2, y_neg), pi2, y_pos);
|
|
||||||
_mm256_blendv_ps(angle, zero_case, x_zero)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone)]
|
#[derive(Debug, Clone)]
|
||||||
|
|||||||
Reference in New Issue
Block a user