/// Checks that the `Float4` in scope (presumably the platform-selected
/// implementation — confirm against the module's re-exports) and
/// `fallback::Float4` produce equal lanes for the same inputs.
#[test]
fn matches_fallback() {
    // Both helpers unpack the four lanes into a plain array so that values of
    // the two distinct vector types can be compared with `assert_eq!`.
    fn simd_lanes(v: Float4) -> [f32; 4] {
        [v.a(), v.b(), v.c(), v.d()]
    }
    fn fallback_lanes(v: fallback::Float4) -> [f32; 4] {
        [v.a(), v.b(), v.c(), v.d()]
    }

    // Operands for the implementation under test.
    let xs = Float4::new(1.53245, 5.4234523, -424.432, 0.0004231);
    let ys = Float4::new(74.63, -9.65436, 3.0, -1003.3);
    let zs = Float4::new(-0.4216, -132.52, 8.9452, 42.0);

    // The same operands, built with the fallback implementation.
    let xf = fallback::Float4::new(1.53245, 5.4234523, -424.432, 0.0004231);
    let yf = fallback::Float4::new(74.63, -9.65436, 3.0, -1003.3);
    let zf = fallback::Float4::new(-0.4216, -132.52, 8.9452, 42.0);

    // Construction followed by lane access must agree.
    assert_eq!(simd_lanes(xs), fallback_lanes(xf));
    assert_eq!(simd_lanes(ys), fallback_lanes(yf));
    assert_eq!(simd_lanes(zs), fallback_lanes(zf));

    // Lane-wise arithmetic.
    assert_eq!(simd_lanes(xs + ys), fallback_lanes(xf + yf));
    assert_eq!(simd_lanes(xs - ys), fallback_lanes(xf - yf));
    assert_eq!(simd_lanes(xs * ys), fallback_lanes(xf * yf));
    assert_eq!(simd_lanes(xs / ys), fallback_lanes(xf / yf));
    assert_eq!(
        simd_lanes(xs.mul_add(ys, zs)),
        fallback_lanes(xf.mul_add(yf, zf))
    );

    // Lane-wise and horizontal min/max.
    assert_eq!(simd_lanes(xs.min(ys)), fallback_lanes(xf.min(yf)));
    assert_eq!(simd_lanes(xs.max(ys)), fallback_lanes(xf.max(yf)));
    assert_eq!(xs.min_element(), xf.min_element());
    assert_eq!(xs.max_element(), xf.max_element());

    // Unary operations.
    assert_eq!(simd_lanes(xs.recip()), fallback_lanes(xf.recip()));
    assert_eq!(simd_lanes(xs.abs()), fallback_lanes(xf.abs()));
}
_mm_or_ps, _mm_rcp_ps, _mm_set1_epi32, _mm_set1_ps, _mm_set_epi32, _mm_set_ps, - _mm_setzero_ps, _mm_shuffle_ps, _mm_storeu_ps, _mm_sub_ps, _mm_xor_ps, + _mm_mul_ps, _mm_or_ps, _mm_set1_epi32, _mm_set1_ps, _mm_set_epi32, _mm_set_ps, _mm_setzero_ps, + _mm_shuffle_ps, _mm_storeu_ps, _mm_sub_ps, _mm_xor_ps, }; use crate::FMulAdd; @@ -77,7 +77,10 @@ impl Float4 { /// 1.0 / self #[inline(always)] pub fn recip(self) -> Self { - Self(unsafe { _mm_rcp_ps(self.0) }) + // The reciprocal intrinsic is not precise enough. + // Self(unsafe { std::arch::x86_64::_mm_rcp_ps(self.0) }) + + Self::splat(1.0) / self } #[inline(always)]