#![allow(dead_code)]

/// Implementation of Float4 for x86_64 platforms with SSE support.
#[cfg(all(target_arch = "x86_64", target_feature = "sse"))]
mod x86_64_sse {
    use std::{
        arch::x86_64::__m128,
        cmp::PartialEq,
        ops::{Add, AddAssign, BitAnd, BitOr, Div, DivAssign, Mul, MulAssign, Sub, SubAssign},
    };

    #[derive(Debug, Copy, Clone)]
    pub struct Float4 {
        data: __m128,
    }

    impl Float4 {
        #[inline(always)]
        pub fn new(a: f32, b: f32, c: f32, d: f32) -> Float4 {
            use std::arch::x86_64::_mm_set_ps;
            Float4 {
                // `_mm_set_ps` takes its arguments in high-to-low lane
                // order, so they are passed reversed here.
                data: unsafe { _mm_set_ps(d, c, b, a) },
            }
        }

        #[inline(always)]
        pub fn splat(n: f32) -> Float4 {
            use std::arch::x86_64::_mm_set1_ps;
            Float4 {
                data: unsafe { _mm_set1_ps(n) },
            }
        }

        #[inline]
        pub fn h_sum(&self) -> f32 {
            #[cfg(target_feature = "sse3")]
            {
                use std::arch::x86_64::{
                    _mm_add_ps, _mm_add_ss, _mm_cvtss_f32, _mm_movehdup_ps, _mm_movehl_ps,
                };
                unsafe {
                    // [a, b, c, d] + [b, b, d, d] gives the pair sums in
                    // lanes 0 and 2; fold the high half onto the low half
                    // and add once more to get the total in lane 0.
                    let v = self.data;
                    let shuf = _mm_movehdup_ps(v);
                    let sums = _mm_add_ps(v, shuf);
                    let shuf = _mm_movehl_ps(shuf, sums);
                    let sums = _mm_add_ss(sums, shuf);
                    _mm_cvtss_f32(sums)
                }
            }
            #[cfg(not(target_feature = "sse3"))]
            {
                use std::arch::x86_64::{
                    _mm_add_ps, _mm_add_ss, _mm_cvtss_f32, _mm_movehl_ps, _mm_shuffle_ps,
                };
                unsafe {
                    // Without SSE3's movehdup, swap the two elements of
                    // each pair instead (shuffle mask 2, 3, 0, 1); the
                    // fold is otherwise the same as above.
                    let v = self.data;
                    let shuf = _mm_shuffle_ps(v, v, (2 << 6) | (3 << 4) | 1);
                    let sums = _mm_add_ps(v, shuf);
                    let shuf = _mm_movehl_ps(shuf, sums);
                    let sums = _mm_add_ss(sums, shuf);
                    _mm_cvtss_f32(sums)
                }
            }
        }

        #[inline]
        pub fn h_product(&self) -> f32 {
            (self.get_0() * self.get_1()) * (self.get_2() * self.get_3())
        }

        #[inline]
        pub fn h_min(&self) -> f32 {
            let n1 = if self.get_0() < self.get_1() { self.get_0() } else { self.get_1() };
            let n2 = if self.get_2() < self.get_3() { self.get_2() } else { self.get_3() };
            if n1 < n2 { n1 } else { n2 }
        }

        #[inline]
        pub fn h_max(&self) -> f32 {
            let n1 = if self.get_0() > self.get_1() { self.get_0() } else { self.get_1() };
            let n2 = if self.get_2() > self.get_3() { self.get_2() } else { self.get_3() };
            if n1 > n2 { n1 } else { n2 }
        }

        #[inline(always)]
        pub fn v_min(&self, other: Float4) -> Float4 {
            use std::arch::x86_64::_mm_min_ps;
            Float4 {
                data: unsafe { _mm_min_ps(self.data, other.data) },
            }
        }

        #[inline(always)]
        pub fn v_max(&self, other: Float4) -> Float4 {
            use std::arch::x86_64::_mm_max_ps;
            Float4 {
                data: unsafe { _mm_max_ps(self.data, other.data) },
            }
        }

        #[inline(always)]
        pub fn lt(&self, other: Float4) -> Bool4 {
            use std::arch::x86_64::_mm_cmplt_ps;
            Bool4 {
                data: unsafe { _mm_cmplt_ps(self.data, other.data) },
            }
        }

        #[inline(always)]
        pub fn lte(&self, other: Float4) -> Bool4 {
            use std::arch::x86_64::_mm_cmple_ps;
            Bool4 {
                data: unsafe { _mm_cmple_ps(self.data, other.data) },
            }
        }

        #[inline(always)]
        pub fn gt(&self, other: Float4) -> Bool4 {
            use std::arch::x86_64::_mm_cmpgt_ps;
            Bool4 {
                data: unsafe { _mm_cmpgt_ps(self.data, other.data) },
            }
        }

        #[inline(always)]
        pub fn gte(&self, other: Float4) -> Bool4 {
            use std::arch::x86_64::_mm_cmpge_ps;
            Bool4 {
                data: unsafe { _mm_cmpge_ps(self.data, other.data) },
            }
        }

        /// Set the nth element to the given value.
        #[inline(always)]
        pub fn set_n(&mut self, n: usize, v: f32) {
            assert!(
                n <= 3,
                "Attempted to set element of Float4 outside of bounds."
            );
            unsafe { *(&mut self.data as *mut __m128 as *mut f32).add(n) = v }
        }

        /// Set the 0th element to the given value.
        #[inline(always)]
        pub fn set_0(&mut self, v: f32) {
            self.set_n(0, v);
        }

        /// Set the 1st element to the given value.
        #[inline(always)]
        pub fn set_1(&mut self, v: f32) {
            self.set_n(1, v);
        }

        /// Set the 2nd element to the given value.
        #[inline(always)]
        pub fn set_2(&mut self, v: f32) {
            self.set_n(2, v);
        }

        /// Set the 3rd element to the given value.
        #[inline(always)]
        pub fn set_3(&mut self, v: f32) {
            self.set_n(3, v);
        }

        /// Returns the value of the nth element.
        #[inline(always)]
        pub fn get_n(&self, n: usize) -> f32 {
            assert!(
                n <= 3,
                "Attempted to access element of Float4 outside of bounds."
            );
            unsafe { *(&self.data as *const __m128 as *const f32).add(n) }
        }

        /// Returns the value of the 0th element.
        #[inline(always)]
        pub fn get_0(&self) -> f32 {
            self.get_n(0)
        }

        /// Returns the value of the 1st element.
        #[inline(always)]
        pub fn get_1(&self) -> f32 {
            self.get_n(1)
        }

        /// Returns the value of the 2nd element.
        #[inline(always)]
        pub fn get_2(&self) -> f32 {
            self.get_n(2)
        }

        /// Returns the value of the 3rd element.
        #[inline(always)]
        pub fn get_3(&self) -> f32 {
            self.get_n(3)
        }
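        // Note: `get_n`/`set_n` above access individual lanes by casting
        // `&__m128` to a pointer-to-f32 and indexing. This is sound here
        // (`__m128` is exactly four f32 lanes, with stricter alignment),
        // but it forces the value through memory, so the per-lane
        // accessors are best kept out of hot inner loops where shuffles
        // would do.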
        /// Returns a Float4 with all elements set to the value
        /// of element 0.
        #[inline(always)]
        pub fn all_0(&self) -> Float4 {
            use std::arch::x86_64::_mm_shuffle_ps;
            Float4 {
                data: unsafe { _mm_shuffle_ps(self.data, self.data, 0b00_00_00_00) },
            }
        }

        /// Returns a Float4 with all elements set to the value
        /// of element 1.
        #[inline(always)]
        pub fn all_1(&self) -> Float4 {
            use std::arch::x86_64::_mm_shuffle_ps;
            Float4 {
                data: unsafe { _mm_shuffle_ps(self.data, self.data, 0b01_01_01_01) },
            }
        }

        /// Returns a Float4 with all elements set to the value
        /// of element 2.
        #[inline(always)]
        pub fn all_2(&self) -> Float4 {
            use std::arch::x86_64::_mm_shuffle_ps;
            Float4 {
                data: unsafe { _mm_shuffle_ps(self.data, self.data, 0b10_10_10_10) },
            }
        }

        /// Returns a Float4 with all elements set to the value
        /// of element 3.
        #[inline(always)]
        pub fn all_3(&self) -> Float4 {
            use std::arch::x86_64::_mm_shuffle_ps;
            Float4 {
                data: unsafe { _mm_shuffle_ps(self.data, self.data, 0b11_11_11_11) },
            }
        }

        /// Returns the square roots of all elements.
        #[inline(always)]
        pub fn sqrt(&self) -> Float4 {
            use std::arch::x86_64::_mm_sqrt_ps;
            Float4 {
                data: unsafe { _mm_sqrt_ps(self.data) },
            }
        }

        /// Performs a fused multiply add.
        ///
        /// i.e. `self * b + c`
        #[inline(always)]
        pub fn fmadd(&self, b: Float4, c: Float4) -> Float4 {
            #[cfg(target_feature = "fma")]
            {
                use std::arch::x86_64::_mm_fmadd_ps;
                Float4 {
                    data: unsafe { _mm_fmadd_ps(self.data, b.data, c.data) },
                }
            }
            #[cfg(not(target_feature = "fma"))]
            {
                (*self * b) + c
            }
        }
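        // Note: when compiled with FMA support, `fmadd` rounds only once;
        // the non-FMA fallback rounds after both the multiply and the add,
        // so the two paths can differ in the last bit for some inputs.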
    }

    impl PartialEq for Float4 {
        #[inline]
        fn eq(&self, other: &Float4) -> bool {
            self.get_0() == other.get_0()
                && self.get_1() == other.get_1()
                && self.get_2() == other.get_2()
                && self.get_3() == other.get_3()
        }
    }

    impl Add for Float4 {
        type Output = Float4;

        #[inline(always)]
        fn add(self, other: Float4) -> Float4 {
            use std::arch::x86_64::_mm_add_ps;
            Float4 {
                data: unsafe { _mm_add_ps(self.data, other.data) },
            }
        }
    }

    impl AddAssign for Float4 {
        #[inline(always)]
        fn add_assign(&mut self, rhs: Float4) {
            *self = *self + rhs;
        }
    }

    impl Sub for Float4 {
        type Output = Float4;

        #[inline(always)]
        fn sub(self, other: Float4) -> Float4 {
            use std::arch::x86_64::_mm_sub_ps;
            Float4 {
                data: unsafe { _mm_sub_ps(self.data, other.data) },
            }
        }
    }

    impl SubAssign for Float4 {
        #[inline(always)]
        fn sub_assign(&mut self, rhs: Float4) {
            *self = *self - rhs;
        }
    }

    impl Mul for Float4 {
        type Output = Float4;

        #[inline(always)]
        fn mul(self, other: Float4) -> Float4 {
            use std::arch::x86_64::_mm_mul_ps;
            Float4 {
                data: unsafe { _mm_mul_ps(self.data, other.data) },
            }
        }
    }

    // The scalar versions of the arithmetic operators need explicit
    // `<f32>` type parameters; without them they would collide with the
    // component-wise impls above and fail to compile.
    impl Mul<f32> for Float4 {
        type Output = Float4;

        #[inline(always)]
        fn mul(self, other: f32) -> Float4 {
            self * Float4::splat(other)
        }
    }

    impl MulAssign for Float4 {
        #[inline(always)]
        fn mul_assign(&mut self, rhs: Float4) {
            *self = *self * rhs;
        }
    }

    impl MulAssign<f32> for Float4 {
        #[inline(always)]
        fn mul_assign(&mut self, rhs: f32) {
            *self = *self * rhs;
        }
    }

    impl Div for Float4 {
        type Output = Float4;

        #[inline(always)]
        fn div(self, other: Float4) -> Float4 {
            use std::arch::x86_64::_mm_div_ps;
            Float4 {
                data: unsafe { _mm_div_ps(self.data, other.data) },
            }
        }
    }

    impl Div<f32> for Float4 {
        type Output = Float4;

        #[inline(always)]
        fn div(self, other: f32) -> Float4 {
            self / Float4::splat(other)
        }
    }

    impl DivAssign for Float4 {
        #[inline(always)]
        fn div_assign(&mut self, rhs: Float4) {
            *self = *self / rhs;
        }
    }

    impl DivAssign<f32> for Float4 {
        #[inline(always)]
        fn div_assign(&mut self, rhs: f32) {
            *self = *self / rhs;
        }
    }

    // Free functions for Float4.
    #[inline(always)]
    pub fn v_min(a: Float4, b: Float4) -> Float4 {
        a.v_min(b)
    }

    #[inline(always)]
    pub fn v_max(a: Float4, b: Float4) -> Float4 {
        a.v_max(b)
    }

    /// Transposes a 4x4 matrix in-place.
    #[inline(always)]
    pub fn transpose(matrix: &mut [Float4; 4]) {
        use std::arch::x86_64::_MM_TRANSPOSE4_PS;

        // The weird &mut/*mut gymnastics below are to get around
        // the borrow checker. We know statically that these references
        // are non-overlapping, so it's safe.
        unsafe {
            _MM_TRANSPOSE4_PS(
                &mut *(&mut matrix[0].data as *mut __m128),
                &mut *(&mut matrix[1].data as *mut __m128),
                &mut *(&mut matrix[2].data as *mut __m128),
                &mut *(&mut matrix[3].data as *mut __m128),
            )
        };
    }

    /// Inverts a 4x4 matrix and returns the determinant.
    #[inline(always)]
    pub fn invert(matrix: &mut [Float4; 4]) -> f32 {
        // Code pulled from "Streaming SIMD Extensions - Inverse of 4x4 Matrix"
        // by Intel, ported to Rust.
        // ftp://download.intel.com/design/PentiumIII/sml/24504301.pdf
        //
        // TODO: once __m64 and the accompanying intrinsics are stabilized,
        // switch to using those, as commented out in the code below.
        use std::arch::x86_64::{
            _mm_add_ps, _mm_add_ss, _mm_cvtss_f32, _mm_mul_ps, _mm_mul_ss, _mm_rcp_ss,
            // _mm_loadh_pi,
            // _mm_loadl_pi,
            // _mm_storeh_pi,
            // _mm_storel_pi,
            _mm_set_ps, _mm_shuffle_ps, _mm_sub_ps, _mm_sub_ss,
        };

        let mut minor0: __m128;
        let mut minor1: __m128;
        let mut minor2: __m128;
        let mut minor3: __m128;
        let row0: __m128;
        let mut row1: __m128;
        let mut row2: __m128;
        let mut row3: __m128;
        let mut det: __m128;
        let mut tmp1: __m128;

        unsafe {
            // tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src)), (__m64*)(src+ 4));
            tmp1 = _mm_set_ps(
                matrix[1].get_1(),
                matrix[1].get_0(),
                matrix[0].get_1(),
                matrix[0].get_0(),
            );
            // row1 = _mm_loadh_pi(_mm_loadl_pi(row1, (__m64*)(src+8)), (__m64*)(src+12));
            row1 = _mm_set_ps(
                matrix[3].get_1(),
                matrix[3].get_0(),
                matrix[2].get_1(),
                matrix[2].get_0(),
            );
            row0 = _mm_shuffle_ps(tmp1, row1, 0x88);
            row1 = _mm_shuffle_ps(row1, tmp1, 0xDD);
            // tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src+ 2)), (__m64*)(src+ 6));
            tmp1 = _mm_set_ps(
                matrix[1].get_3(),
                matrix[1].get_2(),
                matrix[0].get_3(),
                matrix[0].get_2(),
            );
            // row3 = _mm_loadh_pi(_mm_loadl_pi(row3, (__m64*)(src+10)), (__m64*)(src+14));
            row3 = _mm_set_ps(
                matrix[3].get_3(),
                matrix[3].get_2(),
                matrix[2].get_3(),
                matrix[2].get_2(),
            );
            row2 = _mm_shuffle_ps(tmp1, row3, 0x88);
            row3 = _mm_shuffle_ps(row3, tmp1, 0xDD);

            // -----------------------------------------------
            tmp1 = _mm_mul_ps(row2, row3);
            tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
            minor0 = _mm_mul_ps(row1, tmp1);
            minor1 = _mm_mul_ps(row0, tmp1);
            tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
            minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0);
            minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1);
            minor1 = _mm_shuffle_ps(minor1, minor1, 0x4E);

            // -----------------------------------------------
            tmp1 = _mm_mul_ps(row1, row2);
            tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
            minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0);
            minor3 = _mm_mul_ps(row0, tmp1);
            tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
            minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1));
            minor3 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor3);
            minor3 = _mm_shuffle_ps(minor3, minor3, 0x4E);

            // -----------------------------------------------
            tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, 0x4E), row3);
            tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
            row2 = _mm_shuffle_ps(row2, row2, 0x4E);
            minor0 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0);
            minor2 = _mm_mul_ps(row0, tmp1);
            tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
            minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1));
            minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2);
            minor2 = _mm_shuffle_ps(minor2, minor2, 0x4E);

            // -----------------------------------------------
            tmp1 = _mm_mul_ps(row0, row1);
            tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
            minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2);
            minor3 = _mm_sub_ps(_mm_mul_ps(row2, tmp1), minor3);
            tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
            minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2);
            minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row2, tmp1));

            // -----------------------------------------------
            tmp1 = _mm_mul_ps(row0, row3);
            tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
            minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1));
            minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2);
            tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
            minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1);
            minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1));

            // -----------------------------------------------
            tmp1 = _mm_mul_ps(row0, row2);
            tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
            minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1);
            minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row1, tmp1));
            tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
            minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1));
            minor3 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor3);

            // -----------------------------------------------
            det = _mm_mul_ps(row0, minor0);
            det = _mm_add_ps(_mm_shuffle_ps(det, det, 0x4E), det);
            det = _mm_add_ss(_mm_shuffle_ps(det, det, 0xB1), det);

            // Save the determinant before `det` is overwritten with its
            // reciprocal below; the fallback implementation returns the
            // actual determinant, and this path should match it.
            let determinant = _mm_cvtss_f32(det);

            // Approximate 1/det with `rcp`, then sharpen the estimate with
            // one Newton-Raphson iteration: x1 = 2*x0 - det*x0*x0.
            tmp1 = _mm_rcp_ss(det);
            det = _mm_sub_ss(
                _mm_add_ss(tmp1, tmp1),
                _mm_mul_ss(det, _mm_mul_ss(tmp1, tmp1)),
            );
            det = _mm_shuffle_ps(det, det, 0x00);

            minor0 = _mm_mul_ps(det, minor0);
            // _mm_storel_pi((__m64*)(src), minor0);
            // _mm_storeh_pi((__m64*)(src+2), minor0);
            matrix[0].data = minor0;

            minor1 = _mm_mul_ps(det, minor1);
            // _mm_storel_pi((__m64*)(src+4), minor1);
            // _mm_storeh_pi((__m64*)(src+6), minor1);
            matrix[1].data = minor1;

            minor2 = _mm_mul_ps(det, minor2);
            // _mm_storel_pi((__m64*)(src+ 8), minor2);
            // _mm_storeh_pi((__m64*)(src+10), minor2);
            matrix[2].data = minor2;

            minor3 = _mm_mul_ps(det, minor3);
            // _mm_storel_pi((__m64*)(src+12), minor3);
            // _mm_storeh_pi((__m64*)(src+14), minor3);
            matrix[3].data = minor3;

            determinant
        }
    }
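    // The SSE comparison intrinsics produce lane masks that are either
    // all ones (true) or all zeros (false). `Bool4` stores its lanes in
    // that same format: `is_all_false` and `to_bitmask` below rely on it,
    // and `Bool4::new` constructs it explicitly.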
    /// Essentially a tuple of four bools, which will use SIMD operations
    /// where the platform supports them.
    #[derive(Debug, Copy, Clone)]
    pub struct Bool4 {
        data: __m128,
    }

    impl Bool4 {
        #[inline(always)]
        pub fn new(a: bool, b: bool, c: bool, d: bool) -> Bool4 {
            use std::arch::x86_64::{_mm_castsi128_ps, _mm_set_epi32};
            // Build proper all-ones/all-zeros lane masks, so that values
            // constructed here behave identically to masks produced by
            // the comparison intrinsics (which `to_bitmask` depends on).
            Bool4 {
                data: unsafe {
                    _mm_castsi128_ps(_mm_set_epi32(
                        if d { -1 } else { 0 },
                        if c { -1 } else { 0 },
                        if b { -1 } else { 0 },
                        if a { -1 } else { 0 },
                    ))
                },
            }
        }

        #[inline(always)]
        pub fn new_false() -> Bool4 {
            use std::arch::x86_64::_mm_set1_ps;
            Bool4 {
                data: unsafe { _mm_set1_ps(0.0) },
            }
        }

        /// Returns the value of the nth element.
        #[inline(always)]
        pub fn get_n(&self, n: usize) -> bool {
            assert!(
                n <= 3,
                "Attempted to access element of Bool4 outside of bounds."
            );
            unsafe { *(&self.data as *const __m128 as *const u32).add(n) != 0 }
        }

        /// Returns the value of the 0th element.
        #[inline(always)]
        pub fn get_0(&self) -> bool {
            self.get_n(0)
        }

        /// Returns the value of the 1st element.
        #[inline(always)]
        pub fn get_1(&self) -> bool {
            self.get_n(1)
        }

        /// Returns the value of the 2nd element.
        #[inline(always)]
        pub fn get_2(&self) -> bool {
            self.get_n(2)
        }

        /// Returns the value of the 3rd element.
        #[inline(always)]
        pub fn get_3(&self) -> bool {
            self.get_n(3)
        }

        /// Returns whether all four bools are false.
        ///
        /// This is the `NOT` operation on the result of `OR`ing all the
        /// contained bools. If even one bool is true, this returns false.
        #[inline(always)]
        pub fn is_all_false(&self) -> bool {
            // All four lanes are zero iff the full 128 bits are zero.
            let a = unsafe { *(&self.data as *const __m128 as *const u128) };
            a == 0
        }

        /// Returns the contents as a bitmask, with the 0th element in the
        /// least significant bit.
        #[inline]
        pub fn to_bitmask(&self) -> u8 {
            // Each lane is all ones or all zeros, so the low byte of each
            // lane is 0xFF or 0x00; masking a different bit out of each
            // byte assembles the 4-bit mask directly.
            let a = unsafe { *(&self.data as *const __m128 as *const u8).offset(0) };
            let b = unsafe { *(&self.data as *const __m128 as *const u8).offset(4) };
            let c = unsafe { *(&self.data as *const __m128 as *const u8).offset(8) };
            let d = unsafe { *(&self.data as *const __m128 as *const u8).offset(12) };
            (a & 0b0000_0001) | (b & 0b0000_0010) | (c & 0b0000_0100) | (d & 0b0000_1000)
        }
    }

    impl BitAnd for Bool4 {
        type Output = Bool4;

        #[inline(always)]
        fn bitand(self, rhs: Bool4) -> Bool4 {
            use std::arch::x86_64::_mm_and_ps;
            Bool4 {
                data: unsafe { _mm_and_ps(self.data, rhs.data) },
            }
        }
    }

    impl BitOr for Bool4 {
        type Output = Bool4;

        #[inline(always)]
        fn bitor(self, rhs: Bool4) -> Bool4 {
            use std::arch::x86_64::_mm_or_ps;
            Bool4 {
                data: unsafe { _mm_or_ps(self.data, rhs.data) },
            }
        }
    }
}

//===========================================================================

/// Implementation of Float4 for any platform, foregoing any
/// platform-specific optimizations.
mod fallback {
    use std::{
        cmp::PartialEq,
        ops::{Add, AddAssign, BitAnd, BitOr, Div, DivAssign, Mul, MulAssign, Sub, SubAssign},
    };

    #[derive(Debug, Copy, Clone)]
    pub struct Float4 {
        data: [f32; 4],
    }

    impl Float4 {
        #[inline(always)]
        pub fn new(a: f32, b: f32, c: f32, d: f32) -> Float4 {
            Float4 { data: [a, b, c, d] }
        }

        #[inline(always)]
        pub fn splat(n: f32) -> Float4 {
            Float4 { data: [n, n, n, n] }
        }

        #[inline]
        pub fn h_sum(&self) -> f32 {
            (self.get_0() + self.get_1()) + (self.get_2() + self.get_3())
        }

        #[inline]
        pub fn h_product(&self) -> f32 {
            (self.get_0() * self.get_1()) * (self.get_2() * self.get_3())
        }

        #[inline]
        pub fn h_min(&self) -> f32 {
            let n1 = if self.get_0() < self.get_1() { self.get_0() } else { self.get_1() };
            let n2 = if self.get_2() < self.get_3() { self.get_2() } else { self.get_3() };
            if n1 < n2 { n1 } else { n2 }
        }

        #[inline]
        pub fn h_max(&self) -> f32 {
            let n1 = if self.get_0() > self.get_1() { self.get_0() } else { self.get_1() };
            let n2 = if self.get_2() > self.get_3() { self.get_2() } else { self.get_3() };
            if n1 > n2 { n1 } else { n2 }
        }

        #[inline(always)]
        pub fn v_min(&self, other: Float4) -> Float4 {
            Float4::new(
                if self.get_0() < other.get_0() { self.get_0() } else { other.get_0() },
                if self.get_1() < other.get_1() { self.get_1() } else { other.get_1() },
                if self.get_2() < other.get_2() { self.get_2() } else { other.get_2() },
                if self.get_3() < other.get_3() { self.get_3() } else { other.get_3() },
            )
        }

        #[inline(always)]
        pub fn v_max(&self, other: Float4) -> Float4 {
            Float4::new(
                if self.get_0() > other.get_0() { self.get_0() } else { other.get_0() },
                if self.get_1() > other.get_1() { self.get_1() } else { other.get_1() },
                if self.get_2() > other.get_2() { self.get_2() } else { other.get_2() },
                if self.get_3() > other.get_3() { self.get_3() } else { other.get_3() },
            )
        }

        #[inline(always)]
        pub fn lt(&self, other: Float4) -> Bool4 {
            Bool4 {
                data: [
                    self.data[0] < other.data[0],
                    self.data[1] < other.data[1],
                    self.data[2] < other.data[2],
                    self.data[3] < other.data[3],
                ],
            }
        }

        #[inline(always)]
        pub fn lte(&self, other: Float4) -> Bool4 {
            Bool4 {
                data: [
                    self.data[0] <= other.data[0],
                    self.data[1] <= other.data[1],
                    self.data[2] <= other.data[2],
                    self.data[3] <= other.data[3],
                ],
            }
        }
        #[inline(always)]
        pub fn gt(&self, other: Float4) -> Bool4 {
            Bool4 {
                data: [
                    self.data[0] > other.data[0],
                    self.data[1] > other.data[1],
                    self.data[2] > other.data[2],
                    self.data[3] > other.data[3],
                ],
            }
        }

        #[inline(always)]
        pub fn gte(&self, other: Float4) -> Bool4 {
            Bool4 {
                data: [
                    self.data[0] >= other.data[0],
                    self.data[1] >= other.data[1],
                    self.data[2] >= other.data[2],
                    self.data[3] >= other.data[3],
                ],
            }
        }

        /// Set the nth element to the given value.
        #[inline(always)]
        pub fn set_n(&mut self, n: usize, v: f32) {
            assert!(
                n <= 3,
                "Attempted to set element of Float4 outside of bounds."
            );
            unsafe {
                *self.data.get_unchecked_mut(n) = v;
            }
        }

        /// Set the 0th element to the given value.
        #[inline(always)]
        pub fn set_0(&mut self, v: f32) {
            self.set_n(0, v);
        }

        /// Set the 1st element to the given value.
        #[inline(always)]
        pub fn set_1(&mut self, v: f32) {
            self.set_n(1, v);
        }

        /// Set the 2nd element to the given value.
        #[inline(always)]
        pub fn set_2(&mut self, v: f32) {
            self.set_n(2, v);
        }

        /// Set the 3rd element to the given value.
        #[inline(always)]
        pub fn set_3(&mut self, v: f32) {
            self.set_n(3, v);
        }

        /// Returns the value of the nth element.
        #[inline(always)]
        pub fn get_n(&self, n: usize) -> f32 {
            assert!(
                n <= 3,
                "Attempted to access element of Float4 outside of bounds."
            );
            unsafe { *self.data.get_unchecked(n) }
        }

        /// Returns the value of the 0th element.
        #[inline(always)]
        pub fn get_0(&self) -> f32 {
            self.get_n(0)
        }

        /// Returns the value of the 1st element.
        #[inline(always)]
        pub fn get_1(&self) -> f32 {
            self.get_n(1)
        }

        /// Returns the value of the 2nd element.
        #[inline(always)]
        pub fn get_2(&self) -> f32 {
            self.get_n(2)
        }

        /// Returns the value of the 3rd element.
        #[inline(always)]
        pub fn get_3(&self) -> f32 {
            self.get_n(3)
        }

        /// Returns a Float4 with all elements set to the value
        /// of element 0.
        #[inline(always)]
        pub fn all_0(&self) -> Float4 {
            Float4 {
                data: [self.data[0], self.data[0], self.data[0], self.data[0]],
            }
        }

        /// Returns a Float4 with all elements set to the value
        /// of element 1.
        #[inline(always)]
        pub fn all_1(&self) -> Float4 {
            Float4 {
                data: [self.data[1], self.data[1], self.data[1], self.data[1]],
            }
        }

        /// Returns a Float4 with all elements set to the value
        /// of element 2.
        #[inline(always)]
        pub fn all_2(&self) -> Float4 {
            Float4 {
                data: [self.data[2], self.data[2], self.data[2], self.data[2]],
            }
        }

        /// Returns a Float4 with all elements set to the value
        /// of element 3.
        #[inline(always)]
        pub fn all_3(&self) -> Float4 {
            Float4 {
                data: [self.data[3], self.data[3], self.data[3], self.data[3]],
            }
        }

        /// Returns the square roots of all elements.
        #[inline(always)]
        pub fn sqrt(&self) -> Float4 {
            Float4::new(
                self.get_0().sqrt(),
                self.get_1().sqrt(),
                self.get_2().sqrt(),
                self.get_3().sqrt(),
            )
        }
        /// Performs a fused multiply add.
        ///
        /// i.e. `self * b + c`
        #[inline(always)]
        pub fn fmadd(&self, b: Float4, c: Float4) -> Float4 {
            (*self * b) + c
        }
    }

    impl PartialEq for Float4 {
        #[inline]
        fn eq(&self, other: &Float4) -> bool {
            self.get_0() == other.get_0()
                && self.get_1() == other.get_1()
                && self.get_2() == other.get_2()
                && self.get_3() == other.get_3()
        }
    }

    impl Add for Float4 {
        type Output = Float4;

        #[inline(always)]
        fn add(self, other: Float4) -> Float4 {
            Float4 {
                data: [
                    self.get_0() + other.get_0(),
                    self.get_1() + other.get_1(),
                    self.get_2() + other.get_2(),
                    self.get_3() + other.get_3(),
                ],
            }
        }
    }

    impl AddAssign for Float4 {
        #[inline(always)]
        fn add_assign(&mut self, rhs: Float4) {
            *self = *self + rhs;
        }
    }

    impl Sub for Float4 {
        type Output = Float4;

        #[inline(always)]
        fn sub(self, other: Float4) -> Float4 {
            Float4 {
                data: [
                    self.get_0() - other.get_0(),
                    self.get_1() - other.get_1(),
                    self.get_2() - other.get_2(),
                    self.get_3() - other.get_3(),
                ],
            }
        }
    }

    impl SubAssign for Float4 {
        #[inline(always)]
        fn sub_assign(&mut self, rhs: Float4) {
            *self = *self - rhs;
        }
    }

    impl Mul for Float4 {
        type Output = Float4;

        #[inline(always)]
        fn mul(self, other: Float4) -> Float4 {
            Float4 {
                data: [
                    self.get_0() * other.get_0(),
                    self.get_1() * other.get_1(),
                    self.get_2() * other.get_2(),
                    self.get_3() * other.get_3(),
                ],
            }
        }
    }

    // As in the SSE module, the scalar operators need explicit `<f32>`
    // type parameters to avoid colliding with the component-wise impls.
    impl Mul<f32> for Float4 {
        type Output = Float4;

        #[inline(always)]
        fn mul(self, other: f32) -> Float4 {
            Float4 {
                data: [
                    self.get_0() * other,
                    self.get_1() * other,
                    self.get_2() * other,
                    self.get_3() * other,
                ],
            }
        }
    }

    impl MulAssign for Float4 {
        #[inline(always)]
        fn mul_assign(&mut self, rhs: Float4) {
            *self = *self * rhs;
        }
    }

    impl MulAssign<f32> for Float4 {
        #[inline(always)]
        fn mul_assign(&mut self, rhs: f32) {
            *self = *self * rhs;
        }
    }

    impl Div for Float4 {
        type Output = Float4;

        #[inline(always)]
        fn div(self, other: Float4) -> Float4 {
            Float4 {
                data: [
                    self.get_0() / other.get_0(),
                    self.get_1() / other.get_1(),
                    self.get_2() / other.get_2(),
                    self.get_3() / other.get_3(),
                ],
            }
        }
    }

    impl Div<f32> for Float4 {
        type Output = Float4;

        #[inline(always)]
        fn div(self, other: f32) -> Float4 {
            Float4 {
                data: [
                    self.get_0() / other,
                    self.get_1() / other,
                    self.get_2() / other,
                    self.get_3() / other,
                ],
            }
        }
    }

    impl DivAssign for Float4 {
        #[inline(always)]
        fn div_assign(&mut self, rhs: Float4) {
            *self = *self / rhs;
        }
    }

    impl DivAssign<f32> for Float4 {
        #[inline(always)]
        fn div_assign(&mut self, rhs: f32) {
            *self = *self / rhs;
        }
    }

    // Free functions for Float4.
    #[inline(always)]
    pub fn v_min(a: Float4, b: Float4) -> Float4 {
        a.v_min(b)
    }

    #[inline(always)]
    pub fn v_max(a: Float4, b: Float4) -> Float4 {
        a.v_max(b)
    }

    /// Transposes a 4x4 matrix in-place.
    #[inline(always)]
    pub fn transpose(matrix: &mut [Float4; 4]) {
        let m = [
            Float4::new(
                matrix[0].get_0(),
                matrix[1].get_0(),
                matrix[2].get_0(),
                matrix[3].get_0(),
            ),
            Float4::new(
                matrix[0].get_1(),
                matrix[1].get_1(),
                matrix[2].get_1(),
                matrix[3].get_1(),
            ),
            Float4::new(
                matrix[0].get_2(),
                matrix[1].get_2(),
                matrix[2].get_2(),
                matrix[3].get_2(),
            ),
            Float4::new(
                matrix[0].get_3(),
                matrix[1].get_3(),
                matrix[2].get_3(),
                matrix[3].get_3(),
            ),
        ];

        *matrix = m;
    }
    /// Inverts a 4x4 matrix and returns the determinant.
    #[inline(always)]
    pub fn invert(matrix: &mut [Float4; 4]) -> f32 {
        let m = *matrix;

        // s0..s5 are the 2x2 sub-determinants of the top two rows, and
        // c0..c5 are the 2x2 sub-determinants of the bottom two rows;
        // the determinant and the 3x3 cofactors below are assembled
        // from products of one s term and one c term.
        let s0 = (m[0].get_0() * m[1].get_1()) - (m[1].get_0() * m[0].get_1());
        let s1 = (m[0].get_0() * m[1].get_2()) - (m[1].get_0() * m[0].get_2());
        let s2 = (m[0].get_0() * m[1].get_3()) - (m[1].get_0() * m[0].get_3());
        let s3 = (m[0].get_1() * m[1].get_2()) - (m[1].get_1() * m[0].get_2());
        let s4 = (m[0].get_1() * m[1].get_3()) - (m[1].get_1() * m[0].get_3());
        let s5 = (m[0].get_2() * m[1].get_3()) - (m[1].get_2() * m[0].get_3());

        let c5 = (m[2].get_2() * m[3].get_3()) - (m[3].get_2() * m[2].get_3());
        let c4 = (m[2].get_1() * m[3].get_3()) - (m[3].get_1() * m[2].get_3());
        let c3 = (m[2].get_1() * m[3].get_2()) - (m[3].get_1() * m[2].get_2());
        let c2 = (m[2].get_0() * m[3].get_3()) - (m[3].get_0() * m[2].get_3());
        let c1 = (m[2].get_0() * m[3].get_2()) - (m[3].get_0() * m[2].get_2());
        let c0 = (m[2].get_0() * m[3].get_1()) - (m[3].get_0() * m[2].get_1());

        // We don't check for a 0.0 determinant, as that is expected to be
        // handled by the calling code.
        let det = (s0 * c5) - (s1 * c4) + (s2 * c3) + (s3 * c2) - (s4 * c1) + (s5 * c0);
        let invdet = 1.0 / det;

        *matrix = [
            Float4::new(
                ((m[1].get_1() * c5) - (m[1].get_2() * c4) + (m[1].get_3() * c3)) * invdet,
                ((-m[0].get_1() * c5) + (m[0].get_2() * c4) - (m[0].get_3() * c3)) * invdet,
                ((m[3].get_1() * s5) - (m[3].get_2() * s4) + (m[3].get_3() * s3)) * invdet,
                ((-m[2].get_1() * s5) + (m[2].get_2() * s4) - (m[2].get_3() * s3)) * invdet,
            ),
            Float4::new(
                ((-m[1].get_0() * c5) + (m[1].get_2() * c2) - (m[1].get_3() * c1)) * invdet,
                ((m[0].get_0() * c5) - (m[0].get_2() * c2) + (m[0].get_3() * c1)) * invdet,
                ((-m[3].get_0() * s5) + (m[3].get_2() * s2) - (m[3].get_3() * s1)) * invdet,
                ((m[2].get_0() * s5) - (m[2].get_2() * s2) + (m[2].get_3() * s1)) * invdet,
            ),
            Float4::new(
                ((m[1].get_0() * c4) - (m[1].get_1() * c2) + (m[1].get_3() * c0)) * invdet,
                ((-m[0].get_0() * c4) + (m[0].get_1() * c2) - (m[0].get_3() * c0)) * invdet,
                ((m[3].get_0() * s4) - (m[3].get_1() * s2) + (m[3].get_3() * s0)) * invdet,
                ((-m[2].get_0() * s4) + (m[2].get_1() * s2) - (m[2].get_3() * s0)) * invdet,
            ),
            Float4::new(
                ((-m[1].get_0() * c3) + (m[1].get_1() * c1) - (m[1].get_2() * c0)) * invdet,
                ((m[0].get_0() * c3) - (m[0].get_1() * c1) + (m[0].get_2() * c0)) * invdet,
                ((-m[3].get_0() * s3) + (m[3].get_1() * s1) - (m[3].get_2() * s0)) * invdet,
                ((m[2].get_0() * s3) - (m[2].get_1() * s1) + (m[2].get_2() * s0)) * invdet,
            ),
        ];

        det
    }
    /// Essentially a tuple of four bools.
    #[derive(Debug, Copy, Clone)]
    pub struct Bool4 {
        data: [bool; 4],
    }

    impl Bool4 {
        #[inline(always)]
        pub fn new(a: bool, b: bool, c: bool, d: bool) -> Bool4 {
            Bool4 { data: [a, b, c, d] }
        }

        #[inline(always)]
        pub fn new_false() -> Bool4 {
            Bool4 {
                data: [false, false, false, false],
            }
        }

        /// Returns the value of the nth element.
        #[inline(always)]
        pub fn get_n(self, n: usize) -> bool {
            assert!(
                n <= 3,
                "Attempted to access element of Bool4 outside of bounds."
            );
            unsafe { *self.data.get_unchecked(n) }
        }

        /// Returns the value of the 0th element.
        #[inline(always)]
        pub fn get_0(self) -> bool {
            self.get_n(0)
        }

        /// Returns the value of the 1st element.
        #[inline(always)]
        pub fn get_1(self) -> bool {
            self.get_n(1)
        }

        /// Returns the value of the 2nd element.
        #[inline(always)]
        pub fn get_2(self) -> bool {
            self.get_n(2)
        }

        /// Returns the value of the 3rd element.
        #[inline(always)]
        pub fn get_3(self) -> bool {
            self.get_n(3)
        }

        /// Returns whether all four bools are false.
        ///
        /// This is the `NOT` operation on the result of `OR`ing all the
        /// contained bools. If even one bool is true, this returns false.
        #[inline(always)]
        pub fn is_all_false(&self) -> bool {
            !(self.data[0] | self.data[1] | self.data[2] | self.data[3])
        }

        /// Returns the contents as a bitmask, with the 0th element in the
        /// least significant bit.
        #[inline]
        pub fn to_bitmask(self) -> u8 {
            (self.get_0() as u8)
                | ((self.get_1() as u8) << 1)
                | ((self.get_2() as u8) << 2)
                | ((self.get_3() as u8) << 3)
        }
    }

    impl BitAnd for Bool4 {
        type Output = Bool4;

        #[inline(always)]
        fn bitand(self, rhs: Bool4) -> Bool4 {
            Bool4 {
                data: [
                    self.data[0] && rhs.data[0],
                    self.data[1] && rhs.data[1],
                    self.data[2] && rhs.data[2],
                    self.data[3] && rhs.data[3],
                ],
            }
        }
    }

    impl BitOr for Bool4 {
        type Output = Bool4;

        #[inline(always)]
        fn bitor(self, rhs: Bool4) -> Bool4 {
            Bool4 {
                data: [
                    self.data[0] || rhs.data[0],
                    self.data[1] || rhs.data[1],
                    self.data[2] || rhs.data[2],
                    self.data[3] || rhs.data[3],
                ],
            }
        }
    }
}

//===========================================================================

// Re-export whichever implementation matches the compilation target.
#[cfg(all(target_arch = "x86_64", target_feature = "sse"))]
pub use self::x86_64_sse::{invert, transpose, v_max, v_min, Bool4, Float4};

#[cfg(not(all(target_arch = "x86_64", target_feature = "sse")))]
pub use self::fallback::{invert, transpose, v_max, v_min, Bool4, Float4};

//===========================================================================

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn get() {
        let f = Float4::new(1.0, 2.0, 3.0, 4.0);
        assert_eq!(f.get_0(), 1.0);
        assert_eq!(f.get_1(), 2.0);
        assert_eq!(f.get_2(), 3.0);
        assert_eq!(f.get_3(), 4.0);
    }

    #[test]
    fn get_n() {
        let f = Float4::new(1.0, 2.0, 3.0, 4.0);
        assert_eq!(f.get_n(0), 1.0);
        assert_eq!(f.get_n(1), 2.0);
        assert_eq!(f.get_n(2), 3.0);
        assert_eq!(f.get_n(3), 4.0);
    }

    #[test]
    fn set() {
        let mut f = Float4::new(1.0, 2.0, 3.0, 4.0);
        f.set_0(5.0);
        f.set_1(6.0);
        f.set_2(7.0);
        f.set_3(8.0);
        assert_eq!(f.get_0(), 5.0);
        assert_eq!(f.get_1(), 6.0);
        assert_eq!(f.get_2(), 7.0);
        assert_eq!(f.get_3(), 8.0);
    }

    #[test]
    fn set_n() {
        let mut f = Float4::new(1.0, 2.0, 3.0, 4.0);
        f.set_n(0, 5.0);
        f.set_n(1, 6.0);
        f.set_n(2, 7.0);
        f.set_n(3, 8.0);
        assert_eq!(f.get_0(), 5.0);
        assert_eq!(f.get_1(), 6.0);
        assert_eq!(f.get_2(), 7.0);
        assert_eq!(f.get_3(), 8.0);
    }

    #[test]
    fn all() {
        let f = Float4::new(1.0, 2.0, 3.0, 4.0);
        assert_eq!(f.all_0(), Float4::splat(1.0));
        assert_eq!(f.all_1(), Float4::splat(2.0));
        assert_eq!(f.all_2(), Float4::splat(3.0));
        assert_eq!(f.all_3(), Float4::splat(4.0));
    }

    #[test]
    fn partial_eq_1() {
        let f1 = Float4::new(1.0, 2.0, 3.0, 4.0);
        let f2 = Float4::new(1.0, 2.0, 3.0, 4.0);
        assert!(f1 == f2);
    }

    #[test]
    fn partial_eq_2() {
        let f1 = Float4::new(1.0, 2.0, 3.0, 4.0);
        let f2 = Float4::new(1.0, 2.1, 3.0, 4.0);
        assert!(!(f1 == f2));
    }

    #[test]
    fn h_sum() {
        let f = Float4::new(1.0, 2.0, 3.0, 4.0);
        assert_eq!(f.h_sum(), 10.0);
    }

    #[test]
    fn h_product() {
        let f = Float4::new(1.0, 2.0, 3.0, 4.0);
        assert_eq!(f.h_product(), 24.0);
    }

    #[test]
    fn h_min() {
        let f = Float4::new(1.0, 2.0, 3.0, 4.0);
        assert_eq!(f.h_min(), 1.0);
    }

    #[test]
    fn h_max() {
        let f = Float4::new(1.0, 2.0, 3.0, 4.0);
        assert_eq!(f.h_max(), 4.0);
    }

    #[test]
    fn add() {
        let f1 = Float4::new(1.0, 2.0, 3.0, 4.0);
        let f2 = Float4::new(2.0, 3.0, 4.0, 5.0);
        let f3 = Float4::new(3.0, 5.0, 7.0, 9.0);
        assert_eq!(f1 + f2, f3);
    }

    #[test]
    fn sub() {
        let f1 = Float4::new(1.0, 2.0, 3.0, 4.0);
        let f2 = Float4::new(2.0, 3.0, 4.0, 5.0);
        let f3 = Float4::new(-1.0, -1.0, -1.0, -1.0);
        assert_eq!(f1 - f2, f3);
    }
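    // A few extra sanity checks for APIs not exercised above. These are
    // illustrative sketches: they use small integer-valued floats so the
    // expected results are exact under both the SSE and fallback
    // implementations, except for `invert`, which is checked against a
    // tolerance because the SSE path uses an approximate reciprocal.
    #[test]
    fn v_min_max() {
        let f1 = Float4::new(1.0, 5.0, 3.0, 8.0);
        let f2 = Float4::new(2.0, 4.0, 6.0, 7.0);
        assert_eq!(v_min(f1, f2), Float4::new(1.0, 4.0, 3.0, 7.0));
        assert_eq!(v_max(f1, f2), Float4::new(2.0, 5.0, 6.0, 8.0));
    }

    #[test]
    fn sqrt() {
        let f = Float4::new(1.0, 4.0, 9.0, 16.0);
        assert_eq!(f.sqrt(), Float4::new(1.0, 2.0, 3.0, 4.0));
    }

    #[test]
    fn fmadd() {
        let a = Float4::new(1.0, 2.0, 3.0, 4.0);
        let b = Float4::splat(2.0);
        let c = Float4::splat(1.0);
        assert_eq!(a.fmadd(b, c), Float4::new(3.0, 5.0, 7.0, 9.0));
    }

    #[test]
    fn lte_gte() {
        let f1 = Float4::new(1.0, 2.0, 3.0, 4.0);
        let f2 = Float4::new(0.5, 2.0, 3.5, 2.0);
        assert_eq!(f1.lte(f2).to_bitmask(), 0b0000_0110);
        assert_eq!(f1.gte(f2).to_bitmask(), 0b0000_1011);
    }

    #[test]
    fn matrix_invert() {
        // A diagonal matrix inverts to the reciprocal diagonal, with
        // determinant 2 * 4 * 8 * 1 = 64.
        let mut m = [
            Float4::new(2.0, 0.0, 0.0, 0.0),
            Float4::new(0.0, 4.0, 0.0, 0.0),
            Float4::new(0.0, 0.0, 8.0, 0.0),
            Float4::new(0.0, 0.0, 0.0, 1.0),
        ];
        let det = invert(&mut m);
        assert!((det - 64.0).abs() < 0.001);
        let expected = [
            Float4::new(0.5, 0.0, 0.0, 0.0),
            Float4::new(0.0, 0.25, 0.0, 0.0),
            Float4::new(0.0, 0.0, 0.125, 0.0),
            Float4::new(0.0, 0.0, 0.0, 1.0),
        ];
        for (row, exp) in m.iter().zip(expected.iter()) {
            for i in 0..4 {
                assert!((row.get_n(i) - exp.get_n(i)).abs() < 0.001);
            }
        }
    }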
    #[test]
    fn mul_component() {
        let f1 = Float4::new(1.0, 2.0, 3.0, 4.0);
        let f2 = Float4::new(2.0, 3.0, 4.0, 5.0);
        let f3 = Float4::new(2.0, 6.0, 12.0, 20.0);
        assert_eq!(f1 * f2, f3);
    }

    #[test]
    fn mul_scalar() {
        let f1 = Float4::new(1.0, 2.0, 3.0, 4.0);
        let v = 3.0;
        let f2 = Float4::new(3.0, 6.0, 9.0, 12.0);
        assert_eq!(f1 * v, f2);
    }

    #[test]
    fn div_component() {
        let f1 = Float4::new(1.0, 3.0, 3.0, 6.0);
        let f2 = Float4::new(2.0, 2.0, 4.0, 8.0);
        let f3 = Float4::new(0.5, 1.5, 0.75, 0.75);
        assert_eq!(f1 / f2, f3);
    }

    #[test]
    fn div_scalar() {
        let f1 = Float4::new(1.0, 2.0, 3.0, 4.0);
        let v = 2.0;
        let f2 = Float4::new(0.5, 1.0, 1.5, 2.0);
        assert_eq!(f1 / v, f2);
    }

    #[test]
    fn lt() {
        let f1 = Float4::new(1.0, 2.0, 3.0, 4.0);
        let f2 = Float4::new(0.5, 2.0, 3.5, 2.0);
        let r = f1.lt(f2);
        assert_eq!(r.get_0(), false);
        assert_eq!(r.get_1(), false);
        assert_eq!(r.get_2(), true);
        assert_eq!(r.get_3(), false);
    }

    #[test]
    fn gt() {
        let f1 = Float4::new(1.0, 2.0, 3.0, 4.0);
        let f2 = Float4::new(0.5, 2.0, 3.5, 2.0);
        let r = f1.gt(f2);
        assert_eq!(r.get_0(), true);
        assert_eq!(r.get_1(), false);
        assert_eq!(r.get_2(), false);
        assert_eq!(r.get_3(), true);
    }

    #[test]
    fn matrix_transpose() {
        let mut m1 = [
            Float4::new(1.0, 2.0, 3.0, 4.0),
            Float4::new(5.0, 6.0, 7.0, 8.0),
            Float4::new(9.0, 10.0, 11.0, 12.0),
            Float4::new(13.0, 14.0, 15.0, 16.0),
        ];
        let m2 = [
            Float4::new(1.0, 5.0, 9.0, 13.0),
            Float4::new(2.0, 6.0, 10.0, 14.0),
            Float4::new(3.0, 7.0, 11.0, 15.0),
            Float4::new(4.0, 8.0, 12.0, 16.0),
        ];
        transpose(&mut m1);
        assert_eq!(m1, m2);
    }

    #[test]
    fn bool4_bitmask_01() {
        let f1 = Float4::new(0.0, 0.0, 0.0, 0.0);
        let f2 = Float4::new(-1.0, -1.0, 1.0, -1.0);
        let r = f1.lt(f2).to_bitmask();
        assert_eq!(r, 0b00000100);
    }

    #[test]
    fn bool4_bitmask_02() {
        let f1 = Float4::new(0.0, 0.0, 0.0, 0.0);
        let f2 = Float4::new(1.0, -1.0, 1.0, -1.0);
        let r = f1.lt(f2).to_bitmask();
        assert_eq!(r, 0b00000101);
    }

    #[test]
    fn bool4_bitmask_03() {
        let f1 = Float4::new(0.0, 0.0, 0.0, 0.0);
        let f2 = Float4::new(-1.0, 1.0, -1.0, 1.0);
        let r = f1.lt(f2).to_bitmask();
        assert_eq!(r, 0b00001010);
    }

    #[test]
    fn bool4_is_all_false() {
        assert_eq!(true, Bool4::new(false, false, false, false).is_all_false());
        assert_eq!(false, Bool4::new(false, false, true, false).is_all_false());
    }
}