psychopath/sub_crates/float4/src/lib.rs

1621 lines
47 KiB
Rust

#![allow(dead_code)]
/// Implementation of Float4 for x86_64 platforms with SSE support.
#[cfg(all(target_arch = "x86_64", target_feature = "sse"))]
mod x86_64_sse {
use std::{
arch::x86_64::__m128,
cmp::PartialEq,
ops::{Add, AddAssign, BitAnd, BitOr, Div, DivAssign, Mul, MulAssign, Sub, SubAssign},
};
/// Four `f32` elements stored together in one 128-bit SSE register.
#[derive(Debug, Copy, Clone)]
pub struct Float4 {
    data: __m128,
}

impl Float4 {
    /// Creates a `Float4` whose elements 0 through 3 are `a`, `b`, `c`
    /// and `d` respectively.
    #[inline(always)]
    pub fn new(a: f32, b: f32, c: f32, d: f32) -> Float4 {
        use std::arch::x86_64::_mm_set_ps;
        // `_mm_set_ps` takes the highest element first, hence the
        // reversed argument order.
        let data = unsafe { _mm_set_ps(d, c, b, a) };
        Float4 { data }
    }

    /// Creates a `Float4` with all four elements set to `n`.
    #[inline(always)]
    pub fn splat(n: f32) -> Float4 {
        use std::arch::x86_64::_mm_set1_ps;
        let data = unsafe { _mm_set1_ps(n) };
        Float4 { data }
    }

    /// Returns the sum of all four elements, computed as
    /// `(e0 + e1) + (e2 + e3)`.
    #[inline]
    pub fn h_sum(&self) -> f32 {
        #[cfg(target_feature = "sse3")]
        {
            use std::arch::x86_64::{
                _mm_add_ps, _mm_add_ss, _mm_cvtss_f32, _mm_movehdup_ps, _mm_movehl_ps,
            };
            unsafe {
                let lanes = self.data;
                // Odd elements duplicated downwards: [e1, e1, e3, e3].
                let odd = _mm_movehdup_ps(lanes);
                // Element 0 holds e0 + e1, element 2 holds e2 + e3.
                let pair_sums = _mm_add_ps(lanes, odd);
                // Bring element 2 of `pair_sums` down to element 0.
                let upper = _mm_movehl_ps(odd, pair_sums);
                _mm_cvtss_f32(_mm_add_ss(pair_sums, upper))
            }
        }
        #[cfg(not(target_feature = "sse3"))]
        {
            use std::arch::x86_64::{
                _mm_add_ps, _mm_add_ss, _mm_cvtss_f32, _mm_movehl_ps, _mm_shuffle_ps,
            };
            unsafe {
                let lanes = self.data;
                // Swap neighbours within each pair: [e1, e0, e3, e2].
                let swapped = _mm_shuffle_ps(lanes, lanes, 0b10_11_00_01);
                // Element 0 holds e0 + e1, element 2 holds e2 + e3.
                let pair_sums = _mm_add_ps(lanes, swapped);
                // Bring element 2 of `pair_sums` down to element 0.
                let upper = _mm_movehl_ps(swapped, pair_sums);
                _mm_cvtss_f32(_mm_add_ss(pair_sums, upper))
            }
        }
    }

    /// Returns the product of all four elements, computed as
    /// `(e0 * e1) * (e2 * e3)`.
    #[inline]
    pub fn h_product(&self) -> f32 {
        let low_pair = self.get_0() * self.get_1();
        let high_pair = self.get_2() * self.get_3();
        low_pair * high_pair
    }

    /// Returns the smallest of the four elements.
    #[inline]
    pub fn h_min(&self) -> f32 {
        // Plain `<` comparisons (rather than `f32::min`) keep the exact
        // selection behaviour, including for NaN operands.
        let lesser = |x: f32, y: f32| if x < y { x } else { y };
        let low_pair = lesser(self.get_0(), self.get_1());
        let high_pair = lesser(self.get_2(), self.get_3());
        lesser(low_pair, high_pair)
    }

    /// Returns the largest of the four elements.
    #[inline]
    pub fn h_max(&self) -> f32 {
        // Plain `>` comparisons (rather than `f32::max`) keep the exact
        // selection behaviour, including for NaN operands.
        let greater = |x: f32, y: f32| if x > y { x } else { y };
        let low_pair = greater(self.get_0(), self.get_1());
        let high_pair = greater(self.get_2(), self.get_3());
        greater(low_pair, high_pair)
    }

    /// Element-wise minimum of `self` and `other`.
    #[inline(always)]
    pub fn v_min(&self, other: Float4) -> Float4 {
        use std::arch::x86_64::_mm_min_ps;
        let data = unsafe { _mm_min_ps(self.data, other.data) };
        Float4 { data }
    }

    /// Element-wise maximum of `self` and `other`.
    #[inline(always)]
    pub fn v_max(&self, other: Float4) -> Float4 {
        use std::arch::x86_64::_mm_max_ps;
        let data = unsafe { _mm_max_ps(self.data, other.data) };
        Float4 { data }
    }

    /// Element-wise `self < other`.
    #[inline(always)]
    pub fn lt(&self, other: Float4) -> Bool4 {
        use std::arch::x86_64::_mm_cmplt_ps;
        let data = unsafe { _mm_cmplt_ps(self.data, other.data) };
        Bool4 { data }
    }

    /// Element-wise `self <= other`.
    #[inline(always)]
    pub fn lte(&self, other: Float4) -> Bool4 {
        use std::arch::x86_64::_mm_cmple_ps;
        let data = unsafe { _mm_cmple_ps(self.data, other.data) };
        Bool4 { data }
    }

    /// Element-wise `self > other`.
    #[inline(always)]
    pub fn gt(&self, other: Float4) -> Bool4 {
        use std::arch::x86_64::_mm_cmpgt_ps;
        let data = unsafe { _mm_cmpgt_ps(self.data, other.data) };
        Bool4 { data }
    }

    /// Element-wise `self >= other`.
    #[inline(always)]
    pub fn gte(&self, other: Float4) -> Bool4 {
        use std::arch::x86_64::_mm_cmpge_ps;
        let data = unsafe { _mm_cmpge_ps(self.data, other.data) };
        Bool4 { data }
    }

    /// Set the nth element to the given value.
    ///
    /// # Panics
    ///
    /// Panics if `n` is greater than 3.
    #[inline(always)]
    pub fn set_n(&mut self, n: usize, v: f32) {
        assert!(
            n < 4,
            "Attempted to set element of Float4 outside of bounds."
        );
        // View the register as four consecutive `f32` values.
        let elements = &mut self.data as *mut __m128 as *mut f32;
        // The assert above keeps `n` within those four values.
        unsafe { *elements.add(n) = v }
    }

    /// Set the 0th element to the given value.
    #[inline(always)]
    pub fn set_0(&mut self, v: f32) {
        self.set_n(0, v);
    }

    /// Set the 1st element to the given value.
    #[inline(always)]
    pub fn set_1(&mut self, v: f32) {
        self.set_n(1, v);
    }

    /// Set the 2nd element to the given value.
    #[inline(always)]
    pub fn set_2(&mut self, v: f32) {
        self.set_n(2, v);
    }

    /// Set the 3rd element to the given value.
    #[inline(always)]
    pub fn set_3(&mut self, v: f32) {
        self.set_n(3, v);
    }

    /// Returns the value of the nth element.
    ///
    /// # Panics
    ///
    /// Panics if `n` is greater than 3.
    #[inline(always)]
    pub fn get_n(&self, n: usize) -> f32 {
        assert!(
            n < 4,
            "Attempted to access element of Float4 outside of bounds."
        );
        // View the register as four consecutive `f32` values.
        let elements = &self.data as *const __m128 as *const f32;
        // The assert above keeps `n` within those four values.
        unsafe { *elements.add(n) }
    }

    /// Returns the value of the 0th element.
    #[inline(always)]
    pub fn get_0(&self) -> f32 {
        self.get_n(0)
    }

    /// Returns the value of the 1st element.
    #[inline(always)]
    pub fn get_1(&self) -> f32 {
        self.get_n(1)
    }

    /// Returns the value of the 2nd element.
    #[inline(always)]
    pub fn get_2(&self) -> f32 {
        self.get_n(2)
    }

    /// Returns the value of the 3rd element.
    #[inline(always)]
    pub fn get_3(&self) -> f32 {
        self.get_n(3)
    }

    /// Returns a Float4 with all elements set to the value
    /// of element 0.
    #[inline(always)]
    pub fn all_0(&self) -> Float4 {
        use std::arch::x86_64::_mm_shuffle_ps;
        let data = unsafe { _mm_shuffle_ps(self.data, self.data, 0b00_00_00_00) };
        Float4 { data }
    }

    /// Returns a Float4 with all elements set to the value
    /// of element 1.
    #[inline(always)]
    pub fn all_1(&self) -> Float4 {
        use std::arch::x86_64::_mm_shuffle_ps;
        let data = unsafe { _mm_shuffle_ps(self.data, self.data, 0b01_01_01_01) };
        Float4 { data }
    }

    /// Returns a Float4 with all elements set to the value
    /// of element 2.
    #[inline(always)]
    pub fn all_2(&self) -> Float4 {
        use std::arch::x86_64::_mm_shuffle_ps;
        let data = unsafe { _mm_shuffle_ps(self.data, self.data, 0b10_10_10_10) };
        Float4 { data }
    }

    /// Returns a Float4 with all elements set to the value
    /// of element 3.
    #[inline(always)]
    pub fn all_3(&self) -> Float4 {
        use std::arch::x86_64::_mm_shuffle_ps;
        let data = unsafe { _mm_shuffle_ps(self.data, self.data, 0b11_11_11_11) };
        Float4 { data }
    }

    /// Returns the square roots of all elements.
    #[inline(always)]
    pub fn sqrt(&self) -> Float4 {
        use std::arch::x86_64::_mm_sqrt_ps;
        let data = unsafe { _mm_sqrt_ps(self.data) };
        Float4 { data }
    }

    /// Performs a fused multiply add.
    ///
    /// i.e. self * b + c
    ///
    /// The fused instruction is only used when the `fma` target feature
    /// is enabled; otherwise a separate multiply and add are performed.
    #[inline(always)]
    pub fn fmadd(&self, b: Float4, c: Float4) -> Float4 {
        #[cfg(target_feature = "fma")]
        {
            use std::arch::x86_64::_mm_fmadd_ps;
            let data = unsafe { _mm_fmadd_ps(self.data, b.data, c.data) };
            Float4 { data }
        }
        #[cfg(not(target_feature = "fma"))]
        {
            let product = *self * b;
            product + c
        }
    }
}
impl PartialEq for Float4 {
    /// Two `Float4`s are equal when every pair of corresponding elements
    /// compares equal; elements are checked in order from 0 to 3.
    #[inline]
    fn eq(&self, other: &Float4) -> bool {
        (0..4).all(|i| self.get_n(i) == other.get_n(i))
    }
}
impl Add for Float4 {
    type Output = Float4;

    /// Element-wise addition.
    #[inline(always)]
    fn add(self, other: Float4) -> Float4 {
        use std::arch::x86_64::_mm_add_ps;
        let data = unsafe { _mm_add_ps(self.data, other.data) };
        Float4 { data }
    }
}

impl AddAssign for Float4 {
    /// Element-wise addition, storing the result in `self`.
    #[inline(always)]
    fn add_assign(&mut self, rhs: Float4) {
        let sum = *self + rhs;
        *self = sum;
    }
}
impl Sub for Float4 {
    type Output = Float4;

    /// Element-wise subtraction.
    #[inline(always)]
    fn sub(self, other: Float4) -> Float4 {
        use std::arch::x86_64::_mm_sub_ps;
        let data = unsafe { _mm_sub_ps(self.data, other.data) };
        Float4 { data }
    }
}

impl SubAssign for Float4 {
    /// Element-wise subtraction, storing the result in `self`.
    #[inline(always)]
    fn sub_assign(&mut self, rhs: Float4) {
        let difference = *self - rhs;
        *self = difference;
    }
}
impl Mul for Float4 {
    type Output = Float4;

    /// Element-wise multiplication.
    #[inline(always)]
    fn mul(self, other: Float4) -> Float4 {
        use std::arch::x86_64::_mm_mul_ps;
        let data = unsafe { _mm_mul_ps(self.data, other.data) };
        Float4 { data }
    }
}

impl Mul<f32> for Float4 {
    type Output = Float4;

    /// Multiplies every element by the scalar `other`.
    #[inline(always)]
    fn mul(self, other: f32) -> Float4 {
        let factor = Float4::splat(other);
        self * factor
    }
}

impl MulAssign for Float4 {
    /// Element-wise multiplication, storing the result in `self`.
    #[inline(always)]
    fn mul_assign(&mut self, rhs: Float4) {
        let product = *self * rhs;
        *self = product;
    }
}

impl MulAssign<f32> for Float4 {
    /// Multiplies every element by the scalar `rhs`, storing the result
    /// in `self`.
    #[inline(always)]
    fn mul_assign(&mut self, rhs: f32) {
        let product = *self * rhs;
        *self = product;
    }
}
impl Div for Float4 {
    type Output = Float4;

    /// Element-wise division.
    #[inline(always)]
    fn div(self, other: Float4) -> Float4 {
        use std::arch::x86_64::_mm_div_ps;
        let data = unsafe { _mm_div_ps(self.data, other.data) };
        Float4 { data }
    }
}

impl Div<f32> for Float4 {
    type Output = Float4;

    /// Divides every element by the scalar `other`.
    #[inline(always)]
    fn div(self, other: f32) -> Float4 {
        let divisor = Float4::splat(other);
        self / divisor
    }
}

impl DivAssign for Float4 {
    /// Element-wise division, storing the result in `self`.
    #[inline(always)]
    fn div_assign(&mut self, rhs: Float4) {
        let quotient = *self / rhs;
        *self = quotient;
    }
}

impl DivAssign<f32> for Float4 {
    /// Divides every element by the scalar `rhs`, storing the result
    /// in `self`.
    #[inline(always)]
    fn div_assign(&mut self, rhs: f32) {
        let quotient = *self / rhs;
        *self = quotient;
    }
}
// Free functions for Float4
#[inline(always)]
pub fn v_min(a: Float4, b: Float4) -> Float4 {
a.v_min(b)
}
#[inline(always)]
pub fn v_max(a: Float4, b: Float4) -> Float4 {
a.v_max(b)
}
/// Transposes a 4x4 matrix in-place.
#[inline(always)]
pub fn transpose(matrix: &mut [Float4; 4]) {
use std::arch::x86_64::_MM_TRANSPOSE4_PS;
// The weird &mut/*mut gymnastics below are to get around
// the borrow-checker. We know statically that these references
// are non-overlapping, so it's safe.
unsafe {
_MM_TRANSPOSE4_PS(
&mut *(&mut matrix[0].data as *mut __m128),
&mut *(&mut matrix[1].data as *mut __m128),
&mut *(&mut matrix[2].data as *mut __m128),
&mut *(&mut matrix[3].data as *mut __m128),
)
};
}
/// Inverts a 4x4 matrix and returns the determinate.
#[inline(always)]
pub fn invert(matrix: &mut [Float4; 4]) -> f32 {
// Code pulled from "Streaming SIMD Extensions - Inverse of 4x4 Matrix"
// by Intel.
// ftp://download.intel.com/design/PentiumIII/sml/24504301.pdf
// Ported to Rust.
// TODO: once __m64 and accompanying intrinsics are stabilized, switch
// to using those, commented out in the code below.
use std::arch::x86_64::{
_mm_add_ps,
_mm_add_ss,
_mm_cvtss_f32,
_mm_mul_ps,
_mm_mul_ss,
_mm_rcp_ss,
// _mm_loadh_pi,
// _mm_loadl_pi,
// _mm_storeh_pi,
// _mm_storel_pi,
_mm_set_ps,
_mm_shuffle_ps,
_mm_sub_ps,
_mm_sub_ss,
};
use std::mem::transmute;
let mut minor0: __m128;
let mut minor1: __m128;
let mut minor2: __m128;
let mut minor3: __m128;
let row0: __m128;
let mut row1: __m128;
let mut row2: __m128;
let mut row3: __m128;
let mut det: __m128;
let mut tmp1: __m128;
unsafe {
// tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src)), (__m64*)(src+ 4));
tmp1 = _mm_set_ps(
matrix[1].get_1(),
matrix[1].get_0(),
matrix[0].get_1(),
matrix[0].get_0(),
);
// row1 = _mm_loadh_pi(_mm_loadl_pi(row1, (__m64*)(src+8)), (__m64*)(src+12));
row1 = _mm_set_ps(
matrix[3].get_1(),
matrix[3].get_0(),
matrix[2].get_1(),
matrix[2].get_0(),
);
row0 = _mm_shuffle_ps(tmp1, row1, 0x88);
row1 = _mm_shuffle_ps(row1, tmp1, 0xDD);
// tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src+ 2)), (__m64*)(src+ 6));
tmp1 = _mm_set_ps(
matrix[1].get_3(),
matrix[1].get_2(),
matrix[0].get_3(),
matrix[0].get_2(),
);
// row3 = _mm_loadh_pi(_mm_loadl_pi(row3, (__m64*)(src+10)), (__m64*)(src+14));
row3 = _mm_set_ps(
matrix[3].get_3(),
matrix[3].get_2(),
matrix[2].get_3(),
matrix[2].get_2(),
);
row2 = _mm_shuffle_ps(tmp1, row3, 0x88);
row3 = _mm_shuffle_ps(row3, tmp1, 0xDD);
// -----------------------------------------------
tmp1 = _mm_mul_ps(row2, row3);
tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
minor0 = _mm_mul_ps(row1, tmp1);
minor1 = _mm_mul_ps(row0, tmp1);
tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0);
minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1);
minor1 = _mm_shuffle_ps(minor1, minor1, 0x4E);
// -----------------------------------------------
tmp1 = _mm_mul_ps(row1, row2);
tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0);
minor3 = _mm_mul_ps(row0, tmp1);
tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1));
minor3 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor3);
minor3 = _mm_shuffle_ps(minor3, minor3, 0x4E);
// -----------------------------------------------
tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, 0x4E), row3);
tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
row2 = _mm_shuffle_ps(row2, row2, 0x4E);
minor0 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0);
minor2 = _mm_mul_ps(row0, tmp1);
tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1));
minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2);
minor2 = _mm_shuffle_ps(minor2, minor2, 0x4E);
// -----------------------------------------------
tmp1 = _mm_mul_ps(row0, row1);
tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2);
minor3 = _mm_sub_ps(_mm_mul_ps(row2, tmp1), minor3);
tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2);
minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row2, tmp1));
// -----------------------------------------------
tmp1 = _mm_mul_ps(row0, row3);
tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1));
minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2);
tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1);
minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1));
// -----------------------------------------------
tmp1 = _mm_mul_ps(row0, row2);
tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1);
minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row1, tmp1));
tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1));
minor3 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor3);
// -----------------------------------------------
det = _mm_mul_ps(row0, minor0);
det = _mm_add_ps(_mm_shuffle_ps(det, det, 0x4E), det);
det = _mm_add_ss(_mm_shuffle_ps(det, det, 0xB1), det);
tmp1 = _mm_rcp_ss(det);
det = _mm_sub_ss(
_mm_add_ss(tmp1, tmp1),
_mm_mul_ss(det, _mm_mul_ss(tmp1, tmp1)),
);
det = _mm_shuffle_ps(det, det, 0x00);
minor0 = _mm_mul_ps(det, minor0);
// _mm_storel_pi((__m64*)(src), minor0);
// _mm_storeh_pi((__m64*)(src+2), minor0);
let minor0 = transmute::<__m128, [f32; 4]>(minor0);
matrix[0].data = _mm_set_ps(minor0[3], minor0[2], minor0[1], minor0[0]);
minor1 = _mm_mul_ps(det, minor1);
// _mm_storel_pi((__m64*)(src+4), minor1);
// _mm_storeh_pi((__m64*)(src+6), minor1);
let minor1 = transmute::<__m128, [f32; 4]>(minor1);
matrix[1].data = _mm_set_ps(minor1[3], minor1[2], minor1[1], minor1[0]);
minor2 = _mm_mul_ps(det, minor2);
// _mm_storel_pi((__m64*)(src+ 8), minor2);
// _mm_storeh_pi((__m64*)(src+10), minor2);
let minor2 = transmute::<__m128, [f32; 4]>(minor2);
matrix[2].data = _mm_set_ps(minor2[3], minor2[2], minor2[1], minor2[0]);
minor3 = _mm_mul_ps(det, minor3);
// _mm_storel_pi((__m64*)(src+12), minor3);
// _mm_storeh_pi((__m64*)(src+14), minor3);
let minor3 = transmute::<__m128, [f32; 4]>(minor3);
matrix[3].data = _mm_set_ps(minor3[3], minor3[2], minor3[1], minor3[0]);
_mm_cvtss_f32(det)
}
}
/// Essentially a tuple of four bools, which will use SIMD operations
/// where possible on a platform.
///
/// Each bool occupies one 32-bit element of the register: all bits clear
/// for `false` and all bits set for `true`, the same encoding the SSE
/// comparison intrinsics produce.
#[derive(Debug, Copy, Clone)]
pub struct Bool4 {
    data: __m128,
}

impl Bool4 {
    /// Creates a `Bool4` whose elements 0 through 3 are `a`, `b`, `c`
    /// and `d` respectively.
    #[inline(always)]
    pub fn new(a: bool, b: bool, c: bool, d: bool) -> Bool4 {
        use std::arch::x86_64::_mm_set_ps;

        // `true` must be an all-ones element so that values built here
        // behave exactly like comparison results in `to_bitmask` and the
        // bitwise operators. (Encoding `true` as `1.0` leaves the low
        // byte and the sign bit clear.)
        #[inline(always)]
        fn mask(flag: bool) -> f32 {
            f32::from_bits(if flag { !0u32 } else { 0 })
        }

        Bool4 {
            // `_mm_set_ps` takes the highest element first.
            data: unsafe { _mm_set_ps(mask(d), mask(c), mask(b), mask(a)) },
        }
    }

    /// Creates a `Bool4` with all four elements `false`.
    #[inline(always)]
    pub fn new_false() -> Bool4 {
        use std::arch::x86_64::_mm_set1_ps;
        Bool4 {
            data: unsafe { _mm_set1_ps(0.0) },
        }
    }

    /// Returns the value of the nth element.
    ///
    /// # Panics
    ///
    /// Panics if `n` is greater than 3.
    #[inline(always)]
    pub fn get_n(&self, n: usize) -> bool {
        assert!(
            n <= 3,
            "Attempted to access element of Bool4 outside of bounds."
        );
        // View the register as four consecutive `u32` values; the assert
        // above keeps `n` within them.
        let elements = &self.data as *const __m128 as *const u32;
        0 != unsafe { *elements.add(n) }
    }

    /// Returns the value of the 0th element.
    #[inline(always)]
    pub fn get_0(&self) -> bool {
        self.get_n(0)
    }

    /// Returns the value of the 1st element.
    #[inline(always)]
    pub fn get_1(&self) -> bool {
        self.get_n(1)
    }

    /// Returns the value of the 2nd element.
    #[inline(always)]
    pub fn get_2(&self) -> bool {
        self.get_n(2)
    }

    /// Returns the value of the 3rd element.
    #[inline(always)]
    pub fn get_3(&self) -> bool {
        self.get_n(3)
    }

    /// Returns whether all four bools are false.
    ///
    /// This is the `NOT` operation on the result of `OR`ing all the
    /// contained bools. If even one bool is true, this returns false.
    #[inline(always)]
    pub fn is_all_false(&self) -> bool {
        // All four elements are false exactly when all 128 bits are zero.
        let bits = unsafe { *(&self.data as *const __m128 as *const u128) };
        bits == 0
    }

    /// Packs the four bools into the low four bits of a `u8`, with
    /// element `n` stored in bit `n`.
    #[inline]
    pub fn to_bitmask(&self) -> u8 {
        use std::arch::x86_64::_mm_movemask_ps;
        // `_mm_movemask_ps` collects the top bit of each element, which is
        // set exactly for the all-ones `true` elements.
        (unsafe { _mm_movemask_ps(self.data) }) as u8
    }
}

impl BitAnd for Bool4 {
    type Output = Bool4;

    /// Element-wise logical AND.
    #[inline(always)]
    fn bitand(self, rhs: Bool4) -> Bool4 {
        use std::arch::x86_64::_mm_and_ps;
        Bool4 {
            data: unsafe { _mm_and_ps(self.data, rhs.data) },
        }
    }
}

impl BitOr for Bool4 {
    type Output = Bool4;

    /// Element-wise logical OR.
    #[inline(always)]
    fn bitor(self, rhs: Bool4) -> Bool4 {
        use std::arch::x86_64::_mm_or_ps;
        Bool4 {
            data: unsafe { _mm_or_ps(self.data, rhs.data) },
        }
    }
}
}
//===========================================================================
/// Implementation of Float4 for any platform, foregoing any
/// platform-specific optimizations.
mod fallback {
use std::{
cmp::PartialEq,
ops::{Add, AddAssign, BitAnd, BitOr, Div, DivAssign, Mul, MulAssign, Sub, SubAssign},
};
/// Four `f32` elements stored in a plain array.
#[derive(Debug, Copy, Clone)]
pub struct Float4 {
    data: [f32; 4],
}

impl Float4 {
    /// Creates a `Float4` whose elements 0 through 3 are `a`, `b`, `c`
    /// and `d` respectively.
    #[inline(always)]
    pub fn new(a: f32, b: f32, c: f32, d: f32) -> Float4 {
        let data = [a, b, c, d];
        Float4 { data }
    }

    /// Creates a `Float4` with all four elements set to `n`.
    #[inline(always)]
    pub fn splat(n: f32) -> Float4 {
        Float4 { data: [n; 4] }
    }

    /// Returns the sum of all four elements, computed as
    /// `(e0 + e1) + (e2 + e3)`.
    #[inline]
    pub fn h_sum(&self) -> f32 {
        let low_pair = self.get_0() + self.get_1();
        let high_pair = self.get_2() + self.get_3();
        low_pair + high_pair
    }

    /// Returns the product of all four elements, computed as
    /// `(e0 * e1) * (e2 * e3)`.
    #[inline]
    pub fn h_product(&self) -> f32 {
        let low_pair = self.get_0() * self.get_1();
        let high_pair = self.get_2() * self.get_3();
        low_pair * high_pair
    }

    /// Returns the smallest of the four elements.
    #[inline]
    pub fn h_min(&self) -> f32 {
        // Plain `<` comparisons (rather than `f32::min`) keep the exact
        // selection behaviour, including for NaN operands.
        let lesser = |x: f32, y: f32| if x < y { x } else { y };
        let low_pair = lesser(self.get_0(), self.get_1());
        let high_pair = lesser(self.get_2(), self.get_3());
        lesser(low_pair, high_pair)
    }

    /// Returns the largest of the four elements.
    #[inline]
    pub fn h_max(&self) -> f32 {
        // Plain `>` comparisons (rather than `f32::max`) keep the exact
        // selection behaviour, including for NaN operands.
        let greater = |x: f32, y: f32| if x > y { x } else { y };
        let low_pair = greater(self.get_0(), self.get_1());
        let high_pair = greater(self.get_2(), self.get_3());
        greater(low_pair, high_pair)
    }

    /// Element-wise minimum of `self` and `other`.
    #[inline(always)]
    pub fn v_min(&self, other: Float4) -> Float4 {
        let lesser = |x: f32, y: f32| if x < y { x } else { y };
        Float4::new(
            lesser(self.get_0(), other.get_0()),
            lesser(self.get_1(), other.get_1()),
            lesser(self.get_2(), other.get_2()),
            lesser(self.get_3(), other.get_3()),
        )
    }

    /// Element-wise maximum of `self` and `other`.
    #[inline(always)]
    pub fn v_max(&self, other: Float4) -> Float4 {
        let greater = |x: f32, y: f32| if x > y { x } else { y };
        Float4::new(
            greater(self.get_0(), other.get_0()),
            greater(self.get_1(), other.get_1()),
            greater(self.get_2(), other.get_2()),
            greater(self.get_3(), other.get_3()),
        )
    }

    /// Element-wise `self < other`.
    #[inline(always)]
    pub fn lt(&self, other: Float4) -> Bool4 {
        let (l, r) = (self.data, other.data);
        Bool4 {
            data: [l[0] < r[0], l[1] < r[1], l[2] < r[2], l[3] < r[3]],
        }
    }

    /// Element-wise `self <= other`.
    #[inline(always)]
    pub fn lte(&self, other: Float4) -> Bool4 {
        let (l, r) = (self.data, other.data);
        Bool4 {
            data: [l[0] <= r[0], l[1] <= r[1], l[2] <= r[2], l[3] <= r[3]],
        }
    }

    /// Element-wise `self > other`.
    #[inline(always)]
    pub fn gt(&self, other: Float4) -> Bool4 {
        let (l, r) = (self.data, other.data);
        Bool4 {
            data: [l[0] > r[0], l[1] > r[1], l[2] > r[2], l[3] > r[3]],
        }
    }

    /// Element-wise `self >= other`.
    #[inline(always)]
    pub fn gte(&self, other: Float4) -> Bool4 {
        let (l, r) = (self.data, other.data);
        Bool4 {
            data: [l[0] >= r[0], l[1] >= r[1], l[2] >= r[2], l[3] >= r[3]],
        }
    }

    /// Set the nth element to the given value.
    ///
    /// # Panics
    ///
    /// Panics if `n` is greater than 3.
    #[inline(always)]
    pub fn set_n(&mut self, n: usize, v: f32) {
        assert!(
            n < 4,
            "Attempted to set element of Float4 outside of bounds."
        );
        // The assert above guarantees the index is in bounds.
        self.data[n] = v;
    }

    /// Set the 0th element to the given value.
    #[inline(always)]
    pub fn set_0(&mut self, v: f32) {
        self.set_n(0, v);
    }

    /// Set the 1st element to the given value.
    #[inline(always)]
    pub fn set_1(&mut self, v: f32) {
        self.set_n(1, v);
    }

    /// Set the 2nd element to the given value.
    #[inline(always)]
    pub fn set_2(&mut self, v: f32) {
        self.set_n(2, v);
    }

    /// Set the 3rd element to the given value.
    #[inline(always)]
    pub fn set_3(&mut self, v: f32) {
        self.set_n(3, v);
    }

    /// Returns the value of the nth element.
    ///
    /// # Panics
    ///
    /// Panics if `n` is greater than 3.
    #[inline(always)]
    pub fn get_n(&self, n: usize) -> f32 {
        assert!(
            n < 4,
            "Attempted to access element of Float4 outside of bounds."
        );
        // The assert above guarantees the index is in bounds.
        self.data[n]
    }

    /// Returns the value of the 0th element.
    #[inline(always)]
    pub fn get_0(&self) -> f32 {
        self.get_n(0)
    }

    /// Returns the value of the 1st element.
    #[inline(always)]
    pub fn get_1(&self) -> f32 {
        self.get_n(1)
    }

    /// Returns the value of the 2nd element.
    #[inline(always)]
    pub fn get_2(&self) -> f32 {
        self.get_n(2)
    }

    /// Returns the value of the 3rd element.
    #[inline(always)]
    pub fn get_3(&self) -> f32 {
        self.get_n(3)
    }

    /// Returns a Float4 with all elements set to the value
    /// of element 0.
    #[inline(always)]
    pub fn all_0(&self) -> Float4 {
        Float4 {
            data: [self.data[0]; 4],
        }
    }

    /// Returns a Float4 with all elements set to the value
    /// of element 1.
    #[inline(always)]
    pub fn all_1(&self) -> Float4 {
        Float4 {
            data: [self.data[1]; 4],
        }
    }

    /// Returns a Float4 with all elements set to the value
    /// of element 2.
    #[inline(always)]
    pub fn all_2(&self) -> Float4 {
        Float4 {
            data: [self.data[2]; 4],
        }
    }

    /// Returns a Float4 with all elements set to the value
    /// of element 3.
    #[inline(always)]
    pub fn all_3(&self) -> Float4 {
        Float4 {
            data: [self.data[3]; 4],
        }
    }

    /// Returns the square roots of all elements.
    #[inline(always)]
    pub fn sqrt(&self) -> Float4 {
        let roots = [
            self.get_0().sqrt(),
            self.get_1().sqrt(),
            self.get_2().sqrt(),
            self.get_3().sqrt(),
        ];
        Float4 { data: roots }
    }

    /// Performs a multiply add.
    ///
    /// i.e. self * b + c
    ///
    /// This implementation performs a separate multiply and add rather
    /// than a single fused operation.
    #[inline(always)]
    pub fn fmadd(&self, b: Float4, c: Float4) -> Float4 {
        let product = *self * b;
        product + c
    }
}
impl PartialEq for Float4 {
    /// Two `Float4`s are equal when every pair of corresponding elements
    /// compares equal.
    #[inline]
    fn eq(&self, other: &Float4) -> bool {
        (0..4).all(|i| self.get_n(i) == other.get_n(i))
    }
}
impl Add for Float4 {
    type Output = Float4;

    /// Element-wise addition.
    #[inline(always)]
    fn add(self, other: Float4) -> Float4 {
        let ([a0, a1, a2, a3], [b0, b1, b2, b3]) = (self.data, other.data);
        Float4 {
            data: [a0 + b0, a1 + b1, a2 + b2, a3 + b3],
        }
    }
}

impl AddAssign for Float4 {
    /// Element-wise addition, storing the result in `self`.
    #[inline(always)]
    fn add_assign(&mut self, rhs: Float4) {
        let sum = *self + rhs;
        *self = sum;
    }
}
impl Sub for Float4 {
    type Output = Float4;

    /// Element-wise subtraction.
    #[inline(always)]
    fn sub(self, other: Float4) -> Float4 {
        let ([a0, a1, a2, a3], [b0, b1, b2, b3]) = (self.data, other.data);
        Float4 {
            data: [a0 - b0, a1 - b1, a2 - b2, a3 - b3],
        }
    }
}

impl SubAssign for Float4 {
    /// Element-wise subtraction, storing the result in `self`.
    #[inline(always)]
    fn sub_assign(&mut self, rhs: Float4) {
        let difference = *self - rhs;
        *self = difference;
    }
}
impl Mul for Float4 {
    type Output = Float4;

    /// Element-wise multiplication.
    #[inline(always)]
    fn mul(self, other: Float4) -> Float4 {
        let ([a0, a1, a2, a3], [b0, b1, b2, b3]) = (self.data, other.data);
        Float4 {
            data: [a0 * b0, a1 * b1, a2 * b2, a3 * b3],
        }
    }
}

impl Mul<f32> for Float4 {
    type Output = Float4;

    /// Multiplies every element by the scalar `other`.
    #[inline(always)]
    fn mul(self, other: f32) -> Float4 {
        let [a0, a1, a2, a3] = self.data;
        Float4 {
            data: [a0 * other, a1 * other, a2 * other, a3 * other],
        }
    }
}

impl MulAssign for Float4 {
    /// Element-wise multiplication, storing the result in `self`.
    #[inline(always)]
    fn mul_assign(&mut self, rhs: Float4) {
        let product = *self * rhs;
        *self = product;
    }
}

impl MulAssign<f32> for Float4 {
    /// Multiplies every element by the scalar `rhs`, storing the result
    /// in `self`.
    #[inline(always)]
    fn mul_assign(&mut self, rhs: f32) {
        let product = *self * rhs;
        *self = product;
    }
}
impl Div for Float4 {
    type Output = Float4;

    /// Element-wise division.
    #[inline(always)]
    fn div(self, other: Float4) -> Float4 {
        let ([a0, a1, a2, a3], [b0, b1, b2, b3]) = (self.data, other.data);
        Float4 {
            data: [a0 / b0, a1 / b1, a2 / b2, a3 / b3],
        }
    }
}

impl Div<f32> for Float4 {
    type Output = Float4;

    /// Divides every element by the scalar `other`.
    #[inline(always)]
    fn div(self, other: f32) -> Float4 {
        let [a0, a1, a2, a3] = self.data;
        Float4 {
            data: [a0 / other, a1 / other, a2 / other, a3 / other],
        }
    }
}

impl DivAssign for Float4 {
    /// Element-wise division, storing the result in `self`.
    #[inline(always)]
    fn div_assign(&mut self, rhs: Float4) {
        let quotient = *self / rhs;
        *self = quotient;
    }
}

impl DivAssign<f32> for Float4 {
    /// Divides every element by the scalar `rhs`, storing the result
    /// in `self`.
    #[inline(always)]
    fn div_assign(&mut self, rhs: f32) {
        let quotient = *self / rhs;
        *self = quotient;
    }
}
// Free functions for Float4
#[inline(always)]
pub fn v_min(a: Float4, b: Float4) -> Float4 {
a.v_min(b)
}
#[inline(always)]
pub fn v_max(a: Float4, b: Float4) -> Float4 {
a.v_max(b)
}
/// Transposes a 4x4 matrix in-place.
#[inline(always)]
pub fn transpose(matrix: &mut [Float4; 4]) {
    // Work from a copy of the rows so the writes below cannot disturb
    // the values still being read.
    let [r0, r1, r2, r3] = *matrix;
    // Row `n` of the result is column `n` of the input.
    *matrix = [
        Float4::new(r0.get_0(), r1.get_0(), r2.get_0(), r3.get_0()),
        Float4::new(r0.get_1(), r1.get_1(), r2.get_1(), r3.get_1()),
        Float4::new(r0.get_2(), r1.get_2(), r2.get_2(), r3.get_2()),
        Float4::new(r0.get_3(), r1.get_3(), r2.get_3(), r3.get_3()),
    ];
}
/// Inverts a 4x4 matrix in-place and returns its determinant.
///
/// The determinant is assembled from 2x2 sub-determinants of the top two
/// rows (`s0`..`s5`) and of the bottom two rows (`c0`..`c5`). The same
/// sub-determinants are then reused to build every element of the
/// result, each scaled by `1.0 / det`.
#[inline(always)]
pub fn invert(matrix: &mut [Float4; 4]) -> f32 {
// Work from a copy so the input can be overwritten at the end.
let m = *matrix;
// 2x2 sub-determinants taken from rows 0 and 1.
let s0 = (m[0].get_0() * m[1].get_1()) - (m[1].get_0() * m[0].get_1());
let s1 = (m[0].get_0() * m[1].get_2()) - (m[1].get_0() * m[0].get_2());
let s2 = (m[0].get_0() * m[1].get_3()) - (m[1].get_0() * m[0].get_3());
let s3 = (m[0].get_1() * m[1].get_2()) - (m[1].get_1() * m[0].get_2());
let s4 = (m[0].get_1() * m[1].get_3()) - (m[1].get_1() * m[0].get_3());
let s5 = (m[0].get_2() * m[1].get_3()) - (m[1].get_2() * m[0].get_3());
// 2x2 sub-determinants taken from rows 2 and 3.
let c5 = (m[2].get_2() * m[3].get_3()) - (m[3].get_2() * m[2].get_3());
let c4 = (m[2].get_1() * m[3].get_3()) - (m[3].get_1() * m[2].get_3());
let c3 = (m[2].get_1() * m[3].get_2()) - (m[3].get_1() * m[2].get_2());
let c2 = (m[2].get_0() * m[3].get_3()) - (m[3].get_0() * m[2].get_3());
let c1 = (m[2].get_0() * m[3].get_2()) - (m[3].get_0() * m[2].get_2());
let c0 = (m[2].get_0() * m[3].get_1()) - (m[3].get_0() * m[2].get_1());
// We don't check for 0.0 determinant, as that is expected to be handled
// by the calling code.
let det = (s0 * c5) - (s1 * c4) + (s2 * c3) + (s3 * c2) - (s4 * c1) + (s5 * c0);
let invdet = 1.0 / det;
// Each element is a signed combination of three sub-determinants,
// scaled by the reciprocal of the determinant.
*matrix = [
Float4::new(
((m[1].get_1() * c5) - (m[1].get_2() * c4) + (m[1].get_3() * c3)) * invdet,
((-m[0].get_1() * c5) + (m[0].get_2() * c4) - (m[0].get_3() * c3)) * invdet,
((m[3].get_1() * s5) - (m[3].get_2() * s4) + (m[3].get_3() * s3)) * invdet,
((-m[2].get_1() * s5) + (m[2].get_2() * s4) - (m[2].get_3() * s3)) * invdet,
),
Float4::new(
((-m[1].get_0() * c5) + (m[1].get_2() * c2) - (m[1].get_3() * c1)) * invdet,
((m[0].get_0() * c5) - (m[0].get_2() * c2) + (m[0].get_3() * c1)) * invdet,
((-m[3].get_0() * s5) + (m[3].get_2() * s2) - (m[3].get_3() * s1)) * invdet,
((m[2].get_0() * s5) - (m[2].get_2() * s2) + (m[2].get_3() * s1)) * invdet,
),
Float4::new(
((m[1].get_0() * c4) - (m[1].get_1() * c2) + (m[1].get_3() * c0)) * invdet,
((-m[0].get_0() * c4) + (m[0].get_1() * c2) - (m[0].get_3() * c0)) * invdet,
((m[3].get_0() * s4) - (m[3].get_1() * s2) + (m[3].get_3() * s0)) * invdet,
((-m[2].get_0() * s4) + (m[2].get_1() * s2) - (m[2].get_3() * s0)) * invdet,
),
Float4::new(
((-m[1].get_0() * c3) + (m[1].get_1() * c1) - (m[1].get_2() * c0)) * invdet,
((m[0].get_0() * c3) - (m[0].get_1() * c1) + (m[0].get_2() * c0)) * invdet,
((-m[3].get_0() * s3) + (m[3].get_1() * s1) - (m[3].get_2() * s0)) * invdet,
((m[2].get_0() * s3) - (m[2].get_1() * s1) + (m[2].get_2() * s0)) * invdet,
),
];
det
}
/// Essentially a tuple of four bools.
#[derive(Debug, Copy, Clone)]
pub struct Bool4 {
    data: [bool; 4],
}

impl Bool4 {
    /// Creates a `Bool4` whose elements 0 through 3 are `a`, `b`, `c`
    /// and `d` respectively.
    #[inline(always)]
    pub fn new(a: bool, b: bool, c: bool, d: bool) -> Bool4 {
        let data = [a, b, c, d];
        Bool4 { data }
    }

    /// Creates a `Bool4` with all four elements `false`.
    #[inline(always)]
    pub fn new_false() -> Bool4 {
        Bool4 { data: [false; 4] }
    }

    /// Returns the value of the nth element.
    ///
    /// # Panics
    ///
    /// Panics if `n` is greater than 3.
    #[inline(always)]
    pub fn get_n(self, n: usize) -> bool {
        assert!(
            n < 4,
            "Attempted to access element of Bool4 outside of bounds."
        );
        // The assert above guarantees the index is in bounds.
        self.data[n]
    }

    /// Returns the value of the 0th element.
    #[inline(always)]
    pub fn get_0(self) -> bool {
        self.get_n(0)
    }

    /// Returns the value of the 1st element.
    #[inline(always)]
    pub fn get_1(self) -> bool {
        self.get_n(1)
    }

    /// Returns the value of the 2nd element.
    #[inline(always)]
    pub fn get_2(self) -> bool {
        self.get_n(2)
    }

    /// Returns the value of the 3rd element.
    #[inline(always)]
    pub fn get_3(self) -> bool {
        self.get_n(3)
    }

    /// Returns whether all four bools are false.
    ///
    /// This is the `NOT` operation on the result of `OR`ing all the
    /// contained bools. If even one bool is true, this returns false.
    #[inline(always)]
    pub fn is_all_false(&self) -> bool {
        let [a, b, c, d] = self.data;
        let any_set = a | b | c | d;
        !any_set
    }

    /// Packs the four bools into the low four bits of a `u8`, with
    /// element `n` stored in bit `n`.
    #[inline]
    pub fn to_bitmask(self) -> u8 {
        self.data
            .iter()
            .enumerate()
            .fold(0u8, |mask, (bit, &flag)| mask | ((flag as u8) << bit))
    }
}

impl BitAnd for Bool4 {
    type Output = Bool4;

    /// Element-wise logical AND.
    #[inline(always)]
    fn bitand(self, rhs: Bool4) -> Bool4 {
        let (l, r) = (self.data, rhs.data);
        Bool4 {
            data: [l[0] & r[0], l[1] & r[1], l[2] & r[2], l[3] & r[3]],
        }
    }
}

impl BitOr for Bool4 {
    type Output = Bool4;

    /// Element-wise logical OR.
    #[inline(always)]
    fn bitor(self, rhs: Bool4) -> Bool4 {
        let (l, r) = (self.data, rhs.data);
        Bool4 {
            data: [l[0] | r[0], l[1] | r[1], l[2] | r[2], l[3] | r[3]],
        }
    }
}
}
//===========================================================================
#[cfg(all(target_arch = "x86_64", target_feature = "sse"))]
pub use crate::x86_64_sse::{invert, transpose, v_max, v_min, Bool4, Float4};
#[cfg(not(all(target_arch = "x86_64", target_feature = "sse")))]
pub use fallback::{invert, transpose, v_max, v_min, Bool4, Float4};
//===========================================================================
#[cfg(test)]
mod tests {
    use super::*;

    /// Unpacks a `Float4` into an array so that all four elements can be
    /// checked in a single assertion.
    fn elements(f: Float4) -> [f32; 4] {
        [f.get_0(), f.get_1(), f.get_2(), f.get_3()]
    }

    /// Unpacks a `Bool4` into an array of plain bools.
    fn flags(b: Bool4) -> [bool; 4] {
        [b.get_0(), b.get_1(), b.get_2(), b.get_3()]
    }

    #[test]
    fn get() {
        let f = Float4::new(1.0, 2.0, 3.0, 4.0);
        assert_eq!(elements(f), [1.0, 2.0, 3.0, 4.0]);
    }

    #[test]
    fn get_n() {
        let f = Float4::new(1.0, 2.0, 3.0, 4.0);
        let got = [f.get_n(0), f.get_n(1), f.get_n(2), f.get_n(3)];
        assert_eq!(got, [1.0, 2.0, 3.0, 4.0]);
    }

    #[test]
    fn set() {
        let mut f = Float4::new(1.0, 2.0, 3.0, 4.0);
        f.set_0(5.0);
        f.set_1(6.0);
        f.set_2(7.0);
        f.set_3(8.0);
        assert_eq!(elements(f), [5.0, 6.0, 7.0, 8.0]);
    }

    #[test]
    fn set_n() {
        let mut f = Float4::new(1.0, 2.0, 3.0, 4.0);
        for (index, value) in [5.0f32, 6.0, 7.0, 8.0].iter().enumerate() {
            f.set_n(index, *value);
        }
        assert_eq!(elements(f), [5.0, 6.0, 7.0, 8.0]);
    }

    #[test]
    fn all() {
        let f = Float4::new(1.0, 2.0, 3.0, 4.0);
        let broadcasts = [f.all_0(), f.all_1(), f.all_2(), f.all_3()];
        let expected = [
            Float4::splat(1.0),
            Float4::splat(2.0),
            Float4::splat(3.0),
            Float4::splat(4.0),
        ];
        assert_eq!(broadcasts, expected);
    }

    #[test]
    fn partial_eq_1() {
        let f1 = Float4::new(1.0, 2.0, 3.0, 4.0);
        let f2 = Float4::new(1.0, 2.0, 3.0, 4.0);
        assert!(f1 == f2);
    }

    #[test]
    fn partial_eq_2() {
        let f1 = Float4::new(1.0, 2.0, 3.0, 4.0);
        let f2 = Float4::new(1.0, 2.1, 3.0, 4.0);
        assert!(f1 != f2);
    }

    #[test]
    fn h_sum() {
        let total = Float4::new(1.0, 2.0, 3.0, 4.0).h_sum();
        assert_eq!(total, 10.0);
    }

    #[test]
    fn h_product() {
        let product = Float4::new(1.0, 2.0, 3.0, 4.0).h_product();
        assert_eq!(product, 24.0);
    }

    #[test]
    fn h_min() {
        let smallest = Float4::new(1.0, 2.0, 3.0, 4.0).h_min();
        assert_eq!(smallest, 1.0);
    }

    #[test]
    fn h_max() {
        let largest = Float4::new(1.0, 2.0, 3.0, 4.0).h_max();
        assert_eq!(largest, 4.0);
    }

    #[test]
    fn add() {
        let lhs = Float4::new(1.0, 2.0, 3.0, 4.0);
        let rhs = Float4::new(2.0, 3.0, 4.0, 5.0);
        assert_eq!(lhs + rhs, Float4::new(3.0, 5.0, 7.0, 9.0));
    }

    #[test]
    fn sub() {
        let lhs = Float4::new(1.0, 2.0, 3.0, 4.0);
        let rhs = Float4::new(2.0, 3.0, 4.0, 5.0);
        assert_eq!(lhs - rhs, Float4::new(-1.0, -1.0, -1.0, -1.0));
    }

    #[test]
    fn mul_component() {
        let lhs = Float4::new(1.0, 2.0, 3.0, 4.0);
        let rhs = Float4::new(2.0, 3.0, 4.0, 5.0);
        assert_eq!(lhs * rhs, Float4::new(2.0, 6.0, 12.0, 20.0));
    }

    #[test]
    fn mul_scalar() {
        let lhs = Float4::new(1.0, 2.0, 3.0, 4.0);
        let scale = 3.0;
        assert_eq!(lhs * scale, Float4::new(3.0, 6.0, 9.0, 12.0));
    }

    #[test]
    fn div_component() {
        let lhs = Float4::new(1.0, 3.0, 3.0, 6.0);
        let rhs = Float4::new(2.0, 2.0, 4.0, 8.0);
        assert_eq!(lhs / rhs, Float4::new(0.5, 1.5, 0.75, 0.75));
    }

    #[test]
    fn div_scalar() {
        let lhs = Float4::new(1.0, 2.0, 3.0, 4.0);
        let divisor = 2.0;
        assert_eq!(lhs / divisor, Float4::new(0.5, 1.0, 1.5, 2.0));
    }

    #[test]
    fn lt() {
        let lhs = Float4::new(1.0, 2.0, 3.0, 4.0);
        let rhs = Float4::new(0.5, 2.0, 3.5, 2.0);
        assert_eq!(flags(lhs.lt(rhs)), [false, false, true, false]);
    }

    #[test]
    fn gt() {
        let lhs = Float4::new(1.0, 2.0, 3.0, 4.0);
        let rhs = Float4::new(0.5, 2.0, 3.5, 2.0);
        assert_eq!(flags(lhs.gt(rhs)), [true, false, false, true]);
    }

    #[test]
    fn matrix_transpose() {
        let mut matrix = [
            Float4::new(1.0, 2.0, 3.0, 4.0),
            Float4::new(5.0, 6.0, 7.0, 8.0),
            Float4::new(9.0, 10.0, 11.0, 12.0),
            Float4::new(13.0, 14.0, 15.0, 16.0),
        ];
        let expected = [
            Float4::new(1.0, 5.0, 9.0, 13.0),
            Float4::new(2.0, 6.0, 10.0, 14.0),
            Float4::new(3.0, 7.0, 11.0, 15.0),
            Float4::new(4.0, 8.0, 12.0, 16.0),
        ];
        transpose(&mut matrix);
        assert_eq!(matrix, expected);
    }

    #[test]
    fn bool4_bitmask_01() {
        let zero = Float4::splat(0.0);
        let other = Float4::new(-1.0, -1.0, 1.0, -1.0);
        assert_eq!(zero.lt(other).to_bitmask(), 0b00000100);
    }

    #[test]
    fn bool4_bitmask_02() {
        let zero = Float4::splat(0.0);
        let other = Float4::new(1.0, -1.0, 1.0, -1.0);
        assert_eq!(zero.lt(other).to_bitmask(), 0b00000101);
    }

    #[test]
    fn bool4_bitmask_03() {
        let zero = Float4::splat(0.0);
        let other = Float4::new(-1.0, 1.0, -1.0, 1.0);
        assert_eq!(zero.lt(other).to_bitmask(), 0b00001010);
    }

    #[test]
    fn bool4_is_all_false() {
        assert!(Bool4::new(false, false, false, false).is_all_false());
        assert!(!Bool4::new(false, false, true, false).is_all_false());
    }
}