psychopath/sub_crates/rmath/src/wide4.rs

584 lines
15 KiB
Rust

use std::ops::{AddAssign, DivAssign, MulAssign, SubAssign};
use approx::relative_eq;
use crate::{difference_of_products, two_prod, two_sum};
pub use fallback::Float4;
mod fallback {
use std::ops::{Add, Div, Mul, Neg, Sub};
use crate::FMulAdd;
#[derive(Debug, Copy, Clone)]
#[repr(C, align(16))]
pub struct Float4 {
n: [f32; 4],
}
impl Float4 {
/// Create a new `Float4` with the given components.
#[inline(always)]
pub fn new(a: f32, b: f32, c: f32, d: f32) -> Self {
Self { n: [a, b, c, d] }
}
/// Create a new `Float4` with all elements set to `n`.
#[inline(always)]
pub fn splat(n: f32) -> Self {
Self { n: [n, n, n, n] }
}
/// Component-wise fused multiply-add.
///
/// `(self * a) + b` with only one rounding error.
#[inline(always)]
pub fn mul_add(self, a: Self, b: Self) -> Self {
Self {
n: [
self.n[0].mul_add(a.n[0], b.n[0]),
self.n[1].mul_add(a.n[1], b.n[1]),
self.n[2].mul_add(a.n[2], b.n[2]),
self.n[3].mul_add(a.n[3], b.n[3]),
],
}
}
/// Vertical minimum.
#[inline(always)]
pub fn min(self, a: Self) -> Self {
Self {
n: [
self.n[0].min(a.n[0]),
self.n[1].min(a.n[1]),
self.n[2].min(a.n[2]),
self.n[3].min(a.n[3]),
],
}
}
/// Vertical maximum.
#[inline(always)]
pub fn max(self, a: Self) -> Self {
Self {
n: [
self.n[0].max(a.n[0]),
self.n[1].max(a.n[1]),
self.n[2].max(a.n[2]),
self.n[3].max(a.n[3]),
],
}
}
// /// Horizontal minimum.
// #[inline(always)]
// pub fn hmin(self) -> f32 {
// let a = self.n[0].min(self.n[1]);
// let b = self.n[2].min(self.n[3]);
// a.min(b)
// }
// /// Horizontal maximum.
// #[inline(always)]
// pub fn hmax(self) -> f32 {
// let a = self.n[0].max(self.n[1]);
// let b = self.n[2].max(self.n[3]);
// a.max(b)
// }
//-----------------------------------------------------
// Individual components.
#[inline(always)]
pub fn a(self) -> f32 {
self.n[0]
}
#[inline(always)]
pub fn b(self) -> f32 {
self.n[1]
}
#[inline(always)]
pub fn c(self) -> f32 {
self.n[2]
}
#[inline(always)]
pub fn d(self) -> f32 {
self.n[3]
}
#[inline(always)]
#[must_use]
pub fn set_a(self, n: f32) -> Self {
Self {
n: [n, self.n[1], self.n[2], self.n[3]],
}
}
#[inline(always)]
#[must_use]
pub fn set_b(self, n: f32) -> Self {
Self {
n: [self.n[0], n, self.n[2], self.n[3]],
}
}
#[inline(always)]
#[must_use]
pub fn set_c(self, n: f32) -> Self {
Self {
n: [self.n[0], self.n[1], n, self.n[3]],
}
}
#[inline(always)]
#[must_use]
pub fn set_d(self, n: f32) -> Self {
Self {
n: [self.n[0], self.n[1], self.n[2], n],
}
}
//-----------------------------------------------------
// Shuffles.
#[inline(always)]
pub fn aaaa(self) -> Self {
let a = self.n[0];
Self { n: [a, a, a, a] }
}
#[inline(always)]
pub fn bbbb(self) -> Self {
let b = self.n[1];
Self { n: [b, b, b, b] }
}
#[inline(always)]
pub fn cccc(self) -> Self {
let c = self.n[2];
Self { n: [c, c, c, c] }
}
#[inline(always)]
pub fn dddd(self) -> Self {
let d = self.n[3];
Self { n: [d, d, d, d] }
}
#[inline(always)]
pub fn bcad(self) -> Self {
let a = self.n[0];
let b = self.n[1];
let c = self.n[2];
let d = self.n[3];
Self { n: [b, c, a, d] }
}
#[inline(always)]
pub fn cabd(self) -> Self {
let a = self.n[0];
let b = self.n[1];
let c = self.n[2];
let d = self.n[3];
Self { n: [c, a, b, d] }
}
}
impl Add for Float4 {
type Output = Self;
#[inline(always)]
fn add(self, rhs: Self) -> Self {
Self {
n: [
self.n[0] + rhs.n[0],
self.n[1] + rhs.n[1],
self.n[2] + rhs.n[2],
self.n[3] + rhs.n[3],
],
}
}
}
impl Sub for Float4 {
type Output = Self;
#[inline(always)]
fn sub(self, rhs: Self) -> Self {
Self {
n: [
self.n[0] - rhs.n[0],
self.n[1] - rhs.n[1],
self.n[2] - rhs.n[2],
self.n[3] - rhs.n[3],
],
}
}
}
impl Mul for Float4 {
type Output = Self;
#[inline(always)]
fn mul(self, rhs: Self) -> Self {
Self {
n: [
self.n[0] * rhs.n[0],
self.n[1] * rhs.n[1],
self.n[2] * rhs.n[2],
self.n[3] * rhs.n[3],
],
}
}
}
impl Mul<f32> for Float4 {
type Output = Self;
#[inline(always)]
fn mul(self, rhs: f32) -> Self {
Self {
n: [
self.n[0] * rhs,
self.n[1] * rhs,
self.n[2] * rhs,
self.n[3] * rhs,
],
}
}
}
impl Div for Float4 {
type Output = Self;
#[inline(always)]
fn div(self, rhs: Self) -> Self {
Self {
n: [
self.n[0] / rhs.n[0],
self.n[1] / rhs.n[1],
self.n[2] / rhs.n[2],
self.n[3] / rhs.n[3],
],
}
}
}
impl Div<f32> for Float4 {
type Output = Self;
#[inline(always)]
fn div(self, rhs: f32) -> Self {
Self {
n: [
self.n[0] / rhs,
self.n[1] / rhs,
self.n[2] / rhs,
self.n[3] / rhs,
],
}
}
}
impl Neg for Float4 {
type Output = Self;
#[inline(always)]
fn neg(self) -> Self {
Self {
n: [-self.n[0], -self.n[1], -self.n[2], -self.n[3]],
}
}
}
impl FMulAdd for Float4 {
fn fma(self, b: Self, c: Self) -> Self {
self.mul_add(b, c)
}
}
}
//-------------------------------------------------------------
// Float4 impls that don't depend on its inner representation.
impl Float4 {
/// 3D dot product (only uses the first 3 components).
#[inline(always)]
pub fn dot_3(a: Self, b: Self) -> f32 {
let (p, p_err) = two_prod(a, b);
// Products.
let (x, x_err) = (p.a(), p_err.a());
let (y, y_err) = (p.b(), p_err.b());
let (z, z_err) = (p.c(), p_err.c());
// Sums.
let (s1, s1_err) = two_sum(x, y);
let err1 = x_err + (y_err + s1_err);
let (s2, s2_err) = two_sum(s1, z);
let err2 = z_err + (err1 + s2_err);
// Final result with rounding error compensation.
s2 + err2
}
/// 3D dot product (only uses the first 3 components).
///
/// Faster but less precise version.
#[inline(always)]
pub fn dot_3_fast(a: Self, b: Self) -> f32 {
let c = a * b;
c.a() + c.b() + c.c()
}
#[inline(always)]
pub fn transpose_3x3(m: [Self; 3]) -> [Self; 3] {
[
// The fourth component in each row below is arbitrary,
// but in this case chosen so that it matches the
// behavior of the SSE version of transpose_3x3.
Self::new(m[0].a(), m[1].a(), m[2].a(), m[2].d()),
Self::new(m[0].b(), m[1].b(), m[2].b(), m[2].d()),
Self::new(m[0].c(), m[1].c(), m[2].c(), m[2].d()),
]
}
/// Invert a 3x3 matrix.
///
/// Returns `None` if not invertible.
#[inline]
pub fn invert_3x3(m: [Self; 3]) -> Option<[Self; 3]> {
let m0_bca = m[0].bcad();
let m1_bca = m[1].bcad();
let m2_bca = m[2].bcad();
let m0_cab = m[0].cabd();
let m1_cab = m[1].cabd();
let m2_cab = m[2].cabd();
let abc = difference_of_products(m1_bca, m2_cab, m1_cab, m2_bca);
let def = difference_of_products(m2_bca, m0_cab, m2_cab, m0_bca);
let ghi = difference_of_products(m0_bca, m1_cab, m0_cab, m1_bca);
let det = Self::dot_3(
Self::new(abc.a(), def.a(), ghi.a(), 0.0),
Self::new(m[0].a(), m[1].a(), m[2].a(), 0.0),
);
if det == 0.0 {
None
} else {
Some(Self::transpose_3x3([abc / det, def / det, ghi / det]))
}
}
/// Invert a 3x3 matrix. Faster but less precise version.
///
/// Returns `None` if not invertible.
#[inline]
pub fn invert_3x3_fast(m: [Self; 3]) -> Option<[Self; 3]> {
let m0_bca = m[0].bcad();
let m1_bca = m[1].bcad();
let m2_bca = m[2].bcad();
let m0_cab = m[0].cabd();
let m1_cab = m[1].cabd();
let m2_cab = m[2].cabd();
let abc = (m1_bca * m2_cab) - (m1_cab * m2_bca);
let def = (m2_bca * m0_cab) - (m2_cab * m0_bca);
let ghi = (m0_bca * m1_cab) - (m0_cab * m1_bca);
let det = Self::dot_3_fast(
Self::new(abc.a(), def.a(), ghi.a(), 0.0),
Self::new(m[0].a(), m[1].a(), m[2].a(), 0.0),
);
if det == 0.0 {
None
} else {
Some(Self::transpose_3x3([abc / det, def / det, ghi / det]))
}
}
/// Multiplies a 3D vector with a 3x3 matrix.
#[inline]
pub fn vec_mul_3x3(self, m: &[Self; 3]) -> Self {
let x = self.aaaa();
let y = self.bbbb();
let z = self.cccc();
// Products.
let (a, a_err) = two_prod(x, m[0]);
let (b, b_err) = two_prod(y, m[1]);
let (c, c_err) = two_prod(z, m[2]);
// Sums.
let (s1, s1_err) = two_sum(a, b);
let err1 = a_err + (b_err + s1_err);
let (s2, s2_err) = two_sum(c, s1);
let err2 = c_err + (err1 + s2_err);
s2 + err2
}
/// Multiplies a 3D vector with a 3x3 matrix.
///
/// Faster but less precise version.
#[inline]
pub fn vec_mul_3x3_fast(self, m: &[Self; 3]) -> Self {
let x = self.aaaa();
let y = self.bbbb();
let z = self.cccc();
(x * m[0]) + (y * m[1]) + (z * m[2])
}
/// Transforms a 3d point by an affine transform.
///
/// `m` is the 3x3 part of the affine transform, `t` is the translation part.
#[inline]
pub fn vec_mul_affine(self, m: &[Self; 3], t: Self) -> Self {
let x = self.aaaa();
let y = self.bbbb();
let z = self.cccc();
// Products.
let (a, a_err) = two_prod(x, m[0]);
let (b, b_err) = two_prod(y, m[1]);
let (c, c_err) = two_prod(z, m[2]);
// Sums.
let (s1, s1_err) = two_sum(a, b);
let err1 = a_err + (b_err + s1_err);
let (s2, s2_err) = two_sum(c, s1);
let err2 = c_err + (err1 + s2_err);
let (s3, s3_err) = two_sum(t, s2);
let err3 = err2 + s3_err;
s3 + err3
}
/// Transforms a 3d point by an affine transform.
///
/// Faster but less precise version.
#[inline]
pub fn vec_mul_affine_fast(self, m: &[Self; 3], t: Self) -> Self {
let x = self.aaaa();
let y = self.bbbb();
let z = self.cccc();
(x * m[0]) + (y * m[1]) + (z * m[2]) + t
}
/// Transforms a 3d point by an affine transform, except it applies
/// the translation part before the 3x3 part.
///
/// This is primarily useful for performing efficient inverse transforms by
/// passing an inverted 3x3 part and a negated translation part.
///
/// `m` is the 3x3 part of the affine transform, `t` is the translation part.
#[inline]
pub fn vec_mul_affine_rev(self, m: &[Self; 3], t: Self) -> Self {
let (v, v_err) = two_sum(self, t);
let (x, x_err) = (v.aaaa(), v_err.aaaa());
let (y, y_err) = (v.bbbb(), v_err.bbbb());
let (z, z_err) = (v.cccc(), v_err.cccc());
// Products.
let ((a, a_err1), a_err2) = (two_prod(x, m[0]), x_err * m[0]);
let ((b, b_err1), b_err2) = (two_prod(y, m[1]), y_err * m[1]);
let ((c, c_err1), c_err2) = (two_prod(z, m[2]), z_err * m[2]);
let a_err = a_err1 + a_err2;
let b_err = b_err1 + b_err2;
let c_err = c_err1 + c_err2;
// Sums.
let (s1, s1_err) = two_sum(a, b);
let err1 = a_err + (b_err + s1_err);
let (s2, s2_err) = two_sum(c, s1);
let err2 = c_err + (err1 + s2_err);
let (s3, s3_err) = two_sum(t, s2);
let err3 = err2 + s3_err;
s3 + err3
}
/// Transforms a 3d point by an affine transform, except it applies
/// the translation part before the 3x3 part.
///
/// Faster but less precise version.
#[inline]
pub fn vec_mul_affine_rev_fast(self, m: &[Self; 3], t: Self) -> Self {
let v = self + t;
let x = v.aaaa();
let y = v.bbbb();
let z = v.cccc();
(x * m[0]) + (y * m[1]) + (z * m[2])
}
/// Returns whether the `Float4`s are approximately equal to each
/// other.
///
/// Each corresponding element cannot have a relative error exceeding
/// `epsilon`.
pub(crate) fn aprx_eq(a: Self, b: Self, epsilon: f32) -> bool {
let mut eq = true;
eq &= relative_eq!(a.a(), b.a(), epsilon = epsilon);
eq &= relative_eq!(a.b(), b.b(), epsilon = epsilon);
eq &= relative_eq!(a.c(), b.c(), epsilon = epsilon);
eq &= relative_eq!(a.d(), b.d(), epsilon = epsilon);
eq
}
}
impl AddAssign for Float4 {
#[inline(always)]
fn add_assign(&mut self, rhs: Self) {
*self = *self + rhs;
}
}
impl SubAssign for Float4 {
#[inline(always)]
fn sub_assign(&mut self, rhs: Self) {
*self = *self - rhs;
}
}
impl MulAssign for Float4 {
#[inline(always)]
fn mul_assign(&mut self, rhs: Self) {
*self = *self * rhs;
}
}
impl MulAssign<f32> for Float4 {
#[inline(always)]
fn mul_assign(&mut self, rhs: f32) {
*self = *self * rhs;
}
}
impl DivAssign for Float4 {
#[inline(always)]
fn div_assign(&mut self, rhs: Self) {
*self = *self / rhs;
}
}
impl DivAssign<f32> for Float4 {
#[inline(always)]
fn div_assign(&mut self, rhs: f32) {
*self = *self / rhs;
}
}