Implement dot products and 3x3 matrix inversion.

Both precise and fast versions. But untested, so might be incorrect!
2022-07-14 15:30:30 -07:00 · 2022-07-14 15:30:30 -07:00 · c398387b55
commit c398387b55
parent 8a695a7694
5 changed files with 302 additions and 72 deletions
--- a/sub_crates/rmath/src/lib.rs
+++ b/sub_crates/rmath/src/lib.rs
@ -8,17 +8,26 @@ mod vector;
 pub mod wide4;
 mod xform;

+use std::ops::{Add, Mul, Neg, Sub};
+
 pub use self::{normal::Normal, point::Point, vector::Vector, xform::Xform, xform::XformFull};

-// /// Trait for calculating dot products.
-// pub trait DotProduct {
-//     fn dot(self, other: Self) -> f32;
-// }
+/// Trait for calculating dot products.
+pub trait DotProduct {
+    fn dot(self, other: Self) -> f32;

-// #[inline]
-// pub fn dot<T: DotProduct>(a: T, b: T) -> f32 {
-//     a.dot(b)
-// }
+    fn dot_fast(self, other: Self) -> f32;
+}
+
+#[inline(always)]
+pub fn dot<T: DotProduct>(a: T, b: T) -> f32 {
+    a.dot(b)
+}
+
+#[inline(always)]
+pub fn dot_fast<T: DotProduct>(a: T, b: T) -> f32 {
+    a.dot_fast(b)
+}

 // /// Trait for calculating cross products.
 // pub trait CrossProduct {
@ -29,3 +38,68 @@ pub use self::{normal::Normal, point::Point, vector::Vector, xform::Xform, xform
 // pub fn cross<T: CrossProduct>(a: T, b: T) -> T {
 //     a.cross(b)
 // }
+
+//-------------------------------------------------------------
+
+/// Trait representing types that can do fused multiply-add.
+trait FMulAdd {
+    /// `(self * b) + c` with only one floating point rounding error.
+    fn fma(self, b: Self, c: Self) -> Self;
+}
+
+impl FMulAdd for f32 {
+    fn fma(self, b: Self, c: Self) -> Self {
+        self.mul_add(b, c)
+    }
+}
+
+/// `(a * b) - (c * d)` but done with high precision via floating point tricks.
+///
+/// See https://pharr.org/matt/blog/2019/11/03/difference-of-floats
+#[inline(always)]
+fn difference_of_products<T>(a: T, b: T, c: T, d: T) -> T
+where
+    T: Copy + FMulAdd + Add<Output = T> + Mul<Output = T> + Neg<Output = T>,
+{
+    let cd = c * d;
+    let dop = a.fma(b, -cd);
+    let err = (-c).fma(d, cd);
+    dop + err
+}
+
+/// `(a * b) + (c * d)` but done with high precision via floating point tricks.
+#[inline(always)]
+fn sum_of_products<T>(a: T, b: T, c: T, d: T) -> T
+where
+    T: Copy + FMulAdd + Add<Output = T> + Mul<Output = T> + Neg<Output = T>,
+{
+    let cd = c * d;
+    let sop = a.fma(b, cd);
+    let err = c.fma(d, -cd);
+    sop + err
+}
+
+/// `a * b` but also returns a rounding error for precise composition
+/// with other operations.
+#[inline(always)]
+fn two_prod<T>(a: T, b: T) -> (T, T)
+// (product, rounding_err)
+where
+    T: Copy + FMulAdd + Mul<Output = T> + Neg<Output = T>,
+{
+    let ab = a * b;
+    (ab, a.fma(b, -ab))
+}
+
+/// `a + b` but also returns a rounding error for precise composition
+/// with other operations.
+#[inline(always)]
+fn two_sum<T>(a: T, b: T) -> (T, T)
+// (sum, rounding_err)
+where
+    T: Copy + Add<Output = T> + Sub<Output = T>,
+{
+    let sum = a + b;
+    let delta = sum - a;
+    (sum, (a - (sum - delta)) + (b - delta))
+}
--- a/sub_crates/rmath/src/normal.rs
+++ b/sub_crates/rmath/src/normal.rs
@ -3,7 +3,7 @@
 use std::ops::{Add, Div, Mul, Neg, Sub};

 use crate::wide4::Float4;
-
+use crate::DotProduct;
 use crate::Vector;

 /// A surface normal in 3D space.
@ -100,17 +100,6 @@ impl Mul<f32> for Normal {
    }
 }

-// impl Mul<Transform> for Normal {
-//     type Output = Normal;
-
-//     #[inline]
-//     fn mul(self, other: Transform) -> Normal {
-//         Normal {
-//             co: other.0.matrix3.inverse().transpose().mul_vec3a(self.co),
-//         }
-//     }
-// }
-
 impl Div<f32> for Normal {
    type Output = Self;

@ -129,12 +118,17 @@ impl Neg for Normal {
    }
 }

-// impl DotProduct for Normal {
-//     #[inline(always)]
-//     fn dot(self, other: Normal) -> f32 {
-//         self.co.dot(other.co)
-//     }
-// }
+impl DotProduct for Normal {
+    #[inline(always)]
+    fn dot(self, other: Self) -> f32 {
+        Float4::dot_3(self.0, other.0)
+    }
+
+    #[inline(always)]
+    fn dot_fast(self, other: Self) -> f32 {
+        Float4::dot_3_fast(self.0, other.0)
+    }
+}

 // impl CrossProduct for Normal {
 //     #[inline]
--- a/sub_crates/rmath/src/vector.rs
+++ b/sub_crates/rmath/src/vector.rs
@ -5,6 +5,7 @@ use std::ops::{Add, Div, Mul, Neg, Sub};
 use crate::normal::Normal;
 use crate::point::Point;
 use crate::wide4::Float4;
+use crate::DotProduct;

 /// A direction vector in 3D space.
 #[derive(Debug, Copy, Clone)]
@ -105,15 +106,6 @@ impl Mul<f32> for Vector {
    }
 }

-// impl Mul<Transform> for Vector {
-//     type Output = Self;
-
-//     #[inline]
-//     fn mul(self, other: Transform) -> Self {
-//         Self(other.0.transform_vector3a(self.0))
-//     }
-// }
-
 impl Div<f32> for Vector {
    type Output = Self;

@ -132,12 +124,17 @@ impl Neg for Vector {
    }
 }

-// impl DotProduct for Vector {
-//     #[inline(always)]
-//     fn dot(self, other: Self) -> f32 {
-//         self.co.dot(other.co)
-//     }
-// }
+impl DotProduct for Vector {
+    #[inline(always)]
+    fn dot(self, other: Self) -> f32 {
+        Float4::dot_3(self.0, other.0)
+    }
+
+    #[inline(always)]
+    fn dot_fast(self, other: Self) -> f32 {
+        Float4::dot_3_fast(self.0, other.0)
+    }
+}

 // impl CrossProduct for Vector {
 //     #[inline]
--- a/sub_crates/rmath/src/wide4.rs
+++ b/sub_crates/rmath/src/wide4.rs
@ -1,9 +1,13 @@
 use std::ops::{AddAssign, DivAssign, MulAssign, SubAssign};

+use crate::{difference_of_products, two_prod, two_sum};
+
 pub use fallback::Float4;
 mod fallback {
    use std::ops::{Add, Div, Mul, Neg, Sub};

+    use crate::FMulAdd;
+
    #[allow(non_camel_case_types)]
    #[derive(Debug, Copy, Clone)]
    #[repr(C, align(16))]
@ -81,31 +85,6 @@ mod fallback {
        //     a.max(b)
        // }

-        //-----------------------------------------------------
-        // For matrix stuff.
-
-        #[inline(always)]
-        pub fn transpose_3x3(m: [Self; 3]) -> [Self; 3] {
-            [
-                // The fourth component in each row below is arbitrary,
-                // but in this case chosen so that it matches the
-                // behavior of the SSE version of transpose_3x3.
-                Self::new(m[0].a(), m[1].a(), m[2].a(), m[2].d()),
-                Self::new(m[0].b(), m[1].b(), m[2].b(), m[2].d()),
-                Self::new(m[0].c(), m[1].c(), m[2].c(), m[2].d()),
-            ]
-        }
-
-        #[inline]
-        pub fn invert_3x3(_m: [Self; 3]) -> [Self; 3] {
-            todo!()
-        }
-
-        #[inline]
-        pub fn invert_3x3_precise(_m: [Self; 3]) -> [Self; 3] {
-            todo!()
-        }
-
        //-----------------------------------------------------
        // Individual components.

@ -187,6 +166,24 @@ mod fallback {
            let d = self.n[3];
            Self { n: [d, d, d, d] }
        }
+
+        #[inline(always)]
+        pub fn bcad(self) -> Self {
+            let a = self.n[0];
+            let b = self.n[1];
+            let c = self.n[2];
+            let d = self.n[3];
+            Self { n: [b, c, a, d] }
+        }
+
+        #[inline(always)]
+        pub fn cabd(self) -> Self {
+            let a = self.n[0];
+            let b = self.n[1];
+            let c = self.n[2];
+            let d = self.n[3];
+            Self { n: [c, a, b, d] }
+        }
    }

    impl Add for Float4 {
@ -295,9 +292,168 @@ mod fallback {
            }
        }
    }
+
+    impl FMulAdd for Float4 {
+        fn fma(self, b: Self, c: Self) -> Self {
+            self.mul_add(b, c)
+        }
+    }
 }

 //-------------------------------------------------------------
+// Float4 impls that don't depend on its inner representation.
+
+impl Float4 {
+    /// 3D dot product (only uses the first 3 components).
+    #[inline(always)]
+    pub fn dot_3(a: Self, b: Self) -> f32 {
+        let (p, p_err) = two_prod(a, b);
+
+        // Products.
+        let (x, x_err) = (p.a(), p_err.a());
+        let (y, y_err) = (p.b(), p_err.b());
+        let (z, z_err) = (p.c(), p_err.c());
+
+        // Sums.
+        let (s1, s1_err) = two_sum(x, y);
+        let err1 = x_err + (y_err + s1_err);
+
+        let (s2, s2_err) = two_sum(s1, z);
+        let err2 = z_err + (err1 + s2_err);
+
+        // Final result with rounding error compensation.
+        s2 + err2
+    }
+
+    /// 3D dot product (only uses the first 3 components).
+    ///
+    /// Faster but less precise version.
+    #[inline(always)]
+    pub fn dot_3_fast(a: Self, b: Self) -> f32 {
+        let c = a * b;
+        c.a() + c.b() + c.c()
+    }
+
+    #[inline(always)]
+    pub fn transpose_3x3(m: [Self; 3]) -> [Self; 3] {
+        [
+            // The fourth component in each row below is arbitrary,
+            // but in this case chosen so that it matches the
+            // behavior of the SSE version of transpose_3x3.
+            Self::new(m[0].a(), m[1].a(), m[2].a(), m[2].d()),
+            Self::new(m[0].b(), m[1].b(), m[2].b(), m[2].d()),
+            Self::new(m[0].c(), m[1].c(), m[2].c(), m[2].d()),
+        ]
+    }
+
+    /// Invert a 3x3 matrix.
+    ///
+    /// Returns `None` if not invertible.
+    #[inline]
+    pub fn invert_3x3(m: [Self; 3]) -> Option<[Self; 3]> {
+        // let a = difference_of_products(m[1].b(), m[2].c(), m[1].c(), m[2].b());
+        // let b = difference_of_products(m[1].c(), m[2].a(), m[1].a(), m[2].c());
+        // let c = difference_of_products(m[1].a(), m[2].b(), m[1].b(), m[2].a());
+        // let d = difference_of_products(m[2].b(), m[0].c(), m[2].c(), m[0].b());
+        // let e = difference_of_products(m[2].c(), m[0].a(), m[2].a(), m[0].c());
+        // let f = difference_of_products(m[2].a(), m[0].b(), m[2].b(), m[0].a());
+        // let g = difference_of_products(m[0].b(), m[1].c(), m[0].c(), m[1].b());
+        // let h = difference_of_products(m[0].c(), m[1].a(), m[0].a(), m[1].c());
+        // let i = difference_of_products(m[0].a(), m[1].b(), m[0].b(), m[1].a());
+
+        let m0_bca = m[0].bcad();
+        let m1_bca = m[1].bcad();
+        let m2_bca = m[2].bcad();
+        let m0_cab = m[0].cabd();
+        let m1_cab = m[1].cabd();
+        let m2_cab = m[2].cabd();
+        let abc = difference_of_products(m1_bca, m2_cab, m1_cab, m2_bca);
+        let def = difference_of_products(m2_bca, m0_cab, m2_cab, m0_bca);
+        let ghi = difference_of_products(m0_bca, m1_cab, m0_cab, m1_bca);
+
+        // TODO: use precise inner product.
+        let det = Self::dot_3(
+            Self::new(abc.a(), def.a(), ghi.a(), 0.0),
+            Self::new(m[0].a(), m[1].a(), m[2].a(), 0.0),
+        );
+
+        if det == 0.0 {
+            None
+        } else {
+            Some(Self::transpose_3x3([abc / det, def / det, ghi / det]))
+        }
+    }
+
+    /// Invert a 3x3 matrix.  Faster but less precise version.
+    ///
+    /// Returns `None` if not invertible.
+    #[inline]
+    pub fn invert_3x3_fast(m: [Self; 3]) -> Option<[Self; 3]> {
+        // let a = (m[1].b() * m[2].c()) - (m[1].c() * m[2].b());
+        // let b = (m[1].c() * m[2].a()) - (m[1].a() * m[2].c());
+        // let c = (m[1].a() * m[2].b()) - (m[1].b() * m[2].a());
+        // let e = (m[2].c() * m[0].a()) - (m[2].a() * m[0].c());
+        // let f = (m[2].a() * m[0].b()) - (m[2].b() * m[0].a());
+        // let g = (m[0].b() * m[1].c()) - (m[0].c() * m[1].b());
+        // let h = (m[0].c() * m[1].a()) - (m[0].a() * m[1].c());
+        // let i = (m[0].a() * m[1].b()) - (m[0].b() * m[1].a());
+
+        let m0_bca = m[0].bcad();
+        let m1_bca = m[1].bcad();
+        let m2_bca = m[2].bcad();
+        let m0_cab = m[0].cabd();
+        let m1_cab = m[1].cabd();
+        let m2_cab = m[2].cabd();
+        let abc = (m1_bca * m2_cab) - (m1_cab * m2_bca);
+        let def = (m2_bca * m0_cab) - (m2_cab * m0_bca);
+        let ghi = (m0_bca * m1_cab) - (m0_cab * m1_bca);
+
+        let det = Self::dot_3(
+            Self::new(abc.a(), def.a(), ghi.a(), 0.0),
+            Self::new(m[0].a(), m[1].a(), m[2].a(), 0.0),
+        );
+
+        if det == 0.0 {
+            None
+        } else {
+            Some(Self::transpose_3x3([abc / det, def / det, ghi / det]))
+        }
+    }
+
+    /// Multiplies a 3D vector with a 3x3 matrix.
+    #[inline]
+    pub fn vec3_mul_3x3(_v: Self, _m: &[Self; 3]) -> Self {
+        todo!()
+    }
+
+    /// Multiplies a 3D vector with a 3x3 matrix.
+    ///
+    /// Faster but less precise version.
+    #[inline]
+    pub fn vec3_mul_3x3_fast(_v: Self, _m: &[Self; 3]) -> Self {
+        todo!()
+    }
+
+    /// Transforms a 3d point by an affine transform.
+    ///
+    /// `m` is the 3x3 part of the affine transform, `t` is the translation part.
+    #[inline]
+    pub fn pnt3_mul_affine(_p: Self, _m: &[Self; 3], _t: Self) -> Self {
+        todo!()
+    }
+
+    /// Transforms a 3d point by an affine transform, except it applies
+    /// the translation part before the 3x3 part.
+    ///
+    /// This is primarily useful for performing efficient inverse transforms by
+    /// passing an inverted 3x3 part and a negated translation part.
+    ///
+    /// `m` is the 3x3 part of the affine transform, `t` is the translation part.
+    #[inline]
+    pub fn pnt3_mul_affine_rev(_p: Self, _m: &[Self; 3], _t: Self) -> Self {
+        todo!()
+    }
+}

 impl AddAssign for Float4 {
    #[inline(always)]
--- a/sub_crates/rmath/src/xform.rs
+++ b/sub_crates/rmath/src/xform.rs
@ -96,22 +96,31 @@ impl Xform {
        eq
    }

-    /// Returns the dual transform, which can do inverse transforms.
+    /// Computes a "full" version of the transform, which can do both
+    /// forward and inverse transforms.
    #[inline]
-    pub fn compute_dual(self) -> XformFull {
+    pub fn compute_full(self) -> XformFull {
        XformFull {
            m: self.m,
-            m_inv: Float4::invert_3x3(self.m),
+            m_inv: Float4::invert_3x3(self.m).unwrap_or([
+                Float4::new(1.0, 0.0, 0.0, 0.0),
+                Float4::new(0.0, 1.0, 0.0, 0.0),
+                Float4::new(0.0, 0.0, 1.0, 0.0),
+            ]),
            t: self.t,
        }
    }

-    /// Slower but precise version of `compute_dual()`.
+    /// Faster but less precise version of `compute_full()`.
    #[inline]
-    pub fn compute_dual_precise(self) -> XformFull {
+    pub fn compute_full_fast(self) -> XformFull {
        XformFull {
            m: self.m,
-            m_inv: Float4::invert_3x3_precise(self.m),
+            m_inv: Float4::invert_3x3_fast(self.m).unwrap_or([
+                Float4::new(1.0, 0.0, 0.0, 0.0),
+                Float4::new(0.0, 1.0, 0.0, 0.0),
+                Float4::new(0.0, 0.0, 1.0, 0.0),
+            ]),
            t: self.t,
        }
    }