Convert Psychopath over to use new RMath library.

2022-07-15 21:42:35 -07:00 · 2022-07-15 21:42:35 -07:00 · 08e2e6eb06
commit 08e2e6eb06
parent a84da943d0
33 changed files with 427 additions and 304 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -177,12 +177,6 @@ dependencies = [
 "wasi",
 ]

-[[package]]
-name = "glam"
-version = "0.15.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "411e0584defa447c328f25c756ba3d0685727ecc126b46c3c1176001141cd4b6"
-
 [[package]]
 name = "half"
 version = "1.7.1"
@ -339,7 +333,6 @@ dependencies = [
 "copy_in_place",
 "crossbeam",
 "fastapprox",
- "glam",
 "half",
 "halton",
 "kioku",
@ -604,7 +597,7 @@ checksum = "26e3528b09b1f1b1e152342a4462d1e80d568dc5623a0772252a6e584a53d550"
 name = "spectral_upsampling"
 version = "0.1.0"
 dependencies = [
- "glam",
+ "rmath",
 ]

 [[package]]
--- a/Cargo.toml
+++ b/Cargo.toml
@ -35,7 +35,6 @@ png_encode_mini = "0.1.2"
 rustc-serialize = "0.3"
 scoped_threadpool = "0.1"
 time = "0.1"
-glam = "0.15"
 fastapprox = "0.3"

 # Local crate dependencies
--- a/psychoblend/assembly.py
+++ b/psychoblend/assembly.py
@ -314,7 +314,7 @@ class Instance:
        w.indent()
        w.write("Data [$%s]\n" % self.data_name)
        for mat in self.time_xforms:
-            w.write("Transform [%s]\n" % mat2str(mat.inverted()))
+            w.write("Transform [%s]\n" % mat2str(mat))
        for ms in self.ob.material_slots:
            if ms != None:
                w.write("SurfaceShaderBind [$%s]\n" % escape_name(ms.material.name))
--- a/psychoblend/world.py
+++ b/psychoblend/world.py
@ -82,7 +82,7 @@ class Camera:
        mat = self.ob.matrix_world.copy()
        matz = Matrix()
        matz[2][2] = -1
-        self.xforms += [mat * matz]
+        self.xforms += [(mat * matz).inverted()]

    def export(self, render_engine, w):
        render_engine.update_stats("", "Psychopath: Exporting %s" % self.ob.name)
--- a/src/accel/bvh4.rs
+++ b/src/accel/bvh4.rs
@ -6,7 +6,7 @@

 use std::mem::{transmute, MaybeUninit};

-use glam::BVec4A;
+use rmath::wide4::Bool4;

 use kioku::Arena;

@ -123,12 +123,12 @@ impl<'a> BVH4<'a> {
                    traversal_code,
                } => {
                    node_tests += ray_stack.ray_count_in_next_task() as u64;
-                    let mut all_hits = BVec4A::default();
+                    let mut all_hits = Bool4::new_false();

                    // Ray testing
                    ray_stack.pop_do_next_task_and_push_rays(children.len(), |ray_idx| {
                        if rays.is_done(ray_idx) {
-                            BVec4A::default()
+                            Bool4::new_false()
                        } else {
                            let hits = if bounds.len() == 1 {
                                bounds[0].intersect_ray(
--- a/src/bbox.rs
+++ b/src/bbox.rs
@ -7,7 +7,7 @@ use std::{

 use crate::{
    lerp::{lerp, lerp_slice, Lerp},
-    math::{fast_minf32, Point, Transform, Vector},
+    math::{fast_minf32, Point, Vector, Xform, XformFull},
 };

 const BBOX_MAXT_ADJUST: f32 = 1.000_000_24;
@ -41,12 +41,12 @@ impl BBox {
    // Returns whether the given ray intersects with the bbox.
    pub fn intersect_ray(&self, orig: Point, dir_inv: Vector, max_t: f32) -> bool {
        // Calculate slab intersections
-        let t1 = (self.min.co - orig.co) * dir_inv.co;
-        let t2 = (self.max.co - orig.co) * dir_inv.co;
+        let t1 = (self.min.0 - orig.0) * dir_inv.0;
+        let t2 = (self.max.0 - orig.0) * dir_inv.0;

        // Find the far and near intersection
-        let far_t = t1.max(t2).extend(std::f32::INFINITY);
-        let near_t = t1.min(t2).extend(0.0);
+        let far_t = t1.max(t2).set_d(std::f32::INFINITY);
+        let near_t = t1.min(t2).set_d(0.0);
        let far_hit_t = fast_minf32(far_t.min_element() * BBOX_MAXT_ADJUST, max_t);
        let near_hit_t = near_t.max_element();

@ -54,8 +54,10 @@ impl BBox {
        near_hit_t <= far_hit_t
    }

-    // Creates a new BBox transformed into a different space.
-    pub fn transformed(&self, xform: Transform) -> BBox {
+    // Creates a new BBox transformed from its local space to the
+    // given space.
+    #[must_use]
+    pub fn xform(&self, xform: &XformFull) -> BBox {
        // BBox corners
        let vs = [
            Point::new(self.min.x(), self.min.y(), self.min.z()),
@ -71,7 +73,7 @@ impl BBox {
        // Transform BBox corners and make new bbox
        let mut b = BBox::new();
        for v in &vs {
-            let v = *v * xform;
+            let v = v.xform(&xform);
            b.min = v.min(b.min);
            b.max = v.max(b.max);
        }
@ -103,12 +105,8 @@ impl BitOr for BBox {

    fn bitor(self, rhs: BBox) -> BBox {
        BBox::from_points(
-            Point {
-                co: self.min.co.min(rhs.min.co),
-            },
-            Point {
-                co: self.max.co.max(rhs.max.co),
-            },
+            Point(self.min.0.min(rhs.min.0)),
+            Point(self.max.0.max(rhs.max.0)),
        )
    }
 }
@ -124,14 +122,7 @@ impl BitOr<Point> for BBox {
    type Output = BBox;

    fn bitor(self, rhs: Point) -> BBox {
-        BBox::from_points(
-            Point {
-                co: self.min.co.min(rhs.co),
-            },
-            Point {
-                co: self.max.co.max(rhs.co),
-            },
-        )
+        BBox::from_points(Point(self.min.0.min(rhs.0)), Point(self.max.0.max(rhs.0)))
    }
 }

@ -150,7 +141,11 @@ impl Lerp for BBox {
    }
 }

-pub fn transform_bbox_slice_from(bbs_in: &[BBox], xforms: &[Transform], bbs_out: &mut Vec<BBox>) {
+pub fn transform_bbox_slice_from(
+    bbs_in: &[BBox],
+    xforms: &[Xform],
+    bbs_out: &mut Vec<BBox>,
+) -> Result<(), ()> {
    bbs_out.clear();

    // Transform the bounding boxes
@ -158,17 +153,19 @@ pub fn transform_bbox_slice_from(bbs_in: &[BBox], xforms: &[Transform], bbs_out:
        bbs_out.extend_from_slice(bbs_in);
    } else if bbs_in.len() == xforms.len() {
        for (bb, xf) in Iterator::zip(bbs_in.iter(), xforms.iter()) {
-            bbs_out.push(bb.transformed(xf.inverse()));
+            bbs_out.push(bb.xform(&xf.into_full().ok_or(())?));
        }
    } else if bbs_in.len() > xforms.len() {
        let s = (bbs_in.len() - 1) as f32;
        for (i, bb) in bbs_in.iter().enumerate() {
-            bbs_out.push(bb.transformed(lerp_slice(xforms, i as f32 / s).inverse()));
+            bbs_out.push(bb.xform(&lerp_slice(xforms, i as f32 / s).into_full().ok_or(())?));
        }
    } else if bbs_in.len() < xforms.len() {
        let s = (xforms.len() - 1) as f32;
        for (i, xf) in xforms.iter().enumerate() {
-            bbs_out.push(lerp_slice(bbs_in, i as f32 / s).transformed(xf.inverse()));
+            bbs_out.push(lerp_slice(bbs_in, i as f32 / s).xform(&xf.into_full().ok_or(())?));
        }
    }
+
+    Ok(())
 }
--- a/src/bbox4.rs
+++ b/src/bbox4.rs
@ -9,16 +9,16 @@ use crate::{
    math::{Point, Vector},
 };

-use glam::{BVec4A, Vec4};
+use rmath::wide4::{Bool4, Float4};

 const BBOX_MAXT_ADJUST: f32 = 1.000_000_24;

 /// A SIMD set of 4 3D axis-aligned bounding boxes.
 #[derive(Debug, Copy, Clone)]
 pub struct BBox4 {
-    pub x: (Vec4, Vec4), // (min, max)
-    pub y: (Vec4, Vec4), // (min, max)
-    pub z: (Vec4, Vec4), // (min, max)
+    pub x: (Float4, Float4), // (min, max)
+    pub y: (Float4, Float4), // (min, max)
+    pub z: (Float4, Float4), // (min, max)
 }

 impl BBox4 {
@ -26,16 +26,16 @@ impl BBox4 {
    pub fn new() -> BBox4 {
        BBox4 {
            x: (
-                Vec4::splat(std::f32::INFINITY),
-                Vec4::splat(std::f32::NEG_INFINITY),
+                Float4::splat(std::f32::INFINITY),
+                Float4::splat(std::f32::NEG_INFINITY),
            ),
            y: (
-                Vec4::splat(std::f32::INFINITY),
-                Vec4::splat(std::f32::NEG_INFINITY),
+                Float4::splat(std::f32::INFINITY),
+                Float4::splat(std::f32::NEG_INFINITY),
            ),
            z: (
-                Vec4::splat(std::f32::INFINITY),
-                Vec4::splat(std::f32::NEG_INFINITY),
+                Float4::splat(std::f32::INFINITY),
+                Float4::splat(std::f32::NEG_INFINITY),
            ),
        }
    }
@ -45,30 +45,30 @@ impl BBox4 {
    pub fn from_bboxes(b1: BBox, b2: BBox, b3: BBox, b4: BBox) -> BBox4 {
        BBox4 {
            x: (
-                Vec4::new(b1.min.x(), b2.min.x(), b3.min.x(), b4.min.x()),
-                Vec4::new(b1.max.x(), b2.max.x(), b3.max.x(), b4.max.x()),
+                Float4::new(b1.min.x(), b2.min.x(), b3.min.x(), b4.min.x()),
+                Float4::new(b1.max.x(), b2.max.x(), b3.max.x(), b4.max.x()),
            ),
            y: (
-                Vec4::new(b1.min.y(), b2.min.y(), b3.min.y(), b4.min.y()),
-                Vec4::new(b1.max.y(), b2.max.y(), b3.max.y(), b4.max.y()),
+                Float4::new(b1.min.y(), b2.min.y(), b3.min.y(), b4.min.y()),
+                Float4::new(b1.max.y(), b2.max.y(), b3.max.y(), b4.max.y()),
            ),
            z: (
-                Vec4::new(b1.min.z(), b2.min.z(), b3.min.z(), b4.min.z()),
-                Vec4::new(b1.max.z(), b2.max.z(), b3.max.z(), b4.max.z()),
+                Float4::new(b1.min.z(), b2.min.z(), b3.min.z(), b4.min.z()),
+                Float4::new(b1.max.z(), b2.max.z(), b3.max.z(), b4.max.z()),
            ),
        }
    }

    // Returns whether the given ray intersects with the bboxes.
-    pub fn intersect_ray(&self, orig: Point, dir_inv: Vector, max_t: f32) -> BVec4A {
+    pub fn intersect_ray(&self, orig: Point, dir_inv: Vector, max_t: f32) -> Bool4 {
        // Get the ray data into SIMD format.
-        let ro_x = Vec4::splat(orig.co[0]);
-        let ro_y = Vec4::splat(orig.co[1]);
-        let ro_z = Vec4::splat(orig.co[2]);
-        let rdi_x = Vec4::splat(dir_inv.co[0]);
-        let rdi_y = Vec4::splat(dir_inv.co[1]);
-        let rdi_z = Vec4::splat(dir_inv.co[2]);
-        let max_t = Vec4::splat(max_t);
+        let ro_x = orig.0.aaaa();
+        let ro_y = orig.0.bbbb();
+        let ro_z = orig.0.cccc();
+        let rdi_x = dir_inv.0.aaaa();
+        let rdi_y = dir_inv.0.bbbb();
+        let rdi_z = dir_inv.0.cccc();
+        let max_t = Float4::splat(max_t);

        // Slab tests
        let t1_x = (self.x.0 - ro_x) * rdi_x;
@ -87,10 +87,11 @@ impl BBox4 {
        let t_near_z = t1_z.min(t2_z);

        // Calculate over-all far t hit.
-        let far_t = (t_far_x.min(t_far_y.min(t_far_z)) * Vec4::splat(BBOX_MAXT_ADJUST)).min(max_t);
+        let far_t =
+            (t_far_x.min(t_far_y.min(t_far_z)) * Float4::splat(BBOX_MAXT_ADJUST)).min(max_t);

        // Calculate over-all near t hit.
-        let near_t = t_near_x.max(t_near_y).max(t_near_z.max(Vec4::splat(0.0)));
+        let near_t = t_near_x.max(t_near_y).max(t_near_z.max(Float4::splat(0.0)));

        // Hit results
        near_t.cmplt(far_t)
--- a/src/camera.rs
+++ b/src/camera.rs
@ -4,14 +4,14 @@ use kioku::Arena;

 use crate::{
    lerp::lerp_slice,
-    math::{Point, Transform, Vector},
+    math::{Point, Vector, Xform},
    ray::Ray,
    sampling::square_to_circle,
 };

 #[derive(Copy, Clone, Debug)]
 pub struct Camera<'a> {
-    transforms: &'a [Transform],
+    transforms: &'a [Xform],
    fovs: &'a [f32],
    tfovs: &'a [f32],
    aperture_radii: &'a [f32],
@ -21,7 +21,7 @@ pub struct Camera<'a> {
 impl<'a> Camera<'a> {
    pub fn new(
        arena: &'a Arena,
-        transforms: &[Transform],
+        transforms: &[Xform],
        fovs: &[f32],
        mut aperture_radii: &[f32],
        mut focus_distances: &[f32],
@ -73,7 +73,7 @@ impl<'a> Camera<'a> {

    pub fn generate_ray(&self, x: f32, y: f32, time: f32, wavelength: f32, u: f32, v: f32) -> Ray {
        // Get time-interpolated camera settings
-        let transform = lerp_slice(self.transforms, time);
+        let transform = lerp_slice(self.transforms, time).into_full_fast().unwrap();
        let tfov = lerp_slice(self.tfovs, time);
        let aperture_radius = lerp_slice(self.aperture_radii, time);
        let focus_distance = lerp_slice(self.focus_distances, time);
@ -93,8 +93,8 @@ impl<'a> Camera<'a> {
        .normalized();

        Ray {
-            orig: orig * transform,
-            dir: dir * transform,
+            orig: orig.xform_inv_fast(&transform),
+            dir: dir.xform_inv_fast(&transform),
            time: time,
            wavelength: wavelength,
            max_t: std::f32::INFINITY,
--- a/src/color.rs
+++ b/src/color.rs
@ -1,11 +1,11 @@
 use std::ops::{Add, AddAssign, Div, DivAssign, Mul, MulAssign};

+use crate::math::Float4;
 pub use color::{
    rec709_e_to_xyz, rec709_to_xyz, xyz_to_aces_ap0, xyz_to_aces_ap0_e, xyz_to_rec709,
    xyz_to_rec709_e,
 };
 use compact::fluv::fluv32;
-use glam::Vec4;
 use half::f16;
 use spectral_upsampling::meng::{spectrum_xyz_to_p_4, EQUAL_ENERGY_REFLECTANCE};

@ -31,10 +31,10 @@ fn nth_wavelength(hero_wavelength: f32, n: usize) -> f32 {
    }
 }

-/// Returns all wavelengths of a hero wavelength set as a Vec4
+/// Returns all wavelengths of a hero wavelength set as a Float4
 #[inline(always)]
-fn wavelengths(hero_wavelength: f32) -> Vec4 {
-    Vec4::new(
+fn wavelengths(hero_wavelength: f32) -> Float4 {
+    Float4::new(
        nth_wavelength(hero_wavelength, 0),
        nth_wavelength(hero_wavelength, 1),
        nth_wavelength(hero_wavelength, 2),
@ -94,7 +94,7 @@ impl Color {
            } => {
                SpectralSample::from_parts(
                    // TODO: make this SIMD
-                    Vec4::new(
+                    Float4::new(
                        plancks_law(temperature, wls[0]) * factor,
                        plancks_law(temperature, wls[1]) * factor,
                        plancks_law(temperature, wls[2]) * factor,
@ -109,7 +109,7 @@ impl Color {
            } => {
                SpectralSample::from_parts(
                    // TODO: make this SIMD
-                    Vec4::new(
+                    Float4::new(
                        plancks_law_normalized(temperature, wls[0]) * factor,
                        plancks_law_normalized(temperature, wls[1]) * factor,
                        plancks_law_normalized(temperature, wls[2]) * factor,
@ -386,7 +386,7 @@ fn plancks_law_normalized(temperature: f32, wavelength: f32) -> f32 {

 #[derive(Copy, Clone, Debug)]
 pub struct SpectralSample {
-    pub e: Vec4,
+    pub e: Float4,
    hero_wavelength: f32,
 }

@ -394,7 +394,7 @@ impl SpectralSample {
    pub fn new(wavelength: f32) -> SpectralSample {
        debug_assert!(wavelength >= WL_MIN && wavelength <= WL_MAX);
        SpectralSample {
-            e: Vec4::splat(0.0),
+            e: Float4::splat(0.0),
            hero_wavelength: wavelength,
        }
    }
@ -403,12 +403,12 @@ impl SpectralSample {
    pub fn from_value(value: f32, wavelength: f32) -> SpectralSample {
        debug_assert!(wavelength >= WL_MIN && wavelength <= WL_MAX);
        SpectralSample {
-            e: Vec4::splat(value),
+            e: Float4::splat(value),
            hero_wavelength: wavelength,
        }
    }

-    pub fn from_parts(e: Vec4, wavelength: f32) -> SpectralSample {
+    pub fn from_parts(e: Float4, wavelength: f32) -> SpectralSample {
        debug_assert!(wavelength >= WL_MIN && wavelength <= WL_MAX);
        SpectralSample {
            e: e,
@ -599,8 +599,8 @@ impl DivAssign<f32> for XYZ {
 /// the method in the paper "Physically Meaningful Rendering using Tristimulus
 /// Colours" by Meng et al.
 #[inline(always)]
-fn xyz_to_spectrum_4(xyz: (f32, f32, f32), wavelengths: Vec4) -> Vec4 {
-    spectrum_xyz_to_p_4(wavelengths, xyz) * Vec4::splat(1.0 / EQUAL_ENERGY_REFLECTANCE)
+fn xyz_to_spectrum_4(xyz: (f32, f32, f32), wavelengths: Float4) -> Float4 {
+    spectrum_xyz_to_p_4(wavelengths, xyz) * Float4::splat(1.0 / EQUAL_ENERGY_REFLECTANCE)
    // aces_to_spectrum_p4(wavelengths, xyz_to_aces_ap0_e(xyz))
 }

--- a/src/lerp.rs
+++ b/src/lerp.rs
@ -1,6 +1,6 @@
 #![allow(dead_code)]

-use math3d::{Normal, Point, Transform, Vector};
+use rmath::{wide4::Float4, Normal, Point, Vector, Xform};

 /// Trait for allowing a type to be linearly interpolated.
 pub trait Lerp: Copy {
@ -100,36 +100,34 @@ impl<T: Lerp> Lerp for [T; 4] {
    }
 }

-impl Lerp for glam::Vec4 {
-    fn lerp(self, other: glam::Vec4, alpha: f32) -> glam::Vec4 {
+impl Lerp for Float4 {
+    fn lerp(self, other: Self, alpha: f32) -> Self {
        (self * (1.0 - alpha)) + (other * alpha)
    }
 }

-impl Lerp for Transform {
-    fn lerp(self, other: Transform, alpha: f32) -> Transform {
+impl Lerp for Xform {
+    fn lerp(self, other: Self, alpha: f32) -> Self {
        (self * (1.0 - alpha)) + (other * alpha)
    }
 }

 impl Lerp for Normal {
-    fn lerp(self, other: Normal, alpha: f32) -> Normal {
+    fn lerp(self, other: Self, alpha: f32) -> Self {
        (self * (1.0 - alpha)) + (other * alpha)
    }
 }

 impl Lerp for Point {
-    fn lerp(self, other: Point, alpha: f32) -> Point {
-        let s = self;
-        let o = other;
-        Point {
-            co: (s.co * (1.0 - alpha)) + (o.co * alpha),
-        }
+    fn lerp(self, other: Self, alpha: f32) -> Self {
+        let a = self.0;
+        let b = other.0;
+        Point((a * (1.0 - alpha)) + (b * alpha))
    }
 }

 impl Lerp for Vector {
-    fn lerp(self, other: Vector, alpha: f32) -> Vector {
+    fn lerp(self, other: Self, alpha: f32) -> Self {
        (self * (1.0 - alpha)) + (other * alpha)
    }
 }
--- a/src/light/mod.rs
+++ b/src/light/mod.rs
@ -6,7 +6,7 @@ use std::fmt::Debug;

 use crate::{
    color::SpectralSample,
-    math::{Normal, Point, Transform, Vector},
+    math::{Normal, Point, Vector, XformFull},
    surface::Surface,
 };

@ -34,7 +34,7 @@ pub trait SurfaceLight: Surface {
    /// - The pdf of the sample.
    fn sample_from_point(
        &self,
-        space: &Transform,
+        space: &XformFull,
        arr: Point,
        u: f32,
        v: f32,
--- a/src/light/rectangle_light.rs
+++ b/src/light/rectangle_light.rs
@ -5,7 +5,7 @@ use crate::{
    boundable::Boundable,
    color::{Color, SpectralSample},
    lerp::lerp_slice,
-    math::{cross, dot, Normal, Point, Transform, Vector},
+    math::{cross, dot, Normal, Point, Vector, Xform, XformFull},
    ray::{RayBatch, RayStack},
    sampling::{
        spherical_triangle_solid_angle, triangle_surface_area, uniform_sample_spherical_triangle,
@ -51,7 +51,7 @@ impl<'a> RectangleLight<'a> {
    // more efficiently by inlining it there.
    fn sample_pdf(
        &self,
-        space: &Transform,
+        space: &XformFull,
        arr: Point,
        sample_dir: Vector,
        hit_point: Point,
@ -64,11 +64,10 @@ impl<'a> RectangleLight<'a> {
        let dim = lerp_slice(self.dimensions, time);

        // Get the four corners of the rectangle, transformed into world space
-        let space_inv = space.inverse();
-        let p1 = Point::new(dim.0 * 0.5, dim.1 * 0.5, 0.0) * space_inv;
-        let p2 = Point::new(dim.0 * -0.5, dim.1 * 0.5, 0.0) * space_inv;
-        let p3 = Point::new(dim.0 * -0.5, dim.1 * -0.5, 0.0) * space_inv;
-        let p4 = Point::new(dim.0 * 0.5, dim.1 * -0.5, 0.0) * space_inv;
+        let p1 = Point::new(dim.0 * 0.5, dim.1 * 0.5, 0.0).xform(space);
+        let p2 = Point::new(dim.0 * -0.5, dim.1 * 0.5, 0.0).xform(space);
+        let p3 = Point::new(dim.0 * -0.5, dim.1 * -0.5, 0.0).xform(space);
+        let p4 = Point::new(dim.0 * 0.5, dim.1 * -0.5, 0.0).xform(space);

        // Get the four corners of the rectangle, projected on to the unit
        // sphere centered around arr.
@ -82,7 +81,7 @@ impl<'a> RectangleLight<'a> {
        let area_2 = spherical_triangle_solid_angle(sp4, sp1, sp3);

        // World-space surface normal
-        let normal = Normal::new(0.0, 0.0, 1.0) * space_inv;
+        let normal = Normal::new(0.0, 0.0, 1.0).xform(space);

        // PDF
        if (area_1 + area_2) < SIMPLE_SAMPLING_THRESHOLD {
@ -97,7 +96,7 @@ impl<'a> RectangleLight<'a> {

    // fn outgoing(
    //     &self,
-    //     space: &Transform,
+    //     space: &XformFull,
    //     dir: Vector,
    //     u: f32,
    //     v: f32,
@ -120,7 +119,7 @@ impl<'a> RectangleLight<'a> {
 impl<'a> SurfaceLight for RectangleLight<'a> {
    fn sample_from_point(
        &self,
-        space: &Transform,
+        space: &XformFull,
        arr: Point,
        u: f32,
        v: f32,
@ -135,11 +134,10 @@ impl<'a> SurfaceLight for RectangleLight<'a> {
        let surface_area_inv: f64 = 1.0 / surface_area;

        // Get the four corners of the rectangle, transformed into world space
-        let space_inv = space.inverse();
-        let p1 = Point::new(dim.0 * 0.5, dim.1 * 0.5, 0.0) * space_inv;
-        let p2 = Point::new(dim.0 * -0.5, dim.1 * 0.5, 0.0) * space_inv;
-        let p3 = Point::new(dim.0 * -0.5, dim.1 * -0.5, 0.0) * space_inv;
-        let p4 = Point::new(dim.0 * 0.5, dim.1 * -0.5, 0.0) * space_inv;
+        let p1 = Point::new(dim.0 * 0.5, dim.1 * 0.5, 0.0).xform(space);
+        let p2 = Point::new(dim.0 * -0.5, dim.1 * 0.5, 0.0).xform(space);
+        let p3 = Point::new(dim.0 * -0.5, dim.1 * -0.5, 0.0).xform(space);
+        let p4 = Point::new(dim.0 * 0.5, dim.1 * -0.5, 0.0).xform(space);

        // Get the four corners of the rectangle relative to arr.
        let lp1 = p1 - arr;
@ -158,7 +156,7 @@ impl<'a> SurfaceLight for RectangleLight<'a> {
        let area_2 = spherical_triangle_solid_angle(sp4, sp1, sp3);

        // Calculate world-space surface normal
-        let normal = Normal::new(0.0, 0.0, 1.0) * space_inv;
+        let normal = Normal::new(0.0, 0.0, 1.0).xform(space);

        if (area_1 + area_2) < SIMPLE_SAMPLING_THRESHOLD {
            // Simple sampling for more distant lights
@ -215,18 +213,16 @@ impl<'a> SurfaceLight for RectangleLight<'a> {
            };

            // Project shadow_vec back onto the light's surface
-            let arr_local = arr * *space;
-            let shadow_vec_local = shadow_vec * *space;
+            let arr_local = arr.xform_inv(space);
+            let shadow_vec_local = shadow_vec.xform_inv(space);
            let shadow_vec_local = shadow_vec_local * (-arr_local.z() / shadow_vec_local.z());
            let mut sample_point_local = arr_local + shadow_vec_local;
            {
                let x = sample_point_local.x().max(dim.0 * -0.5).min(dim.0 * 0.5);
                let y = sample_point_local.y().max(dim.1 * -0.5).min(dim.1 * 0.5);
-                sample_point_local.set_x(x);
-                sample_point_local.set_y(y);
-                sample_point_local.set_z(0.0);
+                sample_point_local = Point::new(x, y, 0.0);
            }
-            let sample_point = sample_point_local * space_inv;
+            let sample_point = sample_point_local.xform(space);
            let point_err = 0.0001; // TODO: this is a hack, do properly.

            // Calculate pdf and light energy
@ -261,7 +257,7 @@ impl<'a> Surface for RectangleLight<'a> {
        ray_stack: &mut RayStack,
        isects: &mut [SurfaceIntersection],
        shader: &dyn SurfaceShader,
-        space: &[Transform],
+        space: &[Xform],
    ) {
        let _ = shader; // Silence 'unused' warning

@ -275,13 +271,17 @@ impl<'a> Surface for RectangleLight<'a> {
            let dim = lerp_slice(self.dimensions, time);
            let xform = lerp_slice(space, time);

-            let space_inv = xform.inverse();
+            let space = if let Some(xform) = xform.into_full() {
+                xform
+            } else {
+                return;
+            };

            // Get the four corners of the rectangle, transformed into world space
-            let p1 = Point::new(dim.0 * 0.5, dim.1 * 0.5, 0.0) * space_inv;
-            let p2 = Point::new(dim.0 * -0.5, dim.1 * 0.5, 0.0) * space_inv;
-            let p3 = Point::new(dim.0 * -0.5, dim.1 * -0.5, 0.0) * space_inv;
-            let p4 = Point::new(dim.0 * 0.5, dim.1 * -0.5, 0.0) * space_inv;
+            let p1 = Point::new(dim.0 * 0.5, dim.1 * 0.5, 0.0).xform(&space);
+            let p2 = Point::new(dim.0 * -0.5, dim.1 * 0.5, 0.0).xform(&space);
+            let p3 = Point::new(dim.0 * -0.5, dim.1 * -0.5, 0.0).xform(&space);
+            let p4 = Point::new(dim.0 * 0.5, dim.1 * -0.5, 0.0).xform(&space);

            // Test against two triangles that make up the light
            let ray_pre = triangle::RayTriPrecompute::new(dir);
@ -302,9 +302,9 @@ impl<'a> Surface for RectangleLight<'a> {
                                pos_err: pos_err,
                                nor: normal,
                                nor_g: normal,
-                                local_space: xform,
+                                local_space: space,
                                sample_pdf: self.sample_pdf(
-                                    &xform,
+                                    &space,
                                    orig,
                                    dir,
                                    pos,
--- a/src/light/sphere_light.rs
+++ b/src/light/sphere_light.rs
@ -7,7 +7,7 @@ use crate::{
    boundable::Boundable,
    color::{Color, SpectralSample},
    lerp::lerp_slice,
-    math::{coordinate_system_from_vector, dot, Normal, Point, Transform, Vector},
+    math::{coordinate_system_from_vector, dot, Normal, Point, Vector, Xform, XformFull},
    ray::{RayBatch, RayStack},
    sampling::{uniform_sample_cone, uniform_sample_cone_pdf, uniform_sample_sphere},
    shading::surface_closure::SurfaceClosure,
@ -50,7 +50,7 @@ impl<'a> SphereLight<'a> {
    // more efficiently by inlining it there.
    fn sample_pdf(
        &self,
-        space: &Transform,
+        space: &XformFull,
        arr: Point,
        sample_dir: Vector,
        sample_u: f32,
@ -61,7 +61,7 @@ impl<'a> SphereLight<'a> {
        // We're not using these, silence warnings
        let _ = (sample_dir, sample_u, sample_v, wavelength);

-        let arr = arr * *space;
+        let arr = arr.xform_inv(space);
        let pos = Point::new(0.0, 0.0, 0.0);
        let radius: f64 = lerp_slice(self.radii, time) as f64;

@ -84,7 +84,7 @@ impl<'a> SphereLight<'a> {
 impl<'a> SurfaceLight for SphereLight<'a> {
    fn sample_from_point(
        &self,
-        space: &Transform,
+        space: &XformFull,
        arr: Point,
        u: f32,
        v: f32,
@ -92,12 +92,9 @@ impl<'a> SurfaceLight for SphereLight<'a> {
        time: f32,
    ) -> (SpectralSample, (Point, Normal, f32), f32) {
        // TODO: track fp error due to transforms
-        let arr = arr * *space;
+        let arr = arr.xform_inv(space);
        let pos = Point::new(0.0, 0.0, 0.0);

-        // Precalculate local->world space transform matrix
-        let inv_space = space.inverse();
-
        // Calculate time interpolated values
        let radius: f64 = lerp_slice(self.radii, time) as f64;
        let col = lerp_slice(self.colors, time);
@ -115,7 +112,7 @@ impl<'a> SurfaceLight for SphereLight<'a> {
        // TODO: do this properly.  This is a total hack.
        let sample_point_err = {
            let v = Vector::new(radius as f32, radius as f32, radius as f32);
-            let v2 = v * inv_space;
+            let v2 = v.xform(space);
            v2.length() * SAMPLE_POINT_FUDGE
        };

@ -159,8 +156,8 @@ impl<'a> SurfaceLight for SphereLight<'a> {
                let normal = (arr + sample_vec).into_vector().normalized();
                let point = normal * radius as f32;
                (
-                    point.into_point() * inv_space,
-                    normal.into_normal() * inv_space,
+                    point.into_point().xform(space),
+                    normal.into_normal().xform(space),
                )
            };
            let pdf = uniform_sample_cone_pdf(cos_theta_max);
@ -177,8 +174,8 @@ impl<'a> SurfaceLight for SphereLight<'a> {
                let normal = (arr + sample_vec).into_vector().normalized();
                let point = normal * radius as f32;
                (
-                    point.into_point() * inv_space,
-                    normal.into_normal() * inv_space,
+                    point.into_point().xform(space),
+                    normal.into_normal().xform(space),
                )
            };
            let pdf = 1.0 / (4.0 * PI_64);
@ -210,7 +207,7 @@ impl<'a> Surface for SphereLight<'a> {
        ray_stack: &mut RayStack,
        isects: &mut [SurfaceIntersection],
        shader: &dyn SurfaceShader,
-        space: &[Transform],
+        space: &[Xform],
    ) {
        let _ = shader; // Silence 'unused' warning

@ -218,14 +215,18 @@ impl<'a> Surface for SphereLight<'a> {
            let time = rays.time(ray_idx);

            // Get the transform space
-            let xform = lerp_slice(space, time);
+            let xform = if let Some(xform) = lerp_slice(space, time).into_full() {
+                xform
+            } else {
+                return;
+            };

            // Get the radius of the sphere at the ray's time
            let radius = lerp_slice(self.radii, time); // Radius of the sphere

            // Get the ray origin and direction in local space
            let orig = rays.orig_local(ray_idx).into_vector();
-            let dir = rays.dir(ray_idx) * xform;
+            let dir = rays.dir(ray_idx).xform_inv(&xform);

            // Code adapted to Rust from https://github.com/Tecla/Rayito
            // Ray-sphere intersection can result in either zero, one or two points
@ -286,18 +287,16 @@ impl<'a> Surface for SphereLight<'a> {
                isects[ray_idx] = SurfaceIntersection::Occlude;
                rays.mark_done(ray_idx);
            } else {
-                let inv_xform = xform.inverse();
-
                // Position is calculated from the local-space ray and t, and then
                // re-projected onto the surface of the sphere.
                let t_pos = orig + (dir * t);
                let unit_pos = t_pos.normalized();
-                let pos = (unit_pos * radius * inv_xform).into_point();
+                let pos = (unit_pos * radius).xform(&xform).into_point();

                // TODO: proper error bounds.
                let pos_err = 0.001;

-                let normal = unit_pos.into_normal() * inv_xform;
+                let normal = unit_pos.into_normal().xform(&xform);

                let intersection_data = SurfaceIntersectionData {
                    incoming: rays.dir(ray_idx),
--- a/src/math.rs
+++ b/src/math.rs
@ -2,7 +2,9 @@

 use std::f32;

-pub use math3d::{cross, dot, CrossProduct, DotProduct, Normal, Point, Transform, Vector};
+pub use rmath::{
+    cross, dot, wide4::Float4, CrossProduct, DotProduct, Normal, Point, Vector, Xform, XformFull,
+};

 /// Clamps a value between a min and max.
 pub fn clamp<T: PartialOrd>(v: T, lower: T, upper: T) -> T {
--- a/src/parse/psy.rs
+++ b/src/parse/psy.rs
@ -10,7 +10,7 @@ use crate::{
    camera::Camera,
    color::{rec709_e_to_xyz, Color},
    light::WorldLightSource,
-    math::Transform,
+    math::Xform,
    renderer::Renderer,
    scene::Scene,
    scene::World,
@ -553,17 +553,17 @@ fn parse_world<'a>(arena: &'a Arena, tree: &'a DataTree) -> Result<World<'a>, Ps
    }
 }

-pub fn parse_matrix(contents: &str) -> Result<Transform, PsyParseError> {
+pub fn parse_matrix(contents: &str) -> Result<Xform, PsyParseError> {
    if let IResult::Ok((leftover, ns)) = all_consuming(tuple((
        ws_f32, ws_f32, ws_f32, ws_f32, ws_f32, ws_f32, ws_f32, ws_f32, ws_f32, ws_f32, ws_f32,
        ws_f32, ws_f32, ws_f32, ws_f32, ws_f32,
    )))(contents)
    {
        if leftover.is_empty() {
-            return Ok(Transform::new_from_values(
+            return Ok(Xform::new(
                // We throw away the last row, since it's not necessarily affine.
                // TODO: is there a more correct way to handle this?
-                ns.0, ns.4, ns.8, ns.12, ns.1, ns.5, ns.9, ns.13, ns.2, ns.6, ns.10, ns.14,
+                ns.0, ns.1, ns.2, ns.4, ns.5, ns.6, ns.8, ns.9, ns.10, ns.12, ns.13, ns.14,
            ));
        }
    }
--- a/src/ray.rs
+++ b/src/ray.rs
@ -1,8 +1,8 @@
 #![allow(dead_code)]

-use glam::BVec4A;
+use rmath::wide4::Bool4;

-use crate::math::{Point, Transform, Vector};
+use crate::math::{Point, Vector, XformFull};

 type RayIndexType = u16;
 type FlagType = u8;
@ -85,9 +85,7 @@ impl RayBatch {

    pub fn set_from_ray(&mut self, ray: &Ray, is_occlusion: bool, idx: usize) {
        self.hot[idx].orig_local = ray.orig;
-        self.hot[idx].dir_inv_local = Vector {
-            co: ray.dir.co.recip(),
-        };
+        self.hot[idx].dir_inv_local = Vector(ray.dir.0.recip());
        self.hot[idx].max_t = ray.max_t;
        self.hot[idx].time = ray.time;
        self.hot[idx].flags = if is_occlusion { OCCLUSION_FLAG } else { 0 };
@ -115,15 +113,13 @@ impl RayBatch {
    }

    /// Updates the accel data of the given ray (at index `idx`) with the
-    /// given world-to-local-space transform matrix.
+    /// given transform.
    ///
    /// This should be called when entering (and exiting) traversal of a
    /// new transform space.
-    pub fn update_local(&mut self, idx: usize, xform: &Transform) {
-        self.hot[idx].orig_local = self.cold[idx].orig * *xform;
-        self.hot[idx].dir_inv_local = Vector {
-            co: (self.cold[idx].dir * *xform).co.recip(),
-        };
+    pub fn update_local(&mut self, idx: usize, xform: &XformFull) {
+        self.hot[idx].orig_local = self.cold[idx].orig.xform_inv(xform);
+        self.hot[idx].dir_inv_local = Vector((self.cold[idx].dir.xform_inv(xform)).0.recip());
    }

    //==========================================================
@ -349,7 +345,7 @@ impl RayStack {
    /// indicated lanes.
    pub fn pop_do_next_task_and_push_rays<F>(&mut self, output_lane_count: usize, mut handle_ray: F)
    where
-        F: FnMut(usize) -> BVec4A,
+        F: FnMut(usize) -> Bool4,
    {
        // Pop the task and do necessary bookkeeping.
        let task = self.tasks.pop().unwrap();
--- a/src/renderer.rs
+++ b/src/renderer.rs
@ -9,8 +9,6 @@ use std::{
 use crossbeam::sync::MsQueue;
 use scoped_threadpool::Pool;

-use glam::Vec4;
-
 use crate::{
    accel::ACCEL_NODE_RAY_TESTS,
    color::{map_0_1_to_wavelength, SpectralSample, XYZ},
@ -18,7 +16,7 @@ use crate::{
    hash::hash_u32,
    hilbert,
    image::Image,
-    math::{probit, upper_power_of_two},
+    math::{probit, upper_power_of_two, Float4},
    mis::power_heuristic,
    ray::{Ray, RayBatch},
    scene::{Scene, SceneLightSample},
@ -379,12 +377,12 @@ pub struct LightPath {
    wavelength: f32,

    next_bounce_ray: Option<Ray>,
-    next_attenuation_fac: Vec4,
+    next_attenuation_fac: Float4,

    closure_sample_pdf: f32,
-    light_attenuation: Vec4,
-    pending_color_addition: Vec4,
-    color: Vec4,
+    light_attenuation: Float4,
+    pending_color_addition: Float4,
+    color: Float4,
 }

 #[allow(clippy::new_ret_no_self)]
@ -412,12 +410,12 @@ impl LightPath {
                wavelength: wavelength,

                next_bounce_ray: None,
-                next_attenuation_fac: Vec4::splat(1.0),
+                next_attenuation_fac: Float4::splat(1.0),

                closure_sample_pdf: 1.0,
-                light_attenuation: Vec4::splat(1.0),
-                pending_color_addition: Vec4::splat(0.0),
-                color: Vec4::splat(0.0),
+                light_attenuation: Float4::splat(1.0),
+                pending_color_addition: Float4::splat(0.0),
+                color: Float4::splat(0.0),
            },
            scene.camera.generate_ray(
                image_plane_co.0,
--- a/src/scene/assembly.rs
+++ b/src/scene/assembly.rs
@ -10,7 +10,7 @@ use crate::{
    color::SpectralSample,
    lerp::lerp_slice,
    light::SurfaceLight,
-    math::{Normal, Point, Transform},
+    math::{Normal, Point, Xform, XformFull},
    shading::SurfaceShader,
    surface::{Surface, SurfaceIntersection},
    transform_stack::TransformStack,
@ -21,7 +21,7 @@ pub struct Assembly<'a> {
    // Instance list
    pub instances: &'a [Instance],
    pub light_instances: &'a [Instance],
-    pub xforms: &'a [Transform],
+    pub xforms: &'a [Xform],

    // Surface shader list
    pub surface_shaders: &'a [&'a dyn SurfaceShader],
@ -58,15 +58,20 @@ impl<'a> Assembly<'a> {
        } = *intr
        {
            let sel_xform = if !xform_stack.top().is_empty() {
-                lerp_slice(xform_stack.top(), time)
+                if let Some(xform) = lerp_slice(xform_stack.top(), time).into_full() {
+                    xform
+                } else {
+                    return None;
+                }
            } else {
-                Transform::new()
+                XformFull::identity()
            };
+
            if let Some((light_i, sel_pdf, whittled_n)) = self.light_accel.select(
-                idata.incoming * sel_xform,
-                idata.pos * sel_xform,
-                idata.nor * sel_xform,
-                idata.nor_g * sel_xform,
+                idata.incoming.xform_inv(&sel_xform),
+                idata.pos.xform_inv(&sel_xform),
+                idata.nor.xform_inv(&sel_xform),
+                idata.nor_g.xform_inv(&sel_xform),
                &closure,
                time,
                n,
@ -76,12 +81,12 @@ impl<'a> Assembly<'a> {
                    InstanceType::Object => {
                        match self.objects[inst.data_index] {
                            Object::SurfaceLight(light) => {
-                                // Get the world-to-object space transform of the light
+                                // Get the transform of the light.
                                let xform = if let Some((a, b)) = inst.transform_indices {
                                    let pxforms = xform_stack.top();
                                    let xform = lerp_slice(&self.xforms[a..b], time);
                                    if !pxforms.is_empty() {
-                                        lerp_slice(pxforms, time) * xform
+                                        lerp_slice(pxforms, time).compose(&xform)
                                    } else {
                                        xform
                                    }
@ -90,15 +95,20 @@ impl<'a> Assembly<'a> {
                                    if !pxforms.is_empty() {
                                        lerp_slice(pxforms, time)
                                    } else {
-                                        Transform::new()
+                                        Xform::identity()
                                    }
-                                };
+                                }
+                                .into_full();

                                // Sample the light
-                                let (color, sample_geo, pdf) = light.sample_from_point(
-                                    &xform, idata.pos, uvw.0, uvw.1, wavelength, time,
-                                );
-                                return Some((color, sample_geo, pdf, sel_pdf));
+                                if let Some(xform) = xform {
+                                    let (color, sample_geo, pdf) = light.sample_from_point(
+                                        &xform, idata.pos, uvw.0, uvw.1, wavelength, time,
+                                    );
+                                    return Some((color, sample_geo, pdf, sel_pdf));
+                                } else {
+                                    return None;
+                                }
                            }

                            _ => unimplemented!(),
@ -106,7 +116,7 @@ impl<'a> Assembly<'a> {
                    }

                    InstanceType::Assembly => {
-                        // Push the world-to-object space transforms of the assembly onto
+                        // Push the transform of the assembly onto
                        // the transform stack.
                        if let Some((a, b)) = inst.transform_indices {
                            xform_stack.push(&self.xforms[a..b]);
@ -152,7 +162,7 @@ pub struct AssemblyBuilder<'a> {

    // Instance list
    instances: Vec<Instance>,
-    xforms: Vec<Transform>,
+    xforms: Vec<Xform>,

    // Shader list
    surface_shaders: Vec<&'a dyn SurfaceShader>,
@ -224,7 +234,7 @@ impl<'a> AssemblyBuilder<'a> {
        &mut self,
        name: &str,
        surface_shader_name: Option<&str>,
-        xforms: Option<&[Transform]>,
+        xforms: Option<&[Xform]>,
    ) {
        // Make sure name exists
        if !self.name_exists(name) {
@ -380,7 +390,7 @@ impl<'a> AssemblyBuilder<'a> {
            // Transform the bounding boxes, if necessary
            if let Some((xstart, xend)) = inst.transform_indices {
                let xf = &self.xforms[xstart..xend];
-                transform_bbox_slice_from(&bbs, xf, &mut bbs2);
+                transform_bbox_slice_from(&bbs, xf, &mut bbs2).unwrap();
            } else {
                bbs2.clear();
                bbs2.extend(bbs);
--- a/src/shading/surface_closure.rs
+++ b/src/shading/surface_closure.rs
@ -2,12 +2,10 @@

 use std::f32::consts::PI as PI_32;

-use glam::Vec4;
-
 use crate::{
    color::{Color, SpectralSample},
    lerp::{lerp, Lerp},
-    math::{clamp, dot, zup_to_vec, Normal, Vector},
+    math::{clamp, dot, zup_to_vec, Float4, Normal, Vector},
    sampling::cosine_sample_hemisphere,
 };

@ -512,7 +510,7 @@ mod ggx_closure {
                rev_fresnel,
            );

-            SpectralSample::from_parts(Vec4::new(c0, c1, c2, c3), wavelength)
+            SpectralSample::from_parts(Float4::new(c0, c1, c2, c3), wavelength)
        };

        // Calculate everything else
--- a/src/surface/micropoly_batch.rs
+++ b/src/surface/micropoly_batch.rs
@ -9,7 +9,7 @@ use crate::{
    bbox::BBox,
    boundable::Boundable,
    lerp::lerp_slice,
-    math::{cross, dot, Normal, Point, Transform},
+    math::{cross, dot, Normal, Point, Xform, XformFull},
    ray::{RayBatch, RayStack},
    shading::SurfaceClosure,
 };
@ -150,13 +150,17 @@ impl<'a> MicropolyBatch<'a> {
        rays: &mut RayBatch,
        ray_stack: &mut RayStack,
        isects: &mut [SurfaceIntersection],
-        space: &[Transform],
+        space: &[Xform],
    ) {
        // Precalculate transform for non-motion blur cases
        let static_mat_space = if space.len() == 1 {
-            lerp_slice(space, 0.0).inverse()
+            if let Some(xform) = space[0].into_full() {
+                xform
+            } else {
+                return;
+            }
        } else {
-            Transform::new()
+            XformFull::identity()
        };

        self.accel
@ -182,11 +186,11 @@ impl<'a> MicropolyBatch<'a> {
                            );
                            if !space.is_empty() {
                                (*tri_cache[i].as_mut_ptr()).0 =
-                                    (*tri_cache[i].as_mut_ptr()).0 * static_mat_space;
+                                    (*tri_cache[i].as_mut_ptr()).0.xform(&static_mat_space);
                                (*tri_cache[i].as_mut_ptr()).1 =
-                                    (*tri_cache[i].as_mut_ptr()).1 * static_mat_space;
+                                    (*tri_cache[i].as_mut_ptr()).1.xform(&static_mat_space);
                                (*tri_cache[i].as_mut_ptr()).2 =
-                                    (*tri_cache[i].as_mut_ptr()).2 * static_mat_space;
+                                    (*tri_cache[i].as_mut_ptr()).2.xform(&static_mat_space);
                            }
                        }
                    }
@ -205,7 +209,11 @@ impl<'a> MicropolyBatch<'a> {
                    // Calculate the ray space, if necessary.
                    let mat_space = if space.len() > 1 {
                        // Per-ray transform, for motion blur
-                        lerp_slice(space, ray_time).inverse()
+                        if let Some(xform) = lerp_slice(space, ray_time).into_full() {
+                            xform
+                        } else {
+                            return;
+                        }
                    } else {
                        static_mat_space
                    };
@ -251,9 +259,9 @@ impl<'a> MicropolyBatch<'a> {
                            };

                            if !space.is_empty() {
-                                tri.0 = tri.0 * mat_space;
-                                tri.1 = tri.1 * mat_space;
-                                tri.2 = tri.2 * mat_space;
+                                tri.0 = tri.0.xform(&mat_space);
+                                tri.1 = tri.1.xform(&mat_space);
+                                tri.2 = tri.2.xform(&mat_space);
                            }

                            tri
@ -311,7 +319,7 @@ impl<'a> MicropolyBatch<'a> {
                            let n1 = lerp_slice(n1_slice, ray_time).normalized();
                            let n2 = lerp_slice(n2_slice, ray_time).normalized();

-                            let s_nor = ((n0 * b0) + (n1 * b1) + (n2 * b2)) * mat_space;
+                            let s_nor = ((n0 * b0) + (n1 * b1) + (n2 * b2)).xform(&mat_space);
                            if dot(s_nor, geo_normal) >= 0.0 {
                                s_nor
                            } else {
--- a/src/surface/mod.rs
+++ b/src/surface/mod.rs
@ -10,7 +10,7 @@ use std::fmt::Debug;

 use crate::{
    boundable::Boundable,
-    math::{Normal, Point, Transform, Vector},
+    math::{Normal, Point, Vector, Xform, XformFull},
    ray::{RayBatch, RayStack},
    shading::surface_closure::SurfaceClosure,
    shading::SurfaceShader,
@ -25,7 +25,7 @@ pub trait Surface: Boundable + Debug + Sync {
        ray_stack: &mut RayStack,
        isects: &mut [SurfaceIntersection],
        shader: &dyn SurfaceShader,
-        space: &[Transform],
+        space: &[Xform],
    );
 }

@ -80,13 +80,13 @@ pub enum SurfaceIntersection {

 #[derive(Debug, Copy, Clone)]
 pub struct SurfaceIntersectionData {
-    pub incoming: Vector, // Direction of the incoming ray
-    pub pos: Point,       // Position of the intersection
+    pub incoming: Vector, // Direction of the incoming ray.
+    pub pos: Point,       // Position of the intersection.
    pub pos_err: f32,     // Error magnitude of the intersection position.  Imagine
    // a cube centered around `pos` with dimensions of `2 * pos_err`.
-    pub nor: Normal,            // Shading normal
-    pub nor_g: Normal,          // True geometric normal
-    pub local_space: Transform, // Matrix from global space to local space
-    pub t: f32,                 // Ray t-value at the intersection point
-    pub sample_pdf: f32,        // The PDF of getting this point by explicitly sampling the surface
+    pub nor: Normal,            // Shading normal.
+    pub nor_g: Normal,          // True geometric normal.
+    pub local_space: XformFull, // Matrix from local to world space.
+    pub t: f32,                 // Ray t-value at the intersection point.
+    pub sample_pdf: f32,        // The PDF of getting this point by explicitly sampling the surface.
 }
--- a/src/surface/triangle.rs
+++ b/src/surface/triangle.rs
@ -162,7 +162,7 @@ pub fn surface_point(tri: (Point, Point, Point), bary: (f32, f32, f32)) -> (Poin
        + (tri.1.into_vector().abs() * bary.1)
        + (tri.2.into_vector().abs() * bary.2))
        * fp_gamma(7))
-    .co
+    .0
    .max_element();

    (pos, pos_err)
--- a/src/surface/triangle_mesh.rs
+++ b/src/surface/triangle_mesh.rs
@ -7,7 +7,7 @@ use crate::{
    bbox::BBox,
    boundable::Boundable,
    lerp::lerp_slice,
-    math::{cross, dot, Normal, Point, Transform},
+    math::{cross, dot, Normal, Point, Xform, XformFull},
    ray::{RayBatch, RayStack},
    shading::SurfaceShader,
 };
@ -128,13 +128,17 @@ impl<'a> Surface for TriangleMesh<'a> {
        ray_stack: &mut RayStack,
        isects: &mut [SurfaceIntersection],
        shader: &dyn SurfaceShader,
-        space: &[Transform],
+        space: &[Xform],
    ) {
        // Precalculate transform for non-motion blur cases
        let static_mat_space = if space.len() == 1 {
-            lerp_slice(space, 0.0).inverse()
+            if let Some(xform) = lerp_slice(space, 0.0).into_full() {
+                xform
+            } else {
+                return;
+            }
        } else {
-            Transform::new()
+            XformFull::identity()
        };

        self.accel
@ -160,11 +164,11 @@ impl<'a> Surface for TriangleMesh<'a> {
                            );
                            if !space.is_empty() {
                                (*tri_cache[i].as_mut_ptr()).0 =
-                                    (*tri_cache[i].as_mut_ptr()).0 * static_mat_space;
+                                    (*tri_cache[i].as_mut_ptr()).0.xform(&static_mat_space);
                                (*tri_cache[i].as_mut_ptr()).1 =
-                                    (*tri_cache[i].as_mut_ptr()).1 * static_mat_space;
+                                    (*tri_cache[i].as_mut_ptr()).1.xform(&static_mat_space);
                                (*tri_cache[i].as_mut_ptr()).2 =
-                                    (*tri_cache[i].as_mut_ptr()).2 * static_mat_space;
+                                    (*tri_cache[i].as_mut_ptr()).2.xform(&static_mat_space);
                            }
                        }
                    }
@ -183,7 +187,11 @@ impl<'a> Surface for TriangleMesh<'a> {
                    // Calculate the ray space, if necessary.
                    let mat_space = if space.len() > 1 {
                        // Per-ray transform, for motion blur
-                        lerp_slice(space, ray_time).inverse()
+                        if let Some(xform) = lerp_slice(space, ray_time).into_full() {
+                            xform
+                        } else {
+                            return;
+                        }
                    } else {
                        static_mat_space
                    };
@ -229,9 +237,9 @@ impl<'a> Surface for TriangleMesh<'a> {
                            };

                            if !space.is_empty() {
-                                tri.0 = tri.0 * mat_space;
-                                tri.1 = tri.1 * mat_space;
-                                tri.2 = tri.2 * mat_space;
+                                tri.0 = tri.0.xform(&mat_space);
+                                tri.1 = tri.1.xform(&mat_space);
+                                tri.2 = tri.2.xform(&mat_space);
                            }

                            tri
@ -289,7 +297,7 @@ impl<'a> Surface for TriangleMesh<'a> {
                            let n1 = lerp_slice(n1_slice, ray_time).normalized();
                            let n2 = lerp_slice(n2_slice, ray_time).normalized();

-                            let s_nor = ((n0 * b0) + (n1 * b1) + (n2 * b2)) * mat_space;
+                            let s_nor = ((n0 * b0) + (n1 * b1) + (n2 * b2)).xform(&mat_space);
                            if dot(s_nor, geo_normal) >= 0.0 {
                                s_nor
                            } else {
--- a/src/tracer.rs
+++ b/src/tracer.rs
@ -4,7 +4,7 @@ use crate::{
    accel::ray_code,
    color::{rec709_to_xyz, Color},
    lerp::lerp_slice,
-    math::Transform,
+    math::XformFull,
    ray::{RayBatch, RayStack},
    scene::{Assembly, InstanceType, Object},
    shading::{SimpleSurfaceShader, SurfaceShader},
@ -63,7 +63,7 @@ impl<'a> TracerInner<'a> {

        // Prep the accel part of the rays.
        {
-            let ident = Transform::new();
+            let ident = XformFull::identity();
            for i in 0..rays.len() {
                rays.update_local(i, &ident);
            }
@ -105,7 +105,15 @@ impl<'a> TracerInner<'a> {
                    let xforms = self.xform_stack.top();
                    ray_stack.do_next_task(|ray_idx| {
                        let t = rays.time(ray_idx);
-                        rays.update_local(ray_idx, &lerp_slice(xforms, t));
+                        rays.update_local(
+                            ray_idx,
+                            &if let Some(xform) = lerp_slice(xforms, t).into_full() {
+                                xform
+                            } else {
+                                // TODO: filter out ray instead.
+                                XformFull::identity()
+                            },
+                        );
                    });
                    ray_stack.duplicate_next_task();
                }
@ -137,10 +145,18 @@ impl<'a> TracerInner<'a> {
                    if !xforms.is_empty() {
                        ray_stack.pop_do_next_task(|ray_idx| {
                            let t = rays.time(ray_idx);
-                            rays.update_local(ray_idx, &lerp_slice(xforms, t));
+                            rays.update_local(
+                                ray_idx,
+                                &if let Some(xform) = lerp_slice(xforms, t).into_full() {
+                                    xform
+                                } else {
+                                    // TODO: filter out ray instead.
+                                    XformFull::identity()
+                                },
+                            );
                        });
                    } else {
-                        let ident = Transform::new();
+                        let ident = XformFull::identity();
                        ray_stack.pop_do_next_task(|ray_idx| {
                            rays.update_local(ray_idx, &ident);
                        });
--- a/src/transform_stack.rs
+++ b/src/transform_stack.rs
@ -3,10 +3,10 @@ use std::{
    mem::{transmute, MaybeUninit},
 };

-use crate::{algorithm::merge_slices_to, math::Transform};
+use crate::{algorithm::merge_slices_to, math::Xform};

 pub struct TransformStack {
-    stack: Vec<MaybeUninit<Transform>>,
+    stack: Vec<MaybeUninit<Xform>>,
    stack_indices: Vec<usize>,
 }

@ -30,11 +30,11 @@ impl TransformStack {
        self.stack_indices.push(0);
    }

-    pub fn push(&mut self, xforms: &[Transform]) {
+    pub fn push(&mut self, xforms: &[Xform]) {
        assert!(!xforms.is_empty());

        if self.stack.is_empty() {
-            let xforms: &[MaybeUninit<Transform>] = unsafe { transmute(xforms) };
+            let xforms: &[MaybeUninit<Xform>] = unsafe { transmute(xforms) };
            self.stack.extend(xforms);
        } else {
            let sil = self.stack_indices.len();
@ -54,7 +54,7 @@ impl TransformStack {
                unsafe { transmute(&xfs1[i1..i2]) },
                xforms,
                xfs2,
-                |xf1, xf2| *xf1 * *xf2,
+                |xf1, xf2| xf2.compose(xf1),
            );
        }

@ -73,7 +73,7 @@ impl TransformStack {
        self.stack_indices.pop();
    }

-    pub fn top(&self) -> &[Transform] {
+    pub fn top(&self) -> &[Xform] {
        let sil = self.stack_indices.len();
        let i1 = self.stack_indices[sil - 2];
        let i2 = self.stack_indices[sil - 1];
--- a/sub_crates/rmath/src/normal.rs
+++ b/sub_crates/rmath/src/normal.rs
@ -56,6 +56,16 @@ impl Normal {
        self.0.c()
    }

+    #[inline(always)]
+    pub fn get_n(self, i: usize) -> f32 {
+        match i {
+            0 => self.x(),
+            1 => self.y(),
+            2 => self.z(),
+            _ => panic!("Out of bounds index into 3D vector."),
+        }
+    }
+
    #[inline(always)]
    #[must_use]
    pub fn set_x(self, x: f32) -> Self {
--- a/sub_crates/rmath/src/point.rs
+++ b/sub_crates/rmath/src/point.rs
@ -47,6 +47,16 @@ impl Point {
        self.0.c()
    }

+    #[inline(always)]
+    pub fn get_n(self, i: usize) -> f32 {
+        match i {
+            0 => self.x(),
+            1 => self.y(),
+            2 => self.z(),
+            _ => panic!("Out of bounds index into 3D vector."),
+        }
+    }
+
    #[inline(always)]
    #[must_use]
    pub fn set_x(self, x: f32) -> Self {
--- a/sub_crates/rmath/src/vector.rs
+++ b/sub_crates/rmath/src/vector.rs
@ -37,6 +37,11 @@ impl Vector {
        Self(self.0 / self.length())
    }

+    #[inline(always)]
+    pub fn abs(self) -> Self {
+        Self(self.0.abs())
+    }
+
    #[inline(always)]
    pub fn into_point(self) -> Point {
        Point(self.0)
@ -62,6 +67,16 @@ impl Vector {
        self.0.c()
    }

+    #[inline(always)]
+    pub fn get_n(self, i: usize) -> f32 {
+        match i {
+            0 => self.x(),
+            1 => self.y(),
+            2 => self.z(),
+            _ => panic!("Out of bounds index into 3D vector."),
+        }
+    }
+
    #[inline(always)]
    #[must_use]
    pub fn set_x(self, x: f32) -> Self {
--- a/sub_crates/rmath/src/wide4.rs
+++ b/sub_crates/rmath/src/wide4.rs
@ -9,7 +9,7 @@ use crate::{difference_of_products, two_prod, two_sum};

 pub use fallback::{Bool4, Float4};
 mod fallback {
-    use std::ops::{Add, BitAnd, BitOr, BitXor, Div, Mul, Neg, Not, Sub};
+    use std::ops::{Add, BitAnd, BitOr, BitXor, Div, Index, Mul, Neg, Not, Sub};

    use crate::FMulAdd;

@ -65,28 +65,44 @@ mod fallback {
            ])
        }

-        // /// Horizontal minimum.
-        // #[inline(always)]
-        // pub fn hmin(self) -> f32 {
-        //     let a = self.0[0].min(self.0[1]);
-        //     let b = self.0[2].min(self.0[3]);
-        //     a.min(b)
-        // }
+        /// Horizontal minimum.
+        #[inline(always)]
+        pub fn min_element(self) -> f32 {
+            let a = self.0[0].min(self.0[1]);
+            let b = self.0[2].min(self.0[3]);
+            a.min(b)
+        }

-        // /// Horizontal maximum.
-        // #[inline(always)]
-        // pub fn hmax(self) -> f32 {
-        //     let a = self.0[0].max(self.0[1]);
-        //     let b = self.0[2].max(self.0[3]);
-        //     a.max(b)
-        // }
+        /// Horizontal maximum.
+        #[inline(always)]
+        pub fn max_element(self) -> f32 {
+            let a = self.0[0].max(self.0[1]);
+            let b = self.0[2].max(self.0[3]);
+            a.max(b)
+        }
+
+        /// 1.0 / self
+        #[inline(always)]
+        pub fn recip(self) -> Self {
+            Float4::splat(1.0) / self
+        }
+
+        #[inline(always)]
+        pub fn abs(self) -> Self {
+            Float4::new(
+                self.a().abs(),
+                self.b().abs(),
+                self.c().abs(),
+                self.d().abs(),
+            )
+        }

        //-----------------------------------------------------
        // Comparisons.

        /// Less than.
        #[inline(always)]
-        pub fn lt(self, rhs: Self) -> Bool4 {
+        pub fn cmplt(self, rhs: Self) -> Bool4 {
            Bool4([
                self.0[0] < rhs.0[0],
                self.0[1] < rhs.0[1],
@ -97,7 +113,7 @@ mod fallback {

        /// Less than or equal.
        #[inline(always)]
-        pub fn lte(self, rhs: Self) -> Bool4 {
+        pub fn cmplte(self, rhs: Self) -> Bool4 {
            Bool4([
                self.0[0] <= rhs.0[0],
                self.0[1] <= rhs.0[1],
@ -108,7 +124,7 @@ mod fallback {

        /// Greater than.
        #[inline(always)]
-        pub fn gt(self, rhs: Self) -> Bool4 {
+        pub fn cmpgt(self, rhs: Self) -> Bool4 {
            Bool4([
                self.0[0] > rhs.0[0],
                self.0[1] > rhs.0[1],
@ -119,7 +135,7 @@ mod fallback {

        /// Greater than or equal.
        #[inline(always)]
-        pub fn gte(self, rhs: Self) -> Bool4 {
+        pub fn cmpgte(self, rhs: Self) -> Bool4 {
            Bool4([
                self.0[0] >= rhs.0[0],
                self.0[1] >= rhs.0[1],
@ -130,7 +146,7 @@ mod fallback {

        /// Equal.
        #[inline(always)]
-        pub fn eq(self, rhs: Self) -> Bool4 {
+        pub fn cmpeq(self, rhs: Self) -> Bool4 {
            Bool4([
                self.0[0] == rhs.0[0],
                self.0[1] == rhs.0[1],
@ -232,6 +248,15 @@ mod fallback {
        }
    }

+    impl Index<usize> for Float4 {
+        type Output = f32;
+
+        #[inline(always)]
+        fn index(&self, idx: usize) -> &f32 {
+            &self.0[idx]
+        }
+    }
+
    impl Add for Float4 {
        type Output = Self;

@ -339,18 +364,34 @@ mod fallback {
    pub struct Bool4([bool; 4]);

    impl Bool4 {
+        #[inline(always)]
+        pub fn new_false() -> Self {
+            Self([false, false, false, false])
+        }
+
        #[inline(always)]
        pub fn to_bools(self) -> [bool; 4] {
            self.0
        }

+        /// Note: `a` goes to the least significant bit.
        #[inline(always)]
-        pub fn to_bitmask(self) -> u8 {
+        pub fn bitmask(self) -> u8 {
            self.0[0] as u8
                | ((self.0[1] as u8) << 1)
                | ((self.0[2] as u8) << 2)
                | ((self.0[3] as u8) << 3)
        }
+
+        #[inline(always)]
+        pub fn any(self) -> bool {
+            self.0[0] | &self.0[1] | self.0[2] | self.0[3]
+        }
+
+        #[inline(always)]
+        pub fn all(self) -> bool {
+            self.0[0] & &self.0[1] & self.0[2] & self.0[3]
+        }
    }

    impl BitAnd for Bool4 {
@ -698,6 +739,12 @@ impl Float4 {
    }
 }

+impl From<Float4> for (f32, f32, f32, f32) {
+    fn from(v: Float4) -> (f32, f32, f32, f32) {
+        (v.a(), v.b(), v.c(), v.d())
+    }
+}
+
 impl AddAssign for Float4 {
    #[inline(always)]
    fn add_assign(&mut self, rhs: Self) {
@ -743,7 +790,7 @@ impl DivAssign<f32> for Float4 {
 impl PartialEq for Float4 {
    #[inline(always)]
    fn eq(&self, rhs: &Self) -> bool {
-        Self::eq(*self, *rhs).to_bitmask() == 0b1111
+        self.cmpeq(*rhs).bitmask() == 0b1111
    }
 }

--- a/sub_crates/rmath/src/xform.rs
+++ b/sub_crates/rmath/src/xform.rs
@ -184,6 +184,24 @@ pub struct XformFull {
    pub t: Float4,          // Forward translation.
 }

+impl XformFull {
+    pub fn identity() -> Self {
+        Self {
+            m: [
+                Float4::new(1.0, 0.0, 0.0, 0.0),
+                Float4::new(0.0, 1.0, 0.0, 0.0),
+                Float4::new(0.0, 0.0, 1.0, 0.0),
+            ],
+            m_inv: [
+                Float4::new(1.0, 0.0, 0.0, 0.0),
+                Float4::new(0.0, 1.0, 0.0, 0.0),
+                Float4::new(0.0, 0.0, 1.0, 0.0),
+            ],
+            t: Float4::splat(0.0),
+        }
+    }
+}
+
 //-------------------------------------------------------------

 #[cfg(test)]
--- a/sub_crates/spectral_upsampling/Cargo.toml
+++ b/sub_crates/spectral_upsampling/Cargo.toml
@ -10,4 +10,4 @@ name = "spectral_upsampling"
 path = "src/lib.rs"

 [dependencies]
-glam = "0.15"
+rmath = { path = "../rmath" }
--- a/sub_crates/spectral_upsampling/src/jakob.rs
+++ b/sub_crates/spectral_upsampling/src/jakob.rs
@ -6,7 +6,7 @@
 /// The provides similar color matching as full Jakob, at the expense of
 /// somewhat lower quality spectrums, and the inability to precalculate
 /// the coefficents for even more efficient evaluation later on.
-use glam::Vec4;
+use rmath::wide4::Float4;

 /// How many polynomial coefficients?
 const RGB2SPEC_N_COEFFS: usize = 3;
@ -15,7 +15,7 @@ const RGB2SPEC_N_COEFFS: usize = 3;
 include!(concat!(env!("OUT_DIR"), "/jakob_table_inc.rs"));

 #[inline]
-pub fn rec709_to_spectrum_p4(lambdas: Vec4, rgb: (f32, f32, f32)) -> Vec4 {
+pub fn rec709_to_spectrum_p4(lambdas: Float4, rgb: (f32, f32, f32)) -> Float4 {
    small_rgb_to_spectrum_p4(
        REC709_TABLE,
        REC709_TABLE_RES,
@ -26,7 +26,7 @@ pub fn rec709_to_spectrum_p4(lambdas: Vec4, rgb: (f32, f32, f32)) -> Vec4 {
 }

 #[inline]
-pub fn rec2020_to_spectrum_p4(lambdas: Vec4, rgb: (f32, f32, f32)) -> Vec4 {
+pub fn rec2020_to_spectrum_p4(lambdas: Float4, rgb: (f32, f32, f32)) -> Float4 {
    small_rgb_to_spectrum_p4(
        REC2020_TABLE,
        REC2020_TABLE_RES,
@ -37,7 +37,7 @@ pub fn rec2020_to_spectrum_p4(lambdas: Vec4, rgb: (f32, f32, f32)) -> Vec4 {
 }

 #[inline]
-pub fn aces_to_spectrum_p4(lambdas: Vec4, rgb: (f32, f32, f32)) -> Vec4 {
+pub fn aces_to_spectrum_p4(lambdas: Float4, rgb: (f32, f32, f32)) -> Float4 {
    small_rgb_to_spectrum_p4(
        ACES_TABLE,
        ACES_TABLE_RES,
@ -56,9 +56,9 @@ fn small_rgb_to_spectrum_p4(
    table: &[[(f32, f32, f32); 2]],
    table_res: usize,
    table_mid_value: f32,
-    lambdas: Vec4,
+    lambdas: Float4,
    rgb: (f32, f32, f32),
-) -> Vec4 {
+) -> Float4 {
    // Determine largest RGB component, and calculate the other two
    // components scaled for lookups.
    let (i, max_val, x, y) = if rgb.0 > rgb.1 && rgb.0 > rgb.2 {
@ -71,7 +71,7 @@ fn small_rgb_to_spectrum_p4(
    if max_val == 0.0 {
        // If max_val is zero, just return zero.  This avoids NaN's from
        // divide by zero.  This is also correct, since it's black.
-        return Vec4::splat(0.0);
+        return Float4::splat(0.0);
    }
    let x = x * 63.0 / max_val;
    let y = y * 63.0 / max_val;
@ -91,20 +91,20 @@ fn small_rgb_to_spectrum_p4(

    // Convert to SIMD format for faster interpolation.
    let a0 = [
-        Vec4::new(a0[0].0, a0[0].1, a0[0].2, 0.0),
-        Vec4::new(a0[1].0, a0[1].1, a0[1].2, 0.0),
+        Float4::new(a0[0].0, a0[0].1, a0[0].2, 0.0),
+        Float4::new(a0[1].0, a0[1].1, a0[1].2, 0.0),
    ];
    let a1 = [
-        Vec4::new(a1[0].0, a1[0].1, a1[0].2, 0.0),
-        Vec4::new(a1[1].0, a1[1].1, a1[1].2, 0.0),
+        Float4::new(a1[0].0, a1[0].1, a1[0].2, 0.0),
+        Float4::new(a1[1].0, a1[1].1, a1[1].2, 0.0),
    ];
    let a2 = [
-        Vec4::new(a2[0].0, a2[0].1, a2[0].2, 0.0),
-        Vec4::new(a2[1].0, a2[1].1, a2[1].2, 0.0),
+        Float4::new(a2[0].0, a2[0].1, a2[0].2, 0.0),
+        Float4::new(a2[1].0, a2[1].1, a2[1].2, 0.0),
    ];
    let a3 = [
-        Vec4::new(a3[0].0, a3[0].1, a3[0].2, 0.0),
-        Vec4::new(a3[1].0, a3[1].1, a3[1].2, 0.0),
+        Float4::new(a3[0].0, a3[0].1, a3[0].2, 0.0),
+        Float4::new(a3[1].0, a3[1].1, a3[1].2, 0.0),
    ];

    // Do interpolation.
@ -133,22 +133,22 @@ fn small_rgb_to_spectrum_p4(
 // Coefficient -> eval functions

 #[inline(always)]
-fn rgb2spec_fma_4(a: Vec4, b: Vec4, c: Vec4) -> Vec4 {
+fn rgb2spec_fma_4(a: Float4, b: Float4, c: Float4) -> Float4 {
    (a * b) + c
 }

-fn rgb2spec_eval_4(coeff: [f32; RGB2SPEC_N_COEFFS], lambda: Vec4) -> Vec4 {
-    let co0 = Vec4::splat(coeff[0]);
-    let co1 = Vec4::splat(coeff[1]);
-    let co2 = Vec4::splat(coeff[2]);
+fn rgb2spec_eval_4(coeff: [f32; RGB2SPEC_N_COEFFS], lambda: Float4) -> Float4 {
+    let co0 = Float4::splat(coeff[0]);
+    let co1 = Float4::splat(coeff[1]);
+    let co2 = Float4::splat(coeff[2]);

    let x = rgb2spec_fma_4(rgb2spec_fma_4(co0, lambda, co1), lambda, co2);

    let y = {
        // TODO: replace this with a SIMD sqrt op.
-        let (x, y, z, w) = rgb2spec_fma_4(x, x, Vec4::splat(1.0)).into();
-        Vec4::new(x.sqrt(), y.sqrt(), z.sqrt(), w.sqrt()).recip()
+        let (x, y, z, w) = rgb2spec_fma_4(x, x, Float4::splat(1.0)).into();
+        Float4::new(x.sqrt(), y.sqrt(), z.sqrt(), w.sqrt()).recip()
    };

-    rgb2spec_fma_4(Vec4::splat(0.5) * x, y, Vec4::splat(0.5))
+    rgb2spec_fma_4(Float4::splat(0.5) * x, y, Float4::splat(0.5))
 }
--- a/sub_crates/spectral_upsampling/src/meng.rs
+++ b/sub_crates/spectral_upsampling/src/meng.rs
@ -6,7 +6,7 @@

 use std::f32;

-use glam::Vec4;
+use rmath::wide4::Float4;

 mod meng_spectra_tables;

@ -174,7 +174,7 @@ pub fn spectrum_xyz_to_p(lambda: f32, xyz: (f32, f32, f32)) -> f32 {
 ///
 /// Works on 4 wavelengths at once via SIMD.
 #[inline]
-pub fn spectrum_xyz_to_p_4(lambdas: Vec4, xyz: (f32, f32, f32)) -> Vec4 {
+pub fn spectrum_xyz_to_p_4(lambdas: Float4, xyz: (f32, f32, f32)) -> Float4 {
    assert!(lambdas.min_element() >= SPECTRUM_SAMPLE_MIN);
    assert!(lambdas.max_element() <= SPECTRUM_SAMPLE_MAX);

@ -184,7 +184,7 @@ pub fn spectrum_xyz_to_p_4(lambdas: Vec4, xyz: (f32, f32, f32)) -> Vec4 {
        if norm < f32::MAX {
            norm
        } else {
-            return Vec4::splat(0.0);
+            return Float4::splat(0.0);
        }
    };

@ -197,7 +197,7 @@ pub fn spectrum_xyz_to_p_4(lambdas: Vec4, xyz: (f32, f32, f32)) -> Vec4 {
        || uv.1 < 0.0
        || uv.1 >= SPECTRUM_GRID_HEIGHT as f32
    {
-        return Vec4::splat(0.0);
+        return Float4::splat(0.0);
    }

    let uvi = (uv.0 as i32, uv.1 as i32);
@ -214,11 +214,11 @@ pub fn spectrum_xyz_to_p_4(lambdas: Vec4, xyz: (f32, f32, f32)) -> Vec4 {

    // If the cell has no points, nothing we can do, so return 0.0
    if num == 0 {
-        return Vec4::splat(0.0);
+        return Float4::splat(0.0);
    }

    // Normalize lambda to spectrum table index range.
-    let sb: Vec4 = (lambdas - Vec4::splat(SPECTRUM_SAMPLE_MIN))
+    let sb: Float4 = (lambdas - Float4::splat(SPECTRUM_SAMPLE_MIN))
        / (SPECTRUM_SAMPLE_MAX - SPECTRUM_SAMPLE_MIN)
        * (SPECTRUM_NUM_SAMPLES as f32 - 1.0);
    debug_assert!(sb.min_element() >= 0.0);
@ -226,7 +226,7 @@ pub fn spectrum_xyz_to_p_4(lambdas: Vec4, xyz: (f32, f32, f32)) -> Vec4 {

    // Get the spectral values for the vertices of the grid cell.
    // TODO: use integer SIMD intrinsics to make this part faster.
-    let mut p = [Vec4::splat(0.0); 6];
+    let mut p = [Float4::splat(0.0); 6];
    let sb0: [i32; 4] = [sb[0] as i32, sb[1] as i32, sb[2] as i32, sb[3] as i32];
    assert!(sb0[0].max(sb0[1]).max(sb0[2].max(sb0[3])) < SPECTRUM_NUM_SAMPLES);
    let sb1: [i32; 4] = [
@ -235,27 +235,27 @@ pub fn spectrum_xyz_to_p_4(lambdas: Vec4, xyz: (f32, f32, f32)) -> Vec4 {
        (sb[2] as i32 + 1).min(SPECTRUM_NUM_SAMPLES - 1),
        (sb[3] as i32 + 1).min(SPECTRUM_NUM_SAMPLES - 1),
    ];
-    let sbf = sb - Vec4::new(sb0[0] as f32, sb0[1] as f32, sb0[2] as f32, sb0[3] as f32);
+    let sbf = sb - Float4::new(sb0[0] as f32, sb0[1] as f32, sb0[2] as f32, sb0[3] as f32);
    for i in 0..(num as usize) {
        debug_assert!(idx[i] >= 0);
        let spectrum = &SPECTRUM_DATA_POINTS[idx[i] as usize].spectrum;
-        let p0 = Vec4::new(
+        let p0 = Float4::new(
            spectrum[sb0[0] as usize],
            spectrum[sb0[1] as usize],
            spectrum[sb0[2] as usize],
            spectrum[sb0[3] as usize],
        );
-        let p1 = Vec4::new(
+        let p1 = Float4::new(
            spectrum[sb1[0] as usize],
            spectrum[sb1[1] as usize],
            spectrum[sb1[2] as usize],
            spectrum[sb1[3] as usize],
        );
-        p[i] = p0 * (Vec4::splat(1.0) - sbf) + p1 * sbf;
+        p[i] = p0 * (Float4::splat(1.0) - sbf) + p1 * sbf;
    }

    // Linearly interpolate the spectral power of the cell vertices.
-    let mut interpolated_p = Vec4::splat(0.0);
+    let mut interpolated_p = Float4::splat(0.0);
    if inside {
        // Fast path for normal inner quads:
        let uv2 = (uv.0 - uvi.0 as f32, uv.1 - uvi.1 as f32);