diff --git a/Cargo.lock b/Cargo.lock
index 7e4b4ef..0323c91 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -8,6 +8,14 @@ dependencies = [
  "winapi 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)",
 ]
 
+[[package]]
+name = "approx"
+version = "0.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+dependencies = [
+ "num-traits 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)",
+]
+
 [[package]]
 name = "atty"
 version = "0.2.11"
@@ -110,10 +118,6 @@ name = "crossbeam"
 version = "0.3.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 
-[[package]]
-name = "float4"
-version = "0.1.0"
-
 [[package]]
 name = "fnv"
 version = "1.0.6"
@@ -124,6 +128,14 @@ name = "fuchsia-cprng"
 version = "0.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 
+[[package]]
+name = "glam"
+version = "0.7.1"
+source = "git+https://github.com/bitshifter/glam-rs.git?rev=0f314f99#0f314f990710ff9357e5896de2b55ec82fe88e0d"
+dependencies = [
+ "approx 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)",
+]
+
 [[package]]
 name = "half"
 version = "1.3.0"
@@ -147,7 +159,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 name = "math3d"
 version = "0.1.0"
 dependencies = [
- "float4 0.1.0",
+ "approx 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)",
+ "glam 0.7.1 (git+https://github.com/bitshifter/glam-rs.git?rev=0f314f99)",
 ]
 
 [[package]]
@@ -246,7 +259,7 @@ dependencies = [
  "color 0.1.0",
  "copy_in_place 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
  "crossbeam 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)",
- "float4 0.1.0",
+ "glam 0.7.1 (git+https://github.com/bitshifter/glam-rs.git?rev=0f314f99)",
  "half 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
  "halton 0.1.0",
  "lazy_static 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -449,7 +462,7 @@ version = "0.1.0"
 name = "spectral_upsampling"
 version = "0.1.0"
 dependencies = [
- "float4 0.1.0",
+ "glam 0.7.1 (git+https://github.com/bitshifter/glam-rs.git?rev=0f314f99)",
 ]
 
 [[package]]
@@ -551,6 +564,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 
 [metadata]
 "checksum ansi_term 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ee49baf6cb617b853aa8d93bf420db2383fab46d314482ca2803b40d5fde979b"
+"checksum approx 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)" = "f0e60b75072ecd4168020818c0107f2857bb6c4e64252d8d3983f6263b40a5c3"
 "checksum atty 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)" = "9a7d5b8723950951411ee34d271d99dddcc2035a16ab25310ea2c8cfd4369652"
 "checksum autocfg 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "0e49efa51329a5fd37e7c79db4621af617cd4e3e5bc224939808d076077077bf"
 "checksum base64 0.9.3 (registry+https://github.com/rust-lang/crates.io-index)" = "489d6c0ed21b11d038c31b6ceccca973e65d73ba3bd8ecb9a2babf5546164643"
@@ -567,6 +581,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 "checksum crossbeam 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)" = "24ce9782d4d5c53674646a6a4c1863a21a8fc0cb649b3c94dfc16e45071dea19"
 "checksum fnv 1.0.6 (registry+https://github.com/rust-lang/crates.io-index)" = "2fad85553e09a6f881f739c29f0b00b0f01357c743266d478b68951ce23285f3"
 "checksum fuchsia-cprng 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "a06f77d526c1a601b7c4cdd98f54b5eaabffc14d5f2f0296febdc7f357c6d3ba"
+"checksum glam 0.7.1 (git+https://github.com/bitshifter/glam-rs.git?rev=0f314f99)" = "<none>"
 "checksum half 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "9353c2a89d550b58fa0061d8ed8d002a7d8cdf2494eb0e432859bd3a9e543836"
 "checksum lazy_static 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "bc5729f27f159ddd61f4df6228e827e86643d4d3e7c32183cb30a1c08f604a14"
 "checksum libc 0.2.55 (registry+https://github.com/rust-lang/crates.io-index)" = "42914d39aad277d9e176efbdad68acb1d5443ab65afe0e0e4f0d49352a950880"
diff --git a/Cargo.toml b/Cargo.toml
index 14ee2ac..f2fe96e 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -2,7 +2,6 @@
 members = [
     "sub_crates/bvh_order",
     "sub_crates/color",
-    "sub_crates/float4",
     "sub_crates/halton",
     "sub_crates/math3d",
     "sub_crates/mem_arena",
@@ -36,6 +35,7 @@ png_encode_mini = "0.1.2"
 rustc-serialize = "0.3"
 scoped_threadpool = "0.1"
 time = "0.1"
+glam = {git="https://github.com/bitshifter/glam-rs.git", rev="0f314f99", default-features=false, features=["approx"]}
 
 # Local crate dependencies
 [dependencies.bvh_order]
@@ -44,9 +44,6 @@ path = "sub_crates/bvh_order"
 [dependencies.color]
 path = "sub_crates/color"
 
-[dependencies.float4]
-path = "sub_crates/float4"
-
 [dependencies.halton]
 path = "sub_crates/halton"
 
diff --git a/src/accel/bvh4.rs b/src/accel/bvh4.rs
index 92fce91..d87d22c 100644
--- a/src/accel/bvh4.rs
+++ b/src/accel/bvh4.rs
@@ -6,6 +6,8 @@
 
 use std::mem::{transmute, MaybeUninit};
 
+use glam::Vec4Mask;
+
 use mem_arena::MemArena;
 
 use crate::{
@@ -23,7 +25,6 @@ use super::{
 };
 
 use bvh_order::{calc_traversal_code, SplitAxes, TRAVERSAL_TABLE};
-use float4::Bool4;
 
 pub fn ray_code(dir: Vector) -> usize {
     let ray_sign_is_neg = [dir.x() < 0.0, dir.y() < 0.0, dir.z() < 0.0];
@@ -122,12 +123,12 @@ impl<'a> BVH4<'a> {
                     traversal_code,
                 } => {
                     node_tests += ray_stack.ray_count_in_next_task() as u64;
-                    let mut all_hits = Bool4::new_false();
+                    let mut all_hits = Vec4Mask::default();
 
                     // Ray testing
                     ray_stack.pop_do_next_task_and_push_rays(children.len(), |ray_idx| {
                         if rays.is_done(ray_idx) {
-                            Bool4::new_false()
+                            Vec4Mask::default()
                         } else {
                             let hits = if bounds.len() == 1 {
                                 bounds[0].intersect_ray(
@@ -148,7 +149,7 @@ impl<'a> BVH4<'a> {
                     });
 
                     // If there were any intersections, create tasks.
-                    if !all_hits.is_all_false() {
+                    if all_hits.any() {
                         let order_code = traversal_table[traversal_code as usize];
                         let mut lane_count = 0;
                         let mut i = children.len() as u8;
diff --git a/src/bbox.rs b/src/bbox.rs
index a4a43bb..bd573ac 100644
--- a/src/bbox.rs
+++ b/src/bbox.rs
@@ -45,12 +45,12 @@ impl BBox {
         let t2 = (self.max.co - orig.co) * dir_inv.co;
 
         // Find the far and near intersection
-        let mut far_t = t1.v_max(t2);
-        let mut near_t = t1.v_min(t2);
-        far_t.set_3(std::f32::INFINITY);
-        near_t.set_3(0.0);
-        let far_hit_t = fast_minf32(far_t.h_min() * BBOX_MAXT_ADJUST, max_t);
-        let near_hit_t = near_t.h_max();
+        let mut far_t = t1.max(t2);
+        let mut near_t = t1.min(t2);
+        far_t.set_w(std::f32::INFINITY);
+        near_t.set_w(0.0);
+        let far_hit_t = fast_minf32(far_t.min_element() * BBOX_MAXT_ADJUST, max_t);
+        let near_hit_t = near_t.max_element();
 
         // Did we hit?
         near_hit_t <= far_hit_t
@@ -106,10 +106,10 @@ impl BitOr for BBox {
     fn bitor(self, rhs: BBox) -> BBox {
         BBox::from_points(
             Point {
-                co: self.min.co.v_min(rhs.min.co),
+                co: self.min.co.min(rhs.min.co),
             },
             Point {
-                co: self.max.co.v_max(rhs.max.co),
+                co: self.max.co.max(rhs.max.co),
             },
         )
     }
@@ -128,10 +128,10 @@ impl BitOr<Point> for BBox {
     fn bitor(self, rhs: Point) -> BBox {
         BBox::from_points(
             Point {
-                co: self.min.co.v_min(rhs.co),
+                co: self.min.co.min(rhs.co),
             },
             Point {
-                co: self.max.co.v_max(rhs.co),
+                co: self.max.co.max(rhs.co),
             },
         )
     }
diff --git a/src/bbox4.rs b/src/bbox4.rs
index 71793a4..07cb456 100644
--- a/src/bbox4.rs
+++ b/src/bbox4.rs
@@ -9,16 +9,16 @@ use crate::{
     math::{Point, Vector},
 };
 
-use float4::{Bool4, Float4};
+use glam::{Vec4, Vec4Mask};
 
 const BBOX_MAXT_ADJUST: f32 = 1.00000024;
 
 /// A SIMD set of 4 3D axis-aligned bounding boxes.
 #[derive(Debug, Copy, Clone)]
 pub struct BBox4 {
-    pub x: (Float4, Float4), // (min, max)
-    pub y: (Float4, Float4), // (min, max)
-    pub z: (Float4, Float4), // (min, max)
+    pub x: (Vec4, Vec4), // (min, max)
+    pub y: (Vec4, Vec4), // (min, max)
+    pub z: (Vec4, Vec4), // (min, max)
 }
 
 impl BBox4 {
@@ -26,16 +26,16 @@ impl BBox4 {
     pub fn new() -> BBox4 {
         BBox4 {
             x: (
-                Float4::splat(std::f32::INFINITY),
-                Float4::splat(std::f32::NEG_INFINITY),
+                Vec4::splat(std::f32::INFINITY),
+                Vec4::splat(std::f32::NEG_INFINITY),
             ),
             y: (
-                Float4::splat(std::f32::INFINITY),
-                Float4::splat(std::f32::NEG_INFINITY),
+                Vec4::splat(std::f32::INFINITY),
+                Vec4::splat(std::f32::NEG_INFINITY),
             ),
             z: (
-                Float4::splat(std::f32::INFINITY),
-                Float4::splat(std::f32::NEG_INFINITY),
+                Vec4::splat(std::f32::INFINITY),
+                Vec4::splat(std::f32::NEG_INFINITY),
             ),
         }
     }
@@ -45,30 +45,30 @@ impl BBox4 {
     pub fn from_bboxes(b1: BBox, b2: BBox, b3: BBox, b4: BBox) -> BBox4 {
         BBox4 {
             x: (
-                Float4::new(b1.min.x(), b2.min.x(), b3.min.x(), b4.min.x()),
-                Float4::new(b1.max.x(), b2.max.x(), b3.max.x(), b4.max.x()),
+                Vec4::new(b1.min.x(), b2.min.x(), b3.min.x(), b4.min.x()),
+                Vec4::new(b1.max.x(), b2.max.x(), b3.max.x(), b4.max.x()),
             ),
             y: (
-                Float4::new(b1.min.y(), b2.min.y(), b3.min.y(), b4.min.y()),
-                Float4::new(b1.max.y(), b2.max.y(), b3.max.y(), b4.max.y()),
+                Vec4::new(b1.min.y(), b2.min.y(), b3.min.y(), b4.min.y()),
+                Vec4::new(b1.max.y(), b2.max.y(), b3.max.y(), b4.max.y()),
             ),
             z: (
-                Float4::new(b1.min.z(), b2.min.z(), b3.min.z(), b4.min.z()),
-                Float4::new(b1.max.z(), b2.max.z(), b3.max.z(), b4.max.z()),
+                Vec4::new(b1.min.z(), b2.min.z(), b3.min.z(), b4.min.z()),
+                Vec4::new(b1.max.z(), b2.max.z(), b3.max.z(), b4.max.z()),
             ),
         }
     }
 
     // Returns whether the given ray intersects with the bboxes.
-    pub fn intersect_ray(&self, orig: Point, dir_inv: Vector, max_t: f32) -> Bool4 {
+    pub fn intersect_ray(&self, orig: Point, dir_inv: Vector, max_t: f32) -> Vec4Mask {
         // Get the ray data into SIMD format.
-        let ro_x = orig.co.all_0();
-        let ro_y = orig.co.all_1();
-        let ro_z = orig.co.all_2();
-        let rdi_x = dir_inv.co.all_0();
-        let rdi_y = dir_inv.co.all_1();
-        let rdi_z = dir_inv.co.all_2();
-        let max_t = Float4::splat(max_t);
+        let ro_x = Vec4::splat(orig.co.x());
+        let ro_y = Vec4::splat(orig.co.y());
+        let ro_z = Vec4::splat(orig.co.z());
+        let rdi_x = Vec4::splat(dir_inv.co.x());
+        let rdi_y = Vec4::splat(dir_inv.co.y());
+        let rdi_z = Vec4::splat(dir_inv.co.z());
+        let max_t = Vec4::splat(max_t);
 
         // Slab tests
         let t1_x = (self.x.0 - ro_x) * rdi_x;
@@ -79,24 +79,21 @@ impl BBox4 {
         let t2_z = (self.z.1 - ro_z) * rdi_z;
 
         // Get the far and near t hits for each axis.
-        let t_far_x = t1_x.v_max(t2_x);
-        let t_far_y = t1_y.v_max(t2_y);
-        let t_far_z = t1_z.v_max(t2_z);
-        let t_near_x = t1_x.v_min(t2_x);
-        let t_near_y = t1_y.v_min(t2_y);
-        let t_near_z = t1_z.v_min(t2_z);
+        let t_far_x = t1_x.max(t2_x);
+        let t_far_y = t1_y.max(t2_y);
+        let t_far_z = t1_z.max(t2_z);
+        let t_near_x = t1_x.min(t2_x);
+        let t_near_y = t1_y.min(t2_y);
+        let t_near_z = t1_z.min(t2_z);
 
         // Calculate over-all far t hit.
-        let far_t =
-            (t_far_x.v_min(t_far_y.v_min(t_far_z)) * Float4::splat(BBOX_MAXT_ADJUST)).v_min(max_t);
+        let far_t = (t_far_x.min(t_far_y.min(t_far_z)) * Vec4::splat(BBOX_MAXT_ADJUST)).min(max_t);
 
         // Calculate over-all near t hit.
-        let near_t = t_near_x
-            .v_max(t_near_y)
-            .v_max(t_near_z.v_max(Float4::splat(0.0)));
+        let near_t = t_near_x.max(t_near_y).max(t_near_z.max(Vec4::splat(0.0)));
 
         // Hit results
-        near_t.lt(far_t)
+        near_t.cmplt(far_t)
     }
 }
 
@@ -106,9 +103,9 @@ impl BitOr for BBox4 {
 
     fn bitor(self, rhs: BBox4) -> BBox4 {
         BBox4 {
-            x: (self.x.0.v_min(rhs.x.0), self.x.1.v_max(rhs.x.1)),
-            y: (self.y.0.v_min(rhs.y.0), self.y.1.v_max(rhs.y.1)),
-            z: (self.z.0.v_min(rhs.z.0), self.z.1.v_max(rhs.z.1)),
+            x: (self.x.0.min(rhs.x.0), self.x.1.max(rhs.x.1)),
+            y: (self.y.0.min(rhs.y.0), self.y.1.max(rhs.y.1)),
+            z: (self.z.0.min(rhs.z.0), self.z.1.max(rhs.z.1)),
         }
     }
 }
diff --git a/src/color.rs b/src/color.rs
index 1e25e36..891a465 100644
--- a/src/color.rs
+++ b/src/color.rs
@@ -4,7 +4,7 @@ pub use color::{
     rec709_e_to_xyz, rec709_to_xyz, xyz_to_aces_ap0, xyz_to_aces_ap0_e, xyz_to_rec709,
     xyz_to_rec709_e,
 };
-use float4::Float4;
+use glam::Vec4;
 use half::f16;
 use spectral_upsampling::meng::{spectrum_xyz_to_p_4, EQUAL_ENERGY_REFLECTANCE};
 use trifloat::signed48;
@@ -31,10 +31,10 @@ fn nth_wavelength(hero_wavelength: f32, n: usize) -> f32 {
     }
 }
 
-/// Returns all wavelengths of a hero wavelength set as a Float4
+/// Returns all wavelengths of a hero wavelength set as a Vec4
 #[inline(always)]
-fn wavelengths(hero_wavelength: f32) -> Float4 {
-    Float4::new(
+fn wavelengths(hero_wavelength: f32) -> Vec4 {
+    Vec4::new(
         nth_wavelength(hero_wavelength, 0),
         nth_wavelength(hero_wavelength, 1),
         nth_wavelength(hero_wavelength, 2),
@@ -94,11 +94,11 @@ impl Color {
             } => {
                 SpectralSample::from_parts(
                     // TODO: make this SIMD
-                    Float4::new(
-                        plancks_law(temperature, wls.get_0()) * factor,
-                        plancks_law(temperature, wls.get_1()) * factor,
-                        plancks_law(temperature, wls.get_2()) * factor,
-                        plancks_law(temperature, wls.get_3()) * factor,
+                    Vec4::new(
+                        plancks_law(temperature, wls.x()) * factor,
+                        plancks_law(temperature, wls.y()) * factor,
+                        plancks_law(temperature, wls.z()) * factor,
+                        plancks_law(temperature, wls.w()) * factor,
                     ),
                     hero_wavelength,
                 )
@@ -109,11 +109,11 @@ impl Color {
             } => {
                 SpectralSample::from_parts(
                     // TODO: make this SIMD
-                    Float4::new(
-                        plancks_law_normalized(temperature, wls.get_0()) * factor,
-                        plancks_law_normalized(temperature, wls.get_1()) * factor,
-                        plancks_law_normalized(temperature, wls.get_2()) * factor,
-                        plancks_law_normalized(temperature, wls.get_3()) * factor,
+                    Vec4::new(
+                        plancks_law_normalized(temperature, wls.x()) * factor,
+                        plancks_law_normalized(temperature, wls.y()) * factor,
+                        plancks_law_normalized(temperature, wls.z()) * factor,
+                        plancks_law_normalized(temperature, wls.w()) * factor,
                     ),
                     hero_wavelength,
                 )
@@ -388,7 +388,7 @@ fn plancks_law_normalized(temperature: f32, wavelength: f32) -> f32 {
 
 #[derive(Copy, Clone, Debug)]
 pub struct SpectralSample {
-    pub e: Float4,
+    pub e: Vec4,
     hero_wavelength: f32,
 }
 
@@ -396,7 +396,7 @@ impl SpectralSample {
     pub fn new(wavelength: f32) -> SpectralSample {
         debug_assert!(wavelength >= WL_MIN && wavelength <= WL_MAX);
         SpectralSample {
-            e: Float4::splat(0.0),
+            e: Vec4::splat(0.0),
             hero_wavelength: wavelength,
         }
     }
@@ -405,12 +405,12 @@ impl SpectralSample {
     pub fn from_value(value: f32, wavelength: f32) -> SpectralSample {
         debug_assert!(wavelength >= WL_MIN && wavelength <= WL_MAX);
         SpectralSample {
-            e: Float4::splat(value),
+            e: Vec4::splat(value),
             hero_wavelength: wavelength,
         }
     }
 
-    pub fn from_parts(e: Float4, wavelength: f32) -> SpectralSample {
+    pub fn from_parts(e: Vec4, wavelength: f32) -> SpectralSample {
         debug_assert!(wavelength >= WL_MIN && wavelength <= WL_MAX);
         SpectralSample {
             e: e,
@@ -520,10 +520,10 @@ impl XYZ {
     }
 
     pub fn from_spectral_sample(ss: &SpectralSample) -> XYZ {
-        let xyz0 = XYZ::from_wavelength(ss.wl_n(0), ss.e.get_0());
-        let xyz1 = XYZ::from_wavelength(ss.wl_n(1), ss.e.get_1());
-        let xyz2 = XYZ::from_wavelength(ss.wl_n(2), ss.e.get_2());
-        let xyz3 = XYZ::from_wavelength(ss.wl_n(3), ss.e.get_3());
+        let xyz0 = XYZ::from_wavelength(ss.wl_n(0), ss.e.x());
+        let xyz1 = XYZ::from_wavelength(ss.wl_n(1), ss.e.y());
+        let xyz2 = XYZ::from_wavelength(ss.wl_n(2), ss.e.z());
+        let xyz3 = XYZ::from_wavelength(ss.wl_n(3), ss.e.w());
         (xyz0 + xyz1 + xyz2 + xyz3) * 0.75
     }
 
@@ -601,8 +601,8 @@ impl DivAssign<f32> for XYZ {
 /// the method in the paper "Physically Meaningful Rendering using Tristimulus
 /// Colours" by Meng et al.
 #[inline(always)]
-fn xyz_to_spectrum_4(xyz: (f32, f32, f32), wavelengths: Float4) -> Float4 {
-    spectrum_xyz_to_p_4(wavelengths, xyz) * Float4::splat(1.0 / EQUAL_ENERGY_REFLECTANCE)
+fn xyz_to_spectrum_4(xyz: (f32, f32, f32), wavelengths: Vec4) -> Vec4 {
+    spectrum_xyz_to_p_4(wavelengths, xyz) * Vec4::splat(1.0 / EQUAL_ENERGY_REFLECTANCE)
     // aces_to_spectrum_p4(wavelengths, xyz_to_aces_ap0_e(xyz))
 }
 
diff --git a/src/lerp.rs b/src/lerp.rs
index fbfa659..0449c1d 100644
--- a/src/lerp.rs
+++ b/src/lerp.rs
@@ -73,23 +73,15 @@ impl<T: Lerp> Lerp for (T, T) {
     }
 }
 
-impl Lerp for float4::Float4 {
-    fn lerp(self, other: float4::Float4, alpha: f32) -> float4::Float4 {
+impl Lerp for glam::Vec4 {
+    fn lerp(self, other: glam::Vec4, alpha: f32) -> glam::Vec4 {
         (self * (1.0 - alpha)) + (other * alpha)
     }
 }
 
 impl Lerp for Matrix4x4 {
     fn lerp(self, other: Matrix4x4, alpha: f32) -> Matrix4x4 {
-        let alpha_minus = 1.0 - alpha;
-        Matrix4x4 {
-            values: [
-                (self[0] * alpha_minus) + (other[0] * alpha),
-                (self[1] * alpha_minus) + (other[1] * alpha),
-                (self[2] * alpha_minus) + (other[2] * alpha),
-                (self[3] * alpha_minus) + (other[3] * alpha),
-            ],
-        }
+        (self * (1.0 - alpha)) + (other * alpha)
     }
 }
 
diff --git a/src/ray.rs b/src/ray.rs
index 7c2bc83..f2055ac 100644
--- a/src/ray.rs
+++ b/src/ray.rs
@@ -1,6 +1,6 @@
 #![allow(dead_code)]
 
-use float4::{Bool4, Float4};
+use glam::{Vec4, Vec4Mask};
 
 use crate::math::{Matrix4x4, Point, Vector};
 
@@ -86,7 +86,7 @@ impl RayBatch {
     pub fn set_from_ray(&mut self, ray: &Ray, is_occlusion: bool, idx: usize) {
         self.hot[idx].orig_local = ray.orig;
         self.hot[idx].dir_inv_local = Vector {
-            co: Float4::splat(1.0) / ray.dir.co,
+            co: Vec4::splat(1.0) / ray.dir.co,
         };
         self.hot[idx].max_t = ray.max_t;
         self.hot[idx].time = ray.time;
@@ -122,7 +122,7 @@ impl RayBatch {
     pub fn update_local(&mut self, idx: usize, xform: &Matrix4x4) {
         self.hot[idx].orig_local = self.cold[idx].orig * *xform;
         self.hot[idx].dir_inv_local = Vector {
-            co: Float4::splat(1.0) / (self.cold[idx].dir * *xform).co,
+            co: Vec4::splat(1.0) / (self.cold[idx].dir * *xform).co,
         };
     }
 
@@ -349,7 +349,7 @@ impl RayStack {
     /// indicated lanes.
     pub fn pop_do_next_task_and_push_rays<F>(&mut self, output_lane_count: usize, mut handle_ray: F)
     where
-        F: FnMut(usize) -> Bool4,
+        F: FnMut(usize) -> Vec4Mask,
     {
         // Pop the task and do necessary bookkeeping.
         let task = self.tasks.pop().unwrap();
@@ -372,9 +372,9 @@ impl RayStack {
         // Execute task.
         for i in task_range.0..task_range.1 {
             let ray_idx = *unsafe { self.lanes[task.lane].idxs.get_unchecked(i) };
-            let push_mask = handle_ray(ray_idx as usize);
+            let push_mask = handle_ray(ray_idx as usize).bitmask();
             for l in 0..output_lane_count {
-                if push_mask.get_n(l) {
+                if (push_mask & (1 << l)) != 0 {
                     self.lanes[l as usize].idxs.push(ray_idx);
                 }
             }
diff --git a/src/renderer.rs b/src/renderer.rs
index 50d3061..d956f26 100644
--- a/src/renderer.rs
+++ b/src/renderer.rs
@@ -9,7 +9,7 @@ use std::{
 use crossbeam::sync::MsQueue;
 use scoped_threadpool::Pool;
 
-use float4::Float4;
+use glam::Vec4;
 
 use crate::{
     accel::ACCEL_NODE_RAY_TESTS,
@@ -374,12 +374,12 @@ pub struct LightPath {
     wavelength: f32,
 
     next_bounce_ray: Option<Ray>,
-    next_attenuation_fac: Float4,
+    next_attenuation_fac: Vec4,
 
     closure_sample_pdf: f32,
-    light_attenuation: Float4,
-    pending_color_addition: Float4,
-    color: Float4,
+    light_attenuation: Vec4,
+    pending_color_addition: Vec4,
+    color: Vec4,
 }
 
 #[allow(clippy::new_ret_no_self)]
@@ -405,12 +405,12 @@ impl LightPath {
                 wavelength: wavelength,
 
                 next_bounce_ray: None,
-                next_attenuation_fac: Float4::splat(1.0),
+                next_attenuation_fac: Vec4::splat(1.0),
 
                 closure_sample_pdf: 1.0,
-                light_attenuation: Float4::splat(1.0),
-                pending_color_addition: Float4::splat(0.0),
-                color: Float4::splat(0.0),
+                light_attenuation: Vec4::splat(1.0),
+                pending_color_addition: Vec4::splat(0.0),
+                color: Vec4::splat(0.0),
             },
             scene.camera.generate_ray(
                 image_plane_co.0,
@@ -565,7 +565,7 @@ impl LightPath {
 
                         // If there's any possible contribution, set up for a
                         // light ray.
-                        if attenuation.e.h_max() <= 0.0 {
+                        if attenuation.e.max_element() <= 0.0 {
                             false
                         } else {
                             // Calculate and store the light that will be contributed
@@ -599,7 +599,7 @@ impl LightPath {
                         };
 
                         // Check if pdf is zero, to avoid NaN's.
-                        if (pdf > 0.0) && (filter.e.h_max() > 0.0) {
+                        if (pdf > 0.0) && (filter.e.max_element() > 0.0) {
                             // Account for the additional light attenuation from
                             // this bounce
                             self.next_attenuation_fac = filter.e;
diff --git a/src/shading/surface_closure.rs b/src/shading/surface_closure.rs
index be14360..10713eb 100644
--- a/src/shading/surface_closure.rs
+++ b/src/shading/surface_closure.rs
@@ -2,7 +2,7 @@
 
 use std::f32::consts::PI as PI_32;
 
-use float4::Float4;
+use glam::Vec4;
 
 use crate::{
     color::{Color, SpectralSample},
@@ -492,27 +492,27 @@ mod ggx_closure {
             let spectrum_sample = col.to_spectral_sample(wavelength);
             let rev_fresnel = 1.0 - fresnel;
             let c0 = lerp(
-                schlick_fresnel_from_fac(spectrum_sample.e.get_0(), hb),
-                spectrum_sample.e.get_0(),
+                schlick_fresnel_from_fac(spectrum_sample.e.x(), hb),
+                spectrum_sample.e.x(),
                 rev_fresnel,
             );
             let c1 = lerp(
-                schlick_fresnel_from_fac(spectrum_sample.e.get_1(), hb),
-                spectrum_sample.e.get_1(),
+                schlick_fresnel_from_fac(spectrum_sample.e.y(), hb),
+                spectrum_sample.e.y(),
                 rev_fresnel,
             );
             let c2 = lerp(
-                schlick_fresnel_from_fac(spectrum_sample.e.get_2(), hb),
-                spectrum_sample.e.get_2(),
+                schlick_fresnel_from_fac(spectrum_sample.e.z(), hb),
+                spectrum_sample.e.z(),
                 rev_fresnel,
             );
             let c3 = lerp(
-                schlick_fresnel_from_fac(spectrum_sample.e.get_3(), hb),
-                spectrum_sample.e.get_3(),
+                schlick_fresnel_from_fac(spectrum_sample.e.w(), hb),
+                spectrum_sample.e.w(),
                 rev_fresnel,
             );
 
-            SpectralSample::from_parts(Float4::new(c0, c1, c2, c3), wavelength)
+            SpectralSample::from_parts(Vec4::new(c0, c1, c2, c3), wavelength)
         };
 
         // Calculate everything else
diff --git a/src/surface/triangle.rs b/src/surface/triangle.rs
index 4aed3a3..5e0f60f 100644
--- a/src/surface/triangle.rs
+++ b/src/surface/triangle.rs
@@ -163,7 +163,7 @@ pub fn surface_point(tri: (Point, Point, Point), bary: (f32, f32, f32)) -> (Poin
         + (tri.2.into_vector().abs() * bary.2))
         * fp_gamma(7))
     .co
-    .h_max();
+    .max_element();
 
     (pos, pos_err)
 }
diff --git a/sub_crates/float4/Cargo.toml b/sub_crates/float4/Cargo.toml
deleted file mode 100644
index 3cc0324..0000000
--- a/sub_crates/float4/Cargo.toml
+++ /dev/null
@@ -1,10 +0,0 @@
-[package]
-name = "float4"
-version = "0.1.0"
-authors = ["Nathan Vegdahl <cessen@cessen.com>"]
-edition = "2018"
-license = "MIT"
-
-[lib]
-name = "float4"
-path = "src/lib.rs"
diff --git a/sub_crates/float4/src/lib.rs b/sub_crates/float4/src/lib.rs
deleted file mode 100644
index 0f081b3..0000000
--- a/sub_crates/float4/src/lib.rs
+++ /dev/null
@@ -1,1620 +0,0 @@
-#![allow(dead_code)]
-
-/// Implementation of Float4 for x86_64 platforms with SSE support.
-#[cfg(all(target_arch = "x86_64", target_feature = "sse"))]
-mod x86_64_sse {
-    use std::{
-        arch::x86_64::__m128,
-        cmp::PartialEq,
-        ops::{Add, AddAssign, BitAnd, BitOr, Div, DivAssign, Mul, MulAssign, Sub, SubAssign},
-    };
-
-    #[derive(Debug, Copy, Clone)]
-    pub struct Float4 {
-        data: __m128,
-    }
-
-    impl Float4 {
-        #[inline(always)]
-        pub fn new(a: f32, b: f32, c: f32, d: f32) -> Float4 {
-            use std::arch::x86_64::_mm_set_ps;
-            Float4 {
-                data: unsafe { _mm_set_ps(d, c, b, a) },
-            }
-        }
-
-        #[inline(always)]
-        pub fn splat(n: f32) -> Float4 {
-            use std::arch::x86_64::_mm_set1_ps;
-            Float4 {
-                data: unsafe { _mm_set1_ps(n) },
-            }
-        }
-
-        #[inline]
-        pub fn h_sum(&self) -> f32 {
-            #[cfg(target_feature = "sse3")]
-            {
-                use std::arch::x86_64::{
-                    _mm_add_ps, _mm_add_ss, _mm_cvtss_f32, _mm_movehdup_ps, _mm_movehl_ps,
-                };
-                unsafe {
-                    let v = self.data;
-                    let shuf = _mm_movehdup_ps(v);
-                    let sums = _mm_add_ps(v, shuf);
-                    let shuf = _mm_movehl_ps(shuf, sums);
-                    let sums = _mm_add_ss(sums, shuf);
-                    _mm_cvtss_f32(sums)
-                }
-            }
-            #[cfg(not(target_feature = "sse3"))]
-            {
-                use std::arch::x86_64::{
-                    _mm_add_ps, _mm_add_ss, _mm_cvtss_f32, _mm_movehl_ps, _mm_shuffle_ps,
-                };
-                unsafe {
-                    let v = self.data;
-                    let shuf = _mm_shuffle_ps(v, v, (2 << 6) | (3 << 4) | 1);
-                    let sums = _mm_add_ps(v, shuf);
-                    let shuf = _mm_movehl_ps(shuf, sums);
-                    let sums = _mm_add_ss(sums, shuf);
-                    _mm_cvtss_f32(sums)
-                }
-            }
-        }
-
-        #[inline]
-        pub fn h_product(&self) -> f32 {
-            (self.get_0() * self.get_1()) * (self.get_2() * self.get_3())
-        }
-
-        #[inline]
-        pub fn h_min(&self) -> f32 {
-            let n1 = if self.get_0() < self.get_1() {
-                self.get_0()
-            } else {
-                self.get_1()
-            };
-            let n2 = if self.get_2() < self.get_3() {
-                self.get_2()
-            } else {
-                self.get_3()
-            };
-            if n1 < n2 {
-                n1
-            } else {
-                n2
-            }
-        }
-
-        #[inline]
-        pub fn h_max(&self) -> f32 {
-            let n1 = if self.get_0() > self.get_1() {
-                self.get_0()
-            } else {
-                self.get_1()
-            };
-            let n2 = if self.get_2() > self.get_3() {
-                self.get_2()
-            } else {
-                self.get_3()
-            };
-            if n1 > n2 {
-                n1
-            } else {
-                n2
-            }
-        }
-
-        #[inline(always)]
-        pub fn v_min(&self, other: Float4) -> Float4 {
-            use std::arch::x86_64::_mm_min_ps;
-            Float4 {
-                data: unsafe { _mm_min_ps(self.data, other.data) },
-            }
-        }
-
-        #[inline(always)]
-        pub fn v_max(&self, other: Float4) -> Float4 {
-            use std::arch::x86_64::_mm_max_ps;
-            Float4 {
-                data: unsafe { _mm_max_ps(self.data, other.data) },
-            }
-        }
-
-        #[inline(always)]
-        pub fn lt(&self, other: Float4) -> Bool4 {
-            use std::arch::x86_64::_mm_cmplt_ps;
-            Bool4 {
-                data: unsafe { _mm_cmplt_ps(self.data, other.data) },
-            }
-        }
-
-        #[inline(always)]
-        pub fn lte(&self, other: Float4) -> Bool4 {
-            use std::arch::x86_64::_mm_cmple_ps;
-            Bool4 {
-                data: unsafe { _mm_cmple_ps(self.data, other.data) },
-            }
-        }
-
-        #[inline(always)]
-        pub fn gt(&self, other: Float4) -> Bool4 {
-            use std::arch::x86_64::_mm_cmpgt_ps;
-            Bool4 {
-                data: unsafe { _mm_cmpgt_ps(self.data, other.data) },
-            }
-        }
-
-        #[inline(always)]
-        pub fn gte(&self, other: Float4) -> Bool4 {
-            use std::arch::x86_64::_mm_cmpge_ps;
-            Bool4 {
-                data: unsafe { _mm_cmpge_ps(self.data, other.data) },
-            }
-        }
-
-        /// Set the nth element to the given value.
-        #[inline(always)]
-        pub fn set_n(&mut self, n: usize, v: f32) {
-            assert!(
-                n <= 3,
-                "Attempted to set element of Float4 outside of bounds."
-            );
-
-            unsafe { *(&mut self.data as *mut std::arch::x86_64::__m128 as *mut f32).add(n) = v }
-        }
-
-        /// Set the 0th element to the given value.
-        #[inline(always)]
-        pub fn set_0(&mut self, v: f32) {
-            self.set_n(0, v);
-        }
-
-        /// Set the 1th element to the given value.
-        #[inline(always)]
-        pub fn set_1(&mut self, v: f32) {
-            self.set_n(1, v);
-        }
-
-        /// Set the 2th element to the given value.
-        #[inline(always)]
-        pub fn set_2(&mut self, v: f32) {
-            self.set_n(2, v);
-        }
-
-        /// Set the 3th element to the given value.
-        #[inline(always)]
-        pub fn set_3(&mut self, v: f32) {
-            self.set_n(3, v);
-        }
-
-        /// Returns the value of the nth element.
-        #[inline(always)]
-        pub fn get_n(&self, n: usize) -> f32 {
-            assert!(
-                n <= 3,
-                "Attempted to access element of Float4 outside of bounds."
-            );
-
-            unsafe { *(&self.data as *const std::arch::x86_64::__m128 as *const f32).add(n) }
-        }
-
-        /// Returns the value of the 0th element.
-        #[inline(always)]
-        pub fn get_0(&self) -> f32 {
-            self.get_n(0)
-        }
-
-        /// Returns the value of the 1th element.
-        #[inline(always)]
-        pub fn get_1(&self) -> f32 {
-            self.get_n(1)
-        }
-
-        /// Returns the value of the 2th element.
-        #[inline(always)]
-        pub fn get_2(&self) -> f32 {
-            self.get_n(2)
-        }
-
-        /// Returns the value of the 3th element.
-        #[inline(always)]
-        pub fn get_3(&self) -> f32 {
-            self.get_n(3)
-        }
-
-        /// Returns a Float4 with all elements set to the value
-        /// of element 0.
-        #[inline(always)]
-        pub fn all_0(&self) -> Float4 {
-            use std::arch::x86_64::_mm_shuffle_ps;
-            Float4 {
-                data: unsafe { _mm_shuffle_ps(self.data, self.data, 0b00_00_00_00) },
-            }
-        }
-
-        /// Returns a Float4 with all elements set to the value
-        /// of element 1.
-        #[inline(always)]
-        pub fn all_1(&self) -> Float4 {
-            use std::arch::x86_64::_mm_shuffle_ps;
-            Float4 {
-                data: unsafe { _mm_shuffle_ps(self.data, self.data, 0b01_01_01_01) },
-            }
-        }
-
-        /// Returns a Float4 with all elements set to the value
-        /// of element 2.
-        #[inline(always)]
-        pub fn all_2(&self) -> Float4 {
-            use std::arch::x86_64::_mm_shuffle_ps;
-            Float4 {
-                data: unsafe { _mm_shuffle_ps(self.data, self.data, 0b10_10_10_10) },
-            }
-        }
-
-        /// Returns a Float4 with all elements set to the value
-        /// of element 3.
-        #[inline(always)]
-        pub fn all_3(&self) -> Float4 {
-            use std::arch::x86_64::_mm_shuffle_ps;
-            Float4 {
-                data: unsafe { _mm_shuffle_ps(self.data, self.data, 0b11_11_11_11) },
-            }
-        }
-
-        /// Returns the square roots of all elements.
-        #[inline(always)]
-        pub fn sqrt(&self) -> Float4 {
-            use std::arch::x86_64::_mm_sqrt_ps;
-            Float4 {
-                data: unsafe { _mm_sqrt_ps(self.data) },
-            }
-        }
-
-        /// Performs a fused multiply add.
-        ///
-        /// i.e. self * b + c
-        #[inline(always)]
-        pub fn fmadd(&self, b: Float4, c: Float4) -> Float4 {
-            #[cfg(target_feature = "fma")]
-            {
-                use std::arch::x86_64::_mm_fmadd_ps;
-                Float4 {
-                    data: unsafe { _mm_fmadd_ps(self.data, b.data, c.data) },
-                }
-            }
-            #[cfg(not(target_feature = "fma"))]
-            {
-                (*self * b) + c
-            }
-        }
-    }
-
-    impl PartialEq for Float4 {
-        #[inline]
-        fn eq(&self, other: &Float4) -> bool {
-            self.get_0() == other.get_0()
-                && self.get_1() == other.get_1()
-                && self.get_2() == other.get_2()
-                && self.get_3() == other.get_3()
-        }
-    }
-
-    impl Add for Float4 {
-        type Output = Float4;
-
-        #[inline(always)]
-        fn add(self, other: Float4) -> Float4 {
-            use std::arch::x86_64::_mm_add_ps;
-            Float4 {
-                data: unsafe { _mm_add_ps(self.data, other.data) },
-            }
-        }
-    }
-
-    impl AddAssign for Float4 {
-        #[inline(always)]
-        fn add_assign(&mut self, rhs: Float4) {
-            *self = *self + rhs;
-        }
-    }
-
-    impl Sub for Float4 {
-        type Output = Float4;
-
-        #[inline(always)]
-        fn sub(self, other: Float4) -> Float4 {
-            use std::arch::x86_64::_mm_sub_ps;
-            Float4 {
-                data: unsafe { _mm_sub_ps(self.data, other.data) },
-            }
-        }
-    }
-
-    impl SubAssign for Float4 {
-        #[inline(always)]
-        fn sub_assign(&mut self, rhs: Float4) {
-            *self = *self - rhs;
-        }
-    }
-
-    impl Mul for Float4 {
-        type Output = Float4;
-
-        #[inline(always)]
-        fn mul(self, other: Float4) -> Float4 {
-            use std::arch::x86_64::_mm_mul_ps;
-            Float4 {
-                data: unsafe { _mm_mul_ps(self.data, other.data) },
-            }
-        }
-    }
-
-    impl Mul<f32> for Float4 {
-        type Output = Float4;
-
-        #[inline(always)]
-        fn mul(self, other: f32) -> Float4 {
-            self * Float4::splat(other)
-        }
-    }
-
-    impl MulAssign for Float4 {
-        #[inline(always)]
-        fn mul_assign(&mut self, rhs: Float4) {
-            *self = *self * rhs;
-        }
-    }
-
-    impl MulAssign<f32> for Float4 {
-        #[inline(always)]
-        fn mul_assign(&mut self, rhs: f32) {
-            *self = *self * rhs;
-        }
-    }
-
-    impl Div for Float4 {
-        type Output = Float4;
-
-        #[inline(always)]
-        fn div(self, other: Float4) -> Float4 {
-            use std::arch::x86_64::_mm_div_ps;
-            Float4 {
-                data: unsafe { _mm_div_ps(self.data, other.data) },
-            }
-        }
-    }
-
-    impl Div<f32> for Float4 {
-        type Output = Float4;
-
-        #[inline(always)]
-        fn div(self, other: f32) -> Float4 {
-            self / Float4::splat(other)
-        }
-    }
-
-    impl DivAssign for Float4 {
-        #[inline(always)]
-        fn div_assign(&mut self, rhs: Float4) {
-            *self = *self / rhs;
-        }
-    }
-
-    impl DivAssign<f32> for Float4 {
-        #[inline(always)]
-        fn div_assign(&mut self, rhs: f32) {
-            *self = *self / rhs;
-        }
-    }
-
-    // Free functions for Float4
-
-    #[inline(always)]
-    pub fn v_min(a: Float4, b: Float4) -> Float4 {
-        a.v_min(b)
-    }
-
-    #[inline(always)]
-    pub fn v_max(a: Float4, b: Float4) -> Float4 {
-        a.v_max(b)
-    }
-
-    /// Transposes a 4x4 matrix in-place.
-    #[inline(always)]
-    pub fn transpose(matrix: &mut [Float4; 4]) {
-        use std::arch::x86_64::_MM_TRANSPOSE4_PS;
-
-        // The weird &mut/*mut gymnastics below are to get around
-        // the borrow-checker.  We know statically that these references
-        // are non-overlapping, so it's safe.
-        unsafe {
-            _MM_TRANSPOSE4_PS(
-                &mut *(&mut matrix[0].data as *mut __m128),
-                &mut *(&mut matrix[1].data as *mut __m128),
-                &mut *(&mut matrix[2].data as *mut __m128),
-                &mut *(&mut matrix[3].data as *mut __m128),
-            )
-        };
-    }
-
-    /// Inverts a 4x4 matrix and returns the determinate.
-    #[inline(always)]
-    pub fn invert(matrix: &mut [Float4; 4]) -> f32 {
-        // Code pulled from "Streaming SIMD Extensions - Inverse of 4x4 Matrix"
-        // by Intel.
-        // ftp://download.intel.com/design/PentiumIII/sml/24504301.pdf
-        // Ported to Rust.
-
-        // TODO: once __m64 and accompanying intrinsics are stabilized, switch
-        // to using those, commented out in the code below.
-        use std::arch::x86_64::{
-            _mm_add_ps,
-            _mm_add_ss,
-            _mm_cvtss_f32,
-            _mm_mul_ps,
-            _mm_mul_ss,
-            _mm_rcp_ss,
-            // _mm_loadh_pi,
-            // _mm_loadl_pi,
-            // _mm_storeh_pi,
-            // _mm_storel_pi,
-            _mm_set_ps,
-            _mm_shuffle_ps,
-            _mm_sub_ps,
-            _mm_sub_ss,
-        };
-        use std::mem::transmute;
-
-        let mut minor0: __m128;
-        let mut minor1: __m128;
-        let mut minor2: __m128;
-        let mut minor3: __m128;
-        let row0: __m128;
-        let mut row1: __m128;
-        let mut row2: __m128;
-        let mut row3: __m128;
-        let mut det: __m128;
-        let mut tmp1: __m128;
-
-        unsafe {
-            // tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src)), (__m64*)(src+ 4));
-            tmp1 = _mm_set_ps(
-                matrix[1].get_1(),
-                matrix[1].get_0(),
-                matrix[0].get_1(),
-                matrix[0].get_0(),
-            );
-
-            // row1 = _mm_loadh_pi(_mm_loadl_pi(row1, (__m64*)(src+8)), (__m64*)(src+12));
-            row1 = _mm_set_ps(
-                matrix[3].get_1(),
-                matrix[3].get_0(),
-                matrix[2].get_1(),
-                matrix[2].get_0(),
-            );
-
-            row0 = _mm_shuffle_ps(tmp1, row1, 0x88);
-            row1 = _mm_shuffle_ps(row1, tmp1, 0xDD);
-
-            // tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src+ 2)), (__m64*)(src+ 6));
-            tmp1 = _mm_set_ps(
-                matrix[1].get_3(),
-                matrix[1].get_2(),
-                matrix[0].get_3(),
-                matrix[0].get_2(),
-            );
-
-            // row3 = _mm_loadh_pi(_mm_loadl_pi(row3, (__m64*)(src+10)), (__m64*)(src+14));
-            row3 = _mm_set_ps(
-                matrix[3].get_3(),
-                matrix[3].get_2(),
-                matrix[2].get_3(),
-                matrix[2].get_2(),
-            );
-
-            row2 = _mm_shuffle_ps(tmp1, row3, 0x88);
-            row3 = _mm_shuffle_ps(row3, tmp1, 0xDD);
-            // -----------------------------------------------
-            tmp1 = _mm_mul_ps(row2, row3);
-            tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
-            minor0 = _mm_mul_ps(row1, tmp1);
-            minor1 = _mm_mul_ps(row0, tmp1);
-            tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
-            minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0);
-            minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1);
-            minor1 = _mm_shuffle_ps(minor1, minor1, 0x4E);
-            // -----------------------------------------------
-            tmp1 = _mm_mul_ps(row1, row2);
-            tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
-            minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0);
-            minor3 = _mm_mul_ps(row0, tmp1);
-            tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
-            minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1));
-            minor3 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor3);
-            minor3 = _mm_shuffle_ps(minor3, minor3, 0x4E);
-            // -----------------------------------------------
-            tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, 0x4E), row3);
-            tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
-            row2 = _mm_shuffle_ps(row2, row2, 0x4E);
-            minor0 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0);
-            minor2 = _mm_mul_ps(row0, tmp1);
-            tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
-            minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1));
-            minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2);
-            minor2 = _mm_shuffle_ps(minor2, minor2, 0x4E);
-            // -----------------------------------------------
-            tmp1 = _mm_mul_ps(row0, row1);
-            tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
-            minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2);
-            minor3 = _mm_sub_ps(_mm_mul_ps(row2, tmp1), minor3);
-            tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
-            minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2);
-            minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row2, tmp1));
-            // -----------------------------------------------
-            tmp1 = _mm_mul_ps(row0, row3);
-            tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
-            minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1));
-            minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2);
-            tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
-            minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1);
-            minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1));
-            // -----------------------------------------------
-            tmp1 = _mm_mul_ps(row0, row2);
-            tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
-            minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1);
-            minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row1, tmp1));
-            tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
-            minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1));
-            minor3 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor3);
-            // -----------------------------------------------
-            det = _mm_mul_ps(row0, minor0);
-            det = _mm_add_ps(_mm_shuffle_ps(det, det, 0x4E), det);
-            det = _mm_add_ss(_mm_shuffle_ps(det, det, 0xB1), det);
-            tmp1 = _mm_rcp_ss(det);
-            det = _mm_sub_ss(
-                _mm_add_ss(tmp1, tmp1),
-                _mm_mul_ss(det, _mm_mul_ss(tmp1, tmp1)),
-            );
-            det = _mm_shuffle_ps(det, det, 0x00);
-
-            minor0 = _mm_mul_ps(det, minor0);
-
-            // _mm_storel_pi((__m64*)(src), minor0);
-            // _mm_storeh_pi((__m64*)(src+2), minor0);
-            let minor0 = transmute::<__m128, [f32; 4]>(minor0);
-            matrix[0].data = _mm_set_ps(minor0[3], minor0[2], minor0[1], minor0[0]);
-
-            minor1 = _mm_mul_ps(det, minor1);
-
-            // _mm_storel_pi((__m64*)(src+4), minor1);
-            // _mm_storeh_pi((__m64*)(src+6), minor1);
-            let minor1 = transmute::<__m128, [f32; 4]>(minor1);
-            matrix[1].data = _mm_set_ps(minor1[3], minor1[2], minor1[1], minor1[0]);
-
-            minor2 = _mm_mul_ps(det, minor2);
-
-            // _mm_storel_pi((__m64*)(src+ 8), minor2);
-            // _mm_storeh_pi((__m64*)(src+10), minor2);
-            let minor2 = transmute::<__m128, [f32; 4]>(minor2);
-            matrix[2].data = _mm_set_ps(minor2[3], minor2[2], minor2[1], minor2[0]);
-
-            minor3 = _mm_mul_ps(det, minor3);
-
-            // _mm_storel_pi((__m64*)(src+12), minor3);
-            // _mm_storeh_pi((__m64*)(src+14), minor3);
-            let minor3 = transmute::<__m128, [f32; 4]>(minor3);
-            matrix[3].data = _mm_set_ps(minor3[3], minor3[2], minor3[1], minor3[0]);
-
-            _mm_cvtss_f32(det)
-        }
-    }
-
-    /// Essentially a tuple of four bools, which will use SIMD operations
-    /// where possible on a platform.
-    #[derive(Debug, Copy, Clone)]
-    pub struct Bool4 {
-        data: __m128,
-    }
-
-    impl Bool4 {
-        #[inline(always)]
-        pub fn new(a: bool, b: bool, c: bool, d: bool) -> Bool4 {
-            use std::arch::x86_64::_mm_set_ps;
-            Bool4 {
-                data: unsafe {
-                    _mm_set_ps(
-                        if d { 1.0 } else { 0.0 },
-                        if c { 1.0 } else { 0.0 },
-                        if b { 1.0 } else { 0.0 },
-                        if a { 1.0 } else { 0.0 },
-                    )
-                },
-            }
-        }
-
-        #[inline(always)]
-        pub fn new_false() -> Bool4 {
-            use std::arch::x86_64::_mm_set1_ps;
-            Bool4 {
-                data: unsafe { _mm_set1_ps(0.0) },
-            }
-        }
-
-        /// Returns the value of the nth element.
-        #[inline(always)]
-        pub fn get_n(&self, n: usize) -> bool {
-            assert!(
-                n <= 3,
-                "Attempted to access element of Bool4 outside of bounds."
-            );
-
-            0 != unsafe { *(&self.data as *const std::arch::x86_64::__m128 as *const u32).add(n) }
-        }
-
-        /// Returns the value of the 0th element.
-        #[inline(always)]
-        pub fn get_0(&self) -> bool {
-            self.get_n(0)
-        }
-
-        /// Returns the value of the 1st element.
-        #[inline(always)]
-        pub fn get_1(&self) -> bool {
-            self.get_n(1)
-        }
-
-        /// Returns the value of the 2nd element.
-        #[inline(always)]
-        pub fn get_2(&self) -> bool {
-            self.get_n(2)
-        }
-
-        /// Returns the value of the 3rd element.
-        #[inline(always)]
-        pub fn get_3(&self) -> bool {
-            self.get_n(3)
-        }
-
-        /// Returns whether all four bools are false.
-        ///
-        /// This is the `NOT` operation on the result of `OR`ing all the
-        /// contained bools.  If even one bool is true, this returns false.
-        #[inline(always)]
-        pub fn is_all_false(&self) -> bool {
-            let a = unsafe { *(&self.data as *const __m128 as *const u128) };
-            a == 0
-        }
-
-        #[inline]
-        pub fn to_bitmask(&self) -> u8 {
-            let a = unsafe { *(&self.data as *const __m128 as *const u8).offset(0) };
-            let b = unsafe { *(&self.data as *const __m128 as *const u8).offset(4) };
-            let c = unsafe { *(&self.data as *const __m128 as *const u8).offset(8) };
-            let d = unsafe { *(&self.data as *const __m128 as *const u8).offset(12) };
-            (a & 0b0000_0001) | (b & 0b0000_0010) | (c & 0b0000_0100) | (d & 0b0000_1000)
-        }
-    }
-
-    impl BitAnd for Bool4 {
-        type Output = Bool4;
-
-        #[inline(always)]
-        fn bitand(self, rhs: Bool4) -> Bool4 {
-            use std::arch::x86_64::_mm_and_ps;
-            Bool4 {
-                data: unsafe { _mm_and_ps(self.data, rhs.data) },
-            }
-        }
-    }
-
-    impl BitOr for Bool4 {
-        type Output = Bool4;
-
-        #[inline(always)]
-        fn bitor(self, rhs: Bool4) -> Bool4 {
-            use std::arch::x86_64::_mm_or_ps;
-            Bool4 {
-                data: unsafe { _mm_or_ps(self.data, rhs.data) },
-            }
-        }
-    }
-}
-
-//===========================================================================
-
-/// Implementation fo Float4 for any platform, foregoing any
-/// platform-specific optimizations.
-mod fallback {
-    use std::{
-        cmp::PartialEq,
-        ops::{Add, AddAssign, BitAnd, BitOr, Div, DivAssign, Mul, MulAssign, Sub, SubAssign},
-    };
-
-    #[derive(Debug, Copy, Clone)]
-    pub struct Float4 {
-        data: [f32; 4],
-    }
-
-    impl Float4 {
-        #[inline(always)]
-        pub fn new(a: f32, b: f32, c: f32, d: f32) -> Float4 {
-            Float4 { data: [a, b, c, d] }
-        }
-
-        #[inline(always)]
-        pub fn splat(n: f32) -> Float4 {
-            Float4 { data: [n, n, n, n] }
-        }
-
-        #[inline]
-        pub fn h_sum(&self) -> f32 {
-            (self.get_0() + self.get_1()) + (self.get_2() + self.get_3())
-        }
-
-        #[inline]
-        pub fn h_product(&self) -> f32 {
-            (self.get_0() * self.get_1()) * (self.get_2() * self.get_3())
-        }
-
-        #[inline]
-        pub fn h_min(&self) -> f32 {
-            let n1 = if self.get_0() < self.get_1() {
-                self.get_0()
-            } else {
-                self.get_1()
-            };
-            let n2 = if self.get_2() < self.get_3() {
-                self.get_2()
-            } else {
-                self.get_3()
-            };
-            if n1 < n2 {
-                n1
-            } else {
-                n2
-            }
-        }
-
-        #[inline]
-        pub fn h_max(&self) -> f32 {
-            let n1 = if self.get_0() > self.get_1() {
-                self.get_0()
-            } else {
-                self.get_1()
-            };
-            let n2 = if self.get_2() > self.get_3() {
-                self.get_2()
-            } else {
-                self.get_3()
-            };
-            if n1 > n2 {
-                n1
-            } else {
-                n2
-            }
-        }
-
-        #[inline(always)]
-        pub fn v_min(&self, other: Float4) -> Float4 {
-            Float4::new(
-                if self.get_0() < other.get_0() {
-                    self.get_0()
-                } else {
-                    other.get_0()
-                },
-                if self.get_1() < other.get_1() {
-                    self.get_1()
-                } else {
-                    other.get_1()
-                },
-                if self.get_2() < other.get_2() {
-                    self.get_2()
-                } else {
-                    other.get_2()
-                },
-                if self.get_3() < other.get_3() {
-                    self.get_3()
-                } else {
-                    other.get_3()
-                },
-            )
-        }
-
-        #[inline(always)]
-        pub fn v_max(&self, other: Float4) -> Float4 {
-            Float4::new(
-                if self.get_0() > other.get_0() {
-                    self.get_0()
-                } else {
-                    other.get_0()
-                },
-                if self.get_1() > other.get_1() {
-                    self.get_1()
-                } else {
-                    other.get_1()
-                },
-                if self.get_2() > other.get_2() {
-                    self.get_2()
-                } else {
-                    other.get_2()
-                },
-                if self.get_3() > other.get_3() {
-                    self.get_3()
-                } else {
-                    other.get_3()
-                },
-            )
-        }
-
-        #[inline(always)]
-        pub fn lt(&self, other: Float4) -> Bool4 {
-            Bool4 {
-                data: [
-                    self.data[0] < other.data[0],
-                    self.data[1] < other.data[1],
-                    self.data[2] < other.data[2],
-                    self.data[3] < other.data[3],
-                ],
-            }
-        }
-
-        #[inline(always)]
-        pub fn lte(&self, other: Float4) -> Bool4 {
-            Bool4 {
-                data: [
-                    self.data[0] <= other.data[0],
-                    self.data[1] <= other.data[1],
-                    self.data[2] <= other.data[2],
-                    self.data[3] <= other.data[3],
-                ],
-            }
-        }
-
-        #[inline(always)]
-        pub fn gt(&self, other: Float4) -> Bool4 {
-            Bool4 {
-                data: [
-                    self.data[0] > other.data[0],
-                    self.data[1] > other.data[1],
-                    self.data[2] > other.data[2],
-                    self.data[3] > other.data[3],
-                ],
-            }
-        }
-
-        #[inline(always)]
-        pub fn gte(&self, other: Float4) -> Bool4 {
-            Bool4 {
-                data: [
-                    self.data[0] >= other.data[0],
-                    self.data[1] >= other.data[1],
-                    self.data[2] >= other.data[2],
-                    self.data[3] >= other.data[3],
-                ],
-            }
-        }
-
-        /// Set the nth element to the given value.
-        #[inline(always)]
-        pub fn set_n(&mut self, n: usize, v: f32) {
-            assert!(
-                n <= 3,
-                "Attempted to set element of Float4 outside of bounds."
-            );
-            unsafe {
-                *self.data.get_unchecked_mut(n) = v;
-            }
-        }
-
-        /// Set the 0th element to the given value.
-        #[inline(always)]
-        pub fn set_0(&mut self, v: f32) {
-            self.set_n(0, v);
-        }
-
-        /// Set the 1th element to the given value.
-        #[inline(always)]
-        pub fn set_1(&mut self, v: f32) {
-            self.set_n(1, v);
-        }
-
-        /// Set the 2th element to the given value.
-        #[inline(always)]
-        pub fn set_2(&mut self, v: f32) {
-            self.set_n(2, v);
-        }
-
-        /// Set the 3th element to the given value.
-        #[inline(always)]
-        pub fn set_3(&mut self, v: f32) {
-            self.set_n(3, v);
-        }
-
-        /// Returns the value of the nth element.
-        #[inline(always)]
-        pub fn get_n(&self, n: usize) -> f32 {
-            assert!(
-                n <= 3,
-                "Attempted to access element of Float4 outside of bounds."
-            );
-            unsafe { *self.data.get_unchecked(n) }
-        }
-
-        /// Returns the value of the 0th element.
-        #[inline(always)]
-        pub fn get_0(&self) -> f32 {
-            self.get_n(0)
-        }
-
-        /// Returns the value of the 1th element.
-        #[inline(always)]
-        pub fn get_1(&self) -> f32 {
-            self.get_n(1)
-        }
-
-        /// Returns the value of the 2th element.
-        #[inline(always)]
-        pub fn get_2(&self) -> f32 {
-            self.get_n(2)
-        }
-
-        /// Returns the value of the 3th element.
-        #[inline(always)]
-        pub fn get_3(&self) -> f32 {
-            self.get_n(3)
-        }
-
-        /// Returns a Float4 with all elements set to the value
-        /// of element 0.
-        #[inline(always)]
-        pub fn all_0(&self) -> Float4 {
-            Float4 {
-                data: [self.data[0], self.data[0], self.data[0], self.data[0]],
-            }
-        }
-
-        /// Returns a Float4 with all elements set to the value
-        /// of element 1.
-        #[inline(always)]
-        pub fn all_1(&self) -> Float4 {
-            Float4 {
-                data: [self.data[1], self.data[1], self.data[1], self.data[1]],
-            }
-        }
-
-        /// Returns a Float4 with all elements set to the value
-        /// of element 2.
-        #[inline(always)]
-        pub fn all_2(&self) -> Float4 {
-            Float4 {
-                data: [self.data[2], self.data[2], self.data[2], self.data[2]],
-            }
-        }
-
-        /// Returns a Float4 with all elements set to the value
-        /// of element 3.
-        #[inline(always)]
-        pub fn all_3(&self) -> Float4 {
-            Float4 {
-                data: [self.data[3], self.data[3], self.data[3], self.data[3]],
-            }
-        }
-
-        /// Returns the square roots of all elements.
-        #[inline(always)]
-        pub fn sqrt(&self) -> Float4 {
-            Float4::new(
-                self.get_0().sqrt(),
-                self.get_1().sqrt(),
-                self.get_2().sqrt(),
-                self.get_3().sqrt(),
-            )
-        }
-
-        /// Performs a fused multiply add.
-        ///
-        /// i.e. self * b + c
-        #[inline(always)]
-        pub fn fmadd(&self, b: Float4, c: Float4) -> Float4 {
-            (*self * b) + c
-        }
-    }
-
-    impl PartialEq for Float4 {
-        #[inline]
-        fn eq(&self, other: &Float4) -> bool {
-            self.get_0() == other.get_0()
-                && self.get_1() == other.get_1()
-                && self.get_2() == other.get_2()
-                && self.get_3() == other.get_3()
-        }
-    }
-
-    impl Add for Float4 {
-        type Output = Float4;
-
-        #[inline(always)]
-        fn add(self, other: Float4) -> Float4 {
-            Float4 {
-                data: [
-                    self.get_0() + other.get_0(),
-                    self.get_1() + other.get_1(),
-                    self.get_2() + other.get_2(),
-                    self.get_3() + other.get_3(),
-                ],
-            }
-        }
-    }
-
-    impl AddAssign for Float4 {
-        #[inline(always)]
-        fn add_assign(&mut self, rhs: Float4) {
-            *self = *self + rhs;
-        }
-    }
-
-    impl Sub for Float4 {
-        type Output = Float4;
-
-        #[inline(always)]
-        fn sub(self, other: Float4) -> Float4 {
-            Float4 {
-                data: [
-                    self.get_0() - other.get_0(),
-                    self.get_1() - other.get_1(),
-                    self.get_2() - other.get_2(),
-                    self.get_3() - other.get_3(),
-                ],
-            }
-        }
-    }
-
-    impl SubAssign for Float4 {
-        #[inline(always)]
-        fn sub_assign(&mut self, rhs: Float4) {
-            *self = *self - rhs;
-        }
-    }
-
-    impl Mul for Float4 {
-        type Output = Float4;
-
-        #[inline(always)]
-        fn mul(self, other: Float4) -> Float4 {
-            Float4 {
-                data: [
-                    self.get_0() * other.get_0(),
-                    self.get_1() * other.get_1(),
-                    self.get_2() * other.get_2(),
-                    self.get_3() * other.get_3(),
-                ],
-            }
-        }
-    }
-
-    impl Mul<f32> for Float4 {
-        type Output = Float4;
-
-        #[inline(always)]
-        fn mul(self, other: f32) -> Float4 {
-            Float4 {
-                data: [
-                    self.get_0() * other,
-                    self.get_1() * other,
-                    self.get_2() * other,
-                    self.get_3() * other,
-                ],
-            }
-        }
-    }
-
-    impl MulAssign for Float4 {
-        #[inline(always)]
-        fn mul_assign(&mut self, rhs: Float4) {
-            *self = *self * rhs;
-        }
-    }
-
-    impl MulAssign<f32> for Float4 {
-        #[inline(always)]
-        fn mul_assign(&mut self, rhs: f32) {
-            *self = *self * rhs;
-        }
-    }
-
-    impl Div for Float4 {
-        type Output = Float4;
-
-        #[inline(always)]
-        fn div(self, other: Float4) -> Float4 {
-            Float4 {
-                data: [
-                    self.get_0() / other.get_0(),
-                    self.get_1() / other.get_1(),
-                    self.get_2() / other.get_2(),
-                    self.get_3() / other.get_3(),
-                ],
-            }
-        }
-    }
-
-    impl Div<f32> for Float4 {
-        type Output = Float4;
-
-        #[inline(always)]
-        fn div(self, other: f32) -> Float4 {
-            Float4 {
-                data: [
-                    self.get_0() / other,
-                    self.get_1() / other,
-                    self.get_2() / other,
-                    self.get_3() / other,
-                ],
-            }
-        }
-    }
-
-    impl DivAssign for Float4 {
-        #[inline(always)]
-        fn div_assign(&mut self, rhs: Float4) {
-            *self = *self / rhs;
-        }
-    }
-
-    impl DivAssign<f32> for Float4 {
-        #[inline(always)]
-        fn div_assign(&mut self, rhs: f32) {
-            *self = *self / rhs;
-        }
-    }
-
-    // Free functions for Float4
-    #[inline(always)]
-    pub fn v_min(a: Float4, b: Float4) -> Float4 {
-        a.v_min(b)
-    }
-
-    #[inline(always)]
-    pub fn v_max(a: Float4, b: Float4) -> Float4 {
-        a.v_max(b)
-    }
-
-    /// Transposes a 4x4 matrix in-place
-    #[inline(always)]
-    pub fn transpose(matrix: &mut [Float4; 4]) {
-        let m = [
-            Float4::new(
-                matrix[0].get_0(),
-                matrix[1].get_0(),
-                matrix[2].get_0(),
-                matrix[3].get_0(),
-            ),
-            Float4::new(
-                matrix[0].get_1(),
-                matrix[1].get_1(),
-                matrix[2].get_1(),
-                matrix[3].get_1(),
-            ),
-            Float4::new(
-                matrix[0].get_2(),
-                matrix[1].get_2(),
-                matrix[2].get_2(),
-                matrix[3].get_2(),
-            ),
-            Float4::new(
-                matrix[0].get_3(),
-                matrix[1].get_3(),
-                matrix[2].get_3(),
-                matrix[3].get_3(),
-            ),
-        ];
-
-        *matrix = m;
-    }
-
-    /// Inverts a 4x4 matrix and returns the determinate.
-    #[inline(always)]
-    pub fn invert(matrix: &mut [Float4; 4]) -> f32 {
-        let m = *matrix;
-
-        let s0 = (m[0].get_0() * m[1].get_1()) - (m[1].get_0() * m[0].get_1());
-        let s1 = (m[0].get_0() * m[1].get_2()) - (m[1].get_0() * m[0].get_2());
-        let s2 = (m[0].get_0() * m[1].get_3()) - (m[1].get_0() * m[0].get_3());
-        let s3 = (m[0].get_1() * m[1].get_2()) - (m[1].get_1() * m[0].get_2());
-        let s4 = (m[0].get_1() * m[1].get_3()) - (m[1].get_1() * m[0].get_3());
-        let s5 = (m[0].get_2() * m[1].get_3()) - (m[1].get_2() * m[0].get_3());
-
-        let c5 = (m[2].get_2() * m[3].get_3()) - (m[3].get_2() * m[2].get_3());
-        let c4 = (m[2].get_1() * m[3].get_3()) - (m[3].get_1() * m[2].get_3());
-        let c3 = (m[2].get_1() * m[3].get_2()) - (m[3].get_1() * m[2].get_2());
-        let c2 = (m[2].get_0() * m[3].get_3()) - (m[3].get_0() * m[2].get_3());
-        let c1 = (m[2].get_0() * m[3].get_2()) - (m[3].get_0() * m[2].get_2());
-        let c0 = (m[2].get_0() * m[3].get_1()) - (m[3].get_0() * m[2].get_1());
-
-        // We don't check for 0.0 determinant, as that is expected to be handled
-        // by the calling code.
-        let det = (s0 * c5) - (s1 * c4) + (s2 * c3) + (s3 * c2) - (s4 * c1) + (s5 * c0);
-        let invdet = 1.0 / det;
-
-        *matrix = [
-            Float4::new(
-                ((m[1].get_1() * c5) - (m[1].get_2() * c4) + (m[1].get_3() * c3)) * invdet,
-                ((-m[0].get_1() * c5) + (m[0].get_2() * c4) - (m[0].get_3() * c3)) * invdet,
-                ((m[3].get_1() * s5) - (m[3].get_2() * s4) + (m[3].get_3() * s3)) * invdet,
-                ((-m[2].get_1() * s5) + (m[2].get_2() * s4) - (m[2].get_3() * s3)) * invdet,
-            ),
-            Float4::new(
-                ((-m[1].get_0() * c5) + (m[1].get_2() * c2) - (m[1].get_3() * c1)) * invdet,
-                ((m[0].get_0() * c5) - (m[0].get_2() * c2) + (m[0].get_3() * c1)) * invdet,
-                ((-m[3].get_0() * s5) + (m[3].get_2() * s2) - (m[3].get_3() * s1)) * invdet,
-                ((m[2].get_0() * s5) - (m[2].get_2() * s2) + (m[2].get_3() * s1)) * invdet,
-            ),
-            Float4::new(
-                ((m[1].get_0() * c4) - (m[1].get_1() * c2) + (m[1].get_3() * c0)) * invdet,
-                ((-m[0].get_0() * c4) + (m[0].get_1() * c2) - (m[0].get_3() * c0)) * invdet,
-                ((m[3].get_0() * s4) - (m[3].get_1() * s2) + (m[3].get_3() * s0)) * invdet,
-                ((-m[2].get_0() * s4) + (m[2].get_1() * s2) - (m[2].get_3() * s0)) * invdet,
-            ),
-            Float4::new(
-                ((-m[1].get_0() * c3) + (m[1].get_1() * c1) - (m[1].get_2() * c0)) * invdet,
-                ((m[0].get_0() * c3) - (m[0].get_1() * c1) + (m[0].get_2() * c0)) * invdet,
-                ((-m[3].get_0() * s3) + (m[3].get_1() * s1) - (m[3].get_2() * s0)) * invdet,
-                ((m[2].get_0() * s3) - (m[2].get_1() * s1) + (m[2].get_2() * s0)) * invdet,
-            ),
-        ];
-
-        det
-    }
-
-    /// Essentially a tuple of four bools.
-    #[derive(Debug, Copy, Clone)]
-    pub struct Bool4 {
-        data: [bool; 4],
-    }
-
-    impl Bool4 {
-        #[inline(always)]
-        pub fn new(a: bool, b: bool, c: bool, d: bool) -> Bool4 {
-            Bool4 { data: [a, b, c, d] }
-        }
-
-        #[inline(always)]
-        pub fn new_false() -> Bool4 {
-            Bool4 {
-                data: [false, false, false, false],
-            }
-        }
-
-        /// Returns the value of the nth element.
-        #[inline(always)]
-        pub fn get_n(self, n: usize) -> bool {
-            assert!(
-                n <= 3,
-                "Attempted to access element of Bool4 outside of bounds."
-            );
-            unsafe { *self.data.get_unchecked(n) }
-        }
-
-        /// Returns the value of the 0th element.
-        #[inline(always)]
-        pub fn get_0(self) -> bool {
-            self.get_n(0)
-        }
-
-        /// Returns the value of the 1th element.
-        #[inline(always)]
-        pub fn get_1(self) -> bool {
-            self.get_n(1)
-        }
-
-        /// Returns the value of the 2th element.
-        #[inline(always)]
-        pub fn get_2(self) -> bool {
-            self.get_n(2)
-        }
-
-        /// Returns the value of the 3th element.
-        #[inline(always)]
-        pub fn get_3(self) -> bool {
-            self.get_n(3)
-        }
-
-        /// Returns whether all four bools are false.
-        ///
-        /// This is the `NOT` operation on the result of `OR`ing all the
-        /// contained bools.  If even one bool is true, this returns false.
-        #[inline(always)]
-        pub fn is_all_false(&self) -> bool {
-            !(self.data[0] | self.data[1] | self.data[2] | self.data[3])
-        }
-
-        #[inline]
-        pub fn to_bitmask(self) -> u8 {
-            (self.get_0() as u8)
-                | ((self.get_1() as u8) << 1)
-                | ((self.get_2() as u8) << 2)
-                | ((self.get_3() as u8) << 3)
-        }
-    }
-
-    impl BitAnd for Bool4 {
-        type Output = Bool4;
-
-        #[inline(always)]
-        fn bitand(self, rhs: Bool4) -> Bool4 {
-            Bool4 {
-                data: [
-                    self.data[0] && rhs.data[0],
-                    self.data[1] && rhs.data[1],
-                    self.data[2] && rhs.data[2],
-                    self.data[3] && rhs.data[3],
-                ],
-            }
-        }
-    }
-
-    impl BitOr for Bool4 {
-        type Output = Bool4;
-
-        #[inline(always)]
-        fn bitor(self, rhs: Bool4) -> Bool4 {
-            Bool4 {
-                data: [
-                    self.data[0] || rhs.data[0],
-                    self.data[1] || rhs.data[1],
-                    self.data[2] || rhs.data[2],
-                    self.data[3] || rhs.data[3],
-                ],
-            }
-        }
-    }
-}
-
-//===========================================================================
-
-#[cfg(all(target_arch = "x86_64", target_feature = "sse"))]
-pub use crate::x86_64_sse::{invert, transpose, v_max, v_min, Bool4, Float4};
-
-#[cfg(not(all(target_arch = "x86_64", target_feature = "sse")))]
-pub use fallback::{invert, transpose, v_max, v_min, Bool4, Float4};
-
-//===========================================================================
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn get() {
-        let f = Float4::new(1.0, 2.0, 3.0, 4.0);
-
-        assert_eq!(f.get_0(), 1.0);
-        assert_eq!(f.get_1(), 2.0);
-        assert_eq!(f.get_2(), 3.0);
-        assert_eq!(f.get_3(), 4.0);
-    }
-
-    #[test]
-    fn get_n() {
-        let f = Float4::new(1.0, 2.0, 3.0, 4.0);
-
-        assert_eq!(f.get_n(0), 1.0);
-        assert_eq!(f.get_n(1), 2.0);
-        assert_eq!(f.get_n(2), 3.0);
-        assert_eq!(f.get_n(3), 4.0);
-    }
-
-    #[test]
-    fn set() {
-        let mut f = Float4::new(1.0, 2.0, 3.0, 4.0);
-        f.set_0(5.0);
-        f.set_1(6.0);
-        f.set_2(7.0);
-        f.set_3(8.0);
-
-        assert_eq!(f.get_0(), 5.0);
-        assert_eq!(f.get_1(), 6.0);
-        assert_eq!(f.get_2(), 7.0);
-        assert_eq!(f.get_3(), 8.0);
-    }
-
-    #[test]
-    fn set_n() {
-        let mut f = Float4::new(1.0, 2.0, 3.0, 4.0);
-        f.set_n(0, 5.0);
-        f.set_n(1, 6.0);
-        f.set_n(2, 7.0);
-        f.set_n(3, 8.0);
-
-        assert_eq!(f.get_0(), 5.0);
-        assert_eq!(f.get_1(), 6.0);
-        assert_eq!(f.get_2(), 7.0);
-        assert_eq!(f.get_3(), 8.0);
-    }
-
-    #[test]
-    fn all() {
-        let f = Float4::new(1.0, 2.0, 3.0, 4.0);
-
-        assert_eq!(f.all_0(), Float4::splat(1.0));
-        assert_eq!(f.all_1(), Float4::splat(2.0));
-        assert_eq!(f.all_2(), Float4::splat(3.0));
-        assert_eq!(f.all_3(), Float4::splat(4.0));
-    }
-
-    #[test]
-    fn partial_eq_1() {
-        let f1 = Float4::new(1.0, 2.0, 3.0, 4.0);
-        let f2 = Float4::new(1.0, 2.0, 3.0, 4.0);
-
-        assert!(f1 == f2);
-    }
-
-    #[test]
-    fn partial_eq_2() {
-        let f1 = Float4::new(1.0, 2.0, 3.0, 4.0);
-        let f2 = Float4::new(1.0, 2.1, 3.0, 4.0);
-
-        assert!(!(f1 == f2));
-    }
-
-    #[test]
-    fn h_sum() {
-        let f = Float4::new(1.0, 2.0, 3.0, 4.0);
-        assert_eq!(f.h_sum(), 10.0);
-    }
-
-    #[test]
-    fn h_product() {
-        let f = Float4::new(1.0, 2.0, 3.0, 4.0);
-        assert_eq!(f.h_product(), 24.0);
-    }
-
-    #[test]
-    fn h_min() {
-        let f = Float4::new(1.0, 2.0, 3.0, 4.0);
-        assert_eq!(f.h_min(), 1.0);
-    }
-
-    #[test]
-    fn h_max() {
-        let f = Float4::new(1.0, 2.0, 3.0, 4.0);
-        assert_eq!(f.h_max(), 4.0);
-    }
-
-    #[test]
-    fn add() {
-        let f1 = Float4::new(1.0, 2.0, 3.0, 4.0);
-        let f2 = Float4::new(2.0, 3.0, 4.0, 5.0);
-        let f3 = Float4::new(3.0, 5.0, 7.0, 9.0);
-
-        assert_eq!(f1 + f2, f3);
-    }
-
-    #[test]
-    fn sub() {
-        let f1 = Float4::new(1.0, 2.0, 3.0, 4.0);
-        let f2 = Float4::new(2.0, 3.0, 4.0, 5.0);
-        let f3 = Float4::new(-1.0, -1.0, -1.0, -1.0);
-
-        assert_eq!(f1 - f2, f3);
-    }
-
-    #[test]
-    fn mul_component() {
-        let f1 = Float4::new(1.0, 2.0, 3.0, 4.0);
-        let f2 = Float4::new(2.0, 3.0, 4.0, 5.0);
-        let f3 = Float4::new(2.0, 6.0, 12.0, 20.0);
-
-        assert_eq!(f1 * f2, f3);
-    }
-
-    #[test]
-    fn mul_scalar() {
-        let f1 = Float4::new(1.0, 2.0, 3.0, 4.0);
-        let v = 3.0;
-        let f2 = Float4::new(3.0, 6.0, 9.0, 12.0);
-
-        assert_eq!(f1 * v, f2);
-    }
-
-    #[test]
-    fn div_component() {
-        let f1 = Float4::new(1.0, 3.0, 3.0, 6.0);
-        let f2 = Float4::new(2.0, 2.0, 4.0, 8.0);
-        let f3 = Float4::new(0.5, 1.5, 0.75, 0.75);
-
-        assert_eq!(f1 / f2, f3);
-    }
-
-    #[test]
-    fn div_scalar() {
-        let f1 = Float4::new(1.0, 2.0, 3.0, 4.0);
-        let v = 2.0;
-        let f2 = Float4::new(0.5, 1.0, 1.5, 2.0);
-
-        assert_eq!(f1 / v, f2);
-    }
-
-    #[test]
-    fn lt() {
-        let f1 = Float4::new(1.0, 2.0, 3.0, 4.0);
-        let f2 = Float4::new(0.5, 2.0, 3.5, 2.0);
-
-        let r = f1.lt(f2);
-
-        assert_eq!(r.get_0(), false);
-        assert_eq!(r.get_1(), false);
-        assert_eq!(r.get_2(), true);
-        assert_eq!(r.get_3(), false);
-    }
-
-    #[test]
-    fn gt() {
-        let f1 = Float4::new(1.0, 2.0, 3.0, 4.0);
-        let f2 = Float4::new(0.5, 2.0, 3.5, 2.0);
-
-        let r = f1.gt(f2);
-
-        assert_eq!(r.get_0(), true);
-        assert_eq!(r.get_1(), false);
-        assert_eq!(r.get_2(), false);
-        assert_eq!(r.get_3(), true);
-    }
-
-    #[test]
-    fn matrix_transpose() {
-        let mut m1 = [
-            Float4::new(1.0, 2.0, 3.0, 4.0),
-            Float4::new(5.0, 6.0, 7.0, 8.0),
-            Float4::new(9.0, 10.0, 11.0, 12.0),
-            Float4::new(13.0, 14.0, 15.0, 16.0),
-        ];
-        let m2 = [
-            Float4::new(1.0, 5.0, 9.0, 13.0),
-            Float4::new(2.0, 6.0, 10.0, 14.0),
-            Float4::new(3.0, 7.0, 11.0, 15.0),
-            Float4::new(4.0, 8.0, 12.0, 16.0),
-        ];
-
-        transpose(&mut m1);
-
-        assert_eq!(m1, m2);
-    }
-
-    #[test]
-    fn bool4_bitmask_01() {
-        let f1 = Float4::new(0.0, 0.0, 0.0, 0.0);
-        let f2 = Float4::new(-1.0, -1.0, 1.0, -1.0);
-        let r = f1.lt(f2).to_bitmask();
-
-        assert_eq!(r, 0b00000100);
-    }
-
-    #[test]
-    fn bool4_bitmask_02() {
-        let f1 = Float4::new(0.0, 0.0, 0.0, 0.0);
-        let f2 = Float4::new(1.0, -1.0, 1.0, -1.0);
-        let r = f1.lt(f2).to_bitmask();
-
-        assert_eq!(r, 0b00000101);
-    }
-
-    #[test]
-    fn bool4_bitmask_03() {
-        let f1 = Float4::new(0.0, 0.0, 0.0, 0.0);
-        let f2 = Float4::new(-1.0, 1.0, -1.0, 1.0);
-        let r = f1.lt(f2).to_bitmask();
-
-        assert_eq!(r, 0b00001010);
-    }
-
-    #[test]
-    fn bool4_is_all_false() {
-        assert_eq!(true, Bool4::new(false, false, false, false).is_all_false());
-        assert_eq!(false, Bool4::new(false, false, true, false).is_all_false());
-    }
-}
diff --git a/sub_crates/math3d/Cargo.toml b/sub_crates/math3d/Cargo.toml
index 5547f6b..792c5b5 100644
--- a/sub_crates/math3d/Cargo.toml
+++ b/sub_crates/math3d/Cargo.toml
@@ -10,5 +10,6 @@ name = "math3d"
 path = "src/lib.rs"
 
 # Local crate dependencies
-[dependencies.float4]
-path = "../float4"
\ No newline at end of file
+[dependencies]
+glam = {git="https://github.com/bitshifter/glam-rs.git", rev="0f314f99", default-features=false, features=["approx"]}
+approx = "0.3"
diff --git a/sub_crates/math3d/src/matrix.rs b/sub_crates/math3d/src/matrix.rs
index 9b80c9c..e804064 100644
--- a/sub_crates/math3d/src/matrix.rs
+++ b/sub_crates/math3d/src/matrix.rs
@@ -1,29 +1,21 @@
 #![allow(dead_code)]
 
-use std::ops::{Index, IndexMut, Mul};
+use std::ops::{Add, Mul};
 
-use float4::{invert, transpose, Float4};
+use approx::RelativeEq;
+use glam::{Mat4, Vec4};
 
 use super::Point;
 
 /// A 4x4 matrix, used for transforms
-#[derive(Debug, Copy, Clone)]
-pub struct Matrix4x4 {
-    pub values: [Float4; 4],
-}
+#[derive(Debug, Copy, Clone, PartialEq)]
+pub struct Matrix4x4(pub Mat4);
 
 impl Matrix4x4 {
     /// Creates a new identity matrix
     #[inline]
     pub fn new() -> Matrix4x4 {
-        Matrix4x4 {
-            values: [
-                Float4::new(1.0, 0.0, 0.0, 0.0),
-                Float4::new(0.0, 1.0, 0.0, 0.0),
-                Float4::new(0.0, 0.0, 1.0, 0.0),
-                Float4::new(0.0, 0.0, 0.0, 1.0),
-            ],
-        }
+        Matrix4x4(Mat4::identity())
     }
 
     /// Creates a new matrix with the specified values:
@@ -52,108 +44,37 @@ impl Matrix4x4 {
         o: f32,
         p: f32,
     ) -> Matrix4x4 {
-        Matrix4x4 {
-            values: [
-                Float4::new(a, b, c, d),
-                Float4::new(e, f, g, h),
-                Float4::new(i, j, k, l),
-                Float4::new(m, n, o, p),
-            ],
-        }
+        Matrix4x4(Mat4::new(
+            Vec4::new(a, e, i, m),
+            Vec4::new(b, f, j, n),
+            Vec4::new(c, g, k, o),
+            Vec4::new(d, h, l, p),
+        ))
     }
 
     #[inline]
     pub fn from_location(loc: Point) -> Matrix4x4 {
-        Matrix4x4 {
-            values: [
-                Float4::new(1.0, 0.0, 0.0, loc.x()),
-                Float4::new(0.0, 1.0, 0.0, loc.y()),
-                Float4::new(0.0, 0.0, 1.0, loc.z()),
-                Float4::new(0.0, 0.0, 0.0, 1.0),
-            ],
-        }
+        Matrix4x4(Mat4::from_translation(loc.co.truncate()))
     }
 
     /// Returns whether the matrices are approximately equal to each other.
-    /// Each corresponding element in the matrices cannot have a relative error
-    /// exceeding `epsilon`.
+    /// Corresponding elements must agree within an absolute tolerance of
+    /// `f32::EPSILON` or a relative error of at most `epsilon`.
     #[inline]
     pub fn aprx_eq(&self, other: Matrix4x4, epsilon: f32) -> bool {
-        let mut result = true;
-
-        for y in 0..4 {
-            for x in 0..4 {
-                // All of this stuff is just an approximate comparison
-                // of floating point numbers.  See:
-                // http://floating-point-gui.de/errors/comparison/
-                // It might be worth breaking this out into a separate funcion,
-                // but I'm not entirely sure where to put it.
-                let a = self[y].get_n(x);
-                let b = other[y].get_n(x);
-                let aabs = a.abs();
-                let babs = b.abs();
-                let diff = (a - b).abs();
-                if a == b {
-                } else if (aabs <= std::f32::EPSILON) || (babs <= std::f32::EPSILON) {
-                    result = result && (diff < std::f32::EPSILON);
-                } else {
-                    let rel = 2.0 * diff / (aabs + babs);
-                    println!("{}", rel);
-                    result = result && (rel < epsilon);
-                }
-            }
-        }
-
-        result
+        self.0.relative_eq(&other.0, std::f32::EPSILON, epsilon)
     }
 
     /// Returns the transpose of the matrix
     #[inline]
     pub fn transposed(&self) -> Matrix4x4 {
-        let mut m = *self;
-        transpose(&mut m.values);
-        m
+        Matrix4x4(self.0.transpose())
     }
 
     /// Returns the inverse of the Matrix
     #[inline]
-    #[allow(clippy::float_cmp)]
     pub fn inverse(&self) -> Matrix4x4 {
-        let mut m = *self;
-        let det = invert(&mut m.values);
-        debug_assert_ne!(det, 0.0);
-        m
-    }
-}
-
-impl Index<usize> for Matrix4x4 {
-    type Output = Float4;
-
-    #[inline(always)]
-    fn index(&self, _index: usize) -> &Float4 {
-        &self.values[_index]
-    }
-}
-
-impl IndexMut<usize> for Matrix4x4 {
-    #[inline(always)]
-    fn index_mut(&mut self, _index: usize) -> &mut Float4 {
-        &mut self.values[_index]
-    }
-}
-
-impl PartialEq for Matrix4x4 {
-    #[inline]
-    fn eq(&self, other: &Matrix4x4) -> bool {
-        let mut result = true;
-
-        for y in 0..4 {
-            for x in 0..4 {
-                result = result && (self[y].get_n(x) == other[y].get_n(x));
-            }
-        }
-
-        result
+        Matrix4x4(self.0.inverse())
     }
 }
 
@@ -164,40 +85,32 @@ impl Default for Matrix4x4 {
 }
 
 /// Multiply two matrices together
-impl Mul<Matrix4x4> for Matrix4x4 {
-    type Output = Matrix4x4;
+impl Mul for Matrix4x4 {
+    type Output = Self;
 
     #[inline]
-    fn mul(self, other: Matrix4x4) -> Matrix4x4 {
-        let m = self.transposed();
-        Matrix4x4 {
-            values: [
-                Float4::new(
-                    (m[0] * other[0]).h_sum(),
-                    (m[1] * other[0]).h_sum(),
-                    (m[2] * other[0]).h_sum(),
-                    (m[3] * other[0]).h_sum(),
-                ),
-                Float4::new(
-                    (m[0] * other[1]).h_sum(),
-                    (m[1] * other[1]).h_sum(),
-                    (m[2] * other[1]).h_sum(),
-                    (m[3] * other[1]).h_sum(),
-                ),
-                Float4::new(
-                    (m[0] * other[2]).h_sum(),
-                    (m[1] * other[2]).h_sum(),
-                    (m[2] * other[2]).h_sum(),
-                    (m[3] * other[2]).h_sum(),
-                ),
-                Float4::new(
-                    (m[0] * other[3]).h_sum(),
-                    (m[1] * other[3]).h_sum(),
-                    (m[2] * other[3]).h_sum(),
-                    (m[3] * other[3]).h_sum(),
-                ),
-            ],
-        }
+    fn mul(self, other: Self) -> Self {
+        Self(other.0.mul_mat4(&self.0))
+    }
+}
+
+/// Multiply a matrix by an `f32` scalar
+impl Mul<f32> for Matrix4x4 {
+    type Output = Self;
+
+    #[inline]
+    fn mul(self, other: f32) -> Self {
+        Self(self.0 * other)
+    }
+}
+
+/// Add two matrices together
+impl Add for Matrix4x4 {
+    type Output = Self;
+
+    #[inline]
+    fn add(self, other: Self) -> Self {
+        Self(self.0 + other.0)
     }
 }
 
@@ -218,22 +131,24 @@ mod tests {
     }
 
     #[test]
-    fn aproximate_equality_test() {
+    fn approximate_equality_test() {
         let a = Matrix4x4::new();
         let b = Matrix4x4::new_from_values(
-            1.001, 0.0, 0.0, 0.0, 0.0, 1.001, 0.0, 0.0, 0.0, 0.0, 1.001, 0.0, 0.0, 0.0, 0.0, 1.001,
+            1.000001, 0.0, 0.0, 0.0, 0.0, 1.000001, 0.0, 0.0, 0.0, 0.0, 1.000001, 0.0, 0.0, 0.0,
+            0.0, 1.000001,
         );
         let c = Matrix4x4::new_from_values(
-            1.003, 0.0, 0.0, 0.0, 0.0, 1.003, 0.0, 0.0, 0.0, 0.0, 1.003, 0.0, 0.0, 0.0, 0.0, 1.003,
+            1.000003, 0.0, 0.0, 0.0, 0.0, 1.000003, 0.0, 0.0, 0.0, 0.0, 1.000003, 0.0, 0.0, 0.0,
+            0.0, 1.000003,
         );
         let d = Matrix4x4::new_from_values(
-            -1.001, 0.0, 0.0, 0.0, 0.0, -1.001, 0.0, 0.0, 0.0, 0.0, -1.001, 0.0, 0.0, 0.0, 0.0,
-            -1.001,
+            -1.000001, 0.0, 0.0, 0.0, 0.0, -1.000001, 0.0, 0.0, 0.0, 0.0, -1.000001, 0.0, 0.0, 0.0,
+            0.0, -1.000001,
         );
 
-        assert!(a.aprx_eq(b, 0.002));
-        assert!(!a.aprx_eq(c, 0.002));
-        assert!(!a.aprx_eq(d, 0.002));
+        assert!(a.aprx_eq(b, 0.000001));
+        assert!(!a.aprx_eq(c, 0.000001));
+        assert!(!a.aprx_eq(d, 0.000001));
     }
 
     #[test]
@@ -260,7 +175,7 @@ mod tests {
         let b = a.inverse();
         let c = Matrix4x4::new();
 
-        assert!((a * b).aprx_eq(c, 0.00001));
+        assert!((a * b).aprx_eq(c, 0.0000001));
     }
 
     #[test]
diff --git a/sub_crates/math3d/src/normal.rs b/sub_crates/math3d/src/normal.rs
index e1c9067..3a2fccd 100644
--- a/sub_crates/math3d/src/normal.rs
+++ b/sub_crates/math3d/src/normal.rs
@@ -5,42 +5,44 @@ use std::{
     ops::{Add, Div, Mul, Neg, Sub},
 };
 
-use float4::Float4;
+use glam::Vec4;
 
 use super::{CrossProduct, DotProduct, Matrix4x4, Vector};
 
 /// A surface normal in 3d homogeneous space.
 #[derive(Debug, Copy, Clone)]
 pub struct Normal {
-    pub co: Float4,
+    pub co: Vec4,
 }
 
 impl Normal {
     #[inline(always)]
     pub fn new(x: f32, y: f32, z: f32) -> Normal {
         Normal {
-            co: Float4::new(x, y, z, 0.0),
+            co: Vec4::new(x, y, z, 0.0),
         }
     }
 
     #[inline(always)]
     pub fn length(&self) -> f32 {
-        (self.co * self.co).h_sum().sqrt()
+        self.co.length()
     }
 
     #[inline(always)]
     pub fn length2(&self) -> f32 {
-        (self.co * self.co).h_sum()
+        self.co.length_squared()
     }
 
     #[inline(always)]
     pub fn normalized(&self) -> Normal {
-        *self / self.length()
+        Normal {
+            co: self.co.normalize(),
+        }
     }
 
     #[inline(always)]
     pub fn into_vector(self) -> Vector {
-        Vector::new(self.co.get_0(), self.co.get_1(), self.co.get_2())
+        Vector { co: self.co }
     }
 
     #[inline(always)]
@@ -55,32 +57,32 @@ impl Normal {
 
     #[inline(always)]
     pub fn x(&self) -> f32 {
-        self.co.get_0()
+        self.co.x()
     }
 
     #[inline(always)]
     pub fn y(&self) -> f32 {
-        self.co.get_1()
+        self.co.y()
     }
 
     #[inline(always)]
     pub fn z(&self) -> f32 {
-        self.co.get_2()
+        self.co.z()
     }
 
     #[inline(always)]
     pub fn set_x(&mut self, x: f32) {
-        self.co.set_0(x);
+        self.co.set_x(x);
     }
 
     #[inline(always)]
     pub fn set_y(&mut self, y: f32) {
-        self.co.set_1(y);
+        self.co.set_y(y);
     }
 
     #[inline(always)]
     pub fn set_z(&mut self, z: f32) {
-        self.co.set_2(z);
+        self.co.set_z(z);
     }
 }
 
@@ -129,15 +131,10 @@ impl Mul<Matrix4x4> for Normal {
 
     #[inline]
     fn mul(self, other: Matrix4x4) -> Normal {
-        let mat = other.inverse().transposed();
-        Normal {
-            co: Float4::new(
-                (self.co * mat.values[0]).h_sum(),
-                (self.co * mat.values[1]).h_sum(),
-                (self.co * mat.values[2]).h_sum(),
-                0.0,
-            ),
-        }
+        let mat = other.0.inverse().transpose();
+        let mut co = mat.mul_vec4(self.co);
+        co.set_w(0.0);
+        Normal { co }
     }
 }
 
@@ -164,7 +161,7 @@ impl Neg for Normal {
 impl DotProduct for Normal {
     #[inline(always)]
     fn dot(self, other: Normal) -> f32 {
-        (self.co * other.co).h_sum()
+        self.co.dot(other.co)
     }
 }
 
@@ -172,12 +169,7 @@ impl CrossProduct for Normal {
     #[inline]
     fn cross(self, other: Normal) -> Normal {
         Normal {
-            co: Float4::new(
-                (self.co.get_1() * other.co.get_2()) - (self.co.get_2() * other.co.get_1()),
-                (self.co.get_2() * other.co.get_0()) - (self.co.get_0() * other.co.get_2()),
-                (self.co.get_0() * other.co.get_1()) - (self.co.get_1() * other.co.get_0()),
-                0.0,
-            ),
+            co: self.co.truncate().cross(other.co.truncate()).extend(0.0),
         }
     }
 }
@@ -186,6 +178,7 @@ impl CrossProduct for Normal {
 mod tests {
     use super::super::{CrossProduct, DotProduct, Matrix4x4};
     use super::*;
+    use approx::UlpsEq;
 
     #[test]
     fn add() {
@@ -220,8 +213,10 @@ mod tests {
         let m = Matrix4x4::new_from_values(
             1.0, 2.0, 2.0, 1.5, 3.0, 6.0, 7.0, 8.0, 9.0, 2.0, 11.0, 12.0, 13.0, 7.0, 15.0, 3.0,
         );
-        let nm = Normal::new(-19.258825, 5.717648, -1.770588);
-        assert!(((n * m) - nm).length2() < 0.00001);
+        let mut nm = n * m;
+        nm.co.set_w(0.0);
+        let nm2 = Normal::new(-19.258825, 5.717648, -1.770588);
+        assert!(nm.co.ulps_eq(&nm2.co, 0.0, 4));
     }
 
     #[test]
diff --git a/sub_crates/math3d/src/point.rs b/sub_crates/math3d/src/point.rs
index 075fb9c..998acc9 100644
--- a/sub_crates/math3d/src/point.rs
+++ b/sub_crates/math3d/src/point.rs
@@ -5,21 +5,21 @@ use std::{
     ops::{Add, Mul, Sub},
 };
 
-use float4::Float4;
+use glam::Vec4;
 
 use super::{Matrix4x4, Vector};
 
 /// A position in 3d homogeneous space.
 #[derive(Debug, Copy, Clone)]
 pub struct Point {
-    pub co: Float4,
+    pub co: Vec4,
 }
 
 impl Point {
     #[inline(always)]
     pub fn new(x: f32, y: f32, z: f32) -> Point {
         Point {
-            co: Float4::new(x, y, z, 1.0),
+            co: Vec4::new(x, y, z, 1.0),
         }
     }
 
@@ -28,7 +28,7 @@ impl Point {
     #[inline(always)]
     pub fn norm(&self) -> Point {
         Point {
-            co: self.co / self.co.get_3(),
+            co: self.co / self.co.w(),
         }
     }
 
@@ -38,7 +38,7 @@ impl Point {
         let n2 = other.norm();
 
         Point {
-            co: n1.co.v_min(n2.co),
+            co: n1.co.min(n2.co),
         }
     }
 
@@ -48,13 +48,15 @@ impl Point {
         let n2 = other.norm();
 
         Point {
-            co: n1.co.v_max(n2.co),
+            co: n1.co.max(n2.co),
         }
     }
 
     #[inline(always)]
     pub fn into_vector(self) -> Vector {
-        Vector::new(self.co.get_0(), self.co.get_1(), self.co.get_2())
+        let mut v = Vector { co: self.co };
+        v.co.set_w(0.0);
+        v
     }
 
     #[inline(always)]
@@ -69,32 +71,32 @@ impl Point {
 
     #[inline(always)]
     pub fn x(&self) -> f32 {
-        self.co.get_0()
+        self.co.x()
     }
 
     #[inline(always)]
     pub fn y(&self) -> f32 {
-        self.co.get_1()
+        self.co.y()
     }
 
     #[inline(always)]
     pub fn z(&self) -> f32 {
-        self.co.get_2()
+        self.co.z()
     }
 
     #[inline(always)]
     pub fn set_x(&mut self, x: f32) {
-        self.co.set_0(x);
+        self.co.set_x(x);
     }
 
     #[inline(always)]
     pub fn set_y(&mut self, y: f32) {
-        self.co.set_1(y);
+        self.co.set_y(y);
     }
 
     #[inline(always)]
     pub fn set_z(&mut self, z: f32) {
-        self.co.set_2(z);
+        self.co.set_z(z);
     }
 }
 
@@ -144,12 +146,7 @@ impl Mul<Matrix4x4> for Point {
     #[inline]
     fn mul(self, other: Matrix4x4) -> Point {
         Point {
-            co: Float4::new(
-                (self.co * other.values[0]).h_sum(),
-                (self.co * other.values[1]).h_sum(),
-                (self.co * other.values[2]).h_sum(),
-                (self.co * other.values[3]).h_sum(),
-            ),
+            co: other.0.mul_vec4(self.co),
         }
     }
 }
@@ -163,7 +160,7 @@ mod tests {
     fn norm() {
         let mut p1 = Point::new(1.0, 2.0, 3.0);
         let p2 = Point::new(2.0, 4.0, 6.0);
-        p1.co.set_3(0.5);
+        p1.co.set_w(0.5);
 
         assert_eq!(p2, p1.norm());
     }
@@ -203,7 +200,7 @@ mod tests {
             1.0, 2.0, 2.0, 1.5, 3.0, 6.0, 7.0, 8.0, 9.0, 2.0, 11.0, 12.0, 2.0, 3.0, 1.0, 5.0,
         );
         let mut pm = Point::new(15.5, 54.0, 70.0);
-        pm.co.set_3(18.5);
+        pm.co.set_w(18.5);
         assert_eq!(p * m, pm);
     }
 
diff --git a/sub_crates/math3d/src/vector.rs b/sub_crates/math3d/src/vector.rs
index 6c6f9c0..e584a09 100644
--- a/sub_crates/math3d/src/vector.rs
+++ b/sub_crates/math3d/src/vector.rs
@@ -5,37 +5,39 @@ use std::{
     ops::{Add, Div, Mul, Neg, Sub},
 };
 
-use float4::Float4;
+use glam::Vec4;
 
 use super::{CrossProduct, DotProduct, Matrix4x4, Normal, Point};
 
 /// A direction vector in 3d homogeneous space.
 #[derive(Debug, Copy, Clone)]
 pub struct Vector {
-    pub co: Float4,
+    pub co: Vec4,
 }
 
 impl Vector {
     #[inline(always)]
     pub fn new(x: f32, y: f32, z: f32) -> Vector {
         Vector {
-            co: Float4::new(x, y, z, 0.0),
+            co: Vec4::new(x, y, z, 0.0),
         }
     }
 
     #[inline(always)]
     pub fn length(&self) -> f32 {
-        (self.co * self.co).h_sum().sqrt()
+        self.co.length()
     }
 
     #[inline(always)]
     pub fn length2(&self) -> f32 {
-        (self.co * self.co).h_sum()
+        self.co.length_squared()
     }
 
     #[inline(always)]
     pub fn normalized(&self) -> Vector {
-        *self / self.length()
+        Vector {
+            co: self.co.normalize(),
+        }
     }
 
     #[inline(always)]
@@ -65,32 +67,32 @@ impl Vector {
 
     #[inline(always)]
     pub fn x(&self) -> f32 {
-        self.co.get_0()
+        self.co.x()
     }
 
     #[inline(always)]
     pub fn y(&self) -> f32 {
-        self.co.get_1()
+        self.co.y()
     }
 
     #[inline(always)]
     pub fn z(&self) -> f32 {
-        self.co.get_2()
+        self.co.z()
     }
 
     #[inline(always)]
     pub fn set_x(&mut self, x: f32) {
-        self.co.set_0(x);
+        self.co.set_x(x);
     }
 
     #[inline(always)]
     pub fn set_y(&mut self, y: f32) {
-        self.co.set_1(y);
+        self.co.set_y(y);
     }
 
     #[inline(always)]
     pub fn set_z(&mut self, z: f32) {
-        self.co.set_2(z);
+        self.co.set_z(z);
     }
 }
 
@@ -140,12 +142,7 @@ impl Mul<Matrix4x4> for Vector {
     #[inline]
     fn mul(self, other: Matrix4x4) -> Vector {
         Vector {
-            co: Float4::new(
-                (self.co * other.values[0]).h_sum(),
-                (self.co * other.values[1]).h_sum(),
-                (self.co * other.values[2]).h_sum(),
-                (self.co * other.values[3]).h_sum(),
-            ),
+            co: other.0.mul_vec4(self.co),
         }
     }
 }
@@ -173,7 +170,7 @@ impl Neg for Vector {
 impl DotProduct for Vector {
     #[inline(always)]
     fn dot(self, other: Vector) -> f32 {
-        (self.co * other.co).h_sum()
+        self.co.dot(other.co)
     }
 }
 
@@ -181,12 +178,7 @@ impl CrossProduct for Vector {
     #[inline]
     fn cross(self, other: Vector) -> Vector {
         Vector {
-            co: Float4::new(
-                (self.co.get_1() * other.co.get_2()) - (self.co.get_2() * other.co.get_1()),
-                (self.co.get_2() * other.co.get_0()) - (self.co.get_0() * other.co.get_2()),
-                (self.co.get_0() * other.co.get_1()) - (self.co.get_1() * other.co.get_0()),
-                0.0,
-            ),
+            co: self.co.truncate().cross(other.co.truncate()).extend(0.0),
         }
     }
 }
@@ -230,7 +222,7 @@ mod tests {
             1.0, 2.0, 2.0, 1.5, 3.0, 6.0, 7.0, 8.0, 9.0, 2.0, 11.0, 12.0, 13.0, 7.0, 15.0, 3.0,
         );
         let mut vm = Vector::new(14.0, 46.0, 58.0);
-        vm.co.set_3(90.5);
+        vm.co.set_w(90.5);
         assert_eq!(v * m, vm);
     }
 
diff --git a/sub_crates/spectral_upsampling/Cargo.toml b/sub_crates/spectral_upsampling/Cargo.toml
index 3dfa2d0..a9bf965 100644
--- a/sub_crates/spectral_upsampling/Cargo.toml
+++ b/sub_crates/spectral_upsampling/Cargo.toml
@@ -9,6 +9,5 @@ license = "MIT"
 name = "spectral_upsampling"
 path = "src/lib.rs"
 
-# Local crate dependencies
-[dependencies.float4]
-path = "../float4"
\ No newline at end of file
+[dependencies]
+glam = {git="https://github.com/bitshifter/glam-rs.git", rev="0f314f99", default-features=false, features=["approx"]}
\ No newline at end of file
diff --git a/sub_crates/spectral_upsampling/src/jakob.rs b/sub_crates/spectral_upsampling/src/jakob.rs
index 76b41d7..8a15156 100644
--- a/sub_crates/spectral_upsampling/src/jakob.rs
+++ b/sub_crates/spectral_upsampling/src/jakob.rs
@@ -6,7 +6,7 @@
 /// The provides similar color matching as full Jakob, at the expense of
 /// somewhat lower quality spectrums, and the inability to precalculate
 /// the coefficents for even more efficient evaluation later on.
-use float4::Float4;
+use glam::Vec4;
 
 /// How many polynomial coefficients?
 const RGB2SPEC_N_COEFFS: usize = 3;
@@ -15,7 +15,7 @@ const RGB2SPEC_N_COEFFS: usize = 3;
 include!(concat!(env!("OUT_DIR"), "/jakob_table_inc.rs"));
 
 #[inline]
-pub fn rec709_to_spectrum_p4(lambdas: Float4, rgb: (f32, f32, f32)) -> Float4 {
+pub fn rec709_to_spectrum_p4(lambdas: Vec4, rgb: (f32, f32, f32)) -> Vec4 {
     small_rgb_to_spectrum_p4(
         REC709_TABLE,
         REC709_TABLE_RES,
@@ -26,7 +26,7 @@ pub fn rec709_to_spectrum_p4(lambdas: Float4, rgb: (f32, f32, f32)) -> Float4 {
 }
 
 #[inline]
-pub fn rec2020_to_spectrum_p4(lambdas: Float4, rgb: (f32, f32, f32)) -> Float4 {
+pub fn rec2020_to_spectrum_p4(lambdas: Vec4, rgb: (f32, f32, f32)) -> Vec4 {
     small_rgb_to_spectrum_p4(
         REC2020_TABLE,
         REC2020_TABLE_RES,
@@ -37,7 +37,7 @@ pub fn rec2020_to_spectrum_p4(lambdas: Float4, rgb: (f32, f32, f32)) -> Float4 {
 }
 
 #[inline]
-pub fn aces_to_spectrum_p4(lambdas: Float4, rgb: (f32, f32, f32)) -> Float4 {
+pub fn aces_to_spectrum_p4(lambdas: Vec4, rgb: (f32, f32, f32)) -> Vec4 {
     small_rgb_to_spectrum_p4(
         ACES_TABLE,
         ACES_TABLE_RES,
@@ -55,9 +55,9 @@ fn small_rgb_to_spectrum_p4(
     table: &[[(f32, f32, f32); 2]],
     table_res: usize,
     table_mid_value: f32,
-    lambdas: Float4,
+    lambdas: Vec4,
     rgb: (f32, f32, f32),
-) -> Float4 {
+) -> Vec4 {
     // Determine largest RGB component, and calculate the other two
     // components scaled for lookups.
     let (i, max_val, x, y) = if rgb.0 > rgb.1 && rgb.0 > rgb.2 {
@@ -70,7 +70,7 @@ fn small_rgb_to_spectrum_p4(
     if max_val == 0.0 {
         // If max_val is zero, just return zero.  This avoids NaN's from
         // divide by zero.  This is also correct, since it's black.
-        return Float4::splat(0.0);
+        return Vec4::splat(0.0);
     }
     let x = x * 63.0 / max_val;
     let y = y * 63.0 / max_val;
@@ -90,20 +90,20 @@ fn small_rgb_to_spectrum_p4(
 
     // Convert to SIMD format for faster interpolation.
     let a0 = [
-        Float4::new(a0[0].0, a0[0].1, a0[0].2, 0.0),
-        Float4::new(a0[1].0, a0[1].1, a0[1].2, 0.0),
+        Vec4::new(a0[0].0, a0[0].1, a0[0].2, 0.0),
+        Vec4::new(a0[1].0, a0[1].1, a0[1].2, 0.0),
     ];
     let a1 = [
-        Float4::new(a1[0].0, a1[0].1, a1[0].2, 0.0),
-        Float4::new(a1[1].0, a1[1].1, a1[1].2, 0.0),
+        Vec4::new(a1[0].0, a1[0].1, a1[0].2, 0.0),
+        Vec4::new(a1[1].0, a1[1].1, a1[1].2, 0.0),
     ];
     let a2 = [
-        Float4::new(a2[0].0, a2[0].1, a2[0].2, 0.0),
-        Float4::new(a2[1].0, a2[1].1, a2[1].2, 0.0),
+        Vec4::new(a2[0].0, a2[0].1, a2[0].2, 0.0),
+        Vec4::new(a2[1].0, a2[1].1, a2[1].2, 0.0),
     ];
     let a3 = [
-        Float4::new(a3[0].0, a3[0].1, a3[0].2, 0.0),
-        Float4::new(a3[1].0, a3[1].1, a3[1].2, 0.0),
+        Vec4::new(a3[0].0, a3[0].1, a3[0].2, 0.0),
+        Vec4::new(a3[1].0, a3[1].1, a3[1].2, 0.0),
     ];
 
     // Do interpolation.
@@ -117,16 +117,14 @@ fn small_rgb_to_spectrum_p4(
 
     // Evaluate the spectral function and return the result.
     if max_val <= table_mid_value {
-        rgb2spec_eval_4([c[0].get_0(), c[0].get_1(), c[0].get_2()], lambdas)
-            * (1.0 / table_mid_value)
-            * max_val
+        rgb2spec_eval_4([c[0].x(), c[0].y(), c[0].z()], lambdas) * (1.0 / table_mid_value) * max_val
     } else if max_val < 1.0 {
         let n = (max_val - table_mid_value) / (1.0 - table_mid_value);
-        let s0 = rgb2spec_eval_4([c[0].get_0(), c[0].get_1(), c[0].get_2()], lambdas);
-        let s1 = rgb2spec_eval_4([c[1].get_0(), c[1].get_1(), c[1].get_2()], lambdas);
+        let s0 = rgb2spec_eval_4([c[0].x(), c[0].y(), c[0].z()], lambdas);
+        let s1 = rgb2spec_eval_4([c[1].x(), c[1].y(), c[1].z()], lambdas);
         (s0 * (1.0 - n)) + (s1 * n)
     } else {
-        rgb2spec_eval_4([c[1].get_0(), c[1].get_1(), c[1].get_2()], lambdas) * max_val
+        rgb2spec_eval_4([c[1].x(), c[1].y(), c[1].z()], lambdas) * max_val
     }
 }
 
@@ -134,18 +132,22 @@ fn small_rgb_to_spectrum_p4(
 // Coefficient -> eval functions
 
 #[inline(always)]
-fn rgb2spec_fma_4(a: Float4, b: Float4, c: Float4) -> Float4 {
-    a.fmadd(b, c)
+fn rgb2spec_fma_4(a: Vec4, b: Vec4, c: Vec4) -> Vec4 {
+    (a * b) + c
 }
 
-fn rgb2spec_eval_4(coeff: [f32; RGB2SPEC_N_COEFFS], lambda: Float4) -> Float4 {
-    let co0 = Float4::splat(coeff[0]);
-    let co1 = Float4::splat(coeff[1]);
-    let co2 = Float4::splat(coeff[2]);
+fn rgb2spec_eval_4(coeff: [f32; RGB2SPEC_N_COEFFS], lambda: Vec4) -> Vec4 {
+    let co0 = Vec4::splat(coeff[0]);
+    let co1 = Vec4::splat(coeff[1]);
+    let co2 = Vec4::splat(coeff[2]);
 
     let x = rgb2spec_fma_4(rgb2spec_fma_4(co0, lambda, co1), lambda, co2);
 
-    let y = Float4::splat(1.0) / (rgb2spec_fma_4(x, x, Float4::splat(1.0))).sqrt();
+    let y = {
+        // TODO: replace this with a SIMD sqrt op.
+        let (x, y, z, w) = rgb2spec_fma_4(x, x, Vec4::splat(1.0)).into();
+        Vec4::new(x.sqrt(), y.sqrt(), z.sqrt(), w.sqrt()).reciprocal()
+    };
 
-    rgb2spec_fma_4(Float4::splat(0.5) * x, y, Float4::splat(0.5))
+    rgb2spec_fma_4(Vec4::splat(0.5) * x, y, Vec4::splat(0.5))
 }
diff --git a/sub_crates/spectral_upsampling/src/meng.rs b/sub_crates/spectral_upsampling/src/meng.rs
index 5953d1f..bc14eb2 100644
--- a/sub_crates/spectral_upsampling/src/meng.rs
+++ b/sub_crates/spectral_upsampling/src/meng.rs
@@ -6,7 +6,7 @@
 
 use std::f32;
 
-use float4::Float4;
+use glam::Vec4;
 
 mod meng_spectra_tables;
 
@@ -174,9 +174,9 @@ pub fn spectrum_xyz_to_p(lambda: f32, xyz: (f32, f32, f32)) -> f32 {
 ///
 /// Works on 4 wavelengths at once via SIMD.
 #[inline]
-pub fn spectrum_xyz_to_p_4(lambdas: Float4, xyz: (f32, f32, f32)) -> Float4 {
-    assert!(lambdas.h_min() >= SPECTRUM_SAMPLE_MIN);
-    assert!(lambdas.h_max() <= SPECTRUM_SAMPLE_MAX);
+pub fn spectrum_xyz_to_p_4(lambdas: Vec4, xyz: (f32, f32, f32)) -> Vec4 {
+    assert!(lambdas.min_element() >= SPECTRUM_SAMPLE_MIN);
+    assert!(lambdas.max_element() <= SPECTRUM_SAMPLE_MAX);
 
     let inv_norm = xyz.0 + xyz.1 + xyz.2;
     let norm = {
@@ -184,7 +184,7 @@ pub fn spectrum_xyz_to_p_4(lambdas: Float4, xyz: (f32, f32, f32)) -> Float4 {
         if norm < f32::MAX {
             norm
         } else {
-            return Float4::splat(0.0);
+            return Vec4::splat(0.0);
         }
     };
 
@@ -197,7 +197,7 @@ pub fn spectrum_xyz_to_p_4(lambdas: Float4, xyz: (f32, f32, f32)) -> Float4 {
         || uv.1 < 0.0
         || uv.1 >= SPECTRUM_GRID_HEIGHT as f32
     {
-        return Float4::splat(0.0);
+        return Vec4::splat(0.0);
     }
 
     let uvi = (uv.0 as i32, uv.1 as i32);
@@ -214,53 +214,48 @@ pub fn spectrum_xyz_to_p_4(lambdas: Float4, xyz: (f32, f32, f32)) -> Float4 {
 
     // If the cell has no points, nothing we can do, so return 0.0
     if num == 0 {
-        return Float4::splat(0.0);
+        return Vec4::splat(0.0);
     }
 
     // Normalize lambda to spectrum table index range.
-    let sb: Float4 = (lambdas - Float4::splat(SPECTRUM_SAMPLE_MIN))
+    let sb: Vec4 = (lambdas - Vec4::splat(SPECTRUM_SAMPLE_MIN))
         / (SPECTRUM_SAMPLE_MAX - SPECTRUM_SAMPLE_MIN)
         * (SPECTRUM_NUM_SAMPLES as f32 - 1.0);
-    debug_assert!(sb.h_min() >= 0.0);
-    debug_assert!(sb.h_max() <= SPECTRUM_NUM_SAMPLES as f32);
+    debug_assert!(sb.min_element() >= 0.0);
+    debug_assert!(sb.max_element() <= SPECTRUM_NUM_SAMPLES as f32);
 
     // Get the spectral values for the vertices of the grid cell.
     // TODO: use integer SIMD intrinsics to make this part faster.
-    let mut p = [Float4::splat(0.0); 6];
-    let sb0: [i32; 4] = [
-        sb.get_0() as i32,
-        sb.get_1() as i32,
-        sb.get_2() as i32,
-        sb.get_3() as i32,
-    ];
+    let mut p = [Vec4::splat(0.0); 6];
+    let sb0: [i32; 4] = [sb.x() as i32, sb.y() as i32, sb.z() as i32, sb.w() as i32];
     assert!(sb0[0].max(sb0[1]).max(sb0[2].max(sb0[3])) < SPECTRUM_NUM_SAMPLES);
     let sb1: [i32; 4] = [
-        (sb.get_0() as i32 + 1).min(SPECTRUM_NUM_SAMPLES - 1),
-        (sb.get_1() as i32 + 1).min(SPECTRUM_NUM_SAMPLES - 1),
-        (sb.get_2() as i32 + 1).min(SPECTRUM_NUM_SAMPLES - 1),
-        (sb.get_3() as i32 + 1).min(SPECTRUM_NUM_SAMPLES - 1),
+        (sb.x() as i32 + 1).min(SPECTRUM_NUM_SAMPLES - 1),
+        (sb.y() as i32 + 1).min(SPECTRUM_NUM_SAMPLES - 1),
+        (sb.z() as i32 + 1).min(SPECTRUM_NUM_SAMPLES - 1),
+        (sb.w() as i32 + 1).min(SPECTRUM_NUM_SAMPLES - 1),
     ];
-    let sbf = sb - Float4::new(sb0[0] as f32, sb0[1] as f32, sb0[2] as f32, sb0[3] as f32);
+    let sbf = sb - Vec4::new(sb0[0] as f32, sb0[1] as f32, sb0[2] as f32, sb0[3] as f32);
     for i in 0..(num as usize) {
         debug_assert!(idx[i] >= 0);
         let spectrum = &SPECTRUM_DATA_POINTS[idx[i] as usize].spectrum;
-        let p0 = Float4::new(
+        let p0 = Vec4::new(
             spectrum[sb0[0] as usize],
             spectrum[sb0[1] as usize],
             spectrum[sb0[2] as usize],
             spectrum[sb0[3] as usize],
         );
-        let p1 = Float4::new(
+        let p1 = Vec4::new(
             spectrum[sb1[0] as usize],
             spectrum[sb1[1] as usize],
             spectrum[sb1[2] as usize],
             spectrum[sb1[3] as usize],
         );
-        p[i] = p0 * (Float4::splat(1.0) - sbf) + p1 * sbf;
+        p[i] = p0 * (Vec4::splat(1.0) - sbf) + p1 * sbf;
     }
 
     // Linearly interpolate the spectral power of the cell vertices.
-    let mut interpolated_p = Float4::splat(0.0);
+    let mut interpolated_p = Vec4::splat(0.0);
     if inside {
         // Fast path for normal inner quads:
         let uv2 = (uv.0 - uvi.0 as f32, uv.1 - uvi.1 as f32);