diff --git a/Cargo.lock b/Cargo.lock
index f3a62d6..998b57d 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -67,9 +67,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 [[package]]
 name = "float4"
 version = "0.1.0"
-dependencies = [
- "simd 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
-]
 
 [[package]]
 name = "half"
@@ -198,11 +195,6 @@ name = "scoped_threadpool"
 version = "0.1.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 
-[[package]]
-name = "simd"
-version = "0.2.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-
 [[package]]
 name = "sobol"
 version = "0.1.0"
@@ -296,7 +288,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 "checksum rustc-serialize 0.3.24 (registry+https://github.com/rust-lang/crates.io-index)" = "dcf128d1287d2ea9d80910b5f1120d0b8eede3fbf1abe91c40d39ea7d51e6fda"
 "checksum safemem 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "e27a8b19b835f7aea908818e871f5cc3a5a186550c30773be987e155e8163d8f"
 "checksum scoped_threadpool 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)" = "1d51f5df5af43ab3f1360b429fa5e0152ac5ce8c0bd6485cae490332e96846a8"
-"checksum simd 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "3dd0805c7363ab51a829a1511ad24b6ed0349feaa756c4bc2f977f9f496e6673"
 "checksum strsim 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "bb4f380125926a99e52bc279241539c018323fab05ad6368b56f93d9369ff550"
 "checksum termion 1.5.1 (registry+https://github.com/rust-lang/crates.io-index)" = "689a3bdfaab439fd92bc87df5c4c78417d3cbe537487274e9b0b2dce76e92096"
 "checksum textwrap 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)" = "c0b59b6b4b44d867f1370ef1bd91bfb262bf07bf0ae65c202ea2fbc16153b693"
diff --git a/Cargo.toml b/Cargo.toml
index a5192e8..c6343ac 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -15,9 +15,6 @@ name = "psychopath"
 version = "0.1.0"
 authors = ["Nathan Vegdahl <cessen@cessen.com>"]
 
-[features]
-simd_perf = ["float4/simd_perf", "math3d/simd_perf"]
-
 [profile.release]
 debug = true
 
diff --git a/sub_crates/float4/Cargo.toml b/sub_crates/float4/Cargo.toml
index e2efc1c..d7c4b70 100644
--- a/sub_crates/float4/Cargo.toml
+++ b/sub_crates/float4/Cargo.toml
@@ -7,10 +7,3 @@ license = "MIT"
 [lib]
 name = "float4"
 path = "src/lib.rs"
-
-[features]
-simd_perf = ["simd"]
-
-[dependencies]
-# Crates.io dependencies
-simd = { version = "0.2.1", optional = true }
\ No newline at end of file
diff --git a/sub_crates/float4/src/lib.rs b/sub_crates/float4/src/lib.rs
index 8358d82..fe7bafc 100644
--- a/sub_crates/float4/src/lib.rs
+++ b/sub_crates/float4/src/lib.rs
@@ -1,115 +1,485 @@
 #![allow(dead_code)]
 
-#[cfg(feature = "simd_perf")]
-extern crate simd;
+/// Implementation of Float4 for x86_64 platforms with SSE support.
+#[cfg(all(target_arch = "x86_64", target_feature = "sse"))]
+mod x86_64_sse {
+    use std::arch::x86_64::__m128;
+    use std::cmp::PartialEq;
+    use std::ops::{Add, AddAssign, BitAnd, BitOr, Div, DivAssign, Mul, MulAssign, Sub, SubAssign};
 
-use std::cmp::PartialEq;
-use std::ops::{Add, AddAssign, BitAnd, Div, DivAssign, Mul, MulAssign, Sub, SubAssign};
+    #[derive(Debug, Copy, Clone)]
+    pub struct Float4 {
+        data: __m128,
+    }
 
-#[cfg(feature = "simd_perf")]
-use simd::{bool32fx4, f32x4};
-
-/// Essentially a tuple of four floats, which will use SIMD operations
-/// where possible on a platform.
-#[cfg(feature = "simd_perf")]
-#[derive(Debug, Copy, Clone)]
-pub struct Float4 {
-    data: f32x4,
-}
-
-#[cfg(not(feature = "simd_perf"))]
-#[derive(Debug, Copy, Clone)]
-pub struct Float4 {
-    data: [f32; 4],
-}
-
-impl Float4 {
-    #[inline(always)]
-    pub fn new(a: f32, b: f32, c: f32, d: f32) -> Float4 {
-        #[cfg(feature = "simd_perf")]
-        {
+    impl Float4 {
+        #[inline(always)]
+        pub fn new(a: f32, b: f32, c: f32, d: f32) -> Float4 {
+            use std::arch::x86_64::_mm_set_ps;
             Float4 {
-                data: f32x4::new(a, b, c, d),
+                data: unsafe { _mm_set_ps(d, c, b, a) },
             }
         }
-        #[cfg(not(feature = "simd_perf"))]
-        {
+
+        #[inline(always)]
+        pub fn splat(n: f32) -> Float4 {
+            use std::arch::x86_64::_mm_set1_ps;
+            Float4 {
+                data: unsafe { _mm_set1_ps(n) },
+            }
+        }
+
+        #[inline]
+        pub fn h_sum(&self) -> f32 {
+            (self.get_0() + self.get_1()) + (self.get_2() + self.get_3())
+        }
+
+        #[inline]
+        pub fn h_product(&self) -> f32 {
+            (self.get_0() * self.get_1()) * (self.get_2() * self.get_3())
+        }
+
+        #[inline]
+        pub fn h_min(&self) -> f32 {
+            let n1 = if self.get_0() < self.get_1() {
+                self.get_0()
+            } else {
+                self.get_1()
+            };
+            let n2 = if self.get_2() < self.get_3() {
+                self.get_2()
+            } else {
+                self.get_3()
+            };
+            if n1 < n2 {
+                n1
+            } else {
+                n2
+            }
+        }
+
+        #[inline]
+        pub fn h_max(&self) -> f32 {
+            let n1 = if self.get_0() > self.get_1() {
+                self.get_0()
+            } else {
+                self.get_1()
+            };
+            let n2 = if self.get_2() > self.get_3() {
+                self.get_2()
+            } else {
+                self.get_3()
+            };
+            if n1 > n2 {
+                n1
+            } else {
+                n2
+            }
+        }
+
+        #[inline(always)]
+        pub fn v_min(&self, other: Float4) -> Float4 {
+            use std::arch::x86_64::_mm_min_ps;
+            Float4 {
+                data: unsafe { _mm_min_ps(self.data, other.data) },
+            }
+        }
+
+        #[inline(always)]
+        pub fn v_max(&self, other: Float4) -> Float4 {
+            use std::arch::x86_64::_mm_max_ps;
+            Float4 {
+                data: unsafe { _mm_max_ps(self.data, other.data) },
+            }
+        }
+
+        #[inline(always)]
+        pub fn lt(&self, other: Float4) -> Bool4 {
+            use std::arch::x86_64::_mm_cmplt_ps;
+            Bool4 {
+                data: unsafe { _mm_cmplt_ps(self.data, other.data) },
+            }
+        }
+
+        #[inline(always)]
+        pub fn lte(&self, other: Float4) -> Bool4 {
+            use std::arch::x86_64::_mm_cmple_ps;
+            Bool4 {
+                data: unsafe { _mm_cmple_ps(self.data, other.data) },
+            }
+        }
+
+        #[inline(always)]
+        pub fn gt(&self, other: Float4) -> Bool4 {
+            use std::arch::x86_64::_mm_cmpgt_ps;
+            Bool4 {
+                data: unsafe { _mm_cmpgt_ps(self.data, other.data) },
+            }
+        }
+
+        #[inline(always)]
+        pub fn gte(&self, other: Float4) -> Bool4 {
+            use std::arch::x86_64::_mm_cmpge_ps;
+            Bool4 {
+                data: unsafe { _mm_cmpge_ps(self.data, other.data) },
+            }
+        }
+
+        /// Set the nth element to the given value.
+        #[inline(always)]
+        pub fn set_n(&mut self, n: usize, v: f32) {
+            use std::mem::transmute;
+            assert!(
+                n <= 3,
+                "Attempted to set element of Float4 outside of bounds."
+            );
+
+            unsafe { *transmute::<*mut __m128, *mut f32>(&mut self.data).offset(n as isize) = v }
+        }
+
+        /// Set the 0th element to the given value.
+        #[inline(always)]
+        pub fn set_0(&mut self, v: f32) {
+            self.set_n(0, v);
+        }
+
+        /// Set the 1st element to the given value.
+        #[inline(always)]
+        pub fn set_1(&mut self, v: f32) {
+            self.set_n(1, v);
+        }
+
+        /// Set the 2nd element to the given value.
+        #[inline(always)]
+        pub fn set_2(&mut self, v: f32) {
+            self.set_n(2, v);
+        }
+
+        /// Set the 3rd element to the given value.
+        #[inline(always)]
+        pub fn set_3(&mut self, v: f32) {
+            self.set_n(3, v);
+        }
+
+        /// Returns the value of the nth element.
+        #[inline(always)]
+        pub fn get_n(&self, n: usize) -> f32 {
+            use std::mem::transmute;
+            assert!(
+                n <= 3,
+                "Attempted to access element of Float4 outside of bounds."
+            );
+
+            unsafe { *transmute::<*const __m128, *const f32>(&self.data).offset(n as isize) }
+        }
+
+        /// Returns the value of the 0th element.
+        #[inline(always)]
+        pub fn get_0(&self) -> f32 {
+            self.get_n(0)
+        }
+
+        /// Returns the value of the 1st element.
+        #[inline(always)]
+        pub fn get_1(&self) -> f32 {
+            self.get_n(1)
+        }
+
+        /// Returns the value of the 2nd element.
+        #[inline(always)]
+        pub fn get_2(&self) -> f32 {
+            self.get_n(2)
+        }
+
+        /// Returns the value of the 3rd element.
+        #[inline(always)]
+        pub fn get_3(&self) -> f32 {
+            self.get_n(3)
+        }
+    }
+
+    impl PartialEq for Float4 {
+        #[inline]
+        fn eq(&self, other: &Float4) -> bool {
+            self.get_0() == other.get_0()
+                && self.get_1() == other.get_1()
+                && self.get_2() == other.get_2()
+                && self.get_3() == other.get_3()
+        }
+    }
+
+    impl Add for Float4 {
+        type Output = Float4;
+
+        #[inline(always)]
+        fn add(self, other: Float4) -> Float4 {
+            use std::arch::x86_64::_mm_add_ps;
+            Float4 {
+                data: unsafe { _mm_add_ps(self.data, other.data) },
+            }
+        }
+    }
+
+    impl AddAssign for Float4 {
+        #[inline(always)]
+        fn add_assign(&mut self, rhs: Float4) {
+            *self = *self + rhs;
+        }
+    }
+
+    impl Sub for Float4 {
+        type Output = Float4;
+
+        #[inline(always)]
+        fn sub(self, other: Float4) -> Float4 {
+            use std::arch::x86_64::_mm_sub_ps;
+            Float4 {
+                data: unsafe { _mm_sub_ps(self.data, other.data) },
+            }
+        }
+    }
+
+    impl SubAssign for Float4 {
+        #[inline(always)]
+        fn sub_assign(&mut self, rhs: Float4) {
+            *self = *self - rhs;
+        }
+    }
+
+    impl Mul for Float4 {
+        type Output = Float4;
+
+        #[inline(always)]
+        fn mul(self, other: Float4) -> Float4 {
+            use std::arch::x86_64::_mm_mul_ps;
+            Float4 {
+                data: unsafe { _mm_mul_ps(self.data, other.data) },
+            }
+        }
+    }
+
+    impl Mul<f32> for Float4 {
+        type Output = Float4;
+
+        #[inline(always)]
+        fn mul(self, other: f32) -> Float4 {
+            self * Float4::splat(other)
+        }
+    }
+
+    impl MulAssign for Float4 {
+        #[inline(always)]
+        fn mul_assign(&mut self, rhs: Float4) {
+            *self = *self * rhs;
+        }
+    }
+
+    impl MulAssign<f32> for Float4 {
+        #[inline(always)]
+        fn mul_assign(&mut self, rhs: f32) {
+            *self = *self * rhs;
+        }
+    }
+
+    impl Div for Float4 {
+        type Output = Float4;
+
+        #[inline(always)]
+        fn div(self, other: Float4) -> Float4 {
+            use std::arch::x86_64::_mm_div_ps;
+            Float4 {
+                data: unsafe { _mm_div_ps(self.data, other.data) },
+            }
+        }
+    }
+
+    impl Div<f32> for Float4 {
+        type Output = Float4;
+
+        #[inline(always)]
+        fn div(self, other: f32) -> Float4 {
+            self / Float4::splat(other)
+        }
+    }
+
+    impl DivAssign for Float4 {
+        #[inline(always)]
+        fn div_assign(&mut self, rhs: Float4) {
+            *self = *self / rhs;
+        }
+    }
+
+    impl DivAssign<f32> for Float4 {
+        #[inline(always)]
+        fn div_assign(&mut self, rhs: f32) {
+            *self = *self / rhs;
+        }
+    }
+
+    #[inline(always)]
+    pub fn v_min(a: Float4, b: Float4) -> Float4 {
+        a.v_min(b)
+    }
+
+    #[inline(always)]
+    pub fn v_max(a: Float4, b: Float4) -> Float4 {
+        a.v_max(b)
+    }
+
+    /// Essentially a tuple of four bools, which will use SIMD operations
+    /// where possible on a platform.
+    #[derive(Debug, Copy, Clone)]
+    pub struct Bool4 {
+        data: __m128,
+    }
+
+    impl Bool4 {
+        /// Returns the value of the nth element.
+        #[inline(always)]
+        pub fn get_n(&self, n: usize) -> bool {
+            use std::mem::transmute;
+            assert!(
+                n <= 3,
+                "Attempted to access element of Bool4 outside of bounds."
+            );
+
+            0 != unsafe { *transmute::<*const __m128, *const u32>(&self.data).offset(n as isize) }
+        }
+
+        /// Returns the value of the 0th element.
+        #[inline(always)]
+        pub fn get_0(&self) -> bool {
+            self.get_n(0)
+        }
+
+        /// Returns the value of the 1st element.
+        #[inline(always)]
+        pub fn get_1(&self) -> bool {
+            self.get_n(1)
+        }
+
+        /// Returns the value of the 2nd element.
+        #[inline(always)]
+        pub fn get_2(&self) -> bool {
+            self.get_n(2)
+        }
+
+        /// Returns the value of the 3rd element.
+        #[inline(always)]
+        pub fn get_3(&self) -> bool {
+            self.get_n(3)
+        }
+
+        #[inline]
+        pub fn to_bitmask(&self) -> u8 {
+            use std::mem::transmute;
+            let a = unsafe { *transmute::<*const __m128, *const u8>(&self.data).offset(0) };
+            let b = unsafe { *transmute::<*const __m128, *const u8>(&self.data).offset(4) };
+            let c = unsafe { *transmute::<*const __m128, *const u8>(&self.data).offset(8) };
+            let d = unsafe { *transmute::<*const __m128, *const u8>(&self.data).offset(12) };
+            (a & 0b00000001) | (b & 0b00000010) | (c & 0b00000100) | (d & 0b00001000)
+        }
+    }
+
+    impl BitAnd for Bool4 {
+        type Output = Bool4;
+
+        #[inline(always)]
+        fn bitand(self, rhs: Bool4) -> Bool4 {
+            use std::arch::x86_64::_mm_and_ps;
+            Bool4 {
+                data: unsafe { _mm_and_ps(self.data, rhs.data) },
+            }
+        }
+    }
+
+    impl BitOr for Bool4 {
+        type Output = Bool4;
+
+        #[inline(always)]
+        fn bitor(self, rhs: Bool4) -> Bool4 {
+            use std::arch::x86_64::_mm_or_ps;
+            Bool4 {
+                data: unsafe { _mm_or_ps(self.data, rhs.data) },
+            }
+        }
+    }
+}
+
+//===========================================================================
+
+/// Implementation of Float4 for any platform, foregoing any
+/// platform-specific optimizations.
+mod fallback {
+    use std::cmp::PartialEq;
+    use std::ops::{Add, AddAssign, BitAnd, BitOr, Div, DivAssign, Mul, MulAssign, Sub, SubAssign};
+
+    #[derive(Debug, Copy, Clone)]
+    pub struct Float4 {
+        data: [f32; 4],
+    }
+
+    impl Float4 {
+        #[inline(always)]
+        pub fn new(a: f32, b: f32, c: f32, d: f32) -> Float4 {
             Float4 { data: [a, b, c, d] }
         }
-    }
 
-    #[inline(always)]
-    pub fn splat(n: f32) -> Float4 {
-        #[cfg(feature = "simd_perf")]
-        {
-            Float4 {
-                data: f32x4::splat(n),
-            }
-        }
-        #[cfg(not(feature = "simd_perf"))]
-        {
+        #[inline(always)]
+        pub fn splat(n: f32) -> Float4 {
             Float4 { data: [n, n, n, n] }
         }
-    }
 
-    #[inline]
-    pub fn h_sum(&self) -> f32 {
-        (self.get_0() + self.get_1()) + (self.get_2() + self.get_3())
-    }
-
-    #[inline]
-    pub fn h_product(&self) -> f32 {
-        (self.get_0() * self.get_1()) * (self.get_2() * self.get_3())
-    }
-
-    #[inline]
-    pub fn h_min(&self) -> f32 {
-        let n1 = if self.get_0() < self.get_1() {
-            self.get_0()
-        } else {
-            self.get_1()
-        };
-        let n2 = if self.get_2() < self.get_3() {
-            self.get_2()
-        } else {
-            self.get_3()
-        };
-        if n1 < n2 {
-            n1
-        } else {
-            n2
+        #[inline]
+        pub fn h_sum(&self) -> f32 {
+            (self.get_0() + self.get_1()) + (self.get_2() + self.get_3())
         }
-    }
 
-    #[inline]
-    pub fn h_max(&self) -> f32 {
-        let n1 = if self.get_0() > self.get_1() {
-            self.get_0()
-        } else {
-            self.get_1()
-        };
-        let n2 = if self.get_2() > self.get_3() {
-            self.get_2()
-        } else {
-            self.get_3()
-        };
-        if n1 > n2 {
-            n1
-        } else {
-            n2
+        #[inline]
+        pub fn h_product(&self) -> f32 {
+            (self.get_0() * self.get_1()) * (self.get_2() * self.get_3())
         }
-    }
 
-    #[inline(always)]
-    pub fn v_min(&self, other: Float4) -> Float4 {
-        #[cfg(feature = "simd_perf")]
-        {
-            Float4 {
-                data: self.data.min(other.data),
+        #[inline]
+        pub fn h_min(&self) -> f32 {
+            let n1 = if self.get_0() < self.get_1() {
+                self.get_0()
+            } else {
+                self.get_1()
+            };
+            let n2 = if self.get_2() < self.get_3() {
+                self.get_2()
+            } else {
+                self.get_3()
+            };
+            if n1 < n2 {
+                n1
+            } else {
+                n2
             }
         }
-        #[cfg(not(feature = "simd_perf"))]
-        {
+
+        #[inline]
+        pub fn h_max(&self) -> f32 {
+            let n1 = if self.get_0() > self.get_1() {
+                self.get_0()
+            } else {
+                self.get_1()
+            };
+            let n2 = if self.get_2() > self.get_3() {
+                self.get_2()
+            } else {
+                self.get_3()
+            };
+            if n1 > n2 {
+                n1
+            } else {
+                n2
+            }
+        }
+
+        #[inline(always)]
+        pub fn v_min(&self, other: Float4) -> Float4 {
             Float4::new(
                 if self.get_0() < other.get_0() {
                     self.get_0()
@@ -133,18 +503,9 @@ impl Float4 {
                 },
             )
         }
-    }
 
-    #[inline(always)]
-    pub fn v_max(&self, other: Float4) -> Float4 {
-        #[cfg(feature = "simd_perf")]
-        {
-            Float4 {
-                data: self.data.max(other.data),
-            }
-        }
-        #[cfg(not(feature = "simd_perf"))]
-        {
+        #[inline(always)]
+        pub fn v_max(&self, other: Float4) -> Float4 {
             Float4::new(
                 if self.get_0() > other.get_0() {
                     self.get_0()
@@ -168,18 +529,9 @@ impl Float4 {
                 },
             )
         }
-    }
 
-    #[inline(always)]
-    pub fn lt(&self, other: Float4) -> Bool4 {
-        #[cfg(feature = "simd_perf")]
-        {
-            Bool4 {
-                data: self.data.lt(other.data),
-            }
-        }
-        #[cfg(not(feature = "simd_perf"))]
-        {
+        #[inline(always)]
+        pub fn lt(&self, other: Float4) -> Bool4 {
             Bool4 {
                 data: [
                     self.data[0] < other.data[0],
@@ -189,18 +541,9 @@ impl Float4 {
                 ],
             }
         }
-    }
 
-    #[inline(always)]
-    pub fn lte(&self, other: Float4) -> Bool4 {
-        #[cfg(feature = "simd_perf")]
-        {
-            Bool4 {
-                data: self.data.le(other.data),
-            }
-        }
-        #[cfg(not(feature = "simd_perf"))]
-        {
+        #[inline(always)]
+        pub fn lte(&self, other: Float4) -> Bool4 {
             Bool4 {
                 data: [
                     self.data[0] <= other.data[0],
@@ -210,18 +553,9 @@ impl Float4 {
                 ],
             }
         }
-    }
 
-    #[inline(always)]
-    pub fn gt(&self, other: Float4) -> Bool4 {
-        #[cfg(feature = "simd_perf")]
-        {
-            Bool4 {
-                data: self.data.gt(other.data),
-            }
-        }
-        #[cfg(not(feature = "simd_perf"))]
-        {
+        #[inline(always)]
+        pub fn gt(&self, other: Float4) -> Bool4 {
             Bool4 {
                 data: [
                     self.data[0] > other.data[0],
@@ -231,18 +565,9 @@ impl Float4 {
                 ],
             }
         }
-    }
 
-    #[inline(always)]
-    pub fn gte(&self, other: Float4) -> Bool4 {
-        #[cfg(feature = "simd_perf")]
-        {
-            Bool4 {
-                data: self.data.ge(other.data),
-            }
-        }
-        #[cfg(not(feature = "simd_perf"))]
-        {
+        #[inline(always)]
+        pub fn gte(&self, other: Float4) -> Bool4 {
             Bool4 {
                 data: [
                     self.data[0] >= other.data[0],
@@ -252,110 +577,93 @@ impl Float4 {
                 ],
             }
         }
-    }
 
-    /// Set the nth element to the given value.
-    #[inline(always)]
-    pub fn set_n(&mut self, n: usize, v: f32) {
-        assert!(
-            n <= 3,
-            "Attempted to set element of Float4 outside of bounds."
-        );
-        #[cfg(feature = "simd_perf")]
-        {
-            self.data = self.data.replace(n as u32, v);
-        }
-        #[cfg(not(feature = "simd_perf"))]
-        unsafe {
-            *self.data.get_unchecked_mut(n) = v;
-        }
-    }
-
-    /// Set the 0th element to the given value.
-    #[inline(always)]
-    pub fn set_0(&mut self, v: f32) {
-        self.set_n(0, v);
-    }
-
-    /// Set the 1th element to the given value.
-    #[inline(always)]
-    pub fn set_1(&mut self, v: f32) {
-        self.set_n(1, v);
-    }
-
-    /// Set the 2th element to the given value.
-    #[inline(always)]
-    pub fn set_2(&mut self, v: f32) {
-        self.set_n(2, v);
-    }
-
-    /// Set the 3th element to the given value.
-    #[inline(always)]
-    pub fn set_3(&mut self, v: f32) {
-        self.set_n(3, v);
-    }
-
-    /// Returns the value of the nth element.
-    #[inline(always)]
-    pub fn get_n(&self, n: usize) -> f32 {
-        assert!(
-            n <= 3,
-            "Attempted to access element of Float4 outside of bounds."
-        );
-        #[cfg(feature = "simd_perf")]
-        {
-            self.data.extract(n as u32)
-        }
-        #[cfg(not(feature = "simd_perf"))]
-        unsafe { *self.data.get_unchecked(n) }
-    }
-
-    /// Returns the value of the 0th element.
-    #[inline(always)]
-    pub fn get_0(&self) -> f32 {
-        self.get_n(0)
-    }
-
-    /// Returns the value of the 1th element.
-    #[inline(always)]
-    pub fn get_1(&self) -> f32 {
-        self.get_n(1)
-    }
-
-    /// Returns the value of the 2th element.
-    #[inline(always)]
-    pub fn get_2(&self) -> f32 {
-        self.get_n(2)
-    }
-
-    /// Returns the value of the 3th element.
-    #[inline(always)]
-    pub fn get_3(&self) -> f32 {
-        self.get_n(3)
-    }
-}
-
-impl PartialEq for Float4 {
-    #[inline]
-    fn eq(&self, other: &Float4) -> bool {
-        self.get_0() == other.get_0() && self.get_1() == other.get_1()
-            && self.get_2() == other.get_2() && self.get_3() == other.get_3()
-    }
-}
-
-impl Add for Float4 {
-    type Output = Float4;
-
-    #[inline(always)]
-    fn add(self, other: Float4) -> Float4 {
-        #[cfg(feature = "simd_perf")]
-        {
-            Float4 {
-                data: self.data + other.data,
+        /// Set the nth element to the given value.
+        #[inline(always)]
+        pub fn set_n(&mut self, n: usize, v: f32) {
+            assert!(
+                n <= 3,
+                "Attempted to set element of Float4 outside of bounds."
+            );
+            unsafe {
+                *self.data.get_unchecked_mut(n) = v;
             }
         }
-        #[cfg(not(feature = "simd_perf"))]
-        {
+
+        /// Set the 0th element to the given value.
+        #[inline(always)]
+        pub fn set_0(&mut self, v: f32) {
+            self.set_n(0, v);
+        }
+
+        /// Set the 1st element to the given value.
+        #[inline(always)]
+        pub fn set_1(&mut self, v: f32) {
+            self.set_n(1, v);
+        }
+
+        /// Set the 2nd element to the given value.
+        #[inline(always)]
+        pub fn set_2(&mut self, v: f32) {
+            self.set_n(2, v);
+        }
+
+        /// Set the 3rd element to the given value.
+        #[inline(always)]
+        pub fn set_3(&mut self, v: f32) {
+            self.set_n(3, v);
+        }
+
+        /// Returns the value of the nth element.
+        #[inline(always)]
+        pub fn get_n(&self, n: usize) -> f32 {
+            assert!(
+                n <= 3,
+                "Attempted to access element of Float4 outside of bounds."
+            );
+            unsafe { *self.data.get_unchecked(n) }
+        }
+
+        /// Returns the value of the 0th element.
+        #[inline(always)]
+        pub fn get_0(&self) -> f32 {
+            self.get_n(0)
+        }
+
+        /// Returns the value of the 1st element.
+        #[inline(always)]
+        pub fn get_1(&self) -> f32 {
+            self.get_n(1)
+        }
+
+        /// Returns the value of the 2nd element.
+        #[inline(always)]
+        pub fn get_2(&self) -> f32 {
+            self.get_n(2)
+        }
+
+        /// Returns the value of the 3rd element.
+        #[inline(always)]
+        pub fn get_3(&self) -> f32 {
+            self.get_n(3)
+        }
+    }
+
+    impl PartialEq for Float4 {
+        #[inline]
+        fn eq(&self, other: &Float4) -> bool {
+            self.get_0() == other.get_0()
+                && self.get_1() == other.get_1()
+                && self.get_2() == other.get_2()
+                && self.get_3() == other.get_3()
+        }
+    }
+
+    impl Add for Float4 {
+        type Output = Float4;
+
+        #[inline(always)]
+        fn add(self, other: Float4) -> Float4 {
             Float4 {
                 data: [
                     self.get_0() + other.get_0(),
@@ -366,28 +674,19 @@ impl Add for Float4 {
             }
         }
     }
-}
 
-impl AddAssign for Float4 {
-    #[inline(always)]
-    fn add_assign(&mut self, rhs: Float4) {
-        *self = *self + rhs;
-    }
-}
-
-impl Sub for Float4 {
-    type Output = Float4;
-
-    #[inline(always)]
-    fn sub(self, other: Float4) -> Float4 {
-        #[cfg(feature = "simd_perf")]
-        {
-            Float4 {
-                data: self.data - other.data,
-            }
+    impl AddAssign for Float4 {
+        #[inline(always)]
+        fn add_assign(&mut self, rhs: Float4) {
+            *self = *self + rhs;
         }
-        #[cfg(not(feature = "simd_perf"))]
-        {
+    }
+
+    impl Sub for Float4 {
+        type Output = Float4;
+
+        #[inline(always)]
+        fn sub(self, other: Float4) -> Float4 {
             Float4 {
                 data: [
                     self.get_0() - other.get_0(),
@@ -398,28 +697,19 @@ impl Sub for Float4 {
             }
         }
     }
-}
 
-impl SubAssign for Float4 {
-    #[inline(always)]
-    fn sub_assign(&mut self, rhs: Float4) {
-        *self = *self - rhs;
-    }
-}
-
-impl Mul for Float4 {
-    type Output = Float4;
-
-    #[inline(always)]
-    fn mul(self, other: Float4) -> Float4 {
-        #[cfg(feature = "simd_perf")]
-        {
-            Float4 {
-                data: self.data * other.data,
-            }
+    impl SubAssign for Float4 {
+        #[inline(always)]
+        fn sub_assign(&mut self, rhs: Float4) {
+            *self = *self - rhs;
         }
-        #[cfg(not(feature = "simd_perf"))]
-        {
+    }
+
+    impl Mul for Float4 {
+        type Output = Float4;
+
+        #[inline(always)]
+        fn mul(self, other: Float4) -> Float4 {
             Float4 {
                 data: [
                     self.get_0() * other.get_0(),
@@ -430,21 +720,12 @@ impl Mul for Float4 {
             }
         }
     }
-}
 
-impl Mul<f32> for Float4 {
-    type Output = Float4;
+    impl Mul<f32> for Float4 {
+        type Output = Float4;
 
-    #[inline(always)]
-    fn mul(self, other: f32) -> Float4 {
-        #[cfg(feature = "simd_perf")]
-        {
-            Float4 {
-                data: self.data * f32x4::splat(other),
-            }
-        }
-        #[cfg(not(feature = "simd_perf"))]
-        {
+        #[inline(always)]
+        fn mul(self, other: f32) -> Float4 {
             Float4 {
                 data: [
                     self.get_0() * other,
@@ -455,35 +736,26 @@ impl Mul<f32> for Float4 {
             }
         }
     }
-}
 
-impl MulAssign for Float4 {
-    #[inline(always)]
-    fn mul_assign(&mut self, rhs: Float4) {
-        *self = *self * rhs;
-    }
-}
-
-impl MulAssign<f32> for Float4 {
-    #[inline(always)]
-    fn mul_assign(&mut self, rhs: f32) {
-        *self = *self * rhs;
-    }
-}
-
-impl Div for Float4 {
-    type Output = Float4;
-
-    #[inline(always)]
-    fn div(self, other: Float4) -> Float4 {
-        #[cfg(feature = "simd_perf")]
-        {
-            Float4 {
-                data: self.data / other.data,
-            }
+    impl MulAssign for Float4 {
+        #[inline(always)]
+        fn mul_assign(&mut self, rhs: Float4) {
+            *self = *self * rhs;
         }
-        #[cfg(not(feature = "simd_perf"))]
-        {
+    }
+
+    impl MulAssign<f32> for Float4 {
+        #[inline(always)]
+        fn mul_assign(&mut self, rhs: f32) {
+            *self = *self * rhs;
+        }
+    }
+
+    impl Div for Float4 {
+        type Output = Float4;
+
+        #[inline(always)]
+        fn div(self, other: Float4) -> Float4 {
             Float4 {
                 data: [
                     self.get_0() / other.get_0(),
@@ -494,21 +766,12 @@ impl Div for Float4 {
             }
         }
     }
-}
 
-impl Div<f32> for Float4 {
-    type Output = Float4;
+    impl Div<f32> for Float4 {
+        type Output = Float4;
 
-    #[inline(always)]
-    fn div(self, other: f32) -> Float4 {
-        #[cfg(feature = "simd_perf")]
-        {
-            Float4 {
-                data: self.data / f32x4::splat(other),
-            }
-        }
-        #[cfg(not(feature = "simd_perf"))]
-        {
+        #[inline(always)]
+        fn div(self, other: f32) -> Float4 {
             Float4 {
                 data: [
                     self.get_0() / other,
@@ -519,108 +782,94 @@ impl Div<f32> for Float4 {
             }
         }
     }
-}
 
-impl DivAssign for Float4 {
-    #[inline(always)]
-    fn div_assign(&mut self, rhs: Float4) {
-        *self = *self / rhs;
-    }
-}
-
-impl DivAssign<f32> for Float4 {
-    #[inline(always)]
-    fn div_assign(&mut self, rhs: f32) {
-        *self = *self / rhs;
-    }
-}
-
-#[inline(always)]
-pub fn v_min(a: Float4, b: Float4) -> Float4 {
-    a.v_min(b)
-}
-
-#[inline(always)]
-pub fn v_max(a: Float4, b: Float4) -> Float4 {
-    a.v_max(b)
-}
-
-/// Essentially a tuple of four bools, which will use SIMD operations
-/// where possible on a platform.
-#[cfg(feature = "simd_perf")]
-#[derive(Debug, Copy, Clone)]
-pub struct Bool4 {
-    data: bool32fx4,
-}
-
-#[cfg(not(feature = "simd_perf"))]
-#[derive(Debug, Copy, Clone)]
-pub struct Bool4 {
-    data: [bool; 4],
-}
-
-impl Bool4 {
-    /// Returns the value of the nth element.
-    #[inline(always)]
-    pub fn get_n(&self, n: usize) -> bool {
-        assert!(
-            n <= 3,
-            "Attempted to access element of Bool4 outside of bounds."
-        );
-        #[cfg(feature = "simd_perf")]
-        {
-            self.data.extract(n as u32)
+    impl DivAssign for Float4 {
+        #[inline(always)]
+        fn div_assign(&mut self, rhs: Float4) {
+            *self = *self / rhs; // `a /= b` is defined as `a = a / b`, reusing the by-value `/` operator
         }
-        #[cfg(not(feature = "simd_perf"))]
-        {
+    }
+
+    impl DivAssign<f32> for Float4 {
+        #[inline(always)]
+        fn div_assign(&mut self, rhs: f32) {
+            *self = *self / rhs; // scalar form: `a /= s` is defined as `a = a / s` (Float4 / f32)
+        }
+    }
+
+    #[inline(always)]
+    pub fn v_min(a: Float4, b: Float4) -> Float4 {
+        a.v_min(b) // free-function spelling; forwards to the `v_min` method on `a`
+    }
+
+    #[inline(always)]
+    pub fn v_max(a: Float4, b: Float4) -> Float4 {
+        a.v_max(b) // free-function spelling; forwards to the `v_max` method on `a`
+    }
+
+    /// Essentially a tuple of four bools, which will use SIMD operations
+    /// where possible on a platform.
+    #[cfg(feature = "simd_perf")] // NOTE(review): this change drops the `simd` crate and math3d's `simd_perf` feature; confirm this variant is still buildable
+    #[derive(Debug, Copy, Clone)]
+    pub struct Bool4 {
+        data: bool32fx4,
+    }
+
+    #[cfg(not(feature = "simd_perf"))]
+    #[derive(Debug, Copy, Clone)]
+    pub struct Bool4 {
+        data: [bool; 4], // one plain `bool` per element, indices 0..=3
+    }
+
+    impl Bool4 {
+        /// Returns the value of the nth element; panics if `n` is greater than 3.
+        #[inline(always)]
+        pub fn get_n(&self, n: usize) -> bool {
+            assert!(
+                n <= 3,
+                "Attempted to access element of Bool4 outside of bounds."
+            ); // this check is what keeps the unchecked read below within the four elements
             unsafe { *self.data.get_unchecked(n) }
         }
-    }
 
-    /// Returns the value of the 0th element.
-    #[inline(always)]
-    pub fn get_0(&self) -> bool {
-        self.get_n(0)
-    }
-
-    /// Returns the value of the 1th element.
-    #[inline(always)]
-    pub fn get_1(&self) -> bool {
-        self.get_n(1)
-    }
-
-    /// Returns the value of the 2th element.
-    #[inline(always)]
-    pub fn get_2(&self) -> bool {
-        self.get_n(2)
-    }
-
-    /// Returns the value of the 3th element.
-    #[inline(always)]
-    pub fn get_3(&self) -> bool {
-        self.get_n(3)
-    }
-
-    #[inline]
-    pub fn to_bitmask(&self) -> u8 {
-        (self.get_0() as u8) | ((self.get_1() as u8) << 1) | ((self.get_2() as u8) << 2)
-            | ((self.get_3() as u8) << 3)
-    }
-}
-
-impl BitAnd for Bool4 {
-    type Output = Bool4;
-
-    #[inline(always)]
-    fn bitand(self, rhs: Bool4) -> Bool4 {
-        #[cfg(feature = "simd_perf")]
-        {
-            Bool4 {
-                data: self.data & rhs.data,
-            }
+        /// Returns the value of the 0th element (same as `get_n(0)`).
+        #[inline(always)]
+        pub fn get_0(&self) -> bool {
+            self.get_n(0)
         }
-        #[cfg(not(feature = "simd_perf"))]
-        {
+
+        /// Returns the value of the 1st element, i.e. index 1 (same as `get_n(1)`).
+        #[inline(always)]
+        pub fn get_1(&self) -> bool {
+            self.get_n(1)
+        }
+
+        /// Returns the value of the 2nd element, i.e. index 2 (same as `get_n(2)`).
+        #[inline(always)]
+        pub fn get_2(&self) -> bool {
+            self.get_n(2)
+        }
+
+        /// Returns the value of the 3rd element, i.e. index 3 (same as `get_n(3)`).
+        #[inline(always)]
+        pub fn get_3(&self) -> bool {
+            self.get_n(3)
+        }
+
+        #[inline]
+        pub fn to_bitmask(&self) -> u8 {
+            (self.get_0() as u8) // element 0 -> bit 0 (least significant)
+                | ((self.get_1() as u8) << 1) // element 1 -> bit 1
+                | ((self.get_2() as u8) << 2) // element 2 -> bit 2
+                | ((self.get_3() as u8) << 3) // element 3 -> bit 3; bits 4..=7 are never set
+        }
+    }
+
+    impl BitAnd for Bool4 {
+        type Output = Bool4;
+
+        #[inline(always)]
+        fn bitand(self, rhs: Bool4) -> Bool4 {
             Bool4 {
                 data: [
                     self.data[0] && rhs.data[0],
@@ -631,8 +880,34 @@ impl BitAnd for Bool4 {
             }
         }
     }
+
+    impl BitOr for Bool4 { // `a | b` on two Bool4 values
+        type Output = Bool4;
+
+        #[inline(always)]
+        fn bitor(self, rhs: Bool4) -> Bool4 {
+            Bool4 {
+                data: [ // each output element is the logical OR of the matching input elements
+                    self.data[0] || rhs.data[0],
+                    self.data[1] || rhs.data[1],
+                    self.data[2] || rhs.data[2],
+                    self.data[3] || rhs.data[3],
+                ],
+            }
+        }
+    }
 }
 
+//===========================================================================
+
+#[cfg(all(target_arch = "x86_64", target_feature = "sse"))]
+pub use x86_64_sse::{v_max, v_min, Bool4, Float4};
+
+#[cfg(not(all(target_arch = "x86_64", target_feature = "sse")))]
+pub use fallback::{v_max, v_min, Bool4, Float4};
+
+//===========================================================================
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -778,4 +1053,57 @@ mod tests {
 
         assert_eq!(f1 / v, f2);
     }
+
+    #[test]
+    fn lt() {
+        let f1 = Float4::new(1.0, 2.0, 3.0, 4.0);
+        let f2 = Float4::new(0.5, 2.0, 3.5, 2.0);
+
+        let r = f1.lt(f2); // expected: element-wise strict `f1 < f2`
+
+        assert_eq!(r.get_0(), false); // 1.0 < 0.5 does not hold
+        assert_eq!(r.get_1(), false); // 2.0 < 2.0 does not hold: equal values are not "less than"
+        assert_eq!(r.get_2(), true); // 3.0 < 3.5 holds
+        assert_eq!(r.get_3(), false); // 4.0 < 2.0 does not hold
+    }
+
+    #[test]
+    fn gt() {
+        let f1 = Float4::new(1.0, 2.0, 3.0, 4.0);
+        let f2 = Float4::new(0.5, 2.0, 3.5, 2.0);
+
+        let r = f1.gt(f2); // expected: element-wise strict `f1 > f2`
+
+        assert_eq!(r.get_0(), true); // 1.0 > 0.5 holds
+        assert_eq!(r.get_1(), false); // 2.0 > 2.0 does not hold: equal values are not "greater than"
+        assert_eq!(r.get_2(), false); // 3.0 > 3.5 does not hold
+        assert_eq!(r.get_3(), true); // 4.0 > 2.0 holds
+    }
+
+    #[test]
+    fn bool4_bitmask_01() {
+        let f1 = Float4::new(0.0, 0.0, 0.0, 0.0);
+        let f2 = Float4::new(-1.0, -1.0, 1.0, -1.0);
+        let r = f1.lt(f2).to_bitmask(); // bit i is expected to be set where 0.0 < f2[i]
+
+        assert_eq!(r, 0b00000100); // only element 2 is positive, so only bit 2 is set
+    }
+
+    #[test]
+    fn bool4_bitmask_02() {
+        let f1 = Float4::new(0.0, 0.0, 0.0, 0.0);
+        let f2 = Float4::new(1.0, -1.0, 1.0, -1.0);
+        let r = f1.lt(f2).to_bitmask(); // bit i is expected to be set where 0.0 < f2[i]
+
+        assert_eq!(r, 0b00000101); // elements 0 and 2 are positive, so bits 0 and 2 are set
+    }
+
+    #[test]
+    fn bool4_bitmask_03() {
+        let f1 = Float4::new(0.0, 0.0, 0.0, 0.0);
+        let f2 = Float4::new(-1.0, 1.0, -1.0, 1.0);
+        let r = f1.lt(f2).to_bitmask(); // bit i is expected to be set where 0.0 < f2[i]
+
+        assert_eq!(r, 0b00001010); // elements 1 and 3 are positive, so bits 1 and 3 are set
+    }
 }
diff --git a/sub_crates/math3d/Cargo.toml b/sub_crates/math3d/Cargo.toml
index de115e4..53c875f 100644
--- a/sub_crates/math3d/Cargo.toml
+++ b/sub_crates/math3d/Cargo.toml
@@ -8,9 +8,6 @@ license = "MIT"
 name = "math3d"
 path = "src/lib.rs"
 
-[features]
-simd_perf = ["float4/simd_perf"]
-
 # Local crate dependencies
 [dependencies.float4]
 path = "../float4"
\ No newline at end of file