diff --git a/src/accel/bvh4.rs b/src/accel/bvh4.rs
new file mode 100644
index 0000000..15eab8e
--- /dev/null
+++ b/src/accel/bvh4.rs
@@ -0,0 +1,164 @@
+#![allow(dead_code)]
+
+use mem_arena::MemArena;
+
+use algorithm::partition;
+use bbox::BBox;
+use boundable::Boundable;
+use lerp::lerp_slice;
+use ray::AccelRay;
+
+use super::bvh_base::{BVHBase, BVHBaseNode, BVH_MAX_DEPTH};
+
+/// A bounding volume hierarchy whose nodes are borrowed for the lifetime `'a`.
+#[derive(Copy, Clone, Debug)]
+pub struct BVH4<'a> {
+    root: Option<&'a BVH4Node<'a>>, // `None` when the tree was built from zero objects.
+    depth: usize, // Taken from `BVHBase::depth` at build time; 0 for an empty tree.
+}
+
+/// A tree node; both variants carry the bounds that rays are tested against.
+#[derive(Copy, Clone, Debug)]
+enum BVH4Node<'a> {
+    Internal {
+        bounds: &'a [BBox],
+        children: (&'a BVH4Node<'a>, &'a BVH4Node<'a>),
+        split_axis: u8, // Axis index; traversal uses it to order the two children.
+    },
+    Leaf {
+        bounds: &'a [BBox],
+        object_range: (usize, usize), // Half-open `start..end` range into the object slice.
+    },
+}
+
+impl<'a> BVH4<'a> {
+    /// Builds a `BVH4` over `objects`, allocating the tree's nodes from `arena`.
+    ///
+    /// An empty `objects` slice yields a tree with no root and a depth of 0. Otherwise a
+    /// `BVHBase` is built from the arguments and then converted starting at its root node.
+    pub fn from_objects<'b, T, F>(arena: &'a MemArena,
+                                  objects: &mut [T],
+                                  objects_per_leaf: usize,
+                                  bounder: F)
+                                  -> BVH4<'a>
+        where F: 'b + Fn(&T) -> &'b [BBox]
+    {
+        if objects.is_empty() {
+            return BVH4 { root: None, depth: 0 };
+        }
+
+        // Build the intermediate tree first, then copy it into arena-backed nodes.
+        let base = BVHBase::from_objects(objects, objects_per_leaf, bounder);
+        let root_index = base.root_node_index();
+        let root: &'a BVH4Node<'a> = BVH4::construct_from_base(arena, &base, root_index);
+        BVH4 { root: Some(root), depth: base.depth }
+    }
+
+    /// Returns the tree depth recorded by `from_objects`: the `BVHBase` depth, or 0 when
+    /// the tree was built from an empty object slice.
+    pub fn tree_depth(&self) -> usize { self.depth }
+
+    /// Tests `rays` against the tree, invoking `obj_ray_test` on each object of every leaf
+    /// for which at least one ray passes the bounds test.
+    ///
+    /// Each visited node partitions its ray prefix by "not done and hits the node's
+    /// bounds" and hands only the passing rays onward, so `rays` is reordered in place.
+    pub fn traverse<T, F>(&self, rays: &mut [AccelRay], objects: &[T], mut obj_ray_test: F)
+        where F: FnMut(&T, &mut [AccelRay])
+    {
+        let root = match self.root { Some(node) => node, None => return };
+
+        // Parallel stacks: the node to visit and the length of the ray prefix to test
+        // against it. +2 of max depth for root and last child.
+        let mut nodes = [root; BVH_MAX_DEPTH + 2];
+        let mut ray_counts = [rays.len(); BVH_MAX_DEPTH + 2];
+        let mut top = 1;
+
+        while top > 0 {
+            let node = nodes[top];
+            let bounds = match *node {
+                BVH4Node::Internal { bounds, .. } => bounds,
+                BVH4Node::Leaf { bounds, .. } => bounds,
+            };
+            // Both node kinds run the same bounds test over their ray prefix.
+            let hits = partition(&mut rays[..ray_counts[top]], |r| {
+                (!r.is_done()) && lerp_slice(bounds, r.time).intersect_accel_ray(r)
+            });
+            match *node {
+                BVH4Node::Internal { children, split_axis, .. } if hits > 0 => {
+                    // Replace this entry with both children; each inherits the hit prefix.
+                    let flip = rays[0].dir_inv.get_n(split_axis as usize).is_sign_positive();
+                    let (lower, upper) = if flip { (children.1, children.0) } else { children };
+                    nodes[top] = lower;
+                    nodes[top + 1] = upper;
+                    ray_counts[top] = hits;
+                    ray_counts[top + 1] = hits;
+                    top += 1;
+                }
+                BVH4Node::Internal { .. } => top -= 1, // No ray hit this subtree: pop it.
+                BVH4Node::Leaf { object_range, .. } => {
+                    if hits > 0 {
+                        for obj in &objects[object_range.0..object_range.1] {
+                            obj_ray_test(obj, &mut rays[..hits]);
+                        }
+                    }
+                    top -= 1;
+                }
+            }
+        }
+    }
+
+    /// Recursively copies the `BVHBase` subtree rooted at `node_index` into `arena`.
+    ///
+    /// The node itself is allocated first, then its bounds, then (for internal nodes) its
+    /// two children. Returns a reference to the newly written node.
+    fn construct_from_base(arena: &'a MemArena,
+                           base: &BVHBase,
+                           node_index: usize)
+                           -> &'a mut BVH4Node<'a> {
+        // The slot is assigned below before anything reads it, and `BVH4Node` is `Copy`,
+        // so overwriting the uninitialized value cannot run a destructor.
+        let node = unsafe { arena.alloc_uninitialized::<BVH4Node>() };
+
+        *node = match base.nodes[node_index] {
+            BVHBaseNode::Internal { bounds_range, children_indices, split_axis } => {
+                let bounds = arena.copy_slice(&base.bounds[bounds_range.0..bounds_range.1]);
+                // Children are converted depth-first, first child before second.
+                let first = BVH4::construct_from_base(arena, base, children_indices.0);
+                let second = BVH4::construct_from_base(arena, base, children_indices.1);
+                BVH4Node::Internal {
+                    bounds: bounds,
+                    children: (first, second),
+                    split_axis: split_axis,
+                }
+            }
+            BVHBaseNode::Leaf { bounds_range, object_range } => {
+                BVH4Node::Leaf {
+                    bounds: arena.copy_slice(&base.bounds[bounds_range.0..bounds_range.1]),
+                    object_range: object_range,
+                }
+            }
+        };
+
+        node
+    }
+}
+
+lazy_static! { // Bounds slice that `BVH4::bounds` returns for a tree without a root.
+    static ref DEGENERATE_BOUNDS: [BBox; 1] = [BBox::new()];
+}
+
+/// The bounds of a `BVH4` are the bounds stored on its root node.
+impl<'a> Boundable for BVH4<'a> {
+    /// Returns the root node's bounds slice, whichever variant the root is.
+    ///
+    /// A tree without a root returns the shared `DEGENERATE_BOUNDS` slice instead.
+    fn bounds<'b>(&'b self) -> &'b [BBox] {
+        match self.root {
+            // Both node variants carry a `bounds` field.
+            Some(&BVH4Node::Internal { bounds, .. }) => bounds,
+            Some(&BVH4Node::Leaf { bounds, .. }) => bounds,
+            None => &DEGENERATE_BOUNDS[..],
+        }
+    }
+}
diff --git a/src/accel/mod.rs b/src/accel/mod.rs
index 4a5ff4a..3bde84d 100644
--- a/src/accel/mod.rs
+++ b/src/accel/mod.rs
@@ -1,5 +1,6 @@
 mod bvh_base;
 mod bvh;
+mod bvh4;
 mod light_array;
 mod light_tree;
 mod objects_split;
@@ -8,6 +9,7 @@ use math::{Vector, Point, Normal};
 use shading::surface_closure::SurfaceClosure;
 
 pub use self::bvh::BVH;
+pub use self::bvh4::BVH4;
 pub use self::light_tree::LightTree;
 
 
diff --git a/src/bbox4.rs b/src/bbox4.rs
new file mode 100644
index 0000000..1c79470
--- /dev/null
+++ b/src/bbox4.rs
@@ -0,0 +1,121 @@
+#![allow(dead_code)]
+
+use std;
+use std::ops::{BitOr, BitOrAssign};
+
+use bbox::BBox;
+use float4::{Float4, Bool4, v_min, v_max};
+use lerp::{lerp, Lerp};
+use ray::AccelRay;
+
+// Factor slightly above 1.0 that `intersect_accel_ray` uses when computing `maxs`.
+const BBOX_MAXT_ADJUST: f32 = 1.00000024;
+
+/// A SIMD set of 4 3D axis-aligned bounding boxes, stored one coordinate axis per `Float4`.
+#[derive(Debug, Copy, Clone)]
+pub struct BBox4 {
+    pub min: (Float4, Float4, Float4), // xs, ys, zs of the min corners; element i is box i
+    pub max: (Float4, Float4, Float4), // xs, ys, zs of the max corners; element i is box i
+}
+
+impl BBox4 {
+    /// Creates four degenerate boxes: every element of each `min` component is +infinity
+    /// and every element of each `max` component is -infinity.
+    pub fn new() -> BBox4 {
+        let pos_inf = Float4::splat(std::f32::INFINITY);
+        let neg_inf = Float4::splat(std::f32::NEG_INFINITY);
+
+        BBox4 {
+            min: (pos_inf, pos_inf, pos_inf),
+            max: (neg_inf, neg_inf, neg_inf),
+        }
+    }
+
+    /// Packs four `BBox`es into one `BBox4`. Each component `Float4` is built from the
+    /// matching coordinate of `b1`, `b2`, `b3` and `b4`, passed to `Float4::new` in that
+    /// order.
+    pub fn from_bboxes(b1: BBox, b2: BBox, b3: BBox, b4: BBox) -> BBox4 {
+        let min_x = Float4::new(b1.min.x(), b2.min.x(), b3.min.x(), b4.min.x());
+        let min_y = Float4::new(b1.min.y(), b2.min.y(), b3.min.y(), b4.min.y());
+        let min_z = Float4::new(b1.min.z(), b2.min.z(), b3.min.z(), b4.min.z());
+        let max_x = Float4::new(b1.max.x(), b2.max.x(), b3.max.x(), b4.max.x());
+        let max_y = Float4::new(b1.max.y(), b2.max.y(), b3.max.y(), b4.max.y());
+        let max_z = Float4::new(b1.max.z(), b2.max.z(), b3.max.z(), b4.max.z());
+        BBox4 { min: (min_x, min_y, min_z), max: (max_x, max_y, max_z) }
+    }
+
+    /// Tests `ray` against all four boxes at once using the slab method.
+    ///
+    /// For each axis, the distances along the ray to the boxes' min and max planes are
+    /// computed as `(plane - origin) * dir_inv`. The largest near distance (clamped to
+    /// at least 0) and the smallest far distance delimit the overlap with each box.
+    ///
+    /// Returns a `Bool4` whose element for a box is true when that box's near distance
+    /// is less than `ray.max_t` and no greater than its far distance scaled by
+    /// `BBOX_MAXT_ADJUST`.
+    pub fn intersect_accel_ray(&self, ray: &AccelRay) -> Bool4 {
+        // Distances to one axis' pair of planes for all four boxes, returned as
+        // (near, far) according to the sign of the ray's inverse direction.
+        fn slab(lo: Float4, hi: Float4, orig: f32, dir_inv: f32) -> (Float4, Float4) {
+            let o = Float4::splat(orig);
+            let d = Float4::splat(dir_inv);
+            let (t_lo, t_hi) = ((lo - o) * d, (hi - o) * d);
+            if dir_inv >= 0.0 {
+                (t_lo, t_hi)
+            } else {
+                (t_hi, t_lo)
+            }
+        }
+
+        let (xlos, xhis) = slab(self.min.0, self.max.0, ray.orig.x(), ray.dir_inv.x());
+        let (ylos, yhis) = slab(self.min.1, self.max.1, ray.orig.y(), ray.dir_inv.y());
+        let (zlos, zhis) = slab(self.min.2, self.max.2, ray.orig.z(), ray.dir_inv.z());
+
+        // Latest entry (never before the ray origin at t = 0) and earliest exit.
+        let mins = v_max(v_max(xlos, ylos), v_max(zlos, Float4::splat(0.0)));
+        // Scale the exit distance itself: multiplying the factor into the -infinity
+        // operand instead would leave the result unchanged.
+        // NOTE(review): the `v_max` with -infinity presumably filters NaN lanes -- confirm.
+        let maxs = v_max(v_min(v_min(xhis, yhis), zhis), Float4::splat(std::f32::NEG_INFINITY)) *
+                   Float4::splat(BBOX_MAXT_ADJUST);
+
+        // A box is hit when its interval starts before `max_t` and is not empty.
+        mins.lt(Float4::splat(ray.max_t)) & mins.lte(maxs)
+    }
+}
+
+
+/// Union of two `BBox4`s, computed component by component.
+///
+/// Each `min` component of the result is `v_min` of the operands' matching components,
+/// and each `max` component is `v_max` of the operands' matching components.
+impl BitOr for BBox4 {
+    type Output = BBox4;
+
+    fn bitor(self, rhs: BBox4) -> BBox4 {
+        let (lo, hi) = (self.min, self.max);
+        BBox4 {
+            min: (v_min(lo.0, rhs.min.0), v_min(lo.1, rhs.min.1), v_min(lo.2, rhs.min.2)),
+            max: (v_max(hi.0, rhs.max.0), v_max(hi.1, rhs.max.1), v_max(hi.2, rhs.max.2)),
+        }
+    }
+}
+
+/// In-place union: `a |= b` replaces `a` with `a | b`.
+impl BitOrAssign for BBox4 {
+    /// Overwrites `self` with the union of its current value and `rhs`.
+    fn bitor_assign(&mut self, rhs: BBox4) { *self = BitOr::bitor(*self, rhs); }
+}
+
+/// Linear interpolation between two `BBox4`s: each of the six `Float4` components is
+/// passed through `lerp` with the same `alpha`.
+impl Lerp for BBox4 {
+    fn lerp(self, other: BBox4, alpha: f32) -> BBox4 {
+        let ((ax, ay, az), (bx, by, bz)) = (self.min, other.min);
+        let ((cx, cy, cz), (dx, dy, dz)) = (self.max, other.max);
+        BBox4 {
+            min: (lerp(ax, bx, alpha), lerp(ay, by, alpha), lerp(az, bz, alpha)),
+            max: (lerp(cx, dx, alpha), lerp(cy, dy, alpha), lerp(cz, dz, alpha)),
+        }
+    }
+}
diff --git a/src/float4.rs b/src/float4.rs
index de7c1e7..ad4f7da 100644
--- a/src/float4.rs
+++ b/src/float4.rs
@@ -1,11 +1,12 @@
 #![allow(dead_code)]
 
 use std::cmp::PartialEq;
-use std::ops::{Add, Sub, Mul, Div};
+use std::ops::{Add, Sub, Mul, Div, BitAnd};
 
 #[cfg(feature = "simd_perf")]
-use simd::f32x4;
+use simd::{f32x4, bool32fx4};
 
+use lerp::Lerp;
 
 /// Essentially a tuple of four floats, which will use SIMD operations
 /// where possible on a platform.
@@ -133,6 +134,62 @@ impl Float4 {
                     })
     }
 
+    /// Element-wise `self < other`, using the SIMD comparison of the underlying vector.
+    #[cfg(feature = "simd_perf")]
+    pub fn lt(&self, other: Float4) -> Bool4 {
+        Bool4 { data: self.data.lt(other.data) }
+    }
+    /// Element-wise `self < other`, comparing the four elements one at a time.
+    ///
+    /// This is the fallback used when the `simd_perf` feature is disabled.
+    #[cfg(not(feature = "simd_perf"))]
+    pub fn lt(&self, other: Float4) -> Bool4 {
+        let (a, b) = (&self.data, &other.data);
+        Bool4 { data: [a[0] < b[0], a[1] < b[1], a[2] < b[2], a[3] < b[3]] }
+    }
+
+    /// Element-wise `self <= other`, using the SIMD comparison of the underlying vector.
+    // The `simd` crate names this comparison `le`; `f32x4` has no `lte` method.
+    #[cfg(feature = "simd_perf")]
+    pub fn lte(&self, other: Float4) -> Bool4 {
+        Bool4 { data: self.data.le(other.data) }
+    }
+    /// Element-wise `self <= other`, comparing the four elements one at a time.
+    /// This is the fallback used when the `simd_perf` feature is disabled.
+    #[cfg(not(feature = "simd_perf"))]
+    pub fn lte(&self, other: Float4) -> Bool4 {
+        let (a, b) = (&self.data, &other.data);
+        Bool4 { data: [a[0] <= b[0], a[1] <= b[1], a[2] <= b[2], a[3] <= b[3]] }
+    }
+
+    /// Element-wise `self > other`, using the SIMD comparison of the underlying vector.
+    #[cfg(feature = "simd_perf")]
+    pub fn gt(&self, other: Float4) -> Bool4 {
+        Bool4 { data: self.data.gt(other.data) }
+    }
+    /// Element-wise `self > other`, comparing the four elements one at a time.
+    ///
+    /// This is the fallback used when the `simd_perf` feature is disabled.
+    #[cfg(not(feature = "simd_perf"))]
+    pub fn gt(&self, other: Float4) -> Bool4 {
+        let (a, b) = (&self.data, &other.data);
+        Bool4 { data: [a[0] > b[0], a[1] > b[1], a[2] > b[2], a[3] > b[3]] }
+    }
+
+    /// Element-wise `self >= other`, using the SIMD comparison of the underlying vector.
+    // The `simd` crate names this comparison `ge`; `f32x4` has no `gte` method.
+    #[cfg(feature = "simd_perf")]
+    pub fn gte(&self, other: Float4) -> Bool4 {
+        Bool4 { data: self.data.ge(other.data) }
+    }
+    /// Element-wise `self >= other`, comparing the four elements one at a time.
+    /// This is the fallback used when the `simd_perf` feature is disabled.
+    #[cfg(not(feature = "simd_perf"))]
+    pub fn gte(&self, other: Float4) -> Bool4 {
+        let (a, b) = (&self.data, &other.data);
+        Bool4 { data: [a[0] >= b[0], a[1] >= b[1], a[2] >= b[2], a[3] >= b[3]] }
+    }
+
     /// Set the nth element to the given value.
     #[inline]
     pub fn set_n(&mut self, n: usize, v: f32) {
@@ -382,6 +439,110 @@ impl Div<f32> for Float4 {
     }
 }
 
+/// Linear interpolation of two `Float4`s: `self` weighted by `1.0 - alpha` plus `other`
+/// weighted by `alpha`.
+impl Lerp for Float4 {
+    fn lerp(self, other: Float4, alpha: f32) -> Float4 { self * (1.0 - alpha) + other * alpha }
+}
+
+/// Free-function form of `Float4::v_min`, convenient when nesting calls such as
+/// `v_min(v_min(a, b), c)`.
+#[inline(always)]
+pub fn v_min(a: Float4, b: Float4) -> Float4 { a.v_min(b) }
+
+/// Free-function form of `Float4::v_max`, convenient when nesting calls such as
+/// `v_max(v_max(a, b), c)`.
+#[inline(always)]
+pub fn v_max(a: Float4, b: Float4) -> Float4 { a.v_max(b) }
+
+
+/// Essentially a tuple of four bools, which will use SIMD operations
+/// where possible on a platform.
+#[cfg(feature = "simd_perf")]
+#[derive(Debug, Copy, Clone)]
+pub struct Bool4 {
+    data: bool32fx4,
+}
+
+/// Essentially a tuple of four bools. This is the plain-array representation
+/// used when the `simd_perf` feature is disabled.
+#[cfg(not(feature = "simd_perf"))]
+#[derive(Debug, Copy, Clone)]
+pub struct Bool4 {
+    data: [bool; 4],
+}
+
+impl Bool4 {
+    /// Returns the value of element 0.
+    #[cfg(feature = "simd_perf")]
+    #[inline(always)]
+    pub fn get_0(&self) -> bool {
+        self.data.extract(0)
+    }
+    /// Returns the value of element 0.
+    #[cfg(not(feature = "simd_perf"))]
+    #[inline(always)]
+    pub fn get_0(&self) -> bool { self.data[0] }
+
+    /// Returns the value of element 1.
+    #[cfg(feature = "simd_perf")]
+    #[inline(always)]
+    pub fn get_1(&self) -> bool {
+        self.data.extract(1)
+    }
+    /// Returns the value of element 1.
+    #[cfg(not(feature = "simd_perf"))]
+    #[inline(always)]
+    pub fn get_1(&self) -> bool { self.data[1] }
+
+    /// Returns the value of element 2.
+    #[cfg(feature = "simd_perf")]
+    #[inline(always)]
+    pub fn get_2(&self) -> bool {
+        self.data.extract(2)
+    }
+    /// Returns the value of element 2.
+    #[cfg(not(feature = "simd_perf"))]
+    #[inline(always)]
+    pub fn get_2(&self) -> bool { self.data[2] }
+
+    /// Returns the value of element 3.
+    #[cfg(feature = "simd_perf")]
+    #[inline(always)]
+    pub fn get_3(&self) -> bool {
+        self.data.extract(3)
+    }
+    /// Returns the value of element 3.
+    #[cfg(not(feature = "simd_perf"))]
+    #[inline(always)]
+    pub fn get_3(&self) -> bool { self.data[3] }
+
+    /// Packs the four elements into the low bits of a `u8`: bit `n` of the result is set
+    /// exactly when element `n` is true, so all-true yields `0b1111`.
+    pub fn to_bitmask(&self) -> u8 {
+        (self.get_0() as u8) | ((self.get_1() as u8) << 1) | ((self.get_2() as u8) << 2) |
+        ((self.get_3() as u8) << 3)
+    }
+}
+
+/// Element-wise logical AND of two `Bool4`s.
+impl BitAnd for Bool4 {
+    type Output = Bool4;
+
+    /// SIMD path: applies `&` to the two underlying vectors.
+    #[cfg(feature = "simd_perf")]
+    fn bitand(self, rhs: Bool4) -> Bool4 {
+        Bool4 { data: self.data & rhs.data }
+    }
+
+    /// Fallback path: combines the four elements of each operand pairwise with `&&`.
+    #[cfg(not(feature = "simd_perf"))]
+    fn bitand(self, rhs: Bool4) -> Bool4 {
+        let (a, b) = (self.data, rhs.data);
+        Bool4 { data: [a[0] && b[0], a[1] && b[1], a[2] && b[2], a[3] && b[3]] }
+    }
+}
+
 
 #[cfg(test)]
 mod tests {
diff --git a/src/main.rs b/src/main.rs
index 14a8a1c..6db9bb1 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -21,6 +21,7 @@ extern crate simd;
 mod accel;
 mod algorithm;
 mod bbox;
+mod bbox4;
 mod bitstack;
 mod boundable;
 mod camera;