diff --git a/src/accel/bvh4_simd.rs b/src/accel/bvh4_simd.rs
new file mode 100644
index 0000000..06a9f15
--- /dev/null
+++ b/src/accel/bvh4_simd.rs
@@ -0,0 +1,387 @@
+//! This BVH4 implementation pulls a lot of ideas from the paper
+//! "Efficient Ray Tracing Kernels for Modern CPU Architectures"
+//! by Fuetterling et al.
+//!
+//! Specifically, the table-based traversal order approach they
+//! propose is largely followed by this implementation.
+
+#![allow(dead_code)]
+
+use mem_arena::MemArena;
+
+use crate::{
+    bbox::BBox,
+    bbox4::BBox4,
+    boundable::Boundable,
+    lerp::lerp_slice,
+    math::Vector,
+    ray::{RayBatch, RayStack},
+    timer::Timer,
+};
+
+use super::{
+    bvh_base::{BVHBase, BVHBaseNode, BVH_MAX_DEPTH},
+    ACCEL_NODE_RAY_TESTS, ACCEL_TRAV_TIME,
+};
+
+use bvh_order::{calc_traversal_code, SplitAxes, TRAVERSAL_TABLE};
+
+pub fn ray_code(dir: Vector) -> usize {
+    let ray_sign_is_neg = [dir.x() < 0.0, dir.y() < 0.0, dir.z() < 0.0];
+    ray_sign_is_neg[0] as usize
+        + ((ray_sign_is_neg[1] as usize) << 1)
+        + ((ray_sign_is_neg[2] as usize) << 2)
+}
+
+#[derive(Copy, Clone, Debug)]
+pub struct BVH4<'a> {
+    root: Option<&'a BVH4Node<'a>>,
+    depth: usize,
+    node_count: usize,
+    _bounds: Option<&'a [BBox]>,
+}
+
+#[derive(Copy, Clone, Debug)]
+pub enum BVH4Node<'a> {
+    Internal {
+        bounds: &'a [BBox4],
+        children: &'a [BVH4Node<'a>],
+        traversal_code: u8,
+    },
+
+    Leaf {
+        object_range: (usize, usize),
+    },
+}
+
+impl<'a> BVH4<'a> {
+    pub fn from_objects<'b, T, F>(
+        arena: &'a MemArena,
+        objects: &mut [T],
+        objects_per_leaf: usize,
+        bounder: F,
+    ) -> BVH4<'a>
+    where
+        F: 'b + Fn(&T) -> &'b [BBox],
+    {
+        if objects.len() == 0 {
+            BVH4 {
+                root: None,
+                depth: 0,
+                node_count: 0,
+                _bounds: None,
+            }
+        } else {
+            let base = BVHBase::from_objects(objects, objects_per_leaf, bounder);
+
+            let fill_node = unsafe { arena.alloc_uninitialized_with_alignment::<BVH4Node>(32) };
+            let node_count = BVH4::construct_from_base(
+                arena,
+                &base,
+                &base.nodes[base.root_node_index()],
+                fill_node,
+            );
+
+            BVH4 {
+                root: Some(fill_node),
+                depth: (base.depth / 2) + 1,
+                node_count: node_count,
+                _bounds: {
+                    let range = base.nodes[base.root_node_index()].bounds_range();
+                    Some(arena.copy_slice(&base.bounds[range.0..range.1]))
+                },
+            }
+        }
+    }
+
+    pub fn tree_depth(&self) -> usize {
+        self.depth
+    }
+
+    pub fn traverse<T, F>(
+        &self,
+        rays: &mut RayBatch,
+        ray_stack: &mut RayStack,
+        objects: &[T],
+        mut obj_ray_test: F,
+    ) where
+        F: FnMut(&T, &mut RayBatch, &mut RayStack),
+    {
+        if self.root.is_none() {
+            return;
+        }
+
+        let mut trav_time: f64 = 0.0;
+        let mut timer = Timer::new();
+
+        let traversal_table =
+            &TRAVERSAL_TABLE[ray_code(rays.dir_inv_local(ray_stack.next_task_ray_idx(0)))];
+
+        // +2 of max depth for root and last child
+        let mut node_stack = [self.root.unwrap(); (BVH_MAX_DEPTH * 3) + 2];
+        let mut stack_ptr = 1;
+
+        while stack_ptr > 0 {
+            match node_stack[stack_ptr] {
+                &BVH4Node::Internal {
+                    bounds,
+                    children,
+                    traversal_code,
+                } => {
+                    let mut all_hits = 0;
+
+                    // Ray testing
+                    ray_stack.pop_do_next_task(children.len(), |ray_idx| {
+                        if rays.is_done(ray_idx) {
+                            ([0, 1, 2, 3, 4, 5, 6, 7], 0)
+                        } else {
+                            let hits = lerp_slice(bounds, rays.time(ray_idx))
+                                .intersect_ray(
+                                    rays.orig_local(ray_idx),
+                                    rays.dir_inv_local(ray_idx),
+                                    rays.max_t(ray_idx),
+                                )
+                                .to_bitmask();
+
+                            if hits != 0 {
+                                all_hits |= hits;
+                                let mut lanes = [0u8; 8];
+                                let mut lane_count = 0;
+                                for i in 0..children.len() {
+                                    if (hits >> i) & 1 != 0 {
+                                        lanes[lane_count] = i as u8;
+                                        lane_count += 1;
+                                    }
+                                }
+                                (lanes, lane_count)
+                            } else {
+                                ([0, 1, 2, 3, 4, 5, 6, 7], 0)
+                            }
+                        }
+                    });
+
+                    // If there were any intersections, create tasks.
+                    if all_hits > 0 {
+                        let order_code = traversal_table[traversal_code as usize];
+                        let mut lanes = [0usize; 4];
+                        let mut lane_count = 0;
+                        for i in 0..children.len() {
+                            let inv_i = (children.len() - 1) - i;
+                            let child_i = ((order_code >> (inv_i * 2)) & 3) as usize;
+                            if ((all_hits >> child_i) & 1) != 0 {
+                                node_stack[stack_ptr + lane_count] = &children[child_i];
+                                lanes[lane_count] = child_i;
+                                lane_count += 1;
+                            }
+                        }
+
+                        ray_stack.push_lanes_to_tasks(&lanes[..lane_count]);
+                        stack_ptr += lane_count - 1;
+                    } else {
+                        stack_ptr -= 1;
+                    }
+                }
+
+                &BVH4Node::Leaf { object_range } => {
+                    trav_time += timer.tick() as f64;
+
+                    // Set up the tasks for each object.
+                    let obj_count = object_range.1 - object_range.0;
+                    for _ in 0..(obj_count - 1) {
+                        ray_stack.duplicate_next_task();
+                    }
+
+                    // Do the ray tests.
+                    for obj in &objects[object_range.0..object_range.1] {
+                        obj_ray_test(obj, rays, ray_stack);
+                    }
+
+                    timer.tick();
+
+                    stack_ptr -= 1;
+                }
+            }
+        }
+
+        trav_time += timer.tick() as f64;
+        ACCEL_TRAV_TIME.with(|att| {
+            let v = att.get();
+            att.set(v + trav_time);
+        });
+    }
+
+    fn construct_from_base(
+        arena: &'a MemArena,
+        base: &BVHBase,
+        node: &BVHBaseNode,
+        fill_node: &mut BVH4Node<'a>,
+    ) -> usize {
+        let mut node_count = 0;
+
+        match node {
+            // Create internal node
+            &BVHBaseNode::Internal {
+                bounds_range: _,
+                children_indices,
+                split_axis,
+            } => {
+                let child_l = &base.nodes[children_indices.0];
+                let child_r = &base.nodes[children_indices.1];
+
+                // Prepare convenient access to the stuff we need.
+                let child_count: usize;
+                let children; // [Optional, Optional, Optional, Optional]
+                let split_info: SplitAxes;
+                match *child_l {
+                    BVHBaseNode::Internal {
+                        children_indices: i_l,
+                        split_axis: s_l,
+                        ..
+                    } => {
+                        match *child_r {
+                            BVHBaseNode::Internal {
+                                children_indices: i_r,
+                                split_axis: s_r,
+                                ..
+                            } => {
+                                // Four nodes
+                                child_count = 4;
+                                children = [
+                                    Some(&base.nodes[i_l.0]),
+                                    Some(&base.nodes[i_l.1]),
+                                    Some(&base.nodes[i_r.0]),
+                                    Some(&base.nodes[i_r.1]),
+                                ];
+                                split_info = SplitAxes::Full((split_axis, s_l, s_r));
+                            }
+                            BVHBaseNode::Leaf { .. } => {
+                                // Three nodes with left split
+                                child_count = 3;
+                                children = [
+                                    Some(&base.nodes[i_l.0]),
+                                    Some(&base.nodes[i_l.1]),
+                                    Some(child_r),
+                                    None,
+                                ];
+                                split_info = SplitAxes::Left((split_axis, s_l));
+                            }
+                        }
+                    }
+                    BVHBaseNode::Leaf { .. } => {
+                        match *child_r {
+                            BVHBaseNode::Internal {
+                                children_indices: i_r,
+                                split_axis: s_r,
+                                ..
+                            } => {
+                                // Three nodes with right split
+                                child_count = 3;
+                                children = [
+                                    Some(child_l),
+                                    Some(&base.nodes[i_r.0]),
+                                    Some(&base.nodes[i_r.1]),
+                                    None,
+                                ];
+                                split_info = SplitAxes::Right((split_axis, s_r));
+                            }
+                            BVHBaseNode::Leaf { .. } => {
+                                // Two nodes
+                                child_count = 2;
+                                children = [Some(child_l), Some(child_r), None, None];
+                                split_info = SplitAxes::TopOnly(split_axis);
+                            }
+                        }
+                    }
+                }
+
+                node_count += child_count;
+
+                // Construct bounds
+                let bounds = {
+                    let bounds_len = children
+                        .iter()
+                        .map(|c| {
+                            if let &Some(n) = c {
+                                let len = n.bounds_range().1 - n.bounds_range().0;
+                                debug_assert!(len >= 1);
+                                len
+                            } else {
+                                0
+                            }
+                        })
+                        .max()
+                        .unwrap();
+                    debug_assert!(bounds_len >= 1);
+                    let bounds =
+                        unsafe { arena.alloc_array_uninitialized_with_alignment(bounds_len, 32) };
+                    if bounds_len < 2 {
+                        let b1 =
+                            children[0].map_or(BBox::new(), |c| base.bounds[c.bounds_range().0]);
+                        let b2 =
+                            children[1].map_or(BBox::new(), |c| base.bounds[c.bounds_range().0]);
+                        let b3 =
+                            children[2].map_or(BBox::new(), |c| base.bounds[c.bounds_range().0]);
+                        let b4 =
+                            children[3].map_or(BBox::new(), |c| base.bounds[c.bounds_range().0]);
+                        bounds[0] = BBox4::from_bboxes(b1, b2, b3, b4);
+                    } else {
+                        for (i, b) in bounds.iter_mut().enumerate() {
+                            let time = i as f32 / (bounds_len - 1) as f32;
+
+                            let b1 = children[0].map_or(BBox::new(), |c| {
+                                let (x, y) = c.bounds_range();
+                                lerp_slice(&base.bounds[x..y], time)
+                            });
+                            let b2 = children[1].map_or(BBox::new(), |c| {
+                                let (x, y) = c.bounds_range();
+                                lerp_slice(&base.bounds[x..y], time)
+                            });
+                            let b3 = children[2].map_or(BBox::new(), |c| {
+                                let (x, y) = c.bounds_range();
+                                lerp_slice(&base.bounds[x..y], time)
+                            });
+                            let b4 = children[3].map_or(BBox::new(), |c| {
+                                let (x, y) = c.bounds_range();
+                                lerp_slice(&base.bounds[x..y], time)
+                            });
+                            *b = BBox4::from_bboxes(b1, b2, b3, b4);
+                        }
+                    }
+                    bounds
+                };
+
+                // Construct child nodes
+                let child_nodes = unsafe {
+                    arena.alloc_array_uninitialized_with_alignment::<BVH4Node>(child_count, 32)
+                };
+                for (i, c) in children[0..child_count].iter().enumerate() {
+                    node_count +=
+                        BVH4::construct_from_base(arena, base, c.unwrap(), &mut child_nodes[i]);
+                }
+
+                // Build this node
+                *fill_node = BVH4Node::Internal {
+                    bounds: bounds,
+                    children: child_nodes,
+                    traversal_code: calc_traversal_code(split_info),
+                };
+            }
+
+            // Create internal node
+            &BVHBaseNode::Leaf { object_range, .. } => {
+                *fill_node = BVH4Node::Leaf {
+                    object_range: object_range,
+                };
+                node_count += 1;
+            }
+        }
+
+        return node_count;
+    }
+}
+
+impl<'a> Boundable for BVH4<'a> {
+    fn bounds<'b>(&'b self) -> &'b [BBox] {
+        self._bounds.unwrap_or(&[])
+    }
+}
diff --git a/src/accel/mod.rs b/src/accel/mod.rs
index abbb1d4..1bac6d7 100644
--- a/src/accel/mod.rs
+++ b/src/accel/mod.rs
@@ -1,5 +1,6 @@
 // mod bvh;
 mod bvh4;
+mod bvh4_simd;
 mod bvh_base;
 mod light_array;
 mod light_tree;
@@ -14,7 +15,7 @@ use crate::{
 
 pub use self::{
     // bvh::{BVHNode, BVH},
-    bvh4::{ray_code, BVH4Node, BVH4},
+    bvh4_simd::{ray_code, BVH4Node, BVH4},
     light_array::LightArray,
     light_tree::LightTree,
 };
diff --git a/src/bbox4.rs b/src/bbox4.rs
new file mode 100644
index 0000000..71793a4
--- /dev/null
+++ b/src/bbox4.rs
@@ -0,0 +1,139 @@
+#![allow(dead_code)]
+
+use std;
+use std::ops::{BitOr, BitOrAssign};
+
+use crate::{
+    bbox::BBox,
+    lerp::{lerp, Lerp},
+    math::{Point, Vector},
+};
+
+use float4::{Bool4, Float4};
+
+const BBOX_MAXT_ADJUST: f32 = 1.00000024;
+
+/// A SIMD set of 4 3D axis-aligned bounding boxes.
+#[derive(Debug, Copy, Clone)]
+pub struct BBox4 {
+    pub x: (Float4, Float4), // (min, max)
+    pub y: (Float4, Float4), // (min, max)
+    pub z: (Float4, Float4), // (min, max)
+}
+
+impl BBox4 {
+    /// Creates a degenerate BBox with +infinity min and -infinity max.
+    pub fn new() -> BBox4 {
+        BBox4 {
+            x: (
+                Float4::splat(std::f32::INFINITY),
+                Float4::splat(std::f32::NEG_INFINITY),
+            ),
+            y: (
+                Float4::splat(std::f32::INFINITY),
+                Float4::splat(std::f32::NEG_INFINITY),
+            ),
+            z: (
+                Float4::splat(std::f32::INFINITY),
+                Float4::splat(std::f32::NEG_INFINITY),
+            ),
+        }
+    }
+
+    /// Creates a BBox with min as the minimum extent and max as the maximum
+    /// extent.
+    pub fn from_bboxes(b1: BBox, b2: BBox, b3: BBox, b4: BBox) -> BBox4 {
+        BBox4 {
+            x: (
+                Float4::new(b1.min.x(), b2.min.x(), b3.min.x(), b4.min.x()),
+                Float4::new(b1.max.x(), b2.max.x(), b3.max.x(), b4.max.x()),
+            ),
+            y: (
+                Float4::new(b1.min.y(), b2.min.y(), b3.min.y(), b4.min.y()),
+                Float4::new(b1.max.y(), b2.max.y(), b3.max.y(), b4.max.y()),
+            ),
+            z: (
+                Float4::new(b1.min.z(), b2.min.z(), b3.min.z(), b4.min.z()),
+                Float4::new(b1.max.z(), b2.max.z(), b3.max.z(), b4.max.z()),
+            ),
+        }
+    }
+
+    // Returns whether the given ray intersects with the bboxes.
+    pub fn intersect_ray(&self, orig: Point, dir_inv: Vector, max_t: f32) -> Bool4 {
+        // Get the ray data into SIMD format.
+        let ro_x = orig.co.all_0();
+        let ro_y = orig.co.all_1();
+        let ro_z = orig.co.all_2();
+        let rdi_x = dir_inv.co.all_0();
+        let rdi_y = dir_inv.co.all_1();
+        let rdi_z = dir_inv.co.all_2();
+        let max_t = Float4::splat(max_t);
+
+        // Slab tests
+        let t1_x = (self.x.0 - ro_x) * rdi_x;
+        let t1_y = (self.y.0 - ro_y) * rdi_y;
+        let t1_z = (self.z.0 - ro_z) * rdi_z;
+        let t2_x = (self.x.1 - ro_x) * rdi_x;
+        let t2_y = (self.y.1 - ro_y) * rdi_y;
+        let t2_z = (self.z.1 - ro_z) * rdi_z;
+
+        // Get the far and near t hits for each axis.
+        let t_far_x = t1_x.v_max(t2_x);
+        let t_far_y = t1_y.v_max(t2_y);
+        let t_far_z = t1_z.v_max(t2_z);
+        let t_near_x = t1_x.v_min(t2_x);
+        let t_near_y = t1_y.v_min(t2_y);
+        let t_near_z = t1_z.v_min(t2_z);
+
+        // Calculate over-all far t hit.
+        let far_t =
+            (t_far_x.v_min(t_far_y.v_min(t_far_z)) * Float4::splat(BBOX_MAXT_ADJUST)).v_min(max_t);
+
+        // Calculate over-all near t hit.
+        let near_t = t_near_x
+            .v_max(t_near_y)
+            .v_max(t_near_z.v_max(Float4::splat(0.0)));
+
+        // Hit results
+        near_t.lt(far_t)
+    }
+}
+
+/// Union of two BBoxes.
+impl BitOr for BBox4 {
+    type Output = BBox4;
+
+    fn bitor(self, rhs: BBox4) -> BBox4 {
+        BBox4 {
+            x: (self.x.0.v_min(rhs.x.0), self.x.1.v_max(rhs.x.1)),
+            y: (self.y.0.v_min(rhs.y.0), self.y.1.v_max(rhs.y.1)),
+            z: (self.z.0.v_min(rhs.z.0), self.z.1.v_max(rhs.z.1)),
+        }
+    }
+}
+
+impl BitOrAssign for BBox4 {
+    fn bitor_assign(&mut self, rhs: BBox4) {
+        *self = *self | rhs;
+    }
+}
+
+impl Lerp for BBox4 {
+    fn lerp(self, other: BBox4, alpha: f32) -> BBox4 {
+        BBox4 {
+            x: (
+                lerp(self.x.0, other.x.0, alpha),
+                lerp(self.x.1, other.x.1, alpha),
+            ),
+            y: (
+                lerp(self.y.0, other.y.0, alpha),
+                lerp(self.y.1, other.y.1, alpha),
+            ),
+            z: (
+                lerp(self.z.0, other.z.0, alpha),
+                lerp(self.z.1, other.z.1, alpha),
+            ),
+        }
+    }
+}
diff --git a/src/main.rs b/src/main.rs
index bd5cf51..bd18195 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -17,6 +17,7 @@ extern crate lazy_static;
 mod accel;
 mod algorithm;
 mod bbox;
+mod bbox4;
 mod boundable;
 mod camera;
 mod color;
diff --git a/src/ray.rs b/src/ray.rs
index 3b485f0..852f5c9 100644
--- a/src/ray.rs
+++ b/src/ray.rs
@@ -273,6 +273,25 @@ impl RayStack {
         }
     }
 
+    pub fn duplicate_next_task(&mut self) {
+        let task = self.tasks.last().unwrap();
+        let l = task.lane;
+        let start = task.start_idx;
+        let end = self.lanes[l].end_len;
+
+        for i in start..end {
+            let idx = self.lanes[l].idxs[i];
+            self.lanes[l].idxs.push(idx);
+        }
+
+        self.tasks.push(RayTask {
+            lane: l,
+            start_idx: end,
+        });
+
+        self.lanes[l].end_len = self.lanes[l].idxs.len();
+    }
+
     /// Pops the next task off the stack, and executes the provided closure for
     /// each ray index in the task.  The return value of the closure is the list
     /// of lanes (by index) to add the given ray index back into.