diff --git a/src/accel/bvh4.rs b/src/accel/bvh4.rs
index 6ee9525..d7e68e1 100644
--- a/src/accel/bvh4.rs
+++ b/src/accel/bvh4.rs
@@ -1,13 +1,17 @@
+//! This BVH4 implementation is based on the ideas from the paper
+//! "Efficient Ray Tracing Kernels for Modern CPU Architectures"
+//! by Fuetterling et al.
+
 #![allow(dead_code)]
 
-use bvh_order::{calc_traversal_code, SplitAxes, TRAVERSAL_TABLE};
-use math3d::Vector;
 use mem_arena::MemArena;
 
 use crate::{
     bbox::BBox,
+    bbox4::BBox4,
     boundable::Boundable,
     lerp::lerp_slice,
+    math::Vector,
     ray::{RayBatch, RayStack},
     timer::Timer,
 };
@@ -17,6 +21,9 @@ use super::{
     ACCEL_NODE_RAY_TESTS, ACCEL_TRAV_TIME,
 };
 
+use bvh_order::{calc_traversal_code, SplitAxes, TRAVERSAL_TABLE};
+use float4::Bool4;
+
 pub fn ray_code(dir: Vector) -> usize {
     let ray_sign_is_neg = [dir.x() < 0.0, dir.y() < 0.0, dir.z() < 0.0];
     ray_sign_is_neg[0] as usize
@@ -28,20 +35,19 @@ pub fn ray_code(dir: Vector) -> usize {
 pub struct BVH4<'a> {
     root: Option<&'a BVH4Node<'a>>,
     depth: usize,
+    node_count: usize,
+    _bounds: Option<&'a [BBox]>,
 }
 
 #[derive(Copy, Clone, Debug)]
 pub enum BVH4Node<'a> {
-    Inner {
-        traversal_code: u8,
-        bounds_start: &'a BBox,
-        bounds_len: u16,
+    Internal {
+        bounds: &'a [BBox4],
         children: &'a [BVH4Node<'a>],
+        traversal_code: u8,
     },
 
     Leaf {
-        bounds_start: &'a BBox,
-        bounds_len: u16,
         object_range: (usize, usize),
     },
 }
@@ -56,19 +62,32 @@ impl<'a> BVH4<'a> {
     where
         F: 'b + Fn(&T) -> &'b [BBox],
     {
-        if objects.is_empty() {
+        if objects.len() == 0 {
             BVH4 {
                 root: None,
                 depth: 0,
+                node_count: 0,
+                _bounds: None,
             }
         } else {
             let base = BVHBase::from_objects(objects, objects_per_leaf, bounder);
 
-            let root = unsafe { arena.alloc_uninitialized::<BVH4Node>() };
-            BVH4::construct_from_base(arena, &base, base.root_node_index(), root);
+            let fill_node = unsafe { arena.alloc_uninitialized_with_alignment::<BVH4Node>(32) };
+            let node_count = BVH4::construct_from_base(
+                arena,
+                &base,
+                &base.nodes[base.root_node_index()],
+                fill_node,
+            );
+
             BVH4 {
-                root: Some(root),
-                depth: base.depth,
+                root: Some(fill_node),
+                depth: (base.depth / 2) + 1,
+                node_count: node_count,
+                _bounds: {
+                    let range = base.nodes[base.root_node_index()].bounds_range();
+                    Some(arena.copy_slice(&base.bounds[range.0..range.1]))
+                },
             }
         }
     }
@@ -103,117 +122,63 @@ impl<'a> BVH4<'a> {
 
         while stack_ptr > 0 {
             node_tests += ray_stack.ray_count_in_next_task() as u64;
-            match *node_stack[stack_ptr] {
-                BVH4Node::Inner {
-                    traversal_code,
-                    bounds_start,
-                    bounds_len,
+            match node_stack[stack_ptr] {
+                &BVH4Node::Internal {
+                    bounds,
                     children,
+                    traversal_code,
                 } => {
-                    // Test rays against bbox.
-                    let bounds =
-                        unsafe { std::slice::from_raw_parts(bounds_start, bounds_len as usize) };
+                    let mut all_hits = Bool4::new_false();
 
-                    let mut hit_count = 0;
-                    ray_stack.pop_do_next_task(children.len(), |ray_idx| {
-                        let hit = (!rays.is_done(ray_idx))
-                            && lerp_slice(bounds, rays.time(ray_idx)).intersect_ray(
+                    // Ray testing
+                    ray_stack.pop_do_next_task_and_push_rays(children.len(), |ray_idx| {
+                        if rays.is_done(ray_idx) {
+                            (Bool4::new_false(), 0)
+                        } else {
+                            let hits = lerp_slice(bounds, rays.time(ray_idx)).intersect_ray(
                                 rays.orig_local(ray_idx),
                                 rays.dir_inv_local(ray_idx),
                                 rays.max_t(ray_idx),
                             );
-
-                        if hit {
-                            hit_count += 1;
-                            ([0, 1, 2, 3], children.len())
-                        } else {
-                            ([0; 4], 0)
+                            all_hits = all_hits | hits;
+                            (hits, children.len())
                         }
                     });
 
                     // If there were any intersections, create tasks.
-                    if hit_count > 0 {
+                    if !all_hits.is_all_false() {
                         let order_code = traversal_table[traversal_code as usize];
-                        match children.len() {
-                            4 => {
-                                let i4 = ((order_code >> 6) & 0b11) as usize;
-                                let i3 = ((order_code >> 4) & 0b11) as usize;
-                                let i2 = ((order_code >> 2) & 0b11) as usize;
-                                let i1 = (order_code & 0b11) as usize;
-
-                                ray_stack.push_lanes_to_tasks(&[i4, i3, i2, i1]);
-
-                                node_stack[stack_ptr] = &children[i4];
-                                node_stack[stack_ptr + 1] = &children[i3];
-                                node_stack[stack_ptr + 2] = &children[i2];
-                                node_stack[stack_ptr + 3] = &children[i1];
-
-                                stack_ptr += 3;
+                        let mut lanes = [0usize; 4];
+                        let mut lane_count = 0;
+                        for i in 0..children.len() {
+                            let inv_i = (children.len() - 1) - i;
+                            let child_i = ((order_code >> (inv_i * 2)) & 3) as usize;
+                            if all_hits.get_n(child_i) {
+                                node_stack[stack_ptr + lane_count] = &children[child_i];
+                                lanes[lane_count] = child_i;
+                                lane_count += 1;
                             }
-                            3 => {
-                                let i3 = ((order_code >> 4) & 0b11) as usize;
-                                let i2 = ((order_code >> 2) & 0b11) as usize;
-                                let i1 = (order_code & 0b11) as usize;
-
-                                ray_stack.push_lanes_to_tasks(&[i3, i2, i1]);
-
-                                node_stack[stack_ptr] = &children[i3];
-                                node_stack[stack_ptr + 1] = &children[i2];
-                                node_stack[stack_ptr + 2] = &children[i1];
-
-                                stack_ptr += 2;
-                            }
-                            2 => {
-                                let i2 = ((order_code >> 2) & 0b11) as usize;
-                                let i1 = (order_code & 0b11) as usize;
-
-                                ray_stack.push_lanes_to_tasks(&[i2, i1]);
-
-                                node_stack[stack_ptr] = &children[i2];
-                                node_stack[stack_ptr + 1] = &children[i1];
-
-                                stack_ptr += 1;
-                            }
-                            _ => unreachable!(),
                         }
+
+                        ray_stack.push_lanes_to_tasks(&lanes[..lane_count]);
+                        stack_ptr += lane_count - 1;
                     } else {
                         stack_ptr -= 1;
                     }
                 }
 
-                BVH4Node::Leaf {
-                    object_range,
-                    bounds_start,
-                    bounds_len,
-                } => {
-                    // Test rays against bounds.
-                    let bounds =
-                        unsafe { std::slice::from_raw_parts(bounds_start, bounds_len as usize) };
-                    let object_count = object_range.1 - object_range.0;
-                    let mut hit_count = 0;
-
-                    ray_stack.pop_do_next_task(object_count, |ray_idx| {
-                        let hit = (!rays.is_done(ray_idx))
-                            && lerp_slice(bounds, rays.time(ray_idx)).intersect_ray(
-                                rays.orig_local(ray_idx),
-                                rays.dir_inv_local(ray_idx),
-                                rays.max_t(ray_idx),
-                            );
-                        if hit {
-                            hit_count += 1;
-                            ([0, 1, 2, 3], object_count)
-                        } else {
-                            ([0; 4], 0)
-                        }
-                    });
-
+                &BVH4Node::Leaf { object_range } => {
                     trav_time += timer.tick() as f64;
 
-                    if hit_count > 0 {
-                        ray_stack.push_lanes_to_tasks(&[0, 1, 2, 3, 4, 5, 6, 7][..object_count]);
-                        for obj in &objects[object_range.0..object_range.1] {
-                            obj_ray_test(obj, rays, ray_stack);
-                        }
+                    // Set up the tasks for each object.
+                    let obj_count = object_range.1 - object_range.0;
+                    for _ in 0..(obj_count - 1) {
+                        ray_stack.duplicate_next_task();
+                    }
+
+                    // Do the ray tests.
+                    for obj in &objects[object_range.0..object_range.1] {
+                        obj_ray_test(obj, rays, ray_stack);
                     }
 
                     timer.tick();
@@ -237,12 +202,15 @@ impl<'a> BVH4<'a> {
     fn construct_from_base(
         arena: &'a MemArena,
         base: &BVHBase,
-        node_index: usize,
-        node_mem: &mut BVH4Node<'a>,
-    ) {
-        match base.nodes[node_index] {
-            BVHBaseNode::Internal {
-                bounds_range,
+        node: &BVHBaseNode,
+        fill_node: &mut BVH4Node<'a>,
+    ) -> usize {
+        let mut node_count = 0;
+
+        match node {
+            // Create internal node
+            &BVHBaseNode::Internal {
+                bounds_range: _,
                 children_indices,
                 split_axis,
             } => {
@@ -251,7 +219,7 @@ impl<'a> BVH4<'a> {
 
                 // Prepare convenient access to the stuff we need.
                 let child_count: usize;
-                let child_indices: [usize; 4];
+                let children; // [Optional, Optional, Optional, Optional]
                 let split_info: SplitAxes;
                 match *child_l {
                     BVHBaseNode::Internal {
@@ -267,13 +235,23 @@ impl<'a> BVH4<'a> {
                             } => {
                                 // Four nodes
                                 child_count = 4;
-                                child_indices = [i_l.0, i_l.1, i_r.0, i_r.1];
+                                children = [
+                                    Some(&base.nodes[i_l.0]),
+                                    Some(&base.nodes[i_l.1]),
+                                    Some(&base.nodes[i_r.0]),
+                                    Some(&base.nodes[i_r.1]),
+                                ];
                                 split_info = SplitAxes::Full((split_axis, s_l, s_r));
                             }
                             BVHBaseNode::Leaf { .. } => {
                                 // Three nodes with left split
                                 child_count = 3;
-                                child_indices = [i_l.0, i_l.1, children_indices.1, 0];
+                                children = [
+                                    Some(&base.nodes[i_l.0]),
+                                    Some(&base.nodes[i_l.1]),
+                                    Some(child_r),
+                                    None,
+                                ];
                                 split_info = SplitAxes::Left((split_axis, s_l));
                             }
                         }
@@ -287,76 +265,112 @@ impl<'a> BVH4<'a> {
                             } => {
                                 // Three nodes with right split
                                 child_count = 3;
-                                child_indices = [children_indices.0, i_r.0, i_r.1, 0];
+                                children = [
+                                    Some(child_l),
+                                    Some(&base.nodes[i_r.0]),
+                                    Some(&base.nodes[i_r.1]),
+                                    None,
+                                ];
                                 split_info = SplitAxes::Right((split_axis, s_r));
                             }
                             BVHBaseNode::Leaf { .. } => {
                                 // Two nodes
                                 child_count = 2;
-                                child_indices = [children_indices.0, children_indices.1, 0, 0];
+                                children = [Some(child_l), Some(child_r), None, None];
                                 split_info = SplitAxes::TopOnly(split_axis);
                             }
                         }
                     }
                 }
 
-                // Copy bounds
-                let bounds = arena
-                    .copy_slice_with_alignment(&base.bounds[bounds_range.0..bounds_range.1], 32);
+                node_count += child_count;
 
-                // Build children
-                let children_mem = unsafe {
+                // Construct bounds
+                let bounds = {
+                    let bounds_len = children
+                        .iter()
+                        .map(|c| {
+                            if let &Some(n) = c {
+                                let len = n.bounds_range().1 - n.bounds_range().0;
+                                debug_assert!(len >= 1);
+                                len
+                            } else {
+                                0
+                            }
+                        })
+                        .max()
+                        .unwrap();
+                    debug_assert!(bounds_len >= 1);
+                    let bounds =
+                        unsafe { arena.alloc_array_uninitialized_with_alignment(bounds_len, 32) };
+                    if bounds_len < 2 {
+                        let b1 =
+                            children[0].map_or(BBox::new(), |c| base.bounds[c.bounds_range().0]);
+                        let b2 =
+                            children[1].map_or(BBox::new(), |c| base.bounds[c.bounds_range().0]);
+                        let b3 =
+                            children[2].map_or(BBox::new(), |c| base.bounds[c.bounds_range().0]);
+                        let b4 =
+                            children[3].map_or(BBox::new(), |c| base.bounds[c.bounds_range().0]);
+                        bounds[0] = BBox4::from_bboxes(b1, b2, b3, b4);
+                    } else {
+                        for (i, b) in bounds.iter_mut().enumerate() {
+                            let time = i as f32 / (bounds_len - 1) as f32;
+
+                            let b1 = children[0].map_or(BBox::new(), |c| {
+                                let (x, y) = c.bounds_range();
+                                lerp_slice(&base.bounds[x..y], time)
+                            });
+                            let b2 = children[1].map_or(BBox::new(), |c| {
+                                let (x, y) = c.bounds_range();
+                                lerp_slice(&base.bounds[x..y], time)
+                            });
+                            let b3 = children[2].map_or(BBox::new(), |c| {
+                                let (x, y) = c.bounds_range();
+                                lerp_slice(&base.bounds[x..y], time)
+                            });
+                            let b4 = children[3].map_or(BBox::new(), |c| {
+                                let (x, y) = c.bounds_range();
+                                lerp_slice(&base.bounds[x..y], time)
+                            });
+                            *b = BBox4::from_bboxes(b1, b2, b3, b4);
+                        }
+                    }
+                    bounds
+                };
+
+                // Construct child nodes
+                let child_nodes = unsafe {
                     arena.alloc_array_uninitialized_with_alignment::<BVH4Node>(child_count, 32)
                 };
-                for i in 0..child_count {
-                    BVH4::construct_from_base(arena, base, child_indices[i], &mut children_mem[i]);
+                for (i, c) in children[0..child_count].iter().enumerate() {
+                    node_count +=
+                        BVH4::construct_from_base(arena, base, c.unwrap(), &mut child_nodes[i]);
                 }
 
-                // Fill in node
-                *node_mem = BVH4Node::Inner {
+                // Build this node
+                *fill_node = BVH4Node::Internal {
+                    bounds: bounds,
+                    children: child_nodes,
                     traversal_code: calc_traversal_code(split_info),
-                    bounds_start: &bounds[0],
-                    bounds_len: bounds.len() as u16,
-                    children: children_mem,
                 };
             }
 
-            BVHBaseNode::Leaf {
-                bounds_range,
-                object_range,
-            } => {
-                let bounds = arena.copy_slice(&base.bounds[bounds_range.0..bounds_range.1]);
-
-                *node_mem = BVH4Node::Leaf {
-                    bounds_start: &bounds[0],
-                    bounds_len: bounds.len() as u16,
+            // Create leaf node
+            &BVHBaseNode::Leaf { object_range, .. } => {
+                *fill_node = BVH4Node::Leaf {
                     object_range: object_range,
                 };
+                node_count += 1;
             }
         }
-    }
-}
 
-lazy_static! {
-    static ref DEGENERATE_BOUNDS: [BBox; 1] = [BBox::new()];
+        return node_count;
+    }
 }
 
 impl<'a> Boundable for BVH4<'a> {
-    fn bounds(&self) -> &[BBox] {
-        match self.root {
-            None => &DEGENERATE_BOUNDS[..],
-            Some(root) => match *root {
-                BVH4Node::Inner {
-                    bounds_start,
-                    bounds_len,
-                    ..
-                }
-                | BVH4Node::Leaf {
-                    bounds_start,
-                    bounds_len,
-                    ..
-                } => unsafe { std::slice::from_raw_parts(bounds_start, bounds_len as usize) },
-            },
-        }
+    fn bounds<'b>(&'b self) -> &'b [BBox] {
+        self._bounds.unwrap_or(&[])
     }
 }
diff --git a/src/accel/bvh4_simd.rs b/src/accel/bvh4_simd.rs
deleted file mode 100644
index 2ad0848..0000000
--- a/src/accel/bvh4_simd.rs
+++ /dev/null
@@ -1,386 +0,0 @@
-//! This BVH4 implementation pulls a lot of ideas from the paper
-//! "Efficient Ray Tracing Kernels for Modern CPU Architectures"
-//! by Fuetterling et al.
-//!
-//! Specifically, the table-based traversal order approach they
-//! propose is largely followed by this implementation.
-
-#![allow(dead_code)]
-
-use mem_arena::MemArena;
-
-use crate::{
-    bbox::BBox,
-    bbox4::BBox4,
-    boundable::Boundable,
-    lerp::lerp_slice,
-    math::Vector,
-    ray::{RayBatch, RayStack},
-    timer::Timer,
-};
-
-use super::{
-    bvh_base::{BVHBase, BVHBaseNode, BVH_MAX_DEPTH},
-    ACCEL_NODE_RAY_TESTS, ACCEL_TRAV_TIME,
-};
-
-use bvh_order::{calc_traversal_code, SplitAxes, TRAVERSAL_TABLE};
-use float4::Bool4;
-
-pub fn ray_code(dir: Vector) -> usize {
-    let ray_sign_is_neg = [dir.x() < 0.0, dir.y() < 0.0, dir.z() < 0.0];
-    ray_sign_is_neg[0] as usize
-        + ((ray_sign_is_neg[1] as usize) << 1)
-        + ((ray_sign_is_neg[2] as usize) << 2)
-}
-
-#[derive(Copy, Clone, Debug)]
-pub struct BVH4<'a> {
-    root: Option<&'a BVH4Node<'a>>,
-    depth: usize,
-    node_count: usize,
-    _bounds: Option<&'a [BBox]>,
-}
-
-#[derive(Copy, Clone, Debug)]
-pub enum BVH4Node<'a> {
-    Internal {
-        bounds: &'a [BBox4],
-        children: &'a [BVH4Node<'a>],
-        traversal_code: u8,
-    },
-
-    Leaf {
-        object_range: (usize, usize),
-    },
-}
-
-impl<'a> BVH4<'a> {
-    pub fn from_objects<'b, T, F>(
-        arena: &'a MemArena,
-        objects: &mut [T],
-        objects_per_leaf: usize,
-        bounder: F,
-    ) -> BVH4<'a>
-    where
-        F: 'b + Fn(&T) -> &'b [BBox],
-    {
-        if objects.len() == 0 {
-            BVH4 {
-                root: None,
-                depth: 0,
-                node_count: 0,
-                _bounds: None,
-            }
-        } else {
-            let base = BVHBase::from_objects(objects, objects_per_leaf, bounder);
-
-            let fill_node = unsafe { arena.alloc_uninitialized_with_alignment::<BVH4Node>(32) };
-            let node_count = BVH4::construct_from_base(
-                arena,
-                &base,
-                &base.nodes[base.root_node_index()],
-                fill_node,
-            );
-
-            BVH4 {
-                root: Some(fill_node),
-                depth: (base.depth / 2) + 1,
-                node_count: node_count,
-                _bounds: {
-                    let range = base.nodes[base.root_node_index()].bounds_range();
-                    Some(arena.copy_slice(&base.bounds[range.0..range.1]))
-                },
-            }
-        }
-    }
-
-    pub fn tree_depth(&self) -> usize {
-        self.depth
-    }
-
-    pub fn traverse<T, F>(
-        &self,
-        rays: &mut RayBatch,
-        ray_stack: &mut RayStack,
-        objects: &[T],
-        mut obj_ray_test: F,
-    ) where
-        F: FnMut(&T, &mut RayBatch, &mut RayStack),
-    {
-        if self.root.is_none() {
-            return;
-        }
-
-        let mut trav_time: f64 = 0.0;
-        let mut timer = Timer::new();
-
-        let traversal_table =
-            &TRAVERSAL_TABLE[ray_code(rays.dir_inv_local(ray_stack.next_task_ray_idx(0)))];
-
-        // +2 of max depth for root and last child
-        let mut node_stack = [self.root.unwrap(); (BVH_MAX_DEPTH * 3) + 2];
-        let mut stack_ptr = 1;
-
-        while stack_ptr > 0 {
-            match node_stack[stack_ptr] {
-                &BVH4Node::Internal {
-                    bounds,
-                    children,
-                    traversal_code,
-                } => {
-                    let mut all_hits = Bool4::new();
-
-                    // Ray testing
-                    ray_stack.pop_do_next_task(children.len(), |ray_idx| {
-                        if rays.is_done(ray_idx) {
-                            ([0; 4], 0)
-                        } else {
-                            let hits = lerp_slice(bounds, rays.time(ray_idx)).intersect_ray(
-                                rays.orig_local(ray_idx),
-                                rays.dir_inv_local(ray_idx),
-                                rays.max_t(ray_idx),
-                            );
-
-                            if !hits.all_false() {
-                                all_hits = all_hits | hits;
-                                let mut lanes = [0u8; 4];
-                                let mut lane_count = 0;
-                                for i in 0..children.len() {
-                                    if hits.get_n(i) {
-                                        lanes[lane_count] = i as u8;
-                                        lane_count += 1;
-                                    }
-                                }
-                                (lanes, lane_count)
-                            } else {
-                                ([0; 4], 0)
-                            }
-                        }
-                    });
-
-                    // If there were any intersections, create tasks.
-                    if !all_hits.all_false() {
-                        let order_code = traversal_table[traversal_code as usize];
-                        let mut lanes = [0usize; 4];
-                        let mut lane_count = 0;
-                        for i in 0..children.len() {
-                            let inv_i = (children.len() - 1) - i;
-                            let child_i = ((order_code >> (inv_i * 2)) & 3) as usize;
-                            if all_hits.get_n(child_i) {
-                                node_stack[stack_ptr + lane_count] = &children[child_i];
-                                lanes[lane_count] = child_i;
-                                lane_count += 1;
-                            }
-                        }
-
-                        ray_stack.push_lanes_to_tasks(&lanes[..lane_count]);
-                        stack_ptr += lane_count - 1;
-                    } else {
-                        stack_ptr -= 1;
-                    }
-                }
-
-                &BVH4Node::Leaf { object_range } => {
-                    trav_time += timer.tick() as f64;
-
-                    // Set up the tasks for each object.
-                    let obj_count = object_range.1 - object_range.0;
-                    for _ in 0..(obj_count - 1) {
-                        ray_stack.duplicate_next_task();
-                    }
-
-                    // Do the ray tests.
-                    for obj in &objects[object_range.0..object_range.1] {
-                        obj_ray_test(obj, rays, ray_stack);
-                    }
-
-                    timer.tick();
-
-                    stack_ptr -= 1;
-                }
-            }
-        }
-
-        trav_time += timer.tick() as f64;
-        ACCEL_TRAV_TIME.with(|att| {
-            let v = att.get();
-            att.set(v + trav_time);
-        });
-    }
-
-    fn construct_from_base(
-        arena: &'a MemArena,
-        base: &BVHBase,
-        node: &BVHBaseNode,
-        fill_node: &mut BVH4Node<'a>,
-    ) -> usize {
-        let mut node_count = 0;
-
-        match node {
-            // Create internal node
-            &BVHBaseNode::Internal {
-                bounds_range: _,
-                children_indices,
-                split_axis,
-            } => {
-                let child_l = &base.nodes[children_indices.0];
-                let child_r = &base.nodes[children_indices.1];
-
-                // Prepare convenient access to the stuff we need.
-                let child_count: usize;
-                let children; // [Optional, Optional, Optional, Optional]
-                let split_info: SplitAxes;
-                match *child_l {
-                    BVHBaseNode::Internal {
-                        children_indices: i_l,
-                        split_axis: s_l,
-                        ..
-                    } => {
-                        match *child_r {
-                            BVHBaseNode::Internal {
-                                children_indices: i_r,
-                                split_axis: s_r,
-                                ..
-                            } => {
-                                // Four nodes
-                                child_count = 4;
-                                children = [
-                                    Some(&base.nodes[i_l.0]),
-                                    Some(&base.nodes[i_l.1]),
-                                    Some(&base.nodes[i_r.0]),
-                                    Some(&base.nodes[i_r.1]),
-                                ];
-                                split_info = SplitAxes::Full((split_axis, s_l, s_r));
-                            }
-                            BVHBaseNode::Leaf { .. } => {
-                                // Three nodes with left split
-                                child_count = 3;
-                                children = [
-                                    Some(&base.nodes[i_l.0]),
-                                    Some(&base.nodes[i_l.1]),
-                                    Some(child_r),
-                                    None,
-                                ];
-                                split_info = SplitAxes::Left((split_axis, s_l));
-                            }
-                        }
-                    }
-                    BVHBaseNode::Leaf { .. } => {
-                        match *child_r {
-                            BVHBaseNode::Internal {
-                                children_indices: i_r,
-                                split_axis: s_r,
-                                ..
-                            } => {
-                                // Three nodes with right split
-                                child_count = 3;
-                                children = [
-                                    Some(child_l),
-                                    Some(&base.nodes[i_r.0]),
-                                    Some(&base.nodes[i_r.1]),
-                                    None,
-                                ];
-                                split_info = SplitAxes::Right((split_axis, s_r));
-                            }
-                            BVHBaseNode::Leaf { .. } => {
-                                // Two nodes
-                                child_count = 2;
-                                children = [Some(child_l), Some(child_r), None, None];
-                                split_info = SplitAxes::TopOnly(split_axis);
-                            }
-                        }
-                    }
-                }
-
-                node_count += child_count;
-
-                // Construct bounds
-                let bounds = {
-                    let bounds_len = children
-                        .iter()
-                        .map(|c| {
-                            if let &Some(n) = c {
-                                let len = n.bounds_range().1 - n.bounds_range().0;
-                                debug_assert!(len >= 1);
-                                len
-                            } else {
-                                0
-                            }
-                        })
-                        .max()
-                        .unwrap();
-                    debug_assert!(bounds_len >= 1);
-                    let bounds =
-                        unsafe { arena.alloc_array_uninitialized_with_alignment(bounds_len, 32) };
-                    if bounds_len < 2 {
-                        let b1 =
-                            children[0].map_or(BBox::new(), |c| base.bounds[c.bounds_range().0]);
-                        let b2 =
-                            children[1].map_or(BBox::new(), |c| base.bounds[c.bounds_range().0]);
-                        let b3 =
-                            children[2].map_or(BBox::new(), |c| base.bounds[c.bounds_range().0]);
-                        let b4 =
-                            children[3].map_or(BBox::new(), |c| base.bounds[c.bounds_range().0]);
-                        bounds[0] = BBox4::from_bboxes(b1, b2, b3, b4);
-                    } else {
-                        for (i, b) in bounds.iter_mut().enumerate() {
-                            let time = i as f32 / (bounds_len - 1) as f32;
-
-                            let b1 = children[0].map_or(BBox::new(), |c| {
-                                let (x, y) = c.bounds_range();
-                                lerp_slice(&base.bounds[x..y], time)
-                            });
-                            let b2 = children[1].map_or(BBox::new(), |c| {
-                                let (x, y) = c.bounds_range();
-                                lerp_slice(&base.bounds[x..y], time)
-                            });
-                            let b3 = children[2].map_or(BBox::new(), |c| {
-                                let (x, y) = c.bounds_range();
-                                lerp_slice(&base.bounds[x..y], time)
-                            });
-                            let b4 = children[3].map_or(BBox::new(), |c| {
-                                let (x, y) = c.bounds_range();
-                                lerp_slice(&base.bounds[x..y], time)
-                            });
-                            *b = BBox4::from_bboxes(b1, b2, b3, b4);
-                        }
-                    }
-                    bounds
-                };
-
-                // Construct child nodes
-                let child_nodes = unsafe {
-                    arena.alloc_array_uninitialized_with_alignment::<BVH4Node>(child_count, 32)
-                };
-                for (i, c) in children[0..child_count].iter().enumerate() {
-                    node_count +=
-                        BVH4::construct_from_base(arena, base, c.unwrap(), &mut child_nodes[i]);
-                }
-
-                // Build this node
-                *fill_node = BVH4Node::Internal {
-                    bounds: bounds,
-                    children: child_nodes,
-                    traversal_code: calc_traversal_code(split_info),
-                };
-            }
-
-            // Create internal node
-            &BVHBaseNode::Leaf { object_range, .. } => {
-                *fill_node = BVH4Node::Leaf {
-                    object_range: object_range,
-                };
-                node_count += 1;
-            }
-        }
-
-        return node_count;
-    }
-}
-
-impl<'a> Boundable for BVH4<'a> {
-    fn bounds<'b>(&'b self) -> &'b [BBox] {
-        self._bounds.unwrap_or(&[])
-    }
-}
diff --git a/src/accel/mod.rs b/src/accel/mod.rs
index 1bac6d7..abbb1d4 100644
--- a/src/accel/mod.rs
+++ b/src/accel/mod.rs
@@ -1,6 +1,5 @@
 // mod bvh;
 mod bvh4;
-mod bvh4_simd;
 mod bvh_base;
 mod light_array;
 mod light_tree;
@@ -15,7 +14,7 @@ use crate::{
 
 pub use self::{
     // bvh::{BVHNode, BVH},
-    bvh4_simd::{ray_code, BVH4Node, BVH4},
+    bvh4::{ray_code, BVH4Node, BVH4},
     light_array::LightArray,
     light_tree::LightTree,
 };
diff --git a/src/light/rectangle_light.rs b/src/light/rectangle_light.rs
index 8df2890..db01072 100644
--- a/src/light/rectangle_light.rs
+++ b/src/light/rectangle_light.rs
@@ -265,7 +265,7 @@ impl<'a> Surface for RectangleLight<'a> {
     ) {
         let _ = shader; // Silence 'unused' warning
 
-        ray_stack.pop_do_next_task(0, |ray_idx| {
+        ray_stack.pop_do_next_task(|ray_idx| {
             let time = rays.time(ray_idx);
             let orig = rays.orig(ray_idx);
             let dir = rays.dir(ray_idx);
@@ -332,8 +332,6 @@ impl<'a> Surface for RectangleLight<'a> {
                     }
                 }
             }
-
-            ([0; 4], 0)
         });
     }
 }
diff --git a/src/light/sphere_light.rs b/src/light/sphere_light.rs
index 8c596a8..e17371f 100644
--- a/src/light/sphere_light.rs
+++ b/src/light/sphere_light.rs
@@ -214,7 +214,7 @@ impl<'a> Surface for SphereLight<'a> {
     ) {
         let _ = shader; // Silence 'unused' warning
 
-        ray_stack.pop_do_next_task(0, |ray_idx| {
+        ray_stack.pop_do_next_task(|ray_idx| {
             let time = rays.time(ray_idx);
 
             // Get the transform space
@@ -242,7 +242,7 @@ impl<'a> Surface for SphereLight<'a> {
             let discriminant = (b * b) - (4.0 * a * c);
             if discriminant < 0.0 {
                 // Discriminant less than zero?  No solution => no intersection.
-                return ([0; 4], 0);
+                return;
             }
             let discriminant = discriminant.sqrt();
 
@@ -268,7 +268,7 @@ impl<'a> Surface for SphereLight<'a> {
             // Check our intersection for validity against this ray's extents
             if t0 > rays.max_t(ray_idx) || t1 <= 0.0 {
                 // Didn't hit because sphere is entirely outside of ray's extents
-                return ([0; 4], 0);
+                return;
             }
 
             let t = if t0 > 0.0 {
@@ -278,7 +278,7 @@ impl<'a> Surface for SphereLight<'a> {
             } else {
                 // Didn't hit because ray is entirely within the sphere, and
                 // therefore doesn't hit its surface.
-                return ([0; 4], 0);
+                return;
             };
 
             // We hit the sphere, so calculate intersection info.
@@ -334,8 +334,6 @@ impl<'a> Surface for SphereLight<'a> {
                 // Set ray's max t
                 rays.set_max_t(ray_idx, t);
             }
-
-            ([0; 4], 0)
         });
     }
 }
diff --git a/src/ray.rs b/src/ray.rs
index 4312f32..2fa92de 100644
--- a/src/ray.rs
+++ b/src/ray.rs
@@ -1,6 +1,6 @@
 #![allow(dead_code)]
 
-use float4::Float4;
+use float4::{Bool4, Float4};
 
 use crate::math::{Matrix4x4, Point, Vector};
 
@@ -293,11 +293,31 @@ impl RayStack {
     }
 
     /// Pops the next task off the stack, and executes the provided closure for
-    /// each ray index in the task.  The return value of the closure is the list
-    /// of lanes (by index) to add the given ray index back into.
-    pub fn pop_do_next_task<F>(&mut self, needed_lanes: usize, mut handle_ray: F)
+    /// each ray index in the task.  Nothing is pushed back onto any lane.
+    pub fn pop_do_next_task<F>(&mut self, mut handle_ray: F)
     where
-        F: FnMut(usize) -> ([u8; 4], usize),
+        F: FnMut(usize),
+    {
+        // Pop the task and do necessary bookkeeping.
+        let task = self.tasks.pop().unwrap(); // panics if `pop()` yields no task
+        let task_range = (task.start_idx, self.lanes[task.lane].end_len); // (start, end) positions within the lane's `idxs`
+        self.lanes[task.lane].end_len = task.start_idx; // the lane's recorded end moves back to where this task began
+
+        // Execute task.
+        for i in task_range.0..task_range.1 {
+            let ray_idx = self.lanes[task.lane].idxs[i];
+            handle_ray(ray_idx as usize); // the stored index is cast to usize for the callback
+        }
+
+        self.lanes[task.lane].idxs.truncate(task_range.0); // drop every index from the task's start onward
+    }
+
+    /// Pops the next task off the stack, executes the provided closure for
+    /// each ray index in the task, and pushes the ray indices back onto the
+    /// indicated lanes.
+    pub fn pop_do_next_task_and_push_rays<F>(&mut self, needed_lanes: usize, mut handle_ray: F)
+    where
+        F: FnMut(usize) -> (Bool4, usize),
     {
         // Prepare lanes.
         self.ensure_lane_count(needed_lanes);
@@ -311,13 +331,15 @@ impl RayStack {
         let mut source_lane_cap = task_range.0;
         for i in task_range.0..task_range.1 {
             let ray_idx = self.lanes[task.lane].idxs[i];
-            let (add_list, list_len) = handle_ray(ray_idx as usize);
-            for &l in &add_list[..list_len] {
-                if l == task.lane as u8 {
-                    self.lanes[l as usize].idxs[source_lane_cap] = ray_idx;
-                    source_lane_cap += 1;
-                } else {
-                    self.lanes[l as usize].idxs.push(ray_idx);
+            let (push_mask, c) = handle_ray(ray_idx as usize);
+            for l in 0..c {
+                if push_mask.get_n(l) {
+                    if l == task.lane {
+                        self.lanes[l as usize].idxs[source_lane_cap] = ray_idx;
+                        source_lane_cap += 1;
+                    } else {
+                        self.lanes[l as usize].idxs.push(ray_idx);
+                    }
                 }
             }
         }
diff --git a/src/surface/triangle_mesh.rs b/src/surface/triangle_mesh.rs
index 906b7a5..1b54232 100644
--- a/src/surface/triangle_mesh.rs
+++ b/src/surface/triangle_mesh.rs
@@ -157,7 +157,7 @@ impl<'a> Surface for TriangleMesh<'a> {
                 };
 
                 // Test each ray against the current triangle.
-                ray_stack.pop_do_next_task(0, |ray_idx| {
+                ray_stack.pop_do_next_task(|ray_idx| {
                     let ray_idx = ray_idx as usize;
                     let ray_time = rays.time(ray_idx);
 
@@ -275,8 +275,6 @@ impl<'a> Surface for TriangleMesh<'a> {
                             rays.set_max_t(ray_idx, t);
                         }
                     }
-
-                    ([0; 4], 0)
                 });
             },
         );
diff --git a/src/tracer.rs b/src/tracer.rs
index 8ba78c3..e733cdd 100644
--- a/src/tracer.rs
+++ b/src/tracer.rs
@@ -12,6 +12,8 @@ use crate::{
     transform_stack::TransformStack,
 };
 
+use float4::Bool4;
+
 pub struct Tracer<'a> {
     ray_stack: RayStack,
     inner: TracerInner<'a>,
@@ -96,10 +98,10 @@ impl<'a> TracerInner<'a> {
                     // Do transforms
                     // TODO: re-divide rays based on direction (maybe?).
                     let xforms = self.xform_stack.top();
-                    ray_stack.pop_do_next_task(2, |ray_idx| {
+                    ray_stack.pop_do_next_task_and_push_rays(2, |ray_idx| {
                         let t = rays.time(ray_idx);
                         rays.update_local(ray_idx, &lerp_slice(xforms, t));
-                        ([0, 1, 0, 0], 2)
+                        (Bool4::new(true, true, false, false), 2)
                     });
                     ray_stack.push_lanes_to_tasks(&[0, 1]);
                 }
@@ -129,16 +131,14 @@ impl<'a> TracerInner<'a> {
                     // Undo transforms
                     let xforms = self.xform_stack.top();
                     if !xforms.is_empty() {
-                        ray_stack.pop_do_next_task(0, |ray_idx| {
+                        ray_stack.pop_do_next_task(|ray_idx| {
                             let t = rays.time(ray_idx);
                             rays.update_local(ray_idx, &lerp_slice(xforms, t));
-                            ([0; 4], 0)
                         });
                     } else {
                         let ident = Matrix4x4::new();
-                        ray_stack.pop_do_next_task(0, |ray_idx| {
+                        ray_stack.pop_do_next_task(|ray_idx| {
                             rays.update_local(ray_idx, &ident);
-                            ([0; 4], 0)
                         });
                     }
                 }
diff --git a/sub_crates/float4/src/lib.rs b/sub_crates/float4/src/lib.rs
index 99c0417..327fbf9 100644
--- a/sub_crates/float4/src/lib.rs
+++ b/sub_crates/float4/src/lib.rs
@@ -621,7 +621,22 @@ mod x86_64_sse {
 
     impl Bool4 {
         #[inline(always)]
-        pub fn new() -> Bool4 {
+        pub fn new(a: bool, b: bool, c: bool, d: bool) -> Bool4 { // builds a Bool4 from four individual bools
+            use std::arch::x86_64::_mm_set_ps;
+            Bool4 {
+                data: unsafe { // SAFETY: presumably relies on this module only being built for SSE-capable x86_64 — TODO confirm the cfg gate
+                    _mm_set_ps( // _mm_set_ps takes the highest element first, hence the d, c, b, a order
+                        if d { 1.0 } else { 0.0 }, // NOTE(review): `true` is stored as 1.0, not an all-ones bit pattern — confirm consumers only test for non-zero
+                        if c { 1.0 } else { 0.0 },
+                        if b { 1.0 } else { 0.0 },
+                        if a { 1.0 } else { 0.0 },
+                    )
+                },
+            }
+        }
+
+        #[inline(always)]
+        pub fn new_false() -> Bool4 {
             use std::arch::x86_64::_mm_set1_ps;
             Bool4 {
                 data: unsafe { _mm_set1_ps(0.0) },
@@ -667,7 +682,8 @@ mod x86_64_sse {
         ///
         /// This is the `OR` operation on all the contained bools.  If even
         /// one bool is true, this returns true.
-        pub fn all_false(&self) -> bool {
+        #[inline(always)]
+        pub fn is_all_false(&self) -> bool { // NOTE: true only when all 128 bits are zero, i.e. NOT of the lane-wise OR; the doc comment's "one bool is true => returns true" is inverted
             let a = unsafe { *(&self.data as *const __m128 as *const u128) };
             a == 0
         }