diff --git a/Cargo.lock b/Cargo.lock
index bc8f7b6..7e4b4ef 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -100,6 +100,11 @@ dependencies = [
 name = "color"
 version = "0.1.0"
 
+[[package]]
+name = "copy_in_place"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+
 [[package]]
 name = "crossbeam"
 version = "0.3.2"
@@ -239,6 +244,7 @@ dependencies = [
  "bvh_order 0.1.0",
  "clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)",
  "color 0.1.0",
+ "copy_in_place 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
  "crossbeam 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)",
  "float4 0.1.0",
  "half 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -557,6 +563,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 "checksum cfg-if 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)" = "b486ce3ccf7ffd79fdeb678eac06a9e6c09fc88d33836340becb8fffe87c5e33"
 "checksum clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)" = "5067f5bb2d80ef5d68b4c87db81601f0b75bca627bc2ef76b141d7b846a3c6d9"
 "checksum cloudabi 0.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ddfc5b9aa5d4507acaf872de71051dfd0e309860e88966e1051e462a077aac4f"
+"checksum copy_in_place 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b792a46b1ef44bb5e9a04721d34e186522431be965a283437107843d62ddbaad"
 "checksum crossbeam 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)" = "24ce9782d4d5c53674646a6a4c1863a21a8fc0cb649b3c94dfc16e45071dea19"
 "checksum fnv 1.0.6 (registry+https://github.com/rust-lang/crates.io-index)" = "2fad85553e09a6f881f739c29f0b00b0f01357c743266d478b68951ce23285f3"
 "checksum fuchsia-cprng 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "a06f77d526c1a601b7c4cdd98f54b5eaabffc14d5f2f0296febdc7f357c6d3ba"
diff --git a/Cargo.toml b/Cargo.toml
index 1e51807..14ee2ac 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -25,6 +25,7 @@ debug = true
 # Crates.io dependencies
 base64 = "0.9"
 clap = "2.30"
+copy_in_place = "0.2.0"
 crossbeam = "0.3"
 half = "1.0"
 lazy_static = "1.0"
diff --git a/src/accel/bvh4.rs b/src/accel/bvh4.rs
index 11766bc..5b09e0f 100644
--- a/src/accel/bvh4.rs
+++ b/src/accel/bvh4.rs
@@ -1,36 +1,52 @@
+//! This BVH4 implementation is based on the ideas from the paper
+//! "Efficient Ray Tracing Kernels for Modern CPU Architectures"
+//! by Fuetterling et al.
+
 #![allow(dead_code)]
 
-use bvh_order::{calc_traversal_code, SplitAxes, TRAVERSAL_TABLE};
 use mem_arena::MemArena;
 
 use crate::{
-    algorithm::partition, bbox::BBox, boundable::Boundable, lerp::lerp_slice, ray::AccelRay,
-    timer::Timer,
+    bbox::BBox,
+    bbox4::BBox4,
+    boundable::Boundable,
+    lerp::lerp_slice,
+    math::Vector,
+    ray::{RayBatch, RayStack},
 };
 
 use super::{
     bvh_base::{BVHBase, BVHBaseNode, BVH_MAX_DEPTH},
-    ACCEL_NODE_RAY_TESTS, ACCEL_TRAV_TIME,
+    ACCEL_NODE_RAY_TESTS,
 };
 
+use bvh_order::{calc_traversal_code, SplitAxes, TRAVERSAL_TABLE};
+use float4::Bool4;
+
+pub fn ray_code(dir: Vector) -> usize { // Packs the sign of each component of `dir` into a 3-bit code (0..=7).
+    let ray_sign_is_neg = [dir.x() < 0.0, dir.y() < 0.0, dir.z() < 0.0]; // [x, y, z] negativity flags
+    ray_sign_is_neg[0] as usize // bit 0: x is negative
+        + ((ray_sign_is_neg[1] as usize) << 1) // bit 1: y is negative
+        + ((ray_sign_is_neg[2] as usize) << 2) // bit 2: z is negative
+}
+
 #[derive(Copy, Clone, Debug)]
 pub struct BVH4<'a> {
     root: Option<&'a BVH4Node<'a>>,
     depth: usize,
+    node_count: usize,
+    _bounds: Option<&'a [BBox]>,
 }
 
 #[derive(Copy, Clone, Debug)]
 pub enum BVH4Node<'a> {
-    Inner {
-        traversal_code: u8,
-        bounds_start: &'a BBox,
-        bounds_len: u16,
+    Internal {
+        bounds: &'a [BBox4],
         children: &'a [BVH4Node<'a>],
+        traversal_code: u8,
     },
 
     Leaf {
-        bounds_start: &'a BBox,
-        bounds_len: u16,
         object_range: (usize, usize),
     },
 }
@@ -45,19 +61,32 @@ impl<'a> BVH4<'a> {
     where
         F: 'b + Fn(&T) -> &'b [BBox],
     {
-        if objects.is_empty() {
+        if objects.len() == 0 {
             BVH4 {
                 root: None,
                 depth: 0,
+                node_count: 0,
+                _bounds: None,
             }
         } else {
             let base = BVHBase::from_objects(objects, objects_per_leaf, bounder);
 
-            let root = unsafe { arena.alloc_uninitialized::<BVH4Node>() };
-            BVH4::construct_from_base(arena, &base, base.root_node_index(), root);
+            let fill_node = unsafe { arena.alloc_uninitialized_with_alignment::<BVH4Node>(32) };
+            let node_count = BVH4::construct_from_base(
+                arena,
+                &base,
+                &base.nodes[base.root_node_index()],
+                fill_node,
+            );
+
             BVH4 {
-                root: Some(root),
-                depth: base.depth,
+                root: Some(fill_node),
+                depth: (base.depth / 2) + 1,
+                node_count: node_count,
+                _bounds: {
+                    let range = base.nodes[base.root_node_index()].bounds_range();
+                    Some(arena.copy_slice(&base.bounds[range.0..range.1]))
+                },
             }
         }
     }
@@ -66,135 +95,85 @@ impl<'a> BVH4<'a> {
         self.depth
     }
 
-    pub fn traverse<T, F>(&self, rays: &mut [AccelRay], objects: &[T], mut obj_ray_test: F)
+    pub fn traverse<F>(&self, rays: &mut RayBatch, ray_stack: &mut RayStack, mut obj_ray_test: F)
     where
-        F: FnMut(&T, &mut [AccelRay]),
+        F: FnMut(std::ops::Range<usize>, &mut RayBatch, &mut RayStack),
     {
         if self.root.is_none() {
             return;
         }
 
-        let mut timer = Timer::new();
-        let mut trav_time: f64 = 0.0;
         let mut node_tests: u64 = 0;
 
-        let traversal_table = {
-            let ray_sign_is_neg = [
-                rays[0].dir_inv.x() < 0.0,
-                rays[0].dir_inv.y() < 0.0,
-                rays[0].dir_inv.z() < 0.0,
-            ];
-            let ray_code = ray_sign_is_neg[0] as usize
-                + ((ray_sign_is_neg[1] as usize) << 1)
-                + ((ray_sign_is_neg[2] as usize) << 2);
-            &TRAVERSAL_TABLE[ray_code]
-        };
+        let traversal_table =
+            &TRAVERSAL_TABLE[ray_code(rays.dir_inv_local(ray_stack.next_task_ray_idx(0)))];
 
         // +2 of max depth for root and last child
         let mut node_stack = [self.root.unwrap(); (BVH_MAX_DEPTH * 3) + 2];
-        let mut ray_i_stack = [rays.len(); (BVH_MAX_DEPTH * 3) + 2];
         let mut stack_ptr = 1;
 
         while stack_ptr > 0 {
-            node_tests += ray_i_stack[stack_ptr] as u64;
-            match *node_stack[stack_ptr] {
-                BVH4Node::Inner {
-                    traversal_code,
-                    bounds_start,
-                    bounds_len,
+            match node_stack[stack_ptr] {
+                &BVH4Node::Internal {
+                    bounds,
                     children,
+                    traversal_code,
                 } => {
-                    let bounds =
-                        unsafe { std::slice::from_raw_parts(bounds_start, bounds_len as usize) };
-                    let part = partition(&mut rays[..ray_i_stack[stack_ptr]], |r| {
-                        (!r.is_done()) && lerp_slice(bounds, r.time).intersect_accel_ray(r)
-                    });
-                    if part > 0 {
-                        let order_code = traversal_table[traversal_code as usize];
-                        match children.len() {
-                            4 => {
-                                let i4 = ((order_code >> 6) & 0b11) as usize;
-                                let i3 = ((order_code >> 4) & 0b11) as usize;
-                                let i2 = ((order_code >> 2) & 0b11) as usize;
-                                let i1 = (order_code & 0b11) as usize;
+                    node_tests += ray_stack.ray_count_in_next_task() as u64;
+                    let mut all_hits = Bool4::new_false();
 
-                                ray_i_stack[stack_ptr] = part;
-                                ray_i_stack[stack_ptr + 1] = part;
-                                ray_i_stack[stack_ptr + 2] = part;
-                                ray_i_stack[stack_ptr + 3] = part;
-
-                                node_stack[stack_ptr] = &children[i4];
-                                node_stack[stack_ptr + 1] = &children[i3];
-                                node_stack[stack_ptr + 2] = &children[i2];
-                                node_stack[stack_ptr + 3] = &children[i1];
-
-                                stack_ptr += 3;
-                            }
-                            3 => {
-                                let i3 = ((order_code >> 4) & 0b11) as usize;
-                                let i2 = ((order_code >> 2) & 0b11) as usize;
-                                let i1 = (order_code & 0b11) as usize;
-
-                                ray_i_stack[stack_ptr] = part;
-                                ray_i_stack[stack_ptr + 1] = part;
-                                ray_i_stack[stack_ptr + 2] = part;
-
-                                node_stack[stack_ptr] = &children[i3];
-                                node_stack[stack_ptr + 1] = &children[i2];
-                                node_stack[stack_ptr + 2] = &children[i1];
-
-                                stack_ptr += 2;
-                            }
-                            2 => {
-                                let i2 = ((order_code >> 2) & 0b11) as usize;
-                                let i1 = (order_code & 0b11) as usize;
-
-                                ray_i_stack[stack_ptr] = part;
-                                ray_i_stack[stack_ptr + 1] = part;
-
-                                node_stack[stack_ptr] = &children[i2];
-                                node_stack[stack_ptr + 1] = &children[i1];
-
-                                stack_ptr += 1;
-                            }
-                            _ => unreachable!(),
+                    // Ray testing
+                    ray_stack.pop_do_next_task_and_push_rays(children.len(), |ray_idx| {
+                        if rays.is_done(ray_idx) {
+                            Bool4::new_false()
+                        } else {
+                            let hits = if bounds.len() == 1 {
+                                bounds[0].intersect_ray(
+                                    rays.orig_local(ray_idx),
+                                    rays.dir_inv_local(ray_idx),
+                                    rays.max_t(ray_idx),
+                                )
+                            } else {
+                                lerp_slice(bounds, rays.time(ray_idx)).intersect_ray(
+                                    rays.orig_local(ray_idx),
+                                    rays.dir_inv_local(ray_idx),
+                                    rays.max_t(ray_idx),
+                                )
+                            };
+                            all_hits = all_hits | hits;
+                            hits
                         }
+                    });
+
+                    // If there were any intersections, create tasks.
+                    if !all_hits.is_all_false() {
+                        let order_code = traversal_table[traversal_code as usize];
+                        let mut lane_count = 0;
+                        let mut i = children.len() as u8;
+                        while i > 0 {
+                            i -= 1;
+                            let child_i = ((order_code >> (i * 2)) & 3) as usize;
+                            if ray_stack.push_lane_to_task(child_i) {
+                                node_stack[stack_ptr + lane_count] = &children[child_i];
+                                lane_count += 1;
+                            }
+                        }
+
+                        stack_ptr += lane_count - 1;
                     } else {
                         stack_ptr -= 1;
                     }
                 }
 
-                BVH4Node::Leaf {
-                    object_range,
-                    bounds_start,
-                    bounds_len,
-                } => {
-                    let bounds =
-                        unsafe { std::slice::from_raw_parts(bounds_start, bounds_len as usize) };
-                    let part = partition(&mut rays[..ray_i_stack[stack_ptr]], |r| {
-                        (!r.is_done()) && lerp_slice(bounds, r.time).intersect_accel_ray(r)
-                    });
-
-                    trav_time += timer.tick() as f64;
-
-                    if part > 0 {
-                        for obj in &objects[object_range.0..object_range.1] {
-                            obj_ray_test(obj, &mut rays[..part]);
-                        }
-                    }
-
-                    timer.tick();
+                &BVH4Node::Leaf { object_range } => {
+                    // Do the ray tests.
+                    obj_ray_test(object_range.0..object_range.1, rays, ray_stack);
 
                     stack_ptr -= 1;
                 }
             }
         }
 
-        trav_time += timer.tick() as f64;
-        ACCEL_TRAV_TIME.with(|att| {
-            let v = att.get();
-            att.set(v + trav_time);
-        });
         ACCEL_NODE_RAY_TESTS.with(|anv| {
             let v = anv.get();
             anv.set(v + node_tests);
@@ -204,12 +183,15 @@ impl<'a> BVH4<'a> {
     fn construct_from_base(
         arena: &'a MemArena,
         base: &BVHBase,
-        node_index: usize,
-        node_mem: &mut BVH4Node<'a>,
-    ) {
-        match base.nodes[node_index] {
-            BVHBaseNode::Internal {
-                bounds_range,
+        node: &BVHBaseNode,
+        fill_node: &mut BVH4Node<'a>,
+    ) -> usize {
+        let mut node_count = 0;
+
+        match node {
+            // Create internal node
+            &BVHBaseNode::Internal {
+                bounds_range: _,
                 children_indices,
                 split_axis,
             } => {
@@ -218,7 +200,7 @@ impl<'a> BVH4<'a> {
 
                 // Prepare convenient access to the stuff we need.
                 let child_count: usize;
-                let child_indices: [usize; 4];
+                let children; // [Option<&BVHBaseNode>; 4], `None` for unused slots
                 let split_info: SplitAxes;
                 match *child_l {
                     BVHBaseNode::Internal {
@@ -234,13 +216,23 @@ impl<'a> BVH4<'a> {
                             } => {
                                 // Four nodes
                                 child_count = 4;
-                                child_indices = [i_l.0, i_l.1, i_r.0, i_r.1];
+                                children = [
+                                    Some(&base.nodes[i_l.0]),
+                                    Some(&base.nodes[i_l.1]),
+                                    Some(&base.nodes[i_r.0]),
+                                    Some(&base.nodes[i_r.1]),
+                                ];
                                 split_info = SplitAxes::Full((split_axis, s_l, s_r));
                             }
                             BVHBaseNode::Leaf { .. } => {
                                 // Three nodes with left split
                                 child_count = 3;
-                                child_indices = [i_l.0, i_l.1, children_indices.1, 0];
+                                children = [
+                                    Some(&base.nodes[i_l.0]),
+                                    Some(&base.nodes[i_l.1]),
+                                    Some(child_r),
+                                    None,
+                                ];
                                 split_info = SplitAxes::Left((split_axis, s_l));
                             }
                         }
@@ -254,76 +246,112 @@ impl<'a> BVH4<'a> {
                             } => {
                                 // Three nodes with right split
                                 child_count = 3;
-                                child_indices = [children_indices.0, i_r.0, i_r.1, 0];
+                                children = [
+                                    Some(child_l),
+                                    Some(&base.nodes[i_r.0]),
+                                    Some(&base.nodes[i_r.1]),
+                                    None,
+                                ];
                                 split_info = SplitAxes::Right((split_axis, s_r));
                             }
                             BVHBaseNode::Leaf { .. } => {
                                 // Two nodes
                                 child_count = 2;
-                                child_indices = [children_indices.0, children_indices.1, 0, 0];
+                                children = [Some(child_l), Some(child_r), None, None];
                                 split_info = SplitAxes::TopOnly(split_axis);
                             }
                         }
                     }
                 }
 
-                // Copy bounds
-                let bounds = arena
-                    .copy_slice_with_alignment(&base.bounds[bounds_range.0..bounds_range.1], 32);
+                node_count += child_count;
 
-                // Build children
-                let children_mem = unsafe {
+                // Construct bounds
+                let bounds = {
+                    let bounds_len = children
+                        .iter()
+                        .map(|c| {
+                            if let &Some(n) = c {
+                                let len = n.bounds_range().1 - n.bounds_range().0;
+                                debug_assert!(len >= 1);
+                                len
+                            } else {
+                                0
+                            }
+                        })
+                        .max()
+                        .unwrap();
+                    debug_assert!(bounds_len >= 1);
+                    let bounds =
+                        unsafe { arena.alloc_array_uninitialized_with_alignment(bounds_len, 32) };
+                    if bounds_len < 2 {
+                        let b1 =
+                            children[0].map_or(BBox::new(), |c| base.bounds[c.bounds_range().0]);
+                        let b2 =
+                            children[1].map_or(BBox::new(), |c| base.bounds[c.bounds_range().0]);
+                        let b3 =
+                            children[2].map_or(BBox::new(), |c| base.bounds[c.bounds_range().0]);
+                        let b4 =
+                            children[3].map_or(BBox::new(), |c| base.bounds[c.bounds_range().0]);
+                        bounds[0] = BBox4::from_bboxes(b1, b2, b3, b4);
+                    } else {
+                        for (i, b) in bounds.iter_mut().enumerate() {
+                            let time = i as f32 / (bounds_len - 1) as f32;
+
+                            let b1 = children[0].map_or(BBox::new(), |c| {
+                                let (x, y) = c.bounds_range();
+                                lerp_slice(&base.bounds[x..y], time)
+                            });
+                            let b2 = children[1].map_or(BBox::new(), |c| {
+                                let (x, y) = c.bounds_range();
+                                lerp_slice(&base.bounds[x..y], time)
+                            });
+                            let b3 = children[2].map_or(BBox::new(), |c| {
+                                let (x, y) = c.bounds_range();
+                                lerp_slice(&base.bounds[x..y], time)
+                            });
+                            let b4 = children[3].map_or(BBox::new(), |c| {
+                                let (x, y) = c.bounds_range();
+                                lerp_slice(&base.bounds[x..y], time)
+                            });
+                            *b = BBox4::from_bboxes(b1, b2, b3, b4);
+                        }
+                    }
+                    bounds
+                };
+
+                // Construct child nodes
+                let child_nodes = unsafe {
                     arena.alloc_array_uninitialized_with_alignment::<BVH4Node>(child_count, 32)
                 };
-                for i in 0..child_count {
-                    BVH4::construct_from_base(arena, base, child_indices[i], &mut children_mem[i]);
+                for (i, c) in children[0..child_count].iter().enumerate() {
+                    node_count +=
+                        BVH4::construct_from_base(arena, base, c.unwrap(), &mut child_nodes[i]);
                 }
 
-                // Fill in node
-                *node_mem = BVH4Node::Inner {
+                // Build this node
+                *fill_node = BVH4Node::Internal {
+                    bounds: bounds,
+                    children: child_nodes,
                     traversal_code: calc_traversal_code(split_info),
-                    bounds_start: &bounds[0],
-                    bounds_len: bounds.len() as u16,
-                    children: children_mem,
                 };
             }
 
-            BVHBaseNode::Leaf {
-                bounds_range,
-                object_range,
-            } => {
-                let bounds = arena.copy_slice(&base.bounds[bounds_range.0..bounds_range.1]);
-
-                *node_mem = BVH4Node::Leaf {
-                    bounds_start: &bounds[0],
-                    bounds_len: bounds.len() as u16,
+            // Create leaf node
+            &BVHBaseNode::Leaf { object_range, .. } => {
+                *fill_node = BVH4Node::Leaf {
                     object_range: object_range,
                 };
+                node_count += 1;
             }
         }
-    }
-}
 
-lazy_static! {
-    static ref DEGENERATE_BOUNDS: [BBox; 1] = [BBox::new()];
+        return node_count;
+    }
 }
 
 impl<'a> Boundable for BVH4<'a> {
-    fn bounds(&self) -> &[BBox] {
-        match self.root {
-            None => &DEGENERATE_BOUNDS[..],
-            Some(root) => match *root {
-                BVH4Node::Inner {
-                    bounds_start,
-                    bounds_len,
-                    ..
-                }
-                | BVH4Node::Leaf {
-                    bounds_start,
-                    bounds_len,
-                    ..
-                } => unsafe { std::slice::from_raw_parts(bounds_start, bounds_len as usize) },
-            },
-        }
+    fn bounds<'b>(&'b self) -> &'b [BBox] {
+        self._bounds.unwrap_or(&[])
     }
 }
diff --git a/src/accel/mod.rs b/src/accel/mod.rs
index fe8ee3d..ba83a3a 100644
--- a/src/accel/mod.rs
+++ b/src/accel/mod.rs
@@ -1,4 +1,4 @@
-mod bvh;
+// mod bvh;
 mod bvh4;
 mod bvh_base;
 mod light_array;
@@ -13,15 +13,14 @@ use crate::{
 };
 
 pub use self::{
-    bvh::{BVHNode, BVH},
-    bvh4::{BVH4Node, BVH4},
+    // bvh::{BVHNode, BVH},
+    bvh4::{ray_code, BVH4Node, BVH4},
     light_array::LightArray,
     light_tree::LightTree,
 };
 
 // Track BVH traversal time
 thread_local! {
-    pub static ACCEL_TRAV_TIME: Cell<f64> = Cell::new(0.0);
     pub static ACCEL_NODE_RAY_TESTS: Cell<u64> = Cell::new(0);
 }
 
diff --git a/src/bbox.rs b/src/bbox.rs
index 33d3e6e..a4a43bb 100644
--- a/src/bbox.rs
+++ b/src/bbox.rs
@@ -7,8 +7,7 @@ use std::{
 
 use crate::{
     lerp::{lerp, lerp_slice, Lerp},
-    math::{fast_minf32, Matrix4x4, Point},
-    ray::AccelRay,
+    math::{fast_minf32, Matrix4x4, Point, Vector},
 };
 
 const BBOX_MAXT_ADJUST: f32 = 1.000_000_24;
@@ -40,17 +39,17 @@ impl BBox {
     }
 
     // Returns whether the given ray intersects with the bbox.
-    pub fn intersect_accel_ray(&self, ray: &AccelRay) -> bool {
+    pub fn intersect_ray(&self, orig: Point, dir_inv: Vector, max_t: f32) -> bool {
         // Calculate slab intersections
-        let t1 = (self.min.co - ray.orig.co) * ray.dir_inv.co;
-        let t2 = (self.max.co - ray.orig.co) * ray.dir_inv.co;
+        let t1 = (self.min.co - orig.co) * dir_inv.co;
+        let t2 = (self.max.co - orig.co) * dir_inv.co;
 
         // Find the far and near intersection
         let mut far_t = t1.v_max(t2);
         let mut near_t = t1.v_min(t2);
         far_t.set_3(std::f32::INFINITY);
         near_t.set_3(0.0);
-        let far_hit_t = fast_minf32(far_t.h_min() * BBOX_MAXT_ADJUST, ray.max_t);
+        let far_hit_t = fast_minf32(far_t.h_min() * BBOX_MAXT_ADJUST, max_t);
         let near_hit_t = near_t.h_max();
 
         // Did we hit?
diff --git a/src/bbox4.rs b/src/bbox4.rs
new file mode 100644
index 0000000..71793a4
--- /dev/null
+++ b/src/bbox4.rs
@@ -0,0 +1,139 @@
+#![allow(dead_code)]
+
+use std;
+use std::ops::{BitOr, BitOrAssign};
+
+use crate::{
+    bbox::BBox,
+    lerp::{lerp, Lerp},
+    math::{Point, Vector},
+};
+
+use float4::{Bool4, Float4};
+
+const BBOX_MAXT_ADJUST: f32 = 1.00000024;
+
+/// A SIMD set of four 3D axis-aligned bounding boxes, stored per axis with one Float4 element per box.
+#[derive(Debug, Copy, Clone)]
+pub struct BBox4 {
+    pub x: (Float4, Float4), // (min, max) x extents of the four boxes
+    pub y: (Float4, Float4), // (min, max) y extents of the four boxes
+    pub z: (Float4, Float4), // (min, max) z extents of the four boxes
+}
+
+impl BBox4 {
+    /// Creates a set of four degenerate BBoxes, each with +infinity min and -infinity max.
+    pub fn new() -> BBox4 {
+        BBox4 {
+            x: (
+                Float4::splat(std::f32::INFINITY),
+                Float4::splat(std::f32::NEG_INFINITY),
+            ),
+            y: (
+                Float4::splat(std::f32::INFINITY),
+                Float4::splat(std::f32::NEG_INFINITY),
+            ),
+            z: (
+                Float4::splat(std::f32::INFINITY),
+                Float4::splat(std::f32::NEG_INFINITY),
+            ),
+        }
+    }
+
+    /// Packs four BBoxes into one BBox4, placing the min/max coordinates of
+    /// `b1`..`b4` into each per-axis Float4 in argument order.
+    pub fn from_bboxes(b1: BBox, b2: BBox, b3: BBox, b4: BBox) -> BBox4 {
+        BBox4 {
+            x: (
+                Float4::new(b1.min.x(), b2.min.x(), b3.min.x(), b4.min.x()),
+                Float4::new(b1.max.x(), b2.max.x(), b3.max.x(), b4.max.x()),
+            ),
+            y: (
+                Float4::new(b1.min.y(), b2.min.y(), b3.min.y(), b4.min.y()),
+                Float4::new(b1.max.y(), b2.max.y(), b3.max.y(), b4.max.y()),
+            ),
+            z: (
+                Float4::new(b1.min.z(), b2.min.z(), b3.min.z(), b4.min.z()),
+                Float4::new(b1.max.z(), b2.max.z(), b3.max.z(), b4.max.z()),
+            ),
+        }
+    }
+
+    /// Returns, for each of the four bboxes, whether the given ray intersects it.
+    pub fn intersect_ray(&self, orig: Point, dir_inv: Vector, max_t: f32) -> Bool4 {
+        // Get the ray data into SIMD format.
+        let ro_x = orig.co.all_0();
+        let ro_y = orig.co.all_1();
+        let ro_z = orig.co.all_2();
+        let rdi_x = dir_inv.co.all_0();
+        let rdi_y = dir_inv.co.all_1();
+        let rdi_z = dir_inv.co.all_2();
+        let max_t = Float4::splat(max_t);
+
+        // Slab tests
+        let t1_x = (self.x.0 - ro_x) * rdi_x;
+        let t1_y = (self.y.0 - ro_y) * rdi_y;
+        let t1_z = (self.z.0 - ro_z) * rdi_z;
+        let t2_x = (self.x.1 - ro_x) * rdi_x;
+        let t2_y = (self.y.1 - ro_y) * rdi_y;
+        let t2_z = (self.z.1 - ro_z) * rdi_z;
+
+        // Get the far and near t hits for each axis.
+        let t_far_x = t1_x.v_max(t2_x);
+        let t_far_y = t1_y.v_max(t2_y);
+        let t_far_z = t1_z.v_max(t2_z);
+        let t_near_x = t1_x.v_min(t2_x);
+        let t_near_y = t1_y.v_min(t2_y);
+        let t_near_z = t1_z.v_min(t2_z);
+
+        // Calculate over-all far t hit.
+        let far_t =
+            (t_far_x.v_min(t_far_y.v_min(t_far_z)) * Float4::splat(BBOX_MAXT_ADJUST)).v_min(max_t);
+
+        // Calculate over-all near t hit.
+        let near_t = t_near_x
+            .v_max(t_near_y)
+            .v_max(t_near_z.v_max(Float4::splat(0.0)));
+
+        // Hit results: a box is hit where its near t is less than its far t.
+        near_t.lt(far_t)
+    }
+}
+
+/// Union of two BBox4s: takes the v_min of the mins and the v_max of the maxes on each axis.
+impl BitOr for BBox4 {
+    type Output = BBox4;
+
+    fn bitor(self, rhs: BBox4) -> BBox4 {
+        BBox4 {
+            x: (self.x.0.v_min(rhs.x.0), self.x.1.v_max(rhs.x.1)),
+            y: (self.y.0.v_min(rhs.y.0), self.y.1.v_max(rhs.y.1)),
+            z: (self.z.0.v_min(rhs.z.0), self.z.1.v_max(rhs.z.1)),
+        }
+    }
+}
+
+impl BitOrAssign for BBox4 {
+    fn bitor_assign(&mut self, rhs: BBox4) {
+        *self = *self | rhs; // In-place union; delegates to the `|` operator on BBox4.
+    }
+}
+
+impl Lerp for BBox4 {
+    fn lerp(self, other: BBox4, alpha: f32) -> BBox4 { // Lerps every per-axis min and max with the same `alpha`.
+        BBox4 {
+            x: (
+                lerp(self.x.0, other.x.0, alpha),
+                lerp(self.x.1, other.x.1, alpha),
+            ),
+            y: (
+                lerp(self.y.0, other.y.0, alpha),
+                lerp(self.y.1, other.y.1, alpha),
+            ),
+            z: (
+                lerp(self.z.0, other.z.0, alpha),
+                lerp(self.z.1, other.z.1, alpha),
+            ),
+        }
+    }
+}
diff --git a/src/camera.rs b/src/camera.rs
index e3ed8c5..287805c 100644
--- a/src/camera.rs
+++ b/src/camera.rs
@@ -92,6 +92,12 @@ impl<'a> Camera<'a> {
         )
         .normalized();
 
-        Ray::new(orig * transform, dir * transform, time, wavelength, false)
+        Ray {
+            orig: orig * transform,
+            dir: dir * transform,
+            time: time,
+            wavelength: wavelength,
+            max_t: std::f32::INFINITY,
+        }
     }
 }
diff --git a/src/light/rectangle_light.rs b/src/light/rectangle_light.rs
index 98bae49..e399d68 100644
--- a/src/light/rectangle_light.rs
+++ b/src/light/rectangle_light.rs
@@ -6,7 +6,7 @@ use crate::{
     color::{Color, SpectralSample},
     lerp::lerp_slice,
     math::{cross, dot, Matrix4x4, Normal, Point, Vector},
-    ray::{AccelRay, Ray},
+    ray::{RayBatch, RayStack},
     sampling::{
         spherical_triangle_solid_angle, triangle_surface_area, uniform_sample_spherical_triangle,
         uniform_sample_triangle,
@@ -257,20 +257,23 @@ impl<'a> SurfaceLight for RectangleLight<'a> {
 impl<'a> Surface for RectangleLight<'a> {
     fn intersect_rays(
         &self,
-        accel_rays: &mut [AccelRay],
-        wrays: &[Ray],
+        rays: &mut RayBatch,
+        ray_stack: &mut RayStack,
         isects: &mut [SurfaceIntersection],
         shader: &SurfaceShader,
         space: &[Matrix4x4],
     ) {
         let _ = shader; // Silence 'unused' warning
 
-        for r in accel_rays.iter_mut() {
-            let wr = &wrays[r.id as usize];
+        ray_stack.pop_do_next_task(|ray_idx| {
+            let time = rays.time(ray_idx);
+            let orig = rays.orig(ray_idx);
+            let dir = rays.dir(ray_idx);
+            let max_t = rays.max_t(ray_idx);
 
             // Calculate time interpolated values
-            let dim = lerp_slice(self.dimensions, r.time);
-            let xform = lerp_slice(space, r.time);
+            let dim = lerp_slice(self.dimensions, time);
+            let xform = lerp_slice(space, time);
 
             let space_inv = xform.inverse();
 
@@ -281,18 +284,19 @@ impl<'a> Surface for RectangleLight<'a> {
             let p4 = Point::new(dim.0 * 0.5, dim.1 * -0.5, 0.0) * space_inv;
 
             // Test against two triangles that make up the light
+            let ray_pre = triangle::RayTriPrecompute::new(dir);
             for tri in &[(p1, p2, p3), (p3, p4, p1)] {
-                if let Some((t, b0, b1, b2)) = triangle::intersect_ray(wr, *tri) {
-                    if t < r.max_t {
-                        if r.is_occlusion() {
-                            isects[r.id as usize] = SurfaceIntersection::Occlude;
-                            r.mark_done();
+                if let Some((t, b0, b1, b2)) = triangle::intersect_ray(orig, ray_pre, max_t, *tri) {
+                    if t < max_t {
+                        if rays.is_occlusion(ray_idx) {
+                            isects[ray_idx] = SurfaceIntersection::Occlude;
+                            rays.mark_done(ray_idx);
                         } else {
                             let (pos, pos_err) = triangle::surface_point(*tri, (b0, b1, b2));
                             let normal = cross(tri.0 - tri.1, tri.0 - tri.2).into_normal();
 
                             let intersection_data = SurfaceIntersectionData {
-                                incoming: wr.dir,
+                                incoming: dir,
                                 t: t,
                                 pos: pos,
                                 pos_err: pos_err,
@@ -301,35 +305,35 @@ impl<'a> Surface for RectangleLight<'a> {
                                 local_space: xform,
                                 sample_pdf: self.sample_pdf(
                                     &xform,
-                                    wr.orig,
-                                    wr.dir,
+                                    orig,
+                                    dir,
                                     pos,
-                                    wr.wavelength,
-                                    r.time,
+                                    rays.wavelength(ray_idx),
+                                    time,
                                 ),
                             };
 
                             let closure = {
                                 let inv_surface_area = (1.0 / (dim.0 as f64 * dim.1 as f64)) as f32;
-                                let color = lerp_slice(self.colors, r.time) * inv_surface_area;
+                                let color = lerp_slice(self.colors, time) * inv_surface_area;
                                 SurfaceClosure::Emit(color)
                             };
 
                             // Fill in intersection
-                            isects[r.id as usize] = SurfaceIntersection::Hit {
+                            isects[ray_idx] = SurfaceIntersection::Hit {
                                 intersection_data: intersection_data,
                                 closure: closure,
                             };
 
                             // Set ray's max t
-                            r.max_t = t;
+                            rays.set_max_t(ray_idx, t);
                         }
 
                         break;
                     }
                 }
             }
-        }
+        });
     }
 }
 
diff --git a/src/light/sphere_light.rs b/src/light/sphere_light.rs
index 2323902..e17371f 100644
--- a/src/light/sphere_light.rs
+++ b/src/light/sphere_light.rs
@@ -8,7 +8,7 @@ use crate::{
     color::{Color, SpectralSample},
     lerp::lerp_slice,
     math::{coordinate_system_from_vector, dot, Matrix4x4, Normal, Point, Vector},
-    ray::{AccelRay, Ray},
+    ray::{RayBatch, RayStack},
     sampling::{uniform_sample_cone, uniform_sample_cone_pdf, uniform_sample_sphere},
     shading::surface_closure::SurfaceClosure,
     shading::SurfaceShader,
@@ -206,26 +206,26 @@ impl<'a> SurfaceLight for SphereLight<'a> {
 impl<'a> Surface for SphereLight<'a> {
     fn intersect_rays(
         &self,
-        accel_rays: &mut [AccelRay],
-        wrays: &[Ray],
+        rays: &mut RayBatch,
+        ray_stack: &mut RayStack,
         isects: &mut [SurfaceIntersection],
         shader: &SurfaceShader,
         space: &[Matrix4x4],
     ) {
         let _ = shader; // Silence 'unused' warning
 
-        for r in accel_rays.iter_mut() {
-            let wr = &wrays[r.id as usize];
+        ray_stack.pop_do_next_task(|ray_idx| {
+            let time = rays.time(ray_idx);
 
             // Get the transform space
-            let xform = lerp_slice(space, r.time);
+            let xform = lerp_slice(space, time);
 
             // Get the radius of the sphere at the ray's time
-            let radius = lerp_slice(self.radii, r.time); // Radius of the sphere
+            let radius = lerp_slice(self.radii, time); // Radius of the sphere
 
             // Get the ray origin and direction in local space
-            let orig = r.orig.into_vector();
-            let dir = wr.dir * xform;
+            let orig = (rays.orig(ray_idx) * xform).into_vector();
+            let dir = rays.dir(ray_idx) * xform;
 
             // Code adapted to Rust from https://github.com/Tecla/Rayito
             // Ray-sphere intersection can result in either zero, one or two points
@@ -242,7 +242,7 @@ impl<'a> Surface for SphereLight<'a> {
             let discriminant = (b * b) - (4.0 * a * c);
             if discriminant < 0.0 {
                 // Discriminant less than zero?  No solution => no intersection.
-                continue;
+                return;
             }
             let discriminant = discriminant.sqrt();
 
@@ -257,7 +257,7 @@ impl<'a> Surface for SphereLight<'a> {
 
             // Get our final parametric values
             let mut t0 = q / a;
-            let mut t1 = if q != 0.0 { c / q } else { r.max_t };
+            let mut t1 = if q != 0.0 { c / q } else { rays.max_t(ray_idx) };
 
             // Swap them so they are ordered right
             if t0 > t1 {
@@ -266,25 +266,25 @@ impl<'a> Surface for SphereLight<'a> {
             }
 
             // Check our intersection for validity against this ray's extents
-            if t0 > r.max_t || t1 <= 0.0 {
-                // Didn't hit because shere is entirely outside of ray's extents
-                continue;
+            if t0 > rays.max_t(ray_idx) || t1 <= 0.0 {
+                // Didn't hit because sphere is entirely outside of ray's extents
+                return;
             }
 
             let t = if t0 > 0.0 {
                 t0
-            } else if t1 <= r.max_t {
+            } else if t1 <= rays.max_t(ray_idx) {
                 t1
             } else {
                 // Didn't hit because ray is entirely within the sphere, and
                 // therefore doesn't hit its surface.
-                continue;
+                return;
             };
 
             // We hit the sphere, so calculate intersection info.
-            if r.is_occlusion() {
-                isects[r.id as usize] = SurfaceIntersection::Occlude;
-                r.mark_done();
+            if rays.is_occlusion(ray_idx) {
+                isects[ray_idx] = SurfaceIntersection::Occlude;
+                rays.mark_done(ray_idx);
             } else {
                 let inv_xform = xform.inverse();
 
@@ -300,7 +300,7 @@ impl<'a> Surface for SphereLight<'a> {
                 let normal = unit_pos.into_normal() * inv_xform;
 
                 let intersection_data = SurfaceIntersectionData {
-                    incoming: wr.dir,
+                    incoming: rays.dir(ray_idx),
                     t: t,
                     pos: pos,
                     pos_err: pos_err,
@@ -309,32 +309,32 @@ impl<'a> Surface for SphereLight<'a> {
                     local_space: xform,
                     sample_pdf: self.sample_pdf(
                         &xform,
-                        wr.orig,
-                        wr.dir,
+                        rays.orig(ray_idx),
+                        rays.dir(ray_idx),
                         0.0,
                         0.0,
-                        wr.wavelength,
-                        r.time,
+                        rays.wavelength(ray_idx),
+                        time,
                     ),
                 };
 
                 let closure = {
                     let inv_surface_area =
                         (1.0 / (4.0 * PI_64 * radius as f64 * radius as f64)) as f32;
-                    let color = lerp_slice(self.colors, r.time) * inv_surface_area;
+                    let color = lerp_slice(self.colors, time) * inv_surface_area;
                     SurfaceClosure::Emit(color)
                 };
 
                 // Fill in intersection
-                isects[r.id as usize] = SurfaceIntersection::Hit {
+                isects[ray_idx] = SurfaceIntersection::Hit {
                     intersection_data: intersection_data,
                     closure: closure,
                 };
 
                 // Set ray's max t
-                r.max_t = t;
+                rays.set_max_t(ray_idx, t);
             }
-        }
+        });
     }
 }
 
diff --git a/src/main.rs b/src/main.rs
index c1f5cef..f469e98 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -11,12 +11,12 @@
 #![allow(clippy::needless_range_loop)]
 #![allow(clippy::excessive_precision)]
 
-#[macro_use]
 extern crate lazy_static;
 
 mod accel;
 mod algorithm;
 mod bbox;
+mod bbox4;
 mod boundable;
 mod camera;
 mod color;
@@ -47,10 +47,9 @@ use nom::{error_position, take_until};
 use mem_arena::MemArena;
 
 use crate::{
-    accel::{BVH4Node, BVHNode},
+    accel::BVH4Node,
     bbox::BBox,
     parse::{parse_scene, DataTree},
-    ray::{AccelRay, Ray},
     renderer::LightPath,
     surface::SurfaceIntersection,
     timer::Timer,
@@ -159,15 +158,13 @@ fn main() {
 
     // Print some misc useful dev info.
     if args.is_present("dev") {
-        println!("Ray size:       {} bytes", mem::size_of::<Ray>());
-        println!("AccelRay size:  {} bytes", mem::size_of::<AccelRay>());
         println!(
             "SurfaceIntersection size:  {} bytes",
             mem::size_of::<SurfaceIntersection>()
         );
         println!("LightPath size: {} bytes", mem::size_of::<LightPath>());
         println!("BBox size: {} bytes", mem::size_of::<BBox>());
-        println!("BVHNode size: {} bytes", mem::size_of::<BVHNode>());
+        // println!("BVHNode size: {} bytes", mem::size_of::<BVHNode>());
         println!("BVH4Node size: {} bytes", mem::size_of::<BVH4Node>());
         return;
     }
@@ -295,9 +292,10 @@ fn main() {
                         "\t\tTrace:                  {:.3}s",
                         ntime * rstats.trace_time
                     );
+                    println!("\t\t\tRays traced:          {}", rstats.ray_count);
                     println!(
-                        "\t\t\tTraversal:            {:.3}s",
-                        ntime * rstats.accel_traversal_time
+                        "\t\t\tRays/sec:             {}",
+                        (rstats.ray_count as f64 / (ntime * rstats.trace_time) as f64) as u64
                     );
                     println!("\t\t\tRay/node tests:       {}", rstats.accel_node_visits);
                     println!(
diff --git a/src/ray.rs b/src/ray.rs
index cf91b74..7c2bc83 100644
--- a/src/ray.rs
+++ b/src/ray.rs
@@ -1,102 +1,401 @@
 #![allow(dead_code)]
 
-use float4::Float4;
+use float4::{Bool4, Float4};
 
 use crate::math::{Matrix4x4, Point, Vector};
 
-const OCCLUSION_FLAG: u32 = 1;
-const DONE_FLAG: u32 = 1 << 1;
+type RayIndexType = u16;
+type FlagType = u8;
+const OCCLUSION_FLAG: FlagType = 1;
+const DONE_FLAG: FlagType = 1 << 1;
 
+/// This is never used directly in ray tracing--it's only used as a convenience
+/// for filling the RayBatch structure.
 #[derive(Debug, Copy, Clone)]
 pub struct Ray {
     pub orig: Point,
     pub dir: Vector,
-    pub max_t: f32,
     pub time: f32,
     pub wavelength: f32,
-    pub flags: u32,
+    pub max_t: f32,
 }
 
-impl Ray {
-    pub fn new(orig: Point, dir: Vector, time: f32, wavelength: f32, is_occ: bool) -> Ray {
-        if !is_occ {
-            Ray {
-                orig: orig,
-                dir: dir,
-                max_t: std::f32::INFINITY,
-                time: time,
-                wavelength: wavelength,
-                flags: 0,
-            }
-        } else {
-            Ray {
-                orig: orig,
-                dir: dir,
-                max_t: 1.0,
-                time: time,
-                wavelength: wavelength,
-                flags: OCCLUSION_FLAG,
-            }
+/// The hot (frequently accessed) parts of ray data.
+#[derive(Debug, Copy, Clone)]
+struct RayHot {
+    orig_local: Point,     // Local-space ray origin
+    dir_inv_local: Vector, // Local-space 1.0/ray direction
+    max_t: f32,
+    time: f32,
+    flags: FlagType,
+}
+
+/// The cold (infrequently accessed) parts of ray data.
+#[derive(Debug, Copy, Clone)]
+struct RayCold {
+    orig: Point, // World-space ray origin
+    dir: Vector, // World-space ray direction
+    wavelength: f32,
+}
+
+/// A batch of rays, separated into hot and cold parts.
+#[derive(Debug)]
+pub struct RayBatch {
+    hot: Vec<RayHot>,
+    cold: Vec<RayCold>,
+}
+
+impl RayBatch {
+    /// Creates a new empty ray batch.
+    pub fn new() -> RayBatch {
+        RayBatch {
+            hot: Vec::new(),
+            cold: Vec::new(),
         }
     }
 
-    pub fn transform(&mut self, mat: &Matrix4x4) {
-        self.orig = self.orig * *mat;
-        self.dir = self.dir * *mat;
+    /// Creates a new empty ray batch, with pre-allocated capacity for
+    /// `n` rays.
+    pub fn with_capacity(n: usize) -> RayBatch {
+        RayBatch {
+            hot: Vec::with_capacity(n),
+            cold: Vec::with_capacity(n),
+        }
     }
 
-    pub fn is_occlusion(&self) -> bool {
-        (self.flags & OCCLUSION_FLAG) != 0
-    }
-}
-
-#[derive(Debug, Copy, Clone)]
-pub struct AccelRay {
-    pub orig: Point,
-    pub dir_inv: Vector,
-    pub max_t: f32,
-    pub time: f32,
-    pub flags: u32,
-    pub id: u32,
-}
-
-impl AccelRay {
-    pub fn new(ray: &Ray, id: u32) -> AccelRay {
-        AccelRay {
-            orig: ray.orig,
-            dir_inv: Vector {
-                co: Float4::splat(1.0) / ray.dir.co,
-            },
+    pub fn push(&mut self, ray: Ray, is_occlusion: bool) {
+        self.hot.push(RayHot {
+            orig_local: ray.orig,   // Bogus, to place-hold.
+            dir_inv_local: ray.dir, // Bogus, to place-hold.
             max_t: ray.max_t,
             time: ray.time,
-            flags: ray.flags,
-            id: id,
+            flags: if is_occlusion { OCCLUSION_FLAG } else { 0 },
+        });
+        self.cold.push(RayCold {
+            orig: ray.orig,
+            dir: ray.dir,
+            wavelength: ray.wavelength,
+        });
+    }
+
+    pub fn swap(&mut self, a: usize, b: usize) {
+        self.hot.swap(a, b);
+        self.cold.swap(a, b);
+    }
+
+    pub fn set_from_ray(&mut self, ray: &Ray, is_occlusion: bool, idx: usize) {
+        self.hot[idx].orig_local = ray.orig;
+        self.hot[idx].dir_inv_local = Vector {
+            co: Float4::splat(1.0) / ray.dir.co,
+        };
+        self.hot[idx].max_t = ray.max_t;
+        self.hot[idx].time = ray.time;
+        self.hot[idx].flags = if is_occlusion { OCCLUSION_FLAG } else { 0 };
+
+        self.cold[idx].orig = ray.orig;
+        self.cold[idx].dir = ray.dir;
+        self.cold[idx].wavelength = ray.wavelength;
+    }
+
+    pub fn truncate(&mut self, len: usize) {
+        self.hot.truncate(len);
+        self.cold.truncate(len);
+    }
+
+    /// Clear all rays, setting the size of the batch back to zero.
+    ///
+    /// Capacity is maintained.
+    pub fn clear(&mut self) {
+        self.hot.clear();
+        self.cold.clear();
+    }
+
+    pub fn len(&self) -> usize {
+        self.hot.len()
+    }
+
+    /// Updates the accel data of the given ray (at index `idx`) with the
+    /// given world-to-local-space transform matrix.
+    ///
+    /// This should be called when entering (and exiting) traversal of a
+    /// new transform space.
+    pub fn update_local(&mut self, idx: usize, xform: &Matrix4x4) {
+        self.hot[idx].orig_local = self.cold[idx].orig * *xform;
+        self.hot[idx].dir_inv_local = Vector {
+            co: Float4::splat(1.0) / (self.cold[idx].dir * *xform).co,
+        };
+    }
+
+    //==========================================================
+    // Data access
+
+    #[inline(always)]
+    pub fn orig(&self, idx: usize) -> Point {
+        self.cold[idx].orig
+    }
+
+    #[inline(always)]
+    pub fn dir(&self, idx: usize) -> Vector {
+        self.cold[idx].dir
+    }
+
+    #[inline(always)]
+    pub fn orig_local(&self, idx: usize) -> Point {
+        self.hot[idx].orig_local
+    }
+
+    #[inline(always)]
+    pub fn dir_inv_local(&self, idx: usize) -> Vector {
+        self.hot[idx].dir_inv_local
+    }
+
+    #[inline(always)]
+    pub fn time(&self, idx: usize) -> f32 {
+        self.hot[idx].time
+    }
+
+    #[inline(always)]
+    pub fn max_t(&self, idx: usize) -> f32 {
+        self.hot[idx].max_t
+    }
+
+    #[inline(always)]
+    pub fn set_max_t(&mut self, idx: usize, new_max_t: f32) {
+        self.hot[idx].max_t = new_max_t;
+    }
+
+    #[inline(always)]
+    pub fn wavelength(&self, idx: usize) -> f32 {
+        self.cold[idx].wavelength
+    }
+
+    /// Returns whether the given ray (at index `idx`) is an occlusion ray.
+    #[inline(always)]
+    pub fn is_occlusion(&self, idx: usize) -> bool {
+        (self.hot[idx].flags & OCCLUSION_FLAG) != 0
+    }
+
+    /// Returns whether the given ray (at index `idx`) has finished traversal.
+    #[inline(always)]
+    pub fn is_done(&self, idx: usize) -> bool {
+        (self.hot[idx].flags & DONE_FLAG) != 0
+    }
+
+    /// Marks the given ray (at index `idx`) as an occlusion ray.
+    #[inline(always)]
+    pub fn mark_occlusion(&mut self, idx: usize) {
+        self.hot[idx].flags |= OCCLUSION_FLAG
+    }
+
+    /// Marks the given ray (at index `idx`) as having finished traversal.
+    #[inline(always)]
+    pub fn mark_done(&mut self, idx: usize) {
+        self.hot[idx].flags |= DONE_FLAG
+    }
+}
+
+/// A structure used for tracking traversal of a ray batch through a scene.
+#[derive(Debug)]
+pub struct RayStack {
+    lanes: Vec<Lane>,
+    tasks: Vec<RayTask>,
+}
+
+impl RayStack {
+    pub fn new() -> RayStack {
+        RayStack {
+            lanes: Vec::new(),
+            tasks: Vec::new(),
         }
     }
 
-    pub fn update_from_world_ray(&mut self, wr: &Ray) {
-        self.orig = wr.orig;
-        self.dir_inv = Vector {
-            co: Float4::splat(1.0) / wr.dir.co,
-        };
+    /// Returns whether the stack is empty of tasks or not.
+    pub fn is_empty(&self) -> bool {
+        self.tasks.is_empty()
     }
 
-    pub fn update_from_xformed_world_ray(&mut self, wr: &Ray, mat: &Matrix4x4) {
-        self.orig = wr.orig * *mat;
-        self.dir_inv = Vector {
-            co: Float4::splat(1.0) / (wr.dir * *mat).co,
-        };
+    /// Makes sure there are at least `count` lanes.
+    pub fn ensure_lane_count(&mut self, count: usize) {
+        while self.lanes.len() < count {
+            self.lanes.push(Lane {
+                idxs: Vec::new(),
+                end_len: 0,
+            })
+        }
     }
 
-    pub fn is_occlusion(&self) -> bool {
-        (self.flags & OCCLUSION_FLAG) != 0
+    pub fn ray_count_in_next_task(&self) -> usize {
+        let task = self.tasks.last().unwrap();
+        let end = self.lanes[task.lane].end_len;
+        end - task.start_idx
     }
 
-    pub fn is_done(&self) -> bool {
-        (self.flags & DONE_FLAG) != 0
+    pub fn next_task_ray_idx(&self, i: usize) -> usize {
+        let task = self.tasks.last().unwrap();
+        let i = i + task.start_idx;
+        debug_assert!(i < self.lanes[task.lane].end_len);
+        self.lanes[task.lane].idxs[i] as usize
     }
 
-    pub fn mark_done(&mut self) {
-        self.flags |= DONE_FLAG;
+    /// Clears the lanes and tasks of the RayStack.
+    ///
+    /// Note: this is (importantly) different than calling clear individually
+    /// on the `lanes` and `tasks` members.  Specifically, we don't want to
+    /// clear `lanes` itself, as that would also free all the memory of the
+    /// individual lanes.  Instead, we want to iterate over the individual
+    /// lanes and clear them, but leave `lanes` itself untouched.
+    pub fn clear(&mut self) {
+        for lane in self.lanes.iter_mut() {
+            lane.idxs.clear();
+            lane.end_len = 0;
+        }
+
+        self.tasks.clear();
+    }
+
+    /// Pushes `ray_idx` onto the end of `lane`; panics if either is out of range.
+    pub fn push_ray_index(&mut self, ray_idx: usize, lane: usize) {
+        assert!(lane < self.lanes.len() && ray_idx <= RayIndexType::max_value() as usize);
+        self.lanes[lane].idxs.push(ray_idx as RayIndexType);
+    }
+
+    /// Turns the not-yet-tasked tail of the given lane into a new task on
+    /// top of the task stack.  Returns `true` if a task was pushed, and
+    /// `false` (changing nothing) if the lane has no indices past its
+    /// `end_len`.
+    pub fn push_lane_to_task(&mut self, lane_idx: usize) -> bool {
+        let lane = &mut self.lanes[lane_idx];
+        let (tasked_len, total_len) = (lane.end_len, lane.idxs.len());
+        if tasked_len >= total_len {
+            return false;
+        }
+        lane.end_len = total_len;
+        self.tasks.push(RayTask {
+            lane: lane_idx,
+            start_idx: tasked_len,
+        });
+        true
+    }
+
+    /// Takes the given list of lane indices, and pushes any excess indices on
+    /// the end of each into a new task, in the order provided.
+    pub fn push_lanes_to_tasks(&mut self, lane_idxs: &[usize]) {
+        for &l in lane_idxs {
+            self.push_lane_to_task(l);
+        }
+    }
+
+    pub fn duplicate_next_task(&mut self) {
+        let task = self.tasks.last().unwrap();
+        let l = task.lane;
+        let start = task.start_idx;
+        let end = self.lanes[l].end_len;
+
+        // Extend the indices vector
+        self.lanes[l].idxs.reserve(end - start);
+        let old_len = self.lanes[l].idxs.len();
+        let new_len = old_len + end - start;
+        unsafe {
+            self.lanes[l].idxs.set_len(new_len);
+        }
+
+        // Copy elements
+        copy_in_place::copy_in_place(&mut self.lanes[l].idxs, start..end, end);
+
+        // Push the new task onto the stack
+        self.tasks.push(RayTask {
+            lane: l,
+            start_idx: end,
+        });
+
+        self.lanes[l].end_len = self.lanes[l].idxs.len();
+    }
+
+    /// Pops the next task off the stack, dropping its ray indices from its lane.
+    pub fn pop_task(&mut self) {
+        let RayTask { lane, start_idx } = self.tasks.pop().unwrap();
+        self.lanes[lane].idxs.truncate(start_idx);
+        self.lanes[lane].end_len = start_idx;
+    }
+
+    /// Runs `handle_ray` on every ray index of the task on top of the task
+    /// stack, without popping that task.
+    pub fn do_next_task<F>(&mut self, mut handle_ray: F)
+    where
+        F: FnMut(usize),
+    {
+        let task = self.tasks.last().unwrap();
+        let lane = &self.lanes[task.lane];
+
+        // The task's indices run from its start up to the lane's `end_len`.
+        for ray_idx in (task.start_idx..lane.end_len).map(|i| lane.idxs[i]) {
+            handle_ray(ray_idx as usize);
+        }
+    }
+
+    /// Executes the provided closure for each ray index in the next task,
+    /// and then pops that task off the stack.
+    #[inline(always)]
+    pub fn pop_do_next_task<F>(&mut self, handle_ray: F)
+    where
+        F: FnMut(usize),
+    {
+        self.do_next_task(handle_ray);
+        self.pop_task();
+    }
+
+    /// Pops the next task off the stack, executes the provided closure for
+    /// each ray index in the task, and pushes the ray indices back onto the
+    /// indicated lanes.
+    pub fn pop_do_next_task_and_push_rays<F>(&mut self, output_lane_count: usize, mut handle_ray: F)
+    where
+        F: FnMut(usize) -> Bool4,
+    {
+        // Pop the task and do necessary bookkeeping.
+        let task = self.tasks.pop().unwrap();
+        let task_range = (task.start_idx, self.lanes[task.lane].end_len);
+        self.lanes[task.lane].end_len = task.start_idx;
+
+        // SAFETY: this is probably evil, and depends on behavior of Vec that
+        // is not actually promised.  We're essentially truncating the lane to
+        // the start of our task range, but will continue to access its
+        // elements beyond that range via `get_unchecked()` below.  Because the
+        // memory is not freed nor altered, this is safe.  However, again, the
+        // Vec APIs don't promise this behavior.  So:
+        //
+        // TODO: build a slightly different lane abstraction to get this same
+        // efficiency without depending on implicit Vec behavior.
+        unsafe {
+            self.lanes[task.lane].idxs.set_len(task.start_idx);
+        }
+
+        // Execute task.
+        for i in task_range.0..task_range.1 {
+            let ray_idx = *unsafe { self.lanes[task.lane].idxs.get_unchecked(i) };
+            let push_mask = handle_ray(ray_idx as usize);
+            for l in 0..output_lane_count {
+                if push_mask.get_n(l) {
+                    self.lanes[l as usize].idxs.push(ray_idx);
+                }
+            }
+        }
     }
 }
+
+/// A lane within a RayStack.
+#[derive(Debug)]
+struct Lane {
+    idxs: Vec<RayIndexType>,
+    end_len: usize,
+}
+
+/// A task within a RayStack.
+//
+// Specifies the lane that the relevant ray indices are in, and the
+// starting index within that lane.  The relevant indices are always
+// `&[start_idx..]` within the given lane.
+#[derive(Debug)]
+struct RayTask {
+    lane: usize,
+    start_idx: usize,
+}
diff --git a/src/renderer.rs b/src/renderer.rs
index 8f1471f..50d3061 100644
--- a/src/renderer.rs
+++ b/src/renderer.rs
@@ -12,8 +12,7 @@ use scoped_threadpool::Pool;
 use float4::Float4;
 
 use crate::{
-    accel::{ACCEL_NODE_RAY_TESTS, ACCEL_TRAV_TIME},
-    algorithm::partition_pair,
+    accel::ACCEL_NODE_RAY_TESTS,
     color::{map_0_1_to_wavelength, SpectralSample, XYZ},
     fp_utils::robust_ray_origin,
     hash::hash_u32,
@@ -21,7 +20,7 @@ use crate::{
     image::Image,
     math::{fast_logit, upper_power_of_two},
     mis::power_heuristic,
-    ray::Ray,
+    ray::{Ray, RayBatch},
     scene::{Scene, SceneLightSample},
     surface,
     timer::Timer,
@@ -41,8 +40,8 @@ pub struct Renderer<'a> {
 #[derive(Debug, Copy, Clone)]
 pub struct RenderStats {
     pub trace_time: f64,
-    pub accel_traversal_time: f64,
     pub accel_node_visits: u64,
+    pub ray_count: u64,
     pub initial_ray_generation_time: f64,
     pub ray_generation_time: f64,
     pub sample_writing_time: f64,
@@ -53,8 +52,8 @@ impl RenderStats {
     fn new() -> RenderStats {
         RenderStats {
             trace_time: 0.0,
-            accel_traversal_time: 0.0,
             accel_node_visits: 0,
+            ray_count: 0,
             initial_ray_generation_time: 0.0,
             ray_generation_time: 0.0,
             sample_writing_time: 0.0,
@@ -64,8 +63,8 @@ impl RenderStats {
 
     fn collect(&mut self, other: RenderStats) {
         self.trace_time += other.trace_time;
-        self.accel_traversal_time += other.accel_traversal_time;
         self.accel_node_visits += other.accel_node_visits;
+        self.ray_count += other.ray_count;
         self.initial_ray_generation_time += other.initial_ray_generation_time;
         self.ray_generation_time += other.ray_generation_time;
         self.sample_writing_time += other.sample_writing_time;
@@ -207,7 +206,7 @@ impl<'a> Renderer<'a> {
         let mut total_timer = Timer::new();
 
         let mut paths = Vec::new();
-        let mut rays = Vec::new();
+        let mut rays = RayBatch::new();
         let mut tracer = Tracer::from_assembly(&self.scene.root);
         let mut xform_stack = TransformStack::new();
 
@@ -266,7 +265,7 @@ impl<'a> Renderer<'a> {
                             offset + si as u32,
                         );
                         paths.push(path);
-                        rays.push(ray);
+                        rays.push(ray, false);
                     }
                 }
             }
@@ -276,13 +275,20 @@ impl<'a> Renderer<'a> {
             let mut pi = paths.len();
             while pi > 0 {
                 // Test rays against scene
-                let isects = tracer.trace(&rays);
+                let isects = tracer.trace(&mut rays);
                 stats.trace_time += timer.tick() as f64;
 
                 // Determine next rays to shoot based on result
-                pi = partition_pair(&mut paths[..pi], &mut rays[..pi], |i, path, ray| {
-                    path.next(&mut xform_stack, &self.scene, &isects[i], &mut *ray)
-                });
+                let mut new_end = 0;
+                for i in 0..pi {
+                    if paths[i].next(&mut xform_stack, &self.scene, &isects[i], &mut rays, i) {
+                        paths.swap(new_end, i);
+                        rays.swap(new_end, i);
+                        new_end += 1;
+                    }
+                }
+                rays.truncate(new_end);
+                pi = new_end;
                 stats.ray_generation_time += timer.tick() as f64;
             }
 
@@ -338,10 +344,7 @@ impl<'a> Renderer<'a> {
         }
 
         stats.total_time += total_timer.tick() as f64;
-        ACCEL_TRAV_TIME.with(|att| {
-            stats.accel_traversal_time = att.get();
-            att.set(0.0);
-        });
+        stats.ray_count = tracer.rays_traced();
         ACCEL_NODE_RAY_TESTS.with(|anv| {
             stats.accel_node_visits = anv.get();
             anv.set(0);
@@ -431,7 +434,8 @@ impl LightPath {
         xform_stack: &mut TransformStack,
         scene: &Scene,
         isect: &surface::SurfaceIntersection,
-        ray: &mut Ray,
+        rays: &mut RayBatch,
+        ray_idx: usize,
     ) -> bool {
         match self.event {
             //--------------------------------------------------------------------
@@ -496,13 +500,13 @@ impl LightPath {
                             // Distant light
                             SceneLightSample::Distant { direction, .. } => {
                                 let (attenuation, closure_pdf) = closure.evaluate(
-                                    ray.dir,
+                                    rays.dir(ray_idx),
                                     direction,
                                     idata.nor,
                                     idata.nor_g,
                                     self.wavelength,
                                 );
-                                let mut shadow_ray = {
+                                let shadow_ray = {
                                     // Calculate the shadow ray for testing if the light is
                                     // in shadow or not.
                                     let offset_pos = robust_ray_origin(
@@ -511,15 +515,14 @@ impl LightPath {
                                         idata.nor_g.normalized(),
                                         direction,
                                     );
-                                    Ray::new(
-                                        offset_pos,
-                                        direction,
-                                        self.time,
-                                        self.wavelength,
-                                        true,
-                                    )
+                                    Ray {
+                                        orig: offset_pos,
+                                        dir: direction,
+                                        time: self.time,
+                                        wavelength: self.wavelength,
+                                        max_t: std::f32::INFINITY,
+                                    }
                                 };
-                                shadow_ray.max_t = std::f32::INFINITY;
                                 (attenuation, closure_pdf, shadow_ray)
                             }
 
@@ -527,7 +530,7 @@ impl LightPath {
                             SceneLightSample::Surface { sample_geo, .. } => {
                                 let dir = sample_geo.0 - idata.pos;
                                 let (attenuation, closure_pdf) = closure.evaluate(
-                                    ray.dir,
+                                    rays.dir(ray_idx),
                                     dir,
                                     idata.nor,
                                     idata.nor_g,
@@ -548,13 +551,13 @@ impl LightPath {
                                         sample_geo.1.normalized(),
                                         -dir,
                                     );
-                                    Ray::new(
-                                        offset_pos,
-                                        offset_end - offset_pos,
-                                        self.time,
-                                        self.wavelength,
-                                        true,
-                                    )
+                                    Ray {
+                                        orig: offset_pos,
+                                        dir: offset_end - offset_pos,
+                                        time: self.time,
+                                        wavelength: self.wavelength,
+                                        max_t: 1.0,
+                                    }
                                 };
                                 (attenuation, closure_pdf, shadow_ray)
                             }
@@ -572,7 +575,7 @@ impl LightPath {
                                 light_info.color().e * attenuation.e * self.light_attenuation
                                     / (light_mis_pdf * light_sel_pdf);
 
-                            *ray = shadow_ray;
+                            rays.set_from_ray(&shadow_ray, true, ray_idx);
 
                             true
                         }
@@ -609,8 +612,13 @@ impl LightPath {
                                 idata.nor_g.normalized(),
                                 dir,
                             );
-                            self.next_bounce_ray =
-                                Some(Ray::new(offset_pos, dir, self.time, self.wavelength, false));
+                            self.next_bounce_ray = Some(Ray {
+                                orig: offset_pos,
+                                dir: dir,
+                                time: self.time,
+                                wavelength: self.wavelength,
+                                max_t: std::f32::INFINITY,
+                            });
 
                             true
                         } else {
@@ -626,7 +634,7 @@ impl LightPath {
                         self.event = LightPathEvent::ShadowRay;
                         return true;
                     } else if do_bounce {
-                        *ray = self.next_bounce_ray.unwrap();
+                        rays.set_from_ray(&self.next_bounce_ray.unwrap(), false, ray_idx);
                         self.event = LightPathEvent::BounceRay;
                         self.light_attenuation *= self.next_attenuation_fac;
                         return true;
@@ -657,7 +665,7 @@ impl LightPath {
 
                 // Set up for the next bounce, if any
                 if let Some(ref nbr) = self.next_bounce_ray {
-                    *ray = *nbr;
+                    rays.set_from_ray(nbr, false, ray_idx);
                     self.light_attenuation *= self.next_attenuation_fac;
                     self.event = LightPathEvent::BounceRay;
                     return true;
diff --git a/src/surface/micropoly_batch.rs b/src/surface/micropoly_batch.rs
index 36d686f..8bb9447 100644
--- a/src/surface/micropoly_batch.rs
+++ b/src/surface/micropoly_batch.rs
@@ -8,7 +8,7 @@ use crate::{
     boundable::Boundable,
     lerp::lerp_slice,
     math::{cross, dot, Matrix4x4, Normal, Point},
-    ray::{AccelRay, Ray},
+    ray::{RayBatch, RayStack, RayTask},
     shading::surface_closure::SurfaceClosure,
 };
 
@@ -99,8 +99,8 @@ impl<'a> MicropolyBatch<'a> {
 impl<'a> MicropolyBatch<'a> {
     fn intersect_rays(
         &self,
-        accel_rays: &mut [AccelRay],
-        wrays: &[Ray],
+        rays: &mut RayBatch,
+        ray_stack: &mut RayStack,
         isects: &mut [SurfaceIntersection],
         space: &[Matrix4x4],
     ) {
@@ -112,7 +112,7 @@ impl<'a> MicropolyBatch<'a> {
         };
 
         self.accel
-            .traverse(&mut accel_rays[..], self.indices, |tri_indices, rs| {
+            .traverse(rays, ray_stack, self.indices, |tri_indices, rs| {
                 // For static triangles with static transforms, cache them.
                 let is_cached = self.time_sample_count == 1 && space.len() <= 1;
                 let mut tri = if is_cached {
diff --git a/src/surface/mod.rs b/src/surface/mod.rs
index 9c2b761..2f90223 100644
--- a/src/surface/mod.rs
+++ b/src/surface/mod.rs
@@ -1,6 +1,6 @@
 #![allow(dead_code)]
 
-pub mod micropoly_batch;
+// pub mod micropoly_batch;
 pub mod triangle;
 pub mod triangle_mesh;
 
@@ -9,7 +9,7 @@ use std::fmt::Debug;
 use crate::{
     boundable::Boundable,
     math::{Matrix4x4, Normal, Point, Vector},
-    ray::{AccelRay, Ray},
+    ray::{RayBatch, RayStack},
     shading::surface_closure::SurfaceClosure,
     shading::SurfaceShader,
 };
@@ -17,8 +17,8 @@ use crate::{
 pub trait Surface: Boundable + Debug + Sync {
     fn intersect_rays(
         &self,
-        accel_rays: &mut [AccelRay],
-        wrays: &[Ray],
+        rays: &mut RayBatch,
+        ray_stack: &mut RayStack,
         isects: &mut [SurfaceIntersection],
         shader: &SurfaceShader,
         space: &[Matrix4x4],
diff --git a/src/surface/triangle.rs b/src/surface/triangle.rs
index c252e59..4aed3a3 100644
--- a/src/surface/triangle.rs
+++ b/src/surface/triangle.rs
@@ -1,6 +1,48 @@
 #![allow(dead_code)]
 
-use crate::{fp_utils::fp_gamma, math::Point, ray::Ray};
+use crate::{
+    fp_utils::fp_gamma,
+    math::{Point, Vector},
+};
+
+#[derive(Debug, Copy, Clone)]
+pub struct RayTriPrecompute {
+    i: (usize, usize, usize), // Permuted axis indices (xi, yi, zi) chosen from the ray direction.
+    s: (f32, f32, f32), // Shear constants (sx, sy, sz) computed from the ray direction.
+}
+
+impl RayTriPrecompute {
+    pub fn new(ray_dir: Vector) -> RayTriPrecompute {
+        // Permute axes so the largest |dir| component becomes z; x and y follow cyclically.
+        let (xi, yi, zi) = {
+            let xabs = ray_dir.x().abs();
+            let yabs = ray_dir.y().abs();
+            let zabs = ray_dir.z().abs();
+
+            if xabs > yabs && xabs > zabs {
+                (1, 2, 0) // x strictly largest
+            } else if yabs > zabs {
+                (2, 0, 1) // y largest
+            } else {
+                (0, 1, 2) // z largest (also the fallback on ties)
+            }
+        };
+
+        let dir_x = ray_dir.get_n(xi);
+        let dir_y = ray_dir.get_n(yi);
+        let dir_z = ray_dir.get_n(zi);
+
+        // Shear constants; they map the ray direction onto (0, 0, 1) in the permuted space.
+        let sx = dir_x / dir_z;
+        let sy = dir_y / dir_z;
+        let sz = 1.0 / dir_z; // NOTE(review): unguarded; dir_z is 0 only for an all-zero ray_dir.
+
+        RayTriPrecompute {
+            i: (xi, yi, zi),
+            s: (sx, sy, sz),
+        }
+    }
+}
 
 /// Intersects `ray` with `tri`, returning `Some((t, b0, b1, b2))`, or `None`
 /// if no intersection.
@@ -13,42 +55,23 @@ use crate::{fp_utils::fp_gamma, math::Point, ray::Ray};
 ///
 /// Uses the ray-triangle test from the paper "Watertight Ray/Triangle
 /// Intersection" by Woop et al.
-pub fn intersect_ray(ray: &Ray, tri: (Point, Point, Point)) -> Option<(f32, f32, f32, f32)> {
-    // Calculate the permuted dimension indices for the new ray space.
-    let (xi, yi, zi) = {
-        let xabs = ray.dir.x().abs();
-        let yabs = ray.dir.y().abs();
-        let zabs = ray.dir.z().abs();
-
-        if xabs > yabs && xabs > zabs {
-            (1, 2, 0)
-        } else if yabs > zabs {
-            (2, 0, 1)
-        } else {
-            (0, 1, 2)
-        }
-    };
-
-    let dir_x = ray.dir.get_n(xi);
-    let dir_y = ray.dir.get_n(yi);
-    let dir_z = ray.dir.get_n(zi);
-
-    // Calculate shear constants.
-    let sx = dir_x / dir_z;
-    let sy = dir_y / dir_z;
-    let sz = 1.0 / dir_z;
-
+pub fn intersect_ray(
+    ray_orig: Point,
+    ray_pre: RayTriPrecompute,
+    ray_max_t: f32,
+    tri: (Point, Point, Point),
+) -> Option<(f32, f32, f32, f32)> {
     // Calculate vertices in ray space.
-    let p0 = tri.0 - ray.orig;
-    let p1 = tri.1 - ray.orig;
-    let p2 = tri.2 - ray.orig;
+    let p0 = tri.0 - ray_orig;
+    let p1 = tri.1 - ray_orig;
+    let p2 = tri.2 - ray_orig;
 
-    let p0x = p0.get_n(xi) - (sx * p0.get_n(zi));
-    let p0y = p0.get_n(yi) - (sy * p0.get_n(zi));
-    let p1x = p1.get_n(xi) - (sx * p1.get_n(zi));
-    let p1y = p1.get_n(yi) - (sy * p1.get_n(zi));
-    let p2x = p2.get_n(xi) - (sx * p2.get_n(zi));
-    let p2y = p2.get_n(yi) - (sy * p2.get_n(zi));
+    let p0x = p0.get_n(ray_pre.i.0) - (ray_pre.s.0 * p0.get_n(ray_pre.i.2));
+    let p0y = p0.get_n(ray_pre.i.1) - (ray_pre.s.1 * p0.get_n(ray_pre.i.2));
+    let p1x = p1.get_n(ray_pre.i.0) - (ray_pre.s.0 * p1.get_n(ray_pre.i.2));
+    let p1y = p1.get_n(ray_pre.i.1) - (ray_pre.s.1 * p1.get_n(ray_pre.i.2));
+    let p2x = p2.get_n(ray_pre.i.0) - (ray_pre.s.0 * p2.get_n(ray_pre.i.2));
+    let p2y = p2.get_n(ray_pre.i.1) - (ray_pre.s.1 * p2.get_n(ray_pre.i.2));
 
     // Calculate scaled barycentric coordinates.
     let mut e0 = (p1x * p2y) - (p1y * p2x);
@@ -74,14 +97,14 @@ pub fn intersect_ray(ray: &Ray, tri: (Point, Point, Point)) -> Option<(f32, f32,
     }
 
     // Calculate t of hitpoint.
-    let p0z = sz * p0.get_n(zi);
-    let p1z = sz * p1.get_n(zi);
-    let p2z = sz * p2.get_n(zi);
+    let p0z = ray_pre.s.2 * p0.get_n(ray_pre.i.2);
+    let p1z = ray_pre.s.2 * p1.get_n(ray_pre.i.2);
+    let p2z = ray_pre.s.2 * p2.get_n(ray_pre.i.2);
     let t_scaled = (e0 * p0z) + (e1 * p1z) + (e2 * p2z);
 
     // Check if the hitpoint t is within ray min/max t.
-    if (det > 0.0 && (t_scaled <= 0.0 || t_scaled > (ray.max_t * det)))
-        || (det < 0.0 && (t_scaled >= 0.0 || t_scaled < (ray.max_t * det)))
+    if (det > 0.0 && (t_scaled <= 0.0 || t_scaled > (ray_max_t * det)))
+        || (det < 0.0 && (t_scaled >= 0.0 || t_scaled < (ray_max_t * det)))
     {
         return None;
     }
diff --git a/src/surface/triangle_mesh.rs b/src/surface/triangle_mesh.rs
index a067416..43388a8 100644
--- a/src/surface/triangle_mesh.rs
+++ b/src/surface/triangle_mesh.rs
@@ -8,12 +8,14 @@ use crate::{
     boundable::Boundable,
     lerp::lerp_slice,
     math::{cross, dot, Matrix4x4, Normal, Point},
-    ray::{AccelRay, Ray},
+    ray::{RayBatch, RayStack},
     shading::SurfaceShader,
 };
 
 use super::{triangle, Surface, SurfaceIntersection, SurfaceIntersectionData};
 
+const MAX_LEAF_TRIANGLE_COUNT: usize = 3;
+
 #[derive(Copy, Clone, Debug)]
 pub struct TriangleMesh<'a> {
     time_sample_count: usize,
@@ -93,7 +95,7 @@ impl<'a> TriangleMesh<'a> {
         };
 
         // Build BVH
-        let accel = BVH4::from_objects(arena, &mut indices[..], 3, |tri| {
+        let accel = BVH4::from_objects(arena, &mut indices[..], MAX_LEAF_TRIANGLE_COUNT, |tri| {
             &bounds
                 [(tri.3 as usize * time_sample_count)..((tri.3 as usize + 1) * time_sample_count)]
         });
@@ -117,8 +119,8 @@ impl<'a> Boundable for TriangleMesh<'a> {
 impl<'a> Surface for TriangleMesh<'a> {
     fn intersect_rays(
         &self,
-        accel_rays: &mut [AccelRay],
-        wrays: &[Ray],
+        rays: &mut RayBatch,
+        ray_stack: &mut RayStack,
         isects: &mut [SurfaceIntersection],
         shader: &SurfaceShader,
         space: &[Matrix4x4],
@@ -131,144 +133,177 @@ impl<'a> Surface for TriangleMesh<'a> {
         };
 
         self.accel
-            .traverse(&mut accel_rays[..], self.indices, |tri_indices, rs| {
-                // For static triangles with static transforms, cache them.
-                let is_cached = self.time_sample_count == 1 && space.len() <= 1;
-                let mut tri = if is_cached {
-                    let tri = (
-                        self.vertices[tri_indices.0 as usize],
-                        self.vertices[tri_indices.1 as usize],
-                        self.vertices[tri_indices.2 as usize],
-                    );
-                    if space.is_empty() {
-                        tri
-                    } else {
-                        (
-                            tri.0 * static_mat_space,
-                            tri.1 * static_mat_space,
-                            tri.2 * static_mat_space,
-                        )
-                    }
-                } else {
-                    unsafe { std::mem::uninitialized() }
-                };
+            .traverse(rays, ray_stack, |idx_range, rays, ray_stack| {
+                let tri_count = idx_range.end - idx_range.start;
 
-                // Test each ray against the current triangle.
-                for r in rs {
-                    let wr = &wrays[r.id as usize];
+                // Build the triangle cache if we can! (Zeroed; assumes zero bits are a valid `Point`.)
+                let is_cached = ray_stack.ray_count_in_next_task() >= tri_count
+                    && self.time_sample_count == 1
+                    && space.len() <= 1;
+                let mut tri_cache = [unsafe { std::mem::zeroed() }; MAX_LEAF_TRIANGLE_COUNT];
+                if is_cached {
+                    for tri_idx in idx_range.clone() {
+                        let i = tri_idx - idx_range.start;
+                        let tri_indices = self.indices[tri_idx];
 
-                    // Get triangle if necessary
-                    if !is_cached {
-                        tri = if self.time_sample_count == 1 {
-                            // No deformation motion blur, so fast-path it.
-                            (
-                                self.vertices[tri_indices.0 as usize],
-                                self.vertices[tri_indices.1 as usize],
-                                self.vertices[tri_indices.2 as usize],
-                            )
-                        } else {
-                            // Deformation motion blur, need to interpolate.
-                            let p0_slice = &self.vertices[(tri_indices.0 as usize
-                                * self.time_sample_count)
-                                ..((tri_indices.0 as usize + 1) * self.time_sample_count)];
-                            let p1_slice = &self.vertices[(tri_indices.1 as usize
-                                * self.time_sample_count)
-                                ..((tri_indices.1 as usize + 1) * self.time_sample_count)];
-                            let p2_slice = &self.vertices[(tri_indices.2 as usize
-                                * self.time_sample_count)
-                                ..((tri_indices.2 as usize + 1) * self.time_sample_count)];
-
-                            let p0 = lerp_slice(p0_slice, wr.time);
-                            let p1 = lerp_slice(p1_slice, wr.time);
-                            let p2 = lerp_slice(p2_slice, wr.time);
-
-                            (p0, p1, p2)
-                        };
-                    }
-
-                    // Transform triangle if necessary, and get transform space.
-                    let mat_space = if !space.is_empty() {
-                        if space.len() > 1 {
-                            // Per-ray transform, for motion blur
-                            let mat_space = lerp_slice(space, wr.time).inverse();
-                            tri = (tri.0 * mat_space, tri.1 * mat_space, tri.2 * mat_space);
-                            mat_space
-                        } else {
-                            // Same transform for all rays
-                            if !is_cached {
-                                tri = (
-                                    tri.0 * static_mat_space,
-                                    tri.1 * static_mat_space,
-                                    tri.2 * static_mat_space,
-                                );
-                            }
-                            static_mat_space
-                        }
-                    } else {
-                        // No transforms
-                        Matrix4x4::new()
-                    };
-
-                    // Test ray against triangle
-                    if let Some((t, b0, b1, b2)) = triangle::intersect_ray(wr, tri) {
-                        if t < r.max_t {
-                            if r.is_occlusion() {
-                                isects[r.id as usize] = SurfaceIntersection::Occlude;
-                                r.mark_done();
-                            } else {
-                                // Calculate intersection point and error magnitudes
-                                let (pos, pos_err) = triangle::surface_point(tri, (b0, b1, b2));
-
-                                // Calculate geometric surface normal
-                                let geo_normal = cross(tri.0 - tri.1, tri.0 - tri.2).into_normal();
-
-                                // Calculate interpolated surface normal, if any
-                                let shading_normal = if let Some(normals) = self.normals {
-                                    let n0_slice = &normals[(tri_indices.0 as usize
-                                        * self.time_sample_count)
-                                        ..((tri_indices.0 as usize + 1) * self.time_sample_count)];
-                                    let n1_slice = &normals[(tri_indices.1 as usize
-                                        * self.time_sample_count)
-                                        ..((tri_indices.1 as usize + 1) * self.time_sample_count)];
-                                    let n2_slice = &normals[(tri_indices.2 as usize
-                                        * self.time_sample_count)
-                                        ..((tri_indices.2 as usize + 1) * self.time_sample_count)];
-
-                                    let n0 = lerp_slice(n0_slice, wr.time).normalized();
-                                    let n1 = lerp_slice(n1_slice, wr.time).normalized();
-                                    let n2 = lerp_slice(n2_slice, wr.time).normalized();
-
-                                    let s_nor = ((n0 * b0) + (n1 * b1) + (n2 * b2)) * mat_space;
-                                    if dot(s_nor, geo_normal) >= 0.0 {
-                                        s_nor
-                                    } else {
-                                        -s_nor
-                                    }
-                                } else {
-                                    geo_normal
-                                };
-
-                                let intersection_data = SurfaceIntersectionData {
-                                    incoming: wr.dir,
-                                    t: t,
-                                    pos: pos,
-                                    pos_err: pos_err,
-                                    nor: shading_normal,
-                                    nor_g: geo_normal,
-                                    local_space: mat_space,
-                                    sample_pdf: 0.0,
-                                };
-
-                                // Fill in intersection data
-                                isects[r.id as usize] = SurfaceIntersection::Hit {
-                                    intersection_data: intersection_data,
-                                    closure: shader.shade(&intersection_data, wr.time),
-                                };
-                                r.max_t = t;
-                            }
+                        // For static triangles with static transforms, cache them.
+                        tri_cache[i] = (
+                            self.vertices[tri_indices.0 as usize],
+                            self.vertices[tri_indices.1 as usize],
+                            self.vertices[tri_indices.2 as usize],
+                        );
+                        if !space.is_empty() {
+                            tri_cache[i].0 = tri_cache[i].0 * static_mat_space;
+                            tri_cache[i].1 = tri_cache[i].1 * static_mat_space;
+                            tri_cache[i].2 = tri_cache[i].2 * static_mat_space;
                         }
                     }
                 }
+
+                // Test each ray against the triangles.
+                ray_stack.do_next_task(|ray_idx| {
+                    let ray_idx = ray_idx as usize;
+
+                    if rays.is_done(ray_idx) {
+                        return;
+                    }
+
+                    let ray_time = rays.time(ray_idx);
+
+                    // Calculate the ray space, if necessary.
+                    let mat_space = if space.len() > 1 {
+                        // Per-ray transform, for motion blur
+                        lerp_slice(space, ray_time).inverse()
+                    } else {
+                        static_mat_space
+                    };
+
+                    // Test the ray against each triangle. `hit_*` stay zeroed until a hit (TODO confirm `Point`).
+                    let mut non_shadow_hit = false;
+                    let mut hit_tri = unsafe { std::mem::zeroed() };
+                    let mut hit_tri_indices = unsafe { std::mem::zeroed() };
+                    let mut hit_tri_data = unsafe { std::mem::zeroed() };
+                    let ray_pre = triangle::RayTriPrecompute::new(rays.dir(ray_idx));
+                    for tri_idx in idx_range.clone() {
+                        let tri_indices = self.indices[tri_idx];
+
+                        // Get triangle if necessary
+                        let tri = if is_cached {
+                            let i = tri_idx - idx_range.start;
+                            tri_cache[i]
+                        } else {
+                            let mut tri = if self.time_sample_count == 1 {
+                                // No deformation motion blur, so fast-path it.
+                                (
+                                    self.vertices[tri_indices.0 as usize],
+                                    self.vertices[tri_indices.1 as usize],
+                                    self.vertices[tri_indices.2 as usize],
+                                )
+                            } else {
+                                // Deformation motion blur, need to interpolate.
+                                let p0_slice = &self.vertices[(tri_indices.0 as usize
+                                    * self.time_sample_count)
+                                    ..((tri_indices.0 as usize + 1) * self.time_sample_count)];
+                                let p1_slice = &self.vertices[(tri_indices.1 as usize
+                                    * self.time_sample_count)
+                                    ..((tri_indices.1 as usize + 1) * self.time_sample_count)];
+                                let p2_slice = &self.vertices[(tri_indices.2 as usize
+                                    * self.time_sample_count)
+                                    ..((tri_indices.2 as usize + 1) * self.time_sample_count)];
+
+                                let p0 = lerp_slice(p0_slice, ray_time);
+                                let p1 = lerp_slice(p1_slice, ray_time);
+                                let p2 = lerp_slice(p2_slice, ray_time);
+
+                                (p0, p1, p2)
+                            };
+
+                            if !space.is_empty() {
+                                tri.0 = tri.0 * mat_space;
+                                tri.1 = tri.1 * mat_space;
+                                tri.2 = tri.2 * mat_space;
+                            }
+
+                            tri
+                        };
+
+                        // Test ray against triangle
+                        if let Some((t, b0, b1, b2)) = triangle::intersect_ray(
+                            rays.orig(ray_idx),
+                            ray_pre,
+                            rays.max_t(ray_idx),
+                            tri,
+                        ) {
+                            if rays.is_occlusion(ray_idx) {
+                                isects[ray_idx] = SurfaceIntersection::Occlude;
+                                rays.mark_done(ray_idx);
+                                break;
+                            } else {
+                                non_shadow_hit = true;
+                                rays.set_max_t(ray_idx, t);
+                                hit_tri = tri;
+                                hit_tri_indices = tri_indices;
+                                hit_tri_data = (t, b0, b1, b2);
+                            }
+                        }
+                    }
+
+                    // Calculate intersection data if necessary.
+                    if non_shadow_hit {
+                        let (t, b0, b1, b2) = hit_tri_data;
+
+                        // Calculate intersection point and error magnitudes
+                        let (pos, pos_err) = triangle::surface_point(hit_tri, (b0, b1, b2));
+
+                        // Calculate geometric surface normal
+                        let geo_normal =
+                            cross(hit_tri.0 - hit_tri.1, hit_tri.0 - hit_tri.2).into_normal();
+
+                        // Calculate interpolated surface normal, if any
+                        let shading_normal = if let Some(normals) = self.normals {
+                            let n0_slice = &normals[(hit_tri_indices.0 as usize
+                                * self.time_sample_count)
+                                ..((hit_tri_indices.0 as usize + 1) * self.time_sample_count)];
+                            let n1_slice = &normals[(hit_tri_indices.1 as usize
+                                * self.time_sample_count)
+                                ..((hit_tri_indices.1 as usize + 1) * self.time_sample_count)];
+                            let n2_slice = &normals[(hit_tri_indices.2 as usize
+                                * self.time_sample_count)
+                                ..((hit_tri_indices.2 as usize + 1) * self.time_sample_count)];
+
+                            let n0 = lerp_slice(n0_slice, ray_time).normalized();
+                            let n1 = lerp_slice(n1_slice, ray_time).normalized();
+                            let n2 = lerp_slice(n2_slice, ray_time).normalized();
+
+                            let s_nor = ((n0 * b0) + (n1 * b1) + (n2 * b2)) * mat_space;
+                            if dot(s_nor, geo_normal) >= 0.0 {
+                                s_nor
+                            } else {
+                                -s_nor
+                            }
+                        } else {
+                            geo_normal
+                        };
+
+                        let intersection_data = SurfaceIntersectionData {
+                            incoming: rays.dir(ray_idx),
+                            t: t,
+                            pos: pos,
+                            pos_err: pos_err,
+                            nor: shading_normal,
+                            nor_g: geo_normal,
+                            local_space: mat_space,
+                            sample_pdf: 0.0,
+                        };
+
+                        // Fill in intersection data
+                        isects[ray_idx] = SurfaceIntersection::Hit {
+                            intersection_data: intersection_data,
+                            closure: shader.shade(&intersection_data, ray_time),
+                        };
+                    }
+                });
+                ray_stack.pop_task();
             });
     }
 }
diff --git a/src/tracer.rs b/src/tracer.rs
index 4105dfc..d3b5b09 100644
--- a/src/tracer.rs
+++ b/src/tracer.rs
@@ -1,10 +1,11 @@
 use std::iter;
 
 use crate::{
-    algorithm::partition,
+    accel::ray_code,
     color::{rec709_to_xyz, Color},
     lerp::lerp_slice,
-    ray::{AccelRay, Ray},
+    math::Matrix4x4,
+    ray::{RayBatch, RayStack},
     scene::{Assembly, InstanceType, Object},
     shading::{SimpleSurfaceShader, SurfaceShader},
     surface::SurfaceIntersection,
@@ -12,14 +13,16 @@ use crate::{
 };
 
 pub struct Tracer<'a> {
-    rays: Vec<AccelRay>,
+    ray_trace_count: u64,
+    ray_stack: RayStack,
     inner: TracerInner<'a>,
 }
 
 impl<'a> Tracer<'a> {
     pub fn from_assembly(assembly: &'a Assembly) -> Tracer<'a> {
         Tracer {
-            rays: Vec::new(),
+            ray_trace_count: 0,
+            ray_stack: RayStack::new(),
             inner: TracerInner {
                 root: assembly,
                 xform_stack: TransformStack::new(),
@@ -28,17 +31,13 @@ impl<'a> Tracer<'a> {
         }
     }
 
-    pub fn trace<'b>(&'b mut self, wrays: &[Ray]) -> &'b [SurfaceIntersection] {
-        self.rays.clear();
-        self.rays.reserve(wrays.len());
-        let mut ids = 0..(wrays.len() as u32);
-        self.rays.extend(
-            wrays
-                .iter()
-                .map(|wr| AccelRay::new(wr, ids.next().unwrap())),
-        );
+    pub fn trace<'b>(&'b mut self, rays: &mut RayBatch) -> &'b [SurfaceIntersection] {
+        self.ray_trace_count += rays.len() as u64;
+        self.inner.trace(rays, &mut self.ray_stack)
+    }
 
-        self.inner.trace(wrays, &mut self.rays[..])
+    pub fn rays_traced(&self) -> u64 {
+        self.ray_trace_count
     }
 }
 
@@ -49,16 +48,37 @@ struct TracerInner<'a> {
 }
 
 impl<'a> TracerInner<'a> {
-    fn trace<'b>(&'b mut self, wrays: &[Ray], rays: &mut [AccelRay]) -> &'b [SurfaceIntersection] {
+    fn trace<'b>(
+        &'b mut self,
+        rays: &mut RayBatch,
+        ray_stack: &mut RayStack,
+    ) -> &'b [SurfaceIntersection] {
+        ray_stack.clear();
+
         // Ready the isects
         self.isects.clear();
-        self.isects.reserve(wrays.len());
+        self.isects.reserve(rays.len());
         self.isects
-            .extend(iter::repeat(SurfaceIntersection::Miss).take(wrays.len()));
+            .extend(iter::repeat(SurfaceIntersection::Miss).take(rays.len()));
 
-        let mut ray_sets = split_rays_by_direction(&mut rays[..]);
-        for ray_set in ray_sets.iter_mut().filter(|ray_set| !ray_set.is_empty()) {
-            self.trace_assembly(self.root, wrays, ray_set);
+        // Prep the accel part of the rays.
+        {
+            let ident = Matrix4x4::new();
+            for i in 0..rays.len() {
+                rays.update_local(i, &ident);
+            }
+        }
+
+        // Divide the rays into 8 different lanes by direction.
+        ray_stack.ensure_lane_count(8);
+        for i in 0..rays.len() {
+            ray_stack.push_ray_index(i, ray_code(rays.dir(i)));
+        }
+        ray_stack.push_lanes_to_tasks(&[0, 1, 2, 3, 4, 5, 6, 7]);
+
+        // Trace each of the 8 lanes separately.
+        while !ray_stack.is_empty() {
+            self.trace_assembly(self.root, rays, ray_stack);
         }
 
         &self.isects
@@ -67,82 +87,43 @@ impl<'a> TracerInner<'a> {
     fn trace_assembly<'b>(
         &'b mut self,
         assembly: &Assembly,
-        wrays: &[Ray],
-        accel_rays: &mut [AccelRay],
+        rays: &mut RayBatch,
+        ray_stack: &mut RayStack,
     ) {
         assembly
             .object_accel
-            .traverse(&mut accel_rays[..], &assembly.instances[..], |inst, rs| {
+            .traverse(rays, ray_stack, |idx_range, rays, ray_stack| {
+                let inst = &assembly.instances[idx_range.start];
+
                 // Transform rays if needed
                 if let Some((xstart, xend)) = inst.transform_indices {
                     // Push transforms to stack
                     self.xform_stack.push(&assembly.xforms[xstart..xend]);
 
                     // Do transforms
+                    // TODO: re-divide rays based on direction (maybe?).
                     let xforms = self.xform_stack.top();
-                    for ray in &mut rs[..] {
-                        let id = ray.id;
-                        let t = ray.time;
-                        ray.update_from_xformed_world_ray(
-                            &wrays[id as usize],
-                            &lerp_slice(xforms, t),
-                        );
-                    }
+                    ray_stack.do_next_task(|ray_idx| {
+                        let t = rays.time(ray_idx);
+                        rays.update_local(ray_idx, &lerp_slice(xforms, t));
+                    });
+                    ray_stack.duplicate_next_task();
                 }
 
                 // Trace rays
-                {
-                    // This is kind of weird looking, but what we're doing here is
-                    // splitting the rays up based on direction if they were
-                    // transformed, and not splitting them up if they weren't
-                    // transformed.
-                    // But to keep the actual tracing code in one place (DRY),
-                    // we map both cases to an array slice that contains slices of
-                    // ray arrays.  Gah... that's confusing even when explained.
-                    // TODO: do this in a way that's less confusing.  Probably split
-                    // the tracing code out into a trace_instance() method or
-                    // something.
-                    let mut tmp = if inst.transform_indices.is_some() {
-                        split_rays_by_direction(rs)
-                    } else {
-                        [
-                            &mut rs[..],
-                            &mut [],
-                            &mut [],
-                            &mut [],
-                            &mut [],
-                            &mut [],
-                            &mut [],
-                            &mut [],
-                        ]
-                    };
-                    let ray_sets = if inst.transform_indices.is_some() {
-                        &mut tmp[..]
-                    } else {
-                        &mut tmp[..1]
-                    };
+                match inst.instance_type {
+                    InstanceType::Object => {
+                        self.trace_object(
+                            &assembly.objects[inst.data_index],
+                            inst.surface_shader_index
+                                .map(|i| assembly.surface_shaders[i]),
+                            rays,
+                            ray_stack,
+                        );
+                    }
 
-                    // Loop through the split ray slices and trace them
-                    for ray_set in ray_sets.iter_mut().filter(|ray_set| !ray_set.is_empty()) {
-                        match inst.instance_type {
-                            InstanceType::Object => {
-                                self.trace_object(
-                                    &assembly.objects[inst.data_index],
-                                    inst.surface_shader_index
-                                        .map(|i| assembly.surface_shaders[i]),
-                                    wrays,
-                                    ray_set,
-                                );
-                            }
-
-                            InstanceType::Assembly => {
-                                self.trace_assembly(
-                                    &assembly.assemblies[inst.data_index],
-                                    wrays,
-                                    ray_set,
-                                );
-                            }
-                        }
+                    InstanceType::Assembly => {
+                        self.trace_assembly(&assembly.assemblies[inst.data_index], rays, ray_stack);
                     }
                 }
 
@@ -154,19 +135,15 @@ impl<'a> TracerInner<'a> {
                     // Undo transforms
                     let xforms = self.xform_stack.top();
                     if !xforms.is_empty() {
-                        for ray in &mut rs[..] {
-                            let id = ray.id;
-                            let t = ray.time;
-                            ray.update_from_xformed_world_ray(
-                                &wrays[id as usize],
-                                &lerp_slice(xforms, t),
-                            );
-                        }
+                        ray_stack.pop_do_next_task(|ray_idx| {
+                            let t = rays.time(ray_idx);
+                            rays.update_local(ray_idx, &lerp_slice(xforms, t));
+                        });
                     } else {
-                        for ray in &mut rs[..] {
-                            let id = ray.id;
-                            ray.update_from_world_ray(&wrays[id as usize]);
-                        }
+                        let ident = Matrix4x4::new();
+                        ray_stack.pop_do_next_task(|ray_idx| {
+                            rays.update_local(ray_idx, &ident);
+                        });
                     }
                 }
             });
@@ -176,8 +153,8 @@ impl<'a> TracerInner<'a> {
         &'b mut self,
         obj: &Object,
         surface_shader: Option<&SurfaceShader>,
-        wrays: &[Ray],
-        rays: &mut [AccelRay],
+        rays: &mut RayBatch,
+        ray_stack: &mut RayStack,
     ) {
         match *obj {
             Object::Surface(surface) => {
@@ -188,7 +165,7 @@ impl<'a> TracerInner<'a> {
 
                 surface.intersect_rays(
                     rays,
-                    wrays,
+                    ray_stack,
                     &mut self.isects,
                     shader,
                     self.xform_stack.top(),
@@ -203,7 +180,7 @@ impl<'a> TracerInner<'a> {
 
                 surface.intersect_rays(
                     rays,
-                    wrays,
+                    ray_stack,
                     &mut self.isects,
                     &bogus_shader,
                     self.xform_stack.top(),
@@ -212,27 +189,3 @@ impl<'a> TracerInner<'a> {
         }
     }
 }
-
-fn split_rays_by_direction(rays: &mut [AccelRay]) -> [&mut [AccelRay]; 8] {
-    // |   |   |   |   |   |   |   |   |
-    //     s1  s2  s3  s4  s5  s6  s7
-    let s4 = partition(&mut rays[..], |r| r.dir_inv.x() >= 0.0);
-
-    let s2 = partition(&mut rays[..s4], |r| r.dir_inv.y() >= 0.0);
-    let s6 = s4 + partition(&mut rays[s4..], |r| r.dir_inv.y() >= 0.0);
-
-    let s1 = partition(&mut rays[..s2], |r| r.dir_inv.z() >= 0.0);
-    let s3 = s2 + partition(&mut rays[s2..s4], |r| r.dir_inv.z() >= 0.0);
-    let s5 = s4 + partition(&mut rays[s4..s6], |r| r.dir_inv.z() >= 0.0);
-    let s7 = s6 + partition(&mut rays[s6..], |r| r.dir_inv.z() >= 0.0);
-
-    let (rest, rs7) = rays.split_at_mut(s7);
-    let (rest, rs6) = rest.split_at_mut(s6);
-    let (rest, rs5) = rest.split_at_mut(s5);
-    let (rest, rs4) = rest.split_at_mut(s4);
-    let (rest, rs3) = rest.split_at_mut(s3);
-    let (rest, rs2) = rest.split_at_mut(s2);
-    let (rs0, rs1) = rest.split_at_mut(s1);
-
-    [rs0, rs1, rs2, rs3, rs4, rs5, rs6, rs7]
-}
diff --git a/sub_crates/float4/src/lib.rs b/sub_crates/float4/src/lib.rs
index 4006301..0f081b3 100644
--- a/sub_crates/float4/src/lib.rs
+++ b/sub_crates/float4/src/lib.rs
@@ -620,6 +620,29 @@ mod x86_64_sse {
     }
 
     impl Bool4 {
+        /// Builds a `Bool4` from four bools; a `true` lane has all 32 bits set.
+        #[inline(always)]
+        pub fn new(a: bool, b: bool, c: bool, d: bool) -> Bool4 {
+            use std::arch::x86_64::{_mm_castsi128_ps, _mm_set_epi32};
+            // `1.0` (0x3F80_0000) has a zero low byte, which is the byte
+            // `to_bitmask` samples, so `true` must be the all-ones pattern.
+            let lane = |flag: bool| -(flag as i32);
+            Bool4 {
+                // SAFETY: SSE2 is part of the x86_64 baseline.
+                data: unsafe {
+                    _mm_castsi128_ps(_mm_set_epi32(lane(d), lane(c), lane(b), lane(a)))
+                },
+            }
+        }
+
+        /// Returns a `Bool4` with all four lanes `false` (every bit zero).
+        #[inline(always)]
+        pub fn new_false() -> Bool4 {
+            // SAFETY: SSE is part of the x86_64 baseline.
+            let data = unsafe { std::arch::x86_64::_mm_setzero_ps() };
+            Bool4 { data }
+        }
+
         /// Returns the value of the nth element.
         #[inline(always)]
         pub fn get_n(&self, n: usize) -> bool {
@@ -637,24 +660,34 @@ mod x86_64_sse {
             self.get_n(0)
         }
 
-        /// Returns the value of the 1th element.
+        /// Returns the value of the 1st element.
         #[inline(always)]
         pub fn get_1(&self) -> bool {
             self.get_n(1)
         }
 
-        /// Returns the value of the 2th element.
+        /// Returns the value of the 2nd element.
         #[inline(always)]
         pub fn get_2(&self) -> bool {
             self.get_n(2)
         }
 
-        /// Returns the value of the 3th element.
+        /// Returns the value of the 3rd element.
         #[inline(always)]
         pub fn get_3(&self) -> bool {
             self.get_n(3)
         }
 
+        /// Returns `true` only when none of the four bools is set.
+        ///
+        /// Any non-zero bit in any lane counts as a set bool.
+        #[inline(always)]
+        pub fn is_all_false(&self) -> bool {
+            // SAFETY: `__m128` and `u128` are both 16 bytes, and every bit
+            // pattern is a valid `u128`.
+            unsafe { std::mem::transmute::<__m128, u128>(self.data) == 0 }
+        }
+
         #[inline]
         pub fn to_bitmask(&self) -> u8 {
             let a = unsafe { *(&self.data as *const __m128 as *const u8).offset(0) };
@@ -1236,21 +1269,25 @@ mod fallback {
         det
     }
 
-    /// Essentially a tuple of four bools, which will use SIMD operations
-    /// where possible on a platform.
-    #[cfg(feature = "simd_perf")]
-    #[derive(Debug, Copy, Clone)]
-    pub struct Bool4 {
-        data: bool32fx4,
-    }
-
-    #[cfg(not(feature = "simd_perf"))]
+    /// Essentially a tuple of four bools.
     #[derive(Debug, Copy, Clone)]
     pub struct Bool4 {
         data: [bool; 4],
     }
 
     impl Bool4 {
+        #[inline(always)]
+        pub fn new(a: bool, b: bool, c: bool, d: bool) -> Bool4 {
+            Bool4 { data: [a, b, c, d] }
+        }
+
+        /// Returns a `Bool4` whose four elements are all `false`.
+        #[inline(always)]
+        pub fn new_false() -> Bool4 {
+            let data = [false; 4];
+            Bool4 { data }
+        }
+
         /// Returns the value of the nth element.
         #[inline(always)]
         pub fn get_n(self, n: usize) -> bool {
@@ -1285,6 +1322,15 @@ mod fallback {
             self.get_n(3)
         }
 
+        /// Returns `true` only when none of the four bools is set.
+        ///
+        /// Equivalent to `!(a | b | c | d)`: a single true element is enough
+        /// to make this return false.
+        #[inline(always)]
+        pub fn is_all_false(&self) -> bool {
+            !self.data.iter().any(|&flag| flag)
+        }
+
         #[inline]
         pub fn to_bitmask(self) -> u8 {
             (self.get_0() as u8)
@@ -1565,4 +1611,10 @@ mod tests {
 
         assert_eq!(r, 0b00001010);
     }
+
+    #[test]
+    fn bool4_is_all_false() {
+        assert!(Bool4::new(false, false, false, false).is_all_false());
+        assert!(!Bool4::new(false, false, true, false).is_all_false());
+    }
 }