Initial implementation of ORST traversal.

This is a "just get it working" implementation. Performance optimizations still need to be done.
2019-06-23 18:40:52 +09:00 · 2019-06-23 18:40:52 +09:00 · 630a79aca5
commit 630a79aca5
parent 1a29b16aa2
14 changed files with 548 additions and 446 deletions
--- a/src/accel/bvh4.rs
+++ b/src/accel/bvh4.rs
@@ -1,10 +1,14 @@
 #![allow(dead_code)]

 use bvh_order::{calc_traversal_code, SplitAxes, TRAVERSAL_TABLE};
+use math3d::Vector;
 use mem_arena::MemArena;

 use crate::{
-    algorithm::partition, bbox::BBox, boundable::Boundable, lerp::lerp_slice, ray::AccelRay,
+    bbox::BBox,
+    boundable::Boundable,
+    lerp::lerp_slice,
+    ray::{RayBatch, RayStack},
    timer::Timer,
 };

@@ -13,6 +17,13 @@ use super::{
    ACCEL_NODE_RAY_TESTS, ACCEL_TRAV_TIME,
 };

+pub fn ray_code(dir: Vector) -> usize {
+    let ray_sign_is_neg = [dir.x() < 0.0, dir.y() < 0.0, dir.z() < 0.0];
+    ray_sign_is_neg[0] as usize
+        + ((ray_sign_is_neg[1] as usize) << 1)
+        + ((ray_sign_is_neg[2] as usize) << 2)
+}
+
 #[derive(Copy, Clone, Debug)]
 pub struct BVH4<'a> {
    root: Option<&'a BVH4Node<'a>>,
@@ -66,9 +77,14 @@ impl<'a> BVH4<'a> {
        self.depth
    }

-    pub fn traverse<T, F>(&self, rays: &mut [AccelRay], objects: &[T], mut obj_ray_test: F)
-    where
-        F: FnMut(&T, &mut [AccelRay]),
+    pub fn traverse<T, F>(
+        &self,
+        rays: &mut RayBatch,
+        ray_stack: &mut RayStack,
+        objects: &[T],
+        mut obj_ray_test: F,
+    ) where
+        F: FnMut(&T, &mut RayBatch, &mut RayStack),
    {
        if self.root.is_none() {
            return;
@@ -78,25 +94,15 @@ impl<'a> BVH4<'a> {
        let mut trav_time: f64 = 0.0;
        let mut node_tests: u64 = 0;

-        let traversal_table = {
-            let ray_sign_is_neg = [
-                rays[0].dir_inv.x() < 0.0,
-                rays[0].dir_inv.y() < 0.0,
-                rays[0].dir_inv.z() < 0.0,
-            ];
-            let ray_code = ray_sign_is_neg[0] as usize
-                + ((ray_sign_is_neg[1] as usize) << 1)
-                + ((ray_sign_is_neg[2] as usize) << 2);
-            &TRAVERSAL_TABLE[ray_code]
-        };
+        let traversal_table =
+            &TRAVERSAL_TABLE[ray_code(rays.dir_inv_accel[ray_stack.next_task_ray_idx(0)])];

        // +2 of max depth for root and last child
        let mut node_stack = [self.root.unwrap(); (BVH_MAX_DEPTH * 3) + 2];
-        let mut ray_i_stack = [rays.len(); (BVH_MAX_DEPTH * 3) + 2];
        let mut stack_ptr = 1;

        while stack_ptr > 0 {
-            node_tests += ray_i_stack[stack_ptr] as u64;
+            node_tests += ray_stack.ray_count_in_next_task() as u64;
            match *node_stack[stack_ptr] {
                BVH4Node::Inner {
                    traversal_code,
@@ -104,12 +110,29 @@ impl<'a> BVH4<'a> {
                    bounds_len,
                    children,
                } => {
+                    // Test rays against bbox.
                    let bounds =
                        unsafe { std::slice::from_raw_parts(bounds_start, bounds_len as usize) };
-                    let part = partition(&mut rays[..ray_i_stack[stack_ptr]], |r| {
-                        (!r.is_done()) && lerp_slice(bounds, r.time).intersect_accel_ray(r)
+
+                    let mut hit_count = 0;
+                    ray_stack.pop_do_next_task(children.len(), |ray_idx| {
+                        let hit = (!rays.is_done(ray_idx))
+                            && lerp_slice(bounds, rays.time[ray_idx]).intersect_ray(
+                                rays.orig_accel[ray_idx],
+                                rays.dir_inv_accel[ray_idx],
+                                rays.max_t[ray_idx],
+                            );
+
+                        if hit {
+                            hit_count += 1;
+                            ([0, 1, 2, 3, 4, 5, 6, 7], children.len())
+                        } else {
+                            ([0, 1, 2, 3, 4, 5, 6, 7], 0)
+                        }
                    });
-                    if part > 0 {
+
+                    // If there were any intersections, create tasks.
+                    if hit_count > 0 {
                        let order_code = traversal_table[traversal_code as usize];
                        match children.len() {
                            4 => {
@@ -118,10 +141,7 @@ impl<'a> BVH4<'a> {
                                let i2 = ((order_code >> 2) & 0b11) as usize;
                                let i1 = (order_code & 0b11) as usize;

-                                ray_i_stack[stack_ptr] = part;
-                                ray_i_stack[stack_ptr + 1] = part;
-                                ray_i_stack[stack_ptr + 2] = part;
-                                ray_i_stack[stack_ptr + 3] = part;
+                                ray_stack.push_lanes_to_tasks(&[i4, i3, i2, i1]);

                                node_stack[stack_ptr] = &children[i4];
                                node_stack[stack_ptr + 1] = &children[i3];
@@ -135,9 +155,7 @@ impl<'a> BVH4<'a> {
                                let i2 = ((order_code >> 2) & 0b11) as usize;
                                let i1 = (order_code & 0b11) as usize;

-                                ray_i_stack[stack_ptr] = part;
-                                ray_i_stack[stack_ptr + 1] = part;
-                                ray_i_stack[stack_ptr + 2] = part;
+                                ray_stack.push_lanes_to_tasks(&[i3, i2, i1]);

                                node_stack[stack_ptr] = &children[i3];
                                node_stack[stack_ptr + 1] = &children[i2];
@@ -149,8 +167,7 @@ impl<'a> BVH4<'a> {
                                let i2 = ((order_code >> 2) & 0b11) as usize;
                                let i1 = (order_code & 0b11) as usize;

-                                ray_i_stack[stack_ptr] = part;
-                                ray_i_stack[stack_ptr + 1] = part;
+                                ray_stack.push_lanes_to_tasks(&[i2, i1]);

                                node_stack[stack_ptr] = &children[i2];
                                node_stack[stack_ptr + 1] = &children[i1];
@@ -169,17 +186,33 @@ impl<'a> BVH4<'a> {
                    bounds_start,
                    bounds_len,
                } => {
+                    // Test rays against bounds.
                    let bounds =
                        unsafe { std::slice::from_raw_parts(bounds_start, bounds_len as usize) };
-                    let part = partition(&mut rays[..ray_i_stack[stack_ptr]], |r| {
-                        (!r.is_done()) && lerp_slice(bounds, r.time).intersect_accel_ray(r)
-                    });
+                    let object_count = object_range.1 - object_range.0;
+                    let mut hit_count = 0;

+                    ray_stack.pop_do_next_task(object_count, |ray_idx| {
+                        let hit = (!rays.is_done(ray_idx))
+                            && lerp_slice(bounds, rays.time[ray_idx]).intersect_ray(
+                                rays.orig_accel[ray_idx],
+                                rays.dir_inv_accel[ray_idx],
+                                rays.max_t[ray_idx],
+                            );
+                        if hit {
+                            hit_count += 1;
+                            ([0, 1, 2, 3, 4, 5, 6, 7], object_count)
+                        } else {
+                            ([0, 1, 2, 3, 4, 5, 6, 7], 0)
+                        }
+                    });
+                    
                    trav_time += timer.tick() as f64;

-                    if part > 0 {
+                    if hit_count > 0 {
+                        ray_stack.push_lanes_to_tasks(&[0, 1, 2, 3, 4, 5, 6, 7][..object_count]);
                        for obj in &objects[object_range.0..object_range.1] {
-                            obj_ray_test(obj, &mut rays[..part]);
+                            obj_ray_test(obj, rays, ray_stack);
                        }
                    }

--- a/src/accel/mod.rs
+++ b/src/accel/mod.rs
@@ -1,4 +1,4 @@
-mod bvh;
+// mod bvh;
 mod bvh4;
 mod bvh_base;
 mod light_array;
@@ -13,8 +13,8 @@ use crate::{
 };

 pub use self::{
-    bvh::{BVHNode, BVH},
-    bvh4::{BVH4Node, BVH4},
+    // bvh::{BVHNode, BVH},
+    bvh4::{ray_code, BVH4Node, BVH4},
    light_array::LightArray,
    light_tree::LightTree,
 };
--- a/src/bbox.rs
+++ b/src/bbox.rs
@@ -7,8 +7,7 @@ use std::{

 use crate::{
    lerp::{lerp, lerp_slice, Lerp},
-    math::{fast_minf32, Matrix4x4, Point},
-    ray::AccelRay,
+    math::{fast_minf32, Matrix4x4, Point, Vector},
 };

 const BBOX_MAXT_ADJUST: f32 = 1.000_000_24;
@@ -40,17 +39,17 @@ impl BBox {
    }

    // Returns whether the given ray intersects with the bbox.
-    pub fn intersect_accel_ray(&self, ray: &AccelRay) -> bool {
+    pub fn intersect_ray(&self, orig: Point, dir_inv: Vector, max_t: f32) -> bool {
        // Calculate slab intersections
-        let t1 = (self.min.co - ray.orig.co) * ray.dir_inv.co;
-        let t2 = (self.max.co - ray.orig.co) * ray.dir_inv.co;
+        let t1 = (self.min.co - orig.co) * dir_inv.co;
+        let t2 = (self.max.co - orig.co) * dir_inv.co;

        // Find the far and near intersection
        let mut far_t = t1.v_max(t2);
        let mut near_t = t1.v_min(t2);
        far_t.set_3(std::f32::INFINITY);
        near_t.set_3(0.0);
-        let far_hit_t = fast_minf32(far_t.h_min() * BBOX_MAXT_ADJUST, ray.max_t);
+        let far_hit_t = fast_minf32(far_t.h_min() * BBOX_MAXT_ADJUST, max_t);
        let near_hit_t = near_t.h_max();

        // Did we hit?
--- a/src/camera.rs
+++ b/src/camera.rs
@@ -92,6 +92,12 @@ impl<'a> Camera<'a> {
        )
        .normalized();

-        Ray::new(orig * transform, dir * transform, time, wavelength, false)
+        Ray {
+            orig: orig * transform,
+            dir: dir * transform,
+            time: time,
+            wavelength: wavelength,
+            max_t: std::f32::INFINITY,
+        }
    }
 }
--- a/src/light/rectangle_light.rs
+++ b/src/light/rectangle_light.rs
@@ -6,7 +6,7 @@ use crate::{
    color::{Color, SpectralSample},
    lerp::lerp_slice,
    math::{cross, dot, Matrix4x4, Normal, Point, Vector},
-    ray::{AccelRay, Ray},
+    ray::{RayBatch, RayStack},
    sampling::{
        spherical_triangle_solid_angle, triangle_surface_area, uniform_sample_spherical_triangle,
        uniform_sample_triangle,
@@ -257,20 +257,23 @@ impl<'a> SurfaceLight for RectangleLight<'a> {
 impl<'a> Surface for RectangleLight<'a> {
    fn intersect_rays(
        &self,
-        accel_rays: &mut [AccelRay],
-        wrays: &[Ray],
+        rays: &mut RayBatch,
+        ray_stack: &mut RayStack,
        isects: &mut [SurfaceIntersection],
        shader: &SurfaceShader,
        space: &[Matrix4x4],
    ) {
        let _ = shader; // Silence 'unused' warning

-        for r in accel_rays.iter_mut() {
-            let wr = &wrays[r.id as usize];
+        ray_stack.pop_do_next_task(0, |ray_idx| {
+            let time = rays.time[ray_idx];
+            let orig = rays.orig_world[ray_idx];
+            let dir = rays.dir_world[ray_idx];
+            let max_t = rays.max_t[ray_idx];

            // Calculate time interpolated values
-            let dim = lerp_slice(self.dimensions, r.time);
-            let xform = lerp_slice(space, r.time);
+            let dim = lerp_slice(self.dimensions, time);
+            let xform = lerp_slice(space, time);

            let space_inv = xform.inverse();

@@ -282,17 +285,17 @@ impl<'a> Surface for RectangleLight<'a> {

            // Test against two triangles that make up the light
            for tri in &[(p1, p2, p3), (p3, p4, p1)] {
-                if let Some((t, b0, b1, b2)) = triangle::intersect_ray(wr, *tri) {
-                    if t < r.max_t {
-                        if r.is_occlusion() {
-                            isects[r.id as usize] = SurfaceIntersection::Occlude;
-                            r.mark_done();
+                if let Some((t, b0, b1, b2)) = triangle::intersect_ray(orig, dir, max_t, *tri) {
+                    if t < max_t {
+                        if rays.is_occlusion(ray_idx) {
+                            isects[ray_idx] = SurfaceIntersection::Occlude;
+                            rays.mark_done(ray_idx);
                        } else {
                            let (pos, pos_err) = triangle::surface_point(*tri, (b0, b1, b2));
                            let normal = cross(tri.0 - tri.1, tri.0 - tri.2).into_normal();

                            let intersection_data = SurfaceIntersectionData {
-                                incoming: wr.dir,
+                                incoming: dir,
                                t: t,
                                pos: pos,
                                pos_err: pos_err,
@@ -301,35 +304,37 @@ impl<'a> Surface for RectangleLight<'a> {
                                local_space: xform,
                                sample_pdf: self.sample_pdf(
                                    &xform,
-                                    wr.orig,
-                                    wr.dir,
+                                    orig,
+                                    dir,
                                    pos,
-                                    wr.wavelength,
-                                    r.time,
+                                    rays.wavelength[ray_idx],
+                                    time,
                                ),
                            };

                            let closure = {
                                let inv_surface_area = (1.0 / (dim.0 as f64 * dim.1 as f64)) as f32;
-                                let color = lerp_slice(self.colors, r.time) * inv_surface_area;
+                                let color = lerp_slice(self.colors, time) * inv_surface_area;
                                SurfaceClosure::Emit(color)
                            };

                            // Fill in intersection
-                            isects[r.id as usize] = SurfaceIntersection::Hit {
+                            isects[ray_idx] = SurfaceIntersection::Hit {
                                intersection_data: intersection_data,
                                closure: closure,
                            };

                            // Set ray's max t
-                            r.max_t = t;
+                            rays.max_t[ray_idx] = t;
                        }

                        break;
                    }
                }
            }
-        }
+
+            ([0, 0, 0, 0, 0, 0, 0, 0], 0)
+        });
    }
 }

--- a/src/light/sphere_light.rs
+++ b/src/light/sphere_light.rs
@@ -8,7 +8,7 @@ use crate::{
    color::{Color, SpectralSample},
    lerp::lerp_slice,
    math::{coordinate_system_from_vector, dot, Matrix4x4, Normal, Point, Vector},
-    ray::{AccelRay, Ray},
+    ray::{RayBatch, RayStack},
    sampling::{uniform_sample_cone, uniform_sample_cone_pdf, uniform_sample_sphere},
    shading::surface_closure::SurfaceClosure,
    shading::SurfaceShader,
@@ -206,26 +206,26 @@ impl<'a> SurfaceLight for SphereLight<'a> {
 impl<'a> Surface for SphereLight<'a> {
    fn intersect_rays(
        &self,
-        accel_rays: &mut [AccelRay],
-        wrays: &[Ray],
+        rays: &mut RayBatch,
+        ray_stack: &mut RayStack,
        isects: &mut [SurfaceIntersection],
        shader: &SurfaceShader,
        space: &[Matrix4x4],
    ) {
        let _ = shader; // Silence 'unused' warning

-        for r in accel_rays.iter_mut() {
-            let wr = &wrays[r.id as usize];
+        ray_stack.pop_do_next_task(0, |ray_idx| {
+            let time = rays.time[ray_idx];

            // Get the transform space
-            let xform = lerp_slice(space, r.time);
+            let xform = lerp_slice(space, time);

            // Get the radius of the sphere at the ray's time
-            let radius = lerp_slice(self.radii, r.time); // Radius of the sphere
+            let radius = lerp_slice(self.radii, time); // Radius of the sphere

            // Get the ray origin and direction in local space
-            let orig = r.orig.into_vector();
-            let dir = wr.dir * xform;
+            let orig = rays.orig_accel[ray_idx].into_vector();
+            let dir = rays.dir_world[ray_idx] * xform;

            // Code adapted to Rust from https://github.com/Tecla/Rayito
            // Ray-sphere intersection can result in either zero, one or two points
@@ -242,7 +242,7 @@ impl<'a> Surface for SphereLight<'a> {
            let discriminant = (b * b) - (4.0 * a * c);
            if discriminant < 0.0 {
                // Discriminant less than zero?  No solution => no intersection.
-                continue;
+                return ([0, 0, 0, 0, 0, 0, 0, 0], 0);
            }
            let discriminant = discriminant.sqrt();

@@ -257,7 +257,7 @@ impl<'a> Surface for SphereLight<'a> {

            // Get our final parametric values
            let mut t0 = q / a;
-            let mut t1 = if q != 0.0 { c / q } else { r.max_t };
+            let mut t1 = if q != 0.0 { c / q } else { rays.max_t[ray_idx] };

            // Swap them so they are ordered right
            if t0 > t1 {
@@ -266,25 +266,25 @@ impl<'a> Surface for SphereLight<'a> {
            }

            // Check our intersection for validity against this ray's extents
-            if t0 > r.max_t || t1 <= 0.0 {
-                // Didn't hit because shere is entirely outside of ray's extents
-                continue;
+            if t0 > rays.max_t[ray_idx] || t1 <= 0.0 {
+                // Didn't hit because sphere is entirely outside of ray's extents
+                return ([0, 0, 0, 0, 0, 0, 0, 0], 0);
            }

            let t = if t0 > 0.0 {
                t0
-            } else if t1 <= r.max_t {
+            } else if t1 <= rays.max_t[ray_idx] {
                t1
            } else {
                // Didn't hit because ray is entirely within the sphere, and
                // therefore doesn't hit its surface.
-                continue;
+                return ([0, 0, 0, 0, 0, 0, 0, 0], 0);
            };

            // We hit the sphere, so calculate intersection info.
-            if r.is_occlusion() {
-                isects[r.id as usize] = SurfaceIntersection::Occlude;
-                r.mark_done();
+            if rays.is_occlusion(ray_idx) {
+                isects[ray_idx] = SurfaceIntersection::Occlude;
+                rays.mark_done(ray_idx);
            } else {
                let inv_xform = xform.inverse();

@@ -300,7 +300,7 @@ impl<'a> Surface for SphereLight<'a> {
                let normal = unit_pos.into_normal() * inv_xform;

                let intersection_data = SurfaceIntersectionData {
-                    incoming: wr.dir,
+                    incoming: rays.dir_world[ray_idx],
                    t: t,
                    pos: pos,
                    pos_err: pos_err,
@@ -309,32 +309,34 @@ impl<'a> Surface for SphereLight<'a> {
                    local_space: xform,
                    sample_pdf: self.sample_pdf(
                        &xform,
-                        wr.orig,
-                        wr.dir,
+                        rays.orig_world[ray_idx],
+                        rays.dir_world[ray_idx],
                        0.0,
                        0.0,
-                        wr.wavelength,
-                        r.time,
+                        rays.wavelength[ray_idx],
+                        time,
                    ),
                };

                let closure = {
                    let inv_surface_area =
                        (1.0 / (4.0 * PI_64 * radius as f64 * radius as f64)) as f32;
-                    let color = lerp_slice(self.colors, r.time) * inv_surface_area;
+                    let color = lerp_slice(self.colors, time) * inv_surface_area;
                    SurfaceClosure::Emit(color)
                };

                // Fill in intersection
-                isects[r.id as usize] = SurfaceIntersection::Hit {
+                isects[ray_idx] = SurfaceIntersection::Hit {
                    intersection_data: intersection_data,
                    closure: closure,
                };

                // Set ray's max t
-                r.max_t = t;
+                rays.max_t[ray_idx] = t;
            }
-        }
+
+            ([0, 0, 0, 0, 0, 0, 0, 0], 0)
+        });
    }
 }

--- a/src/main.rs
+++ b/src/main.rs
@@ -47,10 +47,9 @@ use nom::{error_position, take_until};
 use mem_arena::MemArena;

 use crate::{
-    accel::{BVH4Node, BVHNode},
+    accel::BVH4Node,
    bbox::BBox,
    parse::{parse_scene, DataTree},
-    ray::{AccelRay, Ray},
    renderer::LightPath,
    surface::SurfaceIntersection,
    timer::Timer,
@@ -159,15 +158,13 @@ fn main() {

    // Print some misc useful dev info.
    if args.is_present("dev") {
-        println!("Ray size:       {} bytes", mem::size_of::<Ray>());
-        println!("AccelRay size:  {} bytes", mem::size_of::<AccelRay>());
        println!(
            "SurfaceIntersection size:  {} bytes",
            mem::size_of::<SurfaceIntersection>()
        );
        println!("LightPath size: {} bytes", mem::size_of::<LightPath>());
        println!("BBox size: {} bytes", mem::size_of::<BBox>());
-        println!("BVHNode size: {} bytes", mem::size_of::<BVHNode>());
+        // println!("BVHNode size: {} bytes", mem::size_of::<BVHNode>());
        println!("BVH4Node size: {} bytes", mem::size_of::<BVH4Node>());
        return;
    }
--- a/src/ray.rs
+++ b/src/ray.rs
@@ -8,6 +8,17 @@ type FlagType = u8;
 const OCCLUSION_FLAG: FlagType = 1;
 const DONE_FLAG: FlagType = 1 << 1;

+/// This is never used directly in ray tracing--it's only used as a convenience
+/// for filling the RayBatch structure.
+#[derive(Debug, Copy, Clone)]
+pub struct Ray {
+    pub orig: Point,
+    pub dir: Vector,
+    pub time: f32,
+    pub wavelength: f32,
+    pub max_t: f32,
+}
+
 /// A batch of rays, stored in SoA layout.
 #[derive(Debug)]
 pub struct RayBatch {
@@ -51,6 +62,60 @@ impl RayBatch {
        }
    }

+    pub fn push(&mut self, ray: Ray, is_occlusion: bool) {
+        self.orig_world.push(ray.orig);
+        self.dir_world.push(ray.dir);
+        self.orig_accel.push(ray.orig); // Bogus, to place-hold.
+        self.dir_inv_accel.push(ray.dir); // Bogus, to place-hold.
+        self.time.push(ray.time);
+        self.wavelength.push(ray.wavelength);
+        if is_occlusion {
+            self.max_t.push(1.0);
+            self.flags.push(OCCLUSION_FLAG);
+        } else {
+            self.max_t.push(std::f32::INFINITY);
+            self.flags.push(0);
+        }
+    }
+
+    pub fn swap(&mut self, a: usize, b: usize) {
+        if a != b {
+            self.orig_world.swap(a, b);
+            self.dir_world.swap(a, b);
+            self.orig_accel.swap(a, b);
+            self.dir_inv_accel.swap(a, b);
+            self.max_t.swap(a, b);
+            self.time.swap(a, b);
+            self.wavelength.swap(a, b);
+            self.flags.swap(a, b);
+        }
+    }
+
+    pub fn set_from_ray(&mut self, ray: &Ray, is_shadow: bool, idx: usize) {
+        self.orig_world[idx] = ray.orig;
+        self.dir_world[idx] = ray.dir;
+        self.orig_accel[idx] = ray.orig;
+        self.dir_inv_accel[idx] = Vector {
+            co: Float4::splat(1.0) / ray.dir.co,
+        };
+        self.max_t[idx] = ray.max_t;
+        self.time[idx] = ray.time;
+        self.wavelength[idx] = ray.wavelength;
+        self.time[idx] = ray.time;
+        self.flags[idx] = if is_shadow { OCCLUSION_FLAG } else { 0 };
+    }
+
+    pub fn truncate(&mut self, len: usize) {
+        self.orig_world.truncate(len);
+        self.dir_world.truncate(len);
+        self.orig_accel.truncate(len);
+        self.dir_inv_accel.truncate(len);
+        self.max_t.truncate(len);
+        self.time.truncate(len);
+        self.wavelength.truncate(len);
+        self.flags.truncate(len);
+    }
+
    /// Clear all rays, settings the size of the batch back to zero.
    ///
    /// Capacity is maintained.
@@ -65,6 +130,10 @@ impl RayBatch {
        self.flags.clear();
    }

+    pub fn len(&self) -> usize {
+        self.orig_world.len()
+    }
+
    /// Returns whether the given ray (at index `idx`) is an occlusion ray.
    pub fn is_occlusion(&self, idx: usize) -> bool {
        (self.flags[idx] & OCCLUSION_FLAG) != 0
@@ -101,117 +170,129 @@ impl RayBatch {
 /// A structure used for tracking traversal of a ray batch through a scene.
 #[derive(Debug)]
 pub struct RayStack {
-    lanes: Vec<Vec<u16>>,
+    lanes: Vec<Lane>,
    tasks: Vec<RayTask>,
 }

-/// A task within a RayStack.
+impl RayStack {
+    pub fn new() -> RayStack {
+        RayStack {
+            lanes: Vec::new(),
+            tasks: Vec::new(),
+        }
+    }
+
+    /// Returns whether the stack is empty of tasks or not.
+    pub fn is_empty(&self) -> bool {
+        self.tasks.is_empty()
+    }
+
+    /// Makes sure there are at least `count` lanes.
+    pub fn ensure_lane_count(&mut self, count: usize) {
+        while self.lanes.len() < count {
+            self.lanes.push(Lane {
+                idxs: Vec::new(),
+                end_len: 0,
+            })
+        }
+    }
+
+    pub fn ray_count_in_next_task(&self) -> usize {
+        let task = self.tasks.last().unwrap();
+        let end = self.lanes[task.lane].end_len;
+        end - task.start_idx
+    }
+
+    pub fn next_task_ray_idx(&self, i: usize) -> usize {
+        let task = self.tasks.last().unwrap();
+        let i = i + task.start_idx;
+        debug_assert!(i < self.lanes[task.lane].end_len);
+        self.lanes[task.lane].idxs[i] as usize
+    }
+
+    /// Clears the lanes and tasks of the RayStack.
+    ///
+    /// Note: this is (importantly) different than calling clear individually
+    /// on the `lanes` and `tasks` members.  Specifically, we don't want to
+    /// clear `lanes` itself, as that would also free all the memory of the
+    /// individual lanes.  Instead, we want to iterate over the individual
+    /// lanes and clear them, but leave `lanes` itself untouched.
+    pub fn clear(&mut self) {
+        for lane in self.lanes.iter_mut() {
+            lane.idxs.clear();
+            lane.end_len = 0;
+        }
+
+        self.tasks.clear();
+    }
+
+    /// Pushes the given ray index onto the end of the specified lane.
+    pub fn push_ray_index(&mut self, ray_idx: usize, lane: usize) {
+        assert!(self.lanes.len() > lane);
+        self.lanes[lane].idxs.push(ray_idx as u16);
+    }
+
+    /// Takes the given list of lane indices, and pushes any excess indices on
+    /// the end of each into a new task, in the order provided.
+    pub fn push_lanes_to_tasks(&mut self, lane_idxs: &[usize]) {
+        for &l in lane_idxs {
+            if self.lanes[l].end_len < self.lanes[l].idxs.len() {
+                self.tasks.push(RayTask {
+                    lane: l,
+                    start_idx: self.lanes[l].end_len,
+                });
+                self.lanes[l].end_len = self.lanes[l].idxs.len();
+            }
+        }
+    }
+
+    /// Pops the next task off the stack, and executes the provided closure for
+    /// each ray index in the task.  The return value of the closure is the list
+    /// of lanes (by index) to add the given ray index back into.
+    pub fn pop_do_next_task<F>(&mut self, needed_lanes: usize, mut handle_ray: F)
+    where
+        F: FnMut(usize) -> ([u8; 8], usize),
+    {
+        // Prepare lanes.
+        self.ensure_lane_count(needed_lanes);
+
+        // Pop the task and do necessary bookkeeping.
+        let task = self.tasks.pop().unwrap();
+        let task_range = (task.start_idx, self.lanes[task.lane].end_len);
+        self.lanes[task.lane].end_len = task.start_idx;
+
+        // Execute task.
+        let mut source_lane_cap = task_range.0;
+        for i in task_range.0..task_range.1 {
+            let ray_idx = self.lanes[task.lane].idxs[i];
+            let (add_list, list_len) = handle_ray(ray_idx as usize);
+            for &l in &add_list[..list_len] {
+                if l == task.lane as u8 {
+                    self.lanes[l as usize].idxs[source_lane_cap] = ray_idx;
+                    source_lane_cap += 1;
+                } else {
+                    self.lanes[l as usize].idxs.push(ray_idx);
+                }
+            }
+        }
+        self.lanes[task.lane].idxs.truncate(source_lane_cap);
+    }
+}
+
+/// A lane within a RayStack.
 #[derive(Debug)]
-pub enum RayTask {
-    // A barrier represents a division when traversing into a new system.
-    // For example, when traversing from the top-level BVH into an object's
-    // local BVH.  It helps with keeping track of where we're at and aids in
-    // debugging.
-    Barrier,
-
-    // A task for handling a set of rays.
-    //
-    // Specifies the lane that the relevant ray pointers are in, and the
-    // starting index within that lane.  The relevant pointers are always
-    // `&[start_idx..]` within the given lane.
-    Rays { lane: usize, start_idx: usize },
+struct Lane {
+    idxs: Vec<u16>,
+    end_len: usize,
 }

-#[derive(Debug, Copy, Clone)]
-pub struct Ray {
-    pub orig: Point,
-    pub dir: Vector,
-    pub max_t: f32,
-    pub time: f32,
-    pub wavelength: f32,
-    pub flags: FlagType,
-}
-
-impl Ray {
-    pub fn new(orig: Point, dir: Vector, time: f32, wavelength: f32, is_occ: bool) -> Ray {
-        if !is_occ {
-            Ray {
-                orig: orig,
-                dir: dir,
-                max_t: std::f32::INFINITY,
-                time: time,
-                wavelength: wavelength,
-                flags: 0,
-            }
-        } else {
-            Ray {
-                orig: orig,
-                dir: dir,
-                max_t: 1.0,
-                time: time,
-                wavelength: wavelength,
-                flags: OCCLUSION_FLAG,
-            }
-        }
-    }
-
-    pub fn transform(&mut self, mat: &Matrix4x4) {
-        self.orig = self.orig * *mat;
-        self.dir = self.dir * *mat;
-    }
-
-    pub fn is_occlusion(&self) -> bool {
-        (self.flags & OCCLUSION_FLAG) != 0
-    }
-}
-
-#[derive(Debug, Copy, Clone)]
-pub struct AccelRay {
-    pub orig: Point,
-    pub dir_inv: Vector,
-    pub max_t: f32,
-    pub time: f32,
-    pub flags: FlagType,
-    pub id: u32,
-}
-
-impl AccelRay {
-    pub fn new(ray: &Ray, id: u32) -> AccelRay {
-        AccelRay {
-            orig: ray.orig,
-            dir_inv: Vector {
-                co: Float4::splat(1.0) / ray.dir.co,
-            },
-            max_t: ray.max_t,
-            time: ray.time,
-            flags: ray.flags,
-            id: id,
-        }
-    }
-
-    pub fn update_from_world_ray(&mut self, wr: &Ray) {
-        self.orig = wr.orig;
-        self.dir_inv = Vector {
-            co: Float4::splat(1.0) / wr.dir.co,
-        };
-    }
-
-    pub fn update_from_xformed_world_ray(&mut self, wr: &Ray, mat: &Matrix4x4) {
-        self.orig = wr.orig * *mat;
-        self.dir_inv = Vector {
-            co: Float4::splat(1.0) / (wr.dir * *mat).co,
-        };
-    }
-
-    pub fn is_occlusion(&self) -> bool {
-        (self.flags & OCCLUSION_FLAG) != 0
-    }
-
-    pub fn is_done(&self) -> bool {
-        (self.flags & DONE_FLAG) != 0
-    }
-
-    pub fn mark_done(&mut self) {
-        self.flags |= DONE_FLAG;
-    }
+/// A task within a RayStack.
+//
+// Specifies the lane that the relevant ray pointers are in, and the
+// starting index within that lane.  The relevant pointers are always
+// `&[start_idx..]` within the given lane.
+#[derive(Debug)]
+struct RayTask {
+    lane: usize,
+    start_idx: usize,
 }
--- a/src/renderer.rs
+++ b/src/renderer.rs
@ -13,7 +13,6 @@ use float4::Float4;

 use crate::{
    accel::{ACCEL_NODE_RAY_TESTS, ACCEL_TRAV_TIME},
-    algorithm::partition_pair,
    color::{map_0_1_to_wavelength, SpectralSample, XYZ},
    fp_utils::robust_ray_origin,
    hash::hash_u32,
@ -21,7 +20,7 @@ use crate::{
    image::Image,
    math::{fast_logit, upper_power_of_two},
    mis::power_heuristic,
-    ray::Ray,
+    ray::{Ray, RayBatch},
    scene::{Scene, SceneLightSample},
    surface,
    timer::Timer,
@ -207,7 +206,7 @@ impl<'a> Renderer<'a> {
        let mut total_timer = Timer::new();

        let mut paths = Vec::new();
-        let mut rays = Vec::new();
+        let mut rays = RayBatch::new();
        let mut tracer = Tracer::from_assembly(&self.scene.root);
        let mut xform_stack = TransformStack::new();

@ -266,7 +265,7 @@ impl<'a> Renderer<'a> {
                            offset + si as u32,
                        );
                        paths.push(path);
-                        rays.push(ray);
+                        rays.push(ray, false);
                    }
                }
            }
@ -276,13 +275,20 @@ impl<'a> Renderer<'a> {
            let mut pi = paths.len();
            while pi > 0 {
                // Test rays against scene
-                let isects = tracer.trace(&rays);
+                let isects = tracer.trace(&mut rays);
                stats.trace_time += timer.tick() as f64;

                // Determine next rays to shoot based on result
-                pi = partition_pair(&mut paths[..pi], &mut rays[..pi], |i, path, ray| {
-                    path.next(&mut xform_stack, &self.scene, &isects[i], &mut *ray)
-                });
+                let mut new_end = 0;
+                for i in 0..pi {
+                    if paths[i].next(&mut xform_stack, &self.scene, &isects[i], &mut rays, i) {
+                        paths.swap(new_end, i);
+                        rays.swap(new_end, i);
+                        new_end += 1;
+                    }
+                }
+                rays.truncate(new_end);
+                pi = new_end;
                stats.ray_generation_time += timer.tick() as f64;
            }

@ -431,7 +437,8 @@ impl LightPath {
        xform_stack: &mut TransformStack,
        scene: &Scene,
        isect: &surface::SurfaceIntersection,
-        ray: &mut Ray,
+        rays: &mut RayBatch,
+        ray_idx: usize,
    ) -> bool {
        match self.event {
            //--------------------------------------------------------------------
@ -496,13 +503,13 @@ impl LightPath {
                            // Distant light
                            SceneLightSample::Distant { direction, .. } => {
                                let (attenuation, closure_pdf) = closure.evaluate(
-                                    ray.dir,
+                                    rays.dir_world[ray_idx],
                                    direction,
                                    idata.nor,
                                    idata.nor_g,
                                    self.wavelength,
                                );
-                                let mut shadow_ray = {
+                                let shadow_ray = {
                                    // Calculate the shadow ray for testing if the light is
                                    // in shadow or not.
                                    let offset_pos = robust_ray_origin(
@ -511,15 +518,14 @@ impl LightPath {
                                        idata.nor_g.normalized(),
                                        direction,
                                    );
-                                    Ray::new(
-                                        offset_pos,
-                                        direction,
-                                        self.time,
-                                        self.wavelength,
-                                        true,
-                                    )
+                                    Ray {
+                                        orig: offset_pos,
+                                        dir: direction,
+                                        time: self.time,
+                                        wavelength: self.wavelength,
+                                        max_t: std::f32::INFINITY,
+                                    }
                                };
-                                shadow_ray.max_t = std::f32::INFINITY;
                                (attenuation, closure_pdf, shadow_ray)
                            }

@ -527,7 +533,7 @@ impl LightPath {
                            SceneLightSample::Surface { sample_geo, .. } => {
                                let dir = sample_geo.0 - idata.pos;
                                let (attenuation, closure_pdf) = closure.evaluate(
-                                    ray.dir,
+                                    rays.dir_world[ray_idx],
                                    dir,
                                    idata.nor,
                                    idata.nor_g,
@ -548,13 +554,13 @@ impl LightPath {
                                        sample_geo.1.normalized(),
                                        -dir,
                                    );
-                                    Ray::new(
-                                        offset_pos,
-                                        offset_end - offset_pos,
-                                        self.time,
-                                        self.wavelength,
-                                        true,
-                                    )
+                                    Ray {
+                                        orig: offset_pos,
+                                        dir: offset_end - offset_pos,
+                                        time: self.time,
+                                        wavelength: self.wavelength,
+                                        max_t: 1.0,
+                                    }
                                };
                                (attenuation, closure_pdf, shadow_ray)
                            }
@ -572,7 +578,7 @@ impl LightPath {
                                light_info.color().e * attenuation.e * self.light_attenuation
                                    / (light_mis_pdf * light_sel_pdf);

-                            *ray = shadow_ray;
+                            rays.set_from_ray(&shadow_ray, true, ray_idx);

                            true
                        }
@ -609,8 +615,13 @@ impl LightPath {
                                idata.nor_g.normalized(),
                                dir,
                            );
-                            self.next_bounce_ray =
-                                Some(Ray::new(offset_pos, dir, self.time, self.wavelength, false));
+                            self.next_bounce_ray = Some(Ray {
+                                orig: offset_pos,
+                                dir: dir,
+                                time: self.time,
+                                wavelength: self.wavelength,
+                                max_t: std::f32::INFINITY,
+                            });

                            true
                        } else {
@ -626,7 +637,7 @@ impl LightPath {
                        self.event = LightPathEvent::ShadowRay;
                        return true;
                    } else if do_bounce {
-                        *ray = self.next_bounce_ray.unwrap();
+                        rays.set_from_ray(&self.next_bounce_ray.unwrap(), false, ray_idx);
                        self.event = LightPathEvent::BounceRay;
                        self.light_attenuation *= self.next_attenuation_fac;
                        return true;
@ -657,7 +668,7 @@ impl LightPath {

                // Set up for the next bounce, if any
                if let Some(ref nbr) = self.next_bounce_ray {
-                    *ray = *nbr;
+                    rays.set_from_ray(nbr, false, ray_idx);
                    self.light_attenuation *= self.next_attenuation_fac;
                    self.event = LightPathEvent::BounceRay;
                    return true;
--- a/src/surface/micropoly_batch.rs
+++ b/src/surface/micropoly_batch.rs
@ -8,7 +8,7 @@ use crate::{
    boundable::Boundable,
    lerp::lerp_slice,
    math::{cross, dot, Matrix4x4, Normal, Point},
-    ray::{AccelRay, Ray},
+    ray::{RayBatch, RayStack},
    shading::surface_closure::SurfaceClosure,
 };

@ -99,8 +99,8 @@ impl<'a> MicropolyBatch<'a> {
 impl<'a> MicropolyBatch<'a> {
    fn intersect_rays(
        &self,
-        accel_rays: &mut [AccelRay],
-        wrays: &[Ray],
+        rays: &mut RayBatch,
+        ray_stack: &mut RayStack,
        isects: &mut [SurfaceIntersection],
        space: &[Matrix4x4],
    ) {
@ -112,7 +112,7 @@ impl<'a> MicropolyBatch<'a> {
        };

        self.accel
-            .traverse(&mut accel_rays[..], self.indices, |tri_indices, rs| {
+            .traverse(rays, ray_stack, self.indices, |tri_indices, rs| {
                // For static triangles with static transforms, cache them.
                let is_cached = self.time_sample_count == 1 && space.len() <= 1;
                let mut tri = if is_cached {
--- a/src/surface/mod.rs
+++ b/src/surface/mod.rs
@ -1,6 +1,6 @@
 #![allow(dead_code)]

-pub mod micropoly_batch;
+// pub mod micropoly_batch;
 pub mod triangle;
 pub mod triangle_mesh;

@ -9,7 +9,7 @@ use std::fmt::Debug;
 use crate::{
    boundable::Boundable,
    math::{Matrix4x4, Normal, Point, Vector},
-    ray::{AccelRay, Ray},
+    ray::{RayBatch, RayStack},
    shading::surface_closure::SurfaceClosure,
    shading::SurfaceShader,
 };
@ -17,8 +17,8 @@ use crate::{
 pub trait Surface: Boundable + Debug + Sync {
    fn intersect_rays(
        &self,
-        accel_rays: &mut [AccelRay],
-        wrays: &[Ray],
+        rays: &mut RayBatch,
+        ray_stack: &mut RayStack,
        isects: &mut [SurfaceIntersection],
        shader: &SurfaceShader,
        space: &[Matrix4x4],
--- a/src/surface/triangle.rs
+++ b/src/surface/triangle.rs
@ -1,6 +1,9 @@
 #![allow(dead_code)]

-use crate::{fp_utils::fp_gamma, math::Point, ray::Ray};
+use crate::{
+    fp_utils::fp_gamma,
+    math::{Point, Vector},
+};

 /// Intersects `ray` with `tri`, returning `Some((t, b0, b1, b2))`, or `None`
 /// if no intersection.
@ -13,12 +16,17 @@ use crate::{fp_utils::fp_gamma, math::Point, ray::Ray};
 ///
 /// Uses the ray-triangle test from the paper "Watertight Ray/Triangle
 /// Intersection" by Woop et al.
-pub fn intersect_ray(ray: &Ray, tri: (Point, Point, Point)) -> Option<(f32, f32, f32, f32)> {
+pub fn intersect_ray(
+    ray_orig: Point,
+    ray_dir: Vector,
+    ray_max_t: f32,
+    tri: (Point, Point, Point),
+) -> Option<(f32, f32, f32, f32)> {
    // Calculate the permuted dimension indices for the new ray space.
    let (xi, yi, zi) = {
-        let xabs = ray.dir.x().abs();
-        let yabs = ray.dir.y().abs();
-        let zabs = ray.dir.z().abs();
+        let xabs = ray_dir.x().abs();
+        let yabs = ray_dir.y().abs();
+        let zabs = ray_dir.z().abs();

        if xabs > yabs && xabs > zabs {
            (1, 2, 0)
@ -29,9 +37,9 @@ pub fn intersect_ray(ray: &Ray, tri: (Point, Point, Point)) -> Option<(f32, f32,
        }
    };

-    let dir_x = ray.dir.get_n(xi);
-    let dir_y = ray.dir.get_n(yi);
-    let dir_z = ray.dir.get_n(zi);
+    let dir_x = ray_dir.get_n(xi);
+    let dir_y = ray_dir.get_n(yi);
+    let dir_z = ray_dir.get_n(zi);

    // Calculate shear constants.
    let sx = dir_x / dir_z;
@ -39,9 +47,9 @@ pub fn intersect_ray(ray: &Ray, tri: (Point, Point, Point)) -> Option<(f32, f32,
    let sz = 1.0 / dir_z;

    // Calculate vertices in ray space.
-    let p0 = tri.0 - ray.orig;
-    let p1 = tri.1 - ray.orig;
-    let p2 = tri.2 - ray.orig;
+    let p0 = tri.0 - ray_orig;
+    let p1 = tri.1 - ray_orig;
+    let p2 = tri.2 - ray_orig;

    let p0x = p0.get_n(xi) - (sx * p0.get_n(zi));
    let p0y = p0.get_n(yi) - (sy * p0.get_n(zi));
@ -80,8 +88,8 @@ pub fn intersect_ray(ray: &Ray, tri: (Point, Point, Point)) -> Option<(f32, f32,
    let t_scaled = (e0 * p0z) + (e1 * p1z) + (e2 * p2z);

    // Check if the hitpoint t is within ray min/max t.
-    if (det > 0.0 && (t_scaled <= 0.0 || t_scaled > (ray.max_t * det)))
-        || (det < 0.0 && (t_scaled >= 0.0 || t_scaled < (ray.max_t * det)))
+    if (det > 0.0 && (t_scaled <= 0.0 || t_scaled > (ray_max_t * det)))
+        || (det < 0.0 && (t_scaled >= 0.0 || t_scaled < (ray_max_t * det)))
    {
        return None;
    }
--- a/src/surface/triangle_mesh.rs
+++ b/src/surface/triangle_mesh.rs
@ -8,7 +8,7 @@ use crate::{
    boundable::Boundable,
    lerp::lerp_slice,
    math::{cross, dot, Matrix4x4, Normal, Point},
-    ray::{AccelRay, Ray},
+    ray::{RayBatch, RayStack},
    shading::SurfaceShader,
 };

@ -117,8 +117,8 @@ impl<'a> Boundable for TriangleMesh<'a> {
 impl<'a> Surface for TriangleMesh<'a> {
    fn intersect_rays(
        &self,
-        accel_rays: &mut [AccelRay],
-        wrays: &[Ray],
+        rays: &mut RayBatch,
+        ray_stack: &mut RayStack,
        isects: &mut [SurfaceIntersection],
        shader: &SurfaceShader,
        space: &[Matrix4x4],
@ -130,8 +130,11 @@ impl<'a> Surface for TriangleMesh<'a> {
            Matrix4x4::new()
        };

-        self.accel
-            .traverse(&mut accel_rays[..], self.indices, |tri_indices, rs| {
+        self.accel.traverse(
+            rays,
+            ray_stack,
+            self.indices,
+            |tri_indices, rays, ray_stack| {
                // For static triangles with static transforms, cache them.
                let is_cached = self.time_sample_count == 1 && space.len() <= 1;
                let mut tri = if is_cached {
@ -154,8 +157,9 @@ impl<'a> Surface for TriangleMesh<'a> {
                };

                // Test each ray against the current triangle.
-                for r in rs {
-                    let wr = &wrays[r.id as usize];
+                ray_stack.pop_do_next_task(0, |ray_idx| {
+                    let ray_idx = ray_idx as usize;
+                    let ray_time = rays.time[ray_idx];

                    // Get triangle if necessary
                    if !is_cached {
@ -178,9 +182,9 @@ impl<'a> Surface for TriangleMesh<'a> {
                                * self.time_sample_count)
                                ..((tri_indices.2 as usize + 1) * self.time_sample_count)];

-                            let p0 = lerp_slice(p0_slice, wr.time);
-                            let p1 = lerp_slice(p1_slice, wr.time);
-                            let p2 = lerp_slice(p2_slice, wr.time);
+                            let p0 = lerp_slice(p0_slice, ray_time);
+                            let p1 = lerp_slice(p1_slice, ray_time);
+                            let p2 = lerp_slice(p2_slice, ray_time);

                            (p0, p1, p2)
                        };
@ -190,7 +194,7 @@ impl<'a> Surface for TriangleMesh<'a> {
                    let mat_space = if !space.is_empty() {
                        if space.len() > 1 {
                            // Per-ray transform, for motion blur
-                            let mat_space = lerp_slice(space, wr.time).inverse();
+                            let mat_space = lerp_slice(space, ray_time).inverse();
                            tri = (tri.0 * mat_space, tri.1 * mat_space, tri.2 * mat_space);
                            mat_space
                        } else {
@ -210,65 +214,71 @@ impl<'a> Surface for TriangleMesh<'a> {
                    };

                    // Test ray against triangle
-                    if let Some((t, b0, b1, b2)) = triangle::intersect_ray(wr, tri) {
-                        if t < r.max_t {
-                            if r.is_occlusion() {
-                                isects[r.id as usize] = SurfaceIntersection::Occlude;
-                                r.mark_done();
-                            } else {
-                                // Calculate intersection point and error magnitudes
-                                let (pos, pos_err) = triangle::surface_point(tri, (b0, b1, b2));
+                    if let Some((t, b0, b1, b2)) = triangle::intersect_ray(
+                        rays.orig_world[ray_idx],
+                        rays.dir_world[ray_idx],
+                        rays.max_t[ray_idx],
+                        tri,
+                    ) {
+                        if rays.is_occlusion(ray_idx) {
+                            isects[ray_idx] = SurfaceIntersection::Occlude;
+                            rays.mark_done(ray_idx);
+                        } else {
+                            // Calculate intersection point and error magnitudes
+                            let (pos, pos_err) = triangle::surface_point(tri, (b0, b1, b2));

-                                // Calculate geometric surface normal
-                                let geo_normal = cross(tri.0 - tri.1, tri.0 - tri.2).into_normal();
+                            // Calculate geometric surface normal
+                            let geo_normal = cross(tri.0 - tri.1, tri.0 - tri.2).into_normal();

-                                // Calculate interpolated surface normal, if any
-                                let shading_normal = if let Some(normals) = self.normals {
-                                    let n0_slice = &normals[(tri_indices.0 as usize
-                                        * self.time_sample_count)
-                                        ..((tri_indices.0 as usize + 1) * self.time_sample_count)];
-                                    let n1_slice = &normals[(tri_indices.1 as usize
-                                        * self.time_sample_count)
-                                        ..((tri_indices.1 as usize + 1) * self.time_sample_count)];
-                                    let n2_slice = &normals[(tri_indices.2 as usize
-                                        * self.time_sample_count)
-                                        ..((tri_indices.2 as usize + 1) * self.time_sample_count)];
+                            // Calculate interpolated surface normal, if any
+                            let shading_normal = if let Some(normals) = self.normals {
+                                let n0_slice = &normals[(tri_indices.0 as usize
+                                    * self.time_sample_count)
+                                    ..((tri_indices.0 as usize + 1) * self.time_sample_count)];
+                                let n1_slice = &normals[(tri_indices.1 as usize
+                                    * self.time_sample_count)
+                                    ..((tri_indices.1 as usize + 1) * self.time_sample_count)];
+                                let n2_slice = &normals[(tri_indices.2 as usize
+                                    * self.time_sample_count)
+                                    ..((tri_indices.2 as usize + 1) * self.time_sample_count)];

-                                    let n0 = lerp_slice(n0_slice, wr.time).normalized();
-                                    let n1 = lerp_slice(n1_slice, wr.time).normalized();
-                                    let n2 = lerp_slice(n2_slice, wr.time).normalized();
+                                let n0 = lerp_slice(n0_slice, ray_time).normalized();
+                                let n1 = lerp_slice(n1_slice, ray_time).normalized();
+                                let n2 = lerp_slice(n2_slice, ray_time).normalized();

-                                    let s_nor = ((n0 * b0) + (n1 * b1) + (n2 * b2)) * mat_space;
-                                    if dot(s_nor, geo_normal) >= 0.0 {
-                                        s_nor
-                                    } else {
-                                        -s_nor
-                                    }
+                                let s_nor = ((n0 * b0) + (n1 * b1) + (n2 * b2)) * mat_space;
+                                if dot(s_nor, geo_normal) >= 0.0 {
+                                    s_nor
                                } else {
-                                    geo_normal
-                                };
+                                    -s_nor
+                                }
+                            } else {
+                                geo_normal
+                            };

-                                let intersection_data = SurfaceIntersectionData {
-                                    incoming: wr.dir,
-                                    t: t,
-                                    pos: pos,
-                                    pos_err: pos_err,
-                                    nor: shading_normal,
-                                    nor_g: geo_normal,
-                                    local_space: mat_space,
-                                    sample_pdf: 0.0,
-                                };
+                            let intersection_data = SurfaceIntersectionData {
+                                incoming: rays.dir_world[ray_idx],
+                                t: t,
+                                pos: pos,
+                                pos_err: pos_err,
+                                nor: shading_normal,
+                                nor_g: geo_normal,
+                                local_space: mat_space,
+                                sample_pdf: 0.0,
+                            };

-                                // Fill in intersection data
-                                isects[r.id as usize] = SurfaceIntersection::Hit {
-                                    intersection_data: intersection_data,
-                                    closure: shader.shade(&intersection_data, wr.time),
-                                };
-                                r.max_t = t;
-                            }
+                            // Fill in intersection data
+                            isects[ray_idx] = SurfaceIntersection::Hit {
+                                intersection_data: intersection_data,
+                                closure: shader.shade(&intersection_data, ray_time),
+                            };
+                            rays.max_t[ray_idx] = t;
                        }
                    }
-                }
-            });
+
+                    ([0, 0, 0, 0, 0, 0, 0, 0], 0)
+                });
+            },
+        );
    }
 }
--- a/src/tracer.rs
+++ b/src/tracer.rs
@ -1,10 +1,11 @@
 use std::iter;

 use crate::{
-    algorithm::partition,
+    accel::ray_code,
    color::{rec709_to_xyz, Color},
    lerp::lerp_slice,
-    ray::{AccelRay, Ray},
+    math::Matrix4x4,
+    ray::{RayBatch, RayStack},
    scene::{Assembly, InstanceType, Object},
    shading::{SimpleSurfaceShader, SurfaceShader},
    surface::SurfaceIntersection,
@ -12,14 +13,14 @@ use crate::{
 };

 pub struct Tracer<'a> {
-    rays: Vec<AccelRay>,
+    ray_stack: RayStack,
    inner: TracerInner<'a>,
 }

 impl<'a> Tracer<'a> {
    pub fn from_assembly(assembly: &'a Assembly) -> Tracer<'a> {
        Tracer {
-            rays: Vec::new(),
+            ray_stack: RayStack::new(),
            inner: TracerInner {
                root: assembly,
                xform_stack: TransformStack::new(),
@ -28,17 +29,8 @@ impl<'a> Tracer<'a> {
        }
    }

-    pub fn trace<'b>(&'b mut self, wrays: &[Ray]) -> &'b [SurfaceIntersection] {
-        self.rays.clear();
-        self.rays.reserve(wrays.len());
-        let mut ids = 0..(wrays.len() as u32);
-        self.rays.extend(
-            wrays
-                .iter()
-                .map(|wr| AccelRay::new(wr, ids.next().unwrap())),
-        );
-
-        self.inner.trace(wrays, &mut self.rays[..])
+    pub fn trace<'b>(&'b mut self, rays: &mut RayBatch) -> &'b [SurfaceIntersection] {
+        self.inner.trace(rays, &mut self.ray_stack)
    }
 }

@ -49,16 +41,37 @@ struct TracerInner<'a> {
 }

 impl<'a> TracerInner<'a> {
-    fn trace<'b>(&'b mut self, wrays: &[Ray], rays: &mut [AccelRay]) -> &'b [SurfaceIntersection] {
+    fn trace<'b>(
+        &'b mut self,
+        rays: &mut RayBatch,
+        ray_stack: &mut RayStack,
+    ) -> &'b [SurfaceIntersection] {
+        ray_stack.clear();
+
        // Ready the isects
        self.isects.clear();
-        self.isects.reserve(wrays.len());
+        self.isects.reserve(rays.len());
        self.isects
-            .extend(iter::repeat(SurfaceIntersection::Miss).take(wrays.len()));
+            .extend(iter::repeat(SurfaceIntersection::Miss).take(rays.len()));

-        let mut ray_sets = split_rays_by_direction(&mut rays[..]);
-        for ray_set in ray_sets.iter_mut().filter(|ray_set| !ray_set.is_empty()) {
-            self.trace_assembly(self.root, wrays, ray_set);
+        // Prep the accel part of the rays.
+        {
+            let ident = Matrix4x4::new();
+            for i in 0..rays.len() {
+                rays.update_accel(i, &ident);
+            }
+        }
+
+        // Divide the rays into 8 different lanes by direction.
+        ray_stack.ensure_lane_count(8);
+        for i in 0..rays.len() {
+            ray_stack.push_ray_index(i, ray_code(rays.dir_world[i]));
+        }
+        ray_stack.push_lanes_to_tasks(&[0, 1, 2, 3, 4, 5, 6, 7]);
+
+        // Trace each of the 8 lanes separately.
+        while !ray_stack.is_empty() {
+            self.trace_assembly(self.root, rays, ray_stack);
        }

        &self.isects
@ -67,82 +80,44 @@ impl<'a> TracerInner<'a> {
    fn trace_assembly<'b>(
        &'b mut self,
        assembly: &Assembly,
-        wrays: &[Ray],
-        accel_rays: &mut [AccelRay],
+        rays: &mut RayBatch,
+        ray_stack: &mut RayStack,
    ) {
-        assembly
-            .object_accel
-            .traverse(&mut accel_rays[..], &assembly.instances[..], |inst, rs| {
+        assembly.object_accel.traverse(
+            rays,
+            ray_stack,
+            &assembly.instances[..],
+            |inst, rays, ray_stack| {
                // Transform rays if needed
                if let Some((xstart, xend)) = inst.transform_indices {
                    // Push transforms to stack
                    self.xform_stack.push(&assembly.xforms[xstart..xend]);

                    // Do transforms
+                    // TODO: re-divide rays based on direction (maybe?).
                    let xforms = self.xform_stack.top();
-                    for ray in &mut rs[..] {
-                        let id = ray.id;
-                        let t = ray.time;
-                        ray.update_from_xformed_world_ray(
-                            &wrays[id as usize],
-                            &lerp_slice(xforms, t),
-                        );
-                    }
+                    ray_stack.pop_do_next_task(2, |ray_idx| {
+                        let t = rays.time[ray_idx];
+                        rays.update_accel(ray_idx, &lerp_slice(xforms, t));
+                        ([0, 1, 2, 3, 4, 5, 6, 7], 2)
+                    });
+                    ray_stack.push_lanes_to_tasks(&[0, 1]);
                }

                // Trace rays
-                {
-                    // This is kind of weird looking, but what we're doing here is
-                    // splitting the rays up based on direction if they were
-                    // transformed, and not splitting them up if they weren't
-                    // transformed.
-                    // But to keep the actual tracing code in one place (DRY),
-                    // we map both cases to an array slice that contains slices of
-                    // ray arrays.  Gah... that's confusing even when explained.
-                    // TODO: do this in a way that's less confusing.  Probably split
-                    // the tracing code out into a trace_instance() method or
-                    // something.
-                    let mut tmp = if inst.transform_indices.is_some() {
-                        split_rays_by_direction(rs)
-                    } else {
-                        [
-                            &mut rs[..],
-                            &mut [],
-                            &mut [],
-                            &mut [],
-                            &mut [],
-                            &mut [],
-                            &mut [],
-                            &mut [],
-                        ]
-                    };
-                    let ray_sets = if inst.transform_indices.is_some() {
-                        &mut tmp[..]
-                    } else {
-                        &mut tmp[..1]
-                    };
+                match inst.instance_type {
+                    InstanceType::Object => {
+                        self.trace_object(
+                            &assembly.objects[inst.data_index],
+                            inst.surface_shader_index
+                                .map(|i| assembly.surface_shaders[i]),
+                            rays,
+                            ray_stack,
+                        );
+                    }

-                    // Loop through the split ray slices and trace them
-                    for ray_set in ray_sets.iter_mut().filter(|ray_set| !ray_set.is_empty()) {
-                        match inst.instance_type {
-                            InstanceType::Object => {
-                                self.trace_object(
-                                    &assembly.objects[inst.data_index],
-                                    inst.surface_shader_index
-                                        .map(|i| assembly.surface_shaders[i]),
-                                    wrays,
-                                    ray_set,
-                                );
-                            }
-
-                            InstanceType::Assembly => {
-                                self.trace_assembly(
-                                    &assembly.assemblies[inst.data_index],
-                                    wrays,
-                                    ray_set,
-                                );
-                            }
-                        }
+                    InstanceType::Assembly => {
+                        self.trace_assembly(&assembly.assemblies[inst.data_index], rays, ray_stack);
                    }
                }

@ -154,30 +129,29 @@ impl<'a> TracerInner<'a> {
                    // Undo transforms
                    let xforms = self.xform_stack.top();
                    if !xforms.is_empty() {
-                        for ray in &mut rs[..] {
-                            let id = ray.id;
-                            let t = ray.time;
-                            ray.update_from_xformed_world_ray(
-                                &wrays[id as usize],
-                                &lerp_slice(xforms, t),
-                            );
-                        }
+                        ray_stack.pop_do_next_task(0, |ray_idx| {
+                            let t = rays.time[ray_idx];
+                            rays.update_accel(ray_idx, &lerp_slice(xforms, t));
+                            ([0, 1, 2, 3, 4, 5, 6, 7], 0)
+                        });
                    } else {
-                        for ray in &mut rs[..] {
-                            let id = ray.id;
-                            ray.update_from_world_ray(&wrays[id as usize]);
-                        }
+                        let ident = Matrix4x4::new();
+                        ray_stack.pop_do_next_task(0, |ray_idx| {
+                            rays.update_accel(ray_idx, &ident);
+                            ([0, 1, 2, 3, 4, 5, 6, 7], 0)
+                        });
                    }
                }
-            });
+            },
+        );
    }

    fn trace_object<'b>(
        &'b mut self,
        obj: &Object,
        surface_shader: Option<&SurfaceShader>,
-        wrays: &[Ray],
-        rays: &mut [AccelRay],
+        rays: &mut RayBatch,
+        ray_stack: &mut RayStack,
    ) {
        match *obj {
            Object::Surface(surface) => {
@ -188,7 +162,7 @@ impl<'a> TracerInner<'a> {

                surface.intersect_rays(
                    rays,
-                    wrays,
+                    ray_stack,
                    &mut self.isects,
                    shader,
                    self.xform_stack.top(),
@ -203,7 +177,7 @@ impl<'a> TracerInner<'a> {

                surface.intersect_rays(
                    rays,
-                    wrays,
+                    ray_stack,
                    &mut self.isects,
                    &bogus_shader,
                    self.xform_stack.top(),
@ -212,27 +186,3 @@ impl<'a> TracerInner<'a> {
        }
    }
 }
-
-fn split_rays_by_direction(rays: &mut [AccelRay]) -> [&mut [AccelRay]; 8] {
-    // |   |   |   |   |   |   |   |   |
-    //     s1  s2  s3  s4  s5  s6  s7
-    let s4 = partition(&mut rays[..], |r| r.dir_inv.x() >= 0.0);
-
-    let s2 = partition(&mut rays[..s4], |r| r.dir_inv.y() >= 0.0);
-    let s6 = s4 + partition(&mut rays[s4..], |r| r.dir_inv.y() >= 0.0);
-
-    let s1 = partition(&mut rays[..s2], |r| r.dir_inv.z() >= 0.0);
-    let s3 = s2 + partition(&mut rays[s2..s4], |r| r.dir_inv.z() >= 0.0);
-    let s5 = s4 + partition(&mut rays[s4..s6], |r| r.dir_inv.z() >= 0.0);
-    let s7 = s6 + partition(&mut rays[s6..], |r| r.dir_inv.z() >= 0.0);
-
-    let (rest, rs7) = rays.split_at_mut(s7);
-    let (rest, rs6) = rest.split_at_mut(s6);
-    let (rest, rs5) = rest.split_at_mut(s5);
-    let (rest, rs4) = rest.split_at_mut(s4);
-    let (rest, rs3) = rest.split_at_mut(s3);
-    let (rest, rs2) = rest.split_at_mut(s2);
-    let (rs0, rs1) = rest.split_at_mut(s1);
-
-    [rs0, rs1, rs2, rs3, rs4, rs5, rs6, rs7]
-}