From 8ca6e27f39d5adf54e7e8daf23dcf2308531f5e7 Mon Sep 17 00:00:00 2001
From: Nathan Vegdahl <cessen@cessen.com>
Date: Sun, 31 Jul 2022 17:50:54 -0700
Subject: [PATCH] WIP do depth-first instead of breadth-first ray tracing.

Currently totally broken.
---
 src/accel/bvh.rs               |  86 ++++++-
 src/accel/bvh4.rs              |  76 +++---
 src/bbox4.rs                   |  25 +-
 src/camera.rs                  |  15 +-
 src/light/rectangle_light.rs   | 134 +++++------
 src/light/sphere_light.rs      | 233 +++++++++----------
 src/main.rs                    |   2 +-
 src/ray.rs                     | 414 +++++----------------------------
 src/renderer.rs                | 216 ++++++++---------
 src/scene/assembly.rs          |  86 +++----
 src/scene/mod.rs               |  11 +-
 src/surface/mod.rs             |  15 +-
 src/surface/triangle_mesh.rs   | 292 +++++++++--------------
 src/tracer.rs                  | 222 +++++-------------
 src/transform_stack.rs         |  75 +-----
 sub_crates/rmath/src/vector.rs |   5 +
 16 files changed, 684 insertions(+), 1223 deletions(-)

diff --git a/src/accel/bvh.rs b/src/accel/bvh.rs
index d813ebb..ecf5aaa 100644
--- a/src/accel/bvh.rs
+++ b/src/accel/bvh.rs
@@ -67,7 +67,91 @@ impl<'a> BVH<'a> {
         self.depth
     }
 
-    pub fn traverse<T, F>(&self, rays: &mut [AccelRay], objects: &[T], mut obj_ray_test: F)
+    pub fn traverse<T, F>(&self, ray: &mut AccelRay, objects: &[T], mut obj_ray_test: F)
+    where
+        F: FnMut(&T, &mut AccelRay),
+    {
+        if self.root.is_none() {
+            return; // No root node, so there is nothing to traverse.
+        }
+        // Single-ray traversal over an explicit node stack; ends early once `ray.is_done` is set.
+        let mut timer = Timer::new();
+        let mut trav_time: f64 = 0.0; // Sum of the `timer.tick()` values banked below.
+        let mut node_tests: u64 = 0; // One per loop iteration, i.e. per node bounds test.
+        // Per axis: true when the ray's inverse direction component is non-negative.
+        let ray_sign = [
+            ray.dir_inv.x() >= 0.0,
+            ray.dir_inv.y() >= 0.0,
+            ray.dir_inv.z() >= 0.0,
+        ];
+
+        // Stack capacity is max depth + 2, leaving room for the root and the last child.
+        let mut node_stack = [self.root.unwrap(); BVH_MAX_DEPTH + 2];
+        let mut stack_ptr = 1; // Top of stack; slot 0 is never read since the loop ends at 0.
+        // NOTE(review): the `unsafe` slices below assume `bounds_start`/`bounds_len` are valid -- confirm.
+        while stack_ptr > 0 && !ray.is_done {
+            node_tests += 1;
+            match *node_stack[stack_ptr] {
+                BVHNode::Internal {
+                    children,
+                    bounds_start,
+                    bounds_len,
+                    split_axis,
+                } => {
+                    let bounds =
+                        unsafe { std::slice::from_raw_parts(bounds_start, bounds_len as usize) };
+                    let is_hit = lerp_slice(bounds, ray.time).intersect_accel_ray(&ray);
+                    // On a hit, replace this node with both children; `stack_ptr + 1` is visited next.
+                    if is_hit {
+                        if ray_sign[split_axis as usize] {
+                            node_stack[stack_ptr] = children.1;
+                            node_stack[stack_ptr + 1] = children.0;
+                        } else {
+                            node_stack[stack_ptr] = children.0;
+                            node_stack[stack_ptr + 1] = children.1;
+                        }
+                        stack_ptr += 1; // Net growth of one: this node's own slot was reused.
+                    } else {
+                        stack_ptr -= 1; // Miss: discard this node.
+                    }
+                }
+
+                BVHNode::Leaf {
+                    object_range,
+                    bounds_start,
+                    bounds_len,
+                } => {
+                    let bounds =
+                        unsafe { std::slice::from_raw_parts(bounds_start, bounds_len as usize) };
+                    let is_hit = lerp_slice(bounds, ray.time).intersect_accel_ray(&ray);
+                    // Bank the traversal time before the object tests run.
+                    trav_time += timer.tick() as f64;
+
+                    if is_hit {
+                        for obj in &objects[object_range.0..object_range.1] { // (start, end) indices
+                            obj_ray_test(obj, ray);
+                        }
+                    }
+                    // Result discarded -- presumably so object-test time is not counted; confirm `tick`.
+                    timer.tick();
+
+                    stack_ptr -= 1; // Done with this leaf.
+                }
+            }
+        }
+        // Fold this call's totals into the ACCEL_TRAV_TIME and ACCEL_NODE_RAY_TESTS accumulators.
+        trav_time += timer.tick() as f64;
+        ACCEL_TRAV_TIME.with(|att| {
+            let v = att.get();
+            att.set(v + trav_time);
+        });
+        ACCEL_NODE_RAY_TESTS.with(|anv| {
+            let v = anv.get();
+            anv.set(v + node_tests);
+        });
+    }
+
+    pub fn traverse_multi<T, F>(&self, rays: &mut [AccelRay], objects: &[T], mut obj_ray_test: F)
     where
         F: FnMut(&T, &mut [AccelRay]),
     {
diff --git a/src/accel/bvh4.rs b/src/accel/bvh4.rs
index 3c3b169..c89f7ad 100644
--- a/src/accel/bvh4.rs
+++ b/src/accel/bvh4.rs
@@ -6,8 +6,6 @@
 
 use std::mem::{transmute, MaybeUninit};
 
-use rmath::wide4::Bool4;
-
 use kioku::Arena;
 
 use crate::{
@@ -16,7 +14,7 @@ use crate::{
     boundable::Boundable,
     lerp::lerp_slice,
     math::Vector,
-    ray::{RayBatch, RayStack},
+    ray::{LocalRay, Ray},
 };
 
 use super::{
@@ -25,6 +23,7 @@ use super::{
 };
 
 use bvh_order::{calc_traversal_code, SplitAxes, TRAVERSAL_TABLE};
+use rmath::wide4::Float4;
 
 pub fn ray_code(dir: Vector) -> usize {
     let ray_sign_is_neg = [dir.x() < 0.0, dir.y() < 0.0, dir.z() < 0.0];
@@ -33,6 +32,8 @@ pub fn ray_code(dir: Vector) -> usize {
         + ((ray_sign_is_neg[2] as usize) << 2)
 }
 
+//-------------------------------------------------------------
+
 #[derive(Copy, Clone, Debug)]
 pub struct BVH4<'a> {
     root: Option<&'a BVH4Node<'a>>,
@@ -98,9 +99,9 @@ impl<'a> BVH4<'a> {
         self.depth
     }
 
-    pub fn traverse<F>(&self, rays: &mut RayBatch, ray_stack: &mut RayStack, mut obj_ray_test: F)
+    pub fn traverse<F>(&self, ray: &mut Ray, local_ray: &LocalRay, mut obj_ray_test: F)
     where
-        F: FnMut(std::ops::Range<usize>, &mut RayBatch, &mut RayStack),
+        F: FnMut(std::ops::Range<usize>, &mut Ray),
     {
         if self.root.is_none() {
             return;
@@ -108,55 +109,48 @@ impl<'a> BVH4<'a> {
 
         let mut node_tests: u64 = 0;
 
-        let traversal_table =
-            &TRAVERSAL_TABLE[ray_code(rays.dir_inv_local(ray_stack.next_task_ray_idx(0)))];
+        // SIMD-ready ray data.
+        let orig4 = [
+            local_ray.orig.0.aaaa(),
+            local_ray.orig.0.bbbb(),
+            local_ray.orig.0.cccc(),
+        ];
+        let dir_inv4 = [
+            local_ray.dir_inv.0.aaaa(),
+            local_ray.dir_inv.0.bbbb(),
+            local_ray.dir_inv.0.cccc(),
+        ];
+        let mut max_t4 = Float4::splat(ray.max_t);
 
         // +2 of max depth for root and last child
         let mut node_stack = [self.root.unwrap(); (BVH_MAX_DEPTH * 3) + 2];
         let mut stack_ptr = 1;
 
-        while stack_ptr > 0 {
+        let traversal_table = &TRAVERSAL_TABLE[ray_code(local_ray.dir_inv)];
+
+        while stack_ptr > 0 && !ray.is_done() {
             match *node_stack[stack_ptr] {
                 BVH4Node::Internal {
                     bounds,
                     children,
                     traversal_code,
                 } => {
-                    node_tests += ray_stack.ray_count_in_next_task() as u64;
-                    let mut all_hits = Bool4::new_false();
+                    node_tests += 1;
 
-                    // Ray testing
-                    ray_stack.pop_do_next_task_and_push_rays(children.len(), |ray_idx| {
-                        if rays.is_done(ray_idx) {
-                            Bool4::new_false()
-                        } else {
-                            let hits = if bounds.len() == 1 {
-                                bounds[0].intersect_ray(
-                                    rays.orig_local(ray_idx),
-                                    rays.dir_inv_local(ray_idx),
-                                    rays.max_t(ray_idx),
-                                )
-                            } else {
-                                lerp_slice(bounds, rays.time(ray_idx)).intersect_ray(
-                                    rays.orig_local(ray_idx),
-                                    rays.dir_inv_local(ray_idx),
-                                    rays.max_t(ray_idx),
-                                )
-                            };
-                            all_hits |= hits;
-                            hits
-                        }
-                    });
+                    let hits = if bounds.len() == 1 {
+                        bounds[0].intersect_ray(orig4, dir_inv4, max_t4)
+                    } else {
+                        lerp_slice(bounds, ray.time).intersect_ray(orig4, dir_inv4, max_t4)
+                    };
 
-                    // If there were any intersections, create tasks.
-                    if all_hits.any() {
+                    // Push child nodes onto the stack if there were any hits.
+                    if hits.any() {
                         let order_code = traversal_table[traversal_code as usize];
+                        let hits = hits.to_bools();
                         let mut lane_count = 0;
-                        let mut i = children.len() as u8;
-                        while i > 0 {
-                            i -= 1;
+                        for i in (0..children.len() as u8).rev() {
                             let child_i = ((order_code >> (i * 2)) & 3) as usize;
-                            if ray_stack.push_lane_to_task(child_i) {
+                            if hits[child_i] {
                                 node_stack[stack_ptr + lane_count] = &children[child_i];
                                 lane_count += 1;
                             }
@@ -169,8 +163,10 @@ impl<'a> BVH4<'a> {
                 }
 
                 BVH4Node::Leaf { object_range } => {
-                    // Do the ray tests.
-                    obj_ray_test(object_range.0..object_range.1, rays, ray_stack);
+                    obj_ray_test(object_range.0..object_range.1, ray);
+
+                    // Update SIMD max_t in case there was a hit.
+                    max_t4 = Float4::splat(ray.max_t);
 
                     stack_ptr -= 1;
                 }
diff --git a/src/bbox4.rs b/src/bbox4.rs
index c0e5861..f313bd7 100644
--- a/src/bbox4.rs
+++ b/src/bbox4.rs
@@ -6,7 +6,6 @@ use std::ops::{BitOr, BitOrAssign};
 use crate::{
     bbox::BBox,
     lerp::{lerp, Lerp},
-    math::{Point, Vector},
 };
 
 use rmath::wide4::{Bool4, Float4};
@@ -60,23 +59,15 @@ impl BBox4 {
     }
 
     // Returns whether the given ray intersects with the bboxes.
-    pub fn intersect_ray(&self, orig: Point, dir_inv: Vector, max_t: f32) -> Bool4 {
-        // Get the ray data into SIMD format.
-        let ro_x = orig.0.aaaa();
-        let ro_y = orig.0.bbbb();
-        let ro_z = orig.0.cccc();
-        let rdi_x = dir_inv.0.aaaa();
-        let rdi_y = dir_inv.0.bbbb();
-        let rdi_z = dir_inv.0.cccc();
-        let max_t = Float4::splat(max_t);
-
+    #[inline(always)]
+    pub fn intersect_ray(&self, orig: [Float4; 3], dir_inv: [Float4; 3], max_t: Float4) -> Bool4 {
         // Slab tests
-        let t1_x = (self.x.0 - ro_x) * rdi_x;
-        let t1_y = (self.y.0 - ro_y) * rdi_y;
-        let t1_z = (self.z.0 - ro_z) * rdi_z;
-        let t2_x = (self.x.1 - ro_x) * rdi_x;
-        let t2_y = (self.y.1 - ro_y) * rdi_y;
-        let t2_z = (self.z.1 - ro_z) * rdi_z;
+        let t1_x = (self.x.0 - orig[0]) * dir_inv[0];
+        let t1_y = (self.y.0 - orig[1]) * dir_inv[1];
+        let t1_z = (self.z.0 - orig[2]) * dir_inv[2];
+        let t2_x = (self.x.1 - orig[0]) * dir_inv[0];
+        let t2_y = (self.y.1 - orig[1]) * dir_inv[1];
+        let t2_z = (self.z.1 - orig[2]) * dir_inv[2];
 
         // Get the far and near t hits for each axis.
         let t_far_x = t1_x.max(t2_x);
diff --git a/src/camera.rs b/src/camera.rs
index 7dbd3b9..2903d38 100644
--- a/src/camera.rs
+++ b/src/camera.rs
@@ -92,12 +92,13 @@ impl<'a> Camera<'a> {
         )
         .normalized();
 
-        Ray {
-            orig: orig.xform_inv_fast(&transform),
-            dir: dir.xform_inv_fast(&transform),
-            time: time,
-            wavelength: wavelength,
-            max_t: std::f32::INFINITY,
-        }
+        Ray::new(
+            orig.xform_inv_fast(&transform),
+            dir.xform_inv_fast(&transform),
+            time,
+            wavelength,
+            std::f32::INFINITY,
+            false,
+        )
     }
 }
diff --git a/src/light/rectangle_light.rs b/src/light/rectangle_light.rs
index 73c3b14..ae31f6c 100644
--- a/src/light/rectangle_light.rs
+++ b/src/light/rectangle_light.rs
@@ -5,8 +5,8 @@ use crate::{
     boundable::Boundable,
     color::{Color, SpectralSample},
     lerp::lerp_slice,
-    math::{cross, dot, Normal, Point, Vector, Xform, XformFull},
-    ray::{RayBatch, RayStack},
+    math::{cross, dot, Normal, Point, Vector, XformFull},
+    ray::{LocalRay, Ray},
     sampling::{
         spherical_triangle_solid_angle, triangle_surface_area, uniform_sample_spherical_triangle,
         uniform_sample_triangle,
@@ -251,89 +251,77 @@ impl<'a> SurfaceLight for RectangleLight<'a> {
 }
 
 impl<'a> Surface for RectangleLight<'a> {
-    fn intersect_rays(
+    fn intersect_ray(
         &self,
-        rays: &mut RayBatch,
-        ray_stack: &mut RayStack,
-        isects: &mut [SurfaceIntersection],
-        shader: &dyn SurfaceShader,
-        space: &[Xform],
+        ray: &mut Ray,
+        _local_ray: &LocalRay,
+        space: &XformFull,
+        isect: &mut SurfaceIntersection,
+        _shader: &dyn SurfaceShader,
     ) {
-        let _ = shader; // Silence 'unused' warning
+        let time = ray.time;
 
-        ray_stack.pop_do_next_task(|ray_idx| {
-            let time = rays.time(ray_idx);
-            let orig = rays.orig(ray_idx);
-            let dir = rays.dir(ray_idx);
-            let max_t = rays.max_t(ray_idx);
+        // Calculate time interpolated values.
+        let dim = lerp_slice(self.dimensions, time);
 
-            // Calculate time interpolated values
-            let dim = lerp_slice(self.dimensions, time);
-            let xform = lerp_slice(space, time);
+        // Get the four corners of the rectangle, transformed into world space.
+        let p1 = Point::new(dim.0 * 0.5, dim.1 * 0.5, 0.0).xform(space);
+        let p2 = Point::new(dim.0 * -0.5, dim.1 * 0.5, 0.0).xform(space);
+        let p3 = Point::new(dim.0 * -0.5, dim.1 * -0.5, 0.0).xform(space);
+        let p4 = Point::new(dim.0 * 0.5, dim.1 * -0.5, 0.0).xform(space);
 
-            let space = if let Some(xform) = xform.to_full() {
-                xform
-            } else {
-                return;
-            };
+        // Test against two triangles that make up the light.
+        let ray_pre = triangle::RayTriPrecompute::new(ray.dir);
+        for tri in &[(p1, p2, p3), (p3, p4, p1)] {
+            if let Some((t, b0, b1, b2)) =
+                triangle::intersect_ray(ray.orig, ray_pre, ray.max_t, *tri)
+            {
+                if t < ray.max_t {
+                    if ray.is_occlusion() {
+                        *isect = SurfaceIntersection::Occlude;
+                        ray.mark_done();
+                        return;
+                    } else {
+                        let (pos, pos_err) = triangle::surface_point(*tri, (b0, b1, b2));
+                        let normal = cross(tri.0 - tri.1, tri.0 - tri.2).into_normal();
 
-            // Get the four corners of the rectangle, transformed into world space
-            let p1 = Point::new(dim.0 * 0.5, dim.1 * 0.5, 0.0).xform(&space);
-            let p2 = Point::new(dim.0 * -0.5, dim.1 * 0.5, 0.0).xform(&space);
-            let p3 = Point::new(dim.0 * -0.5, dim.1 * -0.5, 0.0).xform(&space);
-            let p4 = Point::new(dim.0 * 0.5, dim.1 * -0.5, 0.0).xform(&space);
+                        let intersection_data = SurfaceIntersectionData {
+                            incoming: ray.dir,
+                            t: t,
+                            pos: pos,
+                            pos_err: pos_err,
+                            nor: normal,
+                            nor_g: normal,
+                            local_space: *space,
+                            sample_pdf: self.sample_pdf(
+                                space,
+                                ray.orig,
+                                ray.dir,
+                                pos,
+                                ray.wavelength,
+                                time,
+                            ),
+                        };
 
-            // Test against two triangles that make up the light
-            let ray_pre = triangle::RayTriPrecompute::new(dir);
-            for tri in &[(p1, p2, p3), (p3, p4, p1)] {
-                if let Some((t, b0, b1, b2)) = triangle::intersect_ray(orig, ray_pre, max_t, *tri) {
-                    if t < max_t {
-                        if rays.is_occlusion(ray_idx) {
-                            isects[ray_idx] = SurfaceIntersection::Occlude;
-                            rays.mark_done(ray_idx);
-                        } else {
-                            let (pos, pos_err) = triangle::surface_point(*tri, (b0, b1, b2));
-                            let normal = cross(tri.0 - tri.1, tri.0 - tri.2).into_normal();
+                        let closure = {
+                            let inv_surface_area = (1.0 / (dim.0 as f64 * dim.1 as f64)) as f32;
+                            let color = lerp_slice(self.colors, time) * inv_surface_area;
+                            SurfaceClosure::Emit(color)
+                        };
 
-                            let intersection_data = SurfaceIntersectionData {
-                                incoming: dir,
-                                t: t,
-                                pos: pos,
-                                pos_err: pos_err,
-                                nor: normal,
-                                nor_g: normal,
-                                local_space: space,
-                                sample_pdf: self.sample_pdf(
-                                    &space,
-                                    orig,
-                                    dir,
-                                    pos,
-                                    rays.wavelength(ray_idx),
-                                    time,
-                                ),
-                            };
+                        // Fill in intersection.
+                        *isect = SurfaceIntersection::Hit {
+                            intersection_data: intersection_data,
+                            closure: closure,
+                        };
 
-                            let closure = {
-                                let inv_surface_area = (1.0 / (dim.0 as f64 * dim.1 as f64)) as f32;
-                                let color = lerp_slice(self.colors, time) * inv_surface_area;
-                                SurfaceClosure::Emit(color)
-                            };
-
-                            // Fill in intersection
-                            isects[ray_idx] = SurfaceIntersection::Hit {
-                                intersection_data: intersection_data,
-                                closure: closure,
-                            };
-
-                            // Set ray's max t
-                            rays.set_max_t(ray_idx, t);
-                        }
-
-                        break;
+                        ray.max_t = t;
                     }
+
+                    break;
                 }
             }
-        });
+        }
     }
 }
 
diff --git a/src/light/sphere_light.rs b/src/light/sphere_light.rs
index 8da8515..4763a56 100644
--- a/src/light/sphere_light.rs
+++ b/src/light/sphere_light.rs
@@ -7,8 +7,8 @@ use crate::{
     boundable::Boundable,
     color::{Color, SpectralSample},
     lerp::lerp_slice,
-    math::{coordinate_system_from_vector, dot, Normal, Point, Vector, Xform, XformFull},
-    ray::{RayBatch, RayStack},
+    math::{coordinate_system_from_vector, dot, Normal, Point, Vector, XformFull},
+    ray::{LocalRay, Ray},
     sampling::{uniform_sample_cone, uniform_sample_cone_pdf, uniform_sample_sphere},
     shading::surface_closure::SurfaceClosure,
     shading::SurfaceShader,
@@ -201,139 +201,122 @@ impl<'a> SurfaceLight for SphereLight<'a> {
 }
 
 impl<'a> Surface for SphereLight<'a> {
-    fn intersect_rays(
+    fn intersect_ray(
         &self,
-        rays: &mut RayBatch,
-        ray_stack: &mut RayStack,
-        isects: &mut [SurfaceIntersection],
-        shader: &dyn SurfaceShader,
-        space: &[Xform],
+        ray: &mut Ray,
+        local_ray: &LocalRay,
+        space: &XformFull,
+        isect: &mut SurfaceIntersection,
+        _shader: &dyn SurfaceShader,
     ) {
-        let _ = shader; // Silence 'unused' warning
+        let time = ray.time;
 
-        ray_stack.pop_do_next_task(|ray_idx| {
-            let time = rays.time(ray_idx);
+        // Get the radius of the sphere at the ray's time
+        let radius = lerp_slice(self.radii, time); // Radius of the sphere
 
-            // Get the transform space
-            let xform = if let Some(xform) = lerp_slice(space, time).to_full() {
-                xform
-            } else {
-                return;
+        // Code adapted to Rust from https://github.com/Tecla/Rayito
+        // Ray-sphere intersection can result in either zero, one or two points
+        // of intersection.  It turns into a quadratic equation, so we just find
+        // the solution using the quadratic formula.  Note that there is a
+        // slightly more stable form of it when computing it on a computer, and
+        // we use that method to keep everything accurate.
+
+        // Calculate quadratic coeffs
+        let a = local_ray.dir.length2();
+        let b = 2.0 * dot(local_ray.dir, local_ray.orig.into_vector());
+        let c = local_ray.orig.into_vector().length2() - (radius * radius);
+
+        let discriminant = (b * b) - (4.0 * a * c);
+        if discriminant < 0.0 {
+            // Discriminant less than zero?  No solution => no intersection.
+            return;
+        }
+        let discriminant = discriminant.sqrt();
+
+        // Compute a more stable form of our param t (t0 = q/a, t1 = c/q)
+        // q = -0.5 * (b - sqrt(b * b - 4.0 * a * c)) if b < 0, or
+        // q = -0.5 * (b + sqrt(b * b - 4.0 * a * c)) if b >= 0
+        let q = if b < 0.0 {
+            -0.5 * (b - discriminant)
+        } else {
+            -0.5 * (b + discriminant)
+        };
+
+        // Get our final parametric values
+        let mut t0 = q / a;
+        let mut t1 = if q != 0.0 { c / q } else { ray.max_t };
+
+        // Swap them so they are ordered right
+        if t0 > t1 {
+            use std::mem::swap;
+            swap(&mut t0, &mut t1);
+        }
+
+        // Check our intersection for validity against this ray's extents
+        if t0 > ray.max_t || t1 <= 0.0 {
+            // Didn't hit because sphere is entirely outside of ray's extents
+            return;
+        }
+
+        let t = if t0 > 0.0 {
+            t0
+        } else if t1 <= ray.max_t {
+            t1
+        } else {
+            // Didn't hit because ray is entirely within the sphere, and
+            // therefore doesn't hit its surface.
+            return;
+        };
+
+        // We hit the sphere, so calculate intersection info.
+        if ray.is_occlusion() {
+            *isect = SurfaceIntersection::Occlude;
+            ray.mark_done();
+        } else {
+            // Position is calculated from the local-space ray and t, and then
+            // re-projected onto the surface of the sphere.
+            let t_pos = local_ray.orig + (local_ray.dir * t);
+            let unit_pos = t_pos.into_vector().normalized();
+            let pos = (unit_pos * radius).xform(space).into_point();
+
+            // TODO: proper error bounds.
+            let pos_err = 0.001;
+
+            let normal = unit_pos.into_normal().xform_fast(space);
+
+            let intersection_data = SurfaceIntersectionData {
+                incoming: ray.dir,
+                t: t,
+                pos: pos,
+                pos_err: pos_err,
+                nor: normal,
+                nor_g: normal,
+                local_space: *space,
+                sample_pdf: self.sample_pdf(
+                    space,
+                    ray.orig,
+                    ray.dir,
+                    0.0,
+                    0.0,
+                    ray.wavelength,
+                    time,
+                ),
             };
 
-            // Get the radius of the sphere at the ray's time
-            let radius = lerp_slice(self.radii, time); // Radius of the sphere
-
-            // Get the ray origin and direction in local space
-            let orig = rays.orig_local(ray_idx).into_vector();
-            let dir = rays.dir(ray_idx).xform_inv(&xform);
-
-            // Code adapted to Rust from https://github.com/Tecla/Rayito
-            // Ray-sphere intersection can result in either zero, one or two points
-            // of intersection.  It turns into a quadratic equation, so we just find
-            // the solution using the quadratic formula.  Note that there is a
-            // slightly more stable form of it when computing it on a computer, and
-            // we use that method to keep everything accurate.
-
-            // Calculate quadratic coeffs
-            let a = dir.length2();
-            let b = 2.0 * dot(dir, orig);
-            let c = orig.length2() - (radius * radius);
-
-            let discriminant = (b * b) - (4.0 * a * c);
-            if discriminant < 0.0 {
-                // Discriminant less than zero?  No solution => no intersection.
-                return;
-            }
-            let discriminant = discriminant.sqrt();
-
-            // Compute a more stable form of our param t (t0 = q/a, t1 = c/q)
-            // q = -0.5 * (b - sqrt(b * b - 4.0 * a * c)) if b < 0, or
-            // q = -0.5 * (b + sqrt(b * b - 4.0 * a * c)) if b >= 0
-            let q = if b < 0.0 {
-                -0.5 * (b - discriminant)
-            } else {
-                -0.5 * (b + discriminant)
+            let closure = {
+                let inv_surface_area = (1.0 / (4.0 * PI_64 * radius as f64 * radius as f64)) as f32;
+                let color = lerp_slice(self.colors, time) * inv_surface_area;
+                SurfaceClosure::Emit(color)
             };
 
-            // Get our final parametric values
-            let mut t0 = q / a;
-            let mut t1 = if q != 0.0 { c / q } else { rays.max_t(ray_idx) };
-
-            // Swap them so they are ordered right
-            if t0 > t1 {
-                use std::mem::swap;
-                swap(&mut t0, &mut t1);
-            }
-
-            // Check our intersection for validity against this ray's extents
-            if t0 > rays.max_t(ray_idx) || t1 <= 0.0 {
-                // Didn't hit because sphere is entirely outside of ray's extents
-                return;
-            }
-
-            let t = if t0 > 0.0 {
-                t0
-            } else if t1 <= rays.max_t(ray_idx) {
-                t1
-            } else {
-                // Didn't hit because ray is entirely within the sphere, and
-                // therefore doesn't hit its surface.
-                return;
+            // Fill in intersection
+            *isect = SurfaceIntersection::Hit {
+                intersection_data: intersection_data,
+                closure: closure,
             };
 
-            // We hit the sphere, so calculate intersection info.
-            if rays.is_occlusion(ray_idx) {
-                isects[ray_idx] = SurfaceIntersection::Occlude;
-                rays.mark_done(ray_idx);
-            } else {
-                // Position is calculated from the local-space ray and t, and then
-                // re-projected onto the surface of the sphere.
-                let t_pos = orig + (dir * t);
-                let unit_pos = t_pos.normalized();
-                let pos = (unit_pos * radius).xform(&xform).into_point();
-
-                // TODO: proper error bounds.
-                let pos_err = 0.001;
-
-                let normal = unit_pos.into_normal().xform_fast(&xform);
-
-                let intersection_data = SurfaceIntersectionData {
-                    incoming: rays.dir(ray_idx),
-                    t: t,
-                    pos: pos,
-                    pos_err: pos_err,
-                    nor: normal,
-                    nor_g: normal,
-                    local_space: xform,
-                    sample_pdf: self.sample_pdf(
-                        &xform,
-                        rays.orig(ray_idx),
-                        rays.dir(ray_idx),
-                        0.0,
-                        0.0,
-                        rays.wavelength(ray_idx),
-                        time,
-                    ),
-                };
-
-                let closure = {
-                    let inv_surface_area =
-                        (1.0 / (4.0 * PI_64 * radius as f64 * radius as f64)) as f32;
-                    let color = lerp_slice(self.colors, time) * inv_surface_area;
-                    SurfaceClosure::Emit(color)
-                };
-
-                // Fill in intersection
-                isects[ray_idx] = SurfaceIntersection::Hit {
-                    intersection_data: intersection_data,
-                    closure: closure,
-                };
-
-                // Set ray's max t
-                rays.set_max_t(ray_idx, t);
-            }
-        });
+            ray.max_t = t;
+        }
     }
 }
 
diff --git a/src/main.rs b/src/main.rs
index 1627931..eec57db 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -39,7 +39,7 @@ mod space_fill;
 mod surface;
 mod timer;
 mod tracer;
-mod transform_stack;
+// mod transform_stack;
 
 use std::{fs::File, io, io::Read, mem, path::Path, str::FromStr};
 
diff --git a/src/ray.rs b/src/ray.rs
index 035cdf2..45d476b 100644
--- a/src/ray.rs
+++ b/src/ray.rs
@@ -1,16 +1,11 @@
 #![allow(dead_code)]
 
-use rmath::wide4::Bool4;
-
 use crate::math::{Point, Vector, XformFull};
 
-type RayIndexType = u16;
 type FlagType = u8;
 const OCCLUSION_FLAG: FlagType = 1;
 const DONE_FLAG: FlagType = 1 << 1;
 
-/// This is never used directly in ray tracing--it's only used as a convenience
-/// for filling the RayBatch structure.
 #[derive(Debug, Copy, Clone)]
 pub struct Ray {
     pub orig: Point,
@@ -18,380 +13,85 @@ pub struct Ray {
     pub time: f32,
     pub wavelength: f32,
     pub max_t: f32,
+    pub flags: FlagType,
 }
 
-/// The hot (frequently accessed) parts of ray data.
+/// A specifically local-space ray, for passing to functions when we've
+/// already calculated the local-space version of a ray for the object
+/// in question.
+///
+/// Also includes `dir_inv` (the reciprocal of `dir`), which is generally useful to have.
 #[derive(Debug, Copy, Clone)]
-struct RayHot {
-    orig_local: Point,     // Local-space ray origin
-    dir_inv_local: Vector, // Local-space 1.0/ray direction
-    max_t: f32,
-    time: f32,
-    flags: FlagType,
+pub struct LocalRay {
+    pub orig: Point,
+    pub dir: Vector,
+    pub dir_inv: Vector,
 }
 
-/// The cold (infrequently accessed) parts of ray data.
-#[derive(Debug, Copy, Clone)]
-struct RayCold {
-    orig: Point, // World-space ray origin
-    dir: Vector, // World-space ray direction
-    wavelength: f32,
-}
-
-/// A batch of rays, separated into hot and cold parts.
-#[derive(Debug)]
-pub struct RayBatch {
-    hot: Vec<RayHot>,
-    cold: Vec<RayCold>,
-}
-
-impl RayBatch {
-    /// Creates a new empty ray batch.
-    pub fn new() -> RayBatch {
-        RayBatch {
-            hot: Vec::new(),
-            cold: Vec::new(),
-        }
-    }
-
-    /// Creates a new empty ray batch, with pre-allocated capacity for
-    /// `n` rays.
-    pub fn with_capacity(n: usize) -> RayBatch {
-        RayBatch {
-            hot: Vec::with_capacity(n),
-            cold: Vec::with_capacity(n),
-        }
-    }
-
-    pub fn push(&mut self, ray: Ray, is_occlusion: bool) {
-        self.hot.push(RayHot {
-            orig_local: ray.orig,   // Bogus, to place-hold.
-            dir_inv_local: ray.dir, // Bogus, to place-hold.
-            max_t: ray.max_t,
-            time: ray.time,
+impl Ray {
+    pub fn new(
+        orig: Point,
+        dir: Vector,
+        time: f32,
+        wavelength: f32,
+        max_t: f32,
+        is_occlusion: bool,
+    ) -> Self {
+        Self {
+            orig: orig,
+            dir: dir,
+            time: time,
+            wavelength: wavelength,
+            max_t: max_t,
             flags: if is_occlusion { OCCLUSION_FLAG } else { 0 },
-        });
-        self.cold.push(RayCold {
-            orig: ray.orig,
-            dir: ray.dir,
-            wavelength: ray.wavelength,
-        });
+        }
     }
 
-    pub fn swap(&mut self, a: usize, b: usize) {
-        self.hot.swap(a, b);
-        self.cold.swap(a, b);
+    /// Creates a local-space ray by applying the inverse of `xform` to this ray.
+    pub fn to_local_xform(&self, xform: &XformFull) -> LocalRay {
+        let orig = self.orig.xform_inv(xform);
+        let dir = self.dir.xform_inv(xform);
+
+        LocalRay {
+            orig: orig,
+            dir: dir,
+            dir_inv: dir.recip(),
+        }
     }
 
-    pub fn set_from_ray(&mut self, ray: &Ray, is_occlusion: bool, idx: usize) {
-        self.hot[idx].orig_local = ray.orig;
-        self.hot[idx].dir_inv_local = Vector(ray.dir.0.recip());
-        self.hot[idx].max_t = ray.max_t;
-        self.hot[idx].time = ray.time;
-        self.hot[idx].flags = if is_occlusion { OCCLUSION_FLAG } else { 0 };
-
-        self.cold[idx].orig = ray.orig;
-        self.cold[idx].dir = ray.dir;
-        self.cold[idx].wavelength = ray.wavelength;
+    /// Creates a local ray with no transform applied.
+    pub fn to_local(&self) -> LocalRay {
+        LocalRay {
+            orig: self.orig,
+            dir: self.dir,
+            dir_inv: self.dir.recip(),
+        }
     }
 
-    pub fn truncate(&mut self, len: usize) {
-        self.hot.truncate(len);
-        self.cold.truncate(len);
-    }
-
-    /// Clear all rays, settings the size of the batch back to zero.
-    ///
-    /// Capacity is maintained.
-    pub fn clear(&mut self) {
-        self.hot.clear();
-        self.cold.clear();
-    }
-
-    pub fn len(&self) -> usize {
-        self.hot.len()
-    }
-
-    /// Updates the accel data of the given ray (at index `idx`) with the
-    /// given transform.
-    ///
-    /// This should be called when entering (and exiting) traversal of a
-    /// new transform space.
-    pub fn update_local(&mut self, idx: usize, xform: &XformFull) {
-        self.hot[idx].orig_local = self.cold[idx].orig.xform_inv(xform);
-        self.hot[idx].dir_inv_local = Vector((self.cold[idx].dir.xform_inv(xform)).0.recip());
-    }
-
-    //==========================================================
-    // Data access
+    //---------------------------------------------------------
+    // Flags.
 
+    /// Returns whether this is an occlusion ray.
     #[inline(always)]
-    pub fn orig(&self, idx: usize) -> Point {
-        self.cold[idx].orig
+    pub fn is_occlusion(&self) -> bool {
+        (self.flags & OCCLUSION_FLAG) != 0
     }
 
+    /// Returns whether this ray has finished traversal.
     #[inline(always)]
-    pub fn dir(&self, idx: usize) -> Vector {
-        self.cold[idx].dir
+    pub fn is_done(&self) -> bool {
+        (self.flags & DONE_FLAG) != 0
     }
 
+    /// Marks this as an occlusion ray.
     #[inline(always)]
-    pub fn orig_local(&self, idx: usize) -> Point {
-        self.hot[idx].orig_local
+    pub fn mark_occlusion(&mut self) {
+        self.flags |= OCCLUSION_FLAG
     }
 
+    /// Marks this as having finished traversal.
     #[inline(always)]
-    pub fn dir_inv_local(&self, idx: usize) -> Vector {
-        self.hot[idx].dir_inv_local
-    }
-
-    #[inline(always)]
-    pub fn time(&self, idx: usize) -> f32 {
-        self.hot[idx].time
-    }
-
-    #[inline(always)]
-    pub fn max_t(&self, idx: usize) -> f32 {
-        self.hot[idx].max_t
-    }
-
-    #[inline(always)]
-    pub fn set_max_t(&mut self, idx: usize, new_max_t: f32) {
-        self.hot[idx].max_t = new_max_t;
-    }
-
-    #[inline(always)]
-    pub fn wavelength(&self, idx: usize) -> f32 {
-        self.cold[idx].wavelength
-    }
-
-    /// Returns whether the given ray (at index `idx`) is an occlusion ray.
-    #[inline(always)]
-    pub fn is_occlusion(&self, idx: usize) -> bool {
-        (self.hot[idx].flags & OCCLUSION_FLAG) != 0
-    }
-
-    /// Returns whether the given ray (at index `idx`) has finished traversal.
-    #[inline(always)]
-    pub fn is_done(&self, idx: usize) -> bool {
-        (self.hot[idx].flags & DONE_FLAG) != 0
-    }
-
-    /// Marks the given ray (at index `idx`) as an occlusion ray.
-    #[inline(always)]
-    pub fn mark_occlusion(&mut self, idx: usize) {
-        self.hot[idx].flags |= OCCLUSION_FLAG
-    }
-
-    /// Marks the given ray (at index `idx`) as having finished traversal.
-    #[inline(always)]
-    pub fn mark_done(&mut self, idx: usize) {
-        self.hot[idx].flags |= DONE_FLAG
+    pub fn mark_done(&mut self) {
+        self.flags |= DONE_FLAG
     }
 }
-
-/// A structure used for tracking traversal of a ray batch through a scene.
-#[derive(Debug)]
-pub struct RayStack {
-    lanes: Vec<Lane>,
-    tasks: Vec<RayTask>,
-}
-
-impl RayStack {
-    pub fn new() -> RayStack {
-        RayStack {
-            lanes: Vec::new(),
-            tasks: Vec::new(),
-        }
-    }
-
-    /// Returns whether the stack is empty of tasks or not.
-    pub fn is_empty(&self) -> bool {
-        self.tasks.is_empty()
-    }
-
-    /// Makes sure there are at least `count` lanes.
-    pub fn ensure_lane_count(&mut self, count: usize) {
-        while self.lanes.len() < count {
-            self.lanes.push(Lane {
-                idxs: Vec::new(),
-                end_len: 0,
-            })
-        }
-    }
-
-    pub fn ray_count_in_next_task(&self) -> usize {
-        let task = self.tasks.last().unwrap();
-        let end = self.lanes[task.lane].end_len;
-        end - task.start_idx
-    }
-
-    pub fn next_task_ray_idx(&self, i: usize) -> usize {
-        let task = self.tasks.last().unwrap();
-        let i = i + task.start_idx;
-        debug_assert!(i < self.lanes[task.lane].end_len);
-        self.lanes[task.lane].idxs[i] as usize
-    }
-
-    /// Clears the lanes and tasks of the RayStack.
-    ///
-    /// Note: this is (importantly) different than calling clear individually
-    /// on the `lanes` and `tasks` members.  Specifically, we don't want to
-    /// clear `lanes` itself, as that would also free all the memory of the
-    /// individual lanes.  Instead, we want to iterate over the individual
-    /// lanes and clear them, but leave `lanes` itself untouched.
-    pub fn clear(&mut self) {
-        for lane in self.lanes.iter_mut() {
-            lane.idxs.clear();
-            lane.end_len = 0;
-        }
-
-        self.tasks.clear();
-    }
-
-    /// Pushes the given ray index onto the end of the specified lane.
-    pub fn push_ray_index(&mut self, ray_idx: usize, lane: usize) {
-        assert!(self.lanes.len() > lane);
-        self.lanes[lane].idxs.push(ray_idx as RayIndexType);
-    }
-
-    /// Pushes any excess indices on the given lane to a new task on the
-    /// task stack.
-    ///
-    /// Returns whether a task was pushed or not.  No task will be pushed
-    /// if there are no excess indices on the end of the lane.
-    pub fn push_lane_to_task(&mut self, lane_idx: usize) -> bool {
-        if self.lanes[lane_idx].end_len < self.lanes[lane_idx].idxs.len() {
-            self.tasks.push(RayTask {
-                lane: lane_idx,
-                start_idx: self.lanes[lane_idx].end_len,
-            });
-            self.lanes[lane_idx].end_len = self.lanes[lane_idx].idxs.len();
-            true
-        } else {
-            false
-        }
-    }
-
-    /// Takes the given list of lane indices, and pushes any excess indices on
-    /// the end of each into a new task, in the order provided.
-    pub fn push_lanes_to_tasks(&mut self, lane_idxs: &[usize]) {
-        for &l in lane_idxs {
-            self.push_lane_to_task(l);
-        }
-    }
-
-    pub fn duplicate_next_task(&mut self) {
-        let task = self.tasks.last().unwrap();
-        let l = task.lane;
-        let start = task.start_idx;
-        let end = self.lanes[l].end_len;
-
-        // Extend the indices vector
-        self.lanes[l].idxs.reserve(end - start);
-        let old_len = self.lanes[l].idxs.len();
-        let new_len = old_len + end - start;
-        unsafe {
-            self.lanes[l].idxs.set_len(new_len);
-        }
-
-        // Copy elements
-        copy_in_place::copy_in_place(&mut self.lanes[l].idxs, start..end, end);
-
-        // Push the new task onto the stack
-        self.tasks.push(RayTask {
-            lane: l,
-            start_idx: end,
-        });
-
-        self.lanes[l].end_len = self.lanes[l].idxs.len();
-    }
-
-    // Pops the next task off the stack.
-    pub fn pop_task(&mut self) {
-        let task = self.tasks.pop().unwrap();
-        self.lanes[task.lane].end_len = task.start_idx;
-        self.lanes[task.lane].idxs.truncate(task.start_idx);
-    }
-
-    // Executes a task without popping it from the task stack.
-    pub fn do_next_task<F>(&mut self, mut handle_ray: F)
-    where
-        F: FnMut(usize),
-    {
-        let task = self.tasks.last().unwrap();
-        let task_range = (task.start_idx, self.lanes[task.lane].end_len);
-
-        // Execute task.
-        for i in task_range.0..task_range.1 {
-            let ray_idx = self.lanes[task.lane].idxs[i];
-            handle_ray(ray_idx as usize);
-        }
-    }
-
-    /// Pops the next task off the stack, and executes the provided closure for
-    /// each ray index in the task.
-    #[inline(always)]
-    pub fn pop_do_next_task<F>(&mut self, handle_ray: F)
-    where
-        F: FnMut(usize),
-    {
-        self.do_next_task(handle_ray);
-        self.pop_task();
-    }
-
-    /// Pops the next task off the stack, executes the provided closure for
-    /// each ray index in the task, and pushes the ray indices back onto the
-    /// indicated lanes.
-    pub fn pop_do_next_task_and_push_rays<F>(&mut self, output_lane_count: usize, mut handle_ray: F)
-    where
-        F: FnMut(usize) -> Bool4,
-    {
-        // Pop the task and do necessary bookkeeping.
-        let task = self.tasks.pop().unwrap();
-        let task_range = (task.start_idx, self.lanes[task.lane].end_len);
-        self.lanes[task.lane].end_len = task.start_idx;
-
-        // SAFETY: this is probably evil, and depends on behavior of Vec that
-        // are not actually promised.  But we're essentially truncating the lane
-        // to the start of our task range, but will continue to access it's
-        // elements beyond that range via `get_unchecked()` below.  Because the
-        // memory is not freed nor altered, this is safe.  However, again, the
-        // Vec apis don't promise this behavior.  So:
-        //
-        // TODO: build a slightly different lane abstraction to get this same
-        // efficiency without depending on implicit Vec behavior.
-        unsafe {
-            self.lanes[task.lane].idxs.set_len(task.start_idx);
-        }
-
-        // Execute task.
-        for i in task_range.0..task_range.1 {
-            let ray_idx = *unsafe { self.lanes[task.lane].idxs.get_unchecked(i) };
-            let push_mask = handle_ray(ray_idx as usize).bitmask();
-            for l in 0..output_lane_count {
-                if (push_mask & (1 << l)) != 0 {
-                    self.lanes[l as usize].idxs.push(ray_idx);
-                }
-            }
-        }
-    }
-}
-
-/// A lane within a RayStack.
-#[derive(Debug)]
-struct Lane {
-    idxs: Vec<RayIndexType>,
-    end_len: usize,
-}
-
-/// A task within a RayStack.
-//
-// Specifies the lane that the relevant ray pointers are in, and the
-// starting index within that lane.  The relevant pointers are always
-// `&[start_idx..]` within the given lane.
-#[derive(Debug)]
-struct RayTask {
-    lane: usize,
-    start_idx: usize,
-}
diff --git a/src/renderer.rs b/src/renderer.rs
index d2d9313..6363b45 100644
--- a/src/renderer.rs
+++ b/src/renderer.rs
@@ -14,16 +14,15 @@ use crate::{
     color::{map_0_1_to_wavelength, SpectralSample, XYZ},
     fp_utils::robust_ray_origin,
     image::Image,
-    math::{probit, upper_power_of_two, Float4},
+    math::{probit, upper_power_of_two, Float4, XformFull},
     mis::power_heuristic,
-    ray::{Ray, RayBatch},
+    ray::Ray,
     scene::{Scene, SceneLightSample},
     scramble::owen4,
     space_fill::{hilbert, morton},
     surface,
     timer::Timer,
     tracer::Tracer,
-    transform_stack::TransformStack,
 };
 
 #[derive(Debug)]
@@ -203,10 +202,7 @@ impl<'a> Renderer<'a> {
         let mut timer = Timer::new();
         let mut total_timer = Timer::new();
 
-        let mut paths = Vec::new();
-        let mut rays = RayBatch::new();
         let mut tracer = Tracer::from_assembly(&self.scene.root);
-        let mut xform_stack = TransformStack::new();
 
         // Pre-calculate some useful values related to the image plane
         let cmpx = 1.0 / self.resolution.0 as f32;
@@ -220,9 +216,6 @@ impl<'a> Renderer<'a> {
 
         // Render
         'render_loop: loop {
-            paths.clear();
-            rays.clear();
-
             // Get bucket, or exit if no more jobs left
             let bucket: BucketJob;
             loop {
@@ -235,9 +228,14 @@ impl<'a> Renderer<'a> {
             }
 
             timer.tick();
-            // Generate light paths and initial rays
-            for y in bucket.y..(bucket.y + bucket.h) {
-                for x in bucket.x..(bucket.x + bucket.w) {
+
+            let bucket_min = (bucket.x, bucket.y);
+            let bucket_max = (bucket.x + bucket.w, bucket.y + bucket.h);
+            let mut img_bucket = image.get_bucket(bucket_min, bucket_max);
+
+            // Trace each sample in each pixel.
+            for y in bucket_min.1..bucket_max.1 {
+                for x in bucket_min.0..bucket_max.0 {
                     // `si_offset` is for screen-space blue-noise sampling in the
                     // spirit of the paper "Screen-Space Blue-Noise Diffusion of Monte
                     // Carlo Sampling Error via Hierarchical Ordering of Pixels" by
@@ -266,8 +264,8 @@ impl<'a> Renderer<'a> {
                             ((samp_x - 0.5) * x_extent, (0.5 - samp_y) * y_extent)
                         };
 
-                        // Create the light path and initial ray for this sample
-                        let (path, ray) = LightPath::new(
+                        // Create the light path and initial ray for this sample.
+                        let (mut path, mut ray) = LightPath::new(
                             &self.scene,
                             self.seed,
                             (x, y),
@@ -277,83 +275,66 @@ impl<'a> Renderer<'a> {
                             map_0_1_to_wavelength(d0),
                             si as u32,
                         );
-                        paths.push(path);
-                        rays.push(ray, false);
+
+                        let mut isect = surface::SurfaceIntersection::Miss;
+
+                        // Trace light path.
+                        while path.next(&self.scene, &isect, &mut ray) {
+                            isect = surface::SurfaceIntersection::Miss;
+                            tracer.trace(&mut ray, &mut isect);
+                        }
+
+                        // Accumulate light path color to pixel.
+                        let path_col = SpectralSample::from_parts(path.color, path.wavelength);
+                        let mut col = img_bucket.get(x, y);
+                        col += XYZ::from_spectral_sample(&path_col) / self.spp as f32;
+                        img_bucket.set(x, y, col);
                     }
                 }
             }
-            stats.initial_ray_generation_time += timer.tick() as f64;
+            // stats.initial_ray_generation_time += timer.tick() as f64;
+            // stats.ray_generation_time += timer.tick() as f64;
+            // stats.trace_time += timer.tick() as f64;
+            // stats.sample_writing_time += timer.tick() as f64;
 
-            // Trace the paths!
-            let mut pi = paths.len();
-            while pi > 0 {
-                // Test rays against scene
-                let isects = tracer.trace(&mut rays);
-                stats.trace_time += timer.tick() as f64;
+            // Pre-calculate base64 encoding if needed
+            let base64_enc = if do_blender_output {
+                use crate::color::xyz_to_rec709_e;
+                Some(img_bucket.rgba_base64(xyz_to_rec709_e))
+            } else {
+                None
+            };
 
-                // Determine next rays to shoot based on result
-                let mut new_end = 0;
-                for i in 0..pi {
-                    if paths[i].next(&mut xform_stack, &self.scene, &isects[i], &mut rays, i) {
-                        paths.swap(new_end, i);
-                        rays.swap(new_end, i);
-                        new_end += 1;
-                    }
+            // Print render progress, and image data if doing blender output
+            let guard = pixels_rendered.lock().unwrap();
+            let mut pr = (*guard).get();
+            let percentage_old = pr as f64 / total_pixels as f64 * 100.0;
+
+            pr += bucket.w as usize * bucket.h as usize;
+            (*guard).set(pr);
+            let percentage_new = pr as f64 / total_pixels as f64 * 100.0;
+
+            let old_string = format!("{:.2}%", percentage_old);
+            let new_string = format!("{:.2}%", percentage_new);
+
+            if let Some(bucket_data) = base64_enc {
+                // If doing Blender output
+                println!("DIV");
+                println!("{}", new_string);
+                println!(
+                    "{} {} {} {}",
+                    bucket_min.0, bucket_min.1, bucket_max.0, bucket_max.1
+                );
+                println!("{}", bucket_data);
+                println!("BUCKET_END");
+                println!("DIV");
+            } else {
+                // If doing console output
+                if new_string != old_string {
+                    print!("\r{}", new_string);
                 }
-                rays.truncate(new_end);
-                pi = new_end;
-                stats.ray_generation_time += timer.tick() as f64;
-            }
-
-            {
-                // Calculate color based on ray hits and save to image
-                let min = (bucket.x, bucket.y);
-                let max = (bucket.x + bucket.w, bucket.y + bucket.h);
-                let mut img_bucket = image.get_bucket(min, max);
-                for path in &paths {
-                    let path_col = SpectralSample::from_parts(path.color, path.wavelength);
-                    let mut col = img_bucket.get(path.pixel_co.0, path.pixel_co.1);
-                    col += XYZ::from_spectral_sample(&path_col) / self.spp as f32;
-                    img_bucket.set(path.pixel_co.0, path.pixel_co.1, col);
-                }
-                stats.sample_writing_time += timer.tick() as f64;
-
-                // Pre-calculate base64 encoding if needed
-                let base64_enc = if do_blender_output {
-                    use crate::color::xyz_to_rec709_e;
-                    Some(img_bucket.rgba_base64(xyz_to_rec709_e))
-                } else {
-                    None
-                };
-
-                // Print render progress, and image data if doing blender output
-                let guard = pixels_rendered.lock().unwrap();
-                let mut pr = (*guard).get();
-                let percentage_old = pr as f64 / total_pixels as f64 * 100.0;
-
-                pr += bucket.w as usize * bucket.h as usize;
-                (*guard).set(pr);
-                let percentage_new = pr as f64 / total_pixels as f64 * 100.0;
-
-                let old_string = format!("{:.2}%", percentage_old);
-                let new_string = format!("{:.2}%", percentage_new);
-
-                if let Some(bucket_data) = base64_enc {
-                    // If doing Blender output
-                    println!("DIV");
-                    println!("{}", new_string);
-                    println!("{} {} {} {}", min.0, min.1, max.0, max.1);
-                    println!("{}", bucket_data);
-                    println!("BUCKET_END");
-                    println!("DIV");
-                } else {
-                    // If doing console output
-                    if new_string != old_string {
-                        print!("\r{}", new_string);
-                    }
-                }
-                let _ = io::stdout().flush();
             }
+            let _ = io::stdout().flush();
         }
 
         stats.total_time += total_timer.tick() as f64;
@@ -450,14 +431,7 @@ impl LightPath {
         get_sample_4d(self.sample_number, dimension, self.sampling_seed)
     }
 
-    fn next(
-        &mut self,
-        xform_stack: &mut TransformStack,
-        scene: &Scene,
-        isect: &surface::SurfaceIntersection,
-        rays: &mut RayBatch,
-        ray_idx: usize,
-    ) -> bool {
+    fn next(&mut self, scene: &Scene, isect: &surface::SurfaceIntersection, ray: &mut Ray) -> bool {
         match self.event {
             //--------------------------------------------------------------------
             // Result of Camera or bounce ray, prepare next bounce and light rays
@@ -493,13 +467,12 @@ impl LightPath {
                     self.next_lds_sequence();
                     let (light_n, d2, d3, d4) = self.next_lds_samp();
                     let light_uvw = (d2, d3, d4);
-                    xform_stack.clear();
                     let light_info = scene.sample_lights(
-                        xform_stack,
                         light_n,
                         light_uvw,
                         self.wavelength,
                         self.time,
+                        &XformFull::identity(),
                         isect,
                     );
                     let found_light = if light_info.is_none()
@@ -518,7 +491,7 @@ impl LightPath {
                             // Distant light
                             SceneLightSample::Distant { direction, .. } => {
                                 let (attenuation, closure_pdf) = closure.evaluate(
-                                    rays.dir(ray_idx),
+                                    ray.dir,
                                     direction,
                                     idata.nor,
                                     idata.nor_g,
@@ -533,13 +506,14 @@ impl LightPath {
                                         idata.nor_g.normalized(),
                                         direction,
                                     );
-                                    Ray {
-                                        orig: offset_pos,
-                                        dir: direction,
-                                        time: self.time,
-                                        wavelength: self.wavelength,
-                                        max_t: std::f32::INFINITY,
-                                    }
+                                    Ray::new(
+                                        offset_pos,
+                                        direction,
+                                        self.time,
+                                        self.wavelength,
+                                        std::f32::INFINITY,
+                                        true,
+                                    )
                                 };
                                 (attenuation, closure_pdf, shadow_ray)
                             }
@@ -548,7 +522,7 @@ impl LightPath {
                             SceneLightSample::Surface { sample_geo, .. } => {
                                 let dir = sample_geo.0 - idata.pos;
                                 let (attenuation, closure_pdf) = closure.evaluate(
-                                    rays.dir(ray_idx),
+                                    ray.dir,
                                     dir,
                                     idata.nor,
                                     idata.nor_g,
@@ -569,13 +543,14 @@ impl LightPath {
                                         sample_geo.1.normalized(),
                                         -dir,
                                     );
-                                    Ray {
-                                        orig: offset_pos,
-                                        dir: offset_end - offset_pos,
-                                        time: self.time,
-                                        wavelength: self.wavelength,
-                                        max_t: 1.0,
-                                    }
+                                    Ray::new(
+                                        offset_pos,
+                                        offset_end - offset_pos,
+                                        self.time,
+                                        self.wavelength,
+                                        1.0,
+                                        true,
+                                    )
                                 };
                                 (attenuation, closure_pdf, shadow_ray)
                             }
@@ -593,7 +568,7 @@ impl LightPath {
                                 light_info.color().e * attenuation.e * self.light_attenuation
                                     / (light_mis_pdf * light_sel_pdf);
 
-                            rays.set_from_ray(&shadow_ray, true, ray_idx);
+                            *ray = shadow_ray;
 
                             true
                         }
@@ -630,13 +605,14 @@ impl LightPath {
                                 idata.nor_g.normalized(),
                                 dir,
                             );
-                            self.next_bounce_ray = Some(Ray {
-                                orig: offset_pos,
-                                dir: dir,
-                                time: self.time,
-                                wavelength: self.wavelength,
-                                max_t: std::f32::INFINITY,
-                            });
+                            self.next_bounce_ray = Some(Ray::new(
+                                offset_pos,
+                                dir,
+                                self.time,
+                                self.wavelength,
+                                std::f32::INFINITY,
+                                false,
+                            ));
 
                             true
                         } else {
@@ -652,7 +628,7 @@ impl LightPath {
                         self.event = LightPathEvent::ShadowRay;
                         return true;
                     } else if do_bounce {
-                        rays.set_from_ray(&self.next_bounce_ray.unwrap(), false, ray_idx);
+                        *ray = self.next_bounce_ray.unwrap();
                         self.event = LightPathEvent::BounceRay;
                         self.light_attenuation *= self.next_attenuation_fac;
                         return true;
@@ -683,7 +659,7 @@ impl LightPath {
 
                 // Set up for the next bounce, if any
                 if let Some(ref nbr) = self.next_bounce_ray {
-                    rays.set_from_ray(nbr, false, ray_idx);
+                    *ray = *nbr;
                     self.light_attenuation *= self.next_attenuation_fac;
                     self.event = LightPathEvent::BounceRay;
                     return true;
diff --git a/src/scene/assembly.rs b/src/scene/assembly.rs
index e6df886..9a0470b 100644
--- a/src/scene/assembly.rs
+++ b/src/scene/assembly.rs
@@ -13,7 +13,6 @@ use crate::{
     math::{Normal, Point, Xform, XformFull},
     shading::SurfaceShader,
     surface::{Surface, SurfaceIntersection},
-    transform_stack::TransformStack,
 };
 
 #[derive(Copy, Clone, Debug)]
@@ -45,11 +44,11 @@ impl<'a> Assembly<'a> {
     // Returns (light_color, (sample_point, normal, point_err), pdf, selection_pdf)
     pub fn sample_lights(
         &self,
-        xform_stack: &mut TransformStack,
         n: f32,
         uvw: (f32, f32, f32),
         wavelength: f32,
         time: f32,
+        space: &XformFull,
         intr: &SurfaceIntersection,
     ) -> Option<(SpectralSample, (Point, Normal, f32), f32, f32)> {
         if let SurfaceIntersection::Hit {
@@ -57,58 +56,46 @@ impl<'a> Assembly<'a> {
             closure,
         } = *intr
         {
-            let sel_xform = if !xform_stack.top().is_empty() {
-                if let Some(xform) = lerp_slice(xform_stack.top(), time).to_full() {
-                    xform
-                } else {
-                    return None;
-                }
-            } else {
-                XformFull::identity()
-            };
-
             if let Some((light_i, sel_pdf, whittled_n)) = self.light_accel.select(
-                idata.incoming.xform_inv(&sel_xform),
-                idata.pos.xform_inv(&sel_xform),
-                idata.nor.xform_inv_fast(&sel_xform),
-                idata.nor_g.xform_inv_fast(&sel_xform),
+                idata.incoming.xform_inv(space),
+                idata.pos.xform_inv(space),
+                idata.nor.xform_inv_fast(space),
+                idata.nor_g.xform_inv_fast(space),
                 &closure,
                 time,
                 n,
             ) {
                 let inst = self.light_instances[light_i];
+
+                // Compose the light instance's transform (if any) with the parent space.
+                let local_space = if let Some((a, b)) = inst.transform_indices {
+                    if let Some(new_space) = lerp_slice(&self.xforms[a..b], time)
+                        .compose(&space.fwd)
+                        .to_full()
+                    {
+                        new_space
+                    } else {
+                        // Invalid transform.  Give up.
+                        return None;
+                    }
+                } else {
+                    *space
+                };
+
                 match inst.instance_type {
                     InstanceType::Object => {
                         match self.objects[inst.data_index] {
                             Object::SurfaceLight(light) => {
-                                // Get the transform of the light.
-                                let xform = if let Some((a, b)) = inst.transform_indices {
-                                    let pxforms = xform_stack.top();
-                                    let xform = lerp_slice(&self.xforms[a..b], time);
-                                    if !pxforms.is_empty() {
-                                        xform.compose(&lerp_slice(pxforms, time))
-                                    } else {
-                                        xform
-                                    }
-                                } else {
-                                    let pxforms = xform_stack.top();
-                                    if !pxforms.is_empty() {
-                                        lerp_slice(pxforms, time)
-                                    } else {
-                                        Xform::identity()
-                                    }
-                                }
-                                .to_full();
-
                                 // Sample the light
-                                if let Some(xform) = xform {
-                                    let (color, sample_geo, pdf) = light.sample_from_point(
-                                        &xform, idata.pos, uvw.0, uvw.1, wavelength, time,
-                                    );
-                                    return Some((color, sample_geo, pdf, sel_pdf));
-                                } else {
-                                    return None;
-                                }
+                                let (color, sample_geo, pdf) = light.sample_from_point(
+                                    &local_space,
+                                    idata.pos,
+                                    uvw.0,
+                                    uvw.1,
+                                    wavelength,
+                                    time,
+                                );
+                                return Some((color, sample_geo, pdf, sel_pdf));
                             }
 
                             _ => unimplemented!(),
@@ -116,27 +103,16 @@ impl<'a> Assembly<'a> {
                     }
 
                     InstanceType::Assembly => {
-                        // Push the transform of the assembly onto
-                        // the transform stack.
-                        if let Some((a, b)) = inst.transform_indices {
-                            xform_stack.push(&self.xforms[a..b]);
-                        }
-
                         // Sample sub-assembly lights
                         let sample = self.assemblies[inst.data_index].sample_lights(
-                            xform_stack,
                             whittled_n,
                             uvw,
                             wavelength,
                             time,
+                            &local_space,
                             intr,
                         );
 
-                        // Pop the assembly's transforms off the transform stack.
-                        if inst.transform_indices.is_some() {
-                            xform_stack.pop();
-                        }
-
                         // Return sample
                         return sample.map(|(ss, v, pdf, spdf)| (ss, v, pdf, spdf * sel_pdf));
                     }
diff --git a/src/scene/mod.rs b/src/scene/mod.rs
index 3861c76..3e3f872 100644
--- a/src/scene/mod.rs
+++ b/src/scene/mod.rs
@@ -6,9 +6,8 @@ use crate::{
     algorithm::weighted_choice,
     camera::Camera,
     color::SpectralSample,
-    math::{Normal, Point, Vector},
+    math::{Normal, Point, Vector, XformFull},
     surface::SurfaceIntersection,
-    transform_stack::TransformStack,
 };
 
 pub use self::{
@@ -27,11 +26,11 @@ pub struct Scene<'a> {
 impl<'a> Scene<'a> {
     pub fn sample_lights(
         &self,
-        xform_stack: &mut TransformStack,
         n: f32,
         uvw: (f32, f32, f32),
         wavelength: f32,
         time: f32,
+        space: &XformFull,
         intr: &SurfaceIntersection,
     ) -> SceneLightSample {
         // TODO: this just selects between world lights and local lights
@@ -81,9 +80,9 @@ impl<'a> Scene<'a> {
                 // Local lights
                 let n = (n - wl_prob) / (1.0 - wl_prob);
 
-                if let Some((ss, sgeo, pdf, spdf)) =
-                    self.root
-                        .sample_lights(xform_stack, n, uvw, wavelength, time, intr)
+                if let Some((ss, sgeo, pdf, spdf)) = self
+                    .root
+                    .sample_lights(n, uvw, wavelength, time, space, intr)
                 {
                     return SceneLightSample::Surface {
                         color: ss,
diff --git a/src/surface/mod.rs b/src/surface/mod.rs
index 6b20132..546238a 100644
--- a/src/surface/mod.rs
+++ b/src/surface/mod.rs
@@ -2,7 +2,6 @@
 
 // pub mod micropoly_batch;
 pub mod bilinear_patch;
-pub mod micropoly_batch;
 pub mod triangle;
 pub mod triangle_mesh;
 
@@ -10,8 +9,8 @@ use std::fmt::Debug;
 
 use crate::{
     boundable::Boundable,
-    math::{Normal, Point, Vector, Xform, XformFull},
-    ray::{RayBatch, RayStack},
+    math::{Normal, Point, Vector, XformFull},
+    ray::{LocalRay, Ray},
     shading::surface_closure::SurfaceClosure,
     shading::SurfaceShader,
 };
@@ -19,13 +18,13 @@ use crate::{
 const MAX_EDGE_DICE: u32 = 128;
 
 pub trait Surface: Boundable + Debug + Sync {
-    fn intersect_rays(
+    fn intersect_ray(
         &self,
-        rays: &mut RayBatch,
-        ray_stack: &mut RayStack,
-        isects: &mut [SurfaceIntersection],
+        ray: &mut Ray,
+        local_ray: &LocalRay,
+        space: &XformFull,
+        isect: &mut SurfaceIntersection,
         shader: &dyn SurfaceShader,
-        space: &[Xform],
     );
 }
 
diff --git a/src/surface/triangle_mesh.rs b/src/surface/triangle_mesh.rs
index 6e3546a..1faaa02 100644
--- a/src/surface/triangle_mesh.rs
+++ b/src/surface/triangle_mesh.rs
@@ -7,8 +7,8 @@ use crate::{
     bbox::BBox,
     boundable::Boundable,
     lerp::lerp_slice,
-    math::{cross, dot, Normal, Point, Xform},
-    ray::{RayBatch, RayStack},
+    math::{cross, dot, Normal, Point, XformFull},
+    ray::{LocalRay, Ray},
     shading::SurfaceShader,
 };
 
@@ -122,209 +122,125 @@ impl<'a> Boundable for TriangleMesh<'a> {
 }
 
 impl<'a> Surface for TriangleMesh<'a> {
-    fn intersect_rays(
+    fn intersect_ray(
         &self,
-        rays: &mut RayBatch,
-        ray_stack: &mut RayStack,
-        isects: &mut [SurfaceIntersection],
+        ray: &mut Ray,
+        local_ray: &LocalRay,
+        space: &XformFull,
+        isect: &mut SurfaceIntersection,
         shader: &dyn SurfaceShader,
-        space: &[Xform],
     ) {
-        // Precalculate transform for non-motion blur cases
-        let static_mat_space = if space.len() == 1 {
-            space[0]
-        } else {
-            Xform::identity()
-        };
+        self.accel.traverse(ray, local_ray, |idx_range, ray| {
+            // Iterate through the triangles and test the ray against them.
+            let mut non_shadow_hit = false;
+            let mut hit_tri = std::mem::MaybeUninit::uninit();
+            let mut hit_tri_indices = std::mem::MaybeUninit::uninit();
+            let mut hit_tri_data = std::mem::MaybeUninit::uninit();
+            let ray_pre = triangle::RayTriPrecompute::new(ray.dir);
+            for tri_idx in idx_range.clone() {
+                let tri_indices = self.indices[tri_idx];
 
-        self.accel
-            .traverse(rays, ray_stack, |idx_range, rays, ray_stack| {
-                let tri_count = idx_range.end - idx_range.start;
+                // Get triangle.
+                let mut tri = if self.time_sample_count == 1 {
+                    // No deformation motion blur, so fast-path it.
+                    (
+                        self.vertices[tri_indices.0 as usize],
+                        self.vertices[tri_indices.1 as usize],
+                        self.vertices[tri_indices.2 as usize],
+                    )
+                } else {
+                    // Deformation motion blur, need to interpolate.
+                    let p0_slice = &self.vertices[(tri_indices.0 as usize * self.time_sample_count)
+                        ..((tri_indices.0 as usize + 1) * self.time_sample_count)];
+                    let p1_slice = &self.vertices[(tri_indices.1 as usize * self.time_sample_count)
+                        ..((tri_indices.1 as usize + 1) * self.time_sample_count)];
+                    let p2_slice = &self.vertices[(tri_indices.2 as usize * self.time_sample_count)
+                        ..((tri_indices.2 as usize + 1) * self.time_sample_count)];
 
-                // Build the triangle cache if we can!
-                let is_cached = ray_stack.ray_count_in_next_task() >= tri_count
-                    && self.time_sample_count == 1
-                    && space.len() <= 1;
-                let mut tri_cache = [std::mem::MaybeUninit::uninit(); MAX_LEAF_TRIANGLE_COUNT];
-                if is_cached {
-                    for tri_idx in idx_range.clone() {
-                        let i = tri_idx - idx_range.start;
-                        let tri_indices = self.indices[tri_idx];
+                    let p0 = lerp_slice(p0_slice, ray.time);
+                    let p1 = lerp_slice(p1_slice, ray.time);
+                    let p2 = lerp_slice(p2_slice, ray.time);
 
-                        // For static triangles with static transforms, cache them.
+                    (p0, p1, p2)
+                };
+
+                // Transform triangle into world space.
+                tri.0 = tri.0.xform(space);
+                tri.1 = tri.1.xform(space);
+                tri.2 = tri.2.xform(space);
+
+                // Test ray against triangle
+                if let Some((t, b0, b1, b2)) =
+                    triangle::intersect_ray(ray.orig, ray_pre, ray.max_t, tri)
+                {
+                    if ray.is_occlusion() {
+                        *isect = SurfaceIntersection::Occlude;
+                        ray.mark_done();
+                        break;
+                    } else {
+                        non_shadow_hit = true;
+                        ray.max_t = t;
                         unsafe {
-                            *tri_cache[i].as_mut_ptr() = (
-                                self.vertices[tri_indices.0 as usize],
-                                self.vertices[tri_indices.1 as usize],
-                                self.vertices[tri_indices.2 as usize],
-                            );
-                            if !space.is_empty() {
-                                (*tri_cache[i].as_mut_ptr()).0 =
-                                    (*tri_cache[i].as_mut_ptr()).0.xform(&static_mat_space);
-                                (*tri_cache[i].as_mut_ptr()).1 =
-                                    (*tri_cache[i].as_mut_ptr()).1.xform(&static_mat_space);
-                                (*tri_cache[i].as_mut_ptr()).2 =
-                                    (*tri_cache[i].as_mut_ptr()).2.xform(&static_mat_space);
-                            }
+                            *hit_tri.as_mut_ptr() = tri;
+                            *hit_tri_indices.as_mut_ptr() = tri_indices;
+                            *hit_tri_data.as_mut_ptr() = (t, b0, b1, b2);
                         }
                     }
                 }
+            }
 
-                // Test each ray against the triangles.
-                ray_stack.do_next_task(|ray_idx| {
-                    let ray_idx = ray_idx as usize;
+            // Calculate intersection data if necessary.
+            if non_shadow_hit {
+                let hit_tri = unsafe { hit_tri.assume_init() };
+                let (t, b0, b1, b2) = unsafe { hit_tri_data.assume_init() };
 
-                    if rays.is_done(ray_idx) {
-                        return;
-                    }
+                // Calculate intersection point and error magnitudes
+                let (pos, pos_err) = triangle::surface_point(hit_tri, (b0, b1, b2));
 
-                    let ray_time = rays.time(ray_idx);
+                // Calculate geometric surface normal
+                let geo_normal = cross(hit_tri.0 - hit_tri.1, hit_tri.0 - hit_tri.2).into_normal();
 
-                    // Calculate the ray space, if necessary.
-                    let mat_space = if space.len() > 1 {
-                        // Per-ray transform, for motion blur
-                        lerp_slice(space, ray_time)
+                // Calculate interpolated surface normal, if any
+                let shading_normal = if let Some(normals) = self.normals {
+                    let hit_tri_indices = unsafe { hit_tri_indices.assume_init() };
+                    let n0_slice = &normals[(hit_tri_indices.0 as usize * self.time_sample_count)
+                        ..((hit_tri_indices.0 as usize + 1) * self.time_sample_count)];
+                    let n1_slice = &normals[(hit_tri_indices.1 as usize * self.time_sample_count)
+                        ..((hit_tri_indices.1 as usize + 1) * self.time_sample_count)];
+                    let n2_slice = &normals[(hit_tri_indices.2 as usize * self.time_sample_count)
+                        ..((hit_tri_indices.2 as usize + 1) * self.time_sample_count)];
+
+                    let n0 = lerp_slice(n0_slice, ray.time).normalized();
+                    let n1 = lerp_slice(n1_slice, ray.time).normalized();
+                    let n2 = lerp_slice(n2_slice, ray.time).normalized();
+
+                    let s_nor = ((n0 * b0) + (n1 * b1) + (n2 * b2)).xform_fast(&space);
+                    if dot(s_nor, geo_normal) >= 0.0 {
+                        s_nor
                     } else {
-                        static_mat_space
-                    };
-
-                    // Iterate through the triangles and test the ray against them.
-                    let mut non_shadow_hit = false;
-                    let mut hit_tri = std::mem::MaybeUninit::uninit();
-                    let mut hit_tri_indices = std::mem::MaybeUninit::uninit();
-                    let mut hit_tri_data = std::mem::MaybeUninit::uninit();
-                    let ray_pre = triangle::RayTriPrecompute::new(rays.dir(ray_idx));
-                    for tri_idx in idx_range.clone() {
-                        let tri_indices = self.indices[tri_idx];
-
-                        // Get triangle if necessary
-                        let tri = if is_cached {
-                            let i = tri_idx - idx_range.start;
-                            unsafe { tri_cache[i].assume_init() }
-                        } else {
-                            let mut tri = if self.time_sample_count == 1 {
-                                // No deformation motion blur, so fast-path it.
-                                (
-                                    self.vertices[tri_indices.0 as usize],
-                                    self.vertices[tri_indices.1 as usize],
-                                    self.vertices[tri_indices.2 as usize],
-                                )
-                            } else {
-                                // Deformation motion blur, need to interpolate.
-                                let p0_slice = &self.vertices[(tri_indices.0 as usize
-                                    * self.time_sample_count)
-                                    ..((tri_indices.0 as usize + 1) * self.time_sample_count)];
-                                let p1_slice = &self.vertices[(tri_indices.1 as usize
-                                    * self.time_sample_count)
-                                    ..((tri_indices.1 as usize + 1) * self.time_sample_count)];
-                                let p2_slice = &self.vertices[(tri_indices.2 as usize
-                                    * self.time_sample_count)
-                                    ..((tri_indices.2 as usize + 1) * self.time_sample_count)];
-
-                                let p0 = lerp_slice(p0_slice, ray_time);
-                                let p1 = lerp_slice(p1_slice, ray_time);
-                                let p2 = lerp_slice(p2_slice, ray_time);
-
-                                (p0, p1, p2)
-                            };
-
-                            if !space.is_empty() {
-                                tri.0 = tri.0.xform(&mat_space);
-                                tri.1 = tri.1.xform(&mat_space);
-                                tri.2 = tri.2.xform(&mat_space);
-                            }
-
-                            tri
-                        };
-
-                        // Test ray against triangle
-                        if let Some((t, b0, b1, b2)) = triangle::intersect_ray(
-                            rays.orig(ray_idx),
-                            ray_pre,
-                            rays.max_t(ray_idx),
-                            tri,
-                        ) {
-                            if rays.is_occlusion(ray_idx) {
-                                isects[ray_idx] = SurfaceIntersection::Occlude;
-                                rays.mark_done(ray_idx);
-                                break;
-                            } else {
-                                non_shadow_hit = true;
-                                rays.set_max_t(ray_idx, t);
-                                unsafe {
-                                    *hit_tri.as_mut_ptr() = tri;
-                                    *hit_tri_indices.as_mut_ptr() = tri_indices;
-                                    *hit_tri_data.as_mut_ptr() = (t, b0, b1, b2);
-                                }
-                            }
-                        }
+                        -s_nor
                     }
+                } else {
+                    geo_normal
+                };
 
-                    // Calculate intersection data if necessary.
-                    if non_shadow_hit {
-                        // Get the full space data.
-                        let mat_space = if let Some(space) = mat_space.to_full() {
-                            space
-                        } else {
-                            return;
-                        };
+                let intersection_data = SurfaceIntersectionData {
+                    incoming: ray.dir,
+                    t: t,
+                    pos: pos,
+                    pos_err: pos_err,
+                    nor: shading_normal,
+                    nor_g: geo_normal,
+                    local_space: *space,
+                    sample_pdf: 0.0,
+                };
 
-                        let hit_tri = unsafe { hit_tri.assume_init() };
-                        let (t, b0, b1, b2) = unsafe { hit_tri_data.assume_init() };
-
-                        // Calculate intersection point and error magnitudes
-                        let (pos, pos_err) = triangle::surface_point(hit_tri, (b0, b1, b2));
-
-                        // Calculate geometric surface normal
-                        let geo_normal =
-                            cross(hit_tri.0 - hit_tri.1, hit_tri.0 - hit_tri.2).into_normal();
-
-                        // Calculate interpolated surface normal, if any
-                        let shading_normal = if let Some(normals) = self.normals {
-                            let hit_tri_indices = unsafe { hit_tri_indices.assume_init() };
-                            let n0_slice = &normals[(hit_tri_indices.0 as usize
-                                * self.time_sample_count)
-                                ..((hit_tri_indices.0 as usize + 1) * self.time_sample_count)];
-                            let n1_slice = &normals[(hit_tri_indices.1 as usize
-                                * self.time_sample_count)
-                                ..((hit_tri_indices.1 as usize + 1) * self.time_sample_count)];
-                            let n2_slice = &normals[(hit_tri_indices.2 as usize
-                                * self.time_sample_count)
-                                ..((hit_tri_indices.2 as usize + 1) * self.time_sample_count)];
-
-                            let n0 = lerp_slice(n0_slice, ray_time).normalized();
-                            let n1 = lerp_slice(n1_slice, ray_time).normalized();
-                            let n2 = lerp_slice(n2_slice, ray_time).normalized();
-
-                            let s_nor = ((n0 * b0) + (n1 * b1) + (n2 * b2)).xform_fast(&mat_space);
-                            if dot(s_nor, geo_normal) >= 0.0 {
-                                s_nor
-                            } else {
-                                -s_nor
-                            }
-                        } else {
-                            geo_normal
-                        };
-
-                        let intersection_data = SurfaceIntersectionData {
-                            incoming: rays.dir(ray_idx),
-                            t: t,
-                            pos: pos,
-                            pos_err: pos_err,
-                            nor: shading_normal,
-                            nor_g: geo_normal,
-                            local_space: mat_space,
-                            sample_pdf: 0.0,
-                        };
-
-                        // Fill in intersection data
-                        isects[ray_idx] = SurfaceIntersection::Hit {
-                            intersection_data: intersection_data,
-                            closure: shader.shade(&intersection_data, ray_time),
-                        };
-                    }
-                });
-                ray_stack.pop_task();
-            });
+                // Fill in intersection data
+                *isect = SurfaceIntersection::Hit {
+                    intersection_data: intersection_data,
+                    closure: shader.shade(&intersection_data, ray.time),
+                };
+            }
+        });
     }
 }
diff --git a/src/tracer.rs b/src/tracer.rs
index 5a3d834..7bbaba3 100644
--- a/src/tracer.rs
+++ b/src/tracer.rs
@@ -1,198 +1,110 @@
-use std::iter;
-
 use crate::{
-    accel::ray_code,
     color::{rec709_to_xyz, Color},
     lerp::lerp_slice,
     math::XformFull,
-    ray::{RayBatch, RayStack},
+    ray::{LocalRay, Ray},
     scene::{Assembly, InstanceType, Object},
     shading::{SimpleSurfaceShader, SurfaceShader},
     surface::SurfaceIntersection,
-    transform_stack::TransformStack,
 };
 
 pub struct Tracer<'a> {
+    root: &'a Assembly<'a>,
     ray_trace_count: u64,
-    ray_stack: RayStack,
-    inner: TracerInner<'a>,
 }
 
 impl<'a> Tracer<'a> {
     pub fn from_assembly(assembly: &'a Assembly) -> Tracer<'a> {
         Tracer {
+            root: assembly,
             ray_trace_count: 0,
-            ray_stack: RayStack::new(),
-            inner: TracerInner {
-                root: assembly,
-                xform_stack: TransformStack::new(),
-                isects: Vec::new(),
-            },
         }
     }
 
-    pub fn trace<'b>(&'b mut self, rays: &mut RayBatch) -> &'b [SurfaceIntersection] {
-        self.ray_trace_count += rays.len() as u64;
-        self.inner.trace(rays, &mut self.ray_stack)
-    }
-
     pub fn rays_traced(&self) -> u64 {
         self.ray_trace_count
     }
-}
 
-struct TracerInner<'a> {
-    root: &'a Assembly<'a>,
-    xform_stack: TransformStack,
-    isects: Vec<SurfaceIntersection>,
-}
+    pub fn trace(&mut self, ray: &mut Ray, isect: &mut SurfaceIntersection) {
+        self.ray_trace_count += 1;
 
-impl<'a> TracerInner<'a> {
-    fn trace<'b>(
-        &'b mut self,
-        rays: &mut RayBatch,
-        ray_stack: &mut RayStack,
-    ) -> &'b [SurfaceIntersection] {
-        ray_stack.clear();
+        let local_ray = ray.to_local();
+        let space = XformFull::identity();
 
-        // Ready the isects
-        self.isects.clear();
-        self.isects.reserve(rays.len());
-        self.isects
-            .extend(iter::repeat(SurfaceIntersection::Miss).take(rays.len()));
-
-        // Prep the accel part of the rays.
-        {
-            let ident = XformFull::identity();
-            for i in 0..rays.len() {
-                rays.update_local(i, &ident);
-            }
-        }
-
-        // Divide the rays into 8 different lanes by direction.
-        ray_stack.ensure_lane_count(8);
-        for i in 0..rays.len() {
-            ray_stack.push_ray_index(i, ray_code(rays.dir(i)));
-        }
-        ray_stack.push_lanes_to_tasks(&[0, 1, 2, 3, 4, 5, 6, 7]);
-
-        // Trace each of the 8 lanes separately.
-        while !ray_stack.is_empty() {
-            self.trace_assembly(self.root, rays, ray_stack);
-        }
-
-        &self.isects
+        self.trace_assembly(self.root, ray, &local_ray, &space, isect);
     }
 
-    fn trace_assembly<'b>(
-        &'b mut self,
+    fn trace_assembly(
+        &mut self,
         assembly: &Assembly,
-        rays: &mut RayBatch,
-        ray_stack: &mut RayStack,
+        ray: &mut Ray,
+        local_ray: &LocalRay,
+        space: &XformFull,
+        isect: &mut SurfaceIntersection,
     ) {
         assembly
             .object_accel
-            .traverse(rays, ray_stack, |idx_range, rays, ray_stack| {
-                let inst = &assembly.instances[idx_range.start];
+            .traverse(ray, local_ray, |idx_range, ray| {
+                for inst_idx in idx_range {
+                    let inst = &assembly.instances[inst_idx];
 
-                // Transform rays if needed
-                if let Some((xstart, xend)) = inst.transform_indices {
-                    // Push transforms to stack
-                    self.xform_stack.push(&assembly.xforms[xstart..xend]);
+                    // Compose the instance's transform (if any) with the parent space.
+                    let (local_space, local_ray) = if let Some((xstart, xend)) =
+                        inst.transform_indices
+                    {
+                        let instance_xform = lerp_slice(&assembly.xforms[xstart..xend], ray.time);
+                        let combined_xform = instance_xform.compose(&space.fwd);
 
-                    // Do transforms
-                    // TODO: re-divide rays based on direction (maybe?).
-                    let xforms = self.xform_stack.top();
-                    let static_xform = if xforms.len() == 1 {
-                        if let Some(xform) = xforms[0].to_full() {
-                            Some(xform)
+                        if let Some(xform) = combined_xform.to_full() {
+                            (xform, ray.to_local_xform(&xform))
                         } else {
-                            return;
+                            // Invalid transform, so skip traversing into this instance.
+                            continue;
                         }
-                    } else if xforms.len() == 0 {
-                        Some(XformFull::identity())
                     } else {
-                        None
+                        (*space, *local_ray)
                     };
-                    ray_stack.do_next_task(|ray_idx| {
-                        let t = rays.time(ray_idx);
-                        rays.update_local(
-                            ray_idx,
-                            &static_xform.unwrap_or_else(|| {
-                                lerp_slice(xforms, t).to_full().unwrap_or(
-                                    // TODO: filter out ray instead.
-                                    XformFull::identity(),
-                                )
-                            }),
-                        );
-                    });
-                    ray_stack.duplicate_next_task();
-                }
 
-                // Trace rays
-                match inst.instance_type {
-                    InstanceType::Object => {
-                        self.trace_object(
-                            &assembly.objects[inst.data_index],
-                            inst.surface_shader_index
-                                .map(|i| assembly.surface_shaders[i]),
-                            rays,
-                            ray_stack,
-                        );
-                    }
-
-                    InstanceType::Assembly => {
-                        self.trace_assembly(&assembly.assemblies[inst.data_index], rays, ray_stack);
-                    }
-                }
-
-                // Un-transform rays if needed
-                if inst.transform_indices.is_some() {
-                    // Pop transforms off stack
-                    self.xform_stack.pop();
-
-                    // Undo transforms
-                    let xforms = self.xform_stack.top();
-                    let static_xform = if xforms.len() == 1 {
-                        if let Some(xform) = xforms[0].to_full() {
-                            Some(xform)
-                        } else {
-                            return;
-                        }
-                    } else if xforms.len() == 0 {
-                        Some(XformFull::identity())
-                    } else {
-                        None
-                    };
-                    if !xforms.is_empty() {
-                        ray_stack.pop_do_next_task(|ray_idx| {
-                            let t = rays.time(ray_idx);
-                            rays.update_local(
-                                ray_idx,
-                                &static_xform.unwrap_or_else(|| {
-                                    lerp_slice(xforms, t).to_full().unwrap_or(
-                                        // TODO: filter out ray instead.
-                                        XformFull::identity(),
-                                    )
-                                }),
+                    // Test the ray against the object, or recurse into the sub-assembly.
+                    match inst.instance_type {
+                        InstanceType::Object => {
+                            self.trace_object(
+                                &assembly.objects[inst.data_index],
+                                inst.surface_shader_index
+                                    .map(|i| assembly.surface_shaders[i]),
+                                ray,
+                                &local_ray,
+                                &local_space,
+                                isect,
                             );
-                        });
-                    } else {
-                        let ident = XformFull::identity();
-                        ray_stack.pop_do_next_task(|ray_idx| {
-                            rays.update_local(ray_idx, &ident);
-                        });
+                        }
+
+                        InstanceType::Assembly => {
+                            self.trace_assembly(
+                                &assembly.assemblies[inst.data_index],
+                                ray,
+                                &local_ray,
+                                &local_space,
+                                isect,
+                            );
+                        }
+                    }
+
+                    if ray.is_done() {
+                        return;
                     }
                 }
             });
     }
 
     fn trace_object<'b>(
-        &'b mut self,
+        &mut self,
         obj: &Object,
         surface_shader: Option<&dyn SurfaceShader>,
-        rays: &mut RayBatch,
-        ray_stack: &mut RayStack,
+        ray: &mut Ray,
+        local_ray: &LocalRay,
+        space: &XformFull,
+        isect: &mut SurfaceIntersection,
     ) {
         match *obj {
             Object::Surface(surface) => {
@@ -201,13 +113,7 @@ impl<'a> TracerInner<'a> {
                 };
                 let shader = surface_shader.unwrap_or(&unassigned_shader);
 
-                surface.intersect_rays(
-                    rays,
-                    ray_stack,
-                    &mut self.isects,
-                    shader,
-                    self.xform_stack.top(),
-                );
+                surface.intersect_ray(ray, local_ray, space, isect, shader);
             }
 
             Object::SurfaceLight(surface) => {
@@ -216,13 +122,7 @@ impl<'a> TracerInner<'a> {
                     color: Color::new_xyz(rec709_to_xyz((1.0, 0.0, 1.0))),
                 };
 
-                surface.intersect_rays(
-                    rays,
-                    ray_stack,
-                    &mut self.isects,
-                    &bogus_shader,
-                    self.xform_stack.top(),
-                );
+                surface.intersect_ray(ray, local_ray, space, isect, &bogus_shader);
             }
         }
     }
diff --git a/src/transform_stack.rs b/src/transform_stack.rs
index ce5356d..ce76356 100644
--- a/src/transform_stack.rs
+++ b/src/transform_stack.rs
@@ -1,83 +1,42 @@
-use std::{
-    cmp,
-    mem::{transmute, MaybeUninit},
-};
-
-use crate::{algorithm::merge_slices_to, math::Xform};
+use crate::math::Xform;
 
+/// A stack of accumulated transforms.
+///
+/// Each entry stores the transform that was pushed composed with the entry
+/// below it (see `push`), so the top entry always reflects the whole chain.
 pub struct TransformStack {
-    stack: Vec<MaybeUninit<Xform>>,
-    stack_indices: Vec<usize>,
+    stack: Vec<Xform>,
 }
 
 impl TransformStack {
+    /// Creates an empty transform stack.
     pub fn new() -> TransformStack {
-        let mut ts = TransformStack {
-            stack: Vec::new(),
-            stack_indices: Vec::new(),
-        };
-
-        ts.stack_indices.push(0);
-        ts.stack_indices.push(0);
-
-        ts
+        TransformStack { stack: Vec::new() }
     }
 
+    /// Removes every transform from the stack.
     pub fn clear(&mut self) {
         self.stack.clear();
-        self.stack_indices.clear();
-        self.stack_indices.push(0);
-        self.stack_indices.push(0);
     }
 
-    pub fn push(&mut self, xforms: &[Xform]) {
-        assert!(!xforms.is_empty());
-
-        if self.stack.is_empty() {
-            let xforms: &[MaybeUninit<Xform>] = unsafe { transmute(xforms) };
-            self.stack.extend(xforms);
-        } else {
-            let sil = self.stack_indices.len();
-            let i1 = self.stack_indices[sil - 2];
-            let i2 = self.stack_indices[sil - 1];
-            // Reserve stack space for the new transforms.
-            // Note this leaves exposed uninitialized memory.  The subsequent call to
-            // merge_slices_to() fills that memory in.
-            {
-                let maxlen = cmp::max(xforms.len(), i2 - i1);
-                self.stack.reserve(maxlen);
-                let l = self.stack.len();
-                unsafe { self.stack.set_len(l + maxlen) };
-            }
-            let (xfs1, xfs2) = self.stack.split_at_mut(i2);
-            merge_slices_to(
-                unsafe { transmute(&xfs1[i1..i2]) },
-                xforms,
-                xfs2,
-                |xf1, xf2| xf2.compose(xf1),
-            );
+    /// Pushes `xform` onto the stack.
+    ///
+    /// On an empty stack `xform` is stored unchanged; otherwise the stored
+    /// entry is `xform.compose(top)`, where `top` is the current top entry.
+    pub fn push(&mut self, xform: Xform) {
+        match self.stack.last() {
+            None => self.stack.push(xform),
+            Some(prev_xform) => self.stack.push(xform.compose(prev_xform)),
         }
-
-        self.stack_indices.push(self.stack.len());
     }
 
-    pub fn pop(&mut self) {
-        assert!(self.stack_indices.len() > 2);
-
-        let sl = self.stack.len();
-        let sil = self.stack_indices.len();
-        let i1 = self.stack_indices[sil - 2];
-        let i2 = self.stack_indices[sil - 1];
-
-        self.stack.truncate(sl - (i2 - i1));
-        self.stack_indices.pop();
+    /// Removes the top entry and returns it, or `None` if the stack is empty.
+    pub fn pop(&mut self) -> Option<Xform> {
+        self.stack.pop()
     }
 
-    pub fn top(&self) -> &[Xform] {
-        let sil = self.stack_indices.len();
-        let i1 = self.stack_indices[sil - 2];
-        let i2 = self.stack_indices[sil - 1];
-
-        unsafe { transmute(&self.stack[i1..i2]) }
+    /// Returns a reference to the top entry, or `None` if the stack is empty.
+    pub fn top(&self) -> Option<&Xform> {
+        self.stack.last()
     }
 }
diff --git a/sub_crates/rmath/src/vector.rs b/sub_crates/rmath/src/vector.rs
index 87cc1ff..34888ff 100644
--- a/sub_crates/rmath/src/vector.rs
+++ b/sub_crates/rmath/src/vector.rs
@@ -42,6 +42,13 @@ impl Vector {
         Self(self.0.abs())
     }
 
+    /// Returns a new vector wrapping `recip()` of the inner value.
+    /// NOTE(review): presumably the component-wise reciprocal; confirm against the wrapped type.
+    #[inline(always)]
+    pub fn recip(self) -> Self {
+        Self(self.0.recip())
+    }
+
     #[inline(always)]
     pub fn into_point(self) -> Point {
         Point(self.0)