From 3d4ac7f57b1068cdecbe7efcf9cbbc42d52b9a03 Mon Sep 17 00:00:00 2001
From: Nathan Vegdahl <cessen@cessen.com>
Date: Fri, 21 Jun 2019 23:02:44 +0900
Subject: [PATCH 01/20] Created a RayBatch type, which stores multiple rays in
 SoA layout.

---
 src/ray.rs | 99 +++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 95 insertions(+), 4 deletions(-)
diff --git a/src/ray.rs b/src/ray.rs
index cf91b74..d2cf51f 100644
--- a/src/ray.rs
+++ b/src/ray.rs
@@ -4,8 +4,99 @@ use float4::Float4;
 
 use crate::math::{Matrix4x4, Point, Vector};
 
-const OCCLUSION_FLAG: u32 = 1;
-const DONE_FLAG: u32 = 1 << 1;
+type FlagType = u8;
+const OCCLUSION_FLAG: FlagType = 1;
+const DONE_FLAG: FlagType = 1 << 1;
+
+/// A batch of rays, stored in SoA layout.
+#[derive(Debug)]
+pub struct RayBatch {
+    pub orig_world: Vec<Point>,
+    pub dir_world: Vec<Vector>,
+    pub orig_accel: Vec<Point>,
+    pub dir_inv_accel: Vec<Vector>,
+    pub max_t: Vec<f32>,
+    pub time: Vec<f32>,
+    pub wavelength: Vec<f32>,
+    pub flags: Vec<FlagType>,
+}
+
+impl RayBatch {
+    /// Creates a new empty ray batch.
+    pub fn new() -> RayBatch {
+        RayBatch {
+            orig_world: Vec::new(),
+            dir_world: Vec::new(),
+            orig_accel: Vec::new(),
+            dir_inv_accel: Vec::new(),
+            max_t: Vec::new(),
+            time: Vec::new(),
+            wavelength: Vec::new(),
+            flags: Vec::new(),
+        }
+    }
+
+    /// Creates a new empty ray batch, with pre-allocated capacity for
+    /// `n` rays.
+    pub fn with_capacity(n: usize) -> RayBatch {
+        RayBatch {
+            orig_world: Vec::with_capacity(n),
+            dir_world: Vec::with_capacity(n),
+            orig_accel: Vec::with_capacity(n),
+            dir_inv_accel: Vec::with_capacity(n),
+            max_t: Vec::with_capacity(n),
+            time: Vec::with_capacity(n),
+            wavelength: Vec::with_capacity(n),
+            flags: Vec::with_capacity(n),
+        }
+    }
+
+    /// Clear all rays, settings the size of the batch back to zero.
+    ///
+    /// Capacity is maintained.
+    pub fn clear(&mut self) {
+        self.orig_world.clear();
+        self.dir_world.clear();
+        self.orig_accel.clear();
+        self.dir_inv_accel.clear();
+        self.max_t.clear();
+        self.time.clear();
+        self.wavelength.clear();
+        self.flags.clear();
+    }
+
+    /// Returns whether the given ray (at index `idx`) is an occlusion ray.
+    pub fn is_occlusion(&self, idx: usize) -> bool {
+        (self.flags[idx] & OCCLUSION_FLAG) != 0
+    }
+
+    /// Returns whether the given ray (at index `idx`) has finished traversal.
+    pub fn is_done(&self, idx: usize) -> bool {
+        (self.flags[idx] & DONE_FLAG) != 0
+    }
+
+    /// Marks the given ray (at index `idx`) as an occlusion ray.
+    pub fn mark_occlusion(&mut self, idx: usize) {
+        self.flags[idx] |= OCCLUSION_FLAG
+    }
+
+    /// Marks the given ray (at index `idx`) as having finished traversal.
+    pub fn mark_done(&mut self, idx: usize) {
+        self.flags[idx] |= DONE_FLAG
+    }
+
+    /// Updates the accel data of the given ray (at index `idx`) with the
+    /// given world-to-local-space transform matrix.
+    ///
+    /// This should be called when entering (and exiting) traversal of a
+    /// new transform space.
+    pub fn update_accel(&mut self, idx: usize, xform: &Matrix4x4) {
+        self.orig_accel[idx] = self.orig_world[idx] * *xform;
+        self.dir_inv_accel[idx] = Vector {
+            co: Float4::splat(1.0) / (self.dir_world[idx] * *xform).co,
+        };
+    }
+}
 
 #[derive(Debug, Copy, Clone)]
 pub struct Ray {
@@ -14,7 +105,7 @@ pub struct Ray {
     pub max_t: f32,
     pub time: f32,
     pub wavelength: f32,
-    pub flags: u32,
+    pub flags: FlagType,
 }
 
 impl Ray {
@@ -56,7 +147,7 @@ pub struct AccelRay {
     pub dir_inv: Vector,
     pub max_t: f32,
     pub time: f32,
-    pub flags: u32,
+    pub flags: FlagType,
     pub id: u32,
 }
 

From 1a29b16aa28c454aeb05966bff1ce60a43d070f4 Mon Sep 17 00:00:00 2001
From: Nathan Vegdahl <cessen@cessen.com>
Date: Sat, 22 Jun 2019 04:19:55 +0900
Subject: [PATCH 02/20] Sketching out the structures for ray traversal
 tracking.

---
 src/ray.rs | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/src/ray.rs b/src/ray.rs
index d2cf51f..2c7e41b 100644
--- a/src/ray.rs
+++ b/src/ray.rs
@@ -98,6 +98,30 @@ impl RayBatch {
     }
 }
 
+/// A structure used for tracking traversal of a ray batch through a scene.
+#[derive(Debug)]
+pub struct RayStack {
+    lanes: Vec<Vec<u16>>,
+    tasks: Vec<RayTask>,
+}
+
+/// A task within a RayStack.
+#[derive(Debug)]
+pub enum RayTask {
+    // A barrier represents a division when traversing into a new system.
+    // For example, when traversing from the top-level BVH into an object's
+    // local BVH.  It helps with keeping track of where we're at and aids in
+    // debugging.
+    Barrier,
+
+    // A task for handling a set of rays.
+    //
+    // Specifies the lane that the relevant ray pointers are in, and the
+    // starting index within that lane.  The relevant pointers are always
+    // `&[start_idx..]` within the given lane.
+    Rays { lane: usize, start_idx: usize },
+}
+
 #[derive(Debug, Copy, Clone)]
 pub struct Ray {
     pub orig: Point,

From 630a79aca5582ae7b685fb7659939c580c41f2e4 Mon Sep 17 00:00:00 2001
From: Nathan Vegdahl <cessen@cessen.com>
Date: Sun, 23 Jun 2019 18:40:52 +0900
Subject: [PATCH 03/20] Initial implementation of ORST traversal.

This is a "just get it working" implementation.  Performance
optimizations still need to be done.
---
 src/accel/bvh4.rs              | 101 +++++++----
 src/accel/mod.rs               |   6 +-
 src/bbox.rs                    |  11 +-
 src/camera.rs                  |   8 +-
 src/light/rectangle_light.rs   |  47 +++---
 src/light/sphere_light.rs      |  58 +++----
 src/main.rs                    |   7 +-
 src/ray.rs                     | 295 +++++++++++++++++++++------------
 src/renderer.rs                |  75 +++++----
 src/surface/micropoly_batch.rs |   8 +-
 src/surface/mod.rs             |   8 +-
 src/surface/triangle.rs        |  34 ++--
 src/surface/triangle_mesh.rs   | 134 ++++++++-------
 src/tracer.rs                  | 202 +++++++++-------------
 14 files changed, 548 insertions(+), 446 deletions(-)

diff --git a/src/accel/bvh4.rs b/src/accel/bvh4.rs
index 11766bc..c739d38 100644
--- a/src/accel/bvh4.rs
+++ b/src/accel/bvh4.rs
@@ -1,10 +1,14 @@
 #![allow(dead_code)]
 
 use bvh_order::{calc_traversal_code, SplitAxes, TRAVERSAL_TABLE};
+use math3d::Vector;
 use mem_arena::MemArena;
 
 use crate::{
-    algorithm::partition, bbox::BBox, boundable::Boundable, lerp::lerp_slice, ray::AccelRay,
+    bbox::BBox,
+    boundable::Boundable,
+    lerp::lerp_slice,
+    ray::{RayBatch, RayStack},
     timer::Timer,
 };
 
@@ -13,6 +17,13 @@ use super::{
     ACCEL_NODE_RAY_TESTS, ACCEL_TRAV_TIME,
 };
 
+pub fn ray_code(dir: Vector) -> usize {
+    let ray_sign_is_neg = [dir.x() < 0.0, dir.y() < 0.0, dir.z() < 0.0];
+    ray_sign_is_neg[0] as usize
+        + ((ray_sign_is_neg[1] as usize) << 1)
+        + ((ray_sign_is_neg[2] as usize) << 2)
+}
+
 #[derive(Copy, Clone, Debug)]
 pub struct BVH4<'a> {
     root: Option<&'a BVH4Node<'a>>,
@@ -66,9 +77,14 @@ impl<'a> BVH4<'a> {
         self.depth
     }
 
-    pub fn traverse<T, F>(&self, rays: &mut [AccelRay], objects: &[T], mut obj_ray_test: F)
-    where
-        F: FnMut(&T, &mut [AccelRay]),
+    pub fn traverse<T, F>(
+        &self,
+        rays: &mut RayBatch,
+        ray_stack: &mut RayStack,
+        objects: &[T],
+        mut obj_ray_test: F,
+    ) where
+        F: FnMut(&T, &mut RayBatch, &mut RayStack),
     {
         if self.root.is_none() {
             return;
@@ -78,25 +94,15 @@ impl<'a> BVH4<'a> {
         let mut trav_time: f64 = 0.0;
         let mut node_tests: u64 = 0;
 
-        let traversal_table = {
-            let ray_sign_is_neg = [
-                rays[0].dir_inv.x() < 0.0,
-                rays[0].dir_inv.y() < 0.0,
-                rays[0].dir_inv.z() < 0.0,
-            ];
-            let ray_code = ray_sign_is_neg[0] as usize
-                + ((ray_sign_is_neg[1] as usize) << 1)
-                + ((ray_sign_is_neg[2] as usize) << 2);
-            &TRAVERSAL_TABLE[ray_code]
-        };
+        let traversal_table =
+            &TRAVERSAL_TABLE[ray_code(rays.dir_inv_accel[ray_stack.next_task_ray_idx(0)])];
 
         // +2 of max depth for root and last child
         let mut node_stack = [self.root.unwrap(); (BVH_MAX_DEPTH * 3) + 2];
-        let mut ray_i_stack = [rays.len(); (BVH_MAX_DEPTH * 3) + 2];
         let mut stack_ptr = 1;
 
         while stack_ptr > 0 {
-            node_tests += ray_i_stack[stack_ptr] as u64;
+            node_tests += ray_stack.ray_count_in_next_task() as u64;
             match *node_stack[stack_ptr] {
                 BVH4Node::Inner {
                     traversal_code,
@@ -104,12 +110,29 @@ impl<'a> BVH4<'a> {
                     bounds_len,
                     children,
                 } => {
+                    // Test rays against bbox.
                     let bounds =
                         unsafe { std::slice::from_raw_parts(bounds_start, bounds_len as usize) };
-                    let part = partition(&mut rays[..ray_i_stack[stack_ptr]], |r| {
-                        (!r.is_done()) && lerp_slice(bounds, r.time).intersect_accel_ray(r)
+
+                    let mut hit_count = 0;
+                    ray_stack.pop_do_next_task(children.len(), |ray_idx| {
+                        let hit = (!rays.is_done(ray_idx))
+                            && lerp_slice(bounds, rays.time[ray_idx]).intersect_ray(
+                                rays.orig_accel[ray_idx],
+                                rays.dir_inv_accel[ray_idx],
+                                rays.max_t[ray_idx],
+                            );
+
+                        if hit {
+                            hit_count += 1;
+                            ([0, 1, 2, 3, 4, 5, 6, 7], children.len())
+                        } else {
+                            ([0, 1, 2, 3, 4, 5, 6, 7], 0)
+                        }
                     });
-                    if part > 0 {
+
+                    // If there were any intersections, create tasks.
+                    if hit_count > 0 {
                         let order_code = traversal_table[traversal_code as usize];
                         match children.len() {
                             4 => {
@@ -118,10 +141,7 @@ impl<'a> BVH4<'a> {
                                 let i2 = ((order_code >> 2) & 0b11) as usize;
                                 let i1 = (order_code & 0b11) as usize;
 
-                                ray_i_stack[stack_ptr] = part;
-                                ray_i_stack[stack_ptr + 1] = part;
-                                ray_i_stack[stack_ptr + 2] = part;
-                                ray_i_stack[stack_ptr + 3] = part;
+                                ray_stack.push_lanes_to_tasks(&[i4, i3, i2, i1]);
 
                                 node_stack[stack_ptr] = &children[i4];
                                 node_stack[stack_ptr + 1] = &children[i3];
@@ -135,9 +155,7 @@ impl<'a> BVH4<'a> {
                                 let i2 = ((order_code >> 2) & 0b11) as usize;
                                 let i1 = (order_code & 0b11) as usize;
 
-                                ray_i_stack[stack_ptr] = part;
-                                ray_i_stack[stack_ptr + 1] = part;
-                                ray_i_stack[stack_ptr + 2] = part;
+                                ray_stack.push_lanes_to_tasks(&[i3, i2, i1]);
 
                                 node_stack[stack_ptr] = &children[i3];
                                 node_stack[stack_ptr + 1] = &children[i2];
@@ -149,8 +167,7 @@ impl<'a> BVH4<'a> {
                                 let i2 = ((order_code >> 2) & 0b11) as usize;
                                 let i1 = (order_code & 0b11) as usize;
 
-                                ray_i_stack[stack_ptr] = part;
-                                ray_i_stack[stack_ptr + 1] = part;
+                                ray_stack.push_lanes_to_tasks(&[i2, i1]);
 
                                 node_stack[stack_ptr] = &children[i2];
                                 node_stack[stack_ptr + 1] = &children[i1];
@@ -169,17 +186,33 @@ impl<'a> BVH4<'a> {
                     bounds_start,
                     bounds_len,
                 } => {
+                    // Test rays against bounds.
                     let bounds =
                         unsafe { std::slice::from_raw_parts(bounds_start, bounds_len as usize) };
-                    let part = partition(&mut rays[..ray_i_stack[stack_ptr]], |r| {
-                        (!r.is_done()) && lerp_slice(bounds, r.time).intersect_accel_ray(r)
-                    });
+                    let object_count = object_range.1 - object_range.0;
+                    let mut hit_count = 0;
 
+                    ray_stack.pop_do_next_task(object_count, |ray_idx| {
+                        let hit = (!rays.is_done(ray_idx))
+                            && lerp_slice(bounds, rays.time[ray_idx]).intersect_ray(
+                                rays.orig_accel[ray_idx],
+                                rays.dir_inv_accel[ray_idx],
+                                rays.max_t[ray_idx],
+                            );
+                        if hit {
+                            hit_count += 1;
+                            ([0, 1, 2, 3, 4, 5, 6, 7], object_count)
+                        } else {
+                            ([0, 1, 2, 3, 4, 5, 6, 7], 0)
+                        }
+                    });
+                    
                     trav_time += timer.tick() as f64;
 
-                    if part > 0 {
+                    if hit_count > 0 {
+                        ray_stack.push_lanes_to_tasks(&[0, 1, 2, 3, 4, 5, 6, 7][..object_count]);
                         for obj in &objects[object_range.0..object_range.1] {
-                            obj_ray_test(obj, &mut rays[..part]);
+                            obj_ray_test(obj, rays, ray_stack);
                         }
                     }
 
diff --git a/src/accel/mod.rs b/src/accel/mod.rs
index fe8ee3d..abbb1d4 100644
--- a/src/accel/mod.rs
+++ b/src/accel/mod.rs
@@ -1,4 +1,4 @@
-mod bvh;
+// mod bvh;
 mod bvh4;
 mod bvh_base;
 mod light_array;
@@ -13,8 +13,8 @@ use crate::{
 };
 
 pub use self::{
-    bvh::{BVHNode, BVH},
-    bvh4::{BVH4Node, BVH4},
+    // bvh::{BVHNode, BVH},
+    bvh4::{ray_code, BVH4Node, BVH4},
     light_array::LightArray,
     light_tree::LightTree,
 };
diff --git a/src/bbox.rs b/src/bbox.rs
index 33d3e6e..a4a43bb 100644
--- a/src/bbox.rs
+++ b/src/bbox.rs
@@ -7,8 +7,7 @@ use std::{
 
 use crate::{
     lerp::{lerp, lerp_slice, Lerp},
-    math::{fast_minf32, Matrix4x4, Point},
-    ray::AccelRay,
+    math::{fast_minf32, Matrix4x4, Point, Vector},
 };
 
 const BBOX_MAXT_ADJUST: f32 = 1.000_000_24;
@@ -40,17 +39,17 @@ impl BBox {
     }
 
     // Returns whether the given ray intersects with the bbox.
-    pub fn intersect_accel_ray(&self, ray: &AccelRay) -> bool {
+    pub fn intersect_ray(&self, orig: Point, dir_inv: Vector, max_t: f32) -> bool {
         // Calculate slab intersections
-        let t1 = (self.min.co - ray.orig.co) * ray.dir_inv.co;
-        let t2 = (self.max.co - ray.orig.co) * ray.dir_inv.co;
+        let t1 = (self.min.co - orig.co) * dir_inv.co;
+        let t2 = (self.max.co - orig.co) * dir_inv.co;
 
         // Find the far and near intersection
         let mut far_t = t1.v_max(t2);
         let mut near_t = t1.v_min(t2);
         far_t.set_3(std::f32::INFINITY);
         near_t.set_3(0.0);
-        let far_hit_t = fast_minf32(far_t.h_min() * BBOX_MAXT_ADJUST, ray.max_t);
+        let far_hit_t = fast_minf32(far_t.h_min() * BBOX_MAXT_ADJUST, max_t);
         let near_hit_t = near_t.h_max();
 
         // Did we hit?
diff --git a/src/camera.rs b/src/camera.rs
index e3ed8c5..287805c 100644
--- a/src/camera.rs
+++ b/src/camera.rs
@@ -92,6 +92,12 @@ impl<'a> Camera<'a> {
         )
         .normalized();
 
-        Ray::new(orig * transform, dir * transform, time, wavelength, false)
+        Ray {
+            orig: orig * transform,
+            dir: dir * transform,
+            time: time,
+            wavelength: wavelength,
+            max_t: std::f32::INFINITY,
+        }
     }
 }
diff --git a/src/light/rectangle_light.rs b/src/light/rectangle_light.rs
index 98bae49..4711c36 100644
--- a/src/light/rectangle_light.rs
+++ b/src/light/rectangle_light.rs
@@ -6,7 +6,7 @@ use crate::{
     color::{Color, SpectralSample},
     lerp::lerp_slice,
     math::{cross, dot, Matrix4x4, Normal, Point, Vector},
-    ray::{AccelRay, Ray},
+    ray::{RayBatch, RayStack},
     sampling::{
         spherical_triangle_solid_angle, triangle_surface_area, uniform_sample_spherical_triangle,
         uniform_sample_triangle,
@@ -257,20 +257,23 @@ impl<'a> SurfaceLight for RectangleLight<'a> {
 impl<'a> Surface for RectangleLight<'a> {
     fn intersect_rays(
         &self,
-        accel_rays: &mut [AccelRay],
-        wrays: &[Ray],
+        rays: &mut RayBatch,
+        ray_stack: &mut RayStack,
         isects: &mut [SurfaceIntersection],
         shader: &SurfaceShader,
         space: &[Matrix4x4],
     ) {
         let _ = shader; // Silence 'unused' warning
 
-        for r in accel_rays.iter_mut() {
-            let wr = &wrays[r.id as usize];
+        ray_stack.pop_do_next_task(0, |ray_idx| {
+            let time = rays.time[ray_idx];
+            let orig = rays.orig_world[ray_idx];
+            let dir = rays.dir_world[ray_idx];
+            let max_t = rays.max_t[ray_idx];
 
             // Calculate time interpolated values
-            let dim = lerp_slice(self.dimensions, r.time);
-            let xform = lerp_slice(space, r.time);
+            let dim = lerp_slice(self.dimensions, time);
+            let xform = lerp_slice(space, time);
 
             let space_inv = xform.inverse();
 
@@ -282,17 +285,17 @@ impl<'a> Surface for RectangleLight<'a> {
 
             // Test against two triangles that make up the light
             for tri in &[(p1, p2, p3), (p3, p4, p1)] {
-                if let Some((t, b0, b1, b2)) = triangle::intersect_ray(wr, *tri) {
-                    if t < r.max_t {
-                        if r.is_occlusion() {
-                            isects[r.id as usize] = SurfaceIntersection::Occlude;
-                            r.mark_done();
+                if let Some((t, b0, b1, b2)) = triangle::intersect_ray(orig, dir, max_t, *tri) {
+                    if t < max_t {
+                        if rays.is_occlusion(ray_idx) {
+                            isects[ray_idx] = SurfaceIntersection::Occlude;
+                            rays.mark_done(ray_idx);
                         } else {
                             let (pos, pos_err) = triangle::surface_point(*tri, (b0, b1, b2));
                             let normal = cross(tri.0 - tri.1, tri.0 - tri.2).into_normal();
 
                             let intersection_data = SurfaceIntersectionData {
-                                incoming: wr.dir,
+                                incoming: dir,
                                 t: t,
                                 pos: pos,
                                 pos_err: pos_err,
@@ -301,35 +304,37 @@ impl<'a> Surface for RectangleLight<'a> {
                                 local_space: xform,
                                 sample_pdf: self.sample_pdf(
                                     &xform,
-                                    wr.orig,
-                                    wr.dir,
+                                    orig,
+                                    dir,
                                     pos,
-                                    wr.wavelength,
-                                    r.time,
+                                    rays.wavelength[ray_idx],
+                                    time,
                                 ),
                             };
 
                             let closure = {
                                 let inv_surface_area = (1.0 / (dim.0 as f64 * dim.1 as f64)) as f32;
-                                let color = lerp_slice(self.colors, r.time) * inv_surface_area;
+                                let color = lerp_slice(self.colors, time) * inv_surface_area;
                                 SurfaceClosure::Emit(color)
                             };
 
                             // Fill in intersection
-                            isects[r.id as usize] = SurfaceIntersection::Hit {
+                            isects[ray_idx] = SurfaceIntersection::Hit {
                                 intersection_data: intersection_data,
                                 closure: closure,
                             };
 
                             // Set ray's max t
-                            r.max_t = t;
+                            rays.max_t[ray_idx] = t;
                         }
 
                         break;
                     }
                 }
             }
-        }
+
+            ([0, 0, 0, 0, 0, 0, 0, 0], 0)
+        });
     }
 }
 
diff --git a/src/light/sphere_light.rs b/src/light/sphere_light.rs
index 2323902..944baa8 100644
--- a/src/light/sphere_light.rs
+++ b/src/light/sphere_light.rs
@@ -8,7 +8,7 @@ use crate::{
     color::{Color, SpectralSample},
     lerp::lerp_slice,
     math::{coordinate_system_from_vector, dot, Matrix4x4, Normal, Point, Vector},
-    ray::{AccelRay, Ray},
+    ray::{RayBatch, RayStack},
     sampling::{uniform_sample_cone, uniform_sample_cone_pdf, uniform_sample_sphere},
     shading::surface_closure::SurfaceClosure,
     shading::SurfaceShader,
@@ -206,26 +206,26 @@ impl<'a> SurfaceLight for SphereLight<'a> {
 impl<'a> Surface for SphereLight<'a> {
     fn intersect_rays(
         &self,
-        accel_rays: &mut [AccelRay],
-        wrays: &[Ray],
+        rays: &mut RayBatch,
+        ray_stack: &mut RayStack,
         isects: &mut [SurfaceIntersection],
         shader: &SurfaceShader,
         space: &[Matrix4x4],
     ) {
         let _ = shader; // Silence 'unused' warning
 
-        for r in accel_rays.iter_mut() {
-            let wr = &wrays[r.id as usize];
+        ray_stack.pop_do_next_task(0, |ray_idx| {
+            let time = rays.time[ray_idx];
 
             // Get the transform space
-            let xform = lerp_slice(space, r.time);
+            let xform = lerp_slice(space, time);
 
             // Get the radius of the sphere at the ray's time
-            let radius = lerp_slice(self.radii, r.time); // Radius of the sphere
+            let radius = lerp_slice(self.radii, time); // Radius of the sphere
 
             // Get the ray origin and direction in local space
-            let orig = r.orig.into_vector();
-            let dir = wr.dir * xform;
+            let orig = rays.orig_accel[ray_idx].into_vector();
+            let dir = rays.dir_world[ray_idx] * xform;
 
             // Code adapted to Rust from https://github.com/Tecla/Rayito
             // Ray-sphere intersection can result in either zero, one or two points
@@ -242,7 +242,7 @@ impl<'a> Surface for SphereLight<'a> {
             let discriminant = (b * b) - (4.0 * a * c);
             if discriminant < 0.0 {
                 // Discriminant less than zero?  No solution => no intersection.
-                continue;
+                return ([0, 0, 0, 0, 0, 0, 0, 0], 0);
             }
             let discriminant = discriminant.sqrt();
 
@@ -257,7 +257,7 @@ impl<'a> Surface for SphereLight<'a> {
 
             // Get our final parametric values
             let mut t0 = q / a;
-            let mut t1 = if q != 0.0 { c / q } else { r.max_t };
+            let mut t1 = if q != 0.0 { c / q } else { rays.max_t[ray_idx] };
 
             // Swap them so they are ordered right
             if t0 > t1 {
@@ -266,25 +266,25 @@ impl<'a> Surface for SphereLight<'a> {
             }
 
             // Check our intersection for validity against this ray's extents
-            if t0 > r.max_t || t1 <= 0.0 {
-                // Didn't hit because shere is entirely outside of ray's extents
-                continue;
+            if t0 > rays.max_t[ray_idx] || t1 <= 0.0 {
+                // Didn't hit because sphere is entirely outside of ray's extents
+                return ([0, 0, 0, 0, 0, 0, 0, 0], 0);
             }
 
             let t = if t0 > 0.0 {
                 t0
-            } else if t1 <= r.max_t {
+            } else if t1 <= rays.max_t[ray_idx] {
                 t1
             } else {
                 // Didn't hit because ray is entirely within the sphere, and
                 // therefore doesn't hit its surface.
-                continue;
+                return ([0, 0, 0, 0, 0, 0, 0, 0], 0);
             };
 
             // We hit the sphere, so calculate intersection info.
-            if r.is_occlusion() {
-                isects[r.id as usize] = SurfaceIntersection::Occlude;
-                r.mark_done();
+            if rays.is_occlusion(ray_idx) {
+                isects[ray_idx] = SurfaceIntersection::Occlude;
+                rays.mark_done(ray_idx);
             } else {
                 let inv_xform = xform.inverse();
 
@@ -300,7 +300,7 @@ impl<'a> Surface for SphereLight<'a> {
                 let normal = unit_pos.into_normal() * inv_xform;
 
                 let intersection_data = SurfaceIntersectionData {
-                    incoming: wr.dir,
+                    incoming: rays.dir_world[ray_idx],
                     t: t,
                     pos: pos,
                     pos_err: pos_err,
@@ -309,32 +309,34 @@ impl<'a> Surface for SphereLight<'a> {
                     local_space: xform,
                     sample_pdf: self.sample_pdf(
                         &xform,
-                        wr.orig,
-                        wr.dir,
+                        rays.orig_world[ray_idx],
+                        rays.dir_world[ray_idx],
                         0.0,
                         0.0,
-                        wr.wavelength,
-                        r.time,
+                        rays.wavelength[ray_idx],
+                        time,
                     ),
                 };
 
                 let closure = {
                     let inv_surface_area =
                         (1.0 / (4.0 * PI_64 * radius as f64 * radius as f64)) as f32;
-                    let color = lerp_slice(self.colors, r.time) * inv_surface_area;
+                    let color = lerp_slice(self.colors, time) * inv_surface_area;
                     SurfaceClosure::Emit(color)
                 };
 
                 // Fill in intersection
-                isects[r.id as usize] = SurfaceIntersection::Hit {
+                isects[ray_idx] = SurfaceIntersection::Hit {
                     intersection_data: intersection_data,
                     closure: closure,
                 };
 
                 // Set ray's max t
-                r.max_t = t;
+                rays.max_t[ray_idx] = t;
             }
-        }
+
+            ([0, 0, 0, 0, 0, 0, 0, 0], 0)
+        });
     }
 }
 
diff --git a/src/main.rs b/src/main.rs
index c1f5cef..bd5cf51 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -47,10 +47,9 @@ use nom::{error_position, take_until};
 use mem_arena::MemArena;
 
 use crate::{
-    accel::{BVH4Node, BVHNode},
+    accel::BVH4Node,
     bbox::BBox,
     parse::{parse_scene, DataTree},
-    ray::{AccelRay, Ray},
     renderer::LightPath,
     surface::SurfaceIntersection,
     timer::Timer,
@@ -159,15 +158,13 @@ fn main() {
 
     // Print some misc useful dev info.
     if args.is_present("dev") {
-        println!("Ray size:       {} bytes", mem::size_of::<Ray>());
-        println!("AccelRay size:  {} bytes", mem::size_of::<AccelRay>());
         println!(
             "SurfaceIntersection size:  {} bytes",
             mem::size_of::<SurfaceIntersection>()
         );
         println!("LightPath size: {} bytes", mem::size_of::<LightPath>());
         println!("BBox size: {} bytes", mem::size_of::<BBox>());
-        println!("BVHNode size: {} bytes", mem::size_of::<BVHNode>());
+        // println!("BVHNode size: {} bytes", mem::size_of::<BVHNode>());
         println!("BVH4Node size: {} bytes", mem::size_of::<BVH4Node>());
         return;
     }
diff --git a/src/ray.rs b/src/ray.rs
index 2c7e41b..1881dba 100644
--- a/src/ray.rs
+++ b/src/ray.rs
@@ -8,6 +8,17 @@ type FlagType = u8;
 const OCCLUSION_FLAG: FlagType = 1;
 const DONE_FLAG: FlagType = 1 << 1;
 
+/// This is never used directly in ray tracing--it's only used as a convenience
+/// for filling the RayBatch structure.
+#[derive(Debug, Copy, Clone)]
+pub struct Ray {
+    pub orig: Point,
+    pub dir: Vector,
+    pub time: f32,
+    pub wavelength: f32,
+    pub max_t: f32,
+}
+
 /// A batch of rays, stored in SoA layout.
 #[derive(Debug)]
 pub struct RayBatch {
@@ -51,6 +62,60 @@ impl RayBatch {
         }
     }
 
+    pub fn push(&mut self, ray: Ray, is_occlusion: bool) {
+        self.orig_world.push(ray.orig);
+        self.dir_world.push(ray.dir);
+        self.orig_accel.push(ray.orig); // Bogus, to place-hold.
+        self.dir_inv_accel.push(ray.dir); // Bogus, to place-hold.
+        self.time.push(ray.time);
+        self.wavelength.push(ray.wavelength);
+        if is_occlusion {
+            self.max_t.push(1.0);
+            self.flags.push(OCCLUSION_FLAG);
+        } else {
+            self.max_t.push(std::f32::INFINITY);
+            self.flags.push(0);
+        }
+    }
+
+    pub fn swap(&mut self, a: usize, b: usize) {
+        if a != b {
+            self.orig_world.swap(a, b);
+            self.dir_world.swap(a, b);
+            self.orig_accel.swap(a, b);
+            self.dir_inv_accel.swap(a, b);
+            self.max_t.swap(a, b);
+            self.time.swap(a, b);
+            self.wavelength.swap(a, b);
+            self.flags.swap(a, b);
+        }
+    }
+
+    pub fn set_from_ray(&mut self, ray: &Ray, is_shadow: bool, idx: usize) {
+        self.orig_world[idx] = ray.orig;
+        self.dir_world[idx] = ray.dir;
+        self.orig_accel[idx] = ray.orig;
+        self.dir_inv_accel[idx] = Vector {
+            co: Float4::splat(1.0) / ray.dir.co,
+        };
+        self.max_t[idx] = ray.max_t;
+        self.time[idx] = ray.time;
+        self.wavelength[idx] = ray.wavelength;
+        self.time[idx] = ray.time;
+        self.flags[idx] = if is_shadow { OCCLUSION_FLAG } else { 0 };
+    }
+
+    pub fn truncate(&mut self, len: usize) {
+        self.orig_world.truncate(len);
+        self.dir_world.truncate(len);
+        self.orig_accel.truncate(len);
+        self.dir_inv_accel.truncate(len);
+        self.max_t.truncate(len);
+        self.time.truncate(len);
+        self.wavelength.truncate(len);
+        self.flags.truncate(len);
+    }
+
     /// Clear all rays, settings the size of the batch back to zero.
     ///
     /// Capacity is maintained.
@@ -65,6 +130,10 @@ impl RayBatch {
         self.flags.clear();
     }
 
+    pub fn len(&self) -> usize {
+        self.orig_world.len()
+    }
+
     /// Returns whether the given ray (at index `idx`) is an occlusion ray.
     pub fn is_occlusion(&self, idx: usize) -> bool {
         (self.flags[idx] & OCCLUSION_FLAG) != 0
@@ -101,117 +170,129 @@ impl RayBatch {
 /// A structure used for tracking traversal of a ray batch through a scene.
 #[derive(Debug)]
 pub struct RayStack {
-    lanes: Vec<Vec<u16>>,
+    lanes: Vec<Lane>,
     tasks: Vec<RayTask>,
 }
 
-/// A task within a RayStack.
+impl RayStack {
+    pub fn new() -> RayStack {
+        RayStack {
+            lanes: Vec::new(),
+            tasks: Vec::new(),
+        }
+    }
+
+    /// Returns whether the stack is empty of tasks or not.
+    pub fn is_empty(&self) -> bool {
+        self.tasks.is_empty()
+    }
+
+    /// Grows `lanes` with empty lanes until it holds at least `count`; never shrinks.
+    pub fn ensure_lane_count(&mut self, count: usize) {
+        if self.lanes.len() < count {
+            self.lanes.resize_with(count, || Lane {
+                idxs: Vec::new(),
+                end_len: 0,
+            });
+        }
+    }
+
+    /// Number of ray indices in the topmost task.  Panics if there are no tasks.
+    pub fn ray_count_in_next_task(&self) -> usize {
+        let task = self.tasks.last().unwrap();
+        self.lanes[task.lane].end_len - task.start_idx
+    }
+
+    /// Returns the `i`th ray index of the topmost task.  Panics if there are no tasks.
+    pub fn next_task_ray_idx(&self, i: usize) -> usize {
+        let task = self.tasks.last().unwrap();
+        debug_assert!(task.start_idx + i < self.lanes[task.lane].end_len);
+        self.lanes[task.lane].idxs[task.start_idx + i] as usize
+    }
+
+    /// Empties every lane and drops all pending tasks.
+    ///
+    /// The outer `lanes` vector is deliberately left at its current length:
+    /// clearing it would drop each lane and free that lane's index buffer.
+    /// Each lane's `idxs` is cleared in place instead, so the buffers stay
+    /// available for reuse, and each `end_len` is reset to match the empty
+    /// lane.
+    pub fn clear(&mut self) {
+        self.tasks.clear();
+
+        for lane in &mut self.lanes {
+            lane.end_len = 0;
+            lane.idxs.clear();
+        }
+    }
+
+    /// Appends `ray_idx` to lane `lane`.  Panics on a missing lane or an index over `u16::MAX`.
+    pub fn push_ray_index(&mut self, ray_idx: usize, lane: usize) {
+        assert!(lane < self.lanes.len() && ray_idx <= std::u16::MAX as usize);
+        self.lanes[lane].idxs.push(ray_idx as u16);
+    }
+
+    /// For each listed lane, in order, wraps its not-yet-tasked tail in a new task.
+    pub fn push_lanes_to_tasks(&mut self, lane_idxs: &[usize]) {
+        for &lane_idx in lane_idxs {
+            let lane = &mut self.lanes[lane_idx];
+            if lane.idxs.len() > lane.end_len {
+                self.tasks.push(RayTask {
+                    lane: lane_idx,
+                    start_idx: lane.end_len,
+                });
+                lane.end_len = lane.idxs.len();
+            }
+        }
+    }
+
+    /// Pops the next task off the stack and calls `handle_ray` on each ray index
+    /// in it.  The closure returns a lane list plus the number of leading entries
+    /// in use; the ray index is added back into each of those lanes.
+    pub fn pop_do_next_task<F>(&mut self, needed_lanes: usize, mut handle_ray: F)
+    where
+        F: FnMut(usize) -> ([u8; 8], usize),
+    {
+        // Ensure at least `needed_lanes` lanes exist before any are indexed.
+        self.ensure_lane_count(needed_lanes);
+
+        // Pop the task; its range ends at the lane's `end_len`, which then rolls back.
+        let task = self.tasks.pop().unwrap();
+        let task_range = (task.start_idx, self.lanes[task.lane].end_len);
+        self.lanes[task.lane].end_len = task.start_idx;
+
+        // Run the task, compacting rays re-added to the source lane in place.
+        let mut source_lane_cap = task_range.0;
+        for i in task_range.0..task_range.1 {
+            let ray_idx = self.lanes[task.lane].idxs[i];
+            let (add_list, list_len) = handle_ray(ray_idx as usize);
+            for &l in &add_list[..list_len] {
+                if l == task.lane as u8 {
+                    self.lanes[l as usize].idxs[source_lane_cap] = ray_idx;
+                    source_lane_cap += 1;
+                } else {
+                    self.lanes[l as usize].idxs.push(ray_idx);
+                }
+            }
+        }
+        self.lanes[task.lane].idxs.truncate(source_lane_cap);
+    }
+}
+
+/// A lane within a RayStack.
 #[derive(Debug)]
-pub enum RayTask {
-    // A barrier represents a division when traversing into a new system.
-    // For example, when traversing from the top-level BVH into an object's
-    // local BVH.  It helps with keeping track of where we're at and aids in
-    // debugging.
-    Barrier,
-
-    // A task for handling a set of rays.
-    //
-    // Specifies the lane that the relevant ray pointers are in, and the
-    // starting index within that lane.  The relevant pointers are always
-    // `&[start_idx..]` within the given lane.
-    Rays { lane: usize, start_idx: usize },
+struct Lane {
+    idxs: Vec<u16>,
+    end_len: usize,
 }
 
-#[derive(Debug, Copy, Clone)]
-pub struct Ray {
-    pub orig: Point,
-    pub dir: Vector,
-    pub max_t: f32,
-    pub time: f32,
-    pub wavelength: f32,
-    pub flags: FlagType,
-}
-
-impl Ray {
-    pub fn new(orig: Point, dir: Vector, time: f32, wavelength: f32, is_occ: bool) -> Ray {
-        if !is_occ {
-            Ray {
-                orig: orig,
-                dir: dir,
-                max_t: std::f32::INFINITY,
-                time: time,
-                wavelength: wavelength,
-                flags: 0,
-            }
-        } else {
-            Ray {
-                orig: orig,
-                dir: dir,
-                max_t: 1.0,
-                time: time,
-                wavelength: wavelength,
-                flags: OCCLUSION_FLAG,
-            }
-        }
-    }
-
-    pub fn transform(&mut self, mat: &Matrix4x4) {
-        self.orig = self.orig * *mat;
-        self.dir = self.dir * *mat;
-    }
-
-    pub fn is_occlusion(&self) -> bool {
-        (self.flags & OCCLUSION_FLAG) != 0
-    }
-}
-
-#[derive(Debug, Copy, Clone)]
-pub struct AccelRay {
-    pub orig: Point,
-    pub dir_inv: Vector,
-    pub max_t: f32,
-    pub time: f32,
-    pub flags: FlagType,
-    pub id: u32,
-}
-
-impl AccelRay {
-    pub fn new(ray: &Ray, id: u32) -> AccelRay {
-        AccelRay {
-            orig: ray.orig,
-            dir_inv: Vector {
-                co: Float4::splat(1.0) / ray.dir.co,
-            },
-            max_t: ray.max_t,
-            time: ray.time,
-            flags: ray.flags,
-            id: id,
-        }
-    }
-
-    pub fn update_from_world_ray(&mut self, wr: &Ray) {
-        self.orig = wr.orig;
-        self.dir_inv = Vector {
-            co: Float4::splat(1.0) / wr.dir.co,
-        };
-    }
-
-    pub fn update_from_xformed_world_ray(&mut self, wr: &Ray, mat: &Matrix4x4) {
-        self.orig = wr.orig * *mat;
-        self.dir_inv = Vector {
-            co: Float4::splat(1.0) / (wr.dir * *mat).co,
-        };
-    }
-
-    pub fn is_occlusion(&self) -> bool {
-        (self.flags & OCCLUSION_FLAG) != 0
-    }
-
-    pub fn is_done(&self) -> bool {
-        (self.flags & DONE_FLAG) != 0
-    }
-
-    pub fn mark_done(&mut self) {
-        self.flags |= DONE_FLAG;
-    }
+/// A task within a RayStack.
+///
+/// Names the lane holding the task's ray indices and the position in that
+/// lane where they start.  While the task is topmost, its indices are
+/// `idxs[start_idx..end_len]` of that lane.
+#[derive(Debug)]
+struct RayTask {
+    lane: usize,
+    start_idx: usize,
 }
diff --git a/src/renderer.rs b/src/renderer.rs
index 8f1471f..6f3fe80 100644
--- a/src/renderer.rs
+++ b/src/renderer.rs
@@ -13,7 +13,6 @@ use float4::Float4;
 
 use crate::{
     accel::{ACCEL_NODE_RAY_TESTS, ACCEL_TRAV_TIME},
-    algorithm::partition_pair,
     color::{map_0_1_to_wavelength, SpectralSample, XYZ},
     fp_utils::robust_ray_origin,
     hash::hash_u32,
@@ -21,7 +20,7 @@ use crate::{
     image::Image,
     math::{fast_logit, upper_power_of_two},
     mis::power_heuristic,
-    ray::Ray,
+    ray::{Ray, RayBatch},
     scene::{Scene, SceneLightSample},
     surface,
     timer::Timer,
@@ -207,7 +206,7 @@ impl<'a> Renderer<'a> {
         let mut total_timer = Timer::new();
 
         let mut paths = Vec::new();
-        let mut rays = Vec::new();
+        let mut rays = RayBatch::new();
         let mut tracer = Tracer::from_assembly(&self.scene.root);
         let mut xform_stack = TransformStack::new();
 
@@ -266,7 +265,7 @@ impl<'a> Renderer<'a> {
                             offset + si as u32,
                         );
                         paths.push(path);
-                        rays.push(ray);
+                        rays.push(ray, false);
                     }
                 }
             }
@@ -276,13 +275,20 @@ impl<'a> Renderer<'a> {
             let mut pi = paths.len();
             while pi > 0 {
                 // Test rays against scene
-                let isects = tracer.trace(&rays);
+                let isects = tracer.trace(&mut rays);
                 stats.trace_time += timer.tick() as f64;
 
                 // Determine next rays to shoot based on result
-                pi = partition_pair(&mut paths[..pi], &mut rays[..pi], |i, path, ray| {
-                    path.next(&mut xform_stack, &self.scene, &isects[i], &mut *ray)
-                });
+                let mut new_end = 0;
+                for i in 0..pi {
+                    if paths[i].next(&mut xform_stack, &self.scene, &isects[i], &mut rays, i) {
+                        paths.swap(new_end, i);
+                        rays.swap(new_end, i);
+                        new_end += 1;
+                    }
+                }
+                rays.truncate(new_end);
+                pi = new_end;
                 stats.ray_generation_time += timer.tick() as f64;
             }
 
@@ -431,7 +437,8 @@ impl LightPath {
         xform_stack: &mut TransformStack,
         scene: &Scene,
         isect: &surface::SurfaceIntersection,
-        ray: &mut Ray,
+        rays: &mut RayBatch,
+        ray_idx: usize,
     ) -> bool {
         match self.event {
             //--------------------------------------------------------------------
@@ -496,13 +503,13 @@ impl LightPath {
                             // Distant light
                             SceneLightSample::Distant { direction, .. } => {
                                 let (attenuation, closure_pdf) = closure.evaluate(
-                                    ray.dir,
+                                    rays.dir_world[ray_idx],
                                     direction,
                                     idata.nor,
                                     idata.nor_g,
                                     self.wavelength,
                                 );
-                                let mut shadow_ray = {
+                                let shadow_ray = {
                                     // Calculate the shadow ray for testing if the light is
                                     // in shadow or not.
                                     let offset_pos = robust_ray_origin(
@@ -511,15 +518,14 @@ impl LightPath {
                                         idata.nor_g.normalized(),
                                         direction,
                                     );
-                                    Ray::new(
-                                        offset_pos,
-                                        direction,
-                                        self.time,
-                                        self.wavelength,
-                                        true,
-                                    )
+                                    Ray {
+                                        orig: offset_pos,
+                                        dir: direction,
+                                        time: self.time,
+                                        wavelength: self.wavelength,
+                                        max_t: std::f32::INFINITY,
+                                    }
                                 };
-                                shadow_ray.max_t = std::f32::INFINITY;
                                 (attenuation, closure_pdf, shadow_ray)
                             }
 
@@ -527,7 +533,7 @@ impl LightPath {
                             SceneLightSample::Surface { sample_geo, .. } => {
                                 let dir = sample_geo.0 - idata.pos;
                                 let (attenuation, closure_pdf) = closure.evaluate(
-                                    ray.dir,
+                                    rays.dir_world[ray_idx],
                                     dir,
                                     idata.nor,
                                     idata.nor_g,
@@ -548,13 +554,13 @@ impl LightPath {
                                         sample_geo.1.normalized(),
                                         -dir,
                                     );
-                                    Ray::new(
-                                        offset_pos,
-                                        offset_end - offset_pos,
-                                        self.time,
-                                        self.wavelength,
-                                        true,
-                                    )
+                                    Ray {
+                                        orig: offset_pos,
+                                        dir: offset_end - offset_pos,
+                                        time: self.time,
+                                        wavelength: self.wavelength,
+                                        max_t: 1.0,
+                                    }
                                 };
                                 (attenuation, closure_pdf, shadow_ray)
                             }
@@ -572,7 +578,7 @@ impl LightPath {
                                 light_info.color().e * attenuation.e * self.light_attenuation
                                     / (light_mis_pdf * light_sel_pdf);
 
-                            *ray = shadow_ray;
+                            rays.set_from_ray(&shadow_ray, true, ray_idx);
 
                             true
                         }
@@ -609,8 +615,13 @@ impl LightPath {
                                 idata.nor_g.normalized(),
                                 dir,
                             );
-                            self.next_bounce_ray =
-                                Some(Ray::new(offset_pos, dir, self.time, self.wavelength, false));
+                            self.next_bounce_ray = Some(Ray {
+                                orig: offset_pos,
+                                dir: dir,
+                                time: self.time,
+                                wavelength: self.wavelength,
+                                max_t: std::f32::INFINITY,
+                            });
 
                             true
                         } else {
@@ -626,7 +637,7 @@ impl LightPath {
                         self.event = LightPathEvent::ShadowRay;
                         return true;
                     } else if do_bounce {
-                        *ray = self.next_bounce_ray.unwrap();
+                        rays.set_from_ray(&self.next_bounce_ray.unwrap(), false, ray_idx);
                         self.event = LightPathEvent::BounceRay;
                         self.light_attenuation *= self.next_attenuation_fac;
                         return true;
@@ -657,7 +668,7 @@ impl LightPath {
 
                 // Set up for the next bounce, if any
                 if let Some(ref nbr) = self.next_bounce_ray {
-                    *ray = *nbr;
+                    rays.set_from_ray(nbr, false, ray_idx);
                     self.light_attenuation *= self.next_attenuation_fac;
                     self.event = LightPathEvent::BounceRay;
                     return true;
diff --git a/src/surface/micropoly_batch.rs b/src/surface/micropoly_batch.rs
index 36d686f..8bb9447 100644
--- a/src/surface/micropoly_batch.rs
+++ b/src/surface/micropoly_batch.rs
@@ -8,7 +8,7 @@ use crate::{
     boundable::Boundable,
     lerp::lerp_slice,
     math::{cross, dot, Matrix4x4, Normal, Point},
-    ray::{AccelRay, Ray},
+    ray::{RayBatch, RayStack, RayTask}
     shading::surface_closure::SurfaceClosure,
 };
 
@@ -99,8 +99,8 @@ impl<'a> MicropolyBatch<'a> {
 impl<'a> MicropolyBatch<'a> {
     fn intersect_rays(
         &self,
-        accel_rays: &mut [AccelRay],
-        wrays: &[Ray],
+        rays: &mut RayBatch,
+        ray_stack: &mut RayStack,
         isects: &mut [SurfaceIntersection],
         space: &[Matrix4x4],
     ) {
@@ -112,7 +112,7 @@ impl<'a> MicropolyBatch<'a> {
         };
 
         self.accel
-            .traverse(&mut accel_rays[..], self.indices, |tri_indices, rs| {
+            .traverse(rays, ray_stack, self.indices, |tri_indices, rs| {
                 // For static triangles with static transforms, cache them.
                 let is_cached = self.time_sample_count == 1 && space.len() <= 1;
                 let mut tri = if is_cached {
diff --git a/src/surface/mod.rs b/src/surface/mod.rs
index 9c2b761..2f90223 100644
--- a/src/surface/mod.rs
+++ b/src/surface/mod.rs
@@ -1,6 +1,6 @@
 #![allow(dead_code)]
 
-pub mod micropoly_batch;
+// pub mod micropoly_batch;
 pub mod triangle;
 pub mod triangle_mesh;
 
@@ -9,7 +9,7 @@ use std::fmt::Debug;
 use crate::{
     boundable::Boundable,
     math::{Matrix4x4, Normal, Point, Vector},
-    ray::{AccelRay, Ray},
+    ray::{RayBatch, RayStack},
     shading::surface_closure::SurfaceClosure,
     shading::SurfaceShader,
 };
@@ -17,8 +17,8 @@ use crate::{
 pub trait Surface: Boundable + Debug + Sync {
     fn intersect_rays(
         &self,
-        accel_rays: &mut [AccelRay],
-        wrays: &[Ray],
+        rays: &mut RayBatch,
+        ray_stack: &mut RayStack,
         isects: &mut [SurfaceIntersection],
         shader: &SurfaceShader,
         space: &[Matrix4x4],
diff --git a/src/surface/triangle.rs b/src/surface/triangle.rs
index c252e59..5f0a9f6 100644
--- a/src/surface/triangle.rs
+++ b/src/surface/triangle.rs
@@ -1,6 +1,9 @@
 #![allow(dead_code)]
 
-use crate::{fp_utils::fp_gamma, math::Point, ray::Ray};
+use crate::{
+    fp_utils::fp_gamma,
+    math::{Point, Vector},
+};
 
 /// Intersects `ray` with `tri`, returning `Some((t, b0, b1, b2))`, or `None`
 /// if no intersection.
@@ -13,12 +16,17 @@ use crate::{fp_utils::fp_gamma, math::Point, ray::Ray};
 ///
 /// Uses the ray-triangle test from the paper "Watertight Ray/Triangle
 /// Intersection" by Woop et al.
-pub fn intersect_ray(ray: &Ray, tri: (Point, Point, Point)) -> Option<(f32, f32, f32, f32)> {
+pub fn intersect_ray(
+    ray_orig: Point,
+    ray_dir: Vector,
+    ray_max_t: f32,
+    tri: (Point, Point, Point),
+) -> Option<(f32, f32, f32, f32)> {
     // Calculate the permuted dimension indices for the new ray space.
     let (xi, yi, zi) = {
-        let xabs = ray.dir.x().abs();
-        let yabs = ray.dir.y().abs();
-        let zabs = ray.dir.z().abs();
+        let xabs = ray_dir.x().abs();
+        let yabs = ray_dir.y().abs();
+        let zabs = ray_dir.z().abs();
 
         if xabs > yabs && xabs > zabs {
             (1, 2, 0)
@@ -29,9 +37,9 @@ pub fn intersect_ray(ray: &Ray, tri: (Point, Point, Point)) -> Option<(f32, f32,
         }
     };
 
-    let dir_x = ray.dir.get_n(xi);
-    let dir_y = ray.dir.get_n(yi);
-    let dir_z = ray.dir.get_n(zi);
+    let dir_x = ray_dir.get_n(xi);
+    let dir_y = ray_dir.get_n(yi);
+    let dir_z = ray_dir.get_n(zi);
 
     // Calculate shear constants.
     let sx = dir_x / dir_z;
@@ -39,9 +47,9 @@ pub fn intersect_ray(ray: &Ray, tri: (Point, Point, Point)) -> Option<(f32, f32,
     let sz = 1.0 / dir_z;
 
     // Calculate vertices in ray space.
-    let p0 = tri.0 - ray.orig;
-    let p1 = tri.1 - ray.orig;
-    let p2 = tri.2 - ray.orig;
+    let p0 = tri.0 - ray_orig;
+    let p1 = tri.1 - ray_orig;
+    let p2 = tri.2 - ray_orig;
 
     let p0x = p0.get_n(xi) - (sx * p0.get_n(zi));
     let p0y = p0.get_n(yi) - (sy * p0.get_n(zi));
@@ -80,8 +88,8 @@ pub fn intersect_ray(ray: &Ray, tri: (Point, Point, Point)) -> Option<(f32, f32,
     let t_scaled = (e0 * p0z) + (e1 * p1z) + (e2 * p2z);
 
     // Check if the hitpoint t is within ray min/max t.
-    if (det > 0.0 && (t_scaled <= 0.0 || t_scaled > (ray.max_t * det)))
-        || (det < 0.0 && (t_scaled >= 0.0 || t_scaled < (ray.max_t * det)))
+    if (det > 0.0 && (t_scaled <= 0.0 || t_scaled > (ray_max_t * det)))
+        || (det < 0.0 && (t_scaled >= 0.0 || t_scaled < (ray_max_t * det)))
     {
         return None;
     }
diff --git a/src/surface/triangle_mesh.rs b/src/surface/triangle_mesh.rs
index a067416..38f3ebf 100644
--- a/src/surface/triangle_mesh.rs
+++ b/src/surface/triangle_mesh.rs
@@ -8,7 +8,7 @@ use crate::{
     boundable::Boundable,
     lerp::lerp_slice,
     math::{cross, dot, Matrix4x4, Normal, Point},
-    ray::{AccelRay, Ray},
+    ray::{RayBatch, RayStack},
     shading::SurfaceShader,
 };
 
@@ -117,8 +117,8 @@ impl<'a> Boundable for TriangleMesh<'a> {
 impl<'a> Surface for TriangleMesh<'a> {
     fn intersect_rays(
         &self,
-        accel_rays: &mut [AccelRay],
-        wrays: &[Ray],
+        rays: &mut RayBatch,
+        ray_stack: &mut RayStack,
         isects: &mut [SurfaceIntersection],
         shader: &SurfaceShader,
         space: &[Matrix4x4],
@@ -130,8 +130,11 @@ impl<'a> Surface for TriangleMesh<'a> {
             Matrix4x4::new()
         };
 
-        self.accel
-            .traverse(&mut accel_rays[..], self.indices, |tri_indices, rs| {
+        self.accel.traverse(
+            rays,
+            ray_stack,
+            self.indices,
+            |tri_indices, rays, ray_stack| {
                 // For static triangles with static transforms, cache them.
                 let is_cached = self.time_sample_count == 1 && space.len() <= 1;
                 let mut tri = if is_cached {
@@ -154,8 +157,9 @@ impl<'a> Surface for TriangleMesh<'a> {
                 };
 
                 // Test each ray against the current triangle.
-                for r in rs {
-                    let wr = &wrays[r.id as usize];
+                ray_stack.pop_do_next_task(0, |ray_idx| {
+                    let ray_idx = ray_idx as usize;
+                    let ray_time = rays.time[ray_idx];
 
                     // Get triangle if necessary
                     if !is_cached {
@@ -178,9 +182,9 @@ impl<'a> Surface for TriangleMesh<'a> {
                                 * self.time_sample_count)
                                 ..((tri_indices.2 as usize + 1) * self.time_sample_count)];
 
-                            let p0 = lerp_slice(p0_slice, wr.time);
-                            let p1 = lerp_slice(p1_slice, wr.time);
-                            let p2 = lerp_slice(p2_slice, wr.time);
+                            let p0 = lerp_slice(p0_slice, ray_time);
+                            let p1 = lerp_slice(p1_slice, ray_time);
+                            let p2 = lerp_slice(p2_slice, ray_time);
 
                             (p0, p1, p2)
                         };
@@ -190,7 +194,7 @@ impl<'a> Surface for TriangleMesh<'a> {
                     let mat_space = if !space.is_empty() {
                         if space.len() > 1 {
                             // Per-ray transform, for motion blur
-                            let mat_space = lerp_slice(space, wr.time).inverse();
+                            let mat_space = lerp_slice(space, ray_time).inverse();
                             tri = (tri.0 * mat_space, tri.1 * mat_space, tri.2 * mat_space);
                             mat_space
                         } else {
@@ -210,65 +214,71 @@ impl<'a> Surface for TriangleMesh<'a> {
                     };
 
                     // Test ray against triangle
-                    if let Some((t, b0, b1, b2)) = triangle::intersect_ray(wr, tri) {
-                        if t < r.max_t {
-                            if r.is_occlusion() {
-                                isects[r.id as usize] = SurfaceIntersection::Occlude;
-                                r.mark_done();
-                            } else {
-                                // Calculate intersection point and error magnitudes
-                                let (pos, pos_err) = triangle::surface_point(tri, (b0, b1, b2));
+                    if let Some((t, b0, b1, b2)) = triangle::intersect_ray(
+                        rays.orig_world[ray_idx],
+                        rays.dir_world[ray_idx],
+                        rays.max_t[ray_idx],
+                        tri,
+                    ) {
+                        if rays.is_occlusion(ray_idx) {
+                            isects[ray_idx] = SurfaceIntersection::Occlude;
+                            rays.mark_done(ray_idx);
+                        } else {
+                            // Calculate intersection point and error magnitudes
+                            let (pos, pos_err) = triangle::surface_point(tri, (b0, b1, b2));
 
-                                // Calculate geometric surface normal
-                                let geo_normal = cross(tri.0 - tri.1, tri.0 - tri.2).into_normal();
+                            // Calculate geometric surface normal
+                            let geo_normal = cross(tri.0 - tri.1, tri.0 - tri.2).into_normal();
 
-                                // Calculate interpolated surface normal, if any
-                                let shading_normal = if let Some(normals) = self.normals {
-                                    let n0_slice = &normals[(tri_indices.0 as usize
-                                        * self.time_sample_count)
-                                        ..((tri_indices.0 as usize + 1) * self.time_sample_count)];
-                                    let n1_slice = &normals[(tri_indices.1 as usize
-                                        * self.time_sample_count)
-                                        ..((tri_indices.1 as usize + 1) * self.time_sample_count)];
-                                    let n2_slice = &normals[(tri_indices.2 as usize
-                                        * self.time_sample_count)
-                                        ..((tri_indices.2 as usize + 1) * self.time_sample_count)];
+                            // Calculate interpolated surface normal, if any
+                            let shading_normal = if let Some(normals) = self.normals {
+                                let n0_slice = &normals[(tri_indices.0 as usize
+                                    * self.time_sample_count)
+                                    ..((tri_indices.0 as usize + 1) * self.time_sample_count)];
+                                let n1_slice = &normals[(tri_indices.1 as usize
+                                    * self.time_sample_count)
+                                    ..((tri_indices.1 as usize + 1) * self.time_sample_count)];
+                                let n2_slice = &normals[(tri_indices.2 as usize
+                                    * self.time_sample_count)
+                                    ..((tri_indices.2 as usize + 1) * self.time_sample_count)];
 
-                                    let n0 = lerp_slice(n0_slice, wr.time).normalized();
-                                    let n1 = lerp_slice(n1_slice, wr.time).normalized();
-                                    let n2 = lerp_slice(n2_slice, wr.time).normalized();
+                                let n0 = lerp_slice(n0_slice, ray_time).normalized();
+                                let n1 = lerp_slice(n1_slice, ray_time).normalized();
+                                let n2 = lerp_slice(n2_slice, ray_time).normalized();
 
-                                    let s_nor = ((n0 * b0) + (n1 * b1) + (n2 * b2)) * mat_space;
-                                    if dot(s_nor, geo_normal) >= 0.0 {
-                                        s_nor
-                                    } else {
-                                        -s_nor
-                                    }
+                                let s_nor = ((n0 * b0) + (n1 * b1) + (n2 * b2)) * mat_space;
+                                if dot(s_nor, geo_normal) >= 0.0 {
+                                    s_nor
                                 } else {
-                                    geo_normal
-                                };
+                                    -s_nor
+                                }
+                            } else {
+                                geo_normal
+                            };
 
-                                let intersection_data = SurfaceIntersectionData {
-                                    incoming: wr.dir,
-                                    t: t,
-                                    pos: pos,
-                                    pos_err: pos_err,
-                                    nor: shading_normal,
-                                    nor_g: geo_normal,
-                                    local_space: mat_space,
-                                    sample_pdf: 0.0,
-                                };
+                            let intersection_data = SurfaceIntersectionData {
+                                incoming: rays.dir_world[ray_idx],
+                                t: t,
+                                pos: pos,
+                                pos_err: pos_err,
+                                nor: shading_normal,
+                                nor_g: geo_normal,
+                                local_space: mat_space,
+                                sample_pdf: 0.0,
+                            };
 
-                                // Fill in intersection data
-                                isects[r.id as usize] = SurfaceIntersection::Hit {
-                                    intersection_data: intersection_data,
-                                    closure: shader.shade(&intersection_data, wr.time),
-                                };
-                                r.max_t = t;
-                            }
+                            // Fill in intersection data
+                            isects[ray_idx] = SurfaceIntersection::Hit {
+                                intersection_data: intersection_data,
+                                closure: shader.shade(&intersection_data, ray_time),
+                            };
+                            rays.max_t[ray_idx] = t;
                         }
                     }
-                }
-            });
+
+                    ([0, 0, 0, 0, 0, 0, 0, 0], 0)
+                });
+            },
+        );
     }
 }
diff --git a/src/tracer.rs b/src/tracer.rs
index 4105dfc..3fba96e 100644
--- a/src/tracer.rs
+++ b/src/tracer.rs
@@ -1,10 +1,11 @@
 use std::iter;
 
 use crate::{
-    algorithm::partition,
+    accel::ray_code,
     color::{rec709_to_xyz, Color},
     lerp::lerp_slice,
-    ray::{AccelRay, Ray},
+    math::Matrix4x4,
+    ray::{RayBatch, RayStack},
     scene::{Assembly, InstanceType, Object},
     shading::{SimpleSurfaceShader, SurfaceShader},
     surface::SurfaceIntersection,
@@ -12,14 +13,14 @@ use crate::{
 };
 
 pub struct Tracer<'a> {
-    rays: Vec<AccelRay>,
+    ray_stack: RayStack,
     inner: TracerInner<'a>,
 }
 
 impl<'a> Tracer<'a> {
     pub fn from_assembly(assembly: &'a Assembly) -> Tracer<'a> {
         Tracer {
-            rays: Vec::new(),
+            ray_stack: RayStack::new(),
             inner: TracerInner {
                 root: assembly,
                 xform_stack: TransformStack::new(),
@@ -28,17 +29,8 @@ impl<'a> Tracer<'a> {
         }
     }
 
-    pub fn trace<'b>(&'b mut self, wrays: &[Ray]) -> &'b [SurfaceIntersection] {
-        self.rays.clear();
-        self.rays.reserve(wrays.len());
-        let mut ids = 0..(wrays.len() as u32);
-        self.rays.extend(
-            wrays
-                .iter()
-                .map(|wr| AccelRay::new(wr, ids.next().unwrap())),
-        );
-
-        self.inner.trace(wrays, &mut self.rays[..])
+    pub fn trace<'b>(&'b mut self, rays: &mut RayBatch) -> &'b [SurfaceIntersection] {
+        self.inner.trace(rays, &mut self.ray_stack)
     }
 }
 
@@ -49,16 +41,37 @@ struct TracerInner<'a> {
 }
 
 impl<'a> TracerInner<'a> {
-    fn trace<'b>(&'b mut self, wrays: &[Ray], rays: &mut [AccelRay]) -> &'b [SurfaceIntersection] {
+    fn trace<'b>(
+        &'b mut self,
+        rays: &mut RayBatch,
+        ray_stack: &mut RayStack,
+    ) -> &'b [SurfaceIntersection] {
+        ray_stack.clear();
+
         // Ready the isects
         self.isects.clear();
-        self.isects.reserve(wrays.len());
+        self.isects.reserve(rays.len());
         self.isects
-            .extend(iter::repeat(SurfaceIntersection::Miss).take(wrays.len()));
+            .extend(iter::repeat(SurfaceIntersection::Miss).take(rays.len()));
 
-        let mut ray_sets = split_rays_by_direction(&mut rays[..]);
-        for ray_set in ray_sets.iter_mut().filter(|ray_set| !ray_set.is_empty()) {
-            self.trace_assembly(self.root, wrays, ray_set);
+        // Prep the accel part of the rays.
+        {
+            let ident = Matrix4x4::new();
+            for i in 0..rays.len() {
+                rays.update_accel(i, &ident);
+            }
+        }
+
+        // Divide the rays into 8 different lanes by direction.
+        ray_stack.ensure_lane_count(8);
+        for i in 0..rays.len() {
+            ray_stack.push_ray_index(i, ray_code(rays.dir_world[i]));
+        }
+        ray_stack.push_lanes_to_tasks(&[0, 1, 2, 3, 4, 5, 6, 7]);
+
+        // Trace each of the 8 lanes separately.
+        while !ray_stack.is_empty() {
+            self.trace_assembly(self.root, rays, ray_stack);
         }
 
         &self.isects
@@ -67,82 +80,44 @@ impl<'a> TracerInner<'a> {
     fn trace_assembly<'b>(
         &'b mut self,
         assembly: &Assembly,
-        wrays: &[Ray],
-        accel_rays: &mut [AccelRay],
+        rays: &mut RayBatch,
+        ray_stack: &mut RayStack,
     ) {
-        assembly
-            .object_accel
-            .traverse(&mut accel_rays[..], &assembly.instances[..], |inst, rs| {
+        assembly.object_accel.traverse(
+            rays,
+            ray_stack,
+            &assembly.instances[..],
+            |inst, rays, ray_stack| {
                 // Transform rays if needed
                 if let Some((xstart, xend)) = inst.transform_indices {
                     // Push transforms to stack
                     self.xform_stack.push(&assembly.xforms[xstart..xend]);
 
                     // Do transforms
+                    // TODO: re-divide rays based on direction (maybe?).
                     let xforms = self.xform_stack.top();
-                    for ray in &mut rs[..] {
-                        let id = ray.id;
-                        let t = ray.time;
-                        ray.update_from_xformed_world_ray(
-                            &wrays[id as usize],
-                            &lerp_slice(xforms, t),
-                        );
-                    }
+                    ray_stack.pop_do_next_task(2, |ray_idx| {
+                        let t = rays.time[ray_idx];
+                        rays.update_accel(ray_idx, &lerp_slice(xforms, t));
+                        ([0, 1, 2, 3, 4, 5, 6, 7], 2)
+                    });
+                    ray_stack.push_lanes_to_tasks(&[0, 1]);
                 }
 
                 // Trace rays
-                {
-                    // This is kind of weird looking, but what we're doing here is
-                    // splitting the rays up based on direction if they were
-                    // transformed, and not splitting them up if they weren't
-                    // transformed.
-                    // But to keep the actual tracing code in one place (DRY),
-                    // we map both cases to an array slice that contains slices of
-                    // ray arrays.  Gah... that's confusing even when explained.
-                    // TODO: do this in a way that's less confusing.  Probably split
-                    // the tracing code out into a trace_instance() method or
-                    // something.
-                    let mut tmp = if inst.transform_indices.is_some() {
-                        split_rays_by_direction(rs)
-                    } else {
-                        [
-                            &mut rs[..],
-                            &mut [],
-                            &mut [],
-                            &mut [],
-                            &mut [],
-                            &mut [],
-                            &mut [],
-                            &mut [],
-                        ]
-                    };
-                    let ray_sets = if inst.transform_indices.is_some() {
-                        &mut tmp[..]
-                    } else {
-                        &mut tmp[..1]
-                    };
+                match inst.instance_type {
+                    InstanceType::Object => {
+                        self.trace_object(
+                            &assembly.objects[inst.data_index],
+                            inst.surface_shader_index
+                                .map(|i| assembly.surface_shaders[i]),
+                            rays,
+                            ray_stack,
+                        );
+                    }
 
-                    // Loop through the split ray slices and trace them
-                    for ray_set in ray_sets.iter_mut().filter(|ray_set| !ray_set.is_empty()) {
-                        match inst.instance_type {
-                            InstanceType::Object => {
-                                self.trace_object(
-                                    &assembly.objects[inst.data_index],
-                                    inst.surface_shader_index
-                                        .map(|i| assembly.surface_shaders[i]),
-                                    wrays,
-                                    ray_set,
-                                );
-                            }
-
-                            InstanceType::Assembly => {
-                                self.trace_assembly(
-                                    &assembly.assemblies[inst.data_index],
-                                    wrays,
-                                    ray_set,
-                                );
-                            }
-                        }
+                    InstanceType::Assembly => {
+                        self.trace_assembly(&assembly.assemblies[inst.data_index], rays, ray_stack);
                     }
                 }
 
@@ -154,30 +129,29 @@ impl<'a> TracerInner<'a> {
                     // Undo transforms
                     let xforms = self.xform_stack.top();
                     if !xforms.is_empty() {
-                        for ray in &mut rs[..] {
-                            let id = ray.id;
-                            let t = ray.time;
-                            ray.update_from_xformed_world_ray(
-                                &wrays[id as usize],
-                                &lerp_slice(xforms, t),
-                            );
-                        }
+                        ray_stack.pop_do_next_task(0, |ray_idx| {
+                            let t = rays.time[ray_idx];
+                            rays.update_accel(ray_idx, &lerp_slice(xforms, t));
+                            ([0, 1, 2, 3, 4, 5, 6, 7], 0)
+                        });
                     } else {
-                        for ray in &mut rs[..] {
-                            let id = ray.id;
-                            ray.update_from_world_ray(&wrays[id as usize]);
-                        }
+                        let ident = Matrix4x4::new();
+                        ray_stack.pop_do_next_task(0, |ray_idx| {
+                            rays.update_accel(ray_idx, &ident);
+                            ([0, 1, 2, 3, 4, 5, 6, 7], 0)
+                        });
                     }
                 }
-            });
+            },
+        );
     }
 
     fn trace_object<'b>(
         &'b mut self,
         obj: &Object,
         surface_shader: Option<&SurfaceShader>,
-        wrays: &[Ray],
-        rays: &mut [AccelRay],
+        rays: &mut RayBatch,
+        ray_stack: &mut RayStack,
     ) {
         match *obj {
             Object::Surface(surface) => {
@@ -188,7 +162,7 @@ impl<'a> TracerInner<'a> {
 
                 surface.intersect_rays(
                     rays,
-                    wrays,
+                    ray_stack,
                     &mut self.isects,
                     shader,
                     self.xform_stack.top(),
@@ -203,7 +177,7 @@ impl<'a> TracerInner<'a> {
 
                 surface.intersect_rays(
                     rays,
-                    wrays,
+                    ray_stack,
                     &mut self.isects,
                     &bogus_shader,
                     self.xform_stack.top(),
@@ -212,27 +186,3 @@ impl<'a> TracerInner<'a> {
         }
     }
 }
-
-fn split_rays_by_direction(rays: &mut [AccelRay]) -> [&mut [AccelRay]; 8] {
-    // |   |   |   |   |   |   |   |   |
-    //     s1  s2  s3  s4  s5  s6  s7
-    let s4 = partition(&mut rays[..], |r| r.dir_inv.x() >= 0.0);
-
-    let s2 = partition(&mut rays[..s4], |r| r.dir_inv.y() >= 0.0);
-    let s6 = s4 + partition(&mut rays[s4..], |r| r.dir_inv.y() >= 0.0);
-
-    let s1 = partition(&mut rays[..s2], |r| r.dir_inv.z() >= 0.0);
-    let s3 = s2 + partition(&mut rays[s2..s4], |r| r.dir_inv.z() >= 0.0);
-    let s5 = s4 + partition(&mut rays[s4..s6], |r| r.dir_inv.z() >= 0.0);
-    let s7 = s6 + partition(&mut rays[s6..], |r| r.dir_inv.z() >= 0.0);
-
-    let (rest, rs7) = rays.split_at_mut(s7);
-    let (rest, rs6) = rest.split_at_mut(s6);
-    let (rest, rs5) = rest.split_at_mut(s5);
-    let (rest, rs4) = rest.split_at_mut(s4);
-    let (rest, rs3) = rest.split_at_mut(s3);
-    let (rest, rs2) = rest.split_at_mut(s2);
-    let (rs0, rs1) = rest.split_at_mut(s1);
-
-    [rs0, rs1, rs2, rs3, rs4, rs5, rs6, rs7]
-}

From eef29c2b2f7739434433eece6a04c8f5ee0cb7ae Mon Sep 17 00:00:00 2001
From: Nathan Vegdahl <cessen@cessen.com>
Date: Sun, 23 Jun 2019 19:26:30 +0900
Subject: [PATCH 04/20] Type alias for the ray index type.

Makes it easier to experiment with the width of the ray index type.
---
 src/accel/bvh4.rs | 2 +-
 src/ray.rs        | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/accel/bvh4.rs b/src/accel/bvh4.rs
index c739d38..86b2389 100644
--- a/src/accel/bvh4.rs
+++ b/src/accel/bvh4.rs
@@ -206,7 +206,7 @@ impl<'a> BVH4<'a> {
                             ([0, 1, 2, 3, 4, 5, 6, 7], 0)
                         }
                     });
-                    
+
                     trav_time += timer.tick() as f64;
 
                     if hit_count > 0 {
diff --git a/src/ray.rs b/src/ray.rs
index 1881dba..585b0cc 100644
--- a/src/ray.rs
+++ b/src/ray.rs
@@ -4,6 +4,7 @@ use float4::Float4;
 
 use crate::math::{Matrix4x4, Point, Vector};
 
+type RayIndexType = u16;
 type FlagType = u8;
 const OCCLUSION_FLAG: FlagType = 1;
 const DONE_FLAG: FlagType = 1 << 1;
@@ -229,7 +230,7 @@ impl RayStack {
     /// Pushes the given ray index onto the end of the specified lane.
     pub fn push_ray_index(&mut self, ray_idx: usize, lane: usize) {
         assert!(self.lanes.len() > lane);
-        self.lanes[lane].idxs.push(ray_idx as u16);
+        self.lanes[lane].idxs.push(ray_idx as RayIndexType);
     }
 
     /// Takes the given list of lane indices, and pushes any excess indices on
@@ -282,7 +283,7 @@ impl RayStack {
 /// A lane within a RayStack.
 #[derive(Debug)]
 struct Lane {
-    idxs: Vec<u16>,
+    idxs: Vec<RayIndexType>,
     end_len: usize,
 }
 

From 5dd8eb919bf5eb3b06083cff26d5a8314627d8f2 Mon Sep 17 00:00:00 2001
From: Nathan Vegdahl <cessen@cessen.com>
Date: Tue, 25 Jun 2019 17:31:51 +0900
Subject: [PATCH 05/20] Changed ray batch data access to be through methods.

This is (potentially) just temporary.  It makes it easier to experiment
with the data layout and see how that affects performance.
---
 src/accel/bvh4.rs            |  18 +++---
 src/light/rectangle_light.rs |  12 ++--
 src/light/sphere_light.rs    |  22 ++++----
 src/ray.rs                   | 105 +++++++++++++++++++++++++----------
 src/renderer.rs              |   4 +-
 src/surface/triangle_mesh.rs |  12 ++--
 src/tracer.rs                |  14 ++---
 7 files changed, 117 insertions(+), 70 deletions(-)

diff --git a/src/accel/bvh4.rs b/src/accel/bvh4.rs
index 86b2389..b21a5a2 100644
--- a/src/accel/bvh4.rs
+++ b/src/accel/bvh4.rs
@@ -95,7 +95,7 @@ impl<'a> BVH4<'a> {
         let mut node_tests: u64 = 0;
 
         let traversal_table =
-            &TRAVERSAL_TABLE[ray_code(rays.dir_inv_accel[ray_stack.next_task_ray_idx(0)])];
+            &TRAVERSAL_TABLE[ray_code(rays.dir_inv_local(ray_stack.next_task_ray_idx(0)))];
 
         // +2 of max depth for root and last child
         let mut node_stack = [self.root.unwrap(); (BVH_MAX_DEPTH * 3) + 2];
@@ -117,10 +117,10 @@ impl<'a> BVH4<'a> {
                     let mut hit_count = 0;
                     ray_stack.pop_do_next_task(children.len(), |ray_idx| {
                         let hit = (!rays.is_done(ray_idx))
-                            && lerp_slice(bounds, rays.time[ray_idx]).intersect_ray(
-                                rays.orig_accel[ray_idx],
-                                rays.dir_inv_accel[ray_idx],
-                                rays.max_t[ray_idx],
+                            && lerp_slice(bounds, rays.time(ray_idx)).intersect_ray(
+                                rays.orig_local(ray_idx),
+                                rays.dir_inv_local(ray_idx),
+                                rays.max_t(ray_idx),
                             );
 
                         if hit {
@@ -194,10 +194,10 @@ impl<'a> BVH4<'a> {
 
                     ray_stack.pop_do_next_task(object_count, |ray_idx| {
                         let hit = (!rays.is_done(ray_idx))
-                            && lerp_slice(bounds, rays.time[ray_idx]).intersect_ray(
-                                rays.orig_accel[ray_idx],
-                                rays.dir_inv_accel[ray_idx],
-                                rays.max_t[ray_idx],
+                            && lerp_slice(bounds, rays.time(ray_idx)).intersect_ray(
+                                rays.orig_local(ray_idx),
+                                rays.dir_inv_local(ray_idx),
+                                rays.max_t(ray_idx),
                             );
                         if hit {
                             hit_count += 1;
diff --git a/src/light/rectangle_light.rs b/src/light/rectangle_light.rs
index 4711c36..f7af205 100644
--- a/src/light/rectangle_light.rs
+++ b/src/light/rectangle_light.rs
@@ -266,10 +266,10 @@ impl<'a> Surface for RectangleLight<'a> {
         let _ = shader; // Silence 'unused' warning
 
         ray_stack.pop_do_next_task(0, |ray_idx| {
-            let time = rays.time[ray_idx];
-            let orig = rays.orig_world[ray_idx];
-            let dir = rays.dir_world[ray_idx];
-            let max_t = rays.max_t[ray_idx];
+            let time = rays.time(ray_idx);
+            let orig = rays.orig(ray_idx);
+            let dir = rays.dir(ray_idx);
+            let max_t = rays.max_t(ray_idx);
 
             // Calculate time interpolated values
             let dim = lerp_slice(self.dimensions, time);
@@ -307,7 +307,7 @@ impl<'a> Surface for RectangleLight<'a> {
                                     orig,
                                     dir,
                                     pos,
-                                    rays.wavelength[ray_idx],
+                                    rays.wavelength(ray_idx),
                                     time,
                                 ),
                             };
@@ -325,7 +325,7 @@ impl<'a> Surface for RectangleLight<'a> {
                             };
 
                             // Set ray's max t
-                            rays.max_t[ray_idx] = t;
+                            rays.set_max_t(ray_idx, t);
                         }
 
                         break;
diff --git a/src/light/sphere_light.rs b/src/light/sphere_light.rs
index 944baa8..ace32f7 100644
--- a/src/light/sphere_light.rs
+++ b/src/light/sphere_light.rs
@@ -215,7 +215,7 @@ impl<'a> Surface for SphereLight<'a> {
         let _ = shader; // Silence 'unused' warning
 
         ray_stack.pop_do_next_task(0, |ray_idx| {
-            let time = rays.time[ray_idx];
+            let time = rays.time(ray_idx);
 
             // Get the transform space
             let xform = lerp_slice(space, time);
@@ -224,8 +224,8 @@ impl<'a> Surface for SphereLight<'a> {
             let radius = lerp_slice(self.radii, time); // Radius of the sphere
 
             // Get the ray origin and direction in local space
-            let orig = rays.orig_accel[ray_idx].into_vector();
-            let dir = rays.dir_world[ray_idx] * xform;
+            let orig = rays.orig(ray_idx).into_vector();
+            let dir = rays.dir(ray_idx) * xform;
 
             // Code adapted to Rust from https://github.com/Tecla/Rayito
             // Ray-sphere intersection can result in either zero, one or two points
@@ -257,7 +257,7 @@ impl<'a> Surface for SphereLight<'a> {
 
             // Get our final parametric values
             let mut t0 = q / a;
-            let mut t1 = if q != 0.0 { c / q } else { rays.max_t[ray_idx] };
+            let mut t1 = if q != 0.0 { c / q } else { rays.max_t(ray_idx) };
 
             // Swap them so they are ordered right
             if t0 > t1 {
@@ -266,14 +266,14 @@ impl<'a> Surface for SphereLight<'a> {
             }
 
             // Check our intersection for validity against this ray's extents
-            if t0 > rays.max_t[ray_idx] || t1 <= 0.0 {
+            if t0 > rays.max_t(ray_idx) || t1 <= 0.0 {
                 // Didn't hit because sphere is entirely outside of ray's extents
                 return ([0, 0, 0, 0, 0, 0, 0, 0], 0);
             }
 
             let t = if t0 > 0.0 {
                 t0
-            } else if t1 <= rays.max_t[ray_idx] {
+            } else if t1 <= rays.max_t(ray_idx) {
                 t1
             } else {
                 // Didn't hit because ray is entirely within the sphere, and
@@ -300,7 +300,7 @@ impl<'a> Surface for SphereLight<'a> {
                 let normal = unit_pos.into_normal() * inv_xform;
 
                 let intersection_data = SurfaceIntersectionData {
-                    incoming: rays.dir_world[ray_idx],
+                    incoming: rays.dir(ray_idx),
                     t: t,
                     pos: pos,
                     pos_err: pos_err,
@@ -309,11 +309,11 @@ impl<'a> Surface for SphereLight<'a> {
                     local_space: xform,
                     sample_pdf: self.sample_pdf(
                         &xform,
-                        rays.orig_world[ray_idx],
-                        rays.dir_world[ray_idx],
+                        rays.orig(ray_idx),
+                        rays.dir(ray_idx),
                         0.0,
                         0.0,
-                        rays.wavelength[ray_idx],
+                        rays.wavelength(ray_idx),
                         time,
                     ),
                 };
@@ -332,7 +332,7 @@ impl<'a> Surface for SphereLight<'a> {
                 };
 
                 // Set ray's max t
-                rays.max_t[ray_idx] = t;
+                rays.set_max_t(ray_idx, t);
             }
 
             ([0, 0, 0, 0, 0, 0, 0, 0], 0)
diff --git a/src/ray.rs b/src/ray.rs
index 585b0cc..214b0eb 100644
--- a/src/ray.rs
+++ b/src/ray.rs
@@ -23,14 +23,14 @@ pub struct Ray {
 /// A batch of rays, stored in SoA layout.
 #[derive(Debug)]
 pub struct RayBatch {
-    pub orig_world: Vec<Point>,
-    pub dir_world: Vec<Vector>,
-    pub orig_accel: Vec<Point>,
-    pub dir_inv_accel: Vec<Vector>,
-    pub max_t: Vec<f32>,
-    pub time: Vec<f32>,
-    pub wavelength: Vec<f32>,
-    pub flags: Vec<FlagType>,
+    orig_world: Vec<Point>,
+    dir_world: Vec<Vector>,
+    orig_accel: Vec<Point>,
+    dir_inv_accel: Vec<Vector>,
+    max_t: Vec<f32>,
+    time: Vec<f32>,
+    wavelength: Vec<f32>,
+    flags: Vec<FlagType>,
 }
 
 impl RayBatch {
@@ -135,37 +135,84 @@ impl RayBatch {
         self.orig_world.len()
     }
 
-    /// Returns whether the given ray (at index `idx`) is an occlusion ray.
-    pub fn is_occlusion(&self, idx: usize) -> bool {
-        (self.flags[idx] & OCCLUSION_FLAG) != 0
-    }
-
-    /// Returns whether the given ray (at index `idx`) has finished traversal.
-    pub fn is_done(&self, idx: usize) -> bool {
-        (self.flags[idx] & DONE_FLAG) != 0
-    }
-
-    /// Marks the given ray (at index `idx`) as an occlusion ray.
-    pub fn mark_occlusion(&mut self, idx: usize) {
-        self.flags[idx] |= OCCLUSION_FLAG
-    }
-
-    /// Marks the given ray (at index `idx`) as having finished traversal.
-    pub fn mark_done(&mut self, idx: usize) {
-        self.flags[idx] |= DONE_FLAG
-    }
-
     /// Updates the accel data of the given ray (at index `idx`) with the
     /// given world-to-local-space transform matrix.
     ///
     /// This should be called when entering (and exiting) traversal of a
     /// new transform space.
-    pub fn update_accel(&mut self, idx: usize, xform: &Matrix4x4) {
+    pub fn update_local(&mut self, idx: usize, xform: &Matrix4x4) {
         self.orig_accel[idx] = self.orig_world[idx] * *xform;
         self.dir_inv_accel[idx] = Vector {
             co: Float4::splat(1.0) / (self.dir_world[idx] * *xform).co,
         };
     }
+
+    //==========================================================
+    // Data access
+
+    #[inline(always)]
+    pub fn orig(&self, idx: usize) -> Point {
+        self.orig_world[idx]
+    }
+
+    #[inline(always)]
+    pub fn dir(&self, idx: usize) -> Vector {
+        self.dir_world[idx]
+    }
+
+    #[inline(always)]
+    pub fn orig_local(&self, idx: usize) -> Point {
+        self.orig_accel[idx]
+    }
+
+    #[inline(always)]
+    pub fn dir_inv_local(&self, idx: usize) -> Vector {
+        self.dir_inv_accel[idx]
+    }
+
+    #[inline(always)]
+    pub fn time(&self, idx: usize) -> f32 {
+        self.time[idx]
+    }
+
+    #[inline(always)]
+    pub fn max_t(&self, idx: usize) -> f32 {
+        self.max_t[idx]
+    }
+
+    #[inline(always)]
+    pub fn set_max_t(&mut self, idx: usize, new_max_t: f32) {
+        self.max_t[idx] = new_max_t;
+    }
+
+    #[inline(always)]
+    pub fn wavelength(&self, idx: usize) -> f32 {
+        self.wavelength[idx]
+    }
+
+    /// Returns whether the given ray (at index `idx`) is an occlusion ray.
+    #[inline(always)]
+    pub fn is_occlusion(&self, idx: usize) -> bool {
+        (self.flags[idx] & OCCLUSION_FLAG) != 0
+    }
+
+    /// Returns whether the given ray (at index `idx`) has finished traversal.
+    #[inline(always)]
+    pub fn is_done(&self, idx: usize) -> bool {
+        (self.flags[idx] & DONE_FLAG) != 0
+    }
+
+    /// Marks the given ray (at index `idx`) as an occlusion ray.
+    #[inline(always)]
+    pub fn mark_occlusion(&mut self, idx: usize) {
+        self.flags[idx] |= OCCLUSION_FLAG
+    }
+
+    /// Marks the given ray (at index `idx`) as having finished traversal.
+    #[inline(always)]
+    pub fn mark_done(&mut self, idx: usize) {
+        self.flags[idx] |= DONE_FLAG
+    }
 }
 
 /// A structure used for tracking traversal of a ray batch through a scene.
diff --git a/src/renderer.rs b/src/renderer.rs
index 6f3fe80..2c3077d 100644
--- a/src/renderer.rs
+++ b/src/renderer.rs
@@ -503,7 +503,7 @@ impl LightPath {
                             // Distant light
                             SceneLightSample::Distant { direction, .. } => {
                                 let (attenuation, closure_pdf) = closure.evaluate(
-                                    rays.dir_world[ray_idx],
+                                    rays.dir(ray_idx),
                                     direction,
                                     idata.nor,
                                     idata.nor_g,
@@ -533,7 +533,7 @@ impl LightPath {
                             SceneLightSample::Surface { sample_geo, .. } => {
                                 let dir = sample_geo.0 - idata.pos;
                                 let (attenuation, closure_pdf) = closure.evaluate(
-                                    rays.dir_world[ray_idx],
+                                    rays.dir(ray_idx),
                                     dir,
                                     idata.nor,
                                     idata.nor_g,
diff --git a/src/surface/triangle_mesh.rs b/src/surface/triangle_mesh.rs
index 38f3ebf..a1f8cb6 100644
--- a/src/surface/triangle_mesh.rs
+++ b/src/surface/triangle_mesh.rs
@@ -159,7 +159,7 @@ impl<'a> Surface for TriangleMesh<'a> {
                 // Test each ray against the current triangle.
                 ray_stack.pop_do_next_task(0, |ray_idx| {
                     let ray_idx = ray_idx as usize;
-                    let ray_time = rays.time[ray_idx];
+                    let ray_time = rays.time(ray_idx);
 
                     // Get triangle if necessary
                     if !is_cached {
@@ -215,9 +215,9 @@ impl<'a> Surface for TriangleMesh<'a> {
 
                     // Test ray against triangle
                     if let Some((t, b0, b1, b2)) = triangle::intersect_ray(
-                        rays.orig_world[ray_idx],
-                        rays.dir_world[ray_idx],
-                        rays.max_t[ray_idx],
+                        rays.orig(ray_idx),
+                        rays.dir(ray_idx),
+                        rays.max_t(ray_idx),
                         tri,
                     ) {
                         if rays.is_occlusion(ray_idx) {
@@ -257,7 +257,7 @@ impl<'a> Surface for TriangleMesh<'a> {
                             };
 
                             let intersection_data = SurfaceIntersectionData {
-                                incoming: rays.dir_world[ray_idx],
+                                incoming: rays.dir(ray_idx),
                                 t: t,
                                 pos: pos,
                                 pos_err: pos_err,
@@ -272,7 +272,7 @@ impl<'a> Surface for TriangleMesh<'a> {
                                 intersection_data: intersection_data,
                                 closure: shader.shade(&intersection_data, ray_time),
                             };
-                            rays.max_t[ray_idx] = t;
+                            rays.set_max_t(ray_idx, t);
                         }
                     }
 
diff --git a/src/tracer.rs b/src/tracer.rs
index 3fba96e..d11d0bb 100644
--- a/src/tracer.rs
+++ b/src/tracer.rs
@@ -58,14 +58,14 @@ impl<'a> TracerInner<'a> {
         {
             let ident = Matrix4x4::new();
             for i in 0..rays.len() {
-                rays.update_accel(i, &ident);
+                rays.update_local(i, &ident);
             }
         }
 
         // Divide the rays into 8 different lanes by direction.
         ray_stack.ensure_lane_count(8);
         for i in 0..rays.len() {
-            ray_stack.push_ray_index(i, ray_code(rays.dir_world[i]));
+            ray_stack.push_ray_index(i, ray_code(rays.dir(i)));
         }
         ray_stack.push_lanes_to_tasks(&[0, 1, 2, 3, 4, 5, 6, 7]);
 
@@ -97,8 +97,8 @@ impl<'a> TracerInner<'a> {
                     // TODO: re-divide rays based on direction (maybe?).
                     let xforms = self.xform_stack.top();
                     ray_stack.pop_do_next_task(2, |ray_idx| {
-                        let t = rays.time[ray_idx];
-                        rays.update_accel(ray_idx, &lerp_slice(xforms, t));
+                        let t = rays.time(ray_idx);
+                        rays.update_local(ray_idx, &lerp_slice(xforms, t));
                         ([0, 1, 2, 3, 4, 5, 6, 7], 2)
                     });
                     ray_stack.push_lanes_to_tasks(&[0, 1]);
@@ -130,14 +130,14 @@ impl<'a> TracerInner<'a> {
                     let xforms = self.xform_stack.top();
                     if !xforms.is_empty() {
                         ray_stack.pop_do_next_task(0, |ray_idx| {
-                            let t = rays.time[ray_idx];
-                            rays.update_accel(ray_idx, &lerp_slice(xforms, t));
+                            let t = rays.time(ray_idx);
+                            rays.update_local(ray_idx, &lerp_slice(xforms, t));
                             ([0, 1, 2, 3, 4, 5, 6, 7], 0)
                         });
                     } else {
                         let ident = Matrix4x4::new();
                         ray_stack.pop_do_next_task(0, |ray_idx| {
-                            rays.update_accel(ray_idx, &ident);
+                            rays.update_local(ray_idx, &ident);
                             ([0, 1, 2, 3, 4, 5, 6, 7], 0)
                         });
                     }

From 50f9fd851bf24d049c1df2cfd23049473f803449 Mon Sep 17 00:00:00 2001
From: Nathan Vegdahl <cessen@cessen.com>
Date: Tue, 25 Jun 2019 18:49:10 +0900
Subject: [PATCH 06/20] Improved ray batch data layout.

Gives a small performance boost.
---
 src/ray.rs | 159 +++++++++++++++++++++++------------------------------
 1 file changed, 69 insertions(+), 90 deletions(-)

diff --git a/src/ray.rs b/src/ray.rs
index 214b0eb..3b485f0 100644
--- a/src/ray.rs
+++ b/src/ray.rs
@@ -20,31 +20,37 @@ pub struct Ray {
     pub max_t: f32,
 }
 
-/// A batch of rays, stored in SoA layout.
+/// The hot (frequently accessed) parts of ray data.
+#[derive(Debug, Copy, Clone)]
+struct RayHot {
+    orig_local: Point,     // Local-space ray origin
+    dir_inv_local: Vector, // Local-space 1.0/ray direction
+    max_t: f32,
+    time: f32,
+    flags: FlagType,
+}
+
+/// The cold (infrequently accessed) parts of ray data.
+#[derive(Debug, Copy, Clone)]
+struct RayCold {
+    orig: Point, // World-space ray origin
+    dir: Vector, // World-space ray direction
+    wavelength: f32,
+}
+
+/// A batch of rays, separated into hot and cold parts.
 #[derive(Debug)]
 pub struct RayBatch {
-    orig_world: Vec<Point>,
-    dir_world: Vec<Vector>,
-    orig_accel: Vec<Point>,
-    dir_inv_accel: Vec<Vector>,
-    max_t: Vec<f32>,
-    time: Vec<f32>,
-    wavelength: Vec<f32>,
-    flags: Vec<FlagType>,
+    hot: Vec<RayHot>,
+    cold: Vec<RayCold>,
 }
 
 impl RayBatch {
     /// Creates a new empty ray batch.
     pub fn new() -> RayBatch {
         RayBatch {
-            orig_world: Vec::new(),
-            dir_world: Vec::new(),
-            orig_accel: Vec::new(),
-            dir_inv_accel: Vec::new(),
-            max_t: Vec::new(),
-            time: Vec::new(),
-            wavelength: Vec::new(),
-            flags: Vec::new(),
+            hot: Vec::new(),
+            cold: Vec::new(),
         }
     }
 
@@ -52,87 +58,60 @@ impl RayBatch {
     /// `n` rays.
     pub fn with_capacity(n: usize) -> RayBatch {
         RayBatch {
-            orig_world: Vec::with_capacity(n),
-            dir_world: Vec::with_capacity(n),
-            orig_accel: Vec::with_capacity(n),
-            dir_inv_accel: Vec::with_capacity(n),
-            max_t: Vec::with_capacity(n),
-            time: Vec::with_capacity(n),
-            wavelength: Vec::with_capacity(n),
-            flags: Vec::with_capacity(n),
+            hot: Vec::with_capacity(n),
+            cold: Vec::with_capacity(n),
         }
     }
 
     pub fn push(&mut self, ray: Ray, is_occlusion: bool) {
-        self.orig_world.push(ray.orig);
-        self.dir_world.push(ray.dir);
-        self.orig_accel.push(ray.orig); // Bogus, to place-hold.
-        self.dir_inv_accel.push(ray.dir); // Bogus, to place-hold.
-        self.time.push(ray.time);
-        self.wavelength.push(ray.wavelength);
-        if is_occlusion {
-            self.max_t.push(1.0);
-            self.flags.push(OCCLUSION_FLAG);
-        } else {
-            self.max_t.push(std::f32::INFINITY);
-            self.flags.push(0);
-        }
+        self.hot.push(RayHot {
+            orig_local: ray.orig,   // Bogus, to place-hold.
+            dir_inv_local: ray.dir, // Bogus, to place-hold.
+            max_t: ray.max_t,
+            time: ray.time,
+            flags: if is_occlusion { OCCLUSION_FLAG } else { 0 },
+        });
+        self.cold.push(RayCold {
+            orig: ray.orig,
+            dir: ray.dir,
+            wavelength: ray.wavelength,
+        });
     }
 
     pub fn swap(&mut self, a: usize, b: usize) {
-        if a != b {
-            self.orig_world.swap(a, b);
-            self.dir_world.swap(a, b);
-            self.orig_accel.swap(a, b);
-            self.dir_inv_accel.swap(a, b);
-            self.max_t.swap(a, b);
-            self.time.swap(a, b);
-            self.wavelength.swap(a, b);
-            self.flags.swap(a, b);
-        }
+        self.hot.swap(a, b);
+        self.cold.swap(a, b);
     }
 
-    pub fn set_from_ray(&mut self, ray: &Ray, is_shadow: bool, idx: usize) {
-        self.orig_world[idx] = ray.orig;
-        self.dir_world[idx] = ray.dir;
-        self.orig_accel[idx] = ray.orig;
-        self.dir_inv_accel[idx] = Vector {
+    pub fn set_from_ray(&mut self, ray: &Ray, is_occlusion: bool, idx: usize) {
+        self.hot[idx].orig_local = ray.orig;
+        self.hot[idx].dir_inv_local = Vector {
             co: Float4::splat(1.0) / ray.dir.co,
         };
-        self.max_t[idx] = ray.max_t;
-        self.time[idx] = ray.time;
-        self.wavelength[idx] = ray.wavelength;
-        self.time[idx] = ray.time;
-        self.flags[idx] = if is_shadow { OCCLUSION_FLAG } else { 0 };
+        self.hot[idx].max_t = ray.max_t;
+        self.hot[idx].time = ray.time;
+        self.hot[idx].flags = if is_occlusion { OCCLUSION_FLAG } else { 0 };
+
+        self.cold[idx].orig = ray.orig;
+        self.cold[idx].dir = ray.dir;
+        self.cold[idx].wavelength = ray.wavelength;
     }
 
     pub fn truncate(&mut self, len: usize) {
-        self.orig_world.truncate(len);
-        self.dir_world.truncate(len);
-        self.orig_accel.truncate(len);
-        self.dir_inv_accel.truncate(len);
-        self.max_t.truncate(len);
-        self.time.truncate(len);
-        self.wavelength.truncate(len);
-        self.flags.truncate(len);
+        self.hot.truncate(len);
+        self.cold.truncate(len);
     }
 
     /// Clear all rays, settings the size of the batch back to zero.
     ///
     /// Capacity is maintained.
     pub fn clear(&mut self) {
-        self.orig_world.clear();
-        self.dir_world.clear();
-        self.orig_accel.clear();
-        self.dir_inv_accel.clear();
-        self.max_t.clear();
-        self.time.clear();
-        self.wavelength.clear();
-        self.flags.clear();
+        self.hot.clear();
+        self.cold.clear();
     }
 
     pub fn len(&self) -> usize {
-        self.orig_world.len()
+        self.hot.len()
     }
 
     /// Updates the accel data of the given ray (at index `idx`) with the
@@ -141,9 +120,9 @@ impl RayBatch {
     /// This should be called when entering (and exiting) traversal of a
     /// new transform space.
     pub fn update_local(&mut self, idx: usize, xform: &Matrix4x4) {
-        self.orig_accel[idx] = self.orig_world[idx] * *xform;
-        self.dir_inv_accel[idx] = Vector {
-            co: Float4::splat(1.0) / (self.dir_world[idx] * *xform).co,
+        self.hot[idx].orig_local = self.cold[idx].orig * *xform;
+        self.hot[idx].dir_inv_local = Vector {
+            co: Float4::splat(1.0) / (self.cold[idx].dir * *xform).co,
         };
     }
 
@@ -152,66 +131,66 @@ impl RayBatch {
 
     #[inline(always)]
     pub fn orig(&self, idx: usize) -> Point {
-        self.orig_world[idx]
+        self.cold[idx].orig
     }
 
     #[inline(always)]
     pub fn dir(&self, idx: usize) -> Vector {
-        self.dir_world[idx]
+        self.cold[idx].dir
     }
 
     #[inline(always)]
     pub fn orig_local(&self, idx: usize) -> Point {
-        self.orig_accel[idx]
+        self.hot[idx].orig_local
     }
 
     #[inline(always)]
     pub fn dir_inv_local(&self, idx: usize) -> Vector {
-        self.dir_inv_accel[idx]
+        self.hot[idx].dir_inv_local
     }
 
     #[inline(always)]
     pub fn time(&self, idx: usize) -> f32 {
-        self.time[idx]
+        self.hot[idx].time
     }
 
     #[inline(always)]
     pub fn max_t(&self, idx: usize) -> f32 {
-        self.max_t[idx]
+        self.hot[idx].max_t
     }
 
     #[inline(always)]
     pub fn set_max_t(&mut self, idx: usize, new_max_t: f32) {
-        self.max_t[idx] = new_max_t;
+        self.hot[idx].max_t = new_max_t;
     }
 
     #[inline(always)]
     pub fn wavelength(&self, idx: usize) -> f32 {
-        self.wavelength[idx]
+        self.cold[idx].wavelength
     }
 
     /// Returns whether the given ray (at index `idx`) is an occlusion ray.
     #[inline(always)]
     pub fn is_occlusion(&self, idx: usize) -> bool {
-        (self.flags[idx] & OCCLUSION_FLAG) != 0
+        (self.hot[idx].flags & OCCLUSION_FLAG) != 0
     }
 
     /// Returns whether the given ray (at index `idx`) has finished traversal.
     #[inline(always)]
     pub fn is_done(&self, idx: usize) -> bool {
-        (self.flags[idx] & DONE_FLAG) != 0
+        (self.hot[idx].flags & DONE_FLAG) != 0
     }
 
     /// Marks the given ray (at index `idx`) as an occlusion ray.
     #[inline(always)]
     pub fn mark_occlusion(&mut self, idx: usize) {
-        self.flags[idx] |= OCCLUSION_FLAG
+        self.hot[idx].flags |= OCCLUSION_FLAG
     }
 
     /// Marks the given ray (at index `idx`) as having finished traversal.
     #[inline(always)]
     pub fn mark_done(&mut self, idx: usize) {
-        self.flags[idx] |= DONE_FLAG
+        self.hot[idx].flags |= DONE_FLAG
     }
 }
 

From aed0f2ede1cedb611bcffbcd4708e847dea05075 Mon Sep 17 00:00:00 2001
From: Nathan Vegdahl <cessen@cessen.com>
Date: Fri, 28 Jun 2019 21:57:29 +0900
Subject: [PATCH 07/20] Implemented a SIMD version of the BVH4.

It does indeed appear to be faster with this style of traversal!
---
 src/accel/bvh4_simd.rs | 387 +++++++++++++++++++++++++++++++++++++++++
 src/accel/mod.rs       |   3 +-
 src/bbox4.rs           | 139 +++++++++++++++
 src/main.rs            |   1 +
 src/ray.rs             |  19 ++
 5 files changed, 548 insertions(+), 1 deletion(-)
 create mode 100644 src/accel/bvh4_simd.rs
 create mode 100644 src/bbox4.rs

diff --git a/src/accel/bvh4_simd.rs b/src/accel/bvh4_simd.rs
new file mode 100644
index 0000000..06a9f15
--- /dev/null
+++ b/src/accel/bvh4_simd.rs
@@ -0,0 +1,387 @@
+//! This BVH4 implementation pulls a lot of ideas from the paper
+//! "Efficient Ray Tracing Kernels for Modern CPU Architectures"
+//! by Fuetterling et al.
+//!
+//! Specifically, the table-based traversal order approach they
+//! propose is largely followed by this implementation.
+
+#![allow(dead_code)]
+
+use mem_arena::MemArena;
+
+use crate::{
+    bbox::BBox,
+    bbox4::BBox4,
+    boundable::Boundable,
+    lerp::lerp_slice,
+    math::Vector,
+    ray::{RayBatch, RayStack},
+    timer::Timer,
+};
+
+use super::{
+    bvh_base::{BVHBase, BVHBaseNode, BVH_MAX_DEPTH},
+    ACCEL_NODE_RAY_TESTS, ACCEL_TRAV_TIME,
+};
+
+use bvh_order::{calc_traversal_code, SplitAxes, TRAVERSAL_TABLE};
+
+pub fn ray_code(dir: Vector) -> usize {
+    let ray_sign_is_neg = [dir.x() < 0.0, dir.y() < 0.0, dir.z() < 0.0];
+    ray_sign_is_neg[0] as usize
+        + ((ray_sign_is_neg[1] as usize) << 1)
+        + ((ray_sign_is_neg[2] as usize) << 2)
+}
+
+#[derive(Copy, Clone, Debug)]
+pub struct BVH4<'a> {
+    root: Option<&'a BVH4Node<'a>>,
+    depth: usize,
+    node_count: usize,
+    _bounds: Option<&'a [BBox]>,
+}
+
+#[derive(Copy, Clone, Debug)]
+pub enum BVH4Node<'a> {
+    Internal {
+        bounds: &'a [BBox4],
+        children: &'a [BVH4Node<'a>],
+        traversal_code: u8,
+    },
+
+    Leaf {
+        object_range: (usize, usize),
+    },
+}
+
+impl<'a> BVH4<'a> {
+    pub fn from_objects<'b, T, F>(
+        arena: &'a MemArena,
+        objects: &mut [T],
+        objects_per_leaf: usize,
+        bounder: F,
+    ) -> BVH4<'a>
+    where
+        F: 'b + Fn(&T) -> &'b [BBox],
+    {
+        if objects.len() == 0 {
+            BVH4 {
+                root: None,
+                depth: 0,
+                node_count: 0,
+                _bounds: None,
+            }
+        } else {
+            let base = BVHBase::from_objects(objects, objects_per_leaf, bounder);
+
+            let fill_node = unsafe { arena.alloc_uninitialized_with_alignment::<BVH4Node>(32) };
+            let node_count = BVH4::construct_from_base(
+                arena,
+                &base,
+                &base.nodes[base.root_node_index()],
+                fill_node,
+            );
+
+            BVH4 {
+                root: Some(fill_node),
+                depth: (base.depth / 2) + 1,
+                node_count: node_count,
+                _bounds: {
+                    let range = base.nodes[base.root_node_index()].bounds_range();
+                    Some(arena.copy_slice(&base.bounds[range.0..range.1]))
+                },
+            }
+        }
+    }
+
+    pub fn tree_depth(&self) -> usize {
+        self.depth
+    }
+
+    pub fn traverse<T, F>(
+        &self,
+        rays: &mut RayBatch,
+        ray_stack: &mut RayStack,
+        objects: &[T],
+        mut obj_ray_test: F,
+    ) where
+        F: FnMut(&T, &mut RayBatch, &mut RayStack),
+    {
+        if self.root.is_none() {
+            return;
+        }
+
+        let mut trav_time: f64 = 0.0;
+        let mut timer = Timer::new();
+
+        let traversal_table =
+            &TRAVERSAL_TABLE[ray_code(rays.dir_inv_local(ray_stack.next_task_ray_idx(0)))];
+
+        // +2 of max depth for root and last child
+        let mut node_stack = [self.root.unwrap(); (BVH_MAX_DEPTH * 3) + 2];
+        let mut stack_ptr = 1;
+
+        while stack_ptr > 0 {
+            match node_stack[stack_ptr] {
+                &BVH4Node::Internal {
+                    bounds,
+                    children,
+                    traversal_code,
+                } => {
+                    let mut all_hits = 0;
+
+                    // Ray testing
+                    ray_stack.pop_do_next_task(children.len(), |ray_idx| {
+                        if rays.is_done(ray_idx) {
+                            ([0, 1, 2, 3, 4, 5, 6, 7], 0)
+                        } else {
+                            let hits = lerp_slice(bounds, rays.time(ray_idx))
+                                .intersect_ray(
+                                    rays.orig_local(ray_idx),
+                                    rays.dir_inv_local(ray_idx),
+                                    rays.max_t(ray_idx),
+                                )
+                                .to_bitmask();
+
+                            if hits != 0 {
+                                all_hits |= hits;
+                                let mut lanes = [0u8; 8];
+                                let mut lane_count = 0;
+                                for i in 0..children.len() {
+                                    if (hits >> i) & 1 != 0 {
+                                        lanes[lane_count] = i as u8;
+                                        lane_count += 1;
+                                    }
+                                }
+                                (lanes, lane_count)
+                            } else {
+                                ([0, 1, 2, 3, 4, 5, 6, 7], 0)
+                            }
+                        }
+                    });
+
+                    // If there were any intersections, create tasks.
+                    if all_hits > 0 {
+                        let order_code = traversal_table[traversal_code as usize];
+                        let mut lanes = [0usize; 4];
+                        let mut lane_count = 0;
+                        for i in 0..children.len() {
+                            let inv_i = (children.len() - 1) - i;
+                            let child_i = ((order_code >> (inv_i * 2)) & 3) as usize;
+                            if ((all_hits >> child_i) & 1) != 0 {
+                                node_stack[stack_ptr + lane_count] = &children[child_i];
+                                lanes[lane_count] = child_i;
+                                lane_count += 1;
+                            }
+                        }
+
+                        ray_stack.push_lanes_to_tasks(&lanes[..lane_count]);
+                        stack_ptr += lane_count - 1;
+                    } else {
+                        stack_ptr -= 1;
+                    }
+                }
+
+                &BVH4Node::Leaf { object_range } => {
+                    trav_time += timer.tick() as f64;
+
+                    // Set up the tasks for each object.
+                    let obj_count = object_range.1 - object_range.0;
+                    for _ in 0..(obj_count - 1) {
+                        ray_stack.duplicate_next_task();
+                    }
+
+                    // Do the ray tests.
+                    for obj in &objects[object_range.0..object_range.1] {
+                        obj_ray_test(obj, rays, ray_stack);
+                    }
+
+                    timer.tick();
+
+                    stack_ptr -= 1;
+                }
+            }
+        }
+
+        trav_time += timer.tick() as f64;
+        ACCEL_TRAV_TIME.with(|att| {
+            let v = att.get();
+            att.set(v + trav_time);
+        });
+    }
+
+    fn construct_from_base(
+        arena: &'a MemArena,
+        base: &BVHBase,
+        node: &BVHBaseNode,
+        fill_node: &mut BVH4Node<'a>,
+    ) -> usize {
+        let mut node_count = 0;
+
+        match node {
+            // Create internal node
+            &BVHBaseNode::Internal {
+                bounds_range: _,
+                children_indices,
+                split_axis,
+            } => {
+                let child_l = &base.nodes[children_indices.0];
+                let child_r = &base.nodes[children_indices.1];
+
+                // Prepare convenient access to the stuff we need.
+                let child_count: usize;
+                let children; // [Optional, Optional, Optional, Optional]
+                let split_info: SplitAxes;
+                match *child_l {
+                    BVHBaseNode::Internal {
+                        children_indices: i_l,
+                        split_axis: s_l,
+                        ..
+                    } => {
+                        match *child_r {
+                            BVHBaseNode::Internal {
+                                children_indices: i_r,
+                                split_axis: s_r,
+                                ..
+                            } => {
+                                // Four nodes
+                                child_count = 4;
+                                children = [
+                                    Some(&base.nodes[i_l.0]),
+                                    Some(&base.nodes[i_l.1]),
+                                    Some(&base.nodes[i_r.0]),
+                                    Some(&base.nodes[i_r.1]),
+                                ];
+                                split_info = SplitAxes::Full((split_axis, s_l, s_r));
+                            }
+                            BVHBaseNode::Leaf { .. } => {
+                                // Three nodes with left split
+                                child_count = 3;
+                                children = [
+                                    Some(&base.nodes[i_l.0]),
+                                    Some(&base.nodes[i_l.1]),
+                                    Some(child_r),
+                                    None,
+                                ];
+                                split_info = SplitAxes::Left((split_axis, s_l));
+                            }
+                        }
+                    }
+                    BVHBaseNode::Leaf { .. } => {
+                        match *child_r {
+                            BVHBaseNode::Internal {
+                                children_indices: i_r,
+                                split_axis: s_r,
+                                ..
+                            } => {
+                                // Three nodes with right split
+                                child_count = 3;
+                                children = [
+                                    Some(child_l),
+                                    Some(&base.nodes[i_r.0]),
+                                    Some(&base.nodes[i_r.1]),
+                                    None,
+                                ];
+                                split_info = SplitAxes::Right((split_axis, s_r));
+                            }
+                            BVHBaseNode::Leaf { .. } => {
+                                // Two nodes
+                                child_count = 2;
+                                children = [Some(child_l), Some(child_r), None, None];
+                                split_info = SplitAxes::TopOnly(split_axis);
+                            }
+                        }
+                    }
+                }
+
+                node_count += child_count;
+
+                // Construct bounds
+                let bounds = {
+                    let bounds_len = children
+                        .iter()
+                        .map(|c| {
+                            if let &Some(n) = c {
+                                let len = n.bounds_range().1 - n.bounds_range().0;
+                                debug_assert!(len >= 1);
+                                len
+                            } else {
+                                0
+                            }
+                        })
+                        .max()
+                        .unwrap();
+                    debug_assert!(bounds_len >= 1);
+                    let bounds =
+                        unsafe { arena.alloc_array_uninitialized_with_alignment(bounds_len, 32) };
+                    if bounds_len < 2 {
+                        let b1 =
+                            children[0].map_or(BBox::new(), |c| base.bounds[c.bounds_range().0]);
+                        let b2 =
+                            children[1].map_or(BBox::new(), |c| base.bounds[c.bounds_range().0]);
+                        let b3 =
+                            children[2].map_or(BBox::new(), |c| base.bounds[c.bounds_range().0]);
+                        let b4 =
+                            children[3].map_or(BBox::new(), |c| base.bounds[c.bounds_range().0]);
+                        bounds[0] = BBox4::from_bboxes(b1, b2, b3, b4);
+                    } else {
+                        for (i, b) in bounds.iter_mut().enumerate() {
+                            let time = i as f32 / (bounds_len - 1) as f32;
+
+                            let b1 = children[0].map_or(BBox::new(), |c| {
+                                let (x, y) = c.bounds_range();
+                                lerp_slice(&base.bounds[x..y], time)
+                            });
+                            let b2 = children[1].map_or(BBox::new(), |c| {
+                                let (x, y) = c.bounds_range();
+                                lerp_slice(&base.bounds[x..y], time)
+                            });
+                            let b3 = children[2].map_or(BBox::new(), |c| {
+                                let (x, y) = c.bounds_range();
+                                lerp_slice(&base.bounds[x..y], time)
+                            });
+                            let b4 = children[3].map_or(BBox::new(), |c| {
+                                let (x, y) = c.bounds_range();
+                                lerp_slice(&base.bounds[x..y], time)
+                            });
+                            *b = BBox4::from_bboxes(b1, b2, b3, b4);
+                        }
+                    }
+                    bounds
+                };
+
+                // Construct child nodes
+                let child_nodes = unsafe {
+                    arena.alloc_array_uninitialized_with_alignment::<BVH4Node>(child_count, 32)
+                };
+                for (i, c) in children[0..child_count].iter().enumerate() {
+                    node_count +=
+                        BVH4::construct_from_base(arena, base, c.unwrap(), &mut child_nodes[i]);
+                }
+
+                // Build this node
+                *fill_node = BVH4Node::Internal {
+                    bounds: bounds,
+                    children: child_nodes,
+                    traversal_code: calc_traversal_code(split_info),
+                };
+            }
+
+            // Create leaf node
+            &BVHBaseNode::Leaf { object_range, .. } => {
+                *fill_node = BVH4Node::Leaf {
+                    object_range: object_range,
+                };
+                node_count += 1;
+            }
+        }
+
+        return node_count;
+    }
+}
+
+impl<'a> Boundable for BVH4<'a> {
+    fn bounds<'b>(&'b self) -> &'b [BBox] {
+        self._bounds.unwrap_or(&[])
+    }
+}
diff --git a/src/accel/mod.rs b/src/accel/mod.rs
index abbb1d4..1bac6d7 100644
--- a/src/accel/mod.rs
+++ b/src/accel/mod.rs
@@ -1,5 +1,6 @@
 // mod bvh;
 mod bvh4;
+mod bvh4_simd;
 mod bvh_base;
 mod light_array;
 mod light_tree;
@@ -14,7 +15,7 @@ use crate::{
 
 pub use self::{
     // bvh::{BVHNode, BVH},
-    bvh4::{ray_code, BVH4Node, BVH4},
+    bvh4_simd::{ray_code, BVH4Node, BVH4},
     light_array::LightArray,
     light_tree::LightTree,
 };
diff --git a/src/bbox4.rs b/src/bbox4.rs
new file mode 100644
index 0000000..71793a4
--- /dev/null
+++ b/src/bbox4.rs
@@ -0,0 +1,139 @@
+#![allow(dead_code)]
+
+use std;
+use std::ops::{BitOr, BitOrAssign};
+
+use crate::{
+    bbox::BBox,
+    lerp::{lerp, Lerp},
+    math::{Point, Vector},
+};
+
+use float4::{Bool4, Float4};
+
+const BBOX_MAXT_ADJUST: f32 = 1.00000024;
+
+/// A SIMD set of 4 3D axis-aligned bounding boxes.
+#[derive(Debug, Copy, Clone)]
+pub struct BBox4 {
+    pub x: (Float4, Float4), // (min, max)
+    pub y: (Float4, Float4), // (min, max)
+    pub z: (Float4, Float4), // (min, max)
+}
+
+impl BBox4 {
+    /// Creates a degenerate BBox4 with +infinity min and -infinity max in all four lanes.
+    pub fn new() -> BBox4 {
+        BBox4 {
+            x: (
+                Float4::splat(std::f32::INFINITY),
+                Float4::splat(std::f32::NEG_INFINITY),
+            ),
+            y: (
+                Float4::splat(std::f32::INFINITY),
+                Float4::splat(std::f32::NEG_INFINITY),
+            ),
+            z: (
+                Float4::splat(std::f32::INFINITY),
+                Float4::splat(std::f32::NEG_INFINITY),
+            ),
+        }
+    }
+
+    /// Creates a BBox4 from four BBoxes, storing one box per SIMD lane
+    /// (in the order `b1`, `b2`, `b3`, `b4`).
+    pub fn from_bboxes(b1: BBox, b2: BBox, b3: BBox, b4: BBox) -> BBox4 {
+        BBox4 {
+            x: (
+                Float4::new(b1.min.x(), b2.min.x(), b3.min.x(), b4.min.x()),
+                Float4::new(b1.max.x(), b2.max.x(), b3.max.x(), b4.max.x()),
+            ),
+            y: (
+                Float4::new(b1.min.y(), b2.min.y(), b3.min.y(), b4.min.y()),
+                Float4::new(b1.max.y(), b2.max.y(), b3.max.y(), b4.max.y()),
+            ),
+            z: (
+                Float4::new(b1.min.z(), b2.min.z(), b3.min.z(), b4.min.z()),
+                Float4::new(b1.max.z(), b2.max.z(), b3.max.z(), b4.max.z()),
+            ),
+        }
+    }
+
+    /// Returns a per-box mask of which of the four bboxes the given ray intersects.
+    pub fn intersect_ray(&self, orig: Point, dir_inv: Vector, max_t: f32) -> Bool4 {
+        // Get the ray data into SIMD format.
+        let ro_x = orig.co.all_0();
+        let ro_y = orig.co.all_1();
+        let ro_z = orig.co.all_2();
+        let rdi_x = dir_inv.co.all_0();
+        let rdi_y = dir_inv.co.all_1();
+        let rdi_z = dir_inv.co.all_2();
+        let max_t = Float4::splat(max_t);
+
+        // Slab tests
+        let t1_x = (self.x.0 - ro_x) * rdi_x;
+        let t1_y = (self.y.0 - ro_y) * rdi_y;
+        let t1_z = (self.z.0 - ro_z) * rdi_z;
+        let t2_x = (self.x.1 - ro_x) * rdi_x;
+        let t2_y = (self.y.1 - ro_y) * rdi_y;
+        let t2_z = (self.z.1 - ro_z) * rdi_z;
+
+        // Get the far and near t hits for each axis.
+        let t_far_x = t1_x.v_max(t2_x);
+        let t_far_y = t1_y.v_max(t2_y);
+        let t_far_z = t1_z.v_max(t2_z);
+        let t_near_x = t1_x.v_min(t2_x);
+        let t_near_y = t1_y.v_min(t2_y);
+        let t_near_z = t1_z.v_min(t2_z);
+
+        // Calculate over-all far t hit.
+        let far_t =
+            (t_far_x.v_min(t_far_y.v_min(t_far_z)) * Float4::splat(BBOX_MAXT_ADJUST)).v_min(max_t);
+
+        // Calculate over-all near t hit.
+        let near_t = t_near_x
+            .v_max(t_near_y)
+            .v_max(t_near_z.v_max(Float4::splat(0.0)));
+
+        // Hit results
+        near_t.lt(far_t)
+    }
+}
+
+/// Lane-wise union of two BBox4s.
+impl BitOr for BBox4 {
+    type Output = BBox4;
+
+    fn bitor(self, rhs: BBox4) -> BBox4 {
+        BBox4 {
+            x: (self.x.0.v_min(rhs.x.0), self.x.1.v_max(rhs.x.1)),
+            y: (self.y.0.v_min(rhs.y.0), self.y.1.v_max(rhs.y.1)),
+            z: (self.z.0.v_min(rhs.z.0), self.z.1.v_max(rhs.z.1)),
+        }
+    }
+}
+
+impl BitOrAssign for BBox4 {
+    fn bitor_assign(&mut self, rhs: BBox4) {
+        *self = *self | rhs;
+    }
+}
+
+impl Lerp for BBox4 {
+    fn lerp(self, other: BBox4, alpha: f32) -> BBox4 {
+        BBox4 {
+            x: (
+                lerp(self.x.0, other.x.0, alpha),
+                lerp(self.x.1, other.x.1, alpha),
+            ),
+            y: (
+                lerp(self.y.0, other.y.0, alpha),
+                lerp(self.y.1, other.y.1, alpha),
+            ),
+            z: (
+                lerp(self.z.0, other.z.0, alpha),
+                lerp(self.z.1, other.z.1, alpha),
+            ),
+        }
+    }
+}
diff --git a/src/main.rs b/src/main.rs
index bd5cf51..bd18195 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -17,6 +17,7 @@ extern crate lazy_static;
 mod accel;
 mod algorithm;
 mod bbox;
+mod bbox4;
 mod boundable;
 mod camera;
 mod color;
diff --git a/src/ray.rs b/src/ray.rs
index 3b485f0..852f5c9 100644
--- a/src/ray.rs
+++ b/src/ray.rs
@@ -273,6 +273,25 @@ impl RayStack {
         }
     }
 
+    pub fn duplicate_next_task(&mut self) {
+        let task = self.tasks.last().unwrap();
+        let l = task.lane;
+        let start = task.start_idx;
+        let end = self.lanes[l].end_len;
+
+        for i in start..end {
+            let idx = self.lanes[l].idxs[i];
+            self.lanes[l].idxs.push(idx);
+        }
+
+        self.tasks.push(RayTask {
+            lane: l,
+            start_idx: end,
+        });
+
+        self.lanes[l].end_len = self.lanes[l].idxs.len();
+    }
+
     /// Pops the next task off the stack, and executes the provided closure for
     /// each ray index in the task.  The return value of the closure is the list
     /// of lanes (by index) to add the given ray index back into.

From 2fddcae0fd90e1a6474ab46acc5a872fa957aa2d Mon Sep 17 00:00:00 2001
From: Nathan Vegdahl <cessen@cessen.com>
Date: Fri, 28 Jun 2019 22:22:41 +0900
Subject: [PATCH 08/20] Reduced the size of a hot return value.

Gives a small performance boost.
---
 src/accel/bvh4.rs            | 8 ++++----
 src/accel/bvh4_simd.rs       | 6 +++---
 src/light/rectangle_light.rs | 2 +-
 src/light/sphere_light.rs    | 8 ++++----
 src/ray.rs                   | 2 +-
 src/surface/triangle_mesh.rs | 2 +-
 src/tracer.rs                | 6 +++---
 7 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/src/accel/bvh4.rs b/src/accel/bvh4.rs
index b21a5a2..6ee9525 100644
--- a/src/accel/bvh4.rs
+++ b/src/accel/bvh4.rs
@@ -125,9 +125,9 @@ impl<'a> BVH4<'a> {
 
                         if hit {
                             hit_count += 1;
-                            ([0, 1, 2, 3, 4, 5, 6, 7], children.len())
+                            ([0, 1, 2, 3], children.len())
                         } else {
-                            ([0, 1, 2, 3, 4, 5, 6, 7], 0)
+                            ([0; 4], 0)
                         }
                     });
 
@@ -201,9 +201,9 @@ impl<'a> BVH4<'a> {
                             );
                         if hit {
                             hit_count += 1;
-                            ([0, 1, 2, 3, 4, 5, 6, 7], object_count)
+                            ([0, 1, 2, 3], object_count)
                         } else {
-                            ([0, 1, 2, 3, 4, 5, 6, 7], 0)
+                            ([0; 4], 0)
                         }
                     });
 
diff --git a/src/accel/bvh4_simd.rs b/src/accel/bvh4_simd.rs
index 06a9f15..95b042f 100644
--- a/src/accel/bvh4_simd.rs
+++ b/src/accel/bvh4_simd.rs
@@ -133,7 +133,7 @@ impl<'a> BVH4<'a> {
                     // Ray testing
                     ray_stack.pop_do_next_task(children.len(), |ray_idx| {
                         if rays.is_done(ray_idx) {
-                            ([0, 1, 2, 3, 4, 5, 6, 7], 0)
+                            ([0; 4], 0)
                         } else {
                             let hits = lerp_slice(bounds, rays.time(ray_idx))
                                 .intersect_ray(
@@ -145,7 +145,7 @@ impl<'a> BVH4<'a> {
 
                             if hits != 0 {
                                 all_hits |= hits;
-                                let mut lanes = [0u8; 8];
+                                let mut lanes = [0u8; 4];
                                 let mut lane_count = 0;
                                 for i in 0..children.len() {
                                     if (hits >> i) & 1 != 0 {
@@ -155,7 +155,7 @@ impl<'a> BVH4<'a> {
                                 }
                                 (lanes, lane_count)
                             } else {
-                                ([0, 1, 2, 3, 4, 5, 6, 7], 0)
+                                ([0; 4], 0)
                             }
                         }
                     });
diff --git a/src/light/rectangle_light.rs b/src/light/rectangle_light.rs
index f7af205..8df2890 100644
--- a/src/light/rectangle_light.rs
+++ b/src/light/rectangle_light.rs
@@ -333,7 +333,7 @@ impl<'a> Surface for RectangleLight<'a> {
                 }
             }
 
-            ([0, 0, 0, 0, 0, 0, 0, 0], 0)
+            ([0; 4], 0)
         });
     }
 }
diff --git a/src/light/sphere_light.rs b/src/light/sphere_light.rs
index ace32f7..8c596a8 100644
--- a/src/light/sphere_light.rs
+++ b/src/light/sphere_light.rs
@@ -242,7 +242,7 @@ impl<'a> Surface for SphereLight<'a> {
             let discriminant = (b * b) - (4.0 * a * c);
             if discriminant < 0.0 {
                 // Discriminant less than zero?  No solution => no intersection.
-                return ([0, 0, 0, 0, 0, 0, 0, 0], 0);
+                return ([0; 4], 0);
             }
             let discriminant = discriminant.sqrt();
 
@@ -268,7 +268,7 @@ impl<'a> Surface for SphereLight<'a> {
             // Check our intersection for validity against this ray's extents
             if t0 > rays.max_t(ray_idx) || t1 <= 0.0 {
                 // Didn't hit because sphere is entirely outside of ray's extents
-                return ([0, 0, 0, 0, 0, 0, 0, 0], 0);
+                return ([0; 4], 0);
             }
 
             let t = if t0 > 0.0 {
@@ -278,7 +278,7 @@ impl<'a> Surface for SphereLight<'a> {
             } else {
                 // Didn't hit because ray is entirely within the sphere, and
                 // therefore doesn't hit its surface.
-                return ([0, 0, 0, 0, 0, 0, 0, 0], 0);
+                return ([0; 4], 0);
             };
 
             // We hit the sphere, so calculate intersection info.
@@ -335,7 +335,7 @@ impl<'a> Surface for SphereLight<'a> {
                 rays.set_max_t(ray_idx, t);
             }
 
-            ([0, 0, 0, 0, 0, 0, 0, 0], 0)
+            ([0; 4], 0)
         });
     }
 }
diff --git a/src/ray.rs b/src/ray.rs
index 852f5c9..4312f32 100644
--- a/src/ray.rs
+++ b/src/ray.rs
@@ -297,7 +297,7 @@ impl RayStack {
     /// of lanes (by index) to add the given ray index back into.
     pub fn pop_do_next_task<F>(&mut self, needed_lanes: usize, mut handle_ray: F)
     where
-        F: FnMut(usize) -> ([u8; 8], usize),
+        F: FnMut(usize) -> ([u8; 4], usize),
     {
         // Prepare lanes.
         self.ensure_lane_count(needed_lanes);
diff --git a/src/surface/triangle_mesh.rs b/src/surface/triangle_mesh.rs
index a1f8cb6..906b7a5 100644
--- a/src/surface/triangle_mesh.rs
+++ b/src/surface/triangle_mesh.rs
@@ -276,7 +276,7 @@ impl<'a> Surface for TriangleMesh<'a> {
                         }
                     }
 
-                    ([0, 0, 0, 0, 0, 0, 0, 0], 0)
+                    ([0; 4], 0)
                 });
             },
         );
diff --git a/src/tracer.rs b/src/tracer.rs
index d11d0bb..8ba78c3 100644
--- a/src/tracer.rs
+++ b/src/tracer.rs
@@ -99,7 +99,7 @@ impl<'a> TracerInner<'a> {
                     ray_stack.pop_do_next_task(2, |ray_idx| {
                         let t = rays.time(ray_idx);
                         rays.update_local(ray_idx, &lerp_slice(xforms, t));
-                        ([0, 1, 2, 3, 4, 5, 6, 7], 2)
+                        ([0, 1, 0, 0], 2)
                     });
                     ray_stack.push_lanes_to_tasks(&[0, 1]);
                 }
@@ -132,13 +132,13 @@ impl<'a> TracerInner<'a> {
                         ray_stack.pop_do_next_task(0, |ray_idx| {
                             let t = rays.time(ray_idx);
                             rays.update_local(ray_idx, &lerp_slice(xforms, t));
-                            ([0, 1, 2, 3, 4, 5, 6, 7], 0)
+                            ([0; 4], 0)
                         });
                     } else {
                         let ident = Matrix4x4::new();
                         ray_stack.pop_do_next_task(0, |ray_idx| {
                             rays.update_local(ray_idx, &ident);
-                            ([0, 1, 2, 3, 4, 5, 6, 7], 0)
+                            ([0; 4], 0)
                         });
                     }
                 }

From c5d23592b979e97493c86af909eadff11f010d4f Mon Sep 17 00:00:00 2001
From: Nathan Vegdahl <cessen@cessen.com>
Date: Fri, 28 Jun 2019 22:56:51 +0900
Subject: [PATCH 09/20] Keep Bool4 in its native format instead of converting
 to a bitmask.

This gives a small performance boost.
---
 src/accel/bvh4_simd.rs       | 25 ++++++++++++-------------
 sub_crates/float4/src/lib.rs | 23 ++++++++++++++++++++---
 2 files changed, 32 insertions(+), 16 deletions(-)

diff --git a/src/accel/bvh4_simd.rs b/src/accel/bvh4_simd.rs
index 95b042f..2ad0848 100644
--- a/src/accel/bvh4_simd.rs
+++ b/src/accel/bvh4_simd.rs
@@ -25,6 +25,7 @@ use super::{
 };
 
 use bvh_order::{calc_traversal_code, SplitAxes, TRAVERSAL_TABLE};
+use float4::Bool4;
 
 pub fn ray_code(dir: Vector) -> usize {
     let ray_sign_is_neg = [dir.x() < 0.0, dir.y() < 0.0, dir.z() < 0.0];
@@ -128,27 +129,25 @@ impl<'a> BVH4<'a> {
                     children,
                     traversal_code,
                 } => {
-                    let mut all_hits = 0;
+                    let mut all_hits = Bool4::new();
 
                     // Ray testing
                     ray_stack.pop_do_next_task(children.len(), |ray_idx| {
                         if rays.is_done(ray_idx) {
                             ([0; 4], 0)
                         } else {
-                            let hits = lerp_slice(bounds, rays.time(ray_idx))
-                                .intersect_ray(
-                                    rays.orig_local(ray_idx),
-                                    rays.dir_inv_local(ray_idx),
-                                    rays.max_t(ray_idx),
-                                )
-                                .to_bitmask();
+                            let hits = lerp_slice(bounds, rays.time(ray_idx)).intersect_ray(
+                                rays.orig_local(ray_idx),
+                                rays.dir_inv_local(ray_idx),
+                                rays.max_t(ray_idx),
+                            );
 
-                            if hits != 0 {
-                                all_hits |= hits;
+                            if !hits.all_false() {
+                                all_hits = all_hits | hits;
                                 let mut lanes = [0u8; 4];
                                 let mut lane_count = 0;
                                 for i in 0..children.len() {
-                                    if (hits >> i) & 1 != 0 {
+                                    if hits.get_n(i) {
                                         lanes[lane_count] = i as u8;
                                         lane_count += 1;
                                     }
@@ -161,14 +160,14 @@ impl<'a> BVH4<'a> {
                     });
 
                     // If there were any intersections, create tasks.
-                    if all_hits > 0 {
+                    if !all_hits.all_false() {
                         let order_code = traversal_table[traversal_code as usize];
                         let mut lanes = [0usize; 4];
                         let mut lane_count = 0;
                         for i in 0..children.len() {
                             let inv_i = (children.len() - 1) - i;
                             let child_i = ((order_code >> (inv_i * 2)) & 3) as usize;
-                            if ((all_hits >> child_i) & 1) != 0 {
+                            if all_hits.get_n(child_i) {
                                 node_stack[stack_ptr + lane_count] = &children[child_i];
                                 lanes[lane_count] = child_i;
                                 lane_count += 1;
diff --git a/sub_crates/float4/src/lib.rs b/sub_crates/float4/src/lib.rs
index 4006301..99c0417 100644
--- a/sub_crates/float4/src/lib.rs
+++ b/sub_crates/float4/src/lib.rs
@@ -620,6 +620,14 @@ mod x86_64_sse {
     }
 
     impl Bool4 {
+        #[inline(always)]
+        pub fn new() -> Bool4 {
+            use std::arch::x86_64::_mm_set1_ps;
+            Bool4 {
+                data: unsafe { _mm_set1_ps(0.0) },
+            }
+        }
+
         /// Returns the value of the nth element.
         #[inline(always)]
         pub fn get_n(&self, n: usize) -> bool {
@@ -637,24 +645,33 @@ mod x86_64_sse {
             self.get_n(0)
         }
 
-        /// Returns the value of the 1th element.
+        /// Returns the value of the 1st element.
         #[inline(always)]
         pub fn get_1(&self) -> bool {
             self.get_n(1)
         }
 
-        /// Returns the value of the 2th element.
+        /// Returns the value of the 2nd element.
         #[inline(always)]
         pub fn get_2(&self) -> bool {
             self.get_n(2)
         }
 
-        /// Returns the value of the 3th element.
+        /// Returns the value of the 3rd element.
         #[inline(always)]
         pub fn get_3(&self) -> bool {
             self.get_n(3)
         }
 
+        /// Returns whether all four bools are false.
+        ///
+        /// This is the negated `OR` of all the contained bools.  If even
+        /// one bool is true, this returns false.
+        pub fn all_false(&self) -> bool {
+            let a = unsafe { *(&self.data as *const __m128 as *const u128) };
+            a == 0
+        }
+
         #[inline]
         pub fn to_bitmask(&self) -> u8 {
             let a = unsafe { *(&self.data as *const __m128 as *const u8).offset(0) };

From b09f9684d16afa9f81eea1732c19255d25cdddbd Mon Sep 17 00:00:00 2001
From: Nathan Vegdahl <cessen@cessen.com>
Date: Sat, 29 Jun 2019 07:22:22 +0900
Subject: [PATCH 10/20] Remove non-SIMD BVH4, and keep more bool calculations
 in SIMD format.

---
 src/accel/bvh4.rs            | 324 +++++++++++++++--------------
 src/accel/bvh4_simd.rs       | 386 -----------------------------------
 src/accel/mod.rs             |   3 +-
 src/light/rectangle_light.rs |   4 +-
 src/light/sphere_light.rs    |  10 +-
 src/ray.rs                   |  46 +++--
 src/surface/triangle_mesh.rs |   4 +-
 src/tracer.rs                |  12 +-
 sub_crates/float4/src/lib.rs |  20 +-
 9 files changed, 234 insertions(+), 575 deletions(-)
 delete mode 100644 src/accel/bvh4_simd.rs

diff --git a/src/accel/bvh4.rs b/src/accel/bvh4.rs
index 6ee9525..d7e68e1 100644
--- a/src/accel/bvh4.rs
+++ b/src/accel/bvh4.rs
@@ -1,13 +1,17 @@
+//! This BVH4 implementation is based on the ideas from the paper
+//! "Efficient Ray Tracing Kernels for Modern CPU Architectures"
+//! by Fuetterling et al.
+
 #![allow(dead_code)]
 
-use bvh_order::{calc_traversal_code, SplitAxes, TRAVERSAL_TABLE};
-use math3d::Vector;
 use mem_arena::MemArena;
 
 use crate::{
     bbox::BBox,
+    bbox4::BBox4,
     boundable::Boundable,
     lerp::lerp_slice,
+    math::Vector,
     ray::{RayBatch, RayStack},
     timer::Timer,
 };
@@ -17,6 +21,9 @@ use super::{
     ACCEL_NODE_RAY_TESTS, ACCEL_TRAV_TIME,
 };
 
+use bvh_order::{calc_traversal_code, SplitAxes, TRAVERSAL_TABLE};
+use float4::Bool4;
+
 pub fn ray_code(dir: Vector) -> usize {
     let ray_sign_is_neg = [dir.x() < 0.0, dir.y() < 0.0, dir.z() < 0.0];
     ray_sign_is_neg[0] as usize
@@ -28,20 +35,19 @@ pub fn ray_code(dir: Vector) -> usize {
 pub struct BVH4<'a> {
     root: Option<&'a BVH4Node<'a>>,
     depth: usize,
+    node_count: usize,
+    _bounds: Option<&'a [BBox]>,
 }
 
 #[derive(Copy, Clone, Debug)]
 pub enum BVH4Node<'a> {
-    Inner {
-        traversal_code: u8,
-        bounds_start: &'a BBox,
-        bounds_len: u16,
+    Internal {
+        bounds: &'a [BBox4],
         children: &'a [BVH4Node<'a>],
+        traversal_code: u8,
     },
 
     Leaf {
-        bounds_start: &'a BBox,
-        bounds_len: u16,
         object_range: (usize, usize),
     },
 }
@@ -56,19 +62,32 @@ impl<'a> BVH4<'a> {
     where
         F: 'b + Fn(&T) -> &'b [BBox],
     {
-        if objects.is_empty() {
+        if objects.len() == 0 {
             BVH4 {
                 root: None,
                 depth: 0,
+                node_count: 0,
+                _bounds: None,
             }
         } else {
             let base = BVHBase::from_objects(objects, objects_per_leaf, bounder);
 
-            let root = unsafe { arena.alloc_uninitialized::<BVH4Node>() };
-            BVH4::construct_from_base(arena, &base, base.root_node_index(), root);
+            let fill_node = unsafe { arena.alloc_uninitialized_with_alignment::<BVH4Node>(32) };
+            let node_count = BVH4::construct_from_base(
+                arena,
+                &base,
+                &base.nodes[base.root_node_index()],
+                fill_node,
+            );
+
             BVH4 {
-                root: Some(root),
-                depth: base.depth,
+                root: Some(fill_node),
+                depth: (base.depth / 2) + 1,
+                node_count: node_count,
+                _bounds: {
+                    let range = base.nodes[base.root_node_index()].bounds_range();
+                    Some(arena.copy_slice(&base.bounds[range.0..range.1]))
+                },
             }
         }
     }
@@ -103,117 +122,63 @@ impl<'a> BVH4<'a> {
 
         while stack_ptr > 0 {
             node_tests += ray_stack.ray_count_in_next_task() as u64;
-            match *node_stack[stack_ptr] {
-                BVH4Node::Inner {
-                    traversal_code,
-                    bounds_start,
-                    bounds_len,
+            match node_stack[stack_ptr] {
+                &BVH4Node::Internal {
+                    bounds,
                     children,
+                    traversal_code,
                 } => {
-                    // Test rays against bbox.
-                    let bounds =
-                        unsafe { std::slice::from_raw_parts(bounds_start, bounds_len as usize) };
+                    let mut all_hits = Bool4::new_false();
 
-                    let mut hit_count = 0;
-                    ray_stack.pop_do_next_task(children.len(), |ray_idx| {
-                        let hit = (!rays.is_done(ray_idx))
-                            && lerp_slice(bounds, rays.time(ray_idx)).intersect_ray(
+                    // Ray testing
+                    ray_stack.pop_do_next_task_and_push_rays(children.len(), |ray_idx| {
+                        if rays.is_done(ray_idx) {
+                            (Bool4::new_false(), 0)
+                        } else {
+                            let hits = lerp_slice(bounds, rays.time(ray_idx)).intersect_ray(
                                 rays.orig_local(ray_idx),
                                 rays.dir_inv_local(ray_idx),
                                 rays.max_t(ray_idx),
                             );
-
-                        if hit {
-                            hit_count += 1;
-                            ([0, 1, 2, 3], children.len())
-                        } else {
-                            ([0; 4], 0)
+                            all_hits = all_hits | hits;
+                            (hits, children.len())
                         }
                     });
 
                     // If there were any intersections, create tasks.
-                    if hit_count > 0 {
+                    if !all_hits.is_all_false() {
                         let order_code = traversal_table[traversal_code as usize];
-                        match children.len() {
-                            4 => {
-                                let i4 = ((order_code >> 6) & 0b11) as usize;
-                                let i3 = ((order_code >> 4) & 0b11) as usize;
-                                let i2 = ((order_code >> 2) & 0b11) as usize;
-                                let i1 = (order_code & 0b11) as usize;
-
-                                ray_stack.push_lanes_to_tasks(&[i4, i3, i2, i1]);
-
-                                node_stack[stack_ptr] = &children[i4];
-                                node_stack[stack_ptr + 1] = &children[i3];
-                                node_stack[stack_ptr + 2] = &children[i2];
-                                node_stack[stack_ptr + 3] = &children[i1];
-
-                                stack_ptr += 3;
+                        let mut lanes = [0usize; 4];
+                        let mut lane_count = 0;
+                        for i in 0..children.len() {
+                            let inv_i = (children.len() - 1) - i;
+                            let child_i = ((order_code >> (inv_i * 2)) & 3) as usize;
+                            if all_hits.get_n(child_i) {
+                                node_stack[stack_ptr + lane_count] = &children[child_i];
+                                lanes[lane_count] = child_i;
+                                lane_count += 1;
                             }
-                            3 => {
-                                let i3 = ((order_code >> 4) & 0b11) as usize;
-                                let i2 = ((order_code >> 2) & 0b11) as usize;
-                                let i1 = (order_code & 0b11) as usize;
-
-                                ray_stack.push_lanes_to_tasks(&[i3, i2, i1]);
-
-                                node_stack[stack_ptr] = &children[i3];
-                                node_stack[stack_ptr + 1] = &children[i2];
-                                node_stack[stack_ptr + 2] = &children[i1];
-
-                                stack_ptr += 2;
-                            }
-                            2 => {
-                                let i2 = ((order_code >> 2) & 0b11) as usize;
-                                let i1 = (order_code & 0b11) as usize;
-
-                                ray_stack.push_lanes_to_tasks(&[i2, i1]);
-
-                                node_stack[stack_ptr] = &children[i2];
-                                node_stack[stack_ptr + 1] = &children[i1];
-
-                                stack_ptr += 1;
-                            }
-                            _ => unreachable!(),
                         }
+
+                        ray_stack.push_lanes_to_tasks(&lanes[..lane_count]);
+                        stack_ptr += lane_count - 1;
                     } else {
                         stack_ptr -= 1;
                     }
                 }
 
-                BVH4Node::Leaf {
-                    object_range,
-                    bounds_start,
-                    bounds_len,
-                } => {
-                    // Test rays against bounds.
-                    let bounds =
-                        unsafe { std::slice::from_raw_parts(bounds_start, bounds_len as usize) };
-                    let object_count = object_range.1 - object_range.0;
-                    let mut hit_count = 0;
-
-                    ray_stack.pop_do_next_task(object_count, |ray_idx| {
-                        let hit = (!rays.is_done(ray_idx))
-                            && lerp_slice(bounds, rays.time(ray_idx)).intersect_ray(
-                                rays.orig_local(ray_idx),
-                                rays.dir_inv_local(ray_idx),
-                                rays.max_t(ray_idx),
-                            );
-                        if hit {
-                            hit_count += 1;
-                            ([0, 1, 2, 3], object_count)
-                        } else {
-                            ([0; 4], 0)
-                        }
-                    });
-
+                &BVH4Node::Leaf { object_range } => {
                     trav_time += timer.tick() as f64;
 
-                    if hit_count > 0 {
-                        ray_stack.push_lanes_to_tasks(&[0, 1, 2, 3, 4, 5, 6, 7][..object_count]);
-                        for obj in &objects[object_range.0..object_range.1] {
-                            obj_ray_test(obj, rays, ray_stack);
-                        }
+                    // Set up the tasks for each object.
+                    let obj_count = object_range.1 - object_range.0;
+                    for _ in 0..(obj_count - 1) {
+                        ray_stack.duplicate_next_task();
+                    }
+
+                    // Do the ray tests.
+                    for obj in &objects[object_range.0..object_range.1] {
+                        obj_ray_test(obj, rays, ray_stack);
                     }
 
                     timer.tick();
@@ -237,12 +202,15 @@ impl<'a> BVH4<'a> {
     fn construct_from_base(
         arena: &'a MemArena,
         base: &BVHBase,
-        node_index: usize,
-        node_mem: &mut BVH4Node<'a>,
-    ) {
-        match base.nodes[node_index] {
-            BVHBaseNode::Internal {
-                bounds_range,
+        node: &BVHBaseNode,
+        fill_node: &mut BVH4Node<'a>,
+    ) -> usize {
+        let mut node_count = 0;
+
+        match node {
+            // Create internal node
+            &BVHBaseNode::Internal {
+                bounds_range: _,
                 children_indices,
                 split_axis,
             } => {
@@ -251,7 +219,7 @@ impl<'a> BVH4<'a> {
 
                 // Prepare convenient access to the stuff we need.
                 let child_count: usize;
-                let child_indices: [usize; 4];
+                let children; // [Optional, Optional, Optional, Optional]
                 let split_info: SplitAxes;
                 match *child_l {
                     BVHBaseNode::Internal {
@@ -267,13 +235,23 @@ impl<'a> BVH4<'a> {
                             } => {
                                 // Four nodes
                                 child_count = 4;
-                                child_indices = [i_l.0, i_l.1, i_r.0, i_r.1];
+                                children = [
+                                    Some(&base.nodes[i_l.0]),
+                                    Some(&base.nodes[i_l.1]),
+                                    Some(&base.nodes[i_r.0]),
+                                    Some(&base.nodes[i_r.1]),
+                                ];
                                 split_info = SplitAxes::Full((split_axis, s_l, s_r));
                             }
                             BVHBaseNode::Leaf { .. } => {
                                 // Three nodes with left split
                                 child_count = 3;
-                                child_indices = [i_l.0, i_l.1, children_indices.1, 0];
+                                children = [
+                                    Some(&base.nodes[i_l.0]),
+                                    Some(&base.nodes[i_l.1]),
+                                    Some(child_r),
+                                    None,
+                                ];
                                 split_info = SplitAxes::Left((split_axis, s_l));
                             }
                         }
@@ -287,76 +265,112 @@ impl<'a> BVH4<'a> {
                             } => {
                                 // Three nodes with right split
                                 child_count = 3;
-                                child_indices = [children_indices.0, i_r.0, i_r.1, 0];
+                                children = [
+                                    Some(child_l),
+                                    Some(&base.nodes[i_r.0]),
+                                    Some(&base.nodes[i_r.1]),
+                                    None,
+                                ];
                                 split_info = SplitAxes::Right((split_axis, s_r));
                             }
                             BVHBaseNode::Leaf { .. } => {
                                 // Two nodes
                                 child_count = 2;
-                                child_indices = [children_indices.0, children_indices.1, 0, 0];
+                                children = [Some(child_l), Some(child_r), None, None];
                                 split_info = SplitAxes::TopOnly(split_axis);
                             }
                         }
                     }
                 }
 
-                // Copy bounds
-                let bounds = arena
-                    .copy_slice_with_alignment(&base.bounds[bounds_range.0..bounds_range.1], 32);
+                node_count += child_count;
 
-                // Build children
-                let children_mem = unsafe {
+                // Construct bounds
+                let bounds = {
+                    let bounds_len = children
+                        .iter()
+                        .map(|c| {
+                            if let &Some(n) = c {
+                                let len = n.bounds_range().1 - n.bounds_range().0;
+                                debug_assert!(len >= 1);
+                                len
+                            } else {
+                                0
+                            }
+                        })
+                        .max()
+                        .unwrap();
+                    debug_assert!(bounds_len >= 1);
+                    let bounds =
+                        unsafe { arena.alloc_array_uninitialized_with_alignment(bounds_len, 32) };
+                    if bounds_len < 2 {
+                        let b1 =
+                            children[0].map_or(BBox::new(), |c| base.bounds[c.bounds_range().0]);
+                        let b2 =
+                            children[1].map_or(BBox::new(), |c| base.bounds[c.bounds_range().0]);
+                        let b3 =
+                            children[2].map_or(BBox::new(), |c| base.bounds[c.bounds_range().0]);
+                        let b4 =
+                            children[3].map_or(BBox::new(), |c| base.bounds[c.bounds_range().0]);
+                        bounds[0] = BBox4::from_bboxes(b1, b2, b3, b4);
+                    } else {
+                        for (i, b) in bounds.iter_mut().enumerate() {
+                            let time = i as f32 / (bounds_len - 1) as f32;
+
+                            let b1 = children[0].map_or(BBox::new(), |c| {
+                                let (x, y) = c.bounds_range();
+                                lerp_slice(&base.bounds[x..y], time)
+                            });
+                            let b2 = children[1].map_or(BBox::new(), |c| {
+                                let (x, y) = c.bounds_range();
+                                lerp_slice(&base.bounds[x..y], time)
+                            });
+                            let b3 = children[2].map_or(BBox::new(), |c| {
+                                let (x, y) = c.bounds_range();
+                                lerp_slice(&base.bounds[x..y], time)
+                            });
+                            let b4 = children[3].map_or(BBox::new(), |c| {
+                                let (x, y) = c.bounds_range();
+                                lerp_slice(&base.bounds[x..y], time)
+                            });
+                            *b = BBox4::from_bboxes(b1, b2, b3, b4);
+                        }
+                    }
+                    bounds
+                };
+
+                // Construct child nodes
+                let child_nodes = unsafe {
                     arena.alloc_array_uninitialized_with_alignment::<BVH4Node>(child_count, 32)
                 };
-                for i in 0..child_count {
-                    BVH4::construct_from_base(arena, base, child_indices[i], &mut children_mem[i]);
+                for (i, c) in children[0..child_count].iter().enumerate() {
+                    node_count +=
+                        BVH4::construct_from_base(arena, base, c.unwrap(), &mut child_nodes[i]);
                 }
 
-                // Fill in node
-                *node_mem = BVH4Node::Inner {
+                // Build this node
+                *fill_node = BVH4Node::Internal {
+                    bounds: bounds,
+                    children: child_nodes,
                     traversal_code: calc_traversal_code(split_info),
-                    bounds_start: &bounds[0],
-                    bounds_len: bounds.len() as u16,
-                    children: children_mem,
                 };
             }
 
-            BVHBaseNode::Leaf {
-                bounds_range,
-                object_range,
-            } => {
-                let bounds = arena.copy_slice(&base.bounds[bounds_range.0..bounds_range.1]);
-
-                *node_mem = BVH4Node::Leaf {
-                    bounds_start: &bounds[0],
-                    bounds_len: bounds.len() as u16,
+            // Create leaf node
+            &BVHBaseNode::Leaf { object_range, .. } => {
+                *fill_node = BVH4Node::Leaf {
                     object_range: object_range,
                 };
+                node_count += 1;
             }
         }
-    }
-}
 
-lazy_static! {
-    static ref DEGENERATE_BOUNDS: [BBox; 1] = [BBox::new()];
+        return node_count;
+    }
 }
 
 impl<'a> Boundable for BVH4<'a> {
-    fn bounds(&self) -> &[BBox] {
-        match self.root {
-            None => &DEGENERATE_BOUNDS[..],
-            Some(root) => match *root {
-                BVH4Node::Inner {
-                    bounds_start,
-                    bounds_len,
-                    ..
-                }
-                | BVH4Node::Leaf {
-                    bounds_start,
-                    bounds_len,
-                    ..
-                } => unsafe { std::slice::from_raw_parts(bounds_start, bounds_len as usize) },
-            },
-        }
+    fn bounds<'b>(&'b self) -> &'b [BBox] {
+        self._bounds.unwrap_or(&[])
     }
 }
diff --git a/src/accel/bvh4_simd.rs b/src/accel/bvh4_simd.rs
deleted file mode 100644
index 2ad0848..0000000
--- a/src/accel/bvh4_simd.rs
+++ /dev/null
@@ -1,386 +0,0 @@
-//! This BVH4 implementation pulls a lot of ideas from the paper
-//! "Efficient Ray Tracing Kernels for Modern CPU Architectures"
-//! by Fuetterling et al.
-//!
-//! Specifically, the table-based traversal order approach they
-//! propose is largely followed by this implementation.
-
-#![allow(dead_code)]
-
-use mem_arena::MemArena;
-
-use crate::{
-    bbox::BBox,
-    bbox4::BBox4,
-    boundable::Boundable,
-    lerp::lerp_slice,
-    math::Vector,
-    ray::{RayBatch, RayStack},
-    timer::Timer,
-};
-
-use super::{
-    bvh_base::{BVHBase, BVHBaseNode, BVH_MAX_DEPTH},
-    ACCEL_NODE_RAY_TESTS, ACCEL_TRAV_TIME,
-};
-
-use bvh_order::{calc_traversal_code, SplitAxes, TRAVERSAL_TABLE};
-use float4::Bool4;
-
-pub fn ray_code(dir: Vector) -> usize {
-    let ray_sign_is_neg = [dir.x() < 0.0, dir.y() < 0.0, dir.z() < 0.0];
-    ray_sign_is_neg[0] as usize
-        + ((ray_sign_is_neg[1] as usize) << 1)
-        + ((ray_sign_is_neg[2] as usize) << 2)
-}
-
-#[derive(Copy, Clone, Debug)]
-pub struct BVH4<'a> {
-    root: Option<&'a BVH4Node<'a>>,
-    depth: usize,
-    node_count: usize,
-    _bounds: Option<&'a [BBox]>,
-}
-
-#[derive(Copy, Clone, Debug)]
-pub enum BVH4Node<'a> {
-    Internal {
-        bounds: &'a [BBox4],
-        children: &'a [BVH4Node<'a>],
-        traversal_code: u8,
-    },
-
-    Leaf {
-        object_range: (usize, usize),
-    },
-}
-
-impl<'a> BVH4<'a> {
-    pub fn from_objects<'b, T, F>(
-        arena: &'a MemArena,
-        objects: &mut [T],
-        objects_per_leaf: usize,
-        bounder: F,
-    ) -> BVH4<'a>
-    where
-        F: 'b + Fn(&T) -> &'b [BBox],
-    {
-        if objects.len() == 0 {
-            BVH4 {
-                root: None,
-                depth: 0,
-                node_count: 0,
-                _bounds: None,
-            }
-        } else {
-            let base = BVHBase::from_objects(objects, objects_per_leaf, bounder);
-
-            let fill_node = unsafe { arena.alloc_uninitialized_with_alignment::<BVH4Node>(32) };
-            let node_count = BVH4::construct_from_base(
-                arena,
-                &base,
-                &base.nodes[base.root_node_index()],
-                fill_node,
-            );
-
-            BVH4 {
-                root: Some(fill_node),
-                depth: (base.depth / 2) + 1,
-                node_count: node_count,
-                _bounds: {
-                    let range = base.nodes[base.root_node_index()].bounds_range();
-                    Some(arena.copy_slice(&base.bounds[range.0..range.1]))
-                },
-            }
-        }
-    }
-
-    pub fn tree_depth(&self) -> usize {
-        self.depth
-    }
-
-    pub fn traverse<T, F>(
-        &self,
-        rays: &mut RayBatch,
-        ray_stack: &mut RayStack,
-        objects: &[T],
-        mut obj_ray_test: F,
-    ) where
-        F: FnMut(&T, &mut RayBatch, &mut RayStack),
-    {
-        if self.root.is_none() {
-            return;
-        }
-
-        let mut trav_time: f64 = 0.0;
-        let mut timer = Timer::new();
-
-        let traversal_table =
-            &TRAVERSAL_TABLE[ray_code(rays.dir_inv_local(ray_stack.next_task_ray_idx(0)))];
-
-        // +2 of max depth for root and last child
-        let mut node_stack = [self.root.unwrap(); (BVH_MAX_DEPTH * 3) + 2];
-        let mut stack_ptr = 1;
-
-        while stack_ptr > 0 {
-            match node_stack[stack_ptr] {
-                &BVH4Node::Internal {
-                    bounds,
-                    children,
-                    traversal_code,
-                } => {
-                    let mut all_hits = Bool4::new();
-
-                    // Ray testing
-                    ray_stack.pop_do_next_task(children.len(), |ray_idx| {
-                        if rays.is_done(ray_idx) {
-                            ([0; 4], 0)
-                        } else {
-                            let hits = lerp_slice(bounds, rays.time(ray_idx)).intersect_ray(
-                                rays.orig_local(ray_idx),
-                                rays.dir_inv_local(ray_idx),
-                                rays.max_t(ray_idx),
-                            );
-
-                            if !hits.all_false() {
-                                all_hits = all_hits | hits;
-                                let mut lanes = [0u8; 4];
-                                let mut lane_count = 0;
-                                for i in 0..children.len() {
-                                    if hits.get_n(i) {
-                                        lanes[lane_count] = i as u8;
-                                        lane_count += 1;
-                                    }
-                                }
-                                (lanes, lane_count)
-                            } else {
-                                ([0; 4], 0)
-                            }
-                        }
-                    });
-
-                    // If there were any intersections, create tasks.
-                    if !all_hits.all_false() {
-                        let order_code = traversal_table[traversal_code as usize];
-                        let mut lanes = [0usize; 4];
-                        let mut lane_count = 0;
-                        for i in 0..children.len() {
-                            let inv_i = (children.len() - 1) - i;
-                            let child_i = ((order_code >> (inv_i * 2)) & 3) as usize;
-                            if all_hits.get_n(child_i) {
-                                node_stack[stack_ptr + lane_count] = &children[child_i];
-                                lanes[lane_count] = child_i;
-                                lane_count += 1;
-                            }
-                        }
-
-                        ray_stack.push_lanes_to_tasks(&lanes[..lane_count]);
-                        stack_ptr += lane_count - 1;
-                    } else {
-                        stack_ptr -= 1;
-                    }
-                }
-
-                &BVH4Node::Leaf { object_range } => {
-                    trav_time += timer.tick() as f64;
-
-                    // Set up the tasks for each object.
-                    let obj_count = object_range.1 - object_range.0;
-                    for _ in 0..(obj_count - 1) {
-                        ray_stack.duplicate_next_task();
-                    }
-
-                    // Do the ray tests.
-                    for obj in &objects[object_range.0..object_range.1] {
-                        obj_ray_test(obj, rays, ray_stack);
-                    }
-
-                    timer.tick();
-
-                    stack_ptr -= 1;
-                }
-            }
-        }
-
-        trav_time += timer.tick() as f64;
-        ACCEL_TRAV_TIME.with(|att| {
-            let v = att.get();
-            att.set(v + trav_time);
-        });
-    }
-
-    fn construct_from_base(
-        arena: &'a MemArena,
-        base: &BVHBase,
-        node: &BVHBaseNode,
-        fill_node: &mut BVH4Node<'a>,
-    ) -> usize {
-        let mut node_count = 0;
-
-        match node {
-            // Create internal node
-            &BVHBaseNode::Internal {
-                bounds_range: _,
-                children_indices,
-                split_axis,
-            } => {
-                let child_l = &base.nodes[children_indices.0];
-                let child_r = &base.nodes[children_indices.1];
-
-                // Prepare convenient access to the stuff we need.
-                let child_count: usize;
-                let children; // [Optional, Optional, Optional, Optional]
-                let split_info: SplitAxes;
-                match *child_l {
-                    BVHBaseNode::Internal {
-                        children_indices: i_l,
-                        split_axis: s_l,
-                        ..
-                    } => {
-                        match *child_r {
-                            BVHBaseNode::Internal {
-                                children_indices: i_r,
-                                split_axis: s_r,
-                                ..
-                            } => {
-                                // Four nodes
-                                child_count = 4;
-                                children = [
-                                    Some(&base.nodes[i_l.0]),
-                                    Some(&base.nodes[i_l.1]),
-                                    Some(&base.nodes[i_r.0]),
-                                    Some(&base.nodes[i_r.1]),
-                                ];
-                                split_info = SplitAxes::Full((split_axis, s_l, s_r));
-                            }
-                            BVHBaseNode::Leaf { .. } => {
-                                // Three nodes with left split
-                                child_count = 3;
-                                children = [
-                                    Some(&base.nodes[i_l.0]),
-                                    Some(&base.nodes[i_l.1]),
-                                    Some(child_r),
-                                    None,
-                                ];
-                                split_info = SplitAxes::Left((split_axis, s_l));
-                            }
-                        }
-                    }
-                    BVHBaseNode::Leaf { .. } => {
-                        match *child_r {
-                            BVHBaseNode::Internal {
-                                children_indices: i_r,
-                                split_axis: s_r,
-                                ..
-                            } => {
-                                // Three nodes with right split
-                                child_count = 3;
-                                children = [
-                                    Some(child_l),
-                                    Some(&base.nodes[i_r.0]),
-                                    Some(&base.nodes[i_r.1]),
-                                    None,
-                                ];
-                                split_info = SplitAxes::Right((split_axis, s_r));
-                            }
-                            BVHBaseNode::Leaf { .. } => {
-                                // Two nodes
-                                child_count = 2;
-                                children = [Some(child_l), Some(child_r), None, None];
-                                split_info = SplitAxes::TopOnly(split_axis);
-                            }
-                        }
-                    }
-                }
-
-                node_count += child_count;
-
-                // Construct bounds
-                let bounds = {
-                    let bounds_len = children
-                        .iter()
-                        .map(|c| {
-                            if let &Some(n) = c {
-                                let len = n.bounds_range().1 - n.bounds_range().0;
-                                debug_assert!(len >= 1);
-                                len
-                            } else {
-                                0
-                            }
-                        })
-                        .max()
-                        .unwrap();
-                    debug_assert!(bounds_len >= 1);
-                    let bounds =
-                        unsafe { arena.alloc_array_uninitialized_with_alignment(bounds_len, 32) };
-                    if bounds_len < 2 {
-                        let b1 =
-                            children[0].map_or(BBox::new(), |c| base.bounds[c.bounds_range().0]);
-                        let b2 =
-                            children[1].map_or(BBox::new(), |c| base.bounds[c.bounds_range().0]);
-                        let b3 =
-                            children[2].map_or(BBox::new(), |c| base.bounds[c.bounds_range().0]);
-                        let b4 =
-                            children[3].map_or(BBox::new(), |c| base.bounds[c.bounds_range().0]);
-                        bounds[0] = BBox4::from_bboxes(b1, b2, b3, b4);
-                    } else {
-                        for (i, b) in bounds.iter_mut().enumerate() {
-                            let time = i as f32 / (bounds_len - 1) as f32;
-
-                            let b1 = children[0].map_or(BBox::new(), |c| {
-                                let (x, y) = c.bounds_range();
-                                lerp_slice(&base.bounds[x..y], time)
-                            });
-                            let b2 = children[1].map_or(BBox::new(), |c| {
-                                let (x, y) = c.bounds_range();
-                                lerp_slice(&base.bounds[x..y], time)
-                            });
-                            let b3 = children[2].map_or(BBox::new(), |c| {
-                                let (x, y) = c.bounds_range();
-                                lerp_slice(&base.bounds[x..y], time)
-                            });
-                            let b4 = children[3].map_or(BBox::new(), |c| {
-                                let (x, y) = c.bounds_range();
-                                lerp_slice(&base.bounds[x..y], time)
-                            });
-                            *b = BBox4::from_bboxes(b1, b2, b3, b4);
-                        }
-                    }
-                    bounds
-                };
-
-                // Construct child nodes
-                let child_nodes = unsafe {
-                    arena.alloc_array_uninitialized_with_alignment::<BVH4Node>(child_count, 32)
-                };
-                for (i, c) in children[0..child_count].iter().enumerate() {
-                    node_count +=
-                        BVH4::construct_from_base(arena, base, c.unwrap(), &mut child_nodes[i]);
-                }
-
-                // Build this node
-                *fill_node = BVH4Node::Internal {
-                    bounds: bounds,
-                    children: child_nodes,
-                    traversal_code: calc_traversal_code(split_info),
-                };
-            }
-
-            // Create internal node
-            &BVHBaseNode::Leaf { object_range, .. } => {
-                *fill_node = BVH4Node::Leaf {
-                    object_range: object_range,
-                };
-                node_count += 1;
-            }
-        }
-
-        return node_count;
-    }
-}
-
-impl<'a> Boundable for BVH4<'a> {
-    fn bounds<'b>(&'b self) -> &'b [BBox] {
-        self._bounds.unwrap_or(&[])
-    }
-}
diff --git a/src/accel/mod.rs b/src/accel/mod.rs
index 1bac6d7..abbb1d4 100644
--- a/src/accel/mod.rs
+++ b/src/accel/mod.rs
@@ -1,6 +1,5 @@
 // mod bvh;
 mod bvh4;
-mod bvh4_simd;
 mod bvh_base;
 mod light_array;
 mod light_tree;
@@ -15,7 +14,7 @@ use crate::{
 
 pub use self::{
     // bvh::{BVHNode, BVH},
-    bvh4_simd::{ray_code, BVH4Node, BVH4},
+    bvh4::{ray_code, BVH4Node, BVH4},
     light_array::LightArray,
     light_tree::LightTree,
 };
diff --git a/src/light/rectangle_light.rs b/src/light/rectangle_light.rs
index 8df2890..db01072 100644
--- a/src/light/rectangle_light.rs
+++ b/src/light/rectangle_light.rs
@@ -265,7 +265,7 @@ impl<'a> Surface for RectangleLight<'a> {
     ) {
         let _ = shader; // Silence 'unused' warning
 
-        ray_stack.pop_do_next_task(0, |ray_idx| {
+        ray_stack.pop_do_next_task(|ray_idx| {
             let time = rays.time(ray_idx);
             let orig = rays.orig(ray_idx);
             let dir = rays.dir(ray_idx);
@@ -332,8 +332,6 @@ impl<'a> Surface for RectangleLight<'a> {
                     }
                 }
             }
-
-            ([0; 4], 0)
         });
     }
 }
diff --git a/src/light/sphere_light.rs b/src/light/sphere_light.rs
index 8c596a8..e17371f 100644
--- a/src/light/sphere_light.rs
+++ b/src/light/sphere_light.rs
@@ -214,7 +214,7 @@ impl<'a> Surface for SphereLight<'a> {
     ) {
         let _ = shader; // Silence 'unused' warning
 
-        ray_stack.pop_do_next_task(0, |ray_idx| {
+        ray_stack.pop_do_next_task(|ray_idx| {
             let time = rays.time(ray_idx);
 
             // Get the transform space
@@ -242,7 +242,7 @@ impl<'a> Surface for SphereLight<'a> {
             let discriminant = (b * b) - (4.0 * a * c);
             if discriminant < 0.0 {
                 // Discriminant less than zero?  No solution => no intersection.
-                return ([0; 4], 0);
+                return;
             }
             let discriminant = discriminant.sqrt();
 
@@ -268,7 +268,7 @@ impl<'a> Surface for SphereLight<'a> {
             // Check our intersection for validity against this ray's extents
             if t0 > rays.max_t(ray_idx) || t1 <= 0.0 {
                 // Didn't hit because sphere is entirely outside of ray's extents
-                return ([0; 4], 0);
+                return;
             }
 
             let t = if t0 > 0.0 {
@@ -278,7 +278,7 @@ impl<'a> Surface for SphereLight<'a> {
             } else {
                 // Didn't hit because ray is entirely within the sphere, and
                 // therefore doesn't hit its surface.
-                return ([0; 4], 0);
+                return;
             };
 
             // We hit the sphere, so calculate intersection info.
@@ -334,8 +334,6 @@ impl<'a> Surface for SphereLight<'a> {
                 // Set ray's max t
                 rays.set_max_t(ray_idx, t);
             }
-
-            ([0; 4], 0)
         });
     }
 }
diff --git a/src/ray.rs b/src/ray.rs
index 4312f32..2fa92de 100644
--- a/src/ray.rs
+++ b/src/ray.rs
@@ -1,6 +1,6 @@
 #![allow(dead_code)]
 
-use float4::Float4;
+use float4::{Bool4, Float4};
 
 use crate::math::{Matrix4x4, Point, Vector};
 
@@ -293,11 +293,31 @@ impl RayStack {
     }
 
     /// Pops the next task off the stack, and executes the provided closure for
-    /// each ray index in the task.  The return value of the closure is the list
-    /// of lanes (by index) to add the given ray index back into.
-    pub fn pop_do_next_task<F>(&mut self, needed_lanes: usize, mut handle_ray: F)
+    /// each ray index in the task.
+    pub fn pop_do_next_task<F>(&mut self, mut handle_ray: F)
     where
-        F: FnMut(usize) -> ([u8; 4], usize),
+        F: FnMut(usize),
+    {
+        // Pop the task and do necessary bookkeeping.
+        let task = self.tasks.pop().unwrap();
+        let task_range = (task.start_idx, self.lanes[task.lane].end_len);
+        self.lanes[task.lane].end_len = task.start_idx;
+
+        // Execute task.
+        for i in task_range.0..task_range.1 {
+            let ray_idx = self.lanes[task.lane].idxs[i];
+            handle_ray(ray_idx as usize);
+        }
+
+        self.lanes[task.lane].idxs.truncate(task_range.0);
+    }
+
+    /// Pops the next task off the stack, executes the provided closure for
+    /// each ray index in the task, and pushes the ray indices back onto the
+    /// indicated lanes.
+    pub fn pop_do_next_task_and_push_rays<F>(&mut self, needed_lanes: usize, mut handle_ray: F)
+    where
+        F: FnMut(usize) -> (Bool4, usize),
     {
         // Prepare lanes.
         self.ensure_lane_count(needed_lanes);
@@ -311,13 +331,15 @@ impl RayStack {
         let mut source_lane_cap = task_range.0;
         for i in task_range.0..task_range.1 {
             let ray_idx = self.lanes[task.lane].idxs[i];
-            let (add_list, list_len) = handle_ray(ray_idx as usize);
-            for &l in &add_list[..list_len] {
-                if l == task.lane as u8 {
-                    self.lanes[l as usize].idxs[source_lane_cap] = ray_idx;
-                    source_lane_cap += 1;
-                } else {
-                    self.lanes[l as usize].idxs.push(ray_idx);
+            let (push_mask, c) = handle_ray(ray_idx as usize);
+            for l in 0..c {
+                if push_mask.get_n(l) {
+                    if l == task.lane {
+                        self.lanes[l as usize].idxs[source_lane_cap] = ray_idx;
+                        source_lane_cap += 1;
+                    } else {
+                        self.lanes[l as usize].idxs.push(ray_idx);
+                    }
                 }
             }
         }
diff --git a/src/surface/triangle_mesh.rs b/src/surface/triangle_mesh.rs
index 906b7a5..1b54232 100644
--- a/src/surface/triangle_mesh.rs
+++ b/src/surface/triangle_mesh.rs
@@ -157,7 +157,7 @@ impl<'a> Surface for TriangleMesh<'a> {
                 };
 
                 // Test each ray against the current triangle.
-                ray_stack.pop_do_next_task(0, |ray_idx| {
+                ray_stack.pop_do_next_task(|ray_idx| {
                     let ray_idx = ray_idx as usize;
                     let ray_time = rays.time(ray_idx);
 
@@ -275,8 +275,6 @@ impl<'a> Surface for TriangleMesh<'a> {
                             rays.set_max_t(ray_idx, t);
                         }
                     }
-
-                    ([0; 4], 0)
                 });
             },
         );
diff --git a/src/tracer.rs b/src/tracer.rs
index 8ba78c3..e733cdd 100644
--- a/src/tracer.rs
+++ b/src/tracer.rs
@@ -12,6 +12,8 @@ use crate::{
     transform_stack::TransformStack,
 };
 
+use float4::Bool4;
+
 pub struct Tracer<'a> {
     ray_stack: RayStack,
     inner: TracerInner<'a>,
@@ -96,10 +98,10 @@ impl<'a> TracerInner<'a> {
                     // Do transforms
                     // TODO: re-divide rays based on direction (maybe?).
                     let xforms = self.xform_stack.top();
-                    ray_stack.pop_do_next_task(2, |ray_idx| {
+                    ray_stack.pop_do_next_task_and_push_rays(2, |ray_idx| {
                         let t = rays.time(ray_idx);
                         rays.update_local(ray_idx, &lerp_slice(xforms, t));
-                        ([0, 1, 0, 0], 2)
+                        (Bool4::new(true, true, false, false), 2)
                     });
                     ray_stack.push_lanes_to_tasks(&[0, 1]);
                 }
@@ -129,16 +131,14 @@ impl<'a> TracerInner<'a> {
                     // Undo transforms
                     let xforms = self.xform_stack.top();
                     if !xforms.is_empty() {
-                        ray_stack.pop_do_next_task(0, |ray_idx| {
+                        ray_stack.pop_do_next_task(|ray_idx| {
                             let t = rays.time(ray_idx);
                             rays.update_local(ray_idx, &lerp_slice(xforms, t));
-                            ([0; 4], 0)
                         });
                     } else {
                         let ident = Matrix4x4::new();
-                        ray_stack.pop_do_next_task(0, |ray_idx| {
+                        ray_stack.pop_do_next_task(|ray_idx| {
                             rays.update_local(ray_idx, &ident);
-                            ([0; 4], 0)
                         });
                     }
                 }
diff --git a/sub_crates/float4/src/lib.rs b/sub_crates/float4/src/lib.rs
index 99c0417..327fbf9 100644
--- a/sub_crates/float4/src/lib.rs
+++ b/sub_crates/float4/src/lib.rs
@@ -621,7 +621,22 @@ mod x86_64_sse {
 
     impl Bool4 {
         #[inline(always)]
-        pub fn new() -> Bool4 {
+        pub fn new(a: bool, b: bool, c: bool, d: bool) -> Bool4 {
+            use std::arch::x86_64::_mm_set_ps;
+            Bool4 {
+                data: unsafe {
+                    _mm_set_ps(
+                        if d { 1.0 } else { 0.0 },
+                        if c { 1.0 } else { 0.0 },
+                        if b { 1.0 } else { 0.0 },
+                        if a { 1.0 } else { 0.0 },
+                    )
+                },
+            }
+        }
+
+        #[inline(always)]
+        pub fn new_false() -> Bool4 {
             use std::arch::x86_64::_mm_set1_ps;
             Bool4 {
                 data: unsafe { _mm_set1_ps(0.0) },
@@ -667,7 +682,8 @@ mod x86_64_sse {
         ///
         /// This is the `OR` operation on all the contained bools.  If even
         /// one bool is true, this returns true.
-        pub fn all_false(&self) -> bool {
+        #[inline(always)]
+        pub fn is_all_false(&self) -> bool {
             let a = unsafe { *(&self.data as *const __m128 as *const u128) };
             a == 0
         }

From 874b07df0298dcc5abdcc60bb5e5fb3e89cec374 Mon Sep 17 00:00:00 2001
From: Nathan Vegdahl <cessen@cessen.com>
Date: Sat, 29 Jun 2019 07:48:33 +0900
Subject: [PATCH 11/20] Filled in missing methods on the fall-back non-SIMD
 code.

---
 sub_crates/float4/src/lib.rs | 41 ++++++++++++++++++++++++++----------
 1 file changed, 30 insertions(+), 11 deletions(-)

diff --git a/sub_crates/float4/src/lib.rs b/sub_crates/float4/src/lib.rs
index 327fbf9..0f081b3 100644
--- a/sub_crates/float4/src/lib.rs
+++ b/sub_crates/float4/src/lib.rs
@@ -680,8 +680,8 @@ mod x86_64_sse {
 
         /// Returns whether all four bools are false.
         ///
-        /// This is the `OR` operation on all the contained bools.  If even
-        /// one bool is true, this returns true.
+        /// This is the `NOT` operation on the result of `OR`ing all the
+        /// contained bools.  If even one bool is true, this returns false.
         #[inline(always)]
         pub fn is_all_false(&self) -> bool {
             let a = unsafe { *(&self.data as *const __m128 as *const u128) };
@@ -1269,21 +1269,25 @@ mod fallback {
         det
     }
 
-    /// Essentially a tuple of four bools, which will use SIMD operations
-    /// where possible on a platform.
-    #[cfg(feature = "simd_perf")]
-    #[derive(Debug, Copy, Clone)]
-    pub struct Bool4 {
-        data: bool32fx4,
-    }
-
-    #[cfg(not(feature = "simd_perf"))]
+    /// Essentially a tuple of four bools.
     #[derive(Debug, Copy, Clone)]
     pub struct Bool4 {
         data: [bool; 4],
     }
 
     impl Bool4 {
+        #[inline(always)]
+        pub fn new(a: bool, b: bool, c: bool, d: bool) -> Bool4 {
+            Bool4 { data: [a, b, c, d] }
+        }
+
+        #[inline(always)]
+        pub fn new_false() -> Bool4 {
+            Bool4 {
+                data: [false, false, false, false],
+            }
+        }
+
         /// Returns the value of the nth element.
         #[inline(always)]
         pub fn get_n(self, n: usize) -> bool {
@@ -1318,6 +1322,15 @@ mod fallback {
             self.get_n(3)
         }
 
+        /// Returns whether all four bools are false.
+        ///
+        /// This is the `NOT` operation on the result of `OR`ing all the
+        /// contained bools.  If even one bool is true, this returns false.
+        #[inline(always)]
+        pub fn is_all_false(&self) -> bool {
+            !(self.data[0] | self.data[1] | self.data[2] | self.data[3])
+        }
+
         #[inline]
         pub fn to_bitmask(self) -> u8 {
             (self.get_0() as u8)
@@ -1598,4 +1611,10 @@ mod tests {
 
         assert_eq!(r, 0b00001010);
     }
+
+    #[test]
+    fn bool4_is_all_false() {
+        assert_eq!(true, Bool4::new(false, false, false, false).is_all_false());
+        assert_eq!(false, Bool4::new(false, false, true, false).is_all_false());
+    }
 }

From 4ef376dc89a8fc7453010609cdda9841de875e34 Mon Sep 17 00:00:00 2001
From: Nathan Vegdahl <cessen@cessen.com>
Date: Sat, 29 Jun 2019 08:28:41 +0900
Subject: [PATCH 12/20] Move multiple-object logic out of BVH4.

This allows each part of Psychopath to handle the logic in the
best way, instead of a one-size-fits-all approach.
---
 src/accel/bvh4.rs            |  21 +--
 src/ray.rs                   |  27 +++-
 src/surface/triangle_mesh.rs | 261 ++++++++++++++++++-----------------
 src/tracer.rs                |  18 ++-
 4 files changed, 163 insertions(+), 164 deletions(-)

diff --git a/src/accel/bvh4.rs b/src/accel/bvh4.rs
index d7e68e1..e5298e7 100644
--- a/src/accel/bvh4.rs
+++ b/src/accel/bvh4.rs
@@ -96,14 +96,9 @@ impl<'a> BVH4<'a> {
         self.depth
     }
 
-    pub fn traverse<T, F>(
-        &self,
-        rays: &mut RayBatch,
-        ray_stack: &mut RayStack,
-        objects: &[T],
-        mut obj_ray_test: F,
-    ) where
-        F: FnMut(&T, &mut RayBatch, &mut RayStack),
+    pub fn traverse<F>(&self, rays: &mut RayBatch, ray_stack: &mut RayStack, mut obj_ray_test: F)
+    where
+        F: FnMut(std::ops::Range<usize>, &mut RayBatch, &mut RayStack),
     {
         if self.root.is_none() {
             return;
@@ -170,16 +165,8 @@ impl<'a> BVH4<'a> {
                 &BVH4Node::Leaf { object_range } => {
                     trav_time += timer.tick() as f64;
 
-                    // Set up the tasks for each object.
-                    let obj_count = object_range.1 - object_range.0;
-                    for _ in 0..(obj_count - 1) {
-                        ray_stack.duplicate_next_task();
-                    }
-
                     // Do the ray tests.
-                    for obj in &objects[object_range.0..object_range.1] {
-                        obj_ray_test(obj, rays, ray_stack);
-                    }
+                    obj_ray_test(object_range.0..object_range.1, rays, ray_stack);
 
                     timer.tick();
 
diff --git a/src/ray.rs b/src/ray.rs
index 2fa92de..97bdd39 100644
--- a/src/ray.rs
+++ b/src/ray.rs
@@ -292,24 +292,37 @@ impl RayStack {
         self.lanes[l].end_len = self.lanes[l].idxs.len();
     }
 
-    /// Pops the next task off the stack, and executes the provided closure for
-    /// each ray index in the task.
-    pub fn pop_do_next_task<F>(&mut self, mut handle_ray: F)
+    // Pops the next task off the stack.
+    pub fn pop_task(&mut self) {
+        let task = self.tasks.pop().unwrap();
+        self.lanes[task.lane].end_len = task.start_idx;
+        self.lanes[task.lane].idxs.truncate(task.start_idx);
+    }
+
+    // Executes a task without popping it from the task stack.
+    pub fn do_next_task<F>(&mut self, mut handle_ray: F)
     where
         F: FnMut(usize),
     {
-        // Pop the task and do necessary bookkeeping.
-        let task = self.tasks.pop().unwrap();
+        let task = self.tasks.last().unwrap();
         let task_range = (task.start_idx, self.lanes[task.lane].end_len);
-        self.lanes[task.lane].end_len = task.start_idx;
 
         // Execute task.
         for i in task_range.0..task_range.1 {
             let ray_idx = self.lanes[task.lane].idxs[i];
             handle_ray(ray_idx as usize);
         }
+    }
 
-        self.lanes[task.lane].idxs.truncate(task_range.0);
+    /// Pops the next task off the stack, and executes the provided closure for
+    /// each ray index in the task.
+    #[inline(always)]
+    pub fn pop_do_next_task<F>(&mut self, handle_ray: F)
+    where
+        F: FnMut(usize),
+    {
+        self.do_next_task(handle_ray);
+        self.pop_task();
     }
 
     /// Pops the next task off the stack, executes the provided closure for
diff --git a/src/surface/triangle_mesh.rs b/src/surface/triangle_mesh.rs
index 1b54232..468edf7 100644
--- a/src/surface/triangle_mesh.rs
+++ b/src/surface/triangle_mesh.rs
@@ -130,153 +130,154 @@ impl<'a> Surface for TriangleMesh<'a> {
             Matrix4x4::new()
         };
 
-        self.accel.traverse(
-            rays,
-            ray_stack,
-            self.indices,
-            |tri_indices, rays, ray_stack| {
-                // For static triangles with static transforms, cache them.
-                let is_cached = self.time_sample_count == 1 && space.len() <= 1;
-                let mut tri = if is_cached {
-                    let tri = (
-                        self.vertices[tri_indices.0 as usize],
-                        self.vertices[tri_indices.1 as usize],
-                        self.vertices[tri_indices.2 as usize],
-                    );
-                    if space.is_empty() {
-                        tri
-                    } else {
-                        (
-                            tri.0 * static_mat_space,
-                            tri.1 * static_mat_space,
-                            tri.2 * static_mat_space,
-                        )
-                    }
-                } else {
-                    unsafe { std::mem::uninitialized() }
-                };
+        self.accel
+            .traverse(rays, ray_stack, |idx_range, rays, ray_stack| {
+                for tri_idx in idx_range {
+                    let tri_indices = self.indices[tri_idx];
 
-                // Test each ray against the current triangle.
-                ray_stack.pop_do_next_task(|ray_idx| {
-                    let ray_idx = ray_idx as usize;
-                    let ray_time = rays.time(ray_idx);
-
-                    // Get triangle if necessary
-                    if !is_cached {
-                        tri = if self.time_sample_count == 1 {
-                            // No deformation motion blur, so fast-path it.
+                    // For static triangles with static transforms, cache them.
+                    let is_cached = self.time_sample_count == 1 && space.len() <= 1;
+                    let mut tri = if is_cached {
+                        let tri = (
+                            self.vertices[tri_indices.0 as usize],
+                            self.vertices[tri_indices.1 as usize],
+                            self.vertices[tri_indices.2 as usize],
+                        );
+                        if space.is_empty() {
+                            tri
+                        } else {
                             (
-                                self.vertices[tri_indices.0 as usize],
-                                self.vertices[tri_indices.1 as usize],
-                                self.vertices[tri_indices.2 as usize],
+                                tri.0 * static_mat_space,
+                                tri.1 * static_mat_space,
+                                tri.2 * static_mat_space,
                             )
-                        } else {
-                            // Deformation motion blur, need to interpolate.
-                            let p0_slice = &self.vertices[(tri_indices.0 as usize
-                                * self.time_sample_count)
-                                ..((tri_indices.0 as usize + 1) * self.time_sample_count)];
-                            let p1_slice = &self.vertices[(tri_indices.1 as usize
-                                * self.time_sample_count)
-                                ..((tri_indices.1 as usize + 1) * self.time_sample_count)];
-                            let p2_slice = &self.vertices[(tri_indices.2 as usize
-                                * self.time_sample_count)
-                                ..((tri_indices.2 as usize + 1) * self.time_sample_count)];
-
-                            let p0 = lerp_slice(p0_slice, ray_time);
-                            let p1 = lerp_slice(p1_slice, ray_time);
-                            let p2 = lerp_slice(p2_slice, ray_time);
-
-                            (p0, p1, p2)
-                        };
-                    }
-
-                    // Transform triangle if necessary, and get transform space.
-                    let mat_space = if !space.is_empty() {
-                        if space.len() > 1 {
-                            // Per-ray transform, for motion blur
-                            let mat_space = lerp_slice(space, ray_time).inverse();
-                            tri = (tri.0 * mat_space, tri.1 * mat_space, tri.2 * mat_space);
-                            mat_space
-                        } else {
-                            // Same transform for all rays
-                            if !is_cached {
-                                tri = (
-                                    tri.0 * static_mat_space,
-                                    tri.1 * static_mat_space,
-                                    tri.2 * static_mat_space,
-                                );
-                            }
-                            static_mat_space
                         }
                     } else {
-                        // No transforms
-                        Matrix4x4::new()
+                        unsafe { std::mem::uninitialized() }
                     };
 
-                    // Test ray against triangle
-                    if let Some((t, b0, b1, b2)) = triangle::intersect_ray(
-                        rays.orig(ray_idx),
-                        rays.dir(ray_idx),
-                        rays.max_t(ray_idx),
-                        tri,
-                    ) {
-                        if rays.is_occlusion(ray_idx) {
-                            isects[ray_idx] = SurfaceIntersection::Occlude;
-                            rays.mark_done(ray_idx);
-                        } else {
-                            // Calculate intersection point and error magnitudes
-                            let (pos, pos_err) = triangle::surface_point(tri, (b0, b1, b2));
+                    // Test each ray against the current triangle.
+                    ray_stack.do_next_task(|ray_idx| {
+                        let ray_idx = ray_idx as usize;
+                        let ray_time = rays.time(ray_idx);
 
-                            // Calculate geometric surface normal
-                            let geo_normal = cross(tri.0 - tri.1, tri.0 - tri.2).into_normal();
-
-                            // Calculate interpolated surface normal, if any
-                            let shading_normal = if let Some(normals) = self.normals {
-                                let n0_slice = &normals[(tri_indices.0 as usize
+                        // Get triangle if necessary
+                        if !is_cached {
+                            tri = if self.time_sample_count == 1 {
+                                // No deformation motion blur, so fast-path it.
+                                (
+                                    self.vertices[tri_indices.0 as usize],
+                                    self.vertices[tri_indices.1 as usize],
+                                    self.vertices[tri_indices.2 as usize],
+                                )
+                            } else {
+                                // Deformation motion blur, need to interpolate.
+                                let p0_slice = &self.vertices[(tri_indices.0 as usize
                                     * self.time_sample_count)
                                     ..((tri_indices.0 as usize + 1) * self.time_sample_count)];
-                                let n1_slice = &normals[(tri_indices.1 as usize
+                                let p1_slice = &self.vertices[(tri_indices.1 as usize
                                     * self.time_sample_count)
                                     ..((tri_indices.1 as usize + 1) * self.time_sample_count)];
-                                let n2_slice = &normals[(tri_indices.2 as usize
+                                let p2_slice = &self.vertices[(tri_indices.2 as usize
                                     * self.time_sample_count)
                                     ..((tri_indices.2 as usize + 1) * self.time_sample_count)];
 
-                                let n0 = lerp_slice(n0_slice, ray_time).normalized();
-                                let n1 = lerp_slice(n1_slice, ray_time).normalized();
-                                let n2 = lerp_slice(n2_slice, ray_time).normalized();
+                                let p0 = lerp_slice(p0_slice, ray_time);
+                                let p1 = lerp_slice(p1_slice, ray_time);
+                                let p2 = lerp_slice(p2_slice, ray_time);
 
-                                let s_nor = ((n0 * b0) + (n1 * b1) + (n2 * b2)) * mat_space;
-                                if dot(s_nor, geo_normal) >= 0.0 {
-                                    s_nor
-                                } else {
-                                    -s_nor
-                                }
-                            } else {
-                                geo_normal
+                                (p0, p1, p2)
                             };
-
-                            let intersection_data = SurfaceIntersectionData {
-                                incoming: rays.dir(ray_idx),
-                                t: t,
-                                pos: pos,
-                                pos_err: pos_err,
-                                nor: shading_normal,
-                                nor_g: geo_normal,
-                                local_space: mat_space,
-                                sample_pdf: 0.0,
-                            };
-
-                            // Fill in intersection data
-                            isects[ray_idx] = SurfaceIntersection::Hit {
-                                intersection_data: intersection_data,
-                                closure: shader.shade(&intersection_data, ray_time),
-                            };
-                            rays.set_max_t(ray_idx, t);
                         }
-                    }
-                });
-            },
-        );
+
+                        // Transform triangle if necessary, and get transform space.
+                        let mat_space = if !space.is_empty() {
+                            if space.len() > 1 {
+                                // Per-ray transform, for motion blur
+                                let mat_space = lerp_slice(space, ray_time).inverse();
+                                tri = (tri.0 * mat_space, tri.1 * mat_space, tri.2 * mat_space);
+                                mat_space
+                            } else {
+                                // Same transform for all rays
+                                if !is_cached {
+                                    tri = (
+                                        tri.0 * static_mat_space,
+                                        tri.1 * static_mat_space,
+                                        tri.2 * static_mat_space,
+                                    );
+                                }
+                                static_mat_space
+                            }
+                        } else {
+                            // No transforms
+                            Matrix4x4::new()
+                        };
+
+                        // Test ray against triangle
+                        if let Some((t, b0, b1, b2)) = triangle::intersect_ray(
+                            rays.orig(ray_idx),
+                            rays.dir(ray_idx),
+                            rays.max_t(ray_idx),
+                            tri,
+                        ) {
+                            if rays.is_occlusion(ray_idx) {
+                                isects[ray_idx] = SurfaceIntersection::Occlude;
+                                rays.mark_done(ray_idx);
+                            } else {
+                                // Calculate intersection point and error magnitudes
+                                let (pos, pos_err) = triangle::surface_point(tri, (b0, b1, b2));
+
+                                // Calculate geometric surface normal
+                                let geo_normal = cross(tri.0 - tri.1, tri.0 - tri.2).into_normal();
+
+                                // Calculate interpolated surface normal, if any
+                                let shading_normal = if let Some(normals) = self.normals {
+                                    let n0_slice = &normals[(tri_indices.0 as usize
+                                        * self.time_sample_count)
+                                        ..((tri_indices.0 as usize + 1) * self.time_sample_count)];
+                                    let n1_slice = &normals[(tri_indices.1 as usize
+                                        * self.time_sample_count)
+                                        ..((tri_indices.1 as usize + 1) * self.time_sample_count)];
+                                    let n2_slice = &normals[(tri_indices.2 as usize
+                                        * self.time_sample_count)
+                                        ..((tri_indices.2 as usize + 1) * self.time_sample_count)];
+
+                                    let n0 = lerp_slice(n0_slice, ray_time).normalized();
+                                    let n1 = lerp_slice(n1_slice, ray_time).normalized();
+                                    let n2 = lerp_slice(n2_slice, ray_time).normalized();
+
+                                    let s_nor = ((n0 * b0) + (n1 * b1) + (n2 * b2)) * mat_space;
+                                    if dot(s_nor, geo_normal) >= 0.0 {
+                                        s_nor
+                                    } else {
+                                        -s_nor
+                                    }
+                                } else {
+                                    geo_normal
+                                };
+
+                                let intersection_data = SurfaceIntersectionData {
+                                    incoming: rays.dir(ray_idx),
+                                    t: t,
+                                    pos: pos,
+                                    pos_err: pos_err,
+                                    nor: shading_normal,
+                                    nor_g: geo_normal,
+                                    local_space: mat_space,
+                                    sample_pdf: 0.0,
+                                };
+
+                                // Fill in intersection data
+                                isects[ray_idx] = SurfaceIntersection::Hit {
+                                    intersection_data: intersection_data,
+                                    closure: shader.shade(&intersection_data, ray_time),
+                                };
+                                rays.set_max_t(ray_idx, t);
+                            }
+                        }
+                    });
+                }
+                ray_stack.pop_task();
+            });
     }
 }
diff --git a/src/tracer.rs b/src/tracer.rs
index e733cdd..7969d8d 100644
--- a/src/tracer.rs
+++ b/src/tracer.rs
@@ -85,11 +85,11 @@ impl<'a> TracerInner<'a> {
         rays: &mut RayBatch,
         ray_stack: &mut RayStack,
     ) {
-        assembly.object_accel.traverse(
-            rays,
-            ray_stack,
-            &assembly.instances[..],
-            |inst, rays, ray_stack| {
+        assembly
+            .object_accel
+            .traverse(rays, ray_stack, |idx_range, rays, ray_stack| {
+                let inst = &assembly.instances[idx_range.start];
+
                 // Transform rays if needed
                 if let Some((xstart, xend)) = inst.transform_indices {
                     // Push transforms to stack
@@ -98,12 +98,11 @@ impl<'a> TracerInner<'a> {
                     // Do transforms
                     // TODO: re-divide rays based on direction (maybe?).
                     let xforms = self.xform_stack.top();
-                    ray_stack.pop_do_next_task_and_push_rays(2, |ray_idx| {
+                    ray_stack.do_next_task(|ray_idx| {
                         let t = rays.time(ray_idx);
                         rays.update_local(ray_idx, &lerp_slice(xforms, t));
-                        (Bool4::new(true, true, false, false), 2)
                     });
-                    ray_stack.push_lanes_to_tasks(&[0, 1]);
+                    ray_stack.duplicate_next_task();
                 }
 
                 // Trace rays
@@ -142,8 +141,7 @@ impl<'a> TracerInner<'a> {
                         });
                     }
                 }
-            },
-        );
+            });
     }
 
     fn trace_object<'b>(

From 14b16896ac0ce5adc5eed16f3fc50da6f43b556b Mon Sep 17 00:00:00 2001
From: Nathan Vegdahl <cessen@cessen.com>
Date: Sat, 29 Jun 2019 08:41:11 +0900
Subject: [PATCH 13/20] Fix some compiler warnings.

---
 src/main.rs   | 1 -
 src/tracer.rs | 2 --
 2 files changed, 3 deletions(-)

diff --git a/src/main.rs b/src/main.rs
index bd18195..9a349dc 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -11,7 +11,6 @@
 #![allow(clippy::needless_range_loop)]
 #![allow(clippy::excessive_precision)]
 
-#[macro_use]
 extern crate lazy_static;
 
 mod accel;
diff --git a/src/tracer.rs b/src/tracer.rs
index 7969d8d..d689d44 100644
--- a/src/tracer.rs
+++ b/src/tracer.rs
@@ -12,8 +12,6 @@ use crate::{
     transform_stack::TransformStack,
 };
 
-use float4::Bool4;
-
 pub struct Tracer<'a> {
     ray_stack: RayStack,
     inner: TracerInner<'a>,

From 5a53d7f6f6ab65ce244e33e2969abf28e43f5507 Mon Sep 17 00:00:00 2001
From: Nathan Vegdahl <cessen@cessen.com>
Date: Sat, 29 Jun 2019 09:20:04 +0900
Subject: [PATCH 14/20] Added some additional ray tracing stats.

---
 src/main.rs     | 5 +++++
 src/renderer.rs | 4 ++++
 src/tracer.rs   | 7 +++++++
 3 files changed, 16 insertions(+)

diff --git a/src/main.rs b/src/main.rs
index 9a349dc..753e074 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -292,6 +292,11 @@ fn main() {
                         "\t\tTrace:                  {:.3}s",
                         ntime * rstats.trace_time
                     );
+                    println!("\t\t\tRays traced:          {}", rstats.ray_count);
+                    println!(
+                        "\t\t\tRays/sec:             {}",
+                        (rstats.ray_count as f64 / (ntime * rstats.trace_time) as f64) as u64
+                    );
                     println!(
                         "\t\t\tTraversal:            {:.3}s",
                         ntime * rstats.accel_traversal_time
diff --git a/src/renderer.rs b/src/renderer.rs
index 2c3077d..4d2104e 100644
--- a/src/renderer.rs
+++ b/src/renderer.rs
@@ -42,6 +42,7 @@ pub struct RenderStats {
     pub trace_time: f64,
     pub accel_traversal_time: f64,
     pub accel_node_visits: u64,
+    pub ray_count: u64,
     pub initial_ray_generation_time: f64,
     pub ray_generation_time: f64,
     pub sample_writing_time: f64,
@@ -54,6 +55,7 @@ impl RenderStats {
             trace_time: 0.0,
             accel_traversal_time: 0.0,
             accel_node_visits: 0,
+            ray_count: 0,
             initial_ray_generation_time: 0.0,
             ray_generation_time: 0.0,
             sample_writing_time: 0.0,
@@ -65,6 +67,7 @@ impl RenderStats {
         self.trace_time += other.trace_time;
         self.accel_traversal_time += other.accel_traversal_time;
         self.accel_node_visits += other.accel_node_visits;
+        self.ray_count += other.ray_count;
         self.initial_ray_generation_time += other.initial_ray_generation_time;
         self.ray_generation_time += other.ray_generation_time;
         self.sample_writing_time += other.sample_writing_time;
@@ -344,6 +347,7 @@ impl<'a> Renderer<'a> {
         }
 
         stats.total_time += total_timer.tick() as f64;
+        stats.ray_count = tracer.rays_traced();
         ACCEL_TRAV_TIME.with(|att| {
             stats.accel_traversal_time = att.get();
             att.set(0.0);
diff --git a/src/tracer.rs b/src/tracer.rs
index d689d44..d3b5b09 100644
--- a/src/tracer.rs
+++ b/src/tracer.rs
@@ -13,6 +13,7 @@ use crate::{
 };
 
 pub struct Tracer<'a> {
+    ray_trace_count: u64,
     ray_stack: RayStack,
     inner: TracerInner<'a>,
 }
@@ -20,6 +21,7 @@ pub struct Tracer<'a> {
 impl<'a> Tracer<'a> {
     pub fn from_assembly(assembly: &'a Assembly) -> Tracer<'a> {
         Tracer {
+            ray_trace_count: 0,
             ray_stack: RayStack::new(),
             inner: TracerInner {
                 root: assembly,
@@ -30,8 +32,13 @@ impl<'a> Tracer<'a> {
     }
 
     pub fn trace<'b>(&'b mut self, rays: &mut RayBatch) -> &'b [SurfaceIntersection] {
+        self.ray_trace_count += rays.len() as u64;
         self.inner.trace(rays, &mut self.ray_stack)
     }
+
+    pub fn rays_traced(&self) -> u64 {
+        self.ray_trace_count
+    }
 }
 
 struct TracerInner<'a> {

From 68fba19fc6ed0ee2b1a17a30b33d5af2cd9af72d Mon Sep 17 00:00:00 2001
From: Nathan Vegdahl <cessen@cessen.com>
Date: Sat, 29 Jun 2019 09:46:39 +0900
Subject: [PATCH 15/20] Removed a timer from a hot loop, and fixed node-test
 stat.

Gives a small performance boost, and now ray/node tests are
actually reported correctly.  Yay!
---
 src/accel/bvh4.rs | 13 +------------
 src/main.rs       |  4 ----
 2 files changed, 1 insertion(+), 16 deletions(-)

diff --git a/src/accel/bvh4.rs b/src/accel/bvh4.rs
index e5298e7..7b67540 100644
--- a/src/accel/bvh4.rs
+++ b/src/accel/bvh4.rs
@@ -104,8 +104,6 @@ impl<'a> BVH4<'a> {
             return;
         }
 
-        let mut timer = Timer::new();
-        let mut trav_time: f64 = 0.0;
         let mut node_tests: u64 = 0;
 
         let traversal_table =
@@ -116,13 +114,13 @@ impl<'a> BVH4<'a> {
         let mut stack_ptr = 1;
 
         while stack_ptr > 0 {
-            node_tests += ray_stack.ray_count_in_next_task() as u64;
             match node_stack[stack_ptr] {
                 &BVH4Node::Internal {
                     bounds,
                     children,
                     traversal_code,
                 } => {
+                    node_tests += ray_stack.ray_count_in_next_task() as u64;
                     let mut all_hits = Bool4::new_false();
 
                     // Ray testing
@@ -163,23 +161,14 @@ impl<'a> BVH4<'a> {
                 }
 
                 &BVH4Node::Leaf { object_range } => {
-                    trav_time += timer.tick() as f64;
-
                     // Do the ray tests.
                     obj_ray_test(object_range.0..object_range.1, rays, ray_stack);
 
-                    timer.tick();
-
                     stack_ptr -= 1;
                 }
             }
         }
 
-        trav_time += timer.tick() as f64;
-        ACCEL_TRAV_TIME.with(|att| {
-            let v = att.get();
-            att.set(v + trav_time);
-        });
         ACCEL_NODE_RAY_TESTS.with(|anv| {
             let v = anv.get();
             anv.set(v + node_tests);
diff --git a/src/main.rs b/src/main.rs
index 753e074..f469e98 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -297,10 +297,6 @@ fn main() {
                         "\t\t\tRays/sec:             {}",
                         (rstats.ray_count as f64 / (ntime * rstats.trace_time) as f64) as u64
                     );
-                    println!(
-                        "\t\t\tTraversal:            {:.3}s",
-                        ntime * rstats.accel_traversal_time
-                    );
                     println!("\t\t\tRay/node tests:       {}", rstats.accel_node_visits);
                     println!(
                         "\t\tInitial ray generation: {:.3}s",

From 2a0ca001e2fffcf216995a7fbf1724cc0520affe Mon Sep 17 00:00:00 2001
From: Nathan Vegdahl <cessen@cessen.com>
Date: Sat, 29 Jun 2019 12:39:28 +0900
Subject: [PATCH 16/20] Optimized ray stack task duplication with memcopy.

---
 Cargo.lock |  7 +++++++
 Cargo.toml |  1 +
 src/ray.rs | 13 ++++++++++---
 3 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index bc8f7b6..7e4b4ef 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -100,6 +100,11 @@ dependencies = [
 name = "color"
 version = "0.1.0"
 
+[[package]]
+name = "copy_in_place"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+
 [[package]]
 name = "crossbeam"
 version = "0.3.2"
@@ -239,6 +244,7 @@ dependencies = [
  "bvh_order 0.1.0",
  "clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)",
  "color 0.1.0",
+ "copy_in_place 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
  "crossbeam 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)",
  "float4 0.1.0",
  "half 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -557,6 +563,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 "checksum cfg-if 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)" = "b486ce3ccf7ffd79fdeb678eac06a9e6c09fc88d33836340becb8fffe87c5e33"
 "checksum clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)" = "5067f5bb2d80ef5d68b4c87db81601f0b75bca627bc2ef76b141d7b846a3c6d9"
 "checksum cloudabi 0.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ddfc5b9aa5d4507acaf872de71051dfd0e309860e88966e1051e462a077aac4f"
+"checksum copy_in_place 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b792a46b1ef44bb5e9a04721d34e186522431be965a283437107843d62ddbaad"
 "checksum crossbeam 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)" = "24ce9782d4d5c53674646a6a4c1863a21a8fc0cb649b3c94dfc16e45071dea19"
 "checksum fnv 1.0.6 (registry+https://github.com/rust-lang/crates.io-index)" = "2fad85553e09a6f881f739c29f0b00b0f01357c743266d478b68951ce23285f3"
 "checksum fuchsia-cprng 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "a06f77d526c1a601b7c4cdd98f54b5eaabffc14d5f2f0296febdc7f357c6d3ba"
diff --git a/Cargo.toml b/Cargo.toml
index 1e51807..14ee2ac 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -25,6 +25,7 @@ debug = true
 # Crates.io dependencies
 base64 = "0.9"
 clap = "2.30"
+copy_in_place = "0.2.0"
 crossbeam = "0.3"
 half = "1.0"
 lazy_static = "1.0"
diff --git a/src/ray.rs b/src/ray.rs
index 97bdd39..9727522 100644
--- a/src/ray.rs
+++ b/src/ray.rs
@@ -279,11 +279,18 @@ impl RayStack {
         let start = task.start_idx;
         let end = self.lanes[l].end_len;
 
-        for i in start..end {
-            let idx = self.lanes[l].idxs[i];
-            self.lanes[l].idxs.push(idx);
+        // Extend the indices vector
+        self.lanes[l].idxs.reserve(end - start);
+        let old_len = self.lanes[l].idxs.len();
+        let new_len = old_len + end - start;
+        unsafe {
+            self.lanes[l].idxs.set_len(new_len);
         }
 
+        // Copy elements
+        copy_in_place::copy_in_place(&mut self.lanes[l].idxs, start..end, end);
+
+        // Push the new task onto the stack
         self.tasks.push(RayTask {
             lane: l,
             start_idx: end,

From c4b8971805dd43fab528fda3abb3b37f55451d5b Mon Sep 17 00:00:00 2001
From: Nathan Vegdahl <cessen@cessen.com>
Date: Sat, 29 Jun 2019 12:43:24 +0900
Subject: [PATCH 17/20] Clean up compiler warnings.

---
 src/accel/bvh4.rs | 3 +--
 src/accel/mod.rs  | 1 -
 src/renderer.rs   | 9 +--------
 3 files changed, 2 insertions(+), 11 deletions(-)

diff --git a/src/accel/bvh4.rs b/src/accel/bvh4.rs
index 7b67540..3708c64 100644
--- a/src/accel/bvh4.rs
+++ b/src/accel/bvh4.rs
@@ -13,12 +13,11 @@ use crate::{
     lerp::lerp_slice,
     math::Vector,
     ray::{RayBatch, RayStack},
-    timer::Timer,
 };
 
 use super::{
     bvh_base::{BVHBase, BVHBaseNode, BVH_MAX_DEPTH},
-    ACCEL_NODE_RAY_TESTS, ACCEL_TRAV_TIME,
+    ACCEL_NODE_RAY_TESTS,
 };
 
 use bvh_order::{calc_traversal_code, SplitAxes, TRAVERSAL_TABLE};
diff --git a/src/accel/mod.rs b/src/accel/mod.rs
index abbb1d4..ba83a3a 100644
--- a/src/accel/mod.rs
+++ b/src/accel/mod.rs
@@ -21,7 +21,6 @@ pub use self::{
 
 // Track BVH traversal time
 thread_local! {
-    pub static ACCEL_TRAV_TIME: Cell<f64> = Cell::new(0.0);
     pub static ACCEL_NODE_RAY_TESTS: Cell<u64> = Cell::new(0);
 }
 
diff --git a/src/renderer.rs b/src/renderer.rs
index 4d2104e..50d3061 100644
--- a/src/renderer.rs
+++ b/src/renderer.rs
@@ -12,7 +12,7 @@ use scoped_threadpool::Pool;
 use float4::Float4;
 
 use crate::{
-    accel::{ACCEL_NODE_RAY_TESTS, ACCEL_TRAV_TIME},
+    accel::ACCEL_NODE_RAY_TESTS,
     color::{map_0_1_to_wavelength, SpectralSample, XYZ},
     fp_utils::robust_ray_origin,
     hash::hash_u32,
@@ -40,7 +40,6 @@ pub struct Renderer<'a> {
 #[derive(Debug, Copy, Clone)]
 pub struct RenderStats {
     pub trace_time: f64,
-    pub accel_traversal_time: f64,
     pub accel_node_visits: u64,
     pub ray_count: u64,
     pub initial_ray_generation_time: f64,
@@ -53,7 +52,6 @@ impl RenderStats {
     fn new() -> RenderStats {
         RenderStats {
             trace_time: 0.0,
-            accel_traversal_time: 0.0,
             accel_node_visits: 0,
             ray_count: 0,
             initial_ray_generation_time: 0.0,
@@ -65,7 +63,6 @@ impl RenderStats {
 
     fn collect(&mut self, other: RenderStats) {
         self.trace_time += other.trace_time;
-        self.accel_traversal_time += other.accel_traversal_time;
         self.accel_node_visits += other.accel_node_visits;
         self.ray_count += other.ray_count;
         self.initial_ray_generation_time += other.initial_ray_generation_time;
@@ -348,10 +345,6 @@ impl<'a> Renderer<'a> {
 
         stats.total_time += total_timer.tick() as f64;
         stats.ray_count = tracer.rays_traced();
-        ACCEL_TRAV_TIME.with(|att| {
-            stats.accel_traversal_time = att.get();
-            att.set(0.0);
-        });
         ACCEL_NODE_RAY_TESTS.with(|anv| {
             stats.accel_node_visits = anv.get();
             anv.set(0);

From 4f7335db8c5e614b3e053751c0d706eb279dc0c0 Mon Sep 17 00:00:00 2001
From: Nathan Vegdahl <cessen@cessen.com>
Date: Sat, 29 Jun 2019 14:20:32 +0900
Subject: [PATCH 18/20] Misc optimizations that add up to a nice speed boost.

---
 src/accel/bvh4.rs | 34 +++++++++++++++-----------
 src/ray.rs        | 61 +++++++++++++++++++++++++++++------------------
 2 files changed, 58 insertions(+), 37 deletions(-)

diff --git a/src/accel/bvh4.rs b/src/accel/bvh4.rs
index 3708c64..5b09e0f 100644
--- a/src/accel/bvh4.rs
+++ b/src/accel/bvh4.rs
@@ -125,34 +125,40 @@ impl<'a> BVH4<'a> {
                     // Ray testing
                     ray_stack.pop_do_next_task_and_push_rays(children.len(), |ray_idx| {
                         if rays.is_done(ray_idx) {
-                            (Bool4::new_false(), 0)
+                            Bool4::new_false()
                         } else {
-                            let hits = lerp_slice(bounds, rays.time(ray_idx)).intersect_ray(
-                                rays.orig_local(ray_idx),
-                                rays.dir_inv_local(ray_idx),
-                                rays.max_t(ray_idx),
-                            );
+                            let hits = if bounds.len() == 1 {
+                                bounds[0].intersect_ray(
+                                    rays.orig_local(ray_idx),
+                                    rays.dir_inv_local(ray_idx),
+                                    rays.max_t(ray_idx),
+                                )
+                            } else {
+                                lerp_slice(bounds, rays.time(ray_idx)).intersect_ray(
+                                    rays.orig_local(ray_idx),
+                                    rays.dir_inv_local(ray_idx),
+                                    rays.max_t(ray_idx),
+                                )
+                            };
                             all_hits = all_hits | hits;
-                            (hits, children.len())
+                            hits
                         }
                     });
 
                     // If there were any intersections, create tasks.
                     if !all_hits.is_all_false() {
                         let order_code = traversal_table[traversal_code as usize];
-                        let mut lanes = [0usize; 4];
                         let mut lane_count = 0;
-                        for i in 0..children.len() {
-                            let inv_i = (children.len() - 1) - i;
-                            let child_i = ((order_code >> (inv_i * 2)) & 3) as usize;
-                            if all_hits.get_n(child_i) {
+                        let mut i = children.len() as u8;
+                        while i > 0 {
+                            i -= 1;
+                            let child_i = ((order_code >> (i * 2)) & 3) as usize;
+                            if ray_stack.push_lane_to_task(child_i) {
                                 node_stack[stack_ptr + lane_count] = &children[child_i];
-                                lanes[lane_count] = child_i;
                                 lane_count += 1;
                             }
                         }
 
-                        ray_stack.push_lanes_to_tasks(&lanes[..lane_count]);
                         stack_ptr += lane_count - 1;
                     } else {
                         stack_ptr -= 1;
diff --git a/src/ray.rs b/src/ray.rs
index 9727522..7c2bc83 100644
--- a/src/ray.rs
+++ b/src/ray.rs
@@ -259,17 +259,29 @@ impl RayStack {
         self.lanes[lane].idxs.push(ray_idx as RayIndexType);
     }
 
+    /// Pushes any excess indices on the given lane to a new task on the
+    /// task stack.
+    ///
+    /// Returns whether a task was pushed or not.  No task will be pushed
+    /// if there are no excess indices on the end of the lane.
+    pub fn push_lane_to_task(&mut self, lane_idx: usize) -> bool {
+        if self.lanes[lane_idx].end_len < self.lanes[lane_idx].idxs.len() {
+            self.tasks.push(RayTask {
+                lane: lane_idx,
+                start_idx: self.lanes[lane_idx].end_len,
+            });
+            self.lanes[lane_idx].end_len = self.lanes[lane_idx].idxs.len();
+            true
+        } else {
+            false
+        }
+    }
+
     /// Takes the given list of lane indices, and pushes any excess indices on
     /// the end of each into a new task, in the order provided.
     pub fn push_lanes_to_tasks(&mut self, lane_idxs: &[usize]) {
         for &l in lane_idxs {
-            if self.lanes[l].end_len < self.lanes[l].idxs.len() {
-                self.tasks.push(RayTask {
-                    lane: l,
-                    start_idx: self.lanes[l].end_len,
-                });
-                self.lanes[l].end_len = self.lanes[l].idxs.len();
-            }
+            self.push_lane_to_task(l);
         }
     }
 
@@ -335,35 +347,38 @@ impl RayStack {
     /// Pops the next task off the stack, executes the provided closure for
     /// each ray index in the task, and pushes the ray indices back onto the
     /// indicated lanes.
-    pub fn pop_do_next_task_and_push_rays<F>(&mut self, needed_lanes: usize, mut handle_ray: F)
+    pub fn pop_do_next_task_and_push_rays<F>(&mut self, output_lane_count: usize, mut handle_ray: F)
     where
-        F: FnMut(usize) -> (Bool4, usize),
+        F: FnMut(usize) -> Bool4,
     {
-        // Prepare lanes.
-        self.ensure_lane_count(needed_lanes);
-
         // Pop the task and do necessary bookkeeping.
         let task = self.tasks.pop().unwrap();
         let task_range = (task.start_idx, self.lanes[task.lane].end_len);
         self.lanes[task.lane].end_len = task.start_idx;
 
+        // SAFETY: this is probably evil, and depends on behavior of Vec that
+        // is not actually promised.  But we're essentially truncating the lane
+        // to the start of our task range, but will continue to access its
+        // elements beyond that range via `get_unchecked()` below.  Because the
+        // memory is neither freed nor altered, this is safe.  However, again, the
+        // Vec APIs don't promise this behavior.  So:
+        //
+        // TODO: build a slightly different lane abstraction to get this same
+        // efficiency without depending on implicit Vec behavior.
+        unsafe {
+            self.lanes[task.lane].idxs.set_len(task.start_idx);
+        }
+
         // Execute task.
-        let mut source_lane_cap = task_range.0;
         for i in task_range.0..task_range.1 {
-            let ray_idx = self.lanes[task.lane].idxs[i];
-            let (push_mask, c) = handle_ray(ray_idx as usize);
-            for l in 0..c {
+            let ray_idx = *unsafe { self.lanes[task.lane].idxs.get_unchecked(i) };
+            let push_mask = handle_ray(ray_idx as usize);
+            for l in 0..output_lane_count {
                 if push_mask.get_n(l) {
-                    if l == task.lane {
-                        self.lanes[l as usize].idxs[source_lane_cap] = ray_idx;
-                        source_lane_cap += 1;
-                    } else {
-                        self.lanes[l as usize].idxs.push(ray_idx);
-                    }
+                    self.lanes[l as usize].idxs.push(ray_idx);
                 }
             }
         }
-        self.lanes[task.lane].idxs.truncate(source_lane_cap);
     }
 }
 

From 4b612e2d1ae87b8b52aa875ee9ee1c2e1581650a Mon Sep 17 00:00:00 2001
From: Nathan Vegdahl <cessen@cessen.com>
Date: Sat, 6 Jul 2019 09:01:24 +0900
Subject: [PATCH 19/20] Leaf triangle intersection now loops over triangles per
 ray.

This is the inverse of what was being done before, which was to
loop over all of the rays for each triangle.  At the moment, this
actually appears to be a tiny bit slower, but it should allow for
future optimizations that test a ray against multiple triangles at once.
---
 src/surface/triangle_mesh.rs | 211 ++++++++++++++++++++---------------
 1 file changed, 118 insertions(+), 93 deletions(-)

diff --git a/src/surface/triangle_mesh.rs b/src/surface/triangle_mesh.rs
index 468edf7..967f90c 100644
--- a/src/surface/triangle_mesh.rs
+++ b/src/surface/triangle_mesh.rs
@@ -14,6 +14,8 @@ use crate::{
 
 use super::{triangle, Surface, SurfaceIntersection, SurfaceIntersectionData};
 
+const MAX_LEAF_TRIANGLE_COUNT: usize = 3;
+
 #[derive(Copy, Clone, Debug)]
 pub struct TriangleMesh<'a> {
     time_sample_count: usize,
@@ -93,7 +95,7 @@ impl<'a> TriangleMesh<'a> {
         };
 
         // Build BVH
-        let accel = BVH4::from_objects(arena, &mut indices[..], 3, |tri| {
+        let accel = BVH4::from_objects(arena, &mut indices[..], MAX_LEAF_TRIANGLE_COUNT, |tri| {
             &bounds
                 [(tri.3 as usize * time_sample_count)..((tri.3 as usize + 1) * time_sample_count)]
         });
@@ -132,38 +134,64 @@ impl<'a> Surface for TriangleMesh<'a> {
 
         self.accel
             .traverse(rays, ray_stack, |idx_range, rays, ray_stack| {
-                for tri_idx in idx_range {
-                    let tri_indices = self.indices[tri_idx];
+                let tri_count = idx_range.end - idx_range.start;
 
-                    // For static triangles with static transforms, cache them.
-                    let is_cached = self.time_sample_count == 1 && space.len() <= 1;
-                    let mut tri = if is_cached {
-                        let tri = (
+                // Build the triangle cache if we can!
+                let is_cached = ray_stack.ray_count_in_next_task() >= tri_count
+                    && self.time_sample_count == 1
+                    && space.len() <= 1;
+                let mut tri_cache = [unsafe { std::mem::uninitialized() }; MAX_LEAF_TRIANGLE_COUNT];
+                if is_cached {
+                    for tri_idx in idx_range.clone() {
+                        let i = tri_idx - idx_range.start;
+                        let tri_indices = self.indices[tri_idx];
+
+                        // For static triangles with static transforms, cache them.
+                        tri_cache[i] = (
                             self.vertices[tri_indices.0 as usize],
                             self.vertices[tri_indices.1 as usize],
                             self.vertices[tri_indices.2 as usize],
                         );
-                        if space.is_empty() {
-                            tri
-                        } else {
-                            (
-                                tri.0 * static_mat_space,
-                                tri.1 * static_mat_space,
-                                tri.2 * static_mat_space,
-                            )
+                        if !space.is_empty() {
+                            tri_cache[i].0 = tri_cache[i].0 * static_mat_space;
+                            tri_cache[i].1 = tri_cache[i].1 * static_mat_space;
+                            tri_cache[i].2 = tri_cache[i].2 * static_mat_space;
                         }
+                    }
+                }
+
+                // Test each ray against the triangles.
+                ray_stack.do_next_task(|ray_idx| {
+                    let ray_idx = ray_idx as usize;
+
+                    if rays.is_done(ray_idx) {
+                        return;
+                    }
+
+                    let ray_time = rays.time(ray_idx);
+
+                    // Calculate the ray space, if necessary.
+                    let mat_space = if space.len() > 1 {
+                        // Per-ray transform, for motion blur
+                        lerp_slice(space, ray_time).inverse()
                     } else {
-                        unsafe { std::mem::uninitialized() }
+                        static_mat_space
                     };
 
-                    // Test each ray against the current triangle.
-                    ray_stack.do_next_task(|ray_idx| {
-                        let ray_idx = ray_idx as usize;
-                        let ray_time = rays.time(ray_idx);
+                    // Iterate through the triangles and test the ray against them.
+                    let mut non_shadow_hit = false;
+                    let mut hit_tri = unsafe { std::mem::uninitialized() };
+                    let mut hit_tri_indices = unsafe { std::mem::uninitialized() };
+                    let mut hit_tri_data = unsafe { std::mem::uninitialized() };
+                    for tri_idx in idx_range.clone() {
+                        let tri_indices = self.indices[tri_idx];
 
                         // Get triangle if necessary
-                        if !is_cached {
-                            tri = if self.time_sample_count == 1 {
+                        let tri = if is_cached {
+                            let i = tri_idx - idx_range.start;
+                            tri_cache[i]
+                        } else {
+                            let mut tri = if self.time_sample_count == 1 {
                                 // No deformation motion blur, so fast-path it.
                                 (
                                     self.vertices[tri_indices.0 as usize],
@@ -188,29 +216,14 @@ impl<'a> Surface for TriangleMesh<'a> {
 
                                 (p0, p1, p2)
                             };
-                        }
 
-                        // Transform triangle if necessary, and get transform space.
-                        let mat_space = if !space.is_empty() {
-                            if space.len() > 1 {
-                                // Per-ray transform, for motion blur
-                                let mat_space = lerp_slice(space, ray_time).inverse();
-                                tri = (tri.0 * mat_space, tri.1 * mat_space, tri.2 * mat_space);
-                                mat_space
-                            } else {
-                                // Same transform for all rays
-                                if !is_cached {
-                                    tri = (
-                                        tri.0 * static_mat_space,
-                                        tri.1 * static_mat_space,
-                                        tri.2 * static_mat_space,
-                                    );
-                                }
-                                static_mat_space
+                            if !space.is_empty() {
+                                tri.0 = tri.0 * mat_space;
+                                tri.1 = tri.1 * mat_space;
+                                tri.2 = tri.2 * mat_space;
                             }
-                        } else {
-                            // No transforms
-                            Matrix4x4::new()
+
+                            tri
                         };
 
                         // Test ray against triangle
@@ -223,60 +236,72 @@ impl<'a> Surface for TriangleMesh<'a> {
                             if rays.is_occlusion(ray_idx) {
                                 isects[ray_idx] = SurfaceIntersection::Occlude;
                                 rays.mark_done(ray_idx);
+                                break;
                             } else {
-                                // Calculate intersection point and error magnitudes
-                                let (pos, pos_err) = triangle::surface_point(tri, (b0, b1, b2));
-
-                                // Calculate geometric surface normal
-                                let geo_normal = cross(tri.0 - tri.1, tri.0 - tri.2).into_normal();
-
-                                // Calculate interpolated surface normal, if any
-                                let shading_normal = if let Some(normals) = self.normals {
-                                    let n0_slice = &normals[(tri_indices.0 as usize
-                                        * self.time_sample_count)
-                                        ..((tri_indices.0 as usize + 1) * self.time_sample_count)];
-                                    let n1_slice = &normals[(tri_indices.1 as usize
-                                        * self.time_sample_count)
-                                        ..((tri_indices.1 as usize + 1) * self.time_sample_count)];
-                                    let n2_slice = &normals[(tri_indices.2 as usize
-                                        * self.time_sample_count)
-                                        ..((tri_indices.2 as usize + 1) * self.time_sample_count)];
-
-                                    let n0 = lerp_slice(n0_slice, ray_time).normalized();
-                                    let n1 = lerp_slice(n1_slice, ray_time).normalized();
-                                    let n2 = lerp_slice(n2_slice, ray_time).normalized();
-
-                                    let s_nor = ((n0 * b0) + (n1 * b1) + (n2 * b2)) * mat_space;
-                                    if dot(s_nor, geo_normal) >= 0.0 {
-                                        s_nor
-                                    } else {
-                                        -s_nor
-                                    }
-                                } else {
-                                    geo_normal
-                                };
-
-                                let intersection_data = SurfaceIntersectionData {
-                                    incoming: rays.dir(ray_idx),
-                                    t: t,
-                                    pos: pos,
-                                    pos_err: pos_err,
-                                    nor: shading_normal,
-                                    nor_g: geo_normal,
-                                    local_space: mat_space,
-                                    sample_pdf: 0.0,
-                                };
-
-                                // Fill in intersection data
-                                isects[ray_idx] = SurfaceIntersection::Hit {
-                                    intersection_data: intersection_data,
-                                    closure: shader.shade(&intersection_data, ray_time),
-                                };
+                                non_shadow_hit = true;
                                 rays.set_max_t(ray_idx, t);
+                                hit_tri = tri;
+                                hit_tri_indices = tri_indices;
+                                hit_tri_data = (t, b0, b1, b2);
                             }
                         }
-                    });
-                }
+                    }
+
+                    // Calculate intersection data if necessary.
+                    if non_shadow_hit {
+                        let (t, b0, b1, b2) = hit_tri_data;
+
+                        // Calculate intersection point and error magnitudes
+                        let (pos, pos_err) = triangle::surface_point(hit_tri, (b0, b1, b2));
+
+                        // Calculate geometric surface normal
+                        let geo_normal =
+                            cross(hit_tri.0 - hit_tri.1, hit_tri.0 - hit_tri.2).into_normal();
+
+                        // Calculate interpolated surface normal, if any
+                        let shading_normal = if let Some(normals) = self.normals {
+                            let n0_slice = &normals[(hit_tri_indices.0 as usize
+                                * self.time_sample_count)
+                                ..((hit_tri_indices.0 as usize + 1) * self.time_sample_count)];
+                            let n1_slice = &normals[(hit_tri_indices.1 as usize
+                                * self.time_sample_count)
+                                ..((hit_tri_indices.1 as usize + 1) * self.time_sample_count)];
+                            let n2_slice = &normals[(hit_tri_indices.2 as usize
+                                * self.time_sample_count)
+                                ..((hit_tri_indices.2 as usize + 1) * self.time_sample_count)];
+
+                            let n0 = lerp_slice(n0_slice, ray_time).normalized();
+                            let n1 = lerp_slice(n1_slice, ray_time).normalized();
+                            let n2 = lerp_slice(n2_slice, ray_time).normalized();
+
+                            let s_nor = ((n0 * b0) + (n1 * b1) + (n2 * b2)) * mat_space;
+                            if dot(s_nor, geo_normal) >= 0.0 {
+                                s_nor
+                            } else {
+                                -s_nor
+                            }
+                        } else {
+                            geo_normal
+                        };
+
+                        let intersection_data = SurfaceIntersectionData {
+                            incoming: rays.dir(ray_idx),
+                            t: t,
+                            pos: pos,
+                            pos_err: pos_err,
+                            nor: shading_normal,
+                            nor_g: geo_normal,
+                            local_space: mat_space,
+                            sample_pdf: 0.0,
+                        };
+
+                        // Fill in intersection data
+                        isects[ray_idx] = SurfaceIntersection::Hit {
+                            intersection_data: intersection_data,
+                            closure: shader.shade(&intersection_data, ray_time),
+                        };
+                    }
+                });
                 ray_stack.pop_task();
             });
     }

From 646139efda01598619a31335340b69a8e7c30fb6 Mon Sep 17 00:00:00 2001
From: Nathan Vegdahl <cessen@cessen.com>
Date: Sat, 6 Jul 2019 09:19:53 +0900
Subject: [PATCH 20/20] Factor out ray computations that are shared for all
 triangles.

---
 src/light/rectangle_light.rs |  3 +-
 src/surface/triangle.rs      | 83 +++++++++++++++++++++---------------
 src/surface/triangle_mesh.rs |  3 +-
 3 files changed, 53 insertions(+), 36 deletions(-)

diff --git a/src/light/rectangle_light.rs b/src/light/rectangle_light.rs
index db01072..e399d68 100644
--- a/src/light/rectangle_light.rs
+++ b/src/light/rectangle_light.rs
@@ -284,8 +284,9 @@ impl<'a> Surface for RectangleLight<'a> {
             let p4 = Point::new(dim.0 * 0.5, dim.1 * -0.5, 0.0) * space_inv;
 
             // Test against two triangles that make up the light
+            let ray_pre = triangle::RayTriPrecompute::new(dir);
             for tri in &[(p1, p2, p3), (p3, p4, p1)] {
-                if let Some((t, b0, b1, b2)) = triangle::intersect_ray(orig, dir, max_t, *tri) {
+                if let Some((t, b0, b1, b2)) = triangle::intersect_ray(orig, ray_pre, max_t, *tri) {
                     if t < max_t {
                         if rays.is_occlusion(ray_idx) {
                             isects[ray_idx] = SurfaceIntersection::Occlude;
diff --git a/src/surface/triangle.rs b/src/surface/triangle.rs
index 5f0a9f6..4aed3a3 100644
--- a/src/surface/triangle.rs
+++ b/src/surface/triangle.rs
@@ -5,6 +5,45 @@ use crate::{
     math::{Point, Vector},
 };
 
+#[derive(Debug, Copy, Clone)]
+pub struct RayTriPrecompute {
+    i: (usize, usize, usize),
+    s: (f32, f32, f32),
+}
+
+impl RayTriPrecompute {
+    pub fn new(ray_dir: Vector) -> RayTriPrecompute {
+        // Calculate the permuted dimension indices for the new ray space.
+        let (xi, yi, zi) = {
+            let xabs = ray_dir.x().abs();
+            let yabs = ray_dir.y().abs();
+            let zabs = ray_dir.z().abs();
+
+            if xabs > yabs && xabs > zabs {
+                (1, 2, 0)
+            } else if yabs > zabs {
+                (2, 0, 1)
+            } else {
+                (0, 1, 2)
+            }
+        };
+
+        let dir_x = ray_dir.get_n(xi);
+        let dir_y = ray_dir.get_n(yi);
+        let dir_z = ray_dir.get_n(zi);
+
+        // Calculate shear constants.
+        let sx = dir_x / dir_z;
+        let sy = dir_y / dir_z;
+        let sz = 1.0 / dir_z;
+
+        RayTriPrecompute {
+            i: (xi, yi, zi),
+            s: (sx, sy, sz),
+        }
+    }
+}
+
 /// Intersects `ray` with `tri`, returning `Some((t, b0, b1, b2))`, or `None`
 /// if no intersection.
 ///
@@ -18,45 +57,21 @@ use crate::{
 /// Intersection" by Woop et al.
 pub fn intersect_ray(
     ray_orig: Point,
-    ray_dir: Vector,
+    ray_pre: RayTriPrecompute,
     ray_max_t: f32,
     tri: (Point, Point, Point),
 ) -> Option<(f32, f32, f32, f32)> {
-    // Calculate the permuted dimension indices for the new ray space.
-    let (xi, yi, zi) = {
-        let xabs = ray_dir.x().abs();
-        let yabs = ray_dir.y().abs();
-        let zabs = ray_dir.z().abs();
-
-        if xabs > yabs && xabs > zabs {
-            (1, 2, 0)
-        } else if yabs > zabs {
-            (2, 0, 1)
-        } else {
-            (0, 1, 2)
-        }
-    };
-
-    let dir_x = ray_dir.get_n(xi);
-    let dir_y = ray_dir.get_n(yi);
-    let dir_z = ray_dir.get_n(zi);
-
-    // Calculate shear constants.
-    let sx = dir_x / dir_z;
-    let sy = dir_y / dir_z;
-    let sz = 1.0 / dir_z;
-
     // Calculate vertices in ray space.
     let p0 = tri.0 - ray_orig;
     let p1 = tri.1 - ray_orig;
     let p2 = tri.2 - ray_orig;
 
-    let p0x = p0.get_n(xi) - (sx * p0.get_n(zi));
-    let p0y = p0.get_n(yi) - (sy * p0.get_n(zi));
-    let p1x = p1.get_n(xi) - (sx * p1.get_n(zi));
-    let p1y = p1.get_n(yi) - (sy * p1.get_n(zi));
-    let p2x = p2.get_n(xi) - (sx * p2.get_n(zi));
-    let p2y = p2.get_n(yi) - (sy * p2.get_n(zi));
+    let p0x = p0.get_n(ray_pre.i.0) - (ray_pre.s.0 * p0.get_n(ray_pre.i.2));
+    let p0y = p0.get_n(ray_pre.i.1) - (ray_pre.s.1 * p0.get_n(ray_pre.i.2));
+    let p1x = p1.get_n(ray_pre.i.0) - (ray_pre.s.0 * p1.get_n(ray_pre.i.2));
+    let p1y = p1.get_n(ray_pre.i.1) - (ray_pre.s.1 * p1.get_n(ray_pre.i.2));
+    let p2x = p2.get_n(ray_pre.i.0) - (ray_pre.s.0 * p2.get_n(ray_pre.i.2));
+    let p2y = p2.get_n(ray_pre.i.1) - (ray_pre.s.1 * p2.get_n(ray_pre.i.2));
 
     // Calculate scaled barycentric coordinates.
     let mut e0 = (p1x * p2y) - (p1y * p2x);
@@ -82,9 +97,9 @@ pub fn intersect_ray(
     }
 
     // Calculate t of hitpoint.
-    let p0z = sz * p0.get_n(zi);
-    let p1z = sz * p1.get_n(zi);
-    let p2z = sz * p2.get_n(zi);
+    let p0z = ray_pre.s.2 * p0.get_n(ray_pre.i.2);
+    let p1z = ray_pre.s.2 * p1.get_n(ray_pre.i.2);
+    let p2z = ray_pre.s.2 * p2.get_n(ray_pre.i.2);
     let t_scaled = (e0 * p0z) + (e1 * p1z) + (e2 * p2z);
 
     // Check if the hitpoint t is within ray min/max t.
diff --git a/src/surface/triangle_mesh.rs b/src/surface/triangle_mesh.rs
index 967f90c..43388a8 100644
--- a/src/surface/triangle_mesh.rs
+++ b/src/surface/triangle_mesh.rs
@@ -183,6 +183,7 @@ impl<'a> Surface for TriangleMesh<'a> {
                     let mut hit_tri = unsafe { std::mem::uninitialized() };
                     let mut hit_tri_indices = unsafe { std::mem::uninitialized() };
                     let mut hit_tri_data = unsafe { std::mem::uninitialized() };
+                    let ray_pre = triangle::RayTriPrecompute::new(rays.dir(ray_idx));
                     for tri_idx in idx_range.clone() {
                         let tri_indices = self.indices[tri_idx];
 
@@ -229,7 +230,7 @@ impl<'a> Surface for TriangleMesh<'a> {
                         // Test ray against triangle
                         if let Some((t, b0, b1, b2)) = triangle::intersect_ray(
                             rays.orig(ray_idx),
-                            rays.dir(ray_idx),
+                            ray_pre,
                             rays.max_t(ray_idx),
                             tri,
                         ) {