Removed BVH4 and the related bitstack from AccelRay.

I couldn't make the BVH4 faster than the BVH, and the bitstack was bloating the AccelRay struct. Removing the bitstack gives a small but noticeable speedup in rendering.
2017-05-12 21:07:40 -07:00 · 2017-05-12 21:07:40 -07:00 · 172e2f19ef
commit 172e2f19ef
parent 88578b9eae
8 changed files with 0 additions and 1010 deletions
--- a/src/accel/bvh4.rs
+++ b/src/accel/bvh4.rs
@ -1,487 +0,0 @@
 //! This BVH4 implementation pulls a lot of ideas from the paper
 //! "Efficient Ray Tracing Kernels for Modern CPU Architectures"
 //! by Fuetterling et al.
 //!
 //! Specifically, the table-based traversal order approach they
 //! propose is largely followed by this implementation.
 #![allow(dead_code)]
 use mem_arena::MemArena;
 use algorithm::{partition, partition_with_side};
 use bbox::BBox;
 use bbox4::BBox4;
 use boundable::Boundable;
 use lerp::lerp_slice;
 use ray::AccelRay;
 use timer::Timer;
 use super::bvh_base::{BVHBase, BVHBaseNode, BVH_MAX_DEPTH};
 use super::ACCEL_TRAV_TIME;
 // TRAVERSAL_TABLE
 include!("bvh4_table.inc");
 /// A bounding volume hierarchy whose internal nodes hold two to four
 /// children.  Nodes and bounds are borrowed for the lifetime `'a`; in
 /// `from_objects` they are allocated from the supplied `MemArena`.
 #[derive(Copy, Clone, Debug)]
 pub struct BVH4<'a> {
    // Root node of the tree; `None` when the BVH was built from zero objects.
    root: Option<&'a BVH4Node<'a>>,
    // Depth value reported by `tree_depth()`.
    depth: usize,
    // Tally returned by `construct_from_base` (0 for an empty BVH).
    node_count: usize,
    // Copy of the root's bounds, returned by the `Boundable` impl.
    _bounds: Option<&'a [BBox]>,
 }
 /// A single node of a `BVH4`.
 #[derive(Copy, Clone, Debug)]
 pub enum BVH4Node<'a> {
    // An inner node with two, three or four children.
    Internal {
        // Bounds of all children packed into `BBox4`s.  More than one entry
        // means the bounds vary over time; traversal interpolates them with
        // `lerp_slice` using the ray's time.
        bounds: &'a [BBox4],
        // The child nodes (2 to 4 of them).
        children: &'a [BVH4Node<'a>],
        // Column index into `TRAVERSAL_TABLE`, as produced by
        // `calc_traversal_code`.
        traversal_code: u8,
    },
    // A leaf; `object_range` is a (start, end) pair used to slice the object
    // list handed to `traverse`.
    Leaf { object_range: (usize, usize) },
 }
 impl<'a> BVH4<'a> {
    /// Builds a `BVH4` over `objects`.
    ///
    /// A binary `BVHBase` is built first (which is why `objects` is taken
    /// mutably) and is then collapsed into four-wide nodes allocated from
    /// `arena`.  `bounder` supplies the bounding boxes of each object.
    ///
    /// An empty `objects` slice yields a BVH with no root, zero depth and no
    /// bounds.
    pub fn from_objects<'b, T, F>(arena: &'a MemArena,
                                  objects: &mut [T],
                                  objects_per_leaf: usize,
                                  bounder: F)
                                  -> BVH4<'a>
        where F: 'b + Fn(&T) -> &'b [BBox]
    {
        // Guard clause: nothing to build.
        if objects.is_empty() {
            return BVH4 {
                root: None,
                depth: 0,
                node_count: 0,
                _bounds: None,
            };
        }

        let base = BVHBase::from_objects(objects, objects_per_leaf, bounder);
        let base_root = &base.nodes[base.root_node_index()];

        // Allocate the root slot (32-byte aligned) and fill the tree in below it.
        let mut root_node = unsafe { arena.alloc_uninitialized_with_alignment::<BVH4Node>(32) };
        let total_nodes = BVH4::construct_from_base(arena, &base, base_root, root_node);

        // Keep a copy of the root's bounds for the `Boundable` impl.
        let (bounds_begin, bounds_end) = base_root.bounds_range();
        let root_bounds = arena.copy_slice(&base.bounds[bounds_begin..bounds_end]);

        BVH4 {
            root: Some(root_node),
            // Each BVH4 level absorbs two levels of the binary tree.
            depth: (base.depth / 2) + 1,
            node_count: total_nodes,
            _bounds: Some(root_bounds),
        }
    }
    /// Returns the depth recorded when the tree was built (0 for an empty
    /// BVH).
    pub fn tree_depth(&self) -> usize {
        self.depth
    }
    /// Traverses the tree with a batch of rays, calling `obj_ray_test` on
    /// every object of each leaf that is reached, together with the rays that
    /// reached it.
    ///
    /// `rays` is reordered in place: at every node the still-active rays are
    /// partitioned to the front of the slice.  Each ray's `trav_stack` records
    /// which children it hit so the rays can be re-filtered when those
    /// children are visited later.
    ///
    /// The child visiting order for the whole batch is chosen from the
    /// direction signs of `rays[0]` only.
    ///
    /// Time spent here is added to `ACCEL_TRAV_TIME`.  The tick taken right
    /// after the object tests is discarded, so (assuming `Timer::tick` returns
    /// the time since the previous tick) object-test time is excluded.
    ///
    /// Returns immediately if the tree is empty or `rays` is empty.
    pub fn traverse<T, F>(&self, rays: &mut [AccelRay], objects: &[T], mut obj_ray_test: F)
        where F: FnMut(&T, &mut [AccelRay])
    {
        // An empty ray batch must bail out here as well: `ray_code` below is
        // derived from `rays[0]`, which would panic on an empty slice.
        if self.root.is_none() || rays.is_empty() {
            return;
        }

        let mut trav_time: f64 = 0.0;
        let mut timer = Timer::new();

        // +2 of max depth for root and last child
        let mut node_stack = [self.root; BVH_MAX_DEPTH + 2];
        let mut ray_i_stack = [rays.len(); BVH_MAX_DEPTH + 2];
        let mut stack_ptr = 1;
        // Number of skipped (`None`) stack entries whose hit bits have not yet
        // been popped off the rays' traversal stacks.
        let mut unpopped = 0;
        let mut first_loop = true;

        // Three sign bits (x, y, z) of the first ray's inverse direction; this
        // selects the row of TRAVERSAL_TABLE.
        let ray_code = ((rays[0].dir_inv.x() < 0.0) as u8) |
                       (((rays[0].dir_inv.y() < 0.0) as u8) << 1) |
                       (((rays[0].dir_inv.z() < 0.0) as u8) << 2);

        while stack_ptr > 0 {
            match node_stack[stack_ptr] {
                Some(&BVH4Node::Internal { bounds, children, traversal_code }) => {
                    // Packed child order: two bits per child, first-visited
                    // child in the lowest bits.
                    let node_order_code = {
                        TRAVERSAL_TABLE[ray_code as usize][traversal_code as usize]
                    };
                    let noc1 = node_order_code & 3;
                    let noc2 = (node_order_code >> 2) & 3;
                    let noc3 = (node_order_code >> 4) & 3;
                    let noc4 = (node_order_code >> 6) & 3;

                    // Union of the hit masks of every ray tested at this node.
                    let mut all_hits = 0;

                    // Ray testing
                    let part;
                    {
                        // Common code for ray testing below
                        let mut test_ray = |r: &mut AccelRay| {
                            let hits = lerp_slice(bounds, r.time)
                                .intersect_accel_ray(r)
                                .to_bitmask();
                            all_hits |= hits;
                            if hits != 0 {
                                // Push hit bits onto ray's traversal stack,
                                // reordered into visiting order.
                                let shuffled_hits = match children.len() {
                                    4 => {
                                        ((hits >> noc1) & 1) | (((hits >> noc2) & 1) << 1) |
                                        (((hits >> noc3) & 1) << 2) |
                                        (((hits >> noc4) & 1) << 3)
                                    }
                                    3 => {
                                        ((hits >> noc1) & 1) | (((hits >> noc2) & 1) << 1) |
                                        (((hits >> noc3) & 1) << 2)
                                    }
                                    2 => ((hits >> noc1) & 1) | (((hits >> noc2) & 1) << 1),
                                    _ => unreachable!(),
                                };
                                r.trav_stack.push_n(shuffled_hits, children.len() as u8);
                                return true;
                            }
                            return false;
                        };

                        // Skip some tests if it's the first loop: no hit bits
                        // have been pushed yet, so there is nothing to pop.
                        part = if first_loop {
                            filter_rays(&ray_i_stack[stack_ptr..],
                                        &mut rays[..ray_i_stack[stack_ptr]],
                                        unpopped,
                                        |r, _| {
                                            if !r.is_done() {
                                                return test_ray(r);
                                            }
                                            return false;
                                        })
                        } else {
                            filter_rays(&ray_i_stack[stack_ptr..],
                                        &mut rays[..ray_i_stack[stack_ptr]],
                                        unpopped,
                                        |r, pop_count| {
                                            if (!r.is_done()) && r.trav_stack.pop_to_nth(pop_count) {
                                                return test_ray(r);
                                            }
                                            return false;
                                        })
                        };
                    }
                    unpopped = 0;

                    // Update stack based on ray testing results
                    if part > 0 {
                        // Children are written in reverse visiting order so
                        // the first-visited child ends up on top of the stack.
                        for i in 0..children.len() {
                            let inv_i = (children.len() - 1) - i;
                            let child_i = ((node_order_code >> (inv_i * 2)) & 3) as usize;
                            node_stack[stack_ptr + i] = if ((all_hits >> child_i) & 1) == 0 {
                                None
                            } else {
                                Some(&children[child_i])
                            };
                            ray_i_stack[stack_ptr + i] = part;
                        }
                        stack_ptr += children.len() - 1;
                    } else {
                        stack_ptr -= 1;
                    }
                }

                Some(&BVH4Node::Leaf { object_range }) => {
                    let part = if !first_loop {
                        filter_rays(&ray_i_stack[stack_ptr..],
                                    &mut rays[..ray_i_stack[stack_ptr]],
                                    unpopped,
                                    |r, pop_count| {
                                        (!r.is_done()) && r.trav_stack.pop_to_nth(pop_count)
                                    })
                    } else {
                        ray_i_stack[stack_ptr]
                    };
                    unpopped = 0;

                    // Bank the traversal time so far, run the object tests,
                    // then discard the tick that covers them.
                    trav_time += timer.tick() as f64;
                    for obj in &objects[object_range.0..object_range.1] {
                        obj_ray_test(obj, &mut rays[..part]);
                    }
                    timer.tick();

                    stack_ptr -= 1;
                }

                None => {
                    // A child no ray hit: remember that its bit still has to
                    // be popped from the rays' traversal stacks.
                    if !first_loop {
                        unpopped += 1;
                    }
                    stack_ptr -= 1;
                }
            }
            first_loop = false;
        }

        // Pop any unpopped bits of the ray traversal stacks
        if unpopped > 0 {
            filter_rays(&ray_i_stack[1..],
                        &mut rays[..ray_i_stack[1]],
                        unpopped - 1,
                        |r, pop_count| r.trav_stack.pop_to_nth(pop_count));
        }

        trav_time += timer.tick() as f64;
        ACCEL_TRAV_TIME.with(|att| {
            let v = att.get();
            att.set(v + trav_time);
        });
    }
    /// Recursively converts the binary `BVHBase` subtree rooted at `node`
    /// into BVH4 nodes, writing the converted node into `fill_node`.
    ///
    /// An internal base node is collapsed together with whichever of its two
    /// children are themselves internal, giving a BVH4 node with 2, 3 or 4
    /// children.  Bounds and child arrays are allocated from `arena` with
    /// 32-byte alignment.
    ///
    /// Returns the node tally accumulated for this subtree.
    // NOTE(review): children are tallied both here (`child_count`) and by
    // their own recursive call, so the total is not a plain node count --
    // confirm whether that is intended.
    fn construct_from_base(arena: &'a MemArena,
                           base: &BVHBase,
                           node: &BVHBaseNode,
                           fill_node: &mut BVH4Node<'a>)
                           -> usize {
        let mut node_count = 0;
        match node {
            // Create internal node
            &BVHBaseNode::Internal { bounds_range: _, children_indices, split_axis } => {
                let child_l = &base.nodes[children_indices.0];
                let child_r = &base.nodes[children_indices.1];
                // Prepare convenient access to the stuff we need.
                let child_count;
                let children; // [Optional, Optional, Optional, Optional]
                let split_axis_l; // Optional
                let split_axis_r; // Optional
                match child_l {
                    &BVHBaseNode::Internal { children_indices: i_l, split_axis: s_l, .. } => {
                        match child_r {
                            &BVHBaseNode::Internal { children_indices: i_r,
                                                     split_axis: s_r,
                                                     .. } => {
                                // Four nodes
                                child_count = 4;
                                children = [Some(&base.nodes[i_l.0]),
                                            Some(&base.nodes[i_l.1]),
                                            Some(&base.nodes[i_r.0]),
                                            Some(&base.nodes[i_r.1])];
                                split_axis_l = Some(s_l);
                                split_axis_r = Some(s_r);
                            }
                            &BVHBaseNode::Leaf { .. } => {
                                // Three nodes with left split
                                child_count = 3;
                                children = [Some(&base.nodes[i_l.0]),
                                            Some(&base.nodes[i_l.1]),
                                            Some(child_r),
                                            None];
                                split_axis_l = Some(s_l);
                                split_axis_r = None;
                            }
                        }
                    }
                    &BVHBaseNode::Leaf { .. } => {
                        match child_r {
                            &BVHBaseNode::Internal { children_indices: i_r,
                                                     split_axis: s_r,
                                                     .. } => {
                                // Three nodes with right split
                                child_count = 3;
                                children = [Some(child_l),
                                            Some(&base.nodes[i_r.0]),
                                            Some(&base.nodes[i_r.1]),
                                            None];
                                split_axis_l = None;
                                split_axis_r = Some(s_r);
                            }
                            &BVHBaseNode::Leaf { .. } => {
                                // Two nodes
                                child_count = 2;
                                children = [Some(child_l), Some(child_r), None, None];
                                split_axis_l = None;
                                split_axis_r = None;
                            }
                        }
                    }
                }
                node_count += child_count;
                // Construct bounds
                let bounds = {
                    // Use as many time samples as the child with the most
                    // bounds has.
                    let bounds_len = children.iter()
                        .map(|c| if let &Some(n) = c {
                            let len = n.bounds_range().1 - n.bounds_range().0;
                            debug_assert!(len >= 1);
                            len
                        } else {
                            0
                        })
                        .max()
                        .unwrap();
                    debug_assert!(bounds_len >= 1);
                    let mut bounds =
                        unsafe { arena.alloc_array_uninitialized_with_alignment(bounds_len, 32) };
                    if bounds_len < 2 {
                        // Single time sample: take each child's first bbox
                        // directly.  Missing children get `BBox::new()`.
                        let b1 = children[0]
                            .map_or(BBox::new(), |c| base.bounds[c.bounds_range().0]);
                        let b2 = children[1]
                            .map_or(BBox::new(), |c| base.bounds[c.bounds_range().0]);
                        let b3 = children[2]
                            .map_or(BBox::new(), |c| base.bounds[c.bounds_range().0]);
                        let b4 = children[3]
                            .map_or(BBox::new(), |c| base.bounds[c.bounds_range().0]);
                        bounds[0] = BBox4::from_bboxes(b1, b2, b3, b4);
                    } else {
                        // Several time samples: resample every child's bounds
                        // at evenly spaced times in [0, 1].
                        for (i, b) in bounds.iter_mut().enumerate() {
                            let time = i as f32 / (bounds_len - 1) as f32;
                            let b1 = children[0].map_or(BBox::new(), |c| {
                                let (x, y) = c.bounds_range();
                                lerp_slice(&base.bounds[x..y], time)
                            });
                            let b2 = children[1].map_or(BBox::new(), |c| {
                                let (x, y) = c.bounds_range();
                                lerp_slice(&base.bounds[x..y], time)
                            });
                            let b3 = children[2].map_or(BBox::new(), |c| {
                                let (x, y) = c.bounds_range();
                                lerp_slice(&base.bounds[x..y], time)
                            });
                            let b4 = children[3].map_or(BBox::new(), |c| {
                                let (x, y) = c.bounds_range();
                                lerp_slice(&base.bounds[x..y], time)
                            });
                            *b = BBox4::from_bboxes(b1, b2, b3, b4);
                        }
                    }
                    bounds
                };
                // Construct child nodes
                let mut child_nodes =
                    unsafe {
                        arena.alloc_array_uninitialized_with_alignment::<BVH4Node>(child_count, 32)
                    };
                for (i, c) in children[0..child_count].iter().enumerate() {
                    node_count +=
                        BVH4::construct_from_base(arena, base, c.unwrap(), &mut child_nodes[i]);
                }
                // Build this node
                let traversal_code = {
                    // See `calc_traversal_code` for the meaning of 0-3.
                    let topology_code = if child_count == 4 {
                        0
                    } else if child_count == 2 {
                        3
                    } else if split_axis_l.is_some() {
                        1
                    } else {
                        2
                    };
                    calc_traversal_code(split_axis,
                                        split_axis_l.unwrap_or(split_axis_r.unwrap_or(0)),
                                        if child_count == 4 {
                                            split_axis_r.unwrap()
                                        } else {
                                            0
                                        },
                                        topology_code)
                };
                *fill_node = BVH4Node::Internal {
                    bounds: bounds,
                    children: child_nodes,
                    traversal_code: traversal_code,
                };
            }
            // Create leaf node
            &BVHBaseNode::Leaf { object_range, .. } => {
                *fill_node = BVH4Node::Leaf { object_range: object_range };
                node_count += 1;
            }
        }
        return node_count;
    }
 }
 impl<'a> Boundable for BVH4<'a> {
    /// Returns the bounds stored for the root of the tree, or an empty slice
    /// when the BVH has no stored bounds (i.e. it was built from no objects).
    fn bounds<'b>(&'b self) -> &'b [BBox] {
        match self._bounds {
            Some(stored) => stored,
            None => &[],
        }
    }
 }
 // Packs the split axes and the topology of a BVH4 node into a single
 // traversal code (the column index used with TRAVERSAL_TABLE).
 //
 // Arguments:
 //     split_1:  axis of the top split.
 //     split_2:  axis of the left or right split, whichever exists for the
 //               topology.  Only meaningful for topologies 0-2; must be 0
 //               for topology 3.
 //     split_3:  axis of the right split.  Only meaningful for topology 0;
 //               must be 0 for topologies 1-3.
 //     topology: one of
 //         0: all three splits exist (4 BVH4 children).
 //         1: top split and left split exist (3 BVH4 children).
 //         2: top split and right split exist (3 BVH4 children).
 //         3: only the top split exists (2 BVH4 children).
 //
 // Each topology owns a contiguous range of codes: 0..27, 27..36, 36..45 and
 // 45..48 respectively.
 fn calc_traversal_code(split_1: u8, split_2: u8, split_3: u8, topology: u8) -> u8 {
    debug_assert!(topology == 0 || split_3 == 0);
    debug_assert!(topology <= 2 || split_2 == 0);

    // First code of each topology's range.
    const TOPOLOGY_BASE: [u8; 4] = [0, 27, 36, 45];

    let base = TOPOLOGY_BASE[topology as usize];
    // The axes form a base-3 number with split_1 as the lowest digit.
    let axes = split_1 + (split_2 * 3) + (split_3 * 9);
    axes + base
 }
 // Partitions `rays` with `ray_test` and returns the partition point reported
 // by `partition` / `partition_with_side`.
 //
 // `ray_test` receives each ray together with the number of entries to pop
 // from that ray's traversal stack.
 //
 // `ray_i_stack` is the tail of the traversal's ray-count stack starting at
 // the current level, and `unpopped` is the number of skipped stack entries
 // whose bits have not been popped yet.
 //
 // When `ray_i_stack[0] == ray_i_stack[unpopped]`, every ray in `rays` needs
 // the same pop count (`unpopped + 1`).  Otherwise the pop count depends on
 // where the ray sits in the slice, so a running position is kept for each
 // end of the partition and compared against the `ray_i_stack` thresholds.
 //
 // NOTE(review): this relies on `partition_with_side` visiting elements
 // strictly inward from both ends, with `side == false` meaning the left end
 // -- confirm against its implementation.
 fn filter_rays<F>(ray_i_stack: &[usize],
                  rays: &mut [AccelRay],
                  unpopped: usize,
                  mut ray_test: F)
                  -> usize
    where F: FnMut(&mut AccelRay, usize) -> bool
 {
    // Uniform case: one pop count for everything.
    if ray_i_stack[0] == ray_i_stack[unpopped] {
        let pop_count = unpopped + 1;
        return partition(rays, |r| ray_test(r, pop_count));
    }

    // Where we are in the partition, from the left and from the right.
    let mut left_pos = 0;
    let mut right_pos = rays.len() - 1;
    // Number of bits to pop on the left and right side.
    let mut left_pop = unpopped;
    let mut right_pop = 0;

    partition_with_side(rays, |r, side| {
        let pop_count = if !side {
            while left_pos >= ray_i_stack[left_pop] {
                left_pop -= 1;
            }
            left_pos += 1;
            left_pop
        } else {
            // Check the bound first so `ray_i_stack[right_pop + 1]` is never
            // read once `right_pop` has reached `unpopped`.
            while right_pop < unpopped && right_pos < ray_i_stack[right_pop + 1] {
                right_pop += 1;
            }
            right_pos -= 1;
            right_pop
        };

        ray_test(r, pop_count + 1)
    })
 }
--- a/src/accel/bvh4_table.inc
+++ b/src/accel/bvh4_table.inc
@ -1,35 +0,0 @@
 // Child visiting orders for BVH4 traversal.  This is the output of
 // create_bvh4_table.py; regenerate it with that script rather than editing
 // the numbers.
 //
 // Indexed as TRAVERSAL_TABLE[ray_code][traversal_code]:
 //   - ray_code: sign bits of the ray direction (bit 0 = x negative,
 //     bit 1 = y negative, bit 2 = z negative).
 //   - traversal_code: see calc_traversal_code.  Columns 0..27 are four-child
 //     nodes, 27..36 three-child nodes with a left split, 36..45 three-child
 //     nodes with a right split, and 45..48 two-child nodes.
 //
 // Each entry packs the child indices two bits apiece; the child in the
 // lowest two bits is the one placed on top of the traversal stack.
 static TRAVERSAL_TABLE: [[u8; 48]; 8] = [
    [228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228,
     36, 36, 36, 36, 36, 36, 36, 36, 36,
     36, 36, 36, 36, 36, 36, 36, 36, 36,
     4, 4, 4],
    [27, 177, 177, 75, 180, 180, 75, 180, 180, 30, 225, 225, 78, 228, 228, 78, 228, 228, 30, 225, 225, 78, 228, 228, 78, 228, 228,
     6, 33, 33, 18, 36, 36, 18, 36, 36,
     6, 24, 24, 9, 36, 36, 9, 36, 36,
     1, 4, 4],
    [228, 78, 228, 225, 30, 225, 228, 78, 228, 180, 75, 180, 177, 27, 177, 180, 75, 180, 228, 78, 228, 225, 30, 225, 228, 78, 228,
     36, 18, 36, 33, 6, 33, 36, 18, 36,
     36, 9, 36, 24, 6, 24, 36, 9, 36,
     4, 1, 4],
    [27, 27, 177, 27, 27, 177, 75, 75, 180, 27, 27, 177, 27, 27, 177, 75, 75, 180, 30, 30, 225, 30, 30, 225, 78, 78, 228,
     6, 6, 33, 6, 6, 33, 18, 18, 36,
     6, 6, 24, 6, 6, 24, 9, 9, 36,
     1, 1, 4],
    [228, 228, 78, 228, 228, 78, 225, 225, 30, 228, 228, 78, 228, 228, 78, 225, 225, 30, 180, 180, 75, 180, 180, 75, 177, 177, 27,
     36, 36, 18, 36, 36, 18, 33, 33, 6,
     36, 36, 9, 36, 36, 9, 24, 24, 6,
     4, 4, 1],
    [27, 177, 27, 75, 180, 75, 27, 177, 27, 30, 225, 30, 78, 228, 78, 30, 225, 30, 27, 177, 27, 75, 180, 75, 27, 177, 27,
     6, 33, 6, 18, 36, 18, 6, 33, 6,
     6, 24, 6, 9, 36, 9, 6, 24, 6,
     1, 4, 1],
    [228, 78, 78, 225, 30, 30, 225, 30, 30, 180, 75, 75, 177, 27, 27, 177, 27, 27, 180, 75, 75, 177, 27, 27, 177, 27, 27,
     36, 18, 18, 33, 6, 6, 33, 6, 6,
     36, 9, 9, 24, 6, 6, 24, 6, 6,
     4, 1, 1],
    [27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
     6, 6, 6, 6, 6, 6, 6, 6, 6,
     6, 6, 6, 6, 6, 6, 6, 6, 6,
     1, 1, 1],
 ];
--- a/src/accel/create_bvh4_table.py
+++ b/src/accel/create_bvh4_table.py
@ -1,58 +0,0 @@
 #!/usr/bin/env python
 # Generates the Rust source for TRAVERSAL_TABLE (bvh4_table.inc) and prints
 # it to stdout.
 #
 # One row is emitted per ray direction sign code (bit 0 = x, bit 1 = y,
 # bit 2 = z).  Each row holds four groups of child orderings:
 #   27 entries for four-child nodes (three split axes),
 #    9 entries for three-child nodes where the left child is split,
 #    9 entries for three-child nodes where the right child is split,
 #    3 entries for two-child nodes.
 # Every entry packs the child order two bits per child, first child lowest.
 if __name__ == "__main__":
    text = "static TRAVERSAL_TABLE: [[u8; 48]; 8] = [\n"
    for raydir in range(0, 8):
        # Per-axis flag: 1 when the ray direction is negative on that axis.
        ray = [raydir & 1, (raydir >> 1) & 1, (raydir >> 2) & 1]
        text += "    ["

        # Four children: s1 is the top split, s2 the left, s3 the right.
        for splits in [[s1, s2, s3] for s3 in range(0,3) for s2 in range(0,3) for s1 in range(0,3)]:
            perm = [0, 1, 2, 3]
            if ray[splits[1]] == 1:
                perm = [perm[1], perm[0]] + perm[2:4]
            if ray[splits[2]] == 1:
                perm = perm[0:2] + [perm[3], perm[2]]
            if ray[splits[0]] == 1:
                perm = perm[2:4] + perm[0:2]
            perm = perm[0] + (perm[1] << 2) + (perm[2] << 4) + (perm[3] << 6)
            text += "%d, " % perm
        # Drop the trailing space before the line break.
        text = text[:-1]
        text += "\n     "

        # Three children, left child split: s1 is the top split, s2 the left.
        for splits in [[s1, s2] for s2 in range(0,3) for s1 in range(0,3)]:
            perm = [0, 1, 2]
            if ray[splits[1]] == 1:
                perm = [perm[1], perm[0], perm[2]]
            if ray[splits[0]] == 1:
                perm = [perm[2], perm[0], perm[1]]
            perm = perm[0] + (perm[1] << 2) + (perm[2] << 4)
            text += "%d, " % perm
        text = text[:-1]
        text += "\n     "

        # Three children, right child split: s1 is the top split, s2 the right.
        for splits in [[s1, s2] for s2 in range(0,3) for s1 in range(0,3)]:
            perm = [0, 1, 2]
            if ray[splits[1]] == 1:
                perm = [perm[0], perm[2], perm[1]]
            if ray[splits[0]] == 1:
                perm = [perm[1], perm[2], perm[0]]
            perm = perm[0] + (perm[1] << 2) + (perm[2] << 4)
            text += "%d, " % perm
        text = text[:-1]
        text += "\n     "

        # Two children: only the top split.
        for split in [s1 for s1 in range(0,3)]:
            perm = [0, 1]
            if ray[split] == 1:
                perm = [perm[1], perm[0]]
            perm = perm[0] + (perm[1] << 2)
            text += "%d, " % perm
        # Drop the trailing ", " and close the row.
        text = text[:-1]
        text = text[:-1] + "],\n"
    text += "];\n"

    # Call form so the script runs under Python 3 as well as Python 2; with a
    # single argument the output is the same on both.
    print(text)
--- a/src/accel/mod.rs
+++ b/src/accel/mod.rs
@ -1,6 +1,5 @@
 mod bvh_base;
 mod bvh;
 mod bvh4;
 mod light_array;
 mod light_tree;
 mod objects_split;
@ -11,7 +10,6 @@ use math::{Vector, Point, Normal};
 use shading::surface_closure::SurfaceClosure;
 pub use self::bvh::{BVH, BVHNode};
 pub use self::bvh4::{BVH4, BVH4Node};
 pub use self::light_tree::LightTree;
 // Track BVH traversal time
--- a/src/bbox4.rs
+++ b/src/bbox4.rs
@ -1,116 +0,0 @@
 #![allow(dead_code)]
 use std;
 use std::ops::{BitOr, BitOrAssign};
 use bbox::BBox;
 use float4::{Float4, Bool4, v_min, v_max};
 use lerp::{lerp, Lerp};
 use ray::AccelRay;
 // Factor, slightly above 1.0, that `BBox4::intersect_accel_ray` multiplies
 // the far hit distances by before comparing them with the near distances.
 // NOTE(review): presumably this keeps the test conservative in the face of
 // floating-point rounding -- confirm.
 const BBOX_MAXT_ADJUST: f32 = 1.00000024;
 /// A SIMD set of 4 3D axis-aligned bounding boxes.
 ///
 /// The data is stored per axis: each field is a `(min, max)` pair of
 /// `Float4`s in which every lane belongs to a different box.
 #[derive(Debug, Copy, Clone)]
 pub struct BBox4 {
    pub x: (Float4, Float4), // (min, max)
    pub y: (Float4, Float4), // (min, max)
    pub z: (Float4, Float4), // (min, max)
 }
 impl BBox4 {
    /// Creates a degenerate BBox with +infinity min and -infinity max.
    pub fn new() -> BBox4 {
        BBox4 {
            x: (Float4::splat(std::f32::INFINITY), Float4::splat(std::f32::NEG_INFINITY)),
            y: (Float4::splat(std::f32::INFINITY), Float4::splat(std::f32::NEG_INFINITY)),
            z: (Float4::splat(std::f32::INFINITY), Float4::splat(std::f32::NEG_INFINITY)),
        }
    }
    /// Creates a BBox with min as the minimum extent and max as the maximum
    /// extent.
    pub fn from_bboxes(b1: BBox, b2: BBox, b3: BBox, b4: BBox) -> BBox4 {
        BBox4 {
            x: (Float4::new(b1.min.x(), b2.min.x(), b3.min.x(), b4.min.x()),
                Float4::new(b1.max.x(), b2.max.x(), b3.max.x(), b4.max.x())),
            y: (Float4::new(b1.min.y(), b2.min.y(), b3.min.y(), b4.min.y()),
                Float4::new(b1.max.y(), b2.max.y(), b3.max.y(), b4.max.y())),
            z: (Float4::new(b1.min.z(), b2.min.z(), b3.min.z(), b4.min.z()),
                Float4::new(b1.max.z(), b2.max.z(), b3.max.z(), b4.max.z())),
        }
    }
    // Returns whether the given ray intersects with the bboxes.
    pub fn intersect_accel_ray(&self, ray: &AccelRay) -> Bool4 {
        // Precalculate ray direction sign booleans.
        // Doing it up here slightly speeds things up lower down.
        let ray_pos = (ray.dir_inv.x() >= 0.0, ray.dir_inv.y() >= 0.0, ray.dir_inv.z() >= 0.0);
        // Convert ray to SIMD form
        let ray4_o =
            (Float4::splat(ray.orig.x()), Float4::splat(ray.orig.y()), Float4::splat(ray.orig.z()));
        let ray4_dinv = (Float4::splat(ray.dir_inv.x()),
                         Float4::splat(ray.dir_inv.y()),
                         Float4::splat(ray.dir_inv.z()));
        // Calculate the plane intersections
        let (xlos, xhis) = if ray_pos.0 {
            ((self.x.0 - ray4_o.0) * ray4_dinv.0, (self.x.1 - ray4_o.0) * ray4_dinv.0)
        } else {
            ((self.x.1 - ray4_o.0) * ray4_dinv.0, (self.x.0 - ray4_o.0) * ray4_dinv.0)
        };
        let (ylos, yhis) = if ray_pos.1 {
            ((self.y.0 - ray4_o.1) * ray4_dinv.1, (self.y.1 - ray4_o.1) * ray4_dinv.1)
        } else {
            ((self.y.1 - ray4_o.1) * ray4_dinv.1, (self.y.0 - ray4_o.1) * ray4_dinv.1)
        };
        let (zlos, zhis) = if ray_pos.2 {
            ((self.z.0 - ray4_o.2) * ray4_dinv.2, (self.z.1 - ray4_o.2) * ray4_dinv.2)
        } else {
            ((self.z.1 - ray4_o.2) * ray4_dinv.2, (self.z.0 - ray4_o.2) * ray4_dinv.2)
        };
        // Get the minimum and maximum hits
        let mins = v_max(v_max(xlos, ylos), v_max(zlos, Float4::splat(0.0)));
        let maxs = v_max(v_min(v_min(xhis, yhis), zhis),
                         Float4::splat(std::f32::NEG_INFINITY)) *
                   Float4::splat(BBOX_MAXT_ADJUST);
        // Check for hits
        let hits = mins.lt(Float4::splat(ray.max_t)) & mins.lte(maxs);
        return hits;
    }
 }
 /// Union of two BBoxes.
 impl BitOr for BBox4 {
    type Output = BBox4;
    fn bitor(self, rhs: BBox4) -> BBox4 {
        BBox4 {
            x: (self.x.0.v_min(rhs.x.0), self.x.1.v_max(rhs.x.1)),
            y: (self.y.0.v_min(rhs.y.0), self.y.1.v_max(rhs.y.1)),
            z: (self.z.0.v_min(rhs.z.0), self.z.1.v_max(rhs.z.1)),
        }
    }
 }
 impl BitOrAssign for BBox4 {
    fn bitor_assign(&mut self, rhs: BBox4) {
        *self = *self | rhs;
    }
 }
 impl Lerp for BBox4 {
    fn lerp(self, other: BBox4, alpha: f32) -> BBox4 {
        BBox4 {
            x: (lerp(self.x.0, other.x.0, alpha), lerp(self.x.1, other.x.1, alpha)),
            y: (lerp(self.y.0, other.y.0, alpha), lerp(self.y.1, other.y.1, alpha)),
            z: (lerp(self.z.0, other.z.0, alpha), lerp(self.z.1, other.z.1, alpha)),
        }
    }
 }
--- a/src/bitstack.rs
+++ b/src/bitstack.rs
@ -1,303 +0,0 @@
 #![allow(dead_code)]
 use std::mem::size_of;
 /// A fixed-capacity stack of 128 bits stored in two `u64` words.
 ///
 /// `data.0` holds the 64 bits closest to the top of the stack, with the
 /// top bit in its least significant position.  `data.1` holds the 64 bits
 /// below those.  Pushing shifts everything towards the most significant
 /// end; popping shifts it back.
 #[derive(Copy, Clone, Debug)]
 pub struct BitStack128 {
    data: (u64, u64),
 }

 impl BitStack128 {
    /// Creates a stack with all 128 bits cleared.
    pub fn new() -> BitStack128 {
        BitStack128 { data: (0, 0) }
    }

    /// Creates a stack whose top bit is set and whose other bits are
    /// cleared.
    pub fn new_with_1() -> BitStack128 {
        BitStack128 { data: (1, 0) }
    }

    /// Push a bit onto the top of the stack.
    pub fn push(&mut self, value: bool) {
        let word_bits = size_of::<u64>() * 8;

        // Verify no stack overflow
        debug_assert!((self.data.1 >> (word_bits - 1)) == 0);

        // Carry the highest bit of the low word into the high word.
        self.data.1 = (self.data.1 << 1) | (self.data.0 >> (word_bits - 1));
        self.data.0 <<= 1;
        self.data.0 |= value as u64;
    }

    /// Push n bits onto the top of the stack.  The input
    /// bits are passed as an integer, with the bit that
    /// will be on top in the least significant digit, and
    /// the rest following in order from there.
    ///
    /// Pushing zero bits leaves the stack untouched.
    ///
    /// Note that unless you are running a debug build, no
    /// effort is made to verify that only the first n
    /// bits of the passed value are used.  So if other
    /// bits are non-zero this will produce incorrect results.
    pub fn push_n(&mut self, value: u8, count: u8) {
        let word_bits = size_of::<u64>() * 8;
        let value_bits = size_of::<u8>() * 8;
        let count = count as usize;

        // Check the width first: the shifts below depend on it.
        debug_assert!(count <= value_bits);
        // Verify no bits outside of the n-bit range
        debug_assert!(count >= value_bits || ((value as usize) >> count) == 0);

        // A zero-width push is a no-op.  It must not reach the shifts
        // below, because `word_bits - 0` is not a valid shift amount for
        // a `u64`.
        if count == 0 {
            return;
        }

        // Verify no bitstack overflow
        debug_assert!((self.data.1 >> (word_bits - count)) == 0);

        self.data.1 = (self.data.1 << count) | (self.data.0 >> (word_bits - count));
        self.data.0 <<= count;
        self.data.0 |= value as u64;
    }

    /// Pop the top bit off the stack.
    pub fn pop(&mut self) -> bool {
        let word_bits = size_of::<u64>() * 8;

        let b = (self.data.0 & 1) != 0;
        // Refill the top of the low word from the bottom of the high word.
        self.data.0 = (self.data.0 >> 1) | (self.data.1 << (word_bits - 1));
        self.data.1 >>= 1;
        b
    }

    /// Pop the top n bits off the stack.  The bits are returned as
    /// an integer, with the top bit in the least significant digit,
    /// and the rest following in order from there.
    ///
    /// Popping zero bits returns 0 and leaves the stack untouched.
    pub fn pop_n(&mut self, n: usize) -> u64 {
        let word_bits = size_of::<u64>() * 8;

        debug_assert!(n < (size_of::<BitStack128>() * 8)); // Can't pop more than we have
        debug_assert!(n < word_bits); // Can't pop more than the return type can hold

        // A zero-width pop is a no-op.  It must not reach the shifts
        // below, because `word_bits - 0` is not a valid shift amount for
        // a `u64`.
        if n == 0 {
            return 0;
        }

        let b = self.data.0 & ((1u64 << n) - 1);
        self.data.0 = (self.data.0 >> n) | (self.data.1 << (word_bits - n));
        self.data.1 >>= n;
        b
    }

    /// Pop the top n bits off the stack, but return only the nth bit.
    ///
    /// `n` must be at least 1; this is only checked in debug builds.
    pub fn pop_to_nth(&mut self, n: usize) -> bool {
        let word_bits = size_of::<u64>() * 8;

        debug_assert!(n > 0);
        debug_assert!(n < (size_of::<BitStack128>() * 8)); // Can't pop more than we have
        debug_assert!(n < word_bits); // Can't pop more than the return type can hold

        let b = (self.data.0 & (1u64 << (n - 1))) != 0;
        self.data.0 = (self.data.0 >> n) | (self.data.1 << (word_bits - n));
        self.data.1 >>= n;
        b
    }

    /// Read the top bit of the stack without popping it.
    pub fn peek(&self) -> bool {
        (self.data.0 & 1) != 0
    }

    /// Read the top n bits of the stack without popping them.  The bits
    /// are returned as an integer, with the top bit in the least
    /// significant digit, and the rest following in order from there.
    pub fn peek_n(&self, n: usize) -> u64 {
        // Can't return more than we have
        debug_assert!(n < (size_of::<BitStack128>() * 8));
        // Can't return more than the return type can hold
        debug_assert!(n < (size_of::<u64>() * 8));

        self.data.0 & ((1u64 << n) - 1)
    }
 }
 #[cfg(test)]
 mod tests {
    use super::*;

    // The eight-bit pattern most tests are built around.
    const BYTE_PATTERN: u64 = 0b10110111;

    // `BYTE_PATTERN` repeated eight times, filling a whole storage word.
    const WORD_PATTERN: u64 =
        0b1011011110110111101101111011011110110111101101111011011110110111;

    // `BYTE_PATTERN` as individual bits in the order they must be pushed
    // (most significant bit first).
    const PUSH_ORDER: [bool; 8] = [true, false, true, true, false, true, true, true];

    // `BYTE_PATTERN` as individual bits in the order they come off the
    // stack (least significant bit first).
    const POP_ORDER: [bool; 8] = [true, true, true, false, true, true, false, true];

    // Multi-nibble value used by the `pop_n` and `pop_to_nth` tests.
    const NIBBLES: u64 = 0b0010_1000_1100_1110_0101_0111;

    // What remains in the low word after each successive 4-bit pop of
    // `NIBBLES`.
    const NIBBLES_REMAINING: [u64; 6] = [
        0b0010_1000_1100_1110_0101,
        0b0010_1000_1100_1110,
        0b0010_1000_1100,
        0b0010_1000,
        0b0010,
        0,
    ];

    /// Single-bit pushes build up the pattern in the low word only.
    #[test]
    fn push() {
        let mut bs = BitStack128::new();
        for &bit in PUSH_ORDER.iter() {
            bs.push(bit);
        }
        assert!(bs.data.0 == BYTE_PATTERN);
        assert!(bs.data.1 == 0);
    }

    /// Seventy-two single-bit pushes spill one byte into the high word.
    #[test]
    fn push_overflow() {
        let mut bs = BitStack128::new();
        for _ in 0..9 {
            for &bit in PUSH_ORDER.iter() {
                bs.push(bit);
            }
        }
        assert!(bs.data.0 == WORD_PATTERN);
        assert!(bs.data.1 == BYTE_PATTERN);
    }

    /// Single-bit pops return the low word's bits, lowest first.
    #[test]
    fn pop() {
        let mut bs = BitStack128::new();
        bs.data.0 = BYTE_PATTERN;
        for &expected in POP_ORDER.iter() {
            assert!(bs.pop() == expected);
        }
    }

    /// Single-bit pops keep working once bits flow in from the high word.
    #[test]
    fn pop_overflow() {
        let mut bs = BitStack128::new();
        bs.data.0 = WORD_PATTERN;
        bs.data.1 = BYTE_PATTERN;
        for _ in 0..9 {
            for &expected in POP_ORDER.iter() {
                assert!(bs.pop() == expected);
            }
        }
    }

    /// Multi-bit pushes of different widths stack on top of each other.
    #[test]
    fn push_n() {
        let mut bs = BitStack128::new();
        bs.push_n(0b10110, 5);
        bs.push_n(BYTE_PATTERN as u8, 8);
        assert!(bs.data.0 == 0b1011010110111);
    }

    /// Nine byte-wide pushes spill one byte into the high word.
    #[test]
    fn push_n_overflow() {
        let mut bs = BitStack128::new();
        for _ in 0..9 {
            bs.push_n(BYTE_PATTERN as u8, 8);
        }
        assert!(bs.data.0 == WORD_PATTERN);
        assert!(bs.data.1 == BYTE_PATTERN);
    }

    /// Each 4-bit pop returns the lowest nibble and shifts the rest down.
    #[test]
    fn pop_n() {
        let popped: [u64; 6] = [0b0111, 0b0101, 0b1110, 0b1100, 0b1000, 0b0010];

        let mut bs = BitStack128::new();
        bs.data.0 = NIBBLES;
        for (&value, &remaining) in popped.iter().zip(NIBBLES_REMAINING.iter()) {
            assert!(bs.pop_n(4) == value);
            assert!(bs.data.0 == remaining);
        }
    }

    /// Byte-wide pops keep working once bits flow in from the high word.
    #[test]
    fn pop_n_overflow() {
        let mut bs = BitStack128::new();
        bs.data.0 = WORD_PATTERN;
        bs.data.1 = BYTE_PATTERN;
        for _ in 0..9 {
            assert!(bs.pop_n(8) == BYTE_PATTERN);
        }
    }

    /// Each call drops four bits and reports the highest of those four.
    #[test]
    fn pop_to_nth() {
        let fourth_bits = [false, false, true, true, true, false];

        let mut bs = BitStack128::new();
        bs.data.0 = NIBBLES;
        for (&bit, &remaining) in fourth_bits.iter().zip(NIBBLES_REMAINING.iter()) {
            assert!(bs.pop_to_nth(4) == bit);
            assert!(bs.data.0 == remaining);
        }
    }

    /// `pop_to_nth` keeps working once bits flow in from the high word.
    #[test]
    fn pop_to_nth_overflow() {
        let mut bs = BitStack128::new();
        bs.data.0 = 0b00110111_10110111_00110111_10110111_00110111_10110111_00110111_10110111;
        bs.data.1 = 0b00110111_10110111;
        for _ in 0..5 {
            assert!(bs.pop_to_nth(8) == true);
            assert!(bs.pop_to_nth(8) == false);
        }
    }

    /// `peek` reports the current top bit as bits are popped one by one.
    #[test]
    fn peek() {
        let mut bs = BitStack128::new();
        bs.data.0 = BYTE_PATTERN;
        for (i, &expected) in POP_ORDER.iter().enumerate() {
            // Pop between peeks, but not before the first one.
            if i > 0 {
                bs.pop();
            }
            assert!(bs.peek() == expected);
        }
    }

    /// `peek_n` reports the current top nibble as nibbles are popped.
    #[test]
    fn peek_n() {
        let mut bs = BitStack128::new();
        bs.data.0 = BYTE_PATTERN;
        for &nibble in [0b0111u64, 0b1011u64].iter() {
            assert!(bs.peek_n(4) == nibble);
            bs.pop_n(4);
        }
    }
 }
--- a/src/main.rs
+++ b/src/main.rs
@ -23,8 +23,6 @@ extern crate lazy_static;
 mod accel;
 mod algorithm;
 mod bbox;
 mod bbox4;
 mod bitstack;
 mod boundable;
 mod camera;
 mod color;
@ -61,9 +59,7 @@ use ray::{Ray, AccelRay};
 use surface::SurfaceIntersection;
 use renderer::LightPath;
 use bbox::BBox;
 use bbox4::BBox4;
 use accel::BVHNode;
 use accel::BVH4Node;
 use timer::Timer;
@ -130,9 +126,7 @@ fn main() {
                 mem::size_of::<SurfaceIntersection>());
        println!("LightPath size: {} bytes", mem::size_of::<LightPath>());
        println!("BBox size: {} bytes", mem::size_of::<BBox>());
        println!("BBox4 size: {} bytes", mem::size_of::<BBox4>());
        println!("BVHNode size: {} bytes", mem::size_of::<BVHNode>());
        println!("BVH4Node size: {} bytes", mem::size_of::<BVH4Node>());
        return;
    }
--- a/src/ray.rs
+++ b/src/ray.rs
@ -2,7 +2,6 @@
 use std;
 use bitstack::BitStack128;
 use float4::Float4;
 use math::{Vector, Point, Matrix4x4};
@ -59,7 +58,6 @@ pub struct AccelRay {
    pub time: f32,
    pub flags: u32,
    pub id: u32,
    pub trav_stack: BitStack128,
 }
 impl AccelRay {
@ -71,7 +69,6 @@ impl AccelRay {
            time: ray.time,
            flags: ray.flags,
            id: id,
            trav_stack: BitStack128::new_with_1(),
        }
    }