diff --git a/src/accel/bvh4_simd.rs b/src/accel/bvh4_simd.rs new file mode 100644 index 0000000..06a9f15 --- /dev/null +++ b/src/accel/bvh4_simd.rs @@ -0,0 +1,387 @@ +//! This BVH4 implementation pulls a lot of ideas from the paper +//! "Efficient Ray Tracing Kernels for Modern CPU Architectures" +//! by Fuetterling et al. +//! +//! Specifically, the table-based traversal order approach they +//! propose is largely followed by this implementation. + +#![allow(dead_code)] + +use mem_arena::MemArena; + +use crate::{ + bbox::BBox, + bbox4::BBox4, + boundable::Boundable, + lerp::lerp_slice, + math::Vector, + ray::{RayBatch, RayStack}, + timer::Timer, +}; + +use super::{ + bvh_base::{BVHBase, BVHBaseNode, BVH_MAX_DEPTH}, + ACCEL_NODE_RAY_TESTS, ACCEL_TRAV_TIME, +}; + +use bvh_order::{calc_traversal_code, SplitAxes, TRAVERSAL_TABLE}; + +pub fn ray_code(dir: Vector) -> usize { + let ray_sign_is_neg = [dir.x() < 0.0, dir.y() < 0.0, dir.z() < 0.0]; + ray_sign_is_neg[0] as usize + + ((ray_sign_is_neg[1] as usize) << 1) + + ((ray_sign_is_neg[2] as usize) << 2) +} + +#[derive(Copy, Clone, Debug)] +pub struct BVH4<'a> { + root: Option<&'a BVH4Node<'a>>, + depth: usize, + node_count: usize, + _bounds: Option<&'a [BBox]>, +} + +#[derive(Copy, Clone, Debug)] +pub enum BVH4Node<'a> { + Internal { + bounds: &'a [BBox4], + children: &'a [BVH4Node<'a>], + traversal_code: u8, + }, + + Leaf { + object_range: (usize, usize), + }, +} + +impl<'a> BVH4<'a> { + pub fn from_objects<'b, T, F>( + arena: &'a MemArena, + objects: &mut [T], + objects_per_leaf: usize, + bounder: F, + ) -> BVH4<'a> + where + F: 'b + Fn(&T) -> &'b [BBox], + { + if objects.len() == 0 { + BVH4 { + root: None, + depth: 0, + node_count: 0, + _bounds: None, + } + } else { + let base = BVHBase::from_objects(objects, objects_per_leaf, bounder); + + let fill_node = unsafe { arena.alloc_uninitialized_with_alignment::(32) }; + let node_count = BVH4::construct_from_base( + arena, + &base, + &base.nodes[base.root_node_index()], + fill_node, + ); + + BVH4 { + root: Some(fill_node), + depth: (base.depth / 2) + 1, + node_count: node_count, + _bounds: { + let range = base.nodes[base.root_node_index()].bounds_range(); + Some(arena.copy_slice(&base.bounds[range.0..range.1])) + }, + } + } + } + + pub fn tree_depth(&self) -> usize { + self.depth + } + + pub fn traverse( + &self, + rays: &mut RayBatch, + ray_stack: &mut RayStack, + objects: &[T], + mut obj_ray_test: F, + ) where + F: FnMut(&T, &mut RayBatch, &mut RayStack), + { + if self.root.is_none() { + return; + } + + let mut trav_time: f64 = 0.0; + let mut timer = Timer::new(); + + let traversal_table = + &TRAVERSAL_TABLE[ray_code(rays.dir_inv_local(ray_stack.next_task_ray_idx(0)))]; + + // +2 of max depth for root and last child + let mut node_stack = [self.root.unwrap(); (BVH_MAX_DEPTH * 3) + 2]; + let mut stack_ptr = 1; + + while stack_ptr > 0 { + match node_stack[stack_ptr] { + &BVH4Node::Internal { + bounds, + children, + traversal_code, + } => { + let mut all_hits = 0; + + // Ray testing + ray_stack.pop_do_next_task(children.len(), |ray_idx| { + if rays.is_done(ray_idx) { + ([0, 1, 2, 3, 4, 5, 6, 7], 0) + } else { + let hits = lerp_slice(bounds, rays.time(ray_idx)) + .intersect_ray( + rays.orig_local(ray_idx), + rays.dir_inv_local(ray_idx), + rays.max_t(ray_idx), + ) + .to_bitmask(); + + if hits != 0 { + all_hits |= hits; + let mut lanes = [0u8; 8]; + let mut lane_count = 0; + for i in 0..children.len() { + if (hits >> i) & 1 != 0 { + lanes[lane_count] = i as u8; + lane_count += 1; + } + } + (lanes, lane_count) + } else { + ([0, 1, 2, 3, 4, 5, 6, 7], 0) + } + } + }); + + // If there were any intersections, create tasks. + if all_hits > 0 { + let order_code = traversal_table[traversal_code as usize]; + let mut lanes = [0usize; 4]; + let mut lane_count = 0; + for i in 0..children.len() { + let inv_i = (children.len() - 1) - i; + let child_i = ((order_code >> (inv_i * 2)) & 3) as usize; + if ((all_hits >> child_i) & 1) != 0 { + node_stack[stack_ptr + lane_count] = &children[child_i]; + lanes[lane_count] = child_i; + lane_count += 1; + } + } + + ray_stack.push_lanes_to_tasks(&lanes[..lane_count]); + stack_ptr += lane_count - 1; + } else { + stack_ptr -= 1; + } + } + + &BVH4Node::Leaf { object_range } => { + trav_time += timer.tick() as f64; + + // Set up the tasks for each object. + let obj_count = object_range.1 - object_range.0; + for _ in 0..(obj_count - 1) { + ray_stack.duplicate_next_task(); + } + + // Do the ray tests. + for obj in &objects[object_range.0..object_range.1] { + obj_ray_test(obj, rays, ray_stack); + } + + timer.tick(); + + stack_ptr -= 1; + } + } + } + + trav_time += timer.tick() as f64; + ACCEL_TRAV_TIME.with(|att| { + let v = att.get(); + att.set(v + trav_time); + }); + } + + fn construct_from_base( + arena: &'a MemArena, + base: &BVHBase, + node: &BVHBaseNode, + fill_node: &mut BVH4Node<'a>, + ) -> usize { + let mut node_count = 0; + + match node { + // Create internal node + &BVHBaseNode::Internal { + bounds_range: _, + children_indices, + split_axis, + } => { + let child_l = &base.nodes[children_indices.0]; + let child_r = &base.nodes[children_indices.1]; + + // Prepare convenient access to the stuff we need. + let child_count: usize; + let children; // [Optional, Optional, Optional, Optional] + let split_info: SplitAxes; + match *child_l { + BVHBaseNode::Internal { + children_indices: i_l, + split_axis: s_l, + .. + } => { + match *child_r { + BVHBaseNode::Internal { + children_indices: i_r, + split_axis: s_r, + .. + } => { + // Four nodes + child_count = 4; + children = [ + Some(&base.nodes[i_l.0]), + Some(&base.nodes[i_l.1]), + Some(&base.nodes[i_r.0]), + Some(&base.nodes[i_r.1]), + ]; + split_info = SplitAxes::Full((split_axis, s_l, s_r)); + } + BVHBaseNode::Leaf { .. } => { + // Three nodes with left split + child_count = 3; + children = [ + Some(&base.nodes[i_l.0]), + Some(&base.nodes[i_l.1]), + Some(child_r), + None, + ]; + split_info = SplitAxes::Left((split_axis, s_l)); + } + } + } + BVHBaseNode::Leaf { .. } => { + match *child_r { + BVHBaseNode::Internal { + children_indices: i_r, + split_axis: s_r, + .. + } => { + // Three nodes with right split + child_count = 3; + children = [ + Some(child_l), + Some(&base.nodes[i_r.0]), + Some(&base.nodes[i_r.1]), + None, + ]; + split_info = SplitAxes::Right((split_axis, s_r)); + } + BVHBaseNode::Leaf { .. } => { + // Two nodes + child_count = 2; + children = [Some(child_l), Some(child_r), None, None]; + split_info = SplitAxes::TopOnly(split_axis); + } + } + } + } + + node_count += child_count; + + // Construct bounds + let bounds = { + let bounds_len = children + .iter() + .map(|c| { + if let &Some(n) = c { + let len = n.bounds_range().1 - n.bounds_range().0; + debug_assert!(len >= 1); + len + } else { + 0 + } + }) + .max() + .unwrap(); + debug_assert!(bounds_len >= 1); + let bounds = + unsafe { arena.alloc_array_uninitialized_with_alignment(bounds_len, 32) }; + if bounds_len < 2 { + let b1 = + children[0].map_or(BBox::new(), |c| base.bounds[c.bounds_range().0]); + let b2 = + children[1].map_or(BBox::new(), |c| base.bounds[c.bounds_range().0]); + let b3 = + children[2].map_or(BBox::new(), |c| base.bounds[c.bounds_range().0]); + let b4 = + children[3].map_or(BBox::new(), |c| base.bounds[c.bounds_range().0]); + bounds[0] = BBox4::from_bboxes(b1, b2, b3, b4); + } else { + for (i, b) in bounds.iter_mut().enumerate() { + let time = i as f32 / (bounds_len - 1) as f32; + + let b1 = children[0].map_or(BBox::new(), |c| { + let (x, y) = c.bounds_range(); + lerp_slice(&base.bounds[x..y], time) + }); + let b2 = children[1].map_or(BBox::new(), |c| { + let (x, y) = c.bounds_range(); + lerp_slice(&base.bounds[x..y], time) + }); + let b3 = children[2].map_or(BBox::new(), |c| { + let (x, y) = c.bounds_range(); + lerp_slice(&base.bounds[x..y], time) + }); + let b4 = children[3].map_or(BBox::new(), |c| { + let (x, y) = c.bounds_range(); + lerp_slice(&base.bounds[x..y], time) + }); + *b = BBox4::from_bboxes(b1, b2, b3, b4); + } + } + bounds + }; + + // Construct child nodes + let child_nodes = unsafe { + arena.alloc_array_uninitialized_with_alignment::(child_count, 32) + }; + for (i, c) in children[0..child_count].iter().enumerate() { + node_count += + BVH4::construct_from_base(arena, base, c.unwrap(), &mut child_nodes[i]); + } + + // Build this node + *fill_node = BVH4Node::Internal { + bounds: bounds, + children: child_nodes, + traversal_code: calc_traversal_code(split_info), + }; + } + + // Create internal node + &BVHBaseNode::Leaf { object_range, .. } => { + *fill_node = BVH4Node::Leaf { + object_range: object_range, + }; + node_count += 1; + } + } + + return node_count; + } +} + +impl<'a> Boundable for BVH4<'a> { + fn bounds<'b>(&'b self) -> &'b [BBox] { + self._bounds.unwrap_or(&[]) + } +} diff --git a/src/accel/mod.rs b/src/accel/mod.rs index abbb1d4..1bac6d7 100644 --- a/src/accel/mod.rs +++ b/src/accel/mod.rs @@ -1,5 +1,6 @@ // mod bvh; mod bvh4; +mod bvh4_simd; mod bvh_base; mod light_array; mod light_tree; @@ -14,7 +15,7 @@ use crate::{ pub use self::{ // bvh::{BVHNode, BVH}, - bvh4::{ray_code, BVH4Node, BVH4}, + bvh4_simd::{ray_code, BVH4Node, BVH4}, light_array::LightArray, light_tree::LightTree, }; diff --git a/src/bbox4.rs b/src/bbox4.rs new file mode 100644 index 0000000..71793a4 --- /dev/null +++ b/src/bbox4.rs @@ -0,0 +1,139 @@ +#![allow(dead_code)] + +use std; +use std::ops::{BitOr, BitOrAssign}; + +use crate::{ + bbox::BBox, + lerp::{lerp, Lerp}, + math::{Point, Vector}, +}; + +use float4::{Bool4, Float4}; + +const BBOX_MAXT_ADJUST: f32 = 1.00000024; + +/// A SIMD set of 4 3D axis-aligned bounding boxes. +#[derive(Debug, Copy, Clone)] +pub struct BBox4 { + pub x: (Float4, Float4), // (min, max) + pub y: (Float4, Float4), // (min, max) + pub z: (Float4, Float4), // (min, max) +} + +impl BBox4 { + /// Creates a degenerate BBox with +infinity min and -infinity max. + pub fn new() -> BBox4 { + BBox4 { + x: ( + Float4::splat(std::f32::INFINITY), + Float4::splat(std::f32::NEG_INFINITY), + ), + y: ( + Float4::splat(std::f32::INFINITY), + Float4::splat(std::f32::NEG_INFINITY), + ), + z: ( + Float4::splat(std::f32::INFINITY), + Float4::splat(std::f32::NEG_INFINITY), + ), + } + } + + /// Creates a BBox with min as the minimum extent and max as the maximum + /// extent. + pub fn from_bboxes(b1: BBox, b2: BBox, b3: BBox, b4: BBox) -> BBox4 { + BBox4 { + x: ( + Float4::new(b1.min.x(), b2.min.x(), b3.min.x(), b4.min.x()), + Float4::new(b1.max.x(), b2.max.x(), b3.max.x(), b4.max.x()), + ), + y: ( + Float4::new(b1.min.y(), b2.min.y(), b3.min.y(), b4.min.y()), + Float4::new(b1.max.y(), b2.max.y(), b3.max.y(), b4.max.y()), + ), + z: ( + Float4::new(b1.min.z(), b2.min.z(), b3.min.z(), b4.min.z()), + Float4::new(b1.max.z(), b2.max.z(), b3.max.z(), b4.max.z()), + ), + } + } + + // Returns whether the given ray intersects with the bboxes. + pub fn intersect_ray(&self, orig: Point, dir_inv: Vector, max_t: f32) -> Bool4 { + // Get the ray data into SIMD format. + let ro_x = orig.co.all_0(); + let ro_y = orig.co.all_1(); + let ro_z = orig.co.all_2(); + let rdi_x = dir_inv.co.all_0(); + let rdi_y = dir_inv.co.all_1(); + let rdi_z = dir_inv.co.all_2(); + let max_t = Float4::splat(max_t); + + // Slab tests + let t1_x = (self.x.0 - ro_x) * rdi_x; + let t1_y = (self.y.0 - ro_y) * rdi_y; + let t1_z = (self.z.0 - ro_z) * rdi_z; + let t2_x = (self.x.1 - ro_x) * rdi_x; + let t2_y = (self.y.1 - ro_y) * rdi_y; + let t2_z = (self.z.1 - ro_z) * rdi_z; + + // Get the far and near t hits for each axis. + let t_far_x = t1_x.v_max(t2_x); + let t_far_y = t1_y.v_max(t2_y); + let t_far_z = t1_z.v_max(t2_z); + let t_near_x = t1_x.v_min(t2_x); + let t_near_y = t1_y.v_min(t2_y); + let t_near_z = t1_z.v_min(t2_z); + + // Calculate over-all far t hit. + let far_t = + (t_far_x.v_min(t_far_y.v_min(t_far_z)) * Float4::splat(BBOX_MAXT_ADJUST)).v_min(max_t); + + // Calculate over-all near t hit. + let near_t = t_near_x + .v_max(t_near_y) + .v_max(t_near_z.v_max(Float4::splat(0.0))); + + // Hit results + near_t.lt(far_t) + } +} + +/// Union of two BBoxes. +impl BitOr for BBox4 { + type Output = BBox4; + + fn bitor(self, rhs: BBox4) -> BBox4 { + BBox4 { + x: (self.x.0.v_min(rhs.x.0), self.x.1.v_max(rhs.x.1)), + y: (self.y.0.v_min(rhs.y.0), self.y.1.v_max(rhs.y.1)), + z: (self.z.0.v_min(rhs.z.0), self.z.1.v_max(rhs.z.1)), + } + } +} + +impl BitOrAssign for BBox4 { + fn bitor_assign(&mut self, rhs: BBox4) { + *self = *self | rhs; + } +} + +impl Lerp for BBox4 { + fn lerp(self, other: BBox4, alpha: f32) -> BBox4 { + BBox4 { + x: ( + lerp(self.x.0, other.x.0, alpha), + lerp(self.x.1, other.x.1, alpha), + ), + y: ( + lerp(self.y.0, other.y.0, alpha), + lerp(self.y.1, other.y.1, alpha), + ), + z: ( + lerp(self.z.0, other.z.0, alpha), + lerp(self.z.1, other.z.1, alpha), + ), + } + } +} diff --git a/src/main.rs b/src/main.rs index bd5cf51..bd18195 100644 --- a/src/main.rs +++ b/src/main.rs @@ -17,6 +17,7 @@ extern crate lazy_static; mod accel; mod algorithm; mod bbox; +mod bbox4; mod boundable; mod camera; mod color; diff --git a/src/ray.rs b/src/ray.rs index 3b485f0..852f5c9 100644 --- a/src/ray.rs +++ b/src/ray.rs @@ -273,6 +273,25 @@ impl RayStack { } } + pub fn duplicate_next_task(&mut self) { + let task = self.tasks.last().unwrap(); + let l = task.lane; + let start = task.start_idx; + let end = self.lanes[l].end_len; + + for i in start..end { + let idx = self.lanes[l].idxs[i]; + self.lanes[l].idxs.push(idx); + } + + self.tasks.push(RayTask { + lane: l, + start_idx: end, + }); + + self.lanes[l].end_len = self.lanes[l].idxs.len(); + } + /// Pops the next task off the stack, and executes the provided closure for /// each ray index in the task. The return value of the closure is the list /// of lanes (by index) to add the given ray index back into.