diff --git a/src/accel/bvh4.rs b/src/accel/bvh4.rs index 6ee9525..d7e68e1 100644 --- a/src/accel/bvh4.rs +++ b/src/accel/bvh4.rs @@ -1,13 +1,17 @@ +//! This BVH4 implementation is based on the ideas from the paper +//! "Efficient Ray Tracing Kernels for Modern CPU Architectures" +//! by Fuetterling et al. + #![allow(dead_code)] -use bvh_order::{calc_traversal_code, SplitAxes, TRAVERSAL_TABLE}; -use math3d::Vector; use mem_arena::MemArena; use crate::{ bbox::BBox, + bbox4::BBox4, boundable::Boundable, lerp::lerp_slice, + math::Vector, ray::{RayBatch, RayStack}, timer::Timer, }; @@ -17,6 +21,9 @@ use super::{ ACCEL_NODE_RAY_TESTS, ACCEL_TRAV_TIME, }; +use bvh_order::{calc_traversal_code, SplitAxes, TRAVERSAL_TABLE}; +use float4::Bool4; + pub fn ray_code(dir: Vector) -> usize { let ray_sign_is_neg = [dir.x() < 0.0, dir.y() < 0.0, dir.z() < 0.0]; ray_sign_is_neg[0] as usize @@ -28,20 +35,19 @@ pub fn ray_code(dir: Vector) -> usize { pub struct BVH4<'a> { root: Option<&'a BVH4Node<'a>>, depth: usize, + node_count: usize, + _bounds: Option<&'a [BBox]>, } #[derive(Copy, Clone, Debug)] pub enum BVH4Node<'a> { - Inner { - traversal_code: u8, - bounds_start: &'a BBox, - bounds_len: u16, + Internal { + bounds: &'a [BBox4], children: &'a [BVH4Node<'a>], + traversal_code: u8, }, Leaf { - bounds_start: &'a BBox, - bounds_len: u16, object_range: (usize, usize), }, } @@ -56,19 +62,32 @@ impl<'a> BVH4<'a> { where F: 'b + Fn(&T) -> &'b [BBox], { - if objects.is_empty() { + if objects.len() == 0 { BVH4 { root: None, depth: 0, + node_count: 0, + _bounds: None, } } else { let base = BVHBase::from_objects(objects, objects_per_leaf, bounder); - let root = unsafe { arena.alloc_uninitialized::() }; - BVH4::construct_from_base(arena, &base, base.root_node_index(), root); + let fill_node = unsafe { arena.alloc_uninitialized_with_alignment::(32) }; + let node_count = BVH4::construct_from_base( + arena, + &base, + &base.nodes[base.root_node_index()], + fill_node, + ); + BVH4 { - root: Some(root), - depth: base.depth, + root: Some(fill_node), + depth: (base.depth / 2) + 1, + node_count: node_count, + _bounds: { + let range = base.nodes[base.root_node_index()].bounds_range(); + Some(arena.copy_slice(&base.bounds[range.0..range.1])) + }, } } } @@ -103,117 +122,63 @@ impl<'a> BVH4<'a> { while stack_ptr > 0 { node_tests += ray_stack.ray_count_in_next_task() as u64; - match *node_stack[stack_ptr] { - BVH4Node::Inner { - traversal_code, - bounds_start, - bounds_len, + match node_stack[stack_ptr] { + &BVH4Node::Internal { + bounds, children, + traversal_code, } => { - // Test rays against bbox. - let bounds = - unsafe { std::slice::from_raw_parts(bounds_start, bounds_len as usize) }; + let mut all_hits = Bool4::new_false(); - let mut hit_count = 0; - ray_stack.pop_do_next_task(children.len(), |ray_idx| { - let hit = (!rays.is_done(ray_idx)) - && lerp_slice(bounds, rays.time(ray_idx)).intersect_ray( + // Ray testing + ray_stack.pop_do_next_task_and_push_rays(children.len(), |ray_idx| { + if rays.is_done(ray_idx) { + (Bool4::new_false(), 0) + } else { + let hits = lerp_slice(bounds, rays.time(ray_idx)).intersect_ray( rays.orig_local(ray_idx), rays.dir_inv_local(ray_idx), rays.max_t(ray_idx), ); - - if hit { - hit_count += 1; - ([0, 1, 2, 3], children.len()) - } else { - ([0; 4], 0) + all_hits = all_hits | hits; + (hits, children.len()) } }); // If there were any intersections, create tasks. - if hit_count > 0 { + if !all_hits.is_all_false() { let order_code = traversal_table[traversal_code as usize]; - match children.len() { - 4 => { - let i4 = ((order_code >> 6) & 0b11) as usize; - let i3 = ((order_code >> 4) & 0b11) as usize; - let i2 = ((order_code >> 2) & 0b11) as usize; - let i1 = (order_code & 0b11) as usize; - - ray_stack.push_lanes_to_tasks(&[i4, i3, i2, i1]); - - node_stack[stack_ptr] = &children[i4]; - node_stack[stack_ptr + 1] = &children[i3]; - node_stack[stack_ptr + 2] = &children[i2]; - node_stack[stack_ptr + 3] = &children[i1]; - - stack_ptr += 3; + let mut lanes = [0usize; 4]; + let mut lane_count = 0; + for i in 0..children.len() { + let inv_i = (children.len() - 1) - i; + let child_i = ((order_code >> (inv_i * 2)) & 3) as usize; + if all_hits.get_n(child_i) { + node_stack[stack_ptr + lane_count] = &children[child_i]; + lanes[lane_count] = child_i; + lane_count += 1; } - 3 => { - let i3 = ((order_code >> 4) & 0b11) as usize; - let i2 = ((order_code >> 2) & 0b11) as usize; - let i1 = (order_code & 0b11) as usize; - - ray_stack.push_lanes_to_tasks(&[i3, i2, i1]); - - node_stack[stack_ptr] = &children[i3]; - node_stack[stack_ptr + 1] = &children[i2]; - node_stack[stack_ptr + 2] = &children[i1]; - - stack_ptr += 2; - } - 2 => { - let i2 = ((order_code >> 2) & 0b11) as usize; - let i1 = (order_code & 0b11) as usize; - - ray_stack.push_lanes_to_tasks(&[i2, i1]); - - node_stack[stack_ptr] = &children[i2]; - node_stack[stack_ptr + 1] = &children[i1]; - - stack_ptr += 1; - } - _ => unreachable!(), } + + ray_stack.push_lanes_to_tasks(&lanes[..lane_count]); + stack_ptr += lane_count - 1; } else { stack_ptr -= 1; } } - BVH4Node::Leaf { - object_range, - bounds_start, - bounds_len, - } => { - // Test rays against bounds. - let bounds = - unsafe { std::slice::from_raw_parts(bounds_start, bounds_len as usize) }; - let object_count = object_range.1 - object_range.0; - let mut hit_count = 0; - - ray_stack.pop_do_next_task(object_count, |ray_idx| { - let hit = (!rays.is_done(ray_idx)) - && lerp_slice(bounds, rays.time(ray_idx)).intersect_ray( - rays.orig_local(ray_idx), - rays.dir_inv_local(ray_idx), - rays.max_t(ray_idx), - ); - if hit { - hit_count += 1; - ([0, 1, 2, 3], object_count) - } else { - ([0; 4], 0) - } - }); - + &BVH4Node::Leaf { object_range } => { trav_time += timer.tick() as f64; - if hit_count > 0 { - ray_stack.push_lanes_to_tasks(&[0, 1, 2, 3, 4, 5, 6, 7][..object_count]); - for obj in &objects[object_range.0..object_range.1] { - obj_ray_test(obj, rays, ray_stack); - } + // Set up the tasks for each object. + let obj_count = object_range.1 - object_range.0; + for _ in 0..(obj_count - 1) { + ray_stack.duplicate_next_task(); + } + + // Do the ray tests. + for obj in &objects[object_range.0..object_range.1] { + obj_ray_test(obj, rays, ray_stack); } timer.tick(); @@ -237,12 +202,15 @@ impl<'a> BVH4<'a> { fn construct_from_base( arena: &'a MemArena, base: &BVHBase, - node_index: usize, - node_mem: &mut BVH4Node<'a>, - ) { - match base.nodes[node_index] { - BVHBaseNode::Internal { - bounds_range, + node: &BVHBaseNode, + fill_node: &mut BVH4Node<'a>, + ) -> usize { + let mut node_count = 0; + + match node { + // Create internal node + &BVHBaseNode::Internal { + bounds_range: _, children_indices, split_axis, } => { @@ -251,7 +219,7 @@ impl<'a> BVH4<'a> { // Prepare convenient access to the stuff we need. let child_count: usize; - let child_indices: [usize; 4]; + let children; // [Optional, Optional, Optional, Optional] let split_info: SplitAxes; match *child_l { BVHBaseNode::Internal { @@ -267,13 +235,23 @@ impl<'a> BVH4<'a> { } => { // Four nodes child_count = 4; - child_indices = [i_l.0, i_l.1, i_r.0, i_r.1]; + children = [ + Some(&base.nodes[i_l.0]), + Some(&base.nodes[i_l.1]), + Some(&base.nodes[i_r.0]), + Some(&base.nodes[i_r.1]), + ]; split_info = SplitAxes::Full((split_axis, s_l, s_r)); } BVHBaseNode::Leaf { .. } => { // Three nodes with left split child_count = 3; - child_indices = [i_l.0, i_l.1, children_indices.1, 0]; + children = [ + Some(&base.nodes[i_l.0]), + Some(&base.nodes[i_l.1]), + Some(child_r), + None, + ]; split_info = SplitAxes::Left((split_axis, s_l)); } } @@ -287,76 +265,112 @@ impl<'a> BVH4<'a> { } => { // Three nodes with right split child_count = 3; - child_indices = [children_indices.0, i_r.0, i_r.1, 0]; + children = [ + Some(child_l), + Some(&base.nodes[i_r.0]), + Some(&base.nodes[i_r.1]), + None, + ]; split_info = SplitAxes::Right((split_axis, s_r)); } BVHBaseNode::Leaf { .. } => { // Two nodes child_count = 2; - child_indices = [children_indices.0, children_indices.1, 0, 0]; + children = [Some(child_l), Some(child_r), None, None]; split_info = SplitAxes::TopOnly(split_axis); } } } } - // Copy bounds - let bounds = arena - .copy_slice_with_alignment(&base.bounds[bounds_range.0..bounds_range.1], 32); + node_count += child_count; - // Build children - let children_mem = unsafe { + // Construct bounds + let bounds = { + let bounds_len = children + .iter() + .map(|c| { + if let &Some(n) = c { + let len = n.bounds_range().1 - n.bounds_range().0; + debug_assert!(len >= 1); + len + } else { + 0 + } + }) + .max() + .unwrap(); + debug_assert!(bounds_len >= 1); + let bounds = + unsafe { arena.alloc_array_uninitialized_with_alignment(bounds_len, 32) }; + if bounds_len < 2 { + let b1 = + children[0].map_or(BBox::new(), |c| base.bounds[c.bounds_range().0]); + let b2 = + children[1].map_or(BBox::new(), |c| base.bounds[c.bounds_range().0]); + let b3 = + children[2].map_or(BBox::new(), |c| base.bounds[c.bounds_range().0]); + let b4 = + children[3].map_or(BBox::new(), |c| base.bounds[c.bounds_range().0]); + bounds[0] = BBox4::from_bboxes(b1, b2, b3, b4); + } else { + for (i, b) in bounds.iter_mut().enumerate() { + let time = i as f32 / (bounds_len - 1) as f32; + + let b1 = children[0].map_or(BBox::new(), |c| { + let (x, y) = c.bounds_range(); + lerp_slice(&base.bounds[x..y], time) + }); + let b2 = children[1].map_or(BBox::new(), |c| { + let (x, y) = c.bounds_range(); + lerp_slice(&base.bounds[x..y], time) + }); + let b3 = children[2].map_or(BBox::new(), |c| { + let (x, y) = c.bounds_range(); + lerp_slice(&base.bounds[x..y], time) + }); + let b4 = children[3].map_or(BBox::new(), |c| { + let (x, y) = c.bounds_range(); + lerp_slice(&base.bounds[x..y], time) + }); + *b = BBox4::from_bboxes(b1, b2, b3, b4); + } + } + bounds + }; + + // Construct child nodes + let child_nodes = unsafe { arena.alloc_array_uninitialized_with_alignment::(child_count, 32) }; - for i in 0..child_count { - BVH4::construct_from_base(arena, base, child_indices[i], &mut children_mem[i]); + for (i, c) in children[0..child_count].iter().enumerate() { + node_count += + BVH4::construct_from_base(arena, base, c.unwrap(), &mut child_nodes[i]); } - // Fill in node - *node_mem = BVH4Node::Inner { + // Build this node + *fill_node = BVH4Node::Internal { + bounds: bounds, + children: child_nodes, traversal_code: calc_traversal_code(split_info), - bounds_start: &bounds[0], - bounds_len: bounds.len() as u16, - children: children_mem, }; } - BVHBaseNode::Leaf { - bounds_range, - object_range, - } => { - let bounds = arena.copy_slice(&base.bounds[bounds_range.0..bounds_range.1]); - - *node_mem = BVH4Node::Leaf { - bounds_start: &bounds[0], - bounds_len: bounds.len() as u16, + // Create internal node + &BVHBaseNode::Leaf { object_range, .. } => { + *fill_node = BVH4Node::Leaf { object_range: object_range, }; + node_count += 1; } } - } -} -lazy_static! { - static ref DEGENERATE_BOUNDS: [BBox; 1] = [BBox::new()]; + return node_count; + } } impl<'a> Boundable for BVH4<'a> { - fn bounds(&self) -> &[BBox] { - match self.root { - None => &DEGENERATE_BOUNDS[..], - Some(root) => match *root { - BVH4Node::Inner { - bounds_start, - bounds_len, - .. - } - | BVH4Node::Leaf { - bounds_start, - bounds_len, - .. - } => unsafe { std::slice::from_raw_parts(bounds_start, bounds_len as usize) }, - }, - } + fn bounds<'b>(&'b self) -> &'b [BBox] { + self._bounds.unwrap_or(&[]) } } diff --git a/src/accel/bvh4_simd.rs b/src/accel/bvh4_simd.rs deleted file mode 100644 index 2ad0848..0000000 --- a/src/accel/bvh4_simd.rs +++ /dev/null @@ -1,386 +0,0 @@ -//! This BVH4 implementation pulls a lot of ideas from the paper -//! "Efficient Ray Tracing Kernels for Modern CPU Architectures" -//! by Fuetterling et al. -//! -//! Specifically, the table-based traversal order approach they -//! propose is largely followed by this implementation. - -#![allow(dead_code)] - -use mem_arena::MemArena; - -use crate::{ - bbox::BBox, - bbox4::BBox4, - boundable::Boundable, - lerp::lerp_slice, - math::Vector, - ray::{RayBatch, RayStack}, - timer::Timer, -}; - -use super::{ - bvh_base::{BVHBase, BVHBaseNode, BVH_MAX_DEPTH}, - ACCEL_NODE_RAY_TESTS, ACCEL_TRAV_TIME, -}; - -use bvh_order::{calc_traversal_code, SplitAxes, TRAVERSAL_TABLE}; -use float4::Bool4; - -pub fn ray_code(dir: Vector) -> usize { - let ray_sign_is_neg = [dir.x() < 0.0, dir.y() < 0.0, dir.z() < 0.0]; - ray_sign_is_neg[0] as usize - + ((ray_sign_is_neg[1] as usize) << 1) - + ((ray_sign_is_neg[2] as usize) << 2) -} - -#[derive(Copy, Clone, Debug)] -pub struct BVH4<'a> { - root: Option<&'a BVH4Node<'a>>, - depth: usize, - node_count: usize, - _bounds: Option<&'a [BBox]>, -} - -#[derive(Copy, Clone, Debug)] -pub enum BVH4Node<'a> { - Internal { - bounds: &'a [BBox4], - children: &'a [BVH4Node<'a>], - traversal_code: u8, - }, - - Leaf { - object_range: (usize, usize), - }, -} - -impl<'a> BVH4<'a> { - pub fn from_objects<'b, T, F>( - arena: &'a MemArena, - objects: &mut [T], - objects_per_leaf: usize, - bounder: F, - ) -> BVH4<'a> - where - F: 'b + Fn(&T) -> &'b [BBox], - { - if objects.len() == 0 { - BVH4 { - root: None, - depth: 0, - node_count: 0, - _bounds: None, - } - } else { - let base = BVHBase::from_objects(objects, objects_per_leaf, bounder); - - let fill_node = unsafe { arena.alloc_uninitialized_with_alignment::(32) }; - let node_count = BVH4::construct_from_base( - arena, - &base, - &base.nodes[base.root_node_index()], - fill_node, - ); - - BVH4 { - root: Some(fill_node), - depth: (base.depth / 2) + 1, - node_count: node_count, - _bounds: { - let range = base.nodes[base.root_node_index()].bounds_range(); - Some(arena.copy_slice(&base.bounds[range.0..range.1])) - }, - } - } - } - - pub fn tree_depth(&self) -> usize { - self.depth - } - - pub fn traverse( - &self, - rays: &mut RayBatch, - ray_stack: &mut RayStack, - objects: &[T], - mut obj_ray_test: F, - ) where - F: FnMut(&T, &mut RayBatch, &mut RayStack), - { - if self.root.is_none() { - return; - } - - let mut trav_time: f64 = 0.0; - let mut timer = Timer::new(); - - let traversal_table = - &TRAVERSAL_TABLE[ray_code(rays.dir_inv_local(ray_stack.next_task_ray_idx(0)))]; - - // +2 of max depth for root and last child - let mut node_stack = [self.root.unwrap(); (BVH_MAX_DEPTH * 3) + 2]; - let mut stack_ptr = 1; - - while stack_ptr > 0 { - match node_stack[stack_ptr] { - &BVH4Node::Internal { - bounds, - children, - traversal_code, - } => { - let mut all_hits = Bool4::new(); - - // Ray testing - ray_stack.pop_do_next_task(children.len(), |ray_idx| { - if rays.is_done(ray_idx) { - ([0; 4], 0) - } else { - let hits = lerp_slice(bounds, rays.time(ray_idx)).intersect_ray( - rays.orig_local(ray_idx), - rays.dir_inv_local(ray_idx), - rays.max_t(ray_idx), - ); - - if !hits.all_false() { - all_hits = all_hits | hits; - let mut lanes = [0u8; 4]; - let mut lane_count = 0; - for i in 0..children.len() { - if hits.get_n(i) { - lanes[lane_count] = i as u8; - lane_count += 1; - } - } - (lanes, lane_count) - } else { - ([0; 4], 0) - } - } - }); - - // If there were any intersections, create tasks. - if !all_hits.all_false() { - let order_code = traversal_table[traversal_code as usize]; - let mut lanes = [0usize; 4]; - let mut lane_count = 0; - for i in 0..children.len() { - let inv_i = (children.len() - 1) - i; - let child_i = ((order_code >> (inv_i * 2)) & 3) as usize; - if all_hits.get_n(child_i) { - node_stack[stack_ptr + lane_count] = &children[child_i]; - lanes[lane_count] = child_i; - lane_count += 1; - } - } - - ray_stack.push_lanes_to_tasks(&lanes[..lane_count]); - stack_ptr += lane_count - 1; - } else { - stack_ptr -= 1; - } - } - - &BVH4Node::Leaf { object_range } => { - trav_time += timer.tick() as f64; - - // Set up the tasks for each object. - let obj_count = object_range.1 - object_range.0; - for _ in 0..(obj_count - 1) { - ray_stack.duplicate_next_task(); - } - - // Do the ray tests. - for obj in &objects[object_range.0..object_range.1] { - obj_ray_test(obj, rays, ray_stack); - } - - timer.tick(); - - stack_ptr -= 1; - } - } - } - - trav_time += timer.tick() as f64; - ACCEL_TRAV_TIME.with(|att| { - let v = att.get(); - att.set(v + trav_time); - }); - } - - fn construct_from_base( - arena: &'a MemArena, - base: &BVHBase, - node: &BVHBaseNode, - fill_node: &mut BVH4Node<'a>, - ) -> usize { - let mut node_count = 0; - - match node { - // Create internal node - &BVHBaseNode::Internal { - bounds_range: _, - children_indices, - split_axis, - } => { - let child_l = &base.nodes[children_indices.0]; - let child_r = &base.nodes[children_indices.1]; - - // Prepare convenient access to the stuff we need. - let child_count: usize; - let children; // [Optional, Optional, Optional, Optional] - let split_info: SplitAxes; - match *child_l { - BVHBaseNode::Internal { - children_indices: i_l, - split_axis: s_l, - .. - } => { - match *child_r { - BVHBaseNode::Internal { - children_indices: i_r, - split_axis: s_r, - .. - } => { - // Four nodes - child_count = 4; - children = [ - Some(&base.nodes[i_l.0]), - Some(&base.nodes[i_l.1]), - Some(&base.nodes[i_r.0]), - Some(&base.nodes[i_r.1]), - ]; - split_info = SplitAxes::Full((split_axis, s_l, s_r)); - } - BVHBaseNode::Leaf { .. } => { - // Three nodes with left split - child_count = 3; - children = [ - Some(&base.nodes[i_l.0]), - Some(&base.nodes[i_l.1]), - Some(child_r), - None, - ]; - split_info = SplitAxes::Left((split_axis, s_l)); - } - } - } - BVHBaseNode::Leaf { .. } => { - match *child_r { - BVHBaseNode::Internal { - children_indices: i_r, - split_axis: s_r, - .. - } => { - // Three nodes with right split - child_count = 3; - children = [ - Some(child_l), - Some(&base.nodes[i_r.0]), - Some(&base.nodes[i_r.1]), - None, - ]; - split_info = SplitAxes::Right((split_axis, s_r)); - } - BVHBaseNode::Leaf { .. } => { - // Two nodes - child_count = 2; - children = [Some(child_l), Some(child_r), None, None]; - split_info = SplitAxes::TopOnly(split_axis); - } - } - } - } - - node_count += child_count; - - // Construct bounds - let bounds = { - let bounds_len = children - .iter() - .map(|c| { - if let &Some(n) = c { - let len = n.bounds_range().1 - n.bounds_range().0; - debug_assert!(len >= 1); - len - } else { - 0 - } - }) - .max() - .unwrap(); - debug_assert!(bounds_len >= 1); - let bounds = - unsafe { arena.alloc_array_uninitialized_with_alignment(bounds_len, 32) }; - if bounds_len < 2 { - let b1 = - children[0].map_or(BBox::new(), |c| base.bounds[c.bounds_range().0]); - let b2 = - children[1].map_or(BBox::new(), |c| base.bounds[c.bounds_range().0]); - let b3 = - children[2].map_or(BBox::new(), |c| base.bounds[c.bounds_range().0]); - let b4 = - children[3].map_or(BBox::new(), |c| base.bounds[c.bounds_range().0]); - bounds[0] = BBox4::from_bboxes(b1, b2, b3, b4); - } else { - for (i, b) in bounds.iter_mut().enumerate() { - let time = i as f32 / (bounds_len - 1) as f32; - - let b1 = children[0].map_or(BBox::new(), |c| { - let (x, y) = c.bounds_range(); - lerp_slice(&base.bounds[x..y], time) - }); - let b2 = children[1].map_or(BBox::new(), |c| { - let (x, y) = c.bounds_range(); - lerp_slice(&base.bounds[x..y], time) - }); - let b3 = children[2].map_or(BBox::new(), |c| { - let (x, y) = c.bounds_range(); - lerp_slice(&base.bounds[x..y], time) - }); - let b4 = children[3].map_or(BBox::new(), |c| { - let (x, y) = c.bounds_range(); - lerp_slice(&base.bounds[x..y], time) - }); - *b = BBox4::from_bboxes(b1, b2, b3, b4); - } - } - bounds - }; - - // Construct child nodes - let child_nodes = unsafe { - arena.alloc_array_uninitialized_with_alignment::(child_count, 32) - }; - for (i, c) in children[0..child_count].iter().enumerate() { - node_count += - BVH4::construct_from_base(arena, base, c.unwrap(), &mut child_nodes[i]); - } - - // Build this node - *fill_node = BVH4Node::Internal { - bounds: bounds, - children: child_nodes, - traversal_code: calc_traversal_code(split_info), - }; - } - - // Create internal node - &BVHBaseNode::Leaf { object_range, .. } => { - *fill_node = BVH4Node::Leaf { - object_range: object_range, - }; - node_count += 1; - } - } - - return node_count; - } -} - -impl<'a> Boundable for BVH4<'a> { - fn bounds<'b>(&'b self) -> &'b [BBox] { - self._bounds.unwrap_or(&[]) - } -} diff --git a/src/accel/mod.rs b/src/accel/mod.rs index 1bac6d7..abbb1d4 100644 --- a/src/accel/mod.rs +++ b/src/accel/mod.rs @@ -1,6 +1,5 @@ // mod bvh; mod bvh4; -mod bvh4_simd; mod bvh_base; mod light_array; mod light_tree; @@ -15,7 +14,7 @@ use crate::{ pub use self::{ // bvh::{BVHNode, BVH}, - bvh4_simd::{ray_code, BVH4Node, BVH4}, + bvh4::{ray_code, BVH4Node, BVH4}, light_array::LightArray, light_tree::LightTree, }; diff --git a/src/light/rectangle_light.rs b/src/light/rectangle_light.rs index 8df2890..db01072 100644 --- a/src/light/rectangle_light.rs +++ b/src/light/rectangle_light.rs @@ -265,7 +265,7 @@ impl<'a> Surface for RectangleLight<'a> { ) { let _ = shader; // Silence 'unused' warning - ray_stack.pop_do_next_task(0, |ray_idx| { + ray_stack.pop_do_next_task(|ray_idx| { let time = rays.time(ray_idx); let orig = rays.orig(ray_idx); let dir = rays.dir(ray_idx); @@ -332,8 +332,6 @@ impl<'a> Surface for RectangleLight<'a> { } } } - - ([0; 4], 0) }); } } diff --git a/src/light/sphere_light.rs b/src/light/sphere_light.rs index 8c596a8..e17371f 100644 --- a/src/light/sphere_light.rs +++ b/src/light/sphere_light.rs @@ -214,7 +214,7 @@ impl<'a> Surface for SphereLight<'a> { ) { let _ = shader; // Silence 'unused' warning - ray_stack.pop_do_next_task(0, |ray_idx| { + ray_stack.pop_do_next_task(|ray_idx| { let time = rays.time(ray_idx); // Get the transform space @@ -242,7 +242,7 @@ impl<'a> Surface for SphereLight<'a> { let discriminant = (b * b) - (4.0 * a * c); if discriminant < 0.0 { // Discriminant less than zero? No solution => no intersection. - return ([0; 4], 0); + return; } let discriminant = discriminant.sqrt(); @@ -268,7 +268,7 @@ impl<'a> Surface for SphereLight<'a> { // Check our intersection for validity against this ray's extents if t0 > rays.max_t(ray_idx) || t1 <= 0.0 { // Didn't hit because sphere is entirely outside of ray's extents - return ([0; 4], 0); + return; } let t = if t0 > 0.0 { @@ -278,7 +278,7 @@ impl<'a> Surface for SphereLight<'a> { } else { // Didn't hit because ray is entirely within the sphere, and // therefore doesn't hit its surface. - return ([0; 4], 0); + return; }; // We hit the sphere, so calculate intersection info. @@ -334,8 +334,6 @@ impl<'a> Surface for SphereLight<'a> { // Set ray's max t rays.set_max_t(ray_idx, t); } - - ([0; 4], 0) }); } } diff --git a/src/ray.rs b/src/ray.rs index 4312f32..2fa92de 100644 --- a/src/ray.rs +++ b/src/ray.rs @@ -1,6 +1,6 @@ #![allow(dead_code)] -use float4::Float4; +use float4::{Bool4, Float4}; use crate::math::{Matrix4x4, Point, Vector}; @@ -293,11 +293,31 @@ impl RayStack { } /// Pops the next task off the stack, and executes the provided closure for - /// each ray index in the task. The return value of the closure is the list - /// of lanes (by index) to add the given ray index back into. - pub fn pop_do_next_task(&mut self, needed_lanes: usize, mut handle_ray: F) + /// each ray index in the task. + pub fn pop_do_next_task(&mut self, mut handle_ray: F) where - F: FnMut(usize) -> ([u8; 4], usize), + F: FnMut(usize), + { + // Pop the task and do necessary bookkeeping. + let task = self.tasks.pop().unwrap(); + let task_range = (task.start_idx, self.lanes[task.lane].end_len); + self.lanes[task.lane].end_len = task.start_idx; + + // Execute task. + for i in task_range.0..task_range.1 { + let ray_idx = self.lanes[task.lane].idxs[i]; + handle_ray(ray_idx as usize); + } + + self.lanes[task.lane].idxs.truncate(task_range.0); + } + + /// Pops the next task off the stack, executes the provided closure for + /// each ray index in the task, and pushes the ray indices back onto the + /// indicated lanes. + pub fn pop_do_next_task_and_push_rays(&mut self, needed_lanes: usize, mut handle_ray: F) + where + F: FnMut(usize) -> (Bool4, usize), { // Prepare lanes. self.ensure_lane_count(needed_lanes); @@ -311,13 +331,15 @@ impl RayStack { let mut source_lane_cap = task_range.0; for i in task_range.0..task_range.1 { let ray_idx = self.lanes[task.lane].idxs[i]; - let (add_list, list_len) = handle_ray(ray_idx as usize); - for &l in &add_list[..list_len] { - if l == task.lane as u8 { - self.lanes[l as usize].idxs[source_lane_cap] = ray_idx; - source_lane_cap += 1; - } else { - self.lanes[l as usize].idxs.push(ray_idx); + let (push_mask, c) = handle_ray(ray_idx as usize); + for l in 0..c { + if push_mask.get_n(l) { + if l == task.lane { + self.lanes[l as usize].idxs[source_lane_cap] = ray_idx; + source_lane_cap += 1; + } else { + self.lanes[l as usize].idxs.push(ray_idx); + } } } } diff --git a/src/surface/triangle_mesh.rs b/src/surface/triangle_mesh.rs index 906b7a5..1b54232 100644 --- a/src/surface/triangle_mesh.rs +++ b/src/surface/triangle_mesh.rs @@ -157,7 +157,7 @@ impl<'a> Surface for TriangleMesh<'a> { }; // Test each ray against the current triangle. - ray_stack.pop_do_next_task(0, |ray_idx| { + ray_stack.pop_do_next_task(|ray_idx| { let ray_idx = ray_idx as usize; let ray_time = rays.time(ray_idx); @@ -275,8 +275,6 @@ impl<'a> Surface for TriangleMesh<'a> { rays.set_max_t(ray_idx, t); } } - - ([0; 4], 0) }); }, ); diff --git a/src/tracer.rs b/src/tracer.rs index 8ba78c3..e733cdd 100644 --- a/src/tracer.rs +++ b/src/tracer.rs @@ -12,6 +12,8 @@ use crate::{ transform_stack::TransformStack, }; +use float4::Bool4; + pub struct Tracer<'a> { ray_stack: RayStack, inner: TracerInner<'a>, @@ -96,10 +98,10 @@ impl<'a> TracerInner<'a> { // Do transforms // TODO: re-divide rays based on direction (maybe?). let xforms = self.xform_stack.top(); - ray_stack.pop_do_next_task(2, |ray_idx| { + ray_stack.pop_do_next_task_and_push_rays(2, |ray_idx| { let t = rays.time(ray_idx); rays.update_local(ray_idx, &lerp_slice(xforms, t)); - ([0, 1, 0, 0], 2) + (Bool4::new(true, true, false, false), 2) }); ray_stack.push_lanes_to_tasks(&[0, 1]); } @@ -129,16 +131,14 @@ impl<'a> TracerInner<'a> { // Undo transforms let xforms = self.xform_stack.top(); if !xforms.is_empty() { - ray_stack.pop_do_next_task(0, |ray_idx| { + ray_stack.pop_do_next_task(|ray_idx| { let t = rays.time(ray_idx); rays.update_local(ray_idx, &lerp_slice(xforms, t)); - ([0; 4], 0) }); } else { let ident = Matrix4x4::new(); - ray_stack.pop_do_next_task(0, |ray_idx| { + ray_stack.pop_do_next_task(|ray_idx| { rays.update_local(ray_idx, &ident); - ([0; 4], 0) }); } } diff --git a/sub_crates/float4/src/lib.rs b/sub_crates/float4/src/lib.rs index 99c0417..327fbf9 100644 --- a/sub_crates/float4/src/lib.rs +++ b/sub_crates/float4/src/lib.rs @@ -621,7 +621,22 @@ mod x86_64_sse { impl Bool4 { #[inline(always)] - pub fn new() -> Bool4 { + pub fn new(a: bool, b: bool, c: bool, d: bool) -> Bool4 { + use std::arch::x86_64::_mm_set_ps; + Bool4 { + data: unsafe { + _mm_set_ps( + if d { 1.0 } else { 0.0 }, + if c { 1.0 } else { 0.0 }, + if b { 1.0 } else { 0.0 }, + if a { 1.0 } else { 0.0 }, + ) + }, + } + } + + #[inline(always)] + pub fn new_false() -> Bool4 { use std::arch::x86_64::_mm_set1_ps; Bool4 { data: unsafe { _mm_set1_ps(0.0) }, @@ -667,7 +682,8 @@ mod x86_64_sse { /// /// This is the `OR` operation on all the contained bools. If even /// one bool is true, this returns true. - pub fn all_false(&self) -> bool { + #[inline(always)] + pub fn is_all_false(&self) -> bool { let a = unsafe { *(&self.data as *const __m128 as *const u128) }; a == 0 }