diff --git a/Cargo.lock b/Cargo.lock index bc8f7b6..7e4b4ef 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -100,6 +100,11 @@ dependencies = [ name = "color" version = "0.1.0" +[[package]] +name = "copy_in_place" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "crossbeam" version = "0.3.2" @@ -239,6 +244,7 @@ dependencies = [ "bvh_order 0.1.0", "clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)", "color 0.1.0", + "copy_in_place 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)", "crossbeam 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)", "float4 0.1.0", "half 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)", @@ -557,6 +563,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" "checksum cfg-if 0.1.9 (registry+https://github.com/rust-lang/crates.io-index)" = "b486ce3ccf7ffd79fdeb678eac06a9e6c09fc88d33836340becb8fffe87c5e33" "checksum clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)" = "5067f5bb2d80ef5d68b4c87db81601f0b75bca627bc2ef76b141d7b846a3c6d9" "checksum cloudabi 0.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ddfc5b9aa5d4507acaf872de71051dfd0e309860e88966e1051e462a077aac4f" +"checksum copy_in_place 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b792a46b1ef44bb5e9a04721d34e186522431be965a283437107843d62ddbaad" "checksum crossbeam 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)" = "24ce9782d4d5c53674646a6a4c1863a21a8fc0cb649b3c94dfc16e45071dea19" "checksum fnv 1.0.6 (registry+https://github.com/rust-lang/crates.io-index)" = "2fad85553e09a6f881f739c29f0b00b0f01357c743266d478b68951ce23285f3" "checksum fuchsia-cprng 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "a06f77d526c1a601b7c4cdd98f54b5eaabffc14d5f2f0296febdc7f357c6d3ba" diff --git a/Cargo.toml b/Cargo.toml index 1e51807..14ee2ac 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -25,6 +25,7 @@ debug = 
true # Crates.io dependencies base64 = "0.9" clap = "2.30" +copy_in_place = "0.2.0" crossbeam = "0.3" half = "1.0" lazy_static = "1.0" diff --git a/src/accel/bvh4.rs b/src/accel/bvh4.rs index 11766bc..5b09e0f 100644 --- a/src/accel/bvh4.rs +++ b/src/accel/bvh4.rs @@ -1,36 +1,52 @@ +//! This BVH4 implementation is based on the ideas from the paper +//! "Efficient Ray Tracing Kernels for Modern CPU Architectures" +//! by Fuetterling et al. + #![allow(dead_code)] -use bvh_order::{calc_traversal_code, SplitAxes, TRAVERSAL_TABLE}; use mem_arena::MemArena; use crate::{ - algorithm::partition, bbox::BBox, boundable::Boundable, lerp::lerp_slice, ray::AccelRay, - timer::Timer, + bbox::BBox, + bbox4::BBox4, + boundable::Boundable, + lerp::lerp_slice, + math::Vector, + ray::{RayBatch, RayStack}, }; use super::{ bvh_base::{BVHBase, BVHBaseNode, BVH_MAX_DEPTH}, - ACCEL_NODE_RAY_TESTS, ACCEL_TRAV_TIME, + ACCEL_NODE_RAY_TESTS, }; +use bvh_order::{calc_traversal_code, SplitAxes, TRAVERSAL_TABLE}; +use float4::Bool4; + +pub fn ray_code(dir: Vector) -> usize { + let ray_sign_is_neg = [dir.x() < 0.0, dir.y() < 0.0, dir.z() < 0.0]; + ray_sign_is_neg[0] as usize + + ((ray_sign_is_neg[1] as usize) << 1) + + ((ray_sign_is_neg[2] as usize) << 2) +} + #[derive(Copy, Clone, Debug)] pub struct BVH4<'a> { root: Option<&'a BVH4Node<'a>>, depth: usize, + node_count: usize, + _bounds: Option<&'a [BBox]>, } #[derive(Copy, Clone, Debug)] pub enum BVH4Node<'a> { - Inner { - traversal_code: u8, - bounds_start: &'a BBox, - bounds_len: u16, + Internal { + bounds: &'a [BBox4], children: &'a [BVH4Node<'a>], + traversal_code: u8, }, Leaf { - bounds_start: &'a BBox, - bounds_len: u16, object_range: (usize, usize), }, } @@ -45,19 +61,32 @@ impl<'a> BVH4<'a> { where F: 'b + Fn(&T) -> &'b [BBox], { - if objects.is_empty() { + if objects.len() == 0 { BVH4 { root: None, depth: 0, + node_count: 0, + _bounds: None, } } else { let base = BVHBase::from_objects(objects, objects_per_leaf, bounder); - let root = 
unsafe { arena.alloc_uninitialized::() }; - BVH4::construct_from_base(arena, &base, base.root_node_index(), root); + let fill_node = unsafe { arena.alloc_uninitialized_with_alignment::(32) }; + let node_count = BVH4::construct_from_base( + arena, + &base, + &base.nodes[base.root_node_index()], + fill_node, + ); + BVH4 { - root: Some(root), - depth: base.depth, + root: Some(fill_node), + depth: (base.depth / 2) + 1, + node_count: node_count, + _bounds: { + let range = base.nodes[base.root_node_index()].bounds_range(); + Some(arena.copy_slice(&base.bounds[range.0..range.1])) + }, } } } @@ -66,135 +95,85 @@ impl<'a> BVH4<'a> { self.depth } - pub fn traverse(&self, rays: &mut [AccelRay], objects: &[T], mut obj_ray_test: F) + pub fn traverse(&self, rays: &mut RayBatch, ray_stack: &mut RayStack, mut obj_ray_test: F) where - F: FnMut(&T, &mut [AccelRay]), + F: FnMut(std::ops::Range, &mut RayBatch, &mut RayStack), { if self.root.is_none() { return; } - let mut timer = Timer::new(); - let mut trav_time: f64 = 0.0; let mut node_tests: u64 = 0; - let traversal_table = { - let ray_sign_is_neg = [ - rays[0].dir_inv.x() < 0.0, - rays[0].dir_inv.y() < 0.0, - rays[0].dir_inv.z() < 0.0, - ]; - let ray_code = ray_sign_is_neg[0] as usize - + ((ray_sign_is_neg[1] as usize) << 1) - + ((ray_sign_is_neg[2] as usize) << 2); - &TRAVERSAL_TABLE[ray_code] - }; + let traversal_table = + &TRAVERSAL_TABLE[ray_code(rays.dir_inv_local(ray_stack.next_task_ray_idx(0)))]; // +2 of max depth for root and last child let mut node_stack = [self.root.unwrap(); (BVH_MAX_DEPTH * 3) + 2]; - let mut ray_i_stack = [rays.len(); (BVH_MAX_DEPTH * 3) + 2]; let mut stack_ptr = 1; while stack_ptr > 0 { - node_tests += ray_i_stack[stack_ptr] as u64; - match *node_stack[stack_ptr] { - BVH4Node::Inner { - traversal_code, - bounds_start, - bounds_len, + match node_stack[stack_ptr] { + &BVH4Node::Internal { + bounds, children, + traversal_code, } => { - let bounds = - unsafe { std::slice::from_raw_parts(bounds_start, 
bounds_len as usize) }; - let part = partition(&mut rays[..ray_i_stack[stack_ptr]], |r| { - (!r.is_done()) && lerp_slice(bounds, r.time).intersect_accel_ray(r) - }); - if part > 0 { - let order_code = traversal_table[traversal_code as usize]; - match children.len() { - 4 => { - let i4 = ((order_code >> 6) & 0b11) as usize; - let i3 = ((order_code >> 4) & 0b11) as usize; - let i2 = ((order_code >> 2) & 0b11) as usize; - let i1 = (order_code & 0b11) as usize; + node_tests += ray_stack.ray_count_in_next_task() as u64; + let mut all_hits = Bool4::new_false(); - ray_i_stack[stack_ptr] = part; - ray_i_stack[stack_ptr + 1] = part; - ray_i_stack[stack_ptr + 2] = part; - ray_i_stack[stack_ptr + 3] = part; - - node_stack[stack_ptr] = &children[i4]; - node_stack[stack_ptr + 1] = &children[i3]; - node_stack[stack_ptr + 2] = &children[i2]; - node_stack[stack_ptr + 3] = &children[i1]; - - stack_ptr += 3; - } - 3 => { - let i3 = ((order_code >> 4) & 0b11) as usize; - let i2 = ((order_code >> 2) & 0b11) as usize; - let i1 = (order_code & 0b11) as usize; - - ray_i_stack[stack_ptr] = part; - ray_i_stack[stack_ptr + 1] = part; - ray_i_stack[stack_ptr + 2] = part; - - node_stack[stack_ptr] = &children[i3]; - node_stack[stack_ptr + 1] = &children[i2]; - node_stack[stack_ptr + 2] = &children[i1]; - - stack_ptr += 2; - } - 2 => { - let i2 = ((order_code >> 2) & 0b11) as usize; - let i1 = (order_code & 0b11) as usize; - - ray_i_stack[stack_ptr] = part; - ray_i_stack[stack_ptr + 1] = part; - - node_stack[stack_ptr] = &children[i2]; - node_stack[stack_ptr + 1] = &children[i1]; - - stack_ptr += 1; - } - _ => unreachable!(), + // Ray testing + ray_stack.pop_do_next_task_and_push_rays(children.len(), |ray_idx| { + if rays.is_done(ray_idx) { + Bool4::new_false() + } else { + let hits = if bounds.len() == 1 { + bounds[0].intersect_ray( + rays.orig_local(ray_idx), + rays.dir_inv_local(ray_idx), + rays.max_t(ray_idx), + ) + } else { + lerp_slice(bounds, rays.time(ray_idx)).intersect_ray( + 
rays.orig_local(ray_idx), + rays.dir_inv_local(ray_idx), + rays.max_t(ray_idx), + ) + }; + all_hits = all_hits | hits; + hits } + }); + + // If there were any intersections, create tasks. + if !all_hits.is_all_false() { + let order_code = traversal_table[traversal_code as usize]; + let mut lane_count = 0; + let mut i = children.len() as u8; + while i > 0 { + i -= 1; + let child_i = ((order_code >> (i * 2)) & 3) as usize; + if ray_stack.push_lane_to_task(child_i) { + node_stack[stack_ptr + lane_count] = &children[child_i]; + lane_count += 1; + } + } + + stack_ptr += lane_count - 1; } else { stack_ptr -= 1; } } - BVH4Node::Leaf { - object_range, - bounds_start, - bounds_len, - } => { - let bounds = - unsafe { std::slice::from_raw_parts(bounds_start, bounds_len as usize) }; - let part = partition(&mut rays[..ray_i_stack[stack_ptr]], |r| { - (!r.is_done()) && lerp_slice(bounds, r.time).intersect_accel_ray(r) - }); - - trav_time += timer.tick() as f64; - - if part > 0 { - for obj in &objects[object_range.0..object_range.1] { - obj_ray_test(obj, &mut rays[..part]); - } - } - - timer.tick(); + &BVH4Node::Leaf { object_range } => { + // Do the ray tests. + obj_ray_test(object_range.0..object_range.1, rays, ray_stack); stack_ptr -= 1; } } } - trav_time += timer.tick() as f64; - ACCEL_TRAV_TIME.with(|att| { - let v = att.get(); - att.set(v + trav_time); - }); ACCEL_NODE_RAY_TESTS.with(|anv| { let v = anv.get(); anv.set(v + node_tests); @@ -204,12 +183,15 @@ impl<'a> BVH4<'a> { fn construct_from_base( arena: &'a MemArena, base: &BVHBase, - node_index: usize, - node_mem: &mut BVH4Node<'a>, - ) { - match base.nodes[node_index] { - BVHBaseNode::Internal { - bounds_range, + node: &BVHBaseNode, + fill_node: &mut BVH4Node<'a>, + ) -> usize { + let mut node_count = 0; + + match node { + // Create internal node + &BVHBaseNode::Internal { + bounds_range: _, children_indices, split_axis, } => { @@ -218,7 +200,7 @@ impl<'a> BVH4<'a> { // Prepare convenient access to the stuff we need. 
let child_count: usize; - let child_indices: [usize; 4]; + let children; // [Optional, Optional, Optional, Optional] let split_info: SplitAxes; match *child_l { BVHBaseNode::Internal { @@ -234,13 +216,23 @@ impl<'a> BVH4<'a> { } => { // Four nodes child_count = 4; - child_indices = [i_l.0, i_l.1, i_r.0, i_r.1]; + children = [ + Some(&base.nodes[i_l.0]), + Some(&base.nodes[i_l.1]), + Some(&base.nodes[i_r.0]), + Some(&base.nodes[i_r.1]), + ]; split_info = SplitAxes::Full((split_axis, s_l, s_r)); } BVHBaseNode::Leaf { .. } => { // Three nodes with left split child_count = 3; - child_indices = [i_l.0, i_l.1, children_indices.1, 0]; + children = [ + Some(&base.nodes[i_l.0]), + Some(&base.nodes[i_l.1]), + Some(child_r), + None, + ]; split_info = SplitAxes::Left((split_axis, s_l)); } } @@ -254,76 +246,112 @@ impl<'a> BVH4<'a> { } => { // Three nodes with right split child_count = 3; - child_indices = [children_indices.0, i_r.0, i_r.1, 0]; + children = [ + Some(child_l), + Some(&base.nodes[i_r.0]), + Some(&base.nodes[i_r.1]), + None, + ]; split_info = SplitAxes::Right((split_axis, s_r)); } BVHBaseNode::Leaf { .. 
} => { // Two nodes child_count = 2; - child_indices = [children_indices.0, children_indices.1, 0, 0]; + children = [Some(child_l), Some(child_r), None, None]; split_info = SplitAxes::TopOnly(split_axis); } } } } - // Copy bounds - let bounds = arena - .copy_slice_with_alignment(&base.bounds[bounds_range.0..bounds_range.1], 32); + node_count += child_count; - // Build children - let children_mem = unsafe { + // Construct bounds + let bounds = { + let bounds_len = children + .iter() + .map(|c| { + if let &Some(n) = c { + let len = n.bounds_range().1 - n.bounds_range().0; + debug_assert!(len >= 1); + len + } else { + 0 + } + }) + .max() + .unwrap(); + debug_assert!(bounds_len >= 1); + let bounds = + unsafe { arena.alloc_array_uninitialized_with_alignment(bounds_len, 32) }; + if bounds_len < 2 { + let b1 = + children[0].map_or(BBox::new(), |c| base.bounds[c.bounds_range().0]); + let b2 = + children[1].map_or(BBox::new(), |c| base.bounds[c.bounds_range().0]); + let b3 = + children[2].map_or(BBox::new(), |c| base.bounds[c.bounds_range().0]); + let b4 = + children[3].map_or(BBox::new(), |c| base.bounds[c.bounds_range().0]); + bounds[0] = BBox4::from_bboxes(b1, b2, b3, b4); + } else { + for (i, b) in bounds.iter_mut().enumerate() { + let time = i as f32 / (bounds_len - 1) as f32; + + let b1 = children[0].map_or(BBox::new(), |c| { + let (x, y) = c.bounds_range(); + lerp_slice(&base.bounds[x..y], time) + }); + let b2 = children[1].map_or(BBox::new(), |c| { + let (x, y) = c.bounds_range(); + lerp_slice(&base.bounds[x..y], time) + }); + let b3 = children[2].map_or(BBox::new(), |c| { + let (x, y) = c.bounds_range(); + lerp_slice(&base.bounds[x..y], time) + }); + let b4 = children[3].map_or(BBox::new(), |c| { + let (x, y) = c.bounds_range(); + lerp_slice(&base.bounds[x..y], time) + }); + *b = BBox4::from_bboxes(b1, b2, b3, b4); + } + } + bounds + }; + + // Construct child nodes + let child_nodes = unsafe { arena.alloc_array_uninitialized_with_alignment::(child_count, 32) }; - 
for i in 0..child_count { - BVH4::construct_from_base(arena, base, child_indices[i], &mut children_mem[i]); + for (i, c) in children[0..child_count].iter().enumerate() { + node_count += + BVH4::construct_from_base(arena, base, c.unwrap(), &mut child_nodes[i]); } - // Fill in node - *node_mem = BVH4Node::Inner { + // Build this node + *fill_node = BVH4Node::Internal { + bounds: bounds, + children: child_nodes, traversal_code: calc_traversal_code(split_info), - bounds_start: &bounds[0], - bounds_len: bounds.len() as u16, - children: children_mem, }; } - BVHBaseNode::Leaf { - bounds_range, - object_range, - } => { - let bounds = arena.copy_slice(&base.bounds[bounds_range.0..bounds_range.1]); - - *node_mem = BVH4Node::Leaf { - bounds_start: &bounds[0], - bounds_len: bounds.len() as u16, + // Create leaf node + &BVHBaseNode::Leaf { object_range, .. } => { + *fill_node = BVH4Node::Leaf { object_range: object_range, }; + node_count += 1; } } - } -} -lazy_static! { - static ref DEGENERATE_BOUNDS: [BBox; 1] = [BBox::new()]; + return node_count; + } } impl<'a> Boundable for BVH4<'a> { - fn bounds(&self) -> &[BBox] { - match self.root { - None => &DEGENERATE_BOUNDS[..], - Some(root) => match *root { - BVH4Node::Inner { - bounds_start, - bounds_len, - .. - } - | BVH4Node::Leaf { - bounds_start, - bounds_len, - .. - } => unsafe { std::slice::from_raw_parts(bounds_start, bounds_len as usize) }, - }, - } + fn bounds<'b>(&'b self) -> &'b [BBox] { + self._bounds.unwrap_or(&[]) } } diff --git a/src/accel/mod.rs b/src/accel/mod.rs index fe8ee3d..ba83a3a 100644 --- a/src/accel/mod.rs +++ b/src/accel/mod.rs @@ -1,4 +1,4 @@ -mod bvh; +// mod bvh; mod bvh4; mod bvh_base; mod light_array; @@ -13,15 +13,14 @@ use crate::{ }; pub use self::{ - bvh::{BVHNode, BVH}, - bvh4::{BVH4Node, BVH4}, + // bvh::{BVHNode, BVH}, + bvh4::{ray_code, BVH4Node, BVH4}, light_array::LightArray, light_tree::LightTree, }; // Track BVH traversal time thread_local!
{ - pub static ACCEL_TRAV_TIME: Cell = Cell::new(0.0); pub static ACCEL_NODE_RAY_TESTS: Cell = Cell::new(0); } diff --git a/src/bbox.rs b/src/bbox.rs index 33d3e6e..a4a43bb 100644 --- a/src/bbox.rs +++ b/src/bbox.rs @@ -7,8 +7,7 @@ use std::{ use crate::{ lerp::{lerp, lerp_slice, Lerp}, - math::{fast_minf32, Matrix4x4, Point}, - ray::AccelRay, + math::{fast_minf32, Matrix4x4, Point, Vector}, }; const BBOX_MAXT_ADJUST: f32 = 1.000_000_24; @@ -40,17 +39,17 @@ impl BBox { } // Returns whether the given ray intersects with the bbox. - pub fn intersect_accel_ray(&self, ray: &AccelRay) -> bool { + pub fn intersect_ray(&self, orig: Point, dir_inv: Vector, max_t: f32) -> bool { // Calculate slab intersections - let t1 = (self.min.co - ray.orig.co) * ray.dir_inv.co; - let t2 = (self.max.co - ray.orig.co) * ray.dir_inv.co; + let t1 = (self.min.co - orig.co) * dir_inv.co; + let t2 = (self.max.co - orig.co) * dir_inv.co; // Find the far and near intersection let mut far_t = t1.v_max(t2); let mut near_t = t1.v_min(t2); far_t.set_3(std::f32::INFINITY); near_t.set_3(0.0); - let far_hit_t = fast_minf32(far_t.h_min() * BBOX_MAXT_ADJUST, ray.max_t); + let far_hit_t = fast_minf32(far_t.h_min() * BBOX_MAXT_ADJUST, max_t); let near_hit_t = near_t.h_max(); // Did we hit? diff --git a/src/bbox4.rs b/src/bbox4.rs new file mode 100644 index 0000000..71793a4 --- /dev/null +++ b/src/bbox4.rs @@ -0,0 +1,139 @@ +#![allow(dead_code)] + +use std; +use std::ops::{BitOr, BitOrAssign}; + +use crate::{ + bbox::BBox, + lerp::{lerp, Lerp}, + math::{Point, Vector}, +}; + +use float4::{Bool4, Float4}; + +const BBOX_MAXT_ADJUST: f32 = 1.00000024; + +/// A SIMD set of 4 3D axis-aligned bounding boxes. +#[derive(Debug, Copy, Clone)] +pub struct BBox4 { + pub x: (Float4, Float4), // (min, max) + pub y: (Float4, Float4), // (min, max) + pub z: (Float4, Float4), // (min, max) +} + +impl BBox4 { + /// Creates a degenerate BBox with +infinity min and -infinity max. 
+ pub fn new() -> BBox4 { + BBox4 { + x: ( + Float4::splat(std::f32::INFINITY), + Float4::splat(std::f32::NEG_INFINITY), + ), + y: ( + Float4::splat(std::f32::INFINITY), + Float4::splat(std::f32::NEG_INFINITY), + ), + z: ( + Float4::splat(std::f32::INFINITY), + Float4::splat(std::f32::NEG_INFINITY), + ), + } + } + + /// Creates a BBox4 that packs the min and max extents of the four given + /// BBoxes, one box per Float4 component. + pub fn from_bboxes(b1: BBox, b2: BBox, b3: BBox, b4: BBox) -> BBox4 { + BBox4 { + x: ( + Float4::new(b1.min.x(), b2.min.x(), b3.min.x(), b4.min.x()), + Float4::new(b1.max.x(), b2.max.x(), b3.max.x(), b4.max.x()), + ), + y: ( + Float4::new(b1.min.y(), b2.min.y(), b3.min.y(), b4.min.y()), + Float4::new(b1.max.y(), b2.max.y(), b3.max.y(), b4.max.y()), + ), + z: ( + Float4::new(b1.min.z(), b2.min.z(), b3.min.z(), b4.min.z()), + Float4::new(b1.max.z(), b2.max.z(), b3.max.z(), b4.max.z()), + ), + } + } + + // Returns whether the given ray intersects with the bboxes. + pub fn intersect_ray(&self, orig: Point, dir_inv: Vector, max_t: f32) -> Bool4 { + // Get the ray data into SIMD format. + let ro_x = orig.co.all_0(); + let ro_y = orig.co.all_1(); + let ro_z = orig.co.all_2(); + let rdi_x = dir_inv.co.all_0(); + let rdi_y = dir_inv.co.all_1(); + let rdi_z = dir_inv.co.all_2(); + let max_t = Float4::splat(max_t); + + // Slab tests + let t1_x = (self.x.0 - ro_x) * rdi_x; + let t1_y = (self.y.0 - ro_y) * rdi_y; + let t1_z = (self.z.0 - ro_z) * rdi_z; + let t2_x = (self.x.1 - ro_x) * rdi_x; + let t2_y = (self.y.1 - ro_y) * rdi_y; + let t2_z = (self.z.1 - ro_z) * rdi_z; + + // Get the far and near t hits for each axis. + let t_far_x = t1_x.v_max(t2_x); + let t_far_y = t1_y.v_max(t2_y); + let t_far_z = t1_z.v_max(t2_z); + let t_near_x = t1_x.v_min(t2_x); + let t_near_y = t1_y.v_min(t2_y); + let t_near_z = t1_z.v_min(t2_z); + + // Calculate over-all far t hit.
+ let far_t = + (t_far_x.v_min(t_far_y.v_min(t_far_z)) * Float4::splat(BBOX_MAXT_ADJUST)).v_min(max_t); + + // Calculate over-all near t hit. + let near_t = t_near_x + .v_max(t_near_y) + .v_max(t_near_z.v_max(Float4::splat(0.0))); + + // Hit results + near_t.lt(far_t) + } +} + +/// Union of two BBoxes. +impl BitOr for BBox4 { + type Output = BBox4; + + fn bitor(self, rhs: BBox4) -> BBox4 { + BBox4 { + x: (self.x.0.v_min(rhs.x.0), self.x.1.v_max(rhs.x.1)), + y: (self.y.0.v_min(rhs.y.0), self.y.1.v_max(rhs.y.1)), + z: (self.z.0.v_min(rhs.z.0), self.z.1.v_max(rhs.z.1)), + } + } +} + +impl BitOrAssign for BBox4 { + fn bitor_assign(&mut self, rhs: BBox4) { + *self = *self | rhs; + } +} + +impl Lerp for BBox4 { + fn lerp(self, other: BBox4, alpha: f32) -> BBox4 { + BBox4 { + x: ( + lerp(self.x.0, other.x.0, alpha), + lerp(self.x.1, other.x.1, alpha), + ), + y: ( + lerp(self.y.0, other.y.0, alpha), + lerp(self.y.1, other.y.1, alpha), + ), + z: ( + lerp(self.z.0, other.z.0, alpha), + lerp(self.z.1, other.z.1, alpha), + ), + } + } +} diff --git a/src/camera.rs b/src/camera.rs index e3ed8c5..287805c 100644 --- a/src/camera.rs +++ b/src/camera.rs @@ -92,6 +92,12 @@ impl<'a> Camera<'a> { ) .normalized(); - Ray::new(orig * transform, dir * transform, time, wavelength, false) + Ray { + orig: orig * transform, + dir: dir * transform, + time: time, + wavelength: wavelength, + max_t: std::f32::INFINITY, + } } } diff --git a/src/light/rectangle_light.rs b/src/light/rectangle_light.rs index 98bae49..e399d68 100644 --- a/src/light/rectangle_light.rs +++ b/src/light/rectangle_light.rs @@ -6,7 +6,7 @@ use crate::{ color::{Color, SpectralSample}, lerp::lerp_slice, math::{cross, dot, Matrix4x4, Normal, Point, Vector}, - ray::{AccelRay, Ray}, + ray::{RayBatch, RayStack}, sampling::{ spherical_triangle_solid_angle, triangle_surface_area, uniform_sample_spherical_triangle, uniform_sample_triangle, @@ -257,20 +257,23 @@ impl<'a> SurfaceLight for RectangleLight<'a> { impl<'a> Surface for 
RectangleLight<'a> { fn intersect_rays( &self, - accel_rays: &mut [AccelRay], - wrays: &[Ray], + rays: &mut RayBatch, + ray_stack: &mut RayStack, isects: &mut [SurfaceIntersection], shader: &SurfaceShader, space: &[Matrix4x4], ) { let _ = shader; // Silence 'unused' warning - for r in accel_rays.iter_mut() { - let wr = &wrays[r.id as usize]; + ray_stack.pop_do_next_task(|ray_idx| { + let time = rays.time(ray_idx); + let orig = rays.orig(ray_idx); + let dir = rays.dir(ray_idx); + let max_t = rays.max_t(ray_idx); // Calculate time interpolated values - let dim = lerp_slice(self.dimensions, r.time); - let xform = lerp_slice(space, r.time); + let dim = lerp_slice(self.dimensions, time); + let xform = lerp_slice(space, time); let space_inv = xform.inverse(); @@ -281,18 +284,19 @@ impl<'a> Surface for RectangleLight<'a> { let p4 = Point::new(dim.0 * 0.5, dim.1 * -0.5, 0.0) * space_inv; // Test against two triangles that make up the light + let ray_pre = triangle::RayTriPrecompute::new(dir); for tri in &[(p1, p2, p3), (p3, p4, p1)] { - if let Some((t, b0, b1, b2)) = triangle::intersect_ray(wr, *tri) { - if t < r.max_t { - if r.is_occlusion() { - isects[r.id as usize] = SurfaceIntersection::Occlude; - r.mark_done(); + if let Some((t, b0, b1, b2)) = triangle::intersect_ray(orig, ray_pre, max_t, *tri) { + if t < max_t { + if rays.is_occlusion(ray_idx) { + isects[ray_idx] = SurfaceIntersection::Occlude; + rays.mark_done(ray_idx); } else { let (pos, pos_err) = triangle::surface_point(*tri, (b0, b1, b2)); let normal = cross(tri.0 - tri.1, tri.0 - tri.2).into_normal(); let intersection_data = SurfaceIntersectionData { - incoming: wr.dir, + incoming: dir, t: t, pos: pos, pos_err: pos_err, @@ -301,35 +305,35 @@ impl<'a> Surface for RectangleLight<'a> { local_space: xform, sample_pdf: self.sample_pdf( &xform, - wr.orig, - wr.dir, + orig, + dir, pos, - wr.wavelength, - r.time, + rays.wavelength(ray_idx), + time, ), }; let closure = { let inv_surface_area = (1.0 / (dim.0 as f64 * 
dim.1 as f64)) as f32; - let color = lerp_slice(self.colors, r.time) * inv_surface_area; + let color = lerp_slice(self.colors, time) * inv_surface_area; SurfaceClosure::Emit(color) }; // Fill in intersection - isects[r.id as usize] = SurfaceIntersection::Hit { + isects[ray_idx] = SurfaceIntersection::Hit { intersection_data: intersection_data, closure: closure, }; // Set ray's max t - r.max_t = t; + rays.set_max_t(ray_idx, t); } break; } } } - } + }); } } diff --git a/src/light/sphere_light.rs b/src/light/sphere_light.rs index 2323902..e17371f 100644 --- a/src/light/sphere_light.rs +++ b/src/light/sphere_light.rs @@ -8,7 +8,7 @@ use crate::{ color::{Color, SpectralSample}, lerp::lerp_slice, math::{coordinate_system_from_vector, dot, Matrix4x4, Normal, Point, Vector}, - ray::{AccelRay, Ray}, + ray::{RayBatch, RayStack}, sampling::{uniform_sample_cone, uniform_sample_cone_pdf, uniform_sample_sphere}, shading::surface_closure::SurfaceClosure, shading::SurfaceShader, @@ -206,26 +206,26 @@ impl<'a> SurfaceLight for SphereLight<'a> { impl<'a> Surface for SphereLight<'a> { fn intersect_rays( &self, - accel_rays: &mut [AccelRay], - wrays: &[Ray], + rays: &mut RayBatch, + ray_stack: &mut RayStack, isects: &mut [SurfaceIntersection], shader: &SurfaceShader, space: &[Matrix4x4], ) { let _ = shader; // Silence 'unused' warning - for r in accel_rays.iter_mut() { - let wr = &wrays[r.id as usize]; + ray_stack.pop_do_next_task(|ray_idx| { + let time = rays.time(ray_idx); // Get the transform space - let xform = lerp_slice(space, r.time); + let xform = lerp_slice(space, time); // Get the radius of the sphere at the ray's time - let radius = lerp_slice(self.radii, r.time); // Radius of the sphere + let radius = lerp_slice(self.radii, time); // Radius of the sphere // Get the ray origin and direction in local space - let orig = r.orig.into_vector(); - let dir = wr.dir * xform; + let orig = rays.orig(ray_idx).into_vector(); + let dir = rays.dir(ray_idx) * xform; // Code adapted to 
Rust from https://github.com/Tecla/Rayito // Ray-sphere intersection can result in either zero, one or two points @@ -242,7 +242,7 @@ impl<'a> Surface for SphereLight<'a> { let discriminant = (b * b) - (4.0 * a * c); if discriminant < 0.0 { // Discriminant less than zero? No solution => no intersection. - continue; + return; } let discriminant = discriminant.sqrt(); @@ -257,7 +257,7 @@ impl<'a> Surface for SphereLight<'a> { // Get our final parametric values let mut t0 = q / a; - let mut t1 = if q != 0.0 { c / q } else { r.max_t }; + let mut t1 = if q != 0.0 { c / q } else { rays.max_t(ray_idx) }; // Swap them so they are ordered right if t0 > t1 { @@ -266,25 +266,25 @@ impl<'a> Surface for SphereLight<'a> { } // Check our intersection for validity against this ray's extents - if t0 > r.max_t || t1 <= 0.0 { - // Didn't hit because shere is entirely outside of ray's extents - continue; + if t0 > rays.max_t(ray_idx) || t1 <= 0.0 { + // Didn't hit because sphere is entirely outside of ray's extents + return; } let t = if t0 > 0.0 { t0 - } else if t1 <= r.max_t { + } else if t1 <= rays.max_t(ray_idx) { t1 } else { // Didn't hit because ray is entirely within the sphere, and // therefore doesn't hit its surface. - continue; + return; }; // We hit the sphere, so calculate intersection info. 
- if r.is_occlusion() { - isects[r.id as usize] = SurfaceIntersection::Occlude; - r.mark_done(); + if rays.is_occlusion(ray_idx) { + isects[ray_idx] = SurfaceIntersection::Occlude; + rays.mark_done(ray_idx); } else { let inv_xform = xform.inverse(); @@ -300,7 +300,7 @@ impl<'a> Surface for SphereLight<'a> { let normal = unit_pos.into_normal() * inv_xform; let intersection_data = SurfaceIntersectionData { - incoming: wr.dir, + incoming: rays.dir(ray_idx), t: t, pos: pos, pos_err: pos_err, @@ -309,32 +309,32 @@ impl<'a> Surface for SphereLight<'a> { local_space: xform, sample_pdf: self.sample_pdf( &xform, - wr.orig, - wr.dir, + rays.orig(ray_idx), + rays.dir(ray_idx), 0.0, 0.0, - wr.wavelength, - r.time, + rays.wavelength(ray_idx), + time, ), }; let closure = { let inv_surface_area = (1.0 / (4.0 * PI_64 * radius as f64 * radius as f64)) as f32; - let color = lerp_slice(self.colors, r.time) * inv_surface_area; + let color = lerp_slice(self.colors, time) * inv_surface_area; SurfaceClosure::Emit(color) }; // Fill in intersection - isects[r.id as usize] = SurfaceIntersection::Hit { + isects[ray_idx] = SurfaceIntersection::Hit { intersection_data: intersection_data, closure: closure, }; // Set ray's max t - r.max_t = t; + rays.set_max_t(ray_idx, t); } - } + }); } } diff --git a/src/main.rs b/src/main.rs index c1f5cef..f469e98 100644 --- a/src/main.rs +++ b/src/main.rs @@ -11,12 +11,12 @@ #![allow(clippy::needless_range_loop)] #![allow(clippy::excessive_precision)] -#[macro_use] extern crate lazy_static; mod accel; mod algorithm; mod bbox; +mod bbox4; mod boundable; mod camera; mod color; @@ -47,10 +47,9 @@ use nom::{error_position, take_until}; use mem_arena::MemArena; use crate::{ - accel::{BVH4Node, BVHNode}, + accel::BVH4Node, bbox::BBox, parse::{parse_scene, DataTree}, - ray::{AccelRay, Ray}, renderer::LightPath, surface::SurfaceIntersection, timer::Timer, @@ -159,15 +158,13 @@ fn main() { // Print some misc useful dev info. 
if args.is_present("dev") { - println!("Ray size: {} bytes", mem::size_of::()); - println!("AccelRay size: {} bytes", mem::size_of::()); println!( "SurfaceIntersection size: {} bytes", mem::size_of::() ); println!("LightPath size: {} bytes", mem::size_of::()); println!("BBox size: {} bytes", mem::size_of::()); - println!("BVHNode size: {} bytes", mem::size_of::()); + // println!("BVHNode size: {} bytes", mem::size_of::()); println!("BVH4Node size: {} bytes", mem::size_of::()); return; } @@ -295,9 +292,10 @@ fn main() { "\t\tTrace: {:.3}s", ntime * rstats.trace_time ); + println!("\t\t\tRays traced: {}", rstats.ray_count); println!( - "\t\t\tTraversal: {:.3}s", - ntime * rstats.accel_traversal_time + "\t\t\tRays/sec: {}", + (rstats.ray_count as f64 / (ntime * rstats.trace_time) as f64) as u64 ); println!("\t\t\tRay/node tests: {}", rstats.accel_node_visits); println!( diff --git a/src/ray.rs b/src/ray.rs index cf91b74..7c2bc83 100644 --- a/src/ray.rs +++ b/src/ray.rs @@ -1,102 +1,401 @@ #![allow(dead_code)] -use float4::Float4; +use float4::{Bool4, Float4}; use crate::math::{Matrix4x4, Point, Vector}; -const OCCLUSION_FLAG: u32 = 1; -const DONE_FLAG: u32 = 1 << 1; +type RayIndexType = u16; +type FlagType = u8; +const OCCLUSION_FLAG: FlagType = 1; +const DONE_FLAG: FlagType = 1 << 1; +/// This is never used directly in ray tracing--it's only used as a convenience +/// for filling the RayBatch structure. 
#[derive(Debug, Copy, Clone)] pub struct Ray { pub orig: Point, pub dir: Vector, - pub max_t: f32, pub time: f32, pub wavelength: f32, - pub flags: u32, + pub max_t: f32, } -impl Ray { - pub fn new(orig: Point, dir: Vector, time: f32, wavelength: f32, is_occ: bool) -> Ray { - if !is_occ { - Ray { - orig: orig, - dir: dir, - max_t: std::f32::INFINITY, - time: time, - wavelength: wavelength, - flags: 0, - } - } else { - Ray { - orig: orig, - dir: dir, - max_t: 1.0, - time: time, - wavelength: wavelength, - flags: OCCLUSION_FLAG, - } +/// The hot (frequently accessed) parts of ray data. +#[derive(Debug, Copy, Clone)] +struct RayHot { + orig_local: Point, // Local-space ray origin + dir_inv_local: Vector, // Local-space 1.0/ray direction + max_t: f32, + time: f32, + flags: FlagType, +} + +/// The cold (infrequently accessed) parts of ray data. +#[derive(Debug, Copy, Clone)] +struct RayCold { + orig: Point, // World-space ray origin + dir: Vector, // World-space ray direction + wavelength: f32, +} + +/// A batch of rays, separated into hot and cold parts. +#[derive(Debug)] +pub struct RayBatch { + hot: Vec, + cold: Vec, +} + +impl RayBatch { + /// Creates a new empty ray batch. + pub fn new() -> RayBatch { + RayBatch { + hot: Vec::new(), + cold: Vec::new(), } } - pub fn transform(&mut self, mat: &Matrix4x4) { - self.orig = self.orig * *mat; - self.dir = self.dir * *mat; + /// Creates a new empty ray batch, with pre-allocated capacity for + /// `n` rays. 
+ pub fn with_capacity(n: usize) -> RayBatch { + RayBatch { + hot: Vec::with_capacity(n), + cold: Vec::with_capacity(n), + } } - pub fn is_occlusion(&self) -> bool { - (self.flags & OCCLUSION_FLAG) != 0 - } -} - -#[derive(Debug, Copy, Clone)] -pub struct AccelRay { - pub orig: Point, - pub dir_inv: Vector, - pub max_t: f32, - pub time: f32, - pub flags: u32, - pub id: u32, -} - -impl AccelRay { - pub fn new(ray: &Ray, id: u32) -> AccelRay { - AccelRay { - orig: ray.orig, - dir_inv: Vector { - co: Float4::splat(1.0) / ray.dir.co, - }, + pub fn push(&mut self, ray: Ray, is_occlusion: bool) { + self.hot.push(RayHot { + orig_local: ray.orig, // Bogus, to place-hold. + dir_inv_local: ray.dir, // Bogus, to place-hold. max_t: ray.max_t, time: ray.time, - flags: ray.flags, - id: id, + flags: if is_occlusion { OCCLUSION_FLAG } else { 0 }, + }); + self.cold.push(RayCold { + orig: ray.orig, + dir: ray.dir, + wavelength: ray.wavelength, + }); + } + + pub fn swap(&mut self, a: usize, b: usize) { + self.hot.swap(a, b); + self.cold.swap(a, b); + } + + pub fn set_from_ray(&mut self, ray: &Ray, is_occlusion: bool, idx: usize) { + self.hot[idx].orig_local = ray.orig; + self.hot[idx].dir_inv_local = Vector { + co: Float4::splat(1.0) / ray.dir.co, + }; + self.hot[idx].max_t = ray.max_t; + self.hot[idx].time = ray.time; + self.hot[idx].flags = if is_occlusion { OCCLUSION_FLAG } else { 0 }; + + self.cold[idx].orig = ray.orig; + self.cold[idx].dir = ray.dir; + self.cold[idx].wavelength = ray.wavelength; + } + + pub fn truncate(&mut self, len: usize) { + self.hot.truncate(len); + self.cold.truncate(len); + } + + /// Clear all rays, setting the size of the batch back to zero. + /// + /// Capacity is maintained. + pub fn clear(&mut self) { + self.hot.clear(); + self.cold.clear(); + } + + pub fn len(&self) -> usize { + self.hot.len() + } + + /// Updates the accel data of the given ray (at index `idx`) with the + /// given world-to-local-space transform matrix.
+ /// + /// This should be called when entering (and exiting) traversal of a + /// new transform space. + pub fn update_local(&mut self, idx: usize, xform: &Matrix4x4) { + self.hot[idx].orig_local = self.cold[idx].orig * *xform; + self.hot[idx].dir_inv_local = Vector { + co: Float4::splat(1.0) / (self.cold[idx].dir * *xform).co, + }; + } + + //========================================================== + // Data access + + #[inline(always)] + pub fn orig(&self, idx: usize) -> Point { + self.cold[idx].orig + } + + #[inline(always)] + pub fn dir(&self, idx: usize) -> Vector { + self.cold[idx].dir + } + + #[inline(always)] + pub fn orig_local(&self, idx: usize) -> Point { + self.hot[idx].orig_local + } + + #[inline(always)] + pub fn dir_inv_local(&self, idx: usize) -> Vector { + self.hot[idx].dir_inv_local + } + + #[inline(always)] + pub fn time(&self, idx: usize) -> f32 { + self.hot[idx].time + } + + #[inline(always)] + pub fn max_t(&self, idx: usize) -> f32 { + self.hot[idx].max_t + } + + #[inline(always)] + pub fn set_max_t(&mut self, idx: usize, new_max_t: f32) { + self.hot[idx].max_t = new_max_t; + } + + #[inline(always)] + pub fn wavelength(&self, idx: usize) -> f32 { + self.cold[idx].wavelength + } + + /// Returns whether the given ray (at index `idx`) is an occlusion ray. + #[inline(always)] + pub fn is_occlusion(&self, idx: usize) -> bool { + (self.hot[idx].flags & OCCLUSION_FLAG) != 0 + } + + /// Returns whether the given ray (at index `idx`) has finished traversal. + #[inline(always)] + pub fn is_done(&self, idx: usize) -> bool { + (self.hot[idx].flags & DONE_FLAG) != 0 + } + + /// Marks the given ray (at index `idx`) as an occlusion ray. + #[inline(always)] + pub fn mark_occlusion(&mut self, idx: usize) { + self.hot[idx].flags |= OCCLUSION_FLAG + } + + /// Marks the given ray (at index `idx`) as having finished traversal. 
+ #[inline(always)] + pub fn mark_done(&mut self, idx: usize) { + self.hot[idx].flags |= DONE_FLAG + } +} + +/// A structure used for tracking traversal of a ray batch through a scene. +#[derive(Debug)] +pub struct RayStack { + lanes: Vec, + tasks: Vec, +} + +impl RayStack { + pub fn new() -> RayStack { + RayStack { + lanes: Vec::new(), + tasks: Vec::new(), } } - pub fn update_from_world_ray(&mut self, wr: &Ray) { - self.orig = wr.orig; - self.dir_inv = Vector { - co: Float4::splat(1.0) / wr.dir.co, - }; + /// Returns whether the stack is empty of tasks or not. + pub fn is_empty(&self) -> bool { + self.tasks.is_empty() } - pub fn update_from_xformed_world_ray(&mut self, wr: &Ray, mat: &Matrix4x4) { - self.orig = wr.orig * *mat; - self.dir_inv = Vector { - co: Float4::splat(1.0) / (wr.dir * *mat).co, - }; + /// Makes sure there are at least `count` lanes. + pub fn ensure_lane_count(&mut self, count: usize) { + while self.lanes.len() < count { + self.lanes.push(Lane { + idxs: Vec::new(), + end_len: 0, + }) + } } - pub fn is_occlusion(&self) -> bool { - (self.flags & OCCLUSION_FLAG) != 0 + pub fn ray_count_in_next_task(&self) -> usize { + let task = self.tasks.last().unwrap(); + let end = self.lanes[task.lane].end_len; + end - task.start_idx } - pub fn is_done(&self) -> bool { - (self.flags & DONE_FLAG) != 0 + pub fn next_task_ray_idx(&self, i: usize) -> usize { + let task = self.tasks.last().unwrap(); + let i = i + task.start_idx; + debug_assert!(i < self.lanes[task.lane].end_len); + self.lanes[task.lane].idxs[i] as usize } - pub fn mark_done(&mut self) { - self.flags |= DONE_FLAG; + /// Clears the lanes and tasks of the RayStack. + /// + /// Note: this is (importantly) different than calling clear individually + /// on the `lanes` and `tasks` members. Specifically, we don't want to + /// clear `lanes` itself, as that would also free all the memory of the + /// individual lanes. 
Instead, we want to iterate over the individual + /// lanes and clear them, but leave `lanes` itself untouched. + pub fn clear(&mut self) { + for lane in self.lanes.iter_mut() { + lane.idxs.clear(); + lane.end_len = 0; + } + + self.tasks.clear(); + } + + /// Pushes the given ray index onto the end of the specified lane. + pub fn push_ray_index(&mut self, ray_idx: usize, lane: usize) { + assert!(self.lanes.len() > lane); + self.lanes[lane].idxs.push(ray_idx as RayIndexType); + } + + /// Pushes any excess indices on the given lane to a new task on the + /// task stack. + /// + /// Returns whether a task was pushed or not. No task will be pushed + /// if there are no excess indices on the end of the lane. + pub fn push_lane_to_task(&mut self, lane_idx: usize) -> bool { + if self.lanes[lane_idx].end_len < self.lanes[lane_idx].idxs.len() { + self.tasks.push(RayTask { + lane: lane_idx, + start_idx: self.lanes[lane_idx].end_len, + }); + self.lanes[lane_idx].end_len = self.lanes[lane_idx].idxs.len(); + true + } else { + false + } + } + + /// Takes the given list of lane indices, and pushes any excess indices on + /// the end of each into a new task, in the order provided. + pub fn push_lanes_to_tasks(&mut self, lane_idxs: &[usize]) { + for &l in lane_idxs { + self.push_lane_to_task(l); + } + } + + pub fn duplicate_next_task(&mut self) { + let task = self.tasks.last().unwrap(); + let l = task.lane; + let start = task.start_idx; + let end = self.lanes[l].end_len; + + // Extend the indices vector + self.lanes[l].idxs.reserve(end - start); + let old_len = self.lanes[l].idxs.len(); + let new_len = old_len + end - start; + unsafe { + self.lanes[l].idxs.set_len(new_len); + } + + // Copy elements + copy_in_place::copy_in_place(&mut self.lanes[l].idxs, start..end, end); + + // Push the new task onto the stack + self.tasks.push(RayTask { + lane: l, + start_idx: end, + }); + + self.lanes[l].end_len = self.lanes[l].idxs.len(); + } + + // Pops the next task off the stack. 
+ pub fn pop_task(&mut self) { + let task = self.tasks.pop().unwrap(); + self.lanes[task.lane].end_len = task.start_idx; + self.lanes[task.lane].idxs.truncate(task.start_idx); + } + + // Executes a task without popping it from the task stack. + pub fn do_next_task(&mut self, mut handle_ray: F) + where + F: FnMut(usize), + { + let task = self.tasks.last().unwrap(); + let task_range = (task.start_idx, self.lanes[task.lane].end_len); + + // Execute task. + for i in task_range.0..task_range.1 { + let ray_idx = self.lanes[task.lane].idxs[i]; + handle_ray(ray_idx as usize); + } + } + + /// Pops the next task off the stack, and executes the provided closure for + /// each ray index in the task. + #[inline(always)] + pub fn pop_do_next_task(&mut self, handle_ray: F) + where + F: FnMut(usize), + { + self.do_next_task(handle_ray); + self.pop_task(); + } + + /// Pops the next task off the stack, executes the provided closure for + /// each ray index in the task, and pushes the ray indices back onto the + /// indicated lanes. + pub fn pop_do_next_task_and_push_rays(&mut self, output_lane_count: usize, mut handle_ray: F) + where + F: FnMut(usize) -> Bool4, + { + // Pop the task and do necessary bookkeeping. + let task = self.tasks.pop().unwrap(); + let task_range = (task.start_idx, self.lanes[task.lane].end_len); + self.lanes[task.lane].end_len = task.start_idx; + + // SAFETY: this is probably evil, and depends on behavior of Vec that + // are not actually promised. But we're essentially truncating the lane + // to the start of our task range, but will continue to access it's + // elements beyond that range via `get_unchecked()` below. Because the + // memory is not freed nor altered, this is safe. However, again, the + // Vec apis don't promise this behavior. So: + // + // TODO: build a slightly different lane abstraction to get this same + // efficiency without depending on implicit Vec behavior. 
+ unsafe { + self.lanes[task.lane].idxs.set_len(task.start_idx); + } + + // Execute task. + for i in task_range.0..task_range.1 { + let ray_idx = *unsafe { self.lanes[task.lane].idxs.get_unchecked(i) }; + let push_mask = handle_ray(ray_idx as usize); + for l in 0..output_lane_count { + if push_mask.get_n(l) { + self.lanes[l as usize].idxs.push(ray_idx); + } + } + } } } + +/// A lane within a RayStack. +#[derive(Debug)] +struct Lane { + idxs: Vec, + end_len: usize, +} + +/// A task within a RayStack. +// +// Specifies the lane that the relevant ray pointers are in, and the +// starting index within that lane. The relevant pointers are always +// `&[start_idx..]` within the given lane. +#[derive(Debug)] +struct RayTask { + lane: usize, + start_idx: usize, +} diff --git a/src/renderer.rs b/src/renderer.rs index 8f1471f..50d3061 100644 --- a/src/renderer.rs +++ b/src/renderer.rs @@ -12,8 +12,7 @@ use scoped_threadpool::Pool; use float4::Float4; use crate::{ - accel::{ACCEL_NODE_RAY_TESTS, ACCEL_TRAV_TIME}, - algorithm::partition_pair, + accel::ACCEL_NODE_RAY_TESTS, color::{map_0_1_to_wavelength, SpectralSample, XYZ}, fp_utils::robust_ray_origin, hash::hash_u32, @@ -21,7 +20,7 @@ use crate::{ image::Image, math::{fast_logit, upper_power_of_two}, mis::power_heuristic, - ray::Ray, + ray::{Ray, RayBatch}, scene::{Scene, SceneLightSample}, surface, timer::Timer, @@ -41,8 +40,8 @@ pub struct Renderer<'a> { #[derive(Debug, Copy, Clone)] pub struct RenderStats { pub trace_time: f64, - pub accel_traversal_time: f64, pub accel_node_visits: u64, + pub ray_count: u64, pub initial_ray_generation_time: f64, pub ray_generation_time: f64, pub sample_writing_time: f64, @@ -53,8 +52,8 @@ impl RenderStats { fn new() -> RenderStats { RenderStats { trace_time: 0.0, - accel_traversal_time: 0.0, accel_node_visits: 0, + ray_count: 0, initial_ray_generation_time: 0.0, ray_generation_time: 0.0, sample_writing_time: 0.0, @@ -64,8 +63,8 @@ impl RenderStats { fn collect(&mut self, other: 
RenderStats) { self.trace_time += other.trace_time; - self.accel_traversal_time += other.accel_traversal_time; self.accel_node_visits += other.accel_node_visits; + self.ray_count += other.ray_count; self.initial_ray_generation_time += other.initial_ray_generation_time; self.ray_generation_time += other.ray_generation_time; self.sample_writing_time += other.sample_writing_time; @@ -207,7 +206,7 @@ impl<'a> Renderer<'a> { let mut total_timer = Timer::new(); let mut paths = Vec::new(); - let mut rays = Vec::new(); + let mut rays = RayBatch::new(); let mut tracer = Tracer::from_assembly(&self.scene.root); let mut xform_stack = TransformStack::new(); @@ -266,7 +265,7 @@ impl<'a> Renderer<'a> { offset + si as u32, ); paths.push(path); - rays.push(ray); + rays.push(ray, false); } } } @@ -276,13 +275,20 @@ impl<'a> Renderer<'a> { let mut pi = paths.len(); while pi > 0 { // Test rays against scene - let isects = tracer.trace(&rays); + let isects = tracer.trace(&mut rays); stats.trace_time += timer.tick() as f64; // Determine next rays to shoot based on result - pi = partition_pair(&mut paths[..pi], &mut rays[..pi], |i, path, ray| { - path.next(&mut xform_stack, &self.scene, &isects[i], &mut *ray) - }); + let mut new_end = 0; + for i in 0..pi { + if paths[i].next(&mut xform_stack, &self.scene, &isects[i], &mut rays, i) { + paths.swap(new_end, i); + rays.swap(new_end, i); + new_end += 1; + } + } + rays.truncate(new_end); + pi = new_end; stats.ray_generation_time += timer.tick() as f64; } @@ -338,10 +344,7 @@ impl<'a> Renderer<'a> { } stats.total_time += total_timer.tick() as f64; - ACCEL_TRAV_TIME.with(|att| { - stats.accel_traversal_time = att.get(); - att.set(0.0); - }); + stats.ray_count = tracer.rays_traced(); ACCEL_NODE_RAY_TESTS.with(|anv| { stats.accel_node_visits = anv.get(); anv.set(0); @@ -431,7 +434,8 @@ impl LightPath { xform_stack: &mut TransformStack, scene: &Scene, isect: &surface::SurfaceIntersection, - ray: &mut Ray, + rays: &mut RayBatch, + ray_idx: usize, ) 
-> bool { match self.event { //-------------------------------------------------------------------- @@ -496,13 +500,13 @@ impl LightPath { // Distant light SceneLightSample::Distant { direction, .. } => { let (attenuation, closure_pdf) = closure.evaluate( - ray.dir, + rays.dir(ray_idx), direction, idata.nor, idata.nor_g, self.wavelength, ); - let mut shadow_ray = { + let shadow_ray = { // Calculate the shadow ray for testing if the light is // in shadow or not. let offset_pos = robust_ray_origin( @@ -511,15 +515,14 @@ impl LightPath { idata.nor_g.normalized(), direction, ); - Ray::new( - offset_pos, - direction, - self.time, - self.wavelength, - true, - ) + Ray { + orig: offset_pos, + dir: direction, + time: self.time, + wavelength: self.wavelength, + max_t: std::f32::INFINITY, + } }; - shadow_ray.max_t = std::f32::INFINITY; (attenuation, closure_pdf, shadow_ray) } @@ -527,7 +530,7 @@ impl LightPath { SceneLightSample::Surface { sample_geo, .. } => { let dir = sample_geo.0 - idata.pos; let (attenuation, closure_pdf) = closure.evaluate( - ray.dir, + rays.dir(ray_idx), dir, idata.nor, idata.nor_g, @@ -548,13 +551,13 @@ impl LightPath { sample_geo.1.normalized(), -dir, ); - Ray::new( - offset_pos, - offset_end - offset_pos, - self.time, - self.wavelength, - true, - ) + Ray { + orig: offset_pos, + dir: offset_end - offset_pos, + time: self.time, + wavelength: self.wavelength, + max_t: 1.0, + } }; (attenuation, closure_pdf, shadow_ray) } @@ -572,7 +575,7 @@ impl LightPath { light_info.color().e * attenuation.e * self.light_attenuation / (light_mis_pdf * light_sel_pdf); - *ray = shadow_ray; + rays.set_from_ray(&shadow_ray, true, ray_idx); true } @@ -609,8 +612,13 @@ impl LightPath { idata.nor_g.normalized(), dir, ); - self.next_bounce_ray = - Some(Ray::new(offset_pos, dir, self.time, self.wavelength, false)); + self.next_bounce_ray = Some(Ray { + orig: offset_pos, + dir: dir, + time: self.time, + wavelength: self.wavelength, + max_t: std::f32::INFINITY, + }); true } else 
{ @@ -626,7 +634,7 @@ impl LightPath { self.event = LightPathEvent::ShadowRay; return true; } else if do_bounce { - *ray = self.next_bounce_ray.unwrap(); + rays.set_from_ray(&self.next_bounce_ray.unwrap(), false, ray_idx); self.event = LightPathEvent::BounceRay; self.light_attenuation *= self.next_attenuation_fac; return true; @@ -657,7 +665,7 @@ impl LightPath { // Set up for the next bounce, if any if let Some(ref nbr) = self.next_bounce_ray { - *ray = *nbr; + rays.set_from_ray(nbr, false, ray_idx); self.light_attenuation *= self.next_attenuation_fac; self.event = LightPathEvent::BounceRay; return true; diff --git a/src/surface/micropoly_batch.rs b/src/surface/micropoly_batch.rs index 36d686f..8bb9447 100644 --- a/src/surface/micropoly_batch.rs +++ b/src/surface/micropoly_batch.rs @@ -8,7 +8,7 @@ use crate::{ boundable::Boundable, lerp::lerp_slice, math::{cross, dot, Matrix4x4, Normal, Point}, - ray::{AccelRay, Ray}, + ray::{RayBatch, RayStack, RayTask} shading::surface_closure::SurfaceClosure, }; @@ -99,8 +99,8 @@ impl<'a> MicropolyBatch<'a> { impl<'a> MicropolyBatch<'a> { fn intersect_rays( &self, - accel_rays: &mut [AccelRay], - wrays: &[Ray], + rays: &mut RayBatch, + ray_stack: &mut RayStack, isects: &mut [SurfaceIntersection], space: &[Matrix4x4], ) { @@ -112,7 +112,7 @@ impl<'a> MicropolyBatch<'a> { }; self.accel - .traverse(&mut accel_rays[..], self.indices, |tri_indices, rs| { + .traverse(rays, ray_stack, self.indices, |tri_indices, rs| { // For static triangles with static transforms, cache them. 
let is_cached = self.time_sample_count == 1 && space.len() <= 1; let mut tri = if is_cached { diff --git a/src/surface/mod.rs b/src/surface/mod.rs index 9c2b761..2f90223 100644 --- a/src/surface/mod.rs +++ b/src/surface/mod.rs @@ -1,6 +1,6 @@ #![allow(dead_code)] -pub mod micropoly_batch; +// pub mod micropoly_batch; pub mod triangle; pub mod triangle_mesh; @@ -9,7 +9,7 @@ use std::fmt::Debug; use crate::{ boundable::Boundable, math::{Matrix4x4, Normal, Point, Vector}, - ray::{AccelRay, Ray}, + ray::{RayBatch, RayStack}, shading::surface_closure::SurfaceClosure, shading::SurfaceShader, }; @@ -17,8 +17,8 @@ use crate::{ pub trait Surface: Boundable + Debug + Sync { fn intersect_rays( &self, - accel_rays: &mut [AccelRay], - wrays: &[Ray], + rays: &mut RayBatch, + ray_stack: &mut RayStack, isects: &mut [SurfaceIntersection], shader: &SurfaceShader, space: &[Matrix4x4], diff --git a/src/surface/triangle.rs b/src/surface/triangle.rs index c252e59..4aed3a3 100644 --- a/src/surface/triangle.rs +++ b/src/surface/triangle.rs @@ -1,6 +1,48 @@ #![allow(dead_code)] -use crate::{fp_utils::fp_gamma, math::Point, ray::Ray}; +use crate::{ + fp_utils::fp_gamma, + math::{Point, Vector}, +}; + +#[derive(Debug, Copy, Clone)] +pub struct RayTriPrecompute { + i: (usize, usize, usize), + s: (f32, f32, f32), +} + +impl RayTriPrecompute { + pub fn new(ray_dir: Vector) -> RayTriPrecompute { + // Calculate the permuted dimension indices for the new ray space. + let (xi, yi, zi) = { + let xabs = ray_dir.x().abs(); + let yabs = ray_dir.y().abs(); + let zabs = ray_dir.z().abs(); + + if xabs > yabs && xabs > zabs { + (1, 2, 0) + } else if yabs > zabs { + (2, 0, 1) + } else { + (0, 1, 2) + } + }; + + let dir_x = ray_dir.get_n(xi); + let dir_y = ray_dir.get_n(yi); + let dir_z = ray_dir.get_n(zi); + + // Calculate shear constants. 
+ let sx = dir_x / dir_z; + let sy = dir_y / dir_z; + let sz = 1.0 / dir_z; + + RayTriPrecompute { + i: (xi, yi, zi), + s: (sx, sy, sz), + } + } +} /// Intersects `ray` with `tri`, returning `Some((t, b0, b1, b2))`, or `None` /// if no intersection. @@ -13,42 +55,23 @@ use crate::{fp_utils::fp_gamma, math::Point, ray::Ray}; /// /// Uses the ray-triangle test from the paper "Watertight Ray/Triangle /// Intersection" by Woop et al. -pub fn intersect_ray(ray: &Ray, tri: (Point, Point, Point)) -> Option<(f32, f32, f32, f32)> { - // Calculate the permuted dimension indices for the new ray space. - let (xi, yi, zi) = { - let xabs = ray.dir.x().abs(); - let yabs = ray.dir.y().abs(); - let zabs = ray.dir.z().abs(); - - if xabs > yabs && xabs > zabs { - (1, 2, 0) - } else if yabs > zabs { - (2, 0, 1) - } else { - (0, 1, 2) - } - }; - - let dir_x = ray.dir.get_n(xi); - let dir_y = ray.dir.get_n(yi); - let dir_z = ray.dir.get_n(zi); - - // Calculate shear constants. - let sx = dir_x / dir_z; - let sy = dir_y / dir_z; - let sz = 1.0 / dir_z; - +pub fn intersect_ray( + ray_orig: Point, + ray_pre: RayTriPrecompute, + ray_max_t: f32, + tri: (Point, Point, Point), +) -> Option<(f32, f32, f32, f32)> { // Calculate vertices in ray space. 
- let p0 = tri.0 - ray.orig; - let p1 = tri.1 - ray.orig; - let p2 = tri.2 - ray.orig; + let p0 = tri.0 - ray_orig; + let p1 = tri.1 - ray_orig; + let p2 = tri.2 - ray_orig; - let p0x = p0.get_n(xi) - (sx * p0.get_n(zi)); - let p0y = p0.get_n(yi) - (sy * p0.get_n(zi)); - let p1x = p1.get_n(xi) - (sx * p1.get_n(zi)); - let p1y = p1.get_n(yi) - (sy * p1.get_n(zi)); - let p2x = p2.get_n(xi) - (sx * p2.get_n(zi)); - let p2y = p2.get_n(yi) - (sy * p2.get_n(zi)); + let p0x = p0.get_n(ray_pre.i.0) - (ray_pre.s.0 * p0.get_n(ray_pre.i.2)); + let p0y = p0.get_n(ray_pre.i.1) - (ray_pre.s.1 * p0.get_n(ray_pre.i.2)); + let p1x = p1.get_n(ray_pre.i.0) - (ray_pre.s.0 * p1.get_n(ray_pre.i.2)); + let p1y = p1.get_n(ray_pre.i.1) - (ray_pre.s.1 * p1.get_n(ray_pre.i.2)); + let p2x = p2.get_n(ray_pre.i.0) - (ray_pre.s.0 * p2.get_n(ray_pre.i.2)); + let p2y = p2.get_n(ray_pre.i.1) - (ray_pre.s.1 * p2.get_n(ray_pre.i.2)); // Calculate scaled barycentric coordinates. let mut e0 = (p1x * p2y) - (p1y * p2x); @@ -74,14 +97,14 @@ pub fn intersect_ray(ray: &Ray, tri: (Point, Point, Point)) -> Option<(f32, f32, } // Calculate t of hitpoint. - let p0z = sz * p0.get_n(zi); - let p1z = sz * p1.get_n(zi); - let p2z = sz * p2.get_n(zi); + let p0z = ray_pre.s.2 * p0.get_n(ray_pre.i.2); + let p1z = ray_pre.s.2 * p1.get_n(ray_pre.i.2); + let p2z = ray_pre.s.2 * p2.get_n(ray_pre.i.2); let t_scaled = (e0 * p0z) + (e1 * p1z) + (e2 * p2z); // Check if the hitpoint t is within ray min/max t. 
- if (det > 0.0 && (t_scaled <= 0.0 || t_scaled > (ray.max_t * det))) - || (det < 0.0 && (t_scaled >= 0.0 || t_scaled < (ray.max_t * det))) + if (det > 0.0 && (t_scaled <= 0.0 || t_scaled > (ray_max_t * det))) + || (det < 0.0 && (t_scaled >= 0.0 || t_scaled < (ray_max_t * det))) { return None; } diff --git a/src/surface/triangle_mesh.rs b/src/surface/triangle_mesh.rs index a067416..43388a8 100644 --- a/src/surface/triangle_mesh.rs +++ b/src/surface/triangle_mesh.rs @@ -8,12 +8,14 @@ use crate::{ boundable::Boundable, lerp::lerp_slice, math::{cross, dot, Matrix4x4, Normal, Point}, - ray::{AccelRay, Ray}, + ray::{RayBatch, RayStack}, shading::SurfaceShader, }; use super::{triangle, Surface, SurfaceIntersection, SurfaceIntersectionData}; +const MAX_LEAF_TRIANGLE_COUNT: usize = 3; + #[derive(Copy, Clone, Debug)] pub struct TriangleMesh<'a> { time_sample_count: usize, @@ -93,7 +95,7 @@ impl<'a> TriangleMesh<'a> { }; // Build BVH - let accel = BVH4::from_objects(arena, &mut indices[..], 3, |tri| { + let accel = BVH4::from_objects(arena, &mut indices[..], MAX_LEAF_TRIANGLE_COUNT, |tri| { &bounds [(tri.3 as usize * time_sample_count)..((tri.3 as usize + 1) * time_sample_count)] }); @@ -117,8 +119,8 @@ impl<'a> Boundable for TriangleMesh<'a> { impl<'a> Surface for TriangleMesh<'a> { fn intersect_rays( &self, - accel_rays: &mut [AccelRay], - wrays: &[Ray], + rays: &mut RayBatch, + ray_stack: &mut RayStack, isects: &mut [SurfaceIntersection], shader: &SurfaceShader, space: &[Matrix4x4], @@ -131,144 +133,177 @@ impl<'a> Surface for TriangleMesh<'a> { }; self.accel - .traverse(&mut accel_rays[..], self.indices, |tri_indices, rs| { - // For static triangles with static transforms, cache them. 
- let is_cached = self.time_sample_count == 1 && space.len() <= 1; - let mut tri = if is_cached { - let tri = ( - self.vertices[tri_indices.0 as usize], - self.vertices[tri_indices.1 as usize], - self.vertices[tri_indices.2 as usize], - ); - if space.is_empty() { - tri - } else { - ( - tri.0 * static_mat_space, - tri.1 * static_mat_space, - tri.2 * static_mat_space, - ) - } - } else { - unsafe { std::mem::uninitialized() } - }; + .traverse(rays, ray_stack, |idx_range, rays, ray_stack| { + let tri_count = idx_range.end - idx_range.start; - // Test each ray against the current triangle. - for r in rs { - let wr = &wrays[r.id as usize]; + // Build the triangle cache if we can! + let is_cached = ray_stack.ray_count_in_next_task() >= tri_count + && self.time_sample_count == 1 + && space.len() <= 1; + let mut tri_cache = [unsafe { std::mem::uninitialized() }; MAX_LEAF_TRIANGLE_COUNT]; + if is_cached { + for tri_idx in idx_range.clone() { + let i = tri_idx - idx_range.start; + let tri_indices = self.indices[tri_idx]; - // Get triangle if necessary - if !is_cached { - tri = if self.time_sample_count == 1 { - // No deformation motion blur, so fast-path it. - ( - self.vertices[tri_indices.0 as usize], - self.vertices[tri_indices.1 as usize], - self.vertices[tri_indices.2 as usize], - ) - } else { - // Deformation motion blur, need to interpolate. 
- let p0_slice = &self.vertices[(tri_indices.0 as usize - * self.time_sample_count) - ..((tri_indices.0 as usize + 1) * self.time_sample_count)]; - let p1_slice = &self.vertices[(tri_indices.1 as usize - * self.time_sample_count) - ..((tri_indices.1 as usize + 1) * self.time_sample_count)]; - let p2_slice = &self.vertices[(tri_indices.2 as usize - * self.time_sample_count) - ..((tri_indices.2 as usize + 1) * self.time_sample_count)]; - - let p0 = lerp_slice(p0_slice, wr.time); - let p1 = lerp_slice(p1_slice, wr.time); - let p2 = lerp_slice(p2_slice, wr.time); - - (p0, p1, p2) - }; - } - - // Transform triangle if necessary, and get transform space. - let mat_space = if !space.is_empty() { - if space.len() > 1 { - // Per-ray transform, for motion blur - let mat_space = lerp_slice(space, wr.time).inverse(); - tri = (tri.0 * mat_space, tri.1 * mat_space, tri.2 * mat_space); - mat_space - } else { - // Same transform for all rays - if !is_cached { - tri = ( - tri.0 * static_mat_space, - tri.1 * static_mat_space, - tri.2 * static_mat_space, - ); - } - static_mat_space - } - } else { - // No transforms - Matrix4x4::new() - }; - - // Test ray against triangle - if let Some((t, b0, b1, b2)) = triangle::intersect_ray(wr, tri) { - if t < r.max_t { - if r.is_occlusion() { - isects[r.id as usize] = SurfaceIntersection::Occlude; - r.mark_done(); - } else { - // Calculate intersection point and error magnitudes - let (pos, pos_err) = triangle::surface_point(tri, (b0, b1, b2)); - - // Calculate geometric surface normal - let geo_normal = cross(tri.0 - tri.1, tri.0 - tri.2).into_normal(); - - // Calculate interpolated surface normal, if any - let shading_normal = if let Some(normals) = self.normals { - let n0_slice = &normals[(tri_indices.0 as usize - * self.time_sample_count) - ..((tri_indices.0 as usize + 1) * self.time_sample_count)]; - let n1_slice = &normals[(tri_indices.1 as usize - * self.time_sample_count) - ..((tri_indices.1 as usize + 1) * self.time_sample_count)]; - let 
n2_slice = &normals[(tri_indices.2 as usize - * self.time_sample_count) - ..((tri_indices.2 as usize + 1) * self.time_sample_count)]; - - let n0 = lerp_slice(n0_slice, wr.time).normalized(); - let n1 = lerp_slice(n1_slice, wr.time).normalized(); - let n2 = lerp_slice(n2_slice, wr.time).normalized(); - - let s_nor = ((n0 * b0) + (n1 * b1) + (n2 * b2)) * mat_space; - if dot(s_nor, geo_normal) >= 0.0 { - s_nor - } else { - -s_nor - } - } else { - geo_normal - }; - - let intersection_data = SurfaceIntersectionData { - incoming: wr.dir, - t: t, - pos: pos, - pos_err: pos_err, - nor: shading_normal, - nor_g: geo_normal, - local_space: mat_space, - sample_pdf: 0.0, - }; - - // Fill in intersection data - isects[r.id as usize] = SurfaceIntersection::Hit { - intersection_data: intersection_data, - closure: shader.shade(&intersection_data, wr.time), - }; - r.max_t = t; - } + // For static triangles with static transforms, cache them. + tri_cache[i] = ( + self.vertices[tri_indices.0 as usize], + self.vertices[tri_indices.1 as usize], + self.vertices[tri_indices.2 as usize], + ); + if !space.is_empty() { + tri_cache[i].0 = tri_cache[i].0 * static_mat_space; + tri_cache[i].1 = tri_cache[i].1 * static_mat_space; + tri_cache[i].2 = tri_cache[i].2 * static_mat_space; } } } + + // Test each ray against the triangles. + ray_stack.do_next_task(|ray_idx| { + let ray_idx = ray_idx as usize; + + if rays.is_done(ray_idx) { + return; + } + + let ray_time = rays.time(ray_idx); + + // Calculate the ray space, if necessary. + let mat_space = if space.len() > 1 { + // Per-ray transform, for motion blur + lerp_slice(space, ray_time).inverse() + } else { + static_mat_space + }; + + // Iterate through the triangles and test the ray against them. 
+ let mut non_shadow_hit = false; + let mut hit_tri = unsafe { std::mem::uninitialized() }; + let mut hit_tri_indices = unsafe { std::mem::uninitialized() }; + let mut hit_tri_data = unsafe { std::mem::uninitialized() }; + let ray_pre = triangle::RayTriPrecompute::new(rays.dir(ray_idx)); + for tri_idx in idx_range.clone() { + let tri_indices = self.indices[tri_idx]; + + // Get triangle if necessary + let tri = if is_cached { + let i = tri_idx - idx_range.start; + tri_cache[i] + } else { + let mut tri = if self.time_sample_count == 1 { + // No deformation motion blur, so fast-path it. + ( + self.vertices[tri_indices.0 as usize], + self.vertices[tri_indices.1 as usize], + self.vertices[tri_indices.2 as usize], + ) + } else { + // Deformation motion blur, need to interpolate. + let p0_slice = &self.vertices[(tri_indices.0 as usize + * self.time_sample_count) + ..((tri_indices.0 as usize + 1) * self.time_sample_count)]; + let p1_slice = &self.vertices[(tri_indices.1 as usize + * self.time_sample_count) + ..((tri_indices.1 as usize + 1) * self.time_sample_count)]; + let p2_slice = &self.vertices[(tri_indices.2 as usize + * self.time_sample_count) + ..((tri_indices.2 as usize + 1) * self.time_sample_count)]; + + let p0 = lerp_slice(p0_slice, ray_time); + let p1 = lerp_slice(p1_slice, ray_time); + let p2 = lerp_slice(p2_slice, ray_time); + + (p0, p1, p2) + }; + + if !space.is_empty() { + tri.0 = tri.0 * mat_space; + tri.1 = tri.1 * mat_space; + tri.2 = tri.2 * mat_space; + } + + tri + }; + + // Test ray against triangle + if let Some((t, b0, b1, b2)) = triangle::intersect_ray( + rays.orig(ray_idx), + ray_pre, + rays.max_t(ray_idx), + tri, + ) { + if rays.is_occlusion(ray_idx) { + isects[ray_idx] = SurfaceIntersection::Occlude; + rays.mark_done(ray_idx); + break; + } else { + non_shadow_hit = true; + rays.set_max_t(ray_idx, t); + hit_tri = tri; + hit_tri_indices = tri_indices; + hit_tri_data = (t, b0, b1, b2); + } + } + } + + // Calculate intersection data if necessary. 
+ if non_shadow_hit { + let (t, b0, b1, b2) = hit_tri_data; + + // Calculate intersection point and error magnitudes + let (pos, pos_err) = triangle::surface_point(hit_tri, (b0, b1, b2)); + + // Calculate geometric surface normal + let geo_normal = + cross(hit_tri.0 - hit_tri.1, hit_tri.0 - hit_tri.2).into_normal(); + + // Calculate interpolated surface normal, if any + let shading_normal = if let Some(normals) = self.normals { + let n0_slice = &normals[(hit_tri_indices.0 as usize + * self.time_sample_count) + ..((hit_tri_indices.0 as usize + 1) * self.time_sample_count)]; + let n1_slice = &normals[(hit_tri_indices.1 as usize + * self.time_sample_count) + ..((hit_tri_indices.1 as usize + 1) * self.time_sample_count)]; + let n2_slice = &normals[(hit_tri_indices.2 as usize + * self.time_sample_count) + ..((hit_tri_indices.2 as usize + 1) * self.time_sample_count)]; + + let n0 = lerp_slice(n0_slice, ray_time).normalized(); + let n1 = lerp_slice(n1_slice, ray_time).normalized(); + let n2 = lerp_slice(n2_slice, ray_time).normalized(); + + let s_nor = ((n0 * b0) + (n1 * b1) + (n2 * b2)) * mat_space; + if dot(s_nor, geo_normal) >= 0.0 { + s_nor + } else { + -s_nor + } + } else { + geo_normal + }; + + let intersection_data = SurfaceIntersectionData { + incoming: rays.dir(ray_idx), + t: t, + pos: pos, + pos_err: pos_err, + nor: shading_normal, + nor_g: geo_normal, + local_space: mat_space, + sample_pdf: 0.0, + }; + + // Fill in intersection data + isects[ray_idx] = SurfaceIntersection::Hit { + intersection_data: intersection_data, + closure: shader.shade(&intersection_data, ray_time), + }; + } + }); + ray_stack.pop_task(); }); } } diff --git a/src/tracer.rs b/src/tracer.rs index 4105dfc..d3b5b09 100644 --- a/src/tracer.rs +++ b/src/tracer.rs @@ -1,10 +1,11 @@ use std::iter; use crate::{ - algorithm::partition, + accel::ray_code, color::{rec709_to_xyz, Color}, lerp::lerp_slice, - ray::{AccelRay, Ray}, + math::Matrix4x4, + ray::{RayBatch, RayStack}, scene::{Assembly, 
InstanceType, Object}, shading::{SimpleSurfaceShader, SurfaceShader}, surface::SurfaceIntersection, @@ -12,14 +13,16 @@ use crate::{ }; pub struct Tracer<'a> { - rays: Vec, + ray_trace_count: u64, + ray_stack: RayStack, inner: TracerInner<'a>, } impl<'a> Tracer<'a> { pub fn from_assembly(assembly: &'a Assembly) -> Tracer<'a> { Tracer { - rays: Vec::new(), + ray_trace_count: 0, + ray_stack: RayStack::new(), inner: TracerInner { root: assembly, xform_stack: TransformStack::new(), @@ -28,17 +31,13 @@ impl<'a> Tracer<'a> { } } - pub fn trace<'b>(&'b mut self, wrays: &[Ray]) -> &'b [SurfaceIntersection] { - self.rays.clear(); - self.rays.reserve(wrays.len()); - let mut ids = 0..(wrays.len() as u32); - self.rays.extend( - wrays - .iter() - .map(|wr| AccelRay::new(wr, ids.next().unwrap())), - ); + pub fn trace<'b>(&'b mut self, rays: &mut RayBatch) -> &'b [SurfaceIntersection] { + self.ray_trace_count += rays.len() as u64; + self.inner.trace(rays, &mut self.ray_stack) + } - self.inner.trace(wrays, &mut self.rays[..]) + pub fn rays_traced(&self) -> u64 { + self.ray_trace_count } } @@ -49,16 +48,37 @@ struct TracerInner<'a> { } impl<'a> TracerInner<'a> { - fn trace<'b>(&'b mut self, wrays: &[Ray], rays: &mut [AccelRay]) -> &'b [SurfaceIntersection] { + fn trace<'b>( + &'b mut self, + rays: &mut RayBatch, + ray_stack: &mut RayStack, + ) -> &'b [SurfaceIntersection] { + ray_stack.clear(); + // Ready the isects self.isects.clear(); - self.isects.reserve(wrays.len()); + self.isects.reserve(rays.len()); self.isects - .extend(iter::repeat(SurfaceIntersection::Miss).take(wrays.len())); + .extend(iter::repeat(SurfaceIntersection::Miss).take(rays.len())); - let mut ray_sets = split_rays_by_direction(&mut rays[..]); - for ray_set in ray_sets.iter_mut().filter(|ray_set| !ray_set.is_empty()) { - self.trace_assembly(self.root, wrays, ray_set); + // Prep the accel part of the rays. 
+ { + let ident = Matrix4x4::new(); + for i in 0..rays.len() { + rays.update_local(i, &ident); + } + } + + // Divide the rays into 8 different lanes by direction. + ray_stack.ensure_lane_count(8); + for i in 0..rays.len() { + ray_stack.push_ray_index(i, ray_code(rays.dir(i))); + } + ray_stack.push_lanes_to_tasks(&[0, 1, 2, 3, 4, 5, 6, 7]); + + // Trace each of the 8 lanes separately. + while !ray_stack.is_empty() { + self.trace_assembly(self.root, rays, ray_stack); } &self.isects @@ -67,82 +87,43 @@ impl<'a> TracerInner<'a> { fn trace_assembly<'b>( &'b mut self, assembly: &Assembly, - wrays: &[Ray], - accel_rays: &mut [AccelRay], + rays: &mut RayBatch, + ray_stack: &mut RayStack, ) { assembly .object_accel - .traverse(&mut accel_rays[..], &assembly.instances[..], |inst, rs| { + .traverse(rays, ray_stack, |idx_range, rays, ray_stack| { + let inst = &assembly.instances[idx_range.start]; + // Transform rays if needed if let Some((xstart, xend)) = inst.transform_indices { // Push transforms to stack self.xform_stack.push(&assembly.xforms[xstart..xend]); // Do transforms + // TODO: re-divide rays based on direction (maybe?). let xforms = self.xform_stack.top(); - for ray in &mut rs[..] { - let id = ray.id; - let t = ray.time; - ray.update_from_xformed_world_ray( - &wrays[id as usize], - &lerp_slice(xforms, t), - ); - } + ray_stack.do_next_task(|ray_idx| { + let t = rays.time(ray_idx); + rays.update_local(ray_idx, &lerp_slice(xforms, t)); + }); + ray_stack.duplicate_next_task(); } // Trace rays - { - // This is kind of weird looking, but what we're doing here is - // splitting the rays up based on direction if they were - // transformed, and not splitting them up if they weren't - // transformed. - // But to keep the actual tracing code in one place (DRY), - // we map both cases to an array slice that contains slices of - // ray arrays. Gah... that's confusing even when explained. - // TODO: do this in a way that's less confusing. 
Probably split - // the tracing code out into a trace_instance() method or - // something. - let mut tmp = if inst.transform_indices.is_some() { - split_rays_by_direction(rs) - } else { - [ - &mut rs[..], - &mut [], - &mut [], - &mut [], - &mut [], - &mut [], - &mut [], - &mut [], - ] - }; - let ray_sets = if inst.transform_indices.is_some() { - &mut tmp[..] - } else { - &mut tmp[..1] - }; + match inst.instance_type { + InstanceType::Object => { + self.trace_object( + &assembly.objects[inst.data_index], + inst.surface_shader_index + .map(|i| assembly.surface_shaders[i]), + rays, + ray_stack, + ); + } - // Loop through the split ray slices and trace them - for ray_set in ray_sets.iter_mut().filter(|ray_set| !ray_set.is_empty()) { - match inst.instance_type { - InstanceType::Object => { - self.trace_object( - &assembly.objects[inst.data_index], - inst.surface_shader_index - .map(|i| assembly.surface_shaders[i]), - wrays, - ray_set, - ); - } - - InstanceType::Assembly => { - self.trace_assembly( - &assembly.assemblies[inst.data_index], - wrays, - ray_set, - ); - } - } + InstanceType::Assembly => { + self.trace_assembly(&assembly.assemblies[inst.data_index], rays, ray_stack); } } @@ -154,19 +135,15 @@ impl<'a> TracerInner<'a> { // Undo transforms let xforms = self.xform_stack.top(); if !xforms.is_empty() { - for ray in &mut rs[..] { - let id = ray.id; - let t = ray.time; - ray.update_from_xformed_world_ray( - &wrays[id as usize], - &lerp_slice(xforms, t), - ); - } + ray_stack.pop_do_next_task(|ray_idx| { + let t = rays.time(ray_idx); + rays.update_local(ray_idx, &lerp_slice(xforms, t)); + }); } else { - for ray in &mut rs[..] 
{ - let id = ray.id; - ray.update_from_world_ray(&wrays[id as usize]); - } + let ident = Matrix4x4::new(); + ray_stack.pop_do_next_task(|ray_idx| { + rays.update_local(ray_idx, &ident); + }); } } }); @@ -176,8 +153,8 @@ impl<'a> TracerInner<'a> { &'b mut self, obj: &Object, surface_shader: Option<&SurfaceShader>, - wrays: &[Ray], - rays: &mut [AccelRay], + rays: &mut RayBatch, + ray_stack: &mut RayStack, ) { match *obj { Object::Surface(surface) => { @@ -188,7 +165,7 @@ impl<'a> TracerInner<'a> { surface.intersect_rays( rays, - wrays, + ray_stack, &mut self.isects, shader, self.xform_stack.top(), @@ -203,7 +180,7 @@ impl<'a> TracerInner<'a> { surface.intersect_rays( rays, - wrays, + ray_stack, &mut self.isects, &bogus_shader, self.xform_stack.top(), @@ -212,27 +189,3 @@ impl<'a> TracerInner<'a> { } } } - -fn split_rays_by_direction(rays: &mut [AccelRay]) -> [&mut [AccelRay]; 8] { - // | | | | | | | | | - // s1 s2 s3 s4 s5 s6 s7 - let s4 = partition(&mut rays[..], |r| r.dir_inv.x() >= 0.0); - - let s2 = partition(&mut rays[..s4], |r| r.dir_inv.y() >= 0.0); - let s6 = s4 + partition(&mut rays[s4..], |r| r.dir_inv.y() >= 0.0); - - let s1 = partition(&mut rays[..s2], |r| r.dir_inv.z() >= 0.0); - let s3 = s2 + partition(&mut rays[s2..s4], |r| r.dir_inv.z() >= 0.0); - let s5 = s4 + partition(&mut rays[s4..s6], |r| r.dir_inv.z() >= 0.0); - let s7 = s6 + partition(&mut rays[s6..], |r| r.dir_inv.z() >= 0.0); - - let (rest, rs7) = rays.split_at_mut(s7); - let (rest, rs6) = rest.split_at_mut(s6); - let (rest, rs5) = rest.split_at_mut(s5); - let (rest, rs4) = rest.split_at_mut(s4); - let (rest, rs3) = rest.split_at_mut(s3); - let (rest, rs2) = rest.split_at_mut(s2); - let (rs0, rs1) = rest.split_at_mut(s1); - - [rs0, rs1, rs2, rs3, rs4, rs5, rs6, rs7] -} diff --git a/sub_crates/float4/src/lib.rs b/sub_crates/float4/src/lib.rs index 4006301..0f081b3 100644 --- a/sub_crates/float4/src/lib.rs +++ b/sub_crates/float4/src/lib.rs @@ -620,6 +620,29 @@ mod x86_64_sse { } impl Bool4 { 
+ #[inline(always)] + pub fn new(a: bool, b: bool, c: bool, d: bool) -> Bool4 { + use std::arch::x86_64::_mm_set_ps; + Bool4 { + data: unsafe { + _mm_set_ps( + if d { 1.0 } else { 0.0 }, + if c { 1.0 } else { 0.0 }, + if b { 1.0 } else { 0.0 }, + if a { 1.0 } else { 0.0 }, + ) + }, + } + } + + #[inline(always)] + pub fn new_false() -> Bool4 { + use std::arch::x86_64::_mm_set1_ps; + Bool4 { + data: unsafe { _mm_set1_ps(0.0) }, + } + } + /// Returns the value of the nth element. #[inline(always)] pub fn get_n(&self, n: usize) -> bool { @@ -637,24 +660,34 @@ mod x86_64_sse { self.get_n(0) } - /// Returns the value of the 1th element. + /// Returns the value of the 1st element. #[inline(always)] pub fn get_1(&self) -> bool { self.get_n(1) } - /// Returns the value of the 2th element. + /// Returns the value of the 2nd element. #[inline(always)] pub fn get_2(&self) -> bool { self.get_n(2) } - /// Returns the value of the 3th element. + /// Returns the value of the 3rd element. #[inline(always)] pub fn get_3(&self) -> bool { self.get_n(3) } + /// Returns whether all four bools are false. + /// + /// This is the `NOT` operation on the result of `OR`ing all the + /// contained bools. If even one bool is true, this returns false. + #[inline(always)] + pub fn is_all_false(&self) -> bool { + let a = unsafe { *(&self.data as *const __m128 as *const u128) }; + a == 0 + } + #[inline] pub fn to_bitmask(&self) -> u8 { let a = unsafe { *(&self.data as *const __m128 as *const u8).offset(0) }; @@ -1236,21 +1269,25 @@ mod fallback { det } - /// Essentially a tuple of four bools, which will use SIMD operations - /// where possible on a platform. - #[cfg(feature = "simd_perf")] - #[derive(Debug, Copy, Clone)] - pub struct Bool4 { - data: bool32fx4, - } - - #[cfg(not(feature = "simd_perf"))] + /// Essentially a tuple of four bools. 
#[derive(Debug, Copy, Clone)] pub struct Bool4 { data: [bool; 4], } impl Bool4 { + #[inline(always)] + pub fn new(a: bool, b: bool, c: bool, d: bool) -> Bool4 { + Bool4 { data: [a, b, c, d] } + } + + #[inline(always)] + pub fn new_false() -> Bool4 { + Bool4 { + data: [false, false, false, false], + } + } + /// Returns the value of the nth element. #[inline(always)] pub fn get_n(self, n: usize) -> bool { @@ -1285,6 +1322,15 @@ mod fallback { self.get_n(3) } + /// Returns whether all four bools are false. + /// + /// This is the `NOT` operation on the result of `OR`ing all the + /// contained bools. If even one bool is true, this returns false. + #[inline(always)] + pub fn is_all_false(&self) -> bool { + !(self.data[0] | self.data[1] | self.data[2] | self.data[3]) + } + #[inline] pub fn to_bitmask(self) -> u8 { (self.get_0() as u8) @@ -1565,4 +1611,10 @@ mod tests { assert_eq!(r, 0b00001010); } + + #[test] + fn bool4_is_all_false() { + assert_eq!(true, Bool4::new(false, false, false, false).is_all_false()); + assert_eq!(false, Bool4::new(false, false, true, false).is_all_false()); + } }