Implemented a SIMD version of the BVH4.
It does indeed appear to be faster with this style of traversal!
This commit is contained in:
parent
50f9fd851b
commit
aed0f2ede1
387
src/accel/bvh4_simd.rs
Normal file
387
src/accel/bvh4_simd.rs
Normal file
|
@ -0,0 +1,387 @@
|
|||
//! This BVH4 implementation pulls a lot of ideas from the paper
|
||||
//! "Efficient Ray Tracing Kernels for Modern CPU Architectures"
|
||||
//! by Fuetterling et al.
|
||||
//!
|
||||
//! Specifically, the table-based traversal order approach they
|
||||
//! propose is largely followed by this implementation.
|
||||
|
||||
#![allow(dead_code)]
|
||||
|
||||
use mem_arena::MemArena;
|
||||
|
||||
use crate::{
|
||||
bbox::BBox,
|
||||
bbox4::BBox4,
|
||||
boundable::Boundable,
|
||||
lerp::lerp_slice,
|
||||
math::Vector,
|
||||
ray::{RayBatch, RayStack},
|
||||
timer::Timer,
|
||||
};
|
||||
|
||||
use super::{
|
||||
bvh_base::{BVHBase, BVHBaseNode, BVH_MAX_DEPTH},
|
||||
ACCEL_NODE_RAY_TESTS, ACCEL_TRAV_TIME,
|
||||
};
|
||||
|
||||
use bvh_order::{calc_traversal_code, SplitAxes, TRAVERSAL_TABLE};
|
||||
|
||||
pub fn ray_code(dir: Vector) -> usize {
|
||||
let ray_sign_is_neg = [dir.x() < 0.0, dir.y() < 0.0, dir.z() < 0.0];
|
||||
ray_sign_is_neg[0] as usize
|
||||
+ ((ray_sign_is_neg[1] as usize) << 1)
|
||||
+ ((ray_sign_is_neg[2] as usize) << 2)
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone, Debug)]
|
||||
pub struct BVH4<'a> {
|
||||
root: Option<&'a BVH4Node<'a>>,
|
||||
depth: usize,
|
||||
node_count: usize,
|
||||
_bounds: Option<&'a [BBox]>,
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone, Debug)]
|
||||
pub enum BVH4Node<'a> {
|
||||
Internal {
|
||||
bounds: &'a [BBox4],
|
||||
children: &'a [BVH4Node<'a>],
|
||||
traversal_code: u8,
|
||||
},
|
||||
|
||||
Leaf {
|
||||
object_range: (usize, usize),
|
||||
},
|
||||
}
|
||||
|
||||
impl<'a> BVH4<'a> {
|
||||
pub fn from_objects<'b, T, F>(
|
||||
arena: &'a MemArena,
|
||||
objects: &mut [T],
|
||||
objects_per_leaf: usize,
|
||||
bounder: F,
|
||||
) -> BVH4<'a>
|
||||
where
|
||||
F: 'b + Fn(&T) -> &'b [BBox],
|
||||
{
|
||||
if objects.len() == 0 {
|
||||
BVH4 {
|
||||
root: None,
|
||||
depth: 0,
|
||||
node_count: 0,
|
||||
_bounds: None,
|
||||
}
|
||||
} else {
|
||||
let base = BVHBase::from_objects(objects, objects_per_leaf, bounder);
|
||||
|
||||
let fill_node = unsafe { arena.alloc_uninitialized_with_alignment::<BVH4Node>(32) };
|
||||
let node_count = BVH4::construct_from_base(
|
||||
arena,
|
||||
&base,
|
||||
&base.nodes[base.root_node_index()],
|
||||
fill_node,
|
||||
);
|
||||
|
||||
BVH4 {
|
||||
root: Some(fill_node),
|
||||
depth: (base.depth / 2) + 1,
|
||||
node_count: node_count,
|
||||
_bounds: {
|
||||
let range = base.nodes[base.root_node_index()].bounds_range();
|
||||
Some(arena.copy_slice(&base.bounds[range.0..range.1]))
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn tree_depth(&self) -> usize {
|
||||
self.depth
|
||||
}
|
||||
|
||||
pub fn traverse<T, F>(
|
||||
&self,
|
||||
rays: &mut RayBatch,
|
||||
ray_stack: &mut RayStack,
|
||||
objects: &[T],
|
||||
mut obj_ray_test: F,
|
||||
) where
|
||||
F: FnMut(&T, &mut RayBatch, &mut RayStack),
|
||||
{
|
||||
if self.root.is_none() {
|
||||
return;
|
||||
}
|
||||
|
||||
let mut trav_time: f64 = 0.0;
|
||||
let mut timer = Timer::new();
|
||||
|
||||
let traversal_table =
|
||||
&TRAVERSAL_TABLE[ray_code(rays.dir_inv_local(ray_stack.next_task_ray_idx(0)))];
|
||||
|
||||
// +2 of max depth for root and last child
|
||||
let mut node_stack = [self.root.unwrap(); (BVH_MAX_DEPTH * 3) + 2];
|
||||
let mut stack_ptr = 1;
|
||||
|
||||
while stack_ptr > 0 {
|
||||
match node_stack[stack_ptr] {
|
||||
&BVH4Node::Internal {
|
||||
bounds,
|
||||
children,
|
||||
traversal_code,
|
||||
} => {
|
||||
let mut all_hits = 0;
|
||||
|
||||
// Ray testing
|
||||
ray_stack.pop_do_next_task(children.len(), |ray_idx| {
|
||||
if rays.is_done(ray_idx) {
|
||||
([0, 1, 2, 3, 4, 5, 6, 7], 0)
|
||||
} else {
|
||||
let hits = lerp_slice(bounds, rays.time(ray_idx))
|
||||
.intersect_ray(
|
||||
rays.orig_local(ray_idx),
|
||||
rays.dir_inv_local(ray_idx),
|
||||
rays.max_t(ray_idx),
|
||||
)
|
||||
.to_bitmask();
|
||||
|
||||
if hits != 0 {
|
||||
all_hits |= hits;
|
||||
let mut lanes = [0u8; 8];
|
||||
let mut lane_count = 0;
|
||||
for i in 0..children.len() {
|
||||
if (hits >> i) & 1 != 0 {
|
||||
lanes[lane_count] = i as u8;
|
||||
lane_count += 1;
|
||||
}
|
||||
}
|
||||
(lanes, lane_count)
|
||||
} else {
|
||||
([0, 1, 2, 3, 4, 5, 6, 7], 0)
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
// If there were any intersections, create tasks.
|
||||
if all_hits > 0 {
|
||||
let order_code = traversal_table[traversal_code as usize];
|
||||
let mut lanes = [0usize; 4];
|
||||
let mut lane_count = 0;
|
||||
for i in 0..children.len() {
|
||||
let inv_i = (children.len() - 1) - i;
|
||||
let child_i = ((order_code >> (inv_i * 2)) & 3) as usize;
|
||||
if ((all_hits >> child_i) & 1) != 0 {
|
||||
node_stack[stack_ptr + lane_count] = &children[child_i];
|
||||
lanes[lane_count] = child_i;
|
||||
lane_count += 1;
|
||||
}
|
||||
}
|
||||
|
||||
ray_stack.push_lanes_to_tasks(&lanes[..lane_count]);
|
||||
stack_ptr += lane_count - 1;
|
||||
} else {
|
||||
stack_ptr -= 1;
|
||||
}
|
||||
}
|
||||
|
||||
&BVH4Node::Leaf { object_range } => {
|
||||
trav_time += timer.tick() as f64;
|
||||
|
||||
// Set up the tasks for each object.
|
||||
let obj_count = object_range.1 - object_range.0;
|
||||
for _ in 0..(obj_count - 1) {
|
||||
ray_stack.duplicate_next_task();
|
||||
}
|
||||
|
||||
// Do the ray tests.
|
||||
for obj in &objects[object_range.0..object_range.1] {
|
||||
obj_ray_test(obj, rays, ray_stack);
|
||||
}
|
||||
|
||||
timer.tick();
|
||||
|
||||
stack_ptr -= 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
trav_time += timer.tick() as f64;
|
||||
ACCEL_TRAV_TIME.with(|att| {
|
||||
let v = att.get();
|
||||
att.set(v + trav_time);
|
||||
});
|
||||
}
|
||||
|
||||
fn construct_from_base(
|
||||
arena: &'a MemArena,
|
||||
base: &BVHBase,
|
||||
node: &BVHBaseNode,
|
||||
fill_node: &mut BVH4Node<'a>,
|
||||
) -> usize {
|
||||
let mut node_count = 0;
|
||||
|
||||
match node {
|
||||
// Create internal node
|
||||
&BVHBaseNode::Internal {
|
||||
bounds_range: _,
|
||||
children_indices,
|
||||
split_axis,
|
||||
} => {
|
||||
let child_l = &base.nodes[children_indices.0];
|
||||
let child_r = &base.nodes[children_indices.1];
|
||||
|
||||
// Prepare convenient access to the stuff we need.
|
||||
let child_count: usize;
|
||||
let children; // [Optional, Optional, Optional, Optional]
|
||||
let split_info: SplitAxes;
|
||||
match *child_l {
|
||||
BVHBaseNode::Internal {
|
||||
children_indices: i_l,
|
||||
split_axis: s_l,
|
||||
..
|
||||
} => {
|
||||
match *child_r {
|
||||
BVHBaseNode::Internal {
|
||||
children_indices: i_r,
|
||||
split_axis: s_r,
|
||||
..
|
||||
} => {
|
||||
// Four nodes
|
||||
child_count = 4;
|
||||
children = [
|
||||
Some(&base.nodes[i_l.0]),
|
||||
Some(&base.nodes[i_l.1]),
|
||||
Some(&base.nodes[i_r.0]),
|
||||
Some(&base.nodes[i_r.1]),
|
||||
];
|
||||
split_info = SplitAxes::Full((split_axis, s_l, s_r));
|
||||
}
|
||||
BVHBaseNode::Leaf { .. } => {
|
||||
// Three nodes with left split
|
||||
child_count = 3;
|
||||
children = [
|
||||
Some(&base.nodes[i_l.0]),
|
||||
Some(&base.nodes[i_l.1]),
|
||||
Some(child_r),
|
||||
None,
|
||||
];
|
||||
split_info = SplitAxes::Left((split_axis, s_l));
|
||||
}
|
||||
}
|
||||
}
|
||||
BVHBaseNode::Leaf { .. } => {
|
||||
match *child_r {
|
||||
BVHBaseNode::Internal {
|
||||
children_indices: i_r,
|
||||
split_axis: s_r,
|
||||
..
|
||||
} => {
|
||||
// Three nodes with right split
|
||||
child_count = 3;
|
||||
children = [
|
||||
Some(child_l),
|
||||
Some(&base.nodes[i_r.0]),
|
||||
Some(&base.nodes[i_r.1]),
|
||||
None,
|
||||
];
|
||||
split_info = SplitAxes::Right((split_axis, s_r));
|
||||
}
|
||||
BVHBaseNode::Leaf { .. } => {
|
||||
// Two nodes
|
||||
child_count = 2;
|
||||
children = [Some(child_l), Some(child_r), None, None];
|
||||
split_info = SplitAxes::TopOnly(split_axis);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
node_count += child_count;
|
||||
|
||||
// Construct bounds
|
||||
let bounds = {
|
||||
let bounds_len = children
|
||||
.iter()
|
||||
.map(|c| {
|
||||
if let &Some(n) = c {
|
||||
let len = n.bounds_range().1 - n.bounds_range().0;
|
||||
debug_assert!(len >= 1);
|
||||
len
|
||||
} else {
|
||||
0
|
||||
}
|
||||
})
|
||||
.max()
|
||||
.unwrap();
|
||||
debug_assert!(bounds_len >= 1);
|
||||
let bounds =
|
||||
unsafe { arena.alloc_array_uninitialized_with_alignment(bounds_len, 32) };
|
||||
if bounds_len < 2 {
|
||||
let b1 =
|
||||
children[0].map_or(BBox::new(), |c| base.bounds[c.bounds_range().0]);
|
||||
let b2 =
|
||||
children[1].map_or(BBox::new(), |c| base.bounds[c.bounds_range().0]);
|
||||
let b3 =
|
||||
children[2].map_or(BBox::new(), |c| base.bounds[c.bounds_range().0]);
|
||||
let b4 =
|
||||
children[3].map_or(BBox::new(), |c| base.bounds[c.bounds_range().0]);
|
||||
bounds[0] = BBox4::from_bboxes(b1, b2, b3, b4);
|
||||
} else {
|
||||
for (i, b) in bounds.iter_mut().enumerate() {
|
||||
let time = i as f32 / (bounds_len - 1) as f32;
|
||||
|
||||
let b1 = children[0].map_or(BBox::new(), |c| {
|
||||
let (x, y) = c.bounds_range();
|
||||
lerp_slice(&base.bounds[x..y], time)
|
||||
});
|
||||
let b2 = children[1].map_or(BBox::new(), |c| {
|
||||
let (x, y) = c.bounds_range();
|
||||
lerp_slice(&base.bounds[x..y], time)
|
||||
});
|
||||
let b3 = children[2].map_or(BBox::new(), |c| {
|
||||
let (x, y) = c.bounds_range();
|
||||
lerp_slice(&base.bounds[x..y], time)
|
||||
});
|
||||
let b4 = children[3].map_or(BBox::new(), |c| {
|
||||
let (x, y) = c.bounds_range();
|
||||
lerp_slice(&base.bounds[x..y], time)
|
||||
});
|
||||
*b = BBox4::from_bboxes(b1, b2, b3, b4);
|
||||
}
|
||||
}
|
||||
bounds
|
||||
};
|
||||
|
||||
// Construct child nodes
|
||||
let child_nodes = unsafe {
|
||||
arena.alloc_array_uninitialized_with_alignment::<BVH4Node>(child_count, 32)
|
||||
};
|
||||
for (i, c) in children[0..child_count].iter().enumerate() {
|
||||
node_count +=
|
||||
BVH4::construct_from_base(arena, base, c.unwrap(), &mut child_nodes[i]);
|
||||
}
|
||||
|
||||
// Build this node
|
||||
*fill_node = BVH4Node::Internal {
|
||||
bounds: bounds,
|
||||
children: child_nodes,
|
||||
traversal_code: calc_traversal_code(split_info),
|
||||
};
|
||||
}
|
||||
|
||||
// Create internal node
|
||||
&BVHBaseNode::Leaf { object_range, .. } => {
|
||||
*fill_node = BVH4Node::Leaf {
|
||||
object_range: object_range,
|
||||
};
|
||||
node_count += 1;
|
||||
}
|
||||
}
|
||||
|
||||
return node_count;
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Boundable for BVH4<'a> {
|
||||
fn bounds<'b>(&'b self) -> &'b [BBox] {
|
||||
self._bounds.unwrap_or(&[])
|
||||
}
|
||||
}
|
|
@ -1,5 +1,6 @@
|
|||
// mod bvh;
|
||||
mod bvh4;
|
||||
mod bvh4_simd;
|
||||
mod bvh_base;
|
||||
mod light_array;
|
||||
mod light_tree;
|
||||
|
@ -14,7 +15,7 @@ use crate::{
|
|||
|
||||
pub use self::{
|
||||
// bvh::{BVHNode, BVH},
|
||||
bvh4::{ray_code, BVH4Node, BVH4},
|
||||
bvh4_simd::{ray_code, BVH4Node, BVH4},
|
||||
light_array::LightArray,
|
||||
light_tree::LightTree,
|
||||
};
|
||||
|
|
139
src/bbox4.rs
Normal file
139
src/bbox4.rs
Normal file
|
@ -0,0 +1,139 @@
|
|||
#![allow(dead_code)]
|
||||
|
||||
use std;
|
||||
use std::ops::{BitOr, BitOrAssign};
|
||||
|
||||
use crate::{
|
||||
bbox::BBox,
|
||||
lerp::{lerp, Lerp},
|
||||
math::{Point, Vector},
|
||||
};
|
||||
|
||||
use float4::{Bool4, Float4};
|
||||
|
||||
const BBOX_MAXT_ADJUST: f32 = 1.00000024;
|
||||
|
||||
/// A SIMD set of 4 3D axis-aligned bounding boxes.
|
||||
#[derive(Debug, Copy, Clone)]
|
||||
pub struct BBox4 {
|
||||
pub x: (Float4, Float4), // (min, max)
|
||||
pub y: (Float4, Float4), // (min, max)
|
||||
pub z: (Float4, Float4), // (min, max)
|
||||
}
|
||||
|
||||
impl BBox4 {
|
||||
/// Creates a degenerate BBox with +infinity min and -infinity max.
|
||||
pub fn new() -> BBox4 {
|
||||
BBox4 {
|
||||
x: (
|
||||
Float4::splat(std::f32::INFINITY),
|
||||
Float4::splat(std::f32::NEG_INFINITY),
|
||||
),
|
||||
y: (
|
||||
Float4::splat(std::f32::INFINITY),
|
||||
Float4::splat(std::f32::NEG_INFINITY),
|
||||
),
|
||||
z: (
|
||||
Float4::splat(std::f32::INFINITY),
|
||||
Float4::splat(std::f32::NEG_INFINITY),
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates a BBox with min as the minimum extent and max as the maximum
|
||||
/// extent.
|
||||
pub fn from_bboxes(b1: BBox, b2: BBox, b3: BBox, b4: BBox) -> BBox4 {
|
||||
BBox4 {
|
||||
x: (
|
||||
Float4::new(b1.min.x(), b2.min.x(), b3.min.x(), b4.min.x()),
|
||||
Float4::new(b1.max.x(), b2.max.x(), b3.max.x(), b4.max.x()),
|
||||
),
|
||||
y: (
|
||||
Float4::new(b1.min.y(), b2.min.y(), b3.min.y(), b4.min.y()),
|
||||
Float4::new(b1.max.y(), b2.max.y(), b3.max.y(), b4.max.y()),
|
||||
),
|
||||
z: (
|
||||
Float4::new(b1.min.z(), b2.min.z(), b3.min.z(), b4.min.z()),
|
||||
Float4::new(b1.max.z(), b2.max.z(), b3.max.z(), b4.max.z()),
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
// Returns whether the given ray intersects with the bboxes.
|
||||
pub fn intersect_ray(&self, orig: Point, dir_inv: Vector, max_t: f32) -> Bool4 {
|
||||
// Get the ray data into SIMD format.
|
||||
let ro_x = orig.co.all_0();
|
||||
let ro_y = orig.co.all_1();
|
||||
let ro_z = orig.co.all_2();
|
||||
let rdi_x = dir_inv.co.all_0();
|
||||
let rdi_y = dir_inv.co.all_1();
|
||||
let rdi_z = dir_inv.co.all_2();
|
||||
let max_t = Float4::splat(max_t);
|
||||
|
||||
// Slab tests
|
||||
let t1_x = (self.x.0 - ro_x) * rdi_x;
|
||||
let t1_y = (self.y.0 - ro_y) * rdi_y;
|
||||
let t1_z = (self.z.0 - ro_z) * rdi_z;
|
||||
let t2_x = (self.x.1 - ro_x) * rdi_x;
|
||||
let t2_y = (self.y.1 - ro_y) * rdi_y;
|
||||
let t2_z = (self.z.1 - ro_z) * rdi_z;
|
||||
|
||||
// Get the far and near t hits for each axis.
|
||||
let t_far_x = t1_x.v_max(t2_x);
|
||||
let t_far_y = t1_y.v_max(t2_y);
|
||||
let t_far_z = t1_z.v_max(t2_z);
|
||||
let t_near_x = t1_x.v_min(t2_x);
|
||||
let t_near_y = t1_y.v_min(t2_y);
|
||||
let t_near_z = t1_z.v_min(t2_z);
|
||||
|
||||
// Calculate over-all far t hit.
|
||||
let far_t =
|
||||
(t_far_x.v_min(t_far_y.v_min(t_far_z)) * Float4::splat(BBOX_MAXT_ADJUST)).v_min(max_t);
|
||||
|
||||
// Calculate over-all near t hit.
|
||||
let near_t = t_near_x
|
||||
.v_max(t_near_y)
|
||||
.v_max(t_near_z.v_max(Float4::splat(0.0)));
|
||||
|
||||
// Hit results
|
||||
near_t.lt(far_t)
|
||||
}
|
||||
}
|
||||
|
||||
/// Union of two BBoxes.
|
||||
impl BitOr for BBox4 {
|
||||
type Output = BBox4;
|
||||
|
||||
fn bitor(self, rhs: BBox4) -> BBox4 {
|
||||
BBox4 {
|
||||
x: (self.x.0.v_min(rhs.x.0), self.x.1.v_max(rhs.x.1)),
|
||||
y: (self.y.0.v_min(rhs.y.0), self.y.1.v_max(rhs.y.1)),
|
||||
z: (self.z.0.v_min(rhs.z.0), self.z.1.v_max(rhs.z.1)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl BitOrAssign for BBox4 {
|
||||
fn bitor_assign(&mut self, rhs: BBox4) {
|
||||
*self = *self | rhs;
|
||||
}
|
||||
}
|
||||
|
||||
impl Lerp for BBox4 {
|
||||
fn lerp(self, other: BBox4, alpha: f32) -> BBox4 {
|
||||
BBox4 {
|
||||
x: (
|
||||
lerp(self.x.0, other.x.0, alpha),
|
||||
lerp(self.x.1, other.x.1, alpha),
|
||||
),
|
||||
y: (
|
||||
lerp(self.y.0, other.y.0, alpha),
|
||||
lerp(self.y.1, other.y.1, alpha),
|
||||
),
|
||||
z: (
|
||||
lerp(self.z.0, other.z.0, alpha),
|
||||
lerp(self.z.1, other.z.1, alpha),
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
|
@ -17,6 +17,7 @@ extern crate lazy_static;
|
|||
mod accel;
|
||||
mod algorithm;
|
||||
mod bbox;
|
||||
mod bbox4;
|
||||
mod boundable;
|
||||
mod camera;
|
||||
mod color;
|
||||
|
|
19
src/ray.rs
19
src/ray.rs
|
@ -273,6 +273,25 @@ impl RayStack {
|
|||
}
|
||||
}
|
||||
|
||||
pub fn duplicate_next_task(&mut self) {
|
||||
let task = self.tasks.last().unwrap();
|
||||
let l = task.lane;
|
||||
let start = task.start_idx;
|
||||
let end = self.lanes[l].end_len;
|
||||
|
||||
for i in start..end {
|
||||
let idx = self.lanes[l].idxs[i];
|
||||
self.lanes[l].idxs.push(idx);
|
||||
}
|
||||
|
||||
self.tasks.push(RayTask {
|
||||
lane: l,
|
||||
start_idx: end,
|
||||
});
|
||||
|
||||
self.lanes[l].end_len = self.lanes[l].idxs.len();
|
||||
}
|
||||
|
||||
/// Pops the next task off the stack, and executes the provided closure for
|
||||
/// each ray index in the task. The return value of the closure is the list
|
||||
/// of lanes (by index) to add the given ray index back into.
|
||||
|
|
Loading…
Reference in New Issue
Block a user