Misc optimizations that add up to a nice speed boost.

This commit is contained in:
Nathan Vegdahl 2019-06-29 14:20:32 +09:00
parent c4b8971805
commit 4f7335db8c
2 changed files with 58 additions and 37 deletions

View File

@ -125,34 +125,40 @@ impl<'a> BVH4<'a> {
// Ray testing // Ray testing
ray_stack.pop_do_next_task_and_push_rays(children.len(), |ray_idx| { ray_stack.pop_do_next_task_and_push_rays(children.len(), |ray_idx| {
if rays.is_done(ray_idx) { if rays.is_done(ray_idx) {
(Bool4::new_false(), 0) Bool4::new_false()
} else { } else {
let hits = lerp_slice(bounds, rays.time(ray_idx)).intersect_ray( let hits = if bounds.len() == 1 {
rays.orig_local(ray_idx), bounds[0].intersect_ray(
rays.dir_inv_local(ray_idx), rays.orig_local(ray_idx),
rays.max_t(ray_idx), rays.dir_inv_local(ray_idx),
); rays.max_t(ray_idx),
)
} else {
lerp_slice(bounds, rays.time(ray_idx)).intersect_ray(
rays.orig_local(ray_idx),
rays.dir_inv_local(ray_idx),
rays.max_t(ray_idx),
)
};
all_hits = all_hits | hits; all_hits = all_hits | hits;
(hits, children.len()) hits
} }
}); });
// If there were any intersections, create tasks. // If there were any intersections, create tasks.
if !all_hits.is_all_false() { if !all_hits.is_all_false() {
let order_code = traversal_table[traversal_code as usize]; let order_code = traversal_table[traversal_code as usize];
let mut lanes = [0usize; 4];
let mut lane_count = 0; let mut lane_count = 0;
for i in 0..children.len() { let mut i = children.len() as u8;
let inv_i = (children.len() - 1) - i; while i > 0 {
let child_i = ((order_code >> (inv_i * 2)) & 3) as usize; i -= 1;
if all_hits.get_n(child_i) { let child_i = ((order_code >> (i * 2)) & 3) as usize;
if ray_stack.push_lane_to_task(child_i) {
node_stack[stack_ptr + lane_count] = &children[child_i]; node_stack[stack_ptr + lane_count] = &children[child_i];
lanes[lane_count] = child_i;
lane_count += 1; lane_count += 1;
} }
} }
ray_stack.push_lanes_to_tasks(&lanes[..lane_count]);
stack_ptr += lane_count - 1; stack_ptr += lane_count - 1;
} else { } else {
stack_ptr -= 1; stack_ptr -= 1;

View File

@ -259,17 +259,29 @@ impl RayStack {
self.lanes[lane].idxs.push(ray_idx as RayIndexType); self.lanes[lane].idxs.push(ray_idx as RayIndexType);
} }
/// Pushes any excess indices on the given lane to a new task on the
/// task stack.
///
/// Returns whether a task was pushed or not. No task will be pushed
/// if there are no excess indices on the end of the lane.
pub fn push_lane_to_task(&mut self, lane_idx: usize) -> bool {
    // Indices at positions `[0, committed)` already belong to earlier
    // tasks; anything beyond that is excess awaiting a task of its own.
    let committed = self.lanes[lane_idx].end_len;
    let total = self.lanes[lane_idx].idxs.len();

    if committed >= total {
        // No excess indices on this lane: nothing to push.
        return false;
    }

    // Create a task covering the excess range, then mark the whole lane
    // as committed so those indices aren't claimed twice.
    self.tasks.push(RayTask {
        lane: lane_idx,
        start_idx: committed,
    });
    self.lanes[lane_idx].end_len = total;
    true
}
/// Takes the given list of lane indices, and pushes any excess indices on /// Takes the given list of lane indices, and pushes any excess indices on
/// the end of each into a new task, in the order provided. /// the end of each into a new task, in the order provided.
pub fn push_lanes_to_tasks(&mut self, lane_idxs: &[usize]) { pub fn push_lanes_to_tasks(&mut self, lane_idxs: &[usize]) {
for &l in lane_idxs { for &l in lane_idxs {
if self.lanes[l].end_len < self.lanes[l].idxs.len() { self.push_lane_to_task(l);
self.tasks.push(RayTask {
lane: l,
start_idx: self.lanes[l].end_len,
});
self.lanes[l].end_len = self.lanes[l].idxs.len();
}
} }
} }
@ -335,35 +347,38 @@ impl RayStack {
/// Pops the next task off the stack, executes the provided closure for /// Pops the next task off the stack, executes the provided closure for
/// each ray index in the task, and pushes the ray indices back onto the /// each ray index in the task, and pushes the ray indices back onto the
/// indicated lanes. /// indicated lanes.
pub fn pop_do_next_task_and_push_rays<F>(&mut self, needed_lanes: usize, mut handle_ray: F) pub fn pop_do_next_task_and_push_rays<F>(&mut self, output_lane_count: usize, mut handle_ray: F)
where where
F: FnMut(usize) -> (Bool4, usize), F: FnMut(usize) -> Bool4,
{ {
// Prepare lanes.
self.ensure_lane_count(needed_lanes);
// Pop the task and do necessary bookkeeping. // Pop the task and do necessary bookkeeping.
let task = self.tasks.pop().unwrap(); let task = self.tasks.pop().unwrap();
let task_range = (task.start_idx, self.lanes[task.lane].end_len); let task_range = (task.start_idx, self.lanes[task.lane].end_len);
self.lanes[task.lane].end_len = task.start_idx; self.lanes[task.lane].end_len = task.start_idx;
// SAFETY: this is probably evil, and depends on behavior of Vec that
// are not actually promised. But we're essentially truncating the lane
// to the start of our task range, but will continue to access it's
// elements beyond that range via `get_unchecked()` below. Because the
// memory is not freed nor altered, this is safe. However, again, the
// Vec apis don't promise this behavior. So:
//
// TODO: build a slightly different lane abstraction to get this same
// efficiency without depending on implicit Vec behavior.
unsafe {
self.lanes[task.lane].idxs.set_len(task.start_idx);
}
// Execute task. // Execute task.
let mut source_lane_cap = task_range.0;
for i in task_range.0..task_range.1 { for i in task_range.0..task_range.1 {
let ray_idx = self.lanes[task.lane].idxs[i]; let ray_idx = *unsafe { self.lanes[task.lane].idxs.get_unchecked(i) };
let (push_mask, c) = handle_ray(ray_idx as usize); let push_mask = handle_ray(ray_idx as usize);
for l in 0..c { for l in 0..output_lane_count {
if push_mask.get_n(l) { if push_mask.get_n(l) {
if l == task.lane { self.lanes[l as usize].idxs.push(ray_idx);
self.lanes[l as usize].idxs[source_lane_cap] = ray_idx;
source_lane_cap += 1;
} else {
self.lanes[l as usize].idxs.push(ray_idx);
}
} }
} }
} }
self.lanes[task.lane].idxs.truncate(source_lane_cap);
} }
} }