From 4f7335db8c5e614b3e053751c0d706eb279dc0c0 Mon Sep 17 00:00:00 2001 From: Nathan Vegdahl Date: Sat, 29 Jun 2019 14:20:32 +0900 Subject: [PATCH] Misc optimizations that add up to a nice speed boost. --- src/accel/bvh4.rs | 34 +++++++++++++++----------- src/ray.rs | 61 +++++++++++++++++++++++++++++------------------ 2 files changed, 58 insertions(+), 37 deletions(-) diff --git a/src/accel/bvh4.rs b/src/accel/bvh4.rs index 3708c64..5b09e0f 100644 --- a/src/accel/bvh4.rs +++ b/src/accel/bvh4.rs @@ -125,34 +125,40 @@ impl<'a> BVH4<'a> { // Ray testing ray_stack.pop_do_next_task_and_push_rays(children.len(), |ray_idx| { if rays.is_done(ray_idx) { - (Bool4::new_false(), 0) + Bool4::new_false() } else { - let hits = lerp_slice(bounds, rays.time(ray_idx)).intersect_ray( - rays.orig_local(ray_idx), - rays.dir_inv_local(ray_idx), - rays.max_t(ray_idx), - ); + let hits = if bounds.len() == 1 { + bounds[0].intersect_ray( + rays.orig_local(ray_idx), + rays.dir_inv_local(ray_idx), + rays.max_t(ray_idx), + ) + } else { + lerp_slice(bounds, rays.time(ray_idx)).intersect_ray( + rays.orig_local(ray_idx), + rays.dir_inv_local(ray_idx), + rays.max_t(ray_idx), + ) + }; all_hits = all_hits | hits; - (hits, children.len()) + hits } }); // If there were any intersections, create tasks. 
if !all_hits.is_all_false() { let order_code = traversal_table[traversal_code as usize]; - let mut lanes = [0usize; 4]; let mut lane_count = 0; - for i in 0..children.len() { - let inv_i = (children.len() - 1) - i; - let child_i = ((order_code >> (inv_i * 2)) & 3) as usize; - if all_hits.get_n(child_i) { + let mut i = children.len() as u8; + while i > 0 { + i -= 1; + let child_i = ((order_code >> (i * 2)) & 3) as usize; + if ray_stack.push_lane_to_task(child_i) { node_stack[stack_ptr + lane_count] = &children[child_i]; - lanes[lane_count] = child_i; lane_count += 1; } } - ray_stack.push_lanes_to_tasks(&lanes[..lane_count]); stack_ptr += lane_count - 1; } else { stack_ptr -= 1; diff --git a/src/ray.rs b/src/ray.rs index 9727522..7c2bc83 100644 --- a/src/ray.rs +++ b/src/ray.rs @@ -259,17 +259,29 @@ impl RayStack { self.lanes[lane].idxs.push(ray_idx as RayIndexType); } + /// Pushes any excess indices on the given lane to a new task on the + /// task stack. + /// + /// Returns whether a task was pushed or not. No task will be pushed + /// if there are no excess indices on the end of the lane. + pub fn push_lane_to_task(&mut self, lane_idx: usize) -> bool { + if self.lanes[lane_idx].end_len < self.lanes[lane_idx].idxs.len() { + self.tasks.push(RayTask { + lane: lane_idx, + start_idx: self.lanes[lane_idx].end_len, + }); + self.lanes[lane_idx].end_len = self.lanes[lane_idx].idxs.len(); + true + } else { + false + } + } + /// Takes the given list of lane indices, and pushes any excess indices on /// the end of each into a new task, in the order provided. 
pub fn push_lanes_to_tasks(&mut self, lane_idxs: &[usize]) { for &l in lane_idxs { - if self.lanes[l].end_len < self.lanes[l].idxs.len() { - self.tasks.push(RayTask { - lane: l, - start_idx: self.lanes[l].end_len, - }); - self.lanes[l].end_len = self.lanes[l].idxs.len(); - } + self.push_lane_to_task(l); } } @@ -335,35 +347,38 @@ impl RayStack { /// Pops the next task off the stack, executes the provided closure for /// each ray index in the task, and pushes the ray indices back onto the /// indicated lanes. - pub fn pop_do_next_task_and_push_rays(&mut self, needed_lanes: usize, mut handle_ray: F) + pub fn pop_do_next_task_and_push_rays(&mut self, output_lane_count: usize, mut handle_ray: F) where - F: FnMut(usize) -> (Bool4, usize), + F: FnMut(usize) -> Bool4, { - // Prepare lanes. - self.ensure_lane_count(needed_lanes); - // Pop the task and do necessary bookkeeping. let task = self.tasks.pop().unwrap(); let task_range = (task.start_idx, self.lanes[task.lane].end_len); self.lanes[task.lane].end_len = task.start_idx; + // SAFETY: this is probably evil, and depends on behavior of Vec that + // is not actually promised. But we're essentially truncating the lane + // to the start of our task range, but will continue to access its + // elements beyond that range via `get_unchecked()` below. Because the + // memory is not freed nor altered, this is safe. However, again, the + // Vec APIs don't promise this behavior. So: + // + // TODO: build a slightly different lane abstraction to get this same + // efficiency without depending on implicit Vec behavior. + unsafe { + self.lanes[task.lane].idxs.set_len(task.start_idx); + } + // Execute task. 
- let mut source_lane_cap = task_range.0; for i in task_range.0..task_range.1 { - let ray_idx = self.lanes[task.lane].idxs[i]; - let (push_mask, c) = handle_ray(ray_idx as usize); - for l in 0..c { + let ray_idx = *unsafe { self.lanes[task.lane].idxs.get_unchecked(i) }; + let push_mask = handle_ray(ray_idx as usize); + for l in 0..output_lane_count { if push_mask.get_n(l) { - if l == task.lane { - self.lanes[l as usize].idxs[source_lane_cap] = ray_idx; - source_lane_cap += 1; - } else { - self.lanes[l as usize].idxs.push(ray_idx); - } + self.lanes[l as usize].idxs.push(ray_idx); } } } - self.lanes[task.lane].idxs.truncate(source_lane_cap); } }