From 0dfe916523c00a0979de6fb1f781c3a51ba81113 Mon Sep 17 00:00:00 2001 From: Nathan Vegdahl Date: Fri, 24 Apr 2020 21:05:29 +0900 Subject: [PATCH] Preparing for SIMD accelerated Sobol sampling. This implements the 4-wide API, and moves the renderer over to it. But the actual implementation is still scalar. --- src/renderer.rs | 57 +++++++++++++------------- sub_crates/sobol/build.rs | 41 ++++++++++++++----- sub_crates/sobol/src/lib.rs | 56 ++++++++++++++++++-------- sub_crates/sobol/src/wide.rs | 78 ++++++++++++++++++++++++++++++++++++ 4 files changed, 179 insertions(+), 53 deletions(-) create mode 100644 sub_crates/sobol/src/wide.rs diff --git a/src/renderer.rs b/src/renderer.rs index 797abb2..be045ef 100644 --- a/src/renderer.rs +++ b/src/renderer.rs @@ -241,14 +241,14 @@ impl<'a> Renderer<'a> { for y in bucket.y..(bucket.y + bucket.h) { for x in bucket.x..(bucket.x + bucket.w) { for si in 0..self.spp { + // Raw sample numbers. + let (d0, d1, d2, d3) = get_sample_4d(0, si as u32, (x, y), self.seed); + let (d4, _, _, _) = get_sample_4d(1, si as u32, (x, y), self.seed); + // Calculate image plane x and y coordinates let (img_x, img_y) = { - let filter_x = - probit(get_sample(3, si as u32, (x, y), self.seed), 2.0 / 6.0) - + 0.5; - let filter_y = - probit(get_sample(4, si as u32, (x, y), self.seed), 2.0 / 6.0) - + 0.5; + let filter_x = probit(d3, 2.0 / 6.0) + 0.5; + let filter_y = probit(d4, 2.0 / 6.0) + 0.5; let samp_x = (filter_x + x as f32) * cmpx; let samp_y = (filter_y + y as f32) * cmpy; ((samp_x - 0.5) * x_extent, (0.5 - samp_y) * y_extent) @@ -260,11 +260,8 @@ impl<'a> Renderer<'a> { self.seed, (x, y), (img_x, img_y), - ( - get_sample(1, si as u32, (x, y), self.seed), - get_sample(2, si as u32, (x, y), self.seed), - ), - get_sample(0, si as u32, (x, y), self.seed), + (d1, d2), + d0, map_0_1_to_wavelength(golden_ratio_sample( si as u32, hash_u32((x << 16) ^ y, self.seed), @@ -438,10 +435,10 @@ impl LightPath { self.sampling_seed += 1; } - fn next_lds_samp(&mut self) -> f32 { + fn next_lds_samp(&mut self) -> (f32, f32, f32, f32) { let dimension = self.dim_offset; self.dim_offset += 1; - get_sample( + get_sample_4d( dimension, self.sample_number, self.pixel_co, @@ -490,12 +487,8 @@ impl LightPath { // Prepare light ray self.next_lds_sequence(); - let light_n = self.next_lds_samp(); - let light_uvw = ( - self.next_lds_samp(), - self.next_lds_samp(), - self.next_lds_samp(), - ); + let (light_n, d2, d3, d4) = self.next_lds_samp(); + let light_uvw = (d2, d3, d4); xform_stack.clear(); let light_info = scene.sample_lights( xform_stack, @@ -609,8 +602,7 @@ impl LightPath { // Sample closure let (dir, filter, pdf) = { self.next_lds_sequence(); - let u = self.next_lds_samp(); - let v = self.next_lds_samp(); + let (u, v, _, _) = self.next_lds_samp(); closure.sample( idata.incoming, idata.nor, @@ -703,20 +695,31 @@ impl LightPath { /// and switching to random samples at higher dimensions where /// LDS samples aren't available. #[inline(always)] -fn get_sample(dimension: u32, i: u32, pixel_co: (u32, u32), seed: u32) -> f32 { +fn get_sample_4d( + dimension_set: u32, + i: u32, + pixel_co: (u32, u32), + seed: u32, +) -> (f32, f32, f32, f32) { // A unique seed for every pixel coordinate up to a resolution of // 65536 x 65536. Also incorperating the seed. let seed = pixel_co.0 ^ (pixel_co.1 << 16) ^ seed.wrapping_mul(0x736caf6f); - match dimension { - d if d < sobol::MAX_DIMENSION as u32 => { + match dimension_set { + ds if ds < sobol::MAX_DIMENSION as u32 => { // Sobol sampling. - sobol::sample(d, i, seed) + let n4 = sobol::sample_4d(ds, i, seed); + (n4[0], n4[1], n4[2], n4[3]) } - d => { + ds => { // Random sampling. use crate::hash::hash_u32_to_f32; - hash_u32_to_f32(d ^ (i << 16), seed) + ( + hash_u32_to_f32((ds * 4 + 0) ^ (i << 16), seed), + hash_u32_to_f32((ds * 4 + 1) ^ (i << 16), seed), + hash_u32_to_f32((ds * 4 + 2) ^ (i << 16), seed), + hash_u32_to_f32((ds * 4 + 3) ^ (i << 16), seed), + ) } } } diff --git a/sub_crates/sobol/build.rs b/sub_crates/sobol/build.rs index db53598..a0dbaa4 100644 --- a/sub_crates/sobol/build.rs +++ b/sub_crates/sobol/build.rs @@ -4,7 +4,7 @@ use std::{env, fs::File, io::Write, path::Path}; /// How many components to generate. -const NUM_DIMENSIONS: usize = 1024; +const NUM_DIMENSIONS: usize = 256; /// What file to generate the numbers from. const DIRECTION_NUMBERS_TEXT: &str = include_str!("direction_numbers/new-joe-kuo-5.1024.txt"); @@ -22,15 +22,36 @@ fn main() { .unwrap(); // Write the vectors. - // We actually write them with reversed bits due to how the library uses - // them, which is atypical. - f.write_all(format!("pub const REV_VECTORS: &[[u{0}; {0}]] = &[\n", SOBOL_BITS).as_bytes()) - .unwrap(); - for v in vectors.iter() { + // We write them in a rather atypical way because of how the library + // uses them. First, we interleave the numbers of each set of four + // dimensions, for SIMD evaluation. Second, each number is written + // with reversed bits, to avoid needing to reverse them before scrambling. + f.write_all( + format!( + "pub const REV_VECTORS: &[[[u{0}; 4]; {0}]] = &[\n", + SOBOL_BITS + ) + .as_bytes(), + ) + .unwrap(); + for d4 in vectors.chunks_exact(4) { f.write_all(" [\n".as_bytes()).unwrap(); - for n in v.iter() { - f.write_all(format!(" 0x{:08x},\n", n.reverse_bits()).as_bytes()) - .unwrap(); + for ((a, b), (c, d)) in d4[0] + .iter() + .zip(d4[1].iter()) + .zip(d4[2].iter().zip(d4[3].iter())) + { + f.write_all( + format!( + " [0x{:08x}, 0x{:08x}, 0x{:08x}, 0x{:08x}],\n", + a.reverse_bits(), + b.reverse_bits(), + c.reverse_bits(), + d.reverse_bits() + ) + .as_bytes(), + ) + .unwrap(); } f.write_all(" ],\n".as_bytes()).unwrap(); } @@ -87,7 +108,7 @@ fn main() { // OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN // IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -type SobolInt = u16; +type SobolInt = u32; const SOBOL_BITS: usize = std::mem::size_of::() * 8; pub fn generate_direction_vectors(dimensions: usize) -> Vec<[SobolInt; SOBOL_BITS]> { diff --git a/sub_crates/sobol/src/lib.rs b/sub_crates/sobol/src/lib.rs index 21bc6bb..95110c5 100644 --- a/sub_crates/sobol/src/lib.rs +++ b/sub_crates/sobol/src/lib.rs @@ -1,5 +1,8 @@ //! An implementation of the Sobol sequence with Owen scrambling. +mod wide; +use wide::Int4; + // The following `include` provides `MAX_DIMENSION` and `REV_VECTORS`. // See the build.rs file for how this included file is generated. include!(concat!(env!("OUT_DIR"), "/vectors.inc")); @@ -15,17 +18,24 @@ include!(concat!(env!("OUT_DIR"), "/vectors.inc")); /// parameter exceeds 2^16-1, the sample set will start repeating. #[inline] pub fn sample(dimension: u32, index: u32, seed: u32) -> f32 { + sample_4d(dimension >> 2, index, seed)[(dimension & 0b11) as usize] +} + +/// Same as `sample()` but calculates a set of 4 dimensions all at once +/// using SIMD. +#[inline] +pub fn sample_4d(dimension_set: u32, index: u32, seed: u32) -> [f32; 4] { // This index shuffling approach is due to Brent Burley, and is // what allows us to create statistically independent Sobol sequences. let shuffled_rev_index = lk_scramble(index.reverse_bits(), seed); - let sobol = lk_scramble( - sobol_u32_rev(dimension, shuffled_rev_index), - dimension ^ seed, + let sobol = lk_int4_scramble( + sobol_int4_rev(dimension_set, shuffled_rev_index), + dimension_set ^ seed, ) .reverse_bits(); - u32_to_0_1_f32(sobol) + sobol.to_norm_floats() } //---------------------------------------------------------------------- @@ -40,22 +50,22 @@ pub fn sample(dimension: u32, index: u32, seed: u32) -> f32 { /// Note: if the `index` parameter exceeds 2^16-1, the sample set will start /// repeating. #[inline(always)] -fn sobol_u32_rev(dimension: u32, index: u32) -> u32 { - assert!(dimension < MAX_DIMENSION); - let vecs = &REV_VECTORS[dimension as usize]; +fn sobol_int4_rev(dimension_set: u32, index: u32) -> Int4 { + assert!(dimension_set < (MAX_DIMENSION / 4)); + let vecs = &REV_VECTORS[dimension_set as usize]; let mut index = (index >> 16) as u16; - let mut result = 0; + let mut result = Int4::zero(); let mut i = 0; while index != 0 { let j = index.leading_zeros(); - result ^= vecs[(i + j) as usize]; + result ^= vecs[(i + j) as usize].into(); i += j + 1; index <<= j; index <<= 1; } - result as u32 + result } /// Scrambles `n` using the Laine Karras hash. This is equivalent to Owen @@ -85,6 +95,26 @@ fn lk_scramble(mut n: u32, scramble: u32) -> u32 { n } +/// Same as `lk_scramble()`, except does it on 4 integers at a time +/// with SIMD. +#[inline(always)] +fn lk_int4_scramble(mut n: Int4, scramble: u32) -> Int4 { + n += { + let a = hash(scramble, 2); + let b = a ^ 0x174f18ab; + let c = a ^ 0x691e72ca; + let d = a ^ 0xb40cc1b8; + [a, b, c, d].into() + }; + + n ^= [0xdc967795; 4].into(); + n *= [0x97b754b7; 4].into(); + n ^= [0x866350b1; 4].into(); + n *= [0x9e3779cd; 4].into(); + + n +} + /// Same as `lk_scramble()` except uses a slower more full version of /// hashing. /// @@ -114,9 +144,3 @@ fn hash(n: u32, rounds: u32) -> u32 { } hash } - -#[inline(always)] -fn u32_to_0_1_f32(n: u32) -> f32 { - const ONE_OVER_32BITS: f32 = 1.0 / (1u64 << 32) as f32; - n as f32 * ONE_OVER_32BITS -} diff --git a/sub_crates/sobol/src/wide.rs b/sub_crates/sobol/src/wide.rs new file mode 100644 index 0000000..90258d5 --- /dev/null +++ b/sub_crates/sobol/src/wide.rs @@ -0,0 +1,78 @@ +#[derive(Debug, Copy, Clone)] +#[repr(align(16))] +pub(crate) struct Int4 { + v: [u32; 4], +} + +impl Int4 { + pub fn zero() -> Int4 { + Int4 { v: [0, 0, 0, 0] } + } + + /// Converts the full range of a 32 bit integer to a float in [0, 1). + pub fn to_norm_floats(self) -> [f32; 4] { + const ONE_OVER_32BITS: f32 = 1.0 / (1u64 << 32) as f32; + [ + self.v[0] as f32 * ONE_OVER_32BITS, + self.v[1] as f32 * ONE_OVER_32BITS, + self.v[2] as f32 * ONE_OVER_32BITS, + self.v[3] as f32 * ONE_OVER_32BITS, + ] + } + + pub fn reverse_bits(self) -> Int4 { + Int4 { + v: [ + self.v[0].reverse_bits(), + self.v[1].reverse_bits(), + self.v[2].reverse_bits(), + self.v[3].reverse_bits(), + ], + } + } +} + +impl std::ops::MulAssign for Int4 { + fn mul_assign(&mut self, other: Self) { + *self = Int4 { + v: [ + self.v[0].wrapping_mul(other.v[0]), + self.v[1].wrapping_mul(other.v[1]), + self.v[2].wrapping_mul(other.v[2]), + self.v[3].wrapping_mul(other.v[3]), + ], + }; + } +} + +impl std::ops::AddAssign for Int4 { + fn add_assign(&mut self, other: Self) { + *self = Int4 { + v: [ + self.v[0].wrapping_add(other.v[0]), + self.v[1].wrapping_add(other.v[1]), + self.v[2].wrapping_add(other.v[2]), + self.v[3].wrapping_add(other.v[3]), + ], + }; + } +} + +impl std::ops::BitXorAssign for Int4 { + fn bitxor_assign(&mut self, other: Self) { + *self = Int4 { + v: [ + self.v[0] ^ other.v[0], + self.v[1] ^ other.v[1], + self.v[2] ^ other.v[2], + self.v[3] ^ other.v[3], + ], + }; + } +} + +impl From<[u32; 4]> for Int4 { + fn from(v: [u32; 4]) -> Self { + Int4 { v: v } + } +}