Preparing for SIMD-accelerated Sobol sampling.

This implements the 4-wide API and moves the renderer over to it,
but the actual implementation is still scalar.
Nathan Vegdahl 2020-04-24 21:05:29 +09:00
parent 78b5cf4c53
commit 0dfe916523
4 changed files with 179 additions and 53 deletions
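
For context, a minimal sketch of the calling convention this commit introduces. The get_sample_4d signature is taken from the renderer diff below; the stub body and the argument values are placeholders for illustration, not the repository's implementation.

// Sketch only: stand-in body; the real implementation is in the diff below.
fn get_sample_4d(
    dimension_set: u32,
    i: u32,
    pixel_co: (u32, u32),
    seed: u32,
) -> (f32, f32, f32, f32) {
    let _ = (dimension_set, i, pixel_co, seed);
    (0.0, 0.0, 0.0, 0.0) // placeholder values
}

fn main() {
    // Before: five scalar get_sample() calls, one per dimension 0..=4.
    // After: two 4-wide calls cover the same five dimensions, with three
    // lanes of the second call currently unused.
    let (d0, d1, d2, d3) = get_sample_4d(0, 0, (0, 0), 0xb4d5eed);
    let (d4, _, _, _) = get_sample_4d(1, 0, (0, 0), 0xb4d5eed);
    println!("{} {} {} {} {}", d0, d1, d2, d3, d4);
}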

renderer.rs

@@ -241,14 +241,14 @@ impl<'a> Renderer<'a> {
         for y in bucket.y..(bucket.y + bucket.h) {
             for x in bucket.x..(bucket.x + bucket.w) {
                 for si in 0..self.spp {
+                    // Raw sample numbers.
+                    let (d0, d1, d2, d3) = get_sample_4d(0, si as u32, (x, y), self.seed);
+                    let (d4, _, _, _) = get_sample_4d(1, si as u32, (x, y), self.seed);
+
                     // Calculate image plane x and y coordinates
                     let (img_x, img_y) = {
-                        let filter_x =
-                            probit(get_sample(3, si as u32, (x, y), self.seed), 2.0 / 6.0)
-                                + 0.5;
-                        let filter_y =
-                            probit(get_sample(4, si as u32, (x, y), self.seed), 2.0 / 6.0)
-                                + 0.5;
+                        let filter_x = probit(d3, 2.0 / 6.0) + 0.5;
+                        let filter_y = probit(d4, 2.0 / 6.0) + 0.5;
                         let samp_x = (filter_x + x as f32) * cmpx;
                         let samp_y = (filter_y + y as f32) * cmpy;
                         ((samp_x - 0.5) * x_extent, (0.5 - samp_y) * y_extent)
@@ -260,11 +260,8 @@ impl<'a> Renderer<'a> {
                         self.seed,
                         (x, y),
                         (img_x, img_y),
-                        (
-                            get_sample(1, si as u32, (x, y), self.seed),
-                            get_sample(2, si as u32, (x, y), self.seed),
-                        ),
-                        get_sample(0, si as u32, (x, y), self.seed),
+                        (d1, d2),
+                        d0,
                         map_0_1_to_wavelength(golden_ratio_sample(
                             si as u32,
                             hash_u32((x << 16) ^ y, self.seed),
@@ -438,10 +435,10 @@ impl LightPath {
         self.sampling_seed += 1;
     }

-    fn next_lds_samp(&mut self) -> f32 {
+    fn next_lds_samp(&mut self) -> (f32, f32, f32, f32) {
         let dimension = self.dim_offset;
         self.dim_offset += 1;
-        get_sample(
+        get_sample_4d(
            dimension,
            self.sample_number,
            self.pixel_co,
@@ -490,12 +487,8 @@ impl LightPath {
         // Prepare light ray
         self.next_lds_sequence();
-        let light_n = self.next_lds_samp();
-        let light_uvw = (
-            self.next_lds_samp(),
-            self.next_lds_samp(),
-            self.next_lds_samp(),
-        );
+        let (light_n, d2, d3, d4) = self.next_lds_samp();
+        let light_uvw = (d2, d3, d4);

         xform_stack.clear();
         let light_info = scene.sample_lights(
             xform_stack,
@@ -609,8 +602,7 @@ impl LightPath {
         // Sample closure
         let (dir, filter, pdf) = {
             self.next_lds_sequence();
-            let u = self.next_lds_samp();
-            let v = self.next_lds_samp();
+            let (u, v, _, _) = self.next_lds_samp();
             closure.sample(
                 idata.incoming,
                 idata.nor,
@@ -703,20 +695,31 @@ impl LightPath {
 /// and switching to random samples at higher dimensions where
 /// LDS samples aren't available.
 #[inline(always)]
-fn get_sample(dimension: u32, i: u32, pixel_co: (u32, u32), seed: u32) -> f32 {
+fn get_sample_4d(
+    dimension_set: u32,
+    i: u32,
+    pixel_co: (u32, u32),
+    seed: u32,
+) -> (f32, f32, f32, f32) {
     // A unique seed for every pixel coordinate up to a resolution of
     // 65536 x 65536. Also incorporating the seed.
     let seed = pixel_co.0 ^ (pixel_co.1 << 16) ^ seed.wrapping_mul(0x736caf6f);
-    match dimension {
-        d if d < sobol::MAX_DIMENSION as u32 => {
+    match dimension_set {
+        ds if ds < sobol::MAX_DIMENSION as u32 => {
             // Sobol sampling.
-            sobol::sample(d, i, seed)
+            let n4 = sobol::sample_4d(ds, i, seed);
+            (n4[0], n4[1], n4[2], n4[3])
         }
-        d => {
+        ds => {
             // Random sampling.
             use crate::hash::hash_u32_to_f32;
-            hash_u32_to_f32(d ^ (i << 16), seed)
+            (
+                hash_u32_to_f32((ds * 4 + 0) ^ (i << 16), seed),
+                hash_u32_to_f32((ds * 4 + 1) ^ (i << 16), seed),
+                hash_u32_to_f32((ds * 4 + 2) ^ (i << 16), seed),
+                hash_u32_to_f32((ds * 4 + 3) ^ (i << 16), seed),
+            )
         }
     }
 }
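
The random-sampling fallback above makes lane k of dimension set ds behave like scalar dimension ds * 4 + k would have. Below is a self-contained sketch of that branch, with a stand-in bit-mixing hash in place of the project's crate::hash::hash_u32_to_f32 (an assumption for illustration, not the real hash).

// Placeholder hash, NOT the project's hash_u32_to_f32.
fn hash_u32_to_f32(mut n: u32, seed: u32) -> f32 {
    n ^= seed;
    n = n.wrapping_mul(0x9e3779b9);
    n ^= n >> 16;
    n = n.wrapping_mul(0x85ebca6b);
    n ^= n >> 13;
    n as f32 * (1.0 / (1u64 << 32) as f32) // map the full u32 range to [0, 1)
}

fn main() {
    // A dimension set past the end of the Sobol direction-number table.
    let (ds, i, seed) = (70u32, 3u32, 0x1234u32);
    // Lane k of set `ds` stands in for scalar dimension ds * 4 + k.
    let lanes: Vec<f32> = (0..4u32)
        .map(|k| hash_u32_to_f32((ds * 4 + k) ^ (i << 16), seed))
        .collect();
    println!("{:?}", lanes);
}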

sobol/build.rs

@@ -4,7 +4,7 @@
 use std::{env, fs::File, io::Write, path::Path};

 /// How many components to generate.
-const NUM_DIMENSIONS: usize = 1024;
+const NUM_DIMENSIONS: usize = 256;

 /// What file to generate the numbers from.
 const DIRECTION_NUMBERS_TEXT: &str = include_str!("direction_numbers/new-joe-kuo-5.1024.txt");
@@ -22,14 +22,35 @@ fn main() {
     .unwrap();

     // Write the vectors.
-    // We actually write them with reversed bits due to how the library uses
-    // them, which is atypical.
-    f.write_all(format!("pub const REV_VECTORS: &[[u{0}; {0}]] = &[\n", SOBOL_BITS).as_bytes())
+    // We write them in a rather atypical way because of how the library
+    // uses them. First, we interleave the numbers of each set of four
+    // dimensions, for SIMD evaluation. Second, each number is written
+    // with reversed bits, to avoid needing to reverse them before scrambling.
+    f.write_all(
+        format!(
+            "pub const REV_VECTORS: &[[[u{0}; 4]; {0}]] = &[\n",
+            SOBOL_BITS
+        )
+        .as_bytes(),
+    )
     .unwrap();
-    for v in vectors.iter() {
+    for d4 in vectors.chunks_exact(4) {
         f.write_all("    [\n".as_bytes()).unwrap();
-        for n in v.iter() {
-            f.write_all(format!("        0x{:08x},\n", n.reverse_bits()).as_bytes())
+        for ((a, b), (c, d)) in d4[0]
+            .iter()
+            .zip(d4[1].iter())
+            .zip(d4[2].iter().zip(d4[3].iter()))
+        {
+            f.write_all(
+                format!(
+                    "        [0x{:08x}, 0x{:08x}, 0x{:08x}, 0x{:08x}],\n",
+                    a.reverse_bits(),
+                    b.reverse_bits(),
+                    c.reverse_bits(),
+                    d.reverse_bits()
+                )
+                .as_bytes(),
+            )
             .unwrap();
         }
         f.write_all("    ],\n".as_bytes()).unwrap();
@@ -87,7 +108,7 @@ fn main() {
 // OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
 // IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

-type SobolInt = u16;
+type SobolInt = u32;

 const SOBOL_BITS: usize = std::mem::size_of::<SobolInt>() * 8;
 pub fn generate_direction_vectors(dimensions: usize) -> Vec<[SobolInt; SOBOL_BITS]> {
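
With NUM_DIMENSIONS = 256 and four dimensions per set, the generated vectors.inc now holds 64 outer entries, each a [[u32; 4]; 32] table: one inner [u32; 4] row per index bit, carrying the bit-reversed direction numbers of four consecutive dimensions. An illustrative sketch of that layout with made-up values (real entries come from the Joe-Kuo direction numbers):

// Illustrative shape only; the numbers below are NOT real direction numbers.
pub const REV_VECTORS: &[[[u32; 4]; 32]] = &[
    // Dimensions 0..4: row i holds the (bit-reversed) direction numbers
    // XORed in when bit i of the shuffled index is set.
    [[0x80000000, 0xc0000000, 0xa0000000, 0xf0000000]; 32],
    // Dimensions 4..8, and so on for 64 entries total.
    [[0x88000000, 0xcc000000, 0xaa000000, 0xff000000]; 32],
];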

sobol/src/lib.rs

@@ -1,5 +1,8 @@
 //! An implementation of the Sobol sequence with Owen scrambling.

+mod wide;
+use wide::Int4;
+
 // The following `include` provides `MAX_DIMENSION` and `REV_VECTORS`.
 // See the build.rs file for how this included file is generated.
 include!(concat!(env!("OUT_DIR"), "/vectors.inc"));
@@ -15,17 +18,24 @@ include!(concat!(env!("OUT_DIR"), "/vectors.inc"));
 /// parameter exceeds 2^16-1, the sample set will start repeating.
 #[inline]
 pub fn sample(dimension: u32, index: u32, seed: u32) -> f32 {
+    sample_4d(dimension >> 2, index, seed)[(dimension & 0b11) as usize]
+}
+
+/// Same as `sample()` but calculates a set of 4 dimensions all at once
+/// using SIMD.
+#[inline]
+pub fn sample_4d(dimension_set: u32, index: u32, seed: u32) -> [f32; 4] {
     // This index shuffling approach is due to Brent Burley, and is
     // what allows us to create statistically independent Sobol sequences.
     let shuffled_rev_index = lk_scramble(index.reverse_bits(), seed);

-    let sobol = lk_scramble(
-        sobol_u32_rev(dimension, shuffled_rev_index),
-        dimension ^ seed,
+    let sobol = lk_int4_scramble(
+        sobol_int4_rev(dimension_set, shuffled_rev_index),
+        dimension_set ^ seed,
     )
     .reverse_bits();

-    u32_to_0_1_f32(sobol)
+    sobol.to_norm_floats()
 }

 //----------------------------------------------------------------------
@@ -40,22 +50,22 @@ pub fn sample(dimension: u32, index: u32, seed: u32) -> f32 {
 /// Note: if the `index` parameter exceeds 2^16-1, the sample set will start
 /// repeating.
 #[inline(always)]
-fn sobol_u32_rev(dimension: u32, index: u32) -> u32 {
-    assert!(dimension < MAX_DIMENSION);
-    let vecs = &REV_VECTORS[dimension as usize];
+fn sobol_int4_rev(dimension_set: u32, index: u32) -> Int4 {
+    assert!(dimension_set < (MAX_DIMENSION / 4));
+    let vecs = &REV_VECTORS[dimension_set as usize];
     let mut index = (index >> 16) as u16;

-    let mut result = 0;
+    let mut result = Int4::zero();
     let mut i = 0;
     while index != 0 {
         let j = index.leading_zeros();
-        result ^= vecs[(i + j) as usize];
+        result ^= vecs[(i + j) as usize].into();
         i += j + 1;
         index <<= j;
         index <<= 1;
     }

-    result as u32
+    result
 }

 /// Scrambles `n` using the Laine Karras hash. This is equivalent to Owen
@@ -85,6 +95,26 @@ fn lk_scramble(mut n: u32, scramble: u32) -> u32 {
     n
 }

+/// Same as `lk_scramble()`, except does it on 4 integers at a time
+/// with SIMD.
+#[inline(always)]
+fn lk_int4_scramble(mut n: Int4, scramble: u32) -> Int4 {
+    n += {
+        let a = hash(scramble, 2);
+        let b = a ^ 0x174f18ab;
+        let c = a ^ 0x691e72ca;
+        let d = a ^ 0xb40cc1b8;
+        [a, b, c, d].into()
+    };
+
+    n ^= [0xdc967795; 4].into();
+    n *= [0x97b754b7; 4].into();
+    n ^= [0x866350b1; 4].into();
+    n *= [0x9e3779cd; 4].into();
+
+    n
+}
+
 /// Same as `lk_scramble()` except uses a slower more full version of
 /// hashing.
 ///
@@ -114,9 +144,3 @@ fn hash(n: u32, rounds: u32) -> u32 {
     }
     hash
 }
-
-#[inline(always)]
-fn u32_to_0_1_f32(n: u32) -> f32 {
-    const ONE_OVER_32BITS: f32 = 1.0 / (1u64 << 32) as f32;
-    n as f32 * ONE_OVER_32BITS
-}
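
The new scalar sample() above is just a lane select over the 4-wide evaluation: set = dimension >> 2, lane = dimension & 0b11. A tiny standalone demo of that index mapping:

fn main() {
    for dimension in 0..8u32 {
        let set = dimension >> 2; // which 4-wide dimension set
        let lane = dimension & 0b11; // which lane within that set
        println!("dimension {} -> set {}, lane {}", dimension, set, lane);
    }
}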

sobol/src/wide.rs (new file)

@@ -0,0 +1,78 @@
#[derive(Debug, Copy, Clone)]
#[repr(align(16))]
pub(crate) struct Int4 {
    v: [u32; 4],
}

impl Int4 {
    pub fn zero() -> Int4 {
        Int4 { v: [0, 0, 0, 0] }
    }

    /// Converts the full range of a 32 bit integer to a float in [0, 1).
    pub fn to_norm_floats(self) -> [f32; 4] {
        const ONE_OVER_32BITS: f32 = 1.0 / (1u64 << 32) as f32;
        [
            self.v[0] as f32 * ONE_OVER_32BITS,
            self.v[1] as f32 * ONE_OVER_32BITS,
            self.v[2] as f32 * ONE_OVER_32BITS,
            self.v[3] as f32 * ONE_OVER_32BITS,
        ]
    }

    pub fn reverse_bits(self) -> Int4 {
        Int4 {
            v: [
                self.v[0].reverse_bits(),
                self.v[1].reverse_bits(),
                self.v[2].reverse_bits(),
                self.v[3].reverse_bits(),
            ],
        }
    }
}

impl std::ops::MulAssign for Int4 {
    fn mul_assign(&mut self, other: Self) {
        *self = Int4 {
            v: [
                self.v[0].wrapping_mul(other.v[0]),
                self.v[1].wrapping_mul(other.v[1]),
                self.v[2].wrapping_mul(other.v[2]),
                self.v[3].wrapping_mul(other.v[3]),
            ],
        };
    }
}

impl std::ops::AddAssign for Int4 {
    fn add_assign(&mut self, other: Self) {
        *self = Int4 {
            v: [
                self.v[0].wrapping_add(other.v[0]),
                self.v[1].wrapping_add(other.v[1]),
                self.v[2].wrapping_add(other.v[2]),
                self.v[3].wrapping_add(other.v[3]),
            ],
        };
    }
}

impl std::ops::BitXorAssign for Int4 {
    fn bitxor_assign(&mut self, other: Self) {
        *self = Int4 {
            v: [
                self.v[0] ^ other.v[0],
                self.v[1] ^ other.v[1],
                self.v[2] ^ other.v[2],
                self.v[3] ^ other.v[3],
            ],
        };
    }
}

impl From<[u32; 4]> for Int4 {
    fn from(v: [u32; 4]) -> Self {
        Int4 { v: v }
    }
}
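
A short usage sketch of Int4 (it is pub(crate), so this only compiles inside the sobol crate): all three operators work lane-wise, and the add and multiply wrap on overflow, which the Laine Karras scramble's mixing steps rely on.

// Sketch assuming access to Int4 from within the crate.
fn demo() {
    let mut n: Int4 = [1, 2, 3, 4].into();
    n += [0x9e3779b9; 4].into(); // lane-wise wrapping add
    n *= [0x85ebca6b; 4].into(); // lane-wise wrapping mul
    n ^= [0xdeadbeef; 4].into(); // lane-wise xor
    let f = n.reverse_bits().to_norm_floats();
    assert!(f.iter().all(|x| (0.0..1.0).contains(x)));
}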