Preparing for SIMD accelerated Sobol sampling.
This implements the 4-wide API, and moves the renderer over to it. But the actual implementation is still scalar.
This commit is contained in:
parent
78b5cf4c53
commit
0dfe916523
|
@ -241,14 +241,14 @@ impl<'a> Renderer<'a> {
|
||||||
for y in bucket.y..(bucket.y + bucket.h) {
|
for y in bucket.y..(bucket.y + bucket.h) {
|
||||||
for x in bucket.x..(bucket.x + bucket.w) {
|
for x in bucket.x..(bucket.x + bucket.w) {
|
||||||
for si in 0..self.spp {
|
for si in 0..self.spp {
|
||||||
|
// Raw sample numbers.
|
||||||
|
let (d0, d1, d2, d3) = get_sample_4d(0, si as u32, (x, y), self.seed);
|
||||||
|
let (d4, _, _, _) = get_sample_4d(1, si as u32, (x, y), self.seed);
|
||||||
|
|
||||||
// Calculate image plane x and y coordinates
|
// Calculate image plane x and y coordinates
|
||||||
let (img_x, img_y) = {
|
let (img_x, img_y) = {
|
||||||
let filter_x =
|
let filter_x = probit(d3, 2.0 / 6.0) + 0.5;
|
||||||
probit(get_sample(3, si as u32, (x, y), self.seed), 2.0 / 6.0)
|
let filter_y = probit(d4, 2.0 / 6.0) + 0.5;
|
||||||
+ 0.5;
|
|
||||||
let filter_y =
|
|
||||||
probit(get_sample(4, si as u32, (x, y), self.seed), 2.0 / 6.0)
|
|
||||||
+ 0.5;
|
|
||||||
let samp_x = (filter_x + x as f32) * cmpx;
|
let samp_x = (filter_x + x as f32) * cmpx;
|
||||||
let samp_y = (filter_y + y as f32) * cmpy;
|
let samp_y = (filter_y + y as f32) * cmpy;
|
||||||
((samp_x - 0.5) * x_extent, (0.5 - samp_y) * y_extent)
|
((samp_x - 0.5) * x_extent, (0.5 - samp_y) * y_extent)
|
||||||
|
@ -260,11 +260,8 @@ impl<'a> Renderer<'a> {
|
||||||
self.seed,
|
self.seed,
|
||||||
(x, y),
|
(x, y),
|
||||||
(img_x, img_y),
|
(img_x, img_y),
|
||||||
(
|
(d1, d2),
|
||||||
get_sample(1, si as u32, (x, y), self.seed),
|
d0,
|
||||||
get_sample(2, si as u32, (x, y), self.seed),
|
|
||||||
),
|
|
||||||
get_sample(0, si as u32, (x, y), self.seed),
|
|
||||||
map_0_1_to_wavelength(golden_ratio_sample(
|
map_0_1_to_wavelength(golden_ratio_sample(
|
||||||
si as u32,
|
si as u32,
|
||||||
hash_u32((x << 16) ^ y, self.seed),
|
hash_u32((x << 16) ^ y, self.seed),
|
||||||
|
@ -438,10 +435,10 @@ impl LightPath {
|
||||||
self.sampling_seed += 1;
|
self.sampling_seed += 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
fn next_lds_samp(&mut self) -> f32 {
|
fn next_lds_samp(&mut self) -> (f32, f32, f32, f32) {
|
||||||
let dimension = self.dim_offset;
|
let dimension = self.dim_offset;
|
||||||
self.dim_offset += 1;
|
self.dim_offset += 1;
|
||||||
get_sample(
|
get_sample_4d(
|
||||||
dimension,
|
dimension,
|
||||||
self.sample_number,
|
self.sample_number,
|
||||||
self.pixel_co,
|
self.pixel_co,
|
||||||
|
@ -490,12 +487,8 @@ impl LightPath {
|
||||||
|
|
||||||
// Prepare light ray
|
// Prepare light ray
|
||||||
self.next_lds_sequence();
|
self.next_lds_sequence();
|
||||||
let light_n = self.next_lds_samp();
|
let (light_n, d2, d3, d4) = self.next_lds_samp();
|
||||||
let light_uvw = (
|
let light_uvw = (d2, d3, d4);
|
||||||
self.next_lds_samp(),
|
|
||||||
self.next_lds_samp(),
|
|
||||||
self.next_lds_samp(),
|
|
||||||
);
|
|
||||||
xform_stack.clear();
|
xform_stack.clear();
|
||||||
let light_info = scene.sample_lights(
|
let light_info = scene.sample_lights(
|
||||||
xform_stack,
|
xform_stack,
|
||||||
|
@ -609,8 +602,7 @@ impl LightPath {
|
||||||
// Sample closure
|
// Sample closure
|
||||||
let (dir, filter, pdf) = {
|
let (dir, filter, pdf) = {
|
||||||
self.next_lds_sequence();
|
self.next_lds_sequence();
|
||||||
let u = self.next_lds_samp();
|
let (u, v, _, _) = self.next_lds_samp();
|
||||||
let v = self.next_lds_samp();
|
|
||||||
closure.sample(
|
closure.sample(
|
||||||
idata.incoming,
|
idata.incoming,
|
||||||
idata.nor,
|
idata.nor,
|
||||||
|
@ -703,20 +695,31 @@ impl LightPath {
|
||||||
/// and switching to random samples at higher dimensions where
|
/// and switching to random samples at higher dimensions where
|
||||||
/// LDS samples aren't available.
|
/// LDS samples aren't available.
|
||||||
#[inline(always)]
|
#[inline(always)]
|
||||||
fn get_sample(dimension: u32, i: u32, pixel_co: (u32, u32), seed: u32) -> f32 {
|
fn get_sample_4d(
|
||||||
|
dimension_set: u32,
|
||||||
|
i: u32,
|
||||||
|
pixel_co: (u32, u32),
|
||||||
|
seed: u32,
|
||||||
|
) -> (f32, f32, f32, f32) {
|
||||||
// A unique seed for every pixel coordinate up to a resolution of
|
// A unique seed for every pixel coordinate up to a resolution of
|
||||||
// 65536 x 65536. Also incorperating the seed.
|
// 65536 x 65536. Also incorperating the seed.
|
||||||
let seed = pixel_co.0 ^ (pixel_co.1 << 16) ^ seed.wrapping_mul(0x736caf6f);
|
let seed = pixel_co.0 ^ (pixel_co.1 << 16) ^ seed.wrapping_mul(0x736caf6f);
|
||||||
|
|
||||||
match dimension {
|
match dimension_set {
|
||||||
d if d < sobol::MAX_DIMENSION as u32 => {
|
ds if ds < sobol::MAX_DIMENSION as u32 => {
|
||||||
// Sobol sampling.
|
// Sobol sampling.
|
||||||
sobol::sample(d, i, seed)
|
let n4 = sobol::sample_4d(ds, i, seed);
|
||||||
|
(n4[0], n4[1], n4[2], n4[3])
|
||||||
}
|
}
|
||||||
d => {
|
ds => {
|
||||||
// Random sampling.
|
// Random sampling.
|
||||||
use crate::hash::hash_u32_to_f32;
|
use crate::hash::hash_u32_to_f32;
|
||||||
hash_u32_to_f32(d ^ (i << 16), seed)
|
(
|
||||||
|
hash_u32_to_f32((ds * 4 + 0) ^ (i << 16), seed),
|
||||||
|
hash_u32_to_f32((ds * 4 + 1) ^ (i << 16), seed),
|
||||||
|
hash_u32_to_f32((ds * 4 + 2) ^ (i << 16), seed),
|
||||||
|
hash_u32_to_f32((ds * 4 + 3) ^ (i << 16), seed),
|
||||||
|
)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -4,7 +4,7 @@
|
||||||
use std::{env, fs::File, io::Write, path::Path};
|
use std::{env, fs::File, io::Write, path::Path};
|
||||||
|
|
||||||
/// How many components to generate.
|
/// How many components to generate.
|
||||||
const NUM_DIMENSIONS: usize = 1024;
|
const NUM_DIMENSIONS: usize = 256;
|
||||||
|
|
||||||
/// What file to generate the numbers from.
|
/// What file to generate the numbers from.
|
||||||
const DIRECTION_NUMBERS_TEXT: &str = include_str!("direction_numbers/new-joe-kuo-5.1024.txt");
|
const DIRECTION_NUMBERS_TEXT: &str = include_str!("direction_numbers/new-joe-kuo-5.1024.txt");
|
||||||
|
@ -22,14 +22,35 @@ fn main() {
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
// Write the vectors.
|
// Write the vectors.
|
||||||
// We actually write them with reversed bits due to how the library uses
|
// We write them in a rather atypical way because of how the library
|
||||||
// them, which is atypical.
|
// uses them. First, we interleave the numbers of each set of four
|
||||||
f.write_all(format!("pub const REV_VECTORS: &[[u{0}; {0}]] = &[\n", SOBOL_BITS).as_bytes())
|
// dimensions, for SIMD evaluation. Second, each number is written
|
||||||
|
// with reversed bits, to avoid needing to reverse them before scrambling.
|
||||||
|
f.write_all(
|
||||||
|
format!(
|
||||||
|
"pub const REV_VECTORS: &[[[u{0}; 4]; {0}]] = &[\n",
|
||||||
|
SOBOL_BITS
|
||||||
|
)
|
||||||
|
.as_bytes(),
|
||||||
|
)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
for v in vectors.iter() {
|
for d4 in vectors.chunks_exact(4) {
|
||||||
f.write_all(" [\n".as_bytes()).unwrap();
|
f.write_all(" [\n".as_bytes()).unwrap();
|
||||||
for n in v.iter() {
|
for ((a, b), (c, d)) in d4[0]
|
||||||
f.write_all(format!(" 0x{:08x},\n", n.reverse_bits()).as_bytes())
|
.iter()
|
||||||
|
.zip(d4[1].iter())
|
||||||
|
.zip(d4[2].iter().zip(d4[3].iter()))
|
||||||
|
{
|
||||||
|
f.write_all(
|
||||||
|
format!(
|
||||||
|
" [0x{:08x}, 0x{:08x}, 0x{:08x}, 0x{:08x}],\n",
|
||||||
|
a.reverse_bits(),
|
||||||
|
b.reverse_bits(),
|
||||||
|
c.reverse_bits(),
|
||||||
|
d.reverse_bits()
|
||||||
|
)
|
||||||
|
.as_bytes(),
|
||||||
|
)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
}
|
}
|
||||||
f.write_all(" ],\n".as_bytes()).unwrap();
|
f.write_all(" ],\n".as_bytes()).unwrap();
|
||||||
|
@ -87,7 +108,7 @@ fn main() {
|
||||||
// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
|
// OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
|
||||||
// IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
// IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
type SobolInt = u16;
|
type SobolInt = u32;
|
||||||
const SOBOL_BITS: usize = std::mem::size_of::<SobolInt>() * 8;
|
const SOBOL_BITS: usize = std::mem::size_of::<SobolInt>() * 8;
|
||||||
|
|
||||||
pub fn generate_direction_vectors(dimensions: usize) -> Vec<[SobolInt; SOBOL_BITS]> {
|
pub fn generate_direction_vectors(dimensions: usize) -> Vec<[SobolInt; SOBOL_BITS]> {
|
||||||
|
|
|
@ -1,5 +1,8 @@
|
||||||
//! An implementation of the Sobol sequence with Owen scrambling.
|
//! An implementation of the Sobol sequence with Owen scrambling.
|
||||||
|
|
||||||
|
mod wide;
|
||||||
|
use wide::Int4;
|
||||||
|
|
||||||
// The following `include` provides `MAX_DIMENSION` and `REV_VECTORS`.
|
// The following `include` provides `MAX_DIMENSION` and `REV_VECTORS`.
|
||||||
// See the build.rs file for how this included file is generated.
|
// See the build.rs file for how this included file is generated.
|
||||||
include!(concat!(env!("OUT_DIR"), "/vectors.inc"));
|
include!(concat!(env!("OUT_DIR"), "/vectors.inc"));
|
||||||
|
@ -15,17 +18,24 @@ include!(concat!(env!("OUT_DIR"), "/vectors.inc"));
|
||||||
/// parameter exceeds 2^16-1, the sample set will start repeating.
|
/// parameter exceeds 2^16-1, the sample set will start repeating.
|
||||||
#[inline]
|
#[inline]
|
||||||
pub fn sample(dimension: u32, index: u32, seed: u32) -> f32 {
|
pub fn sample(dimension: u32, index: u32, seed: u32) -> f32 {
|
||||||
|
sample_4d(dimension >> 2, index, seed)[(dimension & 0b11) as usize]
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Same as `sample()` but calculates a set of 4 dimensions all at once
|
||||||
|
/// using SIMD.
|
||||||
|
#[inline]
|
||||||
|
pub fn sample_4d(dimension_set: u32, index: u32, seed: u32) -> [f32; 4] {
|
||||||
// This index shuffling approach is due to Brent Burley, and is
|
// This index shuffling approach is due to Brent Burley, and is
|
||||||
// what allows us to create statistically independent Sobol sequences.
|
// what allows us to create statistically independent Sobol sequences.
|
||||||
let shuffled_rev_index = lk_scramble(index.reverse_bits(), seed);
|
let shuffled_rev_index = lk_scramble(index.reverse_bits(), seed);
|
||||||
|
|
||||||
let sobol = lk_scramble(
|
let sobol = lk_int4_scramble(
|
||||||
sobol_u32_rev(dimension, shuffled_rev_index),
|
sobol_int4_rev(dimension_set, shuffled_rev_index),
|
||||||
dimension ^ seed,
|
dimension_set ^ seed,
|
||||||
)
|
)
|
||||||
.reverse_bits();
|
.reverse_bits();
|
||||||
|
|
||||||
u32_to_0_1_f32(sobol)
|
sobol.to_norm_floats()
|
||||||
}
|
}
|
||||||
|
|
||||||
//----------------------------------------------------------------------
|
//----------------------------------------------------------------------
|
||||||
|
@ -40,22 +50,22 @@ pub fn sample(dimension: u32, index: u32, seed: u32) -> f32 {
|
||||||
/// Note: if the `index` parameter exceeds 2^16-1, the sample set will start
|
/// Note: if the `index` parameter exceeds 2^16-1, the sample set will start
|
||||||
/// repeating.
|
/// repeating.
|
||||||
#[inline(always)]
|
#[inline(always)]
|
||||||
fn sobol_u32_rev(dimension: u32, index: u32) -> u32 {
|
fn sobol_int4_rev(dimension_set: u32, index: u32) -> Int4 {
|
||||||
assert!(dimension < MAX_DIMENSION);
|
assert!(dimension_set < (MAX_DIMENSION / 4));
|
||||||
let vecs = &REV_VECTORS[dimension as usize];
|
let vecs = &REV_VECTORS[dimension_set as usize];
|
||||||
let mut index = (index >> 16) as u16;
|
let mut index = (index >> 16) as u16;
|
||||||
|
|
||||||
let mut result = 0;
|
let mut result = Int4::zero();
|
||||||
let mut i = 0;
|
let mut i = 0;
|
||||||
while index != 0 {
|
while index != 0 {
|
||||||
let j = index.leading_zeros();
|
let j = index.leading_zeros();
|
||||||
result ^= vecs[(i + j) as usize];
|
result ^= vecs[(i + j) as usize].into();
|
||||||
i += j + 1;
|
i += j + 1;
|
||||||
index <<= j;
|
index <<= j;
|
||||||
index <<= 1;
|
index <<= 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
result as u32
|
result
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Scrambles `n` using the Laine Karras hash. This is equivalent to Owen
|
/// Scrambles `n` using the Laine Karras hash. This is equivalent to Owen
|
||||||
|
@ -85,6 +95,26 @@ fn lk_scramble(mut n: u32, scramble: u32) -> u32 {
|
||||||
n
|
n
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Same as `lk_scramble()`, except does it on 4 integers at a time
|
||||||
|
/// with SIMD.
|
||||||
|
#[inline(always)]
|
||||||
|
fn lk_int4_scramble(mut n: Int4, scramble: u32) -> Int4 {
|
||||||
|
n += {
|
||||||
|
let a = hash(scramble, 2);
|
||||||
|
let b = a ^ 0x174f18ab;
|
||||||
|
let c = a ^ 0x691e72ca;
|
||||||
|
let d = a ^ 0xb40cc1b8;
|
||||||
|
[a, b, c, d].into()
|
||||||
|
};
|
||||||
|
|
||||||
|
n ^= [0xdc967795; 4].into();
|
||||||
|
n *= [0x97b754b7; 4].into();
|
||||||
|
n ^= [0x866350b1; 4].into();
|
||||||
|
n *= [0x9e3779cd; 4].into();
|
||||||
|
|
||||||
|
n
|
||||||
|
}
|
||||||
|
|
||||||
/// Same as `lk_scramble()` except uses a slower more full version of
|
/// Same as `lk_scramble()` except uses a slower more full version of
|
||||||
/// hashing.
|
/// hashing.
|
||||||
///
|
///
|
||||||
|
@ -114,9 +144,3 @@ fn hash(n: u32, rounds: u32) -> u32 {
|
||||||
}
|
}
|
||||||
hash
|
hash
|
||||||
}
|
}
|
||||||
|
|
||||||
#[inline(always)]
|
|
||||||
fn u32_to_0_1_f32(n: u32) -> f32 {
|
|
||||||
const ONE_OVER_32BITS: f32 = 1.0 / (1u64 << 32) as f32;
|
|
||||||
n as f32 * ONE_OVER_32BITS
|
|
||||||
}
|
|
||||||
|
|
78
sub_crates/sobol/src/wide.rs
Normal file
78
sub_crates/sobol/src/wide.rs
Normal file
|
@ -0,0 +1,78 @@
|
||||||
|
#[derive(Debug, Copy, Clone)]
|
||||||
|
#[repr(align(16))]
|
||||||
|
pub(crate) struct Int4 {
|
||||||
|
v: [u32; 4],
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Int4 {
|
||||||
|
pub fn zero() -> Int4 {
|
||||||
|
Int4 { v: [0, 0, 0, 0] }
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Converts the full range of a 32 bit integer to a float in [0, 1).
|
||||||
|
pub fn to_norm_floats(self) -> [f32; 4] {
|
||||||
|
const ONE_OVER_32BITS: f32 = 1.0 / (1u64 << 32) as f32;
|
||||||
|
[
|
||||||
|
self.v[0] as f32 * ONE_OVER_32BITS,
|
||||||
|
self.v[1] as f32 * ONE_OVER_32BITS,
|
||||||
|
self.v[2] as f32 * ONE_OVER_32BITS,
|
||||||
|
self.v[3] as f32 * ONE_OVER_32BITS,
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn reverse_bits(self) -> Int4 {
|
||||||
|
Int4 {
|
||||||
|
v: [
|
||||||
|
self.v[0].reverse_bits(),
|
||||||
|
self.v[1].reverse_bits(),
|
||||||
|
self.v[2].reverse_bits(),
|
||||||
|
self.v[3].reverse_bits(),
|
||||||
|
],
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl std::ops::MulAssign for Int4 {
|
||||||
|
fn mul_assign(&mut self, other: Self) {
|
||||||
|
*self = Int4 {
|
||||||
|
v: [
|
||||||
|
self.v[0].wrapping_mul(other.v[0]),
|
||||||
|
self.v[1].wrapping_mul(other.v[1]),
|
||||||
|
self.v[2].wrapping_mul(other.v[2]),
|
||||||
|
self.v[3].wrapping_mul(other.v[3]),
|
||||||
|
],
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl std::ops::AddAssign for Int4 {
|
||||||
|
fn add_assign(&mut self, other: Self) {
|
||||||
|
*self = Int4 {
|
||||||
|
v: [
|
||||||
|
self.v[0].wrapping_add(other.v[0]),
|
||||||
|
self.v[1].wrapping_add(other.v[1]),
|
||||||
|
self.v[2].wrapping_add(other.v[2]),
|
||||||
|
self.v[3].wrapping_add(other.v[3]),
|
||||||
|
],
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl std::ops::BitXorAssign for Int4 {
|
||||||
|
fn bitxor_assign(&mut self, other: Self) {
|
||||||
|
*self = Int4 {
|
||||||
|
v: [
|
||||||
|
self.v[0] ^ other.v[0],
|
||||||
|
self.v[1] ^ other.v[1],
|
||||||
|
self.v[2] ^ other.v[2],
|
||||||
|
self.v[3] ^ other.v[3],
|
||||||
|
],
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl From<[u32; 4]> for Int4 {
|
||||||
|
fn from(v: [u32; 4]) -> Self {
|
||||||
|
Int4 { v: v }
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user