Preparing for SIMD-accelerated Sobol sampling.

This implements the 4-wide API and moves the renderer over to it,
but the actual implementation is still scalar.
Nathan Vegdahl 2020-04-24 21:05:29 +09:00
parent 78b5cf4c53
commit 0dfe916523
4 changed files with 179 additions and 53 deletions
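
For context, a minimal sketch of the calling convention this commit introduces. The get_sample_4d signature is taken from the renderer diff below; the stub body and the argument values are placeholders for illustration, not the repository's implementation.

// Sketch only: stand-in body; the real implementation is in the diff below.
fn get_sample_4d(
    dimension_set: u32,
    i: u32,
    pixel_co: (u32, u32),
    seed: u32,
) -> (f32, f32, f32, f32) {
    let _ = (dimension_set, i, pixel_co, seed);
    (0.0, 0.0, 0.0, 0.0) // placeholder values
}

fn main() {
    // Before: five scalar get_sample() calls, one per dimension 0..=4.
    // After: two 4-wide calls cover the same five dimensions, with three
    // lanes of the second call currently unused.
    let (d0, d1, d2, d3) = get_sample_4d(0, 0, (0, 0), 0xb4d5eed);
    let (d4, _, _, _) = get_sample_4d(1, 0, (0, 0), 0xb4d5eed);
    println!("{} {} {} {} {}", d0, d1, d2, d3, d4);
}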

renderer.rs

@@ -241,14 +241,14 @@ impl<'a> Renderer<'a> {
         for y in bucket.y..(bucket.y + bucket.h) {
             for x in bucket.x..(bucket.x + bucket.w) {
                 for si in 0..self.spp {
+                    // Raw sample numbers.
+                    let (d0, d1, d2, d3) = get_sample_4d(0, si as u32, (x, y), self.seed);
+                    let (d4, _, _, _) = get_sample_4d(1, si as u32, (x, y), self.seed);
+
                     // Calculate image plane x and y coordinates
                     let (img_x, img_y) = {
-                        let filter_x =
-                            probit(get_sample(3, si as u32, (x, y), self.seed), 2.0 / 6.0)
-                                + 0.5;
-                        let filter_y =
-                            probit(get_sample(4, si as u32, (x, y), self.seed), 2.0 / 6.0)
-                                + 0.5;
+                        let filter_x = probit(d3, 2.0 / 6.0) + 0.5;
+                        let filter_y = probit(d4, 2.0 / 6.0) + 0.5;
                         let samp_x = (filter_x + x as f32) * cmpx;
                         let samp_y = (filter_y + y as f32) * cmpy;
                         ((samp_x - 0.5) * x_extent, (0.5 - samp_y) * y_extent)
@@ -260,11 +260,8 @@ impl<'a> Renderer<'a> {
                         self.seed,
                         (x, y),
                         (img_x, img_y),
-                        (
-                            get_sample(1, si as u32, (x, y), self.seed),
-                            get_sample(2, si as u32, (x, y), self.seed),
-                        ),
-                        get_sample(0, si as u32, (x, y), self.seed),
+                        (d1, d2),
+                        d0,
                         map_0_1_to_wavelength(golden_ratio_sample(
                             si as u32,
                             hash_u32((x << 16) ^ y, self.seed),
@@ -438,10 +435,10 @@ impl LightPath {
         self.sampling_seed += 1;
     }

-    fn next_lds_samp(&mut self) -> f32 {
+    fn next_lds_samp(&mut self) -> (f32, f32, f32, f32) {
         let dimension = self.dim_offset;
         self.dim_offset += 1;
-        get_sample(
+        get_sample_4d(
            dimension,
            self.sample_number,
            self.pixel_co,
@@ -490,12 +487,8 @@ impl LightPath {
         // Prepare light ray
         self.next_lds_sequence();
-        let light_n = self.next_lds_samp();
-        let light_uvw = (
-            self.next_lds_samp(),
-            self.next_lds_samp(),
-            self.next_lds_samp(),
-        );
+        let (light_n, d2, d3, d4) = self.next_lds_samp();
+        let light_uvw = (d2, d3, d4);

         xform_stack.clear();
         let light_info = scene.sample_lights(
             xform_stack,
@@ -609,8 +602,7 @@ impl LightPath {
         // Sample closure
         let (dir, filter, pdf) = {
             self.next_lds_sequence();
-            let u = self.next_lds_samp();
-            let v = self.next_lds_samp();
+            let (u, v, _, _) = self.next_lds_samp();
             closure.sample(
                 idata.incoming,
                 idata.nor,
@@ -703,20 +695,31 @@ impl LightPath {
 /// and switching to random samples at higher dimensions where
 /// LDS samples aren't available.
 #[inline(always)]
-fn get_sample(dimension: u32, i: u32, pixel_co: (u32, u32), seed: u32) -> f32 {
+fn get_sample_4d(
+    dimension_set: u32,
+    i: u32,
+    pixel_co: (u32, u32),
+    seed: u32,
+) -> (f32, f32, f32, f32) {
     // A unique seed for every pixel coordinate up to a resolution of
     // 65536 x 65536. Also incorporating the seed.
     let seed = pixel_co.0 ^ (pixel_co.1 << 16) ^ seed.wrapping_mul(0x736caf6f);
-    match dimension {
-        d if d < sobol::MAX_DIMENSION as u32 => {
+    match dimension_set {
+        ds if ds < sobol::MAX_DIMENSION as u32 => {
             // Sobol sampling.
-            sobol::sample(d, i, seed)
+            let n4 = sobol::sample_4d(ds, i, seed);
+            (n4[0], n4[1], n4[2], n4[3])
         }
-        d => {
+        ds => {
             // Random sampling.
             use crate::hash::hash_u32_to_f32;
-            hash_u32_to_f32(d ^ (i << 16), seed)
+            (
+                hash_u32_to_f32((ds * 4 + 0) ^ (i << 16), seed),
+                hash_u32_to_f32((ds * 4 + 1) ^ (i << 16), seed),
+                hash_u32_to_f32((ds * 4 + 2) ^ (i << 16), seed),
+                hash_u32_to_f32((ds * 4 + 3) ^ (i << 16), seed),
+            )
         }
     }
 }
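
The random-sampling fallback above makes lane k of dimension set ds behave like scalar dimension ds * 4 + k would have. Below is a self-contained sketch of that branch, with a stand-in bit-mixing hash in place of the project's crate::hash::hash_u32_to_f32 (an assumption for illustration, not the real hash).

// Placeholder hash, NOT the project's hash_u32_to_f32.
fn hash_u32_to_f32(mut n: u32, seed: u32) -> f32 {
    n ^= seed;
    n = n.wrapping_mul(0x9e3779b9);
    n ^= n >> 16;
    n = n.wrapping_mul(0x85ebca6b);
    n ^= n >> 13;
    n as f32 * (1.0 / (1u64 << 32) as f32) // map the full u32 range to [0, 1)
}

fn main() {
    // A dimension set past the end of the Sobol direction-number table.
    let (ds, i, seed) = (70u32, 3u32, 0x1234u32);
    // Lane k of set `ds` stands in for scalar dimension ds * 4 + k.
    let lanes: Vec<f32> = (0..4u32)
        .map(|k| hash_u32_to_f32((ds * 4 + k) ^ (i << 16), seed))
        .collect();
    println!("{:?}", lanes);
}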

sobol/build.rs

@@ -4,7 +4,7 @@
 use std::{env, fs::File, io::Write, path::Path};

 /// How many components to generate.
-const NUM_DIMENSIONS: usize = 1024;
+const NUM_DIMENSIONS: usize = 256;

 /// What file to generate the numbers from.
 const DIRECTION_NUMBERS_TEXT: &str = include_str!("direction_numbers/new-joe-kuo-5.1024.txt");
@@ -22,14 +22,35 @@ fn main() {
     .unwrap();

     // Write the vectors.
-    // We actually write them with reversed bits due to how the library uses
-    // them, which is atypical.
-    f.write_all(format!("pub const REV_VECTORS: &[[u{0}; {0}]] = &[\n", SOBOL_BITS).as_bytes())
+    // We write them in a rather atypical way because of how the library
+    // uses them. First, we interleave the numbers of each set of four
+    // dimensions, for SIMD evaluation. Second, each number is written
+    // with reversed bits, to avoid needing to reverse them before scrambling.
+    f.write_all(
+        format!(
+            "pub const REV_VECTORS: &[[[u{0}; 4]; {0}]] = &[\n",
+            SOBOL_BITS
+        )
+        .as_bytes(),
+    )
     .unwrap();
-    for v in vectors.iter() {
+    for d4 in vectors.chunks_exact(4) {
         f.write_all("    [\n".as_bytes()).unwrap();
-        for n in v.iter() {
-            f.write_all(format!("        0x{:08x},\n", n.reverse_bits()).as_bytes())
+        for ((a, b), (c, d)) in d4[0]
+            .iter()
+            .zip(d4[1].iter())
+            .zip(d4[2].iter().zip(d4[3].iter()))
+        {
+            f.write_all(
+                format!(
+                    "        [0x{:08x}, 0x{:08x}, 0x{:08x}, 0x{:08x}],\n",
+                    a.reverse_bits(),
+                    b.reverse_bits(),
+                    c.reverse_bits(),
+                    d.reverse_bits()
+                )
+                .as_bytes(),
+            )
             .unwrap();
         }
         f.write_all("    ],\n".as_bytes()).unwrap();
@@ -87,7 +108,7 @@ fn main() {
 // OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
 // IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

-type SobolInt = u16;
+type SobolInt = u32;

 const SOBOL_BITS: usize = std::mem::size_of::<SobolInt>() * 8;
 pub fn generate_direction_vectors(dimensions: usize) -> Vec<[SobolInt; SOBOL_BITS]> {
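
With NUM_DIMENSIONS = 256 and four dimensions per set, the generated vectors.inc now holds 64 outer entries, each a [[u32; 4]; 32] table: one inner [u32; 4] row per index bit, carrying the bit-reversed direction numbers of four consecutive dimensions. An illustrative sketch of that layout with made-up values (real entries come from the Joe-Kuo direction numbers):

// Illustrative shape only; the numbers below are NOT real direction numbers.
pub const REV_VECTORS: &[[[u32; 4]; 32]] = &[
    // Dimensions 0..4: row i holds the (bit-reversed) direction numbers
    // XORed in when bit i of the shuffled index is set.
    [[0x80000000, 0xc0000000, 0xa0000000, 0xf0000000]; 32],
    // Dimensions 4..8, and so on for 64 entries total.
    [[0x88000000, 0xcc000000, 0xaa000000, 0xff000000]; 32],
];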

sobol/src/lib.rs

@@ -1,5 +1,8 @@
 //! An implementation of the Sobol sequence with Owen scrambling.

+mod wide;
+use wide::Int4;
+
 // The following `include` provides `MAX_DIMENSION` and `REV_VECTORS`.
 // See the build.rs file for how this included file is generated.
 include!(concat!(env!("OUT_DIR"), "/vectors.inc"));
@@ -15,17 +18,24 @@ include!(concat!(env!("OUT_DIR"), "/vectors.inc"));
 /// parameter exceeds 2^16-1, the sample set will start repeating.
 #[inline]
 pub fn sample(dimension: u32, index: u32, seed: u32) -> f32 {
+    sample_4d(dimension >> 2, index, seed)[(dimension & 0b11) as usize]
+}
+
+/// Same as `sample()` but calculates a set of 4 dimensions all at once
+/// using SIMD.
+#[inline]
+pub fn sample_4d(dimension_set: u32, index: u32, seed: u32) -> [f32; 4] {
     // This index shuffling approach is due to Brent Burley, and is
     // what allows us to create statistically independent Sobol sequences.
     let shuffled_rev_index = lk_scramble(index.reverse_bits(), seed);

-    let sobol = lk_scramble(
-        sobol_u32_rev(dimension, shuffled_rev_index),
-        dimension ^ seed,
+    let sobol = lk_int4_scramble(
+        sobol_int4_rev(dimension_set, shuffled_rev_index),
+        dimension_set ^ seed,
     )
     .reverse_bits();

-    u32_to_0_1_f32(sobol)
+    sobol.to_norm_floats()
 }

 //----------------------------------------------------------------------
@@ -40,22 +50,22 @@ pub fn sample(dimension: u32, index: u32, seed: u32) -> f32 {
 /// Note: if the `index` parameter exceeds 2^16-1, the sample set will start
 /// repeating.
 #[inline(always)]
-fn sobol_u32_rev(dimension: u32, index: u32) -> u32 {
-    assert!(dimension < MAX_DIMENSION);
-    let vecs = &REV_VECTORS[dimension as usize];
+fn sobol_int4_rev(dimension_set: u32, index: u32) -> Int4 {
+    assert!(dimension_set < (MAX_DIMENSION / 4));
+    let vecs = &REV_VECTORS[dimension_set as usize];
     let mut index = (index >> 16) as u16;

-    let mut result = 0;
+    let mut result = Int4::zero();
     let mut i = 0;
     while index != 0 {
         let j = index.leading_zeros();
-        result ^= vecs[(i + j) as usize];
+        result ^= vecs[(i + j) as usize].into();
         i += j + 1;
         index <<= j;
         index <<= 1;
     }

-    result as u32
+    result
 }

 /// Scrambles `n` using the Laine Karras hash. This is equivalent to Owen
@@ -85,6 +95,26 @@ fn lk_scramble(mut n: u32, scramble: u32) -> u32 {
     n
 }

+/// Same as `lk_scramble()`, except does it on 4 integers at a time
+/// with SIMD.
+#[inline(always)]
+fn lk_int4_scramble(mut n: Int4, scramble: u32) -> Int4 {
+    n += {
+        let a = hash(scramble, 2);
+        let b = a ^ 0x174f18ab;
+        let c = a ^ 0x691e72ca;
+        let d = a ^ 0xb40cc1b8;
+        [a, b, c, d].into()
+    };
+
+    n ^= [0xdc967795; 4].into();
+    n *= [0x97b754b7; 4].into();
+    n ^= [0x866350b1; 4].into();
+    n *= [0x9e3779cd; 4].into();
+
+    n
+}
+
 /// Same as `lk_scramble()` except uses a slower more full version of
 /// hashing.
 ///
@@ -114,9 +144,3 @@ fn hash(n: u32, rounds: u32) -> u32 {
     }
     hash
 }
-
-#[inline(always)]
-fn u32_to_0_1_f32(n: u32) -> f32 {
-    const ONE_OVER_32BITS: f32 = 1.0 / (1u64 << 32) as f32;
-    n as f32 * ONE_OVER_32BITS
-}
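
The new scalar sample() above is just a lane select over the 4-wide evaluation: set = dimension >> 2, lane = dimension & 0b11. A tiny standalone demo of that index mapping:

fn main() {
    for dimension in 0..8u32 {
        let set = dimension >> 2; // which 4-wide dimension set
        let lane = dimension & 0b11; // which lane within that set
        println!("dimension {} -> set {}, lane {}", dimension, set, lane);
    }
}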

sobol/src/wide.rs (new file)

@@ -0,0 +1,78 @@
#[derive(Debug, Copy, Clone)]
#[repr(align(16))]
pub(crate) struct Int4 {
    v: [u32; 4],
}

impl Int4 {
    pub fn zero() -> Int4 {
        Int4 { v: [0, 0, 0, 0] }
    }

    /// Converts the full range of a 32 bit integer to a float in [0, 1).
    pub fn to_norm_floats(self) -> [f32; 4] {
        const ONE_OVER_32BITS: f32 = 1.0 / (1u64 << 32) as f32;
        [
            self.v[0] as f32 * ONE_OVER_32BITS,
            self.v[1] as f32 * ONE_OVER_32BITS,
            self.v[2] as f32 * ONE_OVER_32BITS,
            self.v[3] as f32 * ONE_OVER_32BITS,
        ]
    }

    pub fn reverse_bits(self) -> Int4 {
        Int4 {
            v: [
                self.v[0].reverse_bits(),
                self.v[1].reverse_bits(),
                self.v[2].reverse_bits(),
                self.v[3].reverse_bits(),
            ],
        }
    }
}

impl std::ops::MulAssign for Int4 {
    fn mul_assign(&mut self, other: Self) {
        *self = Int4 {
            v: [
                self.v[0].wrapping_mul(other.v[0]),
                self.v[1].wrapping_mul(other.v[1]),
                self.v[2].wrapping_mul(other.v[2]),
                self.v[3].wrapping_mul(other.v[3]),
            ],
        };
    }
}

impl std::ops::AddAssign for Int4 {
    fn add_assign(&mut self, other: Self) {
        *self = Int4 {
            v: [
                self.v[0].wrapping_add(other.v[0]),
                self.v[1].wrapping_add(other.v[1]),
                self.v[2].wrapping_add(other.v[2]),
                self.v[3].wrapping_add(other.v[3]),
            ],
        };
    }
}

impl std::ops::BitXorAssign for Int4 {
    fn bitxor_assign(&mut self, other: Self) {
        *self = Int4 {
            v: [
                self.v[0] ^ other.v[0],
                self.v[1] ^ other.v[1],
                self.v[2] ^ other.v[2],
                self.v[3] ^ other.v[3],
            ],
        };
    }
}

impl From<[u32; 4]> for Int4 {
    fn from(v: [u32; 4]) -> Self {
        Int4 { v: v }
    }
}
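
A short usage sketch of Int4 (it is pub(crate), so this only compiles inside the sobol crate): all three operators work lane-wise, and the add and multiply wrap on overflow, which the Laine Karras scramble's mixing steps rely on.

// Sketch assuming access to Int4 from within the crate.
fn demo() {
    let mut n: Int4 = [1, 2, 3, 4].into();
    n += [0x9e3779b9; 4].into(); // lane-wise wrapping add
    n *= [0x85ebca6b; 4].into(); // lane-wise wrapping mul
    n ^= [0xdeadbeef; 4].into(); // lane-wise xor
    let f = n.reverse_bits().to_norm_floats();
    assert!(f.iter().all(|x| (0.0..1.0).contains(x)));
}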