From 0dfe916523c00a0979de6fb1f781c3a51ba81113 Mon Sep 17 00:00:00 2001
From: Nathan Vegdahl <cessen@cessen.com>
Date: Fri, 24 Apr 2020 21:05:29 +0900
Subject: [PATCH] Preparing for SIMD accelerated Sobol sampling.

This implements the 4-wide API, and moves the renderer over to it.
But the actual implementation is still scalar.
---
 src/renderer.rs              | 57 +++++++++++++-------------
 sub_crates/sobol/build.rs    | 41 ++++++++++++++-----
 sub_crates/sobol/src/lib.rs  | 56 ++++++++++++++++++--------
 sub_crates/sobol/src/wide.rs | 78 ++++++++++++++++++++++++++++++++++++
 4 files changed, 179 insertions(+), 53 deletions(-)
 create mode 100644 sub_crates/sobol/src/wide.rs

diff --git a/src/renderer.rs b/src/renderer.rs
index 797abb2..be045ef 100644
--- a/src/renderer.rs
+++ b/src/renderer.rs
@@ -241,14 +241,14 @@ impl<'a> Renderer<'a> {
             for y in bucket.y..(bucket.y + bucket.h) {
                 for x in bucket.x..(bucket.x + bucket.w) {
                     for si in 0..self.spp {
+                        // Raw sample numbers.
+                        let (d0, d1, d2, d3) = get_sample_4d(0, si as u32, (x, y), self.seed);
+                        let (d4, _, _, _) = get_sample_4d(1, si as u32, (x, y), self.seed);
+
                         // Calculate image plane x and y coordinates
                         let (img_x, img_y) = {
-                            let filter_x =
-                                probit(get_sample(3, si as u32, (x, y), self.seed), 2.0 / 6.0)
-                                    + 0.5;
-                            let filter_y =
-                                probit(get_sample(4, si as u32, (x, y), self.seed), 2.0 / 6.0)
-                                    + 0.5;
+                            let filter_x = probit(d3, 2.0 / 6.0) + 0.5;
+                            let filter_y = probit(d4, 2.0 / 6.0) + 0.5;
                             let samp_x = (filter_x + x as f32) * cmpx;
                             let samp_y = (filter_y + y as f32) * cmpy;
                             ((samp_x - 0.5) * x_extent, (0.5 - samp_y) * y_extent)
@@ -260,11 +260,8 @@ impl<'a> Renderer<'a> {
                             self.seed,
                             (x, y),
                             (img_x, img_y),
-                            (
-                                get_sample(1, si as u32, (x, y), self.seed),
-                                get_sample(2, si as u32, (x, y), self.seed),
-                            ),
-                            get_sample(0, si as u32, (x, y), self.seed),
+                            (d1, d2),
+                            d0,
                             map_0_1_to_wavelength(golden_ratio_sample(
                                 si as u32,
                                 hash_u32((x << 16) ^ y, self.seed),
@@ -438,10 +435,10 @@ impl LightPath {
         self.sampling_seed += 1;
     }
 
-    fn next_lds_samp(&mut self) -> f32 {
+    fn next_lds_samp(&mut self) -> (f32, f32, f32, f32) {
         let dimension = self.dim_offset;
         self.dim_offset += 1;
-        get_sample(
+        get_sample_4d(
             dimension,
             self.sample_number,
             self.pixel_co,
@@ -490,12 +487,8 @@ impl LightPath {
 
                     // Prepare light ray
                     self.next_lds_sequence();
-                    let light_n = self.next_lds_samp();
-                    let light_uvw = (
-                        self.next_lds_samp(),
-                        self.next_lds_samp(),
-                        self.next_lds_samp(),
-                    );
+                    let (light_n, d2, d3, d4) = self.next_lds_samp();
+                    let light_uvw = (d2, d3, d4);
                     xform_stack.clear();
                     let light_info = scene.sample_lights(
                         xform_stack,
@@ -609,8 +602,7 @@ impl LightPath {
                         // Sample closure
                         let (dir, filter, pdf) = {
                             self.next_lds_sequence();
-                            let u = self.next_lds_samp();
-                            let v = self.next_lds_samp();
+                            let (u, v, _, _) = self.next_lds_samp();
                             closure.sample(
                                 idata.incoming,
                                 idata.nor,
@@ -703,20 +695,31 @@ impl LightPath {
 /// and switching to random samples at higher dimensions where
 /// LDS samples aren't available.
 #[inline(always)]
-fn get_sample(dimension: u32, i: u32, pixel_co: (u32, u32), seed: u32) -> f32 {
+fn get_sample_4d(
+    dimension_set: u32,
+    i: u32,
+    pixel_co: (u32, u32),
+    seed: u32,
+) -> (f32, f32, f32, f32) {
     // A unique seed for every pixel coordinate up to a resolution of
     // 65536 x 65536.  Also incorperating the seed.
     let seed = pixel_co.0 ^ (pixel_co.1 << 16) ^ seed.wrapping_mul(0x736caf6f);
 
-    match dimension {
-        d if d < sobol::MAX_DIMENSION as u32 => {
+    match dimension_set {
+        ds if ds < sobol::MAX_DIMENSION as u32 => {
             // Sobol sampling.
-            sobol::sample(d, i, seed)
+            let n4 = sobol::sample_4d(ds, i, seed);
+            (n4[0], n4[1], n4[2], n4[3])
         }
-        d => {
+        ds => {
             // Random sampling.
             use crate::hash::hash_u32_to_f32;
-            hash_u32_to_f32(d ^ (i << 16), seed)
+            (
+                hash_u32_to_f32((ds * 4 + 0) ^ (i << 16), seed),
+                hash_u32_to_f32((ds * 4 + 1) ^ (i << 16), seed),
+                hash_u32_to_f32((ds * 4 + 2) ^ (i << 16), seed),
+                hash_u32_to_f32((ds * 4 + 3) ^ (i << 16), seed),
+            )
         }
     }
 }
diff --git a/sub_crates/sobol/build.rs b/sub_crates/sobol/build.rs
index db53598..a0dbaa4 100644
--- a/sub_crates/sobol/build.rs
+++ b/sub_crates/sobol/build.rs
@@ -4,7 +4,7 @@
 use std::{env, fs::File, io::Write, path::Path};
 
 /// How many components to generate.
-const NUM_DIMENSIONS: usize = 1024;
+const NUM_DIMENSIONS: usize = 256;
 
 /// What file to generate the numbers from.
 const DIRECTION_NUMBERS_TEXT: &str = include_str!("direction_numbers/new-joe-kuo-5.1024.txt");
@@ -22,15 +22,36 @@ fn main() {
         .unwrap();
 
     // Write the vectors.
-    // We actually write them with reversed bits due to how the library uses
-    // them, which is atypical.
-    f.write_all(format!("pub const REV_VECTORS: &[[u{0}; {0}]] = &[\n", SOBOL_BITS).as_bytes())
-        .unwrap();
-    for v in vectors.iter() {
+    // We write them in a rather atypical way because of how the library
+    // uses them.  First, we interleave the numbers of each set of four
+    // dimensions, for SIMD evaluation.  Second, each number is written
+    // with reversed bits, to avoid needing to reverse them before scrambling.
+    f.write_all(
+        format!(
+            "pub const REV_VECTORS: &[[[u{0}; 4]; {0}]] = &[\n",
+            SOBOL_BITS
+        )
+        .as_bytes(),
+    )
+    .unwrap();
+    for d4 in vectors.chunks_exact(4) {
         f.write_all("  [\n".as_bytes()).unwrap();
-        for n in v.iter() {
-            f.write_all(format!("    0x{:08x},\n", n.reverse_bits()).as_bytes())
-                .unwrap();
+        for ((a, b), (c, d)) in d4[0]
+            .iter()
+            .zip(d4[1].iter())
+            .zip(d4[2].iter().zip(d4[3].iter()))
+        {
+            f.write_all(
+                format!(
+                    "    [0x{:08x}, 0x{:08x}, 0x{:08x}, 0x{:08x}],\n",
+                    a.reverse_bits(),
+                    b.reverse_bits(),
+                    c.reverse_bits(),
+                    d.reverse_bits()
+                )
+                .as_bytes(),
+            )
+            .unwrap();
         }
         f.write_all("  ],\n".as_bytes()).unwrap();
     }
@@ -87,7 +108,7 @@ fn main() {
 //     OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
 //     IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-type SobolInt = u16;
+type SobolInt = u32;
 const SOBOL_BITS: usize = std::mem::size_of::<SobolInt>() * 8;
 
 pub fn generate_direction_vectors(dimensions: usize) -> Vec<[SobolInt; SOBOL_BITS]> {
diff --git a/sub_crates/sobol/src/lib.rs b/sub_crates/sobol/src/lib.rs
index 21bc6bb..95110c5 100644
--- a/sub_crates/sobol/src/lib.rs
+++ b/sub_crates/sobol/src/lib.rs
@@ -1,5 +1,8 @@
 //! An implementation of the Sobol sequence with Owen scrambling.
 
+mod wide;
+use wide::Int4;
+
 // The following `include` provides `MAX_DIMENSION` and `REV_VECTORS`.
 // See the build.rs file for how this included file is generated.
 include!(concat!(env!("OUT_DIR"), "/vectors.inc"));
@@ -15,17 +18,24 @@ include!(concat!(env!("OUT_DIR"), "/vectors.inc"));
 /// parameter exceeds 2^16-1, the sample set will start repeating.
 #[inline]
 pub fn sample(dimension: u32, index: u32, seed: u32) -> f32 {
+    sample_4d(dimension >> 2, index, seed)[(dimension & 0b11) as usize]
+}
+
+/// Same as `sample()` but calculates a set of 4 dimensions all at once
+/// using SIMD.
+#[inline]
+pub fn sample_4d(dimension_set: u32, index: u32, seed: u32) -> [f32; 4] {
     // This index shuffling approach is due to Brent Burley, and is
     // what allows us to create statistically independent Sobol sequences.
     let shuffled_rev_index = lk_scramble(index.reverse_bits(), seed);
 
-    let sobol = lk_scramble(
-        sobol_u32_rev(dimension, shuffled_rev_index),
-        dimension ^ seed,
+    let sobol = lk_int4_scramble(
+        sobol_int4_rev(dimension_set, shuffled_rev_index),
+        dimension_set ^ seed,
     )
     .reverse_bits();
 
-    u32_to_0_1_f32(sobol)
+    sobol.to_norm_floats()
 }
 
 //----------------------------------------------------------------------
@@ -40,22 +50,22 @@ pub fn sample(dimension: u32, index: u32, seed: u32) -> f32 {
 /// Note: if the `index` parameter exceeds 2^16-1, the sample set will start
 /// repeating.
 #[inline(always)]
-fn sobol_u32_rev(dimension: u32, index: u32) -> u32 {
-    assert!(dimension < MAX_DIMENSION);
-    let vecs = &REV_VECTORS[dimension as usize];
+fn sobol_int4_rev(dimension_set: u32, index: u32) -> Int4 {
+    assert!(dimension_set < (MAX_DIMENSION / 4));
+    let vecs = &REV_VECTORS[dimension_set as usize];
     let mut index = (index >> 16) as u16;
 
-    let mut result = 0;
+    let mut result = Int4::zero();
     let mut i = 0;
     while index != 0 {
         let j = index.leading_zeros();
-        result ^= vecs[(i + j) as usize];
+        result ^= vecs[(i + j) as usize].into();
         i += j + 1;
         index <<= j;
         index <<= 1;
     }
 
-    result as u32
+    result
 }
 
 /// Scrambles `n` using the Laine Karras hash.  This is equivalent to Owen
@@ -85,6 +95,26 @@ fn lk_scramble(mut n: u32, scramble: u32) -> u32 {
     n
 }
 
+/// Same as `lk_scramble()`, except does it on 4 integers at a time
+/// with SIMD.
+#[inline(always)]
+fn lk_int4_scramble(mut n: Int4, scramble: u32) -> Int4 {
+    n += {
+        let a = hash(scramble, 2);
+        let b = a ^ 0x174f18ab;
+        let c = a ^ 0x691e72ca;
+        let d = a ^ 0xb40cc1b8;
+        [a, b, c, d].into()
+    };
+
+    n ^= [0xdc967795; 4].into();
+    n *= [0x97b754b7; 4].into();
+    n ^= [0x866350b1; 4].into();
+    n *= [0x9e3779cd; 4].into();
+
+    n
+}
+
 /// Same as `lk_scramble()` except uses a slower more full version of
 /// hashing.
 ///
@@ -114,9 +144,3 @@ fn hash(n: u32, rounds: u32) -> u32 {
     }
     hash
 }
-
-#[inline(always)]
-fn u32_to_0_1_f32(n: u32) -> f32 {
-    const ONE_OVER_32BITS: f32 = 1.0 / (1u64 << 32) as f32;
-    n as f32 * ONE_OVER_32BITS
-}
diff --git a/sub_crates/sobol/src/wide.rs b/sub_crates/sobol/src/wide.rs
new file mode 100644
index 0000000..90258d5
--- /dev/null
+++ b/sub_crates/sobol/src/wide.rs
@@ -0,0 +1,78 @@
+#[derive(Debug, Copy, Clone)]
+#[repr(align(16))]
+pub(crate) struct Int4 {
+    v: [u32; 4],
+}
+
+impl Int4 {
+    pub fn zero() -> Int4 {
+        Int4 { v: [0, 0, 0, 0] }
+    }
+
+    /// Converts the full range of a 32 bit integer to a float in [0, 1).
+    pub fn to_norm_floats(self) -> [f32; 4] {
+        const ONE_OVER_32BITS: f32 = 1.0 / (1u64 << 32) as f32;
+        [
+            self.v[0] as f32 * ONE_OVER_32BITS,
+            self.v[1] as f32 * ONE_OVER_32BITS,
+            self.v[2] as f32 * ONE_OVER_32BITS,
+            self.v[3] as f32 * ONE_OVER_32BITS,
+        ]
+    }
+
+    pub fn reverse_bits(self) -> Int4 {
+        Int4 {
+            v: [
+                self.v[0].reverse_bits(),
+                self.v[1].reverse_bits(),
+                self.v[2].reverse_bits(),
+                self.v[3].reverse_bits(),
+            ],
+        }
+    }
+}
+
+impl std::ops::MulAssign for Int4 {
+    fn mul_assign(&mut self, other: Self) {
+        *self = Int4 {
+            v: [
+                self.v[0].wrapping_mul(other.v[0]),
+                self.v[1].wrapping_mul(other.v[1]),
+                self.v[2].wrapping_mul(other.v[2]),
+                self.v[3].wrapping_mul(other.v[3]),
+            ],
+        };
+    }
+}
+
+impl std::ops::AddAssign for Int4 {
+    fn add_assign(&mut self, other: Self) {
+        *self = Int4 {
+            v: [
+                self.v[0].wrapping_add(other.v[0]),
+                self.v[1].wrapping_add(other.v[1]),
+                self.v[2].wrapping_add(other.v[2]),
+                self.v[3].wrapping_add(other.v[3]),
+            ],
+        };
+    }
+}
+
+impl std::ops::BitXorAssign for Int4 {
+    fn bitxor_assign(&mut self, other: Self) {
+        *self = Int4 {
+            v: [
+                self.v[0] ^ other.v[0],
+                self.v[1] ^ other.v[1],
+                self.v[2] ^ other.v[2],
+                self.v[3] ^ other.v[3],
+            ],
+        };
+    }
+}
+
+impl From<[u32; 4]> for Int4 {
+    fn from(v: [u32; 4]) -> Self {
+        Int4 { v: v }
+    }
+}