Accelerate the Sobol sampler with SIMD on x86_64.

Nathan Vegdahl 2020-04-24 23:32:43 +09:00
parent 0dfe916523
commit 72adbedbb4
3 changed files with 232 additions and 69 deletions

View File

@@ -4,7 +4,7 @@
use std::{env, fs::File, io::Write, path::Path};
/// How many components to generate.
const NUM_DIMENSIONS: usize = 256;
const NUM_DIMENSIONS: usize = 128;
/// What file to generate the numbers from.
const DIRECTION_NUMBERS_TEXT: &str = include_str!("direction_numbers/new-joe-kuo-5.1024.txt");

View File

@@ -70,7 +70,7 @@ fn sobol_int4_rev(dimension_set: u32, index: u32) -> Int4 {
/// Scrambles `n` using the Laine Karras hash. This is equivalent to Owen
/// scrambling, but on reversed bits.
#[inline(always)]
#[inline]
fn lk_scramble(mut n: u32, scramble: u32) -> u32 {
// This uses the technique presented in the paper "Stratified Sampling for
// Stochastic Transparency" by Laine and Karras to scramble the bits.
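For reference, a hash-based Laine-Karras-style scramble generally has the shape sketched below. This is an illustration only, not code from this commit: the function name is made up, and the multiply constants are ones published for hash-based Owen scrambling, not necessarily the constants this repository uses.

// Sketch only: one possible Laine-Karras-style scramble on reversed bits.
// Constants are illustrative, taken from published hash-based Owen
// scrambling variants, not necessarily what this codebase uses.
fn lk_scramble_sketch(mut n: u32, scramble: u32) -> u32 {
    n = n.wrapping_add(scramble);
    n ^= n.wrapping_mul(0x6c50b47c);
    n ^= n.wrapping_mul(0xb82f1e52);
    n ^= n.wrapping_mul(0xc7afe638);
    n ^= n.wrapping_mul(0x8d22f6e6);
    n
}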

View File

@@ -1,10 +1,170 @@
#[derive(Debug, Copy, Clone)]
#[repr(align(16))]
pub(crate) struct Int4 {
v: [u32; 4],
}
//--------------------------------------------------------------------------
// x86/64 SSE
#[cfg(target_arch = "x86_64")]
// #[cfg(all(target_arch = "x86_64", target_feature = "sse4.1"))]
pub(crate) mod sse {
use core::arch::x86_64::{
__m128i,
// SSE2 or less
_mm_add_epi32,
_mm_and_si128,
_mm_cvtepi32_ps,
_mm_mul_ps,
_mm_or_si128,
_mm_set1_epi32,
_mm_set1_ps,
_mm_setzero_si128,
_mm_slli_epi32,
_mm_srli_epi32,
_mm_xor_si128,
};
use core::arch::x86_64::{
// SSE3 / SSE4.1
// Note: these aren't necessarily actually available on all
// x86_64 platforms, so their use here isn't quite correct
// with the platform guard above.
// TODO: fix this at some point.
_mm_loadu_si128,
_mm_mullo_epi32,
};
#[derive(Debug, Copy, Clone)]
pub(crate) struct Int4 {
v: __m128i,
}
impl Int4 {
#[inline(always)]
pub fn zero() -> Int4 {
Int4 {
v: unsafe { _mm_setzero_si128() },
}
}
/// Converts the full range of a 32 bit integer to a float in [0, 1).
#[inline(always)]
pub fn to_norm_floats(self) -> [f32; 4] {
const ONE_OVER_31BITS: f32 = 1.0 / (1u64 << 31) as f32;
let n4 = unsafe {
_mm_mul_ps(
_mm_cvtepi32_ps(_mm_srli_epi32(self.v, 1)),
_mm_set1_ps(ONE_OVER_31BITS),
)
};
unsafe { std::mem::transmute(n4) }
}
#[inline]
pub fn reverse_bits(self) -> Int4 {
let mut n = self.v;
unsafe {
let a = _mm_slli_epi32(n, 16);
let b = _mm_srli_epi32(n, 16);
n = _mm_or_si128(a, b);
//----
let a = _mm_and_si128(
_mm_slli_epi32(n, 8),
_mm_set1_epi32(std::mem::transmute(0xff00ff00u32)),
);
let b = _mm_and_si128(
_mm_srli_epi32(n, 8),
_mm_set1_epi32(std::mem::transmute(0x00ff00ffu32)),
);
n = _mm_or_si128(a, b);
//----
let a = _mm_and_si128(
_mm_slli_epi32(n, 4),
_mm_set1_epi32(std::mem::transmute(0xf0f0f0f0u32)),
);
let b = _mm_and_si128(
_mm_srli_epi32(n, 4),
_mm_set1_epi32(std::mem::transmute(0x0f0f0f0fu32)),
);
n = _mm_or_si128(a, b);
//----
let a = _mm_and_si128(
_mm_slli_epi32(n, 2),
_mm_set1_epi32(std::mem::transmute(0xccccccccu32)),
);
let b = _mm_and_si128(
_mm_srli_epi32(n, 2),
_mm_set1_epi32(std::mem::transmute(0x33333333u32)),
);
n = _mm_or_si128(a, b);
//----
let a = _mm_and_si128(
_mm_slli_epi32(n, 1),
_mm_set1_epi32(std::mem::transmute(0xaaaaaaaau32)),
);
let b = _mm_and_si128(
_mm_srli_epi32(n, 1),
_mm_set1_epi32(std::mem::transmute(0x55555555u32)),
);
n = _mm_or_si128(a, b);
Int4 { v: n }
}
}
}
impl std::ops::MulAssign for Int4 {
#[inline(always)]
fn mul_assign(&mut self, other: Self) {
*self = Int4 {
v: unsafe { _mm_mullo_epi32(self.v, other.v) },
};
}
}
impl std::ops::AddAssign for Int4 {
#[inline(always)]
fn add_assign(&mut self, other: Self) {
*self = Int4 {
v: unsafe { _mm_add_epi32(self.v, other.v) },
};
}
}
impl std::ops::BitXorAssign for Int4 {
#[inline(always)]
fn bitxor_assign(&mut self, other: Self) {
*self = Int4 {
v: unsafe { _mm_xor_si128(self.v, other.v) },
};
}
}
impl From<[u32; 4]> for Int4 {
#[inline(always)]
fn from(v: [u32; 4]) -> Self {
Int4 {
v: unsafe { _mm_loadu_si128(std::mem::transmute(&v as *const u32)) },
}
}
}
}
#[cfg(target_arch = "x86_64")]
pub(crate) use sse::Int4;
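As the TODO in the `use` block above notes, `_mm_mullo_epi32` is an SSE4.1 intrinsic while the module is gated only on `target_arch = "x86_64"`. One conventional way to tighten the guards, matching the commented-out cfg lines, is sketched below. This is not part of the commit, and it assumes the build actually enables the feature (e.g. RUSTFLAGS="-C target-feature=+sse4.1"):

// Sketch only: gate on the target feature as well as the architecture.
#[cfg(all(target_arch = "x86_64", target_feature = "sse4.1"))]
pub(crate) use sse::Int4;

#[cfg(not(all(target_arch = "x86_64", target_feature = "sse4.1")))]
pub(crate) use fallback::Int4;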
//--------------------------------------------------------------------------
// Fallback
#[cfg(not(target_arch = "x86_64"))]
// #[cfg(not(all(target_arch = "x86_64", target_feature = "sse4.1")))]
pub(crate) mod fallback {
#[derive(Debug, Copy, Clone)]
#[repr(align(16))]
pub(crate) struct Int4 {
v: [u32; 4],
}
impl Int4 {
pub fn zero() -> Int4 {
Int4 { v: [0, 0, 0, 0] }
}
@@ -30,9 +190,9 @@ impl Int4 {
],
}
}
}
}
impl std::ops::MulAssign for Int4 {
fn mul_assign(&mut self, other: Self) {
*self = Int4 {
v: [
@@ -43,9 +203,9 @@ impl std::ops::MulAssign for Int4 {
],
};
}
}
}
impl std::ops::AddAssign for Int4 {
fn add_assign(&mut self, other: Self) {
*self = Int4 {
v: [
@@ -56,9 +216,9 @@ impl std::ops::AddAssign for Int4 {
],
};
}
}
}
impl std::ops::BitXorAssign for Int4 {
fn bitxor_assign(&mut self, other: Self) {
*self = Int4 {
v: [
@@ -69,10 +229,13 @@ impl std::ops::BitXorAssign for Int4 {
],
};
}
}
}
impl From<[u32; 4]> for Int4 {
fn from(v: [u32; 4]) -> Self {
Int4 { v: v }
}
}
}
#[cfg(not(target_arch = "x86_64"))]
pub(crate) use fallback::Int4;
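For reference, the two less obvious SSE routines above (`reverse_bits` and `to_norm_floats`) compute, per 32-bit lane, the same results as the scalar sketch below. This is an illustration of the semantics, not code from this commit, and the function names are made up:

// Per-lane scalar equivalents of the SSE routines above (illustrative only).

// reverse_bits: the shift-and-mask cascade is the classic bitwise reversal,
// which the standard library provides directly for scalars.
fn reverse_bits_scalar(n: u32) -> u32 {
    n.reverse_bits()
}

// to_norm_floats: drop the low bit so the value fits in a non-negative i32,
// convert to float, then scale by 1 / 2^31 to land in [0, 1).
fn to_norm_float_scalar(n: u32) -> f32 {
    const ONE_OVER_31BITS: f32 = 1.0 / (1u64 << 31) as f32;
    (n >> 1) as f32 * ONE_OVER_31BITS
}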