diff --git a/sub_crates/sobol/src/lib.rs b/sub_crates/sobol/src/lib.rs index a18bb58..b149ea8 100644 --- a/sub_crates/sobol/src/lib.rs +++ b/sub_crates/sobol/src/lib.rs @@ -59,33 +59,17 @@ pub fn sample_4d(sample_index: u32, dimension_set: u32, seed: u32) -> [f32; 4] { //---------------------------------------------------------------------- -// The permutation constants used in `lk_scramble()`. -// Each tuple is for one round of permutation. The first tuple is -// optimized, and the remaining are random aside from making sure -// that they are appropriately even or odd. -const PERMS: &[(u32, u32)] = &[ - (0x9ac7ea2a, 0x7d1e78d3), - (0x2ce68764, 0x9dd00551), - (0x79b82526, 0x2dfc1a6b), - (0xf358b1d0, 0x38743c65), -]; - -// How many permutation rounds to do. -// In practice it seems like one round is plenty, but I'm leaving more -// available in case we want to increase them later. -const ROUNDS: usize = 1; - /// Scrambles `n` using a novel variation on the Laine-Karras hash. /// /// This is equivalent to Owen scrambling, but on reversed bits. #[inline(always)] fn lk_scramble(mut n: u32, scramble: u32) -> u32 { - n = n.wrapping_add(hash(scramble, 2)); + let scramble = hash(scramble); - for &(p1, p2) in PERMS.iter().take(ROUNDS) { - n ^= n.wrapping_mul(p1); - n = n.wrapping_mul(p2); - } + n = n.wrapping_add(scramble); + n ^= n.wrapping_mul(0x3354734a); + n = n.wrapping_add(n << 2); + n ^= n.wrapping_mul(scramble & !1); n } @@ -93,25 +77,28 @@ fn lk_scramble(mut n: u32, scramble: u32) -> u32 { /// Same as `lk_scramble()`, except does it on 4 integers at a time. 
#[inline(always)] fn lk_scramble_int4(mut n: Int4, scramble: u32) -> Int4 { - n += hash_int4([scramble; 4].into(), 2); + let scramble = hash_int4([scramble; 4].into()); - for &(p1, p2) in PERMS.iter().take(ROUNDS) { - n ^= n * [p1; 4].into(); - n *= [p2; 4].into(); - } + n += scramble; + n ^= n * [0x3354734a; 4].into(); + n += n << 2; + n ^= n * (scramble & [!1; 4].into()); n } -/// A simple 32-bit hash function. Its quality can be tuned with -/// the number of rounds used. +/// A good 32-bit hash function. +/// From https://github.com/skeeto/hash-prospector #[inline(always)] -fn hash(n: u32, rounds: u32) -> u32 { +fn hash(n: u32) -> u32 { let mut hash = n ^ 0x79c68e4a; - for _ in 0..rounds { - hash = hash.wrapping_mul(0x736caf6f); - hash ^= hash.wrapping_shr(16); - } + + hash ^= hash >> 16; + hash = hash.wrapping_mul(0x7feb352d); + hash ^= hash >> 15; + hash = hash.wrapping_mul(0x846ca68b); + hash ^= hash >> 16; + hash } @@ -120,12 +107,14 @@ fn hash(n: u32, rounds: u32) -> u32 { /// Each of the four numbers gets a different hash, so even if all input /// numbers are the same, the outputs will still be different for each of them. 
#[inline(always)] -fn hash_int4(n: Int4, rounds: u32) -> Int4 { - let mut hash = n; - hash ^= [0x912f69ba, 0x174f18ab, 0x691e72ca, 0xb40cc1b8].into(); - for _ in 0..rounds { - hash *= [0x736caf6f; 4].into(); - hash ^= hash.shr16(); - } +fn hash_int4(n: Int4) -> Int4 { + let mut hash = n ^ [0x912f69ba, 0x174f18ab, 0x691e72ca, 0xb40cc1b8].into(); + + hash ^= hash >> 16; + hash *= [0x7feb352d; 4].into(); + hash ^= hash >> 15; + hash *= [0x846ca68b; 4].into(); + hash ^= hash >> 16; + + hash } diff --git a/sub_crates/sobol/src/wide.rs b/sub_crates/sobol/src/wide.rs index 8602a2c..831648d 100644 --- a/sub_crates/sobol/src/wide.rs +++ b/sub_crates/sobol/src/wide.rs @@ -5,8 +5,8 @@ pub(crate) mod sse { use core::arch::x86_64::{ __m128i, _mm_add_epi32, _mm_and_si128, _mm_cvtepi32_ps, _mm_mul_ps, _mm_or_si128, - _mm_set1_epi32, _mm_set1_ps, _mm_set_epi32, _mm_setzero_si128, _mm_slli_epi32, - _mm_srli_epi32, _mm_xor_si128, + _mm_set1_epi32, _mm_set1_ps, _mm_set_epi32, _mm_setzero_si128, _mm_sll_epi32, + _mm_slli_epi32, _mm_srl_epi32, _mm_srli_epi32, _mm_xor_si128, }; #[derive(Debug, Copy, Clone)] @@ -91,13 +91,6 @@ pub(crate) mod sse { Int4 { v: n } } } - - #[inline(always)] - pub(crate) fn shr16(self) -> Int4 { - Int4 { - v: unsafe { _mm_srli_epi32(self.v, 16) }, - } - } } impl std::ops::Mul for Int4 { @@ -152,12 +145,54 @@ pub(crate) mod sse { } } + impl std::ops::BitXor for Int4 { + type Output = Int4; + + #[inline(always)] + fn bitxor(self, other: Self) -> Int4 { + Int4 { + v: unsafe { _mm_xor_si128(self.v, other.v) }, + } + } + } + impl std::ops::BitXorAssign for Int4 { #[inline(always)] fn bitxor_assign(&mut self, other: Self) { - *self = Int4 { - v: unsafe { _mm_xor_si128(self.v, other.v) }, - }; + *self = *self ^ other; + } + } + + impl std::ops::BitAnd for Int4 { + type Output = Int4; + + #[inline(always)] + fn bitand(self, other: Self) -> Int4 { + Int4 { + v: unsafe { _mm_and_si128(self.v, other.v) }, + } + } + } + + impl std::ops::Shl<i32> for Int4 { + type Output = Int4;
+ + #[inline(always)] + fn shl(self, other: i32) -> Int4 { + Int4 { + v: unsafe { _mm_sll_epi32(self.v, _mm_set_epi32(0, 0, 0, other)) }, + } + } + } + + impl std::ops::Shr<i32> for Int4 { + type Output = Int4; + + #[inline(always)] + fn shr(self, other: i32) -> Int4 { + Int4 { + v: unsafe { _mm_srl_epi32(self.v, _mm_set_epi32(0, 0, 0, other)) }, + } } } @@ -210,14 +245,18 @@ pub(crate) mod fallback { ], } } + } - pub(crate) fn shr16(self) -> Int4 { + impl std::ops::Mul for Int4 { + type Output = Int4; + + fn mul(self, other: Self) -> Int4 { Int4 { v: [ - self.v[0] >> 16, - self.v[1] >> 16, - self.v[2] >> 16, - self.v[3] >> 16, + self.v[0].wrapping_mul(other.v[0]), + self.v[1].wrapping_mul(other.v[1]), + self.v[2].wrapping_mul(other.v[2]), + self.v[3].wrapping_mul(other.v[3]), ], } } @@ -225,14 +264,7 @@ pub(crate) mod fallback { impl std::ops::MulAssign for Int4 { fn mul_assign(&mut self, other: Self) { - *self = Int4 { - v: [ - self.v[0].wrapping_mul(other.v[0]), - self.v[1].wrapping_mul(other.v[1]), - self.v[2].wrapping_mul(other.v[2]), - self.v[3].wrapping_mul(other.v[3]), - ], - }; + *self = *self * other; } } @@ -249,16 +281,75 @@ pub(crate) mod fallback { } } - impl std::ops::BitXorAssign for Int4 { - fn bitxor_assign(&mut self, other: Self) { - *self = Int4 { + impl std::ops::BitAnd for Int4 { + type Output = Int4; + fn bitand(self, other: Self) -> Int4 { + Int4 { + v: [ + self.v[0] & other.v[0], + self.v[1] & other.v[1], + self.v[2] & other.v[2], + self.v[3] & other.v[3], + ], + } + } + } + + impl std::ops::BitAndAssign for Int4 { + fn bitand_assign(&mut self, other: Self) { + *self = *self & other; + } + } + + impl std::ops::BitXor for Int4 { + type Output = Int4; + fn bitxor(self, other: Self) -> Int4 { + Int4 { v: [ self.v[0] ^ other.v[0], self.v[1] ^ other.v[1], self.v[2] ^ other.v[2], self.v[3] ^ other.v[3], ], - }; + } + } + } + + impl std::ops::BitXorAssign for Int4 { + fn bitxor_assign(&mut self, other: Self) { + *self = *self ^ other; + } + } + + impl std::ops::Shl<i32>
for Int4 { + type Output = Int4; + + #[inline(always)] + fn shl(self, other: i32) -> Int4 { + Int4 { + v: [ + self.v[0] << other, + self.v[1] << other, + self.v[2] << other, + self.v[3] << other, + ], + } + } + } + + impl std::ops::Shr<i32> for Int4 { + type Output = Int4; + + #[inline(always)] + fn shr(self, other: i32) -> Int4 { + Int4 { + v: [ + self.v[0] >> other, + self.v[1] >> other, + self.v[2] >> other, + self.v[3] >> other, + ], + } } }