I wasn't using the bit-shifting instructions correctly. SSE instructions are very non-obvious...
444 lines
12 KiB
Rust
444 lines
12 KiB
Rust
//--------------------------------------------------------------------------
|
|
// x86/64 SSE
|
|
#[cfg(target_arch = "x86_64")]
// #[cfg(all(target_arch = "x86_64", target_feature = "sse4.1"))]
pub(crate) mod sse {
    use core::arch::x86_64::{
        __m128i, _mm_add_epi32, _mm_and_si128, _mm_cvtepi32_ps, _mm_mul_ps, _mm_or_si128,
        _mm_set1_epi32, _mm_set1_ps, _mm_set_epi32, _mm_setzero_si128, _mm_sll_epi32,
        _mm_slli_epi32, _mm_srl_epi32, _mm_srli_epi32, _mm_xor_si128,
    };

    /// Four `u32` lanes stored in a single 128-bit SSE register.
    ///
    /// Lane `i` holds element `i` of the `[u32; 4]` the value was built from.
    /// All arithmetic is lane-wise and wraps on overflow.
    #[derive(Debug, Copy, Clone)]
    pub(crate) struct Int4 {
        v: __m128i,
    }

    impl Int4 {
        /// Returns a value with all four lanes set to zero.
        #[inline(always)]
        pub fn zero() -> Int4 {
            Int4 {
                // SAFETY: SSE2 is part of the x86-64 baseline, so the
                // intrinsic is always available on this target.
                v: unsafe { _mm_setzero_si128() },
            }
        }

        /// Returns lane `i` as a `u32`.
        ///
        /// # Panics
        ///
        /// Panics if `i >= 4`.
        pub fn get(self, i: usize) -> u32 {
            // SAFETY: `__m128i` and `[u32; 4]` are both 16 bytes and every bit
            // pattern is valid for both. The register field is transmuted
            // (rather than the wrapper struct) so this does not depend on the
            // layout of `Int4` itself.
            let lanes: [u32; 4] = unsafe { std::mem::transmute(self.v) };
            lanes[i]
        }

        /// Converts the full range of a 32 bit integer to a float in [0, 1).
        ///
        /// Only the top 24 bits of each lane are used.  A 24-bit integer is
        /// exactly representable in an `f32`, so the conversion never rounds
        /// up and the result can never reach `1.0`; the largest value returned
        /// is `1.0 - 2^-24`.
        #[inline(always)]
        pub fn to_norm_floats(self) -> [f32; 4] {
            const ONE_OVER_24BITS: f32 = 1.0 / (1u32 << 24) as f32;

            // SAFETY: only baseline SSE/SSE2 intrinsics are used.  After the
            // logical shift every lane is in `0..2^24`, so interpreting it as
            // a signed integer for the int -> float conversion is lossless.
            let n4 = unsafe {
                _mm_mul_ps(
                    _mm_cvtepi32_ps(_mm_srli_epi32(self.v, 8)),
                    _mm_set1_ps(ONE_OVER_24BITS),
                )
            };

            // SAFETY: `__m128` and `[f32; 4]` have the same size and every bit
            // pattern is a valid `f32`.
            unsafe { std::mem::transmute(n4) }
        }

        /// Reverses the order of the bits within each 32-bit lane.
        ///
        /// Implemented as the classic swap network: exchange the 16-bit
        /// halves, then adjacent bytes, nibbles, bit pairs and single bits.
        #[inline]
        pub fn reverse_bits(self) -> Int4 {
            let mut n = self.v;

            // SAFETY: only baseline SSE2 intrinsics are used.
            unsafe {
                // Swap the two 16-bit halves.  No mask is needed because the
                // shifts already discard the bits that fall off each end.
                let hi = _mm_slli_epi32(n, 16);
                let lo = _mm_srli_epi32(n, 16);
                n = _mm_or_si128(hi, lo);

                // Swap adjacent bytes.
                let hi = _mm_and_si128(
                    _mm_slli_epi32(n, 8),
                    _mm_set1_epi32(0xff00ff00u32 as i32),
                );
                let lo = _mm_and_si128(
                    _mm_srli_epi32(n, 8),
                    _mm_set1_epi32(0x00ff00ffu32 as i32),
                );
                n = _mm_or_si128(hi, lo);

                // Swap adjacent nibbles.
                let hi = _mm_and_si128(
                    _mm_slli_epi32(n, 4),
                    _mm_set1_epi32(0xf0f0f0f0u32 as i32),
                );
                let lo = _mm_and_si128(
                    _mm_srli_epi32(n, 4),
                    _mm_set1_epi32(0x0f0f0f0fu32 as i32),
                );
                n = _mm_or_si128(hi, lo);

                // Swap adjacent bit pairs.
                let hi = _mm_and_si128(
                    _mm_slli_epi32(n, 2),
                    _mm_set1_epi32(0xccccccccu32 as i32),
                );
                let lo = _mm_and_si128(
                    _mm_srli_epi32(n, 2),
                    _mm_set1_epi32(0x33333333u32 as i32),
                );
                n = _mm_or_si128(hi, lo);

                // Swap adjacent bits.
                let hi = _mm_and_si128(
                    _mm_slli_epi32(n, 1),
                    _mm_set1_epi32(0xaaaaaaaau32 as i32),
                );
                let lo = _mm_and_si128(
                    _mm_srli_epi32(n, 1),
                    _mm_set1_epi32(0x55555555u32 as i32),
                );
                n = _mm_or_si128(hi, lo);
            }

            Int4 { v: n }
        }
    }

    /// Lane-wise wrapping multiplication (keeps the low 32 bits of each product).
    impl std::ops::Mul for Int4 {
        type Output = Int4;

        #[inline(always)]
        fn mul(self, other: Self) -> Int4 {
            // This only works with SSE 4.1 support.
            // SAFETY: the block is only compiled when SSE 4.1 is statically
            // enabled for the build.
            #[cfg(target_feature = "sse4.1")]
            unsafe {
                use core::arch::x86_64::_mm_mullo_epi32;
                Int4 {
                    v: _mm_mullo_epi32(self.v, other.v),
                }
            }

            // This works on all x86-64 chips.
            // SAFETY: only baseline SSE2 intrinsics are used.
            #[cfg(not(target_feature = "sse4.1"))]
            unsafe {
                use core::arch::x86_64::{_mm_mul_epu32, _mm_shuffle_epi32};

                // `_mm_mul_epu32` multiplies lanes 0 and 2 only, producing two
                // 64-bit products.  This mask keeps the low 32 bits of each.
                let low_halves = _mm_set_epi32(0, 0xffffffffu32 as i32, 0, 0xffffffffu32 as i32);

                // Products of lanes 0 and 2, already in their final positions.
                let even = _mm_and_si128(_mm_mul_epu32(self.v, other.v), low_halves);

                // Move lanes 1 and 3 into positions 0 and 2, multiply, and
                // mask the same way.
                let odd = _mm_and_si128(
                    _mm_mul_epu32(
                        _mm_shuffle_epi32(self.v, 0b11_11_01_01),
                        _mm_shuffle_epi32(other.v, 0b11_11_01_01),
                    ),
                    low_halves,
                );

                // Shuffle the odd products back to lanes 1 and 3 (lanes 0 and
                // 2 receive the zeroed slots) and merge with the even ones.
                Int4 {
                    v: _mm_or_si128(even, _mm_shuffle_epi32(odd, 0b10_11_00_01)),
                }
            }
        }
    }

    impl std::ops::MulAssign for Int4 {
        #[inline(always)]
        fn mul_assign(&mut self, other: Self) {
            *self = *self * other;
        }
    }

    /// Lane-wise wrapping addition.
    impl std::ops::AddAssign for Int4 {
        #[inline(always)]
        fn add_assign(&mut self, other: Self) {
            *self = Int4 {
                // SAFETY: baseline SSE2 intrinsic.
                v: unsafe { _mm_add_epi32(self.v, other.v) },
            };
        }
    }

    impl std::ops::BitAnd for Int4 {
        type Output = Int4;

        #[inline(always)]
        fn bitand(self, other: Self) -> Int4 {
            Int4 {
                // SAFETY: baseline SSE2 intrinsic.
                v: unsafe { _mm_and_si128(self.v, other.v) },
            }
        }
    }

    impl std::ops::BitAndAssign for Int4 {
        #[inline(always)]
        fn bitand_assign(&mut self, other: Self) {
            *self = *self & other;
        }
    }

    impl std::ops::BitOr for Int4 {
        type Output = Int4;

        #[inline(always)]
        fn bitor(self, other: Self) -> Int4 {
            Int4 {
                // SAFETY: baseline SSE2 intrinsic.
                v: unsafe { _mm_or_si128(self.v, other.v) },
            }
        }
    }

    impl std::ops::BitOrAssign for Int4 {
        #[inline(always)]
        fn bitor_assign(&mut self, other: Self) {
            *self = *self | other;
        }
    }

    impl std::ops::BitXor for Int4 {
        type Output = Int4;

        #[inline(always)]
        fn bitxor(self, other: Self) -> Int4 {
            Int4 {
                // SAFETY: baseline SSE2 intrinsic.
                v: unsafe { _mm_xor_si128(self.v, other.v) },
            }
        }
    }

    impl std::ops::BitXorAssign for Int4 {
        #[inline(always)]
        fn bitxor_assign(&mut self, other: Self) {
            *self = *self ^ other;
        }
    }

    /// Shifts every lane left by the same number of bits, filling with zeros.
    ///
    /// Counts outside `0..=31` (including negative ones, which the hardware
    /// sees as very large unsigned counts) produce all-zero lanes.
    impl std::ops::Shl<i32> for Int4 {
        type Output = Int4;

        #[inline(always)]
        fn shl(self, other: i32) -> Int4 {
            Int4 {
                // The variable-count shift reads its count from the low 64
                // bits of the second operand, so the count goes in lane 0 and
                // lane 1 must be zero.
                // SAFETY: baseline SSE2 intrinsics.
                v: unsafe { _mm_sll_epi32(self.v, _mm_set_epi32(0, 0, 0, other)) },
            }
        }
    }

    /// Logically shifts every lane right by the same number of bits, filling
    /// with zeros.
    ///
    /// Counts outside `0..=31` (including negative ones, which the hardware
    /// sees as very large unsigned counts) produce all-zero lanes.
    impl std::ops::Shr<i32> for Int4 {
        type Output = Int4;

        #[inline(always)]
        fn shr(self, other: i32) -> Int4 {
            Int4 {
                // See `Shl`: the count lives in the low 64 bits of the operand.
                // SAFETY: baseline SSE2 intrinsics.
                v: unsafe { _mm_srl_epi32(self.v, _mm_set_epi32(0, 0, 0, other)) },
            }
        }
    }

    impl From<[u32; 4]> for Int4 {
        #[inline(always)]
        fn from(v: [u32; 4]) -> Self {
            Int4 {
                // SAFETY: `[u32; 4]` and `__m128i` are both 16 bytes and every
                // bit pattern is valid for both.
                v: unsafe { std::mem::transmute(v) },
            }
        }
    }

    #[cfg(test)]
    mod tests {
        use super::*;

        #[test]
        fn from_array() {
            let a = Int4::from([1, 2, 3, 4]);
            assert_eq!(a.get(0), 1);
            assert_eq!(a.get(1), 2);
            assert_eq!(a.get(2), 3);
            assert_eq!(a.get(3), 4);
        }

        #[test]
        fn shr() {
            let a = Int4::from([0xffffffff; 4]) >> 16;
            assert_eq!(a.get(0), 0x0000ffff);
            assert_eq!(a.get(1), 0x0000ffff);
            assert_eq!(a.get(2), 0x0000ffff);
            assert_eq!(a.get(3), 0x0000ffff);
        }

        #[test]
        fn shl() {
            let a = Int4::from([0xffffffff; 4]) << 16;
            assert_eq!(a.get(0), 0xffff0000);
            assert_eq!(a.get(1), 0xffff0000);
            assert_eq!(a.get(2), 0xffff0000);
            assert_eq!(a.get(3), 0xffff0000);
        }

        #[test]
        fn mul_wraps_per_lane() {
            let lhs = [3u32, 2, 0x8000_0000, 0xdead_beef];
            let rhs = [5u32, 0xffff_ffff, 2, 0x1234_5678];
            let product = Int4::from(lhs) * Int4::from(rhs);
            for i in 0..4 {
                assert_eq!(product.get(i), lhs[i].wrapping_mul(rhs[i]));
            }
        }

        #[test]
        fn reverse_bits_matches_scalar() {
            let input = [1u32, 0x8000_0000, 0x0000_ffff, 0xdead_beef];
            let reversed = Int4::from(input).reverse_bits();
            for i in 0..4 {
                assert_eq!(reversed.get(i), input[i].reverse_bits());
            }
        }

        #[test]
        fn to_norm_floats_stays_below_one() {
            let f = Int4::from([0xffff_ffff, 0xffff_ff80, 0x8000_0000, 0]).to_norm_floats();
            assert!(f.iter().all(|&x| x >= 0.0 && x < 1.0));
            assert_eq!(f[2], 0.5);
            assert_eq!(f[3], 0.0);
        }
    }
}
|
|
#[cfg(target_arch = "x86_64")]
|
|
pub(crate) use sse::Int4;
|
|
|
|
//--------------------------------------------------------------------------
|
|
// Fallback
|
|
#[cfg(not(target_arch = "x86_64"))]
|
|
// #[cfg(not(all(target_arch = "x86_64", target_feature = "sse4.1")))]
|
|
pub(crate) mod fallback {
    /// Four `u32` lanes, implemented with plain scalar code.
    ///
    /// This is the portable counterpart of the SSE-backed `Int4` and is meant
    /// to produce the same results.  All arithmetic is lane-wise and wraps on
    /// overflow.
    #[derive(Debug, Copy, Clone)]
    #[repr(align(16))]
    pub(crate) struct Int4 {
        v: [u32; 4],
    }

    /// Shifts `x` left by `count` bits, returning 0 when `count` is outside
    /// `0..=31`.
    ///
    /// A negative `count` becomes a huge unsigned value and is therefore also
    /// out of range, which mirrors what the SSE shift instruction does.
    #[inline(always)]
    fn shl_lane(x: u32, count: i32) -> u32 {
        x.checked_shl(count as u32).unwrap_or(0)
    }

    /// Logically shifts `x` right by `count` bits, returning 0 when `count` is
    /// outside `0..=31` (see `shl_lane`).
    #[inline(always)]
    fn shr_lane(x: u32, count: i32) -> u32 {
        x.checked_shr(count as u32).unwrap_or(0)
    }

    impl Int4 {
        /// Returns a value with all four lanes set to zero.
        pub fn zero() -> Int4 {
            Int4 { v: [0; 4] }
        }

        /// Converts the full range of a 32 bit integer to a float in [0, 1).
        ///
        /// Only the top 24 bits of each lane are used.  A 24-bit integer is
        /// exactly representable in an `f32`, so the conversion never rounds
        /// up and the result can never reach `1.0`; the largest value returned
        /// is `1.0 - 2^-24`.
        pub fn to_norm_floats(self) -> [f32; 4] {
            const ONE_OVER_24BITS: f32 = 1.0 / (1u32 << 24) as f32;
            [
                (self.v[0] >> 8) as f32 * ONE_OVER_24BITS,
                (self.v[1] >> 8) as f32 * ONE_OVER_24BITS,
                (self.v[2] >> 8) as f32 * ONE_OVER_24BITS,
                (self.v[3] >> 8) as f32 * ONE_OVER_24BITS,
            ]
        }

        /// Reverses the order of the bits within each 32-bit lane.
        pub fn reverse_bits(self) -> Int4 {
            Int4 {
                v: [
                    self.v[0].reverse_bits(),
                    self.v[1].reverse_bits(),
                    self.v[2].reverse_bits(),
                    self.v[3].reverse_bits(),
                ],
            }
        }
    }

    /// Lane-wise wrapping multiplication.
    impl std::ops::Mul for Int4 {
        type Output = Int4;

        fn mul(self, other: Self) -> Int4 {
            Int4 {
                v: [
                    self.v[0].wrapping_mul(other.v[0]),
                    self.v[1].wrapping_mul(other.v[1]),
                    self.v[2].wrapping_mul(other.v[2]),
                    self.v[3].wrapping_mul(other.v[3]),
                ],
            }
        }
    }

    impl std::ops::MulAssign for Int4 {
        fn mul_assign(&mut self, other: Self) {
            *self = *self * other;
        }
    }

    /// Lane-wise wrapping addition.
    impl std::ops::AddAssign for Int4 {
        fn add_assign(&mut self, other: Self) {
            *self = Int4 {
                v: [
                    self.v[0].wrapping_add(other.v[0]),
                    self.v[1].wrapping_add(other.v[1]),
                    self.v[2].wrapping_add(other.v[2]),
                    self.v[3].wrapping_add(other.v[3]),
                ],
            };
        }
    }

    impl std::ops::BitAnd for Int4 {
        type Output = Int4;

        fn bitand(self, other: Self) -> Int4 {
            Int4 {
                v: [
                    self.v[0] & other.v[0],
                    self.v[1] & other.v[1],
                    self.v[2] & other.v[2],
                    self.v[3] & other.v[3],
                ],
            }
        }
    }

    impl std::ops::BitAndAssign for Int4 {
        fn bitand_assign(&mut self, other: Self) {
            *self = *self & other;
        }
    }

    impl std::ops::BitOr for Int4 {
        type Output = Int4;

        fn bitor(self, other: Self) -> Int4 {
            Int4 {
                v: [
                    self.v[0] | other.v[0],
                    self.v[1] | other.v[1],
                    self.v[2] | other.v[2],
                    self.v[3] | other.v[3],
                ],
            }
        }
    }

    impl std::ops::BitOrAssign for Int4 {
        fn bitor_assign(&mut self, other: Self) {
            *self = *self | other;
        }
    }

    impl std::ops::BitXor for Int4 {
        type Output = Int4;

        fn bitxor(self, other: Self) -> Int4 {
            Int4 {
                v: [
                    self.v[0] ^ other.v[0],
                    self.v[1] ^ other.v[1],
                    self.v[2] ^ other.v[2],
                    self.v[3] ^ other.v[3],
                ],
            }
        }
    }

    impl std::ops::BitXorAssign for Int4 {
        fn bitxor_assign(&mut self, other: Self) {
            *self = *self ^ other;
        }
    }

    /// Shifts every lane left by the same number of bits, filling with zeros.
    ///
    /// Counts outside `0..=31` (including negative ones) produce all-zero
    /// lanes instead of panicking in debug builds or wrapping the count in
    /// release builds, so the result matches the SSE implementation.
    impl std::ops::Shl<i32> for Int4 {
        type Output = Int4;

        #[inline(always)]
        fn shl(self, other: i32) -> Int4 {
            Int4 {
                v: [
                    shl_lane(self.v[0], other),
                    shl_lane(self.v[1], other),
                    shl_lane(self.v[2], other),
                    shl_lane(self.v[3], other),
                ],
            }
        }
    }

    /// Logically shifts every lane right by the same number of bits, filling
    /// with zeros.
    ///
    /// Counts outside `0..=31` (including negative ones) produce all-zero
    /// lanes, matching the SSE implementation.
    impl std::ops::Shr<i32> for Int4 {
        type Output = Int4;

        #[inline(always)]
        fn shr(self, other: i32) -> Int4 {
            Int4 {
                v: [
                    shr_lane(self.v[0], other),
                    shr_lane(self.v[1], other),
                    shr_lane(self.v[2], other),
                    shr_lane(self.v[3], other),
                ],
            }
        }
    }

    impl From<[u32; 4]> for Int4 {
        fn from(v: [u32; 4]) -> Self {
            Int4 { v }
        }
    }

    #[cfg(test)]
    mod tests {
        use super::*;

        #[test]
        fn shifts() {
            let a = Int4::from([0xffffffff; 4]);
            assert_eq!((a >> 16).v, [0x0000ffff; 4]);
            assert_eq!((a << 16).v, [0xffff0000; 4]);
            assert_eq!((a << 32).v, [0; 4]);
            assert_eq!((a >> 32).v, [0; 4]);
            assert_eq!((a << -1).v, [0; 4]);
        }

        #[test]
        fn to_norm_floats_stays_below_one() {
            let f = Int4::from([0xffff_ffff, 0xffff_ff80, 0x8000_0000, 0]).to_norm_floats();
            assert!(f.iter().all(|&x| x >= 0.0 && x < 1.0));
            assert_eq!(f[2], 0.5);
            assert_eq!(f[3], 0.0);
        }
    }
}
|
|
#[cfg(not(target_arch = "x86_64"))]
|
|
pub(crate) use fallback::Int4;
|