//--------------------------------------------------------------------------
// x86/64 SSE
#[cfg(target_arch = "x86_64")]
pub(crate) mod sse {
    //! Four-lane `u32` vector backed by a single 128-bit SSE register.
    //!
    //! Apart from the optional fast path inside `MulAssign`, only SSE/SSE2
    //! intrinsics are used.  Those are part of the x86-64 baseline, so gating
    //! this module on `target_arch = "x86_64"` alone is sufficient; SSE4.1 is
    //! picked up at compile time when it happens to be enabled.

    use core::arch::x86_64::{
        __m128i, _mm_add_epi32, _mm_and_si128, _mm_cvtepi32_ps, _mm_mul_ps, _mm_or_si128,
        _mm_set1_epi32, _mm_set1_ps, _mm_setzero_si128, _mm_slli_epi32, _mm_srli_epi32,
        _mm_xor_si128,
    };

    /// Four independent 32-bit unsigned integers processed lane-wise.
    ///
    /// Lane `i` corresponds to element `i` of the `[u32; 4]` the value was
    /// built from.
    #[derive(Debug, Copy, Clone)]
    pub(crate) struct Int4 {
        v: __m128i,
    }

    impl Int4 {
        /// Returns a vector with all four lanes set to zero.
        #[inline(always)]
        pub fn zero() -> Int4 {
            Int4 {
                // SAFETY: SSE2 is always available on x86-64.
                v: unsafe { _mm_setzero_si128() },
            }
        }

        /// Converts the full range of a 32 bit integer to a float in [0, 1).
        ///
        /// Only the top 24 bits of each lane contribute to the result.  That
        /// is exactly the precision of an `f32` mantissa, so both the
        /// int-to-float conversion and the scaling are exact and the largest
        /// possible output is `1 - 2^-24`.  (Converting more bits would let
        /// the conversion round up, producing `1.0` for the largest inputs.)
        #[inline(always)]
        pub fn to_norm_floats(self) -> [f32; 4] {
            const ONE_OVER_24BITS: f32 = 1.0 / (1u32 << 24) as f32;

            // SAFETY: SSE/SSE2 are always available on x86-64.
            let n4 = unsafe {
                _mm_mul_ps(
                    // After the shift every lane is < 2^24, so it is
                    // non-negative as an i32 and exactly representable.
                    _mm_cvtepi32_ps(_mm_srli_epi32(self.v, 8)),
                    _mm_set1_ps(ONE_OVER_24BITS),
                )
            };

            // SAFETY: `__m128` and `[f32; 4]` have the same size and every
            // bit pattern is a valid `f32`.
            unsafe { std::mem::transmute(n4) }
        }

        /// Reverses the order of the bits within each 32-bit lane.
        ///
        /// Done as five swap stages of decreasing width: 16-bit halves,
        /// bytes, nibbles, bit pairs and finally single bits.
        #[inline]
        pub fn reverse_bits(self) -> Int4 {
            // `as i32` only reinterprets the bit pattern for the intrinsic.
            const HI_BYTES: i32 = 0xff00ff00u32 as i32;
            const LO_BYTES: i32 = 0x00ff00ffu32 as i32;
            const HI_NIBBLES: i32 = 0xf0f0f0f0u32 as i32;
            const LO_NIBBLES: i32 = 0x0f0f0f0fu32 as i32;
            const HI_PAIRS: i32 = 0xccccccccu32 as i32;
            const LO_PAIRS: i32 = 0x33333333u32 as i32;
            const HI_BITS: i32 = 0xaaaaaaaau32 as i32;
            const LO_BITS: i32 = 0x55555555u32 as i32;

            // SAFETY: SSE2 is always available on x86-64.
            unsafe {
                // 16-bit halves: the shifts already discard the other half,
                // so no masking is needed.
                let mut n = _mm_or_si128(
                    _mm_slli_epi32(self.v, 16),
                    _mm_srli_epi32(self.v, 16),
                );

                // Bytes within each half.
                n = _mm_or_si128(
                    _mm_and_si128(_mm_slli_epi32(n, 8), _mm_set1_epi32(HI_BYTES)),
                    _mm_and_si128(_mm_srli_epi32(n, 8), _mm_set1_epi32(LO_BYTES)),
                );

                // Nibbles within each byte.
                n = _mm_or_si128(
                    _mm_and_si128(_mm_slli_epi32(n, 4), _mm_set1_epi32(HI_NIBBLES)),
                    _mm_and_si128(_mm_srli_epi32(n, 4), _mm_set1_epi32(LO_NIBBLES)),
                );

                // Bit pairs within each nibble.
                n = _mm_or_si128(
                    _mm_and_si128(_mm_slli_epi32(n, 2), _mm_set1_epi32(HI_PAIRS)),
                    _mm_and_si128(_mm_srli_epi32(n, 2), _mm_set1_epi32(LO_PAIRS)),
                );

                // Single bits within each pair.
                n = _mm_or_si128(
                    _mm_and_si128(_mm_slli_epi32(n, 1), _mm_set1_epi32(HI_BITS)),
                    _mm_and_si128(_mm_srli_epi32(n, 1), _mm_set1_epi32(LO_BITS)),
                );

                Int4 { v: n }
            }
        }

        /// Logically shifts every lane right by 16 bits.
        #[inline(always)]
        pub(crate) fn shr16(self) -> Int4 {
            Int4 {
                // SAFETY: SSE2 is always available on x86-64.
                v: unsafe { _mm_srli_epi32(self.v, 16) },
            }
        }
    }

    /// Lane-wise wrapping multiplication (low 32 bits of each product).
    impl std::ops::MulAssign for Int4 {
        #[inline(always)]
        fn mul_assign(&mut self, other: Self) {
            // This only works with SSE 4.1 support.
            // SAFETY: this branch is only compiled when SSE4.1 is enabled.
            #[cfg(target_feature = "sse4.1")]
            unsafe {
                use core::arch::x86_64::_mm_mullo_epi32;
                *self = Int4 {
                    v: _mm_mullo_epi32(self.v, other.v),
                };
            }

            // This works on all x86-64 chips.
            // SAFETY: only SSE2 intrinsics are used, always available on x86-64.
            #[cfg(not(target_feature = "sse4.1"))]
            unsafe {
                use core::arch::x86_64::{_mm_mul_epu32, _mm_set_epi32, _mm_shuffle_epi32};

                // Keeps lanes 0 and 2, i.e. the low 32 bits of each 64-bit
                // product produced by `_mm_mul_epu32`.
                let low_halves = _mm_set_epi32(0, 0xffffffffu32 as i32, 0, 0xffffffffu32 as i32);

                // Products of lanes 0 and 2.
                let a = _mm_and_si128(_mm_mul_epu32(self.v, other.v), low_halves);

                // Move lanes 1 and 3 into positions 0 and 2, then multiply
                // them the same way.
                let b = _mm_and_si128(
                    _mm_mul_epu32(
                        _mm_shuffle_epi32(self.v, 0b11_11_01_01),
                        _mm_shuffle_epi32(other.v, 0b11_11_01_01),
                    ),
                    low_halves,
                );

                // Shuffle the second set of products back to lanes 1 and 3
                // (zeros land in lanes 0 and 2) and merge.
                *self = Int4 {
                    v: _mm_or_si128(a, _mm_shuffle_epi32(b, 0b10_11_00_01)),
                };
            }
        }
    }

    /// Lane-wise wrapping addition.
    impl std::ops::AddAssign for Int4 {
        #[inline(always)]
        fn add_assign(&mut self, other: Self) {
            *self = Int4 {
                // SAFETY: SSE2 is always available on x86-64.
                v: unsafe { _mm_add_epi32(self.v, other.v) },
            };
        }
    }

    /// Lane-wise bitwise exclusive or.
    impl std::ops::BitXorAssign for Int4 {
        #[inline(always)]
        fn bitxor_assign(&mut self, other: Self) {
            *self = Int4 {
                // SAFETY: SSE2 is always available on x86-64.
                v: unsafe { _mm_xor_si128(self.v, other.v) },
            };
        }
    }

    /// Builds a vector whose lane `i` holds `v[i]`.
    impl From<[u32; 4]> for Int4 {
        #[inline(always)]
        fn from(v: [u32; 4]) -> Self {
            Int4 {
                // SAFETY: `[u32; 4]` and `__m128i` are both 16 bytes and
                // every bit pattern is valid for `__m128i`.
                v: unsafe { std::mem::transmute(v) },
            }
        }
    }
}
#[cfg(target_arch = "x86_64")]
pub(crate) use sse::Int4;

//--------------------------------------------------------------------------
// Fallback
#[cfg(not(target_arch = "x86_64"))]
// #[cfg(not(all(target_arch = "x86_64", target_feature = "sse4.1")))]
pub(crate) mod fallback {
    //! Portable scalar implementation of the four-lane `u32` vector.
    //!
    //! Every operation is applied independently to each of the four lanes.

    /// Four independent 32-bit unsigned integers processed lane-wise.
    #[derive(Debug, Copy, Clone)]
    #[repr(align(16))]
    pub(crate) struct Int4 {
        v: [u32; 4],
    }

    impl Int4 {
        /// Returns a vector with all four lanes set to zero.
        #[inline]
        pub fn zero() -> Int4 {
            Int4 { v: [0; 4] }
        }

        /// Converts the full range of a 32 bit integer to a float in [0, 1).
        ///
        /// Only the top 24 bits of each lane contribute to the result.  That
        /// is exactly the precision of an `f32` mantissa, so both the
        /// int-to-float conversion and the scaling are exact and the largest
        /// possible output is `1 - 2^-24`.  (Converting all 32 bits would let
        /// the conversion round up, producing `1.0` for the largest inputs.)
        #[inline]
        pub fn to_norm_floats(self) -> [f32; 4] {
            const ONE_OVER_24BITS: f32 = 1.0 / (1u32 << 24) as f32;
            let norm = |n: u32| (n >> 8) as f32 * ONE_OVER_24BITS;
            [
                norm(self.v[0]),
                norm(self.v[1]),
                norm(self.v[2]),
                norm(self.v[3]),
            ]
        }

        /// Reverses the order of the bits within each 32-bit lane.
        #[inline]
        pub fn reverse_bits(self) -> Int4 {
            Int4 {
                v: [
                    self.v[0].reverse_bits(),
                    self.v[1].reverse_bits(),
                    self.v[2].reverse_bits(),
                    self.v[3].reverse_bits(),
                ],
            }
        }

        /// Logically shifts every lane right by 16 bits.
        #[inline]
        pub(crate) fn shr16(self) -> Int4 {
            Int4 {
                v: [
                    self.v[0] >> 16,
                    self.v[1] >> 16,
                    self.v[2] >> 16,
                    self.v[3] >> 16,
                ],
            }
        }
    }

    /// Lane-wise wrapping multiplication (low 32 bits of each product).
    impl std::ops::MulAssign for Int4 {
        #[inline]
        fn mul_assign(&mut self, other: Self) {
            *self = Int4 {
                v: [
                    self.v[0].wrapping_mul(other.v[0]),
                    self.v[1].wrapping_mul(other.v[1]),
                    self.v[2].wrapping_mul(other.v[2]),
                    self.v[3].wrapping_mul(other.v[3]),
                ],
            };
        }
    }

    /// Lane-wise wrapping addition.
    impl std::ops::AddAssign for Int4 {
        #[inline]
        fn add_assign(&mut self, other: Self) {
            *self = Int4 {
                v: [
                    self.v[0].wrapping_add(other.v[0]),
                    self.v[1].wrapping_add(other.v[1]),
                    self.v[2].wrapping_add(other.v[2]),
                    self.v[3].wrapping_add(other.v[3]),
                ],
            };
        }
    }

    /// Lane-wise bitwise exclusive or.
    impl std::ops::BitXorAssign for Int4 {
        #[inline]
        fn bitxor_assign(&mut self, other: Self) {
            *self = Int4 {
                v: [
                    self.v[0] ^ other.v[0],
                    self.v[1] ^ other.v[1],
                    self.v[2] ^ other.v[2],
                    self.v[3] ^ other.v[3],
                ],
            };
        }
    }

    /// Builds a vector whose lane `i` holds `v[i]`.
    impl From<[u32; 4]> for Int4 {
        #[inline]
        fn from(v: [u32; 4]) -> Self {
            Int4 { v }
        }
    }
}
#[cfg(not(target_arch = "x86_64"))]
pub(crate) use fallback::Int4;