Sped up Float4::h_sum for platforms with SSE3.
Since this is used heavily during matrix multiplication, this gives a nice little speed boost.
This commit is contained in:
parent
27d1b2286b
commit
8e791259b3
|
@ -31,7 +31,34 @@ mod x86_64_sse {
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
pub fn h_sum(&self) -> f32 {
|
pub fn h_sum(&self) -> f32 {
|
||||||
(self.get_0() + self.get_1()) + (self.get_2() + self.get_3())
|
#[cfg(target_feature = "sse3")]
|
||||||
|
{
|
||||||
|
use std::arch::x86_64::{
|
||||||
|
_mm_add_ps, _mm_add_ss, _mm_cvtss_f32, _mm_movehdup_ps, _mm_movehl_ps,
|
||||||
|
};
|
||||||
|
unsafe {
|
||||||
|
let v = self.data;
|
||||||
|
let shuf = _mm_movehdup_ps(v);
|
||||||
|
let sums = _mm_add_ps(v, shuf);
|
||||||
|
let shuf = _mm_movehl_ps(shuf, sums);
|
||||||
|
let sums = _mm_add_ss(sums, shuf);
|
||||||
|
_mm_cvtss_f32(sums)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#[cfg(not(target_feature = "sse3"))]
|
||||||
|
{
|
||||||
|
use std::arch::x86_64::{
|
||||||
|
_mm_add_ps, _mm_add_ss, _mm_cvtss_f32, _mm_movehl_ps, _mm_shuffle_ps,
|
||||||
|
};
|
||||||
|
unsafe {
|
||||||
|
let v = self.data;
|
||||||
|
let shuf = _mm_shuffle_ps(v, v, (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0));
|
||||||
|
let sums = _mm_add_ps(v, shuf);
|
||||||
|
let shuf = _mm_movehl_ps(shuf, sums);
|
||||||
|
let sums = _mm_add_ss(sums, shuf);
|
||||||
|
_mm_cvtss_f32(sums)
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
|
|
Loading…
Reference in New Issue
Block a user