use std::ops::{AddAssign, DivAssign, MulAssign, SubAssign};

use approx::relative_eq;

use crate::{difference_of_products, two_prod, two_sum};

pub use fallback::Float4;
mod fallback {
    use std::ops::{Add, Div, Mul, Neg, Sub};

    use crate::FMulAdd;

    /// Four `f32` lanes stored in a plain array.
    ///
    /// `repr(C, align(16))` gives the struct a C layout and 16-byte
    /// alignment.  The lanes are referred to as `a`, `b`, `c` and `d`, in
    /// that order.
    #[derive(Debug, Copy, Clone)]
    #[repr(C, align(16))]
    pub struct Float4 {
        n: [f32; 4],
    }

    impl Float4 {
        /// Builds a new value by combining each lane of `self` with the
        /// matching lane of `rhs` through `op`.
        #[inline(always)]
        fn zip_lanes(self, rhs: Self, op: impl Fn(f32, f32) -> f32) -> Self {
            let [l0, l1, l2, l3] = self.n;
            let [r0, r1, r2, r3] = rhs.n;
            Self {
                n: [op(l0, r0), op(l1, r1), op(l2, r2), op(l3, r3)],
            }
        }

        /// Builds a new value by passing each lane of `self` through `op`.
        #[inline(always)]
        fn map_lanes(self, op: impl Fn(f32) -> f32) -> Self {
            let [l0, l1, l2, l3] = self.n;
            Self {
                n: [op(l0), op(l1), op(l2), op(l3)],
            }
        }

        /// Create a new `Float4` from its four components, in lane order.
        #[inline(always)]
        pub fn new(a: f32, b: f32, c: f32, d: f32) -> Self {
            let n = [a, b, c, d];
            Self { n }
        }

        /// Create a new `Float4` whose four components all equal `n`.
        #[inline(always)]
        pub fn splat(n: f32) -> Self {
            Self { n: [n; 4] }
        }

        /// Component-wise fused multiply-add.
        ///
        /// Each lane is computed as `(self * a) + b` through
        /// `f32::mul_add`, i.e. with only one rounding error.
        #[inline(always)]
        pub fn mul_add(self, a: Self, b: Self) -> Self {
            let [s0, s1, s2, s3] = self.n;
            let [a0, a1, a2, a3] = a.n;
            let [b0, b1, b2, b3] = b.n;
            Self {
                n: [
                    s0.mul_add(a0, b0),
                    s1.mul_add(a1, b1),
                    s2.mul_add(a2, b2),
                    s3.mul_add(a3, b3),
                ],
            }
        }

        /// Vertical minimum: `f32::min` applied lane by lane.
        #[inline(always)]
        pub fn min(self, a: Self) -> Self {
            self.zip_lanes(a, f32::min)
        }

        /// Vertical maximum: `f32::max` applied lane by lane.
        #[inline(always)]
        pub fn max(self, a: Self) -> Self {
            self.zip_lanes(a, f32::max)
        }

        // The horizontal reductions below are currently disabled.
        //
        // /// Horizontal minimum.
        // #[inline(always)]
        // pub fn hmin(self) -> f32 {
        //     let a = self.n[0].min(self.n[1]);
        //     let b = self.n[2].min(self.n[3]);
        //     a.min(b)
        // }

        // /// Horizontal maximum.
        // #[inline(always)]
        // pub fn hmax(self) -> f32 {
        //     let a = self.n[0].max(self.n[1]);
        //     let b = self.n[2].max(self.n[3]);
        //     a.max(b)
        // }

        //-----------------------------------------------------
        // Individual components.

        /// Returns the first component.
        #[inline(always)]
        pub fn a(self) -> f32 {
            let [a, _, _, _] = self.n;
            a
        }

        /// Returns the second component.
        #[inline(always)]
        pub fn b(self) -> f32 {
            let [_, b, _, _] = self.n;
            b
        }

        /// Returns the third component.
        #[inline(always)]
        pub fn c(self) -> f32 {
            let [_, _, c, _] = self.n;
            c
        }

        /// Returns the fourth component.
        #[inline(always)]
        pub fn d(self) -> f32 {
            let [_, _, _, d] = self.n;
            d
        }

        /// Returns a copy of `self` with the first component replaced by `n`.
        #[inline(always)]
        #[must_use]
        pub fn set_a(self, n: f32) -> Self {
            let mut lanes = self.n;
            lanes[0] = n;
            Self { n: lanes }
        }

        /// Returns a copy of `self` with the second component replaced by `n`.
        #[inline(always)]
        #[must_use]
        pub fn set_b(self, n: f32) -> Self {
            let mut lanes = self.n;
            lanes[1] = n;
            Self { n: lanes }
        }

        /// Returns a copy of `self` with the third component replaced by `n`.
        #[inline(always)]
        #[must_use]
        pub fn set_c(self, n: f32) -> Self {
            let mut lanes = self.n;
            lanes[2] = n;
            Self { n: lanes }
        }

        /// Returns a copy of `self` with the fourth component replaced by `n`.
        #[inline(always)]
        #[must_use]
        pub fn set_d(self, n: f32) -> Self {
            let mut lanes = self.n;
            lanes[3] = n;
            Self { n: lanes }
        }

        //-----------------------------------------------------
        // Shuffles.
        //
        // Each shuffle is named after the source lane that ends up in each
        // output lane, e.g. `bcad` produces `[b, c, a, d]`.

        /// Broadcasts the first component to all four lanes.
        #[inline(always)]
        pub fn aaaa(self) -> Self {
            Self::splat(self.n[0])
        }

        /// Broadcasts the second component to all four lanes.
        #[inline(always)]
        pub fn bbbb(self) -> Self {
            Self::splat(self.n[1])
        }

        /// Broadcasts the third component to all four lanes.
        #[inline(always)]
        pub fn cccc(self) -> Self {
            Self::splat(self.n[2])
        }

        /// Broadcasts the fourth component to all four lanes.
        #[inline(always)]
        pub fn dddd(self) -> Self {
            Self::splat(self.n[3])
        }

        /// Rotates the first three components to `[b, c, a]`; `d` stays put.
        #[inline(always)]
        pub fn bcad(self) -> Self {
            let [a, b, c, d] = self.n;
            Self { n: [b, c, a, d] }
        }

        /// Rotates the first three components to `[c, a, b]`; `d` stays put.
        #[inline(always)]
        pub fn cabd(self) -> Self {
            let [a, b, c, d] = self.n;
            Self { n: [c, a, b, d] }
        }
    }

    /// Component-wise addition.
    impl Add for Float4 {
        type Output = Self;

        #[inline(always)]
        fn add(self, rhs: Self) -> Self {
            self.zip_lanes(rhs, |l, r| l + r)
        }
    }

    /// Component-wise subtraction.
    impl Sub for Float4 {
        type Output = Self;

        #[inline(always)]
        fn sub(self, rhs: Self) -> Self {
            self.zip_lanes(rhs, |l, r| l - r)
        }
    }

    /// Component-wise multiplication.
    impl Mul for Float4 {
        type Output = Self;

        #[inline(always)]
        fn mul(self, rhs: Self) -> Self {
            self.zip_lanes(rhs, |l, r| l * r)
        }
    }

    /// Multiplication of every component by the same scalar.
    impl Mul<f32> for Float4 {
        type Output = Self;

        #[inline(always)]
        fn mul(self, rhs: f32) -> Self {
            self.map_lanes(|l| l * rhs)
        }
    }

    /// Component-wise division.
    impl Div for Float4 {
        type Output = Self;

        #[inline(always)]
        fn div(self, rhs: Self) -> Self {
            self.zip_lanes(rhs, |l, r| l / r)
        }
    }

    /// Division of every component by the same scalar.
    impl Div<f32> for Float4 {
        type Output = Self;

        #[inline(always)]
        fn div(self, rhs: f32) -> Self {
            self.map_lanes(|l| l / rhs)
        }
    }

    /// Component-wise negation.
    impl Neg for Float4 {
        type Output = Self;

        #[inline(always)]
        fn neg(self) -> Self {
            self.map_lanes(|l| -l)
        }
    }

    /// `fma` forwards to the inherent `Float4::mul_add`.
    impl FMulAdd for Float4 {
        fn fma(self, b: Self, c: Self) -> Self {
            Self::mul_add(self, b, c)
        }
    }
}

//-------------------------------------------------------------
// Float4 impls that don't depend on its inner representation.

impl Float4 {
    /// 3D dot product (only uses the first 3 components).
    ///
    /// The products come from `two_prod` and the sums from `two_sum`, each
    /// of which also yields an error term.  Those error terms are
    /// accumulated separately and added back into the final result.
    #[inline(always)]
    pub fn dot_3(a: Self, b: Self) -> f32 {
        let (p, p_err) = two_prod(a, b);

        // Products.
        let (x, x_err) = (p.a(), p_err.a());
        let (y, y_err) = (p.b(), p_err.b());
        let (z, z_err) = (p.c(), p_err.c());

        // Sums.
        let (s1, s1_err) = two_sum(x, y);
        let err1 = x_err + (y_err + s1_err);

        let (s2, s2_err) = two_sum(s1, z);
        let err2 = z_err + (err1 + s2_err);

        // Final result with rounding error compensation.
        s2 + err2
    }

    /// 3D dot product (only uses the first 3 components).
    ///
    /// Faster but less precise version: a plain multiply followed by two
    /// plain additions.
    #[inline(always)]
    pub fn dot_3_fast(a: Self, b: Self) -> f32 {
        let c = a * b;
        c.a() + c.b() + c.c()
    }

    /// Transposes the 3x3 matrix whose rows are the first three components
    /// of `m[0]`, `m[1]` and `m[2]`.
    #[inline(always)]
    pub fn transpose_3x3(m: [Self; 3]) -> [Self; 3] {
        [
            // The fourth component in each row below is arbitrary,
            // but in this case chosen so that it matches the
            // behavior of the SSE version of transpose_3x3.
            Self::new(m[0].a(), m[1].a(), m[2].a(), m[2].d()),
            Self::new(m[0].b(), m[1].b(), m[2].b(), m[2].d()),
            Self::new(m[0].c(), m[1].c(), m[2].c(), m[2].d()),
        ]
    }

    /// Invert a 3x3 matrix.
    ///
    /// The rows of the matrix are the first three components of `m[0]`,
    /// `m[1]` and `m[2]`.
    ///
    /// Returns `None` if not invertible (i.e. the computed determinant is
    /// exactly zero).
    #[inline]
    pub fn invert_3x3(m: [Self; 3]) -> Option<[Self; 3]> {
        let m0_bca = m[0].bcad();
        let m1_bca = m[1].bcad();
        let m2_bca = m[2].bcad();
        let m0_cab = m[0].cabd();
        let m1_cab = m[1].cabd();
        let m2_cab = m[2].cabd();

        // Pairwise cross products of the rows: row1 x row2, row2 x row0 and
        // row0 x row1.  After the transpose below these become the columns
        // of the (unscaled) inverse.
        let abc = difference_of_products(m1_bca, m2_cab, m1_cab, m2_bca);
        let def = difference_of_products(m2_bca, m0_cab, m2_cab, m0_bca);
        let ghi = difference_of_products(m0_bca, m1_cab, m0_cab, m1_bca);

        // Determinant, expanded along the first column of the matrix.
        let det = Self::dot_3(
            Self::new(abc.a(), def.a(), ghi.a(), 0.0),
            Self::new(m[0].a(), m[1].a(), m[2].a(), 0.0),
        );

        if det == 0.0 {
            None
        } else {
            Some(Self::transpose_3x3([abc / det, def / det, ghi / det]))
        }
    }

    /// Invert a 3x3 matrix.  Faster but less precise version.
    ///
    /// Same algorithm as `invert_3x3`, but built from plain multiplies and
    /// subtractions and `dot_3_fast`.
    ///
    /// Returns `None` if not invertible (i.e. the computed determinant is
    /// exactly zero).
    #[inline]
    pub fn invert_3x3_fast(m: [Self; 3]) -> Option<[Self; 3]> {
        let m0_bca = m[0].bcad();
        let m1_bca = m[1].bcad();
        let m2_bca = m[2].bcad();
        let m0_cab = m[0].cabd();
        let m1_cab = m[1].cabd();
        let m2_cab = m[2].cabd();

        // Pairwise cross products of the rows.
        let abc = (m1_bca * m2_cab) - (m1_cab * m2_bca);
        let def = (m2_bca * m0_cab) - (m2_cab * m0_bca);
        let ghi = (m0_bca * m1_cab) - (m0_cab * m1_bca);

        // Determinant, expanded along the first column of the matrix.
        let det = Self::dot_3_fast(
            Self::new(abc.a(), def.a(), ghi.a(), 0.0),
            Self::new(m[0].a(), m[1].a(), m[2].a(), 0.0),
        );

        if det == 0.0 {
            None
        } else {
            Some(Self::transpose_3x3([abc / det, def / det, ghi / det]))
        }
    }

    /// Multiplies a 3D vector with a 3x3 matrix.
    ///
    /// Computes `self.a * m[0] + self.b * m[1] + self.c * m[2]`, carrying
    /// the error terms reported by `two_prod`/`two_sum` alongside and
    /// adding them back at the end.
    #[inline]
    pub fn vec_mul_3x3(self, m: &[Self; 3]) -> Self {
        let x = self.aaaa();
        let y = self.bbbb();
        let z = self.cccc();

        // Products.
        let (a, a_err) = two_prod(x, m[0]);
        let (b, b_err) = two_prod(y, m[1]);
        let (c, c_err) = two_prod(z, m[2]);

        // Sums.
        let (s1, s1_err) = two_sum(a, b);
        let err1 = a_err + (b_err + s1_err);

        let (s2, s2_err) = two_sum(c, s1);
        let err2 = c_err + (err1 + s2_err);

        s2 + err2
    }

    /// Multiplies a 3D vector with a 3x3 matrix.
    ///
    /// Faster but less precise version.
    #[inline]
    pub fn vec_mul_3x3_fast(self, m: &[Self; 3]) -> Self {
        let x = self.aaaa();
        let y = self.bbbb();
        let z = self.cccc();

        (x * m[0]) + (y * m[1]) + (z * m[2])
    }

    /// Transforms a 3d point by an affine transform.
    ///
    /// `m` is the 3x3 part of the affine transform, `t` is the translation part.
    ///
    /// The 3x3 part is applied first and the translation is added
    /// afterwards, with the same error tracking as `vec_mul_3x3`.
    #[inline]
    pub fn vec_mul_affine(self, m: &[Self; 3], t: Self) -> Self {
        let x = self.aaaa();
        let y = self.bbbb();
        let z = self.cccc();

        // Products.
        let (a, a_err) = two_prod(x, m[0]);
        let (b, b_err) = two_prod(y, m[1]);
        let (c, c_err) = two_prod(z, m[2]);

        // Sums.
        let (s1, s1_err) = two_sum(a, b);
        let err1 = a_err + (b_err + s1_err);

        let (s2, s2_err) = two_sum(c, s1);
        let err2 = c_err + (err1 + s2_err);

        // Translation.
        let (s3, s3_err) = two_sum(t, s2);
        let err3 = err2 + s3_err;

        s3 + err3
    }

    /// Transforms a 3d point by an affine transform.
    ///
    /// Faster but less precise version.
    #[inline]
    pub fn vec_mul_affine_fast(self, m: &[Self; 3], t: Self) -> Self {
        let x = self.aaaa();
        let y = self.bbbb();
        let z = self.cccc();

        (x * m[0]) + (y * m[1]) + (z * m[2]) + t
    }

    /// Transforms a 3d point by an affine transform, except it applies
    /// the translation part before the 3x3 part.
    ///
    /// This is primarily useful for performing efficient inverse transforms by
    /// passing an inverted 3x3 part and a negated translation part.
    ///
    /// `m` is the 3x3 part of the affine transform, `t` is the translation part.
    ///
    /// The result is `(self + t)` multiplied by `m`; the translation is
    /// applied exactly once, matching `vec_mul_affine_rev_fast`.
    #[inline]
    pub fn vec_mul_affine_rev(self, m: &[Self; 3], t: Self) -> Self {
        // Translation, keeping the error term of the addition.
        let (v, v_err) = two_sum(self, t);

        let (x, x_err) = (v.aaaa(), v_err.aaaa());
        let (y, y_err) = (v.bbbb(), v_err.bbbb());
        let (z, z_err) = (v.cccc(), v_err.cccc());

        // Products.  The error of the translated point is pushed through
        // the matrix as well, so it ends up in the error accumulator.
        let ((a, a_err1), a_err2) = (two_prod(x, m[0]), x_err * m[0]);
        let ((b, b_err1), b_err2) = (two_prod(y, m[1]), y_err * m[1]);
        let ((c, c_err1), c_err2) = (two_prod(z, m[2]), z_err * m[2]);
        let a_err = a_err1 + a_err2;
        let b_err = b_err1 + b_err2;
        let c_err = c_err1 + c_err2;

        // Sums.
        let (s1, s1_err) = two_sum(a, b);
        let err1 = a_err + (b_err + s1_err);

        let (s2, s2_err) = two_sum(c, s1);
        let err2 = c_err + (err1 + s2_err);

        // `t` has already been applied before the matrix multiply, so it
        // must not be added a second time here.
        s2 + err2
    }

    /// Transforms a 3d point by an affine transform, except it applies
    /// the translation part before the 3x3 part.
    ///
    /// Faster but less precise version.
    #[inline]
    pub fn vec_mul_affine_rev_fast(self, m: &[Self; 3], t: Self) -> Self {
        let v = self + t;

        let x = v.aaaa();
        let y = v.bbbb();
        let z = v.cccc();

        (x * m[0]) + (y * m[1]) + (z * m[2])
    }

    /// Returns whether the `Float4`s are approximately equal to each
    /// other.
    ///
    /// All four pairs of corresponding elements must compare equal under
    /// `relative_eq!` with its `epsilon` argument set to `epsilon`.
    ///
    /// NOTE(review): in the `approx` crate, `epsilon` appears to be the
    /// absolute tolerance and `max_relative` the relative one.  If a
    /// relative error bound of `epsilon` is what is intended here, confirm
    /// whether `max_relative = epsilon` should be passed as well.
    pub(crate) fn aprx_eq(a: Self, b: Self, epsilon: f32) -> bool {
        relative_eq!(a.a(), b.a(), epsilon = epsilon)
            && relative_eq!(a.b(), b.b(), epsilon = epsilon)
            && relative_eq!(a.c(), b.c(), epsilon = epsilon)
            && relative_eq!(a.d(), b.d(), epsilon = epsilon)
    }
}

/// `a += b` stores the result of `Float4`'s `+` operator back into `a`.
impl AddAssign for Float4 {
    #[inline(always)]
    fn add_assign(&mut self, rhs: Self) {
        let lhs = *self;
        *self = lhs + rhs;
    }
}

/// `a -= b` stores the result of `Float4`'s `-` operator back into `a`.
impl SubAssign for Float4 {
    #[inline(always)]
    fn sub_assign(&mut self, rhs: Self) {
        let lhs = *self;
        *self = lhs - rhs;
    }
}

/// `a *= b` stores the result of `Float4 * Float4` back into `a`.
impl MulAssign for Float4 {
    #[inline(always)]
    fn mul_assign(&mut self, rhs: Self) {
        let lhs = *self;
        *self = lhs * rhs;
    }
}

/// `a *= s` stores the result of `Float4 * f32` back into `a`.
impl MulAssign<f32> for Float4 {
    #[inline(always)]
    fn mul_assign(&mut self, rhs: f32) {
        let lhs = *self;
        *self = lhs * rhs;
    }
}

/// `a /= b` stores the result of `Float4 / Float4` back into `a`.
impl DivAssign for Float4 {
    #[inline(always)]
    fn div_assign(&mut self, rhs: Self) {
        let lhs = *self;
        *self = lhs / rhs;
    }
}

/// `a /= s` stores the result of `Float4 / f32` back into `a`.
impl DivAssign<f32> for Float4 {
    #[inline(always)]
    fn div_assign(&mut self, rhs: f32) {
        let lhs = *self;
        *self = lhs / rhs;
    }
}