From 7066c3818961d3278e8f065d89946d080856a589 Mon Sep 17 00:00:00 2001
From: Nathan Vegdahl <cessen@cessen.com>
Date: Thu, 10 Sep 2020 22:36:20 +0900
Subject: [PATCH 01/25] Implement an experimental packed HDR RGB 32-bit storage
 format.

---
 sub_crates/trifloat/benches/bench.rs |  32 ++-
 sub_crates/trifloat/src/lib.rs       |   1 +
 sub_crates/trifloat/src/rgb32.rs     | 290 +++++++++++++++++++++++++++
 3 files changed, 319 insertions(+), 4 deletions(-)
 create mode 100644 sub_crates/trifloat/src/rgb32.rs
diff --git a/sub_crates/trifloat/benches/bench.rs b/sub_crates/trifloat/benches/bench.rs
index 727661a..5b4d0cd 100644
--- a/sub_crates/trifloat/benches/bench.rs
+++ b/sub_crates/trifloat/benches/bench.rs
@@ -1,15 +1,15 @@
 use bencher::{benchmark_group, benchmark_main, black_box, Bencher};
 use rand::{rngs::SmallRng, FromEntropy, Rng};
-use trifloat::{signed48, unsigned32};
+use trifloat::{rgb32, signed48, unsigned32};
 
 //----
 
 fn unsigned32_encode_100_values(bench: &mut Bencher) {
     let mut rng = SmallRng::from_entropy();
     bench.iter(|| {
-        let x = rng.gen::<f32>() - 0.5;
-        let y = rng.gen::<f32>() - 0.5;
-        let z = rng.gen::<f32>() - 0.5;
+        let x = rng.gen::<f32>();
+        let y = rng.gen::<f32>();
+        let z = rng.gen::<f32>();
         for _ in 0..100 {
             black_box(unsigned32::encode(black_box((x, y, z))));
         }
@@ -48,6 +48,28 @@ fn signed48_decode_100_values(bench: &mut Bencher) {
     });
 }
 
+fn rgb32_encode_100_values(bench: &mut Bencher) {
+    let mut rng = SmallRng::from_entropy();
+    bench.iter(|| {
+        let y = rng.gen::<f32>();
+        let x = rng.gen::<f32>();
+        let z = rng.gen::<f32>();
+        for _ in 0..100 {
+            black_box(rgb32::encode(black_box((x, y, z))));
+        }
+    });
+}
+
+fn rgb32_decode_100_values(bench: &mut Bencher) {
+    let mut rng = SmallRng::from_entropy();
+    bench.iter(|| {
+        let v = rng.gen::<u32>();
+        for _ in 0..100 {
+            black_box(rgb32::decode(black_box(v)));
+        }
+    });
+}
+
 //----
 
 benchmark_group!(
@@ -56,5 +78,7 @@ benchmark_group!(
     unsigned32_decode_100_values,
     signed48_encode_100_values,
     signed48_decode_100_values,
+    rgb32_encode_100_values,
+    rgb32_decode_100_values,
 );
 benchmark_main!(benches);
diff --git a/sub_crates/trifloat/src/lib.rs b/sub_crates/trifloat/src/lib.rs
index eaefb5f..dafef1e 100644
--- a/sub_crates/trifloat/src/lib.rs
+++ b/sub_crates/trifloat/src/lib.rs
@@ -4,6 +4,7 @@
 //! The motivating use-case for this is compactly storing HDR RGB colors.  But
 //! it may be useful for other things as well.
 
+pub mod rgb32;
 pub mod signed48;
 pub mod unsigned32;
 
diff --git a/sub_crates/trifloat/src/rgb32.rs b/sub_crates/trifloat/src/rgb32.rs
new file mode 100644
index 0000000..4a79158
--- /dev/null
+++ b/sub_crates/trifloat/src/rgb32.rs
@@ -0,0 +1,290 @@
+//! Encoding/decoding for specialized HDR RGB 32-bit storage format.
+//!
+//! The motivation for this format is to separate out the luma of
+//! the color from its chromaticity, in the same spirit as most
+//! image and video compression approaches, and then allocate more
+//! data to the luma component since that's what the human eye is
+//! most sensitive to.
+//!
+//! This encoding first transforms into YCoCg colorspace, and then
+//! fiddles the resulting Y, Co, and Cg components into a special
+//! 32-bit format.  The Y component is stored as an unsigned float,
+//! with 6 bits of exponent and 10 bits of mantissa.  The Co and Cg
+//! components are stored as 8-bit integers.
+//!
+//! The layout is:
+//!
+//! 1. Y-exponent: 6 bits
+//! 2. Y-mantissa: 10 bits
+//! 3. Co: 8 bits
+//! 4. Cg: 8 bits
+//!
+//! The Y component follows the convention of a mantissa with an
+//! implicit leading one, giving it 11 bits of precision.  The
+//! exponent has a bias of 24.
+
+/// Encodes three floating point RGB values into a packed 32-bit format.
+///
+/// Warning: negative values and NaN's are _not_ supported.  There are
+/// debug-only assertions in place to catch such values in the input
+/// floats.
+#[inline]
+pub fn encode(floats: (f32, f32, f32)) -> u32 {
+    debug_assert!(
+        floats.0 >= 0.0
+            && floats.1 >= 0.0
+            && floats.2 >= 0.0
+            && !floats.0.is_nan()
+            && !floats.1.is_nan()
+            && !floats.2.is_nan(),
+        "trifloat::rgb32::encode(): encoding to unsigned tri-floats only \
+         works correctly for positive, non-NaN numbers, but the numbers passed \
+         were: ({}, {}, {})",
+        floats.0,
+        floats.1,
+        floats.2
+    );
+
+    // Convert to YCoCg colorspace.
+    let y = (floats.0 * 0.25) + (floats.1 * 0.5) + (floats.2 * 0.25);
+    let co = (floats.0 * 0.5) + (floats.2 * -0.5);
+    let cg = (floats.0 * -0.25) + (floats.1 * 0.5) + (floats.2 * -0.25);
+
+    if y <= 0.0 {
+        // Corner case: black.
+        return 0;
+    } else if y.is_infinite() {
+        // Corner case: infinite white.
+        return 0xffff7f7f;
+    }
+
+    // Encode Co and Cg as 8-bit integers.
+    // Note that the max values for each of these will get clamped
+    // very slightly, but that represents extremely saturated
+    // colors, where the human eye is not very sensitive to chroma
+    // differences anyway.  And the trade-off is that we can
+    // represent 0.0 (completely unsaturated, no chroma) exactly.
+    let inv_y = 1.0 / y;
+    let co_8bit = ((co * inv_y * 63.5) + 127.5).min(255.0).max(0.0) as u8;
+    let cg_8bit = ((cg * inv_y * 127.0) + 127.5).min(255.0).max(0.0) as u8;
+
+    // Bit-fiddle to get the float components of Y.
+    // This assumes we're working with a standard 32-bit IEEE float.
+    let y_ieee_bits = y.to_bits();
+    let y_mantissa = (y_ieee_bits >> 13) & 0b11_1111_1111;
+    let y_exp = ((y_ieee_bits >> 23) & 0b1111_1111) as i32 - 127;
+
+    // Pack values into a u32 and return.
+    if y_exp <= -24 {
+        // Corner-case:
+        // Luma is so dark that it will be zero at our precision,
+        // and hence black.
+        0
+    } else if y_exp >= 40 {
+        dbg!();
+        // Corner-case:
+        // Luma is so bright that it exceeds our max value, so saturate
+        // the luma.
+        0xffff0000 | ((co_8bit as u32) << 8) | cg_8bit as u32
+    } else {
+        // Common case.
+        let exp = (y_exp + 24) as u32;
+        (exp << 26) | (y_mantissa << 16) | ((co_8bit as u32) << 8) | cg_8bit as u32
+    }
+}
+
+/// Decodes a packed HDR RGB 32-bit format into three full
+/// floating point RGB numbers.
+///
+/// This operation is lossless and cannot fail.
+#[inline]
+pub fn decode(packed_rgb: u32) -> (f32, f32, f32) {
+    // Reconstruct Y, Co, and Cg from the packed bits.
+    let y = {
+        let exp = (packed_rgb & 0xfc00_0000) >> 26;
+        if exp == 0 {
+            0.0
+        } else {
+            f32::from_bits(((exp + 103) << 23) | ((packed_rgb & 0x03ff_0000) >> 3))
+        }
+    };
+    let co = {
+        let co_8bit = (packed_rgb >> 8) & 0xff;
+        ((co_8bit as f32) - 127.0) * (1.0 / 63.5) * y
+    };
+    let cg = {
+        let cg_8bit = packed_rgb & 0xff;
+        ((cg_8bit as f32) - 127.0) * (1.0 / 127.0) * y
+    };
+
+    // Convert back to RGB.
+    let tmp = y - cg;
+    let r = (tmp + co).max(0.0);
+    let g = (y + cg).max(0.0);
+    let b = (tmp - co).max(0.0);
+
+    (r, g, b)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn round_trip(floats: (f32, f32, f32)) -> (f32, f32, f32) {
+        decode(encode(floats))
+    }
+
+    #[test]
+    fn all_zeros() {
+        let fs = (0.0f32, 0.0f32, 0.0f32);
+
+        let tri = encode(fs);
+        let fs2 = decode(tri);
+
+        assert_eq!(tri, 0u32);
+        assert_eq!(fs, fs2);
+    }
+
+    #[test]
+    fn powers_of_two() {
+        let mut n = 1.0f32 / 65536.0;
+        for _ in 0..48 {
+            let fs = (n, n, n);
+
+            assert_eq!(fs, round_trip(fs));
+            n *= 2.0;
+        }
+    }
+
+    #[test]
+    fn integers() {
+        let mut n = 1.0f32;
+        for _ in 0..2048 {
+            let fs = (n, n, n);
+
+            assert_eq!(fs, round_trip(fs));
+            n += 1.0;
+        }
+    }
+
+    #[test]
+    fn full_saturation() {
+        let fs1 = (1.0, 0.0, 0.0);
+        let fs2 = (0.0, 1.0, 0.0);
+        let fs3 = (0.0, 0.0, 1.0);
+
+        assert_eq!(fs1, round_trip(fs1));
+        assert_eq!(fs2, round_trip(fs2));
+        assert_eq!(fs3, round_trip(fs3));
+    }
+
+    #[test]
+    fn saturate() {
+        let fs = (10000000000000.0, 10000000000000.0, 10000000000000.0);
+
+        assert_eq!(
+            (1098974760000.0, 1098974760000.0, 1098974760000.0),
+            round_trip(fs)
+        );
+    }
+
+    #[test]
+    fn inf_saturate() {
+        use std::f32::INFINITY;
+        let fs = (INFINITY, INFINITY, INFINITY);
+
+        assert_eq!(
+            (1098974760000.0, 1098974760000.0, 1098974760000.0),
+            round_trip(fs)
+        );
+    }
+
+    #[test]
+    fn partial_saturate() {
+        let fs1 = (10000000000000.0, 0.0, 0.0);
+        let fs2 = (0.0, 10000000000000.0, 0.0);
+        let fs3 = (0.0, 0.0, 10000000000000.0);
+
+        assert_eq!(round_trip(fs1), (4395899000000.0, 0.0, 0.0));
+        assert_eq!(round_trip(fs2), (0.0, 2197949500000.0, 0.0));
+        assert_eq!(round_trip(fs3), (0.0, 0.0, 4395899000000.0));
+    }
+
+    // #[test]
+    // fn accuracy() {
+    //     let mut n = 1.0;
+    //     for _ in 0..256 {
+    //         let (x, _, _) = round_trip((n, 0.0, 0.0));
+    //         assert_eq!(n, x);
+    //         n += 1.0 / 256.0;
+    //     }
+    // }
+
+    // #[test]
+    // fn rounding() {
+    //     let fs = (7.0f32, 513.0f32, 1.0f32);
+    //     assert_eq!(round_trip(fs), (8.0, 514.0, 2.0));
+    // }
+
+    // #[test]
+    // fn rounding_edge_case() {
+    //     let fs = (1023.0f32, 0.0f32, 0.0f32);
+
+    //     assert_eq!(round_trip(fs), (1024.0, 0.0, 0.0),);
+    // }
+
+    // #[test]
+    // fn smallest_value() {
+    //     let fs = (MIN, MIN * 0.5, MIN * 0.49);
+    //     assert_eq!(round_trip(fs), (MIN, MIN, 0.0));
+    //     assert_eq!(decode(0x00_80_40_00), (MIN, MIN, 0.0));
+    // }
+
+    // #[test]
+    // fn underflow() {
+    //     let fs = (MIN * 0.49, 0.0, 0.0);
+    //     assert_eq!(encode(fs), 0);
+    //     assert_eq!(round_trip(fs), (0.0, 0.0, 0.0));
+    // }
+
+    // #[test]
+    // #[should_panic]
+    // fn nans_01() {
+    //     encode((std::f32::NAN, 0.0, 0.0));
+    // }
+
+    // #[test]
+    // #[should_panic]
+    // fn nans_02() {
+    //     encode((0.0, std::f32::NAN, 0.0));
+    // }
+
+    // #[test]
+    // #[should_panic]
+    // fn nans_03() {
+    //     encode((0.0, 0.0, std::f32::NAN));
+    // }
+
+    // #[test]
+    // #[should_panic]
+    // fn negative_01() {
+    //     encode((-1.0, 0.0, 0.0));
+    // }
+
+    // #[test]
+    // #[should_panic]
+    // fn negative_02() {
+    //     encode((0.0, -1.0, 0.0));
+    // }
+
+    // #[test]
+    // #[should_panic]
+    // fn negative_03() {
+    //     encode((0.0, 0.0, -1.0));
+    // }
+
+    // #[test]
+    // fn negative_04() {
+    //     encode((-0.0, -0.0, -0.0));
+    // }
+}

From 339568ec0c2b551a776285d8129902c97c70c86e Mon Sep 17 00:00:00 2001
From: Nathan Vegdahl <cessen@cessen.com>
Date: Thu, 10 Sep 2020 22:39:17 +0900
Subject: [PATCH 02/25] Remove stale comments.

---
 sub_crates/trifloat/src/rgb32.rs | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/sub_crates/trifloat/src/rgb32.rs b/sub_crates/trifloat/src/rgb32.rs
index 4a79158..7fe1248 100644
--- a/sub_crates/trifloat/src/rgb32.rs
+++ b/sub_crates/trifloat/src/rgb32.rs
@@ -59,11 +59,6 @@ pub fn encode(floats: (f32, f32, f32)) -> u32 {
     }
 
     // Encode Co and Cg as 8-bit integers.
-    // Note that the max values for each of these will get clamped
-    // very slightly, but that represents extremely saturated
-    // colors, where the human eye is not very sensitive to chroma
-    // differences anyway.  And the trade-off is that we can
-    // represent 0.0 (completely unsaturated, no chroma) exactly.
     let inv_y = 1.0 / y;
     let co_8bit = ((co * inv_y * 63.5) + 127.5).min(255.0).max(0.0) as u8;
     let cg_8bit = ((cg * inv_y * 127.0) + 127.5).min(255.0).max(0.0) as u8;
@@ -95,8 +90,6 @@ pub fn encode(floats: (f32, f32, f32)) -> u32 {
 
 /// Decodes a packed HDR RGB 32-bit format into three full
 /// floating point RGB numbers.
-///
-/// This operation is lossless and cannot fail.
 #[inline]
 pub fn decode(packed_rgb: u32) -> (f32, f32, f32) {
     // Reconstruct Y, Co, and Cg from the packed bits.

From d6ab9d06be4742f10ea50c0db730d2db8e91c08c Mon Sep 17 00:00:00 2001
From: Nathan Vegdahl <cessen@cessen.com>
Date: Fri, 11 Sep 2020 21:57:43 +0900
Subject: [PATCH 03/25] More work on the packed HDR RGB 32-bit format.

Switched to a different chroma encoding, which is notably faster
and never produces negative values when decoded.
---
 sub_crates/trifloat/src/lib.rs   |   5 +
 sub_crates/trifloat/src/rgb32.rs | 280 +++++++++++++++++--------------
 2 files changed, 155 insertions(+), 130 deletions(-)

diff --git a/sub_crates/trifloat/src/lib.rs b/sub_crates/trifloat/src/lib.rs
index dafef1e..bfc7092 100644
--- a/sub_crates/trifloat/src/lib.rs
+++ b/sub_crates/trifloat/src/lib.rs
@@ -33,3 +33,8 @@ fn fiddle_log2(n: f32) -> i32 {
     use std::f32;
     ((f32::to_bits(n) >> 23) & 0b1111_1111) as i32 - 127
 }
+
+#[inline(always)]
+fn clamp_0_1(n: f32) -> f32 {
+    n.max(0.0).min(1.0)
+}
diff --git a/sub_crates/trifloat/src/rgb32.rs b/sub_crates/trifloat/src/rgb32.rs
index 7fe1248..f903c34 100644
--- a/sub_crates/trifloat/src/rgb32.rs
+++ b/sub_crates/trifloat/src/rgb32.rs
@@ -1,33 +1,55 @@
-//! Encoding/decoding for specialized HDR RGB 32-bit storage format.
+//! Encoding/decoding for a specialized HDR RGB 32-bit storage format.
 //!
 //! The motivation for this format is to separate out the luma of
 //! the color from its chromaticity, in the same spirit as most
 //! image and video compression approaches, and then allocate more
-//! data to the luma component since that's what the human eye is
-//! most sensitive to.
+//! bits to storing the luma component since that's what the human
+//! eye is most sensitive to.
 //!
-//! This encoding first transforms into YCoCg colorspace, and then
-//! fiddles the resulting Y, Co, and Cg components into a special
-//! 32-bit format.  The Y component is stored as an unsigned float,
-//! with 6 bits of exponent and 10 bits of mantissa.  The Co and Cg
-//! components are stored as 8-bit integers.
+//! This encoding first transforms the color into a Y (luma) component
+//! and two chroma components (green-magenta and red-blue), and then
+//! fiddles those components into a special 32-bit format.
+//! The Y component is stored as an unsigned float, with 6 bits of
+//! exponent and 10 bits of mantissa.  The two chroma components are
+//! each stored as 8-bit integers.
 //!
 //! The layout is:
 //!
 //! 1. Y-exponent: 6 bits
 //! 2. Y-mantissa: 10 bits
-//! 3. Co: 8 bits
-//! 4. Cg: 8 bits
+//! 3. Green-Magenta: 8 bits
+//! 4. Red-Blue: 8 bits
 //!
-//! The Y component follows the convention of a mantissa with an
-//! implicit leading one, giving it 11 bits of precision.  The
-//! exponent has a bias of 24.
+//! The Y-mantissa has an implicit leading one, giving 11 bits of
+//! precision.
+
+use crate::clamp_0_1;
+
+const EXP_BIAS: i32 = 23;
+
+/// The largest value this format can store.
+///
+/// More precisely, this is the largest value that can be *reliably*
+/// stored.
+///
+/// This can be exceeded by individual channels in limited cases due
+/// to the color transform used.  But values *at least* this large
+/// can be relied on.
+pub const MAX: f32 = ((1u64 << (63 - EXP_BIAS)) - (1 << (52 - EXP_BIAS))) as f32;
+
+/// The smallest non-zero value this format can store.
+///
+/// Note that since this is effectively a shared-exponent format,
+/// the numerical precision of all channels depends on the magnitude
+/// of the over-all RGB color.
+pub const MIN: f32 = 1.0 / (1 << (EXP_BIAS - 2)) as f32;
 
 /// Encodes three floating point RGB values into a packed 32-bit format.
 ///
 /// Warning: negative values and NaN's are _not_ supported.  There are
 /// debug-only assertions in place to catch such values in the input
-/// floats.
+/// floats.  Infinity in any channel will saturate the whole color to
+/// the brightest representable white.
 #[inline]
 pub fn encode(floats: (f32, f32, f32)) -> u32 {
     debug_assert!(
@@ -45,23 +67,15 @@ pub fn encode(floats: (f32, f32, f32)) -> u32 {
         floats.2
     );
 
-    // Convert to YCoCg colorspace.
-    let y = (floats.0 * 0.25) + (floats.1 * 0.5) + (floats.2 * 0.25);
-    let co = (floats.0 * 0.5) + (floats.2 * -0.5);
-    let cg = (floats.0 * -0.25) + (floats.1 * 0.5) + (floats.2 * -0.25);
-
-    if y <= 0.0 {
-        // Corner case: black.
-        return 0;
-    } else if y.is_infinite() {
-        // Corner case: infinite white.
-        return 0xffff7f7f;
-    }
-
-    // Encode Co and Cg as 8-bit integers.
-    let inv_y = 1.0 / y;
-    let co_8bit = ((co * inv_y * 63.5) + 127.5).min(255.0).max(0.0) as u8;
-    let cg_8bit = ((cg * inv_y * 127.0) + 127.5).min(255.0).max(0.0) as u8;
+    // Convert to Y/Green-Magenta/Red-Blue components.
+    let u = floats.0 + floats.2;
+    let y = (u * 0.5) + floats.1;
+    let green_magenta = clamp_0_1(floats.1 / y);
+    let red_blue = if u > 0.0 {
+        clamp_0_1(floats.0 / u)
+    } else {
+        0.5
+    };
 
     // Bit-fiddle to get the float components of Y.
     // This assumes we're working with a standard 32-bit IEEE float.
@@ -69,22 +83,31 @@ pub fn encode(floats: (f32, f32, f32)) -> u32 {
     let y_mantissa = (y_ieee_bits >> 13) & 0b11_1111_1111;
     let y_exp = ((y_ieee_bits >> 23) & 0b1111_1111) as i32 - 127;
 
+    // Encode Cg and Cr as 8-bit integers.
+    let gm_8bit = ((green_magenta * 254.0) + 0.5) as u8;
+    let rb_8bit = ((red_blue * 254.0) + 0.5) as u8;
+
     // Pack values into a u32 and return.
-    if y_exp <= -24 {
-        // Corner-case:
+    if y_exp <= (0 - EXP_BIAS) {
+        // Early-out corner-case:
         // Luma is so dark that it will be zero at our precision,
         // and hence black.
         0
-    } else if y_exp >= 40 {
-        dbg!();
+    } else if y_exp >= (63 - EXP_BIAS) {
         // Corner-case:
         // Luma is so bright that it exceeds our max value, so saturate
         // the luma.
-        0xffff0000 | ((co_8bit as u32) << 8) | cg_8bit as u32
+        if y.is_infinite() {
+            // If luma is infinity, our chroma values are bogus, so
+            // just go with white.
+            0xffff7f7f
+        } else {
+            0xffff0000 | ((gm_8bit as u32) << 8) | rb_8bit as u32
+        }
     } else {
         // Common case.
-        let exp = (y_exp + 24) as u32;
-        (exp << 26) | (y_mantissa << 16) | ((co_8bit as u32) << 8) | cg_8bit as u32
+        let exp = (y_exp + EXP_BIAS) as u32;
+        (exp << 26) | (y_mantissa << 16) | ((gm_8bit as u32) << 8) | rb_8bit as u32
     }
 }
 
@@ -92,29 +115,32 @@ pub fn encode(floats: (f32, f32, f32)) -> u32 {
 /// floating point RGB numbers.
 #[inline]
 pub fn decode(packed_rgb: u32) -> (f32, f32, f32) {
-    // Reconstruct Y, Co, and Cg from the packed bits.
+    // Pull out Y, Green-Magenta, and Red-Blue from the packed
+    // bits.
     let y = {
         let exp = (packed_rgb & 0xfc00_0000) >> 26;
         if exp == 0 {
             0.0
         } else {
-            f32::from_bits(((exp + 103) << 23) | ((packed_rgb & 0x03ff_0000) >> 3))
+            f32::from_bits(
+                ((exp + (127 - EXP_BIAS as u32)) << 23) | ((packed_rgb & 0x03ff_0000) >> 3),
+            )
         }
     };
-    let co = {
-        let co_8bit = (packed_rgb >> 8) & 0xff;
-        ((co_8bit as f32) - 127.0) * (1.0 / 63.5) * y
+    let green_magenta = {
+        let gm_8bit = (packed_rgb >> 8) & 0xff;
+        (gm_8bit as f32) * (1.0 / 254.0)
     };
-    let cg = {
-        let cg_8bit = packed_rgb & 0xff;
-        ((cg_8bit as f32) - 127.0) * (1.0 / 127.0) * y
+    let red_blue = {
+        let rb_8bit = packed_rgb & 0xff;
+        (rb_8bit as f32) * (1.0 / 254.0)
     };
 
     // Convert back to RGB.
-    let tmp = y - cg;
-    let r = (tmp + co).max(0.0);
-    let g = (y + cg).max(0.0);
-    let b = (tmp - co).max(0.0);
+    let g = y * green_magenta;
+    let u = (y - g) * 2.0;
+    let r = u * red_blue;
+    let b = u - r;
 
     (r, g, b)
 }
@@ -161,7 +187,7 @@ mod tests {
     }
 
     #[test]
-    fn full_saturation() {
+    fn color_saturation() {
         let fs1 = (1.0, 0.0, 0.0);
         let fs2 = (0.0, 1.0, 0.0);
         let fs3 = (0.0, 0.0, 1.0);
@@ -172,112 +198,106 @@ mod tests {
     }
 
     #[test]
-    fn saturate() {
+    fn num_saturate() {
         let fs = (10000000000000.0, 10000000000000.0, 10000000000000.0);
 
-        assert_eq!(
-            (1098974760000.0, 1098974760000.0, 1098974760000.0),
-            round_trip(fs)
-        );
+        assert_eq!((MAX, MAX, MAX), round_trip(fs));
     }
 
     #[test]
-    fn inf_saturate() {
+    fn num_inf_saturate() {
         use std::f32::INFINITY;
         let fs = (INFINITY, INFINITY, INFINITY);
 
-        assert_eq!(
-            (1098974760000.0, 1098974760000.0, 1098974760000.0),
-            round_trip(fs)
-        );
+        assert_eq!((MAX, MAX, MAX), round_trip(fs));
     }
 
     #[test]
-    fn partial_saturate() {
+    fn num_partial_saturate() {
         let fs1 = (10000000000000.0, 0.0, 0.0);
         let fs2 = (0.0, 10000000000000.0, 0.0);
         let fs3 = (0.0, 0.0, 10000000000000.0);
 
-        assert_eq!(round_trip(fs1), (4395899000000.0, 0.0, 0.0));
-        assert_eq!(round_trip(fs2), (0.0, 2197949500000.0, 0.0));
-        assert_eq!(round_trip(fs3), (0.0, 0.0, 4395899000000.0));
+        assert_eq!((MAX * 4.0, 0.0, 0.0), round_trip(fs1));
+        assert_eq!((0.0, MAX * 2.0, 0.0), round_trip(fs2));
+        assert_eq!((0.0, 0.0, MAX * 4.0), round_trip(fs3));
     }
 
-    // #[test]
-    // fn accuracy() {
-    //     let mut n = 1.0;
-    //     for _ in 0..256 {
-    //         let (x, _, _) = round_trip((n, 0.0, 0.0));
-    //         assert_eq!(n, x);
-    //         n += 1.0 / 256.0;
-    //     }
-    // }
+    #[test]
+    fn largest_value() {
+        let fs1 = (MAX, MAX, MAX);
+        let fs2 = (MAX, 0.0, 0.0);
+        let fs3 = (0.0, MAX, 0.0);
+        let fs4 = (0.0, 0.0, MAX);
 
-    // #[test]
-    // fn rounding() {
-    //     let fs = (7.0f32, 513.0f32, 1.0f32);
-    //     assert_eq!(round_trip(fs), (8.0, 514.0, 2.0));
-    // }
+        assert_eq!(fs1, round_trip(fs1));
+        assert_eq!(fs2, round_trip(fs2));
+        assert_eq!(fs3, round_trip(fs3));
+        assert_eq!(fs4, round_trip(fs4));
+    }
 
-    // #[test]
-    // fn rounding_edge_case() {
-    //     let fs = (1023.0f32, 0.0f32, 0.0f32);
+    #[test]
+    fn smallest_value() {
+        let fs1 = (MIN, MIN, MIN);
+        let fs2 = (MIN, 0.0, 0.0);
+        let fs3 = (0.0, MIN, 0.0);
+        let fs4 = (0.0, 0.0, MIN);
 
-    //     assert_eq!(round_trip(fs), (1024.0, 0.0, 0.0),);
-    // }
+        assert_eq!(fs1, round_trip(fs1));
+        assert_eq!(fs2, round_trip(fs2));
+        assert_eq!(fs3, round_trip(fs3));
+        assert_eq!(fs4, round_trip(fs4));
+    }
 
-    // #[test]
-    // fn smallest_value() {
-    //     let fs = (MIN, MIN * 0.5, MIN * 0.49);
-    //     assert_eq!(round_trip(fs), (MIN, MIN, 0.0));
-    //     assert_eq!(decode(0x00_80_40_00), (MIN, MIN, 0.0));
-    // }
+    #[test]
+    fn underflow() {
+        let fs1 = (MIN * 0.5, 0.0, 0.0);
+        let fs2 = (0.0, MIN * 0.25, 0.0);
+        let fs3 = (0.0, 0.0, MIN * 0.5);
 
-    // #[test]
-    // fn underflow() {
-    //     let fs = (MIN * 0.49, 0.0, 0.0);
-    //     assert_eq!(encode(fs), 0);
-    //     assert_eq!(round_trip(fs), (0.0, 0.0, 0.0));
-    // }
+        assert_eq!(round_trip(fs1), (0.0, 0.0, 0.0));
+        assert_eq!(round_trip(fs2), (0.0, 0.0, 0.0));
+        assert_eq!(round_trip(fs3), (0.0, 0.0, 0.0));
+    }
 
-    // #[test]
-    // #[should_panic]
-    // fn nans_01() {
-    //     encode((std::f32::NAN, 0.0, 0.0));
-    // }
+    #[test]
+    #[should_panic]
+    fn nans_01() {
+        encode((std::f32::NAN, 0.0, 0.0));
+    }
 
-    // #[test]
-    // #[should_panic]
-    // fn nans_02() {
-    //     encode((0.0, std::f32::NAN, 0.0));
-    // }
+    #[test]
+    #[should_panic]
+    fn nans_02() {
+        encode((0.0, std::f32::NAN, 0.0));
+    }
 
-    // #[test]
-    // #[should_panic]
-    // fn nans_03() {
-    //     encode((0.0, 0.0, std::f32::NAN));
-    // }
+    #[test]
+    #[should_panic]
+    fn nans_03() {
+        encode((0.0, 0.0, std::f32::NAN));
+    }
 
-    // #[test]
-    // #[should_panic]
-    // fn negative_01() {
-    //     encode((-1.0, 0.0, 0.0));
-    // }
+    #[test]
+    #[should_panic]
+    fn negative_01() {
+        encode((-1.0, 0.0, 0.0));
+    }
 
-    // #[test]
-    // #[should_panic]
-    // fn negative_02() {
-    //     encode((0.0, -1.0, 0.0));
-    // }
+    #[test]
+    #[should_panic]
+    fn negative_02() {
+        encode((0.0, -1.0, 0.0));
+    }
 
-    // #[test]
-    // #[should_panic]
-    // fn negative_03() {
-    //     encode((0.0, 0.0, -1.0));
-    // }
+    #[test]
+    #[should_panic]
+    fn negative_03() {
+        encode((0.0, 0.0, -1.0));
+    }
 
-    // #[test]
-    // fn negative_04() {
-    //     encode((-0.0, -0.0, -0.0));
-    // }
+    #[test]
+    fn negative_04() {
+        encode((-0.0, -0.0, -0.0));
+    }
 }

From bd6cf359b401d22e3aa757f2e138f31370860998 Mon Sep 17 00:00:00 2001
From: Nathan Vegdahl <cessen@cessen.com>
Date: Sun, 13 Sep 2020 06:42:58 +0900
Subject: [PATCH 04/25] Some code clean-up in the RGB32 encoding/decoding code.

---
 sub_crates/trifloat/src/lib.rs   |  5 ---
 sub_crates/trifloat/src/rgb32.rs | 62 ++++++++++++++++++--------------
 2 files changed, 36 insertions(+), 31 deletions(-)

diff --git a/sub_crates/trifloat/src/lib.rs b/sub_crates/trifloat/src/lib.rs
index bfc7092..dafef1e 100644
--- a/sub_crates/trifloat/src/lib.rs
+++ b/sub_crates/trifloat/src/lib.rs
@@ -33,8 +33,3 @@ fn fiddle_log2(n: f32) -> i32 {
     use std::f32;
     ((f32::to_bits(n) >> 23) & 0b1111_1111) as i32 - 127
 }
-
-#[inline(always)]
-fn clamp_0_1(n: f32) -> f32 {
-    n.max(0.0).min(1.0)
-}
diff --git a/sub_crates/trifloat/src/rgb32.rs b/sub_crates/trifloat/src/rgb32.rs
index f903c34..46b9f83 100644
--- a/sub_crates/trifloat/src/rgb32.rs
+++ b/sub_crates/trifloat/src/rgb32.rs
@@ -23,8 +23,6 @@
 //! The Y-mantissa has an implicit leading one, giving 11 bits of
 //! precision.
 
-use crate::clamp_0_1;
-
 const EXP_BIAS: i32 = 23;
 
 /// The largest value this format can store.
@@ -44,6 +42,30 @@ pub const MAX: f32 = ((1u64 << (63 - EXP_BIAS)) - (1 << (52 - EXP_BIAS))) as f32
 /// of the over-all RGB color.
 pub const MIN: f32 = 1.0 / (1 << (EXP_BIAS - 2)) as f32;
 
+/// The output c1 and c2 values should always be in the range [0, 1].
+#[inline(always)]
+fn rgb2ycc(rgb: (f32, f32, f32)) -> (f32, f32, f32) {
+    let rb = rgb.0 + rgb.2;
+    let y = (rb * 0.5) + rgb.1;
+    let c1 = rgb.1 / y;
+    let c2 = if rb > 0.0 { rgb.0 / rb } else { 0.5 };
+
+    (y, c1, c2)
+}
+
+/// The input c1 and c2 values should always be in the range [0, 1].
+#[inline(always)]
+fn ycc2rgb(ycc: (f32, f32, f32)) -> (f32, f32, f32) {
+    let (y, c1, c2) = ycc;
+
+    let g = y * c1;
+    let rb = (y - g) * 2.0;
+    let r = rb * c2;
+    let b = rb - r;
+
+    (r, g, b)
+}
+
 /// Encodes three floating point RGB values into a packed 32-bit format.
 ///
 /// Warning: negative values and NaN's are _not_ supported.  There are
@@ -68,14 +90,7 @@ pub fn encode(floats: (f32, f32, f32)) -> u32 {
     );
 
     // Convert to Y/Green-Magenta/Red-Blue components.
-    let u = floats.0 + floats.2;
-    let y = (u * 0.5) + floats.1;
-    let green_magenta = clamp_0_1(floats.1 / y);
-    let red_blue = if u > 0.0 {
-        clamp_0_1(floats.0 / u)
-    } else {
-        0.5
-    };
+    let (y, c1, c2) = rgb2ycc(floats);
 
     // Bit-fiddle to get the float components of Y.
     // This assumes we're working with a standard 32-bit IEEE float.
@@ -84,8 +99,8 @@ pub fn encode(floats: (f32, f32, f32)) -> u32 {
     let y_exp = ((y_ieee_bits >> 23) & 0b1111_1111) as i32 - 127;
 
     // Encode Cg and Cr as 8-bit integers.
-    let gm_8bit = ((green_magenta * 254.0) + 0.5) as u8;
-    let rb_8bit = ((red_blue * 254.0) + 0.5) as u8;
+    let c1_8bit = ((c1 * 254.0) + 0.5) as u8;
+    let c2_8bit = ((c2 * 254.0) + 0.5) as u8;
 
     // Pack values into a u32 and return.
     if y_exp <= (0 - EXP_BIAS) {
@@ -102,12 +117,12 @@ pub fn encode(floats: (f32, f32, f32)) -> u32 {
             // just go with white.
             0xffff7f7f
         } else {
-            0xffff0000 | ((gm_8bit as u32) << 8) | rb_8bit as u32
+            0xffff0000 | ((c1_8bit as u32) << 8) | c2_8bit as u32
         }
     } else {
         // Common case.
         let exp = (y_exp + EXP_BIAS) as u32;
-        (exp << 26) | (y_mantissa << 16) | ((gm_8bit as u32) << 8) | rb_8bit as u32
+        (exp << 26) | (y_mantissa << 16) | ((c1_8bit as u32) << 8) | c2_8bit as u32
     }
 }
 
@@ -127,22 +142,17 @@ pub fn decode(packed_rgb: u32) -> (f32, f32, f32) {
             )
         }
     };
-    let green_magenta = {
-        let gm_8bit = (packed_rgb >> 8) & 0xff;
-        (gm_8bit as f32) * (1.0 / 254.0)
+    let c1 = {
+        let c1_8bit = (packed_rgb >> 8) & 0xff;
+        (c1_8bit as f32) * (1.0 / 254.0)
     };
-    let red_blue = {
-        let rb_8bit = packed_rgb & 0xff;
-        (rb_8bit as f32) * (1.0 / 254.0)
+    let c2 = {
+        let c2_8bit = packed_rgb & 0xff;
+        (c2_8bit as f32) * (1.0 / 254.0)
     };
 
     // Convert back to RGB.
-    let g = y * green_magenta;
-    let u = (y - g) * 2.0;
-    let r = u * red_blue;
-    let b = u - r;
-
-    (r, g, b)
+    ycc2rgb((y, c1, c2))
 }
 
 #[cfg(test)]

From c1f516c2b67d3cc1cf31d2265ab46fe4236dcba0 Mon Sep 17 00:00:00 2001
From: Nathan Vegdahl <cessen@cessen.com>
Date: Sun, 13 Sep 2020 11:22:48 +0900
Subject: [PATCH 05/25] Use a better chroma formula for the RGB32 format.

This makes much better use of the bit space.
---
 sub_crates/trifloat/src/rgb32.rs | 80 ++++++++++++++++++--------------
 1 file changed, 44 insertions(+), 36 deletions(-)

diff --git a/sub_crates/trifloat/src/rgb32.rs b/sub_crates/trifloat/src/rgb32.rs
index 46b9f83..f2f194a 100644
--- a/sub_crates/trifloat/src/rgb32.rs
+++ b/sub_crates/trifloat/src/rgb32.rs
@@ -7,23 +7,24 @@
 //! eye is most sensitive to.
 //!
 //! This encoding first transforms the color into a Y (luma) component
-//! and two chroma components (green-magenta and red-blue), and then
-//! fiddles those components into a special 32-bit format.
-//! The Y component is stored as an unsigned float, with 6 bits of
-//! exponent and 10 bits of mantissa.  The two chroma components are
-//! each stored as 8-bit integers.
+//! and two chroma components (C1 and C2), and then fiddles those
+//! components into a special 32-bit format.  The Y component is stored
+//! as an unsigned float, with 6 bits of exponent and 10 bits of
+//! mantissa.  The two chroma components are each stored as 8-bit
+//! integers.
 //!
 //! The layout is:
 //!
 //! 1. Y-exponent: 6 bits
 //! 2. Y-mantissa: 10 bits
-//! 3. Green-Magenta: 8 bits
-//! 4. Red-Blue: 8 bits
+//! 3. C1: 8 bits
+//! 4. C2: 8 bits
 //!
 //! The Y-mantissa has an implicit leading one, giving 11 bits of
 //! precision.
 
 const EXP_BIAS: i32 = 23;
+const CHROMA_QUANT: u32 = 254;
 
 /// The largest value this format can store.
 ///
@@ -42,28 +43,35 @@ pub const MAX: f32 = ((1u64 << (63 - EXP_BIAS)) - (1 << (52 - EXP_BIAS))) as f32
 /// of the over-all RGB color.
 pub const MIN: f32 = 1.0 / (1 << (EXP_BIAS - 2)) as f32;
 
-/// The output c1 and c2 values should always be in the range [0, 1].
+/// The output c1 and c2 values are always in the range [0, 1].
 #[inline(always)]
-fn rgb2ycc(rgb: (f32, f32, f32)) -> (f32, f32, f32) {
-    let rb = rgb.0 + rgb.2;
-    let y = (rb * 0.5) + rgb.1;
-    let c1 = rgb.1 / y;
-    let c2 = if rb > 0.0 { rgb.0 / rb } else { 0.5 };
+fn rgb2ycc(r: f32, g: f32, b: f32) -> (f32, f32, f32) {
+    let rb = (r + b) * 0.5;
+    let y = g + rb;
 
-    (y, c1, c2)
+    if r > b {
+        (y, rb / y, b / y)
+    } else {
+        (y, r / y, rb / y)
+    }
 }
 
 /// The input c1 and c2 values should always be in the range [0, 1].
 #[inline(always)]
-fn ycc2rgb(ycc: (f32, f32, f32)) -> (f32, f32, f32) {
-    let (y, c1, c2) = ycc;
-
-    let g = y * c1;
-    let rb = (y - g) * 2.0;
-    let r = rb * c2;
-    let b = rb - r;
-
-    (r, g, b)
+fn ycc2rgb(y: f32, c1: f32, c2: f32) -> (f32, f32, f32) {
+    if c1 > c2 {
+        let rb = y * c1;
+        let g = y - rb;
+        let b = y * c2;
+        let r = (rb * 2.0) - b;
+        (r, g, b)
+    } else {
+        let rb = y * c2;
+        let g = y - rb;
+        let r = y * c1;
+        let b = (rb * 2.0) - r;
+        (r, g, b)
+    }
 }
 
 /// Encodes three floating point RGB values into a packed 32-bit format.
@@ -89,8 +97,8 @@ pub fn encode(floats: (f32, f32, f32)) -> u32 {
         floats.2
     );
 
-    // Convert to Y/Green-Magenta/Red-Blue components.
-    let (y, c1, c2) = rgb2ycc(floats);
+    // Convert to luma-chroma format.
+    let (y, c1, c2) = rgb2ycc(floats.0, floats.1, floats.2);
 
     // Bit-fiddle to get the float components of Y.
     // This assumes we're working with a standard 32-bit IEEE float.
@@ -98,9 +106,9 @@ pub fn encode(floats: (f32, f32, f32)) -> u32 {
     let y_mantissa = (y_ieee_bits >> 13) & 0b11_1111_1111;
     let y_exp = ((y_ieee_bits >> 23) & 0b1111_1111) as i32 - 127;
 
-    // Encode Cg and Cr as 8-bit integers.
-    let c1_8bit = ((c1 * 254.0) + 0.5) as u8;
-    let c2_8bit = ((c2 * 254.0) + 0.5) as u8;
+    // Encode C1 and C2 as 8-bit integers.
+    let c1_8bit = ((c1 * CHROMA_QUANT as f32) + 0.5) as u32;
+    let c2_8bit = ((c2 * CHROMA_QUANT as f32) + 0.5) as u32;
 
     // Pack values into a u32 and return.
     if y_exp <= (0 - EXP_BIAS) {
@@ -115,14 +123,15 @@ pub fn encode(floats: (f32, f32, f32)) -> u32 {
         if y.is_infinite() {
             // If luma is infinity, our chroma values are bogus, so
             // just go with white.
-            0xffff7f7f
+            let tmp = CHROMA_QUANT / 2;
+            0xffff0000 | tmp << 8 | tmp
         } else {
-            0xffff0000 | ((c1_8bit as u32) << 8) | c2_8bit as u32
+            0xffff0000 | (c1_8bit << 8) | c2_8bit
         }
     } else {
         // Common case.
         let exp = (y_exp + EXP_BIAS) as u32;
-        (exp << 26) | (y_mantissa << 16) | ((c1_8bit as u32) << 8) | c2_8bit as u32
+        (exp << 26) | (y_mantissa << 16) | (c1_8bit << 8) | c2_8bit
     }
 }
 
@@ -130,8 +139,7 @@ pub fn encode(floats: (f32, f32, f32)) -> u32 {
 /// floating point RGB numbers.
 #[inline]
 pub fn decode(packed_rgb: u32) -> (f32, f32, f32) {
-    // Pull out Y, Green-Magenta, and Red-Blue from the packed
-    // bits.
+    // Pull out Y, C1, and C2 from the packed bits.
     let y = {
         let exp = (packed_rgb & 0xfc00_0000) >> 26;
         if exp == 0 {
@@ -144,15 +152,15 @@ pub fn decode(packed_rgb: u32) -> (f32, f32, f32) {
     };
     let c1 = {
         let c1_8bit = (packed_rgb >> 8) & 0xff;
-        (c1_8bit as f32) * (1.0 / 254.0)
+        (c1_8bit as f32) * (1.0 / CHROMA_QUANT as f32)
     };
     let c2 = {
         let c2_8bit = packed_rgb & 0xff;
-        (c2_8bit as f32) * (1.0 / 254.0)
+        (c2_8bit as f32) * (1.0 / CHROMA_QUANT as f32)
     };
 
     // Convert back to RGB.
-    ycc2rgb((y, c1, c2))
+    ycc2rgb(y, c1, c2)
 }
 
 #[cfg(test)]

From f13ffac7bdb386dfa8a615d842ec1646a1dc4b81 Mon Sep 17 00:00:00 2001
From: Nathan Vegdahl <cessen@cessen.com>
Date: Fri, 18 Sep 2020 17:57:13 +0900
Subject: [PATCH 06/25] Removed the experimental luma-chroma color format.

It was a worthwhile experiment, but for it to really work it needs
a really proper luma-chroma separation, which is both slower than
I really want, and requires knowing the colorspace being used.

I might make another go at this based on the TIFF LogLUV color
format, requiring XYZ as input.
---
 sub_crates/trifloat/benches/bench.rs |  26 +--
 sub_crates/trifloat/src/lib.rs       |   1 -
 sub_crates/trifloat/src/rgb32.rs     | 321 ---------------------------
 3 files changed, 1 insertion(+), 347 deletions(-)
 delete mode 100644 sub_crates/trifloat/src/rgb32.rs

diff --git a/sub_crates/trifloat/benches/bench.rs b/sub_crates/trifloat/benches/bench.rs
index 5b4d0cd..7f27f04 100644
--- a/sub_crates/trifloat/benches/bench.rs
+++ b/sub_crates/trifloat/benches/bench.rs
@@ -1,6 +1,6 @@
 use bencher::{benchmark_group, benchmark_main, black_box, Bencher};
 use rand::{rngs::SmallRng, FromEntropy, Rng};
-use trifloat::{rgb32, signed48, unsigned32};
+use trifloat::{signed48, unsigned32};
 
 //----
 
@@ -48,28 +48,6 @@ fn signed48_decode_100_values(bench: &mut Bencher) {
     });
 }
 
-fn rgb32_encode_100_values(bench: &mut Bencher) {
-    let mut rng = SmallRng::from_entropy();
-    bench.iter(|| {
-        let y = rng.gen::<f32>();
-        let x = rng.gen::<f32>();
-        let z = rng.gen::<f32>();
-        for _ in 0..100 {
-            black_box(rgb32::encode(black_box((x, y, z))));
-        }
-    });
-}
-
-fn rgb32_decode_100_values(bench: &mut Bencher) {
-    let mut rng = SmallRng::from_entropy();
-    bench.iter(|| {
-        let v = rng.gen::<u32>();
-        for _ in 0..100 {
-            black_box(rgb32::decode(black_box(v)));
-        }
-    });
-}
-
 //----
 
 benchmark_group!(
@@ -78,7 +56,5 @@ benchmark_group!(
     unsigned32_decode_100_values,
     signed48_encode_100_values,
     signed48_decode_100_values,
-    rgb32_encode_100_values,
-    rgb32_decode_100_values,
 );
 benchmark_main!(benches);
diff --git a/sub_crates/trifloat/src/lib.rs b/sub_crates/trifloat/src/lib.rs
index dafef1e..eaefb5f 100644
--- a/sub_crates/trifloat/src/lib.rs
+++ b/sub_crates/trifloat/src/lib.rs
@@ -4,7 +4,6 @@
 //! The motivating use-case for this is compactly storing HDR RGB colors.  But
 //! it may be useful for other things as well.
 
-pub mod rgb32;
 pub mod signed48;
 pub mod unsigned32;
 
diff --git a/sub_crates/trifloat/src/rgb32.rs b/sub_crates/trifloat/src/rgb32.rs
deleted file mode 100644
index f2f194a..0000000
--- a/sub_crates/trifloat/src/rgb32.rs
+++ /dev/null
@@ -1,321 +0,0 @@
-//! Encoding/decoding for a specialized HDR RGB 32-bit storage format.
-//!
-//! The motivation for this format is to separate out the luma of
-//! the color from its chromaticity, in the same spirit as most
-//! image and video compression approaches, and then allocate more
-//! bits to storing the luma component since that's what the human
-//! eye is most sensitive to.
-//!
-//! This encoding first transforms the color into a Y (luma) component
-//! and two chroma components (C1 and C2), and then fiddles those
-//! components into a special 32-bit format.  The Y component is stored
-//! as an unsigned float, with 6 bits of exponent and 10 bits of
-//! mantissa.  The two chroma components are each stored as 8-bit
-//! integers.
-//!
-//! The layout is:
-//!
-//! 1. Y-exponent: 6 bits
-//! 2. Y-mantissa: 10 bits
-//! 3. C1: 8 bits
-//! 4. C2: 8 bits
-//!
-//! The Y-mantissa has an implicit leading one, giving 11 bits of
-//! precision.
-
-const EXP_BIAS: i32 = 23;
-const CHROMA_QUANT: u32 = 254;
-
-/// The largest value this format can store.
-///
-/// More precisely, this is the largest value that can be *reliably*
-/// stored.
-///
-/// This can be exceeded by individual channels in limited cases due
-/// to the color transform used.  But values *at least* this large
-/// can be relied on.
-pub const MAX: f32 = ((1u64 << (63 - EXP_BIAS)) - (1 << (52 - EXP_BIAS))) as f32;
-
-/// The smallest non-zero value this format can store.
-///
-/// Note that since this is effectively a shared-exponent format,
-/// the numerical precision of all channels depends on the magnitude
-/// of the over-all RGB color.
-pub const MIN: f32 = 1.0 / (1 << (EXP_BIAS - 2)) as f32;
-
-/// The output c1 and c2 values are always in the range [0, 1].
-#[inline(always)]
-fn rgb2ycc(r: f32, g: f32, b: f32) -> (f32, f32, f32) {
-    let rb = (r + b) * 0.5;
-    let y = g + rb;
-
-    if r > b {
-        (y, rb / y, b / y)
-    } else {
-        (y, r / y, rb / y)
-    }
-}
-
-/// The input c1 and c2 values should always be in the range [0, 1].
-#[inline(always)]
-fn ycc2rgb(y: f32, c1: f32, c2: f32) -> (f32, f32, f32) {
-    if c1 > c2 {
-        let rb = y * c1;
-        let g = y - rb;
-        let b = y * c2;
-        let r = (rb * 2.0) - b;
-        (r, g, b)
-    } else {
-        let rb = y * c2;
-        let g = y - rb;
-        let r = y * c1;
-        let b = (rb * 2.0) - r;
-        (r, g, b)
-    }
-}
-
-/// Encodes three floating point RGB values into a packed 32-bit format.
-///
-/// Warning: negative values and NaN's are _not_ supported.  There are
-/// debug-only assertions in place to catch such values in the input
-/// floats.  Infinity in any channel will saturate the whole color to
-/// the brightest representable white.
-#[inline]
-pub fn encode(floats: (f32, f32, f32)) -> u32 {
-    debug_assert!(
-        floats.0 >= 0.0
-            && floats.1 >= 0.0
-            && floats.2 >= 0.0
-            && !floats.0.is_nan()
-            && !floats.1.is_nan()
-            && !floats.2.is_nan(),
-        "trifloat::rgb32::encode(): encoding to unsigned tri-floats only \
-         works correctly for positive, non-NaN numbers, but the numbers passed \
-         were: ({}, {}, {})",
-        floats.0,
-        floats.1,
-        floats.2
-    );
-
-    // Convert to luma-chroma format.
-    let (y, c1, c2) = rgb2ycc(floats.0, floats.1, floats.2);
-
-    // Bit-fiddle to get the float components of Y.
-    // This assumes we're working with a standard 32-bit IEEE float.
-    let y_ieee_bits = y.to_bits();
-    let y_mantissa = (y_ieee_bits >> 13) & 0b11_1111_1111;
-    let y_exp = ((y_ieee_bits >> 23) & 0b1111_1111) as i32 - 127;
-
-    // Encode C1 and C2 as 8-bit integers.
-    let c1_8bit = ((c1 * CHROMA_QUANT as f32) + 0.5) as u32;
-    let c2_8bit = ((c2 * CHROMA_QUANT as f32) + 0.5) as u32;
-
-    // Pack values into a u32 and return.
-    if y_exp <= (0 - EXP_BIAS) {
-        // Early-out corner-case:
-        // Luma is so dark that it will be zero at our precision,
-        // and hence black.
-        0
-    } else if y_exp >= (63 - EXP_BIAS) {
-        // Corner-case:
-        // Luma is so bright that it exceeds our max value, so saturate
-        // the luma.
-        if y.is_infinite() {
-            // If luma is infinity, our chroma values are bogus, so
-            // just go with white.
-            let tmp = CHROMA_QUANT / 2;
-            0xffff0000 | tmp << 8 | tmp
-        } else {
-            0xffff0000 | (c1_8bit << 8) | c2_8bit
-        }
-    } else {
-        // Common case.
-        let exp = (y_exp + EXP_BIAS) as u32;
-        (exp << 26) | (y_mantissa << 16) | (c1_8bit << 8) | c2_8bit
-    }
-}
-
-/// Decodes a packed HDR RGB 32-bit format into three full
-/// floating point RGB numbers.
-#[inline]
-pub fn decode(packed_rgb: u32) -> (f32, f32, f32) {
-    // Pull out Y, C1, and C2 from the packed bits.
-    let y = {
-        let exp = (packed_rgb & 0xfc00_0000) >> 26;
-        if exp == 0 {
-            0.0
-        } else {
-            f32::from_bits(
-                ((exp + (127 - EXP_BIAS as u32)) << 23) | ((packed_rgb & 0x03ff_0000) >> 3),
-            )
-        }
-    };
-    let c1 = {
-        let c1_8bit = (packed_rgb >> 8) & 0xff;
-        (c1_8bit as f32) * (1.0 / CHROMA_QUANT as f32)
-    };
-    let c2 = {
-        let c2_8bit = packed_rgb & 0xff;
-        (c2_8bit as f32) * (1.0 / CHROMA_QUANT as f32)
-    };
-
-    // Convert back to RGB.
-    ycc2rgb(y, c1, c2)
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    fn round_trip(floats: (f32, f32, f32)) -> (f32, f32, f32) {
-        decode(encode(floats))
-    }
-
-    #[test]
-    fn all_zeros() {
-        let fs = (0.0f32, 0.0f32, 0.0f32);
-
-        let tri = encode(fs);
-        let fs2 = decode(tri);
-
-        assert_eq!(tri, 0u32);
-        assert_eq!(fs, fs2);
-    }
-
-    #[test]
-    fn powers_of_two() {
-        let mut n = 1.0f32 / 65536.0;
-        for _ in 0..48 {
-            let fs = (n, n, n);
-
-            assert_eq!(fs, round_trip(fs));
-            n *= 2.0;
-        }
-    }
-
-    #[test]
-    fn integers() {
-        let mut n = 1.0f32;
-        for _ in 0..2048 {
-            let fs = (n, n, n);
-
-            assert_eq!(fs, round_trip(fs));
-            n += 1.0;
-        }
-    }
-
-    #[test]
-    fn color_saturation() {
-        let fs1 = (1.0, 0.0, 0.0);
-        let fs2 = (0.0, 1.0, 0.0);
-        let fs3 = (0.0, 0.0, 1.0);
-
-        assert_eq!(fs1, round_trip(fs1));
-        assert_eq!(fs2, round_trip(fs2));
-        assert_eq!(fs3, round_trip(fs3));
-    }
-
-    #[test]
-    fn num_saturate() {
-        let fs = (10000000000000.0, 10000000000000.0, 10000000000000.0);
-
-        assert_eq!((MAX, MAX, MAX), round_trip(fs));
-    }
-
-    #[test]
-    fn num_inf_saturate() {
-        use std::f32::INFINITY;
-        let fs = (INFINITY, INFINITY, INFINITY);
-
-        assert_eq!((MAX, MAX, MAX), round_trip(fs));
-    }
-
-    #[test]
-    fn num_partial_saturate() {
-        let fs1 = (10000000000000.0, 0.0, 0.0);
-        let fs2 = (0.0, 10000000000000.0, 0.0);
-        let fs3 = (0.0, 0.0, 10000000000000.0);
-
-        assert_eq!((MAX * 4.0, 0.0, 0.0), round_trip(fs1));
-        assert_eq!((0.0, MAX * 2.0, 0.0), round_trip(fs2));
-        assert_eq!((0.0, 0.0, MAX * 4.0), round_trip(fs3));
-    }
-
-    #[test]
-    fn largest_value() {
-        let fs1 = (MAX, MAX, MAX);
-        let fs2 = (MAX, 0.0, 0.0);
-        let fs3 = (0.0, MAX, 0.0);
-        let fs4 = (0.0, 0.0, MAX);
-
-        assert_eq!(fs1, round_trip(fs1));
-        assert_eq!(fs2, round_trip(fs2));
-        assert_eq!(fs3, round_trip(fs3));
-        assert_eq!(fs4, round_trip(fs4));
-    }
-
-    #[test]
-    fn smallest_value() {
-        let fs1 = (MIN, MIN, MIN);
-        let fs2 = (MIN, 0.0, 0.0);
-        let fs3 = (0.0, MIN, 0.0);
-        let fs4 = (0.0, 0.0, MIN);
-
-        assert_eq!(fs1, round_trip(fs1));
-        assert_eq!(fs2, round_trip(fs2));
-        assert_eq!(fs3, round_trip(fs3));
-        assert_eq!(fs4, round_trip(fs4));
-    }
-
-    #[test]
-    fn underflow() {
-        let fs1 = (MIN * 0.5, 0.0, 0.0);
-        let fs2 = (0.0, MIN * 0.25, 0.0);
-        let fs3 = (0.0, 0.0, MIN * 0.5);
-
-        assert_eq!(round_trip(fs1), (0.0, 0.0, 0.0));
-        assert_eq!(round_trip(fs2), (0.0, 0.0, 0.0));
-        assert_eq!(round_trip(fs3), (0.0, 0.0, 0.0));
-    }
-
-    #[test]
-    #[should_panic]
-    fn nans_01() {
-        encode((std::f32::NAN, 0.0, 0.0));
-    }
-
-    #[test]
-    #[should_panic]
-    fn nans_02() {
-        encode((0.0, std::f32::NAN, 0.0));
-    }
-
-    #[test]
-    #[should_panic]
-    fn nans_03() {
-        encode((0.0, 0.0, std::f32::NAN));
-    }
-
-    #[test]
-    #[should_panic]
-    fn negative_01() {
-        encode((-1.0, 0.0, 0.0));
-    }
-
-    #[test]
-    #[should_panic]
-    fn negative_02() {
-        encode((0.0, -1.0, 0.0));
-    }
-
-    #[test]
-    #[should_panic]
-    fn negative_03() {
-        encode((0.0, 0.0, -1.0));
-    }
-
-    #[test]
-    fn negative_04() {
-        encode((-0.0, -0.0, -0.0));
-    }
-}

From 96b8dd84b9d50960a2b4bf7198607d7ef55f7700 Mon Sep 17 00:00:00 2001
From: Nathan Vegdahl <cessen@cessen.com>
Date: Fri, 18 Sep 2020 21:04:16 +0900
Subject: [PATCH 07/25] Cleaned up the u32 trifloat implementation.

This also makes encoding faster.  However, it no longer does
rounding to the nearest precision when encoding, and insead does
flooring.  This seems like a reasonable tradeoff: if you want more
precision... you should use a format with more precision.
---
 sub_crates/trifloat/src/unsigned32.rs | 100 ++++++++++++--------------
 1 file changed, 45 insertions(+), 55 deletions(-)

diff --git a/sub_crates/trifloat/src/unsigned32.rs b/sub_crates/trifloat/src/unsigned32.rs
index 56baf1f..ff462fc 100644
--- a/sub_crates/trifloat/src/unsigned32.rs
+++ b/sub_crates/trifloat/src/unsigned32.rs
@@ -2,7 +2,7 @@
 //!
 //! The encoding uses 9 bits of mantissa per number, and 5 bits for the shared
 //! exponent.  The bit layout is [mantissa 1, mantissa 2, mantissa 3, exponent].
-//! The exponent is stored as an unsigned integer with a bias of 10.
+//! The exponent is stored as an unsigned integer with a bias of 11.
 //!
 //! The largest representable number is `2^21 - 4096`, and the smallest
 //! representable non-zero number is `2^-19`.
@@ -14,22 +14,20 @@
 use crate::{fiddle_exp2, fiddle_log2};
 
 /// Largest representable number.
-pub const MAX: f32 = 2_093_056.0;
+pub const MAX: f32 = ((1u64 << (32 - EXP_BIAS)) - (1 << (32 - EXP_BIAS - 9))) as f32;
 
 /// Smallest representable non-zero number.
-pub const MIN: f32 = 0.000_001_907_348_6;
+pub const MIN: f32 = 1.0 / (1 << (EXP_BIAS + 8)) as f32;
 
 /// Difference between 1.0 and the next largest representable number.
 pub const EPSILON: f32 = 1.0 / 256.0;
 
-const EXP_BIAS: i32 = 10;
-const MIN_EXP: i32 = 0 - EXP_BIAS;
-const MAX_EXP: i32 = 31 - EXP_BIAS;
+const EXP_BIAS: i32 = 11;
 
 /// Encodes three floating point values into a signed 32-bit trifloat.
 ///
 /// Input floats larger than `MAX` will saturate to `MAX`, including infinity.
-/// Values are converted to trifloat precision by rounding.
+/// Values are converted to trifloat precision by rounding down.
 ///
 /// Warning: negative values and NaN's are _not_ supported by the trifloat
 /// format.  There are debug-only assertions in place to catch such
@@ -51,31 +49,19 @@ pub fn encode(floats: (f32, f32, f32)) -> u32 {
         floats.2
     );
 
-    // Find the largest of the three values.
-    let largest_value = floats.0.max(floats.1.max(floats.2));
-    if largest_value <= 0.0 {
+    let largest = floats.0.max(floats.1.max(floats.2));
+
+    if largest < MIN {
         return 0;
+    } else {
+        let e = fiddle_log2(largest).max(-EXP_BIAS).min(31 - EXP_BIAS);
+        let inv_multiplier = fiddle_exp2(-e + 8);
+        let x = (floats.0 * inv_multiplier).min(511.0) as u32;
+        let y = (floats.1 * inv_multiplier).min(511.0) as u32;
+        let z = (floats.2 * inv_multiplier).min(511.0) as u32;
+
+        (x << (9 + 9 + 5)) | (y << (9 + 5)) | (z << 5) | (e + EXP_BIAS) as u32
     }
-
-    // Calculate the exponent and 1.0/multiplier for encoding the values.
-    let mut exponent = (fiddle_log2(largest_value) + 1).max(MIN_EXP).min(MAX_EXP);
-    let mut inv_multiplier = fiddle_exp2(-exponent + 9);
-
-    // Edge-case: make sure rounding pushes the largest value up
-    // appropriately if needed.
-    if (largest_value * inv_multiplier) + 0.5 >= 512.0 {
-        exponent = (exponent + 1).min(MAX_EXP);
-        inv_multiplier = fiddle_exp2(-exponent + 9);
-    }
-
-    // Quantize and encode values.
-    let x = (floats.0 * inv_multiplier + 0.5).min(511.0) as u32 & 0b1_1111_1111;
-    let y = (floats.1 * inv_multiplier + 0.5).min(511.0) as u32 & 0b1_1111_1111;
-    let z = (floats.2 * inv_multiplier + 0.5).min(511.0) as u32 & 0b1_1111_1111;
-    let e = (exponent + EXP_BIAS) as u32 & 0b1_1111;
-
-    // Pack values into a u32.
-    (x << (5 + 9 + 9)) | (y << (5 + 9)) | (z << 5) | e
 }
 
 /// Decodes an unsigned 32-bit trifloat into three full floating point numbers.
@@ -84,12 +70,12 @@ pub fn encode(floats: (f32, f32, f32)) -> u32 {
 #[inline]
 pub fn decode(trifloat: u32) -> (f32, f32, f32) {
     // Unpack values.
-    let x = trifloat >> (5 + 9 + 9);
-    let y = (trifloat >> (5 + 9)) & 0b1_1111_1111;
+    let x = trifloat >> (9 + 9 + 5);
+    let y = (trifloat >> (9 + 5)) & 0b1_1111_1111;
     let z = (trifloat >> 5) & 0b1_1111_1111;
     let e = trifloat & 0b1_1111;
 
-    let multiplier = fiddle_exp2(e as i32 - EXP_BIAS - 9);
+    let multiplier = fiddle_exp2(e as i32 - EXP_BIAS - 8);
 
     (
         x as f32 * multiplier,
@@ -120,11 +106,11 @@ mod tests {
     #[test]
     fn powers_of_two() {
         let fs = (8.0f32, 128.0f32, 0.5f32);
-        assert_eq!(round_trip(fs), fs);
+        assert_eq!(fs, round_trip(fs));
     }
 
     #[test]
-    fn accuracy() {
+    fn accuracy_01() {
         let mut n = 1.0;
         for _ in 0..256 {
             let (x, _, _) = round_trip((n, 0.0, 0.0));
@@ -133,6 +119,17 @@ mod tests {
         }
     }
 
+    #[test]
+    #[should_panic]
+    fn accuracy_02() {
+        let mut n = 1.0;
+        for _ in 0..512 {
+            let (x, _, _) = round_trip((n, 0.0, 0.0));
+            assert_eq!(n, x);
+            n += 1.0 / 512.0;
+        }
+    }
+
     #[test]
     fn integers() {
         for n in 0..=512 {
@@ -142,24 +139,17 @@ mod tests {
     }
 
     #[test]
-    fn rounding() {
+    fn precision_floor() {
         let fs = (7.0f32, 513.0f32, 1.0f32);
-        assert_eq!(round_trip(fs), (8.0, 514.0, 2.0));
-    }
-
-    #[test]
-    fn rounding_edge_case() {
-        let fs = (1023.0f32, 0.0f32, 0.0f32);
-
-        assert_eq!(round_trip(fs), (1024.0, 0.0, 0.0),);
+        assert_eq!((6.0, 512.0, 0.0), round_trip(fs));
     }
 
     #[test]
     fn saturate() {
         let fs = (9999999999.0, 9999999999.0, 9999999999.0);
 
-        assert_eq!(round_trip(fs), (MAX, MAX, MAX));
-        assert_eq!(decode(0xFFFFFFFF), (MAX, MAX, MAX),);
+        assert_eq!((MAX, MAX, MAX), round_trip(fs));
+        assert_eq!((MAX, MAX, MAX), decode(0xFFFFFFFF));
     }
 
     #[test]
@@ -167,29 +157,29 @@ mod tests {
         use std::f32::INFINITY;
         let fs = (INFINITY, 0.0, 0.0);
 
-        assert_eq!(round_trip(fs), (MAX, 0.0, 0.0));
-        assert_eq!(encode(fs), 0xFF80001F,);
+        assert_eq!((MAX, 0.0, 0.0), round_trip(fs));
+        assert_eq!(0xFF80001F, encode(fs));
     }
 
     #[test]
     fn partial_saturate() {
         let fs = (9999999999.0, 4096.0, 262144.0);
 
-        assert_eq!(round_trip(fs), (MAX, 4096.0, 262144.0));
+        assert_eq!((MAX, 4096.0, 262144.0), round_trip(fs));
     }
 
     #[test]
     fn smallest_value() {
-        let fs = (MIN, MIN * 0.5, MIN * 0.49);
-        assert_eq!(round_trip(fs), (MIN, MIN, 0.0));
-        assert_eq!(decode(0x00_80_40_00), (MIN, MIN, 0.0));
+        let fs = (MIN * 1.5, MIN, MIN * 0.5);
+        assert_eq!((MIN, MIN, 0.0), round_trip(fs));
+        assert_eq!((MIN, MIN, 0.0), decode(0x00_80_40_00));
     }
 
     #[test]
     fn underflow() {
-        let fs = (MIN * 0.49, 0.0, 0.0);
-        assert_eq!(encode(fs), 0);
-        assert_eq!(round_trip(fs), (0.0, 0.0, 0.0));
+        let fs = (MIN * 0.99, 0.0, 0.0);
+        assert_eq!(0, encode(fs));
+        assert_eq!((0.0, 0.0, 0.0), round_trip(fs));
     }
 
     #[test]

From 9fb3c95171a2e6a6c4674922289eba1cca7a1108 Mon Sep 17 00:00:00 2001
From: Nathan Vegdahl <cessen@cessen.com>
Date: Fri, 18 Sep 2020 22:12:12 +0900
Subject: [PATCH 08/25] Added an unsigned 40-bit trifloat format.

It is identical to the 32-bit format, except with more precision
and range due to using more bits.  This format should comfortably
store any color information with precision easily exceeding the
limits of human vision.
---
 sub_crates/trifloat/benches/bench.rs  |  26 ++-
 sub_crates/trifloat/src/lib.rs        |   1 +
 sub_crates/trifloat/src/unsigned32.rs |   2 +-
 sub_crates/trifloat/src/unsigned40.rs | 237 ++++++++++++++++++++++++++
 4 files changed, 264 insertions(+), 2 deletions(-)
 create mode 100644 sub_crates/trifloat/src/unsigned40.rs

diff --git a/sub_crates/trifloat/benches/bench.rs b/sub_crates/trifloat/benches/bench.rs
index 7f27f04..43d968f 100644
--- a/sub_crates/trifloat/benches/bench.rs
+++ b/sub_crates/trifloat/benches/bench.rs
@@ -1,6 +1,6 @@
 use bencher::{benchmark_group, benchmark_main, black_box, Bencher};
 use rand::{rngs::SmallRng, FromEntropy, Rng};
-use trifloat::{signed48, unsigned32};
+use trifloat::{signed48, unsigned32, unsigned40};
 
 //----
 
@@ -26,6 +26,28 @@ fn unsigned32_decode_100_values(bench: &mut Bencher) {
     });
 }
 
+fn unsigned40_encode_100_values(bench: &mut Bencher) {
+    let mut rng = SmallRng::from_entropy();
+    bench.iter(|| {
+        let x = rng.gen::<f32>();
+        let y = rng.gen::<f32>();
+        let z = rng.gen::<f32>();
+        for _ in 0..100 {
+            black_box(unsigned40::encode(black_box((x, y, z))));
+        }
+    });
+}
+
+fn unsigned40_decode_100_values(bench: &mut Bencher) {
+    let mut rng = SmallRng::from_entropy();
+    bench.iter(|| {
+        let v = rng.gen::<u64>();
+        for _ in 0..100 {
+            black_box(unsigned40::decode(black_box(v)));
+        }
+    });
+}
+
 fn signed48_encode_100_values(bench: &mut Bencher) {
     let mut rng = SmallRng::from_entropy();
     bench.iter(|| {
@@ -54,6 +76,8 @@ benchmark_group!(
     benches,
     unsigned32_encode_100_values,
     unsigned32_decode_100_values,
+    unsigned40_encode_100_values,
+    unsigned40_decode_100_values,
     signed48_encode_100_values,
     signed48_decode_100_values,
 );
diff --git a/sub_crates/trifloat/src/lib.rs b/sub_crates/trifloat/src/lib.rs
index eaefb5f..2e1afa2 100644
--- a/sub_crates/trifloat/src/lib.rs
+++ b/sub_crates/trifloat/src/lib.rs
@@ -6,6 +6,7 @@
 
 pub mod signed48;
 pub mod unsigned32;
+pub mod unsigned40;
 
 //===========================================================================
 // Some shared functions used by the other modules in this crate.
diff --git a/sub_crates/trifloat/src/unsigned32.rs b/sub_crates/trifloat/src/unsigned32.rs
index ff462fc..700e45a 100644
--- a/sub_crates/trifloat/src/unsigned32.rs
+++ b/sub_crates/trifloat/src/unsigned32.rs
@@ -24,7 +24,7 @@ pub const EPSILON: f32 = 1.0 / 256.0;
 
 const EXP_BIAS: i32 = 11;
 
-/// Encodes three floating point values into a signed 32-bit trifloat.
+/// Encodes three floating point values into an unsigned 32-bit trifloat.
 ///
 /// Input floats larger than `MAX` will saturate to `MAX`, including infinity.
 /// Values are converted to trifloat precision by rounding down.
diff --git a/sub_crates/trifloat/src/unsigned40.rs b/sub_crates/trifloat/src/unsigned40.rs
new file mode 100644
index 0000000..5fcd8ac
--- /dev/null
+++ b/sub_crates/trifloat/src/unsigned40.rs
@@ -0,0 +1,237 @@
+//! Encoding/decoding for unsigned 40-bit trifloat numbers.
+//!
+//! The encoding uses 11 bits of mantissa per number, and 7 bits for the shared
+//! exponent.  The bit layout is [mantissa 1, mantissa 2, mantissa 3, exponent].
+//! The exponent is stored as an unsigned integer with a bias of 32.
+//!
+//! The largest representable number is just under `2^96`, and the smallest
+//! representable non-zero number is `2^-42`.
+//!
+//! Since the exponent is shared between the three values, the precision
+//! of all three values depends on the largest of the three.  All integers
+//! up to 2048 can be represented exactly in the largest value.
+
+use crate::{fiddle_exp2, fiddle_log2};
+
+/// Largest representable number.
+pub const MAX: f32 = ((1u128 << (128 - EXP_BIAS)) - (1 << (128 - EXP_BIAS - 11))) as f32;
+
+/// Smallest representable non-zero number.
+pub const MIN: f32 = 1.0 / (1u128 << (EXP_BIAS + 10)) as f32;
+
+/// Difference between 1.0 and the next largest representable number.
+pub const EPSILON: f32 = 1.0 / 1024.0;
+
+const EXP_BIAS: i32 = 32;
+
+/// Encodes three floating point values into an unsigned 40-bit trifloat.
+///
+/// Input floats larger than `MAX` will saturate to `MAX`, including infinity.
+/// Values are converted to trifloat precision by rounding down.
+///
+/// Only the lower 40 bits of the return value are used.  The highest 24 bits
+/// will all be zero and can be safely discarded.
+///
+/// Warning: negative values and NaN's are _not_ supported by the trifloat
+/// format.  There are debug-only assertions in place to catch such
+/// values in the input floats.
+#[inline]
+pub fn encode(floats: (f32, f32, f32)) -> u64 {
+    debug_assert!(
+        floats.0 >= 0.0
+            && floats.1 >= 0.0
+            && floats.2 >= 0.0
+            && !floats.0.is_nan()
+            && !floats.1.is_nan()
+            && !floats.2.is_nan(),
+        "trifloat::unsigned32::encode(): encoding to unsigned tri-floats only \
+         works correctly for positive, non-NaN numbers, but the numbers passed \
+         were: ({}, {}, {})",
+        floats.0,
+        floats.1,
+        floats.2
+    );
+
+    let largest = floats.0.max(floats.1.max(floats.2));
+
+    if largest < MIN {
+        return 0;
+    } else {
+        let e = fiddle_log2(largest).max(-EXP_BIAS).min(127 - EXP_BIAS);
+        let inv_multiplier = fiddle_exp2(-e + 10);
+        let x = (floats.0 * inv_multiplier).min(2047.0) as u64;
+        let y = (floats.1 * inv_multiplier).min(2047.0) as u64;
+        let z = (floats.2 * inv_multiplier).min(2047.0) as u64;
+
+        (x << (11 + 11 + 7)) | (y << (11 + 7)) | (z << 7) | (e + EXP_BIAS) as u64
+    }
+}
+
+/// Decodes an unsigned 40-bit trifloat into three full floating point numbers.
+///
+/// This operation is lossless and cannot fail.  Only the lower 40 bits of the
+/// input value are used--the upper 24 bits can safely be anything and are
+/// ignored.
+#[inline]
+pub fn decode(trifloat: u64) -> (f32, f32, f32) {
+    // Unpack values.
+    let x = trifloat >> (11 + 11 + 7);
+    let y = (trifloat >> (11 + 7)) & 0b111_1111_1111;
+    let z = (trifloat >> 7) & 0b111_1111_1111;
+    let e = trifloat & 0b111_1111;
+
+    let multiplier = fiddle_exp2(e as i32 - EXP_BIAS - 10);
+
+    (
+        x as f32 * multiplier,
+        y as f32 * multiplier,
+        z as f32 * multiplier,
+    )
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn round_trip(floats: (f32, f32, f32)) -> (f32, f32, f32) {
+        decode(encode(floats))
+    }
+
+    #[test]
+    fn all_zeros() {
+        let fs = (0.0f32, 0.0f32, 0.0f32);
+
+        let tri = encode(fs);
+        let fs2 = decode(tri);
+
+        assert_eq!(tri, 0u64);
+        assert_eq!(fs, fs2);
+    }
+
+    #[test]
+    fn powers_of_two() {
+        let fs = (8.0f32, 128.0f32, 0.5f32);
+        assert_eq!(fs, round_trip(fs));
+    }
+
+    #[test]
+    fn accuracy_01() {
+        let mut n = 1.0;
+        for _ in 0..1024 {
+            let (x, _, _) = round_trip((n, 0.0, 0.0));
+            assert_eq!(n, x);
+            n += 1.0 / 1024.0;
+        }
+    }
+
+    #[test]
+    #[should_panic]
+    fn accuracy_02() {
+        let mut n = 1.0;
+        for _ in 0..2048 {
+            let (x, _, _) = round_trip((n, 0.0, 0.0));
+            assert_eq!(n, x);
+            n += 1.0 / 2048.0;
+        }
+    }
+
+    #[test]
+    fn integers() {
+        for n in 0..=2048 {
+            let (x, _, _) = round_trip((n as f32, 0.0, 0.0));
+            assert_eq!(n as f32, x);
+        }
+    }
+
+    #[test]
+    fn precision_floor() {
+        let fs = (7.0f32, 2049.0f32, 1.0f32);
+        assert_eq!((6.0, 2048.0, 0.0), round_trip(fs));
+    }
+
+    #[test]
+    fn saturate() {
+        let fs = (1.0e+30, 1.0e+30, 1.0e+30);
+
+        assert_eq!((MAX, MAX, MAX), round_trip(fs));
+        assert_eq!((MAX, MAX, MAX), decode(0xff_ffff_ffff));
+    }
+
+    #[test]
+    fn inf_saturate() {
+        use std::f32::INFINITY;
+        let fs = (INFINITY, 0.0, 0.0);
+
+        assert_eq!((MAX, 0.0, 0.0), round_trip(fs));
+        assert_eq!(0xffe000007f, encode(fs));
+    }
+
+    #[test]
+    fn partial_saturate() {
+        let fs = (
+            1.0e+30,
+            (1u128 << (128 - EXP_BIAS - 11)) as f32,
+            (1u128 << (128 - EXP_BIAS - 12)) as f32,
+        );
+
+        assert_eq!(
+            (MAX, (1u128 << (128 - EXP_BIAS - 11)) as f32, 0.0),
+            round_trip(fs)
+        );
+    }
+
+    #[test]
+    fn smallest_value() {
+        let fs = (MIN * 1.5, MIN, MIN * 0.5);
+        assert_eq!((MIN, MIN, 0.0), round_trip(fs));
+        assert_eq!((MIN, MIN, 0.0), decode(0x20_04_00_00));
+    }
+
+    #[test]
+    fn underflow() {
+        let fs = (MIN * 0.99, 0.0, 0.0);
+        assert_eq!(0, encode(fs));
+        assert_eq!((0.0, 0.0, 0.0), round_trip(fs));
+    }
+
+    #[test]
+    #[should_panic]
+    fn nans_01() {
+        encode((std::f32::NAN, 0.0, 0.0));
+    }
+
+    #[test]
+    #[should_panic]
+    fn nans_02() {
+        encode((0.0, std::f32::NAN, 0.0));
+    }
+
+    #[test]
+    #[should_panic]
+    fn nans_03() {
+        encode((0.0, 0.0, std::f32::NAN));
+    }
+
+    #[test]
+    #[should_panic]
+    fn negative_01() {
+        encode((-1.0, 0.0, 0.0));
+    }
+
+    #[test]
+    #[should_panic]
+    fn negative_02() {
+        encode((0.0, -1.0, 0.0));
+    }
+
+    #[test]
+    #[should_panic]
+    fn negative_03() {
+        encode((0.0, 0.0, -1.0));
+    }
+
+    #[test]
+    fn negative_04() {
+        encode((-0.0, -0.0, -0.0));
+    }
+}

From fd98b333335f8d0bd01497819e73256cbeeeed57 Mon Sep 17 00:00:00 2001
From: Nathan Vegdahl <cessen@cessen.com>
Date: Sat, 19 Sep 2020 08:28:59 +0900
Subject: [PATCH 09/25] Make unsigned40 trifloat encoding a byte array.

The whole point of these formats is to compress down to less
space, so let's not leave actually putting it in the space-saving
form on the client code.
---
 sub_crates/trifloat/benches/bench.rs  | 44 ++++++++++--------
 sub_crates/trifloat/src/unsigned40.rs | 64 ++++++++++++++++++++-------
 2 files changed, 72 insertions(+), 36 deletions(-)

diff --git a/sub_crates/trifloat/benches/bench.rs b/sub_crates/trifloat/benches/bench.rs
index 43d968f..3dcd704 100644
--- a/sub_crates/trifloat/benches/bench.rs
+++ b/sub_crates/trifloat/benches/bench.rs
@@ -4,67 +4,73 @@ use trifloat::{signed48, unsigned32, unsigned40};
 
 //----
 
-fn unsigned32_encode_100_values(bench: &mut Bencher) {
+fn unsigned32_encode_1000_values(bench: &mut Bencher) {
     let mut rng = SmallRng::from_entropy();
     bench.iter(|| {
         let x = rng.gen::<f32>();
         let y = rng.gen::<f32>();
         let z = rng.gen::<f32>();
-        for _ in 0..100 {
+        for _ in 0..1000 {
             black_box(unsigned32::encode(black_box((x, y, z))));
         }
     });
 }
 
-fn unsigned32_decode_100_values(bench: &mut Bencher) {
+fn unsigned32_decode_1000_values(bench: &mut Bencher) {
     let mut rng = SmallRng::from_entropy();
     bench.iter(|| {
         let v = rng.gen::<u32>();
-        for _ in 0..100 {
+        for _ in 0..1000 {
             black_box(unsigned32::decode(black_box(v)));
         }
     });
 }
 
-fn unsigned40_encode_100_values(bench: &mut Bencher) {
+fn unsigned40_encode_1000_values(bench: &mut Bencher) {
     let mut rng = SmallRng::from_entropy();
     bench.iter(|| {
         let x = rng.gen::<f32>();
         let y = rng.gen::<f32>();
         let z = rng.gen::<f32>();
-        for _ in 0..100 {
+        for _ in 0..1000 {
             black_box(unsigned40::encode(black_box((x, y, z))));
         }
     });
 }
 
-fn unsigned40_decode_100_values(bench: &mut Bencher) {
+fn unsigned40_decode_1000_values(bench: &mut Bencher) {
     let mut rng = SmallRng::from_entropy();
     bench.iter(|| {
-        let v = rng.gen::<u64>();
-        for _ in 0..100 {
+        let v = [
+            rng.gen::<u8>(),
+            rng.gen::<u8>(),
+            rng.gen::<u8>(),
+            rng.gen::<u8>(),
+            rng.gen::<u8>(),
+        ];
+        for _ in 0..1000 {
             black_box(unsigned40::decode(black_box(v)));
         }
     });
 }
 
-fn signed48_encode_100_values(bench: &mut Bencher) {
+fn signed48_encode_1000_values(bench: &mut Bencher) {
     let mut rng = SmallRng::from_entropy();
     bench.iter(|| {
         let x = rng.gen::<f32>() - 0.5;
         let y = rng.gen::<f32>() - 0.5;
         let z = rng.gen::<f32>() - 0.5;
-        for _ in 0..100 {
+        for _ in 0..1000 {
             black_box(signed48::encode(black_box((x, y, z))));
         }
     });
 }
 
-fn signed48_decode_100_values(bench: &mut Bencher) {
+fn signed48_decode_1000_values(bench: &mut Bencher) {
     let mut rng = SmallRng::from_entropy();
     bench.iter(|| {
         let v = rng.gen::<u64>() & 0x0000_FFFF_FFFF_FFFF;
-        for _ in 0..100 {
+        for _ in 0..1000 {
             black_box(signed48::decode(black_box(v)));
         }
     });
@@ -74,11 +80,11 @@ fn signed48_decode_100_values(bench: &mut Bencher) {
 
 benchmark_group!(
     benches,
-    unsigned32_encode_100_values,
-    unsigned32_decode_100_values,
-    unsigned40_encode_100_values,
-    unsigned40_decode_100_values,
-    signed48_encode_100_values,
-    signed48_decode_100_values,
+    unsigned32_encode_1000_values,
+    unsigned32_decode_1000_values,
+    unsigned40_encode_1000_values,
+    unsigned40_decode_1000_values,
+    signed48_encode_1000_values,
+    signed48_decode_1000_values,
 );
 benchmark_main!(benches);
diff --git a/sub_crates/trifloat/src/unsigned40.rs b/sub_crates/trifloat/src/unsigned40.rs
index 5fcd8ac..3766d41 100644
--- a/sub_crates/trifloat/src/unsigned40.rs
+++ b/sub_crates/trifloat/src/unsigned40.rs
@@ -29,14 +29,25 @@ const EXP_BIAS: i32 = 32;
 /// Input floats larger than `MAX` will saturate to `MAX`, including infinity.
 /// Values are converted to trifloat precision by rounding down.
 ///
-/// Only the lower 40 bits of the return value are used.  The highest 24 bits
-/// will all be zero and can be safely discarded.
-///
 /// Warning: negative values and NaN's are _not_ supported by the trifloat
 /// format.  There are debug-only assertions in place to catch such
 /// values in the input floats.
 #[inline]
-pub fn encode(floats: (f32, f32, f32)) -> u64 {
+pub fn encode(floats: (f32, f32, f32)) -> [u8; 5] {
+    u64_to_bytes(encode_64(floats))
+}
+
+/// Decodes an unsigned 40-bit trifloat into three full floating point numbers.
+///
+/// This operation is lossless and cannot fail.
+#[inline]
+pub fn decode(trifloat: [u8; 5]) -> (f32, f32, f32) {
+    decode_64(bytes_to_u64(trifloat))
+}
+
+// Workhorse encode function, which operates on u64.
+#[inline(always)]
+fn encode_64(floats: (f32, f32, f32)) -> u64 {
     debug_assert!(
         floats.0 >= 0.0
             && floats.1 >= 0.0
@@ -67,13 +78,9 @@ pub fn encode(floats: (f32, f32, f32)) -> u64 {
     }
 }
 
-/// Decodes an unsigned 40-bit trifloat into three full floating point numbers.
-///
-/// This operation is lossless and cannot fail.  Only the lower 40 bits of the
-/// input value are used--the upper 24 bits can safely be anything and are
-/// ignored.
-#[inline]
-pub fn decode(trifloat: u64) -> (f32, f32, f32) {
+// Workhorse decode function, which operates on u64.
+#[inline(always)]
+fn decode_64(trifloat: u64) -> (f32, f32, f32) {
     // Unpack values.
     let x = trifloat >> (11 + 11 + 7);
     let y = (trifloat >> (11 + 7)) & 0b111_1111_1111;
@@ -89,6 +96,29 @@ pub fn decode(trifloat: u64) -> (f32, f32, f32) {
     )
 }
 
+#[inline(always)]
+fn u64_to_bytes(n: u64) -> [u8; 5] {
+    let a = n.to_ne_bytes();
+    let mut b = [0u8; 5];
+    if cfg!(target_endian = "big") {
+        (&mut b[..]).copy_from_slice(&a[3..8]);
+    } else {
+        (&mut b[..]).copy_from_slice(&a[0..5]);
+    }
+    b
+}
+
+#[inline(always)]
+fn bytes_to_u64(a: [u8; 5]) -> u64 {
+    let mut b = [0u8; 8];
+    if cfg!(target_endian = "big") {
+        (&mut b[3..8]).copy_from_slice(&a[..]);
+    } else {
+        (&mut b[0..5]).copy_from_slice(&a[..]);
+    }
+    u64::from_ne_bytes(b)
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -101,8 +131,8 @@ mod tests {
     fn all_zeros() {
         let fs = (0.0f32, 0.0f32, 0.0f32);
 
-        let tri = encode(fs);
-        let fs2 = decode(tri);
+        let tri = encode_64(fs);
+        let fs2 = decode_64(tri);
 
         assert_eq!(tri, 0u64);
         assert_eq!(fs, fs2);
@@ -154,7 +184,7 @@ mod tests {
         let fs = (1.0e+30, 1.0e+30, 1.0e+30);
 
         assert_eq!((MAX, MAX, MAX), round_trip(fs));
-        assert_eq!((MAX, MAX, MAX), decode(0xff_ffff_ffff));
+        assert_eq!((MAX, MAX, MAX), decode_64(0xff_ffff_ffff));
     }
 
     #[test]
@@ -163,7 +193,7 @@ mod tests {
         let fs = (INFINITY, 0.0, 0.0);
 
         assert_eq!((MAX, 0.0, 0.0), round_trip(fs));
-        assert_eq!(0xffe000007f, encode(fs));
+        assert_eq!(0xffe000007f, encode_64(fs));
     }
 
     #[test]
@@ -184,13 +214,13 @@ mod tests {
     fn smallest_value() {
         let fs = (MIN * 1.5, MIN, MIN * 0.5);
         assert_eq!((MIN, MIN, 0.0), round_trip(fs));
-        assert_eq!((MIN, MIN, 0.0), decode(0x20_04_00_00));
+        assert_eq!((MIN, MIN, 0.0), decode_64(0x20_04_00_00));
     }
 
     #[test]
     fn underflow() {
         let fs = (MIN * 0.99, 0.0, 0.0);
-        assert_eq!(0, encode(fs));
+        assert_eq!(0, encode_64(fs));
         assert_eq!((0.0, 0.0, 0.0), round_trip(fs));
     }
 

From 49c97bf0fe99ae1117e9056f35f82a52563a3d60 Mon Sep 17 00:00:00 2001
From: Nathan Vegdahl <cessen@cessen.com>
Date: Sat, 19 Sep 2020 08:53:53 +0900
Subject: [PATCH 10/25] Cleaned up the signed48 trifloat code.

---
 sub_crates/trifloat/src/signed48.rs | 127 +++++++++++-----------------
 1 file changed, 51 insertions(+), 76 deletions(-)

diff --git a/sub_crates/trifloat/src/signed48.rs b/sub_crates/trifloat/src/signed48.rs
index 5dd99bf..5b56cd3 100644
--- a/sub_crates/trifloat/src/signed48.rs
+++ b/sub_crates/trifloat/src/signed48.rs
@@ -3,9 +3,9 @@
 //! The encoding uses 13 bits of mantissa and 1 sign bit per number, and 6
 //! bits for the shared exponent. The bit layout is: [sign 1, mantissa 1,
 //! sign 2, mantissa 2, sign 3, mantissa 3, exponent].  The exponent is stored
-//! as an unsigned integer with a bias of 25.
+//! as an unsigned integer with a bias of 26.
 //!
-//! The largest representable number is `2^38 - 2^25`, and the smallest
+//! The largest representable number is just under `2^38`, and the smallest
 //! representable positive number is `2^-38`.
 //!
 //! Since the exponent is shared between all three values, the precision
@@ -18,32 +18,29 @@
 use crate::{fiddle_exp2, fiddle_log2};
 
 /// Largest representable number.
-pub const MAX: f32 = 274_844_352_512.0;
+pub const MAX: f32 = ((1u128 << (64 - EXP_BIAS)) - (1 << (64 - EXP_BIAS - 13))) as f32;
 
 /// Smallest representable number.
 ///
 /// Note this is not the smallest _magnitude_ number.  This is a negative
 /// number of large magnitude.
-pub const MIN: f32 = -274_844_352_512.0;
+pub const MIN: f32 = -MAX;
 
 /// Smallest representable positive number.
 ///
 /// This is the number with the smallest possible magnitude (aside from zero).
-#[allow(clippy::excessive_precision)]
-pub const MIN_POSITIVE: f32 = 0.000_000_000_003_637_978_807_091_713;
+pub const MIN_POSITIVE: f32 = 1.0 / (1u128 << (EXP_BIAS + 12)) as f32;
 
 /// Difference between 1.0 and the next largest representable number.
 pub const EPSILON: f32 = 1.0 / 4096.0;
 
-const EXP_BIAS: i32 = 25;
-const MIN_EXP: i32 = 0 - EXP_BIAS;
-const MAX_EXP: i32 = 63 - EXP_BIAS;
+const EXP_BIAS: i32 = 26;
 
 /// Encodes three floating point values into a signed 48-bit trifloat.
 ///
 /// Input floats that are larger than `MAX` or smaller than `MIN` will saturate
 /// to `MAX` and `MIN` respectively, including +/- infinity.  Values are
-/// converted to trifloat precision by rounding.
+/// converted to trifloat precision by rounding towards zero.
 ///
 /// Only the lower 48 bits of the return value are used.  The highest 16 bits
 /// will all be zero and can be safely discarded.
@@ -62,46 +59,31 @@ pub fn encode(floats: (f32, f32, f32)) -> u64 {
         floats.2
     );
 
-    // Find the largest (in magnitude) of the three values.
-    let largest_value = {
-        let mut largest_value: f32 = 0.0;
-        if floats.0.abs() > largest_value.abs() {
-            largest_value = floats.0;
-        }
-        if floats.1.abs() > largest_value.abs() {
-            largest_value = floats.1;
-        }
-        if floats.2.abs() > largest_value.abs() {
-            largest_value = floats.2;
-        }
-        largest_value
-    };
+    let floats_abs = (floats.0.abs(), floats.1.abs(), floats.2.abs());
 
-    // Calculate the exponent and 1.0/multiplier for encoding the values.
-    let (exponent, inv_multiplier) = {
-        let mut exponent = (fiddle_log2(largest_value) + 1).max(MIN_EXP).min(MAX_EXP);
-        let mut inv_multiplier = fiddle_exp2(-exponent + 13);
+    let largest_abs = floats_abs.0.max(floats_abs.1.max(floats_abs.2));
 
-        // Edge-case: make sure rounding pushes the largest value up
-        // appropriately if needed.
-        if (largest_value * inv_multiplier).abs() + 0.5 >= 8192.0 {
-            exponent = (exponent + 1).min(MAX_EXP);
-            inv_multiplier = fiddle_exp2(-exponent + 13);
-        }
-        (exponent, inv_multiplier)
-    };
+    if largest_abs < MIN_POSITIVE {
+        0
+    } else {
+        let e = fiddle_log2(largest_abs).max(-EXP_BIAS).min(63 - EXP_BIAS);
+        let inv_multiplier = fiddle_exp2(-e + 12);
 
-    // Quantize and encode values.
-    let x = (floats.0.abs() * inv_multiplier + 0.5).min(8191.0) as u64 & 0b111_11111_11111;
-    let x_sign = (floats.0.to_bits() >> 31) as u64;
-    let y = (floats.1.abs() * inv_multiplier + 0.5).min(8191.0) as u64 & 0b111_11111_11111;
-    let y_sign = (floats.1.to_bits() >> 31) as u64;
-    let z = (floats.2.abs() * inv_multiplier + 0.5).min(8191.0) as u64 & 0b111_11111_11111;
-    let z_sign = (floats.2.to_bits() >> 31) as u64;
-    let e = (exponent + EXP_BIAS) as u64 & 0b111_111;
+        let x_sign = (floats.0.to_bits() >> 31) as u64;
+        let x = (floats_abs.0 * inv_multiplier).min(8191.0) as u64;
+        let y_sign = (floats.1.to_bits() >> 31) as u64;
+        let y = (floats_abs.1 * inv_multiplier).min(8191.0) as u64;
+        let z_sign = (floats.2.to_bits() >> 31) as u64;
+        let z = (floats_abs.2 * inv_multiplier).min(8191.0) as u64;
 
-    // Pack values into a single u64 and return.
-    (x_sign << 47) | (x << 34) | (y_sign << 33) | (y << 20) | (z_sign << 19) | (z << 6) | e
+        (x_sign << 47)
+            | (x << 34)
+            | (y_sign << 33)
+            | (y << 20)
+            | (z_sign << 19)
+            | (z << 6)
+            | (e + EXP_BIAS) as u64
+    }
 }
 
 /// Decodes a signed 48-bit trifloat into three full floating point numbers.
@@ -122,7 +104,7 @@ pub fn decode(trifloat: u64) -> (f32, f32, f32) {
 
     let e = trifloat & 0b111_111;
 
-    let multiplier = fiddle_exp2(e as i32 - EXP_BIAS - 13);
+    let multiplier = fiddle_exp2(e as i32 - EXP_BIAS - 12);
 
     (
         f32::from_bits((x as f32 * multiplier).to_bits() | x_sign),
@@ -153,7 +135,7 @@ mod tests {
     #[test]
     fn powers_of_two() {
         let fs = (8.0f32, 128.0f32, 0.5f32);
-        assert_eq!(round_trip(fs), fs);
+        assert_eq!(fs, round_trip(fs));
     }
 
     #[test]
@@ -196,18 +178,11 @@ mod tests {
     }
 
     #[test]
-    fn rounding() {
+    fn precision_floor() {
         let fs = (7.0f32, 8193.0f32, -1.0f32);
         let fsn = (-7.0f32, -8193.0f32, 1.0f32);
-        assert_eq!(round_trip(fs), (8.0, 8194.0, -2.0));
-        assert_eq!(round_trip(fsn), (-8.0, -8194.0, 2.0));
-    }
-
-    #[test]
-    fn rounding_edge_case() {
-        let fs = (16383.0f32, 0.0f32, 0.0f32);
-
-        assert_eq!(round_trip(fs), (16384.0, 0.0, 0.0),);
+        assert_eq!((6.0, 8192.0, -0.0), round_trip(fs));
+        assert_eq!((-6.0, -8192.0, 0.0), round_trip(fsn));
     }
 
     #[test]
@@ -223,10 +198,10 @@ mod tests {
             -99_999_999_999_999.0,
         );
 
-        assert_eq!(round_trip(fs), (MAX, MAX, MAX));
-        assert_eq!(round_trip(fsn), (MIN, MIN, MIN));
-        assert_eq!(decode(0x7FFD_FFF7_FFFF), (MAX, MAX, MAX));
-        assert_eq!(decode(0xFFFF_FFFF_FFFF), (MIN, MIN, MIN));
+        assert_eq!((MAX, MAX, MAX), round_trip(fs));
+        assert_eq!((MIN, MIN, MIN), round_trip(fsn));
+        assert_eq!((MAX, MAX, MAX), decode(0x7FFD_FFF7_FFFF));
+        assert_eq!((MIN, MIN, MIN), decode(0xFFFF_FFFF_FFFF));
     }
 
     #[test]
@@ -235,10 +210,10 @@ mod tests {
         let fs = (INFINITY, 0.0, 0.0);
         let fsn = (-INFINITY, 0.0, 0.0);
 
-        assert_eq!(round_trip(fs), (MAX, 0.0, 0.0));
-        assert_eq!(round_trip(fsn), (MIN, 0.0, 0.0));
-        assert_eq!(encode(fs), 0x7FFC0000003F);
-        assert_eq!(encode(fsn), 0xFFFC0000003F);
+        assert_eq!((MAX, 0.0, 0.0), round_trip(fs));
+        assert_eq!((MIN, 0.0, 0.0), round_trip(fsn));
+        assert_eq!(0x7FFC0000003F, encode(fs));
+        assert_eq!(0xFFFC0000003F, encode(fsn));
     }
 
     #[test]
@@ -246,25 +221,25 @@ mod tests {
         let fs = (99_999_999_999_999.0, 4294967296.0, -17179869184.0);
         let fsn = (-99_999_999_999_999.0, 4294967296.0, -17179869184.0);
 
-        assert_eq!(round_trip(fs), (MAX, 4294967296.0, -17179869184.0));
-        assert_eq!(round_trip(fsn), (MIN, 4294967296.0, -17179869184.0));
+        assert_eq!((MAX, 4294967296.0, -17179869184.0), round_trip(fs));
+        assert_eq!((MIN, 4294967296.0, -17179869184.0), round_trip(fsn));
     }
 
     #[test]
     fn smallest_value() {
-        let fs = (MIN_POSITIVE, MIN_POSITIVE * 0.5, MIN_POSITIVE * 0.49);
-        let fsn = (-MIN_POSITIVE, -MIN_POSITIVE * 0.5, -MIN_POSITIVE * 0.49);
+        let fs = (MIN_POSITIVE * 1.5, MIN_POSITIVE, MIN_POSITIVE * 0.50);
+        let fsn = (-MIN_POSITIVE * 1.5, -MIN_POSITIVE, -MIN_POSITIVE * 0.50);
 
-        assert_eq!(decode(0x600100000), (MIN_POSITIVE, -MIN_POSITIVE, 0.0));
-        assert_eq!(round_trip(fs), (MIN_POSITIVE, MIN_POSITIVE, 0.0));
-        assert_eq!(round_trip(fsn), (-MIN_POSITIVE, -MIN_POSITIVE, -0.0));
+        assert_eq!((MIN_POSITIVE, -MIN_POSITIVE, 0.0), decode(0x600100000));
+        assert_eq!((MIN_POSITIVE, MIN_POSITIVE, 0.0), round_trip(fs));
+        assert_eq!((-MIN_POSITIVE, -MIN_POSITIVE, -0.0), round_trip(fsn));
     }
 
     #[test]
     fn underflow() {
-        let fs = (MIN_POSITIVE * 0.49, -MIN_POSITIVE * 0.49, 0.0);
-        assert_eq!(encode(fs), 0x200000000);
-        assert_eq!(round_trip(fs), (0.0, -0.0, 0.0));
+        let fs = (MIN_POSITIVE * 0.5, -MIN_POSITIVE * 0.5, MIN_POSITIVE);
+        assert_eq!(0x200000040, encode(fs));
+        assert_eq!((0.0, -0.0, MIN_POSITIVE), round_trip(fs));
     }
 
     #[test]

From 66e9abe66e56bc4c19d8fbae00533446c4acc845 Mon Sep 17 00:00:00 2001
From: Nathan Vegdahl <cessen@cessen.com>
Date: Sat, 19 Sep 2020 12:22:51 +0900
Subject: [PATCH 11/25] Make signed48 trifloat encoding also a byte array.

Same reason as for the unsigned40 encoding in an earlier commit.
---
 sub_crates/trifloat/benches/bench.rs |  9 +++-
 sub_crates/trifloat/src/signed48.rs  | 80 +++++++++++++++++++---------
 2 files changed, 63 insertions(+), 26 deletions(-)

diff --git a/sub_crates/trifloat/benches/bench.rs b/sub_crates/trifloat/benches/bench.rs
index 3dcd704..fa910ac 100644
--- a/sub_crates/trifloat/benches/bench.rs
+++ b/sub_crates/trifloat/benches/bench.rs
@@ -69,7 +69,14 @@ fn signed48_encode_1000_values(bench: &mut Bencher) {
 fn signed48_decode_1000_values(bench: &mut Bencher) {
     let mut rng = SmallRng::from_entropy();
     bench.iter(|| {
-        let v = rng.gen::<u64>() & 0x0000_FFFF_FFFF_FFFF;
+        let v = [
+            rng.gen::<u8>(),
+            rng.gen::<u8>(),
+            rng.gen::<u8>(),
+            rng.gen::<u8>(),
+            rng.gen::<u8>(),
+            rng.gen::<u8>(),
+        ];
         for _ in 0..1000 {
             black_box(signed48::decode(black_box(v)));
         }
diff --git a/sub_crates/trifloat/src/signed48.rs b/sub_crates/trifloat/src/signed48.rs
index 5b56cd3..f43274d 100644
--- a/sub_crates/trifloat/src/signed48.rs
+++ b/sub_crates/trifloat/src/signed48.rs
@@ -42,13 +42,24 @@ const EXP_BIAS: i32 = 26;
 /// to `MAX` and `MIN` respectively, including +/- infinity.  Values are
 /// converted to trifloat precision by rounding towards zero.
 ///
-/// Only the lower 48 bits of the return value are used.  The highest 16 bits
-/// will all be zero and can be safely discarded.
-///
 /// Warning: NaN's are _not_ supported by the trifloat format.  There are
 /// debug-only assertions in place to catch such values in the input floats.
 #[inline]
-pub fn encode(floats: (f32, f32, f32)) -> u64 {
+pub fn encode(floats: (f32, f32, f32)) -> [u8; 6] {
+    u64_to_bytes(encode_64(floats))
+}
+
+/// Decodes a signed 48-bit trifloat into three full floating point numbers.
+///
+/// This operation is lossless and cannot fail.
+#[inline]
+pub fn decode(trifloat: [u8; 6]) -> (f32, f32, f32) {
+    decode_64(bytes_to_u64(trifloat))
+}
+
+// Workhorse encode function, which operates on u64.
+#[inline(always)]
+fn encode_64(floats: (f32, f32, f32)) -> u64 {
     debug_assert!(
         !floats.0.is_nan() && !floats.1.is_nan() && !floats.2.is_nan(),
         "trifloat::signed48::encode(): encoding to signed tri-floats only \
@@ -86,13 +97,9 @@ pub fn encode(floats: (f32, f32, f32)) -> u64 {
     }
 }
 
-/// Decodes a signed 48-bit trifloat into three full floating point numbers.
-///
-/// This operation is lossless and cannot fail.  Only the lower 48 bits of the
-/// input value are used--the upper 16 bits can safely be anything and are
-/// ignored.
-#[inline]
-pub fn decode(trifloat: u64) -> (f32, f32, f32) {
+// Workhorse decode function, which operates on u64.
+#[inline(always)]
+fn decode_64(trifloat: u64) -> (f32, f32, f32) {
     // Unpack values.
     let x = (trifloat >> 34) & 0b111_11111_11111;
     let y = (trifloat >> 20) & 0b111_11111_11111;
@@ -113,6 +120,29 @@ pub fn decode(trifloat: u64) -> (f32, f32, f32) {
     )
 }
 
+#[inline(always)]
+fn u64_to_bytes(n: u64) -> [u8; 6] {
+    let a = n.to_ne_bytes();
+    let mut b = [0u8; 6];
+    if cfg!(target_endian = "big") {
+        (&mut b[..]).copy_from_slice(&a[2..8]);
+    } else {
+        (&mut b[..]).copy_from_slice(&a[0..6]);
+    }
+    b
+}
+
+#[inline(always)]
+fn bytes_to_u64(a: [u8; 6]) -> u64 {
+    let mut b = [0u8; 8];
+    if cfg!(target_endian = "big") {
+        (&mut b[2..8]).copy_from_slice(&a[..]);
+    } else {
+        (&mut b[0..6]).copy_from_slice(&a[..]);
+    }
+    u64::from_ne_bytes(b)
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -125,8 +155,8 @@ mod tests {
     fn all_zeros() {
         let fs = (0.0f32, 0.0f32, 0.0f32);
 
-        let tri = encode(fs);
-        let fs2 = decode(tri);
+        let tri = encode_64(fs);
+        let fs2 = decode_64(tri);
 
         assert_eq!(tri, 0);
         assert_eq!(fs, fs2);
@@ -200,8 +230,8 @@ mod tests {
 
         assert_eq!((MAX, MAX, MAX), round_trip(fs));
         assert_eq!((MIN, MIN, MIN), round_trip(fsn));
-        assert_eq!((MAX, MAX, MAX), decode(0x7FFD_FFF7_FFFF));
-        assert_eq!((MIN, MIN, MIN), decode(0xFFFF_FFFF_FFFF));
+        assert_eq!((MAX, MAX, MAX), decode_64(0x7FFD_FFF7_FFFF));
+        assert_eq!((MIN, MIN, MIN), decode_64(0xFFFF_FFFF_FFFF));
     }
 
     #[test]
@@ -212,8 +242,8 @@ mod tests {
 
         assert_eq!((MAX, 0.0, 0.0), round_trip(fs));
         assert_eq!((MIN, 0.0, 0.0), round_trip(fsn));
-        assert_eq!(0x7FFC0000003F, encode(fs));
-        assert_eq!(0xFFFC0000003F, encode(fsn));
+        assert_eq!(0x7FFC0000003F, encode_64(fs));
+        assert_eq!(0xFFFC0000003F, encode_64(fsn));
     }
 
     #[test]
@@ -230,7 +260,7 @@ mod tests {
         let fs = (MIN_POSITIVE * 1.5, MIN_POSITIVE, MIN_POSITIVE * 0.50);
         let fsn = (-MIN_POSITIVE * 1.5, -MIN_POSITIVE, -MIN_POSITIVE * 0.50);
 
-        assert_eq!((MIN_POSITIVE, -MIN_POSITIVE, 0.0), decode(0x600100000));
+        assert_eq!((MIN_POSITIVE, -MIN_POSITIVE, 0.0), decode_64(0x600100000));
         assert_eq!((MIN_POSITIVE, MIN_POSITIVE, 0.0), round_trip(fs));
         assert_eq!((-MIN_POSITIVE, -MIN_POSITIVE, -0.0), round_trip(fsn));
     }
@@ -238,7 +268,7 @@ mod tests {
     #[test]
     fn underflow() {
         let fs = (MIN_POSITIVE * 0.5, -MIN_POSITIVE * 0.5, MIN_POSITIVE);
-        assert_eq!(0x200000040, encode(fs));
+        assert_eq!(0x200000040, encode_64(fs));
         assert_eq!((0.0, -0.0, MIN_POSITIVE), round_trip(fs));
     }
 
@@ -248,13 +278,13 @@ mod tests {
         let fs2 = (-63456254.2, 5235423.53, 54353.3);
         let fs3 = (-0.000000634, 0.00000000005, 0.00000000892);
 
-        let n1 = encode(fs1);
-        let n2 = encode(fs2);
-        let n3 = encode(fs3);
+        let n1 = encode_64(fs1);
+        let n2 = encode_64(fs2);
+        let n3 = encode_64(fs3);
 
-        assert_eq!(decode(n1), decode(n1 | 0xffff_0000_0000_0000));
-        assert_eq!(decode(n2), decode(n2 | 0xffff_0000_0000_0000));
-        assert_eq!(decode(n3), decode(n3 | 0xffff_0000_0000_0000));
+        assert_eq!(decode_64(n1), decode_64(n1 | 0xffff_0000_0000_0000));
+        assert_eq!(decode_64(n2), decode_64(n2 | 0xffff_0000_0000_0000));
+        assert_eq!(decode_64(n3), decode_64(n3 | 0xffff_0000_0000_0000));
     }
 
     #[test]

From 485da9f918be159d5c2b8b426dbe79184c8a40f7 Mon Sep 17 00:00:00 2001
From: Nathan Vegdahl <cessen@cessen.com>
Date: Sat, 19 Sep 2020 19:51:40 +0900
Subject: [PATCH 12/25] Add a new 32-bit Luv color format.

It's based on the SGI LogLuv format, but using a floating point
instead of log encoding for the luminance.
---
 sub_crates/trifloat/benches/bench.rs |  26 ++-
 sub_crates/trifloat/src/lib.rs       |   1 +
 sub_crates/trifloat/src/luv32.rs     | 259 +++++++++++++++++++++++++++
 3 files changed, 285 insertions(+), 1 deletion(-)
 create mode 100644 sub_crates/trifloat/src/luv32.rs

diff --git a/sub_crates/trifloat/benches/bench.rs b/sub_crates/trifloat/benches/bench.rs
index fa910ac..e9d7feb 100644
--- a/sub_crates/trifloat/benches/bench.rs
+++ b/sub_crates/trifloat/benches/bench.rs
@@ -1,6 +1,6 @@
 use bencher::{benchmark_group, benchmark_main, black_box, Bencher};
 use rand::{rngs::SmallRng, FromEntropy, Rng};
-use trifloat::{signed48, unsigned32, unsigned40};
+use trifloat::{luv32, signed48, unsigned32, unsigned40};
 
 //----
 
@@ -83,6 +83,28 @@ fn signed48_decode_1000_values(bench: &mut Bencher) {
     });
 }
 
+fn luv32_encode_1000_values(bench: &mut Bencher) {
+    let mut rng = SmallRng::from_entropy();
+    bench.iter(|| {
+        let x = rng.gen::<f32>();
+        let y = rng.gen::<f32>();
+        let z = rng.gen::<f32>();
+        for _ in 0..1000 {
+            black_box(luv32::encode(black_box((x, y, z))));
+        }
+    });
+}
+
+fn luv32_decode_1000_values(bench: &mut Bencher) {
+    let mut rng = SmallRng::from_entropy();
+    bench.iter(|| {
+        let v = rng.gen::<u32>();
+        for _ in 0..1000 {
+            black_box(luv32::decode(black_box(v)));
+        }
+    });
+}
+
 //----
 
 benchmark_group!(
@@ -93,5 +115,7 @@ benchmark_group!(
     unsigned40_decode_1000_values,
     signed48_encode_1000_values,
     signed48_decode_1000_values,
+    luv32_encode_1000_values,
+    luv32_decode_1000_values,
 );
 benchmark_main!(benches);
diff --git a/sub_crates/trifloat/src/lib.rs b/sub_crates/trifloat/src/lib.rs
index 2e1afa2..51a9924 100644
--- a/sub_crates/trifloat/src/lib.rs
+++ b/sub_crates/trifloat/src/lib.rs
@@ -4,6 +4,7 @@
 //! The motivating use-case for this is compactly storing HDR RGB colors.  But
 //! it may be useful for other things as well.
 
+pub mod luv32;
 pub mod signed48;
 pub mod unsigned32;
 pub mod unsigned40;
diff --git a/sub_crates/trifloat/src/luv32.rs b/sub_crates/trifloat/src/luv32.rs
new file mode 100644
index 0000000..aa35858
--- /dev/null
+++ b/sub_crates/trifloat/src/luv32.rs
@@ -0,0 +1,259 @@
+//! Encoding/decoding for 32-bit HDR Luv color format.
+//!
+//! This encoding is based on the ideas behind the SGI LogLUV format,
+//! but using a floating point rather than log encoding to store the L
+//! component for the sake of faster encoding/decoding.
+//!
+//! The encoding uses 16 bits for the L component, and 8 bits each for the
+//! u and v components.  The L component's 16 bits are split into 10 bits of
+//! mantissa and 6 bits of exponent.  The mantissa uses an implicit-leading-1
+//! format, giving it 11 bits of precision, and the exponent bias is 26.
+//!
+//! The format encodes from, and decodes to, CIE XYZ color values.
+//!
+//! This format is explicitly designed to support HDR color, with a supported
+//! dynamic range of about 63 stops.  Specifically, the largest supported input
+//! Y value is just under `2^38`, and the smallest (non-zero) Y is `2^-25`.  Y
+//! values smaller than that range will underflow to zero, and larger will
+//! saturate to the max value.
+
+#![allow(clippy::cast_lossless)]
+
+const EXP_BIAS: i32 = 26;
+const UV_SCALE: f32 = 410.0;
+
+/// Largest representable Y component.
+pub const Y_MAX: f32 = ((1u64 << (64 - EXP_BIAS)) - (1u64 << (64 - EXP_BIAS - 11))) as f32;
+
+/// Smallest representable non-zero Y component.
+pub const Y_MIN: f32 = 1.0 / (1u64 << (EXP_BIAS - 1)) as f32;
+
+/// Encodes a CIE XYZ triplet into the 32-bit Luv format.
+#[inline]
+pub fn encode(xyz: (f32, f32, f32)) -> u32 {
+    debug_assert!(
+        xyz.0 >= 0.0
+            && xyz.1 >= 0.0
+            && xyz.2 >= 0.0
+            && !xyz.0.is_nan()
+            && !xyz.1.is_nan()
+            && !xyz.2.is_nan(),
+        "trifloat::yuv32::encode(): encoding to yuv32 only \
+         works correctly for positive, non-NaN numbers, but the numbers passed \
+         were: ({}, {}, {})",
+        xyz.0,
+        xyz.1,
+        xyz.2
+    );
+
+    // Special case: if Y is infinite, saturate to the brightest
+    // white, since with infinities we have no reasonable basis
+    // for determining chromaticity.
+    if xyz.1.is_infinite() {
+        let s = 1.0 + (15.0 * 1.0) + (3.0 * 1.0);
+        let u = ((4.0 * UV_SCALE) * 1.0 / s) as u32;
+        let v = ((9.0 * UV_SCALE) * 1.0 / s) as u32;
+        return 0xffff0000 | (u << 8) | v;
+    }
+
+    let s = xyz.0 + (15.0 * xyz.1) + (3.0 * xyz.2);
+    let u = ((4.0 * UV_SCALE) * xyz.0 / s).max(0.0).min(255.0) as u32;
+    let v = ((9.0 * UV_SCALE) * xyz.1 / s).max(0.0).min(255.0) as u32;
+
+    let (l_exp, l_mant) = {
+        let n = xyz.1.to_bits();
+        let exp = (n >> 23) as i32 - 127 + EXP_BIAS;
+        if exp <= 0 {
+            return 0;
+        } else if exp > 63 {
+            (63, 0b11_1111_1111)
+        } else {
+            (exp as u32, (n & 0x7fffff) >> 13)
+        }
+    };
+
+    (l_exp << 26) | (l_mant << 16) | (u << 8) | v
+}
+
+/// Decodes a 32-bit Luv formatted value into a CIE XYZ triplet.
+#[inline]
+pub fn decode(luv32: u32) -> (f32, f32, f32) {
+    // Unpack values.
+    let l_exp = luv32 >> 26;
+    let l_mant = (luv32 >> 16) & 0b11_1111_1111;
+    let u = ((luv32 >> 8) & 0xff) as f32 * (1.0 / UV_SCALE);
+    let v4 = (luv32 & 0xff).max(1) as f32 * (4.0 / UV_SCALE); // 4 * v
+
+    if l_exp == 0 {
+        return (0.0, 0.0, 0.0);
+    }
+
+    let y = f32::from_bits(((l_exp + 127 - EXP_BIAS as u32) << 23) | (l_mant << 13));
+    let x = y * u * (9.0 / v4); // y * (9u / 4v)
+    let z = (y / v4) * (12.0 - (3.0 * u) - (5.0 * v4)); // y * ((12 - 3u - 20v) / 4v)
+
+    (x, y, z.max(0.0))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn round_trip(floats: (f32, f32, f32)) -> (f32, f32, f32) {
+        decode(encode(floats))
+    }
+
+    #[test]
+    fn all_zeros() {
+        let fs = (0.0f32, 0.0f32, 0.0f32);
+
+        let tri = encode(fs);
+        let fs2 = decode(tri);
+
+        assert_eq!(tri, 0u32);
+        assert_eq!(fs, fs2);
+    }
+
+    #[test]
+    fn powers_of_two() {
+        let mut n = 0.25;
+        for _ in 0..20 {
+            let a = (n as f32, n as f32, n as f32);
+            let b = round_trip(a);
+
+            let rd0 = 2.0 * (a.0 - b.0).abs() / (a.0 + b.0);
+            let rd2 = 2.0 * (a.2 - b.2).abs() / (a.2 + b.2);
+
+            assert_eq!(a.1, b.1);
+            assert!(rd0 < 0.01);
+            assert!(rd2 < 0.01);
+
+            n *= 2.0;
+        }
+    }
+
+    #[test]
+    fn accuracy_01() {
+        let mut n = 1.0;
+        for _ in 0..1024 {
+            let a = (n as f32, n as f32, n as f32);
+            let b = round_trip(a);
+
+            let rd0 = 2.0 * (a.0 - b.0).abs() / (a.0 + b.0);
+            let rd2 = 2.0 * (a.2 - b.2).abs() / (a.2 + b.2);
+
+            assert_eq!(a.1, b.1);
+            assert!(rd0 < 0.01);
+            assert!(rd2 < 0.01);
+
+            n += 1.0 / 1024.0;
+        }
+    }
+
+    #[test]
+    #[should_panic]
+    fn accuracy_02() {
+        let mut n = 1.0;
+        for _ in 0..2048 {
+            let a = (n as f32, n as f32, n as f32);
+            let b = round_trip(a);
+            assert_eq!(a.1, b.1);
+            n += 1.0 / 2048.0;
+        }
+    }
+
+    #[test]
+    fn integers() {
+        for n in 1..=512 {
+            let a = (n as f32, n as f32, n as f32);
+            let b = round_trip(a);
+
+            let rd0 = 2.0 * (a.0 - b.0).abs() / (a.0 + b.0);
+            let rd2 = 2.0 * (a.2 - b.2).abs() / (a.2 + b.2);
+
+            assert_eq!(a.1, b.1);
+            assert!(rd0 < 0.01);
+            assert!(rd2 < 0.01);
+        }
+    }
+
+    #[test]
+    fn precision_floor() {
+        let a = (2049.0f32, 2049.0f32, 2049.0f32);
+        let b = round_trip(a);
+        assert_eq!(2048.0, b.1);
+    }
+
+    #[test]
+    fn saturate_y() {
+        let fs = (1.0e+20, 1.0e+20, 1.0e+20);
+
+        assert_eq!(Y_MAX, round_trip(fs).1);
+        assert_eq!(Y_MAX, decode(0xFFFFFFFF).1);
+    }
+
+    #[test]
+    fn inf_saturate() {
+        use std::f32::INFINITY;
+        let fs = (INFINITY, INFINITY, INFINITY);
+
+        assert_eq!(Y_MAX, round_trip(fs).1);
+        assert_eq!(0xffff56c2, encode(fs));
+    }
+
+    #[test]
+    fn smallest_value() {
+        let a = (Y_MIN, Y_MIN, Y_MIN);
+        let b = (Y_MIN * 0.99, Y_MIN * 0.99, Y_MIN * 0.99);
+        assert_eq!(Y_MIN, round_trip(a).1);
+        assert_eq!(0.0, round_trip(b).1);
+    }
+
+    #[test]
+    fn underflow() {
+        let fs = (Y_MIN * 0.99, Y_MIN * 0.99, Y_MIN * 0.99);
+        assert_eq!(0, encode(fs));
+        assert_eq!((0.0, 0.0, 0.0), round_trip(fs));
+    }
+
+    #[test]
+    #[should_panic]
+    fn nans_01() {
+        encode((std::f32::NAN, 0.0, 0.0));
+    }
+
+    #[test]
+    #[should_panic]
+    fn nans_02() {
+        encode((0.0, std::f32::NAN, 0.0));
+    }
+
+    #[test]
+    #[should_panic]
+    fn nans_03() {
+        encode((0.0, 0.0, std::f32::NAN));
+    }
+
+    #[test]
+    #[should_panic]
+    fn negative_01() {
+        encode((-1.0, 0.0, 0.0));
+    }
+
+    #[test]
+    #[should_panic]
+    fn negative_02() {
+        encode((0.0, -1.0, 0.0));
+    }
+
+    #[test]
+    #[should_panic]
+    fn negative_03() {
+        encode((0.0, 0.0, -1.0));
+    }
+
+    #[test]
+    fn negative_04() {
+        encode((-0.0, -0.0, -0.0));
+    }
+}

From 03bedcb5949fe5727046a0f69c69f06c2ad890eb Mon Sep 17 00:00:00 2001
From: Nathan Vegdahl <cessen@cessen.com>
Date: Sat, 19 Sep 2020 23:57:59 +0900
Subject: [PATCH 13/25] Cleanup, tweaks, and better documentation for the
 32-bit Luv format.

---
 sub_crates/trifloat/benches/bench.rs          | 14 +--
 .../trifloat/src/{luv32.rs => fluv32.rs}      | 88 +++++++++++++------
 sub_crates/trifloat/src/lib.rs                |  2 +-
 3 files changed, 67 insertions(+), 37 deletions(-)
 rename sub_crates/trifloat/src/{luv32.rs => fluv32.rs} (65%)

diff --git a/sub_crates/trifloat/benches/bench.rs b/sub_crates/trifloat/benches/bench.rs
index e9d7feb..80582ab 100644
--- a/sub_crates/trifloat/benches/bench.rs
+++ b/sub_crates/trifloat/benches/bench.rs
@@ -1,6 +1,6 @@
 use bencher::{benchmark_group, benchmark_main, black_box, Bencher};
 use rand::{rngs::SmallRng, FromEntropy, Rng};
-use trifloat::{luv32, signed48, unsigned32, unsigned40};
+use trifloat::{fluv32, signed48, unsigned32, unsigned40};
 
 //----
 
@@ -83,24 +83,24 @@ fn signed48_decode_1000_values(bench: &mut Bencher) {
     });
 }
 
-fn luv32_encode_1000_values(bench: &mut Bencher) {
+fn fluv32_encode_1000_values(bench: &mut Bencher) {
     let mut rng = SmallRng::from_entropy();
     bench.iter(|| {
         let x = rng.gen::<f32>();
         let y = rng.gen::<f32>();
         let z = rng.gen::<f32>();
         for _ in 0..1000 {
-            black_box(luv32::encode(black_box((x, y, z))));
+            black_box(fluv32::encode(black_box((x, y, z))));
         }
     });
 }
 
-fn luv32_decode_1000_values(bench: &mut Bencher) {
+fn fluv32_decode_1000_values(bench: &mut Bencher) {
     let mut rng = SmallRng::from_entropy();
     bench.iter(|| {
         let v = rng.gen::<u32>();
         for _ in 0..1000 {
-            black_box(luv32::decode(black_box(v)));
+            black_box(fluv32::decode(black_box(v)));
         }
     });
 }
@@ -115,7 +115,7 @@ benchmark_group!(
     unsigned40_decode_1000_values,
     signed48_encode_1000_values,
     signed48_decode_1000_values,
-    luv32_encode_1000_values,
-    luv32_decode_1000_values,
+    fluv32_encode_1000_values,
+    fluv32_decode_1000_values,
 );
 benchmark_main!(benches);
diff --git a/sub_crates/trifloat/src/luv32.rs b/sub_crates/trifloat/src/fluv32.rs
similarity index 65%
rename from sub_crates/trifloat/src/luv32.rs
rename to sub_crates/trifloat/src/fluv32.rs
index aa35858..6d300df 100644
--- a/sub_crates/trifloat/src/luv32.rs
+++ b/sub_crates/trifloat/src/fluv32.rs
@@ -1,25 +1,51 @@
-//! Encoding/decoding for 32-bit HDR Luv color format.
+//! Encoding/decoding for the 32-bit FloatLuv color format.
 //!
-//! This encoding is based on the ideas behind the SGI LogLUV format,
-//! but using a floating point rather than log encoding to store the L
-//! component for the sake of faster encoding/decoding.
+//! This encoding is based on, but is slightly different than, the 32-bit
+//! LogLuv format from the paper "Overcoming Gamut and Dynamic Range
+//! Limitations in Digital Images" by Greg Ward.  It uses the same uv chroma
+//! storage, but uses a floating point rather than log encoding to store
+//! luminance, mainly for the sake of faster decoding.  It also omits the sign
+//! bit of LogLuv, foregoing negative luminance capabilities.
 //!
-//! The encoding uses 16 bits for the L component, and 8 bits each for the
-//! u and v components.  The L component's 16 bits are split into 10 bits of
-//! mantissa and 6 bits of exponent.  The mantissa uses an implicit-leading-1
-//! format, giving it 11 bits of precision, and the exponent bias is 26.
+//! Compared to LogLuv, this format's chroma precision is identical and its
+//! luminance precision is better, but its luminance *range* is smaller.
+//! The supported luminance range is still substantial, however (see
+//! "Luminance details" below).
 //!
-//! The format encodes from, and decodes to, CIE XYZ color values.
+//! Like the LogLuv format, this is an absolute rather than relative color
+//! encoding, and as such takes CIE XYZ triplets as input.  It is *not*
+//! designed to take arbitrary floating point triplets, and will perform poorly
+//! if e.g. passed RGB values.
 //!
-//! This format is explicitly designed to support HDR color, with a supported
-//! dynamic range of about 63 stops.  Specifically, the largest supported input
-//! Y value is just under `2^38`, and the smallest (non-zero) Y is `2^-25`.  Y
-//! values smaller than that range will underflow to zero, and larger will
-//! saturate to the max value.
+//! The bit layout is:
+//!
+//! 1. luminance exponent (6 bits, bias 27)
+//! 2. luminance mantissa (10 stored bits, 11 bits precision)
+//! 3. u (8 bits)
+//! 4. v (8 bits)
+//!
+//! ## Luminance details
+//!
+//! Quoting Greg Ward about luminance ranges:
+//!
+//! > The sun is about `10^8 cd/m^2`, and the underside of a rock on a moonless
+//! > night is probably around `10^-6` or so [...]
+//!
+//! The luminance range of this format is from about `10^11` on the brightest
+//! end, to about `10^-8` on the darkest (excluding zero itself, which can also
+//! be stored).
+//!
+//! That gives this format almost five orders of magnitude more dynamic range
+//! than is likely to be needed for any practical situation.  Moreover, that
+//! extra range is split between both the high and low end, giving a
+//! comfortable buffer on both ends for extreme situations.
+//!
+//! Like the LogLuv format, the input CIE Y value is taken directly as the
+//! luminance value.
 
 #![allow(clippy::cast_lossless)]
 
-const EXP_BIAS: i32 = 26;
+const EXP_BIAS: i32 = 27;
 const UV_SCALE: f32 = 410.0;
 
 /// Largest representable Y component.
@@ -28,7 +54,10 @@ pub const Y_MAX: f32 = ((1u64 << (64 - EXP_BIAS)) - (1u64 << (64 - EXP_BIAS - 11
 /// Smallest representable non-zero Y component.
 pub const Y_MIN: f32 = 1.0 / (1u64 << (EXP_BIAS - 1)) as f32;
 
-/// Encodes a CIE XYZ triplet into the 32-bit Luv format.
+/// Difference between 1.0 and the next largest representable Y value.
+pub const Y_EPSILON: f32 = 1.0 / 1024.0;
+
+/// Encodes from CIE XYZ to 32-bit FloatLuv.
 #[inline]
 pub fn encode(xyz: (f32, f32, f32)) -> u32 {
     debug_assert!(
@@ -46,25 +75,26 @@ pub fn encode(xyz: (f32, f32, f32)) -> u32 {
         xyz.2
     );
 
+    // Calculates the 16-bit encoding of the UV values for the given XYZ input.
+    fn encode_uv(xyz: (f32, f32, f32)) -> u32 {
+        let s = xyz.0 + (15.0 * xyz.1) + (3.0 * xyz.2);
+        let u = ((4.0 * UV_SCALE) * xyz.0 / s).max(0.0).min(255.0) as u32;
+        let v = ((9.0 * UV_SCALE) * xyz.1 / s).max(0.0).min(255.0) as u32;
+        (u << 8) | v
+    };
+
     // Special case: if Y is infinite, saturate to the brightest
     // white, since with infinities we have no reasonable basis
     // for determining chromaticity.
     if xyz.1.is_infinite() {
-        let s = 1.0 + (15.0 * 1.0) + (3.0 * 1.0);
-        let u = ((4.0 * UV_SCALE) * 1.0 / s) as u32;
-        let v = ((9.0 * UV_SCALE) * 1.0 / s) as u32;
-        return 0xffff0000 | (u << 8) | v;
+        return 0xffff0000 | encode_uv((1.0, 1.0, 1.0));
     }
 
-    let s = xyz.0 + (15.0 * xyz.1) + (3.0 * xyz.2);
-    let u = ((4.0 * UV_SCALE) * xyz.0 / s).max(0.0).min(255.0) as u32;
-    let v = ((9.0 * UV_SCALE) * xyz.1 / s).max(0.0).min(255.0) as u32;
-
     let (l_exp, l_mant) = {
         let n = xyz.1.to_bits();
         let exp = (n >> 23) as i32 - 127 + EXP_BIAS;
         if exp <= 0 {
-            return 0;
+            return encode_uv((1.0, 1.0, 1.0));
         } else if exp > 63 {
             (63, 0b11_1111_1111)
         } else {
@@ -72,10 +102,10 @@ pub fn encode(xyz: (f32, f32, f32)) -> u32 {
         }
     };
 
-    (l_exp << 26) | (l_mant << 16) | (u << 8) | v
+    (l_exp << 26) | (l_mant << 16) | encode_uv(xyz)
 }
 
-/// Decodes a 32-bit Luv formatted value into a CIE XYZ triplet.
+/// Decodes from 32-bit FloatLuv to CIE XYZ.
 #[inline]
 pub fn decode(luv32: u32) -> (f32, f32, f32) {
     // Unpack values.
@@ -110,7 +140,7 @@ mod tests {
         let tri = encode(fs);
         let fs2 = decode(tri);
 
-        assert_eq!(tri, 0u32);
+        assert_eq!(0x000056c2, tri);
         assert_eq!(fs, fs2);
     }
 
@@ -212,7 +242,7 @@ mod tests {
     #[test]
     fn underflow() {
         let fs = (Y_MIN * 0.99, Y_MIN * 0.99, Y_MIN * 0.99);
-        assert_eq!(0, encode(fs));
+        assert_eq!(0x000056c2, encode(fs));
         assert_eq!((0.0, 0.0, 0.0), round_trip(fs));
     }
 
diff --git a/sub_crates/trifloat/src/lib.rs b/sub_crates/trifloat/src/lib.rs
index 51a9924..ba1350d 100644
--- a/sub_crates/trifloat/src/lib.rs
+++ b/sub_crates/trifloat/src/lib.rs
@@ -4,7 +4,7 @@
 //! The motivating use-case for this is compactly storing HDR RGB colors.  But
 //! it may be useful for other things as well.
 
-pub mod luv32;
+pub mod fluv32;
 pub mod signed48;
 pub mod unsigned32;
 pub mod unsigned40;

From cda9156af8a60d403d70c1f408b814d375ca1e36 Mon Sep 17 00:00:00 2001
From: Nathan Vegdahl <cessen@cessen.com>
Date: Sun, 20 Sep 2020 01:43:36 +0900
Subject: [PATCH 14/25] Optimize FloatLuv decoding.

Speed ups of over 20%.
---
 sub_crates/trifloat/src/fluv32.rs | 32 ++++++++++++++++++++-----------
 1 file changed, 21 insertions(+), 11 deletions(-)

diff --git a/sub_crates/trifloat/src/fluv32.rs b/sub_crates/trifloat/src/fluv32.rs
index 6d300df..6657c98 100644
--- a/sub_crates/trifloat/src/fluv32.rs
+++ b/sub_crates/trifloat/src/fluv32.rs
@@ -108,21 +108,31 @@ pub fn encode(xyz: (f32, f32, f32)) -> u32 {
 /// Decodes from 32-bit FloatLuv to CIE XYZ.
 #[inline]
 pub fn decode(luv32: u32) -> (f32, f32, f32) {
-    // Unpack values.
-    let l_exp = luv32 >> 26;
-    let l_mant = (luv32 >> 16) & 0b11_1111_1111;
-    let u = ((luv32 >> 8) & 0xff) as f32 * (1.0 / UV_SCALE);
-    let v4 = (luv32 & 0xff).max(1) as f32 * (4.0 / UV_SCALE); // 4 * v
-
-    if l_exp == 0 {
+    // Check for zero.
+    if luv32 & 0xffff0000 == 0 {
         return (0.0, 0.0, 0.0);
     }
 
-    let y = f32::from_bits(((l_exp + 127 - EXP_BIAS as u32) << 23) | (l_mant << 13));
-    let x = y * u * (9.0 / v4); // y * (9u / 4v)
-    let z = (y / v4) * (12.0 - (3.0 * u) - (5.0 * v4)); // y * ((12 - 3u - 20v) / 4v)
+    // Unpack values.
+    let l_exp = luv32 >> 26;
+    let l_mant = (luv32 >> 16) & 0x3ff;
+    let u = ((luv32 >> 8) & 0xff) as f32; // Range 0.0-255.0
+    let v = (luv32 & 0xff).max(1) as f32; // Range 0.0-255.0
 
-    (x, y, z.max(0.0))
+    // Calculate y.
+    let y = f32::from_bits(((l_exp + 127 - EXP_BIAS as u32) << 23) | (l_mant << 13));
+
+    // Calculate x and z.
+    // This is re-worked from the original equations, to allow a bunch of stuff
+    // to cancel out and avoid operations.  It makes the underlying equations a
+    // bit non-obvious.
+    // We also roll the UV_SCALE application into the final x and z formulas,
+    // since most of that also cancels if we do it there.
+    let tmp = y / v;
+    let x = tmp * (u * 2.25); // y * (9u / 4v)
+    let z = tmp * ((3.0 * UV_SCALE) - (0.75 * u) - (5.0 * v)).max(0.0); // y * ((12 - 3u - 20v) / 4v)
+
+    (x, y, z)
 }
 
 #[cfg(test)]

From f20567247d18f04f58c42af20f922b5e6a2858d6 Mon Sep 17 00:00:00 2001
From: Nathan Vegdahl <cessen@cessen.com>
Date: Sun, 20 Sep 2020 02:02:35 +0900
Subject: [PATCH 15/25] Go all in with the fluv32 naming.

---
 sub_crates/trifloat/src/fluv32.rs | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/sub_crates/trifloat/src/fluv32.rs b/sub_crates/trifloat/src/fluv32.rs
index 6657c98..95a1a0b 100644
--- a/sub_crates/trifloat/src/fluv32.rs
+++ b/sub_crates/trifloat/src/fluv32.rs
@@ -107,17 +107,17 @@ pub fn encode(xyz: (f32, f32, f32)) -> u32 {
 
 /// Decodes from 32-bit FloatLuv to CIE XYZ.
 #[inline]
-pub fn decode(luv32: u32) -> (f32, f32, f32) {
+pub fn decode(fluv32: u32) -> (f32, f32, f32) {
     // Check for zero.
-    if luv32 & 0xffff0000 == 0 {
+    if fluv32 & 0xffff0000 == 0 {
         return (0.0, 0.0, 0.0);
     }
 
     // Unpack values.
-    let l_exp = luv32 >> 26;
-    let l_mant = (luv32 >> 16) & 0x3ff;
-    let u = ((luv32 >> 8) & 0xff) as f32; // Range 0.0-255.0
-    let v = (luv32 & 0xff).max(1) as f32; // Range 0.0-255.0
+    let l_exp = fluv32 >> 26;
+    let l_mant = (fluv32 >> 16) & 0x3ff;
+    let u = ((fluv32 >> 8) & 0xff) as f32; // Range 0.0-255.0
+    let v = (fluv32 & 0xff).max(1) as f32; // Range 0.0-255.0
 
     // Calculate y.
     let y = f32::from_bits(((l_exp + 127 - EXP_BIAS as u32) << 23) | (l_mant << 13));

From 8dee53d1fc10e9bb3d09b614825f3e108ca23e38 Mon Sep 17 00:00:00 2001
From: Nathan Vegdahl <cessen@cessen.com>
Date: Sun, 20 Sep 2020 02:35:37 +0900
Subject: [PATCH 16/25] Additional optimization to fluv32 decoding.

Tiny change but with a nice speed bump.
---
 sub_crates/trifloat/src/fluv32.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sub_crates/trifloat/src/fluv32.rs b/sub_crates/trifloat/src/fluv32.rs
index 95a1a0b..1e83a27 100644
--- a/sub_crates/trifloat/src/fluv32.rs
+++ b/sub_crates/trifloat/src/fluv32.rs
@@ -79,7 +79,7 @@ pub fn encode(xyz: (f32, f32, f32)) -> u32 {
     fn encode_uv(xyz: (f32, f32, f32)) -> u32 {
         let s = xyz.0 + (15.0 * xyz.1) + (3.0 * xyz.2);
         let u = ((4.0 * UV_SCALE) * xyz.0 / s).max(0.0).min(255.0) as u32;
-        let v = ((9.0 * UV_SCALE) * xyz.1 / s).max(0.0).min(255.0) as u32;
+        let v = ((9.0 * UV_SCALE) * xyz.1 / s).max(1.0).min(255.0) as u32;
         (u << 8) | v
     };
 
@@ -117,7 +117,7 @@ pub fn decode(fluv32: u32) -> (f32, f32, f32) {
     let l_exp = fluv32 >> 26;
     let l_mant = (fluv32 >> 16) & 0x3ff;
     let u = ((fluv32 >> 8) & 0xff) as f32; // Range 0.0-255.0
-    let v = (fluv32 & 0xff).max(1) as f32; // Range 0.0-255.0
+    let v = (fluv32 & 0xff) as f32; // Range 0.0-255.0
 
     // Calculate y.
     let y = f32::from_bits(((l_exp + 127 - EXP_BIAS as u32) << 23) | (l_mant << 13));

From f4ef11f9f38b5ed84ee60cb4e4a24bf8738ee9f4 Mon Sep 17 00:00:00 2001
From: Nathan Vegdahl <cessen@cessen.com>
Date: Sun, 20 Sep 2020 03:02:56 +0900
Subject: [PATCH 17/25] Fix some names in fluv32 error messages.

---
 sub_crates/trifloat/src/fluv32.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sub_crates/trifloat/src/fluv32.rs b/sub_crates/trifloat/src/fluv32.rs
index 1e83a27..0b5af89 100644
--- a/sub_crates/trifloat/src/fluv32.rs
+++ b/sub_crates/trifloat/src/fluv32.rs
@@ -67,7 +67,7 @@ pub fn encode(xyz: (f32, f32, f32)) -> u32 {
             && !xyz.0.is_nan()
             && !xyz.1.is_nan()
             && !xyz.2.is_nan(),
-        "trifloat::yuv32::encode(): encoding to yuv32 only \
+        "trifloat::fluv32::encode(): encoding to fluv32 only \
          works correctly for positive, non-NaN numbers, but the numbers passed \
          were: ({}, {}, {})",
         xyz.0,

From 3eff608493777b5141a392f7f042e1da3d2c9f18 Mon Sep 17 00:00:00 2001
From: Nathan Vegdahl <cessen@cessen.com>
Date: Sun, 20 Sep 2020 10:07:02 +0900
Subject: [PATCH 18/25] More FloatLuv32 optimizations, and general code
 cleanup.

This gives another little speed boost to decoding, but gives a
massive (over 3x) speed boost to encoding.
---
 sub_crates/trifloat/src/fluv32.rs | 66 +++++++++++++++++++------------
 1 file changed, 41 insertions(+), 25 deletions(-)

diff --git a/sub_crates/trifloat/src/fluv32.rs b/sub_crates/trifloat/src/fluv32.rs
index 0b5af89..18d003f 100644
--- a/sub_crates/trifloat/src/fluv32.rs
+++ b/sub_crates/trifloat/src/fluv32.rs
@@ -76,33 +76,41 @@ pub fn encode(xyz: (f32, f32, f32)) -> u32 {
     );
 
     // Calculates the 16-bit encoding of the UV values for the given XYZ input.
+    #[inline(always)]
     fn encode_uv(xyz: (f32, f32, f32)) -> u32 {
         let s = xyz.0 + (15.0 * xyz.1) + (3.0 * xyz.2);
-        let u = ((4.0 * UV_SCALE) * xyz.0 / s).max(0.0).min(255.0) as u32;
-        let v = ((9.0 * UV_SCALE) * xyz.1 / s).max(1.0).min(255.0) as u32;
-        (u << 8) | v
+
+        // The `+ 0.5` is for rounding, and is not part of the normal equation.
+        // The minimum value of 1.0 for v is to avoid a possible divide by zero
+        // when decoding.  A value less than 1.0 is outside the real colors,
+        // so we don't need to store it anyway.
+        let u = (((4.0 * UV_SCALE) * xyz.0 / s) + 0.5).max(0.0).min(255.0);
+        let v = (((9.0 * UV_SCALE) * xyz.1 / s) + 0.5).max(1.0).min(255.0);
+
+        ((u as u32) << 8) | (v as u32)
     };
 
-    // Special case: if Y is infinite, saturate to the brightest
-    // white, since with infinities we have no reasonable basis
-    // for determining chromaticity.
-    if xyz.1.is_infinite() {
-        return 0xffff0000 | encode_uv((1.0, 1.0, 1.0));
-    }
+    let y_bits = xyz.1.to_bits();
+    let exp = (y_bits >> 23) as i32 - 127 + EXP_BIAS;
 
-    let (l_exp, l_mant) = {
-        let n = xyz.1.to_bits();
-        let exp = (n >> 23) as i32 - 127 + EXP_BIAS;
-        if exp <= 0 {
-            return encode_uv((1.0, 1.0, 1.0));
-        } else if exp > 63 {
-            (63, 0b11_1111_1111)
+    if exp <= 0 {
+        // Special case: black.
+        encode_uv((1.0, 1.0, 1.0))
+    } else if exp > 63 {
+        if xyz.1.is_infinite() {
+            // Special case: infinity.  In this case, we don't have any
+            // reasonable basis for calculating chroma, so just return
+            // the brightest white.
+            0xffff0000 | encode_uv((1.0, 1.0, 1.0))
         } else {
-            (exp as u32, (n & 0x7fffff) >> 13)
+            // Special case: non-infinite, but brighter luma than can be
+            // represented.  Return the correct chroma, and the brightest luma.
+            0xffff0000 | encode_uv(xyz)
         }
-    };
-
-    (l_exp << 26) | (l_mant << 16) | encode_uv(xyz)
+    } else {
+        // Common case.
+        ((exp as u32) << 26) | ((y_bits & 0x07fe000) << 3) | encode_uv(xyz)
+    }
 }
 
 /// Decodes from 32-bit FloatLuv to CIE XYZ.
@@ -117,7 +125,7 @@ pub fn decode(fluv32: u32) -> (f32, f32, f32) {
     let l_exp = fluv32 >> 26;
     let l_mant = (fluv32 >> 16) & 0x3ff;
     let u = ((fluv32 >> 8) & 0xff) as f32; // Range 0.0-255.0
-    let v = (fluv32 & 0xff) as f32; // Range 0.0-255.0
+    let v = (fluv32 & 0xff) as f32; // Range 1.0-255.0
 
     // Calculate y.
     let y = f32::from_bits(((l_exp + 127 - EXP_BIAS as u32) << 23) | (l_mant << 13));
@@ -130,7 +138,7 @@ pub fn decode(fluv32: u32) -> (f32, f32, f32) {
     // since most of that also cancels if we do it there.
     let tmp = y / v;
     let x = tmp * (u * 2.25); // y * (9u / 4v)
-    let z = tmp * ((3.0 * UV_SCALE) - (0.75 * u) - (5.0 * v)).max(0.0); // y * ((12 - 3u - 20v) / 4v)
+    let z = tmp * ((3.0 * UV_SCALE) - (0.75 * u) - (5.0 * v)); // y * ((12 - 3u - 20v) / 4v)
 
     (x, y, z)
 }
@@ -219,9 +227,8 @@ mod tests {
 
     #[test]
     fn precision_floor() {
-        let a = (2049.0f32, 2049.0f32, 2049.0f32);
-        let b = round_trip(a);
-        assert_eq!(2048.0, b.1);
+        let fs = (2049.0f32, 2049.0f32, 2049.0f32);
+        assert_eq!(2048.0, round_trip(fs).1);
     }
 
     #[test]
@@ -256,6 +263,15 @@ mod tests {
         assert_eq!((0.0, 0.0, 0.0), round_trip(fs));
     }
 
+    #[test]
+    fn negative_z_impossible() {
+        // These are very specific values, which should result in smallest
+        // possible z value (specifically z = 0.0 with no quantization) while
+        // still having positive values in x and y.
+        let fs = (248.0 / 565.0, 9827.0 / 8475.0, 0.0);
+        assert!(round_trip(fs).2 >= 0.0);
+    }
+
     #[test]
     #[should_panic]
     fn nans_01() {

From 161e47fc443e51baa9989cd700c4da709bdab515 Mon Sep 17 00:00:00 2001
From: Nathan Vegdahl <cessen@cessen.com>
Date: Sun, 20 Sep 2020 10:37:26 +0900
Subject: [PATCH 19/25] Clean up oct32norm code a bit.

---
 sub_crates/oct32norm/src/lib.rs | 23 +++++++++--------------
 1 file changed, 9 insertions(+), 14 deletions(-)

diff --git a/sub_crates/oct32norm/src/lib.rs b/sub_crates/oct32norm/src/lib.rs
index 63c085f..46c1086 100644
--- a/sub_crates/oct32norm/src/lib.rs
+++ b/sub_crates/oct32norm/src/lib.rs
@@ -10,21 +10,19 @@
 /// matters to the encoding process, not the length.
 #[inline]
 pub fn encode(vec: (f32, f32, f32)) -> u32 {
-    let inv_l1_norm = 1.0f32 / (vec.0.abs() + vec.1.abs() + vec.2.abs());
+    let l1_norm = vec.0.abs() + vec.1.abs() + vec.2.abs();
+    let v0_norm = vec.0 / l1_norm;
+    let v1_norm = vec.1 / l1_norm;
 
     let (u, v) = if vec.2 < 0.0 {
         (
-            u32::from(to_snorm_16(
-                (1.0 - (vec.1 * inv_l1_norm).abs()) * sign(vec.0),
-            )),
-            u32::from(to_snorm_16(
-                (1.0 - (vec.0 * inv_l1_norm).abs()) * sign(vec.1),
-            )),
+            u32::from(to_snorm_16((1.0 - v1_norm.abs()) * sign(vec.0))),
+            u32::from(to_snorm_16((1.0 - v0_norm.abs()) * sign(vec.1))),
         )
     } else {
         (
-            u32::from(to_snorm_16(vec.0 * inv_l1_norm)),
-            u32::from(to_snorm_16(vec.1 * inv_l1_norm)),
+            u32::from(to_snorm_16(v0_norm)),
+            u32::from(to_snorm_16(v1_norm)),
         )
     };
 
@@ -52,15 +50,12 @@ pub fn decode(n: u32) -> (f32, f32, f32) {
 
 #[inline(always)]
 fn to_snorm_16(n: f32) -> u16 {
-    (n.max(-1.0).min(1.0) * ((1u32 << (16 - 1)) - 1) as f32).round() as i16 as u16
+    (n * ((1u32 << (16 - 1)) - 1) as f32).round() as i16 as u16
 }
 
 #[inline(always)]
 fn from_snorm_16(n: u16) -> f32 {
-    f32::from(n as i16)
-        * (1.0f32 / ((1u32 << (16 - 1)) - 1) as f32)
-            .max(-1.0)
-            .min(1.0)
+    f32::from(n as i16) * (1.0f32 / ((1u32 << (16 - 1)) - 1) as f32)
 }
 
 #[inline(always)]

From 8c738b2f39196624e5e5d7a4305e428e74f608bc Mon Sep 17 00:00:00 2001
From: Nathan Vegdahl <cessen@cessen.com>
Date: Sun, 20 Sep 2020 11:04:37 +0900
Subject: [PATCH 20/25] Use FloatLuv32 in Psychopath for encoding XYZ colors.

---
 src/color.rs | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/src/color.rs b/src/color.rs
index 178e47c..14b8498 100644
--- a/src/color.rs
+++ b/src/color.rs
@@ -7,7 +7,7 @@ pub use color::{
 use glam::Vec4;
 use half::f16;
 use spectral_upsampling::meng::{spectrum_xyz_to_p_4, EQUAL_ENERGY_REFLECTANCE};
-use trifloat::signed48;
+use trifloat::fluv32;
 
 use crate::{lerp::Lerp, math::fast_exp};
 
@@ -144,7 +144,7 @@ impl Color {
     /// Returns the post-compression size of this color.
     pub fn compressed_size(&self) -> usize {
         match self {
-            Color::XYZ(_, _, _) => 7,
+            Color::XYZ(_, _, _) => 5,
 
             Color::Blackbody { .. } => 5,
 
@@ -162,9 +162,7 @@ impl Color {
         match *self {
             Color::XYZ(x, y, z) => {
                 out_data[0] = 0; // Discriminant
-                let col = signed48::encode((x, y, z));
-                let col = col.to_le_bytes();
-                (&mut out_data[1..7]).copy_from_slice(&col[0..6]);
+                (&mut out_data[1..5]).copy_from_slice(&fluv32::encode((x, y, z)).to_ne_bytes()[..]);
             }
 
             Color::Blackbody {
@@ -202,10 +200,10 @@ impl Color {
         match in_data[0] {
             0 => {
                 // XYZ
-                let mut bytes = [0u8; 8];
-                (&mut bytes[0..6]).copy_from_slice(&in_data[1..7]);
-                let (x, y, z) = signed48::decode(u64::from_le_bytes(bytes));
-                (Color::XYZ(x, y, z), 7)
+                let mut bytes = [0u8; 4];
+                (&mut bytes[..]).copy_from_slice(&in_data[1..5]);
+                let (x, y, z) = fluv32::decode(u32::from_ne_bytes(bytes));
+                (Color::XYZ(x, y, z), 5)
             }
 
             1 => {

From 0ba1acc42d4567157a1a03d663df12bc625c02b3 Mon Sep 17 00:00:00 2001
From: Nathan Vegdahl <cessen@cessen.com>
Date: Sun, 20 Sep 2020 14:29:23 +0900
Subject: [PATCH 21/25] Add oct32 `encode_precise()` function.

This is significantly slower than `encode()`, but a little more precise.
---
 sub_crates/oct32norm/benches/bench.rs        |  29 ++++-
 sub_crates/oct32norm/src/lib.rs              | 110 +++++++++++++++----
 sub_crates/oct32norm/tests/proptest_tests.rs |  15 ++-
 3 files changed, 119 insertions(+), 35 deletions(-)

diff --git a/sub_crates/oct32norm/benches/bench.rs b/sub_crates/oct32norm/benches/bench.rs
index 18a6940..012d7fb 100644
--- a/sub_crates/oct32norm/benches/bench.rs
+++ b/sub_crates/oct32norm/benches/bench.rs
@@ -1,26 +1,38 @@
 use bencher::{benchmark_group, benchmark_main, black_box, Bencher};
-use oct32norm::{decode, encode};
+use oct32norm::{decode, encode, encode_precise};
 use rand::{rngs::SmallRng, FromEntropy, Rng};
 
 //----
 
-fn encode_100_values(bench: &mut Bencher) {
+fn encode_1000_values(bench: &mut Bencher) {
     let mut rng = SmallRng::from_entropy();
     bench.iter(|| {
         let x = rng.gen::<f32>() - 0.5;
         let y = rng.gen::<f32>() - 0.5;
         let z = rng.gen::<f32>() - 0.5;
-        for _ in 0..100 {
+        for _ in 0..1000 {
             black_box(encode(black_box((x, y, z))));
         }
     });
 }
 
-fn decode_100_values(bench: &mut Bencher) {
+fn encode_precise_1000_values(bench: &mut Bencher) {
+    let mut rng = SmallRng::from_entropy();
+    bench.iter(|| {
+        let x = rng.gen::<f32>() - 0.5;
+        let y = rng.gen::<f32>() - 0.5;
+        let z = rng.gen::<f32>() - 0.5;
+        for _ in 0..1000 {
+            black_box(encode_precise(black_box((x, y, z))));
+        }
+    });
+}
+
+fn decode_1000_values(bench: &mut Bencher) {
     let mut rng = SmallRng::from_entropy();
     bench.iter(|| {
         let v = rng.gen::<u32>();
-        for _ in 0..100 {
+        for _ in 0..1000 {
             black_box(decode(black_box(v)));
         }
     });
@@ -28,5 +40,10 @@ fn decode_100_values(bench: &mut Bencher) {
 
 //----
 
-benchmark_group!(benches, encode_100_values, decode_100_values,);
+benchmark_group!(
+    benches,
+    encode_1000_values,
+    encode_precise_1000_values,
+    decode_1000_values,
+);
 benchmark_main!(benches);
diff --git a/sub_crates/oct32norm/src/lib.rs b/sub_crates/oct32norm/src/lib.rs
index 46c1086..6f54ebb 100644
--- a/sub_crates/oct32norm/src/lib.rs
+++ b/sub_crates/oct32norm/src/lib.rs
@@ -4,29 +4,60 @@
 //! of Efficient Representations for Independent Unit Vectors" by
 //! Cigolle et al.
 
+const STEP_SIZE: f32 = 1.0 / STEPS;
+const STEPS: f32 = ((1 << (16 - 1)) - 1) as f32;
+
 /// Encodes a vector of three floats to the oct32 format.
 ///
 /// The input vector does not need to be normalized--only the direction
 /// matters to the encoding process, not the length.
 #[inline]
 pub fn encode(vec: (f32, f32, f32)) -> u32 {
-    let l1_norm = vec.0.abs() + vec.1.abs() + vec.2.abs();
-    let v0_norm = vec.0 / l1_norm;
-    let v1_norm = vec.1 / l1_norm;
+    let (u, v) = vec3_to_oct(vec);
+    ((to_snorm_16(u) as u32) << 16) | to_snorm_16(v) as u32
+}
 
-    let (u, v) = if vec.2 < 0.0 {
-        (
-            u32::from(to_snorm_16((1.0 - v1_norm.abs()) * sign(vec.0))),
-            u32::from(to_snorm_16((1.0 - v0_norm.abs()) * sign(vec.1))),
-        )
-    } else {
-        (
-            u32::from(to_snorm_16(v0_norm)),
-            u32::from(to_snorm_16(v1_norm)),
-        )
+/// Encodes a vector of three floats to the oct32 format.
+///
+/// This is the same as `encode()` except that it is slower and encodes
+/// with slightly better precision.
+pub fn encode_precise(vec: (f32, f32, f32)) -> u32 {
+    #[inline(always)]
+    fn dot_norm(a: (f32, f32, f32), b: (f32, f32, f32)) -> f64 {
+        let l = ((a.0 as f64 * a.0 as f64) + (a.1 as f64 * a.1 as f64) + (a.2 as f64 * a.2 as f64))
+            .sqrt();
+        ((a.0 as f64 * b.0 as f64) + (a.1 as f64 * b.1 as f64) + (a.2 as f64 * b.2 as f64)) / l
+    }
+
+    // Calculate the initial floored version.
+    let s = {
+        let mut s = vec3_to_oct(vec); // Remap to the square.
+        s.0 = (s.0.max(-1.0).min(1.0) * STEPS).floor() * STEP_SIZE;
+        s.1 = (s.1.max(-1.0).min(1.0) * STEPS).floor() * STEP_SIZE;
+        s
     };
 
-    (u << 16) | v
+    // Test all combinations of floor and ceil and keep the best.
+    // Note that at +/- 1, this will exit the square, but that
+    // will be a worse encoding and never win.
+    let mut best_rep = s;
+    let mut max_dot = 0.0;
+    for &(i, j) in &[
+        (0.0, 0.0),
+        (0.0, STEP_SIZE),
+        (STEP_SIZE, 0.0),
+        (STEP_SIZE, STEP_SIZE),
+    ] {
+        let candidate = (s.0 + i, s.1 + j);
+        let oct = oct_to_vec3(candidate);
+        let dot = dot_norm(oct, vec);
+        if dot > max_dot {
+            best_rep = candidate;
+            max_dot = dot;
+        }
+    }
+
+    ((to_snorm_16(best_rep.0) as u32) << 16) | to_snorm_16(best_rep.1) as u32
 }
 
 /// Decodes from an oct32 to a vector of three floats.
@@ -35,27 +66,45 @@ pub fn encode(vec: (f32, f32, f32)) -> u32 {
 /// needs a normalized vector should normalize the returned vector.
 #[inline]
 pub fn decode(n: u32) -> (f32, f32, f32) {
-    let mut vec0 = from_snorm_16((n >> 16) as u16);
-    let mut vec1 = from_snorm_16(n as u16);
-    let vec2 = 1.0 - (vec0.abs() + vec1.abs());
+    oct_to_vec3((from_snorm_16((n >> 16) as u16), from_snorm_16(n as u16)))
+}
+
+#[inline(always)]
+fn vec3_to_oct(vec: (f32, f32, f32)) -> (f32, f32) {
+    let l1_norm = vec.0.abs() + vec.1.abs() + vec.2.abs();
+    let u = vec.0 / l1_norm;
+    let v = vec.1 / l1_norm;
+
+    if vec.2 > 0.0 {
+        (u, v)
+    } else {
+        ((1.0 - v.abs()) * sign(vec.0), (1.0 - u.abs()) * sign(vec.1))
+    }
+}
+
+#[inline(always)]
+fn oct_to_vec3(oct: (f32, f32)) -> (f32, f32, f32) {
+    let vec2 = 1.0 - (oct.0.abs() + oct.1.abs());
 
     if vec2 < 0.0 {
-        let old_x = vec0;
-        vec0 = (1.0 - vec1.abs()) * sign(old_x);
-        vec1 = (1.0 - old_x.abs()) * sign(vec1);
+        (
+            (1.0 - oct.1.abs()) * sign(oct.0),
+            (1.0 - oct.0.abs()) * sign(oct.1),
+            vec2,
+        )
+    } else {
+        (oct.0, oct.1, vec2)
     }
-
-    (vec0, vec1, vec2)
 }
 
 #[inline(always)]
 fn to_snorm_16(n: f32) -> u16 {
-    (n * ((1u32 << (16 - 1)) - 1) as f32).round() as i16 as u16
+    (n * STEPS).round() as i16 as u16
 }
 
 #[inline(always)]
 fn from_snorm_16(n: u16) -> f32 {
-    f32::from(n as i16) * (1.0f32 / ((1u32 << (16 - 1)) - 1) as f32)
+    f32::from(n as i16) * STEP_SIZE
 }
 
 #[inline(always)]
@@ -75,21 +124,27 @@ mod tests {
     fn axis_directions() {
         let px = (1.0, 0.0, 0.0);
         let px_oct = encode(px);
+        let px_octp = encode_precise(px);
 
         let nx = (-1.0, 0.0, 0.0);
         let nx_oct = encode(nx);
+        let nx_octp = encode_precise(nx);
 
         let py = (0.0, 1.0, 0.0);
         let py_oct = encode(py);
+        let py_octp = encode_precise(py);
 
         let ny = (0.0, -1.0, 0.0);
         let ny_oct = encode(ny);
+        let ny_octp = encode_precise(ny);
 
         let pz = (0.0, 0.0, 1.0);
         let pz_oct = encode(pz);
+        let pz_octp = encode_precise(pz);
 
         let nz = (0.0, 0.0, -1.0);
         let nz_oct = encode(nz);
+        let nz_octp = encode_precise(nz);
 
         assert_eq!(px, decode(px_oct));
         assert_eq!(nx, decode(nx_oct));
@@ -97,5 +152,12 @@ mod tests {
         assert_eq!(ny, decode(ny_oct));
         assert_eq!(pz, decode(pz_oct));
         assert_eq!(nz, decode(nz_oct));
+
+        assert_eq!(px, decode(px_octp));
+        assert_eq!(nx, decode(nx_octp));
+        assert_eq!(py, decode(py_octp));
+        assert_eq!(ny, decode(ny_octp));
+        assert_eq!(pz, decode(pz_octp));
+        assert_eq!(nz, decode(nz_octp));
     }
 }
diff --git a/sub_crates/oct32norm/tests/proptest_tests.rs b/sub_crates/oct32norm/tests/proptest_tests.rs
index 7cf8d71..3234055 100644
--- a/sub_crates/oct32norm/tests/proptest_tests.rs
+++ b/sub_crates/oct32norm/tests/proptest_tests.rs
@@ -2,7 +2,7 @@
 extern crate proptest;
 extern crate oct32norm;
 
-use oct32norm::{decode, encode};
+use oct32norm::{decode, encode, encode_precise};
 use proptest::test_runner::Config;
 
 /// Calculates the cosine of the angle between the two vectors,
@@ -56,18 +56,23 @@ proptest! {
     #[test]
     fn pt_roundtrip_angle_precision(v in (-1.0f32..1.0, -1.0f32..1.0, -1.0f32..1.0)) {
         let oct = encode(v);
-        let v2 = decode(oct);
+        let octp = encode_precise(v);
 
         // Check if the angle between the original and the roundtrip
         // is less than 0.004 degrees
-        assert!(cos_gt(v, v2, 0.9999999976));
+        assert!(cos_gt(v, decode(oct), 0.9999999976));
+
+        // Check if the angle between the original and the roundtrip
+        // is less than 0.003 degrees
+        assert!(cos_gt(v, decode(octp), 0.9999999986));
     }
 
     #[test]
     fn pt_roundtrip_component_precision(v in (-1.0f32..1.0, -1.0f32..1.0, -1.0f32..1.0)) {
         let oct = encode(v);
-        let v2 = decode(oct);
+        let octp = encode_precise(v);
 
-        assert!(l1_delta_lt(v, v2, 0.00005));
+        assert!(l1_delta_lt(v, decode(oct), 0.00005));
+        assert!(l1_delta_lt(v, decode(octp), 0.00003));
     }
 }

From 05f9621ac5bb4f1249cab8397d7a6f366b39a146 Mon Sep 17 00:00:00 2001
From: Nathan Vegdahl <cessen@cessen.com>
Date: Sun, 20 Sep 2020 15:16:20 +0900
Subject: [PATCH 22/25] Added a FloatLuv decode function to decode to Yuv
 instead of XYZ.

This is useful because it's super fast, and chromaticity lookups
are typical for spectral upsampling anyway, so this will likely
enable cutting out a bunch of unecessary intermediate calculations.
---
 sub_crates/trifloat/benches/bench.rs | 11 ++++++++++
 sub_crates/trifloat/src/fluv32.rs    | 33 +++++++++++++++++++++++++++-
 2 files changed, 43 insertions(+), 1 deletion(-)

diff --git a/sub_crates/trifloat/benches/bench.rs b/sub_crates/trifloat/benches/bench.rs
index 80582ab..1e7002b 100644
--- a/sub_crates/trifloat/benches/bench.rs
+++ b/sub_crates/trifloat/benches/bench.rs
@@ -105,6 +105,16 @@ fn fluv32_decode_1000_values(bench: &mut Bencher) {
     });
 }
 
+fn fluv32_decode_yuv_1000_values(bench: &mut Bencher) {
+    let mut rng = SmallRng::from_entropy();
+    bench.iter(|| {
+        let v = rng.gen::<u32>();
+        for _ in 0..1000 {
+            black_box(fluv32::decode_yuv(black_box(v)));
+        }
+    });
+}
+
 //----
 
 benchmark_group!(
@@ -117,5 +127,6 @@ benchmark_group!(
     signed48_decode_1000_values,
     fluv32_encode_1000_values,
     fluv32_decode_1000_values,
+    fluv32_decode_yuv_1000_values,
 );
 benchmark_main!(benches);
diff --git a/sub_crates/trifloat/src/fluv32.rs b/sub_crates/trifloat/src/fluv32.rs
index 18d003f..40d4a95 100644
--- a/sub_crates/trifloat/src/fluv32.rs
+++ b/sub_crates/trifloat/src/fluv32.rs
@@ -46,7 +46,9 @@
 #![allow(clippy::cast_lossless)]
 
 const EXP_BIAS: i32 = 27;
-const UV_SCALE: f32 = 410.0;
+
+/// The scale factor of the quantized UV components.
+pub const UV_SCALE: f32 = 410.0;
 
 /// Largest representable Y component.
 pub const Y_MAX: f32 = ((1u64 << (64 - EXP_BIAS)) - (1u64 << (64 - EXP_BIAS - 11))) as f32;
@@ -143,6 +145,27 @@ pub fn decode(fluv32: u32) -> (f32, f32, f32) {
     (x, y, z)
 }
 
+/// Decodes from 32-bit FloatLuv to Yuv.
+///
+/// The Y component is the luminance, and is simply the Y from CIE XYZ.
+///
+/// The u and v components are the CIE LUV u' and v' chromaticity coordinates,
+/// but returned as `u8`s, and scaled by `UV_SCALE` to fit the range 0-255.
+#[inline]
+pub fn decode_yuv(fluv32: u32) -> (f32, u8, u8) {
+    // Check for zero.
+    if fluv32 & 0xffff0000 == 0 {
+        return (0.0, (fluv32 >> 8) as u8, fluv32 as u8);
+    }
+
+    // Calculate y.
+    let l_exp = fluv32 >> 26;
+    let l_mant = (fluv32 >> 16) & 0x3ff;
+    let y = f32::from_bits(((l_exp + 127 - EXP_BIAS as u32) << 23) | (l_mant << 13));
+
+    (y, (fluv32 >> 8) as u8, fluv32 as u8)
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -231,6 +254,14 @@ mod tests {
         assert_eq!(2048.0, round_trip(fs).1);
     }
 
+    #[test]
+    fn decode_yuv_01() {
+        let fs = (1.0, 1.0, 1.0);
+        let a = encode(fs);
+
+        assert_eq!((1.0, 0x56, 0xc2), decode_yuv(a));
+    }
+
     #[test]
     fn saturate_y() {
         let fs = (1.0e+20, 1.0e+20, 1.0e+20);

From 066105b20a8c72dc4c3992748284baec3eed3111 Mon Sep 17 00:00:00 2001
From: Nathan Vegdahl <cessen@cessen.com>
Date: Mon, 21 Sep 2020 09:29:45 +0900
Subject: [PATCH 23/25] Fluv32: slightly tweak the u/v scaling constants.

This allows perfect representation of E (equal energy spectrum).
It's not important from a perceptual standpoint, but it provides
a simple way for Psychopath to represent E when needed for other
purposes.
---
 sub_crates/trifloat/src/fluv32.rs | 71 ++++++++++++++++++++-----------
 1 file changed, 47 insertions(+), 24 deletions(-)

diff --git a/sub_crates/trifloat/src/fluv32.rs b/sub_crates/trifloat/src/fluv32.rs
index 40d4a95..814a201 100644
--- a/sub_crates/trifloat/src/fluv32.rs
+++ b/sub_crates/trifloat/src/fluv32.rs
@@ -2,12 +2,16 @@
 //!
 //! This encoding is based on, but is slightly different than, the 32-bit
 //! LogLuv format from the paper "Overcoming Gamut and Dynamic Range
-//! Limitations in Digital Images" by Greg Ward.  It uses the same uv chroma
-//! storage, but uses a floating point rather than log encoding to store
-//! luminance, mainly for the sake of faster decoding.  It also omits the sign
-//! bit of LogLuv, foregoing negative luminance capabilities.
+//! Limitations in Digital Images" by Greg Ward:
 //!
-//! Compared to LogLuv, this format's chroma precision is identical and its
+//! * It uses the same uv chroma storage approach, but with *very* slightly
+//!   tweaked scales to allow perfect representation of E.
+//! * It uses uses a floating point rather than log encoding to store
+//!   luminance, mainly for the sake of faster decoding.
+//! * It also omits the sign bit of LogLuv, foregoing negative luminance
+//!   capabilities.
+//!
+//! Compared to LogLuv, this format's chroma precision is the same and its
 //! luminance precision is better, but its luminance *range* is smaller.
 //! The supported luminance range is still substantial, however (see
 //! "Luminance details" below).
@@ -47,8 +51,11 @@
 
 const EXP_BIAS: i32 = 27;
 
-/// The scale factor of the quantized UV components.
-pub const UV_SCALE: f32 = 410.0;
+/// The scale factor of the quantized U component.
+pub const U_SCALE: f32 = 817.0 / 2.0;
+
+/// The scale factor of the quantized V component.
+pub const V_SCALE: f32 = 1235.0 / 3.0;
 
 /// Largest representable Y component.
 pub const Y_MAX: f32 = ((1u64 << (64 - EXP_BIAS)) - (1u64 << (64 - EXP_BIAS - 11))) as f32;
@@ -86,8 +93,8 @@ pub fn encode(xyz: (f32, f32, f32)) -> u32 {
         // The minimum value of 1.0 for v is to avoid a possible divide by zero
         // when decoding.  A value less than 1.0 is outside the real colors,
         // so we don't need to store it anyway.
-        let u = (((4.0 * UV_SCALE) * xyz.0 / s) + 0.5).max(0.0).min(255.0);
-        let v = (((9.0 * UV_SCALE) * xyz.1 / s) + 0.5).max(1.0).min(255.0);
+        let u = (((4.0 * U_SCALE) * xyz.0 / s) + 0.5).max(0.0).min(255.0);
+        let v = (((9.0 * V_SCALE) * xyz.1 / s) + 0.5).max(1.0).min(255.0);
 
         ((u as u32) << 8) | (v as u32)
     };
@@ -136,13 +143,14 @@ pub fn decode(fluv32: u32) -> (f32, f32, f32) {
     // This is re-worked from the original equations, to allow a bunch of stuff
     // to cancel out and avoid operations.  It makes the underlying equations a
     // bit non-obvious.
-    // We also roll the UV_SCALE application into the final x and z formulas,
-    // since most of that also cancels if we do it there.
+    // We also roll the U/V_SCALE application into the final x and z formulas,
+    // since some of that cancels out as well, and all of it can be avoided at
+    // runtime that way.
     let tmp = y / v;
-    let x = tmp * (u * 2.25); // y * (9u / 4v)
-    let z = tmp * ((3.0 * UV_SCALE) - (0.75 * u) - (5.0 * v)); // y * ((12 - 3u - 20v) / 4v)
+    let x = tmp * ((2.25 * V_SCALE / U_SCALE) * u); // y * (9u / 4v)
+    let z = tmp * ((3.0 * V_SCALE) - ((0.75 * V_SCALE / U_SCALE) * u) - (5.0 * v)); // y * ((12 - 3u - 20v) / 4v)
 
-    (x, y, z)
+    (x, y, z.max(0.0))
 }
 
 /// Decodes from 32-bit FloatLuv to Yuv.
@@ -150,7 +158,8 @@ pub fn decode(fluv32: u32) -> (f32, f32, f32) {
 /// The Y component is the luminance, and is simply the Y from CIE XYZ.
 ///
 /// The u and v components are the CIE LUV u' and v' chromaticity coordinates,
-/// but returned as `u8`s, and scaled by `UV_SCALE` to fit the range 0-255.
+/// but returned as `u8`s, and scaled by `U_SCALE` and `V_SCALE` respectively
+/// to fit the range 0-255.
 #[inline]
 pub fn decode_yuv(fluv32: u32) -> (f32, u8, u8) {
     // Check for zero.
@@ -181,10 +190,24 @@ mod tests {
         let tri = encode(fs);
         let fs2 = decode(tri);
 
-        assert_eq!(0x000056c2, tri);
+        assert_eq!(0x000056c3, tri);
         assert_eq!(fs, fs2);
     }
 
+    #[test]
+    fn all_ones() {
+        let fs = (1.0f32, 1.0f32, 1.0f32);
+
+        let tri = encode(fs);
+        let fs2 = decode(tri);
+
+        assert_eq!(0x6c0056c3, tri);
+
+        assert!((fs.0 - fs2.0).abs() < 0.0000001);
+        assert_eq!(fs.1, fs2.1);
+        assert!((fs.2 - fs2.2).abs() < 0.0000001);
+    }
+
     #[test]
     fn powers_of_two() {
         let mut n = 0.25;
@@ -259,7 +282,7 @@ mod tests {
         let fs = (1.0, 1.0, 1.0);
         let a = encode(fs);
 
-        assert_eq!((1.0, 0x56, 0xc2), decode_yuv(a));
+        assert_eq!((1.0, 0x56, 0xc3), decode_yuv(a));
     }
 
     #[test]
@@ -276,7 +299,7 @@ mod tests {
         let fs = (INFINITY, INFINITY, INFINITY);
 
         assert_eq!(Y_MAX, round_trip(fs).1);
-        assert_eq!(0xffff56c2, encode(fs));
+        assert_eq!(0xffff56c3, encode(fs));
     }
 
     #[test]
@@ -290,17 +313,17 @@ mod tests {
     #[test]
     fn underflow() {
         let fs = (Y_MIN * 0.99, Y_MIN * 0.99, Y_MIN * 0.99);
-        assert_eq!(0x000056c2, encode(fs));
+        assert_eq!(0x000056c3, encode(fs));
         assert_eq!((0.0, 0.0, 0.0), round_trip(fs));
     }
 
     #[test]
     fn negative_z_impossible() {
-        // These are very specific values, which should result in smallest
-        // possible z value (specifically z = 0.0 with no quantization) while
-        // still having positive values in x and y.
-        let fs = (248.0 / 565.0, 9827.0 / 8475.0, 0.0);
-        assert!(round_trip(fs).2 >= 0.0);
+        for y in 0..1024 {
+            let fs = (1.0, 1.0 + (y as f32 / 4096.0), 0.0);
+            let fs2 = round_trip(fs);
+            assert!(fs2.2 >= 0.0);
+        }
     }
 
     #[test]

From 9cf5ebdf914874012e70d42685eeeda82e1b1b6f Mon Sep 17 00:00:00 2001
From: Nathan Vegdahl <cessen@cessen.com>
Date: Mon, 21 Sep 2020 11:54:14 +0900
Subject: [PATCH 24/25] Cleaning up the code in fluv32 a bit.

---
 sub_crates/trifloat/src/fluv32.rs | 40 +++++++++++++------------------
 1 file changed, 16 insertions(+), 24 deletions(-)

diff --git a/sub_crates/trifloat/src/fluv32.rs b/sub_crates/trifloat/src/fluv32.rs
index 814a201..c0471fb 100644
--- a/sub_crates/trifloat/src/fluv32.rs
+++ b/sub_crates/trifloat/src/fluv32.rs
@@ -125,19 +125,10 @@ pub fn encode(xyz: (f32, f32, f32)) -> u32 {
 /// Decodes from 32-bit FloatLuv to CIE XYZ.
 #[inline]
 pub fn decode(fluv32: u32) -> (f32, f32, f32) {
-    // Check for zero.
-    if fluv32 & 0xffff0000 == 0 {
-        return (0.0, 0.0, 0.0);
-    }
-
     // Unpack values.
-    let l_exp = fluv32 >> 26;
-    let l_mant = (fluv32 >> 16) & 0x3ff;
-    let u = ((fluv32 >> 8) & 0xff) as f32; // Range 0.0-255.0
-    let v = (fluv32 & 0xff) as f32; // Range 1.0-255.0
-
-    // Calculate y.
-    let y = f32::from_bits(((l_exp + 127 - EXP_BIAS as u32) << 23) | (l_mant << 13));
+    let (y, u, v) = decode_yuv(fluv32);
+    let u = u as f32;
+    let v = v as f32;
 
     // Calculate x and z.
     // This is re-worked from the original equations, to allow a bunch of stuff
@@ -146,9 +137,10 @@ pub fn decode(fluv32: u32) -> (f32, f32, f32) {
     // We also roll the U/V_SCALE application into the final x and z formulas,
     // since some of that cancels out as well, and all of it can be avoided at
     // runtime that way.
+    const VU_RATIO: f32 = V_SCALE / U_SCALE;
     let tmp = y / v;
-    let x = tmp * ((2.25 * V_SCALE / U_SCALE) * u); // y * (9u / 4v)
-    let z = tmp * ((3.0 * V_SCALE) - ((0.75 * V_SCALE / U_SCALE) * u) - (5.0 * v)); // y * ((12 - 3u - 20v) / 4v)
+    let x = tmp * ((2.25 * VU_RATIO) * u); // y * (9u / 4v)
+    let z = tmp * ((3.0 * V_SCALE) - ((0.75 * VU_RATIO) * u) - (5.0 * v)); // y * ((12 - 3u - 20v) / 4v)
 
     (x, y, z.max(0.0))
 }
@@ -162,17 +154,17 @@ pub fn decode(fluv32: u32) -> (f32, f32, f32) {
 /// to fit the range 0-255.
 #[inline]
 pub fn decode_yuv(fluv32: u32) -> (f32, u8, u8) {
-    // Check for zero.
-    if fluv32 & 0xffff0000 == 0 {
-        return (0.0, (fluv32 >> 8) as u8, fluv32 as u8);
+    const BIAS_OFFSET: u32 = (127 - EXP_BIAS as u32) << 23;
+
+    let y = f32::from_bits(((fluv32 & 0xffff0000) >> 3) + BIAS_OFFSET);
+    let u = (fluv32 >> 8) as u8;
+    let v = fluv32 as u8;
+
+    if fluv32 <= 0xffff {
+        (0.0, u, v)
+    } else {
+        (y, u, v)
     }
-
-    // Calculate y.
-    let l_exp = fluv32 >> 26;
-    let l_mant = (fluv32 >> 16) & 0x3ff;
-    let y = f32::from_bits(((l_exp + 127 - EXP_BIAS as u32) << 23) | (l_mant << 13));
-
-    (y, (fluv32 >> 8) as u8, fluv32 as u8)
 }
 
 #[cfg(test)]

From 6d6904a615385c9320f42591abc699d490960259 Mon Sep 17 00:00:00 2001
From: Nathan Vegdahl <cessen@cessen.com>
Date: Tue, 22 Sep 2020 11:06:40 +0900
Subject: [PATCH 25/25] FLuv32: increase dynamic range, and decrease precision.

This still exceeds the precision of LogLuv, but lets us match its
dynamic range.
---
 sub_crates/trifloat/src/fluv32.rs | 113 ++++++++++++++++--------------
 1 file changed, 60 insertions(+), 53 deletions(-)

diff --git a/sub_crates/trifloat/src/fluv32.rs b/sub_crates/trifloat/src/fluv32.rs
index c0471fb..09fc1d1 100644
--- a/sub_crates/trifloat/src/fluv32.rs
+++ b/sub_crates/trifloat/src/fluv32.rs
@@ -1,4 +1,4 @@
-//! Encoding/decoding for the 32-bit FloatLuv color format.
+//! Encoding/decoding for the 32-bit FLuv32 color format.
 //!
 //! This encoding is based on, but is slightly different than, the 32-bit
 //! LogLuv format from the paper "Overcoming Gamut and Dynamic Range
@@ -6,27 +6,27 @@
 //!
 //! * It uses the same uv chroma storage approach, but with *very* slightly
 //!   tweaked scales to allow perfect representation of E.
-//! * It uses uses a floating point rather than log encoding to store
-//!   luminance, mainly for the sake of faster decoding.
-//! * It also omits the sign bit of LogLuv, foregoing negative luminance
+//! * It uses a floating point rather than log encoding to store luminance,
+//!   mainly for the sake of faster decoding.
+//! * Unlike LogLuv, this format's dynamic range is biased to put more of it
+//!   above 1.0 (see Luminance details below).
+//! * It omits the sign bit of LogLuv, foregoing negative luminance
 //!   capabilities.
 //!
-//! Compared to LogLuv, this format's chroma precision is the same and its
-//! luminance precision is better, but its luminance *range* is smaller.
-//! The supported luminance range is still substantial, however (see
-//! "Luminance details" below).
+//! This format has the same chroma precision, very slightly improved luminance
+//! precision, and the same 127-stops of dynamic range as LogLuv.
 //!
 //! Like the LogLuv format, this is an absolute rather than relative color
 //! encoding, and as such takes CIE XYZ triplets as input.  It is *not*
 //! designed to take arbitrary floating point triplets, and will perform poorly
 //! if e.g. passed RGB values.
 //!
-//! The bit layout is:
+//! The bit layout is (from most to least significant bit):
 //!
-//! 1. luminance exponent (6 bits, bias 27)
-//! 2. luminance mantissa (10 stored bits, 11 bits precision)
-//! 3. u (8 bits)
-//! 4. v (8 bits)
+//! * 7 bits: luminance exponent (bias 42)
+//! * 9 bits: luminance mantissa (implied leading 1, for 10 bits precision)
+//! * 8 bits: u'
+//! * 8 bits: v'
 //!
 //! ## Luminance details
 //!
@@ -35,21 +35,36 @@
 //! > The sun is about `10^8 cd/m^2`, and the underside of a rock on a moonless
 //! > night is probably around `10^-6` or so [...]
 //!
-//! The luminance range of this format is from about `10^11` on the brightest
-//! end, to about `10^-8` on the darkest (excluding zero itself, which can also
-//! be stored).
+//! See also Wikipedia's
+//! [list of luminance levels](https://en.wikipedia.org/wiki/Orders_of_magnitude_(luminance)).
 //!
-//! That gives this format almost five orders of magnitude more dynamic range
-//! than is likely to be needed for any practical situation.  Moreover, that
-//! extra range is split between both the high and low end, giving a
-//! comfortable buffer on both ends for extreme situations.
+//! The luminance range of the original LogLuv is about `10^-19` to `10^19`,
+//! splitting the range evenly above and below 1.0.  Given the massive dynamic
+//! range, and the fact that all day-to-day luminance levels trivially fit
+//! within that, that's a perfectly reasonable choice.
 //!
-//! Like the LogLuv format, the input CIE Y value is taken directly as the
-//! luminance value.
+//! However, there are some stellar events like supernovae that are trillions
+//! of times brighter than the sun, and would exceed `10^19`.  Conversely,
+//! there likely isn't much use for significantly smaller values than `10^-10`
+//! or so.  So although recording supernovae in physical units with a graphics
+//! format seems unlikely, it doesn't hurt to bias the range towards brighter
+//! luminance levels.
+//!
+//! With that in mind, FLuv32 uses an exponent bias of 42, putting twice as
+//! many stops of dynamic range above 1.0 as below it, giving a luminance range
+//! of roughly `10^-13` to `10^25`.  It's the same dynamic range as
+//! LogLuv (about 127 stops), but with more of that range placed above 1.0.
+//!
+//! Like typical floating point, the mantissa is treated as having an implicit
+//! leading 1, giving it an extra bit of precision.  The smallest exponent
+//! indicates a value of zero, and a valid encoding should also set the
+//! mantissa to zero in that case (denormal numbers are not supported).  The
+//! largest exponent is given no special treatment (no infinities, no NaN).
 
 #![allow(clippy::cast_lossless)]
 
-const EXP_BIAS: i32 = 27;
+const EXP_BIAS: i32 = 42;
+const BIAS_OFFSET: u32 = 127 - EXP_BIAS as u32;
 
 /// The scale factor of the quantized U component.
 pub const U_SCALE: f32 = 817.0 / 2.0;
@@ -58,13 +73,13 @@ pub const U_SCALE: f32 = 817.0 / 2.0;
 pub const V_SCALE: f32 = 1235.0 / 3.0;
 
 /// Largest representable Y component.
-pub const Y_MAX: f32 = ((1u64 << (64 - EXP_BIAS)) - (1u64 << (64 - EXP_BIAS - 11))) as f32;
+pub const Y_MAX: f32 = ((1u128 << (128 - EXP_BIAS)) - (1u128 << (128 - EXP_BIAS - 10))) as f32;
 
 /// Smallest representable non-zero Y component.
-pub const Y_MIN: f32 = 1.0 / (1u64 << (EXP_BIAS - 1)) as f32;
+pub const Y_MIN: f32 = 1.0 / (1u128 << (EXP_BIAS - 1)) as f32;
 
 /// Difference between 1.0 and the next largest representable Y value.
-pub const Y_EPSILON: f32 = 1.0 / 1024.0;
+pub const Y_EPSILON: f32 = 1.0 / 512.0;
 
 /// Encodes from CIE XYZ to 32-bit FloatLuv.
 #[inline]
@@ -99,13 +114,12 @@ pub fn encode(xyz: (f32, f32, f32)) -> u32 {
         ((u as u32) << 8) | (v as u32)
     };
 
-    let y_bits = xyz.1.to_bits();
-    let exp = (y_bits >> 23) as i32 - 127 + EXP_BIAS;
+    let y_bits = xyz.1.to_bits() & 0x7fffffff;
 
-    if exp <= 0 {
+    if y_bits < ((BIAS_OFFSET + 1) << 23) {
         // Special case: black.
         encode_uv((1.0, 1.0, 1.0))
-    } else if exp > 63 {
+    } else if y_bits >= ((BIAS_OFFSET + 128) << 23) {
         if xyz.1.is_infinite() {
             // Special case: infinity.  In this case, we don't have any
             // reasonable basis for calculating chroma, so just return
@@ -118,7 +132,7 @@ pub fn encode(xyz: (f32, f32, f32)) -> u32 {
         }
     } else {
         // Common case.
-        ((exp as u32) << 26) | ((y_bits & 0x07fe000) << 3) | encode_uv(xyz)
+        (((y_bits - (BIAS_OFFSET << 23)) << 2) & 0xffff0000) | encode_uv(xyz)
     }
 }
 
@@ -154,9 +168,7 @@ pub fn decode(fluv32: u32) -> (f32, f32, f32) {
 /// to fit the range 0-255.
 #[inline]
 pub fn decode_yuv(fluv32: u32) -> (f32, u8, u8) {
-    const BIAS_OFFSET: u32 = (127 - EXP_BIAS as u32) << 23;
-
-    let y = f32::from_bits(((fluv32 & 0xffff0000) >> 3) + BIAS_OFFSET);
+    let y = f32::from_bits(((fluv32 & 0xffff0000) >> 2) + (BIAS_OFFSET << 23));
     let u = (fluv32 >> 8) as u8;
     let v = fluv32 as u8;
 
@@ -193,11 +205,10 @@ mod tests {
         let tri = encode(fs);
         let fs2 = decode(tri);
 
-        assert_eq!(0x6c0056c3, tri);
-
-        assert!((fs.0 - fs2.0).abs() < 0.0000001);
         assert_eq!(fs.1, fs2.1);
+        assert!((fs.0 - fs2.0).abs() < 0.0000001);
         assert!((fs.2 - fs2.2).abs() < 0.0000001);
+        assert_eq!(0x540056c3, tri);
     }
 
     #[test]
@@ -221,7 +232,7 @@ mod tests {
     #[test]
     fn accuracy_01() {
         let mut n = 1.0;
-        for _ in 0..1024 {
+        for _ in 0..512 {
             let a = (n as f32, n as f32, n as f32);
             let b = round_trip(a);
 
@@ -232,7 +243,7 @@ mod tests {
             assert!(rd0 < 0.01);
             assert!(rd2 < 0.01);
 
-            n += 1.0 / 1024.0;
+            n += 1.0 / 512.0;
         }
     }
 
@@ -240,11 +251,11 @@ mod tests {
     #[should_panic]
     fn accuracy_02() {
         let mut n = 1.0;
-        for _ in 0..2048 {
+        for _ in 0..1024 {
             let a = (n as f32, n as f32, n as f32);
             let b = round_trip(a);
             assert_eq!(a.1, b.1);
-            n += 1.0 / 2048.0;
+            n += 1.0 / 1024.0;
         }
     }
 
@@ -279,7 +290,7 @@ mod tests {
 
     #[test]
     fn saturate_y() {
-        let fs = (1.0e+20, 1.0e+20, 1.0e+20);
+        let fs = (1.0e+28, 1.0e+28, 1.0e+28);
 
         assert_eq!(Y_MAX, round_trip(fs).1);
         assert_eq!(Y_MAX, decode(0xFFFFFFFF).1);
@@ -295,18 +306,14 @@ mod tests {
     }
 
     #[test]
-    fn smallest_value() {
-        let a = (Y_MIN, Y_MIN, Y_MIN);
-        let b = (Y_MIN * 0.99, Y_MIN * 0.99, Y_MIN * 0.99);
-        assert_eq!(Y_MIN, round_trip(a).1);
-        assert_eq!(0.0, round_trip(b).1);
-    }
+    fn smallest_value_and_underflow() {
+        let fs1 = (Y_MIN, Y_MIN, Y_MIN);
+        let fs2 = (Y_MIN * 0.99, Y_MIN * 0.99, Y_MIN * 0.99);
 
-    #[test]
-    fn underflow() {
-        let fs = (Y_MIN * 0.99, Y_MIN * 0.99, Y_MIN * 0.99);
-        assert_eq!(0x000056c3, encode(fs));
-        assert_eq!((0.0, 0.0, 0.0), round_trip(fs));
+        dbg!(Y_MIN);
+        assert_eq!(fs1.1, round_trip(fs1).1);
+        assert_eq!(0.0, round_trip(fs2).1);
+        assert_eq!(0x000056c3, encode(fs2));
     }
 
     #[test]