diff --git a/src/color.rs b/src/color.rs index f9515b8..67b880c 100644 --- a/src/color.rs +++ b/src/color.rs @@ -5,7 +5,7 @@ pub use color::{ use glam::Vec4; use half::f16; use spectral_upsampling::meng::{spectrum_xyz_to_p_4, EQUAL_ENERGY_REFLECTANCE}; -use trifloat::signed48; +use trifloat::fluv32; use crate::{lerp::Lerp, math::fast_exp}; @@ -142,7 +142,7 @@ impl Color { /// Returns the post-compression size of this color. pub fn compressed_size(&self) -> usize { match self { - Color::XYZ(_, _, _) => 7, + Color::XYZ(_, _, _) => 5, Color::Blackbody { .. } => 5, @@ -160,9 +160,7 @@ impl Color { match *self { Color::XYZ(x, y, z) => { out_data[0] = 0; // Discriminant - let col = signed48::encode((x, y, z)); - let col = col.to_le_bytes(); - (&mut out_data[1..7]).copy_from_slice(&col[0..6]); + (&mut out_data[1..5]).copy_from_slice(&fluv32::encode((x, y, z)).to_ne_bytes()[..]); } Color::Blackbody { @@ -200,10 +198,10 @@ impl Color { match in_data[0] { 0 => { // XYZ - let mut bytes = [0u8; 8]; - (&mut bytes[0..6]).copy_from_slice(&in_data[1..7]); - let (x, y, z) = signed48::decode(u64::from_le_bytes(bytes)); - (Color::XYZ(x, y, z), 7) + let mut bytes = [0u8; 4]; + (&mut bytes[..]).copy_from_slice(&in_data[1..5]); + let (x, y, z) = fluv32::decode(u32::from_ne_bytes(bytes)); + (Color::XYZ(x, y, z), 5) } 1 => { diff --git a/sub_crates/oct32norm/benches/bench.rs b/sub_crates/oct32norm/benches/bench.rs index 18a6940..012d7fb 100644 --- a/sub_crates/oct32norm/benches/bench.rs +++ b/sub_crates/oct32norm/benches/bench.rs @@ -1,26 +1,38 @@ use bencher::{benchmark_group, benchmark_main, black_box, Bencher}; -use oct32norm::{decode, encode}; +use oct32norm::{decode, encode, encode_precise}; use rand::{rngs::SmallRng, FromEntropy, Rng}; //---- -fn encode_100_values(bench: &mut Bencher) { +fn encode_1000_values(bench: &mut Bencher) { let mut rng = SmallRng::from_entropy(); bench.iter(|| { let x = rng.gen::() - 0.5; let y = rng.gen::() - 0.5; let z = rng.gen::() - 0.5; - for _ in 0..100 { + for _ in 0..1000 { black_box(encode(black_box((x, y, z)))); } }); } -fn decode_100_values(bench: &mut Bencher) { +fn encode_precise_1000_values(bench: &mut Bencher) { + let mut rng = SmallRng::from_entropy(); + bench.iter(|| { + let x = rng.gen::() - 0.5; + let y = rng.gen::() - 0.5; + let z = rng.gen::() - 0.5; + for _ in 0..1000 { + black_box(encode_precise(black_box((x, y, z)))); + } + }); +} + +fn decode_1000_values(bench: &mut Bencher) { let mut rng = SmallRng::from_entropy(); bench.iter(|| { let v = rng.gen::(); - for _ in 0..100 { + for _ in 0..1000 { black_box(decode(black_box(v))); } }); @@ -28,5 +40,10 @@ fn decode_100_values(bench: &mut Bencher) { //---- -benchmark_group!(benches, encode_100_values, decode_100_values,); +benchmark_group!( + benches, + encode_1000_values, + encode_precise_1000_values, + decode_1000_values, +); benchmark_main!(benches); diff --git a/sub_crates/oct32norm/src/lib.rs b/sub_crates/oct32norm/src/lib.rs index 63c085f..6f54ebb 100644 --- a/sub_crates/oct32norm/src/lib.rs +++ b/sub_crates/oct32norm/src/lib.rs @@ -4,31 +4,60 @@ //! of Efficient Representations for Independent Unit Vectors" by //! Cigolle et al. +const STEP_SIZE: f32 = 1.0 / STEPS; +const STEPS: f32 = ((1 << (16 - 1)) - 1) as f32; + /// Encodes a vector of three floats to the oct32 format. /// /// The input vector does not need to be normalized--only the direction /// matters to the encoding process, not the length. #[inline] pub fn encode(vec: (f32, f32, f32)) -> u32 { - let inv_l1_norm = 1.0f32 / (vec.0.abs() + vec.1.abs() + vec.2.abs()); + let (u, v) = vec3_to_oct(vec); + ((to_snorm_16(u) as u32) << 16) | to_snorm_16(v) as u32 +} - let (u, v) = if vec.2 < 0.0 { - ( - u32::from(to_snorm_16( - (1.0 - (vec.1 * inv_l1_norm).abs()) * sign(vec.0), - )), - u32::from(to_snorm_16( - (1.0 - (vec.0 * inv_l1_norm).abs()) * sign(vec.1), - )), - ) - } else { - ( - u32::from(to_snorm_16(vec.0 * inv_l1_norm)), - u32::from(to_snorm_16(vec.1 * inv_l1_norm)), - ) +/// Encodes a vector of three floats to the oct32 format. +/// +/// This is the same as `encode()` except that it is slower and encodes +/// with slightly better precision. +pub fn encode_precise(vec: (f32, f32, f32)) -> u32 { + #[inline(always)] + fn dot_norm(a: (f32, f32, f32), b: (f32, f32, f32)) -> f64 { + let l = ((a.0 as f64 * a.0 as f64) + (a.1 as f64 * a.1 as f64) + (a.2 as f64 * a.2 as f64)) + .sqrt(); + ((a.0 as f64 * b.0 as f64) + (a.1 as f64 * b.1 as f64) + (a.2 as f64 * b.2 as f64)) / l + } + + // Calculate the initial floored version. + let s = { + let mut s = vec3_to_oct(vec); // Remap to the square. + s.0 = (s.0.max(-1.0).min(1.0) * STEPS).floor() * STEP_SIZE; + s.1 = (s.1.max(-1.0).min(1.0) * STEPS).floor() * STEP_SIZE; + s }; - (u << 16) | v + // Test all combinations of floor and ceil and keep the best. + // Note that at +/- 1, this will exit the square, but that + // will be a worse encoding and never win. + let mut best_rep = s; + let mut max_dot = 0.0; + for &(i, j) in &[ + (0.0, 0.0), + (0.0, STEP_SIZE), + (STEP_SIZE, 0.0), + (STEP_SIZE, STEP_SIZE), + ] { + let candidate = (s.0 + i, s.1 + j); + let oct = oct_to_vec3(candidate); + let dot = dot_norm(oct, vec); + if dot > max_dot { + best_rep = candidate; + max_dot = dot; + } + } + + ((to_snorm_16(best_rep.0) as u32) << 16) | to_snorm_16(best_rep.1) as u32 } /// Decodes from an oct32 to a vector of three floats. @@ -37,30 +66,45 @@ pub fn encode(vec: (f32, f32, f32)) -> u32 { /// needs a normalized vector should normalize the returned vector. #[inline] pub fn decode(n: u32) -> (f32, f32, f32) { - let mut vec0 = from_snorm_16((n >> 16) as u16); - let mut vec1 = from_snorm_16(n as u16); - let vec2 = 1.0 - (vec0.abs() + vec1.abs()); + oct_to_vec3((from_snorm_16((n >> 16) as u16), from_snorm_16(n as u16))) +} + +#[inline(always)] +fn vec3_to_oct(vec: (f32, f32, f32)) -> (f32, f32) { + let l1_norm = vec.0.abs() + vec.1.abs() + vec.2.abs(); + let u = vec.0 / l1_norm; + let v = vec.1 / l1_norm; + + if vec.2 > 0.0 { + (u, v) + } else { + ((1.0 - v.abs()) * sign(vec.0), (1.0 - u.abs()) * sign(vec.1)) + } +} + +#[inline(always)] +fn oct_to_vec3(oct: (f32, f32)) -> (f32, f32, f32) { + let vec2 = 1.0 - (oct.0.abs() + oct.1.abs()); if vec2 < 0.0 { - let old_x = vec0; - vec0 = (1.0 - vec1.abs()) * sign(old_x); - vec1 = (1.0 - old_x.abs()) * sign(vec1); + ( + (1.0 - oct.1.abs()) * sign(oct.0), + (1.0 - oct.0.abs()) * sign(oct.1), + vec2, + ) + } else { + (oct.0, oct.1, vec2) } - - (vec0, vec1, vec2) } #[inline(always)] fn to_snorm_16(n: f32) -> u16 { - (n.max(-1.0).min(1.0) * ((1u32 << (16 - 1)) - 1) as f32).round() as i16 as u16 + (n * STEPS).round() as i16 as u16 } #[inline(always)] fn from_snorm_16(n: u16) -> f32 { - f32::from(n as i16) - * (1.0f32 / ((1u32 << (16 - 1)) - 1) as f32) - .max(-1.0) - .min(1.0) + f32::from(n as i16) * STEP_SIZE } #[inline(always)] @@ -80,21 +124,27 @@ mod tests { fn axis_directions() { let px = (1.0, 0.0, 0.0); let px_oct = encode(px); + let px_octp = encode_precise(px); let nx = (-1.0, 0.0, 0.0); let nx_oct = encode(nx); + let nx_octp = encode_precise(nx); let py = (0.0, 1.0, 0.0); let py_oct = encode(py); + let py_octp = encode_precise(py); let ny = (0.0, -1.0, 0.0); let ny_oct = encode(ny); + let ny_octp = encode_precise(ny); let pz = (0.0, 0.0, 1.0); let pz_oct = encode(pz); + let pz_octp = encode_precise(pz); let nz = (0.0, 0.0, -1.0); let nz_oct = encode(nz); + let nz_octp = encode_precise(nz); assert_eq!(px, decode(px_oct)); assert_eq!(nx, decode(nx_oct)); @@ -102,5 +152,12 @@ mod tests { assert_eq!(ny, decode(ny_oct)); assert_eq!(pz, decode(pz_oct)); assert_eq!(nz, decode(nz_oct)); + + assert_eq!(px, decode(px_octp)); + assert_eq!(nx, decode(nx_octp)); + assert_eq!(py, decode(py_octp)); + assert_eq!(ny, decode(ny_octp)); + assert_eq!(pz, decode(pz_octp)); + assert_eq!(nz, decode(nz_octp)); } } diff --git a/sub_crates/oct32norm/tests/proptest_tests.rs b/sub_crates/oct32norm/tests/proptest_tests.rs index 7cf8d71..3234055 100644 --- a/sub_crates/oct32norm/tests/proptest_tests.rs +++ b/sub_crates/oct32norm/tests/proptest_tests.rs @@ -2,7 +2,7 @@ extern crate proptest; extern crate oct32norm; -use oct32norm::{decode, encode}; +use oct32norm::{decode, encode, encode_precise}; use proptest::test_runner::Config; /// Calculates the cosine of the angle between the two vectors, @@ -56,18 +56,23 @@ proptest! { #[test] fn pt_roundtrip_angle_precision(v in (-1.0f32..1.0, -1.0f32..1.0, -1.0f32..1.0)) { let oct = encode(v); - let v2 = decode(oct); + let octp = encode_precise(v); // Check if the angle between the original and the roundtrip // is less than 0.004 degrees - assert!(cos_gt(v, v2, 0.9999999976)); + assert!(cos_gt(v, decode(oct), 0.9999999976)); + + // Check if the angle between the original and the roundtrip + // is less than 0.003 degrees + assert!(cos_gt(v, decode(octp), 0.9999999986)); } #[test] fn pt_roundtrip_component_precision(v in (-1.0f32..1.0, -1.0f32..1.0, -1.0f32..1.0)) { let oct = encode(v); - let v2 = decode(oct); + let octp = encode_precise(v); - assert!(l1_delta_lt(v, v2, 0.00005)); + assert!(l1_delta_lt(v, decode(oct), 0.00005)); + assert!(l1_delta_lt(v, decode(octp), 0.00003)); } } diff --git a/sub_crates/trifloat/benches/bench.rs b/sub_crates/trifloat/benches/bench.rs index 727661a..1e7002b 100644 --- a/sub_crates/trifloat/benches/bench.rs +++ b/sub_crates/trifloat/benches/bench.rs @@ -1,60 +1,132 @@ use bencher::{benchmark_group, benchmark_main, black_box, Bencher}; use rand::{rngs::SmallRng, FromEntropy, Rng}; -use trifloat::{signed48, unsigned32}; +use trifloat::{fluv32, signed48, unsigned32, unsigned40}; //---- -fn unsigned32_encode_100_values(bench: &mut Bencher) { +fn unsigned32_encode_1000_values(bench: &mut Bencher) { let mut rng = SmallRng::from_entropy(); bench.iter(|| { - let x = rng.gen::() - 0.5; - let y = rng.gen::() - 0.5; - let z = rng.gen::() - 0.5; - for _ in 0..100 { + let x = rng.gen::(); + let y = rng.gen::(); + let z = rng.gen::(); + for _ in 0..1000 { black_box(unsigned32::encode(black_box((x, y, z)))); } }); } -fn unsigned32_decode_100_values(bench: &mut Bencher) { +fn unsigned32_decode_1000_values(bench: &mut Bencher) { let mut rng = SmallRng::from_entropy(); bench.iter(|| { let v = rng.gen::(); - for _ in 0..100 { + for _ in 0..1000 { black_box(unsigned32::decode(black_box(v))); } }); } -fn signed48_encode_100_values(bench: &mut Bencher) { +fn unsigned40_encode_1000_values(bench: &mut Bencher) { + let mut rng = SmallRng::from_entropy(); + bench.iter(|| { + let x = rng.gen::(); + let y = rng.gen::(); + let z = rng.gen::(); + for _ in 0..1000 { + black_box(unsigned40::encode(black_box((x, y, z)))); + } + }); +} + +fn unsigned40_decode_1000_values(bench: &mut Bencher) { + let mut rng = SmallRng::from_entropy(); + bench.iter(|| { + let v = [ + rng.gen::(), + rng.gen::(), + rng.gen::(), + rng.gen::(), + rng.gen::(), + ]; + for _ in 0..1000 { + black_box(unsigned40::decode(black_box(v))); + } + }); +} + +fn signed48_encode_1000_values(bench: &mut Bencher) { let mut rng = SmallRng::from_entropy(); bench.iter(|| { let x = rng.gen::() - 0.5; let y = rng.gen::() - 0.5; let z = rng.gen::() - 0.5; - for _ in 0..100 { + for _ in 0..1000 { black_box(signed48::encode(black_box((x, y, z)))); } }); } -fn signed48_decode_100_values(bench: &mut Bencher) { +fn signed48_decode_1000_values(bench: &mut Bencher) { let mut rng = SmallRng::from_entropy(); bench.iter(|| { - let v = rng.gen::() & 0x0000_FFFF_FFFF_FFFF; - for _ in 0..100 { + let v = [ + rng.gen::(), + rng.gen::(), + rng.gen::(), + rng.gen::(), + rng.gen::(), + rng.gen::(), + ]; + for _ in 0..1000 { black_box(signed48::decode(black_box(v))); } }); } +fn fluv32_encode_1000_values(bench: &mut Bencher) { + let mut rng = SmallRng::from_entropy(); + bench.iter(|| { + let x = rng.gen::(); + let y = rng.gen::(); + let z = rng.gen::(); + for _ in 0..1000 { + black_box(fluv32::encode(black_box((x, y, z)))); + } + }); +} + +fn fluv32_decode_1000_values(bench: &mut Bencher) { + let mut rng = SmallRng::from_entropy(); + bench.iter(|| { + let v = rng.gen::(); + for _ in 0..1000 { + black_box(fluv32::decode(black_box(v))); + } + }); +} + +fn fluv32_decode_yuv_1000_values(bench: &mut Bencher) { + let mut rng = SmallRng::from_entropy(); + bench.iter(|| { + let v = rng.gen::(); + for _ in 0..1000 { + black_box(fluv32::decode_yuv(black_box(v))); + } + }); +} + //---- benchmark_group!( benches, - unsigned32_encode_100_values, - unsigned32_decode_100_values, - signed48_encode_100_values, - signed48_decode_100_values, + unsigned32_encode_1000_values, + unsigned32_decode_1000_values, + unsigned40_encode_1000_values, + unsigned40_decode_1000_values, + signed48_encode_1000_values, + signed48_decode_1000_values, + fluv32_encode_1000_values, + fluv32_decode_1000_values, + fluv32_decode_yuv_1000_values, ); benchmark_main!(benches); diff --git a/sub_crates/trifloat/src/fluv32.rs b/sub_crates/trifloat/src/fluv32.rs new file mode 100644 index 0000000..09fc1d1 --- /dev/null +++ b/sub_crates/trifloat/src/fluv32.rs @@ -0,0 +1,368 @@ +//! Encoding/decoding for the 32-bit FLuv32 color format. +//! +//! This encoding is based on, but is slightly different than, the 32-bit +//! LogLuv format from the paper "Overcoming Gamut and Dynamic Range +//! Limitations in Digital Images" by Greg Ward: +//! +//! * It uses the same uv chroma storage approach, but with *very* slightly +//! tweaked scales to allow perfect representation of E. +//! * It uses a floating point rather than log encoding to store luminance, +//! mainly for the sake of faster decoding. +//! * Unlike LogLuv, this format's dynamic range is biased to put more of it +//! above 1.0 (see Luminance details below). +//! * It omits the sign bit of LogLuv, foregoing negative luminance +//! capabilities. +//! +//! This format has the same chroma precision, very slightly improved luminance +//! precision, and the same 127-stops of dynamic range as LogLuv. +//! +//! Like the LogLuv format, this is an absolute rather than relative color +//! encoding, and as such takes CIE XYZ triplets as input. It is *not* +//! designed to take arbitrary floating point triplets, and will perform poorly +//! if e.g. passed RGB values. +//! +//! The bit layout is (from most to least significant bit): +//! +//! * 7 bits: luminance exponent (bias 42) +//! * 9 bits: luminance mantissa (implied leading 1, for 10 bits precision) +//! * 8 bits: u' +//! * 8 bits: v' +//! +//! ## Luminance details +//! +//! Quoting Greg Ward about luminance ranges: +//! +//! > The sun is about `10^8 cd/m^2`, and the underside of a rock on a moonless +//! > night is probably around `10^-6` or so [...] +//! +//! See also Wikipedia's +//! [list of luminance levels](https://en.wikipedia.org/wiki/Orders_of_magnitude_(luminance)). +//! +//! The luminance range of the original LogLuv is about `10^-19` to `10^19`, +//! splitting the range evenly above and below 1.0. Given the massive dynamic +//! range, and the fact that all day-to-day luminance levels trivially fit +//! within that, that's a perfectly reasonable choice. +//! +//! However, there are some stellar events like supernovae that are trillions +//! of times brighter than the sun, and would exceed `10^19`. Conversely, +//! there likely isn't much use for significantly smaller values than `10^-10` +//! or so. So although recording supernovae in physical units with a graphics +//! format seems unlikely, it doesn't hurt to bias the range towards brighter +//! luminance levels. +//! +//! With that in mind, FLuv32 uses an exponent bias of 42, putting twice as +//! many stops of dynamic range above 1.0 as below it, giving a luminance range +//! of roughly `10^-13` to `10^25`. It's the same dynamic range as +//! LogLuv (about 127 stops), but with more of that range placed above 1.0. +//! +//! Like typical floating point, the mantissa is treated as having an implicit +//! leading 1, giving it an extra bit of precision. The smallest exponent +//! indicates a value of zero, and a valid encoding should also set the +//! mantissa to zero in that case (denormal numbers are not supported). The +//! largest exponent is given no special treatment (no infinities, no NaN). + +#![allow(clippy::cast_lossless)] + +const EXP_BIAS: i32 = 42; +const BIAS_OFFSET: u32 = 127 - EXP_BIAS as u32; + +/// The scale factor of the quantized U component. +pub const U_SCALE: f32 = 817.0 / 2.0; + +/// The scale factor of the quantized V component. +pub const V_SCALE: f32 = 1235.0 / 3.0; + +/// Largest representable Y component. +pub const Y_MAX: f32 = ((1u128 << (128 - EXP_BIAS)) - (1u128 << (128 - EXP_BIAS - 10))) as f32; + +/// Smallest representable non-zero Y component. +pub const Y_MIN: f32 = 1.0 / (1u128 << (EXP_BIAS - 1)) as f32; + +/// Difference between 1.0 and the next largest representable Y value. +pub const Y_EPSILON: f32 = 1.0 / 512.0; + +/// Encodes from CIE XYZ to 32-bit FloatLuv. +#[inline] +pub fn encode(xyz: (f32, f32, f32)) -> u32 { + debug_assert!( + xyz.0 >= 0.0 + && xyz.1 >= 0.0 + && xyz.2 >= 0.0 + && !xyz.0.is_nan() + && !xyz.1.is_nan() + && !xyz.2.is_nan(), + "trifloat::fluv32::encode(): encoding to fluv32 only \ + works correctly for positive, non-NaN numbers, but the numbers passed \ + were: ({}, {}, {})", + xyz.0, + xyz.1, + xyz.2 + ); + + // Calculates the 16-bit encoding of the UV values for the given XYZ input. + #[inline(always)] + fn encode_uv(xyz: (f32, f32, f32)) -> u32 { + let s = xyz.0 + (15.0 * xyz.1) + (3.0 * xyz.2); + + // The `+ 0.5` is for rounding, and is not part of the normal equation. + // The minimum value of 1.0 for v is to avoid a possible divide by zero + // when decoding. A value less than 1.0 is outside the real colors, + // so we don't need to store it anyway. + let u = (((4.0 * U_SCALE) * xyz.0 / s) + 0.5).max(0.0).min(255.0); + let v = (((9.0 * V_SCALE) * xyz.1 / s) + 0.5).max(1.0).min(255.0); + + ((u as u32) << 8) | (v as u32) + }; + + let y_bits = xyz.1.to_bits() & 0x7fffffff; + + if y_bits < ((BIAS_OFFSET + 1) << 23) { + // Special case: black. + encode_uv((1.0, 1.0, 1.0)) + } else if y_bits >= ((BIAS_OFFSET + 128) << 23) { + if xyz.1.is_infinite() { + // Special case: infinity. In this case, we don't have any + // reasonable basis for calculating chroma, so just return + // the brightest white. + 0xffff0000 | encode_uv((1.0, 1.0, 1.0)) + } else { + // Special case: non-infinite, but brighter luma than can be + // represented. Return the correct chroma, and the brightest luma. + 0xffff0000 | encode_uv(xyz) + } + } else { + // Common case. + (((y_bits - (BIAS_OFFSET << 23)) << 2) & 0xffff0000) | encode_uv(xyz) + } +} + +/// Decodes from 32-bit FloatLuv to CIE XYZ. +#[inline] +pub fn decode(fluv32: u32) -> (f32, f32, f32) { + // Unpack values. + let (y, u, v) = decode_yuv(fluv32); + let u = u as f32; + let v = v as f32; + + // Calculate x and z. + // This is re-worked from the original equations, to allow a bunch of stuff + // to cancel out and avoid operations. It makes the underlying equations a + // bit non-obvious. + // We also roll the U/V_SCALE application into the final x and z formulas, + // since some of that cancels out as well, and all of it can be avoided at + // runtime that way. + const VU_RATIO: f32 = V_SCALE / U_SCALE; + let tmp = y / v; + let x = tmp * ((2.25 * VU_RATIO) * u); // y * (9u / 4v) + let z = tmp * ((3.0 * V_SCALE) - ((0.75 * VU_RATIO) * u) - (5.0 * v)); // y * ((12 - 3u - 20v) / 4v) + + (x, y, z.max(0.0)) +} + +/// Decodes from 32-bit FloatLuv to Yuv. +/// +/// The Y component is the luminance, and is simply the Y from CIE XYZ. +/// +/// The u and v components are the CIE LUV u' and v' chromaticity coordinates, +/// but returned as `u8`s, and scaled by `U_SCALE` and `V_SCALE` respectively +/// to fit the range 0-255. +#[inline] +pub fn decode_yuv(fluv32: u32) -> (f32, u8, u8) { + let y = f32::from_bits(((fluv32 & 0xffff0000) >> 2) + (BIAS_OFFSET << 23)); + let u = (fluv32 >> 8) as u8; + let v = fluv32 as u8; + + if fluv32 <= 0xffff { + (0.0, u, v) + } else { + (y, u, v) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn round_trip(floats: (f32, f32, f32)) -> (f32, f32, f32) { + decode(encode(floats)) + } + + #[test] + fn all_zeros() { + let fs = (0.0f32, 0.0f32, 0.0f32); + + let tri = encode(fs); + let fs2 = decode(tri); + + assert_eq!(0x000056c3, tri); + assert_eq!(fs, fs2); + } + + #[test] + fn all_ones() { + let fs = (1.0f32, 1.0f32, 1.0f32); + + let tri = encode(fs); + let fs2 = decode(tri); + + assert_eq!(fs.1, fs2.1); + assert!((fs.0 - fs2.0).abs() < 0.0000001); + assert!((fs.2 - fs2.2).abs() < 0.0000001); + assert_eq!(0x540056c3, tri); + } + + #[test] + fn powers_of_two() { + let mut n = 0.25; + for _ in 0..20 { + let a = (n as f32, n as f32, n as f32); + let b = round_trip(a); + + let rd0 = 2.0 * (a.0 - b.0).abs() / (a.0 + b.0); + let rd2 = 2.0 * (a.2 - b.2).abs() / (a.2 + b.2); + + assert_eq!(a.1, b.1); + assert!(rd0 < 0.01); + assert!(rd2 < 0.01); + + n *= 2.0; + } + } + + #[test] + fn accuracy_01() { + let mut n = 1.0; + for _ in 0..512 { + let a = (n as f32, n as f32, n as f32); + let b = round_trip(a); + + let rd0 = 2.0 * (a.0 - b.0).abs() / (a.0 + b.0); + let rd2 = 2.0 * (a.2 - b.2).abs() / (a.2 + b.2); + + assert_eq!(a.1, b.1); + assert!(rd0 < 0.01); + assert!(rd2 < 0.01); + + n += 1.0 / 512.0; + } + } + + #[test] + #[should_panic] + fn accuracy_02() { + let mut n = 1.0; + for _ in 0..1024 { + let a = (n as f32, n as f32, n as f32); + let b = round_trip(a); + assert_eq!(a.1, b.1); + n += 1.0 / 1024.0; + } + } + + #[test] + fn integers() { + for n in 1..=512 { + let a = (n as f32, n as f32, n as f32); + let b = round_trip(a); + + let rd0 = 2.0 * (a.0 - b.0).abs() / (a.0 + b.0); + let rd2 = 2.0 * (a.2 - b.2).abs() / (a.2 + b.2); + + assert_eq!(a.1, b.1); + assert!(rd0 < 0.01); + assert!(rd2 < 0.01); + } + } + + #[test] + fn precision_floor() { + let fs = (2049.0f32, 2049.0f32, 2049.0f32); + assert_eq!(2048.0, round_trip(fs).1); + } + + #[test] + fn decode_yuv_01() { + let fs = (1.0, 1.0, 1.0); + let a = encode(fs); + + assert_eq!((1.0, 0x56, 0xc3), decode_yuv(a)); + } + + #[test] + fn saturate_y() { + let fs = (1.0e+28, 1.0e+28, 1.0e+28); + + assert_eq!(Y_MAX, round_trip(fs).1); + assert_eq!(Y_MAX, decode(0xFFFFFFFF).1); + } + + #[test] + fn inf_saturate() { + use std::f32::INFINITY; + let fs = (INFINITY, INFINITY, INFINITY); + + assert_eq!(Y_MAX, round_trip(fs).1); + assert_eq!(0xffff56c3, encode(fs)); + } + + #[test] + fn smallest_value_and_underflow() { + let fs1 = (Y_MIN, Y_MIN, Y_MIN); + let fs2 = (Y_MIN * 0.99, Y_MIN * 0.99, Y_MIN * 0.99); + + dbg!(Y_MIN); + assert_eq!(fs1.1, round_trip(fs1).1); + assert_eq!(0.0, round_trip(fs2).1); + assert_eq!(0x000056c3, encode(fs2)); + } + + #[test] + fn negative_z_impossible() { + for y in 0..1024 { + let fs = (1.0, 1.0 + (y as f32 / 4096.0), 0.0); + let fs2 = round_trip(fs); + assert!(fs2.2 >= 0.0); + } + } + + #[test] + #[should_panic] + fn nans_01() { + encode((std::f32::NAN, 0.0, 0.0)); + } + + #[test] + #[should_panic] + fn nans_02() { + encode((0.0, std::f32::NAN, 0.0)); + } + + #[test] + #[should_panic] + fn nans_03() { + encode((0.0, 0.0, std::f32::NAN)); + } + + #[test] + #[should_panic] + fn negative_01() { + encode((-1.0, 0.0, 0.0)); + } + + #[test] + #[should_panic] + fn negative_02() { + encode((0.0, -1.0, 0.0)); + } + + #[test] + #[should_panic] + fn negative_03() { + encode((0.0, 0.0, -1.0)); + } + + #[test] + fn negative_04() { + encode((-0.0, -0.0, -0.0)); + } +} diff --git a/sub_crates/trifloat/src/lib.rs b/sub_crates/trifloat/src/lib.rs index eaefb5f..ba1350d 100644 --- a/sub_crates/trifloat/src/lib.rs +++ b/sub_crates/trifloat/src/lib.rs @@ -4,8 +4,10 @@ //! The motivating use-case for this is compactly storing HDR RGB colors. But //! it may be useful for other things as well. +pub mod fluv32; pub mod signed48; pub mod unsigned32; +pub mod unsigned40; //=========================================================================== // Some shared functions used by the other modules in this crate. diff --git a/sub_crates/trifloat/src/signed48.rs b/sub_crates/trifloat/src/signed48.rs index 5dd99bf..f43274d 100644 --- a/sub_crates/trifloat/src/signed48.rs +++ b/sub_crates/trifloat/src/signed48.rs @@ -3,9 +3,9 @@ //! The encoding uses 13 bits of mantissa and 1 sign bit per number, and 6 //! bits for the shared exponent. The bit layout is: [sign 1, mantissa 1, //! sign 2, mantissa 2, sign 3, mantissa 3, exponent]. The exponent is stored -//! as an unsigned integer with a bias of 25. +//! as an unsigned integer with a bias of 26. //! -//! The largest representable number is `2^38 - 2^25`, and the smallest +//! The largest representable number is just under `2^38`, and the smallest //! representable positive number is `2^-38`. //! //! Since the exponent is shared between all three values, the precision @@ -18,40 +18,48 @@ use crate::{fiddle_exp2, fiddle_log2}; /// Largest representable number. -pub const MAX: f32 = 274_844_352_512.0; +pub const MAX: f32 = ((1u128 << (64 - EXP_BIAS)) - (1 << (64 - EXP_BIAS - 13))) as f32; /// Smallest representable number. /// /// Note this is not the smallest _magnitude_ number. This is a negative /// number of large magnitude. -pub const MIN: f32 = -274_844_352_512.0; +pub const MIN: f32 = -MAX; /// Smallest representable positive number. /// /// This is the number with the smallest possible magnitude (aside from zero). -#[allow(clippy::excessive_precision)] -pub const MIN_POSITIVE: f32 = 0.000_000_000_003_637_978_807_091_713; +pub const MIN_POSITIVE: f32 = 1.0 / (1u128 << (EXP_BIAS + 12)) as f32; /// Difference between 1.0 and the next largest representable number. pub const EPSILON: f32 = 1.0 / 4096.0; -const EXP_BIAS: i32 = 25; -const MIN_EXP: i32 = 0 - EXP_BIAS; -const MAX_EXP: i32 = 63 - EXP_BIAS; +const EXP_BIAS: i32 = 26; /// Encodes three floating point values into a signed 48-bit trifloat. /// /// Input floats that are larger than `MAX` or smaller than `MIN` will saturate /// to `MAX` and `MIN` respectively, including +/- infinity. Values are -/// converted to trifloat precision by rounding. -/// -/// Only the lower 48 bits of the return value are used. The highest 16 bits -/// will all be zero and can be safely discarded. +/// converted to trifloat precision by rounding towards zero. /// /// Warning: NaN's are _not_ supported by the trifloat format. There are /// debug-only assertions in place to catch such values in the input floats. #[inline] -pub fn encode(floats: (f32, f32, f32)) -> u64 { +pub fn encode(floats: (f32, f32, f32)) -> [u8; 6] { + u64_to_bytes(encode_64(floats)) +} + +/// Decodes a signed 48-bit trifloat into three full floating point numbers. +/// +/// This operation is lossless and cannot fail. +#[inline] +pub fn decode(trifloat: [u8; 6]) -> (f32, f32, f32) { + decode_64(bytes_to_u64(trifloat)) +} + +// Workhorse encode function, which operates on u64. +#[inline(always)] +fn encode_64(floats: (f32, f32, f32)) -> u64 { debug_assert!( !floats.0.is_nan() && !floats.1.is_nan() && !floats.2.is_nan(), "trifloat::signed48::encode(): encoding to signed tri-floats only \ @@ -62,55 +70,36 @@ pub fn encode(floats: (f32, f32, f32)) -> u64 { floats.2 ); - // Find the largest (in magnitude) of the three values. - let largest_value = { - let mut largest_value: f32 = 0.0; - if floats.0.abs() > largest_value.abs() { - largest_value = floats.0; - } - if floats.1.abs() > largest_value.abs() { - largest_value = floats.1; - } - if floats.2.abs() > largest_value.abs() { - largest_value = floats.2; - } - largest_value - }; + let floats_abs = (floats.0.abs(), floats.1.abs(), floats.2.abs()); - // Calculate the exponent and 1.0/multiplier for encoding the values. - let (exponent, inv_multiplier) = { - let mut exponent = (fiddle_log2(largest_value) + 1).max(MIN_EXP).min(MAX_EXP); - let mut inv_multiplier = fiddle_exp2(-exponent + 13); + let largest_abs = floats_abs.0.max(floats_abs.1.max(floats_abs.2)); - // Edge-case: make sure rounding pushes the largest value up - // appropriately if needed. - if (largest_value * inv_multiplier).abs() + 0.5 >= 8192.0 { - exponent = (exponent + 1).min(MAX_EXP); - inv_multiplier = fiddle_exp2(-exponent + 13); - } - (exponent, inv_multiplier) - }; + if largest_abs < MIN_POSITIVE { + 0 + } else { + let e = fiddle_log2(largest_abs).max(-EXP_BIAS).min(63 - EXP_BIAS); + let inv_multiplier = fiddle_exp2(-e + 12); - // Quantize and encode values. - let x = (floats.0.abs() * inv_multiplier + 0.5).min(8191.0) as u64 & 0b111_11111_11111; - let x_sign = (floats.0.to_bits() >> 31) as u64; - let y = (floats.1.abs() * inv_multiplier + 0.5).min(8191.0) as u64 & 0b111_11111_11111; - let y_sign = (floats.1.to_bits() >> 31) as u64; - let z = (floats.2.abs() * inv_multiplier + 0.5).min(8191.0) as u64 & 0b111_11111_11111; - let z_sign = (floats.2.to_bits() >> 31) as u64; - let e = (exponent + EXP_BIAS) as u64 & 0b111_111; + let x_sign = (floats.0.to_bits() >> 31) as u64; + let x = (floats_abs.0 * inv_multiplier).min(8191.0) as u64; + let y_sign = (floats.1.to_bits() >> 31) as u64; + let y = (floats_abs.1 * inv_multiplier).min(8191.0) as u64; + let z_sign = (floats.2.to_bits() >> 31) as u64; + let z = (floats_abs.2 * inv_multiplier).min(8191.0) as u64; - // Pack values into a single u64 and return. - (x_sign << 47) | (x << 34) | (y_sign << 33) | (y << 20) | (z_sign << 19) | (z << 6) | e + (x_sign << 47) + | (x << 34) + | (y_sign << 33) + | (y << 20) + | (z_sign << 19) + | (z << 6) + | (e + EXP_BIAS) as u64 + } } -/// Decodes a signed 48-bit trifloat into three full floating point numbers. -/// -/// This operation is lossless and cannot fail. Only the lower 48 bits of the -/// input value are used--the upper 16 bits can safely be anything and are -/// ignored. -#[inline] -pub fn decode(trifloat: u64) -> (f32, f32, f32) { +// Workhorse decode function, which operates on u64. +#[inline(always)] +fn decode_64(trifloat: u64) -> (f32, f32, f32) { // Unpack values. let x = (trifloat >> 34) & 0b111_11111_11111; let y = (trifloat >> 20) & 0b111_11111_11111; @@ -122,7 +111,7 @@ pub fn decode(trifloat: u64) -> (f32, f32, f32) { let e = trifloat & 0b111_111; - let multiplier = fiddle_exp2(e as i32 - EXP_BIAS - 13); + let multiplier = fiddle_exp2(e as i32 - EXP_BIAS - 12); ( f32::from_bits((x as f32 * multiplier).to_bits() | x_sign), @@ -131,6 +120,29 @@ pub fn decode(trifloat: u64) -> (f32, f32, f32) { ) } +#[inline(always)] +fn u64_to_bytes(n: u64) -> [u8; 6] { + let a = n.to_ne_bytes(); + let mut b = [0u8; 6]; + if cfg!(target_endian = "big") { + (&mut b[..]).copy_from_slice(&a[2..8]); + } else { + (&mut b[..]).copy_from_slice(&a[0..6]); + } + b +} + +#[inline(always)] +fn bytes_to_u64(a: [u8; 6]) -> u64 { + let mut b = [0u8; 8]; + if cfg!(target_endian = "big") { + (&mut b[2..8]).copy_from_slice(&a[..]); + } else { + (&mut b[0..6]).copy_from_slice(&a[..]); + } + u64::from_ne_bytes(b) +} + #[cfg(test)] mod tests { use super::*; @@ -143,8 +155,8 @@ mod tests { fn all_zeros() { let fs = (0.0f32, 0.0f32, 0.0f32); - let tri = encode(fs); - let fs2 = decode(tri); + let tri = encode_64(fs); + let fs2 = decode_64(tri); assert_eq!(tri, 0); assert_eq!(fs, fs2); @@ -153,7 +165,7 @@ mod tests { #[test] fn powers_of_two() { let fs = (8.0f32, 128.0f32, 0.5f32); - assert_eq!(round_trip(fs), fs); + assert_eq!(fs, round_trip(fs)); } #[test] @@ -196,18 +208,11 @@ mod tests { } #[test] - fn rounding() { + fn precision_floor() { let fs = (7.0f32, 8193.0f32, -1.0f32); let fsn = (-7.0f32, -8193.0f32, 1.0f32); - assert_eq!(round_trip(fs), (8.0, 8194.0, -2.0)); - assert_eq!(round_trip(fsn), (-8.0, -8194.0, 2.0)); - } - - #[test] - fn rounding_edge_case() { - let fs = (16383.0f32, 0.0f32, 0.0f32); - - assert_eq!(round_trip(fs), (16384.0, 0.0, 0.0),); + assert_eq!((6.0, 8192.0, -0.0), round_trip(fs)); + assert_eq!((-6.0, -8192.0, 0.0), round_trip(fsn)); } #[test] @@ -223,10 +228,10 @@ mod tests { -99_999_999_999_999.0, ); - assert_eq!(round_trip(fs), (MAX, MAX, MAX)); - assert_eq!(round_trip(fsn), (MIN, MIN, MIN)); - assert_eq!(decode(0x7FFD_FFF7_FFFF), (MAX, MAX, MAX)); - assert_eq!(decode(0xFFFF_FFFF_FFFF), (MIN, MIN, MIN)); + assert_eq!((MAX, MAX, MAX), round_trip(fs)); + assert_eq!((MIN, MIN, MIN), round_trip(fsn)); + assert_eq!((MAX, MAX, MAX), decode_64(0x7FFD_FFF7_FFFF)); + assert_eq!((MIN, MIN, MIN), decode_64(0xFFFF_FFFF_FFFF)); } #[test] @@ -235,10 +240,10 @@ mod tests { let fs = (INFINITY, 0.0, 0.0); let fsn = (-INFINITY, 0.0, 0.0); - assert_eq!(round_trip(fs), (MAX, 0.0, 0.0)); - assert_eq!(round_trip(fsn), (MIN, 0.0, 0.0)); - assert_eq!(encode(fs), 0x7FFC0000003F); - assert_eq!(encode(fsn), 0xFFFC0000003F); + assert_eq!((MAX, 0.0, 0.0), round_trip(fs)); + assert_eq!((MIN, 0.0, 0.0), round_trip(fsn)); + assert_eq!(0x7FFC0000003F, encode_64(fs)); + assert_eq!(0xFFFC0000003F, encode_64(fsn)); } #[test] @@ -246,25 +251,25 @@ mod tests { let fs = (99_999_999_999_999.0, 4294967296.0, -17179869184.0); let fsn = (-99_999_999_999_999.0, 4294967296.0, -17179869184.0); - assert_eq!(round_trip(fs), (MAX, 4294967296.0, -17179869184.0)); - assert_eq!(round_trip(fsn), (MIN, 4294967296.0, -17179869184.0)); + assert_eq!((MAX, 4294967296.0, -17179869184.0), round_trip(fs)); + assert_eq!((MIN, 4294967296.0, -17179869184.0), round_trip(fsn)); } #[test] fn smallest_value() { - let fs = (MIN_POSITIVE, MIN_POSITIVE * 0.5, MIN_POSITIVE * 0.49); - let fsn = (-MIN_POSITIVE, -MIN_POSITIVE * 0.5, -MIN_POSITIVE * 0.49); + let fs = (MIN_POSITIVE * 1.5, MIN_POSITIVE, MIN_POSITIVE * 0.50); + let fsn = (-MIN_POSITIVE * 1.5, -MIN_POSITIVE, -MIN_POSITIVE * 0.50); - assert_eq!(decode(0x600100000), (MIN_POSITIVE, -MIN_POSITIVE, 0.0)); - assert_eq!(round_trip(fs), (MIN_POSITIVE, MIN_POSITIVE, 0.0)); - assert_eq!(round_trip(fsn), (-MIN_POSITIVE, -MIN_POSITIVE, -0.0)); + assert_eq!((MIN_POSITIVE, -MIN_POSITIVE, 0.0), decode_64(0x600100000)); + assert_eq!((MIN_POSITIVE, MIN_POSITIVE, 0.0), round_trip(fs)); + assert_eq!((-MIN_POSITIVE, -MIN_POSITIVE, -0.0), round_trip(fsn)); } #[test] fn underflow() { - let fs = (MIN_POSITIVE * 0.49, -MIN_POSITIVE * 0.49, 0.0); - assert_eq!(encode(fs), 0x200000000); - assert_eq!(round_trip(fs), (0.0, -0.0, 0.0)); + let fs = (MIN_POSITIVE * 0.5, -MIN_POSITIVE * 0.5, MIN_POSITIVE); + assert_eq!(0x200000040, encode_64(fs)); + assert_eq!((0.0, -0.0, MIN_POSITIVE), round_trip(fs)); } #[test] @@ -273,13 +278,13 @@ mod tests { let fs2 = (-63456254.2, 5235423.53, 54353.3); let fs3 = (-0.000000634, 0.00000000005, 0.00000000892); - let n1 = encode(fs1); - let n2 = encode(fs2); - let n3 = encode(fs3); + let n1 = encode_64(fs1); + let n2 = encode_64(fs2); + let n3 = encode_64(fs3); - assert_eq!(decode(n1), decode(n1 | 0xffff_0000_0000_0000)); - assert_eq!(decode(n2), decode(n2 | 0xffff_0000_0000_0000)); - assert_eq!(decode(n3), decode(n3 | 0xffff_0000_0000_0000)); + assert_eq!(decode_64(n1), decode_64(n1 | 0xffff_0000_0000_0000)); + assert_eq!(decode_64(n2), decode_64(n2 | 0xffff_0000_0000_0000)); + assert_eq!(decode_64(n3), decode_64(n3 | 0xffff_0000_0000_0000)); } #[test] diff --git a/sub_crates/trifloat/src/unsigned32.rs b/sub_crates/trifloat/src/unsigned32.rs index 56baf1f..700e45a 100644 --- a/sub_crates/trifloat/src/unsigned32.rs +++ b/sub_crates/trifloat/src/unsigned32.rs @@ -2,7 +2,7 @@ //! //! The encoding uses 9 bits of mantissa per number, and 5 bits for the shared //! exponent. The bit layout is [mantissa 1, mantissa 2, mantissa 3, exponent]. -//! The exponent is stored as an unsigned integer with a bias of 10. +//! The exponent is stored as an unsigned integer with a bias of 11. //! //! The largest representable number is `2^21 - 4096`, and the smallest //! representable non-zero number is `2^-19`. @@ -14,22 +14,20 @@ use crate::{fiddle_exp2, fiddle_log2}; /// Largest representable number. -pub const MAX: f32 = 2_093_056.0; +pub const MAX: f32 = ((1u64 << (32 - EXP_BIAS)) - (1 << (32 - EXP_BIAS - 9))) as f32; /// Smallest representable non-zero number. -pub const MIN: f32 = 0.000_001_907_348_6; +pub const MIN: f32 = 1.0 / (1 << (EXP_BIAS + 8)) as f32; /// Difference between 1.0 and the next largest representable number. pub const EPSILON: f32 = 1.0 / 256.0; -const EXP_BIAS: i32 = 10; -const MIN_EXP: i32 = 0 - EXP_BIAS; -const MAX_EXP: i32 = 31 - EXP_BIAS; +const EXP_BIAS: i32 = 11; -/// Encodes three floating point values into a signed 32-bit trifloat. +/// Encodes three floating point values into an unsigned 32-bit trifloat. /// /// Input floats larger than `MAX` will saturate to `MAX`, including infinity. -/// Values are converted to trifloat precision by rounding. +/// Values are converted to trifloat precision by rounding down. /// /// Warning: negative values and NaN's are _not_ supported by the trifloat /// format. There are debug-only assertions in place to catch such @@ -51,31 +49,19 @@ pub fn encode(floats: (f32, f32, f32)) -> u32 { floats.2 ); - // Find the largest of the three values. - let largest_value = floats.0.max(floats.1.max(floats.2)); - if largest_value <= 0.0 { + let largest = floats.0.max(floats.1.max(floats.2)); + + if largest < MIN { return 0; + } else { + let e = fiddle_log2(largest).max(-EXP_BIAS).min(31 - EXP_BIAS); + let inv_multiplier = fiddle_exp2(-e + 8); + let x = (floats.0 * inv_multiplier).min(511.0) as u32; + let y = (floats.1 * inv_multiplier).min(511.0) as u32; + let z = (floats.2 * inv_multiplier).min(511.0) as u32; + + (x << (9 + 9 + 5)) | (y << (9 + 5)) | (z << 5) | (e + EXP_BIAS) as u32 } - - // Calculate the exponent and 1.0/multiplier for encoding the values. - let mut exponent = (fiddle_log2(largest_value) + 1).max(MIN_EXP).min(MAX_EXP); - let mut inv_multiplier = fiddle_exp2(-exponent + 9); - - // Edge-case: make sure rounding pushes the largest value up - // appropriately if needed. - if (largest_value * inv_multiplier) + 0.5 >= 512.0 { - exponent = (exponent + 1).min(MAX_EXP); - inv_multiplier = fiddle_exp2(-exponent + 9); - } - - // Quantize and encode values. - let x = (floats.0 * inv_multiplier + 0.5).min(511.0) as u32 & 0b1_1111_1111; - let y = (floats.1 * inv_multiplier + 0.5).min(511.0) as u32 & 0b1_1111_1111; - let z = (floats.2 * inv_multiplier + 0.5).min(511.0) as u32 & 0b1_1111_1111; - let e = (exponent + EXP_BIAS) as u32 & 0b1_1111; - - // Pack values into a u32. - (x << (5 + 9 + 9)) | (y << (5 + 9)) | (z << 5) | e } /// Decodes an unsigned 32-bit trifloat into three full floating point numbers. @@ -84,12 +70,12 @@ pub fn encode(floats: (f32, f32, f32)) -> u32 { #[inline] pub fn decode(trifloat: u32) -> (f32, f32, f32) { // Unpack values. - let x = trifloat >> (5 + 9 + 9); - let y = (trifloat >> (5 + 9)) & 0b1_1111_1111; + let x = trifloat >> (9 + 9 + 5); + let y = (trifloat >> (9 + 5)) & 0b1_1111_1111; let z = (trifloat >> 5) & 0b1_1111_1111; let e = trifloat & 0b1_1111; - let multiplier = fiddle_exp2(e as i32 - EXP_BIAS - 9); + let multiplier = fiddle_exp2(e as i32 - EXP_BIAS - 8); ( x as f32 * multiplier, @@ -120,11 +106,11 @@ mod tests { #[test] fn powers_of_two() { let fs = (8.0f32, 128.0f32, 0.5f32); - assert_eq!(round_trip(fs), fs); + assert_eq!(fs, round_trip(fs)); } #[test] - fn accuracy() { + fn accuracy_01() { let mut n = 1.0; for _ in 0..256 { let (x, _, _) = round_trip((n, 0.0, 0.0)); @@ -133,6 +119,17 @@ mod tests { } } + #[test] + #[should_panic] + fn accuracy_02() { + let mut n = 1.0; + for _ in 0..512 { + let (x, _, _) = round_trip((n, 0.0, 0.0)); + assert_eq!(n, x); + n += 1.0 / 512.0; + } + } + #[test] fn integers() { for n in 0..=512 { @@ -142,24 +139,17 @@ mod tests { } #[test] - fn rounding() { + fn precision_floor() { let fs = (7.0f32, 513.0f32, 1.0f32); - assert_eq!(round_trip(fs), (8.0, 514.0, 2.0)); - } - - #[test] - fn rounding_edge_case() { - let fs = (1023.0f32, 0.0f32, 0.0f32); - - assert_eq!(round_trip(fs), (1024.0, 0.0, 0.0),); + assert_eq!((6.0, 512.0, 0.0), round_trip(fs)); } #[test] fn saturate() { let fs = (9999999999.0, 9999999999.0, 9999999999.0); - assert_eq!(round_trip(fs), (MAX, MAX, MAX)); - assert_eq!(decode(0xFFFFFFFF), (MAX, MAX, MAX),); + assert_eq!((MAX, MAX, MAX), round_trip(fs)); + assert_eq!((MAX, MAX, MAX), decode(0xFFFFFFFF)); } #[test] @@ -167,29 +157,29 @@ mod tests { use std::f32::INFINITY; let fs = (INFINITY, 0.0, 0.0); - assert_eq!(round_trip(fs), (MAX, 0.0, 0.0)); - assert_eq!(encode(fs), 0xFF80001F,); + assert_eq!((MAX, 0.0, 0.0), round_trip(fs)); + assert_eq!(0xFF80001F, encode(fs)); } #[test] fn partial_saturate() { let fs = (9999999999.0, 4096.0, 262144.0); - assert_eq!(round_trip(fs), (MAX, 4096.0, 262144.0)); + assert_eq!((MAX, 4096.0, 262144.0), round_trip(fs)); } #[test] fn smallest_value() { - let fs = (MIN, MIN * 0.5, MIN * 0.49); - assert_eq!(round_trip(fs), (MIN, MIN, 0.0)); - assert_eq!(decode(0x00_80_40_00), (MIN, MIN, 0.0)); + let fs = (MIN * 1.5, MIN, MIN * 0.5); + assert_eq!((MIN, MIN, 0.0), round_trip(fs)); + assert_eq!((MIN, MIN, 0.0), decode(0x00_80_40_00)); } #[test] fn underflow() { - let fs = (MIN * 0.49, 0.0, 0.0); - assert_eq!(encode(fs), 0); - assert_eq!(round_trip(fs), (0.0, 0.0, 0.0)); + let fs = (MIN * 0.99, 0.0, 0.0); + assert_eq!(0, encode(fs)); + assert_eq!((0.0, 0.0, 0.0), round_trip(fs)); } #[test] diff --git a/sub_crates/trifloat/src/unsigned40.rs b/sub_crates/trifloat/src/unsigned40.rs new file mode 100644 index 0000000..3766d41 --- /dev/null +++ b/sub_crates/trifloat/src/unsigned40.rs @@ -0,0 +1,267 @@ +//! Encoding/decoding for unsigned 40-bit trifloat numbers. +//! +//! The encoding uses 11 bits of mantissa per number, and 7 bits for the shared +//! exponent. The bit layout is [mantissa 1, mantissa 2, mantissa 3, exponent]. +//! The exponent is stored as an unsigned integer with a bias of 32. +//! +//! The largest representable number is just under `2^96`, and the smallest +//! representable non-zero number is `2^-42`. +//! +//! Since the exponent is shared between the three values, the precision +//! of all three values depends on the largest of the three. All integers +//! up to 2048 can be represented exactly in the largest value. + +use crate::{fiddle_exp2, fiddle_log2}; + +/// Largest representable number. +pub const MAX: f32 = ((1u128 << (128 - EXP_BIAS)) - (1 << (128 - EXP_BIAS - 11))) as f32; + +/// Smallest representable non-zero number. +pub const MIN: f32 = 1.0 / (1u128 << (EXP_BIAS + 10)) as f32; + +/// Difference between 1.0 and the next largest representable number. +pub const EPSILON: f32 = 1.0 / 1024.0; + +const EXP_BIAS: i32 = 32; + +/// Encodes three floating point values into an unsigned 40-bit trifloat. +/// +/// Input floats larger than `MAX` will saturate to `MAX`, including infinity. +/// Values are converted to trifloat precision by rounding down. +/// +/// Warning: negative values and NaN's are _not_ supported by the trifloat +/// format. There are debug-only assertions in place to catch such +/// values in the input floats. +#[inline] +pub fn encode(floats: (f32, f32, f32)) -> [u8; 5] { + u64_to_bytes(encode_64(floats)) +} + +/// Decodes an unsigned 40-bit trifloat into three full floating point numbers. +/// +/// This operation is lossless and cannot fail. +#[inline] +pub fn decode(trifloat: [u8; 5]) -> (f32, f32, f32) { + decode_64(bytes_to_u64(trifloat)) +} + +// Workhorse encode function, which operates on u64. +#[inline(always)] +fn encode_64(floats: (f32, f32, f32)) -> u64 { + debug_assert!( + floats.0 >= 0.0 + && floats.1 >= 0.0 + && floats.2 >= 0.0 + && !floats.0.is_nan() + && !floats.1.is_nan() + && !floats.2.is_nan(), + "trifloat::unsigned32::encode(): encoding to unsigned tri-floats only \ + works correctly for positive, non-NaN numbers, but the numbers passed \ + were: ({}, {}, {})", + floats.0, + floats.1, + floats.2 + ); + + let largest = floats.0.max(floats.1.max(floats.2)); + + if largest < MIN { + return 0; + } else { + let e = fiddle_log2(largest).max(-EXP_BIAS).min(127 - EXP_BIAS); + let inv_multiplier = fiddle_exp2(-e + 10); + let x = (floats.0 * inv_multiplier).min(2047.0) as u64; + let y = (floats.1 * inv_multiplier).min(2047.0) as u64; + let z = (floats.2 * inv_multiplier).min(2047.0) as u64; + + (x << (11 + 11 + 7)) | (y << (11 + 7)) | (z << 7) | (e + EXP_BIAS) as u64 + } +} + +// Workhorse decode function, which operates on u64. +#[inline(always)] +fn decode_64(trifloat: u64) -> (f32, f32, f32) { + // Unpack values. + let x = trifloat >> (11 + 11 + 7); + let y = (trifloat >> (11 + 7)) & 0b111_1111_1111; + let z = (trifloat >> 7) & 0b111_1111_1111; + let e = trifloat & 0b111_1111; + + let multiplier = fiddle_exp2(e as i32 - EXP_BIAS - 10); + + ( + x as f32 * multiplier, + y as f32 * multiplier, + z as f32 * multiplier, + ) +} + +#[inline(always)] +fn u64_to_bytes(n: u64) -> [u8; 5] { + let a = n.to_ne_bytes(); + let mut b = [0u8; 5]; + if cfg!(target_endian = "big") { + (&mut b[..]).copy_from_slice(&a[3..8]); + } else { + (&mut b[..]).copy_from_slice(&a[0..5]); + } + b +} + +#[inline(always)] +fn bytes_to_u64(a: [u8; 5]) -> u64 { + let mut b = [0u8; 8]; + if cfg!(target_endian = "big") { + (&mut b[3..8]).copy_from_slice(&a[..]); + } else { + (&mut b[0..5]).copy_from_slice(&a[..]); + } + u64::from_ne_bytes(b) +} + +#[cfg(test)] +mod tests { + use super::*; + + fn round_trip(floats: (f32, f32, f32)) -> (f32, f32, f32) { + decode(encode(floats)) + } + + #[test] + fn all_zeros() { + let fs = (0.0f32, 0.0f32, 0.0f32); + + let tri = encode_64(fs); + let fs2 = decode_64(tri); + + assert_eq!(tri, 0u64); + assert_eq!(fs, fs2); + } + + #[test] + fn powers_of_two() { + let fs = (8.0f32, 128.0f32, 0.5f32); + assert_eq!(fs, round_trip(fs)); + } + + #[test] + fn accuracy_01() { + let mut n = 1.0; + for _ in 0..1024 { + let (x, _, _) = round_trip((n, 0.0, 0.0)); + assert_eq!(n, x); + n += 1.0 / 1024.0; + } + } + + #[test] + #[should_panic] + fn accuracy_02() { + let mut n = 1.0; + for _ in 0..2048 { + let (x, _, _) = round_trip((n, 0.0, 0.0)); + assert_eq!(n, x); + n += 1.0 / 2048.0; + } + } + + #[test] + fn integers() { + for n in 0..=2048 { + let (x, _, _) = round_trip((n as f32, 0.0, 0.0)); + assert_eq!(n as f32, x); + } + } + + #[test] + fn precision_floor() { + let fs = (7.0f32, 2049.0f32, 1.0f32); + assert_eq!((6.0, 2048.0, 0.0), round_trip(fs)); + } + + #[test] + fn saturate() { + let fs = (1.0e+30, 1.0e+30, 1.0e+30); + + assert_eq!((MAX, MAX, MAX), round_trip(fs)); + assert_eq!((MAX, MAX, MAX), decode_64(0xff_ffff_ffff)); + } + + #[test] + fn inf_saturate() { + use std::f32::INFINITY; + let fs = (INFINITY, 0.0, 0.0); + + assert_eq!((MAX, 0.0, 0.0), round_trip(fs)); + assert_eq!(0xffe000007f, encode_64(fs)); + } + + #[test] + fn partial_saturate() { + let fs = ( + 1.0e+30, + (1u128 << (128 - EXP_BIAS - 11)) as f32, + (1u128 << (128 - EXP_BIAS - 12)) as f32, + ); + + assert_eq!( + (MAX, (1u128 << (128 - EXP_BIAS - 11)) as f32, 0.0), + round_trip(fs) + ); + } + + #[test] + fn smallest_value() { + let fs = (MIN * 1.5, MIN, MIN * 0.5); + assert_eq!((MIN, MIN, 0.0), round_trip(fs)); + assert_eq!((MIN, MIN, 0.0), decode_64(0x20_04_00_00)); + } + + #[test] + fn underflow() { + let fs = (MIN * 0.99, 0.0, 0.0); + assert_eq!(0, encode_64(fs)); + assert_eq!((0.0, 0.0, 0.0), round_trip(fs)); + } + + #[test] + #[should_panic] + fn nans_01() { + encode((std::f32::NAN, 0.0, 0.0)); + } + + #[test] + #[should_panic] + fn nans_02() { + encode((0.0, std::f32::NAN, 0.0)); + } + + #[test] + #[should_panic] + fn nans_03() { + encode((0.0, 0.0, std::f32::NAN)); + } + + #[test] + #[should_panic] + fn negative_01() { + encode((-1.0, 0.0, 0.0)); + } + + #[test] + #[should_panic] + fn negative_02() { + encode((0.0, -1.0, 0.0)); + } + + #[test] + #[should_panic] + fn negative_03() { + encode((0.0, 0.0, -1.0)); + } + + #[test] + fn negative_04() { + encode((-0.0, -0.0, -0.0)); + } +}