diff --git a/sub_crates/trifloat/src/lib.rs b/sub_crates/trifloat/src/lib.rs index 2fb4556..eaefb5f 100644 --- a/sub_crates/trifloat/src/lib.rs +++ b/sub_crates/trifloat/src/lib.rs @@ -1,4 +1,34 @@ +//! Functions for storing triplets of floating point values in a +//! shared-exponent format. +//! +//! The motivating use-case for this is compactly storing HDR RGB colors. But +//! it may be useful for other things as well. + pub mod signed48; -/// This crate provides types and functions for storing triplets of floating -/// point values in a shared-exponent format. pub mod unsigned32; + +//=========================================================================== +// Some shared functions used by the other modules in this crate. + +/// Calculates 2.0^exp using IEEE bit fiddling. +/// +/// Only works for integer exponents in the range [-126, 127] +/// due to IEEE 32-bit float limits. +#[inline(always)] +fn fiddle_exp2(exp: i32) -> f32 { + use std::f32; + f32::from_bits(((exp + 127) as u32) << 23) +} + +/// Calculates floor(log2(n)) using IEEE bit fiddling. +/// +/// Because of IEEE floating point format, infinity and NaN +/// floating point values return 128, and subnormal numbers always +/// return -127. These particular behaviors are not, of course, +/// mathematically correct, but are actually desirable for the +/// calculations in this library. +#[inline(always)] +fn fiddle_log2(n: f32) -> i32 { + use std::f32; + ((f32::to_bits(n) >> 23) & 0b1111_1111) as i32 - 127 +} diff --git a/sub_crates/trifloat/src/signed48.rs b/sub_crates/trifloat/src/signed48.rs index 7cc9f21..24f4437 100644 --- a/sub_crates/trifloat/src/signed48.rs +++ b/sub_crates/trifloat/src/signed48.rs @@ -1,57 +1,59 @@ -//! Encoding/decoding for a 48-bit shared-exponent representation of three -//! signed floating point numbers. +//! Encoding/decoding for signed 48-bit trifloat numbers. //! -//! This is useful for e.g. compactly storing HDR colors. The encoding -//! 
uses 14 bits of mantissa per number (including the sign bit for each) and 6 -//! bits for the shared exponent. The bit layout is [mantissa 1, mantissa 2, -//! mantissa 3, exponent]. The exponent is stored as an unsigned integer with -//! a bias of 32. The mantissas are stored as a single leading sign bit and 13 -//! bits of unsigned integer. +//! The encoding uses 13 bits of mantissa and 1 sign bit per number, and 6 +//! bits for the shared exponent. The bit layout is: [sign 1, mantissa 1, +//! sign 2, mantissa 2, sign 3, mantissa 3, exponent]. The exponent is stored +//! as an unsigned integer with a bias of 25. //! -//! The largest representable number is ?, and the smallest -//! representable positive number is ?. +//! The largest representable number is `2^38 - 2^25`, and the smallest +//! representable positive number is `2^-38`. //! -//! Since the exponent is shared between the three values, the precision -//! of all three values depends on the largest (in absolute value) of the -//! three. All integers in the range [-8191, 8191] can be represented exactly -//! in the largest value. +//! Since the exponent is shared between all three values, the precision +//! of all three values depends on the largest (in magnitude) of the three. +//! All integers in the range `[-8192, 8192]` can be represented exactly in the +//! largest value. + +use crate::{fiddle_exp2, fiddle_log2}; /// Largest representable number. -pub const MAX: f32 = 35_180_077_121_536.0; +pub const MAX: f32 = 274_844_352_512.0; -/// Smallest representable non-zero number. -pub const MIN_POSITIVE: f32 = 0.000_000_000_465_661_287; +/// Smallest representable number. +/// +/// Note this is not the smallest _magnitude_ number. This is a negative +/// number of large magnitude. +pub const MIN: f32 = -274_844_352_512.0; -pub const MIN: f32 = -35_180_077_121_536.0; +/// Smallest representable positive number. +/// +/// This is the number with the smallest possible magnitude (aside from zero). 
+pub const MIN_POSITIVE: f32 = 0.000_000_000_003_637_978_807_091_713; /// Difference between 1.0 and the next largest representable number. pub const EPSILON: f32 = 1.0 / 4096.0; -const EXP_BIAS: i32 = 31 - 13; +const EXP_BIAS: i32 = 25; const MIN_EXP: i32 = 0 - EXP_BIAS; const MAX_EXP: i32 = 63 - EXP_BIAS; -/// Encodes three floating point values into a 48-bit trifloat format. +/// Encodes three floating point values into a signed 48-bit trifloat. /// -/// Note that even though the return value is a u64, only the lower 48 -/// bits are used. +/// Input floats that are larger than `MAX` or smaller than `MIN` will saturate +/// to `MAX` and `MIN` respectively, including +/- infinity. Values are +/// converted to trifloat precision by rounding. /// -/// Floats that are larger than the max representable value in trifloat -/// will saturate. Values are converted to trifloat by rounding, so the -/// max error introduced by this function is epsilon / 2. +/// Only the lower 48 bits of the return value are used. The highest 16 bits +/// will all be zero and can be safely discarded. /// -/// Warning: NaN's are _not_ supported by the trifloat -/// format. There are debug-only assertions in place to catch such -/// values in the input floats. Infinity is also not supported in the -/// format, but will simply saturate to the largest-magnitude representable -/// value. +/// Warning: NaN's are _not_ supported by the trifloat format. There are +/// debug-only assertions in place to catch such values in the input floats. 
#[inline] pub fn encode(floats: (f32, f32, f32)) -> u64 { debug_assert!( !floats.0.is_nan() && !floats.1.is_nan() && !floats.2.is_nan(), - "trifloat::s48::encode(): encoding to signed 48-bit tri-floats only works correctly for \ - non-NaN numbers, but the numbers passed were: ({}, \ - {}, {})", + "trifloat::signed48::encode(): encoding to signed tri-floats only \ + works correctly for non-NaN numbers, but the numbers passed were: \ + ({}, {}, {})", floats.0, floats.1, floats.2 @@ -79,7 +81,7 @@ pub fn encode(floats: (f32, f32, f32)) -> u64 { // Edge-case: make sure rounding pushes the largest value up // appropriately if needed. - if (largest_value * inv_multiplier).abs() + 0.5 >= 8191.0 { + if (largest_value * inv_multiplier).abs() + 0.5 >= 8192.0 { exponent = (exponent + 1).min(MAX_EXP); inv_multiplier = fiddle_exp2(-exponent + 13); } @@ -99,52 +101,33 @@ pub fn encode(floats: (f32, f32, f32)) -> u64 { (x_sign << 47) | (x << 34) | (y_sign << 33) | (y << 20) | (z_sign << 19) | (z << 6) | e } -/// Decodes a 48-bit trifloat into three full floating point numbers. +/// Decodes a signed 48-bit trifloat into three full floating point numbers. /// -/// This operation is lossless and cannot fail. +/// This operation is lossless and cannot fail. Only the lower 48 bits of the +/// input value are used--the upper 16 bits can safely be anything and are +/// ignored. #[inline] pub fn decode(trifloat: u64) -> (f32, f32, f32) { // Unpack values. 
- let x_sign = (trifloat >> 47) as u32; let x = (trifloat >> 34) & 0b111_11111_11111; - let y_sign = ((trifloat >> 33) & 1) as u32; let y = (trifloat >> 20) & 0b111_11111_11111; - let z_sign = ((trifloat >> 19) & 1) as u32; let z = (trifloat >> 6) & 0b111_11111_11111; + + let x_sign = ((trifloat >> 16) & 0x8000_0000) as u32; + let y_sign = ((trifloat >> 2) & 0x8000_0000) as u32; + let z_sign = ((trifloat << 12) & 0x8000_0000) as u32; + let e = trifloat & 0b111_111; let multiplier = fiddle_exp2(e as i32 - EXP_BIAS - 13); ( - f32::from_bits((x as f32 * multiplier).to_bits() | (x_sign << 31)), - f32::from_bits((y as f32 * multiplier).to_bits() | (y_sign << 31)), - f32::from_bits((z as f32 * multiplier).to_bits() | (z_sign << 31)), + f32::from_bits((x as f32 * multiplier).to_bits() | x_sign), + f32::from_bits((y as f32 * multiplier).to_bits() | y_sign), + f32::from_bits((z as f32 * multiplier).to_bits() | z_sign), ) } -/// Calculates 2.0^exp using IEEE bit fiddling. -/// -/// Only works for integer exponents in the range [-126, 127] -/// due to IEEE 32-bit float limits. -#[inline(always)] -fn fiddle_exp2(exp: i32) -> f32 { - use std::f32; - f32::from_bits(((exp + 127) as u32) << 23) -} - -/// Calculates a floor(log2(n)) using IEEE bit fiddling. -/// -/// Because of IEEE floating point format, infinity and NaN -/// floating point values return 128, and subnormal numbers always -/// return -127. These particular behaviors are not, of course, -/// mathemetically correct, but are actually desireable for the -/// calculations in this library. 
-#[inline(always)] -fn fiddle_log2(n: f32) -> i32 { - use std::f32; - ((f32::to_bits(n) >> 23) & 0b1111_1111) as i32 - 127 -} - #[cfg(test)] mod tests { use super::*; @@ -170,6 +153,27 @@ mod tests { assert_eq!(round_trip(fs), fs); } + #[test] + fn signs() { + let fs1 = (1.0f32, 1.0f32, 1.0f32); + let fs2 = (1.0f32, 1.0f32, -1.0f32); + let fs3 = (1.0f32, -1.0f32, 1.0f32); + let fs4 = (1.0f32, -1.0f32, -1.0f32); + let fs5 = (-1.0f32, 1.0f32, 1.0f32); + let fs6 = (-1.0f32, 1.0f32, -1.0f32); + let fs7 = (-1.0f32, -1.0f32, 1.0f32); + let fs8 = (-1.0f32, -1.0f32, -1.0f32); + + assert_eq!(fs1, round_trip(fs1)); + assert_eq!(fs2, round_trip(fs2)); + assert_eq!(fs3, round_trip(fs3)); + assert_eq!(fs4, round_trip(fs4)); + assert_eq!(fs5, round_trip(fs5)); + assert_eq!(fs6, round_trip(fs6)); + assert_eq!(fs7, round_trip(fs7)); + assert_eq!(fs8, round_trip(fs8)); + } + #[test] fn accuracy() { let mut n = 1.0; @@ -182,7 +186,7 @@ mod tests { #[test] fn integers() { - for n in 0..=512 { + for n in -8192i32..=8192i32 { let (x, _, _) = round_trip((n as f32, 0.0, 0.0)); assert_eq!(n as f32, x); } @@ -248,9 +252,9 @@ mod tests { let fs = (MIN_POSITIVE, MIN_POSITIVE * 0.5, MIN_POSITIVE * 0.49); let fsn = (-MIN_POSITIVE, -MIN_POSITIVE * 0.5, -MIN_POSITIVE * 0.49); + assert_eq!(decode(0x600100000), (MIN_POSITIVE, -MIN_POSITIVE, 0.0)); assert_eq!(round_trip(fs), (MIN_POSITIVE, MIN_POSITIVE, 0.0)); assert_eq!(round_trip(fsn), (-MIN_POSITIVE, -MIN_POSITIVE, -0.0)); - assert_eq!(decode(0x600100000), (MIN_POSITIVE, -MIN_POSITIVE, 0.0)); } #[test] @@ -259,4 +263,37 @@ mod tests { assert_eq!(encode(fs), 0x200000000); assert_eq!(round_trip(fs), (0.0, -0.0, 0.0)); } + + #[test] + fn garbage_upper_bits_decode() { + let fs1 = (4.0, -623.53, 12.3); + let fs2 = (-63456254.2, 5235423.53, 54353.3); + let fs3 = (-0.000000634, 0.00000000005, 0.00000000892); + + let n1 = encode(fs1); + let n2 = encode(fs2); + let n3 = encode(fs3); + + assert_eq!(decode(n1), decode(n1 | 0xffff_0000_0000_0000)); + 
assert_eq!(decode(n2), decode(n2 | 0xffff_0000_0000_0000)); + assert_eq!(decode(n3), decode(n3 | 0xffff_0000_0000_0000)); + } + + #[test] + #[should_panic] + fn nans_01() { + encode((std::f32::NAN, 1.0, -1.0)); + } + + #[test] + #[should_panic] + fn nans_02() { + encode((1.0, std::f32::NAN, -1.0)); + } + + #[test] + #[should_panic] + fn nans_03() { + encode((1.0, -1.0, std::f32::NAN)); + } } diff --git a/sub_crates/trifloat/src/unsigned32.rs b/sub_crates/trifloat/src/unsigned32.rs index fad6d45..56baf1f 100644 --- a/sub_crates/trifloat/src/unsigned32.rs +++ b/sub_crates/trifloat/src/unsigned32.rs @@ -1,19 +1,18 @@ -//! Encoding/decoding for a 32-bit shared-exponent representation of three -//! positive floating point numbers. +//! Encoding/decoding for unsigned 32-bit trifloat numbers. //! -//! This is useful for e.g. compactly storing HDR colors. The encoding -//! uses 9 bits of mantissa per number, and 5 bits for the shared -//! exponent. The bit layout is [mantissa 1, mantissa 2, mantissa 3, -//! exponent]. The exponent is stored as an unsigned integer with a -//! bias of 10. +//! The encoding uses 9 bits of mantissa per number, and 5 bits for the shared +//! exponent. The bit layout is [mantissa 1, mantissa 2, mantissa 3, exponent]. +//! The exponent is stored as an unsigned integer with a bias of 10. //! -//! The largest representable number is 2^21 - 4096, and the smallest -//! representable non-zero number is 2^-19. +//! The largest representable number is `2^21 - 4096`, and the smallest +//! representable non-zero number is `2^-19`. //! //! Since the exponent is shared between the three values, the precision //! of all three values depends on the largest of the three. All integers //! up to 512 can be represented exactly in the largest value. +use crate::{fiddle_exp2, fiddle_log2}; + /// Largest representable number. 
pub const MAX: f32 = 2_093_056.0; @@ -23,19 +22,18 @@ pub const MIN: f32 = 0.000_001_907_348_6; /// Difference between 1.0 and the next largest representable number. pub const EPSILON: f32 = 1.0 / 256.0; -#[derive(Debug, Copy, Clone)] -pub struct U9(u32); +const EXP_BIAS: i32 = 10; +const MIN_EXP: i32 = 0 - EXP_BIAS; +const MAX_EXP: i32 = 31 - EXP_BIAS; -/// Encodes three floating point values into the trifloat format. +/// Encodes three floating point values into an unsigned 32-bit trifloat. /// -/// Floats that are larger than the max representable value in trifloat -/// will saturate. Values are converted to trifloat by rounding, so the -/// max error introduced by this function is epsilon / 2. +/// Input floats larger than `MAX` will saturate to `MAX`, including infinity. +/// Values are converted to trifloat precision by rounding. /// /// Warning: negative values and NaN's are _not_ supported by the trifloat /// format. There are debug-only assertions in place to catch such -/// values in the input floats. Infinity is also not supported in the -/// format, but will simply saturate to the largest representable value. +/// values in the input floats. #[inline] pub fn encode(floats: (f32, f32, f32)) -> u32 { debug_assert!( @@ -45,9 +43,9 @@ pub fn encode(floats: (f32, f32, f32)) -> u32 { && !floats.0.is_nan() && !floats.1.is_nan() && !floats.2.is_nan(), - "trifloat::encode(): encoding to tri-floats only works correctly for \ - positive, non-NaN numbers, but the numbers passed were: ({}, \ - {}, {})", + "trifloat::unsigned32::encode(): encoding to unsigned tri-floats only \ + works correctly for positive, non-NaN numbers, but the numbers passed \ + were: ({}, {}, {})", floats.0, floats.1, floats.2 @@ -60,13 +58,13 @@ pub fn encode(floats: (f32, f32, f32)) -> u32 { } // Calculate the exponent and 1.0/multiplier for encoding the values. 
- let mut exponent = (fiddle_log2(largest_value) + 1).max(-10).min(21); + let mut exponent = (fiddle_log2(largest_value) + 1).max(MIN_EXP).min(MAX_EXP); let mut inv_multiplier = fiddle_exp2(-exponent + 9); // Edge-case: make sure rounding pushes the largest value up // appropriately if needed. if (largest_value * inv_multiplier) + 0.5 >= 512.0 { - exponent = (exponent + 1).min(21); + exponent = (exponent + 1).min(MAX_EXP); inv_multiplier = fiddle_exp2(-exponent + 9); } @@ -74,13 +72,13 @@ pub fn encode(floats: (f32, f32, f32)) -> u32 { let x = (floats.0 * inv_multiplier + 0.5).min(511.0) as u32 & 0b1_1111_1111; let y = (floats.1 * inv_multiplier + 0.5).min(511.0) as u32 & 0b1_1111_1111; let z = (floats.2 * inv_multiplier + 0.5).min(511.0) as u32 & 0b1_1111_1111; - let e = (exponent + 10) as u32 & 0b1_1111; + let e = (exponent + EXP_BIAS) as u32 & 0b1_1111; // Pack values into a u32. (x << (5 + 9 + 9)) | (y << (5 + 9)) | (z << 5) | e } -/// Decodes a trifloat into three full floating point numbers. +/// Decodes an unsigned 32-bit trifloat into three full floating point numbers. /// /// This operation is lossless and cannot fail. #[inline] @@ -91,7 +89,7 @@ pub fn decode(trifloat: u32) -> (f32, f32, f32) { let z = (trifloat >> 5) & 0b1_1111_1111; let e = trifloat & 0b1_1111; - let multiplier = fiddle_exp2(e as i32 - 10 - 9); + let multiplier = fiddle_exp2(e as i32 - EXP_BIAS - 9); ( x as f32 * multiplier, @@ -100,29 +98,6 @@ pub fn decode(trifloat: u32) -> (f32, f32, f32) { ) } -/// Calculates 2.0^exp using IEEE bit fiddling. -/// -/// Only works for integer exponents in the range [-126, 127] -/// due to IEEE 32-bit float limits. -#[inline(always)] -fn fiddle_exp2(exp: i32) -> f32 { - use std::f32; - f32::from_bits(((exp + 127) as u32) << 23) -} - -/// Calculates a floor(log2(n)) using IEEE bit fiddling. -/// -/// Because of IEEE floating point format, infinity and NaN -/// floating point values return 128, and subnormal numbers always -/// return -127. 
These particular behaviors are not, of course, -/// mathemetically correct, but are actually desireable for the -/// calculations in this library. -#[inline(always)] -fn fiddle_log2(n: f32) -> i32 { - use std::f32; - ((f32::to_bits(n) >> 23) & 0b1111_1111) as i32 - 127 -} - #[cfg(test)] mod tests { use super::*; @@ -216,4 +191,45 @@ mod tests { assert_eq!(encode(fs), 0); assert_eq!(round_trip(fs), (0.0, 0.0, 0.0)); } + + #[test] + #[should_panic] + fn nans_01() { + encode((std::f32::NAN, 0.0, 0.0)); + } + + #[test] + #[should_panic] + fn nans_02() { + encode((0.0, std::f32::NAN, 0.0)); + } + + #[test] + #[should_panic] + fn nans_03() { + encode((0.0, 0.0, std::f32::NAN)); + } + + #[test] + #[should_panic] + fn negative_01() { + encode((-1.0, 0.0, 0.0)); + } + + #[test] + #[should_panic] + fn negative_02() { + encode((0.0, -1.0, 0.0)); + } + + #[test] + #[should_panic] + fn negative_03() { + encode((0.0, 0.0, -1.0)); + } + + #[test] + fn negative_04() { + encode((-0.0, -0.0, -0.0)); + } }