From e31ec6eb4ef39a35d477a546ceaa7d76709f80b0 Mon Sep 17 00:00:00 2001 From: Nathan Vegdahl Date: Sun, 7 Jul 2019 14:02:09 +0900 Subject: [PATCH] Added a new trifloat type that uses 48 bits and is signed. --- sub_crates/trifloat/benches/bench.rs | 40 +++- sub_crates/trifloat/src/lib.rs | 220 +-------------------- sub_crates/trifloat/src/signed48.rs | 262 ++++++++++++++++++++++++++ sub_crates/trifloat/src/unsigned32.rs | 219 +++++++++++++++++++++ 4 files changed, 519 insertions(+), 222 deletions(-) create mode 100644 sub_crates/trifloat/src/signed48.rs create mode 100644 sub_crates/trifloat/src/unsigned32.rs diff --git a/sub_crates/trifloat/benches/bench.rs b/sub_crates/trifloat/benches/bench.rs index e9ca5b5..727661a 100644 --- a/sub_crates/trifloat/benches/bench.rs +++ b/sub_crates/trifloat/benches/bench.rs @@ -1,32 +1,60 @@ use bencher::{benchmark_group, benchmark_main, black_box, Bencher}; use rand::{rngs::SmallRng, FromEntropy, Rng}; -use trifloat::{decode, encode}; +use trifloat::{signed48, unsigned32}; //---- -fn encode_100_values(bench: &mut Bencher) { +fn unsigned32_encode_100_values(bench: &mut Bencher) { let mut rng = SmallRng::from_entropy(); bench.iter(|| { let x = rng.gen::() - 0.5; let y = rng.gen::() - 0.5; let z = rng.gen::() - 0.5; for _ in 0..100 { - black_box(encode(black_box((x, y, z)))); + black_box(unsigned32::encode(black_box((x, y, z)))); } }); } -fn decode_100_values(bench: &mut Bencher) { +fn unsigned32_decode_100_values(bench: &mut Bencher) { let mut rng = SmallRng::from_entropy(); bench.iter(|| { let v = rng.gen::(); for _ in 0..100 { - black_box(decode(black_box(v))); + black_box(unsigned32::decode(black_box(v))); + } + }); +} + +fn signed48_encode_100_values(bench: &mut Bencher) { + let mut rng = SmallRng::from_entropy(); + bench.iter(|| { + let x = rng.gen::() - 0.5; + let y = rng.gen::() - 0.5; + let z = rng.gen::() - 0.5; + for _ in 0..100 { + black_box(signed48::encode(black_box((x, y, z)))); + } + }); +} + +fn signed48_decode_100_values(bench: &mut Bencher) { + let mut rng = SmallRng::from_entropy(); + bench.iter(|| { + let v = rng.gen::() & 0x0000_FFFF_FFFF_FFFF; + for _ in 0..100 { + black_box(signed48::decode(black_box(v))); } }); } //---- -benchmark_group!(benches, encode_100_values, decode_100_values,); +benchmark_group!( + benches, + unsigned32_encode_100_values, + unsigned32_decode_100_values, + signed48_encode_100_values, + signed48_decode_100_values, +); benchmark_main!(benches); diff --git a/sub_crates/trifloat/src/lib.rs b/sub_crates/trifloat/src/lib.rs index 5178ba6..2fb4556 100644 --- a/sub_crates/trifloat/src/lib.rs +++ b/sub_crates/trifloat/src/lib.rs @@ -1,216 +1,4 @@ -//! Encoding/decoding for a 32-bit shared-exponent representation of three -//! positive floating point numbers. -//! -//! This is useful for e.g. compactly storing HDR colors. The encoding -//! uses 9 bits of mantissa per number, and 5 bits for the shared -//! exponent. The bit layout is [mantissa 1, mantissa 2, mantissa 3, -//! exponent]. The exponent is stored as an unsigned integer with a -//! bias of 10. -//! -//! The largest representable number is 2^21 - 4096, and the smallest -//! representable non-zero number is 2^-19. -//! -//! Since the exponent is shared between the three values, the precision -//! of all three values depends on the largest of the three. All integers -//! up to 512 can be represented exactly in the largest value. - -/// Largest representable number. -pub const MAX: f32 = 2_093_056.0; - -/// Smallest representable non-zero number. -pub const MIN: f32 = 0.000_001_907_348_6; - -/// Difference between 1.0 and the next largest representable number. -pub const EPSILON: f32 = 1.0 / 256.0; - -/// Encodes three floating point values into the trifloat format. -/// -/// Floats that are larger than the max representable value in trifloat -/// will saturate. Values are converted to trifloat by rounding, so the -/// max error introduced by this function is epsilon / 2. -/// -/// Warning: negative values and NaN's are _not_ supported by the trifloat -/// format. There are debug-only assertions in place to catch such -/// values in the input floats. Infinity is also not supported in the -/// format, but will simply saturate to the largest representable value. -#[inline] -pub fn encode(floats: (f32, f32, f32)) -> u32 { - debug_assert!( - floats.0 >= 0.0 - && floats.1 >= 0.0 - && floats.2 >= 0.0 - && !floats.0.is_nan() - && !floats.1.is_nan() - && !floats.2.is_nan(), - "trifloat::encode(): encoding to tri-floats only works correctly for \ - positive, non-NaN numbers, but the numbers passed were: ({}, \ - {}, {})", - floats.0, - floats.1, - floats.2 - ); - - // Find the largest of the three values. - let largest_value = floats.0.max(floats.1.max(floats.2)); - if largest_value <= 0.0 { - return 0; - } - - // Calculate the exponent and 1.0/multiplier for encoding the values. - let mut exponent = (fiddle_log2(largest_value) + 1).max(-10).min(21); - let mut inv_multiplier = fiddle_exp2(-exponent + 9); - - // Edge-case: make sure rounding pushes the largest value up - // appropriately if needed. - if (largest_value * inv_multiplier) + 0.5 >= 512.0 { - exponent = (exponent + 1).min(21); - inv_multiplier = fiddle_exp2(-exponent + 9); - } - - // Quantize and encode values. - let x = (floats.0 * inv_multiplier + 0.5).min(511.0) as u32 & 0b1_1111_1111; - let y = (floats.1 * inv_multiplier + 0.5).min(511.0) as u32 & 0b1_1111_1111; - let z = (floats.2 * inv_multiplier + 0.5).min(511.0) as u32 & 0b1_1111_1111; - let e = (exponent + 10) as u32 & 0b1_1111; - - // Pack values into a u32. - (x << (5 + 9 + 9)) | (y << (5 + 9)) | (z << 5) | e -} - -/// Decodes a trifloat into three full floating point numbers. -/// -/// This operation is lossless and cannot fail. -#[inline] -pub fn decode(trifloat: u32) -> (f32, f32, f32) { - // Unpack values. - let x = trifloat >> (5 + 9 + 9); - let y = (trifloat >> (5 + 9)) & 0b1_1111_1111; - let z = (trifloat >> 5) & 0b1_1111_1111; - let e = trifloat & 0b1_1111; - - let multiplier = fiddle_exp2(e as i32 - 10 - 9); - - ( - x as f32 * multiplier, - y as f32 * multiplier, - z as f32 * multiplier, - ) -} - -/// Calculates 2.0^exp using IEEE bit fiddling. -/// -/// Only works for integer exponents in the range [-126, 127] -/// due to IEEE 32-bit float limits. -#[inline(always)] -fn fiddle_exp2(exp: i32) -> f32 { - use std::f32; - f32::from_bits(((exp + 127) as u32) << 23) -} - -/// Calculates a floor(log2(n)) using IEEE bit fiddling. -/// -/// Because of IEEE floating point format, infinity and NaN -/// floating point values return 128, and subnormal numbers always -/// return -127. These particular behaviors are not, of course, -/// mathemetically correct, but are actually desireable for the -/// calculations in this library. -#[inline(always)] -fn fiddle_log2(n: f32) -> i32 { - use std::f32; - ((f32::to_bits(n) >> 23) & 0b1111_1111) as i32 - 127 -} - -#[cfg(test)] -mod tests { - use super::*; - - fn round_trip(floats: (f32, f32, f32)) -> (f32, f32, f32) { - decode(encode(floats)) - } - - #[test] - fn all_zeros() { - let fs = (0.0f32, 0.0f32, 0.0f32); - - let tri = encode(fs); - let fs2 = decode(tri); - - assert_eq!(tri, 0u32); - assert_eq!(fs, fs2); - } - - #[test] - fn powers_of_two() { - let fs = (8.0f32, 128.0f32, 0.5f32); - assert_eq!(round_trip(fs), fs); - } - - #[test] - fn accuracy() { - let mut n = 1.0; - for _ in 0..256 { - let (x, _, _) = round_trip((n, 0.0, 0.0)); - assert_eq!(n, x); - n += 1.0 / 256.0; - } - } - - #[test] - fn integers() { - for n in 0..=512 { - let (x, _, _) = round_trip((n as f32, 0.0, 0.0)); - assert_eq!(n as f32, x); - } - } - - #[test] - fn rounding() { - let fs = (7.0f32, 513.0f32, 1.0f32); - assert_eq!(round_trip(fs), (8.0, 514.0, 2.0)); - } - - #[test] - fn rounding_edge_case() { - let fs = (1023.0f32, 0.0f32, 0.0f32); - - assert_eq!(round_trip(fs), (1024.0, 0.0, 0.0),); - } - - #[test] - fn saturate() { - let fs = (9999999999.0, 9999999999.0, 9999999999.0); - - assert_eq!(round_trip(fs), (MAX, MAX, MAX)); - assert_eq!(decode(0xFFFFFFFF), (MAX, MAX, MAX),); - } - - #[test] - fn inf_saturate() { - use std::f32::INFINITY; - let fs = (INFINITY, 0.0, 0.0); - - assert_eq!(round_trip(fs), (MAX, 0.0, 0.0)); - assert_eq!(encode(fs), 0xFF80001F,); - } - - #[test] - fn partial_saturate() { - let fs = (9999999999.0, 4096.0, 262144.0); - - assert_eq!(round_trip(fs), (MAX, 4096.0, 262144.0)); - } - - #[test] - fn smallest_value() { - let fs = (MIN, MIN * 0.5, MIN * 0.49); - assert_eq!(round_trip(fs), (MIN, MIN, 0.0)); - assert_eq!(decode(0x00_80_40_00), (MIN, MIN, 0.0)); - } - - #[test] - fn underflow() { - let fs = (MIN * 0.49, 0.0, 0.0); - assert_eq!(encode(fs), 0); - assert_eq!(round_trip(fs), (0.0, 0.0, 0.0)); - } -} +pub mod signed48; +/// This crate provides types and functions for storing triplets of floating +/// point values in a shared-exponent format. +pub mod unsigned32; diff --git a/sub_crates/trifloat/src/signed48.rs b/sub_crates/trifloat/src/signed48.rs new file mode 100644 index 0000000..7cc9f21 --- /dev/null +++ b/sub_crates/trifloat/src/signed48.rs @@ -0,0 +1,262 @@ +//! Encoding/decoding for a 48-bit shared-exponent representation of three +//! signed floating point numbers. +//! +//! This is useful for e.g. compactly storing HDR colors. The encoding +//! uses 14 bits of mantissa per number (including the sign bit for each) and 6 +//! bits for the shared exponent. The bit layout is [mantissa 1, mantissa 2, +//! mantissa 3, exponent]. The exponent is stored as an unsigned integer with +//! a bias of 32. The mantissas are stored as a single leading sign bit and 13 +//! bits of unsigned integer. +//! +//! The largest representable number is ?, and the smallest +//! representable positive number is ?. +//! +//! Since the exponent is shared between the three values, the precision +//! of all three values depends on the largest (in absolute value) of the +//! three. All integers in the range [-8191, 8191] can be represented exactly +//! in the largest value. + +/// Largest representable number. +pub const MAX: f32 = 35_180_077_121_536.0; + +/// Smallest representable non-zero number. +pub const MIN_POSITIVE: f32 = 0.000_000_000_465_661_287; + +pub const MIN: f32 = -35_180_077_121_536.0; + +/// Difference between 1.0 and the next largest representable number. +pub const EPSILON: f32 = 1.0 / 4096.0; + +const EXP_BIAS: i32 = 31 - 13; +const MIN_EXP: i32 = 0 - EXP_BIAS; +const MAX_EXP: i32 = 63 - EXP_BIAS; + +/// Encodes three floating point values into a 48-bit trifloat format. +/// +/// Note that even though the return value is a u64, only the lower 48 +/// bits are used. +/// +/// Floats that are larger than the max representable value in trifloat +/// will saturate. Values are converted to trifloat by rounding, so the +/// max error introduced by this function is epsilon / 2. +/// +/// Warning: NaN's are _not_ supported by the trifloat +/// format. There are debug-only assertions in place to catch such +/// values in the input floats. Infinity is also not supported in the +/// format, but will simply saturate to the largest-magnitude representable +/// value. +#[inline] +pub fn encode(floats: (f32, f32, f32)) -> u64 { + debug_assert!( + !floats.0.is_nan() && !floats.1.is_nan() && !floats.2.is_nan(), + "trifloat::s48::encode(): encoding to signed 48-bit tri-floats only works correctly for \ + non-NaN numbers, but the numbers passed were: ({}, \ + {}, {})", + floats.0, + floats.1, + floats.2 + ); + + // Find the largest (in magnitude) of the three values. + let largest_value = { + let mut largest_value: f32 = 0.0; + if floats.0.abs() > largest_value.abs() { + largest_value = floats.0; + } + if floats.1.abs() > largest_value.abs() { + largest_value = floats.1; + } + if floats.2.abs() > largest_value.abs() { + largest_value = floats.2; + } + largest_value + }; + + // Calculate the exponent and 1.0/multiplier for encoding the values. + let (exponent, inv_multiplier) = { + let mut exponent = (fiddle_log2(largest_value) + 1).max(MIN_EXP).min(MAX_EXP); + let mut inv_multiplier = fiddle_exp2(-exponent + 13); + + // Edge-case: make sure rounding pushes the largest value up + // appropriately if needed. + if (largest_value * inv_multiplier).abs() + 0.5 >= 8191.0 { + exponent = (exponent + 1).min(MAX_EXP); + inv_multiplier = fiddle_exp2(-exponent + 13); + } + (exponent, inv_multiplier) + }; + + // Quantize and encode values. + let x = (floats.0.abs() * inv_multiplier + 0.5).min(8191.0) as u64 & 0b111_11111_11111; + let x_sign = (floats.0.to_bits() >> 31) as u64; + let y = (floats.1.abs() * inv_multiplier + 0.5).min(8191.0) as u64 & 0b111_11111_11111; + let y_sign = (floats.1.to_bits() >> 31) as u64; + let z = (floats.2.abs() * inv_multiplier + 0.5).min(8191.0) as u64 & 0b111_11111_11111; + let z_sign = (floats.2.to_bits() >> 31) as u64; + let e = (exponent + EXP_BIAS) as u64 & 0b111_111; + + // Pack values into a single u64 and return. + (x_sign << 47) | (x << 34) | (y_sign << 33) | (y << 20) | (z_sign << 19) | (z << 6) | e +} + +/// Decodes a 48-bit trifloat into three full floating point numbers. +/// +/// This operation is lossless and cannot fail. +#[inline] +pub fn decode(trifloat: u64) -> (f32, f32, f32) { + // Unpack values. + let x_sign = (trifloat >> 47) as u32; + let x = (trifloat >> 34) & 0b111_11111_11111; + let y_sign = ((trifloat >> 33) & 1) as u32; + let y = (trifloat >> 20) & 0b111_11111_11111; + let z_sign = ((trifloat >> 19) & 1) as u32; + let z = (trifloat >> 6) & 0b111_11111_11111; + let e = trifloat & 0b111_111; + + let multiplier = fiddle_exp2(e as i32 - EXP_BIAS - 13); + + ( + f32::from_bits((x as f32 * multiplier).to_bits() | (x_sign << 31)), + f32::from_bits((y as f32 * multiplier).to_bits() | (y_sign << 31)), + f32::from_bits((z as f32 * multiplier).to_bits() | (z_sign << 31)), + ) +} + +/// Calculates 2.0^exp using IEEE bit fiddling. +/// +/// Only works for integer exponents in the range [-126, 127] +/// due to IEEE 32-bit float limits. +#[inline(always)] +fn fiddle_exp2(exp: i32) -> f32 { + use std::f32; + f32::from_bits(((exp + 127) as u32) << 23) +} + +/// Calculates a floor(log2(n)) using IEEE bit fiddling. +/// +/// Because of IEEE floating point format, infinity and NaN +/// floating point values return 128, and subnormal numbers always +/// return -127. These particular behaviors are not, of course, +/// mathemetically correct, but are actually desireable for the +/// calculations in this library. +#[inline(always)] +fn fiddle_log2(n: f32) -> i32 { + use std::f32; + ((f32::to_bits(n) >> 23) & 0b1111_1111) as i32 - 127 +} + +#[cfg(test)] +mod tests { + use super::*; + + fn round_trip(floats: (f32, f32, f32)) -> (f32, f32, f32) { + decode(encode(floats)) + } + + #[test] + fn all_zeros() { + let fs = (0.0f32, 0.0f32, 0.0f32); + + let tri = encode(fs); + let fs2 = decode(tri); + + assert_eq!(tri, 0); + assert_eq!(fs, fs2); + } + + #[test] + fn powers_of_two() { + let fs = (8.0f32, 128.0f32, 0.5f32); + assert_eq!(round_trip(fs), fs); + } + + #[test] + fn accuracy() { + let mut n = 1.0; + for _ in 0..256 { + let (x, _, _) = round_trip((n, 0.0, 0.0)); + assert_eq!(n, x); + n += 1.0 / 256.0; + } + } + + #[test] + fn integers() { + for n in 0..=512 { + let (x, _, _) = round_trip((n as f32, 0.0, 0.0)); + assert_eq!(n as f32, x); + } + } + + #[test] + fn rounding() { + let fs = (7.0f32, 8193.0f32, -1.0f32); + let fsn = (-7.0f32, -8193.0f32, 1.0f32); + assert_eq!(round_trip(fs), (8.0, 8194.0, -2.0)); + assert_eq!(round_trip(fsn), (-8.0, -8194.0, 2.0)); + } + + #[test] + fn rounding_edge_case() { + let fs = (16383.0f32, 0.0f32, 0.0f32); + + assert_eq!(round_trip(fs), (16384.0, 0.0, 0.0),); + } + + #[test] + fn saturate() { + let fs = ( + 99_999_999_999_999.0, + 99_999_999_999_999.0, + 99_999_999_999_999.0, + ); + let fsn = ( + -99_999_999_999_999.0, + -99_999_999_999_999.0, + -99_999_999_999_999.0, + ); + + assert_eq!(round_trip(fs), (MAX, MAX, MAX)); + assert_eq!(round_trip(fsn), (MIN, MIN, MIN)); + assert_eq!(decode(0x7FFD_FFF7_FFFF), (MAX, MAX, MAX)); + assert_eq!(decode(0xFFFF_FFFF_FFFF), (MIN, MIN, MIN)); + } + + #[test] + fn inf_saturate() { + use std::f32::INFINITY; + let fs = (INFINITY, 0.0, 0.0); + let fsn = (-INFINITY, 0.0, 0.0); + + assert_eq!(round_trip(fs), (MAX, 0.0, 0.0)); + assert_eq!(round_trip(fsn), (MIN, 0.0, 0.0)); + assert_eq!(encode(fs), 0x7FFC0000003F); + assert_eq!(encode(fsn), 0xFFFC0000003F); + } + + #[test] + fn partial_saturate() { + let fs = (99_999_999_999_999.0, 4294967296.0, -17179869184.0); + let fsn = (-99_999_999_999_999.0, 4294967296.0, -17179869184.0); + + assert_eq!(round_trip(fs), (MAX, 4294967296.0, -17179869184.0)); + assert_eq!(round_trip(fsn), (MIN, 4294967296.0, -17179869184.0)); + } + + #[test] + fn smallest_value() { + let fs = (MIN_POSITIVE, MIN_POSITIVE * 0.5, MIN_POSITIVE * 0.49); + let fsn = (-MIN_POSITIVE, -MIN_POSITIVE * 0.5, -MIN_POSITIVE * 0.49); + + assert_eq!(round_trip(fs), (MIN_POSITIVE, MIN_POSITIVE, 0.0)); + assert_eq!(round_trip(fsn), (-MIN_POSITIVE, -MIN_POSITIVE, -0.0)); + assert_eq!(decode(0x600100000), (MIN_POSITIVE, -MIN_POSITIVE, 0.0)); + } + + #[test] + fn underflow() { + let fs = (MIN_POSITIVE * 0.49, -MIN_POSITIVE * 0.49, 0.0); + assert_eq!(encode(fs), 0x200000000); + assert_eq!(round_trip(fs), (0.0, -0.0, 0.0)); + } +} diff --git a/sub_crates/trifloat/src/unsigned32.rs b/sub_crates/trifloat/src/unsigned32.rs new file mode 100644 index 0000000..fad6d45 --- /dev/null +++ b/sub_crates/trifloat/src/unsigned32.rs @@ -0,0 +1,219 @@ +//! Encoding/decoding for a 32-bit shared-exponent representation of three +//! positive floating point numbers. +//! +//! This is useful for e.g. compactly storing HDR colors. The encoding +//! uses 9 bits of mantissa per number, and 5 bits for the shared +//! exponent. The bit layout is [mantissa 1, mantissa 2, mantissa 3, +//! exponent]. The exponent is stored as an unsigned integer with a +//! bias of 10. +//! +//! The largest representable number is 2^21 - 4096, and the smallest +//! representable non-zero number is 2^-19. +//! +//! Since the exponent is shared between the three values, the precision +//! of all three values depends on the largest of the three. All integers +//! up to 512 can be represented exactly in the largest value. + +/// Largest representable number. +pub const MAX: f32 = 2_093_056.0; + +/// Smallest representable non-zero number. +pub const MIN: f32 = 0.000_001_907_348_6; + +/// Difference between 1.0 and the next largest representable number. +pub const EPSILON: f32 = 1.0 / 256.0; + +#[derive(Debug, Copy, Clone)] +pub struct U9(u32); + +/// Encodes three floating point values into the trifloat format. +/// +/// Floats that are larger than the max representable value in trifloat +/// will saturate. Values are converted to trifloat by rounding, so the +/// max error introduced by this function is epsilon / 2. +/// +/// Warning: negative values and NaN's are _not_ supported by the trifloat +/// format. There are debug-only assertions in place to catch such +/// values in the input floats. Infinity is also not supported in the +/// format, but will simply saturate to the largest representable value. +#[inline] +pub fn encode(floats: (f32, f32, f32)) -> u32 { + debug_assert!( + floats.0 >= 0.0 + && floats.1 >= 0.0 + && floats.2 >= 0.0 + && !floats.0.is_nan() + && !floats.1.is_nan() + && !floats.2.is_nan(), + "trifloat::encode(): encoding to tri-floats only works correctly for \ + positive, non-NaN numbers, but the numbers passed were: ({}, \ + {}, {})", + floats.0, + floats.1, + floats.2 + ); + + // Find the largest of the three values. + let largest_value = floats.0.max(floats.1.max(floats.2)); + if largest_value <= 0.0 { + return 0; + } + + // Calculate the exponent and 1.0/multiplier for encoding the values. + let mut exponent = (fiddle_log2(largest_value) + 1).max(-10).min(21); + let mut inv_multiplier = fiddle_exp2(-exponent + 9); + + // Edge-case: make sure rounding pushes the largest value up + // appropriately if needed. + if (largest_value * inv_multiplier) + 0.5 >= 512.0 { + exponent = (exponent + 1).min(21); + inv_multiplier = fiddle_exp2(-exponent + 9); + } + + // Quantize and encode values. + let x = (floats.0 * inv_multiplier + 0.5).min(511.0) as u32 & 0b1_1111_1111; + let y = (floats.1 * inv_multiplier + 0.5).min(511.0) as u32 & 0b1_1111_1111; + let z = (floats.2 * inv_multiplier + 0.5).min(511.0) as u32 & 0b1_1111_1111; + let e = (exponent + 10) as u32 & 0b1_1111; + + // Pack values into a u32. + (x << (5 + 9 + 9)) | (y << (5 + 9)) | (z << 5) | e +} + +/// Decodes a trifloat into three full floating point numbers. +/// +/// This operation is lossless and cannot fail. +#[inline] +pub fn decode(trifloat: u32) -> (f32, f32, f32) { + // Unpack values. + let x = trifloat >> (5 + 9 + 9); + let y = (trifloat >> (5 + 9)) & 0b1_1111_1111; + let z = (trifloat >> 5) & 0b1_1111_1111; + let e = trifloat & 0b1_1111; + + let multiplier = fiddle_exp2(e as i32 - 10 - 9); + + ( + x as f32 * multiplier, + y as f32 * multiplier, + z as f32 * multiplier, + ) +} + +/// Calculates 2.0^exp using IEEE bit fiddling. +/// +/// Only works for integer exponents in the range [-126, 127] +/// due to IEEE 32-bit float limits. +#[inline(always)] +fn fiddle_exp2(exp: i32) -> f32 { + use std::f32; + f32::from_bits(((exp + 127) as u32) << 23) +} + +/// Calculates a floor(log2(n)) using IEEE bit fiddling. +/// +/// Because of IEEE floating point format, infinity and NaN +/// floating point values return 128, and subnormal numbers always +/// return -127. These particular behaviors are not, of course, +/// mathemetically correct, but are actually desireable for the +/// calculations in this library. +#[inline(always)] +fn fiddle_log2(n: f32) -> i32 { + use std::f32; + ((f32::to_bits(n) >> 23) & 0b1111_1111) as i32 - 127 +} + +#[cfg(test)] +mod tests { + use super::*; + + fn round_trip(floats: (f32, f32, f32)) -> (f32, f32, f32) { + decode(encode(floats)) + } + + #[test] + fn all_zeros() { + let fs = (0.0f32, 0.0f32, 0.0f32); + + let tri = encode(fs); + let fs2 = decode(tri); + + assert_eq!(tri, 0u32); + assert_eq!(fs, fs2); + } + + #[test] + fn powers_of_two() { + let fs = (8.0f32, 128.0f32, 0.5f32); + assert_eq!(round_trip(fs), fs); + } + + #[test] + fn accuracy() { + let mut n = 1.0; + for _ in 0..256 { + let (x, _, _) = round_trip((n, 0.0, 0.0)); + assert_eq!(n, x); + n += 1.0 / 256.0; + } + } + + #[test] + fn integers() { + for n in 0..=512 { + let (x, _, _) = round_trip((n as f32, 0.0, 0.0)); + assert_eq!(n as f32, x); + } + } + + #[test] + fn rounding() { + let fs = (7.0f32, 513.0f32, 1.0f32); + assert_eq!(round_trip(fs), (8.0, 514.0, 2.0)); + } + + #[test] + fn rounding_edge_case() { + let fs = (1023.0f32, 0.0f32, 0.0f32); + + assert_eq!(round_trip(fs), (1024.0, 0.0, 0.0),); + } + + #[test] + fn saturate() { + let fs = (9999999999.0, 9999999999.0, 9999999999.0); + + assert_eq!(round_trip(fs), (MAX, MAX, MAX)); + assert_eq!(decode(0xFFFFFFFF), (MAX, MAX, MAX),); + } + + #[test] + fn inf_saturate() { + use std::f32::INFINITY; + let fs = (INFINITY, 0.0, 0.0); + + assert_eq!(round_trip(fs), (MAX, 0.0, 0.0)); + assert_eq!(encode(fs), 0xFF80001F,); + } + + #[test] + fn partial_saturate() { + let fs = (9999999999.0, 4096.0, 262144.0); + + assert_eq!(round_trip(fs), (MAX, 4096.0, 262144.0)); + } + + #[test] + fn smallest_value() { + let fs = (MIN, MIN * 0.5, MIN * 0.49); + assert_eq!(round_trip(fs), (MIN, MIN, 0.0)); + assert_eq!(decode(0x00_80_40_00), (MIN, MIN, 0.0)); + } + + #[test] + fn underflow() { + let fs = (MIN * 0.49, 0.0, 0.0); + assert_eq!(encode(fs), 0); + assert_eq!(round_trip(fs), (0.0, 0.0, 0.0)); + } +}