Added a new trifloat type that uses 48 bits and is signed.
This commit is contained in:
parent
152d265c82
commit
e31ec6eb4e
|
@ -1,32 +1,60 @@
|
|||
use bencher::{benchmark_group, benchmark_main, black_box, Bencher};
|
||||
use rand::{rngs::SmallRng, FromEntropy, Rng};
|
||||
use trifloat::{decode, encode};
|
||||
use trifloat::{signed48, unsigned32};
|
||||
|
||||
//----
|
||||
|
||||
fn encode_100_values(bench: &mut Bencher) {
|
||||
fn unsigned32_encode_100_values(bench: &mut Bencher) {
|
||||
let mut rng = SmallRng::from_entropy();
|
||||
bench.iter(|| {
|
||||
let x = rng.gen::<f32>() - 0.5;
|
||||
let y = rng.gen::<f32>() - 0.5;
|
||||
let z = rng.gen::<f32>() - 0.5;
|
||||
for _ in 0..100 {
|
||||
black_box(encode(black_box((x, y, z))));
|
||||
black_box(unsigned32::encode(black_box((x, y, z))));
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
fn decode_100_values(bench: &mut Bencher) {
|
||||
fn unsigned32_decode_100_values(bench: &mut Bencher) {
|
||||
let mut rng = SmallRng::from_entropy();
|
||||
bench.iter(|| {
|
||||
let v = rng.gen::<u32>();
|
||||
for _ in 0..100 {
|
||||
black_box(decode(black_box(v)));
|
||||
black_box(unsigned32::decode(black_box(v)));
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
fn signed48_encode_100_values(bench: &mut Bencher) {
|
||||
let mut rng = SmallRng::from_entropy();
|
||||
bench.iter(|| {
|
||||
let x = rng.gen::<f32>() - 0.5;
|
||||
let y = rng.gen::<f32>() - 0.5;
|
||||
let z = rng.gen::<f32>() - 0.5;
|
||||
for _ in 0..100 {
|
||||
black_box(signed48::encode(black_box((x, y, z))));
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
fn signed48_decode_100_values(bench: &mut Bencher) {
|
||||
let mut rng = SmallRng::from_entropy();
|
||||
bench.iter(|| {
|
||||
let v = rng.gen::<u64>() & 0x0000_FFFF_FFFF_FFFF;
|
||||
for _ in 0..100 {
|
||||
black_box(signed48::decode(black_box(v)));
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
//----
|
||||
|
||||
benchmark_group!(benches, encode_100_values, decode_100_values,);
|
||||
benchmark_group!(
|
||||
benches,
|
||||
unsigned32_encode_100_values,
|
||||
unsigned32_decode_100_values,
|
||||
signed48_encode_100_values,
|
||||
signed48_decode_100_values,
|
||||
);
|
||||
benchmark_main!(benches);
|
||||
|
|
|
@ -1,216 +1,4 @@
|
|||
//! Encoding/decoding for a 32-bit shared-exponent representation of three
|
||||
//! positive floating point numbers.
|
||||
//!
|
||||
//! This is useful for e.g. compactly storing HDR colors. The encoding
|
||||
//! uses 9 bits of mantissa per number, and 5 bits for the shared
|
||||
//! exponent. The bit layout is [mantissa 1, mantissa 2, mantissa 3,
|
||||
//! exponent]. The exponent is stored as an unsigned integer with a
|
||||
//! bias of 10.
|
||||
//!
|
||||
//! The largest representable number is 2^21 - 4096, and the smallest
|
||||
//! representable non-zero number is 2^-19.
|
||||
//!
|
||||
//! Since the exponent is shared between the three values, the precision
|
||||
//! of all three values depends on the largest of the three. All integers
|
||||
//! up to 512 can be represented exactly in the largest value.
|
||||
|
||||
/// Largest representable number.
|
||||
pub const MAX: f32 = 2_093_056.0;
|
||||
|
||||
/// Smallest representable non-zero number.
|
||||
pub const MIN: f32 = 0.000_001_907_348_6;
|
||||
|
||||
/// Difference between 1.0 and the next largest representable number.
|
||||
pub const EPSILON: f32 = 1.0 / 256.0;
|
||||
|
||||
/// Encodes three floating point values into the trifloat format.
|
||||
///
|
||||
/// Floats that are larger than the max representable value in trifloat
|
||||
/// will saturate. Values are converted to trifloat by rounding, so the
|
||||
/// max error introduced by this function is epsilon / 2.
|
||||
///
|
||||
/// Warning: negative values and NaN's are _not_ supported by the trifloat
|
||||
/// format. There are debug-only assertions in place to catch such
|
||||
/// values in the input floats. Infinity is also not supported in the
|
||||
/// format, but will simply saturate to the largest representable value.
|
||||
#[inline]
|
||||
pub fn encode(floats: (f32, f32, f32)) -> u32 {
|
||||
debug_assert!(
|
||||
floats.0 >= 0.0
|
||||
&& floats.1 >= 0.0
|
||||
&& floats.2 >= 0.0
|
||||
&& !floats.0.is_nan()
|
||||
&& !floats.1.is_nan()
|
||||
&& !floats.2.is_nan(),
|
||||
"trifloat::encode(): encoding to tri-floats only works correctly for \
|
||||
positive, non-NaN numbers, but the numbers passed were: ({}, \
|
||||
{}, {})",
|
||||
floats.0,
|
||||
floats.1,
|
||||
floats.2
|
||||
);
|
||||
|
||||
// Find the largest of the three values.
|
||||
let largest_value = floats.0.max(floats.1.max(floats.2));
|
||||
if largest_value <= 0.0 {
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Calculate the exponent and 1.0/multiplier for encoding the values.
|
||||
let mut exponent = (fiddle_log2(largest_value) + 1).max(-10).min(21);
|
||||
let mut inv_multiplier = fiddle_exp2(-exponent + 9);
|
||||
|
||||
// Edge-case: make sure rounding pushes the largest value up
|
||||
// appropriately if needed.
|
||||
if (largest_value * inv_multiplier) + 0.5 >= 512.0 {
|
||||
exponent = (exponent + 1).min(21);
|
||||
inv_multiplier = fiddle_exp2(-exponent + 9);
|
||||
}
|
||||
|
||||
// Quantize and encode values.
|
||||
let x = (floats.0 * inv_multiplier + 0.5).min(511.0) as u32 & 0b1_1111_1111;
|
||||
let y = (floats.1 * inv_multiplier + 0.5).min(511.0) as u32 & 0b1_1111_1111;
|
||||
let z = (floats.2 * inv_multiplier + 0.5).min(511.0) as u32 & 0b1_1111_1111;
|
||||
let e = (exponent + 10) as u32 & 0b1_1111;
|
||||
|
||||
// Pack values into a u32.
|
||||
(x << (5 + 9 + 9)) | (y << (5 + 9)) | (z << 5) | e
|
||||
}
|
||||
|
||||
/// Decodes a trifloat into three full floating point numbers.
|
||||
///
|
||||
/// This operation is lossless and cannot fail.
|
||||
#[inline]
|
||||
pub fn decode(trifloat: u32) -> (f32, f32, f32) {
|
||||
// Unpack values.
|
||||
let x = trifloat >> (5 + 9 + 9);
|
||||
let y = (trifloat >> (5 + 9)) & 0b1_1111_1111;
|
||||
let z = (trifloat >> 5) & 0b1_1111_1111;
|
||||
let e = trifloat & 0b1_1111;
|
||||
|
||||
let multiplier = fiddle_exp2(e as i32 - 10 - 9);
|
||||
|
||||
(
|
||||
x as f32 * multiplier,
|
||||
y as f32 * multiplier,
|
||||
z as f32 * multiplier,
|
||||
)
|
||||
}
|
||||
|
||||
/// Calculates 2.0^exp using IEEE bit fiddling.
|
||||
///
|
||||
/// Only works for integer exponents in the range [-126, 127]
|
||||
/// due to IEEE 32-bit float limits.
|
||||
#[inline(always)]
|
||||
fn fiddle_exp2(exp: i32) -> f32 {
|
||||
use std::f32;
|
||||
f32::from_bits(((exp + 127) as u32) << 23)
|
||||
}
|
||||
|
||||
/// Calculates a floor(log2(n)) using IEEE bit fiddling.
|
||||
///
|
||||
/// Because of IEEE floating point format, infinity and NaN
|
||||
/// floating point values return 128, and subnormal numbers always
|
||||
/// return -127. These particular behaviors are not, of course,
|
||||
/// mathemetically correct, but are actually desireable for the
|
||||
/// calculations in this library.
|
||||
#[inline(always)]
|
||||
fn fiddle_log2(n: f32) -> i32 {
|
||||
use std::f32;
|
||||
((f32::to_bits(n) >> 23) & 0b1111_1111) as i32 - 127
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
fn round_trip(floats: (f32, f32, f32)) -> (f32, f32, f32) {
|
||||
decode(encode(floats))
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn all_zeros() {
|
||||
let fs = (0.0f32, 0.0f32, 0.0f32);
|
||||
|
||||
let tri = encode(fs);
|
||||
let fs2 = decode(tri);
|
||||
|
||||
assert_eq!(tri, 0u32);
|
||||
assert_eq!(fs, fs2);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn powers_of_two() {
|
||||
let fs = (8.0f32, 128.0f32, 0.5f32);
|
||||
assert_eq!(round_trip(fs), fs);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn accuracy() {
|
||||
let mut n = 1.0;
|
||||
for _ in 0..256 {
|
||||
let (x, _, _) = round_trip((n, 0.0, 0.0));
|
||||
assert_eq!(n, x);
|
||||
n += 1.0 / 256.0;
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn integers() {
|
||||
for n in 0..=512 {
|
||||
let (x, _, _) = round_trip((n as f32, 0.0, 0.0));
|
||||
assert_eq!(n as f32, x);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn rounding() {
|
||||
let fs = (7.0f32, 513.0f32, 1.0f32);
|
||||
assert_eq!(round_trip(fs), (8.0, 514.0, 2.0));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn rounding_edge_case() {
|
||||
let fs = (1023.0f32, 0.0f32, 0.0f32);
|
||||
|
||||
assert_eq!(round_trip(fs), (1024.0, 0.0, 0.0),);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn saturate() {
|
||||
let fs = (9999999999.0, 9999999999.0, 9999999999.0);
|
||||
|
||||
assert_eq!(round_trip(fs), (MAX, MAX, MAX));
|
||||
assert_eq!(decode(0xFFFFFFFF), (MAX, MAX, MAX),);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn inf_saturate() {
|
||||
use std::f32::INFINITY;
|
||||
let fs = (INFINITY, 0.0, 0.0);
|
||||
|
||||
assert_eq!(round_trip(fs), (MAX, 0.0, 0.0));
|
||||
assert_eq!(encode(fs), 0xFF80001F,);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn partial_saturate() {
|
||||
let fs = (9999999999.0, 4096.0, 262144.0);
|
||||
|
||||
assert_eq!(round_trip(fs), (MAX, 4096.0, 262144.0));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn smallest_value() {
|
||||
let fs = (MIN, MIN * 0.5, MIN * 0.49);
|
||||
assert_eq!(round_trip(fs), (MIN, MIN, 0.0));
|
||||
assert_eq!(decode(0x00_80_40_00), (MIN, MIN, 0.0));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn underflow() {
|
||||
let fs = (MIN * 0.49, 0.0, 0.0);
|
||||
assert_eq!(encode(fs), 0);
|
||||
assert_eq!(round_trip(fs), (0.0, 0.0, 0.0));
|
||||
}
|
||||
}
|
||||
pub mod signed48;
|
||||
/// This crate provides types and functions for storing triplets of floating
|
||||
/// point values in a shared-exponent format.
|
||||
pub mod unsigned32;
|
||||
|
|
262
sub_crates/trifloat/src/signed48.rs
Normal file
262
sub_crates/trifloat/src/signed48.rs
Normal file
|
@ -0,0 +1,262 @@
|
|||
//! Encoding/decoding for a 48-bit shared-exponent representation of three
|
||||
//! signed floating point numbers.
|
||||
//!
|
||||
//! This is useful for e.g. compactly storing HDR colors. The encoding
|
||||
//! uses 14 bits of mantissa per number (including the sign bit for each) and 6
|
||||
//! bits for the shared exponent. The bit layout is [mantissa 1, mantissa 2,
|
||||
//! mantissa 3, exponent]. The exponent is stored as an unsigned integer with
|
||||
//! a bias of 32. The mantissas are stored as a single leading sign bit and 13
|
||||
//! bits of unsigned integer.
|
||||
//!
|
||||
//! The largest representable number is ?, and the smallest
|
||||
//! representable positive number is ?.
|
||||
//!
|
||||
//! Since the exponent is shared between the three values, the precision
|
||||
//! of all three values depends on the largest (in absolute value) of the
|
||||
//! three. All integers in the range [-8191, 8191] can be represented exactly
|
||||
//! in the largest value.
|
||||
|
||||
/// Largest representable number.
|
||||
pub const MAX: f32 = 35_180_077_121_536.0;
|
||||
|
||||
/// Smallest representable non-zero number.
|
||||
pub const MIN_POSITIVE: f32 = 0.000_000_000_465_661_287;
|
||||
|
||||
pub const MIN: f32 = -35_180_077_121_536.0;
|
||||
|
||||
/// Difference between 1.0 and the next largest representable number.
|
||||
pub const EPSILON: f32 = 1.0 / 4096.0;
|
||||
|
||||
const EXP_BIAS: i32 = 31 - 13;
|
||||
const MIN_EXP: i32 = 0 - EXP_BIAS;
|
||||
const MAX_EXP: i32 = 63 - EXP_BIAS;
|
||||
|
||||
/// Encodes three floating point values into a 48-bit trifloat format.
|
||||
///
|
||||
/// Note that even though the return value is a u64, only the lower 48
|
||||
/// bits are used.
|
||||
///
|
||||
/// Floats that are larger than the max representable value in trifloat
|
||||
/// will saturate. Values are converted to trifloat by rounding, so the
|
||||
/// max error introduced by this function is epsilon / 2.
|
||||
///
|
||||
/// Warning: NaN's are _not_ supported by the trifloat
|
||||
/// format. There are debug-only assertions in place to catch such
|
||||
/// values in the input floats. Infinity is also not supported in the
|
||||
/// format, but will simply saturate to the largest-magnitude representable
|
||||
/// value.
|
||||
#[inline]
|
||||
pub fn encode(floats: (f32, f32, f32)) -> u64 {
|
||||
debug_assert!(
|
||||
!floats.0.is_nan() && !floats.1.is_nan() && !floats.2.is_nan(),
|
||||
"trifloat::s48::encode(): encoding to signed 48-bit tri-floats only works correctly for \
|
||||
non-NaN numbers, but the numbers passed were: ({}, \
|
||||
{}, {})",
|
||||
floats.0,
|
||||
floats.1,
|
||||
floats.2
|
||||
);
|
||||
|
||||
// Find the largest (in magnitude) of the three values.
|
||||
let largest_value = {
|
||||
let mut largest_value: f32 = 0.0;
|
||||
if floats.0.abs() > largest_value.abs() {
|
||||
largest_value = floats.0;
|
||||
}
|
||||
if floats.1.abs() > largest_value.abs() {
|
||||
largest_value = floats.1;
|
||||
}
|
||||
if floats.2.abs() > largest_value.abs() {
|
||||
largest_value = floats.2;
|
||||
}
|
||||
largest_value
|
||||
};
|
||||
|
||||
// Calculate the exponent and 1.0/multiplier for encoding the values.
|
||||
let (exponent, inv_multiplier) = {
|
||||
let mut exponent = (fiddle_log2(largest_value) + 1).max(MIN_EXP).min(MAX_EXP);
|
||||
let mut inv_multiplier = fiddle_exp2(-exponent + 13);
|
||||
|
||||
// Edge-case: make sure rounding pushes the largest value up
|
||||
// appropriately if needed.
|
||||
if (largest_value * inv_multiplier).abs() + 0.5 >= 8191.0 {
|
||||
exponent = (exponent + 1).min(MAX_EXP);
|
||||
inv_multiplier = fiddle_exp2(-exponent + 13);
|
||||
}
|
||||
(exponent, inv_multiplier)
|
||||
};
|
||||
|
||||
// Quantize and encode values.
|
||||
let x = (floats.0.abs() * inv_multiplier + 0.5).min(8191.0) as u64 & 0b111_11111_11111;
|
||||
let x_sign = (floats.0.to_bits() >> 31) as u64;
|
||||
let y = (floats.1.abs() * inv_multiplier + 0.5).min(8191.0) as u64 & 0b111_11111_11111;
|
||||
let y_sign = (floats.1.to_bits() >> 31) as u64;
|
||||
let z = (floats.2.abs() * inv_multiplier + 0.5).min(8191.0) as u64 & 0b111_11111_11111;
|
||||
let z_sign = (floats.2.to_bits() >> 31) as u64;
|
||||
let e = (exponent + EXP_BIAS) as u64 & 0b111_111;
|
||||
|
||||
// Pack values into a single u64 and return.
|
||||
(x_sign << 47) | (x << 34) | (y_sign << 33) | (y << 20) | (z_sign << 19) | (z << 6) | e
|
||||
}
|
||||
|
||||
/// Decodes a 48-bit trifloat into three full floating point numbers.
|
||||
///
|
||||
/// This operation is lossless and cannot fail.
|
||||
#[inline]
|
||||
pub fn decode(trifloat: u64) -> (f32, f32, f32) {
|
||||
// Unpack values.
|
||||
let x_sign = (trifloat >> 47) as u32;
|
||||
let x = (trifloat >> 34) & 0b111_11111_11111;
|
||||
let y_sign = ((trifloat >> 33) & 1) as u32;
|
||||
let y = (trifloat >> 20) & 0b111_11111_11111;
|
||||
let z_sign = ((trifloat >> 19) & 1) as u32;
|
||||
let z = (trifloat >> 6) & 0b111_11111_11111;
|
||||
let e = trifloat & 0b111_111;
|
||||
|
||||
let multiplier = fiddle_exp2(e as i32 - EXP_BIAS - 13);
|
||||
|
||||
(
|
||||
f32::from_bits((x as f32 * multiplier).to_bits() | (x_sign << 31)),
|
||||
f32::from_bits((y as f32 * multiplier).to_bits() | (y_sign << 31)),
|
||||
f32::from_bits((z as f32 * multiplier).to_bits() | (z_sign << 31)),
|
||||
)
|
||||
}
|
||||
|
||||
/// Calculates 2.0^exp using IEEE bit fiddling.
|
||||
///
|
||||
/// Only works for integer exponents in the range [-126, 127]
|
||||
/// due to IEEE 32-bit float limits.
|
||||
#[inline(always)]
|
||||
fn fiddle_exp2(exp: i32) -> f32 {
|
||||
use std::f32;
|
||||
f32::from_bits(((exp + 127) as u32) << 23)
|
||||
}
|
||||
|
||||
/// Calculates a floor(log2(n)) using IEEE bit fiddling.
|
||||
///
|
||||
/// Because of IEEE floating point format, infinity and NaN
|
||||
/// floating point values return 128, and subnormal numbers always
|
||||
/// return -127. These particular behaviors are not, of course,
|
||||
/// mathemetically correct, but are actually desireable for the
|
||||
/// calculations in this library.
|
||||
#[inline(always)]
|
||||
fn fiddle_log2(n: f32) -> i32 {
|
||||
use std::f32;
|
||||
((f32::to_bits(n) >> 23) & 0b1111_1111) as i32 - 127
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
fn round_trip(floats: (f32, f32, f32)) -> (f32, f32, f32) {
|
||||
decode(encode(floats))
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn all_zeros() {
|
||||
let fs = (0.0f32, 0.0f32, 0.0f32);
|
||||
|
||||
let tri = encode(fs);
|
||||
let fs2 = decode(tri);
|
||||
|
||||
assert_eq!(tri, 0);
|
||||
assert_eq!(fs, fs2);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn powers_of_two() {
|
||||
let fs = (8.0f32, 128.0f32, 0.5f32);
|
||||
assert_eq!(round_trip(fs), fs);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn accuracy() {
|
||||
let mut n = 1.0;
|
||||
for _ in 0..256 {
|
||||
let (x, _, _) = round_trip((n, 0.0, 0.0));
|
||||
assert_eq!(n, x);
|
||||
n += 1.0 / 256.0;
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn integers() {
|
||||
for n in 0..=512 {
|
||||
let (x, _, _) = round_trip((n as f32, 0.0, 0.0));
|
||||
assert_eq!(n as f32, x);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn rounding() {
|
||||
let fs = (7.0f32, 8193.0f32, -1.0f32);
|
||||
let fsn = (-7.0f32, -8193.0f32, 1.0f32);
|
||||
assert_eq!(round_trip(fs), (8.0, 8194.0, -2.0));
|
||||
assert_eq!(round_trip(fsn), (-8.0, -8194.0, 2.0));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn rounding_edge_case() {
|
||||
let fs = (16383.0f32, 0.0f32, 0.0f32);
|
||||
|
||||
assert_eq!(round_trip(fs), (16384.0, 0.0, 0.0),);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn saturate() {
|
||||
let fs = (
|
||||
99_999_999_999_999.0,
|
||||
99_999_999_999_999.0,
|
||||
99_999_999_999_999.0,
|
||||
);
|
||||
let fsn = (
|
||||
-99_999_999_999_999.0,
|
||||
-99_999_999_999_999.0,
|
||||
-99_999_999_999_999.0,
|
||||
);
|
||||
|
||||
assert_eq!(round_trip(fs), (MAX, MAX, MAX));
|
||||
assert_eq!(round_trip(fsn), (MIN, MIN, MIN));
|
||||
assert_eq!(decode(0x7FFD_FFF7_FFFF), (MAX, MAX, MAX));
|
||||
assert_eq!(decode(0xFFFF_FFFF_FFFF), (MIN, MIN, MIN));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn inf_saturate() {
|
||||
use std::f32::INFINITY;
|
||||
let fs = (INFINITY, 0.0, 0.0);
|
||||
let fsn = (-INFINITY, 0.0, 0.0);
|
||||
|
||||
assert_eq!(round_trip(fs), (MAX, 0.0, 0.0));
|
||||
assert_eq!(round_trip(fsn), (MIN, 0.0, 0.0));
|
||||
assert_eq!(encode(fs), 0x7FFC0000003F);
|
||||
assert_eq!(encode(fsn), 0xFFFC0000003F);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn partial_saturate() {
|
||||
let fs = (99_999_999_999_999.0, 4294967296.0, -17179869184.0);
|
||||
let fsn = (-99_999_999_999_999.0, 4294967296.0, -17179869184.0);
|
||||
|
||||
assert_eq!(round_trip(fs), (MAX, 4294967296.0, -17179869184.0));
|
||||
assert_eq!(round_trip(fsn), (MIN, 4294967296.0, -17179869184.0));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn smallest_value() {
|
||||
let fs = (MIN_POSITIVE, MIN_POSITIVE * 0.5, MIN_POSITIVE * 0.49);
|
||||
let fsn = (-MIN_POSITIVE, -MIN_POSITIVE * 0.5, -MIN_POSITIVE * 0.49);
|
||||
|
||||
assert_eq!(round_trip(fs), (MIN_POSITIVE, MIN_POSITIVE, 0.0));
|
||||
assert_eq!(round_trip(fsn), (-MIN_POSITIVE, -MIN_POSITIVE, -0.0));
|
||||
assert_eq!(decode(0x600100000), (MIN_POSITIVE, -MIN_POSITIVE, 0.0));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn underflow() {
|
||||
let fs = (MIN_POSITIVE * 0.49, -MIN_POSITIVE * 0.49, 0.0);
|
||||
assert_eq!(encode(fs), 0x200000000);
|
||||
assert_eq!(round_trip(fs), (0.0, -0.0, 0.0));
|
||||
}
|
||||
}
|
219
sub_crates/trifloat/src/unsigned32.rs
Normal file
219
sub_crates/trifloat/src/unsigned32.rs
Normal file
|
@ -0,0 +1,219 @@
|
|||
//! Encoding/decoding for a 32-bit shared-exponent representation of three
|
||||
//! positive floating point numbers.
|
||||
//!
|
||||
//! This is useful for e.g. compactly storing HDR colors. The encoding
|
||||
//! uses 9 bits of mantissa per number, and 5 bits for the shared
|
||||
//! exponent. The bit layout is [mantissa 1, mantissa 2, mantissa 3,
|
||||
//! exponent]. The exponent is stored as an unsigned integer with a
|
||||
//! bias of 10.
|
||||
//!
|
||||
//! The largest representable number is 2^21 - 4096, and the smallest
|
||||
//! representable non-zero number is 2^-19.
|
||||
//!
|
||||
//! Since the exponent is shared between the three values, the precision
|
||||
//! of all three values depends on the largest of the three. All integers
|
||||
//! up to 512 can be represented exactly in the largest value.
|
||||
|
||||
/// Largest representable number.
|
||||
pub const MAX: f32 = 2_093_056.0;
|
||||
|
||||
/// Smallest representable non-zero number.
|
||||
pub const MIN: f32 = 0.000_001_907_348_6;
|
||||
|
||||
/// Difference between 1.0 and the next largest representable number.
|
||||
pub const EPSILON: f32 = 1.0 / 256.0;
|
||||
|
||||
#[derive(Debug, Copy, Clone)]
|
||||
pub struct U9(u32);
|
||||
|
||||
/// Encodes three floating point values into the trifloat format.
|
||||
///
|
||||
/// Floats that are larger than the max representable value in trifloat
|
||||
/// will saturate. Values are converted to trifloat by rounding, so the
|
||||
/// max error introduced by this function is epsilon / 2.
|
||||
///
|
||||
/// Warning: negative values and NaN's are _not_ supported by the trifloat
|
||||
/// format. There are debug-only assertions in place to catch such
|
||||
/// values in the input floats. Infinity is also not supported in the
|
||||
/// format, but will simply saturate to the largest representable value.
|
||||
#[inline]
|
||||
pub fn encode(floats: (f32, f32, f32)) -> u32 {
|
||||
debug_assert!(
|
||||
floats.0 >= 0.0
|
||||
&& floats.1 >= 0.0
|
||||
&& floats.2 >= 0.0
|
||||
&& !floats.0.is_nan()
|
||||
&& !floats.1.is_nan()
|
||||
&& !floats.2.is_nan(),
|
||||
"trifloat::encode(): encoding to tri-floats only works correctly for \
|
||||
positive, non-NaN numbers, but the numbers passed were: ({}, \
|
||||
{}, {})",
|
||||
floats.0,
|
||||
floats.1,
|
||||
floats.2
|
||||
);
|
||||
|
||||
// Find the largest of the three values.
|
||||
let largest_value = floats.0.max(floats.1.max(floats.2));
|
||||
if largest_value <= 0.0 {
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Calculate the exponent and 1.0/multiplier for encoding the values.
|
||||
let mut exponent = (fiddle_log2(largest_value) + 1).max(-10).min(21);
|
||||
let mut inv_multiplier = fiddle_exp2(-exponent + 9);
|
||||
|
||||
// Edge-case: make sure rounding pushes the largest value up
|
||||
// appropriately if needed.
|
||||
if (largest_value * inv_multiplier) + 0.5 >= 512.0 {
|
||||
exponent = (exponent + 1).min(21);
|
||||
inv_multiplier = fiddle_exp2(-exponent + 9);
|
||||
}
|
||||
|
||||
// Quantize and encode values.
|
||||
let x = (floats.0 * inv_multiplier + 0.5).min(511.0) as u32 & 0b1_1111_1111;
|
||||
let y = (floats.1 * inv_multiplier + 0.5).min(511.0) as u32 & 0b1_1111_1111;
|
||||
let z = (floats.2 * inv_multiplier + 0.5).min(511.0) as u32 & 0b1_1111_1111;
|
||||
let e = (exponent + 10) as u32 & 0b1_1111;
|
||||
|
||||
// Pack values into a u32.
|
||||
(x << (5 + 9 + 9)) | (y << (5 + 9)) | (z << 5) | e
|
||||
}
|
||||
|
||||
/// Decodes a trifloat into three full floating point numbers.
|
||||
///
|
||||
/// This operation is lossless and cannot fail.
|
||||
#[inline]
|
||||
pub fn decode(trifloat: u32) -> (f32, f32, f32) {
|
||||
// Unpack values.
|
||||
let x = trifloat >> (5 + 9 + 9);
|
||||
let y = (trifloat >> (5 + 9)) & 0b1_1111_1111;
|
||||
let z = (trifloat >> 5) & 0b1_1111_1111;
|
||||
let e = trifloat & 0b1_1111;
|
||||
|
||||
let multiplier = fiddle_exp2(e as i32 - 10 - 9);
|
||||
|
||||
(
|
||||
x as f32 * multiplier,
|
||||
y as f32 * multiplier,
|
||||
z as f32 * multiplier,
|
||||
)
|
||||
}
|
||||
|
||||
/// Calculates 2.0^exp using IEEE bit fiddling.
|
||||
///
|
||||
/// Only works for integer exponents in the range [-126, 127]
|
||||
/// due to IEEE 32-bit float limits.
|
||||
#[inline(always)]
|
||||
fn fiddle_exp2(exp: i32) -> f32 {
|
||||
use std::f32;
|
||||
f32::from_bits(((exp + 127) as u32) << 23)
|
||||
}
|
||||
|
||||
/// Calculates a floor(log2(n)) using IEEE bit fiddling.
|
||||
///
|
||||
/// Because of IEEE floating point format, infinity and NaN
|
||||
/// floating point values return 128, and subnormal numbers always
|
||||
/// return -127. These particular behaviors are not, of course,
|
||||
/// mathemetically correct, but are actually desireable for the
|
||||
/// calculations in this library.
|
||||
#[inline(always)]
|
||||
fn fiddle_log2(n: f32) -> i32 {
|
||||
use std::f32;
|
||||
((f32::to_bits(n) >> 23) & 0b1111_1111) as i32 - 127
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
fn round_trip(floats: (f32, f32, f32)) -> (f32, f32, f32) {
|
||||
decode(encode(floats))
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn all_zeros() {
|
||||
let fs = (0.0f32, 0.0f32, 0.0f32);
|
||||
|
||||
let tri = encode(fs);
|
||||
let fs2 = decode(tri);
|
||||
|
||||
assert_eq!(tri, 0u32);
|
||||
assert_eq!(fs, fs2);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn powers_of_two() {
|
||||
let fs = (8.0f32, 128.0f32, 0.5f32);
|
||||
assert_eq!(round_trip(fs), fs);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn accuracy() {
|
||||
let mut n = 1.0;
|
||||
for _ in 0..256 {
|
||||
let (x, _, _) = round_trip((n, 0.0, 0.0));
|
||||
assert_eq!(n, x);
|
||||
n += 1.0 / 256.0;
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn integers() {
|
||||
for n in 0..=512 {
|
||||
let (x, _, _) = round_trip((n as f32, 0.0, 0.0));
|
||||
assert_eq!(n as f32, x);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn rounding() {
|
||||
let fs = (7.0f32, 513.0f32, 1.0f32);
|
||||
assert_eq!(round_trip(fs), (8.0, 514.0, 2.0));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn rounding_edge_case() {
|
||||
let fs = (1023.0f32, 0.0f32, 0.0f32);
|
||||
|
||||
assert_eq!(round_trip(fs), (1024.0, 0.0, 0.0),);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn saturate() {
|
||||
let fs = (9999999999.0, 9999999999.0, 9999999999.0);
|
||||
|
||||
assert_eq!(round_trip(fs), (MAX, MAX, MAX));
|
||||
assert_eq!(decode(0xFFFFFFFF), (MAX, MAX, MAX),);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn inf_saturate() {
|
||||
use std::f32::INFINITY;
|
||||
let fs = (INFINITY, 0.0, 0.0);
|
||||
|
||||
assert_eq!(round_trip(fs), (MAX, 0.0, 0.0));
|
||||
assert_eq!(encode(fs), 0xFF80001F,);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn partial_saturate() {
|
||||
let fs = (9999999999.0, 4096.0, 262144.0);
|
||||
|
||||
assert_eq!(round_trip(fs), (MAX, 4096.0, 262144.0));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn smallest_value() {
|
||||
let fs = (MIN, MIN * 0.5, MIN * 0.49);
|
||||
assert_eq!(round_trip(fs), (MIN, MIN, 0.0));
|
||||
assert_eq!(decode(0x00_80_40_00), (MIN, MIN, 0.0));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn underflow() {
|
||||
let fs = (MIN * 0.49, 0.0, 0.0);
|
||||
assert_eq!(encode(fs), 0);
|
||||
assert_eq!(round_trip(fs), (0.0, 0.0, 0.0));
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user