Some cleanup and improvements to the trifloat sub-crate.
This commit is contained in:
parent
e31ec6eb4e
commit
103775f0e9
|
@ -1,4 +1,34 @@
|
||||||
|
//! Functions for storing triplets of floating point values in a
|
||||||
|
//! shared-exponent format.
|
||||||
|
//!
|
||||||
|
//! The motivating use-case for this is compactly storing HDR RGB colors. But
|
||||||
|
//! it may be useful for other things as well.
|
||||||
|
|
||||||
pub mod signed48;
|
pub mod signed48;
|
||||||
/// This crate provides types and functions for storing triplets of floating
|
|
||||||
/// point values in a shared-exponent format.
|
|
||||||
pub mod unsigned32;
|
pub mod unsigned32;
|
||||||
|
|
||||||
|
//===========================================================================
|
||||||
|
// Some shared functions used by the other modules in this crate.
|
||||||
|
|
||||||
|
/// Calculates 2.0^exp using IEEE bit fiddling.
|
||||||
|
///
|
||||||
|
/// Only works for integer exponents in the range [-126, 127]
|
||||||
|
/// due to IEEE 32-bit float limits.
|
||||||
|
#[inline(always)]
|
||||||
|
fn fiddle_exp2(exp: i32) -> f32 {
|
||||||
|
use std::f32;
|
||||||
|
f32::from_bits(((exp + 127) as u32) << 23)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Calculates floor(log2(n)) using IEEE bit fiddling.
|
||||||
|
///
|
||||||
|
/// Because of IEEE floating point format, infinity and NaN
|
||||||
|
/// floating point values return 128, and subnormal numbers always
|
||||||
|
/// return -127. These particular behaviors are not, of course,
|
||||||
|
/// mathematically correct, but are actually desirable for the
|
||||||
|
/// calculations in this library.
|
||||||
|
#[inline(always)]
|
||||||
|
fn fiddle_log2(n: f32) -> i32 {
|
||||||
|
use std::f32;
|
||||||
|
((f32::to_bits(n) >> 23) & 0b1111_1111) as i32 - 127
|
||||||
|
}
|
||||||
|
|
|
@ -1,57 +1,59 @@
|
||||||
//! Encoding/decoding for a 48-bit shared-exponent representation of three
|
//! Encoding/decoding for signed 48-bit trifloat numbers.
|
||||||
//! signed floating point numbers.
|
|
||||||
//!
|
//!
|
||||||
//! This is useful for e.g. compactly storing HDR colors. The encoding
|
//! The encoding uses 13 bits of mantissa and 1 sign bit per number, and 6
|
||||||
//! uses 14 bits of mantissa per number (including the sign bit for each) and 6
|
//! bits for the shared exponent. The bit layout is: [sign 1, mantissa 1,
|
||||||
//! bits for the shared exponent. The bit layout is [mantissa 1, mantissa 2,
|
//! sign 2, mantissa 2, sign 3, mantissa 3, exponent]. The exponent is stored
|
||||||
//! mantissa 3, exponent]. The exponent is stored as an unsigned integer with
|
//! as an unsigned integer with a bias of 25.
|
||||||
//! a bias of 32. The mantissas are stored as a single leading sign bit and 13
|
|
||||||
//! bits of unsigned integer.
|
|
||||||
//!
|
//!
|
||||||
//! The largest representable number is ?, and the smallest
|
//! The largest representable number is `2^38 - 2^25`, and the smallest
|
||||||
//! representable positive number is ?.
|
//! representable positive number is `2^-38`.
|
||||||
//!
|
//!
|
||||||
//! Since the exponent is shared between the three values, the precision
|
//! Since the exponent is shared between all three values, the precision
|
||||||
//! of all three values depends on the largest (in absolute value) of the
|
//! of all three values depends on the largest (in magnitude) of the three.
|
||||||
//! three. All integers in the range [-8191, 8191] can be represented exactly
|
//! All integers in the range `[-8192, 8192]` can be represented exactly in the
|
||||||
//! in the largest value.
|
//! largest value.
|
||||||
|
|
||||||
|
use crate::{fiddle_exp2, fiddle_log2};
|
||||||
|
|
||||||
/// Largest representable number.
|
/// Largest representable number.
|
||||||
pub const MAX: f32 = 35_180_077_121_536.0;
|
pub const MAX: f32 = 274_844_352_512.0;
|
||||||
|
|
||||||
/// Smallest representable non-zero number.
|
/// Smallest representable number.
|
||||||
pub const MIN_POSITIVE: f32 = 0.000_000_000_465_661_287;
|
///
|
||||||
|
/// Note this is not the smallest _magnitude_ number. This is a negative
|
||||||
|
/// number of large magnitude.
|
||||||
|
pub const MIN: f32 = -274_844_352_512.0;
|
||||||
|
|
||||||
pub const MIN: f32 = -35_180_077_121_536.0;
|
/// Smallest representable positive number.
|
||||||
|
///
|
||||||
|
/// This is the number with the smallest possible magnitude (aside from zero).
|
||||||
|
pub const MIN_POSITIVE: f32 = 0.000_000_000_003_637_978_807_091_713;
|
||||||
|
|
||||||
/// Difference between 1.0 and the next largest representable number.
|
/// Difference between 1.0 and the next largest representable number.
|
||||||
pub const EPSILON: f32 = 1.0 / 4096.0;
|
pub const EPSILON: f32 = 1.0 / 4096.0;
|
||||||
|
|
||||||
const EXP_BIAS: i32 = 31 - 13;
|
const EXP_BIAS: i32 = 25;
|
||||||
const MIN_EXP: i32 = 0 - EXP_BIAS;
|
const MIN_EXP: i32 = 0 - EXP_BIAS;
|
||||||
const MAX_EXP: i32 = 63 - EXP_BIAS;
|
const MAX_EXP: i32 = 63 - EXP_BIAS;
|
||||||
|
|
||||||
/// Encodes three floating point values into a 48-bit trifloat format.
|
/// Encodes three floating point values into a signed 48-bit trifloat.
|
||||||
///
|
///
|
||||||
/// Note that even though the return value is a u64, only the lower 48
|
/// Input floats that are larger than `MAX` or smaller than `MIN` will saturate
|
||||||
/// bits are used.
|
/// to `MAX` and `MIN` respectively, including +/- infinity. Values are
|
||||||
|
/// converted to trifloat precision by rounding.
|
||||||
///
|
///
|
||||||
/// Floats that are larger than the max representable value in trifloat
|
/// Only the lower 48 bits of the return value are used. The highest 16 bits
|
||||||
/// will saturate. Values are converted to trifloat by rounding, so the
|
/// will all be zero and can be safely discarded.
|
||||||
/// max error introduced by this function is epsilon / 2.
|
|
||||||
///
|
///
|
||||||
/// Warning: NaN's are _not_ supported by the trifloat
|
/// Warning: NaN's are _not_ supported by the trifloat format. There are
|
||||||
/// format. There are debug-only assertions in place to catch such
|
/// debug-only assertions in place to catch such values in the input floats.
|
||||||
/// values in the input floats. Infinity is also not supported in the
|
|
||||||
/// format, but will simply saturate to the largest-magnitude representable
|
|
||||||
/// value.
|
|
||||||
#[inline]
|
#[inline]
|
||||||
pub fn encode(floats: (f32, f32, f32)) -> u64 {
|
pub fn encode(floats: (f32, f32, f32)) -> u64 {
|
||||||
debug_assert!(
|
debug_assert!(
|
||||||
!floats.0.is_nan() && !floats.1.is_nan() && !floats.2.is_nan(),
|
!floats.0.is_nan() && !floats.1.is_nan() && !floats.2.is_nan(),
|
||||||
"trifloat::s48::encode(): encoding to signed 48-bit tri-floats only works correctly for \
|
"trifloat::signed48::encode(): encoding to signed tri-floats only \
|
||||||
non-NaN numbers, but the numbers passed were: ({}, \
|
works correctly for non-NaN numbers, but the numbers passed were: \
|
||||||
{}, {})",
|
({}, {}, {})",
|
||||||
floats.0,
|
floats.0,
|
||||||
floats.1,
|
floats.1,
|
||||||
floats.2
|
floats.2
|
||||||
|
@ -79,7 +81,7 @@ pub fn encode(floats: (f32, f32, f32)) -> u64 {
|
||||||
|
|
||||||
// Edge-case: make sure rounding pushes the largest value up
|
// Edge-case: make sure rounding pushes the largest value up
|
||||||
// appropriately if needed.
|
// appropriately if needed.
|
||||||
if (largest_value * inv_multiplier).abs() + 0.5 >= 8191.0 {
|
if (largest_value * inv_multiplier).abs() + 0.5 >= 8192.0 {
|
||||||
exponent = (exponent + 1).min(MAX_EXP);
|
exponent = (exponent + 1).min(MAX_EXP);
|
||||||
inv_multiplier = fiddle_exp2(-exponent + 13);
|
inv_multiplier = fiddle_exp2(-exponent + 13);
|
||||||
}
|
}
|
||||||
|
@ -99,52 +101,33 @@ pub fn encode(floats: (f32, f32, f32)) -> u64 {
|
||||||
(x_sign << 47) | (x << 34) | (y_sign << 33) | (y << 20) | (z_sign << 19) | (z << 6) | e
|
(x_sign << 47) | (x << 34) | (y_sign << 33) | (y << 20) | (z_sign << 19) | (z << 6) | e
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Decodes a 48-bit trifloat into three full floating point numbers.
|
/// Decodes a signed 48-bit trifloat into three full floating point numbers.
|
||||||
///
|
///
|
||||||
/// This operation is lossless and cannot fail.
|
/// This operation is lossless and cannot fail. Only the lower 48 bits of the
|
||||||
|
/// input value are used--the upper 16 bits can safely be anything and are
|
||||||
|
/// ignored.
|
||||||
#[inline]
|
#[inline]
|
||||||
pub fn decode(trifloat: u64) -> (f32, f32, f32) {
|
pub fn decode(trifloat: u64) -> (f32, f32, f32) {
|
||||||
// Unpack values.
|
// Unpack values.
|
||||||
let x_sign = (trifloat >> 47) as u32;
|
|
||||||
let x = (trifloat >> 34) & 0b111_11111_11111;
|
let x = (trifloat >> 34) & 0b111_11111_11111;
|
||||||
let y_sign = ((trifloat >> 33) & 1) as u32;
|
|
||||||
let y = (trifloat >> 20) & 0b111_11111_11111;
|
let y = (trifloat >> 20) & 0b111_11111_11111;
|
||||||
let z_sign = ((trifloat >> 19) & 1) as u32;
|
|
||||||
let z = (trifloat >> 6) & 0b111_11111_11111;
|
let z = (trifloat >> 6) & 0b111_11111_11111;
|
||||||
|
|
||||||
|
let x_sign = ((trifloat >> 16) & 0x8000_0000) as u32;
|
||||||
|
let y_sign = ((trifloat >> 2) & 0x8000_0000) as u32;
|
||||||
|
let z_sign = ((trifloat << 12) & 0x8000_0000) as u32;
|
||||||
|
|
||||||
let e = trifloat & 0b111_111;
|
let e = trifloat & 0b111_111;
|
||||||
|
|
||||||
let multiplier = fiddle_exp2(e as i32 - EXP_BIAS - 13);
|
let multiplier = fiddle_exp2(e as i32 - EXP_BIAS - 13);
|
||||||
|
|
||||||
(
|
(
|
||||||
f32::from_bits((x as f32 * multiplier).to_bits() | (x_sign << 31)),
|
f32::from_bits((x as f32 * multiplier).to_bits() | x_sign),
|
||||||
f32::from_bits((y as f32 * multiplier).to_bits() | (y_sign << 31)),
|
f32::from_bits((y as f32 * multiplier).to_bits() | y_sign),
|
||||||
f32::from_bits((z as f32 * multiplier).to_bits() | (z_sign << 31)),
|
f32::from_bits((z as f32 * multiplier).to_bits() | z_sign),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Calculates 2.0^exp using IEEE bit fiddling.
|
|
||||||
///
|
|
||||||
/// Only works for integer exponents in the range [-126, 127]
|
|
||||||
/// due to IEEE 32-bit float limits.
|
|
||||||
#[inline(always)]
|
|
||||||
fn fiddle_exp2(exp: i32) -> f32 {
|
|
||||||
use std::f32;
|
|
||||||
f32::from_bits(((exp + 127) as u32) << 23)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Calculates a floor(log2(n)) using IEEE bit fiddling.
|
|
||||||
///
|
|
||||||
/// Because of IEEE floating point format, infinity and NaN
|
|
||||||
/// floating point values return 128, and subnormal numbers always
|
|
||||||
/// return -127. These particular behaviors are not, of course,
|
|
||||||
/// mathemetically correct, but are actually desireable for the
|
|
||||||
/// calculations in this library.
|
|
||||||
#[inline(always)]
|
|
||||||
fn fiddle_log2(n: f32) -> i32 {
|
|
||||||
use std::f32;
|
|
||||||
((f32::to_bits(n) >> 23) & 0b1111_1111) as i32 - 127
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use super::*;
|
use super::*;
|
||||||
|
@ -170,6 +153,27 @@ mod tests {
|
||||||
assert_eq!(round_trip(fs), fs);
|
assert_eq!(round_trip(fs), fs);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn signs() {
|
||||||
|
let fs1 = (1.0f32, 1.0f32, 1.0f32);
|
||||||
|
let fs2 = (1.0f32, 1.0f32, -1.0f32);
|
||||||
|
let fs3 = (1.0f32, -1.0f32, 1.0f32);
|
||||||
|
let fs4 = (1.0f32, -1.0f32, -1.0f32);
|
||||||
|
let fs5 = (-1.0f32, 1.0f32, 1.0f32);
|
||||||
|
let fs6 = (-1.0f32, 1.0f32, -1.0f32);
|
||||||
|
let fs7 = (-1.0f32, -1.0f32, 1.0f32);
|
||||||
|
let fs8 = (-1.0f32, -1.0f32, -1.0f32);
|
||||||
|
|
||||||
|
assert_eq!(fs1, round_trip(fs1));
|
||||||
|
assert_eq!(fs2, round_trip(fs2));
|
||||||
|
assert_eq!(fs3, round_trip(fs3));
|
||||||
|
assert_eq!(fs4, round_trip(fs4));
|
||||||
|
assert_eq!(fs5, round_trip(fs5));
|
||||||
|
assert_eq!(fs6, round_trip(fs6));
|
||||||
|
assert_eq!(fs7, round_trip(fs7));
|
||||||
|
assert_eq!(fs8, round_trip(fs8));
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn accuracy() {
|
fn accuracy() {
|
||||||
let mut n = 1.0;
|
let mut n = 1.0;
|
||||||
|
@ -182,7 +186,7 @@ mod tests {
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn integers() {
|
fn integers() {
|
||||||
for n in 0..=512 {
|
for n in -8192i32..=8192i32 {
|
||||||
let (x, _, _) = round_trip((n as f32, 0.0, 0.0));
|
let (x, _, _) = round_trip((n as f32, 0.0, 0.0));
|
||||||
assert_eq!(n as f32, x);
|
assert_eq!(n as f32, x);
|
||||||
}
|
}
|
||||||
|
@ -248,9 +252,9 @@ mod tests {
|
||||||
let fs = (MIN_POSITIVE, MIN_POSITIVE * 0.5, MIN_POSITIVE * 0.49);
|
let fs = (MIN_POSITIVE, MIN_POSITIVE * 0.5, MIN_POSITIVE * 0.49);
|
||||||
let fsn = (-MIN_POSITIVE, -MIN_POSITIVE * 0.5, -MIN_POSITIVE * 0.49);
|
let fsn = (-MIN_POSITIVE, -MIN_POSITIVE * 0.5, -MIN_POSITIVE * 0.49);
|
||||||
|
|
||||||
|
assert_eq!(decode(0x600100000), (MIN_POSITIVE, -MIN_POSITIVE, 0.0));
|
||||||
assert_eq!(round_trip(fs), (MIN_POSITIVE, MIN_POSITIVE, 0.0));
|
assert_eq!(round_trip(fs), (MIN_POSITIVE, MIN_POSITIVE, 0.0));
|
||||||
assert_eq!(round_trip(fsn), (-MIN_POSITIVE, -MIN_POSITIVE, -0.0));
|
assert_eq!(round_trip(fsn), (-MIN_POSITIVE, -MIN_POSITIVE, -0.0));
|
||||||
assert_eq!(decode(0x600100000), (MIN_POSITIVE, -MIN_POSITIVE, 0.0));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
@ -259,4 +263,37 @@ mod tests {
|
||||||
assert_eq!(encode(fs), 0x200000000);
|
assert_eq!(encode(fs), 0x200000000);
|
||||||
assert_eq!(round_trip(fs), (0.0, -0.0, 0.0));
|
assert_eq!(round_trip(fs), (0.0, -0.0, 0.0));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn garbage_upper_bits_decode() {
|
||||||
|
let fs1 = (4.0, -623.53, 12.3);
|
||||||
|
let fs2 = (-63456254.2, 5235423.53, 54353.3);
|
||||||
|
let fs3 = (-0.000000634, 0.00000000005, 0.00000000892);
|
||||||
|
|
||||||
|
let n1 = encode(fs1);
|
||||||
|
let n2 = encode(fs2);
|
||||||
|
let n3 = encode(fs3);
|
||||||
|
|
||||||
|
assert_eq!(decode(n1), decode(n1 | 0xffff_0000_0000_0000));
|
||||||
|
assert_eq!(decode(n2), decode(n2 | 0xffff_0000_0000_0000));
|
||||||
|
assert_eq!(decode(n3), decode(n3 | 0xffff_0000_0000_0000));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
#[should_panic]
|
||||||
|
fn nans_01() {
|
||||||
|
encode((std::f32::NAN, 1.0, -1.0));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
#[should_panic]
|
||||||
|
fn nans_02() {
|
||||||
|
encode((1.0, std::f32::NAN, -1.0));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
#[should_panic]
|
||||||
|
fn nans_03() {
|
||||||
|
encode((1.0, -1.0, std::f32::NAN));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,19 +1,18 @@
|
||||||
//! Encoding/decoding for a 32-bit shared-exponent representation of three
|
//! Encoding/decoding for unsigned 32-bit trifloat numbers.
|
||||||
//! positive floating point numbers.
|
|
||||||
//!
|
//!
|
||||||
//! This is useful for e.g. compactly storing HDR colors. The encoding
|
//! The encoding uses 9 bits of mantissa per number, and 5 bits for the shared
|
||||||
//! uses 9 bits of mantissa per number, and 5 bits for the shared
|
//! exponent. The bit layout is [mantissa 1, mantissa 2, mantissa 3, exponent].
|
||||||
//! exponent. The bit layout is [mantissa 1, mantissa 2, mantissa 3,
|
//! The exponent is stored as an unsigned integer with a bias of 10.
|
||||||
//! exponent]. The exponent is stored as an unsigned integer with a
|
|
||||||
//! bias of 10.
|
|
||||||
//!
|
//!
|
||||||
//! The largest representable number is 2^21 - 4096, and the smallest
|
//! The largest representable number is `2^21 - 4096`, and the smallest
|
||||||
//! representable non-zero number is 2^-19.
|
//! representable non-zero number is `2^-19`.
|
||||||
//!
|
//!
|
||||||
//! Since the exponent is shared between the three values, the precision
|
//! Since the exponent is shared between the three values, the precision
|
||||||
//! of all three values depends on the largest of the three. All integers
|
//! of all three values depends on the largest of the three. All integers
|
||||||
//! up to 512 can be represented exactly in the largest value.
|
//! up to 512 can be represented exactly in the largest value.
|
||||||
|
|
||||||
|
use crate::{fiddle_exp2, fiddle_log2};
|
||||||
|
|
||||||
/// Largest representable number.
|
/// Largest representable number.
|
||||||
pub const MAX: f32 = 2_093_056.0;
|
pub const MAX: f32 = 2_093_056.0;
|
||||||
|
|
||||||
|
@ -23,19 +22,18 @@ pub const MIN: f32 = 0.000_001_907_348_6;
|
||||||
/// Difference between 1.0 and the next largest representable number.
|
/// Difference between 1.0 and the next largest representable number.
|
||||||
pub const EPSILON: f32 = 1.0 / 256.0;
|
pub const EPSILON: f32 = 1.0 / 256.0;
|
||||||
|
|
||||||
#[derive(Debug, Copy, Clone)]
|
const EXP_BIAS: i32 = 10;
|
||||||
pub struct U9(u32);
|
const MIN_EXP: i32 = 0 - EXP_BIAS;
|
||||||
|
const MAX_EXP: i32 = 31 - EXP_BIAS;
|
||||||
|
|
||||||
/// Encodes three floating point values into the trifloat format.
|
/// Encodes three floating point values into an unsigned 32-bit trifloat.
|
||||||
///
|
///
|
||||||
/// Floats that are larger than the max representable value in trifloat
|
/// Input floats larger than `MAX` will saturate to `MAX`, including infinity.
|
||||||
/// will saturate. Values are converted to trifloat by rounding, so the
|
/// Values are converted to trifloat precision by rounding.
|
||||||
/// max error introduced by this function is epsilon / 2.
|
|
||||||
///
|
///
|
||||||
/// Warning: negative values and NaN's are _not_ supported by the trifloat
|
/// Warning: negative values and NaN's are _not_ supported by the trifloat
|
||||||
/// format. There are debug-only assertions in place to catch such
|
/// format. There are debug-only assertions in place to catch such
|
||||||
/// values in the input floats. Infinity is also not supported in the
|
/// values in the input floats.
|
||||||
/// format, but will simply saturate to the largest representable value.
|
|
||||||
#[inline]
|
#[inline]
|
||||||
pub fn encode(floats: (f32, f32, f32)) -> u32 {
|
pub fn encode(floats: (f32, f32, f32)) -> u32 {
|
||||||
debug_assert!(
|
debug_assert!(
|
||||||
|
@ -45,9 +43,9 @@ pub fn encode(floats: (f32, f32, f32)) -> u32 {
|
||||||
&& !floats.0.is_nan()
|
&& !floats.0.is_nan()
|
||||||
&& !floats.1.is_nan()
|
&& !floats.1.is_nan()
|
||||||
&& !floats.2.is_nan(),
|
&& !floats.2.is_nan(),
|
||||||
"trifloat::encode(): encoding to tri-floats only works correctly for \
|
"trifloat::unsigned32::encode(): encoding to unsigned tri-floats only \
|
||||||
positive, non-NaN numbers, but the numbers passed were: ({}, \
|
works correctly for positive, non-NaN numbers, but the numbers passed \
|
||||||
{}, {})",
|
were: ({}, {}, {})",
|
||||||
floats.0,
|
floats.0,
|
||||||
floats.1,
|
floats.1,
|
||||||
floats.2
|
floats.2
|
||||||
|
@ -60,13 +58,13 @@ pub fn encode(floats: (f32, f32, f32)) -> u32 {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Calculate the exponent and 1.0/multiplier for encoding the values.
|
// Calculate the exponent and 1.0/multiplier for encoding the values.
|
||||||
let mut exponent = (fiddle_log2(largest_value) + 1).max(-10).min(21);
|
let mut exponent = (fiddle_log2(largest_value) + 1).max(MIN_EXP).min(MAX_EXP);
|
||||||
let mut inv_multiplier = fiddle_exp2(-exponent + 9);
|
let mut inv_multiplier = fiddle_exp2(-exponent + 9);
|
||||||
|
|
||||||
// Edge-case: make sure rounding pushes the largest value up
|
// Edge-case: make sure rounding pushes the largest value up
|
||||||
// appropriately if needed.
|
// appropriately if needed.
|
||||||
if (largest_value * inv_multiplier) + 0.5 >= 512.0 {
|
if (largest_value * inv_multiplier) + 0.5 >= 512.0 {
|
||||||
exponent = (exponent + 1).min(21);
|
exponent = (exponent + 1).min(MAX_EXP);
|
||||||
inv_multiplier = fiddle_exp2(-exponent + 9);
|
inv_multiplier = fiddle_exp2(-exponent + 9);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -74,13 +72,13 @@ pub fn encode(floats: (f32, f32, f32)) -> u32 {
|
||||||
let x = (floats.0 * inv_multiplier + 0.5).min(511.0) as u32 & 0b1_1111_1111;
|
let x = (floats.0 * inv_multiplier + 0.5).min(511.0) as u32 & 0b1_1111_1111;
|
||||||
let y = (floats.1 * inv_multiplier + 0.5).min(511.0) as u32 & 0b1_1111_1111;
|
let y = (floats.1 * inv_multiplier + 0.5).min(511.0) as u32 & 0b1_1111_1111;
|
||||||
let z = (floats.2 * inv_multiplier + 0.5).min(511.0) as u32 & 0b1_1111_1111;
|
let z = (floats.2 * inv_multiplier + 0.5).min(511.0) as u32 & 0b1_1111_1111;
|
||||||
let e = (exponent + 10) as u32 & 0b1_1111;
|
let e = (exponent + EXP_BIAS) as u32 & 0b1_1111;
|
||||||
|
|
||||||
// Pack values into a u32.
|
// Pack values into a u32.
|
||||||
(x << (5 + 9 + 9)) | (y << (5 + 9)) | (z << 5) | e
|
(x << (5 + 9 + 9)) | (y << (5 + 9)) | (z << 5) | e
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Decodes a trifloat into three full floating point numbers.
|
/// Decodes an unsigned 32-bit trifloat into three full floating point numbers.
|
||||||
///
|
///
|
||||||
/// This operation is lossless and cannot fail.
|
/// This operation is lossless and cannot fail.
|
||||||
#[inline]
|
#[inline]
|
||||||
|
@ -91,7 +89,7 @@ pub fn decode(trifloat: u32) -> (f32, f32, f32) {
|
||||||
let z = (trifloat >> 5) & 0b1_1111_1111;
|
let z = (trifloat >> 5) & 0b1_1111_1111;
|
||||||
let e = trifloat & 0b1_1111;
|
let e = trifloat & 0b1_1111;
|
||||||
|
|
||||||
let multiplier = fiddle_exp2(e as i32 - 10 - 9);
|
let multiplier = fiddle_exp2(e as i32 - EXP_BIAS - 9);
|
||||||
|
|
||||||
(
|
(
|
||||||
x as f32 * multiplier,
|
x as f32 * multiplier,
|
||||||
|
@ -100,29 +98,6 @@ pub fn decode(trifloat: u32) -> (f32, f32, f32) {
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Calculates 2.0^exp using IEEE bit fiddling.
|
|
||||||
///
|
|
||||||
/// Only works for integer exponents in the range [-126, 127]
|
|
||||||
/// due to IEEE 32-bit float limits.
|
|
||||||
#[inline(always)]
|
|
||||||
fn fiddle_exp2(exp: i32) -> f32 {
|
|
||||||
use std::f32;
|
|
||||||
f32::from_bits(((exp + 127) as u32) << 23)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Calculates a floor(log2(n)) using IEEE bit fiddling.
|
|
||||||
///
|
|
||||||
/// Because of IEEE floating point format, infinity and NaN
|
|
||||||
/// floating point values return 128, and subnormal numbers always
|
|
||||||
/// return -127. These particular behaviors are not, of course,
|
|
||||||
/// mathemetically correct, but are actually desireable for the
|
|
||||||
/// calculations in this library.
|
|
||||||
#[inline(always)]
|
|
||||||
fn fiddle_log2(n: f32) -> i32 {
|
|
||||||
use std::f32;
|
|
||||||
((f32::to_bits(n) >> 23) & 0b1111_1111) as i32 - 127
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use super::*;
|
use super::*;
|
||||||
|
@ -216,4 +191,45 @@ mod tests {
|
||||||
assert_eq!(encode(fs), 0);
|
assert_eq!(encode(fs), 0);
|
||||||
assert_eq!(round_trip(fs), (0.0, 0.0, 0.0));
|
assert_eq!(round_trip(fs), (0.0, 0.0, 0.0));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
#[should_panic]
|
||||||
|
fn nans_01() {
|
||||||
|
encode((std::f32::NAN, 0.0, 0.0));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
#[should_panic]
|
||||||
|
fn nans_02() {
|
||||||
|
encode((0.0, std::f32::NAN, 0.0));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
#[should_panic]
|
||||||
|
fn nans_03() {
|
||||||
|
encode((0.0, 0.0, std::f32::NAN));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
#[should_panic]
|
||||||
|
fn negative_01() {
|
||||||
|
encode((-1.0, 0.0, 0.0));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
#[should_panic]
|
||||||
|
fn negative_02() {
|
||||||
|
encode((0.0, -1.0, 0.0));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
#[should_panic]
|
||||||
|
fn negative_03() {
|
||||||
|
encode((0.0, 0.0, -1.0));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn negative_04() {
|
||||||
|
encode((-0.0, -0.0, -0.0));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue
Block a user