Merge branch 'master' into micropoly
This commit is contained in:
commit
e7ddc6f0c5
16
src/color.rs
16
src/color.rs
|
@ -5,7 +5,7 @@ pub use color::{
|
||||||
use glam::Vec4;
|
use glam::Vec4;
|
||||||
use half::f16;
|
use half::f16;
|
||||||
use spectral_upsampling::meng::{spectrum_xyz_to_p_4, EQUAL_ENERGY_REFLECTANCE};
|
use spectral_upsampling::meng::{spectrum_xyz_to_p_4, EQUAL_ENERGY_REFLECTANCE};
|
||||||
use trifloat::signed48;
|
use trifloat::fluv32;
|
||||||
|
|
||||||
use crate::{lerp::Lerp, math::fast_exp};
|
use crate::{lerp::Lerp, math::fast_exp};
|
||||||
|
|
||||||
|
@ -142,7 +142,7 @@ impl Color {
|
||||||
/// Returns the post-compression size of this color.
|
/// Returns the post-compression size of this color.
|
||||||
pub fn compressed_size(&self) -> usize {
|
pub fn compressed_size(&self) -> usize {
|
||||||
match self {
|
match self {
|
||||||
Color::XYZ(_, _, _) => 7,
|
Color::XYZ(_, _, _) => 5,
|
||||||
|
|
||||||
Color::Blackbody { .. } => 5,
|
Color::Blackbody { .. } => 5,
|
||||||
|
|
||||||
|
@ -160,9 +160,7 @@ impl Color {
|
||||||
match *self {
|
match *self {
|
||||||
Color::XYZ(x, y, z) => {
|
Color::XYZ(x, y, z) => {
|
||||||
out_data[0] = 0; // Discriminant
|
out_data[0] = 0; // Discriminant
|
||||||
let col = signed48::encode((x, y, z));
|
(&mut out_data[1..5]).copy_from_slice(&fluv32::encode((x, y, z)).to_ne_bytes()[..]);
|
||||||
let col = col.to_le_bytes();
|
|
||||||
(&mut out_data[1..7]).copy_from_slice(&col[0..6]);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Color::Blackbody {
|
Color::Blackbody {
|
||||||
|
@ -200,10 +198,10 @@ impl Color {
|
||||||
match in_data[0] {
|
match in_data[0] {
|
||||||
0 => {
|
0 => {
|
||||||
// XYZ
|
// XYZ
|
||||||
let mut bytes = [0u8; 8];
|
let mut bytes = [0u8; 4];
|
||||||
(&mut bytes[0..6]).copy_from_slice(&in_data[1..7]);
|
(&mut bytes[..]).copy_from_slice(&in_data[1..5]);
|
||||||
let (x, y, z) = signed48::decode(u64::from_le_bytes(bytes));
|
let (x, y, z) = fluv32::decode(u32::from_ne_bytes(bytes));
|
||||||
(Color::XYZ(x, y, z), 7)
|
(Color::XYZ(x, y, z), 5)
|
||||||
}
|
}
|
||||||
|
|
||||||
1 => {
|
1 => {
|
||||||
|
|
|
@ -1,26 +1,38 @@
|
||||||
use bencher::{benchmark_group, benchmark_main, black_box, Bencher};
|
use bencher::{benchmark_group, benchmark_main, black_box, Bencher};
|
||||||
use oct32norm::{decode, encode};
|
use oct32norm::{decode, encode, encode_precise};
|
||||||
use rand::{rngs::SmallRng, FromEntropy, Rng};
|
use rand::{rngs::SmallRng, FromEntropy, Rng};
|
||||||
|
|
||||||
//----
|
//----
|
||||||
|
|
||||||
fn encode_100_values(bench: &mut Bencher) {
|
fn encode_1000_values(bench: &mut Bencher) {
|
||||||
let mut rng = SmallRng::from_entropy();
|
let mut rng = SmallRng::from_entropy();
|
||||||
bench.iter(|| {
|
bench.iter(|| {
|
||||||
let x = rng.gen::<f32>() - 0.5;
|
let x = rng.gen::<f32>() - 0.5;
|
||||||
let y = rng.gen::<f32>() - 0.5;
|
let y = rng.gen::<f32>() - 0.5;
|
||||||
let z = rng.gen::<f32>() - 0.5;
|
let z = rng.gen::<f32>() - 0.5;
|
||||||
for _ in 0..100 {
|
for _ in 0..1000 {
|
||||||
black_box(encode(black_box((x, y, z))));
|
black_box(encode(black_box((x, y, z))));
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
fn decode_100_values(bench: &mut Bencher) {
|
fn encode_precise_1000_values(bench: &mut Bencher) {
|
||||||
|
let mut rng = SmallRng::from_entropy();
|
||||||
|
bench.iter(|| {
|
||||||
|
let x = rng.gen::<f32>() - 0.5;
|
||||||
|
let y = rng.gen::<f32>() - 0.5;
|
||||||
|
let z = rng.gen::<f32>() - 0.5;
|
||||||
|
for _ in 0..1000 {
|
||||||
|
black_box(encode_precise(black_box((x, y, z))));
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
fn decode_1000_values(bench: &mut Bencher) {
|
||||||
let mut rng = SmallRng::from_entropy();
|
let mut rng = SmallRng::from_entropy();
|
||||||
bench.iter(|| {
|
bench.iter(|| {
|
||||||
let v = rng.gen::<u32>();
|
let v = rng.gen::<u32>();
|
||||||
for _ in 0..100 {
|
for _ in 0..1000 {
|
||||||
black_box(decode(black_box(v)));
|
black_box(decode(black_box(v)));
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
@ -28,5 +40,10 @@ fn decode_100_values(bench: &mut Bencher) {
|
||||||
|
|
||||||
//----
|
//----
|
||||||
|
|
||||||
benchmark_group!(benches, encode_100_values, decode_100_values,);
|
benchmark_group!(
|
||||||
|
benches,
|
||||||
|
encode_1000_values,
|
||||||
|
encode_precise_1000_values,
|
||||||
|
decode_1000_values,
|
||||||
|
);
|
||||||
benchmark_main!(benches);
|
benchmark_main!(benches);
|
||||||
|
|
|
@ -4,31 +4,60 @@
|
||||||
//! of Efficient Representations for Independent Unit Vectors" by
|
//! of Efficient Representations for Independent Unit Vectors" by
|
||||||
//! Cigolle et al.
|
//! Cigolle et al.
|
||||||
|
|
||||||
|
const STEP_SIZE: f32 = 1.0 / STEPS;
|
||||||
|
const STEPS: f32 = ((1 << (16 - 1)) - 1) as f32;
|
||||||
|
|
||||||
/// Encodes a vector of three floats to the oct32 format.
|
/// Encodes a vector of three floats to the oct32 format.
|
||||||
///
|
///
|
||||||
/// The input vector does not need to be normalized--only the direction
|
/// The input vector does not need to be normalized--only the direction
|
||||||
/// matters to the encoding process, not the length.
|
/// matters to the encoding process, not the length.
|
||||||
#[inline]
|
#[inline]
|
||||||
pub fn encode(vec: (f32, f32, f32)) -> u32 {
|
pub fn encode(vec: (f32, f32, f32)) -> u32 {
|
||||||
let inv_l1_norm = 1.0f32 / (vec.0.abs() + vec.1.abs() + vec.2.abs());
|
let (u, v) = vec3_to_oct(vec);
|
||||||
|
((to_snorm_16(u) as u32) << 16) | to_snorm_16(v) as u32
|
||||||
|
}
|
||||||
|
|
||||||
let (u, v) = if vec.2 < 0.0 {
|
/// Encodes a vector of three floats to the oct32 format.
|
||||||
(
|
///
|
||||||
u32::from(to_snorm_16(
|
/// This is the same as `encode()` except that it is slower and encodes
|
||||||
(1.0 - (vec.1 * inv_l1_norm).abs()) * sign(vec.0),
|
/// with slightly better precision.
|
||||||
)),
|
pub fn encode_precise(vec: (f32, f32, f32)) -> u32 {
|
||||||
u32::from(to_snorm_16(
|
#[inline(always)]
|
||||||
(1.0 - (vec.0 * inv_l1_norm).abs()) * sign(vec.1),
|
fn dot_norm(a: (f32, f32, f32), b: (f32, f32, f32)) -> f64 {
|
||||||
)),
|
let l = ((a.0 as f64 * a.0 as f64) + (a.1 as f64 * a.1 as f64) + (a.2 as f64 * a.2 as f64))
|
||||||
)
|
.sqrt();
|
||||||
} else {
|
((a.0 as f64 * b.0 as f64) + (a.1 as f64 * b.1 as f64) + (a.2 as f64 * b.2 as f64)) / l
|
||||||
(
|
}
|
||||||
u32::from(to_snorm_16(vec.0 * inv_l1_norm)),
|
|
||||||
u32::from(to_snorm_16(vec.1 * inv_l1_norm)),
|
// Calculate the initial floored version.
|
||||||
)
|
let s = {
|
||||||
|
let mut s = vec3_to_oct(vec); // Remap to the square.
|
||||||
|
s.0 = (s.0.max(-1.0).min(1.0) * STEPS).floor() * STEP_SIZE;
|
||||||
|
s.1 = (s.1.max(-1.0).min(1.0) * STEPS).floor() * STEP_SIZE;
|
||||||
|
s
|
||||||
};
|
};
|
||||||
|
|
||||||
(u << 16) | v
|
// Test all combinations of floor and ceil and keep the best.
|
||||||
|
// Note that at +/- 1, this will exit the square, but that
|
||||||
|
// will be a worse encoding and never win.
|
||||||
|
let mut best_rep = s;
|
||||||
|
let mut max_dot = 0.0;
|
||||||
|
for &(i, j) in &[
|
||||||
|
(0.0, 0.0),
|
||||||
|
(0.0, STEP_SIZE),
|
||||||
|
(STEP_SIZE, 0.0),
|
||||||
|
(STEP_SIZE, STEP_SIZE),
|
||||||
|
] {
|
||||||
|
let candidate = (s.0 + i, s.1 + j);
|
||||||
|
let oct = oct_to_vec3(candidate);
|
||||||
|
let dot = dot_norm(oct, vec);
|
||||||
|
if dot > max_dot {
|
||||||
|
best_rep = candidate;
|
||||||
|
max_dot = dot;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
((to_snorm_16(best_rep.0) as u32) << 16) | to_snorm_16(best_rep.1) as u32
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Decodes from an oct32 to a vector of three floats.
|
/// Decodes from an oct32 to a vector of three floats.
|
||||||
|
@ -37,30 +66,45 @@ pub fn encode(vec: (f32, f32, f32)) -> u32 {
|
||||||
/// needs a normalized vector should normalize the returned vector.
|
/// needs a normalized vector should normalize the returned vector.
|
||||||
#[inline]
|
#[inline]
|
||||||
pub fn decode(n: u32) -> (f32, f32, f32) {
|
pub fn decode(n: u32) -> (f32, f32, f32) {
|
||||||
let mut vec0 = from_snorm_16((n >> 16) as u16);
|
oct_to_vec3((from_snorm_16((n >> 16) as u16), from_snorm_16(n as u16)))
|
||||||
let mut vec1 = from_snorm_16(n as u16);
|
}
|
||||||
let vec2 = 1.0 - (vec0.abs() + vec1.abs());
|
|
||||||
|
#[inline(always)]
|
||||||
|
fn vec3_to_oct(vec: (f32, f32, f32)) -> (f32, f32) {
|
||||||
|
let l1_norm = vec.0.abs() + vec.1.abs() + vec.2.abs();
|
||||||
|
let u = vec.0 / l1_norm;
|
||||||
|
let v = vec.1 / l1_norm;
|
||||||
|
|
||||||
|
if vec.2 > 0.0 {
|
||||||
|
(u, v)
|
||||||
|
} else {
|
||||||
|
((1.0 - v.abs()) * sign(vec.0), (1.0 - u.abs()) * sign(vec.1))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline(always)]
|
||||||
|
fn oct_to_vec3(oct: (f32, f32)) -> (f32, f32, f32) {
|
||||||
|
let vec2 = 1.0 - (oct.0.abs() + oct.1.abs());
|
||||||
|
|
||||||
if vec2 < 0.0 {
|
if vec2 < 0.0 {
|
||||||
let old_x = vec0;
|
(
|
||||||
vec0 = (1.0 - vec1.abs()) * sign(old_x);
|
(1.0 - oct.1.abs()) * sign(oct.0),
|
||||||
vec1 = (1.0 - old_x.abs()) * sign(vec1);
|
(1.0 - oct.0.abs()) * sign(oct.1),
|
||||||
|
vec2,
|
||||||
|
)
|
||||||
|
} else {
|
||||||
|
(oct.0, oct.1, vec2)
|
||||||
}
|
}
|
||||||
|
|
||||||
(vec0, vec1, vec2)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[inline(always)]
|
#[inline(always)]
|
||||||
fn to_snorm_16(n: f32) -> u16 {
|
fn to_snorm_16(n: f32) -> u16 {
|
||||||
(n.max(-1.0).min(1.0) * ((1u32 << (16 - 1)) - 1) as f32).round() as i16 as u16
|
(n * STEPS).round() as i16 as u16
|
||||||
}
|
}
|
||||||
|
|
||||||
#[inline(always)]
|
#[inline(always)]
|
||||||
fn from_snorm_16(n: u16) -> f32 {
|
fn from_snorm_16(n: u16) -> f32 {
|
||||||
f32::from(n as i16)
|
f32::from(n as i16) * STEP_SIZE
|
||||||
* (1.0f32 / ((1u32 << (16 - 1)) - 1) as f32)
|
|
||||||
.max(-1.0)
|
|
||||||
.min(1.0)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[inline(always)]
|
#[inline(always)]
|
||||||
|
@ -80,21 +124,27 @@ mod tests {
|
||||||
fn axis_directions() {
|
fn axis_directions() {
|
||||||
let px = (1.0, 0.0, 0.0);
|
let px = (1.0, 0.0, 0.0);
|
||||||
let px_oct = encode(px);
|
let px_oct = encode(px);
|
||||||
|
let px_octp = encode_precise(px);
|
||||||
|
|
||||||
let nx = (-1.0, 0.0, 0.0);
|
let nx = (-1.0, 0.0, 0.0);
|
||||||
let nx_oct = encode(nx);
|
let nx_oct = encode(nx);
|
||||||
|
let nx_octp = encode_precise(nx);
|
||||||
|
|
||||||
let py = (0.0, 1.0, 0.0);
|
let py = (0.0, 1.0, 0.0);
|
||||||
let py_oct = encode(py);
|
let py_oct = encode(py);
|
||||||
|
let py_octp = encode_precise(py);
|
||||||
|
|
||||||
let ny = (0.0, -1.0, 0.0);
|
let ny = (0.0, -1.0, 0.0);
|
||||||
let ny_oct = encode(ny);
|
let ny_oct = encode(ny);
|
||||||
|
let ny_octp = encode_precise(ny);
|
||||||
|
|
||||||
let pz = (0.0, 0.0, 1.0);
|
let pz = (0.0, 0.0, 1.0);
|
||||||
let pz_oct = encode(pz);
|
let pz_oct = encode(pz);
|
||||||
|
let pz_octp = encode_precise(pz);
|
||||||
|
|
||||||
let nz = (0.0, 0.0, -1.0);
|
let nz = (0.0, 0.0, -1.0);
|
||||||
let nz_oct = encode(nz);
|
let nz_oct = encode(nz);
|
||||||
|
let nz_octp = encode_precise(nz);
|
||||||
|
|
||||||
assert_eq!(px, decode(px_oct));
|
assert_eq!(px, decode(px_oct));
|
||||||
assert_eq!(nx, decode(nx_oct));
|
assert_eq!(nx, decode(nx_oct));
|
||||||
|
@ -102,5 +152,12 @@ mod tests {
|
||||||
assert_eq!(ny, decode(ny_oct));
|
assert_eq!(ny, decode(ny_oct));
|
||||||
assert_eq!(pz, decode(pz_oct));
|
assert_eq!(pz, decode(pz_oct));
|
||||||
assert_eq!(nz, decode(nz_oct));
|
assert_eq!(nz, decode(nz_oct));
|
||||||
|
|
||||||
|
assert_eq!(px, decode(px_octp));
|
||||||
|
assert_eq!(nx, decode(nx_octp));
|
||||||
|
assert_eq!(py, decode(py_octp));
|
||||||
|
assert_eq!(ny, decode(ny_octp));
|
||||||
|
assert_eq!(pz, decode(pz_octp));
|
||||||
|
assert_eq!(nz, decode(nz_octp));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -2,7 +2,7 @@
|
||||||
extern crate proptest;
|
extern crate proptest;
|
||||||
extern crate oct32norm;
|
extern crate oct32norm;
|
||||||
|
|
||||||
use oct32norm::{decode, encode};
|
use oct32norm::{decode, encode, encode_precise};
|
||||||
use proptest::test_runner::Config;
|
use proptest::test_runner::Config;
|
||||||
|
|
||||||
/// Calculates the cosine of the angle between the two vectors,
|
/// Calculates the cosine of the angle between the two vectors,
|
||||||
|
@ -56,18 +56,23 @@ proptest! {
|
||||||
#[test]
|
#[test]
|
||||||
fn pt_roundtrip_angle_precision(v in (-1.0f32..1.0, -1.0f32..1.0, -1.0f32..1.0)) {
|
fn pt_roundtrip_angle_precision(v in (-1.0f32..1.0, -1.0f32..1.0, -1.0f32..1.0)) {
|
||||||
let oct = encode(v);
|
let oct = encode(v);
|
||||||
let v2 = decode(oct);
|
let octp = encode_precise(v);
|
||||||
|
|
||||||
// Check if the angle between the original and the roundtrip
|
// Check if the angle between the original and the roundtrip
|
||||||
// is less than 0.004 degrees
|
// is less than 0.004 degrees
|
||||||
assert!(cos_gt(v, v2, 0.9999999976));
|
assert!(cos_gt(v, decode(oct), 0.9999999976));
|
||||||
|
|
||||||
|
// Check if the angle between the original and the roundtrip
|
||||||
|
// is less than 0.003 degrees
|
||||||
|
assert!(cos_gt(v, decode(octp), 0.9999999986));
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn pt_roundtrip_component_precision(v in (-1.0f32..1.0, -1.0f32..1.0, -1.0f32..1.0)) {
|
fn pt_roundtrip_component_precision(v in (-1.0f32..1.0, -1.0f32..1.0, -1.0f32..1.0)) {
|
||||||
let oct = encode(v);
|
let oct = encode(v);
|
||||||
let v2 = decode(oct);
|
let octp = encode_precise(v);
|
||||||
|
|
||||||
assert!(l1_delta_lt(v, v2, 0.00005));
|
assert!(l1_delta_lt(v, decode(oct), 0.00005));
|
||||||
|
assert!(l1_delta_lt(v, decode(octp), 0.00003));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,60 +1,132 @@
|
||||||
use bencher::{benchmark_group, benchmark_main, black_box, Bencher};
|
use bencher::{benchmark_group, benchmark_main, black_box, Bencher};
|
||||||
use rand::{rngs::SmallRng, FromEntropy, Rng};
|
use rand::{rngs::SmallRng, FromEntropy, Rng};
|
||||||
use trifloat::{signed48, unsigned32};
|
use trifloat::{fluv32, signed48, unsigned32, unsigned40};
|
||||||
|
|
||||||
//----
|
//----
|
||||||
|
|
||||||
fn unsigned32_encode_100_values(bench: &mut Bencher) {
|
fn unsigned32_encode_1000_values(bench: &mut Bencher) {
|
||||||
let mut rng = SmallRng::from_entropy();
|
let mut rng = SmallRng::from_entropy();
|
||||||
bench.iter(|| {
|
bench.iter(|| {
|
||||||
let x = rng.gen::<f32>() - 0.5;
|
let x = rng.gen::<f32>();
|
||||||
let y = rng.gen::<f32>() - 0.5;
|
let y = rng.gen::<f32>();
|
||||||
let z = rng.gen::<f32>() - 0.5;
|
let z = rng.gen::<f32>();
|
||||||
for _ in 0..100 {
|
for _ in 0..1000 {
|
||||||
black_box(unsigned32::encode(black_box((x, y, z))));
|
black_box(unsigned32::encode(black_box((x, y, z))));
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
fn unsigned32_decode_100_values(bench: &mut Bencher) {
|
fn unsigned32_decode_1000_values(bench: &mut Bencher) {
|
||||||
let mut rng = SmallRng::from_entropy();
|
let mut rng = SmallRng::from_entropy();
|
||||||
bench.iter(|| {
|
bench.iter(|| {
|
||||||
let v = rng.gen::<u32>();
|
let v = rng.gen::<u32>();
|
||||||
for _ in 0..100 {
|
for _ in 0..1000 {
|
||||||
black_box(unsigned32::decode(black_box(v)));
|
black_box(unsigned32::decode(black_box(v)));
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
fn signed48_encode_100_values(bench: &mut Bencher) {
|
fn unsigned40_encode_1000_values(bench: &mut Bencher) {
|
||||||
|
let mut rng = SmallRng::from_entropy();
|
||||||
|
bench.iter(|| {
|
||||||
|
let x = rng.gen::<f32>();
|
||||||
|
let y = rng.gen::<f32>();
|
||||||
|
let z = rng.gen::<f32>();
|
||||||
|
for _ in 0..1000 {
|
||||||
|
black_box(unsigned40::encode(black_box((x, y, z))));
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
fn unsigned40_decode_1000_values(bench: &mut Bencher) {
|
||||||
|
let mut rng = SmallRng::from_entropy();
|
||||||
|
bench.iter(|| {
|
||||||
|
let v = [
|
||||||
|
rng.gen::<u8>(),
|
||||||
|
rng.gen::<u8>(),
|
||||||
|
rng.gen::<u8>(),
|
||||||
|
rng.gen::<u8>(),
|
||||||
|
rng.gen::<u8>(),
|
||||||
|
];
|
||||||
|
for _ in 0..1000 {
|
||||||
|
black_box(unsigned40::decode(black_box(v)));
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
fn signed48_encode_1000_values(bench: &mut Bencher) {
|
||||||
let mut rng = SmallRng::from_entropy();
|
let mut rng = SmallRng::from_entropy();
|
||||||
bench.iter(|| {
|
bench.iter(|| {
|
||||||
let x = rng.gen::<f32>() - 0.5;
|
let x = rng.gen::<f32>() - 0.5;
|
||||||
let y = rng.gen::<f32>() - 0.5;
|
let y = rng.gen::<f32>() - 0.5;
|
||||||
let z = rng.gen::<f32>() - 0.5;
|
let z = rng.gen::<f32>() - 0.5;
|
||||||
for _ in 0..100 {
|
for _ in 0..1000 {
|
||||||
black_box(signed48::encode(black_box((x, y, z))));
|
black_box(signed48::encode(black_box((x, y, z))));
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
fn signed48_decode_100_values(bench: &mut Bencher) {
|
fn signed48_decode_1000_values(bench: &mut Bencher) {
|
||||||
let mut rng = SmallRng::from_entropy();
|
let mut rng = SmallRng::from_entropy();
|
||||||
bench.iter(|| {
|
bench.iter(|| {
|
||||||
let v = rng.gen::<u64>() & 0x0000_FFFF_FFFF_FFFF;
|
let v = [
|
||||||
for _ in 0..100 {
|
rng.gen::<u8>(),
|
||||||
|
rng.gen::<u8>(),
|
||||||
|
rng.gen::<u8>(),
|
||||||
|
rng.gen::<u8>(),
|
||||||
|
rng.gen::<u8>(),
|
||||||
|
rng.gen::<u8>(),
|
||||||
|
];
|
||||||
|
for _ in 0..1000 {
|
||||||
black_box(signed48::decode(black_box(v)));
|
black_box(signed48::decode(black_box(v)));
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn fluv32_encode_1000_values(bench: &mut Bencher) {
|
||||||
|
let mut rng = SmallRng::from_entropy();
|
||||||
|
bench.iter(|| {
|
||||||
|
let x = rng.gen::<f32>();
|
||||||
|
let y = rng.gen::<f32>();
|
||||||
|
let z = rng.gen::<f32>();
|
||||||
|
for _ in 0..1000 {
|
||||||
|
black_box(fluv32::encode(black_box((x, y, z))));
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
fn fluv32_decode_1000_values(bench: &mut Bencher) {
|
||||||
|
let mut rng = SmallRng::from_entropy();
|
||||||
|
bench.iter(|| {
|
||||||
|
let v = rng.gen::<u32>();
|
||||||
|
for _ in 0..1000 {
|
||||||
|
black_box(fluv32::decode(black_box(v)));
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
fn fluv32_decode_yuv_1000_values(bench: &mut Bencher) {
|
||||||
|
let mut rng = SmallRng::from_entropy();
|
||||||
|
bench.iter(|| {
|
||||||
|
let v = rng.gen::<u32>();
|
||||||
|
for _ in 0..1000 {
|
||||||
|
black_box(fluv32::decode_yuv(black_box(v)));
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
//----
|
//----
|
||||||
|
|
||||||
benchmark_group!(
|
benchmark_group!(
|
||||||
benches,
|
benches,
|
||||||
unsigned32_encode_100_values,
|
unsigned32_encode_1000_values,
|
||||||
unsigned32_decode_100_values,
|
unsigned32_decode_1000_values,
|
||||||
signed48_encode_100_values,
|
unsigned40_encode_1000_values,
|
||||||
signed48_decode_100_values,
|
unsigned40_decode_1000_values,
|
||||||
|
signed48_encode_1000_values,
|
||||||
|
signed48_decode_1000_values,
|
||||||
|
fluv32_encode_1000_values,
|
||||||
|
fluv32_decode_1000_values,
|
||||||
|
fluv32_decode_yuv_1000_values,
|
||||||
);
|
);
|
||||||
benchmark_main!(benches);
|
benchmark_main!(benches);
|
||||||
|
|
368
sub_crates/trifloat/src/fluv32.rs
Normal file
368
sub_crates/trifloat/src/fluv32.rs
Normal file
|
@ -0,0 +1,368 @@
|
||||||
|
//! Encoding/decoding for the 32-bit FLuv32 color format.
|
||||||
|
//!
|
||||||
|
//! This encoding is based on, but is slightly different than, the 32-bit
|
||||||
|
//! LogLuv format from the paper "Overcoming Gamut and Dynamic Range
|
||||||
|
//! Limitations in Digital Images" by Greg Ward:
|
||||||
|
//!
|
||||||
|
//! * It uses the same uv chroma storage approach, but with *very* slightly
|
||||||
|
//! tweaked scales to allow perfect representation of E.
|
||||||
|
//! * It uses a floating point rather than log encoding to store luminance,
|
||||||
|
//! mainly for the sake of faster decoding.
|
||||||
|
//! * Unlike LogLuv, this format's dynamic range is biased to put more of it
|
||||||
|
//! above 1.0 (see Luminance details below).
|
||||||
|
//! * It omits the sign bit of LogLuv, foregoing negative luminance
|
||||||
|
//! capabilities.
|
||||||
|
//!
|
||||||
|
//! This format has the same chroma precision, very slightly improved luminance
|
||||||
|
//! precision, and the same 127-stops of dynamic range as LogLuv.
|
||||||
|
//!
|
||||||
|
//! Like the LogLuv format, this is an absolute rather than relative color
|
||||||
|
//! encoding, and as such takes CIE XYZ triplets as input. It is *not*
|
||||||
|
//! designed to take arbitrary floating point triplets, and will perform poorly
|
||||||
|
//! if e.g. passed RGB values.
|
||||||
|
//!
|
||||||
|
//! The bit layout is (from most to least significant bit):
|
||||||
|
//!
|
||||||
|
//! * 7 bits: luminance exponent (bias 42)
|
||||||
|
//! * 9 bits: luminance mantissa (implied leading 1, for 10 bits precision)
|
||||||
|
//! * 8 bits: u'
|
||||||
|
//! * 8 bits: v'
|
||||||
|
//!
|
||||||
|
//! ## Luminance details
|
||||||
|
//!
|
||||||
|
//! Quoting Greg Ward about luminance ranges:
|
||||||
|
//!
|
||||||
|
//! > The sun is about `10^8 cd/m^2`, and the underside of a rock on a moonless
|
||||||
|
//! > night is probably around `10^-6` or so [...]
|
||||||
|
//!
|
||||||
|
//! See also Wikipedia's
|
||||||
|
//! [list of luminance levels](https://en.wikipedia.org/wiki/Orders_of_magnitude_(luminance)).
|
||||||
|
//!
|
||||||
|
//! The luminance range of the original LogLuv is about `10^-19` to `10^19`,
|
||||||
|
//! splitting the range evenly above and below 1.0. Given the massive dynamic
|
||||||
|
//! range, and the fact that all day-to-day luminance levels trivially fit
|
||||||
|
//! within that, that's a perfectly reasonable choice.
|
||||||
|
//!
|
||||||
|
//! However, there are some stellar events like supernovae that are trillions
|
||||||
|
//! of times brighter than the sun, and would exceed `10^19`. Conversely,
|
||||||
|
//! there likely isn't much use for significantly smaller values than `10^-10`
|
||||||
|
//! or so. So although recording supernovae in physical units with a graphics
|
||||||
|
//! format seems unlikely, it doesn't hurt to bias the range towards brighter
|
||||||
|
//! luminance levels.
|
||||||
|
//!
|
||||||
|
//! With that in mind, FLuv32 uses an exponent bias of 42, putting twice as
|
||||||
|
//! many stops of dynamic range above 1.0 as below it, giving a luminance range
|
||||||
|
//! of roughly `10^-13` to `10^25`. It's the same dynamic range as
|
||||||
|
//! LogLuv (about 127 stops), but with more of that range placed above 1.0.
|
||||||
|
//!
|
||||||
|
//! Like typical floating point, the mantissa is treated as having an implicit
|
||||||
|
//! leading 1, giving it an extra bit of precision. The smallest exponent
|
||||||
|
//! indicates a value of zero, and a valid encoding should also set the
|
||||||
|
//! mantissa to zero in that case (denormal numbers are not supported). The
|
||||||
|
//! largest exponent is given no special treatment (no infinities, no NaN).
|
||||||
|
|
||||||
|
#![allow(clippy::cast_lossless)]
|
||||||
|
|
||||||
|
const EXP_BIAS: i32 = 42;
|
||||||
|
const BIAS_OFFSET: u32 = 127 - EXP_BIAS as u32;
|
||||||
|
|
||||||
|
/// The scale factor of the quantized U component.
|
||||||
|
pub const U_SCALE: f32 = 817.0 / 2.0;
|
||||||
|
|
||||||
|
/// The scale factor of the quantized V component.
|
||||||
|
pub const V_SCALE: f32 = 1235.0 / 3.0;
|
||||||
|
|
||||||
|
/// Largest representable Y component.
|
||||||
|
pub const Y_MAX: f32 = ((1u128 << (128 - EXP_BIAS)) - (1u128 << (128 - EXP_BIAS - 10))) as f32;
|
||||||
|
|
||||||
|
/// Smallest representable non-zero Y component.
|
||||||
|
pub const Y_MIN: f32 = 1.0 / (1u128 << (EXP_BIAS - 1)) as f32;
|
||||||
|
|
||||||
|
/// Difference between 1.0 and the next largest representable Y value.
|
||||||
|
pub const Y_EPSILON: f32 = 1.0 / 512.0;
|
||||||
|
|
||||||
|
/// Encodes from CIE XYZ to 32-bit FloatLuv.
|
||||||
|
#[inline]
|
||||||
|
pub fn encode(xyz: (f32, f32, f32)) -> u32 {
|
||||||
|
debug_assert!(
|
||||||
|
xyz.0 >= 0.0
|
||||||
|
&& xyz.1 >= 0.0
|
||||||
|
&& xyz.2 >= 0.0
|
||||||
|
&& !xyz.0.is_nan()
|
||||||
|
&& !xyz.1.is_nan()
|
||||||
|
&& !xyz.2.is_nan(),
|
||||||
|
"trifloat::fluv32::encode(): encoding to fluv32 only \
|
||||||
|
works correctly for positive, non-NaN numbers, but the numbers passed \
|
||||||
|
were: ({}, {}, {})",
|
||||||
|
xyz.0,
|
||||||
|
xyz.1,
|
||||||
|
xyz.2
|
||||||
|
);
|
||||||
|
|
||||||
|
// Calculates the 16-bit encoding of the UV values for the given XYZ input.
|
||||||
|
#[inline(always)]
|
||||||
|
fn encode_uv(xyz: (f32, f32, f32)) -> u32 {
|
||||||
|
let s = xyz.0 + (15.0 * xyz.1) + (3.0 * xyz.2);
|
||||||
|
|
||||||
|
// The `+ 0.5` is for rounding, and is not part of the normal equation.
|
||||||
|
// The minimum value of 1.0 for v is to avoid a possible divide by zero
|
||||||
|
// when decoding. A value less than 1.0 is outside the real colors,
|
||||||
|
// so we don't need to store it anyway.
|
||||||
|
let u = (((4.0 * U_SCALE) * xyz.0 / s) + 0.5).max(0.0).min(255.0);
|
||||||
|
let v = (((9.0 * V_SCALE) * xyz.1 / s) + 0.5).max(1.0).min(255.0);
|
||||||
|
|
||||||
|
((u as u32) << 8) | (v as u32)
|
||||||
|
};
|
||||||
|
|
||||||
|
let y_bits = xyz.1.to_bits() & 0x7fffffff;
|
||||||
|
|
||||||
|
if y_bits < ((BIAS_OFFSET + 1) << 23) {
|
||||||
|
// Special case: black.
|
||||||
|
encode_uv((1.0, 1.0, 1.0))
|
||||||
|
} else if y_bits >= ((BIAS_OFFSET + 128) << 23) {
|
||||||
|
if xyz.1.is_infinite() {
|
||||||
|
// Special case: infinity. In this case, we don't have any
|
||||||
|
// reasonable basis for calculating chroma, so just return
|
||||||
|
// the brightest white.
|
||||||
|
0xffff0000 | encode_uv((1.0, 1.0, 1.0))
|
||||||
|
} else {
|
||||||
|
// Special case: non-infinite, but brighter luma than can be
|
||||||
|
// represented. Return the correct chroma, and the brightest luma.
|
||||||
|
0xffff0000 | encode_uv(xyz)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Common case.
|
||||||
|
(((y_bits - (BIAS_OFFSET << 23)) << 2) & 0xffff0000) | encode_uv(xyz)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Decodes from 32-bit FloatLuv to CIE XYZ.
|
||||||
|
#[inline]
|
||||||
|
pub fn decode(fluv32: u32) -> (f32, f32, f32) {
|
||||||
|
// Unpack values.
|
||||||
|
let (y, u, v) = decode_yuv(fluv32);
|
||||||
|
let u = u as f32;
|
||||||
|
let v = v as f32;
|
||||||
|
|
||||||
|
// Calculate x and z.
|
||||||
|
// This is re-worked from the original equations, to allow a bunch of stuff
|
||||||
|
// to cancel out and avoid operations. It makes the underlying equations a
|
||||||
|
// bit non-obvious.
|
||||||
|
// We also roll the U/V_SCALE application into the final x and z formulas,
|
||||||
|
// since some of that cancels out as well, and all of it can be avoided at
|
||||||
|
// runtime that way.
|
||||||
|
const VU_RATIO: f32 = V_SCALE / U_SCALE;
|
||||||
|
let tmp = y / v;
|
||||||
|
let x = tmp * ((2.25 * VU_RATIO) * u); // y * (9u / 4v)
|
||||||
|
let z = tmp * ((3.0 * V_SCALE) - ((0.75 * VU_RATIO) * u) - (5.0 * v)); // y * ((12 - 3u - 20v) / 4v)
|
||||||
|
|
||||||
|
(x, y, z.max(0.0))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Decodes from 32-bit FloatLuv to Yuv.
|
||||||
|
///
|
||||||
|
/// The Y component is the luminance, and is simply the Y from CIE XYZ.
|
||||||
|
///
|
||||||
|
/// The u and v components are the CIE LUV u' and v' chromaticity coordinates,
|
||||||
|
/// but returned as `u8`s, and scaled by `U_SCALE` and `V_SCALE` respectively
|
||||||
|
/// to fit the range 0-255.
|
||||||
|
#[inline]
|
||||||
|
pub fn decode_yuv(fluv32: u32) -> (f32, u8, u8) {
|
||||||
|
let y = f32::from_bits(((fluv32 & 0xffff0000) >> 2) + (BIAS_OFFSET << 23));
|
||||||
|
let u = (fluv32 >> 8) as u8;
|
||||||
|
let v = fluv32 as u8;
|
||||||
|
|
||||||
|
if fluv32 <= 0xffff {
|
||||||
|
(0.0, u, v)
|
||||||
|
} else {
|
||||||
|
(y, u, v)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
use super::*;
|
||||||
|
|
||||||
|
fn round_trip(floats: (f32, f32, f32)) -> (f32, f32, f32) {
|
||||||
|
decode(encode(floats))
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn all_zeros() {
|
||||||
|
let fs = (0.0f32, 0.0f32, 0.0f32);
|
||||||
|
|
||||||
|
let tri = encode(fs);
|
||||||
|
let fs2 = decode(tri);
|
||||||
|
|
||||||
|
assert_eq!(0x000056c3, tri);
|
||||||
|
assert_eq!(fs, fs2);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn all_ones() {
|
||||||
|
let fs = (1.0f32, 1.0f32, 1.0f32);
|
||||||
|
|
||||||
|
let tri = encode(fs);
|
||||||
|
let fs2 = decode(tri);
|
||||||
|
|
||||||
|
assert_eq!(fs.1, fs2.1);
|
||||||
|
assert!((fs.0 - fs2.0).abs() < 0.0000001);
|
||||||
|
assert!((fs.2 - fs2.2).abs() < 0.0000001);
|
||||||
|
assert_eq!(0x540056c3, tri);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn powers_of_two() {
|
||||||
|
let mut n = 0.25;
|
||||||
|
for _ in 0..20 {
|
||||||
|
let a = (n as f32, n as f32, n as f32);
|
||||||
|
let b = round_trip(a);
|
||||||
|
|
||||||
|
let rd0 = 2.0 * (a.0 - b.0).abs() / (a.0 + b.0);
|
||||||
|
let rd2 = 2.0 * (a.2 - b.2).abs() / (a.2 + b.2);
|
||||||
|
|
||||||
|
assert_eq!(a.1, b.1);
|
||||||
|
assert!(rd0 < 0.01);
|
||||||
|
assert!(rd2 < 0.01);
|
||||||
|
|
||||||
|
n *= 2.0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn accuracy_01() {
|
||||||
|
let mut n = 1.0;
|
||||||
|
for _ in 0..512 {
|
||||||
|
let a = (n as f32, n as f32, n as f32);
|
||||||
|
let b = round_trip(a);
|
||||||
|
|
||||||
|
let rd0 = 2.0 * (a.0 - b.0).abs() / (a.0 + b.0);
|
||||||
|
let rd2 = 2.0 * (a.2 - b.2).abs() / (a.2 + b.2);
|
||||||
|
|
||||||
|
assert_eq!(a.1, b.1);
|
||||||
|
assert!(rd0 < 0.01);
|
||||||
|
assert!(rd2 < 0.01);
|
||||||
|
|
||||||
|
n += 1.0 / 512.0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
#[should_panic]
|
||||||
|
fn accuracy_02() {
|
||||||
|
let mut n = 1.0;
|
||||||
|
for _ in 0..1024 {
|
||||||
|
let a = (n as f32, n as f32, n as f32);
|
||||||
|
let b = round_trip(a);
|
||||||
|
assert_eq!(a.1, b.1);
|
||||||
|
n += 1.0 / 1024.0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn integers() {
|
||||||
|
for n in 1..=512 {
|
||||||
|
let a = (n as f32, n as f32, n as f32);
|
||||||
|
let b = round_trip(a);
|
||||||
|
|
||||||
|
let rd0 = 2.0 * (a.0 - b.0).abs() / (a.0 + b.0);
|
||||||
|
let rd2 = 2.0 * (a.2 - b.2).abs() / (a.2 + b.2);
|
||||||
|
|
||||||
|
assert_eq!(a.1, b.1);
|
||||||
|
assert!(rd0 < 0.01);
|
||||||
|
assert!(rd2 < 0.01);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn precision_floor() {
|
||||||
|
let fs = (2049.0f32, 2049.0f32, 2049.0f32);
|
||||||
|
assert_eq!(2048.0, round_trip(fs).1);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn decode_yuv_01() {
|
||||||
|
let fs = (1.0, 1.0, 1.0);
|
||||||
|
let a = encode(fs);
|
||||||
|
|
||||||
|
assert_eq!((1.0, 0x56, 0xc3), decode_yuv(a));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn saturate_y() {
|
||||||
|
let fs = (1.0e+28, 1.0e+28, 1.0e+28);
|
||||||
|
|
||||||
|
assert_eq!(Y_MAX, round_trip(fs).1);
|
||||||
|
assert_eq!(Y_MAX, decode(0xFFFFFFFF).1);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn inf_saturate() {
|
||||||
|
use std::f32::INFINITY;
|
||||||
|
let fs = (INFINITY, INFINITY, INFINITY);
|
||||||
|
|
||||||
|
assert_eq!(Y_MAX, round_trip(fs).1);
|
||||||
|
assert_eq!(0xffff56c3, encode(fs));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn smallest_value_and_underflow() {
|
||||||
|
let fs1 = (Y_MIN, Y_MIN, Y_MIN);
|
||||||
|
let fs2 = (Y_MIN * 0.99, Y_MIN * 0.99, Y_MIN * 0.99);
|
||||||
|
|
||||||
|
dbg!(Y_MIN);
|
||||||
|
assert_eq!(fs1.1, round_trip(fs1).1);
|
||||||
|
assert_eq!(0.0, round_trip(fs2).1);
|
||||||
|
assert_eq!(0x000056c3, encode(fs2));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn negative_z_impossible() {
|
||||||
|
for y in 0..1024 {
|
||||||
|
let fs = (1.0, 1.0 + (y as f32 / 4096.0), 0.0);
|
||||||
|
let fs2 = round_trip(fs);
|
||||||
|
assert!(fs2.2 >= 0.0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
#[should_panic]
|
||||||
|
fn nans_01() {
|
||||||
|
encode((std::f32::NAN, 0.0, 0.0));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
#[should_panic]
|
||||||
|
fn nans_02() {
|
||||||
|
encode((0.0, std::f32::NAN, 0.0));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
#[should_panic]
|
||||||
|
fn nans_03() {
|
||||||
|
encode((0.0, 0.0, std::f32::NAN));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
#[should_panic]
|
||||||
|
fn negative_01() {
|
||||||
|
encode((-1.0, 0.0, 0.0));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
#[should_panic]
|
||||||
|
fn negative_02() {
|
||||||
|
encode((0.0, -1.0, 0.0));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
#[should_panic]
|
||||||
|
fn negative_03() {
|
||||||
|
encode((0.0, 0.0, -1.0));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn negative_04() {
|
||||||
|
encode((-0.0, -0.0, -0.0));
|
||||||
|
}
|
||||||
|
}
|
|
@ -4,8 +4,10 @@
|
||||||
//! The motivating use-case for this is compactly storing HDR RGB colors. But
|
//! The motivating use-case for this is compactly storing HDR RGB colors. But
|
||||||
//! it may be useful for other things as well.
|
//! it may be useful for other things as well.
|
||||||
|
|
||||||
|
pub mod fluv32;
|
||||||
pub mod signed48;
|
pub mod signed48;
|
||||||
pub mod unsigned32;
|
pub mod unsigned32;
|
||||||
|
pub mod unsigned40;
|
||||||
|
|
||||||
//===========================================================================
|
//===========================================================================
|
||||||
// Some shared functions used by the other modules in this crate.
|
// Some shared functions used by the other modules in this crate.
|
||||||
|
|
|
@ -3,9 +3,9 @@
|
||||||
//! The encoding uses 13 bits of mantissa and 1 sign bit per number, and 6
|
//! The encoding uses 13 bits of mantissa and 1 sign bit per number, and 6
|
||||||
//! bits for the shared exponent. The bit layout is: [sign 1, mantissa 1,
|
//! bits for the shared exponent. The bit layout is: [sign 1, mantissa 1,
|
||||||
//! sign 2, mantissa 2, sign 3, mantissa 3, exponent]. The exponent is stored
|
//! sign 2, mantissa 2, sign 3, mantissa 3, exponent]. The exponent is stored
|
||||||
//! as an unsigned integer with a bias of 25.
|
//! as an unsigned integer with a bias of 26.
|
||||||
//!
|
//!
|
||||||
//! The largest representable number is `2^38 - 2^25`, and the smallest
|
//! The largest representable number is just under `2^38`, and the smallest
|
||||||
//! representable positive number is `2^-38`.
|
//! representable positive number is `2^-38`.
|
||||||
//!
|
//!
|
||||||
//! Since the exponent is shared between all three values, the precision
|
//! Since the exponent is shared between all three values, the precision
|
||||||
|
@ -18,40 +18,48 @@
|
||||||
use crate::{fiddle_exp2, fiddle_log2};
|
use crate::{fiddle_exp2, fiddle_log2};
|
||||||
|
|
||||||
/// Largest representable number.
|
/// Largest representable number.
|
||||||
pub const MAX: f32 = 274_844_352_512.0;
|
pub const MAX: f32 = ((1u128 << (64 - EXP_BIAS)) - (1 << (64 - EXP_BIAS - 13))) as f32;
|
||||||
|
|
||||||
/// Smallest representable number.
|
/// Smallest representable number.
|
||||||
///
|
///
|
||||||
/// Note this is not the smallest _magnitude_ number. This is a negative
|
/// Note this is not the smallest _magnitude_ number. This is a negative
|
||||||
/// number of large magnitude.
|
/// number of large magnitude.
|
||||||
pub const MIN: f32 = -274_844_352_512.0;
|
pub const MIN: f32 = -MAX;
|
||||||
|
|
||||||
/// Smallest representable positive number.
|
/// Smallest representable positive number.
|
||||||
///
|
///
|
||||||
/// This is the number with the smallest possible magnitude (aside from zero).
|
/// This is the number with the smallest possible magnitude (aside from zero).
|
||||||
#[allow(clippy::excessive_precision)]
|
pub const MIN_POSITIVE: f32 = 1.0 / (1u128 << (EXP_BIAS + 12)) as f32;
|
||||||
pub const MIN_POSITIVE: f32 = 0.000_000_000_003_637_978_807_091_713;
|
|
||||||
|
|
||||||
/// Difference between 1.0 and the next largest representable number.
|
/// Difference between 1.0 and the next largest representable number.
|
||||||
pub const EPSILON: f32 = 1.0 / 4096.0;
|
pub const EPSILON: f32 = 1.0 / 4096.0;
|
||||||
|
|
||||||
const EXP_BIAS: i32 = 25;
|
const EXP_BIAS: i32 = 26;
|
||||||
const MIN_EXP: i32 = 0 - EXP_BIAS;
|
|
||||||
const MAX_EXP: i32 = 63 - EXP_BIAS;
|
|
||||||
|
|
||||||
/// Encodes three floating point values into a signed 48-bit trifloat.
|
/// Encodes three floating point values into a signed 48-bit trifloat.
|
||||||
///
|
///
|
||||||
/// Input floats that are larger than `MAX` or smaller than `MIN` will saturate
|
/// Input floats that are larger than `MAX` or smaller than `MIN` will saturate
|
||||||
/// to `MAX` and `MIN` respectively, including +/- infinity. Values are
|
/// to `MAX` and `MIN` respectively, including +/- infinity. Values are
|
||||||
/// converted to trifloat precision by rounding.
|
/// converted to trifloat precision by rounding towards zero.
|
||||||
///
|
|
||||||
/// Only the lower 48 bits of the return value are used. The highest 16 bits
|
|
||||||
/// will all be zero and can be safely discarded.
|
|
||||||
///
|
///
|
||||||
/// Warning: NaN's are _not_ supported by the trifloat format. There are
|
/// Warning: NaN's are _not_ supported by the trifloat format. There are
|
||||||
/// debug-only assertions in place to catch such values in the input floats.
|
/// debug-only assertions in place to catch such values in the input floats.
|
||||||
#[inline]
|
#[inline]
|
||||||
pub fn encode(floats: (f32, f32, f32)) -> u64 {
|
pub fn encode(floats: (f32, f32, f32)) -> [u8; 6] {
|
||||||
|
u64_to_bytes(encode_64(floats))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Decodes a signed 48-bit trifloat into three full floating point numbers.
|
||||||
|
///
|
||||||
|
/// This operation is lossless and cannot fail.
|
||||||
|
#[inline]
|
||||||
|
pub fn decode(trifloat: [u8; 6]) -> (f32, f32, f32) {
|
||||||
|
decode_64(bytes_to_u64(trifloat))
|
||||||
|
}
|
||||||
|
|
||||||
|
// Workhorse encode function, which operates on u64.
|
||||||
|
#[inline(always)]
|
||||||
|
fn encode_64(floats: (f32, f32, f32)) -> u64 {
|
||||||
debug_assert!(
|
debug_assert!(
|
||||||
!floats.0.is_nan() && !floats.1.is_nan() && !floats.2.is_nan(),
|
!floats.0.is_nan() && !floats.1.is_nan() && !floats.2.is_nan(),
|
||||||
"trifloat::signed48::encode(): encoding to signed tri-floats only \
|
"trifloat::signed48::encode(): encoding to signed tri-floats only \
|
||||||
|
@ -62,55 +70,36 @@ pub fn encode(floats: (f32, f32, f32)) -> u64 {
|
||||||
floats.2
|
floats.2
|
||||||
);
|
);
|
||||||
|
|
||||||
// Find the largest (in magnitude) of the three values.
|
let floats_abs = (floats.0.abs(), floats.1.abs(), floats.2.abs());
|
||||||
let largest_value = {
|
|
||||||
let mut largest_value: f32 = 0.0;
|
|
||||||
if floats.0.abs() > largest_value.abs() {
|
|
||||||
largest_value = floats.0;
|
|
||||||
}
|
|
||||||
if floats.1.abs() > largest_value.abs() {
|
|
||||||
largest_value = floats.1;
|
|
||||||
}
|
|
||||||
if floats.2.abs() > largest_value.abs() {
|
|
||||||
largest_value = floats.2;
|
|
||||||
}
|
|
||||||
largest_value
|
|
||||||
};
|
|
||||||
|
|
||||||
// Calculate the exponent and 1.0/multiplier for encoding the values.
|
let largest_abs = floats_abs.0.max(floats_abs.1.max(floats_abs.2));
|
||||||
let (exponent, inv_multiplier) = {
|
|
||||||
let mut exponent = (fiddle_log2(largest_value) + 1).max(MIN_EXP).min(MAX_EXP);
|
|
||||||
let mut inv_multiplier = fiddle_exp2(-exponent + 13);
|
|
||||||
|
|
||||||
// Edge-case: make sure rounding pushes the largest value up
|
if largest_abs < MIN_POSITIVE {
|
||||||
// appropriately if needed.
|
0
|
||||||
if (largest_value * inv_multiplier).abs() + 0.5 >= 8192.0 {
|
} else {
|
||||||
exponent = (exponent + 1).min(MAX_EXP);
|
let e = fiddle_log2(largest_abs).max(-EXP_BIAS).min(63 - EXP_BIAS);
|
||||||
inv_multiplier = fiddle_exp2(-exponent + 13);
|
let inv_multiplier = fiddle_exp2(-e + 12);
|
||||||
}
|
|
||||||
(exponent, inv_multiplier)
|
|
||||||
};
|
|
||||||
|
|
||||||
// Quantize and encode values.
|
|
||||||
let x = (floats.0.abs() * inv_multiplier + 0.5).min(8191.0) as u64 & 0b111_11111_11111;
|
|
||||||
let x_sign = (floats.0.to_bits() >> 31) as u64;
|
let x_sign = (floats.0.to_bits() >> 31) as u64;
|
||||||
let y = (floats.1.abs() * inv_multiplier + 0.5).min(8191.0) as u64 & 0b111_11111_11111;
|
let x = (floats_abs.0 * inv_multiplier).min(8191.0) as u64;
|
||||||
let y_sign = (floats.1.to_bits() >> 31) as u64;
|
let y_sign = (floats.1.to_bits() >> 31) as u64;
|
||||||
let z = (floats.2.abs() * inv_multiplier + 0.5).min(8191.0) as u64 & 0b111_11111_11111;
|
let y = (floats_abs.1 * inv_multiplier).min(8191.0) as u64;
|
||||||
let z_sign = (floats.2.to_bits() >> 31) as u64;
|
let z_sign = (floats.2.to_bits() >> 31) as u64;
|
||||||
let e = (exponent + EXP_BIAS) as u64 & 0b111_111;
|
let z = (floats_abs.2 * inv_multiplier).min(8191.0) as u64;
|
||||||
|
|
||||||
// Pack values into a single u64 and return.
|
(x_sign << 47)
|
||||||
(x_sign << 47) | (x << 34) | (y_sign << 33) | (y << 20) | (z_sign << 19) | (z << 6) | e
|
| (x << 34)
|
||||||
|
| (y_sign << 33)
|
||||||
|
| (y << 20)
|
||||||
|
| (z_sign << 19)
|
||||||
|
| (z << 6)
|
||||||
|
| (e + EXP_BIAS) as u64
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Decodes a signed 48-bit trifloat into three full floating point numbers.
|
// Workhorse decode function, which operates on u64.
|
||||||
///
|
#[inline(always)]
|
||||||
/// This operation is lossless and cannot fail. Only the lower 48 bits of the
|
fn decode_64(trifloat: u64) -> (f32, f32, f32) {
|
||||||
/// input value are used--the upper 16 bits can safely be anything and are
|
|
||||||
/// ignored.
|
|
||||||
#[inline]
|
|
||||||
pub fn decode(trifloat: u64) -> (f32, f32, f32) {
|
|
||||||
// Unpack values.
|
// Unpack values.
|
||||||
let x = (trifloat >> 34) & 0b111_11111_11111;
|
let x = (trifloat >> 34) & 0b111_11111_11111;
|
||||||
let y = (trifloat >> 20) & 0b111_11111_11111;
|
let y = (trifloat >> 20) & 0b111_11111_11111;
|
||||||
|
@ -122,7 +111,7 @@ pub fn decode(trifloat: u64) -> (f32, f32, f32) {
|
||||||
|
|
||||||
let e = trifloat & 0b111_111;
|
let e = trifloat & 0b111_111;
|
||||||
|
|
||||||
let multiplier = fiddle_exp2(e as i32 - EXP_BIAS - 13);
|
let multiplier = fiddle_exp2(e as i32 - EXP_BIAS - 12);
|
||||||
|
|
||||||
(
|
(
|
||||||
f32::from_bits((x as f32 * multiplier).to_bits() | x_sign),
|
f32::from_bits((x as f32 * multiplier).to_bits() | x_sign),
|
||||||
|
@ -131,6 +120,29 @@ pub fn decode(trifloat: u64) -> (f32, f32, f32) {
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[inline(always)]
|
||||||
|
fn u64_to_bytes(n: u64) -> [u8; 6] {
|
||||||
|
let a = n.to_ne_bytes();
|
||||||
|
let mut b = [0u8; 6];
|
||||||
|
if cfg!(target_endian = "big") {
|
||||||
|
(&mut b[..]).copy_from_slice(&a[2..8]);
|
||||||
|
} else {
|
||||||
|
(&mut b[..]).copy_from_slice(&a[0..6]);
|
||||||
|
}
|
||||||
|
b
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline(always)]
|
||||||
|
fn bytes_to_u64(a: [u8; 6]) -> u64 {
|
||||||
|
let mut b = [0u8; 8];
|
||||||
|
if cfg!(target_endian = "big") {
|
||||||
|
(&mut b[2..8]).copy_from_slice(&a[..]);
|
||||||
|
} else {
|
||||||
|
(&mut b[0..6]).copy_from_slice(&a[..]);
|
||||||
|
}
|
||||||
|
u64::from_ne_bytes(b)
|
||||||
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use super::*;
|
use super::*;
|
||||||
|
@ -143,8 +155,8 @@ mod tests {
|
||||||
fn all_zeros() {
|
fn all_zeros() {
|
||||||
let fs = (0.0f32, 0.0f32, 0.0f32);
|
let fs = (0.0f32, 0.0f32, 0.0f32);
|
||||||
|
|
||||||
let tri = encode(fs);
|
let tri = encode_64(fs);
|
||||||
let fs2 = decode(tri);
|
let fs2 = decode_64(tri);
|
||||||
|
|
||||||
assert_eq!(tri, 0);
|
assert_eq!(tri, 0);
|
||||||
assert_eq!(fs, fs2);
|
assert_eq!(fs, fs2);
|
||||||
|
@ -153,7 +165,7 @@ mod tests {
|
||||||
#[test]
|
#[test]
|
||||||
fn powers_of_two() {
|
fn powers_of_two() {
|
||||||
let fs = (8.0f32, 128.0f32, 0.5f32);
|
let fs = (8.0f32, 128.0f32, 0.5f32);
|
||||||
assert_eq!(round_trip(fs), fs);
|
assert_eq!(fs, round_trip(fs));
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
@ -196,18 +208,11 @@ mod tests {
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn rounding() {
|
fn precision_floor() {
|
||||||
let fs = (7.0f32, 8193.0f32, -1.0f32);
|
let fs = (7.0f32, 8193.0f32, -1.0f32);
|
||||||
let fsn = (-7.0f32, -8193.0f32, 1.0f32);
|
let fsn = (-7.0f32, -8193.0f32, 1.0f32);
|
||||||
assert_eq!(round_trip(fs), (8.0, 8194.0, -2.0));
|
assert_eq!((6.0, 8192.0, -0.0), round_trip(fs));
|
||||||
assert_eq!(round_trip(fsn), (-8.0, -8194.0, 2.0));
|
assert_eq!((-6.0, -8192.0, 0.0), round_trip(fsn));
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn rounding_edge_case() {
|
|
||||||
let fs = (16383.0f32, 0.0f32, 0.0f32);
|
|
||||||
|
|
||||||
assert_eq!(round_trip(fs), (16384.0, 0.0, 0.0),);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
@ -223,10 +228,10 @@ mod tests {
|
||||||
-99_999_999_999_999.0,
|
-99_999_999_999_999.0,
|
||||||
);
|
);
|
||||||
|
|
||||||
assert_eq!(round_trip(fs), (MAX, MAX, MAX));
|
assert_eq!((MAX, MAX, MAX), round_trip(fs));
|
||||||
assert_eq!(round_trip(fsn), (MIN, MIN, MIN));
|
assert_eq!((MIN, MIN, MIN), round_trip(fsn));
|
||||||
assert_eq!(decode(0x7FFD_FFF7_FFFF), (MAX, MAX, MAX));
|
assert_eq!((MAX, MAX, MAX), decode_64(0x7FFD_FFF7_FFFF));
|
||||||
assert_eq!(decode(0xFFFF_FFFF_FFFF), (MIN, MIN, MIN));
|
assert_eq!((MIN, MIN, MIN), decode_64(0xFFFF_FFFF_FFFF));
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
@ -235,10 +240,10 @@ mod tests {
|
||||||
let fs = (INFINITY, 0.0, 0.0);
|
let fs = (INFINITY, 0.0, 0.0);
|
||||||
let fsn = (-INFINITY, 0.0, 0.0);
|
let fsn = (-INFINITY, 0.0, 0.0);
|
||||||
|
|
||||||
assert_eq!(round_trip(fs), (MAX, 0.0, 0.0));
|
assert_eq!((MAX, 0.0, 0.0), round_trip(fs));
|
||||||
assert_eq!(round_trip(fsn), (MIN, 0.0, 0.0));
|
assert_eq!((MIN, 0.0, 0.0), round_trip(fsn));
|
||||||
assert_eq!(encode(fs), 0x7FFC0000003F);
|
assert_eq!(0x7FFC0000003F, encode_64(fs));
|
||||||
assert_eq!(encode(fsn), 0xFFFC0000003F);
|
assert_eq!(0xFFFC0000003F, encode_64(fsn));
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
@ -246,25 +251,25 @@ mod tests {
|
||||||
let fs = (99_999_999_999_999.0, 4294967296.0, -17179869184.0);
|
let fs = (99_999_999_999_999.0, 4294967296.0, -17179869184.0);
|
||||||
let fsn = (-99_999_999_999_999.0, 4294967296.0, -17179869184.0);
|
let fsn = (-99_999_999_999_999.0, 4294967296.0, -17179869184.0);
|
||||||
|
|
||||||
assert_eq!(round_trip(fs), (MAX, 4294967296.0, -17179869184.0));
|
assert_eq!((MAX, 4294967296.0, -17179869184.0), round_trip(fs));
|
||||||
assert_eq!(round_trip(fsn), (MIN, 4294967296.0, -17179869184.0));
|
assert_eq!((MIN, 4294967296.0, -17179869184.0), round_trip(fsn));
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn smallest_value() {
|
fn smallest_value() {
|
||||||
let fs = (MIN_POSITIVE, MIN_POSITIVE * 0.5, MIN_POSITIVE * 0.49);
|
let fs = (MIN_POSITIVE * 1.5, MIN_POSITIVE, MIN_POSITIVE * 0.50);
|
||||||
let fsn = (-MIN_POSITIVE, -MIN_POSITIVE * 0.5, -MIN_POSITIVE * 0.49);
|
let fsn = (-MIN_POSITIVE * 1.5, -MIN_POSITIVE, -MIN_POSITIVE * 0.50);
|
||||||
|
|
||||||
assert_eq!(decode(0x600100000), (MIN_POSITIVE, -MIN_POSITIVE, 0.0));
|
assert_eq!((MIN_POSITIVE, -MIN_POSITIVE, 0.0), decode_64(0x600100000));
|
||||||
assert_eq!(round_trip(fs), (MIN_POSITIVE, MIN_POSITIVE, 0.0));
|
assert_eq!((MIN_POSITIVE, MIN_POSITIVE, 0.0), round_trip(fs));
|
||||||
assert_eq!(round_trip(fsn), (-MIN_POSITIVE, -MIN_POSITIVE, -0.0));
|
assert_eq!((-MIN_POSITIVE, -MIN_POSITIVE, -0.0), round_trip(fsn));
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn underflow() {
|
fn underflow() {
|
||||||
let fs = (MIN_POSITIVE * 0.49, -MIN_POSITIVE * 0.49, 0.0);
|
let fs = (MIN_POSITIVE * 0.5, -MIN_POSITIVE * 0.5, MIN_POSITIVE);
|
||||||
assert_eq!(encode(fs), 0x200000000);
|
assert_eq!(0x200000040, encode_64(fs));
|
||||||
assert_eq!(round_trip(fs), (0.0, -0.0, 0.0));
|
assert_eq!((0.0, -0.0, MIN_POSITIVE), round_trip(fs));
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
@ -273,13 +278,13 @@ mod tests {
|
||||||
let fs2 = (-63456254.2, 5235423.53, 54353.3);
|
let fs2 = (-63456254.2, 5235423.53, 54353.3);
|
||||||
let fs3 = (-0.000000634, 0.00000000005, 0.00000000892);
|
let fs3 = (-0.000000634, 0.00000000005, 0.00000000892);
|
||||||
|
|
||||||
let n1 = encode(fs1);
|
let n1 = encode_64(fs1);
|
||||||
let n2 = encode(fs2);
|
let n2 = encode_64(fs2);
|
||||||
let n3 = encode(fs3);
|
let n3 = encode_64(fs3);
|
||||||
|
|
||||||
assert_eq!(decode(n1), decode(n1 | 0xffff_0000_0000_0000));
|
assert_eq!(decode_64(n1), decode_64(n1 | 0xffff_0000_0000_0000));
|
||||||
assert_eq!(decode(n2), decode(n2 | 0xffff_0000_0000_0000));
|
assert_eq!(decode_64(n2), decode_64(n2 | 0xffff_0000_0000_0000));
|
||||||
assert_eq!(decode(n3), decode(n3 | 0xffff_0000_0000_0000));
|
assert_eq!(decode_64(n3), decode_64(n3 | 0xffff_0000_0000_0000));
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
|
|
@ -2,7 +2,7 @@
|
||||||
//!
|
//!
|
||||||
//! The encoding uses 9 bits of mantissa per number, and 5 bits for the shared
|
//! The encoding uses 9 bits of mantissa per number, and 5 bits for the shared
|
||||||
//! exponent. The bit layout is [mantissa 1, mantissa 2, mantissa 3, exponent].
|
//! exponent. The bit layout is [mantissa 1, mantissa 2, mantissa 3, exponent].
|
||||||
//! The exponent is stored as an unsigned integer with a bias of 10.
|
//! The exponent is stored as an unsigned integer with a bias of 11.
|
||||||
//!
|
//!
|
||||||
//! The largest representable number is `2^21 - 4096`, and the smallest
|
//! The largest representable number is `2^21 - 4096`, and the smallest
|
||||||
//! representable non-zero number is `2^-19`.
|
//! representable non-zero number is `2^-19`.
|
||||||
|
@ -14,22 +14,20 @@
|
||||||
use crate::{fiddle_exp2, fiddle_log2};
|
use crate::{fiddle_exp2, fiddle_log2};
|
||||||
|
|
||||||
/// Largest representable number.
|
/// Largest representable number.
|
||||||
pub const MAX: f32 = 2_093_056.0;
|
pub const MAX: f32 = ((1u64 << (32 - EXP_BIAS)) - (1 << (32 - EXP_BIAS - 9))) as f32;
|
||||||
|
|
||||||
/// Smallest representable non-zero number.
|
/// Smallest representable non-zero number.
|
||||||
pub const MIN: f32 = 0.000_001_907_348_6;
|
pub const MIN: f32 = 1.0 / (1 << (EXP_BIAS + 8)) as f32;
|
||||||
|
|
||||||
/// Difference between 1.0 and the next largest representable number.
|
/// Difference between 1.0 and the next largest representable number.
|
||||||
pub const EPSILON: f32 = 1.0 / 256.0;
|
pub const EPSILON: f32 = 1.0 / 256.0;
|
||||||
|
|
||||||
const EXP_BIAS: i32 = 10;
|
const EXP_BIAS: i32 = 11;
|
||||||
const MIN_EXP: i32 = 0 - EXP_BIAS;
|
|
||||||
const MAX_EXP: i32 = 31 - EXP_BIAS;
|
|
||||||
|
|
||||||
/// Encodes three floating point values into a signed 32-bit trifloat.
|
/// Encodes three floating point values into an unsigned 32-bit trifloat.
|
||||||
///
|
///
|
||||||
/// Input floats larger than `MAX` will saturate to `MAX`, including infinity.
|
/// Input floats larger than `MAX` will saturate to `MAX`, including infinity.
|
||||||
/// Values are converted to trifloat precision by rounding.
|
/// Values are converted to trifloat precision by rounding down.
|
||||||
///
|
///
|
||||||
/// Warning: negative values and NaN's are _not_ supported by the trifloat
|
/// Warning: negative values and NaN's are _not_ supported by the trifloat
|
||||||
/// format. There are debug-only assertions in place to catch such
|
/// format. There are debug-only assertions in place to catch such
|
||||||
|
@ -51,31 +49,19 @@ pub fn encode(floats: (f32, f32, f32)) -> u32 {
|
||||||
floats.2
|
floats.2
|
||||||
);
|
);
|
||||||
|
|
||||||
// Find the largest of the three values.
|
let largest = floats.0.max(floats.1.max(floats.2));
|
||||||
let largest_value = floats.0.max(floats.1.max(floats.2));
|
|
||||||
if largest_value <= 0.0 {
|
if largest < MIN {
|
||||||
return 0;
|
return 0;
|
||||||
|
} else {
|
||||||
|
let e = fiddle_log2(largest).max(-EXP_BIAS).min(31 - EXP_BIAS);
|
||||||
|
let inv_multiplier = fiddle_exp2(-e + 8);
|
||||||
|
let x = (floats.0 * inv_multiplier).min(511.0) as u32;
|
||||||
|
let y = (floats.1 * inv_multiplier).min(511.0) as u32;
|
||||||
|
let z = (floats.2 * inv_multiplier).min(511.0) as u32;
|
||||||
|
|
||||||
|
(x << (9 + 9 + 5)) | (y << (9 + 5)) | (z << 5) | (e + EXP_BIAS) as u32
|
||||||
}
|
}
|
||||||
|
|
||||||
// Calculate the exponent and 1.0/multiplier for encoding the values.
|
|
||||||
let mut exponent = (fiddle_log2(largest_value) + 1).max(MIN_EXP).min(MAX_EXP);
|
|
||||||
let mut inv_multiplier = fiddle_exp2(-exponent + 9);
|
|
||||||
|
|
||||||
// Edge-case: make sure rounding pushes the largest value up
|
|
||||||
// appropriately if needed.
|
|
||||||
if (largest_value * inv_multiplier) + 0.5 >= 512.0 {
|
|
||||||
exponent = (exponent + 1).min(MAX_EXP);
|
|
||||||
inv_multiplier = fiddle_exp2(-exponent + 9);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Quantize and encode values.
|
|
||||||
let x = (floats.0 * inv_multiplier + 0.5).min(511.0) as u32 & 0b1_1111_1111;
|
|
||||||
let y = (floats.1 * inv_multiplier + 0.5).min(511.0) as u32 & 0b1_1111_1111;
|
|
||||||
let z = (floats.2 * inv_multiplier + 0.5).min(511.0) as u32 & 0b1_1111_1111;
|
|
||||||
let e = (exponent + EXP_BIAS) as u32 & 0b1_1111;
|
|
||||||
|
|
||||||
// Pack values into a u32.
|
|
||||||
(x << (5 + 9 + 9)) | (y << (5 + 9)) | (z << 5) | e
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Decodes an unsigned 32-bit trifloat into three full floating point numbers.
|
/// Decodes an unsigned 32-bit trifloat into three full floating point numbers.
|
||||||
|
@ -84,12 +70,12 @@ pub fn encode(floats: (f32, f32, f32)) -> u32 {
|
||||||
#[inline]
|
#[inline]
|
||||||
pub fn decode(trifloat: u32) -> (f32, f32, f32) {
|
pub fn decode(trifloat: u32) -> (f32, f32, f32) {
|
||||||
// Unpack values.
|
// Unpack values.
|
||||||
let x = trifloat >> (5 + 9 + 9);
|
let x = trifloat >> (9 + 9 + 5);
|
||||||
let y = (trifloat >> (5 + 9)) & 0b1_1111_1111;
|
let y = (trifloat >> (9 + 5)) & 0b1_1111_1111;
|
||||||
let z = (trifloat >> 5) & 0b1_1111_1111;
|
let z = (trifloat >> 5) & 0b1_1111_1111;
|
||||||
let e = trifloat & 0b1_1111;
|
let e = trifloat & 0b1_1111;
|
||||||
|
|
||||||
let multiplier = fiddle_exp2(e as i32 - EXP_BIAS - 9);
|
let multiplier = fiddle_exp2(e as i32 - EXP_BIAS - 8);
|
||||||
|
|
||||||
(
|
(
|
||||||
x as f32 * multiplier,
|
x as f32 * multiplier,
|
||||||
|
@ -120,11 +106,11 @@ mod tests {
|
||||||
#[test]
|
#[test]
|
||||||
fn powers_of_two() {
|
fn powers_of_two() {
|
||||||
let fs = (8.0f32, 128.0f32, 0.5f32);
|
let fs = (8.0f32, 128.0f32, 0.5f32);
|
||||||
assert_eq!(round_trip(fs), fs);
|
assert_eq!(fs, round_trip(fs));
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn accuracy() {
|
fn accuracy_01() {
|
||||||
let mut n = 1.0;
|
let mut n = 1.0;
|
||||||
for _ in 0..256 {
|
for _ in 0..256 {
|
||||||
let (x, _, _) = round_trip((n, 0.0, 0.0));
|
let (x, _, _) = round_trip((n, 0.0, 0.0));
|
||||||
|
@ -133,6 +119,17 @@ mod tests {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
#[should_panic]
|
||||||
|
fn accuracy_02() {
|
||||||
|
let mut n = 1.0;
|
||||||
|
for _ in 0..512 {
|
||||||
|
let (x, _, _) = round_trip((n, 0.0, 0.0));
|
||||||
|
assert_eq!(n, x);
|
||||||
|
n += 1.0 / 512.0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn integers() {
|
fn integers() {
|
||||||
for n in 0..=512 {
|
for n in 0..=512 {
|
||||||
|
@ -142,24 +139,17 @@ mod tests {
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn rounding() {
|
fn precision_floor() {
|
||||||
let fs = (7.0f32, 513.0f32, 1.0f32);
|
let fs = (7.0f32, 513.0f32, 1.0f32);
|
||||||
assert_eq!(round_trip(fs), (8.0, 514.0, 2.0));
|
assert_eq!((6.0, 512.0, 0.0), round_trip(fs));
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn rounding_edge_case() {
|
|
||||||
let fs = (1023.0f32, 0.0f32, 0.0f32);
|
|
||||||
|
|
||||||
assert_eq!(round_trip(fs), (1024.0, 0.0, 0.0),);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn saturate() {
|
fn saturate() {
|
||||||
let fs = (9999999999.0, 9999999999.0, 9999999999.0);
|
let fs = (9999999999.0, 9999999999.0, 9999999999.0);
|
||||||
|
|
||||||
assert_eq!(round_trip(fs), (MAX, MAX, MAX));
|
assert_eq!((MAX, MAX, MAX), round_trip(fs));
|
||||||
assert_eq!(decode(0xFFFFFFFF), (MAX, MAX, MAX),);
|
assert_eq!((MAX, MAX, MAX), decode(0xFFFFFFFF));
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
@ -167,29 +157,29 @@ mod tests {
|
||||||
use std::f32::INFINITY;
|
use std::f32::INFINITY;
|
||||||
let fs = (INFINITY, 0.0, 0.0);
|
let fs = (INFINITY, 0.0, 0.0);
|
||||||
|
|
||||||
assert_eq!(round_trip(fs), (MAX, 0.0, 0.0));
|
assert_eq!((MAX, 0.0, 0.0), round_trip(fs));
|
||||||
assert_eq!(encode(fs), 0xFF80001F,);
|
assert_eq!(0xFF80001F, encode(fs));
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn partial_saturate() {
|
fn partial_saturate() {
|
||||||
let fs = (9999999999.0, 4096.0, 262144.0);
|
let fs = (9999999999.0, 4096.0, 262144.0);
|
||||||
|
|
||||||
assert_eq!(round_trip(fs), (MAX, 4096.0, 262144.0));
|
assert_eq!((MAX, 4096.0, 262144.0), round_trip(fs));
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn smallest_value() {
|
fn smallest_value() {
|
||||||
let fs = (MIN, MIN * 0.5, MIN * 0.49);
|
let fs = (MIN * 1.5, MIN, MIN * 0.5);
|
||||||
assert_eq!(round_trip(fs), (MIN, MIN, 0.0));
|
assert_eq!((MIN, MIN, 0.0), round_trip(fs));
|
||||||
assert_eq!(decode(0x00_80_40_00), (MIN, MIN, 0.0));
|
assert_eq!((MIN, MIN, 0.0), decode(0x00_80_40_00));
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn underflow() {
|
fn underflow() {
|
||||||
let fs = (MIN * 0.49, 0.0, 0.0);
|
let fs = (MIN * 0.99, 0.0, 0.0);
|
||||||
assert_eq!(encode(fs), 0);
|
assert_eq!(0, encode(fs));
|
||||||
assert_eq!(round_trip(fs), (0.0, 0.0, 0.0));
|
assert_eq!((0.0, 0.0, 0.0), round_trip(fs));
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
|
267
sub_crates/trifloat/src/unsigned40.rs
Normal file
267
sub_crates/trifloat/src/unsigned40.rs
Normal file
|
@ -0,0 +1,267 @@
|
||||||
|
//! Encoding/decoding for unsigned 40-bit trifloat numbers.
|
||||||
|
//!
|
||||||
|
//! The encoding uses 11 bits of mantissa per number, and 7 bits for the shared
|
||||||
|
//! exponent. The bit layout is [mantissa 1, mantissa 2, mantissa 3, exponent].
|
||||||
|
//! The exponent is stored as an unsigned integer with a bias of 32.
|
||||||
|
//!
|
||||||
|
//! The largest representable number is just under `2^96`, and the smallest
|
||||||
|
//! representable non-zero number is `2^-42`.
|
||||||
|
//!
|
||||||
|
//! Since the exponent is shared between the three values, the precision
|
||||||
|
//! of all three values depends on the largest of the three. All integers
|
||||||
|
//! up to 2048 can be represented exactly in the largest value.
|
||||||
|
|
||||||
|
use crate::{fiddle_exp2, fiddle_log2};
|
||||||
|
|
||||||
|
/// Largest representable number.
|
||||||
|
pub const MAX: f32 = ((1u128 << (128 - EXP_BIAS)) - (1 << (128 - EXP_BIAS - 11))) as f32;
|
||||||
|
|
||||||
|
/// Smallest representable non-zero number.
|
||||||
|
pub const MIN: f32 = 1.0 / (1u128 << (EXP_BIAS + 10)) as f32;
|
||||||
|
|
||||||
|
/// Difference between 1.0 and the next largest representable number.
|
||||||
|
pub const EPSILON: f32 = 1.0 / 1024.0;
|
||||||
|
|
||||||
|
const EXP_BIAS: i32 = 32;
|
||||||
|
|
||||||
|
/// Encodes three floating point values into an unsigned 40-bit trifloat.
|
||||||
|
///
|
||||||
|
/// Input floats larger than `MAX` will saturate to `MAX`, including infinity.
|
||||||
|
/// Values are converted to trifloat precision by rounding down.
|
||||||
|
///
|
||||||
|
/// Warning: negative values and NaN's are _not_ supported by the trifloat
|
||||||
|
/// format. There are debug-only assertions in place to catch such
|
||||||
|
/// values in the input floats.
|
||||||
|
#[inline]
|
||||||
|
pub fn encode(floats: (f32, f32, f32)) -> [u8; 5] {
|
||||||
|
u64_to_bytes(encode_64(floats))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Decodes an unsigned 40-bit trifloat into three full floating point numbers.
|
||||||
|
///
|
||||||
|
/// This operation is lossless and cannot fail.
|
||||||
|
#[inline]
|
||||||
|
pub fn decode(trifloat: [u8; 5]) -> (f32, f32, f32) {
|
||||||
|
decode_64(bytes_to_u64(trifloat))
|
||||||
|
}
|
||||||
|
|
||||||
|
// Workhorse encode function, which operates on u64.
|
||||||
|
#[inline(always)]
|
||||||
|
fn encode_64(floats: (f32, f32, f32)) -> u64 {
|
||||||
|
debug_assert!(
|
||||||
|
floats.0 >= 0.0
|
||||||
|
&& floats.1 >= 0.0
|
||||||
|
&& floats.2 >= 0.0
|
||||||
|
&& !floats.0.is_nan()
|
||||||
|
&& !floats.1.is_nan()
|
||||||
|
&& !floats.2.is_nan(),
|
||||||
|
"trifloat::unsigned32::encode(): encoding to unsigned tri-floats only \
|
||||||
|
works correctly for positive, non-NaN numbers, but the numbers passed \
|
||||||
|
were: ({}, {}, {})",
|
||||||
|
floats.0,
|
||||||
|
floats.1,
|
||||||
|
floats.2
|
||||||
|
);
|
||||||
|
|
||||||
|
let largest = floats.0.max(floats.1.max(floats.2));
|
||||||
|
|
||||||
|
if largest < MIN {
|
||||||
|
return 0;
|
||||||
|
} else {
|
||||||
|
let e = fiddle_log2(largest).max(-EXP_BIAS).min(127 - EXP_BIAS);
|
||||||
|
let inv_multiplier = fiddle_exp2(-e + 10);
|
||||||
|
let x = (floats.0 * inv_multiplier).min(2047.0) as u64;
|
||||||
|
let y = (floats.1 * inv_multiplier).min(2047.0) as u64;
|
||||||
|
let z = (floats.2 * inv_multiplier).min(2047.0) as u64;
|
||||||
|
|
||||||
|
(x << (11 + 11 + 7)) | (y << (11 + 7)) | (z << 7) | (e + EXP_BIAS) as u64
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Workhorse decode function, which operates on u64.
|
||||||
|
#[inline(always)]
|
||||||
|
fn decode_64(trifloat: u64) -> (f32, f32, f32) {
|
||||||
|
// Unpack values.
|
||||||
|
let x = trifloat >> (11 + 11 + 7);
|
||||||
|
let y = (trifloat >> (11 + 7)) & 0b111_1111_1111;
|
||||||
|
let z = (trifloat >> 7) & 0b111_1111_1111;
|
||||||
|
let e = trifloat & 0b111_1111;
|
||||||
|
|
||||||
|
let multiplier = fiddle_exp2(e as i32 - EXP_BIAS - 10);
|
||||||
|
|
||||||
|
(
|
||||||
|
x as f32 * multiplier,
|
||||||
|
y as f32 * multiplier,
|
||||||
|
z as f32 * multiplier,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline(always)]
|
||||||
|
fn u64_to_bytes(n: u64) -> [u8; 5] {
|
||||||
|
let a = n.to_ne_bytes();
|
||||||
|
let mut b = [0u8; 5];
|
||||||
|
if cfg!(target_endian = "big") {
|
||||||
|
(&mut b[..]).copy_from_slice(&a[3..8]);
|
||||||
|
} else {
|
||||||
|
(&mut b[..]).copy_from_slice(&a[0..5]);
|
||||||
|
}
|
||||||
|
b
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline(always)]
|
||||||
|
fn bytes_to_u64(a: [u8; 5]) -> u64 {
|
||||||
|
let mut b = [0u8; 8];
|
||||||
|
if cfg!(target_endian = "big") {
|
||||||
|
(&mut b[3..8]).copy_from_slice(&a[..]);
|
||||||
|
} else {
|
||||||
|
(&mut b[0..5]).copy_from_slice(&a[..]);
|
||||||
|
}
|
||||||
|
u64::from_ne_bytes(b)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
use super::*;
|
||||||
|
|
||||||
|
fn round_trip(floats: (f32, f32, f32)) -> (f32, f32, f32) {
|
||||||
|
decode(encode(floats))
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn all_zeros() {
|
||||||
|
let fs = (0.0f32, 0.0f32, 0.0f32);
|
||||||
|
|
||||||
|
let tri = encode_64(fs);
|
||||||
|
let fs2 = decode_64(tri);
|
||||||
|
|
||||||
|
assert_eq!(tri, 0u64);
|
||||||
|
assert_eq!(fs, fs2);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn powers_of_two() {
|
||||||
|
let fs = (8.0f32, 128.0f32, 0.5f32);
|
||||||
|
assert_eq!(fs, round_trip(fs));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn accuracy_01() {
|
||||||
|
let mut n = 1.0;
|
||||||
|
for _ in 0..1024 {
|
||||||
|
let (x, _, _) = round_trip((n, 0.0, 0.0));
|
||||||
|
assert_eq!(n, x);
|
||||||
|
n += 1.0 / 1024.0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
#[should_panic]
|
||||||
|
fn accuracy_02() {
|
||||||
|
let mut n = 1.0;
|
||||||
|
for _ in 0..2048 {
|
||||||
|
let (x, _, _) = round_trip((n, 0.0, 0.0));
|
||||||
|
assert_eq!(n, x);
|
||||||
|
n += 1.0 / 2048.0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn integers() {
|
||||||
|
for n in 0..=2048 {
|
||||||
|
let (x, _, _) = round_trip((n as f32, 0.0, 0.0));
|
||||||
|
assert_eq!(n as f32, x);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn precision_floor() {
|
||||||
|
let fs = (7.0f32, 2049.0f32, 1.0f32);
|
||||||
|
assert_eq!((6.0, 2048.0, 0.0), round_trip(fs));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn saturate() {
|
||||||
|
let fs = (1.0e+30, 1.0e+30, 1.0e+30);
|
||||||
|
|
||||||
|
assert_eq!((MAX, MAX, MAX), round_trip(fs));
|
||||||
|
assert_eq!((MAX, MAX, MAX), decode_64(0xff_ffff_ffff));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn inf_saturate() {
|
||||||
|
use std::f32::INFINITY;
|
||||||
|
let fs = (INFINITY, 0.0, 0.0);
|
||||||
|
|
||||||
|
assert_eq!((MAX, 0.0, 0.0), round_trip(fs));
|
||||||
|
assert_eq!(0xffe000007f, encode_64(fs));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn partial_saturate() {
|
||||||
|
let fs = (
|
||||||
|
1.0e+30,
|
||||||
|
(1u128 << (128 - EXP_BIAS - 11)) as f32,
|
||||||
|
(1u128 << (128 - EXP_BIAS - 12)) as f32,
|
||||||
|
);
|
||||||
|
|
||||||
|
assert_eq!(
|
||||||
|
(MAX, (1u128 << (128 - EXP_BIAS - 11)) as f32, 0.0),
|
||||||
|
round_trip(fs)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn smallest_value() {
|
||||||
|
let fs = (MIN * 1.5, MIN, MIN * 0.5);
|
||||||
|
assert_eq!((MIN, MIN, 0.0), round_trip(fs));
|
||||||
|
assert_eq!((MIN, MIN, 0.0), decode_64(0x20_04_00_00));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn underflow() {
|
||||||
|
let fs = (MIN * 0.99, 0.0, 0.0);
|
||||||
|
assert_eq!(0, encode_64(fs));
|
||||||
|
assert_eq!((0.0, 0.0, 0.0), round_trip(fs));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
#[should_panic]
|
||||||
|
fn nans_01() {
|
||||||
|
encode((std::f32::NAN, 0.0, 0.0));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
#[should_panic]
|
||||||
|
fn nans_02() {
|
||||||
|
encode((0.0, std::f32::NAN, 0.0));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
#[should_panic]
|
||||||
|
fn nans_03() {
|
||||||
|
encode((0.0, 0.0, std::f32::NAN));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
#[should_panic]
|
||||||
|
fn negative_01() {
|
||||||
|
encode((-1.0, 0.0, 0.0));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
#[should_panic]
|
||||||
|
fn negative_02() {
|
||||||
|
encode((0.0, -1.0, 0.0));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
#[should_panic]
|
||||||
|
fn negative_03() {
|
||||||
|
encode((0.0, 0.0, -1.0));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn negative_04() {
|
||||||
|
encode((-0.0, -0.0, -0.0));
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user