Compare commits


66 Commits

Author SHA1 Message Date
a9c2fb1d01 Name a constant better. 2023-08-04 00:51:02 +02:00
79b9bd5f49 Fix broken tests that I forgot to update. 2023-08-04 00:46:16 +02:00
f4b7767198 Move ulp increment/decrement functions into rmath utils. 2023-08-04 00:45:44 +02:00
911542c534 Handle NaNs in a reasonable way in rmath's ulp float functions. 2023-08-03 23:55:26 +02:00
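(For illustration only: a minimal sketch of a ulp-increment that passes NaNs through unchanged. Names and behavior here are assumptions, not rmath's actual code.)

    /// Next representable f32 above `x` (illustrative sketch, not rmath's code).
    pub fn ulp_inc(x: f32) -> f32 {
        if x.is_nan() || x == f32::INFINITY {
            // Pass NaNs (and +inf) through unchanged rather than producing garbage.
            return x;
        }
        if x == 0.0 {
            // Covers both +0.0 and -0.0: the next float up is the smallest subnormal.
            return f32::from_bits(1);
        }
        let bits = x.to_bits();
        if x > 0.0 {
            f32::from_bits(bits + 1)
        } else {
            // Negative values move toward zero as their bit pattern decreases.
            f32::from_bits(bits - 1)
        }
    }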
f9acc8096c Added more tests and documentation for float ulps functions in rmath. 2023-08-03 23:34:30 +02:00
76c56e16f9 Add some (commented out) experiments with Halton.
Halton has fewer correlation artifacts than Sobol for DoF, but
converges slower.
2022-08-17 15:24:20 -07:00
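(For reference, the radical-inverse construction behind a Halton sequence is tiny; this sketch is illustrative and not the repository's halton sub-crate.)

    // Radical inverse in the given base: digit-reverse `index` around the radix point.
    fn radical_inverse(base: u32, mut index: u32) -> f32 {
        let mut result = 0.0f64;
        let mut digit_value = 1.0f64 / base as f64;
        while index > 0 {
            result += digit_value * (index % base) as f64;
            index /= base;
            digit_value /= base as f64;
        }
        result as f32
    }

    // A 2D Halton point pairs two coprime bases, e.g. 2 and 3.
    fn halton_2d(i: u32) -> (f32, f32) {
        (radical_inverse(2, i), radical_inverse(3, i))
    }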
4058c63637 Add DoF back to PsychoBlend exporter. 2022-08-17 13:43:39 -07:00
cb01d1aaea Add an f32() method to Rng. 2022-08-17 12:13:29 -07:00
f0e5d538b7 Use non-Owen-scrambled samples for motion blur and DoF.
This gives notably better results because it avoids what I'm
calling "sample overlap", which is an issue with jittering
approaches like Owen scrambling.  In general, of course, Owen
scrambling improves things.  But particularly for motion blur
it seems to cause issues.
2022-08-17 00:09:02 -07:00
76781eb639 Use new fast hash for base-4 Owen scrambling. 2022-08-14 13:29:14 -07:00
c603e24633 Make World own its own memory, along with distant disk lights. 2022-08-07 13:54:18 -07:00
fc7b8da17d Remove unused imports. 2022-08-07 13:29:40 -07:00
b5bf580b96 Make Camera own its own memory. 2022-08-07 13:28:00 -07:00
6decc48648 PsychoBlend: fix broken material ui panel. 2022-08-07 11:17:25 -07:00
e244664b32 Move shader bindings to objects rather than instances. 2022-08-07 11:05:34 -07:00
1c801ee605 PsychoBlend: implement object and material export.
Material bindings don't work, since they're now on the objects
themselves rather than the instances, and I haven't updated
Psychopath itself for that yet.
2022-08-07 10:23:54 -07:00
d132e6a015 Some minor cleanup in the Blender/Psychopath communication code. 2022-08-07 08:25:10 -07:00
69ace90689 Continue WIP update PsychoBlend for Blender 3.x.
It exports and renders successfully... except there are no objects.
Just a blank background.
2022-08-06 21:40:13 -07:00
6d7b8b280f WIP update PsychoBlend for Blender 3.x.
This just makes the UI not break.  Exporting/rendering still
doesn't work.
2022-08-06 13:50:40 -07:00
1d05063190 Poking at what the new file/streaming format might be. 2022-08-04 15:40:09 -07:00
2bb45a9876 Add streaming data tree parser sub-crate.
Not used yet.
2022-08-04 13:50:13 -07:00
e1c983a7e6 Added benchmarks to rrand sub-crate.
Also misc naming cleanup.
2022-08-04 11:52:49 -07:00
a12de4c3d7 Wrap sampling logic/tracking in a struct. 2022-08-04 11:11:25 -07:00
1f7c412e25 Benchmarks and precision tests for RMath sub-crate. 2022-08-03 17:10:27 -07:00
167d70b8df Made the hilbert spiral order a little more pleasant. 2022-08-03 10:55:51 -07:00
15cd261026 Fix bug resulting in cracks between triangles in some cases. 2022-08-03 10:23:26 -07:00
9569f772f0 Fix incorrect background color handling in glossy reflections.
Bug introduced in the previous refactor removing the LightPath struct.
2022-08-03 00:07:03 -07:00
77ac8ef9f2 Implement "hilbert spiral" bucket rendering order. 2022-08-02 23:58:56 -07:00
7c750dcded Directly specify bucket size instead of inferring from sample count.
Since we aren't doing breadth-first ray tracing anymore, this makes
a lot more sense.
2022-08-02 19:41:23 -07:00
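(A rough sketch of fixed-size bucketing; names and structure are assumptions, not the renderer's actual code.)

    // Walk the image in fixed-size buckets, clamping the last row/column of
    // buckets to the image edge. Assumes bucket_size > 0.
    fn for_each_bucket(width: u32, height: u32, bucket_size: u32, mut f: impl FnMut(u32, u32, u32, u32)) {
        let mut y = 0;
        while y < height {
            let bh = bucket_size.min(height - y);
            let mut x = 0;
            while x < width {
                let bw = bucket_size.min(width - x);
                f(x, y, bw, bh); // render the bucket at (x, y) with size (bw, bh)
                x += bucket_size;
            }
            y += bucket_size;
        }
    }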
72b8397e9d Finished getting rid of the LightPath struct.
Also misc cleanup of related code.
2022-08-02 18:55:25 -07:00
6ccd4e306d WIP getting rid of LightPath struct.
Committing at this point because:
1. It compiles.
2. Rendering is totally wrong, but in a cool way.
2022-08-02 17:29:05 -07:00
181f7a6b85 Add convenience script for building with target native cpu. 2022-08-02 15:13:52 -07:00
608fe8bda1 Switch to colorbox and jakob upsampling for color handling. 2022-08-02 00:18:12 -07:00
5d246e66fa Remove stats that we can't reasonably collect anymore. 2022-08-01 22:57:13 -07:00
8bc6b24004 Switch to CIE XYZ lookup tables. 2022-08-01 22:04:14 -07:00
caed4c67de Do depth-first instead of breadth-first ray tracing.
This simplifies a lot of code, and will make experimenting with
other things a lot more straightforward.
2022-08-01 15:26:38 -07:00
98a9aeb374 Minor tweaks to Owen scrambling functions. 2022-07-25 15:43:07 -07:00
ef489c1ca2 Minor cleanup of the Owen scramble code. 2022-07-23 14:17:38 -07:00
f95e869848 Give Owen scramble functions their own hash.
This lets us move the seeding overhead outside the main loop,
which in turn lets us avoid incurring it every round.
2022-07-23 13:24:24 -07:00
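(The gist of the change, as a hedged sketch: hash the seed once, outside the per-bit loop. The mixer below is a generic integer hash used as a stand-in, not necessarily the hash this commit adds.)

    // Generic 32-bit mixer (constants from Chris Wellons' "lowbias32" hash),
    // used here only as a stand-in.
    fn mix(mut n: u32) -> u32 {
        n ^= n >> 16;
        n = n.wrapping_mul(0x7feb352d);
        n ^= n >> 15;
        n = n.wrapping_mul(0x846ca68b);
        n ^= n >> 16;
        n
    }

    // Per-bit Owen-style scramble: hash the seed once up front, so the per-bit
    // loop only mixes already-hashed state instead of re-seeding every round.
    fn owen_scramble_sketch(x: u32, seed: u32) -> u32 {
        let seed = mix(seed);
        let mut result = x;
        for i in 0..31 {
            // Each bit's flip decision depends only on the bits above it plus the seed.
            let high_bits = x >> (i + 1);
            let flip = mix(high_bits ^ seed) & 1;
            result ^= flip << i;
        }
        // The top bit has no higher bits, so flip it from the seed alone.
        result ^ ((seed & 1) << 31)
    }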
40d643b334 Put hilbert and morton code into one module. 2022-07-23 12:57:57 -07:00
0df18ce908 New hash seeding approach.
There didn't seem to be any issues in practice with the last
approach, but I thought of some other ways things could in theory
interact badly.  This fixes that.
2022-07-21 14:18:55 -07:00
570878d052 Eliminate branch in per-bit Owen scrambling functions. 2022-07-21 12:28:47 -07:00
f5a0210cdf More principled seeding approach in the hash functions. 2022-07-21 12:15:36 -07:00
7082f2d7f4 Update hash functions to known good ones instead of bespoke ones. 2022-07-21 05:22:22 -07:00
0d71ae86db Noticed that z-scrambling is actually just base-4 Owen scrambling.
Updated function name and comments to reflect that.
2022-07-21 04:44:05 -07:00
6b7538e25f Make the z-scrambling table smaller with bit fiddling.
This gets it down to 24 bytes.
2022-07-21 04:20:03 -07:00
ec9a121e72 Implement screen-space blue-noise sampling properly. 2022-07-21 03:59:47 -07:00
83b48f722d Simpler way to implement screen-space blue-noise sampling.
We now do the index scrambling at the top of the sampling loop,
which is also faster since we only have to run it once per pixel
instead of once per sample.
2022-07-20 18:54:38 -07:00
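(In sketch form, with hypothetical names and a stand-in scramble: the point is that the index scramble runs once per pixel, above the sample loop.)

    fn scramble_index(i: u32, seed: u32) -> u32 {
        // Stand-in for the real index scrambling; any seeded bijection of the
        // index works for illustration.
        i ^ seed
    }

    fn sample_pixel(pixel_index: u32, seed: u32, spp: u32) {
        // Scramble once per pixel, outside the sample loop...
        let scrambled = scramble_index(pixel_index, seed);
        for s in 0..spp {
            // ...so each sample only does cheap work with the result.
            let sample_index = scrambled.wrapping_add(s);
            let _ = sample_index; // hand this to the sampler
        }
    }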
86814dbf8f Minor rearrangement of ray generation sampling code. 2022-07-17 21:18:46 -07:00
89429ed9f0 Fix silly bug in light transforms introduced during the switch to RMath. 2022-07-17 17:40:14 -07:00
d55ec9b025 Update psychopath code to work with RMath changes. 2022-07-17 17:24:58 -07:00
6dbdcba91a Whole bunch of cleanup on RMath. 2022-07-17 16:37:15 -07:00
e2044e6579 Implement simple screen-space blue-noise diffusion sampling.
From the paper "Screen-Space Blue-Noise Diffusion of Monte Carlo
Sampling Error via Hierarchical Ordering of Pixels" by Ahmed et al.
2022-07-16 19:35:23 -07:00
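(The hierarchical pixel ordering such approaches build on is a Z-order/Morton-style traversal of the image; a small illustrative sketch of Morton encoding, not the paper's full method.)

    // Interleave the bits of x and y into a Morton (Z-order) index, giving a
    // hierarchical ordering of pixels.
    fn morton_encode(x: u16, y: u16) -> u32 {
        fn spread_bits(mut v: u32) -> u32 {
            v &= 0x0000_ffff;
            v = (v | (v << 8)) & 0x00ff_00ff;
            v = (v | (v << 4)) & 0x0f0f_0f0f;
            v = (v | (v << 2)) & 0x3333_3333;
            v = (v | (v << 1)) & 0x5555_5555;
            v
        }
        spread_bits(x as u32) | (spread_bits(y as u32) << 1)
    }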
ea4ba81110 Use faster routines where precision isn't needed. 2022-07-16 01:09:33 -07:00
8dcf093dbb RMath: first pass at an SSE implementation. 2022-07-16 00:03:09 -07:00
08e2e6eb06 Convert Psychopath over to use new RMath library. 2022-07-15 21:42:35 -07:00
a84da943d0 RMath: implement transform composition. 2022-07-15 17:51:57 -07:00
5535775006 RMath: implement Bool4 type. 2022-07-15 15:20:44 -07:00
fa7be4e58c RMath: change fallback Float4 to be a tuple-struct. 2022-07-15 00:49:37 -07:00
a93a3f09da RMath: implement cross product and bring back some unit tests. 2022-07-15 00:39:14 -07:00
42cd282c47 RMath: implement transforms for Vector, Point, and Normal. 2022-07-14 23:23:22 -07:00
d8e1437db1 RMath: implement vector-matrix multiplication. 2022-07-14 19:02:08 -07:00
c398387b55 Implement dot products and 3x3 matrix inversion.
Both precise and fast versions.  But untested, so might be
incorrect!
2022-07-14 15:30:30 -07:00
8a695a7694 Some shuffling of the math sub-crate's organization. 2022-07-14 12:31:32 -07:00
732dee958e Remove Mat3x3 from math3d lib.
It was an extraneous abstraction.
2022-07-14 00:33:38 -07:00
658e4746ca Start work on new linear algebra library. 2022-07-13 18:54:44 -07:00
80 changed files with 7619 additions and 4067 deletions

Cargo.lock generated

@ -1,5 +1,7 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3
[[package]]
name = "ansi_term"
version = "0.11.0"
@ -9,15 +11,6 @@ dependencies = [
"winapi",
]
[[package]]
name = "approx"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3f2a05fd1bd10b2527e20a2cd32d8873d115b8b39fe219ee25f42a8aca6ba278"
dependencies = [
"num-traits",
]
[[package]]
name = "arrayvec"
version = "0.5.2"
@ -133,6 +126,15 @@ dependencies = [
[[package]]
name = "color"
version = "0.1.0"
dependencies = [
"colorbox",
]
[[package]]
name = "colorbox"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3d27d55561009760957654f467735e73806f8bc2d081cc4a22e93403ecd156fc"
[[package]]
name = "compact"
@ -155,6 +157,10 @@ version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "24ce9782d4d5c53674646a6a4c1863a21a8fc0cb649b3c94dfc16e45071dea19"
[[package]]
name = "data_tree"
version = "0.1.0"
[[package]]
name = "fastapprox"
version = "0.3.0"
@ -184,12 +190,6 @@ dependencies = [
"wasi",
]
[[package]]
name = "glam"
version = "0.15.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "411e0584defa447c328f25c756ba3d0685727ecc126b46c3c1176001141cd4b6"
[[package]]
name = "half"
version = "1.7.1"
@ -240,14 +240,6 @@ version = "0.2.94"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "18794a8ad5b29321f790b55d93dfba91e125cb1a9edbd4f8e3150acc771c1a5e"
[[package]]
name = "math3d"
version = "0.1.0"
dependencies = [
"approx",
"glam",
]
[[package]]
name = "memchr"
version = "2.4.0"
@ -350,20 +342,22 @@ dependencies = [
"bvh_order",
"clap",
"color",
"colorbox",
"compact",
"copy_in_place",
"crossbeam",
"data_tree",
"fastapprox",
"glam",
"half",
"halton",
"kioku",
"lazy_static",
"math3d",
"nom",
"num_cpus",
"openexr",
"png_encode_mini",
"rmath",
"rrand",
"rustc-serialize",
"scoped_threadpool",
"sobol_burley",
@ -569,6 +563,22 @@ dependencies = [
"winapi",
]
[[package]]
name = "rmath"
version = "0.1.0"
dependencies = [
"bencher",
"rand 0.6.5",
]
[[package]]
name = "rrand"
version = "0.1.0"
dependencies = [
"bencher",
"rand 0.6.5",
]
[[package]]
name = "rustc-serialize"
version = "0.3.24"
@ -607,15 +617,15 @@ checksum = "1d51f5df5af43ab3f1360b429fa5e0152ac5ce8c0bd6485cae490332e96846a8"
[[package]]
name = "sobol_burley"
version = "0.3.0"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "26e3528b09b1f1b1e152342a4462d1e80d568dc5623a0772252a6e584a53d550"
checksum = "3441b32bbb896e372f1b8e7eb51a983713aef99599c32c0eb69183aa490cb6a0"
[[package]]
name = "spectral_upsampling"
version = "0.1.0"
dependencies = [
"glam",
"rmath",
]
[[package]]


@ -3,8 +3,10 @@ members = [
"sub_crates/bvh_order",
"sub_crates/color",
"sub_crates/compact",
"sub_crates/data_tree",
"sub_crates/halton",
"sub_crates/math3d",
"sub_crates/rmath",
"sub_crates/rrand",
"sub_crates/spectral_upsampling",
]
@ -30,13 +32,14 @@ nom = "5"
num_cpus = "1.8"
openexr = "0.7"
kioku = "0.3"
sobol_burley = "0.3"
sobol_burley = "0.4"
png_encode_mini = "0.1.2"
rustc-serialize = "0.3"
scoped_threadpool = "0.1"
time = "0.1"
glam = "0.15"
fastapprox = "0.3"
colorbox = "0.3"
# Local crate dependencies
[dependencies.bvh_order]
@ -47,12 +50,18 @@ path = "sub_crates/color"
[dependencies.compact]
path = "sub_crates/compact"
[dependencies.halton]
[dependencies.data_tree]
path = "sub_crates/data_tree"
[dependencies.halton]
path = "sub_crates/halton"
[dependencies.math3d]
path = "sub_crates/math3d"
[dependencies.rmath]
path = "sub_crates/rmath"
[dependencies.rrand]
path = "sub_crates/rrand"
[dependencies.spectral_upsampling]
path = "sub_crates/spectral_upsampling"


@ -13,9 +13,7 @@ efficiently handle very large data sets, complex shading, motion blur, color
management, etc. presents a much richer and more challenging problem space to
explore than just writing a basic path tracer.
## Building
Psychopath is written in [Rust](https://www.rust-lang.org), and is pretty
straightforward to build except for its OpenEXR dependency.
@ -36,7 +34,6 @@ documented in the [OpenEXR-rs readme](https://github.com/cessen/openexr-rs/blob/
Once those environment variables are set, then you should be able to build using
the same simple cargo command above.
# PsychoBlend
Included in the repository is an add-on for [Blender](http://www.blender.org)
@ -53,6 +50,15 @@ doesn't support them yet.
- Exports dupligroups with full hierarchical instancing
- Limited auto-detection of instanced meshes
# Contributing
I'm not looking for contributions right now, and I'm likely to reject pull
requests. This is currently a solo project and I like it that way.
However, if you're looking for projects _related_ to Psychopath to contribute to,
[OpenEXR-rs](https://github.com/cessen/openexr-rs) is definitely a
collaborative project that I would love more help with! And I fully expect more
such projects to come out of Psychopath in the future.
# License
@ -63,13 +69,3 @@ See LICENSE.md for details. But the gist is:
* Most crates under the `sub_crates` directory are dual-licensed under MIT and Apache 2.0 (but with some exceptions--see each crate for its respective licenses).
The intent of this scheme is to keep Psychopath itself copyleft, while allowing smaller reusable components to be licensed more liberally.
# Contributing
This is a personal, experimental, for-fun project, and I am specifically
not looking for contributions of any kind. All PRs will be rejected
without review.
However, feel free to fork this into an entirely new project, or examine
the code for ideas for a project of your own.

build_native.sh Executable file

@ -0,0 +1,2 @@
#!/bin/sh
RUSTFLAGS="-C target-cpu=native" cargo build --release


@ -1,60 +1,73 @@
Scene $Scene_fr1 {
Output {
Path ["test_renders/cube.png"]
}
RenderSettings {
Resolution [960 540]
SamplesPerPixel [16]
Seed [1]
}
Camera {
Fov [49.134342]
FocalDistance [9.559999]
ApertureRadius [0.250000]
Transform [0.685881 0.727634 -0.010817 0.000000 -0.317370 0.312469 0.895343 0.000000 -0.654862 0.610666 -0.445245 0.000000 7.481132 -6.507640 5.343665 1.000000]
}
World {
BackgroundShader {
Type [Color]
Color [rec709, 0.050876 0.050876 0.050876]
}
}
Shaders {
SurfaceShader $Material {
Type [Lambert]
Color [rec709, 0.800000 0.800000 0.800000]
}
}
Objects {
MeshSurface $__Plane_ {
SurfaceShaderBind [$Material]
Vertices [-1.000000 -1.000000 0.000000 1.000000 -1.000000 0.000000 -1.000000 1.000000 0.000000 1.000000 1.000000 0.000000]
FaceVertCounts [4 ]
FaceVertIndices [0 1 3 2 ]
}
MeshSurface $__Cube_ {
SurfaceShaderBind [$Material]
Vertices [1.000000 1.000000 -1.000000 1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 1.000000 -1.000000 1.000000 0.999999 1.000000 0.999999 -1.000001 1.000000 -1.000000 -1.000000 1.000000 -1.000000 1.000000 1.000000 ]
FaceVertCounts [4 4 4 4 4 4 ]
FaceVertIndices [0 1 2 3 4 7 6 5 0 4 5 1 1 5 6 2 2 6 7 3 4 0 3 7 ]
}
SphereLight $__Lamp {
Color [rec709, 50.000000 50.000000 50.000000]
Radius [0.100000]
}
}
Assembly {
Instance {
Data [$__Plane_]
Transform [0.078868 -0.000000 0.000000 -0.000000 -0.000000 0.078868 -0.000000 0.000000 0.000000 -0.000000 0.078868 -0.000000 -0.000000 0.000000 -0.000000 1.000000]
}
Instance {
Data [$__Cube_]
Transform [1.000000 -0.000000 0.000000 -0.000000 -0.000000 1.000000 -0.000000 0.000000 0.000000 -0.000000 1.000000 -0.000000 -0.000000 0.000000 -1.000000 1.000000]
}
Instance {
Data [$__Lamp]
Transform [0.019856 -0.060763 0.000000 -0.000000 0.015191 0.079422 -0.000000 0.000000 0.000000 -0.000000 1.000000 -0.000000 -0.026851 -0.125233 -4.432303 1.000000]
}
ClearScene {}
Output {
Path ["test_renders/cube.png"]
}
RenderSettings {
Resolution [960 540]
SamplesPerPixel [16]
Seed [1]
}
Camera {
Fov [49.134342]
FocalDistance [9.559999]
ApertureRadius [0.250000]
Transform [0.685881 0.727634 -0.010817 -0.317370 0.312469 0.895343 -0.654862 0.610666 -0.445245 7.481132 -6.507640 5.343665]
}
World {
BackgroundShader {
Type [Color]
Color [rec709, 0.050876 0.050876 0.050876]
}
}
AddShader {
SurfaceShader {
Name [Material]
Type [Lambert]
Color [rec709, 0.800000 0.800000 0.800000]
}
}
AddObject {
MeshSurface {
Name [__Plane_]
SurfaceShaderBind [Material]
Vertices [-1.000000 -1.000000 0.000000 1.000000 -1.000000 0.000000 -1.000000 1.000000 0.000000 1.000000 1.000000 0.000000]
FaceVertCounts [4 ]
FaceVertIndices [0 1 3 2 ]
}
MeshSurface {
Name [__Cube_]
SurfaceShaderBind [Material]
Vertices [1.000000 1.000000 -1.000000 1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 1.000000 -1.000000 1.000000 0.999999 1.000000 0.999999 -1.000001 1.000000 -1.000000 -1.000000 1.000000 -1.000000 1.000000 1.000000 ]
FaceVertCounts [4 4 4 4 4 4 ]
FaceVertIndices [0 1 2 3 4 7 6 5 0 4 5 1 1 5 6 2 2 6 7 3 4 0 3 7 ]
}
SphereLight {
Name [__Lamp]
Color [rec709, 50.000000 50.000000 50.000000]
Radius [0.100000]
}
}
AddInstance {
Instance {
Name [__Plane_]
Data [__Plane_]
Transform [0.078868 0.000000 0.000000 0.000000 0.078868 0.000000 0.000000 0.000000 0.078868 0.000000 0.000000 0.000000]
}
Instance {
Name [__Cube_]
Data [__Cube_]
Transform [1.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 -1.000000]
}
Instance {
Name [__Lamp]
Data [__Lamp]
Transform [0.019856 -0.060763 0.000000 0.015191 0.079422 0.000000 0.000000 0.000000 1.000000 -0.026851 -0.125233 -4.432303]
}
}


@ -1,22 +1,17 @@
bl_info = {
"name": "PsychoBlend",
"version": (0, 1),
"version": (0, 1, 0),
"author": "Nathan Vegdahl",
"blender": (2, 70, 0),
"blender": (3, 1, 0),
"description": "Psychopath renderer integration",
"location": "",
"wiki_url": "https://github.com/cessen/psychopath/wiki",
"tracker_url": "https://github.com/cessen/psychopath/issues",
"category": "Render"}
# "wiki_url": "https://github.com/cessen/psychopath/wiki",
# "tracker_url": "https://github.com/cessen/psychopath/issues",
"category": "Render"
}
if "bpy" in locals():
import imp
imp.reload(ui)
imp.reload(psy_export)
imp.reload(render)
else:
from . import ui, psy_export, render
from . import ui, psy_export, render
import bpy
from bpy.types import (AddonPreferences,
@ -33,53 +28,46 @@ from bpy.props import (StringProperty,
)
# Custom Scene settings
# Custom Scene settings.
class RenderPsychopathSettingsScene(PropertyGroup):
spp = IntProperty(
spp: IntProperty(
name="Samples Per Pixel", description="Total number of samples to take per pixel",
min=1, max=65536, default=16
)
max_samples_per_bucket = IntProperty(
name="Max Samples Per Bucket", description="How many samples to simultaneously calculate per thread; indirectly determines bucket size",
min=1, max=2**28, soft_max=2**16, default=4096
bucket_size: IntProperty(
name="Bucket Size", description="The height and width of each render bucket in pixels.",
min=1, max=4096, soft_max=256, default=32
)
dicing_rate = FloatProperty(
dicing_rate: FloatProperty(
name="Dicing Rate", description="The target microgeometry width in pixels",
min=0.0001, max=100.0, soft_min=0.125, soft_max=1.0, default=0.25
)
motion_blur_segments = IntProperty(
motion_blur_segments: IntProperty(
name="Motion Segments", description="The number of segments to use in motion blur. Zero means no motion blur. Will be rounded down to the nearest power of two.",
min=0, max=256, default=0
)
shutter_start = FloatProperty(
shutter_start: FloatProperty(
name="Shutter Open", description="The time during the frame that the shutter opens, for motion blur",
min=-1.0, max=1.0, soft_min=0.0, soft_max=1.0, default=0.0
)
shutter_end = FloatProperty(
shutter_end: FloatProperty(
name="Shutter Close", description="The time during the frame that the shutter closes, for motion blur",
min=-1.0, max=1.0, soft_min=0.0, soft_max=1.0, default=0.5
)
export_path = StringProperty(
export_path: StringProperty(
name="Export Path", description="The path to where the .psy files should be exported when rendering. If left blank, /tmp or the equivalent is used.",
subtype='FILE_PATH'
)
# Custom Camera properties
class PsychopathCamera(bpy.types.PropertyGroup):
aperture_radius = FloatProperty(
name="Aperture Radius", description="Size of the camera's aperture, for DoF",
min=0.0, max=10000.0, soft_min=0.0, soft_max=2.0, default=0.0
)
# Psychopath material
# Psychopath material.
class PsychopathLight(bpy.types.PropertyGroup):
color_type = EnumProperty(
color_type: EnumProperty(
name="Color Type", description="",
items=[
('Rec709', 'Rec709', ""),
@ -89,27 +77,27 @@ class PsychopathLight(bpy.types.PropertyGroup):
default="Rec709"
)
color_blackbody_temp = FloatProperty(
color_blackbody_temp: FloatProperty(
name="Temperature", description="Blackbody temperature in kelvin",
min=0.0, soft_min=800.0, soft_max=6500.0, default=1200.0
)
# Custom Mesh properties
# Custom Mesh properties.
class PsychopathMesh(bpy.types.PropertyGroup):
is_subdivision_surface = BoolProperty(
is_subdivision_surface: BoolProperty(
name="Is Subdivision Surface", description="Whether this is a sibdivision surface or just a normal mesh",
default=False
)
# Psychopath material
class PsychopathMaterial(bpy.types.PropertyGroup):
surface_shader_type = EnumProperty(
surface_shader_type: EnumProperty(
name="Surface Shader Type", description="",
items=[('Emit', 'Emit', ""), ('Lambert', 'Lambert', ""), ('GGX', 'GGX', "")],
default="Lambert"
)
color_type = EnumProperty(
color_type: EnumProperty(
name="Color Type", description="",
items=[
('Rec709', 'Rec709', ""),
@ -119,39 +107,47 @@ class PsychopathMaterial(bpy.types.PropertyGroup):
default="Rec709"
)
color = FloatVectorProperty(
color: FloatVectorProperty(
name="Color", description="",
subtype='COLOR',
min=0.0, soft_min=0.0, soft_max = 1.0,
default=[0.8,0.8,0.8]
)
color_blackbody_temp = FloatProperty(
color_blackbody_temp: FloatProperty(
name="Temperature", description="Blackbody temperature in kelvin",
min=0.0, soft_min=800.0, soft_max=6500.0, default=1200.0
)
roughness = FloatProperty(
roughness: FloatProperty(
name="Roughness", description="",
min=-1.0, max=1.0, soft_min=0.0, soft_max=1.0, default=0.1
)
tail_shape = FloatProperty(
tail_shape: FloatProperty(
name="Tail Shape", description="",
min=0.0, max=8.0, soft_min=1.0, soft_max=3.0, default=2.0
)
fresnel = FloatProperty(
fresnel: FloatProperty(
name="Fresnel", description="",
min=0.0, max=1.0, soft_min=0.0, soft_max=1.0, default=0.9
)
# Psychopath world.
class PsychopathWorld(bpy.types.PropertyGroup):
background_color: FloatVectorProperty(
name="Background Color", description="",
subtype='COLOR',
min=0.0, soft_min=0.0, soft_max = 1.0,
default=[0.8,0.8,0.8]
)
# Addon Preferences
# Addon Preferences.
class PsychopathPreferences(AddonPreferences):
bl_idname = __name__
filepath_psychopath = StringProperty(
filepath_psychopath: StringProperty(
name="Psychopath Location",
description="Path to renderer executable",
subtype='DIR_PATH',
@ -166,15 +162,15 @@ class PsychopathPreferences(AddonPreferences):
def register():
bpy.utils.register_class(PsychopathPreferences)
bpy.utils.register_class(RenderPsychopathSettingsScene)
bpy.utils.register_class(PsychopathCamera)
bpy.utils.register_class(PsychopathLight)
bpy.utils.register_class(PsychopathMesh)
bpy.utils.register_class(PsychopathMaterial)
bpy.utils.register_class(PsychopathWorld)
bpy.types.Scene.psychopath = PointerProperty(type=RenderPsychopathSettingsScene)
bpy.types.Camera.psychopath = PointerProperty(type=PsychopathCamera)
bpy.types.Lamp.psychopath = PointerProperty(type=PsychopathLight)
bpy.types.Light.psychopath = PointerProperty(type=PsychopathLight)
bpy.types.Mesh.psychopath = PointerProperty(type=PsychopathMesh)
bpy.types.Material.psychopath = PointerProperty(type=PsychopathMaterial)
bpy.types.World.psychopath = PointerProperty(type=PsychopathWorld)
render.register()
ui.register()
@ -182,14 +178,14 @@ def register():
def unregister():
bpy.utils.unregister_class(PsychopathPreferences)
bpy.utils.unregister_class(RenderPsychopathSettingsScene)
bpy.utils.unregister_class(PsychopathCamera)
bpy.utils.unregister_class(PsychopathLight)
bpy.utils.unregister_class(PsychopathMesh)
bpy.utils.unregister_class(PsychopathMaterial)
bpy.utils.unregister_class(PsychopathWorld)
del bpy.types.Scene.psychopath
del bpy.types.Camera.psychopath
del bpy.types.Lamp.psychopath
del bpy.types.Light.psychopath
del bpy.types.Mesh.psychopath
del bpy.types.Material.psychopath
del bpy.types.World.psychopath
render.unregister()
ui.unregister()


@ -1,398 +0,0 @@
import bpy
from .util import escape_name, mat2str, needs_def_mb, needs_xform_mb, ExportCancelled
class Assembly:
def __init__(self, render_engine, objects, visible_layers, group_prefix="", translation_offset=(0,0,0)):
self.name = group_prefix
self.translation_offset = translation_offset
self.render_engine = render_engine
self.materials = []
self.objects = []
self.instances = []
self.material_names = set()
self.mesh_names = set()
self.assembly_names = set()
# Collect all the objects, materials, instances, etc.
for ob in objects:
# Check if render is cancelled
if render_engine.test_break():
raise ExportCancelled()
# Check if the object is visible for rendering
vis_layer = False
for i in range(len(ob.layers)):
vis_layer = vis_layer or (ob.layers[i] and visible_layers[i])
if ob.hide_render or not vis_layer:
continue
# Store object data
name = None
if ob.type == 'EMPTY':
if ob.dupli_type == 'GROUP':
name = group_prefix + "__" + escape_name(ob.dupli_group.name)
if name not in self.assembly_names:
self.assembly_names.add(name)
self.objects += [Assembly(self.render_engine, ob.dupli_group.objects, ob.dupli_group.layers, name, ob.dupli_group.dupli_offset*-1)]
elif ob.type == 'MESH':
name = self.get_mesh(ob, group_prefix)
elif ob.type == 'LAMP' and ob.data.type == 'POINT':
name = self.get_sphere_lamp(ob, group_prefix)
elif ob.type == 'LAMP' and ob.data.type == 'AREA':
name = self.get_rect_lamp(ob, group_prefix)
# Store instance
if name != None:
self.instances += [Instance(render_engine, ob, name)]
def export(self, render_engine, w):
if self.name == "":
w.write("Assembly {\n")
else:
w.write("Assembly $%s {\n" % self.name)
w.indent()
for mat in self.materials:
# Check if render is cancelled
if render_engine.test_break():
raise ExportCancelled()
mat.export(render_engine, w)
for ob in self.objects:
# Check if render is cancelled
if render_engine.test_break():
raise ExportCancelled()
ob.export(render_engine, w)
for inst in self.instances:
# Check if render is cancelled
if render_engine.test_break():
raise ExportCancelled()
inst.export(render_engine, w)
w.unindent()
w.write("}\n")
#----------------
def take_sample(self, render_engine, scene, time):
for mat in self.materials:
# Check if render is cancelled
if render_engine.test_break():
raise ExportCancelled()
mat.take_sample(render_engine, scene, time)
for ob in self.objects:
# Check if render is cancelled
if render_engine.test_break():
raise ExportCancelled()
ob.take_sample(render_engine, scene, time)
for inst in self.instances:
# Check if render is cancelled
if render_engine.test_break():
raise ExportCancelled()
inst.take_sample(render_engine, time, self.translation_offset)
def cleanup(self):
for mat in self.materials:
mat.cleanup()
for ob in self.objects:
ob.cleanup()
def get_mesh(self, ob, group_prefix):
# Figure out if we need to export or not and figure out what name to
# export with.
has_modifiers = len(ob.modifiers) > 0
deform_mb = needs_def_mb(ob)
if has_modifiers or deform_mb:
mesh_name = group_prefix + escape_name("__" + ob.name + "__" + ob.data.name + "_")
else:
mesh_name = group_prefix + escape_name("__" + ob.data.name + "_")
has_faces = len(ob.data.polygons) > 0
should_export_mesh = has_faces and (mesh_name not in self.mesh_names)
# Get mesh
if should_export_mesh:
self.mesh_names.add(mesh_name)
self.objects += [Mesh(self.render_engine, ob, mesh_name)]
# Get materials
for ms in ob.material_slots:
if ms != None:
if ms.material.name not in self.material_names:
self.material_names.add(ms.material.name)
self.materials += [Material(self.render_engine, ms.material)]
return mesh_name
else:
return None
def get_sphere_lamp(self, ob, group_prefix):
name = group_prefix + "__" + escape_name(ob.name)
self.objects += [SphereLamp(self.render_engine, ob, name)]
return name
def get_rect_lamp(self, ob, group_prefix):
name = group_prefix + "__" + escape_name(ob.name)
self.objects += [RectLamp(self.render_engine, ob, name)]
return name
#=========================================================================
class Mesh:
""" Holds data for a mesh to be exported.
"""
def __init__(self, render_engine, ob, name):
self.ob = ob
self.name = name
self.needs_mb = needs_def_mb(self.ob)
self.time_meshes = []
def take_sample(self, render_engine, scene, time):
if len(self.time_meshes) == 0 or self.needs_mb:
render_engine.update_stats("", "Psychopath: Collecting '{}' at time {}".format(self.ob.name, time))
self.time_meshes += [self.ob.to_mesh(scene, True, 'RENDER')]
def cleanup(self):
for mesh in self.time_meshes:
bpy.data.meshes.remove(mesh)
def export(self, render_engine, w):
render_engine.update_stats("", "Psychopath: Exporting %s" % self.ob.name)
if self.ob.data.psychopath.is_subdivision_surface == False:
# Exporting normal mesh
w.write("MeshSurface $%s {\n" % self.name)
w.indent()
else:
# Exporting subdivision surface cage
w.write("SubdivisionSurface $%s {\n" % self.name)
w.indent()
# Write vertices and (if it's smooth shaded) normals
for ti in range(len(self.time_meshes)):
w.write("Vertices [")
w.write(" ".join([("%f" % i) for vert in self.time_meshes[ti].vertices for i in vert.co]), False)
w.write("]\n", False)
if self.time_meshes[0].polygons[0].use_smooth and self.ob.data.psychopath.is_subdivision_surface == False:
w.write("Normals [")
w.write(" ".join([("%f" % i) for vert in self.time_meshes[ti].vertices for i in vert.normal]), False)
w.write("]\n", False)
# Write face vertex counts
w.write("FaceVertCounts [")
w.write(" ".join([("%d" % len(p.vertices)) for p in self.time_meshes[0].polygons]), False)
w.write("]\n", False)
# Write face vertex indices
w.write("FaceVertIndices [")
w.write(" ".join([("%d"%v) for p in self.time_meshes[0].polygons for v in p.vertices]), False)
w.write("]\n", False)
# MeshSurface/SubdivisionSurface section end
w.unindent()
w.write("}\n")
class SphereLamp:
""" Holds data for a sphere light to be exported.
"""
def __init__(self, render_engine, ob, name):
self.ob = ob
self.name = name
self.time_col = []
self.time_rad = []
def take_sample(self, render_engine, scene, time):
render_engine.update_stats("", "Psychopath: Collecting '{}' at time {}".format(self.ob.name, time))
if self.ob.data.psychopath.color_type == 'Rec709':
self.time_col += [('Rec709', self.ob.data.color * self.ob.data.energy)]
elif self.ob.data.psychopath.color_type == 'Blackbody':
self.time_col += [('Blackbody', self.ob.data.psychopath.color_blackbody_temp, self.ob.data.energy)]
elif self.ob.data.psychopath.color_type == 'ColorTemperature':
self.time_col += [('ColorTemperature', self.ob.data.psychopath.color_blackbody_temp, self.ob.data.energy)]
self.time_rad += [self.ob.data.shadow_soft_size]
def cleanup(self):
pass
def export(self, render_engine, w):
render_engine.update_stats("", "Psychopath: Exporting %s" % self.ob.name)
w.write("SphereLight $%s {\n" % self.name)
w.indent()
for col in self.time_col:
if col[0] == 'Rec709':
w.write("Color [rec709, %f %f %f]\n" % (col[1][0], col[1][1], col[1][2]))
elif col[0] == 'Blackbody':
w.write("Color [blackbody, %f %f]\n" % (col[1], col[2]))
elif col[0] == 'ColorTemperature':
w.write("Color [color_temperature, %f %f]\n" % (col[1], col[2]))
for rad in self.time_rad:
w.write("Radius [%f]\n" % rad)
w.unindent()
w.write("}\n")
class RectLamp:
""" Holds data for a rectangular light to be exported.
"""
def __init__(self, render_engine, ob, name):
self.ob = ob
self.name = name
self.time_col = []
self.time_dim = []
def take_sample(self, render_engine, scene, time):
render_engine.update_stats("", "Psychopath: Collecting '{}' at time {}".format(self.ob.name, time))
if self.ob.data.psychopath.color_type == 'Rec709':
self.time_col += [('Rec709', self.ob.data.color * self.ob.data.energy)]
elif self.ob.data.psychopath.color_type == 'Blackbody':
self.time_col += [('Blackbody', self.ob.data.psychopath.color_blackbody_temp, self.ob.data.energy)]
elif self.ob.data.psychopath.color_type == 'ColorTemperature':
self.time_col += [('ColorTemperature', self.ob.data.psychopath.color_blackbody_temp, self.ob.data.energy)]
if self.ob.data.shape == 'RECTANGLE':
self.time_dim += [(self.ob.data.size, self.ob.data.size_y)]
else:
self.time_dim += [(self.ob.data.size, self.ob.data.size)]
def cleanup(self):
pass
def export(self, render_engine, w):
render_engine.update_stats("", "Psychopath: Exporting %s" % self.ob.name)
w.write("RectangleLight $%s {\n" % self.name)
w.indent()
for col in self.time_col:
if col[0] == 'Rec709':
w.write("Color [rec709, %f %f %f]\n" % (col[1][0], col[1][1], col[1][2]))
elif col[0] == 'Blackbody':
w.write("Color [blackbody, %f %f]\n" % (col[1], col[2]))
elif col[0] == 'ColorTemperature':
w.write("Color [color_temperature, %f %f]\n" % (col[1], col[2]))
for dim in self.time_dim:
w.write("Dimensions [%f %f]\n" % dim)
w.unindent()
w.write("}\n")
class Instance:
def __init__(self, render_engine, ob, data_name):
self.ob = ob
self.data_name = data_name
self.needs_mb = needs_xform_mb(self.ob)
self.time_xforms = []
def take_sample(self, render_engine, time, translation_offset):
if len(self.time_xforms) == 0 or self.needs_mb:
render_engine.update_stats("", "Psychopath: Collecting '{}' xforms at time {}".format(self.ob.name, time))
mat = self.ob.matrix_world.copy()
mat[0][3] += translation_offset[0]
mat[1][3] += translation_offset[1]
mat[2][3] += translation_offset[2]
self.time_xforms += [mat]
def export(self, render_engine, w):
render_engine.update_stats("", "Psychopath: Exporting %s" % self.ob.name)
w.write("Instance {\n")
w.indent()
w.write("Data [$%s]\n" % self.data_name)
for mat in self.time_xforms:
w.write("Transform [%s]\n" % mat2str(mat.inverted()))
for ms in self.ob.material_slots:
if ms != None:
w.write("SurfaceShaderBind [$%s]\n" % escape_name(ms.material.name))
break
w.unindent()
w.write("}\n")
class Material:
def __init__(self, render_engine, material):
self.mat = material
def take_sample(self, render_engine, time, translation_offset):
# TODO: motion blur of material settings
pass
def export(self, render_engine, w):
render_engine.update_stats("", "Psychopath: Exporting %s" % self.mat.name)
w.write("SurfaceShader $%s {\n" % escape_name(self.mat.name))
w.indent()
if self.mat.psychopath.surface_shader_type == 'Emit':
w.write("Type [Emit]\n")
if self.mat.psychopath.color_type == 'Rec709':
col = self.mat.psychopath.color
w.write("Color [rec709, %f %f %f]\n" % (
col[0], col[1], col[2],
))
elif self.mat.psychopath.color_type == 'Blackbody':
w.write("Color [blackbody, %f %f]\n" % (
self.mat.psychopath.color_blackbody_temp,
1.0,
))
elif self.mat.psychopath.color_type == 'ColorTemperature':
w.write("Color [color_temperature, %f %f]\n" % (
self.mat.psychopath.color_blackbody_temp,
1.0,
))
elif self.mat.psychopath.surface_shader_type == 'Lambert':
w.write("Type [Lambert]\n")
if self.mat.psychopath.color_type == 'Rec709':
col = self.mat.psychopath.color
w.write("Color [rec709, %f %f %f]\n" % (
col[0], col[1], col[2],
))
elif self.mat.psychopath.color_type == 'Blackbody':
w.write("Color [blackbody, %f %f]\n" % (
self.mat.psychopath.color_blackbody_temp,
1.0,
))
elif self.mat.psychopath.color_type == 'ColorTemperature':
w.write("Color [color_temperature, %f %f]\n" % (
self.mat.psychopath.color_blackbody_temp,
1.0,
))
elif self.mat.psychopath.surface_shader_type == 'GGX':
w.write("Type [GGX]\n")
if self.mat.psychopath.color_type == 'Rec709':
col = self.mat.psychopath.color
w.write("Color [rec709, %f %f %f]\n" % (
col[0], col[1], col[2],
))
elif self.mat.psychopath.color_type == 'Blackbody':
w.write("Color [blackbody, %f %f]\n" % (
self.mat.psychopath.color_blackbody_temp,
1.0,
))
elif self.mat.psychopath.color_type == 'ColorTemperature':
w.write("Color [color_temperature, %f %f]\n" % (
self.mat.psychopath.color_blackbody_temp,
1.0,
))
w.write("Roughness [%f]\n" % self.mat.psychopath.roughness)
w.write("Fresnel [%f]\n" % self.mat.psychopath.fresnel)
else:
raise "Unsupported surface shader type '%s'" % self.mat.psychopath.surface_shader_type
w.unindent()
w.write("}\n")
def cleanup(self):
pass

psychoblend/material.py Normal file

@ -0,0 +1,77 @@
import bpy
from .util import escape_name, mat2str, needs_def_mb, needs_xform_mb, ExportCancelled
class Material:
def __init__(self, render_engine, depsgraph, material):
self.mat = material
def take_sample(self, render_engine, depsgraph, time):
# TODO: motion blur of material settings
pass
def export(self, render_engine, w):
render_engine.update_stats("", "Psychopath: Exporting %s" % self.mat.name)
w.write("SurfaceShader $%s {\n" % escape_name(self.mat.name))
w.indent()
if self.mat.psychopath.surface_shader_type == 'Emit':
w.write("Type [Emit]\n")
if self.mat.psychopath.color_type == 'Rec709':
col = self.mat.psychopath.color
w.write("Color [rec709, %f %f %f]\n" % (
col[0], col[1], col[2],
))
elif self.mat.psychopath.color_type == 'Blackbody':
w.write("Color [blackbody, %f %f]\n" % (
self.mat.psychopath.color_blackbody_temp,
1.0,
))
elif self.mat.psychopath.color_type == 'ColorTemperature':
w.write("Color [color_temperature, %f %f]\n" % (
self.mat.psychopath.color_blackbody_temp,
1.0,
))
elif self.mat.psychopath.surface_shader_type == 'Lambert':
w.write("Type [Lambert]\n")
if self.mat.psychopath.color_type == 'Rec709':
col = self.mat.psychopath.color
w.write("Color [rec709, %f %f %f]\n" % (
col[0], col[1], col[2],
))
elif self.mat.psychopath.color_type == 'Blackbody':
w.write("Color [blackbody, %f %f]\n" % (
self.mat.psychopath.color_blackbody_temp,
1.0,
))
elif self.mat.psychopath.color_type == 'ColorTemperature':
w.write("Color [color_temperature, %f %f]\n" % (
self.mat.psychopath.color_blackbody_temp,
1.0,
))
elif self.mat.psychopath.surface_shader_type == 'GGX':
w.write("Type [GGX]\n")
if self.mat.psychopath.color_type == 'Rec709':
col = self.mat.psychopath.color
w.write("Color [rec709, %f %f %f]\n" % (
col[0], col[1], col[2],
))
elif self.mat.psychopath.color_type == 'Blackbody':
w.write("Color [blackbody, %f %f]\n" % (
self.mat.psychopath.color_blackbody_temp,
1.0,
))
elif self.mat.psychopath.color_type == 'ColorTemperature':
w.write("Color [color_temperature, %f %f]\n" % (
self.mat.psychopath.color_blackbody_temp,
1.0,
))
w.write("Roughness [%f]\n" % self.mat.psychopath.roughness)
w.write("Fresnel [%f]\n" % self.mat.psychopath.fresnel)
else:
raise "Unsupported surface shader type '%s'" % self.mat.psychopath.surface_shader_type
w.unindent()
w.write("}\n")
def cleanup(self):
pass

psychoblend/objects.py Normal file

@ -0,0 +1,206 @@
import bpy
from .util import escape_name, mat2str, needs_def_mb, needs_xform_mb, ExportCancelled
from mathutils import Vector, Matrix
def make_object_data_cache(render_engine, depsgraph, ob, name):
if ob.type == 'MESH':
return Mesh(render_engine, depsgraph, ob, name)
elif ob.type == 'LIGHT':
if ob.data.type == 'POINT':
return SphereLamp(render_engine, depsgraph, ob, name)
elif ob.data.type == 'AREA':
return RectLamp(render_engine, depsgraph, ob, name)
class Mesh:
""" Holds data for a mesh to be exported.
"""
def __init__(self, render_engine, depsgraph, ob, name):
self.name = name
self.material_name = None
if len(ob.material_slots) >= 1 and ob.material_slots[0].material != None:
self.material_name = ob.material_slots[0].material.name
self.is_subdiv = ob.data.psychopath.is_subdivision_surface
self.needs_mb = needs_def_mb(ob)
self.time_meshes = []
def take_sample(self, render_engine, depsgraph, ob, time):
if len(self.time_meshes) == 0 or self.needs_mb:
render_engine.update_stats("", "Psychopath: Collecting '{}' at time {}".format(self.name, time))
self.time_meshes += [ob.to_mesh(depsgraph=depsgraph).copy()]
def cleanup(self):
for mesh in self.time_meshes:
bpy.data.meshes.remove(mesh)
def export(self, render_engine, w):
render_engine.update_stats("", "Psychopath: Exporting %s" % self.name)
if self.is_subdiv == False:
# Exporting normal mesh
w.write("MeshSurface $%s {\n" % escape_name(self.name))
w.indent()
else:
# Exporting subdivision surface cage
w.write("SubdivisionSurface $%s {\n" % escape_name(self.name))
w.indent()
# Material bindings.
if self.material_name != None:
w.write("SurfaceShaderBind [${}]\n".format(escape_name(self.material_name)))
# Write vertices and (if it's smooth shaded) normals
for ti in range(len(self.time_meshes)):
w.write("Vertices [")
w.write(" ".join([("%f" % i) for vert in self.time_meshes[ti].vertices for i in vert.co]), False)
w.write("]\n", False)
if self.time_meshes[0].polygons[0].use_smooth and self.is_subdiv == False:
w.write("Normals [")
w.write(" ".join([("%f" % i) for vert in self.time_meshes[ti].vertices for i in vert.normal]), False)
w.write("]\n", False)
# Write face vertex counts
w.write("FaceVertCounts [")
w.write(" ".join([("%d" % len(p.vertices)) for p in self.time_meshes[0].polygons]), False)
w.write("]\n", False)
# Write face vertex indices
w.write("FaceVertIndices [")
w.write(" ".join([("%d"%v) for p in self.time_meshes[0].polygons for v in p.vertices]), False)
w.write("]\n", False)
# MeshSurface/SubdivisionSurface section end
w.unindent()
w.write("}\n")
class SphereLamp:
""" Holds data for a sphere light to be exported.
"""
def __init__(self, render_engine, depsgraph, ob, name):
self.name = name
self.time_col = []
self.time_rad = []
def take_sample(self, render_engine, depsgraph, ob, time):
render_engine.update_stats("", "Psychopath: Collecting '{}' at time {}".format(ob.name, time))
if ob.data.psychopath.color_type == 'Rec709':
self.time_col += [('Rec709', ob.data.color * ob.data.energy)]
elif ob.data.psychopath.color_type == 'Blackbody':
self.time_col += [('Blackbody', ob.data.psychopath.color_blackbody_temp, ob.data.energy)]
elif ob.data.psychopath.color_type == 'ColorTemperature':
self.time_col += [('ColorTemperature', ob.data.psychopath.color_blackbody_temp, ob.data.energy)]
self.time_rad += [ob.data.shadow_soft_size]
def cleanup(self):
pass
def export(self, render_engine, w):
render_engine.update_stats("", "Psychopath: Exporting %s" % self.name)
w.write("SphereLight $%s {\n" % escape_name(self.name))
w.indent()
for col in self.time_col:
if col[0] == 'Rec709':
w.write("Color [rec709, %f %f %f]\n" % (col[1][0], col[1][1], col[1][2]))
elif col[0] == 'Blackbody':
w.write("Color [blackbody, %f %f]\n" % (col[1], col[2]))
elif col[0] == 'ColorTemperature':
w.write("Color [color_temperature, %f %f]\n" % (col[1], col[2]))
for rad in self.time_rad:
w.write("Radius [%f]\n" % rad)
w.unindent()
w.write("}\n")
class RectLamp:
""" Holds data for a rectangular light to be exported.
"""
def __init__(self, render_engine, depsgraph, ob, name):
self.name = name
self.time_col = []
self.time_dim = []
def take_sample(self, render_engine, depsgraph, ob, time):
render_engine.update_stats("", "Psychopath: Collecting '{}' at time {}".format(self.name, time))
if ob.data.psychopath.color_type == 'Rec709':
self.time_col += [('Rec709', ob.data.color * ob.data.energy / 2)]
elif ob.data.psychopath.color_type == 'Blackbody':
self.time_col += [('Blackbody', ob.data.psychopath.color_blackbody_temp, ob.data.energy)]
elif ob.data.psychopath.color_type == 'ColorTemperature':
self.time_col += [('ColorTemperature', ob.data.psychopath.color_blackbody_temp, ob.data.energy)]
if ob.data.shape == 'RECTANGLE':
self.time_dim += [(ob.data.size, ob.data.size_y)]
else:
self.time_dim += [(ob.data.size, ob.data.size)]
def cleanup(self):
pass
def export(self, render_engine, w):
render_engine.update_stats("", "Psychopath: Exporting %s" % self.name)
w.write("RectangleLight $%s {\n" % escape_name(self.name))
w.indent()
for col in self.time_col:
if col[0] == 'Rec709':
w.write("Color [rec709, %f %f %f]\n" % (col[1][0], col[1][1], col[1][2]))
elif col[0] == 'Blackbody':
w.write("Color [blackbody, %f %f]\n" % (col[1], col[2]))
elif col[0] == 'ColorTemperature':
w.write("Color [color_temperature, %f %f]\n" % (col[1], col[2]))
for dim in self.time_dim:
w.write("Dimensions [%f %f]\n" % dim)
w.unindent()
w.write("}\n")
class DistantDiskLamp:
def __init__(self, render_engine, depsgraph, ob, name):
self.name = name
self.time_col = []
self.time_dir = []
self.time_rad = []
def take_sample(self, render_engine, depsgraph, ob, time):
render_engine.update_stats("", "Psychopath: Collecting '{}' at time {}".format(self.name, time))
self.time_dir += [tuple(ob.matrix_world.to_3x3() @ Vector((0, 0, -1)))]
if ob.data.psychopath.color_type == 'Rec709':
self.time_col += [('Rec709', ob.data.color * ob.data.energy)]
elif ob.data.psychopath.color_type == 'Blackbody':
self.time_col += [('Blackbody', ob.data.psychopath.color_blackbody_temp, ob.data.energy)]
elif ob.data.psychopath.color_type == 'ColorTemperature':
self.time_col += [('ColorTemperature', ob.data.psychopath.color_blackbody_temp, ob.data.energy)]
self.time_rad += [ob.data.shadow_soft_size]
def cleanup(self):
pass
def export(self, render_engine, w):
render_engine.update_stats("", "Psychopath: Exporting %s" % escape_name(self.name))
w.write("DistantDiskLight $%s {\n" % self.name)
w.indent()
for direc in self.time_dir:
w.write("Direction [%f %f %f]\n" % (direc[0], direc[1], direc[2]))
for col in self.time_col:
if col[0] == 'Rec709':
w.write("Color [rec709, %f %f %f]\n" % (col[1][0], col[1][1], col[1][2]))
elif col[0] == 'Blackbody':
w.write("Color [blackbody, %f %f]\n" % (col[1], col[2]))
elif col[0] == 'ColorTemperature':
w.write("Color [color_temperature, %f %f]\n" % (col[1], col[2]))
for rad in self.time_rad:
w.write("Radius [%f]\n" % rad)
w.unindent()
w.write("}\n")


@ -2,9 +2,11 @@ import bpy
from math import log
from .assembly import Assembly
from .material import Material
from .objects import make_object_data_cache, Mesh, DistantDiskLamp
from .util import escape_name, mat2str, ExportCancelled
from .world import World
from .world import World, Camera
from . import bl_info
class IndentedWriter:
@ -29,25 +31,43 @@ class IndentedWriter:
class PsychoExporter:
def __init__(self, f, render_engine, scene):
def __init__(self, f, render_engine, depsgraph):
self.w = IndentedWriter(f)
self.render_engine = render_engine
self.scene = scene
self.depsgraph = depsgraph
self.scene = depsgraph.scene
self.view_layer = depsgraph.view_layer
self.mesh_names = {}
self.group_names = {}
# For camera data.
res_x = int(self.scene.render.resolution_x * (self.scene.render.resolution_percentage / 100))
res_y = int(self.scene.render.resolution_y * (self.scene.render.resolution_percentage / 100))
self.camera = Camera(render_engine, depsgraph.scene.camera, float(res_x) / float(res_y))
# Motion blur segments are rounded down to a power of two
if scene.psychopath.motion_blur_segments > 0:
self.time_samples = (2**int(log(scene.psychopath.motion_blur_segments, 2))) + 1
# For world data.
self.world = World(render_engine, depsgraph)
# For all objects except sun lamps.
self.object_data = {} # name -> cached_data
self.instances = {} # instance_id -> [object_data_name, transform_list]
# For all sun lamps.
self.sun_lamp_data = {} # name -> cached_data
self.sun_lamp_instances = {} # instance_id -> [sun_lamp_data_name, transform_list]
# For all materials.
self.materials = {} # name -> cached_data
# Motion blur segments are rounded down to a power of two.
if self.scene.psychopath.motion_blur_segments > 0:
self.time_samples = (2**int(log(self.scene.psychopath.motion_blur_segments, 2))) + 1
else:
self.time_samples = 1
# pre-calculate useful values for exporting motion blur
self.shutter_start = scene.psychopath.shutter_start
self.shutter_diff = (scene.psychopath.shutter_end - scene.psychopath.shutter_start) / max(1, (self.time_samples-1))
# pre-calculate useful values for exporting motion blur.
self.shutter_start = self.scene.psychopath.shutter_start
self.shutter_diff = (self.scene.psychopath.shutter_end - self.scene.psychopath.shutter_start) / max(1, (self.time_samples-1))
self.fr = scene.frame_current
self.fr = self.scene.frame_current
def set_frame(self, frame, fraction):
@ -70,25 +90,31 @@ class PsychoExporter:
def _export_psy(self):
# Info
self.w.write("# Exported from Blender 2.7x\n")
self.w.write("# Exported from Blender {} with PsychoBlend {}.{}.{}\n".format(
bpy.app.version_string,
bl_info["version"][0],
bl_info["version"][1],
bl_info["version"][2],
))
# Scene begin
self.w.write("\n\nScene $%s_fr%d {\n" % (escape_name(self.scene.name), self.fr))
self.w.indent()
#######################
# Output section begin
#------------------------------------------------------
# Output section.
self.w.write("Output {\n")
self.w.indent()
self.w.write('Path [""]\n')
# Output section end
self.w.unindent()
self.w.write("}\n")
###############################
# RenderSettings section begin
#------------------------------------------------------
# RenderSettings section.
self.w.write("RenderSettings {\n")
self.w.indent()
@ -99,34 +125,145 @@ class PsychoExporter:
self.w.write("DicingRate [%f]\n" % self.scene.psychopath.dicing_rate)
self.w.write('Seed [%d]\n' % self.fr)
# RenderSettings section end
self.w.unindent()
self.w.write("}\n")
###############################
# Export world and object data
world = None
root_assembly = None
#------------------------------------------------------
# Collect materials.
# TODO: handle situations where there are more than one
# material with the same name. This can happen through
# library linking.
for inst in self.depsgraph.object_instances:
ob = inst.object
if ob.type in ['MESH']:
for ms in ob.material_slots:
if ms.material != None:
if ms.material.name not in self.materials:
self.materials[ms.material.name] = Material(self.render_engine, self.depsgraph, ms.material)
#------------------------------------------------------
# Collect world and object data.
try:
# Prep for data collection
world = World(self.render_engine, self.scene, self.scene.layers, float(res_x) / float(res_y))
root_assembly = Assembly(self.render_engine, self.scene.objects, self.scene.layers)
# Collect data for each time sample
for i in range(self.time_samples):
time = self.fr + self.shutter_start + (self.shutter_diff*i)
self.set_frame(self.fr, self.shutter_start + (self.shutter_diff*i))
world.take_sample(self.render_engine, self.scene, time)
root_assembly.take_sample(self.render_engine, self.scene, time)
# Check if render is cancelled
if self.render_engine.test_break():
raise ExportCancelled()
# Export collected data
world.export(self.render_engine, self.w)
root_assembly.export(self.render_engine, self.w)
finally:
if world != None:
world.cleanup()
if root_assembly != None:
root_assembly.cleanup()
subframe = self.shutter_start + (self.shutter_diff*i)
time = self.fr + subframe
self.depsgraph.scene.frame_set(self.fr, subframe=subframe)
self.depsgraph.update()
# Collect camera and world data.
self.camera.take_sample(self.render_engine, self.depsgraph, time)
self.world.take_sample(self.render_engine, self.depsgraph, time)
# Collect renderable objects.
collected_objs = set() # Names of the objects whose data has already been collected.
for inst in self.depsgraph.object_instances:
# Check if render is cancelled
if self.render_engine.test_break():
raise ExportCancelled()
if inst.object.type not in ['MESH', 'LIGHT']:
continue
# We use this a couple of times, so make a shorthand.
is_sun_lamp = inst.object.type == 'LIGHT' and inst.object.data.type == 'SUN'
# TODO: handle situations where there are more than one
# object with the same name. This can happen through
# library linking.
# Get a unique id for the instance. This is surprisingly
# tricky, because the instance's "persistent_id" property
# isn't globally unique, as I would have expected from
# the documentation.
id = None
if inst.is_instance:
id = (
hash((inst.object.name, inst.parent.name)),
# Has to be turned into a tuple, otherwise it doesn't
# work as part of the ID for some reason.
tuple(inst.persistent_id),
)
else:
id = inst.object.name
# Save the instance transforms.
if is_sun_lamp:
if id not in self.sun_lamp_instances:
self.sun_lamp_instances[id] = [inst.object.name, [inst.matrix_world.copy()]]
else:
self.sun_lamp_instances[id][1] += [inst.matrix_world.copy()]
else:
if id not in self.instances:
self.instances[id] = [inst.object.name, [inst.matrix_world.copy()]]
else:
self.instances[id][1] += [inst.matrix_world.copy()]
# Save the object data if it hasn't already been saved.
if inst.object.name not in collected_objs:
collected_objs.add(inst.object.name)
if is_sun_lamp:
if inst.object.name not in self.sun_lamp_data:
self.sun_lamp_data[inst.object.name] = DistantDiskLamp(self.render_engine, self.depsgraph, inst.object, inst.object.name)
self.sun_lamp_data[inst.object.name].take_sample(self.render_engine, self.depsgraph, inst.object, time)
else:
if inst.object.name not in self.object_data:
self.object_data[inst.object.name] = make_object_data_cache(self.render_engine, self.depsgraph, inst.object, inst.object.name)
self.object_data[inst.object.name].take_sample(self.render_engine, self.depsgraph, inst.object, time)
#------------------------------------------------------
# Export world and object data.
self.camera.export(self.render_engine, self.w)
self.world.export(self.render_engine, self.w)
self.w.write("Assembly {\n")
self.w.indent()
# Export materials.
for name in self.materials:
self.materials[name].export(self.render_engine, self.w)
# Export objects.
for name in self.object_data:
self.object_data[name].export(self.render_engine, self.w)
# Export instances.
for id in self.instances:
[obj_name, xforms] = self.instances[id]
self.render_engine.update_stats("", "Psychopath: Exporting %s instance" % obj_name)
prefix = str(hex(hash(id)))
name = "inst_{}__{}".format(prefix, escape_name(obj_name))
self.w.write("Instance {\n")
self.w.indent()
self.w.write("Data [${}]\n".format(escape_name(obj_name)))
for mat in xforms:
self.w.write("Transform [{}]\n".format(mat2str(mat)))
self.w.unindent()
self.w.write("}\n")
self.w.unindent()
self.w.write("}\n")
finally:
#------------------------------------------------------
# Cleanup collected data.
self.camera.cleanup()
self.world.cleanup()
for data in self.sun_lamp_data:
self.sun_lamp_data[data].cleanup()
for data in self.object_data:
self.object_data[data].cleanup()
# Scene end
self.w.unindent()


@ -9,11 +9,37 @@ from . import psy_export
class PsychopathRender(bpy.types.RenderEngine):
bl_idname = 'PSYCHOPATH_RENDER'
bl_label = "Psychopath"
DELAY = 1.0
bl_use_preview = False
def __init__(self):
pass
def __del__(self):
pass
def update(self, data, depsgraph):
pass
def render(self, depsgraph):
self._process = None
try:
self._render(depsgraph)
except:
if self._process != None:
self._process.terminate()
raise
def view_update(self, context, depsgraph):
pass
def view_draw(self, context, depsgraph):
pass
#----------------------------------------------------------
@staticmethod
def _locate_binary():
addon_prefs = bpy.context.user_preferences.addons[__package__].preferences
addon_prefs = bpy.context.preferences.addons[__package__].preferences
# Use the system preference if its set.
psy_binary = addon_prefs.filepath_psychopath
@ -23,7 +49,7 @@ class PsychopathRender(bpy.types.RenderEngine):
else:
print("User Preference to psychopath %r NOT FOUND, checking $PATH" % psy_binary)
# search the path all os's
# Search for the path.
psy_binary_default = "psychopath"
os_path_ls = os.getenv("PATH").split(':') + [""]
@ -45,13 +71,13 @@ class PsychopathRender(bpy.types.RenderEngine):
if crop != None:
args += ["--crop", str(crop[0]), str(self.size_y - crop[3]), str(crop[2] - 1), str(self.size_y - crop[1] - 1)]
if use_stdin:
args += ["--spb", str(scene.psychopath.max_samples_per_bucket), "--serialized_output", "--use_stdin"]
args += ["--bucket_size", str(scene.psychopath.bucket_size), "--serialized_output", "--use_stdin"]
else:
args += ["--spb", str(scene.psychopath.max_samples_per_bucket), "--serialized_output", "-i", psy_filepath]
args += ["--bucket_size", str(scene.psychopath.bucket_size), "--serialized_output", "-i", psy_filepath]
# Start Rendering!
try:
self._process = subprocess.Popen([psy_binary] + args, bufsize=1, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
self._process = subprocess.Popen([psy_binary] + args, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
except OSError:
# TODO, report api
print("Psychopath: could not execute '%s'" % psy_binary)
@ -73,28 +99,21 @@ class PsychopathRender(bpy.types.RenderEngine):
height = bucket_info[3] - bucket_info[1]
# Decode pixel data
pixels = [p for p in struct.iter_unpack("ffff", base64.b64decode(pixels_encoded))]
pixels_flipped = []
pixels_unpacked = [p for p in struct.iter_unpack("ffff", base64.b64decode(pixels_encoded))]
pixels = []
for i in range(height):
n = height - i - 1
pixels_flipped += pixels[n*width:(n+1)*width]
pixels += pixels_unpacked[n*width:(n+1)*width]
# Write pixel data to render image
result = self.begin_result(x, y, width, height)
lay = result.layers[0].passes["Combined"]
lay.rect = pixels_flipped
lay.rect = pixels
self.end_result(result)
def render(self, scene):
self._process = None
try:
self._render(scene)
except:
if self._process != None:
self._process.terminate()
raise
def _render(self, depsgraph):
scene = depsgraph.scene
def _render(self, scene):
# has to be called to update the frame on exporting animations
scene.frame_set(scene.frame_current)
@ -130,8 +149,8 @@ class PsychopathRender(bpy.types.RenderEngine):
return
self.update_stats("", "Psychopath: Collecting...")
# Export to Psychopath's stdin
if not psy_export.PsychoExporter(self._process.stdin, self, scene).export_psy():
# Export to Psychopath's stdin.
if not psy_export.PsychoExporter(self._process.stdin, self, depsgraph).export_psy():
# Render cancelled in the middle of exporting,
# so just return.
self._process.terminate()
@ -142,7 +161,7 @@ class PsychopathRender(bpy.types.RenderEngine):
# Export to file
self.update_stats("", "Psychopath: Exporting data from Blender")
with open(export_path, 'w+b') as f:
if not psy_export.PsychoExporter(f, self, scene).export_psy():
if not psy_export.PsychoExporter(f, self, depsgraph).export_psy():
# Render cancelled in the middle of exporting,
# so just return.
return
@ -183,7 +202,7 @@ class PsychopathRender(bpy.types.RenderEngine):
# Get render output from stdin
tmp = self._process.stdout.read1(2**16)
if len(tmp) == 0:
time.sleep(0.0001) # Don't spin on the CPU
time.sleep(0.001) # Don't spin on the CPU
if render_process_finished:
all_output_consumed = True
continue

View File

@ -1,18 +1,291 @@
import bpy
# Use some of the existing buttons.
from bl_ui import properties_render
properties_render.RENDER_PT_render.COMPAT_ENGINES.add('PSYCHOPATH_RENDER')
properties_render.RENDER_PT_dimensions.COMPAT_ENGINES.add('PSYCHOPATH_RENDER')
properties_render.RENDER_PT_output.COMPAT_ENGINES.add('PSYCHOPATH_RENDER')
del properties_render
#--------------------------------------------------------------
# Specify which existing Blender UI panels Psychopath
# uses/is compatible with.
from bl_ui import properties_data_camera
properties_data_camera.DATA_PT_lens.COMPAT_ENGINES.add('PSYCHOPATH_RENDER')
properties_data_camera.DATA_PT_camera.COMPAT_ENGINES.add('PSYCHOPATH_RENDER')
properties_data_camera.DATA_PT_camera_display.COMPAT_ENGINES.add('PSYCHOPATH_RENDER')
properties_data_camera.DATA_PT_custom_props_camera.COMPAT_ENGINES.add('PSYCHOPATH_RENDER')
del properties_data_camera
import bl_ui
def register_engine_with_panels(area, list):
# TODO: reverse this, so we're checking if the list item is
# in the real panels, and throw an error if it's not. That
# way things don't just silently fail.
for p in list:
eval("bl_ui.{}.{}.COMPAT_ENGINES.add('PSYCHOPATH_RENDER')".format(area, p))
register_engine_with_panels(
"properties_render",
[
"RENDER_PT_color_management",
"RENDER_PT_color_management_curves",
]
)
register_engine_with_panels(
"properties_output",
[
"RENDER_PT_encoding",
"RENDER_PT_encoding_audio",
"RENDER_PT_encoding_video",
"RENDER_PT_format",
"RENDER_PT_frame_range",
"RENDER_PT_output",
"RENDER_PT_output_views",
"RENDER_PT_post_processing",
"RENDER_PT_stamp",
"RENDER_PT_stamp_burn",
"RENDER_PT_stamp_note",
# "RENDER_PT_stereoscopy",
"RENDER_PT_time_stretching",
]
)
register_engine_with_panels(
"properties_view_layer",
[
"VIEWLAYER_PT_layer",
]
)
register_engine_with_panels(
"properties_data_camera",
[
"DATA_PT_context_camera",
"DATA_PT_custom_props_camera",
"DATA_PT_camera",
"DATA_PT_lens",
"DATA_PT_camera_dof",
"DATA_PT_camera_dof_aperture",
"DATA_PT_camera_display",
"DATA_PT_camera_display_composition_guides",
"DATA_PT_camera_safe_areas",
"DATA_PT_camera_safe_areas_center_cut",
"DATA_PT_camera_background_image",
]
)
register_engine_with_panels(
"properties_data_mesh",
[
"DATA_PT_context_mesh",
"DATA_PT_custom_props_mesh",
"DATA_PT_customdata",
"DATA_PT_face_maps",
"DATA_PT_mesh_attributes",
"DATA_PT_normals",
"DATA_PT_remesh",
"DATA_PT_shape_keys",
"DATA_PT_texture_space",
"DATA_PT_uv_texture",
"DATA_PT_vertex_colors",
"DATA_PT_vertex_groups",
]
)
register_engine_with_panels(
"properties_particle",
[
"PARTICLE_MT_context_menu",
"PARTICLE_PT_boidbrain",
"PARTICLE_PT_cache",
"PARTICLE_PT_children",
"PARTICLE_PT_children_clumping",
"PARTICLE_PT_children_clumping_noise",
"PARTICLE_PT_children_kink",
"PARTICLE_PT_children_parting",
"PARTICLE_PT_children_roughness",
"PARTICLE_PT_context_particles",
"PARTICLE_PT_custom_props",
"PARTICLE_PT_draw",
"PARTICLE_PT_emission",
"PARTICLE_PT_emission_source",
"PARTICLE_PT_field_weights",
"PARTICLE_PT_force_fields",
"PARTICLE_PT_force_fields_type1",
"PARTICLE_PT_force_fields_type1_falloff",
"PARTICLE_PT_force_fields_type2",
"PARTICLE_PT_force_fields_type2_falloff",
"PARTICLE_PT_hair_dynamics",
"PARTICLE_PT_hair_dynamics_collision",
"PARTICLE_PT_hair_dynamics_presets",
"PARTICLE_PT_hair_dynamics_structure",
"PARTICLE_PT_hair_dynamics_volume",
"PARTICLE_PT_hair_shape",
"PARTICLE_PT_physics",
"PARTICLE_PT_physics_boids_battle",
"PARTICLE_PT_physics_boids_misc",
"PARTICLE_PT_physics_boids_movement",
"PARTICLE_PT_physics_deflection",
"PARTICLE_PT_physics_fluid_advanced",
"PARTICLE_PT_physics_fluid_interaction",
"PARTICLE_PT_physics_fluid_springs",
"PARTICLE_PT_physics_fluid_springs_advanced",
"PARTICLE_PT_physics_fluid_springs_viscoelastic",
"PARTICLE_PT_physics_forces",
"PARTICLE_PT_physics_integration",
"PARTICLE_PT_physics_relations",
"PARTICLE_PT_render",
"PARTICLE_PT_render_collection",
"PARTICLE_PT_render_collection_use_count",
"PARTICLE_PT_render_extra",
"PARTICLE_PT_render_object",
"PARTICLE_PT_render_path",
"PARTICLE_PT_render_path_timing",
"PARTICLE_PT_rotation",
"PARTICLE_PT_rotation_angular_velocity",
"PARTICLE_PT_textures",
"PARTICLE_PT_velocity",
"PARTICLE_PT_vertexgroups",
]
)
# Physics.
# (Why these require renderer compatibility settings
# is beyond me. But they do.)
register_engine_with_panels(
"properties_physics_cloth",
[
"PHYSICS_PT_cloth",
"PHYSICS_PT_cloth_cache",
"PHYSICS_PT_cloth_collision",
"PHYSICS_PT_cloth_damping",
"PHYSICS_PT_cloth_field_weights",
"PHYSICS_PT_cloth_internal_springs",
"PHYSICS_PT_cloth_object_collision",
"PHYSICS_PT_cloth_physical_properties",
"PHYSICS_PT_cloth_pressure",
"PHYSICS_PT_cloth_property_weights",
"PHYSICS_PT_cloth_self_collision",
"PHYSICS_PT_cloth_shape",
"PHYSICS_PT_cloth_stiffness",
],
)
register_engine_with_panels(
"properties_physics_common",
[
"PHYSICS_PT_add",
]
)
register_engine_with_panels(
"properties_physics_dynamicpaint",
[
"PHYSICS_PT_dp_brush_source",
"PHYSICS_PT_dp_brush_source_color_ramp",
"PHYSICS_PT_dp_brush_velocity",
"PHYSICS_PT_dp_brush_velocity_color_ramp",
"PHYSICS_PT_dp_brush_velocity_smudge",
"PHYSICS_PT_dp_brush_wave",
"PHYSICS_PT_dp_cache",
"PHYSICS_PT_dp_canvas_initial_color",
"PHYSICS_PT_dp_canvas_output",
"PHYSICS_PT_dp_canvas_output_paintmaps",
"PHYSICS_PT_dp_canvas_output_wetmaps",
"PHYSICS_PT_dp_effects",
"PHYSICS_PT_dp_effects_drip",
"PHYSICS_PT_dp_effects_drip_weights",
"PHYSICS_PT_dp_effects_shrink",
"PHYSICS_PT_dp_effects_spread",
"PHYSICS_PT_dp_surface_canvas",
"PHYSICS_PT_dp_surface_canvas_paint_dissolve",
"PHYSICS_PT_dp_surface_canvas_paint_dry",
"PHYSICS_PT_dynamic_paint",
"PHYSICS_PT_dynamic_paint_settings",
]
)
register_engine_with_panels(
"properties_physics_field",
[
"PHYSICS_PT_collision",
"PHYSICS_PT_collision_particle",
"PHYSICS_PT_collision_softbody",
"PHYSICS_PT_field",
"PHYSICS_PT_field_falloff",
"PHYSICS_PT_field_falloff_angular",
"PHYSICS_PT_field_falloff_radial",
"PHYSICS_PT_field_settings",
"PHYSICS_PT_field_settings_kink",
"PHYSICS_PT_field_settings_texture_select",
]
)
register_engine_with_panels(
"properties_physics_fluid",
[
"PHYSICS_PT_adaptive_domain",
"PHYSICS_PT_borders",
"PHYSICS_PT_cache",
"PHYSICS_PT_collections",
"PHYSICS_PT_diffusion",
"PHYSICS_PT_export",
"PHYSICS_PT_field_weights",
"PHYSICS_PT_fire",
"PHYSICS_PT_flow_initial_velocity",
"PHYSICS_PT_flow_source",
"PHYSICS_PT_flow_texture",
"PHYSICS_PT_fluid",
"PHYSICS_PT_guide",
"PHYSICS_PT_liquid",
"PHYSICS_PT_mesh",
"PHYSICS_PT_noise",
"PHYSICS_PT_particles",
"PHYSICS_PT_settings",
"PHYSICS_PT_smoke",
"PHYSICS_PT_smoke_dissolve",
"PHYSICS_PT_viscosity",
]
)
register_engine_with_panels(
"properties_physics_rigidbody",
[
"PHYSICS_PT_rigid_body",
"PHYSICS_PT_rigid_body_collisions",
"PHYSICS_PT_rigid_body_collisions_collections",
"PHYSICS_PT_rigid_body_collisions_sensitivity",
"PHYSICS_PT_rigid_body_collisions_surface",
"PHYSICS_PT_rigid_body_dynamics",
"PHYSICS_PT_rigid_body_dynamics_deactivation",
"PHYSICS_PT_rigid_body_settings",
]
)
register_engine_with_panels(
"properties_physics_rigidbody_constraint",
[
"PHYSICS_PT_rigid_body_constraint",
"PHYSICS_PT_rigid_body_constraint_limits",
"PHYSICS_PT_rigid_body_constraint_limits_angular",
"PHYSICS_PT_rigid_body_constraint_limits_linear",
"PHYSICS_PT_rigid_body_constraint_motor",
"PHYSICS_PT_rigid_body_constraint_motor_angular",
"PHYSICS_PT_rigid_body_constraint_motor_linear",
"PHYSICS_PT_rigid_body_constraint_objects",
"PHYSICS_PT_rigid_body_constraint_override_iterations",
"PHYSICS_PT_rigid_body_constraint_settings",
"PHYSICS_PT_rigid_body_constraint_springs",
"PHYSICS_PT_rigid_body_constraint_springs_angular",
"PHYSICS_PT_rigid_body_constraint_springs_linear",
]
)
register_engine_with_panels(
"properties_physics_softbody",
[
"PHYSICS_PT_softbody",
"PHYSICS_PT_softbody_cache",
"PHYSICS_PT_softbody_collision",
"PHYSICS_PT_softbody_edge",
"PHYSICS_PT_softbody_edge_aerodynamics",
"PHYSICS_PT_softbody_edge_stiffness",
"PHYSICS_PT_softbody_field_weights",
"PHYSICS_PT_softbody_goal",
"PHYSICS_PT_softbody_goal_settings",
"PHYSICS_PT_softbody_goal_strengths",
"PHYSICS_PT_softbody_object",
"PHYSICS_PT_softbody_simulation",
"PHYSICS_PT_softbody_solver",
"PHYSICS_PT_softbody_solver_diagnostics",
"PHYSICS_PT_softbody_solver_helpers",
]
)
#--------------------------------------------------------------
class PsychopathPanel():
COMPAT_ENGINES = {'PSYCHOPATH_RENDER'}
@ -20,7 +293,7 @@ class PsychopathPanel():
@classmethod
def poll(cls, context):
rd = context.scene.render
return (rd.use_game_engine is False) and (rd.engine in cls.COMPAT_ENGINES)
return rd.engine in cls.COMPAT_ENGINES
class RENDER_PT_psychopath_render_settings(PsychopathPanel, bpy.types.Panel):
@ -47,7 +320,7 @@ class RENDER_PT_psychopath_render_settings(PsychopathPanel, bpy.types.Panel):
col.prop(scene.psychopath, "shutter_end")
col.label(text="Performance")
col.prop(scene.psychopath, "max_samples_per_bucket")
col.prop(scene.psychopath, "bucket_size")
class RENDER_PT_psychopath_export_settings(PsychopathPanel, bpy.types.Panel):
@ -78,29 +351,7 @@ class WORLD_PT_psychopath_background(PsychopathPanel, bpy.types.Panel):
layout = self.layout
world = context.world
layout.prop(world, "horizon_color", text="Color")
class DATA_PT_psychopath_camera_dof(PsychopathPanel, bpy.types.Panel):
bl_label = "Depth of Field"
bl_space_type = 'PROPERTIES'
bl_region_type = 'WINDOW'
bl_context = "data"
@classmethod
def poll(cls, context):
engine = context.scene.render.engine
return context.camera and PsychopathPanel.poll(context)
def draw(self, context):
ob = context.active_object
layout = self.layout
col = layout.column()
col.prop(ob.data, "dof_object")
col.prop(ob.data, "dof_distance")
col.prop(ob.data.psychopath, "aperture_radius")
layout.prop(world.psychopath, "background_color", text="Color")
class DATA_PT_psychopath_lamp(PsychopathPanel, bpy.types.Panel):
@ -112,7 +363,7 @@ class DATA_PT_psychopath_lamp(PsychopathPanel, bpy.types.Panel):
@classmethod
def poll(cls, context):
engine = context.scene.render.engine
return context.lamp and PsychopathPanel.poll(context)
return context.active_object.type == 'LIGHT' and PsychopathPanel.poll(context)
def draw(self, context):
ob = context.active_object
@ -144,22 +395,23 @@ class DATA_PT_psychopath_area_lamp(PsychopathPanel, bpy.types.Panel):
@classmethod
def poll(cls, context):
lamp = context.lamp
engine = context.scene.render.engine
return (lamp and lamp.type == 'AREA') and (engine in cls.COMPAT_ENGINES)
return context.active_object.type == 'LIGHT' \
and context.active_object.data.type == 'AREA' \
and (engine in cls.COMPAT_ENGINES)
def draw(self, context):
layout = self.layout
lamp = context.lamp
lamp = context.active_object.data
col = layout.column()
col.row().prop(lamp, "shape", expand=True)
sub = col.row(align=True)
if lamp.shape == 'SQUARE':
if lamp.shape == 'SQUARE' or lamp.shape == 'DISK':
sub.prop(lamp, "size")
elif lamp.shape == 'RECTANGLE':
elif lamp.shape == 'RECTANGLE' or lamp.shape == 'ELLIPSE':
sub.prop(lamp, "size", text="Size X")
sub.prop(lamp, "size_y", text="Size Y")
@ -208,10 +460,10 @@ class MATERIAL_PT_psychopath_context_material(PsychopathPanel, bpy.types.Panel):
row.template_list("MATERIAL_UL_matslots", "", ob, "material_slots", ob, "active_material_index", rows=1)
col = row.column(align=True)
col.operator("object.material_slot_add", icon='ZOOMIN', text="")
col.operator("object.material_slot_remove", icon='ZOOMOUT', text="")
col.operator("object.material_slot_add", icon='ADD', text="")
col.operator("object.material_slot_remove", icon='REMOVE', text="")
col.menu("MATERIAL_MT_specials", icon='DOWNARROW_HLT', text="")
col.menu("MATERIAL_MT_context_menu", icon='DOWNARROW_HLT', text="")
if ob.mode == 'EDIT':
row = layout.row(align=True)
@ -219,7 +471,7 @@ class MATERIAL_PT_psychopath_context_material(PsychopathPanel, bpy.types.Panel):
row.operator("object.material_slot_select", text="Select")
row.operator("object.material_slot_deselect", text="Deselect")
split = layout.split(percentage=0.65)
split = layout.split(factor=0.65)
if ob:
split.template_ID(ob, "active_material", new="material.new")
@ -271,7 +523,6 @@ def register():
bpy.utils.register_class(RENDER_PT_psychopath_render_settings)
bpy.utils.register_class(RENDER_PT_psychopath_export_settings)
bpy.utils.register_class(WORLD_PT_psychopath_background)
bpy.utils.register_class(DATA_PT_psychopath_camera_dof)
bpy.utils.register_class(DATA_PT_psychopath_mesh)
bpy.utils.register_class(DATA_PT_psychopath_lamp)
bpy.utils.register_class(DATA_PT_psychopath_area_lamp)
@ -282,8 +533,7 @@ def unregister():
bpy.utils.unregister_class(RENDER_PT_psychopath_render_settings)
bpy.utils.unregister_class(RENDER_PT_psychopath_export_settings)
bpy.utils.unregister_class(WORLD_PT_psychopath_background)
bpy.utils.unregister_class(DATA_PT_psychopath_camera_dof)
bpy.utils.register_class(DATA_PT_psychopath_mesh)
bpy.utils.unregister_class(DATA_PT_psychopath_mesh)
bpy.utils.unregister_class(DATA_PT_psychopath_lamp)
bpy.utils.unregister_class(DATA_PT_psychopath_area_lamp)
bpy.utils.unregister_class(MATERIAL_PT_psychopath_context_material)

View File

@ -1,52 +1,33 @@
import bpy
from math import degrees, tan, atan
from math import degrees, sin, asin, tan, atan
from mathutils import Vector, Matrix
from .util import escape_name, mat2str, ExportCancelled
class World:
def __init__(self, render_engine, scene, visible_layers, aspect_ratio):
def __init__(self, render_engine, depsgraph):
scene = depsgraph.scene
self.background_shader = BackgroundShader(render_engine, scene.world)
self.camera = Camera(render_engine, scene.camera, aspect_ratio)
self.lights = []
# Collect infinite-extent light sources.
# TODO: also get sun lamps inside group instances.
for ob in scene.objects:
if ob.type == 'LAMP' and ob.data.type == 'SUN':
name = escape_name(ob.name)
self.lights += [DistantDiskLamp(ob, name)]
def take_sample(self, render_engine, scene, time):
self.camera.take_sample(render_engine, scene, time)
def take_sample(self, render_engine, depsgraph, time):
if render_engine.test_break():
raise ExportCancelled()
self.background_shader.take_sample(render_engine, depsgraph, time)
for light in self.lights:
# Check if render is cancelled
if render_engine.test_break():
raise ExportCancelled()
light.take_sample(render_engine, scene, time)
def cleanup(self):
pass
def export(self, render_engine, w):
self.camera.export(render_engine, w)
w.write("World {\n")
w.indent()
self.background_shader.export(render_engine, w)
for light in self.lights:
light.export(render_engine, w)
w.unindent()
w.write("}\n")
def cleanup(self):
# For future use. This is run by the calling code when finished,
# even if export did not succeed.
pass
#================================================================
class Camera:
def __init__(self, render_engine, ob, aspect_ratio):
@ -58,31 +39,43 @@ class Camera:
self.focal_distances = []
self.xforms = []
def take_sample(self, render_engine, scene, time):
def take_sample(self, render_engine, depsgraph, time):
render_engine.update_stats("", "Psychopath: Collecting '{}' at time {}".format(self.ob.name, time))
# Fov
if self.aspect_ratio >= 1.0:
self.fovs += [degrees(self.ob.data.angle)]
# Fov.
# TODO: account for the various ways sensor size can be specified.
x_extent = depsgraph.scene.render.resolution_x / depsgraph.scene.render.pixel_aspect_x
y_extent = depsgraph.scene.render.resolution_y / depsgraph.scene.render.pixel_aspect_y
aspect_ratio = x_extent / y_extent
if aspect_ratio >= 1.0:
self.fovs += [degrees(self.ob.data.angle_x)]
else:
self.fovs += [degrees(2.0 * atan(tan(self.ob.data.angle * 0.5) * self.aspect_ratio))]
self.fovs += [degrees(2.0 * atan(tan(self.ob.data.angle_x * 0.5) * aspect_ratio))]
# Aperture radius
self.aperture_radii += [self.ob.data.psychopath.aperture_radius]
if self.ob.data.dof.use_dof:
# Aperture radius.
radius = self.ob.data.lens / 2000.0 / self.ob.data.dof.aperture_fstop
self.aperture_radii += [radius]
# Dof distance
if self.ob.data.dof_object == None:
self.focal_distances += [self.ob.data.dof_distance]
# Dof distance
if self.ob.data.dof.focus_object == None:
self.focal_distances += [self.ob.data.dof.focus_distance]
else:
# TODO: implement DoF object tracking here
self.focal_distances += [0.0]
print("WARNING: DoF object tracking not yet implemented.")
else:
# TODO: implement DoF object tracking here
self.focal_distances += [0.0]
print("WARNING: DoF object tracking not yet implemented.")
self.aperture_radii += [0.0]
self.focal_distances += [1.0]
# Transform
mat = self.ob.matrix_world.copy()
matz = Matrix()
matz[2][2] = -1
self.xforms += [mat * matz]
self.xforms += [(mat @ matz).inverted()]
def cleanup(self):
pass
def export(self, render_engine, w):
render_engine.update_stats("", "Psychopath: Exporting %s" % self.ob.name)
@ -108,55 +101,24 @@ class Camera:
class BackgroundShader:
def __init__(self, render_engine, world):
self.world = world
self.color = []
def take_sample(self, render_engine, depsgraph, time):
if self.world != None:
self.color = (world.horizon_color[0], world.horizon_color[1], world.horizon_color[2])
self.color += [(
self.world.psychopath.background_color[0],
self.world.psychopath.background_color[1],
self.world.psychopath.background_color[2],
)]
def export(self, render_engine, w):
if self.world != None:
w.write("BackgroundShader {\n")
w.indent()
w.write("Type [Color]\n")
w.write("Color [rec709, %f %f %f]\n" % self.color)
for c in self.color:
w.write("Color [rec709, %f %f %f]\n" % c)
w.unindent()
w.write("}\n")
class DistantDiskLamp:
def __init__(self, ob, name):
self.ob = ob
self.name = name
self.time_col = []
self.time_dir = []
self.time_rad = []
def take_sample(self, render_engine, scene, time):
render_engine.update_stats("", "Psychopath: Collecting '{}' at time {}".format(self.ob.name, time))
self.time_dir += [tuple(self.ob.matrix_world.to_3x3() * Vector((0, 0, -1)))]
if self.ob.data.psychopath.color_type == 'Rec709':
self.time_col += [('Rec709', self.ob.data.color * self.ob.data.energy)]
elif self.ob.data.psychopath.color_type == 'Blackbody':
self.time_col += [('Blackbody', self.ob.data.psychopath.color_blackbody_temp, self.ob.data.energy)]
elif self.ob.data.psychopath.color_type == 'ColorTemperature':
self.time_col += [('ColorTemperature', self.ob.data.psychopath.color_blackbody_temp, self.ob.data.energy)]
self.time_rad += [self.ob.data.shadow_soft_size]
def export(self, render_engine, w):
render_engine.update_stats("", "Psychopath: Exporting %s" % self.ob.name)
w.write("DistantDiskLight $%s {\n" % self.name)
w.indent()
for direc in self.time_dir:
w.write("Direction [%f %f %f]\n" % (direc[0], direc[1], direc[2]))
for col in self.time_col:
if col[0] == 'Rec709':
w.write("Color [rec709, %f %f %f]\n" % (col[1][0], col[1][1], col[1][2]))
elif col[0] == 'Blackbody':
w.write("Color [blackbody, %f %f]\n" % (col[1], col[2]))
elif col[0] == 'ColorTemperature':
w.write("Color [color_temperature, %f %f]\n" % (col[1], col[2]))
for rad in self.time_rad:
w.write("Radius [%f]\n" % rad)
w.unindent()
w.write("}\n")

View File

@ -67,7 +67,91 @@ impl<'a> BVH<'a> {
self.depth
}
pub fn traverse<T, F>(&self, rays: &mut [AccelRay], objects: &[T], mut obj_ray_test: F)
pub fn traverse<T, F>(&self, ray: &mut AccelRay, objects: &[T], mut obj_ray_test: F)
where
F: FnMut(&T, &mut AccelRay),
{
if self.root.is_none() {
return;
}
let mut timer = Timer::new();
let mut trav_time: f64 = 0.0;
let mut node_tests: u64 = 0;
let ray_sign = [
ray.dir_inv.x() >= 0.0,
ray.dir_inv.y() >= 0.0,
ray.dir_inv.z() >= 0.0,
];
// +2 of max depth for root and last child
let mut node_stack = [self.root.unwrap(); BVH_MAX_DEPTH + 2];
let mut stack_ptr = 1;
while stack_ptr > 0 && !ray.is_done {
node_tests += 1;
match *node_stack[stack_ptr] {
BVHNode::Internal {
children,
bounds_start,
bounds_len,
split_axis,
} => {
let bounds =
unsafe { std::slice::from_raw_parts(bounds_start, bounds_len as usize) };
let is_hit = lerp_slice(bounds, ray.time).intersect_accel_ray(&ray);
if is_hit {
if ray_sign[split_axis as usize] {
node_stack[stack_ptr] = children.1;
node_stack[stack_ptr + 1] = children.0;
} else {
node_stack[stack_ptr] = children.0;
node_stack[stack_ptr + 1] = children.1;
}
stack_ptr += 1;
} else {
stack_ptr -= 1;
}
}
BVHNode::Leaf {
object_range,
bounds_start,
bounds_len,
} => {
let bounds =
unsafe { std::slice::from_raw_parts(bounds_start, bounds_len as usize) };
let is_hit = lerp_slice(bounds, ray.time).intersect_accel_ray(&ray);
trav_time += timer.tick() as f64;
if is_hit {
for obj in &objects[object_range.0..object_range.1] {
obj_ray_test(obj, ray);
}
}
timer.tick();
stack_ptr -= 1;
}
}
}
trav_time += timer.tick() as f64;
ACCEL_TRAV_TIME.with(|att| {
let v = att.get();
att.set(v + trav_time);
});
ACCEL_NODE_RAY_TESTS.with(|anv| {
let v = anv.get();
anv.set(v + node_tests);
});
}
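A hypothetical call site for the single-ray traverse above (the `bvh`, `objects`, and `narrow_phase_hit_test` names are assumptions; the closure matches the `FnMut(&T, &mut AccelRay)` bound shown in the signature):

// Run the per-object test on everything in each leaf the ray reaches; the
// closure is expected to shorten the ray or mark it done so that later
// nodes can be culled by the `!ray.is_done` check.
bvh.traverse(&mut accel_ray, &objects, |obj, r| {
    narrow_phase_hit_test(obj, r);
});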
pub fn traverse_multi<T, F>(&self, rays: &mut [AccelRay], objects: &[T], mut obj_ray_test: F)
where
F: FnMut(&T, &mut [AccelRay]),
{

View File

@ -1,13 +1,7 @@
//! This BVH4 implementation is based on the ideas from the paper
//! "Efficient Ray Tracing Kernels for Modern CPU Architectures"
//! by Fuetterling et al.
#![allow(dead_code)]
use std::mem::{transmute, MaybeUninit};
use glam::BVec4A;
use kioku::Arena;
use crate::{
@ -16,7 +10,7 @@ use crate::{
boundable::Boundable,
lerp::lerp_slice,
math::Vector,
ray::{RayBatch, RayStack},
ray::{LocalRay, Ray},
};
use super::{
@ -25,6 +19,7 @@ use super::{
};
use bvh_order::{calc_traversal_code, SplitAxes, TRAVERSAL_TABLE};
use rmath::wide4::Float4;
pub fn ray_code(dir: Vector) -> usize {
let ray_sign_is_neg = [dir.x() < 0.0, dir.y() < 0.0, dir.z() < 0.0];
@ -33,6 +28,8 @@ pub fn ray_code(dir: Vector) -> usize {
+ ((ray_sign_is_neg[2] as usize) << 2)
}
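A quick worked example of ray_code above, assuming the lines elided from this hunk pack the x and y sign bits into bits 0 and 1 (matching the shown z term):

// dir = (-1.0, 2.0, -3.0): x and z are negative, y is not, so bits 0 and 2
// are set and the code is 1 + 0 + 4 = 5.
let code = ray_code(Vector::new(-1.0, 2.0, -3.0)); // == 5 under that assumption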
//-------------------------------------------------------------
#[derive(Copy, Clone, Debug)]
pub struct BVH4<'a> {
root: Option<&'a BVH4Node<'a>>,
@ -98,9 +95,9 @@ impl<'a> BVH4<'a> {
self.depth
}
pub fn traverse<F>(&self, rays: &mut RayBatch, ray_stack: &mut RayStack, mut obj_ray_test: F)
pub fn traverse<F>(&self, ray: &mut Ray, local_ray: &LocalRay, mut obj_ray_test: F)
where
F: FnMut(std::ops::Range<usize>, &mut RayBatch, &mut RayStack),
F: FnMut(std::ops::Range<usize>, &mut Ray),
{
if self.root.is_none() {
return;
@ -108,55 +105,48 @@ impl<'a> BVH4<'a> {
let mut node_tests: u64 = 0;
let traversal_table =
&TRAVERSAL_TABLE[ray_code(rays.dir_inv_local(ray_stack.next_task_ray_idx(0)))];
// SIMD-ready ray data: each axis of the origin and inverse direction is
// broadcast across all four lanes, so a single BBox4 test covers all four
// children of a node at once.
let orig4 = [
local_ray.orig.0.aaaa(),
local_ray.orig.0.bbbb(),
local_ray.orig.0.cccc(),
];
let dir_inv4 = [
local_ray.dir_inv.0.aaaa(),
local_ray.dir_inv.0.bbbb(),
local_ray.dir_inv.0.cccc(),
];
let mut max_t4 = Float4::splat(ray.max_t);
// +2 of max depth for root and last child
let mut node_stack = [self.root.unwrap(); (BVH_MAX_DEPTH * 3) + 2];
let mut stack_ptr = 1;
while stack_ptr > 0 {
let traversal_table = &TRAVERSAL_TABLE[ray_code(local_ray.dir_inv)];
while stack_ptr > 0 && !ray.is_done() {
match *node_stack[stack_ptr] {
BVH4Node::Internal {
bounds,
children,
traversal_code,
} => {
node_tests += ray_stack.ray_count_in_next_task() as u64;
let mut all_hits = BVec4A::default();
node_tests += 1;
// Ray testing
ray_stack.pop_do_next_task_and_push_rays(children.len(), |ray_idx| {
if rays.is_done(ray_idx) {
BVec4A::default()
} else {
let hits = if bounds.len() == 1 {
bounds[0].intersect_ray(
rays.orig_local(ray_idx),
rays.dir_inv_local(ray_idx),
rays.max_t(ray_idx),
)
} else {
lerp_slice(bounds, rays.time(ray_idx)).intersect_ray(
rays.orig_local(ray_idx),
rays.dir_inv_local(ray_idx),
rays.max_t(ray_idx),
)
};
all_hits |= hits;
hits
}
});
let hits = if bounds.len() == 1 {
bounds[0].intersect_ray(orig4, dir_inv4, max_t4)
} else {
lerp_slice(bounds, ray.time).intersect_ray(orig4, dir_inv4, max_t4)
};
// If there were any intersections, create tasks.
if all_hits.any() {
// Push child nodes onto the stack if there were any hits.
if hits.any() {
let order_code = traversal_table[traversal_code as usize];
let hits = hits.to_bools();
let mut lane_count = 0;
let mut i = children.len() as u8;
while i > 0 {
i -= 1;
for i in (0..children.len() as u8).rev() {
let child_i = ((order_code >> (i * 2)) & 3) as usize;
if ray_stack.push_lane_to_task(child_i) {
if hits[child_i] {
node_stack[stack_ptr + lane_count] = &children[child_i];
lane_count += 1;
}
@ -169,8 +159,10 @@ impl<'a> BVH4<'a> {
}
BVH4Node::Leaf { object_range } => {
// Do the ray tests.
obj_ray_test(object_range.0..object_range.1, rays, ray_stack);
obj_ray_test(object_range.0..object_range.1, ray);
// Update SIMD max_t in case there was a hit.
max_t4 = Float4::splat(ray.max_t);
stack_ptr -= 1;
}

View File

@ -5,10 +5,9 @@ use std::{
mem::MaybeUninit,
};
use crate::{
hash::hash_u64,
lerp::{lerp_slice, Lerp},
};
use rrand::mix64_seed;
use crate::lerp::{lerp_slice, Lerp};
/// Selects an item from a slice based on a weighting function and a
/// number (n) between 0.0 and 1.0. Returns the index of the selected
@ -209,7 +208,7 @@ where
let mut seed = n as u64;
loop {
let i = left + (hash_u64(right as u64, seed) as usize % (right - left));
let i = left + (mix64_seed(right as u64, seed) as usize % (right - left));
slc.swap(i, right - 1);
let ii = left + {

View File

@ -7,7 +7,7 @@ use std::{
use crate::{
lerp::{lerp, lerp_slice, Lerp},
math::{Point, Transform, Vector},
math::{fast_minf32, Point, Vector, Xform},
};
const BBOX_MAXT_ADJUST: f32 = 1.000_000_24;
@ -41,21 +41,23 @@ impl BBox {
// Returns whether the given ray intersects with the bbox.
pub fn intersect_ray(&self, orig: Point, dir_inv: Vector, max_t: f32) -> bool {
// Calculate slab intersections
let t1 = (self.min.co - orig.co) * dir_inv.co;
let t2 = (self.max.co - orig.co) * dir_inv.co;
let t1 = (self.min.0 - orig.0) * dir_inv.0;
let t2 = (self.max.0 - orig.0) * dir_inv.0;
// Find the far and near intersection
let far_t = t1.max(t2).extend(std::f32::INFINITY);
let near_t = t1.min(t2).extend(0.0);
let far_hit_t = (far_t.min_element() * BBOX_MAXT_ADJUST).min(max_t);
let far_t = t1.max(t2).set_d(std::f32::INFINITY);
let near_t = t1.min(t2).set_d(0.0);
let far_hit_t = fast_minf32(far_t.min_element() * BBOX_MAXT_ADJUST, max_t);
let near_hit_t = near_t.max_element();
// Did we hit?
near_hit_t <= far_hit_t
}
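A quick worked check of the slab test above, with made-up values:

// Ray from the origin with direction (1, 1, 1), so dir_inv is also (1, 1, 1).
let bb = BBox::from_points(Point::new(1.0, 1.0, 1.0), Point::new(2.0, 2.0, 2.0));
let hit = bb.intersect_ray(Point::new(0.0, 0.0, 0.0), Vector::new(1.0, 1.0, 1.0), 100.0);
// t1 = (1, 1, 1) and t2 = (2, 2, 2), so near_hit_t = 1.0 and far_hit_t ~= 2.0;
// 1.0 <= 2.0, so `hit` is true.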
// Creates a new BBox transformed into a different space.
pub fn transformed(&self, xform: Transform) -> BBox {
// Creates a new BBox transformed from its local space to the
// given space.
#[must_use]
pub fn xform(&self, xform: &Xform) -> BBox {
// BBox corners
let vs = [
Point::new(self.min.x(), self.min.y(), self.min.z()),
@ -71,7 +73,7 @@ impl BBox {
// Transform BBox corners and make new bbox
let mut b = BBox::new();
for v in &vs {
let v = *v * xform;
let v = v.xform(xform);
b.min = v.min(b.min);
b.max = v.max(b.max);
}
@ -103,12 +105,8 @@ impl BitOr for BBox {
fn bitor(self, rhs: BBox) -> BBox {
BBox::from_points(
Point {
co: self.min.co.min(rhs.min.co),
},
Point {
co: self.max.co.max(rhs.max.co),
},
Point(self.min.0.min(rhs.min.0)),
Point(self.max.0.max(rhs.max.0)),
)
}
}
@ -124,14 +122,7 @@ impl BitOr<Point> for BBox {
type Output = BBox;
fn bitor(self, rhs: Point) -> BBox {
BBox::from_points(
Point {
co: self.min.co.min(rhs.co),
},
Point {
co: self.max.co.max(rhs.co),
},
)
BBox::from_points(Point(self.min.0.min(rhs.0)), Point(self.max.0.max(rhs.0)))
}
}
@ -150,7 +141,7 @@ impl Lerp for BBox {
}
}
pub fn transform_bbox_slice_from(bbs_in: &[BBox], xforms: &[Transform], bbs_out: &mut Vec<BBox>) {
pub fn transform_bbox_slice_from(bbs_in: &[BBox], xforms: &[Xform], bbs_out: &mut Vec<BBox>) {
bbs_out.clear();
// Transform the bounding boxes
@ -158,17 +149,17 @@ pub fn transform_bbox_slice_from(bbs_in: &[BBox], xforms: &[Transform], bbs_out:
bbs_out.extend_from_slice(bbs_in);
} else if bbs_in.len() == xforms.len() {
for (bb, xf) in Iterator::zip(bbs_in.iter(), xforms.iter()) {
bbs_out.push(bb.transformed(xf.inverse()));
bbs_out.push(bb.xform(&xf));
}
} else if bbs_in.len() > xforms.len() {
let s = (bbs_in.len() - 1) as f32;
for (i, bb) in bbs_in.iter().enumerate() {
bbs_out.push(bb.transformed(lerp_slice(xforms, i as f32 / s).inverse()));
bbs_out.push(bb.xform(&lerp_slice(xforms, i as f32 / s)));
}
} else if bbs_in.len() < xforms.len() {
let s = (xforms.len() - 1) as f32;
for (i, xf) in xforms.iter().enumerate() {
bbs_out.push(lerp_slice(bbs_in, i as f32 / s).transformed(xf.inverse()));
bbs_out.push(lerp_slice(bbs_in, i as f32 / s).xform(&xf));
}
}
}

View File

@ -6,19 +6,18 @@ use std::ops::{BitOr, BitOrAssign};
use crate::{
bbox::BBox,
lerp::{lerp, Lerp},
math::{Point, Vector},
};
use glam::{BVec4A, Vec4};
use rmath::wide4::{Bool4, Float4};
const BBOX_MAXT_ADJUST: f32 = 1.000_000_24;
/// A SIMD set of 4 3D axis-aligned bounding boxes.
#[derive(Debug, Copy, Clone)]
pub struct BBox4 {
pub x: (Vec4, Vec4), // (min, max)
pub y: (Vec4, Vec4), // (min, max)
pub z: (Vec4, Vec4), // (min, max)
pub x: (Float4, Float4), // (min, max)
pub y: (Float4, Float4), // (min, max)
pub z: (Float4, Float4), // (min, max)
}
impl BBox4 {
@ -26,16 +25,16 @@ impl BBox4 {
pub fn new() -> BBox4 {
BBox4 {
x: (
Vec4::splat(std::f32::INFINITY),
Vec4::splat(std::f32::NEG_INFINITY),
Float4::splat(std::f32::INFINITY),
Float4::splat(std::f32::NEG_INFINITY),
),
y: (
Vec4::splat(std::f32::INFINITY),
Vec4::splat(std::f32::NEG_INFINITY),
Float4::splat(std::f32::INFINITY),
Float4::splat(std::f32::NEG_INFINITY),
),
z: (
Vec4::splat(std::f32::INFINITY),
Vec4::splat(std::f32::NEG_INFINITY),
Float4::splat(std::f32::INFINITY),
Float4::splat(std::f32::NEG_INFINITY),
),
}
}
@ -45,38 +44,30 @@ impl BBox4 {
pub fn from_bboxes(b1: BBox, b2: BBox, b3: BBox, b4: BBox) -> BBox4 {
BBox4 {
x: (
Vec4::new(b1.min.x(), b2.min.x(), b3.min.x(), b4.min.x()),
Vec4::new(b1.max.x(), b2.max.x(), b3.max.x(), b4.max.x()),
Float4::new(b1.min.x(), b2.min.x(), b3.min.x(), b4.min.x()),
Float4::new(b1.max.x(), b2.max.x(), b3.max.x(), b4.max.x()),
),
y: (
Vec4::new(b1.min.y(), b2.min.y(), b3.min.y(), b4.min.y()),
Vec4::new(b1.max.y(), b2.max.y(), b3.max.y(), b4.max.y()),
Float4::new(b1.min.y(), b2.min.y(), b3.min.y(), b4.min.y()),
Float4::new(b1.max.y(), b2.max.y(), b3.max.y(), b4.max.y()),
),
z: (
Vec4::new(b1.min.z(), b2.min.z(), b3.min.z(), b4.min.z()),
Vec4::new(b1.max.z(), b2.max.z(), b3.max.z(), b4.max.z()),
Float4::new(b1.min.z(), b2.min.z(), b3.min.z(), b4.min.z()),
Float4::new(b1.max.z(), b2.max.z(), b3.max.z(), b4.max.z()),
),
}
}
// Returns whether the given ray intersects with the bboxes.
pub fn intersect_ray(&self, orig: Point, dir_inv: Vector, max_t: f32) -> BVec4A {
// Get the ray data into SIMD format.
let ro_x = Vec4::splat(orig.co[0]);
let ro_y = Vec4::splat(orig.co[1]);
let ro_z = Vec4::splat(orig.co[2]);
let rdi_x = Vec4::splat(dir_inv.co[0]);
let rdi_y = Vec4::splat(dir_inv.co[1]);
let rdi_z = Vec4::splat(dir_inv.co[2]);
let max_t = Vec4::splat(max_t);
#[inline(always)]
pub fn intersect_ray(&self, orig: [Float4; 3], dir_inv: [Float4; 3], max_t: Float4) -> Bool4 {
// Slab tests
let t1_x = (self.x.0 - ro_x) * rdi_x;
let t1_y = (self.y.0 - ro_y) * rdi_y;
let t1_z = (self.z.0 - ro_z) * rdi_z;
let t2_x = (self.x.1 - ro_x) * rdi_x;
let t2_y = (self.y.1 - ro_y) * rdi_y;
let t2_z = (self.z.1 - ro_z) * rdi_z;
let t1_x = (self.x.0 - orig[0]) * dir_inv[0];
let t1_y = (self.y.0 - orig[1]) * dir_inv[1];
let t1_z = (self.z.0 - orig[2]) * dir_inv[2];
let t2_x = (self.x.1 - orig[0]) * dir_inv[0];
let t2_y = (self.y.1 - orig[1]) * dir_inv[1];
let t2_z = (self.z.1 - orig[2]) * dir_inv[2];
// Get the far and near t hits for each axis.
let t_far_x = t1_x.max(t2_x);
@ -87,10 +78,11 @@ impl BBox4 {
let t_near_z = t1_z.min(t2_z);
// Calculate over-all far t hit.
let far_t = (t_far_x.min(t_far_y.min(t_far_z)) * Vec4::splat(BBOX_MAXT_ADJUST)).min(max_t);
let far_t =
(t_far_x.min(t_far_y.min(t_far_z)) * Float4::splat(BBOX_MAXT_ADJUST)).min(max_t);
// Calculate over-all near t hit.
let near_t = t_near_x.max(t_near_y).max(t_near_z.max(Vec4::splat(0.0)));
let near_t = t_near_x.max(t_near_y).max(t_near_z.max(Float4::splat(0.0)));
// Hit results
near_t.cmplt(far_t)

View File

@ -1,31 +1,28 @@
#![allow(dead_code)]
use kioku::Arena;
use crate::{
lerp::lerp_slice,
math::{Point, Transform, Vector},
math::{Point, Vector, Xform},
ray::Ray,
sampling::square_to_circle,
};
#[derive(Copy, Clone, Debug)]
pub struct Camera<'a> {
transforms: &'a [Transform],
fovs: &'a [f32],
tfovs: &'a [f32],
aperture_radii: &'a [f32],
focus_distances: &'a [f32],
#[derive(Debug, Clone)]
pub struct Camera {
transforms: Vec<Xform>,
fovs: Vec<f32>,
tfovs: Vec<f32>,
aperture_radii: Vec<f32>,
focus_distances: Vec<f32>,
}
impl<'a> Camera<'a> {
impl Camera {
pub fn new(
arena: &'a Arena,
transforms: &[Transform],
transforms: &[Xform],
fovs: &[f32],
mut aperture_radii: &[f32],
mut focus_distances: &[f32],
) -> Camera<'a> {
) -> Camera {
assert!(!transforms.is_empty(), "Camera has no transform(s)!");
assert!(!fovs.is_empty(), "Camera has no fov(s)!");
@ -63,20 +60,20 @@ impl<'a> Camera<'a> {
.collect();
Camera {
transforms: arena.copy_slice(&transforms),
fovs: arena.copy_slice(&fovs),
tfovs: arena.copy_slice(&tfovs),
aperture_radii: arena.copy_slice(&aperture_radii),
focus_distances: arena.copy_slice(&focus_distances),
transforms: transforms.into(),
fovs: fovs.into(),
tfovs: tfovs.into(),
aperture_radii: aperture_radii.into(),
focus_distances: focus_distances.into(),
}
}
pub fn generate_ray(&self, x: f32, y: f32, time: f32, wavelength: f32, u: f32, v: f32) -> Ray {
// Get time-interpolated camera settings
let transform = lerp_slice(self.transforms, time);
let tfov = lerp_slice(self.tfovs, time);
let aperture_radius = lerp_slice(self.aperture_radii, time);
let focus_distance = lerp_slice(self.focus_distances, time);
let transform = lerp_slice(&self.transforms, time).to_full_fast().unwrap();
let tfov = lerp_slice(&self.tfovs, time);
let aperture_radius = lerp_slice(&self.aperture_radii, time);
let focus_distance = lerp_slice(&self.focus_distances, time);
// Ray origin
let orig = {
@ -92,12 +89,13 @@ impl<'a> Camera<'a> {
)
.normalized();
Ray {
orig: orig * transform,
dir: dir * transform,
time: time,
wavelength: wavelength,
max_t: std::f32::INFINITY,
}
Ray::new(
orig.xform_inv_fast(&transform),
dir.xform_inv_fast(&transform),
time,
wavelength,
std::f32::INFINITY,
false,
)
}
}
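A hypothetical call site for generate_ray above (all variable names are assumptions; the screen-coordinate and lens-sample conventions are not shown in this hunk):

// px, py: screen-space coordinates; lens_u, lens_v: aperture samples in [0, 1).
let ray = camera.generate_ray(px, py, time, wavelength, lens_u, lens_v);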

View File

@ -1,13 +1,8 @@
use std::ops::{Add, AddAssign, Div, DivAssign, Mul, MulAssign};
pub use color::{
rec709_e_to_xyz, rec709_to_xyz, xyz_to_aces_ap0, xyz_to_aces_ap0_e, xyz_to_rec709,
xyz_to_rec709_e,
};
use crate::math::Float4;
use compact::fluv::fluv32;
use glam::Vec4;
use half::f16;
use spectral_upsampling::meng::{spectrum_xyz_to_p_4, EQUAL_ENERGY_REFLECTANCE};
use crate::{lerp::Lerp, math::fast_exp};
@ -31,10 +26,10 @@ fn nth_wavelength(hero_wavelength: f32, n: usize) -> f32 {
}
}
/// Returns all wavelengths of a hero wavelength set as a Vec4
/// Returns all wavelengths of a hero wavelength set as a Float4
#[inline(always)]
fn wavelengths(hero_wavelength: f32) -> Vec4 {
Vec4::new(
fn wavelengths(hero_wavelength: f32) -> Float4 {
Float4::new(
nth_wavelength(hero_wavelength, 0),
nth_wavelength(hero_wavelength, 1),
nth_wavelength(hero_wavelength, 2),
@ -94,7 +89,7 @@ impl Color {
} => {
SpectralSample::from_parts(
// TODO: make this SIMD
Vec4::new(
Float4::new(
plancks_law(temperature, wls[0]) * factor,
plancks_law(temperature, wls[1]) * factor,
plancks_law(temperature, wls[2]) * factor,
@ -109,7 +104,7 @@ impl Color {
} => {
SpectralSample::from_parts(
// TODO: make this SIMD
Vec4::new(
Float4::new(
plancks_law_normalized(temperature, wls[0]) * factor,
plancks_law_normalized(temperature, wls[1]) * factor,
plancks_law_normalized(temperature, wls[2]) * factor,
@ -386,7 +381,7 @@ fn plancks_law_normalized(temperature: f32, wavelength: f32) -> f32 {
#[derive(Copy, Clone, Debug)]
pub struct SpectralSample {
pub e: Vec4,
pub e: Float4,
hero_wavelength: f32,
}
@ -394,7 +389,7 @@ impl SpectralSample {
pub fn new(wavelength: f32) -> SpectralSample {
debug_assert!(wavelength >= WL_MIN && wavelength <= WL_MAX);
SpectralSample {
e: Vec4::splat(0.0),
e: Float4::splat(0.0),
hero_wavelength: wavelength,
}
}
@ -403,12 +398,12 @@ impl SpectralSample {
pub fn from_value(value: f32, wavelength: f32) -> SpectralSample {
debug_assert!(wavelength >= WL_MIN && wavelength <= WL_MAX);
SpectralSample {
e: Vec4::splat(value),
e: Float4::splat(value),
hero_wavelength: wavelength,
}
}
pub fn from_parts(e: Vec4, wavelength: f32) -> SpectralSample {
pub fn from_parts(e: Float4, wavelength: f32) -> SpectralSample {
debug_assert!(wavelength >= WL_MIN && wavelength <= WL_MAX);
SpectralSample {
e: e,
@ -599,30 +594,61 @@ impl DivAssign<f32> for XYZ {
/// the method in the paper "Physically Meaningful Rendering using Tristimulus
/// Colours" by Meng et al.
#[inline(always)]
fn xyz_to_spectrum_4(xyz: (f32, f32, f32), wavelengths: Vec4) -> Vec4 {
spectrum_xyz_to_p_4(wavelengths, xyz) * Vec4::splat(1.0 / EQUAL_ENERGY_REFLECTANCE)
// aces_to_spectrum_p4(wavelengths, xyz_to_aces_ap0_e(xyz))
fn xyz_to_spectrum_4(xyz: (f32, f32, f32), wavelengths: Float4) -> Float4 {
use spectral_upsampling as su;
// su::meng::spectrum_xyz_to_p_4(wavelengths, xyz)
// * Float4::splat(1.0 / su::meng::EQUAL_ENERGY_REFLECTANCE)
su::jakob::rec2020_to_spectrum_p4(wavelengths, color::xyz_to_rec2020_e(xyz))
// su::jakob::rec709_to_spectrum_p4(wavelengths, color::xyz_to_rec709_e(xyz))
}
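A hypothetical call of the helper above (the XYZ triple and hero wavelength are illustrative; wavelengths() is the hero-wavelength helper defined earlier in this file):

// Upsample an XYZ triple into a 4-wavelength spectral sample.
let wls = wavelengths(550.0); // assumed to lie within [WL_MIN, WL_MAX]
let spectrum = xyz_to_spectrum_4((0.5, 0.5, 0.5), wls);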
/// Close analytic approximations of the CIE 1931 XYZ color curves.
/// From the paper "Simple Analytic Approximations to the CIE XYZ Color Matching
/// Functions" by Wyman et al.
pub fn x_1931(wavelength: f32) -> f32 {
let t1 = (wavelength - 442.0) * (if wavelength < 442.0 { 0.0624 } else { 0.0374 });
let t2 = (wavelength - 599.8) * (if wavelength < 599.8 { 0.0264 } else { 0.0323 });
let t3 = (wavelength - 501.1) * (if wavelength < 501.1 { 0.0490 } else { 0.0382 });
(0.362 * fast_exp(-0.5 * t1 * t1)) + (1.056 * fast_exp(-0.5 * t2 * t2))
- (0.065 * fast_exp(-0.5 * t3 * t3))
use colorbox::tables::cie_1931_xyz::{MAX_WAVELENGTH, MIN_WAVELENGTH, X};
let norm = 1.0 / (MAX_WAVELENGTH - MIN_WAVELENGTH);
let n = (wavelength - MIN_WAVELENGTH) * norm;
if n < 0.0 {
X[0]
} else if n > 1.0 {
*X.last().unwrap()
} else {
crate::lerp::lerp_slice(X, n)
}
}
pub fn y_1931(wavelength: f32) -> f32 {
let t1 = (wavelength - 568.8) * (if wavelength < 568.8 { 0.0213 } else { 0.0247 });
let t2 = (wavelength - 530.9) * (if wavelength < 530.9 { 0.0613 } else { 0.0322 });
(0.821 * fast_exp(-0.5 * t1 * t1)) + (0.286 * fast_exp(-0.5 * t2 * t2))
use colorbox::tables::cie_1931_xyz::{MAX_WAVELENGTH, MIN_WAVELENGTH, Y};
let norm = 1.0 / (MAX_WAVELENGTH - MIN_WAVELENGTH);
let n = (wavelength - MIN_WAVELENGTH) * norm;
if n < 0.0 {
Y[0]
} else if n > 1.0 {
*Y.last().unwrap()
} else {
crate::lerp::lerp_slice(Y, n)
}
}
pub fn z_1931(wavelength: f32) -> f32 {
let t1 = (wavelength - 437.0) * (if wavelength < 437.0 { 0.0845 } else { 0.0278 });
let t2 = (wavelength - 459.0) * (if wavelength < 459.0 { 0.0385 } else { 0.0725 });
(1.217 * fast_exp(-0.5 * t1 * t1)) + (0.681 * fast_exp(-0.5 * t2 * t2))
use colorbox::tables::cie_1931_xyz::{MAX_WAVELENGTH, MIN_WAVELENGTH, Z};
let norm = 1.0 / (MAX_WAVELENGTH - MIN_WAVELENGTH);
let n = (wavelength - MIN_WAVELENGTH) * norm;
if n < 0.0 {
Z[0]
} else if n > 1.0 {
*Z.last().unwrap()
} else {
crate::lerp::lerp_slice(Z, n)
}
}
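A minimal usage sketch of the table-lookup curves above (the wavelength is illustrative):

// Sample the CIE 1931 matching functions at a single wavelength in nm.
let wl = 550.0;
let xyz_response = (x_1931(wl), y_1931(wl), z_1931(wl));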

View File

@ -4,6 +4,7 @@
//! From Theory to Implementation" 3rd edition by Pharr et al.
use crate::math::{dot, Normal, Point, Vector};
pub use rmath::utils::{decrement_ulp, increment_ulp};
#[inline(always)]
pub fn fp_gamma(n: u32) -> f32 {
@ -12,36 +13,6 @@ pub fn fp_gamma(n: u32) -> f32 {
(e * n as f32) / (1.0 - (e * n as f32))
}
pub fn increment_ulp(v: f32) -> f32 {
if v.is_finite() {
if v > 0.0 {
f32::from_bits(v.to_bits() + 1)
} else if v < -0.0 {
f32::from_bits(v.to_bits() - 1)
} else {
f32::from_bits(0x00_00_00_01)
}
} else {
// Infinity or NaN.
v
}
}
pub fn decrement_ulp(v: f32) -> f32 {
if v.is_finite() {
if v > 0.0 {
f32::from_bits(v.to_bits() - 1)
} else if v < -0.0 {
f32::from_bits(v.to_bits() + 1)
} else {
f32::from_bits(0x80_00_00_01)
}
} else {
// Infinity or NaN.
v
}
}
pub fn robust_ray_origin(pos: Point, pos_err: f32, nor: Normal, ray_dir: Vector) -> Point {
// Get surface normal pointing in the same
// direction as ray_dir.
@ -81,51 +52,7 @@ pub fn robust_ray_origin(pos: Point, pos_err: f32, nor: Normal, ray_dir: Vector)
Point::new(x, y, z)
}
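A hypothetical call site for robust_ray_origin above (all variable names are assumptions):

// Nudge a shading point off the surface before spawning a shadow ray so the
// ray doesn't re-intersect the surface it started on.
let shadow_orig = robust_ray_origin(hit_pos, hit_pos_err, hit_normal, dir_to_light);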
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn inc_ulp() {
assert!(increment_ulp(1.0) > 1.0);
assert!(increment_ulp(-1.0) > -1.0);
}
#[test]
fn dec_ulp() {
assert!(decrement_ulp(1.0) < 1.0);
assert!(decrement_ulp(-1.0) < -1.0);
}
#[test]
fn inc_ulp_zero() {
assert!(increment_ulp(0.0) > 0.0);
assert!(increment_ulp(0.0) > -0.0);
assert!(increment_ulp(-0.0) > 0.0);
assert!(increment_ulp(-0.0) > -0.0);
}
#[test]
fn dec_ulp_zero() {
assert!(decrement_ulp(0.0) < 0.0);
assert!(decrement_ulp(0.0) < -0.0);
assert!(decrement_ulp(-0.0) < 0.0);
assert!(decrement_ulp(-0.0) < -0.0);
}
#[test]
fn inc_dec_ulp() {
assert_eq!(decrement_ulp(increment_ulp(1.0)), 1.0);
assert_eq!(decrement_ulp(increment_ulp(-1.0)), -1.0);
assert_eq!(decrement_ulp(increment_ulp(1.2)), 1.2);
assert_eq!(decrement_ulp(increment_ulp(-1.2)), -1.2);
}
#[test]
fn dec_inc_ulp() {
assert_eq!(increment_ulp(decrement_ulp(1.0)), 1.0);
assert_eq!(increment_ulp(decrement_ulp(-1.0)), -1.0);
assert_eq!(increment_ulp(decrement_ulp(1.2)), 1.2);
assert_eq!(increment_ulp(decrement_ulp(-1.2)), -1.2);
}
}
// #[cfg(test)]
// mod tests {
// use super::*;
// }

View File

@ -1,29 +0,0 @@
pub fn hash_u32(n: u32, seed: u32) -> u32 {
let mut hash = n;
for _ in 0..3 {
hash = hash.wrapping_mul(0x736caf6f);
hash ^= hash.wrapping_shr(16);
hash ^= seed;
}
hash
}
pub fn hash_u64(n: u64, seed: u64) -> u64 {
let mut hash = n;
for _ in 0..4 {
hash = hash.wrapping_mul(32_416_190_071 * 314_604_959);
hash ^= hash.wrapping_shr(32);
hash ^= seed;
}
hash
}
/// Returns a random float in [0, 1] based on 'n' and a seed.
/// Generally use n for getting a bunch of different random
/// numbers, and use seed to vary between runs.
pub fn hash_u32_to_f32(n: u32, seed: u32) -> f32 {
const INV_MAX: f32 = 1.0 / std::u32::MAX as f32;
hash_u32(n, seed) as f32 * INV_MAX
}

View File

@ -1,77 +0,0 @@
#![allow(dead_code)]
const N: u32 = 1 << 16;
// Utility function used by the functions below.
fn hil_rot(n: u32, rx: u32, ry: u32, x: &mut u32, y: &mut u32) {
use std::mem;
if ry == 0 {
if rx == 1 {
*x = (n - 1).wrapping_sub(*x);
*y = (n - 1).wrapping_sub(*y);
}
mem::swap(x, y);
}
}
/// Convert (x,y) to hilbert curve index.
///
/// x: The x coordinate. Must be a positive integer no greater than 2^16-1.
/// y: The y coordinate. Must be a positive integer no greater than 2^16-1.
///
/// Returns the hilbert curve index corresponding to the (x,y) coordinates given.
pub fn xy2d(x: u32, y: u32) -> u32 {
assert!(x < N);
assert!(y < N);
let (mut x, mut y) = (x, y);
let mut d = 0;
let mut s = N >> 1;
while s > 0 {
let rx = if (x & s) > 0 { 1 } else { 0 };
let ry = if (y & s) > 0 { 1 } else { 0 };
d += s * s * ((3 * rx) ^ ry);
hil_rot(s, rx, ry, &mut x, &mut y);
s >>= 1
}
d
}
/// Convert hilbert curve index to (x,y).
///
/// d: The hilbert curve index.
///
/// Returns the (x, y) coords at the given index.
pub fn d2xy(d: u32) -> (u32, u32) {
let (mut x, mut y) = (0, 0);
let mut s = 1;
let mut t = d;
while s < N {
let rx = 1 & (t >> 1);
let ry = 1 & (t ^ rx);
hil_rot(s, rx, ry, &mut x, &mut y);
x += s * rx;
y += s * ry;
t >>= 2;
s <<= 1;
}
(x, y)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn reversible() {
let d = 54;
let (x, y) = d2xy(d);
let d2 = xy2d(x, y);
assert_eq!(d, d2);
}
}

View File

@ -13,7 +13,9 @@ use std::{
use half::f16;
use crate::color::{xyz_to_rec709_e, XYZ};
pub use color::{rec709_e_to_xyz, xyz_to_rec709_e};
use crate::color::XYZ;
#[derive(Debug)]
#[allow(clippy::type_complexity)]
@ -236,22 +238,35 @@ impl<'a> Bucket<'a> {
where
F: Fn((f32, f32, f32)) -> (f32, f32, f32),
{
use std::slice;
let mut data = Vec::with_capacity(
(4 * (self.max.0 - self.min.0) * (self.max.1 - self.min.1)) as usize,
let data_u8 = self.rgba_raw(color_convert);
base64::encode(&data_u8)
}
/// Returns the bucket's contents as a binary string.
///
/// The data is laid out as four-floats-per-pixel in scanline order.
/// The fourth channel is alpha, and is set to 1.0 for all pixels.
///
/// `color_convert` lets you do a colorspace conversion on each pixel
/// before the data is packed, if desired.
pub fn rgba_raw<F>(&mut self, color_convert: F) -> Vec<u8>
where
F: Fn((f32, f32, f32)) -> (f32, f32, f32),
{
let mut data: Vec<u8> = Vec::with_capacity(
std::mem::size_of::<f32>()
* (4 * (self.max.0 - self.min.0) * (self.max.1 - self.min.1)) as usize,
);
for y in self.min.1..self.max.1 {
for x in self.min.0..self.max.0 {
let color = color_convert(self.get(x, y).to_tuple());
data.push(color.0);
data.push(color.1);
data.push(color.2);
data.push(1.0);
data.extend_from_slice(&color.0.to_ne_bytes());
data.extend_from_slice(&color.1.to_ne_bytes());
data.extend_from_slice(&color.2.to_ne_bytes());
data.extend_from_slice(&1.0f32.to_ne_bytes());
}
}
let data_u8 =
unsafe { slice::from_raw_parts(&data[0] as *const f32 as *const u8, data.len() * 4) };
base64::encode(data_u8)
data
}
}
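A hypothetical usage of rgba_raw above (`bucket` is an assumed mutable Bucket):

// Grab the raw pixel bytes with no colorspace conversion (identity closure).
let bytes = bucket.rgba_raw(|rgb| rgb);
assert_eq!(bytes.len() % 16, 0); // 4 channels x 4 bytes per pixel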

View File

@ -1,6 +1,6 @@
#![allow(dead_code)]
use math3d::{Normal, Point, Transform, Vector};
use rmath::{wide4::Float4, Normal, Point, Vector, Xform};
/// Trait for allowing a type to be linearly interpolated.
pub trait Lerp: Copy {
@ -100,36 +100,34 @@ impl<T: Lerp> Lerp for [T; 4] {
}
}
impl Lerp for glam::Vec4 {
fn lerp(self, other: glam::Vec4, alpha: f32) -> glam::Vec4 {
impl Lerp for Float4 {
fn lerp(self, other: Self, alpha: f32) -> Self {
(self * (1.0 - alpha)) + (other * alpha)
}
}
impl Lerp for Transform {
fn lerp(self, other: Transform, alpha: f32) -> Transform {
impl Lerp for Xform {
fn lerp(self, other: Self, alpha: f32) -> Self {
(self * (1.0 - alpha)) + (other * alpha)
}
}
impl Lerp for Normal {
fn lerp(self, other: Normal, alpha: f32) -> Normal {
fn lerp(self, other: Self, alpha: f32) -> Self {
(self * (1.0 - alpha)) + (other * alpha)
}
}
impl Lerp for Point {
fn lerp(self, other: Point, alpha: f32) -> Point {
let s = self;
let o = other;
Point {
co: (s.co * (1.0 - alpha)) + (o.co * alpha),
}
fn lerp(self, other: Self, alpha: f32) -> Self {
let a = self.0;
let b = other.0;
Point((a * (1.0 - alpha)) + (b * alpha))
}
}
impl Lerp for Vector {
fn lerp(self, other: Vector, alpha: f32) -> Vector {
fn lerp(self, other: Self, alpha: f32) -> Self {
(self * (1.0 - alpha)) + (other * alpha)
}
}
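A small worked example of the trait above, with illustrative values:

// Interpolate a Point a quarter of the way from a to b.
let a = Point::new(0.0, 0.0, 0.0);
let b = Point::new(4.0, 8.0, -4.0);
let c = a.lerp(b, 0.25); // (1.0, 2.0, -1.0)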
@ -215,20 +213,18 @@ mod tests {
#[test]
fn lerp_matrix() {
let a = Transform::new_from_values(
0.0, 2.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0,
);
let b = Transform::new_from_values(
let a = Xform::new(0.0, 2.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0);
let b = Xform::new(
-1.0, 1.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0,
);
let c1 = Transform::new_from_values(
let c1 = Xform::new(
-0.25, 1.75, 2.25, 3.25, 4.25, 5.25, 6.25, 7.25, 8.25, 9.25, 10.25, 11.25,
);
let c2 = Transform::new_from_values(
let c2 = Xform::new(
-0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5, 9.5, 10.5, 11.5,
);
let c3 = Transform::new_from_values(
let c3 = Xform::new(
-0.75, 1.25, 2.75, 3.75, 4.75, 5.75, 6.75, 7.75, 8.75, 9.75, 10.75, 11.75,
);

View File

@ -1,7 +1,5 @@
use std::f64::consts::PI as PI_64;
use kioku::Arena;
use crate::{
color::{Color, SpectralSample},
lerp::lerp_slice,
@ -13,24 +11,19 @@ use super::WorldLightSource;
// TODO: handle case where radius = 0.0.
#[derive(Copy, Clone, Debug)]
pub struct DistantDiskLight<'a> {
radii: &'a [f32],
directions: &'a [Vector],
colors: &'a [Color],
#[derive(Debug, Clone)]
pub struct DistantDiskLight {
radii: Vec<f32>,
directions: Vec<Vector>,
colors: Vec<Color>,
}
impl<'a> DistantDiskLight<'a> {
pub fn new(
arena: &'a Arena,
radii: &[f32],
directions: &[Vector],
colors: &[Color],
) -> DistantDiskLight<'a> {
impl DistantDiskLight {
pub fn new(radii: &[f32], directions: &[Vector], colors: &[Color]) -> DistantDiskLight {
DistantDiskLight {
radii: arena.copy_slice(&radii),
directions: arena.copy_slice(&directions),
colors: arena.copy_slice(&colors),
radii: radii.into(),
directions: directions.into(),
colors: colors.into(),
}
}
@ -55,7 +48,7 @@ impl<'a> DistantDiskLight<'a> {
// }
}
impl<'a> WorldLightSource for DistantDiskLight<'a> {
impl WorldLightSource for DistantDiskLight {
fn sample_from_point(
&self,
u: f32,
@ -64,9 +57,9 @@ impl<'a> WorldLightSource for DistantDiskLight<'a> {
time: f32,
) -> (SpectralSample, Vector, f32) {
// Calculate time interpolated values
let radius: f64 = lerp_slice(self.radii, time) as f64;
let direction = lerp_slice(self.directions, time);
let col = lerp_slice(self.colors, time);
let radius: f64 = lerp_slice(&self.radii, time) as f64;
let direction = lerp_slice(&self.directions, time);
let col = lerp_slice(&self.colors, time);
let solid_angle_inv = 1.0 / (2.0 * PI_64 * (1.0 - radius.cos()));
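// (2.0 * PI_64 * (1.0 - radius.cos()) is the solid angle of a spherical cap
// with angular radius `radius`, so this is the reciprocal of the solid angle
// the light subtends.)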
// Create a coordinate system from the vector pointing at the center of

View File

@ -6,7 +6,7 @@ use std::fmt::Debug;
use crate::{
color::SpectralSample,
math::{Normal, Point, Transform, Vector},
math::{Normal, Point, Vector, XformFull},
surface::Surface,
};
@ -34,7 +34,7 @@ pub trait SurfaceLight: Surface {
/// - The pdf of the sample.
fn sample_from_point(
&self,
space: &Transform,
space: &XformFull,
arr: Point,
u: f32,
v: f32,

View File

@ -5,8 +5,8 @@ use crate::{
boundable::Boundable,
color::{Color, SpectralSample},
lerp::lerp_slice,
math::{cross, dot, Normal, Point, Transform, Vector},
ray::{RayBatch, RayStack},
math::{cross, dot, Normal, Point, Vector, XformFull},
ray::{LocalRay, Ray},
sampling::{
spherical_triangle_solid_angle, triangle_surface_area, uniform_sample_spherical_triangle,
uniform_sample_triangle,
@ -51,7 +51,7 @@ impl<'a> RectangleLight<'a> {
// more efficiently by inlining it there.
fn sample_pdf(
&self,
space: &Transform,
space: &XformFull,
arr: Point,
sample_dir: Vector,
hit_point: Point,
@ -64,11 +64,10 @@ impl<'a> RectangleLight<'a> {
let dim = lerp_slice(self.dimensions, time);
// Get the four corners of the rectangle, transformed into world space
let space_inv = space.inverse();
let p1 = Point::new(dim.0 * 0.5, dim.1 * 0.5, 0.0) * space_inv;
let p2 = Point::new(dim.0 * -0.5, dim.1 * 0.5, 0.0) * space_inv;
let p3 = Point::new(dim.0 * -0.5, dim.1 * -0.5, 0.0) * space_inv;
let p4 = Point::new(dim.0 * 0.5, dim.1 * -0.5, 0.0) * space_inv;
let p1 = Point::new(dim.0 * 0.5, dim.1 * 0.5, 0.0).xform(space);
let p2 = Point::new(dim.0 * -0.5, dim.1 * 0.5, 0.0).xform(space);
let p3 = Point::new(dim.0 * -0.5, dim.1 * -0.5, 0.0).xform(space);
let p4 = Point::new(dim.0 * 0.5, dim.1 * -0.5, 0.0).xform(space);
// Get the four corners of the rectangle, projected on to the unit
// sphere centered around arr.
@ -82,7 +81,7 @@ impl<'a> RectangleLight<'a> {
let area_2 = spherical_triangle_solid_angle(sp4, sp1, sp3);
// World-space surface normal
let normal = Normal::new(0.0, 0.0, 1.0) * space_inv;
let normal = Normal::new(0.0, 0.0, 1.0).xform_fast(space);
// PDF
if (area_1 + area_2) < SIMPLE_SAMPLING_THRESHOLD {
@ -97,7 +96,7 @@ impl<'a> RectangleLight<'a> {
// fn outgoing(
// &self,
// space: &Transform,
// space: &XformFull,
// dir: Vector,
// u: f32,
// v: f32,
@ -120,7 +119,7 @@ impl<'a> RectangleLight<'a> {
impl<'a> SurfaceLight for RectangleLight<'a> {
fn sample_from_point(
&self,
space: &Transform,
space: &XformFull,
arr: Point,
u: f32,
v: f32,
@ -135,11 +134,10 @@ impl<'a> SurfaceLight for RectangleLight<'a> {
let surface_area_inv: f64 = 1.0 / surface_area;
// Get the four corners of the rectangle, transformed into world space
let space_inv = space.inverse();
let p1 = Point::new(dim.0 * 0.5, dim.1 * 0.5, 0.0) * space_inv;
let p2 = Point::new(dim.0 * -0.5, dim.1 * 0.5, 0.0) * space_inv;
let p3 = Point::new(dim.0 * -0.5, dim.1 * -0.5, 0.0) * space_inv;
let p4 = Point::new(dim.0 * 0.5, dim.1 * -0.5, 0.0) * space_inv;
let p1 = Point::new(dim.0 * 0.5, dim.1 * 0.5, 0.0).xform(space);
let p2 = Point::new(dim.0 * -0.5, dim.1 * 0.5, 0.0).xform(space);
let p3 = Point::new(dim.0 * -0.5, dim.1 * -0.5, 0.0).xform(space);
let p4 = Point::new(dim.0 * 0.5, dim.1 * -0.5, 0.0).xform(space);
// Get the four corners of the rectangle relative to arr.
let lp1 = p1 - arr;
@ -158,7 +156,7 @@ impl<'a> SurfaceLight for RectangleLight<'a> {
let area_2 = spherical_triangle_solid_angle(sp4, sp1, sp3);
// Calculate world-space surface normal
let normal = Normal::new(0.0, 0.0, 1.0) * space_inv;
let normal = Normal::new(0.0, 0.0, 1.0).xform_fast(space);
if (area_1 + area_2) < SIMPLE_SAMPLING_THRESHOLD {
// Simple sampling for more distant lights
@ -215,18 +213,16 @@ impl<'a> SurfaceLight for RectangleLight<'a> {
};
// Project shadow_vec back onto the light's surface
let arr_local = arr * *space;
let shadow_vec_local = shadow_vec * *space;
let arr_local = arr.xform_inv(space);
let shadow_vec_local = shadow_vec.xform_inv(space);
let shadow_vec_local = shadow_vec_local * (-arr_local.z() / shadow_vec_local.z());
let mut sample_point_local = arr_local + shadow_vec_local;
{
let x = sample_point_local.x().max(dim.0 * -0.5).min(dim.0 * 0.5);
let y = sample_point_local.y().max(dim.1 * -0.5).min(dim.1 * 0.5);
sample_point_local.set_x(x);
sample_point_local.set_y(y);
sample_point_local.set_z(0.0);
sample_point_local = Point::new(x, y, 0.0);
}
let sample_point = sample_point_local * space_inv;
let sample_point = sample_point_local.xform(space);
let point_err = 0.0001; // TODO: this is a hack, do properly.
// Calculate pdf and light energy
@ -255,85 +251,77 @@ impl<'a> SurfaceLight for RectangleLight<'a> {
}
impl<'a> Surface for RectangleLight<'a> {
fn intersect_rays(
fn intersect_ray(
&self,
rays: &mut RayBatch,
ray_stack: &mut RayStack,
isects: &mut [SurfaceIntersection],
shader: &dyn SurfaceShader,
space: &[Transform],
ray: &mut Ray,
_local_ray: &LocalRay,
space: &XformFull,
isect: &mut SurfaceIntersection,
_shaders: &[&dyn SurfaceShader],
) {
let _ = shader; // Silence 'unused' warning
let time = ray.time;
ray_stack.pop_do_next_task(|ray_idx| {
let time = rays.time(ray_idx);
let orig = rays.orig(ray_idx);
let dir = rays.dir(ray_idx);
let max_t = rays.max_t(ray_idx);
// Calculate time interpolated values.
let dim = lerp_slice(self.dimensions, time);
// Calculate time interpolated values
let dim = lerp_slice(self.dimensions, time);
let xform = lerp_slice(space, time);
// Get the four corners of the rectangle, transformed into world space.
let p1 = Point::new(dim.0 * 0.5, dim.1 * 0.5, 0.0).xform(space);
let p2 = Point::new(dim.0 * -0.5, dim.1 * 0.5, 0.0).xform(space);
let p3 = Point::new(dim.0 * -0.5, dim.1 * -0.5, 0.0).xform(space);
let p4 = Point::new(dim.0 * 0.5, dim.1 * -0.5, 0.0).xform(space);
let space_inv = xform.inverse();
// Test against two triangles that make up the light.
let ray_pre = triangle::RayTriPrecompute::new(ray.dir);
for tri in &[(p1, p2, p3), (p3, p4, p1)] {
if let Some((t, b0, b1, b2)) =
triangle::intersect_ray(ray.orig, ray_pre, ray.max_t, *tri)
{
if t < ray.max_t {
if ray.is_occlusion() {
*isect = SurfaceIntersection::Occlude;
ray.mark_done();
return;
} else {
let (pos, pos_err) = triangle::surface_point(*tri, (b0, b1, b2));
let normal = cross(tri.0 - tri.1, tri.0 - tri.2).into_normal();
// Get the four corners of the rectangle, transformed into world space
let p1 = Point::new(dim.0 * 0.5, dim.1 * 0.5, 0.0) * space_inv;
let p2 = Point::new(dim.0 * -0.5, dim.1 * 0.5, 0.0) * space_inv;
let p3 = Point::new(dim.0 * -0.5, dim.1 * -0.5, 0.0) * space_inv;
let p4 = Point::new(dim.0 * 0.5, dim.1 * -0.5, 0.0) * space_inv;
let intersection_data = SurfaceIntersectionData {
incoming: ray.dir,
t: t,
pos: pos,
pos_err: pos_err,
nor: normal,
nor_g: normal,
local_space: *space,
sample_pdf: self.sample_pdf(
space,
ray.orig,
ray.dir,
pos,
ray.wavelength,
time,
),
};
// Test against two triangles that make up the light
let ray_pre = triangle::RayTriPrecompute::new(dir);
for tri in &[(p1, p2, p3), (p3, p4, p1)] {
if let Some((t, b0, b1, b2)) = triangle::intersect_ray(orig, ray_pre, max_t, *tri) {
if t < max_t {
if rays.is_occlusion(ray_idx) {
isects[ray_idx] = SurfaceIntersection::Occlude;
rays.mark_done(ray_idx);
} else {
let (pos, pos_err) = triangle::surface_point(*tri, (b0, b1, b2));
let normal = cross(tri.0 - tri.1, tri.0 - tri.2).into_normal();
let closure = {
let inv_surface_area = (1.0 / (dim.0 as f64 * dim.1 as f64)) as f32;
let color = lerp_slice(self.colors, time) * inv_surface_area;
SurfaceClosure::Emit(color)
};
let intersection_data = SurfaceIntersectionData {
incoming: dir,
t: t,
pos: pos,
pos_err: pos_err,
nor: normal,
nor_g: normal,
local_space: xform,
sample_pdf: self.sample_pdf(
&xform,
orig,
dir,
pos,
rays.wavelength(ray_idx),
time,
),
};
// Fill in intersection.
*isect = SurfaceIntersection::Hit {
intersection_data: intersection_data,
closure: closure,
};
let closure = {
let inv_surface_area = (1.0 / (dim.0 as f64 * dim.1 as f64)) as f32;
let color = lerp_slice(self.colors, time) * inv_surface_area;
SurfaceClosure::Emit(color)
};
// Fill in intersection
isects[ray_idx] = SurfaceIntersection::Hit {
intersection_data: intersection_data,
closure: closure,
};
// Set ray's max t
rays.set_max_t(ray_idx, t);
}
break;
ray.max_t = t;
}
break;
}
}
});
}
}
}


@ -7,8 +7,8 @@ use crate::{
boundable::Boundable,
color::{Color, SpectralSample},
lerp::lerp_slice,
math::{coordinate_system_from_vector, dot, Normal, Point, Transform, Vector},
ray::{RayBatch, RayStack},
math::{coordinate_system_from_vector, dot, Normal, Point, Vector, XformFull},
ray::{LocalRay, Ray},
sampling::{uniform_sample_cone, uniform_sample_cone_pdf, uniform_sample_sphere},
shading::surface_closure::SurfaceClosure,
shading::SurfaceShader,
@ -50,7 +50,7 @@ impl<'a> SphereLight<'a> {
// more efficiently by inlining it there.
fn sample_pdf(
&self,
space: &Transform,
space: &XformFull,
arr: Point,
sample_dir: Vector,
sample_u: f32,
@ -61,7 +61,7 @@ impl<'a> SphereLight<'a> {
// We're not using these, silence warnings
let _ = (sample_dir, sample_u, sample_v, wavelength);
let arr = arr * *space;
let arr = arr.xform_inv(space);
let pos = Point::new(0.0, 0.0, 0.0);
let radius: f64 = lerp_slice(self.radii, time) as f64;
@ -84,7 +84,7 @@ impl<'a> SphereLight<'a> {
impl<'a> SurfaceLight for SphereLight<'a> {
fn sample_from_point(
&self,
space: &Transform,
space: &XformFull,
arr: Point,
u: f32,
v: f32,
@ -92,12 +92,9 @@ impl<'a> SurfaceLight for SphereLight<'a> {
time: f32,
) -> (SpectralSample, (Point, Normal, f32), f32) {
// TODO: track fp error due to transforms
let arr = arr * *space;
let arr = arr.xform_inv(space);
let pos = Point::new(0.0, 0.0, 0.0);
// Precalculate local->world space transform matrix
let inv_space = space.inverse();
// Calculate time interpolated values
let radius: f64 = lerp_slice(self.radii, time) as f64;
let col = lerp_slice(self.colors, time);
@ -115,7 +112,7 @@ impl<'a> SurfaceLight for SphereLight<'a> {
// TODO: do this properly. This is a total hack.
let sample_point_err = {
let v = Vector::new(radius as f32, radius as f32, radius as f32);
let v2 = v * inv_space;
let v2 = v.xform(space);
v2.length() * SAMPLE_POINT_FUDGE
};
@ -159,8 +156,8 @@ impl<'a> SurfaceLight for SphereLight<'a> {
let normal = (arr + sample_vec).into_vector().normalized();
let point = normal * radius as f32;
(
point.into_point() * inv_space,
normal.into_normal() * inv_space,
point.into_point().xform(space),
normal.into_normal().xform_fast(space),
)
};
let pdf = uniform_sample_cone_pdf(cos_theta_max);
@ -177,8 +174,8 @@ impl<'a> SurfaceLight for SphereLight<'a> {
let normal = (arr + sample_vec).into_vector().normalized();
let point = normal * radius as f32;
(
point.into_point() * inv_space,
normal.into_normal() * inv_space,
point.into_point().xform(space),
normal.into_normal().xform_fast(space),
)
};
let pdf = 1.0 / (4.0 * PI_64);
@ -204,137 +201,122 @@ impl<'a> SurfaceLight for SphereLight<'a> {
}
impl<'a> Surface for SphereLight<'a> {
fn intersect_rays(
fn intersect_ray(
&self,
rays: &mut RayBatch,
ray_stack: &mut RayStack,
isects: &mut [SurfaceIntersection],
shader: &dyn SurfaceShader,
space: &[Transform],
ray: &mut Ray,
local_ray: &LocalRay,
space: &XformFull,
isect: &mut SurfaceIntersection,
_shaders: &[&dyn SurfaceShader],
) {
let _ = shader; // Silence 'unused' warning
let time = ray.time;
ray_stack.pop_do_next_task(|ray_idx| {
let time = rays.time(ray_idx);
// Get the radius of the sphere at the ray's time
let radius = lerp_slice(self.radii, time); // Radius of the sphere
// Get the transform space
let xform = lerp_slice(space, time);
// Code adapted to Rust from https://github.com/Tecla/Rayito
// Ray-sphere intersection can result in either zero, one or two points
// of intersection. It turns into a quadratic equation, so we just find
// the solution using the quadratic formula. Note that there is a
// slightly more stable form of it when computing it on a computer, and
// we use that method to keep everything accurate.
// Get the radius of the sphere at the ray's time
let radius = lerp_slice(self.radii, time); // Radius of the sphere
// Calculate quadratic coeffs
let a = local_ray.dir.length2();
let b = 2.0 * dot(local_ray.dir, local_ray.orig.into_vector());
let c = local_ray.orig.into_vector().length2() - (radius * radius);
// Get the ray origin and direction in local space
let orig = rays.orig_local(ray_idx).into_vector();
let dir = rays.dir(ray_idx) * xform;
let discriminant = (b * b) - (4.0 * a * c);
if discriminant < 0.0 {
// Discriminant less than zero? No solution => no intersection.
return;
}
let discriminant = discriminant.sqrt();
// Code adapted to Rust from https://github.com/Tecla/Rayito
// Ray-sphere intersection can result in either zero, one or two points
// of intersection. It turns into a quadratic equation, so we just find
// the solution using the quadratic formula. Note that there is a
// slightly more stable form of it when computing it on a computer, and
// we use that method to keep everything accurate.
// Compute a more stable form of our param t (t0 = q/a, t1 = c/q)
// q = -0.5 * (b - sqrt(b * b - 4.0 * a * c)) if b < 0, or
// q = -0.5 * (b + sqrt(b * b - 4.0 * a * c)) if b >= 0
let q = if b < 0.0 {
-0.5 * (b - discriminant)
} else {
-0.5 * (b + discriminant)
};
// Calculate quadratic coeffs
let a = dir.length2();
let b = 2.0 * dot(dir, orig);
let c = orig.length2() - (radius * radius);
// Get our final parametric values
let mut t0 = q / a;
let mut t1 = if q != 0.0 { c / q } else { ray.max_t };
let discriminant = (b * b) - (4.0 * a * c);
if discriminant < 0.0 {
// Discriminant less than zero? No solution => no intersection.
return;
}
let discriminant = discriminant.sqrt();
// Swap them so they are ordered right
if t0 > t1 {
use std::mem::swap;
swap(&mut t0, &mut t1);
}
// Compute a more stable form of our param t (t0 = q/a, t1 = c/q)
// q = -0.5 * (b - sqrt(b * b - 4.0 * a * c)) if b < 0, or
// q = -0.5 * (b + sqrt(b * b - 4.0 * a * c)) if b >= 0
let q = if b < 0.0 {
-0.5 * (b - discriminant)
} else {
-0.5 * (b + discriminant)
// Check our intersection for validity against this ray's extents
if t0 > ray.max_t || t1 <= 0.0 {
// Didn't hit because sphere is entirely outside of ray's extents
return;
}
let t = if t0 > 0.0 {
t0
} else if t1 <= ray.max_t {
t1
} else {
// Didn't hit because ray is entirely within the sphere, and
// therefore doesn't hit its surface.
return;
};
// We hit the sphere, so calculate intersection info.
if ray.is_occlusion() {
*isect = SurfaceIntersection::Occlude;
ray.mark_done();
} else {
// Position is calculated from the local-space ray and t, and then
// re-projected onto the surface of the sphere.
let t_pos = local_ray.orig + (local_ray.dir * t);
let unit_pos = t_pos.into_vector().normalized();
let pos = (unit_pos * radius).xform(space).into_point();
// TODO: proper error bounds.
let pos_err = 0.001;
let normal = unit_pos.into_normal().xform_fast(space);
let intersection_data = SurfaceIntersectionData {
incoming: ray.dir,
t: t,
pos: pos,
pos_err: pos_err,
nor: normal,
nor_g: normal,
local_space: *space,
sample_pdf: self.sample_pdf(
space,
ray.orig,
ray.dir,
0.0,
0.0,
ray.wavelength,
time,
),
};
// Get our final parametric values
let mut t0 = q / a;
let mut t1 = if q != 0.0 { c / q } else { rays.max_t(ray_idx) };
// Swap them so they are ordered right
if t0 > t1 {
use std::mem::swap;
swap(&mut t0, &mut t1);
}
// Check our intersection for validity against this ray's extents
if t0 > rays.max_t(ray_idx) || t1 <= 0.0 {
// Didn't hit because sphere is entirely outside of ray's extents
return;
}
let t = if t0 > 0.0 {
t0
} else if t1 <= rays.max_t(ray_idx) {
t1
} else {
// Didn't hit because ray is entirely within the sphere, and
// therefore doesn't hit its surface.
return;
let closure = {
let inv_surface_area = (1.0 / (4.0 * PI_64 * radius as f64 * radius as f64)) as f32;
let color = lerp_slice(self.colors, time) * inv_surface_area;
SurfaceClosure::Emit(color)
};
// We hit the sphere, so calculate intersection info.
if rays.is_occlusion(ray_idx) {
isects[ray_idx] = SurfaceIntersection::Occlude;
rays.mark_done(ray_idx);
} else {
let inv_xform = xform.inverse();
// Fill in intersection
*isect = SurfaceIntersection::Hit {
intersection_data: intersection_data,
closure: closure,
};
// Position is calculated from the local-space ray and t, and then
// re-projected onto the surface of the sphere.
let t_pos = orig + (dir * t);
let unit_pos = t_pos.normalized();
let pos = (unit_pos * radius * inv_xform).into_point();
// TODO: proper error bounds.
let pos_err = 0.001;
let normal = unit_pos.into_normal() * inv_xform;
let intersection_data = SurfaceIntersectionData {
incoming: rays.dir(ray_idx),
t: t,
pos: pos,
pos_err: pos_err,
nor: normal,
nor_g: normal,
local_space: xform,
sample_pdf: self.sample_pdf(
&xform,
rays.orig(ray_idx),
rays.dir(ray_idx),
0.0,
0.0,
rays.wavelength(ray_idx),
time,
),
};
let closure = {
let inv_surface_area =
(1.0 / (4.0 * PI_64 * radius as f64 * radius as f64)) as f32;
let color = lerp_slice(self.colors, time) * inv_surface_area;
SurfaceClosure::Emit(color)
};
// Fill in intersection
isects[ray_idx] = SurfaceIntersection::Hit {
intersection_data: intersection_data,
closure: closure,
};
// Set ray's max t
rays.set_max_t(ray_idx, t);
}
});
ray.max_t = t;
}
}
}


@ -22,8 +22,6 @@ mod boundable;
mod camera;
mod color;
mod fp_utils;
mod hash;
mod hilbert;
mod image;
mod lerp;
mod light;
@ -34,11 +32,13 @@ mod ray;
mod renderer;
mod sampling;
mod scene;
mod scramble;
mod shading;
mod space_fill;
mod surface;
mod timer;
mod tracer;
mod transform_stack;
// mod transform_stack;
use std::{fs::File, io, io::Read, mem, path::Path, str::FromStr};
@ -51,7 +51,6 @@ use crate::{
accel::BVH4Node,
bbox::BBox,
parse::{parse_scene, DataTree},
renderer::LightPath,
surface::SurfaceIntersection,
timer::Timer,
};
@ -89,11 +88,11 @@ fn main() {
}),
)
.arg(
Arg::with_name("max_bucket_samples")
Arg::with_name("bucket_size")
.short("b")
.long("spb")
.long("bucket_size")
.value_name("N")
.help("Target number of samples per bucket (determines bucket size)")
.help("Height and width of each render bucket in pixels.")
.takes_value(true)
.validator(|s| {
usize::from_str(&s)
@ -163,7 +162,6 @@ fn main() {
"SurfaceIntersection size: {} bytes",
mem::size_of::<SurfaceIntersection>()
);
println!("LightPath size: {} bytes", mem::size_of::<LightPath>());
println!("BBox size: {} bytes", mem::size_of::<BBox>());
// println!("BVHNode size: {} bytes", mem::size_of::<BVHNode>());
println!("BVH4Node size: {} bytes", mem::size_of::<BVH4Node>());
@ -259,12 +257,11 @@ fn main() {
r.spp = usize::from_str(spp).unwrap();
}
let max_samples_per_bucket =
if let Some(max_samples_per_bucket) = args.value_of("max_bucket_samples") {
u32::from_str(max_samples_per_bucket).unwrap()
} else {
4096
};
let bucket_size = if let Some(bucket_size) = args.value_of("bucket_size") {
u32::from_str(bucket_size).unwrap()
} else {
32
};
let thread_count = if let Some(threads) = args.value_of("threads") {
u32::from_str(threads).unwrap()
@ -280,7 +277,7 @@ fn main() {
println!("Rendering scene with {} threads...", thread_count);
}
let (mut image, rstats) = r.render(
max_samples_per_bucket,
bucket_size,
crop,
thread_count,
args.is_present("serialized_output"),
@ -288,30 +285,9 @@ fn main() {
// Print render stats
if !args.is_present("serialized_output") {
let rtime = t.tick();
let ntime = rtime as f64 / rstats.total_time;
println!("\tRendered scene in {:.3}s", rtime);
println!(
"\t\tTrace: {:.3}s",
ntime * rstats.trace_time
);
println!("\t\t\tRays traced: {}", rstats.ray_count);
println!(
"\t\t\tRays/sec: {}",
(rstats.ray_count as f64 / (ntime * rstats.trace_time) as f64) as u64
);
println!("\t\t\tRay/node tests: {}", rstats.accel_node_visits);
println!(
"\t\tInitial ray generation: {:.3}s",
ntime * rstats.initial_ray_generation_time
);
println!(
"\t\tRay generation: {:.3}s",
ntime * rstats.ray_generation_time
);
println!(
"\t\tSample writing: {:.3}s",
ntime * rstats.sample_writing_time
);
}
// Write to disk


@ -2,18 +2,78 @@
use std::f32;
pub use math3d::{cross, dot, CrossProduct, DotProduct, Normal, Point, Transform, Vector};
pub use rmath::{
cross, cross_fast, dot, dot_fast, wide4::Float4, AsXform, CrossProduct, DotProduct, Normal,
Point, Vector, Xform, XformFull,
};
/// Clamps a value between a min and max.
pub fn clamp<T: PartialOrd>(v: T, lower: T, upper: T) -> T {
if v < lower {
lower
} else if v > upper {
upper
} else {
v
}
}
// The stdlib min function is slower than a simple if statement for some reason.
pub fn fast_minf32(a: f32, b: f32) -> f32 {
if a < b {
a
} else {
b
}
}
// The stdlib max function is slower than a simple if statement for some reason.
pub fn fast_maxf32(a: f32, b: f32) -> f32 {
if a > b {
a
} else {
b
}
}
/// Rounds an integer up to the next power of two.
pub fn upper_power_of_two(mut v: u32) -> u32 {
v -= 1;
v |= v >> 1;
v |= v >> 2;
v |= v >> 4;
v |= v >> 8;
v |= v >> 16;
v + 1
}
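A quick sanity check of the bit-smear trick above (illustrative only, not part of this changeset): or-ing in each right shift smears the highest set bit into every lower position, so the final increment lands on the next power of two, and exact powers of two map to themselves. The initial `v -= 1` means the function expects `v > 0`.
#[test]
fn upper_power_of_two_examples() {
    assert_eq!(upper_power_of_two(1), 1);
    assert_eq!(upper_power_of_two(5), 8);
    assert_eq!(upper_power_of_two(8), 8);
    assert_eq!(upper_power_of_two(9), 16);
}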
/// Gets the log base 2 of the given integer
pub fn log2_64(n: u64) -> u64 {
// This works by finding the largest non-zero binary digit in the
// number. Its bit position is then the log2 of the integer.
pub fn log2_64(mut value: u64) -> u64 {
// This works by doing a binary search for the largest non-zero binary
// digit in the number. Its bit position is then the log2 of the integer.
if n == 0 {
0
} else {
(63 - n.leading_zeros()) as u64
let mut log = 0;
const POWERS: [(u64, u64); 6] = [
(32, (1 << 32) - 1),
(16, (1 << 16) - 1),
(8, (1 << 8) - 1),
(4, (1 << 4) - 1),
(2, (1 << 2) - 1),
(1, (1 << 1) - 1),
];
for &(i, j) in &POWERS {
let tmp = value >> i;
if tmp != 0 {
log += i;
value = tmp;
} else {
value &= j;
}
}
log
}
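For reference, the new binary-search formulation still computes floor(log2) and should agree with the old `leading_zeros` version for nonzero inputs. A quick check (mine, not in the diff):
#[test]
fn log2_64_matches_leading_zeros() {
    for &n in &[1u64, 2, 3, 1024, u64::MAX] {
        assert_eq!(log2_64(n), (63 - n.leading_zeros()) as u64);
    }
    // Both versions return 0 for an input of 0 by convention.
    assert_eq!(log2_64(0), 0);
}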
/// Creates a coordinate system from a single vector.


@ -6,14 +6,10 @@ use nom::{combinator::all_consuming, sequence::tuple, IResult};
use kioku::Arena;
use color::rec709_e_to_xyz;
use crate::{
camera::Camera,
color::{rec709_e_to_xyz, Color},
light::WorldLightSource,
math::Transform,
renderer::Renderer,
scene::Scene,
scene::World,
camera::Camera, color::Color, math::Xform, renderer::Renderer, scene::Scene, scene::World,
};
use super::{
@ -153,13 +149,10 @@ pub fn parse_scene<'a>(
)?;
// Parse camera
let camera = parse_camera(
arena,
tree.iter_children_with_type("Camera").nth(0).unwrap(),
)?;
let camera = parse_camera(tree.iter_children_with_type("Camera").nth(0).unwrap())?;
// Parse world
let world = parse_world(arena, tree.iter_children_with_type("World").nth(0).unwrap())?;
let world = parse_world(tree.iter_children_with_type("World").nth(0).unwrap())?;
// Parse root scene assembly
let assembly = parse_assembly(
@ -350,7 +343,7 @@ fn parse_render_settings(tree: &DataTree) -> Result<((u32, u32), u32, u32), PsyP
};
}
fn parse_camera<'a>(arena: &'a Arena, tree: &'a DataTree) -> Result<Camera<'a>, PsyParseError> {
fn parse_camera<'a>(tree: &'a DataTree) -> Result<Camera, PsyParseError> {
if let DataTree::Internal { ref children, .. } = *tree {
let mut mats = Vec::new();
let mut fovs = Vec::new();
@ -435,13 +428,7 @@ fn parse_camera<'a>(arena: &'a Arena, tree: &'a DataTree) -> Result<Camera<'a>,
}
}
return Ok(Camera::new(
arena,
&mats,
&fovs,
&aperture_radii,
&focus_distances,
));
return Ok(Camera::new(&mats, &fovs, &aperture_radii, &focus_distances));
} else {
return Err(PsyParseError::ExpectedInternalNode(
tree.byte_offset(),
@ -452,10 +439,10 @@ fn parse_camera<'a>(arena: &'a Arena, tree: &'a DataTree) -> Result<Camera<'a>,
}
}
fn parse_world<'a>(arena: &'a Arena, tree: &'a DataTree) -> Result<World<'a>, PsyParseError> {
fn parse_world(tree: &DataTree) -> Result<World, PsyParseError> {
if tree.is_internal() {
let background_color;
let mut lights: Vec<&dyn WorldLightSource> = Vec::new();
let mut lights: Vec<_> = Vec::new();
// Parse background shader
let bgs = {
@ -531,7 +518,7 @@ fn parse_world<'a>(arena: &'a Arena, tree: &'a DataTree) -> Result<World<'a>, Ps
for child in tree.iter_children() {
match *child {
DataTree::Internal { type_name, .. } if type_name == "DistantDiskLight" => {
lights.push(arena.alloc(parse_distant_disk_light(arena, child)?));
lights.push(parse_distant_disk_light(child)?);
}
_ => {}
@ -541,7 +528,7 @@ fn parse_world<'a>(arena: &'a Arena, tree: &'a DataTree) -> Result<World<'a>, Ps
// Build and return the world
return Ok(World {
background_color: background_color,
lights: arena.copy_slice(&lights),
lights: lights,
});
} else {
return Err(PsyParseError::ExpectedInternalNode(
@ -553,17 +540,17 @@ fn parse_world<'a>(arena: &'a Arena, tree: &'a DataTree) -> Result<World<'a>, Ps
}
}
pub fn parse_matrix(contents: &str) -> Result<Transform, PsyParseError> {
pub fn parse_matrix(contents: &str) -> Result<Xform, PsyParseError> {
if let IResult::Ok((leftover, ns)) = all_consuming(tuple((
ws_f32, ws_f32, ws_f32, ws_f32, ws_f32, ws_f32, ws_f32, ws_f32, ws_f32, ws_f32, ws_f32,
ws_f32, ws_f32, ws_f32, ws_f32, ws_f32,
)))(contents)
{
if leftover.is_empty() {
return Ok(Transform::new_from_values(
return Ok(Xform::new(
// We throw away the last row, since it's not necessarily affine.
// TODO: is there a more correct way to handle this?
ns.0, ns.4, ns.8, ns.12, ns.1, ns.5, ns.9, ns.13, ns.2, ns.6, ns.10, ns.14,
ns.0, ns.1, ns.2, ns.4, ns.5, ns.6, ns.8, ns.9, ns.10, ns.12, ns.13, ns.14,
));
}
}


@ -50,23 +50,6 @@ pub fn parse_assembly<'a>(
child.iter_leaf_children_with_type("Data").nth(0).unwrap().1
};
// Get surface shader binding, if any.
let surface_shader_name = if child
.iter_leaf_children_with_type("SurfaceShaderBind")
.count()
> 0
{
Some(
child
.iter_leaf_children_with_type("SurfaceShaderBind")
.nth(0)
.unwrap()
.1,
)
} else {
None
};
// Get xforms
let mut xforms = Vec::new();
for (_, contents, _) in child.iter_leaf_children_with_type("Transform") {
@ -75,7 +58,7 @@ pub fn parse_assembly<'a>(
// Add instance
if builder.name_exists(name) {
builder.add_instance(name, surface_shader_name, Some(&xforms));
builder.add_instance(name, Some(&xforms));
} else {
return Err(PsyParseError::InstancedMissingData(
child.iter_leaf_children_with_type("Data").nth(0).unwrap().2,
@ -113,7 +96,11 @@ pub fn parse_assembly<'a>(
{
builder.add_object(
ident,
Object::Surface(arena.alloc(parse_mesh_surface(arena, child)?)),
Object::Surface(arena.alloc(parse_mesh_surface(
arena,
child,
&builder.surface_shader_map,
)?)),
);
} else {
// TODO: error condition of some kind, because no ident


@ -17,10 +17,7 @@ use super::{
DataTree,
};
pub fn parse_distant_disk_light<'a>(
arena: &'a Arena,
tree: &'a DataTree,
) -> Result<DistantDiskLight<'a>, PsyParseError> {
pub fn parse_distant_disk_light<'a>(tree: &'a DataTree) -> Result<DistantDiskLight, PsyParseError> {
if let DataTree::Internal { ref children, .. } = *tree {
let mut radii = Vec::new();
let mut directions = Vec::new();
@ -77,7 +74,7 @@ pub fn parse_distant_disk_light<'a>(
}
}
return Ok(DistantDiskLight::new(arena, &radii, &directions, &colors));
return Ok(DistantDiskLight::new(&radii, &directions, &colors));
} else {
return Err(PsyParseError::UnknownError(tree.byte_offset()));
}


@ -1,6 +1,6 @@
#![allow(dead_code)]
use std::result::Result;
use std::{collections::HashMap, result::Result};
use nom::{sequence::tuple, IResult};
@ -27,7 +27,9 @@ use super::{
pub fn parse_mesh_surface<'a>(
arena: &'a Arena,
tree: &'a DataTree,
surface_shader_map: &HashMap<String, usize>,
) -> Result<TriangleMesh<'a>, PsyParseError> {
let mut shader_idx = None;
let mut verts = Vec::new(); // Vec of vecs, one for each time sample
let mut normals = Vec::new(); // Vec of vecs, one for each time sample
let mut face_vert_counts = Vec::new();
@ -36,6 +38,20 @@ pub fn parse_mesh_surface<'a>(
// TODO: make sure there are the right number of various children,
// and other validation.
// Get surface shader binding, if any.
if tree
.iter_leaf_children_with_type("SurfaceShaderBind")
.count()
> 0
{
let name = tree
.iter_leaf_children_with_type("SurfaceShaderBind")
.nth(0)
.unwrap()
.1;
shader_idx = surface_shader_map.get(name).map(|i| *i);
}
// Get verts
for (_, mut text, _) in tree.iter_leaf_children_with_type("Vertices") {
// Collect verts for this time sample
@ -116,6 +132,7 @@ pub fn parse_mesh_surface<'a>(
Ok(TriangleMesh::from_verts_and_indices(
arena,
shader_idx,
&verts,
&if normals.is_empty() {
None


@ -1,16 +1,11 @@
#![allow(dead_code)]
use glam::BVec4A;
use crate::math::{Point, Vector, XformFull};
use crate::math::{Point, Transform, Vector};
type RayIndexType = u16;
type FlagType = u8;
const OCCLUSION_FLAG: FlagType = 1;
const DONE_FLAG: FlagType = 1 << 1;
/// This is never used directly in ray tracing--it's only used as a convenience
/// for filling the RayBatch structure.
#[derive(Debug, Copy, Clone)]
pub struct Ray {
pub orig: Point,
@ -18,384 +13,85 @@ pub struct Ray {
pub time: f32,
pub wavelength: f32,
pub max_t: f32,
pub flags: FlagType,
}
/// The hot (frequently accessed) parts of ray data.
/// A specifically local-space ray, for passing to functions when we've
/// already calculated the local-space version of a ray for the object
/// in question.
///
/// Also includes `dir_inv`, which is generally useful to have as well.
#[derive(Debug, Copy, Clone)]
struct RayHot {
orig_local: Point, // Local-space ray origin
dir_inv_local: Vector, // Local-space 1.0/ray direction
max_t: f32,
time: f32,
flags: FlagType,
pub struct LocalRay {
pub orig: Point,
pub dir: Vector,
pub dir_inv: Vector,
}
/// The cold (infrequently accessed) parts of ray data.
#[derive(Debug, Copy, Clone)]
struct RayCold {
orig: Point, // World-space ray origin
dir: Vector, // World-space ray direction
wavelength: f32,
}
/// A batch of rays, separated into hot and cold parts.
#[derive(Debug)]
pub struct RayBatch {
hot: Vec<RayHot>,
cold: Vec<RayCold>,
}
impl RayBatch {
/// Creates a new empty ray batch.
pub fn new() -> RayBatch {
RayBatch {
hot: Vec::new(),
cold: Vec::new(),
}
}
/// Creates a new empty ray batch, with pre-allocated capacity for
/// `n` rays.
pub fn with_capacity(n: usize) -> RayBatch {
RayBatch {
hot: Vec::with_capacity(n),
cold: Vec::with_capacity(n),
}
}
pub fn push(&mut self, ray: Ray, is_occlusion: bool) {
self.hot.push(RayHot {
orig_local: ray.orig, // Bogus, to place-hold.
dir_inv_local: ray.dir, // Bogus, to place-hold.
max_t: ray.max_t,
time: ray.time,
impl Ray {
pub fn new(
orig: Point,
dir: Vector,
time: f32,
wavelength: f32,
max_t: f32,
is_occlusion: bool,
) -> Self {
Self {
orig: orig,
dir: dir,
time: time,
wavelength: wavelength,
max_t: max_t,
flags: if is_occlusion { OCCLUSION_FLAG } else { 0 },
});
self.cold.push(RayCold {
orig: ray.orig,
dir: ray.dir,
wavelength: ray.wavelength,
});
}
}
pub fn swap(&mut self, a: usize, b: usize) {
self.hot.swap(a, b);
self.cold.swap(a, b);
/// Creates a local ray from the given transform.
pub fn to_local_xform(&self, xform: &XformFull) -> LocalRay {
let orig = self.orig.xform_inv(xform);
let dir = self.dir.xform_inv(xform);
LocalRay {
orig: orig,
dir: dir,
dir_inv: dir.recip(),
}
}
pub fn set_from_ray(&mut self, ray: &Ray, is_occlusion: bool, idx: usize) {
self.hot[idx].orig_local = ray.orig;
self.hot[idx].dir_inv_local = Vector {
co: ray.dir.co.recip(),
};
self.hot[idx].max_t = ray.max_t;
self.hot[idx].time = ray.time;
self.hot[idx].flags = if is_occlusion { OCCLUSION_FLAG } else { 0 };
self.cold[idx].orig = ray.orig;
self.cold[idx].dir = ray.dir;
self.cold[idx].wavelength = ray.wavelength;
/// Creates a local ray with no transform applied.
pub fn to_local(&self) -> LocalRay {
LocalRay {
orig: self.orig,
dir: self.dir,
dir_inv: self.dir.recip(),
}
}
pub fn truncate(&mut self, len: usize) {
self.hot.truncate(len);
self.cold.truncate(len);
}
/// Clear all rays, settings the size of the batch back to zero.
///
/// Capacity is maintained.
pub fn clear(&mut self) {
self.hot.clear();
self.cold.clear();
}
pub fn len(&self) -> usize {
self.hot.len()
}
/// Updates the accel data of the given ray (at index `idx`) with the
/// given world-to-local-space transform matrix.
///
/// This should be called when entering (and exiting) traversal of a
/// new transform space.
pub fn update_local(&mut self, idx: usize, xform: &Transform) {
self.hot[idx].orig_local = self.cold[idx].orig * *xform;
self.hot[idx].dir_inv_local = Vector {
co: (self.cold[idx].dir * *xform).co.recip(),
};
}
//==========================================================
// Data access
//---------------------------------------------------------
// Flags.
/// Returns whether this is an occlusion ray.
#[inline(always)]
pub fn orig(&self, idx: usize) -> Point {
self.cold[idx].orig
pub fn is_occlusion(&self) -> bool {
(self.flags & OCCLUSION_FLAG) != 0
}
/// Returns whether this ray has finished traversal.
#[inline(always)]
pub fn dir(&self, idx: usize) -> Vector {
self.cold[idx].dir
pub fn is_done(&self) -> bool {
(self.flags & DONE_FLAG) != 0
}
/// Marks this as an occlusion ray.
#[inline(always)]
pub fn orig_local(&self, idx: usize) -> Point {
self.hot[idx].orig_local
pub fn mark_occlusion(&mut self) {
self.flags |= OCCLUSION_FLAG
}
/// Marks this as having finished traversal.
#[inline(always)]
pub fn dir_inv_local(&self, idx: usize) -> Vector {
self.hot[idx].dir_inv_local
}
#[inline(always)]
pub fn time(&self, idx: usize) -> f32 {
self.hot[idx].time
}
#[inline(always)]
pub fn max_t(&self, idx: usize) -> f32 {
self.hot[idx].max_t
}
#[inline(always)]
pub fn set_max_t(&mut self, idx: usize, new_max_t: f32) {
self.hot[idx].max_t = new_max_t;
}
#[inline(always)]
pub fn wavelength(&self, idx: usize) -> f32 {
self.cold[idx].wavelength
}
/// Returns whether the given ray (at index `idx`) is an occlusion ray.
#[inline(always)]
pub fn is_occlusion(&self, idx: usize) -> bool {
(self.hot[idx].flags & OCCLUSION_FLAG) != 0
}
/// Returns whether the given ray (at index `idx`) has finished traversal.
#[inline(always)]
pub fn is_done(&self, idx: usize) -> bool {
(self.hot[idx].flags & DONE_FLAG) != 0
}
/// Marks the given ray (at index `idx`) as an occlusion ray.
#[inline(always)]
pub fn mark_occlusion(&mut self, idx: usize) {
self.hot[idx].flags |= OCCLUSION_FLAG
}
/// Marks the given ray (at index `idx`) as having finished traversal.
#[inline(always)]
pub fn mark_done(&mut self, idx: usize) {
self.hot[idx].flags |= DONE_FLAG
pub fn mark_done(&mut self) {
self.flags |= DONE_FLAG
}
}
/// A structure used for tracking traversal of a ray batch through a scene.
#[derive(Debug)]
pub struct RayStack {
lanes: Vec<Lane>,
tasks: Vec<RayTask>,
}
impl RayStack {
pub fn new() -> RayStack {
RayStack {
lanes: Vec::new(),
tasks: Vec::new(),
}
}
/// Returns whether the stack is empty of tasks or not.
pub fn is_empty(&self) -> bool {
self.tasks.is_empty()
}
/// Makes sure there are at least `count` lanes.
pub fn ensure_lane_count(&mut self, count: usize) {
while self.lanes.len() < count {
self.lanes.push(Lane {
idxs: Vec::new(),
end_len: 0,
})
}
}
pub fn ray_count_in_next_task(&self) -> usize {
let task = self.tasks.last().unwrap();
let end = self.lanes[task.lane].end_len;
end - task.start_idx
}
pub fn next_task_ray_idx(&self, i: usize) -> usize {
let task = self.tasks.last().unwrap();
let i = i + task.start_idx;
debug_assert!(i < self.lanes[task.lane].end_len);
self.lanes[task.lane].idxs[i] as usize
}
/// Clears the lanes and tasks of the RayStack.
///
/// Note: this is (importantly) different than calling clear individually
/// on the `lanes` and `tasks` members. Specifically, we don't want to
/// clear `lanes` itself, as that would also free all the memory of the
/// individual lanes. Instead, we want to iterate over the individual
/// lanes and clear them, but leave `lanes` itself untouched.
pub fn clear(&mut self) {
for lane in self.lanes.iter_mut() {
lane.idxs.clear();
lane.end_len = 0;
}
self.tasks.clear();
}
/// Pushes the given ray index onto the end of the specified lane.
pub fn push_ray_index(&mut self, ray_idx: usize, lane: usize) {
assert!(self.lanes.len() > lane);
self.lanes[lane].idxs.push(ray_idx as RayIndexType);
}
/// Pushes any excess indices on the given lane to a new task on the
/// task stack.
///
/// Returns whether a task was pushed or not. No task will be pushed
/// if there are no excess indices on the end of the lane.
pub fn push_lane_to_task(&mut self, lane_idx: usize) -> bool {
if self.lanes[lane_idx].end_len < self.lanes[lane_idx].idxs.len() {
self.tasks.push(RayTask {
lane: lane_idx,
start_idx: self.lanes[lane_idx].end_len,
});
self.lanes[lane_idx].end_len = self.lanes[lane_idx].idxs.len();
true
} else {
false
}
}
/// Takes the given list of lane indices, and pushes any excess indices on
/// the end of each into a new task, in the order provided.
pub fn push_lanes_to_tasks(&mut self, lane_idxs: &[usize]) {
for &l in lane_idxs {
self.push_lane_to_task(l);
}
}
pub fn duplicate_next_task(&mut self) {
let task = self.tasks.last().unwrap();
let l = task.lane;
let start = task.start_idx;
let end = self.lanes[l].end_len;
// Extend the indices vector
self.lanes[l].idxs.reserve(end - start);
let old_len = self.lanes[l].idxs.len();
let new_len = old_len + end - start;
unsafe {
self.lanes[l].idxs.set_len(new_len);
}
// Copy elements
copy_in_place::copy_in_place(&mut self.lanes[l].idxs, start..end, end);
// Push the new task onto the stack
self.tasks.push(RayTask {
lane: l,
start_idx: end,
});
self.lanes[l].end_len = self.lanes[l].idxs.len();
}
// Pops the next task off the stack.
pub fn pop_task(&mut self) {
let task = self.tasks.pop().unwrap();
self.lanes[task.lane].end_len = task.start_idx;
self.lanes[task.lane].idxs.truncate(task.start_idx);
}
// Executes a task without popping it from the task stack.
pub fn do_next_task<F>(&mut self, mut handle_ray: F)
where
F: FnMut(usize),
{
let task = self.tasks.last().unwrap();
let task_range = (task.start_idx, self.lanes[task.lane].end_len);
// Execute task.
for i in task_range.0..task_range.1 {
let ray_idx = self.lanes[task.lane].idxs[i];
handle_ray(ray_idx as usize);
}
}
/// Pops the next task off the stack, and executes the provided closure for
/// each ray index in the task.
#[inline(always)]
pub fn pop_do_next_task<F>(&mut self, handle_ray: F)
where
F: FnMut(usize),
{
self.do_next_task(handle_ray);
self.pop_task();
}
/// Pops the next task off the stack, executes the provided closure for
/// each ray index in the task, and pushes the ray indices back onto the
/// indicated lanes.
pub fn pop_do_next_task_and_push_rays<F>(&mut self, output_lane_count: usize, mut handle_ray: F)
where
F: FnMut(usize) -> BVec4A,
{
// Pop the task and do necessary bookkeeping.
let task = self.tasks.pop().unwrap();
let task_range = (task.start_idx, self.lanes[task.lane].end_len);
self.lanes[task.lane].end_len = task.start_idx;
// SAFETY: this is probably evil, and depends on behavior of Vec that
// is not actually promised. We're essentially truncating the lane
// to the start of our task range but will continue to access its
// elements beyond that range via `get_unchecked()` below. Because the
// memory is not freed nor altered, this is safe. However, again, the
// Vec apis don't promise this behavior. So:
//
// TODO: build a slightly different lane abstraction to get this same
// efficiency without depending on implicit Vec behavior.
unsafe {
self.lanes[task.lane].idxs.set_len(task.start_idx);
}
// Execute task.
for i in task_range.0..task_range.1 {
let ray_idx = *unsafe { self.lanes[task.lane].idxs.get_unchecked(i) };
let push_mask = handle_ray(ray_idx as usize).bitmask();
for l in 0..output_lane_count {
if (push_mask & (1 << l)) != 0 {
self.lanes[l as usize].idxs.push(ray_idx);
}
}
}
}
}
/// A lane within a RayStack.
#[derive(Debug)]
struct Lane {
idxs: Vec<RayIndexType>,
end_len: usize,
}
/// A task within a RayStack.
//
// Specifies the lane that the relevant ray pointers are in, and the
// starting index within that lane. The relevant pointers are always
// `&[start_idx..]` within the given lane.
#[derive(Debug)]
struct RayTask {
lane: usize,
start_idx: usize,
}

File diff suppressed because it is too large


@ -2,7 +2,7 @@
use std::{f32::consts::FRAC_PI_4 as QPI_32, f32::consts::PI as PI_32, f64::consts::PI as PI_64};
use crate::math::{cross, dot, Point, Vector};
use crate::math::{cross_fast, dot_fast, Point, Vector};
/// Maps the unit square to the unit circle.
/// NOTE: x and y should be distributed within [-1, 1],
@ -90,7 +90,7 @@ pub fn uniform_sample_triangle(va: Vector, vb: Vector, vc: Vector, i: f32, j: f3
/// Calculates the surface area of a triangle.
pub fn triangle_surface_area(p0: Point, p1: Point, p2: Point) -> f32 {
0.5 * cross(p1 - p0, p2 - p0).length()
0.5 * cross_fast(p1 - p0, p2 - p0).length()
}
/// Calculates the projected solid angle of a spherical triangle.
@ -98,9 +98,9 @@ pub fn triangle_surface_area(p0: Point, p1: Point, p2: Point) -> f32 {
/// A, B, and C are the points of the triangle on a unit sphere.
pub fn spherical_triangle_solid_angle(va: Vector, vb: Vector, vc: Vector) -> f32 {
// Calculate sines and cosines of the spherical triangle's edge lengths
let cos_a: f64 = dot(vb, vc).max(-1.0).min(1.0) as f64;
let cos_b: f64 = dot(vc, va).max(-1.0).min(1.0) as f64;
let cos_c: f64 = dot(va, vb).max(-1.0).min(1.0) as f64;
let cos_a: f64 = dot_fast(vb, vc).max(-1.0).min(1.0) as f64;
let cos_b: f64 = dot_fast(vc, va).max(-1.0).min(1.0) as f64;
let cos_c: f64 = dot_fast(va, vb).max(-1.0).min(1.0) as f64;
let sin_a: f64 = (1.0 - (cos_a * cos_a)).sqrt();
let sin_b: f64 = (1.0 - (cos_b * cos_b)).sqrt();
let sin_c: f64 = (1.0 - (cos_c * cos_c)).sqrt();
@ -141,9 +141,9 @@ pub fn uniform_sample_spherical_triangle(
j: f32,
) -> Vector {
// Calculate sines and cosines of the spherical triangle's edge lengths
let cos_a: f64 = dot(vb, vc).max(-1.0).min(1.0) as f64;
let cos_b: f64 = dot(vc, va).max(-1.0).min(1.0) as f64;
let cos_c: f64 = dot(va, vb).max(-1.0).min(1.0) as f64;
let cos_a: f64 = dot_fast(vb, vc).max(-1.0).min(1.0) as f64;
let cos_b: f64 = dot_fast(vc, va).max(-1.0).min(1.0) as f64;
let cos_c: f64 = dot_fast(va, vb).max(-1.0).min(1.0) as f64;
let sin_a: f64 = (1.0 - (cos_a * cos_a)).sqrt();
let sin_b: f64 = (1.0 - (cos_b * cos_b)).sqrt();
let sin_c: f64 = (1.0 - (cos_c * cos_c)).sqrt();
@ -191,10 +191,10 @@ pub fn uniform_sample_spherical_triangle(
let q_bottom = ((v * s) + (u * t)) * sin_va;
let q = q_top / q_bottom;
let vc_2 =
(va * q as f32) + ((vc - (va * dot(vc, va))).normalized() * (1.0 - (q * q)).sqrt() as f32);
let vc_2 = (va * q as f32)
+ ((vc - (va * dot_fast(vc, va))).normalized() * (1.0 - (q * q)).sqrt() as f32);
let z = 1.0 - (j * (1.0 - dot(vc_2, vb)));
let z = 1.0 - (j * (1.0 - dot_fast(vc_2, vb)));
(vb * z) + ((vc_2 - (vb * dot(vc_2, vb))).normalized() * (1.0 - (z * z)).sqrt())
(vb * z) + ((vc_2 - (vb * dot_fast(vc_2, vb))).normalized() * (1.0 - (z * z)).sqrt())
}


@ -10,10 +10,9 @@ use crate::{
color::SpectralSample,
lerp::lerp_slice,
light::SurfaceLight,
math::{Normal, Point, Transform},
math::{Normal, Point, Xform, XformFull},
shading::SurfaceShader,
surface::{Surface, SurfaceIntersection},
transform_stack::TransformStack,
};
#[derive(Copy, Clone, Debug)]
@ -21,7 +20,7 @@ pub struct Assembly<'a> {
// Instance list
pub instances: &'a [Instance],
pub light_instances: &'a [Instance],
pub xforms: &'a [Transform],
pub xforms: &'a [Xform],
// Surface shader list
pub surface_shaders: &'a [&'a dyn SurfaceShader],
@ -45,11 +44,11 @@ impl<'a> Assembly<'a> {
// Returns (light_color, (sample_point, normal, point_err), pdf, selection_pdf)
pub fn sample_lights(
&self,
xform_stack: &mut TransformStack,
n: f32,
uvw: (f32, f32, f32),
wavelength: f32,
time: f32,
space: &XformFull,
intr: &SurfaceIntersection,
) -> Option<(SpectralSample, (Point, Normal, f32), f32, f32)> {
if let SurfaceIntersection::Hit {
@ -57,46 +56,44 @@ impl<'a> Assembly<'a> {
closure,
} = *intr
{
let sel_xform = if !xform_stack.top().is_empty() {
lerp_slice(xform_stack.top(), time)
} else {
Transform::new()
};
if let Some((light_i, sel_pdf, whittled_n)) = self.light_accel.select(
idata.incoming * sel_xform,
idata.pos * sel_xform,
idata.nor * sel_xform,
idata.nor_g * sel_xform,
idata.incoming.xform_inv(space),
idata.pos.xform_inv(space),
idata.nor.xform_inv_fast(space),
idata.nor_g.xform_inv_fast(space),
&closure,
time,
n,
) {
let inst = self.light_instances[light_i];
// Handle transforms.
let local_space = if let Some((a, b)) = inst.transform_indices {
if let Some(new_space) = lerp_slice(&self.xforms[a..b], time)
.compose(&space.fwd)
.to_full()
{
new_space
} else {
// Invalid transform. Give up.
return None;
}
} else {
*space
};
match inst.instance_type {
InstanceType::Object => {
match self.objects[inst.data_index] {
Object::SurfaceLight(light) => {
// Get the world-to-object space transform of the light
let xform = if let Some((a, b)) = inst.transform_indices {
let pxforms = xform_stack.top();
let xform = lerp_slice(&self.xforms[a..b], time);
if !pxforms.is_empty() {
lerp_slice(pxforms, time) * xform
} else {
xform
}
} else {
let pxforms = xform_stack.top();
if !pxforms.is_empty() {
lerp_slice(pxforms, time)
} else {
Transform::new()
}
};
// Sample the light
let (color, sample_geo, pdf) = light.sample_from_point(
&xform, idata.pos, uvw.0, uvw.1, wavelength, time,
&local_space,
idata.pos,
uvw.0,
uvw.1,
wavelength,
time,
);
return Some((color, sample_geo, pdf, sel_pdf));
}
@ -106,27 +103,16 @@ impl<'a> Assembly<'a> {
}
InstanceType::Assembly => {
// Push the world-to-object space transforms of the assembly onto
// the transform stack.
if let Some((a, b)) = inst.transform_indices {
xform_stack.push(&self.xforms[a..b]);
}
// Sample sub-assembly lights
let sample = self.assemblies[inst.data_index].sample_lights(
xform_stack,
whittled_n,
uvw,
wavelength,
time,
&local_space,
intr,
);
// Pop the assembly's transforms off the transform stack.
if inst.transform_indices.is_some() {
xform_stack.pop();
}
// Return sample
return sample.map(|(ss, v, pdf, spdf)| (ss, v, pdf, spdf * sel_pdf));
}
@ -152,11 +138,11 @@ pub struct AssemblyBuilder<'a> {
// Instance list
instances: Vec<Instance>,
xforms: Vec<Transform>,
xforms: Vec<Xform>,
// Shader list
surface_shaders: Vec<&'a dyn SurfaceShader>,
surface_shader_map: HashMap<String, usize>, // map Name -> Index
pub surface_shader_map: HashMap<String, usize>, // map Name -> Index
// Object list
objects: Vec<Object<'a>>,
@ -220,12 +206,7 @@ impl<'a> AssemblyBuilder<'a> {
self.assemblies.push(asmb);
}
pub fn add_instance(
&mut self,
name: &str,
surface_shader_name: Option<&str>,
xforms: Option<&[Transform]>,
) {
pub fn add_instance(&mut self, name: &str, xforms: Option<&[Xform]>) {
// Make sure name exists
if !self.name_exists(name) {
panic!("Attempted to add instance with a name that doesn't exist.");
@ -247,12 +228,6 @@ impl<'a> AssemblyBuilder<'a> {
Instance {
instance_type: InstanceType::Object,
data_index: self.object_map[name],
surface_shader_index: surface_shader_name.map(|name| {
*self
.surface_shader_map
.get(name)
.unwrap_or_else(|| panic!("Unknown surface shader '{}'.", name))
}),
id: self.instances.len(),
transform_indices: xforms
.map(|xf| (self.xforms.len(), self.xforms.len() + xf.len())),
@ -261,12 +236,6 @@ impl<'a> AssemblyBuilder<'a> {
Instance {
instance_type: InstanceType::Assembly,
data_index: self.assembly_map[name],
surface_shader_index: surface_shader_name.map(|name| {
*self
.surface_shader_map
.get(name)
.unwrap_or_else(|| panic!("Unknown surface shader '{}'.", name))
}),
id: self.instances.len(),
transform_indices: xforms
.map(|xf| (self.xforms.len(), self.xforms.len() + xf.len())),
@ -405,7 +374,6 @@ pub enum Object<'a> {
pub struct Instance {
pub instance_type: InstanceType,
pub data_index: usize,
pub surface_shader_index: Option<usize>,
pub id: usize,
pub transform_indices: Option<(usize, usize)>,
}


@ -6,9 +6,9 @@ use crate::{
algorithm::weighted_choice,
camera::Camera,
color::SpectralSample,
math::{Normal, Point, Vector},
light::WorldLightSource,
math::{Normal, Point, Vector, XformFull},
surface::SurfaceIntersection,
transform_stack::TransformStack,
};
pub use self::{
@ -19,19 +19,19 @@ pub use self::{
#[derive(Debug)]
pub struct Scene<'a> {
pub name: Option<String>,
pub camera: Camera<'a>,
pub world: World<'a>,
pub camera: Camera,
pub world: World,
pub root: Assembly<'a>,
}
impl<'a> Scene<'a> {
pub fn sample_lights(
&self,
xform_stack: &mut TransformStack,
n: f32,
uvw: (f32, f32, f32),
wavelength: f32,
time: f32,
space: &XformFull,
intr: &SurfaceIntersection,
) -> SceneLightSample {
// TODO: this just selects between world lights and local lights
@ -68,7 +68,7 @@ impl<'a> Scene<'a> {
if n < wl_prob {
// World lights
let n = n / wl_prob;
let (i, p) = weighted_choice(self.world.lights, n, |l| l.approximate_energy());
let (i, p) = weighted_choice(&self.world.lights, n, |l| l.approximate_energy());
let (ss, sv, pdf) =
self.world.lights[i].sample_from_point(uvw.0, uvw.1, wavelength, time);
return SceneLightSample::Distant {
@ -81,9 +81,9 @@ impl<'a> Scene<'a> {
// Local lights
let n = (n - wl_prob) / (1.0 - wl_prob);
if let Some((ss, sgeo, pdf, spdf)) =
self.root
.sample_lights(xform_stack, n, uvw, wavelength, time, intr)
if let Some((ss, sgeo, pdf, spdf)) = self
.root
.sample_lights(n, uvw, wavelength, time, space, intr)
{
return SceneLightSample::Surface {
color: ss,


@ -1,7 +1,7 @@
use crate::{color::Color, light::WorldLightSource};
use crate::{color::Color, light::DistantDiskLight};
#[derive(Debug)]
pub struct World<'a> {
pub struct World {
pub background_color: Color,
pub lights: &'a [&'a dyn WorldLightSource],
pub lights: Vec<DistantDiskLight>,
}

src/scramble.rs (new file, 101 lines)

@ -0,0 +1,101 @@
#![allow(dead_code)]
/// Performs a base-2 Owen scramble on an integer.
pub fn owen2(n: u32, seed: u32) -> u32 {
// Multiply by a large random prime and xor by a random number.
// This is to ensure that the seed doesn't behave poorly with
// e.g. incrementing parameters, and also that zero doesn't
// map to zero in the hash function.
let seed = seed.wrapping_mul(0x68318d2f) ^ 0x5adbc2a7;
let mut result = n;
for i in 0..32 {
result ^= hash((n & (!1 << i)) ^ seed) & (1 << i);
}
result
}
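Worth noting (my sketch, not part of the commit): because bit i is flipped using a hash of only the bits above it, inputs that share their high bits keep those high bits in common after scrambling, which is the nesting property Owen scrambling relies on. For example:
#[test]
fn owen2_keeps_shared_high_bits() {
    // Same top 4 bits in...
    let (a, b) = (0xb000_0000u32, 0xb7c3_1a52u32);
    // ...same top 4 bits out, for any seed.
    assert_eq!(owen2(a, 42) >> 28, owen2(b, 42) >> 28);
}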
#[inline(always)]
pub fn owen4_fast(mut n: u32, seed: u32) -> u32 {
let scramble = hash(seed);
n = n.reverse_bits();
n ^= n.wrapping_mul(0x3d20adea);
n ^= (n >> 1) & (n << 1) & 0x55555555;
n = n.wrapping_add(scramble);
n = n.wrapping_mul((scramble >> 16) | 1);
n ^= (n >> 1) & (n << 1) & 0x55555555;
n ^= n.wrapping_mul(0x05526c56);
n ^= n.wrapping_mul(0x53a22864);
n.reverse_bits()
}
pub fn owen4(n: u32, seed: u32) -> u32 {
// Bit-packed permutation table.
const PERMUTATION_TABLE: [u8; 24] = [
0 | (1 << 2) | (2 << 4) | (3 << 6), // [0, 1, 2, 3],
0 | (1 << 2) | (3 << 4) | (2 << 6), // [0, 1, 3, 2],
0 | (2 << 2) | (1 << 4) | (3 << 6), // [0, 2, 1, 3],
0 | (2 << 2) | (3 << 4) | (1 << 6), // [0, 2, 3, 1],
0 | (3 << 2) | (1 << 4) | (2 << 6), // [0, 3, 1, 2],
0 | (3 << 2) | (2 << 4) | (1 << 6), // [0, 3, 2, 1],
1 | (0 << 2) | (2 << 4) | (3 << 6), // [1, 0, 2, 3],
1 | (0 << 2) | (3 << 4) | (2 << 6), // [1, 0, 3, 2],
1 | (2 << 2) | (0 << 4) | (3 << 6), // [1, 2, 0, 3],
1 | (2 << 2) | (3 << 4) | (0 << 6), // [1, 2, 3, 0],
1 | (3 << 2) | (0 << 4) | (2 << 6), // [1, 3, 0, 2],
1 | (3 << 2) | (2 << 4) | (0 << 6), // [1, 3, 2, 0],
2 | (0 << 2) | (1 << 4) | (3 << 6), // [2, 0, 1, 3],
2 | (0 << 2) | (3 << 4) | (1 << 6), // [2, 0, 3, 1],
2 | (1 << 2) | (0 << 4) | (3 << 6), // [2, 1, 0, 3],
2 | (1 << 2) | (3 << 4) | (0 << 6), // [2, 1, 3, 0],
2 | (3 << 2) | (0 << 4) | (1 << 6), // [2, 3, 0, 1],
2 | (3 << 2) | (1 << 4) | (0 << 6), // [2, 3, 1, 0],
3 | (0 << 2) | (1 << 4) | (2 << 6), // [3, 0, 1, 2],
3 | (0 << 2) | (2 << 4) | (1 << 6), // [3, 0, 2, 1],
3 | (1 << 2) | (0 << 4) | (2 << 6), // [3, 1, 0, 2],
3 | (1 << 2) | (2 << 4) | (0 << 6), // [3, 1, 2, 0],
3 | (2 << 2) | (0 << 4) | (1 << 6), // [3, 2, 0, 1],
3 | (2 << 2) | (1 << 4) | (0 << 6), // [3, 2, 1, 0],
];
// Multiply by a large random prime and xor by a random number.
// This is to ensure that the seed doesn't behave poorly with
// e.g. incrementing parameters, and also that zero doesn't
// map to zero in the hash function.
let seed = seed.wrapping_mul(0xe8559dcb) ^ 0x372fcdb9;
let mut result = 0;
for i in 0..16 {
let mask = !0b11 << (i * 2);
let perm_entry = PERMUTATION_TABLE[
// The xor with `i` is to ensure runs of zeros in `n` still
// result in different shuffles on each iteration. `i` is
// shifted to avoid interacting poorly with an incrementing
// `n`.
(hash((n & mask) ^ seed ^ (i << 16)) % 24) as usize
];
let perm_cell_idx = ((n >> (i * 2)) & 0b11) as usize;
result |= (((perm_entry >> (perm_cell_idx * 2)) & 0b11) as u32) << (i * 2);
}
result
}
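The base-4 variant has the analogous property on base-4 digits (bit pairs), since each pair's permutation is selected from a hash of the pairs above it. An illustrative check (not in the diff):
#[test]
fn owen4_keeps_shared_high_digits() {
    // Same top two base-4 digits (top 4 bits) in, same out.
    let (a, b) = (0xb000_0000u32, 0xb7c3_1a52u32);
    assert_eq!(owen4(a, 42) >> 28, owen4(b, 42) >> 28);
}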
//-------------------------------------------------------------
/// Fast bit-mixing hash for use in the functions above.
#[inline(always)]
pub fn hash(mut n: u32) -> u32 {
// From https://github.com/skeeto/hash-prospector
n ^= n >> 16;
n = n.wrapping_mul(0x21f0aaad);
n ^= n >> 15;
n = n.wrapping_mul(0xd35a2d97);
n ^= n >> 15;
n
}


@ -2,12 +2,10 @@
use std::f32::consts::PI as PI_32;
use glam::Vec4;
use crate::{
color::{Color, SpectralSample},
lerp::{lerp, Lerp},
math::{dot, zup_to_vec, Normal, Vector},
math::{clamp, dot_fast, zup_to_vec, Float4, Normal, Vector},
sampling::cosine_sample_hemisphere,
};
@ -289,7 +287,7 @@ mod lambert_closure {
uv: (f32, f32),
wavelength: f32,
) -> (Vector, SpectralSample, f32) {
let (nn, flipped_nor_g) = if dot(nor_g.into_vector(), inc) <= 0.0 {
let (nn, flipped_nor_g) = if dot_fast(nor_g.into_vector(), inc) <= 0.0 {
(nor.normalized().into_vector(), nor_g.into_vector())
} else {
(-nor.normalized().into_vector(), -nor_g.into_vector())
@ -302,7 +300,7 @@ mod lambert_closure {
let out = zup_to_vec(dir, nn);
// Make sure it's not on the wrong side of the geometric normal.
if dot(flipped_nor_g, out) >= 0.0 {
if dot_fast(flipped_nor_g, out) >= 0.0 {
(out, color.to_spectral_sample(wavelength) * pdf, pdf)
} else {
(out, SpectralSample::new(0.0), 0.0)
@ -317,14 +315,14 @@ mod lambert_closure {
nor_g: Normal,
wavelength: f32,
) -> (SpectralSample, f32) {
let (nn, flipped_nor_g) = if dot(nor_g.into_vector(), inc) <= 0.0 {
let (nn, flipped_nor_g) = if dot_fast(nor_g.into_vector(), inc) <= 0.0 {
(nor.normalized().into_vector(), nor_g.into_vector())
} else {
(-nor.normalized().into_vector(), -nor_g.into_vector())
};
if dot(flipped_nor_g, out) >= 0.0 {
let fac = dot(nn, out.normalized()).max(0.0) * INV_PI;
if dot_fast(flipped_nor_g, out) >= 0.0 {
let fac = dot_fast(nn, out.normalized()).max(0.0) * INV_PI;
(color.to_spectral_sample(wavelength) * fac, fac)
} else {
(SpectralSample::new(0.0), 0.0)
@ -383,14 +381,14 @@ mod lambert_closure {
let cos_theta_max = (1.0 - sin_theta_max2).sqrt();
let v = to_light_center.normalized();
let nn = if dot(nor_g.into_vector(), inc) <= 0.0 {
let nn = if dot_fast(nor_g.into_vector(), inc) <= 0.0 {
nor.normalized()
} else {
-nor.normalized()
}
.into_vector();
let cos_nv = dot(nn, v).max(-1.0).min(1.0);
let cos_nv = dot_fast(nn, v).max(-1.0).min(1.0);
// Alt implementation from the SPI paper.
// Worse sampling, but here for reference.
@ -428,7 +426,7 @@ mod ggx_closure {
wavelength: f32,
) -> (Vector, SpectralSample, f32) {
// Get normalized surface normal
let (nn, flipped_nor_g) = if dot(nor_g.into_vector(), inc) <= 0.0 {
let (nn, flipped_nor_g) = if dot_fast(nor_g.into_vector(), inc) <= 0.0 {
(nor.normalized().into_vector(), nor_g.into_vector())
} else {
(-nor.normalized().into_vector(), -nor_g.into_vector())
@ -442,10 +440,10 @@ mod ggx_closure {
let mut half_dir = Vector::new(angle.cos() * theta_sin, angle.sin() * theta_sin, theta_cos);
half_dir = zup_to_vec(half_dir, nn).normalized();
let out = inc - (half_dir * 2.0 * dot(inc, half_dir));
let out = inc - (half_dir * 2.0 * dot_fast(inc, half_dir));
// Make sure it's not on the wrong side of the geometric normal.
if dot(flipped_nor_g, out) >= 0.0 {
if dot_fast(flipped_nor_g, out) >= 0.0 {
let (filter, pdf) = evaluate(col, roughness, fresnel, inc, out, nor, nor_g, wavelength);
(out, filter, pdf)
} else {
@ -469,23 +467,23 @@ mod ggx_closure {
let hh = (aa + bb).normalized(); // Half-way between aa and bb
// Surface normal
let (nn, flipped_nor_g) = if dot(nor_g.into_vector(), inc) <= 0.0 {
let (nn, flipped_nor_g) = if dot_fast(nor_g.into_vector(), inc) <= 0.0 {
(nor.normalized().into_vector(), nor_g.into_vector())
} else {
(-nor.normalized().into_vector(), -nor_g.into_vector())
};
// Make sure everything's on the correct side of the surface
if dot(nn, aa) < 0.0 || dot(nn, bb) < 0.0 || dot(flipped_nor_g, bb) < 0.0 {
if dot_fast(nn, aa) < 0.0 || dot_fast(nn, bb) < 0.0 || dot_fast(flipped_nor_g, bb) < 0.0 {
return (SpectralSample::new(0.0), 0.0);
}
// Calculate needed dot products
let na = dot(nn, aa).clamp(-1.0, 1.0);
let nb = dot(nn, bb).clamp(-1.0, 1.0);
let ha = dot(hh, aa).clamp(-1.0, 1.0);
let hb = dot(hh, bb).clamp(-1.0, 1.0);
let nh = dot(nn, hh).clamp(-1.0, 1.0);
let na = clamp(dot_fast(nn, aa), -1.0, 1.0);
let nb = clamp(dot_fast(nn, bb), -1.0, 1.0);
let ha = clamp(dot_fast(hh, aa), -1.0, 1.0);
let hb = clamp(dot_fast(hh, bb), -1.0, 1.0);
let nh = clamp(dot_fast(nn, hh), -1.0, 1.0);
// Calculate F - Fresnel
let col_f = {
@ -512,7 +510,7 @@ mod ggx_closure {
rev_fresnel,
);
SpectralSample::from_parts(Vec4::new(c0, c1, c2, c3), wavelength)
SpectralSample::from_parts(Float4::new(c0, c1, c2, c3), wavelength)
};
// Calculate everything else
@ -556,7 +554,7 @@ mod ggx_closure {
assert!(cos_theta_max <= 1.0);
// Surface normal
let nn = if dot(nor.into_vector(), inc) < 0.0 {
let nn = if dot_fast(nor.into_vector(), inc) < 0.0 {
nor.normalized()
} else {
-nor.normalized() // If back-facing, flip normal
@ -574,9 +572,9 @@ mod ggx_closure {
// let vv = Halton::sample(1, i);
// let mut samp = uniform_sample_cone(uu, vv, cos_theta_max);
// samp = zup_to_vec(samp, bb).normalized();
// if dot(nn, samp) > 0.0 {
// if dot_fast(nn, samp) > 0.0 {
// let hh = (aa+samp).normalized();
// fac += ggx_d(dot(nn, hh), roughness);
// fac += ggx_d(dot_fast(nn, hh), roughness);
// }
//}
//fac /= N * N;
@ -584,7 +582,7 @@ mod ggx_closure {
// Approximate method
let theta = cos_theta_max.acos();
let hh = (aa + bb).normalized();
let nh = dot(nn, hh).clamp(-1.0, 1.0);
let nh = clamp(dot_fast(nn, hh), -1.0, 1.0);
let fac = ggx_d(nh, (1.0f32).min(roughness.sqrt() + (2.0 * theta / PI_32)));
fac * (1.0f32).min(1.0 - cos_theta_max) * INV_PI

src/space_fill.rs (new file, 216 lines)

@ -0,0 +1,216 @@
//! Space-filling curves and other related functionality.
#![allow(dead_code)]
pub mod hilbert {
const N: u32 = 1 << 16;
/// Convert (x,y) to hilbert curve index.
///
/// x: The x coordinate. Must be no greater than 2^16-1.
/// y: The y coordinate. Must be no greater than 2^16-1.
/// n: Basically the "resolution" of the curve, on one side.
///
/// Returns the hilbert curve index corresponding to the (x,y) coordinates given.
pub fn encode(x: u32, y: u32, n: u32) -> u32 {
assert!(x < N);
assert!(y < N);
let (mut x, mut y) = (x, y);
let mut d = 0;
let mut s = n >> 1;
while s > 0 {
let rx = if (x & s) > 0 { 1 } else { 0 };
let ry = if (y & s) > 0 { 1 } else { 0 };
d += s * s * ((3 * rx) ^ ry);
(x, y) = hilbert_rotate(s, rx, ry, x, y);
s >>= 1
}
d
}
/// Convert hilbert curve index to (x,y).
///
/// d: The hilbert curve index.
/// n: Basically the "resolution" of the curve, on one side.
///
/// Returns the (x, y) coords at the given index.
pub fn decode(d: u32, n: u32) -> (u32, u32) {
let (mut x, mut y) = (0, 0);
let mut s = 1;
let mut t = d;
while s < n {
let rx = 1 & (t >> 1);
let ry = 1 & (t ^ rx);
(x, y) = hilbert_rotate(s, rx, ry, x, y);
x += s * rx;
y += s * ry;
t >>= 2;
s <<= 1;
}
(x, y)
}
//------------
// Utilities.
fn hilbert_rotate(n: u32, rx: u32, ry: u32, x: u32, y: u32) -> (u32, u32) {
if ry == 0 {
if rx == 1 {
((n - 1).wrapping_sub(y), (n - 1).wrapping_sub(x))
} else {
(y, x)
}
} else {
(x, y)
}
}
}
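As a concrete illustration (worked out from the code above, not part of this file): the smallest curve, n = 2, visits its four cells in a "U", which is what keeps consecutive indices spatially adjacent. A check that could be dropped into the tests module at the bottom of this file:
#[test]
fn hilbert_2x2_order() {
    assert_eq!(hilbert::decode(0, 2), (0, 0));
    assert_eq!(hilbert::decode(1, 2), (0, 1));
    assert_eq!(hilbert::decode(2, 2), (1, 1));
    assert_eq!(hilbert::decode(3, 2), (1, 0));
    assert_eq!(hilbert::encode(1, 0, 2), 3); // and encode() inverts it
}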
pub mod morton {
const N: u32 = 1 << 16;
/// Convert (x,y) to morton curve index.
///
/// x: The x coordinate. Should be no greater than 2^16-1.
/// y: The y coordinate. Should be no greater than 2^16-1.
///
/// Returns the morton curve index corresponding to the (x,y) coordinates given.
pub fn encode(x: u32, y: u32) -> u32 {
debug_assert!(x < N);
debug_assert!(y < N);
part_1_by_1(x) | (part_1_by_1(y) << 1)
}
/// Convert morton curve index to (x,y).
///
/// i: The morton curve index.
///
/// Returns the (x, y) coords at the given index.
pub fn decode(i: u32) -> (u32, u32) {
(compact_1_by_1(i), compact_1_by_1(i >> 1))
}
//------------
// Utilities.
#[inline(always)]
fn part_1_by_1(mut x: u32) -> u32 {
x &= 0x0000ffff;
x = (x ^ (x << 8)) & 0x00ff00ff;
x = (x ^ (x << 4)) & 0x0f0f0f0f;
x = (x ^ (x << 2)) & 0x33333333;
x = (x ^ (x << 1)) & 0x55555555;
x
}
#[inline(always)]
fn compact_1_by_1(mut x: u32) -> u32 {
x &= 0x55555555;
x = (x ^ (x >> 1)) & 0x33333333;
x = (x ^ (x >> 2)) & 0x0f0f0f0f;
x = (x ^ (x >> 4)) & 0x00ff00ff;
x = (x ^ (x >> 8)) & 0x0000ffff;
x
}
}
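For comparison, Morton order simply interleaves the bits of x and y, with x in the even bit positions and y in the odd ones. An illustrative check (mine, not in the diff):
#[test]
fn morton_interleaves_bits() {
    // x = 3 = 0b011, y = 5 = 0b101 -> interleaved 0b10_01_11 = 39
    assert_eq!(morton::encode(3, 5), 0b10_01_11);
    assert_eq!(morton::decode(0b10_01_11), (3, 5));
}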
/// Yields coordinates in outward spiral, but incorporating a Hilbert
/// curve at the smaller scales.
pub mod hilbert_spiral {
/// Convert from hilbert-spiral index to (x,y).
///
/// Note: this returns both negative and positive coordinates.
/// It starts at 0,0 and spirals outwards.
///
/// i: The hilbert-spiral index.
/// hilbert_size: the size of the hilbert blocks on a side. Will be
/// rounded down to the nearest power of two.
///
/// Returns the (x, y) coords at the given index.
pub fn decode(i: u32, hilbert_size: u32) -> (i32, i32) {
assert!(hilbert_size > 0);
let hilbert_size = 1 << (31 - u32::leading_zeros(hilbert_size));
let hilbert_cells = hilbert_size * hilbert_size;
let hilbert_i = i % hilbert_cells;
let spiral_i = i / hilbert_cells;
let (mut sx, mut sy, section) = decode_spiral(spiral_i);
sx = (sx * hilbert_size as i32) - (hilbert_size / 2) as i32;
sy = (sy * hilbert_size as i32) - (hilbert_size / 2) as i32;
let (hx, hy) = {
let (hx, hy) = super::hilbert::decode(hilbert_i, hilbert_size);
let a = hilbert_size - 1;
match section {
0 => (hy, hx),
1 => (a - hx, a - hy),
2 => (a - hy, a - hx),
3 => (hx, hy),
_ => unreachable!(),
}
};
(sx + hx as i32, sy + hy as i32)
}
pub fn decode_spiral(i: u32) -> (i32, i32, u32) {
if i == 0 {
return (0, 0, 3);
}
// 0 = first ring outside of center, 1 = second, and so on.
let ring = (((i as f64).sqrt() - 1.0) / 2.0) as u32;
// The size of the ring along one side.
let size = 1 + ((ring + 1) * 2);
let n = i - ((size - 2) * (size - 2)); // The zero-indexed cell of the ring.
let arm = n / (size - 1); // The arm of the ring.
let arm_n = n % (size - 1); // The index within the arm of the ring.
// The two coordinates. They just need to be flipped around depending on the arm.
let radius = ring as i32 + 1;
let d = -(size as i32 / 2) + 1 + arm_n as i32;
match arm {
0 => (radius, d, 0),
1 => (-d, radius, if arm_n == (size - 2) { 2 } else { 1 }),
2 => (-radius, -d, 2),
3 => (d, -radius, 3),
_ => unreachable!(),
}
}
}
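For orientation (hand-derived from the code above, so treat it as illustrative): decode_spiral walks each ring counter-clockwise starting on the +x side, so indices 1..=8 cover the first ring around the origin as (1,0), (1,1), (0,1), (-1,1), (-1,0), (-1,-1), (0,-1), (1,-1); decode() then expands every spiral cell into a hilbert_size x hilbert_size Hilbert block, mirroring or transposing it according to `section` so neighboring blocks chain end-to-end. E.g.:
#[test]
fn decode_spiral_first_ring() {
    assert_eq!(hilbert_spiral::decode_spiral(0), (0, 0, 3));
    assert_eq!(hilbert_spiral::decode_spiral(1), (1, 0, 0));
    assert_eq!(hilbert_spiral::decode_spiral(5), (-1, 0, 2));
}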
//-------------------------------------------------------------
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn hilbert_reversible() {
let i = 0x4c8587a2;
let (x, y) = hilbert::decode(i, 1 << 16);
let i2 = hilbert::encode(x, y, 1 << 16);
assert_eq!(i, i2);
}
#[test]
fn morton_reversible() {
let i = 0x4c8587a2;
let (x, y) = morton::decode(i);
let i2 = morton::encode(x, y);
assert_eq!(i, i2);
}
}


@ -9,7 +9,7 @@ use crate::{
bbox::BBox,
boundable::Boundable,
lerp::lerp_slice,
math::{cross, dot, Normal, Point, Transform},
math::{cross, dot, Normal, Point, Xform},
ray::{RayBatch, RayStack},
shading::SurfaceClosure,
};
@@ -150,13 +150,13 @@ impl<'a> MicropolyBatch<'a> {
rays: &mut RayBatch,
ray_stack: &mut RayStack,
isects: &mut [SurfaceIntersection],
space: &[Transform],
space: &[Xform],
) {
// Precalculate transform for non-motion blur cases
let static_mat_space = if space.len() == 1 {
lerp_slice(space, 0.0).inverse()
space[0]
} else {
Transform::new()
Xform::identity()
};
self.accel
@@ -182,11 +182,11 @@ impl<'a> MicropolyBatch<'a> {
);
if !space.is_empty() {
(*tri_cache[i].as_mut_ptr()).0 =
(*tri_cache[i].as_mut_ptr()).0 * static_mat_space;
(*tri_cache[i].as_mut_ptr()).0.xform(&static_mat_space);
(*tri_cache[i].as_mut_ptr()).1 =
(*tri_cache[i].as_mut_ptr()).1 * static_mat_space;
(*tri_cache[i].as_mut_ptr()).1.xform(&static_mat_space);
(*tri_cache[i].as_mut_ptr()).2 =
(*tri_cache[i].as_mut_ptr()).2 * static_mat_space;
(*tri_cache[i].as_mut_ptr()).2.xform(&static_mat_space);
}
}
}
@@ -205,7 +205,7 @@ impl<'a> MicropolyBatch<'a> {
// Calculate the ray space, if necessary.
let mat_space = if space.len() > 1 {
// Per-ray transform, for motion blur
lerp_slice(space, ray_time).inverse()
lerp_slice(space, ray_time)
} else {
static_mat_space
};
@@ -251,9 +251,9 @@ impl<'a> MicropolyBatch<'a> {
};
if !space.is_empty() {
tri.0 = tri.0 * mat_space;
tri.1 = tri.1 * mat_space;
tri.2 = tri.2 * mat_space;
tri.0 = tri.0.xform(&mat_space);
tri.1 = tri.1.xform(&mat_space);
tri.2 = tri.2.xform(&mat_space);
}
tri
@@ -284,6 +284,13 @@ impl<'a> MicropolyBatch<'a> {
// Calculate intersection data if necessary.
if non_shadow_hit {
// Get the full space data.
let mat_space = if let Some(space) = mat_space.to_full() {
space
} else {
return;
};
let hit_tri = unsafe { hit_tri.assume_init() };
let hit_tri_indices = unsafe { hit_tri_indices.assume_init() };
let (t, b0, b1, b2) = unsafe { hit_tri_data.assume_init() };
@@ -311,7 +318,7 @@ impl<'a> MicropolyBatch<'a> {
let n1 = lerp_slice(n1_slice, ray_time).normalized();
let n2 = lerp_slice(n2_slice, ray_time).normalized();
let s_nor = ((n0 * b0) + (n1 * b1) + (n2 * b2)) * mat_space;
let s_nor = ((n0 * b0) + (n1 * b1) + (n2 * b2)).xform_fast(&mat_space);
if dot(s_nor, geo_normal) >= 0.0 {
s_nor
} else {

View File

@@ -2,7 +2,6 @@
// pub mod micropoly_batch;
pub mod bilinear_patch;
pub mod micropoly_batch;
pub mod triangle;
pub mod triangle_mesh;
@@ -10,8 +9,8 @@ use std::fmt::Debug;
use crate::{
boundable::Boundable,
math::{Normal, Point, Transform, Vector},
ray::{RayBatch, RayStack},
math::{Normal, Point, Vector, XformFull},
ray::{LocalRay, Ray},
shading::surface_closure::SurfaceClosure,
shading::SurfaceShader,
};
@@ -19,13 +18,13 @@ use crate::{
const MAX_EDGE_DICE: u32 = 128;
pub trait Surface: Boundable + Debug + Sync {
fn intersect_rays(
fn intersect_ray(
&self,
rays: &mut RayBatch,
ray_stack: &mut RayStack,
isects: &mut [SurfaceIntersection],
shader: &dyn SurfaceShader,
space: &[Transform],
ray: &mut Ray,
local_ray: &LocalRay,
space: &XformFull,
isect: &mut SurfaceIntersection,
shaders: &[&dyn SurfaceShader],
);
}
@@ -80,13 +79,13 @@ pub enum SurfaceIntersection {
#[derive(Debug, Copy, Clone)]
pub struct SurfaceIntersectionData {
pub incoming: Vector, // Direction of the incoming ray
pub pos: Point, // Position of the intersection
pub incoming: Vector, // Direction of the incoming ray.
pub pos: Point, // Position of the intersection.
pub pos_err: f32, // Error magnitude of the intersection position. Imagine
// a cube centered around `pos` with dimensions of `2 * pos_err`.
pub nor: Normal, // Shading normal
pub nor_g: Normal, // True geometric normal
pub local_space: Transform, // Matrix from global space to local space
pub t: f32, // Ray t-value at the intersection point
pub sample_pdf: f32, // The PDF of getting this point by explicitly sampling the surface
pub nor: Normal, // Shading normal.
pub nor_g: Normal, // True geometric normal.
pub local_space: XformFull, // Matrix from local to world space.
pub t: f32, // Ray t-value at the intersection point.
pub sample_pdf: f32, // The PDF of getting this point by explicitly sampling the surface.
}

View File

@@ -162,7 +162,7 @@ pub fn surface_point(tri: (Point, Point, Point), bary: (f32, f32, f32)) -> (Poin
+ (tri.1.into_vector().abs() * bary.1)
+ (tri.2.into_vector().abs() * bary.2))
* fp_gamma(7))
.co
.0
.max_element();
(pos, pos_err)

View File

@@ -6,10 +6,11 @@ use crate::{
accel::BVH4,
bbox::BBox,
boundable::Boundable,
color::Color,
lerp::lerp_slice,
math::{cross, dot, Normal, Point, Transform},
ray::{RayBatch, RayStack},
shading::SurfaceShader,
math::{cross, dot, Normal, Point, XformFull},
ray::{LocalRay, Ray},
shading::{SimpleSurfaceShader, SurfaceShader},
};
use super::{triangle, Surface, SurfaceIntersection, SurfaceIntersectionData};
@@ -18,6 +19,7 @@ const MAX_LEAF_TRIANGLE_COUNT: usize = 3;
#[derive(Copy, Clone, Debug)]
pub struct TriangleMesh<'a> {
pub shader_idx: Option<usize>,
time_sample_count: usize,
vertices: &'a [Point], // Vertices, with the time samples for each vertex stored contiguously
normals: Option<&'a [Normal]>, // Vertex normals, organized the same as `vertices`
@@ -28,6 +30,7 @@ pub struct TriangleMesh<'a> {
impl<'a> TriangleMesh<'a> {
pub fn from_verts_and_indices<'b>(
arena: &'b Arena,
shader_idx: Option<usize>,
verts: &[Vec<Point>],
vert_normals: &Option<Vec<Vec<Normal>>>,
tri_indices: &[(usize, usize, usize)],
@@ -106,6 +109,7 @@ impl<'a> TriangleMesh<'a> {
});
TriangleMesh {
shader_idx: shader_idx,
time_sample_count: time_sample_count,
vertices: vertices,
normals: normals,
@@ -122,202 +126,135 @@ impl<'a> Boundable for TriangleMesh<'a> {
}
impl<'a> Surface for TriangleMesh<'a> {
fn intersect_rays(
fn intersect_ray(
&self,
rays: &mut RayBatch,
ray_stack: &mut RayStack,
isects: &mut [SurfaceIntersection],
shader: &dyn SurfaceShader,
space: &[Transform],
ray: &mut Ray,
local_ray: &LocalRay,
space: &XformFull,
isect: &mut SurfaceIntersection,
shaders: &[&dyn SurfaceShader],
) {
// Precalculate transform for non-motion blur cases
let static_mat_space = if space.len() == 1 {
lerp_slice(space, 0.0).inverse()
} else {
Transform::new()
let unassigned_shader = SimpleSurfaceShader::Emit {
color: Color::new_xyz(color::rec709_to_xyz((1.0, 0.0, 1.0))),
};
self.accel
.traverse(rays, ray_stack, |idx_range, rays, ray_stack| {
let tri_count = idx_range.end - idx_range.start;
let shader = if let Some(idx) = self.shader_idx {
shaders[idx]
} else {
&unassigned_shader
};
// Build the triangle cache if we can!
let is_cached = ray_stack.ray_count_in_next_task() >= tri_count
&& self.time_sample_count == 1
&& space.len() <= 1;
let mut tri_cache = [std::mem::MaybeUninit::uninit(); MAX_LEAF_TRIANGLE_COUNT];
if is_cached {
for tri_idx in idx_range.clone() {
let i = tri_idx - idx_range.start;
let tri_indices = self.indices[tri_idx];
self.accel.traverse(ray, local_ray, |idx_range, ray| {
// Iterate through the triangles and test the ray against them.
let mut non_shadow_hit = false;
let mut hit_tri = std::mem::MaybeUninit::uninit();
let mut hit_tri_indices = std::mem::MaybeUninit::uninit();
let mut hit_tri_data = std::mem::MaybeUninit::uninit();
let ray_pre = triangle::RayTriPrecompute::new(ray.dir);
for tri_idx in idx_range.clone() {
let tri_indices = self.indices[tri_idx];
// For static triangles with static transforms, cache them.
// Get triangle.
let mut tri = if self.time_sample_count == 1 {
// No deformation motion blur, so fast-path it.
(
self.vertices[tri_indices.0 as usize],
self.vertices[tri_indices.1 as usize],
self.vertices[tri_indices.2 as usize],
)
} else {
// Deformation motion blur, need to interpolate.
let p0_slice = &self.vertices[(tri_indices.0 as usize * self.time_sample_count)
..((tri_indices.0 as usize + 1) * self.time_sample_count)];
let p1_slice = &self.vertices[(tri_indices.1 as usize * self.time_sample_count)
..((tri_indices.1 as usize + 1) * self.time_sample_count)];
let p2_slice = &self.vertices[(tri_indices.2 as usize * self.time_sample_count)
..((tri_indices.2 as usize + 1) * self.time_sample_count)];
let p0 = lerp_slice(p0_slice, ray.time);
let p1 = lerp_slice(p1_slice, ray.time);
let p2 = lerp_slice(p2_slice, ray.time);
(p0, p1, p2)
};
// Transform triangle into world space.
tri.0 = tri.0.xform(space);
tri.1 = tri.1.xform(space);
tri.2 = tri.2.xform(space);
// Test ray against triangle
if let Some((t, b0, b1, b2)) =
triangle::intersect_ray(ray.orig, ray_pre, ray.max_t, tri)
{
if ray.is_occlusion() {
*isect = SurfaceIntersection::Occlude;
ray.mark_done();
break;
} else {
non_shadow_hit = true;
ray.max_t = t;
unsafe {
*tri_cache[i].as_mut_ptr() = (
self.vertices[tri_indices.0 as usize],
self.vertices[tri_indices.1 as usize],
self.vertices[tri_indices.2 as usize],
);
if !space.is_empty() {
(*tri_cache[i].as_mut_ptr()).0 =
(*tri_cache[i].as_mut_ptr()).0 * static_mat_space;
(*tri_cache[i].as_mut_ptr()).1 =
(*tri_cache[i].as_mut_ptr()).1 * static_mat_space;
(*tri_cache[i].as_mut_ptr()).2 =
(*tri_cache[i].as_mut_ptr()).2 * static_mat_space;
}
*hit_tri.as_mut_ptr() = tri;
*hit_tri_indices.as_mut_ptr() = tri_indices;
*hit_tri_data.as_mut_ptr() = (t, b0, b1, b2);
}
}
}
}
// Test each ray against the triangles.
ray_stack.do_next_task(|ray_idx| {
let ray_idx = ray_idx as usize;
// Calculate intersection data if necessary.
if non_shadow_hit {
let hit_tri = unsafe { hit_tri.assume_init() };
let (t, b0, b1, b2) = unsafe { hit_tri_data.assume_init() };
if rays.is_done(ray_idx) {
return;
}
// Calculate intersection point and error magnitudes
let (pos, pos_err) = triangle::surface_point(hit_tri, (b0, b1, b2));
let ray_time = rays.time(ray_idx);
// Calculate geometric surface normal
let geo_normal = cross(hit_tri.0 - hit_tri.1, hit_tri.0 - hit_tri.2).into_normal();
// Calculate the ray space, if necessary.
let mat_space = if space.len() > 1 {
// Per-ray transform, for motion blur
lerp_slice(space, ray_time).inverse()
// Calculate interpolated surface normal, if any
let shading_normal = if let Some(normals) = self.normals {
let hit_tri_indices = unsafe { hit_tri_indices.assume_init() };
let n0_slice = &normals[(hit_tri_indices.0 as usize * self.time_sample_count)
..((hit_tri_indices.0 as usize + 1) * self.time_sample_count)];
let n1_slice = &normals[(hit_tri_indices.1 as usize * self.time_sample_count)
..((hit_tri_indices.1 as usize + 1) * self.time_sample_count)];
let n2_slice = &normals[(hit_tri_indices.2 as usize * self.time_sample_count)
..((hit_tri_indices.2 as usize + 1) * self.time_sample_count)];
let n0 = lerp_slice(n0_slice, ray.time).normalized();
let n1 = lerp_slice(n1_slice, ray.time).normalized();
let n2 = lerp_slice(n2_slice, ray.time).normalized();
let s_nor = ((n0 * b0) + (n1 * b1) + (n2 * b2)).xform_fast(&space);
if dot(s_nor, geo_normal) >= 0.0 {
s_nor
} else {
static_mat_space
};
// Iterate through the triangles and test the ray against them.
let mut non_shadow_hit = false;
let mut hit_tri = std::mem::MaybeUninit::uninit();
let mut hit_tri_indices = std::mem::MaybeUninit::uninit();
let mut hit_tri_data = std::mem::MaybeUninit::uninit();
let ray_pre = triangle::RayTriPrecompute::new(rays.dir(ray_idx));
for tri_idx in idx_range.clone() {
let tri_indices = self.indices[tri_idx];
// Get triangle if necessary
let tri = if is_cached {
let i = tri_idx - idx_range.start;
unsafe { tri_cache[i].assume_init() }
} else {
let mut tri = if self.time_sample_count == 1 {
// No deformation motion blur, so fast-path it.
(
self.vertices[tri_indices.0 as usize],
self.vertices[tri_indices.1 as usize],
self.vertices[tri_indices.2 as usize],
)
} else {
// Deformation motion blur, need to interpolate.
let p0_slice = &self.vertices[(tri_indices.0 as usize
* self.time_sample_count)
..((tri_indices.0 as usize + 1) * self.time_sample_count)];
let p1_slice = &self.vertices[(tri_indices.1 as usize
* self.time_sample_count)
..((tri_indices.1 as usize + 1) * self.time_sample_count)];
let p2_slice = &self.vertices[(tri_indices.2 as usize
* self.time_sample_count)
..((tri_indices.2 as usize + 1) * self.time_sample_count)];
let p0 = lerp_slice(p0_slice, ray_time);
let p1 = lerp_slice(p1_slice, ray_time);
let p2 = lerp_slice(p2_slice, ray_time);
(p0, p1, p2)
};
if !space.is_empty() {
tri.0 = tri.0 * mat_space;
tri.1 = tri.1 * mat_space;
tri.2 = tri.2 * mat_space;
}
tri
};
// Test ray against triangle
if let Some((t, b0, b1, b2)) = triangle::intersect_ray(
rays.orig(ray_idx),
ray_pre,
rays.max_t(ray_idx),
tri,
) {
if rays.is_occlusion(ray_idx) {
isects[ray_idx] = SurfaceIntersection::Occlude;
rays.mark_done(ray_idx);
break;
} else {
non_shadow_hit = true;
rays.set_max_t(ray_idx, t);
unsafe {
*hit_tri.as_mut_ptr() = tri;
*hit_tri_indices.as_mut_ptr() = tri_indices;
*hit_tri_data.as_mut_ptr() = (t, b0, b1, b2);
}
}
}
-s_nor
}
} else {
geo_normal
};
// Calculate intersection data if necessary.
if non_shadow_hit {
let hit_tri = unsafe { hit_tri.assume_init() };
let (t, b0, b1, b2) = unsafe { hit_tri_data.assume_init() };
let intersection_data = SurfaceIntersectionData {
incoming: ray.dir,
t: t,
pos: pos,
pos_err: pos_err,
nor: shading_normal,
nor_g: geo_normal,
local_space: *space,
sample_pdf: 0.0,
};
// Calculate intersection point and error magnitudes
let (pos, pos_err) = triangle::surface_point(hit_tri, (b0, b1, b2));
// Calculate geometric surface normal
let geo_normal =
cross(hit_tri.0 - hit_tri.1, hit_tri.0 - hit_tri.2).into_normal();
// Calculate interpolated surface normal, if any
let shading_normal = if let Some(normals) = self.normals {
let hit_tri_indices = unsafe { hit_tri_indices.assume_init() };
let n0_slice = &normals[(hit_tri_indices.0 as usize
* self.time_sample_count)
..((hit_tri_indices.0 as usize + 1) * self.time_sample_count)];
let n1_slice = &normals[(hit_tri_indices.1 as usize
* self.time_sample_count)
..((hit_tri_indices.1 as usize + 1) * self.time_sample_count)];
let n2_slice = &normals[(hit_tri_indices.2 as usize
* self.time_sample_count)
..((hit_tri_indices.2 as usize + 1) * self.time_sample_count)];
let n0 = lerp_slice(n0_slice, ray_time).normalized();
let n1 = lerp_slice(n1_slice, ray_time).normalized();
let n2 = lerp_slice(n2_slice, ray_time).normalized();
let s_nor = ((n0 * b0) + (n1 * b1) + (n2 * b2)) * mat_space;
if dot(s_nor, geo_normal) >= 0.0 {
s_nor
} else {
-s_nor
}
} else {
geo_normal
};
let intersection_data = SurfaceIntersectionData {
incoming: rays.dir(ray_idx),
t: t,
pos: pos,
pos_err: pos_err,
nor: shading_normal,
nor_g: geo_normal,
local_space: mat_space,
sample_pdf: 0.0,
};
// Fill in intersection data
isects[ray_idx] = SurfaceIntersection::Hit {
intersection_data: intersection_data,
closure: shader.shade(&intersection_data, ray_time),
};
}
});
ray_stack.pop_task();
});
// Fill in intersection data
*isect = SurfaceIntersection::Hit {
intersection_data: intersection_data,
closure: shader.shade(&intersection_data, ray.time),
};
}
});
}
}

View File

@@ -1,190 +1,119 @@
use std::iter;
use crate::{
accel::ray_code,
color::{rec709_to_xyz, Color},
lerp::lerp_slice,
math::Transform,
ray::{RayBatch, RayStack},
math::XformFull,
ray::{LocalRay, Ray},
scene::{Assembly, InstanceType, Object},
shading::{SimpleSurfaceShader, SurfaceShader},
shading::SurfaceShader,
surface::SurfaceIntersection,
transform_stack::TransformStack,
};
pub struct Tracer<'a> {
root: &'a Assembly<'a>,
ray_trace_count: u64,
ray_stack: RayStack,
inner: TracerInner<'a>,
}
impl<'a> Tracer<'a> {
pub fn from_assembly(assembly: &'a Assembly) -> Tracer<'a> {
Tracer {
root: assembly,
ray_trace_count: 0,
ray_stack: RayStack::new(),
inner: TracerInner {
root: assembly,
xform_stack: TransformStack::new(),
isects: Vec::new(),
},
}
}
pub fn trace<'b>(&'b mut self, rays: &mut RayBatch) -> &'b [SurfaceIntersection] {
self.ray_trace_count += rays.len() as u64;
self.inner.trace(rays, &mut self.ray_stack)
}
pub fn rays_traced(&self) -> u64 {
self.ray_trace_count
}
}
struct TracerInner<'a> {
root: &'a Assembly<'a>,
xform_stack: TransformStack,
isects: Vec<SurfaceIntersection>,
}
pub fn trace(&mut self, mut ray: Ray) -> SurfaceIntersection {
self.ray_trace_count += 1;
impl<'a> TracerInner<'a> {
fn trace<'b>(
&'b mut self,
rays: &mut RayBatch,
ray_stack: &mut RayStack,
) -> &'b [SurfaceIntersection] {
ray_stack.clear();
let local_ray = ray.to_local();
let space = XformFull::identity();
let mut isect = SurfaceIntersection::Miss;
// Ready the isects
self.isects.clear();
self.isects.reserve(rays.len());
self.isects
.extend(iter::repeat(SurfaceIntersection::Miss).take(rays.len()));
self.trace_assembly(self.root, &mut ray, &local_ray, &space, &mut isect);
// Prep the accel part of the rays.
{
let ident = Transform::new();
for i in 0..rays.len() {
rays.update_local(i, &ident);
}
}
// Divide the rays into 8 different lanes by direction.
ray_stack.ensure_lane_count(8);
for i in 0..rays.len() {
ray_stack.push_ray_index(i, ray_code(rays.dir(i)));
}
ray_stack.push_lanes_to_tasks(&[0, 1, 2, 3, 4, 5, 6, 7]);
// Trace each of the 8 lanes separately.
while !ray_stack.is_empty() {
self.trace_assembly(self.root, rays, ray_stack);
}
&self.isects
isect
}
fn trace_assembly<'b>(
&'b mut self,
fn trace_assembly(
&mut self,
assembly: &Assembly,
rays: &mut RayBatch,
ray_stack: &mut RayStack,
ray: &mut Ray,
local_ray: &LocalRay,
space: &XformFull,
isect: &mut SurfaceIntersection,
) {
assembly
.object_accel
.traverse(rays, ray_stack, |idx_range, rays, ray_stack| {
let inst = &assembly.instances[idx_range.start];
.traverse(ray, local_ray, |idx_range, ray| {
for inst_idx in idx_range {
let inst = &assembly.instances[inst_idx];
// Transform rays if needed
if let Some((xstart, xend)) = inst.transform_indices {
// Push transforms to stack
self.xform_stack.push(&assembly.xforms[xstart..xend]);
// Handle transforms if needed.
let (local_space, local_ray) = if let Some((xstart, xend)) =
inst.transform_indices
{
let instance_xform = lerp_slice(&assembly.xforms[xstart..xend], ray.time);
let combined_xform = instance_xform.compose(&space.fwd);
// Do transforms
// TODO: re-divide rays based on direction (maybe?).
let xforms = self.xform_stack.top();
ray_stack.do_next_task(|ray_idx| {
let t = rays.time(ray_idx);
rays.update_local(ray_idx, &lerp_slice(xforms, t));
});
ray_stack.duplicate_next_task();
}
// Trace rays
match inst.instance_type {
InstanceType::Object => {
self.trace_object(
&assembly.objects[inst.data_index],
inst.surface_shader_index
.map(|i| assembly.surface_shaders[i]),
rays,
ray_stack,
);
}
InstanceType::Assembly => {
self.trace_assembly(&assembly.assemblies[inst.data_index], rays, ray_stack);
}
}
// Un-transform rays if needed
if inst.transform_indices.is_some() {
// Pop transforms off stack
self.xform_stack.pop();
// Undo transforms
let xforms = self.xform_stack.top();
if !xforms.is_empty() {
ray_stack.pop_do_next_task(|ray_idx| {
let t = rays.time(ray_idx);
rays.update_local(ray_idx, &lerp_slice(xforms, t));
});
if let Some(xform) = combined_xform.to_full() {
(xform, ray.to_local_xform(&xform))
} else {
// Invalid transform, so skip traversing into this instance.
continue;
}
} else {
let ident = Transform::new();
ray_stack.pop_do_next_task(|ray_idx| {
rays.update_local(ray_idx, &ident);
});
(*space, *local_ray)
};
// Trace ray.
match inst.instance_type {
InstanceType::Object => {
self.trace_object(
&assembly.objects[inst.data_index],
ray,
&local_ray,
&local_space,
isect,
assembly.surface_shaders,
);
}
InstanceType::Assembly => {
self.trace_assembly(
&assembly.assemblies[inst.data_index],
ray,
&local_ray,
&local_space,
isect,
);
}
}
if ray.is_done() {
return;
}
}
});
}
fn trace_object<'b>(
&'b mut self,
&mut self,
obj: &Object,
surface_shader: Option<&dyn SurfaceShader>,
rays: &mut RayBatch,
ray_stack: &mut RayStack,
ray: &mut Ray,
local_ray: &LocalRay,
space: &XformFull,
isect: &mut SurfaceIntersection,
shaders: &[&dyn SurfaceShader],
) {
match *obj {
Object::Surface(surface) => {
let unassigned_shader = SimpleSurfaceShader::Emit {
color: Color::new_xyz(rec709_to_xyz((1.0, 0.0, 1.0))),
};
let shader = surface_shader.unwrap_or(&unassigned_shader);
surface.intersect_rays(
rays,
ray_stack,
&mut self.isects,
shader,
self.xform_stack.top(),
);
surface.intersect_ray(ray, local_ray, space, isect, shaders);
}
Object::SurfaceLight(surface) => {
// Lights don't use shaders
let bogus_shader = SimpleSurfaceShader::Emit {
color: Color::new_xyz(rec709_to_xyz((1.0, 0.0, 1.0))),
};
surface.intersect_rays(
rays,
ray_stack,
&mut self.isects,
&bogus_shader,
self.xform_stack.top(),
);
surface.intersect_ray(ray, local_ray, space, isect, shaders);
}
}
}

View File

@@ -1,83 +1,30 @@
use std::{
cmp,
mem::{transmute, MaybeUninit},
};
use crate::{algorithm::merge_slices_to, math::Transform};
use crate::math::Xform;
pub struct TransformStack {
stack: Vec<MaybeUninit<Transform>>,
stack_indices: Vec<usize>,
stack: Vec<Xform>,
}
impl TransformStack {
pub fn new() -> TransformStack {
let mut ts = TransformStack {
stack: Vec::new(),
stack_indices: Vec::new(),
};
ts.stack_indices.push(0);
ts.stack_indices.push(0);
ts
TransformStack { stack: Vec::new() }
}
pub fn clear(&mut self) {
self.stack.clear();
self.stack_indices.clear();
self.stack_indices.push(0);
self.stack_indices.push(0);
}
pub fn push(&mut self, xforms: &[Transform]) {
assert!(!xforms.is_empty());
if self.stack.is_empty() {
let xforms: &[MaybeUninit<Transform>] = unsafe { transmute(xforms) };
self.stack.extend(xforms);
} else {
let sil = self.stack_indices.len();
let i1 = self.stack_indices[sil - 2];
let i2 = self.stack_indices[sil - 1];
// Reserve stack space for the new transforms.
// Note this leaves exposed uninitialized memory. The subsequent call to
// merge_slices_to() fills that memory in.
{
let maxlen = cmp::max(xforms.len(), i2 - i1);
self.stack.reserve(maxlen);
let l = self.stack.len();
unsafe { self.stack.set_len(l + maxlen) };
}
let (xfs1, xfs2) = self.stack.split_at_mut(i2);
merge_slices_to(
unsafe { transmute(&xfs1[i1..i2]) },
xforms,
xfs2,
|xf1, xf2| *xf1 * *xf2,
);
pub fn push(&mut self, xform: Xform) {
match self.stack.last() {
None => self.stack.push(xform),
Some(prev_xform) => self.stack.push(xform.compose(prev_xform)),
}
self.stack_indices.push(self.stack.len());
}
pub fn pop(&mut self) {
assert!(self.stack_indices.len() > 2);
let sl = self.stack.len();
let sil = self.stack_indices.len();
let i1 = self.stack_indices[sil - 2];
let i2 = self.stack_indices[sil - 1];
self.stack.truncate(sl - (i2 - i1));
self.stack_indices.pop();
pub fn pop(&mut self) -> Option<Xform> {
self.stack.pop()
}
pub fn top(&self) -> &[Transform] {
let sil = self.stack_indices.len();
let i1 = self.stack_indices[sil - 2];
let i2 = self.stack_indices[sil - 1];
unsafe { transmute(&self.stack[i1..i2]) }
pub fn top(&self) -> Option<&Xform> {
self.stack.last()
}
}
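// A minimal usage sketch (illustrative, not part of the change above). It assumes
// `Xform::identity()` as used elsewhere in this changeset: each `push` stores the
// new transform pre-composed with the previous top, so `top()` always holds the
// fully accumulated transform and `pop` unwinds one level at a time.
#[cfg(test)]
mod usage_sketch {
    use super::TransformStack;
    use crate::math::Xform;

    #[test]
    fn push_top_pop() {
        let mut stack = TransformStack::new();
        assert!(stack.top().is_none());

        stack.push(Xform::identity());
        stack.push(Xform::identity());
        assert!(stack.top().is_some());

        assert!(stack.pop().is_some());
        assert!(stack.pop().is_some());
        assert!(stack.pop().is_none());
    }
}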

View File

@@ -6,6 +6,9 @@ edition = "2018"
license = "MIT, Apache 2.0"
build = "build.rs"
[build-dependencies]
colorbox = "0.3"
[lib]
name = "color"
path = "src/lib.rs"

View File

@@ -1,76 +1,46 @@
use std::{env, fs::File, io::Write, path::Path};
#[derive(Copy, Clone)]
struct Chromaticities {
r: (f64, f64),
g: (f64, f64),
b: (f64, f64),
w: (f64, f64),
}
use colorbox::{
chroma::{self, Chromaticities},
matrix::{invert, rgb_to_xyz_matrix, xyz_chromatic_adaptation_matrix, AdaptationMethod},
matrix_compose,
};
fn main() {
let out_dir = env::var("OUT_DIR").unwrap();
// Rec709
{
let chroma = Chromaticities {
r: (0.640, 0.330),
g: (0.300, 0.600),
b: (0.150, 0.060),
w: (0.3127, 0.3290),
};
let dest_path = Path::new(&out_dir).join("rec709_inc.rs");
let mut f = File::create(&dest_path).unwrap();
write_conversion_functions("rec709", chroma, &mut f);
write_conversion_functions("rec709", chroma::REC709, &mut f);
}
// Rec2020
{
let chroma = Chromaticities {
r: (0.708, 0.292),
g: (0.170, 0.797),
b: (0.131, 0.046),
w: (0.3127, 0.3290),
};
let dest_path = Path::new(&out_dir).join("rec2020_inc.rs");
let mut f = File::create(&dest_path).unwrap();
write_conversion_functions("rec2020", chroma, &mut f);
write_conversion_functions("rec2020", chroma::REC2020, &mut f);
}
// ACES AP0
{
let chroma = Chromaticities {
r: (0.73470, 0.26530),
g: (0.00000, 1.00000),
b: (0.00010, -0.07700),
w: (0.32168, 0.33767),
};
let dest_path = Path::new(&out_dir).join("aces_ap0_inc.rs");
let mut f = File::create(&dest_path).unwrap();
write_conversion_functions("aces_ap0", chroma, &mut f);
write_conversion_functions("aces_ap0", chroma::ACES_AP0, &mut f);
}
// ACES AP1
{
let chroma = Chromaticities {
r: (0.713, 0.293),
g: (0.165, 0.830),
b: (0.128, 0.044),
w: (0.32168, 0.33767),
};
let dest_path = Path::new(&out_dir).join("aces_ap1_inc.rs");
let mut f = File::create(&dest_path).unwrap();
write_conversion_functions("aces_ap1", chroma, &mut f);
write_conversion_functions("aces_ap1", chroma::ACES_AP1, &mut f);
}
}
/// Generates conversion functions for the given rgb to xyz transform matrix.
fn write_conversion_functions(space_name: &str, chroma: Chromaticities, f: &mut File) {
let to_xyz = rgb_to_xyz(chroma, 1.0);
let to_xyz = rgb_to_xyz_matrix(chroma);
f.write_all(
format!(
@@ -99,7 +69,7 @@ pub fn {}_to_xyz(rgb: (f32, f32, f32)) -> (f32, f32, f32) {{
)
.unwrap();
let inv = inverse(to_xyz);
let inv = invert(to_xyz).unwrap();
f.write_all(
format!(
r#"
@@ -127,12 +97,14 @@ pub fn xyz_to_{}(xyz: (f32, f32, f32)) -> (f32, f32, f32) {{
)
.unwrap();
let e_chroma = {
let mut e_chroma = chroma;
e_chroma.w = (1.0 / 3.0, 1.0 / 3.0);
e_chroma
};
let e_to_xyz = rgb_to_xyz(e_chroma, 1.0);
let e_to_xyz = matrix_compose!(
rgb_to_xyz_matrix(chroma),
xyz_chromatic_adaptation_matrix(
chroma.w,
(1.0 / 3.0, 1.0 / 3.0),
AdaptationMethod::Bradford,
),
);
f.write_all(
format!(
r#"
@@ -160,7 +132,7 @@ pub fn {}_e_to_xyz(rgb: (f32, f32, f32)) -> (f32, f32, f32) {{
)
.unwrap();
let inv_e = inverse(e_to_xyz);
let inv_e = invert(e_to_xyz).unwrap();
f.write_all(
format!(
r#"
@@ -188,135 +160,3 @@ pub fn xyz_to_{}_e(xyz: (f32, f32, f32)) -> (f32, f32, f32) {{
)
.unwrap();
}
/// Port of the RGBtoXYZ function from the ACES CTL reference implementation.
/// See lib/IlmCtlMath/CtlColorSpace.cpp in the CTL reference implementation.
///
/// This takes the chromaticities of an RGB colorspace and generates a
/// transform matrix from that space to XYZ.
///
/// * `chroma` is the chromaticities.
/// * `y` is the XYZ "Y" value that should map to RGB (1,1,1)
fn rgb_to_xyz(chroma: Chromaticities, y: f64) -> [[f64; 3]; 3] {
// X and Z values of RGB value (1, 1, 1), or "white"
let x = chroma.w.0 * y / chroma.w.1;
let z = (1.0 - chroma.w.0 - chroma.w.1) * y / chroma.w.1;
// Scale factors for matrix rows
let d = chroma.r.0 * (chroma.b.1 - chroma.g.1)
+ chroma.b.0 * (chroma.g.1 - chroma.r.1)
+ chroma.g.0 * (chroma.r.1 - chroma.b.1);
let sr = (x * (chroma.b.1 - chroma.g.1)
- chroma.g.0 * (y * (chroma.b.1 - 1.0) + chroma.b.1 * (x + z))
+ chroma.b.0 * (y * (chroma.g.1 - 1.0) + chroma.g.1 * (x + z)))
/ d;
let sg = (x * (chroma.r.1 - chroma.b.1)
+ chroma.r.0 * (y * (chroma.b.1 - 1.0) + chroma.b.1 * (x + z))
- chroma.b.0 * (y * (chroma.r.1 - 1.0) + chroma.r.1 * (x + z)))
/ d;
let sb = (x * (chroma.g.1 - chroma.r.1)
- chroma.r.0 * (y * (chroma.g.1 - 1.0) + chroma.g.1 * (x + z))
+ chroma.g.0 * (y * (chroma.r.1 - 1.0) + chroma.r.1 * (x + z)))
/ d;
// Assemble the matrix
let mut mat = [[0.0; 3]; 3];
mat[0][0] = sr * chroma.r.0;
mat[0][1] = sg * chroma.g.0;
mat[0][2] = sb * chroma.b.0;
mat[1][0] = sr * chroma.r.1;
mat[1][1] = sg * chroma.g.1;
mat[1][2] = sb * chroma.b.1;
mat[2][0] = sr * (1.0 - chroma.r.0 - chroma.r.1);
mat[2][1] = sg * (1.0 - chroma.g.0 - chroma.g.1);
mat[2][2] = sb * (1.0 - chroma.b.0 - chroma.b.1);
mat
}
/// Calculates the inverse of the given 3x3 matrix.
///
/// Ported to Rust from `gjInverse()` in IlmBase's Imath/ImathMatrix.h
fn inverse(m: [[f64; 3]; 3]) -> [[f64; 3]; 3] {
let mut s = [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]];
let mut t = m;
// Forward elimination
for i in 0..2 {
let mut pivot = i;
let mut pivotsize = t[i][i];
if pivotsize < 0.0 {
pivotsize = -pivotsize;
}
for j in (i + 1)..3 {
let mut tmp = t[j][i];
if tmp < 0.0 {
tmp = -tmp;
}
if tmp > pivotsize {
pivot = j;
pivotsize = tmp;
}
}
if pivotsize == 0.0 {
panic!("Cannot invert singular matrix.");
}
if pivot != i {
for j in 0..3 {
let mut tmp = t[i][j];
t[i][j] = t[pivot][j];
t[pivot][j] = tmp;
tmp = s[i][j];
s[i][j] = s[pivot][j];
s[pivot][j] = tmp;
}
}
for j in (i + 1)..3 {
let f = t[j][i] / t[i][i];
for k in 0..3 {
t[j][k] -= f * t[i][k];
s[j][k] -= f * s[i][k];
}
}
}
// Backward substitution
for i in (0..3).rev() {
let f = t[i][i];
if t[i][i] == 0.0 {
panic!("Cannot invert singular matrix.");
}
for j in 0..3 {
t[i][j] /= f;
s[i][j] /= f;
}
for j in 0..i {
let f = t[j][i];
for k in 0..3 {
t[j][k] -= f * t[i][k];
s[j][k] -= f * s[i][k];
}
}
}
s
}

View File

@@ -0,0 +1,10 @@
[package]
name = "data_tree"
version = "0.1.0"
authors = ["Nathan Vegdahl <cessen@cessen.com>"]
edition = "2018"
license = "MIT"
[lib]
name = "data_tree"
path = "src/lib.rs"

View File

@@ -0,0 +1,199 @@
#![allow(clippy::redundant_field_names)]
#![allow(clippy::needless_lifetimes)]
mod parse;
use parse::{ParseError, ParseEvent, Parser};
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
pub enum Event<'a> {
InnerOpen {
type_name: &'a str,
byte_offset: usize,
},
InnerClose {
byte_offset: usize,
},
Leaf {
type_name: &'a str,
contents: &'a str,
byte_offset: usize,
},
EOF,
}
//----------------------------------------------------------------------------
#[derive(Debug)]
pub enum Error {
ExpectedNameOrClose(usize),
ExpectedOpen(usize),
UnexpectedClose(usize),
UnexpectedEOF,
IO(std::io::Error),
}
impl std::error::Error for Error {}
impl std::fmt::Display for Error {
fn fmt(&self, f: &mut std::fmt::Formatter) -> Result<(), std::fmt::Error> {
write!(f, "{:?}", self)
}
}
impl From<ParseError> for Error {
fn from(e: ParseError) -> Self {
match e {
ParseError::ExpectedNameOrClose(byte_offset) => Error::ExpectedNameOrClose(byte_offset),
ParseError::ExpectedOpen(byte_offset) => Error::ExpectedOpen(byte_offset),
ParseError::UnexpectedClose(byte_offset) => Error::UnexpectedClose(byte_offset),
}
}
}
impl From<std::io::Error> for Error {
fn from(e: std::io::Error) -> Self {
Error::IO(e)
}
}
//-------------------------------------------------------------
#[derive(Debug)]
pub struct DataTreeReader<R: std::io::BufRead> {
parser: Parser,
reader: R,
buf: String,
eof: bool,
}
impl<R: std::io::BufRead> DataTreeReader<R> {
pub fn new(reader: R) -> Self {
Self {
parser: Parser::new(),
reader: reader,
buf: String::new(),
eof: false,
}
}
pub fn next_event<'a>(&'a mut self) -> Result<Event<'a>, Error> {
loop {
let valid_end = match self.parser.next_event()? {
ParseEvent::ValidEnd => true,
ParseEvent::NeedMoreInput => false,
// The transmutes below are because the borrow checker is
// over-conservative about this.  It thinks the lifetime
// isn't valid, but since we aren't mutating self after
// returning (and in fact can't, because of the borrow)
// there's no way for the references here to become invalid.
ParseEvent::InnerOpen {
type_name,
byte_offset,
} => {
return Ok(unsafe {
std::mem::transmute::<Event, Event>(Event::InnerOpen {
type_name,
byte_offset,
})
});
}
ParseEvent::InnerClose { byte_offset } => {
return Ok(unsafe {
std::mem::transmute::<Event, Event>(Event::InnerClose { byte_offset })
});
}
ParseEvent::Leaf {
type_name,
contents,
byte_offset,
} => {
return Ok(unsafe {
std::mem::transmute::<Event, Event>(Event::Leaf {
type_name,
contents,
byte_offset,
})
});
}
};
if !self.eof {
self.buf.clear();
let read = self.reader.read_line(&mut self.buf)?;
self.parser.push_data(&self.buf);
if read == 0 {
self.eof = true;
}
} else if !valid_end {
return Err(Error::UnexpectedEOF);
} else {
return Ok(Event::EOF);
}
}
}
pub fn peek_event<'a>(&'a mut self) -> Result<Event<'a>, Error> {
loop {
let valid_end = match self.parser.peek_event()? {
ParseEvent::ValidEnd => true,
ParseEvent::NeedMoreInput => false,
// The transmutes below are because the borrow checker is
// over-conservative about this.  It thinks the lifetime
// isn't valid, but since we aren't mutating self after
// returning (and in fact can't, because of the borrow)
// there's no way for the references here to become invalid.
ParseEvent::InnerOpen {
type_name,
byte_offset,
} => {
return Ok(unsafe {
std::mem::transmute::<Event, Event>(Event::InnerOpen {
type_name,
byte_offset,
})
});
}
ParseEvent::InnerClose { byte_offset } => {
return Ok(unsafe {
std::mem::transmute::<Event, Event>(Event::InnerClose { byte_offset })
});
}
ParseEvent::Leaf {
type_name,
contents,
byte_offset,
} => {
return Ok(unsafe {
std::mem::transmute::<Event, Event>(Event::Leaf {
type_name,
contents,
byte_offset,
})
});
}
};
if !self.eof {
self.buf.clear();
let read = self.reader.read_line(&mut self.buf)?;
self.parser.push_data(&self.buf);
if read == 0 {
self.eof = true;
}
} else if !valid_end {
return Err(Error::UnexpectedEOF);
} else {
return Ok(Event::EOF);
}
}
}
pub fn byte_offset(&self) -> usize {
self.parser.byte_offset()
}
}
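// A minimal usage sketch (illustrative, not part of the new file above): feeding a
// small in-memory document through `DataTreeReader`. `std::io::Cursor` is only a
// stand-in here for any `BufRead` source.
#[cfg(test)]
mod reader_sketch {
    use super::{DataTreeReader, Event};

    #[test]
    fn read_small_document() {
        let text = "Scene { Name [hello] }";
        let mut reader = DataTreeReader::new(std::io::Cursor::new(text));

        // Expect: open "Scene", leaf "Name", close, then EOF.
        assert!(matches!(
            reader.next_event().unwrap(),
            Event::InnerOpen { type_name: "Scene", .. }
        ));
        assert!(matches!(
            reader.next_event().unwrap(),
            Event::Leaf { type_name: "Name", contents: "hello", .. }
        ));
        assert!(matches!(reader.next_event().unwrap(), Event::InnerClose { .. }));
        assert!(matches!(reader.next_event().unwrap(), Event::EOF));
    }
}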

View File

@@ -0,0 +1,762 @@
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
pub enum ParseError {
ExpectedNameOrClose(usize),
ExpectedOpen(usize),
UnexpectedClose(usize),
}
impl std::error::Error for ParseError {}
impl std::fmt::Display for ParseError {
fn fmt(&self, f: &mut std::fmt::Formatter) -> Result<(), std::fmt::Error> {
write!(f, "{:?}", self)
}
}
//---------------------------------------------------------------------
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
pub enum ParseEvent<'a> {
InnerOpen {
type_name: &'a str,
byte_offset: usize,
},
InnerClose {
byte_offset: usize,
},
Leaf {
type_name: &'a str,
contents: &'a str,
byte_offset: usize,
},
NeedMoreInput,
ValidEnd, // All data so far is consumed, and this is a
// valid place to finish the parse.
}
impl<'a> ParseEvent<'a> {
fn add_to_byte_offset(&self, offset: usize) -> ParseEvent<'a> {
match *self {
ParseEvent::InnerOpen {
type_name,
byte_offset,
} => ParseEvent::InnerOpen {
type_name: type_name,
byte_offset: byte_offset + offset,
},
ParseEvent::InnerClose { byte_offset } => ParseEvent::InnerClose {
byte_offset: byte_offset + offset,
},
ParseEvent::Leaf {
type_name,
contents,
byte_offset,
} => ParseEvent::Leaf {
type_name: type_name,
contents: contents,
byte_offset: byte_offset + offset,
},
ParseEvent::NeedMoreInput => *self,
ParseEvent::ValidEnd => *self,
}
}
}
//---------------------------------------------------------------------
#[derive(Debug)]
pub struct Parser {
buffer: String,
buf_consumed_idx: usize,
total_bytes_processed: usize,
inner_opens: usize,
}
impl Parser {
pub fn new() -> Parser {
Parser {
buffer: String::with_capacity(1024),
buf_consumed_idx: 0,
total_bytes_processed: 0,
inner_opens: 0,
}
}
pub fn push_data(&mut self, text: &str) {
// Remove any consumed data.
if self.buf_consumed_idx > 0 {
self.buffer.replace_range(..self.buf_consumed_idx, "");
self.buf_consumed_idx = 0;
}
// Add the new data.
self.buffer.push_str(text);
}
pub fn next_event<'a>(&'a mut self) -> Result<ParseEvent<'a>, ParseError> {
// Remove any consumed data.
if self.buf_consumed_idx > 0 {
self.buffer.replace_range(..self.buf_consumed_idx, "");
self.buf_consumed_idx = 0;
}
// Try to parse an event from the valid prefix.
match try_parse_event(&self.buffer) {
ParseEventParse::Ok(event, bytes_consumed) => {
// Update internal state.
if let ParseEvent::InnerOpen { .. } = event {
self.inner_opens += 1;
} else if let ParseEvent::InnerClose { byte_offset, .. } = event {
if self.inner_opens == 0 {
return Err(ParseError::UnexpectedClose(
byte_offset + self.total_bytes_processed,
));
} else {
self.inner_opens -= 1;
}
}
self.buf_consumed_idx += bytes_consumed;
self.total_bytes_processed += bytes_consumed;
Ok(event.add_to_byte_offset(self.total_bytes_processed - self.buf_consumed_idx))
}
ParseEventParse::ReachedEnd => {
// All data so far has been consumed; if all nodes are
// properly closed we're done, otherwise we need more input.
if self.inner_opens == 0 {
Ok(ParseEvent::ValidEnd)
} else {
Ok(ParseEvent::NeedMoreInput)
}
}
ParseEventParse::IncompleteData => Ok(ParseEvent::NeedMoreInput),
// Hard errors.
ParseEventParse::ExpectedNameOrInnerClose(byte_offset) => Err(
ParseError::ExpectedNameOrClose(byte_offset + self.total_bytes_processed),
),
ParseEventParse::ExpectedOpen(byte_offset) => Err(ParseError::ExpectedOpen(
byte_offset + self.total_bytes_processed,
)),
}
}
pub fn peek_event<'a>(&'a mut self) -> Result<ParseEvent<'a>, ParseError> {
// Remove any consumed data.
if self.buf_consumed_idx > 0 {
self.buffer.replace_range(..self.buf_consumed_idx, "");
self.buf_consumed_idx = 0;
}
// Try to parse an event from the valid prefix.
match try_parse_event(&self.buffer) {
ParseEventParse::Ok(event, _bytes_consumed) => {
if let ParseEvent::InnerClose { byte_offset, .. } = event {
if self.inner_opens == 0 {
return Err(ParseError::UnexpectedClose(
byte_offset + self.total_bytes_processed,
));
}
}
Ok(event.add_to_byte_offset(self.total_bytes_processed))
}
ParseEventParse::ReachedEnd => {
// All data so far has been consumed; if all nodes are
// properly closed we're done, otherwise we need more input.
if self.inner_opens == 0 {
Ok(ParseEvent::ValidEnd)
} else {
Ok(ParseEvent::NeedMoreInput)
}
}
ParseEventParse::IncompleteData => Ok(ParseEvent::NeedMoreInput),
// Hard errors.
ParseEventParse::ExpectedNameOrInnerClose(byte_offset) => Err(
ParseError::ExpectedNameOrClose(byte_offset + self.total_bytes_processed),
),
ParseEventParse::ExpectedOpen(byte_offset) => Err(ParseError::ExpectedOpen(
byte_offset + self.total_bytes_processed,
)),
}
}
pub fn byte_offset(&self) -> usize {
self.total_bytes_processed + self.buf_consumed_idx
}
}
//--------------------------------------------------------------------------
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
enum ParseEventParse<'a> {
Ok(ParseEvent<'a>, usize), // (event, bytes consumed)
ReachedEnd, // Reached the end of the buffer in a valid state, with no event.
IncompleteData, // Need more data to parse.
// ParseErrors.
ExpectedNameOrInnerClose(usize),
ExpectedOpen(usize),
}
fn try_parse_event<'a>(text: &'a str) -> ParseEventParse<'a> {
// Remove leading whitespace and comments.
let mut source_text = skip_ws_and_comments((0, text));
let start_idx = source_text.0;
// First token.
let type_name = match next_token(source_text) {
// Type name, record and continue.
(Token::Name(tn), tail) => {
source_text = tail;
tn
}
// Closing tag for inner node. Return.
(Token::CloseInner, tail) => {
return ParseEventParse::Ok(
ParseEvent::InnerClose {
byte_offset: start_idx,
},
tail.0,
);
}
// We consumed everything as whitespace and/or
// comments. Return.
(Token::End, _) => {
return ParseEventParse::ReachedEnd;
}
// Invalid.
_ => return ParseEventParse::ExpectedNameOrInnerClose(start_idx),
};
// Skip whitespace and comments to get the start of
// where there should be an open tag, for use later in error.
source_text = skip_ws_and_comments(source_text);
let open_start_idx = source_text.0;
// Last part of the event.
match next_token(source_text) {
// Beginning of an inner node.
(Token::OpenInner, tail) => ParseEventParse::Ok(
ParseEvent::InnerOpen {
type_name: type_name,
byte_offset: start_idx,
},
tail.0,
),
// Try to parse entire leaf node.
(Token::OpenLeaf, tail) => {
// Get contents.
let (contents, tail2) = parse_leaf_content(tail);
source_text = tail2;
// Try to get closing tag.
match next_token(source_text) {
// If it's a leaf closing tag, we're done!
// Return the leaf event.
(Token::CloseLeaf, tail) => ParseEventParse::Ok(
ParseEvent::Leaf {
type_name: type_name,
contents: contents,
byte_offset: start_idx,
},
tail.0,
),
// Otherwise...
_ => {
if source_text.1.is_empty() {
// If there's no text left, we're just incomplete.
ParseEventParse::IncompleteData
} else {
// Otherwise, this would be a parse error...
// except that this shouldn't be reachable,
// since everything should be consumable for
// leaf content up until a close tag.
unreachable!("Expected leaf close tag.")
}
}
}
}
// We consumed everything else as whitespace
// and/or comments, so we're incomplete. Return.
(Token::End, _) => ParseEventParse::IncompleteData,
// Invalid.
_ => ParseEventParse::ExpectedOpen(open_start_idx),
}
}
fn parse_leaf_content(source_text: (usize, &str)) -> (&str, (usize, &str)) {
let mut si = 1;
let mut escaped = false;
let mut reached_end = true;
for (i, c) in source_text.1.char_indices() {
si = i;
if escaped {
escaped = false;
} else if c == '\\' {
escaped = true;
} else if c == ']' {
reached_end = false;
break;
}
}
if reached_end {
si = source_text.1.len();
}
(
&source_text.1[0..si],
(source_text.0 + si, &source_text.1[si..]),
)
}
//--------------------------------------------------------------------------
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
enum Token<'a> {
OpenInner,
CloseInner,
OpenLeaf,
CloseLeaf,
Name(&'a str),
End,
Unknown,
}
fn next_token<'a>(source_text: (usize, &'a str)) -> (Token<'a>, (usize, &'a str)) {
let text1 = skip_ws_and_comments(source_text);
if let Some(c) = text1.1.chars().next() {
let text2 = (text1.0 + c.len_utf8(), &text1.1[c.len_utf8()..]);
match c {
'{' => (Token::OpenInner, text2),
'}' => (Token::CloseInner, text2),
'[' => (Token::OpenLeaf, text2),
']' => (Token::CloseLeaf, text2),
_ => {
if is_ident_char(c) {
// Parse type
let mut si = 0;
let mut reached_end = true;
for (i, c) in text1.1.char_indices() {
si = i;
if !is_ident_char(c) {
reached_end = false;
break;
}
}
if reached_end {
si = text1.1.len();
}
(Token::Name(&text1.1[0..si]), (text1.0 + si, &text1.1[si..]))
} else {
(Token::Unknown, text1)
}
}
}
} else {
(Token::End, text1)
}
}
fn is_ws(c: char) -> bool {
matches!(c, '\n' | '\r' | '\t' | ' ')
}
fn is_nl(c: char) -> bool {
c == '\n'
}
fn is_ident_char(c: char) -> bool {
c.is_alphanumeric() || c == '-' || c == '_'
}
fn skip_ws(text: &str) -> &str {
let mut si = 0;
let mut reached_end = true;
for (i, c) in text.char_indices() {
si = i;
if !is_ws(c) {
reached_end = false;
break;
}
}
if reached_end {
si = text.len();
}
&text[si..]
}
fn skip_comment(text: &str) -> &str {
let mut si = 0;
if text.starts_with('#') {
let mut reached_end = true;
for (i, c) in text.char_indices() {
si = i;
if is_nl(c) {
reached_end = false;
break;
}
}
if reached_end {
si = text.len();
}
}
&text[si..]
}
fn skip_ws_and_comments(text: (usize, &str)) -> (usize, &str) {
let mut remaining_text = text.1;
loop {
let tmp = skip_comment(skip_ws(remaining_text));
if tmp.len() == remaining_text.len() {
break;
} else {
remaining_text = tmp;
}
}
let offset = text.0 + text.1.len() - remaining_text.len();
(offset, remaining_text)
}
//--------------------------------------------------------------------------
#[cfg(test)]
mod tests {
use super::*;
use super::{next_token, Token};
#[test]
fn tokenize_01() {
let input = (0, "Thing");
assert_eq!(next_token(input), (Token::Name("Thing"), (5, "")));
}
#[test]
fn tokenize_02() {
let input = (0, " \n# gdfgdf gfdg dggdf\\sg dfgsd \n Thing");
assert_eq!(next_token(input), (Token::Name("Thing"), (41, "")));
}
#[test]
fn tokenize_03() {
let input1 = (0, " Thing { }");
let (token1, input2) = next_token(input1);
let (token2, input3) = next_token(input2);
let (token3, input4) = next_token(input3);
assert_eq!((token1, input2.1), (Token::Name("Thing"), " { }"));
assert_eq!((token2, input3.1), (Token::OpenInner, " }"));
assert_eq!((token3, input4.1), (Token::CloseInner, ""));
}
#[test]
fn tokenize_04() {
let input1 = (0, " hi the[re");
let (token1, input2) = next_token(input1);
let (token2, input3) = next_token(input2);
let (token3, input4) = next_token(input3);
let (token4, input5) = next_token(input4);
let (token5, input6) = next_token(input5);
assert_eq!((token1, input2), (Token::Name("hi"), (3, " the[re")));
assert_eq!((token2, input3), (Token::Name("the"), (7, "[re")));
assert_eq!((token3, input4), (Token::OpenLeaf, (8, "re")));
assert_eq!((token4, input5), (Token::Name("re"), (10, "")));
assert_eq!((token5, input6), (Token::End, (10, "")));
}
#[test]
fn tokenize_05() {
let input1 = (0, "Thing { # A comment\n\tThing2 []\n}");
let (token1, input2) = next_token(input1);
let (token2, input3) = next_token(input2);
let (token3, input4) = next_token(input3);
let (token4, input5) = next_token(input4);
let (token5, input6) = next_token(input5);
let (token6, input7) = next_token(input6);
let (token7, input8) = next_token(input7);
assert_eq!(
(token1, input2),
(Token::Name("Thing"), (5, " { # A comment\n\tThing2 []\n}",))
);
assert_eq!(
(token2, input3),
(Token::OpenInner, (7, " # A comment\n\tThing2 []\n}",))
);
assert_eq!((token3, input4), (Token::Name("Thing2"), (27, " []\n}")));
assert_eq!((token4, input5), (Token::OpenLeaf, (29, "]\n}")));
assert_eq!((token5, input6), (Token::CloseLeaf, (30, "\n}")));
assert_eq!((token6, input7), (Token::CloseInner, (32, "")));
assert_eq!((token7, input8), (Token::End, (32, "")));
}
#[test]
fn try_parse_event_01() {
assert_eq!(try_parse_event("H"), ParseEventParse::IncompleteData,);
}
#[test]
fn try_parse_event_02() {
assert_eq!(try_parse_event("Hello "), ParseEventParse::IncompleteData,);
}
#[test]
fn try_parse_event_03() {
assert_eq!(
try_parse_event("Hello {"),
ParseEventParse::Ok(
ParseEvent::InnerOpen {
type_name: "Hello",
byte_offset: 0,
},
7
),
);
}
#[test]
fn try_parse_event_04() {
assert_eq!(
try_parse_event(" Hello {"),
ParseEventParse::Ok(
ParseEvent::InnerOpen {
type_name: "Hello",
byte_offset: 2,
},
9
),
);
}
#[test]
fn try_parse_event_05() {
assert_eq!(
try_parse_event("Hello { "),
ParseEventParse::Ok(
ParseEvent::InnerOpen {
type_name: "Hello",
byte_offset: 0,
},
7
),
);
}
#[test]
fn try_parse_event_06() {
assert_eq!(try_parse_event("Hello ["), ParseEventParse::IncompleteData,);
}
#[test]
fn try_parse_event_07() {
assert_eq!(
try_parse_event("Hello [some contents"),
ParseEventParse::IncompleteData,
);
}
#[test]
fn try_parse_event_08() {
assert_eq!(
try_parse_event("Hello [some contents]"),
ParseEventParse::Ok(
ParseEvent::Leaf {
type_name: "Hello",
contents: "some contents",
byte_offset: 0,
},
21
),
);
}
#[test]
fn try_parse_event_09() {
assert_eq!(
try_parse_event("Hello [some contents] "),
ParseEventParse::Ok(
ParseEvent::Leaf {
type_name: "Hello",
contents: "some contents",
byte_offset: 0,
},
21
),
);
}
#[test]
fn try_parse_event_10() {
assert_eq!(
try_parse_event(r#"Hello [some \co\]ntents]"#),
ParseEventParse::Ok(
ParseEvent::Leaf {
type_name: "Hello",
contents: r#"some \co\]ntents"#,
byte_offset: 0,
},
24
),
);
}
#[test]
fn try_parse_event_11() {
assert_eq!(
try_parse_event(" # A comment\n\n "),
ParseEventParse::ReachedEnd,
);
}
#[test]
fn parser_01() {
let mut parser = Parser::new();
parser.push_data("Hello");
assert_eq!(parser.next_event(), Ok(ParseEvent::NeedMoreInput));
parser.push_data("{");
assert_eq!(
parser.next_event(),
Ok(ParseEvent::InnerOpen {
type_name: "Hello",
byte_offset: 0,
})
);
assert_eq!(parser.next_event(), Ok(ParseEvent::NeedMoreInput));
parser.push_data("}");
assert_eq!(
parser.next_event(),
Ok(ParseEvent::InnerClose { byte_offset: 6 })
);
assert_eq!(parser.next_event(), Ok(ParseEvent::ValidEnd));
}
#[test]
fn parser_02() {
let mut parser = Parser::new();
parser.push_data("Hello");
assert_eq!(parser.next_event(), Ok(ParseEvent::NeedMoreInput));
parser.push_data("[");
assert_eq!(parser.next_event(), Ok(ParseEvent::NeedMoreInput));
parser.push_data("1.0 2.0 3.");
assert_eq!(parser.next_event(), Ok(ParseEvent::NeedMoreInput));
parser.push_data("0]");
assert_eq!(
parser.next_event(),
Ok(ParseEvent::Leaf {
type_name: "Hello",
contents: "1.0 2.0 3.0",
byte_offset: 0,
})
);
assert_eq!(parser.next_event(), Ok(ParseEvent::ValidEnd));
}
#[test]
fn parser_03() {
let mut parser = Parser::new();
parser.push_data("Hello { World [1.0 2.0 3.0] }");
assert_eq!(
parser.next_event(),
Ok(ParseEvent::InnerOpen {
type_name: "Hello",
byte_offset: 0,
})
);
assert_eq!(
parser.next_event(),
Ok(ParseEvent::Leaf {
type_name: "World",
contents: "1.0 2.0 3.0",
byte_offset: 8,
})
);
assert_eq!(
parser.next_event(),
Ok(ParseEvent::InnerClose { byte_offset: 28 })
);
// Make sure repeated calls are stable.
assert_eq!(parser.next_event(), Ok(ParseEvent::ValidEnd));
assert_eq!(parser.next_event(), Ok(ParseEvent::ValidEnd));
assert_eq!(parser.next_event(), Ok(ParseEvent::ValidEnd));
}
#[test]
fn parser_04() {
let mut parser = Parser::new();
parser.push_data("$%^&");
assert_eq!(parser.next_event(), Err(ParseError::ExpectedNameOrClose(0)));
}
#[test]
fn parser_05() {
let mut parser = Parser::new();
parser.push_data("Hello]");
assert_eq!(parser.next_event(), Err(ParseError::ExpectedOpen(5)));
}
#[test]
fn parser_06() {
let mut parser = Parser::new();
parser.push_data("Hello}");
assert_eq!(parser.next_event(), Err(ParseError::ExpectedOpen(5)));
}
#[test]
fn parser_07() {
let mut parser = Parser::new();
parser.push_data("Hello $*@^ [");
assert_eq!(parser.next_event(), Err(ParseError::ExpectedOpen(6)));
}
#[test]
fn parser_08() {
let mut parser = Parser::new();
parser.push_data("}");
assert_eq!(parser.next_event(), Err(ParseError::UnexpectedClose(0)));
}
}

View File

@@ -1,28 +0,0 @@
#![allow(dead_code)]
mod normal;
mod point;
mod transform;
mod vector;
pub use self::{normal::Normal, point::Point, transform::Transform, vector::Vector};
/// Trait for calculating dot products.
pub trait DotProduct {
fn dot(self, other: Self) -> f32;
}
#[inline]
pub fn dot<T: DotProduct>(a: T, b: T) -> f32 {
a.dot(b)
}
/// Trait for calculating cross products.
pub trait CrossProduct {
fn cross(self, other: Self) -> Self;
}
#[inline]
pub fn cross<T: CrossProduct>(a: T, b: T) -> T {
a.cross(b)
}

View File

@@ -1,270 +0,0 @@
#![allow(dead_code)]
use std::{
cmp::PartialEq,
ops::{Add, Div, Mul, Neg, Sub},
};
use glam::Vec3A;
use super::{CrossProduct, DotProduct, Transform, Vector};
/// A surface normal in 3d homogeneous space.
#[derive(Debug, Copy, Clone)]
pub struct Normal {
pub co: Vec3A,
}
impl Normal {
#[inline(always)]
pub fn new(x: f32, y: f32, z: f32) -> Normal {
Normal {
co: Vec3A::new(x, y, z),
}
}
#[inline(always)]
pub fn length(&self) -> f32 {
self.co.length()
}
#[inline(always)]
pub fn length2(&self) -> f32 {
self.co.length_squared()
}
#[inline(always)]
pub fn normalized(&self) -> Normal {
Normal {
co: self.co.normalize(),
}
}
#[inline(always)]
pub fn into_vector(self) -> Vector {
Vector { co: self.co }
}
#[inline(always)]
pub fn get_n(&self, n: usize) -> f32 {
match n {
0 => self.x(),
1 => self.y(),
2 => self.z(),
_ => panic!("Attempt to access dimension beyond z."),
}
}
#[inline(always)]
pub fn x(&self) -> f32 {
self.co[0]
}
#[inline(always)]
pub fn y(&self) -> f32 {
self.co[1]
}
#[inline(always)]
pub fn z(&self) -> f32 {
self.co[2]
}
#[inline(always)]
pub fn set_x(&mut self, x: f32) {
self.co[0] = x;
}
#[inline(always)]
pub fn set_y(&mut self, y: f32) {
self.co[1] = y;
}
#[inline(always)]
pub fn set_z(&mut self, z: f32) {
self.co[2] = z;
}
}
impl PartialEq for Normal {
#[inline(always)]
fn eq(&self, other: &Normal) -> bool {
self.co == other.co
}
}
impl Add for Normal {
type Output = Normal;
#[inline(always)]
fn add(self, other: Normal) -> Normal {
Normal {
co: self.co + other.co,
}
}
}
impl Sub for Normal {
type Output = Normal;
#[inline(always)]
fn sub(self, other: Normal) -> Normal {
Normal {
co: self.co - other.co,
}
}
}
impl Mul<f32> for Normal {
type Output = Normal;
#[inline(always)]
fn mul(self, other: f32) -> Normal {
Normal {
co: self.co * other,
}
}
}
impl Mul<Transform> for Normal {
type Output = Normal;
#[inline]
fn mul(self, other: Transform) -> Normal {
Normal {
co: other.0.matrix3.inverse().transpose().mul_vec3a(self.co),
}
}
}
impl Div<f32> for Normal {
type Output = Normal;
#[inline(always)]
fn div(self, other: f32) -> Normal {
Normal {
co: self.co / other,
}
}
}
impl Neg for Normal {
type Output = Normal;
#[inline(always)]
fn neg(self) -> Normal {
Normal { co: self.co * -1.0 }
}
}
impl DotProduct for Normal {
#[inline(always)]
fn dot(self, other: Normal) -> f32 {
self.co.dot(other.co)
}
}
impl CrossProduct for Normal {
#[inline]
fn cross(self, other: Normal) -> Normal {
Normal {
co: self.co.cross(other.co),
}
}
}
#[cfg(test)]
mod tests {
use super::super::{CrossProduct, DotProduct, Transform};
use super::*;
use approx::assert_ulps_eq;
#[test]
fn add() {
let v1 = Normal::new(1.0, 2.0, 3.0);
let v2 = Normal::new(1.5, 4.5, 2.5);
let v3 = Normal::new(2.5, 6.5, 5.5);
assert_eq!(v3, v1 + v2);
}
#[test]
fn sub() {
let v1 = Normal::new(1.0, 2.0, 3.0);
let v2 = Normal::new(1.5, 4.5, 2.5);
let v3 = Normal::new(-0.5, -2.5, 0.5);
assert_eq!(v3, v1 - v2);
}
#[test]
fn mul_scalar() {
let v1 = Normal::new(1.0, 2.0, 3.0);
let v2 = 2.0;
let v3 = Normal::new(2.0, 4.0, 6.0);
assert_eq!(v3, v1 * v2);
}
#[test]
fn mul_matrix_1() {
let n = Normal::new(1.0, 2.5, 4.0);
let m = Transform::new_from_values(
1.0, 2.0, 2.0, 1.5, 3.0, 6.0, 7.0, 8.0, 9.0, 2.0, 11.0, 12.0,
);
let nm = n * m;
let nm2 = Normal::new(-4.0625, 1.78125, -0.03125);
for i in 0..3 {
assert_ulps_eq!(nm.co[i], nm2.co[i], max_ulps = 4);
}
}
#[test]
fn div() {
let v1 = Normal::new(1.0, 2.0, 3.0);
let v2 = 2.0;
let v3 = Normal::new(0.5, 1.0, 1.5);
assert_eq!(v3, v1 / v2);
}
#[test]
fn length() {
let n = Normal::new(1.0, 2.0, 3.0);
assert!((n.length() - 3.7416573867739413).abs() < 0.000001);
}
#[test]
fn length2() {
let n = Normal::new(1.0, 2.0, 3.0);
assert_eq!(n.length2(), 14.0);
}
#[test]
fn normalized() {
let n1 = Normal::new(1.0, 2.0, 3.0);
let n2 = Normal::new(0.2672612419124244, 0.5345224838248488, 0.8017837257372732);
let n3 = n1.normalized();
assert!((n3.x() - n2.x()).abs() < 0.000001);
assert!((n3.y() - n2.y()).abs() < 0.000001);
assert!((n3.z() - n2.z()).abs() < 0.000001);
}
#[test]
fn dot_test() {
let v1 = Normal::new(1.0, 2.0, 3.0);
let v2 = Normal::new(1.5, 4.5, 2.5);
let v3 = 18.0f32;
assert_eq!(v3, v1.dot(v2));
}
#[test]
fn cross_test() {
let v1 = Normal::new(1.0, 0.0, 0.0);
let v2 = Normal::new(0.0, 1.0, 0.0);
let v3 = Normal::new(0.0, 0.0, 1.0);
assert_eq!(v3, v1.cross(v2));
}
}

View File

@@ -1,202 +0,0 @@
#![allow(dead_code)]
use std::{
cmp::PartialEq,
ops::{Add, Mul, Sub},
};
use glam::Vec3A;
use super::{Transform, Vector};
/// A position in 3d homogeneous space.
#[derive(Debug, Copy, Clone)]
pub struct Point {
pub co: Vec3A,
}
impl Point {
#[inline(always)]
pub fn new(x: f32, y: f32, z: f32) -> Point {
Point {
co: Vec3A::new(x, y, z),
}
}
#[inline(always)]
pub fn min(&self, other: Point) -> Point {
let n1 = self;
let n2 = other;
Point {
co: n1.co.min(n2.co),
}
}
#[inline(always)]
pub fn max(&self, other: Point) -> Point {
let n1 = self;
let n2 = other;
Point {
co: n1.co.max(n2.co),
}
}
#[inline(always)]
pub fn into_vector(self) -> Vector {
Vector { co: self.co }
}
#[inline(always)]
pub fn get_n(&self, n: usize) -> f32 {
match n {
0 => self.x(),
1 => self.y(),
2 => self.z(),
_ => panic!("Attempt to access dimension beyond z."),
}
}
#[inline(always)]
pub fn x(&self) -> f32 {
self.co[0]
}
#[inline(always)]
pub fn y(&self) -> f32 {
self.co[1]
}
#[inline(always)]
pub fn z(&self) -> f32 {
self.co[2]
}
#[inline(always)]
pub fn set_x(&mut self, x: f32) {
self.co[0] = x;
}
#[inline(always)]
pub fn set_y(&mut self, y: f32) {
self.co[1] = y;
}
#[inline(always)]
pub fn set_z(&mut self, z: f32) {
self.co[2] = z;
}
}
impl PartialEq for Point {
#[inline(always)]
fn eq(&self, other: &Point) -> bool {
self.co == other.co
}
}
impl Add<Vector> for Point {
type Output = Point;
#[inline(always)]
fn add(self, other: Vector) -> Point {
Point {
co: self.co + other.co,
}
}
}
impl Sub for Point {
type Output = Vector;
#[inline(always)]
fn sub(self, other: Point) -> Vector {
Vector {
co: self.co - other.co,
}
}
}
impl Sub<Vector> for Point {
type Output = Point;
#[inline(always)]
fn sub(self, other: Vector) -> Point {
Point {
co: self.co - other.co,
}
}
}
impl Mul<Transform> for Point {
type Output = Point;
#[inline]
fn mul(self, other: Transform) -> Point {
Point {
co: other.0.transform_point3a(self.co),
}
}
}
#[cfg(test)]
mod tests {
use super::super::{Transform, Vector};
use super::*;
#[test]
fn add() {
let p1 = Point::new(1.0, 2.0, 3.0);
let v1 = Vector::new(1.5, 4.5, 2.5);
let p2 = Point::new(2.5, 6.5, 5.5);
assert_eq!(p2, p1 + v1);
}
#[test]
fn sub() {
let p1 = Point::new(1.0, 2.0, 3.0);
let p2 = Point::new(1.5, 4.5, 2.5);
let v1 = Vector::new(-0.5, -2.5, 0.5);
assert_eq!(v1, p1 - p2);
}
#[test]
fn mul_matrix_1() {
let p = Point::new(1.0, 2.5, 4.0);
let m = Transform::new_from_values(
1.0, 2.0, 2.0, 1.5, 3.0, 6.0, 7.0, 8.0, 9.0, 2.0, 11.0, 12.0,
);
let pm = Point::new(15.5, 54.0, 70.0);
assert_eq!(p * m, pm);
}
#[test]
fn mul_matrix_2() {
let p = Point::new(1.0, 2.5, 4.0);
let m = Transform::new_from_values(
1.0, 2.0, 2.0, 1.5, 3.0, 6.0, 7.0, 8.0, 9.0, 2.0, 11.0, 12.0,
);
let pm = Point::new(15.5, 54.0, 70.0);
assert_eq!(p * m, pm);
}
#[test]
fn mul_matrix_3() {
// Make sure matrix multiplication composes the way one would expect
let p = Point::new(1.0, 2.5, 4.0);
let m1 = Transform::new_from_values(
1.0, 2.0, 2.0, 1.5, 3.0, 6.0, 7.0, 8.0, 9.0, 2.0, 11.0, 12.0,
);
let m2 =
Transform::new_from_values(4.0, 1.0, 2.0, 3.5, 3.0, 6.0, 5.0, 2.0, 2.0, 2.0, 4.0, 12.0);
println!("{:?}", m1 * m2);
let pmm1 = p * (m1 * m2);
let pmm2 = (p * m1) * m2;
assert!((pmm1 - pmm2).length2() <= 0.00001); // Assert pmm1 and pmm2 are roughly equal
}
}


@@ -1,178 +0,0 @@
#![allow(dead_code)]
use std::ops::{Add, Mul};
use approx::relative_eq;
use glam::{Affine3A, Mat3, Mat4, Vec3};
use super::Point;
/// A 4x3 affine transform matrix, used for transforms.
#[derive(Debug, Copy, Clone, PartialEq)]
pub struct Transform(pub Affine3A);
impl Transform {
/// Creates a new identity matrix
#[inline]
pub fn new() -> Transform {
Transform(Affine3A::IDENTITY)
}
/// Creates a new matrix with the specified values:
/// a b c d
/// e f g h
/// i j k l
/// (with an implicit bottom row of `0 0 0 1`)
#[inline]
#[allow(clippy::many_single_char_names)]
#[allow(clippy::too_many_arguments)]
pub fn new_from_values(
a: f32,
b: f32,
c: f32,
d: f32,
e: f32,
f: f32,
g: f32,
h: f32,
i: f32,
j: f32,
k: f32,
l: f32,
) -> Transform {
Transform(Affine3A::from_mat3_translation(
Mat3::from_cols(Vec3::new(a, e, i), Vec3::new(b, f, j), Vec3::new(c, g, k)),
Vec3::new(d, h, l),
))
}
#[inline]
pub fn from_location(loc: Point) -> Transform {
Transform(Affine3A::from_translation(loc.co.into()))
}
/// Returns whether the matrices are approximately equal to each other.
/// Each corresponding element in the matrices cannot have a relative
/// error exceeding epsilon.
#[inline]
pub fn aprx_eq(&self, other: Transform, epsilon: f32) -> bool {
let mut eq = true;
for c in 0..3 {
for r in 0..3 {
let a = self.0.matrix3.col(c)[r];
let b = other.0.matrix3.col(c)[r];
eq &= relative_eq!(a, b, epsilon = epsilon);
}
}
for i in 0..3 {
let a = self.0.translation[i];
let b = other.0.translation[i];
eq &= relative_eq!(a, b, epsilon = epsilon);
}
eq
}
/// Returns the inverse of the Matrix
#[inline]
pub fn inverse(&self) -> Transform {
Transform(self.0.inverse())
}
}
impl Default for Transform {
fn default() -> Self {
Self::new()
}
}
/// Multiply two matrices together
impl Mul for Transform {
type Output = Self;
#[inline]
fn mul(self, other: Self) -> Self {
Self(other.0 * self.0)
}
}
/// Multiply a matrix by a f32
impl Mul<f32> for Transform {
type Output = Self;
#[inline]
fn mul(self, other: f32) -> Self {
Self(Affine3A::from_mat4(Mat4::from(self.0) * other))
}
}
/// Add two matrices together
impl Add for Transform {
type Output = Self;
#[inline]
fn add(self, other: Self) -> Self {
Self(Affine3A::from_mat4(
Mat4::from(self.0) + Mat4::from(other.0),
))
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn equality_test() {
let a = Transform::new();
let b = Transform::new();
let c =
Transform::new_from_values(1.1, 0.0, 0.0, 0.0, 0.0, 1.1, 0.0, 0.0, 0.0, 0.0, 1.1, 0.0);
assert_eq!(a, b);
assert!(a != c);
}
#[test]
fn approximate_equality_test() {
let a = Transform::new();
let b = Transform::new_from_values(
1.000001, 0.0, 0.0, 0.0, 0.0, 1.000001, 0.0, 0.0, 0.0, 0.0, 1.000001, 0.0,
);
let c = Transform::new_from_values(
1.000003, 0.0, 0.0, 0.0, 0.0, 1.000003, 0.0, 0.0, 0.0, 0.0, 1.000003, 0.0,
);
let d = Transform::new_from_values(
-1.000001, 0.0, 0.0, 0.0, 0.0, -1.000001, 0.0, 0.0, 0.0, 0.0, -1.000001, 0.0,
);
assert!(a.aprx_eq(b, 0.000001));
assert!(!a.aprx_eq(c, 0.000001));
assert!(!a.aprx_eq(d, 0.000001));
}
#[test]
fn multiply_test() {
let a = Transform::new_from_values(
1.0, 2.0, 2.0, 1.5, 3.0, 6.0, 7.0, 8.0, 9.0, 2.0, 11.0, 12.0,
);
let b = Transform::new_from_values(
1.0, 5.0, 9.0, 13.0, 2.0, 6.0, 10.0, 14.0, 3.0, 7.0, 11.0, 15.0,
);
let c = Transform::new_from_values(
97.0, 50.0, 136.0, 162.5, 110.0, 60.0, 156.0, 185.0, 123.0, 70.0, 176.0, 207.5,
);
assert_eq!(a * b, c);
}
#[test]
fn inverse_test() {
let a = Transform::new_from_values(
1.0, 0.33, 0.0, -2.0, 0.0, 1.0, 0.0, 0.0, 2.1, 0.7, 1.3, 0.0,
);
let b = a.inverse();
let c = Transform::new();
assert!((dbg!(a * b)).aprx_eq(dbg!(c), 0.0000001));
}
}


@@ -1,286 +0,0 @@
#![allow(dead_code)]
use std::{
cmp::PartialEq,
ops::{Add, Div, Mul, Neg, Sub},
};
use glam::Vec3A;
use super::{CrossProduct, DotProduct, Normal, Point, Transform};
/// A direction vector in 3d homogeneous space.
#[derive(Debug, Copy, Clone)]
pub struct Vector {
pub co: Vec3A,
}
impl Vector {
#[inline(always)]
pub fn new(x: f32, y: f32, z: f32) -> Vector {
Vector {
co: Vec3A::new(x, y, z),
}
}
#[inline(always)]
pub fn length(&self) -> f32 {
self.co.length()
}
#[inline(always)]
pub fn length2(&self) -> f32 {
self.co.length_squared()
}
#[inline(always)]
pub fn normalized(&self) -> Vector {
Vector {
co: self.co.normalize(),
}
}
#[inline(always)]
pub fn abs(&self) -> Vector {
Vector {
co: self.co * self.co.signum(),
}
}
#[inline(always)]
pub fn into_point(self) -> Point {
Point { co: self.co }
}
#[inline(always)]
pub fn into_normal(self) -> Normal {
Normal { co: self.co }
}
#[inline(always)]
pub fn get_n(&self, n: usize) -> f32 {
match n {
0 => self.x(),
1 => self.y(),
2 => self.z(),
_ => panic!("Attempt to access dimension beyond z."),
}
}
#[inline(always)]
pub fn x(&self) -> f32 {
self.co[0]
}
#[inline(always)]
pub fn y(&self) -> f32 {
self.co[1]
}
#[inline(always)]
pub fn z(&self) -> f32 {
self.co[2]
}
#[inline(always)]
pub fn set_x(&mut self, x: f32) {
self.co[0] = x;
}
#[inline(always)]
pub fn set_y(&mut self, y: f32) {
self.co[1] = y;
}
#[inline(always)]
pub fn set_z(&mut self, z: f32) {
self.co[2] = z;
}
}
impl PartialEq for Vector {
#[inline(always)]
fn eq(&self, other: &Vector) -> bool {
self.co == other.co
}
}
impl Add for Vector {
type Output = Vector;
#[inline(always)]
fn add(self, other: Vector) -> Vector {
Vector {
co: self.co + other.co,
}
}
}
impl Sub for Vector {
type Output = Vector;
#[inline(always)]
fn sub(self, other: Vector) -> Vector {
Vector {
co: self.co - other.co,
}
}
}
impl Mul<f32> for Vector {
type Output = Vector;
#[inline(always)]
fn mul(self, other: f32) -> Vector {
Vector {
co: self.co * other,
}
}
}
impl Mul<Transform> for Vector {
type Output = Vector;
#[inline]
fn mul(self, other: Transform) -> Vector {
Vector {
co: other.0.transform_vector3a(self.co),
}
}
}
impl Div<f32> for Vector {
type Output = Vector;
#[inline(always)]
fn div(self, other: f32) -> Vector {
Vector {
co: self.co / other,
}
}
}
impl Neg for Vector {
type Output = Vector;
#[inline(always)]
fn neg(self) -> Vector {
Vector { co: self.co * -1.0 }
}
}
impl DotProduct for Vector {
#[inline(always)]
fn dot(self, other: Vector) -> f32 {
self.co.dot(other.co)
}
}
impl CrossProduct for Vector {
#[inline]
fn cross(self, other: Vector) -> Vector {
Vector {
co: self.co.cross(other.co),
}
}
}
#[cfg(test)]
mod tests {
use super::super::{CrossProduct, DotProduct, Transform};
use super::*;
#[test]
fn add() {
let v1 = Vector::new(1.0, 2.0, 3.0);
let v2 = Vector::new(1.5, 4.5, 2.5);
let v3 = Vector::new(2.5, 6.5, 5.5);
assert_eq!(v3, v1 + v2);
}
#[test]
fn sub() {
let v1 = Vector::new(1.0, 2.0, 3.0);
let v2 = Vector::new(1.5, 4.5, 2.5);
let v3 = Vector::new(-0.5, -2.5, 0.5);
assert_eq!(v3, v1 - v2);
}
#[test]
fn mul_scalar() {
let v1 = Vector::new(1.0, 2.0, 3.0);
let v2 = 2.0;
let v3 = Vector::new(2.0, 4.0, 6.0);
assert_eq!(v3, v1 * v2);
}
#[test]
fn mul_matrix_1() {
let v = Vector::new(1.0, 2.5, 4.0);
let m = Transform::new_from_values(
1.0, 2.0, 2.0, 1.5, 3.0, 6.0, 7.0, 8.0, 9.0, 2.0, 11.0, 12.0,
);
assert_eq!(v * m, Vector::new(14.0, 46.0, 58.0));
}
#[test]
fn mul_matrix_2() {
let v = Vector::new(1.0, 2.5, 4.0);
let m = Transform::new_from_values(
1.0, 2.0, 2.0, 1.5, 3.0, 6.0, 7.0, 8.0, 9.0, 2.0, 11.0, 12.0,
);
assert_eq!(v * m, Vector::new(14.0, 46.0, 58.0));
}
#[test]
fn div() {
let v1 = Vector::new(1.0, 2.0, 3.0);
let v2 = 2.0;
let v3 = Vector::new(0.5, 1.0, 1.5);
assert_eq!(v3, v1 / v2);
}
#[test]
fn length() {
let v = Vector::new(1.0, 2.0, 3.0);
assert!((v.length() - 3.7416573867739413).abs() < 0.000001);
}
#[test]
fn length2() {
let v = Vector::new(1.0, 2.0, 3.0);
assert_eq!(v.length2(), 14.0);
}
#[test]
fn normalized() {
let v1 = Vector::new(1.0, 2.0, 3.0);
let v2 = Vector::new(0.2672612419124244, 0.5345224838248488, 0.8017837257372732);
let v3 = v1.normalized();
assert!((v3.x() - v2.x()).abs() < 0.000001);
assert!((v3.y() - v2.y()).abs() < 0.000001);
assert!((v3.z() - v2.z()).abs() < 0.000001);
}
#[test]
fn dot_test() {
let v1 = Vector::new(1.0, 2.0, 3.0);
let v2 = Vector::new(1.5, 4.5, 2.5);
let v3 = 18.0f32;
assert_eq!(v3, v1.dot(v2));
}
#[test]
fn cross_test() {
let v1 = Vector::new(1.0, 0.0, 0.0);
let v2 = Vector::new(0.0, 1.0, 0.0);
let v3 = Vector::new(0.0, 0.0, 1.0);
assert_eq!(v3, v1.cross(v2));
}
}


@@ -1,15 +1,18 @@
[package]
name = "math3d"
name = "rmath"
version = "0.1.0"
authors = ["Nathan Vegdahl <cessen@cessen.com>"]
edition = "2018"
license = "MIT, Apache 2.0"
[lib]
name = "math3d"
name = "rmath"
path = "src/lib.rs"
# Local crate dependencies
[dependencies]
glam = "0.15"
approx = "0.4"
[dev-dependencies]
bencher = "0.1.5"
rand = "0.6"
[[bench]]
name = "bench"
harness = false


@@ -0,0 +1,202 @@
use bencher::{benchmark_group, benchmark_main, black_box, Bencher};
use rand::{rngs::SmallRng, FromEntropy, Rng};
use rmath::{CrossProduct, DotProduct, Normal, Point, Vector, Xform, XformFull};
//----
fn vector_cross_10000(bench: &mut Bencher) {
let mut rng = SmallRng::from_entropy();
bench.iter(|| {
let v1 = Vector::new(rng.gen::<f32>(), rng.gen::<f32>(), rng.gen::<f32>());
let v2 = Vector::new(rng.gen::<f32>(), rng.gen::<f32>(), rng.gen::<f32>());
for _ in 0..10000 {
black_box(black_box(v1).cross(black_box(v2)));
}
});
}
fn vector_dot_10000(bench: &mut Bencher) {
let mut rng = SmallRng::from_entropy();
bench.iter(|| {
let v1 = Vector::new(rng.gen::<f32>(), rng.gen::<f32>(), rng.gen::<f32>());
let v2 = Vector::new(rng.gen::<f32>(), rng.gen::<f32>(), rng.gen::<f32>());
for _ in 0..10000 {
black_box(black_box(v1).dot(black_box(v2)));
}
});
}
fn xform_vector_mul_10000(bench: &mut Bencher) {
let mut rng = SmallRng::from_entropy();
bench.iter(|| {
let v = Vector::new(rng.gen::<f32>(), rng.gen::<f32>(), rng.gen::<f32>());
let x = Xform::new(
rng.gen::<f32>(),
rng.gen::<f32>(),
rng.gen::<f32>(),
rng.gen::<f32>(),
rng.gen::<f32>(),
rng.gen::<f32>(),
rng.gen::<f32>(),
rng.gen::<f32>(),
rng.gen::<f32>(),
rng.gen::<f32>(),
rng.gen::<f32>(),
rng.gen::<f32>(),
);
for _ in 0..10000 {
black_box(black_box(v).xform(black_box(&x)));
}
});
}
fn xform_point_mul_10000(bench: &mut Bencher) {
let mut rng = SmallRng::from_entropy();
bench.iter(|| {
let p = Point::new(rng.gen::<f32>(), rng.gen::<f32>(), rng.gen::<f32>());
let x = Xform::new(
rng.gen::<f32>(),
rng.gen::<f32>(),
rng.gen::<f32>(),
rng.gen::<f32>(),
rng.gen::<f32>(),
rng.gen::<f32>(),
rng.gen::<f32>(),
rng.gen::<f32>(),
rng.gen::<f32>(),
rng.gen::<f32>(),
rng.gen::<f32>(),
rng.gen::<f32>(),
);
for _ in 0..10000 {
black_box(black_box(p).xform(black_box(&x)));
}
});
}
fn xform_point_mul_inv_10000(bench: &mut Bencher) {
let mut rng = SmallRng::from_entropy();
bench.iter(|| {
let p = Point::new(rng.gen::<f32>(), rng.gen::<f32>(), rng.gen::<f32>());
let x = Xform::new(
1.0,
rng.gen::<f32>(),
rng.gen::<f32>(),
rng.gen::<f32>(),
1.0,
rng.gen::<f32>(),
rng.gen::<f32>(),
rng.gen::<f32>(),
1.0,
rng.gen::<f32>(),
rng.gen::<f32>(),
rng.gen::<f32>(),
)
.to_full()
.unwrap();
for _ in 0..10000 {
black_box(black_box(p).xform_inv(black_box(&x)));
}
});
}
fn xform_normal_mul_10000(bench: &mut Bencher) {
let mut rng = SmallRng::from_entropy();
bench.iter(|| {
let n = Normal::new(rng.gen::<f32>(), rng.gen::<f32>(), rng.gen::<f32>());
let x = Xform::new(
1.0,
rng.gen::<f32>(),
rng.gen::<f32>(),
rng.gen::<f32>(),
1.0,
rng.gen::<f32>(),
rng.gen::<f32>(),
rng.gen::<f32>(),
1.0,
rng.gen::<f32>(),
rng.gen::<f32>(),
rng.gen::<f32>(),
)
.to_full()
.unwrap();
for _ in 0..10000 {
black_box(black_box(n).xform(black_box(&x)));
}
});
}
fn xform_xform_mul_10000(bench: &mut Bencher) {
let mut rng = SmallRng::from_entropy();
bench.iter(|| {
let x1 = Xform::new(
rng.gen::<f32>(),
rng.gen::<f32>(),
rng.gen::<f32>(),
rng.gen::<f32>(),
rng.gen::<f32>(),
rng.gen::<f32>(),
rng.gen::<f32>(),
rng.gen::<f32>(),
rng.gen::<f32>(),
rng.gen::<f32>(),
rng.gen::<f32>(),
rng.gen::<f32>(),
);
let x2 = Xform::new(
rng.gen::<f32>(),
rng.gen::<f32>(),
rng.gen::<f32>(),
rng.gen::<f32>(),
rng.gen::<f32>(),
rng.gen::<f32>(),
rng.gen::<f32>(),
rng.gen::<f32>(),
rng.gen::<f32>(),
rng.gen::<f32>(),
rng.gen::<f32>(),
rng.gen::<f32>(),
);
for _ in 0..10000 {
black_box(black_box(x1).compose(black_box(&x2)));
}
});
}
fn xform_to_xformfull_10000(bench: &mut Bencher) {
let mut rng = SmallRng::from_entropy();
bench.iter(|| {
let x = Xform::new(
rng.gen::<f32>(),
rng.gen::<f32>(),
rng.gen::<f32>(),
rng.gen::<f32>(),
rng.gen::<f32>(),
rng.gen::<f32>(),
rng.gen::<f32>(),
rng.gen::<f32>(),
rng.gen::<f32>(),
rng.gen::<f32>(),
rng.gen::<f32>(),
rng.gen::<f32>(),
);
for _ in 0..10000 {
black_box(black_box(x).to_full());
}
});
}
//----
benchmark_group!(
benches,
vector_cross_10000,
vector_dot_10000,
xform_vector_mul_10000,
xform_point_mul_10000,
xform_point_mul_inv_10000,
xform_normal_mul_10000,
xform_xform_mul_10000,
xform_to_xformfull_10000,
);
benchmark_main!(benches);


@@ -0,0 +1,268 @@
use rand::{rngs::SmallRng, FromEntropy, Rng};
use rmath::{utils::ulp_diff, wide4::Float4};
type D4 = [f64; 4];
fn main() {
let mut rng = SmallRng::from_entropy();
// Convenience functions for generating random Float4's.
let mut rf4 = || {
let mut rf = || {
let range = 268435456.0;
let n = rng.gen::<f64>();
((n * range * 2.0) - range) as f32
};
Float4::new(rf(), rf(), rf(), rf())
};
// Dot product test.
println!("Dot product:");
{
let mut max_ulp_diff = 0u32;
for _ in 0..10000000 {
let v1 = rf4();
let v2 = rf4();
let dpa = Float4::dot_3(v1, v2);
let dpb = dot_3(f4_to_d4(v1), f4_to_d4(v2));
let ud = ulp_diff(dpa, dpb as f32);
max_ulp_diff = max_ulp_diff.max(ud);
}
println!(" Max error (ulps):\n {:?}\n", max_ulp_diff);
}
// Cross product test.
println!("Cross product:");
{
let mut max_ulp_diff = [0u32; 4];
for _ in 0..10000000 {
let v1 = rf4();
let v2 = rf4();
let v3a = Float4::cross_3(v1, v2);
let v3b = cross_3(f4_to_d4(v1), f4_to_d4(v2));
let ud = ulp_diff_f4d4(v3a, v3b);
for i in 0..4 {
max_ulp_diff[i] = max_ulp_diff[i].max(ud[i]);
}
}
println!(" Max error (ulps):\n {:?}\n", max_ulp_diff);
}
// Matrix inversion test.
println!("Matrix inversion:");
{
let mut max_ulp_diff = [[0u32; 4]; 3];
let mut det_ulp_hist = [0u32; 9];
for _ in 0..2000000 {
let m = [rf4(), rf4(), rf4()];
let ima = Float4::invert_3x3_w_det(&m);
let imb = invert_3x3([f4_to_d4(m[0]), f4_to_d4(m[1]), f4_to_d4(m[2])]);
if let (Some((ima, deta)), Some((imb, detb))) = (ima, imb) {
let det_ulp_diff = ulp_diff(deta, detb as f32);
let mut hist_upper = 0;
for i in 0..det_ulp_hist.len() {
if det_ulp_diff <= hist_upper {
det_ulp_hist[i] += 1;
break;
}
if hist_upper == 0 {
hist_upper += 1;
} else {
hist_upper *= 10;
}
}
if det_ulp_diff == 0 {
for i in 0..3 {
let ud = ulp_diff_f4d4(ima[i], imb[i]);
for j in 0..4 {
max_ulp_diff[i][j] = max_ulp_diff[i][j].max(ud[j]);
}
}
}
}
}
println!(
" Max error when determinant has 0-ulp error (ulps):\n {:?}",
max_ulp_diff
);
let total: u32 = det_ulp_hist.iter().sum();
let mut ulp = 0;
let mut sum = 0;
println!(" Determinant error distribution:");
for h in det_ulp_hist.iter() {
sum += *h;
println!(
" {:.8}% <= {} ulps",
sum as f64 / total as f64 * 100.0,
ulp
);
if ulp == 0 {
ulp += 1;
} else {
ulp *= 10;
}
}
println!();
}
}
//-------------------------------------------------------------
fn f4_to_d4(v: Float4) -> D4 {
[v.a() as f64, v.b() as f64, v.c() as f64, v.d() as f64]
}
fn ulp_diff_f4d4(a: Float4, b: D4) -> [u32; 4] {
[
ulp_diff(a.a(), b[0] as f32),
ulp_diff(a.b(), b[1] as f32),
ulp_diff(a.c(), b[2] as f32),
ulp_diff(a.d(), b[3] as f32),
]
}
//-------------------------------------------------------------
fn dot_3(a: D4, b: D4) -> f64 {
// Products.
let (x, x_err) = two_prod(a[0], b[0]);
let (y, y_err) = two_prod(a[1], b[1]);
let (z, z_err) = two_prod(a[2], b[2]);
// Sums.
let (s1, s1_err) = two_sum(x, y);
let err1 = x_err + (y_err + s1_err);
let (s2, s2_err) = two_sum(s1, z);
let err2 = z_err + (err1 + s2_err);
// Final result with rounding error compensation.
s2 + err2
}
fn cross_3(a: D4, b: D4) -> D4 {
[
difference_of_products(a[1], b[2], a[2], b[1]),
difference_of_products(a[2], b[0], a[0], b[2]),
difference_of_products(a[0], b[1], a[1], b[0]),
difference_of_products(a[3], b[3], a[3], b[3]),
]
}
fn invert_3x3(m: [D4; 3]) -> Option<([D4; 3], f64)> {
let m0_bca = [m[0][1], m[0][2], m[0][0], m[0][3]];
let m1_bca = [m[1][1], m[1][2], m[1][0], m[1][3]];
let m2_bca = [m[2][1], m[2][2], m[2][0], m[2][3]];
let m0_cab = [m[0][2], m[0][0], m[0][1], m[0][3]];
let m1_cab = [m[1][2], m[1][0], m[1][1], m[1][3]];
let m2_cab = [m[2][2], m[2][0], m[2][1], m[2][3]];
let abc = [
difference_of_products(m1_bca[0], m2_cab[0], m1_cab[0], m2_bca[0]),
difference_of_products(m1_bca[1], m2_cab[1], m1_cab[1], m2_bca[1]),
difference_of_products(m1_bca[2], m2_cab[2], m1_cab[2], m2_bca[2]),
difference_of_products(m1_bca[3], m2_cab[3], m1_cab[3], m2_bca[3]),
];
let def = [
difference_of_products(m2_bca[0], m0_cab[0], m2_cab[0], m0_bca[0]),
difference_of_products(m2_bca[1], m0_cab[1], m2_cab[1], m0_bca[1]),
difference_of_products(m2_bca[2], m0_cab[2], m2_cab[2], m0_bca[2]),
difference_of_products(m2_bca[3], m0_cab[3], m2_cab[3], m0_bca[3]),
];
let ghi = [
difference_of_products(m0_bca[0], m1_cab[0], m0_cab[0], m1_bca[0]),
difference_of_products(m0_bca[1], m1_cab[1], m0_cab[1], m1_bca[1]),
difference_of_products(m0_bca[2], m1_cab[2], m0_cab[2], m1_bca[2]),
difference_of_products(m0_bca[3], m1_cab[3], m0_cab[3], m1_bca[3]),
];
let det = dot_3(
[abc[0], def[0], ghi[0], 0.0],
[m[0][0], m[1][0], m[2][0], 0.0],
);
if det == 0.0 {
None
} else {
Some((
[
[abc[0] / det, def[0] / det, ghi[0] / det, 0.0],
[abc[1] / det, def[1] / det, ghi[1] / det, 0.0],
[abc[2] / det, def[2] / det, ghi[2] / det, 0.0],
],
// [
// [abc[0], def[0], ghi[0], 0.0],
// [abc[1], def[1], ghi[1], 0.0],
// [abc[2], def[2], ghi[2], 0.0],
// ],
det,
))
}
}
fn rel_diff(a: f64, b: f64) -> f64 {
(a - b).abs() / a.abs().max(b.abs())
}
//-------------------------------------------------------------
/// `(a * b) - (c * d)` but done with high precision via floating point tricks.
///
/// See https://pharr.org/matt/blog/2019/11/03/difference-of-floats
#[inline(always)]
fn difference_of_products(a: f64, b: f64, c: f64, d: f64) -> f64 {
let cd = c * d;
let dop = a.mul_add(b, -cd);
let err = (-c).mul_add(d, cd);
dop + err
}
/// `(a * b) + (c * d)` but done with high precision via floating point tricks.
#[inline(always)]
fn sum_of_products(a: f64, b: f64, c: f64, d: f64) -> f64 {
let cd = c * d;
let sop = a.mul_add(b, cd);
let err = c.mul_add(d, -cd);
sop + err
}
/// `a * b` but also returns a rounding error for precise composition
/// with other operations.
#[inline(always)]
fn two_prod(a: f64, b: f64) -> (f64, f64)
// (product, rounding_err)
{
let ab = a * b;
(ab, a.mul_add(b, -ab))
}
/// `a + b` but also returns a rounding error for precise composition
/// with other operations.
#[inline(always)]
fn two_sum(a: f64, b: f64) -> (f64, f64)
// (sum, rounding_err)
{
let sum = a + b;
let delta = sum - a;
(sum, (a - (sum - delta)) + (b - delta))
}
#[inline(always)]
fn two_diff(a: f64, b: f64) -> (f64, f64)
// (diff, rounding_err)
{
let diff = a - b;
let delta = diff - a;
(diff, (a - (diff - delta)) - (b + delta))
}

sub_crates/rmath/src/lib.rs

@@ -0,0 +1,129 @@
//! RMath: a math library for building CPU-based renderers.
#![allow(dead_code)]
mod normal;
mod point;
mod sealed;
pub mod utils;
mod vector;
pub mod wide4;
mod xform;
use std::ops::{Add, Mul, Neg, Sub};
pub use self::{
normal::Normal, point::Point, vector::Vector, xform::AsXform, xform::Xform, xform::XformFull,
};
/// Trait for calculating dot products.
pub trait DotProduct {
fn dot(self, other: Self) -> f32;
fn dot_fast(self, other: Self) -> f32;
}
#[inline(always)]
pub fn dot<T: DotProduct>(a: T, b: T) -> f32 {
a.dot(b)
}
#[inline(always)]
pub fn dot_fast<T: DotProduct>(a: T, b: T) -> f32 {
a.dot_fast(b)
}
/// Trait for calculating cross products.
pub trait CrossProduct {
fn cross(self, other: Self) -> Self;
fn cross_fast(self, other: Self) -> Self;
}
#[inline(always)]
pub fn cross<T: CrossProduct>(a: T, b: T) -> T {
a.cross(b)
}
#[inline(always)]
pub fn cross_fast<T: CrossProduct>(a: T, b: T) -> T {
a.cross_fast(b)
}
//-------------------------------------------------------------
/// Trait representing types that can do fused multiply-add.
trait FMulAdd {
/// `(self * b) + c` with only one floating point rounding error.
fn fma(self, b: Self, c: Self) -> Self;
}
impl FMulAdd for f32 {
fn fma(self, b: Self, c: Self) -> Self {
self.mul_add(b, c)
}
}
/// `(a * b) - (c * d)` but done with high precision via floating point tricks.
///
/// See https://pharr.org/matt/blog/2019/11/03/difference-of-floats
#[inline(always)]
fn difference_of_products<T>(a: T, b: T, c: T, d: T) -> T
where
T: Copy + FMulAdd + Add<Output = T> + Mul<Output = T> + Neg<Output = T>,
{
let cd = c * d;
let dop = a.fma(b, -cd);
let err = (-c).fma(d, cd);
dop + err
}
/// `(a * b) + (c * d)` but done with high precision via floating point tricks.
#[inline(always)]
fn sum_of_products<T>(a: T, b: T, c: T, d: T) -> T
where
T: Copy + FMulAdd + Add<Output = T> + Mul<Output = T> + Neg<Output = T>,
{
let cd = c * d;
let sop = a.fma(b, cd);
let err = c.fma(d, -cd);
sop + err
}
/// `a * b` but also returns a rounding error for precise composition
/// with other operations.
#[inline(always)]
fn two_prod<T>(a: T, b: T) -> (T, T)
// (product, rounding_err)
where
T: Copy + FMulAdd + Mul<Output = T> + Neg<Output = T>,
{
let ab = a * b;
(ab, a.fma(b, -ab))
}
/// `a + b` but also returns a rounding error for precise composition
/// with other operations.
#[inline(always)]
fn two_sum<T>(a: T, b: T) -> (T, T)
// (sum, rounding_err)
where
T: Copy + Add<Output = T> + Sub<Output = T>,
{
let sum = a + b;
let delta = sum - a;
(sum, (a - (sum - delta)) + (b - delta))
}
/// `a - b` but also returns a rounding error for precise composition
/// with other operations.
#[inline(always)]
fn two_diff<T>(a: T, b: T) -> (T, T)
// (diff, rounding_err)
where
T: Copy + Add<Output = T> + Sub<Output = T>,
{
let diff = a - b;
let delta = diff - a;
(diff, (a - (diff - delta)) - (b + delta))
}
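//-------------------------------------------------------------
// A small illustrative check of the helpers above (a sketch; the constants
// are chosen purely for demonstration): the error-free transforms return
// the rounded result plus its exact rounding error, and
// `difference_of_products` survives cancellation that zeroes out the
// naive expression.
#[cfg(test)]
mod eft_examples {
use super::*;
#[test]
fn two_sum_recovers_rounding_error() {
// 1e-10 is far below one ulp of 1.0f32, so the plain sum rounds it
// away, but the error term hands it back exactly: s + e == a + b.
let (s, e) = two_sum(1.0f32, 1e-10f32);
assert_eq!(s, 1.0);
assert_eq!(e, 1e-10);
}
#[test]
fn two_prod_recovers_rounding_error() {
// (1 + 2^-15)^2 = 1 + 2^-14 + 2^-30 needs more than 23 mantissa bits,
// so the f32 product rounds; the FMA-based error term is exactly 2^-30.
let a = 1.0f32 + 1.0 / 32768.0;
let (p, e) = two_prod(a, a);
assert_eq!(p, 1.0 + 1.0 / 16384.0);
assert_eq!(e, 1.0 / 1073741824.0);
}
#[test]
fn difference_of_products_survives_cancellation() {
// Exactly, 10001 * 9999 - 10000 * 10000 = -1, but in f32 the first
// product rounds up to 1.0e8 and the naive difference collapses to 0.
let (a, b, c, d) = (10001.0f32, 9999.0f32, 10000.0f32, 10000.0f32);
assert_eq!(a * b - c * d, 0.0);
assert_eq!(difference_of_products(a, b, c, d), -1.0);
}
}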


@@ -0,0 +1,318 @@
#![allow(dead_code)]
use std::cmp::PartialEq;
use std::ops::{Add, Div, Mul, Neg, Sub};
use crate::wide4::Float4;
use crate::xform::{AsXform, XformFull};
use crate::Vector;
use crate::{CrossProduct, DotProduct};
/// A surface normal in 3D space.
#[derive(Debug, Copy, Clone)]
#[repr(transparent)]
pub struct Normal(pub Float4);
impl Normal {
#[inline(always)]
pub fn new(x: f32, y: f32, z: f32) -> Self {
Self(Float4::new(x, y, z, 0.0))
}
#[inline(always)]
pub fn length(self) -> f32 {
self.length2().sqrt()
}
#[inline(always)]
pub fn length2(self) -> f32 {
let sqr = self.0 * self.0;
sqr.a() + sqr.b() + sqr.c()
}
#[inline(always)]
#[must_use]
pub fn normalized(self) -> Self {
Self(self.0 / self.length())
}
#[inline(always)]
pub fn into_vector(self) -> Vector {
Vector(self.0)
}
#[inline(always)]
pub fn x(self) -> f32 {
self.0.a()
}
#[inline(always)]
pub fn y(self) -> f32 {
self.0.b()
}
#[inline(always)]
pub fn z(self) -> f32 {
self.0.c()
}
#[inline(always)]
pub fn get_n(self, i: usize) -> f32 {
match i {
0 => self.x(),
1 => self.y(),
2 => self.z(),
_ => panic!("Out of bounds index into 3D vector."),
}
}
#[inline(always)]
#[must_use]
pub fn set_x(self, x: f32) -> Self {
Self(self.0.set_a(x))
}
#[inline(always)]
#[must_use]
pub fn set_y(self, y: f32) -> Self {
Self(self.0.set_b(y))
}
#[inline(always)]
#[must_use]
pub fn set_z(self, z: f32) -> Self {
Self(self.0.set_c(z))
}
//-------------
// Transforms.
/// Forward-transform the normal.
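///
/// Normals transform with the inverse transpose of the 3x3 part (so they
/// stay perpendicular to surfaces under shear and non-uniform scale),
/// which is why this multiplies by the transpose of `xform.inv_m`.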
#[inline(always)]
pub fn xform(self, xform: &XformFull) -> Self {
Self(self.0.vec_mul_3x3(&Float4::transpose_3x3(&xform.inv_m)))
}
/// Inverse-transform the normal.
#[inline(always)]
pub fn xform_inv<T: AsXform>(self, xform: &T) -> Self {
Self(
self.0
.vec_mul_3x3(&Float4::transpose_3x3(&xform.as_xform().m)),
)
}
/// Faster but less precise version of `xform()`.
#[inline(always)]
pub fn xform_fast(self, xform: &XformFull) -> Self {
Self(
self.0
.vec_mul_3x3_fast(&Float4::transpose_3x3(&xform.inv_m)),
)
}
/// Faster but less precise version of `xform_inv()`.
#[inline(always)]
pub fn xform_inv_fast<T: AsXform>(self, xform: &T) -> Self {
Self(
self.0
.vec_mul_3x3_fast(&Float4::transpose_3x3(&xform.as_xform().m)),
)
}
}
impl Add for Normal {
type Output = Self;
#[inline(always)]
fn add(self, other: Self) -> Self {
Self(self.0 + other.0)
}
}
impl Sub for Normal {
type Output = Self;
#[inline(always)]
fn sub(self, other: Self) -> Self {
Self(self.0 - other.0)
}
}
impl Mul<f32> for Normal {
type Output = Self;
#[inline(always)]
fn mul(self, other: f32) -> Self {
Self(self.0 * other)
}
}
impl Div<f32> for Normal {
type Output = Self;
#[inline(always)]
fn div(self, other: f32) -> Self {
Self(self.0 / other)
}
}
impl Neg for Normal {
type Output = Self;
#[inline(always)]
fn neg(self) -> Self {
Self(-self.0)
}
}
impl PartialEq for Normal {
#[inline(always)]
fn eq(&self, rhs: &Self) -> bool {
self.0.a() == rhs.0.a() && self.0.b() == rhs.0.b() && self.0.c() == rhs.0.c()
}
}
impl DotProduct for Normal {
#[inline(always)]
fn dot(self, other: Self) -> f32 {
Float4::dot_3(self.0, other.0)
}
#[inline(always)]
fn dot_fast(self, other: Self) -> f32 {
Float4::dot_3_fast(self.0, other.0)
}
}
impl CrossProduct for Normal {
#[inline(always)]
fn cross(self, other: Self) -> Self {
Self(Float4::cross_3(self.0, other.0))
}
#[inline(always)]
fn cross_fast(self, other: Self) -> Self {
Self(Float4::cross_3_fast(self.0, other.0))
}
}
//-------------------------------------------------------------
#[cfg(test)]
mod tests {
use super::*;
use crate::{CrossProduct, DotProduct, Xform};
#[test]
fn add() {
let v1 = Normal::new(1.0, 2.0, 3.0);
let v2 = Normal::new(1.5, 4.5, 2.5);
let v3 = Normal::new(2.5, 6.5, 5.5);
assert_eq!(v3, v1 + v2);
}
#[test]
fn sub() {
let v1 = Normal::new(1.0, 2.0, 3.0);
let v2 = Normal::new(1.5, 4.5, 2.5);
let v3 = Normal::new(-0.5, -2.5, 0.5);
assert_eq!(v3, v1 - v2);
}
#[test]
fn mul_scalar() {
let v1 = Normal::new(1.0, 2.0, 3.0);
let v2 = 2.0;
let v3 = Normal::new(2.0, 4.0, 6.0);
assert_eq!(v3, v1 * v2);
}
#[test]
fn xform() {
let n = Normal::new(1.0, 2.5, 4.0);
let m = Xform::new(1.0, 3.0, 9.0, 2.0, 6.0, 2.0, 2.0, 7.0, 11.0, 1.5, 8.0, 12.0)
.to_full()
.unwrap();
assert_eq!(n.xform(&m), Normal::new(-4.0625, 1.78125, -0.03125));
assert_eq!(n.xform(&m).xform_inv(&m), n);
}
#[test]
fn xform_fast() {
let n = Normal::new(1.0, 2.5, 4.0);
let m = Xform::new(1.0, 3.0, 9.0, 2.0, 6.0, 2.0, 2.0, 7.0, 11.0, 1.5, 8.0, 12.0)
.to_full()
.unwrap();
assert_eq!(n.xform_fast(&m), Normal::new(-4.0625, 1.78125, -0.03125));
assert_eq!(n.xform_fast(&m).xform_inv_fast(&m), n);
}
#[test]
fn div() {
let v1 = Normal::new(1.0, 2.0, 3.0);
let v2 = 2.0;
let v3 = Normal::new(0.5, 1.0, 1.5);
assert_eq!(v3, v1 / v2);
}
#[test]
fn length() {
let n = Normal::new(1.0, 2.0, 3.0);
assert!((n.length() - 3.7416573867739413).abs() < 0.000001);
}
#[test]
fn length2() {
let n = Normal::new(1.0, 2.0, 3.0);
assert_eq!(n.length2(), 14.0);
}
#[test]
fn normalized() {
let n1 = Normal::new(1.0, 2.0, 3.0);
let n2 = Normal::new(0.2672612419124244, 0.5345224838248488, 0.8017837257372732);
let n3 = n1.normalized();
assert!((n3.x() - n2.x()).abs() < 0.000001);
assert!((n3.y() - n2.y()).abs() < 0.000001);
assert!((n3.z() - n2.z()).abs() < 0.000001);
}
#[test]
fn dot() {
let v1 = Normal::new(1.0, 2.0, 3.0);
let v2 = Normal::new(1.5, 4.5, 2.5);
assert_eq!(v1.dot(v2), 18.0);
}
#[test]
fn dot_fast() {
let v1 = Normal::new(1.0, 2.0, 3.0);
let v2 = Normal::new(1.5, 4.5, 2.5);
assert_eq!(v1.dot_fast(v2), 18.0);
}
#[test]
fn cross() {
let v1 = Normal::new(1.0, 0.0, 0.0);
let v2 = Normal::new(0.0, 1.0, 0.0);
assert_eq!(v1.cross(v2), Normal::new(0.0, 0.0, 1.0));
}
#[test]
fn cross_fast() {
let v1 = Normal::new(1.0, 0.0, 0.0);
let v2 = Normal::new(0.0, 1.0, 0.0);
assert_eq!(v1.cross_fast(v2), Normal::new(0.0, 0.0, 1.0));
}
}


@@ -0,0 +1,186 @@
#![allow(dead_code)]
use std::cmp::PartialEq;
use std::ops::{Add, Sub};
use crate::vector::Vector;
use crate::wide4::Float4;
use crate::xform::{AsXform, XformFull};
/// A position in 3D space.
#[derive(Debug, Copy, Clone)]
#[repr(transparent)]
pub struct Point(pub Float4);
impl Point {
#[inline(always)]
pub fn new(x: f32, y: f32, z: f32) -> Self {
Self(Float4::new(x, y, z, 0.0))
}
#[inline(always)]
pub fn min(self, other: Self) -> Self {
Self(self.0.min(other.0))
}
#[inline(always)]
pub fn max(self, other: Self) -> Self {
Self(self.0.max(other.0))
}
#[inline(always)]
pub fn into_vector(self) -> Vector {
Vector(self.0)
}
#[inline(always)]
pub fn x(self) -> f32 {
self.0.a()
}
#[inline(always)]
pub fn y(self) -> f32 {
self.0.b()
}
#[inline(always)]
pub fn z(self) -> f32 {
self.0.c()
}
#[inline(always)]
pub fn get_n(self, i: usize) -> f32 {
match i {
0 => self.x(),
1 => self.y(),
2 => self.z(),
_ => panic!("Out of bounds index into 3D vector."),
}
}
#[inline(always)]
#[must_use]
pub fn set_x(self, x: f32) -> Self {
Self(self.0.set_a(x))
}
#[inline(always)]
#[must_use]
pub fn set_y(self, y: f32) -> Self {
Self(self.0.set_b(y))
}
#[inline(always)]
#[must_use]
pub fn set_z(self, z: f32) -> Self {
Self(self.0.set_c(z))
}
//-------------
// Transforms.
/// Forward-transform the point.
#[inline(always)]
pub fn xform<T: AsXform>(self, xform: &T) -> Self {
let xform = xform.as_xform();
Self(self.0.vec_mul_affine(&xform.m, xform.t))
}
/// Inverse-transform the point.
#[inline(always)]
pub fn xform_inv(self, xform: &XformFull) -> Self {
Self(self.0.vec_mul_affine_rev(&xform.inv_m, xform.fwd.t))
}
/// Faster but less precise version of `xform()`.
#[inline(always)]
pub fn xform_fast<T: AsXform>(self, xform: &T) -> Self {
let xform = xform.as_xform();
Self(self.0.vec_mul_affine_fast(&xform.m, xform.t))
}
/// Faster but less precise version of `xform_inv()`.
#[inline(always)]
pub fn xform_inv_fast(self, xform: &XformFull) -> Self {
Self(self.0.vec_mul_affine_rev_fast(&xform.inv_m, xform.fwd.t))
}
}
impl Add<Vector> for Point {
type Output = Self;
#[inline(always)]
fn add(self, other: Vector) -> Self {
Self(self.0 + other.0)
}
}
impl Sub for Point {
type Output = Vector;
#[inline(always)]
fn sub(self, other: Self) -> Vector {
Vector(self.0 - other.0)
}
}
impl Sub<Vector> for Point {
type Output = Self;
#[inline(always)]
fn sub(self, other: Vector) -> Self {
Self(self.0 - other.0)
}
}
impl PartialEq for Point {
#[inline(always)]
fn eq(&self, rhs: &Self) -> bool {
self.0.a() == rhs.0.a() && self.0.b() == rhs.0.b() && self.0.c() == rhs.0.c()
}
}
//-------------------------------------------------------------
#[cfg(test)]
mod tests {
use super::*;
use crate::{Vector, Xform};
#[test]
fn add() {
let p1 = Point::new(1.0, 2.0, 3.0);
let v1 = Vector::new(1.5, 4.5, 2.5);
let p2 = Point::new(2.5, 6.5, 5.5);
assert_eq!(p2, p1 + v1);
}
#[test]
fn sub() {
let p1 = Point::new(1.0, 2.0, 3.0);
let p2 = Point::new(1.5, 4.5, 2.5);
let v1 = Vector::new(-0.5, -2.5, 0.5);
assert_eq!(v1, p1 - p2);
}
#[test]
fn xform() {
let p = Point::new(1.0, 2.5, 4.0);
let m = Xform::new(1.0, 3.0, 9.0, 2.0, 6.0, 2.0, 2.0, 7.0, 11.0, 1.5, 8.0, 12.0)
.to_full()
.unwrap();
assert_eq!(p.xform(&m), Point::new(15.5, 54.0, 70.0));
assert_eq!(p.xform(&m).xform_inv(&m), p);
}
#[test]
fn xform_fast() {
let p = Point::new(1.0, 2.5, 4.0);
let m = Xform::new(1.0, 3.0, 9.0, 2.0, 6.0, 2.0, 2.0, 7.0, 11.0, 1.5, 8.0, 12.0)
.to_full()
.unwrap();
assert_eq!(p.xform_fast(&m), Point::new(15.5, 54.0, 70.0));
assert_eq!(p.xform_fast(&m).xform_inv_fast(&m), p);
}
}


@@ -0,0 +1,5 @@
/// For sealing other traits.
///
/// Even though this trait is marked as public, the module containing it
/// isn't, so the trait cannot be named (and therefore cannot be
/// implemented) outside this crate.
pub trait Sealed {}
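// The usual sealing pattern this supports (a sketch with hypothetical names;
// the crate's real traits may be wired differently): a public trait takes
// `Sealed` as a supertrait, so downstream code can use it but not
// implement it.
#[cfg(test)]
#[allow(dead_code)]
mod sealing_example {
/// A hypothetical public trait that only this crate can implement.
pub trait Frobnicate: super::Sealed {}
struct Local;
impl super::Sealed for Local {}
impl Frobnicate for Local {}
}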


@@ -0,0 +1,196 @@
/// Compute how different two floats are in ulps.
///
/// Notes:
/// - Treats 0.0 and -0.0 as zero ulps apart, and extends the
/// implications of that to the rest of the numbers. E.g. the numbers
/// just before and after 0.0/-0.0 are only two ulps apart, not three.
/// - Infinity is one ulp past float max, and converse for -infinity.
/// - If either number is NaN, returns `u32::MAX`.
#[inline(always)]
pub fn ulp_diff(a: f32, b: f32) -> u32 {
const SIGN_BIT: u32 = 1 << 31;
const INFINITY: u32 = 0x7f800000;
let a = a.to_bits();
let b = b.to_bits();
let a_sign = a & SIGN_BIT;
let b_sign = b & SIGN_BIT;
let a_abs = a & !SIGN_BIT;
let b_abs = b & !SIGN_BIT;
if a_abs > INFINITY || b_abs > INFINITY {
// NaNs always return maximum ulps apart.
u32::MAX
} else if a_sign == b_sign {
a_abs.max(b_abs) - a_abs.min(b_abs)
} else {
a_abs + b_abs
}
}
/// Checks if two floats are approximately equal, within `max_ulps`.
#[inline(always)]
pub fn ulps_eq(a: f32, b: f32, max_ulps: u32) -> bool {
// The minimum ensures that NaNs never return true.
ulp_diff(a, b) <= max_ulps.min(u32::MAX - 1)
}
/// Increments to the next representable floating point number.
///
/// Notes:
/// - 0.0 and -0.0 are treated as the same value. E.g. starting from the
/// number just before -0.0, it only takes two increments to get to the
/// number just after 0.0.
/// - Infinity, NaN, and their negative counterparts are returned
/// unchanged.
/// - Incrementing `f32::MAX` results in infinity.
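///
/// This works because, for finite floats of the same sign, IEEE-754 bit
/// patterns are ordered the same way as the values they represent, so
/// stepping the bits by one steps the value by one ulp.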
#[inline(always)]
pub fn increment_ulp(v: f32) -> f32 {
if v.is_finite() {
if v > 0.0 {
f32::from_bits(v.to_bits() + 1)
} else if v < -0.0 {
f32::from_bits(v.to_bits() - 1)
} else {
f32::from_bits(1)
}
} else {
// Infinity or NaN.
v
}
}
/// Decrements to the previous representable floating point number.
///
/// Notes:
/// - 0.0 and -0.0 are treated as the same value. E.g. starting from the
/// number just after 0.0, it only takes two decrements to get to the
/// number just before -0.0.
/// - Infinity, NaN, and their negative counterparts are returned
/// unchanged.
/// - Decrementing `-f32::MAX` results in -infinity.
#[inline(always)]
pub fn decrement_ulp(v: f32) -> f32 {
if v.is_finite() {
if v > 0.0 {
f32::from_bits(v.to_bits() - 1)
} else if v < -0.0 {
f32::from_bits(v.to_bits() + 1)
} else {
f32::from_bits(0x80000001)
}
} else {
// Infinity or NaN.
v
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn ulp_diff_test() {
assert_eq!(ulp_diff(1.0, 1.0), 0);
assert_eq!(ulp_diff(0.0, 0.0), 0);
assert_eq!(ulp_diff(0.0, -0.0), 0);
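// There are 1 << 23 representable f32 values in [1.0, 2.0), one per
// mantissa pattern, so 1.0 and 2.0 are exactly 1 << 23 ulps apart.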
assert_eq!(ulp_diff(1.0, 2.0), 1 << 23);
assert_eq!(ulp_diff(2.0, 4.0), 1 << 23);
assert_eq!(ulp_diff(-1.0, -2.0), 1 << 23);
assert_eq!(ulp_diff(-2.0, -4.0), 1 << 23);
assert_eq!(ulp_diff(-1.0, 1.0), 0x7f000000);
assert_eq!(ulp_diff(0.0, 1.0), 0x3f800000);
assert_eq!(ulp_diff(-0.0, 1.0), 0x3f800000);
assert_eq!(ulp_diff(0.0, -1.0), 0x3f800000);
assert_eq!(ulp_diff(-0.0, -1.0), 0x3f800000);
assert_eq!(ulp_diff(f32::INFINITY, -f32::INFINITY), 0xff000000);
assert_eq!(ulp_diff(f32::NAN, f32::NAN), 0xffffffff);
assert_eq!(ulp_diff(f32::NAN, 1.0), 0xffffffff);
assert_eq!(ulp_diff(1.0, f32::NAN), 0xffffffff);
assert_eq!(ulp_diff(-f32::NAN, 1.0), 0xffffffff);
assert_eq!(ulp_diff(1.0, -f32::NAN), 0xffffffff);
assert_eq!(ulp_diff(0.0, f32::from_bits(0.0f32.to_bits() + 1)), 1);
assert_eq!(ulp_diff(-0.0, f32::from_bits(0.0f32.to_bits() + 1)), 1);
}
#[test]
fn ulps_eq_test() {
assert!(ulps_eq(1.0, 1.0, 0));
assert!(ulps_eq(1.0, 1.0, 1));
assert!(ulps_eq(0.0, 0.0, 0));
assert!(ulps_eq(0.0, -0.0, 0));
assert!(ulps_eq(1.0, 2.0, 1 << 23));
assert!(!ulps_eq(1.0, 2.0, (1 << 23) - 1));
assert!(ulps_eq(0.0, f32::from_bits(0.0f32.to_bits() + 1), 1));
assert!(!ulps_eq(0.0, f32::from_bits(0.0f32.to_bits() + 1), 0));
assert!(ulps_eq(-0.0, f32::from_bits(0.0f32.to_bits() + 1), 1));
assert!(!ulps_eq(-0.0, f32::from_bits(0.0f32.to_bits() + 1), 0));
assert!(ulps_eq(std::f32::INFINITY, -std::f32::INFINITY, 0xff000000));
assert!(!ulps_eq(
std::f32::INFINITY,
-std::f32::INFINITY,
0xff000000 - 1
));
assert!(!ulps_eq(std::f32::NAN, std::f32::NAN, 0));
assert!(!ulps_eq(-std::f32::NAN, -std::f32::NAN, 0));
assert!(!ulps_eq(std::f32::NAN, std::f32::NAN, u32::MAX));
assert!(!ulps_eq(std::f32::NAN, std::f32::INFINITY, 1 << 31));
assert!(!ulps_eq(std::f32::INFINITY, std::f32::NAN, 1 << 31));
}
#[test]
fn inc_ulp() {
assert!(increment_ulp(1.0) > 1.0);
assert!(increment_ulp(-1.0) > -1.0);
assert!(increment_ulp(0.0) > 0.0);
assert!(increment_ulp(0.0) > -0.0);
assert!(increment_ulp(-0.0) > 0.0);
assert!(increment_ulp(-0.0) > -0.0);
assert!(increment_ulp(f32::MAX) == f32::INFINITY);
assert!(increment_ulp(f32::INFINITY) == f32::INFINITY);
assert!(increment_ulp(-f32::INFINITY) == -f32::INFINITY);
assert!(increment_ulp(f32::NAN).is_nan());
assert!(increment_ulp(-f32::NAN).is_nan());
}
#[test]
fn dec_ulp() {
assert!(decrement_ulp(1.0) < 1.0);
assert!(decrement_ulp(-1.0) < -1.0);
assert!(decrement_ulp(0.0) < 0.0);
assert!(decrement_ulp(0.0) < -0.0);
assert!(decrement_ulp(-0.0) < 0.0);
assert!(decrement_ulp(-0.0) < -0.0);
assert!(decrement_ulp(f32::MIN) == -f32::INFINITY);
assert!(decrement_ulp(f32::INFINITY) == f32::INFINITY);
assert!(decrement_ulp(-f32::INFINITY) == -f32::INFINITY);
assert!(decrement_ulp(f32::NAN).is_nan());
assert!(decrement_ulp(-f32::NAN).is_nan());
}
#[test]
fn inc_dec_ulp() {
assert_eq!(decrement_ulp(increment_ulp(0.0)), 0.0);
assert_eq!(decrement_ulp(increment_ulp(-0.0)), 0.0);
assert_eq!(decrement_ulp(increment_ulp(1.0)), 1.0);
assert_eq!(decrement_ulp(increment_ulp(-1.0)), -1.0);
assert_eq!(decrement_ulp(increment_ulp(1.2)), 1.2);
assert_eq!(decrement_ulp(increment_ulp(-1.2)), -1.2);
}
#[test]
fn dec_inc_ulp() {
assert_eq!(increment_ulp(decrement_ulp(0.0)), 0.0);
assert_eq!(increment_ulp(decrement_ulp(-0.0)), 0.0);
assert_eq!(increment_ulp(decrement_ulp(1.0)), 1.0);
assert_eq!(increment_ulp(decrement_ulp(-1.0)), -1.0);
assert_eq!(increment_ulp(decrement_ulp(1.2)), 1.2);
assert_eq!(increment_ulp(decrement_ulp(-1.2)), -1.2);
}
}


@@ -0,0 +1,325 @@
#![allow(dead_code)]
use std::cmp::PartialEq;
use std::ops::{Add, Div, Mul, Neg, Sub};
use crate::normal::Normal;
use crate::point::Point;
use crate::wide4::Float4;
use crate::xform::{AsXform, XformFull};
use crate::{CrossProduct, DotProduct};
/// A direction vector in 3D space.
#[derive(Debug, Copy, Clone)]
#[repr(transparent)]
pub struct Vector(pub Float4);
impl Vector {
#[inline(always)]
pub fn new(x: f32, y: f32, z: f32) -> Self {
Self(Float4::new(x, y, z, 0.0))
}
#[inline(always)]
pub fn length(self) -> f32 {
self.length2().sqrt()
}
#[inline(always)]
pub fn length2(self) -> f32 {
let sqr = self.0 * self.0;
sqr.a() + sqr.b() + sqr.c()
}
#[inline(always)]
#[must_use]
pub fn normalized(self) -> Self {
Self(self.0 / self.length())
}
#[inline(always)]
pub fn abs(self) -> Self {
Self(self.0.abs())
}
#[inline(always)]
pub fn recip(self) -> Self {
Self(self.0.recip())
}
#[inline(always)]
pub fn into_point(self) -> Point {
Point(self.0)
}
#[inline(always)]
pub fn into_normal(self) -> Normal {
Normal(self.0)
}
#[inline(always)]
pub fn x(self) -> f32 {
self.0.a()
}
#[inline(always)]
pub fn y(self) -> f32 {
self.0.b()
}
#[inline(always)]
pub fn z(self) -> f32 {
self.0.c()
}
#[inline(always)]
pub fn get_n(self, i: usize) -> f32 {
match i {
0 => self.x(),
1 => self.y(),
2 => self.z(),
_ => panic!("Out of bounds index into 3D vector."),
}
}
#[inline(always)]
#[must_use]
pub fn set_x(self, x: f32) -> Self {
Self(self.0.set_a(x))
}
#[inline(always)]
#[must_use]
pub fn set_y(self, y: f32) -> Self {
Self(self.0.set_b(y))
}
#[inline(always)]
#[must_use]
pub fn set_z(self, z: f32) -> Self {
Self(self.0.set_c(z))
}
//-------------
// Transforms.
/// Forward-transform the vector.
#[inline(always)]
pub fn xform<T: AsXform>(self, xform: &T) -> Self {
Self(self.0.vec_mul_3x3(&xform.as_xform().m))
}
/// Inverse-transform the vector.
#[inline(always)]
pub fn xform_inv(self, xform: &XformFull) -> Self {
Self(self.0.vec_mul_3x3(&xform.inv_m))
}
/// Faster but less precise version of `xform()`.
#[inline(always)]
pub fn xform_fast<T: AsXform>(self, xform: &T) -> Self {
Self(self.0.vec_mul_3x3_fast(&xform.as_xform().m))
}
/// Faster but less precise version of `xform_inv()`.
#[inline(always)]
pub fn xform_inv_fast(self, xform: &XformFull) -> Self {
Self(self.0.vec_mul_3x3_fast(&xform.inv_m))
}
}
impl Add for Vector {
type Output = Self;
#[inline(always)]
fn add(self, other: Self) -> Self {
Self(self.0 + other.0)
}
}
impl Sub for Vector {
type Output = Self;
#[inline(always)]
fn sub(self, other: Self) -> Self {
Self(self.0 - other.0)
}
}
impl Mul<f32> for Vector {
type Output = Self;
#[inline(always)]
fn mul(self, other: f32) -> Self {
Self(self.0 * other)
}
}
impl Div<f32> for Vector {
type Output = Self;
#[inline(always)]
fn div(self, other: f32) -> Self {
Self(self.0 / other)
}
}
impl Neg for Vector {
type Output = Self;
#[inline(always)]
fn neg(self) -> Self {
Self(-self.0)
}
}
impl PartialEq for Vector {
#[inline(always)]
fn eq(&self, rhs: &Self) -> bool {
self.0.a() == rhs.0.a() && self.0.b() == rhs.0.b() && self.0.c() == rhs.0.c()
}
}
impl DotProduct for Vector {
#[inline(always)]
fn dot(self, other: Self) -> f32 {
Float4::dot_3(self.0, other.0)
}
#[inline(always)]
fn dot_fast(self, other: Self) -> f32 {
Float4::dot_3_fast(self.0, other.0)
}
}
impl CrossProduct for Vector {
#[inline(always)]
fn cross(self, other: Self) -> Self {
Self(Float4::cross_3(self.0, other.0))
}
#[inline(always)]
fn cross_fast(self, other: Self) -> Self {
Self(Float4::cross_3_fast(self.0, other.0))
}
}
//-------------------------------------------------------------
#[cfg(test)]
mod tests {
use super::*;
use crate::{CrossProduct, DotProduct, Xform};
#[test]
fn add() {
let v1 = Vector::new(1.0, 2.0, 3.0);
let v2 = Vector::new(1.5, 4.5, 2.5);
let v3 = Vector::new(2.5, 6.5, 5.5);
assert_eq!(v3, v1 + v2);
}
#[test]
fn sub() {
let v1 = Vector::new(1.0, 2.0, 3.0);
let v2 = Vector::new(1.5, 4.5, 2.5);
let v3 = Vector::new(-0.5, -2.5, 0.5);
assert_eq!(v3, v1 - v2);
}
#[test]
fn mul_scalar() {
let v1 = Vector::new(1.0, 2.0, 3.0);
let v2 = 2.0;
let v3 = Vector::new(2.0, 4.0, 6.0);
assert_eq!(v3, v1 * v2);
}
#[test]
fn xform() {
let v = Vector::new(1.0, 2.5, 4.0);
let m = Xform::new(1.0, 3.0, 9.0, 2.0, 6.0, 2.0, 2.0, 7.0, 11.0, 1.5, 8.0, 12.0)
.to_full()
.unwrap();
assert_eq!(v.xform(&m), Vector::new(14.0, 46.0, 58.0));
assert_eq!(v.xform(&m).xform_inv(&m), v);
}
#[test]
fn xform_fast() {
let v = Vector::new(1.0, 2.5, 4.0);
let m = Xform::new(1.0, 3.0, 9.0, 2.0, 6.0, 2.0, 2.0, 7.0, 11.0, 1.5, 8.0, 12.0)
.to_full()
.unwrap();
assert_eq!(v.xform_fast(&m), Vector::new(14.0, 46.0, 58.0));
assert_eq!(v.xform_fast(&m).xform_inv_fast(&m), v);
}
#[test]
fn div() {
let v1 = Vector::new(1.0, 2.0, 3.0);
let v2 = 2.0;
let v3 = Vector::new(0.5, 1.0, 1.5);
assert_eq!(v3, v1 / v2);
}
#[test]
fn length() {
let v = Vector::new(1.0, 2.0, 3.0);
assert!((v.length() - 3.7416573867739413).abs() < 0.000001);
}
#[test]
fn length2() {
let v = Vector::new(1.0, 2.0, 3.0);
assert_eq!(v.length2(), 14.0);
}
#[test]
fn normalized() {
let v1 = Vector::new(1.0, 2.0, 3.0);
let v2 = Vector::new(0.2672612419124244, 0.5345224838248488, 0.8017837257372732);
let v3 = v1.normalized();
assert!((v3.x() - v2.x()).abs() < 0.000001);
assert!((v3.y() - v2.y()).abs() < 0.000001);
assert!((v3.z() - v2.z()).abs() < 0.000001);
}
#[test]
fn dot() {
let v1 = Vector::new(1.0, 2.0, 3.0);
let v2 = Vector::new(1.5, 4.5, 2.5);
assert_eq!(v1.dot(v2), 18.0);
}
#[test]
fn dot_fast() {
let v1 = Vector::new(1.0, 2.0, 3.0);
let v2 = Vector::new(1.5, 4.5, 2.5);
assert_eq!(v1.dot_fast(v2), 18.0);
}
#[test]
fn cross() {
let v1 = Vector::new(1.0, 0.0, 0.0);
let v2 = Vector::new(0.0, 1.0, 0.0);
assert_eq!(v1.cross(v2), Vector::new(0.0, 0.0, 1.0));
}
#[test]
fn cross_fast() {
let v1 = Vector::new(1.0, 0.0, 0.0);
let v2 = Vector::new(0.0, 1.0, 0.0);
assert_eq!(v1.cross_fast(v2), Vector::new(0.0, 0.0, 1.0));
}
}


@@ -0,0 +1,462 @@
use std::ops::{Add, BitAnd, BitOr, BitXor, Div, Index, Mul, Neg, Not, Sub};
use crate::FMulAdd;
//=============================================================
// Float4
#[derive(Debug, Copy, Clone)]
#[repr(C, align(16))]
pub struct Float4([f32; 4]);
impl Float4 {
/// Create a new `Float4` with the given components.
#[inline(always)]
pub fn new(a: f32, b: f32, c: f32, d: f32) -> Self {
Self([a, b, c, d])
}
/// Create a new `Float4` with all elements set to `n`.
#[inline(always)]
pub fn splat(n: f32) -> Self {
Self([n, n, n, n])
}
/// Component-wise fused multiply-add.
///
/// `(self * a) + b` with only one rounding error.
#[inline(always)]
pub fn mul_add(self, a: Self, b: Self) -> Self {
Self([
self.0[0].mul_add(a.0[0], b.0[0]),
self.0[1].mul_add(a.0[1], b.0[1]),
self.0[2].mul_add(a.0[2], b.0[2]),
self.0[3].mul_add(a.0[3], b.0[3]),
])
}
/// Vertical minimum.
#[inline(always)]
pub fn min(self, a: Self) -> Self {
// Custom min to match behavior of SSE.
#[inline(always)]
pub fn minf(a: f32, b: f32) -> f32 {
if a < b {
a
} else {
b
}
}
Self([
minf(self.0[0], a.0[0]),
minf(self.0[1], a.0[1]),
minf(self.0[2], a.0[2]),
minf(self.0[3], a.0[3]),
])
}
/// Vertical maximum.
#[inline(always)]
pub fn max(self, a: Self) -> Self {
// Custom max to match behavior of SSE.
#[inline(always)]
pub fn maxf(a: f32, b: f32) -> f32 {
if a > b {
a
} else {
b
}
}
Self([
maxf(self.0[0], a.0[0]),
maxf(self.0[1], a.0[1]),
maxf(self.0[2], a.0[2]),
maxf(self.0[3], a.0[3]),
])
}
/// Horizontal minimum.
#[inline(always)]
pub fn min_element(self) -> f32 {
let a = self.0[0].min(self.0[1]);
let b = self.0[2].min(self.0[3]);
a.min(b)
}
/// Horizontal maximum.
#[inline(always)]
pub fn max_element(self) -> f32 {
let a = self.0[0].max(self.0[1]);
let b = self.0[2].max(self.0[3]);
a.max(b)
}
/// 1.0 / self
#[inline(always)]
pub fn recip(self) -> Self {
Float4::splat(1.0) / self
}
#[inline(always)]
pub fn abs(self) -> Self {
Float4::new(
self.a().abs(),
self.b().abs(),
self.c().abs(),
self.d().abs(),
)
}
//-----------------------------------------------------
// Comparisons.
/// Less than.
#[inline(always)]
pub fn cmplt(self, rhs: Self) -> Bool4 {
Bool4([
self.0[0] < rhs.0[0],
self.0[1] < rhs.0[1],
self.0[2] < rhs.0[2],
self.0[3] < rhs.0[3],
])
}
/// Less than or equal.
#[inline(always)]
pub fn cmplte(self, rhs: Self) -> Bool4 {
Bool4([
self.0[0] <= rhs.0[0],
self.0[1] <= rhs.0[1],
self.0[2] <= rhs.0[2],
self.0[3] <= rhs.0[3],
])
}
/// Greater than.
#[inline(always)]
pub fn cmpgt(self, rhs: Self) -> Bool4 {
Bool4([
self.0[0] > rhs.0[0],
self.0[1] > rhs.0[1],
self.0[2] > rhs.0[2],
self.0[3] > rhs.0[3],
])
}
/// Greater than or equal.
#[inline(always)]
pub fn cmpgte(self, rhs: Self) -> Bool4 {
Bool4([
self.0[0] >= rhs.0[0],
self.0[1] >= rhs.0[1],
self.0[2] >= rhs.0[2],
self.0[3] >= rhs.0[3],
])
}
/// Equal.
#[inline(always)]
pub fn cmpeq(self, rhs: Self) -> Bool4 {
Bool4([
self.0[0] == rhs.0[0],
self.0[1] == rhs.0[1],
self.0[2] == rhs.0[2],
self.0[3] == rhs.0[3],
])
}
//-----------------------------------------------------
// Individual components.
#[inline(always)]
pub fn a(self) -> f32 {
self.0[0]
}
#[inline(always)]
pub fn b(self) -> f32 {
self.0[1]
}
#[inline(always)]
pub fn c(self) -> f32 {
self.0[2]
}
#[inline(always)]
pub fn d(self) -> f32 {
self.0[3]
}
#[inline(always)]
#[must_use]
pub fn set_a(self, n: f32) -> Self {
Self([n, self.0[1], self.0[2], self.0[3]])
}
#[inline(always)]
#[must_use]
pub fn set_b(self, n: f32) -> Self {
Self([self.0[0], n, self.0[2], self.0[3]])
}
#[inline(always)]
#[must_use]
pub fn set_c(self, n: f32) -> Self {
Self([self.0[0], self.0[1], n, self.0[3]])
}
#[inline(always)]
#[must_use]
pub fn set_d(self, n: f32) -> Self {
Self([self.0[0], self.0[1], self.0[2], n])
}
//-----------------------------------------------------
// Shuffles.
#[inline(always)]
pub fn aaaa(self) -> Self {
let a = self.0[0];
Self([a, a, a, a])
}
#[inline(always)]
pub fn bbbb(self) -> Self {
let b = self.0[1];
Self([b, b, b, b])
}
#[inline(always)]
pub fn cccc(self) -> Self {
let c = self.0[2];
Self([c, c, c, c])
}
#[inline(always)]
pub fn dddd(self) -> Self {
let d = self.0[3];
Self([d, d, d, d])
}
#[inline(always)]
pub fn bcad(self) -> Self {
let a = self.0[0];
let b = self.0[1];
let c = self.0[2];
let d = self.0[3];
Self([b, c, a, d])
}
#[inline(always)]
pub fn cabd(self) -> Self {
let a = self.0[0];
let b = self.0[1];
let c = self.0[2];
let d = self.0[3];
Self([c, a, b, d])
}
}
impl Index<usize> for Float4 {
type Output = f32;
#[inline(always)]
fn index(&self, idx: usize) -> &f32 {
&self.0[idx]
}
}
impl Add for Float4 {
type Output = Self;
#[inline(always)]
fn add(self, rhs: Self) -> Self {
Self([
self.0[0] + rhs.0[0],
self.0[1] + rhs.0[1],
self.0[2] + rhs.0[2],
self.0[3] + rhs.0[3],
])
}
}
impl Sub for Float4 {
type Output = Self;
#[inline(always)]
fn sub(self, rhs: Self) -> Self {
Self([
self.0[0] - rhs.0[0],
self.0[1] - rhs.0[1],
self.0[2] - rhs.0[2],
self.0[3] - rhs.0[3],
])
}
}
impl Mul for Float4 {
type Output = Self;
#[inline(always)]
fn mul(self, rhs: Self) -> Self {
Self([
self.0[0] * rhs.0[0],
self.0[1] * rhs.0[1],
self.0[2] * rhs.0[2],
self.0[3] * rhs.0[3],
])
}
}
impl Mul<f32> for Float4 {
type Output = Self;
#[inline(always)]
fn mul(self, rhs: f32) -> Self {
Self([
self.0[0] * rhs,
self.0[1] * rhs,
self.0[2] * rhs,
self.0[3] * rhs,
])
}
}
impl Div for Float4 {
type Output = Self;
#[inline(always)]
fn div(self, rhs: Self) -> Self {
Self([
self.0[0] / rhs.0[0],
self.0[1] / rhs.0[1],
self.0[2] / rhs.0[2],
self.0[3] / rhs.0[3],
])
}
}
impl Div<f32> for Float4 {
type Output = Self;
#[inline(always)]
fn div(self, rhs: f32) -> Self {
Self([
self.0[0] / rhs,
self.0[1] / rhs,
self.0[2] / rhs,
self.0[3] / rhs,
])
}
}
impl Neg for Float4 {
type Output = Self;
#[inline(always)]
fn neg(self) -> Self {
Self([-self.0[0], -self.0[1], -self.0[2], -self.0[3]])
}
}
impl FMulAdd for Float4 {
#[inline(always)]
fn fma(self, b: Self, c: Self) -> Self {
self.mul_add(b, c)
}
}
//=============================================================
// Bool4
#[derive(Copy, Clone)]
#[repr(transparent)]
pub struct Bool4([bool; 4]);
impl Bool4 {
#[inline(always)]
pub fn new(a: bool, b: bool, c: bool, d: bool) -> Self {
Bool4([a, b, c, d])
}
#[inline(always)]
pub fn new_false() -> Self {
Self([false, false, false, false])
}
#[inline(always)]
pub fn to_bools(self) -> [bool; 4] {
self.0
}
/// Note: `a` goes to the least significant bit.
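/// E.g. `Bool4::new(true, false, false, true).bitmask() == 0b1001`.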
#[inline(always)]
pub fn bitmask(self) -> u8 {
self.0[0] as u8
| ((self.0[1] as u8) << 1)
| ((self.0[2] as u8) << 2)
| ((self.0[3] as u8) << 3)
}
#[inline(always)]
pub fn any(self) -> bool {
self.0[0] | self.0[1] | self.0[2] | self.0[3]
}
#[inline(always)]
pub fn all(self) -> bool {
self.0[0] & self.0[1] & self.0[2] & self.0[3]
}
}
impl BitAnd for Bool4 {
type Output = Self;
#[inline(always)]
fn bitand(self, rhs: Self) -> Self {
Self([
self.0[0] & rhs.0[0],
self.0[1] & rhs.0[1],
self.0[2] & rhs.0[2],
self.0[3] & rhs.0[3],
])
}
}
impl BitOr for Bool4 {
type Output = Self;
#[inline(always)]
fn bitor(self, rhs: Self) -> Self {
Self([
self.0[0] | rhs.0[0],
self.0[1] | rhs.0[1],
self.0[2] | rhs.0[2],
self.0[3] | rhs.0[3],
])
}
}
impl BitXor for Bool4 {
type Output = Self;
#[inline(always)]
fn bitxor(self, rhs: Self) -> Self {
Self([
self.0[0] ^ rhs.0[0],
self.0[1] ^ rhs.0[1],
self.0[2] ^ rhs.0[2],
self.0[3] ^ rhs.0[3],
])
}
}
impl Not for Bool4 {
type Output = Self;
#[inline(always)]
fn not(self) -> Self {
Self([!self.0[0], !self.0[1], !self.0[2], !self.0[3]])
}
}


@@ -0,0 +1,812 @@
use std::{
cmp::{Eq, PartialEq},
ops::{AddAssign, BitAndAssign, BitOrAssign, BitXorAssign, DivAssign, MulAssign, SubAssign},
};
use crate::utils::ulps_eq;
use crate::{difference_of_products, two_diff, two_prod, two_sum};
//-------------------------------------------------------------
// Which implementation to use.
mod fallback;
#[cfg(not(any(target_arch = "x86_64")))]
pub use fallback::{Bool4, Float4};
#[cfg(target_arch = "x86_64")]
mod sse;
#[cfg(target_arch = "x86_64")]
pub use sse::{Bool4, Float4};
//-------------------------------------------------------------
impl Float4 {
/// 3D dot product (only uses the first 3 components).
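///
/// Uses error-free transforms (`two_prod`/`two_sum`) so rounding error
/// from the intermediate products and sums is compensated rather than
/// accumulated.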
#[inline(always)]
pub fn dot_3(a: Self, b: Self) -> f32 {
let (p, p_err) = two_prod(a, b);
// Products.
let (x, x_err) = (p.a(), p_err.a());
let (y, y_err) = (p.b(), p_err.b());
let (z, z_err) = (p.c(), p_err.c());
// Sums.
let (s1, s1_err) = two_sum(x, y);
let err1 = x_err + (y_err + s1_err);
let (s2, s2_err) = two_sum(s1, z);
let err2 = z_err + (err1 + s2_err);
// Final result with rounding error compensation.
s2 + err2
}
/// Faster but less precise version of `dot_3()`.
#[inline(always)]
pub fn dot_3_fast(a: Self, b: Self) -> f32 {
let c = a * b;
c.a() + c.b() + c.c()
}
/// 3D cross product (only uses the first 3 components).
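///
/// Uses the shuffle identity `a × b = a.yzx * b.zxy - a.zxy * b.yzx`
/// (spelled here with the `bcad`/`cabd` swizzles), with the subtraction
/// done via `difference_of_products` to limit cancellation error.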
#[inline(always)]
pub fn cross_3(a: Self, b: Self) -> Self {
difference_of_products(a.bcad(), b.cabd(), a.cabd(), b.bcad())
}
/// Faster but less precise version of `cross_3()`.
#[inline(always)]
pub fn cross_3_fast(a: Self, b: Self) -> Self {
(a.bcad() * b.cabd()) - (a.cabd() * b.bcad())
}
#[inline(always)]
pub fn transpose_3x3(m: &[Self; 3]) -> [Self; 3] {
[
// The fourth component in each row below is arbitrary,
// but in this case chosen so that it matches the
// behavior of the SSE version of transpose_3x3.
Self::new(m[0].a(), m[1].a(), m[2].a(), m[2].d()),
Self::new(m[0].b(), m[1].b(), m[2].b(), m[2].d()),
Self::new(m[0].c(), m[1].c(), m[2].c(), m[2].d()),
]
}
/// Invert a 3x3 matrix.
///
/// Returns `None` if not invertible.
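///
/// Computed in the adjugate (cross-product) form: `abc`, `def`, and `ghi`
/// below are cross products of pairs of rows of `m`, the determinant is a
/// cofactor expansion along the first column, and the inverse is the
/// transposed cofactor rows divided by the determinant.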
#[inline]
pub fn invert_3x3(m: &[Self; 3]) -> Option<[Self; 3]> {
let m0_bca = m[0].bcad();
let m1_bca = m[1].bcad();
let m2_bca = m[2].bcad();
let m0_cab = m[0].cabd();
let m1_cab = m[1].cabd();
let m2_cab = m[2].cabd();
let abc = difference_of_products(m1_bca, m2_cab, m1_cab, m2_bca);
let def = difference_of_products(m2_bca, m0_cab, m2_cab, m0_bca);
let ghi = difference_of_products(m0_bca, m1_cab, m0_cab, m1_bca);
let det = Self::dot_3(
Self::new(abc.a(), def.a(), ghi.a(), 0.0),
Self::new(m[0].a(), m[1].a(), m[2].a(), 0.0),
);
if det == 0.0 {
None
} else {
Some(Self::transpose_3x3(&[abc / det, def / det, ghi / det]))
}
}
/// Invert a 3x3 matrix, and also return the computed determinant.
///
/// Returns `None` if not invertible.
#[inline]
pub fn invert_3x3_w_det(m: &[Self; 3]) -> Option<([Self; 3], f32)> {
let m0_bca = m[0].bcad();
let m1_bca = m[1].bcad();
let m2_bca = m[2].bcad();
let m0_cab = m[0].cabd();
let m1_cab = m[1].cabd();
let m2_cab = m[2].cabd();
let abc = difference_of_products(m1_bca, m2_cab, m1_cab, m2_bca);
let def = difference_of_products(m2_bca, m0_cab, m2_cab, m0_bca);
let ghi = difference_of_products(m0_bca, m1_cab, m0_cab, m1_bca);
let det = Self::dot_3(
Self::new(abc.a(), def.a(), ghi.a(), 0.0),
Self::new(m[0].a(), m[1].a(), m[2].a(), 0.0),
);
if det == 0.0 {
None
} else {
Some((Self::transpose_3x3(&[abc / det, def / det, ghi / det]), det))
}
}
/// Faster but less precise version of `invert_3x3()`.
#[inline]
pub fn invert_3x3_fast(m: &[Self; 3]) -> Option<[Self; 3]> {
let m0_bca = m[0].bcad();
let m1_bca = m[1].bcad();
let m2_bca = m[2].bcad();
let m0_cab = m[0].cabd();
let m1_cab = m[1].cabd();
let m2_cab = m[2].cabd();
let abc = (m1_bca * m2_cab) - (m1_cab * m2_bca);
let def = (m2_bca * m0_cab) - (m2_cab * m0_bca);
let ghi = (m0_bca * m1_cab) - (m0_cab * m1_bca);
let det = Self::dot_3_fast(
Self::new(abc.a(), def.a(), ghi.a(), 0.0),
Self::new(m[0].a(), m[1].a(), m[2].a(), 0.0),
);
if det == 0.0 {
None
} else {
Some(Self::transpose_3x3(&[abc / det, def / det, ghi / det]))
}
}
/// Multiplies a 3D vector with a 3x3 matrix.
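///
/// Treats `self` as a row vector, so the result is
/// `x * m[0] + y * m[1] + z * m[2]`, accumulated with error-free
/// transforms to compensate intermediate rounding.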
#[inline]
pub fn vec_mul_3x3(self, m: &[Self; 3]) -> Self {
let x = self.aaaa();
let y = self.bbbb();
let z = self.cccc();
// Products.
let (a, a_err) = two_prod(x, m[0]);
let (b, b_err) = two_prod(y, m[1]);
let (c, c_err) = two_prod(z, m[2]);
// Sums.
let (s1, s1_err) = two_sum(a, b);
let err1 = a_err + (b_err + s1_err);
let (s2, s2_err) = two_sum(c, s1);
let err2 = c_err + (err1 + s2_err);
s2 + err2
}
/// Faster but less precise version of `vec_mul_3x3()`.
#[inline]
pub fn vec_mul_3x3_fast(self, m: &[Self; 3]) -> Self {
let x = self.aaaa();
let y = self.bbbb();
let z = self.cccc();
(x * m[0]) + (y * m[1]) + (z * m[2])
}
/// Transforms a 3d point by an affine transform.
///
/// `m` is the 3x3 part of the affine transform, `t` is the translation part.
#[inline]
pub fn vec_mul_affine(self, m: &[Self; 3], t: Self) -> Self {
let x = self.aaaa();
let y = self.bbbb();
let z = self.cccc();
// Products.
let (a, a_err) = two_prod(x, m[0]);
let (b, b_err) = two_prod(y, m[1]);
let (c, c_err) = two_prod(z, m[2]);
// Sums.
let (s1, s1_err) = two_sum(a, b);
let err1 = a_err + (b_err + s1_err);
let (s2, s2_err) = two_sum(c, s1);
let err2 = c_err + (err1 + s2_err);
let (s3, s3_err) = two_sum(t, s2);
let err3 = err2 + s3_err;
s3 + err3
}
/// Faster but less precise version of `vec_mul_affine()`.
#[inline]
pub fn vec_mul_affine_fast(self, m: &[Self; 3], t: Self) -> Self {
let x = self.aaaa();
let y = self.bbbb();
let z = self.cccc();
(x * m[0]) + (y * m[1]) + (z * m[2]) + t
}
/// Transforms a 3d point by an affine transform, except it does
/// `(vec - t) * inv_m` instead of `vec * m + t`.
///
/// This is useful for performing efficient inverse transforms while
/// only having to invert the 3x3 part of the transform itself.
///
/// `inv_m` is the inverse 3x3 part of the affine transform, `t` is
/// the forward translation part.
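///
/// A sketch of the intended pattern (assuming `m` is an invertible 3x3
/// matrix and `t` the matching translation):
///
/// ```ignore
/// // Forward: p_world = p * m + t.  Inverse: p = (p_world - t) * inv_m.
/// let inv_m = Float4::invert_3x3(&m).unwrap();
/// let p_world = p.vec_mul_affine(&m, t);
/// let p_back = p_world.vec_mul_affine_rev(&inv_m, t);
/// // `p_back` matches `p` up to floating point rounding.
/// ```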
#[inline]
pub fn vec_mul_affine_rev(self, inv_m: &[Self; 3], t: Self) -> Self {
let (v, v_err) = two_diff(self, t);
let (x, x_err) = (v.aaaa(), v_err.aaaa());
let (y, y_err) = (v.bbbb(), v_err.bbbb());
let (z, z_err) = (v.cccc(), v_err.cccc());
// Products.
let ((a, a_err1), a_err2) = (two_prod(x, inv_m[0]), x_err * inv_m[0]);
let ((b, b_err1), b_err2) = (two_prod(y, inv_m[1]), y_err * inv_m[1]);
let ((c, c_err1), c_err2) = (two_prod(z, inv_m[2]), z_err * inv_m[2]);
let a_err = a_err1 + a_err2;
let b_err = b_err1 + b_err2;
let c_err = c_err1 + c_err2;
// Sums.
let (s1, s1_err) = two_sum(a, b);
let err1 = a_err + (b_err + s1_err);
let (s2, s2_err) = two_sum(c, s1);
let err2 = c_err + (err1 + s2_err);
s2 + err2
}
/// Faster but less precise version of `vec_mul_affine_rev()`.
#[inline]
pub fn vec_mul_affine_rev_fast(self, inv_m: &[Self; 3], t: Self) -> Self {
let v = self - t;
let x = v.aaaa();
let y = v.bbbb();
let z = v.cccc();
(x * inv_m[0]) + (y * inv_m[1]) + (z * inv_m[2])
}
/// Returns whether the `Float4`s are approximately equal to each
/// other.
///
/// Each pair of corresponding elements must be within `max_ulps` ulps
/// of each other.
pub(crate) fn aprx_eq(a: Self, b: Self, max_ulps: u32) -> bool {
let mut eq = true;
eq &= ulps_eq(a.a(), b.a(), max_ulps);
eq &= ulps_eq(a.b(), b.b(), max_ulps);
eq &= ulps_eq(a.c(), b.c(), max_ulps);
eq &= ulps_eq(a.d(), b.d(), max_ulps);
eq
}
/// Composes one affine transform with another.
///
/// The result is an affine transform equivalent to applying the first
/// followed by the second.
///
/// `m#` is the 3x3 part of the affine transform, `t#` is the translation part.
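///
/// For example, composing `(m1, t1)` with `(m2, t2)` and applying the
/// result to a point is equivalent (up to rounding) to applying the two
/// transforms in sequence:
///
/// ```ignore
/// let (m, t) = Float4::affine_mul_affine(&m1, t1, &m2, t2);
/// let a = p.vec_mul_affine(&m, t);
/// let b = p.vec_mul_affine(&m1, t1).vec_mul_affine(&m2, t2);
/// // `a` and `b` agree up to floating point rounding.
/// ```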
#[inline]
pub fn affine_mul_affine(
m1: &[Self; 3],
t1: Self,
m2: &[Self; 3],
t2: Self,
) -> ([Self; 3], Self) {
(
[
m1[0].vec_mul_3x3(m2),
m1[1].vec_mul_3x3(m2),
m1[2].vec_mul_3x3(m2),
],
t1.vec_mul_affine(m2, t2),
)
}
/// Faster but less precise version of `affine_mul_affine()`.
#[inline]
pub fn affine_mul_affine_fast(
m1: &[Self; 3],
t1: Self,
m2: &[Self; 3],
t2: Self,
) -> ([Self; 3], Self) {
(
[
m1[0].vec_mul_3x3_fast(m2),
m1[1].vec_mul_3x3_fast(m2),
m1[2].vec_mul_3x3_fast(m2),
],
t1.vec_mul_affine_fast(m2, t2),
)
}
}
impl From<Float4> for (f32, f32, f32, f32) {
fn from(v: Float4) -> (f32, f32, f32, f32) {
(v.a(), v.b(), v.c(), v.d())
}
}
impl AddAssign for Float4 {
#[inline(always)]
fn add_assign(&mut self, rhs: Self) {
*self = *self + rhs;
}
}
impl SubAssign for Float4 {
#[inline(always)]
fn sub_assign(&mut self, rhs: Self) {
*self = *self - rhs;
}
}
impl MulAssign for Float4 {
#[inline(always)]
fn mul_assign(&mut self, rhs: Self) {
*self = *self * rhs;
}
}
impl MulAssign<f32> for Float4 {
#[inline(always)]
fn mul_assign(&mut self, rhs: f32) {
*self = *self * rhs;
}
}
impl DivAssign for Float4 {
#[inline(always)]
fn div_assign(&mut self, rhs: Self) {
*self = *self / rhs;
}
}
impl DivAssign<f32> for Float4 {
#[inline(always)]
fn div_assign(&mut self, rhs: f32) {
*self = *self / rhs;
}
}
impl PartialEq for Float4 {
#[inline(always)]
fn eq(&self, rhs: &Self) -> bool {
self.cmpeq(*rhs).bitmask() == 0b1111
}
}
//--------
impl BitAndAssign for Bool4 {
#[inline(always)]
fn bitand_assign(&mut self, rhs: Self) {
*self = *self & rhs;
}
}
impl BitOrAssign for Bool4 {
#[inline(always)]
fn bitor_assign(&mut self, rhs: Self) {
*self = *self | rhs;
}
}
impl BitXorAssign for Bool4 {
#[inline(always)]
fn bitxor_assign(&mut self, rhs: Self) {
*self = *self ^ rhs;
}
}
impl PartialEq for Bool4 {
#[inline(always)]
fn eq(&self, rhs: &Self) -> bool {
self.bitmask() == rhs.bitmask()
}
}
impl Eq for Bool4 {}
impl std::fmt::Debug for Bool4 {
fn fmt(&self, f: &mut std::fmt::Formatter) -> Result<(), std::fmt::Error> {
f.write_str("Bool4(")?;
f.debug_list().entries(self.to_bools().iter()).finish()?;
f.write_str(")")?;
Ok(())
}
}
//-------------------------------------------------------------
#[cfg(test)]
mod tests {
use super::*;
//------------
// Float4
#[test]
fn approximate_equality_test() {
let a = Float4::new(1.0, 2.0, 3.0, 4.0);
let b = Float4::new(1.00001, 2.00002, 3.00003, 4.00004);
let c = Float4::new(1.0e-43, 2.0e-43, 3.0e-43, 4.0e-43);
let d = Float4::new(-1.0e-43, -2.0e-43, -3.0e-43, -4.0e-43);
assert!(Float4::aprx_eq(a, a, 0));
assert!(Float4::aprx_eq(a, b, 130));
assert!(!Float4::aprx_eq(a, b, 120));
assert!(Float4::aprx_eq(c, d, 575));
assert!(!Float4::aprx_eq(c, d, 565));
}
#[test]
fn index() {
let v = Float4::new(0.0, 1.0, 2.0, 3.0);
assert_eq!(v[0], 0.0);
assert_eq!(v[1], 1.0);
assert_eq!(v[2], 2.0);
assert_eq!(v[3], 3.0);
}
#[test]
fn get() {
let v = Float4::new(0.0, 1.0, 2.0, 3.0);
assert_eq!(v.a(), 0.0);
assert_eq!(v.b(), 1.0);
assert_eq!(v.c(), 2.0);
assert_eq!(v.d(), 3.0);
}
#[test]
fn set() {
let v = Float4::new(0.0, 1.0, 2.0, 3.0);
assert_eq!(v.set_a(9.0), Float4::new(9.0, 1.0, 2.0, 3.0));
assert_eq!(v.set_b(9.0), Float4::new(0.0, 9.0, 2.0, 3.0));
assert_eq!(v.set_c(9.0), Float4::new(0.0, 1.0, 9.0, 3.0));
assert_eq!(v.set_d(9.0), Float4::new(0.0, 1.0, 2.0, 9.0));
}
#[test]
fn shuffle() {
let v = Float4::new(0.0, 1.0, 2.0, 3.0);
assert_eq!(v.aaaa(), Float4::splat(0.0));
assert_eq!(v.bbbb(), Float4::splat(1.0));
assert_eq!(v.cccc(), Float4::splat(2.0));
assert_eq!(v.dddd(), Float4::splat(3.0));
assert_eq!(v.bcad(), Float4::new(1.0, 2.0, 0.0, 3.0));
assert_eq!(v.cabd(), Float4::new(2.0, 0.0, 1.0, 3.0));
}
#[test]
fn abs() {
let v1 = Float4::new(-1.0, 2.0, -3.0, 4.0);
let v2 = Float4::new(1.0, -2.0, 3.0, -4.0);
let r = Float4::new(1.0, 2.0, 3.0, 4.0);
assert_eq!(v1.abs(), r);
assert_eq!(v2.abs(), r);
}
#[test]
fn neg() {
let v1 = Float4::new(-1.0, 2.0, -3.0, 4.0);
let v2 = Float4::new(1.0, -2.0, 3.0, -4.0);
assert_eq!(-v1, v2);
assert_eq!(-v2, v1);
}
#[test]
fn cmp_ops() {
let a = Float4::new(1.0, 2.0, -2.0, 0.0);
let b = Float4::new(1.0, -2.0, 2.0, -0.0);
assert_eq!(a.cmplt(b), Bool4::new(false, false, true, false));
assert_eq!(a.cmplte(b), Bool4::new(true, false, true, true));
assert_eq!(a.cmpgt(b), Bool4::new(false, true, false, false));
assert_eq!(a.cmpgte(b), Bool4::new(true, true, false, true));
assert_eq!(a.cmpeq(b), Bool4::new(true, false, false, true));
}
#[test]
fn min_max() {
let a = Float4::new(1.0, 2.0, -2.0, 4.0);
let b = Float4::new(1.0, -2.0, 2.0, 5.0);
assert_eq!(a.min(b), Float4::new(1.0, -2.0, -2.0, 4.0));
assert_eq!(a.max(b), Float4::new(1.0, 2.0, 2.0, 5.0));
let c = Float4::new(std::f32::INFINITY, 2.0, std::f32::NAN, 4.0);
let d = Float4::new(1.0, -std::f32::INFINITY, 2.0, std::f32::NAN);
let r_min = c.min(d);
let r_max = c.max(d);
assert_eq!(r_min.a(), 1.0);
assert_eq!(r_min.b(), -std::f32::INFINITY);
assert_eq!(r_min.c(), 2.0);
assert!(r_min.d().is_nan());
assert_eq!(r_max.a(), std::f32::INFINITY);
assert_eq!(r_max.b(), 2.0);
assert_eq!(r_max.c(), 2.0);
assert!(r_max.d().is_nan());
}
#[test]
fn dot_3() {
let v1 = Float4::new(1.0, 2.0, -3.0, 0.0);
let v2 = Float4::new(4.0, -5.0, 6.0, 0.0);
assert_eq!(Float4::dot_3(v1, v2), -24.0);
assert_eq!(Float4::dot_3_fast(v1, v2), -24.0);
}
#[test]
fn cross_3() {
let v1 = Float4::new(1.0, 2.0, -3.0, 0.0);
let v2 = Float4::new(4.0, -5.0, 6.0, 0.0);
let r = Float4::new(-3.0, -18.0, -13.0, 0.0);
assert_eq!(Float4::cross_3(v1, v2), r);
assert_eq!(Float4::cross_3(v2, v1), -r);
assert_eq!(Float4::cross_3_fast(v1, v2), r);
assert_eq!(Float4::cross_3_fast(v2, v1), -r);
}
#[test]
fn transpose_3x3() {
let m1 = [
Float4::new(1.0, 4.0, 7.0, 0.0),
Float4::new(2.0, 5.0, 8.0, 0.0),
Float4::new(3.0, 6.0, 9.0, 0.0),
];
let m2 = [
Float4::new(1.0, 2.0, 3.0, 0.0),
Float4::new(4.0, 5.0, 6.0, 0.0),
Float4::new(7.0, 8.0, 9.0, 0.0),
];
assert_eq!(Float4::transpose_3x3(&m1), m2);
assert_eq!(Float4::transpose_3x3(&m2), m1);
}
#[test]
fn invert_3x3() {
let m = [
Float4::new(1.0, 3.0, 9.0, 0.0),
Float4::new(2.0, 6.0, 2.0, 0.0),
Float4::new(2.0, 7.0, 11.0, 0.0),
];
let inv_m = [
Float4::new(3.25, 1.875, -3.0, 0.0),
Float4::new(-1.125, -0.4375, 1.0, 0.0),
Float4::new(0.125, -0.0625, 0.0, 0.0),
];
assert_eq!(Float4::invert_3x3(&m).unwrap(), inv_m);
assert_eq!(Float4::invert_3x3(&inv_m).unwrap(), m);
assert_eq!(Float4::invert_3x3_fast(&m).unwrap(), inv_m);
assert_eq!(Float4::invert_3x3_fast(&inv_m).unwrap(), m);
}
#[test]
fn vec_mul_3x3() {
let v = Float4::new(1.0, 2.5, 4.0, 0.0);
let m = [
Float4::new(1.0, 3.0, 9.0, 0.0),
Float4::new(2.0, 6.0, 2.0, 0.0),
Float4::new(2.0, 7.0, 11.0, 0.0),
];
let r = Float4::new(14.0, 46.0, 58.0, 0.0);
assert_eq!(v.vec_mul_3x3(&m), r);
assert_eq!(v.vec_mul_3x3_fast(&m), r);
}
#[test]
fn vec_mul_affine() {
let p = Float4::new(1.0, 2.5, 4.0, 0.0);
let xform = (
[
Float4::new(1.0, 3.0, 9.0, 0.0),
Float4::new(2.0, 6.0, 2.0, 0.0),
Float4::new(2.0, 7.0, 11.0, 0.0),
],
Float4::new(1.5, 8.0, 12.0, 0.0),
);
let r = Float4::new(15.5, 54.0, 70.0, 0.0);
assert_eq!(p.vec_mul_affine(&xform.0, xform.1), r);
}
#[test]
fn vec_mul_affine_rev() {
let p = Float4::new(15.5, 54.0, 70.0, 0.0);
let inv_m = [
Float4::new(3.25, 1.875, -3.0, 0.0),
Float4::new(-1.125, -0.4375, 1.0, 0.0),
Float4::new(0.125, -0.0625, 0.0, 0.0),
];
let t = Float4::new(1.5, 8.0, 12.0, 0.0);
let r = Float4::new(1.0, 2.5, 4.0, 0.0);
assert_eq!(p.vec_mul_affine_rev(&inv_m, t), r);
assert_eq!(p.vec_mul_affine_rev_fast(&inv_m, t), r);
}
#[test]
fn affine_mul_affine() {
let a = (
[
Float4::new(1.0, 3.0, 9.0, 0.0),
Float4::new(2.0, 6.0, 2.0, 0.0),
Float4::new(2.0, 7.0, 11.0, 0.0),
],
Float4::new(1.5, 8.0, 12.0, 0.0),
);
let b = (
[
Float4::new(1.0, 2.0, 3.0, 0.0),
Float4::new(5.0, 6.0, 7.0, 0.0),
Float4::new(9.0, 10.0, 11.0, 0.0),
],
Float4::new(13.0, 14.0, 15.0, 0.0),
);
let r = (
[
Float4::new(97.0, 110.0, 123.0, 0.0),
Float4::new(50.0, 60.0, 70.0, 0.0),
Float4::new(136.0, 156.0, 176.0, 0.0),
],
Float4::new(162.5, 185.0, 207.5, 0.0),
);
assert_eq!(Float4::affine_mul_affine(&a.0, a.1, &b.0, b.1), r);
assert_eq!(Float4::affine_mul_affine_fast(&a.0, a.1, &b.0, b.1), r);
}
//------------
// Bool4
#[test]
fn bitmask() {
assert_eq!(Bool4::new(true, false, false, false).bitmask(), 0b0001);
assert_eq!(Bool4::new(false, true, false, false).bitmask(), 0b0010);
assert_eq!(Bool4::new(false, false, true, false).bitmask(), 0b0100);
assert_eq!(Bool4::new(false, false, false, true).bitmask(), 0b1000);
assert_eq!(Bool4::new(false, true, false, true).bitmask(), 0b1010);
assert_eq!(Bool4::new(true, false, true, false).bitmask(), 0b0101);
}
#[test]
fn to_bools() {
assert_eq!(
Bool4::new(true, false, false, false).to_bools(),
[true, false, false, false]
);
assert_eq!(
Bool4::new(false, true, false, false).to_bools(),
[false, true, false, false]
);
assert_eq!(
Bool4::new(false, false, true, false).to_bools(),
[false, false, true, false]
);
assert_eq!(
Bool4::new(false, false, false, true).to_bools(),
[false, false, false, true]
);
assert_eq!(
Bool4::new(false, true, false, true).to_bools(),
[false, true, false, true]
);
assert_eq!(
Bool4::new(true, false, true, false).to_bools(),
[true, false, true, false]
);
}
#[test]
fn any() {
assert_eq!(Bool4::new(true, false, false, false).any(), true);
assert_eq!(Bool4::new(false, true, false, false).any(), true);
assert_eq!(Bool4::new(false, false, true, false).any(), true);
assert_eq!(Bool4::new(false, false, false, true).any(), true);
assert_eq!(Bool4::new(false, false, false, false).any(), false);
}
#[test]
fn all() {
assert_eq!(Bool4::new(false, true, true, true).all(), false);
assert_eq!(Bool4::new(true, false, true, true).all(), false);
assert_eq!(Bool4::new(true, true, false, true).all(), false);
assert_eq!(Bool4::new(true, true, true, false).all(), false);
assert_eq!(Bool4::new(true, true, true, true).all(), true);
}
#[test]
fn boolean_ops() {
let all = Bool4::new(true, true, true, true);
let none = Bool4::new(false, false, false, false);
let a = Bool4::new(true, false, true, false);
let b = Bool4::new(false, true, false, true);
// Not.
assert_eq!(!a, b);
assert_eq!(!b, a);
assert_eq!(!all, none);
assert_eq!(!none, all);
// And.
assert_eq!(a & b, none);
assert_eq!(all & none, none);
assert_eq!(all & all, all);
assert_eq!(none & none, none);
// Or.
assert_eq!(a | b, all);
assert_eq!(all | none, all);
assert_eq!(all | all, all);
assert_eq!(none | none, none);
// Xor.
assert_eq!(a ^ b, all);
assert_eq!(all ^ none, all);
assert_eq!(all ^ all, none);
assert_eq!(none ^ none, none);
}
#[test]
fn matches_fallback() {
fn tf1(n: Float4) -> [f32; 4] {
[n.a(), n.b(), n.c(), n.d()]
}
fn tf2(n: fallback::Float4) -> [f32; 4] {
[n.a(), n.b(), n.c(), n.d()]
}
let a1 = Float4::new(1.53245, 5.4234523, -424.432, 0.0004231);
let b1 = Float4::new(74.63, -9.65436, 3.0, -1003.3);
let c1 = Float4::new(-0.4216, -132.52, 8.9452, 42.0);
let a2 = fallback::Float4::new(1.53245, 5.4234523, -424.432, 0.0004231);
let b2 = fallback::Float4::new(74.63, -9.65436, 3.0, -1003.3);
let c2 = fallback::Float4::new(-0.4216, -132.52, 8.9452, 42.0);
assert_eq!(tf1(a1), tf2(a2));
assert_eq!(tf1(b1), tf2(b2));
assert_eq!(tf1(c1), tf2(c2));
assert_eq!(tf1(a1 + b1), tf2(a2 + b2));
assert_eq!(tf1(a1 - b1), tf2(a2 - b2));
assert_eq!(tf1(a1 * b1), tf2(a2 * b2));
assert_eq!(tf1(a1 / b1), tf2(a2 / b2));
assert_eq!(tf1(a1.mul_add(b1, c1)), tf2(a2.mul_add(b2, c2)));
assert_eq!(tf1(a1.min(b1)), tf2(a2.min(b2)));
assert_eq!(tf1(a1.max(b1)), tf2(a2.max(b2)));
assert_eq!(a1.min_element(), a2.min_element());
assert_eq!(a1.max_element(), a2.max_element());
assert_eq!(tf1(a1.recip()), tf2(a2.recip()));
assert_eq!(tf1(a1.abs()), tf2(a2.abs()));
}
}

@ -0,0 +1,391 @@
use std::ops::{Add, BitAnd, BitOr, BitXor, Div, Index, Mul, Neg, Not, Sub};
use std::arch::x86_64::{
__m128, _mm_add_ps, _mm_and_ps, _mm_castsi128_ps, _mm_cmpeq_ps, _mm_cmpge_ps, _mm_cmpgt_ps,
_mm_cmple_ps, _mm_cmplt_ps, _mm_div_ps, _mm_fmadd_ps, _mm_max_ps, _mm_min_ps, _mm_movemask_ps,
_mm_mul_ps, _mm_or_ps, _mm_set1_epi32, _mm_set1_ps, _mm_set_epi32, _mm_set_ps, _mm_setzero_ps,
_mm_shuffle_ps, _mm_storeu_ps, _mm_sub_ps, _mm_xor_ps,
};
use crate::FMulAdd;
//=============================================================
// Float4
#[derive(Debug, Copy, Clone)]
#[repr(transparent)]
pub struct Float4(__m128);
impl Float4 {
/// Create a new `Float4` with the given components.
#[inline(always)]
pub fn new(a: f32, b: f32, c: f32, d: f32) -> Self {
Self(unsafe { _mm_set_ps(d, c, b, a) })
}
/// Create a new `Float4` with all elements set to `n`.
#[inline(always)]
pub fn splat(n: f32) -> Self {
Self(unsafe { _mm_set1_ps(n) })
}
/// Component-wise fused multiply-add.
///
/// `(self * a) + b` with only one rounding error.
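///
/// For example (illustrative):
///
/// ```ignore
/// let r = Float4::splat(2.0).mul_add(Float4::splat(3.0), Float4::splat(1.0));
/// assert_eq!(r, Float4::splat(7.0)); // (2 * 3) + 1 in each lane
/// ```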
#[inline(always)]
pub fn mul_add(self, a: Self, b: Self) -> Self {
if is_x86_feature_detected!("fma") {
Self(unsafe { _mm_fmadd_ps(self.0, a.0, b.0) })
} else {
Self::new(
self.a().mul_add(a.a(), b.a()),
self.b().mul_add(a.b(), b.b()),
self.c().mul_add(a.c(), b.c()),
self.d().mul_add(a.d(), b.d()),
)
}
}
/// Vertical minimum.
#[inline(always)]
pub fn min(self, rhs: Self) -> Self {
Self(unsafe { _mm_min_ps(self.0, rhs.0) })
}
/// Vertical maximum.
#[inline(always)]
pub fn max(self, rhs: Self) -> Self {
Self(unsafe { _mm_max_ps(self.0, rhs.0) })
}
/// Horizontal minimum.
#[inline(always)]
pub fn min_element(self) -> f32 {
let a = self.a().min(self.b());
let b = self.c().min(self.d());
a.min(b)
}
/// Horizontal maximum.
#[inline(always)]
pub fn max_element(self) -> f32 {
let a = self.a().max(self.b());
let b = self.c().max(self.d());
a.max(b)
}
/// 1.0 / self
#[inline(always)]
pub fn recip(self) -> Self {
// The reciprocal intrinsic is not precise enough.
// Self(unsafe { std::arch::x86_64::_mm_rcp_ps(self.0) })
Self::splat(1.0) / self
}
#[inline(always)]
pub fn abs(self) -> Self {
Self(unsafe {
let abs_mask = _mm_castsi128_ps(_mm_set1_epi32(!(1 << 31)));
_mm_and_ps(self.0, abs_mask)
})
}
//-----------------------------------------------------
// Comparisons.
/// Less than.
#[inline(always)]
pub fn cmplt(self, rhs: Self) -> Bool4 {
Bool4(unsafe { _mm_cmplt_ps(self.0, rhs.0) })
}
/// Less than or equal.
#[inline(always)]
pub fn cmplte(self, rhs: Self) -> Bool4 {
Bool4(unsafe { _mm_cmple_ps(self.0, rhs.0) })
}
/// Greater than.
#[inline(always)]
pub fn cmpgt(self, rhs: Self) -> Bool4 {
Bool4(unsafe { _mm_cmpgt_ps(self.0, rhs.0) })
}
/// Greater than or equal.
#[inline(always)]
pub fn cmpgte(self, rhs: Self) -> Bool4 {
Bool4(unsafe { _mm_cmpge_ps(self.0, rhs.0) })
}
/// Equal.
#[inline(always)]
pub fn cmpeq(self, rhs: Self) -> Bool4 {
Bool4(unsafe { _mm_cmpeq_ps(self.0, rhs.0) })
}
//-----------------------------------------------------
// Individual components.
#[inline(always)]
pub fn a(self) -> f32 {
self[0]
}
#[inline(always)]
pub fn b(self) -> f32 {
self[1]
}
#[inline(always)]
pub fn c(self) -> f32 {
self[2]
}
#[inline(always)]
pub fn d(self) -> f32 {
self[3]
}
#[inline(always)]
#[must_use]
pub fn set_a(self, n: f32) -> Self {
Self::new(n, self.b(), self.c(), self.d())
}
#[inline(always)]
#[must_use]
pub fn set_b(self, n: f32) -> Self {
Self::new(self.a(), n, self.c(), self.d())
}
#[inline(always)]
#[must_use]
pub fn set_c(self, n: f32) -> Self {
Self::new(self.a(), self.b(), n, self.d())
}
#[inline(always)]
#[must_use]
pub fn set_d(self, n: f32) -> Self {
Self::new(self.a(), self.b(), self.c(), n)
}
//-----------------------------------------------------
// Shuffles.
#[inline(always)]
pub fn aaaa(self) -> Self {
Self(unsafe { _mm_shuffle_ps(self.0, self.0, 0b00_00_00_00) })
}
#[inline(always)]
pub fn bbbb(self) -> Self {
Self(unsafe { _mm_shuffle_ps(self.0, self.0, 0b01_01_01_01) })
}
#[inline(always)]
pub fn cccc(self) -> Self {
Self(unsafe { _mm_shuffle_ps(self.0, self.0, 0b10_10_10_10) })
}
#[inline(always)]
pub fn dddd(self) -> Self {
Self(unsafe { _mm_shuffle_ps(self.0, self.0, 0b11_11_11_11) })
}
#[inline(always)]
pub fn bcad(self) -> Self {
Self(unsafe { _mm_shuffle_ps(self.0, self.0, 0b11_00_10_01) })
}
#[inline(always)]
pub fn cabd(self) -> Self {
Self(unsafe { _mm_shuffle_ps(self.0, self.0, 0b11_01_00_10) })
}
}
impl Index<usize> for Float4 {
type Output = f32;
#[inline(always)]
fn index(&self, idx: usize) -> &f32 {
let elements: &[f32; 4] = unsafe { std::mem::transmute(&self.0) };
match idx {
0 => &elements[0],
1 => &elements[1],
2 => &elements[2],
3 => &elements[3],
_ => panic!("Out of bounds access of Float4 elements."),
}
}
}
impl Add for Float4 {
type Output = Self;
#[inline(always)]
fn add(self, rhs: Self) -> Self {
Self(unsafe { _mm_add_ps(self.0, rhs.0) })
}
}
impl Sub for Float4 {
type Output = Self;
#[inline(always)]
fn sub(self, rhs: Self) -> Self {
Self(unsafe { _mm_sub_ps(self.0, rhs.0) })
}
}
impl Mul for Float4 {
type Output = Self;
#[inline(always)]
fn mul(self, rhs: Self) -> Self {
Self(unsafe { _mm_mul_ps(self.0, rhs.0) })
}
}
impl Mul<f32> for Float4 {
type Output = Self;
#[inline(always)]
fn mul(self, rhs: f32) -> Self {
Self(unsafe { _mm_mul_ps(self.0, _mm_set1_ps(rhs)) })
}
}
impl Div for Float4 {
type Output = Self;
#[inline(always)]
fn div(self, rhs: Self) -> Self {
Self(unsafe { _mm_div_ps(self.0, rhs.0) })
}
}
impl Div<f32> for Float4 {
type Output = Self;
#[inline(always)]
fn div(self, rhs: f32) -> Self {
Self(unsafe { _mm_div_ps(self.0, _mm_set1_ps(rhs)) })
}
}
impl Neg for Float4 {
type Output = Self;
#[inline(always)]
fn neg(self) -> Self {
Self(unsafe {
let sign_mask = _mm_castsi128_ps(_mm_set1_epi32(1 << 31));
_mm_xor_ps(self.0, sign_mask)
})
}
}
impl FMulAdd for Float4 {
#[inline(always)]
fn fma(self, b: Self, c: Self) -> Self {
self.mul_add(b, c)
}
}
//=============================================================
// Bool4
#[derive(Copy, Clone)]
#[repr(transparent)]
pub struct Bool4(__m128);
impl Bool4 {
#[inline(always)]
pub fn new(a: bool, b: bool, c: bool, d: bool) -> Self {
const ONES: i32 = unsafe { std::mem::transmute(0xffffffffu32) };
unsafe {
let ints = _mm_set_epi32(
d as i32 * ONES,
c as i32 * ONES,
b as i32 * ONES,
a as i32 * ONES,
);
Bool4(_mm_castsi128_ps(ints))
}
}
#[inline(always)]
pub fn new_false() -> Self {
Self(unsafe { _mm_setzero_ps() })
}
#[inline(always)]
pub fn to_bools(self) -> [bool; 4] {
let mut v = [0.0f32; 4];
unsafe { _mm_storeu_ps((&mut v[..]).as_mut_ptr(), self.0) }
[
v[0].to_bits() != 0,
v[1].to_bits() != 0,
v[2].to_bits() != 0,
v[3].to_bits() != 0,
]
}
/// Note: `a` goes to the least significant bit.
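///
/// For example, `Bool4::new(true, false, false, true).bitmask()` is `0b1001`.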
#[inline(always)]
pub fn bitmask(self) -> u8 {
unsafe { _mm_movemask_ps(self.0) as u8 }
}
#[inline(always)]
pub fn any(self) -> bool {
self.bitmask() != 0
}
#[inline(always)]
pub fn all(self) -> bool {
self.bitmask() == 0b1111
}
}
impl BitAnd for Bool4 {
type Output = Self;
#[inline(always)]
fn bitand(self, rhs: Self) -> Self {
Self(unsafe { _mm_and_ps(self.0, rhs.0) })
}
}
impl BitOr for Bool4 {
type Output = Self;
#[inline(always)]
fn bitor(self, rhs: Self) -> Self {
Self(unsafe { _mm_or_ps(self.0, rhs.0) })
}
}
impl BitXor for Bool4 {
type Output = Self;
#[inline(always)]
fn bitxor(self, rhs: Self) -> Self {
Self(unsafe { _mm_xor_ps(self.0, rhs.0) })
}
}
impl Not for Bool4 {
type Output = Self;
#[inline(always)]
fn not(self) -> Self {
Self(unsafe {
let ones = _mm_castsi128_ps(_mm_set1_epi32(!0));
_mm_xor_ps(self.0, ones)
})
}
}

@ -0,0 +1,305 @@
#![allow(dead_code)]
use std::ops::{Add, Mul};
use crate::point::Point;
use crate::sealed::Sealed;
use crate::wide4::Float4;
/// A forward affine transform.
///
/// Use this for working with transforms that still need to be
/// manipulated or composed with other transforms, or for storing
/// transforms more compactly.
///
/// Note: slightly counter-intuitively, even though this can perform
/// forward (but not inverse) transforms on points and vectors, it is
/// capable of *inverse* (but not forward) transforms on surface normals.
/// This is because forward transforms on surface normals require the
/// inverse transform matrix.
///
/// Convert to an [`XformFull`] for a larger-format type capable of
/// efficiently performing both forward and inverse transforms on all
/// types, but which is effectively "frozen" in terms of further
/// manipulation of the transform itself.
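///
/// A minimal usage sketch (names here are illustrative):
///
/// ```ignore
/// // Compose while still in `Xform` form, then "freeze" into an
/// // `XformFull` once both forward and inverse transforms are needed.
/// let xform = Xform::from_location(loc).compose(&other_xform);
/// let full = xform.to_full().unwrap();
/// ```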
#[derive(Debug, Copy, Clone, PartialEq)]
#[repr(C)]
pub struct Xform {
/// Rotation/scale/shear matrix.
pub m: [Float4; 3],
/// Translation.
pub t: Float4,
}
impl Xform {
/// Creates a new affine transform with the specified values:
///
/// ```text
/// a d g j
/// b e h k
/// c f i l
/// ```
///
/// Where j, k, and l are the xyz translation components.
#[inline]
#[allow(clippy::many_single_char_names)]
#[allow(clippy::too_many_arguments)]
pub fn new(
a: f32,
b: f32,
c: f32,
d: f32,
e: f32,
f: f32,
g: f32,
h: f32,
i: f32,
j: f32,
k: f32,
l: f32,
) -> Self {
Self {
m: [
Float4::new(a, b, c, 0.0),
Float4::new(d, e, f, 0.0),
Float4::new(g, h, i, 0.0),
],
t: Float4::new(j, k, l, 0.0),
}
}
/// Creates a new identity transform.
#[inline]
pub fn identity() -> Self {
Self {
m: [
Float4::new(1.0, 0.0, 0.0, 0.0),
Float4::new(0.0, 1.0, 0.0, 0.0),
Float4::new(0.0, 0.0, 1.0, 0.0),
],
t: Float4::splat(0.0),
}
}
#[inline]
pub fn from_location(loc: Point) -> Xform {
Self {
m: [
Float4::new(1.0, 0.0, 0.0, 0.0),
Float4::new(0.0, 1.0, 0.0, 0.0),
Float4::new(0.0, 0.0, 1.0, 0.0),
],
t: loc.0,
}
}
/// Returns whether the transforms are approximately equal to each other.
/// Each pair of corresponding elements must be within `max_ulps` ulps
/// of each other.
pub(crate) fn aprx_eq(&self, other: Xform, max_ulps: u32) -> bool {
let mut eq = true;
eq &= Float4::aprx_eq(self.m[0], other.m[0], max_ulps);
eq &= Float4::aprx_eq(self.m[1], other.m[1], max_ulps);
eq &= Float4::aprx_eq(self.m[2], other.m[2], max_ulps);
eq &= Float4::aprx_eq(self.t, other.t, max_ulps);
eq
}
/// Compute the "full" version of the transform.
#[inline]
pub fn to_full(&self) -> Option<XformFull> {
if let Some(inv_m) = Float4::invert_3x3(&self.m) {
Some(XformFull {
fwd: *self,
inv_m,
})
} else {
None
}
}
/// Faster but less precise version of `to_full()`.
#[inline]
pub fn to_full_fast(&self) -> Option<XformFull> {
if let Some(inv_m) = Float4::invert_3x3_fast(&self.m) {
Some(XformFull {
fwd: *self,
inv_m,
})
} else {
None
}
}
/// Composes two transforms together.
///
/// The resulting transform is the same as doing `self` and then
/// `rhs` in sequence.
#[inline]
pub fn compose(&self, rhs: &Self) -> Self {
let (m, t) = Float4::affine_mul_affine(&self.m, self.t, &rhs.m, rhs.t);
Self { m, t }
}
/// Composes two transforms together.
///
/// Faster but less precise version.
#[inline]
pub fn compose_fast(&self, rhs: &Self) -> Self {
let (m, t) = Float4::affine_mul_affine_fast(&self.m, self.t, &rhs.m, rhs.t);
Self { m, t }
}
}
impl Default for Xform {
fn default() -> Self {
Self::identity()
}
}
/// Multiply a transform by an `f32`, element-wise
impl Mul<f32> for Xform {
type Output = Self;
#[inline]
fn mul(self, rhs: f32) -> Self {
Self {
m: [self.m[0] * rhs, self.m[1] * rhs, self.m[2] * rhs],
t: self.t * rhs,
}
}
}
/// Add two transforms together, element-wise
impl Add for Xform {
type Output = Self;
#[inline]
fn add(self, rhs: Self) -> Self {
Self {
m: [
self.m[0] + rhs.m[0],
self.m[1] + rhs.m[1],
self.m[2] + rhs.m[2],
],
t: self.t + rhs.t,
}
}
}
impl AsXform for Xform {
#[inline(always)]
fn as_xform(&self) -> &Xform {
self
}
}
impl Sealed for Xform {}
//-------------------------------------------------------------
/// A combined forward/inverse affine transform.
///
/// Unlike [`Xform`], this can perform both forward and inverse
/// transforms on all types. However, it also takes up more space and
/// is effectively "frozen" in terms of further manipulation. Prefer
/// [`Xform`] when manipulating or composing transforms, and also
/// when storing transforms if space is a consideration.
///
/// Note: only the 3x3 part of the transform is stored inverted. This
/// is because it's both trivial and more numerically stable to reuse
/// the forward translation vector to do inverse transforms, as
/// `(point - fwd.t) * inv_m`.
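///
/// For example, an inverse point transform can be written directly from
/// the stored pieces (an illustrative sketch):
///
/// ```ignore
/// let p_local = p_world.vec_mul_affine_rev(&full.inv_m, full.fwd.t);
/// ```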
#[derive(Debug, Copy, Clone)]
#[repr(C)]
pub struct XformFull {
/// Forward transform.
pub fwd: Xform,
/// Inverse rotation/scale/shear matrix.
pub inv_m: [Float4; 3],
}
impl XformFull {
pub fn identity() -> Self {
Self {
fwd: Xform {
m: [
Float4::new(1.0, 0.0, 0.0, 0.0),
Float4::new(0.0, 1.0, 0.0, 0.0),
Float4::new(0.0, 0.0, 1.0, 0.0),
],
t: Float4::splat(0.0),
},
inv_m: [
Float4::new(1.0, 0.0, 0.0, 0.0),
Float4::new(0.0, 1.0, 0.0, 0.0),
Float4::new(0.0, 0.0, 1.0, 0.0),
],
}
}
}
impl AsXform for XformFull {
#[inline(always)]
fn as_xform(&self) -> &Xform {
&self.fwd
}
}
impl Sealed for XformFull {}
//-------------------------------------------------------------
pub trait AsXform: Sealed {
fn as_xform(&self) -> &Xform;
}
//-------------------------------------------------------------
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn equality() {
let a = Xform::identity();
let b = Xform::identity();
let c = Xform::new(1.1, 0.0, 0.0, 0.0, 0.0, 1.1, 0.0, 0.0, 0.0, 0.0, 1.1, 0.0);
assert_eq!(a, b);
assert!(a != c);
}
#[test]
fn approximate_equality() {
let a = Xform::identity();
let b = Xform::new(
1.000001, 0.0, 0.0, 0.0, 1.000001, 0.0, 0.0, 0.0, 1.000001, 0.0, 0.0, 0.0,
);
let c = Xform::new(
1.000003, 0.0, 0.0, 0.0, 1.000003, 0.0, 0.0, 0.0, 1.000003, 0.0, 0.0, 0.0,
);
assert!(a.aprx_eq(b, 10));
assert!(!a.aprx_eq(b, 6));
assert!(a.aprx_eq(c, 27));
assert!(!a.aprx_eq(c, 23));
}
#[test]
fn compose() {
let a = Xform::new(1.0, 3.0, 9.0, 2.0, 6.0, 2.0, 2.0, 7.0, 11.0, 1.5, 8.0, 12.0);
let b = Xform::new(
1.0, 2.0, 3.0, 5.0, 6.0, 7.0, 9.0, 10.0, 11.0, 13.0, 14.0, 15.0,
);
let c = Xform::new(
97.0, 110.0, 123.0, 50.0, 60.0, 70.0, 136.0, 156.0, 176.0, 162.5, 185.0, 207.5,
);
assert_eq!(a.compose(&b), c);
assert_eq!(a.compose_fast(&b), c);
}
}

@ -0,0 +1,12 @@
[package]
name = "rrand"
version = "0.1.0"
edition = "2021"
[dev-dependencies]
bencher = "0.1.5"
rand = "0.6"
[[bench]]
name = "bench"
harness = false

@ -0,0 +1,67 @@
use bencher::{benchmark_group, benchmark_main, black_box, Bencher};
use rrand::{mix32, mix32_seed, mix64, mix64_seed, Rng};
//----
fn rng_u32_100000(bench: &mut Bencher) {
bench.iter(|| {
let mut rng = Rng::new(black_box(0));
for _ in 0..100000 {
black_box(rng.u32());
}
});
}
fn rng_u64_100000(bench: &mut Bencher) {
bench.iter(|| {
let mut rng = Rng::new(black_box(0));
for _ in 0..100000 {
black_box(rng.u64());
}
});
}
fn mix32_100000(bench: &mut Bencher) {
bench.iter(|| {
for i in 0..100000 {
black_box(mix32(black_box(i)));
}
});
}
fn mix64_100000(bench: &mut Bencher) {
bench.iter(|| {
for i in 0..100000 {
black_box(mix64(black_box(i)));
}
});
}
fn mix32_seed_100000(bench: &mut Bencher) {
bench.iter(|| {
for i in 0..100000 {
black_box(mix32_seed(black_box(i), black_box(0)));
}
});
}
fn mix64_seed_100000(bench: &mut Bencher) {
bench.iter(|| {
for i in 0..100000 {
black_box(mix64_seed(black_box(i), black_box(0)));
}
});
}
//----
benchmark_group!(
benches,
rng_u32_100000,
rng_u64_100000,
mix32_100000,
mix64_100000,
mix32_seed_100000,
mix64_seed_100000,
);
benchmark_main!(benches);

sub_crates/rrand/src/lib.rs

@ -0,0 +1,127 @@
//! Sources of deterministic "randomness" for rendering applications.
/// Convert a `u32` to a float in [0.0, 1.0).
///
/// Use for getting f32 values from random u32 sources.
///
/// Note: this is a linear mapping from [0, u32::MAX] to [0.0, 1.0).
#[inline(always)]
pub fn u32_to_f32_norm(n: u32) -> f32 {
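// Set the exponent/sign bits to those of 1.0 (0x3f800000) and fill the
// mantissa with the top 23 bits of `n`, giving a float in [1.0, 2.0),
// then subtract 1.0 to map that to [0.0, 1.0).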
f32::from_bits((n >> 9) | 0x3f800000) - 1.0
}
//-------------------------------------------------------------
/// A fast RNG.
///
#[derive(Debug, Copy, Clone)]
pub struct Rng {
state: u64,
}
impl Rng {
/// Creates a new Rng from a seed.
///
/// A seed of zero is perfectly fine, and does not affect the quality
/// of the generator.
#[inline]
pub fn new(seed: u64) -> Self {
Self { state: seed }
}
/// Gets the nth relative RNG stream from this one.
///
/// The returned stream will be at the same point in its sequence as
/// this one.
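///
/// For example (illustrative):
///
/// ```ignore
/// let base = Rng::new(42);
/// let thread_rng = base.nth_stream(7); // stream 7, same position as `base`
/// ```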
#[inline]
pub fn nth_stream(&self, n: u64) -> Self {
Self {
// We just jump forward 2^40*n states. This gives us 2^24
// unique streams, each of which is 2^40 numbers long.
state: self
.state
.wrapping_add(0xa0761d6478bd642f_u64.wrapping_mul(1 << 40).wrapping_mul(n)),
}
}
/// Returns a random u32 in [0, u32::MAX].
#[inline(always)]
pub fn u32(&mut self) -> u32 {
self.u64() as u32
}
/// Returns a random u64 in [0, u64::MAX].
#[inline(always)]
pub fn u64(&mut self) -> u64 {
// The wyrand RNG.
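// Advance the state by an odd constant, multiply the state by a
// xor-tweaked copy of itself in 128 bits, then fold the high and low
// 64-bit halves of the product together with xor.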
self.state = self.state.wrapping_add(0xa0761d6478bd642f);
let t = (self.state as u128).wrapping_mul(self.state as u128 ^ 0xe7037ed1a0b428db);
((t >> 64) ^ t) as u64
}
/// Returns a random f32 in [0.0, 1.0).
#[inline(always)]
pub fn f32(&mut self) -> f32 {
u32_to_f32_norm(self.u32())
}
}
//-------------------------------------------------------------
/// A fast 32-bit mixing function.
///
/// Scrambles the input number to produce a different deterministic
/// "random" number.
#[inline(always)]
pub fn mix32(mut n: u32) -> u32 {
// From https://github.com/skeeto/hash-prospector
n ^= n >> 16;
n = n.wrapping_mul(0x21f0aaad);
n ^= n >> 15;
n = n.wrapping_mul(0xd35a2d97);
n ^= n >> 15;
// Xor by a random number so input zero doesn't map to output zero.
// The particular number used here isn't special.
n ^ 0xe6fe3beb
}
/// A fast seedable 32-bit mixing function.
///
/// Same as `mix32()` but takes a seed.
#[inline(always)]
pub fn mix32_seed(n: u32, seed: u32) -> u32 {
// We rotate the bits of `seed` so it's unlikely to interact with `n`
// in bad ways if they're both e.g. incrementing. The particular
// rotation constant used here isn't special.
mix32(n ^ seed.rotate_left(23))
}
/// A fast 64-bit mixing function.
///
/// Scrambles the input number to produce a different deterministic
/// "random" number.
#[inline(always)]
pub fn mix64(mut n: u64) -> u64 {
// From https://zimbry.blogspot.com/2011/09/better-bit-mixing-improving-on.html
n ^= n >> 30;
n = n.wrapping_mul(0xbf58476d1ce4e5b9);
n ^= n >> 27;
n = n.wrapping_mul(0x94d049bb133111eb);
n ^= n >> 31;
// Xor by a random number so input zero doesn't map to output zero.
// The particular number used here isn't special.
n ^ 0x4acc3f27cc712c9d
}
/// A fast seedable 64-bit mixing function.
///
/// Same as `mix64()` but takes a seed.
#[inline(always)]
pub fn mix64_seed(n: u64, seed: u64) -> u64 {
// We rotate the bits of `seed` so it's unlikely to interact with `n`
// in bad ways if they're both e.g. incrementing. The particular
// rotation constant used here isn't special.
mix64(n ^ seed.rotate_left(47))
}

@ -10,4 +10,4 @@ name = "spectral_upsampling"
path = "src/lib.rs"
[dependencies]
glam = "0.15"
rmath = { path = "../rmath" }

@ -6,7 +6,9 @@
/// This provides similar color matching to full Jakob, at the expense of
/// somewhat lower quality spectra, and the inability to precalculate
/// the coefficients for even more efficient evaluation later on.
use glam::Vec4;
use rmath::wide4::Float4;
pub const EQUAL_ENERGY_REFLECTANCE: f32 = 1.0;
/// How many polynomial coefficients?
const RGB2SPEC_N_COEFFS: usize = 3;
@ -15,7 +17,7 @@ const RGB2SPEC_N_COEFFS: usize = 3;
include!(concat!(env!("OUT_DIR"), "/jakob_table_inc.rs"));
#[inline]
pub fn rec709_to_spectrum_p4(lambdas: Vec4, rgb: (f32, f32, f32)) -> Vec4 {
pub fn rec709_to_spectrum_p4(lambdas: Float4, rgb: (f32, f32, f32)) -> Float4 {
small_rgb_to_spectrum_p4(
REC709_TABLE,
REC709_TABLE_RES,
@ -26,7 +28,7 @@ pub fn rec709_to_spectrum_p4(lambdas: Vec4, rgb: (f32, f32, f32)) -> Vec4 {
}
#[inline]
pub fn rec2020_to_spectrum_p4(lambdas: Vec4, rgb: (f32, f32, f32)) -> Vec4 {
pub fn rec2020_to_spectrum_p4(lambdas: Float4, rgb: (f32, f32, f32)) -> Float4 {
small_rgb_to_spectrum_p4(
REC2020_TABLE,
REC2020_TABLE_RES,
@ -37,7 +39,7 @@ pub fn rec2020_to_spectrum_p4(lambdas: Vec4, rgb: (f32, f32, f32)) -> Vec4 {
}
#[inline]
pub fn aces_to_spectrum_p4(lambdas: Vec4, rgb: (f32, f32, f32)) -> Vec4 {
pub fn aces_to_spectrum_p4(lambdas: Float4, rgb: (f32, f32, f32)) -> Float4 {
small_rgb_to_spectrum_p4(
ACES_TABLE,
ACES_TABLE_RES,
@ -56,9 +58,9 @@ fn small_rgb_to_spectrum_p4(
table: &[[(f32, f32, f32); 2]],
table_res: usize,
table_mid_value: f32,
lambdas: Vec4,
lambdas: Float4,
rgb: (f32, f32, f32),
) -> Vec4 {
) -> Float4 {
// Determine largest RGB component, and calculate the other two
// components scaled for lookups.
let (i, max_val, x, y) = if rgb.0 > rgb.1 && rgb.0 > rgb.2 {
@ -71,7 +73,7 @@ fn small_rgb_to_spectrum_p4(
if max_val == 0.0 {
// If max_val is zero, just return zero. This avoids NaN's from
// divide by zero. This is also correct, since it's black.
return Vec4::splat(0.0);
return Float4::splat(0.0);
}
let x = x * 63.0 / max_val;
let y = y * 63.0 / max_val;
@ -91,20 +93,20 @@ fn small_rgb_to_spectrum_p4(
// Convert to SIMD format for faster interpolation.
let a0 = [
Vec4::new(a0[0].0, a0[0].1, a0[0].2, 0.0),
Vec4::new(a0[1].0, a0[1].1, a0[1].2, 0.0),
Float4::new(a0[0].0, a0[0].1, a0[0].2, 0.0),
Float4::new(a0[1].0, a0[1].1, a0[1].2, 0.0),
];
let a1 = [
Vec4::new(a1[0].0, a1[0].1, a1[0].2, 0.0),
Vec4::new(a1[1].0, a1[1].1, a1[1].2, 0.0),
Float4::new(a1[0].0, a1[0].1, a1[0].2, 0.0),
Float4::new(a1[1].0, a1[1].1, a1[1].2, 0.0),
];
let a2 = [
Vec4::new(a2[0].0, a2[0].1, a2[0].2, 0.0),
Vec4::new(a2[1].0, a2[1].1, a2[1].2, 0.0),
Float4::new(a2[0].0, a2[0].1, a2[0].2, 0.0),
Float4::new(a2[1].0, a2[1].1, a2[1].2, 0.0),
];
let a3 = [
Vec4::new(a3[0].0, a3[0].1, a3[0].2, 0.0),
Vec4::new(a3[1].0, a3[1].1, a3[1].2, 0.0),
Float4::new(a3[0].0, a3[0].1, a3[0].2, 0.0),
Float4::new(a3[1].0, a3[1].1, a3[1].2, 0.0),
];
// Do interpolation.
@ -133,22 +135,22 @@ fn small_rgb_to_spectrum_p4(
// Coefficient -> eval functions
#[inline(always)]
fn rgb2spec_fma_4(a: Vec4, b: Vec4, c: Vec4) -> Vec4 {
(a * b) + c
fn rgb2spec_fma_4(a: Float4, b: Float4, c: Float4) -> Float4 {
a.mul_add(b, c)
}
fn rgb2spec_eval_4(coeff: [f32; RGB2SPEC_N_COEFFS], lambda: Vec4) -> Vec4 {
let co0 = Vec4::splat(coeff[0]);
let co1 = Vec4::splat(coeff[1]);
let co2 = Vec4::splat(coeff[2]);
fn rgb2spec_eval_4(coeff: [f32; RGB2SPEC_N_COEFFS], lambda: Float4) -> Float4 {
let co0 = Float4::splat(coeff[0]);
let co1 = Float4::splat(coeff[1]);
let co2 = Float4::splat(coeff[2]);
let x = rgb2spec_fma_4(rgb2spec_fma_4(co0, lambda, co1), lambda, co2);
let y = {
// TODO: replace this with a SIMD sqrt op.
let (x, y, z, w) = rgb2spec_fma_4(x, x, Vec4::splat(1.0)).into();
Vec4::new(x.sqrt(), y.sqrt(), z.sqrt(), w.sqrt()).recip()
let (x, y, z, w) = rgb2spec_fma_4(x, x, Float4::splat(1.0)).into();
Float4::new(x.sqrt(), y.sqrt(), z.sqrt(), w.sqrt()).recip()
};
rgb2spec_fma_4(Vec4::splat(0.5) * x, y, Vec4::splat(0.5))
rgb2spec_fma_4(Float4::splat(0.5) * x, y, Float4::splat(0.5))
}

@ -6,7 +6,7 @@
use std::f32;
use glam::Vec4;
use rmath::wide4::Float4;
mod meng_spectra_tables;
@ -174,7 +174,7 @@ pub fn spectrum_xyz_to_p(lambda: f32, xyz: (f32, f32, f32)) -> f32 {
///
/// Works on 4 wavelengths at once via SIMD.
#[inline]
pub fn spectrum_xyz_to_p_4(lambdas: Vec4, xyz: (f32, f32, f32)) -> Vec4 {
pub fn spectrum_xyz_to_p_4(lambdas: Float4, xyz: (f32, f32, f32)) -> Float4 {
assert!(lambdas.min_element() >= SPECTRUM_SAMPLE_MIN);
assert!(lambdas.max_element() <= SPECTRUM_SAMPLE_MAX);
@ -184,7 +184,7 @@ pub fn spectrum_xyz_to_p_4(lambdas: Vec4, xyz: (f32, f32, f32)) -> Vec4 {
if norm < f32::MAX {
norm
} else {
return Vec4::splat(0.0);
return Float4::splat(0.0);
}
};
@ -197,7 +197,7 @@ pub fn spectrum_xyz_to_p_4(lambdas: Vec4, xyz: (f32, f32, f32)) -> Vec4 {
|| uv.1 < 0.0
|| uv.1 >= SPECTRUM_GRID_HEIGHT as f32
{
return Vec4::splat(0.0);
return Float4::splat(0.0);
}
let uvi = (uv.0 as i32, uv.1 as i32);
@ -214,11 +214,11 @@ pub fn spectrum_xyz_to_p_4(lambdas: Vec4, xyz: (f32, f32, f32)) -> Vec4 {
// If the cell has no points, nothing we can do, so return 0.0
if num == 0 {
return Vec4::splat(0.0);
return Float4::splat(0.0);
}
// Normalize lambda to spectrum table index range.
let sb: Vec4 = (lambdas - Vec4::splat(SPECTRUM_SAMPLE_MIN))
let sb: Float4 = (lambdas - Float4::splat(SPECTRUM_SAMPLE_MIN))
/ (SPECTRUM_SAMPLE_MAX - SPECTRUM_SAMPLE_MIN)
* (SPECTRUM_NUM_SAMPLES as f32 - 1.0);
debug_assert!(sb.min_element() >= 0.0);
@ -226,7 +226,7 @@ pub fn spectrum_xyz_to_p_4(lambdas: Vec4, xyz: (f32, f32, f32)) -> Vec4 {
// Get the spectral values for the vertices of the grid cell.
// TODO: use integer SIMD intrinsics to make this part faster.
let mut p = [Vec4::splat(0.0); 6];
let mut p = [Float4::splat(0.0); 6];
let sb0: [i32; 4] = [sb[0] as i32, sb[1] as i32, sb[2] as i32, sb[3] as i32];
assert!(sb0[0].max(sb0[1]).max(sb0[2].max(sb0[3])) < SPECTRUM_NUM_SAMPLES);
let sb1: [i32; 4] = [
@ -235,27 +235,27 @@ pub fn spectrum_xyz_to_p_4(lambdas: Vec4, xyz: (f32, f32, f32)) -> Vec4 {
(sb[2] as i32 + 1).min(SPECTRUM_NUM_SAMPLES - 1),
(sb[3] as i32 + 1).min(SPECTRUM_NUM_SAMPLES - 1),
];
let sbf = sb - Vec4::new(sb0[0] as f32, sb0[1] as f32, sb0[2] as f32, sb0[3] as f32);
let sbf = sb - Float4::new(sb0[0] as f32, sb0[1] as f32, sb0[2] as f32, sb0[3] as f32);
for i in 0..(num as usize) {
debug_assert!(idx[i] >= 0);
let spectrum = &SPECTRUM_DATA_POINTS[idx[i] as usize].spectrum;
let p0 = Vec4::new(
let p0 = Float4::new(
spectrum[sb0[0] as usize],
spectrum[sb0[1] as usize],
spectrum[sb0[2] as usize],
spectrum[sb0[3] as usize],
);
let p1 = Vec4::new(
let p1 = Float4::new(
spectrum[sb1[0] as usize],
spectrum[sb1[1] as usize],
spectrum[sb1[2] as usize],
spectrum[sb1[3] as usize],
);
p[i] = p0 * (Vec4::splat(1.0) - sbf) + p1 * sbf;
p[i] = p0 * (Float4::splat(1.0) - sbf) + p1 * sbf;
}
// Linearly interpolate the spectral power of the cell vertices.
let mut interpolated_p = Vec4::splat(0.0);
let mut interpolated_p = Float4::splat(0.0);
if inside {
// Fast path for normal inner quads:
let uv2 = (uv.0 - uvi.0 as f32, uv.1 - uvi.1 as f32);