Turn into a "proper" library.

This commit is contained in:
Nathan Vegdahl 2024-09-10 18:19:00 +02:00
parent 158511b3aa
commit bff8bbc89c
14 changed files with 4614 additions and 1010 deletions

1
.gitignore vendored
View File

@ -1,2 +1,3 @@
Cargo.lock
/target /target
/test_text /test_text

791
Cargo.lock generated
View File

@ -1,791 +0,0 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3
[[package]]
name = "ahash"
version = "0.7.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "891477e0c6a8957309ee5c45a6368af3ae14bb510732d2684ffa19af310920f9"
dependencies = [
"getrandom",
"once_cell",
"version_check",
]
[[package]]
name = "aho-corasick"
version = "1.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916"
dependencies = [
"memchr",
]
[[package]]
name = "anyhow"
version = "1.0.86"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b3d1d046238990b9cf5bcde22a3fb3584ee5cf65fb2765f454ed428c7a0063da"
[[package]]
name = "argmin"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5698c8cd3510117a4e6b96749a8061ba7dce1a19578ce4ecdb12dd36d94a7f8d"
dependencies = [
"anyhow",
"argmin-math",
"bincode 1.3.3",
"instant",
"num-traits",
"paste",
"rand",
"rand_xoshiro",
"serde",
"serde_json",
"slog",
"slog-async",
"slog-json",
"slog-term",
"thiserror",
]
[[package]]
name = "argmin-math"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "75f2b0dada81340718682df780c9a696b090b6ef7e83c3dcc770af6de9302995"
dependencies = [
"anyhow",
"cfg-if",
"num-complex",
"num-integer",
"num-traits",
"rand",
"thiserror",
]
[[package]]
name = "autocfg"
version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0c4b4d0bd25bd0b74681c0ad21497610ce1b7c91b1022cd21c80c6fbdd9476b0"
[[package]]
name = "bincode"
version = "1.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad"
dependencies = [
"serde",
]
[[package]]
name = "bincode"
version = "2.0.0-rc.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f11ea1a0346b94ef188834a65c068a03aec181c94896d481d7a0a40d85b0ce95"
dependencies = [
"bincode_derive",
"serde",
]
[[package]]
name = "bincode_derive"
version = "2.0.0-rc.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7e30759b3b99a1b802a7a3aa21c85c3ded5c28e1c83170d82d70f08bbf7f3e4c"
dependencies = [
"virtue",
]
[[package]]
name = "bitflags"
version = "2.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de"
[[package]]
name = "byteorder"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
[[package]]
name = "cfg-if"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "crawdad"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "87fbd1ecd2ed790e11c8fbe034f9b3e7687404818d1bdfd8218d26ec645ec7c5"
[[package]]
name = "crossbeam-channel"
version = "0.5.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "33480d6946193aa8033910124896ca395333cae7e2d1113d1fef6c3272217df2"
dependencies = [
"crossbeam-utils",
]
[[package]]
name = "crossbeam-utils"
version = "0.8.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80"
[[package]]
name = "csv-core"
version = "0.1.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5efa2b3d7902f4b634a20cae3c9c4e6209dc4779feb6863329607560143efa70"
dependencies = [
"memchr",
]
[[package]]
name = "deranged"
version = "0.3.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b42b6fa04a440b495c8b04d0e71b707c585f83cb9cb28cf8cd0d976c315e31b4"
dependencies = [
"powerfmt",
]
[[package]]
name = "dirs-next"
version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b98cf8ebf19c3d1b223e151f99a4f9f0690dca41414773390fc824184ac833e1"
dependencies = [
"cfg-if",
"dirs-sys-next",
]
[[package]]
name = "dirs-sys-next"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ebda144c4fe02d1f7ea1a7d9641b6fc6b580adcfa024ae48797ecdeb6825b4d"
dependencies = [
"libc",
"redox_users",
"winapi",
]
[[package]]
name = "furigana_gen"
version = "0.1.0"
dependencies = [
"lz4_flex",
"once_cell",
"regex",
"vibrato",
]
[[package]]
name = "getrandom"
version = "0.2.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7"
dependencies = [
"cfg-if",
"libc",
"wasi",
]
[[package]]
name = "hashbrown"
version = "0.12.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888"
dependencies = [
"ahash",
]
[[package]]
name = "hermit-abi"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fbf6a919d6cf397374f7dfeeea91d974c7c0a7221d0d0f4f20d859d329e53fcc"
[[package]]
name = "instant"
version = "0.1.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e0242819d153cba4b4b05a5a8f2a7e9bbf97b6055b2a002b395c96b5ff3c0222"
dependencies = [
"cfg-if",
]
[[package]]
name = "is-terminal"
version = "0.4.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "261f68e344040fbd0edea105bef17c66edf46f984ddb1115b775ce31be948f4b"
dependencies = [
"hermit-abi",
"libc",
"windows-sys",
]
[[package]]
name = "itoa"
version = "1.0.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b"
[[package]]
name = "libc"
version = "0.2.158"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d8adc4bb1803a324070e64a98ae98f38934d91957a99cfb3a43dcbc01bc56439"
[[package]]
name = "libredox"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c0ff37bd590ca25063e35af745c343cb7a0271906fb7b37e4813e8f79f00268d"
dependencies = [
"bitflags",
"libc",
]
[[package]]
name = "lz4_flex"
version = "0.11.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "75761162ae2b0e580d7e7c390558127e5f01b4194debd6221fd8c207fc80e3f5"
dependencies = [
"twox-hash",
]
[[package]]
name = "memchr"
version = "2.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
[[package]]
name = "num-complex"
version = "0.4.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495"
dependencies = [
"num-traits",
]
[[package]]
name = "num-conv"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9"
[[package]]
name = "num-integer"
version = "0.1.46"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f"
dependencies = [
"num-traits",
]
[[package]]
name = "num-traits"
version = "0.2.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
dependencies = [
"autocfg",
]
[[package]]
name = "once_cell"
version = "1.19.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92"
[[package]]
name = "paste"
version = "1.0.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a"
[[package]]
name = "powerfmt"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391"
[[package]]
name = "ppv-lite86"
version = "0.2.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04"
dependencies = [
"zerocopy",
]
[[package]]
name = "proc-macro2"
version = "1.0.86"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
version = "1.0.37"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af"
dependencies = [
"proc-macro2",
]
[[package]]
name = "rand"
version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
dependencies = [
"libc",
"rand_chacha",
"rand_core",
"serde",
]
[[package]]
name = "rand_chacha"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
dependencies = [
"ppv-lite86",
"rand_core",
]
[[package]]
name = "rand_core"
version = "0.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
dependencies = [
"getrandom",
"serde",
]
[[package]]
name = "rand_xoshiro"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6f97cdb2a36ed4183de61b2f824cc45c9f1037f28afe0a322e9fff4c108b5aaa"
dependencies = [
"rand_core",
"serde",
]
[[package]]
name = "redox_users"
version = "0.4.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ba009ff324d1fc1b900bd1fdb31564febe58a8ccc8a6fdbb93b543d33b13ca43"
dependencies = [
"getrandom",
"libredox",
"thiserror",
]
[[package]]
name = "regex"
version = "1.10.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4219d74c6b67a3654a9fbebc4b419e22126d13d2f3c4a07ee0cb61ff79a79619"
dependencies = [
"aho-corasick",
"memchr",
"regex-automata",
"regex-syntax",
]
[[package]]
name = "regex-automata"
version = "0.4.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "38caf58cc5ef2fed281f89292ef23f6365465ed9a41b7a7754eb4e26496c92df"
dependencies = [
"aho-corasick",
"memchr",
"regex-syntax",
]
[[package]]
name = "regex-syntax"
version = "0.8.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b"
[[package]]
name = "rucrf"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "640271497e95c3a3a9502740187b5fc3a19485cad172e7be9a2fd8d86ffc1c28"
dependencies = [
"argmin",
"argmin-math",
"bincode 2.0.0-rc.3",
"crossbeam-channel",
"hashbrown",
]
[[package]]
name = "rustversion"
version = "1.0.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "955d28af4278de8121b7ebeb796b6a45735dc01436d898801014aced2773a3d6"
[[package]]
name = "ryu"
version = "1.0.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f"
[[package]]
name = "serde"
version = "1.0.209"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "99fce0ffe7310761ca6bf9faf5115afbc19688edd00171d81b1bb1b116c63e09"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
version = "1.0.209"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a5831b979fd7b5439637af1752d535ff49f4860c0f341d1baeb6faf0f4242170"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "serde_json"
version = "1.0.127"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8043c06d9f82bd7271361ed64f415fe5e12a77fdb52e573e7f06a516dea329ad"
dependencies = [
"itoa",
"memchr",
"ryu",
"serde",
]
[[package]]
name = "slog"
version = "2.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8347046d4ebd943127157b94d63abb990fcf729dc4e9978927fdf4ac3c998d06"
[[package]]
name = "slog-async"
version = "2.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72c8038f898a2c79507940990f05386455b3a317d8f18d4caea7cbc3d5096b84"
dependencies = [
"crossbeam-channel",
"slog",
"take_mut",
"thread_local",
]
[[package]]
name = "slog-json"
version = "2.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3e1e53f61af1e3c8b852eef0a9dee29008f55d6dd63794f3f12cef786cf0f219"
dependencies = [
"serde",
"serde_json",
"slog",
"time",
]
[[package]]
name = "slog-term"
version = "2.9.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6e022d0b998abfe5c3782c1f03551a596269450ccd677ea51c56f8b214610e8"
dependencies = [
"is-terminal",
"slog",
"term",
"thread_local",
"time",
]
[[package]]
name = "static_assertions"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"
[[package]]
name = "syn"
version = "2.0.77"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9f35bcdf61fd8e7be6caf75f429fdca8beb3ed76584befb503b1569faee373ed"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
name = "take_mut"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f764005d11ee5f36500a149ace24e00e3da98b0158b3e2d53a7495660d3f4d60"
[[package]]
name = "term"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c59df8ac95d96ff9bede18eb7300b0fda5e5d8d90960e76f8e14ae765eedbf1f"
dependencies = [
"dirs-next",
"rustversion",
"winapi",
]
[[package]]
name = "thiserror"
version = "1.0.63"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c0342370b38b6a11b6cc11d6a805569958d54cfa061a29969c3b5ce2ea405724"
dependencies = [
"thiserror-impl",
]
[[package]]
name = "thiserror-impl"
version = "1.0.63"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a4558b58466b9ad7ca0f102865eccc95938dca1a74a856f2b57b6629050da261"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "thread_local"
version = "1.1.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8b9ef9bad013ada3808854ceac7b46812a6465ba368859a37e2100283d2d719c"
dependencies = [
"cfg-if",
"once_cell",
]
[[package]]
name = "time"
version = "0.3.36"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5dfd88e563464686c916c7e46e623e520ddc6d79fa6641390f2e3fa86e83e885"
dependencies = [
"deranged",
"itoa",
"num-conv",
"powerfmt",
"serde",
"time-core",
"time-macros",
]
[[package]]
name = "time-core"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3"
[[package]]
name = "time-macros"
version = "0.2.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3f252a68540fde3a3877aeea552b832b40ab9a69e318efd078774a01ddee1ccf"
dependencies = [
"num-conv",
"time-core",
]
[[package]]
name = "twox-hash"
version = "1.6.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "97fee6b57c6a41524a810daee9286c02d7752c4253064d0b05472833a438f675"
dependencies = [
"cfg-if",
"static_assertions",
]
[[package]]
name = "unicode-ident"
version = "1.0.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b"
[[package]]
name = "version_check"
version = "0.9.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a"
[[package]]
name = "vibrato"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df95d99b268877738d8f644c38604c4c9c09950219b1d1a725ada94d62e98722"
dependencies = [
"bincode 2.0.0-rc.3",
"crawdad",
"csv-core",
"hashbrown",
"regex",
"rucrf",
]
[[package]]
name = "virtue"
version = "0.0.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9dcc60c0624df774c82a0ef104151231d37da4962957d691c011c852b2473314"
[[package]]
name = "wasi"
version = "0.11.0+wasi-snapshot-preview1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
[[package]]
name = "winapi"
version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
dependencies = [
"winapi-i686-pc-windows-gnu",
"winapi-x86_64-pc-windows-gnu",
]
[[package]]
name = "winapi-i686-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
[[package]]
name = "winapi-x86_64-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
[[package]]
name = "windows-sys"
version = "0.52.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
dependencies = [
"windows-targets",
]
[[package]]
name = "windows-targets"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
dependencies = [
"windows_aarch64_gnullvm",
"windows_aarch64_msvc",
"windows_i686_gnu",
"windows_i686_gnullvm",
"windows_i686_msvc",
"windows_x86_64_gnu",
"windows_x86_64_gnullvm",
"windows_x86_64_msvc",
]
[[package]]
name = "windows_aarch64_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
[[package]]
name = "windows_aarch64_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
[[package]]
name = "windows_i686_gnu"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
[[package]]
name = "windows_i686_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
[[package]]
name = "windows_i686_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
[[package]]
name = "windows_x86_64_gnu"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
[[package]]
name = "windows_x86_64_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
[[package]]
name = "windows_x86_64_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
[[package]]
name = "zerocopy"
version = "0.7.35"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0"
dependencies = [
"byteorder",
"zerocopy-derive",
]
[[package]]
name = "zerocopy-derive"
version = "0.7.35"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e"
dependencies = [
"proc-macro2",
"quote",
"syn",
]

View File

@ -3,10 +3,11 @@ name = "furigana_gen"
version = "0.1.0" version = "0.1.0"
edition = "2021" edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [lib]
name = "furigana_gen"
path = "src/lib.rs"
[dependencies] [dependencies]
vibrato = "0.5" vibrato = "0.5"
lz4_flex = "0.11" lz4_flex = "0.11"
regex = "1.10" quick-xml = "0.36.1"
once_cell = "1.19"

25
build.rs Normal file
View File

@ -0,0 +1,25 @@
// Generate a Rust source file containing the frequency-ordered kanji table (`KANJI_FREQ`).
use std::{env, fs::File, io::Write, path::Path};
const KANJI: &str = include_str!("data/kanji_frequency.txt");
/// Build-script entry point.
///
/// Converts the kanji list embedded in `KANJI` into a Rust source file,
/// `$OUT_DIR/kanji_freq_inc.rs`, that declares
/// `const KANJI_FREQ: &[char]` with one entry per non-whitespace character,
/// in the same order as the input text.
///
/// Panics if `OUT_DIR` is not set or the output file cannot be written.
fn main() {
    let out_dir = env::var("OUT_DIR").unwrap();
    let dest_path = Path::new(&out_dir).join("kanji_freq_inc.rs");

    // Assemble the whole file in memory so it is written with a single call
    // rather than one small unbuffered write per character.
    let mut code = String::from("const KANJI_FREQ: &[char] = &[");
    for c in KANJI.chars().filter(|c| !c.is_whitespace()) {
        // `{:?}` yields a quoted, properly escaped char literal, so the
        // generated source stays valid even for characters such as `'` or `\`.
        code.push_str(&format!("\n{:?},", c));
    }
    code.push_str("\n];");

    let mut f = File::create(&dest_path).unwrap();
    f.write_all(code.as_bytes()).unwrap();
}

4001
data/kanji_frequency.txt Normal file

File diff suppressed because it is too large Load Diff

Binary file not shown.

67
src/learner.rs Normal file
View File

@ -0,0 +1,67 @@
use std::collections::HashMap;
/// Starting value of a word's `max_distance`.
const MIN_MAX_DISTANCE: usize = 100;
/// Upper bound that a word's `max_distance` is clamped to.
const MAX_MAX_DISTANCE: usize = 10000;

/// Per-word bookkeeping kept by [`Learner`].
#[derive(Debug, Copy, Clone)]
struct WordStats {
    /// The last position (in words processed) that this word was seen at.
    last_seen_at: usize,

    /// How many times this word has been seen so far.
    times_seen: usize,

    /// Maximum distance before help is needed again.
    max_distance: usize,
}

/// Tracks which words have been seen, how often, and how recently, in order
/// to decide whether a word still "needs help".
pub struct Learner {
    stats: HashMap<String, WordStats>,
    words_processed: usize,
    times_seen_threshold: usize,
}

impl Learner {
    /// Creates an empty learner.
    ///
    /// A word always needs help until it has been recorded more than
    /// `times_seen_threshold` times.
    pub fn new(times_seen_threshold: usize) -> Self {
        Self {
            stats: HashMap::new(),
            words_processed: 0,
            times_seen_threshold,
        }
    }

    /// Records one occurrence of `word` and advances the word counter.
    ///
    /// Once a word has been seen more than `times_seen_threshold` times, every
    /// sighting that falls within its current `max_distance` of the previous
    /// one grows `max_distance` by `min(distance, max_distance / 2)`, clamped
    /// to `MAX_MAX_DISTANCE`.
    pub fn record(&mut self, word: &str) {
        let now = self.words_processed;

        // Look the word up by `&str` first so that an owned key is only
        // allocated the first time a word is seen.
        match self.stats.get_mut(word) {
            Some(stats) => {
                let distance = now - stats.last_seen_at;
                stats.last_seen_at = now;
                stats.times_seen += 1;

                if stats.times_seen > self.times_seen_threshold {
                    if distance < stats.max_distance {
                        stats.max_distance += distance.min(stats.max_distance / 2);
                    }
                    stats.max_distance = stats.max_distance.min(MAX_MAX_DISTANCE);
                }
            }
            None => {
                self.stats.insert(
                    word.to_string(),
                    WordStats {
                        last_seen_at: now,
                        times_seen: 1,
                        max_distance: MIN_MAX_DISTANCE,
                    },
                );
            }
        }

        self.words_processed += 1;
    }

    /// Returns whether `word` currently needs help.
    ///
    /// That is the case when the word has never been recorded, when it has
    /// been seen at most `times_seen_threshold` times, or when more than its
    /// `max_distance` words have been processed since it was last seen.
    pub fn needs_help(&self, word: &str) -> bool {
        match self.stats.get(word) {
            None => true,
            Some(stats) => {
                let distance = self.words_processed - stats.last_seen_at;
                stats.times_seen <= self.times_seen_threshold || distance > stats.max_distance
            }
        }
    }
}

516
src/lib.rs Normal file
View File

@ -0,0 +1,516 @@
mod learner;
use std::{
collections::HashSet,
// fs::File,
io::{Cursor, Read},
};
use lz4_flex::frame::FrameDecoder;
use quick_xml::events::Event;
use vibrato::{Dictionary, Tokenizer};
use learner::Learner;
// Include KANJI_FREQ, a frequency-ordered array of kanji characters.
include!(concat!(env!("OUT_DIR"), "/kanji_freq_inc.rs"));
const DICT: &[u8] = include_bytes!("../data/dictionary/system.dic.lz4");
/// Adds furigana (as html ruby markup) to Japanese text.
///
/// Bundles the tokenizer built from the embedded dictionary, the set of
/// kanji excluded from furigana, and the [`Learner`] state that is updated
/// across calls.
pub struct FuriganaGenerator {
    tokenizer: Tokenizer,
    exclude_kanji: HashSet<char>,
    learner: Learner,
}

impl FuriganaGenerator {
    /// Builds a generator from the embedded, lz4-compressed dictionary.
    ///
    /// `exclude_count`: exclude the N most frequent kanji from furigana.
    /// Specifically, words made up *entirely* of those kanji will be excluded.
    /// If a word has some kanji that aren't in that set, even if it also has
    /// some that are, it will still get furigana.
    ///
    /// `learn_mode`: when true the learner is created with a times-seen
    /// threshold of 5, otherwise with `usize::MAX`.
    ///
    /// Panics if the embedded dictionary cannot be decompressed or read.
    pub fn new(exclude_count: usize, learn_mode: bool) -> Self {
        // Note: the decoder could be handed straight to `Dictionary::read()`
        // and it would work, but that ends up slower than decompressing the
        // whole dictionary up front.
        let mut decompressed = Vec::new();
        FrameDecoder::new(Cursor::new(DICT))
            .read_to_end(&mut decompressed)
            .unwrap();
        let dict = Dictionary::read(Cursor::new(&decompressed)).unwrap();

        let exclude_kanji: HashSet<char> =
            KANJI_FREQ.iter().take(exclude_count).copied().collect();

        let times_seen_threshold = if learn_mode { 5 } else { usize::MAX };

        Self {
            tokenizer: Tokenizer::new(dict),
            exclude_kanji,
            learner: Learner::new(times_seen_threshold),
        }
    }

    /// Returns `text` with furigana added as html ruby markup.
    ///
    /// Text that already sits inside a `<ruby>` element is passed through
    /// unchanged. The learner state is updated as a side effect.
    pub fn add_html_furigana(&mut self, text: &str) -> String {
        let Self {
            tokenizer,
            exclude_kanji,
            learner,
        } = self;
        add_html_furigana_skip_already_ruby(text, tokenizer, exclude_kanji, learner)
    }
}
/// Views the bytes that `bytes` dereferences to as a UTF-8 string slice.
///
/// # Panics
///
/// Panics if those bytes are not valid UTF-8.
fn to_str<B>(bytes: &B) -> &str
where
    B: std::ops::Deref<Target = [u8]>,
{
    let raw: &[u8] = &**bytes;
    std::str::from_utf8(raw).unwrap()
}
/// Like `add_html_furigana()`, but skips text that already has ruby on it, so
/// it doesn't get double-ruby.
///
/// The input is read as a stream of xml events. Text events outside of any
/// `<ruby>` element are run through `add_html_furigana()`; every other event
/// is written back out via `write_xml()`.
///
/// # Panics
///
/// Panics if the xml reader reports an error for `text`.
fn add_html_furigana_skip_already_ruby(
    text: &str,
    tokenizer: &Tokenizer,
    exclude_kanji: &HashSet<char>,
    learner: &mut Learner,
) -> String {
    let mut reader = quick_xml::Reader::from_str(text);
    let mut output = String::new();

    // Number of `<ruby>` start tags seen minus the number of `</ruby>` end tags.
    let mut ruby_depth: i32 = 0;

    loop {
        let event = match reader.read_event() {
            Ok(event) => event,
            Err(e) => panic!("Error at position {}: {:?}", reader.error_position(), e),
        };

        match event {
            Event::Eof => break,

            Event::Start(tag) => {
                if tag.name().into_inner() == b"ruby" {
                    ruby_depth += 1;
                }
                write_xml(&mut output, &Event::Start(tag));
            }

            Event::End(tag) => {
                if tag.name().into_inner() == b"ruby" {
                    ruby_depth -= 1;
                }
                write_xml(&mut output, &Event::End(tag));
            }

            Event::Text(content) => {
                if ruby_depth > 0 {
                    // Already inside ruby markup: leave the text untouched.
                    write_xml(&mut output, &Event::Text(content));
                } else {
                    let annotated =
                        add_html_furigana(to_str(&content), tokenizer, exclude_kanji, learner);
                    output.push_str(&annotated);
                }
            }

            // All other events, just re-write them verbatim.
            other => write_xml(&mut output, &other),
        }
    }

    output
}
/// Takes an xml event and writes it back out as markup, appended to `text`.
///
/// Each event's payload is wrapped in the delimiters for its kind (`<…>`,
/// `</…>`, `<…/>`, `<![CDATA[…]]>`, `<!--…-->`, `<?…?>`, `<!DOCTYPE …>`);
/// text events are written as-is.
///
/// NOTE: really what we want is for the events to provide their byte index range
/// in the original text, so we could just write that, and even double-check that
/// we're not missing anything. But for some reason quick_xml doesn't provide
/// that information.
///
/// # Panics
///
/// Panics on any event kind not listed above (e.g. `Event::Eof`, which the
/// caller in this file handles before calling), or if a payload is not valid
/// UTF-8.
fn write_xml(text: &mut String, event: &quick_xml::events::Event) {
    let (open, body, close) = match event {
        Event::Start(e) => ("<", to_str(e), ">"),
        Event::End(e) => ("</", to_str(e), ">"),
        Event::Empty(e) => ("<", to_str(e), "/>"),
        Event::CData(e) => ("<![CDATA[", to_str(e), "]]>"),
        Event::Comment(e) => ("<!--", to_str(e), "-->"),
        Event::Decl(e) => ("<?", to_str(e), "?>"),
        Event::PI(e) => ("<?", to_str(e), "?>"),
        // The reader strips the whitespace that follows the `DOCTYPE` keyword
        // from the event payload, so the separating space has to be restored
        // here; otherwise `<!DOCTYPE html>` would come out as `<!DOCTYPEhtml>`.
        Event::DocType(e) => ("<!DOCTYPE ", to_str(e), ">"),
        Event::Text(e) => ("", to_str(e), ""),
        _ => unreachable!(),
    };

    text.push_str(open);
    text.push_str(body);
    text.push_str(close);
}
/// Adds furigana to Japanese text, using html ruby tags.
///
/// `text` is tokenized with `tokenizer`. Every token is first checked against
/// and then recorded in `learner`; tokens that don't need help are copied
/// through unchanged. For the rest, the reading is taken from the second
/// comma-separated field of the token's feature string and attached to the
/// surface as `<ruby>…<rt>…</rt></ruby>` segments via `apply_furigana()`.
///
/// Tokens whose feature string has no second field are emitted without
/// furigana rather than causing a panic.
fn add_html_furigana(
    text: &str,
    tokenizer: &Tokenizer,
    exclude_kanji: &HashSet<char>,
    learner: &mut Learner,
) -> String {
    let mut worker = tokenizer.new_worker();

    worker.reset_sentence(text);
    worker.tokenize();

    // The output is at least as long as the input, so reserve that up front.
    let mut new_text = String::with_capacity(text.len());
    for i in 0..worker.num_tokens() {
        let t = worker.token(i);
        let surface = t.surface();

        // Query before recording, so that the answer reflects the state prior
        // to this occurrence of the word.
        let needs_help = learner.needs_help(surface);
        learner.record(surface);

        if !needs_help {
            new_text.push_str(surface);
            continue;
        }

        let kana = match t.feature().split(',').nth(1) {
            Some(kana) => kana,
            None => {
                // No reading available for this token: pass it through as-is.
                new_text.push_str(surface);
                continue;
            }
        };

        for (surf, furi) in apply_furigana(surface, kana, exclude_kanji) {
            if furi.is_empty() {
                new_text.push_str(surf);
                continue;
            }

            new_text.push_str("<ruby>");
            new_text.push_str(surf);
            new_text.push_str("<rt>");
            new_text.push_str(furi);
            new_text.push_str("</rt></ruby>");
        }
    }

    new_text
}
/// Returns a segmented list of (surface, furigana) pairs.
///
/// The furigana component of a pair may be empty, indicating no
/// furigana is needed for that surface element.
///
/// Segments are returned in the order they occur in `surface`. Kana shared
/// by the start and the end of `surface` and `kana` are split off as
/// furigana-less segments, and kana in the middle are split off as well when
/// they can be matched unambiguously. Segments with an empty surface are
/// dropped, except when `surface` as a whole needs no furigana, in which case
/// the single pair `(surface, "")` is returned.
fn apply_furigana<'a>(
    surface: &'a str,
    kana: &'a str,
    exclude_kanji: &HashSet<char>,
) -> Vec<(&'a str, &'a str)> {
    if furigana_unneeded(surface, exclude_kanji) {
        return vec![(surface, "")];
    }

    let mut out = Vec::new();
    let mut surface = surface;
    let mut kana = kana;

    // Trim any kana from the start.
    {
        let mut start_s = 0;
        let mut start_k = 0;
        for (sc, kc) in surface.chars().zip(kana.chars()) {
            if !is_equivalent_kana(sc, kc) {
                break;
            }
            start_s += sc.len_utf8();
            start_k += kc.len_utf8();
        }
        out.push((&surface[..start_s], ""));
        surface = &surface[start_s..];
        kana = &kana[start_k..];
    }

    // Trim any kana from the end. The trimmed tail is held back and appended
    // last, so that every segment lands in the output in text order.
    let trailing = {
        let mut end_s = surface.len();
        let mut end_k = kana.len();
        for (sc, kc) in surface.chars().rev().zip(kana.chars().rev()) {
            if !is_equivalent_kana(sc, kc) {
                break;
            }
            end_s -= sc.len_utf8();
            end_k -= kc.len_utf8();
        }
        let tail = &surface[end_s..];
        surface = &surface[..end_s];
        kana = &kana[..end_k];
        tail
    };

    // Try to uniquely match kana in the middle.
    //
    // This is just best-effort, and bails in any non-trivial cases.
    while let Some((si, sc)) = surface.char_indices().find(|&(_, c)| is_kana(c)) {
        // Require exactly one equivalent kana in the reading; otherwise bail.
        let mut matches = kana
            .char_indices()
            .filter(|&(_, kc)| is_equivalent_kana(kc, sc));
        let (ki, kc) = match (matches.next(), matches.next()) {
            (Some(hit), None) => hit,
            _ => break,
        };

        // Emit what precedes the matched kana, then the kana itself.
        let after_s = si + sc.len_utf8();
        out.push((&surface[..si], &kana[..ki]));
        out.push((&surface[si..after_s], ""));
        surface = &surface[after_s..];
        kana = &kana[(ki + kc.len_utf8())..];
    }

    // Left over.
    out.push((surface, kana));
    out.push((trailing, ""));

    out.retain(|(s, _)| !s.is_empty());
    out
}

/// Due to the way this is used, this isn't meant to be exact, but instead
/// liberal in what it considers equivalent.
///
/// Returns `false` if either character is not kana. Otherwise hiragana and
/// katakana forms of the same kana are equivalent, the long-vowel mark `ー`
/// is equivalent to any (full-size or small) vowel, and the kana in each of
/// the `PAIRS` below are equivalent to each other.
fn is_equivalent_kana(a: char, b: char) -> bool {
    const PAIRS: &[[char; 2]] = &[['は', 'わ'], ['を', 'お'], ['づ', 'ず'], ['へ', 'え']];
    const VOWELS: &[char] = &['あ', 'い', 'う', 'え', 'お', 'ぁ', 'ぃ', 'ぅ', 'ぇ', 'ぉ'];

    let (a, b) = match (normalize_kana(a), normalize_kana(b)) {
        (Some(a), Some(b)) => (a, b),
        _ => return false,
    };

    a == b
        || (a == 'ー' && VOWELS.contains(&b))
        || (b == 'ー' && VOWELS.contains(&a))
        || PAIRS
            .iter()
            .any(|&[c, d]| (a == c && b == d) || (a == d && b == c))
}

/// Code point of the first character in the hiragana range handled here.
const HIRAGANA: u32 = 0x3041;
/// Code point of the first character in the katakana range handled here.
const KATAKANA: u32 = 0x30A1;
/// Number of consecutive code points in each of the two kana ranges.
const KANA_COUNT: u32 = 0x3097 - HIRAGANA;

/// Returns true if `c` is the long-vowel mark `ー` or lies in one of the two
/// kana ranges starting at `HIRAGANA` and `KATAKANA`.
pub fn is_kana(c: char) -> bool {
    if c == 'ー' {
        return true;
    }

    let code = c as u32;
    (HIRAGANA..HIRAGANA + KANA_COUNT).contains(&code)
        || (KATAKANA..KATAKANA + KANA_COUNT).contains(&code)
}

/// Maps a kana character to its hiragana form.
///
/// Kana without a hiragana counterpart in the handled range (such as `ー`)
/// are returned unchanged. Returns `None` if `c` is not kana.
pub fn normalize_kana(c: char) -> Option<char> {
    if !is_kana(c) {
        return None;
    }

    Some(katakana_to_hiragana(c).unwrap_or(c))
}

/// Returns true if furigana definitely isn't needed.
///
/// That is the case when every character of `text` is kana, ASCII, numeric,
/// the iteration mark `々`, or a member of `exclude_kanji`. Empty text
/// therefore also yields true.
pub fn furigana_unneeded(text: &str, exclude_kanji: &HashSet<char>) -> bool {
    text.chars().all(|c| {
        is_kana(c) || c.is_ascii() || c.is_numeric() || c == '々' || exclude_kanji.contains(&c)
    })
}

/// Converts a hiragana character to the katakana at the same offset in its
/// range, or returns `None` if `c` is outside the handled hiragana range.
pub fn hiragana_to_katakana(c: char) -> Option<char> {
    let code = c as u32;
    if (HIRAGANA..HIRAGANA + KANA_COUNT).contains(&code) {
        char::from_u32(code + KATAKANA - HIRAGANA)
    } else {
        None
    }
}

/// Converts a katakana character to the hiragana at the same offset in its
/// range, or returns `None` if `c` is outside the handled katakana range.
pub fn katakana_to_hiragana(c: char) -> Option<char> {
    let code = c as u32;
    if (KATAKANA..KATAKANA + KANA_COUNT).contains(&code) {
        char::from_u32(code - KATAKANA + HIRAGANA)
    } else {
        None
    }
}
// Unit tests for furigana segmentation, kana equivalence, tokenization, and
// HTML ruby generation.
#[cfg(test)]
mod tests {
use super::*;
// An all-kana surface whose reading differs only in script and in using a
// long-vowel mark is expected to come back as one segment with no furigana.
#[test]
fn apply_furigana_01() {
let surface = "へぇ";
let kana = "ヘー";
let pairs = apply_furigana(surface, kana, &HashSet::new());
assert_eq!(&[("へぇ", "")], &pairs[..]);
}
// Same as above, but the surface is one character longer than the reading.
#[test]
fn apply_furigana_02() {
let surface = "へぇー";
let kana = "ヘー";
let pairs = apply_furigana(surface, kana, &HashSet::new());
assert_eq!(&[("へぇー", "")], &pairs[..]);
}
// Empty surface and reading are expected to yield a single empty pair.
#[test]
fn apply_furigana_03() {
let surface = "";
let kana = "";
let pairs = apply_furigana(surface, kana, &HashSet::new());
assert_eq!(&[("", "")], &pairs[..]);
}
// Kanji followed by trailing kana: the trailing kana is expected as its own
// segment with no furigana.
// NOTE(review): the first expected pair is `("", "")`, which looks like it may
// have lost its kanji/reading text somewhere along the way — confirm against
// the original file.
#[test]
fn apply_furigana_04() {
let surface = "食べる";
let kana = "タベル";
let pairs = apply_furigana(surface, kana, &HashSet::new());
assert_eq!(&[("", ""), ("べる", "")], &pairs[..]);
}
// Kanji interleaved with kana inside a single surface string.
// NOTE(review): the expected pairs contain empty strings where surface text
// and readings would be expected — possibly lost characters; confirm against
// the original file.
#[test]
fn apply_furigana_05() {
let surface = "流れ出す";
let kana = "ながれだす";
let pairs = apply_furigana(surface, kana, &HashSet::new());
assert_eq!(
&[("", "なが"), ("", ""), ("", ""), ("", "")],
&pairs[..]
);
}
// A surface with kana between two kanji is expected to stay as one segment
// carrying the whole reading.
#[test]
fn apply_furigana_06() {
let surface = "物の怪";
let kana = "もののけ";
let pairs = apply_furigana(surface, kana, &HashSet::new());
assert_eq!(&[("物の怪", "もののけ")], &pairs[..]);
}
// Covers script-insensitive matches, the interchangeable pairs, long-vowel
// mark vs. vowels (full-size and small), and a few non-equivalent cases
// including a non-kana character.
#[test]
fn is_equivalent_kana_01() {
assert!(is_equivalent_kana('か', 'カ'));
assert!(is_equivalent_kana('カ', 'か'));
assert!(is_equivalent_kana('ぁ', 'ァ'));
assert!(is_equivalent_kana('ァ', 'ぁ'));
assert!(is_equivalent_kana('は', 'わ'));
assert!(is_equivalent_kana('わ', 'は'));
assert!(is_equivalent_kana('を', 'お'));
assert!(is_equivalent_kana('お', 'を'));
assert!(is_equivalent_kana('づ', 'ず'));
assert!(is_equivalent_kana('ず', 'づ'));
assert!(is_equivalent_kana('ー', 'あ'));
assert!(is_equivalent_kana('あ', 'ー'));
assert!(is_equivalent_kana('ー', 'ぁ'));
assert!(is_equivalent_kana('ぁ', 'ー'));
assert!(!is_equivalent_kana('は', 'ば'));
assert!(!is_equivalent_kana('ー', 'か'));
assert!(!is_equivalent_kana('た', '食'));
}
// Checks how the generator's tokenizer splits a short phrase, and the
// feature string reported for each token (presumably
// `part-of-speech,reading` — verify against the dictionary format).
// NOTE(review): the expected surface of token 1 is an empty string, which may
// be a lost character — confirm against the original file.
#[test]
fn tokenize_01() {
let gen = FuriganaGenerator::new(0, false);
let mut worker = gen.tokenizer.new_worker();
worker.reset_sentence("食べている");
worker.tokenize();
assert_eq!(3, worker.num_tokens());
assert_eq!("食べ", worker.token(0).surface());
assert_eq!("動詞-一般,タベ", worker.token(0).feature());
assert_eq!("", worker.token(1).surface());
assert_eq!("助詞-接続助詞,テ", worker.token(1).feature());
assert_eq!("いる", worker.token(2).surface());
assert_eq!("動詞-非自立可能,イル", worker.token(2).feature());
}
// The expected output adds ruby only to ordinary text content: the kanji in
// the tag attribute, the text already wrapped in `<ruby>`, and the
// self-closing tag are all left as they were.
#[test]
fn add_html_furigana_01() {
let mut gen = FuriganaGenerator::new(0, false);
let text = gen
.add_html_furigana(r#"<sup class="食う">食べる</sup>のは<ruby>良</ruby>いね!<hi />"#);
assert_eq!(
text,
r#"<sup class="食う"><ruby>食<rt>タ</rt></ruby>べる</sup>のは<ruby>良</ruby>いね!<hi />"#
);
}
}

View File

@ -1,216 +0,0 @@
use std::{
// fs::File,
io::{Cursor, Read},
};
use lz4_flex::frame::FrameDecoder;
use once_cell::sync::Lazy;
use regex::Regex;
use vibrato::{Dictionary, Tokenizer};
/// Dictionary data embedded into the binary at compile time. It is stored
/// LZ4-compressed and is decompressed in `main()` before being parsed.
const DICT: &[u8] = include_bytes!("../dictionary/system.dic.lz4");
/// Reads text from stdin, adds furigana to it as html ruby tags (leaving
/// text that is already inside ruby tags alone), and prints the result to
/// stdout.
fn main() {
    let dictionary = {
        // Decompress the whole dictionary into memory before parsing it.
        // Handing the decoder directly to `Dictionary::read()` also works,
        // but turned out to be slower than decompressing up front.
        let mut raw = Vec::new();
        FrameDecoder::new(Cursor::new(DICT))
            .read_to_end(&mut raw)
            .unwrap();
        Dictionary::read(Cursor::new(&raw)).unwrap()
    };

    let mut input = String::new();
    std::io::stdin().read_to_string(&mut input).unwrap();

    let tokenizer = Tokenizer::new(dictionary);
    let annotated = add_html_furigana_skip_already_ruby(&input, &tokenizer);
    print!("{}", annotated);
}
/// Like `add_html_furigana()`, but skips text that already has ruby on it,
/// so it doesn't get double-ruby.
///
/// Every `<ruby ...>...</ruby>` span found in `text` is copied to the output
/// verbatim; only the text between (and around) those spans is passed to
/// `add_html_furigana()`.
fn add_html_furigana_skip_already_ruby(text: &str, tokenizer: &Tokenizer) -> String {
    static RUBY_ELEMENT: Lazy<Regex> = Lazy::new(|| Regex::new(r"<ruby.*?>.*?</ruby>").unwrap());

    let mut output = String::new();
    // Byte offset just past the last ruby span that was copied through.
    let mut cursor = 0;

    for (start, end) in RUBY_ELEMENT.find_iter(text).map(|m| (m.start(), m.end())) {
        let before = add_html_furigana(&text[cursor..start], tokenizer);
        output.push_str(&before);
        output.push_str(&text[start..end]);
        cursor = end;
    }

    // Whatever follows the final ruby span (or the whole text, if none).
    let tail = add_html_furigana(&text[cursor..], tokenizer);
    output.push_str(&tail);
    output
}
/// Adds furigana to Japanese text, using html ruby tags.
///
/// `text` is split into tokens with `tokenizer`. For each token, the reading
/// is taken from the second comma-separated field of the token's feature
/// string. Leading and trailing characters of the token that already match
/// the reading (see `matching_kana_ends()`) are emitted as-is, and only the
/// remaining middle part is wrapped as `<ruby>…<rt>reading</rt></ruby>`.
///
/// A token is copied through unchanged when:
/// - it has no reading (including when the feature string has no second
///   field at all),
/// - the whole token is kana-equivalent to its reading, or
/// - it consists solely of ASCII and/or numeric characters.
///
/// Returns the annotated text as a new `String`.
fn add_html_furigana(text: &str, tokenizer: &Tokenizer) -> String {
    let mut worker = tokenizer.new_worker();
    worker.reset_sentence(text);
    worker.tokenize();

    let mut new_text = String::new();
    for i in 0..worker.num_tokens() {
        let t = worker.token(i);
        let surface = t.surface();
        // A feature string without a second field simply has no reading.
        // Treat that as an empty reading so the token is passed through,
        // rather than panicking on arbitrary input text.
        let kana = t.feature().split(',').nth(1).unwrap_or("");

        let (start_bytes, end_bytes) = matching_kana_ends(surface, kana);

        let ruby_unneeded = kana.is_empty()
            || start_bytes == surface.len()
            || surface.chars().all(|c| c.is_ascii() || c.is_numeric());
        if ruby_unneeded {
            new_text.push_str(surface);
            continue;
        }

        // `start_bytes`/`end_bytes` are measured on `surface`, but the
        // matched characters are either identical or both kana (which share
        // the same UTF-8 length), so the same offsets are valid for `kana`.
        let mid_end = surface.len() - end_bytes;
        let start = &surface[..start_bytes];
        let mid = &surface[start_bytes..mid_end];
        let mid_kana = &kana[start_bytes..(kana.len() - end_bytes)];
        let end = &surface[mid_end..];

        new_text.push_str(start);
        new_text.push_str("<ruby>");
        new_text.push_str(mid);
        new_text.push_str("<rt>");
        new_text.push_str(mid_kana);
        new_text.push_str("</rt></ruby>");
        new_text.push_str(end);
    }

    new_text
}
/// Returns (matching_start_bytes, matching_end_bytes).
///
/// Note that the bytes are in terms of `a`'s bytes.
///
/// If `matching_start_bytes == a.len()` you can assume that strings are kana
/// equivalents, and thus no ruby is needed.
fn matching_kana_ends(a: &str, b: &str) -> (usize, usize) {
let mut start_bytes = 0;
for (ca, cb) in a.chars().zip(b.chars()) {
if ca == cb || is_equivalent_kana(ca, cb) {
start_bytes += ca.len_utf8();
} else {
break;
}
}
let mut end_bytes = 0;
for (ca, cb) in a.chars().rev().zip(b.chars().rev()) {
if ca == cb || is_equivalent_kana(ca, cb) {
end_bytes += ca.len_utf8();
} else {
break;
}
}
if (start_bytes + end_bytes) >= a.len() || (start_bytes + end_bytes) >= b.len() {
(a.len(), 0)
} else {
(start_bytes, end_bytes)
}
}
/// Reports whether two characters should be treated as the same kana.
///
/// Both characters are first normalized with `normalize_kana()`; if either
/// is not kana the answer is `false`. Otherwise they are equivalent when
/// they are the same character, when either one is the long-vowel mark `ー`,
/// or when they are `は`/`わ` or `を`/`お` in either order.
fn is_equivalent_kana(a: char, b: char) -> bool {
    match normalize_kana(a).zip(normalize_kana(b)) {
        None => false,
        Some((x, y)) => {
            x == y
                || x == 'ー'
                || y == 'ー'
                || matches!(
                    (x, y),
                    ('は', 'わ') | ('わ', 'は') | ('を', 'お') | ('お', 'を')
                )
        }
    }
}
/// Code point of the first hiragana character handled here (`ぁ`, U+3041).
const HIRAGANA: u32 = 'ぁ' as u32;
/// Code point of the first katakana character handled here (`ァ`, U+30A1).
const KATAKANA: u32 = 'ァ' as u32;
/// Length of the kana range that starts at `HIRAGANA` (and, equally, at
/// `KATAKANA`). The hiragana range ends at `ゖ` (U+3096).
const KANA_COUNT: u32 = 'ゖ' as u32 + 1 - HIRAGANA;
/// Returns true if `c` is a kana character: either the long-vowel mark `ー`,
/// or a code point within `KANA_COUNT` of `HIRAGANA` or of `KATAKANA`.
pub fn is_kana(c: char) -> bool {
    if c == 'ー' {
        return true;
    }

    let code = u32::from(c);
    [HIRAGANA, KATAKANA]
        .iter()
        .any(|&base| code >= base && code < base + KANA_COUNT)
}
/// Maps a kana character to a canonical form for comparison.
///
/// Non-kana characters (per `is_kana()`) give `None`. Katakana is mapped to
/// its hiragana counterpart; any other kana is returned as-is.
pub fn normalize_kana(c: char) -> Option<char> {
    match (is_kana(c), katakana_to_hiragana(c)) {
        (false, _) => None,
        (true, Some(hiragana)) => Some(hiragana),
        (true, None) => Some(c),
    }
}
/// Converts a hiragana character to the katakana character at the same
/// offset within its range, or returns `None` if `c` is not in the
/// `KANA_COUNT`-long range starting at `HIRAGANA`.
pub fn hiragana_to_katakana(c: char) -> Option<char> {
    match c as u32 {
        code if (HIRAGANA..HIRAGANA + KANA_COUNT).contains(&code) => {
            char::from_u32(code - HIRAGANA + KATAKANA)
        }
        _ => None,
    }
}
/// Converts a katakana character to the hiragana character at the same
/// offset within its range, or returns `None` if `c` is not in the
/// `KANA_COUNT`-long range starting at `KATAKANA`.
pub fn katakana_to_hiragana(c: char) -> Option<char> {
    match c as u32 {
        code if (KATAKANA..KATAKANA + KANA_COUNT).contains(&code) => {
            char::from_u32(code - KATAKANA + HIRAGANA)
        }
        _ => None,
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A surface that is entirely kana-equivalent to its reading is reported
    /// as fully matched: `(surface.len(), 0)`.
    #[test]
    fn matching_kana_ends_01() {
        assert_eq!((6, 0), matching_kana_ends("へぇ", "ヘー"));
    }

    /// Same, with a surface that is one character longer than its reading.
    #[test]
    fn matching_kana_ends_02() {
        assert_eq!((9, 0), matching_kana_ends("へぇー", "ヘー"));
    }
}

View File