Turn into a "proper" library.
This commit is contained in:
parent
158511b3aa
commit
bff8bbc89c
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -1,2 +1,3 @@
|
|||
Cargo.lock
|
||||
/target
|
||||
/test_text
|
||||
|
|
791
Cargo.lock
generated
791
Cargo.lock
generated
|
@ -1,791 +0,0 @@
|
|||
# This file is automatically @generated by Cargo.
|
||||
# It is not intended for manual editing.
|
||||
version = 3
|
||||
|
||||
[[package]]
|
||||
name = "ahash"
|
||||
version = "0.7.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "891477e0c6a8957309ee5c45a6368af3ae14bb510732d2684ffa19af310920f9"
|
||||
dependencies = [
|
||||
"getrandom",
|
||||
"once_cell",
|
||||
"version_check",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "aho-corasick"
|
||||
version = "1.1.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "anyhow"
|
||||
version = "1.0.86"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b3d1d046238990b9cf5bcde22a3fb3584ee5cf65fb2765f454ed428c7a0063da"
|
||||
|
||||
[[package]]
|
||||
name = "argmin"
|
||||
version = "0.7.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5698c8cd3510117a4e6b96749a8061ba7dce1a19578ce4ecdb12dd36d94a7f8d"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"argmin-math",
|
||||
"bincode 1.3.3",
|
||||
"instant",
|
||||
"num-traits",
|
||||
"paste",
|
||||
"rand",
|
||||
"rand_xoshiro",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"slog",
|
||||
"slog-async",
|
||||
"slog-json",
|
||||
"slog-term",
|
||||
"thiserror",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "argmin-math"
|
||||
version = "0.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "75f2b0dada81340718682df780c9a696b090b6ef7e83c3dcc770af6de9302995"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"cfg-if",
|
||||
"num-complex",
|
||||
"num-integer",
|
||||
"num-traits",
|
||||
"rand",
|
||||
"thiserror",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "autocfg"
|
||||
version = "1.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0c4b4d0bd25bd0b74681c0ad21497610ce1b7c91b1022cd21c80c6fbdd9476b0"
|
||||
|
||||
[[package]]
|
||||
name = "bincode"
|
||||
version = "1.3.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad"
|
||||
dependencies = [
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "bincode"
|
||||
version = "2.0.0-rc.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f11ea1a0346b94ef188834a65c068a03aec181c94896d481d7a0a40d85b0ce95"
|
||||
dependencies = [
|
||||
"bincode_derive",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "bincode_derive"
|
||||
version = "2.0.0-rc.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7e30759b3b99a1b802a7a3aa21c85c3ded5c28e1c83170d82d70f08bbf7f3e4c"
|
||||
dependencies = [
|
||||
"virtue",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "bitflags"
|
||||
version = "2.6.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de"
|
||||
|
||||
[[package]]
|
||||
name = "byteorder"
|
||||
version = "1.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
|
||||
|
||||
[[package]]
|
||||
name = "cfg-if"
|
||||
version = "1.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
|
||||
|
||||
[[package]]
|
||||
name = "crawdad"
|
||||
version = "0.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "87fbd1ecd2ed790e11c8fbe034f9b3e7687404818d1bdfd8218d26ec645ec7c5"
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-channel"
|
||||
version = "0.5.13"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "33480d6946193aa8033910124896ca395333cae7e2d1113d1fef6c3272217df2"
|
||||
dependencies = [
|
||||
"crossbeam-utils",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-utils"
|
||||
version = "0.8.20"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80"
|
||||
|
||||
[[package]]
|
||||
name = "csv-core"
|
||||
version = "0.1.11"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5efa2b3d7902f4b634a20cae3c9c4e6209dc4779feb6863329607560143efa70"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "deranged"
|
||||
version = "0.3.11"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b42b6fa04a440b495c8b04d0e71b707c585f83cb9cb28cf8cd0d976c315e31b4"
|
||||
dependencies = [
|
||||
"powerfmt",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "dirs-next"
|
||||
version = "2.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b98cf8ebf19c3d1b223e151f99a4f9f0690dca41414773390fc824184ac833e1"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"dirs-sys-next",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "dirs-sys-next"
|
||||
version = "0.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4ebda144c4fe02d1f7ea1a7d9641b6fc6b580adcfa024ae48797ecdeb6825b4d"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"redox_users",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "furigana_gen"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"lz4_flex",
|
||||
"once_cell",
|
||||
"regex",
|
||||
"vibrato",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "getrandom"
|
||||
version = "0.2.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"libc",
|
||||
"wasi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hashbrown"
|
||||
version = "0.12.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888"
|
||||
dependencies = [
|
||||
"ahash",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hermit-abi"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fbf6a919d6cf397374f7dfeeea91d974c7c0a7221d0d0f4f20d859d329e53fcc"
|
||||
|
||||
[[package]]
|
||||
name = "instant"
|
||||
version = "0.1.13"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e0242819d153cba4b4b05a5a8f2a7e9bbf97b6055b2a002b395c96b5ff3c0222"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "is-terminal"
|
||||
version = "0.4.13"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "261f68e344040fbd0edea105bef17c66edf46f984ddb1115b775ce31be948f4b"
|
||||
dependencies = [
|
||||
"hermit-abi",
|
||||
"libc",
|
||||
"windows-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "itoa"
|
||||
version = "1.0.11"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b"
|
||||
|
||||
[[package]]
|
||||
name = "libc"
|
||||
version = "0.2.158"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d8adc4bb1803a324070e64a98ae98f38934d91957a99cfb3a43dcbc01bc56439"
|
||||
|
||||
[[package]]
|
||||
name = "libredox"
|
||||
version = "0.1.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c0ff37bd590ca25063e35af745c343cb7a0271906fb7b37e4813e8f79f00268d"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lz4_flex"
|
||||
version = "0.11.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "75761162ae2b0e580d7e7c390558127e5f01b4194debd6221fd8c207fc80e3f5"
|
||||
dependencies = [
|
||||
"twox-hash",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "memchr"
|
||||
version = "2.7.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
|
||||
|
||||
[[package]]
|
||||
name = "num-complex"
|
||||
version = "0.4.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495"
|
||||
dependencies = [
|
||||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num-conv"
|
||||
version = "0.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9"
|
||||
|
||||
[[package]]
|
||||
name = "num-integer"
|
||||
version = "0.1.46"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f"
|
||||
dependencies = [
|
||||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num-traits"
|
||||
version = "0.2.19"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "once_cell"
|
||||
version = "1.19.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92"
|
||||
|
||||
[[package]]
|
||||
name = "paste"
|
||||
version = "1.0.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a"
|
||||
|
||||
[[package]]
|
||||
name = "powerfmt"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391"
|
||||
|
||||
[[package]]
|
||||
name = "ppv-lite86"
|
||||
version = "0.2.20"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04"
|
||||
dependencies = [
|
||||
"zerocopy",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro2"
|
||||
version = "1.0.86"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77"
|
||||
dependencies = [
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quote"
|
||||
version = "1.0.37"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand"
|
||||
version = "0.8.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"rand_chacha",
|
||||
"rand_core",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_chacha"
|
||||
version = "0.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
|
||||
dependencies = [
|
||||
"ppv-lite86",
|
||||
"rand_core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_core"
|
||||
version = "0.6.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
|
||||
dependencies = [
|
||||
"getrandom",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_xoshiro"
|
||||
version = "0.6.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6f97cdb2a36ed4183de61b2f824cc45c9f1037f28afe0a322e9fff4c108b5aaa"
|
||||
dependencies = [
|
||||
"rand_core",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "redox_users"
|
||||
version = "0.4.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ba009ff324d1fc1b900bd1fdb31564febe58a8ccc8a6fdbb93b543d33b13ca43"
|
||||
dependencies = [
|
||||
"getrandom",
|
||||
"libredox",
|
||||
"thiserror",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex"
|
||||
version = "1.10.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4219d74c6b67a3654a9fbebc4b419e22126d13d2f3c4a07ee0cb61ff79a79619"
|
||||
dependencies = [
|
||||
"aho-corasick",
|
||||
"memchr",
|
||||
"regex-automata",
|
||||
"regex-syntax",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex-automata"
|
||||
version = "0.4.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "38caf58cc5ef2fed281f89292ef23f6365465ed9a41b7a7754eb4e26496c92df"
|
||||
dependencies = [
|
||||
"aho-corasick",
|
||||
"memchr",
|
||||
"regex-syntax",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex-syntax"
|
||||
version = "0.8.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b"
|
||||
|
||||
[[package]]
|
||||
name = "rucrf"
|
||||
version = "0.3.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "640271497e95c3a3a9502740187b5fc3a19485cad172e7be9a2fd8d86ffc1c28"
|
||||
dependencies = [
|
||||
"argmin",
|
||||
"argmin-math",
|
||||
"bincode 2.0.0-rc.3",
|
||||
"crossbeam-channel",
|
||||
"hashbrown",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustversion"
|
||||
version = "1.0.17"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "955d28af4278de8121b7ebeb796b6a45735dc01436d898801014aced2773a3d6"
|
||||
|
||||
[[package]]
|
||||
name = "ryu"
|
||||
version = "1.0.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f"
|
||||
|
||||
[[package]]
|
||||
name = "serde"
|
||||
version = "1.0.209"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "99fce0ffe7310761ca6bf9faf5115afbc19688edd00171d81b1bb1b116c63e09"
|
||||
dependencies = [
|
||||
"serde_derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_derive"
|
||||
version = "1.0.209"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a5831b979fd7b5439637af1752d535ff49f4860c0f341d1baeb6faf0f4242170"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_json"
|
||||
version = "1.0.127"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8043c06d9f82bd7271361ed64f415fe5e12a77fdb52e573e7f06a516dea329ad"
|
||||
dependencies = [
|
||||
"itoa",
|
||||
"memchr",
|
||||
"ryu",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "slog"
|
||||
version = "2.7.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8347046d4ebd943127157b94d63abb990fcf729dc4e9978927fdf4ac3c998d06"
|
||||
|
||||
[[package]]
|
||||
name = "slog-async"
|
||||
version = "2.8.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "72c8038f898a2c79507940990f05386455b3a317d8f18d4caea7cbc3d5096b84"
|
||||
dependencies = [
|
||||
"crossbeam-channel",
|
||||
"slog",
|
||||
"take_mut",
|
||||
"thread_local",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "slog-json"
|
||||
version = "2.6.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3e1e53f61af1e3c8b852eef0a9dee29008f55d6dd63794f3f12cef786cf0f219"
|
||||
dependencies = [
|
||||
"serde",
|
||||
"serde_json",
|
||||
"slog",
|
||||
"time",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "slog-term"
|
||||
version = "2.9.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b6e022d0b998abfe5c3782c1f03551a596269450ccd677ea51c56f8b214610e8"
|
||||
dependencies = [
|
||||
"is-terminal",
|
||||
"slog",
|
||||
"term",
|
||||
"thread_local",
|
||||
"time",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "static_assertions"
|
||||
version = "1.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"
|
||||
|
||||
[[package]]
|
||||
name = "syn"
|
||||
version = "2.0.77"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9f35bcdf61fd8e7be6caf75f429fdca8beb3ed76584befb503b1569faee373ed"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "take_mut"
|
||||
version = "0.2.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f764005d11ee5f36500a149ace24e00e3da98b0158b3e2d53a7495660d3f4d60"
|
||||
|
||||
[[package]]
|
||||
name = "term"
|
||||
version = "0.7.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c59df8ac95d96ff9bede18eb7300b0fda5e5d8d90960e76f8e14ae765eedbf1f"
|
||||
dependencies = [
|
||||
"dirs-next",
|
||||
"rustversion",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "thiserror"
|
||||
version = "1.0.63"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c0342370b38b6a11b6cc11d6a805569958d54cfa061a29969c3b5ce2ea405724"
|
||||
dependencies = [
|
||||
"thiserror-impl",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "thiserror-impl"
|
||||
version = "1.0.63"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a4558b58466b9ad7ca0f102865eccc95938dca1a74a856f2b57b6629050da261"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "thread_local"
|
||||
version = "1.1.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8b9ef9bad013ada3808854ceac7b46812a6465ba368859a37e2100283d2d719c"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"once_cell",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "time"
|
||||
version = "0.3.36"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5dfd88e563464686c916c7e46e623e520ddc6d79fa6641390f2e3fa86e83e885"
|
||||
dependencies = [
|
||||
"deranged",
|
||||
"itoa",
|
||||
"num-conv",
|
||||
"powerfmt",
|
||||
"serde",
|
||||
"time-core",
|
||||
"time-macros",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "time-core"
|
||||
version = "0.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3"
|
||||
|
||||
[[package]]
|
||||
name = "time-macros"
|
||||
version = "0.2.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3f252a68540fde3a3877aeea552b832b40ab9a69e318efd078774a01ddee1ccf"
|
||||
dependencies = [
|
||||
"num-conv",
|
||||
"time-core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "twox-hash"
|
||||
version = "1.6.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "97fee6b57c6a41524a810daee9286c02d7752c4253064d0b05472833a438f675"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"static_assertions",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "unicode-ident"
|
||||
version = "1.0.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b"
|
||||
|
||||
[[package]]
|
||||
name = "version_check"
|
||||
version = "0.9.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a"
|
||||
|
||||
[[package]]
|
||||
name = "vibrato"
|
||||
version = "0.5.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "df95d99b268877738d8f644c38604c4c9c09950219b1d1a725ada94d62e98722"
|
||||
dependencies = [
|
||||
"bincode 2.0.0-rc.3",
|
||||
"crawdad",
|
||||
"csv-core",
|
||||
"hashbrown",
|
||||
"regex",
|
||||
"rucrf",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "virtue"
|
||||
version = "0.0.13"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9dcc60c0624df774c82a0ef104151231d37da4962957d691c011c852b2473314"
|
||||
|
||||
[[package]]
|
||||
name = "wasi"
|
||||
version = "0.11.0+wasi-snapshot-preview1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
|
||||
|
||||
[[package]]
|
||||
name = "winapi"
|
||||
version = "0.3.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
|
||||
dependencies = [
|
||||
"winapi-i686-pc-windows-gnu",
|
||||
"winapi-x86_64-pc-windows-gnu",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "winapi-i686-pc-windows-gnu"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
|
||||
|
||||
[[package]]
|
||||
name = "winapi-x86_64-pc-windows-gnu"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
|
||||
|
||||
[[package]]
|
||||
name = "windows-sys"
|
||||
version = "0.52.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
|
||||
dependencies = [
|
||||
"windows-targets",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows-targets"
|
||||
version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
|
||||
dependencies = [
|
||||
"windows_aarch64_gnullvm",
|
||||
"windows_aarch64_msvc",
|
||||
"windows_i686_gnu",
|
||||
"windows_i686_gnullvm",
|
||||
"windows_i686_msvc",
|
||||
"windows_x86_64_gnu",
|
||||
"windows_x86_64_gnullvm",
|
||||
"windows_x86_64_msvc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows_aarch64_gnullvm"
|
||||
version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
|
||||
|
||||
[[package]]
|
||||
name = "windows_aarch64_msvc"
|
||||
version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
|
||||
|
||||
[[package]]
|
||||
name = "windows_i686_gnu"
|
||||
version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
|
||||
|
||||
[[package]]
|
||||
name = "windows_i686_gnullvm"
|
||||
version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
|
||||
|
||||
[[package]]
|
||||
name = "windows_i686_msvc"
|
||||
version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
|
||||
|
||||
[[package]]
|
||||
name = "windows_x86_64_gnu"
|
||||
version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
|
||||
|
||||
[[package]]
|
||||
name = "windows_x86_64_gnullvm"
|
||||
version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
|
||||
|
||||
[[package]]
|
||||
name = "windows_x86_64_msvc"
|
||||
version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
|
||||
|
||||
[[package]]
|
||||
name = "zerocopy"
|
||||
version = "0.7.35"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0"
|
||||
dependencies = [
|
||||
"byteorder",
|
||||
"zerocopy-derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zerocopy-derive"
|
||||
version = "0.7.35"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
|
@ -3,10 +3,11 @@ name = "furigana_gen"
|
|||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
[lib]
|
||||
name = "furigana_gen"
|
||||
path = "src/lib.rs"
|
||||
|
||||
[dependencies]
|
||||
vibrato = "0.5"
|
||||
lz4_flex = "0.11"
|
||||
regex = "1.10"
|
||||
once_cell = "1.19"
|
||||
quick-xml = "0.36.1"
|
||||
|
|
25
build.rs
Normal file
25
build.rs
Normal file
|
@ -0,0 +1,25 @@
|
|||
// Generate the `KANJI_FREQ` table of kanji characters from `data/kanji_frequency.txt`.
|
||||
|
||||
use std::{env, fs::File, io::Write, path::Path};
|
||||
|
||||
const KANJI: &str = include_str!("data/kanji_frequency.txt");
|
||||
|
||||
fn main() {
|
||||
// Write traversal table to Rust file
|
||||
let out_dir = env::var("OUT_DIR").unwrap();
|
||||
let dest_path = Path::new(&out_dir).join("kanji_freq_inc.rs");
|
||||
let mut f = File::create(&dest_path).unwrap();
|
||||
|
||||
f.write_all("const KANJI_FREQ: &[char] = &[".as_bytes())
|
||||
.unwrap();
|
||||
|
||||
for c in KANJI.chars() {
|
||||
if c.is_whitespace() {
|
||||
continue;
|
||||
}
|
||||
|
||||
f.write_all(format!("\n'{}',", c).as_bytes()).unwrap();
|
||||
}
|
||||
|
||||
f.write_all("\n];".as_bytes()).unwrap();
|
||||
}
|
4001
data/kanji_frequency.txt
Normal file
4001
data/kanji_frequency.txt
Normal file
File diff suppressed because it is too large
Load Diff
Binary file not shown.
67
src/learner.rs
Normal file
67
src/learner.rs
Normal file
|
@ -0,0 +1,67 @@
|
|||
use std::collections::HashMap;
|
||||
|
||||
/// Initial `max_distance` for a newly seen word. Because `max_distance` only
/// ever grows, this is also its minimum.
const MIN_MAX_DISTANCE: usize = 100;

/// Upper bound that `max_distance` is clamped to after it grows.
const MAX_MAX_DISTANCE: usize = 10000;

/// Per-word bookkeeping kept by [`Learner`].
#[derive(Debug, Copy, Clone)]
struct WordStats {
    /// The last position (in words processed) that this word was seen at.
    last_seen_at: usize,

    /// How many times this word has been seen so far.
    times_seen: usize,

    /// Maximum distance (in words processed) since the last sighting before
    /// help is needed again.
    max_distance: usize,
}

/// Tracks which words have been seen, how often, and how recently, in order
/// to decide whether a word still needs help.
///
/// Positions and distances are measured in calls to [`Learner::record`]:
/// every recorded word advances the position by one.
#[derive(Debug, Clone)]
pub struct Learner {
    stats: HashMap<String, WordStats>,
    words_processed: usize,
    times_seen_threshold: usize,
}

impl Learner {
    /// Creates an empty learner.
    ///
    /// A word always needs help until it has been recorded more than
    /// `times_seen_threshold` times. Passing `usize::MAX` therefore makes
    /// every word need help forever.
    pub fn new(times_seen_threshold: usize) -> Self {
        Self {
            stats: HashMap::new(),
            words_processed: 0,
            times_seen_threshold,
        }
    }

    /// Records one occurrence of `word` and advances the position counter.
    ///
    /// The first sighting creates an entry with `max_distance` set to
    /// `MIN_MAX_DISTANCE`. On later sightings, once the word has been seen
    /// more than `times_seen_threshold` times and reappears within its
    /// current `max_distance`, that distance grows by the observed gap,
    /// capped at half the current value per step and at `MAX_MAX_DISTANCE`
    /// overall.
    pub fn record(&mut self, word: &str) {
        // Position of this occurrence. The counter is advanced up front so
        // the early return below cannot skip it.
        let now = self.words_processed;
        self.words_processed += 1;

        // Look up by `&str` first so that already-tracked words (the common
        // case) do not allocate a fresh `String` key on every call.
        let stats = match self.stats.get_mut(word) {
            Some(stats) => stats,
            None => {
                self.stats.insert(
                    word.to_string(),
                    WordStats {
                        last_seen_at: now,
                        times_seen: 1,
                        max_distance: MIN_MAX_DISTANCE,
                    },
                );
                return;
            }
        };

        let distance = now - stats.last_seen_at;
        stats.last_seen_at = now;
        stats.times_seen += 1;

        // Not seen often enough yet: leave `max_distance` untouched.
        if stats.times_seen <= self.times_seen_threshold {
            return;
        }

        // Seen again while still within `max_distance`: extend the distance,
        // by at most 50% of its current value.
        if distance < stats.max_distance {
            stats.max_distance += distance.min(stats.max_distance / 2);
        }

        stats.max_distance = stats.max_distance.min(MAX_MAX_DISTANCE);
    }

    /// Returns `true` if `word` currently needs help.
    ///
    /// That is the case when the word has never been recorded, has been seen
    /// no more than `times_seen_threshold` times, or was last recorded more
    /// than its `max_distance` words ago.
    pub fn needs_help(&self, word: &str) -> bool {
        match self.stats.get(word) {
            Some(stats) => {
                let distance = self.words_processed - stats.last_seen_at;
                stats.times_seen <= self.times_seen_threshold || distance > stats.max_distance
            }
            None => true,
        }
    }
}
|
516
src/lib.rs
Normal file
516
src/lib.rs
Normal file
|
@ -0,0 +1,516 @@
|
|||
mod learner;
|
||||
|
||||
use std::{
|
||||
collections::HashSet,
|
||||
// fs::File,
|
||||
io::{Cursor, Read},
|
||||
};
|
||||
|
||||
use lz4_flex::frame::FrameDecoder;
|
||||
use quick_xml::events::Event;
|
||||
use vibrato::{Dictionary, Tokenizer};
|
||||
|
||||
use learner::Learner;
|
||||
|
||||
// Include KANJI_FREQ, a frequency-ordered array of kanji characters.
|
||||
include!(concat!(env!("OUT_DIR"), "/kanji_freq_inc.rs"));
|
||||
|
||||
const DICT: &[u8] = include_bytes!("../data/dictionary/system.dic.lz4");
|
||||
|
||||
pub struct FuriganaGenerator {
|
||||
tokenizer: Tokenizer,
|
||||
exclude_kanji: HashSet<char>,
|
||||
learner: Learner,
|
||||
}
|
||||
|
||||
impl FuriganaGenerator {
|
||||
// `exclude_count`: exclude the N most frequent kanji from furigana.
|
||||
// Specifically, words made up *entirely* of those kanji will be excluded.
|
||||
// If a word has some kanji that aren't in that set, even if it also has
|
||||
// some that are, it will still get furigana.
|
||||
pub fn new(exclude_count: usize, learn_mode: bool) -> Self {
|
||||
let dict = {
|
||||
// Note: we could just pass the decoder straight to `Dictionary::read()`
|
||||
// below, and it would work. However, that ends up being slower than
|
||||
// first decompressing the whole thing ahead of time.
|
||||
let mut decoder = FrameDecoder::new(Cursor::new(DICT));
|
||||
let mut data = Vec::new();
|
||||
decoder.read_to_end(&mut data).unwrap();
|
||||
|
||||
Dictionary::read(Cursor::new(&data)).unwrap()
|
||||
};
|
||||
|
||||
let exclude_kanji = {
|
||||
let mut set = HashSet::new();
|
||||
for &c in KANJI_FREQ.iter().take(exclude_count) {
|
||||
set.insert(c);
|
||||
}
|
||||
set
|
||||
};
|
||||
|
||||
Self {
|
||||
tokenizer: Tokenizer::new(dict),
|
||||
exclude_kanji: exclude_kanji,
|
||||
learner: Learner::new(if learn_mode { 5 } else { usize::MAX }),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn add_html_furigana(&mut self, text: &str) -> String {
|
||||
add_html_furigana_skip_already_ruby(
|
||||
&text,
|
||||
&self.tokenizer,
|
||||
&self.exclude_kanji,
|
||||
&mut self.learner,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
/// Views the bytes behind `bytes` as a UTF-8 string slice.
///
/// Panics if the bytes are not valid UTF-8.
fn to_str<B: std::ops::Deref<Target = [u8]>>(bytes: &B) -> &str {
    let raw: &[u8] = &**bytes;
    std::str::from_utf8(raw).unwrap()
}
|
||||
|
||||
/// Like `add_html_furigana()`, but skips text that already has ruby on it, to it doesn't get double-ruby.
|
||||
fn add_html_furigana_skip_already_ruby(
|
||||
text: &str,
|
||||
tokenizer: &Tokenizer,
|
||||
exclude_kanji: &HashSet<char>,
|
||||
learner: &mut Learner,
|
||||
) -> String {
|
||||
let mut reader = quick_xml::Reader::from_str(text);
|
||||
|
||||
let mut new_text = String::new();
|
||||
let mut rubys: i32 = 0;
|
||||
|
||||
loop {
|
||||
match reader.read_event() {
|
||||
Err(e) => panic!("Error at position {}: {:?}", reader.error_position(), e),
|
||||
Ok(Event::Eof) => break,
|
||||
|
||||
Ok(Event::Start(e)) => {
|
||||
if e.name().into_inner() == b"ruby" {
|
||||
rubys += 1;
|
||||
}
|
||||
write_xml(&mut new_text, &Event::Start(e));
|
||||
}
|
||||
|
||||
Ok(Event::End(e)) => {
|
||||
if e.name().into_inner() == b"ruby" {
|
||||
rubys -= 1;
|
||||
}
|
||||
write_xml(&mut new_text, &Event::End(e));
|
||||
}
|
||||
|
||||
Ok(Event::Text(e)) => {
|
||||
if rubys <= 0 {
|
||||
new_text.push_str(&add_html_furigana(
|
||||
to_str(&e),
|
||||
tokenizer,
|
||||
exclude_kanji,
|
||||
learner,
|
||||
));
|
||||
} else {
|
||||
write_xml(&mut new_text, &Event::Text(e));
|
||||
}
|
||||
}
|
||||
|
||||
// All other events, just re-write them verbatim.
|
||||
Ok(e) => write_xml(&mut new_text, &e),
|
||||
}
|
||||
}
|
||||
|
||||
new_text
|
||||
}
|
||||
|
||||
/// Takes an xml event and writes it verbatim to the given string.
|
||||
///
|
||||
/// NOTE: really what we want is for the events to provide their byte index range
|
||||
/// in the original text, so we could just write that, and even double-check that
|
||||
/// we're not missing anything. But for some reason quick_xml doesn't provide
|
||||
/// that information.
|
||||
fn write_xml(text: &mut String, event: &quick_xml::events::Event) {
|
||||
match event {
|
||||
Event::Start(e) => {
|
||||
text.push_str("<");
|
||||
text.push_str(to_str(e));
|
||||
text.push_str(">");
|
||||
}
|
||||
|
||||
Event::End(e) => {
|
||||
text.push_str("</");
|
||||
text.push_str(to_str(e));
|
||||
text.push_str(">");
|
||||
}
|
||||
|
||||
Event::Empty(e) => {
|
||||
text.push_str("<");
|
||||
text.push_str(to_str(e));
|
||||
text.push_str("/>");
|
||||
}
|
||||
|
||||
Event::CData(e) => {
|
||||
text.push_str("<![CDATA[");
|
||||
text.push_str(to_str(e));
|
||||
text.push_str("]]>");
|
||||
}
|
||||
|
||||
Event::Comment(e) => {
|
||||
text.push_str("<!--");
|
||||
text.push_str(to_str(e));
|
||||
text.push_str("-->");
|
||||
}
|
||||
|
||||
Event::Decl(e) => {
|
||||
text.push_str("<?");
|
||||
text.push_str(to_str(e));
|
||||
text.push_str("?>");
|
||||
}
|
||||
|
||||
Event::PI(e) => {
|
||||
text.push_str("<?");
|
||||
text.push_str(to_str(e));
|
||||
text.push_str("?>");
|
||||
}
|
||||
|
||||
Event::DocType(e) => {
|
||||
text.push_str("<!DOCTYPE");
|
||||
text.push_str(to_str(e));
|
||||
text.push_str(">");
|
||||
}
|
||||
|
||||
Event::Text(e) => text.push_str(to_str(e)),
|
||||
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Adds furigana to Japanese text, using html ruby tags.
|
||||
fn add_html_furigana(
|
||||
text: &str,
|
||||
tokenizer: &Tokenizer,
|
||||
exclude_kanji: &HashSet<char>,
|
||||
learner: &mut Learner,
|
||||
) -> String {
|
||||
let mut worker = tokenizer.new_worker();
|
||||
|
||||
worker.reset_sentence(text);
|
||||
worker.tokenize();
|
||||
|
||||
let mut new_text = String::new();
|
||||
for i in 0..worker.num_tokens() {
|
||||
let t = worker.token(i);
|
||||
let surface = t.surface();
|
||||
|
||||
let needs_help = learner.needs_help(surface);
|
||||
learner.record(surface);
|
||||
|
||||
if !needs_help {
|
||||
new_text.push_str(surface);
|
||||
continue;
|
||||
}
|
||||
|
||||
let kana = t.feature().split(",").nth(1).unwrap();
|
||||
|
||||
let furigana_text = apply_furigana(surface, kana, exclude_kanji);
|
||||
|
||||
for (surf, furi) in furigana_text.iter() {
|
||||
if furi.is_empty() {
|
||||
new_text.push_str(surf);
|
||||
continue;
|
||||
}
|
||||
|
||||
new_text.push_str("<ruby>");
|
||||
new_text.push_str(surf);
|
||||
new_text.push_str("<rt>");
|
||||
new_text.push_str(furi);
|
||||
new_text.push_str("</rt></ruby>");
|
||||
}
|
||||
}
|
||||
|
||||
new_text
|
||||
}
|
||||
|
||||
/// Returns a segmented list of (surface, furigana) pairs.
|
||||
///
|
||||
/// The furigana component of a pair may be empty, indicating no
|
||||
/// furigana is needed for that surface element.
|
||||
fn apply_furigana<'a>(
|
||||
surface: &'a str,
|
||||
kana: &'a str,
|
||||
exclude_kanji: &HashSet<char>,
|
||||
) -> Vec<(&'a str, &'a str)> {
|
||||
let mut out = Vec::new();
|
||||
|
||||
if furigana_unneeded(surface, exclude_kanji) {
|
||||
out.push((surface, ""));
|
||||
return out;
|
||||
}
|
||||
|
||||
let mut surface = surface;
|
||||
let mut kana = kana;
|
||||
|
||||
// Trim any kana from the start.
|
||||
{
|
||||
let mut start_s = 0;
|
||||
let mut start_k = 0;
|
||||
for (sc, kc) in surface.chars().zip(kana.chars()) {
|
||||
if is_equivalent_kana(sc, kc) {
|
||||
start_s += sc.len_utf8();
|
||||
start_k += kc.len_utf8();
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
out.push((&surface[..start_s], ""));
|
||||
surface = &surface[start_s..];
|
||||
kana = &kana[start_k..];
|
||||
}
|
||||
|
||||
// Trim any kana from the end.
|
||||
{
|
||||
let mut end_s = surface.len();
|
||||
let mut end_k = kana.len();
|
||||
for (sc, kc) in surface.chars().rev().zip(kana.chars().rev()) {
|
||||
if is_equivalent_kana(sc, kc) {
|
||||
end_s -= sc.len_utf8();
|
||||
end_k -= kc.len_utf8();
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
out.push((&surface[end_s..], ""));
|
||||
surface = &surface[..end_s];
|
||||
kana = &kana[..end_k];
|
||||
}
|
||||
|
||||
// Try to uniquely match kana in the middle.
|
||||
//
|
||||
// This is just best-effort, and bails in any non-trivial cases.
|
||||
while let Some((si, sc)) = surface.char_indices().find(|(_, c)| is_kana(*c)) {
|
||||
// If there's more than one match, bail.
|
||||
let equivalent_kana_count = kana
|
||||
.chars()
|
||||
.map(|c| is_equivalent_kana(c, sc))
|
||||
.fold(0usize, |count, hit| count + hit as usize);
|
||||
if equivalent_kana_count != 1 {
|
||||
break;
|
||||
}
|
||||
|
||||
// Find the one match.
|
||||
let (ki, kc) = kana
|
||||
.char_indices()
|
||||
.find(|(_, c)| is_equivalent_kana(sc, *c))
|
||||
.unwrap();
|
||||
|
||||
// Insert the segments.
|
||||
out.insert(out.len() - 2, (&surface[..si], &kana[..ki]));
|
||||
out.insert(out.len() - 2, (&surface[si..(si + sc.len_utf8())], ""));
|
||||
surface = &surface[(si + sc.len_utf8())..];
|
||||
kana = &kana[(ki + kc.len_utf8())..];
|
||||
}
|
||||
|
||||
// Left over.
|
||||
out.insert(out.len() - 2, (surface, kana));
|
||||
|
||||
out.iter().filter(|(s, _)| !s.is_empty()).copied().collect()
|
||||
}
|
||||
|
||||
/// Due to the way this is used, this isn't meant to be exact, but instead
|
||||
/// liberal in what it considers equivalent.
|
||||
fn is_equivalent_kana(a: char, b: char) -> bool {
|
||||
const PAIRS: &[[char; 2]] = &[['は', 'わ'], ['を', 'お'], ['づ', 'ず'], ['へ', 'え']];
|
||||
const VOWELS: &[char] = &['あ', 'い', 'う', 'え', 'お', 'ぁ', 'ぃ', 'ぅ', 'ぇ', 'ぉ'];
|
||||
|
||||
let (a, b) = match (normalize_kana(a), normalize_kana(b)) {
|
||||
(Some(a), Some(b)) => (a, b),
|
||||
_ => return false,
|
||||
};
|
||||
|
||||
if a == b {
|
||||
return true;
|
||||
}
|
||||
|
||||
if a == 'ー' && VOWELS.contains(&b) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if b == 'ー' && VOWELS.contains(&a) {
|
||||
return true;
|
||||
}
|
||||
|
||||
for &[c, d] in PAIRS {
|
||||
if (a == c && b == d) || (a == d && b == c) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
false
|
||||
}
|
||||
|
||||
/// First code point of the hiragana range (inclusive lower bound).
const HIRAGANA: u32 = 0x3041;
|
||||
/// First code point of the katakana range (inclusive lower bound).
const KATAKANA: u32 = 0x30A1;
|
||||
/// Length of each kana range: both the hiragana and the katakana range span
/// `KANA_COUNT` code points starting at their respective base.
const KANA_COUNT: u32 = 0x3097 - HIRAGANA;
|
||||
|
||||
pub fn is_kana(c: char) -> bool {
|
||||
if c == 'ー' {
|
||||
return true;
|
||||
}
|
||||
|
||||
let c = c as u32;
|
||||
|
||||
if c >= HIRAGANA && c < (HIRAGANA + KANA_COUNT) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if c >= KATAKANA && c < (KATAKANA + KANA_COUNT) {
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
pub fn normalize_kana(c: char) -> Option<char> {
|
||||
if !is_kana(c) {
|
||||
return None;
|
||||
}
|
||||
|
||||
Some(katakana_to_hiragana(c).unwrap_or(c))
|
||||
}
|
||||
|
||||
/// Returns true if furigana defininitely isn't needed.
|
||||
pub fn furigana_unneeded(text: &str, exclude_kanji: &HashSet<char>) -> bool {
|
||||
text.chars().all(|c| {
|
||||
is_kana(c) || c.is_ascii() || c.is_numeric() || c == '々' || exclude_kanji.contains(&c)
|
||||
})
|
||||
}
|
||||
|
||||
pub fn hiragana_to_katakana(c: char) -> Option<char> {
|
||||
let c = c as u32;
|
||||
if c >= HIRAGANA && c < (HIRAGANA + KANA_COUNT) {
|
||||
char::try_from(c + KATAKANA - HIRAGANA).ok()
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
pub fn katakana_to_hiragana(c: char) -> Option<char> {
|
||||
let c = c as u32;
|
||||
if c >= KATAKANA && c < (KATAKANA + KANA_COUNT) {
|
||||
char::try_from(c - KATAKANA + HIRAGANA).ok()
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `apply_furigana` with nothing in the exclusion set.
    fn segments(surface: &'static str, kana: &'static str) -> Vec<(&'static str, &'static str)> {
        apply_furigana(surface, kana, &HashSet::new())
    }

    // Pure-kana surface whose reading uses a long-vowel mark.
    #[test]
    fn apply_furigana_01() {
        assert_eq!(segments("へぇ", "ヘー"), [("へぇ", "")]);
    }

    // Pure-kana surface that is longer than its reading.
    #[test]
    fn apply_furigana_02() {
        assert_eq!(segments("へぇー", "ヘー"), [("へぇー", "")]);
    }

    // Particle reading.
    #[test]
    fn apply_furigana_03() {
        assert_eq!(segments("へ", "え"), [("へ", "")]);
    }

    // Trailing kana is split off the kanji.
    #[test]
    fn apply_furigana_04() {
        assert_eq!(segments("食べる", "タベル"), [("食", "タ"), ("べる", "")]);
    }

    // Kana in the middle is matched uniquely.
    #[test]
    fn apply_furigana_05() {
        assert_eq!(
            segments("流れ出す", "ながれだす"),
            [("流", "なが"), ("れ", ""), ("出", "だ"), ("す", "")]
        );
    }

    // Ambiguous kana in the middle: no split.
    #[test]
    fn apply_furigana_06() {
        assert_eq!(segments("物の怪", "もののけ"), [("物の怪", "もののけ")]);
    }

    #[test]
    fn is_equivalent_kana_01() {
        let equivalent = [
            ('か', 'カ'),
            ('カ', 'か'),
            ('ぁ', 'ァ'),
            ('ァ', 'ぁ'),
            ('は', 'わ'),
            ('わ', 'は'),
            ('を', 'お'),
            ('お', 'を'),
            ('づ', 'ず'),
            ('ず', 'づ'),
            ('ー', 'あ'),
            ('あ', 'ー'),
            ('ー', 'ぁ'),
            ('ぁ', 'ー'),
        ];
        for (a, b) in equivalent {
            assert!(is_equivalent_kana(a, b));
        }

        let different = [('は', 'ば'), ('ー', 'か'), ('た', '食')];
        for (a, b) in different {
            assert!(!is_equivalent_kana(a, b));
        }
    }

    #[test]
    fn tokenize_01() {
        let generator = FuriganaGenerator::new(0, false);

        let mut worker = generator.tokenizer.new_worker();
        worker.reset_sentence("食べている");
        worker.tokenize();

        let expected = [
            ("食べ", "動詞-一般,タベ"),
            ("て", "助詞-接続助詞,テ"),
            ("いる", "動詞-非自立可能,イル"),
        ];

        assert_eq!(3, worker.num_tokens());
        for (i, (surface, feature)) in expected.into_iter().enumerate() {
            assert_eq!(surface, worker.token(i).surface());
            assert_eq!(feature, worker.token(i).feature());
        }
    }

    #[test]
    fn add_html_furigana_01() {
        let mut generator = FuriganaGenerator::new(0, false);

        let input = r#"<sup class="食う">食べる</sup>のは<ruby>良</ruby>いね!<hi />"#;
        let expected =
            r#"<sup class="食う"><ruby>食<rt>タ</rt></ruby>べる</sup>のは<ruby>良</ruby>いね!<hi />"#;

        assert_eq!(generator.add_html_furigana(input), expected);
    }
}
|
216
src/main.rs
216
src/main.rs
|
@ -1,216 +0,0 @@
|
|||
use std::{
|
||||
// fs::File,
|
||||
io::{Cursor, Read},
|
||||
};
|
||||
|
||||
use lz4_flex::frame::FrameDecoder;
|
||||
use once_cell::sync::Lazy;
|
||||
use regex::Regex;
|
||||
use vibrato::{Dictionary, Tokenizer};
|
||||
|
||||
/// Compressed dictionary embedded at compile time; `main` decodes it with
/// `FrameDecoder` before handing it to `Dictionary::read`.
const DICT: &[u8] = include_bytes!("../dictionary/system.dic.lz4");
|
||||
|
||||
fn main() {
|
||||
let dict = {
|
||||
// Note: we could just pass the decoder straight to `Dictionary::read()`
|
||||
// below, and it would work. However, that ends up being slower than
|
||||
// first decompressing the whole thing ahead of time.
|
||||
let mut decoder = FrameDecoder::new(Cursor::new(DICT));
|
||||
let mut data = Vec::new();
|
||||
decoder.read_to_end(&mut data).unwrap();
|
||||
|
||||
Dictionary::read(Cursor::new(&data)).unwrap()
|
||||
};
|
||||
|
||||
let text = {
|
||||
let mut text = String::new();
|
||||
std::io::stdin().read_to_string(&mut text).unwrap();
|
||||
text
|
||||
};
|
||||
|
||||
let tokenizer = Tokenizer::new(dict);
|
||||
print!("{}", add_html_furigana_skip_already_ruby(&text, &tokenizer));
|
||||
}
|
||||
|
||||
/// Like `add_html_furigana()`, but skips text that already has ruby on it, to it doesn't get double-ruby.
|
||||
fn add_html_furigana_skip_already_ruby(text: &str, tokenizer: &Tokenizer) -> String {
|
||||
static ALREADY_RUBY: Lazy<Regex> = Lazy::new(|| Regex::new(r"<ruby.*?>.*?</ruby>").unwrap());
|
||||
|
||||
let mut new_text = String::new();
|
||||
let mut last_byte_index = 0;
|
||||
for hit in ALREADY_RUBY.find_iter(text) {
|
||||
new_text.push_str(&add_html_furigana(
|
||||
&text[last_byte_index..hit.start()],
|
||||
tokenizer,
|
||||
));
|
||||
new_text.push_str(hit.as_str());
|
||||
last_byte_index = hit.end();
|
||||
}
|
||||
|
||||
new_text.push_str(&add_html_furigana(&text[last_byte_index..], tokenizer));
|
||||
|
||||
new_text
|
||||
}
|
||||
|
||||
/// Adds furigana to Japanese text, using html ruby tags.
|
||||
fn add_html_furigana(text: &str, tokenizer: &Tokenizer) -> String {
|
||||
let mut worker = tokenizer.new_worker();
|
||||
|
||||
worker.reset_sentence(text);
|
||||
worker.tokenize();
|
||||
|
||||
let mut new_text = String::new();
|
||||
for i in 0..worker.num_tokens() {
|
||||
let t = worker.token(i);
|
||||
let surface = t.surface();
|
||||
let kana = t.feature().split(",").nth(1).unwrap();
|
||||
|
||||
let (start_bytes, end_bytes) = matching_kana_ends(surface, kana);
|
||||
|
||||
if kana.is_empty()
|
||||
|| start_bytes == surface.len()
|
||||
|| surface
|
||||
.chars()
|
||||
.map(|c| c.is_ascii() || c.is_numeric())
|
||||
.all(|n| n)
|
||||
{
|
||||
new_text.push_str(surface);
|
||||
} else {
|
||||
let start = &surface[..start_bytes];
|
||||
let mid = &surface[start_bytes..(surface.len() - end_bytes)];
|
||||
let mid_kana = &kana[start_bytes..(kana.len() - end_bytes)];
|
||||
let end = &surface[(surface.len() - end_bytes)..];
|
||||
new_text.push_str(start);
|
||||
new_text.push_str("<ruby>");
|
||||
new_text.push_str(mid);
|
||||
new_text.push_str("<rt>");
|
||||
new_text.push_str(mid_kana);
|
||||
new_text.push_str("</rt></ruby>");
|
||||
new_text.push_str(end);
|
||||
}
|
||||
}
|
||||
|
||||
new_text
|
||||
}
|
||||
|
||||
/// Returns (matching_start_bytes, matching_end_bytes).
|
||||
///
|
||||
/// Note that the bytes are in terms of `a`'s bytes.
|
||||
///
|
||||
/// If `matching_start_bytes == a.len()` you can assume that strings are kana
|
||||
/// equivalents, and thus no ruby is needed.
|
||||
fn matching_kana_ends(a: &str, b: &str) -> (usize, usize) {
|
||||
let mut start_bytes = 0;
|
||||
for (ca, cb) in a.chars().zip(b.chars()) {
|
||||
if ca == cb || is_equivalent_kana(ca, cb) {
|
||||
start_bytes += ca.len_utf8();
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
let mut end_bytes = 0;
|
||||
for (ca, cb) in a.chars().rev().zip(b.chars().rev()) {
|
||||
if ca == cb || is_equivalent_kana(ca, cb) {
|
||||
end_bytes += ca.len_utf8();
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (start_bytes + end_bytes) >= a.len() || (start_bytes + end_bytes) >= b.len() {
|
||||
(a.len(), 0)
|
||||
} else {
|
||||
(start_bytes, end_bytes)
|
||||
}
|
||||
}
|
||||
|
||||
fn is_equivalent_kana(a: char, b: char) -> bool {
|
||||
let a = normalize_kana(a);
|
||||
let b = normalize_kana(b);
|
||||
match (a, b) {
|
||||
(Some('は'), Some('わ'))
|
||||
| (Some('わ'), Some('は'))
|
||||
| (Some('を'), Some('お'))
|
||||
| (Some('お'), Some('を'))
|
||||
| (Some(_), Some('ー'))
|
||||
| (Some('ー'), Some(_)) => true,
|
||||
|
||||
(Some(c), Some(d)) if c == d => true,
|
||||
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
|
||||
/// First code point of the hiragana range (inclusive lower bound).
const HIRAGANA: u32 = 0x3041;
|
||||
/// First code point of the katakana range (inclusive lower bound).
const KATAKANA: u32 = 0x30A1;
|
||||
/// Length of each kana range: both the hiragana and the katakana range span
/// `KANA_COUNT` code points starting at their respective base.
const KANA_COUNT: u32 = 0x3097 - HIRAGANA;
|
||||
|
||||
pub fn is_kana(c: char) -> bool {
|
||||
if c == 'ー' {
|
||||
return true;
|
||||
}
|
||||
|
||||
let c = c as u32;
|
||||
|
||||
if c >= HIRAGANA && c < (HIRAGANA + KANA_COUNT) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if c >= KATAKANA && c < (KATAKANA + KANA_COUNT) {
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
pub fn normalize_kana(c: char) -> Option<char> {
|
||||
if !is_kana(c) {
|
||||
return None;
|
||||
}
|
||||
|
||||
Some(katakana_to_hiragana(c).unwrap_or(c))
|
||||
}
|
||||
|
||||
pub fn hiragana_to_katakana(c: char) -> Option<char> {
|
||||
let c = c as u32;
|
||||
if c >= HIRAGANA && c < (HIRAGANA + KANA_COUNT) {
|
||||
char::try_from(c + KATAKANA - HIRAGANA).ok()
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
pub fn katakana_to_hiragana(c: char) -> Option<char> {
|
||||
let c = c as u32;
|
||||
if c >= KATAKANA && c < (KATAKANA + KANA_COUNT) {
|
||||
char::try_from(c - KATAKANA + HIRAGANA).ok()
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // Surface and reading are fully kana-equivalent: the whole surface
    // (2 chars, 6 bytes) is reported as the matching start.
    #[test]
    fn matching_kana_ends_01() {
        assert_eq!((6, 0), matching_kana_ends("へぇ", "ヘー"));
    }

    // Surface longer than the reading, but the reading is fully covered:
    // again the whole surface (3 chars, 9 bytes) is reported.
    #[test]
    fn matching_kana_ends_02() {
        assert_eq!((9, 0), matching_kana_ends("へぇー", "ヘー"));
    }
}
|
Loading…
Reference in New Issue
Block a user