Use lz4 for the in-executable dictionary data.
It's way faster to decompress, and is still small enough.
This commit is contained in:
parent
1d64afe430
commit
8df226190b
21
Cargo.lock
generated
21
Cargo.lock
generated
|
@ -182,9 +182,9 @@ dependencies = [
|
||||||
name = "furigana_gen"
|
name = "furigana_gen"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
"lz4_flex",
|
||||||
"once_cell",
|
"once_cell",
|
||||||
"regex",
|
"regex",
|
||||||
"ruzstd",
|
|
||||||
"vibrato",
|
"vibrato",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
@ -256,6 +256,15 @@ dependencies = [
|
||||||
"libc",
|
"libc",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "lz4_flex"
|
||||||
|
version = "0.11.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "75761162ae2b0e580d7e7c390558127e5f01b4194debd6221fd8c207fc80e3f5"
|
||||||
|
dependencies = [
|
||||||
|
"twox-hash",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "memchr"
|
name = "memchr"
|
||||||
version = "2.7.4"
|
version = "2.7.4"
|
||||||
|
@ -441,16 +450,6 @@ version = "1.0.17"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "955d28af4278de8121b7ebeb796b6a45735dc01436d898801014aced2773a3d6"
|
checksum = "955d28af4278de8121b7ebeb796b6a45735dc01436d898801014aced2773a3d6"
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "ruzstd"
|
|
||||||
version = "0.7.1"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "1c8b8f3d26bd9f945e5cbae77f7cdfbf37af9a66956f1115eb4516e45df519f4"
|
|
||||||
dependencies = [
|
|
||||||
"byteorder",
|
|
||||||
"twox-hash",
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "ryu"
|
name = "ryu"
|
||||||
version = "1.0.18"
|
version = "1.0.18"
|
||||||
|
|
|
@ -7,6 +7,6 @@ edition = "2021"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
vibrato = "0.5"
|
vibrato = "0.5"
|
||||||
ruzstd = "0.7"
|
lz4_flex = "0.11"
|
||||||
regex = "1.10"
|
regex = "1.10"
|
||||||
once_cell = "1.19"
|
once_cell = "1.19"
|
||||||
|
|
BIN
dictionary/system.dic
Normal file
BIN
dictionary/system.dic
Normal file
Binary file not shown.
BIN
dictionary/system.dic.lz4
Normal file
BIN
dictionary/system.dic.lz4
Normal file
Binary file not shown.
BIN
dictionary/system.dic.xz
Normal file
BIN
dictionary/system.dic.xz
Normal file
Binary file not shown.
Binary file not shown.
14
src/main.rs
14
src/main.rs
|
@ -3,17 +3,23 @@ use std::{
|
||||||
io::{Cursor, Read},
|
io::{Cursor, Read},
|
||||||
};
|
};
|
||||||
|
|
||||||
|
use lz4_flex::frame::FrameDecoder;
|
||||||
use once_cell::sync::Lazy;
|
use once_cell::sync::Lazy;
|
||||||
use regex::Regex;
|
use regex::Regex;
|
||||||
use ruzstd::StreamingDecoder;
|
|
||||||
use vibrato::{Dictionary, Tokenizer};
|
use vibrato::{Dictionary, Tokenizer};
|
||||||
|
|
||||||
const DICT: &[u8] = include_bytes!("../dictionary/system.dic.zst");
|
const DICT: &[u8] = include_bytes!("../dictionary/system.dic.lz4");
|
||||||
|
|
||||||
fn main() {
|
fn main() {
|
||||||
let dict = {
|
let dict = {
|
||||||
let decoder = StreamingDecoder::new(Cursor::new(DICT)).unwrap();
|
// Note: we could just pass the decoder straight to `Dictionary::read()`
|
||||||
Dictionary::read(decoder).unwrap()
|
// below, and it would work. However, that ends up being slower than
|
||||||
|
// first decompressing the whole thing ahead of time.
|
||||||
|
let mut decoder = FrameDecoder::new(Cursor::new(DICT));
|
||||||
|
let mut data = Vec::new();
|
||||||
|
decoder.read_to_end(&mut data).unwrap();
|
||||||
|
|
||||||
|
Dictionary::read(Cursor::new(&data)).unwrap()
|
||||||
};
|
};
|
||||||
|
|
||||||
let text = {
|
let text = {
|
||||||
|
|
Loading…
Reference in New Issue
Block a user