Use lz4 for the in-executable dictionary data.

It's way faster to decompress, and is still small enough.
This commit is contained in:
Nathan Vegdahl 2024-09-01 09:01:37 +02:00
parent 1d64afe430
commit 8df226190b
7 changed files with 21 additions and 16 deletions

21
Cargo.lock generated
View File

@ -182,9 +182,9 @@ dependencies = [
name = "furigana_gen" name = "furigana_gen"
version = "0.1.0" version = "0.1.0"
dependencies = [ dependencies = [
"lz4_flex",
"once_cell", "once_cell",
"regex", "regex",
"ruzstd",
"vibrato", "vibrato",
] ]
@ -256,6 +256,15 @@ dependencies = [
"libc", "libc",
] ]
[[package]]
name = "lz4_flex"
version = "0.11.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "75761162ae2b0e580d7e7c390558127e5f01b4194debd6221fd8c207fc80e3f5"
dependencies = [
"twox-hash",
]
[[package]] [[package]]
name = "memchr" name = "memchr"
version = "2.7.4" version = "2.7.4"
@ -441,16 +450,6 @@ version = "1.0.17"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "955d28af4278de8121b7ebeb796b6a45735dc01436d898801014aced2773a3d6" checksum = "955d28af4278de8121b7ebeb796b6a45735dc01436d898801014aced2773a3d6"
[[package]]
name = "ruzstd"
version = "0.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1c8b8f3d26bd9f945e5cbae77f7cdfbf37af9a66956f1115eb4516e45df519f4"
dependencies = [
"byteorder",
"twox-hash",
]
[[package]] [[package]]
name = "ryu" name = "ryu"
version = "1.0.18" version = "1.0.18"

View File

@ -7,6 +7,6 @@ edition = "2021"
[dependencies] [dependencies]
vibrato = "0.5" vibrato = "0.5"
ruzstd = "0.7" lz4_flex = "0.11"
regex = "1.10" regex = "1.10"
once_cell = "1.19" once_cell = "1.19"

BIN
dictionary/system.dic Normal file

Binary file not shown.

BIN
dictionary/system.dic.lz4 Normal file

Binary file not shown.

BIN
dictionary/system.dic.xz Normal file

Binary file not shown.

Binary file not shown.

View File

@ -3,17 +3,23 @@ use std::{
io::{Cursor, Read}, io::{Cursor, Read},
}; };
use lz4_flex::frame::FrameDecoder;
use once_cell::sync::Lazy; use once_cell::sync::Lazy;
use regex::Regex; use regex::Regex;
use ruzstd::StreamingDecoder;
use vibrato::{Dictionary, Tokenizer}; use vibrato::{Dictionary, Tokenizer};
const DICT: &[u8] = include_bytes!("../dictionary/system.dic.zst"); const DICT: &[u8] = include_bytes!("../dictionary/system.dic.lz4");
fn main() { fn main() {
let dict = { let dict = {
let decoder = StreamingDecoder::new(Cursor::new(DICT)).unwrap(); // Note: we could just pass the decoder straight to `Dictionary::read()`
Dictionary::read(decoder).unwrap() // below, and it would work. However, that ends up being slower than
// first decompressing the whole thing ahead of time.
let mut decoder = FrameDecoder::new(Cursor::new(DICT));
let mut data = Vec::new();
decoder.read_to_end(&mut data).unwrap();
Dictionary::read(Cursor::new(&data)).unwrap()
}; };
let text = { let text = {