Use lz4 for the in-executable dictionary data.

It's way faster to decompress, and is still small enough.
This commit is contained in:
Nathan Vegdahl 2024-09-01 09:01:37 +02:00
parent 1d64afe430
commit 8df226190b
7 changed files with 21 additions and 16 deletions

21
Cargo.lock generated
View File

@ -182,9 +182,9 @@ dependencies = [
name = "furigana_gen"
version = "0.1.0"
dependencies = [
"lz4_flex",
"once_cell",
"regex",
"ruzstd",
"vibrato",
]
@ -256,6 +256,15 @@ dependencies = [
"libc",
]
[[package]]
name = "lz4_flex"
version = "0.11.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "75761162ae2b0e580d7e7c390558127e5f01b4194debd6221fd8c207fc80e3f5"
dependencies = [
"twox-hash",
]
[[package]]
name = "memchr"
version = "2.7.4"
@ -441,16 +450,6 @@ version = "1.0.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "955d28af4278de8121b7ebeb796b6a45735dc01436d898801014aced2773a3d6"
[[package]]
name = "ruzstd"
version = "0.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1c8b8f3d26bd9f945e5cbae77f7cdfbf37af9a66956f1115eb4516e45df519f4"
dependencies = [
"byteorder",
"twox-hash",
]
[[package]]
name = "ryu"
version = "1.0.18"

View File

@ -7,6 +7,6 @@ edition = "2021"
[dependencies]
vibrato = "0.5"
ruzstd = "0.7"
lz4_flex = "0.11"
regex = "1.10"
once_cell = "1.19"

BIN
dictionary/system.dic Normal file

Binary file not shown.

BIN
dictionary/system.dic.lz4 Normal file

Binary file not shown.

BIN
dictionary/system.dic.xz Normal file

Binary file not shown.

Binary file not shown.

View File

@ -3,17 +3,23 @@ use std::{
io::{Cursor, Read},
};
use lz4_flex::frame::FrameDecoder;
use once_cell::sync::Lazy;
use regex::Regex;
use ruzstd::StreamingDecoder;
use vibrato::{Dictionary, Tokenizer};
const DICT: &[u8] = include_bytes!("../dictionary/system.dic.zst");
const DICT: &[u8] = include_bytes!("../dictionary/system.dic.lz4");
fn main() {
let dict = {
let decoder = StreamingDecoder::new(Cursor::new(DICT)).unwrap();
Dictionary::read(decoder).unwrap()
// Note: we could just pass the decoder straight to `Dictionary::read()`
// below, and it would work. However, that ends up being slower than
// first decompressing the whole thing ahead of time.
let mut decoder = FrameDecoder::new(Cursor::new(DICT));
let mut data = Vec::new();
decoder.read_to_end(&mut data).unwrap();
Dictionary::read(Cursor::new(&data)).unwrap()
};
let text = {