70 lines
2.0 KiB
Rust
70 lines
2.0 KiB
Rust
use std::{
|
|
env,
|
|
fs::File,
|
|
io::{BufReader, Write},
|
|
path::Path,
|
|
};
|
|
|
|
/// Contents of `data/kanji_frequency.txt`, embedded into the build script at
/// compile time. `main` treats every non-whitespace character of this string
/// as one entry, in file order, when generating the kanji frequency table.
const KANJI: &str = include_str!("data/kanji_frequency.txt");
|
|
|
|
fn main() {
|
|
let out_dir = env::var("OUT_DIR").unwrap();
|
|
|
|
// Write frequency-ordered kanji array to rust file.
|
|
{
|
|
let dest_path = Path::new(&out_dir).join("kanji_freq_inc.rs");
|
|
let mut f = File::create(&dest_path).unwrap();
|
|
|
|
f.write_all("const KANJI_FREQ: &[char] = &[".as_bytes())
|
|
.unwrap();
|
|
|
|
for c in KANJI.chars() {
|
|
if c.is_whitespace() {
|
|
continue;
|
|
}
|
|
|
|
f.write_all(format!("\n'{}',", c).as_bytes()).unwrap();
|
|
}
|
|
|
|
f.write_all("\n];".as_bytes()).unwrap();
|
|
}
|
|
|
|
// Write compressed parsing dictionary to .lz4 file.
|
|
{
|
|
// Read and decompress file from .xz.
|
|
let dict_data = {
|
|
let f = File::open("data/ipadic-mecab-2_7_0/system.dic.xz").unwrap();
|
|
let mut data = Vec::new();
|
|
lzma_rs::xz_decompress(&mut BufReader::new(f), &mut data).unwrap();
|
|
|
|
data
|
|
};
|
|
|
|
// Recompress to .lz4.
|
|
let dest_path = Path::new(&out_dir).join("system.dic.lz4");
|
|
let f = File::create(dest_path).unwrap();
|
|
let mut encoder = lz4_flex::frame::FrameEncoder::new(f);
|
|
encoder.write(&dict_data).unwrap();
|
|
encoder.finish().unwrap();
|
|
}
|
|
|
|
// Write compressed pitch accent dictionary to .lz4 file.
|
|
{
|
|
// Read and decompress file from .xz.
|
|
let dict_data = {
|
|
let f = File::open("data/accents.tsv.xz").unwrap();
|
|
let mut data = Vec::new();
|
|
lzma_rs::xz_decompress(&mut BufReader::new(f), &mut data).unwrap();
|
|
|
|
data
|
|
};
|
|
|
|
// Recompress to .lz4.
|
|
let dest_path = Path::new(&out_dir).join("accents.tsv.lz4");
|
|
let f = File::create(dest_path).unwrap();
|
|
let mut encoder = lz4_flex::frame::FrameEncoder::new(f);
|
|
encoder.write(&dict_data).unwrap();
|
|
encoder.finish().unwrap();
|
|
}
|
|
}
|