furigana_gen/build.rs

70 lines
2.0 KiB
Rust

use std::{
env,
fs::File,
io::{BufReader, Write},
path::Path,
};
// Contents of `data/kanji_frequency.txt`, embedded into the build script at compile time.
// `main` copies every non-whitespace character of this text, in file order, into the
// generated `KANJI_FREQ` array.
// NOTE(review): going by the file name, the characters are presumably sorted by
// frequency of use — confirm against the data file.
const KANJI: &str = include_str!("data/kanji_frequency.txt");
/// Build-script entry point: generates the data files the crate includes at compile time.
///
/// Three artifacts are written into the directory named by the `OUT_DIR` environment variable:
///
/// - `kanji_freq_inc.rs`: Rust source defining `KANJI_FREQ`, a `&[char]` holding every
///   non-whitespace character of `KANJI` in file order.
/// - `system.dic.lz4`: `data/ipadic-mecab-2_7_0/system.dic.xz` recompressed as an LZ4 frame.
/// - `accents.tsv.lz4`: `data/accents.tsv.xz` recompressed as an LZ4 frame.
///
/// # Panics
///
/// Panics (which fails the build) if `OUT_DIR` is unset, or if any file cannot be
/// read, decompressed, created, or written.
fn main() {
    let out_dir = env::var("OUT_DIR").expect("OUT_DIR should be set when running as a build script");
    let out_dir = Path::new(&out_dir);

    // Write frequency-ordered kanji array to rust file.
    {
        // Assemble the whole source text in memory first so the file is written with a
        // single call rather than one small unbuffered write per character.
        let mut source = String::from("const KANJI_FREQ: &[char] = &[");
        for c in KANJI.chars().filter(|c| !c.is_whitespace()) {
            source.push_str("\n'");
            source.push(c);
            source.push_str("',");
        }
        source.push_str("\n];");

        let dest_path = out_dir.join("kanji_freq_inc.rs");
        let mut f = File::create(&dest_path)
            .unwrap_or_else(|e| panic!("failed to create {}: {}", dest_path.display(), e));
        f.write_all(source.as_bytes())
            .unwrap_or_else(|e| panic!("failed to write {}: {}", dest_path.display(), e));
    }

    // Write compressed parsing dictionary to .lz4 file.
    recompress_xz_to_lz4(
        "data/ipadic-mecab-2_7_0/system.dic.xz",
        &out_dir.join("system.dic.lz4"),
    );

    // Write compressed pitch accent dictionary to .lz4 file.
    recompress_xz_to_lz4("data/accents.tsv.xz", &out_dir.join("accents.tsv.lz4"));
}

/// Decompresses the `.xz` file at `src` fully into memory and writes the result to
/// `dest` as a single LZ4 frame.
///
/// # Panics
///
/// Panics if `src` cannot be opened or decompressed, or if `dest` cannot be created
/// or written.
fn recompress_xz_to_lz4(src: &str, dest: &Path) {
    // Read and decompress file from .xz.
    let raw = {
        let f = File::open(src).unwrap_or_else(|e| panic!("failed to open {}: {}", src, e));
        let mut data = Vec::new();
        lzma_rs::xz_decompress(&mut BufReader::new(f), &mut data)
            .unwrap_or_else(|e| panic!("failed to decompress {}: {:?}", src, e));
        data
    };

    // Recompress to .lz4.
    let f = File::create(dest)
        .unwrap_or_else(|e| panic!("failed to create {}: {}", dest.display(), e));
    let mut encoder = lz4_flex::frame::FrameEncoder::new(f);
    // `write_all`, not `write`: the `Write::write` contract permits a short write, and
    // ignoring its returned byte count could silently truncate the output.
    encoder
        .write_all(&raw)
        .unwrap_or_else(|e| panic!("failed to write {}: {}", dest.display(), e));
    encoder
        .finish()
        .unwrap_or_else(|e| panic!("failed to finish {}: {:?}", dest.display(), e));
}