Use a file of kanji to exclude instead of a frequency cutoff.

This commit is contained in:
Nathan Vegdahl 2025-09-10 05:19:18 +09:00
parent b60ecc38b3
commit 2ae9f286ad
3 changed files with 16 additions and 4037 deletions

View File

@ -5,30 +5,9 @@ use std::{
path::Path, path::Path,
}; };
const KANJI: &str = include_str!("data/kanji_frequency.txt");
fn main() { fn main() {
let out_dir = env::var("OUT_DIR").unwrap(); let out_dir = env::var("OUT_DIR").unwrap();
// Write frequency-ordered kanji array to rust file.
{
let dest_path = Path::new(&out_dir).join("kanji_freq_inc.rs");
let mut f = File::create(&dest_path).unwrap();
f.write_all("const KANJI_FREQ: &[char] = &[".as_bytes())
.unwrap();
for c in KANJI.chars() {
if c.is_whitespace() {
continue;
}
f.write_all(format!("\n'{}',", c).as_bytes()).unwrap();
}
f.write_all("\n];".as_bytes()).unwrap();
}
// Write compressed parsing dictionary to .lz4 file. // Write compressed parsing dictionary to .lz4 file.
{ {
// Read and decompress file from .xz. // Read and decompress file from .xz.

File diff suppressed because it is too large. Load Diff

View File

@ -14,9 +14,6 @@ use vibrato::{Dictionary, Tokenizer};
use accent::AccentDict; use accent::AccentDict;
use learner::Learner; use learner::Learner;
// Include KANJI_FREQ, a frequency-ordered array of kanji characters.
include!(concat!(env!("OUT_DIR"), "/kanji_freq_inc.rs"));
// Parsing dictionary. // Parsing dictionary.
const DICT: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/system.dic.lz4")); const DICT: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/system.dic.lz4"));
@ -45,14 +42,18 @@ pub struct FuriganaGenerator {
} }
impl FuriganaGenerator { impl FuriganaGenerator {
// - `exclude_count`: exclude the N most frequent kanji from furigana. /// - `exclude_kanji`: don't put furigana on words whose kanji are all in
// Specifically, words made up *entirely* of those kanji will be excluded. /// this list. Note: if a word has *some* kanji that aren't in this list,
// If a word has some kanji that aren't in that set, even if it also has /// even if it also has some that are, it will still get furigana.
// some that are, it will still get furigana. /// - `exclude_words`: don't put furigana on the words in this list.
// /// - `use_hiragana`: when true, the furigana will be written in hiragana.
// - `exclude_words`: don't put furigana on the words in this list. /// When false, the furigana will be written in katakana.
/// - `accent_mark`: the character to use as the pitch accent indicator for
/// accented words.
/// - `accentless_mark`: the character to use as the pitch accent indicator
/// for accentless (へいばん) words.
pub fn new( pub fn new(
exclude_count: usize, exclude_kanji: &[char],
exclude_words: &[&str], exclude_words: &[&str],
use_hiragana: bool, use_hiragana: bool,
accent_mark: Option<String>, accent_mark: Option<String>,
@ -69,15 +70,15 @@ impl FuriganaGenerator {
Dictionary::read(Cursor::new(&data)).unwrap() Dictionary::read(Cursor::new(&data)).unwrap()
}; };
let exclude_kanji = { let exclude_kanji_set = {
let mut set = FnvHashSet::default(); let mut set = FnvHashSet::default();
for &c in KANJI_FREQ.iter().take(exclude_count) { for &c in exclude_kanji {
set.insert(c); set.insert(c);
} }
set set
}; };
let exclude_words = { let exclude_words_set = {
let mut set = FnvHashSet::default(); let mut set = FnvHashSet::default();
for word in exclude_words { for word in exclude_words {
set.insert((*word).into()); set.insert((*word).into());
@ -99,8 +100,8 @@ impl FuriganaGenerator {
Self { Self {
tokenizer: Tokenizer::new(dict), tokenizer: Tokenizer::new(dict),
accent_dict: accent::build_accent_dictionary(), accent_dict: accent::build_accent_dictionary(),
exclude_kanji: exclude_kanji, exclude_kanji: exclude_kanji_set,
exclude_words: exclude_words, exclude_words: exclude_words_set,
subs: subs, subs: subs,
use_hiragana: use_hiragana, use_hiragana: use_hiragana,
accent_mark: accent_mark, accent_mark: accent_mark,