Add option to use hiragana instead of katakana for the generated furigana.

This commit is contained in:
Nathan Vegdahl 2024-09-17 08:32:51 +02:00
parent 0266341f99
commit 7361240e49

View File

@ -35,6 +35,7 @@ pub struct FuriganaGenerator {
tokenizer: Tokenizer, tokenizer: Tokenizer,
exclude_kanji: FnvHashSet<char>, exclude_kanji: FnvHashSet<char>,
subs: FnvHashMap<(Cow<'static, str>, Cow<'static, str>), String>, subs: FnvHashMap<(Cow<'static, str>, Cow<'static, str>), String>,
use_hiragana: bool,
} }
impl FuriganaGenerator { impl FuriganaGenerator {
@ -42,7 +43,7 @@ impl FuriganaGenerator {
// Specifically, words made up *entirely* of those kanji will be excluded. // Specifically, words made up *entirely* of those kanji will be excluded.
// If a word has some kanji that aren't in that set, even if it also has // If a word has some kanji that aren't in that set, even if it also has
// some that are, it will still get furigana. // some that are, it will still get furigana.
pub fn new(exclude_count: usize) -> Self { pub fn new(exclude_count: usize, use_hiragana: bool) -> Self {
let dict = { let dict = {
// Note: we could just pass the decoder straight to `Dictionary::read()` // Note: we could just pass the decoder straight to `Dictionary::read()`
// below, and it would work. However, that ends up being slower than // below, and it would work. However, that ends up being slower than
@ -74,6 +75,7 @@ impl FuriganaGenerator {
tokenizer: Tokenizer::new(dict), tokenizer: Tokenizer::new(dict),
exclude_kanji: exclude_kanji, exclude_kanji: exclude_kanji,
subs: subs, subs: subs,
use_hiragana: use_hiragana,
} }
} }
@ -83,6 +85,7 @@ impl FuriganaGenerator {
exclude_kanji: &self.exclude_kanji, exclude_kanji: &self.exclude_kanji,
subs: &self.subs, subs: &self.subs,
learner: Learner::new(if learn_mode { 3 } else { usize::MAX }), learner: Learner::new(if learn_mode { 3 } else { usize::MAX }),
use_hiragana: self.use_hiragana,
} }
} }
} }
@ -92,6 +95,7 @@ pub struct Session<'a> {
exclude_kanji: &'a FnvHashSet<char>, exclude_kanji: &'a FnvHashSet<char>,
subs: &'a FnvHashMap<(Cow<'a, str>, Cow<'a, str>), String>, subs: &'a FnvHashMap<(Cow<'a, str>, Cow<'a, str>), String>,
learner: Learner, learner: Learner,
use_hiragana: bool,
} }
impl<'a> Session<'a> { impl<'a> Session<'a> {
@ -115,6 +119,7 @@ impl<'a> Session<'a> {
&self.exclude_kanji, &self.exclude_kanji,
&self.subs, &self.subs,
&mut self.learner, &mut self.learner,
self.use_hiragana,
) )
} }
} }
@ -130,6 +135,7 @@ fn add_html_furigana_skip_already_ruby(
exclude_kanji: &FnvHashSet<char>, exclude_kanji: &FnvHashSet<char>,
subs: &FnvHashMap<(Cow<str>, Cow<str>), String>, subs: &FnvHashMap<(Cow<str>, Cow<str>), String>,
learner: &mut Learner, learner: &mut Learner,
use_hiragana: bool,
) -> String { ) -> String {
let mut reader = quick_xml::Reader::from_str(text); let mut reader = quick_xml::Reader::from_str(text);
@ -168,6 +174,7 @@ fn add_html_furigana_skip_already_ruby(
exclude_kanji, exclude_kanji,
subs, subs,
learner, learner,
use_hiragana,
)); ));
} else { } else {
write_xml(&mut new_text, &Event::Text(e)); write_xml(&mut new_text, &Event::Text(e));
@ -251,6 +258,7 @@ fn add_html_furigana(
exclude_kanji: &FnvHashSet<char>, exclude_kanji: &FnvHashSet<char>,
subs: &FnvHashMap<(Cow<str>, Cow<str>), String>, subs: &FnvHashMap<(Cow<str>, Cow<str>), String>,
learner: &mut Learner, learner: &mut Learner,
use_hiragana: bool,
) -> String { ) -> String {
let mut worker = tokenizer.new_worker(); let mut worker = tokenizer.new_worker();
@ -279,9 +287,16 @@ fn add_html_furigana(
continue; continue;
} }
let kana = {
let kana = feature.split(",").nth(1).unwrap(); let kana = feature.split(",").nth(1).unwrap();
if use_hiragana {
katakana_to_hiragana_string(kana)
} else {
kana.into()
}
};
let furigana_text = apply_furigana(surface, kana, exclude_kanji); let furigana_text = apply_furigana(surface, &kana, exclude_kanji);
for (surf, furi) in furigana_text.iter() { for (surf, furi) in furigana_text.iter() {
if furi.is_empty() { if furi.is_empty() {
@ -472,6 +487,16 @@ pub fn katakana_to_hiragana(c: char) -> Option<char> {
} }
} }
pub fn katakana_to_hiragana_string(text: &str) -> String {
let mut new_text = String::new();
for c in text.chars() {
new_text.push(katakana_to_hiragana(c).unwrap_or(c));
}
new_text
}
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
@ -480,7 +505,7 @@ mod tests {
pub fn get_furigana_gen() -> &'static FuriganaGenerator { pub fn get_furigana_gen() -> &'static FuriganaGenerator {
use std::sync::OnceLock; use std::sync::OnceLock;
static FURIGEN: OnceLock<FuriganaGenerator> = OnceLock::new(); static FURIGEN: OnceLock<FuriganaGenerator> = OnceLock::new();
FURIGEN.get_or_init(|| FuriganaGenerator::new(0)) FURIGEN.get_or_init(|| FuriganaGenerator::new(0, false))
} }
#[test] #[test]