Add word exclude list.

This commit is contained in:
Nathan Vegdahl 2025-09-04 05:30:47 +09:00
parent 698e0e3277
commit 40ed2fd955

View File

@ -37,6 +37,7 @@ pub struct FuriganaGenerator {
tokenizer: Tokenizer, tokenizer: Tokenizer,
accent_dict: AccentDict, accent_dict: AccentDict,
exclude_kanji: FnvHashSet<char>, exclude_kanji: FnvHashSet<char>,
exclude_words: FnvHashSet<String>,
subs: FnvHashMap<(Cow<'static, str>, Cow<'static, str>), (String, String)>, subs: FnvHashMap<(Cow<'static, str>, Cow<'static, str>), (String, String)>,
use_hiragana: bool, use_hiragana: bool,
accent_mark: Option<String>, accent_mark: Option<String>,
@ -44,12 +45,15 @@ pub struct FuriganaGenerator {
} }
impl FuriganaGenerator { impl FuriganaGenerator {
// `exclude_count`: exclude the N most frequent kanji from furigana. // - `exclude_count`: exclude the N most frequent kanji from furigana.
// Specifically, words made up *entirely* of those kanji will be excluded. // Specifically, words made up *entirely* of those kanji will be excluded.
// If a word has some kanji that aren't in that set, even if it also has // If a word has some kanji that aren't in that set, even if it also has
// some that are, it will still get furigana. // some that are, it will still get furigana.
//
// - `exclude_words`: don't put furigana on the words in this list.
pub fn new( pub fn new(
exclude_count: usize, exclude_count: usize,
exclude_words: &[&str],
use_hiragana: bool, use_hiragana: bool,
accent_mark: Option<String>, accent_mark: Option<String>,
accentless_mark: Option<String>, accentless_mark: Option<String>,
@ -73,6 +77,14 @@ impl FuriganaGenerator {
set set
}; };
let exclude_words = {
let mut set = FnvHashSet::default();
for word in exclude_words {
set.insert((*word).into());
}
set
};
let subs = { let subs = {
let mut map: FnvHashMap<(Cow<str>, Cow<str>), (String, String)> = FnvHashMap::default(); let mut map: FnvHashMap<(Cow<str>, Cow<str>), (String, String)> = FnvHashMap::default();
for (surface, kana, (sub_kana, sub_kana_pitch_lookup)) in COMMON_SUBS.iter().copied() { for (surface, kana, (sub_kana, sub_kana_pitch_lookup)) in COMMON_SUBS.iter().copied() {
@ -88,6 +100,7 @@ impl FuriganaGenerator {
tokenizer: Tokenizer::new(dict), tokenizer: Tokenizer::new(dict),
accent_dict: accent::build_accent_dictionary(), accent_dict: accent::build_accent_dictionary(),
exclude_kanji: exclude_kanji, exclude_kanji: exclude_kanji,
exclude_words: exclude_words,
subs: subs, subs: subs,
use_hiragana: use_hiragana, use_hiragana: use_hiragana,
accent_mark: accent_mark, accent_mark: accent_mark,
@ -100,6 +113,7 @@ impl FuriganaGenerator {
tokenizer: &self.tokenizer, tokenizer: &self.tokenizer,
accent_dict: &self.accent_dict, accent_dict: &self.accent_dict,
exclude_kanji: &self.exclude_kanji, exclude_kanji: &self.exclude_kanji,
exclude_words: &self.exclude_words,
subs: &self.subs, subs: &self.subs,
learner: Learner::new(if learn_mode { 3 } else { usize::MAX }), learner: Learner::new(if learn_mode { 3 } else { usize::MAX }),
use_hiragana: self.use_hiragana, use_hiragana: self.use_hiragana,
@ -113,6 +127,7 @@ pub struct Session<'a> {
tokenizer: &'a Tokenizer, tokenizer: &'a Tokenizer,
accent_dict: &'a AccentDict, accent_dict: &'a AccentDict,
exclude_kanji: &'a FnvHashSet<char>, exclude_kanji: &'a FnvHashSet<char>,
exclude_words: &'a FnvHashSet<String>,
subs: &'a FnvHashMap<(Cow<'a, str>, Cow<'a, str>), (String, String)>, subs: &'a FnvHashMap<(Cow<'a, str>, Cow<'a, str>), (String, String)>,
learner: Learner, learner: Learner,
use_hiragana: bool, use_hiragana: bool,
@ -140,6 +155,7 @@ impl<'a> Session<'a> {
&self.tokenizer, &self.tokenizer,
&self.accent_dict, &self.accent_dict,
&self.exclude_kanji, &self.exclude_kanji,
&self.exclude_words,
&self.subs, &self.subs,
&mut self.learner, &mut self.learner,
self.use_hiragana, self.use_hiragana,
@ -159,6 +175,7 @@ fn add_html_furigana_skip_already_ruby(
tokenizer: &Tokenizer, tokenizer: &Tokenizer,
accent_dict: &AccentDict, accent_dict: &AccentDict,
exclude_kanji: &FnvHashSet<char>, exclude_kanji: &FnvHashSet<char>,
exclude_words: &FnvHashSet<String>,
subs: &FnvHashMap<(Cow<str>, Cow<str>), (String, String)>, subs: &FnvHashMap<(Cow<str>, Cow<str>), (String, String)>,
learner: &mut Learner, learner: &mut Learner,
use_hiragana: bool, use_hiragana: bool,
@ -201,6 +218,7 @@ fn add_html_furigana_skip_already_ruby(
tokenizer, tokenizer,
accent_dict, accent_dict,
exclude_kanji, exclude_kanji,
exclude_words,
subs, subs,
learner, learner,
use_hiragana, use_hiragana,
@ -288,6 +306,7 @@ fn add_html_furigana(
tokenizer: &Tokenizer, tokenizer: &Tokenizer,
accent_dict: &AccentDict, accent_dict: &AccentDict,
exclude_kanji: &FnvHashSet<char>, exclude_kanji: &FnvHashSet<char>,
exclude_words: &FnvHashSet<String>,
subs: &FnvHashMap<(Cow<str>, Cow<str>), (String, String)>, subs: &FnvHashMap<(Cow<str>, Cow<str>), (String, String)>,
learner: &mut Learner, learner: &mut Learner,
use_hiragana: bool, use_hiragana: bool,
@ -344,6 +363,7 @@ fn add_html_furigana(
surface, surface,
&kana, &kana,
exclude_kanji, exclude_kanji,
exclude_words,
pitches, pitches,
accent_mark, accent_mark,
accentless_mark, accentless_mark,
@ -378,13 +398,14 @@ fn apply_furigana<'a>(
surface: &'a str, surface: &'a str,
kana: &'a str, kana: &'a str,
exclude_kanji: &FnvHashSet<char>, exclude_kanji: &FnvHashSet<char>,
exclude_words: &FnvHashSet<String>,
pitches: &[u8], pitches: &[u8],
accent_mark: Option<&'a str>, accent_mark: Option<&'a str>,
accentless_mark: Option<&'a str>, accentless_mark: Option<&'a str>,
) -> Vec<(String, String)> { ) -> Vec<(String, String)> {
let mut out: Vec<(String, String)> = Vec::new(); let mut out: Vec<(String, String)> = Vec::new();
if furigana_unneeded(surface, exclude_kanji) || !is_kana_str(kana) { if furigana_unneeded(surface, exclude_kanji, exclude_words) || !is_kana_str(kana) {
return Vec::new(); return Vec::new();
} }
@ -584,10 +605,15 @@ pub fn normalize_kana(c: char) -> Option<char> {
} }
/// Returns true if furigana definitely isn't needed. /// Returns true if furigana definitely isn't needed.
pub fn furigana_unneeded(text: &str, exclude_kanji: &FnvHashSet<char>) -> bool { pub fn furigana_unneeded(
text.chars().all(|c| { text: &str,
is_kana(c) || c.is_ascii() || c.is_numeric() || c == '々' || exclude_kanji.contains(&c) exclude_kanji: &FnvHashSet<char>,
}) exclude_words: &FnvHashSet<String>,
) -> bool {
exclude_words.contains(text)
|| text.chars().all(|c| {
is_kana(c) || c.is_ascii() || c.is_numeric() || c == '々' || exclude_kanji.contains(&c)
})
} }
pub fn hiragana_to_katakana(c: char) -> Option<char> { pub fn hiragana_to_katakana(c: char) -> Option<char> {