Add word exclude list.

2025-09-04 05:30:47 +09:00 · 2025-09-04 05:30:47 +09:00 · 40ed2fd955
commit 40ed2fd955
parent 698e0e3277
1 changed files with 32 additions and 6 deletions
--- a/src/lib.rs
+++ b/src/lib.rs
@ -37,6 +37,7 @@ pub struct FuriganaGenerator {
    tokenizer: Tokenizer,
    accent_dict: AccentDict,
    exclude_kanji: FnvHashSet<char>,
+    exclude_words: FnvHashSet<String>,
    subs: FnvHashMap<(Cow<'static, str>, Cow<'static, str>), (String, String)>,
    use_hiragana: bool,
    accent_mark: Option<String>,
@ -44,12 +45,15 @@ pub struct FuriganaGenerator {
 }

 impl FuriganaGenerator {
-    // `exclude_count`: exclude the N most frequent kanji from furigana.
+    // - `exclude_count`: exclude the N most frequent kanji from furigana.
    // Specifically, words made up *entirely* of those kanji will be excluded.
    // If a word has some kanji that aren't in that set, even if it also has
    // some that are, it will still get furigana.
+    //
+    // - `exclude_words`: don't put furigana on the words in this list.
    pub fn new(
        exclude_count: usize,
+        exclude_words: &[&str],
        use_hiragana: bool,
        accent_mark: Option<String>,
        accentless_mark: Option<String>,
@ -73,6 +77,14 @@ impl FuriganaGenerator {
            set
        };

+        let exclude_words = {
+            let mut set = FnvHashSet::default();
+            for word in exclude_words {
+                set.insert((*word).into());
+            }
+            set
+        };
+
        let subs = {
            let mut map: FnvHashMap<(Cow<str>, Cow<str>), (String, String)> = FnvHashMap::default();
            for (surface, kana, (sub_kana, sub_kana_pitch_lookup)) in COMMON_SUBS.iter().copied() {
@ -88,6 +100,7 @@ impl FuriganaGenerator {
            tokenizer: Tokenizer::new(dict),
            accent_dict: accent::build_accent_dictionary(),
            exclude_kanji: exclude_kanji,
+            exclude_words: exclude_words,
            subs: subs,
            use_hiragana: use_hiragana,
            accent_mark: accent_mark,
@ -100,6 +113,7 @@ impl FuriganaGenerator {
            tokenizer: &self.tokenizer,
            accent_dict: &self.accent_dict,
            exclude_kanji: &self.exclude_kanji,
+            exclude_words: &self.exclude_words,
            subs: &self.subs,
            learner: Learner::new(if learn_mode { 3 } else { usize::MAX }),
            use_hiragana: self.use_hiragana,
@ -113,6 +127,7 @@ pub struct Session<'a> {
    tokenizer: &'a Tokenizer,
    accent_dict: &'a AccentDict,
    exclude_kanji: &'a FnvHashSet<char>,
+    exclude_words: &'a FnvHashSet<String>,
    subs: &'a FnvHashMap<(Cow<'a, str>, Cow<'a, str>), (String, String)>,
    learner: Learner,
    use_hiragana: bool,
@ -140,6 +155,7 @@ impl<'a> Session<'a> {
            &self.tokenizer,
            &self.accent_dict,
            &self.exclude_kanji,
+            &self.exclude_words,
            &self.subs,
            &mut self.learner,
            self.use_hiragana,
@ -159,6 +175,7 @@ fn add_html_furigana_skip_already_ruby(
    tokenizer: &Tokenizer,
    accent_dict: &AccentDict,
    exclude_kanji: &FnvHashSet<char>,
+    exclude_words: &FnvHashSet<String>,
    subs: &FnvHashMap<(Cow<str>, Cow<str>), (String, String)>,
    learner: &mut Learner,
    use_hiragana: bool,
@ -201,6 +218,7 @@ fn add_html_furigana_skip_already_ruby(
                        tokenizer,
                        accent_dict,
                        exclude_kanji,
+                        exclude_words,
                        subs,
                        learner,
                        use_hiragana,
@ -288,6 +306,7 @@ fn add_html_furigana(
    tokenizer: &Tokenizer,
    accent_dict: &AccentDict,
    exclude_kanji: &FnvHashSet<char>,
+    exclude_words: &FnvHashSet<String>,
    subs: &FnvHashMap<(Cow<str>, Cow<str>), (String, String)>,
    learner: &mut Learner,
    use_hiragana: bool,
@ -344,6 +363,7 @@ fn add_html_furigana(
            surface,
            &kana,
            exclude_kanji,
+            exclude_words,
            pitches,
            accent_mark,
            accentless_mark,
@ -378,13 +398,14 @@ fn apply_furigana<'a>(
    surface: &'a str,
    kana: &'a str,
    exclude_kanji: &FnvHashSet<char>,
+    exclude_words: &FnvHashSet<String>,
    pitches: &[u8],
    accent_mark: Option<&'a str>,
    accentless_mark: Option<&'a str>,
 ) -> Vec<(String, String)> {
    let mut out: Vec<(String, String)> = Vec::new();

-    if furigana_unneeded(surface, exclude_kanji) || !is_kana_str(kana) {
+    if furigana_unneeded(surface, exclude_kanji, exclude_words) || !is_kana_str(kana) {
        return Vec::new();
    }

@ -584,8 +605,13 @@ pub fn normalize_kana(c: char) -> Option<char> {
 }

 /// Returns true if furigana definitely isn't needed.
-pub fn furigana_unneeded(text: &str, exclude_kanji: &FnvHashSet<char>) -> bool {
-    text.chars().all(|c| {
+pub fn furigana_unneeded(
+    text: &str,
+    exclude_kanji: &FnvHashSet<char>,
+    exclude_words: &FnvHashSet<String>,
+) -> bool {
+    exclude_words.contains(text)
+        || text.chars().all(|c| {
            is_kana(c) || c.is_ascii() || c.is_numeric() || c == '々' || exclude_kanji.contains(&c)
        })
 }