diff --git a/src/lib.rs b/src/lib.rs
index 39f950d..b0f1752 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -23,18 +23,20 @@ const DICT: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/system.dic.lz4"));
 /// A list of words that the tokenizer insists on using the less common reading
 /// for, with the more common reading that should be substituted.
 ///
-/// (surface, kana, substitute_kana)
-const COMMON_SUBS: &[(&str, &str, &str)] = &[
-    ("額", "ガク", "ヒタイ"),
-    ("他", "タ", "ホカ"),
-    ("私", "ワタクシ", "ワタシ"),
+/// (surface, kana, (substitute_kana, substitute_pitch_lookup_kana))
+const COMMON_SUBS: &[(&str, &str, (&str, &str))] = &[
+    ("額", "ガク", ("ヒタイ", "ヒタイ")),
+    ("他", "タ", ("ホカ", "ホカ")),
+    ("私", "ワタクシ", ("ワタシ", "ワタシ")),
+    ("等", "トー", ("ナド", "ナド")),
+    ("大分", "オーイタ", ("ダイブ", "ダイブ")),
 ];
 
 pub struct FuriganaGenerator {
     tokenizer: Tokenizer,
     accent_dict: AccentDict,
     exclude_kanji: FnvHashSet<char>,
-    subs: FnvHashMap<(Cow<'static, str>, Cow<'static, str>), String>,
+    subs: FnvHashMap<(Cow<'static, str>, Cow<'static, str>), (String, String)>,
     use_hiragana: bool,
     mark_accent: bool,
 }
@@ -65,9 +67,12 @@ impl FuriganaGenerator {
         };
 
         let subs = {
-            let mut map: FnvHashMap<(Cow<str>, Cow<str>), String> = FnvHashMap::default();
-            for (surface, feature, sub_feature) in COMMON_SUBS.iter().copied() {
-                map.insert((surface.into(), feature.into()), sub_feature.into());
+            let mut map: FnvHashMap<(Cow<str>, Cow<str>), (String, String)> = FnvHashMap::default();
+            for (surface, kana, (sub_kana, sub_kana_pitch_lookup)) in COMMON_SUBS.iter().copied() {
+                map.insert(
+                    (surface.into(), kana.into()),
+                    (sub_kana.into(), sub_kana_pitch_lookup.into()),
+                );
             }
             map
         };
@@ -99,7 +104,7 @@ pub struct Session<'a> {
     tokenizer: &'a Tokenizer,
     accent_dict: &'a AccentDict,
     exclude_kanji: &'a FnvHashSet<char>,
-    subs: &'a FnvHashMap<(Cow<'a, str>, Cow<'a, str>), String>,
+    subs: &'a FnvHashMap<(Cow<'a, str>, Cow<'a, str>), (String, String)>,
     learner: Learner,
     use_hiragana: bool,
     mark_accent: bool,
@@ -143,7 +148,7 @@ fn add_html_furigana_skip_already_ruby(
     tokenizer: &Tokenizer,
     accent_dict: &AccentDict,
     exclude_kanji: &FnvHashSet<char>,
-    subs: &FnvHashMap<(Cow<str>, Cow<str>), String>,
+    subs: &FnvHashMap<(Cow<str>, Cow<str>), (String, String)>,
     learner: &mut Learner,
     use_hiragana: bool,
     mark_accent: bool,
@@ -270,7 +275,7 @@ fn add_html_furigana(
     tokenizer: &Tokenizer,
     accent_dict: &AccentDict,
     exclude_kanji: &FnvHashSet<char>,
-    subs: &FnvHashMap<(Cow<str>, Cow<str>), String>,
+    subs: &FnvHashMap<(Cow<str>, Cow<str>), (String, String)>,
     learner: &mut Learner,
     use_hiragana: bool,
     mark_accent: bool,
@@ -291,15 +296,16 @@ fn add_html_furigana(
     let kana_2 = feature.rsplit(",").nth(1).unwrap();
     let word = feature.rsplit(",").nth(2).unwrap();
 
-    let (kana, pkana) =
-        if let Some(sub_kana) = subs.get(&(Cow::from(surface), Cow::from(kana_1))) {
-            (sub_kana.as_str(), sub_kana.as_str())
-        } else {
-            (kana_1, kana_2)
-        };
+    let (kana, pitch_kana) = if let Some((sub_kana, sub_pitch_kana)) =
+        subs.get(&(Cow::from(surface), Cow::from(kana_1)))
+    {
+        (sub_kana.as_str(), sub_pitch_kana.as_str())
+    } else {
+        (kana_1, kana_2)
+    };
 
     let pitches = if mark_accent {
-        accent_dict.get(word, pkana)
+        accent_dict.get(word, pitch_kana)
     } else {
         &[]
     };
@@ -757,5 +763,16 @@ mod tests {
             gen_accent.add_html_furigana("私"),
             "<ruby>私<rt>ワタシ<span class=\"pitch_accent\">0</span></rt></ruby>"
         );
+
+        // The added 卵 triggers the parse of 等 that we're testing.
+        assert_eq!(
+            gen.add_html_furigana("卵等"),
+            "<ruby>卵<rt>タマゴ</rt></ruby><ruby>等<rt>ナド</rt></ruby>"
+        );
+
+        assert_eq!(
+            gen.add_html_furigana("大分"),
+            "<ruby>大分<rt>ダイブ</rt></ruby>"
+        );
     }
 }
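
Note for reviewers: below is a minimal, self-contained sketch of the substitution lookup this patch changes. COMMON_SUBS and the map construction are copied from the diff above; the main() driver, the final lookup, and the use of the fnv crate's FnvHashMap are illustrative assumptions, not part of the patch.

// Sketch only -- not part of the patch. Demonstrates how the new two-field
// substitution table is built and consulted.
use std::borrow::Cow;

use fnv::FnvHashMap;

/// (surface, kana, (substitute_kana, substitute_pitch_lookup_kana))
const COMMON_SUBS: &[(&str, &str, (&str, &str))] = &[
    ("額", "ガク", ("ヒタイ", "ヒタイ")),
    ("他", "タ", ("ホカ", "ホカ")),
    ("私", "ワタクシ", ("ワタシ", "ワタシ")),
    ("等", "トー", ("ナド", "ナド")),
    ("大分", "オーイタ", ("ダイブ", "ダイブ")),
];

fn main() {
    // Build the lookup table the same way the patched FuriganaGenerator::new()
    // does: keyed on (surface, tokenizer-proposed kana), valued with the
    // substitute kana plus the kana to hand to the pitch-accent dictionary.
    let mut subs: FnvHashMap<(Cow<str>, Cow<str>), (String, String)> = FnvHashMap::default();
    for (surface, kana, (sub_kana, sub_kana_pitch_lookup)) in COMMON_SUBS.iter().copied() {
        subs.insert(
            (surface.into(), kana.into()),
            (sub_kana.into(), sub_kana_pitch_lookup.into()),
        );
    }

    // The tokenizer proposes トー for 等; the table substitutes the more common
    // reading ナド, and ナド is also what the accent dictionary is queried with.
    if let Some((kana, pitch_kana)) = subs.get(&(Cow::from("等"), Cow::from("トー"))) {
        assert_eq!((kana.as_str(), pitch_kana.as_str()), ("ナド", "ナド"));
    }
}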