From 5c7c167d948e8b1ea558d838f48d0ff6db071e69 Mon Sep 17 00:00:00 2001 From: Nathan Vegdahl Date: Wed, 18 Sep 2024 13:54:28 +0200 Subject: [PATCH] Attach pitch accent indicators in a more reasonable way. --- src/lib.rs | 77 ++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 49 insertions(+), 28 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index e60c508..cd28cd8 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -321,15 +321,11 @@ fn add_html_furigana( kana.into() }; - let furigana_text = apply_furigana(surface, &kana, exclude_kanji); + let furigana_text = apply_furigana(surface, &kana, pitches, exclude_kanji); if furigana_text.is_empty() { new_text.push_str(surface); } else { - for pitch in pitches { - new_text.push_str(&format!("{}", pitch)); - } - for (surf, furi) in furigana_text.iter() { if furi.is_empty() { new_text.push_str(surf); @@ -355,9 +351,10 @@ fn add_html_furigana( fn apply_furigana<'a>( surface: &'a str, kana: &'a str, + pitches: &[u8], exclude_kanji: &FnvHashSet, -) -> Vec<(&'a str, &'a str)> { - let mut out = Vec::new(); +) -> Vec<(&'a str, String)> { + let mut out: Vec<(&str, String)> = Vec::new(); if furigana_unneeded(surface, exclude_kanji) || !is_kana_str(kana) { return Vec::new(); @@ -378,7 +375,7 @@ fn apply_furigana<'a>( break; } } - out.push((&surface[..start_s], "")); + out.push((&surface[..start_s], "".into())); surface = &surface[start_s..]; kana = &kana[start_k..]; } @@ -395,7 +392,7 @@ fn apply_furigana<'a>( break; } } - out.push((&surface[end_s..], "")); + out.push((&surface[end_s..], "".into())); surface = &surface[..end_s]; kana = &kana[..end_k]; } @@ -420,16 +417,35 @@ fn apply_furigana<'a>( .unwrap(); // Insert the segments. - out.insert(out.len() - 2, (&surface[..si], &kana[..ki])); - out.insert(out.len() - 2, (&surface[si..(si + sc.len_utf8())], "")); + out.insert(out.len() - 2, (&surface[..si], kana[..ki].into())); + out.insert( + out.len() - 2, + (&surface[si..(si + sc.len_utf8())], "".into()), + ); surface = &surface[(si + sc.len_utf8())..]; kana = &kana[(ki + kc.len_utf8())..]; } // Left over. - out.insert(out.len() - 1, (surface, kana)); + out.insert(out.len() - 1, (surface, kana.into())); - out.iter().filter(|(s, _)| !s.is_empty()).copied().collect() + out.retain(|(s, _)| !s.is_empty()); + + // Attach pitch accent indicator(s) if we have any. + if !pitches.is_empty() { + let last = out.last_mut().unwrap(); + last.1.push_str(" "); + for (i, pitch) in pitches.iter().enumerate() { + last.1.push_str(&format!("{}", pitch)); + // If it's not the last one. + if (i + 1) < pitches.len() { + last.1.push_str(","); + } + } + last.1.push_str(" "); + } + + out } /// Due to the way this is used, this isn't meant to be exact, but instead @@ -563,7 +579,7 @@ mod tests { fn apply_furigana_01() { let surface = "へぇ"; let kana = "ヘー"; - let pairs = apply_furigana(surface, kana, &FnvHashSet::default()); + let pairs = apply_furigana(surface, kana, &[], &FnvHashSet::default()); assert!(pairs.is_empty()); } @@ -572,7 +588,7 @@ mod tests { fn apply_furigana_02() { let surface = "へぇー"; let kana = "ヘー"; - let pairs = apply_furigana(surface, kana, &FnvHashSet::default()); + let pairs = apply_furigana(surface, kana, &[], &FnvHashSet::default()); assert!(pairs.is_empty()); } @@ -581,7 +597,7 @@ mod tests { fn apply_furigana_03() { let surface = "へ"; let kana = "え"; - let pairs = apply_furigana(surface, kana, &FnvHashSet::default()); + let pairs = apply_furigana(surface, kana, &[], &FnvHashSet::default()); assert!(pairs.is_empty()); } @@ -590,19 +606,24 @@ mod tests { fn apply_furigana_04() { let surface = "食べる"; let kana = "タベル"; - let pairs = apply_furigana(surface, kana, &FnvHashSet::default()); + let pairs = apply_furigana(surface, kana, &[], &FnvHashSet::default()); - assert_eq!(&[("食", "タ"), ("べる", "")], &pairs[..]); + assert_eq!(&[("食", "タ".into()), ("べる", "".into())], &pairs[..]); } #[test] fn apply_furigana_05() { let surface = "流れ出す"; let kana = "ながれだす"; - let pairs = apply_furigana(surface, kana, &FnvHashSet::default()); + let pairs = apply_furigana(surface, kana, &[], &FnvHashSet::default()); assert_eq!( - &[("流", "なが"), ("れ", ""), ("出", "だ"), ("す", "")], + &[ + ("流", "なが".into()), + ("れ", "".into()), + ("出", "だ".into()), + ("す", "".into()) + ], &pairs[..] ); } @@ -611,18 +632,18 @@ mod tests { fn apply_furigana_06() { let surface = "物の怪"; let kana = "もののけ"; - let pairs = apply_furigana(surface, kana, &FnvHashSet::default()); + let pairs = apply_furigana(surface, kana, &[], &FnvHashSet::default()); - assert_eq!(&[("物の怪", "もののけ")], &pairs[..]); + assert_eq!(&[("物の怪", "もののけ".into())], &pairs[..]); } #[test] fn apply_furigana_07() { let surface = "ご飯"; let kana = "ゴハン"; - let pairs = apply_furigana(surface, kana, &FnvHashSet::default()); + let pairs = apply_furigana(surface, kana, &[], &FnvHashSet::default()); - assert_eq!(&[("ご", ""), ("飯", "ハン")], &pairs[..]); + assert_eq!(&[("ご", "".into()), ("飯", "ハン".into())], &pairs[..]); } #[test] @@ -699,7 +720,7 @@ mod tests { ); assert_eq!( furi_2, - r#"2べるのはいね!"# + r#"べる 2 のはいね!"# ); } @@ -715,13 +736,13 @@ mod tests { ); assert_eq!( gen_accent.add_html_furigana("額"), - "0ヒタイ" + "ヒタイ 0 " ); assert_eq!(gen.add_html_furigana("他"), "ホカ"); assert_eq!( gen_accent.add_html_furigana("他"), - "0ホカ" + "ホカ 0 " ); assert_eq!( @@ -730,7 +751,7 @@ mod tests { ); assert_eq!( gen_accent.add_html_furigana("私"), - "0ワタシ" + "ワタシ 0 " ); } }