From ba5fea6e0a4508d4752bdbc2da6bff54d97154b1 Mon Sep 17 00:00:00 2001 From: Nathan Vegdahl Date: Wed, 18 Sep 2024 13:54:28 +0200 Subject: [PATCH] Attach pitch accent indicators in a more reasonable way. We give it a class so CSS styling can be used on it more easily. --- src/lib.rs | 82 +++++++++++++++++++++++++++++++++++------------------- 1 file changed, 54 insertions(+), 28 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index e60c508..80cb6cf 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -321,15 +321,11 @@ fn add_html_furigana( kana.into() }; - let furigana_text = apply_furigana(surface, &kana, exclude_kanji); + let furigana_text = apply_furigana(surface, &kana, pitches, exclude_kanji); if furigana_text.is_empty() { new_text.push_str(surface); } else { - for pitch in pitches { - new_text.push_str(&format!("{}", pitch)); - } - for (surf, furi) in furigana_text.iter() { if furi.is_empty() { new_text.push_str(surf); @@ -355,9 +351,10 @@ fn add_html_furigana( fn apply_furigana<'a>( surface: &'a str, kana: &'a str, + pitches: &[u8], exclude_kanji: &FnvHashSet, -) -> Vec<(&'a str, &'a str)> { - let mut out = Vec::new(); +) -> Vec<(&'a str, String)> { + let mut out: Vec<(&str, String)> = Vec::new(); if furigana_unneeded(surface, exclude_kanji) || !is_kana_str(kana) { return Vec::new(); @@ -378,7 +375,7 @@ fn apply_furigana<'a>( break; } } - out.push((&surface[..start_s], "")); + out.push((&surface[..start_s], "".into())); surface = &surface[start_s..]; kana = &kana[start_k..]; } @@ -395,7 +392,7 @@ fn apply_furigana<'a>( break; } } - out.push((&surface[end_s..], "")); + out.push((&surface[end_s..], "".into())); surface = &surface[..end_s]; kana = &kana[..end_k]; } @@ -420,16 +417,40 @@ fn apply_furigana<'a>( .unwrap(); // Insert the segments. - out.insert(out.len() - 2, (&surface[..si], &kana[..ki])); - out.insert(out.len() - 2, (&surface[si..(si + sc.len_utf8())], "")); + out.insert(out.len() - 2, (&surface[..si], kana[..ki].into())); + out.insert( + out.len() - 2, + (&surface[si..(si + sc.len_utf8())], "".into()), + ); surface = &surface[(si + sc.len_utf8())..]; kana = &kana[(ki + kc.len_utf8())..]; } // Left over. - out.insert(out.len() - 1, (surface, kana)); + out.insert(out.len() - 1, (surface, kana.into())); + out.retain(|(s, _)| !s.is_empty()); - out.iter().filter(|(s, _)| !s.is_empty()).copied().collect() + // Attach pitch accent indicator(s) if we have any. + if !pitches.is_empty() && pitches[0] <= 9 { + let last = out.last_mut().unwrap(); + last.1.push_str(""); + last.1.push(match pitches[0] { + 0 => '0', + 1 => '1', + 2 => '2', + 3 => '3', + 4 => '4', + 5 => '5', + 6 => '6', + 7 => '7', + 8 => '8', + 9 => '9', + _ => unreachable!(), + }); + last.1.push_str(""); + } + + out } /// Due to the way this is used, this isn't meant to be exact, but instead @@ -563,7 +584,7 @@ mod tests { fn apply_furigana_01() { let surface = "へぇ"; let kana = "ヘー"; - let pairs = apply_furigana(surface, kana, &FnvHashSet::default()); + let pairs = apply_furigana(surface, kana, &[], &FnvHashSet::default()); assert!(pairs.is_empty()); } @@ -572,7 +593,7 @@ mod tests { fn apply_furigana_02() { let surface = "へぇー"; let kana = "ヘー"; - let pairs = apply_furigana(surface, kana, &FnvHashSet::default()); + let pairs = apply_furigana(surface, kana, &[], &FnvHashSet::default()); assert!(pairs.is_empty()); } @@ -581,7 +602,7 @@ mod tests { fn apply_furigana_03() { let surface = "へ"; let kana = "え"; - let pairs = apply_furigana(surface, kana, &FnvHashSet::default()); + let pairs = apply_furigana(surface, kana, &[], &FnvHashSet::default()); assert!(pairs.is_empty()); } @@ -590,19 +611,24 @@ mod tests { fn apply_furigana_04() { let surface = "食べる"; let kana = "タベル"; - let pairs = apply_furigana(surface, kana, &FnvHashSet::default()); + let pairs = apply_furigana(surface, kana, &[], &FnvHashSet::default()); - assert_eq!(&[("食", "タ"), ("べる", "")], &pairs[..]); + assert_eq!(&[("食", "タ".into()), ("べる", "".into())], &pairs[..]); } #[test] fn apply_furigana_05() { let surface = "流れ出す"; let kana = "ながれだす"; - let pairs = apply_furigana(surface, kana, &FnvHashSet::default()); + let pairs = apply_furigana(surface, kana, &[], &FnvHashSet::default()); assert_eq!( - &[("流", "なが"), ("れ", ""), ("出", "だ"), ("す", "")], + &[ + ("流", "なが".into()), + ("れ", "".into()), + ("出", "だ".into()), + ("す", "".into()) + ], &pairs[..] ); } @@ -611,18 +637,18 @@ mod tests { fn apply_furigana_06() { let surface = "物の怪"; let kana = "もののけ"; - let pairs = apply_furigana(surface, kana, &FnvHashSet::default()); + let pairs = apply_furigana(surface, kana, &[], &FnvHashSet::default()); - assert_eq!(&[("物の怪", "もののけ")], &pairs[..]); + assert_eq!(&[("物の怪", "もののけ".into())], &pairs[..]); } #[test] fn apply_furigana_07() { let surface = "ご飯"; let kana = "ゴハン"; - let pairs = apply_furigana(surface, kana, &FnvHashSet::default()); + let pairs = apply_furigana(surface, kana, &[], &FnvHashSet::default()); - assert_eq!(&[("ご", ""), ("飯", "ハン")], &pairs[..]); + assert_eq!(&[("ご", "".into()), ("飯", "ハン".into())], &pairs[..]); } #[test] @@ -699,7 +725,7 @@ mod tests { ); assert_eq!( furi_2, - r#"2べるのはいね!"# + r#"べるのはいね!"# ); } @@ -715,13 +741,13 @@ mod tests { ); assert_eq!( gen_accent.add_html_furigana("額"), - "0ヒタイ" + "ヒタイ" ); assert_eq!(gen.add_html_furigana("他"), "ホカ"); assert_eq!( gen_accent.add_html_furigana("他"), - "0ホカ" + "ホカ" ); assert_eq!( @@ -730,7 +756,7 @@ mod tests { ); assert_eq!( gen_accent.add_html_furigana("私"), - "0ワタシ" + "ワタシ" ); } }