Attach pitch accent indicators in a more reasonable way.

This commit is contained in:
Nathan Vegdahl 2024-09-18 13:54:28 +02:00
parent adb58983a7
commit 5c7c167d94

View File

@@ -321,15 +321,11 @@ fn add_html_furigana(
kana.into()
};
let furigana_text = apply_furigana(surface, &kana, exclude_kanji);
let furigana_text = apply_furigana(surface, &kana, pitches, exclude_kanji);
if furigana_text.is_empty() {
new_text.push_str(surface);
} else {
for pitch in pitches {
new_text.push_str(&format!("<sup>{}</sup>", pitch));
}
for (surf, furi) in furigana_text.iter() {
if furi.is_empty() {
new_text.push_str(surf);
@@ -355,9 +351,10 @@ fn add_html_furigana(
fn apply_furigana<'a>(
surface: &'a str,
kana: &'a str,
pitches: &[u8],
exclude_kanji: &FnvHashSet<char>,
) -> Vec<(&'a str, &'a str)> {
let mut out = Vec::new();
) -> Vec<(&'a str, String)> {
let mut out: Vec<(&str, String)> = Vec::new();
if furigana_unneeded(surface, exclude_kanji) || !is_kana_str(kana) {
return Vec::new();
@@ -378,7 +375,7 @@ fn apply_furigana<'a>(
break;
}
}
out.push((&surface[..start_s], ""));
out.push((&surface[..start_s], "".into()));
surface = &surface[start_s..];
kana = &kana[start_k..];
}
@@ -395,7 +392,7 @@ fn apply_furigana<'a>(
break;
}
}
out.push((&surface[end_s..], ""));
out.push((&surface[end_s..], "".into()));
surface = &surface[..end_s];
kana = &kana[..end_k];
}
@@ -420,16 +417,35 @@ fn apply_furigana<'a>(
.unwrap();
// Insert the segments.
out.insert(out.len() - 2, (&surface[..si], &kana[..ki]));
out.insert(out.len() - 2, (&surface[si..(si + sc.len_utf8())], ""));
out.insert(out.len() - 2, (&surface[..si], kana[..ki].into()));
out.insert(
out.len() - 2,
(&surface[si..(si + sc.len_utf8())], "".into()),
);
surface = &surface[(si + sc.len_utf8())..];
kana = &kana[(ki + kc.len_utf8())..];
}
// Left over.
out.insert(out.len() - 1, (surface, kana));
out.insert(out.len() - 1, (surface, kana.into()));
out.iter().filter(|(s, _)| !s.is_empty()).copied().collect()
out.retain(|(s, _)| !s.is_empty());
// Attach pitch accent indicator(s) if we have any.
if !pitches.is_empty() {
let last = out.last_mut().unwrap();
last.1.push_str(" <sup>");
for (i, pitch) in pitches.iter().enumerate() {
last.1.push_str(&format!("{}", pitch));
// If it's not the last one.
if (i + 1) < pitches.len() {
last.1.push_str(",");
}
}
last.1.push_str("</sup> ");
}
out
}
/// Due to the way this is used, this isn't meant to be exact, but instead
@@ -563,7 +579,7 @@ mod tests {
fn apply_furigana_01() {
let surface = "へぇ";
let kana = "ヘー";
let pairs = apply_furigana(surface, kana, &FnvHashSet::default());
let pairs = apply_furigana(surface, kana, &[], &FnvHashSet::default());
assert!(pairs.is_empty());
}
@@ -572,7 +588,7 @@ mod tests {
fn apply_furigana_02() {
let surface = "へぇー";
let kana = "ヘー";
let pairs = apply_furigana(surface, kana, &FnvHashSet::default());
let pairs = apply_furigana(surface, kana, &[], &FnvHashSet::default());
assert!(pairs.is_empty());
}
@@ -581,7 +597,7 @@ mod tests {
fn apply_furigana_03() {
let surface = "";
let kana = "";
let pairs = apply_furigana(surface, kana, &FnvHashSet::default());
let pairs = apply_furigana(surface, kana, &[], &FnvHashSet::default());
assert!(pairs.is_empty());
}
@@ -590,19 +606,24 @@ mod tests {
fn apply_furigana_04() {
let surface = "食べる";
let kana = "タベル";
let pairs = apply_furigana(surface, kana, &FnvHashSet::default());
let pairs = apply_furigana(surface, kana, &[], &FnvHashSet::default());
assert_eq!(&[("食", "タ"), ("べる", "")], &pairs[..]);
assert_eq!(&[("食", "タ".into()), ("べる", "".into())], &pairs[..]);
}
#[test]
fn apply_furigana_05() {
let surface = "流れ出す";
let kana = "ながれだす";
let pairs = apply_furigana(surface, kana, &FnvHashSet::default());
let pairs = apply_furigana(surface, kana, &[], &FnvHashSet::default());
assert_eq!(
&[("流", "なが"), ("れ", ""), ("出", "だ"), ("す", "")],
&[
("流", "なが".into()),
("れ", "".into()),
("出", "だ".into()),
("す", "".into())
],
&pairs[..]
);
}
@@ -611,18 +632,18 @@ mod tests {
fn apply_furigana_06() {
let surface = "物の怪";
let kana = "もののけ";
let pairs = apply_furigana(surface, kana, &FnvHashSet::default());
let pairs = apply_furigana(surface, kana, &[], &FnvHashSet::default());
assert_eq!(&[("物の怪", "もののけ")], &pairs[..]);
assert_eq!(&[("物の怪", "もののけ".into())], &pairs[..]);
}
#[test]
fn apply_furigana_07() {
let surface = "ご飯";
let kana = "ゴハン";
let pairs = apply_furigana(surface, kana, &FnvHashSet::default());
let pairs = apply_furigana(surface, kana, &[], &FnvHashSet::default());
assert_eq!(&[("ご", ""), ("飯", "ハン")], &pairs[..]);
assert_eq!(&[("ご", "".into()), ("飯", "ハン".into())], &pairs[..]);
}
#[test]
@@ -699,7 +720,7 @@ mod tests {
);
assert_eq!(
furi_2,
r#"<sup class="食う"><sup>2</sup><ruby>食<rt>タ</rt></ruby>べる</sup>のは<ruby>良</ruby>いね!<hi />"#
r#"<sup class="食う"><ruby>食<rt>タ</rt></ruby><ruby>べる<rt> <sup>2</sup> </rt></ruby></sup>のは<ruby>良</ruby>いね!<hi />"#
);
}
@@ -715,13 +736,13 @@ mod tests {
);
assert_eq!(
gen_accent.add_html_furigana("額"),
"<sup>0</sup><ruby>額<rt>ヒタイ</rt></ruby>"
"<ruby>額<rt>ヒタイ <sup>0</sup> </rt></ruby>"
);
assert_eq!(gen.add_html_furigana("他"), "<ruby>他<rt>ホカ</rt></ruby>");
assert_eq!(
gen_accent.add_html_furigana("他"),
"<sup>0</sup><ruby>他<rt>ホカ</rt></ruby>"
"<ruby>他<rt>ホカ <sup>0</sup> </rt></ruby>"
);
assert_eq!(
@@ -730,7 +751,7 @@ mod tests {
);
assert_eq!(
gen_accent.add_html_furigana("私"),
"<sup>0</sup><ruby>私<rt>ワタシ</rt></ruby>"
"<ruby>私<rt>ワタシ <sup>0</sup> </rt></ruby>"
);
}
}