Mark pitch accent with a span on the accented character.

Rather than as a number, which was visually distracting.
This commit is contained in:
Nathan Vegdahl 2024-09-20 21:38:57 +02:00
parent 475a959fbe
commit 167d23077e
2 changed files with 97 additions and 43 deletions

View File

@ -59,3 +59,36 @@ impl AccentDict {
} }
} }
} }
/// Computes the byte index of the character in `kana` that corresponds to
/// the given pitch accent number.
///
/// `accent_number` is treated as a 1-based position counted in kana units,
/// where a kana immediately followed by a small ゃ/ゅ/ょ (or ャ/ュ/ョ) counts
/// together with that modifier as a single unit. The returned index points
/// at the first character of the unit at that position.
///
/// Returns `None` when `accent_number` is 0, since there is then no
/// character to point at. If `accent_number` is larger than the number of
/// units in `kana`, the result is `Some(kana.len())`, i.e. one past the end;
/// callers must bounds-check the index before slicing with it.
///
/// NOTE(review): only small ya/yu/yo are recognized as modifiers. Other
/// small kana (e.g. ァ, ィ) are counted as their own unit — confirm whether
/// that matches the accent dictionary's counting.
pub fn accent_number_to_byte_idx(kana: &str, accent_number: u8) -> Option<usize> {
    if accent_number == 0 {
        return None;
    }

    // Zero-based index of the unit we are looking for.
    let target = accent_number - 1;
    let mut current: u8 = 0;
    let mut byte_idx = 0;

    // `peekable` lets us look one character ahead without consuming it, so
    // the loop has to drive the iterator manually rather than with `for`.
    let mut iter = kana.chars().peekable();
    while let Some(c) = iter.next() {
        if current == target {
            break;
        }

        let next_is_mod = matches!(
            iter.peek(),
            Some('ゃ') | Some('ゅ') | Some('ょ') | Some('ャ') | Some('ュ') | Some('ョ')
        );

        // A unit only ends once we are past any trailing modifier: when the
        // next character is a modifier, `c` and that modifier belong to the
        // same unit, so the counter must not advance yet.
        if !next_is_mod {
            current += 1;
        }

        byte_idx += c.len_utf8();
    }

    Some(byte_idx)
}

View File

@ -360,8 +360,8 @@ fn apply_furigana<'a>(
kana: &'a str, kana: &'a str,
pitches: &[u8], pitches: &[u8],
exclude_kanji: &FnvHashSet<char>, exclude_kanji: &FnvHashSet<char>,
) -> Vec<(&'a str, String)> { ) -> Vec<(String, String)> {
let mut out: Vec<(&str, String)> = Vec::new(); let mut out: Vec<(String, String)> = Vec::new();
if furigana_unneeded(surface, exclude_kanji) || !is_kana_str(kana) { if furigana_unneeded(surface, exclude_kanji) || !is_kana_str(kana) {
return Vec::new(); return Vec::new();
@ -382,7 +382,7 @@ fn apply_furigana<'a>(
break; break;
} }
} }
out.push((&surface[..start_s], "".into())); out.push((surface[..start_s].into(), "".into()));
surface = &surface[start_s..]; surface = &surface[start_s..];
kana = &kana[start_k..]; kana = &kana[start_k..];
} }
@ -399,7 +399,7 @@ fn apply_furigana<'a>(
break; break;
} }
} }
out.push((&surface[end_s..], "".into())); out.push((surface[end_s..].into(), "".into()));
surface = &surface[..end_s]; surface = &surface[..end_s];
kana = &kana[..end_k]; kana = &kana[..end_k];
} }
@ -424,37 +424,33 @@ fn apply_furigana<'a>(
.unwrap(); .unwrap();
// Insert the segments. // Insert the segments.
out.insert(out.len() - 2, (&surface[..si], kana[..ki].into())); out.insert(out.len() - 2, (surface[..si].into(), kana[..ki].into()));
out.insert( out.insert(
out.len() - 2, out.len() - 2,
(&surface[si..(si + sc.len_utf8())], "".into()), (surface[si..(si + sc.len_utf8())].into(), "".into()),
); );
surface = &surface[(si + sc.len_utf8())..]; surface = &surface[(si + sc.len_utf8())..];
kana = &kana[(ki + kc.len_utf8())..]; kana = &kana[(ki + kc.len_utf8())..];
} }
// Left over. // Left over.
out.insert(out.len() - 1, (surface, kana.into())); out.insert(out.len() - 1, (surface.into(), kana.into()));
out.retain(|(s, _)| !s.is_empty()); out.retain(|(s, _)| !s.is_empty());
// Attach pitch accent indicator(s) if we have any. // Attach pitch accent indicator if we have any.
if !pitches.is_empty() && pitches[0] <= 9 { if !pitches.is_empty() && pitches[0] > 0 {
let last = out.last_mut().unwrap(); let mut byte_idx = accent::accent_number_to_byte_idx(kana, pitches[0]).unwrap();
last.1.push_str("<span class=\"pitch\">"); for (ref mut s, ref mut k) in out.iter_mut() {
last.1.push(match pitches[0] { let text = if k.is_empty() { s } else { k };
0 => '', if byte_idx < text.len()
1 => '', && text.is_char_boundary(byte_idx)
2 => '', && text.is_char_boundary(byte_idx + 3)
3 => '', {
4 => '', text.insert_str(byte_idx + 3, "</span>");
5 => '', text.insert_str(byte_idx, "<span class=\"pitch_accent\">");
6 => '', }
7 => '', byte_idx -= text.len();
8 => '', }
9 => '',
_ => unreachable!(),
});
last.1.push_str("</span>");
} }
out out
@ -620,7 +616,10 @@ mod tests {
let kana = "タベル"; let kana = "タベル";
let pairs = apply_furigana(surface, kana, &[], &FnvHashSet::default()); let pairs = apply_furigana(surface, kana, &[], &FnvHashSet::default());
assert_eq!(&[("", "".into()), ("べる", "".into())], &pairs[..]); assert_eq!(
&[("".into(), "".into()), ("べる".into(), "".into())],
&pairs[..]
);
} }
#[test] #[test]
@ -631,10 +630,10 @@ mod tests {
assert_eq!( assert_eq!(
&[ &[
("", "なが".into()), ("".into(), "なが".into()),
("", "".into()), ("".into(), "".into()),
("", "".into()), ("".into(), "".into()),
("", "".into()) ("".into(), "".into())
], ],
&pairs[..] &pairs[..]
); );
@ -646,7 +645,7 @@ mod tests {
let kana = "もののけ"; let kana = "もののけ";
let pairs = apply_furigana(surface, kana, &[], &FnvHashSet::default()); let pairs = apply_furigana(surface, kana, &[], &FnvHashSet::default());
assert_eq!(&[("物の怪", "もののけ".into())], &pairs[..]); assert_eq!(&[("物の怪".into(), "もののけ".into())], &pairs[..]);
} }
#[test] #[test]
@ -655,7 +654,10 @@ mod tests {
let kana = "ゴハン"; let kana = "ゴハン";
let pairs = apply_furigana(surface, kana, &[], &FnvHashSet::default()); let pairs = apply_furigana(surface, kana, &[], &FnvHashSet::default());
assert_eq!(&[("", "".into()), ("", "ハン".into())], &pairs[..]); assert_eq!(
&[("".into(), "".into()), ("".into(), "ハン".into())],
&pairs[..]
);
} }
#[test] #[test]
@ -732,7 +734,7 @@ mod tests {
); );
assert_eq!( assert_eq!(
furi_2, furi_2,
r#"<sup class="食う"><ruby>食<rt>タ</rt></ruby><ruby>べる<rt><span class="pitch"></span></rt></ruby></sup>のは<ruby>良</ruby>いね!<hi />"# r#"<sup class="食う"><ruby>食<rt>タ</rt></ruby><span class="pitch_accent">べ</span>る</sup>のは<ruby>良</ruby>いね!<hi />"#
); );
} }
@ -744,47 +746,66 @@ mod tests {
assert_eq!( assert_eq!(
gen.add_html_furigana(""), gen.add_html_furigana(""),
"<ruby>額<rt>ヒタイ</rt></ruby>" r#"<ruby>額<rt>ヒタイ</rt></ruby>"#
); );
assert_eq!( assert_eq!(
gen_accent.add_html_furigana(""), gen_accent.add_html_furigana(""),
"<ruby>額<rt>ヒタイ<span class=\"pitch\"></span></rt></ruby>" r#"<ruby>額<rt>ヒタイ</rt></ruby>"#
); );
assert_eq!(gen.add_html_furigana(""), "<ruby>他<rt>ホカ</rt></ruby>"); assert_eq!(
gen.add_html_furigana(""),
r#"<ruby>他<rt>ホカ</rt></ruby>"#
);
assert_eq!( assert_eq!(
gen_accent.add_html_furigana(""), gen_accent.add_html_furigana(""),
"<ruby>他<rt>ホカ<span class=\"pitch\"></span></rt></ruby>" r#"<ruby>他<rt>ホカ</rt></ruby>"#
); );
assert_eq!( assert_eq!(
gen.add_html_furigana(""), gen.add_html_furigana(""),
"<ruby>私<rt>ワタシ</rt></ruby>" r#"<ruby>私<rt>ワタシ</rt></ruby>"#
); );
assert_eq!( assert_eq!(
gen_accent.add_html_furigana(""), gen_accent.add_html_furigana(""),
"<ruby>私<rt>ワタシ<span class=\"pitch\"></span></rt></ruby>" r#"<ruby>私<rt>ワタシ</rt></ruby>"#
); );
// The added 卵 is to trigger the parse we're testing of 等. // The added 卵 is to trigger the parse we're testing of 等.
assert_eq!( assert_eq!(
gen.add_html_furigana("卵等"), gen.add_html_furigana("卵等"),
"<ruby>卵<rt>タマゴ</rt></ruby><ruby>等<rt>ナド</rt></ruby>" r#"<ruby>卵<rt>タマゴ</rt></ruby><ruby>等<rt>ナド</rt></ruby>"#
);
assert_eq!(
gen_accent.add_html_furigana("卵等"),
r#"<ruby>卵<rt>タマゴ</rt></ruby><ruby>等<rt><span class="pitch_accent">ナ</span>ド</rt></ruby>"#
); );
assert_eq!( assert_eq!(
gen.add_html_furigana("大分"), gen.add_html_furigana("大分"),
"<ruby>大分<rt>ダイブ</rt></ruby>" r#"<ruby>大分<rt>ダイブ</rt></ruby>"#
);
assert_eq!(
gen_accent.add_html_furigana("大分"),
r#"<ruby>大分<rt>ダイブ</rt></ruby>"#
); );
assert_eq!( assert_eq!(
gen.add_html_furigana("日本"), gen.add_html_furigana("日本"),
"<ruby>日本<rt>ニホン</rt></ruby>" r#"<ruby>日本<rt>ニホン</rt></ruby>"#
);
assert_eq!(
gen_accent.add_html_furigana("日本"),
r#"<ruby>日本<rt>ニホン</rt></ruby>"#
); );
assert_eq!( assert_eq!(
gen.add_html_furigana("日本人"), gen.add_html_furigana("日本人"),
"<ruby>日本人<rt>ニホンジン</rt></ruby>" r#"<ruby>日本人<rt>ニホンジン</rt></ruby>"#
);
assert_eq!(
gen_accent.add_html_furigana("日本人"),
r#"<ruby>日本人<rt>ニホンジン</rt></ruby>"#
); );
} }
} }