Mark pitch accent with a span on the accented character.
Rather than as a number, which was visually distracting.
This commit is contained in:
parent
475a959fbe
commit
167d23077e
|
@ -59,3 +59,36 @@ impl AccentDict {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Computes the byte index of the character in `kana` that corresponds to the
/// given pitch accent number.
///
/// The accent number is treated as a 1-based count of sound units in `kana`.
/// Each character counts as one unit, except that a small ya/yu/yo
/// (`ゃ ゅ ょ ャ ュ ョ`) is merged with the character preceding it, so a pair
/// such as `きょ` counts once and the returned index points at its first
/// character.
///
/// Returns `None` when `accent_number` is 0 (no character is marked).  When
/// `accent_number` is larger than the number of units in `kana`, the result
/// is `Some(kana.len())`, i.e. one past the last character; callers must
/// bounds-check before slicing with it.
pub fn accent_number_to_byte_idx(kana: &str, accent_number: u8) -> Option<usize> {
    // Zero-based index of the unit we are looking for; accent number 0 has
    // no corresponding character at all.
    let target = usize::from(accent_number.checked_sub(1)?);

    let mut unit = 0;
    let mut chars = kana.char_indices().peekable();
    while let Some((byte_idx, _)) = chars.next() {
        if unit == target {
            return Some(byte_idx);
        }

        // A following small ya/yu/yo belongs to the same unit as this
        // character, so the unit is only complete when no modifier follows.
        let next_is_mod = matches!(
            chars.peek(),
            Some((_, 'ゃ' | 'ゅ' | 'ょ' | 'ャ' | 'ュ' | 'ョ'))
        );
        if !next_is_mod {
            unit += 1;
        }
    }

    // Ran off the end without reaching the requested unit.
    Some(kana.len())
}
|
||||||
|
|
107
src/lib.rs
107
src/lib.rs
|
@ -360,8 +360,8 @@ fn apply_furigana<'a>(
|
||||||
kana: &'a str,
|
kana: &'a str,
|
||||||
pitches: &[u8],
|
pitches: &[u8],
|
||||||
exclude_kanji: &FnvHashSet<char>,
|
exclude_kanji: &FnvHashSet<char>,
|
||||||
) -> Vec<(&'a str, String)> {
|
) -> Vec<(String, String)> {
|
||||||
let mut out: Vec<(&str, String)> = Vec::new();
|
let mut out: Vec<(String, String)> = Vec::new();
|
||||||
|
|
||||||
if furigana_unneeded(surface, exclude_kanji) || !is_kana_str(kana) {
|
if furigana_unneeded(surface, exclude_kanji) || !is_kana_str(kana) {
|
||||||
return Vec::new();
|
return Vec::new();
|
||||||
|
@ -382,7 +382,7 @@ fn apply_furigana<'a>(
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
out.push((&surface[..start_s], "".into()));
|
out.push((surface[..start_s].into(), "".into()));
|
||||||
surface = &surface[start_s..];
|
surface = &surface[start_s..];
|
||||||
kana = &kana[start_k..];
|
kana = &kana[start_k..];
|
||||||
}
|
}
|
||||||
|
@ -399,7 +399,7 @@ fn apply_furigana<'a>(
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
out.push((&surface[end_s..], "".into()));
|
out.push((surface[end_s..].into(), "".into()));
|
||||||
surface = &surface[..end_s];
|
surface = &surface[..end_s];
|
||||||
kana = &kana[..end_k];
|
kana = &kana[..end_k];
|
||||||
}
|
}
|
||||||
|
@ -424,37 +424,33 @@ fn apply_furigana<'a>(
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
// Insert the segments.
|
// Insert the segments.
|
||||||
out.insert(out.len() - 2, (&surface[..si], kana[..ki].into()));
|
out.insert(out.len() - 2, (surface[..si].into(), kana[..ki].into()));
|
||||||
out.insert(
|
out.insert(
|
||||||
out.len() - 2,
|
out.len() - 2,
|
||||||
(&surface[si..(si + sc.len_utf8())], "".into()),
|
(surface[si..(si + sc.len_utf8())].into(), "".into()),
|
||||||
);
|
);
|
||||||
surface = &surface[(si + sc.len_utf8())..];
|
surface = &surface[(si + sc.len_utf8())..];
|
||||||
kana = &kana[(ki + kc.len_utf8())..];
|
kana = &kana[(ki + kc.len_utf8())..];
|
||||||
}
|
}
|
||||||
|
|
||||||
// Left over.
|
// Left over.
|
||||||
out.insert(out.len() - 1, (surface, kana.into()));
|
out.insert(out.len() - 1, (surface.into(), kana.into()));
|
||||||
out.retain(|(s, _)| !s.is_empty());
|
out.retain(|(s, _)| !s.is_empty());
|
||||||
|
|
||||||
// Attach pitch accent indicator(s) if we have any.
|
// Attach pitch accent indicator if we have any.
|
||||||
if !pitches.is_empty() && pitches[0] <= 9 {
|
if !pitches.is_empty() && pitches[0] > 0 {
|
||||||
let last = out.last_mut().unwrap();
|
let mut byte_idx = accent::accent_number_to_byte_idx(kana, pitches[0]).unwrap();
|
||||||
last.1.push_str("<span class=\"pitch\">");
|
for (ref mut s, ref mut k) in out.iter_mut() {
|
||||||
last.1.push(match pitches[0] {
|
let text = if k.is_empty() { s } else { k };
|
||||||
0 => '0',
|
if byte_idx < text.len()
|
||||||
1 => '1',
|
&& text.is_char_boundary(byte_idx)
|
||||||
2 => '2',
|
&& text.is_char_boundary(byte_idx + 3)
|
||||||
3 => '3',
|
{
|
||||||
4 => '4',
|
text.insert_str(byte_idx + 3, "</span>");
|
||||||
5 => '5',
|
text.insert_str(byte_idx, "<span class=\"pitch_accent\">");
|
||||||
6 => '6',
|
}
|
||||||
7 => '7',
|
byte_idx -= text.len();
|
||||||
8 => '8',
|
}
|
||||||
9 => '9',
|
|
||||||
_ => unreachable!(),
|
|
||||||
});
|
|
||||||
last.1.push_str("</span>");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
out
|
out
|
||||||
|
@ -620,7 +616,10 @@ mod tests {
|
||||||
let kana = "タベル";
|
let kana = "タベル";
|
||||||
let pairs = apply_furigana(surface, kana, &[], &FnvHashSet::default());
|
let pairs = apply_furigana(surface, kana, &[], &FnvHashSet::default());
|
||||||
|
|
||||||
assert_eq!(&[("食", "タ".into()), ("べる", "".into())], &pairs[..]);
|
assert_eq!(
|
||||||
|
&[("食".into(), "タ".into()), ("べる".into(), "".into())],
|
||||||
|
&pairs[..]
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
@ -631,10 +630,10 @@ mod tests {
|
||||||
|
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
&[
|
&[
|
||||||
("流", "なが".into()),
|
("流".into(), "なが".into()),
|
||||||
("れ", "".into()),
|
("れ".into(), "".into()),
|
||||||
("出", "だ".into()),
|
("出".into(), "だ".into()),
|
||||||
("す", "".into())
|
("す".into(), "".into())
|
||||||
],
|
],
|
||||||
&pairs[..]
|
&pairs[..]
|
||||||
);
|
);
|
||||||
|
@ -646,7 +645,7 @@ mod tests {
|
||||||
let kana = "もののけ";
|
let kana = "もののけ";
|
||||||
let pairs = apply_furigana(surface, kana, &[], &FnvHashSet::default());
|
let pairs = apply_furigana(surface, kana, &[], &FnvHashSet::default());
|
||||||
|
|
||||||
assert_eq!(&[("物の怪", "もののけ".into())], &pairs[..]);
|
assert_eq!(&[("物の怪".into(), "もののけ".into())], &pairs[..]);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
@ -655,7 +654,10 @@ mod tests {
|
||||||
let kana = "ゴハン";
|
let kana = "ゴハン";
|
||||||
let pairs = apply_furigana(surface, kana, &[], &FnvHashSet::default());
|
let pairs = apply_furigana(surface, kana, &[], &FnvHashSet::default());
|
||||||
|
|
||||||
assert_eq!(&[("ご", "".into()), ("飯", "ハン".into())], &pairs[..]);
|
assert_eq!(
|
||||||
|
&[("ご".into(), "".into()), ("飯".into(), "ハン".into())],
|
||||||
|
&pairs[..]
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
@ -732,7 +734,7 @@ mod tests {
|
||||||
);
|
);
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
furi_2,
|
furi_2,
|
||||||
r#"<sup class="食う"><ruby>食<rt>タ</rt></ruby><ruby>べる<rt><span class="pitch">2</span></rt></ruby></sup>のは<ruby>良</ruby>いね!<hi />"#
|
r#"<sup class="食う"><ruby>食<rt>タ</rt></ruby><span class="pitch_accent">べ</span>る</sup>のは<ruby>良</ruby>いね!<hi />"#
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -744,47 +746,66 @@ mod tests {
|
||||||
|
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
gen.add_html_furigana("額"),
|
gen.add_html_furigana("額"),
|
||||||
"<ruby>額<rt>ヒタイ</rt></ruby>"
|
r#"<ruby>額<rt>ヒタイ</rt></ruby>"#
|
||||||
);
|
);
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
gen_accent.add_html_furigana("額"),
|
gen_accent.add_html_furigana("額"),
|
||||||
"<ruby>額<rt>ヒタイ<span class=\"pitch\">0</span></rt></ruby>"
|
r#"<ruby>額<rt>ヒタイ</rt></ruby>"#
|
||||||
);
|
);
|
||||||
|
|
||||||
assert_eq!(gen.add_html_furigana("他"), "<ruby>他<rt>ホカ</rt></ruby>");
|
assert_eq!(
|
||||||
|
gen.add_html_furigana("他"),
|
||||||
|
r#"<ruby>他<rt>ホカ</rt></ruby>"#
|
||||||
|
);
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
gen_accent.add_html_furigana("他"),
|
gen_accent.add_html_furigana("他"),
|
||||||
"<ruby>他<rt>ホカ<span class=\"pitch\">0</span></rt></ruby>"
|
r#"<ruby>他<rt>ホカ</rt></ruby>"#
|
||||||
);
|
);
|
||||||
|
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
gen.add_html_furigana("私"),
|
gen.add_html_furigana("私"),
|
||||||
"<ruby>私<rt>ワタシ</rt></ruby>"
|
r#"<ruby>私<rt>ワタシ</rt></ruby>"#
|
||||||
);
|
);
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
gen_accent.add_html_furigana("私"),
|
gen_accent.add_html_furigana("私"),
|
||||||
"<ruby>私<rt>ワタシ<span class=\"pitch\">0</span></rt></ruby>"
|
r#"<ruby>私<rt>ワタシ</rt></ruby>"#
|
||||||
);
|
);
|
||||||
|
|
||||||
// The added 卵 is to trigger the parse we're testing of 等.
|
// The added 卵 is to trigger the parse we're testing of 等.
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
gen.add_html_furigana("卵等"),
|
gen.add_html_furigana("卵等"),
|
||||||
"<ruby>卵<rt>タマゴ</rt></ruby><ruby>等<rt>ナド</rt></ruby>"
|
r#"<ruby>卵<rt>タマゴ</rt></ruby><ruby>等<rt>ナド</rt></ruby>"#
|
||||||
|
);
|
||||||
|
assert_eq!(
|
||||||
|
gen_accent.add_html_furigana("卵等"),
|
||||||
|
r#"<ruby>卵<rt>タマゴ</rt></ruby><ruby>等<rt><span class="pitch_accent">ナ</span>ド</rt></ruby>"#
|
||||||
);
|
);
|
||||||
|
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
gen.add_html_furigana("大分"),
|
gen.add_html_furigana("大分"),
|
||||||
"<ruby>大分<rt>ダイブ</rt></ruby>"
|
r#"<ruby>大分<rt>ダイブ</rt></ruby>"#
|
||||||
|
);
|
||||||
|
assert_eq!(
|
||||||
|
gen_accent.add_html_furigana("大分"),
|
||||||
|
r#"<ruby>大分<rt>ダイブ</rt></ruby>"#
|
||||||
);
|
);
|
||||||
|
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
gen.add_html_furigana("日本"),
|
gen.add_html_furigana("日本"),
|
||||||
"<ruby>日本<rt>ニホン</rt></ruby>"
|
r#"<ruby>日本<rt>ニホン</rt></ruby>"#
|
||||||
|
);
|
||||||
|
assert_eq!(
|
||||||
|
gen_accent.add_html_furigana("日本"),
|
||||||
|
r#"<ruby>日本<rt>ニホン</rt></ruby>"#
|
||||||
);
|
);
|
||||||
|
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
gen.add_html_furigana("日本人"),
|
gen.add_html_furigana("日本人"),
|
||||||
"<ruby>日本人<rt>ニホンジン</rt></ruby>"
|
r#"<ruby>日本人<rt>ニホンジン</rt></ruby>"#
|
||||||
|
);
|
||||||
|
assert_eq!(
|
||||||
|
gen_accent.add_html_furigana("日本人"),
|
||||||
|
r#"<ruby>日本人<rt>ニホンジン</rt></ruby>"#
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue
Block a user