diff --git a/src/accent.rs b/src/accent.rs index 198f0b0..b7e7f24 100644 --- a/src/accent.rs +++ b/src/accent.rs @@ -59,3 +59,36 @@ impl AccentDict { } } } + +/// Computes the byte index of the character in kana that corresponds to the +/// given pitch accent number. +pub fn accent_number_to_byte_idx(kana: &str, accent_number: u8) -> Option<usize> { + if accent_number == 0 { + return None; + } + + let target = accent_number - 1; + let mut current = 0; + let mut byte_idx = 0; + + let mut iter = kana.chars().peekable(); + while let Some(c) = iter.next() { + if current == target { + break; + } + + let next_is_mod = match iter.peek() { + Some('ゃ') | Some('ゅ') | Some('ょ') | Some('ャ') | Some('ュ') | Some('ョ') => { + true + } + _ => false, + }; + if !next_is_mod { + current += 1; + } + + byte_idx += c.len_utf8(); + } + + return Some(byte_idx); +} diff --git a/src/lib.rs b/src/lib.rs index e2c4ba4..4738ab5 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -360,8 +360,8 @@ fn apply_furigana<'a>( kana: &'a str, pitches: &[u8], exclude_kanji: &FnvHashSet<char>, -) -> Vec<(&'a str, String)> { - let mut out: Vec<(&str, String)> = Vec::new(); +) -> Vec<(String, String)> { + let mut out: Vec<(String, String)> = Vec::new(); if furigana_unneeded(surface, exclude_kanji) || !is_kana_str(kana) { return Vec::new(); } @@ -382,7 +382,7 @@ fn apply_furigana<'a>( break; } } - out.push((&surface[..start_s], "".into())); + out.push((surface[..start_s].into(), "".into())); surface = &surface[start_s..]; kana = &kana[start_k..]; } @@ -399,7 +399,7 @@ fn apply_furigana<'a>( break; } } - out.push((&surface[end_s..], "".into())); + out.push((surface[end_s..].into(), "".into())); surface = &surface[..end_s]; kana = &kana[..end_k]; } @@ -424,37 +424,33 @@ fn apply_furigana<'a>( .unwrap(); // Insert the segments. 
- out.insert(out.len() - 2, (&surface[..si], kana[..ki].into())); + out.insert(out.len() - 2, (surface[..si].into(), kana[..ki].into())); out.insert( out.len() - 2, - (&surface[si..(si + sc.len_utf8())], "".into()), + (surface[si..(si + sc.len_utf8())].into(), "".into()), ); surface = &surface[(si + sc.len_utf8())..]; kana = &kana[(ki + kc.len_utf8())..]; } // Left over. - out.insert(out.len() - 1, (surface, kana.into())); + out.insert(out.len() - 1, (surface.into(), kana.into())); out.retain(|(s, _)| !s.is_empty()); - // Attach pitch accent indicator(s) if we have any. - if !pitches.is_empty() && pitches[0] <= 9 { - let last = out.last_mut().unwrap(); - last.1.push_str(""); - last.1.push(match pitches[0] { - 0 => '0', - 1 => '1', - 2 => '2', - 3 => '3', - 4 => '4', - 5 => '5', - 6 => '6', - 7 => '7', - 8 => '8', - 9 => '9', - _ => unreachable!(), - }); - last.1.push_str(""); + // Attach pitch accent indicator if we have any. + if !pitches.is_empty() && pitches[0] > 0 { + let reading: String = out.iter().map(|(s, k)| if k.is_empty() { s.as_str() } else { k.as_str() }).collect(); + let mut byte_idx = accent::accent_number_to_byte_idx(&reading, pitches[0]).unwrap(); + for (ref mut s, ref mut k) in out.iter_mut() { + let text = if k.is_empty() { s } else { k }; + if byte_idx < text.len() { + if text.is_char_boundary(byte_idx) && text.is_char_boundary(byte_idx + 3) { + text.insert_str(byte_idx + 3, ""); text.insert_str(byte_idx, ""); + } + break; + } + byte_idx -= text.len(); + } } out @@ -620,7 +616,10 @@ mod tests { let kana = "タベル"; let pairs = apply_furigana(surface, kana, &[], &FnvHashSet::default()); - assert_eq!(&[("食", "タ".into()), ("べる", "".into())], &pairs[..]); + assert_eq!( + &[("食".into(), "タ".into()), ("べる".into(), "".into())], + &pairs[..] + ); } #[test] @@ -631,10 +630,10 @@ assert_eq!( &[ - ("流", "なが".into()), - ("れ", "".into()), - ("出", "だ".into()), - ("す", "".into()) + ("流".into(), "なが".into()), + ("れ".into(), "".into()), + ("出".into(), "だ".into()), + ("す".into(), "".into()) ], &pairs[..] 
); @@ -646,7 +645,7 @@ mod tests { let kana = "もののけ"; let pairs = apply_furigana(surface, kana, &[], &FnvHashSet::default()); - assert_eq!(&[("物の怪", "もののけ".into())], &pairs[..]); + assert_eq!(&[("物の怪".into(), "もののけ".into())], &pairs[..]); } #[test] @@ -655,7 +654,10 @@ mod tests { let kana = "ゴハン"; let pairs = apply_furigana(surface, kana, &[], &FnvHashSet::default()); - assert_eq!(&[("ご", "".into()), ("飯", "ハン".into())], &pairs[..]); + assert_eq!( + &[("ご".into(), "".into()), ("飯".into(), "ハン".into())], + &pairs[..] + ); } #[test] @@ -732,7 +734,7 @@ mod tests { ); assert_eq!( furi_2, - r#"べるのはいね!"# + r#"のはいね!"# ); } @@ -744,47 +746,66 @@ mod tests { assert_eq!( gen.add_html_furigana("額"), - "ヒタイ" + r#"ヒタイ"# ); assert_eq!( gen_accent.add_html_furigana("額"), - "ヒタイ" + r#"ヒタイ"# ); - assert_eq!(gen.add_html_furigana("他"), "ホカ"); + assert_eq!( + gen.add_html_furigana("他"), + r#"ホカ"# + ); assert_eq!( gen_accent.add_html_furigana("他"), - "ホカ" + r#"ホカ"# ); assert_eq!( gen.add_html_furigana("私"), - "ワタシ" + r#"ワタシ"# ); assert_eq!( gen_accent.add_html_furigana("私"), - "ワタシ" + r#"ワタシ"# ); // The added 卵 is to trigger the parse we're testing of 等. assert_eq!( gen.add_html_furigana("卵等"), - "タマゴナド" + r#"タマゴナド"# + ); + assert_eq!( + gen_accent.add_html_furigana("卵等"), + r#"タマゴ"# ); assert_eq!( gen.add_html_furigana("大分"), - "大分ダイブ" + r#"大分ダイブ"# + ); + assert_eq!( + gen_accent.add_html_furigana("大分"), + r#"大分ダイブ"# ); assert_eq!( gen.add_html_furigana("日本"), - "日本ニホン" + r#"日本ニホン"# + ); + assert_eq!( + gen_accent.add_html_furigana("日本"), + r#"日本ニホン"# ); assert_eq!( gen.add_html_furigana("日本人"), - "日本人ニホンジン" + r#"日本人ニホンジン"# + ); + assert_eq!( + gen_accent.add_html_furigana("日本人"), + r#"日本人ニホンジン"# ); } }