Fix case with kana surface and ruby that aren't the same length.

It's a weird case, and it hadn't occurred to me that it could even happen,
but I guess there are dictionary entries like that.
This commit is contained in:
Nathan Vegdahl 2024-09-01 11:01:25 +02:00
parent f34a222c51
commit 158511b3aa
2 changed files with 34 additions and 4 deletions

View File

@ -65,7 +65,7 @@ fn add_html_furigana(text: &str, tokenizer: &Tokenizer) -> String {
let surface = t.surface(); let surface = t.surface();
let kana = t.feature().split(",").nth(1).unwrap(); let kana = t.feature().split(",").nth(1).unwrap();
let (start_bytes, end_bytes) = matching_ends(surface, kana); let (start_bytes, end_bytes) = matching_kana_ends(surface, kana);
if kana.is_empty() if kana.is_empty()
|| start_bytes == surface.len() || start_bytes == surface.len()
@ -94,7 +94,12 @@ fn add_html_furigana(text: &str, tokenizer: &Tokenizer) -> String {
} }
/// Returns (matching_start_bytes, matching_end_bytes). /// Returns (matching_start_bytes, matching_end_bytes).
fn matching_ends(a: &str, b: &str) -> (usize, usize) { ///
/// Note that the bytes are in terms of `a`'s bytes.
///
/// If `matching_start_bytes == a.len()` you can assume that strings are kana
/// equivalents, and thus no ruby is needed.
fn matching_kana_ends(a: &str, b: &str) -> (usize, usize) {
let mut start_bytes = 0; let mut start_bytes = 0;
for (ca, cb) in a.chars().zip(b.chars()) { for (ca, cb) in a.chars().zip(b.chars()) {
if ca == cb || is_equivalent_kana(ca, cb) { if ca == cb || is_equivalent_kana(ca, cb) {
@ -113,8 +118,8 @@ fn matching_ends(a: &str, b: &str) -> (usize, usize) {
} }
} }
if start_bytes == a.len() { if (start_bytes + end_bytes) >= a.len() || (start_bytes + end_bytes) >= b.len() {
(start_bytes, 0) (a.len(), 0)
} else { } else {
(start_bytes, end_bytes) (start_bytes, end_bytes)
} }
@ -184,3 +189,28 @@ pub fn katakana_to_hiragana(c: char) -> Option<char> {
None None
} }
} }
#[cfg(test)]
mod tests {
    use super::*;

    /// Surface and reading with the same number of characters: the expected
    /// result is the full surface length (6 bytes) as the start match and
    /// no end match.
    #[test]
    fn matching_kana_ends_01() {
        let ends = matching_kana_ends("へぇ", "ヘー");
        assert_eq!((6, 0), ends);
    }

    /// Surface one character longer than its reading: the expected result
    /// is still the full surface length (9 bytes) as the start match and
    /// no end match.
    #[test]
    fn matching_kana_ends_02() {
        let ends = matching_kana_ends("へぇー", "ヘー");
        assert_eq!((9, 0), ends);
    }
}

0
test.txt Normal file
View File