diff --git a/src/main.rs b/src/main.rs index ee0db43..37c8e1b 100644 --- a/src/main.rs +++ b/src/main.rs @@ -65,7 +65,7 @@ fn add_html_furigana(text: &str, tokenizer: &Tokenizer) -> String { let surface = t.surface(); let kana = t.feature().split(",").nth(1).unwrap(); - let (start_bytes, end_bytes) = matching_ends(surface, kana); + let (start_bytes, end_bytes) = matching_kana_ends(surface, kana); if kana.is_empty() || start_bytes == surface.len() @@ -94,7 +94,12 @@ fn add_html_furigana(text: &str, tokenizer: &Tokenizer) -> String { } /// Returns (matching_start_bytes, matching_end_bytes). -fn matching_ends(a: &str, b: &str) -> (usize, usize) { +/// +/// Note that the bytes are in terms of `a`'s bytes. +/// +/// If `matching_start_bytes == a.len()` you can assume that strings are kana +/// equivalents, and thus no ruby is needed. +fn matching_kana_ends(a: &str, b: &str) -> (usize, usize) { let mut start_bytes = 0; for (ca, cb) in a.chars().zip(b.chars()) { if ca == cb || is_equivalent_kana(ca, cb) { @@ -113,8 +118,8 @@ fn matching_ends(a: &str, b: &str) -> (usize, usize) { } } - if start_bytes == a.len() { - (start_bytes, 0) + if (start_bytes + end_bytes) >= a.len() || (start_bytes + end_bytes) >= b.len() { + (a.len(), 0) } else { (start_bytes, end_bytes) } @@ -184,3 +189,28 @@ pub fn katakana_to_hiragana(c: char) -> Option { None } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn matching_kana_ends_01() { + let surface = "へぇ"; + let kana = "ヘー"; + let (start_bytes, end_bytes) = matching_kana_ends(surface, kana); + + assert_eq!(6, start_bytes); + assert_eq!(0, end_bytes); + } + + #[test] + fn matching_kana_ends_02() { + let surface = "へぇー"; + let kana = "ヘー"; + let (start_bytes, end_bytes) = matching_kana_ends(surface, kana); + + assert_eq!(9, start_bytes); + assert_eq!(0, end_bytes); + } +} diff --git a/test.txt b/test.txt new file mode 100644 index 0000000..e69de29