Fix case with kana surface and ruby that aren't the same length.
It's a weird case that I hadn't realized could even happen, but I guess there are dictionary entries like that.
This commit is contained in:
parent
f34a222c51
commit
158511b3aa
38
src/main.rs
38
src/main.rs
|
@ -65,7 +65,7 @@ fn add_html_furigana(text: &str, tokenizer: &Tokenizer) -> String {
|
|||
let surface = t.surface();
|
||||
let kana = t.feature().split(",").nth(1).unwrap();
|
||||
|
||||
let (start_bytes, end_bytes) = matching_ends(surface, kana);
|
||||
let (start_bytes, end_bytes) = matching_kana_ends(surface, kana);
|
||||
|
||||
if kana.is_empty()
|
||||
|| start_bytes == surface.len()
|
||||
|
@ -94,7 +94,12 @@ fn add_html_furigana(text: &str, tokenizer: &Tokenizer) -> String {
|
|||
}
|
||||
|
||||
/// Returns (matching_start_bytes, matching_end_bytes).
|
||||
fn matching_ends(a: &str, b: &str) -> (usize, usize) {
|
||||
///
|
||||
/// Note that the bytes are in terms of `a`'s bytes.
|
||||
///
|
||||
/// If `matching_start_bytes == a.len()` you can assume that the strings are kana
|
||||
/// equivalents, and thus no ruby is needed.
|
||||
fn matching_kana_ends(a: &str, b: &str) -> (usize, usize) {
|
||||
let mut start_bytes = 0;
|
||||
for (ca, cb) in a.chars().zip(b.chars()) {
|
||||
if ca == cb || is_equivalent_kana(ca, cb) {
|
||||
|
@ -113,8 +118,8 @@ fn matching_ends(a: &str, b: &str) -> (usize, usize) {
|
|||
}
|
||||
}
|
||||
|
||||
if start_bytes == a.len() {
|
||||
(start_bytes, 0)
|
||||
if (start_bytes + end_bytes) >= a.len() || (start_bytes + end_bytes) >= b.len() {
|
||||
(a.len(), 0)
|
||||
} else {
|
||||
(start_bytes, end_bytes)
|
||||
}
|
||||
|
@ -184,3 +189,28 @@ pub fn katakana_to_hiragana(c: char) -> Option<char> {
|
|||
None
|
||||
}
|
||||
}
|
||||
|
||||
// Unit tests for `matching_kana_ends`, covering a kana surface paired with a
// kana reading, both when the two have the same number of characters and when
// they do not.
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
// Hiragana surface and katakana reading with the same number of characters.
// Expects `start_bytes` to equal the surface's UTF-8 byte length (2 chars x 3
// bytes = 6) and `end_bytes` to be 0, i.e. the "no ruby needed" result.
#[test]
|
||||
fn matching_kana_ends_01() {
|
||||
let surface = "へぇ";
|
||||
let kana = "ヘー";
|
||||
let (start_bytes, end_bytes) = matching_kana_ends(surface, kana);
|
||||
|
||||
assert_eq!(6, start_bytes);
|
||||
assert_eq!(0, end_bytes);
|
||||
}
|
||||
|
||||
// Surface is one character longer than the reading (3 chars vs. 2), the case
// this commit addresses. Expects `start_bytes` to equal the surface's full
// UTF-8 byte length (3 chars x 3 bytes = 9) and `end_bytes` to be 0.
#[test]
|
||||
fn matching_kana_ends_02() {
|
||||
let surface = "へぇー";
|
||||
let kana = "ヘー";
|
||||
let (start_bytes, end_bytes) = matching_kana_ends(surface, kana);
|
||||
|
||||
assert_eq!(9, start_bytes);
|
||||
assert_eq!(0, end_bytes);
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue
Block a user