Fix case with kana surface and ruby that aren't the same length.
It's a weird case that I didn't think could even happen, but I guess there are dictionary entries like that.
This commit is contained in:
parent
f34a222c51
commit
158511b3aa
38
src/main.rs
38
src/main.rs
|
@ -65,7 +65,7 @@ fn add_html_furigana(text: &str, tokenizer: &Tokenizer) -> String {
|
||||||
let surface = t.surface();
|
let surface = t.surface();
|
||||||
let kana = t.feature().split(",").nth(1).unwrap();
|
let kana = t.feature().split(",").nth(1).unwrap();
|
||||||
|
|
||||||
let (start_bytes, end_bytes) = matching_ends(surface, kana);
|
let (start_bytes, end_bytes) = matching_kana_ends(surface, kana);
|
||||||
|
|
||||||
if kana.is_empty()
|
if kana.is_empty()
|
||||||
|| start_bytes == surface.len()
|
|| start_bytes == surface.len()
|
||||||
|
@ -94,7 +94,12 @@ fn add_html_furigana(text: &str, tokenizer: &Tokenizer) -> String {
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns (matching_start_bytes, matching_end_bytes).
|
/// Returns (matching_start_bytes, matching_end_bytes).
|
||||||
fn matching_ends(a: &str, b: &str) -> (usize, usize) {
|
///
|
||||||
|
/// Note that the bytes are in terms of `a`'s bytes.
|
||||||
|
///
|
||||||
|
/// If `matching_start_bytes == a.len()` you can assume that strings are kana
|
||||||
|
/// equivalents, and thus no ruby is needed.
|
||||||
|
fn matching_kana_ends(a: &str, b: &str) -> (usize, usize) {
|
||||||
let mut start_bytes = 0;
|
let mut start_bytes = 0;
|
||||||
for (ca, cb) in a.chars().zip(b.chars()) {
|
for (ca, cb) in a.chars().zip(b.chars()) {
|
||||||
if ca == cb || is_equivalent_kana(ca, cb) {
|
if ca == cb || is_equivalent_kana(ca, cb) {
|
||||||
|
@ -113,8 +118,8 @@ fn matching_ends(a: &str, b: &str) -> (usize, usize) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if start_bytes == a.len() {
|
if (start_bytes + end_bytes) >= a.len() || (start_bytes + end_bytes) >= b.len() {
|
||||||
(start_bytes, 0)
|
(a.len(), 0)
|
||||||
} else {
|
} else {
|
||||||
(start_bytes, end_bytes)
|
(start_bytes, end_bytes)
|
||||||
}
|
}
|
||||||
|
@ -184,3 +189,28 @@ pub fn katakana_to_hiragana(c: char) -> Option<char> {
|
||||||
None
|
None
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
use super::*;
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn matching_kana_ends_01() {
|
||||||
|
let surface = "へぇ";
|
||||||
|
let kana = "ヘー";
|
||||||
|
let (start_bytes, end_bytes) = matching_kana_ends(surface, kana);
|
||||||
|
|
||||||
|
assert_eq!(6, start_bytes);
|
||||||
|
assert_eq!(0, end_bytes);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn matching_kana_ends_02() {
|
||||||
|
let surface = "へぇー";
|
||||||
|
let kana = "ヘー";
|
||||||
|
let (start_bytes, end_bytes) = matching_kana_ends(surface, kana);
|
||||||
|
|
||||||
|
assert_eq!(9, start_bytes);
|
||||||
|
assert_eq!(0, end_bytes);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
Loading…
Reference in New Issue
Block a user