Allow customizing accent marks.
This commit is contained in:
parent
40f93cf55a
commit
698e0e3277
83
src/lib.rs
83
src/lib.rs
|
@ -39,7 +39,8 @@ pub struct FuriganaGenerator {
|
||||||
exclude_kanji: FnvHashSet<char>,
|
exclude_kanji: FnvHashSet<char>,
|
||||||
subs: FnvHashMap<(Cow<'static, str>, Cow<'static, str>), (String, String)>,
|
subs: FnvHashMap<(Cow<'static, str>, Cow<'static, str>), (String, String)>,
|
||||||
use_hiragana: bool,
|
use_hiragana: bool,
|
||||||
mark_accent: bool,
|
accent_mark: Option<String>,
|
||||||
|
accentless_mark: Option<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl FuriganaGenerator {
|
impl FuriganaGenerator {
|
||||||
|
@ -47,7 +48,12 @@ impl FuriganaGenerator {
|
||||||
// Specifically, words made up *entirely* of those kanji will be excluded.
|
// Specifically, words made up *entirely* of those kanji will be excluded.
|
||||||
// If a word has some kanji that aren't in that set, even if it also has
|
// If a word has some kanji that aren't in that set, even if it also has
|
||||||
// some that are, it will still get furigana.
|
// some that are, it will still get furigana.
|
||||||
pub fn new(exclude_count: usize, use_hiragana: bool, mark_accent: bool) -> Self {
|
pub fn new(
|
||||||
|
exclude_count: usize,
|
||||||
|
use_hiragana: bool,
|
||||||
|
accent_mark: Option<String>,
|
||||||
|
accentless_mark: Option<String>,
|
||||||
|
) -> Self {
|
||||||
let dict = {
|
let dict = {
|
||||||
// Note: we could just pass the decoder straight to `Dictionary::read()`
|
// Note: we could just pass the decoder straight to `Dictionary::read()`
|
||||||
// below, and it would work. However, that ends up being slower than
|
// below, and it would work. However, that ends up being slower than
|
||||||
|
@ -84,7 +90,8 @@ impl FuriganaGenerator {
|
||||||
exclude_kanji: exclude_kanji,
|
exclude_kanji: exclude_kanji,
|
||||||
subs: subs,
|
subs: subs,
|
||||||
use_hiragana: use_hiragana,
|
use_hiragana: use_hiragana,
|
||||||
mark_accent: mark_accent,
|
accent_mark: accent_mark,
|
||||||
|
accentless_mark: accentless_mark,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -96,7 +103,8 @@ impl FuriganaGenerator {
|
||||||
subs: &self.subs,
|
subs: &self.subs,
|
||||||
learner: Learner::new(if learn_mode { 3 } else { usize::MAX }),
|
learner: Learner::new(if learn_mode { 3 } else { usize::MAX }),
|
||||||
use_hiragana: self.use_hiragana,
|
use_hiragana: self.use_hiragana,
|
||||||
mark_accent: self.mark_accent,
|
accent_mark: self.accent_mark.as_ref().map(|s| s.as_str()),
|
||||||
|
accentless_mark: self.accentless_mark.as_ref().map(|s| s.as_str()),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -108,7 +116,8 @@ pub struct Session<'a> {
|
||||||
subs: &'a FnvHashMap<(Cow<'a, str>, Cow<'a, str>), (String, String)>,
|
subs: &'a FnvHashMap<(Cow<'a, str>, Cow<'a, str>), (String, String)>,
|
||||||
learner: Learner,
|
learner: Learner,
|
||||||
use_hiragana: bool,
|
use_hiragana: bool,
|
||||||
mark_accent: bool,
|
accent_mark: Option<&'a str>,
|
||||||
|
accentless_mark: Option<&'a str>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> Session<'a> {
|
impl<'a> Session<'a> {
|
||||||
|
@ -134,7 +143,8 @@ impl<'a> Session<'a> {
|
||||||
&self.subs,
|
&self.subs,
|
||||||
&mut self.learner,
|
&mut self.learner,
|
||||||
self.use_hiragana,
|
self.use_hiragana,
|
||||||
self.mark_accent,
|
self.accent_mark,
|
||||||
|
self.accentless_mark,
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -152,7 +162,8 @@ fn add_html_furigana_skip_already_ruby(
|
||||||
subs: &FnvHashMap<(Cow<str>, Cow<str>), (String, String)>,
|
subs: &FnvHashMap<(Cow<str>, Cow<str>), (String, String)>,
|
||||||
learner: &mut Learner,
|
learner: &mut Learner,
|
||||||
use_hiragana: bool,
|
use_hiragana: bool,
|
||||||
mark_accent: bool,
|
accent_mark: Option<&str>,
|
||||||
|
accentless_mark: Option<&str>,
|
||||||
) -> String {
|
) -> String {
|
||||||
let mut reader = quick_xml::Reader::from_str(text);
|
let mut reader = quick_xml::Reader::from_str(text);
|
||||||
|
|
||||||
|
@ -193,7 +204,8 @@ fn add_html_furigana_skip_already_ruby(
|
||||||
subs,
|
subs,
|
||||||
learner,
|
learner,
|
||||||
use_hiragana,
|
use_hiragana,
|
||||||
mark_accent,
|
accent_mark,
|
||||||
|
accentless_mark,
|
||||||
));
|
));
|
||||||
} else {
|
} else {
|
||||||
write_xml(&mut new_text, &Event::Text(e));
|
write_xml(&mut new_text, &Event::Text(e));
|
||||||
|
@ -279,7 +291,8 @@ fn add_html_furigana(
|
||||||
subs: &FnvHashMap<(Cow<str>, Cow<str>), (String, String)>,
|
subs: &FnvHashMap<(Cow<str>, Cow<str>), (String, String)>,
|
||||||
learner: &mut Learner,
|
learner: &mut Learner,
|
||||||
use_hiragana: bool,
|
use_hiragana: bool,
|
||||||
mark_accent: bool,
|
accent_mark: Option<&str>,
|
||||||
|
accentless_mark: Option<&str>,
|
||||||
) -> String {
|
) -> String {
|
||||||
let mut worker = tokenizer.new_worker();
|
let mut worker = tokenizer.new_worker();
|
||||||
|
|
||||||
|
@ -305,7 +318,7 @@ fn add_html_furigana(
|
||||||
(kana_1, kana_2)
|
(kana_1, kana_2)
|
||||||
};
|
};
|
||||||
|
|
||||||
let pitches = if mark_accent {
|
let pitches = if accent_mark.is_some() || accentless_mark.is_some() {
|
||||||
accent_dict.get(word, pitch_kana)
|
accent_dict.get(word, pitch_kana)
|
||||||
} else {
|
} else {
|
||||||
&[]
|
&[]
|
||||||
|
@ -327,7 +340,14 @@ fn add_html_furigana(
|
||||||
kana.into()
|
kana.into()
|
||||||
};
|
};
|
||||||
|
|
||||||
let furigana_text = apply_furigana(surface, &kana, pitches, exclude_kanji);
|
let furigana_text = apply_furigana(
|
||||||
|
surface,
|
||||||
|
&kana,
|
||||||
|
exclude_kanji,
|
||||||
|
pitches,
|
||||||
|
accent_mark,
|
||||||
|
accentless_mark,
|
||||||
|
);
|
||||||
|
|
||||||
if furigana_text.is_empty() {
|
if furigana_text.is_empty() {
|
||||||
new_text.push_str(surface);
|
new_text.push_str(surface);
|
||||||
|
@ -357,8 +377,10 @@ fn add_html_furigana(
|
||||||
fn apply_furigana<'a>(
|
fn apply_furigana<'a>(
|
||||||
surface: &'a str,
|
surface: &'a str,
|
||||||
kana: &'a str,
|
kana: &'a str,
|
||||||
pitches: &[u8],
|
|
||||||
exclude_kanji: &FnvHashSet<char>,
|
exclude_kanji: &FnvHashSet<char>,
|
||||||
|
pitches: &[u8],
|
||||||
|
accent_mark: Option<&'a str>,
|
||||||
|
accentless_mark: Option<&'a str>,
|
||||||
) -> Vec<(String, String)> {
|
) -> Vec<(String, String)> {
|
||||||
let mut out: Vec<(String, String)> = Vec::new();
|
let mut out: Vec<(String, String)> = Vec::new();
|
||||||
|
|
||||||
|
@ -438,7 +460,7 @@ fn apply_furigana<'a>(
|
||||||
|
|
||||||
// Attach pitch accent indicator if there is one and it's unambiguous.
|
// Attach pitch accent indicator if there is one and it's unambiguous.
|
||||||
if pitches.len() == 1 {
|
if pitches.len() == 1 {
|
||||||
if pitches[0] == 0 {
|
if pitches[0] == 0 && accentless_mark.is_some() {
|
||||||
// 平板.
|
// 平板.
|
||||||
let (s, k) = out.last_mut().unwrap();
|
let (s, k) = out.last_mut().unwrap();
|
||||||
let mark = if k.is_empty() {
|
let mark = if k.is_empty() {
|
||||||
|
@ -446,18 +468,21 @@ fn apply_furigana<'a>(
|
||||||
// extra level of furigana to make the formatting consistent.
|
// extra level of furigana to make the formatting consistent.
|
||||||
&[
|
&[
|
||||||
"<ruby>",
|
"<ruby>",
|
||||||
"<rt><ruby class=\"pitch_flat\"> <rt>o</rt></ruby></rt></ruby>",
|
"<rt><ruby class=\"pitch_flat\"> <rt>",
|
||||||
|
"</rt></ruby></rt></ruby>",
|
||||||
]
|
]
|
||||||
} else {
|
} else {
|
||||||
&["<ruby class=\"pitch_flat\">", "<rt>o</rt></ruby>"]
|
&["<ruby class=\"pitch_flat\">", "<rt>", "</rt></ruby>"]
|
||||||
};
|
};
|
||||||
let text = if k.is_empty() { s } else { k };
|
let text = if k.is_empty() { s } else { k };
|
||||||
|
|
||||||
if text.len() >= 3 && text.is_char_boundary(text.len() - 3) {
|
if text.len() >= 3 && text.is_char_boundary(text.len() - 3) {
|
||||||
text.insert_str(text.len() - 3, mark[0]);
|
text.insert_str(text.len() - 3, mark[0]);
|
||||||
text.insert_str(text.len(), mark[1]);
|
text.insert_str(text.len(), mark[1]);
|
||||||
|
text.insert_str(text.len(), accentless_mark.unwrap());
|
||||||
|
text.insert_str(text.len(), mark[2]);
|
||||||
}
|
}
|
||||||
} else {
|
} else if accent_mark.is_some() {
|
||||||
// Everything else.
|
// Everything else.
|
||||||
let mut byte_idx = accent::accent_number_to_byte_idx(kana, pitches[0]).unwrap();
|
let mut byte_idx = accent::accent_number_to_byte_idx(kana, pitches[0]).unwrap();
|
||||||
for (s, k) in out.iter_mut() {
|
for (s, k) in out.iter_mut() {
|
||||||
|
@ -466,10 +491,11 @@ fn apply_furigana<'a>(
|
||||||
// extra level of furigana to make the formatting consistent.
|
// extra level of furigana to make the formatting consistent.
|
||||||
&[
|
&[
|
||||||
"<ruby>",
|
"<ruby>",
|
||||||
"<rt><ruby class=\"pitch_accent\"> <rt>*</rt></ruby></rt></ruby>",
|
"<rt><ruby class=\"pitch_accent\"> <rt>",
|
||||||
|
"</rt></ruby></rt></ruby>",
|
||||||
]
|
]
|
||||||
} else {
|
} else {
|
||||||
&["<ruby class=\"pitch_accent\">", "<rt>*</rt></ruby>"]
|
&["<ruby class=\"pitch_accent\">", "<rt>", "</rt></ruby>"]
|
||||||
};
|
};
|
||||||
let text = if k.is_empty() { s } else { k };
|
let text = if k.is_empty() { s } else { k };
|
||||||
|
|
||||||
|
@ -477,6 +503,8 @@ fn apply_furigana<'a>(
|
||||||
&& text.is_char_boundary(byte_idx)
|
&& text.is_char_boundary(byte_idx)
|
||||||
&& text.is_char_boundary(byte_idx + 3)
|
&& text.is_char_boundary(byte_idx + 3)
|
||||||
{
|
{
|
||||||
|
text.insert_str(byte_idx + 3, mark[2]);
|
||||||
|
text.insert_str(byte_idx + 3, accent_mark.unwrap());
|
||||||
text.insert_str(byte_idx + 3, mark[1]);
|
text.insert_str(byte_idx + 3, mark[1]);
|
||||||
text.insert_str(byte_idx, mark[0]);
|
text.insert_str(byte_idx, mark[0]);
|
||||||
break;
|
break;
|
||||||
|
@ -608,19 +636,20 @@ mod tests {
|
||||||
pub fn get_furigana_gen() -> &'static FuriganaGenerator {
|
pub fn get_furigana_gen() -> &'static FuriganaGenerator {
|
||||||
use std::sync::OnceLock;
|
use std::sync::OnceLock;
|
||||||
static FURIGEN: OnceLock<FuriganaGenerator> = OnceLock::new();
|
static FURIGEN: OnceLock<FuriganaGenerator> = OnceLock::new();
|
||||||
FURIGEN.get_or_init(|| FuriganaGenerator::new(0, false, false))
|
FURIGEN.get_or_init(|| FuriganaGenerator::new(0, false, None, None))
|
||||||
}
|
}
|
||||||
pub fn get_furigana_gen_with_accent() -> &'static FuriganaGenerator {
|
pub fn get_furigana_gen_with_accent() -> &'static FuriganaGenerator {
|
||||||
use std::sync::OnceLock;
|
use std::sync::OnceLock;
|
||||||
static FURIGEN: OnceLock<FuriganaGenerator> = OnceLock::new();
|
static FURIGEN: OnceLock<FuriganaGenerator> = OnceLock::new();
|
||||||
FURIGEN.get_or_init(|| FuriganaGenerator::new(0, false, true))
|
FURIGEN
|
||||||
|
.get_or_init(|| FuriganaGenerator::new(0, false, Some("*".into()), Some("o".into())))
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn apply_furigana_01() {
|
fn apply_furigana_01() {
|
||||||
let surface = "へぇ";
|
let surface = "へぇ";
|
||||||
let kana = "ヘー";
|
let kana = "ヘー";
|
||||||
let pairs = apply_furigana(surface, kana, &[], &FnvHashSet::default());
|
let pairs = apply_furigana(surface, kana, &FnvHashSet::default(), &[], None, None);
|
||||||
|
|
||||||
assert!(pairs.is_empty());
|
assert!(pairs.is_empty());
|
||||||
}
|
}
|
||||||
|
@ -629,7 +658,7 @@ mod tests {
|
||||||
fn apply_furigana_02() {
|
fn apply_furigana_02() {
|
||||||
let surface = "へぇー";
|
let surface = "へぇー";
|
||||||
let kana = "ヘー";
|
let kana = "ヘー";
|
||||||
let pairs = apply_furigana(surface, kana, &[], &FnvHashSet::default());
|
let pairs = apply_furigana(surface, kana, &FnvHashSet::default(), &[], None, None);
|
||||||
|
|
||||||
assert!(pairs.is_empty());
|
assert!(pairs.is_empty());
|
||||||
}
|
}
|
||||||
|
@ -638,7 +667,7 @@ mod tests {
|
||||||
fn apply_furigana_03() {
|
fn apply_furigana_03() {
|
||||||
let surface = "へ";
|
let surface = "へ";
|
||||||
let kana = "え";
|
let kana = "え";
|
||||||
let pairs = apply_furigana(surface, kana, &[], &FnvHashSet::default());
|
let pairs = apply_furigana(surface, kana, &FnvHashSet::default(), &[], None, None);
|
||||||
|
|
||||||
assert!(pairs.is_empty());
|
assert!(pairs.is_empty());
|
||||||
}
|
}
|
||||||
|
@ -647,7 +676,7 @@ mod tests {
|
||||||
fn apply_furigana_04() {
|
fn apply_furigana_04() {
|
||||||
let surface = "食べる";
|
let surface = "食べる";
|
||||||
let kana = "タベル";
|
let kana = "タベル";
|
||||||
let pairs = apply_furigana(surface, kana, &[], &FnvHashSet::default());
|
let pairs = apply_furigana(surface, kana, &FnvHashSet::default(), &[], None, None);
|
||||||
|
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
&[("食".into(), "タ".into()), ("べる".into(), "".into())],
|
&[("食".into(), "タ".into()), ("べる".into(), "".into())],
|
||||||
|
@ -659,7 +688,7 @@ mod tests {
|
||||||
fn apply_furigana_05() {
|
fn apply_furigana_05() {
|
||||||
let surface = "流れ出す";
|
let surface = "流れ出す";
|
||||||
let kana = "ながれだす";
|
let kana = "ながれだす";
|
||||||
let pairs = apply_furigana(surface, kana, &[], &FnvHashSet::default());
|
let pairs = apply_furigana(surface, kana, &FnvHashSet::default(), &[], None, None);
|
||||||
|
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
&[
|
&[
|
||||||
|
@ -676,7 +705,7 @@ mod tests {
|
||||||
fn apply_furigana_06() {
|
fn apply_furigana_06() {
|
||||||
let surface = "物の怪";
|
let surface = "物の怪";
|
||||||
let kana = "もののけ";
|
let kana = "もののけ";
|
||||||
let pairs = apply_furigana(surface, kana, &[], &FnvHashSet::default());
|
let pairs = apply_furigana(surface, kana, &FnvHashSet::default(), &[], None, None);
|
||||||
|
|
||||||
assert_eq!(&[("物の怪".into(), "もののけ".into())], &pairs[..]);
|
assert_eq!(&[("物の怪".into(), "もののけ".into())], &pairs[..]);
|
||||||
}
|
}
|
||||||
|
@ -685,7 +714,7 @@ mod tests {
|
||||||
fn apply_furigana_07() {
|
fn apply_furigana_07() {
|
||||||
let surface = "ご飯";
|
let surface = "ご飯";
|
||||||
let kana = "ゴハン";
|
let kana = "ゴハン";
|
||||||
let pairs = apply_furigana(surface, kana, &[], &FnvHashSet::default());
|
let pairs = apply_furigana(surface, kana, &FnvHashSet::default(), &[], None, None);
|
||||||
|
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
&[("ご".into(), "".into()), ("飯".into(), "ハン".into())],
|
&[("ご".into(), "".into()), ("飯".into(), "ハン".into())],
|
||||||
|
|
Loading…
Reference in New Issue
Block a user