Add word exclude list.
This commit is contained in:
parent
698e0e3277
commit
40ed2fd955
38
src/lib.rs
38
src/lib.rs
|
|
@ -37,6 +37,7 @@ pub struct FuriganaGenerator {
|
||||||
tokenizer: Tokenizer,
|
tokenizer: Tokenizer,
|
||||||
accent_dict: AccentDict,
|
accent_dict: AccentDict,
|
||||||
exclude_kanji: FnvHashSet<char>,
|
exclude_kanji: FnvHashSet<char>,
|
||||||
|
exclude_words: FnvHashSet<String>,
|
||||||
subs: FnvHashMap<(Cow<'static, str>, Cow<'static, str>), (String, String)>,
|
subs: FnvHashMap<(Cow<'static, str>, Cow<'static, str>), (String, String)>,
|
||||||
use_hiragana: bool,
|
use_hiragana: bool,
|
||||||
accent_mark: Option<String>,
|
accent_mark: Option<String>,
|
||||||
|
|
@ -44,12 +45,15 @@ pub struct FuriganaGenerator {
|
||||||
}
|
}
|
||||||
|
|
||||||
impl FuriganaGenerator {
|
impl FuriganaGenerator {
|
||||||
// `exclude_count`: exclude the N most frequent kanji from furigana.
|
// - `exclude_count`: exclude the N most frequent kanji from furigana.
|
||||||
// Specifically, words made up *entirely* of those kanji will be excluded.
|
// Specifically, words made up *entirely* of those kanji will be excluded.
|
||||||
// If a word has some kanji that aren't in that set, even if it also has
|
// If a word has some kanji that aren't in that set, even if it also has
|
||||||
// some that are, it will still get furigana.
|
// some that are, it will still get furigana.
|
||||||
|
//
|
||||||
|
// - `exclude_words`: don't put furigana on the words in this list.
|
||||||
pub fn new(
|
pub fn new(
|
||||||
exclude_count: usize,
|
exclude_count: usize,
|
||||||
|
exclude_words: &[&str],
|
||||||
use_hiragana: bool,
|
use_hiragana: bool,
|
||||||
accent_mark: Option<String>,
|
accent_mark: Option<String>,
|
||||||
accentless_mark: Option<String>,
|
accentless_mark: Option<String>,
|
||||||
|
|
@ -73,6 +77,14 @@ impl FuriganaGenerator {
|
||||||
set
|
set
|
||||||
};
|
};
|
||||||
|
|
||||||
|
let exclude_words = {
|
||||||
|
let mut set = FnvHashSet::default();
|
||||||
|
for word in exclude_words {
|
||||||
|
set.insert((*word).into());
|
||||||
|
}
|
||||||
|
set
|
||||||
|
};
|
||||||
|
|
||||||
let subs = {
|
let subs = {
|
||||||
let mut map: FnvHashMap<(Cow<str>, Cow<str>), (String, String)> = FnvHashMap::default();
|
let mut map: FnvHashMap<(Cow<str>, Cow<str>), (String, String)> = FnvHashMap::default();
|
||||||
for (surface, kana, (sub_kana, sub_kana_pitch_lookup)) in COMMON_SUBS.iter().copied() {
|
for (surface, kana, (sub_kana, sub_kana_pitch_lookup)) in COMMON_SUBS.iter().copied() {
|
||||||
|
|
@ -88,6 +100,7 @@ impl FuriganaGenerator {
|
||||||
tokenizer: Tokenizer::new(dict),
|
tokenizer: Tokenizer::new(dict),
|
||||||
accent_dict: accent::build_accent_dictionary(),
|
accent_dict: accent::build_accent_dictionary(),
|
||||||
exclude_kanji: exclude_kanji,
|
exclude_kanji: exclude_kanji,
|
||||||
|
exclude_words: exclude_words,
|
||||||
subs: subs,
|
subs: subs,
|
||||||
use_hiragana: use_hiragana,
|
use_hiragana: use_hiragana,
|
||||||
accent_mark: accent_mark,
|
accent_mark: accent_mark,
|
||||||
|
|
@ -100,6 +113,7 @@ impl FuriganaGenerator {
|
||||||
tokenizer: &self.tokenizer,
|
tokenizer: &self.tokenizer,
|
||||||
accent_dict: &self.accent_dict,
|
accent_dict: &self.accent_dict,
|
||||||
exclude_kanji: &self.exclude_kanji,
|
exclude_kanji: &self.exclude_kanji,
|
||||||
|
exclude_words: &self.exclude_words,
|
||||||
subs: &self.subs,
|
subs: &self.subs,
|
||||||
learner: Learner::new(if learn_mode { 3 } else { usize::MAX }),
|
learner: Learner::new(if learn_mode { 3 } else { usize::MAX }),
|
||||||
use_hiragana: self.use_hiragana,
|
use_hiragana: self.use_hiragana,
|
||||||
|
|
@ -113,6 +127,7 @@ pub struct Session<'a> {
|
||||||
tokenizer: &'a Tokenizer,
|
tokenizer: &'a Tokenizer,
|
||||||
accent_dict: &'a AccentDict,
|
accent_dict: &'a AccentDict,
|
||||||
exclude_kanji: &'a FnvHashSet<char>,
|
exclude_kanji: &'a FnvHashSet<char>,
|
||||||
|
exclude_words: &'a FnvHashSet<String>,
|
||||||
subs: &'a FnvHashMap<(Cow<'a, str>, Cow<'a, str>), (String, String)>,
|
subs: &'a FnvHashMap<(Cow<'a, str>, Cow<'a, str>), (String, String)>,
|
||||||
learner: Learner,
|
learner: Learner,
|
||||||
use_hiragana: bool,
|
use_hiragana: bool,
|
||||||
|
|
@ -140,6 +155,7 @@ impl<'a> Session<'a> {
|
||||||
&self.tokenizer,
|
&self.tokenizer,
|
||||||
&self.accent_dict,
|
&self.accent_dict,
|
||||||
&self.exclude_kanji,
|
&self.exclude_kanji,
|
||||||
|
&self.exclude_words,
|
||||||
&self.subs,
|
&self.subs,
|
||||||
&mut self.learner,
|
&mut self.learner,
|
||||||
self.use_hiragana,
|
self.use_hiragana,
|
||||||
|
|
@ -159,6 +175,7 @@ fn add_html_furigana_skip_already_ruby(
|
||||||
tokenizer: &Tokenizer,
|
tokenizer: &Tokenizer,
|
||||||
accent_dict: &AccentDict,
|
accent_dict: &AccentDict,
|
||||||
exclude_kanji: &FnvHashSet<char>,
|
exclude_kanji: &FnvHashSet<char>,
|
||||||
|
exclude_words: &FnvHashSet<String>,
|
||||||
subs: &FnvHashMap<(Cow<str>, Cow<str>), (String, String)>,
|
subs: &FnvHashMap<(Cow<str>, Cow<str>), (String, String)>,
|
||||||
learner: &mut Learner,
|
learner: &mut Learner,
|
||||||
use_hiragana: bool,
|
use_hiragana: bool,
|
||||||
|
|
@ -201,6 +218,7 @@ fn add_html_furigana_skip_already_ruby(
|
||||||
tokenizer,
|
tokenizer,
|
||||||
accent_dict,
|
accent_dict,
|
||||||
exclude_kanji,
|
exclude_kanji,
|
||||||
|
exclude_words,
|
||||||
subs,
|
subs,
|
||||||
learner,
|
learner,
|
||||||
use_hiragana,
|
use_hiragana,
|
||||||
|
|
@ -288,6 +306,7 @@ fn add_html_furigana(
|
||||||
tokenizer: &Tokenizer,
|
tokenizer: &Tokenizer,
|
||||||
accent_dict: &AccentDict,
|
accent_dict: &AccentDict,
|
||||||
exclude_kanji: &FnvHashSet<char>,
|
exclude_kanji: &FnvHashSet<char>,
|
||||||
|
exclude_words: &FnvHashSet<String>,
|
||||||
subs: &FnvHashMap<(Cow<str>, Cow<str>), (String, String)>,
|
subs: &FnvHashMap<(Cow<str>, Cow<str>), (String, String)>,
|
||||||
learner: &mut Learner,
|
learner: &mut Learner,
|
||||||
use_hiragana: bool,
|
use_hiragana: bool,
|
||||||
|
|
@ -344,6 +363,7 @@ fn add_html_furigana(
|
||||||
surface,
|
surface,
|
||||||
&kana,
|
&kana,
|
||||||
exclude_kanji,
|
exclude_kanji,
|
||||||
|
exclude_words,
|
||||||
pitches,
|
pitches,
|
||||||
accent_mark,
|
accent_mark,
|
||||||
accentless_mark,
|
accentless_mark,
|
||||||
|
|
@ -378,13 +398,14 @@ fn apply_furigana<'a>(
|
||||||
surface: &'a str,
|
surface: &'a str,
|
||||||
kana: &'a str,
|
kana: &'a str,
|
||||||
exclude_kanji: &FnvHashSet<char>,
|
exclude_kanji: &FnvHashSet<char>,
|
||||||
|
exclude_words: &FnvHashSet<String>,
|
||||||
pitches: &[u8],
|
pitches: &[u8],
|
||||||
accent_mark: Option<&'a str>,
|
accent_mark: Option<&'a str>,
|
||||||
accentless_mark: Option<&'a str>,
|
accentless_mark: Option<&'a str>,
|
||||||
) -> Vec<(String, String)> {
|
) -> Vec<(String, String)> {
|
||||||
let mut out: Vec<(String, String)> = Vec::new();
|
let mut out: Vec<(String, String)> = Vec::new();
|
||||||
|
|
||||||
if furigana_unneeded(surface, exclude_kanji) || !is_kana_str(kana) {
|
if furigana_unneeded(surface, exclude_kanji, exclude_words) || !is_kana_str(kana) {
|
||||||
return Vec::new();
|
return Vec::new();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -584,10 +605,15 @@ pub fn normalize_kana(c: char) -> Option<char> {
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns true if furigana definitely isn't needed.
|
/// Returns true if furigana definitely isn't needed.
|
||||||
pub fn furigana_unneeded(text: &str, exclude_kanji: &FnvHashSet<char>) -> bool {
|
pub fn furigana_unneeded(
|
||||||
text.chars().all(|c| {
|
text: &str,
|
||||||
is_kana(c) || c.is_ascii() || c.is_numeric() || c == '々' || exclude_kanji.contains(&c)
|
exclude_kanji: &FnvHashSet<char>,
|
||||||
})
|
exclude_words: &FnvHashSet<String>,
|
||||||
|
) -> bool {
|
||||||
|
exclude_words.contains(text)
|
||||||
|
|| text.chars().all(|c| {
|
||||||
|
is_kana(c) || c.is_ascii() || c.is_numeric() || c == '々' || exclude_kanji.contains(&c)
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn hiragana_to_katakana(c: char) -> Option<char> {
|
pub fn hiragana_to_katakana(c: char) -> Option<char> {
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user