Use a file of kanji to exclude instead of a frequency cutoff.
This commit is contained in:
parent
b60ecc38b3
commit
2ae9f286ad
21
build.rs
21
build.rs
|
|
@ -5,30 +5,9 @@ use std::{
|
|||
path::Path,
|
||||
};
|
||||
|
||||
const KANJI: &str = include_str!("data/kanji_frequency.txt");
|
||||
|
||||
fn main() {
|
||||
let out_dir = env::var("OUT_DIR").unwrap();
|
||||
|
||||
// Write frequency-ordered kanji array to rust file.
|
||||
{
|
||||
let dest_path = Path::new(&out_dir).join("kanji_freq_inc.rs");
|
||||
let mut f = File::create(&dest_path).unwrap();
|
||||
|
||||
f.write_all("const KANJI_FREQ: &[char] = &[".as_bytes())
|
||||
.unwrap();
|
||||
|
||||
for c in KANJI.chars() {
|
||||
if c.is_whitespace() {
|
||||
continue;
|
||||
}
|
||||
|
||||
f.write_all(format!("\n'{}',", c).as_bytes()).unwrap();
|
||||
}
|
||||
|
||||
f.write_all("\n];".as_bytes()).unwrap();
|
||||
}
|
||||
|
||||
// Write compressed parsing dictionary to .lz4 file.
|
||||
{
|
||||
// Read and decompress file from .xz.
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load Diff
31
src/lib.rs
31
src/lib.rs
|
|
@ -14,9 +14,6 @@ use vibrato::{Dictionary, Tokenizer};
|
|||
use accent::AccentDict;
|
||||
use learner::Learner;
|
||||
|
||||
// Include KANJI_FREQ, a frequency-ordered array of kanji characters.
|
||||
include!(concat!(env!("OUT_DIR"), "/kanji_freq_inc.rs"));
|
||||
|
||||
// Parsing dictionary.
|
||||
const DICT: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/system.dic.lz4"));
|
||||
|
||||
|
|
@ -45,14 +42,18 @@ pub struct FuriganaGenerator {
|
|||
}
|
||||
|
||||
impl FuriganaGenerator {
|
||||
// - `exclude_count`: exclude the N most frequent kanji from furigana.
|
||||
// Specifically, words made up *entirely* of those kanji will be excluded.
|
||||
// If a word has some kanji that aren't in that set, even if it also has
|
||||
// some that are, it will still get furigana.
|
||||
//
|
||||
// - `exclude_words`: don't put furigana on the words in this list.
|
||||
/// - `exclude_kanji`: don't put furigana on words whose kanji are all in
|
||||
/// this list. Note: if a word has *some* kanji that aren't in this list,
|
||||
/// even if it also has some that are, it will still get furigana.
|
||||
/// - `exclude_words`: don't put furigana on the words in this list.
|
||||
/// - `use_hiragana`: when true, the furigana will be written in hiragana.
|
||||
/// When false, the furigana will be written in katakana.
|
||||
/// - `accent_mark`: the character to use as the pitch accent indicator for
|
||||
/// accented words.
|
||||
/// - `accentless_mark`: the character to use as the pitch accent indicator
|
||||
/// for accentless (平板 / へいばん, "heiban") words.
|
||||
pub fn new(
|
||||
exclude_count: usize,
|
||||
exclude_kanji: &[char],
|
||||
exclude_words: &[&str],
|
||||
use_hiragana: bool,
|
||||
accent_mark: Option<String>,
|
||||
|
|
@ -69,15 +70,15 @@ impl FuriganaGenerator {
|
|||
Dictionary::read(Cursor::new(&data)).unwrap()
|
||||
};
|
||||
|
||||
let exclude_kanji = {
|
||||
let exclude_kanji_set = {
|
||||
let mut set = FnvHashSet::default();
|
||||
for &c in KANJI_FREQ.iter().take(exclude_count) {
|
||||
for &c in exclude_kanji {
|
||||
set.insert(c);
|
||||
}
|
||||
set
|
||||
};
|
||||
|
||||
let exclude_words = {
|
||||
let exclude_words_set = {
|
||||
let mut set = FnvHashSet::default();
|
||||
for word in exclude_words {
|
||||
set.insert((*word).into());
|
||||
|
|
@ -99,8 +100,8 @@ impl FuriganaGenerator {
|
|||
Self {
|
||||
tokenizer: Tokenizer::new(dict),
|
||||
accent_dict: accent::build_accent_dictionary(),
|
||||
exclude_kanji: exclude_kanji,
|
||||
exclude_words: exclude_words,
|
||||
exclude_kanji: exclude_kanji_set,
|
||||
exclude_words: exclude_words_set,
|
||||
subs: subs,
|
||||
use_hiragana: use_hiragana,
|
||||
accent_mark: accent_mark,
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user