Use file of kanji to exclude, instead of a frequency cutoff.
This commit is contained in:
parent
b60ecc38b3
commit
2ae9f286ad
21
build.rs
21
build.rs
|
|
@ -5,30 +5,9 @@ use std::{
|
||||||
path::Path,
|
path::Path,
|
||||||
};
|
};
|
||||||
|
|
||||||
const KANJI: &str = include_str!("data/kanji_frequency.txt");
|
|
||||||
|
|
||||||
fn main() {
|
fn main() {
|
||||||
let out_dir = env::var("OUT_DIR").unwrap();
|
let out_dir = env::var("OUT_DIR").unwrap();
|
||||||
|
|
||||||
// Write frequency-ordered kanji array to rust file.
|
|
||||||
{
|
|
||||||
let dest_path = Path::new(&out_dir).join("kanji_freq_inc.rs");
|
|
||||||
let mut f = File::create(&dest_path).unwrap();
|
|
||||||
|
|
||||||
f.write_all("const KANJI_FREQ: &[char] = &[".as_bytes())
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
for c in KANJI.chars() {
|
|
||||||
if c.is_whitespace() {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
f.write_all(format!("\n'{}',", c).as_bytes()).unwrap();
|
|
||||||
}
|
|
||||||
|
|
||||||
f.write_all("\n];".as_bytes()).unwrap();
|
|
||||||
}
|
|
||||||
|
|
||||||
// Write compressed parsing dictionary to .lz4 file.
|
// Write compressed parsing dictionary to .lz4 file.
|
||||||
{
|
{
|
||||||
// Read and decompress file from .xz.
|
// Read and decompress file from .xz.
|
||||||
|
|
|
||||||
File diff suppressed because it is too large
Load Diff
31
src/lib.rs
31
src/lib.rs
|
|
@ -14,9 +14,6 @@ use vibrato::{Dictionary, Tokenizer};
|
||||||
use accent::AccentDict;
|
use accent::AccentDict;
|
||||||
use learner::Learner;
|
use learner::Learner;
|
||||||
|
|
||||||
// Include KANJI_FREQ, a frequency-ordered array of kanji characters.
|
|
||||||
include!(concat!(env!("OUT_DIR"), "/kanji_freq_inc.rs"));
|
|
||||||
|
|
||||||
// Parsing dictionary.
|
// Parsing dictionary.
|
||||||
const DICT: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/system.dic.lz4"));
|
const DICT: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/system.dic.lz4"));
|
||||||
|
|
||||||
|
|
@ -45,14 +42,18 @@ pub struct FuriganaGenerator {
|
||||||
}
|
}
|
||||||
|
|
||||||
impl FuriganaGenerator {
|
impl FuriganaGenerator {
|
||||||
// - `exclude_count`: exclude the N most frequent kanji from furigana.
|
/// - `exclude_kanji`: don't put furigana on words whose kanji are all in
|
||||||
// Specifically, words made up *entirely* of those kanji will be excluded.
|
/// this list. Note: if a word has *some* kanji that aren't in this list,
|
||||||
// If a word has some kanji that aren't in that set, even if it also has
|
/// even if it also has some that are, it will still get furigana.
|
||||||
// some that are, it will still get furigana.
|
/// - `exclude_words`: don't put furigana on the words in this list.
|
||||||
//
|
/// - `use_hiragana`: when true, the furigana will be written in hiragana.
|
||||||
// - `exclude_words`: don't put furigana on the words in this list.
|
/// When false, the furigana will be written in katakana.
|
||||||
|
/// - `accent_mark`: the character to use as the pitch accent indicator for
|
||||||
|
/// accented words.
|
||||||
|
/// - `accentless_mark`: the character to use as the pitch accent indicator
|
||||||
|
/// for accentless (へいばん) words.
|
||||||
pub fn new(
|
pub fn new(
|
||||||
exclude_count: usize,
|
exclude_kanji: &[char],
|
||||||
exclude_words: &[&str],
|
exclude_words: &[&str],
|
||||||
use_hiragana: bool,
|
use_hiragana: bool,
|
||||||
accent_mark: Option<String>,
|
accent_mark: Option<String>,
|
||||||
|
|
@ -69,15 +70,15 @@ impl FuriganaGenerator {
|
||||||
Dictionary::read(Cursor::new(&data)).unwrap()
|
Dictionary::read(Cursor::new(&data)).unwrap()
|
||||||
};
|
};
|
||||||
|
|
||||||
let exclude_kanji = {
|
let exclude_kanji_set = {
|
||||||
let mut set = FnvHashSet::default();
|
let mut set = FnvHashSet::default();
|
||||||
for &c in KANJI_FREQ.iter().take(exclude_count) {
|
for &c in exclude_kanji {
|
||||||
set.insert(c);
|
set.insert(c);
|
||||||
}
|
}
|
||||||
set
|
set
|
||||||
};
|
};
|
||||||
|
|
||||||
let exclude_words = {
|
let exclude_words_set = {
|
||||||
let mut set = FnvHashSet::default();
|
let mut set = FnvHashSet::default();
|
||||||
for word in exclude_words {
|
for word in exclude_words {
|
||||||
set.insert((*word).into());
|
set.insert((*word).into());
|
||||||
|
|
@ -99,8 +100,8 @@ impl FuriganaGenerator {
|
||||||
Self {
|
Self {
|
||||||
tokenizer: Tokenizer::new(dict),
|
tokenizer: Tokenizer::new(dict),
|
||||||
accent_dict: accent::build_accent_dictionary(),
|
accent_dict: accent::build_accent_dictionary(),
|
||||||
exclude_kanji: exclude_kanji,
|
exclude_kanji: exclude_kanji_set,
|
||||||
exclude_words: exclude_words,
|
exclude_words: exclude_words_set,
|
||||||
subs: subs,
|
subs: subs,
|
||||||
use_hiragana: use_hiragana,
|
use_hiragana: use_hiragana,
|
||||||
accent_mark: accent_mark,
|
accent_mark: accent_mark,
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user