First commit.
A furigana generator that can do "spaced repetition"-style reduction of furigana over the course of a text.
commit 1c3afed157
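
A minimal usage sketch of the API this commit introduces (the input text and expected output follow the tests in src/lib.rs; the main() wrapper is illustrative):

use furigana_gen::FuriganaGenerator;

fn main() {
    // 0 = don't exclude any kanji by frequency; false = learn mode off,
    // so furigana is generated on every occurrence.
    let mut gen = FuriganaGenerator::new(0, false);

    let html = gen.add_html_furigana("食べている");

    // Per the tests in src/lib.rs: <ruby>食<rt>タ</rt></ruby>べている
    println!("{}", html);
}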
.gitignore (vendored, new file, 3 lines)
@@ -0,0 +1,3 @@
Cargo.lock
/target
/test_text
Cargo.toml (new file, 17 lines)
@@ -0,0 +1,17 @@
[package]
name = "furigana_gen"
version = "0.1.0"
edition = "2021"

[lib]
name = "furigana_gen"
path = "src/lib.rs"

[dependencies]
vibrato = "0.5"
lz4_flex = "0.11"
quick-xml = "0.36.1"

[build-dependencies]
lzma-rs = "0.3"
lz4_flex = "0.11"
build.rs (new file, 50 lines)
@@ -0,0 +1,50 @@
use std::{
    env,
    fs::File,
    io::{BufReader, Write},
    path::Path,
};

const KANJI: &str = include_str!("data/kanji_frequency.txt");

fn main() {
    let out_dir = env::var("OUT_DIR").unwrap();

    // Write frequency-ordered kanji array to rust file.
    {
        let dest_path = Path::new(&out_dir).join("kanji_freq_inc.rs");
        let mut f = File::create(&dest_path).unwrap();

        f.write_all("const KANJI_FREQ: &[char] = &[".as_bytes())
            .unwrap();

        for c in KANJI.chars() {
            if c.is_whitespace() {
                continue;
            }

            f.write_all(format!("\n'{}',", c).as_bytes()).unwrap();
        }

        f.write_all("\n];".as_bytes()).unwrap();
    }

    // Write compressed dictionary to .lz4 file.
    {
        // Read and decompress file from .xz.
        let dict_data = {
            let f = File::open("data/dictionary/system.dic.xz").unwrap();
            let mut data = Vec::new();
            lzma_rs::xz_decompress(&mut BufReader::new(f), &mut data).unwrap();

            data
        };

        // Recompress to .lz4.
        let dest_path = Path::new(&out_dir).join("system.dic.lz4");
        let f = File::create(dest_path).unwrap();
        let mut encoder = lz4_flex::frame::FrameEncoder::new(f);
        encoder.write_all(&dict_data).unwrap();
        encoder.finish().unwrap();
    }
}
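
For reference, the generated kanji_freq_inc.rs has this shape, following directly from the format string above (the specific kanji shown are illustrative; the real contents come from data/kanji_frequency.txt):

const KANJI_FREQ: &[char] = &[
'人',
'一',
'大',
// ... one entry per non-whitespace character in the input file ...
];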
data/dictionary/BSD (new file, 31 lines)
@@ -0,0 +1,31 @@
Copyright (c) 2011-2021, The UniDic Consortium
Copyright (c) 2023, LegalOn Technologies, Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

 * Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.

 * Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the
   distribution.

 * Neither the name of the UniDic Consortium nor the names of its
   contributors may be used to endorse or promote products derived
   from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
data/dictionary/NOTICE (new file, 7 lines)
@@ -0,0 +1,7 @@
This software includes a binary version of data from

https://clrd.ninjal.ac.jp/unidic_archive/cwj/3.1.1/unidic-cwj-3.1.1-full.zip

where the costs and connection ids are retrained using CORE data in BCCWJ (except the PN category)

https://clrd.ninjal.ac.jp/bccwj/.
data/dictionary/system.dic.xz (new binary file, not shown)
data/kanji_frequency.txt (new file, 4001 lines; diff suppressed because it is too large)
src/learner.rs (new file, 67 lines)
@@ -0,0 +1,67 @@
use std::collections::HashMap;

const MIN_MAX_DISTANCE: usize = 100;
const MAX_MAX_DISTANCE: usize = 10000;

#[derive(Debug, Copy, Clone)]
struct WordStats {
    // The last position (in words processed) that this word was seen at.
    last_seen_at: usize,

    // How many times this word has been seen so far.
    times_seen: usize,

    // Maximum distance before help is needed again.
    max_distance: usize,
}

pub struct Learner {
    stats: HashMap<String, WordStats>,
    words_processed: usize,
    times_seen_threshold: usize,
}

impl Learner {
    pub fn new(times_seen_threshold: usize) -> Self {
        Self {
            stats: HashMap::new(),
            words_processed: 0,
            times_seen_threshold,
        }
    }

    pub fn record(&mut self, word: &str) {
        self.stats
            .entry(word.to_string())
            .and_modify(|stats| {
                let distance = self.words_processed - stats.last_seen_at;

                stats.last_seen_at = self.words_processed;
                stats.times_seen += 1;
                if stats.times_seen <= self.times_seen_threshold {
                    return;
                }

                if distance < stats.max_distance {
                    stats.max_distance += distance.min((stats.max_distance as f64 * 0.5) as usize);
                }

                stats.max_distance = stats.max_distance.min(MAX_MAX_DISTANCE);
            })
            .or_insert(WordStats {
                last_seen_at: self.words_processed,
                times_seen: 1,
                max_distance: MIN_MAX_DISTANCE,
            });
        self.words_processed += 1;
    }

    pub fn needs_help(&self, word: &str) -> bool {
        if let Some(stats) = self.stats.get(word) {
            let distance = self.words_processed - stats.last_seen_at;
            stats.times_seen <= self.times_seen_threshold || distance > stats.max_distance
        } else {
            true
        }
    }
}
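
A minimal sketch of how the learner drives the spaced-repetition behavior (the threshold and word are hypothetical; in this crate the Learner is constructed by FuriganaGenerator):

let mut learner = Learner::new(2);

// Never seen before: needs furigana.
assert!(learner.needs_help("猫"));

learner.record("猫");
learner.record("猫");
learner.record("猫"); // Now past the times-seen threshold of 2.

// Seen often and recently enough: no furigana needed until the word
// goes unseen for longer than its current max_distance.
assert!(!learner.needs_help("猫"));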
src/lib.rs (new file, 516 lines)
@@ -0,0 +1,516 @@
mod learner;

use std::{
    collections::HashSet,
    // fs::File,
    io::{Cursor, Read},
};

use lz4_flex::frame::FrameDecoder;
use quick_xml::events::Event;
use vibrato::{Dictionary, Tokenizer};

use learner::Learner;

// Include KANJI_FREQ, a frequency-ordered array of kanji characters.
include!(concat!(env!("OUT_DIR"), "/kanji_freq_inc.rs"));

const DICT: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/system.dic.lz4"));

pub struct FuriganaGenerator {
    tokenizer: Tokenizer,
    exclude_kanji: HashSet<char>,
    learner: Learner,
}

impl FuriganaGenerator {
    // `exclude_count`: exclude the N most frequent kanji from furigana.
    // Specifically, words made up *entirely* of those kanji will be excluded.
    // If a word has some kanji that aren't in that set, even if it also has
    // some that are, it will still get furigana.
    pub fn new(exclude_count: usize, learn_mode: bool) -> Self {
        let dict = {
            // Note: we could just pass the decoder straight to `Dictionary::read()`
            // below, and it would work. However, that ends up being slower than
            // first decompressing the whole thing ahead of time.
            let mut decoder = FrameDecoder::new(Cursor::new(DICT));
            let mut data = Vec::new();
            decoder.read_to_end(&mut data).unwrap();

            Dictionary::read(Cursor::new(&data)).unwrap()
        };

        let exclude_kanji = {
            let mut set = HashSet::new();
            for &c in KANJI_FREQ.iter().take(exclude_count) {
                set.insert(c);
            }
            set
        };

        Self {
            tokenizer: Tokenizer::new(dict),
            exclude_kanji,
            learner: Learner::new(if learn_mode { 5 } else { usize::MAX }),
        }
    }

    pub fn add_html_furigana(&mut self, text: &str) -> String {
        add_html_furigana_skip_already_ruby(
            &text,
            &self.tokenizer,
            &self.exclude_kanji,
            &mut self.learner,
        )
    }
}

fn to_str<B: std::ops::Deref<Target = [u8]>>(bytes: &B) -> &str {
    std::str::from_utf8(&bytes.deref()).unwrap()
}

/// Like `add_html_furigana()`, but skips text that already has ruby on it, so it doesn't get double-ruby.
fn add_html_furigana_skip_already_ruby(
    text: &str,
    tokenizer: &Tokenizer,
    exclude_kanji: &HashSet<char>,
    learner: &mut Learner,
) -> String {
    let mut reader = quick_xml::Reader::from_str(text);

    let mut new_text = String::new();
    let mut rubys: i32 = 0;

    loop {
        match reader.read_event() {
            Err(e) => panic!("Error at position {}: {:?}", reader.error_position(), e),
            Ok(Event::Eof) => break,

            Ok(Event::Start(e)) => {
                if e.name().into_inner() == b"ruby" {
                    rubys += 1;
                }
                write_xml(&mut new_text, &Event::Start(e));
            }

            Ok(Event::End(e)) => {
                if e.name().into_inner() == b"ruby" {
                    rubys -= 1;
                }
                write_xml(&mut new_text, &Event::End(e));
            }

            Ok(Event::Text(e)) => {
                if rubys <= 0 {
                    new_text.push_str(&add_html_furigana(
                        to_str(&e),
                        tokenizer,
                        exclude_kanji,
                        learner,
                    ));
                } else {
                    write_xml(&mut new_text, &Event::Text(e));
                }
            }

            // All other events, just re-write them verbatim.
            Ok(e) => write_xml(&mut new_text, &e),
        }
    }

    new_text
}

/// Takes an xml event and writes it verbatim to the given string.
///
/// NOTE: really what we want is for the events to provide their byte index range
/// in the original text, so we could just write that, and even double-check that
/// we're not missing anything. But for some reason quick_xml doesn't provide
/// that information.
fn write_xml(text: &mut String, event: &quick_xml::events::Event) {
    match event {
        Event::Start(e) => {
            text.push_str("<");
            text.push_str(to_str(e));
            text.push_str(">");
        }

        Event::End(e) => {
            text.push_str("</");
            text.push_str(to_str(e));
            text.push_str(">");
        }

        Event::Empty(e) => {
            text.push_str("<");
            text.push_str(to_str(e));
            text.push_str("/>");
        }

        Event::CData(e) => {
            text.push_str("<![CDATA[");
            text.push_str(to_str(e));
            text.push_str("]]>");
        }

        Event::Comment(e) => {
            text.push_str("<!--");
            text.push_str(to_str(e));
            text.push_str("-->");
        }

        Event::Decl(e) => {
            text.push_str("<?");
            text.push_str(to_str(e));
            text.push_str("?>");
        }

        Event::PI(e) => {
            text.push_str("<?");
            text.push_str(to_str(e));
            text.push_str("?>");
        }

        Event::DocType(e) => {
            text.push_str("<!DOCTYPE");
            text.push_str(to_str(e));
            text.push_str(">");
        }

        Event::Text(e) => text.push_str(to_str(e)),

        _ => unreachable!(),
    }
}

/// Adds furigana to Japanese text, using html ruby tags.
fn add_html_furigana(
    text: &str,
    tokenizer: &Tokenizer,
    exclude_kanji: &HashSet<char>,
    learner: &mut Learner,
) -> String {
    let mut worker = tokenizer.new_worker();

    worker.reset_sentence(text);
    worker.tokenize();

    let mut new_text = String::new();
    for i in 0..worker.num_tokens() {
        let t = worker.token(i);
        let surface = t.surface();

        let needs_help = learner.needs_help(surface);
        learner.record(surface);

        if !needs_help {
            new_text.push_str(surface);
            continue;
        }

        let kana = t.feature().split(",").nth(1).unwrap();

        let furigana_text = apply_furigana(surface, kana, exclude_kanji);

        for (surf, furi) in furigana_text.iter() {
            if furi.is_empty() {
                new_text.push_str(surf);
                continue;
            }

            new_text.push_str("<ruby>");
            new_text.push_str(surf);
            new_text.push_str("<rt>");
            new_text.push_str(furi);
            new_text.push_str("</rt></ruby>");
        }
    }

    new_text
}

/// Returns a segmented list of (surface, furigana) pairs.
///
/// The furigana component of a pair may be empty, indicating no
/// furigana is needed for that surface element.
fn apply_furigana<'a>(
    surface: &'a str,
    kana: &'a str,
    exclude_kanji: &HashSet<char>,
) -> Vec<(&'a str, &'a str)> {
    let mut out = Vec::new();

    if furigana_unneeded(surface, exclude_kanji) {
        out.push((surface, ""));
        return out;
    }

    let mut surface = surface;
    let mut kana = kana;

    // Trim any kana from the start.
    {
        let mut start_s = 0;
        let mut start_k = 0;
        for (sc, kc) in surface.chars().zip(kana.chars()) {
            if is_equivalent_kana(sc, kc) {
                start_s += sc.len_utf8();
                start_k += kc.len_utf8();
            } else {
                break;
            }
        }
        out.push((&surface[..start_s], ""));
        surface = &surface[start_s..];
        kana = &kana[start_k..];
    }

    // Trim any kana from the end.
    {
        let mut end_s = surface.len();
        let mut end_k = kana.len();
        for (sc, kc) in surface.chars().rev().zip(kana.chars().rev()) {
            if is_equivalent_kana(sc, kc) {
                end_s -= sc.len_utf8();
                end_k -= kc.len_utf8();
            } else {
                break;
            }
        }
        out.push((&surface[end_s..], ""));
        surface = &surface[..end_s];
        kana = &kana[..end_k];
    }

    // Try to uniquely match kana in the middle.
    //
    // This is just best-effort, and bails in any non-trivial cases.
    while let Some((si, sc)) = surface.char_indices().find(|(_, c)| is_kana(*c)) {
        // If there's more than one match, bail.
        let equivalent_kana_count = kana
            .chars()
            .map(|c| is_equivalent_kana(c, sc))
            .fold(0usize, |count, hit| count + hit as usize);
        if equivalent_kana_count != 1 {
            break;
        }

        // Find the one match.
        let (ki, kc) = kana
            .char_indices()
            .find(|(_, c)| is_equivalent_kana(sc, *c))
            .unwrap();

        // Insert the segments.
        out.insert(out.len() - 2, (&surface[..si], &kana[..ki]));
        out.insert(out.len() - 2, (&surface[si..(si + sc.len_utf8())], ""));
        surface = &surface[(si + sc.len_utf8())..];
        kana = &kana[(ki + kc.len_utf8())..];
    }

    // Left over.
    out.insert(out.len() - 2, (surface, kana));

    out.iter().filter(|(s, _)| !s.is_empty()).copied().collect()
}

/// Due to the way this is used, this isn't meant to be exact, but instead
/// liberal in what it considers equivalent.
fn is_equivalent_kana(a: char, b: char) -> bool {
    const PAIRS: &[[char; 2]] = &[['は', 'わ'], ['を', 'お'], ['づ', 'ず'], ['へ', 'え']];
    const VOWELS: &[char] = &['あ', 'い', 'う', 'え', 'お', 'ぁ', 'ぃ', 'ぅ', 'ぇ', 'ぉ'];

    let (a, b) = match (normalize_kana(a), normalize_kana(b)) {
        (Some(a), Some(b)) => (a, b),
        _ => return false,
    };

    if a == b {
        return true;
    }

    if a == 'ー' && VOWELS.contains(&b) {
        return true;
    }

    if b == 'ー' && VOWELS.contains(&a) {
        return true;
    }

    for &[c, d] in PAIRS {
        if (a == c && b == d) || (a == d && b == c) {
            return true;
        }
    }

    false
}

const HIRAGANA: u32 = 0x3041;
const KATAKANA: u32 = 0x30A1;
const KANA_COUNT: u32 = 0x3097 - HIRAGANA;

pub fn is_kana(c: char) -> bool {
    if c == 'ー' {
        return true;
    }

    let c = c as u32;

    if c >= HIRAGANA && c < (HIRAGANA + KANA_COUNT) {
        return true;
    }

    if c >= KATAKANA && c < (KATAKANA + KANA_COUNT) {
        return true;
    }

    return false;
}

pub fn normalize_kana(c: char) -> Option<char> {
    if !is_kana(c) {
        return None;
    }

    Some(katakana_to_hiragana(c).unwrap_or(c))
}

/// Returns true if furigana definitely isn't needed.
pub fn furigana_unneeded(text: &str, exclude_kanji: &HashSet<char>) -> bool {
    text.chars().all(|c| {
        is_kana(c) || c.is_ascii() || c.is_numeric() || c == '々' || exclude_kanji.contains(&c)
    })
}

pub fn hiragana_to_katakana(c: char) -> Option<char> {
    let c = c as u32;
    if c >= HIRAGANA && c < (HIRAGANA + KANA_COUNT) {
        char::try_from(c + KATAKANA - HIRAGANA).ok()
    } else {
        None
    }
}

pub fn katakana_to_hiragana(c: char) -> Option<char> {
    let c = c as u32;
    if c >= KATAKANA && c < (KATAKANA + HIRAGANA.wrapping_sub(HIRAGANA) + KANA_COUNT) {
        char::try_from(c - KATAKANA + HIRAGANA).ok()
    } else {
        None
    }
}

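A few concrete values for the kana helpers above, following directly from the HIRAGANA/KATAKANA offsets (a sketch, not part of the committed tests):

assert_eq!(Some('タ'), hiragana_to_katakana('た'));
assert_eq!(Some('た'), katakana_to_hiragana('タ'));
assert_eq!(Some('た'), normalize_kana('タ')); // katakana normalizes to hiragana
assert_eq!(None, hiragana_to_katakana('食')); // not kana
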
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn apply_furigana_01() {
        let surface = "へぇ";
        let kana = "ヘー";
        let pairs = apply_furigana(surface, kana, &HashSet::new());

        assert_eq!(&[("へぇ", "")], &pairs[..]);
    }

    #[test]
    fn apply_furigana_02() {
        let surface = "へぇー";
        let kana = "ヘー";
        let pairs = apply_furigana(surface, kana, &HashSet::new());

        assert_eq!(&[("へぇー", "")], &pairs[..]);
    }

    #[test]
    fn apply_furigana_03() {
        let surface = "へ";
        let kana = "え";
        let pairs = apply_furigana(surface, kana, &HashSet::new());

        assert_eq!(&[("へ", "")], &pairs[..]);
    }

    #[test]
    fn apply_furigana_04() {
        let surface = "食べる";
        let kana = "タベル";
        let pairs = apply_furigana(surface, kana, &HashSet::new());

        assert_eq!(&[("食", "タ"), ("べる", "")], &pairs[..]);
    }

    #[test]
    fn apply_furigana_05() {
        let surface = "流れ出す";
        let kana = "ながれだす";
        let pairs = apply_furigana(surface, kana, &HashSet::new());

        assert_eq!(
            &[("流", "なが"), ("れ", ""), ("出", "だ"), ("す", "")],
            &pairs[..]
        );
    }

    #[test]
    fn apply_furigana_06() {
        let surface = "物の怪";
        let kana = "もののけ";
        let pairs = apply_furigana(surface, kana, &HashSet::new());

        assert_eq!(&[("物の怪", "もののけ")], &pairs[..]);
    }

    #[test]
    fn is_equivalent_kana_01() {
        assert!(is_equivalent_kana('か', 'カ'));
        assert!(is_equivalent_kana('カ', 'か'));
        assert!(is_equivalent_kana('ぁ', 'ァ'));
        assert!(is_equivalent_kana('ァ', 'ぁ'));
        assert!(is_equivalent_kana('は', 'わ'));
        assert!(is_equivalent_kana('わ', 'は'));
        assert!(is_equivalent_kana('を', 'お'));
        assert!(is_equivalent_kana('お', 'を'));
        assert!(is_equivalent_kana('づ', 'ず'));
        assert!(is_equivalent_kana('ず', 'づ'));
        assert!(is_equivalent_kana('ー', 'あ'));
        assert!(is_equivalent_kana('あ', 'ー'));
        assert!(is_equivalent_kana('ー', 'ぁ'));
        assert!(is_equivalent_kana('ぁ', 'ー'));

        assert!(!is_equivalent_kana('は', 'ば'));
        assert!(!is_equivalent_kana('ー', 'か'));
        assert!(!is_equivalent_kana('た', '食'));
    }

    #[test]
    fn tokenize_01() {
        let gen = FuriganaGenerator::new(0, false);

        let mut worker = gen.tokenizer.new_worker();
        worker.reset_sentence("食べている");
        worker.tokenize();

        assert_eq!(3, worker.num_tokens());
        assert_eq!("食べ", worker.token(0).surface());
        assert_eq!("動詞-一般,タベ", worker.token(0).feature());
        assert_eq!("て", worker.token(1).surface());
        assert_eq!("助詞-接続助詞,テ", worker.token(1).feature());
        assert_eq!("いる", worker.token(2).surface());
        assert_eq!("動詞-非自立可能,イル", worker.token(2).feature());
    }

    #[test]
    fn add_html_furigana_01() {
        let mut gen = FuriganaGenerator::new(0, false);

        let text = gen
            .add_html_furigana(r#"<sup class="食う">食べる</sup>のは<ruby>良</ruby>いね!<hi />"#);

        assert_eq!(
            text,
            r#"<sup class="食う"><ruby>食<rt>タ</rt></ruby>べる</sup>のは<ruby>良</ruby>いね!<hi />"#
        );
    }
}