First commit.
A furigana generator that can do "spaced repetition"-style reduction of furigana over the course of a text.
commit 1c3afed157
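
For context, a minimal sketch of the intended usage, based on the `FuriganaGenerator` API in src/lib.rs below (the expected output matches the `add_html_furigana_01` test):

    fn main() {
        // `new(0, false)`: exclude no kanji by frequency, and disable learn
        // mode, so every kanji word gets furigana every time.
        let mut gen = furigana_gen::FuriganaGenerator::new(0, false);

        let html = gen.add_html_furigana("食べる");
        // -> "<ruby>食<rt>タ</rt></ruby>べる"
        // (Readings are emitted in katakana, as produced by the tokenizer.)
        println!("{}", html);
    }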
.gitignore (vendored), new file, 3 lines

@@ -0,0 +1,3 @@
Cargo.lock
/target
/test_text
Cargo.toml, new file, 17 lines

@@ -0,0 +1,17 @@
[package]
name = "furigana_gen"
version = "0.1.0"
edition = "2021"

[lib]
name = "furigana_gen"
path = "src/lib.rs"

[dependencies]
vibrato = "0.5"
lz4_flex = "0.11"
quick-xml = "0.36.1"

[build-dependencies]
lzma-rs = "0.3"
lz4_flex = "0.11"
build.rs, new file, 50 lines

@@ -0,0 +1,50 @@
use std::{
    env,
    fs::File,
    io::{BufReader, Write},
    path::Path,
};

const KANJI: &str = include_str!("data/kanji_frequency.txt");

fn main() {
    let out_dir = env::var("OUT_DIR").unwrap();

    // Write frequency-ordered kanji array to a Rust file.
    {
        let dest_path = Path::new(&out_dir).join("kanji_freq_inc.rs");
        let mut f = File::create(&dest_path).unwrap();

        f.write_all("const KANJI_FREQ: &[char] = &[".as_bytes())
            .unwrap();

        for c in KANJI.chars() {
            if c.is_whitespace() {
                continue;
            }

            f.write_all(format!("\n'{}',", c).as_bytes()).unwrap();
        }

        f.write_all("\n];".as_bytes()).unwrap();
    }

    // Write compressed dictionary to .lz4 file.
    {
        // Read and decompress file from .xz.
        let dict_data = {
            let f = File::open("data/dictionary/system.dic.xz").unwrap();
            let mut data = Vec::new();
            lzma_rs::xz_decompress(&mut BufReader::new(f), &mut data).unwrap();

            data
        };

        // Recompress to .lz4.
        let dest_path = Path::new(&out_dir).join("system.dic.lz4");
        let f = File::create(dest_path).unwrap();
        let mut encoder = lz4_flex::frame::FrameEncoder::new(f);
        // Use write_all (not write) so the whole buffer is guaranteed to be written.
        encoder.write_all(&dict_data).unwrap();
        encoder.finish().unwrap();
    }
}
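
For reference, the generated kanji_freq_inc.rs is just a char-array literal in the exact format the loop above emits (one quoted char per line). The specific characters here are illustrative only; the real ones come from data/kanji_frequency.txt, most frequent first:

    // <OUT_DIR>/kanji_freq_inc.rs (illustrative contents)
    const KANJI_FREQ: &[char] = &[
    '日',
    '一',
    '大',
    // ...roughly 4000 more entries, in descending frequency order...
    ];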
data/dictionary/BSD, new file, 31 lines

@@ -0,0 +1,31 @@
Copyright (c) 2011-2021, The UniDic Consortium
Copyright (c) 2023, LegalOn Technologies, Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

 * Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.

 * Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the
   distribution.

 * Neither the name of the UniDic Consortium nor the names of its
   contributors may be used to endorse or promote products derived
   from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
data/dictionary/NOTICE, new file, 7 lines

@@ -0,0 +1,7 @@
This software includes a binary version of data from

https://clrd.ninjal.ac.jp/unidic_archive/cwj/3.1.1/unidic-cwj-3.1.1-full.zip

where the costs and connection ids are retrained using CORE data in BCCWJ (except the PN category)

https://clrd.ninjal.ac.jp/bccwj/.
data/dictionary/system.dic.xz, new binary file (not shown)
data/kanji_frequency.txt, new file, 4001 lines (diff suppressed: file too large)
src/learner.rs, new file, 67 lines

@@ -0,0 +1,67 @@
use std::collections::HashMap;

const MIN_MAX_DISTANCE: usize = 100;
const MAX_MAX_DISTANCE: usize = 10000;

#[derive(Debug, Copy, Clone)]
struct WordStats {
    // The last position (in words processed) that this word was seen at.
    last_seen_at: usize,

    // How many times this word has been seen so far.
    times_seen: usize,

    // Maximum distance before help is needed again.
    max_distance: usize,
}

pub struct Learner {
    stats: HashMap<String, WordStats>,
    words_processed: usize,
    times_seen_threshold: usize,
}

impl Learner {
    pub fn new(times_seen_threshold: usize) -> Self {
        Self {
            stats: HashMap::new(),
            words_processed: 0,
            times_seen_threshold,
        }
    }

    pub fn record(&mut self, word: &str) {
        self.stats
            .entry(word.to_string())
            .and_modify(|stats| {
                let distance = self.words_processed - stats.last_seen_at;

                stats.last_seen_at = self.words_processed;
                stats.times_seen += 1;
                if stats.times_seen <= self.times_seen_threshold {
                    return;
                }

                // If the word showed up again within its window, widen the
                // window (by at most 50%), capped at MAX_MAX_DISTANCE.
                if distance < stats.max_distance {
                    stats.max_distance += distance.min((stats.max_distance as f64 * 0.5) as usize);
                }

                stats.max_distance = stats.max_distance.min(MAX_MAX_DISTANCE);
            })
            .or_insert(WordStats {
                last_seen_at: self.words_processed,
                times_seen: 1,
                max_distance: MIN_MAX_DISTANCE,
            });
        self.words_processed += 1;
    }

    pub fn needs_help(&self, word: &str) -> bool {
        if let Some(stats) = self.stats.get(word) {
            let distance = self.words_processed - stats.last_seen_at;
            stats.times_seen <= self.times_seen_threshold || distance > stats.max_distance
        } else {
            true
        }
    }
}
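
A quick sketch of the spaced-repetition behavior (a hypothetical unit test, not part of this commit; it would live inside src/learner.rs since `Learner` is in a private module): a word needs help until it has been recorded more than the threshold number of times, and needs help again once it hasn't been seen for longer than its current window:

    #[test]
    fn learner_behavior_sketch() {
        // Threshold of 2: the first two sightings still get furigana.
        let mut learner = Learner::new(2);

        assert!(learner.needs_help("食べる")); // Never seen.
        learner.record("食べる");
        assert!(learner.needs_help("食べる")); // Seen once, still <= threshold.
        learner.record("食べる");
        learner.record("食べる");
        assert!(!learner.needs_help("食べる")); // Seen 3 times, within window.

        // Simulate reading 200 other words; that far exceeds the word's
        // current window (just over MIN_MAX_DISTANCE = 100), so it needs
        // help again.
        for i in 0..200 {
            learner.record(&format!("word{}", i));
        }
        assert!(learner.needs_help("食べる"));
    }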
src/lib.rs, new file, 516 lines

@@ -0,0 +1,516 @@
mod learner;

use std::{
    collections::HashSet,
    io::{Cursor, Read},
};

use lz4_flex::frame::FrameDecoder;
use quick_xml::events::Event;
use vibrato::{Dictionary, Tokenizer};

use learner::Learner;

// Include KANJI_FREQ, a frequency-ordered array of kanji characters.
include!(concat!(env!("OUT_DIR"), "/kanji_freq_inc.rs"));

const DICT: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/system.dic.lz4"));

pub struct FuriganaGenerator {
    tokenizer: Tokenizer,
    exclude_kanji: HashSet<char>,
    learner: Learner,
}

impl FuriganaGenerator {
    // `exclude_count`: exclude the N most frequent kanji from furigana.
    // Specifically, words made up *entirely* of those kanji will be excluded.
    // If a word has some kanji that aren't in that set, even if it also has
    // some that are, it will still get furigana.
    pub fn new(exclude_count: usize, learn_mode: bool) -> Self {
        let dict = {
            // Note: we could just pass the decoder straight to `Dictionary::read()`
            // below, and it would work. However, that ends up being slower than
            // first decompressing the whole thing ahead of time.
            let mut decoder = FrameDecoder::new(Cursor::new(DICT));
            let mut data = Vec::new();
            decoder.read_to_end(&mut data).unwrap();

            Dictionary::read(Cursor::new(&data)).unwrap()
        };

        let exclude_kanji = {
            let mut set = HashSet::new();
            for &c in KANJI_FREQ.iter().take(exclude_count) {
                set.insert(c);
            }
            set
        };

        Self {
            tokenizer: Tokenizer::new(dict),
            exclude_kanji,
            learner: Learner::new(if learn_mode { 5 } else { usize::MAX }),
        }
    }

    pub fn add_html_furigana(&mut self, text: &str) -> String {
        add_html_furigana_skip_already_ruby(
            text,
            &self.tokenizer,
            &self.exclude_kanji,
            &mut self.learner,
        )
    }
}

fn to_str<B: std::ops::Deref<Target = [u8]>>(bytes: &B) -> &str {
    std::str::from_utf8(bytes.deref()).unwrap()
}

/// Like `add_html_furigana()`, but skips text that already has ruby on it, so
/// it doesn't get double-ruby.
fn add_html_furigana_skip_already_ruby(
    text: &str,
    tokenizer: &Tokenizer,
    exclude_kanji: &HashSet<char>,
    learner: &mut Learner,
) -> String {
    let mut reader = quick_xml::Reader::from_str(text);

    let mut new_text = String::new();
    let mut rubys: i32 = 0;

    loop {
        match reader.read_event() {
            Err(e) => panic!("Error at position {}: {:?}", reader.error_position(), e),
            Ok(Event::Eof) => break,

            Ok(Event::Start(e)) => {
                if e.name().into_inner() == b"ruby" {
                    rubys += 1;
                }
                write_xml(&mut new_text, &Event::Start(e));
            }

            Ok(Event::End(e)) => {
                if e.name().into_inner() == b"ruby" {
                    rubys -= 1;
                }
                write_xml(&mut new_text, &Event::End(e));
            }

            Ok(Event::Text(e)) => {
                if rubys <= 0 {
                    new_text.push_str(&add_html_furigana(
                        to_str(&e),
                        tokenizer,
                        exclude_kanji,
                        learner,
                    ));
                } else {
                    write_xml(&mut new_text, &Event::Text(e));
                }
            }

            // All other events, just re-write them verbatim.
            Ok(e) => write_xml(&mut new_text, &e),
        }
    }

    new_text
}

/// Takes an xml event and writes it verbatim to the given string.
///
/// NOTE: really what we want is for the events to provide their byte index range
/// in the original text, so we could just write that, and even double-check that
/// we're not missing anything. But for some reason quick_xml doesn't provide
/// that information.
fn write_xml(text: &mut String, event: &quick_xml::events::Event) {
    match event {
        Event::Start(e) => {
            text.push_str("<");
            text.push_str(to_str(e));
            text.push_str(">");
        }

        Event::End(e) => {
            text.push_str("</");
            text.push_str(to_str(e));
            text.push_str(">");
        }

        Event::Empty(e) => {
            text.push_str("<");
            text.push_str(to_str(e));
            text.push_str("/>");
        }

        Event::CData(e) => {
            text.push_str("<![CDATA[");
            text.push_str(to_str(e));
            text.push_str("]]>");
        }

        Event::Comment(e) => {
            text.push_str("<!--");
            text.push_str(to_str(e));
            text.push_str("-->");
        }

        Event::Decl(e) => {
            text.push_str("<?");
            text.push_str(to_str(e));
            text.push_str("?>");
        }

        Event::PI(e) => {
            text.push_str("<?");
            text.push_str(to_str(e));
            text.push_str("?>");
        }

        Event::DocType(e) => {
            text.push_str("<!DOCTYPE");
            text.push_str(to_str(e));
            text.push_str(">");
        }

        Event::Text(e) => text.push_str(to_str(e)),

        _ => unreachable!(),
    }
}

/// Adds furigana to Japanese text, using html ruby tags.
fn add_html_furigana(
    text: &str,
    tokenizer: &Tokenizer,
    exclude_kanji: &HashSet<char>,
    learner: &mut Learner,
) -> String {
    let mut worker = tokenizer.new_worker();

    worker.reset_sentence(text);
    worker.tokenize();

    let mut new_text = String::new();
    for i in 0..worker.num_tokens() {
        let t = worker.token(i);
        let surface = t.surface();

        let needs_help = learner.needs_help(surface);
        learner.record(surface);

        if !needs_help {
            new_text.push_str(surface);
            continue;
        }

        let kana = t.feature().split(",").nth(1).unwrap();

        let furigana_text = apply_furigana(surface, kana, exclude_kanji);

        for (surf, furi) in furigana_text.iter() {
            if furi.is_empty() {
                new_text.push_str(surf);
                continue;
            }

            new_text.push_str("<ruby>");
            new_text.push_str(surf);
            new_text.push_str("<rt>");
            new_text.push_str(furi);
            new_text.push_str("</rt></ruby>");
        }
    }

    new_text
}

/// Returns a segmented list of (surface, furigana) pairs.
///
/// The furigana component of a pair may be empty, indicating no
/// furigana is needed for that surface element.
fn apply_furigana<'a>(
    surface: &'a str,
    kana: &'a str,
    exclude_kanji: &HashSet<char>,
) -> Vec<(&'a str, &'a str)> {
    let mut out = Vec::new();

    if furigana_unneeded(surface, exclude_kanji) {
        out.push((surface, ""));
        return out;
    }

    let mut surface = surface;
    let mut kana = kana;

    // Trim any kana from the start.
    {
        let mut start_s = 0;
        let mut start_k = 0;
        for (sc, kc) in surface.chars().zip(kana.chars()) {
            if is_equivalent_kana(sc, kc) {
                start_s += sc.len_utf8();
                start_k += kc.len_utf8();
            } else {
                break;
            }
        }
        out.push((&surface[..start_s], ""));
        surface = &surface[start_s..];
        kana = &kana[start_k..];
    }

    // Trim any kana from the end.
    {
        let mut end_s = surface.len();
        let mut end_k = kana.len();
        for (sc, kc) in surface.chars().rev().zip(kana.chars().rev()) {
            if is_equivalent_kana(sc, kc) {
                end_s -= sc.len_utf8();
                end_k -= kc.len_utf8();
            } else {
                break;
            }
        }
        out.push((&surface[end_s..], ""));
        surface = &surface[..end_s];
        kana = &kana[..end_k];
    }

    // Try to uniquely match kana in the middle.
    //
    // This is just best-effort, and bails in any non-trivial cases.
    while let Some((si, sc)) = surface.char_indices().find(|(_, c)| is_kana(*c)) {
        // If there's more than one match, bail.
        let equivalent_kana_count = kana
            .chars()
            .map(|c| is_equivalent_kana(c, sc))
            .fold(0usize, |count, hit| count + hit as usize);
        if equivalent_kana_count != 1 {
            break;
        }

        // Find the one match.
        let (ki, kc) = kana
            .char_indices()
            .find(|(_, c)| is_equivalent_kana(sc, *c))
            .unwrap();

        // Insert the segments just before the trailing end-segment, so that
        // segment order is preserved even when the start-segment is non-empty.
        out.insert(out.len() - 1, (&surface[..si], &kana[..ki]));
        out.insert(out.len() - 1, (&surface[si..(si + sc.len_utf8())], ""));
        surface = &surface[(si + sc.len_utf8())..];
        kana = &kana[(ki + kc.len_utf8())..];
    }

    // Left over.
    out.insert(out.len() - 1, (surface, kana));

    out.iter().filter(|(s, _)| !s.is_empty()).copied().collect()
}

/// Due to the way this is used, this isn't meant to be exact, but instead
/// liberal in what it considers equivalent.
fn is_equivalent_kana(a: char, b: char) -> bool {
    const PAIRS: &[[char; 2]] = &[['は', 'わ'], ['を', 'お'], ['づ', 'ず'], ['へ', 'え']];
    const VOWELS: &[char] = &['あ', 'い', 'う', 'え', 'お', 'ぁ', 'ぃ', 'ぅ', 'ぇ', 'ぉ'];

    let (a, b) = match (normalize_kana(a), normalize_kana(b)) {
        (Some(a), Some(b)) => (a, b),
        _ => return false,
    };

    if a == b {
        return true;
    }

    if a == 'ー' && VOWELS.contains(&b) {
        return true;
    }

    if b == 'ー' && VOWELS.contains(&a) {
        return true;
    }

    for &[c, d] in PAIRS {
        if (a == c && b == d) || (a == d && b == c) {
            return true;
        }
    }

    false
}

const HIRAGANA: u32 = 0x3041;
const KATAKANA: u32 = 0x30A1;
const KANA_COUNT: u32 = 0x3097 - HIRAGANA;

pub fn is_kana(c: char) -> bool {
    if c == 'ー' {
        return true;
    }

    let c = c as u32;

    if c >= HIRAGANA && c < (HIRAGANA + KANA_COUNT) {
        return true;
    }

    if c >= KATAKANA && c < (KATAKANA + KANA_COUNT) {
        return true;
    }

    false
}

pub fn normalize_kana(c: char) -> Option<char> {
    if !is_kana(c) {
        return None;
    }

    Some(katakana_to_hiragana(c).unwrap_or(c))
}

/// Returns true if furigana definitely isn't needed.
pub fn furigana_unneeded(text: &str, exclude_kanji: &HashSet<char>) -> bool {
    text.chars().all(|c| {
        is_kana(c) || c.is_ascii() || c.is_numeric() || c == '々' || exclude_kanji.contains(&c)
    })
}

pub fn hiragana_to_katakana(c: char) -> Option<char> {
    let c = c as u32;
    if c >= HIRAGANA && c < (HIRAGANA + KANA_COUNT) {
        char::try_from(c + KATAKANA - HIRAGANA).ok()
    } else {
        None
    }
}

pub fn katakana_to_hiragana(c: char) -> Option<char> {
    let c = c as u32;
    if c >= KATAKANA && c < (KATAKANA + KANA_COUNT) {
        char::try_from(c - KATAKANA + HIRAGANA).ok()
    } else {
        None
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn apply_furigana_01() {
        let surface = "へぇ";
        let kana = "ヘー";
        let pairs = apply_furigana(surface, kana, &HashSet::new());

        assert_eq!(&[("へぇ", "")], &pairs[..]);
    }

    #[test]
    fn apply_furigana_02() {
        let surface = "へぇー";
        let kana = "ヘー";
        let pairs = apply_furigana(surface, kana, &HashSet::new());

        assert_eq!(&[("へぇー", "")], &pairs[..]);
    }

    #[test]
    fn apply_furigana_03() {
        let surface = "へ";
        let kana = "え";
        let pairs = apply_furigana(surface, kana, &HashSet::new());

        assert_eq!(&[("へ", "")], &pairs[..]);
    }

    #[test]
    fn apply_furigana_04() {
        let surface = "食べる";
        let kana = "タベル";
        let pairs = apply_furigana(surface, kana, &HashSet::new());

        assert_eq!(&[("食", "タ"), ("べる", "")], &pairs[..]);
    }

    #[test]
    fn apply_furigana_05() {
        let surface = "流れ出す";
        let kana = "ながれだす";
        let pairs = apply_furigana(surface, kana, &HashSet::new());

        assert_eq!(
            &[("流", "なが"), ("れ", ""), ("出", "だ"), ("す", "")],
            &pairs[..]
        );
    }

    #[test]
    fn apply_furigana_06() {
        let surface = "物の怪";
        let kana = "もののけ";
        let pairs = apply_furigana(surface, kana, &HashSet::new());

        assert_eq!(&[("物の怪", "もののけ")], &pairs[..]);
    }

    #[test]
    fn is_equivalent_kana_01() {
        assert!(is_equivalent_kana('か', 'カ'));
        assert!(is_equivalent_kana('カ', 'か'));
        assert!(is_equivalent_kana('ぁ', 'ァ'));
        assert!(is_equivalent_kana('ァ', 'ぁ'));
        assert!(is_equivalent_kana('は', 'わ'));
        assert!(is_equivalent_kana('わ', 'は'));
        assert!(is_equivalent_kana('を', 'お'));
        assert!(is_equivalent_kana('お', 'を'));
        assert!(is_equivalent_kana('づ', 'ず'));
        assert!(is_equivalent_kana('ず', 'づ'));
        assert!(is_equivalent_kana('ー', 'あ'));
        assert!(is_equivalent_kana('あ', 'ー'));
        assert!(is_equivalent_kana('ー', 'ぁ'));
        assert!(is_equivalent_kana('ぁ', 'ー'));

        assert!(!is_equivalent_kana('は', 'ば'));
        assert!(!is_equivalent_kana('ー', 'か'));
        assert!(!is_equivalent_kana('た', '食'));
    }

    #[test]
    fn tokenize_01() {
        let gen = FuriganaGenerator::new(0, false);

        let mut worker = gen.tokenizer.new_worker();
        worker.reset_sentence("食べている");
        worker.tokenize();

        assert_eq!(3, worker.num_tokens());
        assert_eq!("食べ", worker.token(0).surface());
        assert_eq!("動詞-一般,タベ", worker.token(0).feature());
        assert_eq!("て", worker.token(1).surface());
        assert_eq!("助詞-接続助詞,テ", worker.token(1).feature());
        assert_eq!("いる", worker.token(2).surface());
        assert_eq!("動詞-非自立可能,イル", worker.token(2).feature());
    }

    #[test]
    fn add_html_furigana_01() {
        let mut gen = FuriganaGenerator::new(0, false);

        let text = gen
            .add_html_furigana(r#"<sup class="食う">食べる</sup>のは<ruby>良</ruby>いね!<hi />"#);

        assert_eq!(
            text,
            r#"<sup class="食う"><ruby>食<rt>タ</rt></ruby>べる</sup>のは<ruby>良</ruby>いね!<hi />"#
        );
    }
}
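
Finally, a hedged end-to-end sketch of the learn-mode behavior the commit message describes (output strings illustrative; assumes the tokenizer splits 食べる as 食べ + る, as in the tokenize_01 test):

    use furigana_gen::FuriganaGenerator;

    fn main() {
        // Learn mode on: the times-seen threshold is 5 (see `FuriganaGenerator::new`).
        let mut gen = FuriganaGenerator::new(0, true);

        for i in 0..8 {
            let html = gen.add_html_furigana("食べる");
            // Early iterations: "<ruby>食<rt>タ</rt></ruby>べる"
            // Once "食べ" has been seen more than 5 times and keeps recurring
            // within its window, it's rendered bare: "食べる"
            println!("{}: {}", i, html);
        }
    }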