Add option to include pitch accent information with the furigana
parent 7361240e49
commit adb58983a7
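Summary: FuriganaGenerator::new() gains a third parameter, mark_accent. When enabled, each word that receives furigana is prefixed with its pitch accent number(s) in <sup> tags, looked up in a pitch accent dictionary bundled as data/accents.tsv. A minimal usage sketch (illustrative, not part of the diff; behavior taken from the tests in src/lib.rs below):

    // Arguments: exclude_count, use_hiragana, mark_accent.
    let gen = FuriganaGenerator::new(0, false, true);
    let mut session = gen.new_session(false);

    // "額" (reading ヒタイ) has heiban pitch, i.e. accent number 0.
    assert_eq!(
        session.add_html_furigana("額"),
        "<sup>0</sup><ruby>額<rt>ヒタイ</rt></ruby>"
    );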
build.rs | 23 changed lines

@@ -29,11 +29,11 @@ fn main() {
         f.write_all("\n];".as_bytes()).unwrap();
     }
 
-    // Write compressed dictionary to .lz4 file.
+    // Write compressed parsing dictionary to .lz4 file.
     {
         // Read and decompress file from .xz.
         let dict_data = {
-            let f = File::open("data/dictionary/system.dic.xz").unwrap();
+            let f = File::open("data/ipadic-mecab-2_7_0/system.dic.xz").unwrap();
             let mut data = Vec::new();
             lzma_rs::xz_decompress(&mut BufReader::new(f), &mut data).unwrap();
 
@@ -47,4 +47,23 @@ fn main() {
         encoder.write(&dict_data).unwrap();
         encoder.finish().unwrap();
     }
+
+    // Write compressed pitch accent dictionary to .lz4 file.
+    {
+        // Read and decompress file from .xz.
+        let dict_data = {
+            let f = File::open("data/accents.tsv.xz").unwrap();
+            let mut data = Vec::new();
+            lzma_rs::xz_decompress(&mut BufReader::new(f), &mut data).unwrap();
+
+            data
+        };
+
+        // Recompress to .lz4.
+        let dest_path = Path::new(&out_dir).join("accents.tsv.lz4");
+        let f = File::create(dest_path).unwrap();
+        let mut encoder = lz4_flex::frame::FrameEncoder::new(f);
+        encoder.write(&dict_data).unwrap();
+        encoder.finish().unwrap();
+    }
 }
data/accents.tsv | 124138 lines (new file)
(File diff suppressed because it is too large.)
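Each line of accents.tsv, as consumed by build_accent_dictionary() in src/accent.rs below, holds three tab-separated fields: the word, its kana reading (an empty field means the word itself is the reading), and a comma-separated list of pitch accent numbers. A hypothetical entry for illustration (the real file is suppressed in this view):

    箸	はし	1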
data/accents.tsv.xz (new binary file; not shown)
(deleted file: UniDic license text; path not shown in this view)

@@ -1,31 +0,0 @@
-Copyright (c) 2011-2021, The UniDic Consortium
-Copyright (c) 2023, LegalOn Technologies, Inc.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-
- * Redistributions of source code must retain the above copyright
-   notice, this list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright
-   notice, this list of conditions and the following disclaimer in the
-   documentation and/or other materials provided with the
-   distribution.
-
- * Neither the name of the UniDic Consortium nor the names of its
-   contributors may be used to endorse or promote products derived
-   from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
(deleted file: UniDic NOTICE; path not shown in this view)

@@ -1,7 +0,0 @@
-This software includes a binary version of data from
-
-https://clrd.ninjal.ac.jp/unidic_archive/cwj/3.1.1/unidic-cwj-3.1.1-full.zip
-
-where the costs and connection ids are retrained using CORE data in BCCWJ (except the PN category)
-
-https://clrd.ninjal.ac.jp/bccwj/.

(deleted binary file; not shown)
data/ipadic-mecab-2_7_0/COPYING | 73 lines (new file)

@@ -0,0 +1,73 @@
+Copyright 2000, 2001, 2002, 2003 Nara Institute of Science
+and Technology.
+Copyright 2023, LegalOn Technologies, Inc.
+All Rights Reserved.
+
+Use, reproduction, and distribution of this software is permitted.
+Any copy of this software, whether in its original form or modified,
+must include both the above copyright notice and the following
+paragraphs.
+
+Nara Institute of Science and Technology (NAIST),
+the copyright holders, disclaims all warranties with regard to this
+software, including all implied warranties of merchantability and
+fitness, in no event shall NAIST be liable for
+any special, indirect or consequential damages or any damages
+whatsoever resulting from loss of use, data or profits, whether in an
+action of contract, negligence or other tortuous action, arising out
+of or in connection with the use or performance of this software.
+
+A large portion of the dictionary entries
+originate from ICOT Free Software. The following conditions for ICOT
+Free Software applies to the current dictionary as well.
+
+Each User may also freely distribute the Program, whether in its
+original form or modified, to any third party or parties, PROVIDED
+that the provisions of Section 3 ("NO WARRANTY") will ALWAYS appear
+on, or be attached to, the Program, which is distributed substantially
+in the same form as set out herein and that such intended
+distribution, if actually made, will neither violate or otherwise
+contravene any of the laws and regulations of the countries having
+jurisdiction over the User or the intended distribution itself.
+
+NO WARRANTY
+
+The program was produced on an experimental basis in the course of the
+research and development conducted during the project and is provided
+to users as so produced on an experimental basis. Accordingly, the
+program is provided without any warranty whatsoever, whether express,
+implied, statutory or otherwise. The term "warranty" used herein
+includes, but is not limited to, any warranty of the quality,
+performance, merchantability and fitness for a particular purpose of
+the program and the nonexistence of any infringement or violation of
+any right of any third party.
+
+Each user of the program will agree and understand, and be deemed to
+have agreed and understood, that there is no warranty whatsoever for
+the program and, accordingly, the entire risk arising from or
+otherwise connected with the program is assumed by the user.
+
+Therefore, neither ICOT, the copyright holder, or any other
+organization that participated in or was otherwise related to the
+development of the program and their respective officials, directors,
+officers and other employees shall be held liable for any and all
+damages, including, without limitation, general, special, incidental
+and consequential damages, arising out of or otherwise in connection
+with the use or inability to use the program or any product, material
+or result produced or otherwise obtained by using the program,
+regardless of whether they have been advised of, or otherwise had
+knowledge of, the possibility of such damages at any time during the
+project or thereafter. Each user will be deemed to have agreed to the
+foregoing by his or her commencement of use of the program. The term
+"use" as used herein includes, but is not limited to, the use,
+modification, copying and distribution of the program and the
+production of secondary products from the program.
+
+In the case where the program, whether in its original form or
+modified, was distributed or delivered to or received by a user from
+any person, organization or entity other than ICOT, unless it makes or
+grants independently of ICOT any specific warranty to the user in
+writing, such person, organization or entity, will also be exempted
+from and not be held liable to the user for any such damages as noted
+above as far as the program is concerned.
data/ipadic-mecab-2_7_0/NOTICE | 7 lines (new file)

@@ -0,0 +1,7 @@
+This software includes a binary version of data from
+
+http://jaist.dl.sourceforge.net/project/mecab/mecab-ipadic/2.7.0-20070801/mecab-ipadic-2.7.0-20070801.tar.gz,
+
+where the connection ids are remapped using CORE data in BCCWJ (except the PN category)
+
+https://clrd.ninjal.ac.jp/bccwj/.
data/ipadic-mecab-2_7_0/system.dic.xz (new binary file; not shown)
src/accent.rs | 61 lines (new file)

@@ -0,0 +1,61 @@
+use std::{
+    borrow::Cow,
+    io::{Cursor, Read},
+};
+
+use fnv::FnvHashMap;
+use lz4_flex::frame::FrameDecoder;
+
+// Pitch accent dictionary.
+const ACCENT: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/accents.tsv.lz4"));
+
+#[derive(Debug)]
+pub struct AccentDict {
+    table: FnvHashMap<(Cow<'static, str>, Cow<'static, str>), Vec<u8>>,
+}
+
+pub fn build_accent_dictionary() -> AccentDict {
+    let text = {
+        let mut decoder = FrameDecoder::new(Cursor::new(ACCENT));
+        let mut text = String::new();
+        decoder.read_to_string(&mut text).unwrap();
+
+        text
+    };
+
+    let mut table = FnvHashMap::default();
+    for line in text.lines() {
+        let items: Vec<_> = line.split("\t").map(|t| t.trim()).collect();
+
+        let word = items[0];
+        let kana = if items[1].is_empty() {
+            items[0]
+        } else {
+            items[1]
+        };
+        let pitches = items[2]
+            .split(",")
+            .filter_map(|p| p.parse::<u8>().ok())
+            .collect();
+
+        table.insert(
+            (
+                Cow::Owned(word.into()),
+                Cow::Owned(crate::hiragana_to_katakana_string(kana)),
+            ),
+            pitches,
+        );
+    }
+
+    AccentDict { table: table }
+}
+
+impl AccentDict {
+    pub fn get<'a>(&'a self, word: &'a str, kana: &'a str) -> &'a [u8] {
+        if let Some(p) = self.table.get(&(Cow::from(word), Cow::from(kana))) {
+            &p[..]
+        } else {
+            &[]
+        }
+    }
+}
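A note on the table keys: build_accent_dictionary() converts the kana field to katakana via crate::hiragana_to_katakana_string(), so AccentDict::get() must be called with a katakana reading. A rough lookup sketch (illustrative only):

    let dict = build_accent_dictionary();
    // Returns the known pitch accent numbers, or an empty slice for unknown pairs.
    let pitches: &[u8] = dict.get("食べる", "タベル");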
src/lib.rs | 178 changed lines

@@ -1,3 +1,4 @@
+mod accent;
 mod learner;
 
 use std::{
@@ -10,32 +11,32 @@ use lz4_flex::frame::FrameDecoder;
 use quick_xml::events::Event;
 use vibrato::{Dictionary, Tokenizer};
 
+use accent::AccentDict;
 use learner::Learner;
 
 // Include KANJI_FREQ, a frequency-ordered array of kanji characters.
 include!(concat!(env!("OUT_DIR"), "/kanji_freq_inc.rs"));
 
+// Parsing dictionary.
 const DICT: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/system.dic.lz4"));
 
 /// A list of words that the tokenizer insists on using the less common reading
 /// for, with the more common reading that should be substituted.
 ///
-/// (surface, feature, substitute_feature)
+/// (surface, kana, substitute_kana)
 const COMMON_SUBS: &[(&str, &str, &str)] = &[
-    ("額", "名詞-普通名詞-一般,ガク", "名詞-普通名詞-一般,ヒタイ"),
-    (
-        "他",
-        "名詞-普通名詞-副詞可能,タ",
-        "名詞-普通名詞-副詞可能,ホカ",
-    ),
-    ("私", "代名詞,ワタクシ", "代名詞,ワタシ"),
+    ("額", "ガク", "ヒタイ"),
+    ("他", "タ", "ホカ"),
+    ("私", "ワタクシ", "ワタシ"),
 ];
 
 pub struct FuriganaGenerator {
     tokenizer: Tokenizer,
+    accent_dict: AccentDict,
     exclude_kanji: FnvHashSet<char>,
     subs: FnvHashMap<(Cow<'static, str>, Cow<'static, str>), String>,
     use_hiragana: bool,
+    mark_accent: bool,
 }
 
 impl FuriganaGenerator {
@@ -43,7 +44,7 @@ impl FuriganaGenerator {
     // Specifically, words made up *entirely* of those kanji will be excluded.
     // If a word has some kanji that aren't in that set, even if it also has
     // some that are, it will still get furigana.
-    pub fn new(exclude_count: usize, use_hiragana: bool) -> Self {
+    pub fn new(exclude_count: usize, use_hiragana: bool, mark_accent: bool) -> Self {
         let dict = {
             // Note: we could just pass the decoder straight to `Dictionary::read()`
             // below, and it would work. However, that ends up being slower than
@@ -73,29 +74,35 @@ impl FuriganaGenerator {
 
         Self {
             tokenizer: Tokenizer::new(dict),
+            accent_dict: accent::build_accent_dictionary(),
             exclude_kanji: exclude_kanji,
             subs: subs,
             use_hiragana: use_hiragana,
+            mark_accent: mark_accent,
         }
     }
 
     pub fn new_session(&self, learn_mode: bool) -> Session<'_> {
         Session {
             tokenizer: &self.tokenizer,
+            accent_dict: &self.accent_dict,
             exclude_kanji: &self.exclude_kanji,
             subs: &self.subs,
             learner: Learner::new(if learn_mode { 3 } else { usize::MAX }),
             use_hiragana: self.use_hiragana,
+            mark_accent: self.mark_accent,
         }
     }
 }
 
 pub struct Session<'a> {
     tokenizer: &'a Tokenizer,
+    accent_dict: &'a AccentDict,
     exclude_kanji: &'a FnvHashSet<char>,
     subs: &'a FnvHashMap<(Cow<'a, str>, Cow<'a, str>), String>,
     learner: Learner,
     use_hiragana: bool,
+    mark_accent: bool,
 }
 
 impl<'a> Session<'a> {
@@ -116,10 +123,12 @@ impl<'a> Session<'a> {
         add_html_furigana_skip_already_ruby(
             &text,
             &self.tokenizer,
+            &self.accent_dict,
             &self.exclude_kanji,
             &self.subs,
             &mut self.learner,
             self.use_hiragana,
+            self.mark_accent,
         )
     }
 }
@@ -132,10 +141,12 @@ fn to_str<B: std::ops::Deref<Target = [u8]>>(bytes: &B) -> &str {
 fn add_html_furigana_skip_already_ruby(
     text: &str,
     tokenizer: &Tokenizer,
+    accent_dict: &AccentDict,
     exclude_kanji: &FnvHashSet<char>,
     subs: &FnvHashMap<(Cow<str>, Cow<str>), String>,
     learner: &mut Learner,
     use_hiragana: bool,
+    mark_accent: bool,
 ) -> String {
     let mut reader = quick_xml::Reader::from_str(text);
 
@@ -171,10 +182,12 @@ fn add_html_furigana_skip_already_ruby(
             new_text.push_str(&add_html_furigana(
                 to_str(&e),
                 tokenizer,
+                accent_dict,
                 exclude_kanji,
                 subs,
                 learner,
                 use_hiragana,
+                mark_accent,
             ));
         } else {
             write_xml(&mut new_text, &Event::Text(e));
@@ -255,10 +268,12 @@ fn write_xml(text: &mut String, event: &quick_xml::events::Event) {
 fn add_html_furigana(
     text: &str,
     tokenizer: &Tokenizer,
+    accent_dict: &AccentDict,
     exclude_kanji: &FnvHashSet<char>,
     subs: &FnvHashMap<(Cow<str>, Cow<str>), String>,
     learner: &mut Learner,
     use_hiragana: bool,
+    mark_accent: bool,
 ) -> String {
     let mut worker = tokenizer.new_worker();
 
@@ -268,15 +283,28 @@ fn add_html_furigana(
     let mut new_text = String::new();
     for i in 0..worker.num_tokens() {
         let t = worker.token(i);
-        let (surface, feature) = {
+        let (surface, kana, pitches) = {
             let surface = t.surface();
             let feature = t.feature();
 
-            if let Some(sub_feature) = subs.get(&(Cow::from(surface), Cow::from(feature))) {
-                (surface, sub_feature.as_str())
+            let kana_1 = feature.rsplit(",").nth(0).unwrap();
+            let kana_2 = feature.rsplit(",").nth(1).unwrap();
+            let word = feature.rsplit(",").nth(2).unwrap();
+
+            let (kana, pkana) =
+                if let Some(sub_kana) = subs.get(&(Cow::from(surface), Cow::from(kana_1))) {
+                    (sub_kana.as_str(), sub_kana.as_str())
+                } else {
+                    (kana_1, kana_2)
+                };
+
+            let pitches = if mark_accent {
+                accent_dict.get(word, pkana)
             } else {
-                (surface, feature)
-            }
+                &[]
+            };
+
+            (surface, kana, pitches)
         };
 
         let needs_help = learner.needs_help(surface);
@@ -287,28 +315,33 @@ fn add_html_furigana(
             continue;
         }
 
-        let kana = {
-            let kana = feature.split(",").nth(1).unwrap();
-            if use_hiragana {
-                katakana_to_hiragana_string(kana)
-            } else {
-                kana.into()
-            }
+        let kana = if use_hiragana {
+            katakana_to_hiragana_string(kana)
+        } else {
+            kana.into()
         };
 
         let furigana_text = apply_furigana(surface, &kana, exclude_kanji);
 
-        for (surf, furi) in furigana_text.iter() {
-            if furi.is_empty() {
-                new_text.push_str(surf);
-                continue;
+        if furigana_text.is_empty() {
+            new_text.push_str(surface);
+        } else {
+            for pitch in pitches {
+                new_text.push_str(&format!("<sup>{}</sup>", pitch));
             }
 
-            new_text.push_str("<ruby>");
-            new_text.push_str(surf);
-            new_text.push_str("<rt>");
-            new_text.push_str(furi);
-            new_text.push_str("</rt></ruby>");
+            for (surf, furi) in furigana_text.iter() {
+                if furi.is_empty() {
+                    new_text.push_str(surf);
+                    continue;
+                }
+
+                new_text.push_str("<ruby>");
+                new_text.push_str(surf);
+                new_text.push_str("<rt>");
+                new_text.push_str(furi);
+                new_text.push_str("</rt></ruby>");
+            }
         }
     }
@@ -326,9 +359,8 @@ fn apply_furigana<'a>(
 ) -> Vec<(&'a str, &'a str)> {
     let mut out = Vec::new();
 
-    if furigana_unneeded(surface, exclude_kanji) {
-        out.push((surface, ""));
-        return out;
+    if furigana_unneeded(surface, exclude_kanji) || !is_kana_str(kana) {
+        return Vec::new();
     }
 
     let mut surface = surface;
@@ -454,6 +486,10 @@ pub fn is_kana(c: char) -> bool {
     return false;
 }
 
+pub fn is_kana_str(text: &str) -> bool {
+    text.chars().all(|c| is_kana(c))
+}
+
 pub fn normalize_kana(c: char) -> Option<char> {
     if !is_kana(c) {
         return None;
@@ -497,6 +533,16 @@ pub fn katakana_to_hiragana_string(text: &str) -> String {
     new_text
 }
 
+pub fn hiragana_to_katakana_string(text: &str) -> String {
+    let mut new_text = String::new();
+
+    for c in text.chars() {
+        new_text.push(hiragana_to_katakana(c).unwrap_or(c));
+    }
+
+    new_text
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -505,7 +551,12 @@ mod tests {
     pub fn get_furigana_gen() -> &'static FuriganaGenerator {
         use std::sync::OnceLock;
         static FURIGEN: OnceLock<FuriganaGenerator> = OnceLock::new();
-        FURIGEN.get_or_init(|| FuriganaGenerator::new(0, false))
+        FURIGEN.get_or_init(|| FuriganaGenerator::new(0, false, false))
+    }
+    pub fn get_furigana_gen_with_accent() -> &'static FuriganaGenerator {
+        use std::sync::OnceLock;
+        static FURIGEN: OnceLock<FuriganaGenerator> = OnceLock::new();
+        FURIGEN.get_or_init(|| FuriganaGenerator::new(0, false, true))
     }
 
     #[test]
@@ -514,7 +565,7 @@ mod tests {
         let kana = "ヘー";
         let pairs = apply_furigana(surface, kana, &FnvHashSet::default());
 
-        assert_eq!(&[("へぇ", "")], &pairs[..]);
+        assert!(pairs.is_empty());
     }
 
     #[test]
@@ -523,7 +574,7 @@ mod tests {
         let kana = "ヘー";
         let pairs = apply_furigana(surface, kana, &FnvHashSet::default());
 
-        assert_eq!(&[("へぇー", "")], &pairs[..]);
+        assert!(pairs.is_empty());
     }
 
     #[test]
@@ -532,7 +583,7 @@ mod tests {
         let kana = "え";
         let pairs = apply_furigana(surface, kana, &FnvHashSet::default());
 
-        assert_eq!(&[("へ", "")], &pairs[..]);
+        assert!(pairs.is_empty());
    }
 
    #[test]
@@ -606,39 +657,80 @@ mod tests {
 
         assert_eq!(3, worker.num_tokens());
         assert_eq!("食べ", worker.token(0).surface());
-        assert_eq!("動詞-一般,タベ", worker.token(0).feature());
+        assert_eq!(
+            "動詞,自立,*,*,一段,連用形,食べる,タベ,タベ",
+            worker.token(0).feature()
+        );
         assert_eq!("て", worker.token(1).surface());
-        assert_eq!("助詞-接続助詞,テ", worker.token(1).feature());
+        assert_eq!("助詞,接続助詞,*,*,*,*,て,テ,テ", worker.token(1).feature());
         assert_eq!("いる", worker.token(2).surface());
-        assert_eq!("動詞-非自立可能,イル", worker.token(2).feature());
+        assert_eq!(
+            "動詞,非自立,*,*,一段,基本形,いる,イル,イル",
+            worker.token(2).feature()
+        );
+    }
+
+    #[test]
+    fn tokenize_02() {
+        let mut worker = get_furigana_gen().tokenizer.new_worker();
+
+        worker.reset_sentence("そう");
+        worker.tokenize();
+
+        assert_eq!(1, worker.num_tokens());
+        assert_eq!(
+            "副詞,助詞類接続,*,*,*,*,そう,ソウ,ソー",
+            worker.token(0).feature()
+        );
     }
 
     #[test]
     fn add_html_furigana_01() {
         let mut gen = get_furigana_gen().new_session(false);
+        let mut gen_accent = get_furigana_gen_with_accent().new_session(false);
 
-        let text = gen
-            .add_html_furigana(r#"<sup class="食う">食べる</sup>のは<ruby>良</ruby>いね!<hi />"#);
+        let text = r#"<sup class="食う">食べる</sup>のは<ruby>良</ruby>いね!<hi />"#;
+        let furi_1 = gen.add_html_furigana(text);
+        let furi_2 = gen_accent.add_html_furigana(text);
 
         assert_eq!(
-            text,
+            furi_1,
             r#"<sup class="食う"><ruby>食<rt>タ</rt></ruby>べる</sup>のは<ruby>良</ruby>いね!<hi />"#
         );
+        assert_eq!(
+            furi_2,
+            r#"<sup class="食う"><sup>2</sup><ruby>食<rt>タ</rt></ruby>べる</sup>のは<ruby>良</ruby>いね!<hi />"#
+        );
     }
 
     // Testing custom substitutions.
     #[test]
     fn add_html_furigana_02() {
         let mut gen = get_furigana_gen().new_session(false);
+        let mut gen_accent = get_furigana_gen_with_accent().new_session(false);
 
         assert_eq!(
             gen.add_html_furigana("額"),
             "<ruby>額<rt>ヒタイ</rt></ruby>"
         );
+        assert_eq!(
+            gen_accent.add_html_furigana("額"),
+            "<sup>0</sup><ruby>額<rt>ヒタイ</rt></ruby>"
+        );
 
         assert_eq!(gen.add_html_furigana("他"), "<ruby>他<rt>ホカ</rt></ruby>");
+        assert_eq!(
+            gen_accent.add_html_furigana("他"),
+            "<sup>0</sup><ruby>他<rt>ホカ</rt></ruby>"
+        );
 
         assert_eq!(
             gen.add_html_furigana("私"),
             "<ruby>私<rt>ワタシ</rt></ruby>"
         );
+        assert_eq!(
+            gen_accent.add_html_furigana("私"),
+            "<sup>0</sup><ruby>私<rt>ワタシ</rt></ruby>"
+        );
     }
 }
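For reference, the markup produced when mark_accent is enabled, as exercised by add_html_furigana_01 and add_html_furigana_02 above: one <sup>N</sup> element per known pitch accent number is emitted immediately before the <ruby> group, e.g. <sup>2</sup><ruby>食<rt>タ</rt></ruby>べる. Words that receive no furigana also get no accent marks, since the pitch numbers are only written in the non-empty furigana_text branch of add_html_furigana().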