First commit.
A furigana generator that can do "spaced repetition"-style reduction of furigana over the course of a text.
commit 1c3afed157
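
For context, a minimal sketch of the intended usage, based on the `FuriganaGenerator` API in src/lib.rs below (the expected output matches the `add_html_furigana_01` test):

    fn main() {
        // `new(0, false)`: exclude no kanji by frequency, and disable learn
        // mode, so every kanji word gets furigana every time.
        let mut gen = furigana_gen::FuriganaGenerator::new(0, false);

        let html = gen.add_html_furigana("食べる");
        // -> "<ruby>食<rt>タ</rt></ruby>べる"
        // (Readings are emitted in katakana, as produced by the tokenizer.)
        println!("{}", html);
    }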
.gitignore (vendored), new file, 3 lines

@@ -0,0 +1,3 @@
Cargo.lock
/target
/test_text
Cargo.toml, new file, 17 lines

@@ -0,0 +1,17 @@
[package]
name = "furigana_gen"
version = "0.1.0"
edition = "2021"

[lib]
name = "furigana_gen"
path = "src/lib.rs"

[dependencies]
vibrato = "0.5"
lz4_flex = "0.11"
quick-xml = "0.36.1"

[build-dependencies]
lzma-rs = "0.3"
lz4_flex = "0.11"
build.rs, new file, 50 lines

@@ -0,0 +1,50 @@
use std::{
    env,
    fs::File,
    io::{BufReader, Write},
    path::Path,
};

const KANJI: &str = include_str!("data/kanji_frequency.txt");

fn main() {
    let out_dir = env::var("OUT_DIR").unwrap();

    // Write frequency-ordered kanji array to a Rust file.
    {
        let dest_path = Path::new(&out_dir).join("kanji_freq_inc.rs");
        let mut f = File::create(&dest_path).unwrap();

        f.write_all("const KANJI_FREQ: &[char] = &[".as_bytes())
            .unwrap();

        for c in KANJI.chars() {
            if c.is_whitespace() {
                continue;
            }

            f.write_all(format!("\n'{}',", c).as_bytes()).unwrap();
        }

        f.write_all("\n];".as_bytes()).unwrap();
    }

    // Write compressed dictionary to .lz4 file.
    {
        // Read and decompress file from .xz.
        let dict_data = {
            let f = File::open("data/dictionary/system.dic.xz").unwrap();
            let mut data = Vec::new();
            lzma_rs::xz_decompress(&mut BufReader::new(f), &mut data).unwrap();

            data
        };

        // Recompress to .lz4.
        let dest_path = Path::new(&out_dir).join("system.dic.lz4");
        let f = File::create(dest_path).unwrap();
        let mut encoder = lz4_flex::frame::FrameEncoder::new(f);
        // Use write_all (not write) so the whole buffer is guaranteed to be written.
        encoder.write_all(&dict_data).unwrap();
        encoder.finish().unwrap();
    }
}
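
For reference, the generated kanji_freq_inc.rs is just a char-array literal in the exact format the loop above emits (one quoted char per line). The specific characters here are illustrative only; the real ones come from data/kanji_frequency.txt, most frequent first:

    // <OUT_DIR>/kanji_freq_inc.rs (illustrative contents)
    const KANJI_FREQ: &[char] = &[
    '日',
    '一',
    '大',
    // ...roughly 4000 more entries, in descending frequency order...
    ];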
data/dictionary/BSD, new file, 31 lines

@@ -0,0 +1,31 @@
Copyright (c) 2011-2021, The UniDic Consortium
Copyright (c) 2023, LegalOn Technologies, Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

 * Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.

 * Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the
   distribution.

 * Neither the name of the UniDic Consortium nor the names of its
   contributors may be used to endorse or promote products derived
   from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
data/dictionary/NOTICE, new file, 7 lines

@@ -0,0 +1,7 @@
This software includes a binary version of data from

https://clrd.ninjal.ac.jp/unidic_archive/cwj/3.1.1/unidic-cwj-3.1.1-full.zip

where the costs and connection ids are retrained using CORE data in BCCWJ (except the PN category)

https://clrd.ninjal.ac.jp/bccwj/.
data/dictionary/system.dic.xz, new binary file (not shown)
data/kanji_frequency.txt, new file, 4001 lines (diff suppressed: file too large)
src/learner.rs, new file, 67 lines

@@ -0,0 +1,67 @@
use std::collections::HashMap;

const MIN_MAX_DISTANCE: usize = 100;
const MAX_MAX_DISTANCE: usize = 10000;

#[derive(Debug, Copy, Clone)]
struct WordStats {
    // The last position (in words processed) that this word was seen at.
    last_seen_at: usize,

    // How many times this word has been seen so far.
    times_seen: usize,

    // Maximum distance before help is needed again.
    max_distance: usize,
}

pub struct Learner {
    stats: HashMap<String, WordStats>,
    words_processed: usize,
    times_seen_threshold: usize,
}

impl Learner {
    pub fn new(times_seen_threshold: usize) -> Self {
        Self {
            stats: HashMap::new(),
            words_processed: 0,
            times_seen_threshold,
        }
    }

    pub fn record(&mut self, word: &str) {
        self.stats
            .entry(word.to_string())
            .and_modify(|stats| {
                let distance = self.words_processed - stats.last_seen_at;

                stats.last_seen_at = self.words_processed;
                stats.times_seen += 1;
                if stats.times_seen <= self.times_seen_threshold {
                    return;
                }

                // If the word showed up again within its window, widen the
                // window (by at most 50%), capped at MAX_MAX_DISTANCE.
                if distance < stats.max_distance {
                    stats.max_distance += distance.min((stats.max_distance as f64 * 0.5) as usize);
                }

                stats.max_distance = stats.max_distance.min(MAX_MAX_DISTANCE);
            })
            .or_insert(WordStats {
                last_seen_at: self.words_processed,
                times_seen: 1,
                max_distance: MIN_MAX_DISTANCE,
            });
        self.words_processed += 1;
    }

    pub fn needs_help(&self, word: &str) -> bool {
        if let Some(stats) = self.stats.get(word) {
            let distance = self.words_processed - stats.last_seen_at;
            stats.times_seen <= self.times_seen_threshold || distance > stats.max_distance
        } else {
            true
        }
    }
}
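
A quick sketch of the spaced-repetition behavior (a hypothetical unit test, not part of this commit; it would live inside src/learner.rs since `Learner` is in a private module): a word needs help until it has been recorded more than the threshold number of times, and needs help again once it hasn't been seen for longer than its current window:

    #[test]
    fn learner_behavior_sketch() {
        // Threshold of 2: the first two sightings still get furigana.
        let mut learner = Learner::new(2);

        assert!(learner.needs_help("食べる")); // Never seen.
        learner.record("食べる");
        assert!(learner.needs_help("食べる")); // Seen once, still <= threshold.
        learner.record("食べる");
        learner.record("食べる");
        assert!(!learner.needs_help("食べる")); // Seen 3 times, within window.

        // Simulate reading 200 other words; that far exceeds the word's
        // current window (just over MIN_MAX_DISTANCE = 100), so it needs
        // help again.
        for i in 0..200 {
            learner.record(&format!("word{}", i));
        }
        assert!(learner.needs_help("食べる"));
    }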
src/lib.rs, new file, 516 lines

@@ -0,0 +1,516 @@
mod learner;

use std::{
    collections::HashSet,
    io::{Cursor, Read},
};

use lz4_flex::frame::FrameDecoder;
use quick_xml::events::Event;
use vibrato::{Dictionary, Tokenizer};

use learner::Learner;

// Include KANJI_FREQ, a frequency-ordered array of kanji characters.
include!(concat!(env!("OUT_DIR"), "/kanji_freq_inc.rs"));

const DICT: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/system.dic.lz4"));

pub struct FuriganaGenerator {
    tokenizer: Tokenizer,
    exclude_kanji: HashSet<char>,
    learner: Learner,
}

impl FuriganaGenerator {
    // `exclude_count`: exclude the N most frequent kanji from furigana.
    // Specifically, words made up *entirely* of those kanji will be excluded.
    // If a word has some kanji that aren't in that set, even if it also has
    // some that are, it will still get furigana.
    pub fn new(exclude_count: usize, learn_mode: bool) -> Self {
        let dict = {
            // Note: we could just pass the decoder straight to `Dictionary::read()`
            // below, and it would work. However, that ends up being slower than
            // first decompressing the whole thing ahead of time.
            let mut decoder = FrameDecoder::new(Cursor::new(DICT));
            let mut data = Vec::new();
            decoder.read_to_end(&mut data).unwrap();

            Dictionary::read(Cursor::new(&data)).unwrap()
        };

        let exclude_kanji = {
            let mut set = HashSet::new();
            for &c in KANJI_FREQ.iter().take(exclude_count) {
                set.insert(c);
            }
            set
        };

        Self {
            tokenizer: Tokenizer::new(dict),
            exclude_kanji,
            learner: Learner::new(if learn_mode { 5 } else { usize::MAX }),
        }
    }

    pub fn add_html_furigana(&mut self, text: &str) -> String {
        add_html_furigana_skip_already_ruby(
            text,
            &self.tokenizer,
            &self.exclude_kanji,
            &mut self.learner,
        )
    }
}

fn to_str<B: std::ops::Deref<Target = [u8]>>(bytes: &B) -> &str {
    std::str::from_utf8(bytes.deref()).unwrap()
}

/// Like `add_html_furigana()`, but skips text that already has ruby on it, so
/// it doesn't get double-ruby.
fn add_html_furigana_skip_already_ruby(
    text: &str,
    tokenizer: &Tokenizer,
    exclude_kanji: &HashSet<char>,
    learner: &mut Learner,
) -> String {
    let mut reader = quick_xml::Reader::from_str(text);

    let mut new_text = String::new();
    let mut rubys: i32 = 0;

    loop {
        match reader.read_event() {
            Err(e) => panic!("Error at position {}: {:?}", reader.error_position(), e),
            Ok(Event::Eof) => break,

            Ok(Event::Start(e)) => {
                if e.name().into_inner() == b"ruby" {
                    rubys += 1;
                }
                write_xml(&mut new_text, &Event::Start(e));
            }

            Ok(Event::End(e)) => {
                if e.name().into_inner() == b"ruby" {
                    rubys -= 1;
                }
                write_xml(&mut new_text, &Event::End(e));
            }

            Ok(Event::Text(e)) => {
                if rubys <= 0 {
                    new_text.push_str(&add_html_furigana(
                        to_str(&e),
                        tokenizer,
                        exclude_kanji,
                        learner,
                    ));
                } else {
                    write_xml(&mut new_text, &Event::Text(e));
                }
            }

            // All other events, just re-write them verbatim.
            Ok(e) => write_xml(&mut new_text, &e),
        }
    }

    new_text
}

/// Takes an xml event and writes it verbatim to the given string.
///
/// NOTE: really what we want is for the events to provide their byte index range
/// in the original text, so we could just write that, and even double-check that
/// we're not missing anything. But for some reason quick_xml doesn't provide
/// that information.
fn write_xml(text: &mut String, event: &quick_xml::events::Event) {
    match event {
        Event::Start(e) => {
            text.push_str("<");
            text.push_str(to_str(e));
            text.push_str(">");
        }

        Event::End(e) => {
            text.push_str("</");
            text.push_str(to_str(e));
            text.push_str(">");
        }

        Event::Empty(e) => {
            text.push_str("<");
            text.push_str(to_str(e));
            text.push_str("/>");
        }

        Event::CData(e) => {
            text.push_str("<![CDATA[");
            text.push_str(to_str(e));
            text.push_str("]]>");
        }

        Event::Comment(e) => {
            text.push_str("<!--");
            text.push_str(to_str(e));
            text.push_str("-->");
        }

        Event::Decl(e) => {
            text.push_str("<?");
            text.push_str(to_str(e));
            text.push_str("?>");
        }

        Event::PI(e) => {
            text.push_str("<?");
            text.push_str(to_str(e));
            text.push_str("?>");
        }

        Event::DocType(e) => {
            text.push_str("<!DOCTYPE");
            text.push_str(to_str(e));
            text.push_str(">");
        }

        Event::Text(e) => text.push_str(to_str(e)),

        _ => unreachable!(),
    }
}

/// Adds furigana to Japanese text, using html ruby tags.
fn add_html_furigana(
    text: &str,
    tokenizer: &Tokenizer,
    exclude_kanji: &HashSet<char>,
    learner: &mut Learner,
) -> String {
    let mut worker = tokenizer.new_worker();

    worker.reset_sentence(text);
    worker.tokenize();

    let mut new_text = String::new();
    for i in 0..worker.num_tokens() {
        let t = worker.token(i);
        let surface = t.surface();

        let needs_help = learner.needs_help(surface);
        learner.record(surface);

        if !needs_help {
            new_text.push_str(surface);
            continue;
        }

        let kana = t.feature().split(",").nth(1).unwrap();

        let furigana_text = apply_furigana(surface, kana, exclude_kanji);

        for (surf, furi) in furigana_text.iter() {
            if furi.is_empty() {
                new_text.push_str(surf);
                continue;
            }

            new_text.push_str("<ruby>");
            new_text.push_str(surf);
            new_text.push_str("<rt>");
            new_text.push_str(furi);
            new_text.push_str("</rt></ruby>");
        }
    }

    new_text
}

/// Returns a segmented list of (surface, furigana) pairs.
///
/// The furigana component of a pair may be empty, indicating no
/// furigana is needed for that surface element.
fn apply_furigana<'a>(
    surface: &'a str,
    kana: &'a str,
    exclude_kanji: &HashSet<char>,
) -> Vec<(&'a str, &'a str)> {
    let mut out = Vec::new();

    if furigana_unneeded(surface, exclude_kanji) {
        out.push((surface, ""));
        return out;
    }

    let mut surface = surface;
    let mut kana = kana;

    // Trim any kana from the start.
    {
        let mut start_s = 0;
        let mut start_k = 0;
        for (sc, kc) in surface.chars().zip(kana.chars()) {
            if is_equivalent_kana(sc, kc) {
                start_s += sc.len_utf8();
                start_k += kc.len_utf8();
            } else {
                break;
            }
        }
        out.push((&surface[..start_s], ""));
        surface = &surface[start_s..];
        kana = &kana[start_k..];
    }

    // Trim any kana from the end.
    {
        let mut end_s = surface.len();
        let mut end_k = kana.len();
        for (sc, kc) in surface.chars().rev().zip(kana.chars().rev()) {
            if is_equivalent_kana(sc, kc) {
                end_s -= sc.len_utf8();
                end_k -= kc.len_utf8();
            } else {
                break;
            }
        }
        out.push((&surface[end_s..], ""));
        surface = &surface[..end_s];
        kana = &kana[..end_k];
    }

    // Try to uniquely match kana in the middle.
    //
    // This is just best-effort, and bails in any non-trivial cases.
    while let Some((si, sc)) = surface.char_indices().find(|(_, c)| is_kana(*c)) {
        // If there's more than one match, bail.
        let equivalent_kana_count = kana
            .chars()
            .map(|c| is_equivalent_kana(c, sc))
            .fold(0usize, |count, hit| count + hit as usize);
        if equivalent_kana_count != 1 {
            break;
        }

        // Find the one match.
        let (ki, kc) = kana
            .char_indices()
            .find(|(_, c)| is_equivalent_kana(sc, *c))
            .unwrap();

        // Insert the segments just before the trailing end-segment, so that
        // segment order is preserved even when the start-segment is non-empty.
        out.insert(out.len() - 1, (&surface[..si], &kana[..ki]));
        out.insert(out.len() - 1, (&surface[si..(si + sc.len_utf8())], ""));
        surface = &surface[(si + sc.len_utf8())..];
        kana = &kana[(ki + kc.len_utf8())..];
    }

    // Left over.
    out.insert(out.len() - 1, (surface, kana));

    out.iter().filter(|(s, _)| !s.is_empty()).copied().collect()
}

/// Due to the way this is used, this isn't meant to be exact, but instead
/// liberal in what it considers equivalent.
fn is_equivalent_kana(a: char, b: char) -> bool {
    const PAIRS: &[[char; 2]] = &[['は', 'わ'], ['を', 'お'], ['づ', 'ず'], ['へ', 'え']];
    const VOWELS: &[char] = &['あ', 'い', 'う', 'え', 'お', 'ぁ', 'ぃ', 'ぅ', 'ぇ', 'ぉ'];

    let (a, b) = match (normalize_kana(a), normalize_kana(b)) {
        (Some(a), Some(b)) => (a, b),
        _ => return false,
    };

    if a == b {
        return true;
    }

    if a == 'ー' && VOWELS.contains(&b) {
        return true;
    }

    if b == 'ー' && VOWELS.contains(&a) {
        return true;
    }

    for &[c, d] in PAIRS {
        if (a == c && b == d) || (a == d && b == c) {
            return true;
        }
    }

    false
}

const HIRAGANA: u32 = 0x3041;
const KATAKANA: u32 = 0x30A1;
const KANA_COUNT: u32 = 0x3097 - HIRAGANA;

pub fn is_kana(c: char) -> bool {
    if c == 'ー' {
        return true;
    }

    let c = c as u32;

    if c >= HIRAGANA && c < (HIRAGANA + KANA_COUNT) {
        return true;
    }

    if c >= KATAKANA && c < (KATAKANA + KANA_COUNT) {
        return true;
    }

    false
}

pub fn normalize_kana(c: char) -> Option<char> {
    if !is_kana(c) {
        return None;
    }

    Some(katakana_to_hiragana(c).unwrap_or(c))
}

/// Returns true if furigana definitely isn't needed.
pub fn furigana_unneeded(text: &str, exclude_kanji: &HashSet<char>) -> bool {
    text.chars().all(|c| {
        is_kana(c) || c.is_ascii() || c.is_numeric() || c == '々' || exclude_kanji.contains(&c)
    })
}

pub fn hiragana_to_katakana(c: char) -> Option<char> {
    let c = c as u32;
    if c >= HIRAGANA && c < (HIRAGANA + KANA_COUNT) {
        char::try_from(c + KATAKANA - HIRAGANA).ok()
    } else {
        None
    }
}

pub fn katakana_to_hiragana(c: char) -> Option<char> {
    let c = c as u32;
    if c >= KATAKANA && c < (KATAKANA + KANA_COUNT) {
        char::try_from(c - KATAKANA + HIRAGANA).ok()
    } else {
        None
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn apply_furigana_01() {
        let surface = "へぇ";
        let kana = "ヘー";
        let pairs = apply_furigana(surface, kana, &HashSet::new());

        assert_eq!(&[("へぇ", "")], &pairs[..]);
    }

    #[test]
    fn apply_furigana_02() {
        let surface = "へぇー";
        let kana = "ヘー";
        let pairs = apply_furigana(surface, kana, &HashSet::new());

        assert_eq!(&[("へぇー", "")], &pairs[..]);
    }

    #[test]
    fn apply_furigana_03() {
        let surface = "へ";
        let kana = "え";
        let pairs = apply_furigana(surface, kana, &HashSet::new());

        assert_eq!(&[("へ", "")], &pairs[..]);
    }

    #[test]
    fn apply_furigana_04() {
        let surface = "食べる";
        let kana = "タベル";
        let pairs = apply_furigana(surface, kana, &HashSet::new());

        assert_eq!(&[("食", "タ"), ("べる", "")], &pairs[..]);
    }

    #[test]
    fn apply_furigana_05() {
        let surface = "流れ出す";
        let kana = "ながれだす";
        let pairs = apply_furigana(surface, kana, &HashSet::new());

        assert_eq!(
            &[("流", "なが"), ("れ", ""), ("出", "だ"), ("す", "")],
            &pairs[..]
        );
    }

    #[test]
    fn apply_furigana_06() {
        let surface = "物の怪";
        let kana = "もののけ";
        let pairs = apply_furigana(surface, kana, &HashSet::new());

        assert_eq!(&[("物の怪", "もののけ")], &pairs[..]);
    }

    #[test]
    fn is_equivalent_kana_01() {
        assert!(is_equivalent_kana('か', 'カ'));
        assert!(is_equivalent_kana('カ', 'か'));
        assert!(is_equivalent_kana('ぁ', 'ァ'));
        assert!(is_equivalent_kana('ァ', 'ぁ'));
        assert!(is_equivalent_kana('は', 'わ'));
        assert!(is_equivalent_kana('わ', 'は'));
        assert!(is_equivalent_kana('を', 'お'));
        assert!(is_equivalent_kana('お', 'を'));
        assert!(is_equivalent_kana('づ', 'ず'));
        assert!(is_equivalent_kana('ず', 'づ'));
        assert!(is_equivalent_kana('ー', 'あ'));
        assert!(is_equivalent_kana('あ', 'ー'));
        assert!(is_equivalent_kana('ー', 'ぁ'));
        assert!(is_equivalent_kana('ぁ', 'ー'));

        assert!(!is_equivalent_kana('は', 'ば'));
        assert!(!is_equivalent_kana('ー', 'か'));
        assert!(!is_equivalent_kana('た', '食'));
    }

    #[test]
    fn tokenize_01() {
        let gen = FuriganaGenerator::new(0, false);

        let mut worker = gen.tokenizer.new_worker();
        worker.reset_sentence("食べている");
        worker.tokenize();

        assert_eq!(3, worker.num_tokens());
        assert_eq!("食べ", worker.token(0).surface());
        assert_eq!("動詞-一般,タベ", worker.token(0).feature());
        assert_eq!("て", worker.token(1).surface());
        assert_eq!("助詞-接続助詞,テ", worker.token(1).feature());
        assert_eq!("いる", worker.token(2).surface());
        assert_eq!("動詞-非自立可能,イル", worker.token(2).feature());
    }

    #[test]
    fn add_html_furigana_01() {
        let mut gen = FuriganaGenerator::new(0, false);

        let text = gen
            .add_html_furigana(r#"<sup class="食う">食べる</sup>のは<ruby>良</ruby>いね!<hi />"#);

        assert_eq!(
            text,
            r#"<sup class="食う"><ruby>食<rt>タ</rt></ruby>べる</sup>のは<ruby>良</ruby>いね!<hi />"#
        );
    }
}
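
Finally, a hedged end-to-end sketch of the learn-mode behavior the commit message describes (output strings illustrative; assumes the tokenizer splits 食べる as 食べ + る, as in the tokenize_01 test):

    use furigana_gen::FuriganaGenerator;

    fn main() {
        // Learn mode on: the times-seen threshold is 5 (see `FuriganaGenerator::new`).
        let mut gen = FuriganaGenerator::new(0, true);

        for i in 0..8 {
            let html = gen.add_html_furigana("食べる");
            // Early iterations: "<ruby>食<rt>タ</rt></ruby>べる"
            // Once "食べ" has been seen more than 5 times and keeps recurring
            // within its window, it's rendered bare: "食べる"
            println!("{}: {}", i, html);
        }
    }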