First commit.

A furigana generator that can do "spaced repetition"-style reduction
of furigana over the course of a text.
This commit is contained in:
Nathan Vegdahl 2024-09-10 18:22:53 +02:00
commit 1c3afed157
9 changed files with 4692 additions and 0 deletions

3
.gitignore vendored Normal file
View File

@ -0,0 +1,3 @@
Cargo.lock
/target
/test_text

17
Cargo.toml Normal file
View File

@ -0,0 +1,17 @@
[package]
name = "furigana_gen"
version = "0.1.0"
edition = "2021"
[lib]
name = "furigana_gen"
path = "src/lib.rs"
[dependencies]
vibrato = "0.5"
lz4_flex = "0.11"
quick-xml = "0.36.1"
[build-dependencies]
lzma-rs = "0.3"
lz4_flex = "0.11"

50
build.rs Normal file
View File

@ -0,0 +1,50 @@
use std::{
env,
fs::File,
io::{BufReader, Write},
path::Path,
};
// Frequency-ordered kanji list, one kanji per line (most frequent first).
const KANJI: &str = include_str!("data/kanji_frequency.txt");

/// Build script: generates the `KANJI_FREQ` array and recompresses the
/// tokenizer dictionary from .xz to .lz4 (faster to decompress at startup).
///
/// Panics (failing the build) on any I/O error, which is the conventional
/// behavior for build scripts.
fn main() {
    let out_dir = env::var("OUT_DIR").unwrap();

    // Write frequency-ordered kanji array to a rust source file, which
    // the crate pulls in via `include!`.
    {
        let dest_path = Path::new(&out_dir).join("kanji_freq_inc.rs");
        let mut f = File::create(&dest_path).unwrap();
        f.write_all("const KANJI_FREQ: &[char] = &[".as_bytes())
            .unwrap();
        for c in KANJI.chars() {
            // Skip the newlines (and any other whitespace) between kanji.
            if c.is_whitespace() {
                continue;
            }
            f.write_all(format!("\n'{}',", c).as_bytes()).unwrap();
        }
        f.write_all("\n];".as_bytes()).unwrap();
    }

    // Write compressed dictionary to .lz4 file.
    {
        // Read and decompress file from .xz.
        let dict_data = {
            let f = File::open("data/dictionary/system.dic.xz").unwrap();
            let mut data = Vec::new();
            lzma_rs::xz_decompress(&mut BufReader::new(f), &mut data).unwrap();
            data
        };

        // Recompress to .lz4.
        let dest_path = Path::new(&out_dir).join("system.dic.lz4");
        let f = File::create(dest_path).unwrap();
        let mut encoder = lz4_flex::frame::FrameEncoder::new(f);
        // `write_all()`, not `write()`: a bare `write()` is allowed to
        // write only part of the buffer, which would silently truncate
        // the embedded dictionary.
        encoder.write_all(&dict_data).unwrap();
        encoder.finish().unwrap();
    }
}

31
data/dictionary/BSD Normal file
View File

@ -0,0 +1,31 @@
Copyright (c) 2011-2021, The UniDic Consortium
Copyright (c) 2023, LegalOn Technologies, Inc.
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the
distribution.
* Neither the name of the UniDic Consortium nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

7
data/dictionary/NOTICE Normal file
View File

@ -0,0 +1,7 @@
This software includes a binary version of data from
https://clrd.ninjal.ac.jp/unidic_archive/cwj/3.1.1/unidic-cwj-3.1.1-full.zip
where the costs and connection ids are retrained using CORE data in BCCWJ (except the PN category)
https://clrd.ninjal.ac.jp/bccwj/.

Binary file not shown.

4001
data/kanji_frequency.txt Normal file

File diff suppressed because it is too large Load Diff

67
src/learner.rs Normal file
View File

@ -0,0 +1,67 @@
use std::collections::HashMap;
// Bounds (in words processed) on a word's "no help needed" window.
const MIN_MAX_DISTANCE: usize = 100;
const MAX_MAX_DISTANCE: usize = 10000;

// Per-word spaced-repetition bookkeeping.
#[derive(Debug, Copy, Clone)]
struct WordStats {
    // The last position (in words processed) that this word was seen at.
    last_seen_at: usize,

    // How many times this word has been seen so far.
    times_seen: usize,

    // Maximum distance before help is needed again.
    max_distance: usize,
}

/// Tracks how recently and how often each word has been seen, so that
/// help (furigana) can be dropped for words the reader has "learned".
pub struct Learner {
    // Stats per distinct word surface form.
    stats: HashMap<String, WordStats>,
    // Total number of words recorded so far; used as a position counter.
    words_processed: usize,
    // A word must be seen more than this many times before its help
    // window starts growing.
    times_seen_threshold: usize,
}

impl Learner {
    /// Creates a new learner.  Pass `usize::MAX` as the threshold to
    /// effectively disable learning (help is always needed).
    pub fn new(times_seen_threshold: usize) -> Self {
        Self {
            stats: HashMap::new(),
            words_processed: 0,
            times_seen_threshold,
        }
    }

    /// Records a sighting of `word` and advances the word counter.
    pub fn record(&mut self, word: &str) {
        // Hoisted locals so the `and_modify` closure doesn't need to
        // capture fields of `self` while `self.stats` is borrowed.
        let now = self.words_processed;
        let threshold = self.times_seen_threshold;

        self.stats
            .entry(word.to_string())
            .and_modify(|stats| {
                let distance = now - stats.last_seen_at;
                stats.last_seen_at = now;
                stats.times_seen += 1;

                // Don't grow the window until the word has been seen
                // more than the threshold number of times.
                if stats.times_seen <= threshold {
                    return;
                }

                // Seen again within the window: grow the window by the
                // distance, but by at most 50% per sighting, capped.
                if distance < stats.max_distance {
                    stats.max_distance += distance.min((stats.max_distance as f64 * 0.5) as usize);
                }
                stats.max_distance = stats.max_distance.min(MAX_MAX_DISTANCE);
            })
            .or_insert(WordStats {
                last_seen_at: now,
                times_seen: 1,
                max_distance: MIN_MAX_DISTANCE,
            });

        self.words_processed += 1;
    }

    /// Returns true if the reader still needs help (e.g. furigana) for
    /// `word`: it's unknown, under the seen-count threshold, or hasn't
    /// been seen within its current window.
    pub fn needs_help(&self, word: &str) -> bool {
        if let Some(stats) = self.stats.get(word) {
            let distance = self.words_processed - stats.last_seen_at;
            stats.times_seen <= self.times_seen_threshold || distance > stats.max_distance
        } else {
            true
        }
    }
}

516
src/lib.rs Normal file
View File

@ -0,0 +1,516 @@
mod learner;
use std::{
collections::HashSet,
// fs::File,
io::{Cursor, Read},
};
use lz4_flex::frame::FrameDecoder;
use quick_xml::events::Event;
use vibrato::{Dictionary, Tokenizer};
use learner::Learner;
// Include KANJI_FREQ, a frequency-ordered array of kanji characters.
include!(concat!(env!("OUT_DIR"), "/kanji_freq_inc.rs"));
const DICT: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/system.dic.lz4"));
/// Generates HTML ruby furigana for Japanese text, optionally reducing
/// the amount of furigana over the course of a text as words repeat.
pub struct FuriganaGenerator {
    // Vibrato tokenizer built from the embedded, lz4-compressed dictionary.
    tokenizer: Tokenizer,
    // Kanji that never get furigana (built from the N most frequent kanji;
    // see `FuriganaGenerator::new`).
    exclude_kanji: HashSet<char>,
    // Tracks word sightings to decide when furigana is no longer needed.
    learner: Learner,
}
impl FuriganaGenerator {
    // `exclude_count`: exclude the N most frequent kanji from furigana.
    // Specifically, words made up *entirely* of those kanji will be excluded.
    // If a word has some kanji that aren't in that set, even if it also has
    // some that are, it will still get furigana.
    //
    // `learn_mode`: when true, furigana is progressively dropped for words
    // that repeat; when false, every word always gets furigana.
    pub fn new(exclude_count: usize, learn_mode: bool) -> Self {
        let dict = {
            // Note: we could just pass the decoder straight to `Dictionary::read()`
            // below, and it would work. However, that ends up being slower than
            // first decompressing the whole thing ahead of time.
            let mut decoder = FrameDecoder::new(Cursor::new(DICT));
            let mut data = Vec::new();
            decoder.read_to_end(&mut data).unwrap();
            Dictionary::read(Cursor::new(&data)).unwrap()
        };

        // The `exclude_count` most frequent kanji.
        let exclude_kanji: HashSet<char> =
            KANJI_FREQ.iter().take(exclude_count).copied().collect();

        Self {
            tokenizer: Tokenizer::new(dict),
            exclude_kanji,
            // `usize::MAX` as the threshold effectively disables learning.
            learner: Learner::new(if learn_mode { 5 } else { usize::MAX }),
        }
    }

    /// Adds furigana (HTML ruby tags) to `text`, skipping any text that is
    /// already inside ruby tags.
    pub fn add_html_furigana(&mut self, text: &str) -> String {
        add_html_furigana_skip_already_ruby(
            text,
            &self.tokenizer,
            &self.exclude_kanji,
            &mut self.learner,
        )
    }
}
/// Views any byte container (anything that derefs to `[u8]`) as UTF-8,
/// panicking if the bytes aren't valid UTF-8.
fn to_str<B: std::ops::Deref<Target = [u8]>>(bytes: &B) -> &str {
    // `&B` deref-coerces to `&[u8]` here.
    std::str::from_utf8(bytes).unwrap()
}
/// Like `add_html_furigana()`, but skips text that already has ruby on it, so it doesn't get double-ruby.
///
/// Parses `text` as XML/HTML-ish markup, re-emitting all markup verbatim
/// and only adding furigana to text nodes that are outside every
/// `<ruby>` element.
///
/// Panics on malformed markup that quick-xml reports as an error.
fn add_html_furigana_skip_already_ruby(
    text: &str,
    tokenizer: &Tokenizer,
    exclude_kanji: &HashSet<char>,
    learner: &mut Learner,
) -> String {
    let mut reader = quick_xml::Reader::from_str(text);
    let mut new_text = String::new();
    // Nesting depth of currently-open <ruby> tags.  Text is only
    // processed when this is zero (or below, for unbalanced input).
    let mut rubys: i32 = 0;
    loop {
        match reader.read_event() {
            Err(e) => panic!("Error at position {}: {:?}", reader.error_position(), e),
            Ok(Event::Eof) => break,
            Ok(Event::Start(e)) => {
                if e.name().into_inner() == b"ruby" {
                    rubys += 1;
                }
                write_xml(&mut new_text, &Event::Start(e));
            }
            Ok(Event::End(e)) => {
                if e.name().into_inner() == b"ruby" {
                    rubys -= 1;
                }
                write_xml(&mut new_text, &Event::End(e));
            }
            Ok(Event::Text(e)) => {
                // Only add furigana to text outside all ruby elements.
                if rubys <= 0 {
                    new_text.push_str(&add_html_furigana(
                        to_str(&e),
                        tokenizer,
                        exclude_kanji,
                        learner,
                    ));
                } else {
                    write_xml(&mut new_text, &Event::Text(e));
                }
            }
            // All other events, just re-write them verbatim.
            Ok(e) => write_xml(&mut new_text, &e),
        }
    }
    new_text
}
/// Takes an xml event and writes it verbatim to the given string.
///
/// NOTE: really what we want is for the events to provide their byte index range
/// in the original text, so we could just write that, and even double-check that
/// we're not missing anything. But for some reason quick_xml doesn't provide
/// that information.
fn write_xml(text: &mut String, event: &quick_xml::events::Event) {
    // Appends `open`, the event's payload, then `close`.
    fn wrap(text: &mut String, open: &str, body: &str, close: &str) {
        text.push_str(open);
        text.push_str(body);
        text.push_str(close);
    }

    match event {
        Event::Start(e) => wrap(text, "<", to_str(e), ">"),
        Event::End(e) => wrap(text, "</", to_str(e), ">"),
        Event::Empty(e) => wrap(text, "<", to_str(e), "/>"),
        Event::CData(e) => wrap(text, "<![CDATA[", to_str(e), "]]>"),
        Event::Comment(e) => wrap(text, "<!--", to_str(e), "-->"),
        Event::Decl(e) => wrap(text, "<?", to_str(e), "?>"),
        Event::PI(e) => wrap(text, "<?", to_str(e), "?>"),
        Event::DocType(e) => wrap(text, "<!DOCTYPE", to_str(e), ">"),
        Event::Text(e) => text.push_str(to_str(e)),
        // `Eof` is handled by the caller and never reaches here.
        _ => unreachable!(),
    }
}
/// Adds furigana to Japanese text, using html ruby tags.
///
/// Words the `learner` considers already learned are passed through
/// without furigana; everything the learner sees is also recorded.
fn add_html_furigana(
    text: &str,
    tokenizer: &Tokenizer,
    exclude_kanji: &HashSet<char>,
    learner: &mut Learner,
) -> String {
    let mut worker = tokenizer.new_worker();

    worker.reset_sentence(text);
    worker.tokenize();

    let mut new_text = String::new();
    for i in 0..worker.num_tokens() {
        let t = worker.token(i);
        let surface = t.surface();

        // Ask *before* recording, so the current sighting doesn't count
        // toward itself.
        let needs_help = learner.needs_help(surface);
        learner.record(surface);

        if !needs_help {
            new_text.push_str(surface);
            continue;
        }

        // The reading is the second comma-separated field of the token's
        // feature string.  If it's missing, emit the surface untouched
        // rather than panicking on malformed/unknown dictionary entries.
        let kana = match t.feature().split(',').nth(1) {
            Some(kana) => kana,
            None => {
                new_text.push_str(surface);
                continue;
            }
        };

        let furigana_text = apply_furigana(surface, kana, exclude_kanji);

        for (surf, furi) in furigana_text.iter() {
            if furi.is_empty() {
                new_text.push_str(surf);
                continue;
            }

            new_text.push_str("<ruby>");
            new_text.push_str(surf);
            new_text.push_str("<rt>");
            new_text.push_str(furi);
            new_text.push_str("</rt></ruby>");
        }
    }

    new_text
}
/// Returns a segmented list of (surface, furigana) pairs.
///
/// The furigana component of a pair may be empty, indicating no
/// furigana is needed for that surface element.
fn apply_furigana<'a>(
    surface: &'a str,
    kana: &'a str,
    exclude_kanji: &HashSet<char>,
) -> Vec<(&'a str, &'a str)> {
    let mut out = Vec::new();

    if furigana_unneeded(surface, exclude_kanji) {
        out.push((surface, ""));
        return out;
    }

    let mut surface = surface;
    let mut kana = kana;

    // Trim any kana from the start, pushed as a no-furigana segment.
    {
        let mut start_s = 0;
        let mut start_k = 0;
        for (sc, kc) in surface.chars().zip(kana.chars()) {
            if is_equivalent_kana(sc, kc) {
                start_s += sc.len_utf8();
                start_k += kc.len_utf8();
            } else {
                break;
            }
        }
        out.push((&surface[..start_s], ""));
        surface = &surface[start_s..];
        kana = &kana[start_k..];
    }

    // Trim any kana from the end, pushed as a no-furigana segment.
    //
    // From here on `out` is `[start_segment, end_segment]`, and all
    // further segments are inserted at `out.len() - 1`, i.e. just before
    // the end segment, preserving left-to-right order.
    {
        let mut end_s = surface.len();
        let mut end_k = kana.len();
        for (sc, kc) in surface.chars().rev().zip(kana.chars().rev()) {
            if is_equivalent_kana(sc, kc) {
                end_s -= sc.len_utf8();
                end_k -= kc.len_utf8();
            } else {
                break;
            }
        }
        out.push((&surface[end_s..], ""));
        surface = &surface[..end_s];
        kana = &kana[..end_k];
    }

    // Try to uniquely match kana in the middle.
    //
    // This is just best-effort, and bails in any non-trivial cases.
    while let Some((si, sc)) = surface.char_indices().find(|(_, c)| is_kana(*c)) {
        // If there's more than one match, bail.
        let equivalent_kana_count = kana
            .chars()
            .map(|c| is_equivalent_kana(c, sc))
            .fold(0usize, |count, hit| count + hit as usize);
        if equivalent_kana_count != 1 {
            break;
        }

        // Find the one match.
        let (ki, kc) = kana
            .char_indices()
            .find(|(_, c)| is_equivalent_kana(sc, *c))
            .unwrap();

        // Insert the segments just before the end segment.
        //
        // BUGFIX: this previously inserted at `out.len() - 2`, which put
        // the new segments *before* the start segment as well, scrambling
        // the output whenever the surface began with kana (e.g.
        // "お気に入り" came out as 気/に/入/お/り).
        out.insert(out.len() - 1, (&surface[..si], &kana[..ki]));
        out.insert(out.len() - 1, (&surface[si..(si + sc.len_utf8())], ""));
        surface = &surface[(si + sc.len_utf8())..];
        kana = &kana[(ki + kc.len_utf8())..];
    }

    // Left over.
    out.insert(out.len() - 1, (surface, kana));

    out.iter().filter(|(s, _)| !s.is_empty()).copied().collect()
}

/// Due to the way this is used, this isn't meant to be exact, but instead
/// liberal in what it considers equivalent.
fn is_equivalent_kana(a: char, b: char) -> bool {
    // Particle spellings / sound shifts treated as the same reading.
    const PAIRS: &[[char; 2]] = &[['は', 'わ'], ['を', 'お'], ['づ', 'ず'], ['へ', 'え']];
    const VOWELS: &[char] = &['あ', 'い', 'う', 'え', 'お', 'ぁ', 'ぃ', 'ぅ', 'ぇ', 'ぉ'];

    // Normalize katakana to hiragana; non-kana are never equivalent.
    let (a, b) = match (normalize_kana(a), normalize_kana(b)) {
        (Some(a), Some(b)) => (a, b),
        _ => return false,
    };

    if a == b {
        return true;
    }

    // The long-vowel mark matches any vowel.
    if (a == 'ー' && VOWELS.contains(&b)) || (b == 'ー' && VOWELS.contains(&a)) {
        return true;
    }

    PAIRS
        .iter()
        .any(|&[c, d]| (a == c && b == d) || (a == d && b == c))
}

// Start of the hiragana and katakana Unicode blocks; both blocks are
// laid out in parallel, `KANA_COUNT` entries each.
const HIRAGANA: u32 = 0x3041;
const KATAKANA: u32 = 0x30A1;
const KANA_COUNT: u32 = 0x3097 - HIRAGANA;

/// Returns true if `c` is hiragana, katakana, or the long-vowel mark 'ー'.
pub fn is_kana(c: char) -> bool {
    let c = c as u32;
    c == 'ー' as u32
        || (c >= HIRAGANA && c < (HIRAGANA + KANA_COUNT))
        || (c >= KATAKANA && c < (KATAKANA + KANA_COUNT))
}

/// Normalizes katakana to its hiragana equivalent; returns `None` for
/// non-kana.  'ー' is kana but has no hiragana form, so it passes through.
pub fn normalize_kana(c: char) -> Option<char> {
    if !is_kana(c) {
        return None;
    }
    Some(katakana_to_hiragana(c).unwrap_or(c))
}

/// Returns true if furigana definitely isn't needed.
pub fn furigana_unneeded(text: &str, exclude_kanji: &HashSet<char>) -> bool {
    text.chars().all(|c| {
        is_kana(c) || c.is_ascii() || c.is_numeric() || c == '々' || exclude_kanji.contains(&c)
    })
}

/// Converts one hiragana char to katakana, or `None` if not hiragana.
pub fn hiragana_to_katakana(c: char) -> Option<char> {
    let c = c as u32;
    if c >= HIRAGANA && c < (HIRAGANA + KANA_COUNT) {
        char::try_from(c + KATAKANA - HIRAGANA).ok()
    } else {
        None
    }
}

/// Converts one katakana char to hiragana, or `None` if not katakana.
pub fn katakana_to_hiragana(c: char) -> Option<char> {
    let c = c as u32;
    if c >= KATAKANA && c < (KATAKANA + KANA_COUNT) {
        char::try_from(c - KATAKANA + HIRAGANA).ok()
    } else {
        None
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    // Segmentation tests: each feeds a (surface, reading) pair through
    // `apply_furigana()` and checks the resulting (surface, furigana)
    // pairs.
    //
    // NOTE(review): several expected literals below are empty strings
    // (e.g. in 04 and 05) where the surrounding data suggests kanji/kana
    // content — possibly lost in a copy/paste or encoding step.  Verify
    // against the intended test data.
    #[test]
    fn apply_furigana_01() {
        let surface = "へぇ";
        let kana = "ヘー";
        let pairs = apply_furigana(surface, kana, &HashSet::new());
        assert_eq!(&[("へぇ", "")], &pairs[..]);
    }
    #[test]
    fn apply_furigana_02() {
        let surface = "へぇー";
        let kana = "ヘー";
        let pairs = apply_furigana(surface, kana, &HashSet::new());
        assert_eq!(&[("へぇー", "")], &pairs[..]);
    }
    #[test]
    fn apply_furigana_03() {
        // Empty input yields a single empty no-furigana segment.
        let surface = "";
        let kana = "";
        let pairs = apply_furigana(surface, kana, &HashSet::new());
        assert_eq!(&[("", "")], &pairs[..]);
    }
    #[test]
    fn apply_furigana_04() {
        let surface = "食べる";
        let kana = "タベル";
        let pairs = apply_furigana(surface, kana, &HashSet::new());
        assert_eq!(&[("", ""), ("べる", "")], &pairs[..]);
    }
    #[test]
    fn apply_furigana_05() {
        let surface = "流れ出す";
        let kana = "ながれだす";
        let pairs = apply_furigana(surface, kana, &HashSet::new());
        assert_eq!(
            &[("", "なが"), ("", ""), ("", ""), ("", "")],
            &pairs[..]
        );
    }
    #[test]
    fn apply_furigana_06() {
        // Two reading-kana の in a row: matching is ambiguous, so the
        // word is kept as one segment.
        let surface = "物の怪";
        let kana = "もののけ";
        let pairs = apply_furigana(surface, kana, &HashSet::new());
        assert_eq!(&[("物の怪", "もののけ")], &pairs[..]);
    }
    #[test]
    fn is_equivalent_kana_01() {
        assert!(is_equivalent_kana('か', 'カ'));
        assert!(is_equivalent_kana('カ', 'か'));
        assert!(is_equivalent_kana('ぁ', 'ァ'));
        assert!(is_equivalent_kana('ァ', 'ぁ'));
        assert!(is_equivalent_kana('は', 'わ'));
        assert!(is_equivalent_kana('わ', 'は'));
        assert!(is_equivalent_kana('を', 'お'));
        assert!(is_equivalent_kana('お', 'を'));
        assert!(is_equivalent_kana('づ', 'ず'));
        assert!(is_equivalent_kana('ず', 'づ'));
        assert!(is_equivalent_kana('ー', 'あ'));
        assert!(is_equivalent_kana('あ', 'ー'));
        assert!(is_equivalent_kana('ー', 'ぁ'));
        assert!(is_equivalent_kana('ぁ', 'ー'));
        assert!(!is_equivalent_kana('は', 'ば'));
        assert!(!is_equivalent_kana('ー', 'か'));
        assert!(!is_equivalent_kana('た', '食'));
    }
    #[test]
    fn tokenize_01() {
        // Exercises the embedded dictionary via the raw tokenizer.
        let gen = FuriganaGenerator::new(0, false);
        let mut worker = gen.tokenizer.new_worker();
        worker.reset_sentence("食べている");
        worker.tokenize();
        assert_eq!(3, worker.num_tokens());
        assert_eq!("食べ", worker.token(0).surface());
        assert_eq!("動詞-一般,タベ", worker.token(0).feature());
        assert_eq!("", worker.token(1).surface());
        assert_eq!("助詞-接続助詞,テ", worker.token(1).feature());
        assert_eq!("いる", worker.token(2).surface());
        assert_eq!("動詞-非自立可能,イル", worker.token(2).feature());
    }
    #[test]
    fn add_html_furigana_01() {
        // End-to-end: existing <ruby> content must be left untouched.
        let mut gen = FuriganaGenerator::new(0, false);
        let text = gen
            .add_html_furigana(r#"<sup class="食う">食べる</sup>のは<ruby>良</ruby>いね!<hi />"#);
        assert_eq!(
            text,
            r#"<sup class="食う"><ruby>食<rt>タ</rt></ruby>べる</sup>のは<ruby>良</ruby>いね!<hi />"#
        );
    }
}