Make it use html as both input and output.

It now uses the <ruby> tag to specify furigana, and also skips text that already has ruby tags around it.
2024-09-01 08:27:10 +02:00 · 2024-09-01 08:27:10 +02:00 · 1d64afe430
commit 1d64afe430
parent 87085c92ec
3 changed files with 33 additions and 6 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -182,6 +182,8 @@ dependencies = [
 name = "furigana_gen"
 version = "0.1.0"
 dependencies = [
+ "once_cell",
+ "regex",
 "ruzstd",
 "vibrato",
 ]
--- a/Cargo.toml
+++ b/Cargo.toml
@ -8,3 +8,5 @@ edition = "2021"
 [dependencies]
 vibrato = "0.5"
 ruzstd = "0.7"
+regex = "1.10"
+once_cell = "1.19"
--- a/src/main.rs
+++ b/src/main.rs
@ -3,13 +3,14 @@ use std::{
    io::{Cursor, Read},
 };

+use once_cell::sync::Lazy;
+use regex::Regex;
 use ruzstd::StreamingDecoder;
 use vibrato::{Dictionary, Tokenizer};

 const DICT: &[u8] = include_bytes!("../dictionary/system.dic.zst");

 fn main() {
-    // Loads a compiled dictionary
    let dict = {
        let decoder = StreamingDecoder::new(Cursor::new(DICT)).unwrap();
        Dictionary::read(decoder).unwrap()
@ -25,11 +26,33 @@ fn main() {
    };

    let tokenizer = Tokenizer::new(dict);
-    println!("{}", add_furigana(&text, &tokenizer));
+    print!("{}", add_html_furigana_skip_already_ruby(&text, &tokenizer));
 }

-fn add_furigana(text: &str, tokenizer: &Tokenizer) -> String {
+/// Like `add_html_furigana()`, but skips text that already has ruby on it, so it doesn't get
+/// double-ruby.
+///
+/// Every `<ruby ...>...</ruby>` element found in `text` is copied to the output verbatim; the
+/// text before, between, and after those elements is passed through `add_html_furigana()`.
+fn add_html_furigana_skip_already_ruby(text: &str, tokenizer: &Tokenizer) -> String {
+    // `(?s)` makes `.` match newlines as well, so ruby elements that span more than one line
+    // are still recognized and skipped instead of being annotated a second time.
+    static ALREADY_RUBY: Lazy<Regex> =
+        Lazy::new(|| Regex::new(r"(?s)<ruby.*?>.*?</ruby>").unwrap());
+
+    // The output is at least roughly as long as the input, so reserve that much up front.
+    let mut new_text = String::with_capacity(text.len());
+    let mut last_byte_index = 0;
+    for hit in ALREADY_RUBY.find_iter(text) {
+        // Annotate the text preceding this ruby element, then copy the element unchanged.
+        new_text.push_str(&add_html_furigana(
+            &text[last_byte_index..hit.start()],
+            tokenizer,
+        ));
+        new_text.push_str(hit.as_str());
+        last_byte_index = hit.end();
+    }
+
+    // Annotate whatever follows the last ruby element (the whole text if there were none).
+    new_text.push_str(&add_html_furigana(&text[last_byte_index..], tokenizer));
+
+    new_text
+}
+
+/// Adds furigana to Japanese text, using html ruby tags.
+fn add_html_furigana(text: &str, tokenizer: &Tokenizer) -> String {
    let mut worker = tokenizer.new_worker();
+
    worker.reset_sentence(text);
    worker.tokenize();

@ -55,11 +78,11 @@ fn add_furigana(text: &str, tokenizer: &Tokenizer) -> String {
            let mid_kana = &kana[start_bytes..(kana.len() - end_bytes)];
            let end = &surface[(surface.len() - end_bytes)..];
            new_text.push_str(start);
-            new_text.push_str(" ");
+            new_text.push_str("<ruby>");
            new_text.push_str(mid);
-            new_text.push_str("[");
+            new_text.push_str("<rt>");
            new_text.push_str(mid_kana);
-            new_text.push_str("]");
+            new_text.push_str("</rt></ruby>");
            new_text.push_str(end);
        }
    }