From 1d64afe4306a81552a5cc1ba4c34d71c868b2606 Mon Sep 17 00:00:00 2001
From: Nathan Vegdahl <cessen@cessen.com>
Date: Sun, 1 Sep 2024 08:27:10 +0200
Subject: [PATCH] Make it use html as both input and output.

It now uses the <ruby> tag to specify furigana, and also skips text that
already has ruby tags around it.
---
 Cargo.lock  |  2 ++
 Cargo.toml  |  2 ++
 src/main.rs | 35 +++++++++++++++++++++++++++++------
 3 files changed, 33 insertions(+), 6 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
index 03d6274..e57cd2d 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -182,6 +182,8 @@ dependencies = [
 name = "furigana_gen"
 version = "0.1.0"
 dependencies = [
+ "once_cell",
+ "regex",
  "ruzstd",
  "vibrato",
 ]
diff --git a/Cargo.toml b/Cargo.toml
index f61fbb8..8db4433 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -8,3 +8,5 @@ edition = "2021"
 [dependencies]
 vibrato = "0.5"
 ruzstd = "0.7"
+regex = "1.10"
+once_cell = "1.19"
diff --git a/src/main.rs b/src/main.rs
index f191a97..5e7bbbe 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -3,13 +3,14 @@ use std::{
     io::{Cursor, Read},
 };
 
+use once_cell::sync::Lazy;
+use regex::Regex;
 use ruzstd::StreamingDecoder;
 use vibrato::{Dictionary, Tokenizer};
 
 const DICT: &[u8] = include_bytes!("../dictionary/system.dic.zst");
 
 fn main() {
-    // Loads a compiled dictionary
     let dict = {
         let decoder = StreamingDecoder::new(Cursor::new(DICT)).unwrap();
         Dictionary::read(decoder).unwrap()
@@ -25,11 +26,33 @@ fn main() {
     };
 
     let tokenizer = Tokenizer::new(dict);
-    println!("{}", add_furigana(&text, &tokenizer));
+    print!("{}", add_html_furigana_skip_already_ruby(&text, &tokenizer));
 }
 
-fn add_furigana(text: &str, tokenizer: &Tokenizer) -> String {
+/// Like `add_html_furigana()`, but copies existing `<ruby>...</ruby>` spans through unchanged, so they don't get double-ruby.
+fn add_html_furigana_skip_already_ruby(text: &str, tokenizer: &Tokenizer) -> String {
+    static ALREADY_RUBY: Lazy<Regex> = Lazy::new(|| Regex::new(r"<ruby.*?>.*?</ruby>").unwrap());
+
+    let mut new_text = String::new();
+    let mut last_byte_index = 0; // Byte offset in `text` just past the previous match.
+    for hit in ALREADY_RUBY.find_iter(text) {
+        new_text.push_str(&add_html_furigana(
+            &text[last_byte_index..hit.start()],
+            tokenizer,
+        ));
+        new_text.push_str(hit.as_str()); // Existing ruby markup is copied verbatim.
+        last_byte_index = hit.end();
+    }
+
+    new_text.push_str(&add_html_furigana(&text[last_byte_index..], tokenizer));
+
+    new_text
+}
+
+/// Adds furigana to Japanese text, using html ruby tags.
+fn add_html_furigana(text: &str, tokenizer: &Tokenizer) -> String {
     let mut worker = tokenizer.new_worker();
+
     worker.reset_sentence(text);
     worker.tokenize();
 
@@ -55,11 +78,11 @@ fn add_furigana(text: &str, tokenizer: &Tokenizer) -> String {
             let mid_kana = &kana[start_bytes..(kana.len() - end_bytes)];
             let end = &surface[(surface.len() - end_bytes)..];
             new_text.push_str(start);
-            new_text.push_str(" ");
+            new_text.push_str("<ruby>");
             new_text.push_str(mid);
-            new_text.push_str("[");
+            new_text.push_str("<rt>");
             new_text.push_str(mid_kana);
-            new_text.push_str("]");
+            new_text.push_str("</rt></ruby>");
             new_text.push_str(end);
         }
     }