Make it use html as both input and output.

It now uses the <ruby> tag to specify furigana, and also skips text that already has ruby tags around it.
2024-09-01 08:27:10 +02:00 · 2024-09-01 08:27:10 +02:00 · 1d64afe430
commit 1d64afe430
parent 87085c92ec
3 changed files with 33 additions and 6 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -182,6 +182,8 @@ dependencies = [
 name = "furigana_gen"
 version = "0.1.0"
 dependencies = [
 "once_cell",
 "regex",
 "ruzstd",
 "vibrato",
 ]
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -8,3 +8,5 @@ edition = "2021"
 [dependencies]
 vibrato = "0.5"
 ruzstd = "0.7"
 regex = "1.10"
 once_cell = "1.19"
--- a/src/main.rs
+++ b/src/main.rs
@@ -3,13 +3,14 @@ use std::{
    io::{Cursor, Read},
 };
 use once_cell::sync::Lazy;
 use regex::Regex;
 use ruzstd::StreamingDecoder;
 use vibrato::{Dictionary, Tokenizer};
 const DICT: &[u8] = include_bytes!("../dictionary/system.dic.zst");
 fn main() {
    // Loads a compiled dictionary
    let dict = {
        let decoder = StreamingDecoder::new(Cursor::new(DICT)).unwrap();
        Dictionary::read(decoder).unwrap()
@@ -25,11 +26,33 @@ fn main() {
    };
    let tokenizer = Tokenizer::new(dict);
-    println!("{}", add_furigana(&text, &tokenizer));
+    print!("{}", add_html_furigana_skip_already_ruby(&text, &tokenizer));
 }
-fn add_furigana(text: &str, tokenizer: &Tokenizer) -> String {
+/// Like `add_html_furigana()`, but skips text that already has ruby on it, so it doesn't get double-ruby.
 fn add_html_furigana_skip_already_ruby(text: &str, tokenizer: &Tokenizer) -> String {
    // Matches one complete, non-nested ruby element.
    //   (?s)  lets `.` cross newlines, so a ruby element that spans several
    //         lines is still recognised (otherwise its contents would be
    //         annotated a second time).
    //   (?i)  HTML tag names are case-insensitive (`<RUBY>` is valid).
    //   \b    stops the tag name at `ruby`, so unrelated tags that merely
    //         start with those letters are not treated as ruby.
    static ALREADY_RUBY: Lazy<Regex> = Lazy::new(|| {
        Regex::new(r"(?is)<ruby\b[^>]*>.*?</ruby>").expect("ruby pattern is a valid regex")
    });

    // The output is at least as long as the input, so reserve that much up front.
    let mut new_text = String::with_capacity(text.len());
    // Byte offset just past the last ruby element copied so far.
    let mut last_byte_index = 0;

    for hit in ALREADY_RUBY.find_iter(text) {
        // Annotate the plain text between the previous ruby element and this one...
        new_text.push_str(&add_html_furigana(
            &text[last_byte_index..hit.start()],
            tokenizer,
        ));
        // ...then copy the existing ruby element through verbatim.
        new_text.push_str(hit.as_str());
        last_byte_index = hit.end();
    }

    // Annotate whatever follows the final ruby element (or the whole input
    // when there were no ruby elements at all).
    new_text.push_str(&add_html_furigana(&text[last_byte_index..], tokenizer));

    new_text
 }
 /// Adds furigana to Japanese text, using html ruby tags.
 fn add_html_furigana(text: &str, tokenizer: &Tokenizer) -> String {
    let mut worker = tokenizer.new_worker();
    worker.reset_sentence(text);
    worker.tokenize();
@@ -55,11 +78,11 @@ fn add_furigana(text: &str, tokenizer: &Tokenizer) -> String {
            let mid_kana = &kana[start_bytes..(kana.len() - end_bytes)];
            let end = &surface[(surface.len() - end_bytes)..];
            new_text.push_str(start);
-            new_text.push_str(" ");
+            new_text.push_str("<ruby>");
            new_text.push_str(mid);
-            new_text.push_str("[");
+            new_text.push_str("<rt>");
            new_text.push_str(mid_kana);
-            new_text.push_str("]");
+            new_text.push_str("</rt></ruby>");
            new_text.push_str(end);
        }
    }