From 1d64afe4306a81552a5cc1ba4c34d71c868b2606 Mon Sep 17 00:00:00 2001 From: Nathan Vegdahl Date: Sun, 1 Sep 2024 08:27:10 +0200 Subject: [PATCH] Make it use html as both input and output. It now uses the <ruby> tag to specify furigana, and also skips text that already has ruby tags around it. --- Cargo.lock | 2 ++ Cargo.toml | 2 ++ src/main.rs | 35 +++++++++++++++++++++++++++++------ 3 files changed, 33 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 03d6274..e57cd2d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -182,6 +182,8 @@ dependencies = [ name = "furigana_gen" version = "0.1.0" dependencies = [ + "once_cell", + "regex", "ruzstd", "vibrato", ] diff --git a/Cargo.toml b/Cargo.toml index f61fbb8..8db4433 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,3 +8,5 @@ edition = "2021" [dependencies] vibrato = "0.5" ruzstd = "0.7" +regex = "1.10" +once_cell = "1.19" diff --git a/src/main.rs b/src/main.rs index f191a97..5e7bbbe 100644 --- a/src/main.rs +++ b/src/main.rs @@ -3,13 +3,14 @@ use std::{ io::{Cursor, Read}, }; +use once_cell::sync::Lazy; +use regex::Regex; use ruzstd::StreamingDecoder; use vibrato::{Dictionary, Tokenizer}; const DICT: &[u8] = include_bytes!("../dictionary/system.dic.zst"); fn main() { - // Loads a compiled dictionary let dict = { let decoder = StreamingDecoder::new(Cursor::new(DICT)).unwrap(); Dictionary::read(decoder).unwrap() @@ -25,11 +26,33 @@ fn main() { }; let tokenizer = Tokenizer::new(dict); - println!("{}", add_furigana(&text, &tokenizer)); + print!("{}", add_html_furigana_skip_already_ruby(&text, &tokenizer)); } -fn add_furigana(text: &str, tokenizer: &Tokenizer) -> String { +/// Like `add_html_furigana()`, but skips text that already has ruby on it, so it doesn't get double-ruby. 
+fn add_html_furigana_skip_already_ruby(text: &str, tokenizer: &Tokenizer) -> String { + static ALREADY_RUBY: Lazy<Regex> = Lazy::new(|| Regex::new(r"<ruby>.*?</ruby>").unwrap()); + + let mut new_text = String::new(); + let mut last_byte_index = 0; + for hit in ALREADY_RUBY.find_iter(text) { + new_text.push_str(&add_html_furigana( + &text[last_byte_index..hit.start()], + tokenizer, + )); + new_text.push_str(hit.as_str()); + last_byte_index = hit.end(); + } + + new_text.push_str(&add_html_furigana(&text[last_byte_index..], tokenizer)); + + new_text +} + +/// Adds furigana to Japanese text, using html ruby tags. +fn add_html_furigana(text: &str, tokenizer: &Tokenizer) -> String { let mut worker = tokenizer.new_worker(); + worker.reset_sentence(text); worker.tokenize(); @@ -55,11 +78,11 @@ fn add_furigana(text: &str, tokenizer: &Tokenizer) -> String { let mid_kana = &kana[start_bytes..(kana.len() - end_bytes)]; let end = &surface[(surface.len() - end_bytes)..]; new_text.push_str(start); - new_text.push_str(" "); + new_text.push_str("<ruby>"); new_text.push_str(mid); - new_text.push_str("["); + new_text.push_str("<rt>"); new_text.push_str(mid_kana); - new_text.push_str("]"); + new_text.push_str("</rt></ruby>"); new_text.push_str(end); } }