Make it use html as both input and output.

It now uses the <ruby> tag to specify furigana, and also skips text that
already has ruby tags around it.
This commit is contained in:
Nathan Vegdahl 2024-09-01 08:27:10 +02:00
parent 87085c92ec
commit 1d64afe430
3 changed files with 33 additions and 6 deletions

2
Cargo.lock generated
View File

@ -182,6 +182,8 @@ dependencies = [
name = "furigana_gen" name = "furigana_gen"
version = "0.1.0" version = "0.1.0"
dependencies = [ dependencies = [
"once_cell",
"regex",
"ruzstd", "ruzstd",
"vibrato", "vibrato",
] ]

View File

@ -8,3 +8,5 @@ edition = "2021"
[dependencies] [dependencies]
vibrato = "0.5" vibrato = "0.5"
ruzstd = "0.7" ruzstd = "0.7"
regex = "1.10"
once_cell = "1.19"

View File

@ -3,13 +3,14 @@ use std::{
io::{Cursor, Read}, io::{Cursor, Read},
}; };
use once_cell::sync::Lazy;
use regex::Regex;
use ruzstd::StreamingDecoder; use ruzstd::StreamingDecoder;
use vibrato::{Dictionary, Tokenizer}; use vibrato::{Dictionary, Tokenizer};
const DICT: &[u8] = include_bytes!("../dictionary/system.dic.zst"); const DICT: &[u8] = include_bytes!("../dictionary/system.dic.zst");
fn main() { fn main() {
// Loads a compiled dictionary
let dict = { let dict = {
let decoder = StreamingDecoder::new(Cursor::new(DICT)).unwrap(); let decoder = StreamingDecoder::new(Cursor::new(DICT)).unwrap();
Dictionary::read(decoder).unwrap() Dictionary::read(decoder).unwrap()
@ -25,11 +26,33 @@ fn main() {
}; };
let tokenizer = Tokenizer::new(dict); let tokenizer = Tokenizer::new(dict);
println!("{}", add_furigana(&text, &tokenizer)); print!("{}", add_html_furigana_skip_already_ruby(&text, &tokenizer));
} }
fn add_furigana(text: &str, tokenizer: &Tokenizer) -> String { /// Like `add_html_furigana()`, but skips text that already has ruby on it, so it doesn't get double-ruby.
/// Text outside existing `<ruby>...</ruby>` elements is run through
/// `add_html_furigana()`; the existing elements themselves are copied to the
/// output verbatim. Returns the re-assembled text.
fn add_html_furigana_skip_already_ruby(text: &str, tokenizer: &Tokenizer) -> String {
    // `(?s)` lets `.` match newlines, so a ruby element that is wrapped across
    // several lines is still recognised and skipped instead of being annotated
    // a second time. `\b[^>]*` restricts the opening tag to `<ruby>` or
    // `<ruby attr...>` and stops it from running past its own closing `>`.
    static ALREADY_RUBY: Lazy<Regex> =
        Lazy::new(|| Regex::new(r"(?s)<ruby\b[^>]*>.*?</ruby>").unwrap());

    // Reserve the input length as a starting capacity to limit regrowth.
    let mut new_text = String::with_capacity(text.len());
    // Byte offset just past the previously matched ruby element.
    let mut last_byte_index = 0;

    for hit in ALREADY_RUBY.find_iter(text) {
        // Annotate the plain text between the previous ruby element and this one.
        new_text.push_str(&add_html_furigana(
            &text[last_byte_index..hit.start()],
            tokenizer,
        ));
        // Pass the existing ruby element through untouched.
        new_text.push_str(hit.as_str());
        last_byte_index = hit.end();
    }

    // Annotate whatever follows the last ruby element (the whole input when
    // nothing matched).
    new_text.push_str(&add_html_furigana(&text[last_byte_index..], tokenizer));
    new_text
}
/// Adds furigana to Japanese text, using html ruby tags.
fn add_html_furigana(text: &str, tokenizer: &Tokenizer) -> String {
let mut worker = tokenizer.new_worker(); let mut worker = tokenizer.new_worker();
worker.reset_sentence(text); worker.reset_sentence(text);
worker.tokenize(); worker.tokenize();
@ -55,11 +78,11 @@ fn add_furigana(text: &str, tokenizer: &Tokenizer) -> String {
let mid_kana = &kana[start_bytes..(kana.len() - end_bytes)]; let mid_kana = &kana[start_bytes..(kana.len() - end_bytes)];
let end = &surface[(surface.len() - end_bytes)..]; let end = &surface[(surface.len() - end_bytes)..];
new_text.push_str(start); new_text.push_str(start);
new_text.push_str(" "); new_text.push_str("<ruby>");
new_text.push_str(mid); new_text.push_str(mid);
new_text.push_str("["); new_text.push_str("<rt>");
new_text.push_str(mid_kana); new_text.push_str(mid_kana);
new_text.push_str("]"); new_text.push_str("</rt></ruby>");
new_text.push_str(end); new_text.push_str(end);
} }
} }