Make it use HTML as both input and output.
It now uses the <ruby> tag to specify furigana, and also skips text that already has ruby tags around it.
This commit is contained in:
parent
87085c92ec
commit
1d64afe430
2
Cargo.lock
generated
2
Cargo.lock
generated
|
@ -182,6 +182,8 @@ dependencies = [
|
|||
name = "furigana_gen"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"once_cell",
|
||||
"regex",
|
||||
"ruzstd",
|
||||
"vibrato",
|
||||
]
|
||||
|
|
|
@ -8,3 +8,5 @@ edition = "2021"
|
|||
[dependencies]
|
||||
vibrato = "0.5"
|
||||
ruzstd = "0.7"
|
||||
regex = "1.10"
|
||||
once_cell = "1.19"
|
||||
|
|
35
src/main.rs
35
src/main.rs
|
@ -3,13 +3,14 @@ use std::{
|
|||
io::{Cursor, Read},
|
||||
};
|
||||
|
||||
use once_cell::sync::Lazy;
|
||||
use regex::Regex;
|
||||
use ruzstd::StreamingDecoder;
|
||||
use vibrato::{Dictionary, Tokenizer};
|
||||
|
||||
const DICT: &[u8] = include_bytes!("../dictionary/system.dic.zst");
|
||||
|
||||
fn main() {
|
||||
// Loads a compiled dictionary
|
||||
let dict = {
|
||||
let decoder = StreamingDecoder::new(Cursor::new(DICT)).unwrap();
|
||||
Dictionary::read(decoder).unwrap()
|
||||
|
@ -25,11 +26,33 @@ fn main() {
|
|||
};
|
||||
|
||||
let tokenizer = Tokenizer::new(dict);
|
||||
println!("{}", add_furigana(&text, &tokenizer));
|
||||
print!("{}", add_html_furigana_skip_already_ruby(&text, &tokenizer));
|
||||
}
|
||||
|
||||
fn add_furigana(text: &str, tokenizer: &Tokenizer) -> String {
|
||||
/// Like `add_html_furigana()`, but skips text that already has ruby on it, so it doesn't get double-ruby.
|
||||
fn add_html_furigana_skip_already_ruby(text: &str, tokenizer: &Tokenizer) -> String {
|
||||
static ALREADY_RUBY: Lazy<Regex> = Lazy::new(|| Regex::new(r"<ruby.*?>.*?</ruby>").unwrap());
|
||||
|
||||
let mut new_text = String::new();
|
||||
let mut last_byte_index = 0;
|
||||
for hit in ALREADY_RUBY.find_iter(text) {
|
||||
new_text.push_str(&add_html_furigana(
|
||||
&text[last_byte_index..hit.start()],
|
||||
tokenizer,
|
||||
));
|
||||
new_text.push_str(hit.as_str());
|
||||
last_byte_index = hit.end();
|
||||
}
|
||||
|
||||
new_text.push_str(&add_html_furigana(&text[last_byte_index..], tokenizer));
|
||||
|
||||
new_text
|
||||
}
|
||||
|
||||
/// Adds furigana to Japanese text, using HTML ruby tags.
|
||||
fn add_html_furigana(text: &str, tokenizer: &Tokenizer) -> String {
|
||||
let mut worker = tokenizer.new_worker();
|
||||
|
||||
worker.reset_sentence(text);
|
||||
worker.tokenize();
|
||||
|
||||
|
@ -55,11 +78,11 @@ fn add_furigana(text: &str, tokenizer: &Tokenizer) -> String {
|
|||
let mid_kana = &kana[start_bytes..(kana.len() - end_bytes)];
|
||||
let end = &surface[(surface.len() - end_bytes)..];
|
||||
new_text.push_str(start);
|
||||
new_text.push_str(" ");
|
||||
new_text.push_str("<ruby>");
|
||||
new_text.push_str(mid);
|
||||
new_text.push_str("[");
|
||||
new_text.push_str("<rt>");
|
||||
new_text.push_str(mid_kana);
|
||||
new_text.push_str("]");
|
||||
new_text.push_str("</rt></ruby>");
|
||||
new_text.push_str(end);
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue
Block a user