Make it use html as both input and output.
It now uses the <ruby> tag to specify furigana, and also skips text that already has ruby tags around it.
parent 87085c92ec
commit 1d64afe430
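For illustration (an assumed example, not output captured from this commit): feeding the tool a plain-text line such as 「日本語を勉強します」 should now produce HTML along the lines of

    <ruby>日本語<rt>にほんご</rt></ruby>を<ruby>勉強<rt>べんきょう</rt></ruby>します

with the exact segmentation and readings depending on the vibrato dictionary. Any span in the input that is already wrapped in <ruby>…</ruby> is copied through unchanged rather than being annotated a second time.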
Cargo.lock  (generated, 2 changes)

@@ -182,6 +182,8 @@ dependencies = [
 name = "furigana_gen"
 version = "0.1.0"
 dependencies = [
+ "once_cell",
+ "regex",
  "ruzstd",
  "vibrato",
 ]

Cargo.toml

@@ -8,3 +8,5 @@ edition = "2021"
 [dependencies]
 vibrato = "0.5"
 ruzstd = "0.7"
+regex = "1.10"
+once_cell = "1.19"

src/main.rs  (35 changes)

@@ -3,13 +3,14 @@ use std::{
     io::{Cursor, Read},
 };

+use once_cell::sync::Lazy;
+use regex::Regex;
 use ruzstd::StreamingDecoder;
 use vibrato::{Dictionary, Tokenizer};

 const DICT: &[u8] = include_bytes!("../dictionary/system.dic.zst");

 fn main() {
-    // Loads a compiled dictionary
     let dict = {
         let decoder = StreamingDecoder::new(Cursor::new(DICT)).unwrap();
         Dictionary::read(decoder).unwrap()
@@ -25,11 +26,33 @@ fn main() {
     };

     let tokenizer = Tokenizer::new(dict);
-    println!("{}", add_furigana(&text, &tokenizer));
+    print!("{}", add_html_furigana_skip_already_ruby(&text, &tokenizer));
 }

-fn add_furigana(text: &str, tokenizer: &Tokenizer) -> String {
+/// Like `add_html_furigana()`, but skips text that already has ruby on it, so it doesn't get double-ruby.
+fn add_html_furigana_skip_already_ruby(text: &str, tokenizer: &Tokenizer) -> String {
+    static ALREADY_RUBY: Lazy<Regex> = Lazy::new(|| Regex::new(r"<ruby.*?>.*?</ruby>").unwrap());
+
+    let mut new_text = String::new();
+    let mut last_byte_index = 0;
+    for hit in ALREADY_RUBY.find_iter(text) {
+        new_text.push_str(&add_html_furigana(
+            &text[last_byte_index..hit.start()],
+            tokenizer,
+        ));
+        new_text.push_str(hit.as_str());
+        last_byte_index = hit.end();
+    }
+
+    new_text.push_str(&add_html_furigana(&text[last_byte_index..], tokenizer));
+
+    new_text
+}
+
+/// Adds furigana to Japanese text, using html ruby tags.
+fn add_html_furigana(text: &str, tokenizer: &Tokenizer) -> String {
     let mut worker = tokenizer.new_worker();

     worker.reset_sentence(text);
     worker.tokenize();

@@ -55,11 +78,11 @@ fn add_furigana(text: &str, tokenizer: &Tokenizer) -> String {
             let mid_kana = &kana[start_bytes..(kana.len() - end_bytes)];
             let end = &surface[(surface.len() - end_bytes)..];
             new_text.push_str(start);
-            new_text.push_str(" ");
+            new_text.push_str("<ruby>");
             new_text.push_str(mid);
-            new_text.push_str("[");
+            new_text.push_str("<rt>");
             new_text.push_str(mid_kana);
-            new_text.push_str("]");
+            new_text.push_str("</rt></ruby>");
             new_text.push_str(end);
         }
     }
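As a standalone sketch of the skip-existing-ruby strategy above (assumptions: the vibrato-backed add_html_furigana() is replaced by a hypothetical placeholder annotate() so the snippet runs without the dictionary; only the regex is taken from the commit):

// Minimal, self-contained sketch of "skip text that already has ruby".
// Compiles with only the regex and once_cell crates added in this commit.

use once_cell::sync::Lazy;
use regex::Regex;

// Stand-in for the real add_html_furigana(); just marks the text it was given.
fn annotate(text: &str) -> String {
    format!("[furigana:{}]", text)
}

fn annotate_skipping_existing_ruby(text: &str) -> String {
    // Spans already wrapped in <ruby>...</ruby> are matched and passed through untouched.
    static ALREADY_RUBY: Lazy<Regex> =
        Lazy::new(|| Regex::new(r"<ruby.*?>.*?</ruby>").unwrap());

    let mut new_text = String::new();
    let mut last_byte_index = 0;
    for hit in ALREADY_RUBY.find_iter(text) {
        // Annotate the gap before this existing ruby span...
        new_text.push_str(&annotate(&text[last_byte_index..hit.start()]));
        // ...then copy the existing span through verbatim.
        new_text.push_str(hit.as_str());
        last_byte_index = hit.end();
    }
    // Annotate whatever remains after the last existing ruby span.
    new_text.push_str(&annotate(&text[last_byte_index..]));
    new_text
}

fn main() {
    let input = "before <ruby>漢字<rt>かんじ</rt></ruby> after";
    // Prints: [furigana:before ]<ruby>漢字<rt>かんじ</rt></ruby>[furigana: after]
    println!("{}", annotate_skipping_existing_ruby(input));
}

The committed function does the same splitting, but calls add_html_furigana() with the tokenizer on each gap between existing ruby spans.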