commit 1c3afed15726e7496b6b1af2add1b4311a6982dc Author: Nathan Vegdahl Date: Tue Sep 10 18:22:53 2024 +0200 First commit. A furigana generator, that can do "spaced repetition" style reduction of furigana over the course of a text. diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..45c109f --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +Cargo.lock +/target +/test_text diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..16b678b --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,17 @@ +[package] +name = "furigana_gen" +version = "0.1.0" +edition = "2021" + +[lib] +name = "furigana_gen" +path = "src/lib.rs" + +[dependencies] +vibrato = "0.5" +lz4_flex = "0.11" +quick-xml = "0.36.1" + +[build-dependencies] +lzma-rs = "0.3" +lz4_flex = "0.11" diff --git a/build.rs b/build.rs new file mode 100644 index 0000000..331e5ed --- /dev/null +++ b/build.rs @@ -0,0 +1,50 @@ +use std::{ + env, + fs::File, + io::{BufReader, Write}, + path::Path, +}; + +const KANJI: &str = include_str!("data/kanji_frequency.txt"); + +fn main() { + let out_dir = env::var("OUT_DIR").unwrap(); + + // Write frequency-ordered kanji array to rust file. + { + let dest_path = Path::new(&out_dir).join("kanji_freq_inc.rs"); + let mut f = File::create(&dest_path).unwrap(); + + f.write_all("const KANJI_FREQ: &[char] = &[".as_bytes()) + .unwrap(); + + for c in KANJI.chars() { + if c.is_whitespace() { + continue; + } + + f.write_all(format!("\n'{}',", c).as_bytes()).unwrap(); + } + + f.write_all("\n];".as_bytes()).unwrap(); + } + + // Write compressed dictionary to .lz4 file. + { + // Read and decompress file from .xz. + let dict_data = { + let f = File::open("data/dictionary/system.dic.xz").unwrap(); + let mut data = Vec::new(); + lzma_rs::xz_decompress(&mut BufReader::new(f), &mut data).unwrap(); + + data + }; + + // Recompress to .lz4. + let dest_path = Path::new(&out_dir).join("system.dic.lz4"); + let f = File::create(dest_path).unwrap(); + let mut encoder = lz4_flex::frame::FrameEncoder::new(f); + encoder.write(&dict_data).unwrap(); + encoder.finish().unwrap(); + } +} diff --git a/data/dictionary/BSD b/data/dictionary/BSD new file mode 100644 index 0000000..2c6b282 --- /dev/null +++ b/data/dictionary/BSD @@ -0,0 +1,31 @@ +Copyright (c) 2011-2021, The UniDic Consortium +Copyright (c) 2023, LegalOn Technologies, Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the + distribution. + + * Neither the name of the UniDic Consortium nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/data/dictionary/NOTICE b/data/dictionary/NOTICE new file mode 100644 index 0000000..28f245a --- /dev/null +++ b/data/dictionary/NOTICE @@ -0,0 +1,7 @@ +This software includes a binary version of data from + + https://clrd.ninjal.ac.jp/unidic_archive/cwj/3.1.1/unidic-cwj-3.1.1-full.zip + +where the costs and connection ids are retrained using CORE data in BCCWJ (except the PN category) + + https://clrd.ninjal.ac.jp/bccwj/. diff --git a/data/dictionary/system.dic.xz b/data/dictionary/system.dic.xz new file mode 100644 index 0000000..7cb1e03 Binary files /dev/null and b/data/dictionary/system.dic.xz differ diff --git a/data/kanji_frequency.txt b/data/kanji_frequency.txt new file mode 100644 index 0000000..da06d82 --- /dev/null +++ b/data/kanji_frequency.txt @@ -0,0 +1,4001 @@ +一 +二 +三 +四 +五 +六 +七 +八 +九 +十 + +人 +言 +見 +出 +気 +思 +手 +何 +分 +俺 +大 +私 +上 +前 +間 +女 +事 +今 +子 +中 +自 +目 +生 +日 +時 +行 +方 +来 +様 +彼 +本 +話 +力 +体 +魔 +入 +者 +下 +先 +当 +感 +合 +後 +顔 +無 +部 +動 +少 +持 +知 +物 +心 +意 +度 +聞 +立 +全 +理 +声 +場 +身 +会 +向 +僕 +同 +取 +笑 +的 +戦 +通 +最 +長 +込 +不 +地 +口 +変 +考 +実 +年 +違 +回 +達 +家 +葉 +味 +食 +王 +面 +男 +情 +作 +悪 +外 +使 +付 +小 +切 +強 +確 +性 +明 +対 +国 +所 +死 +世 +相 +真 +君 +学 +屋 +神 +着 +開 +引 +返 +然 +界 +足 +待 +頭 +近 +名 +好 +以 +音 +用 +結 +仕 +法 +誰 +落 +高 +直 +内 +空 +解 +問 +発 +色 +教 +決 +撃 +現 +士 +続 +覚 +振 +良 +能 +指 +正 +白 +数 +視 +要 +受 +張 +信 +愛 +別 +初 +表 +突 +道 +飛 +殺 +連 +止 +必 +戻 +早 +元 +残 +定 +終 +可 +夫 +関 +金 +多 +呼 +代 +姿 +始 +父 +機 +主 +書 +歩 +楽 +勝 +重 +帰 +命 +光 +族 +息 +配 +風 +起 +他 +流 +線 +想 +水 +応 +美 +化 +状 +離 +過 +次 +得 +我 +置 +在 +緒 +親 +安 +放 +調 +背 +絶 +丈 +剣 +室 +失 +伝 +反 +首 +母 +貴 +成 +天 +夜 +倒 +術 +苦 +嫌 +番 +員 +月 +瞬 +兄 +抜 +題 +消 +逃 +乗 +新 +浮 +答 +存 +馬 +校 +集 +頼 +果 +態 +由 +認 +特 +報 +敵 +黒 +断 +輩 +素 +優 +軍 +業 +助 +進 +願 +車 +血 +抱 +守 +文 +追 +押 +精 +構 +囲 +形 +姉 +転 +走 +説 +痛 +腕 +胸 +店 +負 +赤 +兵 +座 +遠 +段 +識 +備 +打 +加 +点 +寄 +師 +触 +山 +限 +急 +差 +半 +常 +深 +予 +普 +攻 +両 +広 +告 +周 +石 +花 +殿 +寝 +運 +恐 +件 +程 +任 +供 +望 +髪 +格 +記 +服 +平 +険 +約 +海 +木 +活 +隊 +期 +電 +火 +勢 +鳴 +組 +単 +器 +語 +選 +都 +野 +利 +完 +係 +異 +隠 +経 +驚 +料 +欲 +第 +肉 +満 +公 +再 +怒 +団 +騎 +仲 +礼 +側 +掛 +際 +装 +奴 +友 +冷 +有 +夢 +探 +余 +画 +飲 +破 +怖 +交 +難 +横 +静 +訳 +計 +微 +送 +許 +似 +念 +質 +渡 +幸 +役 +居 +街 +奥 +速 +熱 +危 +青 +去 +社 +傷 +朝 +警 +議 +細 +読 +御 +品 +位 +肩 +茶 +買 +階 +端 +徒 +休 +軽 +婚 +太 +聖 +降 +共 +城 +右 +皆 +影 +頃 +響 +官 +護 +制 +移 +裏 +片 +原 +台 +武 +腹 +耳 +困 +眼 +忘 +園 +黙 +弾 +霊 +嬉 +怪 +握 +激 +況 +吸 +示 +具 +腰 +壁 +容 +宮 +悲 +義 +泣 +式 +暗 +迷 +験 +申 +和 +辺 +判 +務 +像 +帝 +娘 +論 +故 +察 +接 +未 +支 +談 +案 +吐 +壊 +試 +保 +村 +乱 +駄 +左 +惑 +客 +震 +頷 +割 +紙 +散 +興 +恋 +途 +揺 +病 +伸 +閉 +頑 +民 +土 +束 +若 +恥 +証 +技 +注 +治 +舞 +疑 +闘 +練 +爵 +昨 +叫 +種 +路 +逆 +万 +売 +為 +門 +遊 +獣 +襲 +並 +参 +図 +駆 +眠 +妹 +根 +嬢 +邪 +院 +令 +妙 +収 +謝 +竜 +量 +観 +弱 +瞳 +席 +遅 +星 +投 +巻 +隣 +船 +増 +古 +甘 +衛 +領 +退 +処 +暴 +準 +犯 +踏 +効 +与 +疲 +千 +類 +復 +景 +求 +憶 +罪 +住 +越 +習 +滅 +刺 +姫 +絡 +防 +字 +害 +詰 +局 +混 +捨 +商 +射 +密 +市 +叩 +焼 +巨 +届 +丸 +百 +扉 +育 +将 +映 +簡 +吹 +喜 +勇 +帯 +更 +避 +東 +薬 +冒 +染 +田 +雪 +派 +薄 +森 +済 +町 +極 +刻 +導 +救 +筋 +島 +象 +払 +鹿 +包 +低 +秘 +等 +例 +唇 +夏 +鬼 +呟 +工 +折 +草 +歳 +昼 +互 +介 +納 +従 +温 +脱 +超 +迎 +昔 +敷 +固 +宿 +級 +設 +非 +替 +久 +快 +香 +捕 +底 +陽 +涙 +角 +夕 +減 +衣 +職 +距 +産 +属 +川 +整 +慣 +郎 +闇 +奇 +刀 +爆 +個 +造 +球 +材 +布 +騒 +比 +誘 +窓 +慌 +敗 +呪 +戸 +照 +型 +圧 +雑 +皇 +狙 +末 +建 +改 +司 +奪 +毎 +除 +液 +争 +働 +値 +責 +麗 +歌 +輝 +弁 +否 +祭 +短 +研 +銃 +凄 +弟 +波 +炎 +独 +印 +演 +崩 +抗 +酒 +尋 +価 +旅 +提 +老 +跡 +銀 +基 +尽 +床 +緊 +権 +館 +鼻 +査 +繰 +暮 +庭 +被 +悟 +労 +脳 +了 +慢 +絵 +描 +沈 +号 +鳥 +写 +幼 +毛 +羽 +締 +頂 +舌 +留 +撫 +己 +究 +資 +陣 +適 +純 +仮 +迫 +呆 +堂 +操 +境 +飯 +雨 +頬 +嘘 +鉄 +骨 +華 +傾 +猫 +北 +悔 +晴 +拠 +修 +希 +毒 +尻 +春 +甲 +威 +衝 +積 +政 +厳 +荒 +協 +眺 +曲 +授 +紅 +扱 +補 +医 +犬 +志 +玉 +担 +輪 +漏 +枚 +洗 +週 +登 +綺 +陰 +既 +層 +裂 +遣 +伏 +率 +円 +才 +鏡 +穴 +里 +至 +井 +耐 +盛 +因 +屈 +亡 +悩 +継 +盗 +条 +肌 +宝 +京 +矢 +柄 +廊 +系 +訪 +援 +箱 +儀 +総 +科 +焦 +斬 +管 +佐 +勘 +樹 +依 +含 +汚 +眉 +睨 +展 +柔 +西 +拳 +狂 +宙 +跳 +誤 +荷 +裕 +織 +艦 +砂 +勉 +珍 +諦 +辛 +借 +尾 +模 +捜 +執 +携 +潰 +到 +掴 +句 +縁 +膝 +寂 +妖 +椅 +功 +雰 +額 +臓 +封 +遺 +覗 +袋 +課 +慮 +討 +欠 +隙 +歯 +匂 +冗 +鋭 +順 +訊 +南 +噂 +覆 +陛 +策 +杯 +懐 +板 +虫 +乳 +丁 +溜 +伯 +妻 +酷 +規 +仰 +陸 +氏 +棒 +汗 +露 +群 +皮 +魚 +換 +禁 +繋 +承 +刃 +那 +潜 +検 +貰 +臭 +氷 +史 +各 +蹴 +英 +燃 +即 +詳 +透 +呂 +製 +房 +魂 +略 +列 +庫 +戒 +桜 +算 +浴 +域 +云 +菜 +傍 +埋 +統 +評 +監 +複 +冬 +槍 +如 +拶 +挨 +奈 +硬 +黄 +致 +推 +貸 +舎 +巡 +召 +秋 +測 +寒 +秒 +褒 +奮 +飾 +健 +及 +祖 +刑 +煙 +干 +挙 +営 +殴 +訓 +穏 +喉 +幾 +歴 +濡 +砲 +雲 +充 +暇 +辞 +湯 +易 +忙 +抵 +節 +区 +鍵 +録 +宇 +撮 +努 +盾 +机 +侵 +雄 +幻 +灯 +賊 +喚 +施 +抑 +揃 +央 +索 +狼 +清 +婦 +晩 +随 +油 +妃 +忍 +憎 +択 +沢 +旦 +滑 +懸 +福 +溢 +揮 +専 +双 +費 +偽 +逸 +称 +雷 +匹 +候 +岩 +敬 +障 +便 +芸 +脚 +養 +砕 +委 +療 +匠 +給 +宅 +児 +坊 +愚 +誇 +歪 +緩 +怯 +獄 +蔵 +躍 +恵 +預 +挑 +歓 +午 +駅 +幕 +紹 +般 +噛 +狩 +挟 +貫 +膨 +招 +載 +龍 +喋 +菓 +編 +宣 +泊 +劇 +泉 +縮 +虚 +崎 +祝 +祈 +拾 +霧 +標 +損 +翼 +倉 +掃 +緑 +源 +停 +拭 +紫 +永 +糸 +倍 +麻 +善 +稼 +範 +狭 +厚 +添 +趣 +玄 +谷 +環 +脇 +章 +擦 +偉 +濃 +聴 +唱 +藤 +魅 +脅 +河 +昇 +坂 +舐 +烈 +酔 +延 +縛 +垂 +唯 +呑 +栄 +塞 +瀬 +嫁 +占 +江 +契 +橋 +塔 +省 +哀 +植 +奏 +拍 +偶 +膣 +乾 +沙 +亜 +拒 +副 +遭 +鮮 +溶 +府 +馴 +豪 +淡 +絞 +頰 +渉 +企 +蛇 +孤 +踊 +尊 +郷 +賢 +粉 +零 +詞 +審 +猛 +弓 +伊 +看 +恨 +侍 +卒 +秀 +針 +隅 +鍛 +鎧 +則 +隷 +杖 +裁 +裸 +林 +札 +博 +貨 +忠 +財 +咲 +筈 +採 +築 +堪 +控 +衆 +纏 +曜 +諸 +雅 +爪 +寧 +謎 +靴 +塗 +柱 +涼 +凍 +宗 +械 +筆 +摘 +創 +喧 +齢 +癒 +獲 +遇 +律 +侯 +詩 +洋 +慎 +乃 +誓 +灰 +揚 +嘆 +這 +羅 +殊 +臣 +恩 +噴 +浅 +誕 +璧 +鎖 +誠 +謀 +偵 +晶 +脈 +罰 +腐 +籠 +徴 +泳 +湧 +乙 +賞 +喰 +鈍 +厄 +辿 +漂 +癖 +棚 +唐 +泥 +翌 +叶 +股 +賛 +苛 +僅 +仁 +寮 +伐 +惚 +寺 +惜 +米 +請 +訴 +勿 +症 +釣 +把 +枝 +徹 +紛 +悠 +松 +豊 +囁 +塊 +譲 +勤 +鈴 +彩 +促 +卿 +粋 +顎 +剥 +皿 +凶 +績 +掻 +棄 +噓 +卵 +講 +凝 +牙 +牛 +縦 +漫 +賭 +遂 +削 +掲 +季 +据 +遮 +還 +革 +肢 +亀 +戯 +浜 +就 +災 +幽 +掘 +劣 +鼓 +犠 +嘩 +牲 +盤 +督 +惨 +憩 +肝 +湖 +洞 +紋 +貧 +寸 +紀 +廃 +暑 +暖 +摑 +港 +唾 +蒼 +揉 +競 +熟 +堵 +陥 +吉 +鑑 +釈 +騙 +卑 +躇 +躊 +盟 +拘 +飼 +錬 +斉 +赦 +述 +州 +旧 +幅 +畳 +益 +渋 +催 +捉 +津 +澄 +患 +浸 +眩 +掌 +憧 +岸 +伴 +維 +尉 +墓 +愉 +診 +徐 +袖 +臨 +童 +拝 +蓮 +幹 +農 +蓋 +瓶 +貌 +僧 +雫 +概 +磨 +豆 +孫 +訝 +免 +誌 +粘 +粒 +智 +雇 +帳 +富 +俯 +愕 +卓 +筒 +妬 +酸 +爺 +葵 +也 +排 +敏 +航 +康 +拡 +典 +罠 +壇 +隔 +塩 +畑 +覧 +弄 +汁 +之 +窺 +胆 +穂 +鋼 +蘇 +斐 +没 +煮 +錯 +巫 +骸 +却 +繁 +忌 +潤 +虐 +嚙 +閣 +虎 +芝 +旗 +慰 +尖 +託 +叱 +寿 +妄 +叔 +狐 +斜 +帽 +兼 +唸 +縄 +億 +徳 +飽 +池 +紗 +奉 +梨 +邸 +勧 +朱 +償 +麦 +傭 +班 +滴 +撤 +潔 +滲 +凛 +痕 +潮 +仏 +痺 +捧 +讐 +往 +嵐 +濁 +網 +党 +版 +羨 +羊 +嘲 +伺 +閃 +貼 +翻 +岡 +励 +阻 +渦 +堅 +憂 +枕 +署 +窟 +序 +贈 +鞄 +艶 +婆 +丘 +項 +滞 +莉 +珠 +憑 +沿 +喫 +漢 +辱 +詠 +鉱 +巣 +誉 +睡 +肯 +痴 +核 +鬱 +牢 +璃 +嫉 +葬 +是 +廷 +較 +塵 +泡 +履 +轟 +挿 +湿 +販 +咳 +鍋 +嗟 +竹 +悶 +析 +酬 +晒 +妾 +軌 +昧 +膜 +桃 +旋 +弥 +慈 +綾 +征 +蒸 +剤 +遥 +捻 +淫 +漠 +購 +熊 +豚 +鎮 +巧 +傘 +賀 +裾 +稽 +宴 +吊 +浄 +棟 +胃 +楓 +憤 +庇 +尚 +隼 +庁 +呻 +融 +琴 +冊 +砦 +擬 +咄 +餌 +此 +嗅 +凪 +詮 +銭 +刹 +献 +募 +醒 +鐘 +曖 +縫 +斗 +梅 +敢 +駐 +醜 +輸 +薇 +胴 +紡 +攣 +薔 +沸 +苗 +蜜 +囚 +阿 +杏 +憐 +壮 +翠 +勃 +猿 +瑞 +曇 +駒 +稿 +奢 +穢 +怠 +兎 +疎 +峰 +胞 +矛 +架 +缶 +枯 +殆 +佳 +殻 +罵 +均 +芽 +翔 +符 +拐 +汰 +瓦 +炭 +鱗 +須 +県 +扇 +蜘 +蛛 +儂 +籍 +乏 +濯 +茂 +笛 +梓 +牧 +爽 +侮 +税 +喪 +畜 +粗 +糖 +僚 +竦 +橘 +軒 +票 +漁 +浪 +啓 +藩 +窮 +綱 +疾 +賑 +彫 +著 +蔑 +漆 +肘 +膚 +拗 +搔 +沼 +拙 +凡 +絆 +妨 +棋 +妊 +斎 +茜 +宰 +鷹 +剝 +撒 +箇 +繫 +紐 +暫 +崖 +逢 +粧 +稲 +枠 +厨 +斧 +披 +磁 +芳 +陶 +顧 +炉 +欺 +霞 +覇 +沖 +瑠 +恭 +剛 +鎌 +逮 +荘 +薙 +煽 +滝 +垣 +蓄 +帆 +錠 +哲 +皺 +旨 +渇 +瞼 +踵 +貞 +蝶 +鞭 +惹 +襟 +仇 +釘 +耗 +貯 +暁 +萌 +朗 +些 +糧 +吾 +悦 +喘 +偏 +蜂 +冠 +昏 +迂 +雛 +冥 +佇 +箸 +繕 +堕 +槌 +摩 +屍 +脂 +繊 +穫 +廻 +馳 +痙 +蟲 +湊 +頻 +胡 +艇 +蛍 +珀 +琥 +雀 +軋 +畏 +礎 +溺 +俗 +慕 +銅 +鞘 +臆 +懇 +詫 +昂 +顕 +軟 +掠 +猟 +塾 +吠 +倫 +薫 +咎 +瞑 +柳 +埃 +禍 +閥 +株 +戴 +餅 +紳 +舟 +邦 +髭 +剰 +澪 +銘 +冴 +咆 +獅 +赴 +奔 +涯 +衰 +贅 +鶴 +苑 +飢 +刷 +燥 +鉢 +緯 +摂 +戚 +峙 +慨 +姓 +拓 +隻 +藍 +抽 +遙 +冶 +淹 +墜 +玲 +礫 +稀 +顰 +秩 +丹 +怨 +蠢 +虹 +柊 +烏 +抉 +謙 +噌 +穿 +妥 +枢 +圏 +瞥 +櫂 +塚 +綿 +腫 +其 +径 +刊 +哮 +貪 +鳳 +燐 +翡 +兆 +煩 +菊 +柚 +宜 +括 +椿 +儲 +巾 +漬 +拷 +宵 +羞 +杉 +娠 +詐 +仙 +亭 +亮 +孔 +浦 +咥 +併 +肥 +桐 +鶏 +蘭 +闊 +虜 +庶 +汐 +孝 +眷 +殲 +煌 +傲 +肺 +郭 +睦 +謁 +辻 +券 +檻 +崇 +弦 +朧 +貢 +嵌 +茉 +芯 +串 +鉛 +曹 +淵 +碧 +懲 +贄 +鳩 +岐 +稚 +綻 +縋 +炊 +凌 +彷 +寛 +葛 +壺 +汝 +曰 +宛 +硝 +薦 +洩 +舗 +諾 +蛮 +芋 +隆 +痩 +牽 +篠 +躱 +盆 +軸 +批 +吟 +諭 +彦 +俊 +狽 +粛 +某 +抹 +腸 +恰 +刈 +頓 +慄 +飴 +錆 +慶 +喝 +搾 +殖 +焚 +侶 +乞 +匿 +圭 +絹 +遜 +竿 +兜 +蕩 +哉 +湾 +咽 +岬 +柵 +絨 +攫 +洒 +栖 +燈 +貝 +冑 +紺 +弊 +婿 +又 +眸 +髄 +桁 +萎 +塀 +傑 +灼 +姻 +茎 +暦 +辰 +渚 +汲 +阪 +腑 +嗜 +躙 +棺 +袈 +裟 +堀 +賃 +巴 +椎 +或 +毯 +沌 +搬 +瘴 +炸 +棲 +垢 +搭 +槽 +撲 +鷲 +厭 +墨 +閑 +渾 +昭 +葱 +菌 +甚 +媚 +莫 +狗 +謹 +培 +玩 +欧 +蔽 +擁 +淑 +衡 +跪 +脆 +爛 +沫 +套 +嗚 +朔 +腿 +螺 +只 +娼 +媒 +蹂 +唖 +淀 +颯 +桶 +栗 +丼 +楼 +逐 +賜 +韻 +胎 +徨 +伽 +挫 +繍 +帥 +嬌 +糞 +朽 +猪 +燭 +捌 +櫻 +克 +嚇 +耕 +擲 +訂 +瞭 +憲 +簿 +鼠 +膳 +劫 +恒 +薩 +餓 +耶 +囮 +祓 +奨 +尿 +駕 +榊 +儚 +豹 +倣 +捲 +踪 +儘 +采 +躯 +盲 +祐 +蝕 +欄 +孕 +陳 +綴 +嗤 +聡 +呈 +咒 +猥 +賓 +巳 +樽 +廉 +猶 +凜 +墟 +桂 +柏 +疼 +緋 +斥 +疫 +跨 +壬 +朴 +躰 +薪 +需 +后 +蔓 +迸 +溝 +坐 +昴 +腔 +諜 +轄 +篭 +譜 +啜 +棘 +蟻 +痒 +杭 +鞠 +蚊 +泰 +郵 +蜥 +蜴 +菫 +癪 +躾 +逡 +仔 +雌 +摯 +茫 +喩 +秤 +屑 +凹 +藻 +肖 +芹 +皐 +騰 +狸 +檎 +迅 +践 +姐 +醸 +遍 +唄 +拉 +勲 +褐 +埒 +惰 +祥 +逝 +郊 +昆 +餐 +煉 +麺 +摺 +瀕 +郡 +枷 +遡 +娯 +謳 +醤 +涎 +於 +舵 +郁 +峯 +岳 +梢 +祟 +該 +笠 +寵 +閲 +靄 +瓜 +琢 +斑 +恍 +戮 +篤 +牡 +蝋 +吼 +辣 +洲 +惧 +靡 +楚 +釜 +陵 +焔 +弧 +蔭 +梯 +嶋 +幡 +填 +翁 +鯨 +宥 +逞 +庵 +梗 +毅 +襖 +糾 +笹 +嫡 +炒 +杜 +厩 +貶 +幣 +筧 +詛 +慧 +酌 +湛 +嘉 +碗 +襞 +賄 +翳 +幌 +橙 +尺 +措 +汽 +漕 +獰 +呉 +蔦 +駿 +繭 +藁 +夷 +堤 +循 +瞠 +蛙 +隈 +愁 +准 +朦 +輿 +箒 +旬 +讃 +耽 +萩 +錦 +鞍 +栽 +弛 +叉 +碑 +剖 +坑 +磐 +煎 +蹟 +俳 +啞 +姦 +謂 +嶺 +桑 +憫 +澤 +栞 +挽 +禄 +樫 +滓 +窪 +旭 +狡 +仄 +貿 +咀 +弘 +嚼 +揄 +揶 +掟 +燻 +痣 +矜 +葡 +桔 +睫 +掬 +柴 +棍 +萄 +梁 +昌 +禅 +苔 +鵜 +疇 +壱 +甦 +屠 +蟹 +淋 +膏 +屯 +噤 +洛 +妓 +茨 +榴 +憚 +傀 +儡 +唆 +藪 +挺 +撥 +蹲 +焉 +祠 +轢 +隕 +暢 +麓 +鴨 +塁 +齧 +贋 +穀 +俄 +祀 +董 +俵 +熾 +燦 +卯 +叡 +碌 +渓 +享 +呵 +吏 +瘦 +彰 +忽 +芻 +燕 +窒 +捩 +霜 +暈 +駈 +蝙 +蝠 +濤 +浩 +勅 +輔 +戟 +酢 +蒔 +粥 +苺 +姪 +贔 +櫛 +迦 +魁 +杞 +肪 +屓 +債 +碁 +鉤 +緻 +栓 +姑 +鴉 +凰 +峠 +佑 +咤 +坦 +茹 +飄 +詣 +蒲 +禿 +叙 +捗 +剃 +珈 +琲 +尼 +李 +條 +褪 +附 +箋 +峡 +肋 +洪 +悍 +袴 +曾 +裔 +肛 +冤 +弔 +捏 +篝 +癇 +鋏 +粟 +頸 +玖 +怜 +梳 +甥 +舷 +憮 +尤 +屁 +韓 +蕎 +癌 +楯 +矯 +遽 +叛 +惣 +朋 +佃 +裡 +徊 +凸 +斃 +鹸 +徘 +靭 +吻 +榛 +炙 +鷺 +寡 +菱 +嘔 +蕾 +煤 +悸 +堰 +捷 +滾 +哨 +伍 +澱 +鍔 +桟 +賽 +宦 +柿 +楠 +茸 +卸 +侘 +鉈 +播 +噪 +譚 +嬲 +遷 +鋒 +猾 +硯 +伎 +酵 +懺 +賦 +哄 +椛 +饅 +饒 +窘 +漸 +鵺 +琉 +贖 +謡 +磯 +累 +稟 +螢 +翅 +錐 +漣 +畔 +但 +嘗 +瑛 +蹄 +樋 +蠟 +椀 +蠅 +辟 +嗣 +倦 +蝉 +腺 +汎 +蒙 +漲 +錫 +拮 +賠 +泌 +轍 +刎 +芦 +諫 +恕 +盃 +匙 +旺 +膠 +瘍 +簾 +窃 +髏 +髑 +僭 +遼 +殉 +曽 +悴 +膂 +莪 +壕 +軀 +嶽 +曝 +蚕 +顛 +蠍 +囃 +謐 +雁 +彿 +哭 +脊 +糊 +屹 +勾 +几 +憔 +滸 +麟 +晦 +噺 +鯉 +孵 +弐 +攪 +褄 +朶 +汀 +躓 +鮫 +沁 +芒 +棗 +戌 +窄 +諒 +弩 +裳 +毟 +庸 +氣 +蓼 +鮎 +聳 +諏 +祷 +娶 +脛 +縞 +憾 +珊 +庄 +媛 +猊 +痍 +悉 +倖 +瞰 +賤 +瑚 +嚢 +筐 +扮 +臥 +泄 +禽 +餃 +櫓 +蝦 +樵 +罹 +睥 +醍 +椒 +醐 +埼 +且 +撰 +逅 +邂 +凱 +篇 +臼 +窯 +粕 +廟 +芙 +慟 +悼 +簪 +膿 +僥 +函 +菩 +頁 +巷 +榎 +鯛 +陀 +搦 +柑 +嵩 +賂 +芥 +濫 +鱈 +眇 +扶 +瞞 +蠱 +慇 +懃 +巌 +蓉 +箔 +鮭 +縷 +稜 +涜 +絢 +檀 +詭 +琵 +琶 +袂 +敦 +槻 +麒 +彗 +啖 +爬 +狛 +茅 +荊 +絃 +涛 +魄 +楔 +靖 +鰻 +蛾 +腎 +呷 +酎 +紬 +毬 +鳶 +彙 +筑 +壌 +肴 +蜃 +樺 +瀧 +驕 +曳 +穣 +鑽 +磔 +嘴 +矮 +惠 +諍 +壷 +梶 +湘 +笥 +誅 +泪 +塡 +窩 +吃 +斡 +鴻 +蒐 +硫 +奧 +鍬 +虔 +攘 +柩 +滋 +黛 +烙 +嘯 +梟 +楊 +訟 +鶯 +臀 +牝 +蓑 +藉 +宏 +國 +擢 +齎 +牌 +礁 +餉 +秦 +夥 +竈 +晰 +鋳 +訣 +叢 +迄 +拵 +鏃 +楕 +僻 +亘 +憊 +蛸 +箪 +燎 +靱 +姜 +憬 +齟 +齬 +墳 +淳 +臍 +繡 +鼬 +埠 +祉 +鯱 +訛 +朕 +腱 +倶 +跋 +誹 +銛 +輌 +鋸 +荻 +恫 +腋 +藝 +竄 +穹 +蟬 +胤 +囓 +咬 +劒 +簒 +罅 +誑 +阜 +涸 +啼 +劉 +扈 +舶 +葦 +毘 +貘 +逗 +蓬 +丞 +柘 +疹 +簀 +莢 +邁 +匣 +升 +袁 +淘 +誼 +餡 +悧 +祇 +侭 +蛆 +魎 +遁 +斯 +蝿 +瘤 +轡 +塹 +堆 +氾 +詈 +桿 +疵 +纂 +嚥 +闖 +潟 +站 +葎 +勒 +竪 +餞 +楢 +嗄 +蝗 +睾 +謗 +諌 +猩 +菅 +托 +鄙 +韋 +帖 +魏 +皓 +乖 +鬨 +凭 +帛 +眦 +而 +邏 +鍾 +綜 +咫 +奸 +佩 +巽 +鑢 +杓 +炬 +酊 +酩 +蠕 +宋 +猜 +兇 +惟 +碍 +遵 +竺 +訥 +灌 +坪 +閨 +瑕 +騨 +閻 +茄 +漿 +駁 +燼 +饉 +鑓 +艘 +罷 +墾 +亥 +饗 +衾 +鉾 +滔 +抓 +顚 +囀 +揆 +痢 +寅 +哺 +杵 +鯖 +侠 +沃 +燵 +荼 +魍 +懊 +撹 +醬 +砥 +諺 +嘶 +洸 +璽 +寥 +瀉 +做 +檜 +晋 +啄 +趨 +麾 +悄 +愴 +膵 +碇 +舳 +畿 +擽 +鏖 +吋 +閂 +衷 +梱 +櫃 +菖 +澹 +犀 +紆 +帷 +徽 +狒 +爾 +醇 +焙 +黎 +綽 +鴎 +灸 +蛭 +藹 +欒 +撼 +邑 +檄 +蜻 +頽 +蛉 +螂 +肚 +禊 +蟷 +租 +鰹 +錨 +盧 +椋 +傅 +瑤 +懣 +瓢 +凧 +羹 +誂 +黴 +籐 +峻 +毀 +鏑 +鰐 +漱 +斛 +禱 +洟 +恣 +筏 +蠣 +鬣 +捺 +瀟 +髯 +鳰 +丑 +玻 +雹 +魯 +廓 +噎 +夾 +厠 +喊 +逼 +鍮 +襦 +鋲 +陪 +閤 +撓 +卜 +亨 +痘 +濠 +俸 +剋 +剌 +舜 +沽 +輛 +鬚 +鵠 +儒 +註 +磋 +肆 +赫 +眈 +卍 +慾 +鹼 +婉 +拌 +渕 +曙 +霹 +靂 +愾 +鵬 +隘 +倅 +蟇 +蜀 +胱 +膀 +剪 +獪 +忸 +怩 +諧 +柾 +鐵 +倹 +劾 +溌 +鑿 +坤 +袱 +萬 +盥 +栃 +吝 +涌 +謬 +瀆 +仗 +刮 +偲 +甕 +褸 +襤 +袢 +濾 +畠 +麹 +尭 +覿 +廠 +馨 +劈 +吞 +偃 +魑 +訶 +崑 +邯 +鄲 +按 +崙 +熔 +濘 +錘 +愧 +亦 +鰭 +倭 +峨 +俣 +棹 +憺 +蝮 +瑣 +蕉 +辜 +銜 +歎 +蛹 +茗 +蘆 +截 +葺 +嘱 +壜 +槇 +鞋 +卦 +褻 +吽 +芭 +駝 +鐙 +薊 +摸 +繚 +蛟 +屏 +檬 +檸 +嵯 +盪 +脹 +鑼 +蟠 +瀑 +梃 +烹 +寓 +椰 +鈎 +鴇 +厘 +畝 +詔 +迭 +沱 +滂 +躅 +躑 +訃 +舅 +鸚 +駱 +鵡 +躁 +忖 +灘 +鹵 +髷 +霆 +筵 +籤 +𠮟 +耄 +蜩 +娩 +譫 +冽 +簞 +浚 +蕗 +鍼 +曼 +漉 +痔 +雉 +匕 +恃 +驢 +幟 +聊 +衿 +虱 +竃 +棕 +炮 +俱 +縺 +竟 +餮 +饕 +頚 +嬰 +瘡 +桧 +箍 +鵞 +褥 +緞 +臂 +濛 +棠 +縊 +聾 +笈 +隧 +掩 +櫚 +蟄 +闢 +兌 +埜 +酪 +坩 +堝 +酋 +眞 +皙 +娑 +糠 +箏 +臙 +髙 +艱 +襷 +樟 +銚 +箝 +姥 +嬪 +榜 +孺 +肱 +耀 +聘 +燗 +瀾 +慚 +箕 +誦 +釉 +槙 +炯 +庚 +疚 +麿 +掏 +橇 +瑶 +梧 +葫 +畢 +斟 +蕃 +來 +頤 +斂 +脾 +牒 +吶 +蛤 +匍 +鼾 +俟 +賎 +丙 +梵 +圃 +蕪 +痰 +儺 +剽 +笏 +已 +閾 +谺 +恢 +馭 +麩 +跫 +瑜 +鼎 +浣 +碩 +斤 +槃 +扁 +涅 +寨 +毫 +橿 +箭 +諮 +堡 +鉗 +衒 +輜 +袍 +筍 +鬘 +薯 +虞 +趙 +譬 +婢 +鋤 +幇 +沓 +喇 +款 +囂 +鸞 +薮 +韮 +叭 +擾 +匐 +孟 +韜 +殷 +麝 +蝸 +謄 +濱 +嫂 +欅 +鉦 +槐 +瑙 +瑪 +籾 +鵲 +欝 +罐 +霰 +敲 +饐 +奕 +顫 +鬩 +矩 +畦 +釦 +魘 +惡 +抄 +蛋 +撞 +蚓 +蚯 +允 +羆 +蕁 +枳 +奄 +儕 +喀 +獺 +颶 +凋 +絽 +萱 +謔 +會 +磊 +鰯 +沐 +酉 +乎 +攀 +廿 +蕨 +嫣 +瞋 +莞 +倆 +緬 +窶 +緘 +疱 +拿 +綯 +鉋 +寇 +什 +蚤 +框 +埴 +腥 +鮨 +冨 +杳 +擂 +鱒 +牆 +驀 +穽 +蹙 +縒 +胚 +頒 +莱 +諳 +虻 +懦 +籃 +恙 +骰 +寞 +聯 +踝 +楡 +渠 +閏 +驟 +篩 +蘊 +胛 +縹 +犇 +諤 +廣 +潑 +獏 +刳 +慥 +砧 +軛 +禦 +恬 +苫 +俎 +蓙 +趾 +氈 +鮒 +艫 +撻 +逍 +晨 +褌 +罌 +貂 +淆 +鴛 +鴦 +鎬 +拇 +欣 +畸 +秣 +鄭 +无 +鑷 +篁 +懼 +凉 +踞 +坎 +狢 +疋 +歿 +芍 +夙 +嶮 +茣 +輻 +黍 +鶫 +鐸 +矍 +鑠 +址 +蝎 +蜷 +伜 +鼈 +笊 +臑 +搗 +龕 +塙 +鞴 +尸 +崋 +荏 +頗 +恚 +鎗 +糺 +顏 +怙 +繻 +愈 +帙 +熨 +嗾 +瓏 +檣 +戎 +抒 +枡 +桓 +袷 +匁 +舂 +紘 +瓊 +盒 +蛞 +蝓 +邀 +綬 +疸 +錚 +鬢 +圓 +聚 +槿 +昵 +禰 +尹 +乍 +瀝 +扼 +珪 +樅 +菰 +仆 +鍍 +殭 +驒 +壽 +飜 +綸 +脣 +鷗 +陋 +茲 +抛 +逓 +俥 +擡 +弼 +鰓 +潭 +碓 +繞 +罫 +屛 +烟 +鏝 +搏 +呶 +讒 +旱 +紮 +嗇 +羌 +佗 +劍 +舫 +戈 +擱 +漑 +蹌 +貉 +聲 +闍 +欽 +垓 +忝 +塒 +匡 +拱 +踉 +廏 +戊 +鈿 +亢 +軻 +疽 +匈 +蒻 +麭 +蒟 +掣 +皹 +竣 +弑 +髣 +髴 +弋 +狄 +甜 +繃 +聟 +膾 +鮪 +劃 +藷 +絣 +嫋 +耆 +赭 +愍 +撚 +蕭 +柢 +舘 +辷 +牟 +瘠 +堺 +鮑 +埓 +鱧 +薺 +釵 +匪 +桝 +瘧 +莵 +俤 +皴 +疏 +蒜 +鯵 +渺 +釧 +廬 +圀 +繹 +廂 +崗 +羂 +渥 +婁 +曠 +寐 +佞 +喬 +捥 +稔 +馥 +鱸 +覯 +諷 +艤 +彬 +鴒 +鶺 +夭 +徂 +徠 +顆 +慊 +覡 +與 +馗 +誨 +騏 +忿 +煖 +于 +伶 +塑 +幔 +翰 +鎚 +驎 +鯰 +羚 +侃 +號 +幀 +悋 +杷 +襴 +慙 +鞣 +鰌 +冰 +杢 +裃 +莨 +蚣 +蜈 +跛 +圖 +晃 +楷 +魃 +瀞 +艷 +濶 +筺 +箴 +雖 +楪 +筅 +笙 +諱 +挾 +砒 +稗 +溯 +蹈 +僑 +悖 +宸 +俠 +艪 +薗 +頌 +旒 +庖 +羈 +嶼 +侈 +泛 +糎 +諄 +跣 +篷 +蒋 +潅 +个 +鴫 +屡 +杠 +葭 +浬 +斌 +黝 +谿 +髻 +孰 +應 +禎 +宍 +癩 +睛 +蔡 +薨 +嶌 +蒡 +鰊 +溂 +鏤 +鑚 +媼 +糟 +弖 +莚 +薀 +蝟 +椚 +襯 +逵 +佻 +厖 +腟 +椥 +稠 +栂 +羯 +蓆 +勺 +艀 +穎 +諂 +苅 +潁 +笞 +旌 +醗 +譴 +懶 +甍 +粁 +銑 +饌 +繙 +娥 +嫦 +慳 +盂 +哩 +樒 +禹 +蹠 +燧 +邕 +瀚 +簑 +霙 +侑 +臘 +枇 +鞆 +篦 +蟀 +籬 +鉞 +狷 +痾 +跏 +隋 +烽 +鈷 +舁 +梏 +悌 diff --git a/src/learner.rs b/src/learner.rs new file mode 100644 index 0000000..4250b04 --- /dev/null +++ b/src/learner.rs @@ -0,0 +1,67 @@ +use std::collections::HashMap; + +const MIN_MAX_DISTANCE: usize = 100; +const MAX_MAX_DISTANCE: usize = 10000; + +#[derive(Debug, Copy, Clone)] +struct WordStats { + // The last position (in words processed) that this word was seen at. + last_seen_at: usize, + + // How many times this word has been seen so far. + times_seen: usize, + + // Maximum distance before helps is needed again. + max_distance: usize, +} + +pub struct Learner { + stats: HashMap, + words_processed: usize, + times_seen_threshold: usize, +} + +impl Learner { + pub fn new(times_seen_threshold: usize) -> Self { + Self { + stats: HashMap::new(), + words_processed: 0, + times_seen_threshold: times_seen_threshold, + } + } + + pub fn record(&mut self, word: &str) { + self.stats + .entry(word.to_string()) + .and_modify(|stats| { + let distance = self.words_processed - stats.last_seen_at; + + stats.last_seen_at = self.words_processed; + stats.times_seen += 1; + if stats.times_seen <= self.times_seen_threshold { + return; + } + + if distance < stats.max_distance { + stats.max_distance += distance.min((stats.max_distance as f64 * 0.5) as usize); + } + + stats.max_distance = stats.max_distance.min(MAX_MAX_DISTANCE); + }) + .or_insert(WordStats { + last_seen_at: self.words_processed, + times_seen: 1, + max_distance: MIN_MAX_DISTANCE, + }); + self.words_processed += 1; + } + + pub fn needs_help(&self, word: &str) -> bool { + if let Some(stats) = self.stats.get(word) { + let distance = self.words_processed - stats.last_seen_at; + stats.times_seen <= self.times_seen_threshold || distance > stats.max_distance + } else { + true + } + } +} diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..72cc9d6 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,516 @@ +mod learner; + +use std::{ + collections::HashSet, + // fs::File, + io::{Cursor, Read}, +}; + +use lz4_flex::frame::FrameDecoder; +use quick_xml::events::Event; +use vibrato::{Dictionary, Tokenizer}; + +use learner::Learner; + +// Include KANJI_FREQ, a frequency-ordered array of kanji characters. +include!(concat!(env!("OUT_DIR"), "/kanji_freq_inc.rs")); + +const DICT: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/system.dic.lz4")); + +pub struct FuriganaGenerator { + tokenizer: Tokenizer, + exclude_kanji: HashSet, + learner: Learner, +} + +impl FuriganaGenerator { + // `exclude_count`: exclude the N most frequent kanji from furigana. + // Specifically, words made up *entirely* of those kanji will be excluded. + // If a word has some kanji that aren't in that set, even if it also has + // some that are, it will still get furigana. + pub fn new(exclude_count: usize, learn_mode: bool) -> Self { + let dict = { + // Note: we could just pass the decoder straight to `Dictionary::read()` + // below, and it would work. However, that ends up being slower than + // first decompressing the whole thing ahead of time. + let mut decoder = FrameDecoder::new(Cursor::new(DICT)); + let mut data = Vec::new(); + decoder.read_to_end(&mut data).unwrap(); + + Dictionary::read(Cursor::new(&data)).unwrap() + }; + + let exclude_kanji = { + let mut set = HashSet::new(); + for &c in KANJI_FREQ.iter().take(exclude_count) { + set.insert(c); + } + set + }; + + Self { + tokenizer: Tokenizer::new(dict), + exclude_kanji: exclude_kanji, + learner: Learner::new(if learn_mode { 5 } else { usize::MAX }), + } + } + + pub fn add_html_furigana(&mut self, text: &str) -> String { + add_html_furigana_skip_already_ruby( + &text, + &self.tokenizer, + &self.exclude_kanji, + &mut self.learner, + ) + } +} + +fn to_str>(bytes: &B) -> &str { + std::str::from_utf8(&bytes.deref()).unwrap() +} + +/// Like `add_html_furigana()`, but skips text that already has ruby on it, to it doesn't get double-ruby. +fn add_html_furigana_skip_already_ruby( + text: &str, + tokenizer: &Tokenizer, + exclude_kanji: &HashSet, + learner: &mut Learner, +) -> String { + let mut reader = quick_xml::Reader::from_str(text); + + let mut new_text = String::new(); + let mut rubys: i32 = 0; + + loop { + match reader.read_event() { + Err(e) => panic!("Error at position {}: {:?}", reader.error_position(), e), + Ok(Event::Eof) => break, + + Ok(Event::Start(e)) => { + if e.name().into_inner() == b"ruby" { + rubys += 1; + } + write_xml(&mut new_text, &Event::Start(e)); + } + + Ok(Event::End(e)) => { + if e.name().into_inner() == b"ruby" { + rubys -= 1; + } + write_xml(&mut new_text, &Event::End(e)); + } + + Ok(Event::Text(e)) => { + if rubys <= 0 { + new_text.push_str(&add_html_furigana( + to_str(&e), + tokenizer, + exclude_kanji, + learner, + )); + } else { + write_xml(&mut new_text, &Event::Text(e)); + } + } + + // All other events, just re-write them verbatim. + Ok(e) => write_xml(&mut new_text, &e), + } + } + + new_text +} + +/// Takes an xml event and writes it verbatim to the given string. +/// +/// NOTE: really what we want is for the events to provide their byte index range +/// in the original text, so we could just write that, and even double-check that +/// we're not missing anything. But for some reason quick_xml doesn't provide +/// that information. +fn write_xml(text: &mut String, event: &quick_xml::events::Event) { + match event { + Event::Start(e) => { + text.push_str("<"); + text.push_str(to_str(e)); + text.push_str(">"); + } + + Event::End(e) => { + text.push_str(""); + } + + Event::Empty(e) => { + text.push_str("<"); + text.push_str(to_str(e)); + text.push_str("/>"); + } + + Event::CData(e) => { + text.push_str(""); + } + + Event::Comment(e) => { + text.push_str(""); + } + + Event::Decl(e) => { + text.push_str(""); + } + + Event::PI(e) => { + text.push_str(""); + } + + Event::DocType(e) => { + text.push_str(""); + } + + Event::Text(e) => text.push_str(to_str(e)), + + _ => unreachable!(), + } +} + +/// Adds furigana to Japanese text, using html ruby tags. +fn add_html_furigana( + text: &str, + tokenizer: &Tokenizer, + exclude_kanji: &HashSet, + learner: &mut Learner, +) -> String { + let mut worker = tokenizer.new_worker(); + + worker.reset_sentence(text); + worker.tokenize(); + + let mut new_text = String::new(); + for i in 0..worker.num_tokens() { + let t = worker.token(i); + let surface = t.surface(); + + let needs_help = learner.needs_help(surface); + learner.record(surface); + + if !needs_help { + new_text.push_str(surface); + continue; + } + + let kana = t.feature().split(",").nth(1).unwrap(); + + let furigana_text = apply_furigana(surface, kana, exclude_kanji); + + for (surf, furi) in furigana_text.iter() { + if furi.is_empty() { + new_text.push_str(surf); + continue; + } + + new_text.push_str(""); + new_text.push_str(surf); + new_text.push_str(""); + new_text.push_str(furi); + new_text.push_str(""); + } + } + + new_text +} + +/// Returns a segmented list of (surface, furigana) pairs. +/// +/// The furigana component of a pair may be empty, indicating no +/// furigana is needed for that surface element. +fn apply_furigana<'a>( + surface: &'a str, + kana: &'a str, + exclude_kanji: &HashSet, +) -> Vec<(&'a str, &'a str)> { + let mut out = Vec::new(); + + if furigana_unneeded(surface, exclude_kanji) { + out.push((surface, "")); + return out; + } + + let mut surface = surface; + let mut kana = kana; + + // Trim any kana from the start. + { + let mut start_s = 0; + let mut start_k = 0; + for (sc, kc) in surface.chars().zip(kana.chars()) { + if is_equivalent_kana(sc, kc) { + start_s += sc.len_utf8(); + start_k += kc.len_utf8(); + } else { + break; + } + } + out.push((&surface[..start_s], "")); + surface = &surface[start_s..]; + kana = &kana[start_k..]; + } + + // Trim any kana from the end. + { + let mut end_s = surface.len(); + let mut end_k = kana.len(); + for (sc, kc) in surface.chars().rev().zip(kana.chars().rev()) { + if is_equivalent_kana(sc, kc) { + end_s -= sc.len_utf8(); + end_k -= kc.len_utf8(); + } else { + break; + } + } + out.push((&surface[end_s..], "")); + surface = &surface[..end_s]; + kana = &kana[..end_k]; + } + + // Try to uniquely match kana in the middle. + // + // This is just best-effort, and bails in any non-trivial cases. + while let Some((si, sc)) = surface.char_indices().find(|(_, c)| is_kana(*c)) { + // If there's more than one match, bail. + let equivalent_kana_count = kana + .chars() + .map(|c| is_equivalent_kana(c, sc)) + .fold(0usize, |count, hit| count + hit as usize); + if equivalent_kana_count != 1 { + break; + } + + // Find the one match. + let (ki, kc) = kana + .char_indices() + .find(|(_, c)| is_equivalent_kana(sc, *c)) + .unwrap(); + + // Insert the segments. + out.insert(out.len() - 2, (&surface[..si], &kana[..ki])); + out.insert(out.len() - 2, (&surface[si..(si + sc.len_utf8())], "")); + surface = &surface[(si + sc.len_utf8())..]; + kana = &kana[(ki + kc.len_utf8())..]; + } + + // Left over. + out.insert(out.len() - 2, (surface, kana)); + + out.iter().filter(|(s, _)| !s.is_empty()).copied().collect() +} + +/// Due to the way this is used, this isn't meant to be exact, but instead +/// liberal in what it considers equivalent. +fn is_equivalent_kana(a: char, b: char) -> bool { + const PAIRS: &[[char; 2]] = &[['は', 'わ'], ['を', 'お'], ['づ', 'ず'], ['へ', 'え']]; + const VOWELS: &[char] = &['あ', 'い', 'う', 'え', 'お', 'ぁ', 'ぃ', 'ぅ', 'ぇ', 'ぉ']; + + let (a, b) = match (normalize_kana(a), normalize_kana(b)) { + (Some(a), Some(b)) => (a, b), + _ => return false, + }; + + if a == b { + return true; + } + + if a == 'ー' && VOWELS.contains(&b) { + return true; + } + + if b == 'ー' && VOWELS.contains(&a) { + return true; + } + + for &[c, d] in PAIRS { + if (a == c && b == d) || (a == d && b == c) { + return true; + } + } + + false +} + +const HIRAGANA: u32 = 0x3041; +const KATAKANA: u32 = 0x30A1; +const KANA_COUNT: u32 = 0x3097 - HIRAGANA; + +pub fn is_kana(c: char) -> bool { + if c == 'ー' { + return true; + } + + let c = c as u32; + + if c >= HIRAGANA && c < (HIRAGANA + KANA_COUNT) { + return true; + } + + if c >= KATAKANA && c < (KATAKANA + KANA_COUNT) { + return true; + } + + return false; +} + +pub fn normalize_kana(c: char) -> Option { + if !is_kana(c) { + return None; + } + + Some(katakana_to_hiragana(c).unwrap_or(c)) +} + +/// Returns true if furigana defininitely isn't needed. +pub fn furigana_unneeded(text: &str, exclude_kanji: &HashSet) -> bool { + text.chars().all(|c| { + is_kana(c) || c.is_ascii() || c.is_numeric() || c == '々' || exclude_kanji.contains(&c) + }) +} + +pub fn hiragana_to_katakana(c: char) -> Option { + let c = c as u32; + if c >= HIRAGANA && c < (HIRAGANA + KANA_COUNT) { + char::try_from(c + KATAKANA - HIRAGANA).ok() + } else { + None + } +} + +pub fn katakana_to_hiragana(c: char) -> Option { + let c = c as u32; + if c >= KATAKANA && c < (KATAKANA + KANA_COUNT) { + char::try_from(c - KATAKANA + HIRAGANA).ok() + } else { + None + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn apply_furigana_01() { + let surface = "へぇ"; + let kana = "ヘー"; + let pairs = apply_furigana(surface, kana, &HashSet::new()); + + assert_eq!(&[("へぇ", "")], &pairs[..]); + } + + #[test] + fn apply_furigana_02() { + let surface = "へぇー"; + let kana = "ヘー"; + let pairs = apply_furigana(surface, kana, &HashSet::new()); + + assert_eq!(&[("へぇー", "")], &pairs[..]); + } + + #[test] + fn apply_furigana_03() { + let surface = "へ"; + let kana = "え"; + let pairs = apply_furigana(surface, kana, &HashSet::new()); + + assert_eq!(&[("へ", "")], &pairs[..]); + } + + #[test] + fn apply_furigana_04() { + let surface = "食べる"; + let kana = "タベル"; + let pairs = apply_furigana(surface, kana, &HashSet::new()); + + assert_eq!(&[("食", "タ"), ("べる", "")], &pairs[..]); + } + + #[test] + fn apply_furigana_05() { + let surface = "流れ出す"; + let kana = "ながれだす"; + let pairs = apply_furigana(surface, kana, &HashSet::new()); + + assert_eq!( + &[("流", "なが"), ("れ", ""), ("出", "だ"), ("す", "")], + &pairs[..] + ); + } + + #[test] + fn apply_furigana_06() { + let surface = "物の怪"; + let kana = "もののけ"; + let pairs = apply_furigana(surface, kana, &HashSet::new()); + + assert_eq!(&[("物の怪", "もののけ")], &pairs[..]); + } + + #[test] + fn is_equivalent_kana_01() { + assert!(is_equivalent_kana('か', 'カ')); + assert!(is_equivalent_kana('カ', 'か')); + assert!(is_equivalent_kana('ぁ', 'ァ')); + assert!(is_equivalent_kana('ァ', 'ぁ')); + assert!(is_equivalent_kana('は', 'わ')); + assert!(is_equivalent_kana('わ', 'は')); + assert!(is_equivalent_kana('を', 'お')); + assert!(is_equivalent_kana('お', 'を')); + assert!(is_equivalent_kana('づ', 'ず')); + assert!(is_equivalent_kana('ず', 'づ')); + assert!(is_equivalent_kana('ー', 'あ')); + assert!(is_equivalent_kana('あ', 'ー')); + assert!(is_equivalent_kana('ー', 'ぁ')); + assert!(is_equivalent_kana('ぁ', 'ー')); + + assert!(!is_equivalent_kana('は', 'ば')); + assert!(!is_equivalent_kana('ー', 'か')); + assert!(!is_equivalent_kana('た', '食')); + } + + #[test] + fn tokenize_01() { + let gen = FuriganaGenerator::new(0, false); + + let mut worker = gen.tokenizer.new_worker(); + worker.reset_sentence("食べている"); + worker.tokenize(); + + assert_eq!(3, worker.num_tokens()); + assert_eq!("食べ", worker.token(0).surface()); + assert_eq!("動詞-一般,タベ", worker.token(0).feature()); + assert_eq!("て", worker.token(1).surface()); + assert_eq!("助詞-接続助詞,テ", worker.token(1).feature()); + assert_eq!("いる", worker.token(2).surface()); + assert_eq!("動詞-非自立可能,イル", worker.token(2).feature()); + } + + #[test] + fn add_html_furigana_01() { + let mut gen = FuriganaGenerator::new(0, false); + + let text = gen + .add_html_furigana(r#"食べるのはいね!"#); + + assert_eq!( + text, + r#"べるのはいね!"# + ); + } +}