Add option to include pitch accent information with the furigana
parent 7361240e49
commit adb58983a7
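Summary: FuriganaGenerator::new() gains a third parameter, mark_accent. When enabled, each word that receives furigana is prefixed with its pitch accent number(s) in <sup> tags, looked up in a pitch accent dictionary bundled as data/accents.tsv. A minimal usage sketch (illustrative, not part of the diff; behavior taken from the tests in src/lib.rs below):

    // Arguments: exclude_count, use_hiragana, mark_accent.
    let gen = FuriganaGenerator::new(0, false, true);
    let mut session = gen.new_session(false);

    // "額" (reading ヒタイ) has heiban pitch, i.e. accent number 0.
    assert_eq!(
        session.add_html_furigana("額"),
        "<sup>0</sup><ruby>額<rt>ヒタイ</rt></ruby>"
    );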
build.rs | 23 changed lines

@@ -29,11 +29,11 @@ fn main() {
         f.write_all("\n];".as_bytes()).unwrap();
     }
 
-    // Write compressed dictionary to .lz4 file.
+    // Write compressed parsing dictionary to .lz4 file.
     {
         // Read and decompress file from .xz.
         let dict_data = {
-            let f = File::open("data/dictionary/system.dic.xz").unwrap();
+            let f = File::open("data/ipadic-mecab-2_7_0/system.dic.xz").unwrap();
             let mut data = Vec::new();
             lzma_rs::xz_decompress(&mut BufReader::new(f), &mut data).unwrap();
 
@@ -47,4 +47,23 @@ fn main() {
         encoder.write(&dict_data).unwrap();
         encoder.finish().unwrap();
     }
+
+    // Write compressed pitch accent dictionary to .lz4 file.
+    {
+        // Read and decompress file from .xz.
+        let dict_data = {
+            let f = File::open("data/accents.tsv.xz").unwrap();
+            let mut data = Vec::new();
+            lzma_rs::xz_decompress(&mut BufReader::new(f), &mut data).unwrap();
+
+            data
+        };
+
+        // Recompress to .lz4.
+        let dest_path = Path::new(&out_dir).join("accents.tsv.lz4");
+        let f = File::create(dest_path).unwrap();
+        let mut encoder = lz4_flex::frame::FrameEncoder::new(f);
+        encoder.write(&dict_data).unwrap();
+        encoder.finish().unwrap();
+    }
 }
data/accents.tsv | 124138 lines (new file)
(File diff suppressed because it is too large.)
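Each line of accents.tsv, as consumed by build_accent_dictionary() in src/accent.rs below, holds three tab-separated fields: the word, its kana reading (an empty field means the word itself is the reading), and a comma-separated list of pitch accent numbers. A hypothetical entry for illustration (the real file is suppressed in this view):

    箸	はし	1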
data/accents.tsv.xz (new binary file; not shown)
(deleted file: UniDic license text; path not shown in this view)

@@ -1,31 +0,0 @@
-Copyright (c) 2011-2021, The UniDic Consortium
-Copyright (c) 2023, LegalOn Technologies, Inc.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-
- * Redistributions of source code must retain the above copyright
-   notice, this list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright
-   notice, this list of conditions and the following disclaimer in the
-   documentation and/or other materials provided with the
-   distribution.
-
- * Neither the name of the UniDic Consortium nor the names of its
-   contributors may be used to endorse or promote products derived
-   from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
(deleted file: UniDic NOTICE; path not shown in this view)

@@ -1,7 +0,0 @@
-This software includes a binary version of data from
-
-https://clrd.ninjal.ac.jp/unidic_archive/cwj/3.1.1/unidic-cwj-3.1.1-full.zip
-
-where the costs and connection ids are retrained using CORE data in BCCWJ (except the PN category)
-
-https://clrd.ninjal.ac.jp/bccwj/.

(deleted binary file; not shown)
data/ipadic-mecab-2_7_0/COPYING | 73 lines (new file)

@@ -0,0 +1,73 @@
+Copyright 2000, 2001, 2002, 2003 Nara Institute of Science
+and Technology.
+Copyright 2023, LegalOn Technologies, Inc.
+All Rights Reserved.
+
+Use, reproduction, and distribution of this software is permitted.
+Any copy of this software, whether in its original form or modified,
+must include both the above copyright notice and the following
+paragraphs.
+
+Nara Institute of Science and Technology (NAIST),
+the copyright holders, disclaims all warranties with regard to this
+software, including all implied warranties of merchantability and
+fitness, in no event shall NAIST be liable for
+any special, indirect or consequential damages or any damages
+whatsoever resulting from loss of use, data or profits, whether in an
+action of contract, negligence or other tortuous action, arising out
+of or in connection with the use or performance of this software.
+
+A large portion of the dictionary entries
+originate from ICOT Free Software. The following conditions for ICOT
+Free Software applies to the current dictionary as well.
+
+Each User may also freely distribute the Program, whether in its
+original form or modified, to any third party or parties, PROVIDED
+that the provisions of Section 3 ("NO WARRANTY") will ALWAYS appear
+on, or be attached to, the Program, which is distributed substantially
+in the same form as set out herein and that such intended
+distribution, if actually made, will neither violate or otherwise
+contravene any of the laws and regulations of the countries having
+jurisdiction over the User or the intended distribution itself.
+
+NO WARRANTY
+
+The program was produced on an experimental basis in the course of the
+research and development conducted during the project and is provided
+to users as so produced on an experimental basis. Accordingly, the
+program is provided without any warranty whatsoever, whether express,
+implied, statutory or otherwise. The term "warranty" used herein
+includes, but is not limited to, any warranty of the quality,
+performance, merchantability and fitness for a particular purpose of
+the program and the nonexistence of any infringement or violation of
+any right of any third party.
+
+Each user of the program will agree and understand, and be deemed to
+have agreed and understood, that there is no warranty whatsoever for
+the program and, accordingly, the entire risk arising from or
+otherwise connected with the program is assumed by the user.
+
+Therefore, neither ICOT, the copyright holder, or any other
+organization that participated in or was otherwise related to the
+development of the program and their respective officials, directors,
+officers and other employees shall be held liable for any and all
+damages, including, without limitation, general, special, incidental
+and consequential damages, arising out of or otherwise in connection
+with the use or inability to use the program or any product, material
+or result produced or otherwise obtained by using the program,
+regardless of whether they have been advised of, or otherwise had
+knowledge of, the possibility of such damages at any time during the
+project or thereafter. Each user will be deemed to have agreed to the
+foregoing by his or her commencement of use of the program. The term
+"use" as used herein includes, but is not limited to, the use,
+modification, copying and distribution of the program and the
+production of secondary products from the program.
+
+In the case where the program, whether in its original form or
+modified, was distributed or delivered to or received by a user from
+any person, organization or entity other than ICOT, unless it makes or
+grants independently of ICOT any specific warranty to the user in
+writing, such person, organization or entity, will also be exempted
+from and not be held liable to the user for any such damages as noted
+above as far as the program is concerned.
data/ipadic-mecab-2_7_0/NOTICE | 7 lines (new file)

@@ -0,0 +1,7 @@
+This software includes a binary version of data from
+
+http://jaist.dl.sourceforge.net/project/mecab/mecab-ipadic/2.7.0-20070801/mecab-ipadic-2.7.0-20070801.tar.gz,
+
+where the connection ids are remapped using CORE data in BCCWJ (except the PN category)
+
+https://clrd.ninjal.ac.jp/bccwj/.
data/ipadic-mecab-2_7_0/system.dic.xz (new binary file; not shown)
src/accent.rs | 61 lines (new file)

@@ -0,0 +1,61 @@
+use std::{
+    borrow::Cow,
+    io::{Cursor, Read},
+};
+
+use fnv::FnvHashMap;
+use lz4_flex::frame::FrameDecoder;
+
+// Pitch accent dictionary.
+const ACCENT: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/accents.tsv.lz4"));
+
+#[derive(Debug)]
+pub struct AccentDict {
+    table: FnvHashMap<(Cow<'static, str>, Cow<'static, str>), Vec<u8>>,
+}
+
+pub fn build_accent_dictionary() -> AccentDict {
+    let text = {
+        let mut decoder = FrameDecoder::new(Cursor::new(ACCENT));
+        let mut text = String::new();
+        decoder.read_to_string(&mut text).unwrap();
+
+        text
+    };
+
+    let mut table = FnvHashMap::default();
+    for line in text.lines() {
+        let items: Vec<_> = line.split("\t").map(|t| t.trim()).collect();
+
+        let word = items[0];
+        let kana = if items[1].is_empty() {
+            items[0]
+        } else {
+            items[1]
+        };
+        let pitches = items[2]
+            .split(",")
+            .filter_map(|p| p.parse::<u8>().ok())
+            .collect();
+
+        table.insert(
+            (
+                Cow::Owned(word.into()),
+                Cow::Owned(crate::hiragana_to_katakana_string(kana)),
+            ),
+            pitches,
+        );
+    }
+
+    AccentDict { table: table }
+}
+
+impl AccentDict {
+    pub fn get<'a>(&'a self, word: &'a str, kana: &'a str) -> &'a [u8] {
+        if let Some(p) = self.table.get(&(Cow::from(word), Cow::from(kana))) {
+            &p[..]
+        } else {
+            &[]
+        }
+    }
+}
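A note on the table keys: build_accent_dictionary() converts the kana field to katakana via crate::hiragana_to_katakana_string(), so AccentDict::get() must be called with a katakana reading. A rough lookup sketch (illustrative only):

    let dict = build_accent_dictionary();
    // Returns the known pitch accent numbers, or an empty slice for unknown pairs.
    let pitches: &[u8] = dict.get("食べる", "タベル");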
src/lib.rs | 178 changed lines

@@ -1,3 +1,4 @@
+mod accent;
 mod learner;
 
 use std::{
@@ -10,32 +11,32 @@ use lz4_flex::frame::FrameDecoder;
 use quick_xml::events::Event;
 use vibrato::{Dictionary, Tokenizer};
 
+use accent::AccentDict;
 use learner::Learner;
 
 // Include KANJI_FREQ, a frequency-ordered array of kanji characters.
 include!(concat!(env!("OUT_DIR"), "/kanji_freq_inc.rs"));
 
+// Parsing dictionary.
 const DICT: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/system.dic.lz4"));
 
 /// A list of words that the tokenizer insists on using the less common reading
 /// for, with the more common reading that should be substituted.
 ///
-/// (surface, feature, substitute_feature)
+/// (surface, kana, substitute_kana)
 const COMMON_SUBS: &[(&str, &str, &str)] = &[
-    ("額", "名詞-普通名詞-一般,ガク", "名詞-普通名詞-一般,ヒタイ"),
-    (
-        "他",
-        "名詞-普通名詞-副詞可能,タ",
-        "名詞-普通名詞-副詞可能,ホカ",
-    ),
-    ("私", "代名詞,ワタクシ", "代名詞,ワタシ"),
+    ("額", "ガク", "ヒタイ"),
+    ("他", "タ", "ホカ"),
+    ("私", "ワタクシ", "ワタシ"),
 ];
 
 pub struct FuriganaGenerator {
     tokenizer: Tokenizer,
+    accent_dict: AccentDict,
     exclude_kanji: FnvHashSet<char>,
     subs: FnvHashMap<(Cow<'static, str>, Cow<'static, str>), String>,
     use_hiragana: bool,
+    mark_accent: bool,
 }
 
 impl FuriganaGenerator {
@@ -43,7 +44,7 @@ impl FuriganaGenerator {
     // Specifically, words made up *entirely* of those kanji will be excluded.
     // If a word has some kanji that aren't in that set, even if it also has
     // some that are, it will still get furigana.
-    pub fn new(exclude_count: usize, use_hiragana: bool) -> Self {
+    pub fn new(exclude_count: usize, use_hiragana: bool, mark_accent: bool) -> Self {
         let dict = {
             // Note: we could just pass the decoder straight to `Dictionary::read()`
             // below, and it would work. However, that ends up being slower than
@@ -73,29 +74,35 @@ impl FuriganaGenerator {
 
         Self {
             tokenizer: Tokenizer::new(dict),
+            accent_dict: accent::build_accent_dictionary(),
             exclude_kanji: exclude_kanji,
             subs: subs,
             use_hiragana: use_hiragana,
+            mark_accent: mark_accent,
         }
     }
 
     pub fn new_session(&self, learn_mode: bool) -> Session<'_> {
         Session {
             tokenizer: &self.tokenizer,
+            accent_dict: &self.accent_dict,
             exclude_kanji: &self.exclude_kanji,
             subs: &self.subs,
             learner: Learner::new(if learn_mode { 3 } else { usize::MAX }),
             use_hiragana: self.use_hiragana,
+            mark_accent: self.mark_accent,
         }
     }
 }
 
 pub struct Session<'a> {
     tokenizer: &'a Tokenizer,
+    accent_dict: &'a AccentDict,
     exclude_kanji: &'a FnvHashSet<char>,
     subs: &'a FnvHashMap<(Cow<'a, str>, Cow<'a, str>), String>,
     learner: Learner,
     use_hiragana: bool,
+    mark_accent: bool,
 }
 
 impl<'a> Session<'a> {
@@ -116,10 +123,12 @@ impl<'a> Session<'a> {
         add_html_furigana_skip_already_ruby(
             &text,
             &self.tokenizer,
+            &self.accent_dict,
             &self.exclude_kanji,
             &self.subs,
             &mut self.learner,
             self.use_hiragana,
+            self.mark_accent,
         )
     }
 }
@@ -132,10 +141,12 @@ fn to_str<B: std::ops::Deref<Target = [u8]>>(bytes: &B) -> &str {
 fn add_html_furigana_skip_already_ruby(
     text: &str,
     tokenizer: &Tokenizer,
+    accent_dict: &AccentDict,
     exclude_kanji: &FnvHashSet<char>,
     subs: &FnvHashMap<(Cow<str>, Cow<str>), String>,
     learner: &mut Learner,
     use_hiragana: bool,
+    mark_accent: bool,
 ) -> String {
     let mut reader = quick_xml::Reader::from_str(text);
 
@@ -171,10 +182,12 @@ fn add_html_furigana_skip_already_ruby(
             new_text.push_str(&add_html_furigana(
                 to_str(&e),
                 tokenizer,
+                accent_dict,
                 exclude_kanji,
                 subs,
                 learner,
                 use_hiragana,
+                mark_accent,
             ));
         } else {
             write_xml(&mut new_text, &Event::Text(e));
@@ -255,10 +268,12 @@ fn write_xml(text: &mut String, event: &quick_xml::events::Event) {
 fn add_html_furigana(
     text: &str,
     tokenizer: &Tokenizer,
+    accent_dict: &AccentDict,
     exclude_kanji: &FnvHashSet<char>,
     subs: &FnvHashMap<(Cow<str>, Cow<str>), String>,
     learner: &mut Learner,
     use_hiragana: bool,
+    mark_accent: bool,
 ) -> String {
     let mut worker = tokenizer.new_worker();
 
@@ -268,15 +283,28 @@ fn add_html_furigana(
     let mut new_text = String::new();
     for i in 0..worker.num_tokens() {
         let t = worker.token(i);
-        let (surface, feature) = {
+        let (surface, kana, pitches) = {
             let surface = t.surface();
             let feature = t.feature();
 
-            if let Some(sub_feature) = subs.get(&(Cow::from(surface), Cow::from(feature))) {
-                (surface, sub_feature.as_str())
+            let kana_1 = feature.rsplit(",").nth(0).unwrap();
+            let kana_2 = feature.rsplit(",").nth(1).unwrap();
+            let word = feature.rsplit(",").nth(2).unwrap();
+
+            let (kana, pkana) =
+                if let Some(sub_kana) = subs.get(&(Cow::from(surface), Cow::from(kana_1))) {
+                    (sub_kana.as_str(), sub_kana.as_str())
+                } else {
+                    (kana_1, kana_2)
+                };
+
+            let pitches = if mark_accent {
+                accent_dict.get(word, pkana)
             } else {
-                (surface, feature)
-            }
+                &[]
+            };
+
+            (surface, kana, pitches)
         };
 
         let needs_help = learner.needs_help(surface);
@@ -287,28 +315,33 @@ fn add_html_furigana(
             continue;
         }
 
-        let kana = {
-            let kana = feature.split(",").nth(1).unwrap();
-            if use_hiragana {
-                katakana_to_hiragana_string(kana)
-            } else {
-                kana.into()
-            }
+        let kana = if use_hiragana {
+            katakana_to_hiragana_string(kana)
+        } else {
+            kana.into()
         };
 
         let furigana_text = apply_furigana(surface, &kana, exclude_kanji);
 
-        for (surf, furi) in furigana_text.iter() {
-            if furi.is_empty() {
-                new_text.push_str(surf);
-                continue;
+        if furigana_text.is_empty() {
+            new_text.push_str(surface);
+        } else {
+            for pitch in pitches {
+                new_text.push_str(&format!("<sup>{}</sup>", pitch));
             }
 
-            new_text.push_str("<ruby>");
-            new_text.push_str(surf);
-            new_text.push_str("<rt>");
-            new_text.push_str(furi);
-            new_text.push_str("</rt></ruby>");
+            for (surf, furi) in furigana_text.iter() {
+                if furi.is_empty() {
+                    new_text.push_str(surf);
+                    continue;
+                }
+
+                new_text.push_str("<ruby>");
+                new_text.push_str(surf);
+                new_text.push_str("<rt>");
+                new_text.push_str(furi);
+                new_text.push_str("</rt></ruby>");
+            }
         }
     }
@@ -326,9 +359,8 @@ fn apply_furigana<'a>(
 ) -> Vec<(&'a str, &'a str)> {
     let mut out = Vec::new();
 
-    if furigana_unneeded(surface, exclude_kanji) {
-        out.push((surface, ""));
-        return out;
+    if furigana_unneeded(surface, exclude_kanji) || !is_kana_str(kana) {
+        return Vec::new();
     }
 
     let mut surface = surface;
@@ -454,6 +486,10 @@ pub fn is_kana(c: char) -> bool {
     return false;
 }
 
+pub fn is_kana_str(text: &str) -> bool {
+    text.chars().all(|c| is_kana(c))
+}
+
 pub fn normalize_kana(c: char) -> Option<char> {
     if !is_kana(c) {
         return None;
@@ -497,6 +533,16 @@ pub fn katakana_to_hiragana_string(text: &str) -> String {
     new_text
 }
 
+pub fn hiragana_to_katakana_string(text: &str) -> String {
+    let mut new_text = String::new();
+
+    for c in text.chars() {
+        new_text.push(hiragana_to_katakana(c).unwrap_or(c));
+    }
+
+    new_text
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -505,7 +551,12 @@ mod tests {
     pub fn get_furigana_gen() -> &'static FuriganaGenerator {
         use std::sync::OnceLock;
         static FURIGEN: OnceLock<FuriganaGenerator> = OnceLock::new();
-        FURIGEN.get_or_init(|| FuriganaGenerator::new(0, false))
+        FURIGEN.get_or_init(|| FuriganaGenerator::new(0, false, false))
+    }
+    pub fn get_furigana_gen_with_accent() -> &'static FuriganaGenerator {
+        use std::sync::OnceLock;
+        static FURIGEN: OnceLock<FuriganaGenerator> = OnceLock::new();
+        FURIGEN.get_or_init(|| FuriganaGenerator::new(0, false, true))
     }
 
     #[test]
@@ -514,7 +565,7 @@ mod tests {
         let kana = "ヘー";
         let pairs = apply_furigana(surface, kana, &FnvHashSet::default());
 
-        assert_eq!(&[("へぇ", "")], &pairs[..]);
+        assert!(pairs.is_empty());
     }
 
     #[test]
@@ -523,7 +574,7 @@ mod tests {
         let kana = "ヘー";
         let pairs = apply_furigana(surface, kana, &FnvHashSet::default());
 
-        assert_eq!(&[("へぇー", "")], &pairs[..]);
+        assert!(pairs.is_empty());
     }
 
     #[test]
@@ -532,7 +583,7 @@ mod tests {
         let kana = "え";
         let pairs = apply_furigana(surface, kana, &FnvHashSet::default());
 
-        assert_eq!(&[("へ", "")], &pairs[..]);
+        assert!(pairs.is_empty());
    }
 
    #[test]
@@ -606,39 +657,80 @@ mod tests {
 
         assert_eq!(3, worker.num_tokens());
         assert_eq!("食べ", worker.token(0).surface());
-        assert_eq!("動詞-一般,タベ", worker.token(0).feature());
+        assert_eq!(
+            "動詞,自立,*,*,一段,連用形,食べる,タベ,タベ",
+            worker.token(0).feature()
+        );
         assert_eq!("て", worker.token(1).surface());
-        assert_eq!("助詞-接続助詞,テ", worker.token(1).feature());
+        assert_eq!("助詞,接続助詞,*,*,*,*,て,テ,テ", worker.token(1).feature());
         assert_eq!("いる", worker.token(2).surface());
-        assert_eq!("動詞-非自立可能,イル", worker.token(2).feature());
+        assert_eq!(
+            "動詞,非自立,*,*,一段,基本形,いる,イル,イル",
+            worker.token(2).feature()
+        );
+    }
+
+    #[test]
+    fn tokenize_02() {
+        let mut worker = get_furigana_gen().tokenizer.new_worker();
+
+        worker.reset_sentence("そう");
+        worker.tokenize();
+
+        assert_eq!(1, worker.num_tokens());
+        assert_eq!(
+            "副詞,助詞類接続,*,*,*,*,そう,ソウ,ソー",
+            worker.token(0).feature()
+        );
     }
 
     #[test]
     fn add_html_furigana_01() {
         let mut gen = get_furigana_gen().new_session(false);
+        let mut gen_accent = get_furigana_gen_with_accent().new_session(false);
 
-        let text = gen
-            .add_html_furigana(r#"<sup class="食う">食べる</sup>のは<ruby>良</ruby>いね!<hi />"#);
+        let text = r#"<sup class="食う">食べる</sup>のは<ruby>良</ruby>いね!<hi />"#;
+        let furi_1 = gen.add_html_furigana(text);
+        let furi_2 = gen_accent.add_html_furigana(text);
 
         assert_eq!(
-            text,
+            furi_1,
             r#"<sup class="食う"><ruby>食<rt>タ</rt></ruby>べる</sup>のは<ruby>良</ruby>いね!<hi />"#
         );
+        assert_eq!(
+            furi_2,
+            r#"<sup class="食う"><sup>2</sup><ruby>食<rt>タ</rt></ruby>べる</sup>のは<ruby>良</ruby>いね!<hi />"#
+        );
     }
 
     // Testing custom substitutions.
     #[test]
     fn add_html_furigana_02() {
         let mut gen = get_furigana_gen().new_session(false);
+        let mut gen_accent = get_furigana_gen_with_accent().new_session(false);
 
         assert_eq!(
             gen.add_html_furigana("額"),
             "<ruby>額<rt>ヒタイ</rt></ruby>"
         );
+        assert_eq!(
+            gen_accent.add_html_furigana("額"),
+            "<sup>0</sup><ruby>額<rt>ヒタイ</rt></ruby>"
+        );
 
         assert_eq!(gen.add_html_furigana("他"), "<ruby>他<rt>ホカ</rt></ruby>");
+        assert_eq!(
+            gen_accent.add_html_furigana("他"),
+            "<sup>0</sup><ruby>他<rt>ホカ</rt></ruby>"
+        );
 
         assert_eq!(
             gen.add_html_furigana("私"),
             "<ruby>私<rt>ワタシ</rt></ruby>"
         );
+        assert_eq!(
+            gen_accent.add_html_furigana("私"),
+            "<sup>0</sup><ruby>私<rt>ワタシ</rt></ruby>"
+        );
     }
 }
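For reference, the markup produced when mark_accent is enabled, as exercised by add_html_furigana_01 and add_html_furigana_02 above: one <sup>N</sup> element per known pitch accent number is emitted immediately before the <ruby> group, e.g. <sup>2</sup><ruby>食<rt>タ</rt></ruby>べる. Words that receive no furigana also get no accent marks, since the pitch numbers are only written in the non-empty furigana_text branch of add_html_furigana().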