Add option to include pitch accent information with the furigana

2024-09-18 12:10:22 +02:00 · 2024-09-18 12:10:22 +02:00 · adb58983a7
commit adb58983a7
parent 7361240e49
11 changed files with 124435 additions and 83 deletions
--- a/build.rs
+++ b/build.rs
@ -29,11 +29,11 @@ fn main() {
        f.write_all("\n];".as_bytes()).unwrap();
    }

-    // Write compressed dictionary to .lz4 file.
+    // Write compressed parsing dictionary to .lz4 file.
    {
        // Read and decompress file from .xz.
        let dict_data = {
-            let f = File::open("data/dictionary/system.dic.xz").unwrap();
+            let f = File::open("data/ipadic-mecab-2_7_0/system.dic.xz").unwrap();
            let mut data = Vec::new();
            lzma_rs::xz_decompress(&mut BufReader::new(f), &mut data).unwrap();

@ -47,4 +47,23 @@ fn main() {
        encoder.write(&dict_data).unwrap();
        encoder.finish().unwrap();
    }
+
+    // Write compressed pitch accent dictionary to .lz4 file.
+    {
+        // Read and decompress file from .xz.
+        let dict_data = {
+            let f = File::open("data/accents.tsv.xz").unwrap();
+            let mut data = Vec::new();
+            lzma_rs::xz_decompress(&mut BufReader::new(f), &mut data).unwrap();
+
+            data
+        };
+
+        // Recompress to .lz4 (`write_all`: a bare `write` may accept only part of the buffer).
+        let dest_path = Path::new(&out_dir).join("accents.tsv.lz4");
+        let f = File::create(dest_path).unwrap();
+        let mut encoder = lz4_flex::frame::FrameEncoder::new(f);
+        encoder.write_all(&dict_data).unwrap();
+        encoder.finish().unwrap();
+    }
 }
--- a/data/accents.tsv
+++ b/data/accents.tsv
--- a/data/accents.tsv.xz
+++ b/data/accents.tsv.xz
--- a/data/dictionary/BSD
+++ b/data/dictionary/BSD
@ -1,31 +0,0 @@
-Copyright (c) 2011-2021, The UniDic Consortium
-Copyright (c) 2023, LegalOn Technologies, Inc.
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-
- * Redistributions of source code must retain the above copyright
-   notice, this list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright
-   notice, this list of conditions and the following disclaimer in the
-   documentation and/or other materials provided with the
-   distribution.
-
- * Neither the name of the UniDic Consortium nor the names of its
-   contributors may be used to endorse or promote products derived
-   from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/data/dictionary/NOTICE
+++ b/data/dictionary/NOTICE
@ -1,7 +0,0 @@
-This software includes a binary version of data from
-
-  https://clrd.ninjal.ac.jp/unidic_archive/cwj/3.1.1/unidic-cwj-3.1.1-full.zip
-
-where the costs and connection ids are retrained using CORE data in BCCWJ (except the PN category)
-
-  https://clrd.ninjal.ac.jp/bccwj/.
--- a/data/dictionary/system.dic.xz
+++ b/data/dictionary/system.dic.xz
--- a/data/ipadic-mecab-2_7_0/COPYING
+++ b/data/ipadic-mecab-2_7_0/COPYING
@ -0,0 +1,73 @@
+Copyright 2000, 2001, 2002, 2003 Nara Institute of Science
+and Technology.
+Copyright 2023, LegalOn Technologies, Inc.
+All Rights Reserved.
+
+Use, reproduction, and distribution of this software is permitted.
+Any copy of this software, whether in its original form or modified,
+must include both the above copyright notice and the following
+paragraphs.
+
+Nara Institute of Science and Technology (NAIST),
+the copyright holders, disclaims all warranties with regard to this
+software, including all implied warranties of merchantability and
+fitness, in no event shall NAIST be liable for
+any special, indirect or consequential damages or any damages
+whatsoever resulting from loss of use, data or profits, whether in an
+action of contract, negligence or other tortuous action, arising out
+of or in connection with the use or performance of this software.
+
+A large portion of the dictionary entries
+originate from ICOT Free Software.  The following conditions for ICOT
+Free Software applies to the current dictionary as well.
+
+Each User may also freely distribute the Program, whether in its
+original form or modified, to any third party or parties, PROVIDED
+that the provisions of Section 3 ("NO WARRANTY") will ALWAYS appear
+on, or be attached to, the Program, which is distributed substantially
+in the same form as set out herein and that such intended
+distribution, if actually made, will neither violate or otherwise
+contravene any of the laws and regulations of the countries having
+jurisdiction over the User or the intended distribution itself.
+
+NO WARRANTY
+
+The program was produced on an experimental basis in the course of the
+research and development conducted during the project and is provided
+to users as so produced on an experimental basis.  Accordingly, the
+program is provided without any warranty whatsoever, whether express,
+implied, statutory or otherwise.  The term "warranty" used herein
+includes, but is not limited to, any warranty of the quality,
+performance, merchantability and fitness for a particular purpose of
+the program and the nonexistence of any infringement or violation of
+any right of any third party.
+
+Each user of the program will agree and understand, and be deemed to
+have agreed and understood, that there is no warranty whatsoever for
+the program and, accordingly, the entire risk arising from or
+otherwise connected with the program is assumed by the user.
+
+Therefore, neither ICOT, the copyright holder, or any other
+organization that participated in or was otherwise related to the
+development of the program and their respective officials, directors,
+officers and other employees shall be held liable for any and all
+damages, including, without limitation, general, special, incidental
+and consequential damages, arising out of or otherwise in connection
+with the use or inability to use the program or any product, material
+or result produced or otherwise obtained by using the program,
+regardless of whether they have been advised of, or otherwise had
+knowledge of, the possibility of such damages at any time during the
+project or thereafter.  Each user will be deemed to have agreed to the
+foregoing by his or her commencement of use of the program.  The term
+"use" as used herein includes, but is not limited to, the use,
+modification, copying and distribution of the program and the
+production of secondary products from the program.
+
+In the case where the program, whether in its original form or
+modified, was distributed or delivered to or received by a user from
+any person, organization or entity other than ICOT, unless it makes or
+grants independently of ICOT any specific warranty to the user in
+writing, such person, organization or entity, will also be exempted
+from and not be held liable to the user for any such damages as noted
+above as far as the program is concerned.
+÷÷
--- a/data/ipadic-mecab-2_7_0/NOTICE
+++ b/data/ipadic-mecab-2_7_0/NOTICE
@ -0,0 +1,7 @@
+This software includes a binary version of data from
+
+  http://jaist.dl.sourceforge.net/project/mecab/mecab-ipadic/2.7.0-20070801/mecab-ipadic-2.7.0-20070801.tar.gz,
+
+where the connection ids are remapped using CORE data in BCCWJ (except the PN category)
+
+  https://clrd.ninjal.ac.jp/bccwj/.
--- a/data/ipadic-mecab-2_7_0/system.dic.xz
+++ b/data/ipadic-mecab-2_7_0/system.dic.xz
--- a/src/accent.rs
+++ b/src/accent.rs
@ -0,0 +1,61 @@
+use std::{
+    borrow::Cow,
+    io::{Cursor, Read},
+};
+
+use fnv::FnvHashMap;
+use lz4_flex::frame::FrameDecoder;
+
+// Pitch accent dictionary: the `accents.tsv.lz4` file from `OUT_DIR`, embedded at compile time.
+const ACCENT: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/accents.tsv.lz4"));
+
+/// Lookup table of pitch accent data, keyed by a `(word, kana)` pair.
+///
+/// `build_accent_dictionary` fills it with keys whose kana component has been passed through
+/// `hiragana_to_katakana_string`; each value is the list of `u8` pitch values parsed for
+/// that entry.
+#[derive(Debug)]
+pub struct AccentDict {
+    table: FnvHashMap<(Cow<'static, str>, Cow<'static, str>), Vec<u8>>,
+}
+
+/// Decompresses the embedded pitch accent table and builds the lookup dictionary.
+///
+/// Each row of the table is tab-separated: word, kana reading, and a comma-separated list of
+/// pitch values.  An empty reading column means the word itself is used as the reading.
+/// Readings are passed through `hiragana_to_katakana_string` before being stored in the key.
+///
+/// Rows with fewer than three columns are skipped, and pitch entries that do not parse as
+/// `u8` are dropped.  If the same `(word, reading)` pair occurs more than once, the last row
+/// wins.
+///
+/// # Panics
+///
+/// Panics if the embedded data cannot be decoded as LZ4-framed UTF-8 text.
+pub fn build_accent_dictionary() -> AccentDict {
+    let text = {
+        let mut decoder = FrameDecoder::new(Cursor::new(ACCENT));
+        let mut text = String::new();
+        decoder
+            .read_to_string(&mut text)
+            .expect("embedded accent dictionary should be valid LZ4-framed UTF-8");
+
+        text
+    };
+
+    let mut table = FnvHashMap::default();
+    for line in text.lines() {
+        let mut items = line.split('\t').map(str::trim);
+
+        // Skip blank or truncated rows rather than panicking on a missing column.
+        let (Some(word), Some(kana), Some(pitches)) = (items.next(), items.next(), items.next())
+        else {
+            continue;
+        };
+
+        // An empty reading column means the word is already its own reading.
+        let kana = if kana.is_empty() { word } else { kana };
+
+        let pitches: Vec<u8> = pitches
+            .split(',')
+            .filter_map(|p| p.parse().ok())
+            .collect();
+
+        table.insert(
+            (
+                Cow::Owned(word.into()),
+                Cow::Owned(crate::hiragana_to_katakana_string(kana)),
+            ),
+            pitches,
+        );
+    }
+
+    AccentDict { table }
+}
+
+impl AccentDict {
+    /// Returns the pitch values stored for the `(word, kana)` pair.
+    ///
+    /// Both strings are compared exactly against the stored key, so `kana` must already be in
+    /// the form the dictionary was built with.  An unknown pair yields an empty slice.
+    pub fn get<'a>(&'a self, word: &'a str, kana: &'a str) -> &'a [u8] {
+        let key = (Cow::from(word), Cow::from(kana));
+        match self.table.get(&key) {
+            Some(pitches) => pitches.as_slice(),
+            None => &[],
+        }
+    }
+}
--- a/src/lib.rs
+++ b/src/lib.rs
@ -1,3 +1,4 @@
+mod accent;
 mod learner;

 use std::{
@ -10,32 +11,32 @@ use lz4_flex::frame::FrameDecoder;
 use quick_xml::events::Event;
 use vibrato::{Dictionary, Tokenizer};

+use accent::AccentDict;
 use learner::Learner;

 // Include KANJI_FREQ, a frequency-ordered array of kanji characters.
 include!(concat!(env!("OUT_DIR"), "/kanji_freq_inc.rs"));

+// Parsing dictionary.
 const DICT: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/system.dic.lz4"));

 /// A list of words that the tokenizer insists on using the less common reading
 /// for, with the more common reading that should be substituted.
 ///
-/// (surface, feature, substitute_feature)
+/// (surface, kana, substitute_kana)
 const COMMON_SUBS: &[(&str, &str, &str)] = &[
-    ("額", "名詞-普通名詞-一般,ガク", "名詞-普通名詞-一般,ヒタイ"),
-    (
-        "他",
-        "名詞-普通名詞-副詞可能,タ",
-        "名詞-普通名詞-副詞可能,ホカ",
-    ),
-    ("私", "代名詞,ワタクシ", "代名詞,ワタシ"),
+    ("額", "ガク", "ヒタイ"),
+    ("他", "タ", "ホカ"),
+    ("私", "ワタクシ", "ワタシ"),
 ];

 pub struct FuriganaGenerator {
    tokenizer: Tokenizer,
+    accent_dict: AccentDict,
    exclude_kanji: FnvHashSet<char>,
    subs: FnvHashMap<(Cow<'static, str>, Cow<'static, str>), String>,
    use_hiragana: bool,
+    mark_accent: bool,
 }

 impl FuriganaGenerator {
@ -43,7 +44,7 @@ impl FuriganaGenerator {
    // Specifically, words made up *entirely* of those kanji will be excluded.
    // If a word has some kanji that aren't in that set, even if it also has
    // some that are, it will still get furigana.
-    pub fn new(exclude_count: usize, use_hiragana: bool) -> Self {
+    pub fn new(exclude_count: usize, use_hiragana: bool, mark_accent: bool) -> Self {
        let dict = {
            // Note: we could just pass the decoder straight to `Dictionary::read()`
            // below, and it would work.  However, that ends up being slower than
@ -73,29 +74,35 @@ impl FuriganaGenerator {

        Self {
            tokenizer: Tokenizer::new(dict),
+            accent_dict: accent::build_accent_dictionary(),
            exclude_kanji: exclude_kanji,
            subs: subs,
            use_hiragana: use_hiragana,
+            mark_accent: mark_accent,
        }
    }

    pub fn new_session(&self, learn_mode: bool) -> Session<'_> {
        Session {
            tokenizer: &self.tokenizer,
+            accent_dict: &self.accent_dict,
            exclude_kanji: &self.exclude_kanji,
            subs: &self.subs,
            learner: Learner::new(if learn_mode { 3 } else { usize::MAX }),
            use_hiragana: self.use_hiragana,
+            mark_accent: self.mark_accent,
        }
    }
 }

 pub struct Session<'a> {
    tokenizer: &'a Tokenizer,
+    accent_dict: &'a AccentDict,
    exclude_kanji: &'a FnvHashSet<char>,
    subs: &'a FnvHashMap<(Cow<'a, str>, Cow<'a, str>), String>,
    learner: Learner,
    use_hiragana: bool,
+    mark_accent: bool,
 }

 impl<'a> Session<'a> {
@ -116,10 +123,12 @@ impl<'a> Session<'a> {
        add_html_furigana_skip_already_ruby(
            &text,
            &self.tokenizer,
+            &self.accent_dict,
            &self.exclude_kanji,
            &self.subs,
            &mut self.learner,
            self.use_hiragana,
+            self.mark_accent,
        )
    }
 }
@ -132,10 +141,12 @@ fn to_str<B: std::ops::Deref<Target = [u8]>>(bytes: &B) -> &str {
 fn add_html_furigana_skip_already_ruby(
    text: &str,
    tokenizer: &Tokenizer,
+    accent_dict: &AccentDict,
    exclude_kanji: &FnvHashSet<char>,
    subs: &FnvHashMap<(Cow<str>, Cow<str>), String>,
    learner: &mut Learner,
    use_hiragana: bool,
+    mark_accent: bool,
 ) -> String {
    let mut reader = quick_xml::Reader::from_str(text);

@ -171,10 +182,12 @@ fn add_html_furigana_skip_already_ruby(
                    new_text.push_str(&add_html_furigana(
                        to_str(&e),
                        tokenizer,
+                        accent_dict,
                        exclude_kanji,
                        subs,
                        learner,
                        use_hiragana,
+                        mark_accent,
                    ));
                } else {
                    write_xml(&mut new_text, &Event::Text(e));
@ -255,10 +268,12 @@ fn write_xml(text: &mut String, event: &quick_xml::events::Event) {
 fn add_html_furigana(
    text: &str,
    tokenizer: &Tokenizer,
+    accent_dict: &AccentDict,
    exclude_kanji: &FnvHashSet<char>,
    subs: &FnvHashMap<(Cow<str>, Cow<str>), String>,
    learner: &mut Learner,
    use_hiragana: bool,
+    mark_accent: bool,
 ) -> String {
    let mut worker = tokenizer.new_worker();

@ -268,15 +283,28 @@ fn add_html_furigana(
    let mut new_text = String::new();
    for i in 0..worker.num_tokens() {
        let t = worker.token(i);
-        let (surface, feature) = {
+        let (surface, kana, pitches) = {
            let surface = t.surface();
            let feature = t.feature();

-            if let Some(sub_feature) = subs.get(&(Cow::from(surface), Cow::from(feature))) {
-                (surface, sub_feature.as_str())
+            let kana_1 = feature.rsplit(",").nth(0).unwrap();
+            let kana_2 = feature.rsplit(",").nth(1).unwrap();
+            let word = feature.rsplit(",").nth(2).unwrap();
+
+            let (kana, pkana) =
+                if let Some(sub_kana) = subs.get(&(Cow::from(surface), Cow::from(kana_1))) {
+                    (sub_kana.as_str(), sub_kana.as_str())
+                } else {
+                    (kana_1, kana_2)
+                };
+
+            let pitches = if mark_accent {
+                accent_dict.get(word, pkana)
            } else {
-                (surface, feature)
-            }
+                &[]
+            };
+
+            (surface, kana, pitches)
        };

        let needs_help = learner.needs_help(surface);
@ -287,28 +315,33 @@ fn add_html_furigana(
            continue;
        }

-        let kana = {
-            let kana = feature.split(",").nth(1).unwrap();
-            if use_hiragana {
-                katakana_to_hiragana_string(kana)
-            } else {
-                kana.into()
-            }
+        let kana = if use_hiragana {
+            katakana_to_hiragana_string(kana)
+        } else {
+            kana.into()
        };

        let furigana_text = apply_furigana(surface, &kana, exclude_kanji);

-        for (surf, furi) in furigana_text.iter() {
-            if furi.is_empty() {
-                new_text.push_str(surf);
-                continue;
+        if furigana_text.is_empty() {
+            new_text.push_str(surface);
+        } else {
+            for pitch in pitches {
+                new_text.push_str(&format!("<sup>{}</sup>", pitch));
            }

-            new_text.push_str("<ruby>");
-            new_text.push_str(surf);
-            new_text.push_str("<rt>");
-            new_text.push_str(furi);
-            new_text.push_str("</rt></ruby>");
+            for (surf, furi) in furigana_text.iter() {
+                if furi.is_empty() {
+                    new_text.push_str(surf);
+                    continue;
+                }
+
+                new_text.push_str("<ruby>");
+                new_text.push_str(surf);
+                new_text.push_str("<rt>");
+                new_text.push_str(furi);
+                new_text.push_str("</rt></ruby>");
+            }
        }
    }

@ -326,9 +359,8 @@ fn apply_furigana<'a>(
 ) -> Vec<(&'a str, &'a str)> {
    let mut out = Vec::new();

-    if furigana_unneeded(surface, exclude_kanji) {
-        out.push((surface, ""));
-        return out;
+    if furigana_unneeded(surface, exclude_kanji) || !is_kana_str(kana) {
+        return Vec::new();
    }

    let mut surface = surface;
@ -454,6 +486,10 @@ pub fn is_kana(c: char) -> bool {
    return false;
 }

+/// Returns `true` when every character of `text` satisfies `is_kana`.
+///
+/// An empty string has no characters to reject, so it also returns `true`.
+pub fn is_kana_str(text: &str) -> bool {
+    text.chars().all(is_kana)
+}
+
 pub fn normalize_kana(c: char) -> Option<char> {
    if !is_kana(c) {
        return None;
@ -497,6 +533,16 @@ pub fn katakana_to_hiragana_string(text: &str) -> String {
    new_text
 }

+/// Builds a copy of `text` in which each character is replaced by the result of
+/// `hiragana_to_katakana`, keeping the original character wherever that returns `None`.
+pub fn hiragana_to_katakana_string(text: &str) -> String {
+    text.chars()
+        .map(|c| hiragana_to_katakana(c).unwrap_or(c))
+        .collect()
+}
+
 #[cfg(test)]
 mod tests {
    use super::*;
@ -505,7 +551,12 @@ mod tests {
    pub fn get_furigana_gen() -> &'static FuriganaGenerator {
        use std::sync::OnceLock;
        static FURIGEN: OnceLock<FuriganaGenerator> = OnceLock::new();
-        FURIGEN.get_or_init(|| FuriganaGenerator::new(0, false))
+        FURIGEN.get_or_init(|| FuriganaGenerator::new(0, false, false))
+    }
+    pub fn get_furigana_gen_with_accent() -> &'static FuriganaGenerator {
+        use std::sync::OnceLock;
+        static FURIGEN: OnceLock<FuriganaGenerator> = OnceLock::new();
+        FURIGEN.get_or_init(|| FuriganaGenerator::new(0, false, true))
    }

    #[test]
@ -514,7 +565,7 @@ mod tests {
        let kana = "ヘー";
        let pairs = apply_furigana(surface, kana, &FnvHashSet::default());

-        assert_eq!(&[("へぇ", "")], &pairs[..]);
+        assert!(pairs.is_empty());
    }

    #[test]
@ -523,7 +574,7 @@ mod tests {
        let kana = "ヘー";
        let pairs = apply_furigana(surface, kana, &FnvHashSet::default());

-        assert_eq!(&[("へぇー", "")], &pairs[..]);
+        assert!(pairs.is_empty());
    }

    #[test]
@ -532,7 +583,7 @@ mod tests {
        let kana = "え";
        let pairs = apply_furigana(surface, kana, &FnvHashSet::default());

-        assert_eq!(&[("へ", "")], &pairs[..]);
+        assert!(pairs.is_empty());
    }

    #[test]
@ -606,39 +657,80 @@ mod tests {

        assert_eq!(3, worker.num_tokens());
        assert_eq!("食べ", worker.token(0).surface());
-        assert_eq!("動詞-一般,タベ", worker.token(0).feature());
+        assert_eq!(
+            "動詞,自立,*,*,一段,連用形,食べる,タベ,タベ",
+            worker.token(0).feature()
+        );
        assert_eq!("て", worker.token(1).surface());
-        assert_eq!("助詞-接続助詞,テ", worker.token(1).feature());
+        assert_eq!("助詞,接続助詞,*,*,*,*,て,テ,テ", worker.token(1).feature());
        assert_eq!("いる", worker.token(2).surface());
-        assert_eq!("動詞-非自立可能,イル", worker.token(2).feature());
+        assert_eq!(
+            "動詞,非自立,*,*,一段,基本形,いる,イル,イル",
+            worker.token(2).feature()
+        );
+    }
+
+    #[test]
+    fn tokenize_02() {
+        let mut worker = get_furigana_gen().tokenizer.new_worker();
+
+        worker.reset_sentence("そう");
+        worker.tokenize();
+
+        assert_eq!(1, worker.num_tokens());
+        assert_eq!(
+            "副詞,助詞類接続,*,*,*,*,そう,ソウ,ソー",
+            worker.token(0).feature()
+        );
    }

    #[test]
    fn add_html_furigana_01() {
        let mut gen = get_furigana_gen().new_session(false);
+        let mut gen_accent = get_furigana_gen_with_accent().new_session(false);

-        let text = gen
-            .add_html_furigana(r#"<sup class="食う">食べる</sup>のは<ruby>良</ruby>いね！<hi />"#);
+        let text = r#"<sup class="食う">食べる</sup>のは<ruby>良</ruby>いね！<hi />"#;
+        let furi_1 = gen.add_html_furigana(text);
+        let furi_2 = gen_accent.add_html_furigana(text);

        assert_eq!(
-            text,
+            furi_1,
            r#"<sup class="食う"><ruby>食<rt>タ</rt></ruby>べる</sup>のは<ruby>良</ruby>いね！<hi />"#
        );
+        assert_eq!(
+            furi_2,
+            r#"<sup class="食う"><sup>2</sup><ruby>食<rt>タ</rt></ruby>べる</sup>のは<ruby>良</ruby>いね！<hi />"#
+        );
    }

    // Testing custom substitutions.
    #[test]
    fn add_html_furigana_02() {
        let mut gen = get_furigana_gen().new_session(false);
+        let mut gen_accent = get_furigana_gen_with_accent().new_session(false);

        assert_eq!(
            gen.add_html_furigana("額"),
            "<ruby>額<rt>ヒタイ</rt></ruby>"
        );
+        assert_eq!(
+            gen_accent.add_html_furigana("額"),
+            "<sup>0</sup><ruby>額<rt>ヒタイ</rt></ruby>"
+        );
+
        assert_eq!(gen.add_html_furigana("他"), "<ruby>他<rt>ホカ</rt></ruby>");
+        assert_eq!(
+            gen_accent.add_html_furigana("他"),
+            "<sup>0</sup><ruby>他<rt>ホカ</rt></ruby>"
+        );
+
        assert_eq!(
            gen.add_html_furigana("私"),
            "<ruby>私<rt>ワタシ</rt></ruby>"
        );
+        assert_eq!(
+            gen_accent.add_html_furigana("私"),
+            "<sup>0</sup><ruby>私<rt>ワタシ</rt></ruby>"
+        );
    }
 }