From cee19d4fe8abbeab42b6a7028d261af1b6ddb364 Mon Sep 17 00:00:00 2001 From: Nathan Vegdahl Date: Wed, 11 Sep 2024 12:05:57 +0200 Subject: [PATCH] Tweak the learning algorithm. It was both too conservative and not conservative enough in different circumstances. --- src/learner.rs | 18 ++++++++---------- src/lib.rs | 18 +++++++++++------- 2 files changed, 19 insertions(+), 17 deletions(-) diff --git a/src/learner.rs b/src/learner.rs index ba7b1b6..aba6c09 100644 --- a/src/learner.rs +++ b/src/learner.rs @@ -1,8 +1,8 @@ use std::collections::HashMap; -const LEARN_RATE: f64 = 1.0; -const MIN_MAX_DISTANCE: usize = 100; -const MAX_MAX_DISTANCE: usize = 10000; +const LEARN_RATE: f64 = 0.7; +const MIN_MAX_DISTANCE: usize = 10; +const MAX_MAX_DISTANCE: usize = 25000; #[derive(Debug, Copy, Clone)] pub(crate) struct WordStats { @@ -33,17 +33,17 @@ impl Learner { /// Returns the word stats, sorted by how "well known" they are according /// to the `max_distance` metric. - pub(crate) fn word_stats(&self) -> Vec<(String, WordStats)> { + pub(crate) fn word_stats(&self) -> (usize, Vec<(String, WordStats)>) { let mut stats: Vec<(String, WordStats)> = self .stats .iter() .map(|(w, s)| (w.clone(), s.clone())) .collect(); - stats.sort_unstable_by_key(|(_, s)| s.max_distance); + stats.sort_unstable_by_key(|(_, s)| (s.max_distance, s.times_seen)); stats.reverse(); - stats + (self.words_processed, stats) } pub fn record(&mut self, word: &str) { @@ -58,10 +58,8 @@ impl Learner { return; } - if distance < stats.max_distance { - stats.max_distance += - distance.min((stats.max_distance as f64 * LEARN_RATE) as usize); - } + stats.max_distance += + distance.min((stats.max_distance as f64 * LEARN_RATE) as usize); stats.max_distance = stats.max_distance.min(MAX_MAX_DISTANCE); }) diff --git a/src/lib.rs b/src/lib.rs index 316b435..be82716 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -51,17 +51,21 @@ impl FuriganaGenerator { Self { tokenizer: Tokenizer::new(dict), exclude_kanji: exclude_kanji, - learner: Learner::new(if learn_mode { 5 } else { usize::MAX }), + learner: Learner::new(if learn_mode { 3 } else { usize::MAX }), } } - pub fn word_stats(&self) -> Vec<(String, usize, usize)> { - let mut stats = self.learner.word_stats(); + /// Returns (total_words_processed, Vec<(Word, distance, times_seen)>) + pub fn word_stats(&self) -> (usize, Vec<(String, usize, usize)>) { + let (total_words, mut stats) = self.learner.word_stats(); - stats - .drain(..) - .map(|(w, s)| (w, s.max_distance, s.times_seen)) - .collect() + ( + total_words, + stats + .drain(..) + .map(|(w, s)| (w, s.max_distance, s.times_seen)) + .collect(), + ) } pub fn add_html_furigana(&mut self, text: &str) -> String {