Tweak the learning algorithm.

It was both too conservative and not conservative enough in different
circumstances.
This commit is contained in:
Nathan Vegdahl 2024-09-11 12:05:57 +02:00
parent 44cb2b8bda
commit 80269e1ff4
2 changed files with 19 additions and 17 deletions

View File

@@ -1,8 +1,8 @@
 use std::collections::HashMap;

-const LEARN_RATE: f64 = 1.0;
-const MIN_MAX_DISTANCE: usize = 100;
-const MAX_MAX_DISTANCE: usize = 10000;
+const LEARN_RATE: f64 = 0.5;
+const MIN_MAX_DISTANCE: usize = 10;
+const MAX_MAX_DISTANCE: usize = 25000;

 #[derive(Debug, Copy, Clone)]
 pub(crate) struct WordStats {
@@ -33,17 +33,17 @@ impl Learner {
     /// Returns the word stats, sorted by how "well known" they are according
     /// to the `max_distance` metric.
-    pub(crate) fn word_stats(&self) -> Vec<(String, WordStats)> {
+    pub(crate) fn word_stats(&self) -> (usize, Vec<(String, WordStats)>) {
         let mut stats: Vec<(String, WordStats)> = self
             .stats
             .iter()
             .map(|(w, s)| (w.clone(), s.clone()))
             .collect();

-        stats.sort_unstable_by_key(|(_, s)| s.max_distance);
+        stats.sort_unstable_by_key(|(_, s)| (s.max_distance, s.times_seen));
         stats.reverse();

-        stats
+        (self.words_processed, stats)
     }

     pub fn record(&mut self, word: &str) {
@@ -58,10 +58,8 @@ impl Learner {
                 return;
             }

-            if distance < stats.max_distance {
-                stats.max_distance +=
-                    distance.min((stats.max_distance as f64 * LEARN_RATE) as usize);
-            }
+            stats.max_distance +=
+                distance.min((stats.max_distance as f64 * LEARN_RATE) as usize);

             stats.max_distance = stats.max_distance.min(MAX_MAX_DISTANCE);
         })

View File

@@ -51,17 +51,21 @@ impl FuriganaGenerator {
         Self {
             tokenizer: Tokenizer::new(dict),
             exclude_kanji: exclude_kanji,
-            learner: Learner::new(if learn_mode { 5 } else { usize::MAX }),
+            learner: Learner::new(if learn_mode { 3 } else { usize::MAX }),
         }
     }

-    pub fn word_stats(&self) -> Vec<(String, usize, usize)> {
-        let mut stats = self.learner.word_stats();
+    /// Returns (total_words_processed, Vec<(Word, distance, times_seen)>)
+    pub fn word_stats(&self) -> (usize, Vec<(String, usize, usize)>) {
+        let (total_words, mut stats) = self.learner.word_stats();

-        stats
-            .drain(..)
-            .map(|(w, s)| (w, s.max_distance, s.times_seen))
-            .collect()
+        (
+            total_words,
+            stats
+                .drain(..)
+                .map(|(w, s)| (w, s.max_distance, s.times_seen))
+                .collect(),
+        )
     }

     pub fn add_html_furigana(&mut self, text: &str) -> String {