Tweak the learning algorithm.

It was too conservative in some circumstances and not conservative
enough in others.
This commit is contained in:
Nathan Vegdahl 2024-09-11 12:05:57 +02:00
parent 44cb2b8bda
commit 80269e1ff4
2 changed files with 19 additions and 17 deletions

View File

@ -1,8 +1,8 @@
use std::collections::HashMap;
const LEARN_RATE: f64 = 1.0;
const MIN_MAX_DISTANCE: usize = 100;
const MAX_MAX_DISTANCE: usize = 10000;
const LEARN_RATE: f64 = 0.5;
const MIN_MAX_DISTANCE: usize = 10;
const MAX_MAX_DISTANCE: usize = 25000;
#[derive(Debug, Copy, Clone)]
pub(crate) struct WordStats {
@ -33,17 +33,17 @@ impl Learner {
/// Returns the word stats, sorted by how "well known" they are according
/// to the `max_distance` metric.
pub(crate) fn word_stats(&self) -> Vec<(String, WordStats)> {
pub(crate) fn word_stats(&self) -> (usize, Vec<(String, WordStats)>) {
let mut stats: Vec<(String, WordStats)> = self
.stats
.iter()
.map(|(w, s)| (w.clone(), s.clone()))
.collect();
stats.sort_unstable_by_key(|(_, s)| s.max_distance);
stats.sort_unstable_by_key(|(_, s)| (s.max_distance, s.times_seen));
stats.reverse();
stats
(self.words_processed, stats)
}
pub fn record(&mut self, word: &str) {
@ -58,10 +58,8 @@ impl Learner {
return;
}
if distance < stats.max_distance {
stats.max_distance +=
distance.min((stats.max_distance as f64 * LEARN_RATE) as usize);
}
stats.max_distance = stats.max_distance.min(MAX_MAX_DISTANCE);
})

View File

@ -51,17 +51,21 @@ impl FuriganaGenerator {
Self {
tokenizer: Tokenizer::new(dict),
exclude_kanji: exclude_kanji,
learner: Learner::new(if learn_mode { 5 } else { usize::MAX }),
learner: Learner::new(if learn_mode { 3 } else { usize::MAX }),
}
}
pub fn word_stats(&self) -> Vec<(String, usize, usize)> {
let mut stats = self.learner.word_stats();
/// Returns (total_words_processed, Vec<(Word, distance, times_seen)>)
pub fn word_stats(&self) -> (usize, Vec<(String, usize, usize)>) {
let (total_words, mut stats) = self.learner.word_stats();
(
total_words,
stats
.drain(..)
.map(|(w, s)| (w, s.max_distance, s.times_seen))
.collect()
.collect(),
)
}
pub fn add_html_furigana(&mut self, text: &str) -> String {