Tweak the learning algorithm.
It was both too conservative and not conservative enough in different circumstances.
parent 44cb2b8bda
commit cee19d4fe8
@@ -1,8 +1,8 @@
 use std::collections::HashMap;
 
-const LEARN_RATE: f64 = 1.0;
-const MIN_MAX_DISTANCE: usize = 100;
-const MAX_MAX_DISTANCE: usize = 10000;
+const LEARN_RATE: f64 = 0.7;
+const MIN_MAX_DISTANCE: usize = 10;
+const MAX_MAX_DISTANCE: usize = 25000;
 
 #[derive(Debug, Copy, Clone)]
 pub(crate) struct WordStats {
@@ -33,17 +33,17 @@ impl Learner {
 
     /// Returns the word stats, sorted by how "well known" they are according
     /// to the `max_distance` metric.
-    pub(crate) fn word_stats(&self) -> Vec<(String, WordStats)> {
+    pub(crate) fn word_stats(&self) -> (usize, Vec<(String, WordStats)>) {
         let mut stats: Vec<(String, WordStats)> = self
             .stats
             .iter()
             .map(|(w, s)| (w.clone(), s.clone()))
             .collect();
 
-        stats.sort_unstable_by_key(|(_, s)| s.max_distance);
+        stats.sort_unstable_by_key(|(_, s)| (s.max_distance, s.times_seen));
         stats.reverse();
 
-        stats
+        (self.words_processed, stats)
     }
 
     pub fn record(&mut self, word: &str) {
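For illustration only, here is a minimal standalone sketch of what the new sort key does, separate from the `Learner` type itself: sorting by the `(max_distance, times_seen)` tuple and then reversing puts the entries with the largest `max_distance` first and breaks ties by `times_seen`. The sample data below is invented.

```rust
fn main() {
    // (word, (max_distance, times_seen)) — made-up sample data.
    let mut stats = vec![
        ("日本", (120usize, 4usize)),
        ("学校", (120, 9)),
        ("猫", (30, 2)),
    ];

    // Same pattern as the changed line: sort ascending by the tuple key, then reverse.
    stats.sort_unstable_by_key(|&(_, key)| key);
    stats.reverse();

    // Largest max_distance first; the tie at 120 is broken by times_seen (9 before 4).
    let order: Vec<&str> = stats.iter().map(|&(w, _)| w).collect();
    assert_eq!(order, vec!["学校", "日本", "猫"]);
}
```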
@@ -58,10 +58,8 @@ impl Learner {
                 return;
             }
 
-            if distance < stats.max_distance {
-                stats.max_distance +=
-                    distance.min((stats.max_distance as f64 * LEARN_RATE) as usize);
-            }
+            stats.max_distance +=
+                distance.min((stats.max_distance as f64 * LEARN_RATE) as usize);
 
             stats.max_distance = stats.max_distance.min(MAX_MAX_DISTANCE);
         })
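A standalone sketch of the updated growth rule, pulled out of `record()` for clarity (the helper name and the numbers below are illustrative, not part of the crate): the increment is the observed distance, capped at `LEARN_RATE` times the current `max_distance`, and the result is clamped to `MAX_MAX_DISTANCE`. With the `if distance < stats.max_distance` guard removed, every sighting now grows the window.

```rust
const LEARN_RATE: f64 = 0.7;
const MAX_MAX_DISTANCE: usize = 25000;

// Hypothetical helper mirroring the changed lines in the hunk above.
fn grown_max_distance(current_max: usize, distance: usize) -> usize {
    let increment = distance.min((current_max as f64 * LEARN_RATE) as usize);
    (current_max + increment).min(MAX_MAX_DISTANCE)
}

fn main() {
    // Small distances grow the window by the full distance...
    assert_eq!(grown_max_distance(10, 3), 13); // increment = min(3, 7) = 3
    // ...large distances are capped at 70% of the current max...
    assert_eq!(grown_max_distance(10, 500), 17); // increment = min(500, 7) = 7
    // ...and the window never exceeds MAX_MAX_DISTANCE.
    assert_eq!(grown_max_distance(20_000, 30_000), 25_000);
}
```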
src/lib.rs (18 changed lines)
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -51,17 +51,21 @@ impl FuriganaGenerator {
         Self {
             tokenizer: Tokenizer::new(dict),
             exclude_kanji: exclude_kanji,
-            learner: Learner::new(if learn_mode { 5 } else { usize::MAX }),
+            learner: Learner::new(if learn_mode { 3 } else { usize::MAX }),
         }
     }
 
-    pub fn word_stats(&self) -> Vec<(String, usize, usize)> {
-        let mut stats = self.learner.word_stats();
+    /// Returns (total_words_processed, Vec<(Word, distance, times_seen)>)
+    pub fn word_stats(&self) -> (usize, Vec<(String, usize, usize)>) {
+        let (total_words, mut stats) = self.learner.word_stats();
 
-        stats
-            .drain(..)
-            .map(|(w, s)| (w, s.max_distance, s.times_seen))
-            .collect()
+        (
+            total_words,
+            stats
+                .drain(..)
+                .map(|(w, s)| (w, s.max_distance, s.times_seen))
+                .collect(),
+        )
     }
 
     pub fn add_html_furigana(&mut self, text: &str) -> String {
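As a usage sketch, a caller of the changed public API might unpack the returned tuple like this; `print_stats` is a hypothetical function, and only the `word_stats()` signature shown in this diff is assumed.

```rust
// Hypothetical caller of FuriganaGenerator::word_stats() as changed above.
fn print_stats(generator: &FuriganaGenerator) {
    let (total_words, stats) = generator.word_stats();
    println!("total words processed: {}", total_words);
    for (word, max_distance, times_seen) in stats {
        println!("{word}\tmax_distance={max_distance}\ttimes_seen={times_seen}");
    }
}
```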