Tweak the learning algorithm.
It was both too conservative and not conservative enough in different circumstances.
This commit is contained in:
parent
44cb2b8bda
commit
d79cc60a48
|
@ -1,8 +1,8 @@
|
|||
use std::collections::HashMap;
|
||||
|
||||
const LEARN_RATE: f64 = 1.0;
|
||||
const MIN_MAX_DISTANCE: usize = 100;
|
||||
const MAX_MAX_DISTANCE: usize = 10000;
|
||||
const LEARN_RATE: f64 = 0.7;
|
||||
const MIN_MAX_DISTANCE: usize = 10;
|
||||
const MAX_MAX_DISTANCE: usize = 75000;
|
||||
|
||||
#[derive(Debug, Copy, Clone)]
|
||||
pub(crate) struct WordStats {
|
||||
|
@ -33,17 +33,17 @@ impl Learner {
|
|||
|
||||
/// Returns the word stats, sorted by how "well known" they are according
|
||||
/// to the `max_distance` metric.
|
||||
pub(crate) fn word_stats(&self) -> Vec<(String, WordStats)> {
|
||||
pub(crate) fn word_stats(&self) -> (usize, Vec<(String, WordStats)>) {
|
||||
let mut stats: Vec<(String, WordStats)> = self
|
||||
.stats
|
||||
.iter()
|
||||
.map(|(w, s)| (w.clone(), s.clone()))
|
||||
.collect();
|
||||
|
||||
stats.sort_unstable_by_key(|(_, s)| s.max_distance);
|
||||
stats.sort_unstable_by_key(|(_, s)| (s.max_distance, s.times_seen));
|
||||
stats.reverse();
|
||||
|
||||
stats
|
||||
(self.words_processed, stats)
|
||||
}
|
||||
|
||||
pub fn record(&mut self, word: &str) {
|
||||
|
@ -58,10 +58,8 @@ impl Learner {
|
|||
return;
|
||||
}
|
||||
|
||||
if distance < stats.max_distance {
|
||||
stats.max_distance +=
|
||||
distance.min((stats.max_distance as f64 * LEARN_RATE) as usize);
|
||||
}
|
||||
|
||||
stats.max_distance = stats.max_distance.min(MAX_MAX_DISTANCE);
|
||||
})
|
||||
|
|
12
src/lib.rs
12
src/lib.rs
|
@ -51,17 +51,21 @@ impl FuriganaGenerator {
|
|||
Self {
|
||||
tokenizer: Tokenizer::new(dict),
|
||||
exclude_kanji: exclude_kanji,
|
||||
learner: Learner::new(if learn_mode { 5 } else { usize::MAX }),
|
||||
learner: Learner::new(if learn_mode { 3 } else { usize::MAX }),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn word_stats(&self) -> Vec<(String, usize, usize)> {
|
||||
let mut stats = self.learner.word_stats();
|
||||
/// Returns (total_words_processed, Vec<(Word, distance, times_seen)>)
|
||||
pub fn word_stats(&self) -> (usize, Vec<(String, usize, usize)>) {
|
||||
let (total_words, mut stats) = self.learner.word_stats();
|
||||
|
||||
(
|
||||
total_words,
|
||||
stats
|
||||
.drain(..)
|
||||
.map(|(w, s)| (w, s.max_distance, s.times_seen))
|
||||
.collect()
|
||||
.collect(),
|
||||
)
|
||||
}
|
||||
|
||||
pub fn add_html_furigana(&mut self, text: &str) -> String {
|
||||
|
|
Loading…
Reference in New Issue
Block a user