Update learning code.

Now it tracks distance by character, and determines whether to show
furigana based on how long it's been since the last time a word was
shown with furigana rather than the last time a word was shown at all.

Also some minor performance efficiency improvements.
This commit is contained in:
Nathan Vegdahl 2024-09-20 07:38:02 +02:00
parent ba5fea6e0a
commit 9845da5a7e
2 changed files with 62 additions and 43 deletions

View File

@ -1,23 +1,27 @@
use std::collections::HashMap;
use fnv::FnvHashMap;
const LEARN_RATE: f64 = 0.7;
const MIN_MAX_DISTANCE: usize = 10;
const MAX_MAX_DISTANCE: usize = 75000;
const MAX_DISTANCE_FLOOR: usize = 15;
const MAX_DISTANCE_CEILING: usize = 100000;
#[derive(Debug, Copy, Clone)]
pub(crate) struct WordStats {
// The last position (in words processed) that this word was seen at.
// The last position (in characters processed) that this word was seen at.
last_seen_at: usize,
// The last position (in characters processed) that this word had help.
last_helped_at: usize,
// How many times this word has been seen so far.
pub times_seen: usize,
// Maximum distance before helps is needed again.
// Maximum distance (in characters) before help is needed again.
pub max_distance: usize,
}
pub struct Learner {
stats: HashMap<String, WordStats>,
stats: FnvHashMap<String, WordStats>,
chars_processed: usize,
words_processed: usize,
times_seen_threshold: usize,
}
@ -25,7 +29,8 @@ pub struct Learner {
impl Learner {
pub fn new(times_seen_threshold: usize) -> Self {
Self {
stats: HashMap::new(),
stats: FnvHashMap::default(),
chars_processed: 0,
words_processed: 0,
times_seen_threshold: times_seen_threshold,
}
@ -46,45 +51,60 @@ impl Learner {
(self.words_processed, stats)
}
pub fn record(&mut self, word: &str) {
if self.times_seen_threshold == usize::MAX {
return;
}
self.stats
.entry(word.to_string())
.and_modify(|stats| {
let distance = self.words_processed - stats.last_seen_at;
stats.last_seen_at = self.words_processed;
stats.times_seen += 1;
if stats.times_seen <= self.times_seen_threshold {
return;
}
stats.max_distance +=
distance.min((stats.max_distance as f64 * LEARN_RATE) as usize);
stats.max_distance = stats.max_distance.min(MAX_MAX_DISTANCE);
})
.or_insert(WordStats {
last_seen_at: self.words_processed,
times_seen: 1,
max_distance: MIN_MAX_DISTANCE,
});
self.words_processed += 1;
}
pub fn needs_help(&self, word: &str) -> bool {
/// Processes a word, and returns whether it needs help or not.
pub fn process(&mut self, word: &str) -> bool {
if self.times_seen_threshold == usize::MAX {
return true;
}
if let Some(stats) = self.stats.get(word) {
let distance = self.words_processed - stats.last_seen_at;
stats.times_seen <= self.times_seen_threshold || distance > stats.max_distance
// Get word stats entry.
let word_stats = if let Some(word_stats) = self.stats.get_mut(word) {
word_stats
} else {
true
self.stats.insert(
word.into(),
WordStats {
last_seen_at: 0,
last_helped_at: 0,
times_seen: 0,
max_distance: MAX_DISTANCE_FLOOR,
},
);
self.stats.get_mut(word).unwrap()
};
// Determine if help is needed.
let help = {
let help_distance = self.chars_processed - word_stats.last_helped_at;
word_stats.times_seen <= self.times_seen_threshold
|| help_distance > word_stats.max_distance
};
// Update word stats.
{
let seen_distance = self.chars_processed - word_stats.last_seen_at;
word_stats.last_seen_at = self.chars_processed;
if help {
word_stats.last_helped_at = self.chars_processed;
}
word_stats.times_seen += 1;
if word_stats.times_seen > self.times_seen_threshold {
word_stats.max_distance +=
seen_distance.min((word_stats.max_distance as f64 * LEARN_RATE) as usize);
// Clamp to floor/ceiling.
word_stats.max_distance = word_stats
.max_distance
.clamp(MAX_DISTANCE_FLOOR, MAX_DISTANCE_CEILING);
}
}
// Update position.
self.chars_processed += word.chars().count();
self.words_processed += 1;
return help;
}
}

View File

@ -307,8 +307,7 @@ fn add_html_furigana(
(surface, kana, pitches)
};
let needs_help = learner.needs_help(surface);
learner.record(surface);
let needs_help = learner.process(surface);
if !needs_help {
new_text.push_str(surface);