Add function to get word stats after processing.

This commit is contained in:
Nathan Vegdahl 2024-09-11 11:14:12 +02:00
parent 1c3afed157
commit ecbac83e26
2 changed files with 36 additions and 5 deletions

View File

@ -1,18 +1,19 @@
use std::collections::HashMap; use std::collections::HashMap;
const LEARN_RATE: f64 = 1.0;
const MIN_MAX_DISTANCE: usize = 100; const MIN_MAX_DISTANCE: usize = 100;
const MAX_MAX_DISTANCE: usize = 10000; const MAX_MAX_DISTANCE: usize = 10000;
#[derive(Debug, Copy, Clone)] #[derive(Debug, Copy, Clone)]
struct WordStats { pub(crate) struct WordStats {
// The last position (in words processed) that this word was seen at. // The last position (in words processed) that this word was seen at.
last_seen_at: usize, last_seen_at: usize,
// How many times this word has been seen so far. // How many times this word has been seen so far.
times_seen: usize, pub times_seen: usize,
// Maximum distance before helps is needed again. // Maximum distance before helps is needed again.
max_distance: usize, pub max_distance: usize,
} }
pub struct Learner { pub struct Learner {
@ -30,6 +31,21 @@ impl Learner {
} }
} }
/// Returns a snapshot of the per-word stats, ordered from most to least
/// "well known" according to the `max_distance` metric (largest first).
///
/// Words with an equal `max_distance` are ordered alphabetically, so the
/// result does not depend on the iteration order of the underlying
/// `stats` map (presumably a `HashMap`, whose order is unspecified).
pub(crate) fn word_stats(&self) -> Vec<(String, WordStats)> {
    // `WordStats` is `Copy`, so a plain dereference is enough; only the
    // word itself needs a real clone.
    let mut stats: Vec<(String, WordStats)> = self
        .stats
        .iter()
        .map(|(w, s)| (w.clone(), *s))
        .collect();

    // Single descending sort (note the swapped `b`/`a`) instead of an
    // ascending sort followed by `reverse()`. The word tie-break makes the
    // ordering total, which keeps the unstable sort deterministic.
    stats.sort_unstable_by(|(word_a, a), (word_b, b)| {
        b.max_distance
            .cmp(&a.max_distance)
            .then_with(|| word_a.cmp(word_b))
    });

    stats
}
pub fn record(&mut self, word: &str) { pub fn record(&mut self, word: &str) {
self.stats self.stats
.entry(word.to_string()) .entry(word.to_string())
@ -43,7 +59,8 @@ impl Learner {
} }
if distance < stats.max_distance { if distance < stats.max_distance {
stats.max_distance += distance.min((stats.max_distance as f64 * 0.5) as usize); stats.max_distance +=
distance.min((stats.max_distance as f64 * LEARN_RATE) as usize);
} }
stats.max_distance = stats.max_distance.min(MAX_MAX_DISTANCE); stats.max_distance = stats.max_distance.min(MAX_MAX_DISTANCE);

View File

@ -55,6 +55,15 @@ impl FuriganaGenerator {
} }
} }
/// Returns the learner's per-word statistics as plain tuples of
/// `(word, max_distance, times_seen)`.
///
/// The entries are kept in the order produced by the learner's own
/// `word_stats()`; this method only flattens each entry so callers do not
/// need access to the crate-private `WordStats` type.
pub fn word_stats(&self) -> Vec<(String, usize, usize)> {
    // The learner hands back an owned `Vec`, so consume it directly rather
    // than draining a temporary that is dropped right afterwards.
    self.learner
        .word_stats()
        .into_iter()
        .map(|(w, s)| (w, s.max_distance, s.times_seen))
        .collect()
}
pub fn add_html_furigana(&mut self, text: &str) -> String { pub fn add_html_furigana(&mut self, text: &str) -> String {
add_html_furigana_skip_already_ruby( add_html_furigana_skip_already_ruby(
&text, &text,
@ -83,7 +92,12 @@ fn add_html_furigana_skip_already_ruby(
loop { loop {
match reader.read_event() { match reader.read_event() {
Err(e) => panic!("Error at position {}: {:?}", reader.error_position(), e), Err(_) => {
// If we hit a parse error, just don't add furigana.
// But still panic in debug, so we can track things down.
debug_assert!(false);
return text.into();
}
Ok(Event::Eof) => break, Ok(Event::Eof) => break,
Ok(Event::Start(e)) => { Ok(Event::Start(e)) => {