From ecbac83e26c89049c5e8f7c143c0d4bc4d223720 Mon Sep 17 00:00:00 2001
From: Nathan Vegdahl
Date: Wed, 11 Sep 2024 11:14:12 +0200
Subject: [PATCH] Add function to get word stats after processing.

---
 src/learner.rs | 25 +++++++++++++++++++++----
 src/lib.rs     | 16 +++++++++++++++-
 2 files changed, 36 insertions(+), 5 deletions(-)

diff --git a/src/learner.rs b/src/learner.rs
index 4250b04..ba7b1b6 100644
--- a/src/learner.rs
+++ b/src/learner.rs
@@ -1,18 +1,19 @@
 use std::collections::HashMap;
 
+const LEARN_RATE: f64 = 1.0; // Max growth of `max_distance` per step, as a multiple of itself.
 const MIN_MAX_DISTANCE: usize = 100;
 const MAX_MAX_DISTANCE: usize = 10000;
 
 #[derive(Debug, Copy, Clone)]
-struct WordStats {
+pub(crate) struct WordStats {
     // The last position (in words processed) that this word was seen at.
     last_seen_at: usize,
 
     // How many times this word has been seen so far.
-    times_seen: usize,
+    pub times_seen: usize,
 
     // Maximum distance before helps is needed again.
-    max_distance: usize,
+    pub max_distance: usize,
 }
 
 pub struct Learner {
@@ -30,6 +31,21 @@ impl Learner {
         }
     }
 
+    /// Returns a copy of every word's stats, ordered from most to least
+    /// "well known", i.e. by descending `max_distance`.
+    pub(crate) fn word_stats(&self) -> Vec<(String, WordStats)> {
+        let mut stats: Vec<(String, WordStats)> = self
+            .stats
+            .iter()
+            .map(|(w, s)| (w.clone(), *s))
+            .collect();
+
+        // Descending key: the largest `max_distance` sorts first.
+        stats.sort_unstable_by_key(|(_, s)| std::cmp::Reverse(s.max_distance));
+
+        stats
+    }
+
     pub fn record(&mut self, word: &str) {
         self.stats
             .entry(word.to_string())
@@ -43,7 +59,8 @@ impl Learner {
                 }
 
                 if distance < stats.max_distance {
-                    stats.max_distance += distance.min((stats.max_distance as f64 * 0.5) as usize);
+                    stats.max_distance +=
+                        distance.min((stats.max_distance as f64 * LEARN_RATE) as usize);
                 }
 
                 stats.max_distance = stats.max_distance.min(MAX_MAX_DISTANCE);
diff --git a/src/lib.rs b/src/lib.rs
index 72cc9d6..316b435 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -55,6 +55,15 @@ impl FuriganaGenerator {
         }
     }
 
+    /// Returns `(word, max_distance, times_seen)` per word, best-known first.
+    pub fn word_stats(&self) -> Vec<(String, usize, usize)> {
+        self.learner
+            .word_stats()
+            .into_iter()
+            .map(|(w, s)| (w, s.max_distance, s.times_seen))
+            .collect()
+    }
+
     pub fn add_html_furigana(&mut self, text: &str) -> String {
         add_html_furigana_skip_already_ruby(
             &text,
@@ -83,7 +92,12 @@ fn add_html_furigana_skip_already_ruby(
 
     loop {
         match reader.read_event() {
-            Err(e) => panic!("Error at position {}: {:?}", reader.error_position(), e),
+            Err(e) => {
+                // A parse error is not fatal: hand the text back without furigana.
+                // Debug builds still panic, reporting where and why parsing failed.
+                debug_assert!(false, "Error at position {}: {:?}", reader.error_position(), e);
+                return text.into();
+            }
             Ok(Event::Eof) => break,
             Ok(Event::Start(e)) => {