Add function to get word stats after processing.
This commit is contained in:
parent
1c3afed157
commit
ecbac83e26
|
@ -1,18 +1,19 @@
|
|||
use std::collections::HashMap;
|
||||
|
||||
const LEARN_RATE: f64 = 1.0;
|
||||
const MIN_MAX_DISTANCE: usize = 100;
|
||||
const MAX_MAX_DISTANCE: usize = 10000;
|
||||
|
||||
#[derive(Debug, Copy, Clone)]
|
||||
struct WordStats {
|
||||
pub(crate) struct WordStats {
|
||||
// The last position (in words processed) that this word was seen at.
|
||||
last_seen_at: usize,
|
||||
|
||||
// How many times this word has been seen so far.
|
||||
times_seen: usize,
|
||||
pub times_seen: usize,
|
||||
|
||||
// Maximum distance before help is needed again.
|
||||
max_distance: usize,
|
||||
pub max_distance: usize,
|
||||
}
|
||||
|
||||
pub struct Learner {
|
||||
|
@ -30,6 +31,21 @@ impl Learner {
|
|||
}
|
||||
}
|
||||
|
||||
/// Returns the word stats, sorted by how "well known" they are according
|
||||
/// to the `max_distance` metric.
|
||||
pub(crate) fn word_stats(&self) -> Vec<(String, WordStats)> {
|
||||
let mut stats: Vec<(String, WordStats)> = self
|
||||
.stats
|
||||
.iter()
|
||||
.map(|(w, s)| (w.clone(), s.clone()))
|
||||
.collect();
|
||||
|
||||
stats.sort_unstable_by_key(|(_, s)| s.max_distance);
|
||||
stats.reverse();
|
||||
|
||||
stats
|
||||
}
|
||||
|
||||
pub fn record(&mut self, word: &str) {
|
||||
self.stats
|
||||
.entry(word.to_string())
|
||||
|
@ -43,7 +59,8 @@ impl Learner {
|
|||
}
|
||||
|
||||
if distance < stats.max_distance {
|
||||
stats.max_distance += distance.min((stats.max_distance as f64 * 0.5) as usize);
|
||||
stats.max_distance +=
|
||||
distance.min((stats.max_distance as f64 * LEARN_RATE) as usize);
|
||||
}
|
||||
|
||||
stats.max_distance = stats.max_distance.min(MAX_MAX_DISTANCE);
|
||||
|
|
16
src/lib.rs
16
src/lib.rs
|
@ -55,6 +55,15 @@ impl FuriganaGenerator {
|
|||
}
|
||||
}
|
||||
|
||||
pub fn word_stats(&self) -> Vec<(String, usize, usize)> {
|
||||
let mut stats = self.learner.word_stats();
|
||||
|
||||
stats
|
||||
.drain(..)
|
||||
.map(|(w, s)| (w, s.max_distance, s.times_seen))
|
||||
.collect()
|
||||
}
|
||||
|
||||
pub fn add_html_furigana(&mut self, text: &str) -> String {
|
||||
add_html_furigana_skip_already_ruby(
|
||||
&text,
|
||||
|
@ -83,7 +92,12 @@ fn add_html_furigana_skip_already_ruby(
|
|||
|
||||
loop {
|
||||
match reader.read_event() {
|
||||
Err(e) => panic!("Error at position {}: {:?}", reader.error_position(), e),
|
||||
Err(_) => {
|
||||
// If we hit a parse error, just don't add furigana.
|
||||
// But still panic in debug, so we can track things down.
|
||||
debug_assert!(false);
|
||||
return text.into();
|
||||
}
|
||||
Ok(Event::Eof) => break,
|
||||
|
||||
Ok(Event::Start(e)) => {
|
||||
|
|
Loading…
Reference in New Issue
Block a user