Add function to get word stats after processing.
This commit is contained in:
parent
1c3afed157
commit
ecbac83e26
|
@ -1,18 +1,19 @@
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
|
|
||||||
|
const LEARN_RATE: f64 = 1.0; // Multiplier on `max_distance` that caps how much it may grow in one `record` step.
|
||||||
const MIN_MAX_DISTANCE: usize = 100;
|
const MIN_MAX_DISTANCE: usize = 100;
|
||||||
const MAX_MAX_DISTANCE: usize = 10000; // Upper clamp applied to `max_distance` in `record`.
|
const MAX_MAX_DISTANCE: usize = 10000; // Upper clamp applied to `max_distance` in `record`.
|
||||||
|
|
||||||
#[derive(Debug, Copy, Clone)]
|
#[derive(Debug, Copy, Clone)]
|
||||||
struct WordStats {
|
pub(crate) struct WordStats {
|
||||||
// The last position (in words processed) that this word was seen at.
|
// The last position (in words processed) that this word was seen at.
|
||||||
last_seen_at: usize,
|
last_seen_at: usize,
|
||||||
|
|
||||||
// How many times this word has been seen so far.
|
// How many times this word has been seen so far.
|
||||||
times_seen: usize,
|
pub times_seen: usize,
|
||||||
|
|
||||||
// Maximum distance before help is needed again.
|
// Maximum distance before help is needed again.
|
||||||
max_distance: usize,
|
pub max_distance: usize,
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct Learner {
|
pub struct Learner {
|
||||||
|
@ -30,6 +31,21 @@ impl Learner {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns the word stats as a freshly built `Vec`, sorted by how "well known" they are according
|
||||||
|
/// to the `max_distance` metric (largest `max_distance` first).
|
||||||
|
pub(crate) fn word_stats(&self) -> Vec<(String, WordStats)> {
|
||||||
|
let mut stats: Vec<(String, WordStats)> = self
|
||||||
|
.stats
|
||||||
|
.iter()
|
||||||
|
.map(|(w, s)| (w.clone(), s.clone())) // `WordStats` derives `Copy`, so this clone is a plain copy.
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
stats.sort_unstable_by_key(|(_, s)| s.max_distance); // Ascending by `max_distance`; order of ties is unspecified.
|
||||||
|
stats.reverse(); // Flip so the largest `max_distance` comes first.
|
||||||
|
|
||||||
|
stats
|
||||||
|
}
|
||||||
|
|
||||||
pub fn record(&mut self, word: &str) {
|
pub fn record(&mut self, word: &str) {
|
||||||
self.stats
|
self.stats
|
||||||
.entry(word.to_string())
|
.entry(word.to_string())
|
||||||
|
@ -43,7 +59,8 @@ impl Learner {
|
||||||
}
|
}
|
||||||
|
|
||||||
if distance < stats.max_distance {
|
if distance < stats.max_distance {
|
||||||
stats.max_distance += distance.min((stats.max_distance as f64 * 0.5) as usize);
|
stats.max_distance +=
|
||||||
|
distance.min((stats.max_distance as f64 * LEARN_RATE) as usize);
|
||||||
}
|
}
|
||||||
|
|
||||||
stats.max_distance = stats.max_distance.min(MAX_MAX_DISTANCE);
|
stats.max_distance = stats.max_distance.min(MAX_MAX_DISTANCE);
|
||||||
|
|
16
src/lib.rs
16
src/lib.rs
|
@ -55,6 +55,15 @@ impl FuriganaGenerator {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns one `(word, max_distance, times_seen)` tuple per entry of
/// `self.learner.word_stats()`, in the same order that call returns them.
pub fn word_stats(&self) -> Vec<(String, usize, usize)> {
|
||||||
|
let mut stats = self.learner.word_stats();
|
||||||
|
|
||||||
|
stats
|
||||||
|
.drain(..)
|
||||||
|
.map(|(w, s)| (w, s.max_distance, s.times_seen)) // Flatten the stats struct into plain numbers.
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
|
||||||
pub fn add_html_furigana(&mut self, text: &str) -> String {
|
pub fn add_html_furigana(&mut self, text: &str) -> String {
|
||||||
add_html_furigana_skip_already_ruby(
|
add_html_furigana_skip_already_ruby(
|
||||||
&text,
|
&text,
|
||||||
|
@ -83,7 +92,12 @@ fn add_html_furigana_skip_already_ruby(
|
||||||
|
|
||||||
loop {
|
loop {
|
||||||
match reader.read_event() {
|
match reader.read_event() {
|
||||||
Err(e) => panic!("Error at position {}: {:?}", reader.error_position(), e),
|
Err(_) => {
|
||||||
|
// If we hit a parse error, just don't add furigana.
|
||||||
|
// But still panic in debug, so we can track things down.
|
||||||
|
debug_assert!(false);
|
||||||
|
return text.into();
|
||||||
|
}
|
||||||
Ok(Event::Eof) => break,
|
Ok(Event::Eof) => break,
|
||||||
|
|
||||||
Ok(Event::Start(e)) => {
|
Ok(Event::Start(e)) => {
|
||||||
|
|
Loading…
Reference in New Issue
Block a user