Use separate total value for bigrams
This commit is contained in:
parent
9addd3810b
commit
8f7959eeed
18
src/lib.rs
18
src/lib.rs
|
@ -15,7 +15,8 @@ pub mod test_data;
|
||||||
pub struct Segmenter {
|
pub struct Segmenter {
|
||||||
unigrams: HashMap<String, f64>,
|
unigrams: HashMap<String, f64>,
|
||||||
bigrams: HashMap<(String, String), f64>,
|
bigrams: HashMap<(String, String), f64>,
|
||||||
total: f64,
|
uni_total: f64,
|
||||||
|
bi_total: f64,
|
||||||
limit: usize,
|
limit: usize,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -42,12 +43,12 @@ impl Segmenter {
|
||||||
unigrams: HashMap<String, f64>,
|
unigrams: HashMap<String, f64>,
|
||||||
bigrams: HashMap<(String, String), f64>,
|
bigrams: HashMap<(String, String), f64>,
|
||||||
) -> Self {
|
) -> Self {
|
||||||
let total = unigrams.values().sum();
|
|
||||||
Self {
|
Self {
|
||||||
|
uni_total: unigrams.values().sum(),
|
||||||
|
bi_total: bigrams.values().sum(),
|
||||||
unigrams,
|
unigrams,
|
||||||
bigrams,
|
bigrams,
|
||||||
limit: DEFAULT_LIMIT,
|
limit: DEFAULT_LIMIT,
|
||||||
total,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -72,17 +73,17 @@ impl Segmenter {
|
||||||
// Conditional probability of the word given the previous
|
// Conditional probability of the word given the previous
|
||||||
// word. The technical name is "stupid backoff" and it's
|
// word. The technical name is "stupid backoff" and it's
|
||||||
// not a probability distribution but it works well in practice.
|
// not a probability distribution but it works well in practice.
|
||||||
return (bi / self.total) / (uni / self.total);
|
return (bi / self.bi_total) / (uni / self.uni_total);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
match self.unigrams.get(word) {
|
match self.unigrams.get(word) {
|
||||||
// Probability of the given word
|
// Probability of the given word
|
||||||
Some(p) => p / self.total,
|
Some(p) => p / self.uni_total,
|
||||||
// Penalize words not found in the unigrams according
|
// Penalize words not found in the unigrams according
|
||||||
// to their length, a crucial heuristic.
|
// to their length, a crucial heuristic.
|
||||||
None => 10.0 / (self.total * 10.0f64.powi(word.len() as i32)),
|
None => 10.0 / (self.uni_total * 10.0f64.powi(word.len() as i32)),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -90,11 +91,6 @@ impl Segmenter {
|
||||||
pub fn set_limit(&mut self, limit: usize) {
|
pub fn set_limit(&mut self, limit: usize) {
|
||||||
self.limit = limit;
|
self.limit = limit;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Customize the relative score by setting the `total`
|
|
||||||
pub fn set_total(&mut self, total: f64) {
|
|
||||||
self.total = total;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
struct SegmentState<'a> {
|
struct SegmentState<'a> {
|
||||||
|
|
Loading…
Reference in New Issue