Store splits instead of string slices

This commit is contained in:
Dirkjan Ochtman 2020-11-25 16:23:37 +01:00
parent 1df3c4397e
commit 947e003a48
1 changed file with 17 additions and 15 deletions

View File

@@ -75,7 +75,7 @@ impl Segmenter {
struct SegmentState<'a> { struct SegmentState<'a> {
data: &'a Segmenter, data: &'a Segmenter,
text: &'a str, text: &'a str,
memo: HashMap<(&'a str, &'a str), (f64, Vec<&'a str>)>, memo: HashMap<(&'a str, &'a str), (f64, Vec<usize>)>,
result: &'a mut Vec<String>, result: &'a mut Vec<String>,
} }
@@ -95,11 +95,11 @@ impl<'a> SegmentState<'a> {
loop { loop {
end = self.text.len().min(end + SEGMENT_SIZE); end = self.text.len().min(end + SEGMENT_SIZE);
let prefix = &self.text[start..end]; let prefix = &self.text[start..end];
let window_words = self.search(&prefix, None).1; let window_splits = self.search(&prefix, None).1;
for word in &window_words[..window_words.len().saturating_sub(5)] { for split in &window_splits[..window_splits.len().saturating_sub(5)] {
start += word.len(); self.result.push(self.text[start..start + split].into());
self.result.push((*word).into()); start += split;
} }
if end == self.text.len() { if end == self.text.len() {
@@ -107,13 +107,15 @@ impl<'a> SegmentState<'a> {
} }
} }
let window_words = self.search(&self.text[start..], None).1; let window_splits = self.search(&self.text[start..], None).1;
self.result for split in window_splits {
.extend(window_words.into_iter().map(|s| s.into())); self.result.push(self.text[start..start + split].into());
start += split;
}
} }
/// Score `word` in the context of `previous` word /// Score `word` in the context of `previous` word
fn search(&mut self, text: &'a str, previous: Option<&str>) -> (f64, Vec<&'a str>) { fn search(&mut self, text: &'a str, previous: Option<&str>) -> (f64, Vec<usize>) {
if text.is_empty() { if text.is_empty() {
return (0.0, vec![]); return (0.0, vec![]);
} }
@@ -124,14 +126,14 @@ impl<'a> SegmentState<'a> {
let prefix_score = self.data.score(prefix, previous).log10(); let prefix_score = self.data.score(prefix, previous).log10();
let pair = (suffix, prefix); let pair = (suffix, prefix);
let (suffix_score, suffix_words) = match self.memo.get(&pair) { let (suffix_score, suffix_splits) = match self.memo.get(&pair) {
Some((score, words)) => (*score, words.as_slice()), Some((score, splits)) => (*score, splits.as_slice()),
None => { None => {
let (suffix_score, suffix_words) = self.search(&suffix, Some(prefix)); let (suffix_score, suffix_splits) = self.search(&suffix, Some(prefix));
let value = self let value = self
.memo .memo
.entry(pair) .entry(pair)
.or_insert((suffix_score, suffix_words)); .or_insert((suffix_score, suffix_splits));
(suffix_score, value.1.as_slice()) (suffix_score, value.1.as_slice())
} }
}; };
@@ -140,8 +142,8 @@ impl<'a> SegmentState<'a> {
if score > best.0 { if score > best.0 {
best.0 = score; best.0 = score;
best.1.clear(); best.1.clear();
best.1.push(prefix); best.1.push(split);
best.1.extend(suffix_words); best.1.extend(suffix_splits);
} }
} }