Avoid allocations for split vectors

This commit is contained in:
Dirkjan Ochtman 2020-11-26 09:45:06 +01:00
parent 62bb7130b6
commit 2f9cb95b5c
1 changed files with 29 additions and 20 deletions

View File

@ -79,6 +79,7 @@ struct SegmentState<'a> {
memo: HashMap<(&'a str, &'a str), (f64, Range<usize>)>, memo: HashMap<(&'a str, &'a str), (f64, Range<usize>)>,
split_cache: Vec<usize>, split_cache: Vec<usize>,
result: &'a mut Vec<String>, result: &'a mut Vec<String>,
best: Vec<Vec<usize>>,
} }
impl<'a> SegmentState<'a> { impl<'a> SegmentState<'a> {
@ -89,6 +90,7 @@ impl<'a> SegmentState<'a> {
memo: HashMap::new(), memo: HashMap::new(),
split_cache: Vec::new(), split_cache: Vec::new(),
result, result,
best: vec![vec![]; SEGMENT_SIZE],
} }
} }
@ -98,32 +100,34 @@ impl<'a> SegmentState<'a> {
loop { loop {
end = self.text.len().min(end + SEGMENT_SIZE); end = self.text.len().min(end + SEGMENT_SIZE);
let prefix = &self.text[start..end]; let prefix = &self.text[start..end];
let window_splits = self.search(&prefix, None).1; if self.search(0, &prefix, None).1 {
let splits = &self.best[0];
for split in &window_splits[..window_splits.len().saturating_sub(5)] { for split in &splits[..splits.len().saturating_sub(5)] {
self.result.push(self.text[start..start + split].into()); self.result.push(self.text[start..start + split].into());
start += split; start += split;
} }
}
if end == self.text.len() { if end == self.text.len() {
break; break;
} }
} }
let window_splits = self.search(&self.text[start..], None).1; if self.search(0, &self.text[start..], None).1 {
for split in window_splits { for split in &self.best[0] {
self.result.push(self.text[start..start + split].into()); self.result.push(self.text[start..start + split].into());
start += split; start += split;
} }
} }
/// Score `word` in the context of `previous` word
fn search(&mut self, text: &'a str, previous: Option<&str>) -> (f64, Vec<usize>) {
if text.is_empty() {
return (0.0, vec![]);
} }
let mut best = (f64::MIN, vec![]); /// Score `word` in the context of `previous` word
fn search(&mut self, level: usize, text: &'a str, previous: Option<&str>) -> (f64, bool) {
if text.is_empty() {
return (0.0, false);
}
let mut best = f64::MIN;
for split in 1..(text.len().min(self.data.limit) + 1) { for split in 1..(text.len().min(self.data.limit) + 1) {
let (prefix, suffix) = text.split_at(split); let (prefix, suffix) = text.split_at(split);
let prefix_score = self.data.score(prefix, previous).log10(); let prefix_score = self.data.score(prefix, previous).log10();
@ -132,9 +136,13 @@ impl<'a> SegmentState<'a> {
let (suffix_score, suffix_splits) = match self.memo.get(&pair) { let (suffix_score, suffix_splits) = match self.memo.get(&pair) {
Some((score, splits)) => (*score, &self.split_cache[splits.start..splits.end]), Some((score, splits)) => (*score, &self.split_cache[splits.start..splits.end]),
None => { None => {
let (suffix_score, suffix_splits) = self.search(&suffix, Some(prefix)); let (suffix_score, has_splits) = self.search(level + 1, &suffix, Some(prefix));
let start = self.split_cache.len(); let start = self.split_cache.len();
self.split_cache.extend(&suffix_splits); self.split_cache.extend(if has_splits {
&self.best[level + 1][..]
} else {
&[]
});
let end = self.split_cache.len(); let end = self.split_cache.len();
self.memo.insert(pair, (suffix_score, start..end)); self.memo.insert(pair, (suffix_score, start..end));
(suffix_score, &self.split_cache[start..end]) (suffix_score, &self.split_cache[start..end])
@ -142,15 +150,16 @@ impl<'a> SegmentState<'a> {
}; };
let score = prefix_score + suffix_score; let score = prefix_score + suffix_score;
if score > best.0 { if score > best {
best.0 = score; best = score;
best.1.clear(); let splits = &mut self.best[level];
best.1.push(split); splits.clear();
best.1.extend(suffix_splits); splits.push(split);
splits.extend(suffix_splits);
} }
} }
best (best, true)
} }
} }