Avoid allocations for split vectors
This commit is contained in:
parent
62bb7130b6
commit
2f9cb95b5c
47
src/lib.rs
47
src/lib.rs
|
@ -79,6 +79,7 @@ struct SegmentState<'a> {
|
||||||
memo: HashMap<(&'a str, &'a str), (f64, Range<usize>)>,
|
memo: HashMap<(&'a str, &'a str), (f64, Range<usize>)>,
|
||||||
split_cache: Vec<usize>,
|
split_cache: Vec<usize>,
|
||||||
result: &'a mut Vec<String>,
|
result: &'a mut Vec<String>,
|
||||||
|
best: Vec<Vec<usize>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> SegmentState<'a> {
|
impl<'a> SegmentState<'a> {
|
||||||
|
@ -89,6 +90,7 @@ impl<'a> SegmentState<'a> {
|
||||||
memo: HashMap::new(),
|
memo: HashMap::new(),
|
||||||
split_cache: Vec::new(),
|
split_cache: Vec::new(),
|
||||||
result,
|
result,
|
||||||
|
best: vec![vec![]; SEGMENT_SIZE],
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -98,32 +100,34 @@ impl<'a> SegmentState<'a> {
|
||||||
loop {
|
loop {
|
||||||
end = self.text.len().min(end + SEGMENT_SIZE);
|
end = self.text.len().min(end + SEGMENT_SIZE);
|
||||||
let prefix = &self.text[start..end];
|
let prefix = &self.text[start..end];
|
||||||
let window_splits = self.search(&prefix, None).1;
|
if self.search(0, &prefix, None).1 {
|
||||||
|
let splits = &self.best[0];
|
||||||
for split in &window_splits[..window_splits.len().saturating_sub(5)] {
|
for split in &splits[..splits.len().saturating_sub(5)] {
|
||||||
self.result.push(self.text[start..start + split].into());
|
self.result.push(self.text[start..start + split].into());
|
||||||
start += split;
|
start += split;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if end == self.text.len() {
|
if end == self.text.len() {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let window_splits = self.search(&self.text[start..], None).1;
|
if self.search(0, &self.text[start..], None).1 {
|
||||||
for split in window_splits {
|
for split in &self.best[0] {
|
||||||
self.result.push(self.text[start..start + split].into());
|
self.result.push(self.text[start..start + split].into());
|
||||||
start += split;
|
start += split;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Score `word` in the context of `previous` word
|
|
||||||
fn search(&mut self, text: &'a str, previous: Option<&str>) -> (f64, Vec<usize>) {
|
|
||||||
if text.is_empty() {
|
|
||||||
return (0.0, vec![]);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut best = (f64::MIN, vec![]);
|
/// Score `word` in the context of `previous` word
|
||||||
|
fn search(&mut self, level: usize, text: &'a str, previous: Option<&str>) -> (f64, bool) {
|
||||||
|
if text.is_empty() {
|
||||||
|
return (0.0, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut best = f64::MIN;
|
||||||
for split in 1..(text.len().min(self.data.limit) + 1) {
|
for split in 1..(text.len().min(self.data.limit) + 1) {
|
||||||
let (prefix, suffix) = text.split_at(split);
|
let (prefix, suffix) = text.split_at(split);
|
||||||
let prefix_score = self.data.score(prefix, previous).log10();
|
let prefix_score = self.data.score(prefix, previous).log10();
|
||||||
|
@ -132,9 +136,13 @@ impl<'a> SegmentState<'a> {
|
||||||
let (suffix_score, suffix_splits) = match self.memo.get(&pair) {
|
let (suffix_score, suffix_splits) = match self.memo.get(&pair) {
|
||||||
Some((score, splits)) => (*score, &self.split_cache[splits.start..splits.end]),
|
Some((score, splits)) => (*score, &self.split_cache[splits.start..splits.end]),
|
||||||
None => {
|
None => {
|
||||||
let (suffix_score, suffix_splits) = self.search(&suffix, Some(prefix));
|
let (suffix_score, has_splits) = self.search(level + 1, &suffix, Some(prefix));
|
||||||
let start = self.split_cache.len();
|
let start = self.split_cache.len();
|
||||||
self.split_cache.extend(&suffix_splits);
|
self.split_cache.extend(if has_splits {
|
||||||
|
&self.best[level + 1][..]
|
||||||
|
} else {
|
||||||
|
&[]
|
||||||
|
});
|
||||||
let end = self.split_cache.len();
|
let end = self.split_cache.len();
|
||||||
self.memo.insert(pair, (suffix_score, start..end));
|
self.memo.insert(pair, (suffix_score, start..end));
|
||||||
(suffix_score, &self.split_cache[start..end])
|
(suffix_score, &self.split_cache[start..end])
|
||||||
|
@ -142,15 +150,16 @@ impl<'a> SegmentState<'a> {
|
||||||
};
|
};
|
||||||
|
|
||||||
let score = prefix_score + suffix_score;
|
let score = prefix_score + suffix_score;
|
||||||
if score > best.0 {
|
if score > best {
|
||||||
best.0 = score;
|
best = score;
|
||||||
best.1.clear();
|
let splits = &mut self.best[level];
|
||||||
best.1.push(split);
|
splits.clear();
|
||||||
best.1.extend(suffix_splits);
|
splits.push(split);
|
||||||
|
splits.extend(suffix_splits);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
best
|
(best, true)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue