Simplify API by moving result data into Search

This commit is contained in:
Dirkjan Ochtman 2021-02-10 13:03:06 +01:00
parent 9735e64ee4
commit d190aa5240
4 changed files with 20 additions and 17 deletions

View File

@ -1,6 +1,6 @@
[package] [package]
name = "instant-segment" name = "instant-segment"
version = "0.5.1" version = "0.6.0"
authors = ["Dirkjan Ochtman <dirkjan@ochtman.nl>"] authors = ["Dirkjan Ochtman <dirkjan@ochtman.nl>"]
edition = "2018" edition = "2018"
license = "Apache-2.0" license = "Apache-2.0"

View File

@ -7,7 +7,6 @@ benchmark_main!(benches);
fn short(bench: &mut Bencher) { fn short(bench: &mut Bencher) {
let segmenter = instant_segment::test_data::segmenter(); let segmenter = instant_segment::test_data::segmenter();
let mut out = Vec::new();
let mut search = instant_segment::Search::default(); let mut search = instant_segment::Search::default();
bench.iter(|| segmenter.segment("thisisatest", &mut out, &mut search)); bench.iter(|| segmenter.segment("thisisatest", &mut search));
} }

View File

@ -51,18 +51,19 @@ impl Segmenter {
} }
} }
/// Appends list of words that is the best segmentation of `text` to `out` /// Segment the text in `input`
/// ///
/// Requires that the input `text` consists of lowercase ASCII characters only. Otherwise, /// Requires that the input `text` consists of lowercase ASCII characters only. Otherwise,
/// returns `Err(InvalidCharacter)`. The `search` parameter contains caches that are used /// returns `Err(InvalidCharacter)`. The `search` parameter contains caches that are used
/// during segmentation; passing it in allows the callers to reuse the cache allocations. /// during segmentation; passing it in allows the callers to reuse the cache allocations.
///
/// The segmentation result can be retrieved through the `Search::split()` method.
pub fn segment( pub fn segment(
&self, &self,
text: &str, input: &str,
out: &mut Vec<String>,
search: &mut Search, search: &mut Search,
) -> Result<(), InvalidCharacter> { ) -> Result<(), InvalidCharacter> {
SegmentState::new(Ascii::new(text)?, &self, out, search).run(); SegmentState::new(Ascii::new(input)?, &self, search).run();
Ok(()) Ok(())
} }
@ -101,7 +102,6 @@ impl Segmenter {
struct SegmentState<'a> { struct SegmentState<'a> {
data: &'a Segmenter, data: &'a Segmenter,
text: Ascii<'a>, text: Ascii<'a>,
result: &'a mut Vec<String>,
search: &'a mut Search, search: &'a mut Search,
} }
@ -109,14 +109,12 @@ impl<'a> SegmentState<'a> {
fn new( fn new(
text: Ascii<'a>, text: Ascii<'a>,
data: &'a Segmenter, data: &'a Segmenter,
result: &'a mut Vec<String>,
search: &'a mut Search, search: &'a mut Search,
) -> Self { ) -> Self {
search.clear(); search.clear();
Self { Self {
data, data,
text, text,
result,
search, search,
} }
} }
@ -134,7 +132,7 @@ impl<'a> SegmentState<'a> {
} }
for &split in splits { for &split in splits {
self.result.push(self.text[start..split].into()); self.search.result.push(self.text[start..split].into());
start = split; start = split;
} }
} }
@ -191,6 +189,7 @@ pub struct Search {
memo: HashMap<MemoKey, (f64, Range<usize>)>, memo: HashMap<MemoKey, (f64, Range<usize>)>,
split_cache: Vec<usize>, split_cache: Vec<usize>,
best: Vec<Vec<usize>>, best: Vec<Vec<usize>>,
result: Vec<String>,
} }
impl Default for Search { impl Default for Search {
@ -199,6 +198,7 @@ impl Default for Search {
memo: HashMap::default(), memo: HashMap::default(),
split_cache: Vec::with_capacity(32), split_cache: Vec::with_capacity(32),
best: vec![vec![]; SEGMENT_SIZE], best: vec![vec![]; SEGMENT_SIZE],
result: Vec::new(),
} }
} }
} }
@ -210,6 +210,12 @@ impl Search {
for inner in self.best.iter_mut() { for inner in self.best.iter_mut() {
inner.clear(); inner.clear();
} }
self.result.clear();
}
/// Get the segmentation result
pub fn split(&self) -> impl Iterator<Item = &str> {
self.result.iter().map(|v| v.as_str())
} }
} }

View File

@ -10,16 +10,14 @@ pub fn run(segmenter: &Segmenter) {
} }
pub fn assert_segments(s: &[&str], search: &mut Search, segmenter: &Segmenter) { pub fn assert_segments(s: &[&str], search: &mut Search, segmenter: &Segmenter) {
let mut out = Vec::new(); segmenter.segment(&s.join(""), search).unwrap();
segmenter.segment(&s.join(""), &mut out, search).unwrap(); let cmp = search.split().collect::<Vec<_>>();
let cmp = out.iter().map(|s| &*s).collect::<Vec<_>>();
assert_eq!(cmp, s); assert_eq!(cmp, s);
} }
pub fn check_segments(s: &[&str], search: &mut Search, segmenter: &Segmenter) -> bool { pub fn check_segments(s: &[&str], search: &mut Search, segmenter: &Segmenter) -> bool {
let mut out = Vec::new(); match segmenter.segment(&s.join(""), search) {
match segmenter.segment(&s.join(""), &mut out, search) { Ok(()) => s == search.split().collect::<Vec<_>>(),
Ok(()) => s == out.iter().map(|s| &*s).collect::<Vec<_>>(),
Err(_) => false, Err(_) => false,
} }
} }