Prevent allocations where possible

This commit is contained in:
Dirkjan Ochtman 2020-11-23 13:52:51 +01:00
parent 10afa21349
commit 0d7fbd53e7
4 changed files with 31 additions and 20 deletions

View File

@ -11,6 +11,7 @@ documentation = "https://docs.rs/word-segmenters"
[dependencies]
ahash = "0.6.1"
smartstring = "0.2.5"
thiserror = "1.0.22"
[dev-dependencies]

View File

@ -14,5 +14,6 @@ fn short(bench: &mut Bencher) {
)))
.unwrap();
bench.iter(|| segmenter.segment("thisisatest"));
let mut out = Vec::new();
bench.iter(|| segmenter.segment("thisisatest", &mut out));
}

View File

@ -8,6 +8,7 @@ use std::{
};
use ahash::AHashMap as HashMap;
use smartstring::alias::String;
use thiserror::Error;
pub struct Segmenter {
@ -32,10 +33,10 @@ impl Segmenter {
})
}
/// Returns a list of words that is the best segmentation of `text`
pub fn segment(&self, text: &str) -> Vec<String> {
/// Appends the list of words that is the best segmentation of `text` to `out`
pub fn segment(&self, text: &str, out: &mut Vec<String>) {
let clean = clean(text);
SegmentState::new(&clean, &self).run()
SegmentState::new(&clean, &self, out).run()
}
fn score(&self, word: &str, previous: Option<&str>) -> f64 {
@ -76,21 +77,21 @@ struct SegmentState<'a> {
data: &'a Segmenter,
text: &'a str,
memo: HashMap<(&'a str, &'a str), (f64, Vec<&'a str>)>,
result: Vec<String>,
result: &'a mut Vec<String>,
}
impl<'a> SegmentState<'a> {
fn new(text: &'a str, data: &'a Segmenter) -> Self {
fn new(text: &'a str, data: &'a Segmenter, result: &'a mut Vec<String>) -> Self {
Self {
data,
text,
memo: HashMap::new(),
result: Vec::new(),
result,
}
}
/// Returns a list of words that is the best segmentation of `text`
pub fn run(mut self) -> Vec<String> {
fn run(mut self) {
let (mut start, mut end) = (0, 0);
loop {
end = self.text.len().min(end + SEGMENT_SIZE);
@ -109,8 +110,7 @@ impl<'a> SegmentState<'a> {
let window_words = self.search(&self.text[start..], "<s>").1;
self.result
.extend(window_words.into_iter().map(|s| s.to_owned()));
self.result
.extend(window_words.into_iter().map(|s| s.into()));
}
/// Score `word` in the context of `previous` word
@ -164,11 +164,11 @@ pub fn parse_unigrams<R: BufRead>(
let ln = ln?;
let split = ln
.find('\t')
.ok_or_else(|| ParseError::String(format!("no tab found in {:?}:{}", name, i)))?;
.ok_or_else(|| format!("no tab found in {:?}:{}", name, i))?;
let word = ln[..split].to_owned();
let word = ln[..split].into();
let p = usize::from_str(&ln[split + 1..])
.map_err(|e| ParseError::String(format!("error at {:?}:{}: {}", name, i, e)))?;
.map_err(|e| format!("error at {:?}:{}: {}", name, i, e))?;
Ok((word, p as f64))
})
.collect()
@ -189,17 +189,17 @@ pub fn parse_bigrams<R: BufRead>(
let ln = ln?;
let word_split = ln
.find(' ')
.ok_or_else(|| ParseError::String(format!("no space found in {:?}:{}", name, i)))?;
.ok_or_else(|| format!("no space found in {:?}:{}", name, i))?;
let score_split = ln[word_split + 1..]
.find('\t')
.ok_or_else(|| ParseError::String(format!("no tab found in {:?}:{}", name, i)))?
.ok_or_else(|| format!("no tab found in {:?}:{}", name, i))?
+ word_split
+ 1;
let word1 = ln[..word_split].to_owned();
let word2 = ln[word_split + 1..score_split].to_owned();
let word1 = ln[..word_split].into();
let word2 = ln[word_split + 1..score_split].into();
let p = usize::from_str(&ln[score_split + 1..])
.map_err(|e| ParseError::String(format!("error at {:?}:{}: {}", name, i, e)))?;
.map_err(|e| format!("error at {:?}:{}: {}", name, i, e))?;
Ok(((word1, word2), p as f64))
})
@ -254,6 +254,12 @@ pub enum ParseError {
String(String),
}
/// Wraps a standard-library string in `ParseError::String`, so that a plain
/// `std::string::String` message can be turned into a `ParseError` via `From`/`?`.
impl From<std::string::String> for ParseError {
fn from(s: std::string::String) -> Self {
// `.into()` converts `s` to the payload type of the `String` variant.
ParseError::String(s.into())
}
}
const DEFAULT_LIMIT: usize = 24;
const DEFAULT_TOTAL: f64 = 1_024_908_267_229.0;
const SEGMENT_SIZE: usize = 250;
@ -262,6 +268,6 @@ const SEGMENT_SIZE: usize = 250;
mod tests {
#[test]
fn test_clean() {
assert_eq!(super::clean("Can't buy me love!"), "cantbuymelove");
assert_eq!(&super::clean("Can't buy me love!"), "cantbuymelove");
}
}

View File

@ -6,7 +6,10 @@ use word_segmenters::Segmenter;
macro_rules! assert_segments {
($list:expr) => {
assert_eq!(SEGMENTER.segment(&$list.join("")), $list);
let mut out = Vec::new();
SEGMENTER.segment(&$list.join(""), &mut out);
let cmp = out.iter().map(|s| &*s).collect::<Vec<_>>();
assert_eq!(cmp, $list);
};
}