Prevent allocations where possible
This commit is contained in:
parent
10afa21349
commit
0d7fbd53e7
|
@ -11,6 +11,7 @@ documentation = "https://docs.rs/word-segmenters"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
ahash = "0.6.1"
|
ahash = "0.6.1"
|
||||||
|
smartstring = "0.2.5"
|
||||||
thiserror = "1.0.22"
|
thiserror = "1.0.22"
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
|
|
|
@ -14,5 +14,6 @@ fn short(bench: &mut Bencher) {
|
||||||
)))
|
)))
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
bench.iter(|| segmenter.segment("thisisatest"));
|
let mut out = Vec::new();
|
||||||
|
bench.iter(|| segmenter.segment("thisisatest", &mut out));
|
||||||
}
|
}
|
||||||
|
|
42
src/lib.rs
42
src/lib.rs
|
@ -8,6 +8,7 @@ use std::{
|
||||||
};
|
};
|
||||||
|
|
||||||
use ahash::AHashMap as HashMap;
|
use ahash::AHashMap as HashMap;
|
||||||
|
use smartstring::alias::String;
|
||||||
use thiserror::Error;
|
use thiserror::Error;
|
||||||
|
|
||||||
pub struct Segmenter {
|
pub struct Segmenter {
|
||||||
|
@ -32,10 +33,10 @@ impl Segmenter {
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns a list of words that is the best segmentation of `text`
|
/// Appends the list of words that is the best segmentation of `text` to `out`
|
||||||
pub fn segment(&self, text: &str) -> Vec<String> {
|
pub fn segment(&self, text: &str, out: &mut Vec<String>) {
|
||||||
let clean = clean(text);
|
let clean = clean(text);
|
||||||
SegmentState::new(&clean, &self).run()
|
SegmentState::new(&clean, &self, out).run()
|
||||||
}
|
}
|
||||||
|
|
||||||
fn score(&self, word: &str, previous: Option<&str>) -> f64 {
|
fn score(&self, word: &str, previous: Option<&str>) -> f64 {
|
||||||
|
@ -76,21 +77,21 @@ struct SegmentState<'a> {
|
||||||
data: &'a Segmenter,
|
data: &'a Segmenter,
|
||||||
text: &'a str,
|
text: &'a str,
|
||||||
memo: HashMap<(&'a str, &'a str), (f64, Vec<&'a str>)>,
|
memo: HashMap<(&'a str, &'a str), (f64, Vec<&'a str>)>,
|
||||||
result: Vec<String>,
|
result: &'a mut Vec<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> SegmentState<'a> {
|
impl<'a> SegmentState<'a> {
|
||||||
fn new(text: &'a str, data: &'a Segmenter) -> Self {
|
fn new(text: &'a str, data: &'a Segmenter, result: &'a mut Vec<String>) -> Self {
|
||||||
Self {
|
Self {
|
||||||
data,
|
data,
|
||||||
text,
|
text,
|
||||||
memo: HashMap::new(),
|
memo: HashMap::new(),
|
||||||
result: Vec::new(),
|
result,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns a list of words that is the best segmentation of `text`
|
/// Appends the list of words that is the best segmentation of `text` to `result`
|
||||||
pub fn run(mut self) -> Vec<String> {
|
fn run(mut self) {
|
||||||
let (mut start, mut end) = (0, 0);
|
let (mut start, mut end) = (0, 0);
|
||||||
loop {
|
loop {
|
||||||
end = self.text.len().min(end + SEGMENT_SIZE);
|
end = self.text.len().min(end + SEGMENT_SIZE);
|
||||||
|
@ -109,8 +110,7 @@ impl<'a> SegmentState<'a> {
|
||||||
|
|
||||||
let window_words = self.search(&self.text[start..], "<s>").1;
|
let window_words = self.search(&self.text[start..], "<s>").1;
|
||||||
self.result
|
self.result
|
||||||
.extend(window_words.into_iter().map(|s| s.to_owned()));
|
.extend(window_words.into_iter().map(|s| s.into()));
|
||||||
self.result
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Score `word` in the context of `previous` word
|
/// Score `word` in the context of `previous` word
|
||||||
|
@ -164,11 +164,11 @@ pub fn parse_unigrams<R: BufRead>(
|
||||||
let ln = ln?;
|
let ln = ln?;
|
||||||
let split = ln
|
let split = ln
|
||||||
.find('\t')
|
.find('\t')
|
||||||
.ok_or_else(|| ParseError::String(format!("no tab found in {:?}:{}", name, i)))?;
|
.ok_or_else(|| format!("no tab found in {:?}:{}", name, i))?;
|
||||||
|
|
||||||
let word = ln[..split].to_owned();
|
let word = ln[..split].into();
|
||||||
let p = usize::from_str(&ln[split + 1..])
|
let p = usize::from_str(&ln[split + 1..])
|
||||||
.map_err(|e| ParseError::String(format!("error at {:?}:{}: {}", name, i, e)))?;
|
.map_err(|e| format!("error at {:?}:{}: {}", name, i, e))?;
|
||||||
Ok((word, p as f64))
|
Ok((word, p as f64))
|
||||||
})
|
})
|
||||||
.collect()
|
.collect()
|
||||||
|
@ -189,17 +189,17 @@ pub fn parse_bigrams<R: BufRead>(
|
||||||
let ln = ln?;
|
let ln = ln?;
|
||||||
let word_split = ln
|
let word_split = ln
|
||||||
.find(' ')
|
.find(' ')
|
||||||
.ok_or_else(|| ParseError::String(format!("no space found in {:?}:{}", name, i)))?;
|
.ok_or_else(|| format!("no space found in {:?}:{}", name, i))?;
|
||||||
let score_split = ln[word_split + 1..]
|
let score_split = ln[word_split + 1..]
|
||||||
.find('\t')
|
.find('\t')
|
||||||
.ok_or_else(|| ParseError::String(format!("no tab found in {:?}:{}", name, i)))?
|
.ok_or_else(|| format!("no tab found in {:?}:{}", name, i))?
|
||||||
+ word_split
|
+ word_split
|
||||||
+ 1;
|
+ 1;
|
||||||
|
|
||||||
let word1 = ln[..word_split].to_owned();
|
let word1 = ln[..word_split].into();
|
||||||
let word2 = ln[word_split + 1..score_split].to_owned();
|
let word2 = ln[word_split + 1..score_split].into();
|
||||||
let p = usize::from_str(&ln[score_split + 1..])
|
let p = usize::from_str(&ln[score_split + 1..])
|
||||||
.map_err(|e| ParseError::String(format!("error at {:?}:{}: {}", name, i, e)))?;
|
.map_err(|e| format!("error at {:?}:{}: {}", name, i, e))?;
|
||||||
|
|
||||||
Ok(((word1, word2), p as f64))
|
Ok(((word1, word2), p as f64))
|
||||||
})
|
})
|
||||||
|
@ -254,6 +254,12 @@ pub enum ParseError {
|
||||||
String(String),
|
String(String),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl From<std::string::String> for ParseError {
|
||||||
|
fn from(s: std::string::String) -> Self {
|
||||||
|
ParseError::String(s.into())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
const DEFAULT_LIMIT: usize = 24;
|
const DEFAULT_LIMIT: usize = 24;
|
||||||
const DEFAULT_TOTAL: f64 = 1_024_908_267_229.0;
|
const DEFAULT_TOTAL: f64 = 1_024_908_267_229.0;
|
||||||
const SEGMENT_SIZE: usize = 250;
|
const SEGMENT_SIZE: usize = 250;
|
||||||
|
@ -262,6 +268,6 @@ const SEGMENT_SIZE: usize = 250;
|
||||||
mod tests {
|
mod tests {
|
||||||
#[test]
|
#[test]
|
||||||
fn test_clean() {
|
fn test_clean() {
|
||||||
assert_eq!(super::clean("Can't buy me love!"), "cantbuymelove");
|
assert_eq!(&super::clean("Can't buy me love!"), "cantbuymelove");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -6,7 +6,10 @@ use word_segmenters::Segmenter;
|
||||||
|
|
||||||
macro_rules! assert_segments {
|
macro_rules! assert_segments {
|
||||||
($list:expr) => {
|
($list:expr) => {
|
||||||
assert_eq!(SEGMENTER.segment(&$list.join("")), $list);
|
let mut out = Vec::new();
|
||||||
|
SEGMENTER.segment(&$list.join(""), &mut out);
|
||||||
|
let cmp = out.iter().map(|s| &*s).collect::<Vec<_>>();
|
||||||
|
assert_eq!(cmp, $list);
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue