Make slicing cheaper by adding a little unsafe code

This commit is contained in:
Dirkjan Ochtman 2020-11-26 11:03:09 +01:00
parent bb1b1db9c5
commit bc20e39c1e
1 changed file with 39 additions and 23 deletions

View File

@ -1,7 +1,8 @@
use std::error::Error; use std::error::Error;
use std::io; use std::io;
use std::num::ParseIntError; use std::num::ParseIntError;
use std::ops::Range; use std::ops::{Index, Range};
use std::str;
use ahash::AHashMap as HashMap; use ahash::AHashMap as HashMap;
use smartstring::alias::String; use smartstring::alias::String;
@ -37,8 +38,7 @@ impl Segmenter {
/// Appends list of words that is the best segmentation of `text` to `out` /// Appends list of words that is the best segmentation of `text` to `out`
pub fn segment(&self, text: &str, out: &mut Vec<String>) { pub fn segment(&self, text: &str, out: &mut Vec<String>) {
let clean = clean(text); SegmentState::new(&Ascii::new(text), &self, out).run()
SegmentState::new(&clean, &self, out).run()
} }
fn score(&self, word: &str, previous: Option<&str>) -> f64 { fn score(&self, word: &str, previous: Option<&str>) -> f64 {
@ -75,15 +75,15 @@ impl Segmenter {
struct SegmentState<'a> { struct SegmentState<'a> {
data: &'a Segmenter, data: &'a Segmenter,
text: &'a str, text: &'a Ascii,
memo: HashMap<(&'a str, &'a str), (f64, Range<usize>)>, memo: HashMap<(Range<usize>, Range<usize>), (f64, Range<usize>)>,
split_cache: Vec<usize>, split_cache: Vec<usize>,
result: &'a mut Vec<String>, result: &'a mut Vec<String>,
best: Vec<Vec<usize>>, best: Vec<Vec<usize>>,
} }
impl<'a> SegmentState<'a> { impl<'a> SegmentState<'a> {
fn new(text: &'a str, data: &'a Segmenter, result: &'a mut Vec<String>) -> Self { fn new(text: &'a Ascii, data: &'a Segmenter, result: &'a mut Vec<String>) -> Self {
Self { Self {
data, data,
text, text,
@ -123,13 +123,11 @@ impl<'a> SegmentState<'a> {
let mut best = f64::MIN; let mut best = f64::MIN;
for split in 1..(range.len().min(self.data.limit) + 1) { for split in 1..(range.len().min(self.data.limit) + 1) {
let (start, end) = (range.start, range.end); let (start, split, end) = (range.start, range.start + split, range.end);
let (prefix, suffix) = self.text[start..end].split_at(split); let prefix = &self.text[start..split];
let split = start + split;
let prefix_score = self.data.score(prefix, previous).log10(); let prefix_score = self.data.score(prefix, previous).log10();
let pair = (suffix, prefix);
let pair = (split..end, start..split);
let (suffix_score, suffix_splits) = match self.memo.get(&pair) { let (suffix_score, suffix_splits) = match self.memo.get(&pair) {
Some((score, splits)) => (*score, &self.split_cache[splits.start..splits.end]), Some((score, splits)) => (*score, &self.split_cache[splits.start..splits.end]),
None => { None => {
@ -161,17 +159,34 @@ impl<'a> SegmentState<'a> {
} }
} }
/// Return `text` lower-cased with non-alphanumeric characters removed struct Ascii(Vec<u8>);
fn clean(s: &str) -> String {
impl Ascii {
fn new(s: &str) -> Self {
Self(
s.chars() s.chars()
.filter_map(|c| { .filter_map(|c| match c.is_ascii_alphanumeric() {
if c.is_ascii_alphanumeric() { true => Some(c.to_ascii_lowercase()),
Some(c.to_ascii_lowercase()) false => None,
} else {
None
}
}) })
.collect() .collect::<std::string::String>()
.into_bytes(),
)
}
/// Returns the number of bytes held in the wrapped `Vec<u8>`.
fn len(&self) -> usize {
self.0.len()
}
}
impl Index<Range<usize>> for Ascii {
type Output = str;
/// Returns the bytes covered by `index` as a `&str`, skipping UTF-8 validation.
///
/// # Panics
///
/// Panics if `index` is out of bounds for the underlying byte vector; the
/// slice of the inner `Vec<u8>` below is still bounds-checked.
fn index(&self, index: Range<usize>) -> &Self::Output {
// Bounds-checked slice of the stored bytes; only the UTF-8 check is skipped.
let bytes = self.0.index(index);
// SAFETY: `Ascii::new` keeps only characters for which
// `is_ascii_alphanumeric()` is true, so the buffer is pure ASCII and any
// byte range of it is valid UTF-8. This holds as long as `Ascii::new` is
// the only place an `Ascii` is constructed.
unsafe { str::from_utf8_unchecked(bytes) }
}
} }
#[derive(Debug, Error)] #[derive(Debug, Error)]
@ -198,6 +213,7 @@ const SEGMENT_SIZE: usize = 250;
pub mod tests { pub mod tests {
#[test] #[test]
fn test_clean() { fn test_clean() {
assert_eq!(&super::clean("Can't buy me love!"), "cantbuymelove"); let text = super::Ascii::new("Can't buy me love!");
assert_eq!(&text[0..text.len()], "cantbuymelove");
} }
} }