Don't normalize input strings implicitly

This commit is contained in:
Dirkjan Ochtman 2021-02-08 15:53:24 +01:00
parent 8c08bb9e14
commit be0f8c0ed7
2 changed files with 35 additions and 21 deletions

View File

@ -52,8 +52,11 @@ impl Segmenter {
} }
/// Appends list of words that is the best segmentation of `text` to `out` /// Appends list of words that is the best segmentation of `text` to `out`
pub fn segment(&self, text: &str, out: &mut Vec<String>) { ///
SegmentState::new(&Ascii::new(text), &self, out).run() /// Requires that the input `text` consists of lowercase ASCII characters only. Otherwise,
/// returns `Err(InvalidCharacter)`.
pub fn segment(&self, text: &str, out: &mut Vec<String>) -> Result<(), InvalidCharacter> {
Ok(SegmentState::new(Ascii::new(text)?, &self, out).run())
} }
fn score(&self, word: &str, previous: Option<&str>) -> f64 { fn score(&self, word: &str, previous: Option<&str>) -> f64 {
@ -90,7 +93,7 @@ impl Segmenter {
struct SegmentState<'a> { struct SegmentState<'a> {
data: &'a Segmenter, data: &'a Segmenter,
text: &'a Ascii, text: Ascii<'a>,
memo: HashMap<MemoKey, (f64, Range<usize>)>, memo: HashMap<MemoKey, (f64, Range<usize>)>,
split_cache: Vec<usize>, split_cache: Vec<usize>,
result: &'a mut Vec<String>, result: &'a mut Vec<String>,
@ -98,7 +101,7 @@ struct SegmentState<'a> {
} }
impl<'a> SegmentState<'a> { impl<'a> SegmentState<'a> {
fn new(text: &'a Ascii, data: &'a Segmenter, result: &'a mut Vec<String>) -> Self { fn new(text: Ascii<'a>, data: &'a Segmenter, result: &'a mut Vec<String>) -> Self {
Self { Self {
data, data,
text, text,
@ -172,19 +175,16 @@ impl<'a> SegmentState<'a> {
type MemoKey = (Range<usize>, Range<usize>); type MemoKey = (Range<usize>, Range<usize>);
struct Ascii(Vec<u8>); #[derive(Debug)]
struct Ascii<'a>(&'a [u8]);
impl Ascii { impl<'a> Ascii<'a> {
fn new(s: &str) -> Self { fn new(s: &'a str) -> Result<Self, InvalidCharacter> {
Self( let bytes = s.as_bytes();
s.chars() match bytes.iter().all(|b| b.is_ascii_lowercase()) {
.filter_map(|c| match c.is_ascii_alphanumeric() { true => Ok(Self(bytes)),
true => Some(c.to_ascii_lowercase()), false => Err(InvalidCharacter),
false => None, }
})
.collect::<std::string::String>()
.into_bytes(),
)
} }
fn len(&self) -> usize { fn len(&self) -> usize {
@ -192,7 +192,7 @@ impl Ascii {
} }
} }
impl Index<Range<usize>> for Ascii { impl<'a> Index<Range<usize>> for Ascii<'a> {
type Output = str; type Output = str;
fn index(&self, index: Range<usize>) -> &Self::Output { fn index(&self, index: Range<usize>) -> &Self::Output {
@ -202,6 +202,17 @@ impl Index<Range<usize>> for Ascii {
} }
} }
/// Error returned when the input text contains anything other than lowercase ASCII letters.
///
/// The type carries no payload; it only signals that validation of the input failed.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct InvalidCharacter;

impl std::error::Error for InvalidCharacter {}

impl std::fmt::Display for InvalidCharacter {
    /// Writes the fixed message `invalid character`.
    ///
    /// The message goes through `Formatter::pad` rather than `write_str` so that width, fill,
    /// alignment and precision flags supplied by the caller (e.g. `{:>20}`) are respected.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.pad("invalid character")
    }
}
type HashMap<K, V> = std::collections::HashMap<K, V, ahash::RandomState>; type HashMap<K, V> = std::collections::HashMap<K, V, ahash::RandomState>;
const DEFAULT_LIMIT: usize = 24; const DEFAULT_LIMIT: usize = 24;
@ -211,7 +222,8 @@ const SEGMENT_SIZE: usize = 250;
pub mod tests { pub mod tests {
#[test] #[test]
fn test_clean() { fn test_clean() {
let text = super::Ascii::new("Can't buy me love!"); super::Ascii::new("Can't buy me love!").unwrap_err();
let text = super::Ascii::new("cantbuymelove").unwrap();
assert_eq!(&text[0..text.len()], "cantbuymelove"); assert_eq!(&text[0..text.len()], "cantbuymelove");
} }
} }

View File

@ -10,15 +10,17 @@ pub fn run(segmenter: &Segmenter) {
pub fn assert_segments(segmenter: &Segmenter, s: &[&str]) { pub fn assert_segments(segmenter: &Segmenter, s: &[&str]) {
let mut out = Vec::new(); let mut out = Vec::new();
segmenter.segment(&s.join(""), &mut out); segmenter.segment(&s.join(""), &mut out).unwrap();
let cmp = out.iter().map(|s| &*s).collect::<Vec<_>>(); let cmp = out.iter().map(|s| &*s).collect::<Vec<_>>();
assert_eq!(cmp, s); assert_eq!(cmp, s);
} }
pub fn check_segments(segmenter: &Segmenter, s: &[&str]) -> bool { pub fn check_segments(segmenter: &Segmenter, s: &[&str]) -> bool {
let mut out = Vec::new(); let mut out = Vec::new();
segmenter.segment(&s.join(""), &mut out); match segmenter.segment(&s.join(""), &mut out) {
s == out.iter().map(|s| &*s).collect::<Vec<_>>() Ok(()) => s == out.iter().map(|s| &*s).collect::<Vec<_>>(),
Err(_) => false,
}
} }
/// Built-in test cases /// Built-in test cases