Abstract over test data format code and API

This commit is contained in:
Dirkjan Ochtman 2020-11-24 10:38:19 +01:00
parent 54c6e64e21
commit 540348f703
6 changed files with 75 additions and 91 deletions

View File

@ -33,7 +33,7 @@ jobs:
- uses: actions-rs/cargo@v1
with:
command: test
args: --workspace
args: --workspace --all-features
lint:
runs-on: ubuntu-latest

View File

@ -9,6 +9,9 @@ homepage = "https://github.com/InstantDomainSearch/word-segmenters"
repository = "https://github.com/InstantDomainSearch/word-segmenters"
documentation = "https://docs.rs/word-segmenters"
[features]
__test_data = []
[dependencies]
ahash = "0.6.1"
smartstring = "0.2.5"

View File

@ -1,19 +1,12 @@
use std::path::PathBuf;
#![cfg(feature = "__test_data")]
use bencher::{benchmark_group, benchmark_main, Bencher};
use word_segmenters::Segmenter;
benchmark_group!(benches, short);
benchmark_main!(benches);
fn short(bench: &mut Bencher) {
let segmenter = Segmenter::from_dir(&PathBuf::from(format!(
"{}/data",
env!("CARGO_MANIFEST_DIR")
)))
.unwrap();
let segmenter = word_segmenters::test_data::segmenter();
let mut out = Vec::new();
bench.iter(|| segmenter.segment("thisisatest", &mut out));
}

View File

@ -1,16 +1,15 @@
use std::{
fs::File,
io::{self, BufRead, BufReader},
num::ParseIntError,
ops::Range,
path::Path,
str::FromStr,
};
use std::error::Error;
use std::io;
use std::num::ParseIntError;
use std::ops::Range;
use ahash::AHashMap as HashMap;
use smartstring::alias::String;
use thiserror::Error;
#[cfg(feature = "__test_data")]
pub mod test_data;
pub struct Segmenter {
unigrams: HashMap<String, f64>,
bigrams: HashMap<(String, String), f64>,
@ -19,15 +18,18 @@ pub struct Segmenter {
}
impl Segmenter {
/// Create `Segmenter` from files in the given directory
/// Create `Segmenter` from the given iterators
///
/// Reads from `unigrams.txt` and `bigrams.txt` in `dir`.
pub fn from_dir(dir: &Path) -> Result<Self, ParseError> {
let uni_file = dir.join("unigrams.txt");
let bi_file = dir.join("bigrams.txt");
/// Note: the `String` types used in this API are defined in the `smartstring` crate. Any
/// `&str` or `String` can be converted into the `String` used here by calling `into()` on it.
pub fn from_iters<'a, U, B>(unigrams: U, bigrams: B) -> Result<Self, Box<dyn Error>>
where
U: Iterator<Item = Result<(String, f64), Box<dyn Error>>>,
B: Iterator<Item = Result<((String, String), f64), Box<dyn Error>>>,
{
Ok(Self {
unigrams: parse_unigrams(BufReader::new(File::open(&uni_file)?), uni_file.to_str())?,
bigrams: parse_bigrams(BufReader::new(File::open(&bi_file)?), bi_file.to_str())?,
unigrams: unigrams.collect::<Result<HashMap<_, _>, _>>()?,
bigrams: bigrams.collect::<Result<HashMap<_, _>, _>>()?,
limit: DEFAULT_LIMIT,
total: DEFAULT_TOTAL,
})
@ -149,63 +151,6 @@ impl<'a> SegmentState<'a> {
}
}
/// Parse unigrams from the `reader` (format: `<word>\t<int>\n`)
///
/// The optional `name` argument may be used to provide a source name for error messages.
pub fn parse_unigrams<R: BufRead>(
reader: R,
name: Option<&str>,
) -> Result<HashMap<String, f64>, ParseError> {
let name = name.unwrap_or("(unnamed)");
reader
.lines()
.enumerate()
.map(|(i, ln)| {
let ln = ln?;
let split = ln
.find('\t')
.ok_or_else(|| format!("no tab found in {:?}:{}", name, i))?;
let word = ln[..split].into();
let p = usize::from_str(&ln[split + 1..])
.map_err(|e| format!("error at {:?}:{}: {}", name, i, e))?;
Ok((word, p as f64))
})
.collect()
}
/// Parse bigrams from the `reader` (format: `<word-1> <word-2>\t<int>\n`)
///
/// The optional `name` argument may be used to provide a source name for error messages.
pub fn parse_bigrams<R: BufRead>(
reader: R,
name: Option<&str>,
) -> Result<HashMap<(String, String), f64>, ParseError> {
let name = name.unwrap_or("(unnamed)");
reader
.lines()
.enumerate()
.map(|(i, ln)| {
let ln = ln?;
let word_split = ln
.find(' ')
.ok_or_else(|| format!("no space found in {:?}:{}", name, i))?;
let score_split = ln[word_split + 1..]
.find('\t')
.ok_or_else(|| format!("no tab found in {:?}:{}", name, i))?
+ word_split
+ 1;
let word1 = ln[..word_split].into();
let word2 = ln[word_split + 1..score_split].into();
let p = usize::from_str(&ln[score_split + 1..])
.map_err(|e| format!("error at {:?}:{}: {}", name, i, e))?;
Ok(((word1, word2), p as f64))
})
.collect()
}
/// Iterator that yields `(prefix, suffix)` pairs from `text`
struct TextDivider<'a> {
text: &'a str,
@ -265,7 +210,7 @@ const DEFAULT_TOTAL: f64 = 1_024_908_267_229.0;
const SEGMENT_SIZE: usize = 250;
#[cfg(test)]
mod tests {
pub mod tests {
#[test]
fn test_clean() {
assert_eq!(&super::clean("Can't buy me love!"), "cantbuymelove");

49
src/test_data.rs Normal file
View File

@ -0,0 +1,49 @@
#![cfg(feature = "__test_data")]
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::path::PathBuf;
use std::str::FromStr;
use super::Segmenter;
pub fn segmenter() -> Segmenter {
let dir = PathBuf::from(format!("{}/data", env!("CARGO_MANIFEST_DIR")));
let uni_file = dir.join("unigrams.txt");
let reader = BufReader::new(File::open(&uni_file).unwrap());
let unigrams = reader.lines().enumerate().map(move |(i, ln)| {
let ln = ln?;
let split = ln
.find('\t')
.ok_or_else(|| format!("no tab found in {:?}:{}", uni_file, i))?;
let word = ln[..split].into();
let p = usize::from_str(&ln[split + 1..])
.map_err(|e| format!("error at {:?}:{}: {}", uni_file, i, e))?;
Ok((word, p as f64))
});
let bi_file = dir.join("bigrams.txt");
let reader = BufReader::new(File::open(&bi_file).unwrap());
let bigrams = reader.lines().enumerate().map(move |(i, ln)| {
let ln = ln?;
let word_split = ln
.find(' ')
.ok_or_else(|| format!("no space found in {:?}:{}", bi_file, i))?;
let score_split = ln[word_split + 1..]
.find('\t')
.ok_or_else(|| format!("no tab found in {:?}:{}", bi_file, i))?
+ word_split
+ 1;
let word1 = ln[..word_split].into();
let word2 = ln[word_split + 1..score_split].into();
let p = usize::from_str(&ln[score_split + 1..])
.map_err(|e| format!("error at {:?}:{}: {}", bi_file, i, e))?;
Ok(((word1, word2), p as f64))
});
Segmenter::from_iters(unigrams, bigrams).unwrap()
}

View File

@ -1,4 +1,4 @@
use std::path::PathBuf;
#![cfg(feature = "__test_data")]
use once_cell::sync::Lazy;
@ -170,10 +170,4 @@ fn test_segment_12() {
]);
}
static SEGMENTER: Lazy<Segmenter> = Lazy::new(|| {
Segmenter::from_dir(&PathBuf::from(format!(
"{}/data",
env!("CARGO_MANIFEST_DIR")
)))
.unwrap()
});
static SEGMENTER: Lazy<Segmenter> = Lazy::new(|| word_segmenters::test_data::segmenter());