Abstract over test data format code and API

Dirkjan Ochtman 2020-11-24 10:38:19 +01:00
parent 54c6e64e21
commit 540348f703
6 changed files with 75 additions and 91 deletions

GitHub Actions CI workflow

@@ -33,7 +33,7 @@ jobs:
       - uses: actions-rs/cargo@v1
         with:
           command: test
-          args: --workspace
+          args: --workspace --all-features

   lint:
     runs-on: ubuntu-latest

Cargo.toml

@@ -9,6 +9,9 @@ homepage = "https://github.com/InstantDomainSearch/word-segmenters"
 repository = "https://github.com/InstantDomainSearch/word-segmenters"
 documentation = "https://docs.rs/word-segmenters"

+[features]
+__test_data = []
+
 [dependencies]
 ahash = "0.6.1"
 smartstring = "0.2.5"

Benchmarks

@@ -1,19 +1,12 @@
-use std::path::PathBuf;
+#![cfg(feature = "__test_data")]

 use bencher::{benchmark_group, benchmark_main, Bencher};
-use word_segmenters::Segmenter;

 benchmark_group!(benches, short);
 benchmark_main!(benches);

 fn short(bench: &mut Bencher) {
-    let segmenter = Segmenter::from_dir(&PathBuf::from(format!(
-        "{}/data",
-        env!("CARGO_MANIFEST_DIR")
-    )))
-    .unwrap();
+    let segmenter = word_segmenters::test_data::segmenter();
     let mut out = Vec::new();
     bench.iter(|| segmenter.segment("thisisatest", &mut out));
 }

src/lib.rs

@@ -1,16 +1,15 @@
-use std::{
-    fs::File,
-    io::{self, BufRead, BufReader},
-    num::ParseIntError,
-    ops::Range,
-    path::Path,
-    str::FromStr,
-};
+use std::error::Error;
+use std::io;
+use std::num::ParseIntError;
+use std::ops::Range;

 use ahash::AHashMap as HashMap;
 use smartstring::alias::String;
 use thiserror::Error;

+#[cfg(feature = "__test_data")]
+pub mod test_data;
+
 pub struct Segmenter {
     unigrams: HashMap<String, f64>,
     bigrams: HashMap<(String, String), f64>,
@@ -19,15 +18,18 @@ pub struct Segmenter {
 }

 impl Segmenter {
-    /// Create `Segmenter` from files in the given directory
+    /// Create `Segmenter` from the given iterators
     ///
-    /// Reads from `unigrams.txt` and `bigrams.txt` in `dir`.
-    pub fn from_dir(dir: &Path) -> Result<Self, ParseError> {
-        let uni_file = dir.join("unigrams.txt");
-        let bi_file = dir.join("bigrams.txt");
+    /// Note: the `String` types used in this API are defined in the `smartstring` crate. Any
+    /// `&str` or `String` can be converted into the `String` used here by calling `into()` on it.
+    pub fn from_iters<'a, U, B>(unigrams: U, bigrams: B) -> Result<Self, Box<dyn Error>>
+    where
+        U: Iterator<Item = Result<(String, f64), Box<dyn Error>>>,
+        B: Iterator<Item = Result<((String, String), f64), Box<dyn Error>>>,
+    {
         Ok(Self {
-            unigrams: parse_unigrams(BufReader::new(File::open(&uni_file)?), uni_file.to_str())?,
-            bigrams: parse_bigrams(BufReader::new(File::open(&bi_file)?), bi_file.to_str())?,
+            unigrams: unigrams.collect::<Result<HashMap<_, _>, _>>()?,
+            bigrams: bigrams.collect::<Result<HashMap<_, _>, _>>()?,
             limit: DEFAULT_LIMIT,
             total: DEFAULT_TOTAL,
         })
@@ -149,63 +151,6 @@ impl<'a> SegmentState<'a> {
     }
 }

-/// Parse unigrams from the `reader` (format: `<word>\t<int>\n`)
-///
-/// The optional `name` argument may be used to provide a source name for error messages.
-pub fn parse_unigrams<R: BufRead>(
-    reader: R,
-    name: Option<&str>,
-) -> Result<HashMap<String, f64>, ParseError> {
-    let name = name.unwrap_or("(unnamed)");
-    reader
-        .lines()
-        .enumerate()
-        .map(|(i, ln)| {
-            let ln = ln?;
-            let split = ln
-                .find('\t')
-                .ok_or_else(|| format!("no tab found in {:?}:{}", name, i))?;
-            let word = ln[..split].into();
-            let p = usize::from_str(&ln[split + 1..])
-                .map_err(|e| format!("error at {:?}:{}: {}", name, i, e))?;
-            Ok((word, p as f64))
-        })
-        .collect()
-}
-
-/// Parse bigrams from the `reader` (format: `<word-1> <word-2>\t<int>\n`)
-///
-/// The optional `name` argument may be used to provide a source name for error messages.
-pub fn parse_bigrams<R: BufRead>(
-    reader: R,
-    name: Option<&str>,
-) -> Result<HashMap<(String, String), f64>, ParseError> {
-    let name = name.unwrap_or("(unnamed)");
-    reader
-        .lines()
-        .enumerate()
-        .map(|(i, ln)| {
-            let ln = ln?;
-            let word_split = ln
-                .find(' ')
-                .ok_or_else(|| format!("no space found in {:?}:{}", name, i))?;
-            let score_split = ln[word_split + 1..]
-                .find('\t')
-                .ok_or_else(|| format!("no tab found in {:?}:{}", name, i))?
-                + word_split
-                + 1;
-            let word1 = ln[..word_split].into();
-            let word2 = ln[word_split + 1..score_split].into();
-            let p = usize::from_str(&ln[score_split + 1..])
-                .map_err(|e| format!("error at {:?}:{}: {}", name, i, e))?;
-            Ok(((word1, word2), p as f64))
-        })
-        .collect()
-}
-
 /// Iterator that yields `(prefix, suffix)` pairs from `text`
 struct TextDivider<'a> {
     text: &'a str,
@@ -265,7 +210,7 @@ const DEFAULT_TOTAL: f64 = 1_024_908_267_229.0;
 const SEGMENT_SIZE: usize = 250;

 #[cfg(test)]
-mod tests {
+pub mod tests {
     #[test]
     fn test_clean() {
         assert_eq!(&super::clean("Can't buy me love!"), "cantbuymelove");

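For context, here is a minimal sketch of what the new constructor looks like from a caller's side. The tiny in-memory counts, the variable names, and the `main` wrapper are invented for illustration; only the `from_iters` signature, the `smartstring` alias, and the `segment(text, &mut out)` call shape come from the diff itself.

use std::error::Error;

use smartstring::alias::String as SmartString;
use word_segmenters::Segmenter;

fn main() -> Result<(), Box<dyn Error>> {
    // Hypothetical in-memory counts; real callers can stream these from any
    // source and format they like, which is the point of the new API.
    let unigrams: Vec<Result<(SmartString, f64), Box<dyn Error>>> = vec![
        Ok(("this".into(), 100.0)),
        Ok(("is".into(), 80.0)),
        Ok(("a".into(), 120.0)),
        Ok(("test".into(), 60.0)),
    ];
    let bigrams: Vec<Result<((SmartString, SmartString), f64), Box<dyn Error>>> = vec![
        Ok((("this".into(), "is".into()), 40.0)),
        Ok((("a".into(), "test".into()), 30.0)),
    ];

    let segmenter = Segmenter::from_iters(unigrams.into_iter(), bigrams.into_iter())?;

    // Same call shape as in the benchmark above: `segment` fills `out`.
    let mut out = Vec::new();
    segmenter.segment("thisisatest", &mut out);
    println!("segmented into {} pieces", out.len());
    Ok(())
}
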
src/test_data.rs (new file, 49 lines)

@@ -0,0 +1,49 @@
+#![cfg(feature = "__test_data")]
+
+use std::fs::File;
+use std::io::{BufRead, BufReader};
+use std::path::PathBuf;
+use std::str::FromStr;
+
+use super::Segmenter;
+
+pub fn segmenter() -> Segmenter {
+    let dir = PathBuf::from(format!("{}/data", env!("CARGO_MANIFEST_DIR")));
+
+    let uni_file = dir.join("unigrams.txt");
+    let reader = BufReader::new(File::open(&uni_file).unwrap());
+    let unigrams = reader.lines().enumerate().map(move |(i, ln)| {
+        let ln = ln?;
+        let split = ln
+            .find('\t')
+            .ok_or_else(|| format!("no tab found in {:?}:{}", uni_file, i))?;
+        let word = ln[..split].into();
+        let p = usize::from_str(&ln[split + 1..])
+            .map_err(|e| format!("error at {:?}:{}: {}", uni_file, i, e))?;
+        Ok((word, p as f64))
+    });
+
+    let bi_file = dir.join("bigrams.txt");
+    let reader = BufReader::new(File::open(&bi_file).unwrap());
+    let bigrams = reader.lines().enumerate().map(move |(i, ln)| {
+        let ln = ln?;
+        let word_split = ln
+            .find(' ')
+            .ok_or_else(|| format!("no space found in {:?}:{}", bi_file, i))?;
+        let score_split = ln[word_split + 1..]
+            .find('\t')
+            .ok_or_else(|| format!("no tab found in {:?}:{}", bi_file, i))?
+            + word_split
+            + 1;
+        let word1 = ln[..word_split].into();
+        let word2 = ln[word_split + 1..score_split].into();
+        let p = usize::from_str(&ln[score_split + 1..])
+            .map_err(|e| format!("error at {:?}:{}: {}", bi_file, i, e))?;
+        Ok(((word1, word2), p as f64))
+    });
+
+    Segmenter::from_iters(unigrams, bigrams).unwrap()
+}

Test suite

@@ -1,4 +1,4 @@
-use std::path::PathBuf;
+#![cfg(feature = "__test_data")]

 use once_cell::sync::Lazy;
@@ -170,10 +170,4 @@ fn test_segment_12() {
     ]);
 }

-static SEGMENTER: Lazy<Segmenter> = Lazy::new(|| {
-    Segmenter::from_dir(&PathBuf::from(format!(
-        "{}/data",
-        env!("CARGO_MANIFEST_DIR")
-    )))
-    .unwrap()
-});
+static SEGMENTER: Lazy<Segmenter> = Lazy::new(|| word_segmenters::test_data::segmenter());
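
Since the library no longer prescribes an on-disk format, any source that can be turned into the right iterators now plugs into `Segmenter::from_iters`. A hedged sketch, assuming a made-up `word,count` line format; the function name, the format, and the error strings below are illustrative and not part of the crate:

use std::error::Error;
use std::io::BufRead;

use smartstring::alias::String as SmartString;

// Hypothetical loader for an alternative unigram format (one `word,count`
// pair per line). Only the iterator item type matters to `from_iters`, so
// all parsing stays on the caller's side.
fn unigrams_from_csv<R: BufRead>(
    reader: R,
) -> impl Iterator<Item = Result<(SmartString, f64), Box<dyn Error>>> {
    reader
        .lines()
        .map(|ln| -> Result<(SmartString, f64), Box<dyn Error>> {
            let ln = ln?;
            let (word, count) = ln.split_once(',').ok_or("line is missing a comma")?;
            Ok((word.into(), count.trim().parse::<u64>()? as f64))
        })
}

A bigram loader would look the same with a `((String, String), f64)` item type, after which `Segmenter::from_iters(unigrams, bigrams)` is used exactly as in `src/test_data.rs` above.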