Abstract over test data format code and API

parent 54c6e64e21
commit 540348f703
@@ -33,7 +33,7 @@ jobs:
       - uses: actions-rs/cargo@v1
         with:
           command: test
-          args: --workspace
+          args: --workspace --all-features
 
   lint:
     runs-on: ubuntu-latest
Cargo.toml

@@ -9,6 +9,9 @@ homepage = "https://github.com/InstantDomainSearch/word-segmenters"
 repository = "https://github.com/InstantDomainSearch/word-segmenters"
 documentation = "https://docs.rs/word-segmenters"
 
+[features]
+__test_data = []
+
 [dependencies]
 ahash = "0.6.1"
 smartstring = "0.2.5"
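The double leading underscore in `__test_data` follows a common convention for features that are internal implementation details rather than supported API: it gates the fixture-loading code shared by the benchmark and the integration tests, which is presumably also why the CI test step above now passes `--all-features`. The benchmark target then switches to the shared fixture: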
@@ -1,19 +1,12 @@
-use std::path::PathBuf;
+#![cfg(feature = "__test_data")]
 
 use bencher::{benchmark_group, benchmark_main, Bencher};
 
-use word_segmenters::Segmenter;
-
 benchmark_group!(benches, short);
 benchmark_main!(benches);
 
 fn short(bench: &mut Bencher) {
-    let segmenter = Segmenter::from_dir(&PathBuf::from(format!(
-        "{}/data",
-        env!("CARGO_MANIFEST_DIR")
-    )))
-    .unwrap();
-
+    let segmenter = word_segmenters::test_data::segmenter();
     let mut out = Vec::new();
     bench.iter(|| segmenter.segment("thisisatest", &mut out));
 }
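Because `#![cfg(feature = "__test_data")]` is an inner attribute covering the whole file, the benchmark compiles to an empty target unless the feature is enabled, e.g. `cargo bench --features __test_data` (or `--all-features`).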
src/lib.rs

@@ -1,16 +1,15 @@
-use std::{
-    fs::File,
-    io::{self, BufRead, BufReader},
-    num::ParseIntError,
-    ops::Range,
-    path::Path,
-    str::FromStr,
-};
+use std::error::Error;
+use std::io;
+use std::num::ParseIntError;
+use std::ops::Range;
 
 use ahash::AHashMap as HashMap;
 use smartstring::alias::String;
 use thiserror::Error;
 
+#[cfg(feature = "__test_data")]
+pub mod test_data;
+
 pub struct Segmenter {
     unigrams: HashMap<String, f64>,
     bigrams: HashMap<(String, String), f64>,
@@ -19,15 +18,18 @@ pub struct Segmenter {
 }
 
 impl Segmenter {
-    /// Create `Segmenter` from files in the given directory
+    /// Create `Segmenter` from the given iterators
     ///
-    /// Reads from `unigrams.txt` and `bigrams.txt` in `dir`.
-    pub fn from_dir(dir: &Path) -> Result<Self, ParseError> {
-        let uni_file = dir.join("unigrams.txt");
-        let bi_file = dir.join("bigrams.txt");
+    /// Note: the `String` types used in this API are defined in the `smartstring` crate. Any
+    /// `&str` or `String` can be converted into the `String` used here by calling `into()` on it.
+    pub fn from_iters<'a, U, B>(unigrams: U, bigrams: B) -> Result<Self, Box<dyn Error>>
+    where
+        U: Iterator<Item = Result<(String, f64), Box<dyn Error>>>,
+        B: Iterator<Item = Result<((String, String), f64), Box<dyn Error>>>,
+    {
         Ok(Self {
-            unigrams: parse_unigrams(BufReader::new(File::open(&uni_file)?), uni_file.to_str())?,
-            bigrams: parse_bigrams(BufReader::new(File::open(&bi_file)?), bi_file.to_str())?,
+            unigrams: unigrams.collect::<Result<HashMap<_, _>, _>>()?,
+            bigrams: bigrams.collect::<Result<HashMap<_, _>, _>>()?,
             limit: DEFAULT_LIMIT,
             total: DEFAULT_TOTAL,
        })
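The new constructor decouples `Segmenter` from any particular storage: data can now come from files, embedded arrays, a database, and so on. A hedged sketch (not part of the commit) of feeding it from in-memory data, assuming `segment` fills a `Vec` of `smartstring` `String`s as the benchmark diff above suggests; the scores here are arbitrary, while real data uses large corpus counts:

// Sketch: building a Segmenter from in-memory data via the new API.
use std::error::Error;

use smartstring::alias::String;
use word_segmenters::Segmenter;

fn main() -> Result<(), Box<dyn Error>> {
    let unigrams = [("this", 50.0), ("is", 40.0), ("a", 30.0), ("test", 20.0)]
        .iter()
        .map(|&(w, p)| -> Result<(String, f64), Box<dyn Error>> { Ok((w.into(), p)) });
    let bigrams = [(("this", "is"), 10.0), (("is", "a"), 8.0)]
        .iter()
        .map(|&((a, b), p)| -> Result<((String, String), f64), Box<dyn Error>> {
            Ok(((a.into(), b.into()), p))
        });

    let segmenter = Segmenter::from_iters(unigrams, bigrams)?;
    let mut out = Vec::new();
    segmenter.segment("thisisatest", &mut out);
    println!("{:?}", out); // output depends on the supplied scores
    Ok(())
}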
@@ -149,63 +151,6 @@ impl<'a> SegmentState<'a> {
     }
 }
 
-/// Parse unigrams from the `reader` (format: `<word>\t<int>\n`)
-///
-/// The optional `name` argument may be used to provide a source name for error messages.
-pub fn parse_unigrams<R: BufRead>(
-    reader: R,
-    name: Option<&str>,
-) -> Result<HashMap<String, f64>, ParseError> {
-    let name = name.unwrap_or("(unnamed)");
-    reader
-        .lines()
-        .enumerate()
-        .map(|(i, ln)| {
-            let ln = ln?;
-            let split = ln
-                .find('\t')
-                .ok_or_else(|| format!("no tab found in {:?}:{}", name, i))?;
-
-            let word = ln[..split].into();
-            let p = usize::from_str(&ln[split + 1..])
-                .map_err(|e| format!("error at {:?}:{}: {}", name, i, e))?;
-            Ok((word, p as f64))
-        })
-        .collect()
-}
-
-/// Parse bigrams from the `reader` (format: `<word-1> <word-2>\t<int>\n`)
-///
-/// The optional `name` argument may be used to provide a source name for error messages.
-pub fn parse_bigrams<R: BufRead>(
-    reader: R,
-    name: Option<&str>,
-) -> Result<HashMap<(String, String), f64>, ParseError> {
-    let name = name.unwrap_or("(unnamed)");
-    reader
-        .lines()
-        .enumerate()
-        .map(|(i, ln)| {
-            let ln = ln?;
-            let word_split = ln
-                .find(' ')
-                .ok_or_else(|| format!("no space found in {:?}:{}", name, i))?;
-            let score_split = ln[word_split + 1..]
-                .find('\t')
-                .ok_or_else(|| format!("no tab found in {:?}:{}", name, i))?
-                + word_split
-                + 1;
-
-            let word1 = ln[..word_split].into();
-            let word2 = ln[word_split + 1..score_split].into();
-            let p = usize::from_str(&ln[score_split + 1..])
-                .map_err(|e| format!("error at {:?}:{}: {}", name, i, e))?;
-
-            Ok(((word1, word2), p as f64))
-        })
-        .collect()
-}
-
 /// Iterator that yields `(prefix, suffix)` pairs from `text`
 struct TextDivider<'a> {
     text: &'a str,
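With `parse_unigrams` and `parse_bigrams` deleted, the tab-separated text format is no longer part of the library's public API; callers that previously pointed `from_dir` at a data directory would now supply their own iterators to `Segmenter::from_iters`, in whatever storage format suits them.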
@@ -265,7 +210,7 @@ const DEFAULT_TOTAL: f64 = 1_024_908_267_229.0;
 const SEGMENT_SIZE: usize = 250;
 
 #[cfg(test)]
-mod tests {
+pub mod tests {
     #[test]
     fn test_clean() {
         assert_eq!(&super::clean("Can't buy me love!"), "cantbuymelove");
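The next hunk adds the fixture loader as a new file, presumably src/test_data.rs given the `pub mod test_data;` declaration in src/lib.rs and its `use super::Segmenter;` import: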
@@ -0,0 +1,49 @@
+#![cfg(feature = "__test_data")]
+
+use std::fs::File;
+use std::io::{BufRead, BufReader};
+use std::path::PathBuf;
+use std::str::FromStr;
+
+use super::Segmenter;
+
+pub fn segmenter() -> Segmenter {
+    let dir = PathBuf::from(format!("{}/data", env!("CARGO_MANIFEST_DIR")));
+
+    let uni_file = dir.join("unigrams.txt");
+    let reader = BufReader::new(File::open(&uni_file).unwrap());
+    let unigrams = reader.lines().enumerate().map(move |(i, ln)| {
+        let ln = ln?;
+        let split = ln
+            .find('\t')
+            .ok_or_else(|| format!("no tab found in {:?}:{}", uni_file, i))?;
+
+        let word = ln[..split].into();
+        let p = usize::from_str(&ln[split + 1..])
+            .map_err(|e| format!("error at {:?}:{}: {}", uni_file, i, e))?;
+        Ok((word, p as f64))
+    });
+
+    let bi_file = dir.join("bigrams.txt");
+    let reader = BufReader::new(File::open(&bi_file).unwrap());
+    let bigrams = reader.lines().enumerate().map(move |(i, ln)| {
+        let ln = ln?;
+        let word_split = ln
+            .find(' ')
+            .ok_or_else(|| format!("no space found in {:?}:{}", bi_file, i))?;
+        let score_split = ln[word_split + 1..]
+            .find('\t')
+            .ok_or_else(|| format!("no tab found in {:?}:{}", bi_file, i))?
+            + word_split
+            + 1;
+
+        let word1 = ln[..word_split].into();
+        let word2 = ln[word_split + 1..score_split].into();
+        let p = usize::from_str(&ln[score_split + 1..])
+            .map_err(|e| format!("error at {:?}:{}: {}", bi_file, i, e))?;
+
+        Ok(((word1, word2), p as f64))
+    });
+
+    Segmenter::from_iters(unigrams, bigrams).unwrap()
+}
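Two details worth noting in the loader: the `map` closures yield `Result<_, Box<dyn Error>>` items, so per-line I/O and parse failures propagate out through `Segmenter::from_iters` rather than panicking mid-iteration (only the up-front `File::open` calls unwrap, which is reasonable for a test-only fixture), and the line formats match the doc comments on the removed parsers: `<word>\t<int>` per unigram line, `<word-1> <word-2>\t<int>` per bigram line. A minimal sketch of the bigram split arithmetic, on a made-up line:

// Sketch only; "new york\t42" is made-up data in the bigram line format.
fn main() {
    let ln = "new york\t42";
    let word_split = ln.find(' ').unwrap();
    // The tab is searched after the first word, so its index is offset back.
    let score_split = ln[word_split + 1..].find('\t').unwrap() + word_split + 1;
    assert_eq!(&ln[..word_split], "new");
    assert_eq!(&ln[word_split + 1..score_split], "york");
    assert_eq!(&ln[score_split + 1..], "42");
}

Finally, the integration tests switch to the same loader: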
@@ -1,4 +1,4 @@
-use std::path::PathBuf;
+#![cfg(feature = "__test_data")]
 
 use once_cell::sync::Lazy;
 

@@ -170,10 +170,4 @@ fn test_segment_12() {
     ]);
 }
 
-static SEGMENTER: Lazy<Segmenter> = Lazy::new(|| {
-    Segmenter::from_dir(&PathBuf::from(format!(
-        "{}/data",
-        env!("CARGO_MANIFEST_DIR")
-    )))
-    .unwrap()
-});
+static SEGMENTER: Lazy<Segmenter> = Lazy::new(|| word_segmenters::test_data::segmenter());
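As before, the `Lazy` static parses the data files only once per test process; its body now just delegates to the shared `word_segmenters::test_data::segmenter()` fixture.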