Abstract over test data format code and API

Dirkjan Ochtman 2020-11-24 10:38:19 +01:00
parent 54c6e64e21
commit 540348f703
6 changed files with 75 additions and 91 deletions

GitHub Actions CI workflow

@@ -33,7 +33,7 @@ jobs:
       - uses: actions-rs/cargo@v1
         with:
           command: test
-          args: --workspace
+          args: --workspace --all-features

   lint:
     runs-on: ubuntu-latest

Cargo.toml

@@ -9,6 +9,9 @@ homepage = "https://github.com/InstantDomainSearch/word-segmenters"
 repository = "https://github.com/InstantDomainSearch/word-segmenters"
 documentation = "https://docs.rs/word-segmenters"

+[features]
+__test_data = []
+
 [dependencies]
 ahash = "0.6.1"
 smartstring = "0.2.5"

Benchmarks

@@ -1,19 +1,12 @@
-use std::path::PathBuf;
+#![cfg(feature = "__test_data")]

 use bencher::{benchmark_group, benchmark_main, Bencher};
-use word_segmenters::Segmenter;

 benchmark_group!(benches, short);
 benchmark_main!(benches);

 fn short(bench: &mut Bencher) {
-    let segmenter = Segmenter::from_dir(&PathBuf::from(format!(
-        "{}/data",
-        env!("CARGO_MANIFEST_DIR")
-    )))
-    .unwrap();
+    let segmenter = word_segmenters::test_data::segmenter();
     let mut out = Vec::new();
     bench.iter(|| segmenter.segment("thisisatest", &mut out));
 }

src/lib.rs

@@ -1,16 +1,15 @@
-use std::{
-    fs::File,
-    io::{self, BufRead, BufReader},
-    num::ParseIntError,
-    ops::Range,
-    path::Path,
-    str::FromStr,
-};
+use std::error::Error;
+use std::io;
+use std::num::ParseIntError;
+use std::ops::Range;

 use ahash::AHashMap as HashMap;
 use smartstring::alias::String;
 use thiserror::Error;

+#[cfg(feature = "__test_data")]
+pub mod test_data;
+
 pub struct Segmenter {
     unigrams: HashMap<String, f64>,
     bigrams: HashMap<(String, String), f64>,
@@ -19,15 +18,18 @@ pub struct Segmenter {
 }

 impl Segmenter {
-    /// Create `Segmenter` from files in the given directory
+    /// Create `Segmenter` from the given iterators
     ///
-    /// Reads from `unigrams.txt` and `bigrams.txt` in `dir`.
-    pub fn from_dir(dir: &Path) -> Result<Self, ParseError> {
-        let uni_file = dir.join("unigrams.txt");
-        let bi_file = dir.join("bigrams.txt");
+    /// Note: the `String` types used in this API are defined in the `smartstring` crate. Any
+    /// `&str` or `String` can be converted into the `String` used here by calling `into()` on it.
+    pub fn from_iters<'a, U, B>(unigrams: U, bigrams: B) -> Result<Self, Box<dyn Error>>
+    where
+        U: Iterator<Item = Result<(String, f64), Box<dyn Error>>>,
+        B: Iterator<Item = Result<((String, String), f64), Box<dyn Error>>>,
+    {
         Ok(Self {
-            unigrams: parse_unigrams(BufReader::new(File::open(&uni_file)?), uni_file.to_str())?,
-            bigrams: parse_bigrams(BufReader::new(File::open(&bi_file)?), bi_file.to_str())?,
+            unigrams: unigrams.collect::<Result<HashMap<_, _>, _>>()?,
+            bigrams: bigrams.collect::<Result<HashMap<_, _>, _>>()?,
             limit: DEFAULT_LIMIT,
             total: DEFAULT_TOTAL,
         })
@@ -149,63 +151,6 @@ impl<'a> SegmentState<'a> {
     }
 }

-/// Parse unigrams from the `reader` (format: `<word>\t<int>\n`)
-///
-/// The optional `name` argument may be used to provide a source name for error messages.
-pub fn parse_unigrams<R: BufRead>(
-    reader: R,
-    name: Option<&str>,
-) -> Result<HashMap<String, f64>, ParseError> {
-    let name = name.unwrap_or("(unnamed)");
-    reader
-        .lines()
-        .enumerate()
-        .map(|(i, ln)| {
-            let ln = ln?;
-            let split = ln
-                .find('\t')
-                .ok_or_else(|| format!("no tab found in {:?}:{}", name, i))?;
-            let word = ln[..split].into();
-            let p = usize::from_str(&ln[split + 1..])
-                .map_err(|e| format!("error at {:?}:{}: {}", name, i, e))?;
-            Ok((word, p as f64))
-        })
-        .collect()
-}
-
-/// Parse bigrams from the `reader` (format: `<word-1> <word-2>\t<int>\n`)
-///
-/// The optional `name` argument may be used to provide a source name for error messages.
-pub fn parse_bigrams<R: BufRead>(
-    reader: R,
-    name: Option<&str>,
-) -> Result<HashMap<(String, String), f64>, ParseError> {
-    let name = name.unwrap_or("(unnamed)");
-    reader
-        .lines()
-        .enumerate()
-        .map(|(i, ln)| {
-            let ln = ln?;
-            let word_split = ln
-                .find(' ')
-                .ok_or_else(|| format!("no space found in {:?}:{}", name, i))?;
-            let score_split = ln[word_split + 1..]
-                .find('\t')
-                .ok_or_else(|| format!("no tab found in {:?}:{}", name, i))?
-                + word_split
-                + 1;
-            let word1 = ln[..word_split].into();
-            let word2 = ln[word_split + 1..score_split].into();
-            let p = usize::from_str(&ln[score_split + 1..])
-                .map_err(|e| format!("error at {:?}:{}: {}", name, i, e))?;
-            Ok(((word1, word2), p as f64))
-        })
-        .collect()
-}
-
 /// Iterator that yields `(prefix, suffix)` pairs from `text`
 struct TextDivider<'a> {
     text: &'a str,
@@ -265,7 +210,7 @@ const DEFAULT_TOTAL: f64 = 1_024_908_267_229.0;
 const SEGMENT_SIZE: usize = 250;

 #[cfg(test)]
-mod tests {
+pub mod tests {
     #[test]
     fn test_clean() {
         assert_eq!(&super::clean("Can't buy me love!"), "cantbuymelove");

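For context, here is a minimal sketch of what the new constructor looks like from a caller's side. The tiny in-memory counts, the variable names, and the `main` wrapper are invented for illustration; only the `from_iters` signature, the `smartstring` alias, and the `segment(text, &mut out)` call shape come from the diff itself.

use std::error::Error;

use smartstring::alias::String as SmartString;
use word_segmenters::Segmenter;

fn main() -> Result<(), Box<dyn Error>> {
    // Hypothetical in-memory counts; real callers can stream these from any
    // source and format they like, which is the point of the new API.
    let unigrams: Vec<Result<(SmartString, f64), Box<dyn Error>>> = vec![
        Ok(("this".into(), 100.0)),
        Ok(("is".into(), 80.0)),
        Ok(("a".into(), 120.0)),
        Ok(("test".into(), 60.0)),
    ];
    let bigrams: Vec<Result<((SmartString, SmartString), f64), Box<dyn Error>>> = vec![
        Ok((("this".into(), "is".into()), 40.0)),
        Ok((("a".into(), "test".into()), 30.0)),
    ];

    let segmenter = Segmenter::from_iters(unigrams.into_iter(), bigrams.into_iter())?;

    // Same call shape as in the benchmark above: `segment` fills `out`.
    let mut out = Vec::new();
    segmenter.segment("thisisatest", &mut out);
    println!("segmented into {} pieces", out.len());
    Ok(())
}
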
src/test_data.rs (new file, 49 lines)

@@ -0,0 +1,49 @@
+#![cfg(feature = "__test_data")]
+
+use std::fs::File;
+use std::io::{BufRead, BufReader};
+use std::path::PathBuf;
+use std::str::FromStr;
+
+use super::Segmenter;
+
+pub fn segmenter() -> Segmenter {
+    let dir = PathBuf::from(format!("{}/data", env!("CARGO_MANIFEST_DIR")));
+
+    let uni_file = dir.join("unigrams.txt");
+    let reader = BufReader::new(File::open(&uni_file).unwrap());
+    let unigrams = reader.lines().enumerate().map(move |(i, ln)| {
+        let ln = ln?;
+        let split = ln
+            .find('\t')
+            .ok_or_else(|| format!("no tab found in {:?}:{}", uni_file, i))?;
+        let word = ln[..split].into();
+        let p = usize::from_str(&ln[split + 1..])
+            .map_err(|e| format!("error at {:?}:{}: {}", uni_file, i, e))?;
+        Ok((word, p as f64))
+    });
+
+    let bi_file = dir.join("bigrams.txt");
+    let reader = BufReader::new(File::open(&bi_file).unwrap());
+    let bigrams = reader.lines().enumerate().map(move |(i, ln)| {
+        let ln = ln?;
+        let word_split = ln
+            .find(' ')
+            .ok_or_else(|| format!("no space found in {:?}:{}", bi_file, i))?;
+        let score_split = ln[word_split + 1..]
+            .find('\t')
+            .ok_or_else(|| format!("no tab found in {:?}:{}", bi_file, i))?
+            + word_split
+            + 1;
+        let word1 = ln[..word_split].into();
+        let word2 = ln[word_split + 1..score_split].into();
+        let p = usize::from_str(&ln[score_split + 1..])
+            .map_err(|e| format!("error at {:?}:{}: {}", bi_file, i, e))?;
+        Ok(((word1, word2), p as f64))
+    });
+
+    Segmenter::from_iters(unigrams, bigrams).unwrap()
+}

Test suite

@@ -1,4 +1,4 @@
-use std::path::PathBuf;
+#![cfg(feature = "__test_data")]

 use once_cell::sync::Lazy;
@@ -170,10 +170,4 @@ fn test_segment_12() {
     ]);
 }

-static SEGMENTER: Lazy<Segmenter> = Lazy::new(|| {
-    Segmenter::from_dir(&PathBuf::from(format!(
-        "{}/data",
-        env!("CARGO_MANIFEST_DIR")
-    )))
-    .unwrap()
-});
+static SEGMENTER: Lazy<Segmenter> = Lazy::new(|| word_segmenters::test_data::segmenter());
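
Since the library no longer prescribes an on-disk format, any source that can be turned into the right iterators now plugs into `Segmenter::from_iters`. A hedged sketch, assuming a made-up `word,count` line format; the function name, the format, and the error strings below are illustrative and not part of the crate:

use std::error::Error;
use std::io::BufRead;

use smartstring::alias::String as SmartString;

// Hypothetical loader for an alternative unigram format (one `word,count`
// pair per line). Only the iterator item type matters to `from_iters`, so
// all parsing stays on the caller's side.
fn unigrams_from_csv<R: BufRead>(
    reader: R,
) -> impl Iterator<Item = Result<(SmartString, f64), Box<dyn Error>>> {
    reader
        .lines()
        .map(|ln| -> Result<(SmartString, f64), Box<dyn Error>> {
            let ln = ln?;
            let (word, count) = ln.split_once(',').ok_or("line is missing a comma")?;
            Ok((word.into(), count.trim().parse::<u64>()? as f64))
        })
}

A bigram loader would look the same with a `((String, String), f64)` item type, after which `Segmenter::from_iters(unigrams, bigrams)` is used exactly as in `src/test_data.rs` above.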