Initial version (commit 38f9747c92)

@ -0,0 +1,62 @@
name: CI

on:
  push:
    branches: ['master']
  pull_request:

jobs:
  test:
    strategy:
      matrix:
        os: [ubuntu-latest, macos-latest, windows-latest]
        rust: [stable, beta]
        exclude:
          - os: macos-latest
            rust: beta
          - os: windows-latest
            rust: beta

    runs-on: ${{ matrix.os }}

    steps:
      - uses: actions/checkout@v1
      - uses: actions-rs/toolchain@v1
        with:
          profile: minimal
          toolchain: ${{ matrix.rust }}
          override: true
      - uses: actions-rs/cargo@v1
        with:
          command: build
          args: --workspace --all-targets
      - uses: actions-rs/cargo@v1
        with:
          command: test
          args: --workspace

  lint:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v1
      - uses: actions-rs/toolchain@v1
        with:
          profile: minimal
          toolchain: stable
          override: true
          components: rustfmt, clippy
      - uses: actions-rs/cargo@v1
        with:
          command: fmt
          args: --all -- --check
      - uses: actions-rs/cargo@v1
        if: always()
        with:
          command: clippy
          args: --workspace --all-targets -- -D warnings

  audit:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v1
      - uses: EmbarkStudios/cargo-deny-action@v0
@ -0,0 +1,2 @@
/target
Cargo.lock
@ -0,0 +1,12 @@
[package]
name = "word-segmenters"
version = "0.1.0"
authors = ["Dirkjan Ochtman <dirkjan@ochtman.nl>"]
edition = "2018"
license = "Apache-2.0"

[dependencies]
err-derive = "0.2.4"

[dev-dependencies]
once_cell = "1.4"
Three file diffs suppressed because they are too large.
@ -0,0 +1,4 @@
[licenses]
allow-osi-fsf-free = "either"
copyleft = "deny"
private = { ignore = true }
@ -0,0 +1,245 @@
use std::{
    collections::HashMap,
    fs::File,
    io::{self, BufRead, BufReader},
    num::ParseIntError,
    ops::Range,
    path::Path,
    str::FromStr,
};

use err_derive::Error;

pub struct Segmenter {
    unigrams: HashMap<String, f64>,
    bigrams: HashMap<(String, String), f64>,
    total: f64,
    limit: usize,
}

impl Segmenter {
    /// Create `Segmenter` from files in the given directory
    ///
    /// Reads from `unigrams.txt` and `bigrams.txt` in `dir`.
    pub fn from_dir(dir: &Path) -> Result<Self, ParseError> {
        let uni_file = dir.join("unigrams.txt");
        let bi_file = dir.join("bigrams.txt");
        Ok(Self {
            unigrams: parse_unigrams(BufReader::new(File::open(&uni_file)?), uni_file.to_str())?,
            bigrams: parse_bigrams(BufReader::new(File::open(&bi_file)?), bi_file.to_str())?,
            limit: DEFAULT_LIMIT,
            total: DEFAULT_TOTAL,
        })
    }

    /// Returns a list of words that is the best segmentation of `text`
    pub fn segment(&self, text: &str) -> Vec<String> {
        let clean = clean(text);
        let mut words = vec![];
        let mut memo = HashMap::new();

        let (mut start, mut end) = (0, 0);
        loop {
            end = clean.len().min(end + SEGMENT_SIZE);
            let prefix = &clean[start..end];
            let window_words = self.search(&prefix, "<s>", &mut memo).1;

            for word in &window_words[..window_words.len().saturating_sub(5)] {
                start += word.len();
                words.push(word.into());
            }

            if end == clean.len() {
                break;
            }
        }

        let mut window_words = self.search(&clean[start..], "<s>", &mut memo).1;
        words.append(&mut window_words);
        words
    }
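
    // Note on the windowing above (a reading of the code, not upstream
    // docs): the cleaned input is consumed in windows of up to
    // SEGMENT_SIZE (250) bytes, and the last five words of each window
    // are discarded and re-derived in the next window, so that words
    // straddling a window boundary are not cut short by it.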

    /// Search for the best segmentation of `text`, scoring its first word
    /// in the context of the `previous` word
    fn search(&self, text: &str, previous: &str, memo: &mut MemoMap) -> (f64, Vec<String>) {
        if text.is_empty() {
            return (0.0, vec![]);
        }

        let mut best = (f64::MIN, vec![]);
        for (prefix, suffix) in TextDivider::new(text, self.limit) {
            let prefix_score = self.score(prefix, Some(previous)).log10();
            let pair = (suffix.to_owned(), prefix.to_owned());

            let (suffix_score, suffix_words) = match memo.get(&pair) {
                Some((score, words)) => (*score, words.clone()),
                None => {
                    let (suffix_score, suffix_words) = self.search(&suffix, prefix, memo);
                    memo.insert(pair, (suffix_score, suffix_words.clone()));
                    (suffix_score, suffix_words)
                }
            };

            let score = prefix_score + suffix_score;
            if score > best.0 {
                best.0 = score;
                best.1.clear();
                best.1.push(prefix.to_owned());
                best.1.extend(suffix_words);
            }
        }

        best
    }

    fn score(&self, word: &str, previous: Option<&str>) -> f64 {
        match previous {
            None => match self.unigrams.get(word) {
                // Probability of the given word
                Some(p) => p / self.total,
                // Penalize words not found in the unigrams according
                // to their length, a crucial heuristic.
                None => 10.0 / (self.total * 10.0f64.powf(word.len() as f64)),
            },
            Some(prev) => match (
                self.bigrams.get(&(prev.into(), word.into())),
                self.unigrams.get(prev),
            ) {
                // Conditional probability of the word given the previous
                // word. The technical name is "stupid backoff"; it is not
                // a proper probability distribution, but it works well
                // in practice.
                (Some(pb), Some(_)) => pb / self.total / self.score(prev, None),
                // Fall back to using the unigram probability
                _ => self.score(word, None),
            },
        }
    }
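
    // Hypothetical worked example of the backoff scoring above (counts
    // invented for illustration): with total = 1e12, a unigram count of
    // 2e10 for "the", and a bigram count of 3e8 for ("the", "end"),
    // score("end", Some("the")) = (3e8 / 1e12) / (2e10 / 1e12) = 0.015,
    // i.e. count(prev, word) / count(prev); an unseen pair falls back
    // to the plain unigram score of `word`.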

    /// Customize the word length `limit`
    pub fn set_limit(&mut self, limit: usize) {
        self.limit = limit;
    }

    /// Customize the relative score by setting the `total`
    pub fn set_total(&mut self, total: f64) {
        self.total = total;
    }
}

/// Parse unigrams from the `reader` (format: `<word>\t<int>\n`)
///
/// The optional `name` argument may be used to provide a source name for error messages.
pub fn parse_unigrams<R: BufRead>(
    reader: R,
    name: Option<&str>,
) -> Result<HashMap<String, f64>, ParseError> {
    let name = name.unwrap_or("(unnamed)");
    reader
        .lines()
        .enumerate()
        .map(|(i, ln)| {
            let ln = ln?;
            let split = ln
                .find('\t')
                .ok_or_else(|| ParseError::String(format!("no tab found in {:?}:{}", name, i)))?;

            let word = ln[..split].to_owned();
            let p = usize::from_str(&ln[split + 1..])
                .map_err(|e| ParseError::String(format!("error at {:?}:{}: {}", name, i, e)))?;
            Ok((word, p as f64))
        })
        .collect()
}

/// Parse bigrams from the `reader` (format: `<word-1> <word-2>\t<int>\n`)
///
/// The optional `name` argument may be used to provide a source name for error messages.
pub fn parse_bigrams<R: BufRead>(
    reader: R,
    name: Option<&str>,
) -> Result<HashMap<(String, String), f64>, ParseError> {
    let name = name.unwrap_or("(unnamed)");
    reader
        .lines()
        .enumerate()
        .map(|(i, ln)| {
            let ln = ln?;
            let word_split = ln
                .find(' ')
                .ok_or_else(|| ParseError::String(format!("no space found in {:?}:{}", name, i)))?;
            let score_split = ln[word_split + 1..]
                .find('\t')
                .ok_or_else(|| ParseError::String(format!("no tab found in {:?}:{}", name, i)))?
                + word_split
                + 1;

            let word1 = ln[..word_split].to_owned();
            let word2 = ln[word_split + 1..score_split].to_owned();
            let p = usize::from_str(&ln[score_split + 1..])
                .map_err(|e| ParseError::String(format!("error at {:?}:{}: {}", name, i, e)))?;

            Ok(((word1, word2), p as f64))
        })
        .collect()
}

/// Iterator that yields `(prefix, suffix)` pairs from `text`
struct TextDivider<'a> {
    text: &'a str,
    split: Range<usize>,
}

impl<'a> TextDivider<'a> {
    fn new(text: &'a str, limit: usize) -> Self {
        TextDivider {
            text,
            split: 1..(text.len().min(limit) + 1),
        }
    }
}

impl<'a> Iterator for TextDivider<'a> {
    type Item = (&'a str, &'a str);

    fn next(&mut self) -> Option<Self::Item> {
        self.split
            .next()
            .map(|split| (&self.text[..split], &self.text[split..]))
    }
}
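
// For illustration (hypothetical input, not part of the original source):
// TextDivider::new("abc", 24) yields ("a", "bc"), ("ab", "c"), ("abc", "").
// The final empty suffix is what terminates the recursion in `search`,
// which returns (0.0, vec![]) for empty text.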

/// Return `s` lower-cased, with all characters other than ASCII alphanumerics removed
fn clean(s: &str) -> String {
    s.chars()
        .filter_map(|c| {
            if c.is_ascii_alphanumeric() {
                Some(c.to_ascii_lowercase())
            } else {
                None
            }
        })
        .collect()
}

#[derive(Debug, Error)]
pub enum ParseError {
    #[error(display = "I/O error: {}", _0)]
    Io(#[source] io::Error),
    #[error(display = "integer parsing error: {}", _0)]
    ParseInt(#[source] ParseIntError),
    #[error(display = "{}", _0)]
    String(String),
}

type MemoMap = HashMap<(String, String), (f64, Vec<String>)>;

const DEFAULT_LIMIT: usize = 24;
const DEFAULT_TOTAL: f64 = 1_024_908_267_229.0;
const SEGMENT_SIZE: usize = 250;

#[cfg(test)]
mod tests {
    #[test]
    fn test_clean() {
        assert_eq!(super::clean("Can't buy me love!"), "cantbuymelove");
    }
}
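
Taken together, the items above are the crate's whole public API: `parse_unigrams` and `parse_bigrams` for reading count data, plus `Segmenter::from_dir` and `segment`. A minimal usage sketch, not part of this commit: it assumes a `data/` directory laid out as in the tests below, the inline counts are invented for illustration, and the final assertion mirrors `test_segment_1`.

use std::{io::Cursor, path::Path};

use word_segmenters::{parse_unigrams, Segmenter};

fn main() -> Result<(), word_segmenters::ParseError> {
    // The parsers accept any `BufRead`, e.g. an in-memory buffer in the
    // `<word>\t<int>\n` format (counts here are made up):
    let unigrams = parse_unigrams(Cursor::new("hello\t120\nworld\t80\n"), Some("inline"))?;
    assert_eq!(unigrams["hello"], 120.0);

    // Build a segmenter from `data/unigrams.txt` and `data/bigrams.txt`,
    // then recover word boundaries from concatenated text.
    let segmenter = Segmenter::from_dir(Path::new("data"))?;
    assert_eq!(segmenter.segment("thisisatest"), vec!["this", "is", "a", "test"]);
    Ok(())
}
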
@ -0,0 +1,176 @@
use std::path::PathBuf;

use once_cell::sync::Lazy;

use word_segmenters::Segmenter;

macro_rules! assert_segments {
    ($list:expr) => {
        assert_eq!(SEGMENTER.segment(&$list.join("")), $list);
    };
}
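
// For example, `assert_segments!(&["choose", "spain"])` expands, in effect,
// to `assert_eq!(SEGMENTER.segment("choosespain"), &["choose", "spain"])`:
// segmenting the concatenation must recover the original word list.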

#[test]
fn test_segment_0() {
    assert_segments!(&["choose", "spain"]);
}

#[test]
fn test_segment_1() {
    assert_segments!(&["this", "is", "a", "test"]);
}

#[test]
fn test_segment_2() {
    assert_segments!(&[
        "when",
        "in",
        "the",
        "course",
        "of",
        "human",
        "events",
        "it",
        "becomes",
        "necessary",
    ]);
}

#[test]
fn test_segment_3() {
    assert_segments!(&["who", "represents"]);
}

#[test]
fn test_segment_4() {
    assert_segments!(&["experts", "exchange"]);
}

#[test]
fn test_segment_5() {
    assert_segments!(&["speed", "of", "art"]);
}

#[test]
fn test_segment_6() {
    assert_segments!(&["now", "is", "the", "time", "for", "all", "good"]);
}

#[test]
fn test_segment_7() {
    assert_segments!(&["it", "is", "a", "truth", "universally", "acknowledged"]);
}

#[test]
fn test_segment_8() {
    assert_segments!(&[
        "it", "was", "a", "bright", "cold", "day", "in", "april", "and", "the", "clocks", "were",
        "striking", "thirteen",
    ]);
}

#[test]
fn test_segment_9() {
    assert_segments!(&[
        "it",
        "was",
        "the",
        "best",
        "of",
        "times",
        "it",
        "was",
        "the",
        "worst",
        "of",
        "times",
        "it",
        "was",
        "the",
        "age",
        "of",
        "wisdom",
        "it",
        "was",
        "the",
        "age",
        "of",
        "foolishness",
    ]);
}

#[test]
fn test_segment_10() {
    assert_segments!(&[
        "as",
        "gregor",
        "samsa",
        "awoke",
        "one",
        "morning",
        "from",
        "uneasy",
        "dreams",
        "he",
        "found",
        "himself",
        "transformed",
        "in",
        "his",
        "bed",
        "into",
        "a",
        "gigantic",
        "insect",
    ]);
}

#[test]
fn test_segment_11() {
    assert_segments!(vec![
        "in", "a", "hole", "in", "the", "ground", "there", "lived", "a", "hobbit", "not", "a",
        "nasty", "dirty", "wet", "hole", "filled", "with", "the", "ends", "of", "worms", "and",
        "an", "oozy", "smell", "nor", "yet", "a", "dry", "bare", "sandy", "hole", "with",
        "nothing", "in", "it", "to", "sit", "down", "on", "or", "to", "eat", "it", "was", "a",
        "hobbit", "hole", "and", "that", "means", "comfort"
    ]);
}

#[test]
fn test_segment_12() {
    assert_segments!(&[
        "far",
        "out",
        "in",
        "the",
        "uncharted",
        "backwaters",
        "of",
        "the",
        "unfashionable",
        "end",
        "of",
        "the",
        "western",
        "spiral",
        "arm",
        "of",
        "the",
        "galaxy",
        "lies",
        "a",
        "small",
        "un",
        "regarded",
        "yellow",
        "sun",
    ]);
}

static SEGMENTER: Lazy<Segmenter> = Lazy::new(|| {
    Segmenter::from_dir(&PathBuf::from(format!(
        "{}/data",
        env!("CARGO_MANIFEST_DIR")
    )))
    .unwrap()
});