Initial version (commit 38f9747c92)

@ -0,0 +1,62 @@
name: CI

on:
  push:
    branches: ['master']
  pull_request:

jobs:
  test:
    strategy:
      matrix:
        os: [ubuntu-latest, macos-latest, windows-latest]
        rust: [stable, beta]
        exclude:
          - os: macos-latest
            rust: beta
          - os: windows-latest
            rust: beta

    runs-on: ${{ matrix.os }}

    steps:
      - uses: actions/checkout@v1
      - uses: actions-rs/toolchain@v1
        with:
          profile: minimal
          toolchain: ${{ matrix.rust }}
          override: true
      - uses: actions-rs/cargo@v1
        with:
          command: build
          args: --workspace --all-targets
      - uses: actions-rs/cargo@v1
        with:
          command: test
          args: --workspace

  lint:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v1
      - uses: actions-rs/toolchain@v1
        with:
          profile: minimal
          toolchain: stable
          override: true
          components: rustfmt, clippy
      - uses: actions-rs/cargo@v1
        with:
          command: fmt
          args: --all -- --check
      - uses: actions-rs/cargo@v1
        if: always()
        with:
          command: clippy
          args: --workspace --all-targets -- -D warnings

  audit:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v1
      - uses: EmbarkStudios/cargo-deny-action@v0
@ -0,0 +1,2 @@
/target
Cargo.lock
@ -0,0 +1,12 @@
[package]
name = "word-segmenters"
version = "0.1.0"
authors = ["Dirkjan Ochtman <dirkjan@ochtman.nl>"]
edition = "2018"
license = "Apache-2.0"

[dependencies]
err-derive = "0.2.4"

[dev-dependencies]
once_cell = "1.4"
Three file diffs suppressed because they are too large.
@ -0,0 +1,4 @@
[licenses]
allow-osi-fsf-free = "either"
copyleft = "deny"
private = { ignore = true }
@ -0,0 +1,245 @@
use std::{
    collections::HashMap,
    fs::File,
    io::{self, BufRead, BufReader},
    num::ParseIntError,
    ops::Range,
    path::Path,
    str::FromStr,
};

use err_derive::Error;

pub struct Segmenter {
    unigrams: HashMap<String, f64>,
    bigrams: HashMap<(String, String), f64>,
    total: f64,
    limit: usize,
}

impl Segmenter {
    /// Create `Segmenter` from files in the given directory
    ///
    /// Reads from `unigrams.txt` and `bigrams.txt` in `dir`.
    pub fn from_dir(dir: &Path) -> Result<Self, ParseError> {
        let uni_file = dir.join("unigrams.txt");
        let bi_file = dir.join("bigrams.txt");
        Ok(Self {
            unigrams: parse_unigrams(BufReader::new(File::open(&uni_file)?), uni_file.to_str())?,
            bigrams: parse_bigrams(BufReader::new(File::open(&bi_file)?), bi_file.to_str())?,
            limit: DEFAULT_LIMIT,
            total: DEFAULT_TOTAL,
        })
    }

    /// Returns a list of words that is the best segmentation of `text`
    pub fn segment(&self, text: &str) -> Vec<String> {
        let clean = clean(text);
        let mut words = vec![];
        let mut memo = HashMap::new();

        let (mut start, mut end) = (0, 0);
        loop {
            end = clean.len().min(end + SEGMENT_SIZE);
            let prefix = &clean[start..end];
            let window_words = self.search(&prefix, "<s>", &mut memo).1;

            for word in &window_words[..window_words.len().saturating_sub(5)] {
                start += word.len();
                words.push(word.into());
            }

            if end == clean.len() {
                break;
            }
        }

        let mut window_words = self.search(&clean[start..], "<s>", &mut memo).1;
        words.append(&mut window_words);
        words
    }
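
    // Note on the windowing above (a reading of the code, not upstream
    // docs): the cleaned input is consumed in windows of up to
    // SEGMENT_SIZE (250) bytes, and the last five words of each window
    // are discarded and re-derived in the next window, so that words
    // straddling a window boundary are not cut short by it.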

    /// Search for the best segmentation of `text`, scoring its first word
    /// in the context of the `previous` word
    fn search(&self, text: &str, previous: &str, memo: &mut MemoMap) -> (f64, Vec<String>) {
        if text.is_empty() {
            return (0.0, vec![]);
        }

        let mut best = (f64::MIN, vec![]);
        for (prefix, suffix) in TextDivider::new(text, self.limit) {
            let prefix_score = self.score(prefix, Some(previous)).log10();
            let pair = (suffix.to_owned(), prefix.to_owned());

            let (suffix_score, suffix_words) = match memo.get(&pair) {
                Some((score, words)) => (*score, words.clone()),
                None => {
                    let (suffix_score, suffix_words) = self.search(&suffix, prefix, memo);
                    memo.insert(pair, (suffix_score, suffix_words.clone()));
                    (suffix_score, suffix_words)
                }
            };

            let score = prefix_score + suffix_score;
            if score > best.0 {
                best.0 = score;
                best.1.clear();
                best.1.push(prefix.to_owned());
                best.1.extend(suffix_words);
            }
        }

        best
    }

    fn score(&self, word: &str, previous: Option<&str>) -> f64 {
        match previous {
            None => match self.unigrams.get(word) {
                // Probability of the given word
                Some(p) => p / self.total,
                // Penalize words not found in the unigrams according
                // to their length, a crucial heuristic.
                None => 10.0 / (self.total * 10.0f64.powf(word.len() as f64)),
            },
            Some(prev) => match (
                self.bigrams.get(&(prev.into(), word.into())),
                self.unigrams.get(prev),
            ) {
                // Conditional probability of the word given the previous
                // word. The technical name is "stupid backoff"; it is not
                // a proper probability distribution, but it works well
                // in practice.
                (Some(pb), Some(_)) => pb / self.total / self.score(prev, None),
                // Fall back to using the unigram probability
                _ => self.score(word, None),
            },
        }
    }
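
    // Hypothetical worked example of the backoff scoring above (counts
    // invented for illustration): with total = 1e12, a unigram count of
    // 2e10 for "the", and a bigram count of 3e8 for ("the", "end"),
    // score("end", Some("the")) = (3e8 / 1e12) / (2e10 / 1e12) = 0.015,
    // i.e. count(prev, word) / count(prev); an unseen pair falls back
    // to the plain unigram score of `word`.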

    /// Customize the word length `limit`
    pub fn set_limit(&mut self, limit: usize) {
        self.limit = limit;
    }

    /// Customize the relative score by setting the `total`
    pub fn set_total(&mut self, total: f64) {
        self.total = total;
    }
}

/// Parse unigrams from the `reader` (format: `<word>\t<int>\n`)
///
/// The optional `name` argument may be used to provide a source name for error messages.
pub fn parse_unigrams<R: BufRead>(
    reader: R,
    name: Option<&str>,
) -> Result<HashMap<String, f64>, ParseError> {
    let name = name.unwrap_or("(unnamed)");
    reader
        .lines()
        .enumerate()
        .map(|(i, ln)| {
            let ln = ln?;
            let split = ln
                .find('\t')
                .ok_or_else(|| ParseError::String(format!("no tab found in {:?}:{}", name, i)))?;

            let word = ln[..split].to_owned();
            let p = usize::from_str(&ln[split + 1..])
                .map_err(|e| ParseError::String(format!("error at {:?}:{}: {}", name, i, e)))?;
            Ok((word, p as f64))
        })
        .collect()
}

/// Parse bigrams from the `reader` (format: `<word-1> <word-2>\t<int>\n`)
///
/// The optional `name` argument may be used to provide a source name for error messages.
pub fn parse_bigrams<R: BufRead>(
    reader: R,
    name: Option<&str>,
) -> Result<HashMap<(String, String), f64>, ParseError> {
    let name = name.unwrap_or("(unnamed)");
    reader
        .lines()
        .enumerate()
        .map(|(i, ln)| {
            let ln = ln?;
            let word_split = ln
                .find(' ')
                .ok_or_else(|| ParseError::String(format!("no space found in {:?}:{}", name, i)))?;
            let score_split = ln[word_split + 1..]
                .find('\t')
                .ok_or_else(|| ParseError::String(format!("no tab found in {:?}:{}", name, i)))?
                + word_split
                + 1;

            let word1 = ln[..word_split].to_owned();
            let word2 = ln[word_split + 1..score_split].to_owned();
            let p = usize::from_str(&ln[score_split + 1..])
                .map_err(|e| ParseError::String(format!("error at {:?}:{}: {}", name, i, e)))?;

            Ok(((word1, word2), p as f64))
        })
        .collect()
}

/// Iterator that yields `(prefix, suffix)` pairs from `text`
struct TextDivider<'a> {
    text: &'a str,
    split: Range<usize>,
}

impl<'a> TextDivider<'a> {
    fn new(text: &'a str, limit: usize) -> Self {
        TextDivider {
            text,
            split: 1..(text.len().min(limit) + 1),
        }
    }
}

impl<'a> Iterator for TextDivider<'a> {
    type Item = (&'a str, &'a str);

    fn next(&mut self) -> Option<Self::Item> {
        self.split
            .next()
            .map(|split| (&self.text[..split], &self.text[split..]))
    }
}
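
// For illustration (hypothetical input, not part of the original source):
// TextDivider::new("abc", 24) yields ("a", "bc"), ("ab", "c"), ("abc", "").
// The final empty suffix is what terminates the recursion in `search`,
// which returns (0.0, vec![]) for empty text.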

/// Return `s` lower-cased, with all characters other than ASCII alphanumerics removed
fn clean(s: &str) -> String {
    s.chars()
        .filter_map(|c| {
            if c.is_ascii_alphanumeric() {
                Some(c.to_ascii_lowercase())
            } else {
                None
            }
        })
        .collect()
}

#[derive(Debug, Error)]
pub enum ParseError {
    #[error(display = "I/O error: {}", _0)]
    Io(#[source] io::Error),
    #[error(display = "integer parsing error: {}", _0)]
    ParseInt(#[source] ParseIntError),
    #[error(display = "{}", _0)]
    String(String),
}

type MemoMap = HashMap<(String, String), (f64, Vec<String>)>;

const DEFAULT_LIMIT: usize = 24;
const DEFAULT_TOTAL: f64 = 1_024_908_267_229.0;
const SEGMENT_SIZE: usize = 250;

#[cfg(test)]
mod tests {
    #[test]
    fn test_clean() {
        assert_eq!(super::clean("Can't buy me love!"), "cantbuymelove");
    }
}
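
Taken together, the items above are the crate's whole public API: `parse_unigrams` and `parse_bigrams` for reading count data, plus `Segmenter::from_dir` and `segment`. A minimal usage sketch, not part of this commit: it assumes a `data/` directory laid out as in the tests below, the inline counts are invented for illustration, and the final assertion mirrors `test_segment_1`.

use std::{io::Cursor, path::Path};

use word_segmenters::{parse_unigrams, Segmenter};

fn main() -> Result<(), word_segmenters::ParseError> {
    // The parsers accept any `BufRead`, e.g. an in-memory buffer in the
    // `<word>\t<int>\n` format (counts here are made up):
    let unigrams = parse_unigrams(Cursor::new("hello\t120\nworld\t80\n"), Some("inline"))?;
    assert_eq!(unigrams["hello"], 120.0);

    // Build a segmenter from `data/unigrams.txt` and `data/bigrams.txt`,
    // then recover word boundaries from concatenated text.
    let segmenter = Segmenter::from_dir(Path::new("data"))?;
    assert_eq!(segmenter.segment("thisisatest"), vec!["this", "is", "a", "test"]);
    Ok(())
}
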
@ -0,0 +1,176 @@
use std::path::PathBuf;

use once_cell::sync::Lazy;

use word_segmenters::Segmenter;

macro_rules! assert_segments {
    ($list:expr) => {
        assert_eq!(SEGMENTER.segment(&$list.join("")), $list);
    };
}
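
// For example, `assert_segments!(&["choose", "spain"])` expands, in effect,
// to `assert_eq!(SEGMENTER.segment("choosespain"), &["choose", "spain"])`:
// segmenting the concatenation must recover the original word list.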

#[test]
fn test_segment_0() {
    assert_segments!(&["choose", "spain"]);
}

#[test]
fn test_segment_1() {
    assert_segments!(&["this", "is", "a", "test"]);
}

#[test]
fn test_segment_2() {
    assert_segments!(&[
        "when",
        "in",
        "the",
        "course",
        "of",
        "human",
        "events",
        "it",
        "becomes",
        "necessary",
    ]);
}

#[test]
fn test_segment_3() {
    assert_segments!(&["who", "represents"]);
}

#[test]
fn test_segment_4() {
    assert_segments!(&["experts", "exchange"]);
}

#[test]
fn test_segment_5() {
    assert_segments!(&["speed", "of", "art"]);
}

#[test]
fn test_segment_6() {
    assert_segments!(&["now", "is", "the", "time", "for", "all", "good"]);
}

#[test]
fn test_segment_7() {
    assert_segments!(&["it", "is", "a", "truth", "universally", "acknowledged"]);
}

#[test]
fn test_segment_8() {
    assert_segments!(&[
        "it", "was", "a", "bright", "cold", "day", "in", "april", "and", "the", "clocks", "were",
        "striking", "thirteen",
    ]);
}

#[test]
fn test_segment_9() {
    assert_segments!(&[
        "it",
        "was",
        "the",
        "best",
        "of",
        "times",
        "it",
        "was",
        "the",
        "worst",
        "of",
        "times",
        "it",
        "was",
        "the",
        "age",
        "of",
        "wisdom",
        "it",
        "was",
        "the",
        "age",
        "of",
        "foolishness",
    ]);
}

#[test]
fn test_segment_10() {
    assert_segments!(&[
        "as",
        "gregor",
        "samsa",
        "awoke",
        "one",
        "morning",
        "from",
        "uneasy",
        "dreams",
        "he",
        "found",
        "himself",
        "transformed",
        "in",
        "his",
        "bed",
        "into",
        "a",
        "gigantic",
        "insect",
    ]);
}

#[test]
fn test_segment_11() {
    assert_segments!(vec![
        "in", "a", "hole", "in", "the", "ground", "there", "lived", "a", "hobbit", "not", "a",
        "nasty", "dirty", "wet", "hole", "filled", "with", "the", "ends", "of", "worms", "and",
        "an", "oozy", "smell", "nor", "yet", "a", "dry", "bare", "sandy", "hole", "with",
        "nothing", "in", "it", "to", "sit", "down", "on", "or", "to", "eat", "it", "was", "a",
        "hobbit", "hole", "and", "that", "means", "comfort"
    ]);
}

#[test]
fn test_segment_12() {
    assert_segments!(&[
        "far",
        "out",
        "in",
        "the",
        "uncharted",
        "backwaters",
        "of",
        "the",
        "unfashionable",
        "end",
        "of",
        "the",
        "western",
        "spiral",
        "arm",
        "of",
        "the",
        "galaxy",
        "lies",
        "a",
        "small",
        "un",
        "regarded",
        "yellow",
        "sun",
    ]);
}

static SEGMENTER: Lazy<Segmenter> = Lazy::new(|| {
    Segmenter::from_dir(&PathBuf::from(format!(
        "{}/data",
        env!("CARGO_MANIFEST_DIR")
    )))
    .unwrap()
});