From 9bbb633f1d62a16d8b93a9940d63bb902770b79a Mon Sep 17 00:00:00 2001
From: Nick Rempel
Date: Thu, 29 Apr 2021 02:12:42 -0700
Subject: [PATCH] Flesh out README (#14)

---
 README.md                                 | 109 ++++++++++++++++++++---
 data/README.md                            |   7 ++
 instant-segment-py/examples/contrived.py  |  22 +++++
 instant-segment/examples/contrived.rs     |  24 +++++
 4 files changed, 149 insertions(+), 13 deletions(-)
 create mode 100644 data/README.md
 create mode 100644 instant-segment-py/examples/contrived.py
 create mode 100644 instant-segment/examples/contrived.rs

diff --git a/README.md b/README.md
index 18e8fbd..fecc172 100644
--- a/README.md
+++ b/README.md
@@ -1,30 +1,113 @@
 ![Cover logo](./cover.svg)

-# instant-segment: fast English word segmentation in Rust
+# Instant Segment: fast English word segmentation in Rust

 [![Documentation](https://docs.rs/instant-segment/badge.svg)](https://docs.rs/instant-segment/)
 [![Crates.io](https://img.shields.io/crates/v/instant-segment.svg)](https://crates.io/crates/instant-segment)
+[![PyPI](https://img.shields.io/pypi/v/instant-segment)](https://pypi.org/project/instant-segment/)
 [![Build status](https://github.com/InstantDomainSearch/instant-segment/workflows/CI/badge.svg)](https://github.com/InstantDomainSearch/instant-segment/actions?query=workflow%3ACI)
 [![License: Apache 2.0](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](LICENSE-APACHE)

-instant-segment is a fast Apache-2.0 library for English word segmentation.
-It is based on the Python [wordsegment][python] project written by Grant Jenkins,
+Instant Segment is a fast Apache-2.0 library for English word segmentation. It
+is based on the Python [wordsegment][python] project written by Grant Jenks,
 which is in turn based on code from Peter Norvig's chapter [Natural Language
-Corpus Data][chapter] from the book [Beautiful Data][book] (Segaran and Hammerbacher, 2009).
+Corpus Data][chapter] from the book [Beautiful Data][book] (Segaran and
+Hammerbacher, 2009).

 The data files in this repository are derived from the [Google Web Trillion Word
-Corpus][corpus], as described by Thorsten Brants and Alex Franz, and [distributed][distributed] by the
-Linguistic Data Consortium. Note that this data **"may only be used for linguistic
-education and research"**, so for any other usage you should acquire a different data set.
+Corpus][corpus], as described by Thorsten Brants and Alex Franz, and
+[distributed][distributed] by the Linguistic Data Consortium. Note that this
+data **"may only be used for linguistic education and research"**, so for any
+other usage you should acquire a different data set.

-For the microbenchmark included in this repository, instant-segment is ~17x faster than
-the Python implementation. Further optimizations are planned -- see the [issues][issues].
-The API has been carefully constructed so that multiple segmentations can share
-the underlying state to allow parallel usage.
+For the microbenchmark included in this repository, Instant Segment is ~17x
+faster than the Python implementation. Further optimizations are planned -- see
+the [issues][issues]. The API has been carefully constructed so that multiple
+segmentations can share the underlying state to allow parallel usage.
+
+## How it works
+
+Instant Segment segments a string into words by selecting the split with the
+highest probability, given a corpus of words and their occurrence counts.
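+
+The idea can be sketched with a toy bigram model: score a candidate two-word
+split as P(first) * P(second | first), estimated from corpus counts. The
+sketch below is only an illustration with made-up numbers and a cruder model
+than the library's actual scoring, but it shows why the right split wins:
+
+```rust
+// Toy bigram scoring: P(split) ~ P(first) * P(second | first).
+// The counts and corpus size below are illustrative assumptions.
+fn score(first_count: f64, pair_count: f64, corpus_size: f64) -> f64 {
+    let p_first = first_count / corpus_size;
+    // Add-one smoothing so an unseen pair still gets a small probability
+    let p_second_given_first = (pair_count + 1.0) / (first_count + 1.0);
+    p_first * p_second_given_first
+}
+
+fn main() {
+    let corpus_size = 1_000_000.0;
+    // "choose spain": `choose` seen 80,000 times, the pair seen 7 times
+    let choose_spain = score(80_000.0, 7.0, corpus_size);
+    // "chooses pain": `chooses` seen 7,000 times, the pair never seen
+    let chooses_pain = score(7_000.0, 0.0, corpus_size);
+    assert!(choose_spain > chooses_pain); // "choose spain" wins
+}
+```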
+
+For instance, provided that `choose` and `spain` occur more frequently than
+`chooses` and `pain`, and that the pair `choose spain` occurs more frequently
+than `chooses pain`, Instant Segment can help identify the domain
+`choosespain.com` as `ChooseSpain.com`, which is more likely to match user
+intent.
+
+We use this technique at
+[Instant Domain Search](https://instantdomainsearch.com/search/sale?q=choosespain)
+to help our users find relevant domains.
+
+## Using the library
+
+### Python **(>= 3.9)**
+
+```sh
+pip install instant-segment
+```
+
+### Rust
+
+```toml
+[dependencies]
+instant-segment = "0.8.1"
+```
+
+### Examples
+
+The following examples expect `unigrams` and `bigrams` to exist. See the
+examples ([Rust](./instant-segment/examples/contrived.rs),
+[Python](./instant-segment-py/examples/contrived.py)) to see how to construct
+these objects.
+
+```python
+import instant_segment
+
+segmenter = instant_segment.Segmenter(unigrams, bigrams)
+search = instant_segment.Search()
+segmenter.segment("instantdomainsearch", search)
+print([word for word in search])
+
+--> ['instant', 'domain', 'search']
+```
+
+```rust
+use instant_segment::{Search, Segmenter};
+use std::collections::HashMap;
+
+let segmenter = Segmenter::from_maps(unigrams, bigrams);
+let mut search = Search::default();
+let words = segmenter
+    .segment("instantdomainsearch", &mut search)
+    .unwrap();
+println!("{:?}", words.collect::<Vec<&str>>());
+
+--> ["instant", "domain", "search"]
+```
+
+Check out the tests for more thorough examples:
+[Rust](./instant-segment/src/test_cases.rs),
+[Python](./instant-segment-py/test/test.py)
+
+## Testing
+
+To run the tests, run the following:
+
+```
+cargo t -p instant-segment --all-features
+```
+
+You can also test the Python bindings with:
+
+```
+make test-python
+```

 [python]: https://github.com/grantjenks/python-wordsegment
 [chapter]: http://norvig.com/ngrams/
 [book]: http://oreilly.com/catalog/9780596157111/
-[corpus]: http://googleresearch.blogspot.com/2006/08/all-our-n-gram-are-belong-to-you.html
+[corpus]:
+  http://googleresearch.blogspot.com/2006/08/all-our-n-gram-are-belong-to-you.html
 [distributed]: https://catalog.ldc.upenn.edu/LDC2006T13
-[issues]: https://github.com/InstantDomainSearch/instant-segment/issues
+[issues]: https://github.com/InstantDomainSearch/instant-segment/issues
\ No newline at end of file
diff --git a/data/README.md b/data/README.md
new file mode 100644
index 0000000..25ea2a3
--- /dev/null
+++ b/data/README.md
@@ -0,0 +1,7 @@
+The data files in this directory are derived from the [Google Web Trillion Word
+Corpus][corpus], as described by Thorsten Brants and Alex Franz, and [distributed][distributed] by the
+Linguistic Data Consortium. Note that this data **"may only be used for linguistic
+education and research"**, so for any other usage you should acquire a different data set.
+
+[corpus]: http://googleresearch.blogspot.com/2006/08/all-our-n-gram-are-belong-to-you.html
+[distributed]: https://catalog.ldc.upenn.edu/LDC2006T13
\ No newline at end of file
diff --git a/instant-segment-py/examples/contrived.py b/instant-segment-py/examples/contrived.py
new file mode 100644
index 0000000..68b196b
--- /dev/null
+++ b/instant-segment-py/examples/contrived.py
@@ -0,0 +1,22 @@
+import instant_segment
+
+
+def main():
+    unigrams = []  # (word, occurrence count) pairs
+    unigrams.append(("choose", 80_000))
+    unigrams.append(("chooses", 7_000))
+    unigrams.append(("spain", 20_000))
+    unigrams.append(("pain", 90_000))
+
+    bigrams = []  # ((first word, second word), co-occurrence count) pairs
+    bigrams.append((("choose", "spain"), 7))
+    bigrams.append((("chooses", "pain"), 0))
+
+    segmenter = instant_segment.Segmenter(iter(unigrams), iter(bigrams))
+    search = instant_segment.Search()
+    segmenter.segment("choosespain", search)
+    print([word for word in search])
+
+
+if __name__ == "__main__":
+    main()
diff --git a/instant-segment/examples/contrived.rs b/instant-segment/examples/contrived.rs
new file mode 100644
index 0000000..c3cb2ab
--- /dev/null
+++ b/instant-segment/examples/contrived.rs
@@ -0,0 +1,24 @@
+use instant_segment::{Search, Segmenter};
+use std::collections::HashMap;
+
+fn main() {
+    let mut unigrams = HashMap::default(); // word -> occurrence count
+
+    unigrams.insert("choose".into(), 80_000.0);
+    unigrams.insert("chooses".into(), 7_000.0);
+
+    unigrams.insert("spain".into(), 20_000.0);
+    unigrams.insert("pain".into(), 90_000.0);
+
+    let mut bigrams = HashMap::default(); // (word, word) -> co-occurrence count
+
+    bigrams.insert(("choose".into(), "spain".into()), 7.0);
+    bigrams.insert(("chooses".into(), "pain".into()), 0.0);
+
+    let segmenter = Segmenter::from_maps(unigrams, bigrams);
+    let mut search = Search::default();
+
+    let words = segmenter.segment("choosespain", &mut search).unwrap();
+
+    println!("{:?}", words.collect::<Vec<&str>>());
+}
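+
+// With the counts above, "choose spain" outscores "chooses pain", so this
+// example should print: ["choose", "spain"]
+//
+// One way to run it (assuming a standard checkout of this repository) is to
+// invoke `cargo run --example contrived` from the `instant-segment` directory.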