diff --git a/.vscode/settings.json b/.vscode/settings.json deleted file mode 100644 index 0e0dcd2..0000000 --- a/.vscode/settings.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - -} \ No newline at end of file diff --git a/README.md b/README.md index a3ed505..3d916e9 100644 --- a/README.md +++ b/README.md @@ -8,26 +8,6 @@ [![Build status](https://github.com/InstantDomainSearch/instant-segment/workflows/CI/badge.svg)](https://github.com/InstantDomainSearch/instant-segment/actions?query=workflow%3ACI) [![License: Apache 2.0](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](LICENSE-APACHE) -```python -segmenter = instant_segment.Segmenter(unigrams(), bigrams()) -search = instant_segment.Search() -segmenter.segment("instantdomainsearch", search) -print([word for word in search]) - ---> ['instant', 'domain', 'search'] -``` - -```rust -let segmenter = Segmenter::from_maps(unigrams, bigrams); -let mut search = Search::default(); -let words = segmenter - .segment("instantdomainsearch", &mut search) - .unwrap(); -println!("{:?}", words.collect::<Vec<_>>()) - ---> ["instant", "domain", "search"] -``` - Instant Segment is a fast Apache-2.0 library for English word segmentation. It is based on the Python [wordsegment][python] project written by Grant Jenks, which is in turn based on code from Peter Norvig's chapter [Natural Language @@ -45,7 +25,21 @@ faster than the Python implementation. Further optimizations are planned -- see the [issues][issues]. The API has been carefully constructed so that multiple segmentations can share the underlying state to allow parallel usage. -## Installing +## How it works + +Instant Segment works by segmenting a string into words by selecting the splits +with the highest probability given a corpus of words and their occurrences. 
+ +For instance, provided that `choose` and `spain` occur more frequently than +`chooses` and `pain`, and that the pair `choose spain` occurs more frequently +than `chooses pain`, Instant Segment can help you split the string +`choosespain.com` into `ChooseSpain.com` which more likely matches user intent. + +We use this technique at +[Instant Domain Search](https://instantdomainsearch.com/search/sale?q=choosespain) +to do just this. + +## Using the library ### Python **(>= 3.9)** @@ -57,85 +51,41 @@ pip install instant-segment ```toml [dependencies] -instant-segment = "*" +instant-segment = "0.8.1" ``` -## Using +### Examples -Instant Segment works by segmenting a string into words by selecting the splits -with the highest probability given a corpus of words and their occurances. - -For instance, provided that `choose` and `spain` occur more frequently than -`chooses` and `pain`, Instant Segment can help you split the string -`choosespain.com` into -[`ChooseSpain.com`](https://instantdomainsearch.com/search/sale?q=choosespain) -which more likely matches user intent. +The following examples expect `unigrams` and `bigrams` to exist. See the +[examples](./examples) to see how to construct these objects. 
```python import instant_segment +segmenter = instant_segment.Segmenter(unigrams, bigrams) +search = instant_segment.Search() +segmenter.segment("instantdomainsearch", search) +print([word for word in search]) -def main(): - unigrams = [] - unigrams.append(("choose", 50)) - unigrams.append(("chooses", 10)) - unigrams.append(("spain", 50)) - unigrams.append(("pain", 10)) - - bigrams = [] - bigrams.append((("choose", "spain"), 10)) - bigrams.append((("chooses", "pain"), 10)) - - segmenter = instant_segment.Segmenter(iter(unigrams), iter(bigrams)) - search = instant_segment.Search() - segmenter.segment("choosespain", search) - print([word for word in search]) - - -if __name__ == "__main__": - main() - +--> ['instant', 'domain', 'search'] ``` ```rust use instant_segment::{Search, Segmenter}; use std::collections::HashMap; -fn main() { - let mut unigrams = HashMap::default(); +let segmenter = Segmenter::from_maps(unigrams, bigrams); +let mut search = Search::default(); +let words = segmenter + .segment("instantdomainsearch", &mut search) + .unwrap(); +println!("{:?}", words.collect::<Vec<_>>()) - unigrams.insert("choose".into(), 50 as f64); - unigrams.insert("chooses".into(), 10 as f64); - - unigrams.insert("spain".into(), 50 as f64); - unigrams.insert("pain".into(), 10 as f64); - - let mut bigrams = HashMap::default(); - - bigrams.insert(("choose".into(), "spain".into()), 10 as f64); - bigrams.insert(("chooses".into(), "pain".into()), 10 as f64); - - let segmenter = Segmenter::from_maps(unigrams, bigrams); - let mut search = Search::default(); - - let words = segmenter.segment("choosespain", &mut search).unwrap(); - - println!("{:?}", words.collect::<Vec<_>>()) -} +--> ["instant", "domain", "search"] ``` -``` -['choose', 'spain'] -``` - -Play with the examples above to see that different numbers of occurances will -influence the results - -The example above is succinct but, in practice, you will want to load these -words and occurances from a corpus of data like the ones we provide 
-[here](./data). Check out -[the](./instant-segment/instant-segment-py/test/test.py) -[tests](./instant-segment/instant-segment/src/test_data.rs) to see examples of -how you might do that. +Check out the tests for a more thorough example: +[Rust](./instant-segment/src/test_cases.rs), +[Python](./instant-segment-py/test/test.py) ## Testing @@ -145,7 +95,7 @@ To run the tests run the following: cargo t -p instant-segment --all-features ``` -You can also test the python bindings with: +You can also test the Python bindings with: ``` make test-python diff --git a/examples/contrived.py b/examples/contrived.py new file mode 100644 index 0000000..d40c224 --- /dev/null +++ b/examples/contrived.py @@ -0,0 +1,22 @@ +import instant_segment + + +def main(): + unigrams = [] + unigrams.append(("choose", 50)) + unigrams.append(("chooses", 10)) + unigrams.append(("spain", 50)) + unigrams.append(("pain", 10)) + + bigrams = [] + bigrams.append((("choose", "spain"), 10)) + bigrams.append((("chooses", "pain"), 10)) + + segmenter = instant_segment.Segmenter(iter(unigrams), iter(bigrams)) + search = instant_segment.Search() + segmenter.segment("choosespain", search) + print([word for word in search]) + + +if __name__ == "__main__": + main() diff --git a/examples/contrived.rs b/examples/contrived.rs new file mode 100644 index 0000000..8411575 --- /dev/null +++ b/examples/contrived.rs @@ -0,0 +1,24 @@ +use instant_segment::{Search, Segmenter}; +use std::collections::HashMap; + +fn main() { + let mut unigrams = HashMap::default(); + + unigrams.insert("choose".into(), 50 as f64); + unigrams.insert("chooses".into(), 10 as f64); + + unigrams.insert("spain".into(), 50 as f64); + unigrams.insert("pain".into(), 10 as f64); + + let mut bigrams = HashMap::default(); + + bigrams.insert(("choose".into(), "spain".into()), 10 as f64); + bigrams.insert(("chooses".into(), "pain".into()), 10 as f64); + + let segmenter = Segmenter::from_maps(unigrams, bigrams); + let mut search = Search::default(); + + 
let words = segmenter.segment("choosespain", &mut search).unwrap(); + + println!("{:?}", words.collect::<Vec<_>>()); +}