Reorg readme

2021-04-26 10:53:02 -07:00 · 2021-04-26 10:53:02 -07:00 · 356d9a0073
parent 4558b10b58
commit 356d9a0073
4 changed files with 81 additions and 88 deletions
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@ -1,3 +0,0 @@
-{
-
-}
--- a/README.md
+++ b/README.md
@ -8,26 +8,6 @@
 [![Build status](https://github.com/InstantDomainSearch/instant-segment/workflows/CI/badge.svg)](https://github.com/InstantDomainSearch/instant-segment/actions?query=workflow%3ACI)
 [![License: Apache 2.0](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](LICENSE-APACHE)

-```python
-segmenter = instant_segment.Segmenter(unigrams(), bigrams())
-search = instant_segment.Search()
-segmenter.segment("instantdomainsearch", search)
-print([word for word in search])
-
--> ['instant', 'domain', 'search']
-```
-
-```rust
-let segmenter = Segmenter::from_maps(unigrams, bigrams);
-let mut search = Search::default();
-let words = segmenter
-    .segment("instantdomainsearch", &mut search)
-    .unwrap();
-println!("{:?}", words.collect::<Vec<&str>>())
-
--> ["instant", "domain", "search"]
-```
-
 Instant Segment is a fast Apache-2.0 library for English word segmentation. It
 is based on the Python [wordsegment][python] project written by Grant Jenks,
 which is in turn based on code from Peter Norvig's chapter [Natural Language
@ -45,7 +25,21 @@ faster than the Python implementation. Further optimizations are planned -- see
 the [issues][issues]. The API has been carefully constructed so that multiple
 segmentations can share the underlying state to allow parallel usage.

-## Installing
+## How it works
+
+Instant Segment segments a string into words by selecting the splits with the
+highest probability, given a corpus of words and their occurrences.
+
+For instance, provided that `choose` and `spain` occur more frequently than
+`chooses` and `pain`, and that the pair `choose spain` occurs more frequently
+than `chooses pain`, Instant Segment can help you split the string
+`choosespain.com` into `ChooseSpain.com`, which more likely matches user intent.
+
+We use this technique at
+[Instant Domain Search](https://instantdomainsearch.com/search/sale?q=choosespain)
+to do just this.
+
+## Using the library

 ### Python **(>= 3.9)**

@ -57,85 +51,41 @@ pip install instant-segment

 ```toml
 [dependencies]
-instant-segment = "*"
+instant-segment = "0.8.1"
 ```

-## Using
+### Examples

-Instant Segment works by segmenting a string into words by selecting the splits
-with the highest probability given a corpus of words and their occurances.
-
-For instance, provided that `choose` and `spain` occur more frequently than
-`chooses` and `pain`, Instant Segment can help you split the string
-`choosespain.com` into
-[`ChooseSpain.com`](https://instantdomainsearch.com/search/sale?q=choosespain)
-which more likely matches user intent.
+The following examples expect `unigrams` and `bigrams` to exist. See the
+[examples](./examples) for how to construct these objects.

 ```python
 import instant_segment

+segmenter = instant_segment.Segmenter(unigrams, bigrams)
+search = instant_segment.Search()
+segmenter.segment("instantdomainsearch", search)
+print([word for word in search])

-def main():
-    unigrams = []
-    unigrams.append(("choose", 50))
-    unigrams.append(("chooses", 10))
-    unigrams.append(("spain", 50))
-    unigrams.append(("pain", 10))
-
-    bigrams = []
-    bigrams.append((("choose", "spain"), 10))
-    bigrams.append((("chooses", "pain"), 10))
-
-    segmenter = instant_segment.Segmenter(iter(unigrams), iter(bigrams))
-    search = instant_segment.Search()
-    segmenter.segment("choosespain", search)
-    print([word for word in search])
-
-
-if __name__ == "__main__":
-    main()
-
+--> ['instant', 'domain', 'search']
 ```

 ```rust
 use instant_segment::{Search, Segmenter}; use std::collections::HashMap;

-fn main() {
-    let mut unigrams = HashMap::default();
+let segmenter = Segmenter::from_maps(unigrams, bigrams);
+let mut search = Search::default();
+let words = segmenter
+    .segment("instantdomainsearch", &mut search)
+    .unwrap();
+println!("{:?}", words.collect::<Vec<&str>>())

-    unigrams.insert("choose".into(), 50 as f64);
-    unigrams.insert("chooses".into(), 10 as f64);
-
-    unigrams.insert("spain".into(), 50 as f64);
-    unigrams.insert("pain".into(), 10 as f64);
-
-    let mut bigrams = HashMap::default();
-
-    bigrams.insert(("choose".into(), "spain".into()), 10 as f64);
-    bigrams.insert(("chooses".into(), "pain".into()), 10 as f64);
-
-    let segmenter = Segmenter::from_maps(unigrams, bigrams);
-    let mut search = Search::default();
-
-    let words = segmenter.segment("choosespain", &mut search).unwrap();
-
-    println!("{:?}", words.collect::<Vec<&str>>())
-}
+--> ["instant", "domain", "search"]
 ```

-```
-['choose', 'spain']
-```
-
-Play with the examples above to see that different numbers of occurances will
-influence the results
-
-The example above is succinct but, in practice, you will want to load these
-words and occurances from a corpus of data like the ones we provide
-[here](./data). Check out
-[the](./instant-segment/instant-segment-py/test/test.py)
-[tests](./instant-segment/instant-segment/src/test_data.rs) to see examples of
-how you might do that.
+Check out the tests for a more thorough example:
+[Rust](./instant-segment/src/test_cases.rs),
+[Python](./instant-segment-py/test/test.py)

 ## Testing

@ -145,7 +95,7 @@ To run the tests run the following:
 cargo t -p instant-segment --all-features
 ```

-You can also test the python bindings with:
+You can also test the Python bindings with:

 ```
 make test-python
--- a/examples/contrived.py
+++ b/examples/contrived.py
@ -0,0 +1,22 @@
+import instant_segment
+
+
+def main():
+    unigrams = []
+    unigrams.append(("choose", 50))
+    unigrams.append(("chooses", 10))
+    unigrams.append(("spain", 50))
+    unigrams.append(("pain", 10))
+
+    bigrams = []
+    bigrams.append((("choose", "spain"), 10))
+    bigrams.append((("chooses", "pain"), 10))
+
+    segmenter = instant_segment.Segmenter(iter(unigrams), iter(bigrams))
+    search = instant_segment.Search()
+    segmenter.segment("choosespain", search)
+    print([word for word in search])
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/contrived.rs
+++ b/examples/contrived.rs
@ -0,0 +1,24 @@
+use instant_segment::{Search, Segmenter};
+use std::collections::HashMap;
+
+fn main() {
+    let mut unigrams = HashMap::default();
+
+    unigrams.insert("choose".into(), 50 as f64);
+    unigrams.insert("chooses".into(), 10 as f64);
+
+    unigrams.insert("spain".into(), 50 as f64);
+    unigrams.insert("pain".into(), 10 as f64);
+
+    let mut bigrams = HashMap::default();
+
+    bigrams.insert(("choose".into(), "spain".into()), 10 as f64);
+    bigrams.insert(("chooses".into(), "pain".into()), 10 as f64);
+
+    let segmenter = Segmenter::from_maps(unigrams, bigrams);
+    let mut search = Search::default();
+
+    let words = segmenter.segment("choosespain", &mut search).unwrap();
+
+    println!("{:?}", words.collect::<Vec<&str>>());
+}