mirror of
https://github.com/instant-labs/instant-segment.git
synced 2025-02-20 15:02:07 +00:00
Flesh out README (#14)
This commit is contained in:
parent
eca12c572f
commit
9bbb633f1d
109
README.md
109
README.md
@ -1,30 +1,113 @@
|
||||

|
||||
|
||||
# instant-segment: fast English word segmentation in Rust
|
||||
# Instant Segment: fast English word segmentation in Rust
|
||||
|
||||
[Documentation](https://docs.rs/instant-segment/)
|
||||
[Crates.io](https://crates.io/crates/instant-segment)
|
||||
[PyPI](https://pypi.org/project/instant-segment/)
|
||||
[Build status](https://github.com/InstantDomainSearch/instant-segment/actions?query=workflow%3ACI)
|
||||
[License: Apache-2.0](LICENSE-APACHE)
|
||||
|
||||
instant-segment is a fast Apache-2.0 library for English word segmentation.
|
||||
It is based on the Python [wordsegment][python] project written by Grant Jenkins,
|
||||
Instant Segment is a fast Apache-2.0 library for English word segmentation. It
|
||||
is based on the Python [wordsegment][python] project written by Grant Jenks,
|
||||
which is in turn based on code from Peter Norvig's chapter [Natural Language
|
||||
Corpus Data][chapter] from the book [Beautiful Data][book] (Segaran and Hammerbacher, 2009).
|
||||
Corpus Data][chapter] from the book [Beautiful Data][book] (Segaran and
|
||||
Hammerbacher, 2009).
|
||||
|
||||
The data files in this repository are derived from the [Google Web Trillion Word
|
||||
Corpus][corpus], as described by Thorsten Brants and Alex Franz, and [distributed][distributed] by the
|
||||
Linguistic Data Consortium. Note that this data **"may only be used for linguistic
|
||||
education and research"**, so for any other usage you should acquire a different data set.
|
||||
Corpus][corpus], as described by Thorsten Brants and Alex Franz, and
|
||||
[distributed][distributed] by the Linguistic Data Consortium. Note that this
|
||||
data **"may only be used for linguistic education and research"**, so for any
|
||||
other usage you should acquire a different data set.
|
||||
|
||||
For the microbenchmark included in this repository, instant-segment is ~17x faster than
|
||||
the Python implementation. Further optimizations are planned -- see the [issues][issues].
|
||||
The API has been carefully constructed so that multiple segmentations can share
|
||||
the underlying state to allow parallel usage.
|
||||
For the microbenchmark included in this repository, Instant Segment is ~17x
|
||||
faster than the Python implementation. Further optimizations are planned -- see
|
||||
the [issues][issues]. The API has been carefully constructed so that multiple
|
||||
segmentations can share the underlying state to allow parallel usage.
|
||||
|
||||
## How it works
|
||||
|
||||
Instant Segment segments a string into words by selecting the splits with the
|
||||
highest probability, given a corpus of words and their occurrence counts.
|
||||
|
||||
For instance, provided that `choose` and `spain` occur more frequently than
|
||||
`chooses` and `pain`, and that the pair `choose spain` occurs more frequently
|
||||
than `chooses pain`, Instant Segment can help identify the domain
|
||||
`choosespain.com` as `ChooseSpain.com`, which more likely matches user intent.
|
||||
|
||||
We use this technique at
|
||||
[Instant Domain Search](https://instantdomainsearch.com/search/sale?q=choosespain)
|
||||
to help our users find relevant domains.
|
||||
|
||||
## Using the library
|
||||
|
||||
### Python **(>= 3.9)**
|
||||
|
||||
```sh
|
||||
pip install instant-segment
|
||||
```
|
||||
|
||||
### Rust
|
||||
|
||||
```toml
|
||||
[dependencies]
|
||||
instant-segment = "0.8.1"
|
||||
```
|
||||
|
||||
### Examples
|
||||
|
||||
The following examples expect `unigrams` and `bigrams` to exist. See the
|
||||
examples ([Rust](./instant-segment/examples/contrived.rs),
|
||||
[Python](./instant-segment-py/examples/contrived.py)) to see how to construct
|
||||
these objects.
|
||||
|
||||
```python
|
||||
import instant_segment
|
||||
|
||||
segmenter = instant_segment.Segmenter(unigrams, bigrams)
|
||||
search = instant_segment.Search()
|
||||
segmenter.segment("instantdomainsearch", search)
|
||||
print([word for word in search])
|
||||
|
||||
--> ['instant', 'domain', 'search']
|
||||
```
|
||||
|
||||
```rust
|
||||
use instant_segment::{Search, Segmenter};
|
||||
use std::collections::HashMap;
|
||||
|
||||
let segmenter = Segmenter::from_maps(unigrams, bigrams);
|
||||
let mut search = Search::default();
|
||||
let words = segmenter
|
||||
.segment("instantdomainsearch", &mut search)
|
||||
.unwrap();
|
||||
println!("{:?}", words.collect::<Vec<&str>>());
|
||||
|
||||
--> ["instant", "domain", "search"]
|
||||
```
|
||||
|
||||
Check out the tests for more thorough examples:
|
||||
[Rust](./instant-segment/src/test_cases.rs),
|
||||
[Python](./instant-segment-py/test/test.py)
|
||||
|
||||
## Testing
|
||||
|
||||
To run the tests, use the following command:
|
||||
|
||||
```
|
||||
cargo t -p instant-segment --all-features
|
||||
```
|
||||
|
||||
You can also test the Python bindings with:
|
||||
|
||||
```
|
||||
make test-python
|
||||
```
|
||||
|
||||
[python]: https://github.com/grantjenks/python-wordsegment
|
||||
[chapter]: http://norvig.com/ngrams/
|
||||
[book]: http://oreilly.com/catalog/9780596157111/
|
||||
[corpus]: http://googleresearch.blogspot.com/2006/08/all-our-n-gram-are-belong-to-you.html
|
||||
[corpus]:
|
||||
http://googleresearch.blogspot.com/2006/08/all-our-n-gram-are-belong-to-you.html
|
||||
[distributed]: https://catalog.ldc.upenn.edu/LDC2006T13
|
||||
[issues]: https://github.com/InstantDomainSearch/instant-segment/issues
|
||||
[issues]: https://github.com/InstantDomainSearch/instant-segment/issues
|
7
data/README.md
Normal file
7
data/README.md
Normal file
@ -0,0 +1,7 @@
|
||||
The data files in this directory are derived from the [Google Web Trillion Word
|
||||
Corpus][corpus], as described by Thorsten Brants and Alex Franz, and [distributed][distributed] by the
|
||||
Linguistic Data Consortium. Note that this data **"may only be used for linguistic
|
||||
education and research"**, so for any other usage you should acquire a different data set.
|
||||
|
||||
[corpus]: http://googleresearch.blogspot.com/2006/08/all-our-n-gram-are-belong-to-you.html
|
||||
[distributed]: https://catalog.ldc.upenn.edu/LDC2006T13
|
22
instant-segment-py/examples/contrived.py
Normal file
22
instant-segment-py/examples/contrived.py
Normal file
@ -0,0 +1,22 @@
|
||||
import instant_segment
|
||||
|
||||
|
||||
def main():
|
||||
unigrams = []
|
||||
unigrams.append(("choose", 80_000))
|
||||
unigrams.append(("chooses", 7_000))
|
||||
unigrams.append(("spain", 20_000))
|
||||
unigrams.append(("pain", 90_000))
|
||||
|
||||
bigrams = []
|
||||
bigrams.append((("choose", "spain"), 7))
|
||||
bigrams.append((("chooses", "pain"), 0))
|
||||
|
||||
segmenter = instant_segment.Segmenter(iter(unigrams), iter(bigrams))
|
||||
search = instant_segment.Search()
|
||||
segmenter.segment("choosespain", search)
|
||||
print([word for word in search])
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
24
instant-segment/examples/contrived.rs
Normal file
24
instant-segment/examples/contrived.rs
Normal file
@ -0,0 +1,24 @@
|
||||
use instant_segment::{Search, Segmenter};
|
||||
use std::collections::HashMap;
|
||||
|
||||
fn main() {
|
||||
let mut unigrams = HashMap::default();
|
||||
|
||||
unigrams.insert("choose".into(), 80_000.0);
|
||||
unigrams.insert("chooses".into(), 7_000.0);
|
||||
|
||||
unigrams.insert("spain".into(), 20_000.0);
|
||||
unigrams.insert("pain".into(), 90_000.0);
|
||||
|
||||
let mut bigrams = HashMap::default();
|
||||
|
||||
bigrams.insert(("choose".into(), "spain".into()), 7.0);
|
||||
bigrams.insert(("chooses".into(), "pain".into()), 0.0);
|
||||
|
||||
let segmenter = Segmenter::from_maps(unigrams, bigrams);
|
||||
let mut search = Search::default();
|
||||
|
||||
let words = segmenter.segment("choosespain", &mut search).unwrap();
|
||||
|
||||
println!("{:?}", words.collect::<Vec<&str>>());
|
||||
}
|
Loading…
Reference in New Issue
Block a user