Reorg readme
This commit is contained in:
parent
4558b10b58
commit
356d9a0073
|
@ -1,3 +0,0 @@
|
||||||
{
|
|
||||||
|
|
||||||
}
|
|
120
README.md
120
README.md
|
@ -8,26 +8,6 @@
|
||||||
[![Build status](https://github.com/InstantDomainSearch/instant-segment/workflows/CI/badge.svg)](https://github.com/InstantDomainSearch/instant-segment/actions?query=workflow%3ACI)
|
[![Build status](https://github.com/InstantDomainSearch/instant-segment/workflows/CI/badge.svg)](https://github.com/InstantDomainSearch/instant-segment/actions?query=workflow%3ACI)
|
||||||
[![License: Apache 2.0](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](LICENSE-APACHE)
|
[![License: Apache 2.0](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](LICENSE-APACHE)
|
||||||
|
|
||||||
```python
|
|
||||||
segmenter = instant_segment.Segmenter(unigrams(), bigrams())
|
|
||||||
search = instant_segment.Search()
|
|
||||||
segmenter.segment("instantdomainsearch", search)
|
|
||||||
print([word for word in search])
|
|
||||||
|
|
||||||
--> ['instant', 'domain', 'search']
|
|
||||||
```
|
|
||||||
|
|
||||||
```rust
|
|
||||||
let segmenter = Segmenter::from_maps(unigrams, bigrams);
|
|
||||||
let mut search = Search::default();
|
|
||||||
let words = segmenter
|
|
||||||
.segment("instantdomainsearch", &mut search)
|
|
||||||
.unwrap();
|
|
||||||
println!("{:?}", words.collect::<Vec<&str>>())
|
|
||||||
|
|
||||||
--> ["instant", "domain", "search"]
|
|
||||||
```
|
|
||||||
|
|
||||||
Instant Segment is a fast Apache-2.0 library for English word segmentation. It
|
Instant Segment is a fast Apache-2.0 library for English word segmentation. It
|
||||||
is based on the Python [wordsegment][python] project written by Grant Jenks,
|
is based on the Python [wordsegment][python] project written by Grant Jenks,
|
||||||
which is in turn based on code from Peter Norvig's chapter [Natural Language
|
which is in turn based on code from Peter Norvig's chapter [Natural Language
|
||||||
|
@ -45,7 +25,21 @@ faster than the Python implementation. Further optimizations are planned -- see
|
||||||
the [issues][issues]. The API has been carefully constructed so that multiple
|
the [issues][issues]. The API has been carefully constructed so that multiple
|
||||||
segmentations can share the underlying state to allow parallel usage.
|
segmentations can share the underlying state to allow parallel usage.
|
||||||
|
|
||||||
## Installing
|
## How it works
|
||||||
|
|
||||||
|
Instant Segment works by segmenting a string into words by selecting the splits
|
||||||
|
with the highest probability given a corpus of words and their occurrences.
|
||||||
|
|
||||||
|
For instance, provided that `choose` and `spain` occur more frequently than
|
||||||
|
`chooses` and `pain`, and that the pair `choose spain` occurs more frequently
|
||||||
|
than `chooses pain`, Instant Segment can help you split the string
|
||||||
|
`choosespain.com` into `ChooseSpain.com` which more likely matches user intent.
|
||||||
|
|
||||||
|
We use this technique at
|
||||||
|
[Instant Domain Search](https://instantdomainsearch.com/search/sale?q=choosespain)
|
||||||
|
to do just this.
|
||||||
|
|
||||||
|
## Using the library
|
||||||
|
|
||||||
### Python **(>= 3.9)**
|
### Python **(>= 3.9)**
|
||||||
|
|
||||||
|
@ -57,85 +51,41 @@ pip install instant-segment
|
||||||
|
|
||||||
```toml
|
```toml
|
||||||
[dependencies]
|
[dependencies]
|
||||||
instant-segment = "*"
|
instant-segment = "0.8.1"
|
||||||
```
|
```
|
||||||
|
|
||||||
## Using
|
### Examples
|
||||||
|
|
||||||
Instant Segment works by segmenting a string into words by selecting the splits
|
The following examples expect `unigrams` and `bigrams` to exist. See the
|
||||||
with the highest probability given a corpus of words and their occurrences.
|
[examples](./examples) to see how to construct these objects.
|
||||||
|
|
||||||
For instance, provided that `choose` and `spain` occur more frequently than
|
|
||||||
`chooses` and `pain`, Instant Segment can help you split the string
|
|
||||||
`choosespain.com` into
|
|
||||||
[`ChooseSpain.com`](https://instantdomainsearch.com/search/sale?q=choosespain)
|
|
||||||
which more likely matches user intent.
|
|
||||||
|
|
||||||
```python
|
```python
|
||||||
import instant_segment
|
import instant_segment
|
||||||
|
|
||||||
|
segmenter = instant_segment.Segmenter(unigrams, bigrams)
|
||||||
|
search = instant_segment.Search()
|
||||||
|
segmenter.segment("instantdomainsearch", search)
|
||||||
|
print([word for word in search])
|
||||||
|
|
||||||
def main():
|
--> ['instant', 'domain', 'search']
|
||||||
unigrams = []
|
|
||||||
unigrams.append(("choose", 50))
|
|
||||||
unigrams.append(("chooses", 10))
|
|
||||||
unigrams.append(("spain", 50))
|
|
||||||
unigrams.append(("pain", 10))
|
|
||||||
|
|
||||||
bigrams = []
|
|
||||||
bigrams.append((("choose", "spain"), 10))
|
|
||||||
bigrams.append((("chooses", "pain"), 10))
|
|
||||||
|
|
||||||
segmenter = instant_segment.Segmenter(iter(unigrams), iter(bigrams))
|
|
||||||
search = instant_segment.Search()
|
|
||||||
segmenter.segment("choosespain", search)
|
|
||||||
print([word for word in search])
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
main()
|
|
||||||
|
|
||||||
```
|
```
|
||||||
|
|
||||||
```rust
|
```rust
|
||||||
use instant_segment::{Search, Segmenter}; use std::collections::HashMap;
|
use instant_segment::{Search, Segmenter}; use std::collections::HashMap;
|
||||||
|
|
||||||
fn main() {
|
let segmenter = Segmenter::from_maps(unigrams, bigrams);
|
||||||
let mut unigrams = HashMap::default();
|
let mut search = Search::default();
|
||||||
|
let words = segmenter
|
||||||
|
.segment("instantdomainsearch", &mut search)
|
||||||
|
.unwrap();
|
||||||
|
println!("{:?}", words.collect::<Vec<&str>>())
|
||||||
|
|
||||||
unigrams.insert("choose".into(), 50 as f64);
|
--> ["instant", "domain", "search"]
|
||||||
unigrams.insert("chooses".into(), 10 as f64);
|
|
||||||
|
|
||||||
unigrams.insert("spain".into(), 50 as f64);
|
|
||||||
unigrams.insert("pain".into(), 10 as f64);
|
|
||||||
|
|
||||||
let mut bigrams = HashMap::default();
|
|
||||||
|
|
||||||
bigrams.insert(("choose".into(), "spain".into()), 10 as f64);
|
|
||||||
bigrams.insert(("chooses".into(), "pain".into()), 10 as f64);
|
|
||||||
|
|
||||||
let segmenter = Segmenter::from_maps(unigrams, bigrams);
|
|
||||||
let mut search = Search::default();
|
|
||||||
|
|
||||||
let words = segmenter.segment("choosespain", &mut search).unwrap();
|
|
||||||
|
|
||||||
println!("{:?}", words.collect::<Vec<&str>>())
|
|
||||||
}
|
|
||||||
```
|
```
|
||||||
|
|
||||||
```
|
Check out the tests for a more thorough example:
|
||||||
['choose', 'spain']
|
[Rust](./instant-segment/src/test_cases.rs),
|
||||||
```
|
[Python](./instant-segment-py/test/test.py)
|
||||||
|
|
||||||
Play with the examples above to see that different numbers of occurrences will
|
|
||||||
influence the results
|
|
||||||
|
|
||||||
The example above is succinct but, in practice, you will want to load these
|
|
||||||
words and occurrences from a corpus of data like the ones we provide
|
|
||||||
[here](./data). Check out
|
|
||||||
[the](./instant-segment/instant-segment-py/test/test.py)
|
|
||||||
[tests](./instant-segment/instant-segment/src/test_data.rs) to see examples of
|
|
||||||
how you might do that.
|
|
||||||
|
|
||||||
## Testing
|
## Testing
|
||||||
|
|
||||||
|
@ -145,7 +95,7 @@ To run the tests run the following:
|
||||||
cargo t -p instant-segment --all-features
|
cargo t -p instant-segment --all-features
|
||||||
```
|
```
|
||||||
|
|
||||||
You can also test the python bindings with:
|
You can also test the Python bindings with:
|
||||||
|
|
||||||
```
|
```
|
||||||
make test-python
|
make test-python
|
||||||
|
|
|
@ -0,0 +1,22 @@
|
||||||
|
import instant_segment
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
unigrams = []
|
||||||
|
unigrams.append(("choose", 50))
|
||||||
|
unigrams.append(("chooses", 10))
|
||||||
|
unigrams.append(("spain", 50))
|
||||||
|
unigrams.append(("pain", 10))
|
||||||
|
|
||||||
|
bigrams = []
|
||||||
|
bigrams.append((("choose", "spain"), 10))
|
||||||
|
bigrams.append((("chooses", "pain"), 10))
|
||||||
|
|
||||||
|
segmenter = instant_segment.Segmenter(iter(unigrams), iter(bigrams))
|
||||||
|
search = instant_segment.Search()
|
||||||
|
segmenter.segment("choosespain", search)
|
||||||
|
print([word for word in search])
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
|
@ -0,0 +1,24 @@
|
||||||
|
use instant_segment::{Search, Segmenter};
|
||||||
|
use std::collections::HashMap;
|
||||||
|
|
||||||
|
fn main() {
|
||||||
|
let mut unigrams = HashMap::default();
|
||||||
|
|
||||||
|
unigrams.insert("choose".into(), 50 as f64);
|
||||||
|
unigrams.insert("chooses".into(), 10 as f64);
|
||||||
|
|
||||||
|
unigrams.insert("spain".into(), 50 as f64);
|
||||||
|
unigrams.insert("pain".into(), 10 as f64);
|
||||||
|
|
||||||
|
let mut bigrams = HashMap::default();
|
||||||
|
|
||||||
|
bigrams.insert(("choose".into(), "spain".into()), 10 as f64);
|
||||||
|
bigrams.insert(("chooses".into(), "pain".into()), 10 as f64);
|
||||||
|
|
||||||
|
let segmenter = Segmenter::from_maps(unigrams, bigrams);
|
||||||
|
let mut search = Search::default();
|
||||||
|
|
||||||
|
let words = segmenter.segment("choosespain", &mut search).unwrap();
|
||||||
|
|
||||||
|
println!("{:?}", words.collect::<Vec<&str>>());
|
||||||
|
}
|
Loading…
Reference in New Issue