py: initial version of Python bindings
This commit is contained in:
parent
f6061044fc
commit
fd774ad465
|
@ -0,0 +1,11 @@
|
||||||
|
[target.x86_64-apple-darwin]
|
||||||
|
rustflags = [
|
||||||
|
"-C", "link-arg=-undefined",
|
||||||
|
"-C", "link-arg=dynamic_lookup",
|
||||||
|
]
|
||||||
|
|
||||||
|
[target.aarch64-apple-darwin]
|
||||||
|
rustflags = [
|
||||||
|
"-C", "link-arg=-undefined",
|
||||||
|
"-C", "link-arg=dynamic_lookup",
|
||||||
|
]
|
|
@ -1,2 +1,4 @@
|
||||||
/target
|
/target
|
||||||
Cargo.lock
|
Cargo.lock
|
||||||
|
*.so
|
||||||
|
__pycache__
|
||||||
|
|
30
Cargo.toml
30
Cargo.toml
|
@ -1,28 +1,2 @@
|
||||||
[package]
|
[workspace]
|
||||||
name = "instant-segment"
|
members = ["instant-segment", "instant-segment-py"]
|
||||||
version = "0.7.1"
|
|
||||||
authors = ["Dirkjan Ochtman <dirkjan@ochtman.nl>"]
|
|
||||||
edition = "2018"
|
|
||||||
license = "Apache-2.0"
|
|
||||||
description = "Fast English word segmentation"
|
|
||||||
homepage = "https://github.com/InstantDomainSearch/instant-segment"
|
|
||||||
repository = "https://github.com/InstantDomainSearch/instant-segment"
|
|
||||||
documentation = "https://docs.rs/instant-segment"
|
|
||||||
|
|
||||||
[features]
|
|
||||||
__test_data = ["test-cases"]
|
|
||||||
test-cases = []
|
|
||||||
with-serde = ["serde", "ahash/serde", "smartstring/serde"]
|
|
||||||
|
|
||||||
[dependencies]
|
|
||||||
ahash = "0.7.0"
|
|
||||||
smartstring = "0.2.5"
|
|
||||||
serde = { version = "1.0.123", features = ["derive"], optional = true }
|
|
||||||
|
|
||||||
[dev-dependencies]
|
|
||||||
bencher = "0.1.5"
|
|
||||||
once_cell = "1.4"
|
|
||||||
|
|
||||||
[[bench]]
|
|
||||||
name = "bench"
|
|
||||||
harness = false
|
|
||||||
|
|
|
@ -0,0 +1,4 @@
|
||||||
|
test-python:
|
||||||
|
cargo build --release
|
||||||
|
cp target/release/libinstant_segment.dylib instant-segment-py/test/instant_segment.so
|
||||||
|
PYTHONPATH=instant-segment-py/test/ python3 -m test
|
|
@ -0,0 +1,20 @@
|
||||||
|
[package]
|
||||||
|
name = "instant-segment-py"
|
||||||
|
version = "0.1.0"
|
||||||
|
authors = ["Dirkjan Ochtman <dirkjan@ochtman.nl>"]
|
||||||
|
edition = "2018"
|
||||||
|
license = "Apache-2.0"
|
||||||
|
workspace = ".."
|
||||||
|
|
||||||
|
[lib]
|
||||||
|
name = "instant_segment"
|
||||||
|
crate-type = ["cdylib"]
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
ahash = "0.7.2"
|
||||||
|
instant-segment = { version = "0.7", path = "../instant-segment" }
|
||||||
|
pyo3 = { version = "0.13.2", features = ["extension-module"] }
|
||||||
|
smartstring = "0.2.6"
|
||||||
|
|
||||||
|
[package.metadata.maturin]
|
||||||
|
name = "instant-segment"
|
|
@ -0,0 +1,108 @@
|
||||||
|
use pyo3::exceptions::PyValueError;
|
||||||
|
use pyo3::proc_macro::{pyclass, pymethods, pymodule, pyproto};
|
||||||
|
use pyo3::types::{PyIterator, PyModule};
|
||||||
|
use pyo3::{PyErr, PyIterProtocol, PyRef, PyRefMut, PyResult, Python};
|
||||||
|
use smartstring::alias::String as SmartString;
|
||||||
|
|
||||||
|
#[pymodule]
|
||||||
|
fn instant_segment(_: Python, m: &PyModule) -> PyResult<()> {
|
||||||
|
m.add_class::<Search>()?;
|
||||||
|
m.add_class::<Segmenter>()?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
#[pyclass]
|
||||||
|
struct Segmenter {
|
||||||
|
inner: instant_segment::Segmenter,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[pymethods]
|
||||||
|
impl Segmenter {
|
||||||
|
#[new]
|
||||||
|
fn new(unigrams: &PyIterator, bigrams: &PyIterator) -> PyResult<Self> {
|
||||||
|
let unigrams = unigrams
|
||||||
|
.map(|item| {
|
||||||
|
let item = item?;
|
||||||
|
let key = item.get_item(0)?.extract::<&str>()?;
|
||||||
|
let val = item.get_item(1)?.extract::<f64>()?;
|
||||||
|
Ok((SmartString::from(key), val))
|
||||||
|
})
|
||||||
|
.collect::<Result<HashMap<_, _>, PyErr>>()?;
|
||||||
|
|
||||||
|
let bigrams = bigrams
|
||||||
|
.map(|item| {
|
||||||
|
let item = item?;
|
||||||
|
|
||||||
|
let key = item.get_item(0)?;
|
||||||
|
let first = key.get_item(0)?.extract::<&str>()?;
|
||||||
|
let second = key.get_item(1)?.extract::<&str>()?;
|
||||||
|
|
||||||
|
let val = item.get_item(1)?.extract::<f64>()?;
|
||||||
|
Ok(((SmartString::from(first), SmartString::from(second)), val))
|
||||||
|
})
|
||||||
|
.collect::<Result<HashMap<_, _>, PyErr>>()?;
|
||||||
|
|
||||||
|
Ok(Self {
|
||||||
|
inner: instant_segment::Segmenter::from_maps(unigrams, bigrams),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
fn segment(&self, s: &str, search: &mut Search) -> PyResult<()> {
|
||||||
|
match self.inner.segment(s, &mut search.inner) {
|
||||||
|
Ok(_) => {
|
||||||
|
search.cur = Some(0);
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
Err(_) => Err(PyValueError::new_err(
|
||||||
|
"only lowercase ASCII letters allowed",
|
||||||
|
)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Search buffer and result set
|
||||||
|
#[pyclass]
|
||||||
|
struct Search {
|
||||||
|
inner: instant_segment::Search,
|
||||||
|
cur: Option<usize>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[pymethods]
|
||||||
|
impl Search {
|
||||||
|
/// Initialize an empty search buffer
|
||||||
|
#[new]
|
||||||
|
fn new() -> Self {
|
||||||
|
Self {
|
||||||
|
inner: instant_segment::Search::default(),
|
||||||
|
cur: None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[pyproto]
|
||||||
|
impl PyIterProtocol for Search {
|
||||||
|
fn __iter__(slf: PyRef<Self>) -> PyRef<Self> {
|
||||||
|
slf
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Return the next closest point
|
||||||
|
fn __next__(mut slf: PyRefMut<Self>) -> Option<String> {
|
||||||
|
let idx = match &slf.cur {
|
||||||
|
Some(idx) => *idx,
|
||||||
|
None => return None,
|
||||||
|
};
|
||||||
|
|
||||||
|
let word = match slf.inner.get(idx) {
|
||||||
|
Some(word) => String::from(word),
|
||||||
|
None => {
|
||||||
|
slf.cur = None;
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
slf.cur = Some(idx + 1);
|
||||||
|
Some(word)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Standard-library hash map parameterized with `ahash::RandomState` as its hasher;
// `HashMap` in this file refers to this alias.
type HashMap<K, V> = std::collections::HashMap<K, V, ahash::RandomState>;
|
|
@ -0,0 +1,23 @@
|
||||||
|
import instant_segment, os, sys
|
||||||
|
|
||||||
|
DATA_DIR = os.path.join(os.path.dirname(__file__), '../../data/')
|
||||||
|
|
||||||
|
def unigrams():
    """Yield ``(word, score)`` tuples read from ``unigrams.txt`` in ``DATA_DIR``.

    Each line is split once on a tab: the part before it is the word, the part
    after it is converted to ``float``.
    """
    # The context manager closes the file once the generator is exhausted or closed,
    # instead of leaving the handle open until garbage collection.
    with open(os.path.join(DATA_DIR, 'unigrams.txt')) as lines:
        for ln in lines:
            parts = ln.split('\t', 1)
            yield (parts[0], float(parts[1].strip()))
|
||||||
|
|
||||||
|
def bigrams():
    """Yield ``((first, second), score)`` tuples read from ``bigrams.txt`` in ``DATA_DIR``.

    Each line is split once on a space to get the first word; the remainder is split
    once on a tab into the second word and the score, which is converted to ``float``.
    """
    # The context manager closes the file once the generator is exhausted or closed,
    # instead of leaving the handle open until garbage collection.
    with open(os.path.join(DATA_DIR, 'bigrams.txt')) as lines:
        for ln in lines:
            word_split = ln.split(' ', 1)
            score_split = word_split[1].split('\t', 1)
            yield ((word_split[0], score_split[0]), float(score_split[1].strip()))
|
||||||
|
|
||||||
|
def main():
    """Segment a sample string with the extension module and print the words as a list."""
    model = instant_segment.Segmenter(unigrams(), bigrams())
    results = instant_segment.Search()
    # `segment` fills `results`; iterating over it afterwards yields the words.
    model.segment('thisisatest', results)
    print(list(results))


# Run the demo only when this file is executed as a script.
if __name__ == '__main__':
    main()
|
Loading…
Reference in New Issue