From fd774ad465960148b6fbeb4301cae502d750e6cc Mon Sep 17 00:00:00 2001
From: Dirkjan Ochtman
Date: Wed, 24 Mar 2021 10:59:57 +0100
Subject: [PATCH] py: initial version of Python bindings

---
 .cargo/config.toml              |  11 ++++
 .gitignore                      |   2 +
 Cargo.toml                      |  30 +--------
 Makefile                        |   4 ++
 instant-segment-py/Cargo.toml   |  20 ++++++
 instant-segment-py/src/lib.rs   | 108 ++++++++++++++++++++++++++++++++
 instant-segment-py/test/test.py |  23 +++++++
 7 files changed, 170 insertions(+), 28 deletions(-)
 create mode 100644 .cargo/config.toml
 create mode 100644 Makefile
 create mode 100644 instant-segment-py/Cargo.toml
 create mode 100644 instant-segment-py/src/lib.rs
 create mode 100644 instant-segment-py/test/test.py

diff --git a/.cargo/config.toml b/.cargo/config.toml
new file mode 100644
index 0000000..d47f983
--- /dev/null
+++ b/.cargo/config.toml
@@ -0,0 +1,11 @@
+[target.x86_64-apple-darwin]
+rustflags = [
+  "-C", "link-arg=-undefined",
+  "-C", "link-arg=dynamic_lookup",
+]
+
+[target.aarch64-apple-darwin]
+rustflags = [
+  "-C", "link-arg=-undefined",
+  "-C", "link-arg=dynamic_lookup",
+]
diff --git a/.gitignore b/.gitignore
index 96ef6c0..1914827 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,4 @@
 /target
 Cargo.lock
+*.so
+__pycache__
diff --git a/Cargo.toml b/Cargo.toml
index b6ca6d2..c1481ab 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,28 +1,2 @@
-[package]
-name = "instant-segment"
-version = "0.7.1"
-authors = ["Dirkjan Ochtman "]
-edition = "2018"
-license = "Apache-2.0"
-description = "Fast English word segmentation"
-homepage = "https://github.com/InstantDomainSearch/instant-segment"
-repository = "https://github.com/InstantDomainSearch/instant-segment"
-documentation = "https://docs.rs/instant-segment"
-
-[features]
-__test_data = ["test-cases"]
-test-cases = []
-with-serde = ["serde", "ahash/serde", "smartstring/serde"]
-
-[dependencies]
-ahash = "0.7.0"
-smartstring = "0.2.5"
-serde = { version = "1.0.123", features = ["derive"], optional = true }
-
-[dev-dependencies]
-bencher = "0.1.5"
-once_cell = "1.4"
-
-[[bench]]
-name = "bench"
-harness = false
+[workspace]
+members = ["instant-segment", "instant-segment-py"]
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..32c5a63
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,4 @@
+test-python:
+	cargo build --release
+	cp target/release/libinstant_segment.dylib instant-segment-py/test/instant_segment.so
+	PYTHONPATH=instant-segment-py/test/ python3 -m test
diff --git a/instant-segment-py/Cargo.toml b/instant-segment-py/Cargo.toml
new file mode 100644
index 0000000..6c5289f
--- /dev/null
+++ b/instant-segment-py/Cargo.toml
@@ -0,0 +1,20 @@
+[package]
+name = "instant-segment-py"
+version = "0.1.0"
+authors = ["Dirkjan Ochtman "]
+edition = "2018"
+license = "Apache-2.0"
+workspace = ".."
+
+[lib]
+name = "instant_segment"
+crate-type = ["cdylib"]
+
+[dependencies]
+ahash = "0.7.2"
+instant-segment = { version = "0.7", path = "../instant-segment" }
+pyo3 = { version = "0.13.2", features = ["extension-module"] }
+smartstring = "0.2.6"
+
+[package.metadata.maturin]
+name = "instant-segment"
diff --git a/instant-segment-py/src/lib.rs b/instant-segment-py/src/lib.rs
new file mode 100644
index 0000000..2043fe5
--- /dev/null
+++ b/instant-segment-py/src/lib.rs
@@ -0,0 +1,108 @@
+use pyo3::exceptions::PyValueError;
+use pyo3::proc_macro::{pyclass, pymethods, pymodule, pyproto};
+use pyo3::types::{PyIterator, PyModule};
+use pyo3::{PyErr, PyIterProtocol, PyRef, PyRefMut, PyResult, Python};
+use smartstring::alias::String as SmartString;
+
+#[pymodule]
+fn instant_segment(_: Python, m: &PyModule) -> PyResult<()> {
+    m.add_class::<Search>()?;
+    m.add_class::<Segmenter>()?;
+    Ok(())
+}
+
+#[pyclass]
+struct Segmenter {
+    inner: instant_segment::Segmenter,
+}
+
+#[pymethods]
+impl Segmenter {
+    #[new]
+    fn new(unigrams: &PyIterator, bigrams: &PyIterator) -> PyResult<Self> {
+        let unigrams = unigrams
+            .map(|item| {
+                let item = item?;
+                let key = item.get_item(0)?.extract::<&str>()?;
+                let val = item.get_item(1)?.extract::<f64>()?;
+                Ok((SmartString::from(key), val))
+            })
+            .collect::<Result<HashMap<_, _>, PyErr>>()?;
+
+        let bigrams = bigrams
+            .map(|item| {
+                let item = item?;
+
+                let key = item.get_item(0)?;
+                let first = key.get_item(0)?.extract::<&str>()?;
+                let second = key.get_item(1)?.extract::<&str>()?;
+
+                let val = item.get_item(1)?.extract::<f64>()?;
+                Ok(((SmartString::from(first), SmartString::from(second)), val))
+            })
+            .collect::<Result<HashMap<_, _>, PyErr>>()?;
+
+        Ok(Self {
+            inner: instant_segment::Segmenter::from_maps(unigrams, bigrams),
+        })
+    }
+
+    fn segment(&self, s: &str, search: &mut Search) -> PyResult<()> {
+        match self.inner.segment(s, &mut search.inner) {
+            Ok(_) => {
+                search.cur = Some(0);
+                Ok(())
+            }
+            Err(_) => Err(PyValueError::new_err(
+                "only lowercase ASCII letters allowed",
+            )),
+        }
+    }
+}
+
+/// Search buffer and result set
+#[pyclass]
+struct Search {
+    inner: instant_segment::Search,
+    cur: Option<usize>,
+}
+
+#[pymethods]
+impl Search {
+    /// Initialize an empty search buffer
+    #[new]
+    fn new() -> Self {
+        Self {
+            inner: instant_segment::Search::default(),
+            cur: None,
+        }
+    }
+}
+
+#[pyproto]
+impl PyIterProtocol for Search {
+    fn __iter__(slf: PyRef<Self>) -> PyRef<Self> {
+        slf
+    }
+
+    /// Return the next closest point
+    fn __next__(mut slf: PyRefMut<Self>) -> Option<String> {
+        let idx = match &slf.cur {
+            Some(idx) => *idx,
+            None => return None,
+        };
+
+        let word = match slf.inner.get(idx) {
+            Some(word) => String::from(word),
+            None => {
+                slf.cur = None;
+                return None;
+            }
+        };
+
+        slf.cur = Some(idx + 1);
+        Some(word)
+    }
+}
+
+type HashMap<K, V> = std::collections::HashMap<K, V, ahash::RandomState>;
diff --git a/instant-segment-py/test/test.py b/instant-segment-py/test/test.py
new file mode 100644
index 0000000..bb41da9
--- /dev/null
+++ b/instant-segment-py/test/test.py
@@ -0,0 +1,23 @@
+import instant_segment, os, sys
+
+DATA_DIR = os.path.join(os.path.dirname(__file__), '../../data/')
+
+def unigrams():
+    for ln in open(os.path.join(DATA_DIR, 'unigrams.txt')):
+        parts = ln.split('\t', 1)
+        yield (parts[0], float(parts[1].strip()))
+
+def bigrams():
+    for ln in open(os.path.join(DATA_DIR, 'bigrams.txt')):
+        word_split = ln.split(' ', 1)
+        score_split = word_split[1].split('\t', 1)
+        yield ((word_split[0], score_split[0]), float(score_split[1].strip()))
+
+def main():
+    segmenter = instant_segment.Segmenter(unigrams(), bigrams())
+    search = instant_segment.Search()
+    segmenter.segment('thisisatest', search)
+    print([word for word in search])
+
+if __name__ == '__main__':
+    main()