py: initial version of Python bindings

This commit is contained in:
Dirkjan Ochtman 2021-03-24 10:59:57 +01:00
parent f6061044fc
commit fd774ad465
7 changed files with 170 additions and 28 deletions

11
.cargo/config.toml Normal file
View File

@ -0,0 +1,11 @@
# Extra linker arguments for both macOS targets: `-undefined dynamic_lookup`
# is passed through to the linker for every crate built for these targets.
# NOTE(review): presumably required so the Python extension module can leave
# libpython symbols unresolved until it is loaded by the interpreter — confirm
# against the PyO3 build documentation.
[target.x86_64-apple-darwin]
rustflags = [
"-C", "link-arg=-undefined",
"-C", "link-arg=dynamic_lookup",
]
# Same flags for Apple Silicon.
[target.aarch64-apple-darwin]
rustflags = [
"-C", "link-arg=-undefined",
"-C", "link-arg=dynamic_lookup",
]

2
.gitignore vendored
View File

@ -1,2 +1,4 @@
/target
Cargo.lock
*.so
__pycache__

View File

@ -1,28 +1,2 @@
[package]
name = "instant-segment"
version = "0.7.1"
authors = ["Dirkjan Ochtman <dirkjan@ochtman.nl>"]
edition = "2018"
license = "Apache-2.0"
description = "Fast English word segmentation"
homepage = "https://github.com/InstantDomainSearch/instant-segment"
repository = "https://github.com/InstantDomainSearch/instant-segment"
documentation = "https://docs.rs/instant-segment"
[features]
__test_data = ["test-cases"]
test-cases = []
with-serde = ["serde", "ahash/serde", "smartstring/serde"]
[dependencies]
ahash = "0.7.0"
smartstring = "0.2.5"
serde = { version = "1.0.123", features = ["derive"], optional = true }
[dev-dependencies]
bencher = "0.1.5"
once_cell = "1.4"
[[bench]]
name = "bench"
harness = false
[workspace]
members = ["instant-segment", "instant-segment-py"]

4
Makefile Normal file
View File

@ -0,0 +1,4 @@
# Shared-library suffix produced by cargo for a cdylib on this platform:
# `.dylib` on macOS, `.so` elsewhere.
ifeq ($(shell uname -s),Darwin)
LIB_EXT := dylib
else
LIB_EXT := so
endif

# Build the workspace in release mode, copy the resulting library into the
# Python test directory as `instant_segment.so`, then run the `test` module
# with that directory on PYTHONPATH.
# Declared phony so a stray file named `test-python` cannot mask the target.
.PHONY: test-python
test-python:
	cargo build --release
	cp target/release/libinstant_segment.$(LIB_EXT) instant-segment-py/test/instant_segment.so
	PYTHONPATH=instant-segment-py/test/ python3 -m test

View File

@ -0,0 +1,20 @@
[package]
name = "instant-segment-py"
version = "0.1.0"
authors = ["Dirkjan Ochtman <dirkjan@ochtman.nl>"]
edition = "2018"
license = "Apache-2.0"
workspace = ".."
[lib]
name = "instant_segment"
crate-type = ["cdylib"]
[dependencies]
ahash = "0.7.2"
instant-segment = { version = "0.7", path = "../instant-segment" }
pyo3 = { version = "0.13.2", features = ["extension-module"] }
smartstring = "0.2.6"
[package.metadata.maturin]
name = "instant-segment"

View File

@ -0,0 +1,108 @@
use pyo3::exceptions::PyValueError;
use pyo3::proc_macro::{pyclass, pymethods, pymodule, pyproto};
use pyo3::types::{PyIterator, PyModule};
use pyo3::{PyErr, PyIterProtocol, PyRef, PyRefMut, PyResult, Python};
use smartstring::alias::String as SmartString;
/// Python extension module exposing the `Search` and `Segmenter` classes
#[pymodule]
fn instant_segment(_py: Python, module: &PyModule) -> PyResult<()> {
    module.add_class::<Search>()?;
    // The last registration's result doubles as the function's return value.
    module.add_class::<Segmenter>()
}
/// Word segmenter built from unigram and bigram scores
#[pyclass]
struct Segmenter {
    // The wrapped Rust segmenter that every method delegates to
    inner: instant_segment::Segmenter,
}
#[pymethods]
impl Segmenter {
    /// Build a segmenter from iterators of scored unigrams and bigrams
    ///
    /// `unigrams` must yield `(word, score)` items and `bigrams` must yield
    /// `((first, second), score)` items; words are extracted as `str` and
    /// scores as `float`. Any exception raised while iterating or while
    /// extracting an item is propagated to the caller.
    #[new]
    fn new(unigrams: &PyIterator, bigrams: &PyIterator) -> PyResult<Self> {
        let unigrams = unigrams
            .map(|item| {
                let item = item?;
                let key = item.get_item(0)?.extract::<&str>()?;
                let val = item.get_item(1)?.extract::<f64>()?;
                Ok((SmartString::from(key), val))
            })
            .collect::<Result<HashMap<_, _>, PyErr>>()?;

        let bigrams = bigrams
            .map(|item| {
                let item = item?;
                // The key of a bigram item is itself a pair of words.
                let key = item.get_item(0)?;
                let first = key.get_item(0)?.extract::<&str>()?;
                let second = key.get_item(1)?.extract::<&str>()?;
                let val = item.get_item(1)?.extract::<f64>()?;
                Ok(((SmartString::from(first), SmartString::from(second)), val))
            })
            .collect::<Result<HashMap<_, _>, PyErr>>()?;

        Ok(Self {
            inner: instant_segment::Segmenter::from_maps(unigrams, bigrams),
        })
    }

    /// Segment `s`, storing the result in `search`
    ///
    /// On success `search` is rewound so that iterating it yields the words
    /// from the start. Raises `ValueError` if the underlying segmenter
    /// rejects the input; in that case `search` is left with nothing to
    /// iterate.
    fn segment(&self, s: &str, search: &mut Search) -> PyResult<()> {
        if self.inner.segment(s, &mut search.inner).is_ok() {
            search.cur = Some(0);
            Ok(())
        } else {
            // Drop the cursor so a failed call cannot expose words that are
            // still sitting in the buffer from an earlier, successful call.
            search.cur = None;
            Err(PyValueError::new_err(
                "only lowercase ASCII letters allowed",
            ))
        }
    }
}
/// Search buffer and result set
#[pyclass]
struct Search {
    // Wrapped Rust search state; `Segmenter::segment` writes its result here
    inner: instant_segment::Search,
    // Index of the next word to hand out while iterating, or `None` when
    // there is nothing to iterate (no segmentation yet, or already exhausted)
    cur: Option<usize>,
}
#[pymethods]
impl Search {
/// Initialize an empty search buffer
#[new]
fn new() -> Self {
Self {
inner: instant_segment::Search::default(),
cur: None,
}
}
}
#[pyproto]
impl PyIterProtocol for Search {
    /// A `Search` is its own iterator
    fn __iter__(this: PyRef<Self>) -> PyRef<Self> {
        this
    }

    /// Return the next word of the stored segmentation, or `None` once exhausted
    fn __next__(mut slf: PyRefMut<Self>) -> Option<String> {
        // No cursor means there is nothing to iterate.
        let idx = slf.cur?;
        // Copy the word out so the borrow of `slf` ends before the cursor is updated.
        let next = slf.inner.get(idx).map(String::from);
        // Advance past a yielded word; clear the cursor when the end is reached.
        slf.cur = next.as_ref().map(|_| idx + 1);
        next
    }
}
// Standard-library hash map using ahash's `RandomState` as its hasher
type HashMap<K, V> = std::collections::HashMap<K, V, ahash::RandomState>;

View File

@ -0,0 +1,23 @@
import instant_segment, os, sys
DATA_DIR = os.path.join(os.path.dirname(__file__), '../../data/')
def unigrams():
    """Yield ``(word, score)`` tuples read from ``unigrams.txt`` in ``DATA_DIR``.

    Each line is split on the first tab: the left part is the word, the right
    part (stripped of surrounding whitespace) is parsed as a float score.
    """
    # The context manager closes the file as soon as the generator is
    # exhausted or closed, instead of leaving that to garbage collection.
    with open(os.path.join(DATA_DIR, 'unigrams.txt')) as lines:
        for ln in lines:
            parts = ln.split('\t', 1)
            yield (parts[0], float(parts[1].strip()))
def bigrams():
    """Yield ``((first, second), score)`` tuples read from ``bigrams.txt`` in ``DATA_DIR``.

    Each line is split on the first space to get the first word; the remainder
    is split on the first tab into the second word and the score, which is
    stripped of surrounding whitespace and parsed as a float.
    """
    # The context manager closes the file as soon as the generator is
    # exhausted or closed, instead of leaving that to garbage collection.
    with open(os.path.join(DATA_DIR, 'bigrams.txt')) as lines:
        for ln in lines:
            word_split = ln.split(' ', 1)
            score_split = word_split[1].split('\t', 1)
            yield ((word_split[0], score_split[0]), float(score_split[1].strip()))
def main():
    """Segment a sample string with the extension module and print the words."""
    seg = instant_segment.Segmenter(unigrams(), bigrams())
    buf = instant_segment.Search()
    seg.segment('thisisatest', buf)
    # Iterating the search buffer yields the segmented words in order.
    print(list(buf))


# Run the demo only when executed as a script, not when imported.
if __name__ == '__main__':
    main()