py: initial version of Python bindings
This commit is contained in:
parent
f6061044fc
commit
fd774ad465
|
@ -0,0 +1,11 @@
|
|||
# Python extension modules are not linked against libpython at build time;
# the interpreter provides those symbols when it loads the module. On macOS
# the linker must be told to accept the unresolved symbols, which is what
# `-undefined dynamic_lookup` does. The same flags apply to both Apple targets.
[target.x86_64-apple-darwin]
rustflags = ["-C", "link-arg=-undefined", "-C", "link-arg=dynamic_lookup"]

[target.aarch64-apple-darwin]
rustflags = ["-C", "link-arg=-undefined", "-C", "link-arg=dynamic_lookup"]
|
|
@ -1,2 +1,4 @@
|
|||
/target
|
||||
Cargo.lock
|
||||
*.so
|
||||
__pycache__
|
||||
|
|
30
Cargo.toml
30
Cargo.toml
|
@ -1,28 +1,2 @@
|
|||
[package]
|
||||
name = "instant-segment"
|
||||
version = "0.7.1"
|
||||
authors = ["Dirkjan Ochtman <dirkjan@ochtman.nl>"]
|
||||
edition = "2018"
|
||||
license = "Apache-2.0"
|
||||
description = "Fast English word segmentation"
|
||||
homepage = "https://github.com/InstantDomainSearch/instant-segment"
|
||||
repository = "https://github.com/InstantDomainSearch/instant-segment"
|
||||
documentation = "https://docs.rs/instant-segment"
|
||||
|
||||
[features]
|
||||
__test_data = ["test-cases"]
|
||||
test-cases = []
|
||||
with-serde = ["serde", "ahash/serde", "smartstring/serde"]
|
||||
|
||||
[dependencies]
|
||||
ahash = "0.7.0"
|
||||
smartstring = "0.2.5"
|
||||
serde = { version = "1.0.123", features = ["derive"], optional = true }
|
||||
|
||||
[dev-dependencies]
|
||||
bencher = "0.1.5"
|
||||
once_cell = "1.4"
|
||||
|
||||
[[bench]]
|
||||
name = "bench"
|
||||
harness = false
|
||||
[workspace]
|
||||
members = ["instant-segment", "instant-segment-py"]
|
||||
|
|
|
@ -0,0 +1,4 @@
|
|||
test-python:
|
||||
cargo build --release
|
||||
cp target/release/libinstant_segment.dylib instant-segment-py/test/instant_segment.so
|
||||
PYTHONPATH=instant-segment-py/test/ python3 -m test
|
|
@ -0,0 +1,20 @@
|
|||
[package]
|
||||
name = "instant-segment-py"
|
||||
version = "0.1.0"
|
||||
authors = ["Dirkjan Ochtman <dirkjan@ochtman.nl>"]
|
||||
edition = "2018"
|
||||
license = "Apache-2.0"
|
||||
workspace = ".."
|
||||
|
||||
[lib]
|
||||
name = "instant_segment"
|
||||
crate-type = ["cdylib"]
|
||||
|
||||
[dependencies]
|
||||
ahash = "0.7.2"
|
||||
instant-segment = { version = "0.7", path = "../instant-segment" }
|
||||
pyo3 = { version = "0.13.2", features = ["extension-module"] }
|
||||
smartstring = "0.2.6"
|
||||
|
||||
[package.metadata.maturin]
|
||||
name = "instant-segment"
|
|
@ -0,0 +1,108 @@
|
|||
use pyo3::exceptions::PyValueError;
|
||||
use pyo3::proc_macro::{pyclass, pymethods, pymodule, pyproto};
|
||||
use pyo3::types::{PyIterator, PyModule};
|
||||
use pyo3::{PyErr, PyIterProtocol, PyRef, PyRefMut, PyResult, Python};
|
||||
use smartstring::alias::String as SmartString;
|
||||
|
||||
#[pymodule]
|
||||
fn instant_segment(_: Python, m: &PyModule) -> PyResult<()> {
|
||||
m.add_class::<Search>()?;
|
||||
m.add_class::<Segmenter>()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[pyclass]
|
||||
struct Segmenter {
|
||||
inner: instant_segment::Segmenter,
|
||||
}
|
||||
|
||||
#[pymethods]
|
||||
impl Segmenter {
|
||||
#[new]
|
||||
fn new(unigrams: &PyIterator, bigrams: &PyIterator) -> PyResult<Self> {
|
||||
let unigrams = unigrams
|
||||
.map(|item| {
|
||||
let item = item?;
|
||||
let key = item.get_item(0)?.extract::<&str>()?;
|
||||
let val = item.get_item(1)?.extract::<f64>()?;
|
||||
Ok((SmartString::from(key), val))
|
||||
})
|
||||
.collect::<Result<HashMap<_, _>, PyErr>>()?;
|
||||
|
||||
let bigrams = bigrams
|
||||
.map(|item| {
|
||||
let item = item?;
|
||||
|
||||
let key = item.get_item(0)?;
|
||||
let first = key.get_item(0)?.extract::<&str>()?;
|
||||
let second = key.get_item(1)?.extract::<&str>()?;
|
||||
|
||||
let val = item.get_item(1)?.extract::<f64>()?;
|
||||
Ok(((SmartString::from(first), SmartString::from(second)), val))
|
||||
})
|
||||
.collect::<Result<HashMap<_, _>, PyErr>>()?;
|
||||
|
||||
Ok(Self {
|
||||
inner: instant_segment::Segmenter::from_maps(unigrams, bigrams),
|
||||
})
|
||||
}
|
||||
|
||||
fn segment(&self, s: &str, search: &mut Search) -> PyResult<()> {
|
||||
match self.inner.segment(s, &mut search.inner) {
|
||||
Ok(_) => {
|
||||
search.cur = Some(0);
|
||||
Ok(())
|
||||
}
|
||||
Err(_) => Err(PyValueError::new_err(
|
||||
"only lowercase ASCII letters allowed",
|
||||
)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Search buffer and result set
|
||||
#[pyclass]
|
||||
struct Search {
|
||||
inner: instant_segment::Search,
|
||||
cur: Option<usize>,
|
||||
}
|
||||
|
||||
#[pymethods]
|
||||
impl Search {
|
||||
/// Initialize an empty search buffer
|
||||
#[new]
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
inner: instant_segment::Search::default(),
|
||||
cur: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[pyproto]
|
||||
impl PyIterProtocol for Search {
|
||||
fn __iter__(slf: PyRef<Self>) -> PyRef<Self> {
|
||||
slf
|
||||
}
|
||||
|
||||
/// Return the next closest point
|
||||
fn __next__(mut slf: PyRefMut<Self>) -> Option<String> {
|
||||
let idx = match &slf.cur {
|
||||
Some(idx) => *idx,
|
||||
None => return None,
|
||||
};
|
||||
|
||||
let word = match slf.inner.get(idx) {
|
||||
Some(word) => String::from(word),
|
||||
None => {
|
||||
slf.cur = None;
|
||||
return None;
|
||||
}
|
||||
};
|
||||
|
||||
slf.cur = Some(idx + 1);
|
||||
Some(word)
|
||||
}
|
||||
}
|
||||
|
||||
// Standard-library hash map parameterized with `ahash::RandomState` as its hasher state
type HashMap<K, V> = std::collections::HashMap<K, V, ahash::RandomState>;
|
|
@ -0,0 +1,23 @@
|
|||
import instant_segment, os, sys
|
||||
|
||||
# Directory containing the data files, located relative to this script's own directory
DATA_DIR = os.path.join(os.path.dirname(__file__), '../../data/')
|
||||
|
||||
def unigrams():
    """Yield ``(word, score)`` tuples read from ``unigrams.txt`` in ``DATA_DIR``.

    Each line is split on its first tab: the text before the tab is the word,
    and the remainder, stripped of surrounding whitespace, is parsed as a float.
    """
    # The context manager closes the file as soon as the generator finishes
    # or is closed, instead of leaving that to garbage collection.
    with open(os.path.join(DATA_DIR, 'unigrams.txt')) as data:
        for ln in data:
            parts = ln.split('\t', 1)
            yield (parts[0], float(parts[1].strip()))
|
||||
|
||||
def bigrams():
    """Yield ``((first, second), score)`` tuples read from ``bigrams.txt`` in ``DATA_DIR``.

    Each line is split on its first space to get the first word; the rest is
    split on its first tab into the second word and the score, which is
    stripped of surrounding whitespace and parsed as a float.
    """
    # The context manager closes the file as soon as the generator finishes
    # or is closed, instead of leaving that to garbage collection.
    with open(os.path.join(DATA_DIR, 'bigrams.txt')) as data:
        for ln in data:
            word_split = ln.split(' ', 1)
            score_split = word_split[1].split('\t', 1)
            yield ((word_split[0], score_split[0]), float(score_split[1].strip()))
|
||||
|
||||
def main():
    """Segment the sample text 'thisisatest' and print the resulting list of words."""
    seg = instant_segment.Segmenter(unigrams(), bigrams())
    result = instant_segment.Search()
    seg.segment('thisisatest', result)

    # Iterating the search object yields the words one at a time
    words = [w for w in result]
    print(words)


# Run the demo only when this file is executed as a script
if __name__ == '__main__':
    main()
|
Loading…
Reference in New Issue