From fd774ad465960148b6fbeb4301cae502d750e6cc Mon Sep 17 00:00:00 2001
From: Dirkjan Ochtman <dirkjan@ochtman.nl>
Date: Wed, 24 Mar 2021 10:59:57 +0100
Subject: [PATCH] py: initial version of Python bindings

---
 .cargo/config.toml              |  11 ++++
 .gitignore                      |   2 +
 Cargo.toml                      |  30 +--------
 Makefile                        |   4 ++
 instant-segment-py/Cargo.toml   |  20 ++++++
 instant-segment-py/src/lib.rs   | 108 ++++++++++++++++++++++++++++++++
 instant-segment-py/test/test.py |  23 +++++++
 7 files changed, 170 insertions(+), 28 deletions(-)
 create mode 100644 .cargo/config.toml
 create mode 100644 Makefile
 create mode 100644 instant-segment-py/Cargo.toml
 create mode 100644 instant-segment-py/src/lib.rs
 create mode 100644 instant-segment-py/test/test.py
diff --git a/.cargo/config.toml b/.cargo/config.toml
new file mode 100644
index 0000000..d47f983
--- /dev/null
+++ b/.cargo/config.toml
@@ -0,0 +1,11 @@
+[target.x86_64-apple-darwin]
+rustflags = [
+  "-C", "link-arg=-undefined",
+  "-C", "link-arg=dynamic_lookup",
+]
+
+[target.aarch64-apple-darwin]
+rustflags = [
+  "-C", "link-arg=-undefined",
+  "-C", "link-arg=dynamic_lookup",
+]
diff --git a/.gitignore b/.gitignore
index 96ef6c0..1914827 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,4 @@
 /target
 Cargo.lock
+*.so
+__pycache__
diff --git a/Cargo.toml b/Cargo.toml
index b6ca6d2..c1481ab 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,28 +1,2 @@
-[package]
-name = "instant-segment"
-version = "0.7.1"
-authors = ["Dirkjan Ochtman <dirkjan@ochtman.nl>"]
-edition = "2018"
-license = "Apache-2.0"
-description = "Fast English word segmentation"
-homepage = "https://github.com/InstantDomainSearch/instant-segment"
-repository = "https://github.com/InstantDomainSearch/instant-segment"
-documentation = "https://docs.rs/instant-segment"
-
-[features]
-__test_data = ["test-cases"]
-test-cases = []
-with-serde = ["serde", "ahash/serde", "smartstring/serde"]
-
-[dependencies]
-ahash = "0.7.0"
-smartstring = "0.2.5"
-serde = { version = "1.0.123", features = ["derive"], optional = true }
-
-[dev-dependencies]
-bencher = "0.1.5"
-once_cell = "1.4"
-
-[[bench]]
-name = "bench"
-harness = false
+[workspace]
+members = ["instant-segment", "instant-segment-py"]
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..32c5a63
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,4 @@
+test-python:
+	cargo build --release
+	cp target/release/libinstant_segment.dylib instant-segment-py/test/instant_segment.so
+	PYTHONPATH=instant-segment-py/test/ python3 -m test
diff --git a/instant-segment-py/Cargo.toml b/instant-segment-py/Cargo.toml
new file mode 100644
index 0000000..6c5289f
--- /dev/null
+++ b/instant-segment-py/Cargo.toml
@@ -0,0 +1,20 @@
+[package]
+name = "instant-segment-py"
+version = "0.1.0"
+authors = ["Dirkjan Ochtman <dirkjan@ochtman.nl>"]
+edition = "2018"
+license = "Apache-2.0"
+workspace = ".."
+
+[lib]
+name = "instant_segment"
+crate-type = ["cdylib"]
+
+[dependencies]
+ahash = "0.7.2"
+instant-segment = { version = "0.7", path = "../instant-segment" }
+pyo3 = { version = "0.13.2", features = ["extension-module"] }
+smartstring = "0.2.6"
+
+[package.metadata.maturin]
+name = "instant-segment"
diff --git a/instant-segment-py/src/lib.rs b/instant-segment-py/src/lib.rs
new file mode 100644
index 0000000..2043fe5
--- /dev/null
+++ b/instant-segment-py/src/lib.rs
@@ -0,0 +1,108 @@
+use pyo3::exceptions::PyValueError;
+use pyo3::proc_macro::{pyclass, pymethods, pymodule, pyproto};
+use pyo3::types::{PyIterator, PyModule};
+use pyo3::{PyErr, PyIterProtocol, PyRef, PyRefMut, PyResult, Python};
+use smartstring::alias::String as SmartString;
+
+/// Python extension module exposing the `Search` and `Segmenter` classes
+#[pymodule]
+fn instant_segment(_py: Python, module: &PyModule) -> PyResult<()> {
+    module.add_class::<Search>()?;
+    module.add_class::<Segmenter>()
+}
+
+/// Word segmenter built from unigram and bigram scores
+#[pyclass]
+struct Segmenter {
+    inner: instant_segment::Segmenter,
+}
+
+#[pymethods]
+impl Segmenter {
+    /// Build a segmenter from two iterators: `unigrams` yields `(word, score)` items and
+    /// `bigrams` yields `((first, second), score)` items; scores are extracted as floats
+    #[new]
+    fn new(unigrams: &PyIterator, bigrams: &PyIterator) -> PyResult<Self> {
+        // Any iteration, indexing or conversion error aborts construction and is raised
+        let unigrams = unigrams
+            .map(|entry| {
+                let entry = entry?;
+                let word = entry.get_item(0)?.extract::<&str>()?;
+                let score = entry.get_item(1)?.extract::<f64>()?;
+                Ok((SmartString::from(word), score))
+            })
+            .collect::<Result<HashMap<_, _>, PyErr>>()?;
+
+        let bigrams = bigrams
+            .map(|entry| {
+                let entry = entry?;
+                let pair = entry.get_item(0)?;
+                let first = SmartString::from(pair.get_item(0)?.extract::<&str>()?);
+                let second = SmartString::from(pair.get_item(1)?.extract::<&str>()?);
+                Ok(((first, second), entry.get_item(1)?.extract::<f64>()?))
+            })
+            .collect::<Result<HashMap<_, _>, PyErr>>()?;
+
+        let inner = instant_segment::Segmenter::from_maps(unigrams, bigrams);
+        Ok(Self { inner })
+    }
+
+    /// Segment `s`, storing the words in `search` so they can be read by iterating over it
+    ///
+    /// Raises `ValueError` when the underlying segmenter rejects the input
+    fn segment(&self, s: &str, search: &mut Search) -> PyResult<()> {
+        self.inner
+            .segment(s, &mut search.inner)
+            .map_err(|_| PyValueError::new_err("only lowercase ASCII letters allowed"))?;
+        // Rewind the cursor so iterating `search` starts at the first word
+        search.cur = Some(0);
+        Ok(())
+    }
+}
+
+/// Search buffer and result set; iterate over it to read the words of a segmentation
+#[pyclass]
+struct Search {
+    // Buffer and results managed by the underlying `instant_segment` crate
+    inner: instant_segment::Search,
+    // Index of the next word to yield during iteration; `None` when there is nothing to yield
+    cur: Option<usize>,
+}
+
+#[pymethods]
+impl Search {
+    /// Create an empty search buffer with nothing to iterate over yet
+    #[new]
+    fn new() -> Self {
+        let inner = instant_segment::Search::default();
+        Self { inner, cur: None }
+    }
+}
+
+#[pyproto]
+impl PyIterProtocol for Search {
+    /// A search is its own iterator
+    fn __iter__(slf: PyRef<Self>) -> PyRef<Self> {
+        slf
+    }
+
+    /// Return the next word of the segmentation result
+    ///
+    /// Iteration stops once every word has been returned or if no cursor is set
+    fn __next__(mut slf: PyRefMut<Self>) -> Option<String> {
+        let idx = slf.cur?;
+        match slf.inner.get(idx).map(String::from) {
+            Some(word) => {
+                slf.cur = Some(idx + 1);
+                Some(word)
+            }
+            None => {
+                // Past the last word: drop the cursor so later calls stop immediately
+                slf.cur = None;
+                None
+            }
+        }
+    }
+}
+
+type HashMap<K, V> = std::collections::HashMap<K, V, ahash::RandomState>; // hashed with ahash
diff --git a/instant-segment-py/test/test.py b/instant-segment-py/test/test.py
new file mode 100644
index 0000000..bb41da9
--- /dev/null
+++ b/instant-segment-py/test/test.py
@@ -0,0 +1,23 @@
+import instant_segment, os, sys
+
+DATA_DIR = os.path.join(os.path.dirname(__file__), '../../data/')
+
+def unigrams():  # yields (word, score) items read from unigrams.txt in DATA_DIR
+    with open(os.path.join(DATA_DIR, 'unigrams.txt')) as lines:  # closed when done
+        for parts in (ln.split('\t', 1) for ln in lines):
+            yield (parts[0], float(parts[1].strip()))
+
+def bigrams():  # yields ((first, second), score) items read from bigrams.txt in DATA_DIR
+    with open(os.path.join(DATA_DIR, 'bigrams.txt')) as lines:  # closed when done
+        for word_split in (ln.split(' ', 1) for ln in lines):
+            score_split = word_split[1].split('\t', 1)
+            yield ((word_split[0], score_split[0]), float(score_split[1].strip()))
+
+def main():
+    """Segment a sample string with the data from DATA_DIR and print the words."""
+    search = instant_segment.Search()
+    instant_segment.Segmenter(unigrams(), bigrams()).segment('thisisatest', search)
+    print(list(search))
+
+if __name__ == '__main__':
+    main()