Simplify Segmenter setup API

This commit is contained in:
Dirkjan Ochtman 2020-12-07 11:39:49 +01:00
parent 4b7760ee41
commit eeb9c77bc7
2 changed files with 17 additions and 19 deletions

View File

@@ -1,4 +1,3 @@
use std::error::Error;
use std::io; use std::io;
use std::num::ParseIntError; use std::num::ParseIntError;
use std::ops::{Index, Range}; use std::ops::{Index, Range};
@@ -23,17 +22,17 @@ impl Segmenter {
/// ///
/// Note: the `String` types used in this API are defined in the `smartstring` crate. Any /// Note: the `String` types used in this API are defined in the `smartstring` crate. Any
/// `&str` or `String` can be converted into the `String` used here by calling `into()` on it. /// `&str` or `String` can be converted into the `String` used here by calling `into()` on it.
// NOTE(review): this appears to be a two-column diff view in which every line shows the
// pre-change text followed by the post-change text — confirm against the actual commit.
//
// Pre-change side: `from_iters` takes iterators whose items are `Result<_, Box<dyn Error>>`
// and returns `Result<Self, Box<dyn Error>>`.
// Post-change side: it takes iterators of plain `(String, f64)` / `((String, String), f64)`
// items and returns `Self` directly.
pub fn from_iters<U, B>(unigrams: U, bigrams: B) -> Result<Self, Box<dyn Error>> pub fn from_iters<U, B>(unigrams: U, bigrams: B) -> Self
where where
U: Iterator<Item = Result<(String, f64), Box<dyn Error>>>, U: Iterator<Item = (String, f64)>,
B: Iterator<Item = Result<((String, String), f64), Box<dyn Error>>>, B: Iterator<Item = ((String, String), f64)>,
{ {
Ok(Self { Self {
// Both sides drain the iterators into `HashMap`s. The pre-change side collects through
// `Result<HashMap<_, _>, _>` and uses `?` to return the first `Err`; the post-change side
// collects straight into the map.
unigrams: unigrams.collect::<Result<HashMap<_, _>, _>>()?, unigrams: unigrams.collect::<HashMap<_, _>>(),
bigrams: bigrams.collect::<Result<HashMap<_, _>, _>>()?, bigrams: bigrams.collect::<HashMap<_, _>>(),
// `limit` and `total` are initialised from the `DEFAULT_*` constants on both sides.
limit: DEFAULT_LIMIT, limit: DEFAULT_LIMIT,
total: DEFAULT_TOTAL, total: DEFAULT_TOTAL,
}) }
} }
/// Appends list of words that is the best segmentation of `text` to `out` /// Appends list of words that is the best segmentation of `text` to `out`

View File

@@ -13,37 +13,36 @@ pub fn segmenter() -> Segmenter {
// NOTE(review): this appears to be a two-column diff view in which every line shows the
// pre-change text followed by the post-change text — confirm against the actual commit.
// Pre-change side: parse failures are returned as `Err` items from the closures.
// Post-change side: the same failures panic inside the closures via `expect`.
let uni_file = dir.join("unigrams.txt"); let uni_file = dir.join("unigrams.txt");
// `unwrap` here panics if the file cannot be opened (same on both sides).
let reader = BufReader::new(File::open(&uni_file).unwrap()); let reader = BufReader::new(File::open(&uni_file).unwrap());
// Unigram lines are split at the first tab: the text before it is the word, the text after it
// is parsed as a `usize` and then cast to `f64`.
let unigrams = reader.lines().enumerate().map(move |(i, ln)| { let unigrams = reader.lines().enumerate().map(move |(i, ln)| {
// NOTE(review): on the post-change side `expect(&format!(..))` builds the message string for
// every line, including lines that parse fine; `unwrap_or_else(|| panic!(..))` would only
// format on failure (clippy `expect_fun_call`).
// NOTE(review): `i` comes from `enumerate()` and is zero-based, so the position reported in
// these messages is one less than a 1-based line number.
let ln = ln?; let ln = ln.expect(&format!("line error at {:?}:{}", uni_file, i));
let split = ln let split = ln
.find('\t') .find('\t')
.ok_or_else(|| format!("no tab found in {:?}:{}", uni_file, i))?; .expect(&format!("no tab found in {:?}:{}", uni_file, i));
let word = ln[..split].into(); let word = ln[..split].into();
// The pre-change side interpolates the parse error `e` into its message; the post-change
// format string no longer mentions it.
let p = usize::from_str(&ln[split + 1..]) let p = usize::from_str(&ln[split + 1..]).expect(&format!("error at {:?}:{}", uni_file, i));
.map_err(|e| format!("error at {:?}:{}: {}", uni_file, i, e))?; (word, p as f64)
Ok((word, p as f64))
}); });
let bi_file = dir.join("bigrams.txt"); let bi_file = dir.join("bigrams.txt");
let reader = BufReader::new(File::open(&bi_file).unwrap()); let reader = BufReader::new(File::open(&bi_file).unwrap());
// Bigram lines are split twice: the first space separates the two words, and the first tab
// after that space separates the second word from the score.
let bigrams = reader.lines().enumerate().map(move |(i, ln)| { let bigrams = reader.lines().enumerate().map(move |(i, ln)| {
let ln = ln?; let ln = ln.expect(&format!("line error at {:?}:{}", bi_file, i));
let word_split = ln let word_split = ln
.find(' ') .find(' ')
.ok_or_else(|| format!("no space found in {:?}:{}", bi_file, i))?; .expect(&format!("no space found in {:?}:{}", bi_file, i));
// `find` runs on the sub-slice starting after the space, so its result is relative to that
// sub-slice; adding `word_split + 1` turns it back into an index into the whole line.
let score_split = ln[word_split + 1..] let score_split = ln[word_split + 1..]
.find('\t') .find('\t')
.ok_or_else(|| format!("no tab found in {:?}:{}", bi_file, i))? .expect(&format!("no tab found in {:?}:{}", bi_file, i))
+ word_split + word_split
+ 1; + 1;
let word1 = ln[..word_split].into(); let word1 = ln[..word_split].into();
let word2 = ln[word_split + 1..score_split].into(); let word2 = ln[word_split + 1..score_split].into();
let p = usize::from_str(&ln[score_split + 1..]) let p = usize::from_str(&ln[score_split + 1..])
.map_err(|e| format!("error at {:?}:{}: {}", bi_file, i, e))?; .expect(&format!("error at {:?}:{}", bi_file, i));
Ok(((word1, word2), p as f64)) ((word1, word2), p as f64)
}); });
// Pre-change side unwraps the `Result` from `from_iters`; post-change side returns its value
// directly.
Segmenter::from_iters(unigrams, bigrams).unwrap() Segmenter::from_iters(unigrams, bigrams)
} }