Simplify Segmenter setup API
This commit is contained in:
parent
4b7760ee41
commit
eeb9c77bc7
15
src/lib.rs
15
src/lib.rs
|
@ -1,4 +1,3 @@
|
||||||
use std::error::Error;
|
|
||||||
use std::io;
|
use std::io;
|
||||||
use std::num::ParseIntError;
|
use std::num::ParseIntError;
|
||||||
use std::ops::{Index, Range};
|
use std::ops::{Index, Range};
|
||||||
|
@ -23,17 +22,17 @@ impl Segmenter {
|
||||||
///
|
///
|
||||||
/// Note: the `String` types used in this API are defined in the `smartstring` crate. Any
|
/// Note: the `String` types used in this API are defined in the `smartstring` crate. Any
|
||||||
/// `&str` or `String` can be converted into the `String` used here by calling `into()` on it.
|
/// `&str` or `String` can be converted into the `String` used here by calling `into()` on it.
|
||||||
pub fn from_iters<U, B>(unigrams: U, bigrams: B) -> Result<Self, Box<dyn Error>>
|
pub fn from_iters<U, B>(unigrams: U, bigrams: B) -> Self
|
||||||
where
|
where
|
||||||
U: Iterator<Item = Result<(String, f64), Box<dyn Error>>>,
|
U: Iterator<Item = (String, f64)>,
|
||||||
B: Iterator<Item = Result<((String, String), f64), Box<dyn Error>>>,
|
B: Iterator<Item = ((String, String), f64)>,
|
||||||
{
|
{
|
||||||
Ok(Self {
|
Self {
|
||||||
unigrams: unigrams.collect::<Result<HashMap<_, _>, _>>()?,
|
unigrams: unigrams.collect::<HashMap<_, _>>(),
|
||||||
bigrams: bigrams.collect::<Result<HashMap<_, _>, _>>()?,
|
bigrams: bigrams.collect::<HashMap<_, _>>(),
|
||||||
limit: DEFAULT_LIMIT,
|
limit: DEFAULT_LIMIT,
|
||||||
total: DEFAULT_TOTAL,
|
total: DEFAULT_TOTAL,
|
||||||
})
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Appends list of words that is the best segmentation of `text` to `out`
|
/// Appends list of words that is the best segmentation of `text` to `out`
|
||||||
|
|
|
@ -13,37 +13,36 @@ pub fn segmenter() -> Segmenter {
|
||||||
let uni_file = dir.join("unigrams.txt");
|
let uni_file = dir.join("unigrams.txt");
|
||||||
let reader = BufReader::new(File::open(&uni_file).unwrap());
|
let reader = BufReader::new(File::open(&uni_file).unwrap());
|
||||||
let unigrams = reader.lines().enumerate().map(move |(i, ln)| {
|
let unigrams = reader.lines().enumerate().map(move |(i, ln)| {
|
||||||
let ln = ln?;
|
let ln = ln.expect(&format!("line error at {:?}:{}", uni_file, i));
|
||||||
let split = ln
|
let split = ln
|
||||||
.find('\t')
|
.find('\t')
|
||||||
.ok_or_else(|| format!("no tab found in {:?}:{}", uni_file, i))?;
|
.expect(&format!("no tab found in {:?}:{}", uni_file, i));
|
||||||
|
|
||||||
let word = ln[..split].into();
|
let word = ln[..split].into();
|
||||||
let p = usize::from_str(&ln[split + 1..])
|
let p = usize::from_str(&ln[split + 1..]).expect(&format!("error at {:?}:{}", uni_file, i));
|
||||||
.map_err(|e| format!("error at {:?}:{}: {}", uni_file, i, e))?;
|
(word, p as f64)
|
||||||
Ok((word, p as f64))
|
|
||||||
});
|
});
|
||||||
|
|
||||||
let bi_file = dir.join("bigrams.txt");
|
let bi_file = dir.join("bigrams.txt");
|
||||||
let reader = BufReader::new(File::open(&bi_file).unwrap());
|
let reader = BufReader::new(File::open(&bi_file).unwrap());
|
||||||
let bigrams = reader.lines().enumerate().map(move |(i, ln)| {
|
let bigrams = reader.lines().enumerate().map(move |(i, ln)| {
|
||||||
let ln = ln?;
|
let ln = ln.expect(&format!("line error at {:?}:{}", bi_file, i));
|
||||||
let word_split = ln
|
let word_split = ln
|
||||||
.find(' ')
|
.find(' ')
|
||||||
.ok_or_else(|| format!("no space found in {:?}:{}", bi_file, i))?;
|
.expect(&format!("no space found in {:?}:{}", bi_file, i));
|
||||||
let score_split = ln[word_split + 1..]
|
let score_split = ln[word_split + 1..]
|
||||||
.find('\t')
|
.find('\t')
|
||||||
.ok_or_else(|| format!("no tab found in {:?}:{}", bi_file, i))?
|
.expect(&format!("no tab found in {:?}:{}", bi_file, i))
|
||||||
+ word_split
|
+ word_split
|
||||||
+ 1;
|
+ 1;
|
||||||
|
|
||||||
let word1 = ln[..word_split].into();
|
let word1 = ln[..word_split].into();
|
||||||
let word2 = ln[word_split + 1..score_split].into();
|
let word2 = ln[word_split + 1..score_split].into();
|
||||||
let p = usize::from_str(&ln[score_split + 1..])
|
let p = usize::from_str(&ln[score_split + 1..])
|
||||||
.map_err(|e| format!("error at {:?}:{}: {}", bi_file, i, e))?;
|
.expect(&format!("error at {:?}:{}", bi_file, i));
|
||||||
|
|
||||||
Ok(((word1, word2), p as f64))
|
((word1, word2), p as f64)
|
||||||
});
|
});
|
||||||
|
|
||||||
Segmenter::from_iters(unigrams, bigrams).unwrap()
|
Segmenter::from_iters(unigrams, bigrams)
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue