Optimize test data reader
This commit is contained in:
parent
fd774ad465
commit
8fe1b2ab46
|
@ -5,7 +5,7 @@ use std::io::{BufRead, BufReader};
|
|||
use std::path::PathBuf;
|
||||
use std::str::FromStr;
|
||||
|
||||
use super::Segmenter;
|
||||
use super::{HashMap, Segmenter};
|
||||
|
||||
#[test]
|
||||
fn test_data() {
|
||||
|
@ -14,25 +14,31 @@ fn test_data() {
|
|||
|
||||
pub fn segmenter() -> Segmenter {
|
||||
let dir = PathBuf::from(format!("{}/../data", env!("CARGO_MANIFEST_DIR")));
|
||||
let mut ln = String::new();
|
||||
|
||||
let uni_file = dir.join("unigrams.txt");
|
||||
let reader = BufReader::new(File::open(&uni_file).unwrap());
|
||||
let unigrams = reader.lines().enumerate().map(move |(i, ln)| {
|
||||
let ln = ln.unwrap_or_else(|e| panic!("line error at {:?}:{}: {}", uni_file, i, e));
|
||||
let mut reader = BufReader::new(File::open(&uni_file).unwrap());
|
||||
let mut i = 0;
|
||||
let mut unigrams = HashMap::default();
|
||||
while reader.read_line(&mut ln).unwrap() > 0 {
|
||||
i += 1;
|
||||
let split = ln
|
||||
.find('\t')
|
||||
.unwrap_or_else(|| panic!("no tab found in {:?}:{}", uni_file, i));
|
||||
|
||||
let word = ln[..split].into();
|
||||
let p = usize::from_str(&ln[split + 1..])
|
||||
let p = usize::from_str(&ln[split + 1..].trim())
|
||||
.unwrap_or_else(|e| panic!("error at {:?}:{}: {}", uni_file, i, e));
|
||||
(word, p as f64)
|
||||
});
|
||||
unigrams.insert(word, p as f64);
|
||||
ln.clear();
|
||||
}
|
||||
|
||||
let bi_file = dir.join("bigrams.txt");
|
||||
let reader = BufReader::new(File::open(&bi_file).unwrap());
|
||||
let bigrams = reader.lines().enumerate().map(move |(i, ln)| {
|
||||
let ln = ln.unwrap_or_else(|e| panic!("line error at {:?}:{}: {}", bi_file, i, e));
|
||||
let mut reader = BufReader::new(File::open(&bi_file).unwrap());
|
||||
let mut i = 0;
|
||||
let mut bigrams = HashMap::default();
|
||||
while reader.read_line(&mut ln).unwrap() > 0 {
|
||||
i += 1;
|
||||
let word_split = ln
|
||||
.find(' ')
|
||||
.unwrap_or_else(|| panic!("no space found in {:?}:{}", bi_file, i));
|
||||
|
@ -44,11 +50,12 @@ pub fn segmenter() -> Segmenter {
|
|||
|
||||
let word1 = ln[..word_split].into();
|
||||
let word2 = ln[word_split + 1..score_split].into();
|
||||
let p = usize::from_str(&ln[score_split + 1..])
|
||||
let p = usize::from_str(&ln[score_split + 1..].trim())
|
||||
.unwrap_or_else(|e| panic!("error at {:?}:{}: {}", bi_file, i, e));
|
||||
|
||||
((word1, word2), p as f64)
|
||||
});
|
||||
bigrams.insert((word1, word2), p as f64);
|
||||
ln.clear();
|
||||
}
|
||||
|
||||
Segmenter::from_iters(unigrams, bigrams)
|
||||
Segmenter::from_maps(unigrams, bigrams)
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue