Optimize test data reader

This commit is contained in:
Dirkjan Ochtman 2021-03-24 11:34:09 +01:00
parent fd774ad465
commit 8fe1b2ab46
1 changed file with 21 additions and 14 deletions

View File

@@ -5,7 +5,7 @@ use std::io::{BufRead, BufReader};
use std::path::PathBuf;
use std::str::FromStr;
use super::Segmenter;
use super::{HashMap, Segmenter};
#[test]
fn test_data() {
@@ -14,25 +14,31 @@ fn test_data() {
pub fn segmenter() -> Segmenter {
let dir = PathBuf::from(format!("{}/../data", env!("CARGO_MANIFEST_DIR")));
let mut ln = String::new();
let uni_file = dir.join("unigrams.txt");
let reader = BufReader::new(File::open(&uni_file).unwrap());
let unigrams = reader.lines().enumerate().map(move |(i, ln)| {
let ln = ln.unwrap_or_else(|e| panic!("line error at {:?}:{}: {}", uni_file, i, e));
let mut reader = BufReader::new(File::open(&uni_file).unwrap());
let mut i = 0;
let mut unigrams = HashMap::default();
while reader.read_line(&mut ln).unwrap() > 0 {
i += 1;
let split = ln
.find('\t')
.unwrap_or_else(|| panic!("no tab found in {:?}:{}", uni_file, i));
let word = ln[..split].into();
let p = usize::from_str(&ln[split + 1..])
let p = usize::from_str(&ln[split + 1..].trim())
.unwrap_or_else(|e| panic!("error at {:?}:{}: {}", uni_file, i, e));
(word, p as f64)
});
unigrams.insert(word, p as f64);
ln.clear();
}
let bi_file = dir.join("bigrams.txt");
let reader = BufReader::new(File::open(&bi_file).unwrap());
let bigrams = reader.lines().enumerate().map(move |(i, ln)| {
let ln = ln.unwrap_or_else(|e| panic!("line error at {:?}:{}: {}", bi_file, i, e));
let mut reader = BufReader::new(File::open(&bi_file).unwrap());
let mut i = 0;
let mut bigrams = HashMap::default();
while reader.read_line(&mut ln).unwrap() > 0 {
i += 1;
let word_split = ln
.find(' ')
.unwrap_or_else(|| panic!("no space found in {:?}:{}", bi_file, i));
@@ -44,11 +50,12 @@ pub fn segmenter() -> Segmenter {
let word1 = ln[..word_split].into();
let word2 = ln[word_split + 1..score_split].into();
let p = usize::from_str(&ln[score_split + 1..])
let p = usize::from_str(&ln[score_split + 1..].trim())
.unwrap_or_else(|e| panic!("error at {:?}:{}: {}", bi_file, i, e));
((word1, word2), p as f64)
});
bigrams.insert((word1, word2), p as f64);
ln.clear();
}
Segmenter::from_iters(unigrams, bigrams)
Segmenter::from_maps(unigrams, bigrams)
}