Optimize test data reader

This commit is contained in:
Dirkjan Ochtman 2021-03-24 11:34:09 +01:00
parent fd774ad465
commit 8fe1b2ab46
1 changed file with 21 additions and 14 deletions

View File

@ -5,7 +5,7 @@ use std::io::{BufRead, BufReader};
use std::path::PathBuf; use std::path::PathBuf;
use std::str::FromStr; use std::str::FromStr;
use super::Segmenter; use super::{HashMap, Segmenter};
#[test] #[test]
fn test_data() { fn test_data() {
@ -14,25 +14,31 @@ fn test_data() {
pub fn segmenter() -> Segmenter { pub fn segmenter() -> Segmenter {
let dir = PathBuf::from(format!("{}/../data", env!("CARGO_MANIFEST_DIR"))); let dir = PathBuf::from(format!("{}/../data", env!("CARGO_MANIFEST_DIR")));
let mut ln = String::new();
let uni_file = dir.join("unigrams.txt"); let uni_file = dir.join("unigrams.txt");
let reader = BufReader::new(File::open(&uni_file).unwrap()); let mut reader = BufReader::new(File::open(&uni_file).unwrap());
let unigrams = reader.lines().enumerate().map(move |(i, ln)| { let mut i = 0;
let ln = ln.unwrap_or_else(|e| panic!("line error at {:?}:{}: {}", uni_file, i, e)); let mut unigrams = HashMap::default();
while reader.read_line(&mut ln).unwrap() > 0 {
i += 1;
let split = ln let split = ln
.find('\t') .find('\t')
.unwrap_or_else(|| panic!("no tab found in {:?}:{}", uni_file, i)); .unwrap_or_else(|| panic!("no tab found in {:?}:{}", uni_file, i));
let word = ln[..split].into(); let word = ln[..split].into();
let p = usize::from_str(&ln[split + 1..]) let p = usize::from_str(&ln[split + 1..].trim())
.unwrap_or_else(|e| panic!("error at {:?}:{}: {}", uni_file, i, e)); .unwrap_or_else(|e| panic!("error at {:?}:{}: {}", uni_file, i, e));
(word, p as f64) unigrams.insert(word, p as f64);
}); ln.clear();
}
let bi_file = dir.join("bigrams.txt"); let bi_file = dir.join("bigrams.txt");
let reader = BufReader::new(File::open(&bi_file).unwrap()); let mut reader = BufReader::new(File::open(&bi_file).unwrap());
let bigrams = reader.lines().enumerate().map(move |(i, ln)| { let mut i = 0;
let ln = ln.unwrap_or_else(|e| panic!("line error at {:?}:{}: {}", bi_file, i, e)); let mut bigrams = HashMap::default();
while reader.read_line(&mut ln).unwrap() > 0 {
i += 1;
let word_split = ln let word_split = ln
.find(' ') .find(' ')
.unwrap_or_else(|| panic!("no space found in {:?}:{}", bi_file, i)); .unwrap_or_else(|| panic!("no space found in {:?}:{}", bi_file, i));
@ -44,11 +50,12 @@ pub fn segmenter() -> Segmenter {
let word1 = ln[..word_split].into(); let word1 = ln[..word_split].into();
let word2 = ln[word_split + 1..score_split].into(); let word2 = ln[word_split + 1..score_split].into();
let p = usize::from_str(&ln[score_split + 1..]) let p = usize::from_str(&ln[score_split + 1..].trim())
.unwrap_or_else(|e| panic!("error at {:?}:{}: {}", bi_file, i, e)); .unwrap_or_else(|e| panic!("error at {:?}:{}: {}", bi_file, i, e));
((word1, word2), p as f64) bigrams.insert((word1, word2), p as f64);
}); ln.clear();
}
Segmenter::from_iters(unigrams, bigrams)
Segmenter::from_maps(unigrams, bigrams)
} }