Optimize test data reader
This commit is contained in:
parent
fd774ad465
commit
8fe1b2ab46
|
@ -5,7 +5,7 @@ use std::io::{BufRead, BufReader};
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
use std::str::FromStr;
|
use std::str::FromStr;
|
||||||
|
|
||||||
use super::Segmenter;
|
use super::{HashMap, Segmenter};
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_data() {
|
fn test_data() {
|
||||||
|
@ -14,25 +14,31 @@ fn test_data() {
|
||||||
|
|
||||||
pub fn segmenter() -> Segmenter {
|
pub fn segmenter() -> Segmenter {
|
||||||
let dir = PathBuf::from(format!("{}/../data", env!("CARGO_MANIFEST_DIR")));
|
let dir = PathBuf::from(format!("{}/../data", env!("CARGO_MANIFEST_DIR")));
|
||||||
|
let mut ln = String::new();
|
||||||
|
|
||||||
let uni_file = dir.join("unigrams.txt");
|
let uni_file = dir.join("unigrams.txt");
|
||||||
let reader = BufReader::new(File::open(&uni_file).unwrap());
|
let mut reader = BufReader::new(File::open(&uni_file).unwrap());
|
||||||
let unigrams = reader.lines().enumerate().map(move |(i, ln)| {
|
let mut i = 0;
|
||||||
let ln = ln.unwrap_or_else(|e| panic!("line error at {:?}:{}: {}", uni_file, i, e));
|
let mut unigrams = HashMap::default();
|
||||||
|
while reader.read_line(&mut ln).unwrap() > 0 {
|
||||||
|
i += 1;
|
||||||
let split = ln
|
let split = ln
|
||||||
.find('\t')
|
.find('\t')
|
||||||
.unwrap_or_else(|| panic!("no tab found in {:?}:{}", uni_file, i));
|
.unwrap_or_else(|| panic!("no tab found in {:?}:{}", uni_file, i));
|
||||||
|
|
||||||
let word = ln[..split].into();
|
let word = ln[..split].into();
|
||||||
let p = usize::from_str(&ln[split + 1..])
|
let p = usize::from_str(&ln[split + 1..].trim())
|
||||||
.unwrap_or_else(|e| panic!("error at {:?}:{}: {}", uni_file, i, e));
|
.unwrap_or_else(|e| panic!("error at {:?}:{}: {}", uni_file, i, e));
|
||||||
(word, p as f64)
|
unigrams.insert(word, p as f64);
|
||||||
});
|
ln.clear();
|
||||||
|
}
|
||||||
|
|
||||||
let bi_file = dir.join("bigrams.txt");
|
let bi_file = dir.join("bigrams.txt");
|
||||||
let reader = BufReader::new(File::open(&bi_file).unwrap());
|
let mut reader = BufReader::new(File::open(&bi_file).unwrap());
|
||||||
let bigrams = reader.lines().enumerate().map(move |(i, ln)| {
|
let mut i = 0;
|
||||||
let ln = ln.unwrap_or_else(|e| panic!("line error at {:?}:{}: {}", bi_file, i, e));
|
let mut bigrams = HashMap::default();
|
||||||
|
while reader.read_line(&mut ln).unwrap() > 0 {
|
||||||
|
i += 1;
|
||||||
let word_split = ln
|
let word_split = ln
|
||||||
.find(' ')
|
.find(' ')
|
||||||
.unwrap_or_else(|| panic!("no space found in {:?}:{}", bi_file, i));
|
.unwrap_or_else(|| panic!("no space found in {:?}:{}", bi_file, i));
|
||||||
|
@ -44,11 +50,12 @@ pub fn segmenter() -> Segmenter {
|
||||||
|
|
||||||
let word1 = ln[..word_split].into();
|
let word1 = ln[..word_split].into();
|
||||||
let word2 = ln[word_split + 1..score_split].into();
|
let word2 = ln[word_split + 1..score_split].into();
|
||||||
let p = usize::from_str(&ln[score_split + 1..])
|
let p = usize::from_str(&ln[score_split + 1..].trim())
|
||||||
.unwrap_or_else(|e| panic!("error at {:?}:{}: {}", bi_file, i, e));
|
.unwrap_or_else(|e| panic!("error at {:?}:{}: {}", bi_file, i, e));
|
||||||
|
|
||||||
((word1, word2), p as f64)
|
bigrams.insert((word1, word2), p as f64);
|
||||||
});
|
ln.clear();
|
||||||
|
}
|
||||||
Segmenter::from_iters(unigrams, bigrams)
|
|
||||||
|
Segmenter::from_maps(unigrams, bigrams)
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue