diff --git a/instant-segment/src/test_data.rs b/instant-segment/src/test_data.rs index d15b1ad..04fe22b 100644 --- a/instant-segment/src/test_data.rs +++ b/instant-segment/src/test_data.rs @@ -5,7 +5,7 @@ use std::io::{BufRead, BufReader}; use std::path::PathBuf; use std::str::FromStr; -use super::Segmenter; +use super::{HashMap, Segmenter}; #[test] fn test_data() { @@ -14,25 +14,31 @@ fn test_data() { pub fn segmenter() -> Segmenter { let dir = PathBuf::from(format!("{}/../data", env!("CARGO_MANIFEST_DIR"))); + let mut ln = String::new(); let uni_file = dir.join("unigrams.txt"); - let reader = BufReader::new(File::open(&uni_file).unwrap()); - let unigrams = reader.lines().enumerate().map(move |(i, ln)| { - let ln = ln.unwrap_or_else(|e| panic!("line error at {:?}:{}: {}", uni_file, i, e)); + let mut reader = BufReader::new(File::open(&uni_file).unwrap()); + let mut i = 0; + let mut unigrams = HashMap::default(); + while reader.read_line(&mut ln).unwrap() > 0 { + i += 1; let split = ln .find('\t') .unwrap_or_else(|| panic!("no tab found in {:?}:{}", uni_file, i)); let word = ln[..split].into(); - let p = usize::from_str(&ln[split + 1..]) + let p = usize::from_str(&ln[split + 1..].trim()) .unwrap_or_else(|e| panic!("error at {:?}:{}: {}", uni_file, i, e)); - (word, p as f64) - }); + unigrams.insert(word, p as f64); + ln.clear(); + } let bi_file = dir.join("bigrams.txt"); - let reader = BufReader::new(File::open(&bi_file).unwrap()); - let bigrams = reader.lines().enumerate().map(move |(i, ln)| { - let ln = ln.unwrap_or_else(|e| panic!("line error at {:?}:{}: {}", bi_file, i, e)); + let mut reader = BufReader::new(File::open(&bi_file).unwrap()); + let mut i = 0; + let mut bigrams = HashMap::default(); + while reader.read_line(&mut ln).unwrap() > 0 { + i += 1; let word_split = ln .find(' ') .unwrap_or_else(|| panic!("no space found in {:?}:{}", bi_file, i)); @@ -44,11 +50,12 @@ pub fn segmenter() -> Segmenter { let word1 = ln[..word_split].into(); let word2 = ln[word_split + 1..score_split].into(); - let p = usize::from_str(&ln[score_split + 1..]) + let p = usize::from_str(&ln[score_split + 1..].trim()) .unwrap_or_else(|e| panic!("error at {:?}:{}: {}", bi_file, i, e)); - ((word1, word2), p as f64) - }); + bigrams.insert((word1, word2), p as f64); + ln.clear(); + } - Segmenter::from_iters(unigrams, bigrams) + Segmenter::from_maps(unigrams, bigrams) }