Add Rust code to process ngram data
parent cc95d39063
commit fcf24c7543
Cargo.toml
@@ -23,6 +23,7 @@ serde = { version = "1.0.123", features = ["derive"], optional = true }
 [dev-dependencies]
 bencher = "0.1.5"
 once_cell = "1.4"
+rayon = "1.5.1"
 
 [[bench]]
 name = "bench"
@@ -0,0 +1,260 @@
//! Merge downloaded data to synthesize test data files
//!
//! This is not actually an example, but a tool to help recreate the required
//! data files from publicly available sources. See the README in `/data`.

use std::cmp::Reverse;
use std::fs::File;
use std::io::Write;
use std::io::{BufRead, BufReader, BufWriter};
use std::str::FromStr;

use ahash::{AHashMap as HashMap, AHashSet as HashSet};
use rayon::iter::{IntoParallelIterator, ParallelIterator};
use smartstring::alias::String as SmartString;

fn main() {
    let word_list = read_word_list();
    process_unigrams(&word_list);
    process_bigrams(&word_list);
}

/// Read bigrams from the input file parts, filter them, and write to file
fn process_bigrams(word_list: &HashSet<SmartString>) {
    let bigrams = (0..BIGRAM_PARTS)
        .into_par_iter()
        .map(|part| {
            let fname = format!("data/cache/eng-2-{:05}-{:05}.txt", part, BIGRAM_PARTS);
            let f = File::open(&fname).unwrap();
            let mut reader = BufReader::with_capacity(4 * 1024 * 1024, f);

            let mut ln = String::new();
            let mut bigrams = HashMap::new();
            loop {
                // Example line: `using pozzolan 1925,1,1 1947,2,2 1948,2,2 (...)\n`
                // Tab-separated line. The first column contains two words, separated by a space.
                // Other columns contain a comma-separated triple of (year, match count, volume
                // count).
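                // For the example line above, all three visible triples predate
                // START_YEAR (2000), so the year filter below would skip them and
                // they would contribute nothing to `matches`.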

                ln.clear();
                match reader.read_line(&mut ln) {
                    Ok(0) => break,
                    Err(e) => {
                        eprintln!("error: {:?}", e);
                        break;
                    }
                    _ => {}
                }

                let mut iter = ln.trim().split('\t');
                let words = match iter.next() {
                    Some(word) => word,
                    None => continue,
                };

                let mut word_iter = words.split(' ');
                let word1 = match word_iter.next() {
                    Some(word) => word,
                    _ => continue,
                };

                let word1 = match normalize(word1, word_list) {
                    Some(word) => word,
                    _ => continue,
                };

                let word2 = match word_iter.next() {
                    Some(word) if word_list.contains(word) => word,
                    _ => continue,
                };

                let word2 = match normalize(word2, word_list) {
                    Some(word) => word,
                    _ => continue,
                };

                let mut matches = 0;
                for year_data in iter {
                    let mut parts = year_data.split(',');
                    if parts.next().unwrap() < START_YEAR {
                        continue;
                    }
                    matches += usize::from_str(parts.next().unwrap()).unwrap();
                }

                if bigrams.capacity() == 0 {
                    // While it's not uncommon for a part to result in 0 words, the average for
                    // parts that contain more than 0 is about 300k, median is about 350k. Allocate
                    // a decent chunk immediately to avoid too many intermediate reallocations.
                    bigrams.reserve(256 * 1024)
                }

                *bigrams.entry((word1, word2)).or_default() += matches;
            }

            eprintln!("extracted {} bigrams from part {}", bigrams.len(), part);
            bigrams
        })
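        // Merge the per-part maps into one, summing the counts of bigrams that
        // appear in more than one part.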
        .reduce(
            HashMap::<(SmartString, SmartString), usize>::new,
            |mut left, right| {
                for (k, v) in right.into_iter() {
                    *left.entry(k).or_default() += v;
                }
                left
            },
        );

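    // Write the merged bigrams out as `word1 word2<TAB>count` lines, most
    // frequent first, truncated to MAX_BIGRAMS entries.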
    let f = File::create("data/en-bigrams.txt").unwrap();
    let mut writer = BufWriter::with_capacity(4 * 1024 * 1024, f);
    let mut bigrams = bigrams.into_iter().collect::<Vec<_>>();
    bigrams.sort_by_key(|(_, freq)| Reverse(*freq));
    for (i, ((left, right), freq)) in bigrams.into_iter().enumerate() {
        if i == MAX_BIGRAMS {
            break;
        }

        writeln!(writer, "{} {}\t{}", left, right, freq).unwrap();
    }
}

/// Read unigrams from the input file parts, filter them, and write to file
fn process_unigrams(word_list: &HashSet<SmartString>) {
    let unigrams = (0..UNIGRAM_PARTS)
        .into_par_iter()
        .map(|part| {
            let fname = format!("data/cache/eng-1-{:05}-{:05}.txt", part, UNIGRAM_PARTS);
            let f = File::open(&fname).unwrap();
            let mut reader = BufReader::with_capacity(4 * 1024 * 1024, f);

            let mut ln = String::new();
            let mut unigrams = HashMap::with_capacity(8 * 1024);
            loop {
                // Example line: `ephedrins 1924,1,1 1928,1,1 1931,2,1 (...)\n`
                // Tab-separated line. The first column contains the word. All later columns
                // contain a comma-separated triple of (year, match count, volume count).

                ln.clear();
                match reader.read_line(&mut ln) {
                    Ok(0) => break,
                    Err(e) => {
                        eprintln!("error: {:?}", e);
                        break;
                    }
                    _ => {}
                }

                let mut iter = ln.trim().split('\t');
                let word = match iter.next() {
                    Some(word) => word,
                    _ => continue,
                };

                let word = match normalize(word, word_list) {
                    Some(word) => word,
                    _ => continue,
                };

                let mut matches = 0;
                for year_data in iter {
                    let mut parts = year_data.split(',');
                    if parts.next().unwrap() < START_YEAR {
                        continue;
                    }
                    matches += usize::from_str(parts.next().unwrap()).unwrap();
                }

                *unigrams.entry(word).or_default() += matches;
            }

            eprintln!("extracted {} unigrams from part {}", unigrams.len(), part);
            unigrams
        })
        .reduce(HashMap::<SmartString, usize>::new, |mut left, right| {
            for (k, v) in right.into_iter() {
                *left.entry(k).or_default() += v;
            }
            left
        });

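    // Same write-out as for bigrams: `word<TAB>count` lines, most frequent
    // first, truncated to MAX_UNIGRAMS entries.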
    let mut unigrams = unigrams.into_iter().collect::<Vec<_>>();
    unigrams.sort_by_key(|(_, freq)| Reverse(*freq));
    let f = File::create("data/en-unigrams.txt").unwrap();
    let mut writer = BufWriter::with_capacity(4 * 1024 * 1024, f);
    for (i, (word, freq)) in unigrams.into_iter().enumerate() {
        if i == MAX_UNIGRAMS {
            break;
        }

        writeln!(writer, "{}\t{}", word, freq).unwrap();
    }
}

/// Read the word list and gather it up into a hash set for easy lookups
///
/// We use this to filter crappy words out of the (pretty noisy) ngram data.
/// Considering the way we want to [`normalize()`], we'll filter for
/// only-letter contents but keep any uppercase characters intact.
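/// For example, `Spain` is kept capitalized, while entries such as `don't`
/// or `1920s` would fail the all-letters check and be dropped.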
fn read_word_list() -> HashSet<SmartString> {
    const AVERAGE_WORD_LIST_LINE_LEN: usize = 9;

    let f = File::open("data/cache/eng-wordlist.txt").unwrap();
    let size = f.metadata().unwrap().len() as usize;
    let mut reader = BufReader::with_capacity(4 * 1024 * 1024, f);

    eprintln!("read word list...");
    let mut word_list = HashSet::with_capacity(size / AVERAGE_WORD_LIST_LINE_LEN);
    let mut ln = String::new();
    loop {
        // Example line: `A\n` (`BufRead::read_line()` includes the trailing newline character)

        ln.clear();
        match reader.read_line(&mut ln) {
            Ok(0) => break,
            Err(e) => {
                eprintln!("error: {:?}", e);
                break;
            }
            _ => {}
        }

        let word = ln.trim_end(); // Need to remove the trailing newlines here
        if word.as_bytes().iter().all(|b| b.is_ascii_alphabetic()) {
            word_list.insert(word.into());
        }
    }

    eprintln!("read {} words from word list", word_list.len());
    word_list
}

/// Normalize the input word and filter it
///
/// The order in which we do things here matters quite a bit. First we trim
/// the word to get rid of surrounding whitespace (which can make the word list
/// lookup fail). Then we check if the word consists of only letters -- we
/// disregard any words with digits or punctuation for our purposes. Only then
/// do we lowercase the word.
///
/// This has to happen last so that we get the correct match counts from the
/// ngram data. For example, the word 'Spain' is usually capitalized, and only
/// the capitalized version is in the word list. For our purposes though, we
/// want to operate on lowercased words, so we'll do that after filtering.
fn normalize(word: &str, list: &HashSet<SmartString>) -> Option<SmartString> {
    let word = word.trim();
    if !word.as_bytes().iter().all(|b| b.is_ascii_alphabetic()) || !list.contains(word) {
        return None;
    }

    let mut word = SmartString::from(word);
    word.make_ascii_lowercase();
    Some(word)
}
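
// A hypothetical test sketching the ordering described above (illustrative,
// not part of the original commit):
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn lowercases_only_after_word_list_lookup() {
        let mut list = HashSet::new();
        list.insert(SmartString::from("Spain"));

        // 'Spain' passes the all-letters check and the word-list lookup while
        // still capitalized, and is lowercased only afterwards.
        assert_eq!(normalize("Spain", &list), Some(SmartString::from("spain")));

        // Lowercasing first would have failed the lookup: only the
        // capitalized form is in the list.
        assert_eq!(normalize("spain", &list), None);

        // Words containing non-letter bytes are filtered out entirely.
        assert_eq!(normalize("1925", &list), None);
    }
}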

const MAX_UNIGRAMS: usize = 256 * 1024;
const MAX_BIGRAMS: usize = 256 * 1024;

const UNIGRAM_PARTS: usize = 24;
const BIGRAM_PARTS: usize = 589;

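// Years are compared as strings (e.g. `"1925" < "2000"`), which is sound here
// because the year fields are always four ASCII digits, so lexicographic
// order matches numeric order.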
const START_YEAR: &str = "2000";