From fcf24c754300d8f1e8729a020df5a2e766128166 Mon Sep 17 00:00:00 2001 From: Dirkjan Ochtman Date: Tue, 1 Jun 2021 15:14:01 +0200 Subject: [PATCH] Add Rust code to process ngram data --- instant-segment/Cargo.toml | 1 + instant-segment/examples/merge.rs | 260 ++++++++++++++++++++++++++++++ 2 files changed, 261 insertions(+) create mode 100644 instant-segment/examples/merge.rs diff --git a/instant-segment/Cargo.toml b/instant-segment/Cargo.toml index 1b5d778..5287a14 100644 --- a/instant-segment/Cargo.toml +++ b/instant-segment/Cargo.toml @@ -23,6 +23,7 @@ serde = { version = "1.0.123", features = ["derive"], optional = true } [dev-dependencies] bencher = "0.1.5" once_cell = "1.4" +rayon = "1.5.1" [[bench]] name = "bench" diff --git a/instant-segment/examples/merge.rs b/instant-segment/examples/merge.rs new file mode 100644 index 0000000..f494ca4 --- /dev/null +++ b/instant-segment/examples/merge.rs @@ -0,0 +1,260 @@ +//! Merge downloaded data to synthesize test data files +//! +//! This is not actually an example, but a tool to help recreate the required +//! data files from publicly available sources. See the README in `/data`. 
+ +use std::cmp::Reverse; +use std::fs::File; +use std::io::Write; +use std::io::{BufRead, BufReader, BufWriter}; +use std::str::FromStr; + +use ahash::{AHashMap as HashMap, AHashSet as HashSet}; +use rayon::iter::{IntoParallelIterator, ParallelIterator}; +use smartstring::alias::String as SmartString; + +fn main() { + let word_list = read_word_list(); + process_unigrams(&word_list); + process_bigrams(&word_list); +} + +/// Read bigrams from the input file parts, filter them, and write to file +fn process_bigrams(word_list: &HashSet) { + let bigrams = (0..BIGRAM_PARTS) + .into_par_iter() + .map(|part| { + let fname = format!("data/cache/eng-2-{:05}-{:05}.txt", part, BIGRAM_PARTS); + let f = File::open(&fname).unwrap(); + let mut reader = BufReader::with_capacity(4 * 1024 * 1024, f); + + let mut ln = String::new(); + let mut bigrams = HashMap::new(); + loop { + // Example line: `using pozzolan 1925,1,1 1947,2,2 1948,2,2 (...)\n` + // Tab-separated line. The first column contains two words, separated by a space. + // Other columns contain a comma-separated triple of (year, match count, volume + // count). 
+ + ln.clear(); + match reader.read_line(&mut ln) { + Ok(0) => break, + Err(e) => { + eprintln!("error: {:?}", e); + break; + } + _ => {} + } + + let mut iter = ln.trim().split('\t'); + let words = match iter.next() { + Some(word) => word, + None => continue, + }; + + let mut word_iter = words.split(' '); + let word1 = match word_iter.next() { + Some(word) => word, + _ => continue, + }; + + let word1 = match normalize(word1, word_list) { + Some(word) => word, + _ => continue, + }; + + let word2 = match word_iter.next() { + Some(word) if word_list.contains(word) => word, + _ => continue, + }; + + let word2 = match normalize(word2, word_list) { + Some(word) => word, + _ => continue, + }; + + let mut matches = 0; + for year_data in iter { + let mut parts = year_data.split(','); + if parts.next().unwrap() < START_YEAR { + continue; + } + matches += usize::from_str(parts.next().unwrap()).unwrap(); + } + + if bigrams.capacity() == 0 { + // While it's not uncommon for a part to result in 0 words, the average for + // parts that contain more than 0 is about 300k, median is about 350k. Allocate + // a decent chunk immediately to avoid too many intermediate reallocations. 
+ bigrams.reserve(256 * 1024) + } + + *bigrams.entry((word1, word2)).or_default() += matches; + } + + eprintln!("extracted {} bigrams from part {}", bigrams.len(), part); + bigrams + }) + .reduce( + HashMap::<(SmartString, SmartString), usize>::new, + |mut left, right| { + for (k, v) in right.into_iter() { + *left.entry(k).or_default() += v; + } + left + }, + ); + + let f = File::create("data/en-bigrams.txt").unwrap(); + let mut writer = BufWriter::with_capacity(4 * 1024 * 1024, f); + let mut bigrams = bigrams.into_iter().collect::>(); + bigrams.sort_by_key(|(_, freq)| Reverse(*freq)); + for (i, ((left, right), freq)) in bigrams.into_iter().enumerate() { + if i == MAX_BIGRAMS { + break; + } + + writeln!(writer, "{} {}\t{}", left, right, freq).unwrap(); + } +} + +/// Read unigrams from the input file parts, filter them, and write to file +fn process_unigrams(word_list: &HashSet) { + let unigrams = (0..UNIGRAM_PARTS) + .into_par_iter() + .map(|part| { + let fname = format!("data/cache/eng-1-{:05}-{:05}.txt", part, UNIGRAM_PARTS); + let f = File::open(&fname).unwrap(); + let mut reader = BufReader::with_capacity(4 * 1024 * 1024, f); + + let mut ln = String::new(); + let mut unigrams = HashMap::with_capacity(8 * 1024); + loop { + // Example line: `ephedrins 1924,1,1 1928,1,1 1931,2,1 (...)\n` + // Tab-separated line. The first column contains the word. All later columns + // contain a comma-separated triple of (year, match count, volume count). 
+ + ln.clear(); + match reader.read_line(&mut ln) { + Ok(0) => break, + Err(e) => { + eprintln!("error: {:?}", e); + break; + } + _ => {} + } + + let mut iter = ln.trim().split('\t'); + let word = match iter.next() { + Some(word) => word, + _ => continue, + }; + + let word = match normalize(word, word_list) { + Some(word) => word, + _ => continue, + }; + + let mut matches = 0; + for year_data in iter { + let mut parts = year_data.split(','); + if parts.next().unwrap() < START_YEAR { + continue; + } + matches += usize::from_str(parts.next().unwrap()).unwrap(); + } + + *unigrams.entry(word).or_default() += matches; + } + + eprintln!("extracted {} unigrams from part {}", unigrams.len(), part); + unigrams + }) + .reduce(HashMap::::new, |mut left, right| { + for (k, v) in right.into_iter() { + *left.entry(k).or_default() += v; + } + left + }); + + let mut unigrams = unigrams.into_iter().collect::>(); + unigrams.sort_by_key(|(_, freq)| Reverse(*freq)); + let f = File::create("data/en-unigrams.txt").unwrap(); + let mut writer = BufWriter::with_capacity(4 * 1024 * 1024, f); + for (i, (word, freq)) in unigrams.into_iter().enumerate() { + if i == MAX_UNIGRAMS { + break; + } + + writeln!(writer, "{}\t{}", word, freq).unwrap(); + } +} + +/// Read the word list and gather it up into a hash set for easy lookups +/// +/// We use this to filter crappy words out of the (pretty noisy) ngram data. +/// Considering the way we want to [`normalize()`], we'll filter for +/// only-letter contents but keep any uppercase characters intact. 
+fn read_word_list() -> HashSet { + const AVERAGE_WORD_LIST_LINE_LEN: usize = 9; + + let f = File::open("data/cache/eng-wordlist.txt").unwrap(); + let size = f.metadata().unwrap().len() as usize; + let mut reader = BufReader::with_capacity(4 * 1024 * 1024, f); + + eprintln!("read word list..."); + let mut word_list = HashSet::with_capacity(size / AVERAGE_WORD_LIST_LINE_LEN); + let mut ln = String::new(); + loop { + // Example line: `A\n` (`BufRead::read_line()` includes the trailing newline character) + + ln.clear(); + match reader.read_line(&mut ln) { + Ok(0) => break, + Err(e) => { + eprintln!("error: {:?}", e); + break; + } + _ => {} + } + + let word = ln.trim_end(); // Need to remove the trailing newlines here + if word.as_bytes().iter().all(|b| b.is_ascii_alphabetic()) { + word_list.insert(word.into()); + } + } + + eprintln!("read {} words from word list", word_list.len()); + word_list +} + +/// Normalize the input word and filter it +/// +/// The order in which we do things here matters quite a bit. First we trim +/// the word to get rid of surrounding whitespace (which can make the word list +/// lookup fail). Then we check if the word consists of only letters -- we +/// disregard any words with digits or punctuation for our purposes. Only then +/// we lowercase the word. +/// +/// This has to happen last so that we get the correct match counts from the +/// ngram data. For example, the word 'Spain' is usually capitalized, and only +/// the capitalized version is in the word list. For our purposes though, we +/// want to operate on lowercased words, so we'll do that after filtering. 
fn normalize(word: &str, list: &HashSet<SmartString>) -> Option<SmartString> {
    let word = word.trim();

    // An empty word has to be rejected explicitly: `all()` is vacuously true for it, so it
    // would otherwise count as "only letters".
    let only_letters = !word.is_empty() && word.bytes().all(|b| b.is_ascii_alphabetic());

    // The lookup uses the original casing, lowercasing only happens afterwards.
    if !only_letters || !list.contains(word) {
        return None;
    }

    let mut word = SmartString::from(word);
    word.make_ascii_lowercase();
    Some(word)
}

/// Maximum number of entries written to the unigram output file
const MAX_UNIGRAMS: usize = 256 * 1024;
/// Maximum number of entries written to the bigram output file
const MAX_BIGRAMS: usize = 256 * 1024;

/// Number of unigram input file parts; also part of each input file name
const UNIGRAM_PARTS: usize = 24;
/// Number of bigram input file parts; also part of each input file name
const BIGRAM_PARTS: usize = 589;

/// Year columns that compare lower than this (as strings) are not counted
const START_YEAR: &str = "2000";