Add Rust code to process ngram data
parent cc95d39063
commit fcf24c7543
Cargo.toml
@@ -23,6 +23,7 @@ serde = { version = "1.0.123", features = ["derive"], optional = true }
 [dev-dependencies]
 bencher = "0.1.5"
 once_cell = "1.4"
+rayon = "1.5.1"
 
 [[bench]]
 name = "bench"
@@ -0,0 +1,260 @@
//! Merge downloaded data to synthesize test data files
//!
//! This is not actually an example, but a tool to help recreate the required
//! data files from publicly available sources. See the README in `/data`.

use std::cmp::Reverse;
use std::fs::File;
use std::io::Write;
use std::io::{BufRead, BufReader, BufWriter};
use std::str::FromStr;

use ahash::{AHashMap as HashMap, AHashSet as HashSet};
use rayon::iter::{IntoParallelIterator, ParallelIterator};
use smartstring::alias::String as SmartString;

fn main() {
    let word_list = read_word_list();
    process_unigrams(&word_list);
    process_bigrams(&word_list);
}

/// Read bigrams from the input file parts, filter them, and write to file
fn process_bigrams(word_list: &HashSet<SmartString>) {
    let bigrams = (0..BIGRAM_PARTS)
        .into_par_iter()
        .map(|part| {
            let fname = format!("data/cache/eng-2-{:05}-{:05}.txt", part, BIGRAM_PARTS);
            let f = File::open(&fname).unwrap();
            let mut reader = BufReader::with_capacity(4 * 1024 * 1024, f);

            let mut ln = String::new();
            let mut bigrams = HashMap::new();
            loop {
                // Example line: `using pozzolan 1925,1,1 1947,2,2 1948,2,2 (...)\n`
                // Tab-separated line. The first column contains two words, separated by a space.
                // Other columns contain a comma-separated triple of (year, match count, volume
                // count).
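                // For the example line above, all three visible triples predate
                // START_YEAR (2000), so the year filter below would skip them and
                // they would contribute nothing to `matches`.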

                ln.clear();
                match reader.read_line(&mut ln) {
                    Ok(0) => break,
                    Err(e) => {
                        eprintln!("error: {:?}", e);
                        break;
                    }
                    _ => {}
                }

                let mut iter = ln.trim().split('\t');
                let words = match iter.next() {
                    Some(word) => word,
                    None => continue,
                };

                let mut word_iter = words.split(' ');
                let word1 = match word_iter.next() {
                    Some(word) => word,
                    _ => continue,
                };

                let word1 = match normalize(word1, word_list) {
                    Some(word) => word,
                    _ => continue,
                };

                let word2 = match word_iter.next() {
                    Some(word) if word_list.contains(word) => word,
                    _ => continue,
                };

                let word2 = match normalize(word2, word_list) {
                    Some(word) => word,
                    _ => continue,
                };

                let mut matches = 0;
                for year_data in iter {
                    let mut parts = year_data.split(',');
                    if parts.next().unwrap() < START_YEAR {
                        continue;
                    }
                    matches += usize::from_str(parts.next().unwrap()).unwrap();
                }

                if bigrams.capacity() == 0 {
                    // While it's not uncommon for a part to result in 0 words, the average for
                    // parts that contain more than 0 is about 300k, median is about 350k. Allocate
                    // a decent chunk immediately to avoid too many intermediate reallocations.
                    bigrams.reserve(256 * 1024)
                }

                *bigrams.entry((word1, word2)).or_default() += matches;
            }

            eprintln!("extracted {} bigrams from part {}", bigrams.len(), part);
            bigrams
        })
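        // Merge the per-part maps into one, summing the counts of bigrams that
        // appear in more than one part.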
        .reduce(
            HashMap::<(SmartString, SmartString), usize>::new,
            |mut left, right| {
                for (k, v) in right.into_iter() {
                    *left.entry(k).or_default() += v;
                }
                left
            },
        );

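    // Write the merged bigrams out as `word1 word2<TAB>count` lines, most
    // frequent first, truncated to MAX_BIGRAMS entries.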
    let f = File::create("data/en-bigrams.txt").unwrap();
    let mut writer = BufWriter::with_capacity(4 * 1024 * 1024, f);
    let mut bigrams = bigrams.into_iter().collect::<Vec<_>>();
    bigrams.sort_by_key(|(_, freq)| Reverse(*freq));
    for (i, ((left, right), freq)) in bigrams.into_iter().enumerate() {
        if i == MAX_BIGRAMS {
            break;
        }

        writeln!(writer, "{} {}\t{}", left, right, freq).unwrap();
    }
}

/// Read unigrams from the input file parts, filter them, and write to file
fn process_unigrams(word_list: &HashSet<SmartString>) {
    let unigrams = (0..UNIGRAM_PARTS)
        .into_par_iter()
        .map(|part| {
            let fname = format!("data/cache/eng-1-{:05}-{:05}.txt", part, UNIGRAM_PARTS);
            let f = File::open(&fname).unwrap();
            let mut reader = BufReader::with_capacity(4 * 1024 * 1024, f);

            let mut ln = String::new();
            let mut unigrams = HashMap::with_capacity(8 * 1024);
            loop {
                // Example line: `ephedrins 1924,1,1 1928,1,1 1931,2,1 (...)\n`
                // Tab-separated line. The first column contains the word. All later columns
                // contain a comma-separated triple of (year, match count, volume count).

                ln.clear();
                match reader.read_line(&mut ln) {
                    Ok(0) => break,
                    Err(e) => {
                        eprintln!("error: {:?}", e);
                        break;
                    }
                    _ => {}
                }

                let mut iter = ln.trim().split('\t');
                let word = match iter.next() {
                    Some(word) => word,
                    _ => continue,
                };

                let word = match normalize(word, word_list) {
                    Some(word) => word,
                    _ => continue,
                };

                let mut matches = 0;
                for year_data in iter {
                    let mut parts = year_data.split(',');
                    if parts.next().unwrap() < START_YEAR {
                        continue;
                    }
                    matches += usize::from_str(parts.next().unwrap()).unwrap();
                }

                *unigrams.entry(word).or_default() += matches;
            }

            eprintln!("extracted {} unigrams from part {}", unigrams.len(), part);
            unigrams
        })
        .reduce(HashMap::<SmartString, usize>::new, |mut left, right| {
            for (k, v) in right.into_iter() {
                *left.entry(k).or_default() += v;
            }
            left
        });

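    // Same write-out as for bigrams: `word<TAB>count` lines, most frequent
    // first, truncated to MAX_UNIGRAMS entries.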
    let mut unigrams = unigrams.into_iter().collect::<Vec<_>>();
    unigrams.sort_by_key(|(_, freq)| Reverse(*freq));
    let f = File::create("data/en-unigrams.txt").unwrap();
    let mut writer = BufWriter::with_capacity(4 * 1024 * 1024, f);
    for (i, (word, freq)) in unigrams.into_iter().enumerate() {
        if i == MAX_UNIGRAMS {
            break;
        }

        writeln!(writer, "{}\t{}", word, freq).unwrap();
    }
}

/// Read the word list and gather it up into a hash set for easy lookups
///
/// We use this to filter crappy words out of the (pretty noisy) ngram data.
/// Considering the way we want to [`normalize()`], we'll filter for
/// only-letter contents but keep any uppercase characters intact.
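/// For example, `Spain` is kept capitalized, while entries such as `don't`
/// or `1920s` would fail the all-letters check and be dropped.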
fn read_word_list() -> HashSet<SmartString> {
    const AVERAGE_WORD_LIST_LINE_LEN: usize = 9;

    let f = File::open("data/cache/eng-wordlist.txt").unwrap();
    let size = f.metadata().unwrap().len() as usize;
    let mut reader = BufReader::with_capacity(4 * 1024 * 1024, f);

    eprintln!("read word list...");
    let mut word_list = HashSet::with_capacity(size / AVERAGE_WORD_LIST_LINE_LEN);
    let mut ln = String::new();
    loop {
        // Example line: `A\n` (`BufRead::read_line()` includes the trailing newline character)

        ln.clear();
        match reader.read_line(&mut ln) {
            Ok(0) => break,
            Err(e) => {
                eprintln!("error: {:?}", e);
                break;
            }
            _ => {}
        }

        let word = ln.trim_end(); // Need to remove the trailing newlines here
        if word.as_bytes().iter().all(|b| b.is_ascii_alphabetic()) {
            word_list.insert(word.into());
        }
    }

    eprintln!("read {} words from word list", word_list.len());
    word_list
}

/// Normalize the input word and filter it
///
/// The order in which we do things here matters quite a bit. First we trim
/// the word to get rid of surrounding whitespace (which can make the word list
/// lookup fail). Then we check if the word consists of only letters -- we
/// disregard any words with digits or punctuation for our purposes. Only then
/// do we lowercase the word.
///
/// This has to happen last so that we get the correct match counts from the
/// ngram data. For example, the word 'Spain' is usually capitalized, and only
/// the capitalized version is in the word list. For our purposes though, we
/// want to operate on lowercased words, so we'll do that after filtering.
fn normalize(word: &str, list: &HashSet<SmartString>) -> Option<SmartString> {
    let word = word.trim();
    if !word.as_bytes().iter().all(|b| b.is_ascii_alphabetic()) || !list.contains(word) {
        return None;
    }

    let mut word = SmartString::from(word);
    word.make_ascii_lowercase();
    Some(word)
}
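
// A hypothetical test sketching the ordering described above (illustrative,
// not part of the original commit):
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn lowercases_only_after_word_list_lookup() {
        let mut list = HashSet::new();
        list.insert(SmartString::from("Spain"));

        // 'Spain' passes the all-letters check and the word-list lookup while
        // still capitalized, and is lowercased only afterwards.
        assert_eq!(normalize("Spain", &list), Some(SmartString::from("spain")));

        // Lowercasing first would have failed the lookup: only the
        // capitalized form is in the list.
        assert_eq!(normalize("spain", &list), None);

        // Words containing non-letter bytes are filtered out entirely.
        assert_eq!(normalize("1925", &list), None);
    }
}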

const MAX_UNIGRAMS: usize = 256 * 1024;
const MAX_BIGRAMS: usize = 256 * 1024;

const UNIGRAM_PARTS: usize = 24;
const BIGRAM_PARTS: usize = 589;

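// Years are compared as strings (e.g. `"1925" < "2000"`), which is sound here
// because the year fields are always four ASCII digits, so lexicographic
// order matches numeric order.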
const START_YEAR: &str = "2000";