Add script to download word list input data
This commit is contained in:
parent 57221b1dd5
commit cc95d39063
.gitignore
@@ -2,3 +2,5 @@
 Cargo.lock
 *.so
 __pycache__
+.DS_Store
+/data/cache
@@ -0,0 +1,93 @@
import urllib.request
import urllib.error
import gzip
import shutil
import os.path

BLOCK_SIZE = 4 * 1024 * 1024

UNIGRAM_PARTS = 24
BIGRAM_PARTS = 589
NGRAM_URL = 'http://storage.googleapis.com/books/ngrams/books/20200217/eng/{n}-{part:05}-of-{parts:05}.gz'

WORD_LIST_URL = 'http://app.aspell.net/create?max_size=60&spelling=US&spelling=GBs&spelling=GBz&spelling=CA&spelling=AU&max_variant=2&diacritic=strip&special=hacker&download=wordlist&encoding=utf-8&format=inline'
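
# For illustration (not part of the original commit), the template above
# expands to URLs such as:
#   NGRAM_URL.format(n=1, part=0, parts=24)
#   -> 'http://storage.googleapis.com/books/ngrams/books/20200217/eng/1-00000-of-00024.gz'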


def download(url, fn):
    '''Download the given url and write it to the given fn.

    Download in blocks of size BLOCK_SIZE and report progress after every 5% of the file.
    '''
    try:
        u = urllib.request.urlopen(url)
    except urllib.error.HTTPError as err:
        print(f'error for {url}: {err}')
        return 0

    # Write to a temporary file and rename on success, so an interrupted
    # download never leaves a truncated file under the final name.
    with open(fn + '.tmp', 'wb') as f:
        length = u.info().get('Content-Length')
        print('downloading: %s (%s bytes)' % (fn, length))

        downloaded, complete, notified = 0, 0.0, 0.0
        while True:
            buf = u.read(BLOCK_SIZE)
            if not buf:
                break

            downloaded += len(buf)
            f.write(buf)

            # Without a Content-Length header, progress cannot be computed.
            if length is None:
                continue

            complete = downloaded / int(length)
            if complete < notified + 0.05:
                continue

            notified = complete
            status = '%10d [%3.2f%%]' % (downloaded, complete * 100)
            print(status, end='\r', flush=True)

    os.rename(fn + '.tmp', fn)
    return complete
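
# Hypothetical usage (URL and path are made up for illustration):
#   download('http://example.com/list.gz', 'cache/list.gz')
# returns the fraction of the file that was downloaded: 1.0 on success,
# 0 on HTTP error or when the server sends no Content-Length header.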


def cache(n, part, parts):
    '''Downloads and decompresses an ngram data file as necessary.

    First downloads, then decompresses the given ngram part file. Does nothing
    if the decompressed file already exists, and only decompresses if the
    compressed file for this part already exists in the proper location.'''
    compressed = f'cache/eng-{n}-{part:05}-{parts:05}.gz'
    plain = compressed[:-3] + '.txt'

    if os.path.isfile(plain):
        return
    elif not os.path.isfile(compressed):
        url = NGRAM_URL.format(n=n, part=part, parts=parts)
        complete = download(url, compressed)
        print(f'downloaded {compressed} ({complete * 100:3.2f}% complete)')

    if os.path.isfile(compressed):
        # Stream the decompressed bytes to disk rather than reading the whole
        # file into memory at once.
        with open(plain + '.tmp', 'wb') as output, gzip.open(compressed, 'rb') as source:
            shutil.copyfileobj(source, output)
        os.rename(plain + '.tmp', plain)
        print(f'decompressed {compressed}')
    else:
        print(f'{compressed} not found')
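
# For example (illustrative call, not in the original commit):
#   cache(1, 0, UNIGRAM_PARTS)
# ensures 'cache/eng-1-00000-00024.txt' exists, downloading and decompressing
# 'cache/eng-1-00000-00024.gz' first if needed.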


def main():
    if not os.path.exists('cache'):
        os.mkdir('cache')

    wl_fn = 'cache/eng-wordlist.txt'
    if not os.path.isfile(wl_fn):
        download(WORD_LIST_URL, wl_fn)
    for part in range(UNIGRAM_PARTS):
        cache(1, part, UNIGRAM_PARTS)
    for part in range(BIGRAM_PARTS):
        cache(2, part, BIGRAM_PARTS)


if __name__ == '__main__':
    main()