Add script to download word list input data

This commit is contained in:
Dirkjan Ochtman 2021-04-30 16:05:11 +02:00
parent 57221b1dd5
commit cc95d39063
2 changed files with 95 additions and 0 deletions

2
.gitignore vendored
View File

@@ -2,3 +2,5 @@
Cargo.lock
*.so
__pycache__
.DS_Store
/data/cache

93
data/grab.py Normal file
View File

@@ -0,0 +1,93 @@
import gzip
import os.path
import shutil
import urllib.error
import urllib.request
# Chunk size (4 MiB) used when streaming a download to disk.
BLOCK_SIZE = 4 * 1024 * 1024
# Shard counts for the Google Books ngram dataset (20200217 English release):
# the unigram data ships in 24 parts, the bigram data in 589.
UNIGRAM_PARTS = 24
BIGRAM_PARTS = 589
# URL template for one gzipped shard of the ngram data; n is the ngram
# order (1 or 2), part/parts are zero-padded shard index and shard count.
NGRAM_URL = 'http://storage.googleapis.com/books/ngrams/books/20200217/eng/{n}-{part:05}-of-{parts:05}.gz'
# SCOWL-based word-list generator (app.aspell.net); the query parameters
# select a size-60 US/GB/CA/AU list in inline utf-8 format.
WORD_LIST_URL = 'http://app.aspell.net/create?max_size=60&spelling=US&spelling=GBs&spelling=GBz&spelling=CA&spelling=AU&max_variant=2&diacritic=strip&special=hacker&download=wordlist&encoding=utf-8&format=inline'
def download(url, fn, block_size=4 * 1024 * 1024):
    '''Download the given url and write it to the given fn.

    The data is fetched in chunks of `block_size` bytes (default 4 MiB,
    matching the module-level BLOCK_SIZE) and written to `fn + '.tmp'`,
    which is renamed to `fn` only once the transfer finishes, so a partial
    download never masquerades as a complete file. Progress is reported
    after every 5% when the server sends a Content-Length header.

    Returns the fraction of the file that was downloaded: 1.0 on success
    (including when the size was unknown but the stream was fully read),
    or 0 if the request failed with an HTTP error.
    '''
    try:
        response = urllib.request.urlopen(url)
    except urllib.error.HTTPError as err:
        print(f'error for {url}: {err}')
        return 0
    # Close the connection even if a read below raises.
    with response:
        length = response.info().get('Content-Length')
        print('downloading: %s (%s bytes)' % (fn, length))
        downloaded, complete, notified = 0, 0.0, 0.0
        with open(fn + '.tmp', 'wb') as f:
            while True:
                buf = response.read(block_size)
                if not buf:
                    break
                downloaded += len(buf)
                f.write(buf)
                if length is None:
                    continue  # no Content-Length: progress unknown
                complete = downloaded / int(length)
                if complete < notified + 0.05:
                    continue  # report at most once per 5% of the file
                notified = complete
                status = '%10d [%3.2f%%]' % (downloaded, complete * 100)
                # '\r' rewrites the progress line in place (the original
                # backspace trick was defeated by print's trailing newline).
                print(status, end='\r', flush=True)
    if length is None:
        # Unknown size, but the stream was consumed to EOF: report success
        # instead of the misleading 0.0 the naive fraction would give.
        complete = 1.0
    os.rename(fn + '.tmp', fn)
    return complete
def cache(n, part, parts):
    '''Download and decompress ngram data files as necessary.

    First downloads, then decompresses the given ngram part file. Does
    nothing if the decompressed file already exists, and skips the download
    (but still decompresses) if the compressed file for this part is
    already present in the cache directory.
    '''
    compressed = f'cache/eng-{n}-{part:05}-{parts:05}.gz'
    plain = compressed[:-3] + '.txt'  # swap the '.gz' suffix for '.txt'
    if os.path.isfile(plain):
        return
    elif not os.path.isfile(compressed):
        url = NGRAM_URL.format(n=n, part=part, parts=parts)
        complete = download(url, compressed)
        print(f'downloaded {compressed} ({complete * 100:3.2f}% complete)')
    if os.path.isfile(compressed):
        # Stream the decompression instead of reading the whole archive
        # into memory; these shards can be very large. Decompress to a
        # .tmp file first so an interrupted run leaves no half-written
        # 'plain' file behind.
        with open(plain + '.tmp', 'wb') as dst, gzip.open(compressed, 'rb') as src:
            shutil.copyfileobj(src, dst)
        os.rename(plain + '.tmp', plain)
        print(f'decompressed {compressed}')
    else:
        print(f'{compressed} not found')
def main():
    '''Populate cache/ with the word list plus all unigram and bigram shards.'''
    if not os.path.exists('cache'):
        os.mkdir('cache')
    wordlist = 'cache/eng-wordlist.txt'
    if not os.path.isfile(wordlist):
        download(WORD_LIST_URL, wordlist)
    # Unigrams first, then bigrams — same order as fetching each range
    # separately.
    for n, parts in ((1, UNIGRAM_PARTS), (2, BIGRAM_PARTS)):
        for part in range(parts):
            cache(n, part, parts)


if __name__ == '__main__':
    main()