Don't index non-letter characters in translation example

This commit is contained in:
Nicholas Rempel 2021-05-31 15:14:19 -07:00 committed by Dirkjan Ochtman
parent 67778600ef
commit c1c5086454
1 changed file with 11 additions and 5 deletions

View File

@@ -1,12 +1,12 @@
import asyncio import asyncio
import json import json
import os import os
from string import digits, punctuation, whitespace
import sys import sys
import aiohttp import aiohttp
import instant_distance import instant_distance
from progress.bar import IncrementalBar from progress.bar import IncrementalBar
from progress.spinner import Spinner
MAX_LINES = 100_000 MAX_LINES = 100_000
LANGS = ("en", "fr", "it") LANGS = ("en", "fr", "it")
@@ -55,7 +55,7 @@ async def download_build_index():
# save on space and time # save on space and time
if lineno > MAX_LINES: if lineno > MAX_LINES:
break break
linestr = line.decode("utf-8") linestr = line.decode("utf-8")
tokens = linestr.split(" ") tokens = linestr.split(" ")
@@ -68,9 +68,15 @@ async def download_build_index():
if lang == "en": if lang == "en":
word_map[value] = embedding word_map[value] = embedding
else: else:
# Don't index words that exist in english # Don't index words that exist in english or
# to improve the quality of the results. # that include non-letter characters to improve
if value in word_map: # the quality of the results.
if (
value in word_map
or any(p in value for p in punctuation)
or any(p in value for p in whitespace)
or any(p in value for p in digits)
):
continue continue
# We track values here to build the instant-distance index # We track values here to build the instant-distance index