Don't index non-letter characters in translation example
This commit is contained in:
parent
67778600ef
commit
c1c5086454
|
@ -1,12 +1,12 @@
|
||||||
import asyncio
|
import asyncio
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
|
from string import digits, punctuation, whitespace
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
import aiohttp
|
import aiohttp
|
||||||
import instant_distance
|
import instant_distance
|
||||||
from progress.bar import IncrementalBar
|
from progress.bar import IncrementalBar
|
||||||
from progress.spinner import Spinner
|
|
||||||
|
|
||||||
MAX_LINES = 100_000
|
MAX_LINES = 100_000
|
||||||
LANGS = ("en", "fr", "it")
|
LANGS = ("en", "fr", "it")
|
||||||
|
@ -68,9 +68,15 @@ async def download_build_index():
|
||||||
if lang == "en":
|
if lang == "en":
|
||||||
word_map[value] = embedding
|
word_map[value] = embedding
|
||||||
else:
|
else:
|
||||||
# Don't index words that exist in English
|
# Don't index words that exist in English or
|
||||||
# to improve the quality of the results.
|
# that include non-letter characters to improve
|
||||||
if value in word_map:
|
# the quality of the results.
|
||||||
|
if (
|
||||||
|
value in word_map
|
||||||
|
or any(p in value for p in punctuation)
|
||||||
|
or any(p in value for p in whitespace)
|
||||||
|
or any(p in value for p in digits)
|
||||||
|
):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# We track values here to build the instant-distance index
|
# We track values here to build the instant-distance index
|
||||||
|
|
Loading…
Reference in New Issue