2021-03-24 09:59:57 +00:00
|
|
|
import instant_segment, os, sys
|
|
|
|
|
|
|
|
# Directory holding the n-gram data files (en-unigrams.txt, en-bigrams.txt),
# resolved relative to the location of this file.
DATA_DIR = os.path.join(os.path.dirname(__file__), '../../data/')
|
|
|
|
|
|
|
|
def unigrams(data_dir=None):
    """Yield ``(word, score)`` pairs parsed from ``en-unigrams.txt``.

    Each line of the file is split on the first tab character: the text
    before the tab is the word, the text after it is parsed as a float.

    Args:
        data_dir: Directory containing ``en-unigrams.txt``. When omitted,
            the module-level ``DATA_DIR`` is used.

    Yields:
        Tuples of ``(word, float(score))``, one per line, in file order.

    Raises:
        OSError: If the file cannot be opened (raised on first iteration,
            since this is a generator).
        IndexError: If a line contains no tab separator.
        ValueError: If the score field cannot be parsed as a float.
    """
    if data_dir is None:
        data_dir = DATA_DIR
    # The context manager closes the file as soon as the generator is
    # exhausted or closed, instead of leaving it to the garbage collector.
    with open(os.path.join(data_dir, 'en-unigrams.txt')) as f:
        for ln in f:
            parts = ln.split('\t', 1)
            yield (parts[0], float(parts[1].strip()))
|
|
|
|
|
|
|
|
def bigrams(data_dir=None):
    """Yield ``((first, second), score)`` pairs parsed from ``en-bigrams.txt``.

    Each line is split on the first space to separate the first word from
    the remainder, and the remainder is split on the first tab to separate
    the second word from the score, which is parsed as a float.

    Args:
        data_dir: Directory containing ``en-bigrams.txt``. When omitted,
            the module-level ``DATA_DIR`` is used.

    Yields:
        Tuples of ``((first_word, second_word), float(score))``, one per
        line, in file order.

    Raises:
        OSError: If the file cannot be opened (raised on first iteration,
            since this is a generator).
        IndexError: If a line lacks the space or the tab separator.
        ValueError: If the score field cannot be parsed as a float.
    """
    if data_dir is None:
        data_dir = DATA_DIR
    # The context manager closes the file as soon as the generator is
    # exhausted or closed, instead of leaving it to the garbage collector.
    with open(os.path.join(data_dir, 'en-bigrams.txt')) as f:
        for ln in f:
            word_split = ln.split(' ', 1)
            score_split = word_split[1].split('\t', 1)
            yield ((word_split[0], score_split[0]), float(score_split[1].strip()))
|
|
|
|
|
|
|
|
def main():
    """Segment the sample string 'thisisatest' and print the result.

    Builds an ``instant_segment.Segmenter`` from the unigram and bigram
    generators, runs it against a fresh ``instant_segment.Search`` object,
    and prints the items obtained by iterating that search object as a list.
    """
    seg = instant_segment.Segmenter(unigrams(), bigrams())
    state = instant_segment.Search()
    seg.segment('thisisatest', state)
    words = list(state)
    print(words)
|
|
|
|
|
|
|
|
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|