imaginaryfriend/src/tokenizer.py

73 lines
2.3 KiB
Python
Raw Normal View History

2016-12-09 19:37:55 +01:00
import re
from src.utils import random_element
from src.config import config
class Tokenizer:
    """Splits Telegram messages into cleaned words and sliding word windows
    ("trigrams" when chain_len == 2) for the Markov-chain generator."""

    def __init__(self):
        # Number of context words per chain link; each yielded window is
        # chain_len + 1 words long.
        self.chain_len = config.getint('grammar', 'chain_len')
        # Sentinel token marking sentence boundaries inside chains.
        self.stop_word = config['grammar']['stop_word']
        # Characters that terminate a sentence (e.g. '.', '!', '?').
        self.endsen = config['grammar']['endsen']
        # Characters stripped from the edges of each word.
        self.garbage = config['grammar']['garbage']
        # Telegram entity types whose covered text is dropped entirely.
        # https://core.telegram.org/bots/api#messageentity
        self.garbage_entities = config.getlist('grammar', 'garbage_entities')

    def split_to_trigrams(self, words_list):
        """Yield sliding windows of chain_len + 1 words over words_list,
        with stop_word inserted at sentence boundaries.

        Yields nothing when the input is too short to form a full window.
        """
        if len(words_list) <= self.chain_len:
            # BUG FIX: the original did `yield from ()` here and then FELL
            # THROUGH, so too-short inputs still produced windows below.
            return

        words = [self.stop_word]
        for word in words_list:
            words.append(word)
            # A sentence-ending last character closes the sentence.
            if word[-1] in self.endsen:
                words.append(self.stop_word)
        # Ensure the sequence is terminated by a stop word.
        if words[-1] != self.stop_word:
            words.append(self.stop_word)

        for i in range(len(words) - self.chain_len):
            yield words[i:i + self.chain_len + 1]

    def extract_words(self, message):
        """Return the prettified, non-empty words of a message after
        removing text covered by garbage entities."""
        # Normalize all whitespace (tabs, newlines, ...) to single spaces.
        # r'\s' fixes the invalid non-raw '\s' escape of the original.
        text = re.sub(r'\s', ' ', self.remove_garbage_entities(message))
        # filter(None, ...) drops words prettify() rejected (None or '').
        return list(filter(None, map(self.prettify, text.split(' '))))

    def random_end_sentence_token(self):
        """Return a random sentence-ending character from the configured set."""
        return random_element(list(self.endsen))

    def remove_garbage_entities(self, message):
        """Return message.text with the spans of unwanted Telegram entities
        (links, mentions, ...) removed.

        Telegram entity offsets/lengths are measured in UTF-16 code units,
        so the text is sliced as UTF-16-LE bytes (2 bytes per code unit).
        Assumes message.entities is sorted and non-overlapping, as Telegram
        delivers them.
        """
        encoding = 'utf-16-le'
        utf16bytes = message.text.encode(encoding)
        result = bytearray()
        cur_pos = 0
        for e in message.entities:
            start_pos = e.offset * 2
            end_pos = (e.offset + e.length) * 2
            # Keep the text between the previous entity and this one.
            result += utf16bytes[cur_pos:start_pos]
            # Keep the entity's own text only if its type is not garbage.
            if e.type not in self.garbage_entities:
                result += utf16bytes[start_pos:end_pos]
            cur_pos = end_pos
        result += utf16bytes[cur_pos:]
        return result.decode(encoding)

    def prettify(self, word):
        """Lowercase a word, strip garbage characters from its edges and
        re-attach a trailing sentence terminator if there was one.

        Returns None for words that reduce to pure garbage (callers filter
        these out).
        """
        lowercase_word = word.lower().strip()
        # Remember a sentence-ending final character so it survives the
        # garbage strip below.
        last_symbol = lowercase_word[-1:]
        if last_symbol not in self.endsen:
            last_symbol = ''
        pretty_word = lowercase_word.strip(self.garbage)
        # len > 2 already implies non-empty; the original's extra
        # `pretty_word != ''` check was redundant.
        if len(pretty_word) > 2:
            return pretty_word + last_symbol
        elif lowercase_word in self.garbage:
            return None
        return lowercase_word