issue #48: comments & small fixes

This commit is contained in:
Katya 2017-08-18 03:48:09 +03:00
parent 737375bf28
commit c56124bb9b
3 changed files with 52 additions and 31 deletions

View File

@ -10,13 +10,13 @@ spam_stickers=BQADAgADSAIAAkcGQwU-G-9SZUDTWAI
level=INFO level=INFO
[grammar] [grammar]
chain_length=2 chain_len=2
separator=\x02 separator=\x02
stop_word=\x00 stop_word=\x00
max_words=30 max_wrds=30
max_messages=5 max_msgs=5
end_sentence=.....!!? endsent=.....!!?
all=.!?;()\-—"[]{}«»/*&^#$ garbage=«<{([.!?;\-—"/*&^#$|#$%^&*(№;%:?*])}>»
[media_checker] [media_checker]
lifetime=28800.0 lifetime=28800.0

View File

@ -11,43 +11,53 @@ class ReplyGenerator:
self.tokenizer = tokenizer self.tokenizer = tokenizer
self.trigram_repository = trigram_repository self.trigram_repository = trigram_repository
self.max_words = config.getint('grammar', 'max_words') self.max_wrds = config.getint('grammar', 'max_wrds')
self.max_messages = config.getint('grammar', 'max_messages') self.max_msgs = config.getint('grammar', 'max_msgs')
self.stop_word = config['grammar']['stop_word'] self.stop_word = config['grammar']['stop_word']
self.separator = config['grammar']['separator'] self.separator = config['grammar']['separator']
self.end_sentence = config['grammar']['end_sentence'] self.endsen = config['grammar']['endsen']
def generate(self, message): def generate(self, message):
""" """
Generates response based on message words Generates response based on given message
:param message: Message :param message: Message
:return: Response or empty string, if generated response equals to user message :return:
- response (a message)
- empty string (if response == message)
""" """
words = self.tokenizer.extract_words(message) words = self.tokenizer.extract_words(message)
# TODO explain this
pairs = [trigram[:-1] for trigram in self.tokenizer.split_to_trigrams(words)] pairs = [trigram[:-1] for trigram in self.tokenizer.split_to_trigrams(words)]
# TODO explain why it returns what it returns
messages = [self.__generate_best_message(chat_id=message.chat_id, pair=pair) for pair in pairs] messages = [self.__generate_best_message(chat_id=message.chat_id, pair=pair) for pair in pairs]
longest_message = max(messages, key=len) if len(messages) else '' longest_message = max(messages, key=len) if len(messages) else ''
if longest_message and strings_has_equal_letters(longest_message, ''.join(words)): if longest_message and strings_has_equal_letters(longest_message, ''.join(words)):
return '' return ''
return longest_message return longest_message
def __generate_best_message(self, chat_id, pair): def __generate_best_message(self, chat_id, pair):
# TODO explain this method
best_message = '' best_message = ''
for _ in range(self.max_messages): for _ in range(self.max_msgs):
generated = self.__generate_sentence(chat_id=chat_id, pair=pair) generated = self.__generate_sentence(chat_id=chat_id, pair=pair)
if len(generated) > len(best_message): if len(generated) > len(best_message):
best_message = generated best_message = generated
# TODO explain the concept of the BEST message
return best_message return best_message
def __generate_sentence(self, chat_id, pair): def __generate_sentence(self, chat_id, pair):
# TODO explain this method
gen_words = [] gen_words = []
key = self.separator.join(pair) key = self.separator.join(pair)
for _ in range(self.max_words): # TODO explain this loop
for _ in range(self.max_wrds):
words = key.split(self.separator) words = key.split(self.separator)
gen_words.append(words[1] if len(gen_words) == 0 else words[1]) gen_words.append(words[1] if len(gen_words) == 0 else words[1])
@ -58,18 +68,25 @@ class ReplyGenerator:
key = self.separator.join(words[1:] + [next_word]) key = self.separator.join(words[1:] + [next_word])
# Append last word, if it not already in the list # TODO explain what's last word
# Append last word unless it is in the list already
last_word = key.split(self.separator)[-1] last_word = key.split(self.separator)[-1]
if last_word not in gen_words: if last_word not in gen_words:
gen_words.append(last_word) gen_words.append(last_word)
# Keep only one word, if all words in list are the same # If all words are the same (i.e. len(set(gen_words)) == 1), keep just one word
if all(w == gen_words[0] for w in gen_words): if len(set(gen_words)) == 1:
gen_words = [gen_words[0]] gen_words = list(set(gen_words))
gen_words = list(filter(lambda w: w != self.stop_word, gen_words)) gen_words = [w for w in gen_words if w != self.stop_word]
# TODO maybe move wordlist preparations to some function?
# TODO maybe move generating the sentence to a function?
sentence = ' '.join(gen_words).strip() sentence = ' '.join(gen_words).strip()
if sentence[-1:] not in self.end_sentence: if sentence[-1:] not in self.endsen:
# TODO explain this pls:
sentence += self.tokenizer.random_end_sentence_token() sentence += self.tokenizer.random_end_sentence_token()
# sentence = capitalize(sentence)
# TODO my intuition tells me we shouldn't return fun(obj), but IDK really
return capitalize(sentence) return sentence

View File

@ -5,41 +5,45 @@ from src.config import config
class Tokenizer: class Tokenizer:
def __init__(self): def __init__(self):
self.chain_length = config.getint('grammar', 'chain_length') self.chain_len = config.getint('grammar', 'chain_len')
self.stop_word = config['grammar']['stop_word'] self.stop_word = config['grammar']['stop_word']
self.end_sentence = config['grammar']['end_sentence'] self.endsent = config['grammar']['endsent']
self.garbage_tokens = config['grammar']['all'] self.garbage = config['grammar']['garbage']
def split_to_trigrams(self, src_words): def split_to_trigrams(self, src_words):
if len(src_words) <= self.chain_length: if len(src_words) <= self.chain_len:
yield from () yield from ()
words = [self.stop_word] words = [self.stop_word]
for word in src_words: for word in src_words:
words.append(word) words.append(word)
if word[-1] in self.end_sentence: if word[-1] in self.endsent:
words.append(self.stop_word) words.append(self.stop_word)
if words[-1] != self.stop_word: if words[-1] != self.stop_word:
words.append(self.stop_word) words.append(self.stop_word)
for i in range(len(words) - self.chain_length): for i in range(len(words) - self.chain_len):
yield words[i:i + self.chain_length + 1] j = i + self.chain_len + 1
yield words[i : j]
def extract_words(self, message): def extract_words(self, message):
symbols = list(re.sub('\s', ' ', message.text)) symbols = list(re.sub('\s', ' ', message.text))
for entity in message.entities: for entity in message.entities:
symbols[entity.offset:entity.length + entity.offset] = ' ' * entity.length # TODO: explain the code
# TODO: validate the formula
symbols[entity.offset : (entity.length+entity.offset)] = ' ' * entity.length
return list(filter(None, map(self.__prettify, ''.join(symbols).split(' ')))) return list(filter(None, map(self.__prettify, ''.join(symbols).split(' '))))
def random_end_sentence_token(self): def random_end_sentence_token(self):
return random_element(list(self.end_sentence)) return random_element(list(self.endsent))
def __prettify(self, word): def __prettify(self, word):
lowercase_word = word.lower().strip() lowercase_word = word.lower().strip()
last_symbol = lowercase_word[-1:] last_symbol = lowercase_word[-1:]
if last_symbol not in self.end_sentence: if last_symbol not in self.endsent:
last_symbol = '' last_symbol = ''
pretty_word = lowercase_word.strip(self.garbage_tokens) pretty_word = lowercase_word.strip(self.garbage_tokens)