From c56124bb9ba7908d2f54e9483fbad75f1e7ae648 Mon Sep 17 00:00:00 2001
From: Katya
Date: Fri, 18 Aug 2017 03:48:09 +0300
Subject: [PATCH] issue #48: comments & small fixes

---
 resources/main.cfg             | 10 +++----
 src/service/reply_generator.py | 49 +++++++++++++++++++++++-----------
 src/tokenizer.py               | 24 ++++++++++-------
 3 files changed, 52 insertions(+), 31 deletions(-)

diff --git a/resources/main.cfg b/resources/main.cfg
index 1fb6466..070fb7b 100644
--- a/resources/main.cfg
+++ b/resources/main.cfg
@@ -10,13 +10,13 @@ spam_stickers=BQADAgADSAIAAkcGQwU-G-9SZUDTWAI
 level=INFO
 
 [grammar]
-chain_length=2
+chain_len=2
 separator=\x02
 stop_word=\x00
-max_words=30
-max_messages=5
-end_sentence=.....!!?
-all=.!?;()\-—"[]{}«»/*&^#$
+max_wrds=30
+max_msgs=5
+endsent=.....!!?
+garbage=«»<>{}()[].!?;\-—"/*&^#$|%:№
 
 [media_checker]
 lifetime=28800.0
diff --git a/src/service/reply_generator.py b/src/service/reply_generator.py
index 0199267..d90cde6 100644
--- a/src/service/reply_generator.py
+++ b/src/service/reply_generator.py
@@ -11,43 +11,53 @@ class ReplyGenerator:
         self.tokenizer = tokenizer
         self.trigram_repository = trigram_repository
 
-        self.max_words = config.getint('grammar', 'max_words')
-        self.max_messages = config.getint('grammar', 'max_messages')
+        self.max_wrds = config.getint('grammar', 'max_wrds')
+        self.max_msgs = config.getint('grammar', 'max_msgs')
         self.stop_word = config['grammar']['stop_word']
         self.separator = config['grammar']['separator']
-        self.end_sentence = config['grammar']['end_sentence']
+        self.endsent = config['grammar']['endsent']
 
     def generate(self, message):
         """
-        Generates response based on message words
-        :return: Response or empty string, if generated response equals to user message
+        Generates a response based on the words of the given message
+
+        :param message: Message
+        :return: the generated response, or an empty string if the
+                 response degenerates into the user's own message
         """
         words = self.tokenizer.extract_words(message)
+
+        # Seed pairs: the first two words of every trigram found in the message
         pairs = [trigram[:-1] for trigram in self.tokenizer.split_to_trigrams(words)]
+
+        # One candidate reply per seed pair; the longest candidate wins
         messages = [self.__generate_best_message(chat_id=message.chat_id, pair=pair) for pair in pairs]
         longest_message = max(messages, key=len) if len(messages) else ''
-
         if longest_message and strings_has_equal_letters(longest_message, ''.join(words)):
             return ''
         return longest_message
 
     def __generate_best_message(self, chat_id, pair):
+        # "Best" simply means longest: generate up to max_msgs sentences
+        # from the same seed pair and keep the longest one
         best_message = ''
-        for _ in range(self.max_messages):
+        for _ in range(self.max_msgs):
             generated = self.__generate_sentence(chat_id=chat_id, pair=pair)
             if len(generated) > len(best_message):
                 best_message = generated
-
         return best_message
 
     def __generate_sentence(self, chat_id, pair):
+        # Walk the trigram chain starting from the seed pair, collecting
+        # one word per step until the chain runs dry or max_wrds is hit
         gen_words = []
         key = self.separator.join(pair)
 
-        for _ in range(self.max_words):
+        for _ in range(self.max_wrds):
             words = key.split(self.separator)
-            gen_words.append(words[1] if len(gen_words) == 0 else words[1])
+            # append the second word of the key: the first one was either
+            # appended on the previous step or is the seed pair's opener
+            gen_words.append(words[1])
@@ -58,18 +68,25 @@ class ReplyGenerator:
 
             key = self.separator.join(words[1:] + [next_word])
 
-        # Append last word, if it not already in the list
+        # The tail of the final key is the most recently fetched word;
+        # append it unless it is already in the list
         last_word = key.split(self.separator)[-1]
         if last_word not in gen_words:
             gen_words.append(last_word)
 
-        # Keep only one word, if all words in list are the same
-        if all(w == gen_words[0] for w in gen_words):
-            gen_words = [gen_words[0]]
+        # If every generated word is identical, keep just one of it
+        if len(set(gen_words)) == 1:
+            gen_words = [gen_words[0]]
 
-        gen_words = list(filter(lambda w: w != self.stop_word, gen_words))
+        gen_words = [w for w in gen_words if w != self.stop_word]
+        # TODO: consider extracting this word-list cleanup into a helper
 
         sentence = ' '.join(gen_words).strip()
-        if sentence[-1:] not in self.end_sentence:
+        if sentence[-1:] not in self.endsent:
+            # close the sentence with a random sentence-final token
             sentence += self.tokenizer.random_end_sentence_token()
 
         return capitalize(sentence)
diff --git a/src/tokenizer.py b/src/tokenizer.py
index cf39532..187009d 100644
--- a/src/tokenizer.py
+++ b/src/tokenizer.py
@@ -5,41 +5,45 @@ from src.config import config
 
 class Tokenizer:
     def __init__(self):
-        self.chain_length = config.getint('grammar', 'chain_length')
+        self.chain_len = config.getint('grammar', 'chain_len')
         self.stop_word = config['grammar']['stop_word']
-        self.end_sentence = config['grammar']['end_sentence']
-        self.garbage_tokens = config['grammar']['all']
+        self.endsent = config['grammar']['endsent']
+        self.garbage = config['grammar']['garbage']
 
     def split_to_trigrams(self, src_words):
-        if len(src_words) <= self.chain_length:
-            yield from ()
+        if len(src_words) <= self.chain_len:
+            # not enough words for a single trigram; a bare return ends
+            # the generator, while the old `yield from ()` fell through
+            return
 
         words = [self.stop_word]
+        # insert a stop word after each sentence-final token so trigrams
+        # never cross sentence boundaries
        for word in src_words:
             words.append(word)
-            if word[-1] in self.end_sentence:
+            if word[-1] in self.endsent:
                 words.append(self.stop_word)
         if words[-1] != self.stop_word:
             words.append(self.stop_word)
 
-        for i in range(len(words) - self.chain_length):
-            yield words[i:i + self.chain_length + 1]
+        for i in range(len(words) - self.chain_len):
+            yield words[i:i + self.chain_len + 1]
 
     def extract_words(self, message):
         symbols = list(re.sub('\s', ' ', message.text))
         for entity in message.entities:
-            symbols[entity.offset:entity.length + entity.offset] = ' ' * entity.length
+            # blank out Telegram entities (links, mentions, ...): the slice
+            # [offset, offset + length) covers exactly entity.length symbols,
+            # so the replacement keeps the list length unchanged
+            symbols[entity.offset:entity.offset + entity.length] = ' ' * entity.length
         return list(filter(None, map(self.__prettify, ''.join(symbols).split(' '))))
 
     def random_end_sentence_token(self):
-        return random_element(list(self.end_sentence))
+        return random_element(list(self.endsent))
 
     def __prettify(self, word):
         lowercase_word = word.lower().strip()
         last_symbol = lowercase_word[-1:]
-        if last_symbol not in self.end_sentence:
+        if last_symbol not in self.endsent:
             last_symbol = ''
 
-        pretty_word = lowercase_word.strip(self.garbage_tokens)
+        pretty_word = lowercase_word.strip(self.garbage)
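
-- 
The hunks above touch a Markov-style trigram pipeline, which is hard to see
from the diff alone, so here, below the conventional `-- ` signature marker
(outside the patch itself), is a minimal, self-contained sketch of the same
scheme. It is an illustration under simplifying assumptions, not code from
this repository: build_store, generate_sentence and the plain dict standing
in for trigram_repository are hypothetical names, and chat_id, the config
lookups and the repository layer are left out.

import random

STOP = '\x00'      # plays the role of grammar.stop_word
SEP = '\x02'       # plays the role of grammar.separator
END_SENT = '.!?'   # grammar.endsent, deduplicated
CHAIN_LEN = 2      # grammar.chain_len

def split_to_trigrams(src_words):
    """Yield 3-word windows, with stop words marking sentence borders."""
    if len(src_words) <= CHAIN_LEN:
        return
    words = [STOP]
    for word in src_words:
        words.append(word)
        if word[-1] in END_SENT:
            words.append(STOP)
    if words[-1] != STOP:
        words.append(STOP)
    for i in range(len(words) - CHAIN_LEN):
        yield tuple(words[i:i + CHAIN_LEN + 1])

def build_store(corpus_words):
    """Map 'w1<SEP>w2' to the list of words observed after that pair
    (a plain-dict stand-in for what trigram_repository provides)."""
    store = {}
    for w1, w2, w3 in split_to_trigrams(corpus_words):
        store.setdefault(SEP.join((w1, w2)), []).append(w3)
    return store

def generate_sentence(store, pair, max_words=30):
    """Walk the chain from a seed pair, like __generate_sentence does."""
    gen_words = []
    key = SEP.join(pair)
    for _ in range(max_words):
        words = key.split(SEP)
        gen_words.append(words[1])                # second word of the key
        candidates = store.get(key)
        if not candidates:                        # chain ran dry
            break
        next_word = random.choice(candidates)
        key = SEP.join(words[1:] + [next_word])   # shift window right
    last_word = key.split(SEP)[-1]
    if last_word not in gen_words:
        gen_words.append(last_word)
    gen_words = [w for w in gen_words if w != STOP]
    sentence = ' '.join(gen_words).strip()
    if sentence and sentence[-1] not in END_SENT:
        sentence += random.choice(END_SENT)
    return sentence

corpus = 'the cat sat on the mat. the cat ran away.'.split()
store = build_store(corpus)
print(generate_sentence(store, (STOP, 'the')))    # e.g. "the cat ran away."

The real code differs mainly in that the store is kept per chat (note the
chat_id threaded through the methods) and lives behind trigram_repository,
presumably persisted, rather than in an in-memory dict.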