issue #48: comments & small fixes
This commit is contained in:
parent
737375bf28
commit
c56124bb9b
|
@ -10,13 +10,13 @@ spam_stickers=BQADAgADSAIAAkcGQwU-G-9SZUDTWAI
|
||||||
level=INFO
|
level=INFO
|
||||||
|
|
||||||
[grammar]
|
[grammar]
|
||||||
chain_length=2
|
chain_len=2
|
||||||
separator=\x02
|
separator=\x02
|
||||||
stop_word=\x00
|
stop_word=\x00
|
||||||
max_words=30
|
max_wrds=30
|
||||||
max_messages=5
|
max_msgs=5
|
||||||
end_sentence=.....!!?
|
endsent=.....!!?
|
||||||
all=.!?;()\-—"[]{}«»/*&^#$
|
garbage=«<{([.!?;\-—"/*&^#$|#$%^&*(№;%:?*])}>»
|
||||||
|
|
||||||
[media_checker]
|
[media_checker]
|
||||||
lifetime=28800.0
|
lifetime=28800.0
|
||||||
|
|
|
@ -11,43 +11,53 @@ class ReplyGenerator:
|
||||||
self.tokenizer = tokenizer
|
self.tokenizer = tokenizer
|
||||||
self.trigram_repository = trigram_repository
|
self.trigram_repository = trigram_repository
|
||||||
|
|
||||||
self.max_words = config.getint('grammar', 'max_words')
|
self.max_wrds = config.getint('grammar', 'max_wrds')
|
||||||
self.max_messages = config.getint('grammar', 'max_messages')
|
self.max_msgs = config.getint('grammar', 'max_msgs')
|
||||||
|
|
||||||
self.stop_word = config['grammar']['stop_word']
|
self.stop_word = config['grammar']['stop_word']
|
||||||
self.separator = config['grammar']['separator']
|
self.separator = config['grammar']['separator']
|
||||||
self.end_sentence = config['grammar']['end_sentence']
|
self.endsen = config['grammar']['endsen']
|
||||||
|
|
||||||
def generate(self, message):
|
def generate(self, message):
|
||||||
"""
|
"""
|
||||||
Generates response based on message words
|
Generates response based on given message
|
||||||
|
|
||||||
:param message: Message
|
:param message: Message
|
||||||
:return: Response, or an empty string if the generated response equals the user message
|
:return:
|
||||||
|
- response (a message)
|
||||||
|
- empty string (if response == message)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
words = self.tokenizer.extract_words(message)
|
words = self.tokenizer.extract_words(message)
|
||||||
|
|
||||||
|
# TODO explain this
|
||||||
pairs = [trigram[:-1] for trigram in self.tokenizer.split_to_trigrams(words)]
|
pairs = [trigram[:-1] for trigram in self.tokenizer.split_to_trigrams(words)]
|
||||||
|
|
||||||
|
# TODO explain why it returns what it returns
|
||||||
messages = [self.__generate_best_message(chat_id=message.chat_id, pair=pair) for pair in pairs]
|
messages = [self.__generate_best_message(chat_id=message.chat_id, pair=pair) for pair in pairs]
|
||||||
longest_message = max(messages, key=len) if len(messages) else ''
|
longest_message = max(messages, key=len) if len(messages) else ''
|
||||||
|
|
||||||
if longest_message and strings_has_equal_letters(longest_message, ''.join(words)):
|
if longest_message and strings_has_equal_letters(longest_message, ''.join(words)):
|
||||||
return ''
|
return ''
|
||||||
|
|
||||||
return longest_message
|
return longest_message
|
||||||
|
|
||||||
def __generate_best_message(self, chat_id, pair):
|
def __generate_best_message(self, chat_id, pair):
|
||||||
|
# TODO explain this method
|
||||||
best_message = ''
|
best_message = ''
|
||||||
for _ in range(self.max_messages):
|
for _ in range(self.max_msgs):
|
||||||
generated = self.__generate_sentence(chat_id=chat_id, pair=pair)
|
generated = self.__generate_sentence(chat_id=chat_id, pair=pair)
|
||||||
if len(generated) > len(best_message):
|
if len(generated) > len(best_message):
|
||||||
best_message = generated
|
best_message = generated
|
||||||
|
# TODO explain the concept of the BEST message
|
||||||
return best_message
|
return best_message
|
||||||
|
|
||||||
def __generate_sentence(self, chat_id, pair):
|
def __generate_sentence(self, chat_id, pair):
|
||||||
|
# TODO explain this method
|
||||||
gen_words = []
|
gen_words = []
|
||||||
key = self.separator.join(pair)
|
key = self.separator.join(pair)
|
||||||
|
|
||||||
for _ in range(self.max_words):
|
# TODO explain this loop
|
||||||
|
for _ in range(self.max_wrds):
|
||||||
words = key.split(self.separator)
|
words = key.split(self.separator)
|
||||||
|
|
||||||
gen_words.append(words[1] if len(gen_words) == 0 else words[1])
|
gen_words.append(words[1] if len(gen_words) == 0 else words[1])
|
||||||
|
@ -58,18 +68,25 @@ class ReplyGenerator:
|
||||||
|
|
||||||
key = self.separator.join(words[1:] + [next_word])
|
key = self.separator.join(words[1:] + [next_word])
|
||||||
|
|
||||||
# Append last word, if it is not already in the list
|
# TODO explain what's last word
|
||||||
|
# Append last word unless it is in the list already
|
||||||
last_word = key.split(self.separator)[-1]
|
last_word = key.split(self.separator)[-1]
|
||||||
if last_word not in gen_words:
|
if last_word not in gen_words:
|
||||||
gen_words.append(last_word)
|
gen_words.append(last_word)
|
||||||
|
|
||||||
# Keep only one word, if all words in list are the same
|
# If all words are equal (i.e. len(set(gen_words)) == 1), leave just 1 word
|
||||||
if all(w == gen_words[0] for w in gen_words):
|
if len(set(gen_words)) == 1:
|
||||||
gen_words = [gen_words[0]]
|
gen_words = list(set(gen_words))
|
||||||
|
|
||||||
gen_words = list(filter(lambda w: w != self.stop_word, gen_words))
|
gen_words = [w for w in gen_words if w != self.stop_word]
|
||||||
|
# TODO maybe move wordlist preparations to some function?
|
||||||
|
|
||||||
|
# TODO maybe move generating the sentence to a function?
|
||||||
sentence = ' '.join(gen_words).strip()
|
sentence = ' '.join(gen_words).strip()
|
||||||
if sentence[-1:] not in self.end_sentence:
|
if sentence[-1:] not in self.endsen:
|
||||||
|
# TODO explain this pls:
|
||||||
sentence += self.tokenizer.random_end_sentence_token()
|
sentence += self.tokenizer.random_end_sentence_token()
|
||||||
|
# sentence = capitalize(sentence)
|
||||||
|
# TODO my intuition tells me we shouldn't return fun(obj), but IDK really
|
||||||
|
|
||||||
return capitalize(sentence)
|
return sentence
|
||||||
|
|
|
@ -5,41 +5,45 @@ from src.config import config
|
||||||
|
|
||||||
class Tokenizer:
|
class Tokenizer:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.chain_length = config.getint('grammar', 'chain_length')
|
self.chain_len = config.getint('grammar', 'chain_len')
|
||||||
self.stop_word = config['grammar']['stop_word']
|
self.stop_word = config['grammar']['stop_word']
|
||||||
self.end_sentence = config['grammar']['end_sentence']
|
self.endsent = config['grammar']['endsent']
|
||||||
self.garbage_tokens = config['grammar']['all']
|
self.garbage = config['grammar']['garbage']
|
||||||
|
|
||||||
def split_to_trigrams(self, src_words):
|
def split_to_trigrams(self, src_words):
|
||||||
if len(src_words) <= self.chain_length:
|
if len(src_words) <= self.chain_len:
|
||||||
yield from ()
|
yield from ()
|
||||||
|
|
||||||
words = [self.stop_word]
|
words = [self.stop_word]
|
||||||
|
|
||||||
for word in src_words:
|
for word in src_words:
|
||||||
words.append(word)
|
words.append(word)
|
||||||
if word[-1] in self.end_sentence:
|
if word[-1] in self.endsent:
|
||||||
words.append(self.stop_word)
|
words.append(self.stop_word)
|
||||||
if words[-1] != self.stop_word:
|
if words[-1] != self.stop_word:
|
||||||
words.append(self.stop_word)
|
words.append(self.stop_word)
|
||||||
|
|
||||||
for i in range(len(words) - self.chain_length):
|
for i in range(len(words) - self.chain_len):
|
||||||
yield words[i:i + self.chain_length + 1]
|
j = i + self.chain_len + 1
|
||||||
|
yield words[i : j]
|
||||||
|
|
||||||
def extract_words(self, message):
|
def extract_words(self, message):
|
||||||
symbols = list(re.sub('\s', ' ', message.text))
|
symbols = list(re.sub('\s', ' ', message.text))
|
||||||
|
|
||||||
for entity in message.entities:
|
for entity in message.entities:
|
||||||
symbols[entity.offset:entity.length + entity.offset] = ' ' * entity.length
|
# TODO: explain the code
|
||||||
|
# TODO: validate the formula
|
||||||
|
symbols[entity.offset : (entity.length+entity.offset)] = ' ' * entity.length
|
||||||
|
|
||||||
return list(filter(None, map(self.__prettify, ''.join(symbols).split(' '))))
|
return list(filter(None, map(self.__prettify, ''.join(symbols).split(' '))))
|
||||||
|
|
||||||
def random_end_sentence_token(self):
|
def random_end_sentence_token(self):
|
||||||
return random_element(list(self.end_sentence))
|
return random_element(list(self.endsent))
|
||||||
|
|
||||||
def __prettify(self, word):
|
def __prettify(self, word):
|
||||||
lowercase_word = word.lower().strip()
|
lowercase_word = word.lower().strip()
|
||||||
last_symbol = lowercase_word[-1:]
|
last_symbol = lowercase_word[-1:]
|
||||||
if last_symbol not in self.end_sentence:
|
if last_symbol not in self.endsent:
|
||||||
last_symbol = ''
|
last_symbol = ''
|
||||||
pretty_word = lowercase_word.strip(self.garbage_tokens)
|
pretty_word = lowercase_word.strip(self.garbage_tokens)
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue