learner: add possibility to learn from chat log
This commit is contained in:
parent
d583b26db7
commit
15566baf1a
|
@ -19,10 +19,15 @@ class Bot:
|
||||||
|
|
||||||
purge_queue_instance = chat_purge_queue.instance(self.updater.job_queue)
|
purge_queue_instance = chat_purge_queue.instance(self.updater.job_queue)
|
||||||
|
|
||||||
self.dispatcher.add_handler(MessageHandler())
|
msghandler = MessageHandler()
|
||||||
|
self.dispatcher.add_handler(msghandler)
|
||||||
self.dispatcher.add_handler(CommandHandler())
|
self.dispatcher.add_handler(CommandHandler())
|
||||||
self.dispatcher.add_handler(StatusHandler(chat_purge_queue=purge_queue_instance))
|
self.dispatcher.add_handler(StatusHandler(chat_purge_queue=purge_queue_instance))
|
||||||
|
|
||||||
|
if config['bot']['learn_from_file']:
|
||||||
|
msghandler.data_learner.learn_from_file(config['bot']['learn_from_file'])
|
||||||
|
while True: pass
|
||||||
|
|
||||||
if config['updates']['mode'] == 'polling':
|
if config['updates']['mode'] == 'polling':
|
||||||
self.updater.start_polling()
|
self.updater.start_polling()
|
||||||
elif config['updates']['mode'] == 'webhook':
|
elif config['updates']['mode'] == 'webhook':
|
||||||
|
|
|
@ -5,7 +5,7 @@ import os
|
||||||
encoding = 'utf-8'
|
encoding = 'utf-8'
|
||||||
|
|
||||||
sections = {
|
sections = {
|
||||||
'bot': ['token', 'name', 'anchors', 'god_mode', 'purge_interval', 'default_chance', 'spam_stickers'],
|
'bot': ['token', 'name', 'anchors', 'god_mode', 'purge_interval', 'default_chance', 'spam_stickers', 'mention_name', 'mention_name_full', 'learn_from_file'],
|
||||||
'grammar': ['chain_len', 'sep', 'stop_word', 'max_wrds', 'max_msgs', 'endsen', 'garbage', 'garbage_entities'],
|
'grammar': ['chain_len', 'sep', 'stop_word', 'max_wrds', 'max_msgs', 'endsen', 'garbage', 'garbage_entities'],
|
||||||
'logging': ['level'],
|
'logging': ['level'],
|
||||||
'updates': ['mode'],
|
'updates': ['mode'],
|
||||||
|
|
|
@ -1,5 +1,10 @@
|
||||||
from src.config import trigram_repository, tokenizer
|
from src.config import trigram_repository, tokenizer
|
||||||
|
|
||||||
|
class FakeMessage:
|
||||||
|
def __init__(self, chat_id, text):
|
||||||
|
self.chat_id = chat_id
|
||||||
|
self.text = text
|
||||||
|
self.entities = []
|
||||||
|
|
||||||
class DataLearner:
|
class DataLearner:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
|
@ -11,7 +16,30 @@ class DataLearner:
|
||||||
Split message to trigrams and write to DB
|
Split message to trigrams and write to DB
|
||||||
:param message: Message
|
:param message: Message
|
||||||
"""
|
"""
|
||||||
|
# print("{}".format(message))
|
||||||
words = self.tokenizer.extract_words(message)
|
words = self.tokenizer.extract_words(message)
|
||||||
trigrams = self.tokenizer.split_to_trigrams(words)
|
trigrams = self.tokenizer.split_to_trigrams(words)
|
||||||
|
|
||||||
self.trigram_repository.store(message.chat_id, trigrams)
|
self.trigram_repository.store(message.chat_id, trigrams)
|
||||||
|
|
||||||
|
def learn_from_file(self, file):
|
||||||
|
with open(file, 'r') as fd:
|
||||||
|
# f = open('filtered.txt', 'w+')
|
||||||
|
content = fd.read()
|
||||||
|
# message = object()
|
||||||
|
# setattr(message, 'chat_id', -1001120788844)
|
||||||
|
strs = content.split(';;;')
|
||||||
|
j = 0
|
||||||
|
for text in strs:
|
||||||
|
import re
|
||||||
|
text = re.sub(r'https?:\/\/\S+', '', text, flags=re.MULTILINE)
|
||||||
|
text = re.sub(r'[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)', '', text, flags=re.MULTILINE)
|
||||||
|
text = re.sub(r"(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)", '', text, flags=re.MULTILINE)
|
||||||
|
text = re.sub(r'@\S+', '', text, flags=re.MULTILINE)
|
||||||
|
text = text.strip()
|
||||||
|
message = FakeMessage(-1001180265993, text)
|
||||||
|
self.learn(message)
|
||||||
|
#f.write(text + ';;;')
|
||||||
|
j += 1
|
||||||
|
if j % 1000 == 0:
|
||||||
|
print('learned %d messages' % j)
|
||||||
|
|
Loading…
Reference in New Issue