imaginaryfriend/src/tokenizer.py

73 lines
2.3 KiB
Python
Raw Normal View History

2016-12-09 19:37:55 +01:00
import re
from src.utils import random_element
from src.config import config
class Tokenizer:
    """Splits Telegram messages into cleaned words and sliding word windows
    ("trigrams" when chain_len == 2) for the Markov-chain generator."""

    def __init__(self):
        # Number of context words per chain link; each yielded window is
        # chain_len + 1 words long.
        self.chain_len = config.getint('grammar', 'chain_len')
        # Sentinel token marking sentence boundaries inside chains.
        self.stop_word = config['grammar']['stop_word']
        # Characters that terminate a sentence (e.g. '.', '!', '?').
        self.endsen = config['grammar']['endsen']
        # Characters stripped from the edges of each word.
        self.garbage = config['grammar']['garbage']
        # Telegram entity types whose covered text is dropped entirely.
        # https://core.telegram.org/bots/api#messageentity
        self.garbage_entities = config.getlist('grammar', 'garbage_entities')

    def split_to_trigrams(self, words_list):
        """Yield sliding windows of chain_len + 1 words over words_list,
        with stop_word inserted at sentence boundaries.

        Yields nothing when the input is too short to form a full window.
        """
        if len(words_list) <= self.chain_len:
            # BUG FIX: the original did `yield from ()` here and then FELL
            # THROUGH, so too-short inputs still produced windows below.
            return

        words = [self.stop_word]
        for word in words_list:
            words.append(word)
            # A sentence-ending last character closes the sentence.
            if word[-1] in self.endsen:
                words.append(self.stop_word)
        # Ensure the sequence is terminated by a stop word.
        if words[-1] != self.stop_word:
            words.append(self.stop_word)

        for i in range(len(words) - self.chain_len):
            yield words[i:i + self.chain_len + 1]

    def extract_words(self, message):
        """Return the prettified, non-empty words of a message after
        removing text covered by garbage entities."""
        # Normalize all whitespace (tabs, newlines, ...) to single spaces.
        # r'\s' fixes the invalid non-raw '\s' escape of the original.
        text = re.sub(r'\s', ' ', self.remove_garbage_entities(message))
        # filter(None, ...) drops words prettify() rejected (None or '').
        return list(filter(None, map(self.prettify, text.split(' '))))

    def random_end_sentence_token(self):
        """Return a random sentence-ending character from the configured set."""
        return random_element(list(self.endsen))

    def remove_garbage_entities(self, message):
        """Return message.text with the spans of unwanted Telegram entities
        (links, mentions, ...) removed.

        Telegram entity offsets/lengths are measured in UTF-16 code units,
        so the text is sliced as UTF-16-LE bytes (2 bytes per code unit).
        Assumes message.entities is sorted and non-overlapping, as Telegram
        delivers them.
        """
        encoding = 'utf-16-le'
        utf16bytes = message.text.encode(encoding)
        result = bytearray()
        cur_pos = 0
        for e in message.entities:
            start_pos = e.offset * 2
            end_pos = (e.offset + e.length) * 2
            # Keep the text between the previous entity and this one.
            result += utf16bytes[cur_pos:start_pos]
            # Keep the entity's own text only if its type is not garbage.
            if e.type not in self.garbage_entities:
                result += utf16bytes[start_pos:end_pos]
            cur_pos = end_pos
        result += utf16bytes[cur_pos:]
        return result.decode(encoding)

    def prettify(self, word):
        """Lowercase a word, strip garbage characters from its edges and
        re-attach a trailing sentence terminator if there was one.

        Returns None for words that reduce to pure garbage (callers filter
        these out).
        """
        lowercase_word = word.lower().strip()
        # Remember a sentence-ending final character so it survives the
        # garbage strip below.
        last_symbol = lowercase_word[-1:]
        if last_symbol not in self.endsen:
            last_symbol = ''
        pretty_word = lowercase_word.strip(self.garbage)
        # len > 2 already implies non-empty; the original's extra
        # `pretty_word != ''` check was redundant.
        if len(pretty_word) > 2:
            return pretty_word + last_symbol
        elif lowercase_word in self.garbage:
            return None
        return lowercase_word