"""
crate_anon/common/bugfix_flashtext.py

===============================================================================

    Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
    Created by Rudolf Cardinal (rnc1001@cam.ac.uk).

    This file is part of CRATE.

    CRATE is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    CRATE is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with CRATE. If not, see <https://www.gnu.org/licenses/>.

===============================================================================

THIS FILE, however, contains code by another author: Ihor Bobak, from
https://github.com/vi3k6i5/flashtext/issues/44, modifying the flashtext
code; it is licensed under the MIT License as per
https://github.com/vi3k6i5/flashtext/blob/master/LICENSE.

Rationale:

There is currently a bug in the :meth:`replace_keywords` method of the
external ``flashtext`` module, in which certain characters provoke an 'index
out of range' error when working in case-insensitive mode. This is because
some non-ASCII characters become longer (occupy more characters) when
converted to lower case. Thanks to Ihor Bobak for this bugfix.
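
As a minimal illustration (an example added here, not part of the original
fix), U+0130, LATIN CAPITAL LETTER I WITH DOT ABOVE, is one character whose
lower-case form is longer than its upper-case form::

    len("İ")          # == 1
    len("İ".lower())  # == 2

The lower-cased copy used for matching can therefore be longer than the
original sentence, and indexing the original sentence by positions in the
lower-cased copy can run past its end.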

Edits for PyCharm linter.
"""

from flashtext import KeywordProcessor


# noinspection PyAbstractClass
class KeywordProcessorFixed(KeywordProcessor):
    # noinspection PyUnusedLocal
    def replace_keywords(self, a_sentence: str) -> str:
        if not a_sentence:
            # if sentence is empty or None, just return the same
            return a_sentence
        new_sentence = []
        if not self.case_sensitive:
            sentence = a_sentence.lower()
            # By Ihor Bobak:
            # Some letters expand in length when lower() is called, so we
            # preprocess a_sentence to find those letters whose lower-case
            # form is 2 or more characters long.
            # Imagine that X lowers to yz, while the rest lower one-to-one:
            # A->a, B->b, C->c. Then for the string ABCXABC we want to get
            # ['A', 'B', 'C', 'X', '', 'A', 'B', 'C'], which corresponds to
            # ['a', 'b', 'c', 'y', 'z', 'a', 'b', 'c'], because when the code
            # below walks the indexes of the lowered string, it will "glue"
            # the original string back together by THE SAME indexes.
            orig_sentence = []
            for i in range(0, len(a_sentence)):
                char = a_sentence[i]
                len_char_lower = len(char.lower())
                for j in range(0, len_char_lower):
                    # In most cases this runs for just one iteration and adds
                    # the same char; but if it happens that X->yz, then for z
                    # it will add ''.
                    orig_sentence.append(char if j == 0 else "")
        else:
            sentence = a_sentence
            orig_sentence = a_sentence
        current_word = ""
        current_dict = self.keyword_trie_dict
        current_white_space = ""
        sequence_end_pos = 0
        idx = 0
        sentence_len = len(sentence)
        while idx < sentence_len:
            char = sentence[idx]
            current_word += orig_sentence[idx]
            # when we reach whitespace
            if char not in self.non_word_boundaries:
                current_white_space = char
                # if end is present in current_dict
                if self._keyword in current_dict or char in current_dict:
                    # update longest sequence found
                    sequence_found = None
                    longest_sequence_found = None
                    is_longer_seq_found = False
                    if self._keyword in current_dict:
                        sequence_found = current_dict[self._keyword]
                        longest_sequence_found = current_dict[self._keyword]
                        sequence_end_pos = idx

                    # re-look for longest_sequence from this position
                    if char in current_dict:
                        current_dict_continued = current_dict[char]
                        current_word_continued = current_word
                        idy = idx + 1
                        while idy < sentence_len:
                            inner_char = sentence[idy]
                            current_word_continued += orig_sentence[idy]
                            if (
                                inner_char not in self.non_word_boundaries
                                and self._keyword in current_dict_continued
                            ):
                                # update longest sequence found
                                current_white_space = inner_char
                                longest_sequence_found = (
                                    current_dict_continued[self._keyword]
                                )
                                sequence_end_pos = idy
                                is_longer_seq_found = True
                            if inner_char in current_dict_continued:
                                current_dict_continued = (
                                    current_dict_continued[inner_char]
                                )
                            else:
                                break
                            idy += 1
                        else:
                            # end of sentence reached
                            if self._keyword in current_dict_continued:
                                # update longest sequence found
                                current_white_space = ""
                                longest_sequence_found = (
                                    current_dict_continued[self._keyword]
                                )
                                sequence_end_pos = idy
                                is_longer_seq_found = True
                        if is_longer_seq_found:
                            idx = sequence_end_pos
                            current_word = current_word_continued
                    current_dict = self.keyword_trie_dict
                    if longest_sequence_found:
                        new_sentence.append(longest_sequence_found)
                        new_sentence.append(current_white_space)
                        current_word = ""
                        current_white_space = ""
                    else:
                        new_sentence.append(current_word)
                        current_word = ""
                        current_white_space = ""
                else:
                    # we reset current_dict
                    current_dict = self.keyword_trie_dict
                    new_sentence.append(current_word)
                    current_word = ""
                    current_white_space = ""
            elif char in current_dict:
                # we can continue from this char
                current_dict = current_dict[char]
            else:
                # we reset current_dict
                current_dict = self.keyword_trie_dict
                # skip to end of word
                idy = idx + 1
                while idy < sentence_len:
                    char = sentence[idy]
                    current_word += orig_sentence[idy]
                    if char not in self.non_word_boundaries:
                        break
                    idy += 1
                idx = idy
                new_sentence.append(current_word)
                current_word = ""
                current_white_space = ""
            # if we are at the end of the sentence and have a sequence
            # discovered
            if idx + 1 >= sentence_len:
                if self._keyword in current_dict:
                    sequence_found = current_dict[self._keyword]
                    new_sentence.append(sequence_found)
                else:
                    new_sentence.append(current_word)
            idx += 1
        return "".join(new_sentence)
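

# A minimal usage sketch (an illustration added here, not part of the original
# bugfix); the keyword and replacement strings below are arbitrary examples.
# "İ" (U+0130) lower-cases to two characters, the kind of input that provoked
# the index error in the unfixed replace_keywords().
if __name__ == "__main__":
    kp = KeywordProcessorFixed(case_sensitive=False)
    kp.add_keyword("patient", "[REDACTED]")
    print(kp.replace_keywords("İ met the patient today."))
    # Expected output (with the fix): İ met the [REDACTED] today.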