"""
crate_anon/common/bugfix_flashtext.py
===============================================================================
Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
Created by Rudolf Cardinal (rnc1001@cam.ac.uk).
This file is part of CRATE.
CRATE is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
CRATE is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with CRATE. If not, see <https://www.gnu.org/licenses/>.
===============================================================================
THIS FILE, however, is by another author: from
https://github.com/vi3k6i5/flashtext/issues/44, by Ihor Bobak; added to
Flashtext code; licensed under the MIT License as per
https://github.com/vi3k6i5/flashtext/blob/master/LICENSE.
Rationale:
There is currently a bug in the method :meth:`replace_keywords` in the external
module ``flashtext`` in which certain characters provoke an 'index out of
range' error when working in case-insensitive mode. This is because some
non-ASCII characters become longer (more than one character) when converted
to lower case. Thanks to Ihor Bobak for this bugfix.
Edits for PyCharm linter.
"""
from flashtext import KeywordProcessor
# noinspection PyAbstractClass
class KeywordProcessorFixed(KeywordProcessor):
    """
    Subclass of ``flashtext.KeywordProcessor`` that overrides
    :meth:`replace_keywords` so that, in case-insensitive mode, characters
    whose lower-case form is longer than one character do not throw the
    original text and its lower-cased copy out of step (which provoked an
    'index out of range' error). Bugfix by Ihor Bobak.
    """

    def replace_keywords(self, a_sentence: str) -> str:
        """
        Return a copy of ``a_sentence`` in which every keyword found in
        ``self.keyword_trie_dict`` is replaced by the value stored for it
        in the trie (under the ``self._keyword`` marker key).

        Matching is performed on a lower-cased copy of the text unless
        ``self.case_sensitive`` is set; text that is not replaced is copied
        from the original, so its case is kept. Characters that are not in
        ``self.non_word_boundaries`` are treated as word delimiters. Where
        several keywords start at the same position, the longest is used.

        Args:
            a_sentence: text to process

        Returns:
            the processed text; if ``a_sentence`` is empty or ``None``, it
            is returned unchanged
        """
        if not a_sentence:
            # if sentence is empty or none just return the same.
            return a_sentence
        new_sentence = []
        if not self.case_sensitive:
            sentence = a_sentence.lower()
            # by Ihor Bobak:
            # Some letters can expand in size when lower() is called,
            # therefore we preprocess a_sentence to find those letters
            # which lower() turns into 2 or more symbols.
            # So, imagine that X is lowered as yz, and the rest are lowered
            # as is: A->a, B->b, C->c. Then for the string ABCXABC we want
            # to get
            #   ['A', 'B', 'C', 'X', '', 'A', 'B', 'C']
            # which corresponds to
            #   ['a', 'b', 'c', 'y', 'z', 'a', 'b', 'c']
            # because when the code below runs through the indexes of the
            # lowered string, it "glues" the original string together by
            # THE SAME indexes.
            orig_sentence = []
            for char in a_sentence:
                # In most cases this adds just the character itself; but if
                # it happens that X->yz, then '' is added as well, for z.
                orig_sentence.extend(
                    char if j == 0 else ""
                    for j in range(len(char.lower()))
                )
        else:
            sentence = a_sentence
            orig_sentence = a_sentence
        current_word = ""  # original text gathered since the last output
        current_dict = self.keyword_trie_dict  # current position in trie
        current_white_space = ""  # delimiter to emit after a replacement
        sequence_end_pos = 0
        idx = 0
        sentence_len = len(sentence)
        while idx < sentence_len:
            char = sentence[idx]
            current_word += orig_sentence[idx]
            # when we reach whitespace
            if char not in self.non_word_boundaries:
                current_white_space = char
                # if end is present in current_dict
                if self._keyword in current_dict or char in current_dict:
                    # update longest sequence found
                    longest_sequence_found = None
                    is_longer_seq_found = False
                    if self._keyword in current_dict:
                        longest_sequence_found = current_dict[self._keyword]
                        sequence_end_pos = idx
                    # re look for longest_sequence from this position
                    if char in current_dict:
                        current_dict_continued = current_dict[char]
                        current_word_continued = current_word
                        idy = idx + 1
                        while idy < sentence_len:
                            inner_char = sentence[idy]
                            current_word_continued += orig_sentence[idy]
                            if (
                                inner_char not in self.non_word_boundaries
                                and self._keyword in current_dict_continued
                            ):
                                # update longest sequence found
                                current_white_space = inner_char
                                longest_sequence_found = (
                                    current_dict_continued[self._keyword]
                                )
                                sequence_end_pos = idy
                                is_longer_seq_found = True
                            if inner_char in current_dict_continued:
                                current_dict_continued = (
                                    current_dict_continued[inner_char]
                                )
                            else:
                                break
                            idy += 1
                        else:
                            # end of sentence reached.
                            if self._keyword in current_dict_continued:
                                # update longest sequence found
                                current_white_space = ""
                                longest_sequence_found = (
                                    current_dict_continued[self._keyword]
                                )
                                sequence_end_pos = idy
                                is_longer_seq_found = True
                        if is_longer_seq_found:
                            # jump past the text consumed by the longer
                            # match
                            idx = sequence_end_pos
                            current_word = current_word_continued
                    current_dict = self.keyword_trie_dict
                    if longest_sequence_found:
                        new_sentence.append(longest_sequence_found)
                        new_sentence.append(current_white_space)
                        current_word = ""
                        current_white_space = ""
                    else:
                        new_sentence.append(current_word)
                        current_word = ""
                        current_white_space = ""
                else:
                    # we reset current_dict
                    current_dict = self.keyword_trie_dict
                    new_sentence.append(current_word)
                    current_word = ""
                    current_white_space = ""
            elif char in current_dict:
                # we can continue from this char
                current_dict = current_dict[char]
            else:
                # we reset current_dict
                current_dict = self.keyword_trie_dict
                # skip to end of word
                idy = idx + 1
                while idy < sentence_len:
                    char = sentence[idy]
                    current_word += orig_sentence[idy]
                    if char not in self.non_word_boundaries:
                        break
                    idy += 1
                idx = idy
                new_sentence.append(current_word)
                current_word = ""
                current_white_space = ""
            # if we are end of sentence and have a sequence discovered
            if idx + 1 >= sentence_len:
                if self._keyword in current_dict:
                    new_sentence.append(current_dict[self._keyword])
                else:
                    new_sentence.append(current_word)
            idx += 1
        return "".join(new_sentence)