Source code for crate_anon.anonymise.scrub



    Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
    Created by Rudolf Cardinal.

    This file is part of CRATE.

    CRATE is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    CRATE is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with CRATE. If not, see <https://www.gnu.org/licenses/>.


**Scrubber classes for CRATE anonymiser.**


from abc import ABC, abstractmethod
from collections import OrderedDict
import datetime
import logging
import re
import string
from typing import (

    from re import Match

from cardinal_pythonlib.datetimefunc import coerce_to_datetime
from cardinal_pythonlib.file_io import gen_lines_without_comments
from cardinal_pythonlib.hash import GenericHasher
from cardinal_pythonlib.sql.validation import (
from cardinal_pythonlib.text import get_unicode_characters

# from flashtext import KeywordProcessor
from crate_anon.common.bugfix_flashtext import KeywordProcessorFixed

# ... temp bugfix

# noinspection PyPep8Naming
from crate_anon.anonymise.constants import (
    AnonymiseConfigDefaults as DA,
from crate_anon.anonymise.anonregex import (
from crate_anon.common.stringfunc import (

log = logging.getLogger(__name__)

# =============================================================================
# Generic scrubber base class
# =============================================================================

class ScrubberBase(ABC):
    """
    Abstract parent of every scrubber.

    A scrubber can de-identify text (:meth:`scrub`) and can describe itself
    as a hash (:meth:`get_hash`) so that changes to it can be detected.
    """

    def __init__(self, hasher: GenericHasher) -> None:
        """
        Args:
            hasher:
                :class:`GenericHasher` used to hash this scrubber (for
                change-detection purposes); should be a secure hasher
        """
        self.hasher = hasher

    @abstractmethod
    def get_hash(self) -> str:
        """
        Returns a hash of our scrubber -- so we can store it, and later see
        if it's changed. In an incremental update, if the scrubber has
        changed, we should re-anonymise all data for this patient.
        """
        raise NotImplementedError("Implement in derived class")

    @abstractmethod
    def scrub(self, text: str) -> str:
        """
        Returns a scrubbed version of the text.

        Args:
            text: the raw text, potentially containing sensitive information

        Returns:
            the de-identified text
        """
        raise NotImplementedError("Implement in derived class")
# ============================================================================= # WordList # =============================================================================
def lower_case_words_from_file(filename: str) -> Generator[str, None, None]:
    """
    Generates lower-case words from a file.

    Lines are read via ``gen_lines_without_comments`` (with
    ``comment_at_start_only=True``) and split on whitespace; each resulting
    word is yielded in lower case.
    """
    lines = gen_lines_without_comments(filename, comment_at_start_only=True)
    for text_line in lines:
        yield from (token.lower() for token in text_line.split() if token)
def lower_case_phrase_lines_from_file(
    filename: str,
) -> Generator[str, None, None]:
    """
    Generates lower-case phrases from a file, one per line.

    Each line delivered by ``gen_lines_without_comments`` (with
    ``comment_at_start_only=True``) is yielded whole, in lower case.
    """
    # Per the original note: each line arrives pre-stripped (left/right) and
    # is never empty, so no further filtering is done here.
    phrase_lines = gen_lines_without_comments(
        filename, comment_at_start_only=True
    )
    yield from (phrase.lower() for phrase in phrase_lines)
# Characters that FlashText should treat as being *inside* a word (i.e. not
# as a word boundary). Digits, ASCII letters and the underscore are noted in
# the original comments as being part of the FlashText default.
#
# Why add the Latin alphabetic Unicode characters? So e.g. "naïve" isn't
# truncated to "naï[~~~]".
#
# Check:
# FLASHTEXT_WORDCHAR_STR = "".join(sorted(FLASHTEXT_WORD_CHARACTERS))
FLASHTEXT_WORD_CHARACTERS = set(
    "".join(
        (
            string.digits,
            string.ascii_letters,
            "_",
            get_unicode_characters("Latin_Alphabetic"),
        )
    )
)
class WordList(ScrubberBase):
    """
    A scrubber that removes all words in a wordlist, in case-insensitive
    fashion.

    This serves a dual function as an allowlist (is a word in the list?) and a
    denylist (scrub text using the wordlist).
    """

    def __init__(
        self,
        filenames: Iterable[str] = None,
        words: Iterable[str] = None,
        as_phrases: bool = False,
        replacement_text: str = "[---]",
        hasher: GenericHasher = None,
        suffixes: List[str] = None,
        at_word_boundaries_only: bool = True,
        max_errors: int = 0,
        regex_method: bool = False,
    ) -> None:
        """
        Args:
            filenames:
                Filenames to read words from.
            words:
                Additional words to add.
            as_phrases:
                Keep lines in the source file intact (as phrases), rather
                than splitting them into individual words, and (if
                ``regex_method`` is True) scrub as phrases.
            replacement_text:
                Replace sensitive content with this string.
            hasher:
                :class:`GenericHasher` to use to hash this scrubber (for
                change-detection purposes); should be a secure hasher.
            suffixes:
                Append each of these suffixes to each word.
            at_word_boundaries_only:
                Boolean. If set, ensure that the regex begins and ends with a
                word boundary requirement. (If false: will scrub ``ANN`` from
                ``bANNed``, for example.)
            max_errors:
                The maximum number of typographical insertion / deletion /
                substitution errors to permit. Applicable only if
                ``regex_method`` is True.
            regex_method:
                Use regular expressions? If True: slower, but phrase scrubbing
                deals with variable whitespace. If False: much faster (uses
                FlashText), but whitespace is inflexible.

        Raises:
            ValueError:
                if FlashText is selected (``regex_method=False``) but
                ``at_word_boundaries_only`` is False.
        """
        if not regex_method and at_word_boundaries_only is False:
            raise ValueError(
                "FlashText (chosen by regex_method=False) will only work at "
                "word boundaries, but at_word_boundaries_only is False"
            )
        filenames = filenames or []
        words = words or []

        super().__init__(hasher)
        self.replacement_text = replacement_text
        self.as_phrases = as_phrases
        self.suffixes = suffixes or []  # type: List[str]
        self.at_word_boundaries_only = at_word_boundaries_only
        self.max_errors = max_errors
        self.regex_method = regex_method

        self._regex = None  # type: Optional[Pattern[str]]
        self._processor = None  # type: Optional[KeywordProcessorFixed]
        self._cached_hash = None  # type: Optional[str]
        self._built = False

        # Sets are faster than lists for "is x in s" operations.
        self.words = set()  # type: Set[str]
        # The cache is already empty here, so skip clearing it per item.
        # noinspection PyTypeChecker
        for f in filenames:
            self.add_file(f, clear_cache=False)
        # noinspection PyTypeChecker
        for w in words:
            self.add_word(w, clear_cache=False)
        # log.debug(f"Created wordlist with {len(self.words)} words")

    def clear_cache(self) -> None:
        """
        Clear cached information (e.g. the compiled regex, the cached hash of
        this scrubber). The next call to :meth:`scrub` will rebuild.
        """
        self._built = False
        self._regex = None  # type: Optional[Pattern[str]]
        self._processor = None  # type: Optional[KeywordProcessorFixed]
        self._cached_hash = None  # type: Optional[str]

    def add_word(self, word: str, clear_cache: bool = True) -> None:
        """
        Add a word to our wordlist (stored in lower case). Empty/None words
        are ignored.

        Args:
            word: word to add
            clear_cache: also clear our cache?
        """
        if not word:
            return
        self.words.add(word.lower())
        if clear_cache:
            self.clear_cache()

    def add_file(self, filename: str, clear_cache: bool = True) -> None:
        """
        Add all words from a file: whole lines if ``as_phrases`` is set,
        otherwise individual whitespace-separated words.

        Args:
            filename: File to read.
            clear_cache: Also clear our cache?
        """
        if self.as_phrases:
            wordgen = lower_case_phrase_lines_from_file(filename)
        else:
            wordgen = lower_case_words_from_file(filename)
        for w in wordgen:
            self.words.add(w)
        if clear_cache:
            self.clear_cache()

    def contains(self, word: str) -> bool:
        """
        Does our wordlist contain this word? (Case-insensitive.)
        """
        return word.lower() in self.words

    def get_hash(self) -> str:
        # docstring in parent class

        # A set is unordered.
        # We want the hash to be the same if we have the same words, even if
        # they were entered in a different order, so we need to sort:
        if not self._cached_hash:
            self._cached_hash = self.hasher.hash(sorted(self.words))
        return self._cached_hash

    def scrub(self, text: str) -> str:
        # docstring in parent class

        # Build lazily on first use (or after the cache was cleared). Without
        # this call, nothing is ever compiled and the text would pass through
        # unscrubbed.
        if not self._built:
            self.build()

        if self.regex_method:
            if not self._regex:  # possible; e.g. no words
                return text
            return self._regex.sub(self.replacement_text, text)

        if not self._processor:  # no words were supplied
            return text
        return self._processor.replace_keywords(text)

    def _gen_word_and_suffixed(self, w: str) -> Iterable[str]:
        """
        Yields the word supplied plus suffixed versions.
        """
        yield w
        for s in self.suffixes:
            yield w + s

    def build(self) -> None:
        """
        Compiles a high-speed scrubbing device, be it a regex or a FlashText
        processor. Called only when we have collected all our words.
        """
        if self.regex_method:
            elements = []  # type: List[str]
            for w in self.words:
                if self.as_phrases:
                    elements.extend(
                        get_phrase_regex_elements(
                            w,
                            suffixes=self.suffixes,
                            at_word_boundaries_only=self.at_word_boundaries_only,  # noqa: E501
                            max_errors=self.max_errors,
                        )
                    )
                else:
                    elements.extend(
                        get_string_regex_elements(
                            w,
                            suffixes=self.suffixes,
                            at_word_boundaries_only=self.at_word_boundaries_only,  # noqa: E501
                            max_errors=self.max_errors,
                        )
                    )
            log.debug(f"Building regex with {len(elements)} elements")
            self._regex = get_regex_from_elements(elements)
        else:
            if self.words:
                self._processor = KeywordProcessorFixed(case_sensitive=False)
                self._processor.set_non_word_boundaries(
                    FLASHTEXT_WORD_CHARACTERS
                )
                replacement = self.replacement_text
                log.debug(
                    f"Building FlashText processor with "
                    f"{len(self.words)} keywords"
                )
                for w in self.words:
                    for sw in self._gen_word_and_suffixed(w):
                        self._processor.add_keyword(sw, replacement)
            else:
                self._processor = None  # type: Optional[KeywordProcessorFixed]
        self._built = True
# ============================================================================= # NonspecificScrubber # =============================================================================
class Replacer:
    """
    Custom regex replacement called from regex.sub().

    This base class is deliberately trivial: using it is the equivalent of
    just passing the replacement text to regex.sub().
    """

    def __init__(self, replacement_text: str) -> None:
        """
        Args:
            replacement_text: the fixed text that replaces every match
        """
        self.replacement_text = replacement_text

    def replace(self, match: "Match") -> str:
        """
        Replacement callback. When re.sub() or regex.sub() is called, the
        "repl" argument can be a function; if so, it takes a
        :class:`re.Match` argument and returns the replacement text.

        Here the match is ignored and the fixed replacement text is returned.
        """
        return self.replacement_text
class NonspecificReplacer(Replacer):
    """
    Custom regex replacement for the Nonspecific scrubber. Currently this will
    "blur" dates if replacement_text_all_dates contains any formatting
    directives.
    """

    def __init__(self, replacement_text: str, replacement_text_all_dates: str):
        """
        Args:
            replacement_text:
                Generic text to use.
            replacement_text_all_dates:
                Replacement text to use if the matched text is a date. Can
                include format specifiers to blur the date rather than
                scrubbing it out entirely.
        """
        super().__init__(replacement_text)

        self.replacement_text_all_dates = replacement_text_all_dates
        # Only parse the matched date if the replacement text actually
        # contains a "%" directive; otherwise it is used verbatim.
        self.slow_date_replacement = "%" in replacement_text_all_dates

    def replace(self, match: "Match") -> str:
        """
        Replacement callback for regex.sub().

        Non-date matches get the generic replacement text. Date matches get
        the date replacement text, passed through ``strftime`` with the
        parsed date when it contains formatting directives.
        """
        groupdict = match.groupdict()
        if not self.is_a_date(groupdict):
            return super().replace(match)

        if self.slow_date_replacement:
            date = self.parse_date(match, groupdict)
            return date.strftime(self.replacement_text_all_dates)

        return self.replacement_text_all_dates

    @staticmethod
    def is_a_date(groupdict: Dict[str, Any]) -> bool:
        """
        Is the match result a date? We detect this via our named regex groups.
        """
        return any(
            groupdict.get(groupname) is not None
            for groupname in (
                DateRegexNames.DAY_MONTH_YEAR,
                DateRegexNames.MONTH_DAY_YEAR,
                DateRegexNames.YEAR_MONTH_DAY,
                DateRegexNames.ISODATE_NO_SEP,
            )
        )

    @staticmethod
    def parse_date(
        match: "Match", groupdict: Dict[str, Any]
    ) -> datetime.datetime:
        """
        Retrieve a valid date from the Match object for blurring.

        Valid regex group name combinations, where D == DateRegexNames:

        - D.ISODATE_NO_SEP: parsed directly as ``YYYYMMDD``.
        - D.DAY_MONTH_YEAR, D.MONTH_DAY_YEAR, D.YEAR_MONTH_DAY: each comes
          with D.NUMERIC_DAY, plus either D.NUMERIC_MONTH or
          D.ALPHABETICAL_MONTH, plus either D.TWO_DIGIT_YEAR or
          D.FOUR_DIGIT_YEAR.

        Args:
            match: the regex match (unused; the information is taken from
                ``groupdict``)
            groupdict: the match's named groups

        Returns:
            the date as a :class:`datetime.datetime`

        Raises:
            ValueError: from ``strptime`` / ``datetime`` if the components do
                not form a valid date
        """
        # Simple special handling for ISO date format without separators.
        isodate_no_sep = groupdict.get(DateRegexNames.ISODATE_NO_SEP)
        if isodate_no_sep is not None:
            return datetime.datetime.strptime(isodate_no_sep, "%Y%m%d")

        # For all others, extract D/M/Y information.
        year = groupdict.get(DateRegexNames.FOUR_DIGIT_YEAR)
        if year is None:
            two_digit_year = groupdict.get(DateRegexNames.TWO_DIGIT_YEAR)
            # Will convert:
            #   00-68 -> 2000-2068
            #   69-99 -> 1969-1999
            year = datetime.datetime.strptime(two_digit_year, "%y").year

        numeric_day = groupdict.get(DateRegexNames.NUMERIC_DAY)
        numeric_month = groupdict.get(DateRegexNames.NUMERIC_MONTH)
        if numeric_month is None:
            # Month was given by name; look it up by its first three letters.
            three_letter_month = groupdict.get(
                DateRegexNames.ALPHABETICAL_MONTH
            )[:3]
            numeric_month = MONTH_3_LETTER_INDEX.get(three_letter_month)

        return datetime.datetime(
            int(year), int(numeric_month), int(numeric_day)
        )
class NonspecificScrubber(ScrubberBase):
    """
    Scrubs a bunch of things that are independent of any patient-specific data,
    such as removing all UK postcodes, or numbers of a certain length.
    """

    def __init__(
        self,
        hasher: GenericHasher,
        replacement_text: str = DA.REPLACE_NONSPECIFIC_INFO_WITH,
        anonymise_codes_at_word_boundaries_only: bool = DA.ANONYMISE_CODES_AT_WORD_BOUNDARIES_ONLY,  # noqa
        anonymise_dates_at_word_boundaries_only: bool = DA.ANONYMISE_DATES_AT_WORD_BOUNDARIES_ONLY,  # noqa
        anonymise_numbers_at_word_boundaries_only: bool = DA.ANONYMISE_NUMBERS_AT_WORD_BOUNDARIES_ONLY,  # noqa
        denylist: WordList = None,
        scrub_all_numbers_of_n_digits: List[int] = None,
        scrub_all_uk_postcodes: bool = DA.SCRUB_ALL_UK_POSTCODES,
        scrub_all_dates: bool = DA.SCRUB_ALL_DATES,
        replacement_text_all_dates: str = DA.REPLACE_ALL_DATES_WITH,
        scrub_all_email_addresses: bool = DA.SCRUB_ALL_EMAIL_ADDRESSES,
        extra_regexes: Optional[List[str]] = None,
    ) -> None:
        """
        Args:
            replacement_text:
                Replace sensitive content with this string.
            hasher:
                :class:`GenericHasher` to use to hash this scrubber (for
                change-detection purposes); should be a secure hasher
            anonymise_codes_at_word_boundaries_only:
                For codes: Boolean. Ensure that the regex begins and ends with
                a word boundary requirement.
            anonymise_dates_at_word_boundaries_only:
                Scrub dates only if they occur at word boundaries. (Even if
                you say no, there are *some* restrictions or very odd things
                would happen; see
                :func:`crate_anon.anonymise.anonregex.get_generic_date_regex_elements`.)
            anonymise_numbers_at_word_boundaries_only:
                For numbers: Boolean. If set, ensure that the regex begins and
                ends with a word boundary requirement. If not set, the regex
                must be surrounded by non-digits. (If it were surrounded by
                more digits, it wouldn't be an n-digit number!)
            denylist:
                Words to scrub.
            scrub_all_numbers_of_n_digits:
                List of values of n; number lengths to scrub.
            scrub_all_uk_postcodes:
                Scrub all UK postcodes?
            scrub_all_dates:
                Scrub all dates? (Currently assumes the default locale for
                month names and ordinal suffixes.)
            replacement_text_all_dates:
                When scrub_all_dates is True, replace with this text.
                Supports limited datetime.strftime directives for "blurring"
                of dates. Example: "%b %Y" for abbreviated month and year.
            scrub_all_email_addresses:
                Scrub all e-mail addresses?
            extra_regexes:
                List of user-defined extra regexes to scrub.

        Raises:
            ValueError: if ``replacement_text_all_dates`` is not acceptable;
                see :meth:`check_replacement_text_all_dates`.
        """
        scrub_all_numbers_of_n_digits = scrub_all_numbers_of_n_digits or []

        super().__init__(hasher)
        self.replacement_text = replacement_text
        self.anonymise_codes_at_word_boundaries_only = (
            anonymise_codes_at_word_boundaries_only
        )
        self.anonymise_dates_at_word_boundaries_only = (
            anonymise_dates_at_word_boundaries_only
        )
        self.anonymise_numbers_at_word_boundaries_only = (
            anonymise_numbers_at_word_boundaries_only
        )
        self.denylist = denylist
        self.scrub_all_numbers_of_n_digits = scrub_all_numbers_of_n_digits
        self.scrub_all_uk_postcodes = scrub_all_uk_postcodes
        self.scrub_all_dates = scrub_all_dates
        self.replacement_text_all_dates = replacement_text_all_dates
        # Validate before building the replacer, which relies on the format.
        self.check_replacement_text_all_dates()
        self.replacer = self.get_replacer()
        self.scrub_all_email_addresses = scrub_all_email_addresses
        self.extra_regexes = extra_regexes

        self._cached_hash = None  # type: Optional[str]
        self._regex = None  # type: Optional[Pattern[str]]
        self._regex_built = False
        self.build_regex()

    def get_replacer(self) -> Replacer:
        """
        Return an object whose ``replace`` method can be used as the "repl"
        (replacer) argument to a re.sub() or regex.sub() call.
        """
        if (
            self.replacement_text == self.replacement_text_all_dates
            and "%" not in self.replacement_text_all_dates
        ):
            # Fast, simple
            return Replacer(self.replacement_text)

        # Handle dates in a more complex way, e.g. blurring them:
        return NonspecificReplacer(
            self.replacement_text, self.replacement_text_all_dates
        )

    def check_replacement_text_all_dates(self) -> None:
        """
        Ensure our date-replacement text is legitimate in terms of e.g.
        "%Y"-style directives.

        Raises:
            ValueError: if the text contains a "%" followed by a character
                that is not in ``DATE_BLURRING_DIRECTIVES``, or if
                ``strftime`` rejects it.
        """
        bad = False
        possible_percent_chars = "".join(DATE_BLURRING_DIRECTIVES)
        if re.search(
            rf"%[^{possible_percent_chars}]", self.replacement_text_all_dates
        ):
            bad = True
        else:
            # Double-check by formatting an arbitrary valid date:
            test_date = datetime.date(2000, 12, 31)
            try:
                test_date.strftime(self.replacement_text_all_dates)
            except ValueError:
                bad = True
        if bad:
            raise ValueError(
                f"Bad format {self.replacement_text_all_dates!r} for date "
                "scrubbing. Allowed directives are: "
                f"{DATE_BLURRING_DIRECTIVES_CSV}"
            )

    def get_hash(self) -> str:
        # docstring in parent class
        if not self._cached_hash:
            self._cached_hash = self.hasher.hash(
                [
                    # signature, used for hashing:
                    self.anonymise_codes_at_word_boundaries_only,
                    self.anonymise_numbers_at_word_boundaries_only,
                    self.denylist.get_hash() if self.denylist else None,
                    self.scrub_all_numbers_of_n_digits,
                    self.scrub_all_uk_postcodes,
                    # The remaining settings also change the scrubbed output,
                    # so a change to any of them must change the hash; else
                    # an incremental update would not re-scrub existing data.
                    # NOTE: adding these alters the hash relative to earlier
                    # versions, so expect one re-scrub after upgrading.
                    self.anonymise_dates_at_word_boundaries_only,
                    self.scrub_all_dates,
                    self.replacement_text_all_dates,
                    self.scrub_all_email_addresses,
                    self.extra_regexes,
                    self.replacement_text,
                ]
            )
        return self._cached_hash

    def scrub(self, text: str) -> str:
        # docstring in parent class
        if not self._regex_built:
            self.build_regex()

        # The denylist (if any) is applied first, then the generic regex.
        if self.denylist:
            text = self.denylist.scrub(text)
        if not self._regex:  # possible; may be blank
            return text
        return self._regex.sub(self.replacer.replace, text)

    def build_regex(self) -> None:
        """
        Compile our high-speed regex from whichever options are enabled:
        UK postcodes, n-digit numbers, dates, e-mail addresses, and any
        user-defined extra regexes.
        """
        elements = []  # type: List[str]
        if self.scrub_all_uk_postcodes:
            elements.extend(
                get_uk_postcode_regex_elements(
                    at_word_boundaries_only=(
                        self.anonymise_codes_at_word_boundaries_only
                    )
                )
            )
        # noinspection PyTypeChecker
        for n in self.scrub_all_numbers_of_n_digits:
            elements.extend(
                get_number_of_length_n_regex_elements(
                    n,
                    at_word_boundaries_only=(
                        self.anonymise_numbers_at_word_boundaries_only
                    ),
                )
            )
        if self.scrub_all_dates:
            elements.extend(
                get_generic_date_regex_elements(
                    at_word_boundaries_only=self.anonymise_dates_at_word_boundaries_only  # noqa
                )
            )
        if self.scrub_all_email_addresses:
            elements.append(EMAIL_REGEX_STR)
        if self.extra_regexes:
            elements.extend(self.extra_regexes)
        self._regex = get_regex_from_elements(elements)
        self._regex_built = True
# ============================================================================= # PersonalizedScrubber # =============================================================================
class PersonalizedScrubber(ScrubberBase):
    """
    Accepts patient-specific (patient and third-party) information, and uses
    that to scrub text.
    """

    def __init__(
        self,
        hasher: GenericHasher,
        replacement_text_patient: str = DA.REPLACE_PATIENT_INFO_WITH,
        replacement_text_third_party: str = DA.REPLACE_THIRD_PARTY_INFO_WITH,  # noqa
        anonymise_codes_at_word_boundaries_only: bool = DA.ANONYMISE_CODES_AT_WORD_BOUNDARIES_ONLY,  # noqa
        anonymise_codes_at_numeric_boundaries_only: bool = DA.ANONYMISE_CODES_AT_NUMERIC_BOUNDARIES_ONLY,  # noqa
        anonymise_dates_at_word_boundaries_only: bool = DA.ANONYMISE_DATES_AT_WORD_BOUNDARIES_ONLY,  # noqa
        anonymise_numbers_at_word_boundaries_only: bool = DA.ANONYMISE_NUMBERS_AT_WORD_BOUNDARIES_ONLY,  # noqa
        anonymise_numbers_at_numeric_boundaries_only: bool = DA.ANONYMISE_NUMBERS_AT_NUMERIC_BOUNDARIES_ONLY,  # noqa
        anonymise_strings_at_word_boundaries_only: bool = DA.ANONYMISE_STRINGS_AT_WORD_BOUNDARIES_ONLY,  # noqa
        min_string_length_for_errors: int = DA.MIN_STRING_LENGTH_FOR_ERRORS,  # noqa
        min_string_length_to_scrub_with: int = DA.MIN_STRING_LENGTH_TO_SCRUB_WITH,  # noqa
        scrub_string_suffixes: List[str] = None,
        string_max_regex_errors: int = DA.STRING_MAX_REGEX_ERRORS,
        allowlist: WordList = None,
        alternatives: List[List[str]] = None,
        nonspecific_scrubber: NonspecificScrubber = None,
        nonspecific_scrubber_first: bool = DA.NONSPECIFIC_SCRUBBER_FIRST,
        debug: bool = False,
    ) -> None:
        """
        Args:
            hasher:
                :class:`GenericHasher` to use to hash this scrubber (for
                change-detection purposes); should be a secure hasher.
            replacement_text_patient:
                Replace sensitive "patient" content with this string.
            replacement_text_third_party:
                Replace sensitive "third party" content with this string.
            anonymise_codes_at_word_boundaries_only:
                For codes: Boolean. Ensure that the regex begins and ends with
                a word boundary requirement.
            anonymise_codes_at_numeric_boundaries_only:
                For codes: Boolean. Only applicable if
                anonymise_codes_at_word_boundaries_only is False. Ensure that
                the code is only recognized when surrounded by non-numbers;
                that is, only at the boundaries of numbers (at numeric
                boundaries). See
                :func:`crate_anon.anonymise.anonregex.get_code_regex_elements`.
            anonymise_dates_at_word_boundaries_only:
                For dates: Boolean. Ensure that the regex begins and ends with
                a word boundary requirement.
            anonymise_numbers_at_word_boundaries_only:
                For numbers: Boolean. Ensure that the regex begins and ends
                with a word boundary requirement. See
                :func:`crate_anon.anonymise.anonregex.get_code_regex_elements`.
            anonymise_numbers_at_numeric_boundaries_only:
                For numbers: Boolean. Only applicable if
                anonymise_numbers_at_word_boundaries_only is False. Ensure
                that the number is only recognized when surrounded by
                non-numbers; that is, only at the boundaries of numbers (at
                numeric boundaries). See
                :func:`crate_anon.anonymise.anonregex.get_code_regex_elements`.
            anonymise_strings_at_word_boundaries_only:
                For strings: Boolean. Ensure that the regex begins and ends
                with a word boundary requirement.
            min_string_length_for_errors:
                For strings: minimum string length at which typographical
                errors will be permitted.
            min_string_length_to_scrub_with:
                For strings: minimum string length at which the string will be
                permitted to be scrubbed with.
            scrub_string_suffixes:
                A list of suffixes to permit on strings.
            string_max_regex_errors:
                The maximum number of typographical insertion / deletion /
                substitution errors to permit.
            allowlist:
                :class:`WordList` of words to allow (not to scrub).
            alternatives:
                This allows words to be substituted by equivalents; such as
                ``St`` for ``Street`` or ``Rd`` for ``Road``. The parameter is
                a list of lists of equivalents; see
                :func:`crate_anon.anonymise.config.get_word_alternatives`.
            nonspecific_scrubber:
                :class:`NonspecificScrubber` to apply to remove information
                that is generic.
            nonspecific_scrubber_first:
                If one is provided, run the nonspecific scrubber first (rather
                than last)?
            debug:
                Show the final scrubber regex text as we compile our regexes.
        """
        scrub_string_suffixes = scrub_string_suffixes or []

        super().__init__(hasher)
        self.replacement_text_patient = replacement_text_patient
        self.replacement_text_third_party = replacement_text_third_party
        self.anonymise_codes_at_word_boundaries_only = (
            anonymise_codes_at_word_boundaries_only
        )
        self.anonymise_codes_at_numeric_boundaries_only = (
            anonymise_codes_at_numeric_boundaries_only
        )
        self.anonymise_dates_at_word_boundaries_only = (
            anonymise_dates_at_word_boundaries_only
        )
        self.anonymise_numbers_at_word_boundaries_only = (
            anonymise_numbers_at_word_boundaries_only
        )
        self.anonymise_numbers_at_numeric_boundaries_only = (
            anonymise_numbers_at_numeric_boundaries_only
        )
        self.anonymise_strings_at_word_boundaries_only = (
            anonymise_strings_at_word_boundaries_only
        )
        self.min_string_length_for_errors = min_string_length_for_errors
        self.min_string_length_to_scrub_with = min_string_length_to_scrub_with
        self.scrub_string_suffixes = scrub_string_suffixes
        self.string_max_regex_errors = string_max_regex_errors
        self.allowlist = allowlist
        self.alternatives = alternatives
        self.nonspecific_scrubber = nonspecific_scrubber
        self.nonspecific_scrubber_first = nonspecific_scrubber_first
        self.debug = debug

        # Regex information
        self.re_patient = None  # type: Optional[Pattern[str]]
        self.re_tp = None  # type: Optional[Pattern[str]]
        self.regexes_built = False
        self.re_patient_elements = []  # type: List[str]
        self.re_tp_elements = []  # type: List[str]
        # ... both changed from set to list to reflect referee's point re
        #     potential importance of scrubber order
        self.elements_tuplelist = (
            []
        )  # type: List[Tuple[bool, ScrubMethod, str]]
        # ... list of tuples: (patient?, type, value)
        # ... used for get_raw_info(); since we've made the order important,
        #     we should detect changes in order here as well
        self.clear_cache()

    def clear_cache(self) -> None:
        """
        Clear the internal cache (the compiled regex); the regexes will be
        rebuilt on the next call to :meth:`scrub`.
        """
        self.regexes_built = False

    @staticmethod
    def get_scrub_method(
        datatype_long: str, scrub_method: Optional[ScrubMethod]
    ) -> ScrubMethod:
        """
        Return the default scrub method for a given SQL datatype, unless
        overridden. For example, dates are scrubbed via a date method; numbers
        by a numeric method.

        Args:
            datatype_long: SQL datatype as a string
            scrub_method: optional method to enforce

        Returns:
            :class:`crate_anon.anonymise.constants.SCRUBMETHOD` value
        """
        if scrub_method is not None:
            return scrub_method
        elif is_sqltype_date(datatype_long):
            return ScrubMethod.DATE
        elif is_sqltype_text_over_one_char(datatype_long):
            return ScrubMethod.WORDS
        else:
            return ScrubMethod.NUMERIC

    def add_value(
        self,
        value: Any,
        scrub_method: ScrubMethod,
        patient: bool = True,
        clear_cache: bool = True,
    ) -> None:
        """
        Add a specific value via a specific scrub_method.

        Args:
            value:
                value to add to the scrubber; ``None`` is ignored
            scrub_method:
                :class:`crate_anon.anonymise.constants.SCRUBMETHOD` value
            patient:
                Boolean; controls whether it's treated as a patient value or
                a third-party value.
            clear_cache:
                also clear our cache?

        Raises:
            ValueError: if ``scrub_method`` is not a known method
        """
        if value is None:
            return
        new_tuple = (patient, scrub_method, repr(value))
        if new_tuple not in self.elements_tuplelist:
            self.elements_tuplelist.append(new_tuple)
        # Note: object reference
        r = self.re_patient_elements if patient else self.re_tp_elements
        if scrub_method is ScrubMethod.DATE:
            elements = self.get_elements_date(value)
        elif scrub_method is ScrubMethod.WORDS:
            elements = self.get_elements_words(value)
        elif scrub_method is ScrubMethod.PHRASE:
            elements = self.get_elements_phrase(value)
        elif scrub_method is ScrubMethod.PHRASE_UNLESS_NUMERIC:
            elements = self.get_elements_phrase_unless_numeric(value)
        elif scrub_method is ScrubMethod.NUMERIC:
            elements = self.get_elements_numeric(value)
        elif scrub_method is ScrubMethod.CODE:
            elements = self.get_elements_code(value)
        else:
            raise ValueError(
                f"Bug: unknown scrub_method to add_value: " f"{scrub_method}"
            )
        # Guard against a helper yielding nothing (None or empty), e.g. for
        # an unparseable date; extending with None would raise TypeError.
        if elements:
            r.extend(elements)
        if clear_cache:
            self.clear_cache()

    def get_elements_date(
        self, value: Union[datetime.datetime, datetime.date]
    ) -> List[str]:
        """
        Returns a list of regex elements for a given date value.

        If the value cannot be coerced to a datetime, a warning is logged and
        an empty list is returned (so the value contributes nothing to the
        scrubber, rather than crashing the caller).
        """
        try:
            value = coerce_to_datetime(value)
        except Exception as e:
            log.warning(
                f"Invalid date received to PersonalizedScrubber. "
                f"get_elements_date(): value={value}, exception={e}"
            )
            return []
        return get_date_regex_elements(
            value,
            at_word_boundaries_only=(
                self.anonymise_dates_at_word_boundaries_only
            ),
        )

    def get_elements_words(self, value: str) -> List[str]:
        """
        Returns a list of regex elements for a given string that contains
        textual words. Fragments that are too short, or that are in the
        allowlist, are skipped.
        """
        elements = []  # type: List[str]
        for s in get_anon_fragments_from_string(str(value)):
            length = len(s)
            if length < self.min_string_length_to_scrub_with:
                # With numbers: if you use the length limit, you may see
                # numeric parts of addresses, e.g. 4 Drury Lane as
                # 4 [___] [___]. However, if you exempt numbers then you
                # mess up a whole bunch of quantitative information, such
                # as "the last 4-5 years" getting wiped to "the last
                # [___]-5 years". So let's apply the length limit
                # consistently.
                continue
            if self.allowlist and self.allowlist.contains(s):
                continue
            # Typographical errors are only tolerated in longer strings.
            if length >= self.min_string_length_for_errors:
                max_errors = self.string_max_regex_errors
            else:
                max_errors = 0
            elements.extend(
                get_string_regex_elements(
                    s,
                    self.scrub_string_suffixes,
                    max_errors=max_errors,
                    at_word_boundaries_only=(
                        self.anonymise_strings_at_word_boundaries_only
                    ),
                )
            )
        return elements

    def get_elements_phrase(self, value: Any) -> List[str]:
        """
        Returns a list of regex elements for a given phrase. Returns an empty
        list if the phrase is blank, too short, or in the allowlist.
        """
        value = str(value).strip()
        if not value:
            return []
        length = len(value)
        if length < self.min_string_length_to_scrub_with:
            return []
        if self.allowlist and self.allowlist.contains(value):
            return []
        # Typographical errors are only tolerated in longer strings.
        if length >= self.min_string_length_for_errors:
            max_errors = self.string_max_regex_errors
        else:
            max_errors = 0
        return get_phrase_regex_elements(
            value,
            max_errors=max_errors,
            at_word_boundaries_only=(
                self.anonymise_strings_at_word_boundaries_only
            ),
            alternatives=self.alternatives,
        )

    def get_elements_phrase_unless_numeric(self, value: Any) -> List[str]:
        """
        If the value is numeric, return an empty list. Otherwise, returns a
        list of regex elements for the given phrase.

        Strings such as ``"nan"``, ``"inf"`` and ``"infinity"`` are accepted
        by ``float()`` but are words (and possibly names), not numbers, so
        they are treated as phrases.
        """
        import math

        try:
            number = float(value)
        except (TypeError, ValueError):
            return self.get_elements_phrase(value)
        if isinstance(value, str) and not math.isfinite(number):
            # A textual "NaN"/"Inf": scrub it as a phrase rather than
            # silently skipping it.
            return self.get_elements_phrase(value)
        return []

    def get_elements_numeric(self, value: Any) -> List[str]:
        """
        Start with a number. Remove everything but the digits. Build a regex
        that scrubs the number.

        Particular examples: phone numbers, e.g. ``"(01223) 123456"``.

        Args:
            value: a string containing a number, or an actual number.

        Returns:
            a list of regex elements
        """
        return get_code_regex_elements(
            get_digit_string_from_vaguely_numeric_string(str(value)),
            at_word_boundaries_only=(
                self.anonymise_numbers_at_word_boundaries_only
            ),
            at_numeric_boundaries_only=(
                self.anonymise_numbers_at_numeric_boundaries_only
            ),
        )

    def get_elements_code(self, value: Any) -> List[str]:
        """
        Start with an alphanumeric code. Remove whitespace. Build a regex that
        scrubs the code.

        Particular examples: postcodes, e.g. ``"PE12 3AB"``.

        Args:
            value: a string containing an alphanumeric code

        Returns:
            a list of regex elements
        """
        return get_code_regex_elements(
            reduce_to_alphanumeric(str(value)),
            at_word_boundaries_only=(
                self.anonymise_codes_at_word_boundaries_only
            ),
            at_numeric_boundaries_only=(
                self.anonymise_codes_at_numeric_boundaries_only
            ),
        )

    def get_patient_regex_string(self) -> str:
        """
        Return the string version of the patient regex, sorted.
        """
        return get_regex_string_from_elements(self.re_patient_elements)

    def get_tp_regex_string(self) -> str:
        """
        Return the string version of the third-party regex, sorted.
        """
        return get_regex_string_from_elements(self.re_tp_elements)

    def build_regexes(self) -> None:
        """
        Compile our regexes.
        """
        self.re_patient = get_regex_from_elements(self.re_patient_elements)
        self.re_tp = get_regex_from_elements(self.re_tp_elements)
        self.regexes_built = True
        # Note that the regexes themselves may be None even if they have
        # been built.
        if self.debug:
            log.debug(f"Patient scrubber: {self.get_patient_regex_string()}")
            log.debug(f"Third party scrubber: {self.get_tp_regex_string()}")

    def scrub(self, text: str) -> Optional[str]:
        # docstring in parent class
        if text is None:
            return None
        if not self.regexes_built:
            self.build_regexes()

        # If nonspecific_scrubber_first:
        #   (1) nonspecific, (2) patient, (3) third party.
        # Otherwise:
        #   (1) patient, (2) third party, (3) nonspecific.

        if self.nonspecific_scrubber and self.nonspecific_scrubber_first:
            text = self.nonspecific_scrubber.scrub(text)
        if self.re_patient:
            text = self.re_patient.sub(self.replacement_text_patient, text)
        if self.re_tp:
            text = self.re_tp.sub(self.replacement_text_third_party, text)
        if self.nonspecific_scrubber and not self.nonspecific_scrubber_first:
            text = self.nonspecific_scrubber.scrub(text)
        return text

    def get_hash(self) -> str:
        # docstring in parent class
        return self.hasher.hash(self.get_raw_info())

    def get_raw_info(self) -> Dict[str, Any]:
        """
        Summarizes settings and (sensitive) data for this scrubber.

        This is both a summary for debugging and the basis for our
        change-detection hash (and for the latter reason we need order etc. to
        be consistent). For any information we put in here, changes will cause
        data to be re-scrubbed.

        Note that the hasher should be a secure one, because this is sensitive
        information.
        """
        # We use a list of tuples to make an OrderedDict.
        d = (
            (
                "anonymise_codes_at_word_boundaries_only",
                self.anonymise_codes_at_word_boundaries_only,
            ),
            (
                "anonymise_codes_at_numeric_boundaries_only",
                self.anonymise_codes_at_numeric_boundaries_only,
            ),
            (
                "anonymise_dates_at_word_boundaries_only",
                self.anonymise_dates_at_word_boundaries_only,
            ),
            (
                "anonymise_numbers_at_word_boundaries_only",
                self.anonymise_numbers_at_word_boundaries_only,
            ),
            (
                "anonymise_numbers_at_numeric_boundaries_only",
                self.anonymise_numbers_at_numeric_boundaries_only,
            ),
            (
                "anonymise_strings_at_word_boundaries_only",
                self.anonymise_strings_at_word_boundaries_only,
            ),
            (
                "min_string_length_for_errors",
                self.min_string_length_for_errors,
            ),
            (
                "min_string_length_to_scrub_with",
                self.min_string_length_to_scrub_with,
            ),
            ("scrub_string_suffixes", sorted(self.scrub_string_suffixes)),
            ("string_max_regex_errors", self.string_max_regex_errors),
            (
                "allowlist_hash",
                self.allowlist.get_hash() if self.allowlist else None,
            ),
            (
                "nonspecific_scrubber_hash",
                (
                    self.nonspecific_scrubber.get_hash()
                    if self.nonspecific_scrubber
                    else None
                ),
            ),
            ("elements", self.elements_tuplelist),
        )
        return OrderedDict(d)