# Source code for crate_anon.anonymise.anonregex

"""
crate_anon/anonymise/anonregex.py

===============================================================================

    Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
    Created by Rudolf Cardinal (rnc1001@cam.ac.uk).

    This file is part of CRATE.

    CRATE is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    CRATE is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with CRATE. If not, see <https://www.gnu.org/licenses/>.

===============================================================================

**Regular expression functions for anonymisation.**

"""

# =============================================================================
# Imports
# =============================================================================

import calendar
import datetime
import logging
from typing import Iterable, List, Optional, Pattern, Union

from cardinal_pythonlib.lists import unique_list

# https://pypi.python.org/pypi/regex/
# https://bitbucket.org/mrabarnett/mrab-regex
import regex  # sudo apt-get install python-regex

# noinspection PyProtectedMember
from regex import _regex_core

from crate_anon.common.regex_helpers import (
    assert_alphabetical,
    AT_LEAST_ONE_NONWORD,
    escape_literal_for_regex_giving_charlist,
    escape_literal_string_for_regex,
    first_n_characters_required,
    named_capture_group,
    NON_ALPHANUMERIC_SPLITTERS,
    noncapture_group,
    NOT_DIGIT_LOOKAHEAD,
    NOT_DIGIT_LOOKBEHIND,
    OPTIONAL_NON_NEWLINE_WHITESPACE,
    optional_noncapture_group,
    OPTIONAL_NONWORD,
    WORD_BOUNDARY as WB,
)

# Module-level logger, named after this module.
log = logging.getLogger(__name__)


# =============================================================================
# Constants
# =============================================================================

# Suffixes that make an English number ordinal: 1st, 2nd, 3rd, 4th...
ORDINAL_SUFFIXES_ENGLISH = ("st", "nd", "rd", "th")

# The twelve month names, in calendar order. calendar.month_name keeps an
# empty placeholder at index 0, which the slice skips.
# https://docs.python.org/3/library/calendar.html
MONTHS_ENGLISH = tuple(calendar.month_name[1:])

# Flags passed to regex.compile() by get_regex_from_elements() in this module:
# - IGNORECASE: match regardless of letter case;
# - UNICODE: Unicode-aware character classes;
# - VERBOSE: unescaped whitespace in the pattern is ignored, so multi-line,
#   indented pattern strings are permitted;
# - MULTILINE: ^ and $ also match at line boundaries.
REGEX_COMPILE_FLAGS = (
    regex.IGNORECASE | regex.UNICODE | regex.VERBOSE | regex.MULTILINE
)

EMAIL_REGEX_STR = (
    # Regex source text for recognizing e-mail addresses.
    #
    # http://emailregex.com/
    # The simple Python example doesn't cope with "r&d@somewhere.nhs.uk".
    # The "full" version is:
    #
    # Structure of the pattern below:
    # - local part: dot-separated runs of permitted characters, OR a
    #   double-quoted string;
    # - a literal "@";
    # - domain: dot-separated host labels, OR a square-bracketed literal
    #   (dotted numeric address, or "tag:" followed by content).
    #
    # The pattern is laid out over several indented lines and its character
    # classes list only lower-case letters, so it presumably has to be compiled
    # in verbose, case-insensitive mode (e.g. with REGEX_COMPILE_FLAGS) --
    # confirm at the point of use.
    #
    # NOTE(review): the bracketed-literal character class contains the range
    # \x53-\x7f, whereas the corresponding quoted-string class uses \x5d-\x7f.
    # This is reproduced as published; confirm whether \x5d was intended.
    r"""
(?:
    [a-z0-9!#$%&'*+/=?^_`{|}~-]+
    (?:
        \.[a-z0-9!#$%&'*+/=?^_`{|}~-]+
    )*|
    "
    (?:
        [\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|
        \\
        [\x01-\x09\x0b\x0c\x0e-\x7f]
    )*
    "
)
@
(?:
    (?:
        [a-z0-9]
        (?:
            [a-z0-9-]*
            [a-z0-9]
        )?
        \.
    )+
    [a-z0-9]
    (?:
        [a-z0-9-]*
        [a-z0-9]
    )?
    |
    \[
    (?:
        (?:
            25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?
        )
        \.
    ){3}
    (?:
        25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?
        |
        [a-z0-9-]*[a-z0-9]:
        (?:
            [\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]
            |
            \\[\x01-\x09\x0b\x0c\x0e-\x7f]
        )+
    )
    \]
)

"""
)


# =============================================================================
# String manipulation
# =============================================================================


def get_anon_fragments_from_string(s: str) -> List[str]:
    """
    Split a composite string (such as a name or an address whose components
    are separated by spaces, commas, etc.) into the substrings that should
    each be anonymised.

    Examples:

    - ``"John Smith"`` gives ``["John", "Smith"]``;
    - ``"John D'Souza"`` gives ``["John", "D", "Souza"]``;
    - ``"42 West Street"`` gives ``["42", "West", "Street"]``.

    Try these:

    .. code-block:: python

        get_anon_fragments_from_string("Bob D'Souza")
        get_anon_fragments_from_string("Jemima Al-Khalaim")
        get_anon_fragments_from_string("47 Russell Square")

    Notes:

    - This is a LIBERAL algorithm, i.e. one prone to anonymise too much (e.g.
      all instances of ``"Street"`` if someone has that as part of their
      address).
    - *Replacement uses the "word boundary" facility, which treats apostrophes
      and hyphens as word boundaries.* The largest-level chunks, such as
      ``D'Souza``, are therefore not needed.

    Args:
        s: The string to split.

    Returns:
        the non-empty fragments, in order of appearance
    """
    pieces = NON_ALPHANUMERIC_SPLITTERS.split(s)
    return list(filter(None, pieces))
# The filter(None, ...) aspect removes empty strings, e.g. from
# leading/trailing whitespace.


# =============================================================================
# Anonymisation regexes
# =============================================================================

# -----------------------------------------------------------------------------
# Dates
# -----------------------------------------------------------------------------


def _month_word_regex_fragment(month_name: str) -> str:
    """
    Build the regex fragment for a month written as a word.

    The first three characters are mandatory and the remainder is optional,
    so ``September`` becomes ``Sep(?:tember)?`` -- or indeed anything between
    three characters and the full name, e.g. ``Sept``.

    Args:
        month_name: The full month name.

    Returns:
        the regex fragment
    """
    n_required = 3  # e.g. "Sep" is the shortest accepted form of "September"
    return first_n_characters_required(month_name, n_required)
def get_date_regex_elements(
    dt: Union[datetime.datetime, datetime.date],
    at_word_boundaries_only: bool = False,
    ordinal_suffixes: Iterable[str] = ORDINAL_SUFFIXES_ENGLISH,
) -> List[str]:
    """
    Takes a datetime object and returns a list of regex strings with which to
    scrub that specific date.

    For example, a date/time of 13 Sep 2014 will produce regexes that
    recognize "13 Sep 2014", "September 13, 2014", "2014/09/13", and many
    more.

    Args:
        dt:
            The datetime or date or similar object.
        at_word_boundaries_only:
            Ensure that all regexes begin and end with a word boundary
            requirement.
        ordinal_suffixes:
            Language-specific suffixes that may be appended to numbers to make
            them ordinal. In English, "st", "nd", "rd", and "th". Any iterable
            of strings is accepted (including one-shot iterators), but not a
            bare string.

    Returns:
        the list of regular expression strings, as above: day-month-year,
        month-day-year, and year-month-day variants, in that order

    Raises:
        AssertionError: if ``ordinal_suffixes`` is a plain string, or fails
            the alphabetical check.
    """
    # A bare string would be treated as a sequence of single characters.
    assert not isinstance(ordinal_suffixes, str)
    # The suffixes are read twice (validated, then joined). Materialize them
    # once, so that a generator or other one-shot iterator is not already
    # exhausted by the time we join (which would silently drop all suffixes).
    suffixes = list(ordinal_suffixes)
    assert_alphabetical(suffixes)

    # Day (numeric), allowing leading zeroes and e.g. "1st, 2nd"
    optional_suffixes = optional_noncapture_group("|".join(suffixes))
    day = "0*" + str(dt.day) + optional_suffixes

    # Month
    # ... numerically, allowing leading zeroes
    month_numeric = "0*" + str(dt.month)
    # ... as a word, e.g. Feb/February.
    # (dt.strftime("%B") can't cope with years < 1900, so use the calendar
    # module instead.)
    month_name = calendar.month_name[dt.month]  # localized
    # Allow first 3 characters, or whole month name:
    month_word = _month_word_regex_fragment(month_name)
    month = "(?:" + month_numeric + "|" + month_word + ")"

    # Year
    year = str(dt.year)
    if len(year) == 4:
        # Make the century optional: converts e.g. 1986 to (?:19)?86, to
        # match 1986 or 86.
        year = "(?:" + year[0:2] + ")?" + year[2:4]

    # Separator
    sep = OPTIONAL_NONWORD

    # Regexes
    basic_regexes = [
        day + sep + month + sep + year,  # e.g. 13 Sep 2014
        month + sep + day + sep + year,  # e.g. Sep 13, 2014
        year + sep + month + sep + day,  # e.g. 2014/09/13
    ]
    if at_word_boundaries_only:
        return [WB + x + WB for x in basic_regexes]
    return basic_regexes
class DateRegexNames:
    """
    Names of the named capture groups used in date regexes.
    """

    # Individual date components. These are captured because they might need
    # to be preserved for blurring.
    ALPHABETICAL_MONTH = "alphabetical_month"
    FOUR_DIGIT_YEAR = "four_digit_year"
    NUMERIC_DAY = "numeric_day"
    NUMERIC_MONTH = "numeric_month"
    TWO_DIGIT_YEAR = "two_digit_year"

    # Whole-date groups, one per component ordering.
    DAY_MONTH_YEAR = "day_month_year"
    MONTH_DAY_YEAR = "month_day_year"
    YEAR_MONTH_DAY = "year_month_day"
    ISODATE_NO_SEP = "isodate_no_sep"
def get_generic_date_regex_elements(
    at_word_boundaries_only: bool = True,
    ordinal_suffixes: Iterable[str] = ORDINAL_SUFFIXES_ENGLISH,
    all_month_names: Iterable[str] = MONTHS_ENGLISH,
) -> List[str]:
    """
    Returns a set of regex elements to scrub *any* date.

    Word boundaries are strongly preferred! This will match some odd things
    otherwise; see the associated unit tests.

    Args:
        at_word_boundaries_only:
            If true, each regex must begin and end at a word boundary. If
            false, a looser boundary is used that still refuses digits
            immediately adjacent to the date.
        ordinal_suffixes:
            Suffixes that may follow a numeric day (e.g. "st", "nd").
        all_month_names:
            Month names that may appear as words.

    Returns:
        a list of four regex strings, each wrapped in a named group (see
        :class:`DateRegexNames`): day-month-year, month-day-year,
        year-month-day, and ISO date without separators
    """
    # https://stackoverflow.com/questions/51224/regular-expression-to-match-valid-dates  # noqa
    names = DateRegexNames

    # -------------------------------------------------------------------------
    # Numeric components
    # -------------------------------------------------------------------------
    # Day: range [1, 31]
    numeric_day = named_capture_group(
        r"0?[1-9]|[12]\d|30|31", names.NUMERIC_DAY
    )
    # Month: range [1, 12]
    numeric_month = named_capture_group(
        r"0?[1-9]|1[0-2]", names.NUMERIC_MONTH
    )
    # Year: a 2-digit or 4-digit number
    two_digit_year = named_capture_group(r"\d{2}", names.TWO_DIGIT_YEAR)
    four_digit_year = named_capture_group(r"\d{4}", names.FOUR_DIGIT_YEAR)
    year = noncapture_group(two_digit_year + "|" + four_digit_year)

    # -------------------------------------------------------------------------
    # Separator (mandatory between components)
    # -------------------------------------------------------------------------
    # [^...] = anything not in the set, where the set is:
    #   \w = word (alphanumeric and underscore)
    #   \d = digit [redundant, I think]
    #   \r = carriage return (code 13)
    #   \n = linefeed (code 10)
    #   :  = colon
    sep = r"[^\w\d\r\n:]"

    # -------------------------------------------------------------------------
    # Day, with optional ordinal suffix
    # -------------------------------------------------------------------------
    suffix_alternation = "|".join(ordinal_suffixes)
    day = numeric_day + optional_noncapture_group(suffix_alternation)

    # -------------------------------------------------------------------------
    # ISO dates without separators
    # -------------------------------------------------------------------------
    # To be able to capture ISO dates like "20010101", but not capture e.g.
    # "31/12" as 3, 1, 12, we require separators normally and do a special for
    # ISO dates, in which day and month must have exactly two digits:
    two_digit_day = noncapture_group(r"0[1-9]|[12]\d|30|31")
    two_digit_month = noncapture_group(r"0[1-9]|1[0-2]")
    isodate_no_sep = f"{four_digit_year}{two_digit_month}{two_digit_day}"

    # -------------------------------------------------------------------------
    # Month, as a number or as a word
    # -------------------------------------------------------------------------
    alphabetical_months = named_capture_group(
        "|".join(map(_month_word_regex_fragment, all_month_names)),
        names.ALPHABETICAL_MONTH,
    )
    month = noncapture_group(f"{numeric_month}|{alphabetical_months}")

    # -------------------------------------------------------------------------
    # Whole dates, each in its own named group
    # -------------------------------------------------------------------------
    layouts = (
        (f"{day}{sep}{month}{sep}{year}", names.DAY_MONTH_YEAR),  # e.g. UK
        (f"{month}{sep}{day}{sep}{year}", names.MONTH_DAY_YEAR),  # e.g. USA
        (f"{year}{sep}{month}{sep}{day}", names.YEAR_MONTH_DAY),  # e.g. ISO
        (isodate_no_sep, names.ISODATE_NO_SEP),  # ISO with no separators
    )
    date_regexes = [
        named_capture_group(pattern, group_name)
        for pattern, group_name in layouts
    ]

    # -------------------------------------------------------------------------
    # Boundaries
    # -------------------------------------------------------------------------
    if at_word_boundaries_only:
        boundary = WB
    else:
        # Even if we don't require a strict word boundary, we can't allow just
        # anything -- you get garbage if numbers precede numeric dates.
        # \b word boundary = change from word to non-word (or the reverse)
        # \w = word = alphanumeric and underscore
        # ... so we take the subset that is alphabetical and underscore
        # \W = nonword = everything not in \w
        boundary = noncapture_group(r"\b|[\WA-Za-z_]")
    return [boundary + date_regex + boundary for date_regex in date_regexes]


# -----------------------------------------------------------------------------
# Generic codes
# -----------------------------------------------------------------------------


def get_code_regex_elements(
    s: str,
    liberal: bool = True,
    very_liberal: bool = True,
    at_word_boundaries_only: bool = True,
    at_numeric_boundaries_only: bool = True,
) -> List[str]:
    """
    Takes a **string** representation of a number or an alphanumeric code,
    which may include leading zeros (as for phone numbers), and produces a
    list of regex strings for scrubbing.

    We allow all sorts of separators. For example, 0123456789 might appear as

    .. code-block:: none

        (01234) 56789
        0123 456 789
        01234-56789
        0123.456.789

    This can also be used for postcodes, which should have whitespace
    prestripped, so e.g. PE123AB might appear as

    .. code-block:: none

        PE123AB
        PE12 3AB
        PE 12 3 AB

    Args:
        s:
            The string representation of a number or code.
        liberal:
            Boolean. Use "optional non-newline whitespace" to separate
            characters in the source. Ignored if ``very_liberal`` is set.
        very_liberal:
            Boolean. Use "optional nonword" to separate characters in the
            source.
        at_word_boundaries_only:
            Boolean. Ensure that the regex begins and ends with a word
            boundary requirement. So, if True, "123" will not be scrubbed from
            "M123".
        at_numeric_boundaries_only:
            Boolean. Only applicable if ``at_word_boundaries_only`` is False.
            Ensure that the number/code is only recognized when surrounded by
            non-numbers; that is, only at the boundaries of numbers (at
            numeric boundaries).

            - Even though we're not restricting to word boundaries, because
              (for example) we want ``123456`` to match ``M123456``, it can be
              undesirable to match numbers that are bordered only by numbers;
              that is, with this setting, ``23`` should never match ``234`` or
              ``1234`` or ``123``.
            - If set, this option ensures that the number/code is recognized
              only when it is bordered by non-numbers.
            - But if you want to anonymise "123456" out of a phone number
              written like "01223123456", you might have to turn this off...

    Returns:
        a list of regular expression strings (empty if ``s`` is empty)
    """
    if not s:
        return []

    # Escape any decimal points, etc., character by character.
    escaped_chars = escape_literal_for_regex_giving_charlist(s)

    # Choose what is allowed to appear between consecutive characters.
    if very_liberal:
        separator = OPTIONAL_NONWORD
    elif liberal:
        separator = OPTIONAL_NON_NEWLINE_WHITESPACE
    else:
        separator = ""
    pattern = separator.join(escaped_chars)  # ... can appear anywhere

    if at_word_boundaries_only:
        return [WB + pattern + WB]
    if at_numeric_boundaries_only:
        # http://www.regular-expressions.info/lookaround.html
        # https://stackoverflow.com/questions/15099150/regex-find-one-digit-number  # noqa
        return [NOT_DIGIT_LOOKBEHIND + pattern + NOT_DIGIT_LOOKAHEAD]
    return [pattern]


# -----------------------------------------------------------------------------
# Generic numbers
# -----------------------------------------------------------------------------


def get_number_of_length_n_regex_elements(
    n: int,
    liberal: bool = True,
    very_liberal: bool = False,
    at_word_boundaries_only: bool = True,
) -> List[str]:
    """
    Get a list of regex strings for scrubbing n-digit numbers -- for example,
    to remove all 10-digit numbers as putative NHS numbers, or all 11-digit
    numbers as putative UK phone numbers.

    Args:
        n:
            The length of the number, in digits. If less than 1, there is
            nothing to match and an empty list is returned.
        liberal:
            Boolean. Use "optional non-newline whitespace" to separate
            the digits. Ignored if ``very_liberal`` is set.
        very_liberal:
            Boolean. Use "optional nonword" to separate the digits.
        at_word_boundaries_only:
            Boolean. If set, ensure that the regex begins and ends with a word
            boundary requirement. If not set, the regex must be surrounded by
            non-digits. (If it were surrounded by more digits, it wouldn't be
            an n-digit number!)

    Returns:
        a list of regular expression strings (empty if ``n < 1``)
    """
    if n < 1:
        # With no digits, the pattern would consist solely of the boundary
        # conditions, and so could match without consuming any text. Contribute
        # nothing instead, as get_code_regex_elements() does for an empty code.
        return []

    # Choose what is allowed to appear between consecutive digits.
    if very_liberal:
        separators = OPTIONAL_NONWORD
    elif liberal:
        separators = OPTIONAL_NON_NEWLINE_WHITESPACE
    else:
        separators = ""
    digits = separators.join(["[0-9]"] * n)

    if at_word_boundaries_only:
        return [WB + digits + WB]
    return [NOT_DIGIT_LOOKBEHIND + digits + NOT_DIGIT_LOOKAHEAD]
# ... if there was a digit before/after, it's not an n-digit number


# -----------------------------------------------------------------------------
# UK postcodes
# -----------------------------------------------------------------------------


def get_uk_postcode_regex_elements(
    at_word_boundaries_only: bool = True,
) -> List[str]:
    """
    Get a list of regex strings for scrubbing UK postcodes. These have a
    well-defined format.

    Unless compiled with the ``re.IGNORECASE``, they will match upper-case
    postcodes only.

    Args:
        at_word_boundaries_only:
            Boolean. If set, ensure that the regex begins and ends with a word
            boundary requirement.

    Returns:
        a list of regular expression strings (currently exactly one)

    See:

    - https://stackoverflow.com/questions/164979/regex-for-matching-uk-postcodes
    """  # noqa
    # Historical note: before 2020-04-28, this function returned one regex for
    # each of six explicit templates ("AN NAA", "ANN NAA", "AAN NAA",
    # "AANN NAA", "ANA NAA", "AANA NAA"; A = letter, N = digit, with optional
    # whitespace in the middle). The single pattern below is much more
    # efficient.
    postcode = r"[A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2}"
    return [WB + postcode + WB if at_word_boundaries_only else postcode]
def get_uk_postcode_regex_string(at_word_boundaries_only: bool = True) -> str:
    """
    Shortcut to retrieve a single regex string for UK postcodes (following the
    changes of 2020-04-28, after which only one regex is generated).

    See :func:`get_uk_postcode_regex_elements`.

    Args:
        at_word_boundaries_only:
            Passed through to :func:`get_uk_postcode_regex_elements`.

    Returns:
        the sole postcode regex string

    Raises:
        AssertionError: if the underlying function does not return exactly one
            regex.
    """
    elements = get_uk_postcode_regex_elements(
        at_word_boundaries_only=at_word_boundaries_only
    )
    assert len(elements) == 1  # as of 2020-04-28, this is true
    return elements[0]


# -----------------------------------------------------------------------------
# Generic strings and phrases
# -----------------------------------------------------------------------------
# Note, for strings, several typo-detecting methods:
#   http://en.wikipedia.org/wiki/Levenshtein_distance
#   http://mwh.geek.nz/2009/04/26/python-damerau-levenshtein-distance/
#   http://en.wikipedia.org/wiki/TRE_(computing)
#   https://pypi.python.org/pypi/regex
# ... let's go with the fuzzy regex method (Python regex module).


def get_string_regex_elements(
    s: str,
    suffixes: List[str] = None,
    at_word_boundaries_only: bool = True,
    max_errors: int = 0,
) -> List[str]:
    """
    Takes a string and returns a list of regex strings with which to scrub.

    Args:
        s:
            The starting string.
        suffixes:
            A list of suffixes to permit, typically ``["s"]``.
        at_word_boundaries_only:
            Boolean. If set, ensure that the regex begins and ends with a word
            boundary requirement. (If false: will scrub ``ANN`` from
            ``bANNed``.)
        max_errors:
            The maximum number of typographical insertion/deletion/substitution
            errors to permit.

    Returns:
        a list of regular expression strings (empty if ``s`` is empty)
    """
    if not s:
        return []

    pattern = escape_literal_string_for_regex(s)

    if max_errors > 0:
        # Fuzzy matching, as provided by the regex module:
        # - (...) is the pattern;
        # - the {e<N} suffix permits fewer than N insertion/deletion/
        #   substitution errors, i.e. up to max_errors of them.
        #   ... https://pypi.python.org/pypi/regex
        #   ... http://www.gossamer-threads.com/lists/python/python/1002881
        # - A leading (?e) forces a search for a better match than the first;
        #   the other way is to specify the regex.ENHANCEMATCH flag...
        #   however, when doing this in get_regex_from_elements(), we got a
        #   segmentation fault... and, less consistently, when we put it here.
        #   So skip that!
        pattern = f"({pattern}){{e<{max_errors + 1}}}"

    suffix_pattern = ""
    if suffixes:
        escaped_suffixes = [
            escape_literal_string_for_regex(x) for x in suffixes
        ]
        # The trailing empty alternative allows for no suffix at all.
        suffix_pattern = "(?:" + "|".join(escaped_suffixes) + "|)"

    combined = pattern + suffix_pattern
    return [WB + combined + WB] if at_word_boundaries_only else [combined]
def get_phrase_regex_elements(
    phrase: str,
    suffixes: List[str] = None,
    at_word_boundaries_only: bool = True,
    max_errors: int = 0,
    alternatives: List[List[str]] = None,
) -> List[str]:
    """
    Gets regular expressions to scrub a phrase; that is, all words within a
    phrase consecutively.

    Args:
        phrase:
            E.g. '4 Privet Drive'.
        suffixes:
            A list of suffixes to permit (unusual).
        at_word_boundaries_only:
            Apply regex only at word boundaries?
        max_errors:
            Maximum number of typos, as defined by the regex module.
        alternatives:
            This allows words to be substituted by equivalents; such as
            ``St`` for ``Street`` or ``Rd`` for ``Road``. The parameter is a
            list of lists of equivalents; see
            :func:`crate_anon.anonymise.config.get_word_alternatives`.
            Words are looked up in upper case.

    Returns:
        A list of regex fragments (empty if the phrase yields no words).
    """

    def word_to_regex(word: str) -> str:
        """
        Regex for one word: an alternation over the first set of equivalents
        containing it (which includes the word itself), or else the escaped
        word.
        """
        upper_word = word.upper()
        for equivalent_words in alternatives:
            if upper_word in equivalent_words:
                escaped = (
                    escape_literal_string_for_regex(x)
                    for x in equivalent_words
                )
                return "(?:" + "|".join(escaped) + ")"
        return escape_literal_string_for_regex(word)

    # Break the phrase into consecutive words.
    words = get_anon_fragments_from_string(phrase)
    if not words:
        return []

    if alternatives:
        # We're allowing alternatives, so look each word up.
        word_regexes = [word_to_regex(word) for word in words]
    else:
        # Otherwise, just escape what we have.
        word_regexes = [escape_literal_string_for_regex(x) for x in words]

    pattern = AT_LEAST_ONE_NONWORD.join(word_regexes)
    if max_errors > 0:
        # Fuzzy match: (...) followed by a permitted error count.
        pattern = f"({pattern}){{e<{max_errors + 1}}}"

    suffix_pattern = ""
    if suffixes:
        escaped_suffixes = [
            escape_literal_string_for_regex(x) for x in suffixes
        ]
        # The trailing empty alternative allows for no suffix at all.
        suffix_pattern = "(?:" + "|".join(escaped_suffixes) + "|)"

    combined = pattern + suffix_pattern
    return [WB + combined + WB] if at_word_boundaries_only else [combined]


# =============================================================================
# Combining regex elements into a giant regex
# =============================================================================


def get_regex_string_from_elements(elementlist: List[str]) -> str:
    """
    Convert a list of regex elements into a single regex string.

    Duplicate elements are removed, and the remainder are combined as
    alternatives using ``|``.

    Args:
        elementlist: The regex fragments to combine.

    Returns:
        the combined regex string, or ``""`` if there are no elements
    """
    return "|".join(unique_list(elementlist)) if elementlist else ""
# The or operator | has the lowest precedence.
# ... http://www.regular-expressions.info/alternation.html
# We also want to minimize the number of brackets.
# THEREFORE, ANYTHING CONTRIBUTING FRAGMENTS HERE SHOULD NOT HAVE |
# OPERATORS AT ITS TOP LEVEL. If it does, it should encapsulate them in a
# non-capturing group, (?:...)


def get_regex_from_elements(elementlist: List[str]) -> Optional[Pattern]:
    """
    Convert a list of regex elements into a compiled regex, which will operate
    in case-insensitive fashion on Unicode strings.

    Args:
        elementlist: The regex fragments to combine and compile.

    Returns:
        the compiled regex (compiled with ``REGEX_COMPILE_FLAGS``), or ``None``
        if there are no elements

    Raises:
        the regex module's error: if the combined pattern fails to compile.
            The offending element list is logged before re-raising.
    """
    if elementlist:
        try:
            combined = get_regex_string_from_elements(elementlist)
            return regex.compile(combined, REGEX_COMPILE_FLAGS)
        except _regex_core.error:
            log.exception(f"Failed regex: elementlist={elementlist}")
            raise
    return None