# Source code for crate_anon.anonymise.tests.scrub_tests

"""
crate_anon/anonymise/tests/scrub_tests.py

===============================================================================

    Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
    Created by Rudolf Cardinal (rnc1001@cam.ac.uk).

    This file is part of CRATE.

    CRATE is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    CRATE is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with CRATE. If not, see <https://www.gnu.org/licenses/>.

===============================================================================

Unit testing.

"""

# =============================================================================
# Imports
# =============================================================================

import re
import logging
import os
from tempfile import TemporaryDirectory
from typing import List
from unittest import TestCase

from cardinal_pythonlib.hash import HmacMD5Hasher

from crate_anon.anonymise.constants import ScrubMethod
from crate_anon.anonymise.scrub import (
    NonspecificScrubber,
    PersonalizedScrubber,
    WordList,
)
from crate_anon.common.bugfix_flashtext import KeywordProcessorFixed

from faker import Faker

log = logging.getLogger(__name__)


# =============================================================================
# Constants
# =============================================================================

TEST_KEY = "hello"
PATIENT_REPLACEMENT = "[XXX]"
THIRD_PARTY_REPLACEMENT = "[YYY]"


# =============================================================================
# Test hashing
# =============================================================================


class HashTests(TestCase):
    """
    Tests for :class:`HmacMD5Hasher`.
    """

    def test_str_int_hash_equivalent(self) -> None:
        """
        Hashing an integer and its string equivalent should give the same
        answer.
        """
        hasher = HmacMD5Hasher(TEST_KEY)
        x = 1234567
        y = str(x)
        self.assertEqual(
            hasher.hash(x),
            hasher.hash(y),
            "Hasher providing different answer for str and int",
        )


# =============================================================================
# Test WordList
# =============================================================================


class WordListTests(TestCase):
    """
    Tests for :class:`WordList` scrubbing, and for the word-boundary behaviour
    of :class:`KeywordProcessorFixed`.
    """

    def setUp(self) -> None:
        """
        Create a temporary directory (for denylist files) and make sure it is
        removed again when the test finishes.
        """
        self.tempdir = TemporaryDirectory()
        # Without an explicit cleanup, the directory lingers until the object
        # is garbage-collected (and Python emits a ResourceWarning).
        self.addCleanup(self.tempdir.cleanup)
        self.maxDiff = None  # see full differences upon failure

    def _test_flashtext_word_boundaries(self, target: str) -> None:
        """
        Check that the keyword processor replaces ``target`` when it is
        delimited by spaces, but not when it is embedded in a longer word.

        Args:
            target: the keyword (or multi-word phrase) to register.
        """
        anon_text = PATIENT_REPLACEMENT
        ft = KeywordProcessorFixed(case_sensitive=False)
        ft.add_keyword(target, anon_text)
        self.assertEqual(
            # FlashText will replace at word boundaries:
            ft.replace_keywords(f"x {target} x"),
            f"x {anon_text} x",
        )
        self.assertEqual(
            # But only at word boundaries, so this won't replace:
            ft.replace_keywords(f"x{target}x"),
            f"x{target}x",
        )

    def test_flashtext_word_boundaries(self) -> None:
        """
        Word-boundary behaviour, for a single word and for a phrase.
        """
        for target in ("daisy", "daisy bluebell"):
            with self.subTest(target=target):
                self._test_flashtext_word_boundaries(target)

    def _test_wordlist(self, regex_method: bool = False) -> None:
        """
        Check :class:`WordList` scrubbing from a denylist file, both in
        "phrase" mode and in "individual word" mode, and then check suffix
        handling for words supplied directly.

        Args:
            regex_method: passed through to :class:`WordList`. When true, the
                double-spaced "Charlie  Brown" is also expected to be
                scrubbed in phrase mode.

        Test with e.g.

        .. code-block:: bash

            pytest -k test_wordlist --log-cli-level=INFO
        """
        denylist_phrases = ["Alice", "Bob", "Charlie Brown", "Daisy"]
        anon_text = PATIENT_REPLACEMENT
        test_source_text = """
            I met Alice in the street.
            She was walking with Bob.
            Charlie was not with them.
            Their gloves were brown.
            They stopped to inspect a daisy.
            They discussed Charlie Brown cartoons.
            They discussed Charlie  Brown cartoons all day long.
            They made comment after comment.
        """
        # The file starts with a "# comment" line and pads each phrase with
        # spaces. The expectations below leave the word "comment" in the
        # source text unscrubbed.
        denylist_text = (
            "\n# comment\n"
            + "\n".join(f" {x} " for x in denylist_phrases)
            + "\n"
        )
        # Flatten the phrases into individual words. str.split() with no
        # arguments never yields empty strings, so no filtering is needed.
        denylist_words: List[str] = [
            word for phrase in denylist_phrases for word in phrase.split()
        ]

        # Build the expected output for phrase mode by case-insensitive
        # replacement of each whole phrase.
        expected_result_phrases = test_source_text
        for element in denylist_phrases:
            # https://stackoverflow.com/questions/919056/case-insensitive-replace  # noqa
            element_re = re.compile(re.escape(element), re.IGNORECASE)
            expected_result_phrases = element_re.sub(
                anon_text, expected_result_phrases
            )
        if regex_method:
            # Regexes handle whitespace flexibly.
            expected_result_phrases = expected_result_phrases.replace(
                "Charlie  Brown", anon_text
            )

        # Build the expected output for word mode in the same way, word by
        # word (so e.g. a lone "brown" is replaced too).
        expected_result_words = test_source_text
        for element in denylist_words:
            element_re = re.compile(re.escape(element), re.IGNORECASE)
            expected_result_words = element_re.sub(
                anon_text, expected_result_words
            )

        filename = os.path.join(self.tempdir.name, "badwords.txt")
        with open(filename, "wt") as f:
            f.write(denylist_text)

        wordlist_phrases = WordList(
            filenames=[filename],
            as_phrases=True,
            replacement_text=anon_text,
            regex_method=regex_method,
        )
        wordlist_words = WordList(
            filenames=[filename],
            as_phrases=False,
            replacement_text=anon_text,
            regex_method=regex_method,
        )

        log.info(f"test_source_text: {test_source_text}")
        log.info(f"denylist_text: {denylist_text}")

        result_words = wordlist_words.scrub(test_source_text)
        log.info(f"denylist_words: {denylist_words}")
        log.info(f"result_words: {result_words}")
        log.info(f"expected_result_words: {expected_result_words}")
        self.assertEqual(result_words, expected_result_words)

        result_phrases = wordlist_phrases.scrub(test_source_text)
        log.info(f"denylist_phrases: {denylist_phrases}")
        log.info(f"result_phrases: {result_phrases}")
        log.info(f"expected_result_phrases: {expected_result_phrases}")
        self.assertEqual(result_phrases, expected_result_phrases)

        # Words supplied directly, with optional suffixes: the word alone and
        # word+suffix are scrubbed; a separate following word is not.
        wordlist_suffixes = WordList(
            words=["one", "two"],
            suffixes=["dog", "cat"],
            replacement_text=anon_text,
            regex_method=regex_method,
        )
        self.assertEqual(
            wordlist_suffixes.scrub("x one x"), f"x {anon_text} x"
        )
        self.assertEqual(
            wordlist_suffixes.scrub("x onedog x"), f"x {anon_text} x"
        )
        self.assertEqual(
            wordlist_suffixes.scrub("x one dog x"), f"x {anon_text} dog x"
        )

    def test_wordlist(self) -> None:
        """
        Run the WordList checks with ``regex_method`` both off and on.
        """
        for regex_method in (False, True):
            with self.subTest(regex_method=regex_method):
                self._test_wordlist(regex_method=regex_method)


class ScrubberTestCase(TestCase):
    """
    Base class for scrubber tests: provides ``self.key`` and a matching
    ``self.hasher`` for constructing scrubbers.
    """

    def setUp(self) -> None:
        """
        Create the hasher from the test key.
        """
        self.key = TEST_KEY
        self.hasher = HmacMD5Hasher(self.key)


# =============================================================================
# Test PersonalizedScrubber
# =============================================================================


class PersonalizedScrubberTests(ScrubberTestCase):
    """
    Tests for :class:`PersonalizedScrubber`.
    """

    def setUp(self) -> None:
        """
        Set the replacement text used for patient and third-party matches.
        """
        super().setUp()

        self.anonpatient = PATIENT_REPLACEMENT
        self.anonthird = THIRD_PARTY_REPLACEMENT

    def test_phrase_unless_numeric(self) -> None:
        """
        Values added with ``ScrubMethod.PHRASE_UNLESS_NUMERIC`` should be
        scrubbed as phrases, unless the value is purely numeric (e.g. "5",
        "5.0", "5."), in which case nothing should be scrubbed.
        """
        # Each entry is: (value to scrub with, {input text: expected output}).
        tests = [
            ("5", {"blah 5 blah": "blah 5 blah"}),
            (" 5 ", {"blah 5 blah": "blah 5 blah"}),
            (
                " 5.0 ",
                {
                    "blah 5 blah": "blah 5 blah",
                    "blah 5. blah": "blah 5. blah",
                    "blah 5.0 blah": "blah 5.0 blah",
                },
            ),
            (
                " 5. ",
                {
                    "blah 5 blah": "blah 5 blah",
                    "blah 5. blah": "blah 5. blah",
                    "blah 5.0 blah": "blah 5.0 blah",
                },
            ),
            (
                "5 Tree Road",
                {
                    "blah 5 blah": "blah 5 blah",
                    "blah 5 Tree Road blah": f"blah {self.anonpatient} blah",
                },
            ),
            (
                " 5 Tree Road ",
                {
                    "blah 5 blah": "blah 5 blah",
                    "blah 5 Tree Road blah": f"blah {self.anonpatient} blah",
                },
            ),
            (" 5b ", {"blah 5b blah": f"blah {self.anonpatient} blah"}),
        ]
        for scrubvalue, mapping in tests:
            # A fresh scrubber per value, so that values do not accumulate.
            scrubber = PersonalizedScrubber(
                replacement_text_patient=self.anonpatient,
                replacement_text_third_party=self.anonthird,
                hasher=self.hasher,
                min_string_length_to_scrub_with=1,
                debug=True,
            )
            scrubber.add_value(
                scrubvalue, scrub_method=ScrubMethod.PHRASE_UNLESS_NUMERIC
            )
            for start, end in mapping.items():
                with self.subTest(scrubvalue=scrubvalue, text=start):
                    self.assertEqual(
                        scrubber.scrub(start),
                        end,
                        f"Failure for scrubvalue: {scrubvalue!r}; regex "
                        f"elements are {scrubber.re_patient_elements}",
                    )


class NonspecificScrubberTests(ScrubberTestCase):
    """
    Tests nonspecific scrubbing.
    """

    def setUp(self) -> None:
        """
        Create a Faker instance with a fixed seed, so that generated test
        data is the same on every run.
        """
        super().setUp()

        self.fake = Faker(["en-GB"])
        self.fake.seed_instance(1234)

    def test_all_dates_scrubbed(self) -> None:
        """
        Check we can remove arbitrary dates. (See also anonregex_tests.py for
        tests of the date detection regexes.)
        """
        date_of_birth_1 = self.fake.date_of_birth()
        date_string_1 = date_of_birth_1.strftime("%d %b %Y")

        date_of_birth_2 = self.fake.date_of_birth()
        date_string_2 = date_of_birth_2.strftime("%d %b %Y")

        # Two dates, each preceded by some generated filler text.
        text = (
            f"{self.fake.text()} {date_string_1} "
            f"{self.fake.text()} {date_string_2}"
        )

        scrubber = NonspecificScrubber(
            self.hasher,
            replacement_text_all_dates="[REDACTED]",
            scrub_all_dates=True,
        )

        scrubbed = scrubber.scrub(text)

        self.assertEqual(scrubbed.count("[REDACTED]"), 2)

    def test_all_dates_in_supported_formats_blurred(self) -> None:
        """
        Check we can blur dates.
        """
        # Each entry is: (input text, expected blurred output).
        tests = (
            # Using "%b %Y" format:
            ("01 February 2003", "Feb 2003"),
            ("01 Feb 2003", "Feb 2003"),
            ("01 Feb 00", "Feb 2000"),
            ("01 Feb 69", "Feb 1969"),
            ("01 Feb 99", "Feb 1999"),
            ("4/5/2006", "May 2006"),
            ("4/5/99", "May 1999"),
            ("7/31/2008", "Jul 2008"),
            ("7/31/99", "Jul 1999"),
            ("8th Sept 2010", "Sep 2010"),
            ("8th Sept 99", "Sep 1999"),
            ("2011-12-13", "Dec 2011"),
            ("99-12-13", "Dec 1999"),
            ("20160718", "Jul 2016"),
        )

        scrubber = NonspecificScrubber(
            self.hasher,
            scrub_all_dates=True,
            replacement_text_all_dates="%b %Y",
        )

        for text, expected in tests:
            with self.subTest(text=text):
                self.assertEqual(
                    scrubber.scrub(text), expected, msg=f"test: {text}"
                )

    def test_non_dates_scrubbed(self) -> None:
        """
        Test that non-date things are scrubbed with non-date replacement text,
        even if we have special date replacements configured.
        """
        scrubber = NonspecificScrubber(
            self.hasher,
            scrub_all_uk_postcodes=True,
            scrub_all_dates=True,
            replacement_text="[REDACTED]",
            replacement_text_all_dates="%b %Y",
        )

        self.assertEqual(scrubber.scrub(self.fake.postcode()), "[REDACTED]")

    def test_scrub_all_dates_with_replacement(self) -> None:
        """
        Check that supported date directives in the replacement text are
        filled in from the date being scrubbed, with surrounding literal text
        preserved.
        """
        # Each entry is: (replacement template, expected output for
        # "2022-02-28").
        custom_placeholder_tests = [
            ("[%Y-%m]", "[2022-02]"),
            ("[%B, %Y]", "[February, 2022]"),
            ("[%b '%y]", "[Feb '22]"),
            ("[%Y]", "[2022]"),
            ("[%b %Y]", "[Feb 2022]"),
        ]

        for replacement, expected in custom_placeholder_tests:
            with self.subTest(replacement=replacement):
                scrubber = NonspecificScrubber(
                    self.hasher,
                    scrub_all_dates=True,
                    replacement_text_all_dates=replacement,
                )

                self.assertEqual(
                    scrubber.scrub("2022-02-28"),
                    expected,
                    msg=f"replacement: {replacement!r}",
                )

    def test_raises_for_unsupported_date_formats(self) -> None:
        """
        Check we can detect bad % directives that we will not allow through to
        datetime.date.strftime(). Compare DATE_BLURRING_DIRECTIVES, the stuff
        we do allow.
        """
        bad_formats = [
            "%a",
            "%A",
            "%w",
            "%d",
            "%H",
            "%I",
            "%p",
            "%M",
            "%S",
            "%f",
            "%z",
            "%Z",
            "%j",
            "%U",
            "%W",
            "%c",
            "%x",
            "%X",
            "%G",
            "%u",
            "%V",
            "hello %V world",  # detect not just at the start/end
            "%%",  # "%%" (literal %) currently unsupported
        ]

        for replacement in bad_formats:
            with self.subTest(replacement=replacement):
                # The message identifies which format was wrongly accepted.
                with self.assertRaises(
                    ValueError,
                    msg=f"No ValueError for replacement: {replacement!r}",
                ):
                    NonspecificScrubber(
                        self.hasher,
                        scrub_all_dates=True,
                        replacement_text_all_dates=replacement,
                    )

    def test_email_addresses_scrubbed(self) -> None:
        """
        Test that e-mail addresses are scrubbed.
        """
        scrubber = NonspecificScrubber(
            self.hasher,
            scrub_all_email_addresses=True,
            replacement_text="[REDACTED]",
        )

        self.assertEqual(scrubber.scrub(self.fake.email()), "[REDACTED]")