# Source code for crate_anon.crateweb.anonymise_api.serializers

"""
crate_anon/crateweb/anonymise_api/serializers.py

===============================================================================

    Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
    Created by Rudolf Cardinal (rnc1001@cam.ac.uk).

    This file is part of CRATE.

    CRATE is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    CRATE is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with CRATE. If not, see <https://www.gnu.org/licenses/>.

===============================================================================

Django REST Framework serializer to anonymise the data.

"""

from collections import OrderedDict
from typing import Dict, List, Optional

from django.conf import settings

from cardinal_pythonlib.hash import GenericHasher, HashMethods, make_hasher
from rest_framework.serializers import (
    BooleanField,
    CharField,
    DictField,
    IntegerField,
    ListField,
    Serializer,
    SerializerMethodField,
)

from crate_anon.anonymise.constants import (
    AnonymiseConfigDefaults as Defaults,
    AnonymiseConfigKeys as ConfigKeys,
    DATE_BLURRING_DIRECTIVES_CSV,
    ScrubMethod,
)
from crate_anon.anonymise.scrub import (
    NonspecificScrubber,
    PersonalizedScrubber,
    WordList,
)
from crate_anon.crateweb.anonymise_api.constants import (
    ApiKeys,
    ApiSettingsKeys,
)


# noinspection PyAbstractClass
class SpecificSerializer(Serializer):
    """
    Represents scrubbing information about a specific person or group of
    people (e.g. patient data, third-party data).
    """

    # Every field below is a list of strings that falls back to an empty
    # list when the caller omits it. Declaration order is significant for
    # the serializer's field order, so it is kept exactly as declared.

    # Dates.
    dates = ListField(
        help_text="List of dates to be scrubbed.",
        child=CharField(),
        initial=[],
        default=[],
    )

    # Multi-word phrases.
    phrases = ListField(
        help_text=(
            "List of phrases (words appearing consecutively) "
            "to be scrubbed."
        ),
        child=CharField(),
        initial=[],
        default=[],
    )

    # Multi-word phrases, skipping purely numeric ones.
    non_numeric_phrases = ListField(
        help_text=(
            "List of phrases (words appearing consecutively) "
            "to be scrubbed. "
            "If a phrase is purely numeric it will be ignored."
        ),
        child=CharField(),
        initial=[],
        default=[],
    )

    # Individual words.
    words = ListField(
        help_text="List of words to be scrubbed.",
        child=CharField(),
        initial=[],
        default=[],
    )

    # Numbers (transported as strings).
    numbers = ListField(
        help_text="List of numbers to be scrubbed.",
        child=CharField(),
        initial=[],
        default=[],
    )

    # Codes such as postcodes.
    codes = ListField(
        help_text="List of codes (e.g. postcodes) to be scrubbed.",
        child=CharField(),
        initial=[],
        default=[],
    )
# noinspection PyAbstractClass
class AllowlistSerializer(Serializer):
    """
    Represents allowlist options.
    """

    # Both fields are lists of strings that fall back to an empty list when
    # the caller omits them.

    # Words given directly in the request.
    words = ListField(
        help_text="Do not scrub these specific words.",
        child=CharField(),
        initial=[],
        default=[],
    )

    # Aliases naming word files; the alias-to-filename mapping is not part
    # of the request.
    files = ListField(
        help_text=(
            "Do not scrub words from these filename aliases (defined "
            "on the server)."
        ),
        child=CharField(),
        initial=[],
        default=[],
    )
# noinspection PyAbstractClass
class DenylistSerializer(Serializer):
    """
    Represents denylist options.
    """

    # Both fields are lists of strings that fall back to an empty list when
    # the caller omits them.

    # Words given directly in the request.
    words = ListField(
        help_text="Scrub these specific words.",
        child=CharField(),
        initial=[],
        default=[],
    )

    # Aliases naming word files; the alias-to-filename mapping is not part
    # of the request.
    files = ListField(
        help_text=(
            "Scrub words from these filename aliases "
            "(defined on the server)."
        ),
        child=CharField(),
        initial=[],
        default=[],
    )
# noinspection PyAbstractClass
class ScrubSerializer(Serializer):
    """
    Represents all scrubber settings, including data to be scrubbed and
    scrubber configuration settings.
    """

    # -------------------------------------------------------------------------
    # Input/Output fields
    # -------------------------------------------------------------------------
    # default implies required=False

    text = DictField(
        child=CharField(help_text="Text to be scrubbed."),
        help_text=(
            "The lines of text to be scrubbed, each keyed on a unique "
            "ID supplied by the caller."
        ),
    )
    patient = SpecificSerializer(
        required=False, help_text="Specific patient data to be scrubbed."
    )
    third_party = SpecificSerializer(
        required=False,
        help_text="Third party (e.g. family members') data to be scrubbed.",
    )
    anonymise_codes_at_word_boundaries_only = BooleanField(
        default=Defaults.ANONYMISE_CODES_AT_WORD_BOUNDARIES_ONLY,
        initial=Defaults.ANONYMISE_CODES_AT_WORD_BOUNDARIES_ONLY,
        help_text=(
            "Ensure the codes to be scrubbed begin and end with a word "
            "boundary."
        ),
    )
    anonymise_dates_at_word_boundaries_only = BooleanField(
        default=Defaults.ANONYMISE_DATES_AT_WORD_BOUNDARIES_ONLY,
        initial=Defaults.ANONYMISE_DATES_AT_WORD_BOUNDARIES_ONLY,
        # This text previously said "codes" (copied from the field above).
        help_text=(
            "Ensure the dates to be scrubbed begin and end with a word "
            "boundary."
        ),
    )
    anonymise_numbers_at_word_boundaries_only = BooleanField(
        default=Defaults.ANONYMISE_NUMBERS_AT_WORD_BOUNDARIES_ONLY,
        initial=Defaults.ANONYMISE_NUMBERS_AT_WORD_BOUNDARIES_ONLY,
        help_text=(
            "Ensure the numbers to be scrubbed begin and end with a "
            "word boundary."
        ),
    )
    anonymise_numbers_at_numeric_boundaries_only = BooleanField(
        default=Defaults.ANONYMISE_NUMBERS_AT_NUMERIC_BOUNDARIES_ONLY,
        initial=Defaults.ANONYMISE_NUMBERS_AT_NUMERIC_BOUNDARIES_ONLY,
        help_text=(
            "Ensure the numbers to be scrubbed begin and end with a "
            "numeric boundary."
        ),
    )
    anonymise_strings_at_word_boundaries_only = BooleanField(
        default=Defaults.ANONYMISE_STRINGS_AT_WORD_BOUNDARIES_ONLY,
        initial=Defaults.ANONYMISE_STRINGS_AT_WORD_BOUNDARIES_ONLY,
        # This text previously said "numbers" (copied from a field above).
        help_text=(
            "Ensure the strings to be scrubbed begin and end with a "
            "word boundary."
        ),
    )
    string_max_regex_errors = IntegerField(
        default=Defaults.STRING_MAX_REGEX_ERRORS,
        initial=Defaults.STRING_MAX_REGEX_ERRORS,
        help_text=(
            "The maximum number of typographical insertion/deletion/"
            "substitution errors to permit."
        ),
    )
    min_string_length_for_errors = IntegerField(
        default=Defaults.MIN_STRING_LENGTH_FOR_ERRORS,
        initial=Defaults.MIN_STRING_LENGTH_FOR_ERRORS,
        help_text=(
            "The minimum string length at which typographical "
            "errors will be permitted."
        ),
    )
    min_string_length_to_scrub_with = IntegerField(
        default=Defaults.MIN_STRING_LENGTH_TO_SCRUB_WITH,
        initial=Defaults.MIN_STRING_LENGTH_TO_SCRUB_WITH,
        help_text="Do not scrub strings shorter than this length.",
    )
    scrub_string_suffixes = ListField(
        child=CharField(),
        help_text=(
            'A list of suffixes to permit on strings. e.g. ["s"] '
            "for plural forms."
        ),
        default=[],
        initial=[],
    )
    allowlist = AllowlistSerializer(
        required=False, help_text="Allowlist options."
    )
    denylist = DenylistSerializer(
        required=False, help_text="Denylist options."
    )
    replace_patient_info_with = CharField(
        default=Defaults.REPLACE_PATIENT_INFO_WITH,
        initial=Defaults.REPLACE_PATIENT_INFO_WITH,
        help_text="Replace sensitive patient content with this.",
    )
    replace_third_party_info_with = CharField(
        default=Defaults.REPLACE_THIRD_PARTY_INFO_WITH,
        initial=Defaults.REPLACE_THIRD_PARTY_INFO_WITH,
        help_text=(
            "Replace sensitive third party (e.g. family members') "
            "content with this."
        ),
    )
    replace_nonspecific_info_with = CharField(
        default=Defaults.REPLACE_NONSPECIFIC_INFO_WITH,
        initial=Defaults.REPLACE_NONSPECIFIC_INFO_WITH,
        help_text="Replace any other sensitive content with this.",
    )
    replace_all_dates_with = CharField(
        required=False,
        help_text=(
            "When scrubbing all dates, replace with this text. If the "
            "replacement text includes supported datetime.directives "
            f"({DATE_BLURRING_DIRECTIVES_CSV}), the date is 'blurred' "
            "to include just those components."
        ),
    )
    scrub_all_numbers_of_n_digits = ListField(
        child=IntegerField(),
        help_text=(
            "Scrub all numbers with these lengths "
            "(e.g. [10] for all UK NHS numbers)."
        ),
        default=[],
        initial=[],
    )
    scrub_all_uk_postcodes = BooleanField(
        default=Defaults.SCRUB_ALL_UK_POSTCODES,
        initial=Defaults.SCRUB_ALL_UK_POSTCODES,
        help_text="Scrub all UK postcodes.",
    )
    scrub_all_dates = BooleanField(
        default=Defaults.SCRUB_ALL_DATES,
        initial=Defaults.SCRUB_ALL_DATES,
        help_text=(
            "Scrub all dates. Currently assumes the default locale "
            "for month names and ordinal suffixes."
        ),
    )
    scrub_all_email_addresses = BooleanField(
        default=Defaults.SCRUB_ALL_EMAIL_ADDRESSES,
        initial=Defaults.SCRUB_ALL_EMAIL_ADDRESSES,
        help_text="Scrub all e-mail addresses.",
    )
    alternatives = ListField(
        child=ListField(child=CharField()),
        help_text=(
            "List of alternative words to scrub. "
            'e.g.: [["Street", "St"], ["Road", "Rd"], ["Avenue", "Ave"]]'
        ),
        default=[[]],
        initial=[[]],
    )

    # -------------------------------------------------------------------------
    # Output-only fields
    # -------------------------------------------------------------------------
    # SerializerMethodField is read-only by default

    anonymised = SerializerMethodField(
        help_text=(
            "The anonymised text, keyed on the unique IDs supplied by "
            "the caller in the 'text' parameter of the request."
        )
    )

    def get_anonymised(self, data: OrderedDict) -> Dict[str, str]:
        """
        Returns the anonymised text keyed on the unique IDs supplied by the
        caller.

        Args:
            data:
                mapping of field name to value for this request (presumably
                the validated input -- confirm against the calling view); it
                must contain the ``text`` entry and the replacement-text
                settings read by the helper methods.

        Returns:
            a dictionary with the same keys as ``data[ApiKeys.TEXT]``, each
            mapped to the result of scrubbing the corresponding text.
        """
        # One scrubber is built per call and reused for every text item.
        scrubber = self._get_personalized_scrubber(data)
        return {
            key: scrubber.scrub(value)
            for key, value in data[ApiKeys.TEXT].items()
        }

    def _get_personalized_scrubber(
        self, data: OrderedDict
    ) -> PersonalizedScrubber:
        """
        Create a CRATE scrubber representing patient and third-party
        scrubbing settings.

        Args:
            data: mapping of field name to value for this request.

        Returns:
            a :class:`crate_anon.anonymise.scrub.PersonalizedScrubber` to
            which any patient and third-party values present in ``data`` have
            been added.

        Raises:
            KeyError: if the patient or third-party replacement text is
                missing from ``data``.
        """
        hasher = make_hasher(
            HashMethods.HMAC_MD5,
            settings.ANONYMISE_API[ApiSettingsKeys.HASH_KEY],
        )

        # Settings that are passed straight through as keyword arguments to
        # PersonalizedScrubber (this relies on the keys of ``data`` matching
        # these ConfigKeys values).
        options = (
            ConfigKeys.ANONYMISE_CODES_AT_WORD_BOUNDARIES_ONLY,
            ConfigKeys.ANONYMISE_DATES_AT_WORD_BOUNDARIES_ONLY,
            ConfigKeys.ANONYMISE_NUMBERS_AT_WORD_BOUNDARIES_ONLY,
            ConfigKeys.ANONYMISE_NUMBERS_AT_NUMERIC_BOUNDARIES_ONLY,
            ConfigKeys.ANONYMISE_STRINGS_AT_WORD_BOUNDARIES_ONLY,
            ConfigKeys.STRING_MAX_REGEX_ERRORS,
            ConfigKeys.MIN_STRING_LENGTH_FOR_ERRORS,
            ConfigKeys.MIN_STRING_LENGTH_TO_SCRUB_WITH,
            ConfigKeys.SCRUB_STRING_SUFFIXES,
        )
        kwargs = {k: v for (k, v) in data.items() if k in options}

        replacement_text_patient = data[ConfigKeys.REPLACE_PATIENT_INFO_WITH]
        replacement_text_third_party = data[
            ConfigKeys.REPLACE_THIRD_PARTY_INFO_WITH
        ]

        scrubber = PersonalizedScrubber(
            hasher,
            replacement_text_patient,
            replacement_text_third_party,
            nonspecific_scrubber=self._get_nonspecific_scrubber(data, hasher),
            allowlist=self._get_allowlist(data, hasher),
            alternatives=self._get_alternatives(data),
            **kwargs,
        )

        # The patient and third-party sections are optional; skip whichever
        # is absent from the data.
        for label in (ApiKeys.PATIENT, ApiKeys.THIRD_PARTY):
            if label in data:
                self._add_values_to_scrubber(scrubber, label, data)

        return scrubber

    @staticmethod
    def _get_alternatives(data: OrderedDict) -> Optional[List[List[str]]]:
        """
        Returns a list of list of equivalents; see
        :func:`crate_anon.anonymise.config.get_word_alternatives` and
        :class:`crate_anon.anonymise.scrub.PersonalizedScrubber`.

        Every word is converted to upper case. Returns ``None`` if ``data``
        has no alternatives entry.
        """
        try:
            return [
                [word.upper() for word in words]
                for words in data[ApiKeys.ALTERNATIVES]
            ]
        except KeyError:
            return None

    @staticmethod
    def _get_filenames(aliases: List[str], settings_key: str) -> List[str]:
        """
        Returns the filenames configured under ``settings_key`` in
        ``settings.ANONYMISE_API`` whose alias appears in ``aliases``.

        Aliases with no configured filename are ignored; if the setting is
        absent, the result is an empty list.
        """
        filename_lookup = settings.ANONYMISE_API.get(settings_key, {})
        return [
            filename
            for label, filename in filename_lookup.items()
            if label in aliases
        ]

    @staticmethod
    def _get_allowlist(
        data: OrderedDict, hasher: GenericHasher
    ) -> Optional[WordList]:
        """
        Returns a :class:`crate_anon.anonymise.scrub.WordList` of words to be
        allowed through.

        Returns ``None`` if ``data`` has no allowlist entry.
        """
        try:
            allowlist_data = data[ApiKeys.ALLOWLIST]
        except KeyError:
            return None

        # Only these allowlist entries are passed directly to WordList.
        options = (ApiKeys.WORDS,)
        kwargs = {k: v for (k, v) in allowlist_data.items() if k in options}

        # The caller supplies aliases; the real filenames come from the
        # server's settings.
        kwargs.update(
            filenames=ScrubSerializer._get_filenames(
                allowlist_data[ApiKeys.FILES],
                ApiSettingsKeys.ALLOWLIST_FILENAMES,
            )
        )

        return WordList(hasher=hasher, **kwargs)

    def _get_nonspecific_scrubber(
        self, data: OrderedDict, hasher: GenericHasher
    ) -> NonspecificScrubber:
        """
        Returns a nonspecific scrubber for the current settings.

        Args:
            data: mapping of field name to value for this request.
            hasher: hasher passed to the scrubber and to the denylist.

        Raises:
            KeyError: if the nonspecific replacement text is missing from
                ``data``.
        """
        denylist = self._get_denylist(data, hasher)

        options = (
            # Also kwargs for NonspecificScrubber
            ConfigKeys.SCRUB_ALL_NUMBERS_OF_N_DIGITS,
            ConfigKeys.SCRUB_ALL_UK_POSTCODES,
            ConfigKeys.SCRUB_ALL_DATES,
            ConfigKeys.SCRUB_ALL_EMAIL_ADDRESSES,
            ConfigKeys.ANONYMISE_CODES_AT_WORD_BOUNDARIES_ONLY,
            ConfigKeys.ANONYMISE_NUMBERS_AT_WORD_BOUNDARIES_ONLY,
        )
        kwargs = {k: v for (k, v) in data.items() if k in options}
        # TODO: extra_regexes (might be a security no-no)

        replacement_text = data[ConfigKeys.REPLACE_NONSPECIFIC_INFO_WITH]

        # The date replacement text is optional; only pass it when it is
        # present in the data.
        if ConfigKeys.REPLACE_ALL_DATES_WITH in data:
            kwargs["replacement_text_all_dates"] = data[
                ConfigKeys.REPLACE_ALL_DATES_WITH
            ]

        return NonspecificScrubber(
            hasher,
            replacement_text=replacement_text,
            denylist=denylist,
            **kwargs,
        )

    @staticmethod
    def _get_denylist(
        data: OrderedDict, hasher: GenericHasher
    ) -> Optional[WordList]:
        """
        Returns a :class:`crate_anon.anonymise.scrub.WordList` of words to be
        scrubbed.

        Returns ``None`` if ``data`` has no denylist entry. Denylisted words
        are replaced with the nonspecific replacement text.
        """
        try:
            denylist_data = data[ApiKeys.DENYLIST]
        except KeyError:
            return None

        # Only these denylist entries are passed directly to WordList.
        options = (ApiKeys.WORDS,)
        kwargs = {k: v for (k, v) in denylist_data.items() if k in options}
        kwargs["replacement_text"] = data[
            ConfigKeys.REPLACE_NONSPECIFIC_INFO_WITH
        ]

        # The caller supplies aliases; the real filenames come from the
        # server's settings.
        kwargs.update(
            filenames=ScrubSerializer._get_filenames(
                denylist_data[ApiKeys.FILES],
                ApiSettingsKeys.DENYLIST_FILENAMES,
            )
        )

        # TODO: None of these are currently configurable
        # from crate_anon/anonymise/config.py
        # Do we care about them here?
        # suffixes
        # at_word_boundaries_only (for regex_method=True)
        # max_errors
        # regex_method: True

        return WordList(hasher=hasher, **kwargs)

    @staticmethod
    def _add_values_to_scrubber(
        scrubber: PersonalizedScrubber, label: str, data: OrderedDict
    ) -> None:
        """
        Adds values to be scrubbed to either the patient or the third-party
        component of a scrubber.

        Args:
            scrubber: the scrubber to add values to.
            label: ``ApiKeys.PATIENT`` or ``ApiKeys.THIRD_PARTY``; selects
                which section of ``data`` is read and which component of the
                scrubber receives the values.
            data: mapping of field name to value for this request;
                ``data[label]`` maps each category name to a list of values.

        Raises:
            KeyError: if ``data[label]`` contains a category with no
                corresponding scrub method.
        """
        # Which scrub method applies to each category of value.
        method_lookup = {
            ApiKeys.DATES: ScrubMethod.DATE,
            ApiKeys.PHRASES: ScrubMethod.PHRASE,
            ApiKeys.NON_NUMERIC_PHRASES: ScrubMethod.PHRASE_UNLESS_NUMERIC,
            ApiKeys.WORDS: ScrubMethod.WORDS,
            ApiKeys.NUMBERS: ScrubMethod.NUMERIC,
            ApiKeys.CODES: ScrubMethod.CODE,
        }

        is_patient = label == ApiKeys.PATIENT

        for name, values in data[label].items():
            method = method_lookup[name]
            for value in values:
                scrubber.add_value(value, method, patient=is_patient)