"""
crate_anon/anonymise/constants.py
===============================================================================
Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
Created by Rudolf Cardinal (rnc1001@cam.ac.uk).
This file is part of CRATE.
CRATE is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
CRATE is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with CRATE. If not, see <https://www.gnu.org/licenses/>.
===============================================================================
**Shared constants for CRATE anonymiser.**
"""
import calendar
from enum import unique
from sqlalchemy import Integer
from cardinal_pythonlib.enumlike import StrEnum
from crate_anon.version import CRATE_VERSION, CRATE_VERSION_DATE
from crate_anon.nlp_manager.constants import DatabaseConfigKeys
# =============================================================================
# Logging
# =============================================================================
LOG_DATEFMT = "%Y-%m-%d %H:%M:%S"
LOG_COLORS = {
"DEBUG": "cyan",
"INFO": "green",
"WARNING": "yellow",
"ERROR": "red",
"CRITICAL": "red,bg_white",
}
# =============================================================================
# Cosmetic
# =============================================================================
BIGSEP = "=" * 20 + " "
SEP = "-" * 20 + " "
# =============================================================================
# Defaults for command-line options
# =============================================================================
DEFAULT_REPORT_EVERY = 100000 # 100k
DEFAULT_CHUNKSIZE = 100000 # 100k
# =============================================================================
# Environment
# =============================================================================
ANON_CONFIG_ENV_VAR = "CRATE_ANON_CONFIG"
# =============================================================================
# Data dictionary
# =============================================================================
DATEFORMAT_ISO8601 = "%Y-%m-%dT%H:%M:%S%z" # e.g. 2013-07-24T20:04:07+0100
DEFAULT_INDEX_LEN = 20 # for data types where it's mandatory
LONGTEXT = "LONGTEXT"
MAX_PID_STR = "9" * 10 # e.g. NHS numbers are 10-digit
# Better overall than string.maketrans:
ODD_CHARS_TRANSLATE = [chr(x) for x in range(0, 256)]
for c in "()/ ":
ODD_CHARS_TRANSLATE[ord(c)] = "_"
for i in range(0, 32):
ODD_CHARS_TRANSLATE[i] = "_"
for i in range(127, 256):
ODD_CHARS_TRANSLATE[i] = "_"
ODD_CHARS_TRANSLATE = "".join(ODD_CHARS_TRANSLATE)
TridType = Integer
MAX_TRID = 2**31 - 1
# https://dev.mysql.com/doc/refman/5.0/en/numeric-type-overview.html
# Maximum INT UNSIGNED is 4294967295 == 2 ** 32 - 1.
# INT range is -2147483648 == -(2 ** 31) to
# +2147483647 == 2 ** 31 - 1 == 2.1 billion
# ... note that this is inadequate for 10-digit NHS numbers.
# Maximum BIGINT UNSIGNED is 18446744073709551615 == 2 ** 64 - 1.
# BIGINT range is -9223372036854775808 == -(2 ** 63) to
# +9223372036854775807 == 2 ** 64 - 1
# When scrub_all_dates is True and the replacement text is a date format
# string, allow these directives.
# https://docs.python.org/3/library/datetime.html#strftime-strptime-behavior
DATE_BLURRING_DIRECTIVES = (
"b", # Month as locale's abbreviated name
"B", # Month as locale's full name
"m", # Month as zero-padded decimal number
"Y", # Year with century as decimal number
"y", # Year without century as zero-padded decimal number
# Among things that are not currently supported: %% (literal %).
)
DATE_BLURRING_DIRECTIVES_CSV = ", ".join(
[f"%{d}" for d in DATE_BLURRING_DIRECTIVES]
)
# https://stackoverflow.com/questions/3418050/month-name-to-month-number-and-vice-versa-in-python
MONTH_3_LETTER_INDEX = {
# See _month_word_regex_fragment() in anonregex.py
# Assuming this may not be the same as calendar.month_abbr in some locales
month[:3]: index
for index, month in enumerate(calendar.month_name)
if month
}
[docs]@unique
class AlterMethodType(StrEnum):
BINARY_TO_TEXT = "binary_to_text"
FILENAME_FORMAT_TO_TEXT = "filename_format_to_text" # new in v0.18.18
FILENAME_TO_TEXT = "filename_to_text"
HASH = "hash"
# HTML_ESCAPE = "html_escape"
HTML_UNESCAPE = "html_unescape"
HTML_UNTAG = "html_untag"
SCRUBIN = "scrub"
SKIP_IF_TEXT_EXTRACT_FAILS = "skip_if_extract_fails"
TRUNCATEDATE = "truncate_date"
[docs]@unique
class Decision(StrEnum):
OMIT = "OMIT"
INCLUDE = "include"
[docs]@unique
class IndexType(StrEnum):
NONE = ""
NORMAL = "I"
UNIQUE = "U"
FULLTEXT = "F"
[docs]@unique
class ScrubMethod(StrEnum):
WORDS = "words"
PHRASE = "phrase"
PHRASE_UNLESS_NUMERIC = "phrase_unless_numeric"
NUMERIC = "number"
DATE = "date"
CODE = "code"
[docs]@unique
class ScrubSrc(StrEnum):
PATIENT = "patient"
THIRDPARTY = "thirdparty"
THIRDPARTY_XREF_PID = "thirdparty_xref_pid"
[docs]@unique
class SrcFlag(StrEnum):
PK = "K"
NOT_NULL = "N"
ADD_SRC_HASH = "H"
PRIMARY_PID = "P"
DEFINES_PRIMARY_PIDS = "*"
MASTER_PID = "M"
CONSTANT = "C"
ADDITION_ONLY = "A"
OPT_OUT = "!"
REQUIRED_SCRUBBER = "R"
# =============================================================================
# Databases
# =============================================================================
CHARSET = "utf8"
TABLE_KWARGS = {
# MySQL:
"mysql_charset": CHARSET,
"mysql_engine": "InnoDB",
}
COMMENT = "comment"
MYSQL_MAX_IDENTIFIER_LENGTH = 64
# MySQL: 64 -- http://dev.mysql.com/doc/refman/5.7/en/identifiers.html
SQLSERVER_MAX_IDENTIFIER_LENGTH = 128
# Microsoft SQL Server: 128 --
# https://docs.microsoft.com/en-us/sql/relational-databases/databases/database-identifiers # noqa
# =============================================================================
# Config keys
# =============================================================================
class AnonymiseConfigKeys:
# Sections
SECTION_MAIN = "main"
SECTION_EXTRA_REGEXES = "extra_regexes"
# Data dictionary
DATA_DICTIONARY_FILENAME = "data_dictionary_filename"
# Critical field types
SQLATYPE_MPID = "sqlatype_mpid"
SQLATYPE_PID = "sqlatype_pid"
# Encryption phrases/passwords
CHANGE_DETECTION_ENCRYPTION_PHRASE = "change_detection_encryption_phrase"
EXTRA_HASH_CONFIG_SECTIONS = "extra_hash_config_sections"
HASH_METHOD = "hash_method"
MASTER_PATIENT_ID_ENCRYPTION_PHRASE = "master_patient_id_encryption_phrase"
PER_TABLE_PATIENT_ID_ENCRYPTION_PHRASE = (
"per_table_patient_id_encryption_phrase"
)
# Text extraction
EXTRACT_TEXT_EXTENSIONS_CASE_SENSITIVE = (
"extract_text_extensions_case_sensitive"
)
EXTRACT_TEXT_EXTENSIONS_PERMITTED = "extract_text_extensions_permitted"
EXTRACT_TEXT_EXTENSIONS_PROHIBITED = "extract_text_extensions_prohibited"
EXTRACT_TEXT_PLAIN = "extract_text_plain"
EXTRACT_TEXT_WIDTH = "extract_text_width"
# Anonymisation
ALLOWLIST_FILENAMES = "allowlist_filenames"
ALLOW_NO_PATIENT_INFO = "allow_no_patient_info"
ANONYMISE_CODES_AT_WORD_BOUNDARIES_ONLY = (
"anonymise_codes_at_word_boundaries_only"
)
ANONYMISE_CODES_AT_NUMERIC_BOUNDARIES_ONLY = (
"anonymise_codes_at_numeric_boundaries_only"
)
ANONYMISE_DATES_AT_WORD_BOUNDARIES_ONLY = (
"anonymise_dates_at_word_boundaries_only"
)
ANONYMISE_NUMBERS_AT_NUMERIC_BOUNDARIES_ONLY = (
"anonymise_numbers_at_numeric_boundaries_only"
)
ANONYMISE_NUMBERS_AT_WORD_BOUNDARIES_ONLY = (
"anonymise_numbers_at_word_boundaries_only"
)
ANONYMISE_STRINGS_AT_WORD_BOUNDARIES_ONLY = (
"anonymise_strings_at_word_boundaries_only"
)
DENYLIST_FILENAMES = "denylist_filenames"
DENYLIST_FILES_AS_PHRASES = "denylist_files_as_phrases"
DENYLIST_USE_REGEX = "denylist_use_regex"
DEPRECATED_BLACKLIST_FILENAMES = "blacklist_filenames"
DEPRECATED_WHITELIST_FILENAMES = "whitelist_filenames"
MIN_STRING_LENGTH_FOR_ERRORS = "min_string_length_for_errors"
MIN_STRING_LENGTH_TO_SCRUB_WITH = "min_string_length_to_scrub_with"
NONSPECIFIC_SCRUBBER_FIRST = "nonspecific_scrubber_first"
PHRASE_ALTERNATIVE_WORD_FILENAMES = "phrase_alternative_word_filenames"
REPLACE_ALL_DATES_WITH = "replace_all_dates_with"
REPLACE_NONSPECIFIC_INFO_WITH = "replace_nonspecific_info_with"
REPLACE_PATIENT_INFO_WITH = "replace_patient_info_with"
REPLACE_THIRD_PARTY_INFO_WITH = "replace_third_party_info_with"
SCRUB_ALL_DATES = "scrub_all_dates"
SCRUB_ALL_EMAIL_ADDRESSES = "scrub_all_email_addresses"
SCRUB_ALL_NUMBERS_OF_N_DIGITS = "scrub_all_numbers_of_n_digits"
SCRUB_ALL_UK_POSTCODES = "scrub_all_uk_postcodes"
SCRUB_STRING_SUFFIXES = "scrub_string_suffixes"
STRING_MAX_REGEX_ERRORS = "string_max_regex_errors"
THIRDPARTY_XREF_MAX_DEPTH = "thirdparty_xref_max_depth"
TIMEFIELD_NAME = "timefield_name"
# Output fields and formatting
RESEARCH_ID_FIELDNAME = "research_id_fieldname"
TRID_FIELDNAME = "trid_fieldname"
MASTER_RESEARCH_ID_FIELDNAME = "master_research_id_fieldname"
ADD_MRID_WHEREVER_RID_ADDED = "add_mrid_wherever_rid_added"
SOURCE_HASH_FIELDNAME = "source_hash_fieldname"
# Destination database configuration
MAX_ROWS_BEFORE_COMMIT = "max_rows_before_commit"
MAX_BYTES_BEFORE_COMMIT = "max_bytes_before_commit"
TEMPORARY_TABLENAME = "temporary_tablename"
# Databases
ADMIN_DATABASE = "admin_database"
DESTINATION_DATABASE = "destination_database"
SOURCE_DATABASES = "source_databases"
# Processing options
DEBUG_MAX_N_PATIENTS = "debug_max_n_patients"
DEBUG_PID_LIST = "debug_pid_list"
# Opting out
OPTOUT_COL_VALUES = "optout_col_values"
OPTOUT_MPID_FILENAMES = "optout_mpid_filenames"
OPTOUT_PID_FILENAMES = "optout_pid_filenames"
class AnonymiseConfigDefaults:
# Critical field types
SQLATYPE_MPID = "BigInteger"
SQLATYPE_PID = "BigInteger"
# Encryption phrases/passwords
HASH_METHOD = "HMAC_MD5"
# Text extraction
EXTRACT_TEXT_EXTENSIONS_CASE_SENSITIVE = False
EXTRACT_TEXT_PLAIN = True
EXTRACT_TEXT_WIDTH = 80
# Anonymisation
ALLOW_NO_PATIENT_INFO = False
ANONYMISE_CODES_AT_WORD_BOUNDARIES_ONLY = True
ANONYMISE_CODES_AT_NUMERIC_BOUNDARIES_ONLY = True
ANONYMISE_DATES_AT_WORD_BOUNDARIES_ONLY = True
ANONYMISE_NUMBERS_AT_NUMERIC_BOUNDARIES_ONLY = True
ANONYMISE_NUMBERS_AT_WORD_BOUNDARIES_ONLY = False
ANONYMISE_STRINGS_AT_WORD_BOUNDARIES_ONLY = True
DENYLIST_FILES_AS_PHRASES = False
DENYLIST_USE_REGEX = False
MIN_STRING_LENGTH_FOR_ERRORS = 3
MIN_STRING_LENGTH_TO_SCRUB_WITH = 2
NONSPECIFIC_SCRUBBER_FIRST = False
REPLACE_ALL_DATES_WITH = "[~~~]"
REPLACE_NONSPECIFIC_INFO_WITH = "[~~~]"
REPLACE_PATIENT_INFO_WITH = "[__PPP__]"
REPLACE_THIRD_PARTY_INFO_WITH = "[__TTT__]"
SCRUB_ALL_DATES = False
SCRUB_ALL_EMAIL_ADDRESSES = False
SCRUB_ALL_UK_POSTCODES = False
STRING_MAX_REGEX_ERRORS = 0
THIRDPARTY_XREF_MAX_DEPTH = 1
TIMEFIELD_NAME = "_when_processed_utc"
# Output fields and formatting
RESEARCH_ID_FIELDNAME = "rid"
TRID_FIELDNAME = "trid"
MASTER_RESEARCH_ID_FIELDNAME = "mrid"
ADD_MRID_WHEREVER_RID_ADDED = True
SOURCE_HASH_FIELDNAME = "_src_hash"
# Destination database configuration
MAX_ROWS_BEFORE_COMMIT = 1000
MAX_BYTES_BEFORE_COMMIT = 80 * 1024 * 1024 # 80 Mb
TEMPORARY_TABLENAME = "_crate_temp_table"
# Processing options
DEBUG_MAX_N_PATIENTS = 0
[docs]class AnonymiseDatabaseSafeConfigKeys:
"""
Non-sensitive config keys relating to a specific database.
"""
DDGEN_ADD_PER_TABLE_PIDS_TO_SCRUBBER = (
"ddgen_add_per_table_pids_to_scrubber"
)
DDGEN_ADDITION_ONLY = "ddgen_addition_only"
DDGEN_ADDITION_ONLY_TABLES = "ddgen_addition_only_tables"
DDGEN_ALLOW_FULLTEXT_INDEXING = "ddgen_allow_fulltext_indexing"
DDGEN_APPEND_SOURCE_INFO_TO_COMMENT = "ddgen_append_source_info_to_comment"
DDGEN_BINARY_TO_TEXT_FIELD_PAIRS = "ddgen_binary_to_text_field_pairs"
DDGEN_CONSTANT_CONTENT = "ddgen_constant_content"
DDGEN_CONSTANT_CONTENT_TABLES = "ddgen_constant_content_tables"
DDGEN_CONVERT_ODD_CHARS_TO_UNDERSCORE = (
"ddgen_convert_odd_chars_to_underscore"
)
DDGEN_DELETION_POSSIBLE_TABLES = "ddgen_deletion_possible_tables"
DDGEN_EXTRA_HASH_FIELDS = "ddgen_extra_hash_fields"
DDGEN_FIELD_ALLOWLIST = "ddgen_field_allowlist"
DDGEN_FIELD_DENYLIST = "ddgen_field_denylist"
DDGEN_FILENAME_TO_TEXT_FIELDS = "ddgen_filename_to_text_fields"
DDGEN_FORCE_LOWER_CASE = "ddgen_force_lower_case"
DDGEN_FREETEXT_INDEX_MIN_LENGTH = "ddgen_freetext_index_min_length"
DDGEN_INCLUDE_FIELDS = "ddgen_include_fields"
DDGEN_INDEX_FIELDS = "ddgen_index_fields"
DDGEN_MASTER_PID_FIELDNAME = "ddgen_master_pid_fieldname"
DDGEN_MIN_LENGTH_FOR_SCRUBBING = "ddgen_min_length_for_scrubbing"
DDGEN_NONCONSTANT_CONTENT_TABLES = "ddgen_nonconstant_content_tables"
DDGEN_OMIT_BY_DEFAULT = "ddgen_omit_by_default"
DDGEN_OMIT_FIELDS = "ddgen_omit_fields"
DDGEN_PATIENT_OPT_OUT_FIELDS = "ddgen_patient_opt_out_fields"
DDGEN_PER_TABLE_PID_FIELD = "ddgen_per_table_pid_field"
DDGEN_PID_DEFINING_FIELDNAMES = "ddgen_pid_defining_fieldnames"
DDGEN_PK_FIELDS = "ddgen_pk_fields"
DDGEN_PREFER_ORIGINAL_PK = "ddgen_prefer_original_pk"
DDGEN_RENAME_TABLES_REMOVE_SUFFIXES = "ddgen_rename_tables_remove_suffixes"
DDGEN_REQUIRED_SCRUBSRC_FIELDS = "ddgen_required_scrubsrc_fields"
DDGEN_SAFE_FIELDS_EXEMPT_FROM_SCRUBBING = (
"ddgen_safe_fields_exempt_from_scrubbing"
)
DDGEN_SCRUBMETHOD_CODE_FIELDS = "ddgen_scrubmethod_code_fields"
DDGEN_SCRUBMETHOD_DATE_FIELDS = "ddgen_scrubmethod_date_fields"
DDGEN_SCRUBMETHOD_NUMBER_FIELDS = "ddgen_scrubmethod_number_fields"
DDGEN_SCRUBMETHOD_PHRASE_FIELDS = "ddgen_scrubmethod_phrase_fields"
DDGEN_SCRUBSRC_PATIENT_FIELDS = "ddgen_scrubsrc_patient_fields"
DDGEN_SCRUBSRC_THIRDPARTY_FIELDS = "ddgen_scrubsrc_thirdparty_fields"
DDGEN_SCRUBSRC_THIRDPARTY_XREF_PID_FIELDS = (
"ddgen_scrubsrc_thirdparty_xref_pid_fields"
)
DDGEN_SKIP_ROW_IF_EXTRACT_TEXT_FAILS_FIELDS = (
"ddgen_skip_row_if_extract_text_fails_fields"
)
DDGEN_TABLE_ALLOWLIST = "ddgen_table_allowlist"
DDGEN_TABLE_DEFINES_PIDS = "ddgen_table_defines_pids"
DDGEN_TABLE_DENYLIST = "ddgen_table_denylist"
DDGEN_TABLE_REQUIRE_FIELD_ABSOLUTE = "ddgen_table_require_field_absolute"
DDGEN_TABLE_REQUIRE_FIELD_CONDITIONAL = (
"ddgen_table_require_field_conditional"
)
DDGEN_TRUNCATE_DATE_FIELDS = "ddgen_truncate_date_fields"
DEBUG_LIMITED_TABLES = "debug_limited_tables"
DEBUG_ROW_LIMIT = "debug_row_limit"
DEPRECATED_DDGEN_FIELD_BLACKLIST = "ddgen_field_blacklist"
DEPRECATED_DDGEN_FIELD_WHITELIST = "ddgen_field_whitelist"
DEPRECATED_DDGEN_TABLE_BLACKLIST = "ddgen_table_blacklist"
DEPRECATED_DDGEN_TABLE_WHITELIST = "ddgen_table_whitelist"
[docs]class AnonymiseDatabaseSafeConfigDefaults:
"""
Defaults for the keys above
"""
DDGEN_ADD_PER_TABLE_PIDS_TO_SCRUBBER = False
DDGEN_ADDITION_ONLY = False
DDGEN_ALLOW_FULLTEXT_INDEXING = True
DDGEN_APPEND_SOURCE_INFO_TO_COMMENT = True
DDGEN_CONSTANT_CONTENT = False
DDGEN_CONVERT_ODD_CHARS_TO_UNDERSCORE = True
DDGEN_FORCE_LOWER_CASE = False
DDGEN_FREETEXT_INDEX_MIN_LENGTH = 1000
DDGEN_MIN_LENGTH_FOR_SCRUBBING = 50
DDGEN_OMIT_BY_DEFAULT = True
DDGEN_PREFER_ORIGINAL_PK = False
DEBUG_ROW_LIMIT = 0
class AnonymiseColumnComments:
TIMEFIELD_COMMENT = "Date/time that CRATE processed the source row (UTC)"
[docs]class HashConfigKeys:
"""
Config file keys for defining extra hashers.
"""
HASH_METHOD = "hash_method"
SECRET_KEY = "secret_key"
# =============================================================================
# Demo config
# =============================================================================
# This does not need to vary with Docker status.
_AK = AnonymiseConfigKeys
_DA = AnonymiseConfigDefaults
_DK = DatabaseConfigKeys
_SK = AnonymiseDatabaseSafeConfigKeys
_DS = AnonymiseDatabaseSafeConfigDefaults
# noinspection PyPep8
DEMO_CONFIG = rf"""# Configuration file for CRATE anonymiser (crate_anonymise).
# Version {CRATE_VERSION} ({CRATE_VERSION_DATE}).
#
# SEE HELP FOR DETAILS.
# =============================================================================
# Main settings
# =============================================================================
[{_AK.SECTION_MAIN}]
# -----------------------------------------------------------------------------
# Data dictionary
# -----------------------------------------------------------------------------
{_AK.DATA_DICTIONARY_FILENAME} = @@data_dictionary_filename@@
# -----------------------------------------------------------------------------
# Critical field types
# -----------------------------------------------------------------------------
{_AK.SQLATYPE_PID} =
{_AK.SQLATYPE_MPID} =
# -----------------------------------------------------------------------------
# Encryption phrases/passwords
# -----------------------------------------------------------------------------
{_AK.HASH_METHOD} = {_DA.HASH_METHOD}
{_AK.PER_TABLE_PATIENT_ID_ENCRYPTION_PHRASE} = @@per_table_patient_id_encryption_phrase@@
{_AK.MASTER_PATIENT_ID_ENCRYPTION_PHRASE} = @@master_patient_id_encryption_phrase@@
{_AK.CHANGE_DETECTION_ENCRYPTION_PHRASE} = @@change_detection_encryption_phrase@@
{_AK.EXTRA_HASH_CONFIG_SECTIONS} =
# -----------------------------------------------------------------------------
# Text extraction
# -----------------------------------------------------------------------------
{_AK.EXTRACT_TEXT_EXTENSIONS_PERMITTED} =
{_AK.EXTRACT_TEXT_EXTENSIONS_PROHIBITED} =
{_AK.EXTRACT_TEXT_EXTENSIONS_CASE_SENSITIVE} = {_DA.EXTRACT_TEXT_EXTENSIONS_CASE_SENSITIVE}
{_AK.EXTRACT_TEXT_PLAIN} = {_DA.EXTRACT_TEXT_PLAIN}
{_AK.EXTRACT_TEXT_WIDTH} = {_DA.EXTRACT_TEXT_WIDTH}
# -----------------------------------------------------------------------------
# Anonymisation
# -----------------------------------------------------------------------------
{_AK.ALLOW_NO_PATIENT_INFO} = {_DA.ALLOW_NO_PATIENT_INFO}
{_AK.REPLACE_ALL_DATES_WITH} = {_DA.REPLACE_ALL_DATES_WITH}
{_AK.REPLACE_PATIENT_INFO_WITH} = {_DA.REPLACE_PATIENT_INFO_WITH}
{_AK.REPLACE_THIRD_PARTY_INFO_WITH} = {_DA.REPLACE_THIRD_PARTY_INFO_WITH}
{_AK.REPLACE_NONSPECIFIC_INFO_WITH} = {_DA.REPLACE_NONSPECIFIC_INFO_WITH}
{_AK.THIRDPARTY_XREF_MAX_DEPTH} = {_DA.THIRDPARTY_XREF_MAX_DEPTH}
{_AK.SCRUB_STRING_SUFFIXES} =
s
{_AK.STRING_MAX_REGEX_ERRORS} = {_DA.STRING_MAX_REGEX_ERRORS}
{_AK.MIN_STRING_LENGTH_FOR_ERRORS} = {_DA.MIN_STRING_LENGTH_FOR_ERRORS}
{_AK.MIN_STRING_LENGTH_TO_SCRUB_WITH} = {_DA.MIN_STRING_LENGTH_TO_SCRUB_WITH}
{_AK.ALLOWLIST_FILENAMES} =
{_AK.DENYLIST_FILENAMES} =
{_AK.DENYLIST_FILES_AS_PHRASES} = {_DA.DENYLIST_FILES_AS_PHRASES}
{_AK.DENYLIST_USE_REGEX} = {_DA.DENYLIST_USE_REGEX}
{_AK.PHRASE_ALTERNATIVE_WORD_FILENAMES} =
{_AK.SCRUB_ALL_DATES} = {_DA.SCRUB_ALL_DATES}
{_AK.SCRUB_ALL_EMAIL_ADDRESSES} = {_DA.SCRUB_ALL_EMAIL_ADDRESSES}
{_AK.SCRUB_ALL_NUMBERS_OF_N_DIGITS} =
{_AK.SCRUB_ALL_UK_POSTCODES} = {_DA.SCRUB_ALL_UK_POSTCODES}
{_AK.NONSPECIFIC_SCRUBBER_FIRST} = {_DA.NONSPECIFIC_SCRUBBER_FIRST}
{_AK.ANONYMISE_CODES_AT_WORD_BOUNDARIES_ONLY} = {_DA.ANONYMISE_CODES_AT_WORD_BOUNDARIES_ONLY}
{_AK.ANONYMISE_CODES_AT_NUMERIC_BOUNDARIES_ONLY} = {_DA.ANONYMISE_CODES_AT_NUMERIC_BOUNDARIES_ONLY}
{_AK.ANONYMISE_DATES_AT_WORD_BOUNDARIES_ONLY} = {_DA.ANONYMISE_DATES_AT_WORD_BOUNDARIES_ONLY}
{_AK.ANONYMISE_NUMBERS_AT_WORD_BOUNDARIES_ONLY} = {_DA.ANONYMISE_NUMBERS_AT_WORD_BOUNDARIES_ONLY}
{_AK.ANONYMISE_NUMBERS_AT_NUMERIC_BOUNDARIES_ONLY} = {_DA.ANONYMISE_NUMBERS_AT_NUMERIC_BOUNDARIES_ONLY}
{_AK.ANONYMISE_STRINGS_AT_WORD_BOUNDARIES_ONLY} = {_DA.ANONYMISE_STRINGS_AT_WORD_BOUNDARIES_ONLY}
# -----------------------------------------------------------------------------
# Output fields and formatting
# -----------------------------------------------------------------------------
{_AK.TIMEFIELD_NAME} = {_DA.TIMEFIELD_NAME}
{_AK.RESEARCH_ID_FIELDNAME} = {_DA.RESEARCH_ID_FIELDNAME}
{_AK.TRID_FIELDNAME} = {_DA.TRID_FIELDNAME}
{_AK.MASTER_RESEARCH_ID_FIELDNAME} = {_DA.MASTER_RESEARCH_ID_FIELDNAME}
{_AK.SOURCE_HASH_FIELDNAME} = {_DA.SOURCE_HASH_FIELDNAME}
# -----------------------------------------------------------------------------
# Destination database configuration
# See the [destination_database] section for connection details.
# -----------------------------------------------------------------------------
{_AK.MAX_ROWS_BEFORE_COMMIT} = {_DA.MAX_ROWS_BEFORE_COMMIT}
{_AK.MAX_BYTES_BEFORE_COMMIT} = {_DA.MAX_BYTES_BEFORE_COMMIT}
{_AK.TEMPORARY_TABLENAME} = {_DA.TEMPORARY_TABLENAME}
# -----------------------------------------------------------------------------
# Choose databases (defined in their own sections).
# -----------------------------------------------------------------------------
{_AK.SOURCE_DATABASES} =
sourcedb1
# sourcedb2
{_AK.DESTINATION_DATABASE} = destination_database
{_AK.ADMIN_DATABASE} = admin_database
# -----------------------------------------------------------------------------
# PROCESSING OPTIONS, TO LIMIT DATA QUANTITY FOR TESTING
# -----------------------------------------------------------------------------
{_AK.DEBUG_MAX_N_PATIENTS} =
{_AK.DEBUG_PID_LIST} =
# -----------------------------------------------------------------------------
# Opting out entirely
# -----------------------------------------------------------------------------
{_AK.OPTOUT_PID_FILENAMES} =
{_AK.OPTOUT_MPID_FILENAMES} =
{_AK.OPTOUT_COL_VALUES} =
# =============================================================================
# Extra regular expression patterns you wish to be scrubbed from the text
# as nonspecific information. See help.
# =============================================================================
[{_AK.SECTION_EXTRA_REGEXES}]
# =============================================================================
# Destination database details. User should have WRITE access.
# =============================================================================
[destination_database]
{_DK.URL} = @@dest_db_url@@
# =============================================================================
# Administrative database. User should have WRITE access.
# =============================================================================
[admin_database]
{_DK.URL} = @@admin_db_url@@
# =============================================================================
# SOURCE DATABASE DETAILS BELOW HERE.
# User should have READ access only for safety.
# =============================================================================
# -----------------------------------------------------------------------------
# Source database example 1
# -----------------------------------------------------------------------------
[sourcedb1]
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# CONNECTION DETAILS
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
{_DK.URL} = @@source_db1_url@@
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# INPUT FIELDS, FOR THE AUTOGENERATION OF DATA DICTIONARIES
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
{_SK.DDGEN_OMIT_BY_DEFAULT} = {_DS.DDGEN_OMIT_BY_DEFAULT}
{_SK.DDGEN_OMIT_FIELDS} =
{_SK.DDGEN_INCLUDE_FIELDS} = @@source_db1_ddgen_include_fields@@
{_SK.DDGEN_PER_TABLE_PID_FIELD} = patient_id
{_SK.DDGEN_TABLE_DEFINES_PIDS} = patient
{_SK.DDGEN_ADD_PER_TABLE_PIDS_TO_SCRUBBER} = {_DS.DDGEN_ADD_PER_TABLE_PIDS_TO_SCRUBBER}
{_SK.DDGEN_MASTER_PID_FIELDNAME} = nhsnum
{_SK.DDGEN_TABLE_DENYLIST} =
{_SK.DDGEN_TABLE_ALLOWLIST} =
{_SK.DDGEN_TABLE_REQUIRE_FIELD_ABSOLUTE} =
{_SK.DDGEN_TABLE_REQUIRE_FIELD_CONDITIONAL} =
{_SK.DDGEN_FIELD_DENYLIST} =
{_SK.DDGEN_FIELD_ALLOWLIST} =
{_SK.DDGEN_PK_FIELDS} =
{_SK.DDGEN_PREFER_ORIGINAL_PK} = {_DS.DDGEN_PREFER_ORIGINAL_PK}
{_SK.DDGEN_CONSTANT_CONTENT} = {_DS.DDGEN_CONSTANT_CONTENT}
{_SK.DDGEN_CONSTANT_CONTENT_TABLES} =
{_SK.DDGEN_NONCONSTANT_CONTENT_TABLES} =
{_SK.DDGEN_ADDITION_ONLY} = {_DS.DDGEN_ADDITION_ONLY}
{_SK.DDGEN_ADDITION_ONLY_TABLES} =
{_SK.DDGEN_DELETION_POSSIBLE_TABLES} =
{_SK.DDGEN_PID_DEFINING_FIELDNAMES} =
{_SK.DDGEN_SCRUBSRC_PATIENT_FIELDS} = @@source_db1_ddgen_scrubsrc_patient_fields@@
{_SK.DDGEN_SCRUBSRC_THIRDPARTY_FIELDS} =
{_SK.DDGEN_SCRUBSRC_THIRDPARTY_XREF_PID_FIELDS} =
{_SK.DDGEN_REQUIRED_SCRUBSRC_FIELDS} =
{_SK.DDGEN_SCRUBMETHOD_CODE_FIELDS} =
{_SK.DDGEN_SCRUBMETHOD_DATE_FIELDS} =
{_SK.DDGEN_SCRUBMETHOD_NUMBER_FIELDS} =
{_SK.DDGEN_SCRUBMETHOD_PHRASE_FIELDS} =
{_SK.DDGEN_SAFE_FIELDS_EXEMPT_FROM_SCRUBBING} =
{_SK.DDGEN_MIN_LENGTH_FOR_SCRUBBING} = {_DS.DDGEN_MIN_LENGTH_FOR_SCRUBBING}
{_SK.DDGEN_TRUNCATE_DATE_FIELDS} =
{_SK.DDGEN_FILENAME_TO_TEXT_FIELDS} =
{_SK.DDGEN_BINARY_TO_TEXT_FIELD_PAIRS} =
{_SK.DDGEN_SKIP_ROW_IF_EXTRACT_TEXT_FAILS_FIELDS} =
{_SK.DDGEN_RENAME_TABLES_REMOVE_SUFFIXES} =
{_SK.DDGEN_PATIENT_OPT_OUT_FIELDS} =
{_SK.DDGEN_EXTRA_HASH_FIELDS} =
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# DESTINATION INDEXING
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
{_SK.DDGEN_INDEX_FIELDS} =
{_SK.DDGEN_ALLOW_FULLTEXT_INDEXING} = {_DS.DDGEN_ALLOW_FULLTEXT_INDEXING}
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# DATA DICTIONARY MANIPULATION TO DESTINATION TABLE/FIELD NAMES
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
{_SK.DDGEN_FORCE_LOWER_CASE} = {_DS.DDGEN_FORCE_LOWER_CASE}
{_SK.DDGEN_CONVERT_ODD_CHARS_TO_UNDERSCORE} = {_DS.DDGEN_CONVERT_ODD_CHARS_TO_UNDERSCORE}
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# PROCESSING OPTIONS, TO LIMIT DATA QUANTITY FOR TESTING
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
{_SK.DEBUG_ROW_LIMIT} =
{_SK.DEBUG_LIMITED_TABLES} =
# -----------------------------------------------------------------------------
# Source database example 2
# -----------------------------------------------------------------------------
[mysourcedb2]
{_DK.URL} = mysql+mysqldb://username:password@127.0.0.1:3306/source2_databasename?charset=utf8
{_SK.DDGEN_FORCE_LOWER_CASE} = {_DS.DDGEN_FORCE_LOWER_CASE}
{_SK.DDGEN_APPEND_SOURCE_INFO_TO_COMMENT} = {_DS.DDGEN_APPEND_SOURCE_INFO_TO_COMMENT}
{_SK.DDGEN_PER_TABLE_PID_FIELD} = patient_id
{_SK.DDGEN_MASTER_PID_FIELDNAME} = nhsnum
{_SK.DDGEN_TABLE_DENYLIST} =
{_SK.DDGEN_FIELD_DENYLIST} =
{_SK.DDGEN_TABLE_REQUIRE_FIELD_ABSOLUTE} =
{_SK.DDGEN_TABLE_REQUIRE_FIELD_CONDITIONAL} =
{_SK.DDGEN_PK_FIELDS} =
{_SK.DDGEN_PREFER_ORIGINAL_PK} = {_DS.DDGEN_PREFER_ORIGINAL_PK}
{_SK.DDGEN_CONSTANT_CONTENT} = {_DS.DDGEN_CONSTANT_CONTENT}
{_SK.DDGEN_SCRUBSRC_PATIENT_FIELDS} =
{_SK.DDGEN_SCRUBSRC_THIRDPARTY_FIELDS} =
{_SK.DDGEN_SCRUBMETHOD_CODE_FIELDS} =
{_SK.DDGEN_SCRUBMETHOD_DATE_FIELDS} =
{_SK.DDGEN_SCRUBMETHOD_NUMBER_FIELDS} =
{_SK.DDGEN_SCRUBMETHOD_PHRASE_FIELDS} =
{_SK.DDGEN_SAFE_FIELDS_EXEMPT_FROM_SCRUBBING} =
{_SK.DDGEN_MIN_LENGTH_FOR_SCRUBBING} = {_DS.DDGEN_MIN_LENGTH_FOR_SCRUBBING}
{_SK.DDGEN_TRUNCATE_DATE_FIELDS} =
{_SK.DDGEN_FILENAME_TO_TEXT_FIELDS} =
{_SK.DDGEN_BINARY_TO_TEXT_FIELD_PAIRS} =
# -----------------------------------------------------------------------------
# Source database example 3
# -----------------------------------------------------------------------------
[camcops]
# Example for the CamCOPS anonymisation staging database
{_DK.URL} = mysql+mysqldb://username:password@127.0.0.1:3306/camcops_databasename?charset=utf8
# FOR EXAMPLE:
{_SK.DDGEN_FORCE_LOWER_CASE} = False
{_SK.DDGEN_PER_TABLE_PID_FIELD} = _patient_idnum1
{_SK.DDGEN_PID_DEFINING_FIELDNAMES} = _patient_idnum1
{_SK.DDGEN_MASTER_PID_FIELDNAME} = _patient_idnum2
{_SK.DDGEN_TABLE_DENYLIST} =
{_SK.DDGEN_FIELD_DENYLIST} = _patient_iddesc1
_patient_idshortdesc1
_patient_iddesc2
_patient_idshortdesc2
_patient_iddesc3
_patient_idshortdesc3
_patient_iddesc4
_patient_idshortdesc4
_patient_iddesc5
_patient_idshortdesc5
_patient_iddesc6
_patient_idshortdesc6
_patient_iddesc7
_patient_idshortdesc7
_patient_iddesc8
_patient_idshortdesc8
id
patient_id
_device
_era
_current
_when_removed_exact
_when_removed_batch_utc
_removing_user
_preserving_user
_forcibly_preserved
_predecessor_pk
_successor_pk
_manually_erased
_manually_erased_at
_manually_erasing_user
_addition_pending
_removal_pending
_move_off_tablet
{_SK.DDGEN_TABLE_REQUIRE_FIELD_ABSOLUTE} =
{_SK.DDGEN_TABLE_REQUIRE_FIELD_CONDITIONAL} =
{_SK.DDGEN_PK_FIELDS} = _pk
{_SK.DDGEN_PREFER_ORIGINAL_PK} = {_DS.DDGEN_PREFER_ORIGINAL_PK}
{_SK.DDGEN_CONSTANT_CONTENT} = False
{_SK.DDGEN_SCRUBSRC_PATIENT_FIELDS} = _patient_forename
_patient_surname
_patient_dob
_patient_idnum1
_patient_idnum2
_patient_idnum3
_patient_idnum4
_patient_idnum5
_patient_idnum6
_patient_idnum7
_patient_idnum8
{_SK.DDGEN_SCRUBSRC_THIRDPARTY_FIELDS} =
{_SK.DDGEN_SCRUBMETHOD_CODE_FIELDS} =
{_SK.DDGEN_SCRUBMETHOD_DATE_FIELDS} = _patient_dob
{_SK.DDGEN_SCRUBMETHOD_NUMBER_FIELDS} =
{_SK.DDGEN_SCRUBMETHOD_PHRASE_FIELDS} =
{_SK.DDGEN_SAFE_FIELDS_EXEMPT_FROM_SCRUBBING} = _device
_era
_when_added_exact
_adding_user
_when_removed_exact
_removing_user
_preserving_user
_manually_erased_at
_manually_erasing_user
when_last_modified
when_created
when_firstexit
clinician_specialty
clinician_name
clinician_post
clinician_professional_registration
clinician_contact_details
# ... now some task-specific ones
bdi_scale
pause_start_time
pause_end_time
trial_start_time
cue_start_time
target_start_time
detection_start_time
iti_start_time
iti_end_time
trial_end_time
response_time
target_time
choice_time
discharge_date
discharge_reason_code
diagnosis_psych_1_icd10code
diagnosis_psych_1_description
diagnosis_psych_2_icd10code
diagnosis_psych_2_description
diagnosis_psych_3_icd10code
diagnosis_psych_3_description
diagnosis_psych_4_icd10code
diagnosis_psych_4_description
diagnosis_medical_1
diagnosis_medical_2
diagnosis_medical_3
diagnosis_medical_4
category_start_time
category_response_time
category_chosen
gamble_fixed_option
gamble_lottery_option_p
gamble_lottery_option_q
gamble_start_time
gamble_response_time
likelihood
{_SK.DDGEN_MIN_LENGTH_FOR_SCRUBBING} = {_DS.DDGEN_MIN_LENGTH_FOR_SCRUBBING}
{_SK.DDGEN_TRUNCATE_DATE_FIELDS} = _patient_dob
{_SK.DDGEN_FILENAME_TO_TEXT_FIELDS} =
{_SK.DDGEN_BINARY_TO_TEXT_FIELD_PAIRS} =
""" # noqa
# For the style:
# [source_databases]
# source1 = blah
# source2 = thing
# ... you can't have multiple keys with the same name.
# https://stackoverflow.com/questions/287757