# Source code for crate_anon.linkage.constants

r"""
crate_anon/linkage/constants.py

===============================================================================

    Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
    Created by Rudolf Cardinal (rnc1001@cam.ac.uk).

    This file is part of CRATE.

    CRATE is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    CRATE is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with CRATE. If not, see <https://www.gnu.org/licenses/>.

===============================================================================

**Constants for linkage tools.**

"""

# =============================================================================
# Imports
# =============================================================================

import math
from multiprocessing import cpu_count
import os
import platform
from typing import Dict

import appdirs
from cardinal_pythonlib.hash import HashMethods
from cardinal_pythonlib.probability import probability_from_log_odds

from crate_anon.common.constants import EnvVar


# =============================================================================
# Helper functions
# =============================================================================


def _mk_dictstr(x: Dict[str, float]) -> str:
    return ",".join(f"{k}:{v}" for k, v in x.items())


# =============================================================================
# Constants
# =============================================================================

# CHECK_BASIC_ASSERTIONS_IN_HIGH_SPEED_FUNCTIONS = False  # for debugging only

# Floating-point infinities, and the type of None.
INFINITY = math.inf
MINUS_INFINITY = -INFINITY
NONE_TYPE = type(None)

# Calendar conversions. These are averages, not exact values.
MONTHS_PER_YEAR = 12
DAYS_PER_YEAR = 365.25  # approximately!
DAYS_PER_MONTH = DAYS_PER_YEAR / MONTHS_PER_YEAR  # on average

# Absolute path of the directory containing this module.
THIS_DIR = os.path.dirname(os.path.abspath(__file__))

# Population figures.
UK_MEAN_OA_POPULATION_2011 = 309  # not used any more! Left here for interest.
# ... https://www.ons.gov.uk/methodology/geography/ukgeographies/censusgeography  # noqa
UK_POPULATION_2017 = 66_040_000  # 2017 figure, 66.04m
CAMBS_PBORO_POPULATION_2018 = 852_523

# Gender codes: three standard codes, plus the empty string for "missing".
GENDER_MALE = "M"
GENDER_FEMALE = "F"
GENDER_OTHER = "X"
GENDER_MISSING = ""
VALID_GENDERS = [
    GENDER_MISSING,
    GENDER_MALE,
    GENDER_FEMALE,
    GENDER_OTHER,
]


# Translation table (for str.translate) that maps whitespace control
# characters to a plain space and "typographic" punctuation to its ASCII
# equivalent. Unicode escapes are used because several of these characters
# are hard to tell apart by eye.
SIMPLIFY_PUNCTUATION_WHITESPACE_TRANS = str.maketrans(
    {
        # tab, linefeed, carriage return -> space
        **dict.fromkeys("\t\n\r", " "),
        # curly left/right double quotes -> straight double quote
        **dict.fromkeys("\u201c\u201d", '"'),
        # curly left/right single quotes -> straight single quote
        **dict.fromkeys("\u2018\u2019", "'"),
        # en dash, em dash, minus sign -> hyphen
        **dict.fromkeys("\u2013\u2014\u2212", "-"),
    }
)


# A capital Eszett was introduced for the first time in 2017. Before that, SS
# was the capital version. See https://en.wikipedia.org/wiki/%C3%9F.
ESZETT_LOWER_CASE = "\u00df"  # LATIN SMALL LETTER SHARP S
ESZETT_UPPER_CASE = "\u1e9e"  # LATIN CAPITAL LETTER SHARP S

# Table mapping the lower-case Eszett to the capital Eszett.
SAFE_UPPER_PRETRANSLATE = str.maketrans(
    {
        ESZETT_LOWER_CASE: ESZETT_UPPER_CASE,
    }
)

# Table replacing either Eszett with a double S of the matching case.
_ESZETT_TO_DOUBLE_S = {
    ESZETT_LOWER_CASE: "ss",
    ESZETT_UPPER_CASE: "SS",
}
MANGLE_PRETRANSLATE = str.maketrans(_ESZETT_TO_DOUBLE_S)


class Switches:
    """
    Argparse option switches that are used in several places, and also the
    names of MatchConfig parameters, used for error messages.

    Every attribute is a plain string whose value is the lower-case form of
    the attribute's own name.
    """

    # Input/output and run-time behaviour
    INPUT = "input"
    OUTPUT = "output"
    INCLUDE_OTHER_INFO = "include_other_info"
    EXTRA_VALIDATION_OUTPUT = "extra_validation_output"
    CHECK_COMPARISON_ORDER = "check_comparison_order"
    REPORT_EVERY = "report_every"
    MIN_PROBANDS_FOR_PARALLEL = "min_probands_for_parallel"
    N_WORKERS = "n_workers"

    # Hashing and rounding
    KEY = "key"
    ALLOW_DEFAULT_HASH_KEY = "allow_default_hash_key"
    HASH_METHOD = "hash_method"
    ROUNDING_SF = "rounding_sf"
    LOCAL_ID_HASH_KEY = "local_id_hash_key"

    # Population priors and frequency data
    POPULATION_SIZE = "population_size"
    ACCENT_TRANSLITERATIONS = "accent_transliterations"
    FORENAME_CACHE_FILENAME = "forename_cache_filename"
    FORENAME_SEX_FREQ_CSV = "forename_sex_freq_csv"
    FORENAME_MIN_FREQUENCY = "forename_min_frequency"
    NONSPECIFIC_NAME_COMPONENTS = "nonspecific_name_components"
    SURNAME_CACHE_FILENAME = "surname_cache_filename"
    SURNAME_FREQ_CSV = "surname_freq_csv"
    SURNAME_MIN_FREQUENCY = "surname_min_frequency"
    BIRTH_YEAR_PSEUDO_RANGE = "birth_year_pseudo_range"
    P_NOT_MALE_OR_FEMALE = "p_not_male_or_female"
    P_FEMALE_GIVEN_MALE_OR_FEMALE = "p_female_given_male_or_female"
    POSTCODE_CACHE_FILENAME = "postcode_cache_filename"
    POSTCODE_CSV_FILENAME = "postcode_csv_filename"
    P_UNKNOWN_OR_PSEUDO_POSTCODE = "p_unknown_or_pseudo_postcode"

    # Error rates
    P_EP1_FORENAME = "p_ep1_forename"
    P_EP2NP1_FORENAME = "p_ep2np1_forename"
    P_EN_FORENAME = "p_en_forename"
    P_U_FORENAME = "p_u_forename"
    P_EP1_SURNAME = "p_ep1_surname"
    P_EP2NP1_SURNAME = "p_ep2np1_surname"
    P_EN_SURNAME = "p_en_surname"
    P_EP_DOB = "p_ep_dob"
    P_EN_DOB = "p_en_dob"
    P_E_GENDER = "p_e_gender"
    P_EP_POSTCODE = "p_ep_postcode"
    P_EN_POSTCODE = "p_en_postcode"
    K_POSTCODE = "k_postcode"
    K_PSEUDOPOSTCODE = "k_pseudopostcode"

    # Matching process
    MIN_LOG_ODDS_FOR_MATCH = "min_log_odds_for_match"
    EXCEEDS_NEXT_BEST_LOG_ODDS = "exceeds_next_best_log_odds"
    PERFECT_ID_TRANSLATION = "perfect_id_translation"
class FuzzyDefaults:
    """
    Some configuration defaults.

    All values are class attributes, computed once when the class body is
    executed. They cover default filenames, hashing/rounding settings,
    run-time options, population priors, error rates, matching thresholds,
    name-handling tables, and a few values derived from those.

    If the environment variable named by ``EnvVar.GENERATING_CRATE_DOCS`` is
    set, placeholder paths and a fixed process count are used instead of
    values looked up on the current machine.

    In the comments below: (E) = empirical, see validation paper;
    (N) = from national data; (*) = using the empirical value is much less
    efficient computationally.
    """

    # -------------------------------------------------------------------------
    # Filenames
    # -------------------------------------------------------------------------

    _appname = "crate"

    # Public data that we provide a local copy of
    _THIS_DIR = os.path.abspath(os.path.dirname(__file__))
    if EnvVar.GENERATING_CRATE_DOCS in os.environ:
        _DATA_DIR = "/path/to/linkage/data/"
    else:
        _DATA_DIR = os.path.join(_THIS_DIR, "data")
    FORENAME_SEX_FREQ_CSV = os.path.join(_DATA_DIR, "us_forename_sex_freq.zip")
    SURNAME_FREQ_CSV = os.path.join(_DATA_DIR, "us_surname_freq.zip")
    POSTCODES_CSV = os.path.join(_DATA_DIR, "ONSPD_MAY_2022_UK.zip")

    if EnvVar.GENERATING_CRATE_DOCS in os.environ:
        DEFAULT_CACHE_DIR = "/path/to/crate/user/data"
        N_PROCESSES = 8
    else:
        DEFAULT_CACHE_DIR = os.path.join(
            appdirs.user_data_dir(appname=_appname)
        )
        if platform.system() == "Windows":
            N_PROCESSES = 1  # usually faster!
        else:
            N_PROCESSES = cpu_count()

    # Caches
    FORENAME_CACHE_FILENAME = os.path.join(
        DEFAULT_CACHE_DIR, "fuzzy_forename_cache.jsonl"
    )
    POSTCODE_CACHE_FILENAME = os.path.join(
        DEFAULT_CACHE_DIR, "fuzzy_postcode_cache.json"
    )
    SURNAME_CACHE_FILENAME = os.path.join(
        DEFAULT_CACHE_DIR, "fuzzy_surname_cache.jsonl"
    )

    # -------------------------------------------------------------------------
    # Hashing, rounding
    # -------------------------------------------------------------------------

    HASH_KEY = "fuzzy_id_match_default_hash_key_DO_NOT_USE_FOR_LIVE_DATA"
    HASH_METHOD = HashMethods.HMAC_SHA256
    ROUNDING_SF = 5
    # ... number of significant figures for frequency rounding; 3 may be too
    # small, e.g. surname Smith 0.01006, corresponding metaphone SM0
    # 0.010129999999999998 would be the same at 3sf.

    # -------------------------------------------------------------------------
    # Run-time options
    # -------------------------------------------------------------------------

    CHECK_COMPARISON_ORDER = False
    MIN_PROBANDS_FOR_PARALLEL = 1000
    # ... a machine that takes ~30s to set up a basic parallel run (and 107.9s
    # for a 10k-to-10k comparison) processes single results at about 37/s... so
    # the break-even point is probably around 1000. But that does depend on the
    # sample size too.

    # -------------------------------------------------------------------------
    # Population priors
    # -------------------------------------------------------------------------
    # See command-line help.
    # (E) Empirical; see validation paper.
    # (N) From national data.

    POPULATION_SIZE = CAMBS_PBORO_POPULATION_2018  # (N)
    FORENAME_MIN_FREQ = 5e-6
    SURNAME_MIN_FREQ = 5e-6
    # Tried with (a) forename minimum frequency 2.9e-8, on the basis of US
    # forename data giving a floor at 2.875e-8 (M), 2.930e-8 (F), so 2.9e-8 to
    # 2sf; and (b) surname minimum frequency at 1.5e-7, since in the US surname
    # data, values below 3e-7 are reported as 0, so 1.5e-7 is the midpoint of
    # the low-frequency range. This doesn't (materially) affect the best
    # performance: accuracy etc. are still optimized at theta = delta = 0, MID
    # is still optimized at theta = delta = 15, and the WPM is optimized at
    # theta = 6, delta = 0 (rather than theta = 5, delta = 0). However, these
    # very low values just inflate MID overall and are not very plausible; much
    # below 1/n_p is not very plausible, and likely over-emphasizes matches on
    # unusual/unknown names. So: 5e-6 as originally planned (since the previous
    # US surname data had a floor at 1e-5, and since we will pilot with n_p
    # ~1e6).

    BIRTH_YEAR_PSEUDO_RANGE = 30  # (E) UK-wide ~90, perhaps; 30 empirically.
    P_FEMALE_GIVEN_MALE_OR_FEMALE = 0.51  # (N)
    P_NOT_MALE_OR_FEMALE = 0.004  # (N)
    K_POSTCODE = None  # default is to autocalculate from population; see paper

    # Explanatory notes, kept as a throwaway string rather than as comments.
    # noinspection HttpUrlsUsage
    _ = """

    P_UNKNOWN_OR_PSEUDO_POSTCODE
    ----------------------------

    - Pseudo-postcodes: e.g. ZZ99 3VZ, no fixed abode; ZZ99 3CZ, England/UK not
      otherwise specified [4].
    - These postcodes are not in the ONS Postcode Directory.
    - In Apr-Jun 2019, 11.4% of households in England who were {homeless or
      threatened with homelessness} had no fixed abode [1, Table 2].
    - That table totals 68,180 households, so that probably matches the 68,170
      households in England used as the summary figure on p1 [1].
    - In 2020, there were ~27.8 million households in the UK [2].
    - The mean household size in the UK is 2.4 [2].
      (Although the proportion who are homeless is likely biased towards single
      individuals?) Yes, "Nearly two-thirds of these were single households
      (households without children)."
    - 0.843 of the UK population live in England
    - So, the fraction of homelessness can be estimated as

        avg_people_per_household = 2.4
        n_people_per_homeless_household = (2 / 3) * 1 + (1 / 3) * avg_people_per_household
        n_people_homeless_england = (11.4 / 100) * 68180 * n_people_per_homeless_household
        n_people_uk = 27.8e6 * 2.4  # 66.7 million, so that's about right
        n_people_england = 0.843 * n_people_uk
        p_homeless = n_people_homeless_england / n_people_england
                   = 0.0002026794

      We'll round: 0.000203

      (So that's about 13.5k people with postcode ZZ99 3VZ, estimated.)

    [1] https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/852953/Statutory_Homelessness_Statistical_Release_Apr-Jun_2019.pdf  # noqa
    [2] https://www.ons.gov.uk/peoplepopulationandcommunity/birthsdeathsandmarriages/families/bulletins/familiesandhouseholds/2020  # noqa
    [3] https://pubmed.ncbi.nlm.nih.gov/35477868/
    [4] http://www.datadictionary.wales.nhs.uk/index.html#!WordDocuments/postcode.htm  # noqa

    However, our empirical rate is 0.00201 for ZZ99 3VZ (SystmOne; see
    empirical_rates.sql).

    K_PSEUDOPOSTCODE
    ----------------

    Distinct postcodes in sector ZZ993, from
    https://files.digital.nhs.uk/assets/ods/current/Look%20Ups.zip:

        ZZ99 3AZ    Eire / Irish Republic / Southern Ireland
        ZZ99 3BZ    Isle of Man
        ZZ99 3CZ    England / Great Britain / United Kingdom (not otherwise
                    stated)
        ZZ99 3EZ    Guernsey / Herm / Jethou Island / Lihou
        ZZ99 3FZ    Jersey
        ZZ99 3GZ    Wales
        ZZ99 3HZ    Channel Islands (not otherwise stated) / Alderney /
                    Brechou / Sark, Little and Great
        ZZ99 3VZ    No fixed abode
        ZZ99 3WZ    At sea / In the air / Inadequately described/specified /
                    Information refused / Not collected / Not known / Not
                    stated/specified

    So there are 9 postcode units in the ZZ993 sector. Our estimate above is
    for homelessness, which is likely overrepresented, but these are also big
    groupings of visitors. No particularly strong evidence to deviate from 9 at
    present (acknowledging this is a fairly fuzzy estimate anyway).

    The most important thing is that k_pseudopostcode > 1. It would be invalid
    for it to be <1 and if it is exactly 1, then p_pnf_postcode = 0 (because
    the sector probability will exactly match the unit probability) and any
    inadvertent sector-not-unit match will give a log likelihood of +∞ and a
    certain match.

    However, empirically in SystmOne, ZZ993 / ZZ993VZ = 1.83 (see paper).

    """

    P_UNKNOWN_OR_PSEUDO_POSTCODE = 0.00201  # (E)
    K_PSEUDOPOSTCODE = 1.83  # (E)

    # -------------------------------------------------------------------------
    # Error rates
    # -------------------------------------------------------------------------
    # (E) Empirical; see validation paper.
    # (*) Using the empirical value is much less efficient computationally.

    P_EP1_FORENAME = {
        GENDER_FEMALE: 0.00894,  # (E)
        GENDER_MALE: 0.00840,  # (E)
    }
    P_EP2NP1_FORENAME = {
        GENDER_FEMALE: 0.00881,  # (E)
        GENDER_MALE: 0.00688,  # (E)
    }
    P_EN_FORENAME = {
        GENDER_FEMALE: 0.00572,  # (E)
        GENDER_MALE: 0.00625,  # (E)
    }
    P_U_FORENAME = 0.00191  # (E)

    P_EP1_SURNAME = {
        GENDER_FEMALE: 0.00551,  # (E)
        GENDER_MALE: 0.00471,  # (E)
    }
    P_EP2NP1_SURNAME = {
        GENDER_FEMALE: 0.00378,  # (E)
        GENDER_MALE: 0.00247,  # (E)
    }
    P_EN_SURNAME = {
        GENDER_FEMALE: 0.0567,  # (E)
        GENDER_MALE: 0.0134,  # (E)
    }

    _P_E_DOB = 0.00492  # DOB not full match (E)
    _P_EP_DOB_GIVEN_P_E_DOB = 0.933  # P(partial | not full); (E)
    P_EP_DOB = _P_E_DOB * _P_EP_DOB_GIVEN_P_E_DOB  # (E)
    P_EN_DOB_TRUE = _P_E_DOB * (1 - _P_EP_DOB_GIVEN_P_E_DOB)  # (E) (*)
    P_EN_DOB = 0  # Much faster (*)

    P_E_GENDER = 0.0033  # (E)

    P_EP_POSTCODE = 0.0097  # (E)
    P_EN_POSTCODE = 0.300  # (E)

    # -------------------------------------------------------------------------
    # Matching process
    # -------------------------------------------------------------------------

    MIN_LOG_ODDS_FOR_MATCH = 5  # theta, in the validation paper
    EXCEEDS_NEXT_BEST_LOG_ODDS = 0  # delta, in the validation paper
    PERFECT_ID_TRANSLATION = ""
    REPORT_EVERY = 100  # cosmetic only

    # -------------------------------------------------------------------------
    # Name handling
    # -------------------------------------------------------------------------

    NONSPECIFIC_NAME_COMPONENTS = set(
        # Includes nobiliary particles:
        # https://en.wikipedia.org/wiki/Nobiliary_particle. Typically these
        # mean "of", "of the", or "the". See also
        # https://en.wikipedia.org/wiki/List_of_family_name_affixes;
        # https://en.wikipedia.org/wiki/Suffix_(name).
        # Entries repeated across languages collapse in the set; all entries
        # are stored in upper case.
        x.upper()
        for x in (
            # Arabic-speaking countries
            "Al",
            "El",
            # Belgian
            "de",
            "der",
            "van",
            # Danish
            "af",
            # Dutch
            "tot",
            "thoe",
            "van",
            # English, Welsh, Scottish
            "of",
            # French
            "d",  # e.g. Giscard d'Estaing
            "de",
            "des",
            "du",
            "l",  # e.g. L'Estrange
            "la",
            "le",
            # German
            "auf",
            "von",
            "zu",
            # Italian
            "da",
            "dai",
            "dal",
            "dalla",
            "dei",
            "del",
            "dell",
            "della",
            "di",
            # Portuguese
            "da",
            "das",
            "do",
            "dos",
            # Somali
            "Aw",
            # Spanish
            "de",
            # Swedish
            "af",
            "av",
            "von",
            # Swiss
            "de",
            "von",
            # Thai
            "na",
            "Phra",
            "Sri",
            # USA: seniority
            "Jnr",
            "Jr",
            "Snr",
            "Sr",
            # USA: numbering (not just the USA in theory; e.g. Richard III).
            "I",
            "II",
            "III",
            "IV",
            "V",
            "VI",
            "VII",
            "VIII",
            "IX",
            "X",
        )
    )
    ACCENT_TRANSLITERATIONS = [
        # Only upper-case versions are required.
        # German: https://en.wikipedia.org/wiki/German_orthography
        ("Ä", "AE"),
        ("Ö", "OE"),
        ("Ü", "UE"),
        (ESZETT_UPPER_CASE, "SS"),
    ]

    # -------------------------------------------------------------------------
    # Derived
    # -------------------------------------------------------------------------

    MIN_P_FOR_MATCH = probability_from_log_odds(MIN_LOG_ODDS_FOR_MATCH)
    # String ("key:value,key:value") forms of the per-gender dictionaries.
    P_EP1_FORENAME_CSV = _mk_dictstr(P_EP1_FORENAME)
    P_EP2NP1_FORENAME_CSV = _mk_dictstr(P_EP2NP1_FORENAME)
    P_EN_FORENAME_CSV = _mk_dictstr(P_EN_FORENAME)
    P_EP1_SURNAME_CSV = _mk_dictstr(P_EP1_SURNAME)
    P_EP2NP1_SURNAME_CSV = _mk_dictstr(P_EP2NP1_SURNAME)
    P_EN_SURNAME_CSV = _mk_dictstr(P_EN_SURNAME)
    # Sorted, comma-separated form of the nonspecific name components.
    NONSPECIFIC_NAME_COMPONENTS_CSV = ",".join(
        sorted(NONSPECIFIC_NAME_COMPONENTS)
    )
    # Comma-separated "accent/plain" pairs.
    ACCENT_TRANSLITERATIONS_SLASH_CSV = ",".join(
        f"{accent}/{plain}" for accent, plain in ACCENT_TRANSLITERATIONS
    )
    # Translation table (for str.translate) built from the same pairs.
    ACCENT_TRANSLITERATIONS_TRANS = str.maketrans(
        {accent: plain for accent, plain in ACCENT_TRANSLITERATIONS}
    )