r"""
crate_anon/linkage/constants.py
===============================================================================
Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
Created by Rudolf Cardinal (rnc1001@cam.ac.uk).
This file is part of CRATE.
CRATE is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
CRATE is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with CRATE. If not, see <https://www.gnu.org/licenses/>.
===============================================================================
**Constants for linkage tools.**
"""
# =============================================================================
# Imports
# =============================================================================
import math
from multiprocessing import cpu_count
import os
import platform
from typing import Dict
import appdirs
from cardinal_pythonlib.hash import HashMethods
from cardinal_pythonlib.probability import probability_from_log_odds
from crate_anon.common.constants import EnvVar
# =============================================================================
# Helper functions
# =============================================================================
def _mk_dictstr(x: Dict[str, float]) -> str:
    """
    Render a dictionary as a compact ``key:value,key:value`` string.

    Args:
        x: dictionary to render; items are emitted in the dictionary's own
            iteration order.

    Returns:
        The ``key:value`` pairs joined by commas, with each key and value
        rendered using default f-string formatting. An empty dictionary
        gives an empty string.
    """
    return ",".join(f"{k}:{v}" for k, v in x.items())
# =============================================================================
# Constants
# =============================================================================
# CHECK_BASIC_ASSERTIONS_IN_HIGH_SPEED_FUNCTIONS = False  # for debugging only

# -----------------------------------------------------------------------------
# Numeric and type helpers
# -----------------------------------------------------------------------------

INFINITY = math.inf
MINUS_INFINITY = -INFINITY
NONE_TYPE = type(None)

# -----------------------------------------------------------------------------
# Calendar approximations
# -----------------------------------------------------------------------------

DAYS_PER_YEAR = 365.25  # approximately!
MONTHS_PER_YEAR = 12
DAYS_PER_MONTH = DAYS_PER_YEAR / MONTHS_PER_YEAR  # on average

# -----------------------------------------------------------------------------
# Location of this module
# -----------------------------------------------------------------------------

THIS_DIR = os.path.abspath(os.path.dirname(__file__))

# -----------------------------------------------------------------------------
# Population figures
# -----------------------------------------------------------------------------

# Mean UK Output Area population (2011). Not used any more! Left here for
# interest. See
# https://www.ons.gov.uk/methodology/geography/ukgeographies/censusgeography  # noqa
UK_MEAN_OA_POPULATION_2011 = 309
UK_POPULATION_2017 = 66_040_000  # 2017 figure, 66.04m
CAMBS_PBORO_POPULATION_2018 = 852_523

# -----------------------------------------------------------------------------
# Gender codes
# -----------------------------------------------------------------------------

GENDER_MALE = "M"
GENDER_FEMALE = "F"
GENDER_OTHER = "X"
GENDER_MISSING = ""
# Standard three gender codes, plus "" meaning missing.
VALID_GENDERS = [GENDER_MISSING, GENDER_MALE, GENDER_FEMALE, GENDER_OTHER]

# -----------------------------------------------------------------------------
# Character translation tables (for use with str.translate)
# -----------------------------------------------------------------------------

# Maps assorted whitespace and "typographic" punctuation to plain equivalents.
# Each entry below is (characters to replace, replacement).
SIMPLIFY_PUNCTUATION_WHITESPACE_TRANS = str.maketrans(
    {
        char: replacement
        for chars, replacement in (
            ("\t\n\r", " "),  # tab, linefeed, carriage return -> space
            ("“”", '"'),  # curly double quotes -> straight double quote
            ("‘’", "'"),  # curly single quotes -> straight single quote
            ("–—−", "-"),  # en dash, em dash, minus -> hyphen
        )
        for char in chars
    }
)

# A capital Eszett was introduced for the first time in 2017. Before that, SS
# was the capital version. See https://en.wikipedia.org/wiki/%C3%9F.
ESZETT_LOWER_CASE = "ß"
ESZETT_UPPER_CASE = "ẞ"

# Lower-case Eszett -> capital Eszett (single character to single character).
SAFE_UPPER_PRETRANSLATE = str.maketrans(
    {
        ESZETT_LOWER_CASE: ESZETT_UPPER_CASE,
    }
)

# Either case of Eszett -> the two-letter "ss"/"SS" form of matching case.
MANGLE_PRETRANSLATE = str.maketrans(
    {
        ESZETT_LOWER_CASE: "ss",
        ESZETT_UPPER_CASE: "SS",
    }
)
class Switches:
    """
    Argparse option switches that are used in several places, and also the
    names of MatchConfig parameters, used for error messages.

    Every value is the lower-case form of its attribute name.
    """

    # Input/output
    INPUT = "input"
    OUTPUT = "output"
    INCLUDE_OTHER_INFO = "include_other_info"
    EXTRA_VALIDATION_OUTPUT = "extra_validation_output"

    # Run-time options
    CHECK_COMPARISON_ORDER = "check_comparison_order"
    REPORT_EVERY = "report_every"
    MIN_PROBANDS_FOR_PARALLEL = "min_probands_for_parallel"
    N_WORKERS = "n_workers"

    # Hashing, rounding
    KEY = "key"
    ALLOW_DEFAULT_HASH_KEY = "allow_default_hash_key"
    HASH_METHOD = "hash_method"
    ROUNDING_SF = "rounding_sf"
    LOCAL_ID_HASH_KEY = "local_id_hash_key"

    # Population, name, gender, date-of-birth and postcode information
    POPULATION_SIZE = "population_size"
    ACCENT_TRANSLITERATIONS = "accent_transliterations"
    FORENAME_CACHE_FILENAME = "forename_cache_filename"
    FORENAME_SEX_FREQ_CSV = "forename_sex_freq_csv"
    FORENAME_MIN_FREQUENCY = "forename_min_frequency"
    NONSPECIFIC_NAME_COMPONENTS = "nonspecific_name_components"
    SURNAME_CACHE_FILENAME = "surname_cache_filename"
    SURNAME_FREQ_CSV = "surname_freq_csv"
    SURNAME_MIN_FREQUENCY = "surname_min_frequency"
    BIRTH_YEAR_PSEUDO_RANGE = "birth_year_pseudo_range"
    P_NOT_MALE_OR_FEMALE = "p_not_male_or_female"
    P_FEMALE_GIVEN_MALE_OR_FEMALE = "p_female_given_male_or_female"
    POSTCODE_CACHE_FILENAME = "postcode_cache_filename"
    POSTCODE_CSV_FILENAME = "postcode_csv_filename"
    P_UNKNOWN_OR_PSEUDO_POSTCODE = "p_unknown_or_pseudo_postcode"

    # Per-identifier probabilities ("p_...") and postcode "k_..." parameters
    P_EP1_FORENAME = "p_ep1_forename"
    P_EP2NP1_FORENAME = "p_ep2np1_forename"
    P_EN_FORENAME = "p_en_forename"
    P_U_FORENAME = "p_u_forename"
    P_EP1_SURNAME = "p_ep1_surname"
    P_EP2NP1_SURNAME = "p_ep2np1_surname"
    P_EN_SURNAME = "p_en_surname"
    P_EP_DOB = "p_ep_dob"
    P_EN_DOB = "p_en_dob"
    P_E_GENDER = "p_e_gender"
    P_EP_POSTCODE = "p_ep_postcode"
    P_EN_POSTCODE = "p_en_postcode"
    K_POSTCODE = "k_postcode"
    K_PSEUDOPOSTCODE = "k_pseudopostcode"

    # Matching process
    MIN_LOG_ODDS_FOR_MATCH = "min_log_odds_for_match"
    EXCEEDS_NEXT_BEST_LOG_ODDS = "exceeds_next_best_log_odds"
    PERFECT_ID_TRANSLATION = "perfect_id_translation"
class FuzzyDefaults:
    """
    Some configuration defaults.

    Everything here is a class attribute, evaluated once when the class body
    runs at import time. A few values depend on the environment: whether the
    ``EnvVar.GENERATING_CRATE_DOCS`` environment variable is set, the
    operating system, and the CPU count.
    """

    # -------------------------------------------------------------------------
    # Filenames
    # -------------------------------------------------------------------------

    _appname = "crate"

    # Public data that we provide a local copy of
    _THIS_DIR = os.path.abspath(os.path.dirname(__file__))
    if EnvVar.GENERATING_CRATE_DOCS in os.environ:
        # Fixed placeholder while generating documentation (presumably so the
        # docs do not embed a machine-specific path).
        _DATA_DIR = "/path/to/linkage/data/"
    else:
        _DATA_DIR = os.path.join(_THIS_DIR, "data")
    FORENAME_SEX_FREQ_CSV = os.path.join(_DATA_DIR, "us_forename_sex_freq.zip")
    SURNAME_FREQ_CSV = os.path.join(_DATA_DIR, "us_surname_freq.zip")
    POSTCODES_CSV = os.path.join(_DATA_DIR, "ONSPD_MAY_2022_UK.zip")

    if EnvVar.GENERATING_CRATE_DOCS in os.environ:
        # Fixed placeholders, as above, rather than values from this machine.
        DEFAULT_CACHE_DIR = "/path/to/crate/user/data"
        N_PROCESSES = 8
    else:
        # (Previously wrapped in a single-argument os.path.join(), which
        # returns its argument unchanged.)
        DEFAULT_CACHE_DIR = appdirs.user_data_dir(appname=_appname)
        if platform.system() == "Windows":
            N_PROCESSES = 1  # usually faster!
        else:
            N_PROCESSES = cpu_count()

    # Caches
    FORENAME_CACHE_FILENAME = os.path.join(
        DEFAULT_CACHE_DIR, "fuzzy_forename_cache.jsonl"
    )
    POSTCODE_CACHE_FILENAME = os.path.join(
        DEFAULT_CACHE_DIR, "fuzzy_postcode_cache.json"
    )
    SURNAME_CACHE_FILENAME = os.path.join(
        DEFAULT_CACHE_DIR, "fuzzy_surname_cache.jsonl"
    )

    # -------------------------------------------------------------------------
    # Hashing, rounding
    # -------------------------------------------------------------------------

    HASH_KEY = "fuzzy_id_match_default_hash_key_DO_NOT_USE_FOR_LIVE_DATA"
    HASH_METHOD = HashMethods.HMAC_SHA256
    ROUNDING_SF = 5
    # ... number of significant figures for frequency rounding; 3 may be too
    # small, e.g. surname Smith 0.01006, corresponding metaphone SM0
    # 0.010129999999999998 would be the same at 3sf.

    # -------------------------------------------------------------------------
    # Run-time options
    # -------------------------------------------------------------------------

    CHECK_COMPARISON_ORDER = False
    MIN_PROBANDS_FOR_PARALLEL = 1000
    # ... a machine that takes ~30s to set up a basic parallel run (and 107.9s
    # for a 10k-to-10k comparison) processes single results at about 37/s... so
    # the break-even point is probably around 1000. But that does depend on the
    # sample size too.

    # -------------------------------------------------------------------------
    # Population priors
    # -------------------------------------------------------------------------
    # See command-line help.
    # (E) Empirical; see validation paper.
    # (N) From national data.

    POPULATION_SIZE = CAMBS_PBORO_POPULATION_2018  # (N)
    FORENAME_MIN_FREQ = 5e-6
    SURNAME_MIN_FREQ = 5e-6
    # Tried with (a) forename minimum frequency 2.9e-8, on the basis of US
    # forename data giving a floor at 2.875e-8 (M), 2.930e-8 (F), so 2.9e-8 to
    # 2sf; and (b) surname minimum frequency at 1.5e-7, since in the US surname
    # data, values below 3e-7 are reported as 0, so 1.5e-7 is the midpoint of
    # the low-frequency range. This doesn't (materially) affect the best
    # performance: accuracy etc. are still optimized at theta = delta = 0, MID
    # is still optimized at theta = delta = 15, and the WPM is optimized at
    # theta = 6, delta = 0 (rather than theta = 5, delta = 0). However, these
    # very low values just inflate MID overall and are not very plausible; much
    # below 1/n_p is not very plausible, and likely over-emphasizes matches on
    # unusual/unknown names. So: 5e-6 as originally planned (since the previous
    # US surname data had a floor at 1e-5, and since we will pilot with n_p
    # ~1e6).
    BIRTH_YEAR_PSEUDO_RANGE = 30  # (E) UK-wide ~90, perhaps; 30 empirically.
    P_FEMALE_GIVEN_MALE_OR_FEMALE = 0.51  # (N)
    P_NOT_MALE_OR_FEMALE = 0.004  # (N)
    K_POSTCODE = None  # default is to autocalculate from population; see paper

    # The string below is working notes (the rationale for the two values
    # that follow it), bound to the throwaway name "_". Its text is kept
    # verbatim.
    # noinspection HttpUrlsUsage
    _ = """
P_UNKNOWN_OR_PSEUDO_POSTCODE
----------------------------
- Pseudo-postcodes: e.g. ZZ99 3VZ, no fixed abode; ZZ99 3CZ, England/UK
not otherwise specified [4].
- These postcodes are not in the ONS Postcode Directory.
- In Apr-Jun 2019, 11.4% of households in England who were {homeless or
threatened with homelessness} had no fixed abode [1, Table 2].
- That table totals 68,180 households, so that probably matches the
68,170 households in England used as the summary figure on p1 [1].
- In 2020, there were ~27.8 million households in the UK [2].
- The mean household size in the UK is 2.4 [2]. (Although the proportion
who are homeless is likely biased towards single individuals?)
Yes, "Nearly two-thirds of these were single households (households
without children)."
- 0.843 of the UK population live in England
- So, the fraction of homelessness can be estimated as
avg_people_per_household = 2.4
n_people_per_homeless_household = (2 / 3) * 1 + (1 / 3) * avg_people_per_household
n_people_homeless_england = (11.4 / 100) * 68180 * n_people_per_homeless_household
n_people_uk = 27.8e6 * 2.4 # 66.7 million, so that's about right
n_people_england = 0.843 * n_people_uk
p_homeless = n_people_homeless_england / n_people_england
= 0.0002026794
We'll round: 0.000203
(So that's about 13.5k people with postcode ZZ99 3VZ, estimated.)
[1] https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/852953/Statutory_Homelessness_Statistical_Release_Apr-Jun_2019.pdf # noqa
[2] https://www.ons.gov.uk/peoplepopulationandcommunity/birthsdeathsandmarriages/families/bulletins/familiesandhouseholds/2020 # noqa
[3] https://pubmed.ncbi.nlm.nih.gov/35477868/
[4] http://www.datadictionary.wales.nhs.uk/index.html#!WordDocuments/postcode.htm # noqa
However, our empirical rate is 0.00201 for ZZ99 3VZ (SystmOne; see
empirical_rates.sql).
K_PSEUDOPOSTCODE
----------------
Distinct postcodes in sector ZZ993, from
https://files.digital.nhs.uk/assets/ods/current/Look%20Ups.zip:
ZZ99 3AZ Eire / Irish Republic / Southern Ireland
ZZ99 3BZ Isle of Man
ZZ99 3CZ England / Great Britain / United Kingdom (not otherwise stated)
ZZ99 3EZ Guernsey / Herm / Jethou Island / Lihou
ZZ99 3FZ Jersey
ZZ99 3GZ Wales
ZZ99 3HZ Channel Islands (not otherwise stated) / Alderney / Brechou / Sark, Little and Great
ZZ99 3VZ No fixed abode
ZZ99 3WZ At sea / In the air / Inadequately described/specified / Information refused / Not collected / Not known / Not stated/specified
So there are 9 postcode units in the ZZ993 sector. Our estimate above is
for homelessness, which is likely overrepresented, but these are also big
groupings of visitors. No particularly strong evidence to deviate from 9
at present (acknowledging this is a fairly fuzzy estimate anyway). The most
important thing is that k_pseudopostcode > 1. It would be invalid for it
to be <1 and if it is exactly 1, then p_pnf_postcode = 0 (because the
sector probability will exactly match the unit probability) and any
inadvertent sector-not-unit match will give a log likelihood of +∞ and a
certain match.
However, empirically in SystmOne, ZZ993 / ZZ993VZ = 1.83 (see paper).
"""
    P_UNKNOWN_OR_PSEUDO_POSTCODE = 0.00201  # (E)
    K_PSEUDOPOSTCODE = 1.83  # (E)

    # -------------------------------------------------------------------------
    # Error rates
    # -------------------------------------------------------------------------
    # (E) Empirical; see validation paper.
    # (*) Using the empirical value is much less efficient computationally.
    # The dictionaries below are keyed by gender code.

    P_EP1_FORENAME = {
        GENDER_FEMALE: 0.00894,  # (E)
        GENDER_MALE: 0.00840,  # (E)
    }
    P_EP2NP1_FORENAME = {
        GENDER_FEMALE: 0.00881,  # (E)
        GENDER_MALE: 0.00688,  # (E)
    }
    P_EN_FORENAME = {
        GENDER_FEMALE: 0.00572,  # (E)
        GENDER_MALE: 0.00625,  # (E)
    }
    P_U_FORENAME = 0.00191  # (E)

    P_EP1_SURNAME = {
        GENDER_FEMALE: 0.00551,  # (E)
        GENDER_MALE: 0.00471,  # (E)
    }
    P_EP2NP1_SURNAME = {
        GENDER_FEMALE: 0.00378,  # (E)
        GENDER_MALE: 0.00247,  # (E)
    }
    P_EN_SURNAME = {
        GENDER_FEMALE: 0.0567,  # (E)
        GENDER_MALE: 0.0134,  # (E)
    }

    _P_E_DOB = 0.00492  # DOB not full match (E)
    _P_EP_DOB_GIVEN_P_E_DOB = 0.933  # P(partial | not full); (E)
    P_EP_DOB = _P_E_DOB * _P_EP_DOB_GIVEN_P_E_DOB  # (E)
    P_EN_DOB_TRUE = _P_E_DOB * (1 - _P_EP_DOB_GIVEN_P_E_DOB)  # (E) (*)
    P_EN_DOB = 0  # Much faster (*)

    P_E_GENDER = 0.0033  # (E)

    P_EP_POSTCODE = 0.0097  # (E)
    P_EN_POSTCODE = 0.300  # (E)

    # -------------------------------------------------------------------------
    # Matching process
    # -------------------------------------------------------------------------

    MIN_LOG_ODDS_FOR_MATCH = 5  # theta, in the validation paper
    EXCEEDS_NEXT_BEST_LOG_ODDS = 0  # delta, in the validation paper
    PERFECT_ID_TRANSLATION = ""
    REPORT_EVERY = 100  # cosmetic only

    # -------------------------------------------------------------------------
    # Name handling
    # -------------------------------------------------------------------------

    # Includes nobiliary particles:
    # https://en.wikipedia.org/wiki/Nobiliary_particle. Typically these
    # mean "of", "of the", or "the". See also
    # https://en.wikipedia.org/wiki/List_of_family_name_affixes;
    # https://en.wikipedia.org/wiki/Suffix_(name).
    #
    # Stored as an upper-case set. Some particles are listed under more than
    # one language for documentation; the set removes the duplicates.
    NONSPECIFIC_NAME_COMPONENTS = {
        x.upper()
        for x in (
            # Arabic-speaking countries
            "Al",
            "El",
            # Belgian
            "de",
            "der",
            "van",
            # Danish
            "af",
            # Dutch
            "tot",
            "thoe",
            "van",
            # English, Welsh, Scottish
            "of",
            # French
            "d",  # e.g. Giscard d'Estaing
            "de",
            "des",
            "du",
            "l",  # e.g. L'Estrange
            "la",
            "le",
            # German
            "auf",
            "von",
            "zu",
            # Italian
            "da",
            "dai",
            "dal",
            "dalla",
            "dei",
            "del",
            "dell",
            "della",
            "di",
            # Portuguese
            "da",
            "das",
            "do",
            "dos",
            # Somali
            "Aw",
            # Spanish
            "de",
            # Swedish
            "af",
            "av",
            "von",
            # Swiss
            "de",
            "von",
            # Thai
            "na",
            "Phra",
            "Sri",
            # USA: seniority
            "Jnr",
            "Jr",
            "Snr",
            "Sr",
            # USA: numbering (not just the USA in theory; e.g. Richard III).
            "I",
            "II",
            "III",
            "IV",
            "V",
            "VI",
            "VII",
            "VIII",
            "IX",
            "X",
        )
    }

    # Pairs of (accented character, plain replacement).
    ACCENT_TRANSLITERATIONS = [
        # Only upper-case versions are required.
        # German: https://en.wikipedia.org/wiki/German_orthography
        ("Ä", "AE"),
        ("Ö", "OE"),
        ("Ü", "UE"),
        (ESZETT_UPPER_CASE, "SS"),
    ]

    # -------------------------------------------------------------------------
    # Derived
    # -------------------------------------------------------------------------

    MIN_P_FOR_MATCH = probability_from_log_odds(MIN_LOG_ODDS_FOR_MATCH)

    # "key:value,key:value" string forms of the per-gender dictionaries above.
    P_EP1_FORENAME_CSV = _mk_dictstr(P_EP1_FORENAME)
    P_EP2NP1_FORENAME_CSV = _mk_dictstr(P_EP2NP1_FORENAME)
    P_EN_FORENAME_CSV = _mk_dictstr(P_EN_FORENAME)
    P_EP1_SURNAME_CSV = _mk_dictstr(P_EP1_SURNAME)
    P_EP2NP1_SURNAME_CSV = _mk_dictstr(P_EP2NP1_SURNAME)
    P_EN_SURNAME_CSV = _mk_dictstr(P_EN_SURNAME)

    # Sorted, so that the string is deterministic despite set ordering.
    NONSPECIFIC_NAME_COMPONENTS_CSV = ",".join(
        sorted(NONSPECIFIC_NAME_COMPONENTS)
    )
    # "accent/plain" pairs, comma-separated.
    ACCENT_TRANSLITERATIONS_SLASH_CSV = ",".join(
        f"{accent}/{plain}" for accent, plain in ACCENT_TRANSLITERATIONS
    )
    # Translation table for str.translate(), built from the same pairs.
    ACCENT_TRANSLITERATIONS_TRANS = str.maketrans(
        dict(ACCENT_TRANSLITERATIONS)
    )