Source code for crate_anon.linkage.matchconfig

r"""
crate_anon/linkage/matchconfig.py

===============================================================================

    Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
    Created by Rudolf Cardinal (rnc1001@cam.ac.uk).

    This file is part of CRATE.

    CRATE is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    CRATE is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with CRATE. If not, see <https://www.gnu.org/licenses/>.

===============================================================================

**Helper functions for linkage tools.**

"""

# =============================================================================
# Imports
# =============================================================================

import logging
from typing import Any, Dict, NoReturn, Optional, Set, Tuple, Union

from cardinal_pythonlib.hash import make_hasher
from cardinal_pythonlib.maths_py import round_sf, normal_round_int
from cardinal_pythonlib.probability import log_odds_from_1_in_n
from cardinal_pythonlib.reprfunc import auto_repr

from crate_anon.linkage.constants import (
    DAYS_PER_MONTH,
    DAYS_PER_YEAR,
    FuzzyDefaults,
    GENDER_FEMALE,
    GENDER_MALE,
    GENDER_MISSING,
    GENDER_OTHER,
    MONTHS_PER_YEAR,
    Switches,
    UK_POPULATION_2017,
    VALID_GENDERS,
)
from crate_anon.linkage.frequencies import (
    BasicNameFreqInfo,
    NameFrequencyInfo,
    PostcodeFrequencyInfo,
)
from crate_anon.linkage.helpers import (
    dict_from_str,
    safe_upper,
    standardize_name,
    standardize_perfect_id_key,
    standardize_perfect_id_value,
)

log = logging.getLogger(__name__)


# =============================================================================
# Main configuration class, supporting frequency-based probability calculations
# =============================================================================


class MatchConfig:
    """
    Master config class. It's more convenient to pass one of these round
    than lots of its components.

    Default arguments are there for testing.
    """

    def __init__(
        self,
        hash_key: str = FuzzyDefaults.HASH_KEY,
        hash_method: str = FuzzyDefaults.HASH_METHOD,
        rounding_sf: Optional[int] = FuzzyDefaults.ROUNDING_SF,
        local_id_hash_key: Optional[str] = None,
        population_size: int = FuzzyDefaults.POPULATION_SIZE,
        forename_sex_csv_filename: str = FuzzyDefaults.FORENAME_SEX_FREQ_CSV,
        forename_cache_filename: str = FuzzyDefaults.FORENAME_CACHE_FILENAME,
        forename_freq_info: Optional[NameFrequencyInfo] = None,
        forename_min_frequency: float = FuzzyDefaults.FORENAME_MIN_FREQ,
        surname_csv_filename: str = FuzzyDefaults.SURNAME_FREQ_CSV,
        surname_cache_filename: str = FuzzyDefaults.SURNAME_CACHE_FILENAME,
        surname_freq_info: Optional[NameFrequencyInfo] = None,
        surname_min_frequency: float = FuzzyDefaults.SURNAME_MIN_FREQ,
        accent_transliterations_csv: str = (
            FuzzyDefaults.ACCENT_TRANSLITERATIONS_SLASH_CSV
        ),
        nonspecific_name_components_csv: str = (
            FuzzyDefaults.NONSPECIFIC_NAME_COMPONENTS_CSV
        ),
        birth_year_pseudo_range: float = FuzzyDefaults.BIRTH_YEAR_PSEUDO_RANGE,
        p_not_male_or_female: float = FuzzyDefaults.P_NOT_MALE_OR_FEMALE,
        p_female_given_male_or_female: float = (
            FuzzyDefaults.P_FEMALE_GIVEN_MALE_OR_FEMALE
        ),
        postcode_csv_filename: str = FuzzyDefaults.POSTCODES_CSV,
        postcode_cache_filename: str = FuzzyDefaults.POSTCODE_CACHE_FILENAME,
        postcode_freq_info: Optional[PostcodeFrequencyInfo] = None,
        k_postcode: Optional[float] = FuzzyDefaults.K_POSTCODE,
        p_unknown_or_pseudo_postcode: float = (
            FuzzyDefaults.P_UNKNOWN_OR_PSEUDO_POSTCODE
        ),
        k_pseudopostcode: float = FuzzyDefaults.K_PSEUDOPOSTCODE,
        p_ep1_forename: str = FuzzyDefaults.P_EP1_FORENAME_CSV,
        p_ep2np1_forename: str = FuzzyDefaults.P_EP2NP1_FORENAME_CSV,
        p_u_forename: float = FuzzyDefaults.P_U_FORENAME,
        p_en_forename: str = FuzzyDefaults.P_EN_FORENAME_CSV,
        p_ep1_surname: str = FuzzyDefaults.P_EP1_SURNAME_CSV,
        p_ep2np1_surname: str = FuzzyDefaults.P_EP2NP1_SURNAME_CSV,
        p_en_surname: str = FuzzyDefaults.P_EN_SURNAME_CSV,
        p_ep_dob: float = FuzzyDefaults.P_EP_DOB,
        p_en_dob: float = FuzzyDefaults.P_EN_DOB,
        p_e_gender: float = FuzzyDefaults.P_E_GENDER,
        p_ep_postcode: float = FuzzyDefaults.P_EP_POSTCODE,
        p_en_postcode: float = FuzzyDefaults.P_EN_POSTCODE,
        min_log_odds_for_match: float = FuzzyDefaults.MIN_LOG_ODDS_FOR_MATCH,
        exceeds_next_best_log_odds: float = (
            FuzzyDefaults.EXCEEDS_NEXT_BEST_LOG_ODDS
        ),
        perfect_id_translation: Union[
            Dict[str, str], str
        ] = FuzzyDefaults.PERFECT_ID_TRANSLATION,
        extra_validation_output: bool = False,
        check_comparison_order: bool = FuzzyDefaults.CHECK_COMPARISON_ORDER,
        report_every: int = FuzzyDefaults.REPORT_EVERY,
        min_probands_for_parallel: int = (
            FuzzyDefaults.MIN_PROBANDS_FOR_PARALLEL
        ),
        n_workers: int = FuzzyDefaults.N_PROCESSES,
        verbose: bool = False,
    ) -> None:
        """
        Args:
            hash_key:
                Key (passphrase) for hasher.
            hash_method:
                Method to use for hashing.
            rounding_sf:
                Number of significant figures to use when rounding frequency
                information in hashed copies. Use ``None`` for no rounding.
            local_id_hash_key:
                If specified, then for hash operations, the local_id values
                will also be hashed, using this key.
            population_size:
                The size of the entire population (not our sample). See
                docstrings above.
            forename_sex_csv_filename:
                Forename frequencies. CSV file, with no header, of
                "name, frequency" pairs.
            forename_cache_filename:
                File in which to cache forename information for faster
                loading.
            forename_freq_info:
                Debugging option: overrides forename_sex_csv_filename by
                providing a NameFrequencyInfo object directly.
            forename_min_frequency:
                Minimum frequency for forenames.
            surname_csv_filename:
                Surname frequencies. CSV file, with no header, of
                "name, frequency" pairs.
            surname_cache_filename:
                File in which to cache surname information for faster
                loading.
            surname_freq_info:
                Debugging option: overrides surname_csv_filename by providing
                a NameFrequencyInfo object directly.
            surname_min_frequency:
                Minimum frequency for surnames.
            accent_transliterations_csv:
                Accent transliteration map. String of the form "Ä/AE,Ö/OE" --
                comma-separated pairs, with a slash separating the members of
                each pair.
            nonspecific_name_components_csv:
                CSV-separated list of nonspecific name components (e.g.
                nobiliary particles), which will be avoided as equivalent
                name fragments.
            birth_year_pseudo_range:
                b, such that P(two people share a DOB) = 1/(365.25 * b).
            p_not_male_or_female:
                Probability that a person in the population has gender 'X'.
            p_female_given_male_or_female:
                Probability that a person in the population is female, given
                that they are either male or female.
            postcode_csv_filename:
                Postcode mapping. CSV (or ZIP) file. Special format; see
                :class:`PostcodeFrequencyInfo`.
            postcode_cache_filename:
                File in which to cache postcode information for faster
                loading.
            postcode_freq_info:
                Debugging option: overrides postcode_csv_filename by
                providing a PostcodeFrequencyInfo object directly.
            k_postcode:
                Multiple applied to postcode unit/sector frequencies, such
                that p_f_postcode = k_postcode * f_f_postcode and
                p_p_postcode = k_postcode * f_p_postcode. If None, defaults
                to UK_POPULATION_2017 / population_size, appropriate if the
                population under consideration is geographically constrained
                (rather than sampled from across the UK).
            p_unknown_or_pseudo_postcode:
                Probability that a random person will have a pseudo-postcode,
                e.g. ZZ99 3VZ (no fixed abode) or a postcode not known to our
                database. Specifically, P(each pseudopostcode or unknown
                postcode unit | ¬H).
            k_pseudopostcode:
                Probability multiple: P(pseudopostcode sector or unknown
                postcode sector match | ¬H) = k_pseudopostcode *
                p_unknown_or_pseudo_postcode. Must strictly be >=1 and we
                enforce >1; see paper.
            p_ep1_forename:
                Error probability that a forename fails a full match but
                passes a partial 1 (metaphone) match. [GPD]
            p_ep2np1_forename:
                Error probability that a forename fails a full match and a
                partial 1 match but passes a partial 2 (F2C) match. [GPD]
            p_u_forename:
                Forename-related probability; validated here only to be a
                probability in [0, 1]. NOTE(review): exact semantics are not
                documented in this module -- confirm against the
                accompanying paper.
            p_en_forename:
                Error probability that a forename yields no match at all.
                [GPD]
            p_ep1_surname:
                Error probability that a surname fails a full match but
                passes a partial 1 (metaphone) match. [GPD]
            p_ep2np1_surname:
                Error probability that a surname fails a full match and a
                partial 1 match but passes a partial 2 (F2C) match. [GPD]
            p_en_surname:
                Error probability that a surname yields no match at all.
                [GPD]
            p_ep_dob:
                Error probability that a DOB fails a full (YMD) match but
                passes a partial (YM, MD, or YD) match.
            p_en_dob:
                Error probability that a DOB produces no match at all.
            p_e_gender:
                Error probability of no gender match.
            p_ep_postcode:
                Probability that a postcode fails a full (unit) match but
                passes a partial (sector) match (due to error or a move
                within a sector).
            p_en_postcode:
                Probability that a postcode gives no match at all.
            min_log_odds_for_match:
                minimum log odds of a match, to consider two people a match
            exceeds_next_best_log_odds:
                In a multi-person comparison, the log odds of the best match
                must exceed those of the next-best match by this much for the
                best to be considered a unique winner.
            perfect_id_translation:
                Optional dictionary mapping the perfect ID names in the
                proband to the equivalents in the sample, e.g.
                {"nhsnum": "nhsnumber"}.
            extra_validation_output:
                Add extra columns to the output for validation purposes?
            check_comparison_order:
                Check that comparisons follow the general rule
                "no match ≤ partial(s) ≤ full" and warn if not.
            report_every:
                Report progress every n probands.
            min_probands_for_parallel:
                Minimum number of probands for which we will bother to use
                parallel processing.
            n_workers:
                Number of parallel processes to use, if parallel processing
                is used.
            verbose:
                Be verbose on creation?

        - [GPD] In ``{gender:p, ...}`` dict-as-string format.
        - F2C = First two characters.
        """
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Input validation
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

        def raise_bad(x_: Any, name_: str) -> NoReturn:
            """
            Raise an informative ValueError.
            """
            raise ValueError(f"Bad {name_}: {x_!r}")

        def check_prob(
            p_: float, name_: str, not_certain: bool = False
        ) -> float:
            """
            Ensure that something is a probability, and return it.
            With ``not_certain=True``, the open interval (0, 1) is required.
            """
            if not_certain:
                if not 0 < p_ < 1:
                    raise_bad(p_, name_ + " [must be in range (0, 1)]")
            else:
                if not 0 <= p_ <= 1:
                    raise_bad(p_, name_)
            return p_

        def mk_gender_p_dict(csv_: str, name_: str) -> Dict[str, float]:
            """
            Transform a comma-separated list of ``gender:p`` values into a
            corresponding dictionary, and fill in the blanks (genders "other"
            and "missing" default to the M/F-weighted mean).
            """
            d = {}  # type: Dict[str, float]
            for gender_p_str in csv_.split(","):
                g_p_components = gender_p_str.split(":")
                if len(g_p_components) != 2:
                    raise ValueError(f"Bad {name_}: {csv_!r}")
                g = g_p_components[0].strip()
                try:
                    p = check_prob(float(g_p_components[1].strip()), name_)
                except (ValueError, TypeError):
                    raise ValueError(f"Bad probability in {name_}: {csv_!r}")
                d[g] = p
            if GENDER_FEMALE not in d:
                raise ValueError(
                    f"Gender {GENDER_FEMALE} not specified in {name_}"
                )
            if GENDER_MALE not in d:
                raise ValueError(
                    f"Gender {GENDER_MALE} not specified in {name_}"
                )
            weighted_mean_m_f = (
                self.p_female_given_m_or_f * d[GENDER_FEMALE]
                + self.p_male_given_m_or_f * d[GENDER_MALE]
            )
            d.setdefault(GENDER_OTHER, weighted_mean_m_f)
            d.setdefault(GENDER_MISSING, weighted_mean_m_f)
            if set(d.keys()) != set(VALID_GENDERS):
                raise ValueError(
                    f"Missing or bad genders in {name_}: {csv_!r} -- genders "
                    f"should be {VALID_GENDERS}"
                )
            return d

        def mk_p_c_dict(
            p_ep1_: Dict[str, float],
            p_ep2np1_: Dict[str, float],
            p_en_: Dict[str, float],
        ) -> Dict[str, float]:
            """
            Calculates p_c = 1 - p_ep1 - p_ep2np1 - p_en, per gender.
            """
            d = {}  # type: Dict[str, float]
            for g in VALID_GENDERS:
                p_c_ = 1 - p_ep1_[g] - p_ep2np1_[g] - p_en_[g]
                assert 0 <= p_c_ <= 1
                d[g] = p_c_
            return d

        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Basic creation
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        if verbose:
            log.debug("Building MatchConfig...")

        # Hash information
        self.hash_fn = make_hasher(hash_method=hash_method, key=hash_key).hash
        if not (rounding_sf is None or 1 <= rounding_sf):
            raise_bad(rounding_sf, Switches.ROUNDING_SF)
        self.rounding_sf = rounding_sf
        if local_id_hash_key:
            self.local_id_hash_fn = make_hasher(
                hash_method=hash_method, key=local_id_hash_key
            ).hash
        else:
            # Convert to string if necessary; otherwise, an identity function:
            self.local_id_hash_fn = str

        # Overall population
        if not (population_size > 0):
            raise_bad(population_size, Switches.POPULATION_SIZE)
        self.population_size = population_size
        # Precalculate this, for access speed:
        self.baseline_log_odds_same_person = log_odds_from_1_in_n(
            self.population_size
        )

        # Name handling: generic
        accent_dict = {}  # type: Dict[str, str]
        for accent_pair in accent_transliterations_csv.split(","):
            accent_components = accent_pair.split("/")
            if len(accent_components) != 2:
                raise ValueError(
                    f"Bad accent_transliterations_csv: "
                    f"{accent_transliterations_csv!r}"
                )
            accented = safe_upper(accent_components[0].strip())
            plain = safe_upper(accent_components[1].strip())
            # str.maketrans requires single-character keys:
            if len(accented) != 1:
                raise ValueError(
                    f"Bad accent_transliterations_csv: "
                    f"{accent_transliterations_csv!r} -- contains accented "
                    f"character {accented!r}, which should be of length 1"
                )
            accent_dict[accented] = plain
        self.accent_transliterations = str.maketrans(accent_dict)

        self.nonspecific_name_components = set()  # type: Set[str]
        for nonspec in nonspecific_name_components_csv.split(","):
            self.nonspecific_name_components.add(nonspec.strip().upper())

        # Name handling: forenames
        self.forename_freq_info = forename_freq_info or NameFrequencyInfo(
            csv_filename=forename_sex_csv_filename,
            cache_filename=forename_cache_filename,
            min_frequency=check_prob(
                forename_min_frequency, Switches.FORENAME_MIN_FREQUENCY
            ),
            by_gender=True,
        )
        if not isinstance(self.forename_freq_info, NameFrequencyInfo):
            raise ValueError("Bad forename_freq_info")

        # Name handling: surnames
        self.surname_freq_info = surname_freq_info or NameFrequencyInfo(
            csv_filename=surname_csv_filename,
            cache_filename=surname_cache_filename,
            min_frequency=check_prob(
                surname_min_frequency, Switches.SURNAME_MIN_FREQUENCY
            ),
            by_gender=False,
        )
        if not isinstance(self.surname_freq_info, NameFrequencyInfo):
            raise ValueError("Bad surname_freq_info")

        # Population frequencies: DOB
        self.birth_year_pseudo_range = birth_year_pseudo_range
        if not (birth_year_pseudo_range >= 1):
            raise_bad(
                birth_year_pseudo_range, Switches.BIRTH_YEAR_PSEUDO_RANGE
            )

        # Population frequencies: sex/gender
        # ... Check this before using mk_gender_p_dict:
        self.p_female_given_m_or_f = check_prob(
            p_female_given_male_or_female,
            Switches.P_FEMALE_GIVEN_MALE_OR_FEMALE,
        )
        self.p_male_given_m_or_f = 1 - self.p_female_given_m_or_f
        self.p_not_male_or_female = check_prob(
            p_not_male_or_female, Switches.P_NOT_MALE_OR_FEMALE
        )
        p_male_or_female = 1 - p_not_male_or_female
        self.p_female = p_female_given_male_or_female * p_male_or_female
        self.p_male = p_male_or_female - self.p_female

        # Population frequencies: postcode
        self.postcode_freq_info = postcode_freq_info or PostcodeFrequencyInfo(
            csv_filename=postcode_csv_filename,
            cache_filename=postcode_cache_filename,
        )
        if not isinstance(self.postcode_freq_info, PostcodeFrequencyInfo):
            raise ValueError("Bad postcode_freq_info")
        self.p_unknown_or_pseudo_postcode_unit = check_prob(
            p_unknown_or_pseudo_postcode,
            Switches.P_UNKNOWN_OR_PSEUDO_POSTCODE,
            not_certain=True,
        )
        if k_pseudopostcode <= 1:
            raise ValueError(f"Bad {Switches.K_PSEUDOPOSTCODE}: must be >1")
        self.k_pseudopostcode = k_pseudopostcode
        self.p_unknown_or_pseudo_postcode_sector = check_prob(
            k_pseudopostcode * p_unknown_or_pseudo_postcode,
            f"P(unknown postcode or pseudopostcode sector | ¬H) = "
            f"{Switches.K_PSEUDOPOSTCODE} * "
            f"{Switches.P_UNKNOWN_OR_PSEUDO_POSTCODE}",
            not_certain=True,
        )
        self.k_postcode = (
            UK_POPULATION_2017 / self.population_size
            if k_postcode is None
            else k_postcode
        )
        self.p_known_postcode = 1 - self.p_unknown_or_pseudo_postcode_sector

        # Error probabilities: forenames
        self.p_ep1_forename = mk_gender_p_dict(
            p_ep1_forename, Switches.P_EP1_FORENAME
        )
        self.p_ep2np1_forename = mk_gender_p_dict(
            p_ep2np1_forename, Switches.P_EP2NP1_FORENAME
        )
        self.p_en_forename = mk_gender_p_dict(
            p_en_forename, Switches.P_EN_FORENAME
        )
        self.p_c_forename = mk_p_c_dict(
            p_ep1_=self.p_ep1_forename,
            p_ep2np1_=self.p_ep2np1_forename,
            p_en_=self.p_en_forename,
        )
        self.p_u_forename = check_prob(p_u_forename, Switches.P_U_FORENAME)

        # Error probabilities: surnames
        self.p_ep1_surname = mk_gender_p_dict(
            p_ep1_surname, Switches.P_EP1_SURNAME
        )
        self.p_ep2np1_surname = mk_gender_p_dict(
            p_ep2np1_surname, Switches.P_EP2NP1_SURNAME
        )
        self.p_en_surname = mk_gender_p_dict(
            p_en_surname, Switches.P_EN_SURNAME
        )
        self.p_c_surname = mk_p_c_dict(
            p_ep1_=self.p_ep1_surname,
            p_ep2np1_=self.p_ep2np1_surname,
            p_en_=self.p_en_surname,
        )

        # Error probabilities: DOB
        self.p_ep_dob = check_prob(p_ep_dob, Switches.P_EP_DOB)
        self.p_en_dob = check_prob(p_en_dob, Switches.P_EN_DOB)

        # Error probabilities: gender
        self.p_e_gender_error = check_prob(
            p_e_gender,
            Switches.P_E_GENDER,
        )

        # Error probabilities: postcode
        self.p_ep_postcode = check_prob(p_ep_postcode, Switches.P_EP_POSTCODE)
        self.p_en_postcode = check_prob(p_en_postcode, Switches.P_EN_POSTCODE)

        # Matching rules
        self.min_log_odds_for_match = min_log_odds_for_match
        self.exceeds_next_best_log_odds = exceeds_next_best_log_odds
        if perfect_id_translation is None:
            perfect_id_xlate_raw = {}
        elif isinstance(perfect_id_translation, dict):
            perfect_id_xlate_raw = perfect_id_translation
        elif isinstance(perfect_id_translation, str):
            perfect_id_xlate_raw = dict_from_str(perfect_id_translation)
        else:
            raise ValueError(
                f"Bad perfect_id_translation: {perfect_id_translation!r}"
            )
        self.perfect_id_translation = {
            standardize_perfect_id_key(k): standardize_perfect_id_value(v)
            # BUG FIX: was ".values()", which cannot be unpacked into
            # (key, value) pairs; ".items()" is required here.
            for k, v in perfect_id_xlate_raw.items()
        }
        if self.perfect_id_translation:
            log.info(
                f"Using proband-to-sample perfect ID translation: "
                f"{self.perfect_id_translation}"
            )

        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Some derived frequencies
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

        # DOB:
        self.p_c_dob = 1 - self.p_ep_dob - self.p_en_dob
        assert 0 <= self.p_c_dob <= 1
        # These ignore the specialness of 29 February:
        self.p_f_dob = 1 / (DAYS_PER_YEAR * birth_year_pseudo_range)
        p_share_dob_md_not_ymd = (1 / DAYS_PER_YEAR) - self.p_f_dob
        p_share_dob_yd_not_ymd = (
            1 / (DAYS_PER_MONTH * birth_year_pseudo_range)
        ) - self.p_f_dob
        p_share_dob_ym_not_ymd = (
            1 / (MONTHS_PER_YEAR * birth_year_pseudo_range)
        ) - self.p_f_dob
        # These three are mutually exclusive possibilities (e.g. you can't
        # share YM and MD without sharing YMD), so we can just sum:
        self.p_pnf_dob = (
            p_share_dob_md_not_ymd
            + p_share_dob_yd_not_ymd
            + p_share_dob_ym_not_ymd
        )
        self.p_n_dob = 1 - self.p_f_dob - self.p_pnf_dob
        assert 0 <= self.p_f_dob <= 1
        assert 0 <= p_share_dob_md_not_ymd <= 1
        assert 0 <= p_share_dob_yd_not_ymd <= 1
        assert 0 <= p_share_dob_ym_not_ymd <= 1
        assert 0 <= self.p_pnf_dob <= 1
        assert 0 <= self.p_n_dob <= 1

        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Technical
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        self.extra_validation_output = extra_validation_output
        self.check_comparison_order = check_comparison_order
        self.report_every = report_every
        self.min_probands_for_parallel = min_probands_for_parallel
        self.n_workers = n_workers

        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Reporting
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        self.partial_dob_mismatch_allowed = self.p_c_dob < 1
        self.complete_dob_mismatch_allowed = self.p_en_dob > 0
        if self.complete_dob_mismatch_allowed:
            potential_speedup_factor = round_sf(
                normal_round_int(1 / (1 - self.p_n_dob)),
                n=3,
            )
            log.warning(
                f"You are allowing a person's DOB to be completely different, "
                f"with p = {self.p_en_dob}. That is valid but much less "
                f"efficient computationally (by an estimated factor of about "
                f"{potential_speedup_factor})."
            )
        _ = """
        Speedup: for a 90-year range (b = 90), this is a factor of about 252.
        For a single year, it's about 9; if I'm born on 1 Jan, allowing
        single-component errors mean we need to consider 1 Jan, but also all
        of Jan, and all other firsts of the month -- total 42 out of 365
        days, or 1/8.69 of the year. For a multi-year range, the speedup
        increases: if I'm born on 1 Jan 1950 and we are considering
        1900-1999, we'd need to consider 1950-01-01 (1), ????-01-01 (100),
        1950-01-?? (31), 1950-??-01 (12), minus the overlaps (3), giving 141
        possibilities but out of about 36500, i.e. considering only 1/259 of
        the candidates.

        To find probabilities in terms of b, using Octave:

        pkg load symbolic
        syms b p_f_dob p_pnf_dob p_n_dob speedup_no_mismatch
        syms speedup_no_partial second_stage_speedup
        DAYS_PER_YEAR = 365.25
        DAYS_PER_MONTH = 30.4375
        MONTHS_PER_YEAR = 12
        p_f_dob = 1 / (DAYS_PER_YEAR * b)  # = 4 / (1461⋅b)
        p_pnf_dob = (
            1 / DAYS_PER_YEAR
            + 1 / (DAYS_PER_MONTH * b)
            + 1 / (MONTHS_PER_YEAR * b)
            - 3 / (DAYS_PER_YEAR * b)
        )
        simplify(p_pnf_dob)  # = (16⋅b + 631) / (5844⋅b)
        p_n_dob = 1 - p_f_dob - p_pnf_dob
        simplify(p_n_dob)
        p_full_or_partial_match = 1 - p_n_dob
        speedup_no_mismatch = 1 / p_full_or_partial_match
        simplify(speedup_no_mismatch)  # = 5844⋅b / (16⋅b + 647)
        speedup_no_partial = 1 / p_f_dob
        simplify(speedup_no_partial)  # = 1461⋅b / 4
        second_stage_speedup = speedup_no_partial / speedup_no_mismatch
        simplify(second_stage_speedup)  # = b + 647 / 16
        """  # noqa
        if verbose:
            log.debug(f"... MatchConfig built. Settings: {self}")
# log.debug( # f"p_dob_correct = {self.p_dob_correct}, " # f"p_dob_single_component_error = " # f"{self.p_dob_single_component_error}, " # f"p_dob_major_error = {self.p_dob_major_error}" # ) # log.debug( # f"p_two_people_share_dob_ymd = " # f"{self.p_two_people_share_dob_ymd}, " # f"p_share_dob_md_not_ymd = {p_share_dob_md_not_ymd}, " # f"p_share_dob_yd_not_ymd = {p_share_dob_yd_not_ymd}, " # f"p_share_dob_ym_not_ymd = {p_share_dob_ym_not_ymd}, " # f"p_two_people_have_partial_dob_match = " # f"{self.p_two_people_partial_dob_match}, " # f"p_two_people_no_dob_similarity = " # f"{self.p_two_people_no_dob_similarity}" # ) # ------------------------------------------------------------------------- # String representation # ------------------------------------------------------------------------- def __str__(self) -> str: return auto_repr(self) # not __repr__(), or it clutters up all the other objects # ------------------------------------------------------------------------- # Identifier frequency information # -------------------------------------------------------------------------
[docs] def get_forename_freq_info( self, name: str, gender: str, prestandardized: bool = False ) -> BasicNameFreqInfo: """ Returns the baseline frequency of a forename. Args: name: the name to check gender: the gender to look up for prestandardized: was the name pre-standardized? """ if not prestandardized: name = standardize_name(name) freq_func = self.forename_freq_info.name_frequency_info if gender in (GENDER_FEMALE, GENDER_MALE): return freq_func(name, gender, prestandardized=True) # Otherwise, take the mean across genders: return BasicNameFreqInfo.weighted_mean( objects=[ freq_func(name, GENDER_FEMALE, prestandardized=True), freq_func(name, GENDER_MALE, prestandardized=True), ], weights=[self.p_female, self.p_male], )
[docs] def get_surname_freq_info( self, name: str, prestandardized: bool = False ) -> BasicNameFreqInfo: """ Returns the baseline frequency of a surname. Args: name: the name to check prestandardized: was it pre-standardized? """ return self.surname_freq_info.name_frequency_info( name, prestandardized=prestandardized )
def gender_freq(self, gender: str) -> Optional[float]: if not gender: return None elif gender == GENDER_FEMALE: return self.p_female elif gender == GENDER_MALE: return self.p_male else: return self.p_not_male_or_female
[docs] def is_valid_postcode(self, postcode_unit: str) -> bool: """ Is this a valid postcode? """ return self.postcode_freq_info.debug_is_valid_postcode(postcode_unit)
[docs] def postcode_unit_sector_freq( self, postcode_unit: str, prestandardized: bool = False ) -> Tuple[float, float]: """ Returns the frequency for a full postcode, or postcode unit (the proportion of the population who live in that postcode), and the corresponding larger-scale postcode sector. The underlying function ensures that the sector frequency is as least as big as the unit frequency. """ return self.postcode_freq_info.postcode_unit_sector_frequency( postcode_unit, prestandardized=prestandardized )
[docs] def debug_postcode_unit_population( self, postcode_unit: str, prestandardized: bool = False ) -> float: """ Returns the calculated population of a postcode unit. Args: postcode_unit: the postcode unit to check prestandardized: was the postcode pre-standardized in format? """ return self.postcode_freq_info.debug_postcode_unit_population( postcode_unit, prestandardized=prestandardized )
[docs] def debug_postcode_sector_population( self, postcode_sector: str, prestandardized: bool = False ) -> float: """ Returns the calculated population of a postcode sector. Args: postcode_sector: the postcode sector to check prestandardized: was the postcode pre-standardized in format? """ return self.postcode_freq_info.debug_postcode_sector_population( postcode_sector, prestandardized=prestandardized )
# ------------------------------------------------------------------------- # Comparisons # -------------------------------------------------------------------------
[docs] def exceeds_primary_threshold(self, log_odds_match: float) -> bool: """ Decides as to whether the log odds, representing P(H | D) from a comparison of two :class:`Person` objects, are sufficient for a match, based on our threshold. Args: log_odds_match: log odds that they're the same person Returns: bool: binary decision """ return log_odds_match >= self.min_log_odds_for_match
# ------------------------------------------------------------------------- # Perfect ID handling # ------------------------------------------------------------------------- def remap_perfect_id_key(self, key: str) -> str: return self.perfect_id_translation.get(key, key)
# ============================================================================= # Dummy config that doesn't load frequency information # =============================================================================
[docs]def mk_dummy_match_config() -> MatchConfig: """ Returns a dummy config with empty frequency information. """ return MatchConfig( forename_cache_filename="", forename_sex_csv_filename="", surname_cache_filename="", surname_csv_filename="", postcode_cache_filename="", postcode_csv_filename="", )