# Source code for crate_anon.linkage.matchconfig

r"""
crate_anon/linkage/matchconfig.py

===============================================================================

    Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
    Created by Rudolf Cardinal (rnc1001@cam.ac.uk).

    This file is part of CRATE.

    CRATE is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    CRATE is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with CRATE. If not, see <https://www.gnu.org/licenses/>.

===============================================================================

**Master configuration class (and a dummy-config factory) for linkage tools.**

"""

# =============================================================================
# Imports
# =============================================================================

import logging
from typing import Any, Dict, NoReturn, Optional, Set, Tuple, Union

from cardinal_pythonlib.hash import make_hasher
from cardinal_pythonlib.maths_py import round_sf, normal_round_int
from cardinal_pythonlib.probability import log_odds_from_1_in_n
from cardinal_pythonlib.reprfunc import auto_repr

from crate_anon.linkage.constants import (
    DAYS_PER_MONTH,
    DAYS_PER_YEAR,
    FuzzyDefaults,
    GENDER_FEMALE,
    GENDER_MALE,
    GENDER_MISSING,
    GENDER_OTHER,
    MONTHS_PER_YEAR,
    Switches,
    UK_POPULATION_2017,
    VALID_GENDERS,
)
from crate_anon.linkage.frequencies import (
    BasicNameFreqInfo,
    NameFrequencyInfo,
    PostcodeFrequencyInfo,
)
from crate_anon.linkage.helpers import (
    dict_from_str,
    safe_upper,
    standardize_name,
    standardize_perfect_id_key,
    standardize_perfect_id_value,
)

log = logging.getLogger(__name__)


# =============================================================================
# Main configuration class, supporting frequency-based probability calculations
# =============================================================================


[docs]class MatchConfig:
    """
    Master config class. It's more convenient to pass one of these round than
    lots of its components.

    Default arguments are there for testing.
    """

[docs]    def __init__(
        self,
        hash_key: str = FuzzyDefaults.HASH_KEY,
        hash_method: str = FuzzyDefaults.HASH_METHOD,
        rounding_sf: Optional[int] = FuzzyDefaults.ROUNDING_SF,
        local_id_hash_key: str = None,
        population_size: int = FuzzyDefaults.POPULATION_SIZE,
        forename_sex_csv_filename: str = FuzzyDefaults.FORENAME_SEX_FREQ_CSV,
        forename_cache_filename: str = FuzzyDefaults.FORENAME_CACHE_FILENAME,
        forename_freq_info: Optional[NameFrequencyInfo] = None,
        forename_min_frequency: float = FuzzyDefaults.FORENAME_MIN_FREQ,
        surname_csv_filename: str = FuzzyDefaults.SURNAME_FREQ_CSV,
        surname_cache_filename: str = FuzzyDefaults.SURNAME_CACHE_FILENAME,
        surname_freq_info: Optional[NameFrequencyInfo] = None,
        surname_min_frequency: float = FuzzyDefaults.SURNAME_MIN_FREQ,
        accent_transliterations_csv: str = (
            FuzzyDefaults.ACCENT_TRANSLITERATIONS_SLASH_CSV
        ),
        nonspecific_name_components_csv: str = (
            FuzzyDefaults.NONSPECIFIC_NAME_COMPONENTS_CSV
        ),
        birth_year_pseudo_range: float = FuzzyDefaults.BIRTH_YEAR_PSEUDO_RANGE,
        p_not_male_or_female: float = FuzzyDefaults.P_NOT_MALE_OR_FEMALE,
        p_female_given_male_or_female: float = (
            FuzzyDefaults.P_FEMALE_GIVEN_MALE_OR_FEMALE
        ),
        postcode_csv_filename: str = FuzzyDefaults.POSTCODES_CSV,
        postcode_cache_filename: str = FuzzyDefaults.POSTCODE_CACHE_FILENAME,
        postcode_freq_info: Optional[PostcodeFrequencyInfo] = None,
        k_postcode: Optional[float] = FuzzyDefaults.K_POSTCODE,
        p_unknown_or_pseudo_postcode: float = (
            FuzzyDefaults.P_UNKNOWN_OR_PSEUDO_POSTCODE
        ),
        k_pseudopostcode: float = FuzzyDefaults.K_PSEUDOPOSTCODE,
        p_ep1_forename: str = FuzzyDefaults.P_EP1_FORENAME_CSV,
        p_ep2np1_forename: str = FuzzyDefaults.P_EP2NP1_FORENAME_CSV,
        p_u_forename: float = FuzzyDefaults.P_U_FORENAME,
        p_en_forename: str = FuzzyDefaults.P_EN_FORENAME_CSV,
        p_ep1_surname: str = FuzzyDefaults.P_EP1_SURNAME_CSV,
        p_ep2np1_surname: str = FuzzyDefaults.P_EP2NP1_SURNAME_CSV,
        p_en_surname: str = FuzzyDefaults.P_EN_SURNAME_CSV,
        p_ep_dob: float = FuzzyDefaults.P_EP_DOB,
        p_en_dob: float = FuzzyDefaults.P_EN_DOB,
        p_e_gender: float = FuzzyDefaults.P_E_GENDER,
        p_ep_postcode: float = FuzzyDefaults.P_EP_POSTCODE,
        p_en_postcode: float = FuzzyDefaults.P_EN_POSTCODE,
        min_log_odds_for_match: float = FuzzyDefaults.MIN_LOG_ODDS_FOR_MATCH,
        exceeds_next_best_log_odds: float = (
            FuzzyDefaults.EXCEEDS_NEXT_BEST_LOG_ODDS
        ),
        perfect_id_translation: Union[
            Dict[str, str], str
        ] = FuzzyDefaults.PERFECT_ID_TRANSLATION,
        extra_validation_output: bool = False,
        check_comparison_order: bool = FuzzyDefaults.CHECK_COMPARISON_ORDER,
        report_every: int = FuzzyDefaults.REPORT_EVERY,
        min_probands_for_parallel: int = (
            FuzzyDefaults.MIN_PROBANDS_FOR_PARALLEL
        ),
        n_workers: int = FuzzyDefaults.N_PROCESSES,
        verbose: bool = False,
    ) -> None:
        """
        Args:
            hash_key:
                Key (passphrase) for hasher.
            hash_method:
                Method to use for hashhing.
            rounding_sf:
                Number of significant figures to use when rounding frequency
                information in hashed copies. Use ``None`` for no rounding.
            local_id_hash_key:
                If specified, then for hash operations, the local_id values
                will also be hashed, using this key.

            population_size:
                The size of the entire population (not our sample). See
                docstrings above.

            forename_sex_csv_filename:
                Forename frequencies. CSV file, with no header, of "name,
                frequency" pairs.
            forename_cache_filename:
                File in which to cache forename information for faster loading.
            forename_freq_info:
                Debugging option: overrides forename_sex_csv_filename by
                providing a NameFrequencyInfo object directly.
            forename_min_frequency:
                Minimum frequency for forenames.

            surname_csv_filename:
                Surname frequencies. CSV file, with no header, of "name,
                frequency" pairs.
            surname_cache_filename:
                File in which to cache forename information for faster loading.
            surname_freq_info:
                Debugging option: overrides surname_csv_filename by
                providing a NameFrequencyInfo object directly.
            surname_min_frequency:
                Minimum frequency for surnames.
            accent_transliterations_csv:
                Accent transliteration map. String of the form "Ä/AE,Ö/OE" --
                comma-separated pairs, with slashed separating each pair.
            nonspecific_name_components_csv:
                CSV-separated list of nonspecific name components (e.g.
                nobiliary particles), which will be avoided as equivalent name
                fragments.

            birth_year_pseudo_range:
                b, such that P(two people share a DOB) = 1/(365.25 * b).

            p_not_male_or_female:
                Probability that a person in the population has gender 'X'.
            p_female_given_male_or_female:
                Probability that a person in the population is female, given
                that they are either male or female.

            postcode_csv_filename:
                Postcode mapping. CSV (or ZIP) file. Special format; see
                :class:`PostcodeFrequencyInfo`.
            postcode_cache_filename:
                File in which to cache postcode information for faster loading.
            postcode_freq_info:
                Debugging option: overrides postcode_csv_filename by
                providing a PostcodeFrequencyInfo object directly.
            k_postcode:
                Multiple applied to postcode unit/sector frequencies, such that
                p_f_postcode = k_postcode * f_f_postcode and p_p_postcode =
                k_postcode * f_p_postcode. If None, defaults to
                UK_POPULATION_2017 / population_size, appropriate if the
                population under consideration is geographically constrained
                (rather than sampled from across the UK).
            p_unknown_or_pseudo_postcode:
                Probability that a random person will have a pseudo-postcode,
                e.g. ZZ99 3VZ (no fixed abode) or a postcode not known to our
                database. Specifically, P(each pseudopostcode or unknown
                postcode unit | ¬H).
            k_pseudopostcode:
                Probability multiple: P(pseudopostcode sector or unknown
                postcode sector match | ¬H) = k_pseudopostcode *
                p_unknown_or_pseudo_postcode. Must strictly be >=1 and we
                enforce >1; see paper.

            p_ep1_forename:
                Error probability that a forename fails a full match but passes
                a partial 1 (metaphone) match. [GPD]
            p_ep2np1_forename:
                Error probability that a forename fails a full match and a
                partial 1 match but passes a partial 2 (F2C) match. [GPD]
            p_en_forename:
                Error probability that a forename yields no match at all. [GPD]
            p_ep1_surname:
                Error probability that a surname fails a full match but passes
                a partial 1 (metaphone) match. [GPD]
            p_ep2np1_surname:
                Error probability that a surname fails a full match and a
                partial 1 match but passes a partial 2 (F2C) match. [GPD]
            p_en_surname:
                Error probability that a surname yields no match at all. [GPD]
            p_ep_dob:
                Error probability that a DOB fails a full (YMD) match but
                passes a partial (YM, MD, or YD) match.
            p_en_dob:
                Error probability that a DOB produces no match at all.
            p_e_gender:
                Error probability of no gender match.
            p_ep_postcode:
                Probability that a postcode fails a full (unit) match but
                passes a partial (sector) match (due to error or a move within
                a sector).
            p_en_postcode:
                Probability that a postcode gives no match at all.
            min_log_odds_for_match:
                minimum log odds of a match, to consider two people a match
            exceeds_next_best_log_odds:
                In a multi-person comparison, the log odds of the best match
                must exceed those of the next-best match by this much for the
                best to be considered a unique winner.
            perfect_id_translation:
                Option dictionary mapping the perfect ID names in the proband
                to the equivalents in the sample, e.g. {"nhsnum": "nhsnumber"}.

            extra_validation_output:
                Add extra columns to the output for validation purposes?
            check_comparison_order:
                Check that comparisons follow the general rule "no match ≤
                partial(s) ≤ full" and warn if not.
            report_every:
                Report progress every n probands.
            min_probands_for_parallel:
                Minimum number of probands for which we will bother to use
                parallel processing.
            n_workers:
                Number of parallel processes to use, if parallel processing
                is used.
            verbose:
                Be verbose on creation?

        - [GPD] In ``{gender:p, ...}`` dict-as-string format.

        - F2C = First two characters.
        """
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Input validation
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

        def raise_bad(x_: Any, name_: str) -> NoReturn:
            """
            Raise an informative ValueError.
            """
            raise ValueError(f"Bad {name_}: {x_!r}")

        def check_prob(
            p_: float, name_: str, not_certain: bool = False
        ) -> float:
            """
            Ensure that something is a probability, and return it.
            """
            if not_certain:
                if not 0 < p_ < 1:
                    raise_bad(p_, name_ + " [must be in range (0, 1)]")
            else:
                if not 0 <= p_ <= 1:
                    raise_bad(p_, name_)
            return p_

        def mk_gender_p_dict(csv_: str, name_: str) -> Dict[str, float]:
            """
            Transform a comma-separated list of ``gender:p`` values into
            a corresponding dictionary, and fill in the blanks.
            """
            d = {}  # type: Dict[str, float]
            for gender_p_str in csv_.split(","):
                g_p_components = gender_p_str.split(":")
                if len(g_p_components) != 2:
                    raise ValueError(f"Bad {name_}: {csv_!r}")
                g = g_p_components[0].strip()
                try:
                    p = check_prob(float(g_p_components[1].strip()), name_)
                except (ValueError, TypeError):
                    raise ValueError(f"Bad probability in {name_}: {csv_!r}")
                d[g] = p
            if GENDER_FEMALE not in d:
                raise ValueError(
                    f"Gender {GENDER_FEMALE} not specified in {name_}"
                )
            if GENDER_MALE not in d:
                raise ValueError(
                    f"Gender {GENDER_MALE} not specified in {name_}"
                )
            weighted_mean_m_f = (
                self.p_female_given_m_or_f * d[GENDER_FEMALE]
                + self.p_male_given_m_or_f * d[GENDER_MALE]
            )
            d.setdefault(GENDER_OTHER, weighted_mean_m_f)
            d.setdefault(GENDER_MISSING, weighted_mean_m_f)
            if set(d.keys()) != set(VALID_GENDERS):
                raise ValueError(
                    f"Missing or bad genders in {name_}: {csv_!r} -- genders "
                    f"should be {VALID_GENDERS}"
                )
            return d

        def mk_p_c_dict(
            p_ep1_: Dict[str, float],
            p_ep2np1_: Dict[str, float],
            p_en_: Dict[str, float],
        ) -> Dict[str, float]:
            """
            Calculates p_c = 1 - p_ep1 - p_ep2np1 = p_en.
            """
            d = {}  # type: Dict[str, float]
            for g in VALID_GENDERS:
                p_c_ = 1 - p_ep1_[g] - p_ep2np1_[g] - p_en_[g]
                assert 0 <= p_c_ <= 1
                d[g] = p_c_
            return d

        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Basic creation
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

        if verbose:
            log.debug("Building MatchConfig...")

        # Hash information

        self.hash_fn = make_hasher(hash_method=hash_method, key=hash_key).hash
        if not (rounding_sf is None or 1 <= rounding_sf):
            raise_bad(rounding_sf, Switches.ROUNDING_SF)
        self.rounding_sf = rounding_sf
        if local_id_hash_key:
            self.local_id_hash_fn = make_hasher(
                hash_method=hash_method, key=local_id_hash_key
            ).hash
        else:
            # Convert to string if necessary; otherwise, an identity function:
            self.local_id_hash_fn = str

        # Overall population

        if not (population_size > 0):
            raise_bad(population_size, Switches.POPULATION_SIZE)
        self.population_size = population_size
        # Precalculate this, for access speed:
        self.baseline_log_odds_same_person = log_odds_from_1_in_n(
            self.population_size
        )

        # Name handling: generic

        accent_dict = {}  # type: Dict[str, str]
        for accent_pair in accent_transliterations_csv.split(","):
            accent_components = accent_pair.split("/")
            if len(accent_components) != 2:
                raise ValueError(
                    f"Bad accent_transliterations_csv: "
                    f"{accent_transliterations_csv!r}"
                )
            accented = safe_upper(accent_components[0].strip())
            plain = safe_upper(accent_components[1].strip())
            if len(accented) != 1:
                raise ValueError(
                    f"Bad accent_transliterations_csv: "
                    f"{accent_transliterations_csv!r} -- contains accented "
                    f"character {accented!r}, which should be of length 1"
                )
            accent_dict[accented] = plain
        self.accent_transliterations = str.maketrans(accent_dict)
        self.nonspecific_name_components = set()  # type: Set[str]
        for nonspec in nonspecific_name_components_csv.split(","):
            self.nonspecific_name_components.add(nonspec.strip().upper())

        # Name handling: forenames

        self.forename_freq_info = forename_freq_info or NameFrequencyInfo(
            csv_filename=forename_sex_csv_filename,
            cache_filename=forename_cache_filename,
            min_frequency=check_prob(
                forename_min_frequency, Switches.FORENAME_MIN_FREQUENCY
            ),
            by_gender=True,
        )
        if not isinstance(self.forename_freq_info, NameFrequencyInfo):
            raise ValueError("Bad forename_freq_info")

        # Name handling: surnames

        self.surname_freq_info = surname_freq_info or NameFrequencyInfo(
            csv_filename=surname_csv_filename,
            cache_filename=surname_cache_filename,
            min_frequency=check_prob(
                surname_min_frequency, Switches.SURNAME_MIN_FREQUENCY
            ),
            by_gender=False,
        )
        if not isinstance(self.surname_freq_info, NameFrequencyInfo):
            raise ValueError("Bad surname_freq_info")

        # Population frequencies: DOB

        self.birth_year_pseudo_range = birth_year_pseudo_range
        if not (birth_year_pseudo_range >= 1):
            raise_bad(
                birth_year_pseudo_range, Switches.BIRTH_YEAR_PSEUDO_RANGE
            )

        # Population frequencies: sex/gender

        # ... Check this before using mk_gender_p_dict:
        self.p_female_given_m_or_f = check_prob(
            p_female_given_male_or_female,
            Switches.P_FEMALE_GIVEN_MALE_OR_FEMALE,
        )
        self.p_male_given_m_or_f = 1 - self.p_female_given_m_or_f
        self.p_not_male_or_female = check_prob(
            p_not_male_or_female, Switches.P_NOT_MALE_OR_FEMALE
        )
        p_male_or_female = 1 - p_not_male_or_female
        self.p_female = p_female_given_male_or_female * p_male_or_female
        self.p_male = p_male_or_female - self.p_female

        # Population frequencies: postcode

        self.postcode_freq_info = postcode_freq_info or PostcodeFrequencyInfo(
            csv_filename=postcode_csv_filename,
            cache_filename=postcode_cache_filename,
        )
        if not isinstance(self.postcode_freq_info, PostcodeFrequencyInfo):
            raise ValueError("Bad postcode_freq_info")
        self.p_unknown_or_pseudo_postcode_unit = check_prob(
            p_unknown_or_pseudo_postcode,
            Switches.P_UNKNOWN_OR_PSEUDO_POSTCODE,
            not_certain=True,
        )
        if k_pseudopostcode <= 1:
            raise ValueError(f"Bad {Switches.K_PSEUDOPOSTCODE}: must be >1")
        self.k_pseudopostcode = k_pseudopostcode
        self.p_unknown_or_pseudo_postcode_sector = check_prob(
            k_pseudopostcode * p_unknown_or_pseudo_postcode,
            f"P(unknown postcode or pseudopostcode sector | ¬H) = "
            f"{Switches.K_PSEUDOPOSTCODE} * "
            f"{Switches.P_UNKNOWN_OR_PSEUDO_POSTCODE}",
            not_certain=True,
        )
        self.k_postcode = (
            UK_POPULATION_2017 / self.population_size
            if k_postcode is None
            else k_postcode
        )
        self.p_known_postcode = 1 - self.p_unknown_or_pseudo_postcode_sector

        # Error probabilities: forenames

        self.p_ep1_forename = mk_gender_p_dict(
            p_ep1_forename, Switches.P_EP1_FORENAME
        )
        self.p_ep2np1_forename = mk_gender_p_dict(
            p_ep2np1_forename, Switches.P_EP2NP1_FORENAME
        )
        self.p_en_forename = mk_gender_p_dict(
            p_en_forename, Switches.P_EN_FORENAME
        )
        self.p_c_forename = mk_p_c_dict(
            p_ep1_=self.p_ep1_forename,
            p_ep2np1_=self.p_ep2np1_forename,
            p_en_=self.p_en_forename,
        )
        self.p_u_forename = check_prob(p_u_forename, Switches.P_U_FORENAME)

        # Error probabilities: surnames

        self.p_ep1_surname = mk_gender_p_dict(
            p_ep1_surname, Switches.P_EP1_SURNAME
        )
        self.p_ep2np1_surname = mk_gender_p_dict(
            p_ep2np1_surname, Switches.P_EP2NP1_SURNAME
        )
        self.p_en_surname = mk_gender_p_dict(
            p_en_surname, Switches.P_EN_SURNAME
        )
        self.p_c_surname = mk_p_c_dict(
            p_ep1_=self.p_ep1_surname,
            p_ep2np1_=self.p_ep2np1_surname,
            p_en_=self.p_en_surname,
        )

        # Error probabilities: DOB

        self.p_ep_dob = check_prob(p_ep_dob, Switches.P_EP_DOB)
        self.p_en_dob = check_prob(p_en_dob, Switches.P_EN_DOB)

        # Error probabilities: gender

        self.p_e_gender_error = check_prob(
            p_e_gender,
            Switches.P_E_GENDER,
        )

        # Error probabilities: postcode

        self.p_ep_postcode = check_prob(p_ep_postcode, Switches.P_EP_POSTCODE)
        self.p_en_postcode = check_prob(p_en_postcode, Switches.P_EN_POSTCODE)

        # Matching rules

        self.min_log_odds_for_match = min_log_odds_for_match
        self.exceeds_next_best_log_odds = exceeds_next_best_log_odds
        if perfect_id_translation is None:
            perfect_id_xlate_raw = {}
        elif isinstance(perfect_id_translation, dict):
            perfect_id_xlate_raw = perfect_id_translation
        elif isinstance(perfect_id_translation, str):
            perfect_id_xlate_raw = dict_from_str(perfect_id_translation)
        else:
            raise ValueError(
                f"Bad perfect_id_translation: {perfect_id_translation!r}"
            )
        self.perfect_id_translation = {
            standardize_perfect_id_key(k): standardize_perfect_id_value(v)
            for k, v in perfect_id_xlate_raw.values()
        }
        if self.perfect_id_translation:
            log.info(
                f"Using proband-to-sample perfect ID translation: "
                f"{self.perfect_id_translation}"
            )

        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Some derived frequencies
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

        # DOB:

        self.p_c_dob = 1 - self.p_ep_dob - self.p_en_dob
        assert 0 <= self.p_c_dob <= 1
        # These ignore the specialness of 29 February:
        self.p_f_dob = 1 / (DAYS_PER_YEAR * birth_year_pseudo_range)
        p_share_dob_md_not_ymd = (1 / DAYS_PER_YEAR) - self.p_f_dob
        p_share_dob_yd_not_ymd = (
            1 / (DAYS_PER_MONTH * birth_year_pseudo_range)
        ) - self.p_f_dob
        p_share_dob_ym_not_ymd = (
            1 / (MONTHS_PER_YEAR * birth_year_pseudo_range)
        ) - self.p_f_dob
        # These three are mutually exclusive possibilities (e.g. you can't
        # share YM and MD without sharing YMD), so we can just sum:
        self.p_pnf_dob = (
            p_share_dob_md_not_ymd
            + p_share_dob_yd_not_ymd
            + p_share_dob_ym_not_ymd
        )
        self.p_n_dob = 1 - self.p_f_dob - self.p_pnf_dob
        assert 0 <= self.p_f_dob <= 1
        assert 0 <= p_share_dob_md_not_ymd <= 1
        assert 0 <= p_share_dob_yd_not_ymd <= 1
        assert 0 <= p_share_dob_ym_not_ymd <= 1
        assert 0 <= self.p_pnf_dob <= 1
        assert 0 <= self.p_n_dob <= 1

        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Technical
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

        self.extra_validation_output = extra_validation_output
        self.check_comparison_order = check_comparison_order
        self.report_every = report_every
        self.min_probands_for_parallel = min_probands_for_parallel
        self.n_workers = n_workers

        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Reporting
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

        self.partial_dob_mismatch_allowed = self.p_c_dob < 1
        self.complete_dob_mismatch_allowed = self.p_en_dob > 0
        if self.complete_dob_mismatch_allowed:
            potential_speedup_factor = round_sf(
                normal_round_int(1 / (1 - self.p_n_dob)),
                n=3,
            )
            log.warning(
                f"You are allowing a person's DOB to be completely different, "
                f"with p = {self.p_en_dob}. That is valid but much less "
                f"efficient computationally (by an estimated factor of about "
                f"{potential_speedup_factor})."
            )
        _ = """
        Speedup: for a 90-year range (b = 90), this is a factor of about 252.

        For a single year, it's about 9; if I'm born on 1 Jan, allowing
        single-component errors mean we need to consider 1 Jan, but also all of
        Jan, and all other firsts of the month -- total 42 out of 365 days, or
        1/8.69 of the year.

        For a multi-year range, the speedup increases: if I'm born on 1 Jan
        1950 and we are considering 1900-1999, we'd need to consider 1950-01-01
        (1), ????-01-01 (100), 1950-01-?? (31), 1950-??-01 (12), minus the
        overlaps (3), giving 141 possibilities but out of about 36500, i.e.
        considering only 1/259 of the candidates.

        To find probabilities in terms of b, using Octave:

            pkg load symbolic
            syms b p_f_dob p_pnf_dob p_n_dob speedup_no_mismatch speedup_no_partial second_stage_speedup
            DAYS_PER_YEAR = 365.25
            DAYS_PER_MONTH = 30.4375
            MONTHS_PER_YEAR = 12

            p_f_dob = 1 / (DAYS_PER_YEAR * b)
                # = 4 / (1461⋅b)

            p_pnf_dob = (
                1 / DAYS_PER_YEAR
                + 1 / (DAYS_PER_MONTH * b)
                + 1 / (MONTHS_PER_YEAR * b)
                - 3 / (DAYS_PER_YEAR * b)
            )
            simplify(p_pnf_dob)
                # = (16⋅b + 631) / (5844⋅b)

            p_n_dob = 1 - p_f_dob - p_pnf_dob
            simplify(p_n_dob)

            p_full_or_partial_match = 1 - p_n_dob
            speedup_no_mismatch = 1 / p_full_or_partial_match
            simplify(speedup_no_mismatch)
                # = 5844⋅b / (16⋅b + 647)

            speedup_no_partial = 1 / p_f_dob
            simplify(speedup_no_partial)
                # = 1461⋅b / 4

            second_stage_speedup = speedup_no_partial / speedup_no_mismatch
            simplify(second_stage_speedup)
                # = b + 647 / 16

        """  # noqa

        if verbose:
            log.debug(f"... MatchConfig built. Settings: {self}")
            # log.debug(
            #     f"p_dob_correct = {self.p_dob_correct}, "
            #     f"p_dob_single_component_error = "
            #     f"{self.p_dob_single_component_error}, "
            #     f"p_dob_major_error = {self.p_dob_major_error}"
            # )
            # log.debug(
            #     f"p_two_people_share_dob_ymd = "
            #     f"{self.p_two_people_share_dob_ymd}, "
            #     f"p_share_dob_md_not_ymd = {p_share_dob_md_not_ymd}, "
            #     f"p_share_dob_yd_not_ymd = {p_share_dob_yd_not_ymd}, "
            #     f"p_share_dob_ym_not_ymd = {p_share_dob_ym_not_ymd}, "
            #     f"p_two_people_have_partial_dob_match = "
            #     f"{self.p_two_people_partial_dob_match}, "
            #     f"p_two_people_no_dob_similarity = "
            #     f"{self.p_two_people_no_dob_similarity}"
            # )

    # -------------------------------------------------------------------------
    # String representation
    # -------------------------------------------------------------------------

    def __str__(self) -> str:
        """
        Returns a string description of this object, as produced by
        ``auto_repr``.
        """
        description = auto_repr(self)
        return description

    # not __repr__(), or it clutters up all the other objects

    # -------------------------------------------------------------------------
    # Identifier frequency information
    # -------------------------------------------------------------------------

[docs]    def get_forename_freq_info(
        self, name: str, gender: str, prestandardized: bool = False
    ) -> BasicNameFreqInfo:
        """
        Returns the baseline frequency of a forename.

        Args:
            name: the name to check
            gender: the gender to look up for
            prestandardized: was the name pre-standardized?
        """
        if not prestandardized:
            name = standardize_name(name)
        freq_func = self.forename_freq_info.name_frequency_info
        if gender in (GENDER_FEMALE, GENDER_MALE):
            return freq_func(name, gender, prestandardized=True)
        # Otherwise, take the mean across genders:
        return BasicNameFreqInfo.weighted_mean(
            objects=[
                freq_func(name, GENDER_FEMALE, prestandardized=True),
                freq_func(name, GENDER_MALE, prestandardized=True),
            ],
            weights=[self.p_female, self.p_male],
        )

[docs]    def get_surname_freq_info(
        self, name: str, prestandardized: bool = False
    ) -> BasicNameFreqInfo:
        """
        Returns the baseline frequency of a surname.

        Args:
            name: the name to check
            prestandardized: was it pre-standardized?
        """
        return self.surname_freq_info.name_frequency_info(
            name, prestandardized=prestandardized
        )

    def gender_freq(self, gender: str) -> Optional[float]:
        """
        Returns the population frequency of a gender.

        Args:
            gender: the gender to look up

        Returns:
            ``None`` if the gender is blank/falsy; the female or male
            population proportion for those genders; otherwise, the
            proportion of people who are neither male nor female.
        """
        if not gender:
            return None
        if gender == GENDER_FEMALE:
            return self.p_female
        if gender == GENDER_MALE:
            return self.p_male
        return self.p_not_male_or_female

[docs]    def is_valid_postcode(self, postcode_unit: str) -> bool:
        """
        Is this a valid postcode?
        """
        return self.postcode_freq_info.debug_is_valid_postcode(postcode_unit)

[docs]    def postcode_unit_sector_freq(
        self, postcode_unit: str, prestandardized: bool = False
    ) -> Tuple[float, float]:
        """
        Returns the frequency for a full postcode, or postcode unit (the
        proportion of the population who live in that postcode), and the
        corresponding larger-scale postcode sector.

        The underlying function ensures that the sector frequency is as least
        as big as the unit frequency.
        """
        return self.postcode_freq_info.postcode_unit_sector_frequency(
            postcode_unit, prestandardized=prestandardized
        )

[docs]    def debug_postcode_unit_population(
        self, postcode_unit: str, prestandardized: bool = False
    ) -> float:
        """
        Returns the calculated population of a postcode unit.

        Args:
            postcode_unit: the postcode unit to check
            prestandardized: was the postcode pre-standardized in format?
        """
        return self.postcode_freq_info.debug_postcode_unit_population(
            postcode_unit, prestandardized=prestandardized
        )

[docs]    def debug_postcode_sector_population(
        self, postcode_sector: str, prestandardized: bool = False
    ) -> float:
        """
        Returns the calculated population of a postcode sector.

        Args:
            postcode_sector: the postcode sector to check
            prestandardized: was the postcode pre-standardized in format?
        """
        return self.postcode_freq_info.debug_postcode_sector_population(
            postcode_sector, prestandardized=prestandardized
        )

    # -------------------------------------------------------------------------
    # Comparisons
    # -------------------------------------------------------------------------

[docs]    def exceeds_primary_threshold(self, log_odds_match: float) -> bool:
        """
        Decides as to whether the log odds, representing P(H | D) from a
        comparison of two :class:`Person` objects, are sufficient for a match,
        based on our threshold.

        Args:
            log_odds_match: log odds that they're the same person

        Returns:
            bool: binary decision
        """
        return log_odds_match >= self.min_log_odds_for_match

    # -------------------------------------------------------------------------
    # Perfect ID handling
    # -------------------------------------------------------------------------

    def remap_perfect_id_key(self, key: str) -> str:
        """
        Translates a perfect ID key via the configured translation
        dictionary; keys with no translation are returned unchanged.
        """
        translation = self.perfect_id_translation
        return translation[key] if key in translation else key


# =============================================================================
# Dummy config that doesn't load frequency information
# =============================================================================


def mk_dummy_match_config() -> MatchConfig:
    """
    Builds a :class:`MatchConfig` in which every frequency CSV filename and
    cache filename is blank, so that the frequency information is empty.
    """
    blank_filename_args = (
        "forename_cache_filename",
        "forename_sex_csv_filename",
        "surname_cache_filename",
        "surname_csv_filename",
        "postcode_cache_filename",
        "postcode_csv_filename",
    )
    return MatchConfig(**dict.fromkeys(blank_filename_args, ""))