Source code for crate_anon.linkage.tests.fuzzy_id_match_tests

"""
crate_anon/linkage/tests/fuzzy_id_match_tests.py

===============================================================================

    Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
    Created by Rudolf Cardinal (rnc1001@cam.ac.uk).

    This file is part of CRATE.

    CRATE is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    CRATE is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with CRATE. If not, see <https://www.gnu.org/licenses/>.

===============================================================================

Unit tests.

"""

# =============================================================================
# Imports
# =============================================================================

import logging
import unittest
from typing import List, Optional, Tuple, Type

from cardinal_pythonlib.probability import probability_from_log_odds
from pendulum import Date

from crate_anon.linkage.comparison import (
    AdjustLogOddsComparison,
    Comparison,
    DirectComparison,
)
from crate_anon.linkage.constants import (
    FuzzyDefaults,
    GENDER_FEMALE,
    GENDER_MALE,
    GENDER_MISSING,
    GENDER_OTHER,
    INFINITY,
    VALID_GENDERS,
)
from crate_anon.linkage.frequencies import (
    BasicNameFreqInfo,
    NameFrequencyInfo,
    PostcodeFrequencyInfo,
)
from crate_anon.linkage.identifiers import (
    DateOfBirth,
    DummyLetterIdentifier,
    DummyLetterTemporalIdentifier,
    Forename,
    gen_best_comparisons,
    Gender,
    Identifier,
    PerfectID,
    Postcode,
    Surname,
    SurnameFragment,
    TemporalIDHolder,
)
from crate_anon.linkage.helpers import (
    get_postcode_sector,
    is_valid_isoformat_date,
    ln,
    POSTCODE_REGEX,
    remove_redundant_whitespace,
    safe_upper,
    simplify_punctuation_whitespace,
    standardize_name,
    standardize_postcode,
    surname_alternative_fragments,
)
from crate_anon.linkage.matchconfig import MatchConfig
from crate_anon.linkage.people import DuplicateIDError, People
from crate_anon.linkage.person import Person

log = logging.getLogger(__name__)


# =============================================================================
# Constants
# =============================================================================

BAD_DATE_STRINGS = ["1950-31-12", "1950", "blah", "2000-02-30"]
GOOD_DATE_STRINGS = ["1950-12-31", "1890-01-01", "2000-01-01"]
BAD_POSTCODES = [
    "99XX99",
    "CB99 9XXY",
    "CB99",
    "CB2",
    "NW19TTTEMP",
    "NW19TT TEMP",
]
GOOD_POSTCODES = [
    "CB99 9XY",
    "CB2 0QQ",
    "ZZ99 3VZ",
    "Z Z 9 9 3 V Z",
    " zz993vz ",
]  # good once standardized, anyway
BAD_GENDERS = ["Y", "male", "female", "?"]


# =============================================================================
# Rapid creation of a dummy config (without loading actual name/postcode info)
# =============================================================================


[docs]def mk_test_config(**kwargs) -> MatchConfig:
    """
    Create a dummy config, using dummy name/postcode info.
    """
    predefined_forenames = [
        BasicNameFreqInfo(
            name="ALICE",
            gender=GENDER_FEMALE,
            p_name=0.0032597245570899847,
            metaphone="ALK",
            p_metaphone=0.005664202032042135,
            p_metaphone_not_name=0.00240447747495215,
            f2c="AL",
            p_f2c=0.027635117202534115,
            p_f2c_not_name_metaphone=0.022541989771499515,
            synthetic=False,
        ),
        BasicNameFreqInfo(
            name="BEATRICE",
            gender=GENDER_FEMALE,
            p_name=0.0011134697472956023,
            metaphone="PTRK",
            p_metaphone=0.010795171997297154,
            p_metaphone_not_name=0.009681702250001551,
            f2c="BE",
            p_f2c=0.020540629656206778,
            p_f2c_not_name_metaphone=0.01938862260342886,
            synthetic=False,
        ),
        BasicNameFreqInfo(
            name="BETTY",
            gender=GENDER_FEMALE,
            p_name=0.005856056682186572,
            metaphone="PT",
            p_metaphone=0.007567968531021441,
            p_metaphone_not_name=0.0017119118488348687,
            f2c="BE",
            p_f2c=0.020540629656206778,
            p_f2c_not_name_metaphone=0.014031211254451567,
            synthetic=False,
        ),
        BasicNameFreqInfo(
            name="BOB",
            gender=GENDER_MALE,
            p_name=0.0005341749908504777,
            metaphone="PP",
            p_metaphone=0.002569054271327976,
            p_metaphone_not_name=0.0020348792804774983,
            f2c="BO",
            p_f2c=0.0035610312205931094,
            p_f2c_not_name_metaphone=0.0010918026974037107,
            synthetic=False,
        ),
        BasicNameFreqInfo(
            name="CAROLINE",
            gender=GENDER_FEMALE,
            p_name=0.001289812197195456,
            metaphone="KRLN",
            p_metaphone=0.005979308865585442,
            p_metaphone_not_name=0.004689496668389986,
            f2c="CA",
            p_f2c=0.033910941860871194,
            p_f2c_not_name_metaphone=0.02860674130257904,
            synthetic=False,
        ),
        BasicNameFreqInfo(
            name="CELIA",
            gender=GENDER_FEMALE,
            p_name=0.0003141885536034312,
            metaphone="KL",
            p_metaphone=0.016359410337593906,
            p_metaphone_not_name=0.016045221783990475,
            f2c="CE",
            p_f2c=0.0030682294813082723,
            p_f2c_not_name_metaphone=0.0026663592268114434,
            synthetic=False,
        ),
        BasicNameFreqInfo(
            name="DELILAH",
            gender=GENDER_FEMALE,
            p_name=0.00019936172952521078,
            metaphone="TLL",
            p_metaphone=0.000491534931894549,
            p_metaphone_not_name=0.00029217320236933826,
            f2c="DE",
            p_f2c=0.02472305974107954,
            p_f2c_not_name_metaphone=0.024435022377723725,
            synthetic=False,
        ),
        BasicNameFreqInfo(
            name="DOROTHY",
            gender=GENDER_FEMALE,
            p_name=0.006484867451993301,
            metaphone="TR0",
            p_metaphone=0.007164437365410392,
            p_metaphone_not_name=0.0006795699134170908,
            f2c="DO",
            p_f2c=0.020044376270378746,
            p_f2c_not_name_metaphone=0.01298493890496824,
            synthetic=False,
        ),
        BasicNameFreqInfo(
            name="ELIZABETH",
            gender=GENDER_FEMALE,
            p_name=0.009497275400440382,
            metaphone="ALSP",
            p_metaphone=0.010079561736620864,
            p_metaphone_not_name=0.0005822863361804823,
            f2c="EL",
            p_f2c=0.02543961854560152,
            p_f2c_not_name_metaphone=0.015404362973960957,
            synthetic=False,
        ),
    ]  # type: List[BasicNameFreqInfo]
    forename_freq_info = NameFrequencyInfo(
        csv_filename="",
        cache_filename="",
        by_gender=True,
        min_frequency=FuzzyDefaults.FORENAME_MIN_FREQ,
    )
    for f in predefined_forenames:
        forename_freq_info.name_gender_idx[f.name, f.gender] = f

    predefined_surnames = [
        BasicNameFreqInfo(
            name="JONES",
            gender="",
            p_name=0.00621,
            metaphone="JNS",
            p_metaphone=0.0068899999999999986,
            p_metaphone_not_name=0.0006799999999999983,
            f2c="JO",
            p_f2c=0.019480000000000268,
            p_f2c_not_name_metaphone=0.012984999999999938,
            synthetic=False,
        ),
        BasicNameFreqInfo(
            name="SMITH",
            gender="",
            p_name=0.01006,
            metaphone="SM0",
            p_metaphone=0.010129999999999998,
            p_metaphone_not_name=6.999999999999888e-05,
            f2c="SM",
            p_f2c=0.012514999999999967,
            p_f2c_not_name_metaphone=0.0023849999999999896,
            synthetic=False,
        ),
    ]  # type: List[BasicNameFreqInfo]
    surname_freq_info = NameFrequencyInfo(
        csv_filename="",
        cache_filename="",
        by_gender=False,
        min_frequency=FuzzyDefaults.SURNAME_MIN_FREQ,
    )
    for s in predefined_surnames:
        surname_freq_info.name_gender_idx[s.name, s.gender] = s

    postcode_freq_info = PostcodeFrequencyInfo(
        csv_filename="", cache_filename=""
    )

    return MatchConfig(
        forename_freq_info=forename_freq_info,
        surname_freq_info=surname_freq_info,
        postcode_freq_info=postcode_freq_info,
        **kwargs,
    )


# =============================================================================
# Helper class
# =============================================================================


[docs]class TestCondition:
    """
    Two representations of a person and whether they should match.
    """

[docs]    def __init__(
        self,
        cfg: MatchConfig,
        person_a: Person,
        person_b: Person,
        should_match: bool,
        debug: bool = True,
    ) -> None:
        """
        Args:
            cfg: the main :class:`MatchConfig` object
            person_a: one representation of a person
            person_b: another representation of a person
            should_match: should they be treated as the same person?
            debug: be verbose?
        """
        self.cfg = cfg
        self.person_a = person_a
        self.person_b = person_b
        self.should_match = should_match

        for id_person in (self.person_a, self.person_b):
            assert id_person.is_plaintext()
            id_person.ensure_valid_as_proband()
            for identifier in id_person.debug_gen_identifiers():
                assert identifier.is_plaintext

        log.info("- Making hashed versions for later")
        self.hashed_a = self.person_a.hashed()
        self.hashed_b = self.person_b.hashed()
        for h_person in (self.hashed_a, self.hashed_b):
            assert h_person.is_hashed()
            h_person.ensure_valid_as_proband()
            for identifier in h_person.debug_gen_identifiers():
                assert not identifier.is_plaintext
        self.debug = debug

[docs]    def log_odds_same_plaintext(self) -> float:
        """
        Checks whether the plaintext person objects match.

        Returns:
            float: the log odds that they are the same person
        """
        return self.person_a.log_odds_same(self.person_b)

[docs]    def log_odds_same_hashed(self) -> float:
        """
        Checks whether the hashed versions match.

        Returns:
            float: the log odds that they are the same person
        """
        return self.hashed_a.log_odds_same(self.hashed_b)

[docs]    def matches_plaintext(self) -> Tuple[bool, float]:
        """
        Do the plaintext versions match, by threshold?

        Returns:
            tuple: (matches, log_odds)
        """
        log_odds = self.log_odds_same_plaintext()
        return self.cfg.exceeds_primary_threshold(log_odds), log_odds

[docs]    def matches_hashed(self) -> Tuple[bool, float]:
        """
        Do the raw versions match, by threshold?

        Returns:
            bool: is there a match?
        """
        log_odds = self.log_odds_same_hashed()
        return self.cfg.exceeds_primary_threshold(log_odds), log_odds

[docs]    def check_comparison_as_expected(self) -> None:
        """
        Asserts that both the raw and hashed versions match, or don't match,
        according to ``self.should_match``.
        """
        log.info(
            f"Comparing:\n" f"- {self.person_a!r}\n" f"- {self.person_b!r}"
        )
        log.info("(1) Comparing plaintext")
        matches_raw, log_odds_plaintext = self.matches_plaintext()
        p_plaintext = probability_from_log_odds(log_odds_plaintext)
        p_plain_str = f"P(match | D) = {p_plaintext}"
        if matches_raw == self.should_match:
            if matches_raw:
                log.info(f"... should and does match: {p_plain_str}")
            else:
                log.info(f"... should not and does not match: {p_plain_str}")
        else:
            log_odds = log_odds_plaintext
            report = self.person_a.debug_comparison_report(
                self.person_b, verbose=False
            )
            raise AssertionError(
                f"Match failure: "
                f"matches_raw = {matches_raw}, "
                f"should_match = {self.should_match}, "
                f"log_odds = {log_odds}, "
                f"min_log_odds_for_match = {self.cfg.min_log_odds_for_match}, "
                f"P(match) = {probability_from_log_odds(log_odds)}, "
                f"person_a = {self.person_a}, "
                f"person_b = {self.person_b}, "
                f"report = {report}"
            )

        log.info(
            f"(2) Comparing hashed:\n"
            f"- {self.hashed_a}\n"
            f"- {self.hashed_b}"
        )
        matches_hashed, log_odds_hashed = self.matches_hashed()
        p_hashed = probability_from_log_odds(log_odds_hashed)
        p_hashed_str = f"P(match | D) = {p_hashed}"
        if matches_hashed == self.should_match:
            if matches_hashed:
                log.info(f"... should and does match: {p_hashed_str}")
            else:
                log.info(f"... should not and does not match: {p_hashed_str}")
        else:
            log_odds = log_odds_hashed
            report = self.hashed_a.debug_comparison_report(
                self.hashed_b, verbose=False
            )
            raise AssertionError(
                f"Match failure: "
                f"matches_hashed = {matches_hashed}, "
                f"should_match = {self.should_match}, "
                f"log_odds = {log_odds}, "
                f"threshold = {self.cfg.min_log_odds_for_match}, "
                f"min_log_odds_for_match = {self.cfg.min_log_odds_for_match}, "
                f"P(match) = {probability_from_log_odds(log_odds)}, "
                f"person_a = {self.person_a}, "
                f"person_b = {self.person_b}, "
                f"hashed_a = {self.hashed_a}, "
                f"hashed_b = {self.hashed_b}, "
                f"report = {report}"
            )

        log.info(
            "(3) Results of plaintext match should equal result of hashed "
            "match"
        )
        if log_odds_hashed != log_odds_plaintext:
            raise AssertionError(
                "Plaintext/hashed comparison discrepancy: "
                f"person_a = {self.person_a}, "
                f"person_b = {self.person_b}, "
                "log_odds_plaintext = {log_odds_plaintext}, "
                f"log_odds_hashed = {log_odds_hashed}"
            )


# =============================================================================
# Unit tests
# =============================================================================


[docs]class DummyTemporalIdentifierTests(unittest.TestCase):
    """
    Unit tests for :class:`DummyTemporalIdentifier`.
    """

    def test_overlap(self) -> None:
        d1 = Date(2000, 1, 1)
        d2 = Date(2000, 1, 2)
        d3 = Date(2000, 1, 3)
        d4 = Date(2000, 1, 4)
        p = "dummypostcode"
        # ---------------------------------------------------------------------
        # Overlaps
        # ---------------------------------------------------------------------
        self.assertEqual(
            TemporalIDHolder(p, d1, d2).overlaps(TemporalIDHolder(p, d2, d3)),
            True,
        )
        self.assertEqual(
            TemporalIDHolder(p, d2, d3).overlaps(TemporalIDHolder(p, d1, d2)),
            True,
        )
        self.assertEqual(
            TemporalIDHolder(p, d1, d4).overlaps(TemporalIDHolder(p, d2, d3)),
            True,
        )
        self.assertEqual(
            TemporalIDHolder(p, d1, None).overlaps(
                TemporalIDHolder(p, None, d4)
            ),
            True,
        )
        self.assertEqual(
            TemporalIDHolder(p, None, None).overlaps(
                TemporalIDHolder(p, None, None)
            ),
            True,
        )
        # ---------------------------------------------------------------------
        # Non-overlaps
        # ---------------------------------------------------------------------
        self.assertEqual(
            TemporalIDHolder(p, d1, d2).overlaps(TemporalIDHolder(p, d3, d4)),
            False,
        )
        self.assertEqual(
            TemporalIDHolder(p, None, d1).overlaps(
                TemporalIDHolder(p, d2, None)
            ),
            False,
        )


[docs]class FuzzyLinkageTests(unittest.TestCase):
    """
    Tests of the fuzzy linkage system.
    """

[docs]    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.cfg = mk_test_config(rounding_sf=None)
        self.p1 = Postcode(
            cfg=self.cfg,
            postcode="CB2 0QQ",  # Addenbrooke's Hospital
            start_date=Date(2000, 1, 1),
            end_date=Date(2010, 1, 1),
        )
        self.p2 = Postcode(
            cfg=self.cfg,
            postcode="CB2 3EB",  # Department of Psychology
            start_date=Date(2000, 1, 1),
            end_date=Date(2010, 1, 1),
        )
        self.alice_bcd_rarename_2000_add = Person(
            cfg=self.cfg,
            local_id="1",
            forenames=["Alice", "Beatrice", "Celia", "Delilah"],
            surnames=["Rarename"],
            dob="2000-01-01",
            postcodes=[self.p1],
        )
        self.alec_bcd_rarename_2000_add = Person(
            cfg=self.cfg,
            local_id="2",
            forenames=["Alec", "Beatrice", "Celia", "Delilah"],
            # Alec: same metaphone as Alice
            surnames=["Rarename"],
            dob="2000-01-01",
            postcodes=[self.p1],
        )
        self.bob_bcd_rarename_2000_add = Person(
            cfg=self.cfg,
            local_id="3",
            forenames=["Bob", "Beatrice", "Celia", "Delilah"],
            surnames=["Rarename"],
            dob="2000-01-01",
            postcodes=[self.p1],
        )
        self.alice_bc_rarename_2000_add = Person(
            cfg=self.cfg,
            local_id="4",
            forenames=["Alice", "Beatrice", "Celia"],
            surnames=["Rarename"],
            dob="2000-01-01",
            postcodes=[self.p1],
        )
        self.alice_b_rarename_2000_add = Person(
            cfg=self.cfg,
            local_id="5",
            forenames=["Alice", "Beatrice"],
            surnames=["Rarename"],
            dob="2000-01-01",
            postcodes=[self.p1],
        )
        self.alice_jones_2000_add = Person(
            cfg=self.cfg,
            local_id="6",
            forenames=["Alice"],
            surnames=["Jones"],
            dob="2000-01-01",
            postcodes=[self.p1],
        )
        self.bob_smith_1950_psych = Person(
            cfg=self.cfg,
            local_id="7",
            forenames=["Bob"],
            surnames=["Smith"],
            dob="1950-05-30",
            postcodes=[self.p2],
        )
        self.alice_smith_1930 = Person(
            cfg=self.cfg,
            local_id="8",
            forenames=["Alice"],
            surnames=["Smith"],
            dob="1930-01-01",
        )
        self.alice_smith_2000 = Person(
            cfg=self.cfg,
            local_id="9",
            forenames=["Alice"],
            surnames=["Smith"],
            dob="2000-01-01",
        )
        self.alice_smith = Person(
            cfg=self.cfg,
            local_id="10",
            forenames=["Alice"],
            surnames=["Smith"],
        )
        self.alice_bc_smith = Person(
            cfg=self.cfg,
            local_id="11",
            forenames=["Alice", "Betty", "Caroline"],
            surnames=["Smith"],
        )
        self.alice_bde_smith = Person(
            cfg=self.cfg,
            local_id="12",
            forenames=["Alice", "Betty", "Dorothy", "Elizabeth"],
            surnames=["Smith"],
        )
        self.all_people = [
            self.alice_bcd_rarename_2000_add,
            self.alec_bcd_rarename_2000_add,
            self.bob_bcd_rarename_2000_add,
            self.alice_bc_rarename_2000_add,
            self.alice_b_rarename_2000_add,
            self.alice_jones_2000_add,
            self.bob_smith_1950_psych,
            self.alice_smith_1930,
            self.alice_smith_2000,
            self.alice_smith,
            self.alice_bc_smith,
            self.alice_bde_smith,
        ]
        self.all_people_hashed = [p.hashed() for p in self.all_people]
        self.people_plaintext = People(cfg=self.cfg)
        self.people_plaintext.add_people(self.all_people)
        self.people_hashed = People(cfg=self.cfg)
        self.people_hashed.add_people(self.all_people_hashed)

    # -------------------------------------------------------------------------
    # Basic string transformations
    # -------------------------------------------------------------------------

    def test_standardize_name(self) -> None:
        tests = (
            # name, standardized version
            ("Al Jazeera", "ALJAZEERA"),
            ("Al'Jazeera", "ALJAZEERA"),
            ("Al'Jazeera'", "ALJAZEERA"),
            ("Alice", "ALICE"),
            ("ALJAZEERA", "ALJAZEERA"),
            ("aljazeera", "ALJAZEERA"),
            ("D'Souza", "DSOUZA"),
            ("de Clérambault", "DECLERAMBAULT"),
            ("Mary Ellen", "MARYELLEN"),
            ('"Al Jazeera"', "ALJAZEERA"),
            ("Müller", "MULLER"),
            ("Straße", "STRASSE"),
        )
        for item, target in tests:
            self.assertEqual(standardize_name(item), target)

    def test_safe_upper(self) -> None:
        tests = (
            ("Beethoven", "BEETHOVEN"),
            ("Clérambault", "CLÉRAMBAULT"),
            ("Straße", "STRAẞE"),
        )
        for a, b in tests:
            self.assertEqual(safe_upper(a), b)

    def test_remove_redundant_whitespace(self) -> None:
        tests = ((" van \t \r \n Beethoven ", "van Beethoven"),)
        for a, b in tests:
            self.assertEqual(remove_redundant_whitespace(a), b)

    def test_simplify_punctuation_whitespace(self) -> None:
        tests = (
            ("\n ‘John said “hello”.’", "  'John said \"hello\".'"),
            ("\t a–b—c−d-e ", "  a-b-c-d-e "),
        )
        for a, b in tests:
            self.assertEqual(simplify_punctuation_whitespace(a), b)

    def test_surname_fragments(self) -> None:
        cfg = self.cfg
        accent_transliterations = cfg.accent_transliterations
        nonspecific_name_components = cfg.nonspecific_name_components
        tests = (
            # In the expected answer, the original name (standardized) comes
            # first; then alphabetical order of all other variants. Some
            # examples are silly.
            #
            # France/French:
            (
                "Côte d'Ivoire",
                ["CÔTEDIVOIRE", "COTE", "COTEDIVOIRE", "CÔTE", "IVOIRE"],
            ),
            (
                "de Clérambault",
                [
                    "DECLÉRAMBAULT",
                    "CLERAMBAULT",
                    "CLÉRAMBAULT",
                    "DECLERAMBAULT",
                ],
            ),
            (
                "de la Billière",
                ["DELABILLIÈRE", "BILLIERE", "BILLIÈRE", "DELABILLIERE"],
            ),
            ("Façade", ["FAÇADE", "FACADE"]),
            ("Giscard d'Estaing", ["GISCARDDESTAING", "ESTAING", "GISCARD"]),
            ("L'Estrange", ["LESTRANGE", "ESTRANGE"]),
            ("L’Estrange", ["LESTRANGE", "ESTRANGE"]),
            # Germany (and in Beethoven's case, ancestrally Belgium):
            ("Beethoven", ["BEETHOVEN"]),
            ("Mozart Smith", ["MOZARTSMITH", "MOZART", "SMITH"]),
            ("Mozart-Smith", ["MOZARTSMITH", "MOZART", "SMITH"]),
            ("Müller", ["MÜLLER", "MUELLER", "MULLER"]),
            ("Straße", ["STRAẞE", "STRASSE"]),
            ("van  Beethoven", ["VANBEETHOVEN", "BEETHOVEN"]),
            # Italy:
            ("Calabrò", ["CALABRÒ", "CALABRO"]),
            ("De Marinis", ["DEMARINIS", "MARINIS"]),
            ("di Bisanzio", ["DIBISANZIO", "BISANZIO"]),
            # Sweden:
            ("Nyström", ["NYSTRÖM", "NYSTROEM", "NYSTROM"]),
            # Hmm. NYSTROEM is a German-style transliteration. Still, OK-ish.
        )
        for surname, target_fragments in tests:
            self.assertEqual(
                surname_alternative_fragments(
                    surname=surname,
                    accent_transliterations=accent_transliterations,
                    nonspecific_name_components=nonspecific_name_components,
                ),
                target_fragments,
            )

    def test_date_regex(self) -> None:
        for b in BAD_DATE_STRINGS:
            self.assertFalse(is_valid_isoformat_date(b))
        for g in GOOD_DATE_STRINGS:
            self.assertTrue(is_valid_isoformat_date(g))

    def test_standardize_postcode(self) -> None:
        tests = (
            # name, standardized version
            ("CB20QQ", "CB20QQ"),
            ("   CB2 0QQ   ", "CB20QQ"),
            ("   CB2-0 QQ   ", "CB20QQ"),
            ("cb2 0qq", "CB20QQ"),
        )
        for item, target in tests:
            self.assertEqual(standardize_postcode(item), target)

    def test_get_postcode_sector(self) -> None:
        tests = (
            # postcode, sector
            ("CB20QQ", "CB20"),
            ("   CB2 0QQ   ", "CB20"),
            ("   CB2-0 QQ   ", "CB20"),
            ("cb2 0qq", "CB20"),
        )
        for item, target in tests:
            self.assertEqual(get_postcode_sector(item), target)

    def test_postcode_regex(self) -> None:
        for b in BAD_POSTCODES:
            self.assertIsNone(
                POSTCODE_REGEX.match(b), f"Postcode {b!r} matched but is bad"
            )
            sb = standardize_postcode(b)
            self.assertIsNone(
                POSTCODE_REGEX.match(sb),
                f"Postcode {b!r} matched after standardization to {sb!r} "
                f"but is bad",
            )
        for g in GOOD_POSTCODES:
            sg = standardize_postcode(g)
            self.assertTrue(
                POSTCODE_REGEX.match(sg),
                f"Postcode {sg!r} (from {g!r}) did not match but is good",
            )

    # -------------------------------------------------------------------------
    # Frequencies
    # -------------------------------------------------------------------------

    def test_fuzzy_linkage_frequencies_name(self) -> None:
        cfg = self.cfg
        for surname in [
            "Smith",
            "Jones",
            "Blair",
            "Cardinal",
            "XYZ",
            "W",  # no metaphone
        ]:
            f = cfg.get_surname_freq_info(surname)
            log.info(f"Surname frequency for {surname}: {f}")

            self.assertIsInstance(f.name, str)
            self.assertIsInstance(f.gender, str)
            self.assertIsInstance(f.p_name, float)

            self.assertIsInstance(f.metaphone, str)
            self.assertIsInstance(f.p_metaphone, float)
            self.assertIsInstance(f.p_metaphone_not_name, float)

            self.assertIsInstance(f.f2c, str)
            self.assertIsInstance(f.p_f2c, float)
            self.assertIsInstance(f.p_f2c_not_name_metaphone, float)

        for forename, gender in [
            ("James", GENDER_MALE),
            ("Rachel", GENDER_FEMALE),
            ("Phoebe", GENDER_FEMALE),
            ("Elizabeth", GENDER_FEMALE),
            ("Elizabeth", GENDER_MALE),
            ("Elizabeth", ""),
            ("Rowan", GENDER_FEMALE),
            ("Rowan", GENDER_MALE),
            ("Rowan", ""),
            ("XYZ", ""),
            ("W", ""),  # no metaphone
        ]:
            f = cfg.get_forename_freq_info(forename, gender)
            log.info(
                f"Forename frequency for {forename}, gender {gender}: {f}"
            )
            self.assertIsInstance(f.name, str)
            self.assertIsInstance(f.gender, str)
            self.assertIsInstance(f.p_name, float)

            self.assertIsInstance(f.metaphone, str)
            self.assertIsInstance(f.p_metaphone, float)
            self.assertIsInstance(f.p_metaphone_not_name, float)

            self.assertIsInstance(f.f2c, str)
            self.assertIsInstance(f.p_f2c, float)
            self.assertIsInstance(f.p_f2c_not_name_metaphone, float)

    def test_fuzzy_linkage_frequencies_postcode(self) -> None:
        cfg = self.cfg
        # Examples are hospitals and colleges in Cambridge (not residential)
        # but it gives a broad idea.
        for postcode in ["CB2 0QQ", "CB2 0SZ", "CB2 3EB", "CB3 9DF"]:
            p = cfg.debug_postcode_unit_population(postcode)
            log.info(
                f"Calculated population for postcode unit {postcode}: {p}"
            )

        for ps in ["CB2 0", "CB2 1", "CB2 2", "CB2 3"]:
            p = cfg.debug_postcode_sector_population(ps)
            log.info(f"Calculated population for postcode sector {ps}: {p}")

    # -------------------------------------------------------------------------
    # Identifiers
    # -------------------------------------------------------------------------

    def test_identifier_dob(self) -> None:
        cfg = self.cfg

        for b in BAD_DATE_STRINGS:
            with self.assertRaises(ValueError):
                _ = DateOfBirth(cfg, b)

        full_match_log_lr = None  # type: Optional[float]
        for g in GOOD_DATE_STRINGS:
            d = DateOfBirth(cfg, g)
            self.assertEqual(d.dob_str, g)
            self.assertEqual(str(d), g)
            self.assertTrue(d.fully_matches(d))
            full_match_log_lr = d.comparison(d).posterior_log_odds(0)
            self.assertGreater(full_match_log_lr, 0)

        partial_matches = (
            ("2000-01-01", "2007-01-01"),  # year mismatch only
            ("2000-01-01", "2000-07-01"),  # month mismatch only
            ("2000-01-01", "2000-01-07"),  # day mismatch only
        )
        partial_match_log_lr = None  # type: Optional[float]
        for d1_str, d2_str in partial_matches:
            d1 = DateOfBirth(cfg, d1_str)
            d2 = DateOfBirth(cfg, d2_str)
            self.assertFalse(d1.fully_matches(d2))
            self.assertFalse(d2.fully_matches(d1))
            self.assertTrue(d1.partially_matches(d2))
            self.assertTrue(d2.partially_matches(d1))
            partial_match_log_lr = d1.comparison(d2).posterior_log_odds(0)
            self.assertLess(partial_match_log_lr, full_match_log_lr)

        not_partial_matches = (
            ("2000-01-01", "2007-07-01"),  # only day the same
            ("2000-01-01", "2000-07-07"),  # only year the same
            ("2000-01-01", "2007-01-07"),  # only month the same
        )
        for d1_str, d2_str in not_partial_matches:
            d1 = DateOfBirth(cfg, d1_str)
            d2 = DateOfBirth(cfg, d2_str)
            self.assertFalse(d1.fully_matches(d2))
            self.assertFalse(d2.fully_matches(d1))
            self.assertFalse(d1.partially_matches(d2))
            self.assertFalse(d2.partially_matches(d1))
            mismatch_log_lr = d1.comparison(d2).posterior_log_odds(0)
            self.assertLess(mismatch_log_lr, 0)
            self.assertLess(mismatch_log_lr, partial_match_log_lr)

    def test_identifier_postcode(self) -> None:
        cfg = self.cfg
        configs = [
            cfg,
            # Check extremes of k_postcode:
            mk_test_config(k_postcode=1),
            mk_test_config(k_postcode=1000),
            # Check extremes of p_unknown_or_pseudo_postcode, k_pseudopostcode:
            mk_test_config(
                p_unknown_or_pseudo_postcode=0.00001, k_pseudopostcode=1.2
            ),
            mk_test_config(
                p_unknown_or_pseudo_postcode=0.01, k_pseudopostcode=3
            ),
            # Very high combinations, e.g.
            # p_unknown_or_pseudo_postcode=0.00001, k_pseudopostcode=1.001, may
            # cause an error here. Very high combinations, e.g.
            # p_unknown_or_pseudo_postcode=0.1, k_pseudopostcode=3, may also
            # cause an error.
        ]
        # Any invalid settings are detected by the Postcode identifier class
        # checking that its comparisons are in a sensible order. All
        # identifiers do this, in fact.

        for b in BAD_POSTCODES:
            with self.assertRaises(ValueError):
                _ = Postcode(cfg, b)
        early = Date(2020, 1, 1)
        late = Date(2021, 12, 31)
        for g in GOOD_POSTCODES:  # includes pseudopostcodes
            with self.assertRaises(ValueError):
                _ = Postcode(cfg, g, start_date=late, end_date=early)
            p = Postcode(cfg, g)
            self.assertEqual(p.postcode_unit, standardize_postcode(g))
            self.assertTrue(p.fully_matches(p))

        empty = Postcode(cfg, "")
        self.assertEqual(str(empty), "")

        probe_partial_mismatch = (
            # Each tuple: (1) a postcode; (2) same sector, different unit; (3)
            # different sector.
            ("CB99 9XY", "CB99 9AB", "CB99 7AB"),  # nonsense
            ("CB2 0QQ", "CB2 0SL", "SW1A 2AA"),  # CUH 1, CUH 2, 10 Downing St
            ("ZZ99 3VZ", "ZZ99 3WZ", "ZZ99 1WZ"),  # pseudo: NFA, sea, Orkney
        )
        for probe_str, partial_str, mismatch_str in probe_partial_mismatch:
            for c in configs:
                p1 = Postcode(c, probe_str)
                p2 = Postcode(c, partial_str)
                p3 = Postcode(c, mismatch_str)

                # Everything matches itself.
                self.assertTrue(p1.fully_matches(p1))
                self.assertTrue(p2.fully_matches(p2))
                self.assertTrue(p3.fully_matches(p3))

                # Nothing matches another.
                self.assertFalse(p1.fully_matches(p2))
                self.assertFalse(p1.fully_matches(p3))
                self.assertFalse(p2.fully_matches(p3))

                # The partial match partially matches.
                self.assertTrue(p1.partially_matches(p2))

                # The nonmatch doesn't partially match.
                self.assertFalse(p1.partially_matches(p3))

                full_comp = p1.comparison(p1)
                full_log_lr = full_comp.posterior_log_odds(0)
                partial_comp = p1.comparison(p2)
                partial_log_lr = partial_comp.posterior_log_odds(0)
                nonmatch_comp = p1.comparison(p3)
                nonmatch_log_lr = nonmatch_comp.posterior_log_odds(0)

                self.assertGreater(
                    full_log_lr,
                    0,
                    f"comparing {probe_str!r} to itself, giving {full_comp!r}",
                )
                self.assertLess(
                    partial_log_lr,
                    full_log_lr,
                    f"comparing {probe_str!r} to {partial_str!r} "
                    f"(partial match); \ncfg = {cfg};\n"
                    f"p1 = {p1!r};\n"
                    f"giving {partial_comp!r}, versus the exact comparison "
                    f"{full_comp!r}",
                )
                self.assertLess(
                    nonmatch_log_lr,
                    partial_log_lr,
                    f"comparing {probe_str!r} to {mismatch_str!r} "
                    f"(nonmatch); \ncfg = {cfg};"
                    f"\np1 = {p1!r};\n"
                    f"giving {nonmatch_comp!r}, versus the previous partial "
                    f"comparison {partial_comp!r}",
                )

    def test_identifier_gender(self) -> None:
        cfg = self.cfg
        for b in BAD_GENDERS:
            with self.assertRaises(ValueError):
                _ = Gender(cfg, b)
        for g_str in VALID_GENDERS:
            g = Gender(cfg, g_str)
            log.critical(f"g = {g!r}")
            self.assertEqual(g.gender_str, g_str)
            self.assertEqual(str(g), g_str)
            if not g:
                continue
            self.assertTrue(g.fully_matches(g))
            comp = g.comparison(g)
            if comp:
                self.assertGreater(comp.posterior_log_odds(0), 0)

        empty = Gender(cfg, GENDER_MISSING)
        m = Gender(cfg, GENDER_MALE)
        f = Gender(cfg, GENDER_FEMALE)
        x = Gender(cfg, GENDER_OTHER)

        empty.ensure_has_freq_info_if_id_present()
        m.ensure_has_freq_info_if_id_present()
        f.ensure_has_freq_info_if_id_present()
        x.ensure_has_freq_info_if_id_present()

        self.assertEqual(str(empty), "")

        self.assertTrue(bool(m))
        self.assertTrue(bool(f))
        self.assertTrue(bool(x))
        self.assertFalse(bool(empty))

        self.assertTrue(m.fully_matches(m))
        self.assertTrue(m.comparison_relevant(m))

        self.assertTrue(f.comparison_relevant(f))
        self.assertTrue(f.comparison_relevant(f))

        self.assertFalse(m.fully_matches(f))
        self.assertFalse(m.fully_matches(x))
        self.assertFalse(f.fully_matches(m))
        self.assertFalse(f.fully_matches(x))

        f_comp_f = f.comparison(f)
        self.assertIsNotNone(f_comp_f)
        self.assertGreater(f.comparison(f).posterior_log_odds(0), 0)
        self.assertLess(f.comparison(m).posterior_log_odds(0), 0)

    def test_identifier_surname_fragment(self) -> None:
        cfg = self.cfg
        f1 = SurnameFragment(cfg, name="Smith", gender=GENDER_MALE)
        h1 = f1.hashed()
        self.assertTrue(f1.fully_matches(f1))
        self.assertTrue(f1.partially_matches(f1))
        self.assertFalse(f1.fully_matches(h1))
        self.assertFalse(f1.partially_matches(h1))
        self.assertTrue(h1.fully_matches(h1))
        self.assertTrue(h1.partially_matches(h1))

    def test_identifier_surname(self) -> None:
        # https://en.wikipedia.org/wiki/Double-barrelled_name
        cfg = self.cfg
        g = GENDER_FEMALE
        jones = Surname(cfg, name="Jones", gender=g)
        mozart = Surname(cfg, name="Mozart", gender=g)
        mozart_smith_hy = Surname(cfg, name="Mozart-Smith", gender=g)
        mozart_smith_sp = Surname(cfg, name="Mozart Smith", gender=g)
        smith = Surname(cfg, name="Smith", gender=g)
        smythe = Surname(cfg, name="Smythe", gender=g)
        mozart_hashed = mozart.hashed()
        mozart_smith_hashed = mozart_smith_hy.hashed()
        smith_hashed = smith.hashed()
        smythe_hashed = smythe.hashed()
        matching = [
            (jones, jones),
            (mozart_smith_hy, mozart),
            (mozart_smith_hy, mozart_smith_hy),
            (mozart_smith_hy, mozart_smith_sp),
            (mozart_smith_hy, smith),
            (mozart_smith_sp, mozart),
            (mozart_smith_sp, mozart_smith_hy),
            (mozart_smith_sp, smith),
            (smith, smith),
            (smythe, smythe),
            (mozart_hashed, mozart_hashed),
            (mozart_smith_hashed, mozart_smith_hashed),
            (smith_hashed, smith_hashed),
            (smythe_hashed, smythe_hashed),
        ]
        partially_matching = [
            (mozart_smith_hy, smythe),
            (mozart_smith_sp, smythe),
            (smith, smythe),
            (smith_hashed, smythe_hashed),
            (mozart_smith_hashed, smythe_hashed),
        ]
        nonmatching = [
            (jones, mozart_smith_hy),
            (jones, mozart_smith_sp),
            (smith, jones),
            (smith, mozart),
            (smith, smith_hashed),
            (smythe, smythe_hashed),
        ]
        for a, b in matching:
            self.assertTrue(a.fully_matches(b))
        for a, b in partially_matching:
            self.assertFalse(a.fully_matches(b))
            self.assertTrue(a.partially_matches(b))
        for a, b in nonmatching:
            self.assertFalse(a.fully_matches(b))
            self.assertFalse(a.partially_matches(b))

    # -------------------------------------------------------------------------
    # Lots of identifiers
    # -------------------------------------------------------------------------

[docs]    def test_identifier_transformations(self) -> None:
        """
        Creating hashed and plaintext JSON representation and loading an
        identifier back from them.
        """
        cfg = self.cfg
        identifiable = [
            DateOfBirth(cfg, dob="2000-12-31"),
            Forename(cfg, name="Elizabeth", gender=GENDER_FEMALE),
            Gender(cfg, gender=GENDER_MALE),
            PerfectID(cfg, identifiers={"nhsnum": 1}),
            Postcode(cfg, postcode="CB2 0QQ"),
            Surname(cfg, name="Smith", gender=GENDER_FEMALE),
            SurnameFragment(cfg, name="Smith", gender=GENDER_MALE),
        ]  # type: List[Identifier]
        for i in identifiable:
            self.assertTrue(i.is_plaintext)
            i_class = type(i)  # type: Type[Identifier]

            hd = i.as_dict(encrypt=True, include_frequencies=True)
            h = i_class.from_dict(cfg, hd, hashed=True)
            self.assertFalse(h.is_plaintext)
            h.ensure_has_freq_info_if_id_present()

            pd = i.as_dict(encrypt=False, include_frequencies=True)
            p = i_class.from_dict(cfg, pd, hashed=False)
            self.assertTrue(p.is_plaintext)
            p.ensure_has_freq_info_if_id_present()

    # -------------------------------------------------------------------------
    # Person checks
    # -------------------------------------------------------------------------

    def test_person_creation(self) -> None:
        cfg = self.cfg
        # Test the removal of blank names, etc.
        space = " "
        blank = ""
        p1 = Person(
            cfg, local_id="p1", forenames=["A", blank, space, None, "B"]
        )
        self.assertEqual(len(p1.forenames), 2)
        p2 = Person(
            cfg, local_id="p2", surnames=["A", blank, space, None, "B"]
        )
        self.assertEqual(len(p2.surnames), 2)
        p3 = Person(
            cfg,
            local_id="p3",
            postcodes=[GOOD_POSTCODES[0], blank, space, GOOD_POSTCODES[1]],
        )
        self.assertEqual(len(p3.postcodes), 2)

    def test_person_equality(self) -> None:
        cfg = self.cfg
        p1 = Person(cfg, local_id="hello")
        p2 = Person(cfg, local_id="world")
        p3 = Person(cfg, local_id="world")
        self.assertNotEqual(p1, p2)
        self.assertEqual(p2, p3)

        people = People(cfg)
        people.add_person(p1)
        people.add_person(p2)
        self.assertRaises(DuplicateIDError, people.add_person, p3)

    def test_person_copy(self) -> None:
        persons = [self.alice_smith]
        for orig in persons:
            cp = orig.copy()
            for attr in Person.ALL_PERSON_KEYS:
                orig_value = getattr(orig, attr)
                copy_value = getattr(cp, attr)
                self.assertEqual(
                    orig_value,
                    copy_value,
                    f"mismatch for {attr}:\n"
                    f"{orig_value!r}\n!=\n{copy_value!r}",
                )

    # -------------------------------------------------------------------------
    # Person comparisons
    # -------------------------------------------------------------------------

    def test_fuzzy_linkage_matches(self) -> None:
        test_values = [
            # Very easy match
            TestCondition(
                cfg=self.cfg,
                person_a=self.alice_bcd_rarename_2000_add,
                person_b=self.alice_bcd_rarename_2000_add,
                should_match=True,
            ),
            # Easy match
            TestCondition(
                cfg=self.cfg,
                person_a=self.alice_bc_rarename_2000_add,
                person_b=self.alice_b_rarename_2000_add,
                should_match=True,
            ),
            # Easy non-match
            TestCondition(
                cfg=self.cfg,
                person_a=self.alice_jones_2000_add,
                person_b=self.bob_smith_1950_psych,
                should_match=False,
            ),
            # Very ambiguous (1)
            TestCondition(
                cfg=self.cfg,
                person_a=self.alice_smith,
                person_b=self.alice_smith_1930,
                should_match=False,
            ),
            # Very ambiguous (2)
            TestCondition(
                cfg=self.cfg,
                person_a=self.alice_smith,
                person_b=self.alice_smith_2000,
                should_match=False,
            ),
            TestCondition(
                cfg=self.cfg,
                person_a=self.alice_bcd_rarename_2000_add,
                person_b=self.alec_bcd_rarename_2000_add,
                should_match=True,
            ),
            TestCondition(
                cfg=self.cfg,
                person_a=self.alice_bcd_rarename_2000_add,
                person_b=self.bob_bcd_rarename_2000_add,
                should_match=True,  # used to be False
            ),
        ]  # type: List[TestCondition]
        log.info("Testing comparisons...")
        for i, test in enumerate(test_values, start=1):
            log.info(f"Comparison {i}...")
            test.check_comparison_as_expected()

    def test_fuzzy_more_complex(self) -> None:
        log.info("Testing proband-versus-sample...")
        for i in range(len(self.all_people)):
            proband_plaintext = self.all_people[i]
            log.info(f"Plaintext search with proband: {proband_plaintext}")
            plaintext_winner = self.people_plaintext.get_unique_match(
                proband_plaintext
            )
            log.info(f"... WINNER: {plaintext_winner}")
            log.info(f"Hashed search with proband: {proband_plaintext}\n")
            proband_hashed = self.all_people_hashed[i]  # same order
            hashed_winner = self.people_hashed.get_unique_match(proband_hashed)
            log.info(f"... WINNER: {hashed_winner}")

[docs]    def test_exact_match(self) -> None:
        """
        Test the exact-match system.
        """
        id_type = "nhsnum"
        id_value = 3
        # Two people with no identifiers in common:
        p1 = Person(
            cfg=self.cfg, local_id="p1", perfect_id={id_type: id_value}
        )
        p2 = Person(
            cfg=self.cfg, local_id="p2", perfect_id={id_type: id_value}
        )
        # Perfect ID comparison is a function of a People object, not Person.
        people = People(cfg=self.cfg, people=[p1])

        # Match to self:
        result_p1 = people.get_unique_match_detailed(p1)
        self.assertEqual(result_p1.winner, p1)
        self.assertEqual(result_p1.best_log_odds, INFINITY)

        # Match to another with the same perfect ID:
        result_p2 = people.get_unique_match_detailed(p2)
        self.assertEqual(result_p2.winner, p1)
        self.assertEqual(result_p2.best_log_odds, INFINITY)

        # No two people in a People object with the same ID:
        self.assertRaises(DuplicateIDError, people.add_person, p2)

    # -------------------------------------------------------------------------
    # People checks
    # -------------------------------------------------------------------------
    # See also test_person_equality() above.

[docs]    def test_shortlist(self) -> None:
        """
        Our shortlisting process typically permits people with completely
        matching or partially matching DOBs, but not those with mismatched DOBs
        (for efficiency). Test that.
        """
        # Some test people:
        cfg1 = self.cfg
        proband = Person(cfg1, local_id="p1", dob="1950-01-01")
        full_dob_match = [
            # Full DOB match:
            Person(cfg1, local_id="p2", dob="1950-01-01"),
        ]
        partial_dob_match = [
            # Two components of DOB match:
            Person(cfg1, local_id="p3", dob="2000-01-01"),
            Person(cfg1, local_id="p4", dob="1950-12-01"),
            Person(cfg1, local_id="p5", dob="1950-01-12"),
        ]
        dob_mismatch = [
            # One component of DOB matches:
            Person(cfg1, local_id="p6", dob="1950-12-12"),
            Person(cfg1, local_id="p7", dob="2000-01-12"),
            Person(cfg1, local_id="p8", dob="2000-12-01"),
            # No component of DOB matches:
            Person(cfg1, local_id="p9", dob="2000-12-12"),
        ]
        all_people = (
            [proband] + full_dob_match + partial_dob_match + dob_mismatch
        )

        # A setup where we don't shortlist mismatched DOBs:
        self.assertEqual(cfg1.complete_dob_mismatch_allowed, False)
        self.assertEqual(cfg1.partial_dob_mismatch_allowed, True)
        people1 = People(cfg1, people=all_people)
        shortlist1 = list(people1.gen_shortlist(proband))
        self.assertTrue(proband in shortlist1)
        for full_p in full_dob_match:
            self.assertTrue(full_p in shortlist1)
        for partial_p in partial_dob_match:
            self.assertTrue(partial_p in shortlist1)
        for mismatch_p in dob_mismatch:
            self.assertFalse(mismatch_p in shortlist1)

        # And one where we do:
        cfg2 = mk_test_config(p_en_dob=FuzzyDefaults.P_EN_DOB_TRUE)
        self.assertEqual(cfg2.complete_dob_mismatch_allowed, True)
        self.assertEqual(cfg2.partial_dob_mismatch_allowed, True)
        people2 = People(cfg2, people=all_people)
        shortlist2 = list(people2.gen_shortlist(proband))
        for p in all_people:
            self.assertTrue(p in shortlist2)

        # And one where only exact DOB matches are allows:
        cfg3 = mk_test_config(p_ep_dob=0, p_en_dob=0)
        self.assertEqual(cfg3.complete_dob_mismatch_allowed, False)
        self.assertEqual(cfg3.partial_dob_mismatch_allowed, False)
        people3 = People(cfg3, people=all_people)
        shortlist3 = list(people3.gen_shortlist(proband))
        self.assertTrue(proband in shortlist3)
        for full_p in full_dob_match:
            self.assertTrue(full_p in shortlist3)
        for partial_p in partial_dob_match:
            self.assertFalse(partial_p in shortlist3)
        for mismatch_p in dob_mismatch:
            self.assertFalse(mismatch_p in shortlist3)


# -------------------------------------------------------------------------
# Multiple comparison correction checks
# -------------------------------------------------------------------------


[docs]class MultipleComparisonTestBase(unittest.TestCase):
    P_U = 0.1  # arbitrary
    P_O = 1 - P_U
    DELTA = 1e-10  # floating-point tolerance


[docs]class UnorderedMultipleComparisonTests(MultipleComparisonTestBase):
    @staticmethod
    def compare(
        proband_identifiers: List[Identifier],
        candidate_identifiers: List[Identifier],
    ) -> List[Comparison]:
        return list(
            gen_best_comparisons(
                proband_identifiers=proband_identifiers,
                candidate_identifiers=candidate_identifiers,
                ordered=False,
            )
        )

    def test_same_single_id_returns_one_match_and_no_correction(
        self,
    ) -> None:
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # UNORDERED, one/one identifier
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        a = DummyLetterIdentifier("A")

        result = self.compare([a], [a])
        self.assertEqual(len(result), 1)  # ... one match, no correction

        comparison = result[0]
        self.assertIsInstance(comparison, DirectComparison)
        self.assertEqual(comparison.d_description, "dummy_match:A")

    def test_same_two_ids_returns_two_matches_and_a_correction(
        self,
    ) -> None:
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Unordered, two/two identifiers
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        a = DummyLetterIdentifier("A")
        b = DummyLetterIdentifier("B")

        result = self.compare([a, b], [a, b])
        self.assertEqual(len(result), 3)  # ... two matches and a correction

        comparison1 = result[0]
        self.assertIsInstance(comparison1, DirectComparison)
        self.assertEqual(comparison1.d_description, "dummy_match:A")
        comparison2 = result[1]
        self.assertIsInstance(comparison2, DirectComparison)
        self.assertEqual(comparison2.d_description, "dummy_match:B")
        correction = result[-1]
        self.assertIsInstance(correction, AdjustLogOddsComparison)
        # Correction should be for 2 hits from 2 comparisons, and a Bonferroni
        # correction:
        self.assertAlmostEqual(
            correction.log_likelihood_ratio, -ln(2), delta=self.DELTA
        )

    def test_same_three_ids_returns_three_matches_and_a_correction(
        self,
    ) -> None:
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Unordered, three/three identifiers
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        a = DummyLetterIdentifier("A")
        b = DummyLetterIdentifier("B")
        c = DummyLetterIdentifier("C")

        result = self.compare([a, b, c], [a, b, c])
        self.assertEqual(len(result), 4)  # ... three matches and a correction

        comparison1 = result[0]
        self.assertIsInstance(comparison1, DirectComparison)
        self.assertEqual(comparison1.d_description, "dummy_match:A")
        comparison2 = result[1]
        self.assertIsInstance(comparison2, DirectComparison)
        self.assertEqual(comparison2.d_description, "dummy_match:B")
        comparison3 = result[2]
        self.assertIsInstance(comparison3, DirectComparison)
        self.assertEqual(comparison3.d_description, "dummy_match:C")

        correction = result[-1]
        self.assertIsInstance(correction, AdjustLogOddsComparison)
        # Correction should be for 3 hits from 6 comparisons:
        self.assertAlmostEqual(
            correction.log_likelihood_ratio, -ln(6), delta=self.DELTA
        )

    def test_one_out_of_three_ids_returns_three_matches_and_a_correction(
        self,
    ) -> None:
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Unordered, one/three identifiers
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        a = DummyLetterIdentifier("A")
        b = DummyLetterIdentifier("B")
        c = DummyLetterIdentifier("C")

        result = self.compare([a], [a, b, c])
        self.assertEqual(len(result), 2)  # ... one match and a correction

        comparison = result[0]
        self.assertIsInstance(comparison, DirectComparison)
        self.assertEqual(comparison.d_description, "dummy_match:A")

        correction = result[-1]
        self.assertIsInstance(correction, AdjustLogOddsComparison)
        # Correction should be for 1 hit from 3 comparisons:
        self.assertAlmostEqual(
            correction.log_likelihood_ratio, -ln(3), delta=self.DELTA
        )

[docs]    def test_with_incomparable_identifiers(self) -> None:
        """
        Use identifiers that aren't allowed to be compared, e.g. names with
        non-overlapping timestamps. This will give a comparison that is
        ``None``, and make the code coverage checks happy.

        .. code-block:: bash

            pip install pytest-cov
            pytest --cov --cov-report html
        """
        a_early = DummyLetterTemporalIdentifier(
            value="A", start_date="1900-01-01", end_date="1900-12-31"
        )
        a_late = DummyLetterTemporalIdentifier(
            value="A", start_date="2000-01-01", end_date="2000-12-31"
        )
        result = self.compare([a_early], [a_late])
        self.assertEqual(len(result), 0)  # no comparisons


[docs]class OrderedMultipleComparisonTests(MultipleComparisonTestBase):
    def compare(
        self,
        proband_identifiers: List[Identifier],
        candidate_identifiers: List[Identifier],
    ) -> List[Comparison]:
        return list(
            gen_best_comparisons(
                proband_identifiers=proband_identifiers,
                candidate_identifiers=candidate_identifiers,
                ordered=True,
                p_u=self.P_U,
            )
        )

    def test_same_single_identifier_returns_one_match_and_no_correction(
        self,
    ) -> None:
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # ORDERED, one/one identifier
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        a = DummyLetterIdentifier("A")

        result = self.compare([a], [a])
        self.assertEqual(len(result), 1)  # ... one match, no correction

        comparison = result[0]
        self.assertIsInstance(comparison, DirectComparison)
        self.assertEqual(comparison.d_description, "dummy_match:A")

    def test_same_two_ids_same_order_returns_two_matches_and_a_correction(
        self,
    ) -> None:
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Ordered, two/two identifiers, correct order
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        a = DummyLetterIdentifier("A")
        b = DummyLetterIdentifier("B")

        result = self.compare([a, b], [a, b])
        self.assertEqual(len(result), 3)  # ... two matches and a correction

        comparison1 = result[0]
        self.assertIsInstance(comparison1, DirectComparison)
        self.assertEqual(comparison1.d_description, "dummy_match:A")
        comparison2 = result[1]
        self.assertIsInstance(comparison2, DirectComparison)
        self.assertEqual(comparison2.d_description, "dummy_match:B")

        correction = result[-1]
        self.assertIsInstance(correction, AdjustLogOddsComparison)
        # - P(D|H) correction: +ln(p_o).
        # - P(D|¬H) correction: nothing, i.e. -ln(1) = 0.
        self.assertAlmostEqual(
            correction.log_likelihood_ratio, ln(self.P_O), delta=self.DELTA
        )

    def test_same_two_ids_diff_order_returns_two_matches_and_a_correction(
        self,
    ) -> None:
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Ordered, two/two identifiers, wrong order
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        a = DummyLetterIdentifier("A")
        b = DummyLetterIdentifier("B")

        result = self.compare([a, b], [b, a])
        self.assertEqual(len(result), 3)  # ... two matches and a correction

        comparison1 = result[0]
        self.assertIsInstance(comparison1, DirectComparison)
        self.assertEqual(comparison1.d_description, "dummy_match:A")
        comparison2 = result[1]
        self.assertIsInstance(comparison2, DirectComparison)
        self.assertEqual(comparison2.d_description, "dummy_match:B")

        correction = result[-1]
        self.assertIsInstance(correction, AdjustLogOddsComparison)
        # - P(D|H) correction: +ln(p_u).
        # - P(D|¬H) correction: Bonferroni for 2 options but minus one for the
        #   ordered option, so nothing.
        self.assertAlmostEqual(
            correction.log_likelihood_ratio, ln(self.P_U), delta=self.DELTA
        )

    def test_same_three_ids_same_order_returns_three_matches_and_a_correction(
        self,
    ) -> None:
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Ordered, three/three identifiers, correct order
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        a = DummyLetterIdentifier("A")
        b = DummyLetterIdentifier("B")
        c = DummyLetterIdentifier("C")

        result = self.compare([a, b, c], [a, b, c])
        self.assertEqual(len(result), 4)  # ... three matches and a correction

        comparison1 = result[0]
        self.assertIsInstance(comparison1, DirectComparison)
        self.assertEqual(comparison1.d_description, "dummy_match:A")
        comparison2 = result[1]
        self.assertIsInstance(comparison2, DirectComparison)
        self.assertEqual(comparison2.d_description, "dummy_match:B")
        comparison3 = result[2]
        self.assertIsInstance(comparison3, DirectComparison)
        self.assertEqual(comparison3.d_description, "dummy_match:C")

        correction = result[-1]
        self.assertIsInstance(correction, AdjustLogOddsComparison)
        # - P(D|H) correction: +ln(p_o).
        # - P(D|¬H) correction: nothing (correct order).
        self.assertAlmostEqual(
            correction.log_likelihood_ratio, ln(self.P_O), delta=self.DELTA
        )

    def test_same_three_ids_diff_order_returns_three_matches_and_a_correction(
        self,
    ) -> None:
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Ordered, three/three identifiers, wrong order
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        a = DummyLetterIdentifier("A")
        b = DummyLetterIdentifier("B")
        c = DummyLetterIdentifier("C")

        result = self.compare([a, b, c], [b, c, a])
        self.assertEqual(len(result), 4)  # ... three matches and a correction

        comparison1 = result[0]
        self.assertIsInstance(comparison1, DirectComparison)
        self.assertEqual(comparison1.d_description, "dummy_match:B")
        comparison2 = result[1]
        self.assertIsInstance(comparison2, DirectComparison)
        self.assertEqual(comparison2.d_description, "dummy_match:C")
        comparison3 = result[2]
        self.assertIsInstance(comparison3, DirectComparison)
        self.assertEqual(comparison3.d_description, "dummy_match:A")

        correction = result[-1]
        self.assertIsInstance(correction, AdjustLogOddsComparison)
        # - P(D|H) correction: +ln(p_u).
        # - P(D|¬H) correction: Bonferroni for 6 options minus the one for the
        #   correct order.
        self.assertAlmostEqual(
            correction.log_likelihood_ratio,
            ln(self.P_U) - ln(5),
            delta=self.DELTA,
        )

    def test_two_of_three_matching_ids_returns_three_matches_and_a_correction(
        self,
    ) -> None:
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Ordered, three/three identifiers, two match, wrong order
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        a = DummyLetterIdentifier("A")
        b = DummyLetterIdentifier("B")
        c = DummyLetterIdentifier("C")
        d = DummyLetterIdentifier("D")

        """
        Comparing proband [a, b, c] to candidate [b, c, d]:

        p = proband index
        c = candidate index
        d = distance
        LLR = log likelihood ratio

                           p c d  LLR
        a - b  mismatch A  0 0 0  -4.5
        a - c  mismatch A  0 1 1  -4.5
        a - d  mismatch A  0 2 4  -4.5
        b - b  match B     1 0 1  3.2
        b - c  mismatch B  1 1 0  -4.5
        b - d  mismatch B  1 2 1  -4.5
        c - b  mismatch C  2 0 4  -4.5
        c - c  match C     2 1 1  3.2
        c - d  mismatch C  2 2 0  -4.5

        then we sort them by -LLR and distance:

                                        returned?
        b - b  match B     1 0 1  3.2   Yes
        c - c  match C     2 1 1  3.2   Yes
        a - b  mismatch A  0 0 0  -4.5  No (c=0 used)
        b - c  mismatch B  1 1 0  -4.5  No (p=1 or c=1 used)
        c - d  mismatch C  2 2 0  -4.5  No (p=2 used)
        a - c  mismatch A  0 1 1  -4.5  No (c=1 used)
        b - d  mismatch B  1 2 1  -4.5  No (p=1 used)
        a - d  mismatch A  0 2 4  -4.5  Yes
        c - b  mismatch C  2 0 4  -4.5  No (p=2 or c=0 used)

        """

        result = self.compare([a, b, c], [b, c, d])
        # ... three matches (but one will be bad) and a correction
        self.assertEqual(len(result), 4)

        comparison1 = result[0]
        self.assertIsInstance(comparison1, DirectComparison)
        self.assertEqual(comparison1.d_description, "dummy_match:B")
        comparison2 = result[1]
        self.assertIsInstance(comparison2, DirectComparison)
        self.assertEqual(comparison2.d_description, "dummy_match:C")
        comparison3 = result[2]
        self.assertIsInstance(comparison3, DirectComparison)
        self.assertEqual(comparison3.d_description, "dummy_mismatch:A")

        correction = result[-1]
        self.assertIsInstance(correction, AdjustLogOddsComparison)
        # - P(D|H) correction: +ln(p_u).
        # - P(D|¬H) correction: Bonferroni for 6 options minus the one for the
        #   correct order.
        self.assertAlmostEqual(
            correction.log_likelihood_ratio,
            ln(self.P_U) - ln(5),
            delta=self.DELTA,
        )

[docs]    def test_order_correct_with_duplicate_names_1(self) -> None:
        """
        Compare "A A" to "A A" in ordered fashion.

        Think of this as proband A_P1, A_P2 and candidate A_C1, A_C2.

        Should give a "correctly ordered" match, A_P1:A_C1 and A_C2:A_C2, with
        correction for P_O.

        Should not treat it as an incorrectly ordered match, A_P1:A_C2 and
        A_P2:A_C1, and apply a different correction for P_U etc.

        This might work without the "distance" sort in ComparisonInfo (it does,
        in fact), but that is a safety. See below for a test that does depend
        on that distance metric.
        """
        a = DummyLetterIdentifier("A")

        result = self.compare([a, a], [a, a])
        self.assertEqual(len(result), 3)
        comparison1 = result[0]
        self.assertIsInstance(comparison1, DirectComparison)
        self.assertEqual(comparison1.d_description, "dummy_match:A")
        comparison2 = result[1]
        self.assertIsInstance(comparison2, DirectComparison)
        self.assertEqual(comparison2.d_description, "dummy_match:A")
        correction = result[2]
        self.assertIsInstance(correction, AdjustLogOddsComparison)
        self.assertAlmostEqual(
            correction.log_likelihood_ratio,
            ln(self.P_O),
            delta=self.DELTA,
        )

[docs]    def test_order_correct_with_duplicate_names_2(self) -> None:
        """
        Compare "A B" to "B B" in ordered fashion.

        We want this to give A_P1:B_P1 (mismatch) and B_P2:B_C2 (ordered
        match).

        It should not give A_P1:B_P2 (mismatch) and B_P2:B_C1 (unordered
        match).

        This does not work without the "distance" part of the sort in
        ComparisonInfo.
        """
        a = DummyLetterIdentifier("A")
        b = DummyLetterIdentifier("B")

        result = self.compare([a, b], [b, b])
        self.assertEqual(len(result), 3)
        # Matches come first (better LLR):
        comparison1 = result[0]
        self.assertIsInstance(comparison1, DirectComparison)
        self.assertEqual(comparison1.d_description, "dummy_match:B")
        # Then mismatches:
        comparison2 = result[1]
        self.assertIsInstance(comparison2, DirectComparison)
        self.assertEqual(comparison2.d_description, "dummy_mismatch:A")
        # Then corrections:
        correction = result[2]
        self.assertIsInstance(correction, AdjustLogOddsComparison)
        self.assertAlmostEqual(
            correction.log_likelihood_ratio,
            ln(self.P_O),
            delta=self.DELTA,
        )