"""
crate_anon/linkage/tests/fuzzy_id_match_tests.py
===============================================================================
Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
Created by Rudolf Cardinal (rnc1001@cam.ac.uk).
This file is part of CRATE.
CRATE is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
CRATE is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with CRATE. If not, see <https://www.gnu.org/licenses/>.
===============================================================================
Unit tests.
"""
# =============================================================================
# Imports
# =============================================================================
import logging
import unittest
from typing import List, Optional, Tuple, Type
from cardinal_pythonlib.probability import probability_from_log_odds
from pendulum import Date
from crate_anon.linkage.comparison import (
AdjustLogOddsComparison,
Comparison,
DirectComparison,
)
from crate_anon.linkage.constants import (
FuzzyDefaults,
GENDER_FEMALE,
GENDER_MALE,
GENDER_MISSING,
GENDER_OTHER,
INFINITY,
VALID_GENDERS,
)
from crate_anon.linkage.frequencies import (
BasicNameFreqInfo,
NameFrequencyInfo,
PostcodeFrequencyInfo,
)
from crate_anon.linkage.identifiers import (
DateOfBirth,
DummyLetterIdentifier,
DummyLetterTemporalIdentifier,
Forename,
gen_best_comparisons,
Gender,
Identifier,
PerfectID,
Postcode,
Surname,
SurnameFragment,
TemporalIDHolder,
)
from crate_anon.linkage.helpers import (
get_postcode_sector,
is_valid_isoformat_date,
ln,
POSTCODE_REGEX,
remove_redundant_whitespace,
safe_upper,
simplify_punctuation_whitespace,
standardize_name,
standardize_postcode,
surname_alternative_fragments,
)
from crate_anon.linkage.matchconfig import MatchConfig
from crate_anon.linkage.people import DuplicateIDError, People
from crate_anon.linkage.person import Person
log = logging.getLogger(__name__)
# =============================================================================
# Constants
# =============================================================================
# Strings that the date tests in this file expect to be rejected: they must
# fail is_valid_isoformat_date() and make DateOfBirth raise ValueError.
BAD_DATE_STRINGS = ["1950-31-12", "1950", "blah", "2000-02-30"]
# Strings that the date tests in this file expect to be accepted.
GOOD_DATE_STRINGS = ["1950-12-31", "1890-01-01", "2000-01-01"]
# Postcodes that must not match POSTCODE_REGEX, either as written or after
# standardize_postcode(), and that must make Postcode raise ValueError.
BAD_POSTCODES = [
    "99XX99",
    "CB99 9XXY",
    "CB99",
    "CB2",
    "NW19TTTEMP",
    "NW19TT TEMP",
]
# Postcodes that must match POSTCODE_REGEX after standardize_postcode().
GOOD_POSTCODES = [
    "CB99 9XY",
    "CB2 0QQ",
    "ZZ99 3VZ",
    "Z Z 9 9 3 V Z",
    " zz993vz ",
]  # good once standardized, anyway
# Gender strings that must make Gender raise ValueError.
BAD_GENDERS = ["Y", "male", "female", "?"]
# =============================================================================
# Rapid creation of a dummy config (without loading actual name/postcode info)
# =============================================================================
def mk_test_config(**kwargs) -> MatchConfig:
    """
    Build a :class:`MatchConfig` backed by small in-memory frequency tables.

    The name frequency objects are created with empty CSV/cache filenames and
    then populated directly with a handful of predefined forenames and
    surnames, so no real name/postcode data needs to be loaded.

    Args:
        **kwargs: passed through unchanged to :class:`MatchConfig`.

    Returns:
        the dummy config
    """

    def build(row: Tuple) -> BasicNameFreqInfo:
        # Expand one compact row into a non-synthetic frequency record.
        (
            name,
            gender,
            p_name,
            metaphone,
            p_metaphone,
            p_metaphone_not_name,
            f2c,
            p_f2c,
            p_f2c_not_name_metaphone,
        ) = row
        return BasicNameFreqInfo(
            name=name,
            gender=gender,
            p_name=p_name,
            metaphone=metaphone,
            p_metaphone=p_metaphone,
            p_metaphone_not_name=p_metaphone_not_name,
            f2c=f2c,
            p_f2c=p_f2c,
            p_f2c_not_name_metaphone=p_f2c_not_name_metaphone,
            synthetic=False,
        )

    def register(
        freq_info: NameFrequencyInfo, entries: List[BasicNameFreqInfo]
    ) -> None:
        # Index each record under its (name, gender) key.
        for entry in entries:
            freq_info.name_gender_idx[entry.name, entry.gender] = entry

    # Columns: name, gender, p_name, metaphone, p_metaphone,
    # p_metaphone_not_name, f2c, p_f2c, p_f2c_not_name_metaphone.
    forename_rows = [
        (
            "ALICE",
            GENDER_FEMALE,
            0.0032597245570899847,
            "ALK",
            0.005664202032042135,
            0.00240447747495215,
            "AL",
            0.027635117202534115,
            0.022541989771499515,
        ),
        (
            "BEATRICE",
            GENDER_FEMALE,
            0.0011134697472956023,
            "PTRK",
            0.010795171997297154,
            0.009681702250001551,
            "BE",
            0.020540629656206778,
            0.01938862260342886,
        ),
        (
            "BETTY",
            GENDER_FEMALE,
            0.005856056682186572,
            "PT",
            0.007567968531021441,
            0.0017119118488348687,
            "BE",
            0.020540629656206778,
            0.014031211254451567,
        ),
        (
            "BOB",
            GENDER_MALE,
            0.0005341749908504777,
            "PP",
            0.002569054271327976,
            0.0020348792804774983,
            "BO",
            0.0035610312205931094,
            0.0010918026974037107,
        ),
        (
            "CAROLINE",
            GENDER_FEMALE,
            0.001289812197195456,
            "KRLN",
            0.005979308865585442,
            0.004689496668389986,
            "CA",
            0.033910941860871194,
            0.02860674130257904,
        ),
        (
            "CELIA",
            GENDER_FEMALE,
            0.0003141885536034312,
            "KL",
            0.016359410337593906,
            0.016045221783990475,
            "CE",
            0.0030682294813082723,
            0.0026663592268114434,
        ),
        (
            "DELILAH",
            GENDER_FEMALE,
            0.00019936172952521078,
            "TLL",
            0.000491534931894549,
            0.00029217320236933826,
            "DE",
            0.02472305974107954,
            0.024435022377723725,
        ),
        (
            "DOROTHY",
            GENDER_FEMALE,
            0.006484867451993301,
            "TR0",
            0.007164437365410392,
            0.0006795699134170908,
            "DO",
            0.020044376270378746,
            0.01298493890496824,
        ),
        (
            "ELIZABETH",
            GENDER_FEMALE,
            0.009497275400440382,
            "ALSP",
            0.010079561736620864,
            0.0005822863361804823,
            "EL",
            0.02543961854560152,
            0.015404362973960957,
        ),
    ]
    forenames = [build(row) for row in forename_rows]
    forename_table = NameFrequencyInfo(
        csv_filename="",
        cache_filename="",
        by_gender=True,
        min_frequency=FuzzyDefaults.FORENAME_MIN_FREQ,
    )
    register(forename_table, forenames)

    # Surnames are stored with a blank gender.
    surname_rows = [
        (
            "JONES",
            "",
            0.00621,
            "JNS",
            0.0068899999999999986,
            0.0006799999999999983,
            "JO",
            0.019480000000000268,
            0.012984999999999938,
        ),
        (
            "SMITH",
            "",
            0.01006,
            "SM0",
            0.010129999999999998,
            6.999999999999888e-05,
            "SM",
            0.012514999999999967,
            0.0023849999999999896,
        ),
    ]
    surnames = [build(row) for row in surname_rows]
    surname_table = NameFrequencyInfo(
        csv_filename="",
        cache_filename="",
        by_gender=False,
        min_frequency=FuzzyDefaults.SURNAME_MIN_FREQ,
    )
    register(surname_table, surnames)

    postcode_table = PostcodeFrequencyInfo(csv_filename="", cache_filename="")

    config = MatchConfig(
        forename_freq_info=forename_table,
        surname_freq_info=surname_table,
        postcode_freq_info=postcode_table,
        **kwargs,
    )
    return config
# =============================================================================
# Helper class
# =============================================================================
class TestCondition:
    """
    Two representations of a person and whether they should match.

    Both plaintext representations are hashed on construction, so that the
    same comparison can be checked in plaintext and in hashed form.
    """

    def __init__(
        self,
        cfg: MatchConfig,
        person_a: Person,
        person_b: Person,
        should_match: bool,
        debug: bool = True,
    ) -> None:
        """
        Args:
            cfg: the main :class:`MatchConfig` object
            person_a: one representation of a person
            person_b: another representation of a person
            should_match: should they be treated as the same person?
            debug: be verbose?
        """
        self.cfg = cfg
        self.person_a = person_a
        self.person_b = person_b
        self.should_match = should_match

        # The inputs must be plaintext, valid as probands, and made up
        # entirely of plaintext identifiers.
        for id_person in (self.person_a, self.person_b):
            assert id_person.is_plaintext()
            id_person.ensure_valid_as_proband()
            for identifier in id_person.debug_gen_identifiers():
                assert identifier.is_plaintext

        log.info("- Making hashed versions for later")
        self.hashed_a = self.person_a.hashed()
        self.hashed_b = self.person_b.hashed()

        # The hashed copies must be hashed throughout and still valid.
        for h_person in (self.hashed_a, self.hashed_b):
            assert h_person.is_hashed()
            h_person.ensure_valid_as_proband()
            for identifier in h_person.debug_gen_identifiers():
                assert not identifier.is_plaintext

        self.debug = debug

    def log_odds_same_plaintext(self) -> float:
        """
        Compares the plaintext person objects.

        Returns:
            float: the log odds that they are the same person
        """
        return self.person_a.log_odds_same(self.person_b)

    def log_odds_same_hashed(self) -> float:
        """
        Compares the hashed person objects.

        Returns:
            float: the log odds that they are the same person
        """
        return self.hashed_a.log_odds_same(self.hashed_b)

    def matches_plaintext(self) -> Tuple[bool, float]:
        """
        Do the plaintext versions match, by threshold?

        Returns:
            tuple: (matches, log_odds)
        """
        log_odds = self.log_odds_same_plaintext()
        return self.cfg.exceeds_primary_threshold(log_odds), log_odds

    def matches_hashed(self) -> Tuple[bool, float]:
        """
        Do the hashed versions match, by threshold?

        Returns:
            tuple: (matches, log_odds)
        """
        log_odds = self.log_odds_same_hashed()
        return self.cfg.exceeds_primary_threshold(log_odds), log_odds

    def check_comparison_as_expected(self) -> None:
        """
        Asserts that both the plaintext and hashed versions match, or don't
        match, according to ``self.should_match``, and that the plaintext and
        hashed comparisons give identical log odds.

        Raises:
            AssertionError: if any of those checks fails
        """
        log.info(
            f"Comparing:\n" f"- {self.person_a!r}\n" f"- {self.person_b!r}"
        )

        log.info("(1) Comparing plaintext")
        matches_raw, log_odds_plaintext = self.matches_plaintext()
        p_plaintext = probability_from_log_odds(log_odds_plaintext)
        p_plain_str = f"P(match | D) = {p_plaintext}"
        if matches_raw == self.should_match:
            if matches_raw:
                log.info(f"... should and does match: {p_plain_str}")
            else:
                log.info(f"... should not and does not match: {p_plain_str}")
        else:
            log_odds = log_odds_plaintext
            report = self.person_a.debug_comparison_report(
                self.person_b, verbose=False
            )
            raise AssertionError(
                f"Match failure: "
                f"matches_raw = {matches_raw}, "
                f"should_match = {self.should_match}, "
                f"log_odds = {log_odds}, "
                f"min_log_odds_for_match = {self.cfg.min_log_odds_for_match}, "
                f"P(match) = {probability_from_log_odds(log_odds)}, "
                f"person_a = {self.person_a}, "
                f"person_b = {self.person_b}, "
                f"report = {report}"
            )

        log.info(
            f"(2) Comparing hashed:\n"
            f"- {self.hashed_a}\n"
            f"- {self.hashed_b}"
        )
        matches_hashed, log_odds_hashed = self.matches_hashed()
        p_hashed = probability_from_log_odds(log_odds_hashed)
        p_hashed_str = f"P(match | D) = {p_hashed}"
        if matches_hashed == self.should_match:
            if matches_hashed:
                log.info(f"... should and does match: {p_hashed_str}")
            else:
                log.info(f"... should not and does not match: {p_hashed_str}")
        else:
            log_odds = log_odds_hashed
            report = self.hashed_a.debug_comparison_report(
                self.hashed_b, verbose=False
            )
            raise AssertionError(
                f"Match failure: "
                f"matches_hashed = {matches_hashed}, "
                f"should_match = {self.should_match}, "
                f"log_odds = {log_odds}, "
                f"threshold = {self.cfg.min_log_odds_for_match}, "
                f"min_log_odds_for_match = {self.cfg.min_log_odds_for_match}, "
                f"P(match) = {probability_from_log_odds(log_odds)}, "
                f"person_a = {self.person_a}, "
                f"person_b = {self.person_b}, "
                f"hashed_a = {self.hashed_a}, "
                f"hashed_b = {self.hashed_b}, "
                f"report = {report}"
            )

        log.info(
            "(3) Results of plaintext match should equal result of hashed "
            "match"
        )
        if log_odds_hashed != log_odds_plaintext:
            # Every interpolating fragment needs its own "f" prefix; without
            # it the placeholder text is reported instead of the value.
            raise AssertionError(
                "Plaintext/hashed comparison discrepancy: "
                f"person_a = {self.person_a}, "
                f"person_b = {self.person_b}, "
                f"log_odds_plaintext = {log_odds_plaintext}, "
                f"log_odds_hashed = {log_odds_hashed}"
            )
# =============================================================================
# Unit tests
# =============================================================================
class DummyTemporalIdentifierTests(unittest.TestCase):
    """
    Unit tests for date-range overlap checking via :class:`TemporalIDHolder`.
    """

    def test_overlap(self) -> None:
        """
        Check ``overlaps()`` for pairs of date ranges that should and should
        not overlap. Each range is a (start, end) pair; some bounds are None.
        """
        d1 = Date(2000, 1, 1)
        d2 = Date(2000, 1, 2)
        d3 = Date(2000, 1, 3)
        d4 = Date(2000, 1, 4)
        p = "dummypostcode"

        # ---------------------------------------------------------------------
        # Overlaps
        # ---------------------------------------------------------------------
        overlapping = (
            ((d1, d2), (d2, d3)),
            ((d2, d3), (d1, d2)),
            ((d1, d4), (d2, d3)),
            ((d1, None), (None, d4)),
            ((None, None), (None, None)),
        )
        for (start_a, end_a), (start_b, end_b) in overlapping:
            first = TemporalIDHolder(p, start_a, end_a)
            second = TemporalIDHolder(p, start_b, end_b)
            self.assertEqual(first.overlaps(second), True)

        # ---------------------------------------------------------------------
        # Non-overlaps
        # ---------------------------------------------------------------------
        disjoint = (
            ((d1, d2), (d3, d4)),
            ((None, d1), (d2, None)),
        )
        for (start_a, end_a), (start_b, end_b) in disjoint:
            first = TemporalIDHolder(p, start_a, end_a)
            second = TemporalIDHolder(p, start_b, end_b)
            self.assertEqual(first.overlaps(second), False)
[docs]class FuzzyLinkageTests(unittest.TestCase):
"""
Tests of the fuzzy linkage system.
"""
def __init__(self, *args, **kwargs) -> None:
    """
    Create the shared config, two postcodes and twelve sample people.

    Each person is kept as a named attribute, and all of them are also
    collected (in plaintext and hashed form, in the same order) into lists
    and :class:`People` containers.
    """
    super().__init__(*args, **kwargs)
    cfg = mk_test_config(rounding_sf=None)
    self.cfg = cfg

    def mk_postcode(postcode: str) -> Postcode:
        # Both test postcodes share the same validity period.
        return Postcode(
            cfg=cfg,
            postcode=postcode,
            start_date=Date(2000, 1, 1),
            end_date=Date(2010, 1, 1),
        )

    def mk_person(local_id: str, **fields) -> Person:
        # Only the fields given are passed on, so omitted ones keep the
        # defaults of Person.
        return Person(cfg=cfg, local_id=local_id, **fields)

    def mk_rarename(local_id: str, forenames: List[str]) -> Person:
        # Surname "Rarename", born 2000-01-01, living at self.p1.
        return mk_person(
            local_id,
            forenames=forenames,
            surnames=["Rarename"],
            dob="2000-01-01",
            postcodes=[self.p1],
        )

    self.p1 = mk_postcode("CB2 0QQ")  # Addenbrooke's Hospital
    self.p2 = mk_postcode("CB2 3EB")  # Department of Psychology

    self.alice_bcd_rarename_2000_add = mk_rarename(
        "1", ["Alice", "Beatrice", "Celia", "Delilah"]
    )
    # Alec: same metaphone as Alice
    self.alec_bcd_rarename_2000_add = mk_rarename(
        "2", ["Alec", "Beatrice", "Celia", "Delilah"]
    )
    self.bob_bcd_rarename_2000_add = mk_rarename(
        "3", ["Bob", "Beatrice", "Celia", "Delilah"]
    )
    self.alice_bc_rarename_2000_add = mk_rarename(
        "4", ["Alice", "Beatrice", "Celia"]
    )
    self.alice_b_rarename_2000_add = mk_rarename("5", ["Alice", "Beatrice"])
    self.alice_jones_2000_add = mk_person(
        "6",
        forenames=["Alice"],
        surnames=["Jones"],
        dob="2000-01-01",
        postcodes=[self.p1],
    )
    self.bob_smith_1950_psych = mk_person(
        "7",
        forenames=["Bob"],
        surnames=["Smith"],
        dob="1950-05-30",
        postcodes=[self.p2],
    )
    self.alice_smith_1930 = mk_person(
        "8", forenames=["Alice"], surnames=["Smith"], dob="1930-01-01"
    )
    self.alice_smith_2000 = mk_person(
        "9", forenames=["Alice"], surnames=["Smith"], dob="2000-01-01"
    )
    self.alice_smith = mk_person(
        "10", forenames=["Alice"], surnames=["Smith"]
    )
    self.alice_bc_smith = mk_person(
        "11", forenames=["Alice", "Betty", "Caroline"], surnames=["Smith"]
    )
    self.alice_bde_smith = mk_person(
        "12",
        forenames=["Alice", "Betty", "Dorothy", "Elizabeth"],
        surnames=["Smith"],
    )

    self.all_people = [
        self.alice_bcd_rarename_2000_add,
        self.alec_bcd_rarename_2000_add,
        self.bob_bcd_rarename_2000_add,
        self.alice_bc_rarename_2000_add,
        self.alice_b_rarename_2000_add,
        self.alice_jones_2000_add,
        self.bob_smith_1950_psych,
        self.alice_smith_1930,
        self.alice_smith_2000,
        self.alice_smith,
        self.alice_bc_smith,
        self.alice_bde_smith,
    ]
    self.all_people_hashed = [
        person.hashed() for person in self.all_people
    ]

    plaintext_container = People(cfg=cfg)
    plaintext_container.add_people(self.all_people)
    self.people_plaintext = plaintext_container

    hashed_container = People(cfg=cfg)
    hashed_container.add_people(self.all_people_hashed)
    self.people_hashed = hashed_container
# -------------------------------------------------------------------------
# Basic string transformations
# -------------------------------------------------------------------------
def test_standardize_name(self) -> None:
    """
    Check standardize_name() against known (input, expected) pairs.
    """
    # Maps each raw name to its expected standardized version.
    expected_by_name = {
        "Al Jazeera": "ALJAZEERA",
        "Al'Jazeera": "ALJAZEERA",
        "Al'Jazeera'": "ALJAZEERA",
        "Alice": "ALICE",
        "ALJAZEERA": "ALJAZEERA",
        "aljazeera": "ALJAZEERA",
        "D'Souza": "DSOUZA",
        "de Clérambault": "DECLERAMBAULT",
        "Mary Ellen": "MARYELLEN",
        '"Al Jazeera"': "ALJAZEERA",
        "Müller": "MULLER",
        "Straße": "STRASSE",
    }
    for raw_name, expected in expected_by_name.items():
        self.assertEqual(standardize_name(raw_name), expected)
def test_safe_upper(self) -> None:
    """
    Check safe_upper() against known (input, expected) pairs, including
    accented letters and the German sharp s.
    """
    cases = [
        ("Beethoven", "BEETHOVEN"),
        ("Clérambault", "CLÉRAMBAULT"),
        ("Straße", "STRAẞE"),
    ]
    for original, expected in cases:
        actual = safe_upper(original)
        self.assertEqual(actual, expected)
def test_remove_redundant_whitespace(self) -> None:
    """
    Check remove_redundant_whitespace() on a string with mixed whitespace.
    """
    cases = [(" van \t \r \n Beethoven ", "van Beethoven")]
    for messy, expected in cases:
        actual = remove_redundant_whitespace(messy)
        self.assertEqual(actual, expected)
def test_simplify_punctuation_whitespace(self) -> None:
    """
    Check simplify_punctuation_whitespace() on typographic quotes, assorted
    dash characters, and non-space whitespace.
    """
    cases = [
        ("\n ‘John said “hello”.’", " 'John said \"hello\".'"),
        ("\t a–b—c−d-e ", " a-b-c-d-e "),
    ]
    for fancy, expected in cases:
        actual = simplify_punctuation_whitespace(fancy)
        self.assertEqual(actual, expected)
def test_surname_fragments(self) -> None:
    """
    Check surname_alternative_fragments() against expected fragment lists,
    using the transliterations and nonspecific components from the config.
    """
    transliterations = self.cfg.accent_transliterations
    nonspecific = self.cfg.nonspecific_name_components
    cases = [
        # In the expected answer, the original name (standardized) comes
        # first; then alphabetical order of all other variants. Some
        # examples are silly.
        #
        # France/French:
        (
            "Côte d'Ivoire",
            ["CÔTEDIVOIRE", "COTE", "COTEDIVOIRE", "CÔTE", "IVOIRE"],
        ),
        (
            "de Clérambault",
            [
                "DECLÉRAMBAULT",
                "CLERAMBAULT",
                "CLÉRAMBAULT",
                "DECLERAMBAULT",
            ],
        ),
        (
            "de la Billière",
            ["DELABILLIÈRE", "BILLIERE", "BILLIÈRE", "DELABILLIERE"],
        ),
        ("Façade", ["FAÇADE", "FACADE"]),
        ("Giscard d'Estaing", ["GISCARDDESTAING", "ESTAING", "GISCARD"]),
        ("L'Estrange", ["LESTRANGE", "ESTRANGE"]),
        ("L’Estrange", ["LESTRANGE", "ESTRANGE"]),
        # Germany (and in Beethoven's case, ancestrally Belgium):
        ("Beethoven", ["BEETHOVEN"]),
        ("Mozart Smith", ["MOZARTSMITH", "MOZART", "SMITH"]),
        ("Mozart-Smith", ["MOZARTSMITH", "MOZART", "SMITH"]),
        ("Müller", ["MÜLLER", "MUELLER", "MULLER"]),
        ("Straße", ["STRAẞE", "STRASSE"]),
        ("van Beethoven", ["VANBEETHOVEN", "BEETHOVEN"]),
        # Italy:
        ("Calabrò", ["CALABRÒ", "CALABRO"]),
        ("De Marinis", ["DEMARINIS", "MARINIS"]),
        ("di Bisanzio", ["DIBISANZIO", "BISANZIO"]),
        # Sweden:
        ("Nyström", ["NYSTRÖM", "NYSTROEM", "NYSTROM"]),
        # Hmm. NYSTROEM is a German-style transliteration. Still, OK-ish.
    ]
    for raw_surname, expected_fragments in cases:
        actual_fragments = surname_alternative_fragments(
            surname=raw_surname,
            accent_transliterations=transliterations,
            nonspecific_name_components=nonspecific,
        )
        self.assertEqual(actual_fragments, expected_fragments)
def test_date_regex(self) -> None:
    """
    Check that is_valid_isoformat_date() rejects every bad date string and
    accepts every good one.
    """
    for bad_date in BAD_DATE_STRINGS:
        accepted = is_valid_isoformat_date(bad_date)
        self.assertFalse(accepted)
    for good_date in GOOD_DATE_STRINGS:
        accepted = is_valid_isoformat_date(good_date)
        self.assertTrue(accepted)
def test_standardize_postcode(self) -> None:
    """
    Check standardize_postcode() against known (input, expected) pairs.
    """
    # Maps each raw postcode to its expected standardized version.
    expected_by_postcode = {
        "CB20QQ": "CB20QQ",
        " CB2 0QQ ": "CB20QQ",
        " CB2-0 QQ ": "CB20QQ",
        "cb2 0qq": "CB20QQ",
    }
    for raw_postcode, expected in expected_by_postcode.items():
        self.assertEqual(standardize_postcode(raw_postcode), expected)
def test_get_postcode_sector(self) -> None:
    """
    Check get_postcode_sector() against known (postcode, sector) pairs.
    """
    # Maps each raw postcode to its expected sector.
    expected_sector_by_postcode = {
        "CB20QQ": "CB20",
        " CB2 0QQ ": "CB20",
        " CB2-0 QQ ": "CB20",
        "cb2 0qq": "CB20",
    }
    for raw_postcode, expected in expected_sector_by_postcode.items():
        self.assertEqual(get_postcode_sector(raw_postcode), expected)
def test_postcode_regex(self) -> None:
    """
    Check POSTCODE_REGEX: bad postcodes must not match, either as written
    or after standardization; good postcodes must match once standardized.
    """
    for bad in BAD_POSTCODES:
        raw_match = POSTCODE_REGEX.match(bad)
        self.assertIsNone(raw_match, f"Postcode {bad!r} matched but is bad")
        standardized_bad = standardize_postcode(bad)
        standardized_match = POSTCODE_REGEX.match(standardized_bad)
        self.assertIsNone(
            standardized_match,
            f"Postcode {bad!r} matched after standardization to "
            f"{standardized_bad!r} but is bad",
        )
    for good in GOOD_POSTCODES:
        standardized_good = standardize_postcode(good)
        good_match = POSTCODE_REGEX.match(standardized_good)
        self.assertTrue(
            good_match,
            f"Postcode {standardized_good!r} (from {good!r}) did not match "
            f"but is good",
        )
# -------------------------------------------------------------------------
# Frequencies
# -------------------------------------------------------------------------
def test_fuzzy_linkage_frequencies_name(self) -> None:
    """
    Check that frequency lookups for surnames and forenames, whether or not
    they are among the predefined names, return records whose fields have
    the expected types.
    """
    cfg = self.cfg
    # Checked in this order for every frequency record.
    expected_field_types = (
        ("name", str),
        ("gender", str),
        ("p_name", float),
        ("metaphone", str),
        ("p_metaphone", float),
        ("p_metaphone_not_name", float),
        ("f2c", str),
        ("p_f2c", float),
        ("p_f2c_not_name_metaphone", float),
    )

    def check_field_types(freq_info) -> None:
        for field_name, field_type in expected_field_types:
            self.assertIsInstance(getattr(freq_info, field_name), field_type)

    surnames = [
        "Smith",
        "Jones",
        "Blair",
        "Cardinal",
        "XYZ",
        "W",  # no metaphone
    ]
    for surname in surnames:
        f = cfg.get_surname_freq_info(surname)
        log.info(f"Surname frequency for {surname}: {f}")
        check_field_types(f)

    forenames_with_gender = [
        ("James", GENDER_MALE),
        ("Rachel", GENDER_FEMALE),
        ("Phoebe", GENDER_FEMALE),
        ("Elizabeth", GENDER_FEMALE),
        ("Elizabeth", GENDER_MALE),
        ("Elizabeth", ""),
        ("Rowan", GENDER_FEMALE),
        ("Rowan", GENDER_MALE),
        ("Rowan", ""),
        ("XYZ", ""),
        ("W", ""),  # no metaphone
    ]
    for forename, gender in forenames_with_gender:
        f = cfg.get_forename_freq_info(forename, gender)
        log.info(
            f"Forename frequency for {forename}, gender {gender}: {f}"
        )
        check_field_types(f)
def test_fuzzy_linkage_frequencies_postcode(self) -> None:
    """
    Exercise the postcode unit/sector population lookups and log the
    results. There are no assertions; the test only checks that the calls
    complete.
    """
    cfg = self.cfg
    # Examples are hospitals and colleges in Cambridge (not residential)
    # but it gives a broad idea.
    postcode_units = ["CB2 0QQ", "CB2 0SZ", "CB2 3EB", "CB3 9DF"]
    postcode_sectors = ["CB2 0", "CB2 1", "CB2 2", "CB2 3"]
    for postcode in postcode_units:
        p = cfg.debug_postcode_unit_population(postcode)
        log.info(f"Calculated population for postcode unit {postcode}: {p}")
    for ps in postcode_sectors:
        p = cfg.debug_postcode_sector_population(ps)
        log.info(f"Calculated population for postcode sector {ps}: {p}")
# -------------------------------------------------------------------------
# Identifiers
# -------------------------------------------------------------------------
def test_identifier_dob(self) -> None:
    """
    Check the DateOfBirth identifier: rejection of bad strings, round-trip
    of good strings, and the ordering full match > partial match > mismatch
    in posterior log odds (from prior log odds of 0).
    """
    cfg = self.cfg

    for bad_date in BAD_DATE_STRINGS:
        with self.assertRaises(ValueError):
            _ = DateOfBirth(cfg, bad_date)

    # Retains the value from the last good date, for later comparisons.
    full_match_log_lr = None  # type: Optional[float]
    for good_date in GOOD_DATE_STRINGS:
        dob = DateOfBirth(cfg, good_date)
        self.assertEqual(dob.dob_str, good_date)
        self.assertEqual(str(dob), good_date)
        self.assertTrue(dob.fully_matches(dob))
        full_match_log_lr = dob.comparison(dob).posterior_log_odds(0)
        self.assertGreater(full_match_log_lr, 0)

    partial_pairs = [
        ("2000-01-01", "2007-01-01"),  # year mismatch only
        ("2000-01-01", "2000-07-01"),  # month mismatch only
        ("2000-01-01", "2000-01-07"),  # day mismatch only
    ]
    # Retains the value from the last partial pair, for later comparisons.
    partial_match_log_lr = None  # type: Optional[float]
    for first_str, second_str in partial_pairs:
        first = DateOfBirth(cfg, first_str)
        second = DateOfBirth(cfg, second_str)
        self.assertFalse(first.fully_matches(second))
        self.assertFalse(second.fully_matches(first))
        self.assertTrue(first.partially_matches(second))
        self.assertTrue(second.partially_matches(first))
        partial_match_log_lr = first.comparison(second).posterior_log_odds(0)
        self.assertLess(partial_match_log_lr, full_match_log_lr)

    mismatching_pairs = [
        ("2000-01-01", "2007-07-01"),  # only day the same
        ("2000-01-01", "2000-07-07"),  # only year the same
        ("2000-01-01", "2007-01-07"),  # only month the same
    ]
    for first_str, second_str in mismatching_pairs:
        first = DateOfBirth(cfg, first_str)
        second = DateOfBirth(cfg, second_str)
        self.assertFalse(first.fully_matches(second))
        self.assertFalse(second.fully_matches(first))
        self.assertFalse(first.partially_matches(second))
        self.assertFalse(second.partially_matches(first))
        mismatch_log_lr = first.comparison(second).posterior_log_odds(0)
        self.assertLess(mismatch_log_lr, 0)
        self.assertLess(mismatch_log_lr, partial_match_log_lr)
def test_identifier_postcode(self) -> None:
    """
    Check the Postcode identifier: rejection of bad postcodes and of
    reversed date ranges, standardization of good postcodes, and the
    ordering full match > partial (same-sector) match > mismatch in
    posterior log odds (from prior log odds of 0), under several configs.
    """
    cfg = self.cfg
    configs = [
        cfg,
        # Check extremes of k_postcode:
        mk_test_config(k_postcode=1),
        mk_test_config(k_postcode=1000),
        # Check extremes of p_unknown_or_pseudo_postcode, k_pseudopostcode:
        mk_test_config(
            p_unknown_or_pseudo_postcode=0.00001, k_pseudopostcode=1.2
        ),
        mk_test_config(
            p_unknown_or_pseudo_postcode=0.01, k_pseudopostcode=3
        ),
        # Very low combinations, e.g.
        # p_unknown_or_pseudo_postcode=0.00001, k_pseudopostcode=1.001, may
        # cause an error here. Very high combinations, e.g.
        # p_unknown_or_pseudo_postcode=0.1, k_pseudopostcode=3, may also
        # cause an error.
    ]
    # Any invalid settings are detected by the Postcode identifier class
    # checking that its comparisons are in a sensible order. All
    # identifiers do this, in fact.

    for b in BAD_POSTCODES:
        with self.assertRaises(ValueError):
            _ = Postcode(cfg, b)

    early = Date(2020, 1, 1)
    late = Date(2021, 12, 31)
    for g in GOOD_POSTCODES:  # includes pseudopostcodes
        # A start date after the end date must be rejected.
        with self.assertRaises(ValueError):
            _ = Postcode(cfg, g, start_date=late, end_date=early)
        p = Postcode(cfg, g)
        self.assertEqual(p.postcode_unit, standardize_postcode(g))
        self.assertTrue(p.fully_matches(p))

    empty = Postcode(cfg, "")
    self.assertEqual(str(empty), "")

    probe_partial_mismatch = (
        # Each tuple: (1) a postcode; (2) same sector, different unit; (3)
        # different sector.
        ("CB99 9XY", "CB99 9AB", "CB99 7AB"),  # nonsense
        ("CB2 0QQ", "CB2 0SL", "SW1A 2AA"),  # CUH 1, CUH 2, 10 Downing St
        ("ZZ99 3VZ", "ZZ99 3WZ", "ZZ99 1WZ"),  # pseudo: NFA, sea, Orkney
    )
    for probe_str, partial_str, mismatch_str in probe_partial_mismatch:
        for c in configs:
            p1 = Postcode(c, probe_str)
            p2 = Postcode(c, partial_str)
            p3 = Postcode(c, mismatch_str)

            # Everything matches itself.
            self.assertTrue(p1.fully_matches(p1))
            self.assertTrue(p2.fully_matches(p2))
            self.assertTrue(p3.fully_matches(p3))

            # Nothing matches another.
            self.assertFalse(p1.fully_matches(p2))
            self.assertFalse(p1.fully_matches(p3))
            self.assertFalse(p2.fully_matches(p3))

            # The partial match partially matches.
            self.assertTrue(p1.partially_matches(p2))

            # The nonmatch doesn't partially match.
            self.assertFalse(p1.partially_matches(p3))

            full_comp = p1.comparison(p1)
            full_log_lr = full_comp.posterior_log_odds(0)
            partial_comp = p1.comparison(p2)
            partial_log_lr = partial_comp.posterior_log_odds(0)
            nonmatch_comp = p1.comparison(p3)
            nonmatch_log_lr = nonmatch_comp.posterior_log_odds(0)

            self.assertGreater(
                full_log_lr,
                0,
                f"comparing {probe_str!r} to itself, giving {full_comp!r}",
            )
            # The failure messages report "c", the config actually used to
            # build these postcodes, not the default config.
            self.assertLess(
                partial_log_lr,
                full_log_lr,
                f"comparing {probe_str!r} to {partial_str!r} "
                f"(partial match); \ncfg = {c};\n"
                f"p1 = {p1!r};\n"
                f"giving {partial_comp!r}, versus the exact comparison "
                f"{full_comp!r}",
            )
            self.assertLess(
                nonmatch_log_lr,
                partial_log_lr,
                f"comparing {probe_str!r} to {mismatch_str!r} "
                f"(nonmatch); \ncfg = {c};"
                f"\np1 = {p1!r};\n"
                f"giving {nonmatch_comp!r}, versus the previous partial "
                f"comparison {partial_comp!r}",
            )
def test_identifier_gender(self) -> None:
    """
    Check the Gender identifier: rejection of bad values, round-trip of
    valid values, truthiness, self-matching, cross-gender non-matching, and
    the sign of the posterior log odds (from prior log odds of 0).
    """
    cfg = self.cfg

    for b in BAD_GENDERS:
        with self.assertRaises(ValueError):
            _ = Gender(cfg, b)

    for g_str in VALID_GENDERS:
        g = Gender(cfg, g_str)
        # Routine diagnostic output: logged at INFO like the rest of this
        # file, not CRITICAL.
        log.info(f"g = {g!r}")
        self.assertEqual(g.gender_str, g_str)
        self.assertEqual(str(g), g_str)
        if not g:
            # A falsy Gender is skipped for the matching checks below.
            continue
        self.assertTrue(g.fully_matches(g))
        comp = g.comparison(g)
        if comp:
            self.assertGreater(comp.posterior_log_odds(0), 0)

    empty = Gender(cfg, GENDER_MISSING)
    m = Gender(cfg, GENDER_MALE)
    f = Gender(cfg, GENDER_FEMALE)
    x = Gender(cfg, GENDER_OTHER)
    for gender in (empty, m, f, x):
        gender.ensure_has_freq_info_if_id_present()

    self.assertEqual(str(empty), "")

    self.assertTrue(bool(m))
    self.assertTrue(bool(f))
    self.assertTrue(bool(x))
    self.assertFalse(bool(empty))

    self.assertTrue(m.fully_matches(m))
    self.assertTrue(m.comparison_relevant(m))
    self.assertTrue(f.comparison_relevant(f))

    self.assertFalse(m.fully_matches(f))
    self.assertFalse(m.fully_matches(x))
    self.assertFalse(f.fully_matches(m))
    self.assertFalse(f.fully_matches(x))

    # Reuse the comparison that has just been checked for existence.
    f_comp_f = f.comparison(f)
    self.assertIsNotNone(f_comp_f)
    self.assertGreater(f_comp_f.posterior_log_odds(0), 0)
    self.assertLess(f.comparison(m).posterior_log_odds(0), 0)
def test_identifier_surname_fragment(self) -> None:
    """
    Check that a SurnameFragment matches itself (fully and partially) in
    both plaintext and hashed form, and that plaintext never matches hashed.
    """
    plain = SurnameFragment(self.cfg, name="Smith", gender=GENDER_MALE)
    hashed = plain.hashed()

    # Plaintext against itself.
    self.assertTrue(plain.fully_matches(plain))
    self.assertTrue(plain.partially_matches(plain))

    # Plaintext against hashed.
    self.assertFalse(plain.fully_matches(hashed))
    self.assertFalse(plain.partially_matches(hashed))

    # Hashed against itself.
    self.assertTrue(hashed.fully_matches(hashed))
    self.assertTrue(hashed.partially_matches(hashed))
def test_identifier_surname(self) -> None:
    """
    Check full, partial and non-matching behaviour of Surname identifiers,
    including double-barrelled names and hashed versions.
    """
    # https://en.wikipedia.org/wiki/Double-barrelled_name
    cfg = self.cfg
    g = GENDER_FEMALE

    def mk_surname(name: str) -> Surname:
        return Surname(cfg, name=name, gender=g)

    jones = mk_surname("Jones")
    mozart = mk_surname("Mozart")
    mozart_smith_hy = mk_surname("Mozart-Smith")
    mozart_smith_sp = mk_surname("Mozart Smith")
    smith = mk_surname("Smith")
    smythe = mk_surname("Smythe")

    mozart_hashed = mozart.hashed()
    mozart_smith_hashed = mozart_smith_hy.hashed()
    smith_hashed = smith.hashed()
    smythe_hashed = smythe.hashed()

    full_match_pairs = [
        (jones, jones),
        (mozart_smith_hy, mozart),
        (mozart_smith_hy, mozart_smith_hy),
        (mozart_smith_hy, mozart_smith_sp),
        (mozart_smith_hy, smith),
        (mozart_smith_sp, mozart),
        (mozart_smith_sp, mozart_smith_hy),
        (mozart_smith_sp, smith),
        (smith, smith),
        (smythe, smythe),
        (mozart_hashed, mozart_hashed),
        (mozart_smith_hashed, mozart_smith_hashed),
        (smith_hashed, smith_hashed),
        (smythe_hashed, smythe_hashed),
    ]
    partial_match_pairs = [
        (mozart_smith_hy, smythe),
        (mozart_smith_sp, smythe),
        (smith, smythe),
        (smith_hashed, smythe_hashed),
        (mozart_smith_hashed, smythe_hashed),
    ]
    no_match_pairs = [
        (jones, mozart_smith_hy),
        (jones, mozart_smith_sp),
        (smith, jones),
        (smith, mozart),
        (smith, smith_hashed),
        (smythe, smythe_hashed),
    ]

    for left, right in full_match_pairs:
        self.assertTrue(left.fully_matches(right))
    for left, right in partial_match_pairs:
        self.assertFalse(left.fully_matches(right))
        self.assertTrue(left.partially_matches(right))
    for left, right in no_match_pairs:
        self.assertFalse(left.fully_matches(right))
        self.assertFalse(left.partially_matches(right))
# -------------------------------------------------------------------------
# Lots of identifiers
# -------------------------------------------------------------------------
# -------------------------------------------------------------------------
# Person checks
# -------------------------------------------------------------------------
def test_person_creation(self) -> None:
    """
    Check that Person drops blank, whitespace-only and None entries from
    its forenames, surnames and postcodes, keeping only the two real values.
    """
    cfg = self.cfg
    # Test the removal of blank names, etc.
    space = " "
    blank = ""

    with_forenames = Person(
        cfg, local_id="p1", forenames=["A", blank, space, None, "B"]
    )
    self.assertEqual(len(with_forenames.forenames), 2)

    with_surnames = Person(
        cfg, local_id="p2", surnames=["A", blank, space, None, "B"]
    )
    self.assertEqual(len(with_surnames.surnames), 2)

    first_postcode, second_postcode = GOOD_POSTCODES[0], GOOD_POSTCODES[1]
    with_postcodes = Person(
        cfg,
        local_id="p3",
        postcodes=[first_postcode, blank, space, second_postcode],
    )
    self.assertEqual(len(with_postcodes.postcodes), 2)
def test_person_equality(self) -> None:
    """
    Check that people with different local IDs compare unequal, people with
    the same local ID compare equal, and that adding a second person with an
    existing local ID to a People container raises DuplicateIDError.
    """
    cfg = self.cfg
    hello = Person(cfg, local_id="hello")
    world = Person(cfg, local_id="world")
    world_again = Person(cfg, local_id="world")

    self.assertNotEqual(hello, world)
    self.assertEqual(world, world_again)

    container = People(cfg)
    container.add_person(hello)
    container.add_person(world)
    with self.assertRaises(DuplicateIDError):
        container.add_person(world_again)
def test_person_copy(self) -> None:
    """
    Check that copy() yields a person whose every attribute listed in
    Person.ALL_PERSON_KEYS equals the original's.
    """
    originals = [self.alice_smith]
    for original in originals:
        duplicate = original.copy()
        for key in Person.ALL_PERSON_KEYS:
            orig_value = getattr(original, key)
            copy_value = getattr(duplicate, key)
            failure_msg = (
                f"mismatch for {key}:\n"
                f"{orig_value!r}\n!=\n{copy_value!r}"
            )
            self.assertEqual(orig_value, copy_value, failure_msg)
# -------------------------------------------------------------------------
# Person comparisons
# -------------------------------------------------------------------------
def test_fuzzy_linkage_matches(self) -> None:
    """
    Run a fixed list of person-versus-person comparisons.

    Each scenario is wrapped in a TestCondition, whose
    ``check_comparison_as_expected()`` method is then called in turn.
    """
    # Each entry: (person_a, person_b, should_match)
    scenarios = [
        # Very easy match
        (
            self.alice_bcd_rarename_2000_add,
            self.alice_bcd_rarename_2000_add,
            True,
        ),
        # Easy match
        (
            self.alice_bc_rarename_2000_add,
            self.alice_b_rarename_2000_add,
            True,
        ),
        # Easy non-match
        (self.alice_jones_2000_add, self.bob_smith_1950_psych, False),
        # Very ambiguous (1)
        (self.alice_smith, self.alice_smith_1930, False),
        # Very ambiguous (2)
        (self.alice_smith, self.alice_smith_2000, False),
        (
            self.alice_bcd_rarename_2000_add,
            self.alec_bcd_rarename_2000_add,
            True,
        ),
        (
            self.alice_bcd_rarename_2000_add,
            self.bob_bcd_rarename_2000_add,
            True,  # used to be False
        ),
    ]
    # Build every condition up front, then check them one by one.
    conditions = [
        TestCondition(
            cfg=self.cfg,
            person_a=first,
            person_b=second,
            should_match=expected,
        )
        for first, second, expected in scenarios
    ]  # type: List[TestCondition]

    log.info("Testing comparisons...")
    for number, condition in enumerate(conditions, start=1):
        log.info(f"Comparison {number}...")
        condition.check_comparison_as_expected()
def test_fuzzy_more_complex(self) -> None:
    """
    Search for every test person in the plaintext and hashed samples.

    Each person in ``self.all_people`` is used as a proband against
    ``self.people_plaintext``; the entry at the same position in
    ``self.all_people_hashed`` is used against ``self.people_hashed``.
    The winners are only logged: nothing is asserted, so the test passes
    as long as the searches run without raising.
    """
    log.info("Testing proband-versus-sample...")
    for position, proband_plaintext in enumerate(self.all_people):
        log.info(f"Plaintext search with proband: {proband_plaintext}")
        plaintext_winner = self.people_plaintext.get_unique_match(
            proband_plaintext
        )
        log.info(f"... WINNER: {plaintext_winner}")

        # The plaintext proband is what gets logged here as well.
        log.info(f"Hashed search with proband: {proband_plaintext}\n")
        # The hashed list is in the same order as the plaintext list.
        proband_hashed = self.all_people_hashed[position]
        hashed_winner = self.people_hashed.get_unique_match(proband_hashed)
        log.info(f"... WINNER: {hashed_winner}")
def test_exact_match(self) -> None:
    """
    Test the exact-match system.

    Two people with different local IDs but the same perfect ID are
    created. With only the first in a People object, looking up either of
    them must return the first as the winner with infinite log odds, and
    adding the second to that People object must raise DuplicateIDError.
    """
    id_type = "nhsnum"
    id_value = 3

    # Two people who share a perfect ID (and have no other identifiers);
    # only their local IDs differ.
    p1 = Person(
        cfg=self.cfg, local_id="p1", perfect_id={id_type: id_value}
    )
    p2 = Person(
        cfg=self.cfg, local_id="p2", perfect_id={id_type: id_value}
    )

    # Perfect ID comparison is a function of a People object, not Person.
    people = People(cfg=self.cfg, people=[p1])

    # Match to self:
    result_p1 = people.get_unique_match_detailed(p1)
    self.assertEqual(result_p1.winner, p1)
    self.assertEqual(result_p1.best_log_odds, INFINITY)

    # Match to another with the same perfect ID:
    result_p2 = people.get_unique_match_detailed(p2)
    self.assertEqual(result_p2.winner, p1)
    self.assertEqual(result_p2.best_log_odds, INFINITY)

    # No two people in a People object with the same ID:
    with self.assertRaises(DuplicateIDError):
        people.add_person(p2)
# -------------------------------------------------------------------------
# People checks
# -------------------------------------------------------------------------
# See also test_person_equality() above.
def test_shortlist(self) -> None:
    """
    Our shortlisting process typically permits people with completely
    matching or partially matching DOBs, but not those with mismatched DOBs
    (for efficiency). Test that.

    Three configurations are exercised against the same set of people:
    the default test config (partial DOB mismatches allowed, complete
    mismatches not), one allowing complete mismatches, and one allowing
    exact DOB matches only.
    """
    # Some test people:
    cfg1 = self.cfg
    proband = Person(cfg1, local_id="p1", dob="1950-01-01")
    full_dob_match = [
        # Full DOB match:
        Person(cfg1, local_id="p2", dob="1950-01-01"),
    ]
    partial_dob_match = [
        # Two components of DOB match:
        Person(cfg1, local_id="p3", dob="2000-01-01"),
        Person(cfg1, local_id="p4", dob="1950-12-01"),
        Person(cfg1, local_id="p5", dob="1950-01-12"),
    ]
    dob_mismatch = [
        # One component of DOB matches:
        Person(cfg1, local_id="p6", dob="1950-12-12"),
        Person(cfg1, local_id="p7", dob="2000-01-12"),
        Person(cfg1, local_id="p8", dob="2000-12-01"),
        # No component of DOB matches:
        Person(cfg1, local_id="p9", dob="2000-12-12"),
    ]
    all_people = (
        [proband] + full_dob_match + partial_dob_match + dob_mismatch
    )

    # A setup where we don't shortlist mismatched DOBs:
    self.assertEqual(cfg1.complete_dob_mismatch_allowed, False)
    self.assertEqual(cfg1.partial_dob_mismatch_allowed, True)
    people1 = People(cfg1, people=all_people)
    shortlist1 = list(people1.gen_shortlist(proband))
    self.assertIn(proband, shortlist1)
    for person in full_dob_match + partial_dob_match:
        self.assertIn(person, shortlist1)
    for person in dob_mismatch:
        self.assertNotIn(person, shortlist1)

    # And one where we do:
    cfg2 = mk_test_config(p_en_dob=FuzzyDefaults.P_EN_DOB_TRUE)
    self.assertEqual(cfg2.complete_dob_mismatch_allowed, True)
    self.assertEqual(cfg2.partial_dob_mismatch_allowed, True)
    people2 = People(cfg2, people=all_people)
    shortlist2 = list(people2.gen_shortlist(proband))
    for person in all_people:
        self.assertIn(person, shortlist2)

    # And one where only exact DOB matches are allowed:
    cfg3 = mk_test_config(p_ep_dob=0, p_en_dob=0)
    self.assertEqual(cfg3.complete_dob_mismatch_allowed, False)
    self.assertEqual(cfg3.partial_dob_mismatch_allowed, False)
    people3 = People(cfg3, people=all_people)
    shortlist3 = list(people3.gen_shortlist(proband))
    self.assertIn(proband, shortlist3)
    for person in full_dob_match:
        self.assertIn(person, shortlist3)
    for person in partial_dob_match + dob_mismatch:
        self.assertNotIn(person, shortlist3)
# -------------------------------------------------------------------------
# Multiple comparison correction checks
# -------------------------------------------------------------------------
class MultipleComparisonTestBase(unittest.TestCase):
    """
    Constants shared by the multiple-comparison correction test classes.
    """

    # Arbitrary probability; the ordered tests pass it to
    # gen_best_comparisons() as ``p_u``.
    P_U = 0.1
    # Complement of P_U.
    P_O = 1 - P_U
    # Floating-point tolerance for assertAlmostEqual().
    DELTA = 1e-10
class UnorderedMultipleComparisonTests(MultipleComparisonTestBase):
    """
    Tests of gen_best_comparisons() with ``ordered=False``.
    """

    @staticmethod
    def compare(
        proband_identifiers: List[Identifier],
        candidate_identifiers: List[Identifier],
    ) -> List[Comparison]:
        """
        Return, as a list, everything that gen_best_comparisons() yields
        for an unordered comparison of the two identifier lists.
        """
        return list(
            gen_best_comparisons(
                proband_identifiers=proband_identifiers,
                candidate_identifiers=candidate_identifiers,
                ordered=False,
            )
        )

    def _assert_direct_comparisons(
        self, comparisons: List[Comparison], descriptions: List[str]
    ) -> None:
        """
        Check that the leading comparisons are DirectComparison objects
        with the given ``d_description`` values, in order.
        """
        for comparison, description in zip(comparisons, descriptions):
            self.assertIsInstance(comparison, DirectComparison)
            self.assertEqual(comparison.d_description, description)

    def _assert_correction(
        self, comparison: Comparison, expected_llr: float
    ) -> None:
        """
        Check that ``comparison`` is an AdjustLogOddsComparison whose log
        likelihood ratio is ``expected_llr`` (within ``self.DELTA``).
        """
        self.assertIsInstance(comparison, AdjustLogOddsComparison)
        self.assertAlmostEqual(
            comparison.log_likelihood_ratio, expected_llr, delta=self.DELTA
        )

    def test_same_single_id_returns_one_match_and_no_correction(
        self,
    ) -> None:
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # UNORDERED, one/one identifier
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        a = DummyLetterIdentifier("A")
        result = self.compare([a], [a])
        self.assertEqual(len(result), 1)  # ... one match, no correction
        self._assert_direct_comparisons(result, ["dummy_match:A"])

    def test_same_two_ids_returns_two_matches_and_a_correction(
        self,
    ) -> None:
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Unordered, two/two identifiers
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        a = DummyLetterIdentifier("A")
        b = DummyLetterIdentifier("B")
        result = self.compare([a, b], [a, b])
        self.assertEqual(len(result), 3)  # two matches and a correction
        self._assert_direct_comparisons(
            result, ["dummy_match:A", "dummy_match:B"]
        )
        # Correction should be for 2 hits from 2 comparisons, and a
        # Bonferroni correction:
        self._assert_correction(result[-1], -ln(2))

    def test_same_three_ids_returns_three_matches_and_a_correction(
        self,
    ) -> None:
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Unordered, three/three identifiers
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        a = DummyLetterIdentifier("A")
        b = DummyLetterIdentifier("B")
        c = DummyLetterIdentifier("C")
        result = self.compare([a, b, c], [a, b, c])
        self.assertEqual(len(result), 4)  # three matches and a correction
        self._assert_direct_comparisons(
            result, ["dummy_match:A", "dummy_match:B", "dummy_match:C"]
        )
        # Correction should be for 3 hits from 6 comparisons:
        self._assert_correction(result[-1], -ln(6))

    def test_one_out_of_three_ids_returns_three_matches_and_a_correction(
        self,
    ) -> None:
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Unordered, one/three identifiers
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        a = DummyLetterIdentifier("A")
        b = DummyLetterIdentifier("B")
        c = DummyLetterIdentifier("C")
        result = self.compare([a], [a, b, c])
        self.assertEqual(len(result), 2)  # ... one match and a correction
        self._assert_direct_comparisons(result, ["dummy_match:A"])
        # Correction should be for 1 hit from 3 comparisons:
        self._assert_correction(result[-1], -ln(3))

    def test_with_incomparable_identifiers(self) -> None:
        """
        Use identifiers that aren't allowed to be compared, e.g. names with
        non-overlapping timestamps. This will give a comparison that is
        ``None``, and make the code coverage checks happy.

        .. code-block:: bash

            pip install pytest-cov
            pytest --cov --cov-report html
        """
        a_early = DummyLetterTemporalIdentifier(
            value="A", start_date="1900-01-01", end_date="1900-12-31"
        )
        a_late = DummyLetterTemporalIdentifier(
            value="A", start_date="2000-01-01", end_date="2000-12-31"
        )
        result = self.compare([a_early], [a_late])
        self.assertEqual(len(result), 0)  # no comparisons
class OrderedMultipleComparisonTests(MultipleComparisonTestBase):
    """
    Tests of gen_best_comparisons() with ``ordered=True`` and
    ``p_u=self.P_U``.
    """

    def compare(
        self,
        proband_identifiers: List[Identifier],
        candidate_identifiers: List[Identifier],
    ) -> List[Comparison]:
        """
        Return, as a list, everything that gen_best_comparisons() yields
        for an ordered comparison of the two identifier lists, using
        ``self.P_U`` as ``p_u``.
        """
        return list(
            gen_best_comparisons(
                proband_identifiers=proband_identifiers,
                candidate_identifiers=candidate_identifiers,
                ordered=True,
                p_u=self.P_U,
            )
        )

    def _assert_direct_comparisons(
        self, comparisons: List[Comparison], descriptions: List[str]
    ) -> None:
        """
        Check that the leading comparisons are DirectComparison objects
        with the given ``d_description`` values, in order.
        """
        for comparison, description in zip(comparisons, descriptions):
            self.assertIsInstance(comparison, DirectComparison)
            self.assertEqual(comparison.d_description, description)

    def _assert_correction(
        self, comparison: Comparison, expected_llr: float
    ) -> None:
        """
        Check that ``comparison`` is an AdjustLogOddsComparison whose log
        likelihood ratio is ``expected_llr`` (within ``self.DELTA``).
        """
        self.assertIsInstance(comparison, AdjustLogOddsComparison)
        self.assertAlmostEqual(
            comparison.log_likelihood_ratio, expected_llr, delta=self.DELTA
        )

    def test_same_single_identifier_returns_one_match_and_no_correction(
        self,
    ) -> None:
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # ORDERED, one/one identifier
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        a = DummyLetterIdentifier("A")
        result = self.compare([a], [a])
        self.assertEqual(len(result), 1)  # ... one match, no correction
        self._assert_direct_comparisons(result, ["dummy_match:A"])

    def test_same_two_ids_same_order_returns_two_matches_and_a_correction(
        self,
    ) -> None:
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Ordered, two/two identifiers, correct order
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        a = DummyLetterIdentifier("A")
        b = DummyLetterIdentifier("B")
        result = self.compare([a, b], [a, b])
        self.assertEqual(len(result), 3)  # two matches and a correction
        self._assert_direct_comparisons(
            result, ["dummy_match:A", "dummy_match:B"]
        )
        # - P(D|H) correction: +ln(p_o).
        # - P(D|¬H) correction: nothing, i.e. -ln(1) = 0.
        self._assert_correction(result[-1], ln(self.P_O))

    def test_same_two_ids_diff_order_returns_two_matches_and_a_correction(
        self,
    ) -> None:
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Ordered, two/two identifiers, wrong order
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        a = DummyLetterIdentifier("A")
        b = DummyLetterIdentifier("B")
        result = self.compare([a, b], [b, a])
        self.assertEqual(len(result), 3)  # two matches and a correction
        self._assert_direct_comparisons(
            result, ["dummy_match:A", "dummy_match:B"]
        )
        # - P(D|H) correction: +ln(p_u).
        # - P(D|¬H) correction: Bonferroni for 2 options but minus one for
        #   the ordered option, so nothing.
        self._assert_correction(result[-1], ln(self.P_U))

    def test_same_three_ids_same_order_returns_three_matches_and_a_correction(
        self,
    ) -> None:
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Ordered, three/three identifiers, correct order
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        a = DummyLetterIdentifier("A")
        b = DummyLetterIdentifier("B")
        c = DummyLetterIdentifier("C")
        result = self.compare([a, b, c], [a, b, c])
        self.assertEqual(len(result), 4)  # three matches and a correction
        self._assert_direct_comparisons(
            result, ["dummy_match:A", "dummy_match:B", "dummy_match:C"]
        )
        # - P(D|H) correction: +ln(p_o).
        # - P(D|¬H) correction: nothing (correct order).
        self._assert_correction(result[-1], ln(self.P_O))

    def test_same_three_ids_diff_order_returns_three_matches_and_a_correction(
        self,
    ) -> None:
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Ordered, three/three identifiers, wrong order
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        a = DummyLetterIdentifier("A")
        b = DummyLetterIdentifier("B")
        c = DummyLetterIdentifier("C")
        result = self.compare([a, b, c], [b, c, a])
        self.assertEqual(len(result), 4)  # three matches and a correction
        self._assert_direct_comparisons(
            result, ["dummy_match:B", "dummy_match:C", "dummy_match:A"]
        )
        # - P(D|H) correction: +ln(p_u).
        # - P(D|¬H) correction: Bonferroni for 6 options minus the one for
        #   the correct order.
        self._assert_correction(result[-1], ln(self.P_U) - ln(5))

    def test_two_of_three_matching_ids_returns_three_matches_and_a_correction(
        self,
    ) -> None:
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # Ordered, three/three identifiers, two match, wrong order
        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        a = DummyLetterIdentifier("A")
        b = DummyLetterIdentifier("B")
        c = DummyLetterIdentifier("C")
        d = DummyLetterIdentifier("D")
        # Comparing proband [a, b, c] to candidate [b, c, d]:
        #
        #   p = proband index
        #   c = candidate index
        #   d = distance
        #   LLR = log likelihood ratio
        #
        #                           p   c   d   LLR
        #   a - b   mismatch    A   0   0   0   -4.5
        #   a - c   mismatch    A   0   1   1   -4.5
        #   a - d   mismatch    A   0   2   4   -4.5
        #   b - b   match       B   1   0   1   3.2
        #   b - c   mismatch    B   1   1   0   -4.5
        #   b - d   mismatch    B   1   2   1   -4.5
        #   c - b   mismatch    C   2   0   4   -4.5
        #   c - c   match       C   2   1   1   3.2
        #   c - d   mismatch    C   2   2   0   -4.5
        #
        # then we sort them by -LLR and distance:
        #
        #                                               returned?
        #   b - b   match       B   1   0   1   3.2     Yes
        #   c - c   match       C   2   1   1   3.2     Yes
        #   a - b   mismatch    A   0   0   0   -4.5    No (c=0 used)
        #   b - c   mismatch    B   1   1   0   -4.5    No (p=1 or c=1 used)
        #   c - d   mismatch    C   2   2   0   -4.5    No (p=2 used)
        #   a - c   mismatch    A   0   1   1   -4.5    No (c=1 used)
        #   b - d   mismatch    B   1   2   1   -4.5    No (p=1 used)
        #   a - d   mismatch    A   0   2   4   -4.5    Yes
        #   c - b   mismatch    C   2   0   4   -4.5    No (p=2 or c=0 used)
        result = self.compare([a, b, c], [b, c, d])
        # ... three matches (but one will be bad) and a correction
        self.assertEqual(len(result), 4)
        self._assert_direct_comparisons(
            result, ["dummy_match:B", "dummy_match:C", "dummy_mismatch:A"]
        )
        # - P(D|H) correction: +ln(p_u).
        # - P(D|¬H) correction: Bonferroni for 6 options minus the one for
        #   the correct order.
        self._assert_correction(result[-1], ln(self.P_U) - ln(5))

    def test_order_correct_with_duplicate_names_1(self) -> None:
        """
        Compare "A A" to "A A" in ordered fashion.

        Think of this as proband A_P1, A_P2 and candidate A_C1, A_C2.

        Should give a "correctly ordered" match, A_P1:A_C1 and A_P2:A_C2,
        with correction for P_O.

        Should not treat it as an incorrectly ordered match, A_P1:A_C2 and
        A_P2:A_C1, and apply a different correction for P_U etc.

        This might work without the "distance" sort in ComparisonInfo (it
        does, in fact), but that is a safety. See below for a test that
        does depend on that distance metric.
        """
        a = DummyLetterIdentifier("A")
        result = self.compare([a, a], [a, a])
        self.assertEqual(len(result), 3)
        self._assert_direct_comparisons(
            result, ["dummy_match:A", "dummy_match:A"]
        )
        self._assert_correction(result[2], ln(self.P_O))

    def test_order_correct_with_duplicate_names_2(self) -> None:
        """
        Compare "A B" to "B B" in ordered fashion.

        Think of this as proband A_P1, B_P2 and candidate B_C1, B_C2.

        We want this to give A_P1:B_C1 (mismatch) and B_P2:B_C2 (ordered
        match).

        It should not give A_P1:B_C2 (mismatch) and B_P2:B_C1 (unordered
        match).

        This does not work without the "distance" part of the sort in
        ComparisonInfo.
        """
        a = DummyLetterIdentifier("A")
        b = DummyLetterIdentifier("B")
        result = self.compare([a, b], [b, b])
        self.assertEqual(len(result), 3)
        # Matches come first (better LLR), then mismatches:
        self._assert_direct_comparisons(
            result, ["dummy_match:B", "dummy_mismatch:A"]
        )
        # Then corrections:
        self._assert_correction(result[2], ln(self.P_O))