r"""
crate_anon/linkage/people.py
===============================================================================
Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
Created by Rudolf Cardinal (rnc1001@cam.ac.uk).
This file is part of CRATE.
CRATE is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
CRATE is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with CRATE. If not, see <https://www.gnu.org/licenses/>.
===============================================================================
**People representations for fuzzy matching.**
"""
# =============================================================================
# Imports
# =============================================================================
from collections import defaultdict
import logging
from typing import (
Dict,
Generator,
Iterable,
List,
Optional,
Set,
)
from ordered_set import OrderedSet
from crate_anon.linkage.constants import INFINITY, MINUS_INFINITY
from crate_anon.linkage.matchconfig import MatchConfig
from crate_anon.linkage.matchresult import MatchResult
from crate_anon.linkage.person import Person
log = logging.getLogger(__name__)
# =============================================================================
# Exceptions
# =============================================================================
class DuplicateIDError(Exception):
    """
    Raised when a person being added to a collection has an identifier that
    is already in use there.

    In this module, :meth:`People.add_person` raises it for a duplicate
    ``local_id`` and for a duplicate "perfect" (person-unique) identifier.
    """
# =============================================================================
# People: a collection of Person objects
# =============================================================================
# Try staring at the word "people" for a while and watch it look odd...
class People:
    """
    Represents a group of people, and implements a shortlist.

    People are held in insertion order and are additionally indexed by

    - full and partial date of birth (DOB) keys, used by
      :meth:`gen_shortlist`;
    - "perfect" (person-unique) identifiers, used by
      :meth:`get_perfect_match`.

    All people in one collection must be of the same kind (all plaintext or
    all hashed), as reported by ``Person.is_plaintext()``.
    """

    def __init__(
        self,
        cfg: MatchConfig,
        person: Optional[Person] = None,
        people: Optional[Iterable[Person]] = None,
    ) -> None:
        """
        Creates a collection, optionally pre-populated.

        Args:
            cfg:
                the configuration object; stored and consulted when
                shortlisting and matching
            person:
                an optional single person to add
            people:
                optional people to add (after ``person``, if both are given)

        Raises:
            DuplicateIDError:
                if some people have duplicate ``local_id`` values or
                duplicate perfect IDs
            ValueError:
                if plaintext and hashed people are mixed
        """
        self.cfg = cfg
        self.people = []  # type: List[Person]
        # ... list is preferable to set, as we may slice it for parallel
        # processing, and it maintains order.

        # These may be plaintext or hashed DOB strings depending on our
        # people:
        self.dob_md_to_people = defaultdict(
            list
        )  # type: Dict[str, List[Person]]
        self.dob_yd_to_people = defaultdict(
            list
        )  # type: Dict[str, List[Person]]
        self.dob_ym_to_people = defaultdict(
            list
        )  # type: Dict[str, List[Person]]
        self.dob_ymd_to_people = defaultdict(
            list
        )  # type: Dict[str, List[Person]]

        # Maps perfect ID key (e.g. "nhsnum") -> ID value -> person.
        self.perfect_id_map = defaultdict(
            dict
        )  # type: Dict[str, Dict[str, Person]]

        self._known_local_ids = set()  # type: Set[str]
        # None until the first person is added; then True/False.
        self._people_are_plaintext = None  # type: Optional[bool]

        if person:
            self.add_person(person)
        if people:
            self.add_people(people)

    def add_person(self, person: Person) -> None:
        """
        Adds a single person.

        All checks are performed before any internal state is modified, so a
        rejected person leaves the collection exactly as it was.

        Args:
            person: the :class:`Person` to add

        Raises:
            ValueError:
                if the person is plaintext but existing people are hashed,
                or vice versa
            DuplicateIDError:
                if the person has a ``local_id`` value, or a perfect ID
                value, already in our collection
        """
        # ---------------------------------------------------------------------
        # Phase 1: validate (no mutation here).
        # ---------------------------------------------------------------------
        # Plaintext or hashed?
        is_plaintext = person.is_plaintext()
        is_first_person = not self.people
        if not is_first_person and is_plaintext != self._people_are_plaintext:
            new = Person.plain_or_hashed_txt(is_plaintext)
            old = Person.plain_or_hashed_txt(self._people_are_plaintext)
            raise ValueError(
                f"Trying to add a {new} person but all existing people "
                f"are {old}"
            )

        # Check local ID not duplicated.
        local_id = person.local_id
        if local_id in self._known_local_ids:
            raise DuplicateIDError(
                f"Person with duplicate local ID {local_id!r}"
            )

        # Check perfect IDs not duplicated.
        perfect_ids = list(person.perfect_id.identifiers.items())
        for key, value in perfect_ids:
            # e.g. key = "nhsnum", value = some NHS number as a string, or a
            # hashed equivalent.
            # Use .get() so that merely checking does not create an entry.
            id_to_person = self.perfect_id_map.get(key)  # e.g. for NHS#
            if id_to_person and value in id_to_person:
                raise DuplicateIDError(
                    f"Person with duplicate perfect ID {key} = {value!r}"
                )

        # ---------------------------------------------------------------------
        # Phase 2: commit (nothing below raises one of the errors above).
        # ---------------------------------------------------------------------
        if is_first_person:
            self._people_are_plaintext = is_plaintext
        self._known_local_ids.add(local_id)
        for key, value in perfect_ids:
            self.perfect_id_map[key][value] = person

        # Add to DOB maps.
        dob = person.dob
        if dob:
            self.dob_md_to_people[dob.dob_md].append(person)
            self.dob_yd_to_people[dob.dob_yd].append(person)
            self.dob_ym_to_people[dob.dob_ym].append(person)
            self.dob_ymd_to_people[dob.dob_str].append(person)
        else:
            # DOB absent.
            # We do need a way to retrieve people with no DOB.
            # We use a blank string key for this:
            self.dob_ymd_to_people[""].append(person)
            # We do not need to add to the partial DOB maps. See
            # gen_shortlist().

        # Add the person.
        self.people.append(person)

    def add_people(self, people: Iterable[Person]) -> None:
        """
        Adds multiple people, in iteration order, via :meth:`add_person`.

        If one person is rejected, those added before it remain in the
        collection.

        Raises:
            DuplicateIDError:
                if some people have duplicate ``local_id`` values (or
                perfect IDs) with respect to those we already know
            ValueError:
                if plaintext and hashed people are mixed
        """
        for person in people:
            self.add_person(person)

    def size(self) -> int:
        """
        Returns the number of people in this object.
        """
        return len(self.people)

    def ensure_valid_as_probands(self) -> None:
        """
        Ensures all people have sufficient information to act as a proband,
        or raises :exc:`ValueError`.

        Delegates to ``Person.ensure_valid_as_proband()`` for each person.
        """
        log.info("Validating probands...")
        for p in self.people:
            p.ensure_valid_as_proband()
        log.debug("... OK")

    def ensure_valid_as_sample(self) -> None:
        """
        Ensures all people have sufficient information to act as a candidate
        from a sample, or raises :exc:`ValueError`.

        Delegates to ``Person.ensure_valid_as_candidate()`` for each person.
        """
        log.info("Validating sample...")
        for p in self.people:
            p.ensure_valid_as_candidate()
        log.debug("... OK")

    def get_perfect_match(self, proband: Person) -> Optional[Person]:
        """
        Returns the first person who matches on a perfect (person-unique) ID,
        or ``None``.

        The proband's perfect ID keys are translated via
        ``cfg.remap_perfect_id_key()`` before lookup. This is a pure lookup:
        it does not modify the perfect ID map.

        Args:
            proband: a :class:`Person`
        """
        remap = self.cfg.remap_perfect_id_key
        perfect_id_map = self.perfect_id_map
        for key, value in proband.perfect_id.identifiers.items():
            # .get() rather than [] so that unknown keys do not create empty
            # entries in the defaultdict.
            id_to_person = perfect_id_map.get(remap(key))
            if id_to_person is None:
                continue
            winner = id_to_person.get(value)
            if winner:
                return winner
        return None

    def gen_shortlist(self, proband: Person) -> Generator[Person, None, None]:
        """
        Generates a shortlist of potential candidates for fuzzy matching (e.g.
        by restriction to same/similar dates of birth -- or with no such
        restriction, if preferred).

        Args:
            proband: a :class:`Person`

        Yields:
            candidate :class:`Person` objects from this collection, each at
            most once, in a consistent order
        """
        # A high-speed function.
        cfg = self.cfg
        dob = proband.dob

        # 2023-02-28 update for referees:
        # - Allow comparison where the DOB is missing.
        # - Of necessity, probands with no DOBs must be compared to all
        #   candidates.
        # - Likewise, if we permit a complete DOB mismatch (where DOBs are
        #   present), we must compare to all candidates.
        if cfg.complete_dob_mismatch_allowed or not dob:
            # No shortlisting; everyone's a candidate. Slow.
            # self.people is a list, so order is consistent and matches the
            # input.
            yield from self.people
            return

        # Implement the shortlist by DOB.
        # Most efficient to let set operations determine uniqueness, then
        # iterate through the set.
        # We use an OrderedSet to be sure of consistency; the precise
        # ordering is as below (e.g. people with the same DOB, then those
        # with the partial matches as shown below). Within each category,
        # the ordering will be as the input. (Thus, if configured for
        # duplicate detection, which entails identical DOBs, the earliest
        # winner will always be the first in the input.)
        #
        # Lookups use .get() with an empty default: indexing the
        # defaultdicts directly would insert a new empty list for every
        # previously unseen proband DOB.
        no_people = ()

        # First, exact matches:
        shortlist = OrderedSet(
            self.dob_ymd_to_people.get(dob.dob_str, no_people)
        )

        # Now, we'll slow it all down with partial matches:
        if cfg.partial_dob_mismatch_allowed:
            shortlist.update(self.dob_md_to_people.get(dob.dob_md, no_people))
            shortlist.update(self.dob_yd_to_people.get(dob.dob_yd, no_people))
            shortlist.update(self.dob_ym_to_people.get(dob.dob_ym, no_people))

        # But also, we must include any candidates who have no DOB.
        # (We already know that our proband has a DOB, or we would have
        # returned above.)
        shortlist.update(self.dob_ymd_to_people.get("", no_people))

        yield from shortlist

    def get_unique_match_detailed(self, proband: Person) -> MatchResult:
        """
        Finds the best and second-best candidates for the proband and decides
        whether the best one is a unique match (as defined by the probability
        settings in ``cfg``).

        A perfect ID match, if there is one, is taken as the best candidate
        with infinite log odds and fuzzy matching is skipped. Otherwise, all
        shortlisted candidates are scored with ``proband.log_odds_same()``.

        Args:
            proband: a :class:`Person`

        Returns:
            a :class:`MatchResult` holding the proband, the best and
            second-best candidates and their log odds. Its ``winner``
            attribute is set to the best candidate only if that candidate
            (a) exists, (b) reaches ``cfg.min_log_odds_for_match``, and
            (c) beats the runner-up by at least
            ``cfg.exceeds_next_best_log_odds``; otherwise ``winner`` is left
            at its :class:`MatchResult` default (presumably ``None`` --
            confirm against :class:`MatchResult`).
        """
        # 2020-04-25: Do this in one pass.
        # A bit like
        # https://www.geeksforgeeks.org/python-program-to-find-second-largest-number-in-a-list/  # noqa
        # ... but modified, as that fails to deal with joint winners
        # ... and it's not a super algorithm anyway.

        # Step 1. Scan everything in a single pass, establishing the best
        # candidate and the runner-up.
        cfg = self.cfg
        best_log_odds = MINUS_INFINITY
        second_best_log_odds = MINUS_INFINITY
        second_best_candidate = None  # type: Optional[Person]

        best_candidate = self.get_perfect_match(proband)
        if best_candidate:
            best_log_odds = INFINITY
        else:
            # Fuzzy matching
            proband_log_odds_same = proband.log_odds_same  # for speed
            for candidate in self.gen_shortlist(proband):
                log_odds = proband_log_odds_same(candidate)
                if log_odds > best_log_odds:
                    # New leader; the old leader becomes the runner-up.
                    second_best_log_odds = best_log_odds
                    second_best_candidate = best_candidate
                    best_log_odds = log_odds
                    best_candidate = candidate
                elif log_odds > second_best_log_odds:
                    # Includes the case of a tie with the current leader.
                    second_best_log_odds = log_odds
                    second_best_candidate = candidate
                # If log_odds == best_log_odds, we don't change the winner,
                # i.e. the first-encountered candidate continues in the lead.
                # The shortlist is generated in a consistent order.

        result = MatchResult(
            best_log_odds=best_log_odds,
            second_best_log_odds=second_best_log_odds,
            best_candidate=best_candidate,
            second_best_candidate=second_best_candidate,
            proband=proband,
        )

        # Step 2. Is there a winner?
        if (
            best_candidate
            and best_log_odds >= cfg.min_log_odds_for_match
            and best_log_odds
            >= (second_best_log_odds + cfg.exceeds_next_best_log_odds)
        ):
            # (a) There needs to be a best candidate.
            # (b) The best needs to be good enough.
            # (c) The best must beat the runner-up by a sufficient margin.
            result.winner = best_candidate

        return result

    def get_unique_match(self, proband: Person) -> Optional[Person]:
        """
        Returns a single person matching the proband, or ``None`` if there is
        no match (as defined by the probability settings in ``cfg``).

        Args:
            proband: a :class:`Person`

        Returns:
            the winner (a :class:`Person`) or ``None``; this is the
            ``winner`` attribute of the result from
            :meth:`get_unique_match_detailed`
        """
        result = self.get_unique_match_detailed(proband)
        return result.winner

    def hashed(self) -> "People":
        """
        Returns a hashed version of itself: a new collection, with the same
        config, built from ``Person.hashed()`` of each person.
        """
        return People(cfg=self.cfg, people=[p.hashed() for p in self.people])

    def copy(self) -> "People":
        """
        Returns a copy of itself: a new collection, with the same config
        object, built from ``Person.copy()`` of each person.
        """
        return People(cfg=self.cfg, people=[p.copy() for p in self.people])