# Source code for crate_anon.linkage.comparison

r"""
crate_anon/linkage/comparison.py

===============================================================================

    Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
    Created by Rudolf Cardinal (rnc1001@cam.ac.uk).

    This file is part of CRATE.

    CRATE is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    CRATE is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with CRATE. If not, see <https://www.gnu.org/licenses/>.

===============================================================================

**Comparison classes for linkage tools.**

These implement the maths without regard to the kind of identifier being
compared. Includes classes for full/partial matches, and a function to iterate
through a bunch of comparisons as part of a Bayesian probability calculation.
The hypothesis H throughout is that two people being compared are in fact the
same person.

"""

# =============================================================================
# Imports
# =============================================================================

from typing import Iterable, Optional

from cardinal_pythonlib.reprfunc import auto_repr

from crate_anon.linkage.helpers import (
    log_likelihood_ratio_from_p,
    log_posterior_odds_from_pdh_pdnh,
)
from crate_anon.linkage.constants import INFINITY, MINUS_INFINITY


# =============================================================================
# Simple person-related probability calculations
# =============================================================================


class Comparison:
    """
    Abstract base class for comparing two pieces of information and calculating
    the posterior probability of a person match.

    Derived classes must provide ``d_description``, ``p_d_given_h`` and
    ``p_d_given_not_h``; in this base class each of those raises
    :exc:`NotImplementedError`.

    This code must be fast, so avoid extraneous parameters.
    """

    def __init__(self) -> None:
        pass

    def __str__(self) -> str:
        """
        Returns a brief description.
        """
        return (
            f"{self.d_description} "
            f"[P(D|H)={self.p_d_given_h}, "
            f"P(D|¬H)={self.p_d_given_not_h}]"
        )

    def __repr__(self) -> str:
        """
        Returns whatever ``auto_repr`` produces for this object.
        """
        return auto_repr(self)

    @property
    def d_description(self) -> str:
        """
        A description of D, the data (e.g. "match" or "mismatch").
        """
        raise NotImplementedError("Implement in derived class!")

    @property
    def p_d_given_h(self) -> float:
        """
        Returns :math:`P(D | H)`, the probability of the observed data given
        the hypothesis of a match.
        """
        raise NotImplementedError("Implement in derived class!")

    @property
    def p_d_given_not_h(self) -> float:
        r"""
        Returns :math:`P(D | \neg H)`, the probability of the observed data
        given no match.
        """
        raise NotImplementedError("Implement in derived class!")

    @property
    def log_likelihood_ratio(self) -> float:
        r"""
        Returns the log likelihood ratio, as calculated by
        ``log_likelihood_ratio_from_p`` from :math:`P(D | H)` and
        :math:`P(D | \neg H)`.
        """
        return log_likelihood_ratio_from_p(
            self.p_d_given_h, self.p_d_given_not_h
        )

    def posterior_log_odds(self, prior_log_odds: float) -> float:
        """
        Returns the posterior log odds, given the prior log odds. Often
        overridden in derived classes for a faster version.

        Args:
            prior_log_odds:
                prior log odds that they're the same person

        Returns:
            float: posterior log odds, O(H | D)
        """
        # A Python-level shortcut was considered here, namely returning
        # MINUS_INFINITY straight away when p_d_given_h == 0 (since the
        # likelihood ratio is then 0 and P(H | D) must be 0). It is
        # deliberately not used: a Python shortcut is slower than a compiled
        # log.
        return log_posterior_odds_from_pdh_pdnh(
            log_prior_odds=prior_log_odds,
            p_d_given_h=self.p_d_given_h,
            p_d_given_not_h=self.p_d_given_not_h,
        )
class ImpossibleComparison(Comparison):
    """
    Special comparison to denote impossibility/failure, i.e. for when
    P(D | H) = 0, that doesn't bother with all the calculations involved in
    calculating a likelihood ratio of 0.

    Whatever the prior, :meth:`posterior_log_odds` returns ``MINUS_INFINITY``.
    """

    @property
    def d_description(self) -> str:
        """
        Fixed description naming this comparison type.
        """
        return "ImpossibleComparison"

    @property
    def p_d_given_h(self) -> float:
        """
        P(D | H); always 0 for an impossible comparison.
        """
        return 0

    @property
    def p_d_given_not_h(self) -> float:
        """
        P(D | ¬H); the value is unimportant here.
        """
        # Unimportant!
        return 1  # makes things "in principle" calculable

    def posterior_log_odds(self, prior_log_odds: float) -> float:
        """
        Returns ``MINUS_INFINITY`` regardless of ``prior_log_odds``.
        """
        # Nice and quick:
        return MINUS_INFINITY
class CertainComparison(Comparison):
    r"""
    Special comparison to denote certainty, i.e. for when
    :math:`P(D | \neg H) = 0` while :math:`P(D | H)` is non-zero, that doesn't
    bother with any likelihood ratio calculation.

    Whatever the prior, :meth:`posterior_log_odds` returns ``INFINITY``.
    """

    @property
    def d_description(self) -> str:
        """
        Fixed description naming this comparison type.
        """
        return "CertainComparison"

    @property
    def p_d_given_h(self) -> float:
        """
        P(D | H); the exact value is unimportant as long as it's not 0.
        """
        # Unimportant as long as it's not 0.
        return 1

    @property
    def p_d_given_not_h(self) -> float:
        r"""
        :math:`P(D | \neg H)`; zero. Not used by :meth:`posterior_log_odds`.
        """
        # Not used. But zero.
        # NOTE(review): with this at 0, the inherited log_likelihood_ratio
        # depends on how log_likelihood_ratio_from_p treats a zero
        # denominator -- confirm before relying on it.
        return 0

    def posterior_log_odds(self, prior_log_odds: float) -> float:
        """
        Returns ``INFINITY`` regardless of ``prior_log_odds``.
        """
        # Nice and quick:
        return INFINITY
class DirectComparison(Comparison):
    r"""
    Represents a comparison where the user supplies :math:`P(D | H)` and
    :math:`P(D | \neg H)` directly. This is the fastest real comparison.

    It precalculates the log likelihood ratio for speed; that way, our
    comparison can be re-used fast.
    """

    def __init__(
        self,
        p_d_given_same_person: float,
        p_d_given_diff_person: float,
        d_description: str = "?",
    ) -> None:
        r"""
        Args:
            p_d_given_same_person:
                :math:`P(D | H)`
            p_d_given_diff_person:
                :math:`P(D | \neg H)`
            d_description:
                description of D, the data; returned by the
                ``d_description`` property and shown by ``str()``
        """
        super().__init__()
        self._p_d_given_h = p_d_given_same_person
        self._p_d_given_not_h = p_d_given_diff_person
        # Calculated once, here, so that posterior_log_odds() is a single
        # addition.
        self._log_likelihood_ratio = log_likelihood_ratio_from_p(
            p_d_given_h=p_d_given_same_person,
            p_d_given_not_h=p_d_given_diff_person,
        )
        self._description = d_description

    def __str__(self) -> str:
        """
        Returns a brief description, including the precalculated log
        likelihood ratio.
        """
        return (
            "DirectComparison"
            f"[{self._description}, "
            f"P(D|H)={self.p_d_given_h}, "
            f"P(D|¬H)={self.p_d_given_not_h}, "
            f"log_likelihood_ratio={self._log_likelihood_ratio}]"
        )

    @property
    def d_description(self) -> str:
        """
        The description supplied at construction.
        """
        return self._description

    @property
    def p_d_given_h(self) -> float:
        """
        :math:`P(D | H)`, as supplied at construction.
        """
        return self._p_d_given_h

    @property
    def p_d_given_not_h(self) -> float:
        r"""
        :math:`P(D | \neg H)`, as supplied at construction.
        """
        return self._p_d_given_not_h

    @property
    def log_likelihood_ratio(self) -> float:
        """
        The log likelihood ratio precalculated at construction.
        """
        return self._log_likelihood_ratio

    def posterior_log_odds(self, prior_log_odds: float) -> float:
        """
        Returns the prior log odds plus the precalculated log likelihood
        ratio.
        """
        # Fast version.
        # (You can't use numba to compile a member function; the only
        # option is numba.jitclass() on the whole class. And making
        # DirectComparison a jitclass actually slowed things down.)
        return prior_log_odds + self._log_likelihood_ratio
class MatchNoMatchComparison(Comparison):
    """
    Represents a comparison when there can be a match or not.

    The purpose of this is to represent this choice CLEARLY. Code that produces
    one of these could equally produce one of two :class:`DirectComparison`
    objects, conditional upon ``match``, but this is often clearer.

    Not currently used in main code.
    """

    def __init__(
        self,
        match: bool,
        p_match_given_same_person: float,
        p_match_given_diff_person: float,
    ) -> None:
        r"""
        Args:
            match:
                D; is there a match?

            p_match_given_same_person:
                If match:
                :math:`P(D | H) = P(\text{match given same person}) = 1 - p_e`.
                If no match:
                :math:`P(D | H) = 1 - P(\text{match given same person}) = p_e`.

            p_match_given_diff_person:
                If match:
                :math:`P(D | \neg H) = P(\text{match given different person}) = p_f`.
                If no match:
                :math:`P(D | \neg H) = 1 - P(\text{match given different person}) = 1 - p_f`.
        """  # noqa
        super().__init__()
        self.match = match
        self.p_match_given_same_person = p_match_given_same_person
        self.p_match_given_diff_person = p_match_given_diff_person

    @property
    def d_description(self) -> str:
        """
        Returns "match" or "mismatch", according to ``match``.
        """
        return "match" if self.match else "mismatch"

    @property
    def p_d_given_h(self) -> float:
        """
        :math:`P(D | H)`: the match probability for the same person if there
        was a match, otherwise its complement.
        """
        if self.match:
            return self.p_match_given_same_person  # 1 - p_e
        else:
            return 1 - self.p_match_given_same_person  # p_e

    @property
    def p_d_given_not_h(self) -> float:
        r"""
        :math:`P(D | \neg H)`: the match probability for a different person if
        there was a match, otherwise its complement.
        """
        if self.match:
            return self.p_match_given_diff_person  # p_f
        else:
            return 1 - self.p_match_given_diff_person  # 1 - p_f
class FullPartialNoMatchComparison(Comparison):
    """
    Represents a comparison where there can be a full or a partial match.
    (If there is neither a full nor a partial match, the hypothesis is
    rejected.)

    Again, this is for clarity. Code that produces one of these could equally
    produce one of three :class:`DirectComparison` objects, conditional upon
    ``full_match`` and ``partial_match``, but this is generally much clearer.

    Not currently used in main code.
    """

    def __init__(
        self,
        full_match: bool,
        p_f: float,
        p_e: float,
        partial_match: bool,
        p_p: float,
    ) -> None:
        r"""
        Args:
            full_match:
                was there a full match?
            p_f:
                :math:`p_f = P(\text{full match} | \neg H)`
            p_e:
                :math:`p_e = P(\text{partial but not full match} | H)`
            partial_match:
                was there a partial match?
            p_p:
                :math:`p_p = P(\text{partial match} | \neg H)`

        Raises:
            AssertionError: if ``p_f > p_p`` (only when assertions are
                enabled).
        """
        super().__init__()
        # p_d_given_not_h uses p_p - p_f for a partial match, so p_p must not
        # be smaller than p_f.
        assert p_f <= p_p, f"p_p={p_p} < p_f={p_f}, but should have p_f <= p_p"
        self.full_match = full_match
        self.p_f = p_f
        self.p_e = p_e
        self.partial_match = partial_match
        self.p_p = p_p

    @property
    def d_description(self) -> str:
        """
        Returns "full match", "partial match" or "mismatch"; a full match
        takes precedence over a partial match.
        """
        if self.full_match:
            return "full match"
        elif self.partial_match:
            return "partial match"
        else:
            return "mismatch"

    @property
    def p_d_given_h(self) -> float:
        """
        :math:`P(D | H)`: ``1 - p_e`` for a full match, ``p_e`` for a partial
        match, and 0 for no match.
        """
        if self.full_match:
            return 1 - self.p_e
        elif self.partial_match:
            return self.p_e
        else:
            return 0

    @property
    def p_d_given_not_h(self) -> float:
        r"""
        :math:`P(D | \neg H)`: ``p_f`` for a full match, ``p_p - p_f`` for a
        partial match, and ``1 - p_p`` for no match.
        """
        if self.full_match:
            return self.p_f
        elif self.partial_match:
            return self.p_p - self.p_f
        else:
            return 1 - self.p_p  # IRRELEVANT since p_d_given_h == 0

    def posterior_log_odds(self, prior_log_odds: float) -> float:
        """
        Returns the posterior log odds, short-circuiting to
        ``MINUS_INFINITY`` when there is neither a full nor a partial match.
        """
        if not self.full_match and not self.partial_match:
            # No match.
            # Shortcut, since p_d_given_h is 0 and therefore LR is 0:
            return MINUS_INFINITY
        return super().posterior_log_odds(prior_log_odds)
class AdjustLogOddsComparison(Comparison):
    """
    Used to adjust log odds (via the log likelihood ratio) directly. See
    :func:`crate_anon.linkage.identifiers.gen_best_comparisons_unordered`.

    No individual probabilities are held, so the ``p_d_given_h`` and
    ``p_d_given_not_h`` properties raise :exc:`AssertionError`.
    """

    BAD_METHOD = "Bad method"  # message for the unsupported properties

    def __init__(
        self,
        log_odds_delta: float,
        description: str = "?",
    ) -> None:
        """
        Args:
            log_odds_delta:
                amount added to the prior log odds; stored as the log
                likelihood ratio
            description:
                description returned by ``d_description`` and shown by
                ``str()``
        """
        super().__init__()
        self._p_d_given_h = None
        self._p_d_given_not_h = None
        self._log_likelihood_ratio = log_odds_delta
        self._description = description

    def __str__(self) -> str:
        """
        Returns a brief description. (Overridden so that it does not touch
        the probability properties, which raise here.)
        """
        return (
            f"AdjustLogOddsComparison[{self._description}, "
            f"log_odds_delta={self._log_likelihood_ratio}]"
        )

    @property
    def d_description(self) -> str:
        """
        The description supplied at construction.
        """
        return self._description

    @property
    def p_d_given_h(self) -> float:
        """
        Not supported for this class; always raises.

        Raises:
            AssertionError: always
        """
        raise AssertionError(self.BAD_METHOD)

    @property
    def p_d_given_not_h(self) -> float:
        """
        Not supported for this class; always raises.

        Raises:
            AssertionError: always
        """
        raise AssertionError(self.BAD_METHOD)

    @property
    def log_likelihood_ratio(self) -> float:
        """
        The log odds delta supplied at construction.
        """
        return self._log_likelihood_ratio

    def posterior_log_odds(self, prior_log_odds: float) -> float:
        """
        Returns the prior log odds plus the log odds delta.
        """
        return prior_log_odds + self._log_likelihood_ratio
# =============================================================================
# The main Bayesian comparison point
# =============================================================================
def bayes_compare(
    log_odds: float,
    comparisons: Iterable[Optional[Comparison]],
) -> float:
    """
    Works through multiple comparisons and returns posterior log odds.
    Ignore comparisons that are ``None``.

    Each comparison's ``posterior_log_odds()`` is applied in turn, the output
    of one being the prior for the next. Processing stops early if the
    running value reaches ``MINUS_INFINITY``.

    Args:
        log_odds:
            prior log odds
        comparisons:
            an iterable of :class:`Comparison` objects (or ``None`` values,
            which are skipped)

    Returns:
        float: posterior log odds
    """
    # High speed function.
    # Fractionally faster to call the incoming parameter "log_odds" and not
    # assign it to a further variable here.
    # filter(None, ...) drops every falsy entry, not only None. The Comparison
    # classes in this module define neither __bool__ nor __len__, so their
    # instances are always truthy and only None is skipped.
    for comparison in filter(None, comparisons):
        log_odds = comparison.posterior_log_odds(log_odds)
        # If there is a realistic chance of hitting -∞, this saves time:
        if log_odds == MINUS_INFINITY:
            return MINUS_INFINITY
    # We could check for +∞ too, but that (via PerfectID) is done outside
    # the Bayesian process.
    return log_odds