# Source code for crate_anon.linkage.comparison

r"""
crate_anon/linkage/comparison.py

===============================================================================

    Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
    Created by Rudolf Cardinal (rnc1001@cam.ac.uk).

    This file is part of CRATE.

    CRATE is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    CRATE is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with CRATE. If not, see <https://www.gnu.org/licenses/>.

===============================================================================

**Comparison classes for linkage tools.**

These implement the maths without regard to the kind of identifier being
compared. Includes classes for full/partial matches, and a function to iterate
through a bunch of comparisons as part of a Bayesian probability calculation.
The hypothesis H throughout is that two people being compared are in fact the
same person.

"""

# =============================================================================
# Imports
# =============================================================================

from typing import Iterable, Optional

from cardinal_pythonlib.reprfunc import auto_repr

from crate_anon.linkage.helpers import (
    log_likelihood_ratio_from_p,
    log_posterior_odds_from_pdh_pdnh,
)
from crate_anon.linkage.constants import INFINITY, MINUS_INFINITY


# =============================================================================
# Simple person-related probability calculations
# =============================================================================


class Comparison:
    """
    Abstract base class for comparing two pieces of information and calculating
    the posterior probability of a person match.

    Derived classes must implement :attr:`d_description`, :attr:`p_d_given_h`
    and :attr:`p_d_given_not_h`; the log likelihood ratio and the posterior log
    odds are calculated from those here (and may be overridden for speed).

    This code must be fast, so avoid extraneous parameters.
    """

    def __init__(self) -> None:
        pass

    def __str__(self) -> str:
        """
        Returns a brief description.
        """
        return (
            f"{self.d_description} "
            f"[P(D|H)={self.p_d_given_h}, "
            f"P(D|¬H)={self.p_d_given_not_h}]"
        )

    def __repr__(self) -> str:
        return auto_repr(self)

    @property
    def d_description(self) -> str:
        """
        A description of D, the data (e.g. "match" or "mismatch").
        """
        raise NotImplementedError("Implement in derived class!")

    @property
    def p_d_given_h(self) -> float:
        """
        Returns :math:`P(D | H)`, the probability of the observed data given
        the hypothesis of a match.
        """
        raise NotImplementedError("Implement in derived class!")

    @property
    def p_d_given_not_h(self) -> float:
        r"""
        Returns :math:`P(D | \neg H)`, the probability of the observed data
        given no match.
        """
        raise NotImplementedError("Implement in derived class!")

    @property
    def log_likelihood_ratio(self) -> float:
        r"""
        Returns the log likelihood ratio for this comparison, obtained by
        passing :math:`P(D | H)` and :math:`P(D | \neg H)` to
        :func:`log_likelihood_ratio_from_p`.
        """
        return log_likelihood_ratio_from_p(
            self.p_d_given_h, self.p_d_given_not_h
        )

    def posterior_log_odds(self, prior_log_odds: float) -> float:
        """
        Returns the posterior log odds, given the prior log odds. Often
        overridden in derived classes for a faster version.

        Args:
            prior_log_odds:
                prior log odds that they're the same person

        Returns:
            float: posterior log odds that they're the same person, given the
            data D
        """
        # A shortcut was considered here: if p_d_given_h == 0, the likelihood
        # ratio is 0, so P(H | D) must be 0 and we could return MINUS_INFINITY
        # directly. It is deliberately not used: a Python shortcut is slower
        # than a compiled log.
        return log_posterior_odds_from_pdh_pdnh(
            log_prior_odds=prior_log_odds,
            p_d_given_h=self.p_d_given_h,
            p_d_given_not_h=self.p_d_given_not_h,
        )


class ImpossibleComparison(Comparison):
    """
    Special comparison to denote impossibility/failure, i.e. for when P(D | H)
    = 0, that doesn't bother with all the calculations involved in calculating
    a likelihood ratio of 0: the posterior log odds are always -∞, whatever
    the prior.
    """

    @property
    def d_description(self) -> str:
        return "ImpossibleComparison"

    @property
    def p_d_given_h(self) -> float:
        # The defining feature: the data are impossible if H is true.
        return 0

    @property
    def p_d_given_not_h(self) -> float:
        # Unimportant! Not used by posterior_log_odds() below.
        return 1  # makes things "in principle" calculable

    def posterior_log_odds(self, prior_log_odds: float) -> float:
        """
        Returns minus infinity, ignoring ``prior_log_odds``.
        """
        # Nice and quick:
        return MINUS_INFINITY


class CertainComparison(Comparison):
    r"""
    Special comparison to denote certainty, i.e. for when
    :math:`P(D | \neg H) = 0` (and :math:`P(D | H) > 0`), that doesn't bother
    with all the calculations involved in calculating an infinite likelihood
    ratio: the posterior log odds are always +∞, whatever the prior.
    """

    @property
    def d_description(self) -> str:
        return "CertainComparison"

    @property
    def p_d_given_h(self) -> float:
        # Unimportant as long as it's not 0.
        return 1

    @property
    def p_d_given_not_h(self) -> float:
        # Not used by posterior_log_odds() below. But zero: the data are
        # impossible if H is false.
        return 0

    def posterior_log_odds(self, prior_log_odds: float) -> float:
        """
        Returns plus infinity, ignoring ``prior_log_odds``.
        """
        # Nice and quick:
        return INFINITY


class DirectComparison(Comparison):
    r"""
    Represents a comparison where the user supplies :math:`P(D | H)` and
    :math:`P(D | \neg H)` directly. This is the fastest real comparison. It
    precalculates the log likelihood ratio for speed; that way, our comparison
    can be re-used fast.
    """

    def __init__(
        self,
        p_d_given_same_person: float,
        p_d_given_diff_person: float,
        d_description: str = "?",
    ) -> None:
        r"""
        Args:
            p_d_given_same_person: :math:`P(D | H)`
            p_d_given_diff_person: :math:`P(D | \neg H)`
            d_description: description of D, the data (used by
                :meth:`__str__` and the ``d_description`` property)
        """
        super().__init__()
        self._p_d_given_h = p_d_given_same_person
        self._p_d_given_not_h = p_d_given_diff_person
        # Calculated once, here, so that posterior_log_odds() is just an
        # addition.
        self._log_likelihood_ratio = log_likelihood_ratio_from_p(
            p_d_given_h=p_d_given_same_person,
            p_d_given_not_h=p_d_given_diff_person,
        )
        self._description = d_description

    def __str__(self) -> str:
        """
        Returns a brief description, including the cached log likelihood
        ratio.
        """
        return (
            f"DirectComparison"
            f"[{self._description}, "
            f"P(D|H)={self.p_d_given_h}, "
            f"P(D|¬H)={self.p_d_given_not_h}, "
            f"log_likelihood_ratio={self._log_likelihood_ratio}]"
        )

    @property
    def d_description(self) -> str:
        return self._description

    @property
    def p_d_given_h(self) -> float:
        return self._p_d_given_h

    @property
    def p_d_given_not_h(self) -> float:
        return self._p_d_given_not_h

    @property
    def log_likelihood_ratio(self) -> float:
        # The value precalculated in __init__().
        return self._log_likelihood_ratio

    def posterior_log_odds(self, prior_log_odds: float) -> float:
        """
        Returns the posterior log odds: the prior log odds plus the
        precalculated log likelihood ratio.
        """
        # Fast version.
        # (You can't use numba to compile a member function; the only
        # option is numba.jitclass() on the whole class. And making
        # DirectComparison a jitclass actually slowed things down.)
        return prior_log_odds + self._log_likelihood_ratio


class MatchNoMatchComparison(Comparison):
    """
    Represents a comparison when there can be a match or not.

    The purpose of this is to represent this choice CLEARLY. Code that produces
    one of these could equally produce one of two :class:`DirectComparison`
    objects, conditional upon ``match``, but this is often clearer.

    Not currently used in main code.
    """

    def __init__(
        self,
        match: bool,
        p_match_given_same_person: float,
        p_match_given_diff_person: float,
    ) -> None:
        r"""
        Args:
            match:
                D; is there a match?
            p_match_given_same_person:
                If match:
                :math:`P(D | H) = P(\text{match given same person}) = 1 - p_e`.
                If no match:
                :math:`P(D | H) = 1 - P(\text{match given same person}) = p_e`.
            p_match_given_diff_person:
                If match:
                :math:`P(D | \neg H) = P(\text{match given different person}) = p_f`.
                If no match:
                :math:`P(D | \neg H) = 1 - P(\text{match given different person}) = 1 - p_f`.
        """  # noqa
        super().__init__()
        self.match = match
        self.p_match_given_same_person = p_match_given_same_person
        self.p_match_given_diff_person = p_match_given_diff_person

    @property
    def d_description(self) -> str:
        return "match" if self.match else "mismatch"

    @property
    def p_d_given_h(self) -> float:
        r"""
        Returns :math:`P(D | H)`: the probability of a match given the same
        person if a match was observed, or its complement otherwise.
        """
        if self.match:
            return self.p_match_given_same_person  # 1 - p_e
        else:
            return 1 - self.p_match_given_same_person  # p_e

    @property
    def p_d_given_not_h(self) -> float:
        r"""
        Returns :math:`P(D | \neg H)`: the probability of a match given
        different people if a match was observed, or its complement otherwise.
        """
        if self.match:
            return self.p_match_given_diff_person  # p_f
        else:
            return 1 - self.p_match_given_diff_person  # 1 - p_f


class FullPartialNoMatchComparison(Comparison):
    """
    Represents a comparison where there can be a full or a partial match.
    (If there is neither a full nor a partial match, the hypothesis is
    rejected.)

    Again, this is for clarity. Code that produces one of these could equally
    produce one of three :class:`DirectComparison` objects, conditional upon
    ``full_match`` and ``partial_match``, but this is generally much clearer.

    Not currently used in main code.
    """

    def __init__(
        self,
        full_match: bool,
        p_f: float,
        p_e: float,
        partial_match: bool,
        p_p: float,
    ) -> None:
        r"""
        Args:
            full_match:
                was there a full match?
            p_f:
                :math:`p_f = P(\text{full match} | \neg H)`
            p_e:
                :math:`p_e = P(\text{partial but not full match} | H)`
            partial_match:
                was there a partial match?
            p_p:
                :math:`p_p = P(\text{partial match} | \neg H)`

        Raises:
            AssertionError: if ``p_f > p_p``
        """
        super().__init__()
        # p_d_given_not_h uses p_p - p_f for a partial match, which must not
        # be negative.
        assert p_f <= p_p, f"p_p={p_p} < p_f={p_f}, but should have p_f <= p_p"
        self.full_match = full_match
        self.p_f = p_f
        self.p_e = p_e
        self.partial_match = partial_match
        self.p_p = p_p

    @property
    def d_description(self) -> str:
        # A full match takes priority over a partial match.
        if self.full_match:
            return "full match"
        elif self.partial_match:
            return "partial match"
        else:
            return "mismatch"

    @property
    def p_d_given_h(self) -> float:
        r"""
        Returns :math:`P(D | H)`: :math:`1 - p_e` for a full match,
        :math:`p_e` for a partial match, and 0 for no match.
        """
        if self.full_match:
            return 1 - self.p_e
        elif self.partial_match:
            return self.p_e
        else:
            return 0

    @property
    def p_d_given_not_h(self) -> float:
        r"""
        Returns :math:`P(D | \neg H)`: :math:`p_f` for a full match,
        :math:`p_p - p_f` for a partial match, and :math:`1 - p_p` for no
        match.
        """
        if self.full_match:
            return self.p_f
        elif self.partial_match:
            return self.p_p - self.p_f
        else:
            return 1 - self.p_p  # IRRELEVANT since p_d_given_h == 0

    def posterior_log_odds(self, prior_log_odds: float) -> float:
        """
        Returns the posterior log odds, given the prior log odds: minus
        infinity if there was neither a full nor a partial match, and the
        base class calculation otherwise.
        """
        if not self.full_match and not self.partial_match:
            # No match.
            # Shortcut, since p_d_given_h is 0 and therefore LR is 0:
            return MINUS_INFINITY
        return super().posterior_log_odds(prior_log_odds)


class AdjustLogOddsComparison(Comparison):
    """
    Used to adjust log odds (via the log likelihood ratio) directly. See
    :func:`crate_anon.linkage.identifiers.gen_best_comparisons_unordered`.

    Only the log likelihood ratio is known for this comparison, so the
    probability properties :attr:`p_d_given_h` and :attr:`p_d_given_not_h`
    must not be used; they raise :exc:`AssertionError`.
    """

    BAD_METHOD = "Bad method"

    def __init__(
        self,
        log_odds_delta: float,
        description: str = "?",
    ) -> None:
        """
        Args:
            log_odds_delta:
                the amount to add to the log odds (stored as this comparison's
                log likelihood ratio)
            description:
                description of D, the data
        """
        super().__init__()
        # No probabilities are available for this kind of comparison.
        self._p_d_given_h = None
        self._p_d_given_not_h = None
        self._log_likelihood_ratio = log_odds_delta
        self._description = description

    def __str__(self) -> str:
        # Overridden because the base class version reads the probability
        # properties, which raise for this class.
        return (
            f"AdjustLogOddsComparison[{self._description}, "
            f"log_odds_delta={self._log_likelihood_ratio}]"
        )

    @property
    def d_description(self) -> str:
        return self._description

    @property
    def p_d_given_h(self) -> float:
        raise AssertionError(self.BAD_METHOD)

    @property
    def p_d_given_not_h(self) -> float:
        raise AssertionError(self.BAD_METHOD)

    @property
    def log_likelihood_ratio(self) -> float:
        return self._log_likelihood_ratio

    def posterior_log_odds(self, prior_log_odds: float) -> float:
        """
        Returns the posterior log odds: the prior log odds plus the log odds
        delta supplied at construction.
        """
        return prior_log_odds + self._log_likelihood_ratio


# =============================================================================
# The main Bayesian comparison point
# =============================================================================


def bayes_compare(
    log_odds: float,
    comparisons: Iterable[Optional[Comparison]],
) -> float:
    """
    Works through multiple comparisons and returns posterior log odds.
    Ignore comparisons that are ``None``.

    Each comparison's :meth:`Comparison.posterior_log_odds` is applied in turn,
    with the result becoming the prior for the next comparison. Stops early,
    returning minus infinity, as soon as the running log odds reach minus
    infinity.

    Args:
        log_odds: prior log odds
        comparisons: an iterable of :class:`Comparison` objects (or ``None``
            values, which are skipped)

    Returns:
        float: posterior log odds
    """
    # High speed function.
    # Fractionally faster to call the incoming parameter "log_odds" and not
    # assign it to a further variable here.
    for comparison in filter(None, comparisons):  # skips None (falsy) entries
        log_odds = comparison.posterior_log_odds(log_odds)
        # If there is a realistic chance of hitting -∞, this saves time:
        if log_odds == MINUS_INFINITY:
            return MINUS_INFINITY
        # We could check for +∞ too, but that (via PerfectID) is done outside
        # the Bayesian process.
    return log_odds