# Source code for crate_anon.nlp_manager.regex_parser

"""
crate_anon/nlp_manager/regex_parser.py

===============================================================================

    Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
    Created by Rudolf Cardinal (rnc1001@cam.ac.uk).

    This file is part of CRATE.

    CRATE is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    CRATE is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with CRATE. If not, see <https://www.gnu.org/licenses/>.

===============================================================================

**Shared elements for regex-based NLP work.**

"""

from abc import abstractmethod, ABC
import logging
from typing import Any, Dict, Generator, List, Optional, Tuple

from sqlalchemy import Column, Integer, Float, String, Text

from crate_anon.common.regex_helpers import (
    LEFT_BRACKET as LB,
    RIGHT_BRACKET as RB,
)
from crate_anon.nlp_manager.constants import (
    MAX_SQL_FIELD_LEN,
    ProcessorConfigKeys,
    SqlTypeDbIdentifier,
)
from crate_anon.nlp_manager.base_nlp_parser import BaseNlpParser
from crate_anon.nlp_manager.nlp_definition import NlpDefinition
from crate_anon.nlp_manager.number import to_float, to_pos_float
from crate_anon.nlp_manager.regex_func import (
    compile_regex,
    compile_regex_dict,
    get_regex_dict_match,
)
from crate_anon.nlp_manager.regex_numbers import (
    SIGNED_FLOAT,
    IGNORESIGN_INTEGER,
)
from crate_anon.nlp_manager.regex_units import (
    OUT_OF_SEPARATOR,
    SCORE,
)

log = logging.getLogger(__name__)


# =============================================================================
# Generic entities
# =============================================================================

# -----------------------------------------------------------------------------
# Blood results
# -----------------------------------------------------------------------------

# Regex fragment: zero or more pieces of "junk" that may sit between a test
# name and its value (separators, abnormality flags, dashes). It carries its
# own "*" quantifier, so it can be dropped into a larger pattern as-is.
# NOTE(review): the layout whitespace and "#" comments inside the pattern are
# only ignorable in verbose mode -- presumably the compile helper enables
# that; confirm.
OPTIONAL_RESULTS_IGNORABLES = r"""
    (?:  # OPTIONAL_RESULTS_IGNORABLES
        \s | \| | \:          # whitespace, bar, colon
        | \bHH?\b | \(HH?\)   # H/HH at a word boundary; (H)/(HH)
        | \bLL?\b | \(LL?\)   # L/LL etc.
        | \* | \(\*\)         # *, (*)
        | — | --              # em dash, double hyphen-minus
        | –\s+ | -\s+ | ‐\s+  # en dash/hyphen-minus/Unicode hyphen; whitespace
    )*                        # ... any of those, repeated 0 or more times
"""
# - you often get | characters when people copy/paste tables
# - blood test abnormality markers can look like e.g.
#       17 (H), 17 (*), 17 HH
# Re parentheses:
# - you can also see things like "CRP (5)"
#   ... but we'll handle that
# - However, if there's a right parenthesis only, that's less good, e.g.
#   "Present: Nicola Adams (NA). 1.0. Minutes of the last meeting."
#   ... which we don't want to be interpreted as "sodium 1.0".
#   HOW BEST TO DO THIS?
# - https://stackoverflow.com/questions/546433/regular-expression-to-match-outer-brackets  # noqa
#   https://stackoverflow.com/questions/7898310/using-regex-to-balance-match-parenthesis  # noqa
# - ... simplest is perhaps: base ignorables, or those with brackets, as above
# - ... even better than a nested thing is just a list of alternatives

# Regex fragment: optional "POC" suffix (optionally preceded by a comma, and
# always by whitespace). Carries its own "?" quantifier.
OPTIONAL_POC = r"""
    (?: ,? \s+ POC )?   # OPTIONAL_POC: point-of-care testing, "[,] POC"
"""
# ... e.g. "Glucose, POC"; "Potassium, POC".
# Seen in CUH for
#
#     sodium, POC
#     potassium, POC
#     creatinine, POC
#     urea, POC
#     glucose, POC
#     lactate, POC
#     bilirubin, POC
#     HCT, POC
#     alkaline phosphatase, POC
#     alanine transferase, POC
#
#     HGB, POC
#     WBC, POC
#     PLT, POC
#     MCV, POC
#     MCH, POC
#     neutrophil count, POC
#     lymphocyte count, POC

# -----------------------------------------------------------------------------
# Tense indicators
# -----------------------------------------------------------------------------

# Words recognized in text as tense indicators.
IS = "is"
WAS = "was"
# Regex fragment matching either tense word, delimited by word boundaries.
# NOTE(review): the spaces in the pattern assume verbose-mode compilation.
TENSE_INDICATOR = rf"(?: \b {IS} \b | \b {WAS} \b )"

# Standardized result values; see MAX_TENSE_TEXT_LENGTH
PAST = "past"
PRESENT = "present"
EVER = "ever"  # e.g. for "never"

# Maps each tense word (used as a regex string, handed to compile_regex_dict)
# to its standardized tense value.
TENSE_LOOKUP = compile_regex_dict(
    {
        IS: PRESENT,
        WAS: PAST,
    }
)

# -----------------------------------------------------------------------------
# Mathematical relations
# -----------------------------------------------------------------------------
# ... don't use unnamed groups here; EQ is also used as a return value

# Regex fragments for each mathematical relation, in symbol and word form.
# NOTE(review): the spaces in these patterns assume verbose-mode compilation.
LT = r"(?: < | less \s+ than | under )"  # less than
LE = "<="  # less than or equal to (symbol only)
EQ = r"(?: = | equals | equal \s+ to )"  # equal to
GE = ">="  # greater than or equal to (symbol only)
GT = r"(?: > | (?:more|greater) \s+ than | over )"  # greater than
# OF = "\b of \b"  # as in: "a BMI of 30"... but too likely to be mistaken for a target?  # noqa

# Any one relation, as a single non-capturing alternation.
RELATION = rf"(?: {LE} | {LT} | {EQ} | {GE} | {GT} )"
# ... ORDER MATTERS: greedier things first, i.e.
# - LE before LT
# - GE before GT

RELATION_LOOKUP = compile_regex_dict(
    {
        # To standardize the output, so (for example) "=" and "equals" can both
        # map to "=".
        #
        # ORDER MATTERS here as well (dicts preserve insertion order): the
        # two-character operators come first. The LT pattern matches the "<"
        # at the start of "<=", and the EQ pattern matches the "=" inside
        # "<="/">=", so a lookup that returns the first matching entry
        # without requiring a full match would otherwise standardize "<=" to
        # "<". NOTE(review): this assumes the lookup helper tries entries in
        # dict order; it is harmless if the helper requires a full match.
        LE: "<=",
        GE: ">=",
        LT: "<",
        EQ: "=",
        GT: ">",
    }
)

# -----------------------------------------------------------------------------
# Punctuation
# -----------------------------------------------------------------------------

# Regex character class matching one apostrophe character, in either form.
APOSTROPHE = "['’]"  # ASCII apostrophe; right single quote (U+2019)


# =============================================================================
# Regex assembly functions
# =============================================================================


# =============================================================================
# Functions to handle processed data
# =============================================================================


def common_tense(
    tense_text: Optional[str], relation_text: Optional[str]
) -> Tuple[Optional[str], Optional[str]]:
    """
    Takes strings potentially representing "tense" and "equality" concepts
    and unifies them.

    - Used, for example, to help impute that "CRP was 72" means that relation
      was EQ in the PAST, etc.
    - The tense is looked up from ``tense_text`` if that is non-empty;
      otherwise from ``relation_text`` (if that is non-empty).

    Args:
        tense_text: putative tense information
        relation_text: putative relationship (equals, less than, etc.)

    Returns:
         tuple: ``tense, relation``. ``tense`` is ``None`` if neither text
         is supplied. For ``relation``, ``"="`` is passed to the lookup as
         its third argument (presumably the default returned when nothing
         matches -- confirm against ``get_regex_dict_match``).
    """
    tense = None
    # Prefer the explicit tense text; fall back to the relation text.
    tense_source = tense_text or relation_text
    if tense_source:
        _, tense = get_regex_dict_match(tense_source, TENSE_LOOKUP)

    _, relation = get_regex_dict_match(relation_text, RELATION_LOOKUP, "=")

    return tense, relation


# =============================================================================
# Constants for generic processors
# =============================================================================

# Field (column) names used in parser output: as Column names in the
# destination table definition, and as keys of each result dictionary.
FN_VARIABLE_NAME = "variable_name"
FN_CONTENT = "_content"
FN_START = "_start"
FN_END = "_end"
FN_VARIABLE_TEXT = "variable_text"
FN_RELATION_TEXT = "relation_text"
FN_RELATION = "relation"
FN_VALUE_TEXT = "value_text"
FN_UNITS = "units"
FN_TENSE_TEXT = "tense_text"
FN_TENSE = "tense"

# Help text for those fields; used as the Column "comment" values.
HELP_VARIABLE_NAME = "Variable name"
HELP_CONTENT = "Matching text contents"
HELP_START = "Start position (of matching string within whole text)"
HELP_END = "End position (of matching string within whole text)"
HELP_VARIABLE_TEXT = "Text that matched the variable name"
HELP_RELATION_TEXT = (
    "Text that matched the mathematical relationship between variable and "
    "value (e.g. '=', '<=', 'less than')"
)
HELP_RELATION = (
    "Standardized mathematical relationship between variable and value "
    "(e.g. '=', '<=')"
)
HELP_VALUE_TEXT = "Matched numerical value, as text"
HELP_UNITS = "Matched units, as text"
HELP_TARGET_UNIT = "Numerical value in preferred units, if known"
HELP_TENSE_TEXT = f"Tense text, if known (e.g. '{IS}', '{WAS}')"
HELP_TENSE = f"Calculated tense, if known (e.g. '{PAST}', '{PRESENT}')"

# Maximum lengths for String columns. The two derived lengths are sized to
# the longest standardized value in the corresponding lookup dictionary.
MAX_RELATION_TEXT_LENGTH = 50
MAX_RELATION_LENGTH = max(len(x) for x in RELATION_LOOKUP.values())
MAX_VALUE_TEXT_LENGTH = 50
MAX_UNITS_LENGTH = 50
MAX_TENSE_TEXT_LENGTH = 50
MAX_TENSE_LENGTH = max(len(x) for x in TENSE_LOOKUP.values())


# =============================================================================
# Generic processors
# =============================================================================

# -----------------------------------------------------------------------------
# NumericalResultParser
# -----------------------------------------------------------------------------


class NumericalResultParser(BaseNlpParser):
    """
    DO NOT USE DIRECTLY. Base class for generic numerical results, where
    a SINGLE variable is produced.

    Subclasses must implement :meth:`parse`. This class stores the common
    settings, defines the destination table layout, and provides self-test
    helpers.
    """

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: str,
        variable: str,
        target_unit: str,
        regex_str_for_debugging: str,
        commit: bool = False,
    ) -> None:
        r"""
        Init function for NumericalResultParser.

        Args:
            nlpdef:
                A :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`.
                May be ``None`` for debugging only; in that case the table
                name is derived from the class name and preferred units are
                assumed.

            cfg_processor_name:
                Config section name in the :ref:`NLP config file <nlp_config>`.

            variable:
                Used by subclasses as the record value for ``variable_name``.

            target_unit:
                Fieldname used for the primary output quantity.

            regex_str_for_debugging:
                String form of regex, for debugging.

            commit:
                Force a COMMIT whenever we insert data? You should specify this
                in multiprocess mode, or you may get database deadlocks.

        Subclasses will extend this method.
        """
        # NB This docstring was associated with Sphinx errors!
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            commit=commit,
            friendly_name=variable,
        )
        self.variable = variable
        self.target_unit = target_unit
        self.regex_str_for_debugging = regex_str_for_debugging

        if nlpdef is None:  # only None for debugging!
            self.tablename = self.classname().lower()
            self.assume_preferred_unit = True
        else:
            self.tablename = self._cfgsection.opt_str(
                ProcessorConfigKeys.DESTTABLE, required=True
            )
            self.assume_preferred_unit = self._cfgsection.opt_bool(
                ProcessorConfigKeys.ASSUME_PREFERRED_UNIT, default=True
            )

        # Sanity checks
        assert (
            len(self.variable) <= MAX_SQL_FIELD_LEN
        ), f"Variable name too long (max {MAX_SQL_FIELD_LEN} characters)"

    def get_regex_str_for_debugging(self) -> str:
        """
        Returns the string version of the regex, for debugging.
        """
        return self.regex_str_for_debugging

    def set_tablename(self, tablename: str) -> None:
        """
        In case a friend class wants to override.
        """
        self.tablename = tablename

    def dest_tables_columns(self) -> Dict[str, List[Column]]:
        # docstring in superclass
        # A single destination table. The primary numerical output goes in a
        # Float column named after self.target_unit.
        return {
            self.tablename: [
                Column(
                    FN_VARIABLE_NAME,
                    SqlTypeDbIdentifier,
                    comment=HELP_VARIABLE_NAME,
                ),
                Column(FN_CONTENT, Text, comment=HELP_CONTENT),
                Column(FN_START, Integer, comment=HELP_START),
                Column(FN_END, Integer, comment=HELP_END),
                Column(FN_VARIABLE_TEXT, Text, comment=HELP_VARIABLE_TEXT),
                Column(
                    FN_RELATION_TEXT,
                    String(MAX_RELATION_TEXT_LENGTH),
                    comment=HELP_RELATION_TEXT,
                ),
                Column(
                    FN_RELATION,
                    String(MAX_RELATION_LENGTH),
                    comment=HELP_RELATION,
                ),
                Column(FN_VALUE_TEXT, Text, comment=HELP_VALUE_TEXT),
                Column(FN_UNITS, String(MAX_UNITS_LENGTH), comment=HELP_UNITS),
                Column(self.target_unit, Float, comment=HELP_TARGET_UNIT),
                Column(
                    FN_TENSE_TEXT,
                    String(MAX_TENSE_TEXT_LENGTH),
                    comment=HELP_TENSE_TEXT,
                ),
                Column(FN_TENSE, String(MAX_TENSE_LENGTH), comment=HELP_TENSE),
            ]
        }

    @abstractmethod
    def parse(
        self, text: str
    ) -> Generator[Tuple[str, Dict[str, Any]], None, None]:
        # docstring in superclass
        raise NotImplementedError

    def test_numerical_parser(
        self,
        test_expected_list: List[Tuple[str, List[float]]],
        add_test_no_plain_number: bool = True,
        verbose: bool = False,
    ) -> None:
        """
        Self-test: checks the primary numerical value(s) extracted from each
        of a series of test strings.

        Args:
            test_expected_list:
                list of tuples ``test_string, expected_values``. The parser
                will parse ``test_string`` and compare the result (each value
                of the target unit) to ``expected_values``, which is a list of
                numerical (``float``), and can be an empty list.
            add_test_no_plain_number:
                also check that the bare string ``"999"`` (a number with no
                quantity specified) yields no results
            verbose:
                show the regex string too

        Raises:
            :exc:`AssertionError` if a comparison fails

        Compare also :meth:`detailed_test`.
        """
        log.info(f"Testing parser: {self.classname()}")
        if verbose:
            log.debug(f"... regex string:\n{self.regex_str_for_debugging}")
        if add_test_no_plain_number:
            test_expected_list = test_expected_list + [
                ("999", [])  # no quantity specified
            ]  # use "+ [...]", not append(), so as not to modify for caller
        for test_string, expected_values in test_expected_list:
            full_result = list(self.parse(test_string))
            # Each parse result is a tuple: (tablename, resultdict).
            actual_values = [x[self.target_unit] for _, x in full_result]
            assert actual_values == expected_values, (
                f"Parser {self.classname()}: Expected {expected_values!r}, "
                f"got {actual_values!r}, when parsing {test_string!r}; "
                f"full result:\n{full_result!r}"
            )
        log.info("... OK")

    def detailed_test(
        self, text: str, expected: List[Dict[str, Any]], verbose: bool = False
    ) -> None:
        """
        Runs a more detailed check. Whereas :func:`test_numerical_parser` tests
        the primary numerical results, this function tests other key/value
        pairs returned by the parser.

        Args:
            text:
                text to parse
            expected:
                list of ``resultdict`` dictionaries (each mapping column names
                to values).

                - The parser should return one result dictionary for
                  every entry in ``expected``.
                - It's fine for the ``resultdict`` not to include all the
                  columns returned for the parser. However, for any column that
                  is present, the parser must provide the corresponding value.

            verbose:
                be verbose

        Raises:
            :exc:`ValueError` if the number of results differs from the
            number expected, if an expected key is missing from a result, or
            if a value differs
        """
        full_result = list(self.parse(text))
        if len(full_result) != len(expected):
            raise ValueError(
                f"Parser {self.classname()}: expected {len(expected)} results "
                f"but got {len(full_result)} when parsing {text!r}; "
                f"full result:\n{full_result!r}"
            )
        if verbose:
            log.info(f"detailed_test: {text!r} -> {full_result!r}")
        # Lengths are equal (checked above), so zip() pairs every result with
        # its expectation.
        for (_, result), expected_dict in zip(full_result, expected):
            for k, expected_value in expected_dict.items():
                if k not in result:
                    raise ValueError(
                        f"Parser {self.classname()}: Expected value dict "
                        f"had key {k!r} but this is absent from result "
                        f"{result!r}"
                    )
                observed_value = result[k]
                if observed_value != expected_value:
                    raise ValueError(
                        f"Parser {self.classname()}: expected {k} = "
                        f"{expected_value!r}, got {observed_value!r}, "
                        f"when parsing {text!r}; full result:\n"
                        f"{full_result!r}"
                    )

    def detailed_test_multiple(
        self,
        tests: List[Tuple[str, List[Dict[str, Any]]]],
        verbose: bool = False,
    ) -> None:
        """
        Runs :meth:`detailed_test` on each of a series of test strings.

        Args:
            tests:
                list of tuples ``test_string, expected``. The parser will parse
                ``test_string`` and compare the result(s) to ``expected``. This
                is list of dictionaries with keys that can be like ``values``,
                ``tense``, etc. Each dictionary value is the corresponding
                expected value.
            verbose:
                show the regex string too

        Raises:
            :exc:`ValueError` if a comparison fails (as raised by
            :meth:`detailed_test`)
        """
        log.info(f"Detailed tests for parser: {self.classname()}")
        if verbose:
            log.debug(f"... regex string:\n{self.regex_str_for_debugging}")
        for test_string, expected_dict_list in tests:
            self.detailed_test(
                test_string, expected_dict_list, verbose=verbose
            )
        log.info("... OK")


# -----------------------------------------------------------------------------
# SimpleNumericalResultParser
# -----------------------------------------------------------------------------

# Regex group number for the entire match, as in m.group(0).
GROUP_NUMBER_WHOLE_EXPRESSION = 0

# Names of the named regex groups, as in m.group("quantity").
GROUP_NAME_QUANTITY = "quantity"
GROUP_NAME_RELATION = "relation"
GROUP_NAME_TENSE = "tense"
GROUP_NAME_UNITS = "units"
GROUP_NAME_VALUE = "value"


def make_simple_numeric_regex(
    quantity: str,
    units: str,
    value: str = SIGNED_FLOAT,
    tense_indicator: str = TENSE_INDICATOR,
    relation: str = RELATION,
    optional_results_ignorables: str = OPTIONAL_RESULTS_IGNORABLES,
    optional_ignorable_after_quantity: str = "",
    units_optional: bool = True,
) -> str:
    r"""
    Makes a regex with named groups to handle simple numerical results.

    Copes with formats like:

    .. code-block:: none

        sodium 132 mM
        sodium (mM) 132
        sodium (132 mM)

    ... and lots more.

    Args:
        quantity:
            Regex for the quantity (e.g. for "sodium" or "Na").
        units:
            Regex for units.
        value:
            Regex for the numerical value (e.g. our ``SIGNED_FLOAT`` regex).
        tense_indicator:
            Regex for tense indicator.
        relation:
            Regex for mathematical relationship (e.g. equals, less than).
        optional_results_ignorables:
            Regex for junk to ignore in between the other things.
            Should include its own "optionality" (e.g. ``*``).
        optional_ignorable_after_quantity:
            Regex for additional things that can be ignored right after the
            quantity. Should include its own "optionality" (e.g. ``?``).
        units_optional:
            The units are allowed to be omitted. Usually true. Only affects
            the "value then units" alternative; the two bracketed
            alternatives always include a units group.

    Returns:
        The regex, as an (uncompiled) string, laid out over several lines
        with embedded ``#`` comments.

    The resulting regex groups are named, not numbered:

    .. code-block:: none

        0:          Whole thing; integer, as in: m.group(0)
        'quantity': Quantity
        'tense':    Tense (optional)
        'relation': Relation (optional)
        'value':    Value
        'units':    Units (optional)

    ... as used by :class:`SimpleNumericalResultParser`.

    Just to check re overlap:

    .. code-block:: python

        import regex
        s1 = r"(?P<quantity>Sodium)\s+(?P<value>\d+)\s+(?P<units>mM)"
        s2 = r"(?P<quantity>Sodium)\s+\((?P<units>mM)\)\s+(?P<value>\d+)"
        s = f"{s1}|{s2}"
        r = regex.compile(s)
        t1 = "Sodium 132 mM"
        t2 = "Sodium (mM) 127"
        m1 = r.match(t1)
        m2 = r.match(t2)

        print(m1.group(0))  # Sodium 132 mM
        print(m1.group("quantity"))  # Sodium
        print(m1.group("value"))  # 132
        print(m1.group("units"))  # mM

        print(m2.group(0))  # Sodium (mM) 127
        print(m2.group("quantity"))  # Sodium
        print(m2.group("value"))  # 127
        print(m2.group("units"))  # mM

    ... so it's fine in that multiple groups can have the same name.

    """

    def group(groupname: str, contents: str, optional: bool = False) -> str:
        # Wrap "contents" in a named capture group, optionally followed by
        # "?" to make the whole group optional.
        opt_str = "?" if optional else ""
        return f"(?P<{groupname}> {contents} ){opt_str}"

    def bracketed(s: str) -> str:
        # Surround "s" with the left/right bracket patterns, allowing
        # whitespace just inside the brackets.
        return rf"{LB} \s* {s} \s* {RB}"

    # Building blocks. The same group name is deliberately reused across
    # alternatives (see the docstring).
    group_quantity = group(GROUP_NAME_QUANTITY, quantity)
    group_tense_optional = group(GROUP_NAME_TENSE, tense_indicator, True)
    group_relation_optional = group(GROUP_NAME_RELATION, relation, True)
    group_units = group(GROUP_NAME_UNITS, units)
    group_units_bracketed = bracketed(group_units)
    group_value = group(GROUP_NAME_VALUE, value)
    group_value_bracketed = bracketed(group_value)
    value_units_all_bracketed = bracketed(rf"{group_value} \s+ {group_units}")
    units_optional_descriptor = "optional" if units_optional else "required"
    qmark_if_units_optional = "?" if units_optional else ""

    # NOTE(review): the summary comment inside the pattern below says
    # "quantity (units value)", but the alternative it describes is built
    # from value_units_all_bracketed, i.e. "(value units)". The pattern text
    # is left untouched here so that the returned string is unchanged.
    return rf"""
        # - Either: quantity [tense] [relation] value [units]
        #   or:     quantity (units value)
        #   or:     quantity (units) [tense] [relation] value
        # Quantity:
        {group_quantity}
        # Ignorable:
        {optional_ignorable_after_quantity}
        {optional_results_ignorables}
        (?:
            (?:
                # (units) ... [tense] ... [relation] ... value
                # Units, in brackets:
                {group_units_bracketed}
                # Tense indicator (optional):
                {group_tense_optional}
                # Ignorable:
                {optional_results_ignorables}
                # Relation (optional):
                {group_relation_optional}
                # Ignorable:
                {optional_results_ignorables}
                # Value:
                {group_value}
            )
            |
            (?:
                # (value units)
                {value_units_all_bracketed}
            )
            |
            (?:
                # [tense] ... [relation] ... value|(value) ... [units]
                # Tense indicator (optional):
                {group_tense_optional}
                # Ignorable:
                {optional_results_ignorables}
                # Relation (optional):
                {group_relation_optional}
                # Ignorable:
                {optional_results_ignorables}
                # Value or (value):
                (?:
                    {group_value} |
                    {group_value_bracketed}
                )
                # Ignorable:
                {optional_results_ignorables}
                # Units ({units_optional_descriptor}):
                {group_units}{qmark_if_units_optional}
            )
        )
    """


class SimpleNumericalResultParser(NumericalResultParser, ABC):
    """
    Base class for simple single-format numerical results. Use this when not
    only do you have a single variable to produce, but you have a single regex
    (in a standard format) that can produce it.
    """

    def __init__(
        self,
        nlpdef: NlpDefinition,
        cfg_processor_name: str,
        regex_str: str,
        variable: str,
        target_unit: str,
        units_to_factor: Dict[str, float],
        take_absolute: bool = False,
        commit: bool = False,
        debug: bool = False,
    ) -> None:
        """
        Args:

            nlpdef:
                :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`

            cfg_processor_name:
                config section suffix in the :ref:`NLP config file
                <nlp_config>`

            regex_str:
                Regular expression, in string format. It is compiled here.

                This class reads the following NAMED groups from each match
                (see :func:`make_simple_numeric_regex`):

                - ``quantity`` (the variable)
                - ``tense``
                - ``relation``
                - ``value``
                - ``units``

            variable:
                used as the record value for ``variable_name``

            target_unit:
                fieldname used for the primary output quantity

            units_to_factor:
                dictionary, mapping

                - FROM a regex for units (its keys are compiled here, via
                  ``compile_regex_dict``)
                - TO EITHER a float (multiple) to multiply those units by, to
                  get the preferred unit
                - OR a function taking a text parameter and returning a float
                  value in preferred unit

                Any units present in the regex but absent from
                ``units_to_factor`` will lead the result to be ignored. For
                example, this allows you to ignore a relative neutrophil count
                ("neutrophils 2.2%") while detecting absolute neutrophil counts
                ("neutrophils 2.2"), or ignoring "docusate sodium 100mg" but
                detecting "sodium 140 mM".

            take_absolute:
                Convert negative values to positive ones? Typical text
                requiring this option might look like:

                .. code-block:: none

                    CRP-4
                    CRP-106
                    CRP -97
                    Blood results for today as follows: Na- 142, K-4.1, ...

                ... occurring in 23 out of 8054 hits for CRP of one test set in
                our data.

                For many quantities, we know that they cannot be negative, so
                this is just a notation rather than a minus sign. We have to
                account for it, or it'll distort our values. Preferable to
                account for it here rather than later; see manual.

            commit:
                force a COMMIT whenever we insert data? You should specify this
                in multiprocess mode, or you may get database deadlocks.

            debug:
                print the regex?

        """
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            variable=variable,
            target_unit=target_unit,
            regex_str_for_debugging=regex_str,
            commit=commit,
        )
        if debug:
            log.debug(f"Regex for {self.classname()}: {regex_str}")
        self.compiled_regex = compile_regex(regex_str)
        self.units_to_factor = compile_regex_dict(units_to_factor)
        self.take_absolute = take_absolute

    def parse(
        self, text: str, debug: bool = False
    ) -> Generator[Tuple[str, Dict[str, Any]], None, None]:
        # docstring in superclass
        if not text:
            return
        for m in self.compiled_regex.finditer(text):
            startpos = m.start()
            endpos = m.end()
            # Whole match; the same thing as text[startpos:endpos].
            matching_text = m.group(GROUP_NUMBER_WHOLE_EXPRESSION)

            variable_text = m.group(GROUP_NAME_QUANTITY)
            tense_text = m.group(GROUP_NAME_TENSE)
            relation_text = m.group(GROUP_NAME_RELATION)
            value_text = m.group(GROUP_NAME_VALUE)
            units = m.group(GROUP_NAME_UNITS)

            # If units are known (or we're choosing to assume preferred units
            # if none are specified), calculate an absolute value
            value_in_target_units = None
            if units:
                matched_unit, multiple_or_fn = get_regex_dict_match(
                    units, self.units_to_factor
                )
                if not matched_unit:
                    # None of our units match. But there is a unit, and the
                    # regex matched. So this is a BAD unit. Skip the value.
                    continue
                # Otherwise: we did match a unit.
                if callable(multiple_or_fn):
                    value_in_target_units = multiple_or_fn(value_text)
                else:
                    # Guard the multiplication: if the conversion yields None
                    # (presumably to_float's failure signal -- it is defined
                    # elsewhere), store None rather than raising TypeError.
                    # This matches the no-units branch below, which stores
                    # whatever to_float returns.
                    numeric_value = to_float(value_text)
                    if numeric_value is not None:
                        value_in_target_units = (
                            numeric_value * multiple_or_fn
                        )
            elif self.assume_preferred_unit:  # unit is None or empty
                value_in_target_units = to_float(value_text)

            if value_in_target_units is not None and self.take_absolute:
                value_in_target_units = abs(value_in_target_units)

            tense, relation = common_tense(tense_text, relation_text)

            result = {
                FN_VARIABLE_NAME: self.variable,
                FN_CONTENT: matching_text,
                FN_START: startpos,
                FN_END: endpos,
                FN_VARIABLE_TEXT: variable_text,
                FN_RELATION_TEXT: relation_text,
                FN_RELATION: relation,
                FN_VALUE_TEXT: value_text,
                FN_UNITS: units,
                self.target_unit: value_in_target_units,
                FN_TENSE_TEXT: tense_text,
                FN_TENSE: tense,
            }
            if debug:
                log.debug(f"Match {m} for {text!r} -> {result}")
            yield self.tablename, result


# -----------------------------------------------------------------------------
# NumeratorOutOfDenominatorParser
# -----------------------------------------------------------------------------


class NumeratorOutOfDenominatorParser(BaseNlpParser, ABC):
    """
    Base class for X-out-of-Y numerical results, e.g. for MMSE/ACE.

    - Integer denominator, expected to be positive.
    - Otherwise similar to :class:`SimpleNumericalResultParser`.
    """

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: str,
        variable_name: str,  # e.g. "MMSE"
        variable_regex_str: str,  # e.g. regex for MMSE
        expected_denominator: int,
        numerator_text_fieldname: str = "numerator_text",
        numerator_fieldname: str = "numerator",
        denominator_text_fieldname: str = "denominator_text",
        denominator_fieldname: str = "denominator",
        correct_numerator_fieldname: Optional[str] = None,  # default below
        take_absolute: bool = True,
        commit: bool = False,
        debug: bool = False,
    ) -> None:
        """
        Build and compile the regex used by :meth:`parse`.

        The compiled regex has these capture groups, in this sequence (they
        are read by index in :meth:`parse`):

        1. variable (the thing being measured; ``variable_regex_str``)
        2. tense indicator (optional)
        3. relation (optional)
        4. numerator
        5. denominator (optional)

        Args:
            nlpdef:
                a :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`.
                May be ``None`` for debugging only; the destination table
                name is then taken from ``self.classname().lower()`` rather
                than from the config.
            cfg_processor_name:
                the suffix (name) of a CRATE NLP config file processor section
                (from which we may choose to get extra config information)
            variable_name:
                becomes the content of the ``variable_name`` output column
            variable_regex_str:
                regex for the text that states the variable
            expected_denominator:
                the integer value that's expected as the "out of Y" part. For
                example, an MMSE is out of 30; an ACE-III total is out of 100.
                If the text just says "MMSE 17", we will infer "17 out of 30";
                so, for the MMSE, ``expected_denominator`` should be 30.
                Must be positive.
            numerator_text_fieldname:
                field (column) name in which to store the text retrieved as the
                numerator
            numerator_fieldname:
                field (column) name in which to store the numerical value
                retrieved as the numerator
            denominator_text_fieldname:
                field (column) name in which to store the text retrieved as the
                denominator
            denominator_fieldname:
                field (column) name in which to store the numerical value
                retrieved as the denominator
            correct_numerator_fieldname:
                field (column) name in which we store the principal validated
                numerator. For example, if an MMSE processor sees "17" or
                "17/30", this field will end up containing 17; but if it sees
                "17/100", it will remain NULL. If not specified (or empty),
                defaults to ``out_of_<expected_denominator>``, e.g.
                ``out_of_30``.
            take_absolute:
                Convert negative values to positive ones?
                As for :class:`SimpleNumericalResultParser`.
            commit:
                force a COMMIT whenever we insert data? You should specify this
                in multiprocess mode, or you may get database deadlocks.
            debug:
                print the regex?

        """
        self.variable_name = variable_name
        assert (
            expected_denominator > 0
        ), "expected_denominator must be positive"
        self.expected_denominator = expected_denominator
        self.numerator_text_fieldname = numerator_text_fieldname
        self.numerator_fieldname = numerator_fieldname
        self.denominator_text_fieldname = denominator_text_fieldname
        self.denominator_fieldname = denominator_fieldname
        self.correct_numerator_fieldname = (
            correct_numerator_fieldname or f"out_of_{expected_denominator}"
        )
        self.take_absolute = take_absolute

        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            commit=commit,
            friendly_name=variable_name,
        )
        if nlpdef is None:  # only None for debugging!
            self.tablename = self.classname().lower()
        else:
            self.tablename = self._cfgsection.opt_str(
                ProcessorConfigKeys.DESTTABLE, required=True
            )

        regex_str = rf"""
            ( {variable_regex_str} )           # 1. group for variable (thing being measured)
            {OPTIONAL_RESULTS_IGNORABLES}
            {SCORE}?                           # optional "score" or similar
            {OPTIONAL_RESULTS_IGNORABLES}
            ( {TENSE_INDICATOR} )?             # 2. optional group for tense indicator
            {OPTIONAL_RESULTS_IGNORABLES}
            ( {RELATION} )?                    # 3. optional group for relation
            {OPTIONAL_RESULTS_IGNORABLES}
            ( {SIGNED_FLOAT} )                 # 4. group for numerator
            (?:                                # optional "/ denominator"
                \s* {OUT_OF_SEPARATOR} \s*
                ( {IGNORESIGN_INTEGER} )         # 5. group for denominator
            )?
        """  # noqa
        if debug:
            log.debug(f"Regex for {self.classname()}: {regex_str}")
        self.regex_str = regex_str
        self.compiled_regex = compile_regex(regex_str)

    def dest_tables_columns(self) -> Dict[str, List[Column]]:
        # docstring in superclass
        # One destination table; several column names are configurable via
        # the constructor (numerator/denominator/validated numerator).
        return {
            self.tablename: [
                Column(
                    FN_VARIABLE_NAME,
                    SqlTypeDbIdentifier,
                    comment=HELP_VARIABLE_NAME,
                ),
                Column(FN_CONTENT, Text, comment=HELP_CONTENT),
                Column(FN_START, Integer, comment=HELP_START),
                Column(FN_END, Integer, comment=HELP_END),
                Column(FN_VARIABLE_TEXT, Text, comment=HELP_VARIABLE_TEXT),
                Column(
                    FN_RELATION_TEXT,
                    String(MAX_RELATION_TEXT_LENGTH),
                    comment=HELP_RELATION_TEXT,
                ),
                Column(
                    FN_RELATION,
                    String(MAX_RELATION_LENGTH),
                    comment=HELP_RELATION,
                ),
                Column(
                    self.numerator_text_fieldname,
                    String(MAX_VALUE_TEXT_LENGTH),
                    comment="Numerator, as text",
                ),
                Column(self.numerator_fieldname, Float, comment="Numerator"),
                Column(
                    self.denominator_text_fieldname,
                    String(MAX_VALUE_TEXT_LENGTH),
                    comment="Denominator, as text",
                ),
                Column(
                    self.denominator_fieldname, Float, comment="Denominator"
                ),
                Column(
                    self.correct_numerator_fieldname,
                    Float,
                    comment="Numerator, if denominator is as expected (units "
                    "are correct)",
                ),
                Column(
                    FN_TENSE_TEXT,
                    String(MAX_TENSE_TEXT_LENGTH),
                    comment=HELP_TENSE_TEXT,
                ),
                Column(FN_TENSE, String(MAX_TENSE_LENGTH), comment=HELP_TENSE),
            ]
        }

    def parse(
        self, text: str, debug: bool = False
    ) -> Generator[Tuple[str, Dict[str, Any]], None, None]:
        # docstring in superclass
        for m in self.compiled_regex.finditer(text):
            startpos = m.start()
            endpos = m.end()
            matching_text = m.group(0)  # the whole match

            # Group numbers as laid out in the regex built by __init__.
            variable_text = m.group(1)
            tense_text = m.group(2)  # None if absent
            relation_text = m.group(3)  # None if absent
            numerator_text = m.group(4)
            denominator_text = m.group(5)  # None if no "/ denominator" part

            if self.take_absolute:
                numerator = to_pos_float(numerator_text)
            else:
                numerator = to_float(numerator_text)
            denominator = to_float(denominator_text)

            if numerator is None:
                # Group 4 is mandatory in the regex, so a match should always
                # provide a numerator that converts to a number.
                log.critical("bug - numerator is None, should be impossible")
                continue

            # The "validated" numerator is only filled in when the score is
            # plausibly out of the expected denominator: either no
            # denominator was stated (so the expected one is inferred), or
            # the stated denominator equals the expected one. In both cases
            # the numerator must not exceed that denominator.
            correct_numerator = None
            if denominator is None:
                if numerator <= self.expected_denominator:
                    correct_numerator = numerator
            else:
                if numerator <= denominator == self.expected_denominator:
                    correct_numerator = numerator

            tense, relation = common_tense(tense_text, relation_text)

            result = {
                FN_VARIABLE_NAME: self.variable_name,
                FN_CONTENT: matching_text,
                FN_START: startpos,
                FN_END: endpos,
                FN_VARIABLE_TEXT: variable_text,
                FN_RELATION_TEXT: relation_text,
                FN_RELATION: relation,
                self.numerator_text_fieldname: numerator_text,
                self.numerator_fieldname: numerator,
                self.denominator_text_fieldname: denominator_text,
                self.denominator_fieldname: denominator,
                self.correct_numerator_fieldname: correct_numerator,
                FN_TENSE_TEXT: tense_text,
                FN_TENSE: tense,
            }
            if debug:
                log.debug(f"Match {m} for {text!r} -> {result}")
            yield self.tablename, result

    def test_numerator_denominator_parser(
        self,
        test_expected_list: List[
            Tuple[str, List[Tuple[float, Optional[float]]]]
        ],
        verbose: bool = False,
    ) -> None:
        """
        Test the parser.

        Args:
            test_expected_list:
                list of tuples ``test_string, expected_values``. The parser
                will parse ``test_string`` and compare the result (each value
                of the target unit) to ``expected_values``, which is a list of
                tuples ``numerator, denominator``, and can be an empty list.
                The denominator is ``None`` where the text did not state one.
            verbose:
                print the regex?

        Raises:
            :exc:`AssertionError` if a comparison fails
        """
        log.info(f"Testing parser: {self.classname()}")
        if verbose:
            log.debug(f"... regex:\n{self.regex_str}")
        for test_string, expected_values in test_expected_list:
            actual_values = [
                (x[self.numerator_fieldname], x[self.denominator_fieldname])
                for _, x in self.parse(test_string)
            ]
            # The message (including the second parse) is only evaluated if
            # the assertion fails.
            assert actual_values == expected_values, (
                f"Parser {self.classname()}: Expected {expected_values}, "
                f"got {actual_values}, when "
                f"parsing {test_string!r}; full result:\n"
                f"{list(self.parse(test_string))!r}"
            )
        log.info("... OK")


# =============================================================================
# Validator base class (for testing regex NLP classes)
# =============================================================================


class ValidatorBase(BaseNlpParser):
    r"""
    DO NOT USE DIRECTLY. Base class for **validating** regex parser
    sensitivity.

    The validator will find fields that refer to the variable, whether or not
    they meet the other criteria of the actual NLP processors (i.e. whether or
    not they contain a valid value). More explanation below.

    Suppose we're validating C-reactive protein (CRP). Key concepts:

    - source (true state of the world): Pr present, Ab absent
    - software decision: Y yes, N no
    - signal detection theory classification:

      - hit = Pr & Y = true positive
      - miss = Pr & N = false negative
      - false alarm = Ab & Y = false positive
      - correct rejection = Ab & N = true negative

    - common SDT metrics:

      - positive predictive value, PPV = P(Pr | Y) = precision (\*)
      - negative predictive value, NPV = P(Ab | N)
      - sensitivity = P(Y | Pr) = recall (\*) = true positive rate
      - specificity = P(N | Ab) = true negative rate

      (\*) common names used in the NLP context.

    - other common classifier metric:

      .. code-block:: none

        F_beta score = (1 + beta^2) * precision * recall /
                       ((beta^2 * precision) + recall)

      ... which measures performance when you value recall beta times as much
      as precision (thus, for example, the F1 score when beta = 1). See
      https://en.wikipedia.org/wiki/F1_score/

    Working from source to NLP, we can see there are a few types of "absent":

    - X. unselected database field containing text

        - Q. field contains "CRP", "C-reactive protein", etc.; something
          that a human (or as a proxy: a machine) would judge as
          containing a textual reference to CRP.

            - Pr. Present: a human would judge that a CRP value is present,
                e.g. "today her CRP is 7, which I am not concerned about."

                - H.  Hit: software reports the value.
                - M.  Miss: software misses the value.
                  (Maybe: "his CRP was twenty-one".)

            - Ab1. Absent: reference to CRP, but no numerical information,
              e.g. "her CRP was normal".

                - FA1. False alarm: software reports a numerical value.
                  (Maybe: "my CRP was 7 hours behind my boss's deadline")
                - CR1. Correct rejection: software doesn't report a value.

        - Ab2. field contains no reference to CRP at all.

                - FA2. False alarm: software reports a numerical value.
                  (A bit harder to think of examples... but imagine a bug
                  that gives a hit for "number of carp: 7". Or an alternative
                  abbreviation meaning, e.g. "took part in a cardiac
                  rehabilitation programme (CRP) 4 hours/week".)

                - CR2. Correct rejection: software doesn't report a value.

    From NLP backwards to source:

    - Y. Software says value present.

        - H. Hit: value is present.
        - FA. False alarm: value is absent.

    - N. Software says value absent.

        - CR. Correct rejection: value is absent.
        - M. Miss: value is present.

    The key metrics are:

    - precision = positive predictive value = P(Pr | Y)

      ... relatively easy to check; find all the "Y" records and check
      manually that they're correct.

    - sensitivity = recall = P(Y | Pr)

      ... Here, we want a sample that is enriched for "symptom actually
      present", for human reasons. For example, if 0.1% of text entries
      refer to CRP, then to assess 100 "Pr" samples we would have to
      review 100,000 text records, 99,900 of which are completely
      irrelevant. So we want an automated way of finding "Pr" records.
      That's what the validator classes do.

    You can enrich for "Pr" records with SQL, e.g.

    .. code-block:: sql

        SELECT textfield FROM sometable WHERE (
            textfield LIKE '%CRP%'
            OR textfield LIKE '%C-reactive protein%');

    or similar, but really we want the best "CRP detector" possible. That is
    probably to use a regex, either in SQL (... ``WHERE textfield REGEX
    'myregex'``) or using these validator classes. (The main NLP regexes don't
    distinguish between "CRP present, no valid value" and "CRP absent",
    because regexes either match or don't.)

    Each validator class implements the core variable-finding part of its
    corresponding NLP regex class, but without the value or units. For example,
    the CRP class looks for things like "CRP is 6" or "CRP 20 mg/L", whereas
    the CRP validator looks for things like "CRP".

    """

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        """
        Fetch the variable name and regex strings from the subclass (via
        :meth:`get_variablename_regexstrlist`) and compile the regexes.

        Args:
            nlpdef:
                :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`.
                May be ``None`` for debugging only; the destination table
                name is then taken from ``self.classname().lower()`` rather
                than from the config.

            cfg_processor_name:
                config section suffix in the :ref:`NLP config file
                <nlp_config>`

            commit:
                force a COMMIT whenever we insert data? You should specify this
                in multiprocess mode, or you may get database deadlocks.
        """
        (
            validated_variable,
            regex_str_list,
        ) = self.get_variablename_regexstrlist()
        # e.g. "crp" -> "crp_validator"; used as the friendly name and as the
        # value written to the variable-name output column.
        vname = f"{validated_variable}_validator"
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            commit=commit,
            friendly_name=vname,
        )
        self.regex_str_list = regex_str_list  # for debugging only
        self.compiled_regex_list = [compile_regex(r) for r in regex_str_list]
        self.variable = vname
        self.NAME = self.variable

        if nlpdef is None:  # only None for debugging!
            self.tablename = self.classname().lower()
        else:
            self.tablename = self._cfgsection.opt_str(
                ProcessorConfigKeys.DESTTABLE, required=True
            )

    @classmethod
    @abstractmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        """
        To be overridden.

        Returns:
            tuple: ``(validated_variable_name, regex_str_list)``, where:

            regex_str_list:
                List of regular expressions, each in string format.

                This class operates with compiled regexes having this group
                format (capture groups in this sequence):

                - variable

            validated_variable:
                used to set our ``variable`` attribute and thus the value of
                the field ``variable_name`` in the NLP output; for example, if
                ``validated_variable == 'crp'``, then the ``variable_name``
                field will be set to ``crp_validator``.

        """
        raise NotImplementedError

    def set_tablename(self, tablename: str) -> None:
        """
        In case a friend class wants to override.
        """
        self.tablename = tablename

    def dest_tables_columns(self) -> Dict[str, List[Column]]:
        # docstring in superclass
        return {
            self.tablename: [
                Column(
                    FN_VARIABLE_NAME,
                    SqlTypeDbIdentifier,
                    comment=HELP_VARIABLE_NAME,
                ),
                Column(FN_CONTENT, Text, comment=HELP_CONTENT),
                Column(FN_START, Integer, comment=HELP_START),
                Column(FN_END, Integer, comment=HELP_END),
            ]
        }

    def parse(
        self, text: str
    ) -> Generator[Tuple[str, Dict[str, Any]], None, None]:
        # docstring in superclass
        # Every match of every regex is reported; results are grouped by
        # regex (in list order), then by position within the text.
        for compiled_regex in self.compiled_regex_list:
            for m in compiled_regex.finditer(text):
                yield self.tablename, {
                    FN_VARIABLE_NAME: self.variable,
                    FN_CONTENT: m.group(0),  # the whole match
                    FN_START: m.start(),
                    FN_END: m.end(),
                }

    def test_validator(
        self, test_expected_list: List[Tuple[str, bool]], verbose: bool = False
    ) -> None:
        """
        Test the validator's regexes against some strings.

        Args:
            test_expected_list:
                list of tuples ``test_string, expected_match``. The 'bool'
                part is: should at least one of our regexes match anywhere
                in ``test_string``?

                ... noting that "match anywhere" is the "search" function,
                whereas "match" matches at the beginning:

                    https://docs.python.org/3/library/re.html#search-vs-match
            verbose:
                print the regexes?

        Raises:
            :exc:`AssertionError` if a comparison fails
        """
        log.info(f"Testing validator: {self.classname()}")
        if verbose:
            n = len(self.regex_str_list)
            for i, r in enumerate(self.regex_str_list, start=1):
                log.debug(f"... regex #{i}/{n}: {r}\n")
        for test_string, expected_match in test_expected_list:
            # Keep the individual results (match object or None, one per
            # regex) so that they can be shown if the assertion fails.
            results = [
                r.search(test_string) for r in self.compiled_regex_list
            ]
            actual_match = any(results)
            assert actual_match == expected_match, (
                f"Validator {self.classname()}: Expected 'at least one regex "
                f"should match somewhere (search)' to be {expected_match}, "
                f"got {actual_match}, when parsing {test_string!r}; "
                f"full results = {results}"
            )
        log.info("... OK")

    def test(self, verbose: bool = False) -> None:
        """
        Placeholder self-test: just logs that no tests are implemented for
        this validator.

        Args:
            verbose:
                unused here
        """
        log.info(f"... no tests implemented for validator {self.classname()}")


# =============================================================================
# More general testing
# =============================================================================


def learning_alternative_regex_groups() -> None:
    """
    Function to learn about regex syntax.

    Compiles a pattern containing nested and alternative capture groups,
    applies it (with ``match``, i.e. anchored at the start) to a few sample
    strings, and logs each match object together with its groups.

    What the output shows:

    - groups can overlap
    - groups are ordered by their opening bracket
    - matches are filled in neatly
    """
    regex_str = r"""
        (
            (?:
                \s*
                (?: (a) | (b) | (c) | (d) )
                \s*
            )*
            ( fish )?
        )
    """
    compiled_regex = compile_regex(regex_str)
    for test_str in ("a", "b", "a c", "d", "e", "a fish", "c c c"):
        m = compiled_regex.match(test_str)
        # match() returns None if nothing matches at the start of the string;
        # guard so that we log that, rather than raising AttributeError.
        groups = m.groups() if m is not None else None
        log.info(f"Match: {m}; groups: {groups}")