# Source code for crate_anon.nlp_manager.regex_parser

"""
crate_anon/nlp_manager/regex_parser.py

===============================================================================

    Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
    Created by Rudolf Cardinal (rnc1001@cam.ac.uk).

    This file is part of CRATE.

    CRATE is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    CRATE is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with CRATE. If not, see <https://www.gnu.org/licenses/>.

===============================================================================

**Shared elements for regex-based NLP work.**

"""

from abc import abstractmethod, ABC
import logging
from typing import Any, Dict, Generator, List, Optional, Tuple

from sqlalchemy import Column, Integer, Float, String, Text

from crate_anon.common.regex_helpers import (
    LEFT_BRACKET as LB,
    RIGHT_BRACKET as RB,
)
from crate_anon.nlp_manager.constants import (
    MAX_SQL_FIELD_LEN,
    ProcessorConfigKeys,
    SqlTypeDbIdentifier,
)
from crate_anon.nlp_manager.base_nlp_parser import BaseNlpParser
from crate_anon.nlp_manager.nlp_definition import NlpDefinition
from crate_anon.nlp_manager.number import to_float, to_pos_float
from crate_anon.nlp_manager.regex_func import (
    compile_regex,
    compile_regex_dict,
    get_regex_dict_match,
)
from crate_anon.nlp_manager.regex_numbers import (
    SIGNED_FLOAT,
    IGNORESIGN_INTEGER,
)
from crate_anon.nlp_manager.regex_units import (
    OUT_OF_SEPARATOR,
    SCORE,
)

log = logging.getLogger(__name__)


# =============================================================================
# Generic entities
# =============================================================================

# -----------------------------------------------------------------------------
# Blood results
# -----------------------------------------------------------------------------

# Regex fragment for "junk" that may sit between the parts of a result
# (separators, abnormality markers, dashes). The group carries its own "*"
# quantifier, so it can also match nothing; users of this fragment do not add
# a quantifier of their own.
# NOTE(review): the layout whitespace and "#" comments inside the literal
# assume that the regex is compiled in verbose mode -- confirm against
# compile_regex().
OPTIONAL_RESULTS_IGNORABLES = r"""
    (?:  # OPTIONAL_RESULTS_IGNORABLES
        \s | \| | \:          # whitespace, bar, colon
        | \bHH?\b | \(HH?\)   # H/HH at a word boundary; (H)/(HH)
        | \bLL?\b | \(LL?\)   # L/LL etc.
        | \* | \(\*\)         # *, (*)
        | — | --              # em dash, double hyphen-minus
        | –\s+ | -\s+ | ‐\s+  # en dash/hyphen-minus/Unicode hyphen; whitespace
    )*                        # ... any of those, repeated 0 or more times
"""
# - you often get | characters when people copy/paste tables
# - blood test abnormality markers can look like e.g.
#       17 (H), 17 (*), 17 HH
# Re parentheses:
# - you can also see things like "CRP (5)"
#   ... but we'll handle that
# - However, if there's a right parenthesis only, that's less good, e.g.
#   "Present: Nicola Adams (NA). 1.0. Minutes of the last meeting."
#   ... which we don't want to be interpreted as "sodium 1.0".
#   HOW BEST TO DO THIS?
# - https://stackoverflow.com/questions/546433/regular-expression-to-match-outer-brackets  # noqa
#   https://stackoverflow.com/questions/7898310/using-regex-to-balance-match-parenthesis  # noqa
# - ... simplest is perhaps: base ignorables, or those with brackets, as above
# - ... even better than a nested thing is just a list of alternatives

# Regex fragment for an optional point-of-care suffix following a test name.
# The whole group is optional ("?"), so it can also match nothing.
OPTIONAL_POC = r"""
    (?: ,? \s+ POC )?   # OPTIONAL_POC: point-of-care testing, "[,] POC"
"""
# ... e.g. "Glucose, POC"; "Potassium, POC".
# Seen in CUH for
#
#     sodium, POC
#     potassium, POC
#     creatinine, POC
#     urea, POC
#     glucose, POC
#     lactate, POC
#     bilirubin, POC
#     HCT, POC
#     alkaline phosphatase, POC
#     alanine transferase, POC
#
#     HGB, POC
#     WBC, POC
#     PLT, POC
#     MCV, POC
#     MCH, POC
#     neutrophil count, POC
#     lymphocyte count, POC

# -----------------------------------------------------------------------------
# Tense indicators
# -----------------------------------------------------------------------------

# Words recognized as tense indicators in the source text.
IS = "is"
WAS = "was"
# Either indicator, with a word boundary on each side.
TENSE_INDICATOR = rf"(?: \b {IS} \b | \b {WAS} \b )"

# Standardized result values; see MAX_TENSE_LENGTH
PAST = "past"
PRESENT = "present"
EVER = "ever"  # e.g. for "never"

# Maps each tense-indicator regex (key) to its standardized tense (value).
TENSE_LOOKUP = compile_regex_dict(
    {
        IS: PRESENT,
        WAS: PAST,
    }
)

# -----------------------------------------------------------------------------
# Mathematical relations
# -----------------------------------------------------------------------------
# ... don't use unnamed groups here; EQ is also used as a return value

# Regex fragments for each mathematical relation. LT/EQ/GT accept symbols or
# words; LE/GE accept only the symbol.
LT = r"(?: < | less \s+ than | under )"
LE = "<="
EQ = r"(?: = | equals | equal \s+ to )"
GE = ">="
GT = r"(?: > | (?:more|greater) \s+ than | over )"
# OF = "\b of \b"  # as in: "a BMI of 30"... but too likely to be mistaken for a target?  # noqa

# Any one relation.
RELATION = rf"(?: {LE} | {LT} | {EQ} | {GE} | {GT} )"
# ... ORDER MATTERS: greedier things first, i.e.
# - LE before LT
# - GE before GT

# Maps each relation regex (key) to its standardized symbol (value).
RELATION_LOOKUP = compile_regex_dict(
    {
        # To standardize the output, so (for example) "=" and "equals" can both
        # map to "=".
        LT: "<",
        LE: "<=",
        EQ: "=",
        GE: ">=",
        GT: ">",
    }
)

# -----------------------------------------------------------------------------
# Punctuation
# -----------------------------------------------------------------------------

# Regex character class matching a single apostrophe-like character.
APOSTROPHE = "['’]"  # ASCII apostrophe; right single quote (U+2019)


# =============================================================================
# Regex assembly functions
# =============================================================================


# =============================================================================
# Functions to handle processed data
# =============================================================================


def common_tense(
    tense_text: Optional[str], relation_text: Optional[str]
) -> Tuple[Optional[str], Optional[str]]:
    """
    Unifies strings that potentially represent "tense" and "equality"
    concepts into standardized values.

    - Used, for example, to help impute that "CRP was 72" means that the
      relation was EQ in the PAST, etc.

    Args:
        tense_text: putative tense information
        relation_text: putative relationship (equals, less than, etc.)

    Returns:
        tuple: ``tense, relation``; either may be ``None``.
    """
    # Tense: look in the tense text if there is any; failing that, in the
    # relation text. With neither, there is no tense.
    tense_source = tense_text or relation_text
    if tense_source:
        _, tense = get_regex_dict_match(tense_source, TENSE_LOOKUP)
    else:
        tense = None

    # Relation: "=" is passed as the third argument.
    # NOTE(review): presumably that is the default returned when nothing
    # matches -- confirm against get_regex_dict_match().
    _, relation = get_regex_dict_match(relation_text, RELATION_LOOKUP, "=")

    return tense, relation
# ============================================================================= # Constants for generic processors # ============================================================================= FN_VARIABLE_NAME = "variable_name" FN_CONTENT = "_content" FN_START = "_start" FN_END = "_end" FN_VARIABLE_TEXT = "variable_text" FN_RELATION_TEXT = "relation_text" FN_RELATION = "relation" FN_VALUE_TEXT = "value_text" FN_UNITS = "units" FN_TENSE_TEXT = "tense_text" FN_TENSE = "tense" HELP_VARIABLE_NAME = "Variable name" HELP_CONTENT = "Matching text contents" HELP_START = "Start position (of matching string within whole text)" HELP_END = "End position (of matching string within whole text)" HELP_VARIABLE_TEXT = "Text that matched the variable name" HELP_RELATION_TEXT = ( "Text that matched the mathematical relationship between variable and " "value (e.g. '=', '<=', 'less than')" ) HELP_RELATION = ( "Standardized mathematical relationship between variable and value " "(e.g. '=', '<=')" ) HELP_VALUE_TEXT = "Matched numerical value, as text" HELP_UNITS = "Matched units, as text" HELP_TARGET_UNIT = "Numerical value in preferred units, if known" HELP_TENSE_TEXT = f"Tense text, if known (e.g. '{IS}', '{WAS}')" HELP_TENSE = f"Calculated tense, if known (e.g. '{PAST}', '{PRESENT}')" MAX_RELATION_TEXT_LENGTH = 50 MAX_RELATION_LENGTH = max(len(x) for x in RELATION_LOOKUP.values()) MAX_VALUE_TEXT_LENGTH = 50 MAX_UNITS_LENGTH = 50 MAX_TENSE_TEXT_LENGTH = 50 MAX_TENSE_LENGTH = max(len(x) for x in TENSE_LOOKUP.values()) # ============================================================================= # Generic processors # ============================================================================= # ----------------------------------------------------------------------------- # NumericalResultParser # -----------------------------------------------------------------------------
class NumericalResultParser(BaseNlpParser):
    """
    DO NOT USE DIRECTLY. Base class for generic numerical results, where
    a SINGLE variable is produced.
    """

    def __init__(
        self,
        nlpdef: NlpDefinition,
        cfg_processor_name: str,
        variable: str,
        target_unit: str,
        regex_str_for_debugging: str,
        commit: bool = False,
    ) -> None:
        r"""
        Init function for NumericalResultParser.

        Args:
            nlpdef:
                A :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`.
                May be ``None``, but only for debugging; the table name then
                defaults to the lower-case class name.

            cfg_processor_name:
                Config section name in the :ref:`NLP config file <nlp_config>`.

            variable:
                Used by subclasses as the record value for ``variable_name``.

            target_unit:
                Fieldname used for the primary output quantity.

            regex_str_for_debugging:
                String form of regex, for debugging.

            commit:
                Force a COMMIT whenever we insert data? You should specify this
                in multiprocess mode, or you may get database deadlocks.

        Raises:
            :exc:`AssertionError` if ``variable`` is longer than
            ``MAX_SQL_FIELD_LEN`` characters.

        Subclasses will extend this method.
        """
        # NB This docstring was associated with Sphinx errors!
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            commit=commit,
            friendly_name=variable,
        )
        self.variable = variable
        self.target_unit = target_unit
        self.regex_str_for_debugging = regex_str_for_debugging

        if nlpdef is None:  # only None for debugging!
            self.tablename = self.classname().lower()
            self.assume_preferred_unit = True
        else:
            self.tablename = self._cfgsection.opt_str(
                ProcessorConfigKeys.DESTTABLE, required=True
            )
            self.assume_preferred_unit = self._cfgsection.opt_bool(
                ProcessorConfigKeys.ASSUME_PREFERRED_UNIT, default=True
            )

        # Sanity checks
        assert (
            len(self.variable) <= MAX_SQL_FIELD_LEN
        ), f"Variable name too long (max {MAX_SQL_FIELD_LEN} characters)"

    def get_regex_str_for_debugging(self) -> str:
        """
        Returns the string version of the regex, for debugging.
        """
        return self.regex_str_for_debugging

    def set_tablename(self, tablename: str) -> None:
        """
        In case a friend class wants to override.
        """
        self.tablename = tablename

    def dest_tables_columns(self) -> Dict[str, List[Column]]:
        # docstring in superclass
        return {
            self.tablename: [
                Column(
                    FN_VARIABLE_NAME,
                    SqlTypeDbIdentifier,
                    comment=HELP_VARIABLE_NAME,
                ),
                Column(FN_CONTENT, Text, comment=HELP_CONTENT),
                Column(FN_START, Integer, comment=HELP_START),
                Column(FN_END, Integer, comment=HELP_END),
                Column(FN_VARIABLE_TEXT, Text, comment=HELP_VARIABLE_TEXT),
                Column(
                    FN_RELATION_TEXT,
                    String(MAX_RELATION_TEXT_LENGTH),
                    comment=HELP_RELATION_TEXT,
                ),
                Column(
                    FN_RELATION,
                    String(MAX_RELATION_LENGTH),
                    comment=HELP_RELATION,
                ),
                Column(FN_VALUE_TEXT, Text, comment=HELP_VALUE_TEXT),
                Column(FN_UNITS, String(MAX_UNITS_LENGTH), comment=HELP_UNITS),
                # The primary numerical output; its column name is chosen by
                # the subclass via ``target_unit``.
                Column(self.target_unit, Float, comment=HELP_TARGET_UNIT),
                Column(
                    FN_TENSE_TEXT,
                    String(MAX_TENSE_TEXT_LENGTH),
                    comment=HELP_TENSE_TEXT,
                ),
                Column(FN_TENSE, String(MAX_TENSE_LENGTH), comment=HELP_TENSE),
            ]
        }

    @abstractmethod
    def parse(
        self, text: str
    ) -> Generator[Tuple[str, Dict[str, Any]], None, None]:
        # docstring in superclass
        raise NotImplementedError

    def test_numerical_parser(
        self,
        test_expected_list: List[Tuple[str, List[float]]],
        add_test_no_plain_number: bool = True,
        verbose: bool = False,
    ) -> None:
        """
        Tests the primary numerical output of the parser.

        Args:
            test_expected_list:
                list of tuples ``test_string, expected_values``. The parser
                will parse ``test_string`` and compare the result (each value
                of the target unit) to ``expected_values``, which is a list of
                numerical (``float``), and can be an empty list.
            add_test_no_plain_number:
                also check that the plain number ``"999"`` (no quantity
                specified) yields no results
            verbose:
                show the regex string too

        Raises:
            :exc:`AssertionError` if a comparison fails

        Compare also :meth:`detailed_test`.
        """
        log.info(f"Testing parser: {self.classname()}")
        if verbose:
            log.debug(f"... regex string:\n{self.regex_str_for_debugging}")
        if add_test_no_plain_number:
            test_expected_list = test_expected_list + [
                ("999", [])  # no quantity specified
            ]  # use "+ [...]", not append(), so as not to modify for caller
        for test_string, expected_values in test_expected_list:
            full_result = list(self.parse(test_string))
            actual_values = [x[self.target_unit] for _, x in full_result]
            assert actual_values == expected_values, (
                f"Parser {self.classname()}: Expected {expected_values!r}, "
                f"got {actual_values!r}, when parsing {test_string!r}; "
                f"full result:\n{full_result!r}"
            )
        log.info("... OK")

    def detailed_test(
        self, text: str, expected: List[Dict[str, Any]], verbose: bool = False
    ) -> None:
        """
        Runs a more detailed check. Whereas :meth:`test_numerical_parser`
        tests the primary numerical results, this function tests other
        key/value pairs returned by the parser.

        Args:
            text:
                text to parse
            expected:
                list of ``resultdict`` dictionaries (each mapping column names
                to values).

                - The parser should return one result dictionary for every
                  entry in ``expected``.
                - It's fine for the ``resultdict`` not to include all the
                  columns returned for the parser. However, for any column that
                  is present, the parser must provide the corresponding value.

            verbose:
                be verbose

        Raises:
            :exc:`ValueError` if the number of results differs from the
            number expected, if an expected key is absent from a result, or if
            a value differs from the expected value
        """
        full_result = list(self.parse(text))
        if len(full_result) != len(expected):
            raise ValueError(
                f"Parser {self.classname()}: expected {len(expected)} results "
                f"but got {len(full_result)} when parsing {text!r}; "
                f"full result:\n{full_result!r}"
            )
        if verbose:
            log.info(f"detailed_test: {text!r} -> {full_result!r}")
        # The lengths are known to be equal here, so zip() pairs every result
        # with its expectation.
        for (_, result), expected_dict in zip(full_result, expected):
            for k, expected_value in expected_dict.items():
                if k not in result:
                    raise ValueError(
                        f"Parser {self.classname()}: Expected value dict "
                        f"had key {k!r} but this is absent from result "
                        f"{result!r}"
                    )
                observed_value = result[k]
                if observed_value != expected_value:
                    raise ValueError(
                        f"Parser {self.classname()}: expected {k} = "
                        f"{expected_value!r}, got {observed_value!r}, "
                        f"when parsing {text!r}; full result:\n"
                        f"{full_result!r}"
                    )

    def detailed_test_multiple(
        self,
        tests: List[Tuple[str, List[Dict[str, Any]]]],
        verbose: bool = False,
    ) -> None:
        """
        Runs :meth:`detailed_test` on each of several test strings.

        Args:
            tests:
                list of tuples ``test_string, expected``. The parser will parse
                ``test_string`` and compare the result(s) to ``expected``. This
                is list of dictionaries with keys that can be like ``values``,
                ``tense``, etc. Each dictionary value is the corresponding
                expected value.
            verbose:
                show the regex string too

        Raises:
            :exc:`ValueError` if a comparison fails (raised by
            :meth:`detailed_test`)
        """
        log.info(f"Detailed tests for parser: {self.classname()}")
        if verbose:
            log.debug(f"... regex string:\n{self.regex_str_for_debugging}")
        for test_string, expected_dict_list in tests:
            self.detailed_test(
                test_string, expected_dict_list, verbose=verbose
            )
        log.info("... OK")
# ----------------------------------------------------------------------------- # SimpleNumericalResultParser # ----------------------------------------------------------------------------- GROUP_NUMBER_WHOLE_EXPRESSION = 0 GROUP_NAME_QUANTITY = "quantity" GROUP_NAME_RELATION = "relation" GROUP_NAME_TENSE = "tense" GROUP_NAME_UNITS = "units" GROUP_NAME_VALUE = "value"
def make_simple_numeric_regex(
    quantity: str,
    units: str,
    value: str = SIGNED_FLOAT,
    tense_indicator: str = TENSE_INDICATOR,
    relation: str = RELATION,
    optional_results_ignorables: str = OPTIONAL_RESULTS_IGNORABLES,
    optional_ignorable_after_quantity: str = "",
    units_optional: bool = True,
) -> str:
    r"""
    Builds a regex string, with named groups, for simple numerical results.

    It copes with formats like:

    .. code-block:: none

        sodium 132 mM
        sodium (mM) 132
        sodium (132 mM)

    ... and lots more.

    Args:
        quantity:
            Regex for the quantity (e.g. for "sodium" or "Na").
        units:
            Regex for units.
        value:
            Regex for the numerical value (e.g. our ``SIGNED_FLOAT`` regex).
        tense_indicator:
            Regex for tense indicator.
        relation:
            Regex for mathematical relationship (e.g. equals, less than).
        optional_results_ignorables:
            Regex for junk to ignore in between the other things.
            Should include its own "optionality" (e.g. ``*``).
        optional_ignorable_after_quantity:
            Regex for additional things that can be ignored right after the
            quantity. Should include its own "optionality" (e.g. ``?``).
        units_optional:
            The units are allowed to be omitted. Usually true.

    Returns:
        the regex, as a string. Its groups are named, not numbered:

        .. code-block:: none

            0:          Whole thing; integer, as in: m.group(0)
            'quantity': Quantity
            'tense':    Tense (optional)
            'relation': Relation (optional)
            'value':    Value
            'units':    Units (optional)

        ... as used by :class:`SimpleNumericalResultParser`.

    The same group name appears in more than one alternative. Just to check
    re overlap:

    .. code-block:: python

        import regex

        s1 = r"(?P<quantity>Sodium)\s+(?P<value>\d+)\s+(?P<units>mM)"
        s2 = r"(?P<quantity>Sodium)\s+\((?P<units>mM)\)\s+(?P<value>\d+)"
        s = f"{s1}|{s2}"
        r = regex.compile(s)
        t1 = "Sodium 132 mM"
        t2 = "Sodium (mM) 127"
        m1 = r.match(t1)
        m2 = r.match(t2)

        print(m1.group(0))  # Sodium 132 mM
        print(m1.group("quantity"))  # Sodium
        print(m1.group("value"))  # 132
        print(m1.group("units"))  # mM

        print(m2.group(0))  # Sodium (mM) 127
        print(m2.group("quantity"))  # Sodium
        print(m2.group("value"))  # 127
        print(m2.group("units"))  # mM

    ... so it's fine in that multiple groups can have the same name.
    """

    def group(name: str, pattern: str, optional: bool = False) -> str:
        # Wrap a pattern in a named group, optionally followed by "?".
        suffix = "?" if optional else ""
        return f"(?P<{name}> {pattern} ){suffix}"

    def bracketed(inner: str) -> str:
        # Surround a pattern with brackets, tolerating inner whitespace.
        return rf"{LB} \s* {inner} \s* {RB}"

    # Plain building blocks
    group_quantity = group(GROUP_NAME_QUANTITY, quantity)
    group_value = group(GROUP_NAME_VALUE, value)
    group_units = group(GROUP_NAME_UNITS, units)

    # Optional building blocks
    group_tense_optional = group(
        GROUP_NAME_TENSE, tense_indicator, optional=True
    )
    group_relation_optional = group(
        GROUP_NAME_RELATION, relation, optional=True
    )

    # Bracketed building blocks
    group_value_bracketed = bracketed(group_value)
    group_units_bracketed = bracketed(group_units)
    value_units_all_bracketed = bracketed(rf"{group_value} \s+ {group_units}")

    # Whether the trailing units (third layout) may be omitted
    if units_optional:
        units_optional_descriptor = "optional"
        qmark_if_units_optional = "?"
    else:
        units_optional_descriptor = "required"
        qmark_if_units_optional = ""

    # NOTE(review): the summary comment inside the regex describes the second
    # layout as "(units value)", but value_units_all_bracketed matches
    # "(value units)". The regex text is left as it was.
    return rf"""
        # - Either: quantity [tense] [relation] value [units]
        #   or:     quantity (units value)
        #   or:     quantity (units) [tense] [relation] value

        # Quantity:
        {group_quantity}
        # Ignorable:
        {optional_ignorable_after_quantity}
        {optional_results_ignorables}
        (?:
            (?:
                # (units) ... [tense] ... [relation] ... value
                # Units, in brackets:
                {group_units_bracketed}
                # Tense indicator (optional):
                {group_tense_optional}
                # Ignorable:
                {optional_results_ignorables}
                # Relation (optional):
                {group_relation_optional}
                # Ignorable:
                {optional_results_ignorables}
                # Value:
                {group_value}
            )
            | (?:
                # (value units)
                {value_units_all_bracketed}
            )
            | (?:
                # [tense] ... [relation] ... value|(value) ... [units]
                # Tense indicator (optional):
                {group_tense_optional}
                # Ignorable:
                {optional_results_ignorables}
                # Relation (optional):
                {group_relation_optional}
                # Ignorable:
                {optional_results_ignorables}
                # Value or (value):
                (?:
                    {group_value}
                    | {group_value_bracketed}
                )
                # Ignorable:
                {optional_results_ignorables}
                # Units ({units_optional_descriptor}):
                {group_units}{qmark_if_units_optional}
            )
        )
    """
class SimpleNumericalResultParser(NumericalResultParser, ABC):
    """
    Base class for simple single-format numerical results. Use this when not
    only do you have a single variable to produce, but you have a single regex
    (in a standard format) that can produce it.
    """

    def __init__(
        self,
        nlpdef: NlpDefinition,
        cfg_processor_name: str,
        regex_str: str,
        variable: str,
        target_unit: str,
        units_to_factor: Dict[str, float],
        take_absolute: bool = False,
        commit: bool = False,
        debug: bool = False,
    ) -> None:
        """
        Args:

            nlpdef:
                :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`

            cfg_processor_name:
                config section suffix in the :ref:`NLP config file
                <nlp_config>`

            regex_str:
                Regular expression, in string format.

                This class operates with compiled regexes having this group
                format (capture groups in this sequence):

                - variable
                - tense_indicator
                - relation
                - value
                - units

            variable:
                used as the record value for ``variable_name``

            target_unit:
                fieldname used for the primary output quantity

            units_to_factor:
                dictionary, mapping

                - FROM (compiled regex for units)
                - TO EITHER a float (multiple) to multiply those units by, to
                  get the preferred unit
                - OR a function taking a text parameter and returning a float
                  value in preferred unit

                Any units present in the regex but absent from
                ``units_to_factor`` will lead the result to be ignored. For
                example, this allows you to ignore a relative neutrophil count
                ("neutrophils 2.2%") while detecting absolute neutrophil counts
                ("neutrophils 2.2"), or ignoring "docusate sodium 100mg" but
                detecting "sodium 140 mM".

            take_absolute:
                Convert negative values to positive ones? Typical text
                requiring this option might look like:

                .. code-block:: none

                    CRP-4
                    CRP-106
                    CRP -97
                    Blood results for today as follows: Na- 142, K-4.1, ...

                ... occurring in 23 out of 8054 hits for CRP of one test set in
                our data.

                For many quantities, we know that they cannot be negative, so
                this is just a notation rather than a minus sign. We have to
                account for it, or it'll distort our values. Preferable to
                account for it here rather than later; see manual.

            commit:
                force a COMMIT whenever we insert data? You should specify this
                in multiprocess mode, or you may get database deadlocks.

            debug:
                print the regex?

        """
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            variable=variable,
            target_unit=target_unit,
            regex_str_for_debugging=regex_str,
            commit=commit,
        )
        if debug:
            log.debug(f"Regex for {self.classname()}: {regex_str}")
        # Compile once here; parse() reuses the compiled forms for every text.
        self.compiled_regex = compile_regex(regex_str)
        self.units_to_factor = compile_regex_dict(units_to_factor)
        self.take_absolute = take_absolute

    def parse(
        self, text: str, debug: bool = False
    ) -> Generator[Tuple[str, Dict[str, Any]], None, None]:
        # docstring in superclass
        if not text:
            return
        for match in self.compiled_regex.finditer(text):
            whole_text = match.group(GROUP_NUMBER_WHOLE_EXPRESSION)
            quantity_text = match.group(GROUP_NAME_QUANTITY)
            tense_text = match.group(GROUP_NAME_TENSE)
            relation_text = match.group(GROUP_NAME_RELATION)
            value_text = match.group(GROUP_NAME_VALUE)
            units = match.group(GROUP_NAME_UNITS)

            # Work out the value in the target units, if we can: either the
            # units are recognized, or there are none and we are choosing to
            # assume the preferred units.
            value_in_target_units = None
            if units:
                matched_unit, multiple_or_fn = get_regex_dict_match(
                    units, self.units_to_factor
                )
                if not matched_unit:
                    # There is a unit, and the regex matched it, but it is
                    # not one of ours: a BAD unit. Skip this match entirely.
                    continue
                # A recognized unit maps to either a converter function or a
                # multiplier.
                value_in_target_units = (
                    multiple_or_fn(value_text)
                    if callable(multiple_or_fn)
                    else to_float(value_text) * multiple_or_fn
                )
            elif self.assume_preferred_unit:  # unit is None or empty
                value_in_target_units = to_float(value_text)

            if self.take_absolute and value_in_target_units is not None:
                value_in_target_units = abs(value_in_target_units)

            tense, relation = common_tense(tense_text, relation_text)

            result = {
                FN_VARIABLE_NAME: self.variable,
                FN_CONTENT: whole_text,
                FN_START: match.start(),
                FN_END: match.end(),
                FN_VARIABLE_TEXT: quantity_text,
                FN_RELATION_TEXT: relation_text,
                FN_RELATION: relation,
                FN_VALUE_TEXT: value_text,
                FN_UNITS: units,
                self.target_unit: value_in_target_units,
                FN_TENSE_TEXT: tense_text,
                FN_TENSE: tense,
            }
            if debug:
                log.debug(f"Match {match} for {text!r} -> {result}")
            yield self.tablename, result
# -----------------------------------------------------------------------------
# NumeratorOutOfDenominatorParser
# -----------------------------------------------------------------------------
class NumeratorOutOfDenominatorParser(BaseNlpParser, ABC):
    """
    Base class for X-out-of-Y numerical results, e.g. for MMSE/ACE.

    - Integer denominator, expected to be positive.
    - Otherwise similar to :class:`SimpleNumericalResultParser`.
    """

    def __init__(
        self,
        nlpdef: NlpDefinition,
        cfg_processor_name: str,
        variable_name: str,  # e.g. "MMSE"
        variable_regex_str: str,  # e.g. regex for MMSE
        expected_denominator: int,
        numerator_text_fieldname: str = "numerator_text",
        numerator_fieldname: str = "numerator",
        denominator_text_fieldname: str = "denominator_text",
        denominator_fieldname: str = "denominator",
        correct_numerator_fieldname: Optional[str] = None,  # default below
        take_absolute: bool = True,
        commit: bool = False,
        debug: bool = False,
    ) -> None:
        """
        Builds and compiles a regex with these numbered capture groups:

        1. variable (from ``variable_regex_str``, e.g. to find "MMSE")
        2. tense indicator (optional)
        3. relation (optional)
        4. numerator
        5. denominator (optional)

        Args:
            nlpdef:
                a :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`.
                May be ``None``, but only for debugging; the table name then
                defaults to the lower-case class name.
            cfg_processor_name:
                the suffix (name) of a CRATE NLP config file processor section
                (from which we may choose to get extra config information)
            variable_name:
                becomes the content of the ``variable_name`` output column
            variable_regex_str:
                regex for the text that states the variable
            expected_denominator:
                the integer value that's expected as the "out of Y" part. For
                example, an MMSE is out of 30; an ACE-III total is out of 100.
                If the text just says "MMSE 17", we will infer "17 out of 30";
                so, for the MMSE, ``expected_denominator`` should be 30.
            numerator_text_fieldname:
                field (column) name in which to store the text retrieved as the
                numerator
            numerator_fieldname:
                field (column) name in which to store the numerical value
                retrieved as the numerator
            denominator_text_fieldname:
                field (column) name in which to store the text retrieved as the
                denominator
            denominator_fieldname:
                field (column) name in which to store the numerical value
                retrieved as the denominator
            correct_numerator_fieldname:
                field (column) name in which we store the principal validated
                numerator. For example, if an MMSE processor sees "17" or
                "17/30", this field will end up containing 17; but if it sees
                "17/100", it will remain NULL. If not specified, defaults to
                ``out_of_<expected_denominator>``.
            take_absolute:
                Convert negative values to positive ones?
                As for :class:`SimpleNumericalResultParser`.
            commit:
                force a COMMIT whenever we insert data? You should specify this
                in multiprocess mode, or you may get database deadlocks.
            debug:
                print the regex?

        Raises:
            :exc:`AssertionError` if ``expected_denominator`` is not positive.
        """
        self.variable_name = variable_name
        assert expected_denominator > 0
        self.expected_denominator = expected_denominator
        self.numerator_text_fieldname = numerator_text_fieldname
        self.numerator_fieldname = numerator_fieldname
        self.denominator_text_fieldname = denominator_text_fieldname
        self.denominator_fieldname = denominator_fieldname
        self.correct_numerator_fieldname = (
            correct_numerator_fieldname or f"out_of_{expected_denominator}"
        )
        self.take_absolute = take_absolute

        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            commit=commit,
            friendly_name=variable_name,
        )
        if nlpdef is None:  # only None for debugging!
            self.tablename = self.classname().lower()
        else:
            self.tablename = self._cfgsection.opt_str(
                ProcessorConfigKeys.DESTTABLE, required=True
            )

        # parse() reads these capture groups by number.
        regex_str = rf"""
            ( {variable_regex_str} )           # 1. group for variable (thing being measured)
            {OPTIONAL_RESULTS_IGNORABLES}
            {SCORE}?                           # optional "score" or similar
            {OPTIONAL_RESULTS_IGNORABLES}
            ( {TENSE_INDICATOR} )?             # 2. optional group for tense indicator
            {OPTIONAL_RESULTS_IGNORABLES}
            ( {RELATION} )?                    # 3. optional group for relation
            {OPTIONAL_RESULTS_IGNORABLES}
            ( {SIGNED_FLOAT} )                 # 4. group for numerator
            (?:                                # optional "/ denominator"
                \s* {OUT_OF_SEPARATOR} \s*
                ( {IGNORESIGN_INTEGER} )       # 5. group for denominator
            )?
        """  # noqa
        if debug:
            log.debug(f"Regex for {self.classname()}: {regex_str}")
        self.regex_str = regex_str
        self.compiled_regex = compile_regex(regex_str)

    def dest_tables_columns(self) -> Dict[str, List[Column]]:
        # docstring in superclass
        return {
            self.tablename: [
                Column(
                    FN_VARIABLE_NAME,
                    SqlTypeDbIdentifier,
                    comment=HELP_VARIABLE_NAME,
                ),
                Column(FN_CONTENT, Text, comment=HELP_CONTENT),
                Column(FN_START, Integer, comment=HELP_START),
                Column(FN_END, Integer, comment=HELP_END),
                Column(FN_VARIABLE_TEXT, Text, comment=HELP_VARIABLE_TEXT),
                Column(
                    FN_RELATION_TEXT,
                    String(MAX_RELATION_TEXT_LENGTH),
                    comment=HELP_RELATION_TEXT,
                ),
                Column(
                    FN_RELATION,
                    String(MAX_RELATION_LENGTH),
                    comment=HELP_RELATION,
                ),
                Column(
                    self.numerator_text_fieldname,
                    String(MAX_VALUE_TEXT_LENGTH),
                    comment="Numerator, as text",
                ),
                Column(self.numerator_fieldname, Float, comment="Numerator"),
                Column(
                    self.denominator_text_fieldname,
                    String(MAX_VALUE_TEXT_LENGTH),
                    comment="Denominator, as text",
                ),
                Column(
                    self.denominator_fieldname, Float, comment="Denominator"
                ),
                Column(
                    self.correct_numerator_fieldname,
                    Float,
                    comment="Numerator, if denominator is as expected (units "
                    "are correct)",
                ),
                Column(
                    FN_TENSE_TEXT,
                    String(MAX_TENSE_TEXT_LENGTH),
                    comment=HELP_TENSE_TEXT,
                ),
                Column(FN_TENSE, String(MAX_TENSE_LENGTH), comment=HELP_TENSE),
            ]
        }

    def parse(
        self, text: str, debug: bool = False
    ) -> Generator[Tuple[str, Dict[str, Any]], None, None]:
        # docstring in superclass
        if not text:
            # Nothing to search (empty or None); as for
            # SimpleNumericalResultParser.parse().
            return
        for m in self.compiled_regex.finditer(text):
            startpos = m.start()
            endpos = m.end()
            matching_text = m.group(0)  # the whole thing
            variable_text = m.group(1)
            tense_text = m.group(2)
            relation_text = m.group(3)
            numerator_text = m.group(4)
            denominator_text = m.group(5)  # None if no "/ denominator" part

            if self.take_absolute:
                numerator = to_pos_float(numerator_text)
            else:
                numerator = to_float(numerator_text)
            denominator = to_float(denominator_text)

            if numerator is None:
                log.critical("bug - numerator is None, should be impossible")
                continue

            # The numerator is "correct" if it does not exceed the expected
            # denominator, and any stated denominator is the expected one.
            correct_numerator = None
            if denominator is None:
                if numerator <= self.expected_denominator:
                    correct_numerator = numerator
            else:
                if numerator <= denominator == self.expected_denominator:
                    correct_numerator = numerator

            tense, relation = common_tense(tense_text, relation_text)

            result = {
                FN_VARIABLE_NAME: self.variable_name,
                FN_CONTENT: matching_text,
                FN_START: startpos,
                FN_END: endpos,
                FN_VARIABLE_TEXT: variable_text,
                FN_RELATION_TEXT: relation_text,
                FN_RELATION: relation,
                self.numerator_text_fieldname: numerator_text,
                self.numerator_fieldname: numerator,
                self.denominator_text_fieldname: denominator_text,
                self.denominator_fieldname: denominator,
                self.correct_numerator_fieldname: correct_numerator,
                FN_TENSE_TEXT: tense_text,
                FN_TENSE: tense,
            }
            if debug:
                log.debug(f"Match {m} for {text!r} -> {result}")
            yield self.tablename, result

    def test_numerator_denominator_parser(
        self,
        test_expected_list: List[Tuple[str, List[Tuple[float, float]]]],
        verbose: bool = False,
    ) -> None:
        """
        Test the parser.

        Args:
            test_expected_list:
                list of tuples ``test_string, expected_values``. The parser
                will parse ``test_string`` and compare the result (each
                numerator/denominator pair) to ``expected_values``, which is a
                list of tuples ``numerator, denominator``, and can be an empty
                list.
            verbose:
                print the regex?

        Raises:
            :exc:`AssertionError` if a comparison fails
        """
        log.info(f"Testing parser: {self.classname()}")
        if verbose:
            log.debug(f"... regex:\n{self.regex_str}")
        for test_string, expected_values in test_expected_list:
            # Parse once; reuse the result for the comparison and for the
            # failure message.
            full_result = list(self.parse(test_string))
            actual_values = [
                (x[self.numerator_fieldname], x[self.denominator_fieldname])
                for _, x in full_result
            ]
            assert actual_values == expected_values, (
                f"Parser {self.classname()}: Expected {expected_values}, "
                f"got {actual_values}, when parsing {test_string!r}; "
                f"full result:\n{full_result!r}"
            )
        log.info("... OK")
# ============================================================================= # Validator base class (for testing regex NLP classes) # =============================================================================
class ValidatorBase(BaseNlpParser):
    r"""
    DO NOT USE DIRECTLY. Base class for **validating** regex parser
    sensitivity.

    The validator will find fields that refer to the variable, whether or not
    they meet the other criteria of the actual NLP processors (i.e. whether or
    not they contain a valid value). More explanation below.

    Suppose we're validating C-reactive protein (CRP). Key concepts:

    - source (true state of the world): Pr present, Ab absent
    - software decision: Y yes, N no
    - signal detection theory classification:

      - hit = Pr & Y = true positive
      - miss = Pr & N = false negative
      - false alarm = Ab & Y = false positive
      - correct rejection = Ab & N = true negative

    - common SDT metrics:

      - positive predictive value, PPV = P(Pr | Y) = precision (\*)
      - negative predictive value, NPV = P(Ab | N)
      - sensitivity = P(Y | Pr) = recall (\*) = true positive rate
      - specificity = P(N | Ab) = true negative rate

      (\*) common names used in the NLP context.

    - other common classifier metric:

      .. code-block:: none

        F_beta score = (1 + beta^2) * precision * recall /
                       ((beta^2 * precision) + recall)

      ... which measures performance when you value recall beta times as
      much as precision (thus, for example, the F1 score when beta = 1). See
      https://en.wikipedia.org/wiki/F1_score/

    Working from source to NLP, we can see there are a few types of "absent":

    - X. unselected database field containing text

      - Q. field contains "CRP", "C-reactive protein", etc.; something
        that a human (or as a proxy: a machine) would judge as containing a
        textual reference to CRP.

        - Pr. Present: a human would judge that a CRP value is present,
          e.g. "today her CRP is 7, which I am not concerned about."

          - H. Hit: software reports the value.
          - M. Miss: software misses the value.
            (Maybe: "his CRP was twenty-one".)

        - Ab1. Absent: reference to CRP, but no numerical information,
          e.g. "her CRP was normal".

          - FA1. False alarm: software reports a numerical value.
            (Maybe: "my CRP was 7 hours behind my boss's deadline")
          - CR1. Correct rejection: software doesn't report a value.

      - Ab2. field contains no reference to CRP at all.

        - FA2. False alarm: software reports a numerical value.
          (A bit harder to think of examples... but imagine a bug that
          gives a hit for "number of carp: 7". Or an alternative
          abbreviation meaning, e.g. "took part in a cardiac rehabilitation
          programme (CRP) 4 hours/week".)
        - CR2. Correct rejection: software doesn't report a value.

    From NLP backwards to source:

    - Y. Software says value present.

      - H. Hit: value is present.
      - FA. False alarm: value is absent.

    - N. Software says value absent.

      - CR. Correct rejection: value is absent.
      - M. Miss: value is present.

    The key metrics are:

    - precision = positive predictive value = P(Pr | Y)

      ... relatively easy to check; find all the "Y" records and check
      manually that they're correct.

    - sensitivity = recall = P(Y | Pr)

      ... Here, we want a sample that is enriched for "symptom actually
      present", for human reasons. For example, if 0.1% of text entries
      refer to CRP, then to assess 100 "Pr" samples we would have to
      review 100,000 text records, 99,900 of which are completely
      irrelevant. So we want an automated way of finding "Pr" records.
      That's what the validator classes do.

    You can enrich for "Pr" records with SQL, e.g.

    .. code-block:: sql

        SELECT textfield FROM sometable WHERE (
            textfield LIKE '%CRP%'
            OR textfield LIKE '%C-reactive protein%');

    or similar, but really we want the best "CRP detector" possible. That is
    probably to use a regex, either in SQL (... ``WHERE textfield REGEX
    'myregex'``) or using these validator classes. (The main NLP regexes don't
    distinguish between "CRP present, no valid value" and "CRP absent",
    because regexes either match or don't.)

    Each validator class implements the core variable-finding part of its
    corresponding NLP regex class, but without the value or units. For
    example, the CRP class looks for things like "CRP is 6" or "CRP 20 mg/L",
    whereas the CRP validator looks for things like "CRP".
    """

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        """
        Args:
            nlpdef:
                :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`
            cfg_processor_name:
                config section suffix in the :ref:`NLP config file
                <nlp_config>`
            commit:
                force a COMMIT whenever we insert data? You should specify
                this in multiprocess mode, or you may get database deadlocks.
        """
        # The subclass tells us which variable we validate and which regexes
        # detect it.
        variable_and_regexes = self.get_variablename_regexstrlist()
        validated_variable, regex_str_list = variable_and_regexes
        vname = f"{validated_variable}_validator"

        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            commit=commit,
            friendly_name=vname,
        )

        self.regex_str_list = regex_str_list  # for debugging only
        self.compiled_regex_list = [
            compile_regex(regex_str) for regex_str in regex_str_list
        ]
        self.variable = vname
        self.NAME = self.variable

        # nlpdef is only None for debugging, in which case the table name is
        # derived from the class name; otherwise it must come from the config.
        self.tablename = (
            self.classname().lower()
            if nlpdef is None
            else self._cfgsection.opt_str(
                ProcessorConfigKeys.DESTTABLE, required=True
            )
        )

    @classmethod
    @abstractmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        """
        To be overridden.

        Returns:
            tuple: ``(validated_variable_name, regex_str_list)``, where:

            regex_str_list:
                List of regular expressions, each in string format.

                This class operates with compiled regexes having this group
                format (capture groups in this sequence):

                - variable

            validated_variable:
                used to set our ``variable`` attribute and thus the value of
                the field ``variable_name`` in the NLP output; for example,
                if ``validated_variable == 'crp'``, then the
                ``variable_name`` field will be set to ``crp_validator``.
        """
        raise NotImplementedError

    def set_tablename(self, tablename: str) -> None:
        """
        Replace the destination table name (in case a friend class wants to
        override it).
        """
        self.tablename = tablename

    def dest_tables_columns(self) -> Dict[str, List[Column]]:
        # docstring in superclass
        columns = [
            Column(
                FN_VARIABLE_NAME,
                SqlTypeDbIdentifier,
                comment=HELP_VARIABLE_NAME,
            ),
            Column(FN_CONTENT, Text, comment=HELP_CONTENT),
            Column(FN_START, Integer, comment=HELP_START),
            Column(FN_END, Integer, comment=HELP_END),
        ]
        return {self.tablename: columns}

    def parse(
        self, text: str
    ) -> Generator[Tuple[str, Dict[str, Any]], None, None]:
        # docstring in superclass
        # Every match of every regex yields one row: the validator name, the
        # whole matched text, and where it was found.
        for pattern in self.compiled_regex_list:
            for found in pattern.finditer(text):
                row = {
                    FN_VARIABLE_NAME: self.variable,
                    FN_CONTENT: found.group(0),
                    FN_START: found.start(),
                    FN_END: found.end(),
                }
                yield self.tablename, row

    def test_validator(
        self, test_expected_list: List[Tuple[str, bool]], verbose: bool = False
    ) -> None:
        """
        Check, for each test string, whether at least one of our regexes
        matches it.

        The 'bool' part of test_expected_list is: should it match any?
        ... noting that "match anywhere" is the "search" function, whereas
        "match" matches at the beginning:

            https://docs.python.org/3/library/re.html#re.regex.match
        """
        log.info(f"Testing validator: {self.classname()}")
        if verbose:
            total = len(self.regex_str_list)
            for index, regex_str in enumerate(self.regex_str_list, start=1):
                log.debug(f"... regex #{index}/{total}: {regex_str}\n")
        for test_string, expected_match in test_expected_list:
            search_results = [
                pattern.search(test_string)
                for pattern in self.compiled_regex_list
            ]
            actual_match = any(search_results)
            assert actual_match == expected_match, (
                f"Validator {self.classname()}: Expected 'at least one regex "
                f"should match somewhere (search)' to be {expected_match}, "
                f"got {actual_match}, when parsing {test_string!r}; "
                f"full results = {search_results}"
            )
        log.info("... OK")

    def test(self, verbose: bool = False) -> None:
        # Nothing to test at this level; just say so.
        log.info(f"... no tests implemented for validator {self.classname()}")
# ============================================================================= # More general testing # =============================================================================
def learning_alternative_regex_groups() -> None:
    """
    Demonstration function, for learning about regex group syntax: logs the
    match object and its groups for a handful of short test strings.
    """
    regex_str = r""" ( (?: \s* (?: (a) | (b) | (c) | (d) ) \s* )* ( fish )? ) """
    compiled_regex = compile_regex(regex_str)
    test_strings = ("a", "b", "a c", "d", "e", "a fish", "c c c")
    for candidate in test_strings:
        result = compiled_regex.match(candidate)
        log.info(f"Match: {result}; groups: {result.groups()}")
    # So:
    #
    # - groups can overlap
    # - groups are ordered by their opening bracket
    # - matches are filled in neatly