Source code for crate_anon.nlp_manager.parse_biochemistry

"""
crate_anon/nlp_manager/parse_biochemistry.py

===============================================================================

    Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
    Created by Rudolf Cardinal (rnc1001@cam.ac.uk).

    This file is part of CRATE.

    CRATE is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    CRATE is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with CRATE. If not, see <https://www.gnu.org/licenses/>.

===============================================================================

**Python regex-based NLP processors for biochemistry data.**

All inherit from
:class:`crate_anon.nlp_manager.regex_parser.SimpleNumericalResultParser` and
are constructed with these arguments:

nlpdef:
    a :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`
cfg_processor_name:
    the name of a CRATE NLP config file section (from which we may
    choose to get extra config information)
commit:
    force a COMMIT whenever we insert data? You should specify this
    in multiprocess mode, or you may get database deadlocks.

"""

import logging
from typing import List, Optional, Tuple, Union

from crate_anon.common.regex_helpers import (
    regex_or,
    WORD_BOUNDARY,
)
from crate_anon.nlp_manager.nlp_definition import NlpDefinition
from crate_anon.nlp_manager.number import to_float
from crate_anon.nlp_manager.regex_parser import (
    make_simple_numeric_regex,
    OPTIONAL_POC,
    SimpleNumericalResultParser,
    ValidatorBase,
)
from crate_anon.nlp_manager.regex_read_codes import (
    ReadCodes,
    regex_components_from_read_codes,
)
from crate_anon.nlp_manager.regex_units import (
    factor_micromolar_from_mg_per_dl,
    factor_millimolar_from_mg_per_dl,
    G,
    G_PER_L,
    MG,
    MG_PER_DL,
    MG_PER_L,
    MICROEQ_PER_L,
    MICROMOLAR,
    micromolar_from_mg_per_dl,
    MICROMOLES_PER_L,
    MICROUNITS_PER_ML,
    MILLIEQ_PER_L,
    MILLIMOLAR,
    millimolar_from_mg_per_dl,
    MILLIMOLES_PER_L,
    MILLIMOLES_PER_MOL,
    MILLIUNITS_PER_L,
    PERCENT,
    UNITS_PER_L,
)

log = logging.getLogger(__name__)


# =============================================================================
# C-reactive protein (CRP)
# =============================================================================


class Crp(SimpleNumericalResultParser):
    """
    BIOCHEMISTRY.

    C-reactive protein (CRP). Default units are mg/L; also supports mg/dL.

    CRP units:

    - mg/L is commonest in the UK (or at least standard at Addenbrooke's,
      Hinchingbrooke, and Dundee);

    - values of <=6 mg/L or <10 mg/L are normal, and e.g. 70-250 mg/L in
      pneumonia.

    - Refs include:

      - https://www.ncbi.nlm.nih.gov/pubmed/7705110
      - https://emedicine.medscape.com/article/2086909-overview

    - 1 mg/dL = 10 mg/L, so normal in mg/dL is <=1 roughly.

    """

    # Free-text names for the quantity: "C-reactive protein",
    # "C reactive protein", or "CRP", as whole words.
    CRP_BASE = rf"""
        {WORD_BOUNDARY}
            (?: (?: C [-\s]+ reactive [\s]+ protein ) | CRP )
        {WORD_BOUNDARY}
    """
    # Any way of naming the quantity: the Read-code-derived alternatives for
    # plasma/serum CRP, or the free-text names above.
    CRP = regex_or(
        *regex_components_from_read_codes(
            ReadCodes.CRP_PLASMA,
            ReadCodes.CRP_SERUM,
        ),
        CRP_BASE,
        wrap_each_in_noncapture_group=True,
        wrap_result_in_noncapture_group=False,
    )
    # Full regex for "quantity ... value [units]", accepting either unit.
    REGEX = make_simple_numeric_regex(
        quantity=CRP,
        units=regex_or(MG_PER_DL, MG_PER_L),
        optional_ignorable_after_quantity=OPTIONAL_POC,
    )
    NAME = "CRP"  # passed to the parent class as the variable name
    PREFERRED_UNIT_COLUMN = "value_mg_L"  # passed as the target unit
    # Multiplication factors taking each recognised unit to the preferred one.
    UNIT_MAPPING = {
        MG_PER_L: 1,  # preferred unit
        MG_PER_DL: 10,  # 1 mg/dL -> 10 mg/L
    }

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        # see documentation above
        # All parser-specific settings come from the class constants; only
        # the three constructor arguments vary between instances.
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            regex_str=self.REGEX,
            variable=self.NAME,
            target_unit=self.PREFERRED_UNIT_COLUMN,
            units_to_factor=self.UNIT_MAPPING,
            commit=commit,
            # NOTE(review): presumably so that a hyphen separator, as in
            # "CRP-97" (tested below), is not read as a minus sign -- confirm
            # against the parent class.
            take_absolute=True,
        )

    def test(self, verbose: bool = False) -> None:
        # docstring in parent class
        # Each tuple is (source text, expected values in the preferred unit);
        # note that the mg/dL examples are expected back multiplied by 10.
        self.test_numerical_parser(
            [
                ("CRP", []),  # should fail; no values
                ("CRP 6", [6]),
                ("C-reactive protein 6", [6]),
                ("C reactive protein 6", [6]),
                ("CRP = 6", [6]),
                ("CRP 6 mg/dl", [60]),
                ("CRP: 6", [6]),
                ("CRP equals 6", [6]),
                ("CRP is equal to 6", [6]),
                ("CRP <1", [1]),
                ("CRP less than 1", [1]),
                ("CRP <1 mg/dl", [10]),
                ("CRP >250", [250]),
                ("CRP more than 1", [1]),
                ("CRP greater than 1", [1]),
                ("CRP >250 mg/dl", [2500]),
                ("CRP was 62", [62]),
                ("CRP was 62 mg/l", [62]),
                ("CRP was <1", [1]),
                ("CRP is 19.2", [19.2]),
                ("CRP is >250", [250]),
                ("CRP is 19 mg dl-1", [190]),
                ("CRP is 19 mg dl -1", [190]),
                ("CRP 1.9 mg/L", [1.9]),
                ("CRP-97", [97]),
                ("CRP 1.9 mg L-1", [1.9]),
                ("CRP        |       1.9 (H)      | mg/L", [1.9]),
                ("Plasma C-reactive protein level (XE2dy) 45 mg/L", [45]),
                ("Serum C reactive protein level (XaINL) 45 mg/L", [45]),
                ("CRP (mg/L) 62", [62]),
            ],
            verbose=verbose,
        )


class CrpValidator(ValidatorBase):
    """
    Validator for Crp (see help for explanation).
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        # Pair the parser's variable name with the regex that matches the
        # quantity name alone (no value or units).
        return Crp.NAME, [Crp.CRP]


# =============================================================================
# Sodium (Na)
# =============================================================================
# ... handy to check approximately expected distribution of results!


class Sodium(SimpleNumericalResultParser):
    """
    BIOCHEMISTRY (U&E).

    Sodium (Na), in mM.
    """

    # Free-text names for the quantity: "Na" or "Sodium", as whole words.
    SODIUM_BASE = rf"""
        {WORD_BOUNDARY} (?: Na | Sodium ) {WORD_BOUNDARY}
    """
    # Any way of naming the quantity: Read-code-derived alternatives first,
    # then the free-text names above.
    SODIUM = regex_or(
        *regex_components_from_read_codes(
            ReadCodes.SODIUM,
            ReadCodes.SODIUM_BLOOD,
            ReadCodes.SODIUM_PLASMA,
            ReadCodes.SODIUM_SERUM,
        ),
        SODIUM_BASE,
        wrap_each_in_noncapture_group=True,
        wrap_result_in_noncapture_group=False,
    )
    # MG is recognised as a unit here but deliberately left out of
    # UNIT_MAPPING below; the tests expect a dose such as
    # "docusate sodium 100mg" to produce no result.
    REGEX = make_simple_numeric_regex(
        quantity=SODIUM,
        units=regex_or(
            MILLIMOLAR,  # good
            MILLIMOLES_PER_L,  # good
            MILLIEQ_PER_L,  # good
            MG,  # bad
        ),
        optional_ignorable_after_quantity=OPTIONAL_POC,
    )
    NAME = "Sodium"  # passed to the parent class as the variable name
    PREFERRED_UNIT_COLUMN = "value_mmol_L"  # passed as the target unit
    # Multiplication factors taking each accepted unit to the preferred one.
    UNIT_MAPPING = {
        MILLIMOLAR: 1,  # preferred unit
        MILLIMOLES_PER_L: 1,
        MILLIEQ_PER_L: 1,
        # but not MG
    }

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        # see documentation above
        # All parser-specific settings come from the class constants.
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            regex_str=self.REGEX,
            variable=self.NAME,
            target_unit=self.PREFERRED_UNIT_COLUMN,
            units_to_factor=self.UNIT_MAPPING,
            commit=commit,
            # NOTE(review): presumably so that the hyphen in "Na-145"
            # (tested below) is not read as a minus sign -- confirm against
            # the parent class.
            take_absolute=True,
        )

    def test(self, verbose: bool = False) -> None:
        # docstring in parent class
        # Each tuple is (source text, expected values in mmol/L). Several
        # cases are negative controls: drug doses and people's initials.
        self.test_numerical_parser(
            [
                ("Na", []),  # should fail; no values
                ("Na 120", [120]),
                ("sodium 153", [153]),
                ("Na 135 mEq/L", [135]),
                ("Na 139 mM", [139]),
                ("docusate sodium 100mg", []),
                (
                    "Present: Nicola Adams (NA). 1.0 Minutes of last meeting",
                    [],
                ),
                ("Present: Nicola Adams (NA) 1.0 Minutes of last meeting", []),
                ("Na (H) 145 mM", [145]),
                ("Na (*) 145 mM", [145]),
                ("Na (X) 145 mM", []),
                ("blah (Na) 145 mM", []),
                ("Na (145) something", [145]),
                ("Na (145 mM), others", [145]),
                ("Na-145", [145]),
                ("Sodium level (X771T) 145", [145]),
                ("Blood sodium level (XaDva) 145", [145]),
                ("Plasma sodium level (XaIRf) 145", [145]),
                ("Serum sodium level (XE2q0) 145", [145]),
                ("Serum sodium level (mmol/L) 137", [137]),
            ],
            verbose=verbose,
        )


class SodiumValidator(ValidatorBase):
    """
    Validator for Sodium (see help for explanation).
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        # Pair the parser's variable name with the regex that matches the
        # quantity name alone (no value or units).
        return Sodium.NAME, [Sodium.SODIUM]


# =============================================================================
# Potassium (K)
# =============================================================================


class Potassium(SimpleNumericalResultParser):
    """
    BIOCHEMISTRY (U&E).

    Potassium (K), in mM.
    """

    # Free-text names for the quantity: "K" or "Potassium", as whole words.
    POTASSIUM_BASE = rf"""
        {WORD_BOUNDARY} (?: K | Potassium ) {WORD_BOUNDARY}
    """
    # Any way of naming the quantity.
    # NOTE(review): unlike the sibling parsers in this module, the free-text
    # alternative is listed BEFORE the Read-code alternatives here. The order
    # is left untouched; confirm whether the difference is intentional.
    POTASSIUM = regex_or(
        POTASSIUM_BASE,
        *regex_components_from_read_codes(
            ReadCodes.POTASSIUM,
            ReadCodes.POTASSIUM_BLOOD,
            ReadCodes.POTASSIUM_PLASMA,
            ReadCodes.POTASSIUM_SERUM,
        ),
        wrap_each_in_noncapture_group=True,
        wrap_result_in_noncapture_group=False,
    )
    # MG is recognised as a unit here but deliberately left out of
    # UNIT_MAPPING below; the tests expect a dose such as
    # "losartan potassium 50mg" to produce no result.
    REGEX = make_simple_numeric_regex(
        quantity=POTASSIUM,
        units=regex_or(
            MILLIMOLAR,  # good
            MILLIMOLES_PER_L,  # good
            MILLIEQ_PER_L,  # good
            MG,  # bad
        ),
        optional_ignorable_after_quantity=OPTIONAL_POC,
    )
    NAME = "Potassium"  # passed to the parent class as the variable name
    PREFERRED_UNIT_COLUMN = "value_mmol_L"  # passed as the target unit
    # Multiplication factors taking each accepted unit to the preferred one.
    UNIT_MAPPING = {
        MILLIMOLAR: 1,  # preferred unit
        MILLIMOLES_PER_L: 1,
        MILLIEQ_PER_L: 1,
        # but not MG
    }

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        # see documentation above
        # All parser-specific settings come from the class constants.
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            regex_str=self.REGEX,
            variable=self.NAME,
            target_unit=self.PREFERRED_UNIT_COLUMN,
            units_to_factor=self.UNIT_MAPPING,
            commit=commit,
            # NOTE(review): presumably so that the hyphen in "K-3.2" (tested
            # below) is not read as a minus sign -- confirm against the
            # parent class.
            take_absolute=True,
        )

    def test(self, verbose: bool = False) -> None:
        # docstring in parent class
        # Each tuple is (source text, expected values in mmol/L). Several
        # cases are negative controls: drug doses, people's initials, and a
        # Read code that does not belong to the accompanying description.
        self.test_numerical_parser(
            [
                ("K", []),  # should fail; no values
                ("K 4", [4]),
                ("Potassium 4.3", [4.3]),
                ("K 4.5 mEq/L", [4.5]),
                ("K 4.5 mM", [4.5]),
                ("losartan potassium 50mg", []),
                ("Present: Kerry Smith (K). 1.0 Minutes of last meeting", []),
                ("Present: Kerry Smith (K) 1.0 Minutes of last meeting", []),
                ("K (H) 5.6 mM", [5.6]),
                ("K (*) 5.6 mM", [5.6]),
                ("K (X) 5.6 mM", []),
                ("blah (K) 5.6 mM", []),
                ("K (5.6) something", [5.6]),
                ("K (5.6 mM), others", [5.6]),
                ("K-3.2", [3.2]),
                ("Potassium level (X771S) 3.2", [3.2]),
                ("Blood potassium level (XaDvZ) 3.2", [3.2]),
                ("Plasma potassium level (XaIRl) 3.2", [3.2]),
                ("Serum potassium level (XE2pz) 3.2", [3.2]),
                ("Serum potassium level (XaIRl) 3.2", []),  # wrong code
            ],
            verbose=verbose,
        )


class PotassiumValidator(ValidatorBase):
    """
    Validator for Potassium (see help for explanation).
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        # Pair the parser's variable name with the regex that matches the
        # quantity name alone (no value or units).
        return Potassium.NAME, [Potassium.POTASSIUM]


# =============================================================================
# Urea
# =============================================================================


class Urea(SimpleNumericalResultParser):
    """
    BIOCHEMISTRY (U&E).

    Urea, in mM.
    """

    # Free-text names for the quantity: "U", "Ur", or "Urea", as whole words.
    UREA_BASE = rf"""
        {WORD_BOUNDARY} U(?:r(?:ea)?)? {WORD_BOUNDARY}
    """
    # Any way of naming the quantity: Read-code-derived alternatives first,
    # then the free-text names above.
    UREA = regex_or(
        *regex_components_from_read_codes(
            ReadCodes.UREA_BLOOD,
            ReadCodes.UREA_PLASMA,
            ReadCodes.UREA_SERUM,
        ),
        UREA_BASE,
        wrap_each_in_noncapture_group=True,
        wrap_result_in_noncapture_group=False,
    )
    # MG is recognised as a unit here but deliberately left out of
    # UNIT_MAPPING below, so values given in mg are not accepted.
    REGEX = make_simple_numeric_regex(
        quantity=UREA,
        units=regex_or(
            MILLIMOLAR,  # good
            MILLIMOLES_PER_L,  # good
            MILLIEQ_PER_L,  # good
            MG,  # bad
        ),
        optional_ignorable_after_quantity=OPTIONAL_POC,
    )
    NAME = "Urea"  # passed to the parent class as the variable name
    PREFERRED_UNIT_COLUMN = "value_mmol_L"  # passed as the target unit
    # Multiplication factors taking each accepted unit to the preferred one.
    UNIT_MAPPING = {
        MILLIMOLAR: 1,  # preferred unit
        MILLIMOLES_PER_L: 1,
        MILLIEQ_PER_L: 1,
        # but not MG
    }

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        # see documentation above
        # All parser-specific settings come from the class constants.
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            regex_str=self.REGEX,
            variable=self.NAME,
            target_unit=self.PREFERRED_UNIT_COLUMN,
            units_to_factor=self.UNIT_MAPPING,
            commit=commit,
            # NOTE(review): presumably so that the hyphen in "U-3.2" (tested
            # below) is not read as a minus sign -- confirm against the
            # parent class.
            take_absolute=True,
        )

    def test(self, verbose: bool = False) -> None:
        # docstring in parent class
        # Each tuple is (source text, expected values in mmol/L). The
        # "Present: ..." cases are negative controls using people's initials.
        self.test_numerical_parser(
            [
                ("Urea", []),  # should fail; no values
                ("U 4", [4]),
                ("Urea 4.3", [4.3]),
                ("U 4.5 mEq/L", [4.5]),
                ("Ur 4.5 mM", [4.5]),
                (
                    "Present: Ursula Rogers (U). 1.0 Minutes of last meeting",
                    [],
                ),
                (
                    "Present: Ursula Rogers (UR) 1.0 Minutes of last meeting",
                    [],
                ),
                ("U (H) 5.6 mM", [5.6]),
                ("Ur (*) 5.6 mM", [5.6]),
                ("Urea (X) 5.6 mM", []),
                ("blah (U) 5.6 mM", []),
                ("Urea (5.6) something", [5.6]),
                ("Urea (5.6 mM), others", [5.6]),
                ("U-3.2", [3.2]),
                ("Blood urea (X771P) 3.2", [3.2]),
                ("Plasma urea level (XaDvl) 3.2", [3.2]),
                ("Serum urea level (XM0lt) 3.2", [3.2]),
            ],
            verbose=verbose,
        )


class UreaValidator(ValidatorBase):
    """
    Validator for Urea (see help for explanation).
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        # Pair the parser's variable name with the regex that matches the
        # quantity name alone (no value or units).
        return Urea.NAME, [Urea.UREA]


# =============================================================================
# Creatinine
# =============================================================================


class Creatinine(SimpleNumericalResultParser):
    """
    BIOCHEMISTRY (U&E).

    Creatinine. Default units are micromolar (SI); also supports mg/dL.
    """

    CREATININE_BASE = rf"""
        {WORD_BOUNDARY} Cr(?:eat(?:inine)?)? {WORD_BOUNDARY}
    """
    # ... Cr, Creat, Creatinine
    # Possible that "creatine" is present as a typo... but it's wrong...

    # Any way of naming the quantity: Read-code-derived alternatives first,
    # then the free-text names above.
    CREATININE = regex_or(
        *regex_components_from_read_codes(
            ReadCodes.CREATININE,
            ReadCodes.CREATININE_PLASMA,
            ReadCodes.CREATININE_PLASMA_CORRECTED,
            ReadCodes.CREATININE_SERUM,
            ReadCodes.CREATININE_SERUM_CORRECTED,
        ),
        CREATININE_BASE,
        wrap_each_in_noncapture_group=True,
        wrap_result_in_noncapture_group=False,
    )
    # MG is recognised as a unit here but deliberately left out of
    # UNIT_MAPPING below; the tests expect "creatinine 3 mg" to produce no
    # result, whereas "creatinine 3 mg/dl" is converted.
    REGEX = make_simple_numeric_regex(
        quantity=CREATININE,
        units=regex_or(
            MICROMOLAR,  # good
            MICROMOLES_PER_L,  # good
            MICROEQ_PER_L,  # good
            MG_PER_DL,  # good but needs conversion
            # ... note that MG_PER_DL must precede MG
            MG,  # bad
        ),
        optional_ignorable_after_quantity=OPTIONAL_POC,
    )
    CREATININE_MOLECULAR_MASS_G_PER_MOL = 113.12
    # ... https://pubchem.ncbi.nlm.nih.gov/compound/creatinine
    NAME = "Creatinine"  # passed to the parent class as the variable name
    PREFERRED_UNIT_COLUMN = "value_micromol_L"  # passed as the target unit
    # Multiplication factors taking each accepted unit to the preferred one;
    # the mg/dL factor is derived from the molecular mass above.
    UNIT_MAPPING = {
        MICROMOLAR: 1,  # preferred unit
        MICROMOLES_PER_L: 1,
        MICROEQ_PER_L: 1,
        MG_PER_DL: factor_micromolar_from_mg_per_dl(
            CREATININE_MOLECULAR_MASS_G_PER_MOL
        ),
        # but not MG
    }

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        # see documentation above
        # All parser-specific settings come from the class constants.
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            regex_str=self.REGEX,
            variable=self.NAME,
            target_unit=self.PREFERRED_UNIT_COLUMN,
            units_to_factor=self.UNIT_MAPPING,
            commit=commit,
            # NOTE(review): presumably so that the hyphen in "Cr-75" (tested
            # below) is not read as a minus sign -- confirm against the
            # parent class.
            take_absolute=True,
        )

    def test(self, verbose: bool = False) -> None:
        # docstring in parent class
        def convert(mg_dl: float) -> float:
            # Convert mg/dl to μM
            return micromolar_from_mg_per_dl(
                mg_dl, self.CREATININE_MOLECULAR_MASS_G_PER_MOL
            )

        # Each tuple is (source text, expected values in micromol/L); the
        # mg/dL case uses convert() so the expectation tracks the same
        # molecular mass as UNIT_MAPPING.
        self.test_numerical_parser(
            [
                ("Creatinine", []),  # should fail; no values
                ("Cr 50", [50]),
                ("Creat 125.5", [125.5]),
                ("Creat 75 uEq/L", [75]),
                ("Cr 75 μM", [75]),
                (
                    "Present: Chloe Rogers (CR). 1.0 Minutes of last meeting",
                    [],
                ),
                ("Creatinine (H) 200 uM", [200]),
                ("Creatinine (*) 200 micromol/L", [200]),
                ("Creatinine (X) 200 uM", []),
                ("Creatinine 200 micromolar", [200]),
                ("Creatinine 200 micromolar, others", [200]),
                ("blah (creat) 5.6 uM", []),
                ("Creatinine (200) something", [200]),
                ("Creatinine (200 micromolar)", [200]),
                ("Creatinine (200 micromolar), others", [200]),
                ("Cr-75", [75]),
                ("creatinine 3 mg/dl", [convert(3)]),
                ("creatinine 3 mg", []),
                ("Creatinine level (X771Q) 75", [75]),
                ("Plasma creatinine level (XaETQ) 75", [75]),
                ("Cor plasma creatinine level (XaERX) 75", [75]),
                ("Serum creatinine level (XE2q5) 75", [75]),
                ("Cor serum creatinine level (XaERc) 75", [75]),
            ],
            verbose=verbose,
        )


class CreatinineValidator(ValidatorBase):
    """
    Validator for Creatinine (see help for explanation).
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        # Pair the parser's variable name with the regex that matches the
        # quantity name alone (no value or units).
        return Creatinine.NAME, [Creatinine.CREATININE]


# =============================================================================
# Lithium (Li)
# =============================================================================


class Lithium(SimpleNumericalResultParser):
    """
    BIOCHEMISTRY (THERAPEUTIC DRUG MONITORING).

    Lithium (Li) levels (for blood tests, not doses), in mM.
    """

    # Free-text names for the quantity: "Li" or "Lithium", as whole words.
    LITHIUM_BASE = rf"""
        {WORD_BOUNDARY} Li(?:thium)? {WORD_BOUNDARY}
    """
    # Any way of naming the quantity: the Read-code-derived alternative for
    # serum lithium, then the free-text names above.
    LITHIUM = regex_or(
        *regex_components_from_read_codes(
            ReadCodes.LITHIUM_SERUM,
        ),
        LITHIUM_BASE,
        wrap_each_in_noncapture_group=True,
        wrap_result_in_noncapture_group=False,
    )
    # MG and G are recognised as units here but deliberately left out of
    # UNIT_MAPPING below; the tests expect doses such as "lithium 1200 mg"
    # or "li 1.2 g" to produce no result.
    REGEX = make_simple_numeric_regex(
        quantity=LITHIUM,
        units=regex_or(
            MILLIMOLAR,  # good
            MILLIMOLES_PER_L,  # good
            MILLIEQ_PER_L,  # good
            MG,  # bad
            G,  # bad
        ),
    )
    NAME = "Lithium"  # passed to the parent class as the variable name
    PREFERRED_UNIT_COLUMN = "value_mmol_L"  # passed as the target unit
    # Multiplication factors taking each accepted unit to the preferred one.
    UNIT_MAPPING = {
        MILLIMOLAR: 1,  # preferred unit
        MILLIMOLES_PER_L: 1,
        MILLIEQ_PER_L: 1,
        # but not MG
        # and not G
    }

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        # see documentation above
        # All parser-specific settings come from the class constants.
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            regex_str=self.REGEX,
            variable=self.NAME,
            target_unit=self.PREFERRED_UNIT_COLUMN,
            units_to_factor=self.UNIT_MAPPING,
            commit=commit,
            # NOTE(review): presumably so that the hyphen in "Li-0.4" (tested
            # below) is not read as a minus sign -- confirm against the
            # parent class.
            take_absolute=True,
        )

    def test(self, verbose: bool = False) -> None:
        # docstring in parent class
        # Each tuple is (source text, expected values in mmol/L). Doses in
        # mg or g and people's initials are negative controls.
        self.test_numerical_parser(
            [
                ("Li", []),  # should fail; no values
                ("Li 0.4", [0.4]),
                ("li 1200 mg", []),  # that's a dose
                ("li 1.2 g", []),  # that's a dose
                ("lithium 1200 mg", []),  # that's a dose
                ("lithium 153", [153]),  # an unhappy patient...
                ("Li 135 mEq/L", [135]),
                ("Li 139 mM", [139]),
                ("lithium carbonate 800mg", []),
                (
                    "Present: Linda Ingles (LI). 1.0 Minutes of last meeting",
                    [],
                ),
                ("Present: Linda Ingles (LI) 1.0 Minutes of last meeting", []),
                ("Li (H) 1.3 mM", [1.3]),
                ("Li (*) 1.3 mM", [1.3]),
                ("Li (X) 1.3 mM", []),
                ("blah (Li) 1.2 mM", []),
                ("Li (1.3) something", [1.3]),
                ("Li (0.4 mM), others", [0.4]),
                ("Li-0.4", [0.4]),
                ("Serum lithium level (XE25g) 0.4", [0.4]),
            ],
            verbose=verbose,
        )


class LithiumValidator(ValidatorBase):
    """
    Validator for Lithium (see help for explanation).
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        # Pair the parser's variable name with the regex that matches the
        # quantity name alone (no value or units).
        return Lithium.NAME, [Lithium.LITHIUM]


# =============================================================================
# Thyroid-stimulating hormone (TSH)
# =============================================================================


class Tsh(SimpleNumericalResultParser):
    """
    BIOCHEMISTRY (ENDOCRINOLOGY).

    Thyroid-stimulating hormone (TSH), in mIU/L (or μIU/mL).
    """

    # Free-text names for the quantity: "TSH", or "thyroid stimulating
    # hormone" with hyphens and/or whitespace between the words.
    TSH_BASE = rf"""
        {WORD_BOUNDARY}
            (?: TSH | thyroid [-\s]+ stimulating [-\s]+ hormone )
        {WORD_BOUNDARY}
    """
    # Any way of naming the quantity: Read-code-derived alternatives
    # (plasma/serum, including the timed samples) first, then the free-text
    # names above.
    TSH = regex_or(
        *regex_components_from_read_codes(
            ReadCodes.TSH_PLASMA,
            ReadCodes.TSH_PLASMA_30_MIN,
            ReadCodes.TSH_PLASMA_60_MIN,
            ReadCodes.TSH_PLASMA_90_MIN,
            ReadCodes.TSH_PLASMA_120_MIN,
            ReadCodes.TSH_PLASMA_150_MIN,
            ReadCodes.TSH_SERUM,
            ReadCodes.TSH_SERUM_60_MIN,
            ReadCodes.TSH_SERUM_90_MIN,
            ReadCodes.TSH_SERUM_120_MIN,
            ReadCodes.TSH_SERUM_150_MIN,
        ),
        TSH_BASE,
        wrap_each_in_noncapture_group=True,
        wrap_result_in_noncapture_group=False,
    )
    REGEX = make_simple_numeric_regex(
        quantity=TSH,
        units=regex_or(
            MILLIUNITS_PER_L,  # good
            MICROUNITS_PER_ML,  # good
        ),
    )
    NAME = "TSH"  # passed to the parent class as the variable name
    PREFERRED_UNIT_COLUMN = "value_mU_L"  # passed as the target unit
    # Both units are numerically identical (1 μU/mL = 1 mU/L), hence the
    # factor of 1 for each.
    UNIT_MAPPING = {
        MILLIUNITS_PER_L: 1,  # preferred unit
        MICROUNITS_PER_ML: 1,
    }

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        # see documentation above
        # All parser-specific settings come from the class constants.
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            regex_str=self.REGEX,
            variable=self.NAME,
            target_unit=self.PREFERRED_UNIT_COLUMN,
            units_to_factor=self.UNIT_MAPPING,
            commit=commit,
            # NOTE(review): presumably so that the hyphen in "TSH-2.3"
            # (tested below) is not read as a minus sign -- confirm against
            # the parent class.
            take_absolute=True,
        )

    def test(self, verbose: bool = False) -> None:
        # docstring in superclass
        # Each tuple is (source text, expected values in mU/L).
        self.test_numerical_parser(
            [
                ("TSH", []),  # should fail; no values
                ("TSH 1.5", [1.5]),
                ("thyroid-stimulating hormone 1.5", [1.5]),
                ("TSH 1.5 mU/L", [1.5]),
                ("TSH 1.5 mIU/L", [1.5]),
                ("TSH 1.5 μU/mL", [1.5]),
                ("TSH 1.5 μIU/mL", [1.5]),
                ("TSH 1.5 uU/mL", [1.5]),
                ("TSH 1.5 uIU/mL", [1.5]),
                ("TSH-2.3", [2.3]),
                ("Plasma TSH level (XaELW) 2.3", [2.3]),
                ("Serum TSH level (XaELV) 2.3", [2.3]),
                # etc.; not all Read codes tested here
            ],
            verbose=verbose,
        )


class TshValidator(ValidatorBase):
    """
    Validator for TSH (see help for explanation).
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        # Pair the parser's variable name with the regex that matches the
        # quantity name alone (no value or units).
        return Tsh.NAME, [Tsh.TSH]


# =============================================================================
# Alkaline phosphatase
# =============================================================================


class AlkPhos(SimpleNumericalResultParser):
    """
    BIOCHEMISTRY (LFTs/BFTs).

    Alkaline phosphatase (ALP, AlkP, AlkPhos). Units are U/L.
    """

    # Free-text names for the quantity. Two families:
    # - "ALP" / "ALkP", optionally followed by a full stop;
    # - "alk" / "alk." / "alkaline", then optional hyphens/whitespace, then
    #   "phos" / "phos." / "phosphatase".
    # The trailing boundary sits inside each alternative because a literal
    # full stop may end the abbreviated forms.
    ALKP_BASE = rf"""
        {WORD_BOUNDARY}
        (?:
            (?: ALk?P (?:\. | {WORD_BOUNDARY}) ) |
            (?:
                alk(?:aline | \.)?
                [-\s]*
                phos(?:phatase{WORD_BOUNDARY} | \. | {WORD_BOUNDARY})
            )
        )
    """
    # Any way of naming the quantity: Read-code-derived alternatives first,
    # then the free-text names above.
    ALKP = regex_or(
        *regex_components_from_read_codes(
            ReadCodes.ALKPHOS_PLASMA,
            ReadCodes.ALKPHOS_SERUM,
            ReadCodes.ALKPHOS,  # least specific; at end
        ),
        ALKP_BASE,
        wrap_each_in_noncapture_group=True,
        wrap_result_in_noncapture_group=False,
    )
    REGEX = make_simple_numeric_regex(quantity=ALKP, units=UNITS_PER_L)
    NAME = "AlkPhos"  # passed to the parent class as the variable name
    PREFERRED_UNIT_COLUMN = "value_U_L"  # passed as the target unit
    UNIT_MAPPING = {UNITS_PER_L: 1}  # preferred unit

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        # see documentation above
        # All parser-specific settings come from the class constants.
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            regex_str=self.REGEX,
            variable=self.NAME,
            target_unit=self.PREFERRED_UNIT_COLUMN,
            units_to_factor=self.UNIT_MAPPING,
            commit=commit,
            # NOTE(review): presumably so that the hyphen in "ALP-55" (tested
            # below) is not read as a minus sign -- confirm against the
            # parent class.
            take_absolute=True,
        )

    def test(self, verbose: bool = False) -> None:
        # docstring in superclass
        # Each tuple is (source text, expected values in U/L).
        self.test_numerical_parser(
            [
                ("ALP", []),  # should fail; no values
                ("was 7", []),  # no quantity
                ("ALP 55", [55]),
                ("Alkaline-Phosphatase 55", [55]),
                ("Alkaline Phosphatase    55 U/L ", [55]),
                ("ALP 55 U/L", [55]),
                ("ALP-55", [55]),
                ("AlkP 55", [55]),
                ("alk.phos. 55", [55]),
                ("alk. phos. 55", [55]),
                ("alkphos 55", [55]),
                ("Alkaline phosphatase level (44F3.) 55", [55]),
                (
                    "Alkaline phosphatase level (44F3x) 55",
                    [],
                ),  # test "." in regex
                ("Plasma alkaline phosphatase level (XaIRj) 55", [55]),
                ("Serum alkaline phosphatase level (XE2px) 55", [55]),
            ],
            verbose=verbose,
        )


class AlkPhosValidator(ValidatorBase):
    """
    Validator for AlkPhos (see help for explanation).
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        # Pair the parser's variable name with the regex that matches the
        # quantity name alone (no value or units).
        return AlkPhos.NAME, [AlkPhos.ALKP]


# =============================================================================
# Alanine aminotransferase (ALT)
# =============================================================================


class ALT(SimpleNumericalResultParser):
    """
    BIOCHEMISTRY (LFTs).

    Alanine aminotransferase (ALT), a.k.a. alanine transaminase (ALT).
    Units are U/L.

    A.k.a. serum glutamate-pyruvate transaminase (SGPT), or serum
    glutamate-pyruvic transaminase (SGPT), but not a.k.a. those in recent
    memory!
    """

    # Free-text names for the quantity: "ALT", "alanine aminotransferase",
    # or "alanine transaminase" (hyphen or whitespace between the words).
    ALT_BASE = rf"""
        {WORD_BOUNDARY}
        (?:
            ALT |
            alanine [-\s]+ (?: aminotransferase | transaminase )
        )
        {WORD_BOUNDARY}
    """
    # Any way of naming the quantity: the Read-code-derived alternative
    # first, then the free-text names above.
    # Note that this class attribute shares its name with the class itself;
    # inside the class body, "ALT" below refers to this regex string.
    ALT = regex_or(
        *regex_components_from_read_codes(
            ReadCodes.ALT,
        ),
        ALT_BASE,
        wrap_each_in_noncapture_group=True,
        wrap_result_in_noncapture_group=False,
    )
    REGEX = make_simple_numeric_regex(quantity=ALT, units=UNITS_PER_L)
    NAME = "ALT"  # passed to the parent class as the variable name
    PREFERRED_UNIT_COLUMN = "value_U_L"  # passed as the target unit
    UNIT_MAPPING = {UNITS_PER_L: 1}  # preferred unit

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        # see documentation above
        # All parser-specific settings come from the class constants.
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            regex_str=self.REGEX,
            variable=self.NAME,
            target_unit=self.PREFERRED_UNIT_COLUMN,
            units_to_factor=self.UNIT_MAPPING,
            commit=commit,
            # NOTE(review): presumably so that the hyphen in "ALT-55" (tested
            # below) is not read as a minus sign -- confirm against the
            # parent class.
            take_absolute=True,
        )

    def test(self, verbose: bool = False) -> None:
        # docstring in superclass
        # Each tuple is (source text, expected values in U/L).
        self.test_numerical_parser(
            [
                ("ALT", []),  # should fail; no values
                ("was 7", []),  # no quantity
                ("ALT 55", [55]),
                ("alanine-aminotransferase 55", [55]),
                ("Alanine aminotransferase    55 U/L ", [55]),
                ("alanine transaminase    55 U/L ", [55]),
                ("ALT 55 U/L", [55]),
                ("ALT-55", [55]),
                ("ALP 55", []),  # wrong thing
                ("ALT/SGPT serum level (44G3.) 55", [55]),
            ],
            verbose=verbose,
        )


class ALTValidator(ValidatorBase):
    """
    Validator for ALT (see help for explanation).
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        # Pair the parser's variable name with the regex that matches the
        # quantity name alone (no value or units). "ALT.ALT" is the regex
        # string attribute of the ALT parser class.
        return ALT.NAME, [ALT.ALT]


# =============================================================================
# Gamma GT (gGT)
# =============================================================================


class GammaGT(SimpleNumericalResultParser):
    """
    BIOCHEMISTRY (LFTs).

    Gamma-glutamyl transferase (gGT), in U/L.
    """

    # Free-text names: a gamma prefix ("γ", "G" or "gamma"), optional
    # hyphens/whitespace, then "GT" or "glutamyl transferase".
    GGT_BASE = rf"""
        {WORD_BOUNDARY}
        (?:
            (?: γ | G | gamma)
            [-\s]*
            (?:
                GT |
                glutamyl [-\s]+ transferase
            )
        )
        {WORD_BOUNDARY}
    """
    # Quantity-name regex: Read-code-derived alternatives OR the free-text
    # form; each alternative gets its own non-capturing group.
    GGT = regex_or(
        *regex_components_from_read_codes(
            ReadCodes.GAMMA_GT,
            ReadCodes.GAMMA_GT_PLASMA,
            ReadCodes.GAMMA_GT_SERUM,
        ),
        GGT_BASE,
        wrap_each_in_noncapture_group=True,
        wrap_result_in_noncapture_group=False,
    )
    # Quantity name plus numeric value, with units in U/L.
    REGEX = make_simple_numeric_regex(quantity=GGT, units=UNITS_PER_L)
    NAME = "GammaGT"
    PREFERRED_UNIT_COLUMN = "value_U_L"
    # Maps each recognised unit regex to a multiplicative conversion factor.
    UNIT_MAPPING = {UNITS_PER_L: 1}  # preferred unit

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        """
        Set up the parser by passing this class's regex, variable name and
        unit information to the superclass.

        Args:
            nlpdef:
                a :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`
                (may be ``None``)
            cfg_processor_name:
                presumably the name of this processor's configuration
                section (may be ``None``); passed through to the superclass
            commit:
                force a COMMIT whenever we insert data? You should specify
                this in multiprocess mode, or you may get database deadlocks.
        """
        # see also the module docstring
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            regex_str=self.REGEX,
            variable=self.NAME,
            target_unit=self.PREFERRED_UNIT_COLUMN,
            units_to_factor=self.UNIT_MAPPING,
            commit=commit,
            # The self-test expects "ggt-55" to give 55, not -55.
            take_absolute=True,
        )

    def test(self, verbose: bool = False) -> None:
        # docstring in superclass
        # Each case pairs a source text with the list of values that the
        # parser is expected to find in it (empty list = no match).
        self.test_numerical_parser(
            [
                ("gGT", []),  # should fail; no values
                ("was 7", []),  # no quantity
                ("gGT 55", [55]),
                ("gamma Glutamyl Transferase 19  U/L", [19]),
                ("Gamma GT    55 U/L ", [55]),
                ("GGT 55 U/L", [55]),
                ("ggt-55", [55]),
                ("γGT 55", [55]),
                ("Gamma-glutamyl transferase lev (44G4.) 55", [55]),
                ("Plasma gamma-glutamyl transferase level (XaES4) 55", [55]),
                ("Serum gamma-glutamyl transferase level (XaES3) 55", [55]),
            ],
            verbose=verbose,
        )


class GammaGTValidator(ValidatorBase):
    """
    Validator for GammaGT (see help for explanation).
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        """
        Returns:
            tuple: ``(variable_name, regex_strings)``: the GammaGT variable
            name and a one-item list holding the regex that matches the name
            of the quantity alone (:attr:`GammaGT.GGT`, with no value).
        """
        return GammaGT.NAME, [GammaGT.GGT]


# =============================================================================
# Total bilirubin
# =============================================================================


class Bilirubin(SimpleNumericalResultParser):
    """
    BIOCHEMISTRY (LFTs).

    Total bilirubin. Units are μM.
    """

    # Free-text names: optional "t."/"tot"/"tot."/"total" prefix followed by
    # whitespace, then "bil", "bili", "bili." or "bilirubin".
    BILIRUBIN_BASE = rf"""
        {WORD_BOUNDARY}
        (?: t(?: ot(?:al | \.)? | \.) \s+ )?
        bili?(?: \. | rubin{WORD_BOUNDARY})?
    """
    # Quantity-name regex: Read-code-derived alternatives OR the free-text
    # form; each alternative gets its own non-capturing group.
    BILIRUBIN = regex_or(
        *regex_components_from_read_codes(
            ReadCodes.BILIRUBIN_PLASMA_TOTAL,
            ReadCodes.BILIRUBIN_SERUM,
            ReadCodes.BILIRUBIN_SERUM_TOTAL,
            ReadCodes.BILIRUBIN_TOTAL,
        ),
        BILIRUBIN_BASE,
        wrap_each_in_noncapture_group=True,
        wrap_result_in_noncapture_group=False,
    )
    # Quantity name plus numeric value, with either micromolar unit form.
    REGEX = make_simple_numeric_regex(
        quantity=BILIRUBIN,
        units=regex_or(
            MICROMOLAR,  # good
            MICROMOLES_PER_L,  # good
        ),
    )
    NAME = "Bilirubin"
    PREFERRED_UNIT_COLUMN = "value_micromol_L"
    # Both unit spellings are equivalent, so both have a factor of 1.
    UNIT_MAPPING = {MICROMOLAR: 1, MICROMOLES_PER_L: 1}  # preferred unit

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        """
        Set up the parser by passing this class's regex, variable name and
        unit information to the superclass.

        Args:
            nlpdef:
                a :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`
                (may be ``None``)
            cfg_processor_name:
                presumably the name of this processor's configuration
                section (may be ``None``); passed through to the superclass
            commit:
                force a COMMIT whenever we insert data? You should specify
                this in multiprocess mode, or you may get database deadlocks.
        """
        # see also the module docstring
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            regex_str=self.REGEX,
            variable=self.NAME,
            target_unit=self.PREFERRED_UNIT_COLUMN,
            units_to_factor=self.UNIT_MAPPING,
            commit=commit,
            # Presumably so that a hyphen used as a separator before the
            # number is not read as a minus sign -- confirm in superclass.
            take_absolute=True,
        )

    def test(self, verbose: bool = False) -> None:
        # docstring in superclass
        # Each case pairs a source text with the list of values that the
        # parser is expected to find in it (empty list = no match).
        self.test_numerical_parser(
            [
                ("tot Bil", []),  # should fail; no values
                ("was 7", []),  # no quantity
                ("tot Bil 6", [6]),
                ("Total Bilirubin: 6", [6]),
                ("Total Bilirubin 6 umol/L", [6]),
                ("bilirubin 17 μM", [17]),
                ("t.bilirubin 17 μM", [17]),
                ("t. bilirubin 17 μM", [17]),
                ("bili. 17 μM", [17]),
                ("bili 17 μM", [17]),
                ("Plasma total bilirubin level (XaETf) 17", [17]),
                ("Serum bilirubin level (44E..) 17", [17]),
                ("Serum total bilirubin level (XaERu) 17", [17]),
                ("Total bilirubin level (XE2qu) 17", [17]),
                (
                    "Total   bilirubin \t  level \n (XE2qu) 17",
                    [17],
                ),  # test whitespace
                (
                    "xTotal bilirubin level (XE2qu) 17",
                    [],
                ),  # test word boundary
                ("Serum total bilirubin level (XaERu) 6 umol/L", [6]),
            ],
            verbose=verbose,
        )


class BilirubinValidator(ValidatorBase):
    """
    Validator for Bilirubin (see help for explanation).
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        """
        Returns:
            tuple: ``(variable_name, regex_strings)``: the Bilirubin variable
            name and a one-item list holding the regex that matches the name
            of the quantity alone (:attr:`Bilirubin.BILIRUBIN`, no value).
        """
        return Bilirubin.NAME, [Bilirubin.BILIRUBIN]


# =============================================================================
# Albumin (Alb)
# =============================================================================


class Albumin(SimpleNumericalResultParser):
    """
    BIOCHEMISTRY (LFTs).

    Albumin (Alb). Units are g/L.
    """

    # Free-text names: "alb", "alb." or "albumin", optionally followed by
    # whitespace and the word "level".
    ALBUMIN_BASE = rf"""
        {WORD_BOUNDARY}
        (?:
            alb(?:\. | umin{WORD_BOUNDARY})?
            (?: \s+ level{WORD_BOUNDARY})?
        )
    """
    # Quantity-name regex: Read-code-derived alternatives OR the free-text
    # form; each alternative gets its own non-capturing group.
    ALBUMIN = regex_or(
        *regex_components_from_read_codes(
            ReadCodes.ALBUMIN_PLASMA,
            ReadCodes.ALBUMIN_SERUM,
        ),
        ALBUMIN_BASE,
        wrap_each_in_noncapture_group=True,
        wrap_result_in_noncapture_group=False,
    )
    # Quantity name plus numeric value, with units in g/L.
    REGEX = make_simple_numeric_regex(quantity=ALBUMIN, units=G_PER_L)
    NAME = "Albumin"
    PREFERRED_UNIT_COLUMN = "value_g_L"
    # Maps each recognised unit regex to a multiplicative conversion factor.
    UNIT_MAPPING = {G_PER_L: 1}  # preferred unit

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        """
        Set up the parser by passing this class's regex, variable name and
        unit information to the superclass.

        Args:
            nlpdef:
                a :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`
                (may be ``None``)
            cfg_processor_name:
                presumably the name of this processor's configuration
                section (may be ``None``); passed through to the superclass
            commit:
                force a COMMIT whenever we insert data? You should specify
                this in multiprocess mode, or you may get database deadlocks.
        """
        # see also the module docstring
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            regex_str=self.REGEX,
            variable=self.NAME,
            target_unit=self.PREFERRED_UNIT_COLUMN,
            units_to_factor=self.UNIT_MAPPING,
            commit=commit,
            # Presumably so that a hyphen used as a separator before the
            # number is not read as a minus sign -- confirm in superclass.
            take_absolute=True,
        )

    def test(self, verbose: bool = False) -> None:
        # docstring in superclass
        # Each case pairs a source text with the list of values that the
        # parser is expected to find in it (empty list = no match).
        self.test_numerical_parser(
            [
                ("Alb", []),  # should fail; no values
                ("was 7", []),  # no quantity
                ("ALP 6", []),  # wrong quantity
                ("Alb 6", [6]),
                ("Albumin: 48", [48]),
                ("Albumin 48 g/L", [48]),
                ("alb. 48", [48]),
                ("albumin level 48", [48]),
                ("Plasma albumin level (XaIRc) 48", [48]),
                ("Serum albumin level (XE2eA) 48", [48]),
            ],
            verbose=verbose,
        )


class AlbuminValidator(ValidatorBase):
    """
    Validator for Albumin (see help for explanation).
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        """
        Returns:
            tuple: ``(variable_name, regex_strings)``: the Albumin variable
            name and a one-item list holding the regex that matches the name
            of the quantity alone (:attr:`Albumin.ALBUMIN`, with no value).
        """
        return Albumin.NAME, [Albumin.ALBUMIN]


# =============================================================================
# Glucose
# =============================================================================


class Glucose(SimpleNumericalResultParser):
    """
    BIOCHEMISTRY.

    Glucose. Default units are mM; also supports mg/dL.
    """

    # By Emanuele Osimo, Feb 2019.
    # Some modifications by Rudolf Cardinal, Feb 2019.
    GLUCOSE_BASE = rf"""
        {WORD_BOUNDARY} glu(?:c(?:ose)?)? {WORD_BOUNDARY}
        # glu, gluc, glucose
    """
    # Quantity-name regex: Read-code-derived alternatives OR the free-text
    # form; each alternative gets its own non-capturing group.
    GLUCOSE = regex_or(
        *regex_components_from_read_codes(
            ReadCodes.GLUCOSE,
            ReadCodes.GLUCOSE_BLOOD,
            ReadCodes.GLUCOSE_BLOOD_2H_POSTPRANDIAL,
            ReadCodes.GLUCOSE_BLOOD_150_MIN,
            ReadCodes.GLUCOSE_PLASMA_RANDOM,
            ReadCodes.GLUCOSE_PLASMA_FASTING,
            ReadCodes.GLUCOSE_PLASMA_30_MIN,
            ReadCodes.GLUCOSE_PLASMA_60_MIN,
            ReadCodes.GLUCOSE_PLASMA_90_MIN,
            ReadCodes.GLUCOSE_PLASMA_120_MIN,
            ReadCodes.GLUCOSE_PLASMA_2H_POSTPRANDIAL,
            ReadCodes.GLUCOSE_PLASMA_150_MIN,
            ReadCodes.GLUCOSE_SERUM,
            ReadCodes.GLUCOSE_SERUM_RANDOM,
            ReadCodes.GLUCOSE_SERUM_FASTING,
            ReadCodes.GLUCOSE_SERUM_30_MIN,
            ReadCodes.GLUCOSE_SERUM_60_MIN,
            ReadCodes.GLUCOSE_SERUM_90_MIN,
            ReadCodes.GLUCOSE_SERUM_120_MIN,
            ReadCodes.GLUCOSE_SERUM_2H_POSTPRANDIAL,
            ReadCodes.GLUCOSE_SERUM_150_MIN,
            # !
        ),
        GLUCOSE_BASE,
        wrap_each_in_noncapture_group=True,
        wrap_result_in_noncapture_group=False,
    )
    # Quantity name plus numeric value; OPTIONAL_POC is allowed (and
    # ignored) between the quantity name and the rest of the expression.
    REGEX = make_simple_numeric_regex(
        quantity=GLUCOSE,
        units=regex_or(
            MILLIMOLAR,  # good
            MILLIMOLES_PER_L,  # good
            MG_PER_DL,  # good but needs conversion
        ),
        optional_ignorable_after_quantity=OPTIONAL_POC,
    )
    GLUCOSE_MOLECULAR_MASS_G_PER_MOL = 180.156
    # ... https://pubchem.ncbi.nlm.nih.gov/compound/D-glucose
    NAME = "Glucose"
    PREFERRED_UNIT_COLUMN = "value_mmol_L"
    # Maps each recognised unit regex to the factor that converts a value
    # in that unit to mmol/L; the mg/dL factor is derived from the
    # molecular mass above.
    UNIT_MAPPING = {
        MILLIMOLAR: 1,  # preferred unit
        MILLIMOLES_PER_L: 1,
        MG_PER_DL: factor_millimolar_from_mg_per_dl(
            GLUCOSE_MOLECULAR_MASS_G_PER_MOL
        ),
    }

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        """
        Set up the parser by passing this class's regex, variable name and
        unit information to the superclass.

        Args:
            nlpdef:
                a :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`
                (may be ``None``)
            cfg_processor_name:
                presumably the name of this processor's configuration
                section (may be ``None``); passed through to the superclass
            commit:
                force a COMMIT whenever we insert data? You should specify
                this in multiprocess mode, or you may get database deadlocks.
        """
        # see also the module docstring
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            regex_str=self.REGEX,
            variable=self.NAME,
            target_unit=self.PREFERRED_UNIT_COLUMN,
            units_to_factor=self.UNIT_MAPPING,
            commit=commit,
            # The self-test expects "glu-5" to give 5, not -5.
            take_absolute=True,
        )

    def test(self, verbose: bool = False) -> None:
        # docstring in parent class

        def convert(mg_dl: float) -> float:
            # Convert mg/dl to mM
            return millimolar_from_mg_per_dl(
                mg_dl, self.GLUCOSE_MOLECULAR_MASS_G_PER_MOL
            )

        # Each case pairs a source text with the list of values (in mmol/L)
        # that the parser is expected to find in it (empty list = no match).
        self.test_numerical_parser(
            [
                ("glu", []),  # should fail; no values
                ("glucose 6 mM", [6]),
                ("glucose 6 mmol", [6]),
                ("glucose 6", [6]),
                ("glu 6", [6]),
                ("glucose 90 mg/dl", [convert(90)]),  # unit conversion
                ("gluc = 6", [6]),
                ("glucose: 6", [6]),
                ("glu equals 6", [6]),
                ("glucose is equal to 6", [6]),
                ("glu <4", [4]),
                ("glucose less than 1", [1]),  # would be bad news...
                ("glu more than 20", [20]),
                ("glucose was 15", [15]),
                ("glucose was 90 mg/dl", [convert(90)]),
                ("glu is 90 mg dl-1", [convert(90)]),
                ("glucose is 90 mg dl -1", [convert(90)]),
                ("glu-5", [5]),
                ("glucose        |       20.3 (H)      | mmol/L", [20.3]),
                ("Glucose level (X772y) 5", [5]),
                ("Blood glucose level (X772z) 5", [5]),
                # Not all Read codes tested.
            ],
            verbose=verbose,
        )


class GlucoseValidator(ValidatorBase):
    """
    Validator for Glucose (see help for explanation).
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        """
        Returns:
            tuple: ``(variable_name, regex_strings)``: the Glucose variable
            name and a one-item list holding the regex that matches the name
            of the quantity alone (:attr:`Glucose.GLUCOSE`, with no value).
        """
        return Glucose.NAME, [Glucose.GLUCOSE]


# =============================================================================
# LDL cholesterol
# =============================================================================


class LDLCholesterol(SimpleNumericalResultParser):
    """
    BIOCHEMISTRY (LIPID PROFILE).

    Low density lipoprotein (LDL) cholesterol.
    Default units are mM; also supports mg/dL.
    """

    # By Emanuele Osimo, Feb 2019.
    # Some modifications by Rudolf Cardinal, Feb 2019.
    # Free-text names: "LDL", optionally followed (after optional
    # hyphens/whitespace) by "chol", "chol." or "cholesterol".
    LDL_BASE = rf"""
        {WORD_BOUNDARY}
        LDL [-\s]*
        (?:
            chol(?:esterol)?{WORD_BOUNDARY} |
            chol\. |
            {WORD_BOUNDARY}  # allows LDL by itself
        )
    """
    # Quantity-name regex: Read-code-derived alternatives OR the free-text
    # form; each alternative gets its own non-capturing group.
    LDL = regex_or(
        *regex_components_from_read_codes(
            ReadCodes.LDL_PLASMA,
            ReadCodes.LDL_PLASMA_FASTING,
            ReadCodes.LDL_PLASMA_RANDOM,
            ReadCodes.LDL_SERUM,
            ReadCodes.LDL_SERUM_FASTING,
            ReadCodes.LDL_SERUM_RANDOM,
        ),
        LDL_BASE,
        wrap_each_in_noncapture_group=True,
        wrap_result_in_noncapture_group=False,
    )
    # Quantity name plus numeric value, with any of the accepted units.
    REGEX = make_simple_numeric_regex(
        quantity=LDL,
        units=regex_or(
            MILLIMOLAR,  # good
            MILLIMOLES_PER_L,  # good
            MG_PER_DL,  # good but needs conversion
        ),
    )
    NAME = "LDL cholesterol"
    PREFERRED_UNIT_COLUMN = "value_mmol_L"
    # Multiply a value in mg/dL by this to get mmol/L.
    FACTOR_MG_DL_TO_MMOL_L = 0.02586
    # ... https://www.ncbi.nlm.nih.gov/books/NBK33478/
    # Maps each recognised unit regex to its conversion factor to mmol/L.
    UNIT_MAPPING = {
        MILLIMOLAR: 1,  # preferred unit
        MILLIMOLES_PER_L: 1,
        MG_PER_DL: FACTOR_MG_DL_TO_MMOL_L,
    }

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        """
        Set up the parser by passing this class's regex, variable name and
        unit information to the superclass.

        Args:
            nlpdef:
                a :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`
                (may be ``None``)
            cfg_processor_name:
                presumably the name of this processor's configuration
                section (may be ``None``); passed through to the superclass
            commit:
                force a COMMIT whenever we insert data? You should specify
                this in multiprocess mode, or you may get database deadlocks.
        """
        # see also the module docstring
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            regex_str=self.REGEX,
            variable=self.NAME,
            target_unit=self.PREFERRED_UNIT_COLUMN,
            units_to_factor=self.UNIT_MAPPING,
            commit=commit,
            # The self-test expects "ldl-4" to give 4, not -4.
            take_absolute=True,
        )

    def test(self, verbose: bool = False) -> None:
        # docstring in parent class

        def convert(mg_dl: float) -> float:
            # Convert mg/dl to mM
            return self.FACTOR_MG_DL_TO_MMOL_L * mg_dl

        # Each case pairs a source text with the list of values (in mmol/L)
        # that the parser is expected to find in it (empty list = no match).
        self.test_numerical_parser(
            [
                ("LDL", []),  # should fail; no values
                ("LDL 4 mM", [4]),
                ("LDL chol 4 mmol", [4]),
                ("LDL chol. 4 mmol", [4]),
                ("LDL 4", [4]),
                ("chol 4", []),  # that's total cholesterol
                ("HDL chol 4", []),  # that's HDL cholesterol
                (
                    "LDL cholesterol 140 mg/dl",
                    [convert(140)],
                ),  # unit conversion
                ("LDL = 4", [4]),
                ("LDL: 4", [4]),
                ("LDL equals 4", [4]),
                ("LDL is equal to 4", [4]),
                ("LDL <4", [4]),
                ("LDLchol less than 4", [4]),
                ("LDL cholesterol more than 20", [20]),
                ("LDL was 4", [4]),
                ("LDL chol was 140 mg/dl", [convert(140)]),
                ("chol was 140 mg/dl", []),
                ("LDL is 140 mg dl-1", [convert(140)]),
                ("ldl chol is 140 mg dl -1", [convert(140)]),
                ("ldl-4", [4]),
                ("LDL chol     |       6.2 (H)      | mmol/L", [6.2]),
                ("Plasma LDL cholesterol level (XaEVs) 4", [4]),
                ("Plasma rndm LDL cholest level (44d4.) 4", [4]),
                ("Plasma fast LDL cholest level (44d5.) 4", [4]),
                ("Serum LDL cholesterol level (44P6.) 4", [4]),
                ("Serum fast LDL cholesterol lev (44PD.) 4", [4]),
                ("Ser random LDL cholesterol lev (44PE.) 4", [4]),
            ],
            verbose=verbose,
        )


class LDLCholesterolValidator(ValidatorBase):
    """
    Validator for LDLCholesterol (see help for explanation).
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        """
        Returns:
            tuple: ``(variable_name, regex_strings)``: the LDL cholesterol
            variable name and a one-item list holding the regex that matches
            the name of the quantity alone (:attr:`LDLCholesterol.LDL`).
        """
        return LDLCholesterol.NAME, [LDLCholesterol.LDL]


# =============================================================================
# HDL cholesterol
# =============================================================================


class HDLCholesterol(SimpleNumericalResultParser):
    """
    BIOCHEMISTRY (LIPID PROFILE).

    High-density lipoprotein (HDL) cholesterol.
    Default units are mM; also supports mg/dL.
    """

    # By Emanuele Osimo, Feb 2019.
    # Some modifications by Rudolf Cardinal, Feb 2019.
    # Free-text names: "HDL", optionally followed (after optional
    # hyphens/whitespace) by "chol", "chol." or "cholesterol".
    HDL_BASE = rf"""
        {WORD_BOUNDARY}
        HDL [-\s]*
        (?:
            chol(?:esterol)?{WORD_BOUNDARY} |
            chol\. |
            {WORD_BOUNDARY}  # allows HDL by itself
        )
    """
    # Quantity-name regex: Read-code-derived alternatives OR the free-text
    # form; each alternative gets its own non-capturing group.
    HDL = regex_or(
        *regex_components_from_read_codes(
            ReadCodes.HDL_PLASMA,
            ReadCodes.HDL_PLASMA_FASTING,
            ReadCodes.HDL_PLASMA_RANDOM,
            ReadCodes.HDL_SERUM,
            ReadCodes.HDL_SERUM_FASTING,
            ReadCodes.HDL_SERUM_RANDOM,
        ),
        HDL_BASE,
        wrap_each_in_noncapture_group=True,
        wrap_result_in_noncapture_group=False,
    )
    # Quantity name plus numeric value, with any of the accepted units.
    REGEX = make_simple_numeric_regex(
        quantity=HDL,
        units=regex_or(
            MILLIMOLAR,  # good
            MILLIMOLES_PER_L,  # good
            MG_PER_DL,  # good but needs conversion
        ),
    )
    NAME = "HDL cholesterol"
    PREFERRED_UNIT_COLUMN = "value_mmol_L"
    # Multiply a value in mg/dL by this to get mmol/L.
    FACTOR_MG_DL_TO_MMOL_L = 0.02586
    # ... https://www.ncbi.nlm.nih.gov/books/NBK33478/
    # Maps each recognised unit regex to its conversion factor to mmol/L.
    UNIT_MAPPING = {
        MILLIMOLAR: 1,  # preferred unit
        MILLIMOLES_PER_L: 1,
        MG_PER_DL: FACTOR_MG_DL_TO_MMOL_L,
    }

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        """
        Set up the parser by passing this class's regex, variable name and
        unit information to the superclass.

        Args:
            nlpdef:
                a :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`
                (may be ``None``)
            cfg_processor_name:
                presumably the name of this processor's configuration
                section (may be ``None``); passed through to the superclass
            commit:
                force a COMMIT whenever we insert data? You should specify
                this in multiprocess mode, or you may get database deadlocks.
        """
        # see also the module docstring
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            regex_str=self.REGEX,
            variable=self.NAME,
            target_unit=self.PREFERRED_UNIT_COLUMN,
            units_to_factor=self.UNIT_MAPPING,
            commit=commit,
            # The self-test expects "hdl-4" to give 4, not -4.
            take_absolute=True,
        )

    def test(self, verbose: bool = False) -> None:
        # docstring in parent class

        def convert(mg_dl: float) -> float:
            # Convert mg/dl to mM
            return self.FACTOR_MG_DL_TO_MMOL_L * mg_dl

        # Each case pairs a source text with the list of values (in mmol/L)
        # that the parser is expected to find in it (empty list = no match).
        self.test_numerical_parser(
            [
                ("HDL", []),  # should fail; no values
                ("HDL 4 mM", [4]),
                ("HDL chol 4 mmol", [4]),
                ("HDL chol. 4 mmol", [4]),
                ("HDL 4", [4]),
                ("chol 4", []),  # that's total cholesterol
                ("LDL chol 4", []),  # that's LDL cholesterol
                (
                    "HDL cholesterol 140 mg/dl",
                    [convert(140)],
                ),  # unit conversion
                ("HDL = 4", [4]),
                ("HDL: 4", [4]),
                ("HDL equals 4", [4]),
                ("HDL is equal to 4", [4]),
                ("HDL <4", [4]),
                ("HDLchol less than 4", [4]),
                ("HDL cholesterol more than 20", [20]),
                ("HDL was 4", [4]),
                ("HDL chol was 140 mg/dl", [convert(140)]),
                ("chol was 140 mg/dl", []),
                ("HDL is 140 mg dl-1", [convert(140)]),
                ("Hdl chol is 140 mg dl -1", [convert(140)]),
                ("hdl-4", [4]),
                ("HDL chol     |       6.2 (H)      | mmol/L", [6.2]),
                ("Plasma HDL cholesterol level (XaEVr) 4", [4]),
                ("Plasma rndm HDL cholest level (44d2.) 4", [4]),
                ("Plasma fast HDL cholest level (44d3.) 4", [4]),
                ("Serum HDL cholesterol level (44P5.) 4", [4]),
                ("Serum fast HDL cholesterol lev (44PB.) 4", [4]),
                ("Ser random HDL cholesterol lev (44PC.) 4", [4]),
            ],
            verbose=verbose,
        )


class HDLCholesterolValidator(ValidatorBase):
    """
    Validator for HDLCholesterol (see help for explanation).
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        """
        Returns:
            tuple: ``(variable_name, regex_strings)``: the HDL cholesterol
            variable name and a one-item list holding the regex that matches
            the name of the quantity alone (:attr:`HDLCholesterol.HDL`).
        """
        return HDLCholesterol.NAME, [HDLCholesterol.HDL]


# =============================================================================
# Total cholesterol
# =============================================================================


class TotalCholesterol(SimpleNumericalResultParser):
    """
    BIOCHEMISTRY (LIPID PROFILE).

    Total or undifferentiated cholesterol.
    Default units are mM; also supports mg/dL.
    """

    # Free-text names: "chol", "chol." or "cholesterol", optionally
    # prefixed by "tot"/"total", and not immediately preceded by HDL/LDL.
    # The prefix group is "tot(?:al)?" so that the abbreviation "tot" is
    # consumed as part of the name as well as the full word "total".
    # NOTE(review): the lookbehinds are variable-width ([-\s]+), which the
    # stdlib "re" module rejects; presumably the superclass compiles with
    # the third-party "regex" module -- confirm.
    CHOLESTEROL_BASE = rf"""
        {WORD_BOUNDARY}
        (?<!HDL[-\s]+) (?<!LDL[-\s]+)  # not preceded by HDL or LDL
        (?: tot(?:al)? [-\s] )?        # optional "tot"/"total" prefix
        (?:
            chol(?:esterol)?{WORD_BOUNDARY} |
            chol\.
        )
    """
    # ... (?<! something ) is a negative lookbehind assertion
    # Quantity-name regex: Read-code-derived alternatives OR the free-text
    # form; each alternative gets its own non-capturing group.
    CHOLESTEROL = regex_or(
        *regex_components_from_read_codes(
            ReadCodes.CHOLESTEROL_SERUM,
            ReadCodes.CHOLESTEROL_TOTAL_PLASMA,
            ReadCodes.CHOLESTEROL_TOTAL_SERUM,
        ),
        CHOLESTEROL_BASE,
        wrap_each_in_noncapture_group=True,
        wrap_result_in_noncapture_group=False,
    )
    # Quantity name plus numeric value, with any of the accepted units.
    REGEX = make_simple_numeric_regex(
        quantity=CHOLESTEROL,
        units=regex_or(
            MILLIMOLAR,  # good
            MILLIMOLES_PER_L,  # good
            MG_PER_DL,  # good but needs conversion
        ),
    )
    NAME = "Total cholesterol"
    PREFERRED_UNIT_COLUMN = "value_mmol_L"
    # Multiply a value in mg/dL by this to get mmol/L.
    FACTOR_MG_DL_TO_MMOL_L = 0.02586
    # ... https://www.ncbi.nlm.nih.gov/books/NBK33478/
    # Maps each recognised unit regex to its conversion factor to mmol/L.
    UNIT_MAPPING = {
        MILLIMOLAR: 1,  # preferred unit
        MILLIMOLES_PER_L: 1,
        MG_PER_DL: FACTOR_MG_DL_TO_MMOL_L,
    }

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        """
        Set up the parser by passing this class's regex, variable name and
        unit information to the superclass.

        Args:
            nlpdef:
                a :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`
                (may be ``None``)
            cfg_processor_name:
                presumably the name of this processor's configuration
                section (may be ``None``); passed through to the superclass
            commit:
                force a COMMIT whenever we insert data? You should specify
                this in multiprocess mode, or you may get database deadlocks.
        """
        # see also the module docstring
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            regex_str=self.REGEX,
            variable=self.NAME,
            target_unit=self.PREFERRED_UNIT_COLUMN,
            units_to_factor=self.UNIT_MAPPING,
            commit=commit,
            # The self-test expects "chol-4" to give 4, not -4.
            take_absolute=True,
        )

    def test(self, verbose: bool = False) -> None:
        # docstring in parent class

        def convert(mg_dl: float) -> float:
            # Convert mg/dl to mM
            return self.FACTOR_MG_DL_TO_MMOL_L * mg_dl

        # Each case pairs a source text with the list of values (in mmol/L)
        # that the parser is expected to find in it (empty list = no match).
        self.test_numerical_parser(
            [
                ("chol", []),  # should fail; no values
                ("chol 4 mM", [4]),
                ("total chol 4 mmol", [4]),
                ("tot chol 4", [4]),  # abbreviated "total" prefix
                ("chol. 4 mmol", [4]),
                ("chol 4", [4]),
                ("HDL chol 4", []),  # that's HDL cholesterol
                ("LDL chol 4", []),  # that's LDL cholesterol
                (
                    "total cholesterol 140 mg/dl",
                    [convert(140)],
                ),  # unit conversion
                ("chol = 4", [4]),
                ("chol: 4", [4]),
                ("chol equals 4", [4]),
                ("chol is equal to 4", [4]),
                ("chol <4", [4]),
                ("chol less than 4", [4]),
                ("cholesterol more than 20", [20]),
                ("chol was 4", [4]),
                ("chol was 140 mg/dl", [convert(140)]),
                ("chol was 140", [140]),  # but probably wrong interpretation!
                ("chol is 140 mg dl-1", [convert(140)]),
                ("chol is 140 mg dl -1", [convert(140)]),
                ("chol-4", [4]),
                ("chol     |       6.2 (H)      | mmol/L", [6.2]),
                ("Serum cholesterol level (XE2eD) 4", [4]),
                ("Plasma total cholesterol level (XaIRd) 4", [4]),
                ("Serum total cholesterol level (XaJe9) 4", [4]),
            ],
            verbose=verbose,
        )


class TotalCholesterolValidator(ValidatorBase):
    """
    Validator for TotalCholesterol (see help for explanation).
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        """
        Returns:
            tuple: ``(variable_name, regex_strings)``: the total cholesterol
            variable name and a one-item list holding the regex that matches
            the name of the quantity alone
            (:attr:`TotalCholesterol.CHOLESTEROL`).
        """
        return TotalCholesterol.NAME, [TotalCholesterol.CHOLESTEROL]


# =============================================================================
# Triglycerides
# =============================================================================


class Triglycerides(SimpleNumericalResultParser):
    """
    BIOCHEMISTRY (LIPID PROFILE).

    Triglycerides.
    Default units are mM; also supports mg/dL.
    """

    # By Emanuele Osimo, Feb 2019.
    # Some modifications by Rudolf Cardinal, Feb 2019.
    # Free-text names: "Triglyceride", "Triglycerides" or "TG".
    TG_BASE = rf"""
        {WORD_BOUNDARY}
        (?: Triglyceride[s]? | TG )
        {WORD_BOUNDARY}
    """
    # Quantity-name regex: Read-code-derived alternatives OR the free-text
    # form; each alternative gets its own non-capturing group.
    TG = regex_or(
        *regex_components_from_read_codes(
            ReadCodes.TG,
            ReadCodes.TG_PLASMA,
            ReadCodes.TG_PLASMA_FASTING,
            ReadCodes.TG_PLASMA_RANDOM,
            ReadCodes.TG_SERUM,
            ReadCodes.TG_SERUM_FASTING,
            ReadCodes.TG_SERUM_RANDOM,
        ),
        TG_BASE,
        wrap_each_in_noncapture_group=True,
        wrap_result_in_noncapture_group=False,
    )
    # Quantity name plus numeric value, with any of the accepted units.
    REGEX = make_simple_numeric_regex(
        quantity=TG,
        units=regex_or(
            MILLIMOLAR,  # good
            MILLIMOLES_PER_L,  # good
            MG_PER_DL,  # good but needs conversion
        ),
    )
    NAME = "Triglycerides"
    PREFERRED_UNIT_COLUMN = "value_mmol_L"
    # Multiply a value in mg/dL by this to get mmol/L.
    FACTOR_MG_DL_TO_MMOL_L = 0.01129  # reciprocal of 88.57
    # ... https://www.ncbi.nlm.nih.gov/books/NBK33478/
    # ... https://www.ncbi.nlm.nih.gov/books/NBK83505/
    # Maps each recognised unit regex to its conversion factor to mmol/L.
    UNIT_MAPPING = {
        MILLIMOLAR: 1,  # preferred unit
        MILLIMOLES_PER_L: 1,
        MG_PER_DL: FACTOR_MG_DL_TO_MMOL_L,
    }

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        """
        Set up the parser by passing this class's regex, variable name and
        unit information to the superclass.

        Args:
            nlpdef:
                a :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`
                (may be ``None``)
            cfg_processor_name:
                presumably the name of this processor's configuration
                section (may be ``None``); passed through to the superclass
            commit:
                force a COMMIT whenever we insert data? You should specify
                this in multiprocess mode, or you may get database deadlocks.
        """
        # see also the module docstring
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            regex_str=self.REGEX,
            variable=self.NAME,
            target_unit=self.PREFERRED_UNIT_COLUMN,
            units_to_factor=self.UNIT_MAPPING,
            commit=commit,
            # The self-test expects "TG-4" to give 4, not -4.
            take_absolute=True,
        )

    def test(self, verbose: bool = False) -> None:
        # docstring in parent class

        def convert(mg_dl: float) -> float:
            # Convert mg/dl to mM
            return self.FACTOR_MG_DL_TO_MMOL_L * mg_dl

        # Each case pairs a source text with the list of values (in mmol/L)
        # that the parser is expected to find in it (empty list = no match).
        self.test_numerical_parser(
            [
                ("TG", []),  # should fail; no values
                ("triglycerides", []),  # should fail; no values
                ("TG 4 mM", [4]),
                ("triglycerides 4 mmol", [4]),
                ("triglyceride 4 mmol", [4]),
                ("TG 4", [4]),
                ("TG 140 mg/dl", [convert(140)]),  # unit conversion
                ("TG = 4", [4]),
                ("TG: 4", [4]),
                ("TG equals 4", [4]),
                ("TG is equal to 4", [4]),
                ("TG <4", [4]),
                ("TG less than 4", [4]),
                ("TG more than 20", [20]),
                ("TG was 4", [4]),
                ("TG was 140 mg/dl", [convert(140)]),
                ("TG was 140", [140]),  # but probably wrong interpretation!
                ("TG is 140 mg dl-1", [convert(140)]),
                ("TG is 140 mg dl -1", [convert(140)]),
                ("TG-4", [4]),
                ("triglycerides    |       6.2 (H)      | mmol/L", [6.2]),
                ("Triglyceride level (X772O) 4", [4]),
                ("Plasma triglyceride level (44e..) 4", [4]),
                ("Plasma rndm triglyceride level (44e0.) 4", [4]),
                ("Plasma fast triglyceride level (44e1.) 4", [4]),
                ("Serum triglyceride levels (XE2q9) 4", [4]),
                ("Serum fasting triglyceride lev (44Q4.) 4", [4]),
                ("Serum random triglyceride lev (44Q5.) 4", [4]),
            ],
            verbose=verbose,
        )


class TriglyceridesValidator(ValidatorBase):
    """
    Validator for Triglycerides (see help for explanation).
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        """
        Returns:
            tuple: ``(variable_name, regex_strings)``: the Triglycerides
            variable name and a one-item list holding the regex that matches
            the name of the quantity alone (:attr:`Triglycerides.TG`).
        """
        return Triglycerides.NAME, [Triglycerides.TG]


# =============================================================================
# HbA1c
# =============================================================================


def hba1c_mmol_per_mol_from_percent(
    percent: Union[float, str]
) -> Optional[float]:
    """
    Convert an HbA1c value from old percentage units -- DCCT (Diabetes Control
    and Complications Trial), UKPDS (United Kingdom Prospective Diabetes Study)
    or NGSP (National Glycohemoglobin Standardization Program) -- to newer IFCC
    (International Federation of Clinical Chemistry) mmol/mol units (mmol HbA1c
    / mol Hb).

    Args:
        percent:
            DCCT value as a percentage; a string is first converted with
            :func:`to_float`. The sign is ignored (the absolute value is
            used).

    Returns:
        IFCC value in mmol/mol, or ``None`` if the (converted) input is
        falsy -- that is, ``None`` or zero.

    Example: 5% becomes 31.1 mmol/mol.

    By Emanuele Osimo, Feb 2019.
    Some modifications by Rudolf Cardinal, Feb 2019.

    References:

    - Emanuele had mmol_per_mol = (percent - 2.14) * 10.929 -- primary source
      awaited.
    - Jeppsson 2002, https://www.ncbi.nlm.nih.gov/pubmed/11916276 -- no, that's
      the chemistry
    - https://www.ifcchba1c.org/
    - http://www.ngsp.org/ifccngsp.asp -- gives master equation of
      NGSP = [0.09148 × IFCC] + 2.152), therefore implying
      IFCC = (NGSP – 2.152) × 10.93135.
    - Little & Rohlfing 2013: https://www.ncbi.nlm.nih.gov/pubmed/23318564;
      also gives NGSP = [0.09148 * IFCC] + 2.152.

    Note also that you may see eAG values (estimated average glucose), in
    mmol/L or mg/dl; see http://www.ngsp.org/A1ceAG.asp; these are not direct
    measurements of HbA1c.

    """
    if isinstance(percent, str):
        percent = to_float(percent)
    # Falsy covers both "no usable number" (None) and zero; neither is
    # converted.
    if not percent:
        return None
    percent = abs(percent)  # deals with e.g. "HbA1c-8%" -> -8
    # Inverse of the NGSP master equation: IFCC = (NGSP - 2.152) * 10.93135
    return (percent - 2.152) * 10.93135


class HbA1c(SimpleNumericalResultParser):
    """
    BIOCHEMISTRY.

    Glycosylated (glycated) haemoglobin (HbA1c).
    Default units are mmol/mol; also supports %.

    Note: HbA1 is different
    (https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2541274).
    """

    # By Emanuele Osimo, Feb 2019.
    # Some modifications by Rudolf Cardinal, Feb 2019.
    # Free-text names: "HbA1c", or "glycated"/"glycosylated" followed by
    # hyphen(s)/whitespace and "hemoglobin", "haemoglobin" or "Hb".
    HBA1C_BASE = rf"""
        {WORD_BOUNDARY}
        (?:
            (?: Glyc(?:osyl)?ated [-\s]+ (?:ha?emoglobin|Hb) ) |
            HbA1c
        )
        {WORD_BOUNDARY}
    """
    # Quantity-name regex: Read-code-derived alternatives OR the free-text
    # form; each alternative gets its own non-capturing group.
    HBA1C = regex_or(
        *regex_components_from_read_codes(
            ReadCodes.HBA1C,
            ReadCodes.HBA1C_DCCT,
            ReadCodes.HBA1C_IFCC,
        ),
        HBA1C_BASE,
        wrap_each_in_noncapture_group=True,
        wrap_result_in_noncapture_group=False,
    )
    # The "bad" units are listed here so that they are recognised as units,
    # but they are left out of UNIT_MAPPING below.
    REGEX = make_simple_numeric_regex(
        quantity=HBA1C,
        units=regex_or(
            MILLIMOLES_PER_MOL,  # standard
            PERCENT,  # good but needs conversion
            MILLIMOLES_PER_L,  # bad; may be an eAG value
            MG_PER_DL,  # bad; may be an eAG value
        ),
    )
    NAME = "HBA1C"
    PREFERRED_UNIT_COLUMN = "value_mmol_mol"
    # Percentages are not a simple multiple of mmol/mol, so PERCENT maps to
    # a conversion function rather than to a numeric factor.
    UNIT_MAPPING = {
        MILLIMOLES_PER_MOL: 1,  # preferred unit
        PERCENT: hba1c_mmol_per_mol_from_percent,
        # but not MILLIMOLES_PER_L
        # and not MG_PER_DL
    }

[docs]    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        # see documentation above
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            regex_str=self.REGEX,
            variable=self.NAME,
            target_unit=self.PREFERRED_UNIT_COLUMN,
            units_to_factor=self.UNIT_MAPPING,
            commit=commit,
            take_absolute=True,
        )

[docs]    def test(self, verbose: bool = False) -> None:
        # docstring in parent class

        def convert(percent: float) -> float:
            # Convert % to mmol/mol
            return hba1c_mmol_per_mol_from_percent(percent)

        self.test_numerical_parser(
            [
                ("HbA1c", []),  # should fail; no values
                ("glycosylated haemoglobin", []),  # should fail; no values
                ("HbA1c 31", [31]),
                ("HbA1c 31 mmol/mol", [31]),
                ("HbA1c 31 mg/dl", []),  # wrong units
                ("HbA1c 31 mmol/L", []),  # wrong units
                ("glycosylated haemoglobin 31 mmol/mol", [31]),
                ("glycated hemoglobin 31 mmol/mol", [31]),
                ("HbA1c 8%", [convert(8)]),
                ("HbA1c = 8%", [convert(8)]),
                ("HbA1c: 31", [31]),
                ("HbA1c equals 31", [31]),
                ("HbA1c is equal to 31", [31]),
                ("HbA1c <31.2", [31.2]),
                ("HbA1c less than 4", [4]),
                ("HbA1c more than 20", [20]),
                ("HbA1c was 31", [31]),
                ("HbA1c was 15%", [convert(15)]),
                ("HbA1c-31", [31]),
                ("HbA1c-8%", [convert(8)]),
                ("HbA1c    |       40 (H)      | mmol/mol", [40]),
                ("Haemoglobin A1c level (X772q) 8%", [convert(8)]),
                ("HbA1c level (DCCT aligned) (XaERp) 8%", [convert(8)]),
                ("HbA1c levl - IFCC standardised (XaPbt) 31 mmol/mol", [31]),
            ],
            verbose=verbose,
        )


class HbA1cValidator(ValidatorBase):
    """
    Validator for HbA1c (see help for explanation).
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        """
        Returns:
            tuple: the variable name (``HbA1c.NAME``) and a one-element list
            holding the quantity-only regex string ``HbA1c.HBA1C`` (i.e. the
            pattern for the name of the test, without number or units).
        """
        return HbA1c.NAME, [HbA1c.HBA1C]


# =============================================================================
# All classes in this module
# =============================================================================

# Registry of the biochemistry processors: each entry is a pair of
# (numerical-result parser class, its matching validator class).
# Entries are kept in case-insensitive alphabetical order of the parser name.
# NOTE(review): the consumers of this list are outside this section; when
# adding a new parser/validator pair, add it here too.
ALL_BIOCHEMISTRY_NLP_AND_VALIDATORS = [
    (Albumin, AlbuminValidator),
    (AlkPhos, AlkPhosValidator),
    (ALT, ALTValidator),
    (Bilirubin, BilirubinValidator),
    (Creatinine, CreatinineValidator),
    (Crp, CrpValidator),
    (GammaGT, GammaGTValidator),
    (Glucose, GlucoseValidator),
    (HbA1c, HbA1cValidator),
    (HDLCholesterol, HDLCholesterolValidator),
    (LDLCholesterol, LDLCholesterolValidator),
    (Lithium, LithiumValidator),
    (Potassium, PotassiumValidator),
    (Sodium, SodiumValidator),
    (TotalCholesterol, TotalCholesterolValidator),
    (Triglycerides, TriglyceridesValidator),
    (Tsh, TshValidator),
    (Urea, UreaValidator),
]