"""
crate_anon/nlp_manager/parse_biochemistry.py
===============================================================================
Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
Created by Rudolf Cardinal (rnc1001@cam.ac.uk).
This file is part of CRATE.
CRATE is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
CRATE is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with CRATE. If not, see <https://www.gnu.org/licenses/>.
===============================================================================
**Python regex-based NLP processors for biochemistry data.**
All inherit from
:class:`crate_anon.nlp_manager.regex_parser.SimpleNumericalResultParser` and
are constructed with these arguments:
    nlpdef:
        a :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`
    cfg_processor_name:
        the name of a CRATE NLP config file section (from which we may
        choose to get extra config information)
    commit:
        force a COMMIT whenever we insert data? You should specify this
        in multiprocess mode, or you may get database deadlocks.
"""
import logging
from typing import List, Optional, Tuple, Union
from crate_anon.common.regex_helpers import (
regex_or,
WORD_BOUNDARY,
)
from crate_anon.nlp_manager.nlp_definition import NlpDefinition
from crate_anon.nlp_manager.number import to_float
from crate_anon.nlp_manager.regex_parser import (
make_simple_numeric_regex,
OPTIONAL_POC,
SimpleNumericalResultParser,
ValidatorBase,
)
from crate_anon.nlp_manager.regex_read_codes import (
ReadCodes,
regex_components_from_read_codes,
)
from crate_anon.nlp_manager.regex_units import (
factor_micromolar_from_mg_per_dl,
factor_millimolar_from_mg_per_dl,
G,
G_PER_L,
MG,
MG_PER_DL,
MG_PER_L,
MICROEQ_PER_L,
MICROMOLAR,
micromolar_from_mg_per_dl,
MICROMOLES_PER_L,
MICROUNITS_PER_ML,
MILLIEQ_PER_L,
MILLIMOLAR,
millimolar_from_mg_per_dl,
MILLIMOLES_PER_L,
MILLIMOLES_PER_MOL,
MILLIUNITS_PER_L,
PERCENT,
UNITS_PER_L,
)
log = logging.getLogger(__name__)
# =============================================================================
# C-reactive protein (CRP)
# =============================================================================
class Crp(SimpleNumericalResultParser):
    """
    BIOCHEMISTRY.

    C-reactive protein (CRP). Default units are mg/L; also supports mg/dL.

    CRP units:

    - mg/L is commonest in the UK (or at least standard at Addenbrooke's,
      Hinchingbrooke, and Dundee);

    - values of <=6 mg/L or <10 mg/L are normal, and e.g. 70-250 mg/L in
      pneumonia.

    - Refs include:

      - https://www.ncbi.nlm.nih.gov/pubmed/7705110
      - https://emedicine.medscape.com/article/2086909-overview

    - 1 mg/dL = 10 mg/L, so normal in mg/dL is <=1 roughly.
    """

    # Free-text names: "C-reactive protein", "C reactive protein", or "CRP".
    CRP_BASE = rf"""
        {WORD_BOUNDARY}
            (?: (?: C [-\s]+ reactive [\s]+ protein ) | CRP )
        {WORD_BOUNDARY}
    """
    # Any way of naming the quantity: Read-code-derived patterns, then the
    # free-text pattern. Also used by CrpValidator.
    CRP = regex_or(
        *regex_components_from_read_codes(
            ReadCodes.CRP_PLASMA,
            ReadCodes.CRP_SERUM,
        ),
        CRP_BASE,
        wrap_each_in_noncapture_group=True,
        wrap_result_in_noncapture_group=False,
    )
    # Complete regex, built from the quantity and the units we recognize.
    REGEX = make_simple_numeric_regex(
        quantity=CRP,
        units=regex_or(MG_PER_DL, MG_PER_L),
        optional_ignorable_after_quantity=OPTIONAL_POC,
    )
    NAME = "CRP"
    PREFERRED_UNIT_COLUMN = "value_mg_L"
    # Multiplication factors converting each recognized unit to mg/L.
    UNIT_MAPPING = {
        MG_PER_L: 1,  # preferred unit
        MG_PER_DL: 10,  # 1 mg/dL -> 10 mg/L
    }

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        """
        Args:
            nlpdef:
                a :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`
            cfg_processor_name:
                the name of a CRATE NLP config file section (from which we
                may choose to get extra config information)
            commit:
                force a COMMIT whenever we insert data? You should specify
                this in multiprocess mode, or you may get database deadlocks.
        """
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            regex_str=self.REGEX,
            variable=self.NAME,
            target_unit=self.PREFERRED_UNIT_COLUMN,
            units_to_factor=self.UNIT_MAPPING,
            commit=commit,
            take_absolute=True,  # the tests expect "CRP-97" to give 97
        )

    def test(self, verbose: bool = False) -> None:
        """
        Self-test. Each pair is (text to parse, expected list of values);
        expected values are in mg/L (e.g. "6 mg/dl" is expected as 60).

        Args:
            verbose: passed through to the parent's test function
        """
        self.test_numerical_parser(
            [
                ("CRP", []),  # should fail; no values
                ("CRP 6", [6]),
                ("C-reactive protein 6", [6]),
                ("C reactive protein 6", [6]),
                ("CRP = 6", [6]),
                ("CRP 6 mg/dl", [60]),
                ("CRP: 6", [6]),
                ("CRP equals 6", [6]),
                ("CRP is equal to 6", [6]),
                ("CRP <1", [1]),
                ("CRP less than 1", [1]),
                ("CRP <1 mg/dl", [10]),
                ("CRP >250", [250]),
                ("CRP more than 1", [1]),
                ("CRP greater than 1", [1]),
                ("CRP >250 mg/dl", [2500]),
                ("CRP was 62", [62]),
                ("CRP was 62 mg/l", [62]),
                ("CRP was <1", [1]),
                ("CRP is 19.2", [19.2]),
                ("CRP is >250", [250]),
                ("CRP is 19 mg dl-1", [190]),
                ("CRP is 19 mg dl -1", [190]),
                ("CRP 1.9 mg/L", [1.9]),
                ("CRP-97", [97]),
                ("CRP 1.9 mg L-1", [1.9]),
                ("CRP | 1.9 (H) | mg/L", [1.9]),
                ("Plasma C-reactive protein level (XE2dy) 45 mg/L", [45]),
                ("Serum C reactive protein level (XaINL) 45 mg/L", [45]),
                ("CRP (mg/L) 62", [62]),
            ],
            verbose=verbose,
        )
class CrpValidator(ValidatorBase):
    """
    Validator for Crp (see help for explanation).
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        """
        Returns the variable name (``Crp.NAME``) and a one-element list
        holding the regex string that names the quantity (``Crp.CRP``).
        """
        return Crp.NAME, [Crp.CRP]
# =============================================================================
# Sodium (Na)
# =============================================================================
# Sodium is handy for checking that results show approximately the expected
# distribution!
class Sodium(SimpleNumericalResultParser):
    """
    BIOCHEMISTRY (U&E).

    Sodium (Na), in mM.
    """

    # Free-text names: "Na" or "Sodium".
    SODIUM_BASE = rf"""
        {WORD_BOUNDARY} (?: Na | Sodium ) {WORD_BOUNDARY}
    """
    # Any way of naming the quantity: Read-code-derived patterns, then the
    # free-text pattern. Also used by SodiumValidator.
    SODIUM = regex_or(
        *regex_components_from_read_codes(
            ReadCodes.SODIUM,
            ReadCodes.SODIUM_BLOOD,
            ReadCodes.SODIUM_PLASMA,
            ReadCodes.SODIUM_SERUM,
        ),
        SODIUM_BASE,
        wrap_each_in_noncapture_group=True,
        wrap_result_in_noncapture_group=False,
    )
    # Complete regex, built from the quantity and the units we recognize.
    REGEX = make_simple_numeric_regex(
        quantity=SODIUM,
        units=regex_or(
            MILLIMOLAR,  # good
            MILLIMOLES_PER_L,  # good
            MILLIEQ_PER_L,  # good
            MG,  # bad
        ),
        optional_ignorable_after_quantity=OPTIONAL_POC,
    )
    NAME = "Sodium"
    PREFERRED_UNIT_COLUMN = "value_mmol_L"
    # Multiplication factors converting each accepted unit to mmol/L.
    # MG is recognized by the regex but has no factor here; the tests expect
    # e.g. "docusate sodium 100mg" to yield nothing (presumably the parent
    # class rejects units that lack a factor -- confirm there).
    UNIT_MAPPING = {
        MILLIMOLAR: 1,  # preferred unit
        MILLIMOLES_PER_L: 1,
        MILLIEQ_PER_L: 1,
        # but not MG
    }

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        """
        Args:
            nlpdef:
                a :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`
            cfg_processor_name:
                the name of a CRATE NLP config file section (from which we
                may choose to get extra config information)
            commit:
                force a COMMIT whenever we insert data? You should specify
                this in multiprocess mode, or you may get database deadlocks.
        """
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            regex_str=self.REGEX,
            variable=self.NAME,
            target_unit=self.PREFERRED_UNIT_COLUMN,
            units_to_factor=self.UNIT_MAPPING,
            commit=commit,
            take_absolute=True,  # the tests expect "Na-145" to give 145
        )

    def test(self, verbose: bool = False) -> None:
        """
        Self-test. Each pair is (text to parse, expected list of values).

        Args:
            verbose: passed through to the parent's test function
        """
        self.test_numerical_parser(
            [
                ("Na", []),  # should fail; no values
                ("Na 120", [120]),
                ("sodium 153", [153]),
                ("Na 135 mEq/L", [135]),
                ("Na 139 mM", [139]),
                ("docusate sodium 100mg", []),
                (
                    "Present: Nicola Adams (NA). 1.0 Minutes of last meeting",
                    [],
                ),
                ("Present: Nicola Adams (NA) 1.0 Minutes of last meeting", []),
                ("Na (H) 145 mM", [145]),
                ("Na (*) 145 mM", [145]),
                ("Na (X) 145 mM", []),
                ("blah (Na) 145 mM", []),
                ("Na (145) something", [145]),
                ("Na (145 mM), others", [145]),
                ("Na-145", [145]),
                ("Sodium level (X771T) 145", [145]),
                ("Blood sodium level (XaDva) 145", [145]),
                ("Plasma sodium level (XaIRf) 145", [145]),
                ("Serum sodium level (XE2q0) 145", [145]),
                ("Serum sodium level (mmol/L) 137", [137]),
            ],
            verbose=verbose,
        )
class SodiumValidator(ValidatorBase):
    """
    Validator for Sodium (see help for explanation).
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        """
        Returns the variable name (``Sodium.NAME``) and a one-element list
        holding the regex string that names the quantity
        (``Sodium.SODIUM``).
        """
        return Sodium.NAME, [Sodium.SODIUM]
# =============================================================================
# Potassium (K)
# =============================================================================
class Potassium(SimpleNumericalResultParser):
    """
    BIOCHEMISTRY (U&E).

    Potassium (K), in mM.
    """

    # Free-text names: "K" or "Potassium".
    POTASSIUM_BASE = rf"""
        {WORD_BOUNDARY} (?: K | Potassium ) {WORD_BOUNDARY}
    """
    # Any way of naming the quantity. Also used by PotassiumValidator.
    # NOTE(review): here the free-text pattern comes BEFORE the Read-code
    # patterns, unlike the other parsers in this file; order left unchanged.
    POTASSIUM = regex_or(
        POTASSIUM_BASE,
        *regex_components_from_read_codes(
            ReadCodes.POTASSIUM,
            ReadCodes.POTASSIUM_BLOOD,
            ReadCodes.POTASSIUM_PLASMA,
            ReadCodes.POTASSIUM_SERUM,
        ),
        wrap_each_in_noncapture_group=True,
        wrap_result_in_noncapture_group=False,
    )
    # Complete regex, built from the quantity and the units we recognize.
    REGEX = make_simple_numeric_regex(
        quantity=POTASSIUM,
        units=regex_or(
            MILLIMOLAR,  # good
            MILLIMOLES_PER_L,  # good
            MILLIEQ_PER_L,  # good
            MG,  # bad
        ),
        optional_ignorable_after_quantity=OPTIONAL_POC,
    )
    NAME = "Potassium"
    PREFERRED_UNIT_COLUMN = "value_mmol_L"
    # Multiplication factors converting each accepted unit to mmol/L.
    # MG is recognized by the regex but has no factor here; the tests expect
    # e.g. "losartan potassium 50mg" to yield nothing (presumably the parent
    # class rejects units that lack a factor -- confirm there).
    UNIT_MAPPING = {
        MILLIMOLAR: 1,  # preferred unit
        MILLIMOLES_PER_L: 1,
        MILLIEQ_PER_L: 1,
        # but not MG
    }

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        """
        Args:
            nlpdef:
                a :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`
            cfg_processor_name:
                the name of a CRATE NLP config file section (from which we
                may choose to get extra config information)
            commit:
                force a COMMIT whenever we insert data? You should specify
                this in multiprocess mode, or you may get database deadlocks.
        """
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            regex_str=self.REGEX,
            variable=self.NAME,
            target_unit=self.PREFERRED_UNIT_COLUMN,
            units_to_factor=self.UNIT_MAPPING,
            commit=commit,
            take_absolute=True,  # the tests expect "K-3.2" to give 3.2
        )

    def test(self, verbose: bool = False) -> None:
        """
        Self-test. Each pair is (text to parse, expected list of values).

        Args:
            verbose: passed through to the parent's test function
        """
        self.test_numerical_parser(
            [
                ("K", []),  # should fail; no values
                ("K 4", [4]),
                ("Potassium 4.3", [4.3]),
                ("K 4.5 mEq/L", [4.5]),
                ("K 4.5 mM", [4.5]),
                ("losartan potassium 50mg", []),
                ("Present: Kerry Smith (K). 1.0 Minutes of last meeting", []),
                ("Present: Kerry Smith (K) 1.0 Minutes of last meeting", []),
                ("K (H) 5.6 mM", [5.6]),
                ("K (*) 5.6 mM", [5.6]),
                ("K (X) 5.6 mM", []),
                ("blah (K) 5.6 mM", []),
                ("K (5.6) something", [5.6]),
                ("K (5.6 mM), others", [5.6]),
                ("K-3.2", [3.2]),
                ("Potassium level (X771S) 3.2", [3.2]),
                ("Blood potassium level (XaDvZ) 3.2", [3.2]),
                ("Plasma potassium level (XaIRl) 3.2", [3.2]),
                ("Serum potassium level (XE2pz) 3.2", [3.2]),
                ("Serum potassium level (XaIRl) 3.2", []),  # wrong code
            ],
            verbose=verbose,
        )
class PotassiumValidator(ValidatorBase):
    """
    Validator for Potassium (see help for explanation).
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        """
        Returns the variable name (``Potassium.NAME``) and a one-element
        list holding the regex string that names the quantity
        (``Potassium.POTASSIUM``).
        """
        return Potassium.NAME, [Potassium.POTASSIUM]
# =============================================================================
# Urea
# =============================================================================
class Urea(SimpleNumericalResultParser):
    """
    BIOCHEMISTRY (U&E).

    Urea, in mM.
    """

    # Free-text names: "U", "Ur", or "Urea".
    UREA_BASE = rf"""
        {WORD_BOUNDARY} U(?:r(?:ea)?)? {WORD_BOUNDARY}
    """
    # Any way of naming the quantity: Read-code-derived patterns, then the
    # free-text pattern. Also used by UreaValidator.
    UREA = regex_or(
        *regex_components_from_read_codes(
            ReadCodes.UREA_BLOOD,
            ReadCodes.UREA_PLASMA,
            ReadCodes.UREA_SERUM,
        ),
        UREA_BASE,
        wrap_each_in_noncapture_group=True,
        wrap_result_in_noncapture_group=False,
    )
    # Complete regex, built from the quantity and the units we recognize.
    REGEX = make_simple_numeric_regex(
        quantity=UREA,
        units=regex_or(
            MILLIMOLAR,  # good
            MILLIMOLES_PER_L,  # good
            MILLIEQ_PER_L,  # good
            MG,  # bad
        ),
        optional_ignorable_after_quantity=OPTIONAL_POC,
    )
    NAME = "Urea"
    PREFERRED_UNIT_COLUMN = "value_mmol_L"
    # Multiplication factors converting each accepted unit to mmol/L.
    # MG is recognized by the regex but deliberately has no factor here.
    UNIT_MAPPING = {
        MILLIMOLAR: 1,  # preferred unit
        MILLIMOLES_PER_L: 1,
        MILLIEQ_PER_L: 1,
        # but not MG
    }

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        """
        Args:
            nlpdef:
                a :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`
            cfg_processor_name:
                the name of a CRATE NLP config file section (from which we
                may choose to get extra config information)
            commit:
                force a COMMIT whenever we insert data? You should specify
                this in multiprocess mode, or you may get database deadlocks.
        """
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            regex_str=self.REGEX,
            variable=self.NAME,
            target_unit=self.PREFERRED_UNIT_COLUMN,
            units_to_factor=self.UNIT_MAPPING,
            commit=commit,
            take_absolute=True,  # the tests expect "U-3.2" to give 3.2
        )

    def test(self, verbose: bool = False) -> None:
        """
        Self-test. Each pair is (text to parse, expected list of values).

        Args:
            verbose: passed through to the parent's test function
        """
        self.test_numerical_parser(
            [
                ("Urea", []),  # should fail; no values
                ("U 4", [4]),
                ("Urea 4.3", [4.3]),
                ("U 4.5 mEq/L", [4.5]),
                ("Ur 4.5 mM", [4.5]),
                (
                    "Present: Ursula Rogers (U). 1.0 Minutes of last meeting",
                    [],
                ),
                (
                    "Present: Ursula Rogers (UR) 1.0 Minutes of last meeting",
                    [],
                ),
                ("U (H) 5.6 mM", [5.6]),
                ("Ur (*) 5.6 mM", [5.6]),
                ("Urea (X) 5.6 mM", []),
                ("blah (U) 5.6 mM", []),
                ("Urea (5.6) something", [5.6]),
                ("Urea (5.6 mM), others", [5.6]),
                ("U-3.2", [3.2]),
                ("Blood urea (X771P) 3.2", [3.2]),
                ("Plasma urea level (XaDvl) 3.2", [3.2]),
                ("Serum urea level (XM0lt) 3.2", [3.2]),
            ],
            verbose=verbose,
        )
class UreaValidator(ValidatorBase):
    """
    Validator for Urea (see help for explanation).
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        """
        Returns the variable name (``Urea.NAME``) and a one-element list
        holding the regex string that names the quantity (``Urea.UREA``).
        """
        return Urea.NAME, [Urea.UREA]
# =============================================================================
# Creatinine
# =============================================================================
class Creatinine(SimpleNumericalResultParser):
    """
    BIOCHEMISTRY (U&E).

    Creatinine. Default units are micromolar (SI); also supports mg/dL.
    """

    CREATININE_BASE = rf"""
        {WORD_BOUNDARY} Cr(?:eat(?:inine)?)? {WORD_BOUNDARY}
    """
    # ... Cr, Creat, Creatinine
    # Possible that "creatine" is present as a typo... but it's wrong...

    # Any way of naming the quantity: Read-code-derived patterns, then the
    # free-text pattern. Also used by CreatinineValidator.
    CREATININE = regex_or(
        *regex_components_from_read_codes(
            ReadCodes.CREATININE,
            ReadCodes.CREATININE_PLASMA,
            ReadCodes.CREATININE_PLASMA_CORRECTED,
            ReadCodes.CREATININE_SERUM,
            ReadCodes.CREATININE_SERUM_CORRECTED,
        ),
        CREATININE_BASE,
        wrap_each_in_noncapture_group=True,
        wrap_result_in_noncapture_group=False,
    )
    # Complete regex, built from the quantity and the units we recognize.
    REGEX = make_simple_numeric_regex(
        quantity=CREATININE,
        units=regex_or(
            MICROMOLAR,  # good
            MICROMOLES_PER_L,  # good
            MICROEQ_PER_L,  # good
            MG_PER_DL,  # good but needs conversion
            # ... note that MG_PER_DL must precede MG
            MG,  # bad
        ),
        optional_ignorable_after_quantity=OPTIONAL_POC,
    )
    CREATININE_MOLECULAR_MASS_G_PER_MOL = 113.12
    # ... https://pubchem.ncbi.nlm.nih.gov/compound/creatinine
    NAME = "Creatinine"
    PREFERRED_UNIT_COLUMN = "value_micromol_L"
    # Multiplication factors converting each accepted unit to micromol/L.
    # MG is recognized by the regex but has no factor here; the tests expect
    # "creatinine 3 mg" to yield nothing.
    UNIT_MAPPING = {
        MICROMOLAR: 1,  # preferred unit
        MICROMOLES_PER_L: 1,
        MICROEQ_PER_L: 1,
        MG_PER_DL: factor_micromolar_from_mg_per_dl(
            CREATININE_MOLECULAR_MASS_G_PER_MOL
        ),
        # but not MG
    }

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        """
        Args:
            nlpdef:
                a :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`
            cfg_processor_name:
                the name of a CRATE NLP config file section (from which we
                may choose to get extra config information)
            commit:
                force a COMMIT whenever we insert data? You should specify
                this in multiprocess mode, or you may get database deadlocks.
        """
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            regex_str=self.REGEX,
            variable=self.NAME,
            target_unit=self.PREFERRED_UNIT_COLUMN,
            units_to_factor=self.UNIT_MAPPING,
            commit=commit,
            take_absolute=True,  # the tests expect "Cr-75" to give 75
        )

    def test(self, verbose: bool = False) -> None:
        """
        Self-test. Each pair is (text to parse, expected list of values);
        expected values are in micromolar.

        Args:
            verbose: passed through to the parent's test function
        """

        def convert(mg_dl: float) -> float:
            # Convert mg/dl to μM
            return micromolar_from_mg_per_dl(
                mg_dl, self.CREATININE_MOLECULAR_MASS_G_PER_MOL
            )

        self.test_numerical_parser(
            [
                ("Creatinine", []),  # should fail; no values
                ("Cr 50", [50]),
                ("Creat 125.5", [125.5]),
                ("Creat 75 uEq/L", [75]),
                ("Cr 75 μM", [75]),
                (
                    "Present: Chloe Rogers (CR). 1.0 Minutes of last meeting",
                    [],
                ),
                ("Creatinine (H) 200 uM", [200]),
                ("Creatinine (*) 200 micromol/L", [200]),
                ("Creatinine (X) 200 uM", []),
                ("Creatinine 200 micromolar", [200]),
                ("Creatinine 200 micromolar, others", [200]),
                ("blah (creat) 5.6 uM", []),
                ("Creatinine (200) something", [200]),
                ("Creatinine (200 micromolar)", [200]),
                ("Creatinine (200 micromolar), others", [200]),
                ("Cr-75", [75]),
                ("creatinine 3 mg/dl", [convert(3)]),
                ("creatinine 3 mg", []),
                ("Creatinine level (X771Q) 75", [75]),
                ("Plasma creatinine level (XaETQ) 75", [75]),
                ("Cor plasma creatinine level (XaERX) 75", [75]),
                ("Serum creatinine level (XE2q5) 75", [75]),
                ("Cor serum creatinine level (XaERc) 75", [75]),
            ],
            verbose=verbose,
        )
class CreatinineValidator(ValidatorBase):
    """
    Validator for Creatinine (see help for explanation).
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        """
        Returns the variable name (``Creatinine.NAME``) and a one-element
        list holding the regex string that names the quantity
        (``Creatinine.CREATININE``).
        """
        return Creatinine.NAME, [Creatinine.CREATININE]
# =============================================================================
# Lithium (Li)
# =============================================================================
class Lithium(SimpleNumericalResultParser):
    """
    BIOCHEMISTRY (THERAPEUTIC DRUG MONITORING).

    Lithium (Li) levels (for blood tests, not doses), in mM.
    """

    # Free-text names: "Li" or "Lithium".
    LITHIUM_BASE = rf"""
        {WORD_BOUNDARY} Li(?:thium)? {WORD_BOUNDARY}
    """
    # Any way of naming the quantity: Read-code-derived pattern, then the
    # free-text pattern. Also used by LithiumValidator.
    LITHIUM = regex_or(
        *regex_components_from_read_codes(
            ReadCodes.LITHIUM_SERUM,
        ),
        LITHIUM_BASE,
        wrap_each_in_noncapture_group=True,
        wrap_result_in_noncapture_group=False,
    )
    # Complete regex, built from the quantity and the units we recognize.
    REGEX = make_simple_numeric_regex(
        quantity=LITHIUM,
        units=regex_or(
            MILLIMOLAR,  # good
            MILLIMOLES_PER_L,  # good
            MILLIEQ_PER_L,  # good
            MG,  # bad
            G,  # bad
        ),
    )
    NAME = "Lithium"
    PREFERRED_UNIT_COLUMN = "value_mmol_L"
    # Multiplication factors converting each accepted unit to mmol/L.
    # MG and G are recognized by the regex but have no factor here; the
    # tests expect doses such as "li 1200 mg" or "li 1.2 g" to yield nothing.
    UNIT_MAPPING = {
        MILLIMOLAR: 1,  # preferred unit
        MILLIMOLES_PER_L: 1,
        MILLIEQ_PER_L: 1,
        # but not MG
        # and not G
    }

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        """
        Args:
            nlpdef:
                a :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`
            cfg_processor_name:
                the name of a CRATE NLP config file section (from which we
                may choose to get extra config information)
            commit:
                force a COMMIT whenever we insert data? You should specify
                this in multiprocess mode, or you may get database deadlocks.
        """
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            regex_str=self.REGEX,
            variable=self.NAME,
            target_unit=self.PREFERRED_UNIT_COLUMN,
            units_to_factor=self.UNIT_MAPPING,
            commit=commit,
            take_absolute=True,  # the tests expect "Li-0.4" to give 0.4
        )

    def test(self, verbose: bool = False) -> None:
        """
        Self-test. Each pair is (text to parse, expected list of values).

        Args:
            verbose: passed through to the parent's test function
        """
        self.test_numerical_parser(
            [
                ("Li", []),  # should fail; no values
                ("Li 0.4", [0.4]),
                ("li 1200 mg", []),  # that's a dose
                ("li 1.2 g", []),  # that's a dose
                ("lithium 1200 mg", []),  # that's a dose
                ("lithium 153", [153]),  # an unhappy patient...
                ("Li 135 mEq/L", [135]),
                ("Li 139 mM", [139]),
                ("lithium carbonate 800mg", []),
                (
                    "Present: Linda Ingles (LI). 1.0 Minutes of last meeting",
                    [],
                ),
                ("Present: Linda Ingles (LI) 1.0 Minutes of last meeting", []),
                ("Li (H) 1.3 mM", [1.3]),
                ("Li (*) 1.3 mM", [1.3]),
                ("Li (X) 1.3 mM", []),
                ("blah (Li) 1.2 mM", []),
                ("Li (1.3) something", [1.3]),
                ("Li (0.4 mM), others", [0.4]),
                ("Li-0.4", [0.4]),
                ("Serum lithium level (XE25g) 0.4", [0.4]),
            ],
            verbose=verbose,
        )
class LithiumValidator(ValidatorBase):
    """
    Validator for Lithium (see help for explanation).
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        """
        Returns the variable name (``Lithium.NAME``) and a one-element list
        holding the regex string that names the quantity
        (``Lithium.LITHIUM``).
        """
        return Lithium.NAME, [Lithium.LITHIUM]
# =============================================================================
# Thyroid-stimulating hormone (TSH)
# =============================================================================
class Tsh(SimpleNumericalResultParser):
    """
    BIOCHEMISTRY (ENDOCRINOLOGY).

    Thyroid-stimulating hormone (TSH), in mIU/L (or μIU/mL).
    """

    # Free-text names: "TSH" or "thyroid stimulating hormone" (words joined
    # by hyphens and/or whitespace).
    TSH_BASE = rf"""
        {WORD_BOUNDARY}
            (?: TSH | thyroid [-\s]+ stimulating [-\s]+ hormone )
        {WORD_BOUNDARY}
    """
    # Any way of naming the quantity: Read-code-derived patterns, then the
    # free-text pattern. Also used by TshValidator.
    TSH = regex_or(
        *regex_components_from_read_codes(
            ReadCodes.TSH_PLASMA,
            ReadCodes.TSH_PLASMA_30_MIN,
            ReadCodes.TSH_PLASMA_60_MIN,
            ReadCodes.TSH_PLASMA_90_MIN,
            ReadCodes.TSH_PLASMA_120_MIN,
            ReadCodes.TSH_PLASMA_150_MIN,
            ReadCodes.TSH_SERUM,
            ReadCodes.TSH_SERUM_60_MIN,
            ReadCodes.TSH_SERUM_90_MIN,
            ReadCodes.TSH_SERUM_120_MIN,
            ReadCodes.TSH_SERUM_150_MIN,
        ),
        TSH_BASE,
        wrap_each_in_noncapture_group=True,
        wrap_result_in_noncapture_group=False,
    )
    # Complete regex, built from the quantity and the units we recognize.
    REGEX = make_simple_numeric_regex(
        quantity=TSH,
        units=regex_or(
            MILLIUNITS_PER_L,  # good
            MICROUNITS_PER_ML,  # good
        ),
    )
    NAME = "TSH"
    PREFERRED_UNIT_COLUMN = "value_mU_L"
    # Both accepted units map 1:1 onto mU/L.
    UNIT_MAPPING = {
        MILLIUNITS_PER_L: 1,  # preferred unit
        MICROUNITS_PER_ML: 1,
    }

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        """
        Args:
            nlpdef:
                a :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`
            cfg_processor_name:
                the name of a CRATE NLP config file section (from which we
                may choose to get extra config information)
            commit:
                force a COMMIT whenever we insert data? You should specify
                this in multiprocess mode, or you may get database deadlocks.
        """
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            regex_str=self.REGEX,
            variable=self.NAME,
            target_unit=self.PREFERRED_UNIT_COLUMN,
            units_to_factor=self.UNIT_MAPPING,
            commit=commit,
            take_absolute=True,  # the tests expect "TSH-2.3" to give 2.3
        )

    def test(self, verbose: bool = False) -> None:
        """
        Self-test. Each pair is (text to parse, expected list of values).

        Args:
            verbose: passed through to the parent's test function
        """
        self.test_numerical_parser(
            [
                ("TSH", []),  # should fail; no values
                ("TSH 1.5", [1.5]),
                ("thyroid-stimulating hormone 1.5", [1.5]),
                ("TSH 1.5 mU/L", [1.5]),
                ("TSH 1.5 mIU/L", [1.5]),
                ("TSH 1.5 μU/mL", [1.5]),
                ("TSH 1.5 μIU/mL", [1.5]),
                ("TSH 1.5 uU/mL", [1.5]),
                ("TSH 1.5 uIU/mL", [1.5]),
                ("TSH-2.3", [2.3]),
                ("Plasma TSH level (XaELW) 2.3", [2.3]),
                ("Serum TSH level (XaELV) 2.3", [2.3]),
                # etc.; not all Read codes tested here
            ],
            verbose=verbose,
        )
class TshValidator(ValidatorBase):
    """
    Validator for TSH (see help for explanation).
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        """
        Returns the variable name (``Tsh.NAME``) and a one-element list
        holding the regex string that names the quantity (``Tsh.TSH``).
        """
        return Tsh.NAME, [Tsh.TSH]
# =============================================================================
# Alkaline phosphatase
# =============================================================================
class AlkPhos(SimpleNumericalResultParser):
    """
    BIOCHEMISTRY (LFTs/BFTs).

    Alkaline phosphatase (ALP, AlkP, AlkPhos). Units are U/L.
    """

    # Free-text names: "ALP"/"AlkP" (optionally with a trailing full stop),
    # or "alk"/"alk."/"alkaline" followed by "phos"/"phos."/"phosphatase",
    # optionally separated by hyphens and/or whitespace.
    ALKP_BASE = rf"""
        {WORD_BOUNDARY}
        (?:
            (?: ALk?P (?:\. | {WORD_BOUNDARY}) ) |
            (?:
                alk(?:aline | \.)?
                [-\s]*
                phos(?:phatase{WORD_BOUNDARY} | \. | {WORD_BOUNDARY})
            )
        )
    """
    # Any way of naming the quantity: Read-code-derived patterns, then the
    # free-text pattern. Also used by AlkPhosValidator.
    ALKP = regex_or(
        *regex_components_from_read_codes(
            ReadCodes.ALKPHOS_PLASMA,
            ReadCodes.ALKPHOS_SERUM,
            ReadCodes.ALKPHOS,  # least specific; at end
        ),
        ALKP_BASE,
        wrap_each_in_noncapture_group=True,
        wrap_result_in_noncapture_group=False,
    )
    # Complete regex, built from the quantity and the only unit we recognize.
    REGEX = make_simple_numeric_regex(quantity=ALKP, units=UNITS_PER_L)
    NAME = "AlkPhos"
    PREFERRED_UNIT_COLUMN = "value_U_L"
    UNIT_MAPPING = {UNITS_PER_L: 1}  # preferred unit

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        """
        Args:
            nlpdef:
                a :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`
            cfg_processor_name:
                the name of a CRATE NLP config file section (from which we
                may choose to get extra config information)
            commit:
                force a COMMIT whenever we insert data? You should specify
                this in multiprocess mode, or you may get database deadlocks.
        """
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            regex_str=self.REGEX,
            variable=self.NAME,
            target_unit=self.PREFERRED_UNIT_COLUMN,
            units_to_factor=self.UNIT_MAPPING,
            commit=commit,
            take_absolute=True,  # the tests expect "ALP-55" to give 55
        )

    def test(self, verbose: bool = False) -> None:
        """
        Self-test. Each pair is (text to parse, expected list of values).

        Args:
            verbose: passed through to the parent's test function
        """
        self.test_numerical_parser(
            [
                ("ALP", []),  # should fail; no values
                ("was 7", []),  # no quantity
                ("ALP 55", [55]),
                ("Alkaline-Phosphatase 55", [55]),
                ("Alkaline Phosphatase    55 U/L ", [55]),
                ("ALP 55 U/L", [55]),
                ("ALP-55", [55]),
                ("AlkP    55", [55]),
                ("alk.phos.    55", [55]),
                ("alk. phos.    55", [55]),
                ("alkphos 55", [55]),
                ("Alkaline phosphatase level (44F3.) 55", [55]),
                (
                    "Alkaline phosphatase level (44F3x) 55",
                    [],
                ),  # test "." in regex
                ("Plasma alkaline phosphatase level (XaIRj) 55", [55]),
                ("Serum alkaline phosphatase level (XE2px) 55", [55]),
            ],
            verbose=verbose,
        )
class AlkPhosValidator(ValidatorBase):
    """
    Validator for AlkPhos (see help for explanation).
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        """
        Returns the variable name (``AlkPhos.NAME``) and a one-element list
        holding the regex string that names the quantity
        (``AlkPhos.ALKP``).
        """
        return AlkPhos.NAME, [AlkPhos.ALKP]
# =============================================================================
# Alanine aminotransferase (ALT)
# =============================================================================
class ALT(SimpleNumericalResultParser):
    """
    BIOCHEMISTRY (LFTs).

    Alanine aminotransferase (ALT), a.k.a. alanine transaminase (ALT).
    Units are U/L.

    A.k.a. serum glutamate-pyruvate transaminase (SGPT), or serum
    glutamate-pyruvic transaminase (SGPT), but not a.k.a. those in recent
    memory!
    """

    # Free-text names: "ALT", "alanine aminotransferase", or
    # "alanine transaminase" (words joined by hyphens and/or whitespace).
    ALT_BASE = rf"""
        {WORD_BOUNDARY}
        (?:
            ALT |
            alanine [-\s]+ (?: aminotransferase | transaminase )
        )
        {WORD_BOUNDARY}
    """
    # Any way of naming the quantity: Read-code-derived pattern, then the
    # free-text pattern. Also used by ALTValidator. (Within the class body
    # this attribute shares its name with the class itself.)
    ALT = regex_or(
        *regex_components_from_read_codes(
            ReadCodes.ALT,
        ),
        ALT_BASE,
        wrap_each_in_noncapture_group=True,
        wrap_result_in_noncapture_group=False,
    )
    # Complete regex, built from the quantity and the only unit we recognize.
    REGEX = make_simple_numeric_regex(quantity=ALT, units=UNITS_PER_L)
    NAME = "ALT"
    PREFERRED_UNIT_COLUMN = "value_U_L"
    UNIT_MAPPING = {UNITS_PER_L: 1}  # preferred unit

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        """
        Args:
            nlpdef:
                a :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`
            cfg_processor_name:
                the name of a CRATE NLP config file section (from which we
                may choose to get extra config information)
            commit:
                force a COMMIT whenever we insert data? You should specify
                this in multiprocess mode, or you may get database deadlocks.
        """
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            regex_str=self.REGEX,
            variable=self.NAME,
            target_unit=self.PREFERRED_UNIT_COLUMN,
            units_to_factor=self.UNIT_MAPPING,
            commit=commit,
            take_absolute=True,  # the tests expect "ALT-55" to give 55
        )

    def test(self, verbose: bool = False) -> None:
        """
        Self-test. Each pair is (text to parse, expected list of values).

        Args:
            verbose: passed through to the parent's test function
        """
        self.test_numerical_parser(
            [
                ("ALT", []),  # should fail; no values
                ("was 7", []),  # no quantity
                ("ALT 55", [55]),
                ("alanine-aminotransferase 55", [55]),
                ("Alanine aminotransferase    55 U/L ", [55]),
                ("alanine transaminase    55 U/L ", [55]),
                ("ALT 55 U/L", [55]),
                ("ALT-55", [55]),
                ("ALP 55", []),  # wrong thing
                ("ALT/SGPT serum level (44G3.) 55", [55]),
            ],
            verbose=verbose,
        )
class ALTValidator(ValidatorBase):
    """
    Validator for ALT (see help for explanation).
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        """
        Returns the variable name (``ALT.NAME``) and a one-element list
        holding the regex string that names the quantity (``ALT.ALT``).
        """
        return ALT.NAME, [ALT.ALT]
# =============================================================================
# Gamma GT (gGT)
# =============================================================================
class GammaGT(SimpleNumericalResultParser):
    """
    BIOCHEMISTRY (LFTs).

    Gamma-glutamyl transferase (gGT), in U/L.
    """

    # Free-text names: "γ", "G" or "gamma", then "GT" or
    # "glutamyl transferase", optionally separated by hyphens/whitespace.
    GGT_BASE = rf"""
        {WORD_BOUNDARY}
        (?:
            (?: γ | G | gamma)
            [-\s]*
            (?:
                GT |
                glutamyl [-\s]+ transferase
            )
        )
        {WORD_BOUNDARY}
    """
    # Any way of naming the quantity: Read-code-derived patterns, then the
    # free-text pattern. Also used by GammaGTValidator.
    GGT = regex_or(
        *regex_components_from_read_codes(
            ReadCodes.GAMMA_GT,
            ReadCodes.GAMMA_GT_PLASMA,
            ReadCodes.GAMMA_GT_SERUM,
        ),
        GGT_BASE,
        wrap_each_in_noncapture_group=True,
        wrap_result_in_noncapture_group=False,
    )
    # Complete regex, built from the quantity and the only unit we recognize.
    REGEX = make_simple_numeric_regex(quantity=GGT, units=UNITS_PER_L)
    NAME = "GammaGT"
    PREFERRED_UNIT_COLUMN = "value_U_L"
    UNIT_MAPPING = {UNITS_PER_L: 1}  # preferred unit

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        """
        Args:
            nlpdef:
                a :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`
            cfg_processor_name:
                the name of a CRATE NLP config file section (from which we
                may choose to get extra config information)
            commit:
                force a COMMIT whenever we insert data? You should specify
                this in multiprocess mode, or you may get database deadlocks.
        """
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            regex_str=self.REGEX,
            variable=self.NAME,
            target_unit=self.PREFERRED_UNIT_COLUMN,
            units_to_factor=self.UNIT_MAPPING,
            commit=commit,
            take_absolute=True,  # the tests expect "ggt-55" to give 55
        )

    def test(self, verbose: bool = False) -> None:
        """
        Self-test. Each pair is (text to parse, expected list of values).

        Args:
            verbose: passed through to the parent's test function
        """
        self.test_numerical_parser(
            [
                ("gGT", []),  # should fail; no values
                ("was 7", []),  # no quantity
                ("gGT 55", [55]),
                ("gamma Glutamyl Transferase 19 U/L", [19]),
                ("Gamma GT    55 U/L ", [55]),
                ("GGT 55 U/L", [55]),
                ("ggt-55", [55]),
                ("γGT 55", [55]),
                ("Gamma-glutamyl transferase lev (44G4.) 55", [55]),
                ("Plasma gamma-glutamyl transferase level (XaES4) 55", [55]),
                ("Serum gamma-glutamyl transferase level (XaES3) 55", [55]),
            ],
            verbose=verbose,
        )
class GammaGTValidator(ValidatorBase):
    """
    Validator for GammaGT (see help for explanation).
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        """
        Returns the variable name (``GammaGT.NAME``) and a one-element list
        holding the regex string that names the quantity (``GammaGT.GGT``).
        """
        return GammaGT.NAME, [GammaGT.GGT]
# =============================================================================
# Total bilirubin
# =============================================================================
class Bilirubin(SimpleNumericalResultParser):
    """
    BIOCHEMISTRY (LFTs).

    Total bilirubin. Units are μM.
    """

    # Free-text names: an optional "t."/"tot"/"tot."/"total" prefix followed
    # by whitespace, then "bil"/"bili", optionally ending "." or "rubin".
    BILIRUBIN_BASE = rf"""
        {WORD_BOUNDARY}
        (?: t(?: ot(?:al | \.)? | \.) \s+ )?
        bili?(?: \. | rubin{WORD_BOUNDARY})?
    """
    # Any way of naming the quantity: Read-code-derived patterns, then the
    # free-text pattern. Also used by BilirubinValidator.
    BILIRUBIN = regex_or(
        *regex_components_from_read_codes(
            ReadCodes.BILIRUBIN_PLASMA_TOTAL,
            ReadCodes.BILIRUBIN_SERUM,
            ReadCodes.BILIRUBIN_SERUM_TOTAL,
            ReadCodes.BILIRUBIN_TOTAL,
        ),
        BILIRUBIN_BASE,
        wrap_each_in_noncapture_group=True,
        wrap_result_in_noncapture_group=False,
    )
    # Complete regex, built from the quantity and the units we recognize.
    REGEX = make_simple_numeric_regex(
        quantity=BILIRUBIN,
        units=regex_or(
            MICROMOLAR,  # good
            MICROMOLES_PER_L,  # good
        ),
    )
    NAME = "Bilirubin"
    PREFERRED_UNIT_COLUMN = "value_micromol_L"
    UNIT_MAPPING = {MICROMOLAR: 1, MICROMOLES_PER_L: 1}  # preferred unit

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        """
        Args:
            nlpdef:
                a :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`
            cfg_processor_name:
                the name of a CRATE NLP config file section (from which we
                may choose to get extra config information)
            commit:
                force a COMMIT whenever we insert data? You should specify
                this in multiprocess mode, or you may get database deadlocks.
        """
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            regex_str=self.REGEX,
            variable=self.NAME,
            target_unit=self.PREFERRED_UNIT_COLUMN,
            units_to_factor=self.UNIT_MAPPING,
            commit=commit,
            take_absolute=True,
        )

    def test(self, verbose: bool = False) -> None:
        """
        Self-test. Each pair is (text to parse, expected list of values).

        Args:
            verbose: passed through to the parent's test function
        """
        self.test_numerical_parser(
            [
                ("tot Bil", []),  # should fail; no values
                ("was 7", []),  # no quantity
                ("tot Bil 6", [6]),
                ("Total Bilirubin: 6", [6]),
                ("Total Bilirubin 6 umol/L", [6]),
                ("bilirubin 17 μM", [17]),
                ("t.bilirubin 17 μM", [17]),
                ("t. bilirubin 17 μM", [17]),
                ("bili. 17 μM", [17]),
                ("bili 17 μM", [17]),
                ("Plasma total bilirubin level (XaETf) 17", [17]),
                ("Serum bilirubin level (44E..) 17", [17]),
                ("Serum total bilirubin level (XaERu) 17", [17]),
                ("Total bilirubin level (XE2qu) 17", [17]),
                (
                    "Total bilirubin \t level \n (XE2qu) 17",
                    [17],
                ),  # test whitespace
                (
                    "xTotal bilirubin level (XE2qu) 17",
                    [],
                ),  # test word boundary
                ("Serum total bilirubin level (XaERu) 6 umol/L", [6]),
            ],
            verbose=verbose,
        )
class BilirubinValidator(ValidatorBase):
    """
    Validator for Bilirubin (see help for explanation).
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        """
        Returns the variable name (``Bilirubin.NAME``) and a one-element
        list holding the regex string that names the quantity
        (``Bilirubin.BILIRUBIN``).
        """
        return Bilirubin.NAME, [Bilirubin.BILIRUBIN]
# =============================================================================
# Albumin (Alb)
# =============================================================================
class Albumin(SimpleNumericalResultParser):
    """
    BIOCHEMISTRY (LFTs).

    Albumin (Alb). Units are g/L.
    """

    # Free-text names: "alb", "alb." or "albumin", optionally followed by
    # whitespace and "level".
    ALBUMIN_BASE = rf"""
        {WORD_BOUNDARY}
        (?:
            alb(?:\. | umin{WORD_BOUNDARY})?
            (?: \s+ level{WORD_BOUNDARY})?
        )
    """
    # Any way of naming the quantity: Read-code-derived patterns, then the
    # free-text pattern. Also used by AlbuminValidator.
    ALBUMIN = regex_or(
        *regex_components_from_read_codes(
            ReadCodes.ALBUMIN_PLASMA,
            ReadCodes.ALBUMIN_SERUM,
        ),
        ALBUMIN_BASE,
        wrap_each_in_noncapture_group=True,
        wrap_result_in_noncapture_group=False,
    )
    # Complete regex, built from the quantity and the only unit we recognize.
    REGEX = make_simple_numeric_regex(quantity=ALBUMIN, units=G_PER_L)
    NAME = "Albumin"
    PREFERRED_UNIT_COLUMN = "value_g_L"
    UNIT_MAPPING = {G_PER_L: 1}  # preferred unit

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        """
        Args:
            nlpdef:
                a :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`
            cfg_processor_name:
                the name of a CRATE NLP config file section (from which we
                may choose to get extra config information)
            commit:
                force a COMMIT whenever we insert data? You should specify
                this in multiprocess mode, or you may get database deadlocks.
        """
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            regex_str=self.REGEX,
            variable=self.NAME,
            target_unit=self.PREFERRED_UNIT_COLUMN,
            units_to_factor=self.UNIT_MAPPING,
            commit=commit,
            take_absolute=True,
        )

    def test(self, verbose: bool = False) -> None:
        """
        Self-test. Each pair is (text to parse, expected list of values).

        Args:
            verbose: passed through to the parent's test function
        """
        self.test_numerical_parser(
            [
                ("Alb", []),  # should fail; no values
                ("was 7", []),  # no quantity
                ("ALP 6", []),  # wrong quantity
                ("Alb 6", [6]),
                ("Albumin: 48", [48]),
                ("Albumin 48 g/L", [48]),
                ("alb. 48", [48]),
                ("albumin level 48", [48]),
                ("Plasma albumin level (XaIRc) 48", [48]),
                ("Serum albumin level (XE2eA) 48", [48]),
            ],
            verbose=verbose,
        )
class AlbuminValidator(ValidatorBase):
    """
    Validator for Albumin (see help for explanation).
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        """
        Returns the variable name (``Albumin.NAME``) and a one-element list
        holding the regex string that names the quantity
        (``Albumin.ALBUMIN``).
        """
        return Albumin.NAME, [Albumin.ALBUMIN]
# =============================================================================
# Glucose
# =============================================================================
class Glucose(SimpleNumericalResultParser):
    """
    BIOCHEMISTRY.

    Glucose. Default units are mM; also supports mg/dL.
    """

    # By Emanuele Osimo, Feb 2019.
    # Some modifications by Rudolf Cardinal, Feb 2019.
    GLUCOSE_BASE = rf"""
        {WORD_BOUNDARY} glu(?:c(?:ose)?)? {WORD_BOUNDARY}
        # glu, gluc, glucose
    """
    # Any way of naming the quantity: Read-code-derived patterns, then the
    # free-text pattern. Also used by GlucoseValidator.
    GLUCOSE = regex_or(
        *regex_components_from_read_codes(
            ReadCodes.GLUCOSE,
            ReadCodes.GLUCOSE_BLOOD,
            ReadCodes.GLUCOSE_BLOOD_2H_POSTPRANDIAL,
            ReadCodes.GLUCOSE_BLOOD_150_MIN,
            ReadCodes.GLUCOSE_PLASMA_RANDOM,
            ReadCodes.GLUCOSE_PLASMA_FASTING,
            ReadCodes.GLUCOSE_PLASMA_30_MIN,
            ReadCodes.GLUCOSE_PLASMA_60_MIN,
            ReadCodes.GLUCOSE_PLASMA_90_MIN,
            ReadCodes.GLUCOSE_PLASMA_120_MIN,
            ReadCodes.GLUCOSE_PLASMA_2H_POSTPRANDIAL,
            ReadCodes.GLUCOSE_PLASMA_150_MIN,
            ReadCodes.GLUCOSE_SERUM,
            ReadCodes.GLUCOSE_SERUM_RANDOM,
            ReadCodes.GLUCOSE_SERUM_FASTING,
            ReadCodes.GLUCOSE_SERUM_30_MIN,
            ReadCodes.GLUCOSE_SERUM_60_MIN,
            ReadCodes.GLUCOSE_SERUM_90_MIN,
            ReadCodes.GLUCOSE_SERUM_120_MIN,
            ReadCodes.GLUCOSE_SERUM_2H_POSTPRANDIAL,
            ReadCodes.GLUCOSE_SERUM_150_MIN,
            # !
        ),
        GLUCOSE_BASE,
        wrap_each_in_noncapture_group=True,
        wrap_result_in_noncapture_group=False,
    )
    # Complete regex, built from the quantity and the units we recognize.
    REGEX = make_simple_numeric_regex(
        quantity=GLUCOSE,
        units=regex_or(
            MILLIMOLAR,  # good
            MILLIMOLES_PER_L,  # good
            MG_PER_DL,  # good but needs conversion
        ),
        optional_ignorable_after_quantity=OPTIONAL_POC,
    )
    GLUCOSE_MOLECULAR_MASS_G_PER_MOL = 180.156
    # ... https://pubchem.ncbi.nlm.nih.gov/compound/D-glucose
    NAME = "Glucose"
    PREFERRED_UNIT_COLUMN = "value_mmol_L"
    # Multiplication factors converting each accepted unit to mmol/L.
    UNIT_MAPPING = {
        MILLIMOLAR: 1,  # preferred unit
        MILLIMOLES_PER_L: 1,
        MG_PER_DL: factor_millimolar_from_mg_per_dl(
            GLUCOSE_MOLECULAR_MASS_G_PER_MOL
        ),
    }

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        """
        Args:
            nlpdef:
                a :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`
            cfg_processor_name:
                the name of a CRATE NLP config file section (from which we
                may choose to get extra config information)
            commit:
                force a COMMIT whenever we insert data? You should specify
                this in multiprocess mode, or you may get database deadlocks.
        """
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            regex_str=self.REGEX,
            variable=self.NAME,
            target_unit=self.PREFERRED_UNIT_COLUMN,
            units_to_factor=self.UNIT_MAPPING,
            commit=commit,
            take_absolute=True,  # the tests expect "glu-5" to give 5
        )

    def test(self, verbose: bool = False) -> None:
        """
        Self-test. Each pair is (text to parse, expected list of values);
        expected values are in mM.

        Args:
            verbose: passed through to the parent's test function
        """

        def convert(mg_dl: float) -> float:
            # Convert mg/dl to mM
            return millimolar_from_mg_per_dl(
                mg_dl, self.GLUCOSE_MOLECULAR_MASS_G_PER_MOL
            )

        self.test_numerical_parser(
            [
                ("glu", []),  # should fail; no values
                ("glucose 6 mM", [6]),
                ("glucose 6 mmol", [6]),
                ("glucose 6", [6]),
                ("glu 6", [6]),
                ("glucose 90 mg/dl", [convert(90)]),  # unit conversion
                ("gluc = 6", [6]),
                ("glucose: 6", [6]),
                ("glu equals 6", [6]),
                ("glucose is equal to 6", [6]),
                ("glu <4", [4]),
                ("glucose less than 1", [1]),  # would be bad news...
                ("glu more than 20", [20]),
                ("glucose was 15", [15]),
                ("glucose was 90 mg/dl", [convert(90)]),
                ("glu is 90 mg dl-1", [convert(90)]),
                ("glucose is 90 mg dl -1", [convert(90)]),
                ("glu-5", [5]),
                ("glucose | 20.3 (H) | mmol/L", [20.3]),
                ("Glucose level (X772y) 5", [5]),
                ("Blood glucose level (X772z) 5", [5]),
                # Not all Read codes tested.
            ],
            verbose=verbose,
        )
class GlucoseValidator(ValidatorBase):
    """
    Validator for Glucose (see help for explanation).
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        """
        Return the variable name and the list of regex strings (here just
        the glucose "quantity" regex) that this validator looks for.
        """
        return Glucose.NAME, [Glucose.GLUCOSE]
# =============================================================================
# LDL cholesterol
# =============================================================================
class LDLCholesterol(SimpleNumericalResultParser):
    """
    BIOCHEMISTRY (LIPID PROFILE).

    Low density lipoprotein (LDL) cholesterol.
    Default units are mM; also supports mg/dL.
    """

    # By Emanuele Osimo, Feb 2019.
    # Some modifications by Rudolf Cardinal, Feb 2019.

    # Free-text forms: "LDL" alone, or "LDL" followed (after optional
    # hyphens/whitespace) by "chol", "chol." or "cholesterol".
    LDL_BASE = rf"""
        {WORD_BOUNDARY}
        LDL [-\s]*
        (?:
            chol(?:esterol)?{WORD_BOUNDARY} |
            chol\. |
            {WORD_BOUNDARY} # allows LDL by itself
        )
    """
    # Full "quantity" pattern: Read code descriptions or the free-text form.
    LDL = regex_or(
        *regex_components_from_read_codes(
            ReadCodes.LDL_PLASMA,
            ReadCodes.LDL_PLASMA_FASTING,
            ReadCodes.LDL_PLASMA_RANDOM,
            ReadCodes.LDL_SERUM,
            ReadCodes.LDL_SERUM_FASTING,
            ReadCodes.LDL_SERUM_RANDOM,
        ),
        LDL_BASE,
        wrap_each_in_noncapture_group=True,
        wrap_result_in_noncapture_group=False,
    )
    REGEX = make_simple_numeric_regex(
        quantity=LDL,
        units=regex_or(
            MILLIMOLAR,  # good
            MILLIMOLES_PER_L,  # good
            MG_PER_DL,  # good but needs conversion
        ),
    )
    NAME = "LDL cholesterol"
    PREFERRED_UNIT_COLUMN = "value_mmol_L"
    FACTOR_MG_DL_TO_MMOL_L = 0.02586
    # ... https://www.ncbi.nlm.nih.gov/books/NBK33478/
    # Multiplier taking each recognized unit to the preferred unit (mmol/L).
    UNIT_MAPPING = {
        MILLIMOLAR: 1,  # preferred unit
        MILLIMOLES_PER_L: 1,
        MG_PER_DL: FACTOR_MG_DL_TO_MMOL_L,
    }

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        """
        Configure the parent numerical-result parser for LDL cholesterol.

        Args:
            nlpdef:
                a :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`
                (may be ``None``)
            cfg_processor_name:
                name of the CRATE NLP config file section for this processor
                (the module docstring refers to this as ``cfgsection``)
            commit:
                force a COMMIT whenever we insert data? You should specify
                this in multiprocess mode, or you may get database deadlocks.
        """
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            regex_str=self.REGEX,
            variable=self.NAME,
            target_unit=self.PREFERRED_UNIT_COLUMN,
            units_to_factor=self.UNIT_MAPPING,
            commit=commit,
            # "ldl-4" must yield 4, not -4 (see test()).
            take_absolute=True,
        )

    def test(self, verbose: bool = False) -> None:
        # docstring in parent class

        def convert(mg_dl: float) -> float:
            # Convert mg/dl to mM
            return self.FACTOR_MG_DL_TO_MMOL_L * mg_dl

        # Each case is (input text, list of expected values in mmol/L).
        self.test_numerical_parser(
            [
                ("LDL", []),  # should fail; no values
                ("LDL 4 mM", [4]),
                ("LDL chol 4 mmol", [4]),
                ("LDL chol. 4 mmol", [4]),
                ("LDL 4", [4]),
                ("chol 4", []),  # that's total cholesterol
                ("HDL chol 4", []),  # that's HDL cholesterol
                (
                    "LDL cholesterol 140 mg/dl",
                    [convert(140)],
                ),  # unit conversion
                ("LDL = 4", [4]),
                ("LDL: 4", [4]),
                ("LDL equals 4", [4]),
                ("LDL is equal to 4", [4]),
                ("LDL <4", [4]),
                ("LDLchol less than 4", [4]),
                ("LDL cholesterol more than 20", [20]),
                ("LDL was 4", [4]),
                ("LDL chol was 140 mg/dl", [convert(140)]),
                ("chol was 140 mg/dl", []),
                ("LDL is 140 mg dl-1", [convert(140)]),
                ("ldl chol is 140 mg dl -1", [convert(140)]),
                ("ldl-4", [4]),
                ("LDL chol | 6.2 (H) | mmol/L", [6.2]),
                ("Plasma LDL cholesterol level (XaEVs) 4", [4]),
                ("Plasma rndm LDL cholest level (44d4.) 4", [4]),
                ("Plasma fast LDL cholest level (44d5.) 4", [4]),
                ("Serum LDL cholesterol level (44P6.) 4", [4]),
                ("Serum fast LDL cholesterol lev (44PD.) 4", [4]),
                ("Ser random LDL cholesterol lev (44PE.) 4", [4]),
            ],
            verbose=verbose,
        )
class LDLCholesterolValidator(ValidatorBase):
    """
    Validator for LDLCholesterol (see help for explanation).
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        """
        Return the variable name and the list of regex strings (here just
        the LDL "quantity" regex) that this validator looks for.
        """
        return LDLCholesterol.NAME, [LDLCholesterol.LDL]
# =============================================================================
# HDL cholesterol
# =============================================================================
class HDLCholesterol(SimpleNumericalResultParser):
    """
    BIOCHEMISTRY (LIPID PROFILE).

    High-density lipoprotein (HDL) cholesterol.
    Default units are mM; also supports mg/dL.
    """

    # By Emanuele Osimo, Feb 2019.
    # Some modifications by Rudolf Cardinal, Feb 2019.

    # Free-text forms: "HDL" alone, or "HDL" followed (after optional
    # hyphens/whitespace) by "chol", "chol." or "cholesterol".
    HDL_BASE = rf"""
        {WORD_BOUNDARY}
        HDL [-\s]*
        (?:
            chol(?:esterol)?{WORD_BOUNDARY} |
            chol\. |
            {WORD_BOUNDARY} # allows HDL by itself
        )
    """
    # Full "quantity" pattern: Read code descriptions or the free-text form.
    HDL = regex_or(
        *regex_components_from_read_codes(
            ReadCodes.HDL_PLASMA,
            ReadCodes.HDL_PLASMA_FASTING,
            ReadCodes.HDL_PLASMA_RANDOM,
            ReadCodes.HDL_SERUM,
            ReadCodes.HDL_SERUM_FASTING,
            ReadCodes.HDL_SERUM_RANDOM,
        ),
        HDL_BASE,
        wrap_each_in_noncapture_group=True,
        wrap_result_in_noncapture_group=False,
    )
    REGEX = make_simple_numeric_regex(
        quantity=HDL,
        units=regex_or(
            MILLIMOLAR,  # good
            MILLIMOLES_PER_L,  # good
            MG_PER_DL,  # good but needs conversion
        ),
    )
    NAME = "HDL cholesterol"
    PREFERRED_UNIT_COLUMN = "value_mmol_L"
    FACTOR_MG_DL_TO_MMOL_L = 0.02586
    # ... https://www.ncbi.nlm.nih.gov/books/NBK33478/
    # Multiplier taking each recognized unit to the preferred unit (mmol/L).
    UNIT_MAPPING = {
        MILLIMOLAR: 1,  # preferred unit
        MILLIMOLES_PER_L: 1,
        MG_PER_DL: FACTOR_MG_DL_TO_MMOL_L,
    }

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        """
        Configure the parent numerical-result parser for HDL cholesterol.

        Args:
            nlpdef:
                a :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`
                (may be ``None``)
            cfg_processor_name:
                name of the CRATE NLP config file section for this processor
                (the module docstring refers to this as ``cfgsection``)
            commit:
                force a COMMIT whenever we insert data? You should specify
                this in multiprocess mode, or you may get database deadlocks.
        """
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            regex_str=self.REGEX,
            variable=self.NAME,
            target_unit=self.PREFERRED_UNIT_COLUMN,
            units_to_factor=self.UNIT_MAPPING,
            commit=commit,
            # "hdl-4" must yield 4, not -4 (see test()).
            take_absolute=True,
        )

    def test(self, verbose: bool = False) -> None:
        # docstring in parent class

        def convert(mg_dl: float) -> float:
            # Convert mg/dl to mM
            return self.FACTOR_MG_DL_TO_MMOL_L * mg_dl

        # Each case is (input text, list of expected values in mmol/L).
        self.test_numerical_parser(
            [
                ("HDL", []),  # should fail; no values
                ("HDL 4 mM", [4]),
                ("HDL chol 4 mmol", [4]),
                ("HDL chol. 4 mmol", [4]),
                ("HDL 4", [4]),
                ("chol 4", []),  # that's total cholesterol
                ("LDL chol 4", []),  # that's LDL cholesterol
                (
                    "HDL cholesterol 140 mg/dl",
                    [convert(140)],
                ),  # unit conversion
                ("HDL = 4", [4]),
                ("HDL: 4", [4]),
                ("HDL equals 4", [4]),
                ("HDL is equal to 4", [4]),
                ("HDL <4", [4]),
                ("HDLchol less than 4", [4]),
                ("HDL cholesterol more than 20", [20]),
                ("HDL was 4", [4]),
                ("HDL chol was 140 mg/dl", [convert(140)]),
                ("chol was 140 mg/dl", []),
                ("HDL is 140 mg dl-1", [convert(140)]),
                ("Hdl chol is 140 mg dl -1", [convert(140)]),
                ("hdl-4", [4]),
                ("HDL chol | 6.2 (H) | mmol/L", [6.2]),
                ("Plasma HDL cholesterol level (XaEVr) 4", [4]),
                ("Plasma rndm HDL cholest level (44d2.) 4", [4]),
                ("Plasma fast HDL cholest level (44d3.) 4", [4]),
                ("Serum HDL cholesterol level (44P5.) 4", [4]),
                ("Serum fast HDL cholesterol lev (44PB.) 4", [4]),
                ("Ser random HDL cholesterol lev (44PC.) 4", [4]),
            ],
            verbose=verbose,
        )
class HDLCholesterolValidator(ValidatorBase):
    """
    Validator for HDLCholesterol (see help for explanation).
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        """
        Return the variable name and the list of regex strings (here just
        the HDL "quantity" regex) that this validator looks for.
        """
        return HDLCholesterol.NAME, [HDLCholesterol.HDL]
# =============================================================================
# Total cholesterol
# =============================================================================
class TotalCholesterol(SimpleNumericalResultParser):
    """
    BIOCHEMISTRY (LIPID PROFILE).

    Total or undifferentiated cholesterol.
    Default units are mM; also supports mg/dL.
    """

    # Free-text forms: "chol", "chol." or "cholesterol", optionally prefixed
    # by "total", and explicitly NOT preceded by "HDL"/"LDL" (those belong to
    # the HDL/LDL parsers).
    CHOLESTEROL_BASE = rf"""
        {WORD_BOUNDARY}
        (?<!HDL[-\s]+) (?<!LDL[-\s]+) # not preceded by HDL or LDL
        (?: tot(?:al) [-\s] )? # optional "total" prefix
        (?:
            chol(?:esterol)?{WORD_BOUNDARY} |
            chol\.
        )
    """
    # ... (?<! something ) is a negative lookbehind assertion
    # NOTE(review): in "tot(?:al)" the inner group is not optional, so only
    # "total" (not "tot") is consumed as a prefix. Confirm whether
    # "tot(?:al)?" was intended; left unchanged here.
    # Full "quantity" pattern: Read code descriptions or the free-text form.
    CHOLESTEROL = regex_or(
        *regex_components_from_read_codes(
            ReadCodes.CHOLESTEROL_SERUM,
            ReadCodes.CHOLESTEROL_TOTAL_PLASMA,
            ReadCodes.CHOLESTEROL_TOTAL_SERUM,
        ),
        CHOLESTEROL_BASE,
        wrap_each_in_noncapture_group=True,
        wrap_result_in_noncapture_group=False,
    )
    REGEX = make_simple_numeric_regex(
        quantity=CHOLESTEROL,
        units=regex_or(
            MILLIMOLAR,  # good
            MILLIMOLES_PER_L,  # good
            MG_PER_DL,  # good but needs conversion
        ),
    )
    NAME = "Total cholesterol"
    PREFERRED_UNIT_COLUMN = "value_mmol_L"
    FACTOR_MG_DL_TO_MMOL_L = 0.02586
    # ... https://www.ncbi.nlm.nih.gov/books/NBK33478/
    # Multiplier taking each recognized unit to the preferred unit (mmol/L).
    UNIT_MAPPING = {
        MILLIMOLAR: 1,  # preferred unit
        MILLIMOLES_PER_L: 1,
        MG_PER_DL: FACTOR_MG_DL_TO_MMOL_L,
    }

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        """
        Configure the parent numerical-result parser for total cholesterol.

        Args:
            nlpdef:
                a :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`
                (may be ``None``)
            cfg_processor_name:
                name of the CRATE NLP config file section for this processor
                (the module docstring refers to this as ``cfgsection``)
            commit:
                force a COMMIT whenever we insert data? You should specify
                this in multiprocess mode, or you may get database deadlocks.
        """
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            regex_str=self.REGEX,
            variable=self.NAME,
            target_unit=self.PREFERRED_UNIT_COLUMN,
            units_to_factor=self.UNIT_MAPPING,
            commit=commit,
            # "chol-4" must yield 4, not -4 (see test()).
            take_absolute=True,
        )

    def test(self, verbose: bool = False) -> None:
        # docstring in parent class

        def convert(mg_dl: float) -> float:
            # Convert mg/dl to mM
            return self.FACTOR_MG_DL_TO_MMOL_L * mg_dl

        # Each case is (input text, list of expected values in mmol/L).
        self.test_numerical_parser(
            [
                ("chol", []),  # should fail; no values
                ("chol 4 mM", [4]),
                ("total chol 4 mmol", [4]),
                ("chol. 4 mmol", [4]),
                ("chol 4", [4]),
                ("HDL chol 4", []),  # that's HDL cholesterol
                ("LDL chol 4", []),  # that's LDL cholesterol
                (
                    "total cholesterol 140 mg/dl",
                    [convert(140)],
                ),  # unit conversion
                ("chol = 4", [4]),
                ("chol: 4", [4]),
                ("chol equals 4", [4]),
                ("chol is equal to 4", [4]),
                ("chol <4", [4]),
                ("chol less than 4", [4]),
                ("cholesterol more than 20", [20]),
                ("chol was 4", [4]),
                ("chol was 140 mg/dl", [convert(140)]),
                ("chol was 140", [140]),  # but probably wrong interpretation!
                ("chol is 140 mg dl-1", [convert(140)]),
                ("chol is 140 mg dl -1", [convert(140)]),
                ("chol-4", [4]),
                ("chol | 6.2 (H) | mmol/L", [6.2]),
                ("Serum cholesterol level (XE2eD) 4", [4]),
                ("Plasma total cholesterol level (XaIRd) 4", [4]),
                ("Serum total cholesterol level (XaJe9) 4", [4]),
            ],
            verbose=verbose,
        )
class TotalCholesterolValidator(ValidatorBase):
    """
    Validator for TotalCholesterol (see help for explanation).
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        """
        Return the variable name and the list of regex strings (here just
        the total cholesterol "quantity" regex) that this validator looks
        for.
        """
        return TotalCholesterol.NAME, [TotalCholesterol.CHOLESTEROL]
# =============================================================================
# Triglycerides
# =============================================================================
class Triglycerides(SimpleNumericalResultParser):
    """
    BIOCHEMISTRY (LIPID PROFILE).

    Triglycerides.
    Default units are mM; also supports mg/dL.
    """

    # By Emanuele Osimo, Feb 2019.
    # Some modifications by Rudolf Cardinal, Feb 2019.

    # Free-text forms: "Triglyceride", "Triglycerides" or "TG".
    TG_BASE = rf"""
        {WORD_BOUNDARY}
        (?: Triglyceride[s]? | TG )
        {WORD_BOUNDARY}
    """
    # Full "quantity" pattern: Read code descriptions or the free-text form.
    TG = regex_or(
        *regex_components_from_read_codes(
            ReadCodes.TG,
            ReadCodes.TG_PLASMA,
            ReadCodes.TG_PLASMA_FASTING,
            ReadCodes.TG_PLASMA_RANDOM,
            ReadCodes.TG_SERUM,
            ReadCodes.TG_SERUM_FASTING,
            ReadCodes.TG_SERUM_RANDOM,
        ),
        TG_BASE,
        wrap_each_in_noncapture_group=True,
        wrap_result_in_noncapture_group=False,
    )
    REGEX = make_simple_numeric_regex(
        quantity=TG,
        units=regex_or(
            MILLIMOLAR,  # good
            MILLIMOLES_PER_L,  # good
            MG_PER_DL,  # good but needs conversion
        ),
    )
    NAME = "Triglycerides"
    PREFERRED_UNIT_COLUMN = "value_mmol_L"
    FACTOR_MG_DL_TO_MMOL_L = 0.01129  # reciprocal of 88.57
    # ... https://www.ncbi.nlm.nih.gov/books/NBK33478/
    # ... https://www.ncbi.nlm.nih.gov/books/NBK83505/
    # Multiplier taking each recognized unit to the preferred unit (mmol/L).
    UNIT_MAPPING = {
        MILLIMOLAR: 1,  # preferred unit
        MILLIMOLES_PER_L: 1,
        MG_PER_DL: FACTOR_MG_DL_TO_MMOL_L,
    }

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        """
        Configure the parent numerical-result parser for triglycerides.

        Args:
            nlpdef:
                a :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`
                (may be ``None``)
            cfg_processor_name:
                name of the CRATE NLP config file section for this processor
                (the module docstring refers to this as ``cfgsection``)
            commit:
                force a COMMIT whenever we insert data? You should specify
                this in multiprocess mode, or you may get database deadlocks.
        """
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            regex_str=self.REGEX,
            variable=self.NAME,
            target_unit=self.PREFERRED_UNIT_COLUMN,
            units_to_factor=self.UNIT_MAPPING,
            commit=commit,
            # "TG-4" must yield 4, not -4 (see test()).
            take_absolute=True,
        )

    def test(self, verbose: bool = False) -> None:
        # docstring in parent class

        def convert(mg_dl: float) -> float:
            # Convert mg/dl to mM
            return self.FACTOR_MG_DL_TO_MMOL_L * mg_dl

        # Each case is (input text, list of expected values in mmol/L).
        self.test_numerical_parser(
            [
                ("TG", []),  # should fail; no values
                ("triglycerides", []),  # should fail; no values
                ("TG 4 mM", [4]),
                ("triglycerides 4 mmol", [4]),
                ("triglyceride 4 mmol", [4]),
                ("TG 4", [4]),
                ("TG 140 mg/dl", [convert(140)]),  # unit conversion
                ("TG = 4", [4]),
                ("TG: 4", [4]),
                ("TG equals 4", [4]),
                ("TG is equal to 4", [4]),
                ("TG <4", [4]),
                ("TG less than 4", [4]),
                ("TG more than 20", [20]),
                ("TG was 4", [4]),
                ("TG was 140 mg/dl", [convert(140)]),
                ("TG was 140", [140]),  # but probably wrong interpretation!
                ("TG is 140 mg dl-1", [convert(140)]),
                ("TG is 140 mg dl -1", [convert(140)]),
                ("TG-4", [4]),
                ("triglycerides | 6.2 (H) | mmol/L", [6.2]),
                ("Triglyceride level (X772O) 4", [4]),
                ("Plasma triglyceride level (44e..) 4", [4]),
                ("Plasma rndm triglyceride level (44e0.) 4", [4]),
                ("Plasma fast triglyceride level (44e1.) 4", [4]),
                ("Serum triglyceride levels (XE2q9) 4", [4]),
                ("Serum fasting triglyceride lev (44Q4.) 4", [4]),
                ("Serum random triglyceride lev (44Q5.) 4", [4]),
            ],
            verbose=verbose,
        )
class TriglyceridesValidator(ValidatorBase):
    """
    Validator for Triglycerides (see help for explanation).
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        """
        Return the variable name and the list of regex strings (here just
        the triglyceride "quantity" regex) that this validator looks for.
        """
        return Triglycerides.NAME, [Triglycerides.TG]
# =============================================================================
# HbA1c
# =============================================================================
def hba1c_mmol_per_mol_from_percent(
    percent: Union[float, str]
) -> Optional[float]:
    """
    Convert an HbA1c value from old percentage units -- DCCT (Diabetes Control
    and Complications Trial), UKPDS (United Kingdom Prospective Diabetes Study)
    or NGSP (National Glycohemoglobin Standardization Program) -- to newer IFCC
    (International Federation of Clinical Chemistry) mmol/mol units (mmol HbA1c
    / mol Hb).

    Args:
        percent:
            DCCT value as a percentage; either a number or a string, which
            is converted via ``to_float``. The sign is ignored.

    Returns:
        IFCC value in mmol/mol, or ``None`` if the input is falsy after
        conversion (that includes a value of zero).

    Example: 5% becomes 31.1 mmol/mol.

    By Emanuele Osimo, Feb 2019.
    Some modifications by Rudolf Cardinal, Feb 2019.

    References:

    - Emanuele had mmol_per_mol = (percent - 2.14) * 10.929 -- primary source
      awaited.
    - Jeppsson 2002, https://www.ncbi.nlm.nih.gov/pubmed/11916276 -- no,
      that's the chemistry
    - https://www.ifcchba1c.org/
    - http://www.ngsp.org/ifccngsp.asp -- gives master equation of
      NGSP = [0.09148 * IFCC] + 2.152, therefore implying
      IFCC = (NGSP - 2.152) * 10.93135.
    - Little & Rohlfing 2013: https://www.ncbi.nlm.nih.gov/pubmed/23318564;
      also gives NGSP = [0.09148 * IFCC] + 2.152.

    Note also that you may see eAG values (estimated average glucose), in
    mmol/L or mg/dl; see http://www.ngsp.org/A1ceAG.asp; these are not direct
    measurements of HbA1c.
    """
    # Master equation constants: IFCC = (NGSP - intercept) * slope.
    ngsp_intercept_percent = 2.152
    ifcc_per_ngsp_percent = 10.93135

    value = to_float(percent) if isinstance(percent, str) else percent
    if not value:
        # Unparseable string, or zero: no meaningful result.
        return None
    magnitude = abs(value)  # deals with e.g. "HbA1c-8%" -> -8
    return (magnitude - ngsp_intercept_percent) * ifcc_per_ngsp_percent
class HbA1c(SimpleNumericalResultParser):
    """
    BIOCHEMISTRY.

    Glycosylated (glycated) haemoglobin (HbA1c).
    Default units are mmol/mol; also supports %.

    Note: HbA1 is different
    (https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2541274).
    """

    # By Emanuele Osimo, Feb 2019.
    # Some modifications by Rudolf Cardinal, Feb 2019.

    # Free-text forms: "HbA1c", or "glycated"/"glycosylated" followed by
    # "haemoglobin"/"hemoglobin"/"Hb".
    HBA1C_BASE = rf"""
        {WORD_BOUNDARY}
        (?:
            (?: Glyc(?:osyl)?ated [-\s]+ (?:ha?emoglobin|Hb) ) |
            HbA1c
        )
        {WORD_BOUNDARY}
    """
    # Full "quantity" pattern: Read code descriptions or the free-text form.
    HBA1C = regex_or(
        *regex_components_from_read_codes(
            ReadCodes.HBA1C,
            ReadCodes.HBA1C_DCCT,
            ReadCodes.HBA1C_IFCC,
        ),
        HBA1C_BASE,
        wrap_each_in_noncapture_group=True,
        wrap_result_in_noncapture_group=False,
    )
    # The "bad" units are matched by the regex but are absent from
    # UNIT_MAPPING below; test() expects no value for them.
    REGEX = make_simple_numeric_regex(
        quantity=HBA1C,
        units=regex_or(
            MILLIMOLES_PER_MOL,  # standard
            PERCENT,  # good but needs conversion
            MILLIMOLES_PER_L,  # bad; may be an eAG value
            MG_PER_DL,  # bad; may be an eAG value
        ),
    )
    NAME = "HBA1C"
    PREFERRED_UNIT_COLUMN = "value_mmol_mol"
    # Unit regex -> multiplier, or conversion function, giving mmol/mol.
    UNIT_MAPPING = {
        MILLIMOLES_PER_MOL: 1,  # preferred unit
        PERCENT: hba1c_mmol_per_mol_from_percent,
        # but not MILLIMOLES_PER_L
        # and not MG_PER_DL
    }

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        """
        Configure the parent numerical-result parser for HbA1c.

        Args:
            nlpdef:
                a :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`
                (may be ``None``)
            cfg_processor_name:
                name of the CRATE NLP config file section for this processor
                (the module docstring refers to this as ``cfgsection``)
            commit:
                force a COMMIT whenever we insert data? You should specify
                this in multiprocess mode, or you may get database deadlocks.
        """
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            regex_str=self.REGEX,
            variable=self.NAME,
            target_unit=self.PREFERRED_UNIT_COLUMN,
            units_to_factor=self.UNIT_MAPPING,
            commit=commit,
            # "HbA1c-31" must yield 31, not -31 (see test()).
            take_absolute=True,
        )

    def test(self, verbose: bool = False) -> None:
        # docstring in parent class

        def convert(percent: float) -> Optional[float]:
            # Convert % to mmol/mol
            return hba1c_mmol_per_mol_from_percent(percent)

        # Each case is (input text, list of expected values in mmol/mol).
        self.test_numerical_parser(
            [
                ("HbA1c", []),  # should fail; no values
                ("glycosylated haemoglobin", []),  # should fail; no values
                ("HbA1c 31", [31]),
                ("HbA1c 31 mmol/mol", [31]),
                ("HbA1c 31 mg/dl", []),  # wrong units
                ("HbA1c 31 mmol/L", []),  # wrong units
                ("glycosylated haemoglobin 31 mmol/mol", [31]),
                ("glycated hemoglobin 31 mmol/mol", [31]),
                ("HbA1c 8%", [convert(8)]),
                ("HbA1c = 8%", [convert(8)]),
                ("HbA1c: 31", [31]),
                ("HbA1c equals 31", [31]),
                ("HbA1c is equal to 31", [31]),
                ("HbA1c <31.2", [31.2]),
                ("HbA1c less than 4", [4]),
                ("HbA1c more than 20", [20]),
                ("HbA1c was 31", [31]),
                ("HbA1c was 15%", [convert(15)]),
                ("HbA1c-31", [31]),
                ("HbA1c-8%", [convert(8)]),
                ("HbA1c | 40 (H) | mmol/mol", [40]),
                ("Haemoglobin A1c level (X772q) 8%", [convert(8)]),
                ("HbA1c level (DCCT aligned) (XaERp) 8%", [convert(8)]),
                ("HbA1c levl - IFCC standardised (XaPbt) 31 mmol/mol", [31]),
            ],
            verbose=verbose,
        )
class HbA1cValidator(ValidatorBase):
    """
    Validator for HbA1c (see help for explanation).
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        """
        Return the variable name and the list of regex strings (here just
        the HbA1c "quantity" regex) that this validator looks for.
        """
        return HbA1c.NAME, [HbA1c.HBA1C]
# =============================================================================
# All classes in this module
# =============================================================================
# Registry of the biochemistry processors, as (parser class, validator class)
# pairs, listed in case-insensitive alphabetical order of parser name.
ALL_BIOCHEMISTRY_NLP_AND_VALIDATORS = [
(Albumin, AlbuminValidator),
(AlkPhos, AlkPhosValidator),
(ALT, ALTValidator),
(Bilirubin, BilirubinValidator),
(Creatinine, CreatinineValidator),
(Crp, CrpValidator),
(GammaGT, GammaGTValidator),
(Glucose, GlucoseValidator),
(HbA1c, HbA1cValidator),
(HDLCholesterol, HDLCholesterolValidator),
(LDLCholesterol, LDLCholesterolValidator),
(Lithium, LithiumValidator),
(Potassium, PotassiumValidator),
(Sodium, SodiumValidator),
(TotalCholesterol, TotalCholesterolValidator),
(Triglycerides, TriglyceridesValidator),
(Tsh, TshValidator),
(Urea, UreaValidator),
]