# Source code for crate_anon.nlp_manager.regex_units

"""
crate_anon/nlp_manager/regex_units.py

===============================================================================

    Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
    Created by Rudolf Cardinal (rnc1001@cam.ac.uk).

    This file is part of CRATE.

    CRATE is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    CRATE is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with CRATE. If not, see <https://www.gnu.org/licenses/>.

===============================================================================

**Regular expressions to detect physical units.**

"""

from typing import List, Optional, Tuple

from crate_anon.nlp_manager.regex_numbers import (
    BILLION,
    MULTIPLY_OR_SPACE,
    PLAIN_INTEGER,
    POWER,
    TRILLION,
)


# =============================================================================
# Physical units
# =============================================================================

# Separator used in "N out of M" expressions: either a slash ("/") or the
# words "out of" (with word boundaries either side).
# NOTE(review): the literal spaces in this pattern look like layout only, i.e.
# the fragment presumably ends up in a regex compiled with re.VERBOSE --
# confirm against the code that compiles it.
OUT_OF_SEPARATOR = r"(?: \/ | \b out \s+ of \b )"


def per(
    numerator: str,
    denominator: str,
    include_power_minus1: bool = True,
    numerator_optional: bool = False,
) -> str:
    """
    Build regex text for a ratio of units, "X per Y"; e.g. "millimoles per
    litre", "cells per cubic millimetre".

    The result always accepts "X / Y" and "X per Y"; optionally it also
    accepts the negative-exponent spelling "X Y -1".

    Args:
        numerator:
            regex text for the numerator; an empty string means that there
            is no numerator at all
        denominator:
            regex text for the denominator
        include_power_minus1:
            also accept the "n d -1" way of writing "n/d"
        numerator_optional:
            make a (non-blank) numerator optional in the match

    Returns:
        regex text: a non-capturing group containing the alternatives
    """
    if not numerator:
        # Blank numerator: nothing precedes the separator.
        prefix = ""
    elif numerator_optional:
        # The trailing whitespace sits inside the optional group, so that an
        # absent numerator leaves no stray whitespace behind.
        prefix = rf"(?: {numerator} \s* )?"
    else:
        # Mandatory numerator, then optional whitespace.
        prefix = rf"{numerator} \s*"

    alternatives = [rf"{prefix} (?: \/ | \b per \b) \s* {denominator}"]
    if include_power_minus1:
        # "\s*" followed by "\b" (rather than "\s+") is used so that this
        # form still works when the numerator is blank.
        alternatives.append(rf"{prefix} \b {denominator} \s* -1")
    body = " | ".join(alternatives)
    return rf"(?: {body} )"
def _out_of_str(n_as_regex: str) -> str:
    """
    Build regex text for "out of N", where N is itself given as regex text.

    Both spellings of the separator are accepted: "/ N" and "out of N".

    Args:
        n_as_regex: regex text that matches the number N

    Returns:
        regex text (a non-capturing group)
    """
    # separator, optional whitespace, N, then a word boundary after N
    return r"(?: {separator} \s* {n} \b)".format(
        separator=OUT_OF_SEPARATOR,
        n=n_as_regex,
    )
def out_of(n: int) -> str:
    """
    Build regex text for "out of N" for one specific number N.

    Args:
        n: the number N

    Returns:
        regex text matching "/ N" or "out of N"
    """
    n_as_regex = str(n)  # the digits of N, used as the regex text for N
    return _out_of_str(n_as_regex)
def out_of_anything() -> str:
    """
    Build regex text for "out of N" where N is not fixed in advance.

    Returns:
        regex text for "out of N", with N matched by ``PLAIN_INTEGER``
    """
    any_number = PLAIN_INTEGER
    return _out_of_str(any_number)
def power(x: str, n: int, allow_no_operator: bool = False) -> str:
    """
    Build regex text for "x to the power n".

    Args:
        x: regex text for the base
        n: the exponent
        allow_no_operator:
            make the power operator (like ``^`` or ``**``) optional, so that
            the base may be followed directly by the exponent

    Returns:
        regex text (a non-capturing group)
    """
    # A trailing "?" makes the operator optional when requested.
    operator = rf"{POWER}?" if allow_no_operator else rf"{POWER}"
    return rf"(?: {x} \s* {operator} \s* {n})"
def units_times(*args: str) -> str:
    """
    Combine unit regexes that are notionally multiplied together.

    Args:
        *args: regex text for each unit, in order

    Returns:
        regex text (a non-capturing group) in which consecutive units are
        joined by an optional ``MULTIPLY_OR_SPACE``
    """
    optional_multiply = f"{MULTIPLY_OR_SPACE}?"
    product = optional_multiply.join(args)
    return r"(?: {} )".format(product)
def units_by_dimension(
    *args: Tuple[str, int],  # specify type of *one* arg!
    allow_no_operator: bool = False,
) -> str:
    """
    Build regex text for a compound unit described by its dimensions.

    Every ``(unit, exponent)`` pair becomes a "unit to the power exponent"
    element, and the elements are joined with ``MULTIPLY_OR_SPACE``. When
    there are exactly two pairs, the first with a positive and the second
    with a negative exponent, an "x per y" spelling is offered as well.

    Args:
        *args: each is a tuple ``unit, power``; the power must be non-zero
        allow_no_operator:
            make the power operator (like ``^`` or ``**``) optional?

    Returns:
        regex text (a non-capturing group of the alternatives)
    """
    separator = " " + MULTIPLY_OR_SPACE + " "

    powered_units: List[str] = []
    for unit, exponent in args:
        assert exponent != 0
        powered_units.append(
            power(unit, exponent, allow_no_operator=allow_no_operator)
        )
    product = separator.join(powered_units)
    alternatives = [rf"(?: {product} )"]

    if len(args) == 2:
        (top_unit, top_exponent), (bottom_unit, bottom_exponent) = args
        if top_exponent > 0 and bottom_exponent < 0:
            # x per y
            alternatives.append(
                per(top_unit, bottom_unit, include_power_minus1=False)
            )

    return r"(?: {} )".format(r" | ".join(alternatives))
# -----------------------------------------------------------------------------
# Distance
# -----------------------------------------------------------------------------

M = r"(?: met(?:re|er)s? | m )"  # m, metre(s), meter(s)
CM = r"(?: cm | centimet(?:re|er)s? )"  # cm, centimetre(s), centimeter(s)
MM = r"(?: mm | millimet(?:re|er)s? )"  # mm, millimetre(s), millimeter(s)

FEET = r"""(?: f(?:ee|oo)?t | \' | ’ | ′ )"""
# ... feet, foot, ft
# ... apostrophe, right single quote (U+2019), prime (U+2032)
INCHES = r"""(?: in(?:ch(?:e)?)?s? | \" | ” | ″)"""
# ... in, ins, inch, inches, [inchs = typo but clear]
# ... ", right double quote (U+201D), double prime (U+2033)

# -----------------------------------------------------------------------------
# Mass
# -----------------------------------------------------------------------------

MCG = r"(?: mcg | microgram(?:me)?s? | [μu]g )"  # you won't stop people using ug...  # noqa
MG = r"(?: mg | milligram(?:me)?s? )"  # mg, milligram, milligrams, milligramme, milligrammes  # noqa
G = r"(?: gram(?:me)?s? | g )"  # g, gram, grams, gramme, grammes
KG = r"(?: kgs? | kilo(?:gram(?:me)?)?s? )"  # kg, kgs, kilos ... kilogrammes etc.  # noqa
LB = r"(?: pounds? | lbs? )"  # pound(s), lb(s)
STONES = r"(?: stones? | st\.? )"  # stone(s), st, st.

# -----------------------------------------------------------------------------
# Volume
# -----------------------------------------------------------------------------

L = r"(?: lit(?:re|er)s? | L )"  # L, litre(s), liter(s)
# The sub-litre units below are built by prefixing the litre pattern:
DL = rf"(?: d(?:eci)?{L} )"  # 10^-1
ML = rf"(?: m(?:illi)?{L} )"  # 10^-3
MICROLITRE = rf"(?: micro{L} | [μu]L )"  # 10^-6: microL, microliter(s), microlitre(s), μL, uL  # noqa
NANOLITRE = rf"(?: nano{L} | nL )"  # 10^-9: nanoL, nanoliter(s), nanolitre(s), nL  # noqa
PICOLITRE = rf"(?: pico{L} | pL )"  # 10^-12: picoL, picoliter(s), picolitre(s), pL  # noqa
FEMTOLITRE = rf"(?: femto{L} | fL )"  # 10^-15: femtoL, femtoliter(s), femtolitre(s), fL  # noqa

# Previous version, without "cmm":
# CUBIC_MM = r"""(?: (?:\b cubic \s+ {mm}) | {mm_cubed} )""".format(
CUBIC_MM = r"""(?: (?:\b cubic \s+ {mm}) | {mm_cubed} | (?: \b cmm \b ) )""".format(  # noqa
    mm=MM, mm_cubed=power(MM, 3, allow_no_operator=True)
)
# cubic mm, etc. | mm^3, mm3, mm 3, etc. | cmm
# "cmm" added 2018-09-07 having seen this in the wild (albeit urinary results).

# A microlitre is of course the same as a cubic millimetre:
CUBIC_MM_OR_MICROLITRE = rf"(?: {MICROLITRE} | {CUBIC_MM} )"

# -----------------------------------------------------------------------------
# Inverse (reciprocal) volume
# -----------------------------------------------------------------------------

# Blank numerator, so this matches "/ cubic mm", "per cubic mm", or
# "cubic mm -1". (per() ignores numerator_optional when the numerator is
# blank.)
PER_CUBIC_MM = per("", CUBIC_MM, numerator_optional=True)

# -----------------------------------------------------------------------------
# Time
# -----------------------------------------------------------------------------

HOUR = r"(?: \b h(?:rs?|ours?)? \b)"  # h, hr, hrs, hour, hours
DAY = r"(?: \b d(?:y?|ay?)? \b )"  # d, dy, day (as written, also "da")
WEEK = r"(?: \b w(?:k?|eek?)? \b)"  # w, wk, week (as written, also "wee")
MONTH = r"(?:\b month \b)"  # month
YEAR = r"(?:\b y(?:(?:ea)?r)? \b)"  # y, yr, year

DAYS_PER_WEEK = 7
# The mean month (across a normal 4-year cycle ignoring century non-leap years)
# is 30.4375 days (R code):
#   n <- c(28, rep(30, 4), rep(31, 7))  # mean 30.41667
#   l <- c(29, rep(30, 4), rep(31, 7))  # mean 30.5
#   fouryearcycle <- c(n, n, n, l)  # mean 30.4375
#   century <- c(rep(n, 76), rep(l, 24))  # mean 30.43667
#   mean(n) / 7  # 4.345238
#   mean(fouryearcycle) / 7  # 4.348214
#   mean(century) / 7  # 4.348095
# ... the Google answer for weeks per month is 4.34524, i.e. a normal year.
# But let's not be spuriously precise:
WEEKS_PER_MONTH_APPROX = 4.35
WEEKS_PER_YEAR_APPROX = 52

# -----------------------------------------------------------------------------
# Proportions
# -----------------------------------------------------------------------------

PERCENT = r"""(?:%|pe?r?\s?ce?n?t)"""
# "%" or some subset of "percent" -- for the latter, must have "pct", other
# characters optional

# -----------------------------------------------------------------------------
# Arbitrary count things
# -----------------------------------------------------------------------------

CELLS = r"(?:\b cells? \b)"  # cell, cells
UNITS = r"(?: (?:I\.?)? U(?:nits?|\.)? )"
# U, IU, I.U., unit, units...
# (IU for international units)
MICROUNITS = rf"(?: (?:micro|μ|u) {UNITS} )"
MILLIUNITS = rf"(?: m(?:illi)? {UNITS} )"

UK = r"(?: U(?:nited\s+|\.\s*)? K(?:ingdom|\.)? )"
ALCOHOL = r"(?: \b(?:alcohol|ethanol|EtOH)\b )"
# NOTE(review): "({ALCOHOL} \s+)?" below is the only *capturing* group among
# these constants; everything else uses "(?: ... )". Presumably unintended,
# but making it non-capturing would renumber groups for any caller that uses
# positional groups -- confirm before changing.
UK_ALCOHOL_UNITS = rf"(?: (?: {UK} \s+)? ({ALCOHOL} \s+)? {UNITS} )"
# U, unit, units, UK units, UK alcohol units...
# I thought not "IU" as they are not international units; however, RS used that
# term, so whether correct or in error, that's sufficient for me to include it!
UK_ALCOHOL_UNITS_PER_DAY = per(
    UK_ALCOHOL_UNITS, DAY, include_power_minus1=False
)
UK_ALCOHOL_UNITS_PER_WEEK = per(
    UK_ALCOHOL_UNITS, WEEK, include_power_minus1=False
)
UK_ALCOHOL_UNITS_PER_MONTH = per(
    UK_ALCOHOL_UNITS, MONTH, include_power_minus1=False
)
UK_ALCOHOL_UNITS_PER_YEAR = per(
    UK_ALCOHOL_UNITS, YEAR, include_power_minus1=False
)

SCORE = r"(?:scored?)"  # score(d)

# -----------------------------------------------------------------------------
# Moles
# -----------------------------------------------------------------------------

MOLES = r"(?:\b mole?s? \b)"  # mol, mole, mols, moles
MICROMOLES = r"(?: (?:micro|μ|u)mole?s? )"
MILLIMOLES = r"(?: m(?:illi)?mole?s? )"

MICROEQ = r"(?: (?:micro|μ|u)Eq )"
MILLIEQ = r"(?: m(?:illi)?Eq )"

# -----------------------------------------------------------------------------
# Concentration (molarity)
# -----------------------------------------------------------------------------

MICROMOLAR = r"(?:[μu]M | micromolar)"
MILLIMOLAR = r"(?:mM)"  # NB case-insensitive... confusable with millimetres

MICROEQ_PER_L = per(MICROEQ, L)
MICROMOLES_PER_L = per(MICROMOLES, L)
MILLIEQ_PER_L = per(MILLIEQ, L)
MILLIMOLES_PER_L = per(MILLIMOLES, L)

# -----------------------------------------------------------------------------
# Concentration (mass)
# -----------------------------------------------------------------------------

G_PER_DL = per(G, DL)
G_PER_L = per(G, L)
L_PER_L = per(L, L)
MG_PER_DL = per(MG, DL)
MG_PER_L = per(MG, L)

# -----------------------------------------------------------------------------
# Concentration (arbitrary count and dimensionless things)
# -----------------------------------------------------------------------------

BILLION_PER_L = per(BILLION, L)
TRILLION_PER_L = per(TRILLION, L)

CELLS_PER_CUBIC_MM = per(CELLS, CUBIC_MM, numerator_optional=True)
CELLS_PER_CUBIC_MM_OR_MICROLITRE = per(
    CELLS, CUBIC_MM_OR_MICROLITRE, numerator_optional=True
)

MICROUNITS_PER_ML = per(MICROUNITS, ML)
MILLIUNITS_PER_L = per(MILLIUNITS, L)
UNITS_PER_L = per(UNITS, L)

MILLIMOLES_PER_MOL = per(MILLIMOLES, MOLES)

# -----------------------------------------------------------------------------
# Speed
# -----------------------------------------------------------------------------

MM_PER_H = per(MM, HOUR)

# -----------------------------------------------------------------------------
# Pressure
# -----------------------------------------------------------------------------

MM_HG = r"(?: mm \s* Hg )"  # mmHg, mm Hg
# ... likelihood of "millimetres of mercury" quite small?

# -----------------------------------------------------------------------------
# Area and related
# -----------------------------------------------------------------------------

# NB the pattern below carries its own "#" comments, each of which runs to the
# end of its line; the line breaks inside the string therefore matter.
SQ_M = r"""
    (?:  # square metres
        (?: sq(?:uare)? \s+ {m} )       # sq m, square metres, etc.
        | (?: {m} \s+ sq(?:uared?)? )   # m sq, metres square(d), etc.
        | {m_sq}                        # m ^ 2, etc.
    )
""".format(
    m=M, m_sq=power(M, 2)
)

# BMI: either "kg per square metre" or "kg m^-2"
KG_PER_SQ_M = r"(?: {kg_per_sqm} | {kg_sqm_pow_minus2} )".format(
    kg_per_sqm=per(KG, SQ_M, include_power_minus1=False),
    kg_sqm_pow_minus2=units_times(KG, power(M, -2)),
)


# =============================================================================
# Generic conversion functions
# =============================================================================
def kg_from_st_lb_oz(
    stones: float = 0, pounds: float = 0, ounces: float = 0
) -> Optional[float]:
    """
    Convert an Imperial mass (stones, pounds, ounces) to kilograms.

    Args:
        stones: number of stones
        pounds: number of pounds
        ounces: number of ounces

    Returns:
        mass in kg, or ``None`` if the arithmetic raises ``TypeError`` or
        ``ValueError`` (e.g. one of the inputs is ``None``)
    """
    pounds_per_stone = 14
    ounces_per_pound = 16
    # 1 avoirdupois pound = 0.45359237 kg
    # https://en.wikipedia.org/wiki/Pound_(mass)
    kg_per_pound = 0.45359237
    try:
        mass_lb = (
            (stones * pounds_per_stone) + pounds + (ounces / ounces_per_pound)
        )
        return kg_per_pound * mass_lb
    except (TypeError, ValueError):
        return None
def m_from_ft_in(feet: float = 0, inches: float = 0) -> Optional[float]:
    """
    Convert an Imperial length (feet, inches) to metres.

    Args:
        feet: number of feet
        inches: number of inches

    Returns:
        length in m, or ``None`` if the arithmetic raises ``TypeError`` or
        ``ValueError`` (e.g. one of the inputs is ``None``)
    """
    inches_per_foot = 12
    mm_per_inch = 25.4
    mm_per_m = 1000
    try:
        length_in = (feet * inches_per_foot) + inches
        return length_in * mm_per_inch / mm_per_m
    except (TypeError, ValueError):
        return None
def m_from_m_cm(metres: float = 0, centimetres: float = 0) -> Optional[float]:
    """
    Combine a "metres and centimetres" length into metres.

    Args:
        metres: number of metres
        centimetres: number of centimetres

    Returns:
        length in m, or ``None`` if the arithmetic raises ``TypeError`` or
        ``ValueError`` (e.g. one of the inputs is ``None``)
    """
    cm_per_m = 100
    try:
        return metres + (centimetres / cm_per_m)
    except (TypeError, ValueError):
        return None
def assemble_units(components: List[Optional[str]]) -> str:
    """
    Join unit names with single spaces, skipping blank ones.

    For example, ``["ft", "in"]`` becomes ``"ft in"``; components that are
    ``None`` or empty are left out.

    Args:
        components: unit names, any of which may be ``None`` or empty

    Returns:
        the remaining components, space-separated
    """
    # filter(None, ...) keeps only the truthy components
    return " ".join(filter(None, components))
def factor_millimolar_from_mg_per_dl(molecular_mass_g_per_mol: float) -> float:
    """
    Returns the conversion factor that you should multiply a "mg/dL" number
    by to get a "mM" (mmol/L) number.

    Principle:

    .. code-block:: none

        mmol_per_L = 1000 * mol_per_L
                   = 1000 * (g_per_L / g_per_mol)
                   = 1000 * ((10 * g_per_dL) / g_per_mol)
                   = 1000 * ((10 * 0.001 * mg_per_dL) / g_per_mol)
                   = (10 / g_per_mol) * mg_per_dL

    Example: glucose, molecular mass 180.156 g/mol

    .. code-block:: none

        => conversion factor is (10 / 180.156)
        90 mg/dL -> (10 / 180.156) * 90 mM = 5.0 mM

    Args:
        molecular_mass_g_per_mol: molecular mass in g/mol

    Returns:
        conversion factor
    """
    net_scale = 10  # see the derivation above
    return net_scale / molecular_mass_g_per_mol
def factor_micromolar_from_mg_per_dl(molecular_mass_g_per_mol: float) -> float:
    """
    Returns the conversion factor that you should multiply a "mg/dL" number
    by to get a "μM" (μmol/L) number.

    This is 1000 times the corresponding mg/dL -> mM factor.

    Args:
        molecular_mass_g_per_mol: molecular mass in g/mol

    Returns:
        conversion factor
    """
    millimolar_factor = factor_millimolar_from_mg_per_dl(
        molecular_mass_g_per_mol
    )
    return 1000 * millimolar_factor
def millimolar_from_mg_per_dl(
    mg_per_dl: float, molecular_mass_g_per_mol: float
) -> float:
    """
    Convert a concentration from mg/dL to mM (mmol/L).

    Args:
        mg_per_dl: value in mg/dL
        molecular_mass_g_per_mol: molecular mass in g/mol

    Returns:
        value in mM = mmol/L
    """
    factor = factor_millimolar_from_mg_per_dl(molecular_mass_g_per_mol)
    return mg_per_dl * factor
def micromolar_from_mg_per_dl(
    mg_per_dl: float, molecular_mass_g_per_mol: float
) -> float:
    """
    Convert a concentration from mg/dL to μM (μmol/L).

    Args:
        mg_per_dl: value in mg/dL
        molecular_mass_g_per_mol: molecular mass in g/mol

    Returns:
        value in μM = μmol/L
    """
    factor = factor_micromolar_from_mg_per_dl(molecular_mass_g_per_mol)
    return mg_per_dl * factor