# Source code for crate_anon.nlp_manager.parse_substance_misuse

"""
crate_anon/nlp_manager/parse_substance_misuse.py

===============================================================================

    Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
    Created by Rudolf Cardinal (rnc1001@cam.ac.uk).

    This file is part of CRATE.

    CRATE is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    CRATE is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with CRATE. If not, see <https://www.gnu.org/licenses/>.

===============================================================================

**Python regex-based NLP processors for substance misuse.**

"""

import logging
from typing import Any, Dict, Generator, List, Optional, Tuple

from crate_anon.common.regex_helpers import (
    at_wb_start_end,
    noncapture_group,
    optional_named_capture_group,
    optional_noncapture_group,
    regex_or,
    WORD_BOUNDARY,
)
from crate_anon.nlp_manager.nlp_definition import NlpDefinition
from crate_anon.nlp_manager.number import to_float
from crate_anon.nlp_manager.regex_func import (
    compile_regex,
    compile_regex_dict,
    get_regex_dict_match,
    get_regex_dict_search,
)
from crate_anon.nlp_manager.regex_parser import (
    common_tense,
    EVER,
    FN_CONTENT,
    FN_END,
    FN_RELATION,
    FN_RELATION_TEXT,
    FN_START,
    FN_TENSE,
    FN_TENSE_TEXT,
    FN_UNITS,
    FN_VALUE_TEXT,
    FN_VARIABLE_NAME,
    FN_VARIABLE_TEXT,
    GROUP_NAME_QUANTITY,
    GROUP_NAME_RELATION,
    GROUP_NAME_TENSE,
    GROUP_NAME_UNITS,
    GROUP_NAME_VALUE,
    GROUP_NUMBER_WHOLE_EXPRESSION,
    make_simple_numeric_regex,
    NumericalResultParser,
    PAST,
    PRESENT,
    ValidatorBase,
)
from crate_anon.nlp_manager.regex_units import (
    ALCOHOL,
    DAYS_PER_WEEK,
    UK_ALCOHOL_UNITS_PER_DAY,
    UK_ALCOHOL_UNITS_PER_MONTH,
    UK_ALCOHOL_UNITS_PER_WEEK,
    UK_ALCOHOL_UNITS_PER_YEAR,
    WEEKS_PER_MONTH_APPROX,
    WEEKS_PER_YEAR_APPROX,
)

log = logging.getLogger(__name__)


# =============================================================================
# Alcohol
# =============================================================================


class AlcoholUnits(NumericalResultParser):
    """
    SUBSTANCE MISUSE.

    Alcohol consumption, specified explicitly as (UK) units per day or per
    week, or via non-numeric references to not drinking any.

    - Output is in UK units per week. A UK unit is 10 ml of ethanol [#f1]_
      [#f2]_. UK NHS guidelines used to be "per week" and remain broadly
      week-based [#f1]_.

    - It doesn't attempt any understanding of other alcohol descriptions (e.g.
      "pints of beer", "glasses of wine", "bottles of vodka") so is expected to
      apply where a clinician has converted a (potentially mixed) alcohol
      description to a units-per-week calculation.

    - Two regexes are applied to each text, in this order: one for explicit
      "X units per <period>" statements (:attr:`REGEX_ALCOHOL_UNITS`) and one
      for "no alcohol" statements (:attr:`NO_ALCOHOL`), the latter yielding a
      value of zero.

    .. [#f1] https://www.nhs.uk/live-well/alcohol-advice/calculating-alcohol-units/,
           accessed 2023-01-18.
    .. [#f2] https://en.wikipedia.org/wiki/Unit_of_alcohol

    """  # noqa: E501

    # There are no relevant Read codes for alcohol consumption in
    # v3ReadCode_PBCL.xlsx.

    # -------------------------------------------------------------------------
    # Regex building for tense-related statements
    # -------------------------------------------------------------------------
    # All these are verbose regexes, so don't omit \s+ for whitespace!

    PAST_ADVERBS = (
        "formerly",
        "once",
        "peak",
        "previously",
        "was",
    )
    PAST_ADVERBS_RE = noncapture_group(regex_or(*PAST_ADVERBS))
    DOES_NOT = r"does\s*n[o'’]t"  # does not, doesn't
    PRESENT_ADVERBS = (
        r"at \s+ present",
        r"currently",
        r"has \s+ been",
        r"now",
        r"nowadays",
        r"presently",
        r"these \s+ days",
        DOES_NOT,
    )
    PRESENT_ADVERBS_RE = noncapture_group(regex_or(*PRESENT_ADVERBS))
    TEMPORAL_WORDS = tuple(
        at_wb_start_end(x) for x in PAST_ADVERBS + PRESENT_ADVERBS
    )
    TEMPORAL = noncapture_group(regex_or(*TEMPORAL_WORDS))
    OPT_TEMPORAL = optional_noncapture_group(regex_or(*TEMPORAL_WORDS))
    NEVER = "never"
    # "Never" is both temporal and negating and thus fiddly. We do *not*
    # include it in standard temporal words, or a statement about "has never
    # drunk >100 u/w" would be misinterpreted as positive.

    # -------------------------------------------------------------------------
    # Regex building for drinking alcohol (and when)
    # -------------------------------------------------------------------------

    DRINKING_PAST = (
        # Past infinitive: she used to drink
        r"\b used \s+ to \s+ drink \b",
        # Imperfect tense: she [adverb] drank
        rf"\b (?: {PAST_ADVERBS_RE} \s+ )? drank \b",
        # Perfect tense: has [adverb] drunk
        # FIX: the \s+ after "has" was missing. Because these are verbose
        # regexes, the literal space in the pattern is ignored, so the old
        # pattern required "hasdrunk" and never matched "has drunk".
        rf"\b has \s+ (?: {PAST_ADVERBS_RE} \s+ )? drunk \b",
        # Past continuous tense: he was [adverb] drinking
        # Also abbreviated past continuous tense: previously drinking
        rf"\b {PAST_ADVERBS_RE} \s+ drinking \b",
    )
    # We don't allow the adverbs by themselves, to avoid something that isn't
    # explicitly about alcohol or drinking, e.g. "[insulin] currently 6
    # units/day".
    DRINKING_PRESENT = (
        # Present tense: he [adverb] drinks
        rf"\b (?: {PRESENT_ADVERBS_RE} \s+)? drinks \b",
        # Present continuous tense: he is [adverb] drinking
        rf"\b (?: is \s+)? (?: {PRESENT_ADVERBS_RE} \s+)? drinking \b",
    )
    DRINKING_PAST_PRESENT = DRINKING_PAST + DRINKING_PRESENT
    DRINKING = noncapture_group(regex_or(*DRINKING_PAST_PRESENT))
    OPT_DRINKING = optional_noncapture_group(regex_or(*DRINKING_PAST_PRESENT))

    ALCOHOL_PM_CONSUMPTION = rf"{ALCOHOL} (?: \s+ consumption \b)?"
    ALC = noncapture_group(ALCOHOL_PM_CONSUMPTION)
    OPT_ALC = optional_noncapture_group(ALCOHOL_PM_CONSUMPTION)

    # BRK: requires some sort of wordbreak or whitespace, but also disposes of
    # junk like some punctuation (e.g. "previously: none" versus "previously
    # none") and words like "at" (e.g. in "drinking at X units/week").
    BRK = noncapture_group(
        regex_or(
            r"\s* : \s*",  # colon +/- whitespace
            r"\s* \b at \b \s*",  # "at" +/- whitespace
            r"\s+",  # whitespace
            WORD_BOUNDARY,  # other word break
        )
    )

    # Move from more to less specific, or the less specific will capture first.
    ALCOHOL_DRINKING = rf"""
        {WORD_BOUNDARY}
        # Alcohol drinking:
        (?:
            # 1. ... DRINKING ... [ALC] ...
            {OPT_TEMPORAL} {BRK}
            {DRINKING} {BRK}
            {OPT_TEMPORAL} {BRK}
            {OPT_ALC} {BRK}
            {OPT_TEMPORAL}
            |
            # 2. ... ALC ... [DRINKING] ...
            {OPT_TEMPORAL} {BRK}
            {ALC} {BRK}
            {OPT_TEMPORAL} {BRK}
            {OPT_DRINKING} {BRK}
            {OPT_TEMPORAL}
        )
        {WORD_BOUNDARY}
    """

    # Map each tense-bearing regex fragment to its tense constant. The
    # "drinking" phrases are inserted before the bare adverbs within each
    # tense, and past before present.
    _drinking_tense_dict = {}  # type: Dict[str, str]
    for _past in DRINKING_PAST + PAST_ADVERBS:
        _drinking_tense_dict[_past] = PAST
    for _present in DRINKING_PRESENT + PRESENT_ADVERBS:
        _drinking_tense_dict[_present] = PRESENT
    TENSE_PAST_PRESENT_LOOKUP = compile_regex_dict(_drinking_tense_dict)
    TENSE_NEVER_LOOKUP = compile_regex_dict({NEVER: EVER})

    # -------------------------------------------------------------------------
    # Regex building for "drinking alcohol at X units per week"
    # -------------------------------------------------------------------------

    # A temporal suffix allows e.g. "drinking X units/week previously".
    GROUP_NAME_SUFFIX = "suffix"
    group_suffix = r"\b \s*" + optional_named_capture_group(
        TEMPORAL, GROUP_NAME_SUFFIX
    )
    REGEX_ALCOHOL_UNITS = (
        make_simple_numeric_regex(
            quantity=ALCOHOL_DRINKING,
            units=regex_or(
                UK_ALCOHOL_UNITS_PER_DAY,
                UK_ALCOHOL_UNITS_PER_WEEK,
                UK_ALCOHOL_UNITS_PER_MONTH,  # perhaps unusual!
                UK_ALCOHOL_UNITS_PER_YEAR,  # perhaps unusual!
            ),
            units_optional=False,
        )
        + group_suffix
    )

    # -------------------------------------------------------------------------
    # Regex building for "no alcohol" statements
    # -------------------------------------------------------------------------

    ABSTINENT = r"\b abstin[ae]nt \b"  # "abstinent", or typo "abstinant"
    NONE = noncapture_group(
        WORD_BOUNDARY
        + noncapture_group(
            regex_or(
                "0",
                rf"{ABSTINENT} (?: \s+ from \b )?",
                NEVER,
                "no",
                "none",
                "zero",
            )
        )
        + WORD_BOUNDARY
    )
    TEETOTAL = noncapture_group(
        r"\b te[ea][-]?total(?:l?er)? \b",
    )
    DOES_NOT_DRINK = noncapture_group(
        regex_or(
            rf"\b {DOES_NOT} \s+ drink \b",
            rf"\b has \s+ {NEVER} \s+ drunk \b",
        )
    )
    OPT_TEMPORAL_AND_OR_DRINKING_BRK = (
        f"{OPT_TEMPORAL} {BRK} {OPT_DRINKING} {BRK} {OPT_TEMPORAL} {BRK}"
    )
    NO_ALCOHOL = rf"""
        {WORD_BOUNDARY}
        # "No alcohol" statements.
        # Temporal modifiers might be found in all sorts of places.
        (?:
            # 1. [DRINKING] ... ALC ... [DRINKING] ... NONE ...
            {OPT_TEMPORAL_AND_OR_DRINKING_BRK}
            {ALC} {BRK}
            {OPT_TEMPORAL_AND_OR_DRINKING_BRK}
            {NONE} {BRK}
            {OPT_TEMPORAL_AND_OR_DRINKING_BRK}
            |
            # 2. NONE ... ALC (e.g. "never alcohol")
            {OPT_TEMPORAL_AND_OR_DRINKING_BRK}
            {NONE} {BRK}
            {OPT_TEMPORAL_AND_OR_DRINKING_BRK}
            {ALC} {BRK}
            {OPT_TEMPORAL_AND_OR_DRINKING_BRK}
            |
            # 3. "has never drunk... alcohol", etc.
            {DOES_NOT_DRINK} {BRK} {ALC} {BRK}
            |
            # 4. "teetotal" with typos
            {TEETOTAL}
            # ... but not just "drinking... none" (could be water etc.)
        )
        {WORD_BOUNDARY}
    """

    # -------------------------------------------------------------------------
    # Other class variables
    # -------------------------------------------------------------------------

    NAME = "AlcoholUnits"
    PREFERRED_UNIT_COLUMN = "value_uk_units_per_week"
    # Multiplier converting each recognized unit to units per week.
    UNIT_MAPPING = {
        UK_ALCOHOL_UNITS_PER_WEEK: 1,  # preferred unit
        UK_ALCOHOL_UNITS_PER_DAY: DAYS_PER_WEEK,  # 1 unit/day -> 7 units/week
        UK_ALCOHOL_UNITS_PER_MONTH: 1 / WEEKS_PER_MONTH_APPROX,
        UK_ALCOHOL_UNITS_PER_YEAR: 1 / WEEKS_PER_YEAR_APPROX,
    }

    # -------------------------------------------------------------------------
    # Init
    # -------------------------------------------------------------------------

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        """
        Initialize the parent parser and compile this class's regexes.

        ``nlpdef``, ``cfg_processor_name`` and ``commit`` are passed straight
        through to the parent constructor; see the parent class for their
        meaning.
        """
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            variable=self.NAME,
            target_unit=self.PREFERRED_UNIT_COLUMN,
            regex_str_for_debugging=self.REGEX_ALCOHOL_UNITS,
            commit=commit,
        )
        self.compiled_regex_alcohol = compile_regex(self.REGEX_ALCOHOL_UNITS)
        self.units_to_factor = compile_regex_dict(self.UNIT_MAPPING)
        self.compiled_regex_no_alcohol = compile_regex(self.NO_ALCOHOL)

    # -------------------------------------------------------------------------
    # Parse
    # -------------------------------------------------------------------------

    def parse(
        self, text: str, debug: bool = False
    ) -> Generator[Tuple[str, Dict[str, Any]], None, None]:
        """
        Parse for two regexes which operate slightly differently.

        Yields nothing for empty text. Otherwise yields all "X units per
        <period>" results first, then all "no alcohol" results, each as a
        ``tablename, result_dict`` tuple.
        """
        if not text:
            return
        yield from self.parse_alcohol_units(text, debug)
        yield from self.parse_alcohol_none(text, debug)

    def parse_alcohol_units(
        self, text: str, debug: bool = False
    ) -> Generator[Tuple[str, Dict[str, Any]], None, None]:
        """
        We amend SimpleNumericalResultParser.parse() to deal with tense a bit
        better (e.g. "used to drink"). Comments from that version not repeated.
        That version also shortened a bit since we guarantee some aspects of
        the flags.

        Yields one ``tablename, result_dict`` tuple per match of the
        units-per-period regex, with the value converted to units per week.
        """
        for m in self.compiled_regex_alcohol.finditer(text):
            startpos = m.start()
            endpos = m.end()
            matching_text = m.group(GROUP_NUMBER_WHOLE_EXPRESSION)
            variable_text = m.group(GROUP_NAME_QUANTITY)
            tense_text = m.group(GROUP_NAME_TENSE)
            relation_text = m.group(GROUP_NAME_RELATION)
            value_text = m.group(GROUP_NAME_VALUE)
            units = m.group(GROUP_NAME_UNITS)
            # The suffix group is optional, so this is None when unmatched.
            suffix_text = m.group(self.GROUP_NAME_SUFFIX)

            value_in_target_units = None
            if units:
                matched_unit, multiple_or_fn = get_regex_dict_match(
                    units, self.units_to_factor
                )
                if not matched_unit:
                    continue
                # MODIFIED: no need to check callable(multiple_or_fn); always
                # no
                value_in_target_units = to_float(value_text) * multiple_or_fn
            # MODIFIED: no need to check self.assume_preferred_unit (we never
            # assume that here)

            # MODIFIED: no need to check self.take_absolute (always yes)
            if value_in_target_units is not None:
                value_in_target_units = abs(value_in_target_units)

            tense, relation = common_tense(tense_text, relation_text)

            # MODIFIED: Extra bit here to detect tense information in a
            # different place: first the "quantity" text (e.g. "used to
            # drink"), then the trailing temporal suffix (e.g. "previously").
            for temporal_info in (variable_text, suffix_text):
                if tense:
                    break
                tense = self._get_tense(temporal_info)
                if tense:
                    tense_text = temporal_info

            # Back to the previous code:
            result = {
                FN_VARIABLE_NAME: self.variable,
                FN_CONTENT: matching_text,
                FN_START: startpos,
                FN_END: endpos,
                FN_VARIABLE_TEXT: variable_text,
                FN_RELATION_TEXT: relation_text,
                FN_RELATION: relation,
                FN_VALUE_TEXT: value_text,
                FN_UNITS: units,
                self.target_unit: value_in_target_units,
                FN_TENSE_TEXT: tense_text,
                FN_TENSE: tense,
            }
            if debug:
                log.debug(f"Match {m} for {text!r} -> {result}")
            yield self.tablename, result

    def parse_alcohol_none(
        self, text: str, debug: bool = False
    ) -> Generator[Tuple[str, Dict[str, Any]], None, None]:
        """
        Deal with references to not drinking any alcohol (except those referred
        to as e.g. "0 units per week", which will be picked up by the
        units-per-week function -- that will be rare!).

        Yields one ``tablename, result_dict`` tuple per match, always with a
        value of zero; the whole matched text doubles as the variable text,
        the value text and (when a tense is found) the tense text.
        """
        for m in self.compiled_regex_no_alcohol.finditer(text):
            startpos = m.start()
            endpos = m.end()
            matching_text = m.group(GROUP_NUMBER_WHOLE_EXPRESSION)
            tense = self._get_tense(matching_text)
            tense_text = matching_text if tense else None

            result = {
                FN_VARIABLE_NAME: self.variable,
                FN_CONTENT: matching_text,
                FN_START: startpos,
                FN_END: endpos,
                FN_VARIABLE_TEXT: matching_text,
                FN_RELATION_TEXT: None,
                FN_RELATION: None,
                FN_VALUE_TEXT: matching_text,
                FN_UNITS: None,
                self.target_unit: 0,  # zero units
                FN_TENSE_TEXT: tense_text,
                FN_TENSE: tense,
            }
            if debug:
                log.debug(f"Match {m} for {text!r} -> {result}")
            yield self.tablename, result

    def _get_tense(self, text: Optional[str]) -> Optional[str]:
        """
        Find a tense indicator in ``text`` and return the corresponding tense
        value (``EVER``, ``PAST`` or ``PRESENT``), or ``None`` if there is no
        indicator.

        ``text`` may be ``None`` or empty (e.g. an optional regex group that
        did not participate in the match); that yields ``None``.
        """
        if not text:
            # Nothing to search; every tense pattern needs literal text.
            return None
        # We deal with "never" first because otherwise "never drank" may hit
        # "[optional_stuff] drank" and be classified as the past tense.
        _, tense = get_regex_dict_search(text, self.TENSE_NEVER_LOOKUP)
        if not tense:
            _, tense = get_regex_dict_search(
                text, self.TENSE_PAST_PRESENT_LOOKUP
            )
        return tense

    # -------------------------------------------------------------------------
    # Test
    # -------------------------------------------------------------------------

    def test(self, verbose: bool = False) -> None:
        # docstring in parent class

        # Test via e.g.:
        #   pytest -k SubstanceMisuseTests  # self-tests
        #   crate_run_crate_nlp_demo - --processors AlcoholUnits  # interactive

        # Expected results; each is a list of expected result dicts for a
        # single input text.
        no_results = []
        six_no_tense = [{self.target_unit: 6, FN_TENSE: None}]
        six_past = [{self.target_unit: 6, FN_TENSE: PAST}]
        six_present = [{self.target_unit: 6, FN_TENSE: PRESENT}]
        six_per_day_present = [
            {self.target_unit: 6 * DAYS_PER_WEEK, FN_TENSE: PRESENT}
        ]
        six_per_month_present = [
            {self.target_unit: 6 / WEEKS_PER_MONTH_APPROX, FN_TENSE: PRESENT}
        ]
        six_per_year_present = [
            {self.target_unit: 6 / WEEKS_PER_YEAR_APPROX, FN_TENSE: PRESENT}
        ]
        under_6_present = [
            {self.target_unit: 6, FN_RELATION: "<", FN_TENSE: PRESENT}
        ]
        over_200_present = [
            {self.target_unit: 200, FN_RELATION: ">", FN_TENSE: PRESENT}
        ]
        no_alcohol_no_tense = [{self.target_unit: 0, FN_TENSE: None}]
        no_alcohol_past = [{self.target_unit: 0, FN_TENSE: PAST}]
        no_alcohol_present = [{self.target_unit: 0, FN_TENSE: PRESENT}]
        no_alcohol_ever = [{self.target_unit: 0, FN_TENSE: EVER}]

        self.detailed_test_multiple(
            [
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                # No results expected:
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                ("Alcohol", no_results),
                ("He used to drink like a fish", no_results),
                ("[e.g. insulin] currently 6 units per week", no_results),
                ("[e.g. insulin] previously 6 units per week", no_results),
                ("[could be insulin] peak 6 u/w", no_results),
                ("[!] methylalcohol 6 u/w", no_results),
                ("[not starts with no] Alcohol: not explored", no_results),
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                # Value with no tense:
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                ("Alcohol 6 u/w", six_no_tense),
                ("Alcohol - 6 u/w", six_no_tense),
                ("EtOH = 6 u/w", six_no_tense),
                ("EtOH = 6 u/wk", six_no_tense),
                ("Alcohol (units/week): 6", six_no_tense),
                ("Ethanol 6 units/week", six_no_tense),
                ("[not international but] alcohol 6 IU/week", six_no_tense),
                ("alcohol 6 I.U./week", six_no_tense),
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                # Past tense:
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                ("Alcohol: was 6 u/w", six_past),  # other tenses fail (= good)
                ("Alcohol: formerly 6 u/w", six_past),
                ("Alcohol: previously 6 u/w", six_past),
                ("Alcohol: once 6 u/w", six_past),
                ("Alcohol: peak 6 u/w", six_past),
                ("Used to drink 6 u/w", six_past),
                ("Peak drinking 6 u/w", six_past),
                ("Peak alcohol consumption: 6 u/w", six_past),
                ("Drank 6 u/w", six_past),
                ("Formerly drank 6 u/w", six_past),
                ("Previously drank 6 u/w", six_past),
                # Perfect tense (regression tests for the "has \s+ drunk"
                # whitespace fix in DRINKING_PAST):
                ("Has drunk 6 u/w", six_past),
                ("Has previously drunk 6 u/w", six_past),
                ("Was drinking 6 u/w", six_past),
                ("Was previously drinking 6 u/w", six_past),
                ("Was formerly drinking 6 u/w", six_past),
                ("Alcohol: 6 u/w previously", six_past),
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                # Present tense:
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                ("Drinks 6 units per week", six_present),
                ("Drinks 6 alcohol units per week", six_present),
                ("Drinks 6 UK units per week", six_present),
                ("Drinks 6 UK alcohol units per week", six_present),
                ("[silly] Drinks 6 UK alcohol IU per week", six_present),
                ("Drinks 6 units/d", six_per_day_present),
                ("Drinks 6 units/dy", six_per_day_present),
                ("Drinks 6 units/day", six_per_day_present),
                ("Currently drinks 6 units per week", six_present),
                ("These days drinks 6 units per week", six_present),
                ("Now drinks 6 units per week", six_present),
                ("Nowadays drinks 6 units per week", six_present),
                ("Drinking 6 units per week", six_present),
                ("Currently drinking 6 units per week", six_present),
                ("Presently drinking 6 units per week", six_present),
                ("Alcohol: currently 6 u/w", six_present),
                ("Alcohol: presently 6 u/w", six_present),
                ("In terms of alcohol she drinks 6 units/week", six_present),
                ("Has been drinking 6 units per week", six_present),
                ("Drinks 6 units per month", six_per_month_present),
                ("Drinks 6 units per year", six_per_year_present),
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                # Inequalities:
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                ("Alcohol: presently less than 6 u/w", under_6_present),
                ("Alcohol: presently under 6 u/w", under_6_present),
                ("Alcohol: presently >200 u/w", over_200_present),
                ("Alcohol: currently more than 200 u/w", over_200_present),
                ("Alcohol: currently over 200 u/w", over_200_present),
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                # References to not drinking -- no tense:
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                ("Alcohol: none", no_alcohol_no_tense),
                ("Teetotal", no_alcohol_no_tense),
                ("Tee-total", no_alcohol_no_tense),  # typo
                ("Teetotaller", no_alcohol_no_tense),
                ("Teetotaler", no_alcohol_no_tense),  # typo
                ("Abstinent from alcohol", no_alcohol_no_tense),
                ("Alcohol: abstinent", no_alcohol_no_tense),
                ("Alcohol: abstinant", no_alcohol_no_tense),  # typo
                ("Alcohol: zero", no_alcohol_no_tense),
                ("Alcohol: 0", no_alcohol_no_tense),
                ("Alcohol: no", no_alcohol_no_tense),
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                # References to not drinking -- past tense:
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                ("Alcohol: was abstinent", no_alcohol_past),
                ("Alcohol: previously abstinent", no_alcohol_past),
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                # References to not drinking -- present tense:
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                ("Alcohol: has been abstinent", no_alcohol_present),
                ("Alcohol: currently abstinent", no_alcohol_present),
                ("Alcohol: currently none", no_alcohol_present),
                ("Drinks no alcohol", no_alcohol_present),
                ("Drinks zero alcohol", no_alcohol_present),
                ("Does not drink alcohol", no_alcohol_present),
                ("Doesn't drink alcohol", no_alcohol_present),
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                # References to not drinking -- ever:
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                ("Has never drunk alcohol", no_alcohol_ever),
                ("Never drank alcohol", no_alcohol_ever),
                ("Alcohol: never", no_alcohol_ever),
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                # Vague references to not drinking, not interpreted:
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                ("Has not drunk alcohol", no_results),
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                # Potential teetotal statements, but very tricky to be sure:
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                ("Doesn't drink [coffee]", no_results),
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                # Distractors:
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                ("Lemonade, which he does not drink.", no_results),
            ],
            verbose=verbose,
        )
class AlcoholUnitsValidator(ValidatorBase):
    """
    Validator counterpart of :class:`AlcoholUnits`: supplies that parser's
    variable name together with one deliberately broad regex string.
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        """
        Return ``(variable name, [regex string])`` for the validator.

        The single regex is an alternation of: the alcohol term, any word
        starting drink/drank/drunk, "abstinent" (or its typo), and the
        teetotal variants.
        """
        # Verbose-regex fragment with no trailing word boundary, so that
        # drink/drank/drunk plus any ending will match.
        drink_stem = r"\b dr[iau]nk "
        # We're very broad here:
        broad_regex = regex_or(
            ALCOHOL,
            drink_stem,
            AlcoholUnits.ABSTINENT,
            AlcoholUnits.TEETOTAL,
        )
        return AlcoholUnits.NAME, [broad_regex]
# ============================================================================= # All classes in this module # ============================================================================= ALL_SUBSTANCE_MISUSE_NLP_AND_VALIDATORS = [ (AlcoholUnits, AlcoholUnitsValidator), ]