# Source code for crate_anon.nlp_manager.parse_substance_misuse

"""
crate_anon/nlp_manager/parse_substance_misuse.py

===============================================================================

    Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
    Created by Rudolf Cardinal (rnc1001@cam.ac.uk).

    This file is part of CRATE.

    CRATE is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    CRATE is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with CRATE. If not, see <https://www.gnu.org/licenses/>.

===============================================================================

**Python regex-based NLP processors for substance misuse.**

"""

import logging
from typing import Any, Dict, Generator, List, Optional, Tuple

from crate_anon.common.regex_helpers import (
    at_wb_start_end,
    noncapture_group,
    optional_named_capture_group,
    optional_noncapture_group,
    regex_or,
    WORD_BOUNDARY,
)
from crate_anon.nlp_manager.nlp_definition import NlpDefinition
from crate_anon.nlp_manager.number import to_float
from crate_anon.nlp_manager.regex_func import (
    compile_regex,
    compile_regex_dict,
    get_regex_dict_match,
    get_regex_dict_search,
)
from crate_anon.nlp_manager.regex_parser import (
    common_tense,
    EVER,
    FN_CONTENT,
    FN_END,
    FN_RELATION,
    FN_RELATION_TEXT,
    FN_START,
    FN_TENSE,
    FN_TENSE_TEXT,
    FN_UNITS,
    FN_VALUE_TEXT,
    FN_VARIABLE_NAME,
    FN_VARIABLE_TEXT,
    GROUP_NAME_QUANTITY,
    GROUP_NAME_RELATION,
    GROUP_NAME_TENSE,
    GROUP_NAME_UNITS,
    GROUP_NAME_VALUE,
    GROUP_NUMBER_WHOLE_EXPRESSION,
    make_simple_numeric_regex,
    NumericalResultParser,
    PAST,
    PRESENT,
    ValidatorBase,
)
from crate_anon.nlp_manager.regex_units import (
    ALCOHOL,
    DAYS_PER_WEEK,
    UK_ALCOHOL_UNITS_PER_DAY,
    UK_ALCOHOL_UNITS_PER_MONTH,
    UK_ALCOHOL_UNITS_PER_WEEK,
    UK_ALCOHOL_UNITS_PER_YEAR,
    WEEKS_PER_MONTH_APPROX,
    WEEKS_PER_YEAR_APPROX,
)

# Module-level logger, named after this module.
log = logging.getLogger(__name__)


# =============================================================================
# Alcohol
# =============================================================================


class AlcoholUnits(NumericalResultParser):
    """
    SUBSTANCE MISUSE.

    Alcohol consumption, specified explicitly as (UK) units per day or per
    week, or via non-numeric references to not drinking any.

    - Output is in UK units per week. A UK unit is 10 ml of ethanol [#f1]_ [#f2]_.
      UK NHS guidelines used to be "per week" and remain broadly week-based [#f1]_.
    - It doesn't attempt any understanding of other alcohol descriptions (e.g.
      "pints of beer", "glasses of wine", "bottles of vodka") so is expected to
      apply where a clinician has converted a (potentially mixed) alcohol
      description to a units-per-week calculation.

    .. [#f1] https://www.nhs.uk/live-well/alcohol-advice/calculating-alcohol-units/,
           accessed 2023-01-18.
    .. [#f2] https://en.wikipedia.org/wiki/Unit_of_alcohol
    """  # noqa: E501

    # There are no relevant Read codes for alcohol consumption in
    # v3ReadCode_PBCL.xlsx.

    # -------------------------------------------------------------------------
    # Regex building for tense-related statements
    # -------------------------------------------------------------------------

    # All these are verbose regexes, so literal spaces in the patterns are
    # ignored: don't omit \s+ where whitespace must actually be matched!
    PAST_ADVERBS = (
        "formerly",
        "once",
        "peak",
        "previously",
        "was",
    )
    PAST_ADVERBS_RE = noncapture_group(regex_or(*PAST_ADVERBS))
    DOES_NOT = r"does\s*n[o'’]t"  # does not, doesn't
    PRESENT_ADVERBS = (
        r"at \s+ present",
        r"currently",
        r"has \s+ been",
        r"now",
        r"nowadays",
        r"presently",
        r"these \s+ days",
        DOES_NOT,
    )
    PRESENT_ADVERBS_RE = noncapture_group(regex_or(*PRESENT_ADVERBS))
    # The same adverbs, each wrapped by at_wb_start_end(), for use as
    # free-standing temporal markers around the drinking/alcohol phrases.
    TEMPORAL_WORDS = tuple(
        at_wb_start_end(x) for x in PAST_ADVERBS + PRESENT_ADVERBS
    )
    TEMPORAL = noncapture_group(regex_or(*TEMPORAL_WORDS))
    OPT_TEMPORAL = optional_noncapture_group(regex_or(*TEMPORAL_WORDS))

    NEVER = "never"
    # "Never" is both temporal and negating and thus fiddly. We do *not*
    # include it in standard temporal words, or a statement about "has never
    # drunk >100 u/w" would be misinterpreted as positive.

    # -------------------------------------------------------------------------
    # Regex building for drinking alcohol (and when)
    # -------------------------------------------------------------------------

    DRINKING_PAST = (
        # Past infinitive: she used to drink
        r"\b used \s+ to \s+ drink \b",
        # Imperfect tense: she [adverb] drank
        rf"\b (?: {PAST_ADVERBS_RE} \s+ )? drank \b",
        # Perfect tense: has [adverb] drunk.
        # The \s+ after "has" is required: in a verbose regex the literal
        # space is ignored, so without it only "hasdrunk" would match.
        rf"\b has \s+ (?: {PAST_ADVERBS_RE} \s+ )? drunk \b",
        # Past continuous tense: he was [adverb] drinking
        # Also abbreviated past continuous tense: previously drinking
        rf"\b {PAST_ADVERBS_RE} \s+ drinking \b",
    )
    # We don't allow the adverbs by themselves, to avoid something that isn't
    # explicitly about alcohol or drinking, e.g. "[insulin] currently 6
    # units/day".
    DRINKING_PRESENT = (
        # Present tense: he [adverb] drinks
        rf"\b (?: {PRESENT_ADVERBS_RE} \s+)? drinks \b",
        # Present continuous tense: he is [adverb] drinking
        rf"\b (?: is \s+)? (?: {PRESENT_ADVERBS_RE} \s+)? drinking \b",
    )
    DRINKING_PAST_PRESENT = DRINKING_PAST + DRINKING_PRESENT
    DRINKING = noncapture_group(regex_or(*DRINKING_PAST_PRESENT))
    OPT_DRINKING = optional_noncapture_group(regex_or(*DRINKING_PAST_PRESENT))
    # "alcohol", optionally followed by "consumption".
    ALCOHOL_PM_CONSUMPTION = rf"{ALCOHOL} (?: \s+ consumption \b)?"
    ALC = noncapture_group(ALCOHOL_PM_CONSUMPTION)
    OPT_ALC = optional_noncapture_group(ALCOHOL_PM_CONSUMPTION)

    # BRK: requires some sort of wordbreak or whitespace, but also disposes of
    # junk like some punctuation (e.g. "previously: none" versus "previously
    # none") and words like "at" (e.g. in "drinking at X units/week").
    BRK = noncapture_group(
        regex_or(
            r"\s* : \s*",  # colon +/- whitespace
            r"\s* \b at \b \s*",  # "at" +/- whitespace
            r"\s+",  # whitespace
            WORD_BOUNDARY,  # other word break
        )
    )

    # Move from more to less specific, or the less specific will capture first.
    ALCOHOL_DRINKING = rf"""
        {WORD_BOUNDARY}
            # Alcohol drinking:
            (?:
                    # 1. ... DRINKING ... [ALC] ...
                    {OPT_TEMPORAL} {BRK}
                    {DRINKING} {BRK}
                    {OPT_TEMPORAL} {BRK}
                    {OPT_ALC} {BRK}
                    {OPT_TEMPORAL}
                |
                    # 2. ... ALC ... [DRINKING] ...
                    {OPT_TEMPORAL} {BRK}
                    {ALC} {BRK}
                    {OPT_TEMPORAL} {BRK}
                    {OPT_DRINKING} {BRK}
                    {OPT_TEMPORAL}
            )
        {WORD_BOUNDARY}
    """

    # Map each tense-bearing regex fragment to its tense constant. Past
    # entries are inserted before present ones, preserving the original
    # insertion order of the lookup.
    _drinking_tense_dict = {
        **dict.fromkeys(DRINKING_PAST + PAST_ADVERBS, PAST),
        **dict.fromkeys(DRINKING_PRESENT + PRESENT_ADVERBS, PRESENT),
    }  # type: Dict[str, str]
    TENSE_PAST_PRESENT_LOOKUP = compile_regex_dict(_drinking_tense_dict)
    TENSE_NEVER_LOOKUP = compile_regex_dict({NEVER: EVER})

    # -------------------------------------------------------------------------
    # Regex building for "drinking alcohol at X units per week"
    # -------------------------------------------------------------------------

    # A temporal suffix allows e.g. "drinking X units/week previously".
    GROUP_NAME_SUFFIX = "suffix"
    group_suffix = r"\b \s*" + optional_named_capture_group(
        TEMPORAL, GROUP_NAME_SUFFIX
    )
    REGEX_ALCOHOL_UNITS = (
        make_simple_numeric_regex(
            quantity=ALCOHOL_DRINKING,
            units=regex_or(
                UK_ALCOHOL_UNITS_PER_DAY,
                UK_ALCOHOL_UNITS_PER_WEEK,
                UK_ALCOHOL_UNITS_PER_MONTH,  # perhaps unusual!
                UK_ALCOHOL_UNITS_PER_YEAR,  # perhaps unusual!
            ),
            units_optional=False,
        )
        + group_suffix
    )

    # -------------------------------------------------------------------------
    # Regex building for "no alcohol" statements
    # -------------------------------------------------------------------------

    ABSTINENT = r"\b abstin[ae]nt \b"  # "abstinent", or typo "abstinant"
    NONE = noncapture_group(
        WORD_BOUNDARY
        + noncapture_group(
            regex_or(
                "0",
                rf"{ABSTINENT} (?: \s+ from \b )?",
                NEVER,
                "no",
                "none",
                "zero",
            )
        )
        + WORD_BOUNDARY
    )
    TEETOTAL = noncapture_group(
        r"\b te[ea][-]?total(?:l?er)? \b",
    )
    DOES_NOT_DRINK = noncapture_group(
        regex_or(
            rf"\b {DOES_NOT} \s+ drink \b",
            rf"\b has \s+ {NEVER} \s+ drunk \b",
        )
    )
    # Optional temporal word, optional drinking phrase, optional temporal
    # word, each followed by a break.
    OPT_TEMPORAL_AND_OR_DRINKING_BRK = (
        f"{OPT_TEMPORAL} {BRK} {OPT_DRINKING} {BRK} {OPT_TEMPORAL} {BRK}"
    )
    NO_ALCOHOL = rf"""
        {WORD_BOUNDARY}
            # "No alcohol" statements.
            # Temporal modifiers might be found in all sorts of places.
            (?:
                    # 1. [DRINKING] ... ALC ... [DRINKING] ... NONE ...
                    {OPT_TEMPORAL_AND_OR_DRINKING_BRK}
                    {ALC} {BRK}
                    {OPT_TEMPORAL_AND_OR_DRINKING_BRK}
                    {NONE} {BRK}
                    {OPT_TEMPORAL_AND_OR_DRINKING_BRK}
                |
                    # 2. NONE ... ALC (e.g. "never alcohol")
                    {OPT_TEMPORAL_AND_OR_DRINKING_BRK}
                    {NONE} {BRK}
                    {OPT_TEMPORAL_AND_OR_DRINKING_BRK}
                    {ALC} {BRK}
                    {OPT_TEMPORAL_AND_OR_DRINKING_BRK}
                |
                    # 3. "has never drunk... alcohol", etc.
                    {DOES_NOT_DRINK} {BRK} {ALC} {BRK}
                |
                    # 4. "teetotal" with typos
                    {TEETOTAL}
                # ... but not just "drinking... none" (could be water etc.)
            )
        {WORD_BOUNDARY}
    """

    # -------------------------------------------------------------------------
    # Other class variables
    # -------------------------------------------------------------------------

    NAME = "AlcoholUnits"
    PREFERRED_UNIT_COLUMN = "value_uk_units_per_week"
    # Multipliers that convert a value in the matched unit to units per week.
    UNIT_MAPPING = {
        UK_ALCOHOL_UNITS_PER_WEEK: 1,  # preferred unit
        UK_ALCOHOL_UNITS_PER_DAY: DAYS_PER_WEEK,  # 1 unit/day -> 7 units/week
        UK_ALCOHOL_UNITS_PER_MONTH: 1 / WEEKS_PER_MONTH_APPROX,
        UK_ALCOHOL_UNITS_PER_YEAR: 1 / WEEKS_PER_YEAR_APPROX,
    }

    # -------------------------------------------------------------------------
    # Init
    # -------------------------------------------------------------------------

[docs]    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
    ) -> None:
        # see documentation above
        super().__init__(
            nlpdef=nlpdef,
            cfg_processor_name=cfg_processor_name,
            variable=self.NAME,
            target_unit=self.PREFERRED_UNIT_COLUMN,
            regex_str_for_debugging=self.REGEX_ALCOHOL_UNITS,
            commit=commit,
        )
        self.compiled_regex_alcohol = compile_regex(self.REGEX_ALCOHOL_UNITS)
        self.units_to_factor = compile_regex_dict(self.UNIT_MAPPING)
        self.compiled_regex_no_alcohol = compile_regex(self.NO_ALCOHOL)

    # -------------------------------------------------------------------------
    # Parse
    # -------------------------------------------------------------------------

[docs]    def parse(
        self, text: str, debug: bool = False
    ) -> Generator[Tuple[str, Dict[str, Any]], None, None]:
        """
        Parse for two regexes which operate slightly differently.
        """
        if not text:
            return
        yield from self.parse_alcohol_units(text, debug)
        yield from self.parse_alcohol_none(text, debug)

[docs]    def parse_alcohol_units(
        self, text: str, debug: bool = False
    ) -> Generator[Tuple[str, Dict[str, Any]], None, None]:
        """
        We amend SimpleNumericalResultParser.parse() to deal with tense a bit
        better (e.g. "used to drink"). Comments from that version not repeated.
        That version also shortened a bit since we guarantee some aspects of
        the flags.
        """
        for m in self.compiled_regex_alcohol.finditer(text):
            startpos = m.start()
            endpos = m.end()
            matching_text = m.group(GROUP_NUMBER_WHOLE_EXPRESSION)
            variable_text = m.group(GROUP_NAME_QUANTITY)
            tense_text = m.group(GROUP_NAME_TENSE)
            relation_text = m.group(GROUP_NAME_RELATION)
            value_text = m.group(GROUP_NAME_VALUE)
            units = m.group(GROUP_NAME_UNITS)
            suffix_text = m.group(self.GROUP_NAME_SUFFIX)

            value_in_target_units = None
            if units:
                matched_unit, multiple_or_fn = get_regex_dict_match(
                    units, self.units_to_factor
                )
                if not matched_unit:
                    continue
                # MODIFIED: no need to check callable(multiple_or_fn); always
                # no
                value_in_target_units = to_float(value_text) * multiple_or_fn
            # MODIFIED: no need to check self.assume_preferred_unit (we never
            # assume that here)

            # MODIFIED: no need to check self.take_absolute (always yes)
            if value_in_target_units is not None:
                value_in_target_units = abs(value_in_target_units)

            tense, relation = common_tense(tense_text, relation_text)

            # MODIFIED: Extra bit here to detect tense information in a
            # different place:
            for temporal_info in (variable_text, suffix_text):
                if tense:
                    break
                tense = self._get_tense(temporal_info)
                if tense:
                    tense_text = temporal_info

            # Back to the previous code:
            result = {
                FN_VARIABLE_NAME: self.variable,
                FN_CONTENT: matching_text,
                FN_START: startpos,
                FN_END: endpos,
                FN_VARIABLE_TEXT: variable_text,
                FN_RELATION_TEXT: relation_text,
                FN_RELATION: relation,
                FN_VALUE_TEXT: value_text,
                FN_UNITS: units,
                self.target_unit: value_in_target_units,
                FN_TENSE_TEXT: tense_text,
                FN_TENSE: tense,
            }
            if debug:
                log.debug(f"Match {m} for {text!r} -> {result}")
            yield self.tablename, result

[docs]    def parse_alcohol_none(
        self, text: str, debug: bool = False
    ) -> Generator[Tuple[str, Dict[str, Any]], None, None]:
        """
        Deal with references to not drinking any alcohol (except those referred
        to as e.g. "0 units per week", which will be picked up by the
        units-per-week function -- that will be rare!).
        """
        for m in self.compiled_regex_no_alcohol.finditer(text):
            startpos = m.start()
            endpos = m.end()
            matching_text = m.group(GROUP_NUMBER_WHOLE_EXPRESSION)
            tense = self._get_tense(matching_text)
            tense_text = matching_text if tense else None

            result = {
                FN_VARIABLE_NAME: self.variable,
                FN_CONTENT: matching_text,
                FN_START: startpos,
                FN_END: endpos,
                FN_VARIABLE_TEXT: matching_text,
                FN_RELATION_TEXT: None,
                FN_RELATION: None,
                FN_VALUE_TEXT: matching_text,
                FN_UNITS: None,
                self.target_unit: 0,  # zero units
                FN_TENSE_TEXT: tense_text,
                FN_TENSE: tense,
            }
            if debug:
                log.debug(f"Match {m} for {text!r} -> {result}")
            yield self.tablename, result

    def _get_tense(self, text: str) -> Optional[str]:
        """
        Find a tense indicator and return the corresponding text, or None.
        """
        # We deal with "never" first because otherwise "never drank" may hit
        # "[optional_stuff] drank" and be classified as the past tense.
        _, tense = get_regex_dict_search(text, self.TENSE_NEVER_LOOKUP)
        if not tense:
            _, tense = get_regex_dict_search(
                text, self.TENSE_PAST_PRESENT_LOOKUP
            )
        return tense

    # -------------------------------------------------------------------------
    # Test
    # -------------------------------------------------------------------------

[docs]    def test(self, verbose: bool = False) -> None:
        # docstring in parent class
        # Test via e.g.:
        #   pytest -k SubstanceMisuseTests  # self-tests
        #   crate_run_crate_nlp_demo - --processors AlcoholUnits  # interactive
        no_results = []
        six_no_tense = [{self.target_unit: 6, FN_TENSE: None}]
        six_past = [{self.target_unit: 6, FN_TENSE: PAST}]
        six_present = [{self.target_unit: 6, FN_TENSE: PRESENT}]
        six_per_day_present = [
            {self.target_unit: 6 * DAYS_PER_WEEK, FN_TENSE: PRESENT}
        ]
        six_per_month_present = [
            {self.target_unit: 6 / WEEKS_PER_MONTH_APPROX, FN_TENSE: PRESENT}
        ]
        six_per_year_present = [
            {self.target_unit: 6 / WEEKS_PER_YEAR_APPROX, FN_TENSE: PRESENT}
        ]
        under_6_present = [
            {self.target_unit: 6, FN_RELATION: "<", FN_TENSE: PRESENT}
        ]
        over_200_present = [
            {self.target_unit: 200, FN_RELATION: ">", FN_TENSE: PRESENT}
        ]
        no_alcohol_no_tense = [{self.target_unit: 0, FN_TENSE: None}]
        no_alcohol_past = [{self.target_unit: 0, FN_TENSE: PAST}]
        no_alcohol_present = [{self.target_unit: 0, FN_TENSE: PRESENT}]
        no_alcohol_ever = [{self.target_unit: 0, FN_TENSE: EVER}]
        self.detailed_test_multiple(
            [
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                # No results expected:
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                ("Alcohol", no_results),
                ("He used to drink like a fish", no_results),
                ("[e.g. insulin] currently 6 units per week", no_results),
                ("[e.g. insulin] previously 6 units per week", no_results),
                ("[could be insulin] peak 6 u/w", no_results),
                ("[!] methylalcohol 6 u/w", no_results),
                ("[not starts with no] Alcohol: not explored", no_results),
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                # Value with no tense:
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                ("Alcohol 6 u/w", six_no_tense),
                ("Alcohol - 6 u/w", six_no_tense),
                ("EtOH = 6 u/w", six_no_tense),
                ("EtOH = 6 u/wk", six_no_tense),
                ("Alcohol (units/week): 6", six_no_tense),
                ("Ethanol 6 units/week", six_no_tense),
                ("[not international but] alcohol 6 IU/week", six_no_tense),
                ("alcohol 6 I.U./week", six_no_tense),
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                # Past tense:
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                ("Alcohol: was 6 u/w", six_past),  # other tenses fail (= good)
                ("Alcohol: formerly 6 u/w", six_past),
                ("Alcohol: previously 6 u/w", six_past),
                ("Alcohol: once 6 u/w", six_past),
                ("Alcohol: peak 6 u/w", six_past),
                ("Used to drink 6 u/w", six_past),
                ("Peak drinking 6 u/w", six_past),
                ("Peak alcohol consumption: 6 u/w", six_past),
                ("Drank 6 u/w", six_past),
                ("Formerly drank 6 u/w", six_past),
                ("Previously drank 6 u/w", six_past),
                ("Was drinking 6 u/w", six_past),
                ("Was previously drinking 6 u/w", six_past),
                ("Was formerly drinking 6 u/w", six_past),
                ("Alcohol: formerly 6 u/w", six_past),
                ("Alcohol: previously 6 u/w", six_past),
                ("Alcohol: 6 u/w previously", six_past),
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                # Present tense:
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                ("Drinks 6 units per week", six_present),
                ("Drinks 6 alcohol units per week", six_present),
                ("Drinks 6 UK units per week", six_present),
                ("Drinks 6 UK alcohol units per week", six_present),
                ("[silly] Drinks 6 UK alcohol IU per week", six_present),
                ("Drinks 6 units/d", six_per_day_present),
                ("Drinks 6 units/dy", six_per_day_present),
                ("Drinks 6 units/day", six_per_day_present),
                ("Currently drinks 6 units per week", six_present),
                ("These days drinks 6 units per week", six_present),
                ("Now drinks 6 units per week", six_present),
                ("Nowadays drinks 6 units per week", six_present),
                ("Drinking 6 units per week", six_present),
                ("Currently drinking 6 units per week", six_present),
                ("Presently drinking 6 units per week", six_present),
                ("Alcohol: currently 6 u/w", six_present),
                ("Alcohol: presently 6 u/w", six_present),
                ("In terms of alcohol she drinks 6 units/week", six_present),
                ("Has been drinking 6 units per week", six_present),
                ("Drinks 6 units per month", six_per_month_present),
                ("Drinks 6 units per year", six_per_year_present),
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                # Inequalities:
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                ("Alcohol: presently less than 6 u/w", under_6_present),
                ("Alcohol: presently under 6 u/w", under_6_present),
                ("Alcohol: presently >200 u/w", over_200_present),
                ("Alcohol: currently more than 200 u/w", over_200_present),
                ("Alcohol: currently over 200 u/w", over_200_present),
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                # References to not drinking -- no tense:
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                ("Alcohol: none", no_alcohol_no_tense),
                ("Teetotal", no_alcohol_no_tense),
                ("Tee-total", no_alcohol_no_tense),  # typo
                ("Teetotaller", no_alcohol_no_tense),
                ("Teetotaler", no_alcohol_no_tense),  # typo
                ("Abstinent from alcohol", no_alcohol_no_tense),
                ("Alcohol: abstinent", no_alcohol_no_tense),
                ("Alcohol: abstinant", no_alcohol_no_tense),  # typo
                ("Alcohol: zero", no_alcohol_no_tense),
                ("Alcohol: 0", no_alcohol_no_tense),
                ("Alcohol: no", no_alcohol_no_tense),
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                # References to not drinking -- past tense:
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                ("Alcohol: was abstinent", no_alcohol_past),
                ("Alcohol: previously abstinent", no_alcohol_past),
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                # References to not drinking -- present tense:
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                ("Alcohol: has been abstinent", no_alcohol_present),
                ("Alcohol: currently abstinent", no_alcohol_present),
                ("Alcohol: currently none", no_alcohol_present),
                ("Drinks no alcohol", no_alcohol_present),
                ("Drinks zero alcohol", no_alcohol_present),
                ("Does not drink alcohol", no_alcohol_present),
                ("Doesn't drink alcohol", no_alcohol_present),
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                # References to not drinking -- ever:
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                ("Has never drunk alcohol", no_alcohol_ever),
                ("Never drank alcohol", no_alcohol_ever),
                ("Alcohol: never", no_alcohol_ever),
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                # Vague references to not drinking, not interpreted:
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                ("Has not drunk alcohol", no_results),
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                # Potential teetotal statements, but very tricky to be sure:
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                ("Doesn't drink [coffee]", no_results),
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                # Distractors:
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                ("Lemonade, which he does not drink.", no_results),
            ],
            verbose=verbose,
        )


class AlcoholUnitsValidator(ValidatorBase):
    """
    Validator for AlcoholUnits (see help for explanation).
    """

    @classmethod
    def get_variablename_regexstrlist(cls) -> Tuple[str, List[str]]:
        """
        Return the variable name of :class:`AlcoholUnits` and a one-element
        list containing a deliberately broad regex string.
        """
        # We're very broad here:
        broad_alternatives = (
            ALCOHOL,
            r"\b dr[iau]nk ",  # drink/drank/drunk plus any ending
            AlcoholUnits.ABSTINENT,
            AlcoholUnits.TEETOTAL,
        )
        return AlcoholUnits.NAME, [regex_or(*broad_alternatives)]


# =============================================================================
# All classes in this module
# =============================================================================

# Each entry pairs a parser class with its validator class.
ALL_SUBSTANCE_MISUSE_NLP_AND_VALIDATORS = [
    (AlcoholUnits, AlcoholUnitsValidator),
]