# Source code for crate_anon.nlp_manager.nlp_definition

#!/usr/bin/env python

"""
crate_anon/nlp_manager/nlp_definition.py

===============================================================================

    Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
    Created by Rudolf Cardinal (rnc1001@cam.ac.uk).

    This file is part of CRATE.

    CRATE is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    CRATE is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with CRATE. If not, see <https://www.gnu.org/licenses/>.

===============================================================================

**NLP definition class.**

"""

# =============================================================================
# Imports
# =============================================================================

import datetime
import json
import logging
import os
import sys
from typing import (
    Any,
    Dict,
    Iterable,
    List,
    Optional,
    Tuple,
    Type,
    TYPE_CHECKING,
)

from cardinal_pythonlib.datetimefunc import get_now_utc_notz_datetime
from cardinal_pythonlib.docker import running_under_docker
from cardinal_pythonlib.lists import chunks
from sqlalchemy.engine.base import Engine
from sqlalchemy.orm.session import Session
from sqlalchemy.schema import MetaData

from crate_anon.anonymise.constants import AnonymiseConfigDefaults
from crate_anon.anonymise.dbholder import DatabaseHolder
from crate_anon.common.constants import EnvVar
from crate_anon.common.extendedconfigparser import (
    ConfigSection,
    ExtendedConfigParser,
)
from crate_anon.common.sql import TransactionSizeLimiter
from crate_anon.nlp_manager.cloud_config import CloudConfig
from crate_anon.nlp_manager.constants import (
    CloudNlpConfigKeys,
    DatabaseConfigKeys,
    DEFAULT_CLOUD_LIMIT_BEFORE_COMMIT,
    DEFAULT_CLOUD_MAX_CONTENT_LENGTH,
    DEFAULT_CLOUD_MAX_RECORDS_PER_REQUEST,
    DEFAULT_CLOUD_MAX_TRIES,
    DEFAULT_CLOUD_RATE_LIMIT_HZ,
    DEFAULT_CLOUD_WAIT_ON_CONN_ERR_S,
    DEFAULT_TEMPORARY_TABLENAME,
    full_sectionname,
    GATE_PIPELINE_CLASSNAME,
    NlpOutputConfigKeys,
    HashClass,
    InputFieldConfigKeys,
    MAX_SQL_FIELD_LEN,
    NLP_CONFIG_ENV_VAR,
    NlpConfigPrefixes,
    NlpDefConfigKeys,
    ProcessorConfigKeys,
    NlpDefValues,
)
from crate_anon.nlprp.constants import NlprpKeys
from crate_anon.version import CRATE_VERSION, CRATE_VERSION_DATE

if TYPE_CHECKING:
    from crate_anon.nlp_manager.base_nlp_parser import (
        BaseNlpParser,
        TableMaker,
    )
    from crate_anon.nlp_manager.input_field_config import InputFieldConfig

log = logging.getLogger(__name__)


# =============================================================================
# Demo config
# =============================================================================


# NOTE(review): the text of this function was flattened by documentation/HTML
# extraction ("[docs]" is a Sphinx link artifact, and original line breaks --
# including those INSIDE the returned, line-oriented config template string --
# are not recoverable here). The code is therefore kept exactly as captured;
# do not reflow it without consulting the real source file.
# demo_nlp_config() builds and returns a specimen crate_nlp config file:
# it generates processor/validator definition pairs from the parse_* module
# registries (_make_procdef_pair / _make_module_procdef_block /
# _make_proclist), then emits NLP definitions for GATE demos, KConnect,
# KCL pharmacotherapy/LBDA, MedEx and cloud NLP, plus environment,
# input-field, database and cloud sections, choosing default paths according
# to running_under_docker() and the GENERATING_CRATE_DOCS environment flag.
[docs]def demo_nlp_config() -> str: """ Returns a demo NLP config file for CRATE. """ # ------------------------------------------------------------------------- # Imports # ------------------------------------------------------------------------- from crate_anon.nlp_manager.parse_biochemistry import ( ALL_BIOCHEMISTRY_NLP_AND_VALIDATORS, ) # delayed import from crate_anon.nlp_manager.parse_clinical import ( ALL_CLINICAL_NLP_AND_VALIDATORS, ) # delayed import from crate_anon.nlp_manager.parse_cognitive import ( ALL_COGNITIVE_NLP_AND_VALIDATORS, ) # delayed import from crate_anon.nlp_manager.parse_haematology import ( ALL_HAEMATOLOGY_NLP_AND_VALIDATORS, ) # delayed import from crate_anon.nlp_manager.parse_substance_misuse import ( ALL_SUBSTANCE_MISUSE_NLP_AND_VALIDATORS, ) # delayed import # ------------------------------------------------------------------------- # Helper functions # ------------------------------------------------------------------------- def _make_procdef_pair(name: str) -> str: return f"""[{NlpConfigPrefixes.PROCESSOR}:procdef_{name}] {ProcessorConfigKeys.DESTDB} = {destdb} {ProcessorConfigKeys.DESTTABLE} = {name} [{NlpConfigPrefixes.PROCESSOR}:procdef_validate_{name}] {ProcessorConfigKeys.DESTDB} = {destdb} {ProcessorConfigKeys.DESTTABLE} = validate_{name}""" def _make_module_procdef_block( nlp_and_validators: List[ Tuple[Type["BaseNlpParser"], Type["BaseNlpParser"]] ] ) -> str: _procdeflist = [] # type: List[str] for nlpclass, validatorclass in nlp_and_validators: _procdeflist.append( _make_procdef_pair(nlpclass.classname().lower()) ) return "\n\n".join(_procdeflist) def _make_proclist( nlp_and_validators: List[ Tuple[Type["BaseNlpParser"], Type["BaseNlpParser"]] ] ) -> str: _proclist = [] # type: List[str] for nlpclass, validatorclass in nlp_and_validators: _name = nlpclass.classname().lower() _proclist.append( f" {nlpclass.classname()} procdef_{_name}\n" f" {validatorclass.classname()} procdef_validate_{_name}" ) return "\n".join(_proclist) 
# -------------------------------------------------------------------------- # Quasi-constants # -------------------------------------------------------------------------- for_docker = running_under_docker() destdb = "DESTINATION_DATABASE" hashphrase = "doesnotmatter" if_clin_docs = "INPUT_FIELD_CLINICAL_DOCUMENTS" if_prog_notes = "INPUT_FIELD_PROGRESS_NOTES" inputfields = f"{if_clin_docs}\n" f" {if_prog_notes}" truncate_text_at = "32766" my_env = "MY_ENV_SECTION" my_src_db = "SOURCE_DATABASE" my_cloud = "my_uk_cloud_service" ridfield = "RID_FIELD" tridfield = "TRID_FIELD" nlp_input_terminator = "END_OF_TEXT_FOR_NLP" nlp_output_terminator = "END_OF_NLP_OUTPUT_RECORD" procdefs_biochemistry = _make_module_procdef_block( ALL_BIOCHEMISTRY_NLP_AND_VALIDATORS ) procdefs_clinical = _make_module_procdef_block( ALL_CLINICAL_NLP_AND_VALIDATORS ) procdefs_cognitive = _make_module_procdef_block( ALL_COGNITIVE_NLP_AND_VALIDATORS ) procdefs_haematology = _make_module_procdef_block( ALL_HAEMATOLOGY_NLP_AND_VALIDATORS ) procdefs_substance_misuse = _make_module_procdef_block( ALL_SUBSTANCE_MISUSE_NLP_AND_VALIDATORS ) proclist_biochemistry = _make_proclist(ALL_BIOCHEMISTRY_NLP_AND_VALIDATORS) proclist_clinical = _make_proclist(ALL_CLINICAL_NLP_AND_VALIDATORS) proclist_cognitive = _make_proclist(ALL_COGNITIVE_NLP_AND_VALIDATORS) proclist_haematology = _make_proclist(ALL_HAEMATOLOGY_NLP_AND_VALIDATORS) proclist_substance_misuse = _make_proclist( ALL_SUBSTANCE_MISUSE_NLP_AND_VALIDATORS ) if EnvVar.GENERATING_CRATE_DOCS in os.environ: nlp_prog_dir = "/path/to/crate_anon/nlp_manager/compiled_nlp_classes" else: this_dir = os.path.abspath( os.path.dirname(__file__) ) # crate_anon/nlp_manager nlp_prog_dir = os.path.join(this_dir, "compiled_nlp_classes") if for_docker: # See crate.Dockerfile gate_home = "/crate/gate" kcl_pharmacotherapy_dir = "/crate/brc-gate-pharmacotherapy" cloud_request_data_dir = "/crate/tmp/clouddata" gate_plugin_file = ( 
"/crate/src/crate_anon/nlp_manager/specimen_gate_plugin_file.ini" ) else: gate_home = "/path/to/GATE_Developer_8.6.1" kcl_pharmacotherapy_dir = "/path/to/brc-gate-pharmacotherapy" cloud_request_data_dir = "/srv/crate/clouddata" gate_plugin_file = "/path/to/specimen_gate_plugin_file.ini" _DA = AnonymiseConfigDefaults # ------------------------------------------------------------------------- # The demo config itself # ------------------------------------------------------------------------- # noinspection HttpUrlsUsage return f"""# Configuration file for CRATE NLP manager (crate_nlp). # Version {CRATE_VERSION} ({CRATE_VERSION_DATE}). # # PLEASE SEE THE HELP at https://crateanon.readthedocs.io/ # Using defaults for Docker environment: {for_docker} # ============================================================================= # A. Individual NLP definitions # ============================================================================= # - referred to by the NLP manager's command-line arguments # - You are likely to need to alter these (particularly the bits in capital # letters) to refer to your own database(s). 
# ----------------------------------------------------------------------------- # GATE people-and-places demo # ----------------------------------------------------------------------------- [{NlpConfigPrefixes.NLPDEF}:gate_name_location_demo] {NlpDefConfigKeys.INPUTFIELDDEFS} = {inputfields} {NlpDefConfigKeys.PROCESSORS} = GATE procdef_gate_name_location {NlpDefConfigKeys.PROGRESSDB} = {destdb} {NlpDefConfigKeys.HASHPHRASE} = {hashphrase} # ----------------------------------------------------------------------------- # KConnect (Bio-YODIE) disease-finding GATE app # ----------------------------------------------------------------------------- [{NlpConfigPrefixes.NLPDEF}:gate_kconnect_diseases] {NlpDefConfigKeys.INPUTFIELDDEFS} = {inputfields} {NlpDefConfigKeys.PROCESSORS} = GATE procdef_gate_kconnect {NlpDefConfigKeys.PROGRESSDB} = {destdb} {NlpDefConfigKeys.HASHPHRASE} = {hashphrase} # ----------------------------------------------------------------------------- # KCL Lewy body dementia GATE app # ----------------------------------------------------------------------------- [{NlpConfigPrefixes.NLPDEF}:gate_kcl_lbd] {NlpDefConfigKeys.INPUTFIELDDEFS} = {inputfields} {NlpDefConfigKeys.PROCESSORS} = GATE procdef_gate_kcl_lbda {NlpDefConfigKeys.PROGRESSDB} = {destdb} {NlpDefConfigKeys.HASHPHRASE} = {hashphrase} # ----------------------------------------------------------------------------- # KCL pharmacotherapy GATE app # ----------------------------------------------------------------------------- [{NlpConfigPrefixes.NLPDEF}:gate_kcl_pharmacotherapy] {NlpDefConfigKeys.INPUTFIELDDEFS} = {inputfields} {NlpDefConfigKeys.PROCESSORS} = GATE procdef_gate_pharmacotherapy {NlpDefConfigKeys.PROGRESSDB} = {destdb} {NlpDefConfigKeys.HASHPHRASE} = {hashphrase} # ----------------------------------------------------------------------------- # Medex-UIMA medication-finding app # ----------------------------------------------------------------------------- 
[{NlpConfigPrefixes.NLPDEF}:medex_medications] {NlpDefConfigKeys.INPUTFIELDDEFS} = {inputfields} {NlpDefConfigKeys.PROCESSORS} = Medex procdef_medex_medications {NlpDefConfigKeys.PROGRESSDB} = {destdb} {NlpDefConfigKeys.HASHPHRASE} = {hashphrase} # ----------------------------------------------------------------------------- # CRATE number-finding Python regexes # ----------------------------------------------------------------------------- [{NlpConfigPrefixes.NLPDEF}:crate_biomarkers] {NlpDefConfigKeys.INPUTFIELDDEFS} = {inputfields} {NlpDefConfigKeys.PROCESSORS} = # ------------------------------------------------------------------------- # Biochemistry # ------------------------------------------------------------------------- {proclist_biochemistry} # ------------------------------------------------------------------------- # Clinical # ------------------------------------------------------------------------- {proclist_clinical} # ------------------------------------------------------------------------- # Cognitive # ------------------------------------------------------------------------- {proclist_cognitive} # ------------------------------------------------------------------------- # Haematology # ------------------------------------------------------------------------- {proclist_haematology} # ------------------------------------------------------------------------- # Substance misuse # ------------------------------------------------------------------------- {proclist_substance_misuse} {NlpDefConfigKeys.PROGRESSDB} = {destdb} {NlpDefConfigKeys.HASHPHRASE} = {hashphrase} # {NlpDefConfigKeys.TRUNCATE_TEXT_AT} = {truncate_text_at} # {NlpDefConfigKeys.RECORD_TRUNCATED_VALUES} = False {NlpDefConfigKeys.MAX_ROWS_BEFORE_COMMIT} = {_DA.MAX_ROWS_BEFORE_COMMIT} {NlpDefConfigKeys.MAX_BYTES_BEFORE_COMMIT} = {_DA.MAX_BYTES_BEFORE_COMMIT} # ----------------------------------------------------------------------------- # Cloud NLP demo # 
----------------------------------------------------------------------------- [{NlpConfigPrefixes.NLPDEF}:cloud_nlp_demo] {NlpDefConfigKeys.INPUTFIELDDEFS} = {inputfields} {NlpDefConfigKeys.PROCESSORS} = Cloud procdef_cloud_crp {NlpDefConfigKeys.PROGRESSDB} = {destdb} {NlpDefConfigKeys.HASHPHRASE} = {hashphrase} {NlpDefConfigKeys.CLOUD_CONFIG} = {my_cloud} {NlpDefConfigKeys.CLOUD_REQUEST_DATA_DIR} = {cloud_request_data_dir} # ============================================================================= # B. NLP processor definitions # ============================================================================= # - You're likely to have to modify the destination databases these point to, # but otherwise you can probably leave them as they are. # ----------------------------------------------------------------------------- # Specimen CRATE regular expression processor definitions # ----------------------------------------------------------------------------- # Most of these are very simple, and just require a destination database # (as a cross-reference to a database section within this file) and a # destination table. 
# Biochemistry {procdefs_biochemistry} # Clinical {procdefs_clinical} # Cognitive {procdefs_cognitive} # Haematology {procdefs_haematology} # Substance misuse {procdefs_substance_misuse} # ----------------------------------------------------------------------------- # Specimen GATE demo people/places processor definition # ----------------------------------------------------------------------------- # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # Define the processor # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ [{NlpConfigPrefixes.PROCESSOR}:procdef_gate_name_location] {ProcessorConfigKeys.DESTDB} = {destdb} {ProcessorConfigKeys.OUTPUTTYPEMAP} = Person output_person Location output_location {ProcessorConfigKeys.PROGARGS} = java -classpath "{{NLPPROGDIR}}"{{OS_PATHSEP}}"{{GATE_HOME}}/lib/*" -Dgate.home="{{GATE_HOME}}" {GATE_PIPELINE_CLASSNAME} --gate_app "{{GATE_HOME}}/plugins/ANNIE/ANNIE_with_defaults.gapp" --pluginfile "{{GATE_PLUGIN_FILE}}" --annotation Person --annotation Location --input_terminator {nlp_input_terminator} --output_terminator {nlp_output_terminator} --log_tag {{NLPLOGTAG}} --verbose {ProcessorConfigKeys.PROGENVSECTION} = {my_env} {ProcessorConfigKeys.INPUT_TERMINATOR} = {nlp_input_terminator} {ProcessorConfigKeys.OUTPUT_TERMINATOR} = {nlp_output_terminator} # {ProcessorConfigKeys.MAX_EXTERNAL_PROG_USES} = 1000 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # Define the output tables used by this GATE processor # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ [{NlpConfigPrefixes.OUTPUT}:output_person] {NlpOutputConfigKeys.DESTTABLE} = person {NlpOutputConfigKeys.RENAMES} = firstName firstname {NlpOutputConfigKeys.DESTFIELDS} = rule VARCHAR(100) Rule used to find this person (e.g. TitleFirstName, PersonFull) firstname VARCHAR(100) First name surname VARCHAR(100) Surname gender VARCHAR(7) Gender (e.g. 
male, female, unknown) kind VARCHAR(100) Kind of name (e.g. personName, fullName) # ... longest gender: "unknown" (7) {NlpOutputConfigKeys.INDEXDEFS} = firstname 64 surname 64 [{NlpConfigPrefixes.OUTPUT}:output_location] {NlpOutputConfigKeys.DESTTABLE} = location {NlpOutputConfigKeys.RENAMES} = locType loctype {NlpOutputConfigKeys.DESTFIELDS} = rule VARCHAR(100) Rule used (e.g. Location1) loctype VARCHAR(100) Location type (e.g. city) {NlpOutputConfigKeys.INDEXDEFS} = rule 100 loctype 100 # ----------------------------------------------------------------------------- # Specimen Sheffield/KCL KConnect (Bio-YODIE) processor definition # ----------------------------------------------------------------------------- # https://gate.ac.uk/applications/bio-yodie.html # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # Define the processor # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ [{NlpConfigPrefixes.PROCESSOR}:procdef_gate_kconnect] {ProcessorConfigKeys.DESTDB} = {destdb} {ProcessorConfigKeys.OUTPUTTYPEMAP} = Disease_or_Syndrome output_disease_or_syndrome {ProcessorConfigKeys.PROGARGS} = java -classpath "{{NLPPROGDIR}}"{{OS_PATHSEP}}"{{GATE_HOME}}/lib/*" -Dgate.home="{{GATE_HOME}}" {GATE_PIPELINE_CLASSNAME} --gate_app "{{KCONNECTDIR}}/main-bio/main-bio.xgapp" --pluginfile "{{GATE_PLUGIN_FILE}}" --annotation Disease_or_Syndrome --input_terminator {nlp_input_terminator} --output_terminator {nlp_output_terminator} --log_tag {{NLPLOGTAG}} --suppress_gate_stdout --verbose {ProcessorConfigKeys.PROGENVSECTION} = {my_env} {ProcessorConfigKeys.INPUT_TERMINATOR} = {nlp_input_terminator} {ProcessorConfigKeys.OUTPUT_TERMINATOR} = {nlp_output_terminator} # {ProcessorConfigKeys.MAX_EXTERNAL_PROG_USES} = 1000 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # Define the output tables used by this GATE processor # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 
[{NlpConfigPrefixes.OUTPUT}:output_disease_or_syndrome] {NlpOutputConfigKeys.DESTTABLE} = kconnect_diseases {NlpOutputConfigKeys.RENAMES} = Experiencer experiencer Negation negation PREF pref STY sty TUI tui Temporality temporality VOCABS vocabs {NlpOutputConfigKeys.DESTFIELDS} = # Found by manual inspection of KConnect/Bio-YODIE output from the GATE console: experiencer VARCHAR(100) Who experienced it; e.g. "Patient", "Other" negation VARCHAR(100) Was it negated or not; e.g. "Affirmed", "Negated" pref VARCHAR(100) PREFferred name; e.g. "Rheumatic gout" sty VARCHAR(100) Semantic Type (STY) [semantic type name]; e.g. "Disease or Syndrome" tui VARCHAR(4) Type Unique Identifier (TUI) [semantic type identifier]; 4 characters; https://www.ncbi.nlm.nih.gov/books/NBK9679/; e.g. "T047" temporality VARCHAR(100) Occurrence in time; e.g. "Recent", "historical", "hypothetical" vocabs VARCHAR(255) List of UMLS vocabularies; e.g. "AIR,MSH,NDFRT,MEDLINEPLUS,NCI,LNC,NCI_FDA,NCI,MTH,AIR,ICD9CM,LNC,SNOMEDCT_US,LCH_NW,HPO,SNOMEDCT_US,ICD9CM,SNOMEDCT_US,COSTAR,CST,DXP,QMR,OMIM,OMIM,AOD,CSP,NCI_NCI-GLOSS,CHV" inst VARCHAR(8) Looks like a Concept Unique Identifier (CUI); 1 letter then 7 digits; e.g. "C0003873" inst_full VARCHAR(255) Looks like a URL to a CUI; e.g. "http://linkedlifedata.com/resource/umls/id/C0003873" language VARCHAR(100) Language; e.g. ""; ?will look like "ENG" for English? See https://www.nlm.nih.gov/research/umls/implementation_resources/query_diagrams/er1.html tui_full VARCHAR(255) TUI (?); e.g. 
"http://linkedlifedata.com/resource/semanticnetwork/id/T047" {NlpOutputConfigKeys.INDEXDEFS} = pref 100 sty 100 tui 4 inst 8 # ----------------------------------------------------------------------------- # Specimen KCL GATE pharmacotherapy processor definition # ----------------------------------------------------------------------------- # https://github.com/KHP-Informatics/brc-gate-pharmacotherapy # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # Define the processor # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ [{NlpConfigPrefixes.PROCESSOR}:procdef_gate_pharmacotherapy] {ProcessorConfigKeys.DESTDB} = {destdb} {ProcessorConfigKeys.OUTPUTTYPEMAP} = Prescription output_prescription {ProcessorConfigKeys.PROGARGS} = java -classpath "{{NLPPROGDIR}}"{{OS_PATHSEP}}"{{GATE_HOME}}/lib/*" -Dgate.home="{{GATE_HOME}}" {GATE_PIPELINE_CLASSNAME} --gate_app "{{GATE_PHARMACOTHERAPY_DIR}}/application.xgapp" --pluginfile "{{GATE_PLUGIN_FILE}}" --include_set Output --annotation Prescription --input_terminator {nlp_input_terminator} --output_terminator {nlp_output_terminator} --log_tag {{NLPLOGTAG}} --suppress_gate_stdout --show_contents_on_crash {ProcessorConfigKeys.PROGENVSECTION} = {my_env} {ProcessorConfigKeys.INPUT_TERMINATOR} = {nlp_input_terminator} {ProcessorConfigKeys.OUTPUT_TERMINATOR} = {nlp_output_terminator} # {ProcessorConfigKeys.MAX_EXTERNAL_PROG_USES} = 1000 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # Define the output tables used by this GATE processor # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ [{NlpConfigPrefixes.OUTPUT}:output_prescription] {NlpOutputConfigKeys.DESTTABLE} = medications_gate {NlpOutputConfigKeys.RENAMES} = drug-type drug_type dose-value dose_value dose-unit dose_unit dose-multiple dose_multiple Directionality directionality Experiencer experiencer "Length of Time" length_of_time Temporality temporality "Unit of Time" 
unit_of_time {NlpOutputConfigKeys.NULL_LITERALS} = null "" {NlpOutputConfigKeys.DESTFIELDS} = # Found by (a) manual inspection of BRC GATE pharmacotherapy output from # the GATE console; (b) inspection of # application-resources/schemas/Prescription.xml # Note preference for DECIMAL over FLOAT/REAL; see # https://stackoverflow.com/questions/1056323 # Note that not all annotations appear for all texts. Try e.g.: # Please start haloperidol 5mg tds. # I suggest you start haloperidol 5mg tds for one week. rule VARCHAR(100) Rule yielding this drug. Not in XML but is present in a subset: e.g. "weanOff"; max length unclear drug VARCHAR(200) Drug name. Required string; e.g. "haloperidol"; max length 47 from "wc -L BNF_generic.lst", 134 from BNF_trade.lst drug_type VARCHAR(100) Type of drug name. Required string; from "drug-type"; e.g. "BNF_generic"; ?length of longest drug ".lst" filename dose VARCHAR(100) Dose text. Required string; e.g. "5mg"; max length unclear dose_value DECIMAL Numerical dose value. Required numeric; from "dose-value"; "double" in the XML but DECIMAL probably better; e.g. 5.0 dose_unit VARCHAR(100) Text of dose units. Required string; from "dose-unit"; e.g. "mg"; max length unclear dose_multiple INT Dose count (multiple). Required integer; from "dose-multiple"; e.g. 1 route VARCHAR(7) Route of administration. Required string; one of: "oral", "im", "iv", "rectal", "sc", "dermal", "unknown" status VARCHAR(10) Change in drug status. Required; one of: "start", "continuing", "stop" tense VARCHAR(7) Tense in which drug is referred to. Required; one of: "past", "present" date VARCHAR(100) ?. Optional string; max length unclear directionality VARCHAR(100) ?. Optional string; max length unclear experiencer VARCHAR(100) Person experiencing the drug-related event. Optional string; e.g. "Patient" frequency DECIMAL Frequency (times per <time_unit>). 
Optional numeric; "double" in the XML but DECIMAL probably better interval DECIMAL The n in "every n <time_unit>s" (1 for "every <time_unit>"). Optional numeric; "double" in the XML but DECIMAL probably better length_of_time VARCHAR(100) ?. Optional string; from "Length of Time"; max length unclear temporality VARCHAR(100) ?. Optional string; e.g. "Recent", "Historical" time_unit VARCHAR(100) Unit of time (see frequency, interval). Optional string; from "time-unit"; e.g. "day"; max length unclear unit_of_time VARCHAR(100) ?. Optional string; from "Unit of Time"; max length unclear when VARCHAR(100) ?. Optional string; max length unclear {NlpOutputConfigKeys.INDEXDEFS} = rule 100 drug 200 route 7 status 10 tense 7 # ----------------------------------------------------------------------------- # Specimen KCL Lewy Body Diagnosis Application (LBDA) processor definition # ----------------------------------------------------------------------------- # https://github.com/KHP-Informatics/brc-gate-LBD # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # Define the processor # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ [{NlpConfigPrefixes.PROCESSOR}:procdef_gate_kcl_lbda] # "cDiagnosis" is the "confirmed diagnosis" field, as d/w Jyoti Jyoti # 2018-03-20; see also README.md. This appears in the "Automatic" and the # unnamed set. There is also a near-miss one, "DiagnosisAlmost", which # appears in the unnamed set. # "Mr Jones has Lewy body dementia." # -> DiagnosisAlmost # "Mr Jones has a diagnosis of Lewy body dementia." # -> DiagnosisAlmost, cDiagnosis # Note that we must use lower case in the outputtypemap. 
{ProcessorConfigKeys.DESTDB} = {destdb} {ProcessorConfigKeys.OUTPUTTYPEMAP} = cDiagnosis output_lbd_diagnosis DiagnosisAlmost output_lbd_diagnosis {ProcessorConfigKeys.PROGARGS} = java -classpath "{{NLPPROGDIR}}"{{OS_PATHSEP}}"{{GATE_HOME}}/lib/*" -Dgate.home="{{GATE_HOME}}" {GATE_PIPELINE_CLASSNAME} --gate_app "{{KCL_LBDA_DIR}}/application.xgapp" --pluginfile "{{GATE_PLUGIN_FILE}}" --set_annotation "" DiagnosisAlmost --set_annotation Automatic cDiagnosis --input_terminator {nlp_input_terminator} --output_terminator {nlp_output_terminator} --log_tag {{NLPLOGTAG}} --suppress_gate_stdout --verbose {ProcessorConfigKeys.PROGENVSECTION} = {my_env} {ProcessorConfigKeys.INPUT_TERMINATOR} = {nlp_input_terminator} {ProcessorConfigKeys.OUTPUT_TERMINATOR} = {nlp_output_terminator} # {ProcessorConfigKeys.MAX_EXTERNAL_PROG_USES} = 1000 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # Define the output tables used by this GATE processor # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ [{NlpConfigPrefixes.OUTPUT}:output_lbd_diagnosis] {NlpOutputConfigKeys.DESTTABLE} = lewy_body_dementia_gate {NlpOutputConfigKeys.NULL_LITERALS} = null "" {NlpOutputConfigKeys.DESTFIELDS} = # Found by # (a) manual inspection of output from the GATE Developer console: # - e.g. {{rule=Includefin, text=Lewy body dementia}} # (b) inspection of contents: # - run a Cygwin shell # - find . -type f -exec grep cDiagnosis -l {{}} \\; # - 3 hits: # ./application-resources/jape/DiagnosisExclude2.jape # ... part of the "Lewy"-detection apparatus # ./application-resources/jape/text-feature.jape # ... adds "text" annotation to cDiagnosis Token # ./application.xgapp # ... in annotationTypes # On that basis: rule VARCHAR(100) Rule that generated the hit. text VARCHAR(200) Text that matched the rule. 
{NlpOutputConfigKeys.INDEXDEFS} = rule 100 text 200 # ----------------------------------------------------------------------------- # Specimen MedEx processor definition # ----------------------------------------------------------------------------- # https://sbmi.uth.edu/ccb/resources/medex.htm [{NlpConfigPrefixes.PROCESSOR}:procdef_medex_medications] {ProcessorConfigKeys.DESTDB} = {destdb} {ProcessorConfigKeys.DESTTABLE} = medications_medex {ProcessorConfigKeys.PROGARGS} = java -classpath {{NLPPROGDIR}}:{{MEDEXDIR}}/bin:{{MEDEXDIR}}/lib/* -Dfile.encoding=UTF-8 CrateMedexPipeline -lt {{NLPLOGTAG}} -v -v # ... other arguments are added by the code {ProcessorConfigKeys.PROGENVSECTION} = {my_env} # ============================================================================= # C. Environment variable definitions # ============================================================================= # - You'll need to modify this according to your local configuration. [{NlpConfigPrefixes.ENV}:{my_env}] GATE_HOME = {gate_home} GATE_PHARMACOTHERAPY_DIR = {kcl_pharmacotherapy_dir} GATE_PLUGIN_FILE = {gate_plugin_file} KCL_LBDA_DIR = /path/to/brc-gate-LBD/Lewy_Body_Diagnosis KCONNECTDIR = /path/to/yodie-pipeline-1-2-umls-only MEDEXDIR = /path/to/Medex_UIMA_1.3.6 NLPPROGDIR = {nlp_prog_dir} OS_PATHSEP = {os.pathsep} # ============================================================================= # D. 
Input field definitions # ============================================================================= [{NlpConfigPrefixes.INPUT}:{if_clin_docs}] {InputFieldConfigKeys.SRCDB} = {my_src_db} {InputFieldConfigKeys.SRCTABLE} = EXTRACTED_CLINICAL_DOCUMENTS {InputFieldConfigKeys.SRCPKFIELD} = DOCUMENT_PK {InputFieldConfigKeys.SRCFIELD} = DOCUMENT_TEXT {InputFieldConfigKeys.SRCDATETIMEFIELD} = DOCUMENT_DATE {InputFieldConfigKeys.COPYFIELDS} = {ridfield} {tridfield} {InputFieldConfigKeys.INDEXED_COPYFIELDS} = {ridfield} {tridfield} # {InputFieldConfigKeys.DEBUG_ROW_LIMIT} = 0 [{NlpConfigPrefixes.INPUT}:{if_prog_notes}] {InputFieldConfigKeys.SRCDB} = {my_src_db} {InputFieldConfigKeys.SRCTABLE} = PROGRESS_NOTES {InputFieldConfigKeys.SRCPKFIELD} = PN_PK {InputFieldConfigKeys.SRCFIELD} = PN_TEXT {InputFieldConfigKeys.SRCDATETIMEFIELD} = PN_DATE {InputFieldConfigKeys.COPYFIELDS} = {ridfield} {tridfield} {InputFieldConfigKeys.INDEXED_COPYFIELDS} = {ridfield} {tridfield} # ============================================================================= # E. Database definitions, each in its own section # ============================================================================= [{NlpConfigPrefixes.DATABASE}:{my_src_db}] {DatabaseConfigKeys.URL} = mysql+mysqldb://anontest:XXX@127.0.0.1:3306/anonymous_output?charset=utf8 [{NlpConfigPrefixes.DATABASE}:{destdb}] {DatabaseConfigKeys.URL} = mysql+mysqldb://anontest:XXX@127.0.0.1:3306/anonymous_output?charset=utf8 # ============================================================================= # F. 
Information for using cloud-based NLP # ============================================================================= [{NlpConfigPrefixes.CLOUD}:{my_cloud}] {CloudNlpConfigKeys.CLOUD_URL} = https://your_url {CloudNlpConfigKeys.USERNAME} = your_username {CloudNlpConfigKeys.PASSWORD} = your_password {CloudNlpConfigKeys.WAIT_ON_CONN_ERR} = {DEFAULT_CLOUD_WAIT_ON_CONN_ERR_S} {CloudNlpConfigKeys.MAX_CONTENT_LENGTH} = {DEFAULT_CLOUD_MAX_CONTENT_LENGTH} {CloudNlpConfigKeys.MAX_RECORDS_PER_REQUEST} = {DEFAULT_CLOUD_MAX_RECORDS_PER_REQUEST} {CloudNlpConfigKeys.LIMIT_BEFORE_COMMIT} = {DEFAULT_CLOUD_LIMIT_BEFORE_COMMIT} {CloudNlpConfigKeys.STOP_AT_FAILURE} = true {CloudNlpConfigKeys.MAX_TRIES} = {DEFAULT_CLOUD_MAX_TRIES} {CloudNlpConfigKeys.RATE_LIMIT_HZ} = {DEFAULT_CLOUD_RATE_LIMIT_HZ} [{NlpConfigPrefixes.PROCESSOR}:procdef_cloud_crp] {ProcessorConfigKeys.DESTDB} = {destdb} {ProcessorConfigKeys.DESTTABLE} = crp_test {ProcessorConfigKeys.PROCESSOR_NAME} = crate_anon.nlp_manager.parse_biochemistry.Crp {ProcessorConfigKeys.PROCESSOR_FORMAT} = {NlpDefValues.FORMAT_STANDARD} """ # noqa
# ============================================================================= # Get config filename (from environment variable) # =============================================================================
def get_nlp_config_filename_or_exit() -> str:
    """
    Returns the config filename, from our environment variable. If we can't
    retrieve it, perform a hard exit.
    """
    # A missing variable and an empty value are both treated as
    # "not configured".
    config_filename = os.environ.get(NLP_CONFIG_ENV_VAR, "")
    if not config_filename:
        print(
            f"You must set the {NLP_CONFIG_ENV_VAR} environment variable "
            f"to point to a CRATE NLP config file, or specify it on the "
            f"command line."
        )
        sys.exit(1)
    return config_filename
# ============================================================================= # Config class # =============================================================================
[docs]class NlpDefinition: """ Class representing an NLP master definition as read from config file. An NLP definition represents the combination of - one or more NLP processors (e.g. "CRATE's C-reactive protein finder") - one or more input fields in the source database The NLP definition can therefore be used to say "run this set of NLP processors over this set of textual fields in my database". See the documentation for the :ref:`NLP config file <nlp_config>`. """
# NOTE(review): the text of this constructor was flattened by
# documentation/HTML extraction ("[docs]" is a Sphinx link artifact); it is
# kept exactly as captured rather than reflowed by guesswork.
# __init__ reads the "nlpdef:<nlpname>" config section via ConfigSection and
# initialises, in this order: the database cache and progress DB, the
# temporary tablename, the hash phrase and HashClass hasher, the
# max-rows/max-bytes commit thresholds, the creation timestamp, text
# truncation options, cloud config name/data dir, the input-field map
# (duplicate names skipped), the NLP processors (PROCESSORS parsed as
# (type, name) pairs via chunks(..., 2), with a critical log on a bad odd
# count), the per-session TransactionSizeLimiter dict, and a lazily-loaded
# CloudConfig cache (None until requested).
[docs]    def __init__(self, nlpname: str, logtag: str = "") -> None: """ Read config from file. Args: nlpname: config section name for this NLP definition logtag: text that may be passed to child processes to identify the NLP definition in their log output """ # DELAYED IMPORTS (to make life simpler for classes deriving from # NlpParser and using NlpDefinition -- they can now do it directly, # not just via forward reference). from crate_anon.nlp_manager.all_processors import make_nlp_parser from crate_anon.nlp_manager.input_field_config import InputFieldConfig self._nlpname = nlpname self._logtag = logtag log.info(f"Loading config for section: {nlpname}") self._config_filename = get_nlp_config_filename_or_exit() # Read config from file. self._cfg = ConfigSection( section=full_sectionname(NlpConfigPrefixes.NLPDEF, nlpname), filename=self._config_filename, case_sensitive=True, ) # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # Our own stuff # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ self._databases = {} # type: Dict[str, DatabaseHolder] self._progressdb_name = self._cfg.opt_str( NlpDefConfigKeys.PROGRESSDB, required=True ) self._progdb = self.get_database(self._progressdb_name) self._temporary_tablename = self._cfg.opt_str( NlpDefConfigKeys.TEMPORARY_TABLENAME, default=DEFAULT_TEMPORARY_TABLENAME, ) self._hashphrase = self._cfg.opt_str( NlpDefConfigKeys.HASHPHRASE, required=True ) self._hasher = HashClass(self._hashphrase) self._max_rows_before_commit = self._cfg.opt_int_positive( NlpDefConfigKeys.MAX_ROWS_BEFORE_COMMIT, AnonymiseConfigDefaults.MAX_ROWS_BEFORE_COMMIT, ) self._max_bytes_before_commit = self._cfg.opt_int_positive( NlpDefConfigKeys.MAX_BYTES_BEFORE_COMMIT, AnonymiseConfigDefaults.MAX_BYTES_BEFORE_COMMIT, ) self._now = get_now_utc_notz_datetime() self.truncate_text_at = self._cfg.opt_int_positive( NlpDefConfigKeys.TRUNCATE_TEXT_AT, default=0 ) self.record_truncated_values = self._cfg.opt_bool( 
NlpDefConfigKeys.RECORD_TRUNCATED_VALUES, default=False ) self._cloud_config_name = self._cfg.opt_str( NlpDefConfigKeys.CLOUD_CONFIG ) self._cloud_request_data_dir = self._cfg.opt_str( NlpDefConfigKeys.CLOUD_REQUEST_DATA_DIR ) # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # Input field definitions # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ self._inputfielddefs = self._cfg.opt_strlist( NlpDefConfigKeys.INPUTFIELDDEFS, required=True, lower=False ) self._inputfieldmap = {} # type: Dict[str, InputFieldConfig] for cfg_input_name in self._inputfielddefs: if cfg_input_name in self._inputfieldmap: continue self._inputfieldmap[cfg_input_name] = InputFieldConfig( self, cfg_input_name ) # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # NLP processors # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ self._processors = [] # type: List[TableMaker] processorpairs = self._cfg.opt_strlist( NlpDefConfigKeys.PROCESSORS, required=True, lower=False ) # self._procstmp = {} try: for proctype, procname in chunks(processorpairs, 2): processor = make_nlp_parser( classname=proctype, nlpdef=self, cfg_processor_name=procname, ) self._processors.append(processor) except ValueError: log.critical(f"Bad {NlpDefConfigKeys.PROCESSORS} specification") raise # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # Transaction sizes, for early commit # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ self._transaction_limiters = ( {} ) # type: Dict[Session, TransactionSizeLimiter] # dictionary of session -> TransactionSizeLimiter # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # Cloud config (loaded on request, then cached) # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ self._cloudcfg = None # type: Optional[CloudConfig]
# ------------------------------------------------------------------------- # Basic info # ------------------------------------------------------------------------- @property def name(self) -> str: """ Returns the name of the NLP definition. """ return self._nlpname @property def logtag(self) -> str: """ Returns the log tag of the NLP definition (may be used by child processes to provide more information for logs). """ return self._logtag @property def now(self) -> datetime.datetime: """ Returns the time this NLP definition was created (in UTC). Used to time-stamp NLP runs. """ return self._now # ------------------------------------------------------------------------- # Config file # ------------------------------------------------------------------------- @property def parser(self) -> ExtendedConfigParser: """ Returns the :class:`crate_anon.common.extendedconfigparser.ExtendedConfigParser` in use. """ return self._cfg.parser
[docs] def get_config_section(self, section: str) -> ConfigSection: """ Returns a :class:`crate_anon.common.extendedconfigparser.ConfigSection` referring to a (potentially different) section. Args: section: New section name. """ return self._cfg.other_section(section)
[docs] def get_env_dict( self, env_section_name: str, parent_env: Optional[Dict[str, str]] = None, ) -> Dict[str, str]: """ Gets an operating system environment variable dictionary (``variable: value`` mapping) from the config file. Args: env_section_name: config section name, without its "env:" prefix parent_env: optional starting point (e.g. parent OS environment) Returns: a dictionary suitable for use as an OS environment """ return self._cfg.parser.get_env_dict( full_sectionname(NlpConfigPrefixes.ENV, env_section_name), parent_env=parent_env, )
[docs] def get_database( self, name_and_cfg_section: str, with_session: bool = True, with_conn: bool = False, reflect: bool = False, ) -> DatabaseHolder: """ Returns a :class:`crate_anon.anonymise.dbholder.DatabaseHolder` from the config file, containing information abuot a database. Args: name_and_cfg_section: string that is the name of the database, and also the config file section name describing the database with_session: create an SQLAlchemy Session? with_conn: create an SQLAlchemy connection (via an Engine)? reflect: read the database structure (when required)? """ if name_and_cfg_section in self._databases: return self._databases[name_and_cfg_section] dbsection = full_sectionname( NlpConfigPrefixes.DATABASE, name_and_cfg_section ) assert len(name_and_cfg_section) <= MAX_SQL_FIELD_LEN db = self.parser.get_database( dbsection, with_session=with_session, with_conn=with_conn, reflect=reflect, ) self._databases[name_and_cfg_section] = db return db
    # -------------------------------------------------------------------------
    # Hashing
    # -------------------------------------------------------------------------

    def hash(self, text: str) -> str:
        """
        Hash text via this NLP definition's hasher. The hash will be stored
        in a secret progress database, and is used to detect later changes
        in the source records.

        Args:
            text: text (typically from the source database) to be hashed

        Returns:
            the hashed value
        """
        return self._hasher.hash(text)
# ------------------------------------------------------------------------- # Database # ------------------------------------------------------------------------- @property def temporary_tablename(self) -> str: """ Temporary tablename to use. See the documentation for the :ref:`NLP config file <nlp_config>`. """ return self._temporary_tablename
[docs] def set_echo(self, echo: bool) -> None: """ Set the SQLAlchemy ``echo`` parameter (to echo SQL) for all our source databases. """ self._progdb.engine.echo = echo for db in self._databases.values(): db.engine.echo = echo # Now, SQLAlchemy will mess things up by adding an additional handler. # So, bye-bye: for logname in ( "sqlalchemy.engine.base.Engine", "sqlalchemy.engine.base.OptionEngine", ): logger = logging.getLogger(logname) logger.handlers = [] # ... of type: List[logging.Handler]
@property def progressdb_session(self) -> Session: """ Returns an SQLAlchemy ORM :class:`Session` for the progress database. """ return self._progdb.session @property def progressdb_engine(self) -> Engine: """ Returns an SQLAlchemy Core :class:`Engine` for the progress database. """ return self._progdb.engine @property def progressdb_metadata(self) -> MetaData: """ Returns the SQLAlchemy :class:`MetaData` for the progress database. """ return self._progdb.metadata @property def progdb(self) -> DatabaseHolder: """ Returns the progress database. """ return self._progdb
[docs] def commit_all(self) -> None: """ Execute a COMMIT on all databases (all destination database and the progress database). """ self.commit(self.progressdb_session) for db in self._databases.values(): self.commit(db.session)
[docs] def get_transation_limiter( self, session: Session ) -> TransactionSizeLimiter: """ Returns (or creates and returns) a transaction limiter for a given SQLAlchemy session. Args: session: SQLAlchemy ORM :class:`Session` Returns: a :class:`crate_anon.common.sql.TransactionSizeLimiter` """ if session not in self._transaction_limiters: self._transaction_limiters[session] = TransactionSizeLimiter( session, max_rows_before_commit=self._max_rows_before_commit, max_bytes_before_commit=self._max_bytes_before_commit, ) return self._transaction_limiters[session]
[docs] def notify_transaction( self, session: Session, n_rows: int, n_bytes: int, force_commit: bool = False, ) -> None: """ Tell our transaction limiter about a transaction that's occurred on one of our databases. This may trigger a COMMIT. Args: session: SQLAlchemy ORM :class:`Session` that was used n_rows: number of rows inserted n_bytes: number of bytes inserted force_commit: force a COMMIT? """ tl = self.get_transation_limiter(session) tl.notify(n_rows=n_rows, n_bytes=n_bytes, force_commit=force_commit)
[docs] def commit(self, session: Session) -> None: """ Executes a COMMIT on a specific session. Args: session: SQLAlchemy ORM :class:`Session` """ tl = self.get_transation_limiter(session) tl.commit()
# ------------------------------------------------------------------------- # Input fields # ------------------------------------------------------------------------- @property def inputfieldconfigs(self) -> Iterable["InputFieldConfig"]: """ Returns all input field configurations used by this NLP definition. Returns: list of `crate_anon.nlp_manager.input_field_config.InputFieldConfig` objects """ return self._inputfieldmap.values() # ------------------------------------------------------------------------- # NLP processors # ------------------------------------------------------------------------- @property def processors(self) -> List["TableMaker"]: """ Returns all NLP processors used by this NLP definition. Returns: list of objects derived from :class:`crate_anon.nlp_manager.base_nlp_parser.BaseNlpParser` """ return self._processors @property def noncloud_processors(self) -> List["BaseNlpParser"]: """ Returns all local (non-cloud) NLP processors used by this NLP definition. Returns: list of objects derived from :class:`crate_anon.nlp_manager.base_nlp_parser.BaseNlpParser` """ return [ x for x in self._processors if not x.is_cloud_processor() ] # type: List["BaseNlpParser"] @property def uses_cloud_processors(self) -> bool: """ Are any of our processors cloud-based? """ return any(x.is_cloud_processor() for x in self._processors) # ------------------------------------------------------------------------- # NLPRP info # -------------------------------------------------------------------------
[docs] def nlprp_local_processors( self, sql_dialect: str = None ) -> Dict[str, Any]: """ Returns a draft list of processors as per the NLPRP :ref:`list_processors <nlprp_list_processors>` command. """ processors = [] # type: List[Dict, str, Any] for proc in self.noncloud_processors: processors.append(proc.nlprp_processor_info(sql_dialect)) return {NlprpKeys.PROCESSORS: processors}
[docs] def nlprp_local_processors_json( self, indent: int = 4, sort_keys: bool = True, sql_dialect: str = None ) -> str: """ Returns a formatted JSON string from :func:`nlprp_list_processors`. This is primarily for debugging. Args: indent: number of spaces for indentation sort_keys: sort keys? sql_dialect: preferred SQL dialect for ``tabular_schema``, or ``None`` for default """ json_structure = self.nlprp_local_processors(sql_dialect=sql_dialect) return json.dumps(json_structure, indent=indent, sort_keys=sort_keys)
# ------------------------------------------------------------------------- # Cloud NLP # -------------------------------------------------------------------------
[docs] def get_cloud_config(self) -> Optional[CloudConfig]: """ Returns the :class:`crate_anon.nlp_manager.cloud_config.CloudConfig` object associated with this NLP definition, or ``None`` if there isn't one. """ our_name = self.name if self._cloudcfg is None: if not self._cloud_config_name: raise ValueError( f"No {NlpDefConfigKeys.CLOUD_CONFIG!r} parameter " f"specified for NLP definition {our_name!r}" ) if not self._cloud_request_data_dir: raise ValueError( f"No {NlpDefConfigKeys.CLOUD_REQUEST_DATA_DIR!r} " f"parameter specified for NLP definition {our_name!r}" ) req_root_dir = os.path.abspath(self._cloud_request_data_dir) if not os.path.isdir(req_root_dir): raise ValueError( f"Directory {req_root_dir!r}, specified by config " f"parameter {NlpDefConfigKeys.CLOUD_REQUEST_DATA_DIR!r} " f"for NLP definition {our_name!r}" ) req_data_dir = os.path.join(req_root_dir, our_name) os.makedirs(req_data_dir, exist_ok=True) self._cloudcfg = CloudConfig( self, name=self._cloud_config_name, req_data_dir=req_data_dir ) return self._cloudcfg
[docs] def get_cloud_config_or_raise(self) -> CloudConfig: """ Returns the :class:`crate_anon.nlp_manager.cloud_config.CloudConfig` object associated with this NLP definition, or raise :exc:`ValueError` if there isn't one. """ cloudcfg = self.get_cloud_config() if cloudcfg is None: raise ValueError( f"No cloud NLP configuration for NLP definition " f"{self.name!r}" ) if not cloudcfg.remote_processors: raise ValueError( f"No remote (cloud) processors configured for " f"NLP definition {self.name!r}" ) return cloudcfg