#!/usr/bin/env python
"""
crate_anon/nlp_manager/nlp_definition.py
===============================================================================
Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
Created by Rudolf Cardinal (rnc1001@cam.ac.uk).
This file is part of CRATE.
CRATE is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
CRATE is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with CRATE. If not, see <https://www.gnu.org/licenses/>.
===============================================================================
**NLP definition class.**
"""
# =============================================================================
# Imports
# =============================================================================
import datetime
import json
import logging
import os
import sys
from typing import (
Any,
Dict,
Iterable,
List,
Optional,
Tuple,
Type,
TYPE_CHECKING,
)
from cardinal_pythonlib.datetimefunc import get_now_utc_notz_datetime
from cardinal_pythonlib.docker import running_under_docker
from cardinal_pythonlib.lists import chunks
from sqlalchemy.engine.base import Engine
from sqlalchemy.orm.session import Session
from sqlalchemy.schema import MetaData
from crate_anon.anonymise.constants import AnonymiseConfigDefaults
from crate_anon.anonymise.dbholder import DatabaseHolder
from crate_anon.common.constants import EnvVar
from crate_anon.common.extendedconfigparser import (
ConfigSection,
ExtendedConfigParser,
)
from crate_anon.common.sql import TransactionSizeLimiter
from crate_anon.nlp_manager.cloud_config import CloudConfig
from crate_anon.nlp_manager.constants import (
CloudNlpConfigKeys,
DatabaseConfigKeys,
DEFAULT_CLOUD_LIMIT_BEFORE_COMMIT,
DEFAULT_CLOUD_MAX_CONTENT_LENGTH,
DEFAULT_CLOUD_MAX_RECORDS_PER_REQUEST,
DEFAULT_CLOUD_MAX_TRIES,
DEFAULT_CLOUD_RATE_LIMIT_HZ,
DEFAULT_CLOUD_WAIT_ON_CONN_ERR_S,
DEFAULT_TEMPORARY_TABLENAME,
full_sectionname,
GATE_PIPELINE_CLASSNAME,
NlpOutputConfigKeys,
HashClass,
InputFieldConfigKeys,
MAX_SQL_FIELD_LEN,
NLP_CONFIG_ENV_VAR,
NlpConfigPrefixes,
NlpDefConfigKeys,
ProcessorConfigKeys,
NlpDefValues,
)
from crate_anon.nlprp.constants import NlprpKeys
from crate_anon.version import CRATE_VERSION, CRATE_VERSION_DATE
if TYPE_CHECKING:
from crate_anon.nlp_manager.base_nlp_parser import (
BaseNlpParser,
TableMaker,
)
from crate_anon.nlp_manager.input_field_config import InputFieldConfig
log = logging.getLogger(__name__)
# =============================================================================
# Demo config
# =============================================================================
[docs]def demo_nlp_config() -> str:
"""
Returns a demo NLP config file for CRATE.
"""
# -------------------------------------------------------------------------
# Imports
# -------------------------------------------------------------------------
from crate_anon.nlp_manager.parse_biochemistry import (
ALL_BIOCHEMISTRY_NLP_AND_VALIDATORS,
) # delayed import
from crate_anon.nlp_manager.parse_clinical import (
ALL_CLINICAL_NLP_AND_VALIDATORS,
) # delayed import
from crate_anon.nlp_manager.parse_cognitive import (
ALL_COGNITIVE_NLP_AND_VALIDATORS,
) # delayed import
from crate_anon.nlp_manager.parse_haematology import (
ALL_HAEMATOLOGY_NLP_AND_VALIDATORS,
) # delayed import
from crate_anon.nlp_manager.parse_substance_misuse import (
ALL_SUBSTANCE_MISUSE_NLP_AND_VALIDATORS,
) # delayed import
# -------------------------------------------------------------------------
# Helper functions
# -------------------------------------------------------------------------
def _make_procdef_pair(name: str) -> str:
    """
    Returns config-file text defining a pair of processor sections for one
    NLP parser: ``procdef_<name>`` and ``procdef_validate_<name>``.

    Each section contains a ``ProcessorConfigKeys.DESTDB`` line and a
    ``ProcessorConfigKeys.DESTTABLE`` line; the destination tables are
    ``<name>`` and ``validate_<name>`` respectively.

    Args:
        name: name fragment used both in the section names and in the
            destination table names

    Note that ``destdb`` is not a parameter: it is a free variable, looked
    up in the enclosing scope at the time this function is called.
    """
    return f"""[{NlpConfigPrefixes.PROCESSOR}:procdef_{name}]
{ProcessorConfigKeys.DESTDB} = {destdb}
{ProcessorConfigKeys.DESTTABLE} = {name}
[{NlpConfigPrefixes.PROCESSOR}:procdef_validate_{name}]
{ProcessorConfigKeys.DESTDB} = {destdb}
{ProcessorConfigKeys.DESTTABLE} = validate_{name}"""
def _make_module_procdef_block(
    nlp_and_validators: List[
        Tuple[Type["BaseNlpParser"], Type["BaseNlpParser"]]
    ]
) -> str:
    """
    Returns the processor-definition text for a whole module's worth of
    parsers.

    One pair of sections (see ``_make_procdef_pair``) is generated per
    ``(NLP class, validator class)`` tuple, and the pairs are separated by
    a blank line.

    Args:
        nlp_and_validators: list of ``(NLP class, validator class)`` tuples

    Only the NLP class is consulted: its lower-cased ``classname()`` is the
    name passed on for both sections of the pair. The validator class is
    unpacked but otherwise unused.
    """
    return "\n\n".join(
        _make_procdef_pair(parser_cls.classname().lower())
        for parser_cls, _validator_cls in nlp_and_validators
    )
def _make_proclist(
    nlp_and_validators: List[
        Tuple[Type["BaseNlpParser"], Type["BaseNlpParser"]]
    ]
) -> str:
    """
    Returns the text for a processor list: two lines per
    ``(NLP class, validator class)`` tuple.

    The first line of each pair names the NLP class and its
    ``procdef_<name>`` section; the second names the validator class and
    the ``procdef_validate_<name>`` section. In both, ``<name>`` is the
    lower-cased ``classname()`` of the NLP class (not of the validator).

    Args:
        nlp_and_validators: list of ``(NLP class, validator class)`` tuples

    Returns:
        all lines joined by newlines, with no trailing newline (an empty
        string if the list is empty)
    """
    entries = []  # type: List[str]
    for parser_cls, validator_cls in nlp_and_validators:
        suffix = parser_cls.classname().lower()
        entries.append(f" {parser_cls.classname()} procdef_{suffix}")
        entries.append(
            f" {validator_cls.classname()} procdef_validate_{suffix}"
        )
    return "\n".join(entries)
# -------------------------------------------------------------------------
# Quasi-constants
# -------------------------------------------------------------------------
for_docker = running_under_docker()
destdb = "DESTINATION_DATABASE"
hashphrase = "doesnotmatter"
if_clin_docs = "INPUT_FIELD_CLINICAL_DOCUMENTS"
if_prog_notes = "INPUT_FIELD_PROGRESS_NOTES"
inputfields = f"{if_clin_docs}\n" f" {if_prog_notes}"
truncate_text_at = "32766"
my_env = "MY_ENV_SECTION"
my_src_db = "SOURCE_DATABASE"
my_cloud = "my_uk_cloud_service"
ridfield = "RID_FIELD"
tridfield = "TRID_FIELD"
nlp_input_terminator = "END_OF_TEXT_FOR_NLP"
nlp_output_terminator = "END_OF_NLP_OUTPUT_RECORD"
procdefs_biochemistry = _make_module_procdef_block(
ALL_BIOCHEMISTRY_NLP_AND_VALIDATORS
)
procdefs_clinical = _make_module_procdef_block(
ALL_CLINICAL_NLP_AND_VALIDATORS
)
procdefs_cognitive = _make_module_procdef_block(
ALL_COGNITIVE_NLP_AND_VALIDATORS
)
procdefs_haematology = _make_module_procdef_block(
ALL_HAEMATOLOGY_NLP_AND_VALIDATORS
)
procdefs_substance_misuse = _make_module_procdef_block(
ALL_SUBSTANCE_MISUSE_NLP_AND_VALIDATORS
)
proclist_biochemistry = _make_proclist(ALL_BIOCHEMISTRY_NLP_AND_VALIDATORS)
proclist_clinical = _make_proclist(ALL_CLINICAL_NLP_AND_VALIDATORS)
proclist_cognitive = _make_proclist(ALL_COGNITIVE_NLP_AND_VALIDATORS)
proclist_haematology = _make_proclist(ALL_HAEMATOLOGY_NLP_AND_VALIDATORS)
proclist_substance_misuse = _make_proclist(
ALL_SUBSTANCE_MISUSE_NLP_AND_VALIDATORS
)
if EnvVar.GENERATING_CRATE_DOCS in os.environ:
nlp_prog_dir = "/path/to/crate_anon/nlp_manager/compiled_nlp_classes"
else:
this_dir = os.path.abspath(
os.path.dirname(__file__)
) # crate_anon/nlp_manager
nlp_prog_dir = os.path.join(this_dir, "compiled_nlp_classes")
if for_docker:
# See crate.Dockerfile
gate_home = "/crate/gate"
kcl_pharmacotherapy_dir = "/crate/brc-gate-pharmacotherapy"
cloud_request_data_dir = "/crate/tmp/clouddata"
gate_plugin_file = (
"/crate/src/crate_anon/nlp_manager/specimen_gate_plugin_file.ini"
)
else:
gate_home = "/path/to/GATE_Developer_8.6.1"
kcl_pharmacotherapy_dir = "/path/to/brc-gate-pharmacotherapy"
cloud_request_data_dir = "/srv/crate/clouddata"
gate_plugin_file = "/path/to/specimen_gate_plugin_file.ini"
_DA = AnonymiseConfigDefaults
# -------------------------------------------------------------------------
# The demo config itself
# -------------------------------------------------------------------------
# noinspection HttpUrlsUsage
return f"""# Configuration file for CRATE NLP manager (crate_nlp).
# Version {CRATE_VERSION} ({CRATE_VERSION_DATE}).
#
# PLEASE SEE THE HELP at https://crateanon.readthedocs.io/
# Using defaults for Docker environment: {for_docker}
# =============================================================================
# A. Individual NLP definitions
# =============================================================================
# - referred to by the NLP manager's command-line arguments
# - You are likely to need to alter these (particularly the bits in capital
# letters) to refer to your own database(s).
# -----------------------------------------------------------------------------
# GATE people-and-places demo
# -----------------------------------------------------------------------------
[{NlpConfigPrefixes.NLPDEF}:gate_name_location_demo]
{NlpDefConfigKeys.INPUTFIELDDEFS} =
{inputfields}
{NlpDefConfigKeys.PROCESSORS} =
GATE procdef_gate_name_location
{NlpDefConfigKeys.PROGRESSDB} = {destdb}
{NlpDefConfigKeys.HASHPHRASE} = {hashphrase}
# -----------------------------------------------------------------------------
# KConnect (Bio-YODIE) disease-finding GATE app
# -----------------------------------------------------------------------------
[{NlpConfigPrefixes.NLPDEF}:gate_kconnect_diseases]
{NlpDefConfigKeys.INPUTFIELDDEFS} =
{inputfields}
{NlpDefConfigKeys.PROCESSORS} =
GATE procdef_gate_kconnect
{NlpDefConfigKeys.PROGRESSDB} = {destdb}
{NlpDefConfigKeys.HASHPHRASE} = {hashphrase}
# -----------------------------------------------------------------------------
# KCL Lewy body dementia GATE app
# -----------------------------------------------------------------------------
[{NlpConfigPrefixes.NLPDEF}:gate_kcl_lbd]
{NlpDefConfigKeys.INPUTFIELDDEFS} =
{inputfields}
{NlpDefConfigKeys.PROCESSORS} =
GATE procdef_gate_kcl_lbda
{NlpDefConfigKeys.PROGRESSDB} = {destdb}
{NlpDefConfigKeys.HASHPHRASE} = {hashphrase}
# -----------------------------------------------------------------------------
# KCL pharmacotherapy GATE app
# -----------------------------------------------------------------------------
[{NlpConfigPrefixes.NLPDEF}:gate_kcl_pharmacotherapy]
{NlpDefConfigKeys.INPUTFIELDDEFS} =
{inputfields}
{NlpDefConfigKeys.PROCESSORS} =
GATE procdef_gate_pharmacotherapy
{NlpDefConfigKeys.PROGRESSDB} = {destdb}
{NlpDefConfigKeys.HASHPHRASE} = {hashphrase}
# -----------------------------------------------------------------------------
# Medex-UIMA medication-finding app
# -----------------------------------------------------------------------------
[{NlpConfigPrefixes.NLPDEF}:medex_medications]
{NlpDefConfigKeys.INPUTFIELDDEFS} =
{inputfields}
{NlpDefConfigKeys.PROCESSORS} =
Medex procdef_medex_medications
{NlpDefConfigKeys.PROGRESSDB} = {destdb}
{NlpDefConfigKeys.HASHPHRASE} = {hashphrase}
# -----------------------------------------------------------------------------
# CRATE number-finding Python regexes
# -----------------------------------------------------------------------------
[{NlpConfigPrefixes.NLPDEF}:crate_biomarkers]
{NlpDefConfigKeys.INPUTFIELDDEFS} =
{inputfields}
{NlpDefConfigKeys.PROCESSORS} =
# -------------------------------------------------------------------------
# Biochemistry
# -------------------------------------------------------------------------
{proclist_biochemistry}
# -------------------------------------------------------------------------
# Clinical
# -------------------------------------------------------------------------
{proclist_clinical}
# -------------------------------------------------------------------------
# Cognitive
# -------------------------------------------------------------------------
{proclist_cognitive}
# -------------------------------------------------------------------------
# Haematology
# -------------------------------------------------------------------------
{proclist_haematology}
# -------------------------------------------------------------------------
# Substance misuse
# -------------------------------------------------------------------------
{proclist_substance_misuse}
{NlpDefConfigKeys.PROGRESSDB} = {destdb}
{NlpDefConfigKeys.HASHPHRASE} = {hashphrase}
# {NlpDefConfigKeys.TRUNCATE_TEXT_AT} = {truncate_text_at}
# {NlpDefConfigKeys.RECORD_TRUNCATED_VALUES} = False
{NlpDefConfigKeys.MAX_ROWS_BEFORE_COMMIT} = {_DA.MAX_ROWS_BEFORE_COMMIT}
{NlpDefConfigKeys.MAX_BYTES_BEFORE_COMMIT} = {_DA.MAX_BYTES_BEFORE_COMMIT}
# -----------------------------------------------------------------------------
# Cloud NLP demo
# -----------------------------------------------------------------------------
[{NlpConfigPrefixes.NLPDEF}:cloud_nlp_demo]
{NlpDefConfigKeys.INPUTFIELDDEFS} =
{inputfields}
{NlpDefConfigKeys.PROCESSORS} =
Cloud procdef_cloud_crp
{NlpDefConfigKeys.PROGRESSDB} = {destdb}
{NlpDefConfigKeys.HASHPHRASE} = {hashphrase}
{NlpDefConfigKeys.CLOUD_CONFIG} = {my_cloud}
{NlpDefConfigKeys.CLOUD_REQUEST_DATA_DIR} = {cloud_request_data_dir}
# =============================================================================
# B. NLP processor definitions
# =============================================================================
# - You're likely to have to modify the destination databases these point to,
# but otherwise you can probably leave them as they are.
# -----------------------------------------------------------------------------
# Specimen CRATE regular expression processor definitions
# -----------------------------------------------------------------------------
# Most of these are very simple, and just require a destination database
# (as a cross-reference to a database section within this file) and a
# destination table.
# Biochemistry
{procdefs_biochemistry}
# Clinical
{procdefs_clinical}
# Cognitive
{procdefs_cognitive}
# Haematology
{procdefs_haematology}
# Substance misuse
{procdefs_substance_misuse}
# -----------------------------------------------------------------------------
# Specimen GATE demo people/places processor definition
# -----------------------------------------------------------------------------
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Define the processor
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[{NlpConfigPrefixes.PROCESSOR}:procdef_gate_name_location]
{ProcessorConfigKeys.DESTDB} = {destdb}
{ProcessorConfigKeys.OUTPUTTYPEMAP} =
Person output_person
Location output_location
{ProcessorConfigKeys.PROGARGS} =
java
-classpath "{{NLPPROGDIR}}"{{OS_PATHSEP}}"{{GATE_HOME}}/lib/*"
-Dgate.home="{{GATE_HOME}}"
{GATE_PIPELINE_CLASSNAME}
--gate_app "{{GATE_HOME}}/plugins/ANNIE/ANNIE_with_defaults.gapp"
--pluginfile "{{GATE_PLUGIN_FILE}}"
--annotation Person
--annotation Location
--input_terminator {nlp_input_terminator}
--output_terminator {nlp_output_terminator}
--log_tag {{NLPLOGTAG}}
--verbose
{ProcessorConfigKeys.PROGENVSECTION} = {my_env}
{ProcessorConfigKeys.INPUT_TERMINATOR} = {nlp_input_terminator}
{ProcessorConfigKeys.OUTPUT_TERMINATOR} = {nlp_output_terminator}
# {ProcessorConfigKeys.MAX_EXTERNAL_PROG_USES} = 1000
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Define the output tables used by this GATE processor
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[{NlpConfigPrefixes.OUTPUT}:output_person]
{NlpOutputConfigKeys.DESTTABLE} = person
{NlpOutputConfigKeys.RENAMES} =
firstName firstname
{NlpOutputConfigKeys.DESTFIELDS} =
rule VARCHAR(100) Rule used to find this person (e.g. TitleFirstName, PersonFull)
firstname VARCHAR(100) First name
surname VARCHAR(100) Surname
gender VARCHAR(7) Gender (e.g. male, female, unknown)
kind VARCHAR(100) Kind of name (e.g. personName, fullName)
# ... longest gender: "unknown" (7)
{NlpOutputConfigKeys.INDEXDEFS} =
firstname 64
surname 64
[{NlpConfigPrefixes.OUTPUT}:output_location]
{NlpOutputConfigKeys.DESTTABLE} = location
{NlpOutputConfigKeys.RENAMES} =
locType loctype
{NlpOutputConfigKeys.DESTFIELDS} =
rule VARCHAR(100) Rule used (e.g. Location1)
loctype VARCHAR(100) Location type (e.g. city)
{NlpOutputConfigKeys.INDEXDEFS} =
rule 100
loctype 100
# -----------------------------------------------------------------------------
# Specimen Sheffield/KCL KConnect (Bio-YODIE) processor definition
# -----------------------------------------------------------------------------
# https://gate.ac.uk/applications/bio-yodie.html
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Define the processor
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[{NlpConfigPrefixes.PROCESSOR}:procdef_gate_kconnect]
{ProcessorConfigKeys.DESTDB} = {destdb}
{ProcessorConfigKeys.OUTPUTTYPEMAP} =
Disease_or_Syndrome output_disease_or_syndrome
{ProcessorConfigKeys.PROGARGS} =
java
-classpath "{{NLPPROGDIR}}"{{OS_PATHSEP}}"{{GATE_HOME}}/lib/*"
-Dgate.home="{{GATE_HOME}}"
{GATE_PIPELINE_CLASSNAME}
--gate_app "{{KCONNECTDIR}}/main-bio/main-bio.xgapp"
--pluginfile "{{GATE_PLUGIN_FILE}}"
--annotation Disease_or_Syndrome
--input_terminator {nlp_input_terminator}
--output_terminator {nlp_output_terminator}
--log_tag {{NLPLOGTAG}}
--suppress_gate_stdout
--verbose
{ProcessorConfigKeys.PROGENVSECTION} = {my_env}
{ProcessorConfigKeys.INPUT_TERMINATOR} = {nlp_input_terminator}
{ProcessorConfigKeys.OUTPUT_TERMINATOR} = {nlp_output_terminator}
# {ProcessorConfigKeys.MAX_EXTERNAL_PROG_USES} = 1000
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Define the output tables used by this GATE processor
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[{NlpConfigPrefixes.OUTPUT}:output_disease_or_syndrome]
{NlpOutputConfigKeys.DESTTABLE} = kconnect_diseases
{NlpOutputConfigKeys.RENAMES} =
Experiencer experiencer
Negation negation
PREF pref
STY sty
TUI tui
Temporality temporality
VOCABS vocabs
{NlpOutputConfigKeys.DESTFIELDS} =
# Found by manual inspection of KConnect/Bio-YODIE output from the GATE console:
experiencer VARCHAR(100) Who experienced it; e.g. "Patient", "Other"
negation VARCHAR(100) Was it negated or not; e.g. "Affirmed", "Negated"
pref VARCHAR(100) PREFferred name; e.g. "Rheumatic gout"
sty VARCHAR(100) Semantic Type (STY) [semantic type name]; e.g. "Disease or Syndrome"
tui VARCHAR(4) Type Unique Identifier (TUI) [semantic type identifier]; 4 characters; https://www.ncbi.nlm.nih.gov/books/NBK9679/; e.g. "T047"
temporality VARCHAR(100) Occurrence in time; e.g. "Recent", "historical", "hypothetical"
vocabs VARCHAR(255) List of UMLS vocabularies; e.g. "AIR,MSH,NDFRT,MEDLINEPLUS,NCI,LNC,NCI_FDA,NCI,MTH,AIR,ICD9CM,LNC,SNOMEDCT_US,LCH_NW,HPO,SNOMEDCT_US,ICD9CM,SNOMEDCT_US,COSTAR,CST,DXP,QMR,OMIM,OMIM,AOD,CSP,NCI_NCI-GLOSS,CHV"
inst VARCHAR(8) Looks like a Concept Unique Identifier (CUI); 1 letter then 7 digits; e.g. "C0003873"
inst_full VARCHAR(255) Looks like a URL to a CUI; e.g. "http://linkedlifedata.com/resource/umls/id/C0003873"
language VARCHAR(100) Language; e.g. ""; ?will look like "ENG" for English? See https://www.nlm.nih.gov/research/umls/implementation_resources/query_diagrams/er1.html
tui_full VARCHAR(255) TUI (?); e.g. "http://linkedlifedata.com/resource/semanticnetwork/id/T047"
{NlpOutputConfigKeys.INDEXDEFS} =
pref 100
sty 100
tui 4
inst 8
# -----------------------------------------------------------------------------
# Specimen KCL GATE pharmacotherapy processor definition
# -----------------------------------------------------------------------------
# https://github.com/KHP-Informatics/brc-gate-pharmacotherapy
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Define the processor
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[{NlpConfigPrefixes.PROCESSOR}:procdef_gate_pharmacotherapy]
{ProcessorConfigKeys.DESTDB} = {destdb}
{ProcessorConfigKeys.OUTPUTTYPEMAP} =
Prescription output_prescription
{ProcessorConfigKeys.PROGARGS} =
java
-classpath "{{NLPPROGDIR}}"{{OS_PATHSEP}}"{{GATE_HOME}}/lib/*"
-Dgate.home="{{GATE_HOME}}"
{GATE_PIPELINE_CLASSNAME}
--gate_app "{{GATE_PHARMACOTHERAPY_DIR}}/application.xgapp"
--pluginfile "{{GATE_PLUGIN_FILE}}"
--include_set Output
--annotation Prescription
--input_terminator {nlp_input_terminator}
--output_terminator {nlp_output_terminator}
--log_tag {{NLPLOGTAG}}
--suppress_gate_stdout
--show_contents_on_crash
{ProcessorConfigKeys.PROGENVSECTION} = {my_env}
{ProcessorConfigKeys.INPUT_TERMINATOR} = {nlp_input_terminator}
{ProcessorConfigKeys.OUTPUT_TERMINATOR} = {nlp_output_terminator}
# {ProcessorConfigKeys.MAX_EXTERNAL_PROG_USES} = 1000
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Define the output tables used by this GATE processor
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[{NlpConfigPrefixes.OUTPUT}:output_prescription]
{NlpOutputConfigKeys.DESTTABLE} = medications_gate
{NlpOutputConfigKeys.RENAMES} =
drug-type drug_type
dose-value dose_value
dose-unit dose_unit
dose-multiple dose_multiple
Directionality directionality
Experiencer experiencer
"Length of Time" length_of_time
Temporality temporality
"Unit of Time" unit_of_time
{NlpOutputConfigKeys.NULL_LITERALS} =
null
""
{NlpOutputConfigKeys.DESTFIELDS} =
# Found by (a) manual inspection of BRC GATE pharmacotherapy output from
# the GATE console; (b) inspection of
# application-resources/schemas/Prescription.xml
# Note preference for DECIMAL over FLOAT/REAL; see
# https://stackoverflow.com/questions/1056323
# Note that not all annotations appear for all texts. Try e.g.:
# Please start haloperidol 5mg tds.
# I suggest you start haloperidol 5mg tds for one week.
rule VARCHAR(100) Rule yielding this drug. Not in XML but is present in a subset: e.g. "weanOff"; max length unclear
drug VARCHAR(200) Drug name. Required string; e.g. "haloperidol"; max length 47 from "wc -L BNF_generic.lst", 134 from BNF_trade.lst
drug_type VARCHAR(100) Type of drug name. Required string; from "drug-type"; e.g. "BNF_generic"; ?length of longest drug ".lst" filename
dose VARCHAR(100) Dose text. Required string; e.g. "5mg"; max length unclear
dose_value DECIMAL Numerical dose value. Required numeric; from "dose-value"; "double" in the XML but DECIMAL probably better; e.g. 5.0
dose_unit VARCHAR(100) Text of dose units. Required string; from "dose-unit"; e.g. "mg"; max length unclear
dose_multiple INT Dose count (multiple). Required integer; from "dose-multiple"; e.g. 1
route VARCHAR(7) Route of administration. Required string; one of: "oral", "im", "iv", "rectal", "sc", "dermal", "unknown"
status VARCHAR(10) Change in drug status. Required; one of: "start", "continuing", "stop"
tense VARCHAR(7) Tense in which drug is referred to. Required; one of: "past", "present"
date VARCHAR(100) ?. Optional string; max length unclear
directionality VARCHAR(100) ?. Optional string; max length unclear
experiencer VARCHAR(100) Person experiencing the drug-related event. Optional string; e.g. "Patient"
frequency DECIMAL Frequency (times per <time_unit>). Optional numeric; "double" in the XML but DECIMAL probably better
interval DECIMAL The n in "every n <time_unit>s" (1 for "every <time_unit>"). Optional numeric; "double" in the XML but DECIMAL probably better
length_of_time VARCHAR(100) ?. Optional string; from "Length of Time"; max length unclear
temporality VARCHAR(100) ?. Optional string; e.g. "Recent", "Historical"
time_unit VARCHAR(100) Unit of time (see frequency, interval). Optional string; from "time-unit"; e.g. "day"; max length unclear
unit_of_time VARCHAR(100) ?. Optional string; from "Unit of Time"; max length unclear
when VARCHAR(100) ?. Optional string; max length unclear
{NlpOutputConfigKeys.INDEXDEFS} =
rule 100
drug 200
route 7
status 10
tense 7
# -----------------------------------------------------------------------------
# Specimen KCL Lewy Body Diagnosis Application (LBDA) processor definition
# -----------------------------------------------------------------------------
# https://github.com/KHP-Informatics/brc-gate-LBD
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Define the processor
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[{NlpConfigPrefixes.PROCESSOR}:procdef_gate_kcl_lbda]
# "cDiagnosis" is the "confirmed diagnosis" field, as d/w Jyoti Jyoti
# 2018-03-20; see also README.md. This appears in the "Automatic" and the
# unnamed set. There is also a near-miss one, "DiagnosisAlmost", which
# appears in the unnamed set.
# "Mr Jones has Lewy body dementia."
# -> DiagnosisAlmost
# "Mr Jones has a diagnosis of Lewy body dementia."
# -> DiagnosisAlmost, cDiagnosis
# Note that we must use lower case in the outputtypemap.
{ProcessorConfigKeys.DESTDB} = {destdb}
{ProcessorConfigKeys.OUTPUTTYPEMAP} =
cDiagnosis output_lbd_diagnosis
DiagnosisAlmost output_lbd_diagnosis
{ProcessorConfigKeys.PROGARGS} =
java
-classpath "{{NLPPROGDIR}}"{{OS_PATHSEP}}"{{GATE_HOME}}/lib/*"
-Dgate.home="{{GATE_HOME}}"
{GATE_PIPELINE_CLASSNAME}
--gate_app "{{KCL_LBDA_DIR}}/application.xgapp"
--pluginfile "{{GATE_PLUGIN_FILE}}"
--set_annotation "" DiagnosisAlmost
--set_annotation Automatic cDiagnosis
--input_terminator {nlp_input_terminator}
--output_terminator {nlp_output_terminator}
--log_tag {{NLPLOGTAG}}
--suppress_gate_stdout
--verbose
{ProcessorConfigKeys.PROGENVSECTION} = {my_env}
{ProcessorConfigKeys.INPUT_TERMINATOR} = {nlp_input_terminator}
{ProcessorConfigKeys.OUTPUT_TERMINATOR} = {nlp_output_terminator}
# {ProcessorConfigKeys.MAX_EXTERNAL_PROG_USES} = 1000
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Define the output tables used by this GATE processor
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[{NlpConfigPrefixes.OUTPUT}:output_lbd_diagnosis]
{NlpOutputConfigKeys.DESTTABLE} = lewy_body_dementia_gate
{NlpOutputConfigKeys.NULL_LITERALS} =
null
""
{NlpOutputConfigKeys.DESTFIELDS} =
# Found by
# (a) manual inspection of output from the GATE Developer console:
# - e.g. {{rule=Includefin, text=Lewy body dementia}}
# (b) inspection of contents:
# - run a Cygwin shell
# - find . -type f -exec grep cDiagnosis -l {{}} \\;
# - 3 hits:
# ./application-resources/jape/DiagnosisExclude2.jape
# ... part of the "Lewy"-detection apparatus
# ./application-resources/jape/text-feature.jape
# ... adds "text" annotation to cDiagnosis Token
# ./application.xgapp
# ... in annotationTypes
# On that basis:
rule VARCHAR(100) Rule that generated the hit.
text VARCHAR(200) Text that matched the rule.
{NlpOutputConfigKeys.INDEXDEFS} =
rule 100
text 200
# -----------------------------------------------------------------------------
# Specimen MedEx processor definition
# -----------------------------------------------------------------------------
# https://sbmi.uth.edu/ccb/resources/medex.htm
[{NlpConfigPrefixes.PROCESSOR}:procdef_medex_medications]
{ProcessorConfigKeys.DESTDB} = {destdb}
{ProcessorConfigKeys.DESTTABLE} = medications_medex
{ProcessorConfigKeys.PROGARGS} =
java
-classpath {{NLPPROGDIR}}:{{MEDEXDIR}}/bin:{{MEDEXDIR}}/lib/*
-Dfile.encoding=UTF-8
CrateMedexPipeline
-lt {{NLPLOGTAG}}
-v -v
# ... other arguments are added by the code
{ProcessorConfigKeys.PROGENVSECTION} = {my_env}
# =============================================================================
# C. Environment variable definitions
# =============================================================================
# - You'll need to modify this according to your local configuration.
[{NlpConfigPrefixes.ENV}:{my_env}]
GATE_HOME = {gate_home}
GATE_PHARMACOTHERAPY_DIR = {kcl_pharmacotherapy_dir}
GATE_PLUGIN_FILE = {gate_plugin_file}
KCL_LBDA_DIR = /path/to/brc-gate-LBD/Lewy_Body_Diagnosis
KCONNECTDIR = /path/to/yodie-pipeline-1-2-umls-only
MEDEXDIR = /path/to/Medex_UIMA_1.3.6
NLPPROGDIR = {nlp_prog_dir}
OS_PATHSEP = {os.pathsep}
# =============================================================================
# D. Input field definitions
# =============================================================================
[{NlpConfigPrefixes.INPUT}:{if_clin_docs}]
{InputFieldConfigKeys.SRCDB} = {my_src_db}
{InputFieldConfigKeys.SRCTABLE} = EXTRACTED_CLINICAL_DOCUMENTS
{InputFieldConfigKeys.SRCPKFIELD} = DOCUMENT_PK
{InputFieldConfigKeys.SRCFIELD} = DOCUMENT_TEXT
{InputFieldConfigKeys.SRCDATETIMEFIELD} = DOCUMENT_DATE
{InputFieldConfigKeys.COPYFIELDS} =
{ridfield}
{tridfield}
{InputFieldConfigKeys.INDEXED_COPYFIELDS} =
{ridfield}
{tridfield}
# {InputFieldConfigKeys.DEBUG_ROW_LIMIT} = 0
[{NlpConfigPrefixes.INPUT}:{if_prog_notes}]
{InputFieldConfigKeys.SRCDB} = {my_src_db}
{InputFieldConfigKeys.SRCTABLE} = PROGRESS_NOTES
{InputFieldConfigKeys.SRCPKFIELD} = PN_PK
{InputFieldConfigKeys.SRCFIELD} = PN_TEXT
{InputFieldConfigKeys.SRCDATETIMEFIELD} = PN_DATE
{InputFieldConfigKeys.COPYFIELDS} =
{ridfield}
{tridfield}
{InputFieldConfigKeys.INDEXED_COPYFIELDS} =
{ridfield}
{tridfield}
# =============================================================================
# E. Database definitions, each in its own section
# =============================================================================
[{NlpConfigPrefixes.DATABASE}:{my_src_db}]
{DatabaseConfigKeys.URL} = mysql+mysqldb://anontest:XXX@127.0.0.1:3306/anonymous_output?charset=utf8
[{NlpConfigPrefixes.DATABASE}:{destdb}]
{DatabaseConfigKeys.URL} = mysql+mysqldb://anontest:XXX@127.0.0.1:3306/anonymous_output?charset=utf8
# =============================================================================
# F. Information for using cloud-based NLP
# =============================================================================
[{NlpConfigPrefixes.CLOUD}:{my_cloud}]
{CloudNlpConfigKeys.CLOUD_URL} = https://your_url
{CloudNlpConfigKeys.USERNAME} = your_username
{CloudNlpConfigKeys.PASSWORD} = your_password
{CloudNlpConfigKeys.WAIT_ON_CONN_ERR} = {DEFAULT_CLOUD_WAIT_ON_CONN_ERR_S}
{CloudNlpConfigKeys.MAX_CONTENT_LENGTH} = {DEFAULT_CLOUD_MAX_CONTENT_LENGTH}
{CloudNlpConfigKeys.MAX_RECORDS_PER_REQUEST} = {DEFAULT_CLOUD_MAX_RECORDS_PER_REQUEST}
{CloudNlpConfigKeys.LIMIT_BEFORE_COMMIT} = {DEFAULT_CLOUD_LIMIT_BEFORE_COMMIT}
{CloudNlpConfigKeys.STOP_AT_FAILURE} = true
{CloudNlpConfigKeys.MAX_TRIES} = {DEFAULT_CLOUD_MAX_TRIES}
{CloudNlpConfigKeys.RATE_LIMIT_HZ} = {DEFAULT_CLOUD_RATE_LIMIT_HZ}
[{NlpConfigPrefixes.PROCESSOR}:procdef_cloud_crp]
{ProcessorConfigKeys.DESTDB} = {destdb}
{ProcessorConfigKeys.DESTTABLE} = crp_test
{ProcessorConfigKeys.PROCESSOR_NAME} = crate_anon.nlp_manager.parse_biochemistry.Crp
{ProcessorConfigKeys.PROCESSOR_FORMAT} = {NlpDefValues.FORMAT_STANDARD}
""" # noqa
# =============================================================================
# Get config filename (from environment variable)
# =============================================================================
def get_nlp_config_filename_or_exit() -> str:
    """
    Returns the config filename, from our environment variable.

    If the variable is unset or empty, print an explanatory message and
    perform a hard exit.

    Returns:
        the (non-empty) value of the environment variable named by
        ``NLP_CONFIG_ENV_VAR``

    Raises:
        SystemExit: with exit code 1, if the variable is missing or empty
    """
    # Use an explicit check rather than "assert": assertions are removed
    # when Python runs with -O, which would let an empty filename through.
    config_filename = os.environ.get(NLP_CONFIG_ENV_VAR)
    if not config_filename:
        print(
            f"You must set the {NLP_CONFIG_ENV_VAR} environment variable "
            f"to point to a CRATE NLP config file, or specify it on the "
            f"command line."
        )
        sys.exit(1)
    return config_filename
# =============================================================================
# Config class
# =============================================================================
[docs]class NlpDefinition:
"""
Class representing an NLP master definition as read from config file.
An NLP definition represents the combination of
- one or more NLP processors (e.g. "CRATE's C-reactive protein finder")
- one or more input fields in the source database
The NLP definition can therefore be used to say "run this set of NLP
processors over this set of textual fields in my database".
See the documentation for the :ref:`NLP config file <nlp_config>`.
"""
def __init__(self, nlpname: str, logtag: str = "") -> None:
    """
    Read this NLP definition's settings from the config file.

    The config file is the one named by
    :func:`get_nlp_config_filename_or_exit` (which exits the program if it
    cannot supply a filename).

    Args:
        nlpname: config section name for this NLP definition
        logtag: text that may be passed to child processes to identify
            the NLP definition in their log output

    Raises:
        ValueError: re-raised, after a critical log message, if one occurs
            while the configured processors are being created
    """
    # Deliberately imported here rather than at module level: this keeps
    # things simple for classes that derive from NlpParser and use
    # NlpDefinition, which can then import it directly instead of relying
    # on forward references.
    from crate_anon.nlp_manager.all_processors import make_nlp_parser
    from crate_anon.nlp_manager.input_field_config import InputFieldConfig

    self._nlpname = nlpname
    self._logtag = logtag

    log.info(f"Loading config for section: {nlpname}")
    self._config_filename = get_nlp_config_filename_or_exit()

    # Our section of the config file.
    self._cfg = ConfigSection(
        section=full_sectionname(NlpConfigPrefixes.NLPDEF, nlpname),
        filename=self._config_filename,
        case_sensitive=True,
    )
    cfg = self._cfg  # local shorthand for the reads below

    # ---------------------------------------------------------------------
    # Settings belonging to the NLP definition itself
    # ---------------------------------------------------------------------
    # The database cache must exist before get_database() is first called.
    self._databases = {}  # type: Dict[str, DatabaseHolder]
    self._progressdb_name = cfg.opt_str(
        NlpDefConfigKeys.PROGRESSDB, required=True
    )
    self._progdb = self.get_database(self._progressdb_name)
    self._temporary_tablename = cfg.opt_str(
        NlpDefConfigKeys.TEMPORARY_TABLENAME,
        default=DEFAULT_TEMPORARY_TABLENAME,
    )
    self._hashphrase = cfg.opt_str(
        NlpDefConfigKeys.HASHPHRASE, required=True
    )
    self._hasher = HashClass(self._hashphrase)
    self._max_rows_before_commit = cfg.opt_int_positive(
        NlpDefConfigKeys.MAX_ROWS_BEFORE_COMMIT,
        AnonymiseConfigDefaults.MAX_ROWS_BEFORE_COMMIT,
    )
    self._max_bytes_before_commit = cfg.opt_int_positive(
        NlpDefConfigKeys.MAX_BYTES_BEFORE_COMMIT,
        AnonymiseConfigDefaults.MAX_BYTES_BEFORE_COMMIT,
    )
    self._now = get_now_utc_notz_datetime()
    self.truncate_text_at = cfg.opt_int_positive(
        NlpDefConfigKeys.TRUNCATE_TEXT_AT, default=0
    )
    self.record_truncated_values = cfg.opt_bool(
        NlpDefConfigKeys.RECORD_TRUNCATED_VALUES, default=False
    )
    self._cloud_config_name = cfg.opt_str(NlpDefConfigKeys.CLOUD_CONFIG)
    self._cloud_request_data_dir = cfg.opt_str(
        NlpDefConfigKeys.CLOUD_REQUEST_DATA_DIR
    )

    # ---------------------------------------------------------------------
    # Input field definitions
    # ---------------------------------------------------------------------
    self._inputfielddefs = cfg.opt_strlist(
        NlpDefConfigKeys.INPUTFIELDDEFS, required=True, lower=False
    )
    self._inputfieldmap = {}  # type: Dict[str, InputFieldConfig]
    for input_name in self._inputfielddefs:
        # A name listed more than once is only configured once.
        if input_name not in self._inputfieldmap:
            self._inputfieldmap[input_name] = InputFieldConfig(
                self, input_name
            )

    # ---------------------------------------------------------------------
    # NLP processors
    # ---------------------------------------------------------------------
    self._processors = []  # type: List[TableMaker]
    type_name_pairs = cfg.opt_strlist(
        NlpDefConfigKeys.PROCESSORS, required=True, lower=False
    )
    try:
        # The flat list is consumed two items at a time: processor type,
        # then the config name of that processor.
        for processor_type, processor_cfg_name in chunks(type_name_pairs, 2):
            new_processor = make_nlp_parser(
                classname=processor_type,
                nlpdef=self,
                cfg_processor_name=processor_cfg_name,
            )
            self._processors.append(new_processor)
    except ValueError:
        log.critical(f"Bad {NlpDefConfigKeys.PROCESSORS} specification")
        raise

    # ---------------------------------------------------------------------
    # Transaction sizes, for early commit
    # ---------------------------------------------------------------------
    # Maps each session to its TransactionSizeLimiter (created on demand).
    self._transaction_limiters = (
        {}
    )  # type: Dict[Session, TransactionSizeLimiter]

    # ---------------------------------------------------------------------
    # Cloud config (loaded on request, then cached)
    # ---------------------------------------------------------------------
    self._cloudcfg = None  # type: Optional[CloudConfig]
# -------------------------------------------------------------------------
# Basic info
# -------------------------------------------------------------------------
@property
def name(self) -> str:
    """
    The name of this NLP definition, as stored at construction.
    """
    nlpname = self._nlpname
    return nlpname
@property
def logtag(self) -> str:
    """
    The log tag of this NLP definition, as stored at construction. Child
    processes may use it to make their log output more informative.
    """
    tag = self._logtag
    return tag
@property
def now(self) -> datetime.datetime:
    """
    The moment (in UTC) at which this NLP definition was created. Used to
    time-stamp NLP runs.
    """
    created_at = self._now
    return created_at
# -------------------------------------------------------------------------
# Config file
# -------------------------------------------------------------------------
@property
def parser(self) -> ExtendedConfigParser:
    """
    The
    :class:`crate_anon.common.extendedconfigparser.ExtendedConfigParser`
    behind this NLP definition's config section.
    """
    config_section = self._cfg
    return config_section.parser
def get_config_section(self, section: str) -> ConfigSection:
    """
    Obtain a
    :class:`crate_anon.common.extendedconfigparser.ConfigSection` for
    another (possibly different) section, derived from our own section.

    Args:
        section:
            Name of the section wanted.

    Returns:
        the config section object for ``section``
    """
    other = self._cfg.other_section(section)
    return other
def get_env_dict(
    self,
    env_section_name: str,
    parent_env: Optional[Dict[str, str]] = None,
) -> Dict[str, str]:
    """
    Read an operating system environment (a ``variable: value`` mapping)
    from the config file.

    Args:
        env_section_name: config section name, without its environment
            prefix (the full section name is built here)
        parent_env: optional starting point (e.g. parent OS environment)

    Returns:
        a dictionary suitable for use as an OS environment
    """
    # Expand the short name into the full config section name first.
    full_name = full_sectionname(NlpConfigPrefixes.ENV, env_section_name)
    config_parser = self._cfg.parser
    return config_parser.get_env_dict(full_name, parent_env=parent_env)
def get_database(
    self,
    name_and_cfg_section: str,
    with_session: bool = True,
    with_conn: bool = False,
    reflect: bool = False,
) -> DatabaseHolder:
    """
    Returns a :class:`crate_anon.anonymise.dbholder.DatabaseHolder` from
    the config file, containing information about a database.

    Holders are cached by name. Once a database has been loaded, later
    requests for the same name return the cached holder, so
    ``with_session``, ``with_conn`` and ``reflect`` only take effect on the
    first request for that name.

    Args:
        name_and_cfg_section:
            string that is the name of the database, and also the config
            file section name describing the database
        with_session: create an SQLAlchemy Session?
        with_conn: create an SQLAlchemy connection (via an Engine)?
        reflect: read the database structure (when required)?

    Returns:
        the (possibly cached) database holder

    Raises:
        ValueError: if the database name is longer than
            ``MAX_SQL_FIELD_LEN`` characters
    """
    if name_and_cfg_section in self._databases:
        return self._databases[name_and_cfg_section]
    # Raise explicitly rather than "assert": this validates user-supplied
    # configuration, and assertions vanish when Python runs with -O.
    if len(name_and_cfg_section) > MAX_SQL_FIELD_LEN:
        raise ValueError(
            f"Database name {name_and_cfg_section!r} is too long "
            f"(maximum length is {MAX_SQL_FIELD_LEN} characters)"
        )
    dbsection = full_sectionname(
        NlpConfigPrefixes.DATABASE, name_and_cfg_section
    )
    db = self.parser.get_database(
        dbsection,
        with_session=with_session,
        with_conn=with_conn,
        reflect=reflect,
    )
    self._databases[name_and_cfg_section] = db
    return db
# -------------------------------------------------------------------------
# Hashing
# -------------------------------------------------------------------------
def hash(self, text: str) -> str:
    """
    Hash some text with the hasher belonging to this NLP definition.

    The hash is intended to be stored in the (secret) progress database,
    so that later changes in the source records can be detected.

    Args:
        text: text (typically from the source database) to be hashed

    Returns:
        the hashed value
    """
    hasher = self._hasher
    return hasher.hash(text)
# -------------------------------------------------------------------------
# Database
# -------------------------------------------------------------------------
@property
def temporary_tablename(self) -> str:
    """
    The name to use for temporary tables.

    See the documentation for the :ref:`NLP config file <nlp_config>`.
    """
    tablename = self._temporary_tablename
    return tablename
def set_echo(self, echo: bool) -> None:
    """
    Set the SQLAlchemy ``echo`` flag (to echo SQL) on the engine of the
    progress database and of every database this NLP definition has
    loaded so far.

    As a side effect, all handlers are removed from two SQLAlchemy engine
    loggers.

    Args:
        echo: the new ``echo`` setting
    """
    self._progdb.engine.echo = echo
    for holder in self._databases.values():
        holder.engine.echo = echo

    # SQLAlchemy responds by attaching an additional log handler of its
    # own, which messes up our logging; strip the handlers again.
    sqlalchemy_lognames = (
        "sqlalchemy.engine.base.Engine",
        "sqlalchemy.engine.base.OptionEngine",
    )
    for logname in sqlalchemy_lognames:
        logging.getLogger(logname).handlers = []
@property
def progressdb_session(self) -> Session:
    """
    The SQLAlchemy ORM :class:`Session` of the progress database.
    """
    progress_db = self._progdb
    return progress_db.session
@property
def progressdb_engine(self) -> Engine:
    """
    The SQLAlchemy Core :class:`Engine` of the progress database.
    """
    progress_db = self._progdb
    return progress_db.engine
@property
def progressdb_metadata(self) -> MetaData:
    """
    The SQLAlchemy :class:`MetaData` of the progress database.
    """
    progress_db = self._progdb
    return progress_db.metadata
@property
def progdb(self) -> DatabaseHolder:
    """
    The holder object for the progress database.
    """
    progress_db = self._progdb
    return progress_db
def commit_all(self) -> None:
    """
    Execute a COMMIT on the progress database, then on every database
    that this NLP definition currently holds.
    """
    commit = self.commit
    commit(self.progressdb_session)
    for holder in self._databases.values():
        commit(holder.session)
def get_transation_limiter(
    self, session: Session
) -> TransactionSizeLimiter:
    """
    Fetch the transaction limiter for an SQLAlchemy session, creating
    (and remembering) one first if that session has none yet.

    Args:
        session: SQLAlchemy ORM :class:`Session`

    Returns:
        a :class:`crate_anon.common.sql.TransactionSizeLimiter`
    """
    limiters = self._transaction_limiters
    limiter = limiters.get(session)
    if limiter is None:
        # First time we have seen this session: build its limiter using
        # our configured row/byte thresholds.
        limiter = TransactionSizeLimiter(
            session,
            max_rows_before_commit=self._max_rows_before_commit,
            max_bytes_before_commit=self._max_bytes_before_commit,
        )
        limiters[session] = limiter
    return limiter
def notify_transaction(
    self,
    session: Session,
    n_rows: int,
    n_bytes: int,
    force_commit: bool = False,
) -> None:
    """
    Report to the relevant transaction limiter that a transaction has
    taken place on one of our databases. This may trigger a COMMIT.

    Args:
        session: SQLAlchemy ORM :class:`Session` that was used
        n_rows: number of rows inserted
        n_bytes: number of bytes inserted
        force_commit: force a COMMIT?
    """
    self.get_transation_limiter(session).notify(
        n_rows=n_rows, n_bytes=n_bytes, force_commit=force_commit
    )
def commit(self, session: Session) -> None:
    """
    Execute a COMMIT on one specific session, via that session's
    transaction limiter.

    Args:
        session: SQLAlchemy ORM :class:`Session`
    """
    self.get_transation_limiter(session).commit()
# -------------------------------------------------------------------------
# Input fields
# -------------------------------------------------------------------------
@property
def inputfieldconfigs(self) -> Iterable["InputFieldConfig"]:
    """
    All input field configurations used by this NLP definition.

    Returns:
        an iterable (the values of our internal name-to-config mapping) of
        `crate_anon.nlp_manager.input_field_config.InputFieldConfig`
        objects
    """
    configs_by_name = self._inputfieldmap
    return configs_by_name.values()
# -------------------------------------------------------------------------
# NLP processors
# -------------------------------------------------------------------------
@property
def processors(self) -> List["TableMaker"]:
    """
    All NLP processors used by this NLP definition.

    Returns:
        our internal list (not a copy) of processor objects
    """
    all_processors = self._processors
    return all_processors
@property
def noncloud_processors(self) -> List["BaseNlpParser"]:
    """
    The local (non-cloud) NLP processors used by this NLP definition, in
    their configured order.

    Returns:
        a new list of those processors for which ``is_cloud_processor()``
        is false
    """
    local_processors = []  # type: List["BaseNlpParser"]
    for processor in self._processors:
        if processor.is_cloud_processor():
            continue  # cloud-based: not wanted here
        local_processors.append(processor)
    return local_processors
@property
def uses_cloud_processors(self) -> bool:
    """
    Is at least one of our processors cloud-based?
    """
    for processor in self._processors:
        if processor.is_cloud_processor():
            return True
    return False
# -------------------------------------------------------------------------
# NLPRP info
# -------------------------------------------------------------------------
def nlprp_local_processors(
    self, sql_dialect: Optional[str] = None
) -> Dict[str, Any]:
    """
    Returns a draft list of processors as per the NLPRP
    :ref:`list_processors <nlprp_list_processors>` command.

    Only local (non-cloud) processors are included.

    Args:
        sql_dialect: SQL dialect passed through to each processor's
            ``nlprp_processor_info()``; ``None`` by default

    Returns:
        a dictionary with the single key ``NlprpKeys.PROCESSORS``, whose
        value is a list holding each local processor's information
    """
    # NOTE(review): each item is presumably a Dict[str, Any], as the
    # original (malformed) type comment suggested -- confirm against
    # nlprp_processor_info().
    processors = [
        proc.nlprp_processor_info(sql_dialect)
        for proc in self.noncloud_processors
    ]  # type: List[Dict[str, Any]]
    return {NlprpKeys.PROCESSORS: processors}
def nlprp_local_processors_json(
    self,
    indent: int = 4,
    sort_keys: bool = True,
    sql_dialect: Optional[str] = None,
) -> str:
    """
    Returns a formatted JSON string from :meth:`nlprp_local_processors`.
    This is primarily for debugging.

    Args:
        indent: number of spaces for indentation
        sort_keys: sort keys?
        sql_dialect: preferred SQL dialect for ``tabular_schema``, or
            ``None`` for default

    Returns:
        the JSON text
    """
    json_structure = self.nlprp_local_processors(sql_dialect=sql_dialect)
    return json.dumps(json_structure, indent=indent, sort_keys=sort_keys)
# -------------------------------------------------------------------------
# Cloud NLP
# -------------------------------------------------------------------------
def get_cloud_config(self) -> Optional[CloudConfig]:
    """
    Returns the :class:`crate_anon.nlp_manager.cloud_config.CloudConfig`
    object associated with this NLP definition. It is created on the first
    request and cached thereafter.

    Creating it also creates (if necessary) a subdirectory, named after
    this NLP definition, inside the configured cloud request data
    directory.

    Returns:
        the cloud configuration. The return annotation remains
        ``Optional`` for backward compatibility, but when the cloud
        settings are missing or invalid this method raises rather than
        returning ``None``.

    Raises:
        ValueError: if the cloud config name or the cloud request data
            directory is not specified, or if that directory does not
            exist
    """
    if self._cloudcfg is not None:
        return self._cloudcfg  # cached by an earlier call

    our_name = self.name
    if not self._cloud_config_name:
        raise ValueError(
            f"No {NlpDefConfigKeys.CLOUD_CONFIG!r} parameter "
            f"specified for NLP definition {our_name!r}"
        )
    if not self._cloud_request_data_dir:
        raise ValueError(
            f"No {NlpDefConfigKeys.CLOUD_REQUEST_DATA_DIR!r} "
            f"parameter specified for NLP definition {our_name!r}"
        )
    req_root_dir = os.path.abspath(self._cloud_request_data_dir)
    if not os.path.isdir(req_root_dir):
        # The original message stopped before saying what was wrong.
        raise ValueError(
            f"Directory {req_root_dir!r}, specified by config "
            f"parameter {NlpDefConfigKeys.CLOUD_REQUEST_DATA_DIR!r} "
            f"for NLP definition {our_name!r}, does not exist or is not "
            f"a directory"
        )
    # The root directory must already exist; only our own subdirectory is
    # created automatically.
    req_data_dir = os.path.join(req_root_dir, our_name)
    os.makedirs(req_data_dir, exist_ok=True)
    self._cloudcfg = CloudConfig(
        self, name=self._cloud_config_name, req_data_dir=req_data_dir
    )
    return self._cloudcfg
def get_cloud_config_or_raise(self) -> CloudConfig:
    """
    Fetch the :class:`crate_anon.nlp_manager.cloud_config.CloudConfig`
    for this NLP definition, insisting that it is usable.

    Returns:
        the cloud configuration

    Raises:
        ValueError: if there is no cloud configuration, or if it has no
            remote processors
    """
    config = self.get_cloud_config()
    if config is None:
        problem = (
            f"No cloud NLP configuration for NLP definition "
            f"{self.name!r}"
        )
        raise ValueError(problem)
    if not config.remote_processors:
        problem = (
            f"No remote (cloud) processors configured for "
            f"NLP definition {self.name!r}"
        )
        raise ValueError(problem)
    return config