Source code for crate_anon.nlp_manager.constants

"""
crate_anon/nlp_manager/constants.py

===============================================================================

    Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
    Created by Rudolf Cardinal (rnc1001@cam.ac.uk).

    This file is part of CRATE.

    CRATE is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    CRATE is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with CRATE. If not, see <https://www.gnu.org/licenses/>.

===============================================================================

**Constants for CRATE NLP, including a demo config file.**

"""

from cardinal_pythonlib.hash import HmacMD5Hasher
from sqlalchemy.types import String


# =============================================================================
# Constants
# =============================================================================

DEFAULT_CLOUD_LIMIT_BEFORE_COMMIT = 1000
DEFAULT_CLOUD_MAX_CONTENT_LENGTH = 0  # no limit
DEFAULT_CLOUD_MAX_RECORDS_PER_REQUEST = 1000
DEFAULT_CLOUD_MAX_TRIES = 5
DEFAULT_CLOUD_RATE_LIMIT_HZ = 2
DEFAULT_CLOUD_WAIT_ON_CONN_ERR_S = 180  # in seconds
DEFAULT_REPORT_EVERY_NLP = 500  # low values slow down processing
DEFAULT_TEMPORARY_TABLENAME = "_crate_nlp_temptable"

FN_CRATE_VERSION_FIELD = "_crate_version"  # new in v0.18.53
FN_NLPDEF = "_nlpdef"
FN_PK = "_pk"
FN_SRCDATETIMEFIELD = "_srcdatetimefield"  # new in v0.18.52
FN_SRCDATETIMEVAL = "_srcdatetimeval"  # new in v0.18.52
FN_SRCDB = "_srcdb"
FN_SRCFIELD = "_srcfield"
FN_SRCPKFIELD = "_srcpkfield"
FN_SRCPKSTR = "_srcpkstr"
FN_SRCPKVAL = "_srcpkval"
FN_SRCTABLE = "_srctable"
FN_WHEN_FETCHED = "_when_fetched_utc"  # new in v0.18.53

TRUNCATED_FLAG = "_truncated"  # NOT A FIELD/COLUMN NAME. INTERNAL USE ONLY.

GATE_PIPELINE_CLASSNAME = "CrateGatePipeline"

HashClass = HmacMD5Hasher

MAX_STRING_PK_LENGTH = 64  # trade-off; space versus capability
MAX_SQL_FIELD_LEN = 64
# ... http://dev.mysql.com/doc/refman/5.0/en/identifiers.html
MAX_SEMANTIC_VERSION_STRING_LENGTH = (
    147  # https://github.com/mojombo/semver/issues/79
)
MEDEX_PIPELINE_CLASSNAME = "CrateMedexPipeline"
MEDEX_DATA_READY_SIGNAL = "data_ready"
MEDEX_RESULTS_READY_SIGNAL = "results_ready"

NLP_CONFIG_ENV_VAR = "CRATE_NLP_CONFIG"

SqlTypeDbIdentifier = String(MAX_SQL_FIELD_LEN)
# ... text field used for database names, table names, and field names


# =============================================================================
# Simple classes for string constant collections
# =============================================================================


[docs]class NlpConfigPrefixes: """ Section name prefixes for the NLP config file. """ NLPDEF = "nlpdef" PROCESSOR = "processor" ENV = "env" OUTPUT = "output" INPUT = "input" DATABASE = "database" CLOUD = "cloud"
[docs]class NlpDefConfigKeys: """ Config file keys for NLP definitions. """ INPUTFIELDDEFS = "inputfielddefs" PROCESSORS = "processors" PROGRESSDB = "progressdb" HASHPHRASE = "hashphrase" TEMPORARY_TABLENAME = "temporary_tablename" MAX_ROWS_BEFORE_COMMIT = "max_rows_before_commit" MAX_BYTES_BEFORE_COMMIT = "max_bytes_before_commit" TRUNCATE_TEXT_AT = "truncate_text_at" RECORD_TRUNCATED_VALUES = "record_truncated_values" CLOUD_CONFIG = "cloud_config" CLOUD_REQUEST_DATA_DIR = "cloud_request_data_dir"
[docs]class NlpDefValues: """ Config file values for NLP definitions """ # Since any server with the same output format as CRATE's is compatible, # we call this format standard FORMAT_STANDARD = "Standard" FORMAT_GATE = "GATE"
[docs]class InputFieldConfigKeys: """ Config file keys for input database fields (columns). """ SRCDB = "srcdb" SRCTABLE = "srctable" SRCPKFIELD = "srcpkfield" SRCFIELD = "srcfield" SRCDATETIMEFIELD = "srcdatetimefield" COPYFIELDS = "copyfields" INDEXED_COPYFIELDS = "indexed_copyfields" DEBUG_ROW_LIMIT = "debug_row_limit"
[docs]class ProcessorConfigKeys: """ Config file keys for NLP processors. """ ASSUME_PREFERRED_UNIT = "assume_preferred_unit" DESTDB = "destdb" DESTTABLE = "desttable" OUTPUTTYPEMAP = "outputtypemap" PROGARGS = "progargs" PROGENVSECTION = "progenvsection" INPUT_TERMINATOR = "input_terminator" OUTPUT_TERMINATOR = "output_terminator" MAX_EXTERNAL_PROG_USES = "max_external_prog_uses" PROCESSOR_NAME = "processor_name" PROCESSOR_VERSION = "processor_version" PROCESSOR_FORMAT = "processor_format"
[docs]class NlpOutputConfigKeys: """ Config file keys for output tables from GATE or Cloud NLP processors. """ DESTTABLE = "desttable" RENAMES = "renames" NULL_LITERALS = "null_literals" DESTFIELDS = "destfields" INDEXDEFS = "indexdefs"
[docs]class DatabaseConfigKeys: """ Config file keys for database definitions. """ URL = "url" ECHO = "echo"
[docs]class CloudNlpConfigKeys: """ Config file keys for cloud NLP. """ CLOUD_URL = "cloud_url" VERIFY_SSL = "verify_ssl" COMPRESS = "compress" USERNAME = "username" PASSWORD = "password" WAIT_ON_CONN_ERR = "wait_on_conn_err" MAX_CONTENT_LENGTH = "max_content_length" LIMIT_BEFORE_COMMIT = "limit_before_commit" MAX_RECORDS_PER_REQUEST = "max_records_per_request" STOP_AT_FAILURE = "stop_at_failure" MAX_TRIES = "max_tries" RATE_LIMIT_HZ = "rate_limit_hz" TEST_LENGTH_FUNCTION_SPEED = "test_length_function_speed"
[docs]class GateApiKeys: """ Dictionary keys for the direct API to GATE. See https://cloud.gate.ac.uk/info/help/online-api.html for format of response from processor. The GATE JSON format is: .. code-block:: json { "text":"The text of the document", "entities":{ "SampleAnnotationType1":[ { "indices":[0,3], "feature1":"value1", "feature2":"value2" } ], "SampleAnnotationType2":[ { "indices":[12,15], "feature3":"value3" } ] } } """ ENTITIES = "entities" INDICES = "indices" TEXT = "text"
[docs]class GateResultKeys: """ Dictionary keys to represent GATE results in our NLPRP server. """ TYPE = "type" START = "start" END = "end" SET = "set" FEATURES = "features"
[docs]class GateFieldNames: """ Field (column) names for results from GATE. These match KEY_* strings in ``CrateGatePipeline.java``. """ SET = "_set" TYPE = "_type" ID = "_id" STARTPOS = "_start" ENDPOS = "_end" CONTENT = "_content"
# ============================================================================= # Config helpers # ============================================================================= _ALL_NLPRP_SECTION_PREFIXES = [ v for k, v in NlpConfigPrefixes.__dict__.items() if not k.startswith("_") ] def full_sectionname(section_type: str, section: str) -> str: if section_type in _ALL_NLPRP_SECTION_PREFIXES: return section_type + ":" + section raise ValueError(f"Unrecognised section type: {section_type}")