"""
crate_anon/nlp_manager/output_user_config.py
===============================================================================
Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
Created by Rudolf Cardinal (rnc1001@cam.ac.uk).
This file is part of CRATE.
CRATE is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
CRATE is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with CRATE. If not, see <https://www.gnu.org/licenses/>.
===============================================================================
**Define output configuration for GATE NLP applications.**
"""
import ast
import logging
import shlex
from typing import Dict, List, Optional, Tuple

from cardinal_pythonlib.lists import chunks
from cardinal_pythonlib.sql.validation import (
    ensure_valid_field_name,
    ensure_valid_table_name,
    is_sqltype_valid,
)
from cardinal_pythonlib.sqlalchemy.schema import (
    get_sqla_coltype_from_dialect_str,
)
from sqlalchemy.engine.base import Engine
from sqlalchemy.schema import Column, Index

from crate_anon.common.extendedconfigparser import (
    ConfigSection,
    ExtendedConfigParser,
)
from crate_anon.nlp_manager.constants import (
    full_sectionname,
    NlpOutputConfigKeys,
    NlpConfigPrefixes,
)
from crate_anon.nlp_manager.input_field_config import InputFieldConfig
log = logging.getLogger(__name__)
# =============================================================================
# OutputUserConfig
# =============================================================================
class OutputUserConfig:
    """
    Class defining configuration for the output of a given GATE app.

    See the documentation for the :ref:`NLP config file <nlp_config>`.
    """

    def __init__(
        self,
        parser: ExtendedConfigParser,
        cfg_output_name: str,
        schema_required: bool = True,
    ) -> None:
        """
        Read config from a configparser section.

        Args:
            parser:
                :class:`crate_anon.common.extendedconfigparser.ExtendedConfigParser`
            cfg_output_name:
                config file section name suffix -- this is the second of the
                pair of strings in the ``outputtypemap`` part of the GATE NLP
                app config section. See

                - :ref:`NLP config file <nlp_config>`
                - :class:`crate_anon.nlp_manager.parse_gate.Gate`
            schema_required:
                is it required that the user has specified destination
                fields (``destfields``)? Should be True for GATE, False for
                Cloud as the remote processors may have their own schema
                definition. Note that the destination table name is always
                read as a required option, whatever this flag says.

        Raises:
            ValueError:
                if a ``renames`` line does not contain exactly two items; if
                a ``destfields`` line lacks a field name or datatype, has an
                invalid datatype, duplicates another field, or names an
                auto-supplied column; or if ``indexdefs`` is not a sequence
                of (field, length) pairs referring to destination fields
                with a valid length.
        """
        sectionname = full_sectionname(
            NlpConfigPrefixes.OUTPUT, cfg_output_name
        )
        cfg = ConfigSection(section=sectionname, parser=parser)

        # ---------------------------------------------------------------------
        # desttable
        # ---------------------------------------------------------------------

        self._desttable = cfg.opt_str(
            NlpOutputConfigKeys.DESTTABLE, required=True
        )
        ensure_valid_table_name(self._desttable)

        # ---------------------------------------------------------------------
        # renames
        # ---------------------------------------------------------------------

        self._renames = {}  # type: Dict[str, str]
        rename_lines = cfg.opt_strlist(
            NlpOutputConfigKeys.RENAMES, required=False, as_words=False
        )
        for line in rename_lines:
            if not line.strip():
                continue
            # shlex allows annotation names containing spaces to be quoted.
            words = shlex.split(line)
            if len(words) != 2:
                raise ValueError(
                    f"Bad {NlpOutputConfigKeys.RENAMES!r} option in config "
                    f"section {sectionname!r}; line was {line!r} but should "
                    f"have contained two things"
                )
            annotation_name, field_name = words
            ensure_valid_field_name(field_name)
            self._renames[annotation_name] = field_name

        # ---------------------------------------------------------------------
        # null_literals
        # ---------------------------------------------------------------------

        null_literal_lines = cfg.opt_strlist(
            NlpOutputConfigKeys.NULL_LITERALS, required=False, as_words=False
        )
        self._null_literals = []  # type: List[str]
        for line in null_literal_lines:
            self._null_literals.extend(shlex.split(line))

        # ---------------------------------------------------------------------
        # destfields
        # ---------------------------------------------------------------------

        self._destfields = []  # type: List[str]
        self._dest_datatypes = []  # type: List[str]
        # Entries are None for fields defined without a comment.
        self._dest_comments = []  # type: List[Optional[str]]
        dest_field_lines = cfg.opt_strlist(
            NlpOutputConfigKeys.DESTFIELDS,
            required=schema_required,
            as_words=False,
        )
        # ... comments will be removed during that process.
        # If dest_field_lines is empty (as it may be for a Cloud processor)
        # the following block doesn't execute, so the 'dest' attributes remain
        # empty.
        for dfl in dest_field_lines:
            field, datatype, comment = self._split_destfield_line(dfl)
            ensure_valid_field_name(field)
            if not is_sqltype_valid(datatype):
                raise ValueError(f"Invalid datatype for {field}: {datatype}")
            self._destfields.append(field)
            self._dest_datatypes.append(datatype)
            self._dest_comments.append(comment)

        # The user must not redefine columns that are supplied automatically.
        src_fields = [
            c.name for c in InputFieldConfig.get_core_columns_for_dest()
        ]
        for sf in src_fields:
            if sf in self._destfields:
                raise ValueError(
                    f"For section {sectionname}, destination field {sf} is "
                    f"auto-supplied; do not add it manually"
                )

        if len(set(self._destfields)) != len(self._destfields):
            raise ValueError(
                f"Duplicate fields exist in destination fields: "
                f"{self._destfields}"
            )

        # ---------------------------------------------------------------------
        # indexdefs
        # ---------------------------------------------------------------------

        self._indexfields = []  # type: List[str]
        # A length of None means "no explicit index length".
        self._indexlengths = []  # type: List[Optional[int]]
        indexdefs = cfg.opt_strlist(NlpOutputConfigKeys.INDEXDEFS)
        if indexdefs:
            if len(indexdefs) % 2 != 0:
                # Otherwise the final, incomplete pair would surface as an
                # unhelpful IndexError below.
                raise ValueError(
                    f"Bad {NlpOutputConfigKeys.INDEXDEFS!r} option in config "
                    f"section {sectionname!r}: expected (field, length) "
                    f"pairs but got an odd number of items: {indexdefs}"
                )
            for pair in chunks(indexdefs, 2):  # pairs: field, length
                indexfieldname = pair[0]
                lengthstr = pair[1]
                if indexfieldname not in self._destfields:
                    raise ValueError(
                        f"Index field {indexfieldname} not in "
                        f"destination fields {self._destfields}"
                    )
                length = self._parse_index_length(lengthstr)
                self._indexfields.append(indexfieldname)
                self._indexlengths.append(length)

    @staticmethod
    def _split_destfield_line(line: str) -> Tuple[str, str, Optional[str]]:
        """
        Split one ``destfields`` line into its components.

        The line is split on whitespace into at most three parts: field
        name, datatype, and (optionally) a free-text comment that may itself
        contain whitespace.

        Args:
            line: a single field definition line

        Returns:
            tuple: ``field, datatype, comment``, where ``datatype`` has been
            converted to upper case and ``comment`` is ``None`` if absent

        Raises:
            ValueError: if the line has fewer than two parts
        """
        parts = line.split(maxsplit=2)
        if len(parts) < 2:
            # Raise (rather than assert) so that validation survives
            # "python -O" and matches the other config errors.
            raise ValueError(f"Bad field definition line: {line!r}")
        field = parts[0]
        datatype = parts[1].upper()
        comment = parts[2] if len(parts) > 2 else None
        return field, datatype, comment

    @staticmethod
    def _parse_index_length(lengthstr: str) -> Optional[int]:
        """
        Convert the textual index length from ``indexdefs`` to a value.

        Args:
            lengthstr: Python literal as text, e.g. ``"10"`` or ``"None"``

        Returns:
            ``None`` if the literal is ``None``; otherwise the literal
            converted via ``int()``

        Raises:
            ValueError: if the text is not a valid Python literal, or the
                literal cannot be converted to an integer
        """
        try:
            length = ast.literal_eval(lengthstr)
            if length is not None:
                length = int(length)
        except (ValueError, SyntaxError, TypeError) as exc:
            # literal_eval() raises SyntaxError for text such as "1a", and
            # int() raises TypeError for literals such as "[1]"; report all
            # of these uniformly as a configuration error.
            raise ValueError(f"Bad index length: {lengthstr}") from exc
        return length

    @property
    def dest_tablename(self) -> str:
        """
        Returns the name of the destination table.
        """
        return self._desttable

    @property
    def destfields(self) -> List[str]:
        """
        Returns the list of destination fields.
        """
        return self._destfields

    def get_columns(self, engine: Engine) -> List[Column]:
        """
        Return all SQLAlchemy :class:`Column` definitions for the destination
        table.

        Args:
            engine: SQLAlchemy database :class:`Engine`

        Returns:
            list of SQLAlchemy :class:`Column` objects
        """
        # The three lists are built in lockstep by __init__.
        return [
            Column(
                field,
                get_sqla_coltype_from_dialect_str(datatype, engine.dialect),
                comment=comment,
            )
            for field, datatype, comment in zip(
                self._destfields, self._dest_datatypes, self._dest_comments
            )
        ]

    @property
    def indexes(self) -> List[Index]:
        """
        Return all SQLAlchemy :class:`Index` definitions for the destination
        table.

        Returns:
            list of SQLAlchemy :class:`Index` objects
        """
        indexes = []  # type: List[Index]
        for field, length in zip(self._indexfields, self._indexlengths):
            index_name = f"_idx_{field}"
            # Only pass a length through when one was configured.
            kwargs = {"mysql_length": length} if length is not None else {}
            indexes.append(Index(index_name, field, **kwargs))
        return indexes

    @property
    def renames(self) -> Dict[str, str]:
        """
        Return the "rename dictionary": a dictionary mapping GATE annotation
        names to fieldnames in the NLP destination table.

        See

        - ``renames`` in the :ref:`NLP config file <nlp_config>`.
        - :meth:`crate_anon.nlp_manager.parse_gate.Gate.parse`
        """
        return self._renames

    @property
    def null_literals(self) -> List[str]:
        """
        Returns string values from the GATE output that will be interpreted as
        NULL values.

        See

        - ``null_literals`` in the :ref:`NLP config file <nlp_config>`.
        - :meth:`crate_anon.nlp_manager.parse_gate.Gate.parse`.
        """
        return self._null_literals