Source code for crate_anon.nlp_manager.output_user_config

"""
crate_anon/nlp_manager/output_user_config.py

===============================================================================

    Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
    Created by Rudolf Cardinal (rnc1001@cam.ac.uk).

    This file is part of CRATE.

    CRATE is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    CRATE is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with CRATE. If not, see <https://www.gnu.org/licenses/>.

===============================================================================

**Define output configuration for GATE NLP applications.**

"""

import ast
import logging
import shlex
from typing import Dict, List

from cardinal_pythonlib.sql.validation import (
    ensure_valid_field_name,
    ensure_valid_table_name,
    is_sqltype_valid,
)
from cardinal_pythonlib.lists import chunks
from cardinal_pythonlib.sqlalchemy.schema import (
    get_sqla_coltype_from_dialect_str,
)
from sqlalchemy.engine.base import Engine
from sqlalchemy.schema import Column, Index

from crate_anon.common.extendedconfigparser import (
    ConfigSection,
    ExtendedConfigParser,
)
from crate_anon.nlp_manager.constants import (
    full_sectionname,
    NlpOutputConfigKeys,
    NlpConfigPrefixes,
)
from crate_anon.nlp_manager.input_field_config import InputFieldConfig

log = logging.getLogger(__name__)


# =============================================================================
# OutputUserConfig
# =============================================================================


class OutputUserConfig:
    """
    Class defining configuration for the output of a given GATE app.

    The configuration is read once, in the constructor, from one section of
    the NLP config file, and is then exposed through read-only properties
    (destination table name, destination fields, renames, NULL literals) and
    through helpers that build SQLAlchemy column and index definitions.

    See the documentation for the :ref:`NLP config file <nlp_config>`.
    """

    def __init__(
        self,
        parser: ExtendedConfigParser,
        cfg_output_name: str,
        schema_required: bool = True,
    ) -> None:
        """
        Read config from a configparser section.

        Args:
            parser:
                :class:`crate_anon.common.extendedconfigparser.ExtendedConfigParser`
            cfg_output_name:
                config file section name suffix -- this is the second of the
                pair of strings in the ``outputtypemap`` part of the GATE NLP
                app config section. See

                - :ref:`NLP config file <nlp_config>`
                - :class:`crate_anon.nlp_manager.parse_gate.Gate`

            schema_required:
                is it required that the user has specified the destination
                fields (``destfields``)? Should be True for Gate, False for
                Cloud as the remote processors may have their own schema
                definition. Note that the destination table name is always
                required by this constructor, regardless of this flag.

        Raises:
            ValueError:
                if a ``renames`` line does not contain exactly two items; if
                a destination field line lacks a name or a datatype; if a
                datatype is not a valid SQL type; if a destination field
                duplicates another or clashes with an auto-supplied core
                column; if the index definitions are not (field, length)
                pairs, refer to an unknown field, or contain a bad length.
                Table/field name validation is delegated to
                ``ensure_valid_table_name`` / ``ensure_valid_field_name``;
                whatever those raise is propagated unchanged.
        """

        sectionname = full_sectionname(
            NlpConfigPrefixes.OUTPUT, cfg_output_name
        )
        cfg = ConfigSection(section=sectionname, parser=parser)

        # ---------------------------------------------------------------------
        # desttable
        # ---------------------------------------------------------------------

        self._desttable = cfg.opt_str(
            NlpOutputConfigKeys.DESTTABLE, required=True
        )
        ensure_valid_table_name(self._desttable)

        # ---------------------------------------------------------------------
        # renames
        # ---------------------------------------------------------------------

        # Maps annotation name -> destination field name.
        self._renames = {}  # type: Dict[str, str]
        rename_lines = cfg.opt_strlist(
            NlpOutputConfigKeys.RENAMES, required=False, as_words=False
        )
        for line in rename_lines:
            if not line.strip():
                continue
            # shlex allows quoted annotation names containing whitespace.
            words = shlex.split(line)
            if len(words) != 2:
                raise ValueError(
                    f"Bad {NlpOutputConfigKeys.RENAMES!r} option in config "
                    f"section {sectionname!r}; line was {line!r} but should "
                    f"have contained two things"
                )
            annotation_name, field_name = words
            ensure_valid_field_name(field_name)
            self._renames[annotation_name] = field_name

        # ---------------------------------------------------------------------
        # null_literals
        # ---------------------------------------------------------------------

        null_literal_lines = cfg.opt_strlist(
            NlpOutputConfigKeys.NULL_LITERALS, required=False, as_words=False
        )
        self._null_literals = []  # type: List[str]
        for line in null_literal_lines:
            # Each line may hold several (possibly quoted) literals.
            self._null_literals += shlex.split(line)

        # ---------------------------------------------------------------------
        # destfields
        # ---------------------------------------------------------------------

        # Three parallel lists, one entry per destination field.
        self._destfields = []  # type: List[str]
        self._dest_datatypes = []  # type: List[str]
        # NB: an entry here is None when the field has no comment.
        self._dest_comments = []  # type: List[str]
        dest_field_lines = cfg.opt_strlist(
            NlpOutputConfigKeys.DESTFIELDS,
            required=schema_required,
            as_words=False,
        )
        # ... comments will be removed during that process.
        # If dest_field_lines is empty (as it may be for a Cloud processor)
        # the following block doesn't execute, so the 'dest' attributes remain
        # empty.
        for dfl in dest_field_lines:
            # Format: <fieldname> <datatype> [<free-text comment>]
            parts = dfl.split(maxsplit=2)
            if len(parts) < 2:
                # Explicit exception rather than "assert": this validates
                # user input and must still be checked under "python -O".
                raise ValueError(f"Bad field definition line: {dfl!r}")
            field = parts[0]
            datatype = parts[1].upper()
            comment = parts[2] if len(parts) > 2 else None
            ensure_valid_field_name(field)
            if not is_sqltype_valid(datatype):
                raise ValueError(f"Invalid datatype for {field}: {datatype}")
            self._destfields.append(field)
            self._dest_datatypes.append(datatype)
            self._dest_comments.append(comment)

        # The user must not redefine columns that are supplied automatically.
        src_fields = [
            c.name for c in InputFieldConfig.get_core_columns_for_dest()
        ]
        for sf in src_fields:
            if sf in self._destfields:
                raise ValueError(
                    f"For section {sectionname}, destination field {sf} is "
                    f"auto-supplied; do not add it manually"
                )

        if len(set(self._destfields)) != len(self._destfields):
            raise ValueError(
                f"Duplicate fields exist in destination fields: "
                f"{self._destfields}"
            )

        # ---------------------------------------------------------------------
        # indexdefs
        # ---------------------------------------------------------------------

        # Two parallel lists, one entry per index; a length of None means
        # that no index length was specified.
        self._indexfields = []  # type: List[str]
        self._indexlengths = []  # type: List[int]
        indexdefs = cfg.opt_strlist(NlpOutputConfigKeys.INDEXDEFS)
        if indexdefs:
            for pair in chunks(indexdefs, 2):  # pairs: field, length
                if len(pair) != 2:
                    # Odd number of items: a field without a length.
                    raise ValueError(
                        f"Bad {NlpOutputConfigKeys.INDEXDEFS!r} option in "
                        f"config section {sectionname!r}; expected "
                        f"(field, length) pairs but found an unpaired "
                        f"item: {pair!r}"
                    )
                indexfieldname, lengthstr = pair
                if indexfieldname not in self._destfields:
                    raise ValueError(
                        f"Index field {indexfieldname} not in "
                        f"destination fields {self._destfields}"
                    )
                try:
                    # literal_eval so that the string "None" becomes None.
                    length = ast.literal_eval(lengthstr)
                    if length is not None:
                        length = int(length)
                except (ValueError, SyntaxError, TypeError) as exc:
                    # literal_eval raises SyntaxError for unparseable text
                    # and ValueError for non-literals; int() raises
                    # TypeError for e.g. a list literal.
                    raise ValueError(
                        f"Bad index length: {lengthstr}"
                    ) from exc
                self._indexfields.append(indexfieldname)
                self._indexlengths.append(length)

    @property
    def dest_tablename(self) -> str:
        """
        Returns the name of the destination table.
        """
        return self._desttable

    @property
    def destfields(self) -> List[str]:
        """
        Returns the list of destination fields.

        This is the internal list itself, not a copy.
        """
        return self._destfields

    def get_columns(self, engine: Engine) -> List[Column]:
        """
        Return all SQLAlchemy :class:`Column` definitions for the destination
        table.

        Each configured destination field yields one column, whose type is
        obtained by converting the configured datatype string for the
        engine's dialect, and whose comment is the configured comment (or
        ``None``).

        Args:
            engine: SQLAlchemy database :class:`Engine`

        Returns:
            list of SQLAlchemy :class:`Column` objects

        """
        return [
            Column(
                field,
                get_sqla_coltype_from_dialect_str(datatype, engine.dialect),
                comment=comment,
            )
            for field, datatype, comment in zip(
                self._destfields, self._dest_datatypes, self._dest_comments
            )
        ]

    @property
    def indexes(self) -> List[Index]:
        """
        Return all SQLAlchemy :class:`Index` definitions for the destination
        table.

        Each index is named ``_idx_<fieldname>``. Where an index length was
        configured, it is passed to :class:`Index` as ``mysql_length``.

        Returns:
            list of SQLAlchemy :class:`Index` objects

        """
        indexes = []  # type: List[Index]
        for field, length in zip(self._indexfields, self._indexlengths):
            # Only pass mysql_length when a length was actually given.
            kwargs = {"mysql_length": length} if length is not None else {}
            indexes.append(Index(f"_idx_{field}", field, **kwargs))
        return indexes

    @property
    def renames(self) -> Dict[str, str]:
        """
        Return the "rename dictionary": a dictionary mapping GATE annotation
        names to fieldnames in the NLP destination table.

        See

        - ``renames`` in the :ref:`NLP config file <nlp_config>`.
        - :meth:`crate_anon.nlp_manager.parse_gate.Gate.parse`
        """
        return self._renames

    @property
    def null_literals(self) -> List[str]:
        """
        Returns string values from the GATE output that will be interpreted as
        NULL values.

        See

        - ``null_literals`` in the :ref:`NLP config file <nlp_config>`.
        - :meth:`crate_anon.nlp_manager.parse_gate.Gate.parse`.
        """
        return self._null_literals