
"""
crate_anon/nlp_manager/base_nlp_parser.py

===============================================================================

    Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
    Created by Rudolf Cardinal (rnc1001@cam.ac.uk).

    This file is part of CRATE.

    CRATE is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    CRATE is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with CRATE. If not, see <https://www.gnu.org/licenses/>.

===============================================================================

**Simple base class for all our NLP parsers (GATE, regex, ...)**

"""

from abc import ABC, abstractmethod
from functools import lru_cache
import json
import logging
import sys
from typing import (
    Any,
    Dict,
    Generator,
    Iterable,
    List,
    Optional,
    Tuple,
    TYPE_CHECKING,
)

from cardinal_pythonlib.reprfunc import auto_repr
from cardinal_pythonlib.timing import MultiTimerContext, timer
from cardinal_pythonlib.sqlalchemy.schema import (
    column_lists_equal,
    index_lists_equal,
)

# OK to import "registry"; see
# https://github.com/zzzeek/sqlalchemy/blob/master/README.dialects.rst
# noinspection PyProtectedMember
from sqlalchemy.dialects import registry
from sqlalchemy.engine.base import Engine
from sqlalchemy.exc import DatabaseError
from sqlalchemy.orm.session import Session
from sqlalchemy.schema import Column, Index, Table
from sqlalchemy.sql import and_, exists, or_
from sqlalchemy.sql.schema import MetaData
from sqlalchemy.types import Integer, Text

from crate_anon.anonymise.dbholder import DatabaseHolder
from crate_anon.anonymise.constants import (
    COMMENT,
    TABLE_KWARGS,
)
from crate_anon.common.stringfunc import (
    compress_docstring,
    does_text_contain_word_chars,
    get_docstring,
)
from crate_anon.nlp_manager.constants import (
    FN_NLPDEF,
    FN_SRCPKVAL,
    FN_SRCPKSTR,
    full_sectionname,
    NlpConfigPrefixes,
    ProcessorConfigKeys,
    GateFieldNames as GateFN,
    SqlTypeDbIdentifier,
    MAX_SQL_FIELD_LEN,
)
from crate_anon.nlp_manager.input_field_config import InputFieldConfig
from crate_anon.nlp_manager.nlp_definition import NlpDefinition
from crate_anon.nlprp.api import NlprpServerProcessor
from crate_anon.nlprp.constants import (
    ALL_SQL_DIALECTS,
    NlprpKeys,
    NlprpValues,
    SqlDialects,
)
from crate_anon.version import CRATE_VERSION

if TYPE_CHECKING:
    from sqlalchemy.engine.interfaces import Dialect
    from crate_anon.common.extendedconfigparser import ConfigSection

log = logging.getLogger(__name__)

DEFAULT_NLPRP_SQL_DIALECT = SqlDialects.MYSQL
TIMING_DELETE_DEST_RECORD = "BaseNlpParser_delete_dest_record"
TIMING_INSERT = "BaseNlpParser_sql_insert"
TIMING_PARSE = "parse"
TIMING_HANDLE_PARSED = "handled_parsed"


# =============================================================================
# Exception meaning "could not parse this piece of text"
# =============================================================================


class TextProcessingFailed(Exception):
    """
    Exception meaning "could not parse this piece of text".
    """
    pass
# =============================================================================
# Base class for all parser types
# =============================================================================
class TableMaker(ABC):
    """
    Base class for all CRATE NLP processors, local and cloud, including those
    that talk to third-party software. Manages the interface to databases for
    results storage, etc.
    """

    _is_cloud_processor = False  # overridden by cloud-based classes
    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
        friendly_name: str = "?",
    ) -> None:
        r"""
        ``__init__`` function for :class:`TableMaker`.

        Args:
            nlpdef:
                An instance of
                :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`.
            cfg_processor_name:
                The name of a CRATE NLP config file section, TO WHICH we will
                add a ``processor:`` prefix (from which section we may choose
                to get extra config information).
            commit:
                Force a COMMIT whenever we insert data? You should specify
                this in multiprocess mode, or you may get database deadlocks.
            friendly_name:
                Friendly name for the parser.
        """
        # NB This docstring was associated with Sphinx errors!
        self._nlpdef = nlpdef
        self._cfg_processor_name = cfg_processor_name
        self._commit = commit
        self._friendly_name = friendly_name
        self._destdb_name = None  # type: Optional[str]
        self._destdb = None  # type: Optional[DatabaseHolder]
        if nlpdef is None:
            self._sectionname = ""
            self._cfgsection = None  # type: Optional[ConfigSection]
            self._destdb_name = ""
            self._destdb = None  # type: Optional[DatabaseHolder]
        else:
            self._sectionname = full_sectionname(
                NlpConfigPrefixes.PROCESSOR, cfg_processor_name
            )
            self._cfgsection = nlpdef.get_config_section(self._sectionname)
            self._destdb_name = self._cfgsection.opt_str(
                ProcessorConfigKeys.DESTDB, required=True
            )
            self._destdb = nlpdef.get_database(self._destdb_name)
    def __str__(self) -> str:
        return self.classname()

    def __repr__(self) -> str:
        return auto_repr(self)
    @classmethod
    def classname(cls) -> str:
        """
        Returns the short Python name of this class.
        """
        return cls.__name__
    @classmethod
    def fully_qualified_classname(cls) -> str:
        """
        Returns the class's fully qualified name.
        """
        # This may be imperfect; see
        # https://stackoverflow.com/questions/2020014/get-fully-qualified-class-name-of-an-object-in-python  # noqa
        # https://www.python.org/dev/peps/pep-3155/
        return ".".join([cls.__module__, cls.__qualname__])
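    # For this class itself, that yields:
    #   TableMaker.fully_qualified_classname()
    #   # -> "crate_anon.nlp_manager.base_nlp_parser.TableMaker"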
    @classmethod
    def is_cloud_processor(cls) -> bool:
        """
        Is this class a cloud-based (remote) NLP processor?
        """
        return cls._is_cloud_processor
    @abstractmethod
    def dest_tables_columns(self) -> Dict[str, List[Column]]:
        """
        Describes the destination table(s) that this NLP processor wants to
        write to.

        Returns:
            dict: a dictionary of ``{tablename: destination_columns}``, where
            ``destination_columns`` is a list of SQLAlchemy :class:`Column`
            objects.
        """
        raise NotImplementedError
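    # A minimal sketch of an implementation (hypothetical subclass and
    # table/column names, not part of CRATE), writing one table with a single
    # NLP-generated column:
    #
    #   def dest_tables_columns(self) -> Dict[str, List[Column]]:
    #       return {
    #           "my_nlp_output": [
    #               Column(
    #                   "matched_text",
    #                   Text,
    #                   comment="Text fragment matched by this processor",
    #               ),
    #           ]
    #       }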
    def dest_tables_indexes(self) -> Dict[str, List[Index]]:
        """
        Describes indexes that this NLP processor suggests for its
        destination table(s).

        Returns:
            dict: a dictionary of ``{tablename: indexes}``, where ``indexes``
            is a list of SQLAlchemy :class:`Index` objects.
        """
        return {}
    @property
    def dest_metadata(self) -> MetaData:
        """
        Returns the SQLAlchemy metadata for the destination database (which
        this NLP processor was told about at construction).
        """
        return self._destdb.metadata

    @property
    def dest_session(self) -> Session:
        """
        Returns the SQLAlchemy ORM Session for the destination database
        (which this NLP processor was told about at construction).
        """
        return self._destdb.session

    @property
    def dest_engine(self) -> Engine:
        """
        Returns the SQLAlchemy database Engine for the destination database
        (which this NLP processor was told about at construction).
        """
        return self._destdb.engine

    @property
    def nlpdef_name(self) -> Optional[str]:
        """
        Returns the name of our
        :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`, if we
        have one, or ``None``.
        """
        if self._nlpdef is None:
            return None
        return self._nlpdef.name

    @property
    def friendly_name(self) -> str:
        """
        Returns the NLP parser's friendly name.
        """
        return self._friendly_name

    @property
    def friendly_name_with_section(self) -> str:
        """
        Returns the NLP parser's friendly name and config section.
        """
        return f"{self.friendly_name} [{self._sectionname}]"

    @property
    def dest_dbname(self) -> str:
        """
        Returns the friendly (config file) name for the destination database
        (which this NLP processor was told about at construction).
        """
        return self._destdb_name

    @staticmethod
    def _assert_no_overlap(
        description1: str,
        cols1: List[Column],
        description2: str,
        cols2: List[Column],
    ) -> None:
        """
        Asserts that the two column lists do not include overlapping column
        names. Used for ensuring non-overlapping column names when we add
        NLP-specific columns to generic columns (e.g. about the source data).

        Args:
            description1: description of group 1, used for error messages
            cols1: list 1 of SQLAlchemy :class:`Column` objects
            description2: description of group 2, used for error messages
            cols2: list 2 of SQLAlchemy :class:`Column` objects
        """
        set1 = set(c.name for c in cols1)
        set2 = set(c.name for c in cols2)
        assert not (set1 & set2), (
            f"Overlap between {description1} column names ({set1}) and "
            f"{description2} column names ({set2})"
        )

    @staticmethod
    def _assert_column_lists_identical(
        list_of_column_lists: List[List[Column]],
    ) -> None:
        """
        Ensure that every column list (in a list of column lists) is
        identical.
        """
        n = len(list_of_column_lists)
        if n <= 1:
            return
        for i in range(n - 1):
            a_list = list_of_column_lists[i]
            b_list = list_of_column_lists[i + 1]
            if not column_lists_equal(a_list, b_list):
                msg = (
                    "Mismatch between column lists. (Are you trying to"
                    " blend source tables with different column names into a "
                    "single NLP results table?) Mismatch is between list {a} "
                    "and list {b}.\n"
                    "-- LIST A: {a_list}.\n"
                    "-- LIST B: {b_list}.\n"
                    "-- ALL LISTS: {all_lists}.\n"
                    "-- ALL COLUMN NAMES: {all_colnames}.".format(
                        a=i,
                        b=i + 1,
                        a_list=a_list,
                        b_list=b_list,
                        all_lists=list_of_column_lists,
                        all_colnames=[
                            [c.name for c in columns]
                            for columns in list_of_column_lists
                        ],
                    )
                )
                log.critical(msg)
                raise ValueError(msg)

    @staticmethod
    def _assert_index_lists_identical(
        list_of_index_lists: List[List[Index]],
    ) -> None:
        """
        Ensure that every index list (in a list of index lists) is identical.
        """
        n = len(list_of_index_lists)
        if n <= 1:
            return
        for i in range(n - 1):
            a_list = list_of_index_lists[i]
            b_list = list_of_index_lists[i + 1]
            if not index_lists_equal(a_list, b_list):
                msg = (
                    "Mismatch between index lists. (Are you trying to"
                    " blend source tables with different column names into a "
                    "single NLP results table?) Mismatch is between list {a} "
                    "and list {b}.\n"
                    "-- LIST A: {a_list}.\n"
                    "-- LIST B: {b_list}.\n"
                    "-- ALL LISTS: {all_lists}.\n"
                    "-- ALL COLUMN NAMES: {all_colnames}.".format(
                        a=i,
                        b=i + 1,
                        a_list=a_list,
                        b_list=b_list,
                        all_lists=list_of_index_lists,
                        all_colnames=[
                            [c.name for c in columns]
                            for columns in list_of_index_lists
                        ],
                    )
                )
                log.critical(msg)
                raise ValueError(msg)

    # Put these GATE methods here because they are also useful for Cloud
    # processors.

    @staticmethod
    def _standard_gate_columns() -> List[Column]:
        """
        Returns standard columns for GATE output.
        """
        return [
            Column(
                GateFN.SET,
                SqlTypeDbIdentifier,
                comment="GATE output set name",
            ),
            Column(
                GateFN.TYPE,
                SqlTypeDbIdentifier,
                comment="GATE annotation type name",
            ),
            Column(
                GateFN.ID,
                Integer,
                comment="GATE annotation ID (not clear this is very useful)",
            ),
            Column(
                GateFN.STARTPOS,
                Integer,
                comment="Start position in the content",
            ),
            Column(
                GateFN.ENDPOS,
                Integer,
                comment="End position in the content",
            ),
            Column(
                GateFN.CONTENT,
                Text,
                comment="Full content marked as relevant.",
            ),
        ]

    @staticmethod
    def _standard_gate_indexes() -> List[Index]:
        """
        Returns standard indexes for GATE output.
        """
        return [
            Index("_idx__set", GateFN.SET, mysql_length=MAX_SQL_FIELD_LEN)
        ]
    @lru_cache(maxsize=None)
    def tables(self) -> Dict[str, Table]:
        """
        Returns a dictionary of ``{tablename: Table}``, mapping table names
        to SQLAlchemy Table objects, for all destination tables of this NLP
        processor.
        """
        # Obtain a single set of copy columns
        ifconfigs = self._nlpdef.inputfieldconfigs
        assert ifconfigs, "Must specify a list of InputFieldConfigs"
        assert self._destdb, "Cannot use tables() call without a database"
        copycolumns_list = [i.get_copy_columns() for i in ifconfigs]
        self._assert_column_lists_identical(copycolumns_list)
        copy_columns = copycolumns_list[0]
        core_columns = InputFieldConfig.get_core_columns_for_dest()
        self._assert_no_overlap("copy", copy_columns, "source", core_columns)

        # Create one or more tables
        meta = self.dest_metadata
        tables = {}  # type: Dict[str, Table]
        t_columns = self.dest_tables_columns()
        for tablename, extra_dest_cols in t_columns.items():
            self._assert_no_overlap(
                "copy", copy_columns, "destination", extra_dest_cols
            )
            # And to check we haven't introduced any bugs internally:
            self._assert_no_overlap(
                "source", core_columns, "destination", extra_dest_cols
            )

            columns = core_columns + extra_dest_cols + copy_columns
            copy_of_cols = [c.copy() for c in columns]

            t_indexes = self.dest_tables_indexes()
            extra_dest_indexes = []  # type: List[Index]
            if tablename in t_indexes:
                extra_dest_indexes = t_indexes[tablename]
            copyindexes_list = [i.get_copy_indexes() for i in ifconfigs]
            self._assert_index_lists_identical(copyindexes_list)
            copy_indexes = copyindexes_list[0]
            core_indexes = InputFieldConfig.get_core_indexes_for_dest()

            column_like_things = (
                copy_of_cols + core_indexes + extra_dest_indexes + copy_indexes
            )
            # log.debug(repr(column_like_things))
            table_kwargs = {
                COMMENT: f"CRATE NLP results for {self.friendly_name}",
                **TABLE_KWARGS,
            }
            tables[tablename] = Table(
                tablename, meta, *column_like_things, **table_kwargs
            )
            # You can put indexes in the column list:
            # http://docs.sqlalchemy.org/en/latest/core/constraints.html
            # NOTE that after creating the Table, all the column objects get
            # "contaminated" by the link to it, so you have to start afresh
            # with new column objects, or take a further copy, as above.
            # You can copy a Column, but not an Index.
        return tables
    def get_tablenames(self) -> Iterable[str]:
        """
        Returns all destination table names for this NLP processor.
        """
        return self.dest_tables_columns().keys()
    def get_table(self, tablename: str) -> Table:
        """
        Returns an SQLAlchemy :class:`Table` for a given destination table of
        this NLP processor whose name is ``tablename``.
        """
        tables = self.tables()
        try:
            return tables[tablename]
        except KeyError:
            raise KeyError(
                f"No destination table for this NLP processor "
                f"named {tablename!r}"
            )
    def make_tables(self, drop_first: bool = False) -> List[str]:
        """
        Creates all destination tables for this NLP processor in the
        destination database.

        Args:
            drop_first: drop the tables first?
        """
        assert self._destdb, "No database specified!"
        engine = self.dest_engine
        tables = self.tables()
        pretty_names = []  # type: List[str]
        for t in tables.values():
            pretty_name = f"{self._destdb.name}.{t.name}"
            if drop_first:
                log.info(f"Dropping table {pretty_name}")
                t.drop(engine, checkfirst=True)
            log.info(f"Creating table {pretty_name} (with indexes)")
            t.create(engine, checkfirst=True)
            pretty_names.append(pretty_name)
        return pretty_names
    def delete_dest_record(
        self,
        ifconfig: InputFieldConfig,
        srcpkval: int,
        srcpkstr: Optional[str],
        commit: bool = False,
    ) -> None:
        """
        Deletes all destination records for a given source record.

        - Used during incremental updates.
        - For when a record (specified by ``srcpkval``) has been updated in
          the source; wipe older entries for it in the destination
          database(s).

        Args:
            ifconfig:
                :class:`crate_anon.nlp_manager.input_field_config.InputFieldConfig`
                that defines the source database, table, and field (column)
            srcpkval:
                integer primary key (PK) value
            srcpkstr:
                for tables with string PKs: the string PK value
            commit:
                execute a COMMIT after we have deleted the records? If you
                don't do this, we will get deadlocks in incremental mode. See
                e.g.
                https://dev.mysql.com/doc/refman/5.5/en/innodb-deadlocks.html
        """  # noqa
        session = self.dest_session
        srcdb = ifconfig.srcdb
        srctable = ifconfig.srctable
        srcfield = ifconfig.srcfield
        destdb_name = self._destdb.name
        nlpdef_name = self._nlpdef.name
        for tablename, desttable in self.tables().items():
            log.debug(
                f"delete_from_dest_dbs... {srcdb}.{srctable} -> "
                f"{destdb_name}.{tablename}"
            )
            # noinspection PyProtectedMember,PyPropertyAccess
            delquery = (
                desttable.delete()
                .where(desttable.c._srcdb == srcdb)
                .where(desttable.c._srctable == srctable)
                .where(desttable.c._srcfield == srcfield)
                .where(desttable.c._srcpkval == srcpkval)
                .where(desttable.c._nlpdef == nlpdef_name)
            )
            if srcpkstr is not None:
                # noinspection PyProtectedMember,PyPropertyAccess
                delquery = delquery.where(desttable.c._srcpkstr == srcpkstr)
            with MultiTimerContext(timer, TIMING_DELETE_DEST_RECORD):
                session.execute(delquery)
            if commit:
                self._nlpdef.commit(session)
    def delete_where_srcpk_not(
        self, ifconfig: InputFieldConfig, temptable: Optional[Table]
    ) -> None:
        """
        Function to help with deleting NLP destination records whose source
        records have been deleted.

        See
        :func:`crate_anon.nlp_manager.nlp_manager.delete_where_no_source`.

        Args:
            ifconfig:
                :class:`crate_anon.nlp_manager.input_field_config.InputFieldConfig`
                that defines the source database, table, and field (column).
            temptable:
                If this is specified (as an SQLAlchemy :class:`Table`), we
                delete NLP destination records whose source PK has not been
                inserted into this table. Otherwise, we delete *all* NLP
                destination records from the source column.
        """
        destsession = self.dest_session
        srcdb = ifconfig.srcdb
        srctable = ifconfig.srctable
        srcfield = ifconfig.srcfield
        for desttable_name, desttable in self.tables().items():
            log.debug(
                f"delete_where_srcpk_not... {srcdb}.{srctable} -> "
                f"{self._destdb_name}.{desttable_name}"
            )
            # noinspection PyProtectedMember,PyPropertyAccess
            dest_deletion_query = (
                # see get_core_indexes_for_dest
                desttable.delete()
                .where(desttable.c._srcdb == srcdb)
                .where(desttable.c._srctable == srctable)
                .where(desttable.c._srcfield == srcfield)
                .where(desttable.c._nlpdef == self._nlpdef.name)
            )
            if temptable is not None:
                log.debug("... deleting selectively")
                # DELETE FROM a WHERE NOT EXISTS (
                #     SELECT 1 FROM b
                #     WHERE a.a1 = b.b1
                #     AND (
                #         a.a2 = b.b2
                #         OR (a.a2 IS NULL AND b.b2 IS NULL)
                #     )
                # )
                temptable_pkvalcol = temptable.columns[FN_SRCPKVAL]
                temptable_pkstrcol = temptable.columns[FN_SRCPKSTR]
                # noinspection PyProtectedMember,PyPropertyAccess
                dest_deletion_query = dest_deletion_query.where(
                    ~exists().where(
                        and_(
                            desttable.c._srcpkval == temptable_pkvalcol,
                            or_(
                                desttable.c._srcpkstr == temptable_pkstrcol,
                                and_(
                                    desttable.c._srcpkstr.is_(None),
                                    temptable_pkstrcol.is_(None),
                                ),
                            ),
                        )
                    )
                )
            else:
                log.debug("... deleting all")
            destsession.execute(dest_deletion_query)
            self._nlpdef.commit(destsession)
    @property
    def destdb(self) -> DatabaseHolder:
        """
        Returns the destination database.
        """
        return self._destdb
# =============================================================================
# Base class for all local parser types
# =============================================================================
class BaseNlpParser(TableMaker):
    """
    Base class for all local CRATE NLP parsers.
    """

    uses_external_tool = False  # may be overridden
    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
        friendly_name: str = "?",
    ) -> None:
        super().__init__(
            nlpdef, cfg_processor_name, commit, friendly_name=friendly_name
        )
    # -------------------------------------------------------------------------
    # NLP processing
    # -------------------------------------------------------------------------
    @abstractmethod
    def parse(
        self, text: str
    ) -> Generator[Tuple[str, Dict[str, Any]], None, None]:
        """
        Main parsing function.

        Args:
            text: the raw text to parse

        Yields:
            tuple: ``tablename, valuedict``, where ``valuedict`` is a
            dictionary of ``{columnname: value}``. The values returned are
            ONLY those generated by NLP, and do not include either (a) the
            source reference values (``_srcdb``, ``_srctable``, etc.) or (b)
            the "copy" fields.

        Raises:
            :exc:`crate_anon.nlp_manager.base_nlp_parser.TextProcessingFailed`
            if we could not process this text.
        """
        raise NotImplementedError
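    # A minimal sketch of a concrete parse() (hypothetical; real CRATE regex
    # parsers are considerably more sophisticated). It yields only
    # NLP-generated values; process() adds the source/copy fields. The table
    # and column names match the hypothetical dest_tables_columns() sketch
    # above:
    #
    #   import re  # at module level
    #
    #   def parse(
    #       self, text: str
    #   ) -> Generator[Tuple[str, Dict[str, Any]], None, None]:
    #       for m in re.finditer(r"\bweight[\s:]+\d+(?:\.\d+)?\s*kg\b",
    #                            text, re.IGNORECASE):
    #           yield "my_nlp_output", {"matched_text": m.group(0)}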
    def process(
        self, text: str, starting_fields_values: Dict[str, Any]
    ) -> None:
        """
        The core function that takes a single piece of text and feeds it
        through a single NLP processor. This may produce zero, one, or many
        output records. Those records are then merged with information about
        their source (etc.), and inserted into the destination database.

        Args:
            text:
                the raw text to parse
            starting_fields_values:
                a dictionary of the format ``{columnname: value}`` that
                should be added to whatever the NLP processor comes up with.
                This will, in practice, include source metadata (which table,
                row [PK], and column did the text come from), processing
                metadata (when did the NLP processing take place?), and other
                values that the user has told us to copy across from the
                source database.

        Raises:
            :exc:`crate_anon.nlp_manager.base_nlp_parser.TextProcessingFailed`
            if this parser could not process the text.
        """
        if not does_text_contain_word_chars(text):
            # log.warning(f"No word characters found in {text}")
            # ... the warning occurs frequently, so slows down processing
            return
        starting_fields_values[FN_NLPDEF] = self._nlpdef.name
        session = self.dest_session
        n_values = 0
        with MultiTimerContext(timer, TIMING_PARSE):
            for tablename, nlp_values in self.parse(text):
                with MultiTimerContext(timer, TIMING_HANDLE_PARSED):
                    # Merge dictionaries so EXISTING FIELDS/VALUES
                    # (starting_fields_values) HAVE PRIORITY.
                    nlp_values.update(starting_fields_values)
                    sqla_table = self.get_table(tablename)
                    # If we have superfluous keys in our dictionary,
                    # SQLAlchemy will choke ("Unconsumed column names",
                    # reporting the thing that's in our dictionary that it
                    # doesn't know about). HOWEVER, note that SQLA column
                    # names may be mixed case (e.g. 'Text') while our
                    # copy-column names are lower case (e.g. 'text'), so we
                    # must have pre-converted the SQLA column names to lower
                    # case. That happens in InputFieldConfig.get_copy_columns
                    # and InputFieldConfig.get_copy_indexes.
                    column_names = [c.name for c in sqla_table.columns]
                    final_values = {
                        k: v
                        for k, v in nlp_values.items()
                        if k in column_names
                    }
                    # log.debug(repr(sqla_table))
                    insertquery = sqla_table.insert().values(final_values)
                    try:
                        with MultiTimerContext(timer, TIMING_INSERT):
                            session.execute(insertquery)
                    except DatabaseError as e:
                        # We can get an error on insert if, for example, the
                        # output returned by the NLP is invalid for the
                        # column type.
                        log.error(e)
                    self._nlpdef.notify_transaction(
                        session,
                        n_rows=1,
                        n_bytes=sys.getsizeof(final_values),
                        force_commit=self._commit,
                    )
                    n_values += 1
        log.debug(
            f"NLP processor {self.nlpdef_name}/{self.friendly_name}:"
            f" found {n_values} values"
        )
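    # Worked example of the merge above (hypothetical values): if parse()
    # yields ("my_nlp_output", {"matched_text": "weight 70 kg"}) and
    # starting_fields_values contains {"_srcdb": "mydb", "_srctable":
    # "notes", ...}, the inserted row is the union of both dictionaries,
    # with starting_fields_values winning any key clash, because
    # nlp_values.update(starting_fields_values) overwrites NLP-generated
    # values with the pre-existing ones.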
    @abstractmethod
    def test(self, verbose: bool = False) -> None:
        r"""
        Performs a self-test on the NLP processor.

        Args:
            verbose: Be verbose?

        This is an abstract method that is subclassed.
        """
        # NB This docstring was associated with Sphinx errors!
        raise NotImplementedError(
            f"No test function for regex class: {self.classname()}"
        )
    def test_parser(self, test_strings: List[str]) -> None:
        """
        Tests the NLP processor's parser with a set of test strings.
        """
        log.info(f"Testing parser: {self.classname()}")
        for text in test_strings:
            log.info(f"    {text} -> {list(self.parse(text))}")
        log.info("... OK")
    # -------------------------------------------------------------------------
    # NLPRP info
    # -------------------------------------------------------------------------
    @staticmethod
    def describe_sqla_col(
        column: Column, sql_dialect: str = None
    ) -> Dict[str, Any]:
        """
        Describes a single SQLAlchemy :class:`Column` in the :ref:`NLPRP
        <nlprp>` format, which follows ``INFORMATION_SCHEMA.COLUMNS``
        closely.

        Args:
            column:
                the :class:`Column`
            sql_dialect:
                preferred SQL dialect for the response, or ``None`` for a
                default
        """
        sql_dialect = sql_dialect or DEFAULT_NLPRP_SQL_DIALECT
        assert sql_dialect in ALL_SQL_DIALECTS, (
            f"Unknown SQL dialect {sql_dialect!r}; must be one of "
            f"{ALL_SQL_DIALECTS}"
        )
        dialect = registry.load(sql_dialect)()  # type: Dialect
        # log.debug(f"dialect: {dialect}")
        # dialect = MSDialect()
        column_type = column.type.compile(dialect)
        data_type = column_type.partition("(")[0]
        # ... https://stackoverflow.com/questions/27387415/how-would-i-get-everything-before-a-in-a-string-python  # noqa
        return {
            NlprpKeys.COLUMN_NAME: column.name,
            NlprpKeys.COLUMN_TYPE: column_type,
            NlprpKeys.DATA_TYPE: data_type,
            NlprpKeys.IS_NULLABLE: column.nullable,
            NlprpKeys.COLUMN_COMMENT: column.comment,
        }
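    # Illustrative output (assuming the NLPRP keys compare to "column_name",
    # "column_type", "data_type", "is_nullable", and "column_comment"; the
    # exact type string depends on the dialect):
    #
    #   describe_sqla_col(
    #       Column("matched_text", Text, comment="Matched"),
    #       sql_dialect=SqlDialects.MYSQL,
    #   )
    #   # -> {"column_name": "matched_text", "column_type": "TEXT",
    #   #     "data_type": "TEXT", "is_nullable": True,
    #   #     "column_comment": "Matched"}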
    def nlprp_schema_info(self, sql_dialect: str = None) -> Dict[str, Any]:
        """
        Returns a dictionary for the ``schema_type`` parameter, and
        associated parameters describing the schema (e.g.
        ``tabular_schema``), of the NLPRP :ref:`list_processors
        <nlprp_list_processors>` command.

        This is not a classmethod, because it may be specialized as we load
        external schema information (e.g. GATE processors).

        Args:
            sql_dialect: preferred SQL dialect for ``tabular_schema``
        """
        sql_dialect = sql_dialect or DEFAULT_NLPRP_SQL_DIALECT
        tabular_schema = {}  # type: Dict[str, List[Dict[str, Any]]]
        for tablename, columns in self.dest_tables_columns().items():
            colinfo = []  # type: List[Dict[str, Any]]
            for column in columns:
                colinfo.append(self.describe_sqla_col(column, sql_dialect))
            tabular_schema[tablename] = colinfo
        schema_info = {
            NlprpKeys.SCHEMA_TYPE: NlprpValues.TABULAR,
            NlprpKeys.SQL_DIALECT: sql_dialect,
            NlprpKeys.TABULAR_SCHEMA: tabular_schema,
        }
        return schema_info
    @classmethod
    def nlprp_name(cls) -> str:
        """
        Returns the processor's name for use in response to the NLPRP
        :ref:`list_processors <nlprp_list_processors>` command.

        The default is the fully qualified module/class name -- because this
        is highly unlikely to clash with any other NLP processors on a given
        server.
        """
        return cls.fully_qualified_classname()
    @classmethod
    def nlprp_title(cls) -> str:
        """
        Returns the processor's title for use in response to the NLPRP
        :ref:`list_processors <nlprp_list_processors>` command.

        The default is the short Python class name.
        """
        return cls.__name__
    @classmethod
    def nlprp_version(cls) -> str:
        """
        Returns the processor's version for use in response to the NLPRP
        :ref:`list_processors <nlprp_list_processors>` command.

        The default is the current CRATE version.
        """
        return CRATE_VERSION
    @classmethod
    def nlprp_is_default_version(cls) -> bool:
        """
        Returns whether this processor is the default version of its name,
        for use in response to the NLPRP :ref:`list_processors
        <nlprp_list_processors>` command.

        The default is ``True``.
        """
        return True
    @classmethod
    def nlprp_description(cls) -> str:
        """
        Returns the processor's description for use in response to the NLPRP
        :ref:`list_processors <nlprp_list_processors>` command.

        Uses each processor's docstring, and reformats it slightly.
        """
        return compress_docstring(get_docstring(cls))
    def nlprp_server_processor(
        self, sql_dialect: str = None
    ) -> NlprpServerProcessor:
        """
        Returns an :class:`crate_anon.nlprp.api.NlprpServerProcessor`
        describing this processor, for the NLPRP :ref:`list_processors
        <nlprp_list_processors>` command.
        """
        schema_info = self.nlprp_schema_info(sql_dialect)
        return NlprpServerProcessor(
            name=self.nlprp_name(),
            title=self.nlprp_title(),
            version=self.nlprp_version(),
            is_default_version=self.nlprp_is_default_version(),
            description=self.nlprp_description(),
            schema_type=schema_info[NlprpKeys.SCHEMA_TYPE],
            sql_dialect=schema_info.get(NlprpKeys.SQL_DIALECT),
            tabular_schema=schema_info.get(NlprpKeys.TABULAR_SCHEMA),
        )
    def nlprp_processor_info(self, sql_dialect: str = None) -> Dict[str, Any]:
        """
        Returns a dictionary suitable for use as this processor's response to
        the NLPRP :ref:`list_processors <nlprp_list_processors>` command.

        This is not a classmethod, because it may be specialized as we load
        external schema information (e.g. GATE processors).

        Args:
            sql_dialect: preferred SQL dialect for ``tabular_schema``
        """
        return self.nlprp_server_processor(sql_dialect).infodict
    def nlprp_processor_info_json(
        self, indent: int = 4, sort_keys: bool = True, sql_dialect: str = None
    ) -> str:
        """
        Returns a formatted JSON string from :func:`nlprp_processor_info`.
        This is primarily for debugging.

        Args:
            indent: number of spaces for indentation
            sort_keys: sort keys?
            sql_dialect: preferred SQL dialect for ``tabular_schema``, or
                ``None`` for default
        """
        json_structure = self.nlprp_processor_info(sql_dialect=sql_dialect)
        return json.dumps(json_structure, indent=indent, sort_keys=sort_keys)
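
# Usage sketch (hypothetical; assumes a concrete subclass "MyParser" and a
# configured NlpDefinition instance "nlpdef"):
#
#   parser = MyParser(nlpdef, cfg_processor_name="myparser")
#   parser.make_tables(drop_first=False)        # create destination tables
#   print(parser.nlprp_processor_info_json())   # NLPRP description, as JSON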