"""
crate_anon/nlp_manager/base_nlp_parser.py
===============================================================================
Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
Created by Rudolf Cardinal (rnc1001@cam.ac.uk).
This file is part of CRATE.
CRATE is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
CRATE is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with CRATE. If not, see <https://www.gnu.org/licenses/>.
===============================================================================
**Simple base class for all our NLP parsers (GATE, regex, ...)**
"""
from abc import ABC, abstractmethod
from functools import lru_cache
import json
import logging
import sys
from typing import (
Any,
Dict,
Generator,
Iterable,
List,
Optional,
Tuple,
TYPE_CHECKING,
)
from cardinal_pythonlib.reprfunc import auto_repr
from cardinal_pythonlib.timing import MultiTimerContext, timer
from cardinal_pythonlib.sqlalchemy.schema import (
column_lists_equal,
index_lists_equal,
)
# OK to import "registry"; see
# https://github.com/zzzeek/sqlalchemy/blob/master/README.dialects.rst
# noinspection PyProtectedMember
from sqlalchemy.dialects import registry
from sqlalchemy.engine.base import Engine
from sqlalchemy.exc import DatabaseError
from sqlalchemy.orm.session import Session
from sqlalchemy.schema import Column, Index, Table
from sqlalchemy.sql import and_, exists, or_
from sqlalchemy.sql.schema import MetaData
from sqlalchemy.types import Integer, Text
from crate_anon.anonymise.dbholder import DatabaseHolder
from crate_anon.anonymise.constants import (
COMMENT,
TABLE_KWARGS,
)
from crate_anon.common.stringfunc import (
compress_docstring,
does_text_contain_word_chars,
get_docstring,
)
from crate_anon.nlp_manager.constants import (
FN_NLPDEF,
FN_SRCPKVAL,
FN_SRCPKSTR,
full_sectionname,
NlpConfigPrefixes,
ProcessorConfigKeys,
GateFieldNames as GateFN,
SqlTypeDbIdentifier,
MAX_SQL_FIELD_LEN,
)
from crate_anon.nlp_manager.input_field_config import InputFieldConfig
from crate_anon.nlp_manager.nlp_definition import NlpDefinition
from crate_anon.nlprp.api import NlprpServerProcessor
from crate_anon.nlprp.constants import (
ALL_SQL_DIALECTS,
NlprpKeys,
NlprpValues,
SqlDialects,
)
from crate_anon.version import CRATE_VERSION
if TYPE_CHECKING:
from sqlalchemy.engine.interfaces import Dialect
from crate_anon.common.extendedconfigparser import ConfigSection
log = logging.getLogger(__name__)
DEFAULT_NLPRP_SQL_DIALECT = SqlDialects.MYSQL
TIMING_DELETE_DEST_RECORD = "BaseNlpParser_delete_dest_record"
TIMING_INSERT = "BaseNlpParser_sql_insert"
TIMING_PARSE = "parse"
TIMING_HANDLE_PARSED = "handled_parsed"
# =============================================================================
# Exception meaning "could not parse this piece of text"
# =============================================================================
class TextProcessingFailed(Exception):
    """
    Raised by an NLP processor when it could not process a given piece of
    text.
    """
# =============================================================================
# Base class for all parser types
# =============================================================================
class TableMaker(ABC):
    """
    Base class for all CRATE NLP processors, local and cloud, including those
    that talk to third-party software. Manages the interface to databases for
    results storage, etc.
    """

    _is_cloud_processor = False  # overridden by cloud-based classes

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
        friendly_name: str = "?",
    ) -> None:
        r"""
        ``__init__`` function for :class:`TableMaker`.

        Args:
            nlpdef:
                An instance of
                :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`.
            cfg_processor_name:
                The name of a CRATE NLP config file section, TO WHICH we will
                add a ``processor:`` prefix (from which section we may choose
                to get extra config information).
            commit:
                Force a COMMIT whenever we insert data? You should specify this
                in multiprocess mode, or you may get database deadlocks.
            friendly_name:
                Friendly name for the parser.
        """
        # NB This docstring was associated with Sphinx errors!
        self._nlpdef = nlpdef
        self._cfg_processor_name = cfg_processor_name
        self._commit = commit
        self._friendly_name = friendly_name
        # Per-instance lazy cache for tables(); see that method.
        self._tables = None  # type: Optional[Dict[str, Table]]
        if nlpdef is None:
            # No NLP definition supplied: leave config section and destination
            # database unset.
            self._sectionname = ""
            self._cfgsection = None  # type: Optional[ConfigSection]
            self._destdb_name = ""
            self._destdb = None  # type: Optional[DatabaseHolder]
        else:
            self._sectionname = full_sectionname(
                NlpConfigPrefixes.PROCESSOR, cfg_processor_name
            )
            self._cfgsection = nlpdef.get_config_section(self._sectionname)
            self._destdb_name = self._cfgsection.opt_str(
                ProcessorConfigKeys.DESTDB, required=True
            )
            self._destdb = nlpdef.get_database(self._destdb_name)

    def __str__(self) -> str:
        return self.classname()

    def __repr__(self) -> str:
        return auto_repr(self)

    @classmethod
    def classname(cls) -> str:
        """
        Returns the short Python name of this class.
        """
        return cls.__name__

    @classmethod
    def fully_qualified_classname(cls) -> str:
        """
        Returns the class's fully qualified name.
        """
        # This may be imperfect; see
        # https://stackoverflow.com/questions/2020014/get-fully-qualified-class-name-of-an-object-in-python # noqa
        # https://www.python.org/dev/peps/pep-3155/
        return ".".join([cls.__module__, cls.__qualname__])

    @classmethod
    def is_cloud_processor(cls) -> bool:
        """
        Is this class a cloud-based (remote) NLP processor?
        """
        return cls._is_cloud_processor

    @abstractmethod
    def dest_tables_columns(self) -> Dict[str, List[Column]]:
        """
        Describes the destination table(s) that this NLP processor wants to
        write to.

        Returns:
            dict: a dictionary of ``{tablename: destination_columns}``, where
            ``destination_columns`` is a list of SQLAlchemy :class:`Column`
            objects.
        """
        raise NotImplementedError

    def dest_tables_indexes(self) -> Dict[str, List[Index]]:
        """
        Describes indexes that this NLP processor suggests for its destination
        table(s).

        Returns:
            dict: a dictionary of ``{tablename: indexes}``, where ``indexes``
            is a list of SQLAlchemy :class:`Index` objects.
        """
        return {}

    @property
    def dest_metadata(self) -> MetaData:
        """
        Returns the SQLAlchemy metadata for the destination database (which
        this NLP processor was told about at construction).
        """
        return self._destdb.metadata

    @property
    def dest_session(self) -> Session:
        """
        Returns the SQLAlchemy ORM Session for the destination database (which
        this NLP processor was told about at construction).
        """
        return self._destdb.session

    @property
    def dest_engine(self) -> Engine:
        """
        Returns the SQLAlchemy database Engine for the destination database
        (which this NLP processor was told about at construction).
        """
        return self._destdb.engine

    @property
    def nlpdef_name(self) -> Optional[str]:
        """
        Returns the name of our
        :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`, if we
        have one, or ``None``.
        """
        if self._nlpdef is None:
            return None
        return self._nlpdef.name

    @property
    def friendly_name(self) -> str:
        """
        Returns the NLP parser's friendly name.
        """
        return self._friendly_name

    @property
    def friendly_name_with_section(self) -> str:
        """
        Returns the NLP parser's friendly name and config section.
        """
        return f"{self.friendly_name} [{self._sectionname}]"

    @property
    def dest_dbname(self) -> str:
        """
        Returns the friendly (config file) name for the destination database
        (which this NLP processor was told about at construction).
        """
        return self._destdb_name

    @staticmethod
    def _assert_no_overlap(
        description1: str,
        cols1: List[Column],
        description2: str,
        cols2: List[Column],
    ) -> None:
        """
        Asserts that the two column lists do not include overlapping column
        names.

        Used for ensuring non-overlapping column names when we add NLP-specific
        columns to generic columns (e.g. about the source data).

        Args:
            description1: description of group 1, used for error messages
            cols1: list 1 of SQLAlchemy :class:`Column` objects
            description2: description of group 2, used for error messages
            cols2: list 2 of SQLAlchemy :class:`Column` objects
        """
        set1 = set(c.name for c in cols1)
        set2 = set(c.name for c in cols2)
        assert not (set1 & set2), (
            f"Overlap between {description1} column names ({set1}) and "
            f"{description2} column names ({set2})"
        )

    @staticmethod
    def _assert_column_lists_identical(
        list_of_column_lists: List[List[Column]],
    ) -> None:
        """
        Ensure that every column list (in a list of column lists) is identical.

        Raises:
            :exc:`ValueError` on mismatch.
        """
        n = len(list_of_column_lists)
        if n <= 1:
            return
        # Compare adjacent pairs; equality is transitive, so this covers all.
        for i in range(n - 1):
            a_list = list_of_column_lists[i]
            b_list = list_of_column_lists[i + 1]
            if not column_lists_equal(a_list, b_list):
                msg = (
                    "Mismatch between column lists. (Are you trying to"
                    " blend source tables with different column names into a "
                    "single NLP results table?) Mismatch is between list {a} "
                    "and list {b}.\n"
                    "-- LIST A: {a_list}.\n"
                    "-- LIST B: {b_list}.\n"
                    "-- ALL LISTS: {all_lists}.\n"
                    "-- ALL COLUMN NAMES: {all_colnames}.".format(
                        a=i,
                        b=i + 1,
                        a_list=a_list,
                        b_list=b_list,
                        all_lists=list_of_column_lists,
                        all_colnames=[
                            [c.name for c in columns]
                            for columns in list_of_column_lists
                        ],
                    )
                )
                log.critical(msg)
                raise ValueError(msg)

    @staticmethod
    def _assert_index_lists_identical(
        list_of_index_lists: List[List[Index]],
    ) -> None:
        """
        Ensure that every index list (in a list of index lists) is identical.

        Raises:
            :exc:`ValueError` on mismatch.
        """
        n = len(list_of_index_lists)
        if n <= 1:
            return
        # Compare adjacent pairs; equality is transitive, so this covers all.
        for i in range(n - 1):
            a_list = list_of_index_lists[i]
            b_list = list_of_index_lists[i + 1]
            if not index_lists_equal(a_list, b_list):
                msg = (
                    "Mismatch between index lists. (Are you trying to"
                    " blend source tables with different column names into a "
                    "single NLP results table?) Mismatch is between list {a} "
                    "and list {b}.\n"
                    "-- LIST A: {a_list}.\n"
                    "-- LIST B: {b_list}.\n"
                    "-- ALL LISTS: {all_lists}.\n"
                    "-- ALL COLUMN NAMES: {all_colnames}.".format(
                        a=i,
                        b=i + 1,
                        a_list=a_list,
                        b_list=b_list,
                        all_lists=list_of_index_lists,
                        all_colnames=[
                            [c.name for c in columns]
                            for columns in list_of_index_lists
                        ],
                    )
                )
                log.critical(msg)
                raise ValueError(msg)

    # Put these GATE methods here because it's also useful for Cloud processors
    @staticmethod
    def _standard_gate_columns() -> List[Column]:
        """
        Returns standard columns for GATE output.
        """
        return [
            Column(
                GateFN.SET, SqlTypeDbIdentifier, comment="GATE output set name"
            ),
            Column(
                GateFN.TYPE,
                SqlTypeDbIdentifier,
                comment="GATE annotation type name",
            ),
            Column(
                GateFN.ID,
                Integer,
                comment="GATE annotation ID (not clear this is very useful)",
            ),
            Column(
                GateFN.STARTPOS,
                Integer,
                comment="Start position in the content",
            ),
            Column(
                GateFN.ENDPOS, Integer, comment="End position in the content"
            ),
            Column(
                GateFN.CONTENT,
                Text,
                comment="Full content marked as relevant.",
            ),
        ]

    @staticmethod
    def _standard_gate_indexes() -> List[Index]:
        """
        Returns standard indexes for GATE output.
        """
        return [Index("_idx__set", GateFN.SET, mysql_length=MAX_SQL_FIELD_LEN)]

    def tables(self) -> Dict[str, Table]:
        """
        Returns a dictionary of ``{tablename: Table}``, mapping table names
        to SQLAlchemy Table objects, for all destination tables of this NLP
        processor.

        The result is computed once and cached on the instance. (A
        per-instance cache is used rather than ``@lru_cache`` on the method,
        because the latter's global cache would keep every instance alive for
        the lifetime of the process.)
        """
        if self._tables is not None:
            return self._tables
        # Obtain a single set of copy columns
        ifconfigs = self._nlpdef.inputfieldconfigs
        assert ifconfigs, "Must specify a list of InputFieldConfigs"
        assert self._destdb, "Cannot use tables() call without a database"
        copycolumns_list = [i.get_copy_columns() for i in ifconfigs]
        self._assert_column_lists_identical(copycolumns_list)
        copy_columns = copycolumns_list[0]
        core_columns = InputFieldConfig.get_core_columns_for_dest()
        self._assert_no_overlap("copy", copy_columns, "source", core_columns)

        # Create one or more tables
        meta = self.dest_metadata
        tables = {}  # type: Dict[str, Table]
        t_columns = self.dest_tables_columns()
        for tablename, extra_dest_cols in t_columns.items():
            self._assert_no_overlap(
                "copy", copy_columns, "destination", extra_dest_cols
            )
            # And to check we haven't introduced any bugs internally:
            self._assert_no_overlap(
                "source", core_columns, "destination", extra_dest_cols
            )
            columns = core_columns + extra_dest_cols + copy_columns
            copy_of_cols = [c.copy() for c in columns]
            t_indexes = self.dest_tables_indexes()
            extra_dest_indexes = []  # type: List[Index]
            if tablename in t_indexes:
                extra_dest_indexes = t_indexes[tablename]
            copyindexes_list = [i.get_copy_indexes() for i in ifconfigs]
            self._assert_index_lists_identical(copyindexes_list)
            copy_indexes = copyindexes_list[0]
            core_indexes = InputFieldConfig.get_core_indexes_for_dest()
            column_like_things = (
                copy_of_cols + core_indexes + extra_dest_indexes + copy_indexes
            )
            # log.debug(repr(column_like_things))
            table_kwargs = {
                COMMENT: f"CRATE NLP results for {self.friendly_name}",
                **TABLE_KWARGS,
            }
            tables[tablename] = Table(
                tablename, meta, *column_like_things, **table_kwargs
            )
            # You can put indexes in the column list:
            # http://docs.sqlalchemy.org/en/latest/core/constraints.html
            # NOTE that after creating the Table, all the column objects get
            # "contaminated" by the link to it, so you have to start afresh
            # with new column objects, or take a further copy, as above.
            # You can copy a Column, but not an Index.
        self._tables = tables
        return tables

    def get_tablenames(self) -> Iterable[str]:
        """
        Returns all destination table names for this NLP processor.
        """
        return self.dest_tables_columns().keys()

    def get_table(self, tablename: str) -> Table:
        """
        Returns an SQLAlchemy :class:`Table` for a given destination table of
        this NLP processor whose name is ``tablename``.

        Raises:
            :exc:`KeyError` if there is no such destination table.
        """
        tables = self.tables()
        try:
            return tables[tablename]
        except KeyError:
            raise KeyError(
                f"No destination table for this NLP processor "
                f"named {tablename!r}"
            )

    def make_tables(self, drop_first: bool = False) -> List[str]:
        """
        Creates all destination tables for this NLP processor in the
        destination database.

        Args:
            drop_first: drop the tables first?

        Returns:
            list of pretty (``dbname.tablename``) names of tables created
        """
        assert self._destdb, "No database specified!"
        engine = self.dest_engine
        tables = self.tables()
        pretty_names = []  # type: List[str]
        for t in tables.values():
            pretty_name = f"{self._destdb.name}.{t.name}"
            if drop_first:
                log.info(f"Dropping table {pretty_name}")
                t.drop(engine, checkfirst=True)
            log.info(f"Creating table {pretty_name} (with indexes)")
            t.create(engine, checkfirst=True)
            pretty_names.append(pretty_name)
        return pretty_names

    def delete_dest_record(
        self,
        ifconfig: InputFieldConfig,
        srcpkval: int,
        srcpkstr: Optional[str],
        commit: bool = False,
    ) -> None:
        """
        Deletes all destination records for a given source record.

        - Used during incremental updates.
        - For when a record (specified by ``srcpkval``) has been updated in the
          source; wipe older entries for it in the destination database(s).

        Args:
            ifconfig:
                :class:`crate_anon.nlp_manager.input_field_config.InputFieldConfig`
                that defines the source database, table, and field (column)
            srcpkval:
                integer primary key (PK) value
            srcpkstr:
                for tables with string PKs: the string PK value
            commit:
                execute a COMMIT after we have deleted the records?
                If you don't do this, we will get deadlocks in incremental
                mode. See e.g.
                https://dev.mysql.com/doc/refman/5.5/en/innodb-deadlocks.html
        """  # noqa
        session = self.dest_session
        srcdb = ifconfig.srcdb
        srctable = ifconfig.srctable
        srcfield = ifconfig.srcfield
        destdb_name = self._destdb.name
        nlpdef_name = self._nlpdef.name
        for tablename, desttable in self.tables().items():
            log.debug(
                f"delete_from_dest_dbs... {srcdb}.{srctable} -> "
                f"{destdb_name}.{tablename}"
            )
            # noinspection PyProtectedMember,PyPropertyAccess
            delquery = (
                desttable.delete()
                .where(desttable.c._srcdb == srcdb)
                .where(desttable.c._srctable == srctable)
                .where(desttable.c._srcfield == srcfield)
                .where(desttable.c._srcpkval == srcpkval)
                .where(desttable.c._nlpdef == nlpdef_name)
            )
            if srcpkstr is not None:
                # noinspection PyProtectedMember,PyPropertyAccess
                delquery = delquery.where(desttable.c._srcpkstr == srcpkstr)
            with MultiTimerContext(timer, TIMING_DELETE_DEST_RECORD):
                session.execute(delquery)
            if commit:
                self._nlpdef.commit(session)

    def delete_where_srcpk_not(
        self, ifconfig: InputFieldConfig, temptable: Optional[Table]
    ) -> None:
        """
        Function to help with deleting NLP destination records whose source
        records have been deleted.

        See :func:`crate_anon.nlp_manager.nlp_manager.delete_where_no_source`.

        Args:
            ifconfig:
                :class:`crate_anon.nlp_manager.input_field_config.InputFieldConfig`
                that defines the source database, table, and field (column).
            temptable:
                If this is specified (as an SQLAlchemy) table, we delete NLP
                destination records whose source PK has not been inserted into
                this table. Otherwise, we delete *all* NLP destination records
                from the source column.
        """
        destsession = self.dest_session
        srcdb = ifconfig.srcdb
        srctable = ifconfig.srctable
        srcfield = ifconfig.srcfield
        for desttable_name, desttable in self.tables().items():
            log.debug(
                f"delete_where_srcpk_not... {srcdb}.{srctable} -> "
                f"{self._destdb_name}.{desttable_name}"
            )
            # noinspection PyProtectedMember,PyPropertyAccess
            dest_deletion_query = (
                # see get_core_indexes_for_dest
                desttable.delete()
                .where(desttable.c._srcdb == srcdb)
                .where(desttable.c._srctable == srctable)
                .where(desttable.c._srcfield == srcfield)
                .where(desttable.c._nlpdef == self._nlpdef.name)
            )
            if temptable is not None:
                log.debug("... deleting selectively")
                # DELETE FROM a WHERE NOT EXISTS (
                #    SELECT 1 FROM b
                #    WHERE a.a1 = b.b1
                #    AND (
                #        a.a2 = b.b2
                #        OR (a.a2 IS NULL AND b.b2 IS NULL)
                #    )
                # )
                temptable_pkvalcol = temptable.columns[FN_SRCPKVAL]
                temptable_pkstrcol = temptable.columns[FN_SRCPKSTR]
                # noinspection PyProtectedMember,PyPropertyAccess
                dest_deletion_query = dest_deletion_query.where(
                    ~exists().where(
                        and_(
                            desttable.c._srcpkval == temptable_pkvalcol,
                            or_(
                                desttable.c._srcpkstr == temptable_pkstrcol,
                                and_(
                                    desttable.c._srcpkstr.is_(None),
                                    temptable_pkstrcol.is_(None),
                                ),
                            ),
                        )
                    )
                )
            else:
                log.debug("... deleting all")
            destsession.execute(dest_deletion_query)
            self._nlpdef.commit(destsession)

    @property
    def destdb(self) -> DatabaseHolder:
        """
        Returns the destination database.
        """
        return self._destdb
# =============================================================================
# Base class for all local parser types
# =============================================================================
class BaseNlpParser(TableMaker):
    """
    Base class for all local CRATE NLP parsers.
    """

    uses_external_tool = False  # may be overridden

    def __init__(
        self,
        nlpdef: Optional[NlpDefinition],
        cfg_processor_name: Optional[str],
        commit: bool = False,
        friendly_name: str = "?",
    ) -> None:
        """
        See :class:`TableMaker` for argument details.
        """
        super().__init__(
            nlpdef, cfg_processor_name, commit, friendly_name=friendly_name
        )

    # -------------------------------------------------------------------------
    # NLP processing
    # -------------------------------------------------------------------------

    @abstractmethod
    def parse(
        self, text: str
    ) -> Generator[Tuple[str, Dict[str, Any]], None, None]:
        """
        Main parsing function.

        Args:
            text:
                the raw text to parse

        Yields:
            tuple: ``tablename, valuedict``, where ``valuedict`` is
            a dictionary of ``{columnname: value}``. The values returned are
            ONLY those generated by NLP, and do not include either (a) the
            source reference values (``_srcdb``, ``_srctable``, etc.) or the
            "copy" fields.

        Raises:
            :exc:`crate_anon.nlp_manager.base_nlp_parser.TextProcessingFailed`
            if we could not process this text.
        """
        raise NotImplementedError

    def process(
        self, text: str, starting_fields_values: Dict[str, Any]
    ) -> None:
        """
        The core function that takes a single piece of text and feeds it
        through a single NLP processor. This may produce zero, one, or many
        output records. Those records are then merged with information about
        their source (etc)., and inserted into the destination database.

        Args:
            text:
                the raw text to parse
            starting_fields_values:
                a dictionary of the format ``{columnname: value}`` that should
                be added to whatever the NLP processor comes up with. This
                will, in practice, include source metadata (which table,
                row [PK], and column did the text come from), processing
                metadata (when did the NLP processing take place?), and other
                values that the user has told us to copy across from the
                source database.

        Raises:
            :exc:`crate_anon.nlp_manager.base_nlp_parser.TextProcessingFailed`
            if this parser could not process the text
        """
        if not does_text_contain_word_chars(text):
            # log.warning(f"No word characters found in {text}")
            # ... the warning occurs frequently so slows down processing
            return
        starting_fields_values[FN_NLPDEF] = self._nlpdef.name
        session = self.dest_session
        n_values = 0
        with MultiTimerContext(timer, TIMING_PARSE):
            for tablename, nlp_values in self.parse(text):
                with MultiTimerContext(timer, TIMING_HANDLE_PARSED):
                    # Merge dictionaries so EXISTING FIELDS/VALUES
                    # (starting_fields_values) HAVE PRIORITY.
                    nlp_values.update(starting_fields_values)
                    sqla_table = self.get_table(tablename)
                    # If we have superfluous keys in our dictionary, SQLAlchemy
                    # will choke ("Unconsumed column names", reporting the
                    # thing that's in our dictionary that it doesn't know
                    # about). HOWEVER, note that SQLA column names may be mixed
                    # case (e.g. 'Text') while our copy-column names are lower
                    # case (e.g. 'text'), so we must have pre-converted
                    # the SQLA column names to lower case. That happens in
                    # InputFieldConfig.get_copy_columns and
                    # InputFieldConfig.get_copy_indexes
                    column_names = [c.name for c in sqla_table.columns]
                    final_values = {
                        k: v
                        for k, v in nlp_values.items()
                        if k in column_names
                    }
                    # log.debug(repr(sqla_table))
                    insertquery = sqla_table.insert().values(final_values)
                    try:
                        with MultiTimerContext(timer, TIMING_INSERT):
                            session.execute(insertquery)
                    except DatabaseError as e:
                        # We can get an error on insert if for example the
                        # output returned by the NLP is invalid for the column
                        # type
                        log.error(e)
                    self._nlpdef.notify_transaction(
                        session,
                        n_rows=1,
                        n_bytes=sys.getsizeof(final_values),
                        force_commit=self._commit,
                    )
                    n_values += 1
        log.debug(
            f"NLP processor {self.nlpdef_name}/{self.friendly_name}:"
            f" found {n_values} values"
        )

    @abstractmethod
    def test(self, verbose: bool = False) -> None:
        r"""
        Performs a self-test on the NLP processor.

        Args:
            verbose:
                Be verbose?

        This is an abstract method that is subclassed.
        """
        # NB This docstring was associated with Sphinx errors!
        raise NotImplementedError(
            f"No test function for regex class: {self.classname()}"
        )

    def test_parser(self, test_strings: List[str]) -> None:
        """
        Tests the NLP processor's parser with a set of test strings.
        """
        log.info(f"Testing parser: {self.classname()}")
        for text in test_strings:
            log.info(f"    {text} -> {list(self.parse(text))}")
        log.info("... OK")

    # -------------------------------------------------------------------------
    # NLPRP info
    # -------------------------------------------------------------------------

    @staticmethod
    def describe_sqla_col(
        column: Column, sql_dialect: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Describes a single SQLAlchemy :class:`Column` in the :ref:`NLPRP
        <nlprp>` format, which follows ``INFORMATION_SCHEMA.COLUMNS`` closely.

        Args:
            column:
                the :class:`Column`
            sql_dialect:
                preferred SQL dialect for response, or ``None`` for a default
        """
        sql_dialect = sql_dialect or DEFAULT_NLPRP_SQL_DIALECT
        assert sql_dialect in ALL_SQL_DIALECTS, (
            f"Unknown SQL dialect {sql_dialect!r}; must be one of "
            f"{ALL_SQL_DIALECTS}"
        )
        dialect = registry.load(sql_dialect)()  # type: Dialect
        # log.debug(f"dialect: {dialect}")
        # dialect = MSDialect()
        column_type = column.type.compile(dialect)
        # Strip any "(length)" suffix to get the bare data type:
        data_type = column_type.partition("(")[0]
        # ... https://stackoverflow.com/questions/27387415/how-would-i-get-everything-before-a-in-a-string-python # noqa
        return {
            NlprpKeys.COLUMN_NAME: column.name,
            NlprpKeys.COLUMN_TYPE: column_type,
            NlprpKeys.DATA_TYPE: data_type,
            NlprpKeys.IS_NULLABLE: column.nullable,
            NlprpKeys.COLUMN_COMMENT: column.comment,
        }

    def nlprp_schema_info(
        self, sql_dialect: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Returns a dictionary for the ``schema_type`` parameter, and associated
        parameters describing the schema (e.g. ``tabular_schema``), of the
        NLPRP :ref:`list_processors <nlprp_list_processors>` command.

        This is not a classmethod, because it may be specialized as we load
        external schema information (e.g. GATE processors).

        Args:
            sql_dialect:
                preferred SQL dialect for ``tabular_schema``
        """
        sql_dialect = sql_dialect or DEFAULT_NLPRP_SQL_DIALECT
        tabular_schema = {}  # type: Dict[str, List[Dict[str, Any]]]
        for tablename, columns in self.dest_tables_columns().items():
            colinfo = []  # type: List[Dict[str, Any]]
            for column in columns:
                colinfo.append(self.describe_sqla_col(column, sql_dialect))
            tabular_schema[tablename] = colinfo
        schema_info = {
            NlprpKeys.SCHEMA_TYPE: NlprpValues.TABULAR,
            NlprpKeys.SQL_DIALECT: sql_dialect,
            NlprpKeys.TABULAR_SCHEMA: tabular_schema,
        }
        return schema_info

    @classmethod
    def nlprp_name(cls) -> str:
        """
        Returns the processor's name for use in response to the NLPRP
        :ref:`list_processors <nlprp_list_processors>` command.

        The default is the fully qualified module/class name -- because this is
        highly unlikely to clash with any other NLP processors on a given
        server.
        """
        return cls.fully_qualified_classname()

    @classmethod
    def nlprp_title(cls) -> str:
        """
        Returns the processor's title for use in response to the NLPRP
        :ref:`list_processors <nlprp_list_processors>` command.

        The default is the short Python class name.
        """
        return cls.__name__

    @classmethod
    def nlprp_version(cls) -> str:
        """
        Returns the processor's version for use in response to the NLPRP
        :ref:`list_processors <nlprp_list_processors>` command.

        The default is the current CRATE version.
        """
        return CRATE_VERSION

    @classmethod
    def nlprp_is_default_version(cls) -> bool:
        """
        Returns whether this processor is the default version of its name, for
        use in response to the NLPRP :ref:`list_processors
        <nlprp_list_processors>` command.

        The default is ``True``.
        """
        return True

    @classmethod
    def nlprp_description(cls) -> str:
        """
        Returns the processor's description for use in response to the NLPRP
        :ref:`list_processors <nlprp_list_processors>` command.

        Uses each processor's docstring, and reformats it slightly.
        """
        return compress_docstring(get_docstring(cls))

    def nlprp_server_processor(
        self, sql_dialect: Optional[str] = None
    ) -> NlprpServerProcessor:
        """
        Returns an :class:`NlprpServerProcessor` describing this processor,
        built from the NLPRP metadata methods above.
        """
        schema_info = self.nlprp_schema_info(sql_dialect)
        return NlprpServerProcessor(
            name=self.nlprp_name(),
            title=self.nlprp_title(),
            version=self.nlprp_version(),
            is_default_version=self.nlprp_is_default_version(),
            description=self.nlprp_description(),
            schema_type=schema_info[NlprpKeys.SCHEMA_TYPE],
            sql_dialect=schema_info.get(NlprpKeys.SQL_DIALECT),
            tabular_schema=schema_info.get(NlprpKeys.TABULAR_SCHEMA),
        )

    def nlprp_processor_info(
        self, sql_dialect: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Returns a dictionary suitable for use as this processor's response to
        the NLPRP :ref:`list_processors <nlprp_list_processors>` command.

        This is not a classmethod, because it may be specialized as we load
        external schema information (e.g. GATE processors).

        Args:
            sql_dialect:
                preferred SQL dialect for ``tabular_schema``
        """
        return self.nlprp_server_processor(sql_dialect).infodict

    def nlprp_processor_info_json(
        self,
        indent: int = 4,
        sort_keys: bool = True,
        sql_dialect: Optional[str] = None,
    ) -> str:
        """
        Returns a formatted JSON string from :func:`nlprp_schema_info`.
        This is primarily for debugging.

        Args:
            indent:
                number of spaces for indentation
            sort_keys:
                sort keys?
            sql_dialect:
                preferred SQL dialect for ``tabular_schema``, or ``None`` for
                default
        """
        json_structure = self.nlprp_processor_info(sql_dialect=sql_dialect)
        return json.dumps(json_structure, indent=indent, sort_keys=sort_keys)