
"""
crate_anon/nlp_manager/input_field_config.py

===============================================================================

    Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
    Created by Rudolf Cardinal (rnc1001@cam.ac.uk).

    This file is part of CRATE.

    CRATE is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    CRATE is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with CRATE. If not, see <https://www.gnu.org/licenses/>.

===============================================================================

**Class to define input fields for NLP.**

"""

import logging

# import sys
from typing import Any, Dict, Generator, List, Optional, Tuple

from cardinal_pythonlib.datetimefunc import get_now_utc_notz_datetime
from cardinal_pythonlib.hash import hash64
from cardinal_pythonlib.sql.validation import (
    ensure_valid_field_name,
    ensure_valid_table_name,
)
from cardinal_pythonlib.sqlalchemy.core_query import count_star
from cardinal_pythonlib.sqlalchemy.schema import (
    is_sqlatype_integer,
    get_column_type,
    table_or_view_exists,
)
from cardinal_pythonlib.timing import MultiTimerContext, timer
from sqlalchemy import BigInteger, Column, DateTime, Index, String, Table
from sqlalchemy.engine.base import Engine
from sqlalchemy.orm.session import Session
from sqlalchemy.sql import and_, column, exists, null, or_, select, table
from sqlalchemy.sql.schema import MetaData

from crate_anon.common.parallel import is_my_job_by_hash_prehashed
from crate_anon.common.stringfunc import relevant_for_nlp
from crate_anon.nlp_manager.constants import (
    FN_CRATE_VERSION_FIELD,
    FN_WHEN_FETCHED,
    FN_NLPDEF,
    FN_PK,
    FN_SRCDATETIMEFIELD,
    FN_SRCDATETIMEVAL,
    FN_SRCDB,
    FN_SRCTABLE,
    FN_SRCPKFIELD,
    FN_SRCPKVAL,
    FN_SRCPKSTR,
    FN_SRCFIELD,
    TRUNCATED_FLAG,
    InputFieldConfigKeys,
    MAX_SEMANTIC_VERSION_STRING_LENGTH,
    MAX_STRING_PK_LENGTH,
)
from crate_anon.nlp_manager.constants import (
    full_sectionname,
    SqlTypeDbIdentifier,
)
from crate_anon.nlp_manager.models import NlpRecord
from crate_anon.nlp_manager.nlp_definition import (
    NlpConfigPrefixes,
    NlpDefinition,
)
from crate_anon.version import CRATE_VERSION

log = logging.getLogger(__name__)

TIMING_GEN_TEXT_SQL_SELECT = "gen_text_sql_select"
TIMING_PROCESS_GEN_TEXT = "process_generated_text"
TIMING_PROGRESS_DB_SELECT = "progress_db_select"
TIMING_PROGRESS_DB_DELETE = "progress_db_delete"


# =============================================================================
# Input field definition
# =============================================================================


class InputFieldConfig:
    """
    Class defining an input field for NLP (containing text).

    See the documentation for the :ref:`NLP config file <nlp_config>`.
    """

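    # Illustrative config section (a sketch, not copied from the CRATE
    # docs): the section name assumes the usual ``input:<name>`` prefixing
    # applied by full_sectionname(), and the database/table/field names are
    # hypothetical; the keys are those read by __init__ below.
    #
    #   [input:my_notes]
    #   srcdb = my_source_db
    #   srctable = notes
    #   srcpkfield = note_id
    #   srcfield = note_text
    #   srcdatetimefield = note_datetime
    #   copyfields =
    #       patient_id
    #   indexed_copyfields =
    #       patient_id
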
    def __init__(self, nlpdef: NlpDefinition, cfg_input_name: str) -> None:
        """
        Read config from a configparser section, and also associate with a
        specific NLP definition.

        Args:
            nlpdef:
                :class:`crate_anon.nlp_manager.nlp_definition.NlpDefinition`,
                the master NLP definition, referring to the master config
                file etc.
            cfg_input_name:
                config section name for the input field definition
        """
        self.name = cfg_input_name
        cfg = nlpdef.get_config_section(
            full_sectionname(NlpConfigPrefixes.INPUT, cfg_input_name)
        )
        self._nlpdef = nlpdef

        self._srcdb = cfg.opt_str(InputFieldConfigKeys.SRCDB)
        self._srctable = cfg.opt_str(InputFieldConfigKeys.SRCTABLE)
        self._srcpkfield = cfg.opt_str(InputFieldConfigKeys.SRCPKFIELD)
        self._srcfield = cfg.opt_str(InputFieldConfigKeys.SRCFIELD)
        self._srcdatetimefield = cfg.opt_str(
            InputFieldConfigKeys.SRCDATETIMEFIELD, required=False
        )  # ... new in v0.18.52
        # Make these case-sensitive to avoid our failure in renaming SQLA
        # Column objects to be lower-case:
        self._copyfields = cfg.opt_multiline(
            InputFieldConfigKeys.COPYFIELDS
        )  # fieldnames
        self._indexed_copyfields = cfg.opt_multiline(
            InputFieldConfigKeys.INDEXED_COPYFIELDS
        )
        self._debug_row_limit = cfg.opt_int(
            InputFieldConfigKeys.DEBUG_ROW_LIMIT, default=0
        )
        # self._fetch_sorted = opt_bool('fetch_sorted', default=True)

        ensure_valid_table_name(self._srctable)
        ensure_valid_field_name(self._srcpkfield)
        ensure_valid_field_name(self._srcfield)
        if self._srcdatetimefield:
            ensure_valid_field_name(self._srcdatetimefield)

        if len(set(self._indexed_copyfields)) != len(
            self._indexed_copyfields
        ):
            raise ValueError(
                f"Redundant indexed_copyfields: {self._indexed_copyfields}"
            )

        if len(set(self._copyfields)) != len(self._copyfields):
            raise ValueError(f"Redundant copyfields: {self._copyfields}")

        indexed_not_copied = set(self._indexed_copyfields) - set(
            self._copyfields
        )
        if indexed_not_copied:
            raise ValueError(
                f"Fields in indexed_copyfields but not in "
                f"copyfields: {indexed_not_copied}"
            )

        # allfields = [self._srcpkfield, self._srcfield] + self._copyfields
        # if len(allfields) != len(set(allfields)):
        #     raise ValueError(
        #         f"Field overlap in InputFieldConfig: {section}")
        # RE-THOUGHT: OK to copy source text fields etc. if desired.
        # It's fine in SQL to say SELECT a, a FROM mytable;

        self._db = nlpdef.get_database(self._srcdb)

    @property
    def srcdb(self) -> str:
        """
        Returns the name of the source database.
        """
        return self._srcdb

    @property
    def srctable(self) -> str:
        """
        Returns the name of the source table.
        """
        return self._srctable

    @property
    def srcpkfield(self) -> str:
        """
        Returns the name of the primary key (PK) field (column) in the
        source table.
        """
        return self._srcpkfield

    @property
    def srcfield(self) -> str:
        """
        Returns the name of the text field (column) in the source table.
        """
        return self._srcfield

    @property
    def srcdatetimefield(self) -> str:  # new in v0.18.52
        """
        Returns the name of the field (column) in the source table that
        defines the date/time of the source text.
        """
        return self._srcdatetimefield

    @property
    def source_session(self) -> Session:
        """
        Returns the SQLAlchemy ORM :class:`Session` for the source database.
        """
        return self._db.session

    @property
    def _source_metadata(self) -> MetaData:
        """
        Returns the SQLAlchemy :class:`MetaData` for the source database,
        used for reflection (inspection) of the source database structure.
        """
        return self._db.metadata

    @property
    def _source_engine(self) -> Engine:
        """
        Returns the SQLAlchemy Core :class:`Engine` for the source database.
        """
        return self._db.engine

    @property
    def _progress_session(self) -> Session:
        """
        Returns the SQLAlchemy ORM :class:`Session` for the progress
        database.
        """
        return self._nlpdef.progressdb_session

    @staticmethod
    def get_core_columns_for_dest() -> List[Column]:
        """
        Returns the columns used in NLP destination tables, primarily
        describing the source. See :ref:`Standard NLP output columns
        <standard_nlp_output_columns>`.

        Returns:
            a list of SQLAlchemy :class:`Column` objects
        """
        return [
            Column(
                FN_PK,
                BigInteger,
                primary_key=True,
                autoincrement=True,
                comment="Arbitrary primary key (PK) of output record",
            ),
            Column(
                FN_NLPDEF,
                SqlTypeDbIdentifier,
                comment="Name of the NLP definition producing this row",
            ),
            Column(
                FN_SRCDB,
                SqlTypeDbIdentifier,
                comment="Source database name (from CRATE NLP config)",
            ),
            Column(
                FN_SRCTABLE, SqlTypeDbIdentifier, comment="Source table name"
            ),
            Column(
                FN_SRCPKFIELD,
                SqlTypeDbIdentifier,
                comment="PK field (column) name in source table",
            ),
            Column(
                FN_SRCPKVAL,
                BigInteger,
                comment="PK of source record (or integer hash of PK if the "
                "PK is a string)",
            ),
            Column(
                FN_SRCPKSTR,
                String(MAX_STRING_PK_LENGTH),
                comment=f"NULL if the table has an integer PK, but the PK "
                f"if the PK was a string, to deal with hash "
                f"collisions. Max length: {MAX_STRING_PK_LENGTH}",
            ),
            Column(
                FN_SRCFIELD,
                SqlTypeDbIdentifier,
                comment="Field (column) name of source text",
            ),
            Column(
                FN_SRCDATETIMEFIELD,
                SqlTypeDbIdentifier,
                comment="Date/time field (column) name in source table",
            ),
            Column(
                FN_SRCDATETIMEVAL,
                DateTime,
                nullable=True,
                comment="Date/time of source field",
            ),
            Column(
                FN_CRATE_VERSION_FIELD,
                String(MAX_SEMANTIC_VERSION_STRING_LENGTH),
                nullable=True,
                comment="Version of CRATE that generated this NLP record.",
            ),
            Column(
                FN_WHEN_FETCHED,
                DateTime,
                nullable=True,
                comment="Date/time that the NLP processor fetched the "
                "record from the source database (in UTC).",
            ),
        ]

    @staticmethod
    def get_core_indexes_for_dest() -> List[Index]:
        """
        Returns the core indexes to be applied to the destination tables.
        Primarily, these are for columns that refer to the source.

        Returns:
            a list of SQLAlchemy :class:`Index` objects

        See

        - https://stackoverflow.com/questions/179085/multiple-indexes-vs-multi-column-indexes
        """  # noqa
        return [
            Index(
                "_idx_srcref",
                # Remember, order matters; more to less specific.
                # See also BaseNlpParser.delete_dest_record
                FN_SRCPKVAL,
                FN_NLPDEF,
                FN_SRCFIELD,
                FN_SRCTABLE,
                FN_SRCDB,
                FN_SRCPKSTR,
            ),
            Index("_idx_srcdate", FN_SRCDATETIMEVAL),
            Index(
                "_idx_deletion",
                # We sometimes delete just using the following; see
                # BaseNlpParser.delete_where_srcpk_not
                FN_NLPDEF,
                FN_SRCFIELD,
                FN_SRCTABLE,
                FN_SRCDB,
            ),
        ]

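    # For example (a sketch; the real destination column names are the
    # values of the FN_* constants above, and the SQL actually generated
    # will differ), the composite _idx_srcref index can serve a lookup like
    #
    #   SELECT ... FROM dest_table
    #   WHERE <FN_SRCPKVAL> = ? AND <FN_NLPDEF> = ? AND <FN_SRCFIELD> = ?
    #     AND <FN_SRCTABLE> = ? AND <FN_SRCDB> = ?
    #
    # because those columns form a prefix of the index's column order.
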
    def _require_table_or_view_exists(self) -> None:
        """
        Ensure that the source table exists, or raise :exc:`RuntimeError`.
        """
        if not table_or_view_exists(self._source_engine, self._srctable):
            raise RuntimeError(
                f"Missing source table: {self._srcdb}.{self._srctable}"
            )

    def get_copy_columns(self) -> List[Column]:
        """
        Returns the columns that the user has requested to be copied from
        the source table to the NLP destination table.

        Returns:
            a list of SQLAlchemy :class:`Column` objects
        """
        # We read the column type from the source database.
        self._require_table_or_view_exists()
        meta = self._source_metadata
        t = Table(self._srctable, meta, autoload=True)
        copy_columns = []  # type: List[Column]
        processed_copy_column_names = []  # type: List[str]
        for c in t.columns:
            # if c.name.lower() in self._copyfields:
            if c.name in self._copyfields:
                copied = c.copy()
                # Force lower case:
                # copied.name = copied.name.lower()
                # copied.name = quoted_name(copied.name.lower(), None)
                # ... this is not working properly. Keep getting an
                # "Unconsumed column names" error with e.g. a source field
                # of "Text".
                # Try making copyfields case-sensitive instead.
                copy_columns.append(copied)
                processed_copy_column_names.append(c.name)
        # Check all requested fields are present:
        missing = set(self._copyfields) - set(processed_copy_column_names)
        if missing:
            raise RuntimeError(
                f"The following fields were requested to be copied but are "
                f"absent from the source (NB case-sensitive): {missing}"
            )
        # log.critical(copy_columns)
        return copy_columns

    def get_copy_indexes(self) -> List[Index]:
        """
        Returns indexes that should be made in the destination table for
        columns that the user has requested to be copied from the source.

        Returns:
            a list of SQLAlchemy :class:`Index` objects
        """
        self._require_table_or_view_exists()
        meta = self._source_metadata
        t = Table(self._srctable, meta, autoload=True)
        copy_indexes = []  # type: List[Index]
        processed_copy_index_col_names = []  # type: List[str]
        for c in t.columns:
            # if c.name.lower() in self._indexed_copyfields:
            if c.name in self._indexed_copyfields:
                copied = c.copy()
                # See above re case.
                idx_name = f"idx_{c.name}"
                copy_indexes.append(Index(idx_name, copied))
                processed_copy_index_col_names.append(c.name)
        missing = set(self._indexed_copyfields) - set(
            processed_copy_index_col_names
        )
        if missing:
            raise ValueError(
                f"The following fields were requested to be copied/indexed "
                f"but are absent from the source (NB case-sensitive): "
                f"{missing}"
            )
        return copy_indexes

    def is_pk_integer(self) -> bool:
        """
        Is the primary key (PK) of the source table an integer?
        """
        pkcoltype = get_column_type(
            self._source_engine, self._srctable, self._srcpkfield
        )
        if not pkcoltype:
            raise ValueError(
                f"Unable to get column type for column "
                f"{self._srctable}.{self._srcpkfield}"
            )
        pk_is_integer = is_sqlatype_integer(pkcoltype)
        # log.debug(f"pk_is_integer: {pkcoltype!r} -> {pk_is_integer}")
        return pk_is_integer

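    # Illustration (a sketch with made-up values): for a string PK such as
    # "ABC123", gen_text() below stores hash64("ABC123") under FN_SRCPKVAL
    # and the original string under FN_SRCPKSTR, so that two strings
    # colliding on the same 64-bit hash can still be told apart:
    #
    #   pkval = "ABC123"  # hypothetical string PK
    #   srcpkval, srcpkstr = hash64(pkval), pkval  # int surrogate, original
    #
    # Integer PKs are stored directly, with FN_SRCPKSTR left NULL.
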
    def gen_text(
        self, tasknum: int = 0, ntasks: int = 1
    ) -> Generator[Tuple[str, Dict[str, Any]], None, None]:
        """
        Generate text strings from the source database, for NLP. Text fields
        that are NULL, empty, or contain only whitespace are skipped.

        Yields:
            tuple: ``text, dict``, where ``text`` is the source text and
            ``dict`` is a column-to-value mapping for all other fields
            (source reference fields, copy fields).
        """
        if 1 < ntasks <= tasknum:
            raise RuntimeError(
                f"Invalid tasknum {tasknum}; must be <{ntasks}"
            )

        # ---------------------------------------------------------------
        # Values that are constant across all items we will generate
        # (i.e. database/field *names*, plus CRATE version info)
        # ---------------------------------------------------------------
        base_dict = {
            FN_SRCDB: self._srcdb,
            FN_SRCTABLE: self._srctable,
            FN_SRCPKFIELD: self._srcpkfield,
            FN_SRCFIELD: self._srcfield,
            FN_SRCDATETIMEFIELD: self._srcdatetimefield,
            FN_CRATE_VERSION_FIELD: CRATE_VERSION,
        }

        # ---------------------------------------------------------------
        # Build a query
        # ---------------------------------------------------------------
        session = self.source_session
        pkcol = column(self._srcpkfield)
        # ... don't use is_sqlatype_integer with this; it's a column clause,
        # not a full column definition.
        pk_is_integer = self.is_pk_integer()

        # Core columns
        colindex_pk = 0
        colindex_text = 1
        colindex_datetime = 2
        colindex_remainder_start = 3
        selectcols = [
            pkcol,
            column(self._srcfield),
            (
                column(self._srcdatetimefield)
                if self._srcdatetimefield
                else null()
            ),
        ]
        # User-specified extra columns
        for extracol in self._copyfields:
            selectcols.append(column(extracol))
        query = select(selectcols).select_from(table(self._srctable))
        # not ordered...
        # if self._fetch_sorted:
        #     query = query.order_by(pkcol)

        # ---------------------------------------------------------------
        # Plan our parallel-processing approach
        # ---------------------------------------------------------------
        distribute_by_hash = False
        if ntasks > 1:
            if pk_is_integer:
                # Integer PK, so we can be efficient and bake the parallel
                # processing work division into the SQL:
                query = query.where(pkcol % ntasks == tasknum)
            else:
                distribute_by_hash = True

        # ---------------------------------------------------------------
        # Execute the query
        # ---------------------------------------------------------------
        nrows_returned = 0
        with MultiTimerContext(timer, TIMING_GEN_TEXT_SQL_SELECT):
            when_fetched = get_now_utc_notz_datetime()
            result = session.execute(query)
        for row in result:  # ... "result" is a generator
            with MultiTimerContext(timer, TIMING_PROCESS_GEN_TEXT):
                # Get PK value
                pkval = row[colindex_pk]
                # Deal with non-integer PKs
                if pk_is_integer:
                    hashed_pk = None  # remove warning about reference before assignment  # noqa
                else:
                    hashed_pk = hash64(pkval)
                    if (
                        distribute_by_hash
                        and not is_my_job_by_hash_prehashed(
                            hashed_pk, tasknum, ntasks
                        )
                    ):
                        continue

                # Optional debug limit on the number of rows
                if 0 < self._debug_row_limit <= nrows_returned:
                    log.warning(
                        f"Table {self._srcdb}.{self._srctable}: not "
                        f"fetching more than {self._debug_row_limit} rows "
                        f"(in total for this process) due to debugging "
                        f"limits"
                    )
                    result.close()
                    # https://docs.sqlalchemy.org/en/latest/core/connections.html  # noqa
                    return

                # Get text
                text = row[colindex_text]
                # Skip text that is absent/empty/contains only whitespace:
                if not relevant_for_nlp(text):
                    continue
                # We don't modify (e.g. strip) the text, because our NLP
                # processor may return relevant character positions, so we
                # want those to be correct with respect to the source.

                # Get everything else
                other_values = dict(
                    zip(self._copyfields, row[colindex_remainder_start:])
                )
                if pk_is_integer:
                    other_values[FN_SRCPKVAL] = pkval  # an integer
                    other_values[FN_SRCPKSTR] = None
                else:
                    # hashed_pk will have been set above
                    other_values[FN_SRCPKVAL] = hashed_pk  # an integer
                    other_values[FN_SRCPKSTR] = pkval  # a string etc.
                other_values[FN_SRCDATETIMEVAL] = row[colindex_datetime]
                other_values[FN_WHEN_FETCHED] = when_fetched
                other_values.update(base_dict)

                if (
                    self._nlpdef.truncate_text_at
                    and len(text) > self._nlpdef.truncate_text_at
                ):
                    text = text[: self._nlpdef.truncate_text_at]
                    other_values[TRUNCATED_FLAG] = True
                else:
                    other_values[TRUNCATED_FLAG] = False

                # Yield the result
                yield text, other_values
                nrows_returned += 1

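    # Illustrative use of gen_text() (a sketch; ``ifconfig`` and
    # ``process()`` are hypothetical, and this is not how the NLP manager
    # itself drives the generator):
    #
    #   for text, extras in ifconfig.gen_text(tasknum=0, ntasks=4):
    #       # ``text`` is one source text; ``extras`` maps destination
    #       # column names (FN_SRCPKVAL, copy fields, etc.) to values.
    #       process(text, extras)
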
    def get_count(self) -> int:
        """
        Counts records in the source table. Used for progress monitoring.
        """
        return count_star(
            session=self.source_session, tablename=self._srctable
        )

    def get_progress_record(
        self, srcpkval: int, srcpkstr: str = None
    ) -> Optional[NlpRecord]:
        """
        Fetch a progress record for the given source record, if one exists.

        Returns:
            :class:`crate_anon.nlp_manager.models.NlpRecord`, or ``None``
        """
        session = self._progress_session
        query = (
            session.query(NlpRecord)
            .filter(NlpRecord.srcdb == self._srcdb)
            .filter(NlpRecord.srctable == self._srctable)
            .filter(NlpRecord.srcpkval == srcpkval)
            .filter(NlpRecord.srcfield == self._srcfield)
            .filter(NlpRecord.nlpdef == self._nlpdef.name)
            # Order not important (though the order of the index certainly
            # is; see NlpRecord.__table_args__).
            # https://stackoverflow.com/questions/11436469/does-order-of-where-clauses-matter-in-sql  # noqa
        )
        if srcpkstr is not None:
            query = query.filter(NlpRecord.srcpkstr == srcpkstr)
        # log.critical(query)
        with MultiTimerContext(timer, TIMING_PROGRESS_DB_SELECT):
            # This was surprisingly slow under SQL Server testing.
            return query.one_or_none()

    def gen_src_pks(
        self,
    ) -> Generator[Tuple[int, Optional[str]], None, None]:
        """
        Generate integer PKs from the source table.

        For tables with an integer PK, yields tuples: ``pk_value, None``.

        For tables with a string PK, yields tuples: ``pk_hash, pk_value``.

        - Timing is subsumed under the timer named
          ``TIMING_DELETE_WHERE_NO_SOURCE``.
        """
        session = self.source_session
        query = select([column(self._srcpkfield)]).select_from(
            table(self._srctable)
        )
        result = session.execute(query)
        if self.is_pk_integer():
            for row in result:
                yield row[0], None
        else:
            for row in result:
                pkval = row[0]
                yield hash64(pkval), pkval

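    # For example (a sketch with made-up values; ``ifconfig`` is a
    # hypothetical InputFieldConfig instance):
    #
    #   list(ifconfig.gen_src_pks())
    #   # integer-PK table: [(1, None), (2, None), ...]
    #   # string-PK table:  [(hash64("A1"), "A1"), (hash64("B2"), "B2"), ...]
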
    def delete_progress_records_where_srcpk_not(
        self, temptable: Optional[Table]
    ) -> None:
        """
        If ``temptable`` is None, deletes all progress records for this
        input field/NLP definition.

        If ``temptable`` is a table, deletes records from the progress
        database (from this input field/NLP definition) whose source PK is
        not in the temporary table. (Used for deleting NLP records when the
        source has subsequently been deleted.)
        """
        progsession = self._progress_session
        log.debug(
            f"delete_progress_records_where_srcpk_not... "
            f"{self._srcdb}.{self._srctable} -> progressdb"
        )
        prog_deletion_query = (
            progsession.query(NlpRecord)
            .filter(NlpRecord.srcdb == self._srcdb)
            .filter(NlpRecord.srctable == self._srctable)
            # unnecessary:
            # .filter(NlpRecord.srcpkfield == self._srcpkfield)
            .filter(NlpRecord.nlpdef == self._nlpdef.name)
        )
        if temptable is not None:
            log.debug("... deleting selectively")
            temptable_pkvalcol = temptable.columns[FN_SRCPKVAL]
            temptable_pkstrcol = temptable.columns[FN_SRCPKSTR]
            prog_deletion_query = prog_deletion_query.filter(
                ~exists().where(
                    and_(
                        NlpRecord.srcpkval == temptable_pkvalcol,
                        or_(
                            NlpRecord.srcpkstr == temptable_pkstrcol,
                            and_(
                                NlpRecord.srcpkstr.is_(None),
                                temptable_pkstrcol.is_(None),
                            ),
                        ),
                    )
                )
            )
        else:
            log.debug("... deleting all")
        with MultiTimerContext(timer, TIMING_PROGRESS_DB_DELETE):
            prog_deletion_query.delete(synchronize_session=False)
            # https://docs.sqlalchemy.org/en/latest/orm/query.html#sqlalchemy.orm.query.Query.delete  # noqa
        self._nlpdef.commit(progsession)

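    # The selective deletion above corresponds in spirit to SQL like this
    # (a sketch; the table/column names are illustrative, not those
    # SQLAlchemy actually emits):
    #
    #   DELETE FROM progress_table
    #   WHERE nlpdef = ? AND srcdb = ? AND srctable = ?
    #     AND NOT EXISTS (
    #         SELECT 1 FROM temp_pks t
    #         WHERE t.pkval = progress_table.srcpkval
    #           AND (t.pkstr = progress_table.srcpkstr
    #                OR (t.pkstr IS NULL
    #                    AND progress_table.srcpkstr IS NULL))
    #     )
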
    def delete_all_progress_records(self) -> None:
        """
        Deletes **all** records from the progress database for this NLP
        definition (across all source tables/columns).
        """
        progsession = self._progress_session
        prog_deletion_query = progsession.query(NlpRecord).filter(
            NlpRecord.nlpdef == self._nlpdef.name
        )
        log.debug(
            f"delete_all_progress_records for NLP definition: "
            f"{self._nlpdef.name}"
        )
        with MultiTimerContext(timer, TIMING_PROGRESS_DB_DELETE):
            prog_deletion_query.delete(synchronize_session=False)
        self._nlpdef.commit(progsession)