Source code for crate_anon.nlp_manager.models

#!/usr/bin/env python

"""
crate_anon/nlp_manager/models.py

===============================================================================

    Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
    Created by Rudolf Cardinal (rnc1001@cam.ac.uk).

    This file is part of CRATE.

    CRATE is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    CRATE is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with CRATE. If not, see <https://www.gnu.org/licenses/>.

===============================================================================

**SQLAlchemy ORM models for the NLP progress database.**

"""

from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.schema import Column, Index, MetaData
from sqlalchemy.types import BigInteger, DateTime, String

from crate_anon.anonymise.constants import TABLE_KWARGS
from crate_anon.nlp_manager.constants import (
    HashClass,
    MAX_STRING_PK_LENGTH,
    SqlTypeDbIdentifier,
)

progress_meta = MetaData()
ProgressBase = declarative_base(metadata=progress_meta)

FN_SRCHASH = "srchash"


# =============================================================================
# Global constants
# =============================================================================

SqlTypeHash = HashClass("dummysalt").sqla_column_type()


# =============================================================================
# Record of progress
# =============================================================================


[docs]class NlpRecord(ProgressBase): """ Class to record the fact of processing a source record for a particular kind of NLP (and to keep a hash allowing identification of altered source contents later). """ __tablename__ = "crate_nlp_progress" __table_args__ = ( Index( "_idx1", # index name # index fields: "srcpkval", # integer and most specific "nlpdef", # usually >1 NLP def to 1 db/table/field combo "srcfield", # } roughly, more to less specific? "srctable", # } "srcdb", # } "srcpkstr", # last as we may not use it # - performance is critical here # - put them in descending order of specificity # https://stackoverflow.com/questions/2292662/how-important-is-the-order-of-columns-in-indexes # noqa # - start with srcpkval, as it's (a) specific and (b) integer # - srcpkfield: don't need to index, because the source table # can only have one PK # - srcpkstr: must include, since srcpkval can be non-unique, # due to hash collisions, if we're using a string # ... but ?should be last because we may not use it in # queries (for tables with integer PK) unique=True # Despite having a NULL field in a UNIQUE index, this is OK for # SQL Server 2008+ (https://stackoverflow.com/questions/767657) and # MySQL also seems happy. ), TABLE_KWARGS, ) # https://stackoverflow.com/questions/6626810/multiple-columns-index-when-using-the-declarative-orm-extension-of-sqlalchemy # noqa # http://docs.sqlalchemy.org/en/latest/orm/extensions/declarative/table_config.html # noqa pk = Column( "pk", BigInteger, primary_key=True, autoincrement=True, comment="PK of NLP record (no specific use)", ) srcdb = Column( "srcdb", SqlTypeDbIdentifier, comment="Source database" # primary_key=True ) srctable = Column( "srctable", SqlTypeDbIdentifier, comment="Source table name" # primary_key=True ) srcpkfield = Column( "srcpkfield", SqlTypeDbIdentifier, comment="Primary key column name in source table (for info only)", ) srcpkval = Column( "srcpkval", BigInteger, comment="Primary key value in source table (or hash if PK is a string)" # primary_key=True ) srcpkstr = Column( "srcpkstr", String(MAX_STRING_PK_LENGTH), comment=f"Original string PK, used when the table has a string PK, to " f"deal with hash collisions. Max length: " f"{MAX_STRING_PK_LENGTH}" # primary_key=True, default='' # can't have a NULL in a composite PK ) srcfield = Column( "srcfield", SqlTypeDbIdentifier, comment="Name of column in source field containing actual data" # primary_key=True ) nlpdef = Column( "nlpdef", SqlTypeDbIdentifier, comment="Name of natural language processing definition that source " "was processed for" # primary_key=True ) whenprocessedutc = Column( "whenprocessedutc", DateTime, comment="Time that NLP record was processed (batch time that the run " "was commenced for that NLP definition; UTC)", ) srchash = Column( FN_SRCHASH, SqlTypeHash, comment="Secure hash of source field contents at the time of " "processing", )