Source code for crate_anon.anonymise.make_demo_database

#!/usr/bin/env python

"""
crate_anon/anonymise/make_demo_database.py

===============================================================================

    Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
    Created by Rudolf Cardinal (rnc1001@cam.ac.uk).

    This file is part of CRATE.

    CRATE is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    CRATE is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with CRATE. If not, see <https://www.gnu.org/licenses/>.

===============================================================================

**Makes a test database (from tiny to large) for anonymisation testing.**

See also:

- https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3751474/
- https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3751474/table/T7/

After anonymisation, check with:

.. code-block:: sql

    SELECT * FROM anonymous_output.notes WHERE brcid IN (
        SELECT brcid
        FROM anonymous_mapping.secret_map
        WHERE patient_id < 2
    );
    SELECT * FROM test.patients WHERE patient_id < 2;

"""

import argparse
import datetime
import enum
import logging
import os
import random
import subprocess
from typing import TYPE_CHECKING

from cardinal_pythonlib.datetimefunc import pendulum_to_datetime
from cardinal_pythonlib.logs import configure_logger_for_colour
from pendulum import DateTime as Pendulum  # NB name clash with SQLAlchemy
from rich_argparse import ArgumentDefaultsRichHelpFormatter
from sqlalchemy import (
    create_engine,
    BigInteger,
    Boolean,
    Column,
    Date,
    DateTime,  # NB name clash with pendulum
    Enum,
    ForeignKey,
    Integer,
    LargeBinary,
    MetaData,
    String,
    Text,
)
from sqlalchemy.ext.compiler import compiles
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import relationship, sessionmaker
from sqlalchemy.sql import text

from crate_anon.anonymise.constants import (
    CHARSET,
    TABLE_KWARGS,
)
from crate_anon.common.constants import EnvVar

if TYPE_CHECKING:
    from sqlalchemy.sql.type_api import TypeEngine
    from sqlalchemy.sql.compiler import SQLCompiler

log = logging.getLogger(__name__)
metadata = MetaData()
Base = declarative_base(metadata=metadata)

# =============================================================================
# Constants
# =============================================================================

CONSOLE_ENCODING = "utf8"
REPORT_EVERY = 50
BASE_DOB = datetime.date(day=1, month=10, year=1980)
DT_FORMATS = [
    "%d %b %Y",  # e.g. 24 Jul 2013
    "%d %B %Y",  # e.g. 24 July 2013
    "%a %d %B %Y",  # e.g. Wed 24 July 2013
    "%d %B %Y, %H:%M %z",  # ... e.g. 24 July 2013, 20:04 +0100
    "%a %d %B %Y, %H:%M %z",  # ... e.g. Wed 24 July 2013, 20:04 +0100
    "%a %d %B %Y, %H:%M",  # ... e.g. Wed 24 July 2013, 20:04
    "%a %d %b %Y, %H:%M",  # ... e.g. Wed 24 Jul 2013, 20:04
    "%d %B %Y, %H:%M:%S %z",
    "%d %b %Y, %H:%M %z",
    "%d %b %Y, %H:%M:%S %z",
    "%H:%M",
    "%Y-%m-%dT%H:%M:%S%z",  # e.g. 2013-07-24T20:04:07+0100
    "%Y-%m-%d",  # e.g. 2013-07-24
    "%Y-%m-%dT%H%M",  # e.g. 20130724T2004
    "%Y-%m-%d",  # e.g. 20130724
    "%Y%m%d%H%M%S%z",  # e.g. 20130724200407+0100
    "%Y%m%d",  # e.g. 20130724
    "%Y-%m-%dT%H:%M:%SZ",  # e.g. 2013-07-24T20:03:07Z
    "%d/%m/%Y %H:%M",  # e.g. 01/12/2014 09:45
]
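
# A quick sanity check for the formats above (illustrative only; not executed
# by this module; note that "%z" renders as an empty string for a naive
# datetime):
#
#   _example = datetime.datetime(2013, 7, 24, 20, 4, 7)
#   for _fmt in DT_FORMATS:
#       print(_example.strftime(_fmt))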

CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))

if EnvVar.GENERATING_CRATE_DOCS in os.environ:
    DEFAULT_DOCDIR = "/path/to/test_docs"
else:
    DEFAULT_DOCDIR = os.path.abspath(
        os.path.join(CURRENT_DIR, os.pardir, "testdocs_for_text_extraction")
    )

DEFAULT_DOCTEST_DOC = os.path.join(DEFAULT_DOCDIR, "doctest.doc")
DEFAULT_DOCTEST_DOCX = os.path.join(DEFAULT_DOCDIR, "doctest.docx")
DEFAULT_DOCTEST_ODT = os.path.join(DEFAULT_DOCDIR, "doctest.odt")
DEFAULT_DOCTEST_PDF = os.path.join(DEFAULT_DOCDIR, "doctest.pdf")

MAX_EXT_LENGTH_WITH_DOT = 10


# =============================================================================
# BLOB type
# =============================================================================

# http://docs.sqlalchemy.org/en/latest/core/custom_types.html
# noinspection PyUnusedLocal
@compiles(LargeBinary, "mysql")
def compile_blob_mysql(
    type_: "TypeEngine", compiler: "SQLCompiler", **kw
) -> str:
    """
    Provides a custom type for the SQLAlchemy ``LargeBinary`` type under
    MySQL, by using ``LONGBLOB`` (which overrides the default of ``BLOB``).

    MySQL: https://dev.mysql.com/doc/refman/5.7/en/storage-requirements.html

    .. code-block:: none

        TINYBLOB: up to 2^8 bytes
        BLOB: up to 2^16 bytes = 64 KiB
        MEDIUMBLOB: up to 2^24 bytes = 16 MiB  <-- minimum for docs
        LONGBLOB: up to 2^32 bytes = 4 GiB

        VARBINARY: up to 65535 = 64 KiB

    SQL Server: https://msdn.microsoft.com/en-us/library/ms188362.aspx

    .. code-block:: none

        BINARY: up to 8000 bytes = 8 KB
        VARBINARY(MAX): up to 2^31 - 1 bytes = 2 GiB  <-- minimum for docs
        IMAGE: deprecated; up to 2^31 - 1 bytes = 2 GiB
            https://msdn.microsoft.com/en-us/library/ms187993.aspx

    SQL Alchemy:

    .. code-block:: none

        _Binary: base class
        LargeBinary: translates to BLOB in MySQL
        VARBINARY, as an SQL base data type
        dialects.mysql.base.LONGBLOB
        dialects.mssql.base.VARBINARY

    Therefore, we can take the LargeBinary type and modify it.
    """
    return "LONGBLOB"  # would have been "BLOB"
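
# A minimal sketch of the effect of the @compiles override above (an
# illustration, not part of this module; the table and column names are
# invented). Once this module has been imported, compiling DDL for a
# LargeBinary column under the MySQL dialect should emit LONGBLOB:
#
#   from sqlalchemy import Column, Integer, LargeBinary, MetaData, Table
#   from sqlalchemy.dialects import mysql
#   from sqlalchemy.schema import CreateTable
#
#   demo = Table(
#       "demo_blob",
#       MetaData(),
#       Column("id", Integer, primary_key=True),
#       Column("payload", LargeBinary),
#   )
#   print(CreateTable(demo).compile(dialect=mysql.dialect()))
#   # ... the emitted DDL should contain "payload LONGBLOB"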

# If this goes wrong for future versions of SQL Server, write another
# specializer to produce "VARBINARY(MAX)" instead of "IMAGE". I haven't done
# that because it may be that SQL Alchemy is reading the SQL Server version
# (it definitely executes "select @@version") and specializing accordingly.


# =============================================================================
# A silly enum
# =============================================================================

class EnumColours(enum.Enum):
    """
    A silly enum, for testing.
    """

    red = 1
    green = 2
    blue = 3


# =============================================================================
# Randomness
# =============================================================================

def coin(p: float = 0.5) -> bool:
    """
    Biased coin toss. Returns ``True`` with probability ``p``.
    """
    return random.random() < p


# =============================================================================
# Tables
# =============================================================================

class Patient(Base):
    """
    SQLAlchemy ORM class for fictional patients.
    """

    __tablename__ = "patient"
    __table_args__ = TABLE_KWARGS

    patient_id = Column(Integer, primary_key=True, autoincrement=False)
    forename = Column(String(50))
    surname = Column(String(50))
    dob = Column(Date)
    nullfield = Column(Integer)
    nhsnum = Column(BigInteger)
    phone = Column(String(50))
    postcode = Column(String(50))
    optout = Column(Boolean, default=False)
    related_patient_id = Column(Integer)
    colour = Column(Enum(EnumColours), nullable=True)  # new in v0.18.41


class Note(Base):
    """
    SQLAlchemy ORM class for fictional notes.
    """

    __tablename__ = "note"
    __table_args__ = TABLE_KWARGS

    note_id = Column(Integer, primary_key=True)
    patient_id = Column(Integer, ForeignKey("patient.patient_id"))
    note = Column(Text)
    note_datetime = Column(DateTime)

    patient = relationship("Patient")


class BlobDoc(Base):
    """
    SQLAlchemy ORM class for fictional binary documents.
    """

    __tablename__ = "blobdoc"
    __table_args__ = TABLE_KWARGS

    blob_doc_id = Column(Integer, primary_key=True)
    patient_id = Column(Integer, ForeignKey("patient.patient_id"))
    blob = Column(LargeBinary)  # modified as above!
    extension = Column(String(MAX_EXT_LENGTH_WITH_DOT))
    blob_datetime = Column(DateTime)

    patient = relationship("Patient")

    def __init__(
        self,
        patient: Patient,
        filename: str,
        blob_datetime: datetime.datetime,
    ) -> None:
        """
        Args:
            patient: corresponding :class:`Patient` object
            filename: filename containing the binary document to load and
                store in the database
            blob_datetime: date/time value to give this BLOB
        """
        _, extension = os.path.splitext(filename)
        with open(filename, "rb") as f:
            contents = f.read()  # will be of type 'bytes'
        # noinspection PyArgumentList
        super().__init__(
            patient=patient,
            blob=contents,
            extension=extension,
            blob_datetime=blob_datetime,
        )


class FilenameDoc(Base):
    """
    SQLAlchemy ORM class for a table containing the filenames of binary
    documents.
    """

    __tablename__ = "filenamedoc"
    __table_args__ = TABLE_KWARGS

    filename_doc_id = Column(Integer, primary_key=True)
    patient_id = Column(Integer, ForeignKey("patient.patient_id"))
    filename = Column(Text)
    file_datetime = Column(DateTime)

    patient = relationship("Patient")
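

# Minimal programmatic sketch (an assumption for illustration; not part of
# this module): the ORM classes above can be exercised against a throwaway
# in-memory SQLite database, mirroring on a tiny scale what main() does
# below, using the imports already present at the top of this module:
#
#   _engine = create_engine("sqlite://")
#   metadata.create_all(_engine)
#   _session = sessionmaker(bind=_engine)()
#   _p = Patient(patient_id=1, forename="Test", surname="Patient",
#                dob=datetime.date(1980, 10, 1))
#   _session.add(_p)
#   _session.add(Note(patient=_p, note="An example note.",
#                     note_datetime=datetime.datetime(2000, 1, 1, 9, 0)))
#   _session.commit()
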
# noinspection PyPep8Naming
def main() -> None:
    """
    Command-line processor. See command-line help.
    """
    default_size = 0
    # noinspection PyTypeChecker
    parser = argparse.ArgumentParser(
        formatter_class=ArgumentDefaultsRichHelpFormatter
    )
    parser.add_argument(
        "url",
        help=(
            "SQLAlchemy database URL. Append ?charset=utf8, e.g. "
            "mysql+mysqldb://root:password@127.0.0.1:3306/test?charset=utf8 ."
            " WARNING: If you get the error 'MySQL has gone away', increase "
            "the max_allowed_packet parameter in my.cnf (e.g. to 32M)."
        ),
    )
    parser.add_argument(
        "--size",
        type=int,
        default=default_size,
        choices=[0, 1, 2, 3],
        help="Make tiny (0), small (1), medium (2), or large (3) database",
    )
    parser.add_argument(
        "--verbose", "-v", action="store_true", help="Be verbose"
    )
    parser.add_argument("--echo", action="store_true", help="Echo SQL")
    parser.add_argument(
        "--doctest_doc", default=DEFAULT_DOCTEST_DOC, help="Test file for .DOC"
    )
    parser.add_argument(
        "--doctest_docx",
        default=DEFAULT_DOCTEST_DOCX,
        help="Test file for .DOCX",
    )
    parser.add_argument(
        "--doctest_odt", default=DEFAULT_DOCTEST_ODT, help="Test file for .ODT"
    )
    parser.add_argument(
        "--doctest_pdf", default=DEFAULT_DOCTEST_PDF, help="Test file for .PDF"
    )
    args = parser.parse_args()

    nwords = 10000
    if args.size == 0:
        n_patients = 20
        notes_per_patient = 1
        words_per_note = 100
    elif args.size == 1:
        n_patients = 100
        notes_per_patient = 5
        words_per_note = 100
    elif args.size == 2:
        n_patients = 100
        notes_per_patient = 100
        words_per_note = 1000
    elif args.size == 3:
        # about 1.4 Gb
        n_patients = 1000
        notes_per_patient = 100
        words_per_note = 1000
    else:
        assert False, "Bad size parameter"

    loglevel = logging.DEBUG if args.verbose else logging.INFO
    rootlogger = logging.getLogger()
    configure_logger_for_colour(rootlogger, level=loglevel)

    # 0. Announce intentions
    log.info(
        f"n_patients={n_patients}, "
        f"notes_per_patient={notes_per_patient}, "
        f"words_per_note={words_per_note}"
    )

    # 1. Get words
    log.info("Fetching words.")
    words = (
        subprocess.check_output(
            ["grep", "-v", "'s", "-m", str(nwords), "/usr/share/dict/words"]
        )
        .decode(CONSOLE_ENCODING)
        .splitlines()
    )

    # 2. Open database
    log.info("Opening database.")
    log.debug(f"URL: {args.url}")
    engine = create_engine(args.url, echo=args.echo, encoding=CHARSET)
    session = sessionmaker(bind=engine)()

    # 3. Create tables
    log.info("Creating tables.")
    metadata.drop_all(engine, checkfirst=True)
    metadata.create_all(engine, checkfirst=True)

    # 4. Insert
    log.info(
        f"Aiming for a total of "
        f"{n_patients * notes_per_patient * words_per_note} "
        f"words in notes."
    )
    log.info("Inserting data.")

    # Autoincrementing date
    _datetime = Pendulum(year=2000, month=1, day=1, hour=9)

    def incdatetime() -> datetime.datetime:
        nonlocal _datetime
        _p = _datetime
        _datetime = _datetime.add(days=1)
        return pendulum_to_datetime(_p)

    # Special extra patient
    # noinspection PyTypeChecker
    p1 = Patient(
        patient_id=1,
        forename="Ronald Gibbet",
        surname="MacDonald",
        dob=datetime.date(day=11, month=11, year=1911),
        nhsnum=123456,
        phone="(01223)-123456",
        postcode="CB2 3EB",
        colour=EnumColours.red,
    )
    session.add(p1)
    for _ in range(notes_per_patient):
        n1 = Note(
            patient=p1,
            note="""
Ronald MacDonald lived on a farm and kept a gibbet for scaring off
small animals. He was born on 11 Nov 1911 and was very proud of this.
His cat’s name was Flitterwick. It did not like the gibbets.
Ronalds other passion was blimping. A typo might be RonaldMacDonald.
His phone number was 0122-312-3456, or 01223-123456, or (01223) 123456,
or 01223 123 456, or 01223 123456.
His NHS number was 123.456 or possibly 12 34 56.
His postcode was CB2 3EB, or possible CB23EB, or CB2, or 3EB.
Some HTML encoding is &amp; and &lt;.
An HTML tag is <a href="http://somewhere">this link</a>.
Start aspirin 75mg od. Remains on Lipitor 40mg nocte.
For haloperidol 2mg po prn max qds.
Start amoxicillin 500 mg b.i.d. for 7 days.
Some numerical results:
His CRP is 10. His previous CRP was <13 mg/dl.
Sodium 140. TSH 3.5; urea normal.
Height 1.82m, weight 75kg, BMI 22.6. BP 135/82.
MMSE 28/30. ACE-R 72, ACE-II 73, ACE 73.
ESR 16 (H) mm/h.
WBC 9.2; neutrophils 4.3; lymphocytes 2.6; eosinophils 0.4;
monocytes 1.2; basophils 0.6.
            """,
            note_datetime=incdatetime(),
        )
        session.add(n1)
    for filename in (
        args.doctest_doc,
        args.doctest_docx,
        args.doctest_odt,
        args.doctest_pdf,
    ):
        bd = BlobDoc(
            patient=p1, filename=filename, blob_datetime=incdatetime()
        )
        session.add(bd)
        fd = FilenameDoc(
            patient=p1, filename=filename, file_datetime=incdatetime()
        )
        session.add(fd)

    # noinspection PyTypeChecker
    p2 = Patient(
        patient_id=2,
        forename="Bob D'Souza",
        surname="",
        dob=datetime.date(day=11, month=11, year=1911),
        nhsnum=234567,
        phone="(01223)-234567",
        postcode="CB2 3EB",
        related_patient_id=1,
        colour=EnumColours.green,
    )
    session.add(p2)
    for _ in range(notes_per_patient):
        n2 = Note(
            patient=p2,
            note="""
Bob D'Souza, also known as Bob, or Mr DSouza, or sometimes Mr D Souza,
or the D'Souza bloke down the road, or BobDSouza or BobD'Souza.
His phone number was 0122-312-3456, or 01223-123456, or (01223) 123456,
or 01223 123 456, or 01223 123456.
His NHS number was 123.456 or possibly 12 34 56 or 123456, perhaps.
His postcode was CB2 3EB, or possible CB23EB, or CB2, or 3EB.
Bob Hope visited Seattle.
Bob took venlafaxine 375 M/R od, and is due to start clozapine 75mg bd.
            """,
            note_datetime=incdatetime(),
        )
        session.add(n2)

    # A bunch of patients
    random.seed(1)
    prev_forename = ""
    prev_surname = ""
    for p in range(n_patients):
        if p % REPORT_EVERY == 0:
            log.info(f"patient {p}")
        forename = words[(p + 1) % nwords] + " " + words[(p + 10) % nwords]
        surname = words[(p + 2) % nwords]
        dob = BASE_DOB + datetime.timedelta(days=p)
        ok_date = dob + datetime.timedelta(days=1)
        nhsnum = random.randint(1, 9999999999)
        # noinspection PyTypeChecker
        patient = Patient(
            patient_id=p + 3,
            forename=forename,
            surname=surname,
            dob=dob,
            nhsnum=nhsnum,
            phone="123456",
            postcode="CB2 3EB",
            related_patient_id=p + 2,  # one back from patient_id
            colour=EnumColours.blue if coin() else None,
        )
        session.add(patient)
        patient_id = patient.patient_id
        dates = (
            "DATES: "
            + (
                " ".join([dob.strftime(fmt) for fmt in DT_FORMATS])
                + " ".join([ok_date.strftime(fmt) for fmt in DT_FORMATS])
            )
            + ". "
        )
        fname = "FORENAME: " + forename + ". "
        sname = "SURNAME: " + surname + ". "
        rname = "RELATIVE: " + prev_forename + " " + prev_surname + ". "
        numbers = f"NUMBERS: {patient_id}, {patient_id + 1}, {nhsnum}. "
        for n in range(notes_per_patient):
            wstr = " ".join(words[p % nwords : (p + words_per_note) % nwords])
            note = Note(
                patient=patient,
                note=fname + sname + rname + numbers + dates + wstr,
                note_datetime=incdatetime(),
            )
            session.add(note)
        prev_forename = forename
        prev_surname = surname

    # 5. Commit
    log.info("Committing...")
    session.commit()
    log.info("Done.")

    # 6. Report size
    if engine.dialect.name == "mysql":
        log.info("Done. Database size:")
        sql = """
            SELECT
                table_schema,
                table_name,
                table_rows,
                data_length,
                index_length,
                ROUND(((data_length + index_length) / (1024 * 1024)), 2)
                    AS "Size_MB"
            FROM
                information_schema.tables
            WHERE table_schema = DATABASE()
        """
        rows = session.execute(text(sql))
        for r in rows:
            print(
                "schema={}, table={}, rows={}, data_length={}, "
                "index_length={}, size_MB={}".format(*r)
            )


if __name__ == "__main__":
    main()