Source code for crate_anon.anonymise.anonymise_cli

#!/usr/bin/env python

"""
crate_anon/anonymise/anonymise_cli.py

===============================================================================

    Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
    Created by Rudolf Cardinal (rnc1001@cam.ac.uk).

    This file is part of CRATE.

    CRATE is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    CRATE is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with CRATE. If not, see <https://www.gnu.org/licenses/>.

===============================================================================

**Command-line entry point for anonymisation.**

Split from the anonymisation functions so we can respond quickly to
command-line input; uses a delayed import when starting anonymisation.

"""

# Uses a delayed import (see below), so we can set up logging before
# using the config object.
import argparse
import logging
import os
from typing import List

from cardinal_pythonlib.logs import configure_logger_for_colour
from rich_argparse import ArgumentDefaultsRichHelpFormatter

from crate_anon.anonymise.constants import (
    ANON_CONFIG_ENV_VAR,
    DEFAULT_CHUNKSIZE,
    DEFAULT_REPORT_EVERY,
)
from crate_anon.common.exceptions import call_main_with_exception_reporting
from crate_anon.version import CRATE_VERSION_PRETTY

log = logging.getLogger(__name__)

DEBUG_RUN_WITH_PDB = False

if DEBUG_RUN_WITH_PDB:
    from cardinal_pythonlib.debugging import pdb_run
else:
    pdb_run = None


# =============================================================================
# Main
# =============================================================================


[docs]def inner_main() -> None: """ Indirect command-line entry point. See command-line help. Calls :func:`crate_anon.anonymise.anonymise.anonymise`. """ # noinspection PyTypeChecker parser = argparse.ArgumentParser( description=f"Database anonymiser. ({CRATE_VERSION_PRETTY})", formatter_class=ArgumentDefaultsRichHelpFormatter, ) parser.add_argument( "--config", help=f"Config file (overriding environment variable " f"{ANON_CONFIG_ENV_VAR}).", ) parser.add_argument( "--version", action="version", version=CRATE_VERSION_PRETTY ) parser.add_argument( "--verbose", "-v", action="store_true", help="Be verbose" ) mode_options = parser.add_argument_group("Mode options") mode_group = mode_options.add_mutually_exclusive_group() mode_group.add_argument( "-i", "--incremental", dest="incremental", action="store_true", help="Process only new/changed information, where possible.", default=True, ) mode_group.add_argument( "-f", "--full", dest="incremental", action="store_false", help="Drop and remake everything.", default=False, ) mode_options.add_argument( "--skipdelete", dest="skipdelete", action="store_true", help="For incremental updates, skip deletion of rows present in the " "destination but not the source.", ) action_options = parser.add_argument_group( "Action options (default is to do all, but if any are specified, " "only those are done)" ) action_options.add_argument( "--dropremake", action="store_true", help="Drop/remake destination tables, and admin tables except " "opt-out tables.", ) action_options.add_argument( "--drop_all", action="store_true", help="Drop all destination tables known to the data dictionary, and " "all admin tables, then stop. (May also be helpful in revealing " "leftover tables in the destination database, e.g. if the data " "dictionary has changed.)", ) action_options.add_argument( "--optout", action="store_true", help="Update opt-out list in administrative database.", ) action_options.add_argument( "--nonpatienttables", action="store_true", help="Process non-patient tables only.", ) action_options.add_argument( "--patienttables", action="store_true", help="Process patient tables only.", ) action_options.add_argument( "--index", action="store_true", help="Create indexes only." ) restrict_options = parser.add_argument_group("Restriction options") restrict_options.add_argument( "--restrict", help="Restrict which patients are processed. Specify which field to " "base the restriction on or 'pid' for patient ids.", ) restrict_options.add_argument( "--limits", nargs=2, help="Specify lower and upper limits of the field " "specified in '--restrict'.", ) restrict_options.add_argument( "--file", help="Specify a file with a list of values for the field " "specified in '--restrict'.", ) restrict_options.add_argument( "--list", nargs="+", help="Specify a list of values for the field " "specified in '--restrict'.", ) restrict_options.add_argument( "--free_text_limit", type=int, help="Filter out all free text fields over the specified length. " "For example, if you specify 200, then VARCHAR(200) fields will " "be permitted, but VARCHAR(200), or VARCHAR(MAX), or TEXT " "(etc., etc.) fields will be excluded.", ) restrict_options.add_argument( "--excludescrubbed", action="store_true", help="Exclude all text fields which are being scrubbed.", ) processing_options = parser.add_argument_group("Processing options") processing_options.add_argument( "--process", nargs="?", type=int, default=0, help="For multiprocess mode: specify process number.", ) processing_options.add_argument( "--nprocesses", nargs="?", type=int, default=1, help="For multiprocess mode: specify total number of processes " "(launched somehow, of which this is to be one).", ) processing_options.add_argument( "--processcluster", default="", help="Process cluster name (used as part of log name).", ) processing_options.add_argument( "--skip_dd_check", action="store_true", help="Skip data dictionary validity check.", ) processing_options.add_argument( "--seed", help="String to use as the basis of the seed for the random number " "generator used for the transient integer RID (TRID). Leave " "blank to use the default seed (system time).", ) processing_options.add_argument( "--chunksize", nargs="?", type=int, default=DEFAULT_CHUNKSIZE, help="Number of records copied in a chunk when copying PKs from one " "database to another.", ) debugging_options = parser.add_argument_group("Reporting and debugging") debugging_options.add_argument( "--reportevery", nargs="?", type=int, default=DEFAULT_REPORT_EVERY, help="Report insert progress every n rows in verbose mode.", ) debugging_options.add_argument( "--debugscrubbers", action="store_true", help="Report sensitive scrubbing information, for debugging.", ) debugging_options.add_argument( "--savescrubbers", action="store_true", help="Saves sensitive scrubbing information in admin database, " "for debugging.", ) debugging_options.add_argument( "--echo", action="store_true", help="Echo SQL." ) args = parser.parse_args() # ------------------------------------------------------------------------- # Verbosity, logging # ------------------------------------------------------------------------- mynames = [] # type: List[str] if args.processcluster: mynames.append(args.processcluster) if args.nprocesses > 1: mynames.append(f"proc{args.process}") loglevel = logging.DEBUG if args.verbose else logging.INFO rootlogger = logging.getLogger() configure_logger_for_colour(rootlogger, loglevel, extranames=mynames) # ------------------------------------------------------------------------- # Onwards # ------------------------------------------------------------------------- if args.config: os.environ[ANON_CONFIG_ENV_VAR] = args.config # Delayed import; pass everything else on from crate_anon.anonymise.anonymise import anonymise # delayed import anonymise( incremental=args.incremental, skipdelete=args.skipdelete, dropremake=args.dropremake, full_drop_only=args.drop_all, optout=args.optout, patienttables=args.patienttables, nonpatienttables=args.nonpatienttables, index=args.index, restrict=args.restrict, restrict_file=args.file, restrict_limits=args.limits, restrict_list=args.list, free_text_limit=args.free_text_limit, exclude_scrubbed_fields=args.excludescrubbed, nprocesses=args.nprocesses, process=args.process, skip_dd_check=args.skip_dd_check, seed=args.seed, chunksize=args.chunksize, reportevery=args.reportevery, echo=args.echo, debugscrubbers=args.debugscrubbers, savescrubbers=args.savescrubbers, )
[docs]def main() -> None: """ Command-line entry point. """ call_main_with_exception_reporting(inner_main)
# ============================================================================= # Command-line entry point # ============================================================================= if __name__ == "__main__": if DEBUG_RUN_WITH_PDB: pdb_run(main) else: main()