#!/usr/bin/env python
"""
crate_anon/anonymise/draft_dd.py
===============================================================================
Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
Created by Rudolf Cardinal (rnc1001@cam.ac.uk).
This file is part of CRATE.
CRATE is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
CRATE is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with CRATE. If not, see <https://www.gnu.org/licenses/>.
===============================================================================
**Draft an anonymisation data dictionary.**
"""
import argparse
import logging
import os
from cardinal_pythonlib.enumlike import keys_descriptions_from_enum
from cardinal_pythonlib.logs import main_only_quicksetup_rootlogger
from rich_argparse import ArgumentDefaultsRichHelpFormatter
from crate_anon.anonymise.config import Config
from crate_anon.anonymise.constants import ANON_CONFIG_ENV_VAR
from crate_anon.preprocess.systmone_ddgen import (
DEFAULT_SYSTMONE_CONTEXT,
modify_dd_for_systmone,
SystmOneContext,
)
from crate_anon.version import CRATE_VERSION_PRETTY
log = logging.getLogger(__name__)
# =============================================================================
# Draft a data dictionary
# =============================================================================
def draft_dd(
    config: Config,
    dd_output_filename: str,
    incremental: bool = False,
    skip_dd_check: bool = False,
    explicit_dest_datatype: bool = False,
    systmone: bool = False,
    systmone_context: SystmOneContext = None,
    systmone_sre_spec_csv_filename: str = None,
    systmone_append_comments: bool = False,
    systmone_include_generic: bool = False,
    systmone_allow_unprefixed_tables: bool = False,
    systmone_alter_loaded_rows: bool = False,
    systmone_table_info_in_comments: bool = True,
) -> None:
    """
    Scan the source database(s) and write out a draft data dictionary.

    The steps are, in order: optionally pre-load the existing data
    dictionary (incremental mode); draft rows from the source databases;
    optionally apply SystmOne-specific modifications; tidy the draft;
    optionally make destination datatypes explicit; validate; write.

    Args:
        config:
            Anonymisation config object; its ``dd`` attribute is the data
            dictionary that is drafted and written.
        dd_output_filename:
            Where to write the result ('-' for stdout).
        incremental:
            If true, first load the existing data dictionary referred to in
            the config, so that the draft contains only fields present in
            the database but absent from that existing dictionary.
        skip_dd_check:
            If true, do not check the data dictionary against the source
            database, neither when pre-loading it in incremental mode nor
            in the final validity check.
        explicit_dest_datatype:
            If true, make destination datatypes explicit rather than
            implicit. (Primarily for debugging.)
        systmone:
            If true, post-process the draft for SystmOne data.
        systmone_context:
            (For SystmOne.) Source database context. Must be supplied when
            ``systmone`` is true.
        systmone_sre_spec_csv_filename:
            (For SystmOne.) Optional filename of the TPP Strategic Reporting
            Extract (SRE) specification CSV.
        systmone_append_comments:
            (For SystmOne.) Append to existing comments instead of replacing
            them? Usually better left False: if you supply
            ``systmone_sre_spec_csv_filename``, that will provide better
            comments.
        systmone_include_generic:
            (For SystmOne.) Include all fields that this code does not know
            about and treat specially? If False, the config file settings
            decide (and may omit or include them). If True, all such fields
            are included.
        systmone_allow_unprefixed_tables:
            (For SystmOne.) Permit tables that lack the expected prefix?
            (That prefix is e.g. 'SR' for the TPP SRE context, 'S1\\_' for
            the CPFT Data Warehouse context.) Discouraged; you may get odd
            tables and views.
        systmone_alter_loaded_rows:
            (For SystmOne.) Alter rows that were loaded from disk (not read
            from a database)? The default is to leave such rows untouched.
        systmone_table_info_in_comments:
            (For SystmOne.) Include table descriptions in column comments?

    Raises:
        ValueError:
            If ``systmone`` is true but no ``systmone_context`` was given.
    """
    # The same "check against the source database?" decision governs both
    # the optional pre-load and the final validation.
    check_source_db = not skip_dd_check

    if incremental:
        # Only incremental mode reads the existing data dictionary from
        # disk; otherwise an entirely fresh one is generated.
        config.load_dd(check_against_source_db=check_source_db)

    data_dictionary = config.dd

    # Source columns that are already known are skipped, which is what makes
    # the result incremental when rows were pre-loaded above.
    data_dictionary.draft_from_source_databases()

    if systmone and not systmone_context:
        raise ValueError("Requires SystmOne context to be specified")
    if systmone:
        modify_dd_for_systmone(
            dd=data_dictionary,
            context=systmone_context,
            sre_spec_csv_filename=systmone_sre_spec_csv_filename,
            append_comments=systmone_append_comments,
            include_generic=systmone_include_generic,
            allow_unprefixed_tables=systmone_allow_unprefixed_tables,
            alter_loaded_rows=systmone_alter_loaded_rows,
            table_info_in_comments=systmone_table_info_in_comments,
        )

    data_dictionary.tidy_draft()

    if explicit_dest_datatype:
        data_dictionary.make_dest_datatypes_explicit()

    data_dictionary.check_valid(check_against_source_db=check_source_db)
    data_dictionary.write(dd_output_filename)
# =============================================================================
# Main
# =============================================================================
def main() -> None:
    """
    Command-line entry point.

    Parses the command-line arguments, sets up logging, optionally overrides
    the config-file environment variable from ``--config``, imports the
    config singleton, and calls :func:`draft_dd` with the parsed options.
    """
    # noinspection PyTypeChecker
    parser = argparse.ArgumentParser(
        description=f"Draft a data dictionary for the anonymiser, by scanning "
        f"a source database. ({CRATE_VERSION_PRETTY})",
        formatter_class=ArgumentDefaultsRichHelpFormatter,
    )

    # -------------------------------------------------------------------------
    # General options
    # -------------------------------------------------------------------------
    parser.add_argument(
        "--config",
        help=f"Config file (overriding environment variable "
        f"{ANON_CONFIG_ENV_VAR}). Note that the config file has several "
        f"options governing the automatic generation of data "
        f"dictionaries.",
    )
    parser.add_argument(
        "--verbose", "-v", action="store_true", help="Be verbose"
    )
    parser.add_argument(
        "--incremental",
        action="store_true",
        help="Drafts an INCREMENTAL draft data dictionary (containing fields "
        "in the database that aren't in the existing data dictionary "
        "referred to by the config file).",
    )
    parser.add_argument(
        "--skip_dd_check",
        action="store_true",
        help="Skip validity check (against the source database) for the "
        "data dictionary.",
    )
    parser.add_argument(
        "--output", default="-", help="File for output; use '-' for stdout."
    )
    parser.add_argument(
        "--explicit_dest_datatype",
        action="store_true",
        help="(Primarily for debugging.) CRATE will convert the source column "
        "data type (e.g. INTEGER, FLOAT, VARCHAR(25)) to a datatype for "
        "the destination database, sometimes with modifications. "
        "However, this is usually implicit: the draft data dictionary "
        "doesn't show these data types unless they require modification. "
        "Use this option to make them all explicit.",
    )
    parser.add_argument(
        "--systmone",
        action="store_true",
        help="Modify the data dictionary for SystmOne. CRATE knows about some "
        "of the standard SystmOne data structure and can read a "
        "database and customize the data dictionary for SystmOne.",
    )

    # -------------------------------------------------------------------------
    # SystmOne-specific options
    # -------------------------------------------------------------------------
    s1_options = parser.add_argument_group(
        "SystmOne options (for when --systmone is used)"
    )
    # Lower-cased enum names are offered as the permitted choices, together
    # with a formatted description string for the help text.
    context_k, context_d = keys_descriptions_from_enum(
        SystmOneContext, keys_to_lower=True
    )
    s1_options.add_argument(
        "--systmone_context",
        type=str,
        choices=context_k,
        default=DEFAULT_SYSTMONE_CONTEXT.name.lower(),
        help="Context of the SystmOne database that you are reading. "
        f"[{context_d}]",
    )
    s1_options.add_argument(
        "--systmone_sre_spec",
        help="SystmOne Strategic Reporting Extract (SRE) specification CSV "
        "filename (from TPP, containing table/field comments).",
    )
    s1_options.add_argument(
        "--systmone_append_comments",
        action="store_true",
        help="Append to comments, rather than replacing them.",
    )
    s1_options.add_argument(
        "--systmone_include_generic",
        action="store_true",
        help="Include all 'generic' fields, overriding preferences set via "
        "the config file options.",
    )
    s1_options.add_argument(
        "--systmone_allow_unprefixed_tables",
        action="store_true",
        help="Permit tables that don't start with the expected prefix "
        "(which is e.g. 'SR' for the TPP SRE context, 'S1_' for the CPFT "
        "Data Warehouse context). May add helpful content, but you may "
        "get odd tables and views.",
    )
    s1_options.add_argument(
        "--systmone_alter_loaded_rows",
        action="store_true",
        help="(For --incremental.) Alter rows that were loaded from disk "
        "(not read from a database)? The default is to leave such rows "
        "untouched.",
    )
    # The next two flags form an on/off pair writing to the same destination.
    # argparse takes the destination's default from the FIRST action
    # registered for it, so the effective default is True. Both actions
    # declare default=True so that the help output (which displays each
    # action's default) does not advertise a contradictory "False" for the
    # negative flag.
    s1_options.add_argument(
        "--systmone_table_info_in_comments",
        dest="systmone_table_info_in_comments",
        action="store_true",
        default=True,
        help="Add table descriptions to column comments. Useful if the "
        "database does not itself support table comments.",
    )
    s1_options.add_argument(
        "--systmone_no_table_info_in_comments",
        dest="systmone_table_info_in_comments",
        action="store_false",
        default=True,
        help="Opposite of --systmone_table_info_in_comments.",
    )

    args = parser.parse_args()

    # -------------------------------------------------------------------------
    # Verbosity, logging
    # -------------------------------------------------------------------------
    loglevel = logging.DEBUG if args.verbose else logging.INFO
    main_only_quicksetup_rootlogger(level=loglevel)

    # -------------------------------------------------------------------------
    # Onwards
    # -------------------------------------------------------------------------
    if args.config:
        os.environ[ANON_CONFIG_ENV_VAR] = args.config
    # The import is deliberately placed after the environment variable may
    # have been overridden from --config above.
    from crate_anon.anonymise.config_singleton import config  # delayed import

    draft_dd(
        config=config,
        dd_output_filename=args.output,
        incremental=args.incremental,
        skip_dd_check=args.skip_dd_check,
        explicit_dest_datatype=args.explicit_dest_datatype,
        systmone=args.systmone,
        # NOTE(review): the permitted choices are lower-cased enum names, so
        # this lookup presumably relies on SystmOneContext supporting
        # case-insensitive name lookup -- confirm against its definition.
        systmone_context=SystmOneContext[args.systmone_context],
        systmone_sre_spec_csv_filename=args.systmone_sre_spec,
        systmone_append_comments=args.systmone_append_comments,
        systmone_include_generic=args.systmone_include_generic,
        systmone_allow_unprefixed_tables=args.systmone_allow_unprefixed_tables,
        systmone_alter_loaded_rows=args.systmone_alter_loaded_rows,
        systmone_table_info_in_comments=args.systmone_table_info_in_comments,
    )
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()