#!/usr/bin/env python
"""
crate_anon/linkage/bulk_hash.py
===============================================================================
Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
Created by Rudolf Cardinal (rnc1001@cam.ac.uk).
This file is part of CRATE.
CRATE is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
CRATE is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with CRATE. If not, see <https://www.gnu.org/licenses/>.
===============================================================================
Tool to hash multiple IDs from the command line.
Test code to look at different types of digest:

.. code-block:: python

    import hashlib
    import hmac

    msg = "This is an ex-parrot!"
    key = "voom"

    key_bytes = str(key).encode('utf-8')
    msg_bytes = str(msg).encode('utf-8')
    digestmod = hashlib.sha256
    hmac_obj = hmac.new(key=key_bytes, msg=msg_bytes, digestmod=digestmod)

    # These are the two default kinds of digest:
    print(hmac_obj.digest())  # 8-bit binary
    print(hmac_obj.hexdigest())  # hexadecimal

    # Hex carries 4 bits per character. There are other possibilities,
    # notably:
    # - Base64 with 6 bits per character;
    # - Base32 with 5 bits per character.

"""
import argparse
import logging
from typing import Optional, TextIO
from cardinal_pythonlib.file_io import (
gen_noncomment_lines,
smart_open,
writeline_nl,
)
from cardinal_pythonlib.logs import main_only_quicksetup_rootlogger
from cardinal_pythonlib.hash import (
HashMethods,
make_hasher,
)
from rich_argparse import ArgumentDefaultsRichHelpFormatter
# Module-level logger, named after this module; used by bulk_hash() below.
log = logging.getLogger(__name__)
def get_first_noncomment_line(filename: str) -> Optional[str]:
    """
    Return the first line that ``gen_noncomment_lines`` yields for a file.

    Args:
        filename:
            path of the text file to read

    Returns:
        the first line yielded, or ``None`` if nothing is yielded at all
    """
    with open(filename) as handle:
        # The default argument to next() covers the "no such line" case.
        return next(gen_noncomment_lines(handle), None)
def bulk_hash(
    input_filename: str,
    output_filename: str,
    hash_method: str,
    key: str,
    keep_id: bool = True,
):
    """
    Hash the non-comment lines of one file, writing the results to another.

    Args:
        input_filename:
            file to read from, or "-" for stdin
        output_filename:
            file to write to, or "-" for stdout
        hash_method:
            name of the hash method; e.g. ``HMAC_SHA256``
        key:
            secret key given to the hasher
        keep_id:
            write CSV-style ``hash,id`` lines, rather than lines containing
            only the hash?

    When ``keep_id`` is set, the hash is written before the ID, so the first
    comma on each line is always the separator, even if the ID itself
    contains commas.
    """
    log.info(f"Reading from: {input_filename}")
    log.info(f"Writing to: {output_filename}")
    log.info(f"Using hash method: {hash_method}")
    log.info(f"keep_id: {keep_id}")
    log.debug(f"Using key: {key!r}")  # NB security warning in help

    digest_of = make_hasher(hash_method=hash_method, key=key).hash

    # Input is opened before output, and both are closed on exit.
    with smart_open(input_filename, "rt") as src, \
            smart_open(output_filename, "wt") as dst:
        for identifier in gen_noncomment_lines(src):
            # An empty identifier is never passed to the hasher.
            if identifier:
                digest = digest_of(identifier)
            else:
                digest = ""
            if keep_id:
                record = f"{digest},{identifier}"
            else:
                record = digest
            writeline_nl(dst, record)

    log.info("Done.")
def main() -> None:
    """
    Command-line entry point.

    Parses the command-line arguments, sets up logging (DEBUG level with
    ``--verbose``, otherwise INFO), obtains the secret key from exactly one
    of ``--key`` or ``--keyfile``, and then calls :func:`bulk_hash`.

    Invalid key options (neither or both of ``--key``/``--keyfile``, or a
    keyfile with no usable line) are reported through ``parser.error()``,
    which prints a usage message and exits. This check is done with explicit
    ``if`` statements rather than ``assert``, so that it is still enforced
    when Python runs with ``-O``.
    """
    # noinspection PyTypeChecker
    parser = argparse.ArgumentParser(
        description="Hash IDs in bulk, using a cryptographic hash function.",
        formatter_class=ArgumentDefaultsRichHelpFormatter,
    )
    parser.add_argument(
        "infile",
        type=str,
        help="Input file, or '-' for stdin. "
        "Use one line per thing to be hashed. "
        "Comments (marked with '#') and blank lines are ignored. "
        "Lines have whitespace stripped left and right.",
    )
    parser.add_argument(
        "--outfile",
        type=str,
        default="-",
        help="Output file, or '-' for stdout. "
        "One line will be written for every input line. "
        "Blank lines will be written for commented or blank input.",
    )
    parser.add_argument(
        "--key",
        type=str,
        help="Secret key for hasher (warning: may be visible in process list; "
        "see also --keyfile)",
    )
    parser.add_argument(
        "--keyfile",
        type=str,
        help="File whose first noncomment line contains the secret key for "
        "the hasher. (It will be whitespace-stripped right and left.)",
    )
    parser.add_argument(
        "--method",
        choices=[
            HashMethods.HMAC_MD5,
            HashMethods.HMAC_SHA256,
            HashMethods.HMAC_SHA512,
        ],
        default=HashMethods.HMAC_MD5,
        help="Hash method",
    )
    parser.add_argument(
        "--keepid",
        action="store_true",
        help="Produce CSV output with (hash,id) rather than just the hash",
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Be verbose (NB will write key to stderr)",
    )
    args = parser.parse_args()
    main_only_quicksetup_rootlogger(
        logging.DEBUG if args.verbose else logging.INFO
    )

    # Exactly one of --key / --keyfile must be supplied (and be non-empty).
    if bool(args.key) == bool(args.keyfile):
        parser.error("Specify either --key or --keyfile (and not both).")

    if args.keyfile:
        key = get_first_noncomment_line(args.keyfile)
        if not key:
            # Covers both "no line found" (None) and an empty key.
            parser.error(f"No key found in keyfile: {args.keyfile}")
    else:
        key = args.key

    bulk_hash(
        input_filename=args.infile,
        output_filename=args.outfile,
        hash_method=args.method,
        key=key,
        keep_id=args.keepid,
    )
# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()