Source code for crate_anon.linkage.validation.test_hash_speed

#!/usr/bin/env python

"""
crate_anon/linkage/validation/test_hash_speed.py

===============================================================================

    Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
    Created by Rudolf Cardinal (rnc1001@cam.ac.uk).

    This file is part of CRATE.

    CRATE is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    CRATE is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with CRATE. If not, see <https://www.gnu.org/licenses/>.

===============================================================================

**Test the speed of hashing.**

The question is: if someone malicious learned a secret hash key, how long would
it take them to generate a reverse map from a known identifier space?

The test uses a single CPU core.

Specimen results, for padding length 9 and the HMAC_MD5 algorithm, on Wombat
(3.5 GHz CPU), tested with 100000 (1e5) iterations (which took 0.72 s, piping
to ``/dev/null``):

- 1e9 operations will take about 7200 s = 2 hours.
  This is the right order of magniture for NHS numbers (9 digits plus a
  checksum; other rules might restrict that a bit more).

- 7.3e12 operations will take about 52667022 s = 1.7 years
  This is the right order of magnitude for NHS numbers plus dates of birth
  covering 20 years (1e9 for NHS number * 365 days/year * 20 years).

- 3.65e13 operations will take about 263335113 s = 8.4 years
  This is the right order of magnitude for NHS numbers plus DOBs covering
  a century (1e9 for NHS number * 365 days/year * 100 years).

The hash algorithm isn't a major factor; moving from HMAC_MD5 to HMAC_SHA512,
for example, only takes the time for 1e5 iterations from 0.72 s to 0.86 s.

(A subsequent test: faster, at 5443 s = 1.5 h, 1.25 y, and 6.3 y respectively.)

Speed tests considered for paper (not used in the end; real-world measures
used):

.. code-block:: bash

    ./test_hash_speed.py --method HMAC_MD5 --ntests 1000000 > /dev/null
    ./test_hash_speed.py --method HMAC_SHA256 --ntests 1000000 > /dev/null
    ./test_hash_speed.py --method HMAC_SHA512 --ntests 1000000 > /dev/null

"""

import argparse
import logging
import time
from typing import Iterable, List, TextIO

from cardinal_pythonlib.file_io import smart_open, writeline_nl
from cardinal_pythonlib.logs import main_only_quicksetup_rootlogger
from cardinal_pythonlib.hash import HashMethods, make_hasher
from cardinal_pythonlib.randomness import generate_random_string
from rich_argparse import ArgumentDefaultsRichHelpFormatter

log = logging.getLogger(__name__)


[docs]def gen_dummy_data(n: int, string_length: int) -> Iterable[str]: """ Generate some random strings of the specified width """ for i in range(n): yield str(n).ljust(string_length, "x")
[docs]def test_hash_speed( output_filename: str, hash_method: str, key: str, ntests: int, intended_possibilities: List[int], string_length: int, ): """ Hash lines from one file to another. Args: output_filename: Output filename, or "-" for stdin hash_method: Method to use; e.g. ``HMAC_SHA256`` key: Secret key for hasher ntests: Number of hashes to perform. intended_possibilities: Number of hashes to estimate time for. string_length: Length of each string to hash (characters). Note that the hash precedes the ID with the ``keep_id`` option, which works best if the ID might contain commas. """ log.info(f"Writing to: {output_filename}") log.info(f"Using hash method: {hash_method}") log.info(f"Hashing some random data {ntests} times, using one CPU core") log.info(f"String length: {string_length}") log.debug(f"Using key: {key!r}") # NB security warning in help hasher = make_hasher(hash_method=hash_method, key=key) with smart_open(output_filename, "wt") as o: # type: TextIO start_time = time.time() for data in gen_dummy_data(ntests, string_length): hashed = hasher.hash(data) writeline_nl(o, f"{data} -> {hashed}") end_time = time.time() time_taken_s = end_time - start_time log.info(f"Start time (s): {start_time}") log.info(f"End time (s): {end_time}") log.info(f"Time taken (s): {time_taken_s}") log.info(f"Number of hash operations: {ntests}") log.info(f"Hash operations per second: {ntests / time_taken_s}") for intended in intended_possibilities: estimated_time_s = intended * time_taken_s / ntests log.info( f"For {intended} operations (on a single CPU core), " f"estimated time (s): {estimated_time_s}" )
[docs]def main() -> None: """ Command-line entry point. """ # noinspection PyTypeChecker parser = argparse.ArgumentParser( description="Test the speed of a hash method.", formatter_class=ArgumentDefaultsRichHelpFormatter, ) parser.add_argument( "--outfile", type=str, default="-", help="Output file; can use '-' for stdout (and redirect to " "/dev/null, which you should do for 'core' speed testing).", ) parser.add_argument( "--key", type=str, help="Key for hasher. Ordinarily this would be secret, but this is " "just for testing. Default is random.", ) parser.add_argument( "--keyfile", type=str, help="File whose first noncomment line contains the secret key for " "the hasher. (It will be whitespace-stripped right and left.)", ) parser.add_argument( "--method", choices=[ HashMethods.HMAC_MD5, HashMethods.HMAC_SHA256, HashMethods.HMAC_SHA512, ], default=HashMethods.HMAC_MD5, help="Hash method", ) parser.add_argument( "--ntests", type=int, default=100000, help="Number of hash tests to time for real (a small number).", ) parser.add_argument( "--length", type=int, default=10, help="Length of string to hash. (Consecutive integers covering the " "range of --ntests will be padded to this length.)", ) parser.add_argument( "--intended", type=int, nargs="+", default=[1000000000, 7300000000000, 36500000000000], help="Number of hash tests to calculate time for (a big number). For" "example, approx. 1000000000 (1e9) for NHS number; " "36500000000000 (3.65e13) for NHS number " "plus DOBs covering a century; " "7300000000000 (7.3e12) for NHS number " "plus DOBs covering two decades.", ) parser.add_argument( "--verbose", "-v", action="store_true", help="Be verbose (NB will write key to stderr)", ) args = parser.parse_args() main_only_quicksetup_rootlogger( logging.DEBUG if args.verbose else logging.INFO ) key = generate_random_string(length=64) if args.key is None else args.key test_hash_speed( output_filename=args.outfile, hash_method=args.method, key=key, string_length=args.length, ntests=args.ntests, intended_possibilities=args.intended, )
if __name__ == "__main__": main()