"""
crate_anon/common/parallel.py
===============================================================================
Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
Created by Rudolf Cardinal (rnc1001@cam.ac.uk).
This file is part of CRATE.
CRATE is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
CRATE is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with CRATE. If not, see <https://www.gnu.org/licenses/>.
===============================================================================
**Assistance functions for "embarrassingly parallel" job assignment.**
"""
import logging
from typing import Any
from cardinal_pythonlib.hash import hash64
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
def is_my_job_by_int(value: int, tasknum: int, ntasks: int) -> bool:
    """
    "Is it my job to do this work?"

    Args:
        value: some integer value that is fairly evenly distributed, to
            spread the workload
        tasknum: which task number am I? (Compared against
            ``value % ntasks``, so it must lie in ``range(ntasks)`` to ever
            match when ``ntasks > 1``.)
        ntasks: how many tasks are there in total?

    Returns:
        is it my job?

    Raises:
        ZeroDivisionError: if ``ntasks`` is 0

    Algorithm:

    - if there's only one task: yes (``tasknum`` is not examined)
    - otherwise, return ``value % ntasks == tasknum``

    Python's ``%`` takes the sign of the divisor, so for a positive
    ``ntasks`` a negative ``value`` is still assigned to exactly one task.
    """
    if ntasks == 1:
        # Only one worker: it owns everything, so skip the modulo.
        return True
    return value % ntasks == tasknum
def is_my_job_by_hash(value: Any, tasknum: int, ntasks: int) -> bool:
    """
    "Is it my job to do this work?"

    Args:
        value: anything that's hashable
        tasknum: which task number am I?
        ntasks: how many tasks are there in total?

    Returns:
        is it my job?

    Raises:
        ZeroDivisionError: if ``ntasks`` is 0

    Algorithm:

    - If there's only one task: yes (nothing is hashed and ``tasknum`` is
      not examined).
    - Otherwise, we convert some non-integer thing into a deterministic but
      roughly randomly distributed integer using :func:`hash64`. That
      produces a signed integer, which is OK because ``%`` works
      nonetheless.

    When we use it:

    - We use this function to parallelize for non-integer PKs.
    - This is less efficient than dividing the work up via SQL, because we
      have to fetch/hash something.
    - Perform this test ASAP in loops, for speed.
    """
    if ntasks == 1:
        # Only one worker: it owns everything, so avoid the hashing cost.
        return True
    return hash64(value) % ntasks == tasknum
def is_my_job_by_hash_prehashed(
    hashed_value: int, tasknum: int, ntasks: int
) -> bool:
    """
    A version of :func:`is_my_job_by_hash` for use when you have pre-hashed
    the value, and ``ntasks`` is guaranteed to be >1.

    Unlike :func:`is_my_job_by_hash`, there is no shortcut for a single
    task: the result is always ``hashed_value % ntasks == tasknum``.

    Args:
        hashed_value: integer hashed value
        tasknum: which task number am I?
        ntasks: how many tasks are there in total?

    Returns:
        is it my job?

    Raises:
        ZeroDivisionError: if ``ntasks`` is 0
    """
    return hashed_value % ntasks == tasknum