Source code for crate_anon.nlp_manager.tests.regex_test_helperfunc

"""
crate_anon/nlp_manager/tests/regex_test_helperfunc.py

===============================================================================

    Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
    Created by Rudolf Cardinal (rnc1001@cam.ac.uk).

    This file is part of CRATE.

    CRATE is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    CRATE is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with CRATE. If not, see <https://www.gnu.org/licenses/>.

===============================================================================

**Functions for testing regular expressions.**

"""

import logging

from typing import List, Pattern, Tuple
from crate_anon.nlp_manager.regex_func import compile_regex
from crate_anon.nlp_manager.regex_parser import BaseNlpParser, ValidatorBase

log = logging.getLogger(__name__)


[docs]def f_score(precision: float, recall: float, beta: float = 1) -> float:
    """
    Calculates an F score (e.g. an F1 score for ``beta == 1``).
    See https://en.wikipedia.org/wiki/F1_score.

    Args:
        precision: precision of the test, P(really positive | test positive)
        recall: recall of the test, P(test positive | really positive)
        beta: controls the type of the F score (the relative emphasis on
            precision versus recall)

    Returns:
        the F score

    """
    beta_sq = beta**2
    return (
        (1 + beta_sq) * precision * recall / ((beta_sq * precision) + recall)
    )


[docs]def get_compiled_regex_results(
    compiled_regex: Pattern, text: str
) -> List[str]:
    """
    Finds all the hits for a regex when applied to text.

    Args:
        compiled_regex: a compiled regular expression
        text: text to parse

    Returns:
        a list of all the (entire) hits for this regex in ``text``

    """
    results = []  # type: List[str]
    for m in compiled_regex.finditer(text):
        results.append(m.group(0))
    return results


[docs]def print_compiled_regex_results(
    compiled_regex: Pattern, text: str, prefix_spaces: int = 4
) -> None:
    """
    Applies a regex to text and prints (to stdout) all its hits.

    Args:
        compiled_regex: a compiled regular expression
        text: text to parse
        prefix_spaces: number of spaces to begin each answer with
    """
    results = get_compiled_regex_results(compiled_regex, text)
    print(f"{' ' * prefix_spaces}{text!r} -> {results!r}")


[docs]def assert_text_regex(
    name: str,
    regex_text: str,
    test_expected_list: List[Tuple[str, List[str]]],
    verbose: bool = False,
) -> None:
    """
    Test a regex upon some text.

    Args:
        name: regex name (for display purposes only)
        regex_text: text that should be compiled to give our regex
        test_expected_list:
            list of tuples ``teststring, expected_results``, where
            ``teststring`` is some text and ``expected_results`` is a list of
            expected hits for the regex within ``teststring``
        verbose: be verbose?

    Returns:

    """
    log.info(f"Testing regex named {name}")
    compiled_regex = compile_regex(regex_text)
    if verbose:
        log.debug(f"... regex text:\n{regex_text}")
    for test_string, expected_values in test_expected_list:
        actual_values = get_compiled_regex_results(compiled_regex, test_string)
        assert actual_values == expected_values, (
            "Regex {name}: Expected {expected_values}, got {actual_values}, "
            "when parsing {test_string}. Regex text:\n{regex_text}]".format(
                name=name,
                expected_values=expected_values,
                actual_values=actual_values,
                test_string=repr(test_string),
                regex_text=regex_text,
            )
        )
    log.info("... OK")


[docs]def run_tests_nlp_and_validator_classes(
    all_nlp_and_validators: List[Tuple[BaseNlpParser, ValidatorBase]]
) -> None:
    """
    Tests multiple pairs of NLP classes and their associated validators.
    """
    all_nlp_classes, all_validator_classes = zip(*all_nlp_and_validators)
    for cls in all_nlp_classes:
        cls(None, None).test(verbose=True)
    for cls in all_validator_classes:
        cls(None, None).test(verbose=True)