# Source code for crate_anon.common.regex_helpers

"""
crate_anon/common/regex_helpers.py

===============================================================================

    Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
    Created by Rudolf Cardinal (rnc1001@cam.ac.uk).

    This file is part of CRATE.

    CRATE is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    CRATE is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with CRATE. If not, see <https://www.gnu.org/licenses/>.

===============================================================================

**Constants and helper functions for use with regexes.**

"""

from typing import Iterable, List, Union

import regex  # sudo apt-get install python-regex


# =============================================================================
# Constants
# =============================================================================

# Reminders: ? zero or one, + one or more, * zero or more
# Non-capturing groups: (?:...)
# ... https://docs.python.org/3/howto/regex.html
# ... https://stackoverflow.com/questions/3512471/non-capturing-group

# Regex fragments (as raw strings) for building larger patterns.
ASTERISK = r"\*"  # a literal asterisk
AT_LEAST_ONE_NONWORD = r"\W+"  # one or more non-word characters
AT_LEAST_ONE_WHITESPACE = r"\s+"  # one or more whitespace chars
AT_LEAST_ONE_NON_NEWLINE_WHITESPACE = r"[ \t]+"  # one or more spaces/tabs

HYPHEN_OR_EN_DASH = r"[-–]"  # a single hyphen-minus or en dash

LEFT_BRACKET = r"\("  # a literal opening parenthesis

# Compiled pattern matching a run of non-word characters.
NON_ALPHANUMERIC_SPLITTERS = regex.compile(AT_LEAST_ONE_NONWORD, regex.UNICODE)

# http://www.regular-expressions.info/lookaround.html
# Not all engines support lookbehind; e.g. regexr.com doesn't; but Python does
NOT_DIGIT_LOOKBEHIND = r"(?<!\d)"  # not immediately preceded by a digit
NOT_DIGIT_LOOKAHEAD = r"(?!\d)"  # not immediately followed by a digit

# The Kleene star has highest precedence.
# So, for example, ab*c matches abbbc, but not (all of) ababc. See regexr.com
OPTIONAL_NONWORD = r"\W*"  # zero or more non-word characters...
# ... doesn't need to be [\W]*, for precedence reasons as above.
OPTIONAL_WHITESPACE = r"\s*"  # zero or more whitespace chars
OPTIONAL_NON_NEWLINE_WHITESPACE = r"[ \t]*"  # zero or more spaces/tabs

# Characters that the escape_literal_* helpers in this module prefix with a
# backslash.
REGEX_METACHARS = [
    "\\",
    "^",
    "$",
    ".",
    "|",
    "?",
    "*",
    "+",
    "(",
    ")",
    "[",
    "{",
    "#",
    " ",
]
# http://www.regular-expressions.info/characters.html
# The backslash is listed first so that sequential search-and-replace escaping
# never re-escapes a backslash that it has itself just inserted.
# NOTE(review): "#" and " " are presumably included because they are
# significant in verbose-mode regexes -- confirm.

RIGHT_BRACKET = r"\)"  # a literal closing parenthesis

WB = r"\b"  # word boundary; escape the backslash if not using a raw string
# Characters that escape_literal_for_regex_allowing_flexible_whitespace()
# treats as whitespace.
WHITESPACE_CHARACTERS = [" ", "\t", "\n"]
WORD_BOUNDARY = WB  # longer alias for WB

# Anchored patterns: one or more word characters / one or more ASCII letters.
# Note that "$" also matches just before a single trailing newline.
_NOT_EMPTY_WORD_ONLY_REGEX = regex.compile(r"^\w+$")
_NOT_EMPTY_ALPHABETICAL_ONLY_REGEX = regex.compile("^[a-zA-Z]+$")
# cf. https://stackoverflow.com/questions/336210/regular-expression-for-alphanumeric-and-underscores  # noqa


# =============================================================================
# Helper functions
# =============================================================================


def escape_literal_string_for_regex(s: str) -> str:
    r"""
    Escape a literal string so that a regex matches it verbatim.

    Every character listed in :data:`REGEX_METACHARS` is prefixed with a
    backslash; all other characters pass through unchanged.

    For example, maps ``Hello there.`` to ``Hello\ there\.``

    Args:
        s:
            Literal text to escape.

    Returns:
        The escaped string.
    """
    # One translation table handles every metacharacter in a single pass, so
    # a backslash inserted as an escape can never be escaped a second time.
    escape_table = str.maketrans(
        {metachar: "\\" + metachar for metachar in REGEX_METACHARS}
    )
    return s.translate(escape_table)


def escape_literal_for_regex_giving_charlist(s: str) -> List[str]:
    r"""
    Escape a literal string for use in a regex, one character at a time.

    Each input character yields exactly one list element: the character
    itself, or (if it appears in :data:`REGEX_METACHARS`) the character
    prefixed with a backslash.

    Args:
        s:
            Literal text to escape.

    Returns:
        A list, the same length as ``s``, of plain or backslash-escaped
        characters.
    """
    return [
        "\\" + char if char in REGEX_METACHARS else char
        for char in s
    ]


def escape_literal_for_regex_allowing_flexible_whitespace(s: str) -> str:
    r"""
    Escape a literal string for regex use, but let each stretch of whitespace
    in the original match any amount (one or more characters) of whitespace.

    For example, maps ``Hello there.`` to ``Hello\s+there\.``

    Args:
        s:
            Literal text to escape.

    Returns:
        A regex string in which every run of characters from
        :data:`WHITESPACE_CHARACTERS` has become ``\s+`` and every other
        regex metacharacter is backslash-escaped.
    """
    # Turn every recognised whitespace character into a plain space.
    to_space = str.maketrans({ws: " " for ws in WHITESPACE_CHARACTERS})
    normalised = s.translate(to_space)
    # Squash runs of spaces down to a single space.
    while "  " in normalised:
        normalised = normalised.replace("  ", " ")
    # Escaping turns each remaining space into backslash-space; swap that
    # for the flexible-whitespace fragment.
    escaped = escape_literal_string_for_regex(normalised)
    return escaped.replace(r"\ ", AT_LEAST_ONE_WHITESPACE)


def at_wb_start_end(regex_str: str) -> str:
    r"""
    Surround a regex with word-boundary assertions (``\b``) on both sides.

    Use with care: a digit does not end a word, so a pattern for "mm" that
    finishes with a word boundary will not match the "mm" in "mm3".

    Args:
        regex_str:
            The regex to wrap.

    Returns:
        ``\b``, then the regex, then ``\b``.
    """
    boundary = r"\b"
    return boundary + regex_str + boundary


def at_start_wb(regex_str: str) -> str:
    r"""
    Prefix a regex with a word-boundary assertion (``\b``).

    Beware: text such as "3kg" is perfectly reasonable, yet has no word
    boundary between the "3" and the "kg".

    Args:
        regex_str:
            The regex to prefix.

    Returns:
        ``\b`` followed by the regex.
    """
    return "".join((r"\b", regex_str))


def noncapture_group(regex_str: str) -> str:
    """
    Enclose a regex in a non-capturing group, ``(?: ... )``.

    Args:
        regex_str:
            The regex to enclose.

    Returns:
        The regex wrapped as ``(?:regex_str)``.
    """
    return "(?:{})".format(regex_str)


def optional_noncapture_group(regex_str: str) -> str:
    """
    Enclose a regex in an optional non-capturing group, ``(?: ... )?``.

    Args:
        regex_str:
            The regex to enclose.

    Returns:
        The regex wrapped as ``(?:regex_str)?``.
    """
    return "(?:{})?".format(regex_str)


def named_capture_group(regex_str: str, name: str) -> str:
    """
    Enclose a regex in a named capture group, ``(?P<name>...)``.

    The ``P`` marks this as a Python extension to regex syntax; see
    https://docs.python.org/3/howto/regex.html#non-capturing-and-named-groups

    Args:
        regex_str:
            The regex to enclose.
        name:
            The name to give the group.

    Returns:
        The regex wrapped as ``(?P<name>regex_str)``.
    """
    return "(?P<{}>{})".format(name, regex_str)


def optional_named_capture_group(regex_str: str, name: str) -> str:
    """
    Enclose a regex in an optional named capture group, ``(?P<name>...)?``.

    Args:
        regex_str:
            The regex to enclose.
        name:
            The name to give the group.

    Returns:
        The regex wrapped as ``(?P<name>regex_str)?``.
    """
    return "(?P<{}>{})?".format(name, regex_str)


def regex_or(
    *regex_strings: str,
    wrap_each_in_noncapture_group: bool = False,
    wrap_result_in_noncapture_group: bool = False,
) -> str:
    """
    Build a regex that matches any one of the supplied alternatives.

    Args:
        regex_strings:
            The alternatives, to be joined with ``|``.
        wrap_each_in_noncapture_group:
            Turn each ``component`` into ``(?:component)`` before joining?
        wrap_result_in_noncapture_group:
            Turn the joined ``result`` into ``(?:result)``?

    Returns:
        The combined regex string. When exactly one alternative is given,
        it is wrapped at most once, however many of the flags are set.
    """
    if len(regex_strings) == 1:
        # Nothing to join, and a second layer of grouping would add nothing.
        (sole_alternative,) = regex_strings
        wrap_once = (
            wrap_each_in_noncapture_group or wrap_result_in_noncapture_group
        )
        return noncapture_group(sole_alternative) if wrap_once else sole_alternative

    alternatives = (
        map(noncapture_group, regex_strings)
        if wrap_each_in_noncapture_group
        else regex_strings
    )
    joined = "|".join(alternatives)
    return noncapture_group(joined) if wrap_result_in_noncapture_group else joined


def assert_alphabetical(x: Union[str, Iterable[str]]) -> None:
    """
    Asserts that a string (or every string in an iterable of strings) is not
    empty and contains only alphabetical characters.

    Args:
        x:
            A single string, or an iterable of strings, to check.

    Raises:
        AssertionError:
            If any string checked is empty or contains a non-alphabetical
            character, or if an element of the iterable is not a string.
    """
    # fullmatch() is used rather than match(): the "$" in the pattern also
    # matches just before a trailing newline, so match() would wrongly accept
    # strings such as "abc\\n".
    if isinstance(x, str):
        assert _NOT_EMPTY_ALPHABETICAL_ONLY_REGEX.fullmatch(x), (
            f"Should be non-empty and contain only alphabetical characters: "
            f"{x!r}"
        )
    else:
        for s in x:
            assert isinstance(s, str)
            assert _NOT_EMPTY_ALPHABETICAL_ONLY_REGEX.fullmatch(s), (
                f"Should be non-empty and contain only alphabetical "
                f"characters: {s!r} (part of {x!r})"
            )


def first_n_characters_required(x: str, n: int) -> str:
    """
    Returns a regex string that requires the first n characters, and then
    allows the rest as optional as long as they are in sequence.

    For example, ``x="abcde"`` with ``n=2`` gives ``ab(?:c(?:d(?:e)?)?)?``,
    which matches "ab", "abc", "abcd" and "abcde".

    Args:
        x:
            String; must be non-empty and consist only of word characters.
        n:
            Minimum number of characters required at the start

    Returns:
        The regex string.

    Raises:
        AssertionError:
            If ``x`` is empty or contains a non-word character, or if ``n``
            is negative.
    """
    # fullmatch() is used rather than match(): the "$" in the pattern also
    # matches just before a trailing newline, which would let a newline leak
    # into the generated regex.
    assert _NOT_EMPTY_WORD_ONLY_REGEX.fullmatch(x)
    assert n >= 0
    start = x[0:n]
    rest = x[n:]
    # Build from the innermost (last) character outwards, so that each
    # optional character can only appear if all those before it did.
    rest_regex = ""
    for c in reversed(rest):
        rest_regex = optional_noncapture_group(c + rest_regex)
    return start + rest_regex


def anchor(x: str, start: bool = True, end: bool = True) -> str:
    """
    Add start-of-string and/or end-of-string anchors to a regex.

    Args:
        x:
            The regex to anchor.
        start:
            Prefix with ``^``?
        end:
            Suffix with ``$``?

    Returns:
        The regex with the requested anchors attached.
    """
    pieces = [x]
    if start:
        pieces.insert(0, "^")
    if end:
        pieces.append("$")
    return "".join(pieces)