Source code for crate_anon.common.regex_helpers

"""
crate_anon/common/regex_helpers.py

===============================================================================

    Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
    Created by Rudolf Cardinal (rnc1001@cam.ac.uk).

    This file is part of CRATE.

    CRATE is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    CRATE is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with CRATE. If not, see <https://www.gnu.org/licenses/>.

===============================================================================

**Constants and helper functions for use with regexes.**

"""

from typing import Iterable, List, Union

import regex  # sudo apt-get install python-regex


# =============================================================================
# Constants
# =============================================================================

# Reminders: ? zero or one, + one or more, * zero or more
# Non-capturing groups: (?:...)
# ... https://docs.python.org/3/howto/regex.html
# ... https://stackoverflow.com/questions/3512471/non-capturing-group

# Regex fragments (plain strings) intended to be concatenated into larger
# patterns, plus a few precompiled patterns used by the helpers below.

ASTERISK = r"\*"  # a literal asterisk
AT_LEAST_ONE_NONWORD = r"\W+"  # one or more non-word characters
AT_LEAST_ONE_WHITESPACE = r"\s+"  # one or more whitespace chars
AT_LEAST_ONE_NON_NEWLINE_WHITESPACE = r"[ \t]+"  # one or more spaces/tabs

HYPHEN_OR_EN_DASH = r"[-–]"  # ASCII hyphen-minus or en dash

LEFT_BRACKET = r"\("  # a literal opening parenthesis

# Precompiled: matches a run of one or more non-word characters.
NON_ALPHANUMERIC_SPLITTERS = regex.compile(AT_LEAST_ONE_NONWORD, regex.UNICODE)

# http://www.regular-expressions.info/lookaround.html
# Not all engines support lookbehind; e.g. regexr.com doesn't; but Python does
NOT_DIGIT_LOOKBEHIND = r"(?<!\d)"  # not immediately preceded by a digit
NOT_DIGIT_LOOKAHEAD = r"(?!\d)"  # not immediately followed by a digit

# The Kleene star has highest precedence.
# So, for example, ab*c matches abbbc, but not (all of) ababc. See regexr.com
OPTIONAL_NONWORD = r"\W*"  # zero or more non-word characters...
# ... doesn't need to be [\W]*, for precedence reasons as above.
OPTIONAL_WHITESPACE = r"\s*"  # zero or more whitespace chars
OPTIONAL_NON_NEWLINE_WHITESPACE = r"[ \t]*"  # zero or more spaces/tabs

# Characters that the escaping helpers in this module prefix with a backslash.
# The backslash itself must stay first: escape_literal_string_for_regex()
# applies the replacements in list order.
REGEX_METACHARS = [
    "\\",
    "^",
    "$",
    ".",
    "|",
    "?",
    "*",
    "+",
    "(",
    ")",
    "[",
    "{",
    "#",  # presumably for verbose (free-spacing) mode -- TODO confirm
    " ",  # escaped as "\ "; the flexible-whitespace helper relies on this
]
# http://www.regular-expressions.info/characters.html
# Start with \, for replacement.

RIGHT_BRACKET = r"\)"  # a literal closing parenthesis

WB = r"\b"  # word boundary; escape the slash if not using a raw string
# Characters that escape_literal_for_regex_allowing_flexible_whitespace()
# treats as interchangeable whitespace.
WHITESPACE_CHARACTERS = [" ", "\t", "\n"]
WORD_BOUNDARY = WB  # longer alias for WB

# Validation patterns used by the helper functions below.
# NOTE: "$" also matches just before a trailing newline, so with .match()
# these patterns accept e.g. "abc\n".
_NOT_EMPTY_WORD_ONLY_REGEX = regex.compile(r"^\w+$")
_NOT_EMPTY_ALPHABETICAL_ONLY_REGEX = regex.compile("^[a-zA-Z]+$")
# cf. https://stackoverflow.com/questions/336210/regular-expression-for-alphanumeric-and-underscores  # noqa


# =============================================================================
# Helper functions
# =============================================================================


def escape_literal_string_for_regex(s: str) -> str:
    r"""
    Escape a literal string so that it can be embedded in a regex.

    Every character listed in :data:`REGEX_METACHARS` is prefixed with a
    backslash; all other characters pass through unchanged. For example,
    ``Hello there.`` becomes ``Hello\ there\.``

    Args:
        s: the literal text to escape

    Returns:
        the escaped text, as a single string
    """
    # Single left-to-right pass: each input character is examined exactly
    # once, so a backslash inserted here is never itself escaped again.
    return "".join(
        "\\" + char if char in REGEX_METACHARS else char for char in s
    )
def escape_literal_for_regex_giving_charlist(s: str) -> List[str]:
    r"""
    Escape a literal string for regex use, one list element per character.

    Each character of ``s`` yields exactly one element: the character itself,
    or, if it appears in :data:`REGEX_METACHARS`, the character preceded by a
    backslash (so that element is two characters long).

    Args:
        s: the literal text to escape

    Returns:
        a list with ``len(s)`` elements, in the original order
    """
    return [
        ("\\" + char) if char in REGEX_METACHARS else char
        for char in s
    ]
def escape_literal_for_regex_allowing_flexible_whitespace(s: str) -> str:
    r"""
    Escape literal text for a regex, letting each stretch of whitespace in
    the original match any amount of whitespace (e.g. a double space).

    For example, maps ``Hello there.`` to ``Hello\s+there\.``

    Args:
        s: the literal text to convert

    Returns:
        a regex string in which every run of the characters in
        :data:`WHITESPACE_CHARACTERS` has become ``\s+`` and every other
        regex metacharacter has been escaped
    """
    # Replace all recognised forms of whitespace with plain spaces.
    for c in WHITESPACE_CHARACTERS:
        s = s.replace(c, " ")
    # Collapse runs of spaces down to a single space. The needle must be TWO
    # spaces: searching for a single space and replacing it with a single
    # space would never terminate. It is built explicitly so that the
    # distinction cannot be lost to whitespace reformatting.
    double_space = " " * 2
    while double_space in s:
        s = s.replace(double_space, " ")
    # Escape regex characters. A space is itself in REGEX_METACHARS, so it
    # comes back as "\ ", which is then swapped for the flexible form.
    s = escape_literal_string_for_regex(s)
    return s.replace(r"\ ", AT_LEAST_ONE_WHITESPACE)
def at_wb_start_end(regex_str: str) -> str:
    """
    Surround a regex with word-boundary assertions on both sides.

    Use with care: a digit does not end a word, so a unit pattern such as
    "mm" wrapped this way will not match inside "mm3".

    Args:
        regex_str: the regex to wrap

    Returns:
        ``regex_str`` preceded and followed by a word boundary
    """
    boundary = r"\b"
    return "{0}{1}{0}".format(boundary, regex_str)
def at_start_wb(regex_str: str) -> str:
    """
    Prefix a regex with a word-boundary assertion.

    Beware: text such as "3kg" is perfectly reasonable, yet there is no word
    boundary between the "3" and the "kg".

    Args:
        regex_str: the regex to prefix

    Returns:
        ``regex_str`` preceded by a word boundary
    """
    boundary = r"\b"
    return "{}{}".format(boundary, regex_str)
def noncapture_group(regex_str: str) -> str:
    """
    Enclose a regex in a non-capturing group, ``(?: ... )``.

    Args:
        regex_str: the regex to enclose

    Returns:
        the grouped regex
    """
    return "(?:{})".format(regex_str)
def optional_noncapture_group(regex_str: str) -> str:
    """
    Enclose a regex in a non-capturing group that may match zero times or
    once, ``(?: ... )?``.

    Args:
        regex_str: the regex to enclose

    Returns:
        the grouped, optional regex
    """
    return "(?:{})?".format(regex_str)
def named_capture_group(regex_str: str, name: str) -> str:
    """
    Enclose a regex in a named capturing group, ``(?P<name>...)``.

    The ``P`` marks a Python-specific extension; see
    https://docs.python.org/3/howto/regex.html#non-capturing-and-named-groups

    Args:
        regex_str: the regex to enclose
        name: the group name

    Returns:
        the named-group regex
    """
    return "(?P<{group}>{body})".format(group=name, body=regex_str)
def optional_named_capture_group(regex_str: str, name: str) -> str:
    """
    Enclose a regex in a named capturing group that may match zero times or
    once, ``(?P<name>...)?``.

    Args:
        regex_str: the regex to enclose
        name: the group name

    Returns:
        the optional named-group regex
    """
    return "(?P<{group}>{body})?".format(group=name, body=regex_str)
def regex_or(
    *regex_strings: str,
    wrap_each_in_noncapture_group: bool = False,
    wrap_result_in_noncapture_group: bool = False,
) -> str:
    """
    Build a regex that matches any one of the given alternatives.

    Args:
        regex_strings:
            The alternatives, which are joined with ``|``.
        wrap_each_in_noncapture_group:
            Turn each ``component`` into ``(?:component)`` before joining?
        wrap_result_in_noncapture_group:
            Turn the final ``result`` into ``(?:result)``?

    Returns:
        The combined regex. When exactly one alternative is supplied it is
        wrapped at most once, even if both wrapping options are set.
    """
    if len(regex_strings) == 1:
        # Shortcut: nothing to join, and one layer of grouping is enough.
        sole = regex_strings[0]
        wanted_wrap = (
            wrap_each_in_noncapture_group or wrap_result_in_noncapture_group
        )
        return f"(?:{sole})" if wanted_wrap else sole

    if wrap_each_in_noncapture_group:
        alternatives = [f"(?:{part})" for part in regex_strings]
    else:
        alternatives = regex_strings
    joined = "|".join(alternatives)

    if not wrap_result_in_noncapture_group:
        return joined
    return f"(?:{joined})"
def assert_alphabetical(x: Union[str, Iterable[str]]) -> None:
    """
    Asserts that a string (or every string in an iterable of strings) is not
    empty and contains only alphabetical characters, meaning the ASCII
    letters ``a-z`` and ``A-Z``.

    Args:
        x: a single string, or an iterable of strings

    Raises:
        AssertionError: if any string is empty or contains a character that
            is not an ASCII letter (including a trailing newline), or if an
            element of the iterable is not a string
    """
    # fullmatch(), not match(): the pattern ends in "$", which also matches
    # just before a final newline, so match() would wrongly accept "abc\n".
    is_alphabetical = _NOT_EMPTY_ALPHABETICAL_ONLY_REGEX.fullmatch
    if isinstance(x, str):
        assert is_alphabetical(x), (
            f"Should be non-empty and contain only alphabetical characters: "
            f"{x!r}"
        )
    else:
        for s in x:
            assert isinstance(s, str)
            assert is_alphabetical(s), (
                f"Should be non-empty and contain only alphabetical "
                f"characters: {s!r} (part of {x!r})"
            )
def first_n_characters_required(x: str, n: int) -> str:
    """
    Returns a regex string that requires the first n characters, and then
    allows the rest as optional as long as they are in sequence.

    For example, ``first_n_characters_required("abcd", 2)`` gives
    ``ab(?:c(?:d)?)?``, which matches ``ab``, ``abc`` and ``abcd``.

    Args:
        x: String; must be non-empty and consist only of word characters
        n: Minimum number of characters required at the start

    Returns:
        the regex string

    Raises:
        AssertionError: if ``x`` is empty or contains a non-word character
            (including a trailing newline), or if ``n`` is negative
    """
    # fullmatch(), not match(): the pattern ends in "$", which also matches
    # just before a final newline, so match() would let "abc\n" through and
    # the newline would end up inside the generated regex.
    assert _NOT_EMPTY_WORD_ONLY_REGEX.fullmatch(
        x
    ), f"Should be non-empty and contain only word characters: {x!r}"
    assert n >= 0, f"n must not be negative: {n!r}"
    start = x[:n]
    rest = x[n:]
    # Build the optional tail from the innermost (last) character outwards,
    # so that each character is optional only if all its predecessors match.
    rest_regex = ""
    for c in reversed(rest):
        rest_regex = optional_noncapture_group(c + rest_regex)
    return start + rest_regex
def anchor(x: str, start: bool = True, end: bool = True) -> str:
    """
    Pin a regex to the start and/or the end of the text.

    Args:
        x: the regex to anchor
        start: prepend ``^``?
        end: append ``$``?

    Returns:
        the regex with the requested anchors added
    """
    pieces = []
    if start:
        pieces.append("^")
    pieces.append(x)
    if end:
        pieces.append("$")
    return "".join(pieces)