# Source code for crate_anon.nlp_manager.regex_numbers

"""
crate_anon/nlp_manager/regex_numbers.py

===============================================================================

    Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
    Created by Rudolf Cardinal (rnc1001@cam.ac.uk).

    This file is part of CRATE.

    CRATE is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    CRATE is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with CRATE. If not, see <https://www.gnu.org/licenses/>.

===============================================================================

**Constants and functions to assist in making regular expressions relating to
numbers (e.g. integers, floating-point, scientific notation...).**

"""

# =============================================================================
# Helper functions
# =============================================================================


def _negative_lookahead(x: str) -> str:
    """
    Wrap a regex fragment in a negative lookahead assertion.

    Args:
        x: regex fragment that must NOT match at the current position.

    Returns:
        The fragment embedded as ``(?! x )``, i.e. "x does not occur here".
    """
    # The padding spaces are part of the returned text; they are only
    # harmless if the final pattern is compiled in verbose mode
    # (NOTE(review): presumably it is -- confirm against the callers).
    opening, closing = "(?! ", " )"
    return opening + x + closing


def _negative_lookbehind(x: str) -> str:
    """
    Wrap a regex fragment in a negative lookbehind assertion.

    Args:
        x: regex fragment that must NOT immediately precede the current
            position.

    Returns:
        The fragment embedded as ``(?<! x )``.
    """
    # The padding spaces are part of the returned text; they are only
    # harmless if the final pattern is compiled in verbose mode
    # (NOTE(review): presumably it is -- confirm against the callers).
    return "(?<! {} )".format(x)


# =============================================================================
# Mathematical operations
# =============================================================================

# Character classes for a "times" symbol: one of x, *, ×, ⋅
MULTIPLY = r"[x\*×⋅]"
# ... the same, but a whitespace character is accepted too
MULTIPLY_OR_SPACE = r"[x\*×⋅\s]"

# "To the power of" operators, from strictest to most permissive:
POWER = r"(?: \^ | \*\* )"  # ^ or **
POWER_INC_E = r"(?: e | \^ | \*\* )"  # additionally "e"
POWER_INC_E_ASTERISK = r"(?: e | \^ | \*\* | \*)"  # additionally a lone "*"
# ... the lone asterisk is seen e.g. in CUH: "10*9/L" for "×10^9/L"

# Signs. "+" is a regex metacharacter, so it must be escaped.
PLUS_SIGN = r"\+"
# Any one of: ASCII hyphen-minus, Unicode minus, en dash
MINUS_SIGN = r"[-−–]"
# Either sign, as a non-capturing group
SIGN = "(?: " + PLUS_SIGN + " | " + MINUS_SIGN + " )"

# NO_MINUS_SIGN = _negative_lookahead(MINUS_SIGN)
# NO_PRECEDING_MINUS_SIGN = _negative_lookbehind(MINUS_SIGN)
# NO_PRECEDING_MINUS_SIGN_OR_DIGIT = _negative_lookbehind(fr"{MINUS_SIGN} | \d")  # noqa: E501

# Negative lookbehind: what comes next must not be immediately preceded by
# (1) a minus sign, (2) a digit optionally followed by a comma, or (3) a dot.
# NOTE(review): the "\d,?" alternative is variable-width, which the standard
# library "re" module does not accept inside a lookbehind; presumably these
# patterns are compiled with the third-party "regex" module -- confirm.
NO_PRECEDING_MINUS_SIGN_OR_DIGITCOMMA_OR_DOT = _negative_lookbehind(
    " | ".join((MINUS_SIGN, r"\d,?", r"\."))
)


# =============================================================================
# Quantities
# =============================================================================


def times_ten_to_power(n: int) -> str:
    """
    For a power *n*, returns a regex to match "10^n" and similar notations.

    Args:
        n: the exponent; it is interpolated into the pattern as text.

    Returns:
        A non-capturing group matching, in order: an optional multiplication
        symbol (``MULTIPLY``), optional whitespace, ``10``, optional
        whitespace, a power operator (``POWER_INC_E_ASTERISK``: ``e``, ``^``,
        ``**`` or ``*``), optional whitespace, and then *n*.
    """
    # The literal spaces between the tokens are layout only if the pattern is
    # compiled in verbose mode (NOTE(review): presumably it is -- confirm
    # against the callers).
    return rf"(?: {MULTIPLY}? \s* 10 \s* {POWER_INC_E_ASTERISK} \s* {n})"
BILLION = times_ten_to_power(9)
TRILLION = times_ten_to_power(12)


# =============================================================================
# Number components
# =============================================================================
# Don't create components that are entirely optional; they're hard to test!

PLAIN_INTEGER = r"\d+"

# Numbers with commas: https://stackoverflow.com/questions/5917082
# ... then modified a little, because that fails with Python's regex module;
# (a) the "\d+" grabs things like "12,000" and thinks "aha, 12", so we have to
#     fix that by putting the "thousands" bit first; then
# (b) that has to be modified to contain at least one comma/thousands grouping
#     (or it will treat "9800" as "980").

PLAIN_INTEGER_W_THOUSAND_COMMAS = r"(?: (?: \d{1,3} (?:,\d{3})+ ) | \d+ )"
# ... plain integer allowing commas as a thousands separator
#     (1) a number with thousands separators, or
#     (2) a plain number
# ... NOTE: PUT THE ONE THAT NEEDS TO BE GREEDIER FIRST, i.e. the one with
#     thousands separators

FLOATING_POINT_GROUP = r"(?: \. \d+ )"  # decimal point and further digits

# The pattern contains a literal upper-case "E"; a lower-case "e" matches only
# if the final regex is compiled case-insensitively (NOTE(review): confirm
# that callers do so).
SCIENTIFIC_NOTATION_EXPONENT = rf"(?: E {SIGN}? \d+ )"
# ... Scientific notation does NOT offer non-integer exponents.
# Specifically, float("-3.4e-27") is fine, but float("-3.4e-27.1") isn't.

# NO_FOLLOWING_SCIENTIFIC_NOTATION_EXPONENT = _negative_lookahead(
#     SCIENTIFIC_NOTATION_EXPONENT)


# =============================================================================
# Number types
# =============================================================================
# Beware of unsigned types. You may not want a sign, but if you use an
# unsigned type, "-3" will be read as "3".

# Beware this one. You may not want a sign, but if you use this, "-3" will be
# read as "3".
IGNORESIGN_INTEGER = PLAIN_INTEGER_W_THOUSAND_COMMAS

SIGNED_INTEGER = r"(?: {sign}? {integer} )".format(
    sign=SIGN,  # optional
    integer=PLAIN_INTEGER_W_THOUSAND_COMMAS,
)

UNSIGNED_INTEGER = r"(?: {nominus} {plus}? {integer} )".format(
    nominus=NO_PRECEDING_MINUS_SIGN_OR_DIGITCOMMA_OR_DOT,
    plus=PLUS_SIGN,  # optional
    integer=PLAIN_INTEGER_W_THOUSAND_COMMAS,
)

IGNORESIGN_FLOAT = r"(?: {integer} {fp}? )".format(
    integer=PLAIN_INTEGER_W_THOUSAND_COMMAS,
    fp=FLOATING_POINT_GROUP,  # optional
)

SIGNED_FLOAT = r"(?: {sign}? {integer} {fp}? )".format(
    sign=SIGN,  # optional
    integer=PLAIN_INTEGER_W_THOUSAND_COMMAS,
    fp=FLOATING_POINT_GROUP,  # optional
)

UNSIGNED_FLOAT = r"(?: {nominus} {plus}? {integer} {fp}? )".format(
    nominus=NO_PRECEDING_MINUS_SIGN_OR_DIGITCOMMA_OR_DOT,
    plus=PLUS_SIGN,  # optional
    integer=PLAIN_INTEGER_W_THOUSAND_COMMAS,
    fp=FLOATING_POINT_GROUP,  # optional
)

LIBERAL_NUMBER = r"(?: {sign}? {integer} {fp}? {exp}? )".format(
    sign=SIGN,  # optional
    integer=PLAIN_INTEGER_W_THOUSAND_COMMAS,
    fp=FLOATING_POINT_GROUP,  # optional
    exp=SCIENTIFIC_NOTATION_EXPONENT,  # optional
)