# Source code for crate_anon.common.stringfunc

"""
crate_anon/common/stringfunc.py

===============================================================================

    Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
    Created by Rudolf Cardinal (rnc1001@cam.ac.uk).

    This file is part of CRATE.

    CRATE is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    CRATE is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with CRATE. If not, see <https://www.gnu.org/licenses/>.

===============================================================================

**Simple string functions.**

"""

import fnmatch
from functools import lru_cache
import sys
from typing import Any, List, Optional, Pattern, TextIO, Type

from cardinal_pythonlib.extract_text import wordwrap
import prettytable
import regex


# =============================================================================
# Simple string manipulation
# =============================================================================


def get_digit_string_from_vaguely_numeric_string(s: str) -> str:
    """
    Returns only the digit characters of a string, in their original order.

    For example, converts ``"(01223) 123456"`` to ``"01223123456"``.

    Args:
        s: the string to filter

    Returns:
        the characters of ``s`` for which ``str.isdigit()`` is true, joined
        together
    """
    return "".join(filter(str.isdigit, s))


def reduce_to_alphanumeric(s: str) -> str:
    """
    Strips non-alphanumeric characters from a string.

    For example, converts ``"PE12 3AB"`` to ``"PE123AB"``.

    Args:
        s: the string to filter

    Returns:
        the characters of ``s`` for which ``str.isalnum()`` is true, joined
        together in their original order
    """
    return "".join(ch for ch in s if ch.isalnum())


def remove_whitespace(s: str) -> str:
    """
    Returns a copy of a string with every whitespace character deleted
    (leading, trailing, and internal).

    Args:
        s: the string to process

    Returns:
        ``s`` without any whitespace characters
    """
    return "".join(ch for ch in s if not ch.isspace())


# =============================================================================
# Specification matching
# =============================================================================


@lru_cache(maxsize=None)
def get_spec_match_regex(spec: str) -> Pattern:
    """
    Converts a shell-style pattern (using ``*``, ``?`` and similar wildcards;
    see https://docs.python.org/3/library/fnmatch.html) into a compiled,
    case-insensitive regular expression.

    Results are memoized per ``spec`` string, so each distinct pattern is
    translated and compiled only once.

    Args:
        spec: the pattern to pass to ``fnmatch``, e.g. ``"patient_addr*"``.

    Returns:
        the compiled regular expression
    """
    pattern_text = fnmatch.translate(spec)
    return regex.compile(pattern_text, flags=regex.IGNORECASE)


# =============================================================================
# Printing/encoding
# =============================================================================


def uprint(
    *objects: Any,
    sep: str = " ",
    end: str = "\n",
    file: Optional[TextIO] = None,
) -> None:
    """
    Prints strings to outputs that support UTF-8 encoding, but also to those
    that do not (e.g. Windows stdout, sometimes).

    For non-UTF-8 outputs, characters that the output's encoding cannot
    represent are written as backslash escapes rather than raising
    ``UnicodeEncodeError``.

    Args:
        *objects: things to print
        sep: separator between those objects
        end: print this at the end
        file: file-like object to print to; ``None`` (the default) means
            whatever ``sys.stdout`` is at the time of the call

    See
    https://stackoverflow.com/questions/14630288/unicodeencodeerror-charmap-codec-cant-encode-character-maps-to-undefined

    Examples:

    - Linux, Python 3.6.8 console: ``sys.stdout.encoding == "UTF-8"``
    - Windows, Python 3.7.4 console: ``sys.stdout.encoding == "utf-8"``
    - Windows, Python 3.7.4, from script: ``sys.stdout.encoding == "cp1252"``
    """
    if file is None:
        # Look up sys.stdout at call time (as print() itself does), so that
        # later redirection of sys.stdout is respected.
        file = sys.stdout
    # Some text streams (e.g. io.StringIO) have no encoding, or report None.
    # Nothing gets encoded when writing to those, so plain print() is safe.
    enc = getattr(file, "encoding", None)
    if not enc or enc.lower().replace("_", "-") in ("utf-8", "utf8"):
        print(*objects, sep=sep, end=end, file=file)
        return

    def f(obj: Any) -> str:
        # Round-trip through the target encoding so that anything it cannot
        # represent becomes a backslash escape.
        return str(obj).encode(enc, errors="backslashreplace").decode(enc)

    # https://docs.python.org/3/library/codecs.html#codec-base-classes
    print(*map(f, objects), sep=sep, end=end, file=file)


# =============================================================================
# String tests
# =============================================================================


def does_text_contain_word_chars(text: str) -> bool:
    """
    Is a string worth treating as interesting text -- does it contain "word"
    characters?

    In practice, this checks for at least one printable, non-space ASCII
    character (code points 33 to 126 inclusive). Empty or ``None`` input gives
    ``False``.
    """
    # A regex-based version was measured as slower (as per FS's tests):
    #   regex_any_word_char = regex.compile(r'[\w\W]*[a-zA-Z0-9_][\w\W]*')
    #   return bool(text and regex_any_word_char.match(text))
    if not text:
        return False
    for ch in text:
        if 33 <= ord(ch) <= 126:
            return True
    return False


# =============================================================================
# Docstring manipulation
# =============================================================================


def get_docstring(cls: Type) -> str:
    """
    Returns the docstring of a class, or an empty string if it has none.

    Args:
        cls: the class to inspect

    Returns:
        ``cls.__doc__``, or ``""`` if that is ``None`` or empty
    """
    # Plain attribute access is used rather than getattr() with a default:
    # that is likely unnecessary, since even integer variables have the
    # __doc__ attribute.
    doc = cls.__doc__
    return doc if doc else ""


def compress_docstring(docstring: str) -> str:
    """
    Flattens a docstring onto a single line: every run of whitespace
    (including newlines) becomes one space, and leading/trailing whitespace
    is removed.

    Args:
        docstring: the text to flatten

    Returns:
        the single-line version
    """
    # str.split() with no arguments splits on any whitespace run, newlines
    # included, and discards empty leading/trailing pieces. See
    # https://stackoverflow.com/questions/2077897/substitute-multiple-whitespace-with-single-whitespace-in-python
    words = docstring.split()
    return " ".join(words)


def trim_docstring(docstring: str) -> str:
    """
    Removes initial/terminal blank lines and common leading whitespace from a
    docstring.

    This follows the PEP257 algorithm (https://peps.python.org/pep-0257/),
    with ``sys.maxsize`` standing in for Python 2's ``sys.maxint`` (see
    https://docs.python.org/3.1/whatsnew/3.0.html#integers).

    Demonstration:

    .. code-block:: python

        from crate_anon.common.stringfunc import trim_docstring
        print(trim_docstring.__doc__)
        print(trim_docstring(trim_docstring.__doc__))

    Args:
        docstring: the raw docstring (may be empty or ``None``)

    Returns:
        the trimmed docstring, with lines joined by ``"\\n"``
    """
    if not docstring:
        return ""
    # Tabs become spaces (normal Python rules) before measuring indentation.
    lines = docstring.expandtabs().splitlines()
    body = lines[1:]
    # Smallest indentation among non-blank lines after the first; the first
    # line doesn't count. sys.maxsize means "no such line".
    indent = min(
        (len(ln) - len(ln.lstrip()) for ln in body if ln.lstrip()),
        default=sys.maxsize,
    )
    # The first line is special: it is simply stripped on both sides.
    trimmed = [lines[0].strip()]
    if indent != sys.maxsize:
        trimmed.extend(ln[indent:].rstrip() for ln in body)
    # Locate the span left after discarding trailing, then leading, blank
    # lines.
    start, stop = 0, len(trimmed)
    while stop > 0 and not trimmed[stop - 1]:
        stop -= 1
    while start < stop and not trimmed[start]:
        start += 1
    return "\n".join(trimmed[start:stop])


# =============================================================================
# Tabular
# =============================================================================


def make_twocol_table(
    colnames: List[str],
    rows: List[List[str]],
    max_table_width: int = 79,
    padding_width: int = 1,
    vertical_lines: bool = True,
    rewrap_right_col: bool = True,
) -> str:
    """
    Formats a two-column table as text. Tries not to split/wrap the left-hand
    column; the right-hand column is given whatever width remains.

    Args:
        colnames: the two column headings
        rows: the table rows; element 0 of each is the left-hand cell and
            element 1 the right-hand cell
        max_table_width: maximum overall table width, in characters
        padding_width: padding on each side of every cell, in characters
        vertical_lines: draw vertical rules?
        rewrap_right_col: word-wrap the right-hand text to the right-hand
            column width before adding it to the table?

    Returns:
        the formatted table
    """
    # The left column is as wide as its longest entry, heading included.
    left_width = max(len(entry[0]) for entry in [colnames] + rows)
    vrules = prettytable.ALL if vertical_lines else prettytable.NONE
    table = prettytable.PrettyTable(
        colnames,
        header=True,
        border=True,
        hrules=prettytable.ALL,
        vrules=vrules,
        align="l",  # default alignment for all columns (left)
        valign="t",  # default alignment for all rows (top)
        max_table_width=max_table_width,
        padding_width=padding_width,
    )
    # Remaining space for the right column, after allowing for 3 vertical
    # lines (even if invisible) and 4 paddings (2 per column).
    right_width = max_table_width - left_width - (4 * padding_width) - 3
    table.max_width[colnames[0]] = left_width
    table.max_width[colnames[1]] = right_width
    for row in rows:
        right_text = (
            wordwrap(row[1], width=right_width) if rewrap_right_col else row[1]
        )
        table.add_row([row[0], right_text])
    return table.get_string()


# =============================================================================
# Checking strings for NLP
# =============================================================================

# Pattern source: a single "word" character (``\w``). Used with search(), so
# it matches if such a character occurs anywhere in the string.
_RELEVANT_FOR_NLP_REGEX_STR = r"\w"  # word character present
# Compiled once, at import time, for reuse by relevant_for_nlp().
RELEVANT_FOR_NLP_REGEX = regex.compile(
    _RELEVANT_FOR_NLP_REGEX_STR, flags=regex.IGNORECASE
)
# regex deals with Unicode automatically, as verified in stringfunc_tests.py


def relevant_for_nlp(x: Optional[str]) -> bool:
    """
    Does this string contain content that's relevant for NLP?

    ``None`` values and empty strings are not relevant, and nor are strings
    without any word character: for instance, a string containing only
    whitespace.

    Args:
        x: the string to check (or ``None``)

    Returns:
        is it worth passing to NLP?
    """
    # bool(x) is False for None and for the empty string, and short-circuits
    # the regex search in those cases.
    return bool(x) and RELEVANT_FOR_NLP_REGEX.search(x) is not None