Source code for crate_anon.nlp_manager.build_medex_itself

#!/usr/bin/env python

"""
crate_anon/nlp_manager/build_medex_itself.py

===============================================================================

    Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
    Created by Rudolf Cardinal (rnc1001@cam.ac.uk).

    This file is part of CRATE.

    CRATE is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    CRATE is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with CRATE. If not, see <https://www.gnu.org/licenses/>.

===============================================================================

**Script to compile (and modify slightly) Java source for MedEx-UIMA.**

"""

import argparse
import glob
import logging
import os
import subprocess
import sys
from typing import Dict, List, Tuple, Union

from cardinal_pythonlib.fileops import purge
from cardinal_pythonlib.logs import configure_logger_for_colour
import chardet
from rich_argparse import ArgumentDefaultsRichHelpFormatter

from crate_anon.common.constants import EnvVar

log = logging.getLogger(__name__)

EXIT_FAILURE = 1

if EnvVar.GENERATING_CRATE_DOCS in os.environ:
    DEFAULT_MEDEX_DIR = "/path/to/Medex/installation"
else:
    DEFAULT_MEDEX_DIR = os.path.join(
        os.path.expanduser("~"), "dev", "Medex_UIMA_1.3.6"
    )

DEFAULT_JAVA = "java"
DEFAULT_JAVAC = "javac"

EXTRA_ROUTES = [
    "i/m",
    "i.m.",
    "i. m.",
    "intramuscularly",
    "intramuscular inj.",
    "intramuscular injection",
    "inh",
    "inh.",
    "i/v",
    "i.v.",
    "i. v.",
    "nasal",
    "nasally",
    "nebs",
    "nebulised",
    "nebuliser",
    "nebulized",
    "nebulizer",
    "ng",
    "n/g",
    "n.g.",
    "n. g.",
    "nasogastric",
    "nasogastrically",
    "nj",
    "n/j",
    "n.j.",
    "n. j.",
    "p/o",
    "p.o.",
    "p. o.",
    "pr",
    "p/r",
    "p.r.",
    "p. r.",
    "s/c",
    "s.c.",
    "s. c.",
    "top",
    "top.",
]
EXTRA_FREQUENCIES = [  # Tuples of (literal, TIMEX3)
    # EXTRA FOR UK FREQUENCIES; see
    # http://www.evidence.nhs.uk/formulary/bnf/current/general-reference/latin-abbreviations  # noqa
    # TIMEX3 codes:
    # http://www.timeml.org/tempeval2/tempeval2-trial/guidelines/timex3guidelines-072009.pdf
    # qqh, quarta quaque hora
    ("q.q.h.", "R1P4H"),
    # qds, quater die sumendum; MUST BE BEFORE COMPETING "qd" (= per day)
    # expression, e.g. in frequency_rules:
    # expression="[Qq]\.?[ ]?[Dd]\.?",val="R1P24H"
    ("q.d.s.", "R1P6H"),
    # tds, ter die sumendum
    ("t.d.s.", "R1P8H"),
    # bd, bis die
    ("b.d.", "R1P12H"),
    # od, omni die
    ("o.d.", "R1P24H"),
    # mane
    ("mane", "R1P24H"),
    # om, omni mane
    ("o.m.", "R1P24H"),
    # nocte
    ("nocte", "R1P24H"),
    # on, omni nocte -- beware also the word "on"...
    ("o.n.", "R1P24H"),
    # fortnightly and variants
    ("fortnightly", "R1P2W"),  # W: page 9 of TIMEX3 PDF above
    ("2 weekly", "R1P2W"),
    ("two weekly", "R1P2W"),
    # monthly
    ("monthly", "R1P1M"),  # M: page 8 of TIMEX3 PDF above
]
DO_NOT_REMOVE_DOTS = [
    "o.n."
    # the word "on" is too confusing; e.g. "Start olanzapine 5mg nocte." is
    # fine; "Start olanzapine 5mg on." is tolerable, but too easily confused
    # with "Start olanzapine 5mg on Tuesday."
]

SEM_ENG_TRIGGER_LINE_TRIMMED = "Map regexlist = new Hashtable();"
FREQ_RULE_TRIGGER_LINE_TRIMMED = "FREQUENCY:"
SOURCE_START_MARKER = "// START CRATE MODIFICATIONS"
SOURCE_END_MARKER = "// END CRATE MODIFICATIONS"


[docs]def terminate(x: str) -> str: """ Terminates its input with a newline. """ return x + "\n"
[docs]def lex_freq(x: str) -> str: """ For MedEx's ``lexicon.cfg``: creates a frequency line. """ return f"{x}\tFREQ"
[docs]def lex_route(x: str) -> str: """ For MedEx's ``lexicon.cfg``: creates a route line. """ return f"{x}\tRUT"
[docs]def semantic_rule_engine_line( frequency: str, dots_optional: bool = True ) -> str: """ For MedEx: create a semantic rule engine line (a line of Java to be inserted). Args: frequency: string representing the frequency, e.g. "b.d." dots_optional: if ``frequency`` contains full stops, are they optional? Returns: a line of Java code """ # NB case-insensitive regexes in SemanticRuleEngine.java, so ignore case # here # If you need to put in a \, double it to \\ for Java's benefit. regex_str = "" for c in frequency: if c == " ": regex_str += r"\\s+" elif c == ".": if dots_optional: regex_str += r"\\.?\\s*" else: regex_str += r"\\.\\s*" else: regex_str += c return rf' regexlist.put("^({regex_str})( |$)", "FREQ"); // RNC'
[docs]def frequency_rules_line( frequency: str, timex: str, dots_optional: bool ) -> str: """ Creates a line for MedEx's ``frequency_rules`` file. Args: frequency: the string representing a drug frequency, e.g. "b.d." timex: a TIMEX version of this frequency dots_optional: if ``frequency`` contains full stops, are they optional? Returns: a line to go into the ``frequency_rules`` file """ # NB case-sensitive regexes in Rule.java, so offer upper- and lower-case # alternatives here. # No need for word boundaries with \b, since at this stage all words have # already been separated by the tokenization process. regex_str = "" for c in frequency: if c == " ": regex_str += r"\s+" elif c == ".": if dots_optional: regex_str += r"\.?\s?" else: regex_str += r"\.\s?" elif c.isalpha(): # Case-insensitive here. regex_str += rf"[{c.upper()}{c.lower()}]" else: regex_str += c return rf'expression="{regex_str}",val="{timex}"'
[docs]def add_lines_if_not_in(filename: str, lines: List[str]) -> None: r""" Adds lines to a file, if they're not already there. Args: filename: name of file to modify lines: lines to insert Elements of lines should not have their own ``\n`` characters. """ # MB 2020-07-14 # In MedEx 1.3.8 lexicon.cfg contains invalid UTF-8 # chardet detects Windows-1252 rawdata = open(filename, "rb").read() log.info(f"Detecting encoding for: {filename}") encoding = chardet.detect(rawdata)["encoding"] log.info(f"Detected: {encoding}") with open(filename, "r", encoding=encoding) as f: existing = f.readlines() # will have trailing newlines log.info(f"Read {len(existing)} lines from {filename}") # print(existing[-5:]) with open(filename, "a") as f: for line in lines: if terminate(line) not in existing: log.info(f"Adding {filename} line: {line!r}") f.write(terminate(line))
[docs]def add_lines_after_trigger( filename: str, trigger: str, start_marker: str, end_marker: str, lines: List[str], ) -> None: r""" Adds lines to a file, after a triggering line. Args: filename: name of file to modify trigger: line that begins the section of interest; we don't start paying attention until this is encountered start_marker: see below end_marker: see below lines: lines to insert Immediately after we've encountered ``trigger``, we insert ``start_marker``, then ``lines``, then ``end_marker``. If the file already has such a block, we chop out the old block before inserting the new. Elements of lines should not have their own ``\n`` characters. """ with open(filename, "r") as f: existing = f.readlines() log.info(f"Read {len(existing)} lines from {filename}") with open(filename, "w") as f: index = 0 for line in existing: f.write(line) index += 1 if line.strip() == trigger: break # ... index now pointing to one after the trigger line # Excise an existing block of ours? if index < len(existing) and existing[index] == terminate( start_marker ): while index < len(existing) and existing[index] != terminate( end_marker ): index += 1 index += 1 # line after end_marker # Add stuff f.write(terminate(start_marker)) for line in lines: log.info(f"Adding {filename} line: {line!r}") f.write(terminate(line)) f.write(terminate(end_marker)) # Write the rest for line in existing[index:]: f.write(line)
[docs]def replace_in_file( filename: str, changes: List[Tuple[str, str]], count: int = -1, encoding: str = "utf8", backup_suffix: str = "~", ) -> None: """ Replaces content in a file. Args: filename: name of file to modify changes: list of ``old, new`` tuples; we will replace ``old`` by ``new`` in each case count: up to how many times should we perform the replacement? See :func:`str.replace`. encoding: character encoding to be used backup_suffix: we'll create a backup file; what should we append to the filename to give the name of the backup file? """ log.info(f"Replacing code in file: {filename}") # Read contents with open(filename, encoding=encoding) as input_file: original_content = input_file.read() # Replace new_content = original_content for old, new in changes: new_content = new_content.replace(old, new, count) # Check for differences if new_content == original_content: log.info("... nothing to do") return # Make backup, if different backup_name = filename + backup_suffix os.rename(filename, backup_name) log.info(f"... backup is: {backup_name!r}") # Write out new with open(filename, "w", encoding=encoding) as output_file: output_file.write(new_content)
[docs]def main() -> None: """ Command-line processor. See command-line help. """ # ------------------------------------------------------------------------- # Arguments # ------------------------------------------------------------------------- # noinspection PyTypeChecker parser = argparse.ArgumentParser( description="Compile MedEx-UIMA itself (in Java)", formatter_class=ArgumentDefaultsRichHelpFormatter, ) parser.add_argument( "--medexdir", default=DEFAULT_MEDEX_DIR, help="Root directory of MedEx installation", ) parser.add_argument("--javac", default=DEFAULT_JAVAC, help="Java compiler") parser.add_argument( "--deletefirst", action="store_true", help="Delete existing .class files first (optional)", ) parser.add_argument( "--verbose", "-v", action="store_true", help="Be verbose" ) args = parser.parse_args() # ------------------------------------------------------------------------- # Logging # ------------------------------------------------------------------------- loglevel = logging.DEBUG if args.verbose else logging.INFO rootlogger = logging.getLogger() configure_logger_for_colour(rootlogger, level=loglevel) if not os.path.exists(args.medexdir): log.error( f"Could not find Medex installation at {args.medexdir}. " f"Is Medex installed? Have you set --medexdir correctly?" ) sys.exit(EXIT_FAILURE) # Remove garbage Apple backup files hidden_pattern = os.path.join(args.medexdir, "**", "._*") for hidden in glob.glob(hidden_pattern, recursive=True): log.info(f"Removing file {hidden}") os.remove(hidden) # ------------------------------------------------------------------------- # Add lexicon entries # ------------------------------------------------------------------------- lexfilename = os.path.join(args.medexdir, "resources", "lexicon.cfg") lexlines = [lex_route(route) for route in EXTRA_ROUTES] for frequency, _ in EXTRA_FREQUENCIES: lexlines.append(lex_freq(frequency)) if "." in frequency: lexlines.append(lex_freq(frequency.replace(".", ". "))) if frequency not in DO_NOT_REMOVE_DOTS: lexlines.append(lex_freq(frequency.replace(".", ""))) # Need to add variants, e.g. "om" for "o.m."? add_lines_if_not_in(lexfilename, lexlines) # ------------------------------------------------------------------------- # Add frequency tags to SemanticRuleEngine.java # ------------------------------------------------------------------------- semengfilename = os.path.join( args.medexdir, "src", "org", "apache", "medex", "SemanticRuleEngine.java", ) semlines = [ semantic_rule_engine_line( frequency, frequency not in DO_NOT_REMOVE_DOTS ) for frequency, _ in EXTRA_FREQUENCIES ] add_lines_after_trigger( semengfilename, SEM_ENG_TRIGGER_LINE_TRIMMED, SOURCE_START_MARKER, SOURCE_END_MARKER, semlines, ) # ------------------------------------------------------------------------- # Add frequency tags to frequency_rules # ------------------------------------------------------------------------- freqrulefilename = os.path.join( args.medexdir, "resources", "TIMEX", "rules", "frequency_rules" ) frlines = [ frequency_rules_line( frequency, timex, frequency not in DO_NOT_REMOVE_DOTS ) for frequency, timex in EXTRA_FREQUENCIES ] add_lines_after_trigger( freqrulefilename, FREQ_RULE_TRIGGER_LINE_TRIMMED, SOURCE_START_MARKER, SOURCE_END_MARKER, frlines, ) # ------------------------------------------------------------------------- # Fix bugs! Argh. # ------------------------------------------------------------------------- # MB 2020-07-10 # These lines no longer match in MedEx 1.3.8. It looks like at least the # first bug is fixed. bugfixes = [ { "filename": os.path.join( args.medexdir, "src", "org", "apache", "NLPTools", "Document.java", ), "changes": [ { "comment": """ Medex confuses & and &&, leading to Exception in thread "main" java.lang.StringIndexOutOfBoundsException: String index out of range: 2 at java.lang.String.charAt(Unknown Source) at org.apache.NLPTools.Document.<init>(Document.java:134) at org.apache.medex.MedTagger.run_batch_medtag(MedTagger.java:256) at CrateMedexPipeline.processInput(CrateMedexPipeline.java:302) at CrateMedexPipeline.<init>(CrateMedexPipeline.java:128) at CrateMedexPipeline.main(CrateMedexPipeline.java:320) """, # noqa "wrong": r"while(cur_pos<llen & (txt.charAt(cur_pos)==' ' || txt.charAt(cur_pos)=='\n' || txt.charAt(cur_pos)=='\r') ){", # noqa "right": r"while(cur_pos<llen && (txt.charAt(cur_pos)==' ' || txt.charAt(cur_pos)=='\n' || txt.charAt(cur_pos)=='\r') ){", # noqa # -----------------------------^ }, ], }, # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ { "filename": os.path.join( args.medexdir, "src", "org", "apache", "algorithms", "SuffixArray.java", ), "changes": [ { "comment": """ java.lang.StringIndexOutOfBoundsException: String index out of range: 1 at java.lang.String.charAt(Unknown Source) at org.apache.algorithms.SuffixArray.construct_tree_word(SuffixArray.java:375) at org.apache.algorithms.SuffixArray.re_build(SuffixArray.java:97) at org.apache.algorithms.SuffixArray.<init>(SuffixArray.java:60) at org.apache.medex.MedTagger.medtagging(MedTagger.java:359) at org.apache.medex.MedTagger.run_batch_medtag(MedTagger.java:264) at CrateMedexPipeline.processInput(CrateMedexPipeline.java:302) at CrateMedexPipeline.<init>(CrateMedexPipeline.java:128) at CrateMedexPipeline.main(CrateMedexPipeline.java:320) Offending code in SuffixArray.java: for (int i=0;i<this.N;i++){ int pos=this.SA[i]; if (this.otext.charAt(pos) != ' ' && this.otext.charAt(pos) != '\n' && this.otext.charAt(pos) != this.end_char && (pos == 0 || (this.otext.charAt(pos-1) == ' ' || this.otext.charAt(pos-1) == '\n'))){ this.insert_SF_tree(this.SA[i], 0, 0); //# 0 denote the root in __SA; } } The bug may relate to what's in SA[i]... but as a simple fix: """, # noqa "wrong": r"if (this.otext.charAt(pos) != ' ' && this.otext.charAt(pos) != '\n' && this.otext.charAt(pos) != this.end_char && (pos == 0 || (this.otext.charAt(pos-1) == ' ' || this.otext.charAt(pos-1) == '\n'))){", # noqa "right": r"if (pos < this.otext.length() && this.otext.charAt(pos) != ' ' && this.otext.charAt(pos) != '\n' && this.otext.charAt(pos) != this.end_char && (pos == 0 || (this.otext.charAt(pos-1) == ' ' || this.otext.charAt(pos-1) == '\n'))){", # noqa # -------------^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ }, ], }, ] # type: List[Dict[str, Union[str, List[Dict[str, str]]]]] _ = """ BUGS IN MEDEX-UIMA NOT YET FIXED: java.lang.ArrayIndexOutOfBoundsException: -1 at java.util.Vector.elementData(Unknown Source) at java.util.Vector.get(Unknown Source) at org.apache.NLPTools.SentenceBoundary.detect_boundaries(SentenceBoundary.java:329) at org.apache.medex.MedTagger.medtagging(MedTagger.java:354) at org.apache.medex.MedTagger.run_batch_medtag(MedTagger.java:264) at CrateMedexPipeline.processInput(CrateMedexPipeline.java:312) at CrateMedexPipeline.runPipeline(CrateMedexPipeline.java:138) at CrateMedexPipeline.<init>(CrateMedexPipeline.java:112) at CrateMedexPipeline.main(CrateMedexPipeline.java:330) java.lang.NullPointerException at org.apache.algorithms.SuffixArray.search(SuffixArray.java:636) at org.apache.medex.MedTagger.medtagging(MedTagger.java:362) at org.apache.medex.MedTagger.run_batch_medtag(MedTagger.java:264) at CrateMedexPipeline.processInput(CrateMedexPipeline.java:312) at CrateMedexPipeline.runPipeline(CrateMedexPipeline.java:138) at CrateMedexPipeline.<init>(CrateMedexPipeline.java:112) at CrateMedexPipeline.main(CrateMedexPipeline.java:330) ... frankly, it's just badly written. That's clearly why it uses the "catch all exceptions" strategy, but one would imagine the errors are unintentional (certainly the &/&& one!) or else they wouldn't print a stack trace and chug on. """ # noqa for bf in bugfixes: filename = bf["filename"] changes = [] # type: List[Tuple[str, str]] for change in bf["changes"]: changes.append((change["wrong"], change["right"])) replace_in_file(filename, changes) # ------------------------------------------------------------------------- # Clean up first? # ------------------------------------------------------------------------- if args.deletefirst: purge(args.medexdir, "*.class") # ------------------------------------------------------------------------- # Compile # ------------------------------------------------------------------------- bindir = os.path.join(args.medexdir, "bin") classpath = os.pathsep.join( [ os.path.join(args.medexdir, "src"), os.path.join(args.medexdir, "lib", "*"), # jar files ] ) classpath_options = ["-classpath", classpath] os.chdir(args.medexdir) cmdargs = ( [args.javac] + classpath_options + ["src/org/apache/medex/Main.java"] + # ... compiling this compiles everything else necessary ["-d", bindir] # put the binaries here ) log.info(f"Executing command: {cmdargs}") subprocess.check_call(cmdargs)
if __name__ == "__main__": main()