/*
CrateMedexPipeline.java
===============================================================================
Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
Created by Rudolf Cardinal (rnc1001@cam.ac.uk).
This file is part of CRATE.
CRATE is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
CRATE is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with CRATE. If not, see <https://www.gnu.org/licenses/>.
===============================================================================
Implements a Java interface to MedEx-UIMA that receives "data is available"
signals and returns "processed data ready for collection" signals via
stdin/stdout. See medex_parser.py.
Note in particular:
- the MedTagger class opens "sents/X" (where X is the input file) from the
directory System.getProperty("user.dir"), curse it.
- That is the directory from which you run the Java program.
- It's hard to set the "user.dir" property reliably, apparently:
http://stackoverflow.com/questions/840190/changing-the-current-working-directory-in-java
... I comment in passing that this is not something that makes one think
"Java - wow"...
- So we must simply CALL THIS PROGRAM FROM AN APPROPRIATE (e.g. temporary)
DIRECTORY, AND MAKE DIRECTORIES IT WANTS:
sents
log
*/
// no "package" declaration required (this class lives in the default package)
import java.util.*;
import java.io.*;
import java.text.SimpleDateFormat;
import org.apache.medex.MedTagger;
/**
 * CrateMedexPipeline is a command-line program that fires up MedEx, reads
 * "data ready" signals from stdin, asks MedEx to process input disk files into
 * output disk files, and writes "results ready" signals to stdout.
 *
 * Status/log messages are written to stderr (see status()); stdout carries
 * the "results ready" signals and the usage message.
 *
 * Note that constructing an instance runs the whole pipeline, and that the
 * process is terminated via System.exit() on "-h"/"--help", on bad
 * arguments, and on an exception escaping the pipeline.
 */
public class CrateMedexPipeline {

    // ========================================================================
    // Members
    // ========================================================================

    // Constants
    private static final String m_defaultlogprefix = "crate_medex_pipeline:";
    private static final String m_default_data_ready_signal = "DATA_READY";
    private static final String m_default_results_ready_signal = "RESULTS_READY";
    // NB SimpleDateFormat is not thread-safe; this class starts no threads
    // of its own.
    private static final SimpleDateFormat m_datetimefmt = new SimpleDateFormat(
        "yyyy-MM-dd HH:mm:ss.SSS");
    private static final String TAB = "\t";
    private static final String NEWLINE = "\n";

    // Interface
    private String m_args[];

    // Options
    private String m_data_ready_signal = m_default_data_ready_signal;
    private String m_results_ready_signal = m_default_results_ready_signal;
    // UNUSED // private String m_file_encoding = "UTF-8";  // null would be: use system default
    private int m_verbose = 0;

    // Internal
    private String m_logprefix = m_defaultlogprefix;
    private int m_count = 0;  // number of "data ready" signals processed so far
    private String m_pipe_encoding = "UTF-8";  // encoding for stdin/stdout signals
    private PrintStream m_stdout = null;

    // MedEx tagger object
    private MedTagger m_medtagger = null;

    // MedEx options
    // ... this ugly file-finding code is as per MedEx:
    private static final String m_location = MedTagger.class.getProtectionDomain().getCodeSource().getLocation().getPath();
    private String m_lexicon_file = resource("lexicon.cfg");
    private String m_rxnorm_file = resource("brand_generic.cfg");
    private String m_code_file = resource("code.cfg");
    private String m_generic_file = resource("rxcui_generic.cfg");
    private String m_norm_file = resource("norm.cfg");
    private String m_word_file = resource("word.txt");
    private String m_abbr_file = resource("abbr.txt");
    private String m_grammar_file = resource("grammar.txt");
    private String m_if_detect_sents = "y";  // -b [yn]: use built-in sentence boundary detector?
    private String m_if_freq_norm = "y";  // -f [yn]: normalize frequency to TIMEX3 format? (e.g. "b.i.d." -> "R1P12H")
    private String m_if_drool_engine = "n";  // -d [yn]: "use drool engine? ... The default setting is to use the built-in rules for disambiguation (faster)"
    private String m_if_offset_showed = "y";  // -p [yn]: show offset information?
    private String m_if_output_tag = "n";  // -t [yn]: show tagging information?
    private String m_input_dir = null;  // set by -i (required)
    private String m_output_dir = null;  // set by -o (required)

    // ========================================================================
    // Constructor
    // ========================================================================

    /**
     * Process command-line arguments and execute the pipeline.
     *
     * @param args command-line arguments, as passed to main()
     * @throws IOException if the stdout PrintStream cannot be created with
     *         the pipe encoding
     */
    public CrateMedexPipeline(String args[]) throws IOException {
        m_stdout = new PrintStream(System.out, true, m_pipe_encoding);
        m_args = args;
        processArgs();
        if (m_verbose > 0) {
            reportArgs();
        }
        try {
            runPipeline();
        } catch (Exception e) {
            status("Uncaught exception; aborting; stack trace follows");
            e.printStackTrace();
            abort();  // otherwise, Java exits with an UNDEFINED (e.g. 0 = "happy") return code
            // NOTE ALSO THAT MEDEX CATCHES ITS OWN GENERAL EXCEPTIONS, PRINTS
            // A STACK TRACE, AND CARRIES ON. See e.g. MedTagger.java, and
            // search for printStackTrace.
        }
    }

    /**
     * Starts MedEx; read "data ready" signals from stdin; asks MedEx to
     * batch-process files on disk (creating output disk files); writes
     * "results ready" signals to stdout.
     *
     * Lines on stdin that do not exactly equal the "data ready" signal are
     * ignored. The loop ends when stdin reaches end-of-file.
     *
     * @throws IOException on failure to read stdin, or as thrown by the
     *         MedEx set-up/processing calls
     */
    private void runPipeline() throws IOException {
        setupMedex();
        status("Ready for input");

        // Wait for each "data ready" signal, then process files.
        BufferedReader br = new BufferedReader(
            new InputStreamReader(System.in, m_pipe_encoding));
        while (true) {
            String line = br.readLine();
            if (m_verbose >= 2) {
                status("Contents of stdin: " + line);
            }
            if (line == null) {
                // End of stdin: our caller has finished with us.
                break;
            }
            if (line.equals(m_data_ready_signal)) {
                // Process text
                processInput();
                signalResultsReady();
                ++m_count;
            }
        }
    }

    // ========================================================================
    // Handling of args, stdin, etc.
    // ========================================================================

    /** Exit in a happy way. */
    private void exit() {
        System.exit(0);
    }

    /** Exit in a sad way. */
    private void abort() {
        System.exit(1);
    }

    /** Complain that the user has passed bad command-line arguments. Exit. */
    private void fail(String msg) {
        status(msg);
        reportArgs();
        abort();
    }

    /** Show a usage message (on stdout). */
    private void usage() {
        System.out.print(
            "usage: CrateMedexPipeline -i DIR -o DIR\n" +
            " [-h] [-v [-v]] [-lt LOGTAG]\n" +
            " [-data_ready_signal DATA_READY]\n" +
            " [-results_ready_signal RESULTS_READY]\n" +
            "\n" +
            "Java front end to MedEx-UIMA natural language processor for drugs.\n" +
            "Takes signals on stdin, and data on disk.\n" +
            "Writes signals to stdout, and data to disk.\n" +
            "\n" +
            "required arguments:\n" +
            " -i DIR (*) Specifies the input directory to read text from.\n" +
            " -o DIR (*) Specifies the output directory to write results to.\n" +
            "\n" +
            "optional arguments:\n" +
            " --help Show this help message and exit.\n" +
            " -h\n" +
            "\n" +
            " -v Verbose (use twice to be more verbose).\n" +
            "\n" +
            " -lt LOGTAG Use an additional tag for stderr logging.\n" +
            " Helpful in multiprocess environments.\n" +
            "\n" +
            " -data_ready_signal DATA_READY\n" +
            " Sets the 'data ready' signal that this program waits for\n" +
            " on stdin before scanning for data.\n" +
            "\n" +
            " -results_ready_signal RESULTS_READY\n" +
            " Sets the 'results ready' signal that this program sends on\n" +
            " stdout once results are ready on disk.\n" +
            "\n" +
            "(*) MedEx argument\n"
        );
    }

    /**
     * Process command-line arguments from m_args, and set internal variables.
     *
     * Option names are matched case-insensitively; option values are used
     * verbatim. Exits the process on "-h"/"--help", on an unknown option, on
     * an option that lacks its value, or if -i or -o is missing.
     */
    private void processArgs() {
        int i = 0;
        int nleft;
        String arg;
        String insufficient = "CrateMedexPipeline: Insufficient arguments while processing ";

        // Process
        while (i < m_args.length) {
            // Locale.ROOT: the default locale must not affect option matching
            // (e.g. in a Turkish locale, "-I" lower-cases to "-ı", not "-i").
            arg = m_args[i++].toLowerCase(Locale.ROOT);
            nleft = m_args.length - i;
            switch (arg) {
                case "-i":
                    if (nleft < 1) fail(insufficient + arg);
                    m_input_dir = m_args[i++];
                    break;
                case "-o":
                    if (nleft < 1) fail(insufficient + arg);
                    m_output_dir = m_args[i++];
                    break;
                case "-h":
                case "--help":
                    usage();
                    exit();
                    break;
                case "-v":
                    m_verbose++;
                    break;
                case "-lt":
                    if (nleft < 1) fail(insufficient + arg);
                    setLogTag(m_args[i++]);
                    break;
                case "-data_ready_signal":
                    if (nleft < 1) fail(insufficient + arg);
                    m_data_ready_signal = m_args[i++];
                    break;
                case "-results_ready_signal":
                    if (nleft < 1) fail(insufficient + arg);
                    m_results_ready_signal = m_args[i++];
                    break;
                default:
                    // Say which argument was rejected before showing usage.
                    status("Unknown argument: " + m_args[i - 1]);
                    usage();
                    abort();
                    break;
            }
        }

        // Validate
        if (m_input_dir == null) {
            status("missing -i parameter; use -h for help");
            abort();
        }
        if (m_output_dir == null) {
            status("missing -o parameter; use -h for help");
            abort();
        }
    }

    /** Report command-line arguments (from m_args). */
    private void reportArgs() {
        for (int i = 0; i < m_args.length; i++) {
            status("Arg " + i + " = " + m_args[i]);
        }
    }

    /**
     * Set the log tag (providing extra information relating to who called
     * us). The tag is appended to the default log prefix, followed by ":".
     * A null or empty tag leaves just the default prefix.
     */
    private void setLogTag(String msg) {
        m_logprefix = m_defaultlogprefix;
        if (msg != null && !msg.isEmpty()) {
            m_logprefix += msg + ":";
        }
    }

    /** Returns the current date/time, for log output. */
    private String now() {
        return m_datetimefmt.format(Calendar.getInstance().getTime());
    }

    /** Write a (prefixed) status message to the log. */
    private void status(String msg) {
        status(msg, true);
    }

    /** Write a message to the log (stderr), optionally with a date/time/logtag prefix. */
    private void status(String msg, boolean prefix) {
        System.err.println((prefix ? (now() + ":" + m_logprefix) : "") + msg);
    }

    /** Prints a string to stdout. */
    private void print(String msg) {
        m_stdout.print(msg);
    }

    /** Prints a string to stdout and line-terminates it. */
    private void println(String msg) {
        // status("println: " + msg);
        m_stdout.println(msg);
    }

    // ========================================================================
    // MedEx input processing
    // ========================================================================

    /**
     * Returns the filename of a MedEx resource file: "../resources/STEM"
     * (using the platform file separator) relative to the code location of
     * the MedTagger class.
     */
    private String resource(String stem) {
        return m_location + ".." + File.separator + "resources" + File.separator + stem;
    }

    /** Creates a MedEx processor (tagger), using our input/output directories. */
    private void setupMedex() throws IOException {
        status("Starting MedEx...");
        m_medtagger = new MedTagger(m_lexicon_file,
                                    m_rxnorm_file,
                                    m_code_file,
                                    m_generic_file,
                                    m_input_dir,
                                    m_output_dir,
                                    m_word_file,
                                    m_abbr_file,
                                    m_grammar_file,
                                    m_if_detect_sents,
                                    m_norm_file,
                                    m_if_freq_norm,
                                    m_if_drool_engine,
                                    m_if_offset_showed,
                                    m_if_output_tag);
        status("... done");
    }

    /**
     * Asks MedEx to batch-process files in our input directory and write
     * results to our output directory.
     */
    private void processInput() throws IOException {
        m_medtagger.run_batch_medtag();
    }

    // ========================================================================
    // MedEx output processing
    // ========================================================================

    /** Indicate on stdout that results are ready in our output files. */
    private void signalResultsReady() throws IOException {
        println(m_results_ready_signal);
        // Flushing is not required (m_stdout is created with autoflush on):
        // http://stackoverflow.com/questions/7166328
    }

    // ========================================================================
    // Main (run from the command line)
    // ========================================================================

    /** main(); create and run our pipeline (the constructor does all the work). */
    public static void main(String args[]) throws IOException {
        new CrateMedexPipeline(args);
    }
}