Source code for crate_anon.nlp_manager.run_crate_nlp_demo

#!/usr/bin/env python

"""
crate_anon/nlp_manager/run_crate_nlp_demo.py

===============================================================================

    Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
    Created by Rudolf Cardinal (rnc1001@cam.ac.uk).

    This file is part of CRATE.

    CRATE is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    CRATE is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with CRATE. If not, see <https://www.gnu.org/licenses/>.

===============================================================================

**Run the built-in NLP tools for a quick command-line test.**

"""

import argparse
import logging
from pprint import pformat
from typing import Any, Dict, List

from cardinal_pythonlib.logs import main_only_quicksetup_rootlogger
from rich_argparse import ArgumentDefaultsRichHelpFormatter

from crate_anon.common.constants import DEMO_NLP_INPUT_TERMINATOR
from crate_anon.common.inputfunc import gen_chunks_from_files
from crate_anon.nlp_manager.all_processors import (
    get_nlp_parser_class,
    possible_local_processor_names_without_external_tools,
)
from crate_anon.nlp_manager.base_nlp_parser import BaseNlpParser

# Module-level logger, named after this module via __name__.
log = logging.getLogger(__name__)


# =============================================================================
# Processors
# =============================================================================


def get_processors(processor_names: List[str]) -> List[BaseNlpParser]:
    """
    Instantiates CRATE NLP processors from their names.

    Args:
        processor_names:
            names of the processors to create

    Returns:
        a list with one processor instance per name, in the order given

    Raises:
        ValueError:
            if a name does not resolve to a processor class, or if the
            instance created is not a :class:`BaseNlpParser`
    """
    instances: List[BaseNlpParser] = []
    for proc_name in processor_names:
        parser_class = get_nlp_parser_class(proc_name)
        if not parser_class:
            raise ValueError(f"Unknown processor: {proc_name}")
        # Built without an NLP definition or a config section name.
        instance = parser_class(nlpdef=None, cfg_processor_name=None)
        if not isinstance(instance, BaseNlpParser):
            raise ValueError(
                f"Processor {proc_name} is not a CRATE build-in NLP processor"
            )
        instances.append(instance)
    return instances


# =============================================================================
# Do the work
# =============================================================================


def process_text(text: str, processors: List[BaseNlpParser]) -> None:
    """
    Runs a single piece of text through each of the given NLP processors and
    logs the combined output.

    Non-empty results are grouped by the table name that the processor
    reports for them; empty results are skipped.

    Args:
        text:
            the text to parse
        processors:
            the NLP processors to apply, in order
    """
    log.debug(f"Processing text:\n{text}")
    results_by_table: Dict[str, List[Dict[str, Any]]] = {}
    for proc in processors:
        log.debug(f"- Processor: {proc.nlprp_name()}.")
        for table, values in proc.parse(text):
            if values:
                results_by_table.setdefault(table, []).append(values)
    log.info(f"Results:\n{pformat(results_by_table)}")
    log.debug("- Text processing complete.")


# =============================================================================
# Main
# =============================================================================


def main() -> None:
    """
    Command-line entry point.

    Parses the command-line arguments, sets up logging, creates the
    requested NLP processors (or every available local one, if "all" is
    among the requested names), and runs each non-empty chunk of text from
    the input files through them.
    """
    all_keyword = "all"
    local_names = possible_local_processor_names_without_external_tools()
    processor_choices = [all_keyword] + local_names
    choices_text = ",".join(processor_choices)

    # noinspection PyTypeChecker
    parser = argparse.ArgumentParser(
        description="Demonstrate CRATE's built-in Python NLP tools",
        formatter_class=ArgumentDefaultsRichHelpFormatter,
    )
    parser.add_argument(
        "inputs", type=str, nargs="+", help="Input files (use '-' for stdin)"
    )
    parser.add_argument(
        "--terminator",
        type=str,
        default=DEMO_NLP_INPUT_TERMINATOR,
        help=(
            "Single-line terminator separating input chunks in an input file."
        ),
    )
    parser.add_argument(
        "--processors",
        type=str,
        required=True,
        nargs="+",
        metavar="PROCESSOR",
        choices=processor_choices,
        help=f"NLP processor(s) to apply. Possibilities: {choices_text}",
    )
    parser.add_argument("--verbose", action="store_true", help="Be verbose")
    args = parser.parse_args()

    loglevel = logging.DEBUG if args.verbose else logging.INFO
    main_only_quicksetup_rootlogger(level=loglevel)
    log.debug(f"Input terminator: {args.terminator!r}")

    # "all" anywhere in the requested names selects every local processor.
    wanted_names = (
        local_names if all_keyword in args.processors else args.processors
    )
    processors = get_processors(wanted_names)

    chunks = gen_chunks_from_files(
        filenames=args.inputs, chunk_terminator_line=args.terminator
    )
    for chunk in chunks:
        if chunk:
            process_text(chunk, processors)
    log.info("Done.")
# Run the command-line entry point only when this file is executed as a
# script (not when it is imported as a module).
if __name__ == "__main__": main()