# Source code for crate_anon.anonymise.test_extract_text

#!/usr/bin/env python

"""
crate_anon/anonymise/test_extract_text.py

===============================================================================

    Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
    Created by Rudolf Cardinal (rnc1001@cam.ac.uk).

    This file is part of CRATE.

    CRATE is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    CRATE is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with CRATE. If not, see <https://www.gnu.org/licenses/>.

===============================================================================

**Test CRATE's text-extraction system.**

"""

import argparse
import logging
import os
import sys
import traceback

from cardinal_pythonlib.extract_text import (
    document_to_text,
    TextProcessingConfig,
)
from cardinal_pythonlib.logs import main_only_quicksetup_rootlogger

from crate_anon.common.argparse_assist import (
    RawDescriptionArgumentDefaultsRichHelpFormatter,
)
from crate_anon.common.stringfunc import uprint

# Process exit statuses returned by main(), as listed in the command-line
# help: text found, no text found, and error (e.g. file not found).
EXIT_TEXT, EXIT_NO_TEXT, EXIT_ERROR = 0, 1, 2


def main() -> int:
    """
    Command-line entry point. See command-line help.

    Parses the command line, extracts text from the named file via
    ``document_to_text``, and (unless ``--silent`` is given) prints the text
    or writes it to ``--outfile``.

    Returns:
        ``EXIT_TEXT`` if the extracted text contains any non-whitespace
        characters; ``EXIT_NO_TEXT`` if it does not; ``EXIT_ERROR`` if
        extraction or output raised an exception (the traceback is printed
        to stderr).
    """
    # noinspection PyTypeChecker
    parser = argparse.ArgumentParser(
        # The formatter prints the description verbatim, so the layout of
        # this string is what the user sees.
        description=f"""
Test CRATE text extraction and/or detect text in files.

Exit codes:

- {EXIT_TEXT} for "text found"
- {EXIT_NO_TEXT} for "no text found"
- {EXIT_ERROR} for "error" (e.g. file not found)
        """,
        formatter_class=RawDescriptionArgumentDefaultsRichHelpFormatter,
    )
    parser.add_argument(
        "filename", type=str, help="File from which to extract text"
    )
    parser.add_argument(
        "--plain",
        action="store_true",
        help="Use plainest format (re e.g. table layouts)",
    )
    parser.add_argument(
        "--semiplain",
        action="store_true",
        help="Use semi-plain format (re e.g. table layouts)",
    )
    parser.add_argument(
        "--width", type=int, default=80, help="Width to word-wrap to"
    )
    parser.add_argument(
        "--rstrip", action="store_true", help="Right-strip all lines"
    )
    parser.add_argument(
        "--silent",
        action="store_true",
        help="Don't print the text, just exit with a code",
    )
    parser.add_argument(
        "--outfile",
        type=str,
        help="Filename to which to write (rather than stdout)",
    )
    parser.add_argument(
        "--encoding",
        type=str,
        default="utf-8",
        help="Encoding used for --outfile",
    )
    parser.add_argument("--verbose", action="store_true", help="Be verbose")
    args = parser.parse_args()
    main_only_quicksetup_rootlogger(
        level=logging.DEBUG if args.verbose else logging.INFO
    )

    # The file extension is passed alongside the filename to the extractor.
    extension = os.path.splitext(args.filename)[1]
    config = TextProcessingConfig(
        plain=args.plain,
        semiplain=args.semiplain,
        width=args.width,
        rstrip=args.rstrip,
    )

    # noinspection PyBroadException
    try:
        result = document_to_text(
            filename=args.filename,
            blob=None,
            extension=extension,
            config=config,
        )
    except Exception:
        traceback.print_exc(file=sys.stderr)  # full details, please
        return EXIT_ERROR

    if not args.silent:
        # Output can fail too (unwritable --outfile, unknown --encoding,
        # unencodable characters). Left uncaught, the interpreter would exit
        # with status 1, which is indistinguishable from EXIT_NO_TEXT, so
        # report it explicitly as an error instead.
        # noinspection PyBroadException
        try:
            if args.outfile:
                with open(
                    args.outfile, "wt", encoding=args.encoding
                ) as out_file:
                    out_file.write(result)
            else:
                uprint(result)
        except Exception:
            traceback.print_exc(file=sys.stderr)
            return EXIT_ERROR

    # Whitespace-only output counts as "no text".
    contains_text = bool(result.strip())
    return EXIT_TEXT if contains_text else EXIT_NO_TEXT
if __name__ == "__main__":
    # Run as a script: main()'s return value becomes the process exit status.
    raise SystemExit(main())