Source code for crate_anon.anonymise.check_text_extractor

#!/usr/bin/env python

"""
crate_anon/anonymise/check_text_extractor.py

===============================================================================

    Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
    Created by Rudolf Cardinal (rnc1001@cam.ac.uk).

    This file is part of CRATE.

    CRATE is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    CRATE is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with CRATE. If not, see <https://www.gnu.org/licenses/>.

===============================================================================

**Check if text extraction tools are available (for anonymisation).**

"""

import argparse

from cardinal_pythonlib.extract_text import is_text_extractor_available
from rich_argparse import ArgumentDefaultsRichHelpFormatter

from crate_anon.version import CRATE_VERSION_PRETTY


# =============================================================================
# Main
# =============================================================================


[docs]def main() -> None: """ Command-line entry point """ # noinspection PyTypeChecker parser = argparse.ArgumentParser( description=f"Check availability of tools to extract text from " f"different document formats. ({CRATE_VERSION_PRETTY})", formatter_class=ArgumentDefaultsRichHelpFormatter, ) parser.add_argument( "checkextractor", nargs="*", help="File extensions to check for availability of a text extractor. " "Try, for example, '.doc .docx .odt .pdf .rtf .txt None' " "(use a '.' prefix for all extensions, and use the special " "extension 'None' to check the fallback processor).", ) args = parser.parse_args() for ext in args.checkextractor: if ext.lower() == "none": ext = None available = is_text_extractor_available(ext) print(f"Text extractor for extension {ext} present: {available}")
if __name__ == "__main__": main()