Source code for crate_anon.common.spreadsheet

"""
crate_anon/common/spreadsheet.py

===============================================================================

    Copyright (C) 2015, University of Cambridge, Department of Psychiatry.
    Created by Rudolf Cardinal (rnc1001@cam.ac.uk).

    This file is part of CRATE.

    CRATE is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    CRATE is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with CRATE. If not, see <https://www.gnu.org/licenses/>.

===============================================================================

Functions for reading/writing spreadsheets.

"""

# =============================================================================
# Imports
# =============================================================================

import csv
from enum import Enum
import logging
import os
from typing import Any, Dict, Iterable, List, Sequence, TextIO

from cardinal_pythonlib.file_io import smart_open
import openpyxl
import pyexcel_ods
import pyexcel_xlsx

log = logging.getLogger(__name__)


# =============================================================================
# Constants
# =============================================================================

SPREADSHEET_ROW_TYPE = Sequence[Any]
# ... a row is a sequence of cell values
SINGLE_SPREADSHEET_TYPE = Iterable[SPREADSHEET_ROW_TYPE]
# ... iterable of rows
SINGLE_SPREADSHEET_GENERATOR_TYPE = Iterable[SPREADSHEET_ROW_TYPE]
MULTIPLE_SPREADSHEET_TYPE = Dict[str, SINGLE_SPREADSHEET_TYPE]
# ... maps spreadsheet names to spreadsheets


# =============================================================================
# Enums
# =============================================================================


[docs]class SpreadsheetFileExtensions(Enum): CSV = ".csv" TSV = ".tsv" ODS = ".ods" XLSX = ".xlsx"
# ============================================================================= # Reading methods # =============================================================================
[docs]def skip_spreadsheet_row(row: SPREADSHEET_ROW_TYPE) -> bool: """ Should we skip a row, because it's empty or starts with a comment? """ if not row: return True first = row[0] if isinstance(first, str) and first.strip().startswith("#"): return True return not any(v for v in row)
[docs]def gen_rows_from_csv(filename: str) -> SINGLE_SPREADSHEET_GENERATOR_TYPE: """ Generates rows from a CSV file. """ log.debug(f"Loading as CSV: {filename}") with open(filename, "r") as csvfile: reader = csv.reader(csvfile) for row in reader: if skip_spreadsheet_row(row): continue yield row
[docs]def gen_rows_from_tsv(filename: str) -> SINGLE_SPREADSHEET_GENERATOR_TYPE: """ Generates rows from a TSV file. """ log.debug(f"Loading as TSV: {filename}") with open(filename, "r") as tsvfile: reader = csv.reader(tsvfile, delimiter="\t") for row in reader: if skip_spreadsheet_row(row): continue yield row
[docs]def gen_rows_from_xlsx(filename: str) -> SINGLE_SPREADSHEET_GENERATOR_TYPE: """ Generates rows from an XLSX file, reading the first sheet. """ log.debug(f"Loading as XLSX: {filename}") workbook = openpyxl.load_workbook(filename) # ... NB potential bug using read_only; see postcodes.py worksheet = workbook.active # first sheet, by default for sheet_row in worksheet.iter_rows(): row = ["" if cell.value is None else cell.value for cell in sheet_row] if skip_spreadsheet_row(row): continue yield row
[docs]def gen_rows_from_ods(filename: str) -> SINGLE_SPREADSHEET_GENERATOR_TYPE: """ Generates rows from an ODS file, reading the first sheet. """ log.debug(f"Loading as ODS: {filename}") data = pyexcel_ods.get_data(filename) # type: MULTIPLE_SPREADSHEET_TYPE # ... but it's an ordered dictionary, so: first_key = next(iter(data)) first_sheet_rows = data[first_key] for row in first_sheet_rows: if skip_spreadsheet_row(row): continue yield row
[docs]def gen_rows_from_spreadsheet( filename: str, ) -> SINGLE_SPREADSHEET_GENERATOR_TYPE: """ Generates rows from a spreadsheet-type file, autodetecting it. Args: filename: Filename to read. """ _, ext = os.path.splitext(filename) if ext == SpreadsheetFileExtensions.CSV.value: row_gen = gen_rows_from_csv(filename) elif ext == SpreadsheetFileExtensions.ODS.value: row_gen = gen_rows_from_ods(filename) elif ext == SpreadsheetFileExtensions.TSV.value: row_gen = gen_rows_from_tsv(filename) elif ext == SpreadsheetFileExtensions.XLSX.value: row_gen = gen_rows_from_xlsx(filename) else: raise ValueError(f"Unknown spreadsheet extension: {ext!r}") for row in row_gen: yield row
# ============================================================================= # Writing methods # =============================================================================
[docs]def make_safe_for_spreadsheet(x: Any) -> Any: """ Helper function for :func:`remove_none_values_from_spreadsheet`. """ return "" if x is None else x
[docs]def remove_none_values_from_spreadsheet( data: MULTIPLE_SPREADSHEET_TYPE, ) -> MULTIPLE_SPREADSHEET_TYPE: """ The ODS writer does not cope with ``None`` values, giving: .. code-block:: AttributeError: 'NoneType' object has no attribute 'split' Here, we transform ``None`` values to the empty string. """ result = {} for sheetname, sheetdata in data.items(): converted_sheetdata = [] # type: List[List[Any]] for row in sheetdata: converted_row = [make_safe_for_spreadsheet(x) for x in row] converted_sheetdata.append(converted_row) result[sheetname] = converted_sheetdata return result
[docs]def write_csv(filename: str, rows: SINGLE_SPREADSHEET_TYPE) -> None: """ Writes to a comma-separated values (CSV) file. Empty (null) values are translated to "". Args: rows: Rows to write. (The first row is often a header row.) filename: Name of file to write. """ log.info(f"Saving as CSV: {filename}") with smart_open(filename, "wt") as f: # type: TextIO writer = csv.writer(f) writer.writerows(rows)
[docs]def write_tsv(filename: str, rows: SINGLE_SPREADSHEET_TYPE) -> None: """ Writes to a tab-separated values (TSV) file. Empty (null) values are translated to "". Args: rows: Rows to write. (The first row is often a header row.) filename: Name of file to write. """ log.info(f"Saving as TSV: {filename}") with smart_open(filename, "wt") as f: # type: TextIO writer = csv.writer(f, delimiter="\t") writer.writerows(rows)
[docs]def write_ods(filename: str, data: MULTIPLE_SPREADSHEET_TYPE) -> None: """ Writes to an OpenOffice spreadsheet (ODS) file. Args: data: See :func:`write_spreadsheet`. filename: Name of file to write. """ log.info(f"Saving as ODS: {filename}") pyexcel_ods.save_data(filename, data)
[docs]def write_xlsx(filename: str, data: MULTIPLE_SPREADSHEET_TYPE) -> None: """ Writes to an OpenOffice spreadsheet (ODS) file. Args: data: See :func:`write_spreadsheet`. filename: Name of file to write. """ log.info(f"Saving as XLSX: {filename}") pyexcel_xlsx.save_data(filename, data)
[docs]def write_spreadsheet( filename: str, data: MULTIPLE_SPREADSHEET_TYPE, filetype: str = None ) -> None: """ Writes to a spreadsheet-style file, autodetecting it. Args: filename: Name of file to write, or "-" for stdout (in which case the filetype is forced to TSV). data: A dictionary whose keys are spreadsheet names and whose corresponding values contain spreadsheet data. (For TSV, which is a single-sheet format, only the first value is used.) Each dictionary value is an iterable containing rows, and each row is an iterable of cell data items. filetype: File type as one of the string values of SpreadsheetFileExtensions; alternatively, use ``None`` to autodetect from the filename. """ ext = filetype or os.path.splitext(filename)[1] if filename == "-" or ext == SpreadsheetFileExtensions.TSV.value: first_key = next(iter(data)) # https://stackoverflow.com/questions/30362391/how-do-you-find-the-first-key-in-a-dictionary # noqa first_sheet = data[first_key] write_tsv(filename, first_sheet) elif ext == SpreadsheetFileExtensions.CSV.value: first_key = next(iter(data)) first_sheet = data[first_key] write_csv(filename, first_sheet) elif ext == SpreadsheetFileExtensions.ODS.value: # The ODS writer does not like None values. write_ods(filename, remove_none_values_from_spreadsheet(data)) elif ext == SpreadsheetFileExtensions.XLSX.value: write_xlsx(filename, data) else: raise ValueError(f"Unknown spreadsheet extension: {ext!r}")