Source code for msl.io.tables

"""
Read a data table from a file.
"""
import re

import numpy as np

from .base import Reader
from .dataset import Dataset
from .readers import ExcelReader
from .readers import GSheetsReader
from .utils import get_basename

_spreadsheet_top_left_regex = re.compile(r'^([A-Z]+)(\d+)$')
_spreadsheet_range_regex = re.compile(r'^[A-Z]+\d*:[A-Z]+\d*$')


extension_delimiter_map = {'.csv': ','}
""":class:`dict`: The delimiter to use to separate columns in a table based on the file extension.

If the `delimiter` is not specified when calling the :func:`~msl.io.read_table` function then this
extension-delimiter map is used to determine the value of the `delimiter`. If the file extension
is not in the map then the value of the `delimiter` is :data:`None` (i.e., split columns by any
whitespace). 

Examples
--------
You can customize your own map by adding key-value pairs

.. code-block:: pycon

    >>> from msl.io import extension_delimiter_map
    >>> extension_delimiter_map['.txt'] = '\\t'

"""


[docs]def read_table_text(file, **kwargs): """Read a data table from a text-based file. A *table* has the following properties: 1. The first row is a header. 2. All rows have the same number of columns. 3. All data values in a column have the same data type. Parameters ---------- file : :term:`path-like <path-like object>` or :term:`file-like <file object>` The file to read. **kwargs All keyword arguments are passed to :func:`~numpy.loadtxt`. If the `delimiter` is not specified and the `file` has ``csv`` as the file extension then the `delimiter` is automatically set to be ``','``. Returns ------- :class:`~msl.io.dataset.Dataset` The table as a :class:`~msl.io.dataset.Dataset`. The header is included in the :class:`~msl.io.metadata.Metadata`. """ if kwargs.get('unpack', False): raise ValueError('Cannot use the "unpack" option') if hasattr(file, 'as_posix'): # a pathlib.Path object file = str(file) if 'delimiter' not in kwargs: extn = Reader.get_extension(file).lower() kwargs['delimiter'] = extension_delimiter_map.get(extn) if 'skiprows' not in kwargs: kwargs['skiprows'] = 0 kwargs['skiprows'] += 1 # Reader.get_lines is 1-based, np.loadtxt is 0-based first_line = Reader.get_lines(file, kwargs['skiprows'], kwargs['skiprows']) if not first_line: header, data = [], [] else: header = first_line[0].split(kwargs['delimiter']) # Calling np.loadtxt (on Python 3.5, 3.6 and 3.7) on a file # on a mapped drive could raise an OSError. This occurred # when a local folder was shared and then mapped on the same # computer. Opening the file using open() and then passing # in the file handle to np.loadtxt is more universal if hasattr(file, 'read'): # already a file-like object data = np.loadtxt(file, **kwargs) else: with open(file, mode='rt') as fp: data = np.loadtxt(fp, **kwargs) use_cols = kwargs.get('usecols') if use_cols: if isinstance(use_cols, int): use_cols = [use_cols] header = [header[i] for i in use_cols] return Dataset(get_basename(file), None, True, data=data, header=np.asarray(header, dtype=str))
[docs]def read_table_excel(file, cells=None, sheet=None, as_datetime=True, dtype=None, **kwargs): """Read a data table from an Excel spreadsheet. A *table* has the following properties: 1. The first row is a header. 2. All rows have the same number of columns. 3. All data values in a column have the same data type. Parameters ---------- file : :term:`path-like <path-like object>` or :term:`file-like <file object>` The file to read. cells : :class:`str`, optional The cells to read. For example, ``C9`` will start at cell C9 and include all values until the end of the spreadsheet, ``A:C`` includes all rows in columns A, B and C, and, ``C9:G20`` includes values from only the specified cells. If not specified then returns all values from the specified `sheet`. sheet : :class:`str`, optional The name of the sheet to read the data from. If there is only one sheet in the workbook then you do not need to specify the name of the sheet. as_datetime : :class:`bool`, optional Whether dates should be returned as :class:`~datetime.datetime` or :class:`~datetime.date` objects. If :data:`False` then dates are returned as a :class:`str`. dtype : :class:`object`, optional If specified then it must be able to be converted to a :class:`~numpy.dtype` object. **kwargs All additional keyword arguments are passed to :func:`~xlrd.open_workbook`. Can use an `encoding` keyword argument as an alias for `encoding_override`. Returns ------- :class:`~msl.io.dataset.Dataset` The table as a :class:`~msl.io.dataset.Dataset`. The header is included in the :class:`~msl.io.metadata.Metadata`. """ if hasattr(file, 'as_posix'): # a pathlib.Path object file = str(file) elif hasattr(file, 'name'): # a TextIOWrapper object file = file.name with ExcelReader(file, **kwargs) as excel: if cells is not None and not _spreadsheet_range_regex.match(cells): match = _spreadsheet_top_left_regex.match(cells) if not match: raise ValueError('Invalid cell {!r}'.format(cells)) name = sheet or excel.workbook.sheet_names()[0] s = excel.workbook.sheet_by_name(name) letters = excel.to_letters(s.ncols - 1) row = match.group(2) cells += ':{}{}'.format(letters, row) table = excel.read(cell=cells, sheet=sheet, as_datetime=as_datetime) return _spreadsheet_to_dataset(table, file, dtype)
[docs]def read_table_gsheets(file, cells=None, sheet=None, as_datetime=True, dtype=None, **kwargs): """Read a data table from a Google Sheets spreadsheet. .. attention:: You must have already performed the instructions specified in :class:`.GDrive` and in :class:`.GSheets` to be able to use this function. A *table* has the following properties: 1. The first row is a header. 2. All rows have the same number of columns. 3. All data values in a column have the same data type. Parameters ---------- file : :term:`path-like <path-like object>` or :term:`file-like <file object>` The file to read. Can be the ID of a Google Sheets spreadsheet. cells : :class:`str`, optional The cells to read. For example, ``C9`` will start at cell C9 and include all values until the end of the spreadsheet, ``A:C`` includes all rows in columns A, B and C, and, ``C9:G20`` includes values from only the specified cells. If not specified then returns all values from the specified `sheet`. sheet : :class:`str`, optional The name of the sheet to read the data from. If there is only one sheet in the spreadsheet then you do not need to specify the name of the sheet. as_datetime : :class:`bool`, optional Whether dates should be returned as :class:`~datetime.datetime` or :class:`~datetime.date` objects. If :data:`False` then dates are returned as a :class:`str`. dtype : :class:`object`, optional If specified then it must be able to be converted to a :class:`~numpy.dtype` object. **kwargs All additional keyword arguments are passed to :class:`~msl.io.readers.gsheets.GSheetsReader`. Returns ------- :class:`~msl.io.dataset.Dataset` The table as a :class:`~msl.io.dataset.Dataset`. The header is included in the :class:`~msl.io.metadata.Metadata`. """ if hasattr(file, 'as_posix'): # a pathlib.Path object file = str(file) elif hasattr(file, 'name'): # a TextIOWrapper object file = file.name with GSheetsReader(file, **kwargs) as sheets: if cells is not None and not _spreadsheet_range_regex.match(cells): if not _spreadsheet_top_left_regex.match(cells): raise ValueError('Invalid cell {!r}'.format(cells)) r, c = sheets.to_indices(cells) data = sheets.read(sheet=sheet, as_datetime=as_datetime) table = [row[c:] for row in data[r:]] else: table = sheets.read(cell=cells, sheet=sheet, as_datetime=as_datetime) return _spreadsheet_to_dataset(table, file, dtype)
def _spreadsheet_to_dataset(table, file, dtype): if not table: header, data = [], [] elif len(table) == 1: header, data = table[0], [] elif len(table[0]) == 1: # a single column header, data = table[0], [row[0] for row in table[1:]] else: header, data = table[0], table[1:] if len(data) == 1: # a single row data = data[0] return Dataset(get_basename(file), None, True, data=data, dtype=dtype, header=np.asarray(header, dtype=str))