Source code for navis.io.pq_io

#    This script is part of navis (http://www.github.com/navis-org/navis).
#    Copyright (C) 2018 Philipp Schlegel
#
#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
#    GNU General Public License for more details.
import pandas as pd
import numpy as np

from pathlib import Path
from typing import List, Union, Optional

from .. import config, core

__all__ = ["read_parquet", "write_parquet", "scan_parquet"]

# Set up logging
logger = config.get_logger(__name__)

SKELETON_COLUMNS = ('node_id', 'x', 'y', 'z', 'radius', 'parent_id', 'neuron')
NA_VALUES = (None, 'None')
META_DATA = ('name', 'units', 'soma')  # meta data to write for each neuron

INT_TYPES = (int, np.int8, np.int16, np.int32, np.int64)
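

# Illustrative sketch (not part of the module API): the "{ID}:{PROPERTY}" key
# convention used for the parquet key/value meta data throughout this module.
# The ID and property below are made up.
def _demo_meta_key_convention():
    key = '1734350788:name'    # one flat string key per (neuron, property) pair
    id, prop = key.split(':')  # decoding is a simple split on the colon
    assert (id, prop) == ('1734350788', 'name')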


def scan_parquet(file: Union[str, Path]):
    """Scan parquet file.

    Parameters
    ----------
    file :      str
                File to be scanned.

    Returns
    -------
    pd.DataFrame
                Summary of the file's content.

    See Also
    --------
    :func:`navis.write_parquet`
                Export neurons as parquet files.
    :func:`navis.read_parquet`
                Read parquet file into neurons.

    Examples
    --------
    See :func:`navis.write_parquet` for examples.

    """
    try:
        import pyarrow.parquet as pq
    except ImportError:
        raise ImportError('Reading parquet files requires the pyarrow library:\n'
                          '  pip3 install pyarrow')

    f = Path(file).expanduser()
    if not f.is_file():
        raise FileNotFoundError(f'File "{f}" does not exist.')

    metadata = pq.read_metadata(f)

    # Initialise as empty dict so we don't hit a NameError if decoding fails
    meta = {}
    try:
        meta = {k.decode(): v.decode() for k, v in metadata.metadata.items()}
    except BaseException:
        logger.warning(f'Unable to decode meta data for parquet file {f}')

    # Parse meta data
    ids = [v for k, v in meta.items() if k.endswith(':id') and not k.startswith('_')]
    records = {i: {} for i in ids}
    for k, v in meta.items():
        if k.startswith('_'):
            continue
        if ':' not in k:
            continue
        id, prop = k.split(':')
        if id not in records:  # there might be an "ARROW:schema" entry
            continue
        records[id][prop] = v

    # Turn into DataFrame
    df = pd.DataFrame.from_records(list(records.values()))

    # Move ID column to front
    ids = df['id']
    df.drop(labels=['id'], axis=1, inplace=True)
    df.insert(0, 'id', ids)

    return df
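

# A minimal usage sketch for `scan_parquet` (illustrative, not part of the
# module API; the file path is hypothetical and assumed to have been written
# by `write_parquet`):
def _example_scan(path='skeletons.parquet'):
    contents = scan_parquet(path)
    # One row per neuron; 'id' is always present, other columns depend on
    # which meta data were stored
    return contents['id'].tolist()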


def read_parquet(f: Union[str, Path],
                 read_meta: bool = True,
                 limit: Optional[int] = None,
                 subset: Optional[List[Union[str, int]]] = None,
                 progress=True
                 ) -> 'core.NeuronObject':
    """Read parquet file into Neuron/List.

    See `here <https://github.com/navis-org/navis/blob/master/navis/io/pq_io.md>`_
    for format specifications.

    Parameters
    ----------
    f :         str
                File to be read.
    read_meta : bool
                Whether to read neuron meta data stored in the parquet file
                (e.g. name or units). Defaults to True but can be switched off
                in case there are any issues.
    limit :     int, optional
                If reading from a file containing multiple neurons you can use
                this parameter to read only the first ``limit`` neurons.
                Useful if you want to get a sample from a large library of
                neurons.
    subset :    str | int | list thereof
                If the parquet file contains multiple neurons you can use this
                to select the IDs of the neurons to load. Only works if the
                parquet file actually contains multiple neurons.
    progress :  bool
                Whether to show a progress bar when extracting multiple
                neurons.

    Returns
    -------
    navis.TreeNeuron/Dotprops
                If parquet file contains a single neuron.
    navis.NeuronList
                If parquet file contains multiple neurons.

    See Also
    --------
    :func:`navis.write_parquet`
                Export neurons as parquet files.
    :func:`navis.scan_parquet`
                Scan parquet file for its contents.

    Examples
    --------
    See :func:`navis.write_parquet` for examples.

    """
    f = Path(f).expanduser()
    if not f.is_file():
        raise FileNotFoundError(f'File "{f}" does not exist.')

    try:
        import pyarrow.parquet as pq
    except ImportError:
        raise ImportError('Reading parquet files requires the pyarrow library:\n'
                          '  pip3 install pyarrow')

    if limit is not None:
        # Use identity checks here because `subset` may be an array
        if subset is not None and subset is not False:
            raise ValueError('You can provide either a `subset` or a `limit` but '
                             'not both.')
        scan = scan_parquet(f)
        subset = scan.id.values[:limit]

    if isinstance(subset, pd.Series):
        subset = subset.values

    # Read the table
    if subset is None or subset is False:
        table = pq.read_table(f)
    elif isinstance(subset, (str, int)):
        table = pq.read_table(f, filters=[("neuron", "=", subset)])
    elif isinstance(subset, (list, np.ndarray)):
        table = pq.read_table(f, filters=[("neuron", "in", subset)])
    else:
        raise TypeError(f'`subset` must be int, str or iterable, got "{type(subset)}"')

    # Extract meta data (will be byte encoded)
    if read_meta:
        metadata = {k.decode(): v.decode() for k, v in table.schema.metadata.items()}
    else:
        metadata = {}

    # Extract neuron meta data once here instead of for every neuron individually
    # Meta data is encoded as {"{ID}:{PROPERTY}": VALUE}
    # Here we pre-emptively turn this into {(ID, PROPERTY): VALUE}
    # Note that we're dropping "private" properties where the key starts with "_"
    neuron_meta = {tuple(k.split(':')): v for k, v in metadata.items()
                   if not k.startswith('_')}

    # Convert to pandas
    table = table.to_pandas()

    # Check if we're doing skeletons or dotprops
    if 'node_id' in table.columns:
        _extract_neuron = _extract_skeleton
    elif 'vect_x' in table.columns:
        _extract_neuron = _extract_dotprops
    else:
        raise TypeError('Unable to extract neuron from parquet file with '
                        f'columns {table.columns}')

    # If this is a single neuron
    if 'neuron' not in table.columns:
        if metadata:
            # `neuron_meta` keys are (ID, PROPERTY) tuples
            id = [v for k, v in neuron_meta.items() if k[1] == 'id'][0]
        else:
            id = '0'  # <-- generic ID as fallback if we don't have metadata
        return _extract_neuron(table, id, neuron_meta)
    else:
        neurons = []
        # Note: this could be done in threads
        for id, this_table in config.tqdm(table.groupby('neuron'),
                                          disable=not progress,
                                          leave=False,
                                          desc='Making nrn'):
            this_table = this_table.drop("neuron", axis=1)
            neurons.append(_extract_neuron(this_table, id, neuron_meta))
        return core.NeuronList(neurons)
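

# Usage sketch for the `subset` and `limit` parameters of `read_parquet`
# (illustrative only; file path and neuron IDs are hypothetical):
def _example_read_subset(path='skeletons.parquet'):
    # Load two specific neurons by ID ...
    two = read_parquet(path, subset=[722817260, 1734350908])
    # ... or just sample the first neuron in the file
    first = read_parquet(path, limit=1)
    return two, first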


def _extract_skeleton(nodes, id, metadata):
    """Extract a single skeleton."""
    # Meta data is encoded as "{ID}:{PROPERTY}"
    str_id = str(id)
    this_meta = {k[1]: v for k, v in metadata.items() if k[0] == str_id}

    # Drop "Nones"
    this_meta = {k: v for k, v in this_meta.items() if v != "None"}

    # The soma needs to be added separately because it is typically stored as
    # list (e.g. [0]) which the TreeNeuron initialisation doesn't like
    if "soma" in this_meta:
        soma = this_meta.pop("soma")
        # Parse a list string (e.g. "[1]") back into a list
        if soma.startswith('['):
            soma = [_try_int(i.strip()) for i in soma[1:-1].split(',')]
        else:
            soma = _try_int(soma)
    else:
        soma = None

    # Make the neuron
    this_meta['id'] = id
    tn = core.TreeNeuron(nodes, **this_meta)

    # Fix soma
    if soma:
        tn.soma = soma
    else:
        tn.soma = None

    return tn


def _extract_dotprops(table, id, metadata):
    """Extract a single dotprop."""
    # Meta data is encoded as "{ID}:{PROPERTY}"
    str_id = str(id)
    this_meta = {k[1]: v for k, v in metadata.items() if k[0] == str_id}

    # Drop "Nones"
    this_meta = {k: v for k, v in this_meta.items() if v != "None"}

    # Make the neuron
    this_meta['id'] = id
    this_meta['k'] = this_meta.get('k', 5)  # <- set a default `k` of 5

    if 'vect_x' in table:
        this_meta['vect'] = table[['vect_x', 'vect_y', 'vect_z']].values
    if 'alpha' in table:
        this_meta['alpha'] = table['alpha'].values

    return core.Dotprops(table[['x', 'y', 'z']].values, **this_meta)


def _try_int(x):
    """Try converting `x` into an integer."""
    try:
        return int(x)
    except ValueError:
        return x


def _int_to_bytes(x, bits=64):
    """Convert integer to bytes."""
    # NB: `bits` is passed to `int.to_bytes` as the *byte* length
    return int(x).to_bytes(bits, 'big')


def _bytes_to_int(x):
    """Convert bytes to integer."""
    return int.from_bytes(x, "big")
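

# Sketch of how a stringified soma (e.g. "[4177]") written by `_compile_meta`
# is parsed back in `_extract_skeleton` (illustrative only):
def _example_soma_roundtrip():
    soma = "[4177]"
    if soma.startswith('['):
        soma = [_try_int(i.strip()) for i in soma[1:-1].split(',')]
    assert soma == [4177]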


def write_parquet(x: 'core.NeuronObject',
                  filepath: Union[str, Path],
                  write_meta: bool = True) -> None:
    """Write TreeNeuron(s) or Dotprops to parquet file.

    See `here <https://github.com/navis-org/navis/blob/master/navis/io/pq_io.md>`_
    for format specifications.

    Parameters
    ----------
    x :         TreeNeuron | Dotprops | NeuronList thereof
                Neuron(s) to save. If NeuronList, it must contain either only
                TreeNeurons or only Dotprops.
    filepath :  str | pathlib.Path
                Destination for the file.
    write_meta : bool | list of str
                Whether to also write neuron properties to file. By default
                this is `.name`, `.units` and `.soma`. You can change which
                properties are written by providing them as list of strings.

    See Also
    --------
    :func:`navis.read_parquet`
                Import neurons from parquet file.
    :func:`navis.scan_parquet`
                Scan parquet file for its contents.

    Examples
    --------
    Save a bunch of skeletons:

    >>> import navis
    >>> nl = navis.example_neurons(3, kind='skeleton')
    >>> navis.write_parquet(nl, tmp_dir / 'skeletons.parquet')

    Inspect that file's content:

    >>> import navis
    >>> contents = navis.scan_parquet(tmp_dir / 'skeletons.parquet')
    >>> contents                                            # doctest: +SKIP
               id        units       name    soma
    0   722817260  8 nanometer  DA1_lPN_R     NaN
    1  1734350908  8 nanometer  DA1_lPN_R     [6]
    2  1734350788  8 nanometer  DA1_lPN_R  [4177]

    Read the skeletons back in:

    >>> import navis
    >>> nl = navis.read_parquet(tmp_dir / 'skeletons.parquet')
    >>> len(nl)
    3

    """
    filepath = Path(filepath).expanduser()

    # Make sure inputs are only TreeNeurons or Dotprops
    if isinstance(x, core.NeuronList):
        types = x.types
        if types == (core.TreeNeuron,):
            _write_parquet = _write_parquet_skeletons
        elif types == (core.Dotprops,):
            _write_parquet = _write_parquet_dotprops
        else:
            raise TypeError('Can only write either TreeNeurons or Dotprops to '
                            f'parquet but NeuronList contains {types}')
        if x.is_degenerated:
            raise ValueError('NeuronList must not contain non-unique IDs')
    else:
        if isinstance(x, core.TreeNeuron):
            _write_parquet = _write_parquet_skeletons
        elif isinstance(x, core.Dotprops):
            _write_parquet = _write_parquet_dotprops
        else:
            raise TypeError('Can only write TreeNeurons or Dotprops to parquet, '
                            f'got "{type(x)}"')

    return _write_parquet(x, filepath=filepath, write_meta=write_meta)
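

# Usage sketch for writing a custom set of meta data properties (illustrative;
# assumes the neurons actually have a `.name` attribute):
def _example_write_custom_meta(nl, path='skeletons.parquet'):
    # Store only each neuron's name - the ID is always written regardless
    write_parquet(nl, path, write_meta=['name'])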


def _write_parquet_skeletons(x: 'core.TreeNeuron',
                             filepath: Union[str, Path],
                             write_meta: bool = True,
                             ) -> None:
    """Write TreeNeurons to parquet file."""
    try:
        import pyarrow as pa
        import pyarrow.parquet as pq
    except ImportError:
        raise ImportError('Writing parquet files requires the pyarrow library:\n'
                          '  pip3 install pyarrow')

    # Make sure we're working with a list, not a single neuron
    x = core.NeuronList(x)

    # Generate node table
    nodes = x.nodes[x.nodes.columns[np.isin(x.nodes.columns, SKELETON_COLUMNS)]]

    # Convert to pyarrow table
    table = pa.Table.from_pandas(nodes)

    # Compile metadata
    metadata = _compile_meta(x, write_meta=write_meta)

    # Generate a schema with the new meta data
    schema = pa.schema([table.schema.field(i) for i in range(len(table.schema))],
                       metadata=metadata)

    return pq.write_table(table.cast(schema), filepath)


def _write_parquet_dotprops(x: 'core.Dotprops',
                            filepath: Union[str, Path],
                            write_meta: bool = True,
                            ) -> None:
    """Write Dotprops to parquet file.

    Examples
    --------
    We will test writing dotprops here instead of in the main function:

    >>> import navis
    >>> nl = navis.example_neurons(3, kind='skeleton')
    >>> dp = navis.make_dotprops(nl, k=5)
    >>> navis.write_parquet(dp, tmp_dir / 'dotprops.parquet')
    >>> dp2 = navis.read_parquet(tmp_dir / 'dotprops.parquet')
    >>> assert len(dp) == len(dp2)
    >>> assert all([i in dp2.id for i in dp.id])

    """
    try:
        import pyarrow as pa
        import pyarrow.parquet as pq
    except ImportError:
        raise ImportError('Writing parquet files requires the pyarrow library:\n'
                          '  pip3 install pyarrow')

    # Make sure we're working with a list, not a single neuron
    x = core.NeuronList(x)

    # Generate table
    table = pd.DataFrame(np.vstack(x.points), columns=['x', 'y', 'z'])

    if all(x.has_vect):
        table = pd.concat((table,
                           pd.DataFrame(np.vstack(x.vect),
                                        columns=['vect_x', 'vect_y', 'vect_z'])),
                          axis=1)

    if all(x.has_alpha):
        table['alpha'] = np.concatenate(x.alpha)

    # Add neuron ID
    table['neuron'] = np.repeat(x.id, x.n_points)

    # Convert to pyarrow table
    table = pa.Table.from_pandas(table)

    # Compile metadata
    metadata = _compile_meta(x, write_meta=write_meta)

    # Generate a schema with the new meta data
    schema = pa.schema([table.schema.field(i) for i in range(len(table.schema))],
                       metadata=metadata)

    return pq.write_table(table.cast(schema), filepath)


def _compile_meta(x: Union['core.BaseNeuron', 'core.NeuronList'],
                  write_meta: bool
                  ) -> dict:
    """Compile meta data for writing to parquet file."""
    metadata = {}
    for n in core.NeuronList(x):
        # The ID is always written to file and has to be stored as a string
        # (integer IDs could alternatively be encoded via _int_to_bytes)
        metadata[f'{n.id}:id'] = str(n.id)

        # If not write_meta, only the ID is written to file
        if not write_meta:
            continue

        if isinstance(write_meta, (list, np.ndarray, tuple)):
            attrs = write_meta
        else:
            attrs = META_DATA

        for p in attrs:
            if not getattr(n, p, None):
                continue
            # We're mapping meta data as "{ID}:{property}"
            # e.g. {"1734350788:name": "DA1_lPN_R"}
            metadata[f'{n.id}:{p}'] = str(getattr(n, p, None))

    return metadata
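

# Sketch for inspecting the raw key/value meta data that `_compile_meta`
# attaches to the parquet schema (illustrative; `path` is a hypothetical file
# previously written by this module):
def _example_raw_metadata(path='skeletons.parquet'):
    import pyarrow.parquet as pq
    raw = pq.read_metadata(path).metadata  # maps bytes -> bytes
    # Keys follow the "{ID}:{PROPERTY}" convention, e.g. b"1734350788:name"
    return {k.decode(): v.decode() for k, v in raw.items()}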