Source code for selene_sdk.samplers.file_samplers.mat_file_sampler

"""
This module provides the `MatFileSampler` class and its supporting
methods.
"""
import h5py
import numpy as np
import scipy.io

from .file_sampler import FileSampler


def _load_mat_file(filepath, sequence_key, targets_key=None):
    """
    Loads data from a `*.mat` file or a `*.h5` file.

    The file is first read with `scipy.io.loadmat`. If scipy cannot
    read it (it raises `NotImplementedError` or `ValueError`), the file
    is opened read-only with `h5py` instead.

    Parameters
    ----------
    filepath : str
        The path to the file to load the data from.
    sequence_key : str
        The key for the sequences data matrix.
    targets_key : str, optional
        Default is None. The key for the targets data matrix. When it
        is None (or empty), the returned targets are None.

    Returns
    -------
    (sequences, targets) or (sequences, targets, h5py_filehandle) : \
            tuple(array-like, array-like or None[, h5py.File])
        If the file can be loaded with `scipy.io`, the tuple is only
        (sequences, targets). Otherwise the 2 matrices and the open
        h5py file handle are returned; the caller then owns the handle
        and must keep it open for as long as the matrices are used.

    Raises
    ------
    KeyError
        If `sequence_key` or `targets_key` is not present in the file.

    """
    # Only the scipy load itself is guarded: a missing key must surface
    # as a KeyError rather than trigger the h5py fallback.
    try:
        mat = scipy.io.loadmat(filepath)
    except (NotImplementedError, ValueError):
        mat = None

    if mat is not None:
        targets = mat[targets_key] if targets_key else None
        return (mat[sequence_key], targets)

    mat = h5py.File(filepath, 'r')
    try:
        sequences = mat[sequence_key]
        targets = mat[targets_key] if targets_key else None
    except Exception:
        # Do not leak the open file handle when a key lookup fails.
        mat.close()
        raise
    return (sequences, targets, mat)


class MatFileSampler(FileSampler):
    """
    A sampler for which the dataset is loaded directly from a `*.mat` file.

    Parameters
    ----------
    filepath : str
        The path to the file to load the data from.
    sequence_key : str
        The key for the sequences data matrix.
    targets_key : str, optional
        Default is None. The key for the targets data matrix.
    random_seed : int, optional
        Default is 436. Sets the random seed for sampling. The seed
        initializes a generator private to this sampler, which is used
        for every shuffle of the sample order.
    shuffle : bool, optional
        Default is True. Shuffle the order of the samples in the matrix
        before sampling from it.
    sequence_batch_axis : int, optional
        Default is 0. Specify the batch axis.
    sequence_alphabet_axis : int, optional
        Default is 1. Specify the alphabet axis.
    targets_batch_axis : int, optional
        Default is 0. Specify the batch axis.

    Attributes
    ----------
    n_samples : int
        The number of samples in the data matrix.
    """

    def __init__(self,
                 filepath,
                 sequence_key,
                 targets_key=None,
                 random_seed=436,
                 shuffle=True,
                 sequence_batch_axis=0,
                 sequence_alphabet_axis=1,
                 targets_batch_axis=0):
        """
        Constructs a new `MatFileSampler` object.
        """
        super(MatFileSampler, self).__init__()
        out = _load_mat_file(
            filepath,
            sequence_key,
            targets_key=targets_key)
        self._sample_seqs = out[0]
        self._sample_tgts = out[1]
        # A third element is only returned when the file was opened with
        # h5py; the handle is kept so the file stays open while sampling.
        self._mat_fh = None
        if len(out) > 2:
            self._mat_fh = out[2]
        self._seq_batch_axis = sequence_batch_axis
        self._seq_alphabet_axis = sequence_alphabet_axis
        # The three axes are 0, 1 and 2 (sum 3), so the remaining axis
        # is whatever is left after removing batch and alphabet.
        self._seq_final_axis = 3 - sequence_batch_axis - sequence_alphabet_axis
        if self._sample_tgts is not None:
            self._tgts_batch_axis = targets_batch_axis
        self.n_samples = self._sample_seqs.shape[self._seq_batch_axis]

        self._sample_indices = np.arange(self.n_samples).tolist()
        self._sample_next = 0

        # Private generator so that `random_seed` actually controls the
        # sample order and the global numpy random state is untouched.
        self._rng = np.random.RandomState(random_seed)
        self._shuffle = shuffle
        if self._shuffle:
            self._rng.shuffle(self._sample_indices)

    def sample(self, batch_size=1):
        """
        Draws a mini-batch of examples and their corresponding
        labels.

        Batches are taken consecutively from the (optionally shuffled)
        sample order. When fewer than `batch_size` samples remain, the
        remaining ones are skipped, the order is reshuffled (if
        shuffling is enabled) and the batch is taken from the start.

        Parameters
        ----------
        batch_size : int, optional
            Default is 1. The number of examples to include in the
            mini-batch.

        Returns
        -------
        sequences, targets : tuple(numpy.ndarray, numpy.ndarray)
            A tuple containing the numeric representation of the
            sequence examples and their corresponding labels. The
            shape of `sequences` will be
            :math:`B \\times L \\times N`, where :math:`B` is
            `batch_size`, :math:`L` is the sequence length, and
            :math:`N` is the size of the sequence type's alphabet.
            The shape of `targets` will be :math:`B \\times F`,
            where :math:`F` is the number of features.
            If no targets matrix was loaded, a 1-tuple
            `(sequences,)` is returned instead.
        """
        sample_up_to = self._sample_next + batch_size
        if sample_up_to > len(self._sample_indices):
            # Not enough samples left: start over from the beginning.
            if self._shuffle:
                self._rng.shuffle(self._sample_indices)
            self._sample_next = 0
            use_indices = self._sample_indices[:batch_size]
        else:
            use_indices = self._sample_indices[self._sample_next:sample_up_to]
        self._sample_next += batch_size
        # Indices are used in increasing order within a batch.
        use_indices = sorted(use_indices)

        if self._seq_batch_axis == 0:
            sequences = self._sample_seqs[use_indices, :, :].astype(float)
        elif self._seq_batch_axis == 1:
            sequences = self._sample_seqs[:, use_indices, :].astype(float)
        else:
            sequences = self._sample_seqs[:, :, use_indices].astype(float)

        # Reorder the axes to (batch, final, alphabet) unless the data is
        # already stored in that layout.
        if self._seq_batch_axis != 0 or self._seq_alphabet_axis != 2:
            sequences = np.transpose(
                sequences, (self._seq_batch_axis,
                            self._seq_final_axis,
                            self._seq_alphabet_axis))

        if self._sample_tgts is None:
            return (sequences,)

        if self._tgts_batch_axis == 0:
            targets = self._sample_tgts[use_indices, :].astype(float)
        else:
            targets = self._sample_tgts[:, use_indices].astype(float)
            # Put the batch axis first.
            targets = np.transpose(targets, (1, 0))
        return (sequences, targets)

    def get_data(self, batch_size, n_samples=None):
        """
        This method fetches a subset of the data from the sampler,
        divided into batches.

        Parameters
        ----------
        batch_size : int
            The size of the batches to divide the data into.
        n_samples : int, optional
            Default is None. The total number of samples to retrieve.
            If None (or 0), `self.n_samples` is used.

        Returns
        -------
        sequences : list(np.ndarray)
            The list of sequences grouped into batches.
            An element in the `sequences` list is of
            the shape :math:`B \\times L \\times N`, where :math:`B`
            is `batch_size`, :math:`L` is the sequence length,
            and :math:`N` is the size of the sequence type's alphabet.
            The last batch holds the remaining samples and may
            therefore be smaller than `batch_size`.
        """
        if not n_samples:
            n_samples = self.n_samples
        sequences = []

        # `sample` returns (sequences,) or (sequences, targets) depending
        # on whether targets were loaded; only the sequences are wanted,
        # so take the first element rather than unpacking a 1-tuple.
        count = batch_size
        while count < n_samples:
            sequences.append(self.sample(batch_size=batch_size)[0])
            count += batch_size
        # Size of the final batch so that exactly `n_samples` are drawn.
        remainder = batch_size - (count - n_samples)
        sequences.append(self.sample(batch_size=remainder)[0])
        return sequences

    def get_data_and_targets(self, batch_size, n_samples=None):
        """
        This method fetches a subset of the sequence data and
        targets from the sampler, divided into batches.

        Parameters
        ----------
        batch_size : int
            The size of the batches to divide the data into.
        n_samples : int, optional
            Default is None. The total number of samples to retrieve.
            If None (or 0), `self.n_samples` is used.

        Returns
        -------
        sequences_and_targets, targets_matrix : \
        tuple(list(tuple(numpy.ndarray, numpy.ndarray)), numpy.ndarray)
            Tuple containing the list of sequence-target pairs, as well
            as a single matrix with all targets in the same order.
            Note that `sequences_and_targets`'s sequence elements are of
            the shape :math:`B \\times L \\times N` and its target
            elements are of the shape :math:`B \\times F`, where
            :math:`B` is `batch_size`, :math:`L` is the sequence length,
            :math:`N` is the size of the sequence type's alphabet, and
            :math:`F` is the number of features. Further,
            `target_matrix` is of the shape :math:`S \\times F`, where
            :math:`S =` `n_samples`.

        Raises
        ------
        ValueError
            If the sampler was initialized without a targets matrix.
        """
        if self._sample_tgts is None:
            raise ValueError(
                "No targets matrix was specified during sampler "
                "initialization. Please use `get_data` instead.")
        if not n_samples:
            n_samples = self.n_samples

        sequences_and_targets = []
        targets_mat = []

        count = 0
        while count < n_samples:
            # The last batch is shrunk so the total is exactly n_samples.
            sample_size = min(n_samples - count, batch_size)
            seqs, tgts = self.sample(batch_size=sample_size)
            sequences_and_targets.append((seqs, tgts))
            targets_mat.append(tgts)
            count += sample_size

        # TODO: should not assume a single numeric type for all targets;
        # the stacked matrix is currently always cast to float.
        targets_mat = np.vstack(targets_mat).astype(float)
        return sequences_and_targets, targets_mat