"""
This module provides the `MatFileSampler` class and its supporting
methods.
"""
import h5py
import numpy as np
import scipy.io
from .file_sampler import FileSampler
def _load_mat_file(filepath, sequence_key, targets_key=None):
    """
    Loads data from a `*.mat` file or a `*.h5` file.

    The file is first read with `scipy.io.loadmat`. If that call raises
    `NotImplementedError` or `ValueError`, the file is opened read-only
    with `h5py` instead.

    Parameters
    ----------
    filepath : str
        The path to the file to load the data from.
    sequence_key : str
        The key for the sequences data matrix.
    targets_key : str, optional
        Default is None. The key for the targets data matrix.

    Returns
    -------
    (sequences, targets) or (sequences, targets, h5py_filehandle) : \
        tuple(array-like, array-like) or \
        tuple(array-like, array-like, h5py.File)
        If the file can be loaded with `scipy.io`, the tuple is only
        (sequences, targets). Otherwise, the 2 matrices and the open
        h5py file handle are returned; the caller then owns the handle.
        `targets` is None when no `targets_key` was given.

    Notes
    -----
    Exceptions raised while looking up `sequence_key` or `targets_key`
    are propagated. In the h5py case the file handle is closed before
    the exception is re-raised so that it does not leak.

    """
    try:  # see if we can load the file using scipy first
        mat = scipy.io.loadmat(filepath)
    except (NotImplementedError, ValueError):
        # scipy could not read this file; fall through to h5py below.
        pass
    else:
        targets = mat[targets_key] if targets_key else None
        return (mat[sequence_key], targets)

    mat_fh = h5py.File(filepath, 'r')
    try:
        sequences = mat_fh[sequence_key]
        targets = mat_fh[targets_key] if targets_key else None
    except Exception:
        # No handle is returned on failure, so nobody else could close it.
        mat_fh.close()
        raise
    return (sequences, targets, mat_fh)
class MatFileSampler(FileSampler):
    """
    A sampler for which the dataset is loaded directly from a `*.mat` file.

    Parameters
    ----------
    filepath : str
        The path to the file to load the data from.
    sequence_key : str
        The key for the sequences data matrix.
    targets_key : str, optional
        Default is None. The key for the targets data matrix.
    random_seed : int, optional
        Default is 436. Seeds the random number generator this sampler
        uses to shuffle the order of the samples.
    shuffle : bool, optional
        Default is True. Shuffle the order of the samples in the matrix
        before sampling from it.
    sequence_batch_axis : int, optional
        Default is 0. Specify the batch axis.
    sequence_alphabet_axis : int, optional
        Default is 1. Specify the alphabet axis.
    targets_batch_axis : int, optional
        Default is 0. Specify the batch axis.

    Attributes
    ----------
    n_samples : int
        The number of samples in the data matrix.

    """

    def __init__(self,
                 filepath,
                 sequence_key,
                 targets_key=None,
                 random_seed=436,
                 shuffle=True,
                 sequence_batch_axis=0,
                 sequence_alphabet_axis=1,
                 targets_batch_axis=0):
        """
        Constructs a new `MatFileSampler` object.
        """
        super(MatFileSampler, self).__init__()
        out = _load_mat_file(
            filepath,
            sequence_key,
            targets_key=targets_key)
        self._sample_seqs = out[0]
        self._sample_tgts = out[1]
        # A third element (the open h5py file handle) is only present
        # when the file could not be loaded with scipy. It is kept on the
        # instance so the handle stays referenced for the sampler's life.
        self._mat_fh = out[2] if len(out) > 2 else None

        self._seq_batch_axis = sequence_batch_axis
        self._seq_alphabet_axis = sequence_alphabet_axis
        # The sequence matrix is indexed with 3 axes (0, 1, 2), which sum
        # to 3, so the remaining (sequence length) axis is the difference.
        self._seq_final_axis = \
            3 - sequence_batch_axis - sequence_alphabet_axis
        if self._sample_tgts is not None:
            self._tgts_batch_axis = targets_batch_axis
        self.n_samples = self._sample_seqs.shape[self._seq_batch_axis]

        self._sample_indices = np.arange(self.n_samples).tolist()
        self._sample_next = 0

        # Private generator so that `random_seed` actually controls the
        # shuffling without touching numpy's global random state.
        self._rng = np.random.RandomState(random_seed)
        self._shuffle = shuffle
        if self._shuffle:
            self._rng.shuffle(self._sample_indices)

    def sample(self, batch_size=1):
        """
        Draws a mini-batch of examples and their corresponding
        labels.

        Parameters
        ----------
        batch_size : int, optional
            Default is 1. The number of examples to include in the
            mini-batch.

        Returns
        -------
        sequences, targets : tuple(numpy.ndarray, numpy.ndarray)
            A tuple containing the numeric representation of the
            sequence examples and their corresponding labels. The
            shape of `sequences` will be
            :math:`B \\times L \\times N`, where :math:`B` is
            `batch_size`, :math:`L` is the sequence length, and
            :math:`N` is the size of the sequence type's alphabet.
            The shape of `targets` will be :math:`B \\times F`,
            where :math:`F` is the number of features.
            If no targets matrix was loaded, a 1-tuple containing
            only `sequences` is returned.

        """
        sample_up_to = self._sample_next + batch_size
        if sample_up_to > len(self._sample_indices):
            # Not enough unvisited samples remain for a full batch:
            # start a new pass over the data (reshuffled if requested).
            if self._shuffle:
                self._rng.shuffle(self._sample_indices)
            self._sample_next = 0
            use_indices = self._sample_indices[:batch_size]
        else:
            use_indices = \
                self._sample_indices[self._sample_next:sample_up_to]
        self._sample_next += batch_size
        # NOTE(review): indices are sorted before indexing, presumably
        # because h5py datasets require increasing index lists -- confirm.
        use_indices = sorted(use_indices)

        if self._seq_batch_axis == 0:
            sequences = self._sample_seqs[use_indices, :, :].astype(float)
        elif self._seq_batch_axis == 1:
            sequences = self._sample_seqs[:, use_indices, :].astype(float)
        else:
            sequences = self._sample_seqs[:, :, use_indices].astype(float)

        # Reorder the axes to (batch, length, alphabet) unless the data
        # is already stored in that layout.
        if self._seq_batch_axis != 0 or self._seq_alphabet_axis != 2:
            sequences = np.transpose(
                sequences, (self._seq_batch_axis,
                            self._seq_final_axis,
                            self._seq_alphabet_axis))

        if self._sample_tgts is not None:
            if self._tgts_batch_axis == 0:
                targets = self._sample_tgts[use_indices, :].astype(float)
            else:
                targets = self._sample_tgts[:, use_indices].astype(float)
                # Put the batch axis first.
                targets = np.transpose(targets, (1, 0))
            return (sequences, targets)
        return (sequences,)

    def get_data(self, batch_size, n_samples=None):
        """
        This method fetches a subset of the data from the sampler,
        divided into batches.

        Parameters
        ----------
        batch_size : int
            The size of the batches to divide the data into.
        n_samples : int, optional
            Default is None. The total number of samples to retrieve.
            If not given, `self.n_samples` samples are retrieved.

        Returns
        -------
        sequences : list(np.ndarray)
            The list of sequences grouped into batches.
            An element in the `sequences` list is of
            the shape :math:`B \\times L \\times N`, where :math:`B`
            is `batch_size`, :math:`L` is the sequence length,
            and :math:`N` is the size of the sequence type's alphabet.
            The last batch holds the remaining samples and may be
            smaller than `batch_size`.

        """
        if not n_samples:
            n_samples = self.n_samples
        sequences = []

        count = 0
        while count < n_samples:
            sample_size = min(n_samples - count, batch_size)
            # `sample` returns (sequences,) or (sequences, targets)
            # depending on whether targets were loaded; index the first
            # element so this works in both cases.
            sequences.append(self.sample(batch_size=sample_size)[0])
            count += sample_size
        return sequences

    def get_data_and_targets(self, batch_size, n_samples=None):
        """
        This method fetches a subset of the sequence data and
        targets from the sampler, divided into batches.

        Parameters
        ----------
        batch_size : int
            The size of the batches to divide the data into.
        n_samples : int, optional
            Default is None. The total number of samples to retrieve.
            If not given, `self.n_samples` samples are retrieved.

        Returns
        -------
        sequences_and_targets, targets_matrix : \
        tuple(list(tuple(numpy.ndarray, numpy.ndarray)), numpy.ndarray)
            Tuple containing the list of sequence-target pairs, as well
            as a single matrix with all targets in the same order.
            Note that `sequences_and_targets`'s sequence elements are of
            the shape :math:`B \\times L \\times N` and its target
            elements are of the shape :math:`B \\times F`, where
            :math:`B` is `batch_size`, :math:`L` is the sequence length,
            :math:`N` is the size of the sequence type's alphabet, and
            :math:`F` is the number of features. Further,
            `target_matrix` is of the shape :math:`S \\times F`, where
            :math:`S =` `n_samples`.

        Raises
        ------
        ValueError
            If no targets matrix was specified during sampler
            initialization.

        """
        if self._sample_tgts is None:
            raise ValueError(
                "No targets matrix was specified during sampler "
                "initialization. Please use `get_data` instead.")
        if not n_samples:
            n_samples = self.n_samples
        sequences_and_targets = []
        targets_mat = []

        count = 0
        while count < n_samples:
            sample_size = min(n_samples - count, batch_size)
            seqs, tgts = self.sample(batch_size=sample_size)
            sequences_and_targets.append((seqs, tgts))
            targets_mat.append(tgts)
            count += sample_size

        # TODO: should not assume a fixed dtype for the targets; they are
        # currently always cast to float.
        targets_mat = np.vstack(targets_mat).astype(float)
        return sequences_and_targets, targets_mat