"""
This class is the abstract base class for all handlers, i.e. objects
that "handle" model predictions. Specifically, handlers should store
the model predictions or scores derived from those predictions and eventually
output them according to a user-specified output format.
"""
from abc import ABCMeta
from abc import abstractmethod
import os
from sys import getsizeof
import h5py


def write_to_tsv_file(data_across_features, info_cols, output_filepath):
    """
    Write samples with valid predictions/scores to a tab-delimited file.

    Parameters
    ----------
    data_across_features : list(arraylike)
        Batches of model predictions, or scores derived from those
        predictions, across all the genomic/sequence-level features the
        model can predict. Each element in the list is a batch with one
        row per input sequence and one column per sequence-level feature.
    info_cols : list(arraylike)
        Identifying information attached to each sequence entry, batched
        in the same way as `data_across_features`. All columns in an
        element of `info_cols` will be prepended to the values in the
        corresponding element of `data_across_features`.
    output_filepath : str
        Filepath to which to write the outputs.

    """
    with open(output_filepath, 'a') as output_handle:
        for info_batch, preds_batch in zip(info_cols, data_across_features):
            for info, preds in zip(info_batch, preds_batch):
                preds_str = '\t'.join(
                    probabilities_to_string(list(preds)))
                info_str = '\t'.join([str(i) for i in info])
                output_handle.write("{0}\t{1}\n".format(info_str, preds_str))


def write_to_hdf5_file(data_across_features,
                       info_cols,
                       hdf5_filepath,
                       start_index,
                       info_filepath=None):
    """
    Write samples with valid predictions/scores to an HDF5 file. The
    dataset attached to this file is accessed using the key "data".
    Each column corresponds to the prediction/score for a model class
    (e.g. genomic feature), and each row is a different input
    variant/sequence.

    Parameters
    ----------
    data_across_features : list(arraylike)
        Batches of model predictions, or scores derived from those
        predictions, across all the genomic/sequence-level features the
        model can predict. Each element in the list is a batch with one
        row per input sequence and one column per sequence-level feature.
    info_cols : list(arraylike)
        Identifying information attached to each sequence entry, batched
        in the same way as `data_across_features`. Each item in `info_cols`
        holds the label information for the rows written to the file. All
        values in an element of `info_cols` will be written to a separate
        .txt file.
    hdf5_filepath : str
        HDF5 filepath to which to write the data.
    start_index : int
        The row index in the HDF5 matrix at which to start writing the data.
    info_filepath : str or None, optional
        Default is None. .txt filepath to which to write the labels.
        Can be None if multiple handlers were initialized, since only one
        of them needs to write to the labels file.

    Returns
    -------
    int
        The updated start_index.

    """
    if info_filepath is not None:
        with open(info_filepath, 'a') as info_handle:
            for info_batch in info_cols:
                for info in info_batch:
                    info_str = '\t'.join([str(i) for i in info])
                    info_handle.write("{0}\n".format(info_str))
    with h5py.File(hdf5_filepath, 'a') as hdf5_handle:
        data = hdf5_handle["data"]
        for data_batch in data_across_features:
            data[start_index:(start_index + data_batch.shape[0])] = data_batch
            start_index += data_batch.shape[0]
    return start_index
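
# A minimal sketch (hypothetical filenames and shapes): the "data" dataset
# must already exist before `write_to_hdf5_file` is called, which is what
# `PredictionsHandler._create_write_handler` does for handler outputs, e.g.
#
#     import numpy as np
#     with h5py.File("scores.h5", "w") as fh:
#         fh.create_dataset("data", (4, 3), dtype='float64')
#     start = 0
#     start = write_to_hdf5_file([np.random.rand(2, 3)], [[("a",), ("b",)]],
#                                "scores.h5", start, info_filepath="rows.txt")
#     start = write_to_hdf5_file([np.random.rand(2, 3)], [[("c",), ("d",)]],
#                                "scores.h5", start, info_filepath="rows.txt")
#
# The returned index is passed back in on the next call so that successive
# batches fill consecutive rows of the matrix.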


def probabilities_to_string(probabilities):
    """
    Converts a list of probability values (`float`s) to a list of
    `str` probability values, where each value is represented in
    scientific notation with 2 digits after the decimal.

    Parameters
    ----------
    probabilities : list(float)

    Returns
    -------
    list(str)

    """
    return ["{:.2e}".format(p) for p in probabilities]


class PredictionsHandler(metaclass=ABCMeta):
    """
    The abstract base class for handlers, which "handle" model
    predictions. Handlers are responsible for accepting predictions,
    storing these predictions or scores derived from the predictions,
    and then returning them in a user-specified output format (Selene
    currently supports TSV and HDF5 file outputs).

    Parameters
    ----------
    features : list(str)
        List of sequence-level features, in the same order that the
        model will return its predictions.
    columns_for_ids : list(str)
        Columns in the file that will help to identify the sequence
        or variant to which the model prediction scores correspond.
    output_path_prefix : str
        Path to the file to which Selene will write the absolute difference
        scores. The path may contain a filename prefix. Selene will append
        a handler-specific name to the end of the path/prefix.
    output_format : {'tsv', 'hdf5'}
        Specify the desired output format. TSV can be specified if the
        final file should be easily perused (e.g. viewed in a text
        editor/Excel). However, saving to a TSV file is much slower than
        saving to an HDF5 file.
    output_size : int, optional
        The total number of rows in the output. Must be specified when
        `output_format` is 'hdf5'.
    write_mem_limit : int, optional
        Default is 1500. Specify the amount of memory, in MB, that can be
        allocated to storing model predictions/scores for this particular
        handler. The handler will write to file whenever this memory limit
        is reached.
    write_labels : bool, optional
        Default is True. If you initialize multiple write handlers for the
        same set of inputs with output format 'hdf5', set `write_labels` to
        False on all handlers except one so that only one handler writes
        the row labels to an output file.

    Attributes
    ----------
    needs_base_pred : bool
        Whether the handler needs the base (reference) prediction as input
        to compute the final output.

    """

    def __init__(self,
                 features,
                 columns_for_ids,
                 output_path_prefix,
                 output_format,
                 output_size=None,
                 write_mem_limit=1500,
                 write_labels=True):
        self.needs_base_pred = False
        self._results = []
        self._samples = []

        self._features = features
        self._columns_for_ids = columns_for_ids
        self._output_path_prefix = output_path_prefix
        self._output_format = output_format
        self._output_size = output_size
        if output_format == 'hdf5' and output_size is None:
            raise ValueError("`output_size` must be specified when "
                             "`output_format` is 'hdf5'.")

        self._output_filepath = None
        self._labels_filepath = None
        self._hdf5_start_index = None

        self._write_mem_limit = write_mem_limit
        self._write_labels = write_labels

    def _create_write_handler(self, handler_filename):
        """
        Initialize handlers for writing outputs to file.
        """
        output_path = None
        filename_prefix = None
        if not os.path.isdir(self._output_path_prefix):
            output_path, filename_prefix = os.path.split(
                self._output_path_prefix)
        else:
            output_path = self._output_path_prefix
        if filename_prefix is not None:
            handler_filename = "{0}_{1}".format(
                filename_prefix, handler_filename)
        scores_filepath = os.path.join(output_path, handler_filename)

        if self._output_format == "tsv":
            self._output_filepath = "{0}.tsv".format(scores_filepath)
            # write the header row: ID columns followed by feature names
            with open(self._output_filepath, 'w+') as output_handle:
                column_names = self._columns_for_ids + self._features
                output_handle.write("{0}\n".format(
                    '\t'.join(column_names)))
        elif self._output_format == "hdf5":
            self._output_filepath = "{0}.h5".format(scores_filepath)
            # preallocate the full prediction matrix under the "data" key
            with h5py.File(self._output_filepath, 'w') as output_handle:
                output_handle.create_dataset(
                    "data",
                    (self._output_size, len(self._features)),
                    dtype='float64')
            self._hdf5_start_index = 0

        if not self._write_labels:
            return
        labels_filename = "row_labels.txt"
        if filename_prefix is not None:
            # always output the same row labels filename
            # (strip any '.ref'/'.alt' suffix from the prefix)
            if filename_prefix[-4:] == '.ref' or \
                    filename_prefix[-4:] == '.alt':
                filename_prefix = filename_prefix[:-4]
            labels_filename = "{0}_{1}".format(
                filename_prefix, labels_filename)
        self._labels_filepath = os.path.join(output_path, labels_filename)
        # create the labels file and write its header row
        with open(self._labels_filepath, 'w+') as label_handle:
            label_handle.write("{0}\n".format(
                '\t'.join(self._columns_for_ids)))
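
    # For example (hypothetical values): with
    # `output_path_prefix="outputs/run1"` and `handler_filename="diffs"`,
    # a TSV handler writes scores to "outputs/run1_diffs.tsv" and, when
    # `write_labels` is True, the row labels go to
    # "outputs/run1_row_labels.txt".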

    def _reached_mem_limit(self):
        # estimate the memory held by the accumulated results and sample
        # info, assuming all batches are the same size as the first
        mem_used = (self._results[0].nbytes * len(self._results) +
                    getsizeof(self._samples[0]) * len(self._samples))
        return mem_used / 10**6 >= self._write_mem_limit
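
    # To make the arithmetic concrete (hypothetical batch shape): a batch of
    # 64 sequences scored across 1000 features stored as float64 occupies
    # 64 * 1000 * 8 bytes = 0.512 MB, so the default 1500 MB limit lets
    # roughly 2900 such batches accumulate before a write is triggered.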

    @abstractmethod
    def handle_batch_predictions(self, *args, **kwargs):
        """
        Must be able to handle a batch of model predictions.
        """
        raise NotImplementedError

    def write_to_file(self):
        """
        Writes accumulated handler results to file.
        """
        if not self._results:
            return None
        if self._hdf5_start_index is not None:
            self._hdf5_start_index = write_to_hdf5_file(
                self._results,
                self._samples,
                self._output_filepath,
                self._hdf5_start_index,
                info_filepath=self._labels_filepath)
        else:
            write_to_tsv_file(self._results,
                              self._samples,
                              self._output_filepath)
        self._results = []
        self._samples = []
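

# A minimal subclass sketch (the `RawPredictionsHandler` name and behavior
# are hypothetical, shown for illustration only): a concrete handler
# accumulates each batch and its identifying columns, then flushes to file
# once the memory limit is reached.
#
#     class RawPredictionsHandler(PredictionsHandler):
#         def __init__(self, features, columns_for_ids, output_path_prefix,
#                      output_format, output_size=None,
#                      write_mem_limit=1500, write_labels=True):
#             super(RawPredictionsHandler, self).__init__(
#                 features, columns_for_ids, output_path_prefix,
#                 output_format, output_size=output_size,
#                 write_mem_limit=write_mem_limit, write_labels=write_labels)
#             self._create_write_handler("predictions")
#
#         def handle_batch_predictions(self, batch_predictions, batch_ids):
#             # batch_predictions is assumed to be a numpy array of shape
#             # [batch size, len(features)]; batch_ids holds the ID columns
#             self._results.append(batch_predictions)
#             self._samples.append(batch_ids)
#             if self._reached_mem_limit():
#                 self.write_to_file()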