Source code for selene_sdk.samplers.online_sampler

"""
This module provides the `OnlineSampler` class and supporting methods.
Objects of the class `OnlineSampler`, are samplers which load examples
"on the fly" rather than storing them all persistently in memory.

"""
from abc import ABCMeta
import os
import random

import numpy as np

from .sampler import Sampler
from ..targets import GenomicFeatures


[docs]class OnlineSampler(Sampler, metaclass=ABCMeta): """ A sampler in which training/validation/test data is constructed from random sampling of the dataset for each batch passed to the model. This form of sampling may alleviate the problem of loading an extremely large dataset into memory when developing a new model. Parameters ---------- reference_sequence : selene_sdk.sequences.Sequence A reference sequence from which to create examples. target_path : str Path to tabix-indexed, compressed BED file (`*.bed.gz`) of genomic coordinates mapped to the genomic features we want to predict. features : list(str) List of distinct features that we aim to predict. seed : int, optional Default is 436. Sets the random seed for sampling. validation_holdout : list(str) or float, optional Default is `['chr6', 'chr7']`. Holdout can be regional or proportional. If regional, expects a list (e.g. `['X', 'Y']`). Regions must match those specified in the first column of the tabix-indexed BED file. If proportional, specify a percentage between (0.0, 1.0). Typically 0.10 or 0.20. test_holdout : list(str) or float, optional Default is `['chr8', 'chr9']`. See documentation for `validation_holdout` for additional information. sequence_length : int, optional Default is 1000. Model is trained on sequences of `sequence_length` where genomic features are annotated to the center regions of these sequences. center_bin_to_predict : int, optional Default is 200. Query the tabix-indexed file for a region of length `center_bin_to_predict`. feature_thresholds : float [0.0, 1.0], optional Default is 0.5. The `feature_threshold` to pass to the `GenomicFeatures` object. mode : {'train', 'validate', 'test'}, optional Default is `'train'`. The mode to run the sampler in. save_datasets : list(str), optional Default is `[]` the empty list. The list of modes for which we should save the sampled data to file (e.g. `["test", "validate"]`). output_dir : str or None, optional Default is None. The path to the directory where we should save sampled examples for a mode. If `save_datasets` is a non-empty list, `output_dir` must be specified. If the path in `output_dir` does not exist it will be created automatically. Attributes ---------- reference_sequence : selene_sdk.sequences.Sequence The reference sequence that examples are created from. target : selene_sdk.targets.Target The `selene_sdk.targets.Target` object holding the features that we would like to predict. validation_holdout : list(str) or float The samples to hold out for validating model performance. These can be "regional" or "proportional". If regional, this is a list of region names (e.g. `['chrX', 'chrY']`). These regions must match those specified in the first column of the tabix-indexed BED file. If proportional, this is the fraction of total samples that will be held out. test_holdout : list(str) or float The samples to hold out for testing model performance. See the documentation for `validation_holdout` for more details. sequence_length : int The length of the sequences to train the model on. modes : list(str) The list of modes that the sampler can be run in. mode : str The current mode that the sampler is running in. Must be one of the modes listed in `modes`. Raises ------ ValueError If `mode` is not a valid mode. ValueError If the parities of `sequence_length` and `center_bin_to_predict` are not the same. ValueError If `sequence_length` is smaller than `center_bin_to_predict` is. ValueError If the types of `validation_holdout` and `test_holdout` are not the same. """ STRAND_SIDES = ('+', '-') """ Defines the strands that features can be sampled from. """ def __init__(self, reference_sequence, target_path, features, seed=436, validation_holdout=['chr6', 'chr7'], test_holdout=['chr8', 'chr9'], sequence_length=1001, center_bin_to_predict=201, feature_thresholds=0.5, mode="train", save_datasets=[], output_dir=None): """ Creates a new `OnlineSampler` object. """ super(OnlineSampler, self).__init__( features, save_datasets=save_datasets, output_dir=output_dir) self.seed = seed np.random.seed(self.seed) random.seed(self.seed + 1) if isinstance(center_bin_to_predict, int): if (sequence_length + center_bin_to_predict) % 2 != 0: raise ValueError( "Sequence length of {0} with a center bin length of {1} " "is invalid. These 2 inputs should both be odd or both be " "even.".format(sequence_length, center_bin_to_predict)) # specifying a test holdout partition is optional if test_holdout: self.modes.append("test") if isinstance(validation_holdout, (list,)) and \ isinstance(test_holdout, (list,)): self.validation_holdout = [ str(c) for c in validation_holdout] self.test_holdout = [str(c) for c in test_holdout] self._holdout_type = "chromosome" elif isinstance(validation_holdout, float) and \ isinstance(test_holdout, float): self.validation_holdout = validation_holdout self.test_holdout = test_holdout self._holdout_type = "proportion" else: raise ValueError( "Validation holdout and test holdout must have the " "same type (list or float) but validation was " "type {0} and test was type {1}".format( type(validation_holdout), type(test_holdout))) else: self.test_holdout = None if isinstance(validation_holdout, (list,)): self.validation_holdout = [ str(c) for c in validation_holdout] self._holdout_type = "chromosome" elif isinstance(validation_holdout, float): self.validation_holdout = validation_holdout self._holdout_type = "proportion" else: raise ValueError( "Validation holdout must be of type list (chromosomal " "holdout) or float (proportion holdout) but was type " "{0}.".format(type(validation_holdout))) if mode not in self.modes: raise ValueError( "Mode must be one of {0}. Input was '{1}'.".format( self.modes, mode)) self.mode = mode self.sequence_length = sequence_length window_radius = int(self.sequence_length / 2) self._start_window_radius = window_radius self._end_window_radius = window_radius if self.sequence_length % 2 != 0: self._end_window_radius += 1 if isinstance(center_bin_to_predict, int): bin_radius = int(center_bin_to_predict / 2) self._start_radius = bin_radius self._end_radius = bin_radius if center_bin_to_predict % 2 != 0: self._end_radius += 1 else: if not isinstance(center_bin_to_predict, list) or \ len(center_bin_to_predict) != 2: raise ValueError( "`center_bin_to_predict` needs to be either an int or a list of " "two ints, but was type '{0}'".format( type(center_bin_to_predict))) else: bin_start, bin_end = center_bin_to_predict if bin_start < 0 or bin_end > self.sequence_length: ValueError( "center_bin_to_predict [{0}, {1}]" "is out-of-bound for sequence length {3}.".format( bin_start, bin_end, self.sequence_length)) self._start_radius = self._start_window_radius - bin_start self._end_radius = self._end_window_radius - (self.sequence_length - bin_end) self.reference_sequence = reference_sequence self.n_features = len(self._features) self.target = GenomicFeatures( target_path, self._features, feature_thresholds=feature_thresholds) self._save_filehandles = {}
[docs] def get_feature_from_index(self, index): """ Returns the feature corresponding to an index in the feature vector. Parameters ---------- index : int The index of the feature to retrieve the name for. Returns ------- str The name of the feature occurring at the specified index. """ return self.target.index_feature_dict[index]
[docs] def get_sequence_from_encoding(self, encoding): """ Gets the string sequence from the one-hot encoding of the sequence. Parameters ---------- encoding : numpy.ndarray An :math:`L \\times N` array (where :math:`L` is the length of the sequence and :math:`N` is the size of the sequence type's alphabet) containing the one-hot encoding of the sequence. Returns ------- str The sequence of :math:`L` characters decoded from the input. """ return self.reference_sequence.encoding_to_sequence(encoding)
[docs] def save_dataset_to_file(self, mode, close_filehandle=False): """ Save samples for each partition (i.e. train/validate/test) to disk. Parameters ---------- mode : str Must be one of the modes specified in `save_datasets` during sampler initialization. close_filehandle : bool, optional Default is False. `close_filehandle=True` assumes that all data corresponding to the input `mode` has been saved to file and `save_dataset_to_file` will not be called with `mode` again. """ if mode not in self._save_datasets: return samples = self._save_datasets[mode] if mode not in self._save_filehandles: self._save_filehandles[mode] = open( os.path.join(self._output_dir, "{0}_data.bed".format(mode)), 'w+') file_handle = self._save_filehandles[mode] while len(samples) > 0: cols = samples.pop(0) line = '\t'.join([str(c) for c in cols]) file_handle.write("{0}\n".format(line)) if close_filehandle: file_handle.close()
[docs] def get_data_and_targets(self, batch_size, n_samples=None, mode=None): """ This method fetches a subset of the data from the sampler, divided into batches. This method also allows the user to specify what operating mode to run the sampler in when fetching the data. Parameters ---------- batch_size : int The size of the batches to divide the data into. n_samples : int or None, optional Default is None. The total number of samples to retrieve. If `n_samples` is None and the mode is `validate`, will set `n_samples` to 32000; if the mode is `test`, will set `n_samples` to 640000 if it is None. If the mode is `train` you must have specified a value for `n_samples`. mode : str, optional Default is None. The mode to run the sampler in when fetching the samples. See `selene_sdk.samplers.IntervalsSampler.modes` for more information. If None, will use the current mode `self.mode`. Returns ------- sequences_and_targets, targets_matrix : \ tuple(list(tuple(numpy.ndarray, numpy.ndarray)), numpy.ndarray) Tuple containing the list of sequence-target pairs, as well as a single matrix with all targets in the same order. Note that `sequences_and_targets`'s sequence elements are of the shape :math:`B \\times L \\times N` and its target elements are of the shape :math:`B \\times F`, where :math:`B` is `batch_size`, :math:`L` is the sequence length, :math:`N` is the size of the sequence type's alphabet, and :math:`F` is the number of features. Further, `target_matrix` is of the shape :math:`S \\times F`, where :math:`S =` `n_samples`. """ if mode is not None: self.set_mode(mode) else: mode = self.mode sequences_and_targets = [] if n_samples is None and mode == "validate": n_samples = 32000 elif n_samples is None and mode == "test": n_samples = 640000 n_batches = int(n_samples / batch_size) for _ in range(n_batches): inputs, targets = self.sample(batch_size) sequences_and_targets.append((inputs, targets)) targets_mat = np.vstack([t for (s, t) in sequences_and_targets]) if mode in self._save_datasets: self.save_dataset_to_file(mode, close_filehandle=True) return sequences_and_targets, targets_mat
[docs] def get_dataset_in_batches(self, mode, batch_size, n_samples=None): """ This method returns a subset of the data for a specified run mode, divided into mini-batches. Parameters ---------- mode : {'test', 'validate'} The mode to run the sampler in when fetching the samples. See `selene_sdk.samplers.IntervalsSampler.modes` for more information. batch_size : int The size of the batches to divide the data into. n_samples : int or None, optional Default is `None`. The total number of samples to retrieve. If `None`, it will retrieve 32000 samples if `mode` is validate or 640000 samples if `mode` is test or train. Returns ------- sequences_and_targets, targets_matrix : \ tuple(list(tuple(numpy.ndarray, numpy.ndarray)), numpy.ndarray) Tuple containing the list of sequence-target pairs, as well as a single matrix with all targets in the same order. The list is length :math:`S`, where :math:`S =` `n_samples`. Note that `sequences_and_targets`'s sequence elements are of the shape :math:`B \\times L \\times N` and its target elements are of the shape :math:`B \\times F`, where :math:`B` is `batch_size`, :math:`L` is the sequence length, :math:`N` is the size of the sequence type's alphabet, and :math:`F` is the number of features. Further, `target_matrix` is of the shape :math:`S \\times F` """ return self.get_data_and_targets( batch_size, n_samples=n_samples, mode=mode)
[docs] def get_validation_set(self, batch_size, n_samples=None): """ This method returns a subset of validation data from the sampler, divided into batches. Parameters ---------- batch_size : int The size of the batches to divide the data into. n_samples : int or None, optional Default is `None`. The total number of validation examples to retrieve. If `None`, 32000 examples are retrieved. Returns ------- sequences_and_targets, targets_matrix : \ tuple(list(tuple(numpy.ndarray, numpy.ndarray)), numpy.ndarray) Tuple containing the list of sequence-target pairs, as well as a single matrix with all targets in the same order. Note that `sequences_and_targets`'s sequence elements are of the shape :math:`B \\times L \\times N` and its target elements are of the shape :math:`B \\times F`, where :math:`B` is `batch_size`, :math:`L` is the sequence length, :math:`N` is the size of the sequence type's alphabet, and :math:`F` is the number of features. Further, `target_matrix` is of the shape :math:`S \\times F`, where :math:`S =` `n_samples`. """ return self.get_dataset_in_batches( "validate", batch_size, n_samples=n_samples)
[docs] def get_test_set(self, batch_size, n_samples=None): """ This method returns a subset of testing data from the sampler, divided into batches. Parameters ---------- batch_size : int The size of the batches to divide the data into. n_samples : int or None, optional Default is `None`. The total number of validation examples to retrieve. If `None`, 640000 examples are retrieved. Returns ------- sequences_and_targets, targets_matrix : \ tuple(list(tuple(numpy.ndarray, numpy.ndarray)), numpy.ndarray) Tuple containing the list of sequence-target pairs, as well as a single matrix with all targets in the same order. Note that `sequences_and_targets`'s sequence elements are of the shape :math:`B \\times L \\times N` and its target elements are of the shape :math:`B \\times F`, where :math:`B` is `batch_size`, :math:`L` is the sequence length, :math:`N` is the size of the sequence type's alphabet, and :math:`F` is the number of features. Further, `target_matrix` is of the shape :math:`S \\times F`, where :math:`S =` `n_samples`. Raises ------ ValueError If no test partition of the data was specified during sampler initialization. """ if "test" not in self.modes: raise ValueError("No test partition of the data was specified " "during initialization. Cannot use method " "`get_test_set`.") return self.get_dataset_in_batches("test", batch_size, n_samples)