# Source code for selene_sdk.samplers.file_samplers.bed_file_sampler

"""
This module provides the BedFileSampler class.
"""
import numpy as np

from .file_sampler import FileSampler

class BedFileSampler(FileSampler):
    """
    A sampler for which the dataset is loaded directly from a `*.bed` file.

    Parameters
    ----------
    filepath : str
        The path to the file to load the data from.
    reference_sequence : selene_sdk.sequences.Sequence
        A reference sequence from which to create examples.
    n_samples : int
        Number of lines in the file. (`wc -l <filepath>`)
    sequence_length : int or None, optional
        Default is None. If the coordinates of each sample in the BED file
        already account for the full sequence (that is,
        `end - start = sequence_length`), there is no need to specify the
        sequence length. If `sequence_length` is not None, the length of
        each sample will be checked to determine whether the sample
        coordinates need to be truncated or expanded to reach the sequence
        length specified in the model architecture.
    targets_avail : bool, optional
        Default is False. If `targets_avail`, assumes that it is the last
        column of the `*.bed` file. The last column should contain the
        indices, separated by semicolons, of features (classes) found
        within a given sample's coordinates (e.g. 0;1;45;60). This assumes
        that we are only looking for the absence/presence of each feature
        within the interval.
    n_features : int or None, optional
        Default is None. If `targets_avail` is True, must specify
        `n_features`, the total number of features (classes).

    Attributes
    ----------
    filepath : str
        The path to the file to load the data from.
    reference_sequence : selene_sdk.sequences.Sequence
        A reference sequence from which to create examples.
    n_samples : int
        Number of lines in the file. (`wc -l <filepath>`)
    sequence_length : int or None
        Target length of every sampled interval, or None to use the
        coordinates in the file unchanged.
    targets_avail : bool
        Whether the last column of the `*.bed` file holds the
        semicolon-separated feature indices of each sample.
    n_features : int or None
        The total number of features (classes). Required when
        `targets_avail` is True.

    Raises
    ------
    ValueError
        If `targets_avail` is True but `n_features` is None.

    """

    def __init__(self,
                 filepath,
                 reference_sequence,
                 n_samples,
                 sequence_length=None,
                 targets_avail=False,
                 n_features=None):
        """
        Constructs a new `BedFileSampler` object.
        """
        super(BedFileSampler, self).__init__()
        # Validate before opening the file so a bad configuration does not
        # leave a dangling file handle behind.
        if targets_avail and n_features is None:
            raise ValueError(
                "`n_features` must be specified when `targets_avail` "
                "is True.")
        self.filepath = filepath
        self._file_handle = open(self.filepath, 'r')
        self.reference_sequence = reference_sequence
        self.sequence_length = sequence_length
        self.targets_avail = targets_avail
        self.n_features = n_features
        self.n_samples = n_samples

    def _next_record(self):
        """
        Reads the next non-blank line of the file, split into columns.

        When the end of the file is reached the file is reopened and
        reading continues from the first line.

        Returns
        -------
        list(str)
            The tab-separated, whitespace-stripped columns of the record.

        Raises
        ------
        ValueError
            If the file contains no records, or if the record has fewer
            than the three mandatory columns (chrom, start, end).

        """
        wrapped = False
        while True:
            line = self._file_handle.readline()
            if not line:
                # A second end-of-file without having seen a record means
                # the file is empty; bail out instead of looping forever.
                if wrapped:
                    raise ValueError(
                        "No records found in {0}".format(self.filepath))
                # TODO: add functionality to shuffle the file if sampler
                # reaches the end of the file.
                self._file_handle.close()
                self._file_handle = open(self.filepath, 'r')
                wrapped = True
                continue
            if not line.strip():
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            cols = [col.strip() for col in line.split('\t')]
            if len(cols) < 3:
                raise ValueError(
                    "Expected at least 3 tab-separated columns in {0}, "
                    "got: {1!r}".format(self.filepath, line))
            return cols

    def _resize_interval(self, start, end):
        """
        Pads or trims `[start, end)` symmetrically to `sequence_length`.

        Parameters
        ----------
        start : int
            Start coordinate of the interval.
        end : int
            End coordinate of the interval.

        Returns
        -------
        start, end : tuple(int, int)
            The adjusted coordinates. They are returned unchanged when
            `sequence_length` is not set or already matches.

        """
        if not self.sequence_length:
            return start, end
        length = end - start
        if length < self.sequence_length:
            # Integer arithmetic: the left side gets floor(missing / 2) and
            # the right side the rest, i.e. ceil(missing / 2).
            missing = self.sequence_length - length
            pad_l = missing // 2
            pad_r = missing - pad_l
            return start - pad_l, end + pad_r
        if length > self.sequence_length:
            # Keep the window centred on the original interval; the trim
            # offset is relative to `start`, not to the chromosome origin.
            start = start + (length - self.sequence_length) // 2
            return start, start + self.sequence_length
        return start, end

    @staticmethod
    def _batch_sizes(batch_size, n_samples):
        """
        Splits `n_samples` into full batches followed by one final batch.

        The final batch holds whatever is left after the full batches; it
        is a full batch when `n_samples` divides evenly.

        Parameters
        ----------
        batch_size : int
            The size of a full batch. Must be at least 1.
        n_samples : int
            The total number of samples to cover.

        Returns
        -------
        list(int)
            The size of each batch, in order.

        Raises
        ------
        ValueError
            If `batch_size` is smaller than 1.

        """
        if batch_size < 1:
            raise ValueError(
                "`batch_size` must be a positive integer, got "
                "{0}".format(batch_size))
        sizes = []
        count = batch_size
        while count < n_samples:
            sizes.append(batch_size)
            count += batch_size
        sizes.append(batch_size - (count - n_samples))
        return sizes

    def sample(self, batch_size=1):
        """
        Draws a mini-batch of examples and their corresponding labels.

        Records whose encoded sequence is empty are skipped, so more than
        `batch_size` lines may be consumed from the file.

        Parameters
        ----------
        batch_size : int, optional
            Default is 1. The number of examples to include in the
            mini-batch.

        Returns
        -------
        sequences, targets : tuple(numpy.ndarray, numpy.ndarray)
            A tuple containing the numeric representation of the
            sequence examples and their corresponding labels. The
            shape of `sequences` will be
            :math:`B \\times L \\times N`, where :math:`B` is
            `batch_size`, :math:`L` is the sequence length, and
            :math:`N` is the size of the sequence type's alphabet.
            The shape of `targets` will be :math:`B \\times F`,
            where :math:`F` is the number of features. When
            `targets_avail` is False a 1-tuple `(sequences,)` is
            returned instead.

        Raises
        ------
        ValueError
            If the file has no records, a record is malformed, or
            `targets_avail` is True but a record has no targets column.

        """
        sequences = []
        targets = [] if self.targets_avail else None
        while len(sequences) < batch_size:
            cols = self._next_record()
            chrom = cols[0]
            start = int(cols[1])
            end = int(cols[2])

            strand_side = None
            features = None
            if len(cols) == 5:
                strand_side = cols[3]
                features = cols[4]
            elif len(cols) == 4 and self.targets_avail:
                features = cols[3]
            elif len(cols) == 4:
                strand_side = cols[3]

            # If no usable strand is given, assume strandedness does not
            # matter. Can change this to randomly selecting +/- later.
            if strand_side not in ('+', '-'):
                strand_side = '+'

            start, end = self._resize_interval(start, end)
            sequence = self.reference_sequence.get_encoding_from_coords(
                chrom, start, end, strand=strand_side)
            # Skip before touching `targets` so both lists stay aligned.
            if sequence.shape[0] == 0:
                continue
            sequences.append(sequence)

            if self.targets_avail:
                if features is None:
                    raise ValueError(
                        "`targets_avail` is True but no targets column "
                        "was found in a record of {0}".format(
                            self.filepath))
                tgts = np.zeros(self.n_features)
                indices = [int(f) for f in features.split(';') if f]
                tgts[indices] = 1
                targets.append(tgts.astype(float))

        sequences = np.array(sequences)
        if self.targets_avail:
            targets = np.array(targets)
            return (sequences, targets)
        return sequences,

    def get_data(self, batch_size, n_samples=None):
        """
        This method fetches a subset of the data from the sampler,
        divided into batches.

        Parameters
        ----------
        batch_size : int
            The size of the batches to divide the data into.
        n_samples : int, optional
            Default is None. The total number of samples to retrieve.
            Falls back to `self.n_samples` when not given.

        Returns
        -------
        sequences : list(np.ndarray)
            The list of sequences grouped into batches.
            An element in the `sequences` list is of
            the shape :math:`B \\times L \\times N`, where :math:`B`
            is `batch_size`, :math:`L` is the sequence length,
            and :math:`N` is the size of the sequence type's alphabet.

        Raises
        ------
        ValueError
            If `batch_size` is smaller than 1.

        """
        if not n_samples:
            n_samples = self.n_samples
        # `sample` returns (sequences,) or (sequences, targets); taking
        # element 0 works whether or not targets are available.
        return [self.sample(batch_size=size)[0]
                for size in self._batch_sizes(batch_size, n_samples)]

    def get_data_and_targets(self, batch_size, n_samples=None):
        """
        This method fetches a subset of the sequence data and
        targets from the sampler, divided into batches.

        Parameters
        ----------
        batch_size : int
            The size of the batches to divide the data into.
        n_samples : int, optional
            Default is None. The total number of samples to retrieve.
            Falls back to `self.n_samples` when not given.

        Returns
        -------
        sequences_and_targets, targets_matrix : \
        tuple(list(tuple(numpy.ndarray, numpy.ndarray)), numpy.ndarray)
            Tuple containing the list of sequence-target pairs, as well
            as a single matrix with all targets in the same order.
            Note that `sequences_and_targets`'s sequence elements are of
            the shape :math:`B \\times L \\times N` and its target
            elements are of the shape :math:`B \\times F`, where
            :math:`B` is `batch_size`, :math:`L` is the sequence length,
            :math:`N` is the size of the sequence type's alphabet, and
            :math:`F` is the number of features. Further,
            `target_matrix` is of the shape :math:`S \\times F`, where
            :math:`S =` `n_samples`.

        Raises
        ------
        ValueError
            If no targets are available in the `*.bed` file, or if
            `batch_size` is smaller than 1.

        """
        if not self.targets_avail:
            raise ValueError(
                "No targets are specified in the *.bed file. "
                "Please use `get_data` instead.")
        if not n_samples:
            n_samples = self.n_samples
        sequences_and_targets = []
        targets_mat = []
        for size in self._batch_sizes(batch_size, n_samples):
            seqs, tgts = self.sample(batch_size=size)
            sequences_and_targets.append((seqs, tgts))
            targets_mat.append(tgts)
        targets_mat = np.vstack(targets_mat).astype(int)
        return sequences_and_targets, targets_mat