Source code for selene_sdk.targets.genomic_features

"""
This class contains methods to query a file of genomic coordinates,
where each row of [start, end) coordinates corresponds to a genomic feature
in the sequence.

It accepts the path to a tabix-indexed .bed.gz file of genomic coordinates.

This .tsv/.bed file must contain the following columns, in order:
    chrom ('1', '2', ..., 'X', 'Y'), start (0-based), end, feature
Additionally, the column names should be omitted from the file itself
(i.e. there is no header and the first line in the file is the first
row of genome coordinates for a feature).
"""
import types

import tabix
import numpy as np

from .target import Target
from ._genomic_features import _fast_get_feature_data


def _any_positive_rows(rows, start, end, thresholds):
    """
    Searches through a set of feature annotations for positive examples
    according to a threshold specific to each feature. For each feature
    in `rows`, the overlap between the feature and the query region must
    be greater than that feature's threshold to be considered positive.

    Parameters
    ----------
    rows : list(tuple(int, int, str)) or None
        A list of tuples of the form `(start, end, feature_name)`, or
        `None`.
    start : int
        The 0-based start coordinate of the region to query.
    end : int
        One past the last coordinate of the region to query.
    thresholds : dict
        A dictionary mapping feature names (`str`) to
        thresholds (`float`), where the threshold is the minimum
        fraction of a region that must overlap with a label for it to be
        considered a positive example of that label.

    Returns
    -------
    bool
        `True` if there is at least one feature in `rows` that meets its
        feature-specific cutoff. `False` otherwise, or if `rows==None`.

    """
    if rows is None:
        return False
    for row in rows:  # features within [start, end)
        is_positive = _is_positive_row(
            start, end, int(row[1]), int(row[2]), thresholds[row[3]])
        if is_positive:
            return True
    return False


def _is_positive_row(start, end,
                     feature_start, feature_end,
                     threshold):
    """
    Determine if a feature annotation overlaps enough with a query
    region to meet a minimal overlap threshold and be considered a
    positive example of said feature.

    Parameters
    ----------
    start : int
        The 0-based start coordinate of the query region.
    end : int
        One past the last coordinate of the query region.
    feature_start : int
        The 0-based start coordinate of the feature.
    feature_end : int
        One past the last coordinate of the feature.
    threshold : float
        The minimum fraction of the query region that a feature must
        overlap with to be considered positive.

    Returns
    -------
    bool
        `True` if the feature is a positive example and meets the
        overlap threshold. Otherwise `False`.

    """
    overlap_start = max(feature_start, start)
    overlap_end = min(feature_end, end)
    min_overlap_needed = int(
        (end - start) * threshold - 1)
    if min_overlap_needed < 0:
        min_overlap_needed = 0
    if overlap_end - overlap_start > min_overlap_needed:
        return True
    else:
        return False


def _get_feature_data(chrom, start, end,
                      thresholds, feature_index_dict, get_feature_rows):
    """
    Generates a target vector for the given query region.

    Parameters
    ----------
     chrom : str
        The name of the region (e.g. 'chr1', 'chr2', ..., 'chrX',
        'chrY') to query inside of.
    start : int
        The 0-based start coordinate of the region to query.
    end : int
        One past the last coordinate of the region to query.
    thresholds : np.ndarray, dtype=numpy.float32
        An array of feature thresholds, where the value in position
        `i` corresponds to the threshold for the feature name that is
        mapped to index `i` by `feature_index_dict`.
    feature_index_dict : dict
        A dictionary mapping feature names (`str`) to indices (`int`),
        where the index is the position of the feature in `features`.
    get_feature_rows : types.FunctionType
        A function that takes coordinates and returns rows
        (`list(tuple(int, int, str))`).

    Returns
    -------
    numpy.ndarray, dtype=int
        A target vector where the `i`th position is equal to one if the
        `i`th feature is positive, and zero otherwise.

    """
    rows = get_feature_rows(chrom, start, end)
    return _fast_get_feature_data(
        start, end, thresholds, feature_index_dict, rows)


def _define_feature_thresholds(feature_thresholds, features):
    """
    Defines the minimal overlap thresholds for the various features.

    Parameters
    ----------
    feature_thresholds : float or dict or type.FunctionType
        Either a function that takes a feature name (`str`) and returns
        a threshold (`float`) or a constant `float` that will be used
        for all features.
    features : list(str)
        A list of feature names.

    Returns
    -------
    feature_thresholds_dict, feature_thresholds_vec : \
    tuple(dict, numpy.ndarray)
        A tuple. The first element, `feature_thresholds_dict`, is a
        dictionary that maps feature names (`str`) to thresholds
        (`float`). The second element, `feature_thresholds_vec`, is an
        array of the thresholds (numpy.`float32`) where the `i`th value
        corresponds to the threshold for the `i`th feature from
        the `features` input.

    """
    feature_thresholds_dict = {}
    feature_thresholds_vec = np.zeros(len(features))
    if (isinstance(feature_thresholds, float) or
            isinstance(feature_thresholds, int)) and feature_thresholds <= 1 \
            and feature_thresholds > 0:
        feature_thresholds_dict = dict.fromkeys(features, feature_thresholds)
        feature_thresholds_vec += feature_thresholds
    elif isinstance(feature_thresholds, dict):
        # assign the default value to everything first
        feature_thresholds_dict = dict.fromkeys(
            features, feature_thresholds["default"])
        feature_thresholds_vec += feature_thresholds["default"]
        for i, f in enumerate(features):
            if f in feature_thresholds:
                feature_thresholds_dict[f] = feature_thresholds[f]
                feature_thresholds_vec[i] = feature_thresholds[f]
    # this branch will not be accessed if you use a config.yml file to
    # specify input parameters
    elif isinstance(feature_thresholds, types.FunctionType):
        for i, f in enumerate(features):
            threshold = feature_thresholds(f)
            feature_thresholds_dict[f] = threshold
            feature_thresholds_vec[i] = threshold
    feature_thresholds_vec = feature_thresholds_vec.astype(np.float32)
    return feature_thresholds_dict, feature_thresholds_vec


[docs]class GenomicFeatures(Target): """ Stores the dataset specifying sequence regions and features. Accepts a tabix-indexed `*.bed` file with the following columns, in order: :: [chrom, start, end, strand, feature] Note that `chrom` is interchangeable with any sort of region (e.g. a protein in a FAA file). Further, `start` is 0-based. Lastly, any addition columns following the five shown above will be ignored. Parameters ---------- input_path : str Path to the tabix-indexed dataset. Note that for the file to be tabix-indexed, it must have been compressed with `bgzip`. Thus, `input_path` should be a `*.gz` file with a corresponding `*.tbi` file in the same directory. features : list(str) The non-redundant list of genomic features (i.e. labels) that will be predicted. feature_thresholds : float or dict or types.FunctionType or None Default is None. A genomic region is determined to be a positive sample if at least one genomic feature peak takes up a proportion of the region greater than or equal to the threshold specified for that feature. * `None` - No thresholds specified. All features found in\ a query region are annotated to that region. * `float` - A single threshold applies to all the features\ in the dataset. * `dict` - A dictionary mapping feature names (`str`) to \ threshold values (`float`), which thereby assigns\ different thresholds to different features. If a\ feature's threshold is not specified in this \ dictionary, then we assume that a key `"default"`\ exists in the dictionary that has the default \ threshold value we should assign to the feature \ name that is absent from the dictionary keys. * `types.FunctionType` - define a function that takes as \ input the feature name and returns\ the feature's threshold. Attributes ---------- data : tabix.open The data stored in a tabix-indexed `*.bed` file. n_features : int The number of distinct features. feature_index_dict : dict A dictionary mapping feature names (`str`) to indices (`int`), where the index is the position of the feature in `features`. index_feature_dict : dict A dictionary mapping indices (`int`) to feature names (`str`), where the index is the position of the feature in the input features. feature_thresholds : dict or None * `dict` - A dictionary mapping feature names (`str`) to thresholds\ (`float`), where the threshold is the minimum overlap that a\ feature annotation must have with a query region to be\ considered a positive example of that feature. * `None` - No threshold specifications. Assumes that all features\ returned by a tabix query are annotated to the query region. """ def __init__(self, input_path, features, feature_thresholds=None): """ Constructs a new `GenomicFeatures` object. """ self.data = tabix.open(input_path) self.n_features = len(features) self.feature_index_dict = dict( [(feat, index) for index, feat in enumerate(features)]) self.index_feature_dict = dict(list(enumerate(features))) if feature_thresholds is None: self.feature_thresholds = None self._feature_thresholds_vec = None else: self.feature_thresholds, self._feature_thresholds_vec = \ _define_feature_thresholds(feature_thresholds, features) def _query_tabix(self, chrom, start, end): """ Queries a tabix-indexed `*.bed` file for features falling into the specified region. Parameters ---------- chrom : str The name of the region (e.g. '1', '2', ..., 'X', 'Y') to query in. start : int The 0-based start position of the query coordinates. end : int One past the last position of the query coordinates. Returns ------- list(list(str)) or None A list, wherein each sub-list corresponds to a line from the tabix-indexed file, and each value in a sub-list corresponds to a column in that row. If a `tabix.TabixError` is caught, we assume it was because there were no features present in the query region, and return `None`. """ try: return self.data.query(chrom, start, end) except tabix.TabixError: return None
[docs] def is_positive(self, chrom, start, end): """ Determines whether the query the `chrom` queried contains any genomic features within the :math:`[start, end)` region. If so, the query is considered positive. Parameters ---------- chrom : str The name of the region (e.g. '1', '2', ..., 'X', 'Y'). start : int The 0-based first position in the region. end : int One past the 0-based last position in the region. Returns ------- bool `True` if this meets the criterion for a positive example, `False` otherwise. Note that if we catch a `tabix.TabixError` exception, we assume the error was the result of no features being present in the queried region and return `False`. """ rows = self._query_tabix(chrom, start, end) return _any_positive_rows(rows, start, end, self.feature_thresholds)
[docs] def get_feature_data(self, chrom, start, end): """ For a sequence of length :math:`L = end - start`, return the features' one-hot encoding corresponding to that region. For instance, for `n_features`, each position in that sequence will have a binary vector specifying whether the genomic feature's coordinates overlap with that position. @TODO: Clarify with an example, as this is hard to read right now. Parameters ---------- chrom : str The name of the region (e.g. '1', '2', ..., 'X', 'Y'). start : int The 0-based first position in the region. end : int One past the 0-based last position in the region. Returns ------- numpy.ndarray :math:`L \\times N` array, where :math:`L = end - start` and :math:`N =` `self.n_features`. Note that if we catch a `tabix.TabixError`, we assume the error was the result of there being no features present in the queried region and return a `numpy.ndarray` of zeros. """ if self._feature_thresholds_vec is None: features = np.zeros((end - start)) rows = self._query_tabix(chrom, start, end) if not rows: return features for r in rows: feature = r[3] ix = self.feature_index_map[feature] features[ix] = 1 return features return _get_feature_data( chrom, start, end, self._feature_thresholds_vec, self.feature_index_dict, self._query_tabix)