Source code for selene_sdk.sequences.sequence

"""
This module provides the `Sequence` class. This class is the abstract
base class for biological sequence collections (e.g. genomes).

"""
from abc import ABCMeta
from abc import abstractmethod

import numpy as np

from ._sequence import _fast_sequence_to_encoding


def sequence_to_encoding(sequence, base_to_index, bases_arr):
    """One-hot encode an input sequence.

    The work is delegated to `_fast_sequence_to_encoding`; this wrapper
    only derives the alphabet size from `bases_arr`.

    Parameters
    ----------
    sequence : str
        The input sequence of length :math:`L`.
    base_to_index : dict
        Maps each character of the alphabet to the column that is set to
        1 when that character occurs at a position of the input. A
        position whose character has no entry keeps an all-zero row.
        Rows correspond one-to-one to positions of the input sequence.
        For a genome, the keys would be `['A', 'C', 'G', 'T']`, mapping
        to the values `[0, 1, 2, 3]`.
    bases_arr : list(str)
        The characters in the sequence's alphabet. Only its length is
        used here, as the number of columns of the encoding.

    Returns
    -------
    numpy.ndarray, dtype=numpy.float32
        The :math:`L \\times N` encoding of the sequence, where
        :math:`L` is the length of the input sequence and :math:`N` is
        the size of the sequence alphabet.

    """
    alphabet_size = len(bases_arr)
    return _fast_sequence_to_encoding(sequence, base_to_index, alphabet_size)
def _get_base_index(encoding_row): unk_val = 1 / len(encoding_row) for index, val in enumerate(encoding_row): if np.isclose(val, unk_val) is True: return -1 elif val == 1: return index return -1
def encoding_to_sequence(encoding, bases_arr, unk_base):
    """Decode a one-hot encoding back into its string sequence.

    Parameters
    ----------
    encoding : numpy.ndarray, dtype=numpy.float32
        The :math:`L \\times N` encoding of the sequence, where
        :math:`L` is the length of the sequence, and :math:`N` is the
        size of the sequence alphabet.
    bases_arr : list(str)
        The bases of the sequence's alphabet, ordered so that
        `bases_arr[i]` is the base stored in column `i` of the encoding.
    unk_base : str
        The base emitted for positions that `_get_base_index` reports as
        -1, i.e. the "unknown" character of this encoding. See
        `selene_sdk.sequences.Sequence.UNK_BASE` for more information.

    Returns
    -------
    str
        The sequence of :math:`L` characters decoded from the input
        array.

    """
    columns = (_get_base_index(position) for position in encoding)
    return "".join(
        unk_base if column == -1 else bases_arr[column]
        for column in columns
    )
def get_reverse_encoding(encoding,
                         bases_arr,
                         base_to_index,
                         complementary_base_dict):
    """Compute the encoding of the reverse complement of a sequence.

    Each position `i` of the input is decoded with `_get_base_index`,
    complemented, and written to the mirrored position `L - 1 - i` of
    the output.

    Parameters
    ----------
    encoding : numpy.ndarray
        The two-dimensional encoding to reverse complement, with one row
        per sequence position and one column per alphabet character.
    bases_arr : list(str)
        The bases of the alphabet; `bases_arr[i]` is the base stored in
        column `i` of the encoding.
    base_to_index : dict
        Maps each base (`str`) to its column index in the encoding.
    complementary_base_dict : dict
        Maps each base (`str`) to its complementary base (`str`).

    Returns
    -------
    numpy.ndarray
        A new array with the same shape as `encoding`, allocated with
        `numpy.zeros` (so it has NumPy's default float dtype). Known
        positions hold a 1 in the column of the complementary base;
        positions decoded as unknown hold `1 / len(bases_arr)` in every
        column.

    """
    seq_len = encoding.shape[0]
    reverse_encoding = np.zeros(encoding.shape)
    for index, row in enumerate(encoding):
        # Mirror the position using the loop index. Subtracting `row`
        # itself would yield a float array that cannot index the output.
        rev_index = seq_len - index - 1
        base_pos = _get_base_index(row)
        if base_pos == -1:
            # Unknown positions must be mirrored too, otherwise they
            # would land on rows that belong to other positions.
            reverse_encoding[rev_index, :] = 1 / len(bases_arr)
        else:
            base = complementary_base_dict[bases_arr[base_pos]]
            complem_base_pos = base_to_index[base]
            reverse_encoding[rev_index, complem_base_pos] = 1
    return reverse_encoding
def reverse_complement_sequence(sequence, complementary_base_dict):
    """Return the reverse complement of a sequence.

    The sequence is walked from its last character to its first, and
    each character is replaced by its complement.

    Parameters
    ----------
    sequence : str
        The sequence to reverse complement.
    complementary_base_dict : dict
        A dict that maps bases (`str`) to their complementary bases
        (`str`).

    Returns
    -------
    str
        The reverse complement of the input sequence.

    Raises
    ------
    KeyError
        If a character of `sequence` has no entry in
        `complementary_base_dict`.

    """
    return "".join(
        complementary_base_dict[base] for base in reversed(sequence)
    )
class Sequence(metaclass=ABCMeta):
    """
    Abstract interface shared by all biological sequence classes.

    Concrete subclasses must provide every property and method declared
    below; each declaration here only raises `NotImplementedError`.

    """

    @property
    @abstractmethod
    def BASE_TO_INDEX(self):
        """
        The mapping from each member of the alphabet (i.e. every symbol
        that can occur in a sequence) to an integer.

        Returns
        -------
        dict
            The dictionary mapping the alphabet to integers.

        """
        raise NotImplementedError

    @property
    @abstractmethod
    def INDEX_TO_BASE(self):
        """
        The mapping from integers to members of the alphabet (i.e. every
        symbol that can occur in a sequence). It is expected that
        `INDEX_TO_BASE[i]==BASES_ARR[i]` is `True` for all valid `i`.

        Returns
        -------
        dict
            The dictionary mapping integers to the alphabet.

        """
        raise NotImplementedError

    @property
    @abstractmethod
    def BASES_ARR(self):
        """
        The alphabet as an array (i.e. every symbol that may occur in a
        sequence). It is expected that
        `INDEX_TO_BASE[i]==BASES_ARR[i]` is `True` for all valid `i`.

        Returns
        -------
        numpy.ndarray, dtype=str
            The array of all members of the alphabet.

        """
        raise NotImplementedError

    @property
    @abstractmethod
    def UNK_BASE(self):
        """
        The base that stands for an unknown position. It is distinct
        from a character outside the sequence's alphabet, which is an
        error: an unknown base means the position holds one of the
        alphabet's bases, but it is uncertain which one.

        Returns
        -------
        str
            The character representing an unknown base.

        """
        raise NotImplementedError

    @abstractmethod
    def coords_in_bounds(self, *args, **kwargs):
        """Reports whether the queried coordinates are valid.

        Returns
        -------
        bool
            `True` if the coordinates are in bounds, otherwise `False`.

        """
        raise NotImplementedError

    @abstractmethod
    def get_sequence_from_coords(self, *args, **kwargs):
        """Fetches the string form of the sequence found at the given
        coordinates.

        Returns
        -------
        str
            The sequence of bases occurring at the queried coordinates.
            This sequence will be of length :math:`L` normally, but
            only if the coordinates are valid. Behavior is undefined
            for invalid coordinates.

        """
        raise NotImplementedError

    @abstractmethod
    def get_encoding_from_coords(self, *args, **kwargs):
        """Fetches the numerical encoding of the sequence found at the
        given coordinates.

        Returns
        -------
        numpy.ndarray, dtype=numpy.float32
            The :math:`L \\times N` encoding of the sequence occurring
            at the queried coordinates, where :math:`L` is the length
            of the sequence, and :math:`N` is the size of the sequence
            type's alphabet. Behavior is undefined for invalid
            coordinates.

        """
        raise NotImplementedError

    @classmethod
    @abstractmethod
    def sequence_to_encoding(cls, sequence):
        """Converts a biological sequence into its numerical
        representation.

        Parameters
        ----------
        sequence : str
            The input sequence of characters.

        Returns
        -------
        numpy.ndarray, dtype=numpy.float32
            The :math:`L \\times N` encoding of the sequence, where
            :math:`L` is the length of the sequence, and :math:`N` is
            the size of the sequence type's alphabet.

        """
        raise NotImplementedError

    @classmethod
    @abstractmethod
    def encoding_to_sequence(cls, encoding):
        """Converts the numerical representation of a sequence back
        into its string representation.

        Parameters
        ----------
        encoding : numpy.ndarray, dtype=numpy.float32
            The :math:`L \\times N` encoding of the sequence, where
            :math:`L` is the length of the sequence, and :math:`N` is
            the size of the sequence type's alphabet.

        Returns
        -------
        str
            The sequence of bases decoded from the input array. This
            sequence will be of length :math:`L`.

        """
        raise NotImplementedError