Source code for selene_sdk.sequences.proteome

This module provides the `Proteome` class. This class wraps the
indexed FASTA file for an organism's proteomic sequence. It supports
retrieving parts of the sequence and converting these parts into their
one-hot encodings.

import numpy as np
import pyfaidx

from .sequence import Sequence
from .sequence import sequence_to_encoding
from .sequence import encoding_to_sequence

def _get_sequence_from_coords(len_prots, proteome_sequence,
                              prot, start, end):
    """
    Gets the amino acid sequence at specified coordinates.

    Parameters
    ----------
    len_prots : dict
        A dictionary mapping protein names to lengths.
    proteome_sequence : function
        A closure that returns the sequence at given coordinates. It is
        called as `proteome_sequence(prot, start, end)`.
    prot : str
        The name of a protein, e.g. "YFP".
    start : int
        The 0-based start coordinate of the first position in the
        sequence.
    end : int
        One past the 0-based last position in the sequence.

    Returns
    -------
    str
        The amino acid sequence, or the empty string if the coordinates
        fail the bounds check below.

    Raises
    ------
    KeyError
        If `prot` is not a key of `len_prots`.

    """
    prot_len = len_prots[prot]
    # NOTE(review): this check still admits `start == prot_len` and
    # `end == prot_len + 1`, which looks one position too permissive for
    # 0-based, half-open coordinates. It is kept unchanged so that it
    # stays in agreement with `Proteome.coords_in_bounds` -- confirm
    # before tightening either of them.
    if start > prot_len or end > prot_len + 1 or start < 0:
        return ""
    return proteome_sequence(prot, start, end)

class Proteome(Sequence):
    """Provides access to an organism's proteomic sequence.

    It supports retrieving parts of the sequence and converting these parts
    into their one-hot encodings. It is essentially a wrapper class around the
    `pyfaidx.Fasta` class.

    Parameters
    ----------
    input_path : str
        Path to an indexed FASTA file containing amino acid sequences,
        that is, a `*.fasta` file with a corresponding `*.fai` file in the
        same directory. File should contain the sequences from which
        training examples will be created.

    Attributes
    ----------
    proteome : pyfaidx.Fasta
        The FASTA or FAA file containing the protein sequences.
    prots : list(str)
        The list of protein names.
    len_prots : dict
        A dictionary that maps protein names to the lengths, and does so for
        all protein sequences in the proteome.

    """

    BASES_ARR = ['A', 'R', 'N', 'D', 'C', 'E', 'Q', 'G', 'H', 'I',
                 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V']
    """
    This is an array with the alphabet (i.e. all possible symbols
    that may occur in a sequence). We expect that
    `INDEX_TO_BASE[i]==BASES_ARR[i]` is `True` for all valid `i`.

    """

    BASE_TO_INDEX = {
        'A': 0, 'R': 1, 'N': 2, 'D': 3, 'C': 4,
        'E': 5, 'Q': 6, 'G': 7, 'H': 8, 'I': 9,
        'L': 10, 'K': 11, 'M': 12, 'F': 13, 'P': 14,
        'S': 15, 'T': 16, 'W': 17, 'Y': 18, 'V': 19
    }
    """
    A dictionary mapping members of the alphabet (i.e. all
    possible symbols that can occur in a sequence) to integers.

    """

    INDEX_TO_BASE = {
        0: 'A', 1: 'R', 2: 'N', 3: 'D', 4: 'C',
        5: 'E', 6: 'Q', 7: 'G', 8: 'H', 9: 'I',
        10: 'L', 11: 'K', 12: 'M', 13: 'F', 14: 'P',
        15: 'S', 16: 'T', 17: 'W', 18: 'Y', 19: 'V'
    }
    """
    A dictionary mapping integers to members of the alphabet (i.e.
    all possible symbols that can occur in a sequence). We expect
    that `INDEX_TO_BASE[i]==BASES_ARR[i]` is `True` for all
    valid `i`.

    """

    UNK_BASE = "X"
    """
    This is a base used to represent unknown positions. This is not
    the same as a character from outside the sequence's alphabet. A
    character from outside the alphabet is an error. A position with
    an unknown base signifies that the position is one of the bases
    from the alphabet, but we are uncertain which.

    """

    def __init__(self, input_path):
        """
        Constructs a `Proteome` object.

        Parameters
        ----------
        input_path : str
            Path handed to `pyfaidx.Fasta`.

        """
        self.proteome = pyfaidx.Fasta(input_path)
        # Protein names are kept in sorted order so that `get_prots` and
        # `get_prot_lens` return them in a stable order.
        self.prots = sorted(self.proteome.keys())
        self.len_prots = self._get_len_prots()

    def get_prots(self):
        """Gets the list of protein names.

        Returns
        -------
        list(str)
            A list of the protein names.

        """
        return self.prots

    def get_prot_lens(self):
        """
        Gets the name and length of each protein sequence in the file.

        Returns
        -------
        list(tuple(str, int))
            A list of tuples of protein names and protein lengths, in
            the same order as `self.prots`.

        """
        return [(k, self.len_prots[k]) for k in self.prots]

    def _get_len_prots(self):
        """
        Builds the mapping from protein names to sequence lengths.

        Returns
        -------
        dict
            A dictionary mapping the names of proteins to their lengths.

        """
        return {prot: len(self.proteome[prot]) for prot in self.prots}

    def _proteome_sequence(self, prot, start, end):
        """
        Fetches a slice of one protein's sequence from the FASTA file.

        Parameters
        ----------
        prot : str
            The name of the protein, e.g. "YFP".
        start : int
            The start of the slice applied to the protein's record.
        end : int
            The end of the slice applied to the protein's record.

        Returns
        -------
        str
            The amino acid sequence at the query coordinates.

        """
        return self.proteome[prot][start:end].seq

    def coords_in_bounds(self, prot, start, end):
        """Check if the coordinates we want to query is valid.

        Parameters
        ----------
        prot : str
            The name of the protein, e.g. "YFP".
        start : int
            The 0-based start coordinate of the first position in the
            sequence.
        end : int
            One past the 0-based last position in the sequence.

        Returns
        -------
        bool
            A boolean indicating whether we can retrieve a sequence from the
            queried coordinates.

        Raises
        ------
        KeyError
            If `prot` is not a key of `self.len_prots`.

        """
        prot_len = self.len_prots[prot]
        # NOTE(review): `start == prot_len` and `end == prot_len + 1` are
        # still accepted here, which looks one position too permissive for
        # 0-based, half-open coordinates. Kept unchanged to preserve the
        # existing behaviour -- confirm before tightening.
        return not (start > prot_len or end > prot_len + 1 or start < 0)

    def get_sequence_from_coords(self, prot, start, end):
        """Gets the queried protein sequence at the input coordinates.

        Parameters
        ----------
        prot : str
            The protein name, e.g. "YFP".
        start : int
            The 0-based start coordinate of the first position in the
            sequence.
        end : int
            One past the 0-based last position in the sequence.

        Returns
        -------
        str
            The sequence of :math:`L` amino acids at the specified
            coordinates, where :math:`L = end - start`.

        """
        return _get_sequence_from_coords(
            self.len_prots, self._proteome_sequence, prot, start, end)

    def get_encoding_from_coords(self, prot, start, end):
        """Gets the one-hot encoding of the protein's sequence at the
        input coordinates.

        Parameters
        ----------
        prot : str
            The name of the protein, e.g. "YFP".
        start : int
            The 0-based start coordinate of the first position in the
            sequence.
        end : int
            One past the 0-based last position in the sequence.

        Returns
        -------
        numpy.ndarray, dtype=numpy.float32
            The :math:`L \\times 20` encoding of the sequence, where
            :math:`L = end - start`.

        """
        sequence = self.get_sequence_from_coords(prot, start, end)
        return self.sequence_to_encoding(sequence)

    @classmethod
    def sequence_to_encoding(cls, sequence):
        """Converts an input sequence to its one-hot encoding.

        Parameters
        ----------
        sequence : str
            The input sequence of amino acids of length :math:`L`.

        Returns
        -------
        numpy.ndarray, dtype=numpy.float32
            The :math:`L \\times 20` array, where `L` was the length of the
            input sequence.

        """
        # Delegates to the module-level helper of the same name, supplying
        # this class's alphabet.
        return sequence_to_encoding(sequence, cls.BASE_TO_INDEX, cls.BASES_ARR)

    @classmethod
    def encoding_to_sequence(cls, encoding):
        """Converts an input one-hot encoding to its amino acid sequence.

        Parameters
        ----------
        encoding : numpy.ndarray, dtype=numpy.float32
            The :math:`L \\times 20` encoding of the sequence, where
            :math:`L` is the length of the output amino acid sequence.

        Returns
        -------
        str
            The sequence of :math:`L` amino acids decoded from the input
            array.

        """
        # Delegates to the module-level helper of the same name, supplying
        # this class's alphabet and unknown-base symbol.
        return encoding_to_sequence(encoding, cls.BASES_ARR, cls.UNK_BASE)