Source code for selene_sdk.sequences.proteome

"""
This module provides the `Proteome` class. This class wraps the
indexed FASTA file for an organism's proteomic sequence. It supports
retrieving parts of the sequence and converting these parts into their
one-hot encodings.

"""
import numpy as np
import pyfaidx

from .sequence import Sequence
from .sequence import sequence_to_encoding
from .sequence import encoding_to_sequence


def _get_sequence_from_coords(len_prots, proteome_sequence,
                              prot, start, end):
    """
    Gets the amino acid sequence at specified coordinates.

    Parameters
    ----------
    len_prots : dict
        A dictionary mapping protein names to lengths.
    proteome_sequence : function
        A closure that returns the sequence at given coordinates.
    prot : str
        The name of a protein, e.g. "YFP".
    start : int
        The 0-based start coordinate of the first position in the
        sequence.
    end : int
        One past the 0-based last position in the sequence.

    Returns
    -------
    str
        The amino acid sequence.

    """
    if start > len_prots[prot] or (end > len_prots[prot] + 1) or start < 0:
        return ""
    return proteome_sequence(prot, start, end)


class Proteome(Sequence):
    """Provides access to an organism's proteomic sequence.

    It supports retrieving parts of the sequence and converting these
    parts into their one-hot encodings. It is essentially a wrapper
    class around the `pyfaidx.Fasta` class.

    All coordinates accepted by this class are 0-based and half-open,
    i.e. they describe the interval `[start, end)`.

    Parameters
    ----------
    input_path : str
        Path to an indexed FASTA file containing amino acid
        sequences, that is, a `*.fasta` file with a corresponding
        `*.fai` file in the same directory. File should contain the
        sequences from which training examples will be created.

    Attributes
    ----------
    proteome : pyfaidx.Fasta
        The FASTA or FAA file containing the protein sequences.
    prots : list(str)
        The list of protein names, in sorted order.
    len_prots : dict
        A dictionary that maps protein names to the lengths, and does so
        for all protein sequences in the proteome.

    """

    BASES_ARR = ['A', 'R', 'N', 'D', 'C', 'E', 'Q', 'G', 'H', 'I',
                 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V']
    """
    This is an array with the alphabet (i.e. all possible symbols
    that may occur in a sequence). We expect that
    `INDEX_TO_BASE[i]==BASES_ARR[i]` is `True` for all valid `i`.
    """

    BASE_TO_INDEX = {
        'A': 0, 'R': 1, 'N': 2, 'D': 3, 'C': 4, 'E': 5, 'Q': 6,
        'G': 7, 'H': 8, 'I': 9, 'L': 10, 'K': 11, 'M': 12, 'F': 13,
        'P': 14, 'S': 15, 'T': 16, 'W': 17, 'Y': 18, 'V': 19
    }
    """
    A dictionary mapping members of the alphabet (i.e. all
    possible symbols that can occur in a sequence) to integers.
    """

    INDEX_TO_BASE = {0: 'A', 1: 'R', 2: 'N', 3: 'D', 4: 'C', 5: 'E', 6: 'Q',
                     7: 'G', 8: 'H', 9: 'I', 10: 'L', 11: 'K', 12: 'M',
                     13: 'F', 14: 'P', 15: 'S', 16: 'T', 17: 'W', 18: 'Y',
                     19: 'V'}
    """
    A dictionary mapping integers to members of the alphabet (i.e.
    all possible symbols that can occur in a sequence). We expect
    that `INDEX_TO_BASE[i]==BASES_ARR[i]` is `True` for all
    valid `i`.
    """

    UNK_BASE = "X"
    """
    This is a base used to represent unknown positions. This is not
    the same as a character from outside the sequence's alphabet. A
    character from outside the alphabet is an error. A position with
    an unknown base signifies that the position is one of the bases
    from the alphabet, but we are uncertain which.
    """

    def __init__(self, input_path):
        """
        Constructs a `Proteome` object.

        Opens the FASTA file at `input_path` with `pyfaidx.Fasta`,
        stores the sorted protein names and caches the length of every
        protein.
        """
        self.proteome = pyfaidx.Fasta(input_path)
        self.prots = sorted(self.proteome.keys())
        self.len_prots = self._get_len_prots()

    def get_prots(self):
        """Gets the list of protein names.

        Returns
        -------
        list(str)
            A list of the protein names. This is the list stored on
            the instance, not a copy.

        """
        return self.prots

    def get_prot_lens(self):
        """
        Gets the name and length of each protein sequence in the file.

        Returns
        -------
        list(tuple(str, int))
            A list of tuples of protein names and protein lengths, in
            the same order as `prots`.

        """
        return [(name, self.len_prots[name]) for name in self.prots]

    def _get_len_prots(self):
        """
        Builds the lookup table of protein lengths.

        Returns
        -------
        dict
            A dictionary mapping the names of proteins to their lengths.

        """
        return {name: len(self.proteome[name]) for name in self.prots}

    def _proteome_sequence(self, prot, start, end):
        """
        Reads a slice of a protein from the underlying FASTA file.

        No bounds checking is done here; callers are expected to have
        validated the coordinates already.

        Returns
        -------
        str
            The amino acid sequence at the query coordinates.
        """
        return self.proteome[prot][start:end].seq

    def coords_in_bounds(self, prot, start, end):
        """Check if the coordinates we want to query are valid.

        Parameters
        ----------
        prot : str
            The name of the protein, e.g. "YFP".
        start : int
            The 0-based start coordinate of the first position in the
            sequence.
        end : int
            One past the 0-based last position in the sequence.

        Returns
        -------
        bool
            A boolean indicating whether we can retrieve a sequence from
            the queried coordinates. It is `False` when `start` is
            negative, `start` is greater than the protein length, or
            `end` is greater than the protein length.

        Raises
        ------
        KeyError
            If `prot` is not one of the proteins in `len_prots`.

        """
        prot_len = self.len_prots[prot]
        # `end` is exclusive, so the largest valid value is exactly
        # `prot_len`; anything larger reaches past the last residue.
        return not (start < 0 or start > prot_len or end > prot_len)

    def get_sequence_from_coords(self, prot, start, end):
        """Gets the queried protein sequence at the input coordinates.

        Parameters
        ----------
        prot : str
            The protein name, e.g. "YFP".
        start : int
            The 0-based start coordinate of the first position in the
            sequence.
        end : int
            One past the 0-based last position in the sequence.

        Returns
        -------
        str
            The sequence of :math:`L` amino acids at the specified
            coordinates, where :math:`L = end - start`. An empty string
            is returned when `coords_in_bounds` rejects the
            coordinates.

        Raises
        ------
        KeyError
            If `prot` is not one of the proteins in `len_prots`.

        """
        # Validate with the same predicate that is exposed publicly so
        # that both methods always agree on what is in bounds.
        if not self.coords_in_bounds(prot, start, end):
            return ""
        return _get_sequence_from_coords(
            self.len_prots, self._proteome_sequence, prot, start, end)

    def get_encoding_from_coords(self, prot, start, end):
        """Gets the one-hot encoding of the protein's sequence at the
        input coordinates.

        Parameters
        ----------
        prot : str
            The name of the protein, e.g. "YFP".
        start : int
            The 0-based start coordinate of the first position in the
            sequence.
        end : int
            One past the 0-based last position in the sequence.

        Returns
        -------
        numpy.ndarray, dtype=numpy.float32
            The :math:`L \\times 20` encoding of the sequence, where
            :math:`L = end - start`.

        """
        sequence = self.get_sequence_from_coords(prot, start, end)
        return self.sequence_to_encoding(sequence)

    @classmethod
    def sequence_to_encoding(cls, sequence):
        """Converts an input sequence to its one-hot encoding.

        Delegates to the module-level `sequence_to_encoding` helper
        using this class's `BASE_TO_INDEX` and `BASES_ARR`.

        Parameters
        ----------
        sequence : str
            The input sequence of amino acids of length :math:`L`.

        Returns
        -------
        numpy.ndarray, dtype=numpy.float32
            The :math:`L \\times 20` array, where `L` was the length of
            the input sequence.

        """
        return sequence_to_encoding(sequence, cls.BASE_TO_INDEX, cls.BASES_ARR)

    @classmethod
    def encoding_to_sequence(cls, encoding):
        """Converts an input one-hot encoding to its amino acid sequence.

        Delegates to the module-level `encoding_to_sequence` helper
        using this class's `BASES_ARR` and `UNK_BASE`.

        Parameters
        ----------
        encoding : numpy.ndarray, dtype=numpy.float32
            The :math:`L \\times 20` encoding of the sequence, where
            :math:`L` is the length of the output amino acid sequence.

        Returns
        -------
        str
            The sequence of :math:`L` amino acids decoded from the
            input array.

        """
        return encoding_to_sequence(encoding, cls.BASES_ARR, cls.UNK_BASE)