# Source code for selene_sdk.sequences.genome

"""
This module provides the Genome class. This class wraps the indexed
FASTA file for an organism's genomic sequence. It supports retrieving
parts of the sequence and converting these parts into their one-hot
encodings.

"""
import numpy as np
import pkg_resources
import pyfaidx
import tabix

from .sequence import Sequence
from .sequence import sequence_to_encoding
from .sequence import encoding_to_sequence

def _not_blacklist_region(chrom, start, end, blacklist_tabix):
"""
Check if the input coordinates are not overlapping with blacklist regions.

Parameters
----------
chrom : str
The name of the chromosomes, e.g. "chr1".
start : int
The 0-based start coordinate of the sequence.
end : int
One past the last coordinate of the sequence.
blacklist_tabix : tabix.open or None, optional
Default is None. Tabix file handle if a file of blacklist regions
is available.

Returns
-------
bool
False if the coordinates are overlaping with blacklist regions
(if specified). Otherwise, return True.

"""
if blacklist_tabix is not None:
try:
rows = blacklist_tabix.query(chrom, start, end)
for row in rows:
return False
except tabix.TabixError:
pass
return True

def _check_coords(len_chrs,
chrom,
start,
end,
blacklist_tabix=None):
"""
Check if the input coordinates are valid.

Parameters
----------
len_chrs : dict
A dictionary mapping chromosome names to lengths.
chrom : str
The name of the chromosomes, e.g. "chr1".
start : int
The 0-based start coordinate of the sequence.
end : int
One past the last coordinate of the sequence.
Default is False. Allow coordinates that are partially
out of bounds.
blacklist_tabix : tabix.open or None, optional
Default is None. Tabix file handle if a file of blacklist regions
is available.

Returns
-------
bool
True if the coordinates are valid (start and end are within
chromosome boundaries and not overlaping with blacklist regions
(if specified). Otherwise, return False.

"""
return chrom in len_chrs and \
start < len_chrs[chrom] and \
start < end and \
end > 0 and \
(start >= 0 if not pad else True) and \
(end <= len_chrs[chrom] if not pad else True) and \
_not_blacklist_region(chrom, start, end, blacklist_tabix)

def _get_sequence_from_coords(len_chrs,
genome_sequence,
chrom,
start,
end,
strand='+',
blacklist_tabix=None):
"""
Gets the genomic sequence at the input coordinates.

Parameters
----------
len_chrs : dict
A dictionary mapping chromosome names to lengths.
genome_sequence : function
A closure that extracts a sequence from a genome.
chrom : str
The name of the chromosomes, e.g. "chr1".
start : int
The 0-based start coordinate of the sequence.
end : int
One past the last coordinate of the sequence.
strand : {'+', '-', '.'}, optional
Default is '+'. The strand the sequence is located on. '.' is treated
as '+'.
Default is False. If the coordinates are out of bounds, make an
in-bounds query and then pad the sequence to return the desired
sequence length.
blacklist_tabix : tabix.open or None, optional
Default is None. Tabix file handle if a file of blacklist regions
is available.

Returns
-------
str
The genomic sequence occurring at the input coordinates.

Raises
------
ValueError
If the input char to strand is not one of the specified
choices.

"""

if not _check_coords(len_chrs,
chrom,
start,
end,
blacklist_tabix=blacklist_tabix):
return ""

if strand != '+' and strand != '-' and strand != '.':
raise ValueError(
"Strand must be one of '+', '-', or '.'. Input was {0}".format(
strand))

if end > len_chrs[chrom]:
end = len_chrs[chrom]
if start < 0:
start = 0
genome_sequence(chrom, start, end, strand) +
[docs]class Genome(Sequence): """This class provides access to an organism's genomic sequence. This class supports retrieving parts of the sequence and converting these parts into their one-hot encodings. It is essentially a wrapper class around the pyfaidx.Fasta class. Parameters ---------- input_path : str Path to an indexed FASTA file, that is, a *.fasta file with a corresponding *.fai file in the same directory. This file should contain the target organism's genome sequence. blacklist_regions : str or None, optional Default is None. Path to a tabix-indexed list of regions from which we should not output sequences. This is used to ensure that we are not sampling from areas where we will never collect measurements. You can pass as input "hg19" or "hg38" to use the blacklist regions released by ENCODE. You can also pass in your own tabix-indexed .gz file. bases_order : list(str) or None, optional Default is None (use the default base ordering of ['A', 'C', 'G', 'T']). Specify a different ordering of DNA bases for one-hot encoding. Attributes ---------- genome : pyfaidx.Fasta The FASTA file containing the genome sequence. chrs : list(str) The list of chromosome names. len_chrs : dict A dictionary mapping the names of each chromosome in the file to the length of said chromosome. """ BASES_ARR = ['A', 'C', 'G', 'T'] """ This is an array with the alphabet (i.e. all possible symbols that may occur in a sequence). We expect that INDEX_TO_BASE[i]==BASES_ARR[i] is True for all valid i. """ BASE_TO_INDEX = { 'A': 0, 'C': 1, 'G': 2, 'T': 3, 'a': 0, 'c': 1, 'g': 2, 't': 3, } """ A dictionary mapping members of the alphabet (i.e. all possible symbols that can occur in a sequence) to integers. """ INDEX_TO_BASE = { 0: 'A', 1: 'C', 2: 'G', 3: 'T' } """ A dictionary mapping integers to members of the alphabet (i.e. all possible symbols that can occur in a sequence). We expect that INDEX_TO_BASE[i]==BASES_ARR[i] is True for all valid i. """ COMPLEMENTARY_BASE_DICT = { 'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N', 'a': 'T', 'c': 'G', 'g': 'C', 't': 'A', 'n': 'N' } """ A dictionary mapping each base to its complementary base. """ UNK_BASE = "N" """ This is a base used to represent unknown positions. This is not the same as a character from outside the sequence's alphabet. A character from outside the alphabet is an error. A position with an unknown base signifies that the position is one of the bases from the alphabet, but we are uncertain which. """ def __init__(self, input_path, blacklist_regions=None, bases_order=None): """ Constructs a Genome object. """ self.genome = pyfaidx.Fasta(input_path) self.chrs = sorted(self.genome.keys()) self.len_chrs = self._get_len_chrs() self._blacklist_tabix = None if blacklist_regions == "hg19": self._blacklist_tabix = tabix.open( pkg_resources.resource_filename( "selene_sdk", "sequences/data/hg19_blacklist_ENCFF001TDO.bed.gz")) elif blacklist_regions == "hg38": self._blacklist_tabix = tabix.open( pkg_resources.resource_filename( "selene_sdk", "sequences/data/hg38.blacklist.bed.gz")) elif blacklist_regions is not None: # user-specified file self._blacklist_tabix = tabix.open( blacklist_regions) if bases_order is not None: bases = [str.upper(b) for b in bases_order] self.BASES_ARR = bases lc_bases = [str.lower(b) for b in bases] self.BASE_TO_INDEX = { **{b: ix for (ix, b) in enumerate(bases)}, **{b: ix for (ix, b) in enumerate(lc_bases)}} self.INDEX_TO_BASE = {ix: b for (ix, b) in enumerate(bases)} self.update_bases_order(bases) @classmethod def update_bases_order(cls, bases): cls.BASES_ARR = bases lc_bases = [str.lower(b) for b in bases] cls.BASE_TO_INDEX = { **{b: ix for (ix, b) in enumerate(bases)}, **{b: ix for (ix, b) in enumerate(lc_bases)}} cls.INDEX_TO_BASE = {ix: b for (ix, b) in enumerate(bases)}
[docs] def get_sequence_from_coords(self, chrom, start, end, strand='+', pad=False): """ Gets the queried chromosome's sequence at the input coordinates. Parameters ---------- chrom : str The name of the chromosomes, e.g. "chr1". start : int The 0-based start coordinate of the sequence. end : int One past the 0-based last position in the sequence. strand : {'+', '-', '.'}, optional Default is '+'. The strand the sequence is located on. '.' is treated as '.'. pad : bool, optional Default is False. Pad the output sequence with 'N' if start and/or end are out of bounds to return a sequence of length end - start. Returns ------- str The genomic sequence of length :math:L where :math:L = end - start. If pad is False and one/both of start and end are out of bounds, will return an empty string. Also returns an empty string if chrom cannot be found in the input FASTA file. Otherwise, will return the sequence with padding at the start/end if appropriate. Raises ------ ValueError If the input char to strand is not one of the specified choices. """ return _get_sequence_from_coords(self.len_chrs, self._genome_sequence, chrom, start, end, strand=strand, pad=pad, blacklist_tabix=self._blacklist_tabix)
[docs] def get_encoding_from_coords(self, chrom, start, end, strand='+', pad=False): """Gets the one-hot encoding of the genomic sequence at the queried coordinates. Parameters ---------- chrom : str The name of the chromosome or region, e.g. "chr1". start : int The 0-based start coordinate of the first position in the sequence. end : int One past the 0-based last position in the sequence. strand : {'+', '-', '.'}, optional Default is '+'. The strand the sequence is located on. '.' is treated as '+'. pad : bool, optional Default is False. Pad the output sequence with 'N' if start and/or end are out of bounds to return a sequence of length end - start. Returns ------- numpy.ndarray, dtype=numpy.float32 The :math:L \\times 4 encoding of the sequence, where :math:L = end - start, unless chrom cannot be found in the input FASTA, start or end are out of bounds, or (if a blacklist exists) the region overlaps with a blacklist region. In these cases, it will return an empty encoding--that is, L = 0 for the NumPy array returned. Raises ------ ValueError If the input char to strand is not one of the specified choices. (Raised in the call to self.get_sequence_from_coords) """ sequence = self.get_sequence_from_coords( chrom, start, end, strand=strand, pad=pad) encoding = self.sequence_to_encoding(sequence) return encoding
[docs] def get_encoding_from_coords_check_unk(self, chrom, start, end, strand='+', pad=False): """Gets the one-hot encoding of the genomic sequence at the queried coordinates and check whether the sequence contains unknown base(s). Parameters ---------- chrom : str The name of the chromosome or region, e.g. "chr1". start : int The 0-based start coordinate of the first position in the sequence. end : int One past the 0-based last position in the sequence. strand : {'+', '-', '.'}, optional Default is '+'. The strand the sequence is located on. '.' is treated as '+'. pad : bool, optional Default is False. Pad the output sequence with 'N' if start and/or end are out of bounds to return a sequence of length end - start. Returns ------- tuple(numpy.ndarray, bool) * tuple[0] is the :math:L \\times 4 encoding of the sequence containing data of numpy.float32 type, where :math:L = end - start, unless chrom cannot be found in the input FASTA, start or end are out of bounds, or (if a blacklist exists) the region overlaps with a blacklist region. In these cases, it will return an empty encoding--that is, L = 0 for the NumPy array returned. * tuple[1] is the boolean value that indicates whether the sequence contains any unknown base(s) specified in self.UNK_BASE Raises ------ ValueError If the input char to strand is not one of the specified choices. (Raised in the call to self.get_sequence_from_coords) """ sequence = self.get_sequence_from_coords( chrom, start, end, strand=strand, pad=pad) encoding = self.sequence_to_encoding(sequence) return encoding, self.UNK_BASE in sequence
[docs] @classmethod def sequence_to_encoding(cls, sequence): """Converts an input sequence to its one-hot encoding. Parameters ---------- sequence : str A nucleotide sequence of length :math:L Returns ------- numpy.ndarray, dtype=numpy.float32 The :math:L \\times 4 one-hot encoding of the sequence. """ return sequence_to_encoding(sequence, cls.BASE_TO_INDEX, cls.BASES_ARR)
[docs] @classmethod def encoding_to_sequence(cls, encoding): """Converts an input one-hot encoding to its DNA sequence. Parameters ---------- encoding : numpy.ndarray, dtype=numpy.float32 An :math:L \\times 4 one-hot encoding of the sequence, where :math:L is the length of the output sequence. Returns ------- str The sequence of :math:L nucleotides decoded from the input array. """ return encoding_to_sequence(encoding, cls.BASES_ARR, cls.UNK_BASE)