Source code for selene_sdk.interpret.vis

"""
This module provides the methods for visualizing different ouputs
from selene analysis methods.

"""
from collections import defaultdict
from copy import deepcopy
import os
import re
import warnings

import numpy as np
import matplotlib.pyplot as plt
from matplotlib import transforms
from matplotlib.path import Path
from matplotlib.patches import PathPatch
import matplotlib.patheffects
from matplotlib.text import TextPath
import pkg_resources
from plotly.offline import download_plotlyjs, plot
import plotly.graph_objs as go
import seaborn as sns
import tabix

from ..sequences import Genome


_SVG_PATHS = {'T': "M 0,100 l 100, 0 l 0,-25 l -37.5, 0 l 0,-75 l -25, 0 " +
                   "l 0,75 l -37.5,0 l 0,25 z",
              'C': ("M 100,12.5 l 0,25 c 0,0 -25,-15 -50,-12.5 " +
                    "c 0,0 -25,0 -25,25 c 0,0 0,25 25,25 c 0,0 25,2.5 50,-15" +
                    " l 0, 25 C 100,87.5 75,100 50,100 C 50,100 0,100 0,50 " +
                    "C 0,50 0,0 50,0 C 50,0 75,0 100,12.5 z"),
              'G': ("M 100,12.5 l 0,25 c 0,0 -25,-15 -50,-12.5 " +
                    "c 0,0 -25,0 -25,25 c 0,0 0,25 25,25 c 0,0 25,2.5 50,-15" +
                    " l 0, 25 C 100,87.5 75,100 50,100 C 50,100 0,100 0,50 " +
                    "C 0,50 0,0 50,0 C 50,0 75,0 100,12.5 M 100,37.5 " +
                    "l 0,17.5 l -50,0 l 0,-17 l 25,0 l 0,-25 l 25,0 z"),
              'A': ("M 0,0 l 37.5,100 l 25,0 l 37.5,-100 l -25,0 l -9.375,25" +
                    " l -31.25,0 l -9.375,-25 l -25,0 z 0,0 M 43.75, 50 " +
                    "l 12.5,0 l -5.859375,15.625 l -5.859375,-15.625 z"),
              'U': ("M 0,100 l 25,0 l 0,-50 C 25,50 25,25 50,25" +
                    " C 50,25 75,25 75,50 l 0,50 l 25,0 L 100,50 " +
                    "C 100,50 100,0, 50,0 C 50,0 0,0 0,50 l 0,50 z")}


def _svg_parse(path_string):
    """Functionality for parsing a string from source vector graphics.

    Source is from `matplotlib.org/2.1.1/gallery/showcase/firefox.html`
    with minor modifications.

    Parameters
    ----------
    path_string : str
        String containing the path code from an SVG file.

    Returns
    -------
    list(numpy.uint8), numpy.ndarray, dtype=np.float32
        A 2-tuple containing code types and coordinates for a matplotlib
        path.

    """
    commands = {'M': (Path.MOVETO,),
                'L': (Path.LINETO,),
                'Q': (Path.CURVE3,)*2,
                'C': (Path.CURVE4,)*3,
                'Z': (Path.CLOSEPOLY,)}
    path_re = re.compile(r'([MLHVCSQTAZ])([^MLHVCSQTAZ]+)', re.IGNORECASE)
    float_re = re.compile(r'(?:[\s,]*)([+-]?\d+(?:\.\d+)?)')
    vertices = []
    codes = []
    last = (0, 0)
    for cmd, values in path_re.findall(path_string):
        points = [float(v) for v in float_re.findall(values)]
        points = np.array(points).reshape((len(points)//2, 2))
        if cmd.islower():
            points += last
        cmd = cmd.capitalize()
        if len(points) > 0:
            last = points[-1]
        codes.extend(commands[cmd])
        vertices.extend(points.tolist())
    return np.array(vertices), codes


for k in _SVG_PATHS.keys():
    _SVG_PATHS[k] = _svg_parse(_SVG_PATHS[k])


class _TextPathRenderingEffect(matplotlib.patheffects.AbstractPathEffect):
    """This class provides an effect for continuously rendering a text
    path over another path.

    """
    def __init__(self, bar, x_translation=0., y_translation=0.,
                 x_scale=1., y_scale=1.):
        """This is a class for re-rendering text paths and preserving
        their scale.

        Parameters
        ----------
        bar : matplotlib.patches.Patch
            The patch where the letter is.
        x_translation : float, optional
            Default is 0. Amount by which to translate the x coordinate.
        y_translation : float, optional
            Default is 0. Amount by which to translate the y coordinate.
        x_scale : float, optional
            Default is 1. Amount by which to scale the width.
        y_scale : float, optional
            Default is 1. Amount by which to scale the height.

        """
        self._bar = bar
        self._x_translation = x_translation
        self._y_translation = y_translation
        self._x_scale = x_scale
        self._y_scale = y_scale

    def draw_path(self, renderer, gc, tpath, affine, rgbFace=None):
        """Redraws the path.

        """
        b_x, b_y, b_w, b_h = self._bar.get_extents().bounds
        t_x, t_y, t_w, t_h = tpath.get_extents().bounds
        translation = [b_x - t_x, b_y - t_y]
        translation[0] += self._x_translation
        translation[1] += self._y_translation
        scale = [b_w / t_w, b_h / t_h]
        scale[0] *= self._x_scale
        scale[1] *= self._y_scale
        affine = affine.identity().scale(*scale).translate(*translation)
        renderer.draw_path(gc, tpath, affine, rgbFace)


[docs]def sequence_logo(score_matrix, order="value", width=1.0, ax=None,
                  sequence_type=Genome, font_properties=None,
                  color_scheme=None,
                  **kwargs):
    """Plots a sequence logo for visualizing motifs.

    Parameters
    ----------
    score_matrix : np.ndarray
        An :math:`L \\times N` array (where :math:`L` is the length of
        the sequence, and :math:`N` is the size of the alphabet)
        containing the scores for each base occuring at each position.
    order : {'alpha', 'value'}
        The manner by which to sort the bases stacked at each position
        in the sequence logo plot.

            * 'alpha' - Bases go in the order they are found in the\
                      sequence alphabet.
            * 'value' - Bases go in the order of their value, with the\
                        largest at the bottom.
    width : float, optional
        Default is 1. The width of each character in the plotted logo.
        A value of 1 will mean that there is no gap between each the
        characters at each position. A value of 0 will not draw any
        characters.
    ax : matplotlib.pyplot.Axes or None, optional
        Default is `None`. The axes to plot on. If left as `None`, a new
        axis will be created.
    sequence_type : class, optional
        Default is `selene_sdk.sequences.Genome`. The type of sequence that
        the *in silico* mutagenesis results are associated with. This
        should generally be a subclass of `selene_sdk.sequences.Sequence`.
    font_properties : matplotlib.font_manager.FontProperties or None, optional
        Default is `None`. A `matplotlib.font_manager.FontProperties`
        object that specifies the properties of the font to use for
        plotting the motif. If `None`, no font will be used, and the
        text will be rendered by a path. This method of rendering paths
        is  preferred, as it ensures all character heights correspond to
        the actual values, and that there are no extra gaps between
        the tops and bottoms of characters at each position in the
        sequence logo. If the user opts to use a value other
        than `None`, then no such guarantee can be made.
    color_scheme : list(str) or None, optional
        Default is `None`. A list containing the hex codes or names of
        colors to use, appearing in the order of the bases of the
        sequence type. If left as `None`, a default palette will be made
        with `seaborn.color_palette`, and will have as many
        colors as there are characters in the input sequence alphabet.

    Returns
    -------
    matplotlib.pyplot.Axes
        The axes containing the sequence logo plot.

    Raises
    ------
    ValueError
        If the number of columns in `score_matrix` does not match the
        number of characters in the alphabet of `sequence_type`.

    ValueError
        If the number of colors in `color_palette` does not match the
        number of characters in the alphabet of `sequence_type`.

    Examples
    --------
    We have included an example of the output from a `sequence_logo`
    plot below:

    .. image:: ../../docs/source/_static/img/sequence_logo_example.png

    """
    # Note that everything will break if we do not deepcopy.
    score_matrix = deepcopy(score_matrix)

    score_matrix = score_matrix.transpose()
    if font_properties is not None:
        warnings.warn(
            "Specifying a value for `font_properties` (other than `None`) "
            "will use the `matplotlib`-based character paths, and causes "
            "distortions in the plotted motif. We recommend leaving "
            "`font_properties=None`. See the documentation for details.",
            UserWarning)

    if color_scheme is None:
        color_scheme = sns.color_palette("Set1",
                                         n_colors=len(sequence_type.BASES_ARR))
        color_scheme = color_scheme.as_hex()
    if len(color_scheme) < len(sequence_type.BASES_ARR):
        raise ValueError(
            "Color scheme is shorter than number of bases in sequence.")

    if score_matrix.shape[0] != len(sequence_type.BASES_ARR):
        raise ValueError(
            "Got score with {0} bases for sequence with {1} bases.".format(
                score_matrix.shape[0], len(sequence_type.BASES_ARR)))
    if ax is None:
        _, ax = plt.subplots(figsize=score_matrix.shape)

    # Determine offsets depending on sort order.
    positive_offsets = np.zeros_like(score_matrix)
    negative_offsets = np.zeros_like(score_matrix)
    bases = np.empty(score_matrix.shape, dtype=object)
    bases[:, :] = "?"  # This ensures blanks are visually obvious.

    # Change ordering of things based on input arguments.
    if order == "alpha":
        for i in range(score_matrix.shape[0]):
            bases[i, :] = sequence_type.BASES_ARR[i]

    elif order == "value":
        if np.sum(score_matrix < 0) != 0:
            sorted_scores = np.zeros_like(score_matrix)
            for j in range(score_matrix.shape[1]):
                # Sort the negative values and put them at bottom.
                div = np.sum(score_matrix[:, j] < 0.)
                negative_idx = np.argwhere(score_matrix[:, j] < 0.).flatten()
                negative_sort_idx = np.argsort(score_matrix[negative_idx, j],
                                               axis=None)
                sorted_scores[:div, j] = score_matrix[
                    negative_idx[negative_sort_idx], j]
                bases[:div, j] = sequence_type.BASES_ARR[
                    negative_idx[negative_sort_idx]].flatten()

                # Sort the positive values and stack atop the negatives.
                positive_idx = np.argwhere(score_matrix[:, j] >= 0.).flatten()
                positive_sort_idx = np.argsort(score_matrix[positive_idx, j],
                                               axis=None)
                sorted_scores[div:, j] = score_matrix[
                    positive_idx[positive_sort_idx], j]
                bases[div:, j] = sequence_type.BASES_ARR[
                    positive_idx[positive_sort_idx]].flatten()
            score_matrix = sorted_scores
        else:
            for j in range(score_matrix.shape[1]):
                sort_idx = np.argsort(score_matrix[:, j], axis=None)[::-1]
                bases[:, j] = sequence_type.BASES_ARR[sort_idx]
                score_matrix[:, j] = score_matrix[sort_idx, j]

    # Create offsets for each bar.
    for i in range(score_matrix.shape[0] - 1):
        y_coords = score_matrix[i, :]
        if i > 0:
            negative_offsets[i + 1, :] = negative_offsets[i, :]
            positive_offsets[i + 1, :] = positive_offsets[i, :]
        neg_idx = np.argwhere(y_coords < 0.)
        pos_idx = np.argwhere(y_coords >= 0.)
        negative_offsets[i + 1, neg_idx] += y_coords[neg_idx]
        positive_offsets[i + 1, pos_idx] += y_coords[pos_idx]

    for i in range(score_matrix.shape[0]):
        x_coords = np.arange(score_matrix.shape[1]) + 0.5
        y_coords = score_matrix[i, :]

        # Manage negatives and positives separately.
        offsets = np.zeros(score_matrix.shape[1])
        negative_idx = np.argwhere(y_coords < 0.)
        positive_idx = np.argwhere(y_coords >= 0.)
        offsets[negative_idx] = negative_offsets[i, negative_idx]
        offsets[positive_idx] = positive_offsets[i, positive_idx]
        bars = ax.bar(x_coords, y_coords, color="black", width=width,
                      bottom=offsets)
        for j, bar in enumerate(bars):
            base = bases[i, j]
            bar.set_color(color_scheme[sequence_type.BASE_TO_INDEX[base]])
            bar.set_edgecolor(None)

    # Iterate over the barplot's bars and turn them into letters.
    new_patches = []
    for i, bar in enumerate(ax.patches):
        base_idx = i // score_matrix.shape[1]
        seq_idx = i % score_matrix.shape[1]
        base = bases[base_idx, seq_idx]
        # We construct a text path that tracks the bars in the barplot.
        # Thus, the barplot takes care of scaling and translation,
        #  and we just copy it.
        if font_properties is None:
            text = Path(_SVG_PATHS[base][0], _SVG_PATHS[base][1])
        else:
            text = TextPath((0., 0.), base, fontproperties=font_properties)
        b_x, b_y, b_w, b_h = bar.get_extents().bounds
        t_x, t_y, t_w, t_h = text.get_extents().bounds
        scale = (b_w / t_w, b_h / t_h)
        translation = (b_x - t_x, b_y - t_y)
        text = PathPatch(text, facecolor=bar.get_facecolor(), lw=0.)
        bar.set_facecolor("none")
        text.set_path_effects([_TextPathRenderingEffect(bar)])
        transform = transforms.Affine2D().translate(*translation).scale(*scale)
        text.set_transform(transform)
        new_patches.append(text)

    for patch in new_patches:
        ax.add_patch(patch)
    ax.set_xlim(0, score_matrix.shape[1])
    ax.set_xticks(np.arange(score_matrix.shape[1]) + 0.5)
    ax.set_xticklabels(np.arange(score_matrix.shape[1]))
    return ax


[docs]def rescale_score_matrix(score_matrix, base_scaling="identity",
                         position_scaling="identity"):
    """Performs base-wise and position-wise scaling of a score matrix for a
    feature, usually produced from an *in silico* mutagenesis experiment.

    Parameters
    ----------
    score_matrix : numpy.ndarray
        An :math:`L \\times N` matrix containing the scores for each
        position, where :math:`L` is the length of the sequence, and
        :math:`N` is the number of characters in the alphabet.
    base_scaling : {'identity', 'probability', 'max_effect'}
        The type of scaling performed on each base at a given position.

            * 'identity' - No transformation will be applied to the\
                           data.
            * 'probability' - The relative sizes of the bases will be\
                              the original input probabilities.
            * 'max_effect' - The relative sizes of the bases will be\
                             the max effect of the original input\
                             values.
    position_scaling : {'identity', 'probability', 'max_effect'}
        The type of scaling performed on each position.

            * 'identity'    - No transformation will be applied to the data.
            * 'probability' - The sum of values at a position will be\
                              equal to the sum of the original input\
                              values at that position.
            * 'max_effect'  - The sum of values at a position will be\
                              equal to the sum of the max effect values\
                              of the original input values at that\
                              position.

    Returns
    -------
    numpy.ndarray
        The transformed score matrix.

    Raises
    ------
    ValueError
        If an unsupported `base_scaling` or `position_scaling` is
        entered.

    """
    # Note that things can break if we do not deepcopy.
    score_matrix = deepcopy(score_matrix)

    score_matrix = score_matrix.transpose()
    rescaled_scores = score_matrix

    # Scale individual bases.
    if base_scaling == "identity" or base_scaling == "probability":
        pass
    elif base_scaling == "max_effect":
        rescaled_scores = score_matrix - np.min(score_matrix, axis=0)
    else:
        raise ValueError(
            "Could not find base scaling \"{0}\".".format(base_scaling))

    # Scale each position
    if position_scaling == "max_effect":
        max_effects = np.max(score_matrix, axis=0) - np.min(score_matrix,
                                                            axis=0)
        rescaled_scores /= rescaled_scores.sum(axis=0)[np.newaxis, :]
        rescaled_scores *= max_effects[np.newaxis, :]
    elif position_scaling == "probability":
        rescaled_scores /= np.sum(score_matrix, axis=0)[np.newaxis, :]
    elif position_scaling != "identity":
        raise ValueError(
            "Could not find position scaling \"{0}\".".format(
                position_scaling))
    return rescaled_scores.transpose()


[docs]def heatmap(score_matrix, mask=None, sequence_type=Genome, **kwargs):
    """Plots the input matrix of scores, generally those produced by an
    *in silico* mutagenesis experiment, on a heatmap.

    Parameters
    ----------
    score_matrix : numpy.ndarray
        An :math:`L \\times N` array (where :math:`L` is the length of
        the sequence, and :math:`N` is the size of the alphabet)
        containing the scores for each base change at each position.
    mask : numpy.ndarray, dtype=bool or None, optional
        Default is `None`. An :math:`L \\times N` array (where :math:`L`
        is the length of the sequence, and :math:`N` is the size of the
        alphabet)  containing `True` at positions in the heatmap to
        mask. If `None`, no masking will occur.
    sequence_type : class, optional
        Default is `selene_sdk.sequences.Genome`. The class of sequence that
        the *in silico* mutagenesis results are associated with. This is
        generally a sub-class of `selene_sdk.sequences.Sequence`.
    **kwargs : dict
        Keyword arguments to pass to `seaborn.heatmap`. Some useful ones
        to remember are:

            * cbar_kws - Keyword arguments to forward to the colorbar.
            * yticklabels - Manipulate the tick labels on the y axis.
            * cbar - If `False`, hide the color bar. If `True`, show\
                     the colorbar.
            * cmap - The color map to use for the heatmap.

    Returns
    -------
    matplotlib.pyplot.Axes
        The axes containing the heatmap plot.

    """
    # Note that some things can break if we do not deepcopy.
    score_matrix = deepcopy(score_matrix)

    # This flipping is so that ordering is consistent with ordering
    # in the sequence logo.
    if mask is not None:
        mask = mask.transpose()
        mask = np.flip(mask, axis=0)
    score_matrix = score_matrix.transpose()
    score_matrix = np.flip(score_matrix, axis=0)

    if "yticklabels" in kwargs:
        yticklabels = kwargs.pop("yticklabels")
    else:
        yticklabels = sequence_type.BASES_ARR[::-1]
    if "cbar_kws" in kwargs:
        cbar_kws = kwargs.pop("cbar_kws")
    else:
        cbar_kws = dict(use_gridspec=False, location="bottom", pad=0.2)
    if "cmap" in kwargs:
        cmap = kwargs.pop("cmap")
    else:
        cmap = "Blues"
    ret = sns.heatmap(score_matrix, mask=mask, yticklabels=yticklabels,
                      cbar_kws=cbar_kws, cmap=cmap, **kwargs)
    ret.set_yticklabels(labels=ret.get_yticklabels(), rotation=0)
    return ret


[docs]def load_variant_abs_diff_scores(input_path):
    """
    Loads the variant data, labels, and feature names from a diff scores
    file output from variant effect prediction.

    TODO: should we move this out of `vis.py`?

    Parameters
    ----------
    input_path : str
        Path to the input file.

    Returns
    -------
    tuple(np.ndarray, list(tuple(str)), list(str))

        * `tuple[0]` is the matrix of absolute difference scores. The rows\
          are the variants and the columns are the features for which the\
          model makes predictions.
        * `tuple[1]` is the list of variant labels. Each tuple contains\
          (chrom, pos, name, ref, alt).
        * `tuple[2]` is the list of features.

    """
    features = []
    labels = []
    diffs = []
    with open(input_path, 'r') as file_handle:
        colnames = file_handle.readline()
        features = colnames.strip().split('\t')[5:]
        for line in file_handle:
            cols = line.strip().split('\t')
            scores = [float(f) for f in cols[5:]]
            label = tuple(cols[:5])
            diffs.append(scores)
            labels.append(label)
    diffs = np.array(diffs)
    return (diffs, labels, features)


[docs]def sort_standard_chrs(chrom):
    """
    Returns the value on which the
    standard chromosomes can be sorted.

    Parameters
    ----------
    chrom : str
        The chromosome

    Returns
    -------
    int
        The value on which to sort

    """
    chrom = chrom[3:]
    if chrom.isdigit():
        return int(chrom)
    if chrom == 'X':
        return 23
    elif chrom == 'Y':
        return 24
    elif chrom == 'M':
        return 25
    else:  # unknown chr
        return 26


[docs]def ordered_variants_and_indices(labels):
    """
    Get the ordered variant labels, where the labels are sorted by chromosome
    and position, and the indices corresponding to the sort,

    Parameters
    ----------
    labels : list(tuple(str))
        The list of variant labels. Each label is a tuple of
        (chrom, pos, name, ref, alt).

    Returns
    -------
    tuple(list(tuple), list(int))
        The first value is the ordered list of labels. Each label
        is a tuple of (chrom, pos, ref, alt). The second value
        is the ordered list of label indices.

    """
    labels_dict = defaultdict(list)
    for i, l in enumerate(labels):
        chrom, pos, name, ref, alt = l
        pos = int(pos)
        info = (i, pos, ref, alt)
        labels_dict[chrom].append(info)
    for chrom, labels_list in labels_dict.items():
        labels_list.sort(key=lambda tup: tup[1:])
    ordered_keys = sorted(labels_dict.keys(),
                          key=sort_standard_chrs)

    ordered_labels = []
    ordered_label_indices = []
    for chrom in ordered_keys:
        for l in labels_dict[chrom]:
            index, pos, ref, alt = l
            ordered_label_indices.append(index)
            ordered_labels.append((chrom, pos, ref, alt))
    return (ordered_labels, ordered_label_indices)


def _label_tuple_to_text(label, diff, genes=None):
    """
    Converts the variant label tuple to a string.

    Parameters
    ----------
    label : tuple(str)
        A tuple of (chrom, pos, ref, alt).
    diff : float
        The max difference score across some or all features in the model.
    genes : list(str) or None, optional
        Default is None. If the closest protein-coding `genes` are specified,
        will display them in the label text.

    Returns
    -------
    str
        The label text.

    """
    chrom, pos, ref, alt = label
    if genes is not None:
        if len(genes) == 0:
            genes_str = "none found"
        else:
            genes_str = ', '.join(genes)
        text = ("max diff score: {0}<br />{1} {2}, {3}/{4}<br />"
                "closest protein-coding gene(s): {5}").format(
            diff, chrom, pos, ref, alt, genes_str)
    else:
        text = "max diff score: {0}<br />{1} {2}, {3}/{4}".format(
            diff, chrom, pos, ref, alt)
    return text


def _variant_closest_genes(label, tabix_fh, chrs_gene_intervals):
    chrom = label[0]
    pos = label[1]
    closest_genes = []
    try:
        overlaps = tabix_fh.query(chrom, pos, pos + 1)
        for o in overlaps:
            closest_genes.append(o[-1])
    except tabix.TabixError:
        pass
    if len(closest_genes) != 0:
        return closest_genes
    gene_intervals = chrs_gene_intervals[chrom]
    closest = None
    for (start, end, strand, gene) in gene_intervals:
        if start < pos and closest and closest == pos - end:
            closest_genes.append(gene)
        elif start < pos and (closest is None
                or closest > pos - end):
            closest = pos - end
            closest_genes = [gene]
        elif start > pos and closest and closest == start - pos:
            closest_genes.append(gene)
        elif start > pos and (closest is None
                or closest > start - pos):
            closest = start - pos
            closest_genes = [gene]
    return closest_genes


def _load_chrs_gene_intervals(gene_intervals_bed):
    chrs_gene_intervals = defaultdict(list)
    with open(gene_intervals_bed, 'r') as file_handle:
        for line in file_handle:
            cols = line.strip().split('\t')
            chrom = cols[0]
            start = int(cols[1])
            end = int(cols[2])
            strand = cols[3]
            gene = cols[4]
            chrs_gene_intervals[chrom].append((start, end, strand, gene))
    return chrs_gene_intervals


def _variants_closest_protein_coding_gene(labels, version="hg38"):
    gene_intervals_tabix = pkg_resources.resource_filename(
        "selene_sdk",
        ("interpret/data/gencode_v28_{0}/"
         "protein_coding_l12_genes.bed.gz").format(version))
    gene_intervals_bed = pkg_resources.resource_filename(
        "selene_sdk",
        ("interpret/data/gencode_v28_{0}/"
         "protein_coding_l12_genes.bed").format(version))

    tabix_fh = tabix.open(gene_intervals_tabix)
    chrs_gene_intervals = _load_chrs_gene_intervals(gene_intervals_bed)

    labels_gene_information = []
    for l in labels:
        labels_gene_information.append(_variant_closest_genes(
            l, tabix_fh, chrs_gene_intervals))

    return labels_gene_information


[docs]def variant_diffs_scatter_plot(data,
                               labels,
                               features,
                               output_path,
                               filter_features=None,
                               labels_sort_fn=ordered_variants_and_indices,
                               nth_percentile=None,
                               hg_reference_version=None,
                               threshold_line=None,
                               auto_open=False):
    """
    Displays each variant's max probability difference across features
    as a point in a scatter plot. The points in the scatter plot are
    ordered by the variant chromosome and position by default. Variants can
    be sorted differently by passing in a new `labels_sort_fn`.

    Parameters
    ----------
    data : np.ndarray
        Absolute difference scores for variants across all features that a
        model predicts. This is the first value in the tuple returned by
        `load_variant_abs_diff_scores`.
    labels : list(tuple(str))
        A list of variant labels. This is the second value in the tuple
        returned by `load_variant_abs_diff_scores`.
    features : list(str)
        A list of the features the model predicts. This is the third value
        in the tuple returned by `load_variant_abs_diff_scores`.
    output_path : str
        Path to output file. Must have '.html' extension.
    filter_features : types.FunctionType or None, optional
        Default is None. A function that takes in a `list(str)` of features
        and returns the `list(int)` of feature indices over which we would
        compute the max(probability difference) for each variant. For example,
        a user may only want to visualize the max probability difference
        for TF binding features. If `None`, uses all the features.
    labels_sort_fn : types.FunctionType, optional
        Default is `ordered_variants_and_indices`. A function that takes
        in a `list(tuple(str))` of labels corresponding to the rows in `data`
        and returns a `tuple(list(tuple), list(int))`, where the first value
        is the ordered list of variant labels and the second value is the
        ordered list of indices for those variant labels. By default,
        variants are sorted by chromosome and position.
    nth_percentile : int [0, 100] or None, optional
        Default is None. If `nth_percentile` is not None, only displays the
        variants with a max absolute difference score within the
        `nth_percentile` of scores.
    hg_reference_version : str {"hg19", "hg38"} or None, optional
        Default is None. On hover, we can display the gene(s) closest to
        each variant if `hg_reference_version` is not None, where closest
        can be a variant within a gene interval (where genes and
        their coordinates are taken from level 1 & 2 protein-coding genes
        in gencode v28) or near a gene. In the future, we will allow users
        to specify their own genome file so that this information can
        be annotated to variants from other organisms, other genome versions,
        etc.
    threshold_line : float or None, optional
        Default is None. If `threshold_line` is not None, draws a horizontal
        line at the specified threshold. Helps focus the visual on variants
        above a certain threshold.
    auto_open : bool, optional
        Default is False. If `auto_open`, will automatically open a web
        browser that displays the plotted HTML file.

    Returns
    -------
    plotly.graph_objs.graph_objs.Figure
        The generated Plotly figure.

    """
    labels_ordered, label_indices = labels_sort_fn(labels)
    variant_closest_genes = None
    if hg_reference_version is not None:
        variant_closest_genes = _variants_closest_protein_coding_gene(
            labels_ordered, version=hg_reference_version)
    ordered_data = data[label_indices, :]

    feature_indices = None
    if filter_features is not None:
        feature_indices = filter_features(features)
        ordered_data = data[:, feature_indices]
    variants_max_diff = np.amax(ordered_data, axis=1)

    display_labels = None
    if nth_percentile:
        p = np.percentile(variants_max_diff, nth_percentile)
        keep = np.where(variants_max_diff >= p)[0]
        print("{0} variants with max abs diff score above {1} are in the "
              "{2}th percentile.".format(
                  len(keep), p, nth_percentile))
        variants_max_diff = variants_max_diff[keep]
        display_labels = []
        for i, l in enumerate(labels_ordered):
            if i not in keep:
                continue
            display_labels.append(l)
    else:
        display_labels = labels_ordered

    if variant_closest_genes:
        text_labels = [
            _label_tuple_to_text(l, d, g) for l, d, g in
            zip(display_labels, variants_max_diff, variant_closest_genes)]
    else:
        text_labels = [_label_tuple_to_text(l, d) for l, d in
                       zip(display_labels, variants_max_diff)]

    label_x = [' '.join([l[0], str(l[1])]) for l in display_labels]
    data = [go.Scatter(x=label_x,
                       y=variants_max_diff,
                       mode='markers',
                       marker = dict(
                           color = "#39CCCC",
                           line = dict(width = 1)),
                       text=text_labels,
                       hoverinfo="text")]

    go_layout = {
        "title": "Max probability difference scores",
        "hovermode": "closest",
        "hoverlabel": {
                "font": {"size": 16}
            },
        "xaxis": {
            "title": "Genome coordinates",
            "showticklabels": True,
            "tickangle": 35,
            "titlefont": {"family": "Arial, sans-serif",
                          "size": 16},
            "nticks": 25,
            "tickmode": "auto",
            "automargin": True
            },
        "yaxis": {
            "title": "Absolute difference",
            "titlefont": {"family": "Arial, sans-serif",
                          "size": 16}
            }
        }
    if isinstance(threshold_line, float):
        layout = go.Layout(
            **go_layout,
            shapes=[
                {"type": "line",
                 "x0": label_x[0],
                 "y0": threshold_line,
                 "x1": label_x[-1],
                 "y1": threshold_line,
                 "line": {
                    "color": "rgba(255, 99, 71, 1)",
                    "width": 2
                    }
                }
            ])
    else:
        layout = go.Layout(**go_layout)
    fig = go.Figure(data=data, layout=layout)
    path, filename = os.path.split(output_path)
    os.makedirs(path, exist_ok=True)
    plot(fig, filename=output_path, auto_open=auto_open)
    return fig