Source code for selene_sdk.utils.config_utils

"""
Utilities for loading configurations, instantiating Python objects, and
running operations in _Selene_.

"""
import os
import importlib
import sys
from time import strftime
import types
import random

import numpy as np
import torch

from . import _is_lua_trained_model
from . import instantiate


def class_instantiate(classobj):
    """
    Recursively re-run `__init__` on an object and on its Selene attributes.

    Not used currently, but might be useful later for recursive
    class instantiation.

    Every attribute stored in `classobj.__dict__` whose value reports a
    `__module__` containing "selene_sdk" is instantiated first (depth
    first), except for the attribute named "model". Afterwards
    `classobj.__init__` is called with the contents of
    `classobj.__dict__` as keyword arguments.

    Parameters
    ----------
    classobj : object
        The object to (re-)initialize. Its `__init__` must accept every
        key of its `__dict__` as a keyword argument.

    Returns
    -------
    None

    """
    for attr, obj in classobj.__dict__.items():
        owner_module = getattr(obj, '__module__', None)
        # Compare the attribute name by value: identity (`is not`) on a
        # string literal only works when both strings happen to be interned.
        if owner_module and "selene_sdk" in owner_module and attr != "model":
            class_instantiate(obj)
    classobj.__init__(**classobj.__dict__)


def module_from_file(path):
    """
    Load a module created based on a Python file path.

    The module is executed from source but is *not* registered in
    `sys.modules`. It is named after the file with its extension removed
    and carries the usual import metadata (`__file__`, `__spec__`,
    `__loader__`), so code inside the file may refer to `__file__`.

    Parameters
    ----------
    path : str
        Path to the model architecture file.

    Returns
    -------
    types.ModuleType
        The loaded module

    """
    # `import importlib` by itself does not guarantee that these
    # submodules are bound as attributes of the package.
    import importlib.machinery
    import importlib.util

    # Strip whatever extension the file has instead of assuming ".py".
    module_name = os.path.splitext(os.path.basename(path))[0]
    loader = importlib.machinery.SourceFileLoader(module_name, path)
    # Passing the loader explicitly keeps files without a recognised
    # source suffix loadable.
    spec = importlib.util.spec_from_file_location(
        module_name, path, loader=loader)
    module = importlib.util.module_from_spec(spec)
    loader.exec_module(module)
    return module


def module_from_dir(path):
    """
    Import the Python package located at a directory path.

    This method expects that you pass in the path to a valid Python module,
    where the `__init__.py` file already imports the model class,
    `criterion`, and `get_optimizer` methods from the appropriate file
    (e.g. `__init__.py` contains the line `from <model_class_file> import
    <ModelClass>`).

    The parent directory of `path` is inserted at the front of `sys.path`
    so that the package can be imported by its directory name.

    Parameters
    ----------
    path : str
        Path to the Python module containing the model class. A trailing
        path separator is accepted.

    Returns
    -------
    types.ModuleType
        The loaded module

    """
    # Normalize first: with a trailing separator `os.path.split` would
    # return an empty final component and the import would fail.
    parent_path, module_dir = os.path.split(os.path.normpath(path))
    sys.path.insert(0, parent_path)
    return importlib.import_module(module_dir)


def initialize_model(model_configs, train=True, lr=None):
    """
    Build the model described by `model_configs` along with its criterion
    and, when training, its optimizer.

    Parameters
    ----------
    model_configs : dict
        Model-specific configuration. The keys "path", "class" and
        "class_args" are read; "non_strand_specific" is optional and,
        when present, the model is wrapped in `NonStrandSpecific` using
        its value as the mode.
    train : bool, optional
        Default is True. If `train`, returns the user-specified optimizer
        and optimizer class that can be found within the input model file.
    lr : float or None, optional
        If `train`, a learning rate must be specified. Otherwise, None.

    Returns
    -------
    model, criterion : tuple(torch.nn.Module, torch.nn._Loss)
        Returned when `train` is False.
    model, criterion, optim_class, optim_kwargs : tuple(torch.nn.Module, torch.nn._Loss, torch.optim, dict)
        Returned when `train` is True.

        * `torch.nn.Module` - the model architecture
        * `torch.nn._Loss` - the loss function associated with the model
        * `torch.optim` - the optimizer associated with the model
        * `dict` - the optimizer arguments

    Raises
    ------
    ValueError
        If `train` but the `lr` specified is not a float.

    """
    source_path = model_configs["path"]
    class_name = model_configs["class"]

    # A directory is imported as a package, anything else as a source file.
    load_module = (module_from_dir if os.path.isdir(source_path)
                   else module_from_file)
    module = load_module(source_path)

    architecture = getattr(module, class_name)
    model = architecture(**model_configs["class_args"])
    if "non_strand_specific" in model_configs:
        from selene_sdk.utils import NonStrandSpecific
        strand_mode = model_configs["non_strand_specific"]
        model = NonStrandSpecific(model, mode=strand_mode)

    # Return value intentionally unused, as in the original call site.
    _is_lua_trained_model(model)
    criterion = module.criterion()

    if not train:
        return model, criterion
    if not isinstance(lr, float):
        raise ValueError("Learning rate must be specified as a float "
                         "but was {0}".format(lr))
    optim_class, optim_kwargs = module.get_optimizer(lr)
    return model, criterion, optim_class, optim_kwargs
def execute(operations, configs, output_dir):
    """
    Execute operations in _Selene_.

    Operations are run in the order given. The model (and its criterion)
    is initialized by the first operation that needs it and reused by
    the following ones. Operation names other than "train", "evaluate"
    and "analyze" are ignored.

    Parameters
    ----------
    operations : list(str)
        The list of operations to carry out in _Selene_.
    configs : dict or object
        The loaded configurations from a YAML file. Depending on the
        operation, the keys "model", "lr", "sampler", "train_model",
        "evaluate_model", "analyze_sequences",
        "variant_effect_prediction", "in_silico_mutagenesis",
        "prediction", "random_seed" and "load_test_set" are consulted.
        Note that the "vcf_files" / "fa_files" entries are popped from
        their configuration sections while they are processed.
    output_dir : str or None
        The path to the directory where all outputs will be saved.
        If None, this means that an `output_dir` was not specified
        in the top-level configuration keys. `output_dir` must be
        specified in each class's individual configuration wherever
        it is required.

    Returns
    -------
    None
        Executes the operations listed and outputs any files
        to the dirs specified in each operation's configuration.

    Raises
    ------
    ValueError
        If "variant_effect_prediction" is configured without
        "vcf_files", or "in_silico_mutagenesis" is configured without
        any of "sequence", "input_path" or "fa_files".

    """
    model = None
    # Kept alongside `model` so that "evaluate" can use the criterion no
    # matter which earlier operation initialized the model.
    loss = None
    train_model = None
    for op in operations:
        if op == "train":
            model, loss, optim, optim_kwargs = initialize_model(
                configs["model"], train=True, lr=configs["lr"])

            sampler_info = configs["sampler"]
            if output_dir is not None:
                sampler_info.bind(output_dir=output_dir)
            sampler = instantiate(sampler_info)

            train_model_info = configs["train_model"]
            train_model_info.bind(model=model,
                                  data_sampler=sampler,
                                  loss_criterion=loss,
                                  optimizer_class=optim,
                                  optimizer_kwargs=optim_kwargs)
            if output_dir is not None:
                train_model_info.bind(output_dir=output_dir)
            if "random_seed" in configs:
                train_model_info.bind(deterministic=True)

            train_model = instantiate(train_model_info)
            # TODO: will find a better way to handle this in the future
            if "load_test_set" in configs and configs["load_test_set"] and \
                    "evaluate" in operations:
                train_model.create_test_set()
            train_model.train_and_validate()

        elif op == "evaluate":
            # A trainer created by an earlier "train" op evaluates itself.
            if train_model is not None:
                train_model.evaluate()

            if model is None:
                model, loss = initialize_model(
                    configs["model"], train=False)
            if "evaluate_model" in configs:
                sampler_info = configs["sampler"]
                sampler = instantiate(sampler_info)
                evaluate_model_info = configs["evaluate_model"]
                evaluate_model_info.bind(
                    model=model,
                    criterion=loss,
                    data_sampler=sampler)
                if output_dir is not None:
                    evaluate_model_info.bind(output_dir=output_dir)

                evaluate_model = instantiate(evaluate_model_info)
                evaluate_model.evaluate()

        elif op == "analyze":
            if model is None:
                # Keep the criterion too: a later "evaluate" op needs it.
                model, loss = initialize_model(
                    configs["model"], train=False)
            analyze_seqs_info = configs["analyze_sequences"]
            analyze_seqs_info.bind(model=model)

            analyze_seqs = instantiate(analyze_seqs_info)
            if "variant_effect_prediction" in configs:
                vareff_info = configs["variant_effect_prediction"]
                if "vcf_files" not in vareff_info:
                    raise ValueError("variant effect prediction requires "
                                     "as input a list of 1 or more *.vcf "
                                     "files ('vcf_files').")
                # The remaining entries are forwarded as keyword arguments
                # for every file.
                for filepath in vareff_info.pop("vcf_files"):
                    analyze_seqs.variant_effect_prediction(
                        filepath, **vareff_info)
            if "in_silico_mutagenesis" in configs:
                ism_info = configs["in_silico_mutagenesis"]
                if "sequence" in ism_info:
                    analyze_seqs.in_silico_mutagenesis(**ism_info)
                elif "input_path" in ism_info:
                    analyze_seqs.in_silico_mutagenesis_from_file(**ism_info)
                elif "fa_files" in ism_info:
                    for filepath in ism_info.pop("fa_files"):
                        analyze_seqs.in_silico_mutagenesis_from_file(
                            filepath, **ism_info)
                else:
                    # The message names the key that is actually checked
                    # above ("sequence").
                    raise ValueError("in silico mutagenesis requires as input "
                                     "the path to the FASTA file "
                                     "('input_path') or a sequence "
                                     "('sequence') or a list of "
                                     "FASTA files ('fa_files'), but found "
                                     "neither.")
            if "prediction" in configs:
                predict_info = configs["prediction"]
                analyze_seqs.get_predictions(**predict_info)
def parse_configs_and_run(configs, create_subdirectory=True, lr=None):
    """
    Method to parse the configuration YAML file and run each operation
    specified.

    Parameters
    ----------
    configs : dict
        The dictionary of nested configuration parameters. Will look
        for the following top-level parameters:

            * `ops`: A list of 1 or more of the values
              {"train", "evaluate", "analyze"}. The operations specified
              determine what objects and information we expect to parse
              in order to run these operations. This is required.
            * `output_dir`: Output directory to use for all the
              operations. If no `output_dir` is specified, assumes that
              all constructors that will be initialized (which have
              their own configurations in `configs`) have their own
              `output_dir` specified. Optional.
            * `create_subdirectory`: If present, takes precedence over
              the `create_subdirectory` argument of this method.
              Optional.
            * `random_seed`: A random seed set for `random`, `numpy`,
              `torch` and `torch.cuda` for reproducibility. Optional.
            * `lr`: The learning rate, if one of the operations in the
              list is "train".
            * `load_test_set`: If `ops: [train, evaluate]`, you may set
              this parameter to True if you would like to load the test
              set into memory ahead of time--and therefore save the test
              data to a .bed file at the start of training. This is only
              useful if you have a machine that can support a large
              increase (on the order of GBs) in memory usage and if you
              want to create a test dataset early-on because you do not
              know if your model will finish training and evaluation
              within the allotted time that your job is run.

        When "train" is requested and `lr` is given, `configs["lr"]` is
        set (or overwritten) in place.
    create_subdirectory : bool, optional
        Default is True. If `create_subdirectory`, will create a directory
        within `output_dir` with the name formatted as "%Y-%m-%d-%H-%M-%S",
        the date/time this method was run.
    lr : float or None, optional
        Default is None. If "lr" (learning rate) is already specified as a
        top-level key in `configs`, there is no need to set `lr` to a value
        unless you want to override the value in `configs`. Otherwise,
        set `lr` to the desired learning rate if "train" is one of the
        operations to be executed.

    Returns
    -------
    None
        Executes the operations listed and outputs any files
        to the dirs specified in each operation's configuration.

    Raises
    ------
    ValueError
        If "train" is one of the operations but no learning rate is
        available from either `configs` or `lr`.

    """
    operations = configs["ops"]

    if "train" in operations:
        if lr is not None:
            if "lr" in configs:
                print("Warning: learning rate specified in both the "
                      "configuration dict and this method's `lr` parameter. "
                      "Using the `lr` value input to `parse_configs_and_run` "
                      "({0}, not {1}).".format(lr, configs["lr"]))
            # The explicit argument wins, as the warning above promises;
            # downstream code reads the rate from `configs["lr"]`.
            configs["lr"] = float(lr)
        elif "lr" not in configs:
            raise ValueError("Learning rate not specified, cannot "
                             "fit model. Exiting.")

    current_run_output_dir = None
    if "output_dir" not in configs and \
            ("train" in operations or "evaluate" in operations):
        print("No top-level output directory specified. All constructors "
              "to be initialized (e.g. Sampler, TrainModel) that require "
              "this parameter must have it specified in their individual "
              "parameter configuration.")
    elif "output_dir" in configs:
        current_run_output_dir = configs["output_dir"]
        os.makedirs(current_run_output_dir, exist_ok=True)
        # A value in the configuration overrides the keyword argument.
        if "create_subdirectory" in configs:
            create_subdirectory = configs["create_subdirectory"]
        if create_subdirectory:
            current_run_output_dir = os.path.join(
                current_run_output_dir, strftime("%Y-%m-%d-%H-%M-%S"))
            os.makedirs(current_run_output_dir)
        print("Outputs and logs saved to {0}".format(
            current_run_output_dir))

    if "random_seed" in configs:
        seed = configs["random_seed"]
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        print("Setting random seed = {0}".format(seed))
    else:
        print("Warning: no random seed specified in config file. "
              "Using a random seed ensures results are reproducible.")

    execute(operations, configs, current_run_output_dir)