# Source code for matsci_opt_benchmarks.crabnet_hyperparameter.core

"""
This is a skeleton file that can serve as a starting point for a Python
console script. To run this script uncomment the following lines in the
``[options.entry_points]`` section in ``setup.cfg``::

    console_scripts =
         fibonacci = matsci_opt_benchmarks.crabnet_hyperparameter.core:run

Then run ``pip install .`` (or ``pip install -e .`` for editable mode)
which will install the command ``fibonacci`` inside your current environment.

Besides console scripts, the header (i.e. until ``_logger``...) of this file can
also be used as template for Python modules.

Note:
    This file can be renamed depending on your needs or safely removed if not needed.

References:
    - https://setuptools.pypa.io/en/latest/userguide/entry_point.html
    - https://pip.pypa.io/en/stable/reference/pip_install
"""

import argparse
import logging
import pprint
import sys
from copy import copy
from time import time

import numpy as np
import pandas as pd
from crabnet.crabnet_ import CrabNet
from crabnet.utils.utils import count_parameters
from matbench.bench import MatbenchBenchmark
from numpy.random import default_rng

from matsci_opt_benchmarks.crabnet_hyperparameter import __version__

# Package metadata dunders.
__author__ = "sgbaird"
__copyright__ = "sgbaird"
__license__ = "MIT"

# Module-level logger named after this module; `main` writes its debug/info
# messages through it.
_logger = logging.getLogger(__name__)


# ---- Python API ----
# The functions defined in this section can be imported by users in their
# Python scripts/interactive interpreter, e.g. via
# `from matsci_opt_benchmarks.crabnet_hyperparameter.core import fib`,
# when using this Python module as a library.


def fib(n):
    """Return the n-th Fibonacci number, counting from ``fib(1) == fib(2) == 1``.

    Args:
      n (int): position in the sequence; must be greater than zero

    Returns:
      int: n-th Fibonacci number

    Raises:
      AssertionError: if ``n`` is not greater than zero (only while
        assertions are enabled)
    """
    assert n > 0
    # Carry the (current, next) pair forward n - 1 times.
    pair = (1, 1)
    for _step in range(n - 1):
        pair = (pair[1], pair[0] + pair[1])
    return pair[0]
#############
def get_parameters():
    """Build the CrabNet search space and its parameter constraints.

    Each parameter is a dict with a ``name`` and a ``type``: ``"range"``
    entries carry ``bounds`` as ``[low, high]`` and ``"choice"`` entries carry
    ``values``. The constraints are returned as plain strings.

    Returns:
        (list(dict), list(str)): CrabNet parameters, CrabNet parameter
        constraints for Ax
    """
    # (name, low, high) for the leading block of range parameters, in order.
    range_bounds = (
        ("N", 1, 10),
        ("alpha", 0.0, 1.0),
        ("d_model", 100, 1024),
        ("dim_feedforward", 1024, 4096),
        ("dropout", 0.0, 1.0),
        ("emb_scaler", 0.0, 1.0),
        ("eps", 1e-7, 1e-4),
        ("epochs_step", 5, 20),
        ("fudge", 0.0, 0.1),
        ("heads", 1, 10),
        ("k", 2, 10),
        ("lr", 1e-4, 6e-3),
        ("pe_resolution", 2500, 10000),
        ("ple_resolution", 2500, 10000),
        ("pos_scaler", 0.0, 1.0),
        ("weight_decay", 0.0, 1.0),
        ("batch_size", 32, 256),
        ("out_hidden4", 32, 512),
        ("betas1", 0.5, 0.9999),
        ("betas2", 0.5, 0.9999),
    )
    # (name, allowed values) for the categorical parameters.
    choice_values = (
        ("bias", (False, True)),
        ("criterion", ("RobustL1", "RobustL2")),
        ("elem_prop", ("mat2vec", "magpie", "onehot")),
    )

    parameters = [
        {"name": name, "type": "range", "bounds": [low, high]}
        for name, low, high in range_bounds
    ]
    parameters.extend(
        {"name": name, "type": "choice", "values": list(options)}
        for name, options in choice_values
    )
    # `train_frac` comes last, after the categorical entries.
    parameters.append({"name": "train_frac", "type": "range", "bounds": [0.01, 1.0]})

    parameter_constraints = ["betas1 <= betas2", "emb_scaler + pos_scaler <= 1.0"]
    return parameters, parameter_constraints
# def evaluate(parameters):
#     results = matbench_metric_calculator(parameters)
#     outputs = {
#         "mae": results["average_mae"],
#         "rmse": results["average_rmse"],
#         "model_size": results["model_size"],
#         "runtime": results["runtime"],
#     }
#     return outputs
def correct_parameterization(parameters: dict, verbose=False):
    """Rewrite tunable hyperparameters into the form passed on to CrabNet.

    The dictionary is modified in place and also returned:

    - ``out_hidden4`` is replaced by ``out_hidden``, a list of four widths
      (8x, 4x, 2x and 1x the original value).
    - ``betas1`` / ``betas2`` are replaced by the tuple ``betas``.
    - ``heads`` is bumped to the next even number when odd, and ``d_model``
      is rounded to the nearest multiple of the resulting ``heads``.
    - ``pos_scaler_log`` is added as ``1 - emb_scaler - pos_scaler``.
    - ``epochs`` is added as ``4 * epochs_step``.

    Args:
        parameters (dict): Hyperparameter set used by Ax in optimization.
        verbose (bool, optional): Pretty-print the incoming parameters.
            Defaults to False.

    Returns:
        dict: The same dictionary object, modified for CrabNet compatibility.
    """
    if verbose:
        pprint.pprint(parameters)

    # Expand the single tunable width into four layer widths, widest first.
    base_width = parameters.get("out_hidden4")
    parameters["out_hidden"] = [base_width * factor for factor in (8, 4, 2)] + [
        base_width
    ]
    parameters.pop("out_hidden4")

    # Fold the two separately tuned beta values into a single tuple.
    parameters["betas"] = (parameters.get("betas1"), parameters.get("betas2"))
    for consumed_key in ("betas1", "betas2"):
        parameters.pop(consumed_key)

    requested_d_model = parameters["d_model"]
    requested_heads = parameters["heads"]

    # Odd head counts (including 1) are bumped up to the next even number so
    # that the d_model computed below, a multiple of heads, is even as well.
    heads_is_odd = np.mod(requested_heads, 2) != 0
    head_count = requested_heads + 1 if heads_is_odd else requested_heads
    parameters["heads"] = head_count

    # NOTE: d_model must be divisible by heads
    parameters["d_model"] = head_count * round(requested_d_model / head_count)

    emb_scaler = parameters["emb_scaler"]
    pos_scaler = parameters["pos_scaler"]
    parameters["pos_scaler_log"] = 1 - emb_scaler - pos_scaler

    parameters["epochs"] = parameters["epochs_step"] * 4

    return parameters
def _as_formula_target_frame(inputs, outputs):
    """Join inputs and outputs column-wise under the keys ``formula``/``target``."""
    return pd.concat((inputs, outputs), axis=1, keys=["formula", "target"])


def evaluate(parameters):
    """Train CrabNet with the given hyperparameters and report benchmark results.

    ``parameters`` must contain ``train_frac`` (fraction of each training fold
    that is sampled) and ``sample_seed`` (seed for the sampling generator). An
    optional ``hardware`` entry is discarded. Every remaining entry overrides
    the built-in default hyperparameters. The caller's dictionary is not
    modified; a shallow copy is used.

    Args:
        parameters (dict): Hyperparameter set for CrabNet plus the
            ``train_frac`` / ``sample_seed`` bookkeeping entries.

    Returns:
        dict: On success ``{"scores", "model_size", "runtime"}``. If an
        exception is raised inside the benchmark loop,
        ``{"error", "runtime"}`` with the exception message at ``"error"``.
        Missing required keys and failures while constructing the model or
        the benchmark happen before that guard and propagate to the caller.
    """
    started_at = time()

    print("user parameters are:", parameters)

    overrides = copy(parameters)
    train_frac = overrides.pop("train_frac")
    seed = overrides.pop("sample_seed")
    overrides.pop("hardware", None)

    rng = default_rng(seed)

    # default hyperparameters
    defaults = {
        "N": 3,
        "alpha": 0.5,
        "d_model": 512,
        "dim_feedforward": 2048,
        "dropout": 0.1,
        "emb_scaler": 1.0,
        "epochs_step": 10,
        "eps": 1e-6,
        "fudge": 0.02,
        "heads": 4,
        "k": 6,
        "lr": 0.001,
        "pe_resolution": 5000,
        "ple_resolution": 5000,
        "pos_scaler": 1.0,
        "weight_decay": 0,
        "batch_size": 32,
        "out_hidden4": 128,
        "betas1": 0.9,
        "betas2": 0.999,
        "losscurve": False,
        "learningcurve": False,
        "bias": False,
        "criterion": "RobustL1",
        "elem_prop": "mat2vec",
    }

    # user-selected values take precedence over the defaults
    parameterization = {**defaults, **overrides}

    print(parameterization)

    model = CrabNet(**correct_parameterization(parameterization))

    benchmark = MatbenchBenchmark(autoload=False, subset=["matbench_expt_gap"])

    # TODO: try-except with NaN output if failure
    try:
        for task in benchmark.tasks:
            task.load()
            for fold in task.folds:
                # Training split for this fold, subsampled with the seeded
                # generator (the generator state advances from fold to fold).
                train_inputs, train_outputs = task.get_train_and_val_data(fold)
                train_df = _as_formula_target_frame(train_inputs, train_outputs)
                train_df = train_df.sample(frac=train_frac, random_state=rng)

                model.fit(train_df=train_df)

                # Held-out split for this fold, with targets included.
                test_inputs, test_outputs = task.get_test_data(
                    fold, include_target=True
                )
                test_df = _as_formula_target_frame(test_inputs, test_outputs)

                # NaN predictions are replaced via `np.nan_to_num` before
                # they are recorded.
                predictions = np.nan_to_num(model.predict(test_df=test_df))

                task.record(fold, predictions)

            scores = task.scores

        # `fit` needs to be called prior to `count_parameters`
        # all 5 models should be same size, but we take the last for simplicity
        model_size = count_parameters(model.model)

        # REVIEW: if using multiple tasks, return multiple `scores` dicts
        return {
            "scores": scores,
            "model_size": model_size,
            "runtime": time() - started_at,
        }
    except Exception as exc:
        return {"error": str(exc), "runtime": time() - started_at}
#############

# ---- CLI ----
# The functions defined in this section are wrappers around the main Python
# API allowing them to be called directly from the terminal as a CLI
# executable/script.
def parse_args(args):
    """Parse command line parameters.

    Recognized arguments: a required positional integer ``n``, ``--version``,
    and the ``-v/--verbose`` and ``-vv/--very-verbose`` flags, which store
    ``logging.INFO`` and ``logging.DEBUG`` respectively in ``loglevel``.

    Args:
      args (List[str]): command line parameters as list of strings
          (for example  ``["--help"]``).

    Returns:
      :obj:`argparse.Namespace`: command line parameters namespace
    """
    version_banner = "matsci-opt-benchmarks {ver}".format(ver=__version__)

    cli = argparse.ArgumentParser(description="Just a Fibonacci demonstration")
    cli.add_argument("--version", action="version", version=version_banner)
    cli.add_argument(dest="n", help="n-th Fibonacci number", type=int, metavar="INT")

    # Both verbosity flags write a constant log level to the same destination.
    verbosity_flags = (
        ("-v", "--verbose", "set loglevel to INFO", logging.INFO),
        ("-vv", "--very-verbose", "set loglevel to DEBUG", logging.DEBUG),
    )
    for short_flag, long_flag, help_text, level in verbosity_flags:
        cli.add_argument(
            short_flag,
            long_flag,
            dest="loglevel",
            help=help_text,
            action="store_const",
            const=level,
        )

    return cli.parse_args(args)
def setup_logging(loglevel):
    """Configure basic logging to ``sys.stdout`` via ``logging.basicConfig``.

    Args:
      loglevel (int): minimum loglevel for emitting messages
    """
    basic_config = {
        "level": loglevel,
        "stream": sys.stdout,
        "format": "[%(asctime)s] %(levelname)s:%(name)s:%(message)s",
        "datefmt": "%Y-%m-%d %H:%M:%S",
    }
    logging.basicConfig(**basic_config)
def main(args):
    """Wrapper allowing :func:`fib` to be called with string arguments in a CLI fashion

    The arguments are parsed, logging is configured from the parsed log level,
    and the result of :func:`fib` is printed to ``stdout`` in a formatted
    message rather than returned.

    Args:
      args (List[str]): command line parameters as list of strings
          (for example  ``["--verbose", "42"]``).
    """
    namespace = parse_args(args)
    setup_logging(namespace.loglevel)

    _logger.debug("Starting crazy calculations...")
    position = namespace.n
    value = fib(position)
    print("The {}-th Fibonacci number is {}".format(position, value))
    _logger.info("Script ends here")
def run():
    """Call :func:`main` with the CLI arguments taken from :obj:`sys.argv`.

    The first ``sys.argv`` entry is skipped. This function can be used as
    entry point to create console scripts with setuptools.
    """
    cli_arguments = sys.argv[1:]
    main(cli_arguments)
if __name__ == "__main__":
    # ^  This is a guard statement that will prevent the following code from
    #    being executed in the case someone imports this file instead of
    #    executing it as a script.
    #    https://docs.python.org/3/library/__main__.html

    # After installing your project with pip, users can also run your Python
    # modules as scripts via the ``-m`` flag, as defined in PEP 338::
    #
    #     python -m matsci_opt_benchmarks.crabnet_hyperparameter.core 42
    #
    run()

# %% Code Graveyard
# crabnet_param_names = [
#     "N",
#     "alpha",
#     "d_model",
#     "dim_feedforward",
#     "dropout",
#     "emb_scaler",
#     "epochs_step",
#     "eps",
#     "fudge",
#     "heads",
#     "k",
#     "lr",
#     "pe_resolution",
#     "ple_resolution",
#     "pos_scaler",
#     "weight_decay",
#     "batch_size",
#     "out_hidden4",
#     "betas1",
#     "betas2",
#     "bias",
#     "criterion",
#     "elem_prop",
# ]

# new_parameters = {p: parameters[p] for p in parameters if p in crabnet_param_names}