# Source code for pyspectools.models.classes


import pprint

import torch
import numpy as np
from uncertainties import ufloat, ufloat_fromstr
from monsterurl import get_monster
from joblib import dump, load
from plotly import graph_objs as go
from plotly.subplots import make_subplots
import pandas as pd

from pyspectools.models import torch_models
from pyspectools.units import kappa, inertial_defect

"""
This module defines a set of classes for high-level interaction with
the user. These classes provide a set of methods that will assist
the user in performing inference with PyTorch models by taking care
of some aspects (e.g. plotting, density estimation) in the background.
"""

class SpecConstants:
    """
    Container for experimental spectroscopic parameters consumed by the
    `MoleculeDetective` model.

    The user provides a set of rotational constants as input, and the
    main job of this class is to manage their experimental
    uncertainties, including drawing random samples for inference.
    """

    def __init__(self, A: str, B: str, C: str,
                 u_a=ufloat(0., 3.), u_b=ufloat(0., 3.),
                 u_c=ufloat(0., 3.), **kwargs):
        """
        Build a `SpecConstants` object from experimental parameters.

        The minimal input is the three rotational constants A, B, and C,
        given as strings (in MHz) using the same "value(uncertainty)"
        notation that SPFIT emits, for example:

            A = "5623.562(102)"

        Providing uncertainties lets you set priors for poorly or not
        determined parameters; an effectively uninformative prior for A
        could be expressed as:

            A = "8032(6000)"

        The `generate_samples` method folds these uncertainties into the
        samples prepared for variational inference with the
        `MoleculeDetective` model.

        The dipole moments default to uninformative priors. If you know
        (or can guess) a value for an axis, or want to deactivate axes
        entirely, pass `ufloat` objects; the example below zeroes the b
        and c axes while "turning on" the a-dipole moment:

            from uncertainties import ufloat
            u_a = ufloat(1., 0.5)
            u_b = ufloat(0., 0.)
            u_c = ufloat(0., 0.)

        Parameters
        ----------
        A, B, C : str
            Principal rotational constants as "value(uncertainty)"
            strings, for example: "5962.3213(102)"
        u_a, u_b, u_c : ufloat, optional
            Dipole moments as `ufloat` objects, by default
            ufloat(0., 3.), corresponding to uninformative priors.
        """
        self.A = ufloat_fromstr(A)
        self.B = ufloat_fromstr(B)
        self.C = ufloat_fromstr(C)
        self.u_a = u_a
        self.u_b = u_b
        self.u_c = u_c
        # generate a name
        self.__name__ = get_monster()
        # Propagate uncertainty of kappa and delta based on the
        # experimental constants
        self.kappa = kappa(self.A, self.B, self.C)
        self.delta = inertial_defect(self.A, self.B, self.C)
        # override defaults with user-specified values
        if kwargs:
            self.__dict__.update(**kwargs)

    def __repr__(self):
        return pprint.PrettyPrinter(indent=4).pformat(self.__dict__)

    def __call__(self, N=1000):
        return self.generate_samples(N)
[docs] def save(self, path=None): """ Save the molecule to disk using Pickle in `joblib`. Parameters ---------- path : str, optional Name to save the molecule to, not including the extension ".pkl", by default None """ if path is None: path = f"{self.__name__}.pkl" dump(self, path)
[docs] @classmethod def load(cls, path): return load(path)
[docs] def generate_samples(self, N: int): """ Function to generate samples of spectroscopic parameters, based on "diagonal" Gaussians. The nominal value and standard deviations of each parameter are used parameterize a Gaussian, and `N` random samples are drawn. In the case of the dipole moments, we take the absolute value of the samples, and delta and kappa are recalculated based on the drawn A, B, C. TODO - Make this code look cleaner; there must be a smarter way to sample Parameters ---------- N : int Number of samples to generate. Returns ------- samples 2D np.ndarray, where columns correspond to parameter, and rows are samples """ A = np.abs(np.random.normal(self.A.nominal_value, self.A.std_dev, size=(N))) B = np.abs(np.random.normal(self.B.nominal_value, self.B.std_dev, size=(N))) C = np.abs(np.random.normal(self.C.nominal_value, self.C.std_dev, size=(N))) u_a = np.abs(np.random.normal(self.u_a.nominal_value, self.u_a.std_dev, size=(N))) u_b = np.abs(np.random.normal(self.u_b.nominal_value, self.u_b.std_dev, size=(N))) u_c = np.abs(np.random.normal(self.u_c.nominal_value, self.u_c.std_dev, size=(N))) # calculate derived parameters asymmetry = kappa(A, B, C) delta = inertial_defect(A, B, C) samples = np.vstack([A, B, C, asymmetry, delta, u_a, u_b, u_c]).T # We need to make sure physics is constrained: A >= B >= C (valid_mask,) = np.where((A >= B) & (B >= C)) return samples[valid_mask]
class MoleculeResult:
    # Column labels for the functional-group predictions; the order
    # matches the model's output encoding.
    func_encoding = [
        "Aliphatic", "Allene", "Vinyl", "Alkyne",
        "Carbonyl (General)", "Carbonyl (α-nitrogen)",
        "Carbonyl (α-carbon)", "Aldehyde", "Amide", "Ketone",
        "Ether", "Amine", "Amino acid", "Nitrate", "Nitrile",
        "Isonitrile", "Nitro", "Alcohol", "Alcohol (Carboxylic acid)",
        "Enol", "Phenol", "Peroxide", "Aromatic sp2 carbon",
    ]

    def __init__(self, eigenspectrum, formulas, functional_groups):
        """
        Bundle raw model outputs into labelled pandas structures.

        Parameters
        ----------
        eigenspectrum : array-like
            Predicted eigenspectrum; stored as given.
        formulas : array-like
            Per-sample atom counts; columns are H, C, O, N.
        functional_groups : array-like
            Per-sample functional-group likelihoods; columns follow
            ``func_encoding``.
        """
        self.eigenspectrum = eigenspectrum
        self.formulas = pd.DataFrame(formulas, columns=["H", "C", "O", "N"])
        self.functional_groups = pd.DataFrame(
            functional_groups, columns=self.func_encoding
        )
[docs] def analyze(self, q=(0.025, 0.5, 0.975)): """ Convenience function to compute some summary statistics, and make interactive plotly figures. Parameters ---------- q : tuple, optional [description], by default (0.025, 0.5, 0.975) Returns ------- fig Plotly Figure object results dict containing summary statistics of the formula/functional predictions. """ # covariance matrix of the formulas and functional group predictions formula_covar = np.cov(self.formulas, rowvar=False) functional_covar = np.cov(self.functional_groups, rowvar=False) # calculate the quantile ranges to report as marginal uncertainties # by default, the specified range is the median, and the edges are # the 95% highest posterior density formula_q = np.quantile(self.formulas, q, axis=0) functional_q = np.quantile(self.functional_groups, q, axis=0) # package the statistical summary results = { "formula": {"covariance": formula_covar, "quantile": formula_q}, "functional": {"covariance": functional_covar, "quantile": functional_q} } fig = go.Figure() for index, atom in enumerate(["H", "C", "O", "N"]): fig.add_trace( go.Violin( x=[atom,] * len(self.formulas), y=self.formulas[atom], name=atom, meanline_visible=True, opacity=0.6, ) ) fig.update_layout(title_text="Predicted formula", xaxis_title="Atom", yaxis_title="Number") return fig, results
class MoleculeDetective:
    def __init__(self, weights_path=None, device="cpu", **kwargs):
        """
        Wrap a pre-trained `VarMolDetect` model for inference.

        Parameters
        ----------
        weights_path : str, optional
            Path to saved model weights, forwarded to
            `VarMolDetect.load_weights`.
        device : str, optional
            Device to run inference on, by default "cpu".
        **kwargs
            Additional arguments forwarded to `load_weights`.
        """
        self.model = torch_models.VarMolDetect.load_weights(
            weights_path, device, **kwargs
        )
        # set to inference mode
        self.model.eval()
        # For taking care of moving data around
        self.device = self.model.device

    def __call__(self, specconst_obj: "SpecConstants", composition=None, N=1000):
        """
        Shorthand for `run_inference`. `composition` now defaults to
        None for consistency with that method (previously it was a
        required positional argument), meaning "composition unknown".
        """
        return self.run_inference(specconst_obj, composition, N)
[docs] def run_inference(self, specconst_obj: "SpecConstants", composition=None, N=1000): """ Use a pre-trained PyTorch model to perform inference, conditional on the experimental constants and the expected composition. This framework can be used to account for various forms of uncertainties, and the default behavior is to provide the minimum amount of information. For example, the `composition` argument can be provided as an `int` type representing: [0: hydrocarbon, 1: oxygen-bearing, 2: nitrogen-bearing, 3: ON-bearing] By default, `composition` is None; the case where we don't know what the composition is, in which case we will randomly try all four compositions. This implementation is set up to take advantage of performance in `torch`. Rather than repeatedly call a function a la MCMC sampling, we simply pass an entire tensor of all the samples so that `torch` can run without Python interaction. You can think of each row as a "minibatch". The constants are converted to GHz, and therefore expected as MHz. Parameters ---------- specconst_obj : [type] `SpecConstants` object, which will generate random samples based on the experimental uncertainties. composition : int or None (default), optional The expected composition of the molecule. When `composition` is an integer, inference is performed conditional on the specific composition; the definitions are provided in the docstring. If `None` (default), we have no prior knowledge of the composition, and will randomly test all four. N : int, optional Number of samples to run, by default 1000 Returns ------- eigenspectrum, formula, functional NumPy arrays corresponding to the predicted eigenspectrum, molecular formula, and functional groups present. `functional` is output as log sigmoid by the model, and this function returns the exponential to get back sigmoid likelihoods. """ samples = specconst_obj(N) # convert to GHz samples[:,:3] /= 1000. 
if composition is None: composition = np.random.randint(low=0, high=4, size=len(samples)) compositions = np.zeros((len(samples), 4)) compositions[:,composition] = 1 else: # generate one-hot encoding for the composition compositions = np.zeros((len(samples), max(4, composition))) compositions[:, composition] = 1 # Type cast Tensors into float32 from NumPy arrays, and move to # CPU or GPU samples = torch.from_numpy(samples).float().to(self.device) compositions = torch.from_numpy(compositions).float().to(self.device) # Run the model with torch.no_grad(): pred, latent = self.model(samples, compositions) # unpack predictions eigenspectrum, formula, functional = pred result = MoleculeResult(eigenspectrum.numpy(), formula.numpy(), functional.numpy()) return result