Source code for pyspectools.models.classes


import pprint

import torch
import numpy as np
from uncertainties import ufloat, ufloat_fromstr
from monsterurl import get_monster
from joblib import dump, load
from plotly import graph_objs as go
from plotly.subplots import make_subplots
import pandas as pd

from pyspectools.models import torch_models
from pyspectools.units import kappa, inertial_defect

"""
This module defines a set of classes for high-level interaction with
the user. These classes provide a set of methods that will assist
the user in performing inference with PyTorch models by taking care
of some aspects (e.g. plotting, density estimation) in the background.
"""

[docs]class SpecConstants:
    """
    Class representing experimental parameters to be fed into the
    `MoleculeDetective` model. The user provides a set of constants
    as input, and the main purpose of this class is to help manage
    experimental uncertainties.
    """
    def __init__(self, A: str, B: str, C: str, u_a=ufloat(0., 3.), u_b=ufloat(0., 3.), u_c=ufloat(0., 3.), **kwargs):
        """
        Create a `SpecConstants` object based on experimental parameters. The minimal
        input contains a set of rotational constants, A, B, and C, and optional args
        are the dipole moments along each axis. In the case of the rotational constants,
        the expected inputs are actually strings (in MHz), representing value(uncertainty),
        for example:
        
        A = "5623.562(102)"
        
        This reflects the same notation as what SPFIT provides. The advantage of being
        able to provide uncertainties is the ability to set priors for poorly or not
        determined parameters. For example, if A is not determined, we provide an uniformative
        prior by setting a very large uncertainty for A:
        
        A = "8032(6000)"
        
        The `generate_samples` method will take uncertainties into account,
        and prepare samples for variational inference by the `MoleculeDetective` model.
        
        By default, the dipole moments are based on uninformative priors. If you know
        the value of one axis, or at least have a good idea of what it may be, or if you want
        to deactivate certain axes, you can specify them; the example below sets the
        b and c axes to zero, while "turning on" the a-dipole moment:
        
        from uncertainties import ufloat
        
        u_a = ufloat(1., 0.5)
        u_b = ufloat(0., 0.)
        u_c = ufloat(0., 0.)

        Parameters
        ----------
        A, B, C : str
            Principal rotational constants. The expected input for these are
            string representations of value(uncertainty), for example:
            "5962.3213(102)"

        u_a, u_b, u_c : ufloat, optional
            Dipole moments as `ufloat` objects, by default ufloat(0., 3.), and
            correspond to uninformative priors.
        """
        self.A = ufloat_fromstr(A)
        self.B = ufloat_fromstr(B)
        self.C = ufloat_fromstr(C)
        self.u_a = u_a
        self.u_b = u_b
        self.u_c = u_c
        # generate a name
        self.__name__ = get_monster()
        # Propagate uncertainty of kappa and delta based
        # on experimental constants
        self.kappa = kappa(self.A, self.B, self.C)
        self.delta = inertial_defect(self.A, self.B, self.C)
        # override defaults with user-specified values
        if kwargs:
            self.__dict__.update(**kwargs)
    
    def __repr__(self):
        return pprint.PrettyPrinter(indent=4).pformat(self.__dict__)
    
    def __call__(self, N=1000):
        return self.generate_samples(N)
    
[docs]    def save(self, path=None):
        """
        Save the molecule to disk using Pickle in `joblib`. 

        Parameters
        ----------
        path : str, optional
            Name to save the molecule to, not including the
            extension ".pkl", by default None
        """
        if path is None:
            path = f"{self.__name__}.pkl"
        dump(self, path)
    
[docs]    @classmethod
    def load(cls, path):
        return load(path)
        
[docs]    def generate_samples(self, N: int):
        """
        Function to generate samples of spectroscopic parameters, based on
        "diagonal" Gaussians. The nominal value and standard deviations of
        each parameter are used parameterize a Gaussian, and `N` random 
        samples are drawn. In the case of the dipole moments, we take the
        absolute value of the samples, and delta and kappa are recalculated
        based on the drawn A, B, C.
        
        TODO - Make this code look cleaner; there must be a smarter way to sample

        Parameters
        ----------
        N : int
            Number of samples to generate.

        Returns
        -------
        samples
            2D np.ndarray, where columns correspond to parameter, and rows
            are samples
        """
        A = np.abs(np.random.normal(self.A.nominal_value, self.A.std_dev, size=(N)))
        B = np.abs(np.random.normal(self.B.nominal_value, self.B.std_dev, size=(N)))
        C = np.abs(np.random.normal(self.C.nominal_value, self.C.std_dev, size=(N)))
        u_a = np.abs(np.random.normal(self.u_a.nominal_value, self.u_a.std_dev, size=(N)))
        u_b = np.abs(np.random.normal(self.u_b.nominal_value, self.u_b.std_dev, size=(N)))
        u_c = np.abs(np.random.normal(self.u_c.nominal_value, self.u_c.std_dev, size=(N)))
        # calculate derived parameters
        asymmetry = kappa(A, B, C)
        delta = inertial_defect(A, B, C)
        samples = np.vstack([A, B, C, asymmetry, delta, u_a, u_b, u_c]).T
        # We need to make sure physics is constrained: A >= B >= C
        (valid_mask,) = np.where((A >= B) & (B >= C))
        return samples[valid_mask]
    

[docs]class MoleculeResult:
    
    func_encoding = [
        "Aliphatic",
        "Allene",
        "Vinyl",
        "Alkyne",
        "Carbonyl (General)",
        "Carbonyl (α-nitrogen)",
        "Carbonyl (α-carbon)",
        "Aldehyde",
        "Amide",
        "Ketone",
        "Ether",
        "Amine",
        "Amino acid",
        "Nitrate",
        "Nitrile",
        "Isonitrile",
        "Nitro",
        "Alcohol",
        "Alcohol (Carboxylic acid)",
        "Enol",
        "Phenol",
        "Peroxide",
        "Aromatic sp2 carbon"
    ]
    
    def __init__(self, eigenspectrum, formulas, functional_groups):
        self.eigenspectrum = eigenspectrum
        self.formulas = pd.DataFrame(formulas, columns=["H", "C", "O", "N"])
        self.functional_groups = pd.DataFrame(functional_groups, columns=self.func_encoding)
    
[docs]    def analyze(self, q=(0.025, 0.5, 0.975)):
        """
        Convenience function to compute some summary statistics, and make interactive
        plotly figures.

        Parameters
        ----------
        q : tuple, optional
            [description], by default (0.025, 0.5, 0.975)

        Returns
        -------
        fig
            Plotly Figure object
        
        results
            dict containing summary statistics of the formula/functional
            predictions.
        """
        # covariance matrix of the formulas and functional group predictions
        formula_covar = np.cov(self.formulas, rowvar=False)
        functional_covar = np.cov(self.functional_groups, rowvar=False)
        # calculate the quantile ranges to report as marginal uncertainties
        # by default, the specified range is the median, and the edges are
        # the 95% highest posterior density
        formula_q = np.quantile(self.formulas, q, axis=0)
        functional_q = np.quantile(self.functional_groups, q, axis=0)
        # package the statistical summary
        results = {
            "formula": {"covariance": formula_covar, "quantile": formula_q},
            "functional": {"covariance": functional_covar, "quantile": functional_q}
        }
        
        fig = go.Figure()
        for index, atom in enumerate(["H", "C", "O", "N"]):
            fig.add_trace(
                go.Violin(
                    x=[atom,] * len(self.formulas),
                    y=self.formulas[atom],
                    name=atom,
                    meanline_visible=True,
                    opacity=0.6,
                )
            )
        fig.update_layout(title_text="Predicted formula", xaxis_title="Atom", yaxis_title="Number")
        return fig, results


[docs]class MoleculeDetective:
    def __init__(self, weights_path=None, device="cpu", **kwargs):
        self.model = torch_models.VarMolDetect.load_weights(weights_path, device, **kwargs)
        # set to inference mode
        self.model.eval()
        # For taking care of moving data around
        self.device = self.model.device
    
    def __call__(self, specconst_obj: "SpecConstants", composition: int, N=1000):
        return self.run_inference(specconst_obj, composition, N)
    
[docs]    def run_inference(self, specconst_obj: "SpecConstants", composition=None, N=1000):
        """
        Use a pre-trained PyTorch model to perform inference, conditional on the
        experimental constants and the expected composition. This framework can
        be used to account for various forms of uncertainties, and the default
        behavior is to provide the minimum amount of information. For example,
        the `composition` argument can be provided as an `int` type representing:
        
        [0: hydrocarbon, 1: oxygen-bearing, 2: nitrogen-bearing, 3: ON-bearing]
        
        By default, `composition` is None; the case where we don't know what
        the composition is, in which case we will randomly try all four compositions.
        
        This implementation is set up to take advantage of performance in `torch`. Rather
        than repeatedly call a function a la MCMC sampling, we simply pass an entire tensor
        of all the samples so that `torch` can run without Python interaction. You can
        think of each row as a "minibatch".
        
        The constants are converted to GHz, and therefore expected as MHz.

        Parameters
        ----------
        specconst_obj : [type]
            `SpecConstants` object, which will generate random samples
            based on the experimental uncertainties.
        composition : int or None (default), optional
            The expected composition of the molecule. When `composition` is
            an integer, inference is performed conditional on the specific
            composition; the definitions are provided in the docstring.
            If `None` (default), we have no prior knowledge of the composition,
            and will randomly test all four.
            
        N : int, optional
            Number of samples to run, by default 1000

        Returns
        -------
        eigenspectrum, formula, functional
            NumPy arrays corresponding to the predicted eigenspectrum,
            molecular formula, and functional groups present. `functional`
            is output as log sigmoid by the model, and this function
            returns the exponential to get back sigmoid likelihoods.
        """
        samples = specconst_obj(N)
        # convert to GHz
        samples[:,:3] /= 1000.
        if composition is None:
            composition = np.random.randint(low=0, high=4, size=len(samples))
            compositions = np.zeros((len(samples), 4))
            compositions[:,composition] = 1
        else:
            # generate one-hot encoding for the composition
            compositions = np.zeros((len(samples), max(4, composition)))
            compositions[:, composition] = 1
        # Type cast Tensors into float32 from NumPy arrays, and move to
        # CPU or GPU
        samples = torch.from_numpy(samples).float().to(self.device)
        compositions = torch.from_numpy(compositions).float().to(self.device)
        # Run the model
        with torch.no_grad():
            pred, latent = self.model(samples, compositions)
        # unpack predictions
        eigenspectrum, formula, functional = pred
        result = MoleculeResult(eigenspectrum.numpy(), formula.numpy(), functional.numpy())
        return result