Source code for pyspectools.fitting

"""
    Wrapper functions for lmfit.

    Each module of PySpectools should use these functions
    when fitting is required, rather than rewrite for each
    purpose.
"""

import lmfit
import numpy as np
import pandas as pd
import peakutils
from tqdm.autonotebook import tqdm
from scipy import sparse
from scipy.sparse.linalg import spsolve

from pyspectools import lineshapes


class PySpecModel(lmfit.models.Model):
    """
    Base class that wraps a line shape function in an `lmfit` `Model`
    and creates the associated `Parameters` object up front.
    """

    def __init__(self, function, **kwargs):
        super(PySpecModel, self).__init__(function, **kwargs)
        self.params = self.make_params()
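# Example usage (an illustrative sketch, not part of pyspectools): PySpecModel
# wraps any callable line shape and exposes the standard lmfit interface. The
# Gaussian below is defined inline purely for demonstration.
def _example_pyspecmodel():
    def gaussian(x, A, x0, w):
        # Simple unnormalized Gaussian profile
        return A * np.exp(-((x - x0) ** 2) / (2.0 * w ** 2))

    x = np.linspace(-1.0, 1.0, 500)
    y = gaussian(x, 1.0, 0.1, 0.05) + np.random.normal(0.0, 0.02, x.size)
    model = PySpecModel(gaussian)
    # make_params() has already populated model.params with A, x0, w
    model.params["A"].set(0.5, min=0.0)
    model.params["x0"].set(0.0)
    model.params["w"].set(0.1, min=1e-4)
    return model.fit(data=y, x=x, params=model.params)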
class FirstDerivLorentzian_Model(PySpecModel):
    """
    Child class of PySpecModel, which in turn inherits from the
    `lmfit` `Model` class. Gives the first derivative Lorentzian
    line shape profile for fitting.
    """

    def __init__(self, **kwargs):
        super(FirstDerivLorentzian_Model, self).__init__(
            lineshapes.first_deriv_lorentzian, **kwargs
        )
class SecDerivLorentzian_Model(PySpecModel):
    """
    Child class of PySpecModel, which in turn inherits from the
    `lmfit` `Model` class. Gives the second derivative Lorentzian
    line shape profile for fitting.
    """

    def __init__(self, **kwargs):
        super(SecDerivLorentzian_Model, self).__init__(
            lineshapes.sec_deriv_lorentzian, **kwargs
        )
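# Example usage (an illustrative sketch): the derivative line shape models
# behave like any other lmfit model. Their parameter names come from the
# signature of the underlying pyspectools.lineshapes function, so inspect
# model.params before setting initial guesses; the independent variable is
# assumed to be named x, as elsewhere in this module.
def _example_deriv_lorentzian(x, y):
    model = SecDerivLorentzian_Model()
    # Parameters are named after the lineshapes function arguments
    print(model.params)
    result = model.fit(data=y, x=x, params=model.params)
    print(result.fit_report())
    return result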
class BJModel(PySpecModel):
    """
    Model for fitting prolate/linear molecules.
    """

    def __init__(self, **kwargs):
        super(BJModel, self).__init__(calc_harmonic_transition, **kwargs)
class PairGaussianModel(PySpecModel):
    """
    Model for fitting a pair of Gaussian line shapes, e.g. a Doppler
    doublet, with a shared width and center.
    """

    def __init__(self, **kwargs):
        super(PairGaussianModel, self).__init__(
            lineshapes.pair_gaussian, independent_vars=["x"], **kwargs
        )
    def fit_pair(self, x, y, verbose=False):
        """
        Automatically locate a Doppler pair in the data and fit the
        pair of Gaussians to it, returning the lmfit result.
        """
        # Automatically find where the Doppler splitting is
        indexes = peakutils.indexes(y, thres=0.4, min_dist=10)
        # Take the positions of the two strongest peaks found
        best_two = indexes[np.argsort(y[indexes])][-2:]
        guess_center = np.average(x[best_two])
        guess_sep = np.std(x[best_two])
        # This calculates the amplitude of a Gaussian based on
        # the peak height, assuming a width of ~0.01
        prefactor = np.sqrt(2.0 * np.pi) * 0.01
        guess_amp = np.average(y[best_two]) * prefactor
        # Set the parameter guesses
        self.params["A1"].set(guess_amp)
        self.params["A2"].set(guess_amp)
        self.params["w"].set(0.005, min=0.0001, max=0.03)
        if np.isfinite(guess_sep):
            self.params["xsep"].set(
                guess_sep, min=guess_sep * 0.8, max=guess_sep * 1.2
            )
        else:
            self.params["xsep"].set(0.05)
        self.params["x0"].set(
            guess_center, min=guess_center - 0.6, max=guess_center + 0.6
        )
        if verbose:
            print("Peaks found: {}".format(x[best_two]))
            print("Initial parameters:")
            for key, value in self.params.items():
                print(key, value)
        results = self.fit(data=y, x=x, params=self.params)
        return results
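# Example usage (an illustrative sketch): fitting a synthetic Doppler doublet
# with PairGaussianModel.fit_pair. The doublet is built directly from two
# Gaussians here rather than from pyspectools.lineshapes.pair_gaussian, whose
# exact parameterization is not shown in this module.
def _example_fit_pair():
    x = np.linspace(-0.5, 0.5, 2000)
    w = 0.005
    # Two components split symmetrically about x0 = 0.0
    y = (
        np.exp(-((x + 0.025) ** 2) / (2.0 * w ** 2))
        + np.exp(-((x - 0.025) ** 2) / (2.0 * w ** 2))
        + np.random.normal(0.0, 0.02, x.size)
    )
    model = PairGaussianModel()
    return model.fit_pair(x, y, verbose=True)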
def rotor_energy(J, B, D=0.0):
    """
    Energy expression for a linear/prolate top with centrifugal
    distortion.

    parameters:
    ---------------
    J - integer quantum number
    B - rotational constant in MHz
    D - centrifugal distortion term in MHz

    returns:
    --------------
    state energy in MHz
    """
    return B * J * (J + 1) - D * J ** 2.0 * (J + 1) ** 2.0
def calc_harmonic_transition(J, B, D=0.0):
    """
    Calculate the J -> J - 1 transition frequency for a given
    upper state J, B, and D.

    parameters:
    --------------
    J - upper state quantum number
    B - rotational constant in MHz
    D - centrifugal distortion constant in MHz

    returns:
    --------------
    transition frequency in MHz
    """
    lower = rotor_energy(J - 1, B, D)
    upper = rotor_energy(J, B, D)
    return upper - lower
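# Worked example (illustrative): with B and D near the literature values for
# CO (B ~ 57635.97 MHz, D ~ 0.184 MHz), the J = 1 - 0 transition evaluates to
# 2B - 4D ~ 115271.2 MHz, close to the observed line.
def _example_co_transition():
    # rotor_energy(1, B, D) - rotor_energy(0, B, D) = 2B - 4D
    return calc_harmonic_transition(1, B=57635.97, D=0.184)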
def quant_check(value, threshold=0.001):
    """
    Check whether a value is within `threshold` of the nearest
    integer or half-integer.

    parameters:
    ---------------
    value - float (or array) to check
    threshold - float tolerance for deciding whether the value is
    close enough to an integer or half-integer

    returns:
    ---------------
    True if the value is close enough to an integer or half-integer,
    False otherwise.
    """
    # Round to the nearest 0.5 and compare against the original value
    nearest_half = np.round(value * 2.0) / 2.0
    return np.abs(nearest_half - value) <= threshold
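# Quick illustration of the half-integer behaviour of quant_check:
def _example_quant_check():
    print(quant_check(4.0003))  # True: within 0.001 of 4
    print(quant_check(4.5002))  # True: half-integers also pass
    print(quant_check(4.2))     # False: 0.2 from the nearest half-integer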
def harmonic_fitter(progressions, J_thres=0.01):
    """
    Sequentially fit every progression with a simple harmonic model
    defined by B and D. The "B" value here actually corresponds to
    B + C for a near-prolate, or 2B for a prolate top.

    There are a number of filters applied in order to minimize
    calculations that won't be meaningful - these parameters may have
    to be tuned for different test cases.

    Because the model is not actually quantized, J is represented as
    a float. To our advantage, this helps separate real (molecular)
    progressions from spurious ones: at least two-thirds of the J
    values must be close to an integer or half-integer for us to
    consider fitting.

    parameters:
    ---------------
    progressions - iterable containing arrays of progressions
    J_thres - optional argument corresponding to how close a value
    must be to an integer or half-integer

    returns:
    ---------------
    full_df - pandas DataFrame containing the fit results; columns
    are B, D, fit RMS, and pairs of columns corresponding to the
    fitted frequency and approximate J value.
    fit_objs - list of the successful lmfit ModelResult objects
    """
    BJ_fit_model = lmfit.models.Model(calc_harmonic_transition)
    params = BJ_fit_model.make_params()
    data = list()
    fit_objs = list()
    failed = list()
    for index, progression in tqdm(enumerate(progressions)):
        # Determine the approximate value of B based on
        # the differences between observed transitions
        approx_B = np.average(np.diff(progression))
        # Calculate the values of J that are assigned based on B
        J = (progression / approx_B) / 2.0
        if len(progression) >= 2:
            # Require at least two-thirds of the lines to be close to
            # an integer or half-integer J
            if np.sum(quant_check(J, J_thres)) >= len(progression) / 1.5:
                # Let B vary a bit
                params["B"].set(approx_B, min=approx_B * 0.9, max=approx_B * 1.1)
                # Constrain D to be less than 1 MHz
                params["D"].set(0.001, min=0.0, max=1.0)
                fit = BJ_fit_model.fit(
                    data=progression, J=J, params=params, fit_kws={"maxfev": 100}
                )
                # Only include progressions that can be fit successfully
                if fit.success is True:
                    # Calculate and record the fit RMS alongside the
                    # best-fit parameters
                    rms = np.sqrt(np.average(np.square(fit.residual)))
                    return_dict = dict()
                    return_dict["RMS"] = rms
                    return_dict.update(fit.best_values)
                    # Make columns for frequency and J
                    for i, frequency in enumerate(progression):
                        return_dict[i] = frequency
                        return_dict["J{}".format(i)] = J[i]
                    data.append(return_dict)
                    fit_objs.append(fit)
                else:
                    failed.append([index, fit.fit_report()])
            else:
                failed.append(index)
        else:
            return_dict = dict()
            return_dict["RMS"] = 0.0
            return_dict["B"] = approx_B
            # Reformat the frequencies and approximate J values
            for i, frequency in enumerate(progression):
                return_dict[i] = frequency
                return_dict["J{}".format(i)] = J[i]
            data.append(return_dict)
    full_df = pd.DataFrame(data=data)
    full_df.sort_values(["RMS", "B", "D"], ascending=False, inplace=True)
    return full_df, fit_objs
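# Example usage (an illustrative sketch): recovering harmonic parameters from
# a synthetic progression generated with calc_harmonic_transition. Note the
# assigned J values come out at roughly half the generating J, so the fitted
# "B" lands near twice the B used below, consistent with the docstring.
def _example_harmonic_fitter():
    B_true, D_true = 2500.0, 0.005
    progression = np.array(
        [calc_harmonic_transition(J, B_true, D_true) for J in range(5, 10)]
    )
    full_df, fit_objs = harmonic_fitter([progression])
    print(full_df[["B", "D", "RMS"]])
    return full_df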
def baseline_als(y, lam=1e4, p=0.01, niter=10):
    """
    Perform iterative baseline fitting using the asymmetric least
    squares algorithm.

    This refers to the paper:
    "Baseline Correction with Asymmetric Least Squares Smoothing" by
    Eilers and Boelens (2005).

    The code is taken from a Stack Overflow answer:
    https://stackoverflow.com/a/29185844

    According to the paper, the tested values are:
    0.001 <= p <= 0.1 for positive peaks
    1e2 <= lam <= 1e9

    Parameters:
    --------------
    y : numpy 1D array
        Data used to fit the baseline
    lam : float
        Tuning factor for the penalty function that offsets the
        difference cost function
    p : float
        Weighting factor for the cost function
    niter : int
        Number of iterations to run

    Returns:
    --------------
    z : numpy 1D array
        Array containing the baseline values
    """
    L = len(y)
    D = sparse.diags([1, -2, 1], [0, -1, -2], shape=(L, L - 2))
    # Initialize a set of weights
    w = np.ones(L)
    # Iterate for a set number of times to fit the baseline
    for i in range(niter):
        W = sparse.spdiags(w, 0, L, L)
        Z = W + lam * D.dot(D.transpose())
        z = spsolve(Z, w * y)
        # Re-weight: points above the baseline get weight p,
        # points below get weight 1 - p
        w = p * (y > z) + (1 - p) * (y < z)
    return z
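# Example usage (an illustrative sketch): subtracting a slowly varying
# baseline from a synthetic spectrum. The lam and p values fall within the
# ranges recommended by Eilers and Boelens for positive peaks.
def _example_baseline_als():
    x = np.linspace(0.0, 10.0, 1000)
    # A single Gaussian peak riding on a smooth, curved baseline
    signal = np.exp(-((x - 5.0) ** 2) / 0.02)
    baseline = 0.5 + 0.05 * x + 0.2 * np.sin(x / 3.0)
    y = signal + baseline + np.random.normal(0.0, 0.01, x.size)
    z = baseline_als(y, lam=1e5, p=0.01)
    return y - z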