Source code for pyspectools.qchem.heat_analysis

"""
heat_analysis.py

Routines for performing batch analysis on HEAT345(Q) calculations.
The scripts are hardcoded to follow the file structure produced by
the heat345.py scripts, i.e.

top/
|---heat345.py
|---heat345.yaml
|---zmat.yaml
|---correlation/
    |---AUG-PCVXZ/
        |---calcY-correlation-AUG-PCVXZ.log

and so on.
"""

import os
from glob import glob
import logging
import numpy as np
import pandas as pd
from scipy.constants import Avogadro

# Additional modules written by yours truly
from pyspectools.qchem import parsers
from pyspectools.qchem import extrapolation as ep
from pyspectools import routines

"""Analysis functions

These functions were designed for use with the HEAT pandas dataframes
"""

def calc_enthalpy(dataframe, reactants, products):
    # Calculate the enthalpy of reaction between lists of reactants and
    # products. The value is returned in kJ/mol, which is calculated
    # based on the explicit value of hartree to joule from NIST
    E_prod = np.sum([dataframe.loc[ID]["Total"] for ID in products])
    E_reac = np.sum([dataframe.loc[ID]["Total"] for ID in reactants])
    return (E_prod - E_reac) * (Avogadro * 4.359744650e-21)
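
# A worked example of the conversion above: the energy difference comes out of
# the dataframe in hartree, and Avogadro * 4.359744650e-21 is roughly 2625.5,
# i.e. the number of kJ/mol per hartree. The species names below are
# hypothetical and assume ``heat_df`` was produced by heat_analysis(), indexed
# by molecule name with a "Total" column:
#
#     dH = calc_enthalpy(heat_df, reactants=["CH", "H2"], products=["CH2", "H"])
#     print("Reaction enthalpy: {:.1f} kJ/mol".format(dH))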

def calc_reaction(dataframe, reactants, products, reactions):
    # A basic function for building sequential reactions
    # Could be useful for PES building
    energy = calc_enthalpy(dataframe, reactants, products)
    react_name = " + ".join(reactants) + "-->" + " + ".join(products)
    print(react_name + ": " + str(energy) + " kJ/mol")
    reactions.append([react_name, energy])
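
# A minimal sketch of chaining several steps into a PES table; the species
# names are hypothetical, and ``heat_df`` is again assumed to come from
# heat_analysis():
#
#     reactions = list()
#     calc_reaction(heat_df, ["CH", "H2"], ["CH2", "H"], reactions)
#     calc_reaction(heat_df, ["CH2", "H"], ["CH3"], reactions)
#     pes_df = pd.DataFrame(reactions, columns=["Reaction", "Energy (kJ/mol)"])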

def get_hot(dataframe, species):
    # Return the HEAT345(Q) energy for one molecule
    return dataframe.loc[species]["Total"]

def relative_energies(df, base, molecules):
    # Returns a dataframe with all the energetics relative to
    # a single reference channel.
    # base is a list of reactants, while molecules is a nested
    # list of product channels
    energetics = list()
    names = list()
    if base not in molecules:
        molecules.append(base)
    for pair in molecules:
        # Loop over all the pairs and calculate the relative energy
        energetics.append(calc_enthalpy(df, base, pair))
        names.append(" + ".join(pair))
    new_df = pd.DataFrame(
        energetics, index=names, columns=["Relative energy (kJ/mol)"]
    )
    new_df["Coordinate"] = np.arange(len(molecules))
    return new_df
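
# Example usage (hypothetical species): ``base`` is the reference channel and
# every entry of ``molecules`` is another channel whose energy is reported
# relative to it:
#
#     rel_df = relative_energies(
#         heat_df,
#         base=["CH", "H2"],
#         molecules=[["CH2", "H"], ["CH3"]],
#     )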
""" Low-level functions These functions are called by the higher level wrappers for performing the analysis. """

def analysis_lookup(method):
    """
    Returns the analysis function associated with the string-based
    name of a HEAT contribution.
    """
    if method == "ZPE":
        func = anal_zpe
    elif method == "MVD":
        func = anal_mvd
    elif method == "DBOC":
        func = anal_dboc
    elif method == "CCSD(T)":
        func = anal_correlation
    elif method == "HLC":
        func = anal_hlc
    elif method == "(Q)":
        func = anal_q
    else:
        raise KeyError("Unrecognized HEAT contribution: " + method)
    return func

def anal_zpe(data_dict):
    """
    Function for parsing the ZPE out of the raw data dictionary.
    Takes the full dictionary as input, and returns the ZPE.
    """
    zpe = 0.
    # Different syntax for different CFOUR function calls
    for word in ["frequency", "freq"]:
        try:
            zpe = data_dict[word]["zpe"] / (4.12 * 627.509)
        except KeyError:
            pass
    return zpe

def anal_mvd(data_dict):
    """
    Function for parsing the MVD1 + MVD2 out of the raw data dictionary.
    Takes the full dictionary as input, and returns the relativistic
    contributions to the energy.
    """
    return data_dict["rel"]["relativistic"]

def anal_dboc(data_dict):
    """
    Returns the DBOC value
    """
    return data_dict["dboc"]["dboc"]

def anal_q(data_dict):
    """
    Try and get the CCSDT(Q) energy
    """
    try:
        q_dict = data_dict["ccsdtq"]
        q_energy = q_dict["ccsdt(q) energy"] - q_dict["ccsdt energy"]
    except KeyError:
        q_energy = 0.
    return q_energy

def anal_correlation(data_dict):
    """
    Function for extrapolating the correlation and SCF energies to the
    CBS limit using their respective schemes.

    Takes the full dictionary as input, and returns the SCF/CBS and
    CCSD(T)/CBS energies as floats
    """
    # Get the basis names from dictionary keys
    corr_basis = [name for name in data_dict.keys() if "correlation" in name]
    cardinals = list()
    bases = list()
    corr_energies = list()
    scf_energies = list()
    # This loop is deliberately written this way so that the ordering
    # is preserved
    for basis in corr_basis:
        basis_str = basis.split("-")[-1]
        basis_X = match_basis(basis_str)
        corr_energies.append(data_dict[basis]["ccsd(t) energy"])
        scf_energies.append(data_dict[basis]["scf energy"])
        cardinals.append(basis_X)
        bases.append(basis_str)
    # Package into pandas df
    extrap_df = pd.DataFrame(
        data=list(zip(bases, cardinals, scf_energies, corr_energies)),
        columns=["Basis", "Cardinal", "SCF energy", "CCSD(T) energy"]
    )
    # Not necessary, but makes things nicer
    extrap_df.sort_values(["Cardinal"], ascending=True, inplace=True)
    # Extrapolate SCF to CBS limit
    scf_cbs, scf_popt = ep.extrapolate_SCF(
        extrap_df["SCF energy"],
        extrap_df["Cardinal"]
    )
    # Extrapolate CCSD(T) correlation to CBS limit
    cc_cbs, cc_popt = ep.extrapolate_correlation(
        extrap_df["CCSD(T) energy"],
        extrap_df["Cardinal"]
    )
    return scf_cbs, cc_cbs, extrap_df
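
# Illustration of the dictionary layout this function expects; the keys follow
# the "<calctype>-<basis>" naming produced by read_heat, and the energies
# below are placeholders, not real values:
#
#     data_dict = {
#         "correlation-AUG-PCVTZ": {"scf energy": -75.0, "ccsd(t) energy": -75.3},
#         "correlation-AUG-PCVQZ": {"scf energy": -75.1, "ccsd(t) energy": -75.4},
#     }
#     scf_cbs, cc_cbs, extrap_df = anal_correlation(data_dict)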

def anal_hlc(data_dict):
    """
    Function for analyzing the non-perturbative corrections to the
    triple excitations. Takes the frozen-core CCSDT and CCSD(T) values
    at two basis sets and extrapolates them to the CBS limit.

    The end result is the difference between CCSDT/CBS and CCSD(T)/CBS.

    This function returns the correction, as well as the dataframes
    holding the answers.
    """
    # Get HLC perturbative terms
    pert_basis = [key for key in data_dict if "hlc-pert" in key]
    cardinals = list()
    pert_energies = list()
    for basis in pert_basis:
        basis_str = basis.split("-")[-1]
        basis_X = match_basis(basis_str)
        cardinals.append(basis_X)
        scf = data_dict[basis]["scf energy"]
        corr = data_dict[basis]["ccsd(t) energy"]
        pert_energies.append(corr - scf)
    # Package the perturbative terms and extrapolate
    pert_df = pd.DataFrame(
        data=list(zip(cardinals, pert_energies)),
        columns=["Cardinal", "CCSD(T) energy"]
    )
    pert_cbs, pert_popt = ep.extrapolate_correlation(
        pert_df["CCSD(T) energy"], pert_df["Cardinal"]
    )
    # Get HLC non-perturbative terms
    pert_basis = [key for key in data_dict if "hlc-full" in key]
    cardinals = list()
    pert_energies = list()
    for basis in pert_basis:
        basis_str = basis.split("-")[-1]
        basis_X = match_basis(basis_str)
        cardinals.append(basis_X)
        corr = data_dict[basis]["ccsd(t) energy"]
        pert_energies.append(corr)
    # Package the non-perturbative terms and extrapolate
    nonpert_df = pd.DataFrame(
        data=list(zip(cardinals, pert_energies)),
        columns=["Cardinal", "CCSDT energy"]
    )
    nonpert_cbs, nonpert_popt = ep.extrapolate_correlation(
        nonpert_df["CCSDT energy"], nonpert_df["Cardinal"]
    )
    correction = nonpert_cbs - pert_cbs
    return correction, pert_df, nonpert_df

def heat_analysis(mol_name, data_dict, methods=None, logger=None):
    """
    Main driver function for analyzing HEAT contributions. This version
    has been written to be completely modular with respect to the
    contributions that it can take.

    Loops through a list of analysis functions and spits out a dataframe
    containing all of the resulting analysis.
    """
    if os.path.isdir("outputs") is False:
        os.mkdir("outputs")
    if methods is None:
        # If not specified, just default to the bare minimum
        methods = [
            "ZPE",
            "CCSD(T)",
        ]
    results = dict()
    heat_energy = 0.
    for method in methods:
        # Get the function from string
        anal_func = analysis_lookup(method)
        # If the contribution is just a single value, add
        # straight to the dictionary
        if method in ["(Q)", "DBOC", "ZPE", "MVD"]:
            results[method] = anal_func(data_dict)
            if logger:
                logger.info(method + str(results[method]))
        elif method == "CCSD(T)":
            # Deal with the correlation
            scf_cbs, cc_cbs, extrap_df = anal_func(data_dict)
            results["SCF/CBS"] = scf_cbs
            results["CCSD(T)/CBS"] = cc_cbs
            extrap_df.to_csv("outputs/" + mol_name + "-SCF-CC.csv")
            if logger:
                logger.info("SCF/CC data")
                logger.info(extrap_df)
        elif method == "HLC":
            hlc, pert_df, nonpert_df = anal_func(data_dict)
            results["HLC"] = hlc
            pert_df.to_csv("outputs/" + mol_name + "-(T).csv")
            nonpert_df.to_csv("outputs/" + mol_name + "-T.csv")
            if logger:
                logger.info("Perturbative triple excitations")
                logger.info(pert_df)
                logger.info("Non-perturbative triple excitations")
                logger.info(nonpert_df)
    # Sum up all of the contributions
    for key, value in results.items():
        heat_energy += value
    results["Total"] = heat_energy
    if logger:
        logger.info("Final energy: " + str(heat_energy))
    heat_df = pd.DataFrame.from_dict([results], orient="columns")
    heat_df.index = [mol_name]
    heat_df.to_csv("heat_analysis/" + mol_name + "-HEAT.csv")
    return heat_df
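
# Example call, assuming ``data_dict`` and ``logger`` came from read_heat();
# the molecule name is hypothetical, and the recognized method strings are the
# ones handled by analysis_lookup ("ZPE", "MVD", "DBOC", "CCSD(T)", "HLC",
# "(Q)"):
#
#     heat_df = heat_analysis(
#         "CH2", data_dict,
#         methods=["ZPE", "CCSD(T)", "HLC", "(Q)"],
#         logger=logger,
#     )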

def read_heat(dir_path):
    """
    Function to parse in all of the calculations that make up the
    HEAT scheme.

    Args: dir_path, path to top directory of the HEAT calculation,
    which would end with the calcID.

    Returns a dictionary containing all of the parsed logfiles, along
    with the logger object used for the analysis.
    """
    # Folders for keeping all the analysis and logging outputs
    for folder in ["logs", "heat_analysis", "yml"]:
        if os.path.isdir(folder) is False:
            os.mkdir(folder)
    calcID = dir_path.split("/")[-1]
    logger = init_logger(calcID + "-HEAT.log")
    logger.info("HEAT345(Q) analysis of " + calcID)
    # Get the calculation folders
    dir_contents = glob(dir_path + "/*")
    dir_contents = [name for name in dir_contents if os.path.isdir(name) is True]
    logger.info("Present folders: ")
    logger.info(dir_contents)
    results = dict()
    # Main loop over the calculation types.
    # Perhaps there is a better way to code this up without explicit
    # hardcoding of if-cases, but for now this is how it'll be done.
    for calctype in dir_contents:
        calc_name = calctype.split("/")[-1]
        logger.info("Working on " + calctype)
        # For all cases other than the correlation, we have only one
        # calculation log file
        if "correlation" not in calctype and "hlc" not in calctype:
            try:
                logname = glob(calctype + "/*.log")[0]
                logger.info("Parsing " + logname)
                results[calc_name] = parsers.parse_cfour(logname)
            except IndexError:
                logger.info("Trouble parsing " + calctype)
        else:
            # There are multiple bases being used for correlation
            # and the HLC terms
            calc_list = glob(calctype + "/*/*.log")
            logger.info("Bases found: ")
            logger.info(calc_list)
            for basis in calc_list:
                # Get basis name from folder
                name = basis.split("/")[-2]
                results[calc_name + "-" + name] = parsers.parse_cfour(basis)
    logger.info("Done reading!")
    logger.info("Dumping results to " + calcID + "-parsed.yml")
    routines.dump_yaml("yml/" + calcID + "-parsed.yml", results)
    return results, logger

def analyze_molecule(mol_name, directory, methods=None):
    """
    Automate analysis of a molecule by pointing to a directory
    containing all of the relevant calculations, as well as providing
    an identifier.

    Optional argument is to specify what contributions to include in
    the calculation.

    Returns a dataframe containing the energy breakdown.
    """
    data_dict, logger = read_heat(directory)
    mol_df = heat_analysis(mol_name, data_dict, methods, logger)
    return mol_df
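
# Top-level usage sketch, run from the directory where the logs/, outputs/,
# yml/, and heat_analysis/ folders should live; the molecule name and
# calculation path are hypothetical and should point at the calcID folder
# laid out as in the module docstring:
#
#     mol_df = analyze_molecule("CH2", "calculations/calc1")
#     print(mol_df["Total"])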
""" Miscellaneous functions Logging and basis matching """

def init_logger(log_name):
    """
    Use `logging` module to provide debug data for HEAT analysis.
    All of the extraneous data (e.g. extrapolation coefficients, etc.)
    will be output with the logger.

    Required input is the name of the log file including file extension.

    Returns a logger object.
    """
    logger = logging.getLogger(log_name)
    logger.setLevel(logging.DEBUG)
    fh = logging.FileHandler("logs/" + log_name)
    fh.setLevel(logging.DEBUG)
    # Set up the formatter
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )
    fh.setFormatter(formatter)
    logger.addHandler(fh)
    return logger

def match_basis(basis):
    """
    Converts a string representation of the basis angular momenta to
    the corresponding cardinal number.
    """
    # Remove the supplemental functions from the string as they don't
    # refer to the cardinal number
    basis = basis.replace("AUG-PC", "")
    basis = basis.replace("P", "")
    basis = basis.replace("C", "")
    basis_dict = {
        "VDZ": 2.,
        "VTZ": 3.,
        "VQZ": 4.,
        "V5Z": 5.,
        "V6Z": 6.
    }
    return basis_dict[basis]
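
# Example: the augmentation and core-polarization prefixes are stripped before
# the lookup, so match_basis("PCVTZ") and match_basis("AUG-PCVTZ") both return
# 3., while match_basis("PCVQZ") returns 4.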

def check_analysis(analysis_dict, logger):
    """
    Function for performing small sanity checks on the analysis results.
    The basic idea is to do some value comparisons to make sure the
    extrapolations have been performed sensibly.

    Args: dictionary containing the analysis values and a reference
    to the logger object used for debugging.

    The function will only flag bad values.
    """
    if np.abs(analysis_dict["CCSD(T)/CBS"] - analysis_dict["HLC-(T)"]) > 0.4:
        logger.warning("CCSD(T)/CBS and HLC-(T) difference > 0.4 Ha")
    if np.abs(analysis_dict["(Q)"] - analysis_dict["T - (T)"]) > 0.01:
        logger.warning("Large difference in CCSDT(Q) and T - (T)")
    if (analysis_dict["SCF/CBS"] / analysis_dict["HEAT345(Q)"]) < 0.99:
        logger.warning("Large contribution of HEAT345(Q) to SCF/CBS")
    #if np.abs(analysis_dict["HLC-T"] - analysis_dict["HLC-(T)"]) > 0.01:
    #    logger.warning("Large difference between HLC terms")
    if analysis_dict["ZPE"] == 0.:
        logger.warning("No ZPE - better be an atom!")
    if np.abs(analysis_dict["(Q)"]) > 1.:
        logger.warning("CCSDT(Q) contribution is too large to be true, setting to 0.")
        analysis_dict["(Q)"] = 0.