Source code for gptchem.data

import pandas as pd
import pystow



[docs]
def get_photoswitch_data() -> pd.DataFrame:
    """Load the photoswitch dataset, de-duplicated by SMILES.

    The CSV is obtained through pystow's ``ensure_csv`` in the ``gptchem``
    module and parsed as comma-separated. Rows repeating a value in the
    ``SMILES`` column are dropped and the index is rebuilt afterwards.

    Returns:
        The photoswitch table with one row per ``SMILES`` value and a
        freshly numbered index.

    References:
        [GriffithsPhotoSwitches] `Griffiths, K.; Halcovitch, N. R.; Griffin, J. M. Efficient Solid-State Photoswitching of Methoxyazobenzene in a Metal–Organic Framework for Thermal Energy Storage. Chemical Science 2022, 13 (10), 3014–3019. <https://doi.org/10.1039/d2sc00632d>`_
    """
    storage = pystow.module("gptchem")
    raw = storage.ensure_csv(
        "photoswitches",
        url="https://www.dropbox.com/s/z5z9z944cc060x9/photoswitches.csv?dl=1",
        read_csv_kwargs={"sep": ","},
    )
    # Keep a single row per SMILES string, then renumber the survivors.
    unique = raw.drop_duplicates(subset=["SMILES"])
    return unique.reset_index(drop=True)




[docs]
def get_polymer_data() -> pd.DataFrame:
    """Load the polymer dataset reported in [JablonkaAL]_.

    The CSV is obtained through pystow's ``ensure_csv`` in the ``gptchem``
    module and parsed as comma-separated.

    Returns:
        The polymer table with a freshly numbered index.
    """
    storage = pystow.module("gptchem")
    table = storage.ensure_csv(
        "polymer",
        url="https://www.dropbox.com/s/rpximatxlb8igl9/polymers.csv?dl=1",
        read_csv_kwargs={"sep": ","},
    )
    return table.reset_index(drop=True)




[docs]
def get_moosavi_mof_data() -> pd.DataFrame:
    """Load the data and features used in [MoosaviDiversity]_.

    The original datasets are available on the
    `MaterialsCloud archive <https://archive.materialscloud.org/record/2020.67>`_.

    The MOFid [BuciorMOFid]_ was additionally computed for each MOF. Rows
    repeating a value in the ``mofid`` column are dropped on load.

    Returns:
        The table with one row per ``mofid`` and a freshly numbered index.
    """
    storage = pystow.module("gptchem")
    raw = storage.ensure_csv(
        "moosavi_core",
        url="https://www.dropbox.com/s/obfnx9fu73dqr3a/moosavi_core.csv?dl=1",
        read_csv_kwargs={"sep": ","},
    )
    # One row per MOFid, then renumber the remaining rows.
    unique = raw.drop_duplicates(subset="mofid")
    return unique.reset_index(drop=True)




[docs]
def get_moosavi_cv_data() -> pd.DataFrame:
    """Load the gravimetric heat capacity data used in [MoosaviCp]_.

    The original datasets are available on the
    `MaterialsCloud archive <https://doi.org/10.24435/materialscloud:p1-2y>`_.

    The MOFid [BuciorMOFid]_ was additionally computed for each MOF, and
    entries for which it could not be computed were dropped. Rows repeating
    a value in the ``mofid`` column are removed on load.

    Returns:
        The table with one row per ``mofid`` and a freshly numbered index.
    """
    storage = pystow.module("gptchem")
    raw = storage.ensure_csv(
        "cv",
        url="https://www.dropbox.com/s/lncrftmdcgn1zdh/cv.csv?dl=1",
        read_csv_kwargs={"sep": ","},
    )
    # One row per MOFid, then renumber the remaining rows.
    unique = raw.drop_duplicates(subset="mofid")
    return unique.reset_index(drop=True)




[docs]
def get_moosavi_pcv_data() -> pd.DataFrame:
    """Load the site-projected heat capacity and features used in [MoosaviCp]_.

    The original datasets are available on the
    `MaterialsCloud archive <https://doi.org/10.24435/materialscloud:p1-2y>`_.

    The MOFid [BuciorMOFid]_ was additionally computed for each MOF, and
    entries for which it could not be computed were dropped.

    Returns:
        The table with a freshly numbered index.
    """
    storage = pystow.module("gptchem")
    table = storage.ensure_csv(
        "pcv",
        url="https://www.dropbox.com/s/r4fub4i9nadt1kc/pcv.csv?dl=1",
        read_csv_kwargs={"sep": ","},
    )
    return table.reset_index(drop=True)




[docs]
def get_qmug_data() -> pd.DataFrame:
    """Load the data and features used in [QMUG]_.

    The numerical data were mean-aggregated per SMILES, and SELFIES and
    InChI were additionally computed.

    Returns:
        The table with a freshly numbered index.
    """
    storage = pystow.module("gptchem")
    table = storage.ensure_csv(
        "qmug",
        url="https://www.dropbox.com/s/6pk0ohy5agqwe3q/qmugs.csv?dl=1",
        read_csv_kwargs={"sep": ","},
    )
    return table.reset_index(drop=True)




[docs]
def get_qmug_small_data() -> pd.DataFrame:
    """Load the short-SMILES subset of the data used in [QMUG]_.

    The numerical data were mean-aggregated per SMILES, and SELFIES and
    InChI were additionally computed.

    Returns:
        The subset table with a freshly numbered index.
    """
    storage = pystow.module("gptchem")
    table = storage.ensure_csv(
        "qmug_small",
        url="https://www.dropbox.com/s/wkkrpfb2ash23a2/qmugs_small.csv?dl=1",
        read_csv_kwargs={"sep": ","},
    )
    return table.reset_index(drop=True)




[docs]
def get_hea_phase_data() -> pd.DataFrame:
    """Load the dataset reported in [Pei]_.

    The CSV is obtained through pystow's ``ensure_csv`` in the ``gptchem``
    module and parsed as comma-separated.

    Returns:
        The table with a freshly numbered index.
    """
    storage = pystow.module("gptchem")
    table = storage.ensure_csv(
        "hea",
        url="https://www.dropbox.com/s/4edwffuajclxa5h/hea_phase.csv?dl=1",
        read_csv_kwargs={"sep": ","},
    )
    return table.reset_index(drop=True)




[docs]
def get_opv_data() -> pd.DataFrame:
    """Load the dataset reported in [NagasawaOPV]_.

    The CSV is obtained through pystow's ``ensure_csv`` in the ``gptchem``
    module and parsed as comma-separated.

    Returns:
        The table with a freshly numbered index.
    """
    storage = pystow.module("gptchem")
    table = storage.ensure_csv(
        "opv",
        url="https://www.dropbox.com/s/a45eu1xw0zkyrmc/opv.csv?dl=1",
        read_csv_kwargs={"sep": ","},
    )
    return table.reset_index(drop=True)




[docs]
def get_esol_data() -> pd.DataFrame:
    """Load the dataset reported in [ESOL]_.

    The CSV is obtained through pystow's ``ensure_csv`` in the ``gptchem``
    module and parsed as comma-separated.

    Returns:
        The table with a freshly numbered index.
    """
    storage = pystow.module("gptchem")
    table = storage.ensure_csv(
        "esol",
        url="https://www.dropbox.com/s/teqmkvl7v22bfox/esol.csv?dl=1",
        read_csv_kwargs={"sep": ","},
    )
    return table.reset_index(drop=True)




[docs]
def get_solubility_test_data() -> pd.DataFrame:
    """Load the solubility test set reported in [soltest]_.

    The CSV is obtained through pystow's ``ensure_csv`` in the ``gptchem``
    module and parsed as comma-separated.

    Returns:
        The table with a freshly numbered index.
    """
    storage = pystow.module("gptchem")
    table = storage.ensure_csv(
        "solubility",
        url="https://www.dropbox.com/s/xeg02ulael9akhf/solubility_test_set.csv?dl=1",
        read_csv_kwargs={"sep": ","},
    )
    return table.reset_index(drop=True)




[docs]
def get_doyle_rxn_data() -> pd.DataFrame:
    """Load the reaction dataset reported in [Doyle]_.

    The CSV is obtained through pystow's ``ensure_csv`` in the ``gptchem``
    module and parsed as comma-separated.

    Returns:
        The table with a freshly numbered index.
    """
    storage = pystow.module("gptchem")
    table = storage.ensure_csv(
        "doyle_rxn",
        url="https://www.dropbox.com/s/gjxatqagwh3cwb6/dreher_doyle.csv?dl=1",
        read_csv_kwargs={"sep": ","},
    )
    return table.reset_index(drop=True)




[docs]
def get_suzuki_rxn_data() -> pd.DataFrame:
    """Load the reaction dataset reported in [Suzuki]_.

    The CSV is obtained through pystow's ``ensure_csv`` in the ``gptchem``
    module and parsed as comma-separated.

    Returns:
        The table with a freshly numbered index.
    """
    storage = pystow.module("gptchem")
    table = storage.ensure_csv(
        "suzuki_rxn",
        url="https://www.dropbox.com/s/0uv38jgrj2k33u7/suzuki_dreher.csv?dl=1",
        read_csv_kwargs={"sep": ","},
    )
    return table.reset_index(drop=True)




[docs]
def get_freesolv_data() -> pd.DataFrame:
    """Load the FreeSolv data [freesolv]_.

    The CSV is obtained through pystow's ``ensure_csv`` in the ``gptchem``
    module and parsed as comma-separated.

    Returns:
        The table with a freshly numbered index.
    """
    storage = pystow.module("gptchem")
    table = storage.ensure_csv(
        "freesolv",
        url="https://www.dropbox.com/s/rnin1zyuat3miyp/free_solv.csv?dl=1",
        read_csv_kwargs={"sep": ","},
    )
    return table.reset_index(drop=True)




[docs]
def get_lipophilicity_data() -> pd.DataFrame:
    """Load the lipophilicity data parsed from ChEMBL [chembl]_.

    The CSV is obtained through pystow's ``ensure_csv`` in the ``gptchem``
    module and parsed as comma-separated.

    Returns:
        The table with a freshly numbered index.
    """
    storage = pystow.module("gptchem")
    table = storage.ensure_csv(
        "lipophilicity",
        url="https://www.dropbox.com/s/secesuqvqrdexz4/lipophilicity.csv?dl=1",
        read_csv_kwargs={"sep": ","},
    )
    return table.reset_index(drop=True)




[docs]
def get_mof_solvent_data() -> pd.DataFrame:
    """Load the MOF reaction data.

    The CSV is obtained through pystow's ``ensure_csv`` in the ``gptchem``
    module and parsed as comma-separated.

    NOTE(review): the previous docstring ended in an empty citation; the
    literature reference for this dataset still needs to be filled in.

    Returns:
        The table with a freshly numbered index.
    """
    storage = pystow.module("gptchem")
    table = storage.ensure_csv(
        "mof_rxn",
        url="https://www.dropbox.com/s/jon75f9duukqm36/mof_yield_gpt3.csv?dl=1",
        read_csv_kwargs={"sep": ","},
    )
    return table.reset_index(drop=True)




[docs]
def get_matbench_glass() -> pd.DataFrame:
    """Load the glass formation ability dataset from matbench [matbench]_.

    The CSV is obtained through pystow's ``ensure_csv`` in the ``gptchem``
    module and parsed as comma-separated.

    Returns:
        The table with a freshly numbered index.
    """
    storage = pystow.module("gptchem")
    table = storage.ensure_csv(
        "matbench_glass",
        url="https://www.dropbox.com/s/f2o06xdw2ri5bc0/gfa.csv?dl=1",
        read_csv_kwargs={"sep": ","},
    )
    return table.reset_index(drop=True)




[docs]
def get_matbench_is_metal() -> pd.DataFrame:
    """Load the is-metal dataset from matbench [matbench]_.

    The CSV is obtained through pystow's ``ensure_csv`` in the ``gptchem``
    module and parsed as comma-separated.

    Returns:
        The table with a freshly numbered index.
    """
    storage = pystow.module("gptchem")
    table = storage.ensure_csv(
        "matbench_is_metal",
        url="https://www.dropbox.com/s/h9dprz801vsdyhy/is_metal.csv?dl=1",
        read_csv_kwargs={"sep": ","},
    )
    return table.reset_index(drop=True)




[docs]
def get_matbench_expt_gap() -> pd.DataFrame:
    """Load the experimental band gap dataset from matbench [matbench]_.

    The CSV is obtained through pystow's ``ensure_csv`` in the ``gptchem``
    module and parsed as comma-separated.

    Returns:
        The table with a freshly numbered index.
    """
    storage = pystow.module("gptchem")
    table = storage.ensure_csv(
        "matbench_expt_gap",
        url="https://www.dropbox.com/s/4iqnhf9nui0dk7e/expt_gap.csv?dl=1",
        read_csv_kwargs={"sep": ","},
    )
    return table.reset_index(drop=True)




[docs]
def get_matbench_steels() -> pd.DataFrame:
    """Load the steel yield strength dataset from matbench [matbench]_.

    The CSV is obtained through pystow's ``ensure_csv`` in the ``gptchem``
    module and parsed as comma-separated.

    Returns:
        The table with a freshly numbered index.
    """
    storage = pystow.module("gptchem")
    table = storage.ensure_csv(
        "matbench_steels",
        url="https://www.dropbox.com/s/7cf330um2a47v3c/steels.csv?dl=1",
        read_csv_kwargs={"sep": ","},
    )
    return table.reset_index(drop=True)




[docs]
def get_water_stability() -> pd.DataFrame:
    """Load the water stability dataset used in [waterStability]_.

    The CSV is obtained through pystow's ``ensure_csv`` in the ``gptchem``
    module and parsed as comma-separated.

    Returns:
        The table with a freshly numbered index.
    """
    storage = pystow.module("gptchem")
    table = storage.ensure_csv(
        "mof_water_stability",
        url="https://www.dropbox.com/s/87qpe16lu6nmm1d/water_stability.csv?dl=1",
        read_csv_kwargs={"sep": ","},
    )
    return table.reset_index(drop=True)