import pandas as pd
import pystow
[docs]
def get_photoswitch_data() -> pd.DataFrame:
    """Return the photoswitch data as a pandas DataFrame.

    The CSV is obtained through the ``gptchem`` pystow module. Rows that
    repeat an already-seen ``SMILES`` value are dropped, and the index is
    rebuilt afterwards so that it is contiguous.

    Returns:
        pd.DataFrame: The de-duplicated photoswitch table.

    References:
        [GriffithsPhotoSwitches] `Griffiths, K.; Halcovitch, N. R.; Griffin, J. M. Efficient Solid-State Photoswitching of Methoxyazobenzene in a Metal–Organic Framework for Thermal Energy Storage. Chemical Science 2022, 13 (10), 3014–3019. <https://doi.org/10.1039/d2sc00632d>`_
    """
    storage = pystow.module("gptchem")
    raw = storage.ensure_csv(
        "photoswitches",
        url="https://www.dropbox.com/s/z5z9z944cc060x9/photoswitches.csv?dl=1",
        # The file is comma-separated; state the separator explicitly.
        read_csv_kwargs={"sep": ","},
    )
    unique_molecules = raw.drop_duplicates(subset=["SMILES"])
    return unique_molecules.reset_index(drop=True)
[docs]
def get_polymer_data() -> pd.DataFrame:
    """Return the dataset reported in [JablonkaAL]_.

    Returns:
        pd.DataFrame: The polymer table, read via the ``gptchem`` pystow
        module, with a freshly reset index.
    """
    storage = pystow.module("gptchem")
    table = storage.ensure_csv(
        "polymer",
        url="https://www.dropbox.com/s/rpximatxlb8igl9/polymers.csv?dl=1",
        read_csv_kwargs={"sep": ","},  # comma-separated file
    )
    return table.reset_index(drop=True)
[docs]
def get_moosavi_mof_data() -> pd.DataFrame:
    """Return the data and features used in [MoosaviDiversity]_.

    You can find the original datasets on `MaterialsCloud archive <https://archive.materialscloud.org/record/2020.67>`_.

    We additionally computed the MOFid [BuciorMOFid]_ for each MOF.

    Returns:
        pd.DataFrame: The table with rows de-duplicated on the ``mofid``
        column and the index reset.
    """
    storage = pystow.module("gptchem")
    raw = storage.ensure_csv(
        "moosavi_core",
        url="https://www.dropbox.com/s/obfnx9fu73dqr3a/moosavi_core.csv?dl=1",
        read_csv_kwargs={"sep": ","},  # comma-separated file
    )
    # Keep a single row per MOFid before renumbering the rows.
    one_per_mof = raw.drop_duplicates(subset="mofid")
    return one_per_mof.reset_index(drop=True)
[docs]
def get_moosavi_cv_data() -> pd.DataFrame:
    """Return the gravimetric heat capacity used in [MoosaviCp]_.

    You can find the original datasets on `MaterialsCloud archive <https://doi.org/10.24435/materialscloud:p1-2y>`_.

    We additionally computed the MOFid [BuciorMOFid]_ for each MOF
    and dropped entries for which we could not compute the MOFid.

    Returns:
        pd.DataFrame: The table with rows de-duplicated on the ``mofid``
        column and the index reset.
    """
    storage = pystow.module("gptchem")
    raw = storage.ensure_csv(
        "cv",
        url="https://www.dropbox.com/s/lncrftmdcgn1zdh/cv.csv?dl=1",
        read_csv_kwargs={"sep": ","},  # comma-separated file
    )
    # Keep a single row per MOFid before renumbering the rows.
    one_per_mof = raw.drop_duplicates(subset="mofid")
    return one_per_mof.reset_index(drop=True)
[docs]
def get_moosavi_pcv_data() -> pd.DataFrame:
    """Return the site-projected heat capacity and features used in [MoosaviCp]_.

    You can find the original datasets on `MaterialsCloud archive <https://doi.org/10.24435/materialscloud:p1-2y>`_.

    We additionally computed the MOFid [BuciorMOFid]_ for each MOF
    and dropped entries for which we could not compute the MOFid.

    Returns:
        pd.DataFrame: The table as read from the CSV, with the index reset.
        No de-duplication is applied in this loader.
    """
    storage = pystow.module("gptchem")
    table = storage.ensure_csv(
        "pcv",
        url="https://www.dropbox.com/s/r4fub4i9nadt1kc/pcv.csv?dl=1",
        read_csv_kwargs={"sep": ","},  # comma-separated file
    )
    return table.reset_index(drop=True)
[docs]
def get_qmug_data() -> pd.DataFrame:
    """Return the data and features used in [QMUG]_.

    We mean-aggregated the numerical data per SMILES
    and additionally computed SELFIES and InChI.

    Returns:
        pd.DataFrame: The table as read from the CSV, with the index reset.
    """
    storage = pystow.module("gptchem")
    table = storage.ensure_csv(
        "qmug",
        url="https://www.dropbox.com/s/6pk0ohy5agqwe3q/qmugs.csv?dl=1",
        read_csv_kwargs={"sep": ","},  # comma-separated file
    )
    return table.reset_index(drop=True)
[docs]
def get_qmug_small_data() -> pd.DataFrame:
    """Return the data and features used in [QMUG]_ for the subset of short SMILES.

    We mean-aggregated the numerical data per SMILES
    and additionally computed SELFIES and InChI.

    Returns:
        pd.DataFrame: The table as read from the CSV, with the index reset.
    """
    storage = pystow.module("gptchem")
    table = storage.ensure_csv(
        "qmug_small",
        url="https://www.dropbox.com/s/wkkrpfb2ash23a2/qmugs_small.csv?dl=1",
        read_csv_kwargs={"sep": ","},  # comma-separated file
    )
    return table.reset_index(drop=True)
[docs]
def get_hea_phase_data() -> pd.DataFrame:
    """Return the dataset reported in [Pei]_.

    Returns:
        pd.DataFrame: The table as read from the CSV, with the index reset.
    """
    storage = pystow.module("gptchem")
    table = storage.ensure_csv(
        "hea",
        url="https://www.dropbox.com/s/4edwffuajclxa5h/hea_phase.csv?dl=1",
        read_csv_kwargs={"sep": ","},  # comma-separated file
    )
    return table.reset_index(drop=True)
[docs]
def get_opv_data() -> pd.DataFrame:
    """Return the dataset reported in [NagasawaOPV]_.

    Returns:
        pd.DataFrame: The table as read from the CSV, with the index reset.
    """
    storage = pystow.module("gptchem")
    table = storage.ensure_csv(
        "opv",
        url="https://www.dropbox.com/s/a45eu1xw0zkyrmc/opv.csv?dl=1",
        read_csv_kwargs={"sep": ","},  # comma-separated file
    )
    return table.reset_index(drop=True)
[docs]
def get_esol_data() -> pd.DataFrame:
    """Return the dataset reported in [ESOL]_.

    Returns:
        pd.DataFrame: The table as read from the CSV, with the index reset.
    """
    storage = pystow.module("gptchem")
    table = storage.ensure_csv(
        "esol",
        url="https://www.dropbox.com/s/teqmkvl7v22bfox/esol.csv?dl=1",
        read_csv_kwargs={"sep": ","},  # comma-separated file
    )
    return table.reset_index(drop=True)
[docs]
def get_solubility_test_data() -> pd.DataFrame:
    """Return the dataset reported in [soltest]_.

    Returns:
        pd.DataFrame: The table as read from the CSV, with the index reset.
    """
    storage = pystow.module("gptchem")
    table = storage.ensure_csv(
        "solubility",
        url="https://www.dropbox.com/s/xeg02ulael9akhf/solubility_test_set.csv?dl=1",
        read_csv_kwargs={"sep": ","},  # comma-separated file
    )
    return table.reset_index(drop=True)
[docs]
def get_doyle_rxn_data() -> pd.DataFrame:
    """Return the reaction dataset reported in [Doyle]_.

    Returns:
        pd.DataFrame: The table as read from the CSV, with the index reset.
    """
    storage = pystow.module("gptchem")
    table = storage.ensure_csv(
        "doyle_rxn",
        url="https://www.dropbox.com/s/gjxatqagwh3cwb6/dreher_doyle.csv?dl=1",
        read_csv_kwargs={"sep": ","},  # comma-separated file
    )
    return table.reset_index(drop=True)
[docs]
def get_suzuki_rxn_data() -> pd.DataFrame:
    """Return the reaction dataset reported in [Suzuki]_.

    Returns:
        pd.DataFrame: The table as read from the CSV, with the index reset.
    """
    storage = pystow.module("gptchem")
    table = storage.ensure_csv(
        "suzuki_rxn",
        url="https://www.dropbox.com/s/0uv38jgrj2k33u7/suzuki_dreher.csv?dl=1",
        read_csv_kwargs={"sep": ","},  # comma-separated file
    )
    return table.reset_index(drop=True)
[docs]
def get_freesolv_data() -> pd.DataFrame:
    """Return the FreeSolv data [freesolv]_.

    Returns:
        pd.DataFrame: The table as read from the CSV, with the index reset.
    """
    storage = pystow.module("gptchem")
    table = storage.ensure_csv(
        "freesolv",
        url="https://www.dropbox.com/s/rnin1zyuat3miyp/free_solv.csv?dl=1",
        read_csv_kwargs={"sep": ","},  # comma-separated file
    )
    return table.reset_index(drop=True)
[docs]
def get_lipophilicity_data() -> pd.DataFrame:
    """Return the Lipophilicity data parsed from ChEMBL [chembl]_.

    Returns:
        pd.DataFrame: The table as read from the CSV, with the index reset.
    """
    storage = pystow.module("gptchem")
    table = storage.ensure_csv(
        "lipophilicity",
        url="https://www.dropbox.com/s/secesuqvqrdexz4/lipophilicity.csv?dl=1",
        read_csv_kwargs={"sep": ","},  # comma-separated file
    )
    return table.reset_index(drop=True)
[docs]
def get_mof_solvent_data() -> pd.DataFrame:
    """Return the MOF reaction data.

    Returns:
        pd.DataFrame: The table as read from the CSV, with the index reset.
    """
    # NOTE(review): the previous docstring ended in an empty citation
    # placeholder; add the proper reference once it is known.
    storage = pystow.module("gptchem")
    table = storage.ensure_csv(
        "mof_rxn",
        url="https://www.dropbox.com/s/jon75f9duukqm36/mof_yield_gpt3.csv?dl=1",
        read_csv_kwargs={"sep": ","},  # comma-separated file
    )
    return table.reset_index(drop=True)
[docs]
def get_matbench_glass() -> pd.DataFrame:
    """Return the glass formation ability dataset from matbench [matbench]_.

    The CSV is obtained through the ``gptchem`` pystow module and the index
    is reset after loading.

    Returns:
        pd.DataFrame: The glass formation ability table.
    """
    return (
        pystow.module("gptchem")
        .ensure_csv(
            "matbench_glass",
            url="https://www.dropbox.com/s/f2o06xdw2ri5bc0/gfa.csv?dl=1",
            # The file is comma-separated; state the separator explicitly.
            read_csv_kwargs=dict(sep=","),
        )
        .reset_index(drop=True)
    )
[docs]
def get_matbench_expt_gap() -> pd.DataFrame:
    """Return the experimental band gap dataset from matbench [matbench]_.

    The CSV is obtained through the ``gptchem`` pystow module and the index
    is reset after loading.

    Returns:
        pd.DataFrame: The experimental band gap table.
    """
    return (
        pystow.module("gptchem")
        .ensure_csv(
            "matbench_expt_gap",
            url="https://www.dropbox.com/s/4iqnhf9nui0dk7e/expt_gap.csv?dl=1",
            # The file is comma-separated; state the separator explicitly.
            read_csv_kwargs=dict(sep=","),
        )
        .reset_index(drop=True)
    )
[docs]
def get_matbench_steels() -> pd.DataFrame:
    """Return the steel yield strength dataset from matbench [matbench]_.

    The CSV is obtained through the ``gptchem`` pystow module and the index
    is reset after loading.

    Returns:
        pd.DataFrame: The steel yield strength table.
    """
    return (
        pystow.module("gptchem")
        .ensure_csv(
            "matbench_steels",
            url="https://www.dropbox.com/s/7cf330um2a47v3c/steels.csv?dl=1",
            # The file is comma-separated; state the separator explicitly.
            read_csv_kwargs=dict(sep=","),
        )
        .reset_index(drop=True)
    )
[docs]
def get_water_stability() -> pd.DataFrame:
    """Return the water stability dataset used in [waterStability]_.

    The CSV is obtained through the ``gptchem`` pystow module and the index
    is reset after loading.

    Returns:
        pd.DataFrame: The MOF water stability table.
    """
    return (
        pystow.module("gptchem")
        .ensure_csv(
            "mof_water_stability",
            url="https://www.dropbox.com/s/87qpe16lu6nmm1d/water_stability.csv?dl=1",
            # The file is comma-separated; state the separator explicitly.
            read_csv_kwargs=dict(sep=","),
        )
        .reset_index(drop=True)
    )