from typing import List, Optional
import numpy as np
import pandas as pd
from numpy.typing import ArrayLike
from sklearn.base import BaseEstimator
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from gptchem.extractor import ClassificationExtractor
from gptchem.formatter import ClassificationFormatter
from gptchem.querier import Querier
from gptchem.tuner import Tuner
class GPTClassifier:
    """Wrapper around GPT-3 fine tuning in style of a scikit-learn classifier.

    ``fit`` passes the training data through the formatter and the tuner and
    remembers the name of the resulting model; ``predict`` sends formatted
    inputs to that model through a ``Querier`` and runs the extractor on the
    completions.
    """

    def __init__(
        self,
        property_name: str,
        tuner: Tuner,
        querier_settings: Optional[dict] = None,
        extractor: ClassificationExtractor = ClassificationExtractor(),
        save_valid_file: bool = False,
    ):
        """Initialize a GPTClassifier.

        Args:
            property_name (str): Name of the property to be predicted.
                This will be part of the prompt.
            tuner (Tuner): Tuner object to be used for fine tuning.
                This specifies the model to be used and the fine-tuning settings.
            querier_settings (Optional[dict], optional): Keyword arguments
                forwarded to the ``Querier`` constructor.
                If None, ``{"max_tokens": 3}`` is used.
            extractor (ClassificationExtractor, optional): Callable object that can extract
                integers from the completions produced by the querier.
                Defaults to ClassificationExtractor(). Note that this default
                instance is created once, when the class is defined, and is
                therefore shared by every classifier relying on it.
            save_valid_file (bool, optional): Whether to save the validation file.
                Defaults to False.
        """
        self.property_name = property_name
        self.tuner = tuner
        # The attribute keeps its historical (singular) name.
        self.querier_setting = (
            querier_settings if querier_settings is not None else {"max_tokens": 3}
        )
        self.extractor = extractor
        self.formatter = ClassificationFormatter(
            representation_column="repr",
            label_column="prop",
            property_name=property_name,
            num_classes=None,
        )
        # Both are filled in by ``fit``.
        self.model_name = None
        self.tune_res = None
        self.save_valid_file = save_valid_file

    def _prepare_df(self, X: ArrayLike, y: ArrayLike) -> pd.DataFrame:
        """Pair representations and labels in a two-column frame.

        Args:
            X (ArrayLike): Sized iterable of representations.
            y (ArrayLike): Sized iterable of labels, same length as ``X``.

        Returns:
            pd.DataFrame: Frame with the columns ``repr`` and ``prop``.

        Raises:
            ValueError: If ``X`` and ``y`` differ in length.
        """
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same length, got {len(X)} and {len(y)}."
            )
        # Iterating (instead of ``X[i]``) is positional, so pandas inputs with
        # a non-default index are handled correctly.
        rows = [{"repr": representation, "prop": label} for representation, label in zip(X, y)]
        # Explicit columns keep the schema intact for empty input.
        return pd.DataFrame(rows, columns=["repr", "prop"])

    def fit(self, X: ArrayLike, y: ArrayLike) -> None:
        """Fine tune a GPT-3 model on a dataset.

        Stores the tuner result in ``tune_res`` and its ``"model_name"`` entry
        in ``model_name``.

        Args:
            X (ArrayLike): Input data (typically array of molecular representations)
            y (ArrayLike): Target data (typically array of property values)
        """
        formatted = self.formatter(self._prepare_df(X, y))
        tune_res = self.tuner(formatted)
        self.model_name = tune_res["model_name"]
        self.tune_res = tune_res

    def predict(self, X: ArrayLike) -> ArrayLike:
        """Predict property values for a set of molecular representations.

        Args:
            X (ArrayLike): Input data (typically array of molecular representations)

        Returns:
            ArrayLike: Predicted property values, as returned by the extractor.

        Raises:
            ValueError: If no model is available yet (``fit`` was not called).
        """
        if self.model_name is None:
            raise ValueError("This classifier has no model yet. Call `fit` before `predict`.")
        # The formatter needs a label column; zeros serve as placeholders.
        df = self._prepare_df(X, [0] * len(X))
        formatted = self.formatter(df)
        if self.save_valid_file:
            self.tuner._write_file(formatted, "valid")
        querier = Querier(self.model_name, **self.querier_setting)
        completions = querier(formatted)
        return self.extractor(completions)
class NGramGPTClassifier:
    """Add the predictions of a N-Gram model to the prompt.

    Empirically, this tends to degrade performance.
    """

    def __init__(
        self,
        property_name: str,
        tuner: Tuner,
        querier_settings: Optional[dict] = None,
        extractor: ClassificationExtractor = ClassificationExtractor(),
        count_vectorizer: Optional[CountVectorizer] = None,
        ngram_model: Optional[BaseEstimator] = None,
    ):
        """Initialize a NGramGPTClassifier.

        Args:
            property_name (str): Name of the property to be predicted.
                This will be part of the prompt.
            tuner (Tuner): Tuner object to be used for fine tuning.
                This specifies the model to be used and the fine-tuning settings.
            querier_settings (Optional[dict], optional): Keyword arguments
                forwarded to the ``Querier`` constructor.
                If None, ``{"max_tokens": 3}`` is used.
            extractor (ClassificationExtractor, optional): Callable object that can extract
                integers from the completions produced by the querier.
                Defaults to ClassificationExtractor(). Note that this default
                instance is created once, when the class is defined, and is
                therefore shared by every classifier relying on it.
            count_vectorizer (Optional[CountVectorizer], optional): Vectorizer
                applied to the representations before they reach the n-gram
                model. If None, a default ``CountVectorizer()`` is created.
            ngram_model (Optional[BaseEstimator], optional): Estimator fitted on
                the vectorized representations. If None, a default
                ``MultinomialNB()`` is created.
        """
        self.property_name = property_name
        self.tuner = tuner
        # The attribute keeps its historical (singular) name.
        self.querier_setting = (
            querier_settings if querier_settings is not None else {"max_tokens": 3}
        )
        self.extractor = extractor
        # The formatter reads the augmented column, not the raw representation.
        self.formatter = ClassificationFormatter(
            representation_column="repr_with_ngram",
            label_column="prop",
            property_name=property_name,
            num_classes=None,
        )
        # Both are filled in by ``fit``.
        self.model_name = None
        self.tune_res = None
        self.count_vectorizer = CountVectorizer() if count_vectorizer is None else count_vectorizer
        self.ngram_model = MultinomialNB() if ngram_model is None else ngram_model

    def _fit_ngram_model(self, X: ArrayLike, y: ArrayLike):
        """Fit the vectorizer on ``X`` and the n-gram model on the resulting counts."""
        features = self.count_vectorizer.fit_transform(X)
        self.ngram_model.fit(features, y)

    def _predict_ngram_model(self, X: ArrayLike):
        """Return the n-gram model's predictions for ``X``."""
        features = self.count_vectorizer.transform(X)
        return self.ngram_model.predict(features)

    def _prepare_df(self, X: ArrayLike, y: ArrayLike) -> pd.DataFrame:
        """Pair representations and labels in a two-column frame.

        Args:
            X (ArrayLike): Sized iterable of representations.
            y (ArrayLike): Sized iterable of labels, same length as ``X``.

        Returns:
            pd.DataFrame: Frame with the columns ``repr`` and ``prop``.

        Raises:
            ValueError: If ``X`` and ``y`` differ in length.
        """
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same length, got {len(X)} and {len(y)}."
            )
        # Iterating (instead of ``X[i]``) is positional, so pandas inputs with
        # a non-default index are handled correctly.
        rows = [{"repr": representation, "prop": label} for representation, label in zip(X, y)]
        # Explicit columns keep the schema intact for empty input.
        return pd.DataFrame(rows, columns=["repr", "prop"])

    def _add_ngram_prompt(self, df: pd.DataFrame, X: ArrayLike) -> pd.DataFrame:
        """Add the ``ngram_preds`` and ``repr_with_ngram`` columns to ``df`` in place."""
        df["ngram_preds"] = self._predict_ngram_model(X)
        df["repr_with_ngram"] = (
            df["repr"] + " with n-gram prediction " + df["ngram_preds"].astype(str)
        )
        return df

    def fit(self, X: ArrayLike, y: ArrayLike) -> None:
        """Fine tune a GPT-3 model on a dataset.

        The n-gram model is fitted on ``(X, y)`` first; its predictions for the
        same ``X`` are appended to every representation before fine tuning.

        Args:
            X (ArrayLike): Input data (typically array of molecular representations)
            y (ArrayLike): Target data (typically array of property values)
        """
        df = self._prepare_df(X, y)
        self._fit_ngram_model(X, y)
        formatted = self.formatter(self._add_ngram_prompt(df, X))
        tune_res = self.tuner(formatted)
        self.model_name = tune_res["model_name"]
        self.tune_res = tune_res

    def predict(self, X: ArrayLike) -> ArrayLike:
        """Predict property values for a set of molecular representations.

        Args:
            X (ArrayLike): Input data (typically array of molecular representations)

        Returns:
            ArrayLike: Predicted property values, as returned by the extractor.

        Raises:
            ValueError: If no model is available yet (``fit`` was not called).
        """
        if self.model_name is None:
            raise ValueError("This classifier has no model yet. Call `fit` before `predict`.")
        # The formatter needs a label column; zeros serve as placeholders.
        df = self._prepare_df(X, [0] * len(X))
        formatted = self.formatter(self._add_ngram_prompt(df, X))
        querier = Querier(self.model_name, **self.querier_setting)
        completions = querier(formatted)
        return self.extractor(completions)
class DifficultNGramClassifier:
    """Highlight cases an N-Gram model struggles with.

    During ``fit`` an n-gram model is trained on the data; every training
    example it misclassifies gets a hint appended to its prompt. No hint is
    added at prediction time, and the n-gram model is not consulted there.
    """

    # Text appended to the prompt of examples the n-gram model got wrong.
    _DIFFICULT_HINT = "This was a difficult example. Pay attention."

    def __init__(
        self,
        property_name: str,
        tuner: Tuner,
        querier_settings: Optional[dict] = None,
        extractor: ClassificationExtractor = ClassificationExtractor(),
        count_vectorizer: Optional[CountVectorizer] = None,
        ngram_model: Optional[BaseEstimator] = None,
    ):
        """Initialize a DifficultNGramClassifier.

        Args:
            property_name (str): Name of the property to be predicted.
                This will be part of the prompt.
            tuner (Tuner): Tuner object to be used for fine tuning.
                This specifies the model to be used and the fine-tuning settings.
            querier_settings (Optional[dict], optional): Keyword arguments
                forwarded to the ``Querier`` constructor.
                If None, ``{"max_tokens": 3}`` is used.
            extractor (ClassificationExtractor, optional): Callable object that can extract
                integers from the completions produced by the querier.
                Defaults to ClassificationExtractor(). Note that this default
                instance is created once, when the class is defined, and is
                therefore shared by every classifier relying on it.
            count_vectorizer (Optional[CountVectorizer], optional): Vectorizer
                applied to the representations before they reach the n-gram
                model. If None, a default ``CountVectorizer()`` is created.
            ngram_model (Optional[BaseEstimator], optional): Estimator fitted on
                the vectorized representations. If None, a default
                ``MultinomialNB()`` is created.
        """
        self.property_name = property_name
        self.tuner = tuner
        # The attribute keeps its historical (singular) name.
        self.querier_setting = (
            querier_settings if querier_settings is not None else {"max_tokens": 3}
        )
        self.extractor = extractor
        # The formatter reads the augmented column, not the raw representation.
        self.formatter = ClassificationFormatter(
            representation_column="repr_with_ngram",
            label_column="prop",
            property_name=property_name,
            num_classes=None,
        )
        # Both are filled in by ``fit``.
        self.model_name = None
        self.tune_res = None
        self.count_vectorizer = CountVectorizer() if count_vectorizer is None else count_vectorizer
        self.ngram_model = MultinomialNB() if ngram_model is None else ngram_model

    def _fit_ngram_model(self, X: ArrayLike, y: ArrayLike):
        """Fit the vectorizer on ``X`` and the n-gram model on the resulting counts."""
        features = self.count_vectorizer.fit_transform(X)
        self.ngram_model.fit(features, y)

    def _predict_ngram_model(self, X: ArrayLike):
        """Return the n-gram model's predictions for ``X``."""
        features = self.count_vectorizer.transform(X)
        return self.ngram_model.predict(features)

    def _prepare_df(self, X: ArrayLike, y: ArrayLike) -> pd.DataFrame:
        """Pair representations and labels in a two-column frame.

        Args:
            X (ArrayLike): Sized iterable of representations.
            y (ArrayLike): Sized iterable of labels, same length as ``X``.

        Returns:
            pd.DataFrame: Frame with the columns ``repr`` and ``prop``.

        Raises:
            ValueError: If ``X`` and ``y`` differ in length.
        """
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same length, got {len(X)} and {len(y)}."
            )
        # Iterating (instead of ``X[i]``) is positional, so pandas inputs with
        # a non-default index are handled correctly.
        rows = [{"repr": representation, "prop": label} for representation, label in zip(X, y)]
        # Explicit columns keep the schema intact for empty input.
        return pd.DataFrame(rows, columns=["repr", "prop"])

    def fit(self, X: ArrayLike, y: ArrayLike) -> None:
        """Fine tune a GPT-3 model on a dataset.

        The n-gram model is fitted on ``(X, y)`` and evaluated on the same
        ``X``; examples it gets wrong are marked as difficult in the prompt.

        Args:
            X (ArrayLike): Input data (typically array of molecular representations)
            y (ArrayLike): Target data (typically array of property values)
        """
        df = self._prepare_df(X, y)
        self._fit_ngram_model(X, y)
        ngram_preds = self._predict_ngram_model(X)
        # Compare plain arrays so the result is positional; comparing against a
        # pandas ``y`` would yield an index-aligned Series instead.
        df["ngram_incorrect"] = np.asarray(ngram_preds) != np.asarray(y)
        hints = [self._DIFFICULT_HINT if wrong else "" for wrong in df["ngram_incorrect"]]
        df["repr_with_ngram"] = df["repr"] + " " + pd.Series(hints, index=df.index, dtype=object)
        formatted = self.formatter(df)
        tune_res = self.tuner(formatted)
        self.model_name = tune_res["model_name"]
        self.tune_res = tune_res

    def predict(self, X: ArrayLike) -> ArrayLike:
        """Predict property values for a set of molecular representations.

        Args:
            X (ArrayLike): Input data (typically array of molecular representations)

        Returns:
            ArrayLike: Predicted property values, as returned by the extractor.

        Raises:
            ValueError: If no model is available yet (``fit`` was not called).
        """
        if self.model_name is None:
            raise ValueError("This classifier has no model yet. Call `fit` before `predict`.")
        # The formatter needs a label column; zeros serve as placeholders.
        df = self._prepare_df(X, [0] * len(X))
        # Same shape as an "easy" training prompt: representation plus a
        # trailing space and an empty hint.
        df["repr_with_ngram"] = df["repr"] + " "
        formatted = self.formatter(df)
        querier = Querier(self.model_name, **self.querier_setting)
        completions = querier(formatted)
        return self.extractor(completions)
class MultiRepGPTClassifier(GPTClassifier):
    """GPT Classifier trained on multiple representations.

    Every sample is expected to be a sequence of representations. Each
    representation becomes its own prompt; ``predict`` averages the extracted
    predictions over the representations of a sample.
    """

    def __init__(
        self,
        property_name: str,
        tuner: Tuner,
        querier_settings: Optional[dict] = None,
        extractor: ClassificationExtractor = ClassificationExtractor(),
        rep_names: Optional[List[str]] = None,
    ) -> None:
        """Initialize a MultiRepGPTClassifier.

        Args:
            property_name (str): Name of the property to be predicted.
                This will be part of the prompt.
            tuner (Tuner): Tuner object to be used for fine tuning.
            querier_settings (Optional[dict], optional): Keyword arguments
                forwarded to the ``Querier`` constructor.
                If None, ``{"max_tokens": 3}`` is used.
            extractor (ClassificationExtractor, optional): Callable object applied
                to the completions produced by the querier.
                Defaults to ClassificationExtractor(). Note that this default
                instance is created once, when the class is defined, and is
                therefore shared by every classifier relying on it.
            rep_names (Optional[List[str]], optional): One name per representation
                column. If given, the name followed by a space is prepended to
                the corresponding representation. Defaults to None.
        """
        # Let the parent set up the shared state (including ``save_valid_file``)
        # instead of duplicating its constructor.
        super().__init__(
            property_name=property_name,
            tuner=tuner,
            querier_settings=querier_settings,
            extractor=extractor,
        )
        self.rep_names = rep_names

    def _prepare_df(self, X: ArrayLike, y: ArrayLike, shuffle: bool = True) -> pd.DataFrame:
        """Expand every sample into one row per representation.

        Args:
            X (ArrayLike): Sized iterable of samples; each sample is an
                iterable of representations.
            y (ArrayLike): Sized iterable of labels, same length as ``X``.
            shuffle (bool, optional): Whether to return the rows in random
                order. The original index labels are kept. Defaults to True.

        Returns:
            pd.DataFrame: Frame with the columns ``repr``, ``prop``, ``mol``
                (position of the sample in ``X``) and ``rep`` (position of the
                representation within the sample).

        Raises:
            ValueError: If ``X`` and ``y`` differ in length.
        """
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same length, got {len(X)} and {len(y)}."
            )
        rows = []
        for mol_idx, (representations, label) in enumerate(zip(X, y)):
            for rep_idx, representation in enumerate(representations):
                prefix = self.rep_names[rep_idx] + " " if self.rep_names is not None else ""
                rows.append(
                    {
                        "repr": prefix + representation,
                        "prop": label,
                        "mol": mol_idx,
                        "rep": rep_idx,
                    }
                )
        # Explicit columns keep the schema intact for empty input.
        df = pd.DataFrame(rows, columns=["repr", "prop", "mol", "rep"])
        return df.sample(frac=1) if shuffle else df

    def _predict(self, X: ArrayLike) -> ArrayLike:
        """Return one prediction per sample and representation.

        Args:
            X (ArrayLike): Sized iterable of samples; every sample must hold the
                same number of representations.

        Returns:
            ArrayLike: Array of shape ``(len(X), number of representations)``.

        Raises:
            ValueError: If no model is available yet, or if the samples do not
                all have the same number of representations.
        """
        if self.model_name is None:
            raise ValueError("This classifier has no model yet. Call `fit` before `predict`.")
        n_reps = len(X[0]) if len(X) else 0
        # Validate before spending a model query on malformed input.
        if any(len(representations) != n_reps for representations in X):
            raise ValueError("All samples must have the same number of representations.")

        # Unshuffled, so that row position k corresponds to completion k.
        df = self._prepare_df(X, [0] * len(X), shuffle=False)
        formatted = self.formatter(df)
        querier = Querier(self.model_name, **self.querier_setting)
        completions = querier(formatted)
        extracted = self.extractor(completions)

        # Scatter the flat predictions into a (sample, representation) grid in
        # a single pass over the rows.
        predictions = np.zeros((len(X), n_reps))
        for position, (mol_idx, rep_idx) in enumerate(zip(df["mol"], df["rep"])):
            predictions[mol_idx, rep_idx] = extracted[position]
        return predictions

    def predict(self, X: ArrayLike, return_std: bool = False):
        """Predict by averaging over the representations of every sample.

        Args:
            X (ArrayLike): Sized iterable of samples; each sample is a sequence
                of representations.
            return_std (bool, optional): Whether to also return the standard
                deviation over the representations. Defaults to False.

        Returns:
            The per-sample mean prediction, or a tuple ``(mean, std)`` if
            ``return_std`` is True.
        """
        predictions = self._predict(X)
        mean = np.mean(predictions, axis=1)
        if return_std:
            return mean, np.std(predictions, axis=1)
        return mean