Source code for gptchem.tuner

import os
import subprocess
import time
from pathlib import Path
from typing import Optional

import openai
import pandas as pd
from fastcore.basics import basic_repr
from fastcore.xtras import dumps
from loguru import logger
from openai import FineTune
from openai.cli import FineTune as FineTuneCli

from .types import PathType
from .utils import make_outdir


def _check_ft_state(ft_id):
    """Return the current status of a fine-tune job."""
    ft = FineTune.retrieve(id=ft_id)
    return ft.get("status")


def get_ft_model_name(ft_id, sleep=60):
    """Poll a fine-tune job until it finishes and return the fine-tuned model name."""
    while True:
        ft = FineTune.retrieve(id=ft_id)
        status = ft.get("status")
        logger.debug(f"Fine tuning status: {status}")
        if status == "succeeded":
            return ft.get("fine_tuned_model")
        if status == "failed":
            raise RuntimeError(f"Fine tuning failed: {ft}")
        # still pending or running: wait before polling again
        time.sleep(sleep)


_PRESETS = {
    "ada-classification": {
        "base_model": "ada",
        "n_epochs": 4,
    },
    "ada-inverse": {
        "base_model": "ada",
        "n_epochs": 2,
    },
}
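
# Illustrative usage of the presets above (a sketch, not part of the original module):
# a preset-configured Tuner can be built via ``Tuner.from_preset`` and called directly,
# which forwards to ``Tuner.tune``.
#
#   tuner = Tuner.from_preset("ada-classification")
#   summary = tuner(train_df)  # train_df needs "prompt" and "completion" columns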


class Tuner:
    """Wrapper around the OpenAI API for fine tuning."""

    _sleep = 120

    def __init__(
        self,
        base_model: str = "ada",
        batch_size: Optional[int] = None,
        n_epochs: int = 4,
        learning_rate_multiplier: Optional[float] = None,
        outdir: Optional[PathType] = None,
        run_name: Optional[str] = None,
        wandb_sync: bool = True,
        write_summary: bool = True,
    ) -> None:
        """Initialize a Tuner.

        Args:
            base_model: The base model to fine tune. Defaults to "ada".
            batch_size: The batch size to use for fine tuning. Defaults to None.
            n_epochs: The number of epochs to fine tune for. Defaults to 4.
            learning_rate_multiplier: The learning rate multiplier to use for fine tuning.
                The OpenAI docs state "We recommend experimenting with values in the range
                0.02 to 0.2 to see what produces the best results." Defaults to None.
            outdir: The directory to save the fine tuning results to.
                If not specified, a directory will be created in `BASE_OUTDIR`.
            run_name: The name of the run. This is used to create the output directory.
            wandb_sync: Whether to sync the results to Weights & Biases.
            write_summary: Whether to write a summary of the fine tuning run to a file.
                Defaults to True.
        """
        self.base_model = base_model
        self.batch_size = batch_size
        self.n_epochs = n_epochs
        self.learning_rate_multiplier = learning_rate_multiplier
        self.run_name = run_name
        self.wandb_sync = wandb_sync
        self.outdir = (
            outdir if outdir is not None and Path(outdir).exists() else make_outdir(self.run_name)
        )
        self._modelname = None
        self._ft_id = None
        self._train_filename = None
        self._valid_filename = None
        self._train_file_id = None
        self._valid_file_id = None
        self._res = None
        self._write_summary = write_summary

    @classmethod
    def from_preset(cls, preset: str = "ada-classification"):
        """Create a Tuner from a named preset in `_PRESETS`."""
        if preset not in _PRESETS:
            raise ValueError(
                f"Invalid preset: {preset}. Valid presets are: {list(_PRESETS.keys())}"
            )
        return cls(**_PRESETS[preset])

    @property
    def model_name(self):
        """Name of the fine-tuned model, available after `tune()` has completed."""
        if self._modelname is None:
            raise ValueError("Model name not set. Please call `tuner.tune()` first.")
        return self._modelname

    @property
    def summary(self) -> dict:
        """Summary of the fine tuning settings and results."""
        return {
            "base_model": self.base_model,
            "batch_size": self.batch_size,
            "n_epochs": self.n_epochs,
            "learning_rate_multiplier": self.learning_rate_multiplier,
            "run_name": self.run_name,
            "wandb_sync": self.wandb_sync,
            "outdir": str(self.outdir),
            "train_filename": str(self._train_filename),
            "valid_filename": str(self._valid_filename),
            "model_name": self._modelname,
            "ft_id": self._ft_id,
            "date": time.strftime("%Y%m%d_%H%M%S"),
            "train_file_id": self._train_file_id,
            "valid_file_id": self._valid_file_id,
        }

    def _write_file(self, df: pd.DataFrame, data_type: str) -> Optional[str]:
        """Write a dataframe to a JSONL file in records form and return the filename.

        Returns None if the dataframe is missing or empty.
        """
        if df is not None and isinstance(df, pd.DataFrame) and not df.empty:
            if data_type not in ["train", "valid"]:
                raise ValueError(f"Invalid type: {data_type}. Valid types are: ['train', 'valid']")
            filename = os.path.abspath(os.path.join(self.outdir, f"{data_type}.jsonl"))
            df = df[["prompt", "completion"]]
            df.to_json(filename, orient="records", lines=True, force_ascii=False)
            if data_type == "train":
                self._train_filename = filename
            elif data_type == "valid":
                self._valid_filename = filename
            return filename
        return None

    def tune(self, train_df: pd.DataFrame, validation_df: Optional[pd.DataFrame] = None) -> dict:
        """Fine tune a model on a dataset.

        Args:
            train_df (pd.DataFrame): Training dataset.
            validation_df (pd.DataFrame, optional): Validation dataset. Defaults to None.

        Returns:
            dict: Summary of the fine tuning run.

        Raises:
            ValueError: If no training dataset is provided or the fine tuning job does not succeed.
        """
        if train_df is None:
            raise ValueError("Please provide a training dataset.")
        train_file = self._write_file(train_df, "train")
        valid_file = self._write_file(validation_df, "valid")

        file_args = {
            "training_file": FineTuneCli._get_or_upload(train_file, check_if_file_exists=False)
        }
        self._train_file_id = file_args["training_file"]

        if valid_file is not None:
            file_args["validation_file"] = FineTuneCli._get_or_upload(
                valid_file, check_if_file_exists=False
            )
            self._valid_file_id = file_args["validation_file"]

        settings = {}
        if self.batch_size is not None:
            settings["batch_size"] = self.batch_size
        if self.n_epochs is not None:
            settings["n_epochs"] = self.n_epochs
        if self.learning_rate_multiplier is not None:
            settings["learning_rate_multiplier"] = self.learning_rate_multiplier

        result = openai.FineTune.create(
            **file_args,
            model=self.base_model,
            **settings,
        )
        self._res = result
        logger.debug(f"Requested fine tuning. {result}")

        modelname = None
        try:
            ft_id = result["id"]
            modelname = get_ft_model_name(ft_id, self._sleep)
            # sync runs with wandb
            if self.wandb_sync:
                subprocess.run("openai wandb sync -n 1", shell=True)
        except Exception:
            logger.exception("Fine tuning failed.")
        if modelname is None:
            raise ValueError(f"Fine tuning failed. Result: {result}.")
        self._modelname = modelname
        self._ft_id = ft_id
        if self._write_summary:
            with open(os.path.join(self.outdir, "summary.json"), "w") as f:
                f.write(dumps(self.summary))
        logger.debug(f"Fine tuning completed. {self.summary}")
        return self.summary

    def __call__(
        self, train_df: pd.DataFrame, validation_df: Optional[pd.DataFrame] = None
    ) -> dict:
        return self.tune(train_df, validation_df)

    __repr__ = basic_repr("base_model,batch_size,n_epochs,learning_rate_multiplier,run_name")
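
# A minimal end-to-end sketch (illustrative, not part of the original module). It assumes
# the legacy ``openai<1.0`` SDK used above, an ``OPENAI_API_KEY`` in the environment, and a
# DataFrame with the "prompt" and "completion" columns that ``_write_file`` expects:
#
#   import pandas as pd
#   from gptchem.tuner import Tuner
#
#   train_df = pd.DataFrame({"prompt": ["..."], "completion": ["..."]})
#   tuner = Tuner(base_model="ada", n_epochs=4, wandb_sync=False)
#   summary = tuner.tune(train_df)
#   print(summary["model_name"])  # fine-tuned model name, also available as tuner.model_name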