|
import logging |
|
from collections import defaultdict |
|
|
|
import numpy as np |
|
import tiktoken |
|
|
|
# Module-level logger used by the dataset checks in this file. Its level is
# set to INFO so that informational reports pass this logger's own level check.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
|
|
|
|
|
def check_format_errors(train_dataset, user_role, model_role):
    """
    Count formatting problems in a chat fine-tuning dataset.

    Extracted from: https://cookbook.openai.com/examples/chat_finetuning_data_prep

    Args:
        train_dataset: Iterable of examples. Each example is expected to be a
            dict whose "messages" entry is a non-empty list of message dicts.
        user_role: Role name accepted for user messages.
        model_role: Role name accepted for model messages; every example must
            contain at least one message with this role.

    Returns:
        A mapping from error name to number of occurrences when problems were
        found (the ``defaultdict`` used for counting), otherwise an empty
        dict. Error names: "data_type", "missing_messages_list",
        "message_missing_key", "message_unrecognized_key",
        "unrecognized_role", "missing_content",
        "example_missing_assistant_message".
    """
    allowed_keys = ("role", "content", "name", "function_call", "weight")
    allowed_roles = ("system", user_role, model_role)

    format_errors = defaultdict(int)

    for ex in train_dataset:
        if not isinstance(ex, dict):
            format_errors["data_type"] += 1
            continue

        messages = ex.get("messages", None)
        # A truthy value that is not a sequence (e.g. a number or a string)
        # is not a usable message list either; count it instead of crashing
        # or iterating over its characters.
        if not messages or not isinstance(messages, (list, tuple)):
            format_errors["missing_messages_list"] += 1
            continue

        for message in messages:
            # The checks below rely on dict methods; a non-dict entry is
            # reported as a type error rather than raising AttributeError.
            if not isinstance(message, dict):
                format_errors["data_type"] += 1
                continue

            if "role" not in message or "content" not in message:
                format_errors["message_missing_key"] += 1

            if any(k not in allowed_keys for k in message):
                format_errors["message_unrecognized_key"] += 1

            if message.get("role", None) not in allowed_roles:
                format_errors["unrecognized_role"] += 1

            content = message.get("content", None)
            function_call = message.get("function_call", None)

            if (not content and not function_call) or not isinstance(content, str):
                format_errors["missing_content"] += 1

        if not any(
            isinstance(message, dict) and message.get("role", None) == model_role
            for message in messages
        ):
            format_errors["example_missing_assistant_message"] += 1

    if format_errors:
        logger.warning("Found errors:")
        for k, v in format_errors.items():
            logger.warning("%s: %s", k, v)
    else:
        logger.info("No errors found")

    return format_errors if format_errors else {}
|
|
|
|
|
def get_distributions(train_dataset, user_role, model_role):
    """
    Collect per-example message and token statistics for a chat dataset.

    Extracted from: https://cookbook.openai.com/examples/chat_finetuning_data_prep

    Gets the distributions of the number of messages per example, the total
    number of tokens per example, and the number of assistant tokens per
    example. Tokens are counted with tiktoken's "cl100k_base" encoding.

    Args:
        train_dataset: Iterable of examples, each a dict with a "messages"
            list of message dicts.
        user_role: Role name identifying user messages.
        model_role: Role name identifying model (assistant) messages.

    Returns:
        A dict with:
            "n_missing_system": number of examples without a "system" message,
            "n_missing_user": number of examples without a ``user_role`` message,
            "n_messages": list of message counts, one per example,
            "convo_lens": list of total token counts, one per example,
            "assistant_message_lens": list of ``model_role`` token counts,
                one per example.
    """
    encoding = tiktoken.get_encoding("cl100k_base")

    def count_tokens(value):
        # Only strings can be tokenized. Values such as a numeric "weight",
        # a "function_call" payload or a null "content" contribute no tokens
        # instead of making the encoder raise.
        return len(encoding.encode(value)) if isinstance(value, str) else 0

    def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):
        num_tokens = 0
        for message in messages:
            num_tokens += tokens_per_message
            for key, value in message.items():
                num_tokens += count_tokens(value)
                if key == "name":
                    num_tokens += tokens_per_name
        # Fixed 3-token overhead added once per conversation.
        num_tokens += 3
        return num_tokens

    def num_assistant_tokens_from_messages(messages):
        return sum(
            count_tokens(message.get("content"))
            for message in messages
            if message.get("role") == model_role
        )

    n_missing_system = 0
    n_missing_user = 0
    n_messages = []
    convo_lens = []
    assistant_message_lens = []

    for ex in train_dataset:
        messages = ex["messages"]
        # .get keeps a message without a "role" key from raising KeyError;
        # such a message simply matches no role.
        roles = [message.get("role") for message in messages]
        if "system" not in roles:
            n_missing_system += 1
        if user_role not in roles:
            n_missing_user += 1
        n_messages.append(len(messages))
        convo_lens.append(num_tokens_from_messages(messages))
        assistant_message_lens.append(num_assistant_tokens_from_messages(messages))

    return {
        "n_missing_system": n_missing_system,
        "n_missing_user": n_missing_user,
        "n_messages": n_messages,
        "convo_lens": convo_lens,
        "assistant_message_lens": assistant_message_lens,
    }
|
|
|
|
|
def check_token_counts(train_dataset, user_role, model_role):
    """
    Log summary statistics about message and token counts of a dataset.

    Extracted from: https://cookbook.openai.com/examples/chat_finetuning_data_prep

    Everything is reported through the module logger at INFO level: how many
    examples lack a system or user message, the distribution of messages per
    example, of total tokens per example and of assistant tokens per example,
    and how many examples exceed 4096 tokens.

    Args:
        train_dataset: Dataset forwarded to ``get_distributions``.
        user_role: Role name identifying user messages.
        model_role: Role name identifying model (assistant) messages.

    Returns:
        None.
    """

    def print_distribution(values, name):
        logger.info("\n#### Distribution of %s:", name)
        if not values:
            # min()/max() raise on an empty sequence; an empty dataset has
            # no distribution to report.
            logger.info("no values")
            return
        logger.info("min / max: %s, %s", min(values), max(values))
        logger.info("mean / median: %s, %s", np.mean(values), np.median(values))
        # Quantiles match the "p5 / p95" label (5th and 95th percentiles).
        logger.info(
            "p5 / p95: %s, %s", np.quantile(values, 0.05), np.quantile(values, 0.95)
        )

    distributions = get_distributions(
        train_dataset, user_role=user_role, model_role=model_role
    )
    n_missing_system = distributions["n_missing_system"]
    n_missing_user = distributions["n_missing_user"]
    n_messages = distributions["n_messages"]
    convo_lens = distributions["convo_lens"]
    assistant_message_lens = distributions["assistant_message_lens"]

    # Logger methods take a %-style format string followed by its arguments;
    # without a placeholder the extra argument triggers a logging error and
    # the count is never shown.
    logger.info("Num examples missing system message: %s", n_missing_system)
    logger.info("Num examples missing user message: %s", n_missing_user)
    print_distribution(n_messages, "num_messages_per_example")
    print_distribution(convo_lens, "num_total_tokens_per_example")
    print_distribution(assistant_message_lens, "num_assistant_tokens_per_example")
    n_too_long = sum(length > 4096 for length in convo_lens)
    logger.info(
        "\n%s examples may be over the 4096 token limit, they will be truncated during fine-tuning",
        n_too_long,
    )
|
|
|
|
|
def estimate_cost(train_dataset, user_role, model_role):
    """
    Estimate how many tokens fine-tuning on the dataset will be billed for.

    Extracted from: https://cookbook.openai.com/examples/chat_finetuning_data_prep

    Each example's token count is capped at 4096 before summing. The number
    of epochs starts at 3 and is adjusted for small or large datasets: raised
    (up to 25) when examples * 3 is below 100, lowered (down to 1) when it is
    above 25000.

    Args:
        train_dataset: Sized dataset forwarded to ``get_distributions``.
        user_role: Role name identifying user messages.
        model_role: Role name identifying model (assistant) messages.

    Returns:
        A dict with two entries: "Estimated number of tokens in dataset" and
        "Estimated number of tokens that will be billed (assuming N training
        epochs)", where N is the chosen number of epochs.
    """
    distributions = get_distributions(
        train_dataset, user_role=user_role, model_role=model_role
    )
    convo_lens = distributions["convo_lens"]

    MAX_TOKENS_PER_EXAMPLE = 4096

    TARGET_EPOCHS = 3
    MIN_TARGET_EXAMPLES = 100
    MAX_TARGET_EXAMPLES = 25000
    MIN_DEFAULT_EPOCHS = 1
    MAX_DEFAULT_EPOCHS = 25

    n_epochs = TARGET_EPOCHS
    n_train_examples = len(train_dataset)
    # An empty dataset would divide by zero below; keep the default epoch
    # count in that case instead of hiding the error behind a bare except.
    if n_train_examples > 0:
        if n_train_examples * TARGET_EPOCHS < MIN_TARGET_EXAMPLES:
            n_epochs = min(MAX_DEFAULT_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)
        elif n_train_examples * TARGET_EPOCHS > MAX_TARGET_EXAMPLES:
            n_epochs = max(MIN_DEFAULT_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)

    n_billing_tokens_in_dataset = sum(
        min(MAX_TOKENS_PER_EXAMPLE, length) for length in convo_lens
    )

    return {
        "Estimated number of tokens in dataset": n_billing_tokens_in_dataset,
        f"Estimated number of tokens that will be billed (assuming {n_epochs} training epochs)": n_epochs
        * n_billing_tokens_in_dataset,
    }
|
|