Spaces:

acmc
/

whatsapp-chats-finetuning-formatter

Running

App Files Files Community

ACMC committed on Apr 23

Commit

bd73a7b

•

1 Parent(s): bf9e30f

Bugfix

Browse files

Files changed (3) hide show

app.py +99 -46
utils.py +52 -49
validation.py +40 -27

app.py CHANGED Viewed

@@ -1,33 +1,41 @@
 # %%
 from uuid import uuid4
-import gradio as gr
 import datasets
-import json
-import io
-from utils import (
-    process_chat_file,
-    transform_conversations_dataset_into_training_examples,
-)
-from validation import (
-    check_format_errors,
-    estimate_cost,
-    get_distributions,
-)
 import matplotlib.pyplot as plt
-def convert_to_dataset(files, do_spelling_correction, progress):
     modified_dataset = None
     for file in progress.tqdm(files, desc="Processing files"):
         if modified_dataset is None:
             # First file
             modified_dataset = process_chat_file(
-                file, do_spelling_correction=do_spelling_correction
             )
         else:
             # Concatenate the datasets
             this_file_dataset = process_chat_file(
-                file, do_spelling_correction=do_spelling_correction
             )
             modified_dataset = datasets.concatenate_datasets(
                 [modified_dataset, this_file_dataset]
@@ -43,25 +51,41 @@ def file_upload_callback(
     user_role,
     model_role,
     whatsapp_name,
     progress=gr.Progress(),
 ):
-    print(f"Processing {files}")
-    full_system_prompt = f"""You are a chatbot. Your goal is to simulate realistic, natural chat conversations as if you were me.
-# Task
 The {model_role} and the {user_role} can send multiple messages in a row, as a JSON list of strings. Your answer always needs to be JSON compliant. The strings are delimited by double quotes ("). The strings are separated by a comma (,). The list is delimited by square brackets ([, ]). Always start your answer with [", and close it with "]. Do not write anything else in your answer after "].
 # Information about me
-You should use the following information about me to answer:
 {system_prompt}"""
     # Example
     # [{{\"role\":\"user\",\"content\":\"[\"Hello!\",\"How are you?\"]\"}},{{\"role\":\"assistant\",\"content\":\"[\"Hi!\",\"I'm doing great.\",\"What about you?\"]\"}},{{\"role\":\"user\",\"content\":\"[\"I'm doing well.\",\"Have you been travelling?\"]\"}}]
     # Response:
     # [{{\"role\":\"assistant\",\"content\":\"[\"Yes, I've been to many places.\",\"I love travelling.\"]\"}}]"""
     # # Avoid using the full system prompt for now, as it is too long and increases the cost of the training
     # full_system_prompt = system_prompt
     dataset = convert_to_dataset(
-        files=files, progress=progress, do_spelling_correction=do_spelling_correction
     )
     training_examples_ds = transform_conversations_dataset_into_training_examples(
         conversations_ds=dataset,
         system_prompt=full_system_prompt,
@@ -69,6 +93,7 @@ You should use the following information about me to answer:
         model_role=model_role,
         whatsapp_name=whatsapp_name,
     )
     # Split into training and validation datasets (80% and 20%)
     training_examples_ds = training_examples_ds.train_test_split(
@@ -78,9 +103,9 @@ You should use the following information about me to answer:
         training_examples_ds["train"],
         training_examples_ds["test"],
     )
-    training_examples_ds = training_examples_ds#.select(
     #    range(min(250, len(training_examples_ds)))
-    #)
     validation_examples_ds = validation_examples_ds.select(
         range(min(200, len(validation_examples_ds)))
     )
@@ -124,6 +149,12 @@ You should use the following information about me to answer:
     file_path_validation = f"validation_examples_{uuid}.jsonl"
     validation_examples_ds.to_json(path_or_buf=file_path_validation, force_ascii=False)
     return (
         file_path,
         gr.update(visible=True),
@@ -142,7 +173,7 @@ def remove_file_and_hide_button(file_path):
     try:
         os.remove(file_path)
     except Exception as e:
-        print(f"Error removing file {file_path}: {e}")
     return gr.update(visible=False)
@@ -190,32 +221,52 @@ with gr.Blocks(theme=theme) as demo:
         info="Enter your WhatsApp name as it appears in your profile. It needs to match exactly your name. If you're unsure, you can check the chat messages to see it.",
     )
-    user_role = gr.Textbox(
-        label="Role for User",
-        info="This is a technical parameter. If you don't know what to write, just type 'user'.",
-        value="user",
-    )
-    model_role = gr.Textbox(
-        label="Role for Model",
-        info="This is a technical parameter. If you don't know what to write, just type 'model'.",
-        value="model",
-    )
-    do_spelling_correction = gr.Checkbox(
-        label="Do Spelling Correction (English)",
-        info="Check this box if you want to perform spelling correction on the chat messages before generating the training examples.",
-    )
-    # Allow the user to choose the validation split size
-    validation_split = gr.Slider(
-        minimum=0.0,
-        maximum=0.5,
-        value=0.2,
-        interactive=True,
-        label="Validation Split",
-        info="Choose the percentage of the dataset to be used for validation. For example, if you choose 0.2, 20% of the dataset will be used for validation and 80% for training.",
-    )
     submit = gr.Button(value="Submit", variant="primary")
@@ -253,6 +304,8 @@ with gr.Blocks(theme=theme) as demo:
             user_role,
             model_role,
             whatsapp_name,
         ],
         outputs=[
             output_file,

 # %%
+import io
+import json
+import logging
 from uuid import uuid4
 import datasets
+import gradio as gr
 import matplotlib.pyplot as plt
+from utils import (process_chat_file,
+                   transform_conversations_dataset_into_training_examples)
+from validation import check_format_errors, estimate_cost, get_distributions
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+def convert_to_dataset(files, do_spelling_correction, progress, whatsapp_name, datetime_dayfirst, message_line_format):
     modified_dataset = None
     for file in progress.tqdm(files, desc="Processing files"):
         if modified_dataset is None:
             # First file
             modified_dataset = process_chat_file(
+                file,
+                do_spelling_correction=do_spelling_correction,
+                whatsapp_name=whatsapp_name,
+                datetime_dayfirst=datetime_dayfirst,
+                message_line_format=message_line_format,
             )
         else:
             # Concatenate the datasets
             this_file_dataset = process_chat_file(
+                file,
+                do_spelling_correction=do_spelling_correction,
+                whatsapp_name=whatsapp_name,
+                datetime_dayfirst=datetime_dayfirst,
+                message_line_format=message_line_format,
             )
             modified_dataset = datasets.concatenate_datasets(
                 [modified_dataset, this_file_dataset]
     user_role,
     model_role,
     whatsapp_name,
+    datetime_dayfirst,
+    message_line_format,
     progress=gr.Progress(),
 ):
+    logger.info(f"Processing {files}")
+    full_system_prompt = f"""# Task
+You are a chatbot. Your goal is to simulate realistic, natural chat conversations as if you were me.
 The {model_role} and the {user_role} can send multiple messages in a row, as a JSON list of strings. Your answer always needs to be JSON compliant. The strings are delimited by double quotes ("). The strings are separated by a comma (,). The list is delimited by square brackets ([, ]). Always start your answer with [", and close it with "]. Do not write anything else in your answer after "].
 # Information about me
 {system_prompt}"""
     # Example
     # [{{\"role\":\"user\",\"content\":\"[\"Hello!\",\"How are you?\"]\"}},{{\"role\":\"assistant\",\"content\":\"[\"Hi!\",\"I'm doing great.\",\"What about you?\"]\"}},{{\"role\":\"user\",\"content\":\"[\"I'm doing well.\",\"Have you been travelling?\"]\"}}]
     # Response:
     # [{{\"role\":\"assistant\",\"content\":\"[\"Yes, I've been to many places.\",\"I love travelling.\"]\"}}]"""
+    # Check if the user has not chosen any files
+    if not files or len(files) == 0:
+        raise gr.Error("Please upload at least one file.")
+    # Check if the user has not entered their whatsapp name
+    if not whatsapp_name or len(whatsapp_name) == 0:
+        raise gr.Error("Please enter your WhatsApp name.")
     # # Avoid using the full system prompt for now, as it is too long and increases the cost of the training
     # full_system_prompt = system_prompt
     dataset = convert_to_dataset(
+        files=files,
+        progress=progress,
+        do_spelling_correction=do_spelling_correction,
+        whatsapp_name=whatsapp_name,
+        datetime_dayfirst=datetime_dayfirst,
+        message_line_format=message_line_format,
     )
+    logger.info(f"Number of conversations of dataset before being transformed: {len(dataset)}")
     training_examples_ds = transform_conversations_dataset_into_training_examples(
         conversations_ds=dataset,
         system_prompt=full_system_prompt,
         model_role=model_role,
         whatsapp_name=whatsapp_name,
     )
+    logger.info(f"Number of training examples: {len(training_examples_ds)}")
     # Split into training and validation datasets (80% and 20%)
     training_examples_ds = training_examples_ds.train_test_split(
         training_examples_ds["train"],
         training_examples_ds["test"],
     )
+    training_examples_ds = training_examples_ds  # .select(
     #    range(min(250, len(training_examples_ds)))
+    # )
     validation_examples_ds = validation_examples_ds.select(
         range(min(200, len(validation_examples_ds)))
     )
     file_path_validation = f"validation_examples_{uuid}.jsonl"
     validation_examples_ds.to_json(path_or_buf=file_path_validation, force_ascii=False)
+    # If there's less than 50 training examples, show a warning message
+    if len(training_examples_ds) < 50:
+        gr.Warning(
+            "Warning: There are less than 50 training examples. The model may not perform well with such a small dataset. Consider adding more chat files to increase the number of training examples."
+        )
     return (
         file_path,
         gr.update(visible=True),
     try:
         os.remove(file_path)
     except Exception as e:
+        logger.info(f"Error removing file {file_path}: {e}")
     return gr.update(visible=False)
         info="Enter your WhatsApp name as it appears in your profile. It needs to match exactly your name. If you're unsure, you can check the chat messages to see it.",
     )
+    # Advanced parameters section, collapsed by default
+    with gr.Accordion(label="Advanced Parameters", open=False):
+        gr.Markdown(
+            """
+            These are advanced parameters that you can change if you know what you're doing. If you're unsure, you can leave them as they are.
+            """
+        )
+        user_role = gr.Textbox(
+            label="Role for User",
+            info="This is a technical parameter. If you don't know what to write, just type 'user'.",
+            value="user",
+        )
+        model_role = gr.Textbox(
+            label="Role for Model",
+            info="This is a technical parameter. Usual values are 'model' or 'assistant'.",
+            value="model",
+        )
+        message_line_format = gr.Textbox(
+            label="Message Line Format",
+            info="Format of each message line in the chat file, as a regular expression. The default value should work for most cases.",
+            value=r"\[?(?P<msg_datetime>\S+,\s\S+?(?:\s[APap][Mm])?)\]? (?:- )?(?P<contact_name>.+): (?P<message>.+)",
+        )
+        datetime_dayfirst = gr.Checkbox(
+            label="Date format: Day first",
+            info="Check this box if the date time format in the chat messages is in the format 'DD/MM/YYYY'. You can check your phone settings to see the date format. Otherwise, it will be assumed that the date time format is 'MM/DD/YYYY'.",
+            value=True,
+        )
+        do_spelling_correction = gr.Checkbox(
+            label="Do Spelling Correction (English)",
+            info="Check this box if you want to perform spelling correction on the chat messages before generating the training examples.",
+        )
+        # Allow the user to choose the validation split size
+        validation_split = gr.Slider(
+            minimum=0.0,
+            maximum=0.5,
+            value=0.2,
+            interactive=True,
+            label="Validation Split",
+            info="Choose the percentage of the dataset to be used for validation. For example, if you choose 0.2, 20% of the dataset will be used for validation and 80% for training.",
+        )
     submit = gr.Button(value="Submit", variant="primary")
             user_role,
             model_role,
             whatsapp_name,
+            datetime_dayfirst,
+            message_line_format,
         ],
         outputs=[
             output_file,

utils.py CHANGED Viewed

@@ -1,36 +1,13 @@
-import datasets
 import datetime
-import os
 import json
 import re
-exp = re.compile(
-    r"(?P<month>\d+)/(?P<day>\d+)/(?P<year>\d+), (?P<hour>\d+):(?P<minute>\d+) - (?P<contact_name>.+): (?P<message>.+)"
-)
-def process_line(example):
-    # The lines have this format: dd/mm/yy, hh:mm - <person>: <msg>
-    try:
-        groups = exp.match(example["text"]).groupdict()
-        timestamp = datetime.datetime(
-            int(groups["year"]),
-            int(groups["month"]),
-            int(groups["day"]),
-            int(groups["hour"]),
-            int(groups["minute"]),
-        ).timestamp()
-        return {
-            "message": groups["message"],
-            "contact_name": groups["contact_name"],
-            "timestamp": timestamp,
-        }
-    except Exception as e:
-        print(e)
-        print(example["text"])
-        raise e
 # %%
 # Now, create message groups ('conversations')
@@ -63,10 +40,11 @@ def printable_conversation(conversation):
     )
 # %%
 # Use spacy to spell check the messages
 import spacy
-import contextualSpellCheck
 from spellchecker import SpellChecker
 spell = SpellChecker()
@@ -78,17 +56,17 @@ def spell_check_conversation(conversation):
     for i, message in enumerate(conversation["conversations"]):
         # Use SpaCy to get the words
         words = spell.split_words(message["message"])
-        print(f"Words: {words}")
         corrected_message = []
         for word in words:
             correction = spell.correction(word)
             if (correction != None) and (correction != word):
-                print(f"Spell check: {word} -> {correction}")
                 corrected_message.append(correction)
             else:
                 corrected_message.append(word)
-        print(f"Corrected message: {corrected_message}")
         joined_message = " ".join(corrected_message)
         conversation["conversations"][i]["message"] = joined_message
@@ -107,7 +85,7 @@ def spell_check_conversation_spacy(conversation):
     docs = list(nlp.pipe([msg["message"] for msg in conversation["conversations"]]))
     for i, doc in enumerate(docs):
         if doc._.performed_spellCheck:
-            print(f"Spell checked: {doc.text} -> {doc._.outcome_spellCheck}")
             conversation["conversations"][i]["message"] = doc._.outcome_spellCheck
     return conversation
@@ -144,8 +122,8 @@ A: I'm fine too
 To do it, we'll use MobileBERT with the next sentence prediction head. We'll use the first message as the first sentence, and the second message as the second sentence. If the model predicts that the second sentence is more likely to be the next sentence, we'll swap the messages.
 """
-from transformers import AutoTokenizer, AutoModelForNextSentencePrediction
 import torch
 tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
 model = AutoModelForNextSentencePrediction.from_pretrained("bert-base-uncased")
@@ -186,10 +164,12 @@ def swap_messages_if_needed(message1, message2):
     swap = logits[0, 0] - logits[1, 0] < -0.2
     if swap:
         # Swap the messages
-        print(f"YES Swapping messages: {message1['message']} <-> {message2['message']}")
         return message2, message1
     else:
-        # print(f"NOT swapping messages: {message1['message']} <-> {message2['message']}")
         return message1, message2
@@ -208,8 +188,8 @@ def swap_messages_if_needed_in_conversation(conversation):
         new_conversation[-1] = message1
         new_conversation.append(message2)
-    # print(f"\nOriginal conversation:\n{printable_conversation(conversation)}")
-    # print(f"\nNew conversation:\n{printable_conversation(new_conversation)}")
     return new_conversation
@@ -226,26 +206,38 @@ test_conversation = [
         "timestamp": 3,
     },
 ]
-# print(swap_messages_if_needed_in_conversation(test_conversation))
 # %%
 # Now, we'll train an mT5 model to generate the next message in a conversation
 import os
-# For the contact_name, rewrite everything that is not 'Aldi' to 'Other'
-def rewrite_contact_name(conversation):
-    for message in conversation["conversations"]:
-        if message["contact_name"] != "Aldi":
-            message["contact_name"] = "Other"
-    return conversation
 # %%
-def process_chat_file(file, do_spelling_correction, do_reordering=False):
     """
     Process a chat file and return a dataset with the conversations.
     """
     ds = (
         datasets.load_dataset("text", data_files=[file])["train"]
         .filter(
@@ -288,6 +280,13 @@ def process_chat_file(file, do_spelling_correction, do_reordering=False):
     else:
         reordered_conversations_ds = spell_checked_conversations_ds
     changed_contact_name_ds = reordered_conversations_ds.map(
         rewrite_contact_name
     )  # , num_proc=os.cpu_count() - 1)
@@ -372,6 +371,10 @@ def transform_conversations_dataset_into_training_examples(
                         ]
                     }
                 )
         # Before returning, flatten the list of dictionaries into a dictionary of lists
         flattened_examples = {}
         for key in processed_examples[0].keys():

 import datetime
 import json
+import logging
+import os
 import re
+import datasets
+import dateutil.parser
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
 # %%
 # Now, create message groups ('conversations')
     )
+import contextualSpellCheck
 # %%
 # Use spacy to spell check the messages
 import spacy
 from spellchecker import SpellChecker
 spell = SpellChecker()
     for i, message in enumerate(conversation["conversations"]):
         # Use SpaCy to get the words
         words = spell.split_words(message["message"])
+        logger.info(f"Words: {words}")
         corrected_message = []
         for word in words:
             correction = spell.correction(word)
             if (correction != None) and (correction != word):
+                logger.info(f"Spell check: {word} -> {correction}")
                 corrected_message.append(correction)
             else:
                 corrected_message.append(word)
+        logger.info(f"Corrected message: {corrected_message}")
         joined_message = " ".join(corrected_message)
         conversation["conversations"][i]["message"] = joined_message
     docs = list(nlp.pipe([msg["message"] for msg in conversation["conversations"]]))
     for i, doc in enumerate(docs):
         if doc._.performed_spellCheck:
+            logger.info(f"Spell checked: {doc.text} -> {doc._.outcome_spellCheck}")
             conversation["conversations"][i]["message"] = doc._.outcome_spellCheck
     return conversation
 To do it, we'll use MobileBERT with the next sentence prediction head. We'll use the first message as the first sentence, and the second message as the second sentence. If the model predicts that the second sentence is more likely to be the next sentence, we'll swap the messages.
 """
 import torch
+from transformers import AutoModelForNextSentencePrediction, AutoTokenizer
 tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
 model = AutoModelForNextSentencePrediction.from_pretrained("bert-base-uncased")
     swap = logits[0, 0] - logits[1, 0] < -0.2
     if swap:
         # Swap the messages
+        logger.info(
+            f"Swapping messages: {message1['message']} <-> {message2['message']}"
+        )
         return message2, message1
     else:
+        # logger.info(f"NOT swapping messages: {message1['message']} <-> {message2['message']}")
         return message1, message2
         new_conversation[-1] = message1
         new_conversation.append(message2)
+    # logger.info(f"\nOriginal conversation:\n{printable_conversation(conversation)}")
+    # logger.info(f"\nNew conversation:\n{printable_conversation(new_conversation)}")
     return new_conversation
         "timestamp": 3,
     },
 ]
+# logger.info(swap_messages_if_needed_in_conversation(test_conversation))
 # %%
 # Now, we'll train an mT5 model to generate the next message in a conversation
 import os
 # %%
+def process_chat_file(file, do_spelling_correction, whatsapp_name, datetime_dayfirst, message_line_format, do_reordering=False):
     """
     Process a chat file and return a dataset with the conversations.
     """
+    exp = re.compile(
+        # r"(?P<msg_datetime>.+?) - (?P<contact_name>.+): (?P<message>.+)"
+        # r"\[?(?P<msg_datetime>\S+,\s\S+?(?:\s[APap][Mm])?)\]? (?:- )?(?P<contact_name>.+): (?P<message>.+)"
+        message_line_format
+    )
+    def process_line(example):
+        # The lines have this format: dd/mm/yy, hh:mm - <person>: <msg>
+        try:
+            groups = exp.match(example["text"]).groupdict()
+            timestamp = dateutil.parser.parse(groups['msg_datetime'], dayfirst=datetime_dayfirst).timestamp()
+            return {
+                "message": groups["message"],
+                "contact_name": groups["contact_name"],
+                "timestamp": timestamp,
+            }
+        except Exception as e:
+            logger.exception(example["text"])
+            raise e
     ds = (
         datasets.load_dataset("text", data_files=[file])["train"]
         .filter(
     else:
         reordered_conversations_ds = spell_checked_conversations_ds
+    # For the contact_name, rewrite everything that is not 'my_whatsapp_name' to 'Other'
+    def rewrite_contact_name(conversation):
+        for message in conversation["conversations"]:
+            if message["contact_name"] != whatsapp_name:
+                message["contact_name"] = "Other"
+        return conversation
     changed_contact_name_ds = reordered_conversations_ds.map(
         rewrite_contact_name
     )  # , num_proc=os.cpu_count() - 1)
                         ]
                     }
                 )
+            else:
+                logger.warning(
+                    f"Discarding conversation because the length is not at least {MIN_MESSAGES_THRESHOLD}: {messages}"
+                )
         # Before returning, flatten the list of dictionaries into a dictionary of lists
         flattened_examples = {}
         for key in processed_examples[0].keys():

validation.py CHANGED Viewed

@@ -1,7 +1,12 @@
-import numpy as np
 from collections import defaultdict
 import tiktoken
 def check_format_errors(train_dataset, user_role, model_role):
     """
@@ -24,7 +29,10 @@ def check_format_errors(train_dataset, user_role, model_role):
             if "role" not in message or "content" not in message:
                 format_errors["message_missing_key"] += 1
-            if any(k not in ("role", "content", "name", "function_call", "weight") for k in message):
                 format_errors["message_unrecognized_key"] += 1
             if message.get("role", None) not in ["system", user_role, model_role]:
@@ -40,14 +48,15 @@ def check_format_errors(train_dataset, user_role, model_role):
             format_errors["example_missing_assistant_message"] += 1
     if format_errors:
-        print("Found errors:")
         for k, v in format_errors.items():
-            print(f"{k}: {v}")
     else:
-        print("No errors found")
     return format_errors if format_errors else {}
 def get_distributions(train_dataset, user_role, model_role):
     """
     Extracted from: https://cookbook.openai.com/examples/chat_finetuning_data_prep
@@ -76,7 +85,6 @@ def get_distributions(train_dataset, user_role, model_role):
                 num_tokens += len(encoding.encode(message["content"]))
         return num_tokens
     n_missing_system = 0
     n_missing_user = 0
     n_messages = []
@@ -92,13 +100,13 @@ def get_distributions(train_dataset, user_role, model_role):
         n_messages.append(len(messages))
         convo_lens.append(num_tokens_from_messages(messages))
         assistant_message_lens.append(num_assistant_tokens_from_messages(messages))
     return {
         "n_missing_system": n_missing_system,
         "n_missing_user": n_missing_user,
         "n_messages": n_messages,
         "convo_lens": convo_lens,
-        "assistant_message_lens": assistant_message_lens
     }
@@ -106,48 +114,49 @@ def check_token_counts(train_dataset, user_role, model_role):
     """
     Extracted from: https://cookbook.openai.com/examples/chat_finetuning_data_prep
     """
-    def print_distribution(values, name):
-        print(f"\n#### Distribution of {name}:")
-        print(f"min / max: {min(values)}, {max(values)}")
-        print(f"mean / median: {np.mean(values)}, {np.median(values)}")
-        print(f"p5 / p95: {np.quantile(values, 0.1)}, {np.quantile(values, 0.9)}")
     # Warnings and tokens counts
-    distributions = get_distributions(train_dataset, user_role=user_role, model_role=model_role)
     n_missing_system = distributions["n_missing_system"]
     n_missing_user = distributions["n_missing_user"]
     n_messages = distributions["n_messages"]
     convo_lens = distributions["convo_lens"]
     assistant_message_lens = distributions["assistant_message_lens"]
-    print("Num examples missing system message:", n_missing_system)
-    print("Num examples missing user message:", n_missing_user)
     print_distribution(n_messages, "num_messages_per_example")
     print_distribution(convo_lens, "num_total_tokens_per_example")
     print_distribution(assistant_message_lens, "num_assistant_tokens_per_example")
     n_too_long = sum(l > 4096 for l in convo_lens)
-    print(
         f"\n{n_too_long} examples may be over the 4096 token limit, they will be truncated during fine-tuning"
     )
-    return
 def estimate_cost(train_dataset, user_role, model_role):
     """
     Extracted from: https://cookbook.openai.com/examples/chat_finetuning_data_prep
     """
-    distributions = get_distributions(train_dataset, user_role=user_role, model_role=model_role)
     n_missing_system = distributions["n_missing_system"]
     n_missing_user = distributions["n_missing_user"]
     n_messages = distributions["n_messages"]
     convo_lens = distributions["convo_lens"]
     assistant_message_lens = distributions["assistant_message_lens"]
     # Pricing and default n_epochs estimate
     MAX_TOKENS_PER_EXAMPLE = 4096
@@ -159,10 +168,13 @@ def estimate_cost(train_dataset, user_role, model_role):
     n_epochs = TARGET_EPOCHS
     n_train_examples = len(train_dataset)
-    if n_train_examples * TARGET_EPOCHS < MIN_TARGET_EXAMPLES:
-        n_epochs = min(MAX_DEFAULT_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)
-    elif n_train_examples * TARGET_EPOCHS > MAX_TARGET_EXAMPLES:
-        n_epochs = max(MIN_DEFAULT_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)
     n_billing_tokens_in_dataset = sum(
         min(MAX_TOKENS_PER_EXAMPLE, length) for length in convo_lens
@@ -170,5 +182,6 @@ def estimate_cost(train_dataset, user_role, model_role):
     return {
         "Estimated number of tokens in dataset": n_billing_tokens_in_dataset,
-        f"Estimated number of tokens that will be billed (assuming {n_epochs} training epochs)": n_epochs * n_billing_tokens_in_dataset
     }

+import logging
 from collections import defaultdict
+import numpy as np
 import tiktoken
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
 def check_format_errors(train_dataset, user_role, model_role):
     """
             if "role" not in message or "content" not in message:
                 format_errors["message_missing_key"] += 1
+            if any(
+                k not in ("role", "content", "name", "function_call", "weight")
+                for k in message
+            ):
                 format_errors["message_unrecognized_key"] += 1
             if message.get("role", None) not in ["system", user_role, model_role]:
             format_errors["example_missing_assistant_message"] += 1
     if format_errors:
+        logger.warning("Found errors:")
         for k, v in format_errors.items():
+            logger.warning(f"{k}: {v}")
     else:
+        logger.info("No errors found")
     return format_errors if format_errors else {}
 def get_distributions(train_dataset, user_role, model_role):
     """
     Extracted from: https://cookbook.openai.com/examples/chat_finetuning_data_prep
                 num_tokens += len(encoding.encode(message["content"]))
         return num_tokens
     n_missing_system = 0
     n_missing_user = 0
     n_messages = []
         n_messages.append(len(messages))
         convo_lens.append(num_tokens_from_messages(messages))
         assistant_message_lens.append(num_assistant_tokens_from_messages(messages))
     return {
         "n_missing_system": n_missing_system,
         "n_missing_user": n_missing_user,
         "n_messages": n_messages,
         "convo_lens": convo_lens,
+        "assistant_message_lens": assistant_message_lens,
     }
     """
     Extracted from: https://cookbook.openai.com/examples/chat_finetuning_data_prep
     """
+    def print_distribution(values, name):
+        logger.info(f"\n#### Distribution of {name}:")
+        logger.info(f"min / max: {min(values)}, {max(values)}")
+        logger.info(f"mean / median: {np.mean(values)}, {np.median(values)}")
+        logger.info(f"p5 / p95: {np.quantile(values, 0.1)}, {np.quantile(values, 0.9)}")
     # Warnings and tokens counts
+    distributions = get_distributions(
+        train_dataset, user_role=user_role, model_role=model_role
+    )
     n_missing_system = distributions["n_missing_system"]
     n_missing_user = distributions["n_missing_user"]
     n_messages = distributions["n_messages"]
     convo_lens = distributions["convo_lens"]
     assistant_message_lens = distributions["assistant_message_lens"]
+    logger.info("Num examples missing system message:", n_missing_system)
+    logger.info("Num examples missing user message:", n_missing_user)
     print_distribution(n_messages, "num_messages_per_example")
     print_distribution(convo_lens, "num_total_tokens_per_example")
     print_distribution(assistant_message_lens, "num_assistant_tokens_per_example")
     n_too_long = sum(l > 4096 for l in convo_lens)
+    logger.info(
         f"\n{n_too_long} examples may be over the 4096 token limit, they will be truncated during fine-tuning"
     )
+    return
 def estimate_cost(train_dataset, user_role, model_role):
     """
     Extracted from: https://cookbook.openai.com/examples/chat_finetuning_data_prep
     """
+    distributions = get_distributions(
+        train_dataset, user_role=user_role, model_role=model_role
+    )
     n_missing_system = distributions["n_missing_system"]
     n_missing_user = distributions["n_missing_user"]
     n_messages = distributions["n_messages"]
     convo_lens = distributions["convo_lens"]
     assistant_message_lens = distributions["assistant_message_lens"]
     # Pricing and default n_epochs estimate
     MAX_TOKENS_PER_EXAMPLE = 4096
     n_epochs = TARGET_EPOCHS
     n_train_examples = len(train_dataset)
+    try:
+        if n_train_examples * TARGET_EPOCHS < MIN_TARGET_EXAMPLES:
+            n_epochs = min(MAX_DEFAULT_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)
+        elif n_train_examples * TARGET_EPOCHS > MAX_TARGET_EXAMPLES:
+            n_epochs = max(MIN_DEFAULT_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)
+    except:
+        n_epochs = TARGET_EPOCHS
     n_billing_tokens_in_dataset = sum(
         min(MAX_TOKENS_PER_EXAMPLE, length) for length in convo_lens
     return {
         "Estimated number of tokens in dataset": n_billing_tokens_in_dataset,
+        f"Estimated number of tokens that will be billed (assuming {n_epochs} training epochs)": n_epochs
+        * n_billing_tokens_in_dataset,
     }