davanstrien (HF staff) committed
Commit dec1d11
• 1 Parent(s): 4476ba5

Refactor app.py to improve performance

Files changed (1)
  1. app.py +88 -31
app.py CHANGED
@@ -1,36 +1,33 @@
 import gradio as gr
 from huggingface_hub import (
-    list_datasets,
     create_collection,
     get_collection,
-    add_collection_item,
-    update_collection_item,
 )
-from tqdm.auto import tqdm
 from toolz import unique
-from collections import defaultdict
-from huggingface_hub import login
-import os
-from dotenv import load_dotenv
 
 load_dotenv()
 login(token=os.getenv("HF_TOKEN"))
 
 
 def extract_languages(dataset_info):
-    return [
-        tag.split(":")[1] for tag in dataset_info.tags if tag.startswith("language:")
-    ]
 
 
 def create_dataset_info():
-    all_datasets = list(tqdm(list_datasets(full=True)))
     all_datasets = [d for d in all_datasets if "open-llm-leaderboard" not in d.id]
 
     dpo_in_name = [
-        dataset
-        for dataset in all_datasets
-        if "_dpo" in dataset.id or "dpo_" in dataset.id
     ]
     dpo_in_tags = [
         dataset for dataset in all_datasets if any(tag == "dpo" for tag in dataset.tags)
@@ -62,6 +59,7 @@ def create_update_collections(language_groups):
             collection = create_collection(
                 title=collection_title,
                 description=f"A collection of DPO datasets for the {language.upper()} language.",
             )
         except Exception:
             collection = get_collection(f"DPO-datasets-for-{language.upper()}")
@@ -70,9 +68,7 @@ def create_update_collections(language_groups):
 
         for dataset in dataset_list:
             if dataset.id not in existing_items:
-                add_collection_item(
-                    collection.slug, item_id=dataset.id, item_type="dataset"
-                )
 
         collections[language] = collection
 
@@ -86,9 +82,7 @@ def display_datasets(language):
     collection = collections[language]
     output = f"## Datasets for {language.upper()}\n\n"
     output += f"Total datasets: {len(dataset_list)}\n\n"
-    output += (
-        f"[View Collection](https://huggingface.co/collections/{collection.slug})\n\n"
-    )
     for dataset in dataset_list:
         output += f"- [{dataset.id}](https://huggingface.co/datasets/{dataset.id})\n"
     return output
@@ -119,20 +113,83 @@ collections = create_update_collections(datasets)
 # Get the list of available languages
 languages = list(datasets.keys())
 
-with gr.Blocks() as iface:
-    gr.Markdown("# DPO Datasets by Language")
-    gr.Markdown("Explore DPO datasets grouped by language.")
 
     with gr.Row():
         with gr.Column():
             language_dropdown = gr.Dropdown(languages, label="Select Language")
             dataset_info = gr.Markdown()
 
-    with gr.Column():
-        overview = gr.Markdown(display_overview())
-
-    language_dropdown.change(
-        display_datasets, inputs=language_dropdown, outputs=dataset_info
-    )
 
-iface.launch()
 
+import os
+from collections import defaultdict
+
 import gradio as gr
+from dotenv import load_dotenv
 from huggingface_hub import (
+    add_collection_item,
     create_collection,
     get_collection,
+    list_datasets,
+    login,
 )
 from toolz import unique
+from tqdm.auto import tqdm
 
 load_dotenv()
 login(token=os.getenv("HF_TOKEN"))
+LIMIT = None
 
 
 def extract_languages(dataset_info):
+    return [tag.split(":")[1] for tag in dataset_info.tags if tag.startswith("language:")]
 
 
 def create_dataset_info():
+    all_datasets = list(tqdm(list_datasets(full=True, limit=LIMIT)))
     all_datasets = [d for d in all_datasets if "open-llm-leaderboard" not in d.id]
 
     dpo_in_name = [
+        dataset for dataset in all_datasets if "_dpo" in dataset.id or "dpo_" in dataset.id
     ]
     dpo_in_tags = [
         dataset for dataset in all_datasets if any(tag == "dpo" for tag in dataset.tags)
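For orientation, the filtering and grouping that this hunk refactors can be sketched end to end as follows. The function name, the combined name-or-tag filter, and the grouping step are illustrative assumptions rather than code from this commit; the `huggingface_hub` and `toolz` calls are the same ones imported above.

```python
# Illustrative sketch: list datasets, keep likely DPO datasets, and group them
# by the "language:xx" tags that extract_languages() parses above.
from collections import defaultdict

from huggingface_hub import list_datasets
from toolz import unique


def group_dpo_datasets_by_language(limit=None):
    all_datasets = list(list_datasets(full=True, limit=limit))
    dpo_datasets = [
        d
        for d in all_datasets
        if "_dpo" in d.id or "dpo_" in d.id or any(tag == "dpo" for tag in (d.tags or []))
    ]
    dpo_datasets = list(unique(dpo_datasets, key=lambda d: d.id))  # drop duplicates

    by_language = defaultdict(list)
    for dataset in dpo_datasets:
        for tag in dataset.tags or []:
            if tag.startswith("language:"):
                by_language[tag.split(":")[1]].append(dataset)
    return by_language
```

Passing `limit=None`, as the new module-level `LIMIT` does, lists everything; a small limit is handy for local testing.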
 
             collection = create_collection(
                 title=collection_title,
                 description=f"A collection of DPO datasets for the {language.upper()} language.",
+                exists_ok=True,
             )
         except Exception:
             collection = get_collection(f"DPO-datasets-for-{language.upper()}")
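The `exists_ok=True` argument added in this hunk lets `create_collection` tolerate a collection that already exists instead of raising, so the `except`/`get_collection` fallback becomes more of a safety net. Below is a minimal usage sketch of these `huggingface_hub` helpers, assuming a logged-in session with write access; the collection title and dataset ID are placeholders.

```python
# Minimal sketch of the collection helpers used above (placeholder names).
from huggingface_hub import add_collection_item, create_collection

collection = create_collection(
    title="DPO datasets for EN",  # placeholder, mirrors the title pattern above
    description="A collection of DPO datasets for the EN language.",
    exists_ok=True,  # do not raise if the collection already exists
)

existing_items = {item.item_id for item in collection.items}
if "argilla/distilabel-intel-orca-dpo-pairs" not in existing_items:  # placeholder dataset ID
    add_collection_item(
        collection.slug, item_id="argilla/distilabel-intel-orca-dpo-pairs", item_type="dataset"
    )
```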
 
 
         for dataset in dataset_list:
             if dataset.id not in existing_items:
+                add_collection_item(collection.slug, item_id=dataset.id, item_type="dataset")
 
         collections[language] = collection
 
     collection = collections[language]
     output = f"## Datasets for {language.upper()}\n\n"
     output += f"Total datasets: {len(dataset_list)}\n\n"
+    output += f"View the Hugging Face [Collection](https://huggingface.co/collections/{collection.slug}) for this language.\n\n"
     for dataset in dataset_list:
         output += f"- [{dataset.id}](https://huggingface.co/datasets/{dataset.id})\n"
     return output
 # Get the list of available languages
 languages = list(datasets.keys())
 
+overview = """
+This Space shows an overview of Direct Preference Optimization (DPO) datasets available on the Hugging Face Hub across different languages.
+
+Recently, ORPO has been demonstrated to be a powerful tool for training better-performing language models.
+
+- ORPO training can be done using DPO-style datasets
+- Is having enough DPO datasets for different languages a key ingredient for training better models for every language?
+- This Space aims to track which DPO datasets are available for different languages and how many datasets exist for each language!"""
+
+dpo = """
+#### What is Direct Preference Optimization (DPO)?
+
+DPO is a machine learning approach designed to optimize language models based on direct user preferences, bypassing the traditional reward modeling phase. It works by:
+
+1. Calculating log probabilities of preferred and less preferred outputs from a language model.
+2. Adjusting the model to maximize the likelihood of preferred outputs.
+
+This makes the optimization process simpler and potentially more effective by directly targeting what users deem desirable or preferable in language model responses.
+
+A DPO dataset includes three components:
+
+- **Input**: The input text or prompt that the language model receives.
+- **Chosen Output**: The output text that the user prefers.
+- **Rejected Output**: The output text that is less preferred by the user.
+
+#### What is ORPO?
+
+Odds Ratio Preference Optimization (ORPO) is a refinement that does not require a reference model for preference alignment. ORPO directly trains a language model without an SFT step, meaning you can do SFT and preference training in one stage. ORPO uses the same dataset format as DPO, but the training process is different. This means any DPO dataset can be used for ORPO training!
+
+Recently, Argilla, KAIST, and Hugging Face created [zephyr-orpo-141b-A35b-v0.1](https://huggingface.co/HuggingFaceH4/zephyr-orpo-141b-A35b-v0.1) using ORPO. This model shows very strong performance whilst using only 70k examples for training. This suggests that ORPO could be a very powerful tool for training better-performing open language models for many languages.
+
+The only thing that might be missing is enough DPO datasets for training these models. This Space aims to track what DPO datasets are available for different languages and how many datasets are available for each language.
+"""
+
+adding_datasets = """
+## Adding a dataset
+
+To include a dataset in this Space, it should have either a `dpo` tag or a dataset ID containing `_dpo` or `dpo_`.
+Additionally, the dataset card should include language metadata. If you know of another dataset that should be included in this Space, please:
+
+1. Add the `dpo` tag to the dataset
+2. Include the language metadata in the dataset card
+3. Open a discussion in this Space.
+
+I'll refresh the list of datasets to include it 🤗
+"""
+faq = """
+
+## Frequently Asked Questions
+
+**Q: What is the difference between DPO and ORPO?**
+A: DPO and ORPO both use direct user preferences to optimize language models, but ORPO does not require a separate reference model for preference alignment. ORPO can perform supervised fine-tuning (SFT) and preference training in a single stage.
+
+**Q: Can I use DPO datasets for ORPO training?**
+A: Yes! Since ORPO uses the same dataset format as DPO, any DPO dataset can be used for ORPO training.
+
+**Q: How can I contribute to this Space?**
+A: If you know of a dataset that should be included, make sure it has the `dpo` tag or the appropriate dataset ID format, and include language metadata in the dataset card. Then, open a discussion in this Space to let me know about the dataset.
+"""
+
+with gr.Blocks() as demo:
+    gr.HTML(
+        "<h1 style='text-align: center;'>🌍 DPO Datasets by Language 🗣️</h1>",
+    )
+    gr.Markdown(overview)
+
+    overview = gr.Markdown(display_overview())
 
     with gr.Row():
         with gr.Column():
             language_dropdown = gr.Dropdown(languages, label="Select Language")
             dataset_info = gr.Markdown()
 
+    language_dropdown.change(display_datasets, inputs=language_dropdown, outputs=dataset_info)
+    with gr.Accordion("More Information", open=False):
+        gr.Markdown(dpo)
+        gr.Markdown(adding_datasets)
+        gr.Markdown(faq)
 
+demo.launch()