import os
from collections import defaultdict
import gradio as gr
from dotenv import load_dotenv
from huggingface_hub import (
    add_collection_item,
    create_collection,
    list_datasets,
)
from toolz import unique
from tqdm.auto import tqdm
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")
LIMIT = None
NAMESPACE = "DIBT"

def extract_languages(dataset_info):
    # Dataset tags look like "language:en"; keep only the language code
    return [
        tag.split(":")[1] for tag in dataset_info.tags if tag.startswith("language:")
    ]

def create_dataset_info():
    # Fetch metadata for all datasets on the Hub (LIMIT=None fetches everything)
    all_datasets = list(tqdm(list_datasets(full=True, limit=LIMIT)))
    all_datasets = [d for d in all_datasets if "open-llm-leaderboard" not in d.id]
    # A dataset counts as DPO if its ID contains "_dpo"/"dpo_" or it carries a "dpo" tag
    dpo_in_name = [
        dataset
        for dataset in all_datasets
        if "_dpo" in dataset.id or "dpo_" in dataset.id
    ]
    dpo_in_tags = [
        dataset for dataset in all_datasets if any(tag == "dpo" for tag in dataset.tags)
    ]
    all_dpo_datasets = dpo_in_name + dpo_in_tags
    dpo_datasets = list(unique(all_dpo_datasets, key=lambda x: x.id))
    # Keep only datasets whose card declares at least one language
    dpo_datasets = [d for d in dpo_datasets if d.card_data is not None]
    dpo_datasets_with_languages = [
        dpo_dataset
        for dpo_dataset in dpo_datasets
        if dpo_dataset.card_data.get("language") is not None
    ]
    # Group the datasets by language code
    language_groups = defaultdict(list)
    for dataset in dpo_datasets_with_languages:
        languages = extract_languages(dataset)
        for language in languages:
            language_groups[language].append(dataset)
    return language_groups

def create_update_collections(language_groups):
    collections = {}
    for language, dataset_list in language_groups.items():
        collection_title = f"DPO datasets for {language.upper()}"
        # exists_ok=True returns the existing collection instead of raising
        collection = create_collection(
            title=collection_title,
            description=f"A collection of DPO datasets for the {language.upper()} language.",
            exists_ok=True,
            namespace=NAMESPACE,
            token=HF_TOKEN,
        )
        # Only add datasets that are not already in the collection
        existing_items = {item.item_id for item in collection.items}
        for dataset in dataset_list:
            if dataset.id not in existing_items:
                add_collection_item(
                    collection.slug,
                    item_id=dataset.id,
                    item_type="dataset",
                    token=HF_TOKEN,
                )
        collections[language] = collection
    return collections

def display_datasets(language):
    if language not in datasets:
        return "No datasets found for the selected language."
    dataset_list = datasets[language]
    collection = collections[language]
    output = f"## Datasets for {language.upper()}\n\n"
    output += f"Total datasets: {len(dataset_list)}\n\n"
    output += f"View the Hugging Face [Collection](https://huggingface.co/collections/{collection.slug}) for this language.\n\n"
    for dataset in dataset_list:
        output += f"- [{dataset.id}](https://huggingface.co/datasets/{dataset.id})\n"
    return output

def display_overview():
    # Use a distinct loop variable to avoid shadowing the global `datasets` dict
    total_datasets = sum(len(dataset_list) for dataset_list in datasets.values())
    total_languages = len(datasets)
    overview = "## Dataset Overview\n\n"
    overview += f"- Total number of datasets: {total_datasets}\n"
    overview += f"- Total number of languages covered: {total_languages}\n\n"
    overview += "### Datasets per Language\n\n"
    for language, dataset_list in datasets.items():
        collection = collections[language]
        overview += f"- {language.upper()}: {len(dataset_list)} datasets ([View Collection](https://huggingface.co/collections/{collection.slug}))\n"
    return overview

# Create the dataset information
datasets = create_dataset_info()
# Create/update collections for each language
collections = create_update_collections(datasets)
# Get the list of available languages
languages = list(datasets.keys())
overview = """
This Space shows an overview of preference datasets, in particular DPO style datasets, available on the Hugging Face Hub across different languages
Recently, [Odds Ratio Preference Optimization](https://huggingface.co/papers/2403.07691) ORPO has been demonstrated to be a powerful tool for training better performing language models directly from preference datasets.
- ORPO can be done using DPO style datasets
- Is a key ingredient for training better models for every language having enough DPO datasets for different languages?
- This Space aims to track the number DPO datasets available on the Hugging Face Hub for different languages.
"""
dpo = """
#### What is Direct Preference Optimization (DPO)?
DPO is a machine learning approach designed to optimize language models based on direct user preferences, bypassing the traditional reward modeling phase. It works by:
1. Calculating log probabilities of preferred and less preferred outputs from a language model.
2. Adjusting the model to maximize the likelihood of preferred outputs.
This makes the optimization process simpler and potentially more effective by directly targeting what users deem desirable or preferable in language model responses.
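Schematically, DPO rewards a larger gap between the policy's log-probability of the chosen output and of the rejected output, relative to a frozen reference model. Below is a minimal sketch of the loss (illustrative only; it assumes per-sequence log-probabilities have already been computed, and all names are made up for the example):

```python
import torch.nn.functional as F

def dpo_loss(policy_chosen_logps, policy_rejected_logps,
             ref_chosen_logps, ref_rejected_logps, beta=0.1):
    # Log-ratios of the policy vs. the frozen reference model
    chosen_rewards = beta * (policy_chosen_logps - ref_chosen_logps)
    rejected_rewards = beta * (policy_rejected_logps - ref_rejected_logps)
    # Maximise the margin between chosen and rejected completions
    return -F.logsigmoid(chosen_rewards - rejected_rewards).mean()
```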
A DPO dataset includes three components (an example record follows the list):
- **Input**: The input text or prompt that the language model receives.
- **Chosen Output**: The output text that the user prefers.
- **Rejected Output**: The output text that is less preferred by the user.
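For instance, a single DPO record might look like this (an illustrative sketch; real datasets use varying column names, with `prompt`/`chosen`/`rejected` being a common convention):

```python
dpo_record = {
    "prompt": "Explain what a preference dataset is in one sentence.",
    "chosen": "A preference dataset pairs a prompt with a preferred response and a less preferred one, so a model can learn which outputs people favour.",
    "rejected": "It is a dataset.",
}
```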
#### What is ORPO?
Odds Ratio Preference Optimization (ORPO) is a refinement that does not require a reference model for preference alignment. ORPO trains a language model directly, without a separate SFT step, meaning you can do SFT and preference training in one stage. ORPO uses the same dataset format as DPO, but the training process is different. This means any DPO dataset can be used for ORPO training!

Recently, Argilla, KAIST, and Hugging Face created [zephyr-orpo-141b-A35b-v0.1](https://huggingface.co/HuggingFaceH4/zephyr-orpo-141b-A35b-v0.1) using ORPO. This model shows very strong performance whilst using only 70k examples for training, which suggests that ORPO could be a very powerful tool for training better-performing open language models for many languages.

The only thing that might be missing is enough DPO datasets for training these models. This Space aims to track which DPO datasets are available for different languages and how many are available for each language.
"""
adding_datasets = """
## Adding a dataset
To include a dataset in this Space, it should have a `dpo` tag, or its ID should contain `_dpo` or `dpo_`.
Additionally, the dataset card should include language metadata. If you know of another dataset that should be included in this Space, please:
1. Add the `dpo` tag to the dataset
2. Include the language metadata in the dataset card
3. Open a discussion in this Space.
I'll refresh the list of datasets to include it 🤗
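If you prefer to update the metadata programmatically, the `huggingface_hub` library can edit the dataset card directly. A minimal sketch, assuming you have write access to the dataset repo (the repo ID below is hypothetical):

```python
from huggingface_hub import metadata_update

# Add the "dpo" tag and language metadata to a (hypothetical) dataset card.
# May require overwrite=True if these fields are already set to other values.
metadata_update(
    repo_id="your-username/your-dpo-dataset",
    metadata={"tags": ["dpo"], "language": ["en"]},
    repo_type="dataset",
)
```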
"""
faq = """
## Frequently Asked Questions
**Q: What is the difference between DPO and ORPO?**
A: DPO and ORPO both use direct user preferences to optimize language models, but ORPO does not require a separate reference model for preference alignment. ORPO can perform supervised fine-tuning (SFT) and preference training in a single stage.
**Q: Can I use DPO datasets for ORPO training?**
A: Yes! Since ORPO uses the same dataset format as DPO, any DPO dataset can be used for ORPO training.
**Q: How can I contribute to this Space?**
A: If you know of a dataset that should be included, make sure it has the `dpo` tag or the appropriate dataset ID format, and include language metadata in the dataset card. Then, open a discussion in this Space to let me know about the dataset.
"""

with gr.Blocks() as demo:
    gr.HTML(
        "<h1 style='text-align: center;'>🌍 DPO Datasets by Language 🗣️</h1>",
    )
    gr.Markdown(overview)
    gr.Markdown(display_overview())
    with gr.Row():
        with gr.Column():
            language_dropdown = gr.Dropdown(languages, label="Select Language")
            dataset_info = gr.Markdown()
    language_dropdown.change(
        display_datasets, inputs=language_dropdown, outputs=dataset_info
    )
    with gr.Accordion("More Information", open=False):
        gr.Markdown(dpo)
        gr.Markdown(adding_datasets)
        gr.Markdown(faq)

demo.launch()