import os
from collections import defaultdict

import gradio as gr
from dotenv import load_dotenv
from huggingface_hub import (
    add_collection_item,
    create_collection,
    list_datasets,
)
from toolz import unique
from tqdm.auto import tqdm

load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")
LIMIT = None
NAMESPACE = "DIBT"


def extract_languages(dataset_info):
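    """Return the language codes declared in a dataset's `language:` tags."""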
    return [tag.split(":")[1] for tag in dataset_info.tags if tag.startswith("language:")]


def create_dataset_info():
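    """Find DPO-style datasets on the Hub and group them by declared language."""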
    all_datasets = list(tqdm(list_datasets(full=True, limit=LIMIT)))
    all_datasets = [d for d in all_datasets if "open-llm-leaderboard" not in d.id]

    dpo_in_name = [
        dataset for dataset in all_datasets if "_dpo" in dataset.id or "dpo_" in dataset.id
    ]
    dpo_in_tags = [
        dataset for dataset in all_datasets if any(tag == "dpo" for tag in dataset.tags)
    ]

    all_dpo_datasets = dpo_in_name + dpo_in_tags
    dpo_datasets = list(unique(all_dpo_datasets, key=lambda x: x.id))
    dpo_datasets = [d for d in dpo_datasets if d.card_data is not None]
    dpo_datasets_with_languages = [
        dpo_dataset
        for dpo_dataset in dpo_datasets
        if dpo_dataset.card_data.get("language") is not None
    ]

    language_groups = defaultdict(list)
    for dataset in dpo_datasets_with_languages:
        languages = extract_languages(dataset)
        for language in languages:
            language_groups[language].append(dataset)

    return language_groups


def create_update_collections(language_groups):
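    """Create (or fetch) one Hub collection per language and add any datasets not already in it."""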
    collections = {}
    for language, dataset_list in language_groups.items():
        collection_title = f"DPO datasets for {language.upper()}"
        collection = create_collection(
            title=collection_title,
            description=f"A collection of DPO datasets for the {language.upper()} language.",
            exists_ok=True,
            namespace=NAMESPACE,
            token=HF_TOKEN,
        )
        existing_items = {item.item_id for item in collection.items}

        for dataset in dataset_list:
            if dataset.id not in existing_items:
                add_collection_item(
                    collection.slug, item_id=dataset.id, item_type="dataset", token=HF_TOKEN
                )

        collections[language] = collection

    return collections


def display_datasets(language):
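    """Render a Markdown listing of the DPO datasets available for the selected language."""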
    if language not in datasets:
        return "No datasets found for the selected language."
    dataset_list = datasets[language]
    collection = collections[language]
    output = f"## Datasets for {language.upper()}\n\n"
    output += f"Total datasets: {len(dataset_list)}\n\n"
    output += f"View Hugging Face [Collection](https://huggingface.co/collections/{collection.slug}) for language.\n\n"
    for dataset in dataset_list:
        output += f"- [{dataset.id}](https://huggingface.co/datasets/{dataset.id})\n"
    return output


def display_overview():
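    """Render a Markdown summary of dataset counts per language, with links to each collection."""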
    total_datasets = sum(len(dataset_list) for dataset_list in datasets.values())
    total_languages = len(datasets)

    overview = "## Dataset Overview\n\n"
    overview += f"- Total number of datasets: {total_datasets}\n"
    overview += f"- Total number of languages covered: {total_languages}\n\n"

    overview += "### Datasets per Language\n\n"
    for language, dataset_list in datasets.items():
        collection = collections[language]
        overview += f"- {language.upper()}: {len(dataset_list)} datasets ([View Collection](https://huggingface.co/collections/{collection.slug}))\n"

    return overview


# Create the dataset information
datasets = create_dataset_info()

# Create/update collections for each language
collections = create_update_collections(datasets)

# Get the list of available languages
languages = list(datasets.keys())

overview = """
This Space shows an overview of the preference datasets, in particular DPO-style datasets, that are available on the Hugging Face Hub across different languages.

Recently, [Odds Ratio Preference Optimization (ORPO)](https://huggingface.co/papers/2403.07691) has been shown to be a powerful way of training better-performing language models directly from preference datasets.

- ORPO can be trained on DPO-style datasets.
- Could having enough DPO datasets for different languages be a key ingredient for training better models in every language?
- This Space aims to track the number of DPO datasets available on the Hugging Face Hub for different languages.
"""

dpo = """
#### What is Direct Preference Optimization (DPO)?

DPO is a machine learning approach designed to optimize language models based on direct user preferences, bypassing the traditional reward modeling phase. It works by:

1. Calculating log probabilities of preferred and less preferred outputs from a language model.
2. Adjusting the model to maximize the likelihood of preferred outputs.

This makes the optimization process simpler and potentially more effective by directly targeting what users deem desirable or preferable in language model responses.
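
For intuition, here is a minimal, illustrative sketch of the DPO objective (it is not part of this Space's code). The inputs are assumed to be per-sequence log-probabilities, summed over tokens, from the policy being trained and from a frozen reference model:

```python
import torch.nn.functional as F

def dpo_loss(policy_chosen_logps, policy_rejected_logps,
             ref_chosen_logps, ref_rejected_logps, beta=0.1):
    # How much more (or less) likely the policy makes each output than the reference model
    chosen_logratio = policy_chosen_logps - ref_chosen_logps
    rejected_logratio = policy_rejected_logps - ref_rejected_logps
    # Reward the margin between the chosen and rejected log-ratios, scaled by beta
    return -F.logsigmoid(beta * (chosen_logratio - rejected_logratio)).mean()
```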

A DPO dataset includes three components (an example record is shown after this list):

- **Input**: The input text or prompt that the language model receives.
- **Chosen Output**: The output text that the user prefers.
- **Rejected Output**: The output text that is less preferred by the user.
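
As an illustration, a single record might look like the following (the exact column names vary between datasets; `prompt`/`chosen`/`rejected` is a common convention, and the values below are made up):

```python
example_record = {
    "prompt": "How do I make a cup of tea?",
    "chosen": "Boil water, pour it over a tea bag in a cup, and let it steep for 3-5 minutes.",
    "rejected": "Tea is a type of soup.",
}
```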

#### What is ORPO?

Odds Ratio Preference Optimization (ORPO) is a refinement that does not require a reference model for preference alignment. ORPO trains a language model directly, without a separate SFT step, meaning supervised fine-tuning and preference training happen in a single stage. ORPO uses the same dataset format as DPO, but the training process is different, so any DPO dataset can be used for ORPO training! A minimal training sketch is shown below.
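
As a rough sketch (assuming TRL's `ORPOTrainer`; the exact arguments can differ between TRL versions, and the model and dataset IDs below are placeholders), ORPO training on a DPO-style dataset looks something like this:

```python
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import ORPOConfig, ORPOTrainer

model_name = "your-base-model"    # placeholder model ID
dataset_id = "your-dpo-dataset"   # placeholder: any dataset with prompt/chosen/rejected columns

model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
train_dataset = load_dataset(dataset_id, split="train")

# ORPO-specific hyperparameters (e.g. beta, max_length) can also be set on the config
args = ORPOConfig(output_dir="orpo-model")
trainer = ORPOTrainer(model=model, args=args, train_dataset=train_dataset, tokenizer=tokenizer)
trainer.train()
```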

Recently, Argilla, KAIST, and Hugging Face created [zephyr-orpo-141b-A35b-v0.1](https://huggingface.co/HuggingFaceH4/zephyr-orpo-141b-A35b-v0.1) using ORPO. The model shows very strong performance while using only 70k examples for training, which suggests that ORPO could be a very powerful tool for training better-performing open language models for many languages.

What might be missing is enough DPO datasets for training these models. This Space aims to track which DPO datasets are available for different languages and how many datasets exist for each language.
"""

adding_datasets = """
## Adding a dataset

To be included in this Space, a dataset should either have a `dpo` tag or a dataset ID containing `_dpo` or `dpo_`.
Additionally, the dataset card should include language metadata. If you know of another dataset that should be included in this Space, please:

1. Add the `dpo` tag to the dataset
2. Include the language metadata in the dataset card
3. Open a discussion in this Space. 

I'll refresh the list of datasets to include it 🤗
"""
faq = """

## Frequently Asked Questions

**Q: What is the difference between DPO and ORPO?**
A: DPO and ORPO both use direct user preferences to optimize language models, but ORPO does not require a separate reference model for preference alignment. ORPO can perform supervised fine-tuning (SFT) and preference training in a single stage.

**Q: Can I use DPO datasets for ORPO training?**
A: Yes! Since ORPO uses the same dataset format as DPO, any DPO dataset can be used for ORPO training.

**Q: How can I contribute to this Space?**
A: If you know of a dataset that should be included, make sure it has the `dpo` tag or the appropriate dataset ID format, and include language metadata in the dataset card. Then, open a discussion in this Space to let me know about the dataset.
"""

with gr.Blocks() as demo:
    gr.HTML(
        "<h1 style='text-align: center;'>🌍 DPO Datasets by Language πŸ—£οΈ</h1>",
    )
    gr.Markdown(overview)

    gr.Markdown(display_overview())

    with gr.Row():
        with gr.Column():
            language_dropdown = gr.Dropdown(languages, label="Select Language")
            dataset_info = gr.Markdown()

    language_dropdown.change(display_datasets, inputs=language_dropdown, outputs=dataset_info)
    with gr.Accordion("More Information", open=False):
        gr.Markdown(dpo)
        gr.Markdown(adding_datasets)
        gr.Markdown(faq)

demo.launch()