davanstrien (HF staff) committed
Commit dec1d11
• 1 Parent(s): 4476ba5

Refactor app.py to improve performance

Files changed (1)
  1. app.py +88 -31
app.py CHANGED
@@ -1,36 +1,33 @@
 import gradio as gr
 from huggingface_hub import (
-    list_datasets,
     create_collection,
     get_collection,
-    add_collection_item,
-    update_collection_item,
 )
-from tqdm.auto import tqdm
 from toolz import unique
-from collections import defaultdict
-from huggingface_hub import login
-import os
-from dotenv import load_dotenv
 
 load_dotenv()
 login(token=os.getenv("HF_TOKEN"))
 
 
 def extract_languages(dataset_info):
-    return [
-        tag.split(":")[1] for tag in dataset_info.tags if tag.startswith("language:")
-    ]
 
 
 def create_dataset_info():
-    all_datasets = list(tqdm(list_datasets(full=True)))
     all_datasets = [d for d in all_datasets if "open-llm-leaderboard" not in d.id]
 
     dpo_in_name = [
-        dataset
-        for dataset in all_datasets
-        if "_dpo" in dataset.id or "dpo_" in dataset.id
     ]
     dpo_in_tags = [
         dataset for dataset in all_datasets if any(tag == "dpo" for tag in dataset.tags)
@@ -62,6 +59,7 @@ def create_update_collections(language_groups):
             collection = create_collection(
                 title=collection_title,
                 description=f"A collection of DPO datasets for the {language.upper()} language.",
             )
         except Exception:
             collection = get_collection(f"DPO-datasets-for-{language.upper()}")
@@ -70,9 +68,7 @@ def create_update_collections(language_groups):
 
         for dataset in dataset_list:
             if dataset.id not in existing_items:
-                add_collection_item(
-                    collection.slug, item_id=dataset.id, item_type="dataset"
-                )
 
         collections[language] = collection
 
@@ -86,9 +82,7 @@ def display_datasets(language):
     collection = collections[language]
     output = f"## Datasets for {language.upper()}\n\n"
     output += f"Total datasets: {len(dataset_list)}\n\n"
-    output += (
-        f"[View Collection](https://huggingface.co/collections/{collection.slug})\n\n"
-    )
     for dataset in dataset_list:
         output += f"- [{dataset.id}](https://huggingface.co/datasets/{dataset.id})\n"
     return output
@@ -119,20 +113,83 @@ collections = create_update_collections(datasets)
 # Get the list of available languages
 languages = list(datasets.keys())
 
-with gr.Blocks() as iface:
-    gr.Markdown("# DPO Datasets by Language")
-    gr.Markdown("Explore DPO datasets grouped by language.")
 
     with gr.Row():
         with gr.Column():
             language_dropdown = gr.Dropdown(languages, label="Select Language")
             dataset_info = gr.Markdown()
 
-    with gr.Column():
-        overview = gr.Markdown(display_overview())
-
-    language_dropdown.change(
-        display_datasets, inputs=language_dropdown, outputs=dataset_info
-    )
 
-iface.launch()
 
+import os
+from collections import defaultdict
+
 import gradio as gr
+from dotenv import load_dotenv
 from huggingface_hub import (
+    add_collection_item,
     create_collection,
     get_collection,
+    list_datasets,
+    login,
 )
 from toolz import unique
+from tqdm.auto import tqdm
 
 load_dotenv()
 login(token=os.getenv("HF_TOKEN"))
+LIMIT = None
 
 
 def extract_languages(dataset_info):
+    return [tag.split(":")[1] for tag in dataset_info.tags if tag.startswith("language:")]
 
 
 def create_dataset_info():
+    all_datasets = list(tqdm(list_datasets(full=True, limit=LIMIT)))
     all_datasets = [d for d in all_datasets if "open-llm-leaderboard" not in d.id]
 
     dpo_in_name = [
+        dataset for dataset in all_datasets if "_dpo" in dataset.id or "dpo_" in dataset.id
     ]
     dpo_in_tags = [
         dataset for dataset in all_datasets if any(tag == "dpo" for tag in dataset.tags)
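For orientation, the filtering and grouping that this hunk refactors can be sketched end to end as follows. The function name, the combined name-or-tag filter, and the grouping step are illustrative assumptions rather than code from this commit; the `huggingface_hub` and `toolz` calls are the same ones imported above.

```python
# Illustrative sketch: list datasets, keep likely DPO datasets, and group them
# by the "language:xx" tags that extract_languages() parses above.
from collections import defaultdict

from huggingface_hub import list_datasets
from toolz import unique


def group_dpo_datasets_by_language(limit=None):
    all_datasets = list(list_datasets(full=True, limit=limit))
    dpo_datasets = [
        d
        for d in all_datasets
        if "_dpo" in d.id or "dpo_" in d.id or any(tag == "dpo" for tag in (d.tags or []))
    ]
    dpo_datasets = list(unique(dpo_datasets, key=lambda d: d.id))  # drop duplicates

    by_language = defaultdict(list)
    for dataset in dpo_datasets:
        for tag in dataset.tags or []:
            if tag.startswith("language:"):
                by_language[tag.split(":")[1]].append(dataset)
    return by_language
```

Passing `limit=None`, as the new module-level `LIMIT` does, lists everything; a small limit is handy for local testing.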
 
             collection = create_collection(
                 title=collection_title,
                 description=f"A collection of DPO datasets for the {language.upper()} language.",
+                exists_ok=True,
             )
         except Exception:
             collection = get_collection(f"DPO-datasets-for-{language.upper()}")
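The `exists_ok=True` argument added in this hunk lets `create_collection` tolerate a collection that already exists instead of raising, so the `except`/`get_collection` fallback becomes more of a safety net. Below is a minimal usage sketch of these `huggingface_hub` helpers, assuming a logged-in session with write access; the collection title and dataset ID are placeholders.

```python
# Minimal sketch of the collection helpers used above (placeholder names).
from huggingface_hub import add_collection_item, create_collection

collection = create_collection(
    title="DPO datasets for EN",  # placeholder, mirrors the title pattern above
    description="A collection of DPO datasets for the EN language.",
    exists_ok=True,  # do not raise if the collection already exists
)

existing_items = {item.item_id for item in collection.items}
if "argilla/distilabel-intel-orca-dpo-pairs" not in existing_items:  # placeholder dataset ID
    add_collection_item(
        collection.slug, item_id="argilla/distilabel-intel-orca-dpo-pairs", item_type="dataset"
    )
```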
 
 
         for dataset in dataset_list:
             if dataset.id not in existing_items:
+                add_collection_item(collection.slug, item_id=dataset.id, item_type="dataset")
 
         collections[language] = collection
 
     collection = collections[language]
     output = f"## Datasets for {language.upper()}\n\n"
     output += f"Total datasets: {len(dataset_list)}\n\n"
+    output += f"View the Hugging Face [Collection](https://huggingface.co/collections/{collection.slug}) for this language.\n\n"
     for dataset in dataset_list:
         output += f"- [{dataset.id}](https://huggingface.co/datasets/{dataset.id})\n"
     return output
 # Get the list of available languages
 languages = list(datasets.keys())
 
+overview = """
+This Space shows an overview of Direct Preference Optimization (DPO) datasets available on the Hugging Face Hub across different languages.
+
+Recently, ORPO has been demonstrated to be a powerful tool for training better-performing language models.
+
+- ORPO training can be done using DPO-style datasets
+- Is having enough DPO datasets for different languages a key ingredient for training better models for every language?
+- This Space aims to track which DPO datasets are available for different languages and how many datasets exist for each language!"""
+
+dpo = """
+#### What is Direct Preference Optimization (DPO)?
+
+DPO is a machine learning approach designed to optimize language models based on direct user preferences, bypassing the traditional reward modeling phase. It works by:
+
+1. Calculating log probabilities of preferred and less preferred outputs from a language model.
+2. Adjusting the model to maximize the likelihood of preferred outputs.
+
+This makes the optimization process simpler and potentially more effective by directly targeting what users deem desirable or preferable in language model responses.
+
+A DPO dataset includes three components:
+
+- **Input**: The input text or prompt that the language model receives.
+- **Chosen Output**: The output text that the user prefers.
+- **Rejected Output**: The output text that is less preferred by the user.
+
+#### What is ORPO?
+
+Odds Ratio Preference Optimization (ORPO) is a refinement that does not require a reference model for preference alignment. ORPO directly trains a language model without an SFT step, meaning you can do SFT and preference training in one stage. ORPO uses the same dataset format as DPO, but the training process is different. This means any DPO dataset can be used for ORPO training!
+
+Recently, Argilla, KAIST, and Hugging Face created [zephyr-orpo-141b-A35b-v0.1](https://huggingface.co/HuggingFaceH4/zephyr-orpo-141b-A35b-v0.1) using ORPO. This model shows very strong performance whilst using only 70k examples for training. This suggests that ORPO could be a very powerful tool for training better-performing open language models for many languages.
+
+The only thing that might be missing is enough DPO datasets for training these models. This Space aims to track what DPO datasets are available for different languages and how many datasets are available for each language.
+"""
+
+adding_datasets = """
+## Adding a dataset
+
+To include a dataset in this Space, it should have either a `dpo` tag or a dataset ID containing `_dpo` or `dpo_`.
+Additionally, the dataset card should include language metadata. If you know of another dataset that should be included in this Space, please:
+
+1. Add the `dpo` tag to the dataset
+2. Include the language metadata in the dataset card
+3. Open a discussion in this Space.
+
+I'll refresh the list of datasets to include it 🤗
+"""
+faq = """
+
+## Frequently Asked Questions
+
+**Q: What is the difference between DPO and ORPO?**
+A: DPO and ORPO both use direct user preferences to optimize language models, but ORPO does not require a separate reference model for preference alignment. ORPO can perform supervised fine-tuning (SFT) and preference training in a single stage.
+
+**Q: Can I use DPO datasets for ORPO training?**
+A: Yes! Since ORPO uses the same dataset format as DPO, any DPO dataset can be used for ORPO training.
+
+**Q: How can I contribute to this Space?**
+A: If you know of a dataset that should be included, make sure it has the `dpo` tag or the appropriate dataset ID format, and include language metadata in the dataset card. Then, open a discussion in this Space to let me know about the dataset.
+"""
+
+with gr.Blocks() as demo:
+    gr.HTML(
+        "<h1 style='text-align: center;'>🌍 DPO Datasets by Language 🗣️</h1>",
+    )
+    gr.Markdown(overview)
+
+    overview = gr.Markdown(display_overview())
 
     with gr.Row():
         with gr.Column():
             language_dropdown = gr.Dropdown(languages, label="Select Language")
             dataset_info = gr.Markdown()
 
+    language_dropdown.change(display_datasets, inputs=language_dropdown, outputs=dataset_info)
+    with gr.Accordion("More Information", open=False):
+        gr.Markdown(dpo)
+        gr.Markdown(adding_datasets)
+        gr.Markdown(faq)
 
+demo.launch()