simplify_ux

#944 opened by clefourrier (HF staff)
app.py CHANGED
@@ -1,3 +1,4 @@
+import os
 import logging
 import time
 import schedule
@@ -60,18 +61,8 @@ NEW_DATA_ON_LEADERBOARD = True
 LEADERBOARD_DF = None
 
 def restart_space():
-    logging.info(f"Restarting space with repo ID: {REPO_ID}")
-    try:
-        # Check if new data is pending and download if necessary
-        if NEW_DATA_ON_LEADERBOARD:
-            logging.info("Fetching latest leaderboard data before restart.")
-            get_latest_data_leaderboard()
-
-        # Now restart the space
-        API.restart_space(repo_id=REPO_ID, token=HF_TOKEN)
-        logging.info("Space restarted successfully.")
-    except Exception as e:
-        logging.error(f"Failed to restart space: {e}")
+    API.restart_space(repo_id=REPO_ID, token=HF_TOKEN)
 
 
 def time_diff_wrapper(func):
     def wrapper(*args, **kwargs):
@@ -109,35 +100,29 @@ def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3, ba
         attempt += 1
     raise Exception(f"Failed to download {repo_id} after {max_attempts} attempts")
 
-def get_latest_data_leaderboard(leaderboard_initial_df=None):
+def get_latest_data_leaderboard(leaderboard_initial_df = None):
     global NEW_DATA_ON_LEADERBOARD
     global LEADERBOARD_DF
     if NEW_DATA_ON_LEADERBOARD:
-        logging.info("Leaderboard updated at reload!")
-        try:
-            leaderboard_dataset = datasets.load_dataset(
-                AGGREGATED_REPO,
-                "default",
-                split="train",
-                cache_dir=HF_HOME,
-                download_mode=datasets.DownloadMode.FORCE_REDOWNLOAD,  # Always download fresh data
-                verification_mode="no_checks"
-            )
-            LEADERBOARD_DF = get_leaderboard_df(
-                leaderboard_dataset=leaderboard_dataset,
-                cols=COLS,
-                benchmark_cols=BENCHMARK_COLS,
-            )
-            logging.info("Leaderboard dataset successfully downloaded.")
-        except Exception as e:
-            logging.error(f"Failed to download leaderboard dataset: {e}")
-            return
-
-        # Reset the flag after successful download
+        print("Leaderboard updated at reload!")
+        leaderboard_dataset = datasets.load_dataset(
+            AGGREGATED_REPO,
+            "default",
+            split="train",
+            cache_dir=HF_HOME,
+            download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS,  # Uses the cached dataset
+            verification_mode="no_checks"
+        )
+        LEADERBOARD_DF = get_leaderboard_df(
+            leaderboard_dataset=leaderboard_dataset,
+            cols=COLS,
+            benchmark_cols=BENCHMARK_COLS,
+        )
         NEW_DATA_ON_LEADERBOARD = False
+
     else:
         LEADERBOARD_DF = leaderboard_initial_df
-        logging.info("Using cached leaderboard dataset.")
+
     return LEADERBOARD_DF
 
@@ -147,9 +132,6 @@ def get_latest_data_queue():
 
 def init_space():
     """Initializes the application space, loading only necessary data."""
-    global NEW_DATA_ON_LEADERBOARD
-    NEW_DATA_ON_LEADERBOARD = True  # Ensure new data is always pulled on restart
-
     if DO_FULL_INIT:
         # These downloads only occur on full initialization
         try:
@@ -184,6 +166,12 @@ LEADERBOARD_DF, eval_queue_dfs = init_space()
 finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = eval_queue_dfs
 
 
+# Function to check if a user is logged in
+def check_login(profile: gr.OAuthProfile | None) -> bool:
+    if profile is None:
+        return False
+    return True
+
 def init_leaderboard(dataframe):
     if dataframe is None or dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
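Note on the check_login helper added in the hunk above: in a Space running gradio[oauth], Gradio injects a gr.OAuthProfile (or None when logged out) into any event handler that annotates a parameter with that type. A hypothetical wiring sketch of how it might gate an action, not code from this PR:

import gradio as gr

def guarded_submit(profile: gr.OAuthProfile | None) -> str:
    # check_login(profile) is the helper added in the hunk above
    if not check_login(profile):
        return "Please log in with the Hugging Face button first."
    return f"Submitting as {profile.username}..."

with gr.Blocks() as demo:
    gr.LoginButton()
    status = gr.Markdown()
    gr.Button("Submit").click(guarded_submit, inputs=None, outputs=status)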
@@ -461,13 +449,18 @@ def update_leaderboard(payload: WebhookPayload) -> None:
     """Redownloads the leaderboard dataset each time it updates"""
     if payload.repo.type == "dataset" and payload.event.action == "update":
         global NEW_DATA_ON_LEADERBOARD
-        logging.info("New data detected, downloading updated leaderboard dataset.")
-
-        # Mark the flag for new data
+        if NEW_DATA_ON_LEADERBOARD:
+            return
         NEW_DATA_ON_LEADERBOARD = True
 
-        # Now actually download the latest data immediately
-        get_latest_data_leaderboard()
+        datasets.load_dataset(
+            AGGREGATED_REPO,
+            "default",
+            split="train",
+            cache_dir=HF_HOME,
+            download_mode=datasets.DownloadMode.FORCE_REDOWNLOAD,
+            verification_mode="no_checks"
+        )
 
 # The below code is not used at the moment, as we can manage the queue file locally
 LAST_UPDATE_QUEUE = datetime.datetime.now()
@@ -487,6 +480,5 @@ def update_queue(payload: WebhookPayload) -> None:
     webhooks_server.launch()
 
     scheduler = BackgroundScheduler()
-    scheduler.add_job(restart_space, "interval", hours=1)  # Restart every 1h
-    logging.info("Scheduler initialized to restart space every 1 hour.")
+    scheduler.add_job(restart_space, "interval", hours=3)  # restarted every 3h as backup in case automatic updates are not working
     scheduler.start()
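Taken together, the app.py hunks invert the old refresh flow: the webhook now performs the single FORCE_REDOWNLOAD, page reloads rebuild the DataFrame from the local cache via REUSE_DATASET_IF_EXISTS, and the 3-hour restart is only a fallback. A minimal sketch of the resulting pattern; download_fresh_copy and rebuild_df_from_cache are hypothetical stand-ins for the datasets.load_dataset / get_leaderboard_df calls shown above:

NEW_DATA_ON_LEADERBOARD = True  # start "dirty" so the first reload builds the df
LEADERBOARD_DF = None

def download_fresh_copy() -> None:
    ...  # stand-in for load_dataset(..., download_mode=FORCE_REDOWNLOAD)

def rebuild_df_from_cache():
    ...  # stand-in for load_dataset(..., REUSE_DATASET_IF_EXISTS) + get_leaderboard_df

def on_dataset_update() -> None:
    """Webhook side: deduplicate bursts of events, fetch fresh data once."""
    global NEW_DATA_ON_LEADERBOARD
    if NEW_DATA_ON_LEADERBOARD:  # a refresh is already pending
        return
    NEW_DATA_ON_LEADERBOARD = True
    download_fresh_copy()

def get_latest_data_leaderboard(leaderboard_initial_df=None):
    """Reload side: rebuild from the cache only while the flag is set."""
    global NEW_DATA_ON_LEADERBOARD, LEADERBOARD_DF
    if NEW_DATA_ON_LEADERBOARD:
        LEADERBOARD_DF = rebuild_df_from_cache()
        NEW_DATA_ON_LEADERBOARD = False
    else:
        LEADERBOARD_DF = leaderboard_initial_df
    return LEADERBOARD_DF

The early return in the webhook handler means repeated update events collapse into one download until a reload consumes the flag.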
pyproject.toml CHANGED
@@ -11,18 +11,19 @@ dependencies = [
     "black>=24.8.0",
     "click>=8.1.7",
     "datasets>=3.0.0",
-    "huggingface-hub>=0.26.2",
+    "huggingface-hub>=0.24.7",
     "pandas>=2.2.2",
     "python-dateutil>=2.9.0",
     "sentencepiece>=0.2.0",
-    "transformers==4.46.1",
+    "transformers==4.44.2",
     "tokenizers>=0.19.0",
-    "gradio-space-ci",
+    "gradio-space-ci @ git+https://huggingface.co/spaces/Wauplin/[email protected]",
     "isort>=5.13.2",
     "ruff>=0.6.4",
-    "gradio-leaderboard==0.0.12",
-    "gradio[oauth]==4.44.1",
+    "gradio-leaderboard==0.0.11",
+    "gradio[oauth]==4.44.0",
     "schedule>=1.2.2",
+    "pigar>=2.1.6",
 ]
 
 [tool.ruff]
@@ -33,16 +34,16 @@ ignore=["I","EM","FBT","TRY003","S101","D101","D102","D103","D104","D105","G004"
 fixable=["ALL"]
 select=["ALL"]
 
-[tool.ruff.lint]
+[tool.ruff.lint]
 select = ["E", "F"]
 fixable = ["ALL"]
 ignore = ["E501"] # line too long (black is taking care of this)
 
-[tool.isort]
+[tool.isort]
 profile = "black"
 
 [tool.black]
 line-length = 119
 
-[tool.uv.sources]
-gradio-space-ci = { git = "https://huggingface.co/spaces/Wauplin/gradio-space-ci" }
+[tool.hatch.metadata]
+allow-direct-references = true
requirements.txt CHANGED
@@ -2,16 +2,16 @@ APScheduler==3.10.4
 black==24.8.0
 click==8.1.7
 datasets==3.0.0
-huggingface-hub>=0.26.2
+huggingface-hub>=0.24.7
 pandas==2.2.2
 python-dateutil==2.9.0
 sentencepiece==0.2.0
-transformers==4.46.1
+transformers==4.44.2
 tokenizers>=0.19.0
 gradio-space-ci @ git+https://huggingface.co/spaces/Wauplin/[email protected] # CI !!!
 isort==5.13.2
 ruff===0.6.4
-gradio==4.44.1
+gradio==4.44.0
 gradio[oauth]
-gradio_leaderboard==0.0.12
+gradio_leaderboard==0.0.11
 schedule == 1.2.2
src/display/about.py CHANGED
@@ -13,7 +13,6 @@ icons = f"""
 - {ModelType.FT.to_str(" : ")} model: pretrained models finetuned on more data
 - {ModelType.chat.to_str(" : ")} model: chat like fine-tunes, either using IFT (datasets of task instruction), RLHF or DPO (changing the model loss a bit with an added policy), etc
 - {ModelType.merges.to_str(" : ")} model: merges or MoErges, models which have been merged or fused without additional fine-tuning.
-- {ModelType.MM.to_str(" : ")} model: models integrating multiple data types (e.g., text, image, audio) for tasks like image captioning and visual question answering.
 """
 LLM_BENCHMARKS_TEXT = """
 ## ABOUT
src/display/utils.py CHANGED
@@ -128,8 +128,6 @@ auto_eval_column_dict.append(["fullname", ColumnContent, ColumnContent("fullname
 auto_eval_column_dict.append(["generation", ColumnContent, ColumnContent("Generation", "number", False)])
 auto_eval_column_dict.append(["base_model", ColumnContent, ColumnContent("Base Model", "str", False)])
 
-auto_eval_column_dict.append(["co2_emissions_kg", ColumnContent, ColumnContent("CO₂ cost (kg)", "number", True)])
-
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
 
@@ -198,7 +196,6 @@ class ModelType(Enum):
     PT = ModelDetails(name="🟢 pretrained", symbol="🟢")
     CPT = ModelDetails(name="🟩 continuously pretrained", symbol="🟩")
     FT = ModelDetails(name="🔶 fine-tuned on domain-specific datasets", symbol="🔶")
-    MM = ModelDetails(name="🌸 multimodal", symbol="🌸")
     chat = ModelDetails(name="💬 chat models (RLHF, DPO, IFT, ...)", symbol="💬")
     merges = ModelDetails(name="🤝 base merges and moerges", symbol="🤝")
     Unknown = ModelDetails(name="❓ other", symbol="❓")
@@ -218,10 +215,9 @@ class ModelType(Enum):
             return ModelType.chat
         if "merge" in m_type or "🤝" in m_type:
             return ModelType.merges
-        if "multimodal" in m_type or "🌸" in m_type:
-            return ModelType.MM
         return ModelType.Unknown
 
+
 class WeightType(Enum):
     Adapter = ModelDetails("Adapter")
     Original = ModelDetails("Original")
src/submission/check_validity.py CHANGED
@@ -6,7 +6,7 @@ from collections import defaultdict
 from datetime import datetime, timedelta, timezone
 
 import huggingface_hub
-from huggingface_hub import ModelCard, hf_hub_download
+from huggingface_hub import ModelCard
 from huggingface_hub.hf_api import ModelInfo, get_safetensors_metadata, parse_safetensors_file_metadata
 from transformers import AutoConfig, AutoTokenizer
 
@@ -179,28 +179,7 @@ def already_submitted_models(requested_models_dir: str) -> set[str]:
 
     return set(file_names), users_to_submission_dates
 
-def check_chat_template(model: str, revision: str) -> tuple[bool, str]:
-    try:
-        # Attempt to download only the tokenizer_config.json file
-        config_file = hf_hub_download(
-            repo_id=model,
-            filename="tokenizer_config.json",
-            revision=revision,
-            repo_type="model"
-        )
-
-        # Read and parse the tokenizer_config.json file
-        with open(config_file, 'r') as f:
-            tokenizer_config = json.load(f)
-
-        # Check if chat_template exists in the tokenizer configuration
-        if 'chat_template' not in tokenizer_config:
-            return False, f"The model {model} doesn't have a chat_template in its tokenizer_config.json. Please add a chat_template before submitting or submit without it."
-
-        return True, ""
-    except Exception as e:
-        return False, f"Error checking chat_template for model {model}: {str(e)}"
-
 def get_model_tags(model_card, model: str):
     is_merge_from_metadata = False
     is_moe_from_metadata = False
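With check_chat_template gone, the hf_hub_download import goes with it. If an equivalent presence check were ever wanted again, a lighter sketch using the AutoTokenizer import this module keeps could look like this (hypothetical, not part of this PR):

from transformers import AutoTokenizer

def has_chat_template(model: str, revision: str) -> bool:
    # transformers populates .chat_template from tokenizer_config.json when present
    tokenizer = AutoTokenizer.from_pretrained(model, revision=revision)
    return tokenizer.chat_template is not None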
src/submission/submit.py CHANGED
@@ -24,7 +24,6 @@ from src.submission.check_validity import (
     get_model_size,
     is_model_on_hub,
     user_submission_permission,
-    check_chat_template,
 )
 
 from src.voting.vote_system import VoteManager
@@ -115,7 +114,6 @@ def add_new_eval(
     except Exception as e:
         return styled_error("Could not get your model information. Please fill it up properly.")
 
-    # Has it been submitted already?
     model_key = f"{model}_{model_info.sha}_{precision}"
     if model_key in requested_models:
         return styled_error(f"The model '{model}' with revision '{model_info.sha}' and precision '{precision}' has already been submitted.")
@@ -125,12 +123,12 @@ def add_new_eval(
     if model_size is None:
         return styled_error(error_text)
 
-    # Absolute size limit for float16 and bfloat16
+    # First check: Absolute size limit for float16 and bfloat16
     if precision in ["float16", "bfloat16"] and model_size > 100:
         return styled_error(f"Sadly, models larger than 100B parameters cannot be submitted in {precision} precision at this time. "
                             f"Your model size: {model_size:.2f}B parameters.")
 
-    # Precision-adjusted size limit for 8bit, 4bit, and GPTQ
+    # Second check: Precision-adjusted size limit for 8bit, 4bit, and GPTQ
     if precision in ["8bit", "4bit", "GPTQ"]:
         size_checker = ModelSizeChecker(model=model, precision=precision, model_size_in_b=model_size)
 
@@ -165,12 +163,6 @@ def add_new_eval(
     modelcard_OK, error_msg, model_card = check_model_card(model)
     if not modelcard_OK:
        return styled_error(error_msg)
-
-    # Check the chat template submission
-    if use_chat_template:
-        chat_template_valid, chat_template_error = check_chat_template(model, revision)
-        if not chat_template_valid:
-            return styled_error(chat_template_error)
 
     # Seems good, creating the eval
     print("Adding new eval")