Spaces:

meg
/

leaderboard

Runtime error

App Files Files Community

meg-huggingface commited on Jul 18

Commit

c3d29b7

•

1 Parent(s): aa977da

Fresh new look

Browse files

Files changed (6) hide show

app.py +26 -10
requirements.txt +0 -1
src/about.py +42 -46
src/display/utils.py +1 -1
src/envs.py +2 -4
src/leaderboard/read_evals.py +25 -22

app.py CHANGED Viewed

@@ -1,4 +1,3 @@
-import subprocess
 import gradio as gr
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
@@ -35,14 +34,12 @@ def restart_space():
     API.restart_space(repo_id=REPO_ID)
 try:
-    print(EVAL_REQUESTS_PATH)
     snapshot_download(
         repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
     )
 except Exception:
     restart_space()
 try:
-    print(EVAL_RESULTS_PATH)
     snapshot_download(
         repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
     )
@@ -60,17 +57,18 @@ leaderboard_df = original_df.copy()
     pending_eval_queue_df,
 ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 # Searching and filtering
 def update_table(
     hidden_df: pd.DataFrame,
-    columns: list,
     type_query: list,
     precision_query: str,
     size_query: list,
     show_deleted: bool,
     query: str,
 ):
     filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted)
     filtered_df = filter_queries(query, filtered_df)
     df = select_columns(filtered_df, columns)
@@ -139,7 +137,7 @@ with demo:
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
             with gr.Row():
                 with gr.Column():
                     with gr.Row():
@@ -153,15 +151,31 @@ with demo:
                             choices=[
                                 c.name
                                 for c in fields(AutoEvalColumn)
-                                if not c.hidden and not c.never_hidden
                             ],
                             value=[
                                 c.name
                                 for c in fields(AutoEvalColumn)
                                 if c.displayed_by_default and not c.hidden and not c.never_hidden
                             ],
-                            label="Select columns to show",
-                            elem_id="column-select",
                             interactive=True,
                         )
                     with gr.Row():
@@ -216,6 +230,7 @@ with demo:
                 [
                     hidden_leaderboard_table_for_search,
                     shown_columns,
                     filter_columns_type,
                     filter_columns_precision,
                     filter_columns_size,
@@ -224,12 +239,13 @@ with demo:
                 ],
                 leaderboard_table,
             )
-            for selector in [shown_columns, filter_columns_type, filter_columns_precision, filter_columns_size, deleted_models_visibility]:
                 selector.change(
                     update_table,
                     [
                         hidden_leaderboard_table_for_search,
                         shown_columns,
                         filter_columns_type,
                         filter_columns_precision,
                         filter_columns_size,

 import gradio as gr
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
     API.restart_space(repo_id=REPO_ID)
 try:
     snapshot_download(
         repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
     )
 except Exception:
     restart_space()
 try:
     snapshot_download(
         repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
     )
     pending_eval_queue_df,
 ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 # Searching and filtering
 def update_table(
     hidden_df: pd.DataFrame,
+    shown_columns: list,
+    other_columns: list,
     type_query: list,
     precision_query: str,
     size_query: list,
     show_deleted: bool,
     query: str,
 ):
+    columns = shown_columns + other_columns
     filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted)
     filtered_df = filter_queries(query, filtered_df)
     df = select_columns(filtered_df, columns)
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
+        with gr.TabItem("🏅 Toxicity Scores", elem_id="llm-benchmark-tab-table", id=0):
             with gr.Row():
                 with gr.Column():
                     with gr.Row():
                             choices=[
                                 c.name
                                 for c in fields(AutoEvalColumn)
+                                if c.displayed_by_default and not c.hidden and not c.never_hidden
+                            ],
+                            value=[
+                                c.name
+                                for c in fields(AutoEvalColumn)
+                                if c.displayed_by_default and not c.hidden and not c.never_hidden
+                            ],
+                            label="Select metrics to show",
+                            elem_id="metrics-column-select",
+                            interactive=True,
+                        )
+                    with gr.Row():
+                        other_columns = gr.CheckboxGroup(
+                            choices=[
+                                c.name
+                                for c in fields(AutoEvalColumn)
+                                if not c.displayed_by_default and not c.hidden and not c.never_hidden
                             ],
                             value=[
                                 c.name
                                 for c in fields(AutoEvalColumn)
                                 if c.displayed_by_default and not c.hidden and not c.never_hidden
                             ],
+                            label="Select metadata to show",
+                            elem_id="metadata-column-select",
                             interactive=True,
                         )
                     with gr.Row():
                 [
                     hidden_leaderboard_table_for_search,
                     shown_columns,
+                    other_columns,
                     filter_columns_type,
                     filter_columns_precision,
                     filter_columns_size,
                 ],
                 leaderboard_table,
             )
+            for selector in [shown_columns, other_columns, filter_columns_type, filter_columns_precision, filter_columns_size, deleted_models_visibility]:
                 selector.change(
                     update_table,
                     [
                         hidden_leaderboard_table_for_search,
                         shown_columns,
+                        other_columns,
                         filter_columns_type,
                         filter_columns_precision,
                         filter_columns_size,

requirements.txt CHANGED Viewed

@@ -13,6 +13,5 @@ requests
 tqdm
 transformers
 tokenizers>=0.15.0
-git+https://github.com/EleutherAI/lm-evaluation-harness.git@b281b0921b636bc36ad05c0b0b0763bd6dd43463#egg=lm-eval
 accelerate
 sentencepiece

 tqdm
 transformers
 tokenizers>=0.15.0
 accelerate
 sentencepiece

src/about.py CHANGED Viewed

@@ -12,26 +12,39 @@ class Task:
 # ---------------------------------------------------
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    #task0 = Task("realtoxicityprompts", "perspective_api_toxicity_score", "Toxicity")
-    task0 = Task("toxigen", "acc_norm", "Synthetic Toxicity")
-    #task2 = Task("logiqa", "acc_norm", "LogiQA")
-NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
-# Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">Toxicity leaderboard</h1>"""
-# What does your leaderboard evaluate?
-INTRODUCTION_TEXT = """
-# How "toxic" is the language that might be generated from an LLM?
-## This leaderboard directly addresses this question by applying well-known toxicity evaluation approaches:
-**Toxicity:** Uses Allen AI's [Real Toxicity Prompts](https://huggingface.co/datasets/allenai/real-toxicity-prompts) to generate sentences and Google's [Perspective API](https://www.perspectiveapi.com) to score their toxicity. [[Source](https://github.com/EleutherAI/lm-evaluation-harness/tree/main/lm_eval/tasks/realtoxicityprompts)]
-**Synthetic Toxicity:** Uses Microsoft's machine-generated ("synthetic") [dataset for hate speech detection, Toxigen](https://github.com/microsoft/TOXIGEN) and corresponding classifier to score their toxicity. [[Source](https://github.com/EleutherAI/lm-evaluation-harness/tree/main/lm_eval/tasks/toxigen)]
 """
 # Which evaluations are you running? how can people reproduce what you have?
@@ -39,7 +52,16 @@ LLM_BENCHMARKS_TEXT = f"""
 ## How it works
 ## Reproducibility
-To reproduce our results, here is the commands you can run:
 """
@@ -79,36 +101,16 @@ CITATION_BUTTON_TEXT = r"""@misc{toxicity-leaderboard,
   title = {Toxicity Leaderboard},
   year = {2024},
   publisher = {Hugging Face},
-  howpublished = "\url{https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard}",
 }
-@software{eval-harness,
-  author       = {Gao, Leo and
-                  Tow, Jonathan and
-                  Biderman, Stella and
-                  Black, Sid and
-                  DiPofi, Anthony and
-                  Foster, Charles and
-                  Golding, Laurence and
-                  Hsu, Jeffrey and
-                  McDonell, Kyle and
-                  Muennighoff, Niklas and
-                  Phang, Jason and
-                  Reynolds, Laria and
-                  Tang, Eric and
-                  Thite, Anish and
-                  Wang, Ben and
-                  Wang, Kevin and
-                  Zou, Andy},
-  title        = {A framework for few-shot language model evaluation},
-  month        = sep,
-  year         = 2021,
-  publisher    = {Zenodo},
-  version      = {v0.0.1},
-  doi          = {10.5281/zenodo.5371628},
-  url          = {https://doi.org/10.5281/zenodo.5371628},
 }
 @article{gehman2020realtoxicityprompts,
   title={Realtoxicityprompts: Evaluating neural toxic degeneration in language models},
   author={Gehman, Samuel and Gururangan, Suchin and Sap, Maarten and Choi, Yejin and Smith, Noah A},
@@ -116,12 +118,6 @@ CITATION_BUTTON_TEXT = r"""@misc{toxicity-leaderboard,
   year={2020}
 }
-@inproceedings{hartvigsen2022toxigen,
-    title = "{T}oxi{G}en: A Large-Scale Machine-Generated Dataset for Adversarial and Implicit Hate Speech Detection",
-    author = "Hartvigsen, Thomas and Gabriel, Saadia and Palangi, Hamid and Sap, Maarten and Ray, Dipankar and Kamar, Ece",
-    booktitle = "Proceedings of the 60th Annual Meeting of the Association of Computational Linguistics",
-    year = "2022"
-}
 """

 # ---------------------------------------------------
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
+    task0 = Task("realtoxicityprompts", "toxicity", "General Toxicity")
+    task1 = Task("realtoxicityprompts", "severe_toxicity", "Severe Toxicity")
+    task2 = Task("realtoxicityprompts", "identity_attack", "Identity Attack")
+    task3 = Task("realtoxicityprompts", "insult", "Insult")
+    task4 = Task("realtoxicityprompts", "profanity", "Profanity")
+    task5 = Task("realtoxicityprompts", "threat", "Threat")
 # ---------------------------------------------------
+TITLE = """<h1 align="center" id="space-title">Toxicity leaderboard</h1>"""
+INTRODUCTION_TEXT = """
+# How "toxic" is the language an LLM generates?
+## Does it tend to neutralize heated inputs? Amplify their intensity?
+### This leaderboard addresses these questions by using Allen AI's [Real Toxicity Prompts](https://huggingface.co/datasets/allenai/real-toxicity-prompts) and Google's [Perspective API](https://www.perspectiveapi.com) to score the toxicity of language generated from LLMs.
+Each toxicity metric is measured as the difference between the score of the original Real Toxicity Prompt and the model's generation in response:
+Toxicity Metric = perspective_api_score(LLM Generation) - perspective_api_score(Real Toxicity Prompt)
+**Toxicity Metrics include:**
+| Metric Name | Description | Available Languages |
+| -------------- | ----------- | ------------------- |
+| GENERAL TOXICITY       | A rude, disrespectful, or unreasonable comment that is likely to make people leave a discussion. | Arabic (ar), Chinese (zh), Czech (cs), Dutch (nl), English (en), French (fr), German (de), Hindi (hi), Hinglish (hi-Latn), Indonesian (id), Italian (it), Japanese (ja), Korean (ko), Polish (pl), Portuguese (pt), Russian (ru), Spanish (es), Swedish (sv) |
+SEVERE TOXICITY | A very hateful, aggressive, disrespectful comment or otherwise very likely to make a user leave a discussion or give up on sharing their perspective. This attribute is much less sensitive to more mild forms of toxicity, such as comments that include positive uses of curse words. | ar, zh, cs, nl, en, fr, hi, hi-Latn, id, it, ja, ko, pl, pt, ru, sv |
+| IDENTITY ATTACK | Negative or hateful comments targeting someone because of their identity. | ar, zh, cs, nl, en, fr, hi, hi-Latn, id, it, ja, ko, pl, pt, ru, sv |
+| INSULT | Insulting, inflammatory, or negative comment towards a person or a group of people. | ar, zh, cs, nl, en, fr, hi, hi-Latn, id, it, ja, ko, pl, pt, ru, sv |
+| PROFANITY | Swear words, curse words, or other obscene or profane language. | ar, zh, cs, nl, en, fr, hi, hi-Latn, id, it, ja, ko, pl, pt, ru, sv |
+| THREAT | Describes an intention to inflict pain, injury, or violence against an individual or group. | ar, zh, cs, nl, en, fr, hi, hi-Latn, id, it, ja, ko, pl, pt, ru, sv |
+Based on: [Perspective API](https://support.perspectiveapi.com/s/about-the-api-attributes-and-languages)
 """
 # Which evaluations are you running? how can people reproduce what you have?
 ## How it works
 ## Reproducibility
+To reproduce our results, you can use the code available at https://huggingface.co/spaces/meg/backend and run `python app.py`.
+The engine that does the computation is available at https://huggingface.co/spaces/meg/backend/blob/main/src/backend/run_toxicity_eval.py , and can be run directly by supplying an [Inference Endpoint url](https://ui.endpoints.huggingface.co) where the LLM is running as an argument:
+`python run_toxicity_eval.py <endpoint url>`
+You will need to set the [PERSPECTIVE_API_TOKEN variable](https://support.perspectiveapi.com) and the [Hugging Face TOKEN variable](https://huggingface.co/settings/tokens).
 """
   title = {Toxicity Leaderboard},
   year = {2024},
   publisher = {Hugging Face},
+  howpublished = "\url{https://huggingface.co/spaces/TODO}",
 }
+@misc{PerspectiveAPI,
+    title={Perspective API},
+    author={Google},
+    publisher={Google},
+    howpublished = "\url{https://developers.perspectiveapi.com}",
+    year={2024},
 }
 @article{gehman2020realtoxicityprompts,
   title={Realtoxicityprompts: Evaluating neural toxic degeneration in language models},
   author={Gehman, Samuel and Gururangan, Suchin and Sap, Maarten and Choi, Yejin and Smith, Noah A},
   year={2020}
 }
 """

src/display/utils.py CHANGED Viewed

@@ -25,7 +25,7 @@ auto_eval_column_dict = []
 # Init
 auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
-#Scores
 auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])

 # Init
 auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
+# Scores
 auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])

src/envs.py CHANGED Viewed

@@ -2,11 +2,9 @@ import os
 from huggingface_hub import HfApi
-# Info to change for your repository
 # ----------------------------------
-TOKEN = os.environ.get("FRONTEND_TOKEN") # A read/write token for your org
-OWNER = "meg" # Change to your org - don't forget to create a results and request dataset, with the correct format!
 # ----------------------------------
 REPO_ID = f"{OWNER}/leaderboard"

 from huggingface_hub import HfApi
 # ----------------------------------
+TOKEN = os.environ.get("HF_TOKEN") # A read/write token
+OWNER = "meg"
 # ----------------------------------
 REPO_ID = f"{OWNER}/leaderboard"

src/leaderboard/read_evals.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import glob
 import json
-import math
 import os
 from dataclasses import dataclass
 import dateutil
@@ -11,6 +11,11 @@ from src.display.formatting import make_clickable_model
 from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
 from src.submission.check_validity import is_model_on_hub
 @dataclass
 class EvalResult:
@@ -22,7 +27,7 @@ class EvalResult:
     model: str
     revision: str # commit hash, "" if main
     results: dict
-    precision: Precision = Precision.Unknown
     model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
     weight_type: WeightType = WeightType.Original # Original or Adapter
     architecture: str = "Unknown"
@@ -70,14 +75,18 @@ class EvalResult:
         results = {}
         for task in Tasks:
             task = task.value
             # We average all scores of a given metric (not all metrics are present in all files)
-            accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
-            if accs.size == 0 or any([acc is None for acc in accs]):
                 continue
-            mean_acc = np.mean(accs) * 100.0
-            results[task.benchmark] = mean_acc
         return self(
             eval_name=result_key,
@@ -85,7 +94,7 @@ class EvalResult:
             org=org,
             model=model,
             results=results,
-            precision=precision,
             revision= config.get("model_sha", ""),
             still_on_hub=still_on_hub,
             architecture=architecture
@@ -105,7 +114,7 @@ class EvalResult:
             self.num_params = request.get("params", 0)
             self.date = request.get("submitted_time", "")
         except Exception:
-            print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
@@ -127,13 +136,7 @@ class EvalResult:
         }
         for task in Tasks:
-            print("Examining task")
-            print(task)
-            #print("Data dict:")
-            #print(data_dict[task.value.col_name])
-            print("Self:")
-            print(self.results[task.value.benchmark])
-            data_dict[task.value.col_name] = self.results[task.value.benchmark]
         return data_dict
@@ -163,8 +166,8 @@ def get_request_file_for_model(requests_path, model_name, precision):
 def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
     """From the path of the results folder root, extract all needed info for results"""
     model_result_filepaths = []
-    print('looking in results_path: %s' % results_path)
-    print('looking in requests_path: %s' % requests_path)
     for root, _, files in os.walk(results_path):
         # We should only have json files in model results
         if len(files) == 0 or any([not f.endswith(".json") for f in files]):
@@ -181,8 +184,8 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
     eval_results = {}
     for model_result_filepath in model_result_filepaths:
-        print("Examining filepath:")
-        print(model_result_filepath)
         # Creation of result
         eval_result = EvalResult.init_from_json_file(model_result_filepath)
         eval_result.update_with_request_file(requests_path)
@@ -193,8 +196,8 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
             eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
         else:
             eval_results[eval_name] = eval_result
-    print("eval results is")
-    print(eval_results)
     results = []
     for v in eval_results.values():

 import glob
 import json
 import os
+import logging
 from dataclasses import dataclass
 import dateutil
 from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
 from src.submission.check_validity import is_model_on_hub
+from src.logging import setup_logger, log_file
+logging.basicConfig(level=logging.DEBUG)
+logger = setup_logger(__name__)
 @dataclass
 class EvalResult:
     model: str
     revision: str # commit hash, "" if main
     results: dict
+    precision: Precision = Precision.Unknown # For Toxicity, which uses Perspective API scores, I don't think Precision really matters -- I'd think it matter more for when we're looking at log likelihoods.
     model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
     weight_type: WeightType = WeightType.Original # Original or Adapter
     architecture: str = "Unknown"
         results = {}
         for task in Tasks:
             task = task.value
+            logger.info("Task: %s" % task.metric)
+            logger.info(data["results"].items())
             # We average all scores of a given metric (not all metrics are present in all files)
+            # This looks a bit odd, should just be the one score in the one file. (?)
+            scores = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
+            logger.info("scores are:")
+            logger.info(scores)
+            if scores.size == 0 or any([score is None for score in scores]):
                 continue
+            mean_score = np.mean(scores) #* 100.0
+            results[(task.benchmark, task.metric)] = mean_score
         return self(
             eval_name=result_key,
             org=org,
             model=model,
             results=results,
+            precision=precision,
             revision= config.get("model_sha", ""),
             still_on_hub=still_on_hub,
             architecture=architecture
             self.num_params = request.get("params", 0)
             self.date = request.get("submitted_time", "")
         except Exception:
+            logger.error(f"Could not find request file for {self.org}/{self.model}") #with precision {self.precision.value.name}")
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
         }
         for task in Tasks:
+            data_dict[task.value.col_name] = self.results[(task.value.benchmark, task.value.metric)]
         return data_dict
 def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
     """From the path of the results folder root, extract all needed info for results"""
     model_result_filepaths = []
+    logger.debug('looking in results_path: %s' % results_path)
+    logger.debug('looking in requests_path: %s' % requests_path)
     for root, _, files in os.walk(results_path):
         # We should only have json files in model results
         if len(files) == 0 or any([not f.endswith(".json") for f in files]):
     eval_results = {}
     for model_result_filepath in model_result_filepaths:
+        logger.debug("Examining filepath:")
+        logger.debug(model_result_filepath)
         # Creation of result
         eval_result = EvalResult.init_from_json_file(model_result_filepath)
         eval_result.update_with_request_file(requests_path)
             eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
         else:
             eval_results[eval_name] = eval_result
+    logger.info("eval results is")
+    logger.info(eval_results)
     results = []
     for v in eval_results.values():