Commit 7b5f39c
pminervini committed
1 Parent(s): e6de949

update
Browse files
- blog/figures/clustermap_all.pdf +0 -0
- blog/figures/clustermap_all_coolwarm.pdf +0 -0
- blog/figures/clustermap_all_viridis.pdf +0 -0
- blog/figures/clustermap_detect.pdf +0 -0
- blog/figures/clustermap_detect_coolwarm.pdf +0 -0
- blog/figures/clustermap_detect_viridis.pdf +0 -0
- blog/figures/clustermap_instr.pdf +0 -0
- blog/figures/clustermap_instr_coolwarm.pdf +0 -0
- blog/figures/clustermap_instr_viridis.pdf +0 -0
- blog/figures/clustermap_qa.pdf +0 -0
- blog/figures/clustermap_qa_coolwarm.pdf +0 -0
- blog/figures/clustermap_qa_viridis.pdf +0 -0
- blog/figures/clustermap_rc.pdf +0 -0
- blog/figures/clustermap_rc_coolwarm.pdf +0 -0
- blog/figures/clustermap_rc_viridis.pdf +0 -0
- blog/figures/clustermap_summ.pdf +0 -0
- blog/figures/clustermap_summ_coolwarm.pdf +0 -0
- blog/figures/clustermap_summ_viridis.pdf +0 -0
- cli/analysis-cli.py +43 -24
- src/backend/tasks/selfcheckgpt/task.py +3 -2
blog/figures/clustermap_all.pdf
CHANGED
Binary files a/blog/figures/clustermap_all.pdf and b/blog/figures/clustermap_all.pdf differ

blog/figures/clustermap_all_coolwarm.pdf
CHANGED
Binary files a/blog/figures/clustermap_all_coolwarm.pdf and b/blog/figures/clustermap_all_coolwarm.pdf differ

blog/figures/clustermap_all_viridis.pdf
CHANGED
Binary files a/blog/figures/clustermap_all_viridis.pdf and b/blog/figures/clustermap_all_viridis.pdf differ

blog/figures/clustermap_detect.pdf
CHANGED
Binary files a/blog/figures/clustermap_detect.pdf and b/blog/figures/clustermap_detect.pdf differ

blog/figures/clustermap_detect_coolwarm.pdf
CHANGED
Binary files a/blog/figures/clustermap_detect_coolwarm.pdf and b/blog/figures/clustermap_detect_coolwarm.pdf differ

blog/figures/clustermap_detect_viridis.pdf
CHANGED
Binary files a/blog/figures/clustermap_detect_viridis.pdf and b/blog/figures/clustermap_detect_viridis.pdf differ

blog/figures/clustermap_instr.pdf
CHANGED
Binary files a/blog/figures/clustermap_instr.pdf and b/blog/figures/clustermap_instr.pdf differ

blog/figures/clustermap_instr_coolwarm.pdf
CHANGED
Binary files a/blog/figures/clustermap_instr_coolwarm.pdf and b/blog/figures/clustermap_instr_coolwarm.pdf differ

blog/figures/clustermap_instr_viridis.pdf
CHANGED
Binary files a/blog/figures/clustermap_instr_viridis.pdf and b/blog/figures/clustermap_instr_viridis.pdf differ

blog/figures/clustermap_qa.pdf
CHANGED
Binary files a/blog/figures/clustermap_qa.pdf and b/blog/figures/clustermap_qa.pdf differ

blog/figures/clustermap_qa_coolwarm.pdf
CHANGED
Binary files a/blog/figures/clustermap_qa_coolwarm.pdf and b/blog/figures/clustermap_qa_coolwarm.pdf differ

blog/figures/clustermap_qa_viridis.pdf
CHANGED
Binary files a/blog/figures/clustermap_qa_viridis.pdf and b/blog/figures/clustermap_qa_viridis.pdf differ

blog/figures/clustermap_rc.pdf
CHANGED
Binary files a/blog/figures/clustermap_rc.pdf and b/blog/figures/clustermap_rc.pdf differ

blog/figures/clustermap_rc_coolwarm.pdf
CHANGED
Binary files a/blog/figures/clustermap_rc_coolwarm.pdf and b/blog/figures/clustermap_rc_coolwarm.pdf differ

blog/figures/clustermap_rc_viridis.pdf
CHANGED
Binary files a/blog/figures/clustermap_rc_viridis.pdf and b/blog/figures/clustermap_rc_viridis.pdf differ

blog/figures/clustermap_summ.pdf
CHANGED
Binary files a/blog/figures/clustermap_summ.pdf and b/blog/figures/clustermap_summ.pdf differ

blog/figures/clustermap_summ_coolwarm.pdf
CHANGED
Binary files a/blog/figures/clustermap_summ_coolwarm.pdf and b/blog/figures/clustermap_summ_coolwarm.pdf differ

blog/figures/clustermap_summ_viridis.pdf
CHANGED
Binary files a/blog/figures/clustermap_summ_viridis.pdf and b/blog/figures/clustermap_summ_viridis.pdf differ
cli/analysis-cli.py
CHANGED

@@ -19,6 +19,14 @@ from src.envs import QUEUE_REPO, RESULTS_REPO, API
 from src.utils import my_snapshot_download
 
 
+def is_float(string):
+    try:
+        float(string)
+        return True
+    except ValueError:
+        return False
+
+
 def find_json_files(json_path):
     res = []
     for root, dirs, files in os.walk(json_path):
@@ -40,13 +48,16 @@ def sanitise_metric(name: str) -> str:
     res = res.replace("exact", "EM")
     res = res.replace("HasAns_EM", "HasAns")
     res = res.replace("NoAns_EM", "NoAns")
+    res = res.replace("em", "EM")
     return res
 
 
 def sanitise_dataset(name: str) -> str:
     res = name
-    res = res.replace("tqa8", "TriviaQA")
-    res = res.replace("nq8", "NQ")
+    res = res.replace("tqa8", "TriviaQA (8-shot)")
+    res = res.replace("nq8", "NQ (8-shot)")
+    res = res.replace("nq_open", "NQ (64-shot)")
+    res = res.replace("triviaqa", "TriviaQA (64-shot)")
     res = res.replace("truthfulqa", "TruthfulQA")
     res = res.replace("ifeval", "IFEval")
     res = res.replace("selfcheckgpt", "SelfCheckGPT")
@@ -111,12 +122,16 @@ if data_map is None:
     for dataset_name, results_dict in data["results"].items():
         for metric_name, value in results_dict.items():
 
-            if … \
-                and 'f1' not in metric_name \
-                and model_name_to_model_map[model_name]["likes"] > 128:
+            if model_name_to_model_map[model_name]["likes"] > 128:
 
                 to_add = True
 
+                if 'f1' in metric_name:
+                    to_add = False
+
+                if 'stderr' in metric_name:
+                    to_add = False
+
                 if 'memo-trap_v2' in dataset_name:
                     to_add = False
 
@@ -128,9 +143,6 @@ if data_map is None:
                 if 'faithdial' in dataset_name:
                     to_add = False
 
-                if 'nq_open' in dataset_name or 'triviaqa' in dataset_name:
-                    to_add = False
-
                 if 'truthfulqa_gen' in dataset_name:
                     to_add = False
 
@@ -138,13 +150,9 @@ if data_map is None:
                 if 'precision' not in metric_name:
                     to_add = False
 
-                if '…
-                    …
-
-                if 'rouge' in metric_name:
-                    pass
-                    # if 'rougeL' not in metric_name:
-                    #     to_add = False
+                if 'halueval' in dataset_name:
+                    if 'acc' not in metric_name:
+                        to_add = False
 
                 if 'ifeval' in dataset_name:
                     if 'prompt_level_strict_acc' not in metric_name:
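Condensed into a single predicate, the selection logic in the hunks above reads roughly as follows. This is a hypothetical helper (keep_result is not part of the CLI), it includes only the filters visible in these hunks, and model_info stands in for model_name_to_model_map[model_name]:

    def keep_result(model_info: dict, dataset_name: str, metric_name: str) -> bool:
        # Only consider models with some minimum popularity on the Hub.
        if model_info["likes"] <= 128:
            return False
        # Drop F1 scores and standard-error entries.
        if 'f1' in metric_name or 'stderr' in metric_name:
            return False
        # Dataset-level exclusions.
        if any(d in dataset_name for d in ('memo-trap_v2', 'faithdial', 'truthfulqa_gen')):
            return False
        # HaluEval: keep accuracy only.
        if 'halueval' in dataset_name and 'acc' not in metric_name:
            return False
        return True

    print(keep_result({"likes": 200}, "halueval_qa", "acc"))  # True
    print(keep_result({"likes": 200}, "nq8", "f1"))           # False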
@@ -161,14 +169,23 @@ if data_map is None:
                 if ('xsum' in dataset_name or 'cnn' in dataset_name) and 'v2' in dataset_name:
                     to_add = False
 
-                if …
-                    value …
-
-
-
+                if isinstance(value, str):
+                    if is_float(value):
+                        value = float(value)
+                    else:
+                        to_add = False
 
                 if to_add:
-                    …
+                    if 'rouge' in metric_name:
+                        value /= 100.0
+
+                    if 'squad' in dataset_name:
+                        value /= 100.0
+
+                    sanitised_metric_name = metric_name
+                    if "," in sanitised_metric_name:
+                        sanitised_metric_name = sanitised_metric_name.split(',')[0]
+                    sanitised_metric_name = sanitise_metric(sanitised_metric_name)
                     sanitised_dataset_name = sanitise_dataset(dataset_name)
 
                     model_dataset_metric_to_result_map[(model_name, sanitised_dataset_name, sanitised_metric_name)] = value
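The coercion and rescaling added in this hunk can be exercised in isolation. Below is a small, self-contained sketch of the same logic; normalise_value is a hypothetical helper, not part of the CLI:

    def is_float(string):
        try:
            float(string)
            return True
        except ValueError:
            return False

    def normalise_value(value, dataset_name: str, metric_name: str):
        # Scores sometimes arrive as strings; parse them or drop the entry.
        if isinstance(value, str):
            if not is_float(value):
                return None
            value = float(value)
        # ROUGE and SQuAD scores are reported on a 0-100 scale;
        # bring them onto the 0-1 range used by the other metrics.
        if 'rouge' in metric_name:
            value /= 100.0
        if 'squad' in dataset_name:
            value /= 100.0
        return value

    print(normalise_value("35.2", "xsum_v1", "rougeL"))  # ~0.352
    print(normalise_value("N/A", "xsum_v1", "rougeL"))   # None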
@@ -207,6 +224,8 @@ for plot_type in plot_type_lst:
                 to_add = False
             if 'SelfCheckGPT' in dataset_metric[0] and 'MAX' not in dataset_metric[1]:
                 to_add = False
+            if '64-shot' in dataset_metric[0]:
+                to_add = False
             if to_add is True:
                 data_map_v2[dataset_metric][model_name] = data_map[model_name][dataset_metric]
         elif plot_type in {'summ'}:
@@ -316,6 +335,6 @@ for plot_type in plot_type_lst:
     cmap_suffix = '' if cmap is None else f'_{cmap}'
 
     # Save the clustermap to file
-    fig.savefig(f'…
-    fig.savefig(f'…
-    fig.savefig(f'…
+    fig.savefig(f'blog/figures/clustermap_{plot_type}{cmap_suffix}.pdf')
+    fig.savefig(f'blog/figures/clustermap_{plot_type}{cmap_suffix}.png')
+    fig.savefig(f'blog/figures/clustermap_{plot_type}{cmap_suffix}_t.png', transparent=True, facecolor="none")
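The export now writes each clustermap in three variants: a PDF, a PNG, and a transparent PNG (the _t suffix), presumably for embedding on non-white backgrounds. A standalone sketch under assumed names, with random data in place of the leaderboard matrix and output paths shortened to the current directory:

    import numpy as np
    import pandas as pd
    import seaborn as sns

    # Random stand-in for the model-by-(dataset, metric) score matrix.
    rng = np.random.default_rng(0)
    df = pd.DataFrame(rng.random((8, 6)))

    fig = sns.clustermap(df, cmap='coolwarm')  # returns a ClusterGrid; savefig delegates to the figure

    plot_type, cmap_suffix = 'qa', '_coolwarm'  # placeholders for the CLI's loop variables
    fig.savefig(f'clustermap_{plot_type}{cmap_suffix}.pdf')
    fig.savefig(f'clustermap_{plot_type}{cmap_suffix}.png')
    # Transparent background, e.g. for embedding on a dark page.
    fig.savefig(f'clustermap_{plot_type}{cmap_suffix}_t.png', transparent=True, facecolor="none")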
src/backend/tasks/selfcheckgpt/task.py
CHANGED

@@ -21,8 +21,9 @@ class SelfCheckGpt(Task):
 
     def __init__(self, data_dir=None, cache_dir=None, download_mode=None, config=None):
         super().__init__(data_dir=data_dir, cache_dir=cache_dir, download_mode=download_mode, config=config)
-        …
-        self.…
+        # these end tokens are hard-coded because of a current limitation of llm-eval
+        self.generation_kwargs = {"until": ["\n\n", "<unk>", "<|im_end|>", "</s>"], "max_length": 512}
+        self.generation_kwargs_sampling_number = 5  # the number of samples drawn for self-consistency
         self.generation_kwargs_sampling = {"temperature": 0.99, "do_sample": True, "until": ["\n\n", "<unk>", "<|im_end|>", "</s>"], "max_length": 512}
 
         self.selfcheckgpt_type = os.environ.get('SELFCHECKGPTTYPE', 'SelfCheckNLI')