|
dataset,prompt,metric,value
|
|
anli_dev_r1,GPT-3 style,accuracy,0.406
|
|
anli_dev_r1,MNLI crowdsource,accuracy,0.381
|
|
anli_dev_r1,can we infer,accuracy,0.417
|
|
anli_dev_r1,guaranteed/possible/impossible,accuracy,0.312
|
|
anli_dev_r1,justified in saying,accuracy,0.412
|
|
anli_dev_r1,median,accuracy,0.406
|
|
anli_dev_r2,GPT-3 style,accuracy,0.39
|
|
anli_dev_r2,MNLI crowdsource,accuracy,0.365
|
|
anli_dev_r2,can we infer,accuracy,0.393
|
|
anli_dev_r2,guaranteed/possible/impossible,accuracy,0.319
|
|
anli_dev_r2,justified in saying,accuracy,0.378
|
|
anli_dev_r2,median,accuracy,0.378
|
|
anli_dev_r3,GPT-3 style,accuracy,0.4083333333333333
|
|
anli_dev_r3,MNLI crowdsource,accuracy,0.3641666666666667
|
|
anli_dev_r3,can we infer,accuracy,0.42833333333333334
|
|
anli_dev_r3,guaranteed/possible/impossible,accuracy,0.30666666666666664
|
|
anli_dev_r3,justified in saying,accuracy,0.4008333333333333
|
|
anli_dev_r3,median,accuracy,0.4008333333333333
|
|
story_cloze_2016,Answer Given options,accuracy,0.9401389631213255
|
|
story_cloze_2016,Choose Story Ending,accuracy,0.9406734366648851
|
|
story_cloze_2016,Generate Ending,accuracy,0.7883484767504009
|
|
story_cloze_2016,Novel Correct Ending,accuracy,0.9465526456440406
|
|
story_cloze_2016,Story Continuation and Options,accuracy,0.9390700160342063
|
|
story_cloze_2016,median,accuracy,0.9401389631213255
|
|
super_glue_cb,GPT-3 style,accuracy,0.75
|
|
super_glue_cb,MNLI crowdsource,accuracy,0.125
|
|
super_glue_cb,can we infer,accuracy,0.8571428571428571
|
|
super_glue_cb,guaranteed/possible/impossible,accuracy,0.32142857142857145
|
|
super_glue_cb,justified in saying,accuracy,0.8392857142857143
|
|
super_glue_cb,median,accuracy,0.75
|
|
super_glue_copa,"C1 or C2? premise, so/because…",accuracy,0.76
|
|
super_glue_copa,best_option,accuracy,0.84
|
|
super_glue_copa,cause_effect,accuracy,0.91
|
|
super_glue_copa,i_am_hesitating,accuracy,0.89
|
|
super_glue_copa,plausible_alternatives,accuracy,0.9
|
|
super_glue_copa,median,accuracy,0.89
|
|
super_glue_rte,GPT-3 style,accuracy,0.851985559566787
|
|
super_glue_rte,MNLI crowdsource,accuracy,0.7978339350180506
|
|
super_glue_rte,does it follow that,accuracy,0.7364620938628159
|
|
super_glue_rte,guaranteed true,accuracy,0.7220216606498195
|
|
super_glue_rte,should assume,accuracy,0.6931407942238267
|
|
super_glue_rte,median,accuracy,0.7364620938628159
|
|
winogrande_winogrande_xl,Replace,accuracy,0.5564325177584846
|
|
winogrande_winogrande_xl,True or False,accuracy,0.5043409629044988
|
|
winogrande_winogrande_xl,does underscore refer to,accuracy,0.5588003157063931
|
|
winogrande_winogrande_xl,stand for,accuracy,0.5311760063141279
|
|
winogrande_winogrande_xl,underscore refer to,accuracy,0.5706393054459353
|
|
winogrande_winogrande_xl,median,accuracy,0.5564325177584846
|
|
xcopa_id,"C1 or C2? premise, so/because…",accuracy,0.55
|
|
xcopa_id,best_option,accuracy,0.68
|
|
xcopa_id,cause_effect,accuracy,0.82
|
|
xcopa_id,i_am_hesitating,accuracy,0.78
|
|
xcopa_id,plausible_alternatives,accuracy,0.81
|
|
xcopa_id,median,accuracy,0.78
|
|
xcopa_sw,"C1 or C2? premise, so/because…",accuracy,0.5
|
|
xcopa_sw,best_option,accuracy,0.56
|
|
xcopa_sw,cause_effect,accuracy,0.55
|
|
xcopa_sw,i_am_hesitating,accuracy,0.57
|
|
xcopa_sw,plausible_alternatives,accuracy,0.58
|
|
xcopa_sw,median,accuracy,0.56
|
|
xcopa_ta,"C1 or C2? premise, so/because…",accuracy,0.54
|
|
xcopa_ta,best_option,accuracy,0.65
|
|
xcopa_ta,cause_effect,accuracy,0.68
|
|
xcopa_ta,i_am_hesitating,accuracy,0.66
|
|
xcopa_ta,plausible_alternatives,accuracy,0.69
|
|
xcopa_ta,median,accuracy,0.66
|
|
xcopa_vi,"C1 or C2? premise, so/because…",accuracy,0.59
|
|
xcopa_vi,best_option,accuracy,0.72
|
|
xcopa_vi,cause_effect,accuracy,0.79
|
|
xcopa_vi,i_am_hesitating,accuracy,0.74
|
|
xcopa_vi,plausible_alternatives,accuracy,0.76
|
|
xcopa_vi,median,accuracy,0.74
|
|
xcopa_zh,"C1 or C2? premise, so/because…",accuracy,0.68
|
|
xcopa_zh,best_option,accuracy,0.73
|
|
xcopa_zh,cause_effect,accuracy,0.81
|
|
xcopa_zh,i_am_hesitating,accuracy,0.82
|
|
xcopa_zh,plausible_alternatives,accuracy,0.8
|
|
xcopa_zh,median,accuracy,0.8
|
|
xnli_ar,GPT-3 style,accuracy,0.45502008032128516
|
|
xnli_ar,MNLI crowdsource,accuracy,0.41285140562248995
|
|
xnli_ar,can we infer,accuracy,0.5522088353413654
|
|
xnli_ar,guaranteed/possible/impossible,accuracy,0.463855421686747
|
|
xnli_ar,justified in saying,accuracy,0.5020080321285141
|
|
xnli_ar,median,accuracy,0.463855421686747
|
|
xnli_en,GPT-3 style,accuracy,0.5502008032128514
|
|
xnli_en,MNLI crowdsource,accuracy,0.4036144578313253
|
|
xnli_en,can we infer,accuracy,0.5971887550200803
|
|
xnli_en,guaranteed/possible/impossible,accuracy,0.52570281124498
|
|
xnli_en,justified in saying,accuracy,0.5642570281124498
|
|
xnli_en,median,accuracy,0.5502008032128514
|
|
xnli_es,GPT-3 style,accuracy,0.5208835341365462
|
|
xnli_es,MNLI crowdsource,accuracy,0.43012048192771085
|
|
xnli_es,can we infer,accuracy,0.5931726907630522
|
|
xnli_es,guaranteed/possible/impossible,accuracy,0.4955823293172691
|
|
xnli_es,justified in saying,accuracy,0.5578313253012048
|
|
xnli_es,median,accuracy,0.5208835341365462
|
|
xnli_fr,GPT-3 style,accuracy,0.5180722891566265
|
|
xnli_fr,MNLI crowdsource,accuracy,0.41847389558232934
|
|
xnli_fr,can we infer,accuracy,0.5799196787148594
|
|
xnli_fr,guaranteed/possible/impossible,accuracy,0.47269076305220886
|
|
xnli_fr,justified in saying,accuracy,0.5433734939759036
|
|
xnli_fr,median,accuracy,0.5180722891566265
|
|
xnli_hi,GPT-3 style,accuracy,0.4578313253012048
|
|
xnli_hi,MNLI crowdsource,accuracy,0.40803212851405624
|
|
xnli_hi,can we infer,accuracy,0.5502008032128514
|
|
xnli_hi,guaranteed/possible/impossible,accuracy,0.43413654618473896
|
|
xnli_hi,justified in saying,accuracy,0.4963855421686747
|
|
xnli_hi,median,accuracy,0.4578313253012048
|
|
xnli_sw,GPT-3 style,accuracy,0.37269076305220883
|
|
xnli_sw,MNLI crowdsource,accuracy,0.36305220883534134
|
|
xnli_sw,can we infer,accuracy,0.5004016064257029
|
|
xnli_sw,guaranteed/possible/impossible,accuracy,0.37269076305220883
|
|
xnli_sw,justified in saying,accuracy,0.4606425702811245
|
|
xnli_sw,median,accuracy,0.37269076305220883
|
|
xnli_ur,GPT-3 style,accuracy,0.40080321285140563
|
|
xnli_ur,MNLI crowdsource,accuracy,0.39156626506024095
|
|
xnli_ur,can we infer,accuracy,0.5180722891566265
|
|
xnli_ur,guaranteed/possible/impossible,accuracy,0.40281124497991966
|
|
xnli_ur,justified in saying,accuracy,0.4602409638554217
|
|
xnli_ur,median,accuracy,0.40281124497991966
|
|
xnli_vi,GPT-3 style,accuracy,0.4927710843373494
|
|
xnli_vi,MNLI crowdsource,accuracy,0.39959839357429716
|
|
xnli_vi,can we infer,accuracy,0.5726907630522088
|
|
xnli_vi,guaranteed/possible/impossible,accuracy,0.4979919678714859
|
|
xnli_vi,justified in saying,accuracy,0.5152610441767068
|
|
xnli_vi,median,accuracy,0.4979919678714859
|
|
xnli_zh,GPT-3 style,accuracy,0.4759036144578313
|
|
xnli_zh,MNLI crowdsource,accuracy,0.40602409638554215
|
|
xnli_zh,can we infer,accuracy,0.5694779116465863
|
|
xnli_zh,guaranteed/possible/impossible,accuracy,0.4847389558232932
|
|
xnli_zh,justified in saying,accuracy,0.5028112449799197
|
|
xnli_zh,median,accuracy,0.4847389558232932
|
|
xstory_cloze_ar,Answer Given options,accuracy,0.9205823957643945
|
|
xstory_cloze_ar,Choose Story Ending,accuracy,0.9185969556585043
|
|
xstory_cloze_ar,Generate Ending,accuracy,0.6790205162144275
|
|
xstory_cloze_ar,Novel Correct Ending,accuracy,0.9212442091330245
|
|
xstory_cloze_ar,Story Continuation and Options,accuracy,0.9086697551290536
|
|
xstory_cloze_ar,median,accuracy,0.9185969556585043
|
|
xstory_cloze_es,Answer Given options,accuracy,0.928524156187955
|
|
xstory_cloze_es,Choose Story Ending,accuracy,0.9391131700860358
|
|
xstory_cloze_es,Generate Ending,accuracy,0.7306419589675711
|
|
xstory_cloze_es,Novel Correct Ending,accuracy,0.9291859695565851
|
|
xstory_cloze_es,Story Continuation and Options,accuracy,0.9212442091330245
|
|
xstory_cloze_es,median,accuracy,0.928524156187955
|
|
xstory_cloze_eu,Answer Given options,accuracy,0.8530774321641297
|
|
xstory_cloze_eu,Choose Story Ending,accuracy,0.8669755129053607
|
|
xstory_cloze_eu,Generate Ending,accuracy,0.6465916611515553
|
|
xstory_cloze_eu,Novel Correct Ending,accuracy,0.8504301786896096
|
|
xstory_cloze_eu,Story Continuation and Options,accuracy,0.8358702845797485
|
|
xstory_cloze_eu,median,accuracy,0.8504301786896096
|
|
xstory_cloze_hi,Answer Given options,accuracy,0.870946393117141
|
|
xstory_cloze_hi,Choose Story Ending,accuracy,0.8835208471211119
|
|
xstory_cloze_hi,Generate Ending,accuracy,0.6565188616810059
|
|
xstory_cloze_hi,Novel Correct Ending,accuracy,0.8689609530112509
|
|
xstory_cloze_hi,Story Continuation and Options,accuracy,0.8722700198544011
|
|
xstory_cloze_hi,median,accuracy,0.870946393117141
|
|
xstory_cloze_id,Answer Given options,accuracy,0.913964262078094
|
|
xstory_cloze_id,Choose Story Ending,accuracy,0.9258769027134348
|
|
xstory_cloze_id,Generate Ending,accuracy,0.7021839841164792
|
|
xstory_cloze_id,Novel Correct Ending,accuracy,0.9159497021839841
|
|
xstory_cloze_id,Story Continuation and Options,accuracy,0.8954334877564527
|
|
xstory_cloze_id,median,accuracy,0.913964262078094
|
|
xstory_cloze_zh,Answer Given options,accuracy,0.9113170086035738
|
|
xstory_cloze_zh,Choose Story Ending,accuracy,0.9225678358702846
|
|
xstory_cloze_zh,Generate Ending,accuracy,0.6915949702183984
|
|
xstory_cloze_zh,Novel Correct Ending,accuracy,0.9172733289212442
|
|
xstory_cloze_zh,Story Continuation and Options,accuracy,0.9060225016545335
|
|
xstory_cloze_zh,median,accuracy,0.9113170086035738
|
|
xwinograd_en,Replace,accuracy,0.6064516129032258
|
|
xwinograd_en,True or False,accuracy,0.48516129032258065
|
|
xwinograd_en,does underscore refer to,accuracy,0.6008602150537634
|
|
xwinograd_en,stand for,accuracy,0.5230107526881721
|
|
xwinograd_en,underscore refer to,accuracy,0.5974193548387097
|
|
xwinograd_en,median,accuracy,0.5974193548387097
|
|
xwinograd_fr,Replace,accuracy,0.5301204819277109
|
|
xwinograd_fr,True or False,accuracy,0.5301204819277109
|
|
xwinograd_fr,does underscore refer to,accuracy,0.5903614457831325
|
|
xwinograd_fr,stand for,accuracy,0.4457831325301205
|
|
xwinograd_fr,underscore refer to,accuracy,0.5180722891566265
|
|
xwinograd_fr,median,accuracy,0.5301204819277109
|
|
xwinograd_pt,Replace,accuracy,0.6045627376425855
|
|
xwinograd_pt,True or False,accuracy,0.4600760456273764
|
|
xwinograd_pt,does underscore refer to,accuracy,0.5437262357414449
|
|
xwinograd_pt,stand for,accuracy,0.5057034220532319
|
|
xwinograd_pt,underscore refer to,accuracy,0.5399239543726235
|
|
xwinograd_pt,median,accuracy,0.5399239543726235
|
|
xwinograd_zh,Replace,accuracy,0.5595238095238095
|
|
xwinograd_zh,True or False,accuracy,0.5119047619047619
|
|
xwinograd_zh,does underscore refer to,accuracy,0.5158730158730159
|
|
xwinograd_zh,stand for,accuracy,0.49404761904761907
|
|
xwinograd_zh,underscore refer to,accuracy,0.5357142857142857
|
|
xwinograd_zh,median,accuracy,0.5158730158730159
|
|
multiple,average,multiple,0.6388768429576182
|
|
|