Contamination results based on "Data Contamination Quiz"
#9
by
shahriargolchin
- opened
- .gitignore +2 -1
- README.md +1 -1
- contamination_report.csv +51 -15
.gitignore
CHANGED
@@ -1,2 +1,3 @@
|
|
1 |
*.pyc
|
2 |
-
*.json
|
|
|
|
1 |
*.pyc
|
2 |
+
*.json
|
3 |
+
*.lock
|
README.md
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
---
|
2 |
-
title: ππ¨ Data Contamination
|
3 |
emoji: π
|
4 |
colorFrom: green
|
5 |
colorTo: blue
|
|
|
1 |
---
|
2 |
+
title: ππ¨ Data Contamination Database
|
3 |
emoji: π
|
4 |
colorFrom: green
|
5 |
colorTo: blue
|
contamination_report.csv
CHANGED
@@ -1,5 +1,30 @@
|
|
1 |
Evaluation Dataset;Subset;Contaminated Source;Model or corpus;Train Split;Development Split;Test Split;Approach;Reference;PR
|
2 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
conll2003;;GPT-3.5;model;100.0;100.0;100.0;model-based;https://hitz-zentroa.github.io/lm-contamination/blog/;7
|
4 |
nyu-mll/glue;mnli;GPT-3.5;model;100.0;100.0;;model-based;https://hitz-zentroa.github.io/lm-contamination/blog/;7
|
5 |
rajpurkar/squad_v2;;GPT-3.5;model;100.0;100.0;;model-based;https://hitz-zentroa.github.io/lm-contamination/blog/;7
|
@@ -428,24 +453,35 @@ zest;;EleutherAI/pile;corpus;;;0.0;data-based;https://arxiv.org/abs/2310.20707;2
|
|
428 |
zest;;togethercomputer/RedPajama-Data-V2;corpus;;;0.0;data-based;https://arxiv.org/abs/2310.20707;2
|
429 |
|
430 |
|
431 |
-
imdb;;GPT-4;model;100.0;;0.0;model-based;https://arxiv.org/
|
432 |
-
imdb;;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/
|
433 |
|
434 |
-
ag_news;;GPT-4;model;100.0;;100.0;model-based;https://arxiv.org/
|
435 |
-
ag_news;;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/
|
436 |
|
437 |
-
yelp_review_full;;GPT-4;model;0.0;;0.0;model-based;https://arxiv.org/
|
438 |
-
yelp_review_full;;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/
|
439 |
|
440 |
-
nyu-mll/glue;rte;GPT-4;model;100.0;;0.0;model-based;https://arxiv.org/
|
441 |
-
nyu-mll/glue;rte;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/
|
442 |
|
443 |
-
nyu-mll/glue;wnli;GPT-4;model;100.0;;100.0;model-based;https://arxiv.org/
|
444 |
-
nyu-mll/glue;wnli;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/
|
445 |
|
446 |
-
samsum;;GPT-4;model;0.0;;0.0;model-based;https://arxiv.org/
|
447 |
-
samsum;;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/
|
448 |
|
449 |
-
EdinburghNLP/xsum;;GPT-4;model;0.0;;100.0;model-based;https://arxiv.org/
|
450 |
-
EdinburghNLP/xsum;;GPT-3.5;model;0.0;;100.0;model-based;https://arxiv.org/
|
451 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
Evaluation Dataset;Subset;Contaminated Source;Model or corpus;Train Split;Development Split;Test Split;Approach;Reference;PR
|
2 |
|
3 |
+
gsm8k;;GPT-4;model;79.00;;;model-based;https://arxiv.org/abs/2311.06233;8
|
4 |
+
ucinlp/drop;;GPT-4;model;;44.00;;model-based;https://arxiv.org/abs/2311.06233;8
|
5 |
+
openai_humaneval;;GPT-4;model;;;56.71;model-based;https://arxiv.org/abs/2311.06233;8
|
6 |
+
imdb;;GPT-4;model;;;82.00;model-based;https://arxiv.org/abs/2311.06233;8
|
7 |
+
imdb;;GPT-3.5;model;;;55.00;model-based;https://arxiv.org/abs/2311.06233;8
|
8 |
+
ag_news;;GPT-4;model;;;91.00;model-based;https://arxiv.org/abs/2311.06233;8
|
9 |
+
ag_news;;GPT-3.5;model;;;82.00;model-based;https://arxiv.org/abs/2311.06233;8
|
10 |
+
yelp_review_full;;GPT-4;model;;;80.00;model-based;https://arxiv.org/abs/2311.06233;8
|
11 |
+
yelp_review_full;;GPT-3.5;model;;;13.00;model-based;https://arxiv.org/abs/2311.06233;8
|
12 |
+
nyu-mll/glue;rte;GPT-4;model;;60.00;;model-based;https://arxiv.org/abs/2311.06233;8
|
13 |
+
nyu-mll/glue;rte;GPT-3.5;model;;71.00;;model-based;https://arxiv.org/abs/2311.06233;8
|
14 |
+
nyu-mll/glue;wnli;GPT-4;model;;50.70;;model-based;https://arxiv.org/abs/2311.06233;8
|
15 |
+
nyu-mll/glue;wnli;GPT-3.5;model;;12.68;;model-based;https://arxiv.org/abs/2311.06233;8
|
16 |
+
samsum;;GPT-4;model;;;77.00;model-based;https://arxiv.org/abs/2311.06233;8
|
17 |
+
samsum;;GPT-3.5;model;;;74.00;model-based;https://arxiv.org/abs/2311.06233;8
|
18 |
+
EdinburghNLP/xsum;;GPT-4;model;;;95.00;model-based;https://arxiv.org/abs/2311.06233;8
|
19 |
+
EdinburghNLP/xsum;;GPT-3.5;model;;;79.00;model-based;https://arxiv.org/abs/2311.06233;8
|
20 |
+
|
21 |
+
allenai/ai2_arc;;CommonCrawl;corpus;;;28.7;data-based;https://arxiv.org/abs/2310.17589;5
|
22 |
+
tau/commonsense_qa;;CommonCrawl;corpus;;1.6;;data-based;https://arxiv.org/abs/2310.17589;5
|
23 |
+
winogrande;;CommonCrawl;corpus;;1.1;;data-based;https://arxiv.org/abs/2310.17589;5
|
24 |
+
ceval/ceval-exam;;CommonCrawl;corpus;;45.8;;data-based;https://arxiv.org/abs/2310.17589;5
|
25 |
+
Rowan/hellaswag;;CommonCrawl;corpus;;12.4;;data-based;https://arxiv.org/abs/2310.17589;5
|
26 |
+
cais/mmlu;;CommonCrawl;corpus;;;29.1;data-based;https://arxiv.org/abs/2310.17589;5
|
27 |
+
|
28 |
conll2003;;GPT-3.5;model;100.0;100.0;100.0;model-based;https://hitz-zentroa.github.io/lm-contamination/blog/;7
|
29 |
nyu-mll/glue;mnli;GPT-3.5;model;100.0;100.0;;model-based;https://hitz-zentroa.github.io/lm-contamination/blog/;7
|
30 |
rajpurkar/squad_v2;;GPT-3.5;model;100.0;100.0;;model-based;https://hitz-zentroa.github.io/lm-contamination/blog/;7
|
|
|
453 |
zest;;togethercomputer/RedPajama-Data-V2;corpus;;;0.0;data-based;https://arxiv.org/abs/2310.20707;2
|
454 |
|
455 |
|
456 |
+
imdb;;GPT-4;model;100.0;;0.0;model-based;https://arxiv.org/abs/2308.08493;3
|
457 |
+
imdb;;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/abs/2308.08493;3
|
458 |
|
459 |
+
ag_news;;GPT-4;model;100.0;;100.0;model-based;https://arxiv.org/abs/2308.08493;3
|
460 |
+
ag_news;;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/abs/2308.08493;3
|
461 |
|
462 |
+
yelp_review_full;;GPT-4;model;0.0;;0.0;model-based;https://arxiv.org/abs/2308.08493;3
|
463 |
+
yelp_review_full;;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/abs/2308.08493;3
|
464 |
|
465 |
+
nyu-mll/glue;rte;GPT-4;model;100.0;;0.0;model-based;https://arxiv.org/abs/2308.08493;3
|
466 |
+
nyu-mll/glue;rte;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/abs/2308.08493;3
|
467 |
|
468 |
+
nyu-mll/glue;wnli;GPT-4;model;100.0;;100.0;model-based;https://arxiv.org/abs/2308.08493;3
|
469 |
+
nyu-mll/glue;wnli;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/abs/2308.08493;3
|
470 |
|
471 |
+
samsum;;GPT-4;model;0.0;;0.0;model-based;https://arxiv.org/abs/2308.08493;3
|
472 |
+
samsum;;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/abs/2308.08493;3
|
473 |
|
474 |
+
EdinburghNLP/xsum;;GPT-4;model;0.0;;100.0;model-based;https://arxiv.org/abs/2308.08493;3
|
475 |
+
EdinburghNLP/xsum;;GPT-3.5;model;0.0;;100.0;model-based;https://arxiv.org/abs/2308.08493;3
|
476 |
+
|
477 |
+
bigbio/mednli;;GPT-4;model;0.0;0.0;0.0;model-based;https://arxiv.org/abs/2308.08493;8
|
478 |
+
bigbio/mednli;;GPT-3.5;model;0.0;0.0;0.0;model-based;https://arxiv.org/abs/2308.08493;8
|
479 |
+
|
480 |
+
RadNLI;;GPT-4;model;0.0;0.0;0.0;model-based;https://arxiv.org/abs/2308.08493;8
|
481 |
+
RadNLI;;GPT-3.5;model;0.0;0.0;0.0;model-based;https://arxiv.org/abs/2308.08493;8
|
482 |
+
|
483 |
+
|
484 |
+
openai_humaneval;;EleutherAI/pile;corpus;;;12.2;data-based;https://arxiv.org/abs/2403.04811;12
|
485 |
+
mbpp;;EleutherAI/pile;corpus;;;3.6;data-based;https://arxiv.org/abs/2403.04811;12
|
486 |
+
openai_humaneval;;bigcode/the-stack;corpus;;;18.9;data-based;https://arxiv.org/abs/2403.04811;12
|
487 |
+
mbpp;;bigcode/the-stack;corpus;;;20.8;data-based;https://arxiv.org/abs/2403.04811;12
|