Contamination results based on "Data Contamination Quiz"

#9
Files changed (3) hide show
  1. .gitignore +2 -1
  2. README.md +1 -1
  3. contamination_report.csv +51 -15
.gitignore CHANGED
@@ -1,2 +1,3 @@
1
  *.pyc
2
- *.json
 
 
1
  *.pyc
2
+ *.json
3
+ *.lock
README.md CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- title: πŸπŸ’¨ Data Contamination Report
3
  emoji: 🏭
4
  colorFrom: green
5
  colorTo: blue
 
1
  ---
2
+ title: πŸπŸ’¨ Data Contamination Database
3
  emoji: 🏭
4
  colorFrom: green
5
  colorTo: blue
contamination_report.csv CHANGED
@@ -1,5 +1,30 @@
1
  Evaluation Dataset;Subset;Contaminated Source;Model or corpus;Train Split;Development Split;Test Split;Approach;Reference;PR
2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  conll2003;;GPT-3.5;model;100.0;100.0;100.0;model-based;https://hitz-zentroa.github.io/lm-contamination/blog/;7
4
  nyu-mll/glue;mnli;GPT-3.5;model;100.0;100.0;;model-based;https://hitz-zentroa.github.io/lm-contamination/blog/;7
5
  rajpurkar/squad_v2;;GPT-3.5;model;100.0;100.0;;model-based;https://hitz-zentroa.github.io/lm-contamination/blog/;7
@@ -428,24 +453,35 @@ zest;;EleutherAI/pile;corpus;;;0.0;data-based;https://arxiv.org/abs/2310.20707;2
428
  zest;;togethercomputer/RedPajama-Data-V2;corpus;;;0.0;data-based;https://arxiv.org/abs/2310.20707;2
429
 
430
 
431
- imdb;;GPT-4;model;100.0;;0.0;model-based;https://arxiv.org/pdf/2308.08493;3
432
- imdb;;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/pdf/2308.08493;3
433
 
434
- ag_news;;GPT-4;model;100.0;;100.0;model-based;https://arxiv.org/pdf/2308.08493;3
435
- ag_news;;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/pdf/2308.08493;3
436
 
437
- yelp_review_full;;GPT-4;model;0.0;;0.0;model-based;https://arxiv.org/pdf/2308.08493;3
438
- yelp_review_full;;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/pdf/2308.08493;3
439
 
440
- nyu-mll/glue;rte;GPT-4;model;100.0;;0.0;model-based;https://arxiv.org/pdf/2308.08493;3
441
- nyu-mll/glue;rte;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/pdf/2308.08493;3
442
 
443
- nyu-mll/glue;wnli;GPT-4;model;100.0;;100.0;model-based;https://arxiv.org/pdf/2308.08493;3
444
- nyu-mll/glue;wnli;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/pdf/2308.08493;3
445
 
446
- samsum;;GPT-4;model;0.0;;0.0;model-based;https://arxiv.org/pdf/2308.08493;3
447
- samsum;;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/pdf/2308.08493;3
448
 
449
- EdinburghNLP/xsum;;GPT-4;model;0.0;;100.0;model-based;https://arxiv.org/pdf/2308.08493;3
450
- EdinburghNLP/xsum;;GPT-3.5;model;0.0;;100.0;model-based;https://arxiv.org/pdf/2308.08493;3
451
-
 
 
 
 
 
 
 
 
 
 
 
 
1
  Evaluation Dataset;Subset;Contaminated Source;Model or corpus;Train Split;Development Split;Test Split;Approach;Reference;PR
2
 
3
+ gsm8k;;GPT-4;model;79.00;;;model-based;https://arxiv.org/abs/2311.06233;8
4
+ ucinlp/drop;;GPT-4;model;;44.00;;model-based;https://arxiv.org/abs/2311.06233;8
5
+ openai_humaneval;;GPT-4;model;;;56.71;model-based;https://arxiv.org/abs/2311.06233;8
6
+ imdb;;GPT-4;model;;;82.00;model-based;https://arxiv.org/abs/2311.06233;8
7
+ imdb;;GPT-3.5;model;;;55.00;model-based;https://arxiv.org/abs/2311.06233;8
8
+ ag_news;;GPT-4;model;;;91.00;model-based;https://arxiv.org/abs/2311.06233;8
9
+ ag_news;;GPT-3.5;model;;;82.00;model-based;https://arxiv.org/abs/2311.06233;8
10
+ yelp_review_full;;GPT-4;model;;;80.00;model-based;https://arxiv.org/abs/2311.06233;8
11
+ yelp_review_full;;GPT-3.5;model;;;13.00;model-based;https://arxiv.org/abs/2311.06233;8
12
+ nyu-mll/glue;rte;GPT-4;model;;60.00;;model-based;https://arxiv.org/abs/2311.06233;8
13
+ nyu-mll/glue;rte;GPT-3.5;model;;71.00;;model-based;https://arxiv.org/abs/2311.06233;8
14
+ nyu-mll/glue;wnli;GPT-4;model;;50.70;;model-based;https://arxiv.org/abs/2311.06233;8
15
+ nyu-mll/glue;wnli;GPT-3.5;model;;12.68;;model-based;https://arxiv.org/abs/2311.06233;8
16
+ samsum;;GPT-4;model;;;77.00;model-based;https://arxiv.org/abs/2311.06233;8
17
+ samsum;;GPT-3.5;model;;;74.00;model-based;https://arxiv.org/abs/2311.06233;8
18
+ EdinburghNLP/xsum;;GPT-4;model;;;95.00;model-based;https://arxiv.org/abs/2311.06233;8
19
+ EdinburghNLP/xsum;;GPT-3.5;model;;;79.00;model-based;https://arxiv.org/abs/2311.06233;8
20
+
21
+ allenai/ai2_arc;;CommonCrawl;corpus;;;28.7;data-based;https://arxiv.org/abs/2310.17589;5
22
+ tau/commonsense_qa;;CommonCrawl;corpus;;1.6;;data-based;https://arxiv.org/abs/2310.17589;5
23
+ winogrande;;CommonCrawl;corpus;;1.1;;data-based;https://arxiv.org/abs/2310.17589;5
24
+ ceval/ceval-exam;;CommonCrawl;corpus;;45.8;;data-based;https://arxiv.org/abs/2310.17589;5
25
+ Rowan/hellaswag;;CommonCrawl;corpus;;12.4;;data-based;https://arxiv.org/abs/2310.17589;5
26
+ cais/mmlu;;CommonCrawl;corpus;;;29.1;data-based;https://arxiv.org/abs/2310.17589;5
27
+
28
  conll2003;;GPT-3.5;model;100.0;100.0;100.0;model-based;https://hitz-zentroa.github.io/lm-contamination/blog/;7
29
  nyu-mll/glue;mnli;GPT-3.5;model;100.0;100.0;;model-based;https://hitz-zentroa.github.io/lm-contamination/blog/;7
30
  rajpurkar/squad_v2;;GPT-3.5;model;100.0;100.0;;model-based;https://hitz-zentroa.github.io/lm-contamination/blog/;7
 
453
  zest;;togethercomputer/RedPajama-Data-V2;corpus;;;0.0;data-based;https://arxiv.org/abs/2310.20707;2
454
 
455
 
456
+ imdb;;GPT-4;model;100.0;;0.0;model-based;https://arxiv.org/abs/2308.08493;3
457
+ imdb;;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/abs/2308.08493;3
458
 
459
+ ag_news;;GPT-4;model;100.0;;100.0;model-based;https://arxiv.org/abs/2308.08493;3
460
+ ag_news;;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/abs/2308.08493;3
461
 
462
+ yelp_review_full;;GPT-4;model;0.0;;0.0;model-based;https://arxiv.org/abs/2308.08493;3
463
+ yelp_review_full;;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/abs/2308.08493;3
464
 
465
+ nyu-mll/glue;rte;GPT-4;model;100.0;;0.0;model-based;https://arxiv.org/abs/2308.08493;3
466
+ nyu-mll/glue;rte;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/abs/2308.08493;3
467
 
468
+ nyu-mll/glue;wnli;GPT-4;model;100.0;;100.0;model-based;https://arxiv.org/abs/2308.08493;3
469
+ nyu-mll/glue;wnli;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/abs/2308.08493;3
470
 
471
+ samsum;;GPT-4;model;0.0;;0.0;model-based;https://arxiv.org/abs/2308.08493;3
472
+ samsum;;GPT-3.5;model;0.0;;0.0;model-based;https://arxiv.org/abs/2308.08493;3
473
 
474
+ EdinburghNLP/xsum;;GPT-4;model;0.0;;100.0;model-based;https://arxiv.org/abs/2308.08493;3
475
+ EdinburghNLP/xsum;;GPT-3.5;model;0.0;;100.0;model-based;https://arxiv.org/abs/2308.08493;3
476
+
477
+ bigbio/mednli;;GPT-4;model;0.0;0.0;0.0;model-based;https://arxiv.org/abs/2308.08493;8
478
+ bigbio/mednli;;GPT-3.5;model;0.0;0.0;0.0;model-based;https://arxiv.org/abs/2308.08493;8
479
+
480
+ RadNLI;;GPT-4;model;0.0;0.0;0.0;model-based;https://arxiv.org/abs/2308.08493;8
481
+ RadNLI;;GPT-3.5;model;0.0;0.0;0.0;model-based;https://arxiv.org/abs/2308.08493;8
482
+
483
+
484
+ openai_humaneval;;EleutherAI/pile;corpus;;;12.2;data-based;https://arxiv.org/abs/2403.04811;12
485
+ mbpp;;EleutherAI/pile;corpus;;;3.6;data-based;https://arxiv.org/abs/2403.04811;12
486
+ openai_humaneval;;bigcode/the-stack;corpus;;;18.9;data-based;https://arxiv.org/abs/2403.04811;12
487
+ mbpp;;bigcode/the-stack;corpus;;;20.8;data-based;https://arxiv.org/abs/2403.04811;12