GPT-3.5 Turbo HumanEval Contamination based on "Generalization or Memorization: Data Contamination and Trustworthy Evaluation for Large Language Models"
#16
by
jupyter31
- opened
- contamination_report.csv +2 -0
contamination_report.csv
CHANGED
@@ -3,6 +3,8 @@ Evaluation Dataset;Subset;Contaminated Source;Model or corpus;Train Split;Develo
|
|
3 |
gsm8k;;GPT-4;model;79.00;;;model-based;https://arxiv.org/abs/2311.06233;8
|
4 |
ucinlp/drop;;GPT-4;model;;44.00;;model-based;https://arxiv.org/abs/2311.06233;8
|
5 |
openai_humaneval;;GPT-4;model;;;56.71;model-based;https://arxiv.org/abs/2311.06233;8
|
|
|
|
|
6 |
imdb;;GPT-4;model;;;82.00;model-based;https://arxiv.org/abs/2311.06233;8
|
7 |
imdb;;GPT-3.5;model;;;55.00;model-based;https://arxiv.org/abs/2311.06233;8
|
8 |
ag_news;;GPT-4;model;;;91.00;model-based;https://arxiv.org/abs/2311.06233;8
|
|
|
3 |
gsm8k;;GPT-4;model;79.00;;;model-based;https://arxiv.org/abs/2311.06233;8
|
4 |
ucinlp/drop;;GPT-4;model;;44.00;;model-based;https://arxiv.org/abs/2311.06233;8
|
5 |
openai_humaneval;;GPT-4;model;;;56.71;model-based;https://arxiv.org/abs/2311.06233;8
|
6 |
+
openai_humaneval;;GPT-3.5-turbo/0613;model;;;23.79;model-based;https://arxiv.org/abs/2402.15938;16
|
7 |
+
openai_humaneval;;GPT-3.5-turbo/1106;model;;;41.47;model-based;https://arxiv.org/abs/2402.15938;16
|
8 |
imdb;;GPT-4;model;;;82.00;model-based;https://arxiv.org/abs/2311.06233;8
|
9 |
imdb;;GPT-3.5;model;;;55.00;model-based;https://arxiv.org/abs/2311.06233;8
|
10 |
ag_news;;GPT-4;model;;;91.00;model-based;https://arxiv.org/abs/2311.06233;8
|