OSainz committed on
Commit
582a8ca
•
1 Parent(s): c50904f

Add PR number + Postprocessing

Browse files
Files changed (2) hide show
  1. contamination_report.csv +7 -7
  2. postprocessing.py +4 -0
contamination_report.csv CHANGED
@@ -163,16 +163,12 @@ gigaword;;togethercomputer/RedPajama-Data-V2;;corpus;;;2.82;data-based;https://a
163
 
164
  gsm8k;;BAAI/Aquila2-34B;;model;;;100.0;model-based;https://huggingface.co/BAAI/Aquila2-34B/blob/main/README.md;21
165
  gsm8k;;BAAI/AquilaChat2-34B;;model;;;100.0;model-based;https://huggingface.co/BAAI/AquilaChat2-34B/blob/main/README.md;21
166
- gsm8k;;EleutherAI/llemma_7b;;model;;;0.15;data-based;https://openreview.net/pdf?id=4WnqRR915j;
167
- gsm8k;;EleutherAI/llemma_34b;;model;;;0.15;data-based;https://openreview.net/pdf?id=4WnqRR915j;
168
- gsm8k;;EleutherAI/proof-pile-2;;corpus;;;0.15;data-based;https://openreview.net/pdf?id=4WnqRR915j;
169
  gsm8k;;GPT-4;;model;100.0;;1.0;data-based;https://arxiv.org/abs/2303.08774;11
170
  gsm8k;;GPT-4;;model;79.00;;;model-based;https://arxiv.org/abs/2311.06233;8
171
 
172
- hendrycks/competition_math;;EleutherAI/llemma_7b;;model;;;7.72;data-based;https://openreview.net/pdf?id=4WnqRR915j;
173
- hendrycks/competition_math;;EleutherAI/llemma_34b;;model;;;7.72;data-based;https://openreview.net/pdf?id=4WnqRR915j;
174
- hendrycks/competition_math;;EleutherAI/proof-pile-2;;corpus;;;7.72;data-based;https://openreview.net/pdf?id=4WnqRR915j;
175
-
176
  head_qa;en;EleutherAI/pile;;corpus;;;5.11;data-based;https://arxiv.org/abs/2310.20707;2
177
  head_qa;en;allenai/c4;;corpus;;;5.22;data-based;https://arxiv.org/abs/2310.20707;2
178
  head_qa;en;oscar-corpus/OSCAR-2301;;corpus;;;5.29;data-based;https://arxiv.org/abs/2310.20707;2
@@ -183,6 +179,10 @@ health_fact;;allenai/c4;;corpus;;;7.53;data-based;https://arxiv.org/abs/2310.207
183
  health_fact;;oscar-corpus/OSCAR-2301;;corpus;;;3.4;data-based;https://arxiv.org/abs/2310.20707;2
184
  health_fact;;togethercomputer/RedPajama-Data-V2;;corpus;;;18.7;data-based;https://arxiv.org/abs/2310.20707;2
185
 
 
 
 
 
186
  hlgd;;EleutherAI/pile;;corpus;;;0.0;data-based;https://arxiv.org/abs/2310.20707;2
187
  hlgd;;allenai/c4;;corpus;;;0.0;data-based;https://arxiv.org/abs/2310.20707;2
188
  hlgd;;oscar-corpus/OSCAR-2301;;corpus;;;0.0;data-based;https://arxiv.org/abs/2310.20707;2
 
163
 
164
  gsm8k;;BAAI/Aquila2-34B;;model;;;100.0;model-based;https://huggingface.co/BAAI/Aquila2-34B/blob/main/README.md;21
165
  gsm8k;;BAAI/AquilaChat2-34B;;model;;;100.0;model-based;https://huggingface.co/BAAI/AquilaChat2-34B/blob/main/README.md;21
166
+ gsm8k;;EleutherAI/llemma_34b;;model;;;0.15;data-based;https://openreview.net/forum?id=4WnqRR915j;23
167
+ gsm8k;;EleutherAI/llemma_7b;;model;;;0.15;data-based;https://openreview.net/forum?id=4WnqRR915j;23
168
+ gsm8k;;EleutherAI/proof-pile-2;;corpus;;;0.15;data-based;https://openreview.net/forum?id=4WnqRR915j;23
169
  gsm8k;;GPT-4;;model;100.0;;1.0;data-based;https://arxiv.org/abs/2303.08774;11
170
  gsm8k;;GPT-4;;model;79.00;;;model-based;https://arxiv.org/abs/2311.06233;8
171
 
 
 
 
 
172
  head_qa;en;EleutherAI/pile;;corpus;;;5.11;data-based;https://arxiv.org/abs/2310.20707;2
173
  head_qa;en;allenai/c4;;corpus;;;5.22;data-based;https://arxiv.org/abs/2310.20707;2
174
  head_qa;en;oscar-corpus/OSCAR-2301;;corpus;;;5.29;data-based;https://arxiv.org/abs/2310.20707;2
 
179
  health_fact;;oscar-corpus/OSCAR-2301;;corpus;;;3.4;data-based;https://arxiv.org/abs/2310.20707;2
180
  health_fact;;togethercomputer/RedPajama-Data-V2;;corpus;;;18.7;data-based;https://arxiv.org/abs/2310.20707;2
181
 
182
+ hendrycks/competition_math;;EleutherAI/llemma_34b;;model;;;7.72;data-based;https://openreview.net/forum?id=4WnqRR915j;23
183
+ hendrycks/competition_math;;EleutherAI/llemma_7b;;model;;;7.72;data-based;https://openreview.net/forum?id=4WnqRR915j;23
184
+ hendrycks/competition_math;;EleutherAI/proof-pile-2;;corpus;;;7.72;data-based;https://openreview.net/forum?id=4WnqRR915j;23
185
+
186
  hlgd;;EleutherAI/pile;;corpus;;;0.0;data-based;https://arxiv.org/abs/2310.20707;2
187
  hlgd;;allenai/c4;;corpus;;;0.0;data-based;https://arxiv.org/abs/2310.20707;2
188
  hlgd;;oscar-corpus/OSCAR-2301;;corpus;;;0.0;data-based;https://arxiv.org/abs/2310.20707;2
postprocessing.py CHANGED
@@ -17,6 +17,9 @@ def remove_duplicates(data):
17
  def fix_arxiv_links(data):
18
  return [[*item[:-2], item[-2].replace("arxiv.org/pdf", "arxiv.org/abs"), item[-1]] for item in data]
19
 
 
 
 
20
  def sort_data(data):
21
  return sorted(data, key=lambda x: (x[0], x[1], x[2], x[3], x[-1]))
22
 
@@ -25,6 +28,7 @@ def main():
25
  data = sort_data(data)
26
  data = remove_duplicates(data)
27
  data = fix_arxiv_links(data)
 
28
  print("Total datapoints:", len(data))
29
 
30
  with open("contamination_report.csv", 'w') as f:
 
17
  def fix_arxiv_links(data):
18
  return [[*item[:-2], item[-2].replace("arxiv.org/pdf", "arxiv.org/abs"), item[-1]] for item in data]
19
 
20
+ def fix_openreview_links(data):
21
+ return [[*item[:-2], item[-2].replace("openreview.net/pdf", "openreview.net/forum"), item[-1]] for item in data]
22
+
23
  def sort_data(data):
24
  return sorted(data, key=lambda x: (x[0], x[1], x[2], x[3], x[-1]))
25
 
 
28
  data = sort_data(data)
29
  data = remove_duplicates(data)
30
  data = fix_arxiv_links(data)
31
+ data = fix_openreview_links(data)
32
  print("Total datapoints:", len(data))
33
 
34
  with open("contamination_report.csv", 'w') as f: