Shaltiel committed
Commit
54bd295
•
1 Parent(s): 0939241

Initial about page

Files changed (5)
  1. app.py +1 -13
  2. logos/dicta-logo.jpg +0 -0
  3. logos/mafat-logo.jpg +0 -0
  4. requirements.txt +0 -1
  5. src/about.py +141 -8
app.py CHANGED
@@ -5,8 +5,6 @@ from apscheduler.schedulers.background import BackgroundScheduler
 from huggingface_hub import snapshot_download
 
 from src.about import (
-    CITATION_BUTTON_LABEL,
-    CITATION_BUTTON_TEXT,
     EVALUATION_QUEUE_TEXT,
     INTRODUCTION_TEXT,
     LLM_BENCHMARKS_TEXT,
@@ -334,18 +332,8 @@ with demo:
         submission_result,
     )
 
-    with gr.Row():
-        with gr.Accordion("📙 Citation", open=False):
-            citation_button = gr.Textbox(
-                value=CITATION_BUTTON_TEXT,
-                label=CITATION_BUTTON_LABEL,
-                lines=20,
-                elem_id="citation-button",
-                show_copy_button=True,
-            )
-
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
 # scheduler.add_job(launch_backend, "interval", seconds=100) # will only allow one job to be run at the same time
 scheduler.start()
-demo.queue(default_concurrency_limit=40).launch()
+demo.queue(default_concurrency_limit=40).launch(allowed_paths=["logos/"])
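The `allowed_paths` argument added here is what lets Gradio serve the logo images that the new about page references through `file/logos/...` URLs; by default Gradio does not expose arbitrary local files. A minimal sketch of the same pattern, assuming Gradio 4.x (a hypothetical page, not the Space's actual code):

```python
# Minimal sketch, assuming Gradio 4.x: files referenced as "file/<path>" from HTML or
# Markdown are only served when their directory is whitelisted via allowed_paths.
import gradio as gr

with gr.Blocks() as demo:
    # Hypothetical page embedding a local image, mirroring the about page's <img> tags.
    gr.HTML('<img src="file/logos/dicta-logo.jpg" alt="Dicta Logo">')

demo.queue(default_concurrency_limit=40).launch(allowed_paths=["logos/"])
```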
logos/dicta-logo.jpg ADDED
logos/mafat-logo.jpg ADDED
requirements.txt CHANGED
@@ -13,6 +13,5 @@ requests==2.28.2
 tqdm==4.65.0
 transformers==4.38.2
 tokenizers>=0.15.0
-git+https://github.com/EleutherAI/lm-evaluation-harness.git@b281b0921b636bc36ad05c0b0b0763bd6dd43463#egg=lm-eval
 accelerate==0.24.1
 sentencepiece
src/about.py CHANGED
@@ -27,19 +27,156 @@ TITLE = """<h1 align="center" id="space-title">Hebrew LLM Leaderboard</h1>"""
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
- Intro text
+ <div style="display: flex; justify-content: center;">
+ <div style="max-width: 70vw;">
+
+ Welcome to the Leaderboard for open Hebrew LLMs. The leaderboard ranks the different models according to their success on various tasks in Hebrew.
+
+ The leaderboard was created and is operated by a collaboration between [Mafat / The Israeli National Program for NLP in Hebrew and Arabic](https://nnlp-il.mafat.ai/) and [DICTA: The Israel Center for Text Analysis](https://dicta.org.il/).
+
+ <div dir="rtl" style="text-align: right">
+
+ ברוכים הבאים ללוח התוצאות של מודלי LLM פתוחים בעברית. לוח התוצאות מדרג את המודלים השונים לפי הצלחתם במשימות שונות בעברית.
+
+ לוח התוצאות נוצר ומתופעל על ידי שיתוף פעולה בין [מפא"ת / התוכנית הלאומית הישראלית ל-NLP בעברית ובערבית](https://nnlp-il.mafat.ai/) ו[דיקטה: המרכז הישראלי לניתוח טקסטים](https://dicta.org.il/)
+
+ </div>
+
+ <div style="display: flex; flex-direction: row; justify-content: space-around; align-items: center" dir="ltr">
+ <a href="https://dicta.org.il/">
+ <img src="file/logos/dicta-logo.jpg" alt="Dicta Logo" style="max-height: 65px">
+ </a>
+ <a href="https://nnlp-il.mafat.ai/">
+ <img src="file/logos/mafat-logo.jpg" alt="Mafat Logo" style="max-height: 100px">
+ </a>
+ </div>
+ </div>
+ </div>
 """
 
 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""
 ## How it works
 
- ## Reproducibility
- To reproduce our results, here is the commands you can run:
+ We have curated 4 datasets for benchmarking the quality of LLMs in Hebrew. All of the benchmarks test the base model using a few-shot prompt. Note that the tests specifically evaluate the model's abilities in Hebrew, without regard for its capabilities in other languages.
+
+ 1. QA TLNLS (HeQ)
+
+ - **Source**: We use the test subset of the HeQ dataset, released by Amir Cohen [here](https://aclanthology.org/2023.findings-emnlp.915/). The data can be found [here](https://github.com/NNLP-IL/Hebrew-Question-Answering-Dataset).
+
+ - **Scoring**: We score the results using the `tlnls` scoring method proposed in the paper released with HeQ, which accounts for the linguistic properties of the Hebrew language.
+
+ - **Number of examples**: 1,436 prompts.
+
+ - **Few-Shot Format**: For every context paragraph in the dataset, the few-shot prompt is formatted with the context paragraph, followed by 3 questions and answers about that paragraph, and finally the desired question, left unanswered.
+
+ For example:
+
+ <blockquote dir="rtl" style='text-align: right; background-color: #f0f0f0;'>
+ <p>בשנת 2012, התמודדה לראשונה בפריימריז של מפלגת העבודה לקראת הבחירות לכנסת התשע עשרה והגיעה למקום ה־36 ברשימה הארצית (הבטחת ייצוג לאישה). ב־2015 לקראת הבחירות לכנסת העשרים, התמודדה ורבין בפריימריז של מפלגת העבודה והוצבה במקום ה-22 ברשימת המחנה הציוני לכנסת, אשר שוריין לאישה ונבחרה לכנסת. בשנת הכהונה הראשונה שלה בכנסת, העניק לה המכון הישראלי לדמוקרטיה את אות הפרלמנטר המצטיין לשנת 2016. חברה בוועדת החוץ וביטחון, שם היא חברה בוועדת המשנה לכוח אדם. יזמה וישבה בראש וועדת המשנה לבחינת משק האשראי בישראל. יזמה וחברה בוועדת החקירה הפרלמנטרית לבחינת משק האשראי בישראל, וכן חברה בוועדת הכלכלה, וועדת הכנסת והוועדה המיוחדת לזכויות הילד, ובוועדת המשנה לקידום עסקים קטנים ובינוניים</p>
+
+ שאלה: באיזה פרס זכתה ורבין?
+ תשובה: אות הפרלמנטר המצטיין לשנת 2016
+
+ שאלה: מי מעניק את אות הפרלמנטר המצטיין?
+ תשובה: המכון הישראלי לדמוקרטיה
+
+ שאלה: מתי התקיימו הבחירות לכנסת העשרים?
+ תשובה: ב־2015
+
+ שאלה: לאיזו כנסת נכנסה ורבין לראשונה?
+ תשובה:
+ </blockquote>
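As a concrete illustration of the few-shot format just described, a minimal sketch of how such a prompt could be assembled; the function and field names are assumptions made for this sketch, not the leaderboard's actual code:

```python
# Sketch only: builds "context, then 3 solved Q/A pairs, then the target question" prompts.
def build_heq_prompt(context: str, shots: list[dict], target_question: str) -> str:
    lines = [context, ""]
    for shot in shots[:3]:  # three answered questions on the same paragraph
        lines.append(f"שאלה: {shot['question']}")
        lines.append(f"תשובה: {shot['answer']}")
        lines.append("")
    lines.append(f"שאלה: {target_question}")
    lines.append("תשובה:")  # left blank for the model to complete
    return "\n".join(lines)
```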
+
+ 2. Sentiment Acc (Mafat)
+
+ - **Source**: We use a test subset of an early version of the Hebrew Sentiment dataset, released by Mafat & NNLP-IL [here](https://www.facebook.com/groups/MDLI1/permalink/2681774131986618/). The latest version of the data can be found [here](https://github.com/NNLP-IL/Hebrew-Question-Answering-Dataset) (although it differs from the data we used).
+
+ - **Scoring**: We compute the accuracy score on the predictions, expecting either "חיובי", "שלילי", or "נטרלי".
+
+ - **Number of examples**: 3,000 examples, 1,000 from each category. These examples were selected by a linguist tagger.
+
+ - **Few-Shot Format**: For every prompt, we provide 9 few-shot examples, 3 from each category, randomly shuffled.
+
+ For example:
+
+ <blockquote dir="rtl" style='text-align: right; background-color: #f0f0f0'>
+ <p>
+ משפט: משפט חיובי <br/>
+ תשובה: חיובי
+
+ משפט: משפט שלילי <br/>
+ תשובה: שלילי
+
+ משפט: משפט נטרלי <br/>
+ תשובה: נטרלי
+
+ ...
+
+ משפט: משפט כלשהו <br/>
+ תשובה:
+ </blockquote>
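A rough sketch of the accuracy scoring described above; the normalization of the model's completion is an assumption for illustration, not the leaderboard's exact logic:

```python
# Sketch only: exact-match accuracy of the model's first generated line against the gold
# label, which is one of "חיובי" / "שלילי" / "נטרלי" (positive / negative / neutral).
def sentiment_accuracy(completions: list[str], gold_labels: list[str]) -> float:
    correct = 0
    for completion, gold in zip(completions, gold_labels):
        first_line = completion.strip().splitlines()[0].strip() if completion.strip() else ""
        correct += int(first_line == gold)
    return correct / len(gold_labels)
```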
+
+ 3. Winograd (Binary) Acc
+
+ - **Source**: We use `A Translation of the Winograd Schema Challenge to Hebrew`, translated by Prof. Vered Shwartz. The data can be found [here](https://www.cs.ubc.ca/~vshwartz/resources/winograd_he.jsonl).
+
+ - **Scoring**: We provide the two possible answers in the prompt, and compute the accuracy score.
+
+ - **Number of examples**: 278 examples.
+
+ - **Few-Shot Format**: For every prompt, we provide 5 few-shot examples, followed by the question at hand. Each example is formatted with the input sentence and its question, the possible answers, and the expected answer.
+
+ For example:
+
+ <blockquote dir="rtl" style='text-align: right; background-color: #f0f0f0'>
+ <p>
+ שאלה: השוטרים עצרו את חברי הכנופיה. הם ניהלו ארגון של סחר בסמים. מי ניהלו? <br/>
+ אפשרויות: "חברי הכנופיה" או "השוטרים"<br/>
+ תשובה: חברי הכנופיה
+
+ ...
+
+ שאלה: השועלים היו מגיעים בלילות לתקוף את התרנגולים, אז הייתי צריך לשמור עליהם. על מי הייתי צריך לשמור?<br/>
+ אפשרויות: "התרנגולים" או "השועלים"<br/>
+ תשובה:
+ </blockquote>
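A minimal sketch of the few-shot format described above; the field names are illustrative assumptions rather than the actual data schema:

```python
# Sketch only: each example shows the sentence with its question, the two candidate
# answers, and the expected answer; the final (target) example leaves the answer blank.
def format_winograd_example(example: dict, with_answer: bool = True) -> str:
    lines = [
        f"שאלה: {example['question']}",
        f'אפשרויות: "{example["option_a"]}" או "{example["option_b"]}"',
        "תשובה: " + (example["answer"] if with_answer else ""),
    ]
    return "\n".join(lines).rstrip()

def build_winograd_prompt(shots: list[dict], target: dict) -> str:
    parts = [format_winograd_example(shot) for shot in shots[:5]]
    parts.append(format_winograd_example(target, with_answer=False))
    return "\n\n".join(parts)
```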
+
+ 4. Translation BLEU
+
+ - **Source**: We use the aligned translation corpus `NeuLab-TedTalks`, which can be found [here](https://opus.nlpl.eu/NeuLab-TedTalks/en&he/v1/NeuLab-TedTalks).
+
+ - **Scoring**: We use the `sacrebleu.sentence_bleu` scoring function.
+
+ - **Number of examples**: We took a random sample of 1,000 examples, each 30-40 words in length, from the aligned corpus, and we compute the mean score for translating those examples from English to Hebrew and from Hebrew to English (a total of 2,000 examples).
+
+ - **Few-Shot Format**: For every prompt, we provide 3 few-shot examples of an English sentence and its Hebrew equivalent. The order depends on the direction in which we are translating.
+
+ For example:
+
+ <blockquote style="background-color: #f0f0f0;">
+ <p>
+ English: Some sentence in English<br/>
+ Hebrew: משפט בעברית.
+
+ ...
+
+ English: Some sentence to translate to Hebrew <br/>
+ Hebrew:
+ </blockquote>
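A small sketch of the scoring step described above, averaging sacrebleu's sentence-level BLEU over a set of translations; the variable names are illustrative:

```python
# Sketch only: mean sentence-level BLEU over (hypothesis, reference) pairs.
import sacrebleu

def mean_sentence_bleu(hypotheses: list[str], references: list[str]) -> float:
    scores = [
        sacrebleu.sentence_bleu(hyp, [ref]).score  # returns a BLEUScore; .score is 0-100
        for hyp, ref in zip(hypotheses, references)
    ]
    return sum(scores) / len(scores)
```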
 
 """
 
 EVALUATION_QUEUE_TEXT = """
+ ## Important Note
+
+ Due to budget restrictions, we have a cap on the number of models that can be tested per month. Please only send your model when you are ready for testing. We also have limits on the number of models that can be sent per user.
+
 ## Some good practices before submitting a model
 
 ### 1) Make sure you can load your model and tokenizer using AutoClasses:
@@ -66,9 +203,5 @@ When we add extra information about models to the leaderboard, it will be automa
 ## In case of model failure
 If your model is displayed in the `FAILED` category, its execution stopped.
 Make sure you have followed the above steps first.
- If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
- """
-
- CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
- CITATION_BUTTON_TEXT = r"""
+ If everything is done and the model still won't run, please reach out to `shaltiel at dicta dot org dot il` with the details.
 """