mihaimasala committed
Commit a64c91f • Parent(s): 8d5ffa9

Update README.md

README.md CHANGED
@@ -4,233 +4,450 @@ language:

(Old lines 7–233, the previous contents of the `model-index:` block, were removed; this view does not show the removed lines. The new front matter, lines 4–451, follows.)

language:
- ro
base_model: meta-llama/Llama-2-7b-hf
model-index:
- name: OpenLLM-Ro/RoLlama2-7b-Base
  results:
  - task:
      type: text-generation
    dataset:
      name: Romanian_Academic_Benchmarks
      type: Romanian_Academic_Benchmarks
    metrics:
    - name: Average accuracy
      type: accuracy
      value: 38.03
  - task:
      type: text-generation
    dataset:
      name: OpenLLM-Ro/ro_arc_challenge
      type: OpenLLM-Ro/ro_arc_challenge
    metrics:
    - name: Average accuracy
      type: accuracy
      value: 37.95
  - task:
      type: text-generation
    dataset:
      name: OpenLLM-Ro/ro_mmlu
      type: OpenLLM-Ro/ro_mmlu
    metrics:
    - name: Average accuracy
      type: accuracy
      value: 27.22
  - task:
      type: text-generation
    dataset:
      name: OpenLLM-Ro/ro_winogrande
      type: OpenLLM-Ro/ro_winogrande
    metrics:
    - name: Average accuracy
      type: accuracy
      value: 59.29
  - task:
      type: text-generation
    dataset:
      name: OpenLLM-Ro/ro_hellaswag
      type: OpenLLM-Ro/ro_hellaswag
    metrics:
    - name: Average accuracy
      type: accuracy
      value: 57.22
  - task:
      type: text-generation
    dataset:
      name: OpenLLM-Ro/ro_gsm8k
      type: OpenLLM-Ro/ro_gsm8k
    metrics:
    - name: Average accuracy
      type: accuracy
      value: 2.53
  - task:
      type: text-generation
    dataset:
      name: OpenLLM-Ro/ro_truthfulqa
      type: OpenLLM-Ro/ro_truthfulqa
    metrics:
    - name: Average accuracy
      type: accuracy
      value: 44.00
  - task:
      type: text-generation
    dataset:
      name: LaRoSeDa_binary
      type: LaRoSeDa_binary
    metrics:
    - name: Average macro-f1
      type: macro-f1
      value: 83.25
  - task:
      type: text-generation
    dataset:
      name: LaRoSeDa_multiclass
      type: LaRoSeDa_multiclass
    metrics:
    - name: Average macro-f1
      type: macro-f1
      value: 61.04
  - task:
      type: text-generation
    dataset:
      name: LaRoSeDa_binary_finetuned
      type: LaRoSeDa_binary_finetuned
    metrics:
    - name: Average macro-f1
      type: macro-f1
      value: 98.97
  - task:
      type: text-generation
    dataset:
      name: LaRoSeDa_multiclass_finetuned
      type: LaRoSeDa_multiclass_finetuned
    metrics:
    - name: Average macro-f1
      type: macro-f1
      value: 87.72
  - task:
      type: text-generation
    dataset:
      name: WMT_EN-RO
      type: WMT_EN-RO
    metrics:
    - name: Average bleu
      type: bleu
      value: 10.01
  - task:
      type: text-generation
    dataset:
      name: WMT_RO-EN
      type: WMT_RO-EN
    metrics:
    - name: Average bleu
      type: bleu
      value: 13.03
  - task:
      type: text-generation
    dataset:
      name: WMT_EN-RO_finetuned
      type: WMT_EN-RO_finetuned
    metrics:
    - name: Average bleu
      type: bleu
      value: 27.85
  - task:
      type: text-generation
    dataset:
      name: WMT_RO-EN_finetuned
      type: WMT_RO-EN_finetuned
    metrics:
    - name: Average bleu
      type: bleu
      value: 39.30
  - task:
      type: text-generation
    dataset:
      name: XQuAD
      type: XQuAD
    metrics:
    - name: Average exact_match
      type: exact_match
      value: 30.15
  - task:
      type: text-generation
    dataset:
      name: XQuAD
      type: XQuAD
    metrics:
    - name: Average f1
      type: f1
      value: 47.03
  - task:
      type: text-generation
    dataset:
      name: XQuAD_finetuned
      type: XQuAD_finetuned
    metrics:
    - name: Average exact_match
      type: exact_match
      value: 67.06
  - task:
      type: text-generation
    dataset:
      name: XQuAD_finetuned
      type: XQuAD_finetuned
    metrics:
    - name: Average f1
      type: f1
      value: 79.96
  - task:
      type: text-generation
    dataset:
      name: STS
      type: STS
    metrics:
    - name: Average spearman
      type: spearman
      value: 7.89
  - task:
      type: text-generation
    dataset:
      name: STS
      type: STS
    metrics:
    - name: Average pearson
      type: pearson
      value: 7.98
  - task:
      type: text-generation
    dataset:
      name: STS_finetuned
      type: STS_finetuned
    metrics:
    - name: Average spearman
      type: spearman
      value: 71.75
  - task:
      type: text-generation
    dataset:
      name: STS_finetuned
      type: STS_finetuned
    metrics:
    - name: Average pearson
      type: pearson
      value: 71.99
  - task:
      type: text-generation
    dataset:
      name: OpenLLM-Ro/ro_arc_challenge
      type: OpenLLM-Ro/ro_arc_challenge
    metrics:
    - name: 0-shot
      type: accuracy
      value: 35.56
    - name: 1-shot
      type: accuracy
      value: 36.42
    - name: 3-shot
      type: accuracy
      value: 38.56
    - name: 5-shot
      type: accuracy
      value: 38.39
    - name: 10-shot
      type: accuracy
      value: 39.07
    - name: 25-shot
      type: accuracy
      value: 39.67
  - task:
      type: text-generation
    dataset:
      name: OpenLLM-Ro/ro_mmlu
      type: OpenLLM-Ro/ro_mmlu
    metrics:
    - name: 0-shot
      type: accuracy
      value: 25.82
    - name: 1-shot
      type: accuracy
      value: 25.48
    - name: 3-shot
      type: accuracy
      value: 27.61
    - name: 5-shot
      type: accuracy
      value: 29.96
  - task:
      type: text-generation
    dataset:
      name: OpenLLM-Ro/ro_winogrande
      type: OpenLLM-Ro/ro_winogrande
    metrics:
    - name: 0-shot
      type: accuracy
      value: 58.72
    - name: 1-shot
      type: accuracy
      value: 58.88
    - name: 3-shot
      type: accuracy
      value: 60.38
    - name: 5-shot
      type: accuracy
      value: 59.19
  - task:
      type: text-generation
    dataset:
      name: OpenLLM-Ro/ro_hellaswag
      type: OpenLLM-Ro/ro_hellaswag
    metrics:
    - name: 0-shot
      type: accuracy
      value: 55.85
    - name: 1-shot
      type: accuracy
      value: 57.06
    - name: 3-shot
      type: accuracy
      value: 57.52
    - name: 5-shot
      type: accuracy
      value: 57.89
    - name: 10-shot
      type: accuracy
      value: 57.79
  - task:
      type: text-generation
    dataset:
      name: OpenLLM-Ro/ro_gsm8k
      type: OpenLLM-Ro/ro_gsm8k
    metrics:
    - name: 0-shot
      type: accuracy
      value: 0.00
    - name: 1-shot
      type: accuracy
      value: 2.96
    - name: 3-shot
      type: accuracy
      value: 4.62
  - task:
      type: text-generation
    dataset:
      name: LaRoSeDa_binary
      type: LaRoSeDa_binary
    metrics:
    - name: 0-shot
      type: macro-f1
      value: 42.78
    - name: 1-shot
      type: macro-f1
      value: 98.00
    - name: 3-shot
      type: macro-f1
      value: 95.13
    - name: 5-shot
      type: macro-f1
      value: 97.07
  - task:
      type: text-generation
    dataset:
      name: LaRoSeDa_multiclass
      type: LaRoSeDa_multiclass
    metrics:
    - name: 0-shot
      type: macro-f1
      value: 46.41
    - name: 1-shot
      type: macro-f1
      value: 67.36
    - name: 3-shot
      type: macro-f1
      value: 65.16
    - name: 5-shot
      type: macro-f1
      value: 65.23
  - task:
      type: text-generation
    dataset:
      name: WMT_EN-RO
      type: WMT_EN-RO
    metrics:
    - name: 0-shot
      type: bleu
      value: 4.45
    - name: 1-shot
      type: bleu
      value: 8.61
    - name: 3-shot
      type: bleu
      value: 12.25
    - name: 5-shot
      type: bleu
      value: 14.73
  - task:
      type: text-generation
    dataset:
      name: WMT_RO-EN
      type: WMT_RO-EN
    metrics:
    - name: 0-shot
      type: bleu
      value: 1.29
    - name: 1-shot
      type: bleu
      value: 10.78
    - name: 3-shot
      type: bleu
      value: 16.82
    - name: 5-shot
      type: bleu
      value: 23.24
  - task:
      type: text-generation
    dataset:
      name: XQuAD_EM
      type: XQuAD_EM
    metrics:
    - name: 0-shot
      type: exact_match
      value: 5.29
    - name: 1-shot
      type: exact_match
      value: 33.95
    - name: 3-shot
      type: exact_match
      value: 39.24
    - name: 5-shot
      type: exact_match
      value: 42.10
  - task:
      type: text-generation
    dataset:
      name: XQuAD_F1
      type: XQuAD_F1
    metrics:
    - name: 0-shot
      type: f1
      value: 16.17
    - name: 1-shot
      type: f1
      value: 51.84
    - name: 3-shot
      type: f1
      value: 58.82
    - name: 5-shot
      type: f1
      value: 61.29
  - task:
      type: text-generation
    dataset:
      name: STS
      type: STS
    metrics:
    - name: 0-shot
      type: spearman
      value: -1.74
    - name: 1-shot
      type: spearman
      value: 15.47
    - name: 3-shot
      type: spearman
      value: 9.93
  - task:
      type: text-generation
    dataset:
      name: STS
      type: STS
    metrics:
    - name: 0-shot
      type: pearson
      value: -1.40
    - name: 1-shot
      type: pearson
      value: 15.00
    - name: 3-shot
      type: pearson
      value: 10.33
---
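
With the scores declared in `model-index`, the card's results can be read programmatically. A minimal sketch, assuming the `huggingface_hub` package (the repo id comes from the `name` field above):

```python
# Load this model card from the Hub and print the metrics declared in its
# model-index block. Assumes: pip install huggingface_hub
from huggingface_hub import ModelCard

card = ModelCard.load("OpenLLM-Ro/RoLlama2-7b-Base")
for entry in card.data.to_dict().get("model-index", []):
    for result in entry.get("results", []):
        dataset = result["dataset"]["name"]
        for metric in result["metrics"]:
            print(f"{dataset}: {metric['name']} ({metric['type']}) = {metric['value']}")
```
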
# Model Card for Model ID
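
The second hunk below carries `print(tokenizer.decode(outputs[0]))` as context, so the card already contains a usage snippet that this view does not reproduce. A minimal sketch of the standard `transformers` loading pattern for this checkpoint; the prompt and `max_new_tokens` are illustrative assumptions, not the card's exact example:

```python
# Minimal usage sketch (assumed, not the card's verbatim snippet).
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("OpenLLM-Ro/RoLlama2-7b-Base")
model = AutoModelForCausalLM.from_pretrained("OpenLLM-Ro/RoLlama2-7b-Base")

# RoLlama2-7b-Base is a base model, so it continues text rather than
# following instructions; the Romanian prompt here is only an example.
inputs = tokenizer("Cel mai inalt munte din Romania este", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0]))
```
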
@@ -295,19 +512,77 @@ print(tokenizer.decode(outputs[0]))

(Old lines 298–310, the previous benchmark results, were removed; this view does not show the removed lines. The new tables, lines 515–585, follow.)

## Academic Benchmarks

<table>
  <tbody>
    <tr>
      <td><strong>Model</strong></td>
      <td><strong><center>Average</center></strong></td>
      <td><strong><center>ARC</center></strong></td>
      <td><strong><center>MMLU</center></strong></td>
      <td><strong><center>Winogrande</center></strong></td>
      <td><strong><center>Hellaswag</center></strong></td>
      <td><strong><center>GSM8k</center></strong></td>
      <td><strong><center>TruthfulQA</center></strong></td>
    </tr>
    <tr>
      <td>Llama-2-7b-hf</td><td><center>37.04</center></td><td><center>36.05</center></td><td><center><strong>33.66</strong></center></td><td><center>57.56</center></td><td><center>48.00</center></td><td><center><strong>4.75</strong></center></td><td><center>42.22</center></td>
    </tr>
    <tr>
      <td><em>RoLlama2-7b-Base</em></td><td><center><em><strong>38.03</strong></em></center></td><td><center><em><strong>37.95</strong></em></center></td><td><center><em>27.22</em></center></td><td><center><em><strong>59.29</strong></em></center></td><td><center><em><strong>57.22</strong></em></center></td><td><center><em>2.53</em></center></td><td><center><em><strong>44.00</strong></em></center></td>
    </tr>
  </tbody>
</table>
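
The Average column is consistent with a plain arithmetic mean of the six benchmark scores in each row; a quick check with the values copied from the table:

```python
# Recompute the "Average" column as the mean of the six per-benchmark scores.
rows = {
    "Llama-2-7b-hf":    [36.05, 33.66, 57.56, 48.00, 4.75, 42.22],
    "RoLlama2-7b-Base": [37.95, 27.22, 59.29, 57.22, 2.53, 44.00],
}
for model, scores in rows.items():
    print(f"{model}: {sum(scores) / len(scores):.3f}")
# Prints 37.040 and 38.035, which the table reports rounded to 37.04 and 38.03.
```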

## Downstream Tasks

<table>
  <tbody>
    <tr>
      <td></td>
      <td colspan="4"><center><strong>LaRoSeDa</strong></center></td>
      <td colspan="4"><center><strong>WMT</strong></center></td>
      <td colspan="4"><center><strong>XQuAD</strong></center></td>
      <td colspan="4"><center><strong>STS</strong></center></td>
    </tr>
    <tr>
      <td></td>
      <td colspan="2"><center><strong>Few-shot</strong></center></td>
      <td colspan="2"><center><strong>Finetuned</strong></center></td>
      <td colspan="2"><center><strong>Few-shot</strong></center></td>
      <td colspan="2"><center><strong>Finetuned</strong></center></td>
      <td colspan="2"><center><strong>Few-shot</strong></center></td>
      <td colspan="2"><center><strong>Finetuned</strong></center></td>
      <td colspan="2"><center><strong>Few-shot</strong></center></td>
      <td colspan="2"><center><strong>Finetuned</strong></center></td>
    </tr>
    <tr>
      <td></td>
      <td><center><strong>Binary<br>(Macro F1)</strong></center></td>
      <td><center><strong>Multiclass<br>(Macro F1)</strong></center></td>
      <td><center><strong>Binary<br>(Macro F1)</strong></center></td>
      <td><center><strong>Multiclass<br>(Macro F1)</strong></center></td>
      <td><center><strong>EN-RO<br>(Bleu)</strong></center></td>
      <td><center><strong>RO-EN<br>(Bleu)</strong></center></td>
      <td><center><strong>EN-RO<br>(Bleu)</strong></center></td>
      <td><center><strong>RO-EN<br>(Bleu)</strong></center></td>
      <td><center><strong>-<br>(EM)</strong></center></td>
      <td><center><strong>-<br>(F1)</strong></center></td>
      <td><center><strong>-<br>(EM)</strong></center></td>
      <td><center><strong>-<br>(F1)</strong></center></td>
      <td><center><strong>-<br>(Spearman)</strong></center></td>
      <td><center><strong>-<br>(Pearson)</strong></center></td>
      <td><center><strong>-<br>(Spearman)</strong></center></td>
      <td><center><strong>-<br>(Pearson)</strong></center></td>
    </tr>
    <tr>
      <td>Llama-2-7b-hf</td><td><center><strong>93.19</strong></center></td><td><center>54.11</center></td><td><center>98.43</center></td><td><center>87.22</center></td><td><center><strong>14.90</strong></center></td><td><center><strong>26.61</strong></center></td><td><center>24.95</center></td><td><center>39.09</center></td><td><center><strong>38.91</strong></center></td><td><center><strong>56.82</strong></center></td><td><center>65.46</center></td><td><center>79.42</center></td><td><center><strong>9.08</strong></center></td><td><center><strong>9.07</strong></center></td><td><center><strong>79.93</strong></center></td><td><center><strong>81.08</strong></center></td>
    </tr>
    <tr>
      <td><em>RoLlama2-7b-Base</em></td><td><center><em>83.25</em></center></td><td><center><em><strong>61.04</strong></em></center></td><td><center><em><strong>98.97</strong></em></center></td><td><center><em><strong>87.72</strong></em></center></td><td><center><em>10.01</em></center></td><td><center><em>13.03</em></center></td><td><center><em><strong>27.85</strong></em></center></td><td><center><em><strong>39.30</strong></em></center></td><td><center><em>30.15</em></center></td><td><center><em>47.03</em></center></td><td><center><em><strong>67.06</strong></em></center></td><td><center><em><strong>79.96</strong></em></center></td><td><center><em>7.89</em></center></td><td><center><em>7.98</em></center></td><td><center><em>71.75</em></center></td><td><center><em>71.99</em></center></td>
    </tr>
  </tbody>
</table>
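
Each Few-shot cell appears to be the mean of the per-shot scores recorded in the `model-index` block above; for example, the few-shot EN-RO Bleu score:

```python
# The few-shot WMT EN-RO cell equals the mean of the 0/1/3/5-shot Bleu scores
# listed in the model-index metadata.
shot_bleu = {0: 4.45, 1: 8.61, 3: 12.25, 5: 14.73}
print(f"{sum(shot_bleu.values()) / len(shot_bleu):.2f}")  # 10.01, as in the table
```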

## RoLlama2 Model Family