Files changed (1) hide show
  1. README.md +2 -1796
README.md CHANGED
@@ -169,1601 +169,7 @@ widget:
169
  A: Let's think step by step.
170
  example_title: Mathematical reasoning
171
  group: English
172
- model-index:
173
- - name: bloom
174
- results:
175
- - task:
176
- type: text-generation
177
- name: text generation
178
- dataset:
179
- name: arc_challenge
180
- type: arc_challenge
181
- metrics:
182
- - name: acc
183
- type: acc
184
- value: 0.4112627986348123
185
- verified: false
186
- - task:
187
- type: text-generation
188
- name: text generation
189
- dataset:
190
- name: arc_easy
191
- type: arc_easy
192
- metrics:
193
- - name: acc
194
- type: acc
195
- value: 0.726010101010101
196
- verified: false
197
- - task:
198
- type: text-generation
199
- name: text generation
200
- dataset:
201
- name: axb
202
- type: axb
203
- metrics:
204
- - name: acc
205
- type: acc
206
- value: 0.5751811594202898
207
- verified: false
208
- - task:
209
- type: text-generation
210
- name: text generation
211
- dataset:
212
- name: axg
213
- type: axg
214
- metrics:
215
- - name: acc
216
- type: acc
217
- value: 0.5252808988764045
218
- verified: false
219
- - task:
220
- type: text-generation
221
- name: text generation
222
- dataset:
223
- name: boolq
224
- type: boolq
225
- metrics:
226
- - name: acc
227
- type: acc
228
- value: 0.6345565749235474
229
- verified: false
230
- - task:
231
- type: text-generation
232
- name: text generation
233
- dataset:
234
- name: cb
235
- type: cb
236
- metrics:
237
- - name: acc
238
- type: acc
239
- value: 0.3392857142857143
240
- verified: false
241
- - task:
242
- type: text-generation
243
- name: text generation
244
- dataset:
245
- name: cola
246
- type: cola
247
- metrics:
248
- - name: acc
249
- type: acc
250
- value: 0.39022051773729627
251
- verified: false
252
- - task:
253
- type: text-generation
254
- name: text generation
255
- dataset:
256
- name: copa
257
- type: copa
258
- metrics:
259
- - name: acc
260
- type: acc
261
- value: 0.56
262
- verified: false
263
- - task:
264
- type: text-generation
265
- name: text generation
266
- dataset:
267
- name: crows_pairs_english
268
- type: crows_pairs_english
269
- metrics:
270
- - name: acc
271
- type: acc
272
- value: 0.5
273
- verified: false
274
- - task:
275
- type: text-generation
276
- name: text generation
277
- dataset:
278
- name: crows_pairs_french
279
- type: crows_pairs_french
280
- metrics:
281
- - name: acc
282
- type: acc
283
- value: 0.505664877757901
284
- verified: false
285
- - task:
286
- type: text-generation
287
- name: text generation
288
- dataset:
289
- name: diabla
290
- type: diabla
291
- metrics:
292
- - name: acc
293
- type: acc
294
- value: 0.2947981906750174
295
- verified: false
296
- - task:
297
- type: text-generation
298
- name: text generation
299
- dataset:
300
- name: gsarti/flores_101_afr
301
- type: gsarti/flores_101_afr
302
- metrics:
303
- - name: byte_perplexity
304
- type: byte_perplexity
305
- value: 4.25431550058444
306
- verified: false
307
- - task:
308
- type: text-generation
309
- name: text generation
310
- dataset:
311
- name: gsarti/flores_101_amh
312
- type: gsarti/flores_101_amh
313
- metrics:
314
- - name: byte_perplexity
315
- type: byte_perplexity
316
- value: 3.716877477347089
317
- verified: false
318
- - task:
319
- type: text-generation
320
- name: text generation
321
- dataset:
322
- name: gsarti/flores_101_ara
323
- type: gsarti/flores_101_ara
324
- metrics:
325
- - name: byte_perplexity
326
- type: byte_perplexity
327
- value: 1.7049030137120964
328
- verified: false
329
- - task:
330
- type: text-generation
331
- name: text generation
332
- dataset:
333
- name: gsarti/flores_101_asm
334
- type: gsarti/flores_101_asm
335
- metrics:
336
- - name: byte_perplexity
337
- type: byte_perplexity
338
- value: 6.576581380404954
339
- verified: false
340
- - task:
341
- type: text-generation
342
- name: text generation
343
- dataset:
344
- name: gsarti/flores_101_ast
345
- type: gsarti/flores_101_ast
346
- metrics:
347
- - name: byte_perplexity
348
- type: byte_perplexity
349
- value: 2.8562364775797944
350
- verified: false
351
- - task:
352
- type: text-generation
353
- name: text generation
354
- dataset:
355
- name: gsarti/flores_101_azj
356
- type: gsarti/flores_101_azj
357
- metrics:
358
- - name: byte_perplexity
359
- type: byte_perplexity
360
- value: 4.80721528624391
361
- verified: false
362
- - task:
363
- type: text-generation
364
- name: text generation
365
- dataset:
366
- name: gsarti/flores_101_bel
367
- type: gsarti/flores_101_bel
368
- metrics:
369
- - name: byte_perplexity
370
- type: byte_perplexity
371
- value: 2.7312177406635065
372
- verified: false
373
- - task:
374
- type: text-generation
375
- name: text generation
376
- dataset:
377
- name: gsarti/flores_101_ben
378
- type: gsarti/flores_101_ben
379
- metrics:
380
- - name: byte_perplexity
381
- type: byte_perplexity
382
- value: 5.993409478990023
383
- verified: false
384
- - task:
385
- type: text-generation
386
- name: text generation
387
- dataset:
388
- name: gsarti/flores_101_bos
389
- type: gsarti/flores_101_bos
390
- metrics:
391
- - name: byte_perplexity
392
- type: byte_perplexity
393
- value: 3.5936169095529493
394
- verified: false
395
- - task:
396
- type: text-generation
397
- name: text generation
398
- dataset:
399
- name: gsarti/flores_101_bul
400
- type: gsarti/flores_101_bul
401
- metrics:
402
- - name: byte_perplexity
403
- type: byte_perplexity
404
- value: 2.159035321398085
405
- verified: false
406
- - task:
407
- type: text-generation
408
- name: text generation
409
- dataset:
410
- name: gsarti/flores_101_cat
411
- type: gsarti/flores_101_cat
412
- metrics:
413
- - name: byte_perplexity
414
- type: byte_perplexity
415
- value: 2.167873680006659
416
- verified: false
417
- - task:
418
- type: text-generation
419
- name: text generation
420
- dataset:
421
- name: gsarti/flores_101_ceb
422
- type: gsarti/flores_101_ceb
423
- metrics:
424
- - name: byte_perplexity
425
- type: byte_perplexity
426
- value: 5.286975089885673
427
- verified: false
428
- - task:
429
- type: text-generation
430
- name: text generation
431
- dataset:
432
- name: gsarti/flores_101_ces
433
- type: gsarti/flores_101_ces
434
- metrics:
435
- - name: byte_perplexity
436
- type: byte_perplexity
437
- value: 3.4516208322236017
438
- verified: false
439
- - task:
440
- type: text-generation
441
- name: text generation
442
- dataset:
443
- name: gsarti/flores_101_ckb
444
- type: gsarti/flores_101_ckb
445
- metrics:
446
- - name: byte_perplexity
447
- type: byte_perplexity
448
- value: 3.7051034724765612
449
- verified: false
450
- - task:
451
- type: text-generation
452
- name: text generation
453
- dataset:
454
- name: gsarti/flores_101_cym
455
- type: gsarti/flores_101_cym
456
- metrics:
457
- - name: byte_perplexity
458
- type: byte_perplexity
459
- value: 7.0889312398688125
460
- verified: false
461
- - task:
462
- type: text-generation
463
- name: text generation
464
- dataset:
465
- name: gsarti/flores_101_dan
466
- type: gsarti/flores_101_dan
467
- metrics:
468
- - name: byte_perplexity
469
- type: byte_perplexity
470
- value: 3.4300748208111838
471
- verified: false
472
- - task:
473
- type: text-generation
474
- name: text generation
475
- dataset:
476
- name: gsarti/flores_101_deu
477
- type: gsarti/flores_101_deu
478
- metrics:
479
- - name: byte_perplexity
480
- type: byte_perplexity
481
- value: 2.3380585896268107
482
- verified: false
483
- - task:
484
- type: text-generation
485
- name: text generation
486
- dataset:
487
- name: gsarti/flores_101_ell
488
- type: gsarti/flores_101_ell
489
- metrics:
490
- - name: byte_perplexity
491
- type: byte_perplexity
492
- value: 1.9595604725375586
493
- verified: false
494
- - task:
495
- type: text-generation
496
- name: text generation
497
- dataset:
498
- name: gsarti/flores_101_eng
499
- type: gsarti/flores_101_eng
500
- metrics:
501
- - name: byte_perplexity
502
- type: byte_perplexity
503
- value: 1.8819637649637901
504
- verified: false
505
- - task:
506
- type: text-generation
507
- name: text generation
508
- dataset:
509
- name: gsarti/flores_101_est
510
- type: gsarti/flores_101_est
511
- metrics:
512
- - name: byte_perplexity
513
- type: byte_perplexity
514
- value: 5.773850600380297
515
- verified: false
516
- - task:
517
- type: text-generation
518
- name: text generation
519
- dataset:
520
- name: gsarti/flores_101_fas
521
- type: gsarti/flores_101_fas
522
- metrics:
523
- - name: byte_perplexity
524
- type: byte_perplexity
525
- value: 2.4306140728294086
526
- verified: false
527
- - task:
528
- type: text-generation
529
- name: text generation
530
- dataset:
531
- name: gsarti/flores_101_fin
532
- type: gsarti/flores_101_fin
533
- metrics:
534
- - name: byte_perplexity
535
- type: byte_perplexity
536
- value: 4.304305536244342
537
- verified: false
538
- - task:
539
- type: text-generation
540
- name: text generation
541
- dataset:
542
- name: gsarti/flores_101_fra
543
- type: gsarti/flores_101_fra
544
- metrics:
545
- - name: byte_perplexity
546
- type: byte_perplexity
547
- value: 1.9374688438541796
548
- verified: false
549
- - task:
550
- type: text-generation
551
- name: text generation
552
- dataset:
553
- name: gsarti/flores_101_ful
554
- type: gsarti/flores_101_ful
555
- metrics:
556
- - name: byte_perplexity
557
- type: byte_perplexity
558
- value: 9.740353097219378
559
- verified: false
560
- - task:
561
- type: text-generation
562
- name: text generation
563
- dataset:
564
- name: gsarti/flores_101_gle
565
- type: gsarti/flores_101_gle
566
- metrics:
567
- - name: byte_perplexity
568
- type: byte_perplexity
569
- value: 6.035269765075012
570
- verified: false
571
- - task:
572
- type: text-generation
573
- name: text generation
574
- dataset:
575
- name: gsarti/flores_101_glg
576
- type: gsarti/flores_101_glg
577
- metrics:
578
- - name: byte_perplexity
579
- type: byte_perplexity
580
- value: 2.365451129546636
581
- verified: false
582
- - task:
583
- type: text-generation
584
- name: text generation
585
- dataset:
586
- name: gsarti/flores_101_guj
587
- type: gsarti/flores_101_guj
588
- metrics:
589
- - name: byte_perplexity
590
- type: byte_perplexity
591
- value: 5.70676742569154
592
- verified: false
593
- - task:
594
- type: text-generation
595
- name: text generation
596
- dataset:
597
- name: gsarti/flores_101_hau
598
- type: gsarti/flores_101_hau
599
- metrics:
600
- - name: byte_perplexity
601
- type: byte_perplexity
602
- value: 8.855204288260023
603
- verified: false
604
- - task:
605
- type: text-generation
606
- name: text generation
607
- dataset:
608
- name: gsarti/flores_101_heb
609
- type: gsarti/flores_101_heb
610
- metrics:
611
- - name: byte_perplexity
612
- type: byte_perplexity
613
- value: 2.920943798471208
614
- verified: false
615
- - task:
616
- type: text-generation
617
- name: text generation
618
- dataset:
619
- name: gsarti/flores_101_hin
620
- type: gsarti/flores_101_hin
621
- metrics:
622
- - name: byte_perplexity
623
- type: byte_perplexity
624
- value: 5.452028001573195
625
- verified: false
626
- - task:
627
- type: text-generation
628
- name: text generation
629
- dataset:
630
- name: gsarti/flores_101_hrv
631
- type: gsarti/flores_101_hrv
632
- metrics:
633
- - name: byte_perplexity
634
- type: byte_perplexity
635
- value: 3.7056829077179225
636
- verified: false
637
- - task:
638
- type: text-generation
639
- name: text generation
640
- dataset:
641
- name: gsarti/flores_101_hun
642
- type: gsarti/flores_101_hun
643
- metrics:
644
- - name: byte_perplexity
645
- type: byte_perplexity
646
- value: 4.058579478967854
647
- verified: false
648
- - task:
649
- type: text-generation
650
- name: text generation
651
- dataset:
652
- name: gsarti/flores_101_hye
653
- type: gsarti/flores_101_hye
654
- metrics:
655
- - name: byte_perplexity
656
- type: byte_perplexity
657
- value: 3.127237816041562
658
- verified: false
659
- - task:
660
- type: text-generation
661
- name: text generation
662
- dataset:
663
- name: gsarti/flores_101_ibo
664
- type: gsarti/flores_101_ibo
665
- metrics:
666
- - name: byte_perplexity
667
- type: byte_perplexity
668
- value: 3.9500357969906683
669
- verified: false
670
- - task:
671
- type: text-generation
672
- name: text generation
673
- dataset:
674
- name: gsarti/flores_101_ind
675
- type: gsarti/flores_101_ind
676
- metrics:
677
- - name: byte_perplexity
678
- type: byte_perplexity
679
- value: 1.976163584180101
680
- verified: false
681
- - task:
682
- type: text-generation
683
- name: text generation
684
- dataset:
685
- name: gsarti/flores_101_isl
686
- type: gsarti/flores_101_isl
687
- metrics:
688
- - name: byte_perplexity
689
- type: byte_perplexity
690
- value: 5.500542085165231
691
- verified: false
692
- - task:
693
- type: text-generation
694
- name: text generation
695
- dataset:
696
- name: gsarti/flores_101_ita
697
- type: gsarti/flores_101_ita
698
- metrics:
699
- - name: byte_perplexity
700
- type: byte_perplexity
701
- value: 2.314465100752677
702
- verified: false
703
- - task:
704
- type: text-generation
705
- name: text generation
706
- dataset:
707
- name: gsarti/flores_101_jav
708
- type: gsarti/flores_101_jav
709
- metrics:
710
- - name: byte_perplexity
711
- type: byte_perplexity
712
- value: 4.942322446550142
713
- verified: false
714
- - task:
715
- type: text-generation
716
- name: text generation
717
- dataset:
718
- name: gsarti/flores_101_jpn
719
- type: gsarti/flores_101_jpn
720
- metrics:
721
- - name: byte_perplexity
722
- type: byte_perplexity
723
- value: 2.259421750521777
724
- verified: false
725
- - task:
726
- type: text-generation
727
- name: text generation
728
- dataset:
729
- name: gsarti/flores_101_kam
730
- type: gsarti/flores_101_kam
731
- metrics:
732
- - name: byte_perplexity
733
- type: byte_perplexity
734
- value: 9.743025325635475
735
- verified: false
736
- - task:
737
- type: text-generation
738
- name: text generation
739
- dataset:
740
- name: gsarti/flores_101_kan
741
- type: gsarti/flores_101_kan
742
- metrics:
743
- - name: byte_perplexity
744
- type: byte_perplexity
745
- value: 6.233724699944989
746
- verified: false
747
- - task:
748
- type: text-generation
749
- name: text generation
750
- dataset:
751
- name: gsarti/flores_101_kat
752
- type: gsarti/flores_101_kat
753
- metrics:
754
- - name: byte_perplexity
755
- type: byte_perplexity
756
- value: 2.0508893415872107
757
- verified: false
758
- - task:
759
- type: text-generation
760
- name: text generation
761
- dataset:
762
- name: gsarti/flores_101_kaz
763
- type: gsarti/flores_101_kaz
764
- metrics:
765
- - name: byte_perplexity
766
- type: byte_perplexity
767
- value: 3.0390148516287927
768
- verified: false
769
- - task:
770
- type: text-generation
771
- name: text generation
772
- dataset:
773
- name: gsarti/flores_101_kea
774
- type: gsarti/flores_101_kea
775
- metrics:
776
- - name: byte_perplexity
777
- type: byte_perplexity
778
- value: 7.147132270533836
779
- verified: false
780
- - task:
781
- type: text-generation
782
- name: text generation
783
- dataset:
784
- name: gsarti/flores_101_khm
785
- type: gsarti/flores_101_khm
786
- metrics:
787
- - name: byte_perplexity
788
- type: byte_perplexity
789
- value: 3.366514710252477
790
- verified: false
791
- - task:
792
- type: text-generation
793
- name: text generation
794
- dataset:
795
- name: gsarti/flores_101_kir
796
- type: gsarti/flores_101_kir
797
- metrics:
798
- - name: byte_perplexity
799
- type: byte_perplexity
800
- value: 3.2413845359487885
801
- verified: false
802
- - task:
803
- type: text-generation
804
- name: text generation
805
- dataset:
806
- name: gsarti/flores_101_kor
807
- type: gsarti/flores_101_kor
808
- metrics:
809
- - name: byte_perplexity
810
- type: byte_perplexity
811
- value: 2.9023196482741027
812
- verified: false
813
- - task:
814
- type: text-generation
815
- name: text generation
816
- dataset:
817
- name: gsarti/flores_101_lao
818
- type: gsarti/flores_101_lao
819
- metrics:
820
- - name: byte_perplexity
821
- type: byte_perplexity
822
- value: 2.331446855837494
823
- verified: false
824
- - task:
825
- type: text-generation
826
- name: text generation
827
- dataset:
828
- name: gsarti/flores_101_lav
829
- type: gsarti/flores_101_lav
830
- metrics:
831
- - name: byte_perplexity
832
- type: byte_perplexity
833
- value: 5.223609016485348
834
- verified: false
835
- - task:
836
- type: text-generation
837
- name: text generation
838
- dataset:
839
- name: gsarti/flores_101_lin
840
- type: gsarti/flores_101_lin
841
- metrics:
842
- - name: byte_perplexity
843
- type: byte_perplexity
844
- value: 4.847471204107301
845
- verified: false
846
- - task:
847
- type: text-generation
848
- name: text generation
849
- dataset:
850
- name: gsarti/flores_101_lit
851
- type: gsarti/flores_101_lit
852
- metrics:
853
- - name: byte_perplexity
854
- type: byte_perplexity
855
- value: 4.5432035498036765
856
- verified: false
857
- - task:
858
- type: text-generation
859
- name: text generation
860
- dataset:
861
- name: gsarti/flores_101_ltz
862
- type: gsarti/flores_101_ltz
863
- metrics:
864
- - name: byte_perplexity
865
- type: byte_perplexity
866
- value: 5.5910516978201015
867
- verified: false
868
- - task:
869
- type: text-generation
870
- name: text generation
871
- dataset:
872
- name: gsarti/flores_101_lug
873
- type: gsarti/flores_101_lug
874
- metrics:
875
- - name: byte_perplexity
876
- type: byte_perplexity
877
- value: 5.4301049946044175
878
- verified: false
879
- - task:
880
- type: text-generation
881
- name: text generation
882
- dataset:
883
- name: gsarti/flores_101_luo
884
- type: gsarti/flores_101_luo
885
- metrics:
886
- - name: byte_perplexity
887
- type: byte_perplexity
888
- value: 12.031029857399394
889
- verified: false
890
- - task:
891
- type: text-generation
892
- name: text generation
893
- dataset:
894
- name: gsarti/flores_101_mal
895
- type: gsarti/flores_101_mal
896
- metrics:
897
- - name: byte_perplexity
898
- type: byte_perplexity
899
- value: 4.794302548141229
900
- verified: false
901
- - task:
902
- type: text-generation
903
- name: text generation
904
- dataset:
905
- name: gsarti/flores_101_mar
906
- type: gsarti/flores_101_mar
907
- metrics:
908
- - name: byte_perplexity
909
- type: byte_perplexity
910
- value: 6.856682255407709
911
- verified: false
912
- - task:
913
- type: text-generation
914
- name: text generation
915
- dataset:
916
- name: gsarti/flores_101_mkd
917
- type: gsarti/flores_101_mkd
918
- metrics:
919
- - name: byte_perplexity
920
- type: byte_perplexity
921
- value: 2.3354144607382983
922
- verified: false
923
- - task:
924
- type: text-generation
925
- name: text generation
926
- dataset:
927
- name: gsarti/flores_101_mlt
928
- type: gsarti/flores_101_mlt
929
- metrics:
930
- - name: byte_perplexity
931
- type: byte_perplexity
932
- value: 9.04135227904975
933
- verified: false
934
- - task:
935
- type: text-generation
936
- name: text generation
937
- dataset:
938
- name: gsarti/flores_101_mon
939
- type: gsarti/flores_101_mon
940
- metrics:
941
- - name: byte_perplexity
942
- type: byte_perplexity
943
- value: 3.094907723618666
944
- verified: false
945
- - task:
946
- type: text-generation
947
- name: text generation
948
- dataset:
949
- name: gsarti/flores_101_mri
950
- type: gsarti/flores_101_mri
951
- metrics:
952
- - name: byte_perplexity
953
- type: byte_perplexity
954
- value: 5.2659698341456505
955
- verified: false
956
- - task:
957
- type: text-generation
958
- name: text generation
959
- dataset:
960
- name: gsarti/flores_101_msa
961
- type: gsarti/flores_101_msa
962
- metrics:
963
- - name: byte_perplexity
964
- type: byte_perplexity
965
- value: 2.2220779892820985
966
- verified: false
967
- - task:
968
- type: text-generation
969
- name: text generation
970
- dataset:
971
- name: gsarti/flores_101_mya
972
- type: gsarti/flores_101_mya
973
- metrics:
974
- - name: byte_perplexity
975
- type: byte_perplexity
976
- value: 2.5229159853414433
977
- verified: false
978
- - task:
979
- type: text-generation
980
- name: text generation
981
- dataset:
982
- name: gsarti/flores_101_nld
983
- type: gsarti/flores_101_nld
984
- metrics:
985
- - name: byte_perplexity
986
- type: byte_perplexity
987
- value: 2.799153089002766
988
- verified: false
989
- - task:
990
- type: text-generation
991
- name: text generation
992
- dataset:
993
- name: gsarti/flores_101_nob
994
- type: gsarti/flores_101_nob
995
- metrics:
996
- - name: byte_perplexity
997
- type: byte_perplexity
998
- value: 3.628942049758715
999
- verified: false
1000
- - task:
1001
- type: text-generation
1002
- name: text generation
1003
- dataset:
1004
- name: gsarti/flores_101_npi
1005
- type: gsarti/flores_101_npi
1006
- metrics:
1007
- - name: byte_perplexity
1008
- type: byte_perplexity
1009
- value: 6.666236527803879
1010
- verified: false
1011
- - task:
1012
- type: text-generation
1013
- name: text generation
1014
- dataset:
1015
- name: gsarti/flores_101_nso
1016
- type: gsarti/flores_101_nso
1017
- metrics:
1018
- - name: byte_perplexity
1019
- type: byte_perplexity
1020
- value: 5.015319074943932
1021
- verified: false
1022
- - task:
1023
- type: text-generation
1024
- name: text generation
1025
- dataset:
1026
- name: gsarti/flores_101_nya
1027
- type: gsarti/flores_101_nya
1028
- metrics:
1029
- - name: byte_perplexity
1030
- type: byte_perplexity
1031
- value: 4.938044040751036
1032
- verified: false
1033
- - task:
1034
- type: text-generation
1035
- name: text generation
1036
- dataset:
1037
- name: gsarti/flores_101_oci
1038
- type: gsarti/flores_101_oci
1039
- metrics:
1040
- - name: byte_perplexity
1041
- type: byte_perplexity
1042
- value: 3.607440766288032
1043
- verified: false
1044
- - task:
1045
- type: text-generation
1046
- name: text generation
1047
- dataset:
1048
- name: gsarti/flores_101_orm
1049
- type: gsarti/flores_101_orm
1050
- metrics:
1051
- - name: byte_perplexity
1052
- type: byte_perplexity
1053
- value: 11.31585044916705
1054
- verified: false
1055
- - task:
1056
- type: text-generation
1057
- name: text generation
1058
- dataset:
1059
- name: gsarti/flores_101_ory
1060
- type: gsarti/flores_101_ory
1061
- metrics:
1062
- - name: byte_perplexity
1063
- type: byte_perplexity
1064
- value: 5.981891184515959
1065
- verified: false
1066
- - task:
1067
- type: text-generation
1068
- name: text generation
1069
- dataset:
1070
- name: gsarti/flores_101_pan
1071
- type: gsarti/flores_101_pan
1072
- metrics:
1073
- - name: byte_perplexity
1074
- type: byte_perplexity
1075
- value: 4.7716086841502685
1076
- verified: false
1077
- - task:
1078
- type: text-generation
1079
- name: text generation
1080
- dataset:
1081
- name: gsarti/flores_101_pol
1082
- type: gsarti/flores_101_pol
1083
- metrics:
1084
- - name: byte_perplexity
1085
- type: byte_perplexity
1086
- value: 3.01200174157614
1087
- verified: false
1088
- - task:
1089
- type: text-generation
1090
- name: text generation
1091
- dataset:
1092
- name: gsarti/flores_101_por
1093
- type: gsarti/flores_101_por
1094
- metrics:
1095
- - name: byte_perplexity
1096
- type: byte_perplexity
1097
- value: 1.8411472115156693
1098
- verified: false
1099
- - task:
1100
- type: text-generation
1101
- name: text generation
1102
- dataset:
1103
- name: gsarti/flores_101_pus
1104
- type: gsarti/flores_101_pus
1105
- metrics:
1106
- - name: byte_perplexity
1107
- type: byte_perplexity
1108
- value: 4.623872921169341
1109
- verified: false
1110
- - task:
1111
- type: text-generation
1112
- name: text generation
1113
- dataset:
1114
- name: gsarti/flores_101_ron
1115
- type: gsarti/flores_101_ron
1116
- metrics:
1117
- - name: byte_perplexity
1118
- type: byte_perplexity
1119
- value: 3.049829411973529
1120
- verified: false
1121
- - task:
1122
- type: text-generation
1123
- name: text generation
1124
- dataset:
1125
- name: gsarti/flores_101_rus
1126
- type: gsarti/flores_101_rus
1127
- metrics:
1128
- - name: byte_perplexity
1129
- type: byte_perplexity
1130
- value: 1.7083443875791493
1131
- verified: false
1132
- - task:
1133
- type: text-generation
1134
- name: text generation
1135
- dataset:
1136
- name: gsarti/flores_101_slk
1137
- type: gsarti/flores_101_slk
1138
- metrics:
1139
- - name: byte_perplexity
1140
- type: byte_perplexity
1141
- value: 4.037719650548048
1142
- verified: false
1143
- - task:
1144
- type: text-generation
1145
- name: text generation
1146
- dataset:
1147
- name: gsarti/flores_101_slv
1148
- type: gsarti/flores_101_slv
1149
- metrics:
1150
- - name: byte_perplexity
1151
- type: byte_perplexity
1152
- value: 4.141036287764831
1153
- verified: false
1154
- - task:
1155
- type: text-generation
1156
- name: text generation
1157
- dataset:
1158
- name: gsarti/flores_101_sna
1159
- type: gsarti/flores_101_sna
1160
- metrics:
1161
- - name: byte_perplexity
1162
- type: byte_perplexity
1163
- value: 4.7109183690601295
1164
- verified: false
1165
- - task:
1166
- type: text-generation
1167
- name: text generation
1168
- dataset:
1169
- name: gsarti/flores_101_snd
1170
- type: gsarti/flores_101_snd
1171
- metrics:
1172
- - name: byte_perplexity
1173
- type: byte_perplexity
1174
- value: 4.206170931541356
1175
- verified: false
1176
- - task:
1177
- type: text-generation
1178
- name: text generation
1179
- dataset:
1180
- name: gsarti/flores_101_som
1181
- type: gsarti/flores_101_som
1182
- metrics:
1183
- - name: byte_perplexity
1184
- type: byte_perplexity
1185
- value: 9.154342083821405
1186
- verified: false
1187
- - task:
1188
- type: text-generation
1189
- name: text generation
1190
- dataset:
1191
- name: gsarti/flores_101_spa
1192
- type: gsarti/flores_101_spa
1193
- metrics:
1194
- - name: byte_perplexity
1195
- type: byte_perplexity
1196
- value: 1.7955816311143258
1197
- verified: false
1198
- - task:
1199
- type: text-generation
1200
- name: text generation
1201
- dataset:
1202
- name: gsarti/flores_101_srp
1203
- type: gsarti/flores_101_srp
1204
- metrics:
1205
- - name: byte_perplexity
1206
- type: byte_perplexity
1207
- value: 2.241096141430147
1208
- verified: false
1209
- - task:
1210
- type: text-generation
1211
- name: text generation
1212
- dataset:
1213
- name: gsarti/flores_101_swe
1214
- type: gsarti/flores_101_swe
1215
- metrics:
1216
- - name: byte_perplexity
1217
- type: byte_perplexity
1218
- value: 3.344977179674293
1219
- verified: false
1220
- - task:
1221
- type: text-generation
1222
- name: text generation
1223
- dataset:
1224
- name: gsarti/flores_101_swh
1225
- type: gsarti/flores_101_swh
1226
- metrics:
1227
- - name: byte_perplexity
1228
- type: byte_perplexity
1229
- value: 2.6844272218041634
1230
- verified: false
1231
- - task:
1232
- type: text-generation
1233
- name: text generation
1234
- dataset:
1235
- name: gsarti/flores_101_tam
1236
- type: gsarti/flores_101_tam
1237
- metrics:
1238
- - name: byte_perplexity
1239
- type: byte_perplexity
1240
- value: 5.1645951632801745
1241
- verified: false
1242
- - task:
1243
- type: text-generation
1244
- name: text generation
1245
- dataset:
1246
- name: gsarti/flores_101_tel
1247
- type: gsarti/flores_101_tel
1248
- metrics:
1249
- - name: byte_perplexity
1250
- type: byte_perplexity
1251
- value: 6.8098996634099445
1252
- verified: false
1253
- - task:
1254
- type: text-generation
1255
- name: text generation
1256
- dataset:
1257
- name: gsarti/flores_101_tgk
1258
- type: gsarti/flores_101_tgk
1259
- metrics:
1260
- - name: byte_perplexity
1261
- type: byte_perplexity
1262
- value: 3.785457016715163
1263
- verified: false
1264
- - task:
1265
- type: text-generation
1266
- name: text generation
1267
- dataset:
1268
- name: gsarti/flores_101_tgl
1269
- type: gsarti/flores_101_tgl
1270
- metrics:
1271
- - name: byte_perplexity
1272
- type: byte_perplexity
1273
- value: 3.7498953645610875
1274
- verified: false
1275
- - task:
1276
- type: text-generation
1277
- name: text generation
1278
- dataset:
1279
- name: gsarti/flores_101_tha
1280
- type: gsarti/flores_101_tha
1281
- metrics:
1282
- - name: byte_perplexity
1283
- type: byte_perplexity
1284
- value: 2.104151663233468
1285
- verified: false
1286
- - task:
1287
- type: text-generation
1288
- name: text generation
1289
- dataset:
1290
- name: gsarti/flores_101_tur
1291
- type: gsarti/flores_101_tur
1292
- metrics:
1293
- - name: byte_perplexity
1294
- type: byte_perplexity
1295
- value: 3.3178240103796037
1296
- verified: false
1297
- - task:
1298
- type: text-generation
1299
- name: text generation
1300
- dataset:
1301
- name: gsarti/flores_101_ukr
1302
- type: gsarti/flores_101_ukr
1303
- metrics:
1304
- - name: byte_perplexity
1305
- type: byte_perplexity
1306
- value: 2.088543437159643
1307
- verified: false
1308
- - task:
1309
- type: text-generation
1310
- name: text generation
1311
- dataset:
1312
- name: gsarti/flores_101_umb
1313
- type: gsarti/flores_101_umb
1314
- metrics:
1315
- - name: byte_perplexity
1316
- type: byte_perplexity
1317
- value: 11.766013385445124
1318
- verified: false
1319
- - task:
1320
- type: text-generation
1321
- name: text generation
1322
- dataset:
1323
- name: gsarti/flores_101_urd
1324
- type: gsarti/flores_101_urd
1325
- metrics:
1326
- - name: byte_perplexity
1327
- type: byte_perplexity
1328
- value: 1.7788699847612357
1329
- verified: false
1330
- - task:
1331
- type: text-generation
1332
- name: text generation
1333
- dataset:
1334
- name: gsarti/flores_101_uzb
1335
- type: gsarti/flores_101_uzb
1336
- metrics:
1337
- - name: byte_perplexity
1338
- type: byte_perplexity
1339
- value: 8.499879863290486
1340
- verified: false
1341
- - task:
1342
- type: text-generation
1343
- name: text generation
1344
- dataset:
1345
- name: gsarti/flores_101_vie
1346
- type: gsarti/flores_101_vie
1347
- metrics:
1348
- - name: byte_perplexity
1349
- type: byte_perplexity
1350
- value: 1.65901207387262
1351
- verified: false
1352
- - task:
1353
- type: text-generation
1354
- name: text generation
1355
- dataset:
1356
- name: gsarti/flores_101_wol
1357
- type: gsarti/flores_101_wol
1358
- metrics:
1359
- - name: byte_perplexity
1360
- type: byte_perplexity
1361
- value: 6.141703791276928
1362
- verified: false
1363
- - task:
1364
- type: text-generation
1365
- name: text generation
1366
- dataset:
1367
- name: gsarti/flores_101_xho
1368
- type: gsarti/flores_101_xho
1369
- metrics:
1370
- - name: byte_perplexity
1371
- type: byte_perplexity
1372
- value: 4.690199677955254
1373
- verified: false
1374
- - task:
1375
- type: text-generation
1376
- name: text generation
1377
- dataset:
1378
- name: gsarti/flores_101_yor
1379
- type: gsarti/flores_101_yor
1380
- metrics:
1381
- - name: byte_perplexity
1382
- type: byte_perplexity
1383
- value: 4.360585696242932
1384
- verified: false
1385
- - task:
1386
- type: text-generation
1387
- name: text generation
1388
- dataset:
1389
- name: gsarti/flores_101_zho_simpl
1390
- type: gsarti/flores_101_zho_simpl
1391
- metrics:
1392
- - name: byte_perplexity
1393
- type: byte_perplexity
1394
- value: 2.1183545781883515
1395
- verified: false
1396
- - task:
1397
- type: text-generation
1398
- name: text generation
1399
- dataset:
1400
- name: gsarti/flores_101_zho_trad
1401
- type: gsarti/flores_101_zho_trad
1402
- metrics:
1403
- - name: byte_perplexity
1404
- type: byte_perplexity
1405
- value: 2.273787884962656
1406
- verified: false
1407
- - task:
1408
- type: text-generation
1409
- name: text generation
1410
- dataset:
1411
- name: gsarti/flores_101_zul
1412
- type: gsarti/flores_101_zul
1413
- metrics:
1414
- - name: byte_perplexity
1415
- type: byte_perplexity
1416
- value: 6.016954767729589
1417
- verified: false
1418
- - task:
1419
- type: text-generation
1420
- name: text generation
1421
- dataset:
1422
- name: headqa
1423
- type: headqa
1424
- metrics:
1425
- - name: acc
1426
- type: acc
1427
- value: 0.3464624361779723
1428
- verified: false
1429
- - task:
1430
- type: text-generation
1431
- name: text generation
1432
- dataset:
1433
- name: hellaswag
1434
- type: hellaswag
1435
- metrics:
1436
- - name: acc
1437
- type: acc
1438
- value: 0.5353515236008763
1439
- verified: false
1440
- - task:
1441
- type: text-generation
1442
- name: text generation
1443
- dataset:
1444
- name: lambada_mt_de
1445
- type: lambada_mt_de
1446
- metrics:
1447
- - name: acc
1448
- type: acc
1449
- value: 0.3291286629148069
1450
- verified: false
1451
- - task:
1452
- type: text-generation
1453
- name: text generation
1454
- dataset:
1455
- name: lambada_mt_en
1456
- type: lambada_mt_en
1457
- metrics:
1458
- - name: acc
1459
- type: acc
1460
- value: 0.6720357073549389
1461
- verified: false
1462
- - task:
1463
- type: text-generation
1464
- name: text generation
1465
- dataset:
1466
- name: lambada_mt_es
1467
- type: lambada_mt_es
1468
- metrics:
1469
- - name: acc
1470
- type: acc
1471
- value: 0.476421502037648
1472
- verified: false
1473
- - task:
1474
- type: text-generation
1475
- name: text generation
1476
- dataset:
1477
- name: lambada_mt_it
1478
- type: lambada_mt_it
1479
- metrics:
1480
- - name: acc
1481
- type: acc
1482
- value: 0.4061711624296526
1483
- verified: false
1484
- - task:
1485
- type: text-generation
1486
- name: text generation
1487
- dataset:
1488
- name: logiqa
1489
- type: logiqa
1490
- metrics:
1491
- - name: acc
1492
- type: acc
1493
- value: 0.2350230414746544
1494
- verified: false
1495
- - task:
1496
- type: text-generation
1497
- name: text generation
1498
- dataset:
1499
- name: mathqa
1500
- type: mathqa
1501
- metrics:
1502
- - name: acc
1503
- type: acc
1504
- value: 0.27671691792294806
1505
- verified: false
1506
- - task:
1507
- type: text-generation
1508
- name: text generation
1509
- dataset:
1510
- name: mc_taco
1511
- type: mc_taco
1512
- metrics:
1513
- - name: em
1514
- type: em
1515
- value: 0.13063063063063063
1516
- verified: false
1517
- - task:
1518
- type: text-generation
1519
- name: text generation
1520
- dataset:
1521
- name: mnli
1522
- type: mnli
1523
- metrics:
1524
- - name: acc
1525
- type: acc
1526
- value: 0.3545565500406835
1527
- verified: false
1528
- - task:
1529
- type: text-generation
1530
- name: text generation
1531
- dataset:
1532
- name: mnli_mismatched
1533
- type: mnli_mismatched
1534
- metrics:
1535
- - name: acc
1536
- type: acc
1537
- value: 0.3545565500406835
1538
- verified: false
1539
- - task:
1540
- type: text-generation
1541
- name: text generation
1542
- dataset:
1543
- name: mrpc
1544
- type: mrpc
1545
- metrics:
1546
- - name: acc
1547
- type: acc
1548
- value: 0.3872549019607843
1549
- verified: false
1550
- - task:
1551
- type: text-generation
1552
- name: text generation
1553
- dataset:
1554
- name: multirc
1555
- type: multirc
1556
- metrics:
1557
- - name: acc
1558
- type: acc
1559
- value: 0.570957095709571
1560
- verified: false
1561
- - task:
1562
- type: text-generation
1563
- name: text generation
1564
- dataset:
1565
- name: openbookqa
1566
- type: openbookqa
1567
- metrics:
1568
- - name: acc
1569
- type: acc
1570
- value: 0.312
1571
- verified: false
1572
- - task:
1573
- type: text-generation
1574
- name: text generation
1575
- dataset:
1576
- name: piqa
1577
- type: piqa
1578
- metrics:
1579
- - name: acc
1580
- type: acc
1581
- value: 0.7812840043525572
1582
- verified: false
1583
- - task:
1584
- type: text-generation
1585
- name: text generation
1586
- dataset:
1587
- name: prost
1588
- type: prost
1589
- metrics:
1590
- - name: acc
1591
- type: acc
1592
- value: 0.2977156276686593
1593
- verified: false
1594
- - task:
1595
- type: text-generation
1596
- name: text generation
1597
- dataset:
1598
- name: pubmedqa
1599
- type: pubmedqa
1600
- metrics:
1601
- - name: acc
1602
- type: acc
1603
- value: 0.741
1604
- verified: false
1605
- - task:
1606
- type: text-generation
1607
- name: text generation
1608
- dataset:
1609
- name: qnli
1610
- type: qnli
1611
- metrics:
1612
- - name: acc
1613
- type: acc
1614
- value: 0.5172981878088962
1615
- verified: false
1616
- - task:
1617
- type: text-generation
1618
- name: text generation
1619
- dataset:
1620
- name: qqp
1621
- type: qqp
1622
- metrics:
1623
- - name: acc
1624
- type: acc
1625
- value: 0.5883007667573584
1626
- verified: false
1627
- - task:
1628
- type: text-generation
1629
- name: text generation
1630
- dataset:
1631
- name: race
1632
- type: race
1633
- metrics:
1634
- - name: acc
1635
- type: acc
1636
- value: 0.39043062200956935
1637
- verified: false
1638
- - task:
1639
- type: text-generation
1640
- name: text generation
1641
- dataset:
1642
- name: rte
1643
- type: rte
1644
- metrics:
1645
- - name: acc
1646
- type: acc
1647
- value: 0.5198555956678701
1648
- verified: false
1649
- - task:
1650
- type: text-generation
1651
- name: text generation
1652
- dataset:
1653
- name: sciq
1654
- type: sciq
1655
- metrics:
1656
- - name: acc
1657
- type: acc
1658
- value: 0.936
1659
- verified: false
1660
- - task:
1661
- type: text-generation
1662
- name: text generation
1663
- dataset:
1664
- name: sst
1665
- type: sst
1666
- metrics:
1667
- - name: acc
1668
- type: acc
1669
- value: 0.6043577981651376
1670
- verified: false
1671
- - task:
1672
- type: text-generation
1673
- name: text generation
1674
- dataset:
1675
- name: triviaqa
1676
- type: triviaqa
1677
- metrics:
1678
- - name: acc
1679
- type: acc
1680
- value: 0.18332891363917617
1681
- verified: false
1682
- - task:
1683
- type: text-generation
1684
- name: text generation
1685
- dataset:
1686
- name: tydiqa_primary
1687
- type: tydiqa_primary
1688
- metrics:
1689
- - name: acc
1690
- type: acc
1691
- value: 0.2809817301342725
1692
- verified: false
1693
- - task:
1694
- type: text-generation
1695
- name: text generation
1696
- dataset:
1697
- name: webqs
1698
- type: webqs
1699
- metrics:
1700
- - name: acc
1701
- type: acc
1702
- value: 0.061515748031496065
1703
- verified: false
1704
- - task:
1705
- type: text-generation
1706
- name: text generation
1707
- dataset:
1708
- name: wic
1709
- type: wic
1710
- metrics:
1711
- - name: acc
1712
- type: acc
1713
- value: 0.5062695924764891
1714
- verified: false
1715
- - task:
1716
- type: text-generation
1717
- name: text generation
1718
- dataset:
1719
- name: winogrande
1720
- type: winogrande
1721
- metrics:
1722
- - name: acc
1723
- type: acc
1724
- value: 0.7095501183898973
1725
- verified: false
1726
- - task:
1727
- type: text-generation
1728
- name: text generation
1729
- dataset:
1730
- name: wnli
1731
- type: wnli
1732
- metrics:
1733
- - name: acc
1734
- type: acc
1735
- value: 0.5704225352112676
1736
- verified: false
1737
- - task:
1738
- type: text-generation
1739
- name: text generation
1740
- dataset:
1741
- name: wsc
1742
- type: wsc
1743
- metrics:
1744
- - name: acc
1745
- type: acc
1746
- value: 0.5192307692307693
1747
- verified: false
1748
- - task:
1749
- type: text-generation
1750
- name: text generation
1751
- dataset:
1752
- name: humaneval
1753
- type: humaneval
1754
- metrics:
1755
- - name: pass@1
1756
- type: pass@1
1757
- value: 0.15524390243902436
1758
- verified: false
1759
- - name: pass@10
1760
- type: pass@10
1761
- value: 0.3220367632383857
1762
- verified: false
1763
- - name: pass@100
1764
- type: pass@100
1765
- value: 0.5545431515723145
1766
- verified: false
1767
  ---
1768
 
1769
  <img src="https://s3.amazonaws.com/moonup/production/uploads/1657124309515-5f17f0a0925b9863e28ad517.png" alt="BigScience Logo" width="800" style="margin-left:auto; margin-right:auto; display:block"/>
@@ -2152,208 +558,8 @@ Model may:
2152
  ---
2153
 
2154
  # Evaluation
2155
- *This section describes the evaluation protocols and provides the results.*
2156
-
2157
-
2158
- <details>
2159
- <summary>Click to expand</summary>
2160
 
2161
- ## Metrics
2162
- *This section describes the different ways performance is calculated and why.*
2163
-
2164
-
2165
- Includes:
2166
-
2167
- | Metric | Why chosen |
2168
- |--------------------|--------------------------------------------------------------------|
2169
- | [Perplexity](#perplexity) | Standard metric for quantifying model improvements during training |
2170
- | Cross Entropy [Loss](#loss) | Standard objective for language models. |
2171
-
2172
- And multiple different metrics for specific tasks. _(More evaluation metrics forthcoming upon completion of evaluation protocol.)_
2173
-
2174
- ## Factors
2175
- *This section lists some different aspects of BLOOM models. Its focus is on aspects that are likely to give rise to high variance in model behavior.*
2176
-
2177
- - Language, such as English or Yoruba
2178
-
2179
- - Domain, such as newswire or stories
2180
-
2181
- - Demographic characteristics, such as gender or nationality
2182
-
2183
- ## Results
2184
- *Results are based on the [Factors](#factors) and [Metrics](#metrics).*
2185
-
2186
- **Zero-shot evaluations:**
2187
-
2188
- # <span style="color:red"><b>WARNING:</b> These are <b>intermediate results</b></span>
2189
-
2190
- See this repository for JSON files: https://github.com/bigscience-workshop/evaluation-results
2191
-
2192
- | Task | Language | Metric | BLOOM-176B | OPT-175B* |
2193
- |:--------|:-----------------|:------------------------|-------------:|------------:|
2194
- | arc_challenge | eng | acc ↑ | 0.411 | 0.412 |
2195
- | arc_easy | eng | acc ↑ | 0.726 | 0.751 |
2196
- | axb (Median of 10 prompts) | eng | acc ↑ | 0.575 | 0.532 |
2197
- | axg (Median of 10 prompts) | eng | acc ↑ | 0.525 | 0.548 |
2198
- | boolq (Median of 11 prompts) | eng | acc ↑ | 0.635 | 0.622 |
2199
- | cb (Median of 15 prompts) | eng | acc ↑ | 0.339 | 0.411 |
2200
- | cola (Median of 5 prompts) | eng | acc ↑ | 0.39 | 0.444 |
2201
- | copa (Median of 9 prompts) | eng | acc ↑ | 0.56 | 0.55 |
2202
- | crows_pairs_english (Median of 6 prompts) | eng | acc ↑ | 0.5 | 0.502 |
2203
- | crows_pairs_french (Median of 7 prompts) | fra | acc ↑ | 0.506 | 0.499 |
2204
- | diabla (Median of 2 prompts) | eng | acc ↑ | 0.295 | 0.289 |
2205
- | gsarti/flores_101_afr | afr | byte_perplexity ↓ | 4.254 | 3.381 |
2206
- | gsarti/flores_101_amh | amh | byte_perplexity ↓ | 3.717 | 3.87 |
2207
- | gsarti/flores_101_ara | ara | byte_perplexity ↓ | 1.705 | 2.42 |
2208
- | gsarti/flores_101_asm | asm | byte_perplexity ↓ | 6.577 | 3.028 |
2209
- | gsarti/flores_101_ast | ast | byte_perplexity ↓ | 2.856 | 4.737 |
2210
- | gsarti/flores_101_azj | azj | byte_perplexity ↓ | 4.807 | 4.767 |
2211
- | gsarti/flores_101_bel | bel | byte_perplexity ↓ | 2.731 | 2.557 |
2212
- | gsarti/flores_101_ben | ben | byte_perplexity ↓ | 5.993 | 2.243 |
2213
- | gsarti/flores_101_bos | bos | byte_perplexity ↓ | 3.594 | 2.668 |
2214
- | gsarti/flores_101_bul | bul | byte_perplexity ↓ | 2.159 | 2.099 |
2215
- | gsarti/flores_101_cat | cat | byte_perplexity ↓ | 2.168 | 2.837 |
2216
- | gsarti/flores_101_ceb | ceb | byte_perplexity ↓ | 5.287 | 3.636 |
2217
- | gsarti/flores_101_ces | ces | byte_perplexity ↓ | 3.452 | 2.749 |
2218
- | gsarti/flores_101_ckb | ckb | byte_perplexity ↓ | 3.705 | 4.688 |
2219
- | gsarti/flores_101_cym | cym | byte_perplexity ↓ | 7.089 | 5.075 |
2220
- | gsarti/flores_101_dan | dan | byte_perplexity ↓ | 3.43 | 2.492 |
2221
- | gsarti/flores_101_deu | deu | byte_perplexity ↓ | 2.338 | 2.099 |
2222
- | gsarti/flores_101_ell | ell | byte_perplexity ↓ | 1.96 | 1.811 |
2223
- | gsarti/flores_101_eng | eng | byte_perplexity ↓ | 1.882 | 1.9 |
2224
- | gsarti/flores_101_est | est | byte_perplexity ↓ | 5.774 | 3.533 |
2225
- | gsarti/flores_101_fas | fas | byte_perplexity ↓ | 2.431 | 2.444 |
2226
- | gsarti/flores_101_fin | fin | byte_perplexity ↓ | 4.304 | 2.601 |
2227
- | gsarti/flores_101_fra | fra | byte_perplexity ↓ | 1.937 | 1.984 |
2228
- | gsarti/flores_101_ful | ful | byte_perplexity ↓ | 9.74 | 11.84 |
2229
- | gsarti/flores_101_gle | gle | byte_perplexity ↓ | 6.035 | 3.914 |
2230
- | gsarti/flores_101_glg | glg | byte_perplexity ↓ | 2.365 | 3.015 |
2231
- | gsarti/flores_101_guj | guj | byte_perplexity ↓ | 5.707 | 2.438 |
2232
- | gsarti/flores_101_hau | hau | byte_perplexity ↓ | 8.855 | 5.283 |
2233
- | gsarti/flores_101_heb | heb | byte_perplexity ↓ | 2.921 | 2.903 |
2234
- | gsarti/flores_101_hin | hin | byte_perplexity ↓ | 5.452 | 1.86 |
2235
- | gsarti/flores_101_hrv | hrv | byte_perplexity ↓ | 3.706 | 2.715 |
2236
- | gsarti/flores_101_hun | hun | byte_perplexity ↓ | 4.059 | 2.865 |
2237
- | gsarti/flores_101_hye | hye | byte_perplexity ↓ | 3.127 | 3.411 |
2238
- | gsarti/flores_101_ibo | ibo | byte_perplexity ↓ | 3.95 | 8.008 |
2239
- | gsarti/flores_101_ind | ind | byte_perplexity ↓ | 1.976 | 2.632 |
2240
- | gsarti/flores_101_isl | isl | byte_perplexity ↓ | 5.501 | 4.701 |
2241
- | gsarti/flores_101_ita | ita | byte_perplexity ↓ | 2.314 | 2.104 |
2242
- | gsarti/flores_101_jav | jav | byte_perplexity ↓ | 4.942 | 8.16 |
2243
- | gsarti/flores_101_jpn | jpn | byte_perplexity ↓ | 2.259 | 2.198 |
2244
- | gsarti/flores_101_kam | kam | byte_perplexity ↓ | 9.743 | 10.981 |
2245
- | gsarti/flores_101_kan | kan | byte_perplexity ↓ | 6.234 | 2.373 |
2246
- | gsarti/flores_101_kat | kat | byte_perplexity ↓ | 2.051 | 2.466 |
2247
- | gsarti/flores_101_kaz | kaz | byte_perplexity ↓ | 3.039 | 4.376 |
2248
- | gsarti/flores_101_kea | kea | byte_perplexity ↓ | 7.147 | 9.632 |
2249
- | gsarti/flores_101_khm | khm | byte_perplexity ↓ | 3.367 | 2.646 |
2250
- | gsarti/flores_101_kir | kir | byte_perplexity ↓ | 3.241 | 4.522 |
2251
- | gsarti/flores_101_kor | kor | byte_perplexity ↓ | 2.902 | 3.376 |
2252
- | gsarti/flores_101_lao | lao | byte_perplexity ↓ | 2.331 | 3.106 |
2253
- | gsarti/flores_101_lav | lav | byte_perplexity ↓ | 5.224 | 4.811 |
2254
- | gsarti/flores_101_lin | lin | byte_perplexity ↓ | 4.847 | 8.871 |
2255
- | gsarti/flores_101_lit | lit | byte_perplexity ↓ | 4.543 | 5.183 |
2256
- | gsarti/flores_101_ltz | ltz | byte_perplexity ↓ | 5.591 | 7.158 |
2257
- | gsarti/flores_101_lug | lug | byte_perplexity ↓ | 5.43 | 7.399 |
2258
- | gsarti/flores_101_luo | luo | byte_perplexity ↓ | 12.031 | 11.951 |
2259
- | gsarti/flores_101_mal | mal | byte_perplexity ↓ | 4.794 | 2.054 |
2260
- | gsarti/flores_101_mar | mar | byte_perplexity ↓ | 6.857 | 2.274 |
2261
- | gsarti/flores_101_mkd | mkd | byte_perplexity ↓ | 2.335 | 2.538 |
2262
- | gsarti/flores_101_mlt | mlt | byte_perplexity ↓ | 9.041 | 5.996 |
2263
- | gsarti/flores_101_mon | mon | byte_perplexity ↓ | 3.095 | 4.519 |
2264
- | gsarti/flores_101_mri | mri | byte_perplexity ↓ | 5.266 | 4.438 |
2265
- | gsarti/flores_101_msa | msa | byte_perplexity ↓ | 2.222 | 2.935 |
2266
- | gsarti/flores_101_mya | mya | byte_perplexity ↓ | 2.523 | 2.413 |
2267
- | gsarti/flores_101_nld | nld | byte_perplexity ↓ | 2.799 | 2.293 |
2268
- | gsarti/flores_101_nob | nob | byte_perplexity ↓ | 3.629 | 2.593 |
2269
- | gsarti/flores_101_npi | npi | byte_perplexity ↓ | 6.666 | 2.499 |
2270
- | gsarti/flores_101_nso | nso | byte_perplexity ↓ | 5.015 | 8.485 |
2271
- | gsarti/flores_101_nya | nya | byte_perplexity ↓ | 4.938 | 7.548 |
2272
- | gsarti/flores_101_oci | oci | byte_perplexity ↓ | 3.607 | 4.936 |
2273
- | gsarti/flores_101_orm | orm | byte_perplexity ↓ | 11.316 | 7.145 |
2274
- | gsarti/flores_101_ory | ory | byte_perplexity ↓ | 5.982 | 2.668 |
2275
- | gsarti/flores_101_pan | pan | byte_perplexity ↓ | 4.772 | 2.782 |
2276
- | gsarti/flores_101_pol | pol | byte_perplexity ↓ | 3.012 | 2.432 |
2277
- | gsarti/flores_101_por | por | byte_perplexity ↓ | 1.841 | 2.178 |
2278
- | gsarti/flores_101_pus | pus | byte_perplexity ↓ | 4.624 | 4.785 |
2279
- | gsarti/flores_101_ron | ron | byte_perplexity ↓ | 3.05 | 2.197 |
2280
- | gsarti/flores_101_rus | rus | byte_perplexity ↓ | 1.708 | 1.689 |
2281
- | gsarti/flores_101_slk | slk | byte_perplexity ↓ | 4.038 | 3.419 |
2282
- | gsarti/flores_101_slv | slv | byte_perplexity ↓ | 4.141 | 3.582 |
2283
- | gsarti/flores_101_sna | sna | byte_perplexity ↓ | 4.711 | 5.588 |
2284
- | gsarti/flores_101_snd | snd | byte_perplexity ↓ | 4.206 | 5.667 |
2285
- | gsarti/flores_101_som | som | byte_perplexity ↓ | 9.154 | 4.788 |
2286
- | gsarti/flores_101_spa | spa | byte_perplexity ↓ | 1.796 | 2.098 |
2287
- | gsarti/flores_101_srp | srp | byte_perplexity ↓ | 2.241 | 2.688 |
2288
- | gsarti/flores_101_swe | swe | byte_perplexity ↓ | 3.345 | 2.468 |
2289
- | gsarti/flores_101_swh | swh | byte_perplexity ↓ | 2.684 | 4.473 |
2290
- | gsarti/flores_101_tam | tam | byte_perplexity ↓ | 5.165 | 2.024 |
2291
- | gsarti/flores_101_tel | tel | byte_perplexity ↓ | 6.81 | 2.407 |
2292
- | gsarti/flores_101_tgk | tgk | byte_perplexity ↓ | 3.785 | 4.899 |
2293
- | gsarti/flores_101_tgl | tgl | byte_perplexity ↓ | 3.75 | 2.738 |
2294
- | gsarti/flores_101_tha | tha | byte_perplexity ↓ | 2.104 | 2.035 |
2295
- | gsarti/flores_101_tur | tur | byte_perplexity ↓ | 3.318 | 2.622 |
2296
- | gsarti/flores_101_ukr | ukr | byte_perplexity ↓ | 2.089 | 1.93 |
2297
- | gsarti/flores_101_umb | umb | byte_perplexity ↓ | 11.766 | 11.64 |
2298
- | gsarti/flores_101_urd | urd | byte_perplexity ↓ | 1.779 | 2.982 |
2299
- | gsarti/flores_101_uzb | uzb | byte_perplexity ↓ | 8.5 | 13.209 |
2300
- | gsarti/flores_101_vie | vie | byte_perplexity ↓ | 1.659 | 2.229 |
2301
- | gsarti/flores_101_wol | wol | byte_perplexity ↓ | 6.142 | 13.945 |
2302
- | gsarti/flores_101_xho | xho | byte_perplexity ↓ | 4.69 | 8.42 |
2303
- | gsarti/flores_101_yor | yor | byte_perplexity ↓ | 4.361 | 7.636 |
2304
- | gsarti/flores_101_zho_simpl | zho_simpl | byte_perplexity ↓ | 2.118 | 5.113 |
2305
- | gsarti/flores_101_zho_trad | zho_trad | byte_perplexity ↓ | 2.274 | 5.67 |
2306
- | gsarti/flores_101_zul | zul | byte_perplexity ↓ | 6.017 | 7.341 |
2307
- | headqa | spa | acc ↑ | 0.346 | 0.244 |
2308
- | hellaswag | eng | acc ↑ | 0.535 | 0.592 |
2309
- | lambada_mt_de | deu | acc ↑ | 0.329 | 0.358 |
2310
- | lambada_mt_en | eng | acc ↑ | 0.672 | 0.747 |
2311
- | lambada_mt_es | spa | acc ↑ | 0.476 | 0.397 |
2312
- | lambada_mt_it | ita | acc ↑ | 0.406 | 0.409 |
2313
- | logiqa | eng | acc ↑ | 0.235 | 0.244 |
2314
- | mathqa | eng | acc ↑ | 0.277 | 0.268 |
2315
- | mc_taco | eng | em ↑ | 0.131 | 0.124 |
2316
- | mnli (Median of 15 prompts) | eng | acc ↑ | 0.355 | 0.36 |
2317
- | mnli_mismatched (Median of 15 prompts) | eng | acc ↑ | 0.355 | 0.36 |
2318
- | mrpc | eng | acc ↑ | 0.387 | 0.446 |
2319
- | multirc (Median of 11 prompts) | eng | acc ↑ | 0.571 | 0.599 |
2320
- | openbookqa | eng | acc ↑ | 0.312 | 0.322 |
2321
- | piqa | eng | acc ↑ | 0.781 | 0.791 |
2322
- | prost | eng | acc ↑ | 0.298 | 0.299 |
2323
- | pubmedqa | eng | acc ↑ | 0.741 | 0.709 |
2324
- | qnli | eng | acc ↑ | 0.517 | 0.554 |
2325
- | qqp (Median of 7 prompts) | eng | acc ↑ | 0.588 | 0.395 |
2326
- | race | eng | acc ↑ | 0.39 | 0.402 |
2327
- | rte (Median of 6 prompts) | eng | acc ↑ | 0.52 | 0.495 |
2328
- | sciq | eng | acc ↑ | 0.936 | 0.948 |
2329
- | sst (Median of 6 prompts) | eng | acc ↑ | 0.604 | 0.647 |
2330
- | triviaqa | eng | acc ↑ | 0.183 | 0.342 |
2331
- | tydiqa_primary (Median of 16 prompts) | eng | acc ↑ | 0.281 | 0.148 |
2332
- | webqs | eng | acc ↑ | 0.062 | 0.159 |
2333
- | wic (Median of 11 prompts) | eng | acc ↑ | 0.506 | 0.498 |
2334
- | winogrande | eng | acc ↑ | 0.71 | 0.736 |
2335
- | wnli (Median of 6 prompts) | eng | acc ↑ | 0.57 | 0.563 |
2336
- | wsc (Median of 11 prompts) | eng | acc ↑ | 0.519 | 0.413 |
2337
- | humaneval | python | pass@1 ↑ | 0.155 | 0.0 |
2338
- | humaneval | python | pass@10 ↑ | 0.322 | 0.0 |
2339
- | humaneval | python | pass@100 ↑ | 0.555 | 0.003 |
2340
-
2341
-
2342
- **Train-time Evaluation:**
2343
-
2344
- Final checkpoint after 95K steps:
2345
-
2346
- - Training Loss: 1.939
2347
-
2348
- - Validation Loss: 2.061
2349
-
2350
- - Perplexity: 7.045
2351
-
2352
- For more see: https://huggingface.co/bigscience/tr11-176B-ml-logs
2353
-
2354
- </details>
2355
-
2356
- ---
2357
 
2358
  # Recommendations
2359
 
 
169
  A: Let's think step by step.
170
  example_title: Mathematical reasoning
171
  group: English
172
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
173
  ---
174
 
175
  <img src="https://s3.amazonaws.com/moonup/production/uploads/1657124309515-5f17f0a0925b9863e28ad517.png" alt="BigScience Logo" width="800" style="margin-left:auto; margin-right:auto; display:block"/>
 
558
  ---
559
 
560
  # Evaluation
 
 
 
 
 
561
 
562
+ We are currently performing a systematic evaluation of our model. As results come in, this section will be filled out.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
563
 
564
  # Recommendations
565