m3hrdadfi commited on
Commit
a8a053b
1 Parent(s): d59d695

Hello zabanshenas

Browse files
config.json ADDED
@@ -0,0 +1,501 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "/content/checkpoint-84500",
3
+ "architectures": [
4
+ "RobertaForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 0,
8
+ "eos_token_id": 2,
9
+ "gradient_checkpointing": false,
10
+ "hidden_act": "gelu",
11
+ "hidden_dropout_prob": 0.1,
12
+ "hidden_size": 768,
13
+ "id2label": {
14
+ "0": "ace",
15
+ "1": "afr",
16
+ "2": "als",
17
+ "3": "amh",
18
+ "4": "ang",
19
+ "5": "ara",
20
+ "6": "arg",
21
+ "7": "arz",
22
+ "8": "asm",
23
+ "9": "ast",
24
+ "10": "ava",
25
+ "11": "aym",
26
+ "12": "azb",
27
+ "13": "aze",
28
+ "14": "bak",
29
+ "15": "bar",
30
+ "16": "bcl",
31
+ "17": "be-tarask",
32
+ "18": "bel",
33
+ "19": "ben",
34
+ "20": "bho",
35
+ "21": "bjn",
36
+ "22": "bod",
37
+ "23": "bos",
38
+ "24": "bpy",
39
+ "25": "bre",
40
+ "26": "bul",
41
+ "27": "bxr",
42
+ "28": "cat",
43
+ "29": "cbk",
44
+ "30": "cdo",
45
+ "31": "ceb",
46
+ "32": "ces",
47
+ "33": "che",
48
+ "34": "chr",
49
+ "35": "chv",
50
+ "36": "ckb",
51
+ "37": "cor",
52
+ "38": "cos",
53
+ "39": "crh",
54
+ "40": "csb",
55
+ "41": "cym",
56
+ "42": "dan",
57
+ "43": "deu",
58
+ "44": "diq",
59
+ "45": "div",
60
+ "46": "dsb",
61
+ "47": "dty",
62
+ "48": "egl",
63
+ "49": "ell",
64
+ "50": "eng",
65
+ "51": "epo",
66
+ "52": "est",
67
+ "53": "eus",
68
+ "54": "ext",
69
+ "55": "fao",
70
+ "56": "fas",
71
+ "57": "fin",
72
+ "58": "fra",
73
+ "59": "frp",
74
+ "60": "fry",
75
+ "61": "fur",
76
+ "62": "gag",
77
+ "63": "gla",
78
+ "64": "gle",
79
+ "65": "glg",
80
+ "66": "glk",
81
+ "67": "glv",
82
+ "68": "grn",
83
+ "69": "guj",
84
+ "70": "hak",
85
+ "71": "hat",
86
+ "72": "hau",
87
+ "73": "hbs",
88
+ "74": "heb",
89
+ "75": "hif",
90
+ "76": "hin",
91
+ "77": "hrv",
92
+ "78": "hsb",
93
+ "79": "hun",
94
+ "80": "hye",
95
+ "81": "ibo",
96
+ "82": "ido",
97
+ "83": "ile",
98
+ "84": "ilo",
99
+ "85": "ina",
100
+ "86": "ind",
101
+ "87": "isl",
102
+ "88": "ita",
103
+ "89": "jam",
104
+ "90": "jav",
105
+ "91": "jbo",
106
+ "92": "jpn",
107
+ "93": "kaa",
108
+ "94": "kab",
109
+ "95": "kan",
110
+ "96": "kat",
111
+ "97": "kaz",
112
+ "98": "kbd",
113
+ "99": "khm",
114
+ "100": "kin",
115
+ "101": "kir",
116
+ "102": "koi",
117
+ "103": "kok",
118
+ "104": "kom",
119
+ "105": "kor",
120
+ "106": "krc",
121
+ "107": "ksh",
122
+ "108": "kur",
123
+ "109": "lad",
124
+ "110": "lao",
125
+ "111": "lat",
126
+ "112": "lav",
127
+ "113": "lez",
128
+ "114": "lij",
129
+ "115": "lim",
130
+ "116": "lin",
131
+ "117": "lit",
132
+ "118": "lmo",
133
+ "119": "lrc",
134
+ "120": "ltg",
135
+ "121": "ltz",
136
+ "122": "lug",
137
+ "123": "lzh",
138
+ "124": "mai",
139
+ "125": "mal",
140
+ "126": "map-bms",
141
+ "127": "mar",
142
+ "128": "mdf",
143
+ "129": "mhr",
144
+ "130": "min",
145
+ "131": "mkd",
146
+ "132": "mlg",
147
+ "133": "mlt",
148
+ "134": "nan",
149
+ "135": "mon",
150
+ "136": "mri",
151
+ "137": "mrj",
152
+ "138": "msa",
153
+ "139": "mwl",
154
+ "140": "mya",
155
+ "141": "myv",
156
+ "142": "mzn",
157
+ "143": "nap",
158
+ "144": "nav",
159
+ "145": "nci",
160
+ "146": "nds",
161
+ "147": "nds-nl",
162
+ "148": "nep",
163
+ "149": "new",
164
+ "150": "nld",
165
+ "151": "nno",
166
+ "152": "nob",
167
+ "153": "nrm",
168
+ "154": "nso",
169
+ "155": "oci",
170
+ "156": "olo",
171
+ "157": "ori",
172
+ "158": "orm",
173
+ "159": "oss",
174
+ "160": "pag",
175
+ "161": "pam",
176
+ "162": "pan",
177
+ "163": "pap",
178
+ "164": "pcd",
179
+ "165": "pdc",
180
+ "166": "pfl",
181
+ "167": "pnb",
182
+ "168": "pol",
183
+ "169": "por",
184
+ "170": "pus",
185
+ "171": "que",
186
+ "172": "roa-tara",
187
+ "173": "roh",
188
+ "174": "ron",
189
+ "175": "rue",
190
+ "176": "rup",
191
+ "177": "rus",
192
+ "178": "sah",
193
+ "179": "san",
194
+ "180": "scn",
195
+ "181": "sco",
196
+ "182": "sgs",
197
+ "183": "sin",
198
+ "184": "slk",
199
+ "185": "slv",
200
+ "186": "sme",
201
+ "187": "sna",
202
+ "188": "snd",
203
+ "189": "som",
204
+ "190": "spa",
205
+ "191": "sqi",
206
+ "192": "srd",
207
+ "193": "srn",
208
+ "194": "srp",
209
+ "195": "stq",
210
+ "196": "sun",
211
+ "197": "swa",
212
+ "198": "swe",
213
+ "199": "szl",
214
+ "200": "tam",
215
+ "201": "tat",
216
+ "202": "tcy",
217
+ "203": "tel",
218
+ "204": "tet",
219
+ "205": "tgk",
220
+ "206": "tgl",
221
+ "207": "tha",
222
+ "208": "ton",
223
+ "209": "tsn",
224
+ "210": "tuk",
225
+ "211": "tur",
226
+ "212": "tyv",
227
+ "213": "udm",
228
+ "214": "uig",
229
+ "215": "ukr",
230
+ "216": "urd",
231
+ "217": "uzb",
232
+ "218": "vec",
233
+ "219": "vep",
234
+ "220": "vie",
235
+ "221": "vls",
236
+ "222": "vol",
237
+ "223": "vro",
238
+ "224": "war",
239
+ "225": "wln",
240
+ "226": "wol",
241
+ "227": "wuu",
242
+ "228": "xho",
243
+ "229": "xmf",
244
+ "230": "yid",
245
+ "231": "yor",
246
+ "232": "zea",
247
+ "233": "zh-yue",
248
+ "234": "zho"
249
+ },
250
+ "initializer_range": 0.02,
251
+ "intermediate_size": 3072,
252
+ "label2id": {
253
+ "ace": 0,
254
+ "afr": 1,
255
+ "als": 2,
256
+ "amh": 3,
257
+ "ang": 4,
258
+ "ara": 5,
259
+ "arg": 6,
260
+ "arz": 7,
261
+ "asm": 8,
262
+ "ast": 9,
263
+ "ava": 10,
264
+ "aym": 11,
265
+ "azb": 12,
266
+ "aze": 13,
267
+ "bak": 14,
268
+ "bar": 15,
269
+ "bcl": 16,
270
+ "be-tarask": 17,
271
+ "bel": 18,
272
+ "ben": 19,
273
+ "bho": 20,
274
+ "bjn": 21,
275
+ "bod": 22,
276
+ "bos": 23,
277
+ "bpy": 24,
278
+ "bre": 25,
279
+ "bul": 26,
280
+ "bxr": 27,
281
+ "cat": 28,
282
+ "cbk": 29,
283
+ "cdo": 30,
284
+ "ceb": 31,
285
+ "ces": 32,
286
+ "che": 33,
287
+ "chr": 34,
288
+ "chv": 35,
289
+ "ckb": 36,
290
+ "cor": 37,
291
+ "cos": 38,
292
+ "crh": 39,
293
+ "csb": 40,
294
+ "cym": 41,
295
+ "dan": 42,
296
+ "deu": 43,
297
+ "diq": 44,
298
+ "div": 45,
299
+ "dsb": 46,
300
+ "dty": 47,
301
+ "egl": 48,
302
+ "ell": 49,
303
+ "eng": 50,
304
+ "epo": 51,
305
+ "est": 52,
306
+ "eus": 53,
307
+ "ext": 54,
308
+ "fao": 55,
309
+ "fas": 56,
310
+ "fin": 57,
311
+ "fra": 58,
312
+ "frp": 59,
313
+ "fry": 60,
314
+ "fur": 61,
315
+ "gag": 62,
316
+ "gla": 63,
317
+ "gle": 64,
318
+ "glg": 65,
319
+ "glk": 66,
320
+ "glv": 67,
321
+ "grn": 68,
322
+ "guj": 69,
323
+ "hak": 70,
324
+ "hat": 71,
325
+ "hau": 72,
326
+ "hbs": 73,
327
+ "heb": 74,
328
+ "hif": 75,
329
+ "hin": 76,
330
+ "hrv": 77,
331
+ "hsb": 78,
332
+ "hun": 79,
333
+ "hye": 80,
334
+ "ibo": 81,
335
+ "ido": 82,
336
+ "ile": 83,
337
+ "ilo": 84,
338
+ "ina": 85,
339
+ "ind": 86,
340
+ "isl": 87,
341
+ "ita": 88,
342
+ "jam": 89,
343
+ "jav": 90,
344
+ "jbo": 91,
345
+ "jpn": 92,
346
+ "kaa": 93,
347
+ "kab": 94,
348
+ "kan": 95,
349
+ "kat": 96,
350
+ "kaz": 97,
351
+ "kbd": 98,
352
+ "khm": 99,
353
+ "kin": 100,
354
+ "kir": 101,
355
+ "koi": 102,
356
+ "kok": 103,
357
+ "kom": 104,
358
+ "kor": 105,
359
+ "krc": 106,
360
+ "ksh": 107,
361
+ "kur": 108,
362
+ "lad": 109,
363
+ "lao": 110,
364
+ "lat": 111,
365
+ "lav": 112,
366
+ "lez": 113,
367
+ "lij": 114,
368
+ "lim": 115,
369
+ "lin": 116,
370
+ "lit": 117,
371
+ "lmo": 118,
372
+ "lrc": 119,
373
+ "ltg": 120,
374
+ "ltz": 121,
375
+ "lug": 122,
376
+ "lzh": 123,
377
+ "mai": 124,
378
+ "mal": 125,
379
+ "map-bms": 126,
380
+ "mar": 127,
381
+ "mdf": 128,
382
+ "mhr": 129,
383
+ "min": 130,
384
+ "mkd": 131,
385
+ "mlg": 132,
386
+ "mlt": 133,
387
+ "nan": 134,
388
+ "mon": 135,
389
+ "mri": 136,
390
+ "mrj": 137,
391
+ "msa": 138,
392
+ "mwl": 139,
393
+ "mya": 140,
394
+ "myv": 141,
395
+ "mzn": 142,
396
+ "nap": 143,
397
+ "nav": 144,
398
+ "nci": 145,
399
+ "nds": 146,
400
+ "nds-nl": 147,
401
+ "nep": 148,
402
+ "new": 149,
403
+ "nld": 150,
404
+ "nno": 151,
405
+ "nob": 152,
406
+ "nrm": 153,
407
+ "nso": 154,
408
+ "oci": 155,
409
+ "olo": 156,
410
+ "ori": 157,
411
+ "orm": 158,
412
+ "oss": 159,
413
+ "pag": 160,
414
+ "pam": 161,
415
+ "pan": 162,
416
+ "pap": 163,
417
+ "pcd": 164,
418
+ "pdc": 165,
419
+ "pfl": 166,
420
+ "pnb": 167,
421
+ "pol": 168,
422
+ "por": 169,
423
+ "pus": 170,
424
+ "que": 171,
425
+ "roa-tara": 172,
426
+ "roh": 173,
427
+ "ron": 174,
428
+ "rue": 175,
429
+ "rup": 176,
430
+ "rus": 177,
431
+ "sah": 178,
432
+ "san": 179,
433
+ "scn": 180,
434
+ "sco": 181,
435
+ "sgs": 182,
436
+ "sin": 183,
437
+ "slk": 184,
438
+ "slv": 185,
439
+ "sme": 186,
440
+ "sna": 187,
441
+ "snd": 188,
442
+ "som": 189,
443
+ "spa": 190,
444
+ "sqi": 191,
445
+ "srd": 192,
446
+ "srn": 193,
447
+ "srp": 194,
448
+ "stq": 195,
449
+ "sun": 196,
450
+ "swa": 197,
451
+ "swe": 198,
452
+ "szl": 199,
453
+ "tam": 200,
454
+ "tat": 201,
455
+ "tcy": 202,
456
+ "tel": 203,
457
+ "tet": 204,
458
+ "tgk": 205,
459
+ "tgl": 206,
460
+ "tha": 207,
461
+ "ton": 208,
462
+ "tsn": 209,
463
+ "tuk": 210,
464
+ "tur": 211,
465
+ "tyv": 212,
466
+ "udm": 213,
467
+ "uig": 214,
468
+ "ukr": 215,
469
+ "urd": 216,
470
+ "uzb": 217,
471
+ "vec": 218,
472
+ "vep": 219,
473
+ "vie": 220,
474
+ "vls": 221,
475
+ "vol": 222,
476
+ "vro": 223,
477
+ "war": 224,
478
+ "wln": 225,
479
+ "wol": 226,
480
+ "wuu": 227,
481
+ "xho": 228,
482
+ "xmf": 229,
483
+ "yid": 230,
484
+ "yor": 231,
485
+ "zea": 232,
486
+ "zh-yue": 233,
487
+ "zho": 234
488
+ },
489
+ "layer_norm_eps": 1e-12,
490
+ "max_position_embeddings": 512,
491
+ "model_type": "roberta",
492
+ "num_attention_heads": 12,
493
+ "num_hidden_layers": 12,
494
+ "pad_token_id": 1,
495
+ "position_embedding_type": "absolute",
496
+ "problem_type": "single_label_classification",
497
+ "transformers_version": "4.8.0.dev0",
498
+ "type_vocab_size": 2,
499
+ "use_cache": true,
500
+ "vocab_size": 32000
501
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8863bf86a75da23bd15abd55fd68a0d6e2a8a70851389f96e13580dc356301cd
3
+ size 443277549
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "sep_token": "</s>", "pad_token": "<pad>", "cls_token": "<s>", "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": false}}
tf_model.h5 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:776984d5f858636ec3bdd8021e4a897de447f6c0e8d25b1a3eb0239829f9dabe
3
+ size 443480960
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>", "add_prefix_space": false, "errors": "replace", "sep_token": "</s>", "cls_token": "<s>", "pad_token": "<pad>", "mask_token": "<mask>", "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "/content/checkpoint-84500", "tokenizer_class": "RobertaTokenizer"}
trainer_state.json ADDED
@@ -0,0 +1,310 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 3.0,
5
+ "global_step": 95214,
6
+ "is_hyper_param_search": false,
7
+ "is_local_process_zero": true,
8
+ "is_world_process_zero": true,
9
+ "log_history": [
10
+ {
11
+ "epoch": 0.12,
12
+ "learning_rate": 3e-05,
13
+ "loss": 2.2074,
14
+ "step": 5000
15
+ },
16
+ {
17
+ "epoch": 0.12,
18
+ "eval_accuracy": 0.6701050400733948,
19
+ "eval_loss": 1.3863972425460815,
20
+ "eval_runtime": 1030.5845,
21
+ "eval_samples_per_second": 109.496,
22
+ "eval_steps_per_second": 9.125,
23
+ "step": 5000
24
+ },
25
+ {
26
+ "epoch": 0.24,
27
+ "learning_rate": 2.877048978688162e-05,
28
+ "loss": 1.2979,
29
+ "step": 10000
30
+ },
31
+ {
32
+ "epoch": 0.24,
33
+ "eval_accuracy": 0.7089281678199768,
34
+ "eval_loss": 1.1698851585388184,
35
+ "eval_runtime": 1023.3048,
36
+ "eval_samples_per_second": 110.275,
37
+ "eval_steps_per_second": 9.19,
38
+ "step": 10000
39
+ },
40
+ {
41
+ "epoch": 0.35,
42
+ "learning_rate": 2.7540979573763234e-05,
43
+ "loss": 1.1725,
44
+ "step": 15000
45
+ },
46
+ {
47
+ "epoch": 0.35,
48
+ "eval_accuracy": 0.723913311958313,
49
+ "eval_loss": 1.0914552211761475,
50
+ "eval_runtime": 1021.0004,
51
+ "eval_samples_per_second": 110.524,
52
+ "eval_steps_per_second": 9.211,
53
+ "step": 15000
54
+ },
55
+ {
56
+ "epoch": 0.47,
57
+ "learning_rate": 2.6311223360202048e-05,
58
+ "loss": 1.1066,
59
+ "step": 20000
60
+ },
61
+ {
62
+ "epoch": 0.47,
63
+ "eval_accuracy": 0.7364969849586487,
64
+ "eval_loss": 1.0439897775650024,
65
+ "eval_runtime": 1023.0505,
66
+ "eval_samples_per_second": 110.302,
67
+ "eval_steps_per_second": 9.192,
68
+ "step": 20000
69
+ },
70
+ {
71
+ "epoch": 0.79,
72
+ "learning_rate": 2.3351475380761303e-05,
73
+ "loss": 1.0399,
74
+ "step": 25000
75
+ },
76
+ {
77
+ "epoch": 0.79,
78
+ "eval_accuracy": 0.7490540146827698,
79
+ "eval_loss": 0.9834885001182556,
80
+ "eval_runtime": 1112.3584,
81
+ "eval_samples_per_second": 101.447,
82
+ "eval_steps_per_second": 6.341,
83
+ "step": 25000
84
+ },
85
+ {
86
+ "epoch": 0.95,
87
+ "learning_rate": 2.1689427361606846e-05,
88
+ "loss": 1.0012,
89
+ "step": 30000
90
+ },
91
+ {
92
+ "epoch": 0.95,
93
+ "eval_accuracy": 0.7581638693809509,
94
+ "eval_loss": 0.9408503770828247,
95
+ "eval_runtime": 1113.6514,
96
+ "eval_samples_per_second": 101.329,
97
+ "eval_steps_per_second": 6.333,
98
+ "step": 30000
99
+ },
100
+ {
101
+ "epoch": 1.1,
102
+ "learning_rate": 2.0027046799831514e-05,
103
+ "loss": 0.8971,
104
+ "step": 35000
105
+ },
106
+ {
107
+ "epoch": 1.1,
108
+ "eval_accuracy": 0.7663254737854004,
109
+ "eval_loss": 0.9190825819969177,
110
+ "eval_runtime": 1114.1162,
111
+ "eval_samples_per_second": 101.287,
112
+ "eval_steps_per_second": 6.331,
113
+ "step": 35000
114
+ },
115
+ {
116
+ "epoch": 1.26,
117
+ "learning_rate": 1.8364998780677056e-05,
118
+ "loss": 0.8405,
119
+ "step": 40000
120
+ },
121
+ {
122
+ "epoch": 1.26,
123
+ "eval_accuracy": 0.7693473100662231,
124
+ "eval_loss": 0.9057066440582275,
125
+ "eval_runtime": 1056.7123,
126
+ "eval_samples_per_second": 106.789,
127
+ "eval_steps_per_second": 6.674,
128
+ "step": 40000
129
+ },
130
+ {
131
+ "epoch": 1.42,
132
+ "learning_rate": 1.6702950761522602e-05,
133
+ "loss": 0.834,
134
+ "step": 45000
135
+ },
136
+ {
137
+ "epoch": 1.42,
138
+ "eval_accuracy": 0.775940477848053,
139
+ "eval_loss": 0.8832579851150513,
140
+ "eval_runtime": 1059.1014,
141
+ "eval_samples_per_second": 106.548,
142
+ "eval_steps_per_second": 6.659,
143
+ "step": 45000
144
+ },
145
+ {
146
+ "epoch": 1.58,
147
+ "learning_rate": 1.5040570199747268e-05,
148
+ "loss": 0.8212,
149
+ "step": 50000
150
+ },
151
+ {
152
+ "epoch": 1.58,
153
+ "eval_accuracy": 0.7792636156082153,
154
+ "eval_loss": 0.866415798664093,
155
+ "eval_runtime": 1054.2119,
156
+ "eval_samples_per_second": 107.042,
157
+ "eval_steps_per_second": 6.69,
158
+ "step": 50000
159
+ },
160
+ {
161
+ "epoch": 1.73,
162
+ "learning_rate": 1.3378854723213692e-05,
163
+ "loss": 0.8133,
164
+ "step": 55000
165
+ },
166
+ {
167
+ "epoch": 1.73,
168
+ "eval_accuracy": 0.7830652594566345,
169
+ "eval_loss": 0.8482676148414612,
170
+ "eval_runtime": 1054.5931,
171
+ "eval_samples_per_second": 107.003,
172
+ "eval_steps_per_second": 6.688,
173
+ "step": 55000
174
+ },
175
+ {
176
+ "epoch": 1.89,
177
+ "learning_rate": 1.1716474161438358e-05,
178
+ "loss": 0.7947,
179
+ "step": 60000
180
+ },
181
+ {
182
+ "epoch": 1.89,
183
+ "eval_accuracy": 0.7862643599510193,
184
+ "eval_loss": 0.8344442844390869,
185
+ "eval_runtime": 1058.4611,
186
+ "eval_samples_per_second": 106.612,
187
+ "eval_steps_per_second": 6.663,
188
+ "step": 60000
189
+ },
190
+ {
191
+ "epoch": 2.05,
192
+ "learning_rate": 1.0054758684904782e-05,
193
+ "loss": 0.7446,
194
+ "step": 65000
195
+ },
196
+ {
197
+ "epoch": 2.05,
198
+ "eval_accuracy": 0.7890114784240723,
199
+ "eval_loss": 0.8323635458946228,
200
+ "eval_runtime": 1054.4779,
201
+ "eval_samples_per_second": 107.015,
202
+ "eval_steps_per_second": 6.689,
203
+ "step": 65000
204
+ },
205
+ {
206
+ "epoch": 2.21,
207
+ "learning_rate": 8.392378123129448e-06,
208
+ "loss": 0.6463,
209
+ "step": 70000
210
+ },
211
+ {
212
+ "epoch": 2.21,
213
+ "eval_accuracy": 0.7913243770599365,
214
+ "eval_loss": 0.8241144418716431,
215
+ "eval_runtime": 1057.5768,
216
+ "eval_samples_per_second": 106.701,
217
+ "eval_steps_per_second": 6.669,
218
+ "step": 70000
219
+ },
220
+ {
221
+ "epoch": 2.36,
222
+ "learning_rate": 6.7306626465958724e-06,
223
+ "loss": 0.6461,
224
+ "step": 75000
225
+ },
226
+ {
227
+ "epoch": 2.36,
228
+ "eval_accuracy": 0.792999267578125,
229
+ "eval_loss": 0.8187506794929504,
230
+ "eval_runtime": 1055.2791,
231
+ "eval_samples_per_second": 106.934,
232
+ "eval_steps_per_second": 6.684,
233
+ "step": 75000
234
+ },
235
+ {
236
+ "epoch": 2.52,
237
+ "learning_rate": 5.068282084820538e-06,
238
+ "loss": 0.6403,
239
+ "step": 80000
240
+ },
241
+ {
242
+ "epoch": 2.52,
243
+ "eval_accuracy": 0.7952146530151367,
244
+ "eval_loss": 0.8096941709518433,
245
+ "eval_runtime": 1058.933,
246
+ "eval_samples_per_second": 106.565,
247
+ "eval_steps_per_second": 6.66,
248
+ "step": 80000
249
+ },
250
+ {
251
+ "epoch": 2.68,
252
+ "learning_rate": 3.406566608286962e-06,
253
+ "loss": 0.6294,
254
+ "step": 85000
255
+ },
256
+ {
257
+ "epoch": 2.68,
258
+ "eval_accuracy": 0.7971464991569519,
259
+ "eval_loss": 0.8044777512550354,
260
+ "eval_runtime": 1078.5713,
261
+ "eval_samples_per_second": 104.625,
262
+ "eval_steps_per_second": 6.539,
263
+ "step": 85000
264
+ },
265
+ {
266
+ "epoch": 2.84,
267
+ "learning_rate": 1.744186046511628e-06,
268
+ "loss": 0.6198,
269
+ "step": 90000
270
+ },
271
+ {
272
+ "epoch": 2.84,
273
+ "eval_accuracy": 0.7985377907752991,
274
+ "eval_loss": 0.7970817685127258,
275
+ "eval_runtime": 1091.9096,
276
+ "eval_samples_per_second": 103.346,
277
+ "eval_steps_per_second": 6.459,
278
+ "step": 90000
279
+ },
280
+ {
281
+ "epoch": 2.99,
282
+ "learning_rate": 8.213802735717294e-08,
283
+ "loss": 0.6169,
284
+ "step": 95000
285
+ },
286
+ {
287
+ "epoch": 2.99,
288
+ "eval_accuracy": 0.7989366054534912,
289
+ "eval_loss": 0.794947624206543,
290
+ "eval_runtime": 1091.2301,
291
+ "eval_samples_per_second": 103.411,
292
+ "eval_steps_per_second": 6.463,
293
+ "step": 95000
294
+ },
295
+ {
296
+ "epoch": 3.0,
297
+ "step": 95214,
298
+ "total_flos": 2.918866335440248e+17,
299
+ "train_loss": 0.06961118817632395,
300
+ "train_runtime": 8353.1581,
301
+ "train_samples_per_second": 364.749,
302
+ "train_steps_per_second": 11.399
303
+ }
304
+ ],
305
+ "max_steps": 95214,
306
+ "num_train_epochs": 3,
307
+ "total_flos": 2.918866335440248e+17,
308
+ "trial_name": null,
309
+ "trial_params": null
310
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff