|
{ |
|
"best_metric": 1.4095008373260498, |
|
"best_model_checkpoint": "./results/checkpoint-2000", |
|
"epoch": 1.9038553069966682, |
|
"eval_steps": 100, |
|
"global_step": 2000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.00951927653498334, |
|
"grad_norm": 0.19429340958595276, |
|
"learning_rate": 5e-06, |
|
"loss": 1.9923, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.01903855306996668, |
|
"grad_norm": 0.22915661334991455, |
|
"learning_rate": 1e-05, |
|
"loss": 2.0345, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.028557829604950024, |
|
"grad_norm": 0.24586208164691925, |
|
"learning_rate": 1.5e-05, |
|
"loss": 1.9754, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.03807710613993336, |
|
"grad_norm": 0.2809593677520752, |
|
"learning_rate": 2e-05, |
|
"loss": 1.9887, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.047596382674916705, |
|
"grad_norm": 0.3456374406814575, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.9913, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.05711565920990005, |
|
"grad_norm": 0.43767085671424866, |
|
"learning_rate": 3e-05, |
|
"loss": 1.9953, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.06663493574488338, |
|
"grad_norm": 0.5941727757453918, |
|
"learning_rate": 3.5e-05, |
|
"loss": 1.9795, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.07615421227986673, |
|
"grad_norm": 0.6205775141716003, |
|
"learning_rate": 4e-05, |
|
"loss": 1.9804, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.08567348881485007, |
|
"grad_norm": 0.7349818348884583, |
|
"learning_rate": 4.5e-05, |
|
"loss": 1.9511, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.09519276534983341, |
|
"grad_norm": 0.7896203398704529, |
|
"learning_rate": 5e-05, |
|
"loss": 1.9337, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.09519276534983341, |
|
"eval_loss": 1.9389821290969849, |
|
"eval_runtime": 39.6995, |
|
"eval_samples_per_second": 2.519, |
|
"eval_steps_per_second": 0.63, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.10471204188481675, |
|
"grad_norm": 0.8368595242500305, |
|
"learning_rate": 5.500000000000001e-05, |
|
"loss": 1.9052, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.1142313184198001, |
|
"grad_norm": 0.9555522799491882, |
|
"learning_rate": 6e-05, |
|
"loss": 1.8766, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.12375059495478344, |
|
"grad_norm": 0.9343065023422241, |
|
"learning_rate": 6.500000000000001e-05, |
|
"loss": 1.8416, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.13326987148976677, |
|
"grad_norm": 0.82816481590271, |
|
"learning_rate": 7e-05, |
|
"loss": 1.8246, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.14278914802475012, |
|
"grad_norm": 0.7882226705551147, |
|
"learning_rate": 7.500000000000001e-05, |
|
"loss": 1.7909, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.15230842455973345, |
|
"grad_norm": 0.6617611646652222, |
|
"learning_rate": 8e-05, |
|
"loss": 1.7938, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.1618277010947168, |
|
"grad_norm": 0.598513126373291, |
|
"learning_rate": 8.5e-05, |
|
"loss": 1.759, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.17134697762970014, |
|
"grad_norm": 0.45931604504585266, |
|
"learning_rate": 9e-05, |
|
"loss": 1.7542, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.1808662541646835, |
|
"grad_norm": 0.38422173261642456, |
|
"learning_rate": 9.5e-05, |
|
"loss": 1.7525, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.19038553069966682, |
|
"grad_norm": 0.31998202204704285, |
|
"learning_rate": 0.0001, |
|
"loss": 1.732, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.19038553069966682, |
|
"eval_loss": 1.7440533638000488, |
|
"eval_runtime": 39.6462, |
|
"eval_samples_per_second": 2.522, |
|
"eval_steps_per_second": 0.631, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.19990480723465018, |
|
"grad_norm": 0.3325157165527344, |
|
"learning_rate": 0.000105, |
|
"loss": 1.7215, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.2094240837696335, |
|
"grad_norm": 0.31313714385032654, |
|
"learning_rate": 0.00011000000000000002, |
|
"loss": 1.7033, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.21894336030461684, |
|
"grad_norm": 0.3179369270801544, |
|
"learning_rate": 0.00011499999999999999, |
|
"loss": 1.7025, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.2284626368396002, |
|
"grad_norm": 0.3547224700450897, |
|
"learning_rate": 0.00012, |
|
"loss": 1.6833, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.23798191337458352, |
|
"grad_norm": 0.3367106020450592, |
|
"learning_rate": 0.000125, |
|
"loss": 1.6524, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.24750118990956688, |
|
"grad_norm": 0.38071829080581665, |
|
"learning_rate": 0.00013000000000000002, |
|
"loss": 1.6425, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.25702046644455023, |
|
"grad_norm": 0.34949544072151184, |
|
"learning_rate": 0.00013500000000000003, |
|
"loss": 1.6218, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.26653974297953353, |
|
"grad_norm": 0.32234707474708557, |
|
"learning_rate": 0.00014, |
|
"loss": 1.6284, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.2760590195145169, |
|
"grad_norm": 0.3492746949195862, |
|
"learning_rate": 0.000145, |
|
"loss": 1.6053, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.28557829604950025, |
|
"grad_norm": 0.3380492031574249, |
|
"learning_rate": 0.00015000000000000001, |
|
"loss": 1.5728, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.28557829604950025, |
|
"eval_loss": 1.5842629671096802, |
|
"eval_runtime": 39.8763, |
|
"eval_samples_per_second": 2.508, |
|
"eval_steps_per_second": 0.627, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.2950975725844836, |
|
"grad_norm": 0.3693602383136749, |
|
"learning_rate": 0.000155, |
|
"loss": 1.5769, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.3046168491194669, |
|
"grad_norm": 0.3339674174785614, |
|
"learning_rate": 0.00016, |
|
"loss": 1.5691, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.31413612565445026, |
|
"grad_norm": 0.33094632625579834, |
|
"learning_rate": 0.000165, |
|
"loss": 1.5501, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.3236554021894336, |
|
"grad_norm": 0.3607189953327179, |
|
"learning_rate": 0.00017, |
|
"loss": 1.5373, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.3331746787244169, |
|
"grad_norm": 0.34884127974510193, |
|
"learning_rate": 0.000175, |
|
"loss": 1.5084, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.3426939552594003, |
|
"grad_norm": 0.33757245540618896, |
|
"learning_rate": 0.00018, |
|
"loss": 1.5158, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.35221323179438363, |
|
"grad_norm": 0.34877315163612366, |
|
"learning_rate": 0.00018500000000000002, |
|
"loss": 1.5002, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.361732508329367, |
|
"grad_norm": 0.38293707370758057, |
|
"learning_rate": 0.00019, |
|
"loss": 1.4845, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.3712517848643503, |
|
"grad_norm": 0.3441324830055237, |
|
"learning_rate": 0.000195, |
|
"loss": 1.4849, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.38077106139933364, |
|
"grad_norm": 0.41154617071151733, |
|
"learning_rate": 0.0002, |
|
"loss": 1.4862, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.38077106139933364, |
|
"eval_loss": 1.4930579662322998, |
|
"eval_runtime": 39.6754, |
|
"eval_samples_per_second": 2.52, |
|
"eval_steps_per_second": 0.63, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.390290337934317, |
|
"grad_norm": 0.36156630516052246, |
|
"learning_rate": 0.0001995876288659794, |
|
"loss": 1.4872, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.39980961446930036, |
|
"grad_norm": 0.32177475094795227, |
|
"learning_rate": 0.00019917525773195877, |
|
"loss": 1.457, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.40932889100428366, |
|
"grad_norm": 0.41120991110801697, |
|
"learning_rate": 0.00019876288659793816, |
|
"loss": 1.4794, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.418848167539267, |
|
"grad_norm": 0.3899654448032379, |
|
"learning_rate": 0.00019835051546391753, |
|
"loss": 1.4728, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.42836744407425037, |
|
"grad_norm": 0.3392334580421448, |
|
"learning_rate": 0.00019793814432989693, |
|
"loss": 1.4837, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.43788672060923367, |
|
"grad_norm": 0.35381370782852173, |
|
"learning_rate": 0.00019752577319587632, |
|
"loss": 1.4722, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.447405997144217, |
|
"grad_norm": 0.3540886342525482, |
|
"learning_rate": 0.0001971134020618557, |
|
"loss": 1.4608, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.4569252736792004, |
|
"grad_norm": 0.38124117255210876, |
|
"learning_rate": 0.00019670103092783505, |
|
"loss": 1.4648, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.46644455021418374, |
|
"grad_norm": 0.34540703892707825, |
|
"learning_rate": 0.00019628865979381442, |
|
"loss": 1.4651, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.47596382674916704, |
|
"grad_norm": 0.34259673953056335, |
|
"learning_rate": 0.00019587628865979381, |
|
"loss": 1.442, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.47596382674916704, |
|
"eval_loss": 1.4635019302368164, |
|
"eval_runtime": 39.7119, |
|
"eval_samples_per_second": 2.518, |
|
"eval_steps_per_second": 0.63, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.4854831032841504, |
|
"grad_norm": 0.39079272747039795, |
|
"learning_rate": 0.0001954639175257732, |
|
"loss": 1.4676, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.49500237981913375, |
|
"grad_norm": 0.3873017132282257, |
|
"learning_rate": 0.00019505154639175258, |
|
"loss": 1.4491, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.504521656354117, |
|
"grad_norm": 0.46681663393974304, |
|
"learning_rate": 0.00019463917525773197, |
|
"loss": 1.4483, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.5140409328891005, |
|
"grad_norm": 0.3657790720462799, |
|
"learning_rate": 0.00019422680412371134, |
|
"loss": 1.4402, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.5235602094240838, |
|
"grad_norm": 0.3777405321598053, |
|
"learning_rate": 0.00019381443298969073, |
|
"loss": 1.4295, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.5330794859590671, |
|
"grad_norm": 0.3857463002204895, |
|
"learning_rate": 0.00019340206185567012, |
|
"loss": 1.4548, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.5425987624940505, |
|
"grad_norm": 0.36316171288490295, |
|
"learning_rate": 0.0001929896907216495, |
|
"loss": 1.4567, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.5521180390290338, |
|
"grad_norm": 0.37083303928375244, |
|
"learning_rate": 0.00019257731958762889, |
|
"loss": 1.4458, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.5616373155640171, |
|
"grad_norm": 0.39506301283836365, |
|
"learning_rate": 0.00019216494845360825, |
|
"loss": 1.4364, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.5711565920990005, |
|
"grad_norm": 0.3553004264831543, |
|
"learning_rate": 0.00019175257731958765, |
|
"loss": 1.4676, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.5711565920990005, |
|
"eval_loss": 1.449947714805603, |
|
"eval_runtime": 39.6196, |
|
"eval_samples_per_second": 2.524, |
|
"eval_steps_per_second": 0.631, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.5806758686339838, |
|
"grad_norm": 0.4231698513031006, |
|
"learning_rate": 0.00019134020618556704, |
|
"loss": 1.4348, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.5901951451689672, |
|
"grad_norm": 0.4203357994556427, |
|
"learning_rate": 0.0001909278350515464, |
|
"loss": 1.4455, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.5997144217039505, |
|
"grad_norm": 0.39158034324645996, |
|
"learning_rate": 0.00019051546391752577, |
|
"loss": 1.4594, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.6092336982389338, |
|
"grad_norm": 0.39505264163017273, |
|
"learning_rate": 0.00019010309278350514, |
|
"loss": 1.4653, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.6187529747739172, |
|
"grad_norm": 0.3803844153881073, |
|
"learning_rate": 0.00018969072164948454, |
|
"loss": 1.4335, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.6282722513089005, |
|
"grad_norm": 0.3700083792209625, |
|
"learning_rate": 0.00018927835051546393, |
|
"loss": 1.4409, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.6377915278438838, |
|
"grad_norm": 0.35323163866996765, |
|
"learning_rate": 0.0001888659793814433, |
|
"loss": 1.4456, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.6473108043788672, |
|
"grad_norm": 0.34639179706573486, |
|
"learning_rate": 0.0001884536082474227, |
|
"loss": 1.441, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.6568300809138505, |
|
"grad_norm": 0.39847052097320557, |
|
"learning_rate": 0.00018804123711340206, |
|
"loss": 1.4214, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.6663493574488338, |
|
"grad_norm": 0.3468664884567261, |
|
"learning_rate": 0.00018762886597938145, |
|
"loss": 1.4398, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.6663493574488338, |
|
"eval_loss": 1.4422180652618408, |
|
"eval_runtime": 39.7751, |
|
"eval_samples_per_second": 2.514, |
|
"eval_steps_per_second": 0.629, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.6758686339838172, |
|
"grad_norm": 0.35168835520744324, |
|
"learning_rate": 0.00018721649484536085, |
|
"loss": 1.436, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.6853879105188005, |
|
"grad_norm": 0.3769698143005371, |
|
"learning_rate": 0.0001868041237113402, |
|
"loss": 1.4424, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.694907187053784, |
|
"grad_norm": 0.4488052725791931, |
|
"learning_rate": 0.0001863917525773196, |
|
"loss": 1.4509, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.7044264635887673, |
|
"grad_norm": 0.34030118584632874, |
|
"learning_rate": 0.00018597938144329897, |
|
"loss": 1.4355, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.7139457401237506, |
|
"grad_norm": 0.3737122714519501, |
|
"learning_rate": 0.00018556701030927837, |
|
"loss": 1.4336, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.723465016658734, |
|
"grad_norm": 0.36228156089782715, |
|
"learning_rate": 0.00018515463917525776, |
|
"loss": 1.4371, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.7329842931937173, |
|
"grad_norm": 0.37088775634765625, |
|
"learning_rate": 0.00018474226804123713, |
|
"loss": 1.4445, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.7425035697287006, |
|
"grad_norm": 0.39574727416038513, |
|
"learning_rate": 0.0001843298969072165, |
|
"loss": 1.4252, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.752022846263684, |
|
"grad_norm": 0.41419413685798645, |
|
"learning_rate": 0.00018391752577319586, |
|
"loss": 1.4173, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.7615421227986673, |
|
"grad_norm": 0.3290116786956787, |
|
"learning_rate": 0.00018350515463917526, |
|
"loss": 1.4544, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.7615421227986673, |
|
"eval_loss": 1.436285138130188, |
|
"eval_runtime": 39.7262, |
|
"eval_samples_per_second": 2.517, |
|
"eval_steps_per_second": 0.629, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.7710613993336506, |
|
"grad_norm": 0.38003116846084595, |
|
"learning_rate": 0.00018309278350515465, |
|
"loss": 1.425, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.780580675868634, |
|
"grad_norm": 0.3706168532371521, |
|
"learning_rate": 0.00018268041237113402, |
|
"loss": 1.4537, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.7900999524036173, |
|
"grad_norm": 0.36038973927497864, |
|
"learning_rate": 0.0001822680412371134, |
|
"loss": 1.4188, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.7996192289386007, |
|
"grad_norm": 0.36866557598114014, |
|
"learning_rate": 0.00018185567010309278, |
|
"loss": 1.4246, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.809138505473584, |
|
"grad_norm": 0.37158384919166565, |
|
"learning_rate": 0.00018144329896907217, |
|
"loss": 1.4323, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.8186577820085673, |
|
"grad_norm": 0.3713236153125763, |
|
"learning_rate": 0.00018103092783505157, |
|
"loss": 1.4407, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.8281770585435507, |
|
"grad_norm": 0.3830552101135254, |
|
"learning_rate": 0.00018061855670103093, |
|
"loss": 1.4427, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.837696335078534, |
|
"grad_norm": 0.3613452613353729, |
|
"learning_rate": 0.00018020618556701033, |
|
"loss": 1.445, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.8472156116135173, |
|
"grad_norm": 0.3281649351119995, |
|
"learning_rate": 0.0001797938144329897, |
|
"loss": 1.4311, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.8567348881485007, |
|
"grad_norm": 0.3342822194099426, |
|
"learning_rate": 0.0001793814432989691, |
|
"loss": 1.4455, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.8567348881485007, |
|
"eval_loss": 1.4309983253479004, |
|
"eval_runtime": 39.6671, |
|
"eval_samples_per_second": 2.521, |
|
"eval_steps_per_second": 0.63, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.866254164683484, |
|
"grad_norm": 0.3413979411125183, |
|
"learning_rate": 0.00017896907216494848, |
|
"loss": 1.4416, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.8757734412184673, |
|
"grad_norm": 0.36158689856529236, |
|
"learning_rate": 0.00017855670103092785, |
|
"loss": 1.4328, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.8852927177534508, |
|
"grad_norm": 0.3876676857471466, |
|
"learning_rate": 0.00017814432989690724, |
|
"loss": 1.4321, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.894811994288434, |
|
"grad_norm": 0.35580044984817505, |
|
"learning_rate": 0.00017773195876288658, |
|
"loss": 1.4377, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.9043312708234175, |
|
"grad_norm": 0.3638615906238556, |
|
"learning_rate": 0.00017731958762886598, |
|
"loss": 1.4222, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.9138505473584008, |
|
"grad_norm": 0.32744646072387695, |
|
"learning_rate": 0.00017690721649484537, |
|
"loss": 1.4235, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.9233698238933841, |
|
"grad_norm": 0.3513677716255188, |
|
"learning_rate": 0.00017649484536082474, |
|
"loss": 1.4511, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.9328891004283675, |
|
"grad_norm": 0.35235506296157837, |
|
"learning_rate": 0.00017608247422680413, |
|
"loss": 1.4274, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.9424083769633508, |
|
"grad_norm": 0.343514621257782, |
|
"learning_rate": 0.0001756701030927835, |
|
"loss": 1.417, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.9519276534983341, |
|
"grad_norm": 0.38501816987991333, |
|
"learning_rate": 0.0001752577319587629, |
|
"loss": 1.4422, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.9519276534983341, |
|
"eval_loss": 1.4280465841293335, |
|
"eval_runtime": 39.8059, |
|
"eval_samples_per_second": 2.512, |
|
"eval_steps_per_second": 0.628, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.9614469300333175, |
|
"grad_norm": 0.3453533351421356, |
|
"learning_rate": 0.0001748453608247423, |
|
"loss": 1.4134, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.9709662065683008, |
|
"grad_norm": 0.3687838017940521, |
|
"learning_rate": 0.00017443298969072165, |
|
"loss": 1.4357, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.9804854831032842, |
|
"grad_norm": 0.3397800624370575, |
|
"learning_rate": 0.00017402061855670105, |
|
"loss": 1.4221, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.9900047596382675, |
|
"grad_norm": 0.3534553647041321, |
|
"learning_rate": 0.00017360824742268042, |
|
"loss": 1.4232, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.9995240361732508, |
|
"grad_norm": 0.31195327639579773, |
|
"learning_rate": 0.0001731958762886598, |
|
"loss": 1.4316, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 1.009043312708234, |
|
"grad_norm": 0.3326282799243927, |
|
"learning_rate": 0.0001727835051546392, |
|
"loss": 1.4361, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 1.0185625892432175, |
|
"grad_norm": 0.38880521059036255, |
|
"learning_rate": 0.00017237113402061857, |
|
"loss": 1.4282, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 1.028081865778201, |
|
"grad_norm": 0.3405047357082367, |
|
"learning_rate": 0.00017195876288659796, |
|
"loss": 1.4295, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 1.0376011423131841, |
|
"grad_norm": 0.3320964574813843, |
|
"learning_rate": 0.0001715463917525773, |
|
"loss": 1.426, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 1.0471204188481675, |
|
"grad_norm": 0.3246203660964966, |
|
"learning_rate": 0.0001711340206185567, |
|
"loss": 1.4115, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.0471204188481675, |
|
"eval_loss": 1.4249218702316284, |
|
"eval_runtime": 39.6935, |
|
"eval_samples_per_second": 2.519, |
|
"eval_steps_per_second": 0.63, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.056639695383151, |
|
"grad_norm": 0.33676016330718994, |
|
"learning_rate": 0.0001707216494845361, |
|
"loss": 1.4429, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 1.0661589719181341, |
|
"grad_norm": 0.304193913936615, |
|
"learning_rate": 0.00017030927835051546, |
|
"loss": 1.4145, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 1.0756782484531175, |
|
"grad_norm": 0.3132353723049164, |
|
"learning_rate": 0.00016989690721649485, |
|
"loss": 1.3993, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 1.085197524988101, |
|
"grad_norm": 0.3249862492084503, |
|
"learning_rate": 0.00016948453608247422, |
|
"loss": 1.4079, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 1.0947168015230841, |
|
"grad_norm": 0.31127285957336426, |
|
"learning_rate": 0.00016907216494845361, |
|
"loss": 1.4189, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 1.1042360780580676, |
|
"grad_norm": 0.3150855302810669, |
|
"learning_rate": 0.000168659793814433, |
|
"loss": 1.4211, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 1.113755354593051, |
|
"grad_norm": 0.3354848325252533, |
|
"learning_rate": 0.00016824742268041238, |
|
"loss": 1.4146, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 1.1232746311280342, |
|
"grad_norm": 0.36233270168304443, |
|
"learning_rate": 0.00016783505154639177, |
|
"loss": 1.4289, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 1.1327939076630176, |
|
"grad_norm": 0.3264828622341156, |
|
"learning_rate": 0.00016742268041237114, |
|
"loss": 1.4021, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 1.142313184198001, |
|
"grad_norm": 0.3206409215927124, |
|
"learning_rate": 0.00016701030927835053, |
|
"loss": 1.4014, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.142313184198001, |
|
"eval_loss": 1.4221289157867432, |
|
"eval_runtime": 39.6868, |
|
"eval_samples_per_second": 2.52, |
|
"eval_steps_per_second": 0.63, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.1518324607329844, |
|
"grad_norm": 0.31770697236061096, |
|
"learning_rate": 0.00016659793814432993, |
|
"loss": 1.424, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 1.1613517372679676, |
|
"grad_norm": 0.313504695892334, |
|
"learning_rate": 0.0001661855670103093, |
|
"loss": 1.4229, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 1.170871013802951, |
|
"grad_norm": 0.35023483633995056, |
|
"learning_rate": 0.00016577319587628869, |
|
"loss": 1.4059, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 1.1803902903379344, |
|
"grad_norm": 0.3138754963874817, |
|
"learning_rate": 0.00016536082474226803, |
|
"loss": 1.4185, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 1.1899095668729176, |
|
"grad_norm": 0.31875547766685486, |
|
"learning_rate": 0.00016494845360824742, |
|
"loss": 1.4115, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 1.199428843407901, |
|
"grad_norm": 0.3276744484901428, |
|
"learning_rate": 0.00016453608247422681, |
|
"loss": 1.405, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 1.2089481199428844, |
|
"grad_norm": 0.3124449849128723, |
|
"learning_rate": 0.00016412371134020618, |
|
"loss": 1.4164, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 1.2184673964778676, |
|
"grad_norm": 0.31706124544143677, |
|
"learning_rate": 0.00016371134020618558, |
|
"loss": 1.413, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 1.227986673012851, |
|
"grad_norm": 0.3277345895767212, |
|
"learning_rate": 0.00016329896907216494, |
|
"loss": 1.4339, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 1.2375059495478344, |
|
"grad_norm": 0.3391578793525696, |
|
"learning_rate": 0.00016288659793814434, |
|
"loss": 1.414, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.2375059495478344, |
|
"eval_loss": 1.4196603298187256, |
|
"eval_runtime": 39.6612, |
|
"eval_samples_per_second": 2.521, |
|
"eval_steps_per_second": 0.63, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.2470252260828176, |
|
"grad_norm": 0.3561215400695801, |
|
"learning_rate": 0.00016247422680412373, |
|
"loss": 1.431, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 1.256544502617801, |
|
"grad_norm": 0.3222928047180176, |
|
"learning_rate": 0.0001620618556701031, |
|
"loss": 1.4207, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 1.2660637791527845, |
|
"grad_norm": 0.29916074872016907, |
|
"learning_rate": 0.0001616494845360825, |
|
"loss": 1.413, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 1.2755830556877679, |
|
"grad_norm": 0.31502220034599304, |
|
"learning_rate": 0.00016123711340206186, |
|
"loss": 1.4041, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 1.285102332222751, |
|
"grad_norm": 0.30248329043388367, |
|
"learning_rate": 0.00016082474226804125, |
|
"loss": 1.4136, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 1.2946216087577345, |
|
"grad_norm": 0.29750174283981323, |
|
"learning_rate": 0.00016041237113402065, |
|
"loss": 1.4149, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 1.3041408852927177, |
|
"grad_norm": 0.35403352975845337, |
|
"learning_rate": 0.00016, |
|
"loss": 1.434, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 1.313660161827701, |
|
"grad_norm": 0.3174794614315033, |
|
"learning_rate": 0.0001595876288659794, |
|
"loss": 1.4129, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 1.3231794383626845, |
|
"grad_norm": 0.31973496079444885, |
|
"learning_rate": 0.00015917525773195875, |
|
"loss": 1.4312, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 1.332698714897668, |
|
"grad_norm": 0.29147669672966003, |
|
"learning_rate": 0.00015876288659793814, |
|
"loss": 1.4203, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.332698714897668, |
|
"eval_loss": 1.4181098937988281, |
|
"eval_runtime": 39.59, |
|
"eval_samples_per_second": 2.526, |
|
"eval_steps_per_second": 0.631, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.342217991432651, |
|
"grad_norm": 0.3290899097919464, |
|
"learning_rate": 0.00015835051546391754, |
|
"loss": 1.4159, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 1.3517372679676345, |
|
"grad_norm": 0.2933942377567291, |
|
"learning_rate": 0.0001579381443298969, |
|
"loss": 1.4399, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 1.3612565445026177, |
|
"grad_norm": 0.3046482801437378, |
|
"learning_rate": 0.0001575257731958763, |
|
"loss": 1.4004, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 1.370775821037601, |
|
"grad_norm": 0.29072305560112, |
|
"learning_rate": 0.00015711340206185566, |
|
"loss": 1.4026, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 1.3802950975725845, |
|
"grad_norm": 0.32136955857276917, |
|
"learning_rate": 0.00015670103092783506, |
|
"loss": 1.4189, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 1.389814374107568, |
|
"grad_norm": 0.3227519094944, |
|
"learning_rate": 0.00015628865979381445, |
|
"loss": 1.417, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 1.399333650642551, |
|
"grad_norm": 0.32664725184440613, |
|
"learning_rate": 0.00015587628865979382, |
|
"loss": 1.41, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 1.4088529271775345, |
|
"grad_norm": 0.3103736340999603, |
|
"learning_rate": 0.0001554639175257732, |
|
"loss": 1.4175, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 1.418372203712518, |
|
"grad_norm": 0.30146801471710205, |
|
"learning_rate": 0.00015505154639175258, |
|
"loss": 1.4281, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 1.4278914802475011, |
|
"grad_norm": 0.3465326726436615, |
|
"learning_rate": 0.00015463917525773197, |
|
"loss": 1.3968, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.4278914802475011, |
|
"eval_loss": 1.4155893325805664, |
|
"eval_runtime": 39.6723, |
|
"eval_samples_per_second": 2.521, |
|
"eval_steps_per_second": 0.63, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.4374107567824845, |
|
"grad_norm": 0.33584079146385193, |
|
"learning_rate": 0.00015422680412371137, |
|
"loss": 1.4215, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 1.446930033317468, |
|
"grad_norm": 0.3028492331504822, |
|
"learning_rate": 0.00015381443298969073, |
|
"loss": 1.4143, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 1.4564493098524511, |
|
"grad_norm": 0.29686522483825684, |
|
"learning_rate": 0.00015340206185567013, |
|
"loss": 1.4249, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 1.4659685863874345, |
|
"grad_norm": 0.30677148699760437, |
|
"learning_rate": 0.00015298969072164947, |
|
"loss": 1.4225, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 1.475487862922418, |
|
"grad_norm": 0.3218235969543457, |
|
"learning_rate": 0.00015257731958762886, |
|
"loss": 1.3925, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 1.4850071394574011, |
|
"grad_norm": 0.29073500633239746, |
|
"learning_rate": 0.00015216494845360826, |
|
"loss": 1.4068, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 1.4945264159923846, |
|
"grad_norm": 0.3078315258026123, |
|
"learning_rate": 0.00015175257731958762, |
|
"loss": 1.404, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 1.504045692527368, |
|
"grad_norm": 0.30224424600601196, |
|
"learning_rate": 0.00015134020618556702, |
|
"loss": 1.4244, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 1.5135649690623514, |
|
"grad_norm": 0.3163105845451355, |
|
"learning_rate": 0.00015092783505154638, |
|
"loss": 1.4099, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 1.5230842455973346, |
|
"grad_norm": 0.289919376373291, |
|
"learning_rate": 0.00015051546391752578, |
|
"loss": 1.4235, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.5230842455973346, |
|
"eval_loss": 1.4139596223831177, |
|
"eval_runtime": 39.6382, |
|
"eval_samples_per_second": 2.523, |
|
"eval_steps_per_second": 0.631, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.532603522132318, |
|
"grad_norm": 0.2937719225883484, |
|
"learning_rate": 0.00015010309278350517, |
|
"loss": 1.4112, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 1.5421227986673012, |
|
"grad_norm": 0.30618056654930115, |
|
"learning_rate": 0.00014969072164948454, |
|
"loss": 1.41, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 1.5516420752022846, |
|
"grad_norm": 0.3118852972984314, |
|
"learning_rate": 0.00014927835051546393, |
|
"loss": 1.4234, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 1.561161351737268, |
|
"grad_norm": 0.30171871185302734, |
|
"learning_rate": 0.0001488659793814433, |
|
"loss": 1.4151, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 1.5706806282722514, |
|
"grad_norm": 0.3201466500759125, |
|
"learning_rate": 0.0001484536082474227, |
|
"loss": 1.4054, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 1.5801999048072346, |
|
"grad_norm": 0.3177640736103058, |
|
"learning_rate": 0.0001480412371134021, |
|
"loss": 1.4043, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 1.589719181342218, |
|
"grad_norm": 0.3199518918991089, |
|
"learning_rate": 0.00014762886597938146, |
|
"loss": 1.4049, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 1.5992384578772012, |
|
"grad_norm": 0.30514276027679443, |
|
"learning_rate": 0.00014721649484536085, |
|
"loss": 1.4103, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 1.6087577344121846, |
|
"grad_norm": 0.31192877888679504, |
|
"learning_rate": 0.0001468041237113402, |
|
"loss": 1.4173, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 1.618277010947168, |
|
"grad_norm": 0.2868764400482178, |
|
"learning_rate": 0.00014639175257731958, |
|
"loss": 1.4154, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.618277010947168, |
|
"eval_loss": 1.4129725694656372, |
|
"eval_runtime": 39.6482, |
|
"eval_samples_per_second": 2.522, |
|
"eval_steps_per_second": 0.631, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.6277962874821514, |
|
"grad_norm": 0.2957026958465576, |
|
"learning_rate": 0.00014597938144329898, |
|
"loss": 1.413, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 1.6373155640171349, |
|
"grad_norm": 0.2958736717700958, |
|
"learning_rate": 0.00014556701030927834, |
|
"loss": 1.4209, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 1.646834840552118, |
|
"grad_norm": 0.31275510787963867, |
|
"learning_rate": 0.00014515463917525774, |
|
"loss": 1.4022, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 1.6563541170871012, |
|
"grad_norm": 0.32319965958595276, |
|
"learning_rate": 0.0001447422680412371, |
|
"loss": 1.3997, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 1.6658733936220846, |
|
"grad_norm": 0.29642221331596375, |
|
"learning_rate": 0.0001443298969072165, |
|
"loss": 1.4032, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 1.675392670157068, |
|
"grad_norm": 0.32867589592933655, |
|
"learning_rate": 0.0001439175257731959, |
|
"loss": 1.4203, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 1.6849119466920515, |
|
"grad_norm": 0.31527167558670044, |
|
"learning_rate": 0.00014350515463917526, |
|
"loss": 1.4157, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 1.6944312232270349, |
|
"grad_norm": 0.3072745203971863, |
|
"learning_rate": 0.00014309278350515465, |
|
"loss": 1.3823, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 1.703950499762018, |
|
"grad_norm": 0.28791388869285583, |
|
"learning_rate": 0.00014268041237113402, |
|
"loss": 1.4317, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 1.7134697762970015, |
|
"grad_norm": 0.28491660952568054, |
|
"learning_rate": 0.00014226804123711342, |
|
"loss": 1.4128, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.7134697762970015, |
|
"eval_loss": 1.4114233255386353, |
|
"eval_runtime": 39.5879, |
|
"eval_samples_per_second": 2.526, |
|
"eval_steps_per_second": 0.632, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.7229890528319847, |
|
"grad_norm": 0.2969178259372711, |
|
"learning_rate": 0.0001418556701030928, |
|
"loss": 1.4014, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 1.732508329366968, |
|
"grad_norm": 0.29975786805152893, |
|
"learning_rate": 0.00014144329896907218, |
|
"loss": 1.3926, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 1.7420276059019515, |
|
"grad_norm": 0.31133607029914856, |
|
"learning_rate": 0.00014103092783505157, |
|
"loss": 1.4063, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 1.751546882436935, |
|
"grad_norm": 0.29973700642585754, |
|
"learning_rate": 0.0001406185567010309, |
|
"loss": 1.4148, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 1.761066158971918, |
|
"grad_norm": 0.2864433228969574, |
|
"learning_rate": 0.0001402061855670103, |
|
"loss": 1.4032, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 1.7705854355069015, |
|
"grad_norm": 0.2933174669742584, |
|
"learning_rate": 0.0001397938144329897, |
|
"loss": 1.4284, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 1.7801047120418847, |
|
"grad_norm": 0.2991732060909271, |
|
"learning_rate": 0.00013938144329896907, |
|
"loss": 1.4039, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 1.789623988576868, |
|
"grad_norm": 0.2981884479522705, |
|
"learning_rate": 0.00013896907216494846, |
|
"loss": 1.3952, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 1.7991432651118515, |
|
"grad_norm": 0.29072457551956177, |
|
"learning_rate": 0.00013855670103092783, |
|
"loss": 1.4121, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 1.808662541646835, |
|
"grad_norm": 0.3064673840999603, |
|
"learning_rate": 0.00013814432989690722, |
|
"loss": 1.4029, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 1.808662541646835, |
|
"eval_loss": 1.4104112386703491, |
|
"eval_runtime": 39.5819, |
|
"eval_samples_per_second": 2.526, |
|
"eval_steps_per_second": 0.632, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 1.8181818181818183, |
|
"grad_norm": 0.28513726592063904, |
|
"learning_rate": 0.00013773195876288661, |
|
"loss": 1.4098, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 1.8277010947168015, |
|
"grad_norm": 0.3132310211658478, |
|
"learning_rate": 0.00013731958762886598, |
|
"loss": 1.4157, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 1.8372203712517847, |
|
"grad_norm": 0.2846241295337677, |
|
"learning_rate": 0.00013690721649484538, |
|
"loss": 1.4045, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 1.8467396477867681, |
|
"grad_norm": 0.2918606996536255, |
|
"learning_rate": 0.00013649484536082474, |
|
"loss": 1.4107, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 1.8562589243217515, |
|
"grad_norm": 0.28303080797195435, |
|
"learning_rate": 0.00013608247422680414, |
|
"loss": 1.4221, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 1.865778200856735, |
|
"grad_norm": 0.2898513674736023, |
|
"learning_rate": 0.00013567010309278353, |
|
"loss": 1.3937, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 1.8752974773917184, |
|
"grad_norm": 0.2937197685241699, |
|
"learning_rate": 0.0001352577319587629, |
|
"loss": 1.4134, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 1.8848167539267016, |
|
"grad_norm": 0.2756626605987549, |
|
"learning_rate": 0.0001348453608247423, |
|
"loss": 1.4085, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 1.8943360304616848, |
|
"grad_norm": 0.28634968400001526, |
|
"learning_rate": 0.00013443298969072166, |
|
"loss": 1.3999, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 1.9038553069966682, |
|
"grad_norm": 0.28319478034973145, |
|
"learning_rate": 0.00013402061855670103, |
|
"loss": 1.3935, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.9038553069966682, |
|
"eval_loss": 1.4095008373260498, |
|
"eval_runtime": 39.6055, |
|
"eval_samples_per_second": 2.525, |
|
"eval_steps_per_second": 0.631, |
|
"step": 2000 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 5250, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.296083329417216e+18, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|