|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.0, |
|
"eval_steps": 500, |
|
"global_step": 1632, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0012254901960784314, |
|
"grad_norm": 1.6059832937437346, |
|
"learning_rate": 6.0975609756097564e-06, |
|
"loss": 1.3541, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.006127450980392157, |
|
"grad_norm": 1.5281382037952618, |
|
"learning_rate": 3.048780487804878e-05, |
|
"loss": 1.3583, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.012254901960784314, |
|
"grad_norm": 0.5993529609680898, |
|
"learning_rate": 6.097560975609756e-05, |
|
"loss": 1.3375, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.01838235294117647, |
|
"grad_norm": 0.7098761948973563, |
|
"learning_rate": 9.146341463414634e-05, |
|
"loss": 1.2793, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.024509803921568627, |
|
"grad_norm": 0.4448928165735333, |
|
"learning_rate": 0.00012195121951219512, |
|
"loss": 1.2037, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.030637254901960783, |
|
"grad_norm": 0.40161131951951523, |
|
"learning_rate": 0.0001524390243902439, |
|
"loss": 1.1336, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.03676470588235294, |
|
"grad_norm": 0.2541806870693389, |
|
"learning_rate": 0.00018292682926829268, |
|
"loss": 1.1063, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.0428921568627451, |
|
"grad_norm": 0.1491059867354642, |
|
"learning_rate": 0.00021341463414634146, |
|
"loss": 1.0795, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.049019607843137254, |
|
"grad_norm": 0.14162648176415202, |
|
"learning_rate": 0.00024390243902439024, |
|
"loss": 1.0729, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.05514705882352941, |
|
"grad_norm": 0.14138145278805817, |
|
"learning_rate": 0.00027439024390243905, |
|
"loss": 1.0504, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.061274509803921566, |
|
"grad_norm": 0.11598136180137979, |
|
"learning_rate": 0.0003048780487804878, |
|
"loss": 1.0378, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.06740196078431372, |
|
"grad_norm": 0.14589294245833054, |
|
"learning_rate": 0.0003353658536585366, |
|
"loss": 1.0446, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.07352941176470588, |
|
"grad_norm": 0.14730649378552943, |
|
"learning_rate": 0.00036585365853658537, |
|
"loss": 1.0333, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.07965686274509803, |
|
"grad_norm": 0.11759388636190853, |
|
"learning_rate": 0.0003963414634146342, |
|
"loss": 1.0143, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.0857843137254902, |
|
"grad_norm": 0.12311842814802663, |
|
"learning_rate": 0.0004268292682926829, |
|
"loss": 1.0148, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.09191176470588236, |
|
"grad_norm": 0.11751325803408062, |
|
"learning_rate": 0.00045731707317073173, |
|
"loss": 1.0176, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.09803921568627451, |
|
"grad_norm": 0.13732149342206665, |
|
"learning_rate": 0.0004878048780487805, |
|
"loss": 1.0042, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.10416666666666667, |
|
"grad_norm": 0.11309991089195273, |
|
"learning_rate": 0.0005182926829268293, |
|
"loss": 1.0065, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.11029411764705882, |
|
"grad_norm": 0.12583526404678766, |
|
"learning_rate": 0.0005487804878048781, |
|
"loss": 1.0149, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.11642156862745098, |
|
"grad_norm": 0.12017272173906957, |
|
"learning_rate": 0.0005792682926829268, |
|
"loss": 0.9948, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.12254901960784313, |
|
"grad_norm": 0.11719746827044987, |
|
"learning_rate": 0.0006097560975609756, |
|
"loss": 0.9905, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.12867647058823528, |
|
"grad_norm": 0.13729668356371552, |
|
"learning_rate": 0.0006402439024390244, |
|
"loss": 0.9977, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.13480392156862744, |
|
"grad_norm": 0.12981775917103477, |
|
"learning_rate": 0.0006707317073170732, |
|
"loss": 1.0017, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.1409313725490196, |
|
"grad_norm": 0.13870526825011106, |
|
"learning_rate": 0.0007012195121951219, |
|
"loss": 0.9997, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.14705882352941177, |
|
"grad_norm": 0.12747253256734123, |
|
"learning_rate": 0.0007317073170731707, |
|
"loss": 0.9877, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.15318627450980393, |
|
"grad_norm": 0.12869096982052203, |
|
"learning_rate": 0.0007621951219512195, |
|
"loss": 0.9823, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.15931372549019607, |
|
"grad_norm": 0.12889388640011323, |
|
"learning_rate": 0.0007926829268292683, |
|
"loss": 0.9774, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.16544117647058823, |
|
"grad_norm": 0.13890909917337924, |
|
"learning_rate": 0.000823170731707317, |
|
"loss": 0.9874, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.1715686274509804, |
|
"grad_norm": 0.167350127299137, |
|
"learning_rate": 0.0008536585365853659, |
|
"loss": 0.9855, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.17769607843137256, |
|
"grad_norm": 0.12637500439423752, |
|
"learning_rate": 0.0008841463414634147, |
|
"loss": 0.9837, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.18382352941176472, |
|
"grad_norm": 0.13036804871454152, |
|
"learning_rate": 0.0009146341463414635, |
|
"loss": 0.9846, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.18995098039215685, |
|
"grad_norm": 0.13716320964191064, |
|
"learning_rate": 0.0009451219512195122, |
|
"loss": 0.9742, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.19607843137254902, |
|
"grad_norm": 0.1505410439189676, |
|
"learning_rate": 0.000975609756097561, |
|
"loss": 0.9657, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.20220588235294118, |
|
"grad_norm": 0.13939847463577426, |
|
"learning_rate": 0.0009999988550474805, |
|
"loss": 0.982, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.20833333333333334, |
|
"grad_norm": 0.1723912783342495, |
|
"learning_rate": 0.000999958782259877, |
|
"loss": 0.9771, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.21446078431372548, |
|
"grad_norm": 0.20339647033601935, |
|
"learning_rate": 0.0009998614670898504, |
|
"loss": 0.9742, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.22058823529411764, |
|
"grad_norm": 0.1805520187680447, |
|
"learning_rate": 0.0009997069206794246, |
|
"loss": 0.9724, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.2267156862745098, |
|
"grad_norm": 0.14101202234864954, |
|
"learning_rate": 0.000999495160723267, |
|
"loss": 0.9699, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.23284313725490197, |
|
"grad_norm": 0.14408773000983136, |
|
"learning_rate": 0.0009992262114666653, |
|
"loss": 0.9721, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.23897058823529413, |
|
"grad_norm": 0.12211770445331499, |
|
"learning_rate": 0.0009989001037027502, |
|
"loss": 0.9638, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.24509803921568626, |
|
"grad_norm": 0.11058292325672553, |
|
"learning_rate": 0.0009985168747689707, |
|
"loss": 0.9613, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.2512254901960784, |
|
"grad_norm": 0.151326097792135, |
|
"learning_rate": 0.0009980765685428175, |
|
"loss": 0.9784, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.25735294117647056, |
|
"grad_norm": 0.13748406413323222, |
|
"learning_rate": 0.0009975792354368017, |
|
"loss": 0.9578, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.26348039215686275, |
|
"grad_norm": 0.13529789016829663, |
|
"learning_rate": 0.000997024932392681, |
|
"loss": 0.9561, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.2696078431372549, |
|
"grad_norm": 0.11913360946345405, |
|
"learning_rate": 0.0009964137228749407, |
|
"loss": 0.9587, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.2757352941176471, |
|
"grad_norm": 0.11311287494378686, |
|
"learning_rate": 0.0009957456768635274, |
|
"loss": 0.9534, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.2818627450980392, |
|
"grad_norm": 0.1687787460034738, |
|
"learning_rate": 0.000995020870845837, |
|
"loss": 0.9483, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.28799019607843135, |
|
"grad_norm": 0.13894852921173279, |
|
"learning_rate": 0.000994239387807957, |
|
"loss": 0.9465, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.29411764705882354, |
|
"grad_norm": 0.15098532770679463, |
|
"learning_rate": 0.0009934013172251653, |
|
"loss": 0.9512, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.3002450980392157, |
|
"grad_norm": 0.1603820773467054, |
|
"learning_rate": 0.0009925067550516852, |
|
"loss": 0.9364, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.30637254901960786, |
|
"grad_norm": 0.20531045303348816, |
|
"learning_rate": 0.0009915558037097002, |
|
"loss": 0.9436, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.3125, |
|
"grad_norm": 0.1373133786573118, |
|
"learning_rate": 0.0009905485720776265, |
|
"loss": 0.934, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.31862745098039214, |
|
"grad_norm": 1.8470429009125096, |
|
"learning_rate": 0.0009894851754776472, |
|
"loss": 0.9526, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.3247549019607843, |
|
"grad_norm": 0.11771908587868915, |
|
"learning_rate": 0.000988365735662509, |
|
"loss": 0.9513, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.33088235294117646, |
|
"grad_norm": 0.14600554728304563, |
|
"learning_rate": 0.0009871903808015812, |
|
"loss": 0.9424, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.33700980392156865, |
|
"grad_norm": 0.12078857521986237, |
|
"learning_rate": 0.0009859592454661823, |
|
"loss": 0.9501, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.3431372549019608, |
|
"grad_norm": 0.12236086569601477, |
|
"learning_rate": 0.0009846724706141716, |
|
"loss": 0.9445, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.3492647058823529, |
|
"grad_norm": 0.13112435126866281, |
|
"learning_rate": 0.0009833302035738107, |
|
"loss": 0.9363, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.3553921568627451, |
|
"grad_norm": 0.14235863463223497, |
|
"learning_rate": 0.0009819325980268945, |
|
"loss": 0.9485, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.36151960784313725, |
|
"grad_norm": 0.12952693104841728, |
|
"learning_rate": 0.0009804798139911568, |
|
"loss": 0.9421, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.36764705882352944, |
|
"grad_norm": 0.11889525565491425, |
|
"learning_rate": 0.0009789720178019483, |
|
"loss": 0.9321, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.3737745098039216, |
|
"grad_norm": 0.10497064780361466, |
|
"learning_rate": 0.0009774093820931922, |
|
"loss": 0.9383, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.3799019607843137, |
|
"grad_norm": 0.1297312332951932, |
|
"learning_rate": 0.0009757920857776188, |
|
"loss": 0.9315, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.3860294117647059, |
|
"grad_norm": 0.12939401003239853, |
|
"learning_rate": 0.0009741203140262813, |
|
"loss": 0.931, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.39215686274509803, |
|
"grad_norm": 0.1724687096774714, |
|
"learning_rate": 0.0009723942582473544, |
|
"loss": 0.9244, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.39828431372549017, |
|
"grad_norm": 0.1440746949158844, |
|
"learning_rate": 0.000970614116064219, |
|
"loss": 0.9186, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.40441176470588236, |
|
"grad_norm": 0.11601098074712127, |
|
"learning_rate": 0.0009687800912928362, |
|
"loss": 0.9331, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.4105392156862745, |
|
"grad_norm": 0.16676922492027155, |
|
"learning_rate": 0.0009668923939184109, |
|
"loss": 0.9282, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.4166666666666667, |
|
"grad_norm": 0.16264723211095694, |
|
"learning_rate": 0.0009649512400713498, |
|
"loss": 0.931, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.4227941176470588, |
|
"grad_norm": 0.12835068212058262, |
|
"learning_rate": 0.000962956852002516, |
|
"loss": 0.9212, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.42892156862745096, |
|
"grad_norm": 0.11722967905017813, |
|
"learning_rate": 0.0009609094580577824, |
|
"loss": 0.9301, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.43504901960784315, |
|
"grad_norm": 0.11753159563989828, |
|
"learning_rate": 0.0009588092926518875, |
|
"loss": 0.9181, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.4411764705882353, |
|
"grad_norm": 0.3058794099499992, |
|
"learning_rate": 0.0009566565962415959, |
|
"loss": 0.9276, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.44730392156862747, |
|
"grad_norm": 0.11638001282284417, |
|
"learning_rate": 0.0009544516152981679, |
|
"loss": 0.9187, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.4534313725490196, |
|
"grad_norm": 0.0979888983879341, |
|
"learning_rate": 0.0009521946022791401, |
|
"loss": 0.9178, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.45955882352941174, |
|
"grad_norm": 0.12474491987381439, |
|
"learning_rate": 0.0009498858155994194, |
|
"loss": 0.9248, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.46568627450980393, |
|
"grad_norm": 0.11539100379640589, |
|
"learning_rate": 0.0009475255196016972, |
|
"loss": 0.9082, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.47181372549019607, |
|
"grad_norm": 0.11060768973607205, |
|
"learning_rate": 0.0009451139845261834, |
|
"loss": 0.913, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.47794117647058826, |
|
"grad_norm": 0.10933884625046772, |
|
"learning_rate": 0.0009426514864796647, |
|
"loss": 0.9164, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.4840686274509804, |
|
"grad_norm": 0.12110040734071437, |
|
"learning_rate": 0.000940138307403893, |
|
"loss": 0.918, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.49019607843137253, |
|
"grad_norm": 0.12516912467922992, |
|
"learning_rate": 0.0009375747350433044, |
|
"loss": 0.9099, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.4963235294117647, |
|
"grad_norm": 0.15405339011075656, |
|
"learning_rate": 0.0009349610629120733, |
|
"loss": 0.9153, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.5024509803921569, |
|
"grad_norm": 0.15983105881560317, |
|
"learning_rate": 0.0009322975902605082, |
|
"loss": 0.9139, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.508578431372549, |
|
"grad_norm": 0.12456698491529061, |
|
"learning_rate": 0.000929584622040788, |
|
"loss": 0.9196, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.5147058823529411, |
|
"grad_norm": 0.39391857600951824, |
|
"learning_rate": 0.0009268224688720474, |
|
"loss": 0.911, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.5208333333333334, |
|
"grad_norm": 0.135863544001222, |
|
"learning_rate": 0.0009240114470048129, |
|
"loss": 0.9082, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.5269607843137255, |
|
"grad_norm": 0.16266736226237394, |
|
"learning_rate": 0.0009211518782847931, |
|
"loss": 0.9208, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.5330882352941176, |
|
"grad_norm": 0.14823148129955332, |
|
"learning_rate": 0.0009182440901160307, |
|
"loss": 0.9243, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.5392156862745098, |
|
"grad_norm": 0.14300453285294948, |
|
"learning_rate": 0.0009152884154234145, |
|
"loss": 0.9082, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.5453431372549019, |
|
"grad_norm": 0.11464976577167021, |
|
"learning_rate": 0.0009122851926145641, |
|
"loss": 0.9066, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.5514705882352942, |
|
"grad_norm": 0.1380022527629278, |
|
"learning_rate": 0.0009092347655410818, |
|
"loss": 0.9059, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.5575980392156863, |
|
"grad_norm": 0.12505924658066125, |
|
"learning_rate": 0.0009061374834591849, |
|
"loss": 0.9106, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.5637254901960784, |
|
"grad_norm": 0.14573814560545809, |
|
"learning_rate": 0.0009029937009897176, |
|
"loss": 0.9085, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.5698529411764706, |
|
"grad_norm": 0.12549980653285886, |
|
"learning_rate": 0.0008998037780775488, |
|
"loss": 0.9134, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.5759803921568627, |
|
"grad_norm": 1.0091372919881914, |
|
"learning_rate": 0.0008965680799503608, |
|
"loss": 0.9012, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.5821078431372549, |
|
"grad_norm": 0.12482100803406715, |
|
"learning_rate": 0.0008932869770768326, |
|
"loss": 0.9083, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.5882352941176471, |
|
"grad_norm": 0.16842450336753725, |
|
"learning_rate": 0.0008899608451242233, |
|
"loss": 0.906, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.5943627450980392, |
|
"grad_norm": 0.37435753717247394, |
|
"learning_rate": 0.0008865900649153606, |
|
"loss": 0.9116, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.6004901960784313, |
|
"grad_norm": 0.1504221848097725, |
|
"learning_rate": 0.0008831750223850389, |
|
"loss": 0.9069, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.6066176470588235, |
|
"grad_norm": 0.6779799933680686, |
|
"learning_rate": 0.0008797161085358317, |
|
"loss": 0.8935, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.6127450980392157, |
|
"grad_norm": 0.4033225235584718, |
|
"learning_rate": 0.0008762137193933241, |
|
"loss": 0.898, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.6188725490196079, |
|
"grad_norm": 0.12597786237207237, |
|
"learning_rate": 0.0008726682559607706, |
|
"loss": 0.8965, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.625, |
|
"grad_norm": 0.12434445433374085, |
|
"learning_rate": 0.0008690801241731818, |
|
"loss": 0.9195, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.6311274509803921, |
|
"grad_norm": 0.11180950505927977, |
|
"learning_rate": 0.0008654497348508476, |
|
"loss": 0.8856, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.6372549019607843, |
|
"grad_norm": 0.11471149278729899, |
|
"learning_rate": 0.0008617775036523015, |
|
"loss": 0.888, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.6433823529411765, |
|
"grad_norm": 0.10661278460678202, |
|
"learning_rate": 0.000858063851026728, |
|
"loss": 0.8844, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.6495098039215687, |
|
"grad_norm": 0.13323867388840102, |
|
"learning_rate": 0.0008543092021658259, |
|
"loss": 0.8917, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.6556372549019608, |
|
"grad_norm": 0.11051797161585711, |
|
"learning_rate": 0.0008505139869551248, |
|
"loss": 0.8966, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.6617647058823529, |
|
"grad_norm": 0.10201940027044504, |
|
"learning_rate": 0.0008466786399247663, |
|
"loss": 0.9086, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.6678921568627451, |
|
"grad_norm": 0.11341681063010137, |
|
"learning_rate": 0.000842803600199753, |
|
"loss": 0.885, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.6740196078431373, |
|
"grad_norm": 0.11249706294845149, |
|
"learning_rate": 0.0008388893114496705, |
|
"loss": 0.8761, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.6801470588235294, |
|
"grad_norm": 0.10372930065250463, |
|
"learning_rate": 0.0008349362218378904, |
|
"loss": 0.8875, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.6862745098039216, |
|
"grad_norm": 0.1473106555936528, |
|
"learning_rate": 0.0008309447839702582, |
|
"loss": 0.8813, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.6924019607843137, |
|
"grad_norm": 0.11251651358057665, |
|
"learning_rate": 0.0008269154548432722, |
|
"loss": 0.8856, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.6985294117647058, |
|
"grad_norm": 0.18983642337639475, |
|
"learning_rate": 0.0008228486957917607, |
|
"loss": 0.8893, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.7046568627450981, |
|
"grad_norm": 0.1926017574087406, |
|
"learning_rate": 0.0008187449724360605, |
|
"loss": 0.8853, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.7107843137254902, |
|
"grad_norm": 0.10232469720003605, |
|
"learning_rate": 0.0008146047546287076, |
|
"loss": 0.8786, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.7169117647058824, |
|
"grad_norm": 0.1029258945252611, |
|
"learning_rate": 0.0008104285164006415, |
|
"loss": 0.8799, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.7230392156862745, |
|
"grad_norm": 0.14002711821151018, |
|
"learning_rate": 0.0008062167359069301, |
|
"loss": 0.8827, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.7291666666666666, |
|
"grad_norm": 0.11144023976662691, |
|
"learning_rate": 0.0008019698953720256, |
|
"loss": 0.8832, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.7352941176470589, |
|
"grad_norm": 0.12878152634910894, |
|
"learning_rate": 0.000797688481034551, |
|
"loss": 0.8852, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.741421568627451, |
|
"grad_norm": 0.12427328372120441, |
|
"learning_rate": 0.0007933729830916297, |
|
"loss": 0.8807, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.7475490196078431, |
|
"grad_norm": 0.10823708822180528, |
|
"learning_rate": 0.00078902389564276, |
|
"loss": 0.8839, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.7536764705882353, |
|
"grad_norm": 0.106769228793293, |
|
"learning_rate": 0.0007846417166332445, |
|
"loss": 0.8756, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.7598039215686274, |
|
"grad_norm": 0.10610630308145001, |
|
"learning_rate": 0.0007802269477971771, |
|
"loss": 0.8786, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.7659313725490197, |
|
"grad_norm": 0.11560264922747754, |
|
"learning_rate": 0.000775780094599998, |
|
"loss": 0.8769, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.7720588235294118, |
|
"grad_norm": 0.14907985785369013, |
|
"learning_rate": 0.0007713016661806211, |
|
"loss": 0.8795, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.7781862745098039, |
|
"grad_norm": 0.11129324500895409, |
|
"learning_rate": 0.00076679217529314, |
|
"loss": 0.875, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.7843137254901961, |
|
"grad_norm": 0.14839037572821354, |
|
"learning_rate": 0.0007622521382481208, |
|
"loss": 0.8703, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.7904411764705882, |
|
"grad_norm": 0.13899413539068264, |
|
"learning_rate": 0.0007576820748534875, |
|
"loss": 0.8763, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.7965686274509803, |
|
"grad_norm": 0.11601000036057739, |
|
"learning_rate": 0.0007530825083550073, |
|
"loss": 0.887, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.8026960784313726, |
|
"grad_norm": 0.10442455243797781, |
|
"learning_rate": 0.0007484539653763815, |
|
"loss": 0.8751, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.8088235294117647, |
|
"grad_norm": 0.12520557374066849, |
|
"learning_rate": 0.0007437969758589507, |
|
"loss": 0.8673, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.8149509803921569, |
|
"grad_norm": 0.11448612465079618, |
|
"learning_rate": 0.0007391120730010193, |
|
"loss": 0.8694, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.821078431372549, |
|
"grad_norm": 0.13153196077249837, |
|
"learning_rate": 0.0007343997931968067, |
|
"loss": 0.87, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.8272058823529411, |
|
"grad_norm": 0.11840806919932775, |
|
"learning_rate": 0.0007296606759750351, |
|
"loss": 0.8672, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.8333333333333334, |
|
"grad_norm": 0.09757556694809401, |
|
"learning_rate": 0.0007248952639371542, |
|
"loss": 0.8676, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.8394607843137255, |
|
"grad_norm": 0.09022546372550531, |
|
"learning_rate": 0.0007201041026952188, |
|
"loss": 0.8664, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.8455882352941176, |
|
"grad_norm": 0.10331609924034306, |
|
"learning_rate": 0.0007152877408094178, |
|
"loss": 0.8616, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.8517156862745098, |
|
"grad_norm": 0.12505516849755346, |
|
"learning_rate": 0.0007104467297252677, |
|
"loss": 0.8652, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.8578431372549019, |
|
"grad_norm": 0.09465790589490469, |
|
"learning_rate": 0.0007055816237104753, |
|
"loss": 0.8699, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.8639705882352942, |
|
"grad_norm": 0.12150502920638975, |
|
"learning_rate": 0.0007006929797914775, |
|
"loss": 0.8597, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.8700980392156863, |
|
"grad_norm": 0.10318640070426116, |
|
"learning_rate": 0.0006957813576896647, |
|
"loss": 0.8603, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.8762254901960784, |
|
"grad_norm": 0.11768219039546857, |
|
"learning_rate": 0.000690847319757296, |
|
"loss": 0.8653, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.8823529411764706, |
|
"grad_norm": 0.13084092032465588, |
|
"learning_rate": 0.000685891430913113, |
|
"loss": 0.8599, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.8884803921568627, |
|
"grad_norm": 0.1378907468788972, |
|
"learning_rate": 0.0006809142585776604, |
|
"loss": 0.8625, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.8946078431372549, |
|
"grad_norm": 0.1137277604032764, |
|
"learning_rate": 0.0006759163726083191, |
|
"loss": 0.8626, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.9007352941176471, |
|
"grad_norm": 0.10001539507212627, |
|
"learning_rate": 0.0006708983452340609, |
|
"loss": 0.849, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.9068627450980392, |
|
"grad_norm": 0.10484673231574948, |
|
"learning_rate": 0.0006658607509899319, |
|
"loss": 0.8682, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.9129901960784313, |
|
"grad_norm": 0.15431021500080286, |
|
"learning_rate": 0.0006608041666512712, |
|
"loss": 0.8645, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 0.9191176470588235, |
|
"grad_norm": 0.11219846062082936, |
|
"learning_rate": 0.0006557291711676738, |
|
"loss": 0.8541, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.9252450980392157, |
|
"grad_norm": 0.11968297539967529, |
|
"learning_rate": 0.0006506363455967037, |
|
"loss": 0.8645, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 0.9313725490196079, |
|
"grad_norm": 0.1421862868626235, |
|
"learning_rate": 0.0006455262730373672, |
|
"loss": 0.8628, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.9375, |
|
"grad_norm": 0.09419342625695648, |
|
"learning_rate": 0.0006403995385633503, |
|
"loss": 0.859, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.9436274509803921, |
|
"grad_norm": 0.10436946711990462, |
|
"learning_rate": 0.0006352567291560318, |
|
"loss": 0.8564, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.9497549019607843, |
|
"grad_norm": 0.11378998198564375, |
|
"learning_rate": 0.0006300984336372771, |
|
"loss": 0.8552, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.9558823529411765, |
|
"grad_norm": 0.11204753073504012, |
|
"learning_rate": 0.0006249252426020216, |
|
"loss": 0.8567, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.9620098039215687, |
|
"grad_norm": 0.13969543336270682, |
|
"learning_rate": 0.000619737748350651, |
|
"loss": 0.8521, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 0.9681372549019608, |
|
"grad_norm": 0.10178000324550646, |
|
"learning_rate": 0.0006145365448211866, |
|
"loss": 0.849, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.9742647058823529, |
|
"grad_norm": 0.28123512574152176, |
|
"learning_rate": 0.0006093222275212822, |
|
"loss": 0.8539, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.9803921568627451, |
|
"grad_norm": 0.11687237794199178, |
|
"learning_rate": 0.0006040953934600423, |
|
"loss": 0.8466, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.9865196078431373, |
|
"grad_norm": 0.08664193399778343, |
|
"learning_rate": 0.0005988566410796687, |
|
"loss": 0.8408, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 0.9926470588235294, |
|
"grad_norm": 0.1332025987779646, |
|
"learning_rate": 0.0005936065701869403, |
|
"loss": 0.8545, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.9987745098039216, |
|
"grad_norm": 2.6257846647757304, |
|
"learning_rate": 0.0005883457818845414, |
|
"loss": 0.8575, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 1.2250986099243164, |
|
"eval_runtime": 111.2787, |
|
"eval_samples_per_second": 188.185, |
|
"eval_steps_per_second": 5.886, |
|
"step": 816 |
|
}, |
|
{ |
|
"epoch": 1.0049019607843137, |
|
"grad_norm": 1.2727033946666415, |
|
"learning_rate": 0.0005830748785022368, |
|
"loss": 0.7791, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 1.0110294117647058, |
|
"grad_norm": 0.1911261436318345, |
|
"learning_rate": 0.0005777944635279099, |
|
"loss": 0.7643, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 1.017156862745098, |
|
"grad_norm": 0.1586922694379843, |
|
"learning_rate": 0.0005725051415384657, |
|
"loss": 0.7516, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 1.0232843137254901, |
|
"grad_norm": 0.11186631342494315, |
|
"learning_rate": 0.0005672075181306108, |
|
"loss": 0.7526, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 1.0294117647058822, |
|
"grad_norm": 0.13268928121380016, |
|
"learning_rate": 0.0005619021998515165, |
|
"loss": 0.7699, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 1.0355392156862746, |
|
"grad_norm": 0.11229206406982119, |
|
"learning_rate": 0.0005565897941293721, |
|
"loss": 0.7813, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 1.0416666666666667, |
|
"grad_norm": 0.12012493868077331, |
|
"learning_rate": 0.000551270909203838, |
|
"loss": 0.7606, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.0477941176470589, |
|
"grad_norm": 0.11528601274141542, |
|
"learning_rate": 0.0005459461540564057, |
|
"loss": 0.7597, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 1.053921568627451, |
|
"grad_norm": 0.10690663798544056, |
|
"learning_rate": 0.0005406161383406731, |
|
"loss": 0.7595, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 1.0600490196078431, |
|
"grad_norm": 0.13779006322984594, |
|
"learning_rate": 0.000535281472312543, |
|
"loss": 0.7604, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 1.0661764705882353, |
|
"grad_norm": 0.1286321993738431, |
|
"learning_rate": 0.0005299427667603515, |
|
"loss": 0.7591, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 1.0723039215686274, |
|
"grad_norm": 0.11691581398274645, |
|
"learning_rate": 0.0005246006329349376, |
|
"loss": 0.7539, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 1.0784313725490196, |
|
"grad_norm": 0.1359966051449655, |
|
"learning_rate": 0.0005192556824796568, |
|
"loss": 0.7478, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 1.0845588235294117, |
|
"grad_norm": 0.11784308765572785, |
|
"learning_rate": 0.0005139085273603527, |
|
"loss": 0.7526, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 1.0906862745098038, |
|
"grad_norm": 0.08243234125547984, |
|
"learning_rate": 0.0005085597797952905, |
|
"loss": 0.7503, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 1.0968137254901962, |
|
"grad_norm": 0.10687199207970824, |
|
"learning_rate": 0.0005032100521850608, |
|
"loss": 0.7639, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 1.1029411764705883, |
|
"grad_norm": 0.09754168316674887, |
|
"learning_rate": 0.0004978599570424639, |
|
"loss": 0.7648, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.1090686274509804, |
|
"grad_norm": 0.09597242384268918, |
|
"learning_rate": 0.0004925101069223802, |
|
"loss": 0.7618, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 1.1151960784313726, |
|
"grad_norm": 0.16042928535903608, |
|
"learning_rate": 0.0004871611143516367, |
|
"loss": 0.7488, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 1.1213235294117647, |
|
"grad_norm": 0.11558221571064126, |
|
"learning_rate": 0.00048181359175887594, |
|
"loss": 0.758, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 1.1274509803921569, |
|
"grad_norm": 0.10187875911094818, |
|
"learning_rate": 0.0004764681514044362, |
|
"loss": 0.7548, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 1.133578431372549, |
|
"grad_norm": 0.10845733289411896, |
|
"learning_rate": 0.0004711254053102521, |
|
"loss": 0.7447, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 1.1397058823529411, |
|
"grad_norm": 0.12423589225823602, |
|
"learning_rate": 0.0004657859651897806, |
|
"loss": 0.7567, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 1.1458333333333333, |
|
"grad_norm": 0.101823111875683, |
|
"learning_rate": 0.0004604504423779639, |
|
"loss": 0.7496, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 1.1519607843137254, |
|
"grad_norm": 0.1490251916250933, |
|
"learning_rate": 0.00045511944776123513, |
|
"loss": 0.7476, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 1.1580882352941178, |
|
"grad_norm": 0.0909899046261831, |
|
"learning_rate": 0.00044979359170757555, |
|
"loss": 0.7557, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 1.1642156862745099, |
|
"grad_norm": 0.10504026034830631, |
|
"learning_rate": 0.00044447348399663056, |
|
"loss": 0.7551, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.170343137254902, |
|
"grad_norm": 0.10154008572100809, |
|
"learning_rate": 0.00043915973374989326, |
|
"loss": 0.7553, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 1.1764705882352942, |
|
"grad_norm": 0.0948668753989249, |
|
"learning_rate": 0.0004338529493609647, |
|
"loss": 0.7529, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 1.1825980392156863, |
|
"grad_norm": 0.12214475449625242, |
|
"learning_rate": 0.0004285537384258951, |
|
"loss": 0.7544, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 1.1887254901960784, |
|
"grad_norm": 0.10021333779503093, |
|
"learning_rate": 0.00042326270767361815, |
|
"loss": 0.7561, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 1.1948529411764706, |
|
"grad_norm": 0.10089690279960518, |
|
"learning_rate": 0.0004179804628964839, |
|
"loss": 0.7473, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 1.2009803921568627, |
|
"grad_norm": 0.09480462930649244, |
|
"learning_rate": 0.00041270760888089997, |
|
"loss": 0.7543, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 1.2071078431372548, |
|
"grad_norm": 0.13593486404780272, |
|
"learning_rate": 0.000407444749338085, |
|
"loss": 0.7447, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 1.213235294117647, |
|
"grad_norm": 0.09373417852720169, |
|
"learning_rate": 0.00040219248683494925, |
|
"loss": 0.7516, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 1.219362745098039, |
|
"grad_norm": 0.09327325503229338, |
|
"learning_rate": 0.00039695142272510334, |
|
"loss": 0.7443, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 1.2254901960784315, |
|
"grad_norm": 0.10034654814592268, |
|
"learning_rate": 0.0003917221570800065, |
|
"loss": 0.7475, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.2316176470588236, |
|
"grad_norm": 0.09837253795265176, |
|
"learning_rate": 0.0003865052886202621, |
|
"loss": 0.7438, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 1.2377450980392157, |
|
"grad_norm": 0.09051967236076873, |
|
"learning_rate": 0.000381301414647068, |
|
"loss": 0.7537, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 1.2438725490196079, |
|
"grad_norm": 0.08928603737077674, |
|
"learning_rate": 0.0003761111309738285, |
|
"loss": 0.7372, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 0.10153157943466443, |
|
"learning_rate": 0.0003709350318579371, |
|
"loss": 0.748, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 1.2561274509803921, |
|
"grad_norm": 0.09687550081111064, |
|
"learning_rate": 0.0003657737099327378, |
|
"loss": 0.7445, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 1.2622549019607843, |
|
"grad_norm": 0.08345067550046266, |
|
"learning_rate": 0.0003606277561396726, |
|
"loss": 0.7459, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 1.2683823529411764, |
|
"grad_norm": 0.09822605701018818, |
|
"learning_rate": 0.0003554977596606203, |
|
"loss": 0.7473, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 1.2745098039215685, |
|
"grad_norm": 0.08897185232535443, |
|
"learning_rate": 0.00035038430785044053, |
|
"loss": 0.7485, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 1.280637254901961, |
|
"grad_norm": 0.09625591066370137, |
|
"learning_rate": 0.00034528798616972434, |
|
"loss": 0.739, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 1.2867647058823528, |
|
"grad_norm": 0.09405754837547836, |
|
"learning_rate": 0.00034020937811776156, |
|
"loss": 0.7558, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 1.2928921568627452, |
|
"grad_norm": 0.10283887592029052, |
|
"learning_rate": 0.0003351490651657347, |
|
"loss": 0.7576, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 1.2990196078431373, |
|
"grad_norm": 0.09668902027134954, |
|
"learning_rate": 0.00033010762669014347, |
|
"loss": 0.7339, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 1.3051470588235294, |
|
"grad_norm": 0.09491296282202684, |
|
"learning_rate": 0.00032508563990646925, |
|
"loss": 0.74, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 1.3112745098039216, |
|
"grad_norm": 0.08614797698276408, |
|
"learning_rate": 0.00032008367980308734, |
|
"loss": 0.7491, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 1.3174019607843137, |
|
"grad_norm": 0.11385248252735118, |
|
"learning_rate": 0.0003151023190754343, |
|
"loss": 0.7424, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 1.3235294117647058, |
|
"grad_norm": 0.09914180185658039, |
|
"learning_rate": 0.0003101421280604379, |
|
"loss": 0.7386, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 1.329656862745098, |
|
"grad_norm": 0.23046737698236164, |
|
"learning_rate": 0.000305203674671216, |
|
"loss": 0.7429, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 1.3357843137254901, |
|
"grad_norm": 0.08999049028390338, |
|
"learning_rate": 0.00030028752433205476, |
|
"loss": 0.7504, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 1.3419117647058822, |
|
"grad_norm": 0.08798822285342148, |
|
"learning_rate": 0.0002953942399136702, |
|
"loss": 0.7475, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 1.3480392156862746, |
|
"grad_norm": 0.11440187719573272, |
|
"learning_rate": 0.00029052438166876307, |
|
"loss": 0.745, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.3541666666666667, |
|
"grad_norm": 0.09680996894090088, |
|
"learning_rate": 0.00028567850716787257, |
|
"loss": 0.7493, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 1.3602941176470589, |
|
"grad_norm": 0.08285996425508975, |
|
"learning_rate": 0.0002808571712355389, |
|
"loss": 0.7503, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 1.366421568627451, |
|
"grad_norm": 0.09255457708748827, |
|
"learning_rate": 0.0002760609258867784, |
|
"loss": 0.7318, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 1.3725490196078431, |
|
"grad_norm": 0.09572569889952495, |
|
"learning_rate": 0.00027129032026388045, |
|
"loss": 0.7348, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 1.3786764705882353, |
|
"grad_norm": 0.0909286336173143, |
|
"learning_rate": 0.00026654590057353467, |
|
"loss": 0.7403, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 1.3848039215686274, |
|
"grad_norm": 0.08684037452287134, |
|
"learning_rate": 0.00026182821002429345, |
|
"loss": 0.7492, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 1.3909313725490196, |
|
"grad_norm": 0.09039702206887036, |
|
"learning_rate": 0.00025713778876437744, |
|
"loss": 0.7271, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 1.3970588235294117, |
|
"grad_norm": 0.10259305111827258, |
|
"learning_rate": 0.00025247517381983136, |
|
"loss": 0.7334, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 1.403186274509804, |
|
"grad_norm": 0.09305514839850722, |
|
"learning_rate": 0.00024784089903303854, |
|
"loss": 0.7342, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 1.409313725490196, |
|
"grad_norm": 0.09362025462459474, |
|
"learning_rate": 0.00024323549500159802, |
|
"loss": 0.7287, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 1.4154411764705883, |
|
"grad_norm": 0.09297020814808991, |
|
"learning_rate": 0.0002386594890175749, |
|
"loss": 0.7424, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 1.4215686274509804, |
|
"grad_norm": 0.08353530769016755, |
|
"learning_rate": 0.0002341134050071283, |
|
"loss": 0.7485, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 1.4276960784313726, |
|
"grad_norm": 0.08590929777722105, |
|
"learning_rate": 0.00022959776347052509, |
|
"loss": 0.7347, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 1.4338235294117647, |
|
"grad_norm": 0.09926401747517187, |
|
"learning_rate": 0.00022511308142254488, |
|
"loss": 0.7529, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 1.4399509803921569, |
|
"grad_norm": 0.08954546869195683, |
|
"learning_rate": 0.00022065987233328528, |
|
"loss": 0.741, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 1.446078431372549, |
|
"grad_norm": 0.08807524417797259, |
|
"learning_rate": 0.000216238646069373, |
|
"loss": 0.7409, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 1.4522058823529411, |
|
"grad_norm": 0.09243225975101937, |
|
"learning_rate": 0.00021184990883558658, |
|
"loss": 0.7358, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 1.4583333333333333, |
|
"grad_norm": 0.09464124110688574, |
|
"learning_rate": 0.00020749416311689845, |
|
"loss": 0.7346, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 1.4644607843137254, |
|
"grad_norm": 0.09316779183905147, |
|
"learning_rate": 0.0002031719076209445, |
|
"loss": 0.7313, |
|
"step": 1195 |
|
}, |
|
{ |
|
"epoch": 1.4705882352941178, |
|
"grad_norm": 0.09732126328149814, |
|
"learning_rate": 0.00019888363722092372, |
|
"loss": 0.7341, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.4767156862745099, |
|
"grad_norm": 0.08886419656539418, |
|
"learning_rate": 0.0001946298428989386, |
|
"loss": 0.7375, |
|
"step": 1205 |
|
}, |
|
{ |
|
"epoch": 1.482843137254902, |
|
"grad_norm": 0.07850103787876181, |
|
"learning_rate": 0.00019041101168978093, |
|
"loss": 0.7287, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 1.4889705882352942, |
|
"grad_norm": 0.08830903517712305, |
|
"learning_rate": 0.00018622762662516868, |
|
"loss": 0.735, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 1.4950980392156863, |
|
"grad_norm": 0.08702420978835, |
|
"learning_rate": 0.00018208016667844152, |
|
"loss": 0.7393, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 1.5012254901960784, |
|
"grad_norm": 0.08438557069126026, |
|
"learning_rate": 0.00017796910670972132, |
|
"loss": 0.7423, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 1.5073529411764706, |
|
"grad_norm": 0.0874124625381899, |
|
"learning_rate": 0.00017389491741154372, |
|
"loss": 0.7417, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 1.5134803921568627, |
|
"grad_norm": 0.08983974571361636, |
|
"learning_rate": 0.0001698580652549665, |
|
"loss": 0.7284, |
|
"step": 1235 |
|
}, |
|
{ |
|
"epoch": 1.5196078431372548, |
|
"grad_norm": 0.09655404504961941, |
|
"learning_rate": 0.00016585901243616042, |
|
"loss": 0.732, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 1.5257352941176472, |
|
"grad_norm": 0.08637672179778699, |
|
"learning_rate": 0.00016189821682349205, |
|
"loss": 0.7293, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 1.531862745098039, |
|
"grad_norm": 0.08546099516549688, |
|
"learning_rate": 0.0001579761319050991, |
|
"loss": 0.7356, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 1.5379901960784315, |
|
"grad_norm": 0.07907834707353141, |
|
"learning_rate": 0.00015409320673696902, |
|
"loss": 0.731, |
|
"step": 1255 |
|
}, |
|
{ |
|
"epoch": 1.5441176470588234, |
|
"grad_norm": 0.08633766548134404, |
|
"learning_rate": 0.00015024988589152537, |
|
"loss": 0.7254, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 1.5502450980392157, |
|
"grad_norm": 0.09255543384492604, |
|
"learning_rate": 0.00014644660940672628, |
|
"loss": 0.7352, |
|
"step": 1265 |
|
}, |
|
{ |
|
"epoch": 1.5563725490196079, |
|
"grad_norm": 0.07948954260097911, |
|
"learning_rate": 0.0001426838127356823, |
|
"loss": 0.7281, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 1.5625, |
|
"grad_norm": 0.0858902101428744, |
|
"learning_rate": 0.0001389619266968002, |
|
"loss": 0.7404, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 1.5686274509803921, |
|
"grad_norm": 0.08099715928187635, |
|
"learning_rate": 0.0001352813774244565, |
|
"loss": 0.729, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 1.5747549019607843, |
|
"grad_norm": 0.08675937769241077, |
|
"learning_rate": 0.0001316425863202078, |
|
"loss": 0.7289, |
|
"step": 1285 |
|
}, |
|
{ |
|
"epoch": 1.5808823529411766, |
|
"grad_norm": 0.08712801365461889, |
|
"learning_rate": 0.00012804597000454215, |
|
"loss": 0.7368, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 1.5870098039215685, |
|
"grad_norm": 0.09259317971708025, |
|
"learning_rate": 0.00012449194026917883, |
|
"loss": 0.7254, |
|
"step": 1295 |
|
}, |
|
{ |
|
"epoch": 1.593137254901961, |
|
"grad_norm": 0.0821781596676491, |
|
"learning_rate": 0.00012098090402992085, |
|
"loss": 0.7307, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.5992647058823528, |
|
"grad_norm": 0.08514490094990651, |
|
"learning_rate": 0.00011751326328006473, |
|
"loss": 0.7226, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 1.6053921568627452, |
|
"grad_norm": 0.08798702323560283, |
|
"learning_rate": 0.00011408941504437532, |
|
"loss": 0.7274, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 1.6115196078431373, |
|
"grad_norm": 0.08477334907464082, |
|
"learning_rate": 0.00011070975133362842, |
|
"loss": 0.7351, |
|
"step": 1315 |
|
}, |
|
{ |
|
"epoch": 1.6176470588235294, |
|
"grad_norm": 0.08440769123912943, |
|
"learning_rate": 0.00010737465909972776, |
|
"loss": 0.7322, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 1.6237745098039216, |
|
"grad_norm": 0.08816338400851521, |
|
"learning_rate": 0.00010408452019140119, |
|
"loss": 0.7257, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 1.6299019607843137, |
|
"grad_norm": 0.0889764540813042, |
|
"learning_rate": 0.00010083971131048159, |
|
"loss": 0.7285, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 1.6360294117647058, |
|
"grad_norm": 0.08514875374173311, |
|
"learning_rate": 9.764060396877661e-05, |
|
"loss": 0.7323, |
|
"step": 1335 |
|
}, |
|
{ |
|
"epoch": 1.642156862745098, |
|
"grad_norm": 0.08728092386538332, |
|
"learning_rate": 9.448756444553224e-05, |
|
"loss": 0.7256, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 1.6482843137254903, |
|
"grad_norm": 0.08430901480786084, |
|
"learning_rate": 9.138095374549633e-05, |
|
"loss": 0.7278, |
|
"step": 1345 |
|
}, |
|
{ |
|
"epoch": 1.6544117647058822, |
|
"grad_norm": 0.08407206279064601, |
|
"learning_rate": 8.832112755758598e-05, |
|
"loss": 0.7232, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 1.6605392156862746, |
|
"grad_norm": 0.09172008766801877, |
|
"learning_rate": 8.530843621416234e-05, |
|
"loss": 0.7262, |
|
"step": 1355 |
|
}, |
|
{ |
|
"epoch": 1.6666666666666665, |
|
"grad_norm": 0.088740049378598, |
|
"learning_rate": 8.234322465092047e-05, |
|
"loss": 0.7294, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 1.6727941176470589, |
|
"grad_norm": 0.08733287037829676, |
|
"learning_rate": 7.942583236739581e-05, |
|
"loss": 0.7326, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 1.678921568627451, |
|
"grad_norm": 0.08946029254025908, |
|
"learning_rate": 7.655659338809329e-05, |
|
"loss": 0.7302, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 1.6850490196078431, |
|
"grad_norm": 0.08125225656437084, |
|
"learning_rate": 7.373583622424358e-05, |
|
"loss": 0.7243, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 1.6911764705882353, |
|
"grad_norm": 0.08262402915263288, |
|
"learning_rate": 7.096388383619079e-05, |
|
"loss": 0.722, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 1.6973039215686274, |
|
"grad_norm": 0.0983748828611407, |
|
"learning_rate": 6.824105359641513e-05, |
|
"loss": 0.7224, |
|
"step": 1385 |
|
}, |
|
{ |
|
"epoch": 1.7034313725490198, |
|
"grad_norm": 0.09511486159825465, |
|
"learning_rate": 6.556765725319525e-05, |
|
"loss": 0.7353, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 1.7095588235294117, |
|
"grad_norm": 0.08406547135240365, |
|
"learning_rate": 6.294400089491526e-05, |
|
"loss": 0.7249, |
|
"step": 1395 |
|
}, |
|
{ |
|
"epoch": 1.715686274509804, |
|
"grad_norm": 0.09349436979698456, |
|
"learning_rate": 6.037038491501978e-05, |
|
"loss": 0.7199, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.721813725490196, |
|
"grad_norm": 0.0859858185739073, |
|
"learning_rate": 5.7847103977619555e-05, |
|
"loss": 0.7231, |
|
"step": 1405 |
|
}, |
|
{ |
|
"epoch": 1.7279411764705883, |
|
"grad_norm": 0.08083261397597263, |
|
"learning_rate": 5.53744469837551e-05, |
|
"loss": 0.7331, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 1.7340686274509802, |
|
"grad_norm": 0.08197105924202848, |
|
"learning_rate": 5.295269703831901e-05, |
|
"loss": 0.725, |
|
"step": 1415 |
|
}, |
|
{ |
|
"epoch": 1.7401960784313726, |
|
"grad_norm": 0.07809550724924257, |
|
"learning_rate": 5.058213141764151e-05, |
|
"loss": 0.718, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 1.7463235294117647, |
|
"grad_norm": 0.08030689228502133, |
|
"learning_rate": 4.826302153774448e-05, |
|
"loss": 0.7171, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 1.7524509803921569, |
|
"grad_norm": 0.08468169044962492, |
|
"learning_rate": 4.599563292326592e-05, |
|
"loss": 0.7267, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 1.758578431372549, |
|
"grad_norm": 0.0820979728898519, |
|
"learning_rate": 4.3780225177058766e-05, |
|
"loss": 0.7166, |
|
"step": 1435 |
|
}, |
|
{ |
|
"epoch": 1.7647058823529411, |
|
"grad_norm": 0.07927753918146413, |
|
"learning_rate": 4.161705195046761e-05, |
|
"loss": 0.718, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 1.7708333333333335, |
|
"grad_norm": 0.0840926492467221, |
|
"learning_rate": 3.9506360914287386e-05, |
|
"loss": 0.728, |
|
"step": 1445 |
|
}, |
|
{ |
|
"epoch": 1.7769607843137254, |
|
"grad_norm": 0.08436514170292973, |
|
"learning_rate": 3.744839373040682e-05, |
|
"loss": 0.7214, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 1.7830882352941178, |
|
"grad_norm": 0.08999651739438058, |
|
"learning_rate": 3.5443386024138605e-05, |
|
"loss": 0.7296, |
|
"step": 1455 |
|
}, |
|
{ |
|
"epoch": 1.7892156862745097, |
|
"grad_norm": 0.08092101714415846, |
|
"learning_rate": 3.349156735724274e-05, |
|
"loss": 0.7196, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 1.795343137254902, |
|
"grad_norm": 0.08061728201126263, |
|
"learning_rate": 3.1593161201642354e-05, |
|
"loss": 0.7284, |
|
"step": 1465 |
|
}, |
|
{ |
|
"epoch": 1.8014705882352942, |
|
"grad_norm": 0.07834911210493091, |
|
"learning_rate": 2.9748384913837522e-05, |
|
"loss": 0.7325, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 1.8075980392156863, |
|
"grad_norm": 0.0790057426287298, |
|
"learning_rate": 2.7957449710019512e-05, |
|
"loss": 0.7286, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 1.8137254901960784, |
|
"grad_norm": 0.07695722244990424, |
|
"learning_rate": 2.622056064188738e-05, |
|
"loss": 0.7276, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 1.8198529411764706, |
|
"grad_norm": 0.09150941908302032, |
|
"learning_rate": 2.4537916573171337e-05, |
|
"loss": 0.7239, |
|
"step": 1485 |
|
}, |
|
{ |
|
"epoch": 1.8259803921568627, |
|
"grad_norm": 0.09154350521364844, |
|
"learning_rate": 2.2909710156863274e-05, |
|
"loss": 0.7312, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 1.8321078431372548, |
|
"grad_norm": 0.08002699190491802, |
|
"learning_rate": 2.1336127813159355e-05, |
|
"loss": 0.7254, |
|
"step": 1495 |
|
}, |
|
{ |
|
"epoch": 1.8382352941176472, |
|
"grad_norm": 0.08021887418189992, |
|
"learning_rate": 1.981734970811644e-05, |
|
"loss": 0.7222, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.844362745098039, |
|
"grad_norm": 0.08930992030073814, |
|
"learning_rate": 1.8353549733023333e-05, |
|
"loss": 0.7383, |
|
"step": 1505 |
|
}, |
|
{ |
|
"epoch": 1.8504901960784315, |
|
"grad_norm": 0.08303320455844895, |
|
"learning_rate": 1.6944895484492072e-05, |
|
"loss": 0.7133, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 1.8566176470588234, |
|
"grad_norm": 0.08729988838797742, |
|
"learning_rate": 1.5591548245268428e-05, |
|
"loss": 0.7315, |
|
"step": 1515 |
|
}, |
|
{ |
|
"epoch": 1.8627450980392157, |
|
"grad_norm": 0.08890946616187734, |
|
"learning_rate": 1.429366296576623e-05, |
|
"loss": 0.7197, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 1.8688725490196079, |
|
"grad_norm": 0.08151959163399713, |
|
"learning_rate": 1.30513882463264e-05, |
|
"loss": 0.7263, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 1.875, |
|
"grad_norm": 0.09562764992899297, |
|
"learning_rate": 1.1864866320203115e-05, |
|
"loss": 0.7188, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 1.8811274509803921, |
|
"grad_norm": 0.07846982174735893, |
|
"learning_rate": 1.073423303727894e-05, |
|
"loss": 0.7237, |
|
"step": 1535 |
|
}, |
|
{ |
|
"epoch": 1.8872549019607843, |
|
"grad_norm": 0.07663001458145507, |
|
"learning_rate": 9.659617848510882e-06, |
|
"loss": 0.7252, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 1.8933823529411766, |
|
"grad_norm": 0.08080118998190498, |
|
"learning_rate": 8.64114379110853e-06, |
|
"loss": 0.7112, |
|
"step": 1545 |
|
}, |
|
{ |
|
"epoch": 1.8995098039215685, |
|
"grad_norm": 0.08484892902062172, |
|
"learning_rate": 7.678927474447817e-06, |
|
"loss": 0.7264, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 1.905637254901961, |
|
"grad_norm": 0.08516868212628753, |
|
"learning_rate": 6.77307906671909e-06, |
|
"loss": 0.7271, |
|
"step": 1555 |
|
}, |
|
{ |
|
"epoch": 1.9117647058823528, |
|
"grad_norm": 0.08133640776409674, |
|
"learning_rate": 5.923702282314092e-06, |
|
"loss": 0.7324, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 1.9178921568627452, |
|
"grad_norm": 0.08097758322239303, |
|
"learning_rate": 5.130894369951011e-06, |
|
"loss": 0.7273, |
|
"step": 1565 |
|
}, |
|
{ |
|
"epoch": 1.9240196078431373, |
|
"grad_norm": 0.08298253969927805, |
|
"learning_rate": 4.394746101540115e-06, |
|
"loss": 0.7298, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 1.9301470588235294, |
|
"grad_norm": 0.0776674873026092, |
|
"learning_rate": 3.7153417617907802e-06, |
|
"loss": 0.7199, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 1.9362745098039216, |
|
"grad_norm": 0.08136684261699478, |
|
"learning_rate": 3.092759138561607e-06, |
|
"loss": 0.727, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 1.9424019607843137, |
|
"grad_norm": 0.21064147236672387, |
|
"learning_rate": 2.5270695139539833e-06, |
|
"loss": 0.7305, |
|
"step": 1585 |
|
}, |
|
{ |
|
"epoch": 1.9485294117647058, |
|
"grad_norm": 0.07856752844029326, |
|
"learning_rate": 2.018337656150726e-06, |
|
"loss": 0.7259, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 1.954656862745098, |
|
"grad_norm": 0.07818237444191369, |
|
"learning_rate": 1.5666218120005682e-06, |
|
"loss": 0.724, |
|
"step": 1595 |
|
}, |
|
{ |
|
"epoch": 1.9607843137254903, |
|
"grad_norm": 0.07498630792024302, |
|
"learning_rate": 1.1719737003492159e-06, |
|
"loss": 0.7205, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.9669117647058822, |
|
"grad_norm": 0.07974040781241311, |
|
"learning_rate": 8.344385061176962e-07, |
|
"loss": 0.7228, |
|
"step": 1605 |
|
}, |
|
{ |
|
"epoch": 1.9730392156862746, |
|
"grad_norm": 0.08674324039372262, |
|
"learning_rate": 5.540548751292173e-07, |
|
"loss": 0.7364, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 1.9791666666666665, |
|
"grad_norm": 0.08126471002081247, |
|
"learning_rate": 3.3085490968409737e-07, |
|
"loss": 0.7232, |
|
"step": 1615 |
|
}, |
|
{ |
|
"epoch": 1.9852941176470589, |
|
"grad_norm": 0.08409713343077478, |
|
"learning_rate": 1.6486416488459277e-07, |
|
"loss": 0.7251, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 1.991421568627451, |
|
"grad_norm": 0.08609959331430181, |
|
"learning_rate": 5.6101645708850346e-08, |
|
"loss": 0.7326, |
|
"step": 1625 |
|
}, |
|
{ |
|
"epoch": 1.9975490196078431, |
|
"grad_norm": 0.0748443586344328, |
|
"learning_rate": 4.579804834703438e-09, |
|
"loss": 0.7238, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 1.1972506046295166, |
|
"eval_runtime": 112.8302, |
|
"eval_samples_per_second": 185.598, |
|
"eval_steps_per_second": 5.805, |
|
"step": 1632 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"step": 1632, |
|
"total_flos": 160150899916800.0, |
|
"train_loss": 0.8347952699690473, |
|
"train_runtime": 3713.7533, |
|
"train_samples_per_second": 56.241, |
|
"train_steps_per_second": 0.439 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 1632, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": false, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 160150899916800.0, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|