diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,25602 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 18257, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 0.466796875, + "learning_rate": 1.095290251916758e-07, + "loss": 1.5925, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 0.38671875, + "learning_rate": 5.47645125958379e-07, + "loss": 1.7789, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 0.578125, + "learning_rate": 1.095290251916758e-06, + "loss": 1.7766, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 0.5703125, + "learning_rate": 1.642935377875137e-06, + "loss": 1.639, + "step": 15 + }, + { + "epoch": 0.0, + "grad_norm": 0.7578125, + "learning_rate": 2.190580503833516e-06, + "loss": 1.6936, + "step": 20 + }, + { + "epoch": 0.0, + "grad_norm": 0.484375, + "learning_rate": 2.738225629791895e-06, + "loss": 1.7455, + "step": 25 + }, + { + "epoch": 0.0, + "grad_norm": 0.4375, + "learning_rate": 3.285870755750274e-06, + "loss": 1.6397, + "step": 30 + }, + { + "epoch": 0.0, + "grad_norm": 0.455078125, + "learning_rate": 3.8335158817086525e-06, + "loss": 1.6603, + "step": 35 + }, + { + "epoch": 0.0, + "grad_norm": 0.578125, + "learning_rate": 4.381161007667032e-06, + "loss": 1.7611, + "step": 40 + }, + { + "epoch": 0.0, + "grad_norm": 0.5546875, + "learning_rate": 4.928806133625411e-06, + "loss": 1.6704, + "step": 45 + }, + { + "epoch": 0.0, + "grad_norm": 0.3828125, + "learning_rate": 5.47645125958379e-06, + "loss": 1.7226, + "step": 50 + }, + { + "epoch": 0.0, + "grad_norm": 0.349609375, + "learning_rate": 6.024096385542169e-06, + "loss": 1.7409, + "step": 55 + }, + { + "epoch": 0.0, + "grad_norm": 0.51171875, + "learning_rate": 6.571741511500548e-06, + "loss": 1.6828, + "step": 60 + }, + { + "epoch": 0.0, + "grad_norm": 0.66015625, + "learning_rate": 7.119386637458927e-06, + "loss": 1.677, + "step": 65 + }, + { + "epoch": 0.0, + "grad_norm": 0.4609375, + "learning_rate": 7.667031763417305e-06, + "loss": 1.7095, + "step": 70 + }, + { + "epoch": 0.0, + "grad_norm": 0.62109375, + "learning_rate": 8.214676889375684e-06, + "loss": 1.5529, + "step": 75 + }, + { + "epoch": 0.0, + "grad_norm": 0.59375, + "learning_rate": 8.762322015334064e-06, + "loss": 1.6196, + "step": 80 + }, + { + "epoch": 0.0, + "grad_norm": 0.453125, + "learning_rate": 9.309967141292443e-06, + "loss": 1.6254, + "step": 85 + }, + { + "epoch": 0.0, + "grad_norm": 0.62109375, + "learning_rate": 9.857612267250823e-06, + "loss": 1.6312, + "step": 90 + }, + { + "epoch": 0.01, + "grad_norm": 0.515625, + "learning_rate": 1.0405257393209202e-05, + "loss": 1.5719, + "step": 95 + }, + { + "epoch": 0.01, + "grad_norm": 0.458984375, + "learning_rate": 1.095290251916758e-05, + "loss": 1.5849, + "step": 100 + }, + { + "epoch": 0.01, + "grad_norm": 0.5546875, + "learning_rate": 1.1500547645125959e-05, + "loss": 1.5305, + "step": 105 + }, + { + "epoch": 0.01, + "grad_norm": 0.400390625, + "learning_rate": 1.2048192771084338e-05, + "loss": 1.5214, + "step": 110 + }, + { + "epoch": 0.01, + "grad_norm": 0.27734375, + "learning_rate": 1.2595837897042718e-05, + "loss": 1.5621, + "step": 115 + }, + { + "epoch": 0.01, + "grad_norm": 0.279296875, + "learning_rate": 1.3143483023001096e-05, + "loss": 1.4636, + "step": 120 + }, + { + "epoch": 0.01, + "grad_norm": 0.306640625, + "learning_rate": 1.3691128148959475e-05, + "loss": 1.4968, + "step": 125 + }, + { + "epoch": 0.01, + "grad_norm": 0.26953125, + "learning_rate": 1.4238773274917854e-05, + "loss": 1.5992, + "step": 130 + }, + { + "epoch": 0.01, + "grad_norm": 0.275390625, + "learning_rate": 1.4786418400876234e-05, + "loss": 1.4915, + "step": 135 + }, + { + "epoch": 0.01, + "grad_norm": 0.2314453125, + "learning_rate": 1.533406352683461e-05, + "loss": 1.4276, + "step": 140 + }, + { + "epoch": 0.01, + "grad_norm": 0.2451171875, + "learning_rate": 1.588170865279299e-05, + "loss": 1.4851, + "step": 145 + }, + { + "epoch": 0.01, + "grad_norm": 0.236328125, + "learning_rate": 1.642935377875137e-05, + "loss": 1.4907, + "step": 150 + }, + { + "epoch": 0.01, + "grad_norm": 0.203125, + "learning_rate": 1.697699890470975e-05, + "loss": 1.4977, + "step": 155 + }, + { + "epoch": 0.01, + "grad_norm": 0.1708984375, + "learning_rate": 1.7524644030668127e-05, + "loss": 1.3028, + "step": 160 + }, + { + "epoch": 0.01, + "grad_norm": 0.1552734375, + "learning_rate": 1.8072289156626505e-05, + "loss": 1.4589, + "step": 165 + }, + { + "epoch": 0.01, + "grad_norm": 0.1640625, + "learning_rate": 1.8619934282584886e-05, + "loss": 1.4321, + "step": 170 + }, + { + "epoch": 0.01, + "grad_norm": 0.1494140625, + "learning_rate": 1.9167579408543264e-05, + "loss": 1.3841, + "step": 175 + }, + { + "epoch": 0.01, + "grad_norm": 0.20703125, + "learning_rate": 1.9715224534501645e-05, + "loss": 1.3932, + "step": 180 + }, + { + "epoch": 0.01, + "grad_norm": 0.1708984375, + "learning_rate": 2.0262869660460023e-05, + "loss": 1.3728, + "step": 185 + }, + { + "epoch": 0.01, + "grad_norm": 0.1884765625, + "learning_rate": 2.0810514786418404e-05, + "loss": 1.3659, + "step": 190 + }, + { + "epoch": 0.01, + "grad_norm": 0.16015625, + "learning_rate": 2.1358159912376778e-05, + "loss": 1.4092, + "step": 195 + }, + { + "epoch": 0.01, + "grad_norm": 0.1611328125, + "learning_rate": 2.190580503833516e-05, + "loss": 1.4435, + "step": 200 + }, + { + "epoch": 0.01, + "grad_norm": 0.1953125, + "learning_rate": 2.2453450164293537e-05, + "loss": 1.3881, + "step": 205 + }, + { + "epoch": 0.01, + "grad_norm": 0.150390625, + "learning_rate": 2.3001095290251918e-05, + "loss": 1.3537, + "step": 210 + }, + { + "epoch": 0.01, + "grad_norm": 0.1650390625, + "learning_rate": 2.3548740416210296e-05, + "loss": 1.3825, + "step": 215 + }, + { + "epoch": 0.01, + "grad_norm": 0.1513671875, + "learning_rate": 2.4096385542168677e-05, + "loss": 1.3656, + "step": 220 + }, + { + "epoch": 0.01, + "grad_norm": 0.2216796875, + "learning_rate": 2.4644030668127055e-05, + "loss": 1.3531, + "step": 225 + }, + { + "epoch": 0.01, + "grad_norm": 0.1416015625, + "learning_rate": 2.5191675794085436e-05, + "loss": 1.3594, + "step": 230 + }, + { + "epoch": 0.01, + "grad_norm": 0.1884765625, + "learning_rate": 2.5739320920043813e-05, + "loss": 1.3433, + "step": 235 + }, + { + "epoch": 0.01, + "grad_norm": 0.1357421875, + "learning_rate": 2.628696604600219e-05, + "loss": 1.3442, + "step": 240 + }, + { + "epoch": 0.01, + "grad_norm": 0.1318359375, + "learning_rate": 2.6834611171960572e-05, + "loss": 1.4246, + "step": 245 + }, + { + "epoch": 0.01, + "grad_norm": 0.1376953125, + "learning_rate": 2.738225629791895e-05, + "loss": 1.3524, + "step": 250 + }, + { + "epoch": 0.01, + "grad_norm": 0.1513671875, + "learning_rate": 2.792990142387733e-05, + "loss": 1.3018, + "step": 255 + }, + { + "epoch": 0.01, + "grad_norm": 0.1845703125, + "learning_rate": 2.847754654983571e-05, + "loss": 1.4327, + "step": 260 + }, + { + "epoch": 0.01, + "grad_norm": 0.1396484375, + "learning_rate": 2.902519167579409e-05, + "loss": 1.2796, + "step": 265 + }, + { + "epoch": 0.01, + "grad_norm": 0.1435546875, + "learning_rate": 2.9572836801752468e-05, + "loss": 1.4626, + "step": 270 + }, + { + "epoch": 0.02, + "grad_norm": 0.12890625, + "learning_rate": 3.012048192771085e-05, + "loss": 1.3257, + "step": 275 + }, + { + "epoch": 0.02, + "grad_norm": 0.1376953125, + "learning_rate": 3.066812705366922e-05, + "loss": 1.3463, + "step": 280 + }, + { + "epoch": 0.02, + "grad_norm": 0.1240234375, + "learning_rate": 3.12157721796276e-05, + "loss": 1.3423, + "step": 285 + }, + { + "epoch": 0.02, + "grad_norm": 0.12890625, + "learning_rate": 3.176341730558598e-05, + "loss": 1.3721, + "step": 290 + }, + { + "epoch": 0.02, + "grad_norm": 0.1279296875, + "learning_rate": 3.231106243154436e-05, + "loss": 1.2935, + "step": 295 + }, + { + "epoch": 0.02, + "grad_norm": 0.14453125, + "learning_rate": 3.285870755750274e-05, + "loss": 1.3651, + "step": 300 + }, + { + "epoch": 0.02, + "grad_norm": 0.14453125, + "learning_rate": 3.3406352683461115e-05, + "loss": 1.3716, + "step": 305 + }, + { + "epoch": 0.02, + "grad_norm": 0.1259765625, + "learning_rate": 3.39539978094195e-05, + "loss": 1.237, + "step": 310 + }, + { + "epoch": 0.02, + "grad_norm": 0.13671875, + "learning_rate": 3.450164293537788e-05, + "loss": 1.3898, + "step": 315 + }, + { + "epoch": 0.02, + "grad_norm": 0.130859375, + "learning_rate": 3.5049288061336255e-05, + "loss": 1.2465, + "step": 320 + }, + { + "epoch": 0.02, + "grad_norm": 0.138671875, + "learning_rate": 3.559693318729463e-05, + "loss": 1.308, + "step": 325 + }, + { + "epoch": 0.02, + "grad_norm": 0.1259765625, + "learning_rate": 3.614457831325301e-05, + "loss": 1.2948, + "step": 330 + }, + { + "epoch": 0.02, + "grad_norm": 0.12890625, + "learning_rate": 3.6692223439211395e-05, + "loss": 1.3287, + "step": 335 + }, + { + "epoch": 0.02, + "grad_norm": 0.130859375, + "learning_rate": 3.723986856516977e-05, + "loss": 1.3124, + "step": 340 + }, + { + "epoch": 0.02, + "grad_norm": 0.1337890625, + "learning_rate": 3.778751369112815e-05, + "loss": 1.3206, + "step": 345 + }, + { + "epoch": 0.02, + "grad_norm": 0.140625, + "learning_rate": 3.833515881708653e-05, + "loss": 1.354, + "step": 350 + }, + { + "epoch": 0.02, + "grad_norm": 0.1474609375, + "learning_rate": 3.888280394304491e-05, + "loss": 1.2987, + "step": 355 + }, + { + "epoch": 0.02, + "grad_norm": 0.1328125, + "learning_rate": 3.943044906900329e-05, + "loss": 1.2752, + "step": 360 + }, + { + "epoch": 0.02, + "grad_norm": 0.134765625, + "learning_rate": 3.997809419496167e-05, + "loss": 1.3191, + "step": 365 + }, + { + "epoch": 0.02, + "grad_norm": 0.1337890625, + "learning_rate": 4.0525739320920046e-05, + "loss": 1.3768, + "step": 370 + }, + { + "epoch": 0.02, + "grad_norm": 0.126953125, + "learning_rate": 4.107338444687843e-05, + "loss": 1.2676, + "step": 375 + }, + { + "epoch": 0.02, + "grad_norm": 0.142578125, + "learning_rate": 4.162102957283681e-05, + "loss": 1.2931, + "step": 380 + }, + { + "epoch": 0.02, + "grad_norm": 0.1474609375, + "learning_rate": 4.2168674698795186e-05, + "loss": 1.2615, + "step": 385 + }, + { + "epoch": 0.02, + "grad_norm": 0.1396484375, + "learning_rate": 4.2716319824753556e-05, + "loss": 1.311, + "step": 390 + }, + { + "epoch": 0.02, + "grad_norm": 0.1474609375, + "learning_rate": 4.326396495071194e-05, + "loss": 1.2629, + "step": 395 + }, + { + "epoch": 0.02, + "grad_norm": 0.1318359375, + "learning_rate": 4.381161007667032e-05, + "loss": 1.3424, + "step": 400 + }, + { + "epoch": 0.02, + "grad_norm": 0.140625, + "learning_rate": 4.4359255202628696e-05, + "loss": 1.2941, + "step": 405 + }, + { + "epoch": 0.02, + "grad_norm": 0.1435546875, + "learning_rate": 4.4906900328587074e-05, + "loss": 1.2161, + "step": 410 + }, + { + "epoch": 0.02, + "grad_norm": 0.1494140625, + "learning_rate": 4.545454545454546e-05, + "loss": 1.3718, + "step": 415 + }, + { + "epoch": 0.02, + "grad_norm": 0.134765625, + "learning_rate": 4.6002190580503836e-05, + "loss": 1.3224, + "step": 420 + }, + { + "epoch": 0.02, + "grad_norm": 0.14453125, + "learning_rate": 4.6549835706462214e-05, + "loss": 1.3083, + "step": 425 + }, + { + "epoch": 0.02, + "grad_norm": 0.142578125, + "learning_rate": 4.709748083242059e-05, + "loss": 1.2505, + "step": 430 + }, + { + "epoch": 0.02, + "grad_norm": 0.1435546875, + "learning_rate": 4.764512595837897e-05, + "loss": 1.2964, + "step": 435 + }, + { + "epoch": 0.02, + "grad_norm": 0.1416015625, + "learning_rate": 4.8192771084337354e-05, + "loss": 1.264, + "step": 440 + }, + { + "epoch": 0.02, + "grad_norm": 0.1435546875, + "learning_rate": 4.874041621029573e-05, + "loss": 1.2452, + "step": 445 + }, + { + "epoch": 0.02, + "grad_norm": 0.1533203125, + "learning_rate": 4.928806133625411e-05, + "loss": 1.3098, + "step": 450 + }, + { + "epoch": 0.02, + "grad_norm": 0.1513671875, + "learning_rate": 4.983570646221249e-05, + "loss": 1.2268, + "step": 455 + }, + { + "epoch": 0.03, + "grad_norm": 0.158203125, + "learning_rate": 5.038335158817087e-05, + "loss": 1.3172, + "step": 460 + }, + { + "epoch": 0.03, + "grad_norm": 0.1416015625, + "learning_rate": 5.093099671412924e-05, + "loss": 1.2184, + "step": 465 + }, + { + "epoch": 0.03, + "grad_norm": 0.138671875, + "learning_rate": 5.147864184008763e-05, + "loss": 1.2912, + "step": 470 + }, + { + "epoch": 0.03, + "grad_norm": 0.146484375, + "learning_rate": 5.2026286966046e-05, + "loss": 1.2859, + "step": 475 + }, + { + "epoch": 0.03, + "grad_norm": 0.140625, + "learning_rate": 5.257393209200438e-05, + "loss": 1.2723, + "step": 480 + }, + { + "epoch": 0.03, + "grad_norm": 0.14453125, + "learning_rate": 5.312157721796276e-05, + "loss": 1.2359, + "step": 485 + }, + { + "epoch": 0.03, + "grad_norm": 0.1455078125, + "learning_rate": 5.3669222343921145e-05, + "loss": 1.3044, + "step": 490 + }, + { + "epoch": 0.03, + "grad_norm": 0.1494140625, + "learning_rate": 5.4216867469879516e-05, + "loss": 1.2649, + "step": 495 + }, + { + "epoch": 0.03, + "grad_norm": 0.1494140625, + "learning_rate": 5.47645125958379e-05, + "loss": 1.323, + "step": 500 + }, + { + "epoch": 0.03, + "grad_norm": 0.1455078125, + "learning_rate": 5.531215772179628e-05, + "loss": 1.2744, + "step": 505 + }, + { + "epoch": 0.03, + "grad_norm": 0.154296875, + "learning_rate": 5.585980284775466e-05, + "loss": 1.2393, + "step": 510 + }, + { + "epoch": 0.03, + "grad_norm": 0.1474609375, + "learning_rate": 5.640744797371303e-05, + "loss": 1.2021, + "step": 515 + }, + { + "epoch": 0.03, + "grad_norm": 0.1484375, + "learning_rate": 5.695509309967142e-05, + "loss": 1.341, + "step": 520 + }, + { + "epoch": 0.03, + "grad_norm": 0.146484375, + "learning_rate": 5.7502738225629795e-05, + "loss": 1.2942, + "step": 525 + }, + { + "epoch": 0.03, + "grad_norm": 0.1591796875, + "learning_rate": 5.805038335158818e-05, + "loss": 1.2963, + "step": 530 + }, + { + "epoch": 0.03, + "grad_norm": 0.1455078125, + "learning_rate": 5.859802847754655e-05, + "loss": 1.355, + "step": 535 + }, + { + "epoch": 0.03, + "grad_norm": 0.16015625, + "learning_rate": 5.9145673603504935e-05, + "loss": 1.2284, + "step": 540 + }, + { + "epoch": 0.03, + "grad_norm": 0.1552734375, + "learning_rate": 5.969331872946331e-05, + "loss": 1.2243, + "step": 545 + }, + { + "epoch": 0.03, + "grad_norm": 0.1513671875, + "learning_rate": 6.02409638554217e-05, + "loss": 1.3001, + "step": 550 + }, + { + "epoch": 0.03, + "grad_norm": 0.146484375, + "learning_rate": 6.078860898138007e-05, + "loss": 1.238, + "step": 555 + }, + { + "epoch": 0.03, + "grad_norm": 0.15234375, + "learning_rate": 6.133625410733844e-05, + "loss": 1.2305, + "step": 560 + }, + { + "epoch": 0.03, + "grad_norm": 0.1416015625, + "learning_rate": 6.188389923329682e-05, + "loss": 1.2445, + "step": 565 + }, + { + "epoch": 0.03, + "grad_norm": 0.279296875, + "learning_rate": 6.24315443592552e-05, + "loss": 1.2829, + "step": 570 + }, + { + "epoch": 0.03, + "grad_norm": 0.1455078125, + "learning_rate": 6.297918948521358e-05, + "loss": 1.2107, + "step": 575 + }, + { + "epoch": 0.03, + "grad_norm": 0.146484375, + "learning_rate": 6.352683461117196e-05, + "loss": 1.2449, + "step": 580 + }, + { + "epoch": 0.03, + "grad_norm": 0.1611328125, + "learning_rate": 6.407447973713035e-05, + "loss": 1.2306, + "step": 585 + }, + { + "epoch": 0.03, + "grad_norm": 0.1669921875, + "learning_rate": 6.462212486308872e-05, + "loss": 1.2696, + "step": 590 + }, + { + "epoch": 0.03, + "grad_norm": 0.1484375, + "learning_rate": 6.51697699890471e-05, + "loss": 1.3035, + "step": 595 + }, + { + "epoch": 0.03, + "grad_norm": 0.158203125, + "learning_rate": 6.571741511500547e-05, + "loss": 1.2613, + "step": 600 + }, + { + "epoch": 0.03, + "grad_norm": 0.1572265625, + "learning_rate": 6.626506024096386e-05, + "loss": 1.2877, + "step": 605 + }, + { + "epoch": 0.03, + "grad_norm": 0.1552734375, + "learning_rate": 6.681270536692223e-05, + "loss": 1.229, + "step": 610 + }, + { + "epoch": 0.03, + "grad_norm": 0.15625, + "learning_rate": 6.736035049288061e-05, + "loss": 1.2112, + "step": 615 + }, + { + "epoch": 0.03, + "grad_norm": 0.1611328125, + "learning_rate": 6.7907995618839e-05, + "loss": 1.265, + "step": 620 + }, + { + "epoch": 0.03, + "grad_norm": 0.15625, + "learning_rate": 6.845564074479738e-05, + "loss": 1.2641, + "step": 625 + }, + { + "epoch": 0.03, + "grad_norm": 0.1484375, + "learning_rate": 6.900328587075575e-05, + "loss": 1.2364, + "step": 630 + }, + { + "epoch": 0.03, + "grad_norm": 0.16015625, + "learning_rate": 6.955093099671414e-05, + "loss": 1.2265, + "step": 635 + }, + { + "epoch": 0.04, + "grad_norm": 0.16796875, + "learning_rate": 7.009857612267251e-05, + "loss": 1.2454, + "step": 640 + }, + { + "epoch": 0.04, + "grad_norm": 0.166015625, + "learning_rate": 7.06462212486309e-05, + "loss": 1.2541, + "step": 645 + }, + { + "epoch": 0.04, + "grad_norm": 0.1572265625, + "learning_rate": 7.119386637458927e-05, + "loss": 1.3241, + "step": 650 + }, + { + "epoch": 0.04, + "grad_norm": 0.158203125, + "learning_rate": 7.174151150054765e-05, + "loss": 1.2416, + "step": 655 + }, + { + "epoch": 0.04, + "grad_norm": 0.1552734375, + "learning_rate": 7.228915662650602e-05, + "loss": 1.2588, + "step": 660 + }, + { + "epoch": 0.04, + "grad_norm": 0.1572265625, + "learning_rate": 7.28368017524644e-05, + "loss": 1.2438, + "step": 665 + }, + { + "epoch": 0.04, + "grad_norm": 0.1611328125, + "learning_rate": 7.338444687842279e-05, + "loss": 1.3034, + "step": 670 + }, + { + "epoch": 0.04, + "grad_norm": 0.1591796875, + "learning_rate": 7.393209200438116e-05, + "loss": 1.2278, + "step": 675 + }, + { + "epoch": 0.04, + "grad_norm": 0.1474609375, + "learning_rate": 7.447973713033955e-05, + "loss": 1.2721, + "step": 680 + }, + { + "epoch": 0.04, + "grad_norm": 0.1640625, + "learning_rate": 7.502738225629792e-05, + "loss": 1.2244, + "step": 685 + }, + { + "epoch": 0.04, + "grad_norm": 0.16015625, + "learning_rate": 7.55750273822563e-05, + "loss": 1.3325, + "step": 690 + }, + { + "epoch": 0.04, + "grad_norm": 0.1669921875, + "learning_rate": 7.612267250821467e-05, + "loss": 1.2812, + "step": 695 + }, + { + "epoch": 0.04, + "grad_norm": 0.1611328125, + "learning_rate": 7.667031763417306e-05, + "loss": 1.2739, + "step": 700 + }, + { + "epoch": 0.04, + "grad_norm": 0.1611328125, + "learning_rate": 7.721796276013144e-05, + "loss": 1.2815, + "step": 705 + }, + { + "epoch": 0.04, + "grad_norm": 0.162109375, + "learning_rate": 7.776560788608982e-05, + "loss": 1.2886, + "step": 710 + }, + { + "epoch": 0.04, + "grad_norm": 0.1552734375, + "learning_rate": 7.83132530120482e-05, + "loss": 1.2802, + "step": 715 + }, + { + "epoch": 0.04, + "grad_norm": 0.1669921875, + "learning_rate": 7.886089813800658e-05, + "loss": 1.3426, + "step": 720 + }, + { + "epoch": 0.04, + "grad_norm": 0.166015625, + "learning_rate": 7.940854326396495e-05, + "loss": 1.3236, + "step": 725 + }, + { + "epoch": 0.04, + "grad_norm": 0.1552734375, + "learning_rate": 7.995618838992334e-05, + "loss": 1.214, + "step": 730 + }, + { + "epoch": 0.04, + "grad_norm": 0.166015625, + "learning_rate": 8.05038335158817e-05, + "loss": 1.3208, + "step": 735 + }, + { + "epoch": 0.04, + "grad_norm": 0.1669921875, + "learning_rate": 8.105147864184009e-05, + "loss": 1.2268, + "step": 740 + }, + { + "epoch": 0.04, + "grad_norm": 0.1630859375, + "learning_rate": 8.159912376779846e-05, + "loss": 1.2339, + "step": 745 + }, + { + "epoch": 0.04, + "grad_norm": 0.1572265625, + "learning_rate": 8.214676889375686e-05, + "loss": 1.1939, + "step": 750 + }, + { + "epoch": 0.04, + "grad_norm": 0.1552734375, + "learning_rate": 8.269441401971523e-05, + "loss": 1.2931, + "step": 755 + }, + { + "epoch": 0.04, + "grad_norm": 0.1572265625, + "learning_rate": 8.324205914567362e-05, + "loss": 1.2586, + "step": 760 + }, + { + "epoch": 0.04, + "grad_norm": 0.166015625, + "learning_rate": 8.378970427163199e-05, + "loss": 1.2386, + "step": 765 + }, + { + "epoch": 0.04, + "grad_norm": 0.150390625, + "learning_rate": 8.433734939759037e-05, + "loss": 1.1571, + "step": 770 + }, + { + "epoch": 0.04, + "grad_norm": 0.1591796875, + "learning_rate": 8.488499452354874e-05, + "loss": 1.2219, + "step": 775 + }, + { + "epoch": 0.04, + "grad_norm": 0.1552734375, + "learning_rate": 8.543263964950711e-05, + "loss": 1.238, + "step": 780 + }, + { + "epoch": 0.04, + "grad_norm": 0.1640625, + "learning_rate": 8.59802847754655e-05, + "loss": 1.2028, + "step": 785 + }, + { + "epoch": 0.04, + "grad_norm": 0.162109375, + "learning_rate": 8.652792990142388e-05, + "loss": 1.1656, + "step": 790 + }, + { + "epoch": 0.04, + "grad_norm": 0.15234375, + "learning_rate": 8.707557502738227e-05, + "loss": 1.2504, + "step": 795 + }, + { + "epoch": 0.04, + "grad_norm": 0.1630859375, + "learning_rate": 8.762322015334064e-05, + "loss": 1.2424, + "step": 800 + }, + { + "epoch": 0.04, + "grad_norm": 0.1630859375, + "learning_rate": 8.817086527929902e-05, + "loss": 1.2736, + "step": 805 + }, + { + "epoch": 0.04, + "grad_norm": 0.1669921875, + "learning_rate": 8.871851040525739e-05, + "loss": 1.1999, + "step": 810 + }, + { + "epoch": 0.04, + "grad_norm": 0.1728515625, + "learning_rate": 8.926615553121578e-05, + "loss": 1.2618, + "step": 815 + }, + { + "epoch": 0.04, + "grad_norm": 0.1572265625, + "learning_rate": 8.981380065717415e-05, + "loss": 1.2386, + "step": 820 + }, + { + "epoch": 0.05, + "grad_norm": 0.17578125, + "learning_rate": 9.036144578313253e-05, + "loss": 1.3859, + "step": 825 + }, + { + "epoch": 0.05, + "grad_norm": 0.15625, + "learning_rate": 9.090909090909092e-05, + "loss": 1.2344, + "step": 830 + }, + { + "epoch": 0.05, + "grad_norm": 0.1630859375, + "learning_rate": 9.14567360350493e-05, + "loss": 1.2679, + "step": 835 + }, + { + "epoch": 0.05, + "grad_norm": 0.1630859375, + "learning_rate": 9.200438116100767e-05, + "loss": 1.2153, + "step": 840 + }, + { + "epoch": 0.05, + "grad_norm": 0.1689453125, + "learning_rate": 9.255202628696606e-05, + "loss": 1.3274, + "step": 845 + }, + { + "epoch": 0.05, + "grad_norm": 0.1650390625, + "learning_rate": 9.309967141292443e-05, + "loss": 1.2584, + "step": 850 + }, + { + "epoch": 0.05, + "grad_norm": 0.162109375, + "learning_rate": 9.364731653888281e-05, + "loss": 1.2726, + "step": 855 + }, + { + "epoch": 0.05, + "grad_norm": 0.1611328125, + "learning_rate": 9.419496166484118e-05, + "loss": 1.2299, + "step": 860 + }, + { + "epoch": 0.05, + "grad_norm": 0.15625, + "learning_rate": 9.474260679079957e-05, + "loss": 1.189, + "step": 865 + }, + { + "epoch": 0.05, + "grad_norm": 0.1591796875, + "learning_rate": 9.529025191675794e-05, + "loss": 1.1814, + "step": 870 + }, + { + "epoch": 0.05, + "grad_norm": 0.1640625, + "learning_rate": 9.583789704271632e-05, + "loss": 1.2205, + "step": 875 + }, + { + "epoch": 0.05, + "grad_norm": 0.1494140625, + "learning_rate": 9.638554216867471e-05, + "loss": 1.3385, + "step": 880 + }, + { + "epoch": 0.05, + "grad_norm": 0.1611328125, + "learning_rate": 9.693318729463309e-05, + "loss": 1.2303, + "step": 885 + }, + { + "epoch": 0.05, + "grad_norm": 0.16796875, + "learning_rate": 9.748083242059146e-05, + "loss": 1.2501, + "step": 890 + }, + { + "epoch": 0.05, + "grad_norm": 0.15625, + "learning_rate": 9.802847754654983e-05, + "loss": 1.2016, + "step": 895 + }, + { + "epoch": 0.05, + "grad_norm": 0.1484375, + "learning_rate": 9.857612267250822e-05, + "loss": 1.2084, + "step": 900 + }, + { + "epoch": 0.05, + "grad_norm": 0.1572265625, + "learning_rate": 9.912376779846659e-05, + "loss": 1.2041, + "step": 905 + }, + { + "epoch": 0.05, + "grad_norm": 0.173828125, + "learning_rate": 9.967141292442497e-05, + "loss": 1.2082, + "step": 910 + }, + { + "epoch": 0.05, + "grad_norm": 0.1767578125, + "learning_rate": 0.00010021905805038336, + "loss": 1.2389, + "step": 915 + }, + { + "epoch": 0.05, + "grad_norm": 0.16015625, + "learning_rate": 0.00010076670317634174, + "loss": 1.2297, + "step": 920 + }, + { + "epoch": 0.05, + "grad_norm": 0.17578125, + "learning_rate": 0.00010131434830230011, + "loss": 1.1597, + "step": 925 + }, + { + "epoch": 0.05, + "grad_norm": 0.15234375, + "learning_rate": 0.00010186199342825848, + "loss": 1.2344, + "step": 930 + }, + { + "epoch": 0.05, + "grad_norm": 0.169921875, + "learning_rate": 0.00010240963855421688, + "loss": 1.173, + "step": 935 + }, + { + "epoch": 0.05, + "grad_norm": 0.158203125, + "learning_rate": 0.00010295728368017525, + "loss": 1.2133, + "step": 940 + }, + { + "epoch": 0.05, + "grad_norm": 0.16015625, + "learning_rate": 0.00010350492880613362, + "loss": 1.2304, + "step": 945 + }, + { + "epoch": 0.05, + "grad_norm": 0.189453125, + "learning_rate": 0.000104052573932092, + "loss": 1.2685, + "step": 950 + }, + { + "epoch": 0.05, + "grad_norm": 0.1689453125, + "learning_rate": 0.0001046002190580504, + "loss": 1.2245, + "step": 955 + }, + { + "epoch": 0.05, + "grad_norm": 0.16796875, + "learning_rate": 0.00010514786418400876, + "loss": 1.2806, + "step": 960 + }, + { + "epoch": 0.05, + "grad_norm": 0.1630859375, + "learning_rate": 0.00010569550930996715, + "loss": 1.2308, + "step": 965 + }, + { + "epoch": 0.05, + "grad_norm": 0.162109375, + "learning_rate": 0.00010624315443592552, + "loss": 1.2814, + "step": 970 + }, + { + "epoch": 0.05, + "grad_norm": 0.16015625, + "learning_rate": 0.00010679079956188392, + "loss": 1.2158, + "step": 975 + }, + { + "epoch": 0.05, + "grad_norm": 0.1669921875, + "learning_rate": 0.00010733844468784229, + "loss": 1.254, + "step": 980 + }, + { + "epoch": 0.05, + "grad_norm": 0.1689453125, + "learning_rate": 0.00010788608981380066, + "loss": 1.3504, + "step": 985 + }, + { + "epoch": 0.05, + "grad_norm": 0.1650390625, + "learning_rate": 0.00010843373493975903, + "loss": 1.2993, + "step": 990 + }, + { + "epoch": 0.05, + "grad_norm": 0.169921875, + "learning_rate": 0.00010898138006571743, + "loss": 1.2117, + "step": 995 + }, + { + "epoch": 0.05, + "grad_norm": 0.169921875, + "learning_rate": 0.0001095290251916758, + "loss": 1.2644, + "step": 1000 + }, + { + "epoch": 0.06, + "grad_norm": 0.166015625, + "learning_rate": 0.00011007667031763418, + "loss": 1.1637, + "step": 1005 + }, + { + "epoch": 0.06, + "grad_norm": 0.1787109375, + "learning_rate": 0.00011062431544359256, + "loss": 1.2364, + "step": 1010 + }, + { + "epoch": 0.06, + "grad_norm": 0.1826171875, + "learning_rate": 0.00011117196056955093, + "loss": 1.1681, + "step": 1015 + }, + { + "epoch": 0.06, + "grad_norm": 0.15234375, + "learning_rate": 0.00011171960569550932, + "loss": 1.2646, + "step": 1020 + }, + { + "epoch": 0.06, + "grad_norm": 0.1728515625, + "learning_rate": 0.0001122672508214677, + "loss": 1.2545, + "step": 1025 + }, + { + "epoch": 0.06, + "grad_norm": 0.1650390625, + "learning_rate": 0.00011281489594742607, + "loss": 1.3127, + "step": 1030 + }, + { + "epoch": 0.06, + "grad_norm": 0.1748046875, + "learning_rate": 0.00011336254107338444, + "loss": 1.1493, + "step": 1035 + }, + { + "epoch": 0.06, + "grad_norm": 0.1689453125, + "learning_rate": 0.00011391018619934284, + "loss": 1.2099, + "step": 1040 + }, + { + "epoch": 0.06, + "grad_norm": 0.1591796875, + "learning_rate": 0.0001144578313253012, + "loss": 1.1963, + "step": 1045 + }, + { + "epoch": 0.06, + "grad_norm": 0.158203125, + "learning_rate": 0.00011500547645125959, + "loss": 1.2605, + "step": 1050 + }, + { + "epoch": 0.06, + "grad_norm": 0.169921875, + "learning_rate": 0.00011555312157721796, + "loss": 1.1419, + "step": 1055 + }, + { + "epoch": 0.06, + "grad_norm": 0.158203125, + "learning_rate": 0.00011610076670317636, + "loss": 1.1865, + "step": 1060 + }, + { + "epoch": 0.06, + "grad_norm": 0.166015625, + "learning_rate": 0.00011664841182913473, + "loss": 1.2134, + "step": 1065 + }, + { + "epoch": 0.06, + "grad_norm": 0.166015625, + "learning_rate": 0.0001171960569550931, + "loss": 1.2468, + "step": 1070 + }, + { + "epoch": 0.06, + "grad_norm": 0.171875, + "learning_rate": 0.00011774370208105147, + "loss": 1.2814, + "step": 1075 + }, + { + "epoch": 0.06, + "grad_norm": 0.1640625, + "learning_rate": 0.00011829134720700987, + "loss": 1.2728, + "step": 1080 + }, + { + "epoch": 0.06, + "grad_norm": 0.1650390625, + "learning_rate": 0.00011883899233296824, + "loss": 1.2128, + "step": 1085 + }, + { + "epoch": 0.06, + "grad_norm": 0.1591796875, + "learning_rate": 0.00011938663745892663, + "loss": 1.2427, + "step": 1090 + }, + { + "epoch": 0.06, + "grad_norm": 0.1591796875, + "learning_rate": 0.000119934282584885, + "loss": 1.2083, + "step": 1095 + }, + { + "epoch": 0.06, + "grad_norm": 0.1611328125, + "learning_rate": 0.0001204819277108434, + "loss": 1.2308, + "step": 1100 + }, + { + "epoch": 0.06, + "grad_norm": 0.1767578125, + "learning_rate": 0.00012102957283680177, + "loss": 1.2208, + "step": 1105 + }, + { + "epoch": 0.06, + "grad_norm": 0.173828125, + "learning_rate": 0.00012157721796276014, + "loss": 1.2535, + "step": 1110 + }, + { + "epoch": 0.06, + "grad_norm": 0.1591796875, + "learning_rate": 0.0001221248630887185, + "loss": 1.2217, + "step": 1115 + }, + { + "epoch": 0.06, + "grad_norm": 0.1650390625, + "learning_rate": 0.00012267250821467688, + "loss": 1.2437, + "step": 1120 + }, + { + "epoch": 0.06, + "grad_norm": 0.1611328125, + "learning_rate": 0.00012322015334063528, + "loss": 1.2499, + "step": 1125 + }, + { + "epoch": 0.06, + "grad_norm": 0.1669921875, + "learning_rate": 0.00012376779846659365, + "loss": 1.1709, + "step": 1130 + }, + { + "epoch": 0.06, + "grad_norm": 0.1630859375, + "learning_rate": 0.00012431544359255202, + "loss": 1.219, + "step": 1135 + }, + { + "epoch": 0.06, + "grad_norm": 0.1630859375, + "learning_rate": 0.0001248630887185104, + "loss": 1.1608, + "step": 1140 + }, + { + "epoch": 0.06, + "grad_norm": 0.1630859375, + "learning_rate": 0.0001254107338444688, + "loss": 1.2019, + "step": 1145 + }, + { + "epoch": 0.06, + "grad_norm": 0.1611328125, + "learning_rate": 0.00012595837897042716, + "loss": 1.2408, + "step": 1150 + }, + { + "epoch": 0.06, + "grad_norm": 0.1640625, + "learning_rate": 0.00012650602409638556, + "loss": 1.2266, + "step": 1155 + }, + { + "epoch": 0.06, + "grad_norm": 0.1669921875, + "learning_rate": 0.00012705366922234393, + "loss": 1.2009, + "step": 1160 + }, + { + "epoch": 0.06, + "grad_norm": 0.16796875, + "learning_rate": 0.00012760131434830233, + "loss": 1.2754, + "step": 1165 + }, + { + "epoch": 0.06, + "grad_norm": 0.1669921875, + "learning_rate": 0.0001281489594742607, + "loss": 1.28, + "step": 1170 + }, + { + "epoch": 0.06, + "grad_norm": 0.1650390625, + "learning_rate": 0.00012869660460021907, + "loss": 1.2804, + "step": 1175 + }, + { + "epoch": 0.06, + "grad_norm": 0.1630859375, + "learning_rate": 0.00012924424972617744, + "loss": 1.2307, + "step": 1180 + }, + { + "epoch": 0.06, + "grad_norm": 0.158203125, + "learning_rate": 0.00012979189485213584, + "loss": 1.2555, + "step": 1185 + }, + { + "epoch": 0.07, + "grad_norm": 0.1640625, + "learning_rate": 0.0001303395399780942, + "loss": 1.2674, + "step": 1190 + }, + { + "epoch": 0.07, + "grad_norm": 0.1728515625, + "learning_rate": 0.00013088718510405258, + "loss": 1.2061, + "step": 1195 + }, + { + "epoch": 0.07, + "grad_norm": 0.1611328125, + "learning_rate": 0.00013143483023001095, + "loss": 1.2576, + "step": 1200 + }, + { + "epoch": 0.07, + "grad_norm": 0.16015625, + "learning_rate": 0.00013198247535596935, + "loss": 1.2024, + "step": 1205 + }, + { + "epoch": 0.07, + "grad_norm": 0.1513671875, + "learning_rate": 0.00013253012048192772, + "loss": 1.2608, + "step": 1210 + }, + { + "epoch": 0.07, + "grad_norm": 0.16015625, + "learning_rate": 0.0001330777656078861, + "loss": 1.1914, + "step": 1215 + }, + { + "epoch": 0.07, + "grad_norm": 0.169921875, + "learning_rate": 0.00013362541073384446, + "loss": 1.2109, + "step": 1220 + }, + { + "epoch": 0.07, + "grad_norm": 0.1728515625, + "learning_rate": 0.00013417305585980283, + "loss": 1.1982, + "step": 1225 + }, + { + "epoch": 0.07, + "grad_norm": 0.1630859375, + "learning_rate": 0.00013472070098576123, + "loss": 1.3028, + "step": 1230 + }, + { + "epoch": 0.07, + "grad_norm": 0.158203125, + "learning_rate": 0.0001352683461117196, + "loss": 1.1142, + "step": 1235 + }, + { + "epoch": 0.07, + "grad_norm": 0.154296875, + "learning_rate": 0.000135815991237678, + "loss": 1.2338, + "step": 1240 + }, + { + "epoch": 0.07, + "grad_norm": 0.1552734375, + "learning_rate": 0.00013636363636363637, + "loss": 1.2409, + "step": 1245 + }, + { + "epoch": 0.07, + "grad_norm": 0.154296875, + "learning_rate": 0.00013691128148959477, + "loss": 1.2334, + "step": 1250 + }, + { + "epoch": 0.07, + "grad_norm": 0.169921875, + "learning_rate": 0.00013745892661555314, + "loss": 1.2972, + "step": 1255 + }, + { + "epoch": 0.07, + "grad_norm": 0.16015625, + "learning_rate": 0.0001380065717415115, + "loss": 1.2492, + "step": 1260 + }, + { + "epoch": 0.07, + "grad_norm": 0.162109375, + "learning_rate": 0.00013855421686746988, + "loss": 1.2353, + "step": 1265 + }, + { + "epoch": 0.07, + "grad_norm": 0.1640625, + "learning_rate": 0.00013910186199342828, + "loss": 1.1976, + "step": 1270 + }, + { + "epoch": 0.07, + "grad_norm": 0.16796875, + "learning_rate": 0.00013964950711938665, + "loss": 1.1801, + "step": 1275 + }, + { + "epoch": 0.07, + "grad_norm": 0.1630859375, + "learning_rate": 0.00014019715224534502, + "loss": 1.246, + "step": 1280 + }, + { + "epoch": 0.07, + "grad_norm": 0.162109375, + "learning_rate": 0.0001407447973713034, + "loss": 1.1764, + "step": 1285 + }, + { + "epoch": 0.07, + "grad_norm": 0.1474609375, + "learning_rate": 0.0001412924424972618, + "loss": 1.2766, + "step": 1290 + }, + { + "epoch": 0.07, + "grad_norm": 0.1650390625, + "learning_rate": 0.00014184008762322016, + "loss": 1.3015, + "step": 1295 + }, + { + "epoch": 0.07, + "grad_norm": 0.1689453125, + "learning_rate": 0.00014238773274917853, + "loss": 1.2957, + "step": 1300 + }, + { + "epoch": 0.07, + "grad_norm": 0.15234375, + "learning_rate": 0.0001429353778751369, + "loss": 1.2891, + "step": 1305 + }, + { + "epoch": 0.07, + "grad_norm": 0.1611328125, + "learning_rate": 0.0001434830230010953, + "loss": 1.2687, + "step": 1310 + }, + { + "epoch": 0.07, + "grad_norm": 0.162109375, + "learning_rate": 0.00014403066812705367, + "loss": 1.1833, + "step": 1315 + }, + { + "epoch": 0.07, + "grad_norm": 0.1552734375, + "learning_rate": 0.00014457831325301204, + "loss": 1.2054, + "step": 1320 + }, + { + "epoch": 0.07, + "grad_norm": 0.1611328125, + "learning_rate": 0.00014512595837897044, + "loss": 1.2631, + "step": 1325 + }, + { + "epoch": 0.07, + "grad_norm": 0.162109375, + "learning_rate": 0.0001456736035049288, + "loss": 1.1603, + "step": 1330 + }, + { + "epoch": 0.07, + "grad_norm": 0.15625, + "learning_rate": 0.0001462212486308872, + "loss": 1.2929, + "step": 1335 + }, + { + "epoch": 0.07, + "grad_norm": 0.1572265625, + "learning_rate": 0.00014676889375684558, + "loss": 1.1819, + "step": 1340 + }, + { + "epoch": 0.07, + "grad_norm": 0.1689453125, + "learning_rate": 0.00014731653888280395, + "loss": 1.1963, + "step": 1345 + }, + { + "epoch": 0.07, + "grad_norm": 0.1572265625, + "learning_rate": 0.00014786418400876232, + "loss": 1.2173, + "step": 1350 + }, + { + "epoch": 0.07, + "grad_norm": 0.1552734375, + "learning_rate": 0.00014841182913472072, + "loss": 1.2552, + "step": 1355 + }, + { + "epoch": 0.07, + "grad_norm": 0.1572265625, + "learning_rate": 0.0001489594742606791, + "loss": 1.1746, + "step": 1360 + }, + { + "epoch": 0.07, + "grad_norm": 0.1552734375, + "learning_rate": 0.00014950711938663746, + "loss": 1.2428, + "step": 1365 + }, + { + "epoch": 0.08, + "grad_norm": 0.158203125, + "learning_rate": 0.00015005476451259583, + "loss": 1.2926, + "step": 1370 + }, + { + "epoch": 0.08, + "grad_norm": 0.16015625, + "learning_rate": 0.00015060240963855423, + "loss": 1.2575, + "step": 1375 + }, + { + "epoch": 0.08, + "grad_norm": 0.16015625, + "learning_rate": 0.0001511500547645126, + "loss": 1.1596, + "step": 1380 + }, + { + "epoch": 0.08, + "grad_norm": 0.1572265625, + "learning_rate": 0.00015169769989047097, + "loss": 1.1738, + "step": 1385 + }, + { + "epoch": 0.08, + "grad_norm": 0.1630859375, + "learning_rate": 0.00015224534501642934, + "loss": 1.2195, + "step": 1390 + }, + { + "epoch": 0.08, + "grad_norm": 0.1572265625, + "learning_rate": 0.00015279299014238774, + "loss": 1.2544, + "step": 1395 + }, + { + "epoch": 0.08, + "grad_norm": 0.154296875, + "learning_rate": 0.0001533406352683461, + "loss": 1.1298, + "step": 1400 + }, + { + "epoch": 0.08, + "grad_norm": 0.1689453125, + "learning_rate": 0.00015388828039430448, + "loss": 1.1252, + "step": 1405 + }, + { + "epoch": 0.08, + "grad_norm": 0.1640625, + "learning_rate": 0.00015443592552026288, + "loss": 1.2897, + "step": 1410 + }, + { + "epoch": 0.08, + "grad_norm": 0.150390625, + "learning_rate": 0.00015498357064622128, + "loss": 1.2853, + "step": 1415 + }, + { + "epoch": 0.08, + "grad_norm": 0.1630859375, + "learning_rate": 0.00015553121577217965, + "loss": 1.2858, + "step": 1420 + }, + { + "epoch": 0.08, + "grad_norm": 0.15234375, + "learning_rate": 0.00015607886089813802, + "loss": 1.1699, + "step": 1425 + }, + { + "epoch": 0.08, + "grad_norm": 0.16015625, + "learning_rate": 0.0001566265060240964, + "loss": 1.2723, + "step": 1430 + }, + { + "epoch": 0.08, + "grad_norm": 0.15234375, + "learning_rate": 0.0001571741511500548, + "loss": 1.2586, + "step": 1435 + }, + { + "epoch": 0.08, + "grad_norm": 0.1669921875, + "learning_rate": 0.00015772179627601316, + "loss": 1.2875, + "step": 1440 + }, + { + "epoch": 0.08, + "grad_norm": 0.1513671875, + "learning_rate": 0.00015826944140197153, + "loss": 1.2107, + "step": 1445 + }, + { + "epoch": 0.08, + "grad_norm": 0.1630859375, + "learning_rate": 0.0001588170865279299, + "loss": 1.2168, + "step": 1450 + }, + { + "epoch": 0.08, + "grad_norm": 0.1513671875, + "learning_rate": 0.00015936473165388827, + "loss": 1.2014, + "step": 1455 + }, + { + "epoch": 0.08, + "grad_norm": 0.146484375, + "learning_rate": 0.00015991237677984667, + "loss": 1.1969, + "step": 1460 + }, + { + "epoch": 0.08, + "grad_norm": 0.166015625, + "learning_rate": 0.00016046002190580504, + "loss": 1.2384, + "step": 1465 + }, + { + "epoch": 0.08, + "grad_norm": 0.154296875, + "learning_rate": 0.0001610076670317634, + "loss": 1.2022, + "step": 1470 + }, + { + "epoch": 0.08, + "grad_norm": 0.154296875, + "learning_rate": 0.00016155531215772178, + "loss": 1.2074, + "step": 1475 + }, + { + "epoch": 0.08, + "grad_norm": 0.1591796875, + "learning_rate": 0.00016210295728368018, + "loss": 1.2065, + "step": 1480 + }, + { + "epoch": 0.08, + "grad_norm": 0.1474609375, + "learning_rate": 0.00016265060240963855, + "loss": 1.2527, + "step": 1485 + }, + { + "epoch": 0.08, + "grad_norm": 0.158203125, + "learning_rate": 0.00016319824753559692, + "loss": 1.1705, + "step": 1490 + }, + { + "epoch": 0.08, + "grad_norm": 0.154296875, + "learning_rate": 0.00016374589266155532, + "loss": 1.2375, + "step": 1495 + }, + { + "epoch": 0.08, + "grad_norm": 0.16015625, + "learning_rate": 0.00016429353778751372, + "loss": 1.2421, + "step": 1500 + }, + { + "epoch": 0.08, + "grad_norm": 0.1513671875, + "learning_rate": 0.0001648411829134721, + "loss": 1.2498, + "step": 1505 + }, + { + "epoch": 0.08, + "grad_norm": 0.158203125, + "learning_rate": 0.00016538882803943046, + "loss": 1.2472, + "step": 1510 + }, + { + "epoch": 0.08, + "grad_norm": 0.15625, + "learning_rate": 0.00016593647316538883, + "loss": 1.2729, + "step": 1515 + }, + { + "epoch": 0.08, + "grad_norm": 0.154296875, + "learning_rate": 0.00016648411829134723, + "loss": 1.1838, + "step": 1520 + }, + { + "epoch": 0.08, + "grad_norm": 0.1533203125, + "learning_rate": 0.0001670317634173056, + "loss": 1.2131, + "step": 1525 + }, + { + "epoch": 0.08, + "grad_norm": 0.15625, + "learning_rate": 0.00016757940854326397, + "loss": 1.2797, + "step": 1530 + }, + { + "epoch": 0.08, + "grad_norm": 0.15625, + "learning_rate": 0.00016812705366922234, + "loss": 1.14, + "step": 1535 + }, + { + "epoch": 0.08, + "grad_norm": 0.1572265625, + "learning_rate": 0.00016867469879518074, + "loss": 1.1353, + "step": 1540 + }, + { + "epoch": 0.08, + "grad_norm": 0.15625, + "learning_rate": 0.0001692223439211391, + "loss": 1.2047, + "step": 1545 + }, + { + "epoch": 0.08, + "grad_norm": 0.1552734375, + "learning_rate": 0.00016976998904709748, + "loss": 1.2367, + "step": 1550 + }, + { + "epoch": 0.09, + "grad_norm": 0.1572265625, + "learning_rate": 0.00017031763417305585, + "loss": 1.2328, + "step": 1555 + }, + { + "epoch": 0.09, + "grad_norm": 0.1484375, + "learning_rate": 0.00017086527929901423, + "loss": 1.2321, + "step": 1560 + }, + { + "epoch": 0.09, + "grad_norm": 0.158203125, + "learning_rate": 0.00017141292442497262, + "loss": 1.2252, + "step": 1565 + }, + { + "epoch": 0.09, + "grad_norm": 0.1669921875, + "learning_rate": 0.000171960569550931, + "loss": 1.2612, + "step": 1570 + }, + { + "epoch": 0.09, + "grad_norm": 0.1591796875, + "learning_rate": 0.00017250821467688937, + "loss": 1.2438, + "step": 1575 + }, + { + "epoch": 0.09, + "grad_norm": 0.15234375, + "learning_rate": 0.00017305585980284776, + "loss": 1.2371, + "step": 1580 + }, + { + "epoch": 0.09, + "grad_norm": 0.154296875, + "learning_rate": 0.00017360350492880616, + "loss": 1.1611, + "step": 1585 + }, + { + "epoch": 0.09, + "grad_norm": 0.16015625, + "learning_rate": 0.00017415115005476453, + "loss": 1.2399, + "step": 1590 + }, + { + "epoch": 0.09, + "grad_norm": 0.1484375, + "learning_rate": 0.0001746987951807229, + "loss": 1.1899, + "step": 1595 + }, + { + "epoch": 0.09, + "grad_norm": 0.15625, + "learning_rate": 0.00017524644030668127, + "loss": 1.2359, + "step": 1600 + }, + { + "epoch": 0.09, + "grad_norm": 0.158203125, + "learning_rate": 0.00017579408543263967, + "loss": 1.2184, + "step": 1605 + }, + { + "epoch": 0.09, + "grad_norm": 0.1611328125, + "learning_rate": 0.00017634173055859804, + "loss": 1.2627, + "step": 1610 + }, + { + "epoch": 0.09, + "grad_norm": 0.1552734375, + "learning_rate": 0.00017688937568455641, + "loss": 1.2724, + "step": 1615 + }, + { + "epoch": 0.09, + "grad_norm": 0.158203125, + "learning_rate": 0.00017743702081051479, + "loss": 1.2299, + "step": 1620 + }, + { + "epoch": 0.09, + "grad_norm": 0.162109375, + "learning_rate": 0.00017798466593647318, + "loss": 1.227, + "step": 1625 + }, + { + "epoch": 0.09, + "grad_norm": 0.1494140625, + "learning_rate": 0.00017853231106243155, + "loss": 1.2011, + "step": 1630 + }, + { + "epoch": 0.09, + "grad_norm": 0.1572265625, + "learning_rate": 0.00017907995618838993, + "loss": 1.1601, + "step": 1635 + }, + { + "epoch": 0.09, + "grad_norm": 0.1533203125, + "learning_rate": 0.0001796276013143483, + "loss": 1.2511, + "step": 1640 + }, + { + "epoch": 0.09, + "grad_norm": 0.162109375, + "learning_rate": 0.0001801752464403067, + "loss": 1.209, + "step": 1645 + }, + { + "epoch": 0.09, + "grad_norm": 0.16015625, + "learning_rate": 0.00018072289156626507, + "loss": 1.2136, + "step": 1650 + }, + { + "epoch": 0.09, + "grad_norm": 0.15234375, + "learning_rate": 0.00018127053669222344, + "loss": 1.2127, + "step": 1655 + }, + { + "epoch": 0.09, + "grad_norm": 0.154296875, + "learning_rate": 0.00018181818181818183, + "loss": 1.1698, + "step": 1660 + }, + { + "epoch": 0.09, + "grad_norm": 0.1484375, + "learning_rate": 0.0001823658269441402, + "loss": 1.2385, + "step": 1665 + }, + { + "epoch": 0.09, + "grad_norm": 0.1533203125, + "learning_rate": 0.0001829134720700986, + "loss": 1.2362, + "step": 1670 + }, + { + "epoch": 0.09, + "grad_norm": 0.14453125, + "learning_rate": 0.00018346111719605697, + "loss": 1.2404, + "step": 1675 + }, + { + "epoch": 0.09, + "grad_norm": 0.15625, + "learning_rate": 0.00018400876232201535, + "loss": 1.2084, + "step": 1680 + }, + { + "epoch": 0.09, + "grad_norm": 0.150390625, + "learning_rate": 0.00018455640744797372, + "loss": 1.2191, + "step": 1685 + }, + { + "epoch": 0.09, + "grad_norm": 0.162109375, + "learning_rate": 0.00018510405257393211, + "loss": 1.2111, + "step": 1690 + }, + { + "epoch": 0.09, + "grad_norm": 0.1484375, + "learning_rate": 0.00018565169769989049, + "loss": 1.1864, + "step": 1695 + }, + { + "epoch": 0.09, + "grad_norm": 0.1494140625, + "learning_rate": 0.00018619934282584886, + "loss": 1.2756, + "step": 1700 + }, + { + "epoch": 0.09, + "grad_norm": 0.15234375, + "learning_rate": 0.00018674698795180723, + "loss": 1.1807, + "step": 1705 + }, + { + "epoch": 0.09, + "grad_norm": 0.1611328125, + "learning_rate": 0.00018729463307776562, + "loss": 1.1757, + "step": 1710 + }, + { + "epoch": 0.09, + "grad_norm": 0.150390625, + "learning_rate": 0.000187842278203724, + "loss": 1.1995, + "step": 1715 + }, + { + "epoch": 0.09, + "grad_norm": 0.1552734375, + "learning_rate": 0.00018838992332968237, + "loss": 1.2545, + "step": 1720 + }, + { + "epoch": 0.09, + "grad_norm": 0.158203125, + "learning_rate": 0.00018893756845564074, + "loss": 1.2862, + "step": 1725 + }, + { + "epoch": 0.09, + "grad_norm": 0.16015625, + "learning_rate": 0.00018948521358159914, + "loss": 1.2356, + "step": 1730 + }, + { + "epoch": 0.1, + "grad_norm": 0.1591796875, + "learning_rate": 0.0001900328587075575, + "loss": 1.2461, + "step": 1735 + }, + { + "epoch": 0.1, + "grad_norm": 0.1591796875, + "learning_rate": 0.00019058050383351588, + "loss": 1.2673, + "step": 1740 + }, + { + "epoch": 0.1, + "grad_norm": 0.1630859375, + "learning_rate": 0.00019112814895947428, + "loss": 1.2255, + "step": 1745 + }, + { + "epoch": 0.1, + "grad_norm": 0.14453125, + "learning_rate": 0.00019167579408543265, + "loss": 1.1952, + "step": 1750 + }, + { + "epoch": 0.1, + "grad_norm": 0.150390625, + "learning_rate": 0.00019222343921139104, + "loss": 1.1944, + "step": 1755 + }, + { + "epoch": 0.1, + "grad_norm": 0.1572265625, + "learning_rate": 0.00019277108433734942, + "loss": 1.2916, + "step": 1760 + }, + { + "epoch": 0.1, + "grad_norm": 0.1552734375, + "learning_rate": 0.0001933187294633078, + "loss": 1.216, + "step": 1765 + }, + { + "epoch": 0.1, + "grad_norm": 0.1552734375, + "learning_rate": 0.00019386637458926618, + "loss": 1.2824, + "step": 1770 + }, + { + "epoch": 0.1, + "grad_norm": 0.162109375, + "learning_rate": 0.00019441401971522456, + "loss": 1.1353, + "step": 1775 + }, + { + "epoch": 0.1, + "grad_norm": 0.1474609375, + "learning_rate": 0.00019496166484118293, + "loss": 1.1201, + "step": 1780 + }, + { + "epoch": 0.1, + "grad_norm": 0.15625, + "learning_rate": 0.0001955093099671413, + "loss": 1.2502, + "step": 1785 + }, + { + "epoch": 0.1, + "grad_norm": 0.185546875, + "learning_rate": 0.00019605695509309967, + "loss": 1.1795, + "step": 1790 + }, + { + "epoch": 0.1, + "grad_norm": 0.15234375, + "learning_rate": 0.00019660460021905807, + "loss": 1.2021, + "step": 1795 + }, + { + "epoch": 0.1, + "grad_norm": 0.1494140625, + "learning_rate": 0.00019715224534501644, + "loss": 1.1568, + "step": 1800 + }, + { + "epoch": 0.1, + "grad_norm": 0.1474609375, + "learning_rate": 0.0001976998904709748, + "loss": 1.2459, + "step": 1805 + }, + { + "epoch": 0.1, + "grad_norm": 0.1533203125, + "learning_rate": 0.00019824753559693318, + "loss": 1.2203, + "step": 1810 + }, + { + "epoch": 0.1, + "grad_norm": 0.15625, + "learning_rate": 0.00019879518072289158, + "loss": 1.2607, + "step": 1815 + }, + { + "epoch": 0.1, + "grad_norm": 0.1611328125, + "learning_rate": 0.00019934282584884995, + "loss": 1.24, + "step": 1820 + }, + { + "epoch": 0.1, + "grad_norm": 0.1552734375, + "learning_rate": 0.00019989047097480832, + "loss": 1.2106, + "step": 1825 + }, + { + "epoch": 0.1, + "grad_norm": 0.1455078125, + "learning_rate": 0.00019999997075432027, + "loss": 1.1284, + "step": 1830 + }, + { + "epoch": 0.1, + "grad_norm": 0.162109375, + "learning_rate": 0.0001999998519437756, + "loss": 1.2807, + "step": 1835 + }, + { + "epoch": 0.1, + "grad_norm": 0.150390625, + "learning_rate": 0.0001999996417406196, + "loss": 1.185, + "step": 1840 + }, + { + "epoch": 0.1, + "grad_norm": 0.1630859375, + "learning_rate": 0.0001999993401450443, + "loss": 1.2288, + "step": 1845 + }, + { + "epoch": 0.1, + "grad_norm": 0.1572265625, + "learning_rate": 0.00019999894715732542, + "loss": 1.1901, + "step": 1850 + }, + { + "epoch": 0.1, + "grad_norm": 0.154296875, + "learning_rate": 0.000199998462777822, + "loss": 1.2768, + "step": 1855 + }, + { + "epoch": 0.1, + "grad_norm": 0.158203125, + "learning_rate": 0.00019999788700697684, + "loss": 1.2265, + "step": 1860 + }, + { + "epoch": 0.1, + "grad_norm": 0.1630859375, + "learning_rate": 0.0001999972198453161, + "loss": 1.1219, + "step": 1865 + }, + { + "epoch": 0.1, + "grad_norm": 0.1474609375, + "learning_rate": 0.0001999964612934495, + "loss": 1.2869, + "step": 1870 + }, + { + "epoch": 0.1, + "grad_norm": 0.1494140625, + "learning_rate": 0.00019999561135207038, + "loss": 1.2066, + "step": 1875 + }, + { + "epoch": 0.1, + "grad_norm": 0.1513671875, + "learning_rate": 0.00019999467002195545, + "loss": 1.2709, + "step": 1880 + }, + { + "epoch": 0.1, + "grad_norm": 0.15625, + "learning_rate": 0.00019999363730396503, + "loss": 1.1617, + "step": 1885 + }, + { + "epoch": 0.1, + "grad_norm": 0.15234375, + "learning_rate": 0.000199992513199043, + "loss": 1.2292, + "step": 1890 + }, + { + "epoch": 0.1, + "grad_norm": 0.15625, + "learning_rate": 0.00019999129770821662, + "loss": 1.1603, + "step": 1895 + }, + { + "epoch": 0.1, + "grad_norm": 0.1494140625, + "learning_rate": 0.00019998999083259685, + "loss": 1.2836, + "step": 1900 + }, + { + "epoch": 0.1, + "grad_norm": 0.1513671875, + "learning_rate": 0.00019998859257337802, + "loss": 1.209, + "step": 1905 + }, + { + "epoch": 0.1, + "grad_norm": 0.1484375, + "learning_rate": 0.00019998710293183804, + "loss": 1.211, + "step": 1910 + }, + { + "epoch": 0.1, + "grad_norm": 0.1572265625, + "learning_rate": 0.00019998552190933838, + "loss": 1.2048, + "step": 1915 + }, + { + "epoch": 0.11, + "grad_norm": 0.150390625, + "learning_rate": 0.00019998384950732393, + "loss": 1.1859, + "step": 1920 + }, + { + "epoch": 0.11, + "grad_norm": 0.1484375, + "learning_rate": 0.00019998208572732321, + "loss": 1.2177, + "step": 1925 + }, + { + "epoch": 0.11, + "grad_norm": 0.1533203125, + "learning_rate": 0.00019998023057094812, + "loss": 1.2393, + "step": 1930 + }, + { + "epoch": 0.11, + "grad_norm": 0.162109375, + "learning_rate": 0.00019997828403989416, + "loss": 1.1509, + "step": 1935 + }, + { + "epoch": 0.11, + "grad_norm": 0.154296875, + "learning_rate": 0.00019997624613594034, + "loss": 1.1318, + "step": 1940 + }, + { + "epoch": 0.11, + "grad_norm": 0.1591796875, + "learning_rate": 0.00019997411686094912, + "loss": 1.1796, + "step": 1945 + }, + { + "epoch": 0.11, + "grad_norm": 0.1513671875, + "learning_rate": 0.0001999718962168665, + "loss": 1.168, + "step": 1950 + }, + { + "epoch": 0.11, + "grad_norm": 0.1552734375, + "learning_rate": 0.00019996958420572203, + "loss": 1.2048, + "step": 1955 + }, + { + "epoch": 0.11, + "grad_norm": 0.1494140625, + "learning_rate": 0.0001999671808296287, + "loss": 1.1888, + "step": 1960 + }, + { + "epoch": 0.11, + "grad_norm": 0.154296875, + "learning_rate": 0.000199964686090783, + "loss": 1.2198, + "step": 1965 + }, + { + "epoch": 0.11, + "grad_norm": 0.16015625, + "learning_rate": 0.00019996209999146499, + "loss": 1.2259, + "step": 1970 + }, + { + "epoch": 0.11, + "grad_norm": 0.1494140625, + "learning_rate": 0.00019995942253403813, + "loss": 1.167, + "step": 1975 + }, + { + "epoch": 0.11, + "grad_norm": 0.1689453125, + "learning_rate": 0.00019995665372094944, + "loss": 1.2167, + "step": 1980 + }, + { + "epoch": 0.11, + "grad_norm": 0.158203125, + "learning_rate": 0.00019995379355472942, + "loss": 1.1921, + "step": 1985 + }, + { + "epoch": 0.11, + "grad_norm": 0.1591796875, + "learning_rate": 0.00019995084203799202, + "loss": 1.1466, + "step": 1990 + }, + { + "epoch": 0.11, + "grad_norm": 0.1513671875, + "learning_rate": 0.00019994779917343475, + "loss": 1.255, + "step": 1995 + }, + { + "epoch": 0.11, + "grad_norm": 0.15625, + "learning_rate": 0.00019994466496383857, + "loss": 1.2633, + "step": 2000 + }, + { + "epoch": 0.11, + "grad_norm": 0.1591796875, + "learning_rate": 0.0001999414394120679, + "loss": 1.1621, + "step": 2005 + }, + { + "epoch": 0.11, + "grad_norm": 0.1572265625, + "learning_rate": 0.00019993812252107066, + "loss": 1.2009, + "step": 2010 + }, + { + "epoch": 0.11, + "grad_norm": 0.154296875, + "learning_rate": 0.00019993471429387828, + "loss": 1.221, + "step": 2015 + }, + { + "epoch": 0.11, + "grad_norm": 0.1474609375, + "learning_rate": 0.0001999312147336056, + "loss": 1.253, + "step": 2020 + }, + { + "epoch": 0.11, + "grad_norm": 0.15625, + "learning_rate": 0.00019992762384345096, + "loss": 1.188, + "step": 2025 + }, + { + "epoch": 0.11, + "grad_norm": 0.1513671875, + "learning_rate": 0.0001999239416266962, + "loss": 1.2327, + "step": 2030 + }, + { + "epoch": 0.11, + "grad_norm": 0.1474609375, + "learning_rate": 0.00019992016808670658, + "loss": 1.2273, + "step": 2035 + }, + { + "epoch": 0.11, + "grad_norm": 0.1572265625, + "learning_rate": 0.00019991630322693086, + "loss": 1.2041, + "step": 2040 + }, + { + "epoch": 0.11, + "grad_norm": 0.16015625, + "learning_rate": 0.00019991234705090118, + "loss": 1.2807, + "step": 2045 + }, + { + "epoch": 0.11, + "grad_norm": 0.1494140625, + "learning_rate": 0.00019990829956223328, + "loss": 1.2737, + "step": 2050 + }, + { + "epoch": 0.11, + "grad_norm": 0.1474609375, + "learning_rate": 0.00019990416076462624, + "loss": 1.1945, + "step": 2055 + }, + { + "epoch": 0.11, + "grad_norm": 0.15625, + "learning_rate": 0.00019989993066186262, + "loss": 1.2268, + "step": 2060 + }, + { + "epoch": 0.11, + "grad_norm": 0.1552734375, + "learning_rate": 0.0001998956092578084, + "loss": 1.2135, + "step": 2065 + }, + { + "epoch": 0.11, + "grad_norm": 0.1494140625, + "learning_rate": 0.00019989119655641307, + "loss": 1.2546, + "step": 2070 + }, + { + "epoch": 0.11, + "grad_norm": 0.158203125, + "learning_rate": 0.0001998866925617095, + "loss": 1.1937, + "step": 2075 + }, + { + "epoch": 0.11, + "grad_norm": 0.150390625, + "learning_rate": 0.00019988209727781403, + "loss": 1.2649, + "step": 2080 + }, + { + "epoch": 0.11, + "grad_norm": 0.1552734375, + "learning_rate": 0.00019987741070892642, + "loss": 1.1636, + "step": 2085 + }, + { + "epoch": 0.11, + "grad_norm": 0.158203125, + "learning_rate": 0.0001998726328593298, + "loss": 1.2095, + "step": 2090 + }, + { + "epoch": 0.11, + "grad_norm": 0.15625, + "learning_rate": 0.00019986776373339082, + "loss": 1.2588, + "step": 2095 + }, + { + "epoch": 0.12, + "grad_norm": 0.1484375, + "learning_rate": 0.00019986280333555954, + "loss": 1.1869, + "step": 2100 + }, + { + "epoch": 0.12, + "grad_norm": 0.154296875, + "learning_rate": 0.00019985775167036932, + "loss": 1.1601, + "step": 2105 + }, + { + "epoch": 0.12, + "grad_norm": 0.1455078125, + "learning_rate": 0.0001998526087424371, + "loss": 1.1809, + "step": 2110 + }, + { + "epoch": 0.12, + "grad_norm": 0.1455078125, + "learning_rate": 0.0001998473745564631, + "loss": 1.168, + "step": 2115 + }, + { + "epoch": 0.12, + "grad_norm": 0.1533203125, + "learning_rate": 0.000199842049117231, + "loss": 1.1992, + "step": 2120 + }, + { + "epoch": 0.12, + "grad_norm": 0.162109375, + "learning_rate": 0.00019983663242960784, + "loss": 1.2649, + "step": 2125 + }, + { + "epoch": 0.12, + "grad_norm": 0.154296875, + "learning_rate": 0.0001998311244985441, + "loss": 1.1687, + "step": 2130 + }, + { + "epoch": 0.12, + "grad_norm": 0.15234375, + "learning_rate": 0.00019982552532907364, + "loss": 1.1899, + "step": 2135 + }, + { + "epoch": 0.12, + "grad_norm": 0.1455078125, + "learning_rate": 0.00019981983492631367, + "loss": 1.1579, + "step": 2140 + }, + { + "epoch": 0.12, + "grad_norm": 0.1533203125, + "learning_rate": 0.0001998140532954648, + "loss": 1.132, + "step": 2145 + }, + { + "epoch": 0.12, + "grad_norm": 0.15625, + "learning_rate": 0.00019980818044181106, + "loss": 1.1908, + "step": 2150 + }, + { + "epoch": 0.12, + "grad_norm": 0.14453125, + "learning_rate": 0.0001998022163707198, + "loss": 1.2316, + "step": 2155 + }, + { + "epoch": 0.12, + "grad_norm": 0.1484375, + "learning_rate": 0.00019979616108764173, + "loss": 1.1697, + "step": 2160 + }, + { + "epoch": 0.12, + "grad_norm": 0.1591796875, + "learning_rate": 0.00019979001459811095, + "loss": 1.251, + "step": 2165 + }, + { + "epoch": 0.12, + "grad_norm": 0.1572265625, + "learning_rate": 0.0001997837769077449, + "loss": 1.2057, + "step": 2170 + }, + { + "epoch": 0.12, + "grad_norm": 0.15625, + "learning_rate": 0.00019977744802224438, + "loss": 1.183, + "step": 2175 + }, + { + "epoch": 0.12, + "grad_norm": 0.15234375, + "learning_rate": 0.00019977102794739354, + "loss": 1.238, + "step": 2180 + }, + { + "epoch": 0.12, + "grad_norm": 0.1533203125, + "learning_rate": 0.00019976451668905987, + "loss": 1.1808, + "step": 2185 + }, + { + "epoch": 0.12, + "grad_norm": 0.1376953125, + "learning_rate": 0.00019975791425319415, + "loss": 1.2381, + "step": 2190 + }, + { + "epoch": 0.12, + "grad_norm": 0.154296875, + "learning_rate": 0.00019975122064583056, + "loss": 1.2191, + "step": 2195 + }, + { + "epoch": 0.12, + "grad_norm": 0.16015625, + "learning_rate": 0.00019974443587308656, + "loss": 1.2661, + "step": 2200 + }, + { + "epoch": 0.12, + "grad_norm": 0.1474609375, + "learning_rate": 0.00019973755994116292, + "loss": 1.201, + "step": 2205 + }, + { + "epoch": 0.12, + "grad_norm": 0.1484375, + "learning_rate": 0.0001997305928563438, + "loss": 1.1525, + "step": 2210 + }, + { + "epoch": 0.12, + "grad_norm": 0.1494140625, + "learning_rate": 0.00019972353462499657, + "loss": 1.2477, + "step": 2215 + }, + { + "epoch": 0.12, + "grad_norm": 0.15625, + "learning_rate": 0.00019971638525357195, + "loss": 1.2477, + "step": 2220 + }, + { + "epoch": 0.12, + "grad_norm": 0.154296875, + "learning_rate": 0.00019970914474860393, + "loss": 1.2408, + "step": 2225 + }, + { + "epoch": 0.12, + "grad_norm": 0.1513671875, + "learning_rate": 0.00019970181311670983, + "loss": 1.2717, + "step": 2230 + }, + { + "epoch": 0.12, + "grad_norm": 0.1494140625, + "learning_rate": 0.00019969439036459022, + "loss": 1.2187, + "step": 2235 + }, + { + "epoch": 0.12, + "grad_norm": 0.1591796875, + "learning_rate": 0.00019968687649902898, + "loss": 1.1677, + "step": 2240 + }, + { + "epoch": 0.12, + "grad_norm": 0.15625, + "learning_rate": 0.0001996792715268932, + "loss": 1.2571, + "step": 2245 + }, + { + "epoch": 0.12, + "grad_norm": 0.1484375, + "learning_rate": 0.00019967157545513326, + "loss": 1.2498, + "step": 2250 + }, + { + "epoch": 0.12, + "grad_norm": 0.166015625, + "learning_rate": 0.0001996637882907829, + "loss": 1.1404, + "step": 2255 + }, + { + "epoch": 0.12, + "grad_norm": 0.1474609375, + "learning_rate": 0.0001996559100409589, + "loss": 1.2308, + "step": 2260 + }, + { + "epoch": 0.12, + "grad_norm": 0.1494140625, + "learning_rate": 0.00019964794071286154, + "loss": 1.1871, + "step": 2265 + }, + { + "epoch": 0.12, + "grad_norm": 0.15625, + "learning_rate": 0.00019963988031377407, + "loss": 1.1729, + "step": 2270 + }, + { + "epoch": 0.12, + "grad_norm": 0.150390625, + "learning_rate": 0.00019963172885106322, + "loss": 1.2352, + "step": 2275 + }, + { + "epoch": 0.12, + "grad_norm": 0.1513671875, + "learning_rate": 0.00019962348633217878, + "loss": 1.2005, + "step": 2280 + }, + { + "epoch": 0.13, + "grad_norm": 0.1630859375, + "learning_rate": 0.00019961515276465384, + "loss": 1.2379, + "step": 2285 + }, + { + "epoch": 0.13, + "grad_norm": 0.1552734375, + "learning_rate": 0.00019960672815610465, + "loss": 1.1967, + "step": 2290 + }, + { + "epoch": 0.13, + "grad_norm": 0.1552734375, + "learning_rate": 0.00019959821251423074, + "loss": 1.215, + "step": 2295 + }, + { + "epoch": 0.13, + "grad_norm": 0.1591796875, + "learning_rate": 0.00019958960584681474, + "loss": 1.2723, + "step": 2300 + }, + { + "epoch": 0.13, + "grad_norm": 0.1650390625, + "learning_rate": 0.00019958090816172254, + "loss": 1.318, + "step": 2305 + }, + { + "epoch": 0.13, + "grad_norm": 0.1630859375, + "learning_rate": 0.00019957211946690321, + "loss": 1.1797, + "step": 2310 + }, + { + "epoch": 0.13, + "grad_norm": 0.1552734375, + "learning_rate": 0.00019956323977038894, + "loss": 1.1386, + "step": 2315 + }, + { + "epoch": 0.13, + "grad_norm": 0.1591796875, + "learning_rate": 0.00019955426908029514, + "loss": 1.1652, + "step": 2320 + }, + { + "epoch": 0.13, + "grad_norm": 0.150390625, + "learning_rate": 0.0001995452074048204, + "loss": 1.2241, + "step": 2325 + }, + { + "epoch": 0.13, + "grad_norm": 0.15625, + "learning_rate": 0.0001995360547522464, + "loss": 1.1903, + "step": 2330 + }, + { + "epoch": 0.13, + "grad_norm": 0.1591796875, + "learning_rate": 0.000199526811130938, + "loss": 1.1414, + "step": 2335 + }, + { + "epoch": 0.13, + "grad_norm": 0.1552734375, + "learning_rate": 0.00019951747654934322, + "loss": 1.2083, + "step": 2340 + }, + { + "epoch": 0.13, + "grad_norm": 0.1474609375, + "learning_rate": 0.0001995080510159932, + "loss": 1.1457, + "step": 2345 + }, + { + "epoch": 0.13, + "grad_norm": 0.154296875, + "learning_rate": 0.00019949853453950215, + "loss": 1.1902, + "step": 2350 + }, + { + "epoch": 0.13, + "grad_norm": 0.154296875, + "learning_rate": 0.0001994889271285675, + "loss": 1.2253, + "step": 2355 + }, + { + "epoch": 0.13, + "grad_norm": 0.15625, + "learning_rate": 0.00019947922879196966, + "loss": 1.1969, + "step": 2360 + }, + { + "epoch": 0.13, + "grad_norm": 0.158203125, + "learning_rate": 0.00019946943953857226, + "loss": 1.1755, + "step": 2365 + }, + { + "epoch": 0.13, + "grad_norm": 0.15234375, + "learning_rate": 0.00019945955937732194, + "loss": 1.2198, + "step": 2370 + }, + { + "epoch": 0.13, + "grad_norm": 0.1484375, + "learning_rate": 0.00019944958831724844, + "loss": 1.2239, + "step": 2375 + }, + { + "epoch": 0.13, + "grad_norm": 0.154296875, + "learning_rate": 0.0001994395263674646, + "loss": 1.1852, + "step": 2380 + }, + { + "epoch": 0.13, + "grad_norm": 0.1533203125, + "learning_rate": 0.00019942937353716636, + "loss": 1.2415, + "step": 2385 + }, + { + "epoch": 0.13, + "grad_norm": 0.1494140625, + "learning_rate": 0.0001994191298356326, + "loss": 1.1977, + "step": 2390 + }, + { + "epoch": 0.13, + "grad_norm": 0.1591796875, + "learning_rate": 0.00019940879527222536, + "loss": 1.1554, + "step": 2395 + }, + { + "epoch": 0.13, + "grad_norm": 0.162109375, + "learning_rate": 0.00019939836985638965, + "loss": 1.2109, + "step": 2400 + }, + { + "epoch": 0.13, + "grad_norm": 0.1552734375, + "learning_rate": 0.0001993878535976536, + "loss": 1.1984, + "step": 2405 + }, + { + "epoch": 0.13, + "grad_norm": 0.1572265625, + "learning_rate": 0.00019937724650562818, + "loss": 1.2217, + "step": 2410 + }, + { + "epoch": 0.13, + "grad_norm": 0.15234375, + "learning_rate": 0.00019936654859000768, + "loss": 1.1562, + "step": 2415 + }, + { + "epoch": 0.13, + "grad_norm": 0.154296875, + "learning_rate": 0.00019935575986056907, + "loss": 1.2268, + "step": 2420 + }, + { + "epoch": 0.13, + "grad_norm": 0.1552734375, + "learning_rate": 0.00019934488032717255, + "loss": 1.1742, + "step": 2425 + }, + { + "epoch": 0.13, + "grad_norm": 0.15625, + "learning_rate": 0.00019933390999976122, + "loss": 1.1728, + "step": 2430 + }, + { + "epoch": 0.13, + "grad_norm": 0.150390625, + "learning_rate": 0.00019932284888836112, + "loss": 1.191, + "step": 2435 + }, + { + "epoch": 0.13, + "grad_norm": 0.15234375, + "learning_rate": 0.0001993116970030813, + "loss": 1.2015, + "step": 2440 + }, + { + "epoch": 0.13, + "grad_norm": 0.154296875, + "learning_rate": 0.0001993004543541139, + "loss": 1.1842, + "step": 2445 + }, + { + "epoch": 0.13, + "grad_norm": 0.1455078125, + "learning_rate": 0.00019928912095173372, + "loss": 1.2625, + "step": 2450 + }, + { + "epoch": 0.13, + "grad_norm": 0.1572265625, + "learning_rate": 0.00019927769680629873, + "loss": 1.2149, + "step": 2455 + }, + { + "epoch": 0.13, + "grad_norm": 0.166015625, + "learning_rate": 0.0001992661819282498, + "loss": 1.2789, + "step": 2460 + }, + { + "epoch": 0.14, + "grad_norm": 0.1533203125, + "learning_rate": 0.00019925457632811064, + "loss": 1.1799, + "step": 2465 + }, + { + "epoch": 0.14, + "grad_norm": 0.1552734375, + "learning_rate": 0.00019924288001648799, + "loss": 1.1884, + "step": 2470 + }, + { + "epoch": 0.14, + "grad_norm": 0.1484375, + "learning_rate": 0.00019923109300407137, + "loss": 1.2513, + "step": 2475 + }, + { + "epoch": 0.14, + "grad_norm": 0.1572265625, + "learning_rate": 0.0001992192153016333, + "loss": 1.1907, + "step": 2480 + }, + { + "epoch": 0.14, + "grad_norm": 0.162109375, + "learning_rate": 0.0001992072469200291, + "loss": 1.1897, + "step": 2485 + }, + { + "epoch": 0.14, + "grad_norm": 0.1494140625, + "learning_rate": 0.00019919518787019704, + "loss": 1.2474, + "step": 2490 + }, + { + "epoch": 0.14, + "grad_norm": 0.1533203125, + "learning_rate": 0.0001991830381631582, + "loss": 1.1519, + "step": 2495 + }, + { + "epoch": 0.14, + "grad_norm": 0.154296875, + "learning_rate": 0.00019917079781001654, + "loss": 1.2137, + "step": 2500 + }, + { + "epoch": 0.14, + "grad_norm": 0.150390625, + "learning_rate": 0.00019915846682195884, + "loss": 1.1884, + "step": 2505 + }, + { + "epoch": 0.14, + "grad_norm": 0.1533203125, + "learning_rate": 0.0001991460452102547, + "loss": 1.1588, + "step": 2510 + }, + { + "epoch": 0.14, + "grad_norm": 0.154296875, + "learning_rate": 0.00019913353298625663, + "loss": 1.2153, + "step": 2515 + }, + { + "epoch": 0.14, + "grad_norm": 0.150390625, + "learning_rate": 0.00019912093016139987, + "loss": 1.1183, + "step": 2520 + }, + { + "epoch": 0.14, + "grad_norm": 0.1494140625, + "learning_rate": 0.0001991082367472025, + "loss": 1.129, + "step": 2525 + }, + { + "epoch": 0.14, + "grad_norm": 0.1533203125, + "learning_rate": 0.00019909545275526534, + "loss": 1.1752, + "step": 2530 + }, + { + "epoch": 0.14, + "grad_norm": 0.1513671875, + "learning_rate": 0.0001990825781972721, + "loss": 1.1123, + "step": 2535 + }, + { + "epoch": 0.14, + "grad_norm": 0.15234375, + "learning_rate": 0.00019906961308498912, + "loss": 1.1872, + "step": 2540 + }, + { + "epoch": 0.14, + "grad_norm": 0.15625, + "learning_rate": 0.0001990565574302656, + "loss": 1.2054, + "step": 2545 + }, + { + "epoch": 0.14, + "grad_norm": 0.158203125, + "learning_rate": 0.0001990434112450335, + "loss": 1.1857, + "step": 2550 + }, + { + "epoch": 0.14, + "grad_norm": 0.1513671875, + "learning_rate": 0.0001990301745413074, + "loss": 1.1533, + "step": 2555 + }, + { + "epoch": 0.14, + "grad_norm": 0.1533203125, + "learning_rate": 0.00019901684733118478, + "loss": 1.1818, + "step": 2560 + }, + { + "epoch": 0.14, + "grad_norm": 0.1513671875, + "learning_rate": 0.00019900342962684566, + "loss": 1.2414, + "step": 2565 + }, + { + "epoch": 0.14, + "grad_norm": 0.15234375, + "learning_rate": 0.00019898992144055293, + "loss": 1.173, + "step": 2570 + }, + { + "epoch": 0.14, + "grad_norm": 0.154296875, + "learning_rate": 0.000198976322784652, + "loss": 1.1529, + "step": 2575 + }, + { + "epoch": 0.14, + "grad_norm": 0.158203125, + "learning_rate": 0.00019896263367157112, + "loss": 1.2378, + "step": 2580 + }, + { + "epoch": 0.14, + "grad_norm": 0.15625, + "learning_rate": 0.00019894885411382113, + "loss": 1.1579, + "step": 2585 + }, + { + "epoch": 0.14, + "grad_norm": 0.150390625, + "learning_rate": 0.00019893498412399554, + "loss": 1.3038, + "step": 2590 + }, + { + "epoch": 0.14, + "grad_norm": 0.1513671875, + "learning_rate": 0.00019892102371477056, + "loss": 1.2808, + "step": 2595 + }, + { + "epoch": 0.14, + "grad_norm": 0.1572265625, + "learning_rate": 0.0001989069728989049, + "loss": 1.1197, + "step": 2600 + }, + { + "epoch": 0.14, + "grad_norm": 0.1494140625, + "learning_rate": 0.00019889283168924004, + "loss": 1.2699, + "step": 2605 + }, + { + "epoch": 0.14, + "grad_norm": 0.154296875, + "learning_rate": 0.00019887860009870002, + "loss": 1.1598, + "step": 2610 + }, + { + "epoch": 0.14, + "grad_norm": 0.146484375, + "learning_rate": 0.0001988642781402915, + "loss": 1.2068, + "step": 2615 + }, + { + "epoch": 0.14, + "grad_norm": 0.15625, + "learning_rate": 0.00019884986582710366, + "loss": 1.125, + "step": 2620 + }, + { + "epoch": 0.14, + "grad_norm": 0.154296875, + "learning_rate": 0.00019883536317230838, + "loss": 1.2651, + "step": 2625 + }, + { + "epoch": 0.14, + "grad_norm": 0.154296875, + "learning_rate": 0.00019882077018915998, + "loss": 1.1742, + "step": 2630 + }, + { + "epoch": 0.14, + "grad_norm": 0.1494140625, + "learning_rate": 0.00019880608689099537, + "loss": 1.1599, + "step": 2635 + }, + { + "epoch": 0.14, + "grad_norm": 0.1494140625, + "learning_rate": 0.0001987913132912341, + "loss": 1.1619, + "step": 2640 + }, + { + "epoch": 0.14, + "grad_norm": 0.15625, + "learning_rate": 0.0001987764494033781, + "loss": 1.2287, + "step": 2645 + }, + { + "epoch": 0.15, + "grad_norm": 0.1552734375, + "learning_rate": 0.0001987614952410119, + "loss": 1.1133, + "step": 2650 + }, + { + "epoch": 0.15, + "grad_norm": 0.1806640625, + "learning_rate": 0.00019874645081780252, + "loss": 1.2409, + "step": 2655 + }, + { + "epoch": 0.15, + "grad_norm": 0.158203125, + "learning_rate": 0.00019873131614749952, + "loss": 1.1873, + "step": 2660 + }, + { + "epoch": 0.15, + "grad_norm": 0.1474609375, + "learning_rate": 0.0001987160912439348, + "loss": 1.1572, + "step": 2665 + }, + { + "epoch": 0.15, + "grad_norm": 0.166015625, + "learning_rate": 0.00019870077612102287, + "loss": 1.2298, + "step": 2670 + }, + { + "epoch": 0.15, + "grad_norm": 0.150390625, + "learning_rate": 0.00019868537079276063, + "loss": 1.1669, + "step": 2675 + }, + { + "epoch": 0.15, + "grad_norm": 0.1474609375, + "learning_rate": 0.00019866987527322746, + "loss": 1.1517, + "step": 2680 + }, + { + "epoch": 0.15, + "grad_norm": 0.158203125, + "learning_rate": 0.0001986542895765851, + "loss": 1.2001, + "step": 2685 + }, + { + "epoch": 0.15, + "grad_norm": 0.158203125, + "learning_rate": 0.00019863861371707779, + "loss": 1.2124, + "step": 2690 + }, + { + "epoch": 0.15, + "grad_norm": 0.1591796875, + "learning_rate": 0.00019862284770903212, + "loss": 1.2066, + "step": 2695 + }, + { + "epoch": 0.15, + "grad_norm": 0.1591796875, + "learning_rate": 0.000198606991566857, + "loss": 1.1659, + "step": 2700 + }, + { + "epoch": 0.15, + "grad_norm": 0.158203125, + "learning_rate": 0.00019859104530504392, + "loss": 1.1927, + "step": 2705 + }, + { + "epoch": 0.15, + "grad_norm": 0.158203125, + "learning_rate": 0.00019857500893816652, + "loss": 1.1639, + "step": 2710 + }, + { + "epoch": 0.15, + "grad_norm": 0.1494140625, + "learning_rate": 0.00019855888248088087, + "loss": 1.1809, + "step": 2715 + }, + { + "epoch": 0.15, + "grad_norm": 0.1630859375, + "learning_rate": 0.00019854266594792548, + "loss": 1.1528, + "step": 2720 + }, + { + "epoch": 0.15, + "grad_norm": 0.1591796875, + "learning_rate": 0.00019852635935412095, + "loss": 1.2252, + "step": 2725 + }, + { + "epoch": 0.15, + "grad_norm": 0.1513671875, + "learning_rate": 0.00019850996271437044, + "loss": 1.1844, + "step": 2730 + }, + { + "epoch": 0.15, + "grad_norm": 0.1572265625, + "learning_rate": 0.0001984934760436592, + "loss": 1.2045, + "step": 2735 + }, + { + "epoch": 0.15, + "grad_norm": 0.1533203125, + "learning_rate": 0.00019847689935705492, + "loss": 1.2144, + "step": 2740 + }, + { + "epoch": 0.15, + "grad_norm": 0.1591796875, + "learning_rate": 0.00019846023266970747, + "loss": 1.2032, + "step": 2745 + }, + { + "epoch": 0.15, + "grad_norm": 0.1591796875, + "learning_rate": 0.00019844347599684895, + "loss": 1.224, + "step": 2750 + }, + { + "epoch": 0.15, + "grad_norm": 0.146484375, + "learning_rate": 0.0001984266293537938, + "loss": 1.1924, + "step": 2755 + }, + { + "epoch": 0.15, + "grad_norm": 0.1552734375, + "learning_rate": 0.00019840969275593858, + "loss": 1.1363, + "step": 2760 + }, + { + "epoch": 0.15, + "grad_norm": 0.1533203125, + "learning_rate": 0.00019839266621876216, + "loss": 1.2361, + "step": 2765 + }, + { + "epoch": 0.15, + "grad_norm": 0.1484375, + "learning_rate": 0.00019837554975782554, + "loss": 1.1856, + "step": 2770 + }, + { + "epoch": 0.15, + "grad_norm": 0.1513671875, + "learning_rate": 0.0001983583433887719, + "loss": 1.1942, + "step": 2775 + }, + { + "epoch": 0.15, + "grad_norm": 0.1494140625, + "learning_rate": 0.0001983410471273267, + "loss": 1.1392, + "step": 2780 + }, + { + "epoch": 0.15, + "grad_norm": 0.1494140625, + "learning_rate": 0.00019832366098929735, + "loss": 1.1921, + "step": 2785 + }, + { + "epoch": 0.15, + "grad_norm": 0.1513671875, + "learning_rate": 0.00019830618499057357, + "loss": 1.2808, + "step": 2790 + }, + { + "epoch": 0.15, + "grad_norm": 0.158203125, + "learning_rate": 0.00019828861914712715, + "loss": 1.1448, + "step": 2795 + }, + { + "epoch": 0.15, + "grad_norm": 0.1484375, + "learning_rate": 0.00019827096347501205, + "loss": 1.2194, + "step": 2800 + }, + { + "epoch": 0.15, + "grad_norm": 0.16796875, + "learning_rate": 0.0001982532179903642, + "loss": 1.1556, + "step": 2805 + }, + { + "epoch": 0.15, + "grad_norm": 0.158203125, + "learning_rate": 0.00019823538270940171, + "loss": 1.193, + "step": 2810 + }, + { + "epoch": 0.15, + "grad_norm": 0.154296875, + "learning_rate": 0.00019821745764842476, + "loss": 1.1494, + "step": 2815 + }, + { + "epoch": 0.15, + "grad_norm": 0.15625, + "learning_rate": 0.00019819944282381554, + "loss": 1.1407, + "step": 2820 + }, + { + "epoch": 0.15, + "grad_norm": 0.14453125, + "learning_rate": 0.0001981813382520383, + "loss": 1.248, + "step": 2825 + }, + { + "epoch": 0.16, + "grad_norm": 0.15625, + "learning_rate": 0.00019816314394963924, + "loss": 1.2004, + "step": 2830 + }, + { + "epoch": 0.16, + "grad_norm": 0.1513671875, + "learning_rate": 0.00019814485993324671, + "loss": 1.2354, + "step": 2835 + }, + { + "epoch": 0.16, + "grad_norm": 0.15625, + "learning_rate": 0.00019812648621957095, + "loss": 1.1944, + "step": 2840 + }, + { + "epoch": 0.16, + "grad_norm": 0.15234375, + "learning_rate": 0.0001981080228254042, + "loss": 1.2029, + "step": 2845 + }, + { + "epoch": 0.16, + "grad_norm": 0.1591796875, + "learning_rate": 0.00019808946976762067, + "loss": 1.2045, + "step": 2850 + }, + { + "epoch": 0.16, + "grad_norm": 0.1533203125, + "learning_rate": 0.0001980708270631765, + "loss": 1.2151, + "step": 2855 + }, + { + "epoch": 0.16, + "grad_norm": 0.15625, + "learning_rate": 0.0001980520947291098, + "loss": 1.1599, + "step": 2860 + }, + { + "epoch": 0.16, + "grad_norm": 0.16015625, + "learning_rate": 0.0001980332727825405, + "loss": 1.2136, + "step": 2865 + }, + { + "epoch": 0.16, + "grad_norm": 0.1474609375, + "learning_rate": 0.00019801436124067053, + "loss": 1.1651, + "step": 2870 + }, + { + "epoch": 0.16, + "grad_norm": 0.154296875, + "learning_rate": 0.0001979953601207837, + "loss": 1.2054, + "step": 2875 + }, + { + "epoch": 0.16, + "grad_norm": 0.1494140625, + "learning_rate": 0.0001979762694402456, + "loss": 1.2132, + "step": 2880 + }, + { + "epoch": 0.16, + "grad_norm": 0.1494140625, + "learning_rate": 0.00019795708921650378, + "loss": 1.1789, + "step": 2885 + }, + { + "epoch": 0.16, + "grad_norm": 0.15625, + "learning_rate": 0.0001979378194670875, + "loss": 1.1421, + "step": 2890 + }, + { + "epoch": 0.16, + "grad_norm": 0.154296875, + "learning_rate": 0.000197918460209608, + "loss": 1.2263, + "step": 2895 + }, + { + "epoch": 0.16, + "grad_norm": 0.15625, + "learning_rate": 0.0001978990114617582, + "loss": 1.208, + "step": 2900 + }, + { + "epoch": 0.16, + "grad_norm": 0.154296875, + "learning_rate": 0.00019787947324131282, + "loss": 1.1271, + "step": 2905 + }, + { + "epoch": 0.16, + "grad_norm": 0.158203125, + "learning_rate": 0.00019785984556612836, + "loss": 1.2911, + "step": 2910 + }, + { + "epoch": 0.16, + "grad_norm": 0.154296875, + "learning_rate": 0.0001978401284541432, + "loss": 1.2554, + "step": 2915 + }, + { + "epoch": 0.16, + "grad_norm": 0.1630859375, + "learning_rate": 0.00019782032192337725, + "loss": 1.1166, + "step": 2920 + }, + { + "epoch": 0.16, + "grad_norm": 0.15234375, + "learning_rate": 0.00019780042599193225, + "loss": 1.2657, + "step": 2925 + }, + { + "epoch": 0.16, + "grad_norm": 0.15625, + "learning_rate": 0.00019778044067799168, + "loss": 1.2139, + "step": 2930 + }, + { + "epoch": 0.16, + "grad_norm": 0.154296875, + "learning_rate": 0.00019776036599982065, + "loss": 1.2246, + "step": 2935 + }, + { + "epoch": 0.16, + "grad_norm": 0.15234375, + "learning_rate": 0.00019774020197576596, + "loss": 1.1838, + "step": 2940 + }, + { + "epoch": 0.16, + "grad_norm": 0.1533203125, + "learning_rate": 0.00019771994862425606, + "loss": 1.1485, + "step": 2945 + }, + { + "epoch": 0.16, + "grad_norm": 0.154296875, + "learning_rate": 0.00019769960596380105, + "loss": 1.198, + "step": 2950 + }, + { + "epoch": 0.16, + "grad_norm": 0.1552734375, + "learning_rate": 0.00019767917401299263, + "loss": 1.1564, + "step": 2955 + }, + { + "epoch": 0.16, + "grad_norm": 0.1572265625, + "learning_rate": 0.00019765865279050413, + "loss": 1.208, + "step": 2960 + }, + { + "epoch": 0.16, + "grad_norm": 0.169921875, + "learning_rate": 0.00019763804231509046, + "loss": 1.2053, + "step": 2965 + }, + { + "epoch": 0.16, + "grad_norm": 0.1552734375, + "learning_rate": 0.0001976173426055881, + "loss": 1.2324, + "step": 2970 + }, + { + "epoch": 0.16, + "grad_norm": 0.1572265625, + "learning_rate": 0.0001975965536809151, + "loss": 1.1634, + "step": 2975 + }, + { + "epoch": 0.16, + "grad_norm": 0.1572265625, + "learning_rate": 0.000197575675560071, + "loss": 1.1063, + "step": 2980 + }, + { + "epoch": 0.16, + "grad_norm": 0.1513671875, + "learning_rate": 0.00019755470826213694, + "loss": 1.1857, + "step": 2985 + }, + { + "epoch": 0.16, + "grad_norm": 0.16015625, + "learning_rate": 0.00019753365180627544, + "loss": 1.2372, + "step": 2990 + }, + { + "epoch": 0.16, + "grad_norm": 0.1591796875, + "learning_rate": 0.0001975125062117306, + "loss": 1.2264, + "step": 2995 + }, + { + "epoch": 0.16, + "grad_norm": 0.154296875, + "learning_rate": 0.000197491271497828, + "loss": 1.1289, + "step": 3000 + }, + { + "epoch": 0.16, + "grad_norm": 0.15234375, + "learning_rate": 0.00019746994768397456, + "loss": 1.2187, + "step": 3005 + }, + { + "epoch": 0.16, + "grad_norm": 0.166015625, + "learning_rate": 0.00019744853478965874, + "loss": 1.2276, + "step": 3010 + }, + { + "epoch": 0.17, + "grad_norm": 0.1572265625, + "learning_rate": 0.00019742703283445035, + "loss": 1.1815, + "step": 3015 + }, + { + "epoch": 0.17, + "grad_norm": 0.1494140625, + "learning_rate": 0.00019740544183800065, + "loss": 1.182, + "step": 3020 + }, + { + "epoch": 0.17, + "grad_norm": 0.158203125, + "learning_rate": 0.00019738376182004222, + "loss": 1.194, + "step": 3025 + }, + { + "epoch": 0.17, + "grad_norm": 0.154296875, + "learning_rate": 0.00019736199280038906, + "loss": 1.2095, + "step": 3030 + }, + { + "epoch": 0.17, + "grad_norm": 0.1611328125, + "learning_rate": 0.0001973401347989364, + "loss": 1.185, + "step": 3035 + }, + { + "epoch": 0.17, + "grad_norm": 0.16015625, + "learning_rate": 0.00019731818783566094, + "loss": 1.1828, + "step": 3040 + }, + { + "epoch": 0.17, + "grad_norm": 0.1572265625, + "learning_rate": 0.00019729615193062058, + "loss": 1.1592, + "step": 3045 + }, + { + "epoch": 0.17, + "grad_norm": 0.15625, + "learning_rate": 0.00019727402710395455, + "loss": 1.218, + "step": 3050 + }, + { + "epoch": 0.17, + "grad_norm": 0.1591796875, + "learning_rate": 0.00019725181337588331, + "loss": 1.1935, + "step": 3055 + }, + { + "epoch": 0.17, + "grad_norm": 0.15234375, + "learning_rate": 0.00019722951076670862, + "loss": 1.2958, + "step": 3060 + }, + { + "epoch": 0.17, + "grad_norm": 0.1552734375, + "learning_rate": 0.00019720711929681347, + "loss": 1.2223, + "step": 3065 + }, + { + "epoch": 0.17, + "grad_norm": 0.154296875, + "learning_rate": 0.00019718463898666202, + "loss": 1.2116, + "step": 3070 + }, + { + "epoch": 0.17, + "grad_norm": 0.1650390625, + "learning_rate": 0.00019716206985679962, + "loss": 1.2521, + "step": 3075 + }, + { + "epoch": 0.17, + "grad_norm": 0.162109375, + "learning_rate": 0.00019713941192785284, + "loss": 1.277, + "step": 3080 + }, + { + "epoch": 0.17, + "grad_norm": 0.15625, + "learning_rate": 0.00019711666522052935, + "loss": 1.1773, + "step": 3085 + }, + { + "epoch": 0.17, + "grad_norm": 0.1494140625, + "learning_rate": 0.00019709382975561805, + "loss": 1.1902, + "step": 3090 + }, + { + "epoch": 0.17, + "grad_norm": 0.15625, + "learning_rate": 0.00019707090555398887, + "loss": 1.2634, + "step": 3095 + }, + { + "epoch": 0.17, + "grad_norm": 0.1552734375, + "learning_rate": 0.00019704789263659287, + "loss": 1.2584, + "step": 3100 + }, + { + "epoch": 0.17, + "grad_norm": 0.1552734375, + "learning_rate": 0.00019702479102446218, + "loss": 1.1253, + "step": 3105 + }, + { + "epoch": 0.17, + "grad_norm": 0.1572265625, + "learning_rate": 0.00019700160073870998, + "loss": 1.2754, + "step": 3110 + }, + { + "epoch": 0.17, + "grad_norm": 0.15625, + "learning_rate": 0.00019697832180053056, + "loss": 1.176, + "step": 3115 + }, + { + "epoch": 0.17, + "grad_norm": 0.1533203125, + "learning_rate": 0.00019695495423119913, + "loss": 1.1775, + "step": 3120 + }, + { + "epoch": 0.17, + "grad_norm": 0.162109375, + "learning_rate": 0.00019693149805207196, + "loss": 1.2001, + "step": 3125 + }, + { + "epoch": 0.17, + "grad_norm": 0.1513671875, + "learning_rate": 0.00019690795328458632, + "loss": 1.1762, + "step": 3130 + }, + { + "epoch": 0.17, + "grad_norm": 0.1552734375, + "learning_rate": 0.00019688431995026044, + "loss": 1.2328, + "step": 3135 + }, + { + "epoch": 0.17, + "grad_norm": 0.1552734375, + "learning_rate": 0.0001968605980706934, + "loss": 1.2827, + "step": 3140 + }, + { + "epoch": 0.17, + "grad_norm": 0.15234375, + "learning_rate": 0.00019683678766756538, + "loss": 1.1568, + "step": 3145 + }, + { + "epoch": 0.17, + "grad_norm": 0.154296875, + "learning_rate": 0.00019681288876263723, + "loss": 1.1664, + "step": 3150 + }, + { + "epoch": 0.17, + "grad_norm": 0.1533203125, + "learning_rate": 0.00019678890137775094, + "loss": 1.22, + "step": 3155 + }, + { + "epoch": 0.17, + "grad_norm": 0.1572265625, + "learning_rate": 0.00019676482553482915, + "loss": 1.2122, + "step": 3160 + }, + { + "epoch": 0.17, + "grad_norm": 0.1494140625, + "learning_rate": 0.00019674066125587549, + "loss": 1.2239, + "step": 3165 + }, + { + "epoch": 0.17, + "grad_norm": 0.15234375, + "learning_rate": 0.00019671640856297435, + "loss": 1.1882, + "step": 3170 + }, + { + "epoch": 0.17, + "grad_norm": 0.16015625, + "learning_rate": 0.00019669206747829093, + "loss": 1.2303, + "step": 3175 + }, + { + "epoch": 0.17, + "grad_norm": 0.150390625, + "learning_rate": 0.0001966676380240712, + "loss": 1.246, + "step": 3180 + }, + { + "epoch": 0.17, + "grad_norm": 0.154296875, + "learning_rate": 0.00019664312022264188, + "loss": 1.2049, + "step": 3185 + }, + { + "epoch": 0.17, + "grad_norm": 0.1689453125, + "learning_rate": 0.00019661851409641054, + "loss": 1.2294, + "step": 3190 + }, + { + "epoch": 0.18, + "grad_norm": 0.1591796875, + "learning_rate": 0.00019659381966786535, + "loss": 1.1979, + "step": 3195 + }, + { + "epoch": 0.18, + "grad_norm": 0.1455078125, + "learning_rate": 0.00019656903695957522, + "loss": 1.1838, + "step": 3200 + }, + { + "epoch": 0.18, + "grad_norm": 0.15625, + "learning_rate": 0.00019654416599418977, + "loss": 1.1748, + "step": 3205 + }, + { + "epoch": 0.18, + "grad_norm": 0.1513671875, + "learning_rate": 0.00019651920679443925, + "loss": 1.204, + "step": 3210 + }, + { + "epoch": 0.18, + "grad_norm": 0.15234375, + "learning_rate": 0.00019649415938313453, + "loss": 1.1967, + "step": 3215 + }, + { + "epoch": 0.18, + "grad_norm": 0.1572265625, + "learning_rate": 0.00019646902378316717, + "loss": 1.1838, + "step": 3220 + }, + { + "epoch": 0.18, + "grad_norm": 0.154296875, + "learning_rate": 0.00019644380001750927, + "loss": 1.1577, + "step": 3225 + }, + { + "epoch": 0.18, + "grad_norm": 0.1572265625, + "learning_rate": 0.00019641848810921352, + "loss": 1.1423, + "step": 3230 + }, + { + "epoch": 0.18, + "grad_norm": 0.1533203125, + "learning_rate": 0.00019639308808141318, + "loss": 1.1374, + "step": 3235 + }, + { + "epoch": 0.18, + "grad_norm": 0.15625, + "learning_rate": 0.000196367599957322, + "loss": 1.1429, + "step": 3240 + }, + { + "epoch": 0.18, + "grad_norm": 0.1494140625, + "learning_rate": 0.0001963420237602343, + "loss": 1.2008, + "step": 3245 + }, + { + "epoch": 0.18, + "grad_norm": 0.15234375, + "learning_rate": 0.00019631635951352486, + "loss": 1.2106, + "step": 3250 + }, + { + "epoch": 0.18, + "grad_norm": 0.1669921875, + "learning_rate": 0.00019629060724064897, + "loss": 1.2584, + "step": 3255 + }, + { + "epoch": 0.18, + "grad_norm": 0.1640625, + "learning_rate": 0.00019626476696514228, + "loss": 1.1772, + "step": 3260 + }, + { + "epoch": 0.18, + "grad_norm": 0.1513671875, + "learning_rate": 0.000196238838710621, + "loss": 1.1017, + "step": 3265 + }, + { + "epoch": 0.18, + "grad_norm": 0.158203125, + "learning_rate": 0.00019621282250078165, + "loss": 1.2701, + "step": 3270 + }, + { + "epoch": 0.18, + "grad_norm": 0.154296875, + "learning_rate": 0.0001961867183594011, + "loss": 1.1424, + "step": 3275 + }, + { + "epoch": 0.18, + "grad_norm": 0.158203125, + "learning_rate": 0.00019616052631033672, + "loss": 1.1661, + "step": 3280 + }, + { + "epoch": 0.18, + "grad_norm": 0.16015625, + "learning_rate": 0.00019613424637752612, + "loss": 1.1719, + "step": 3285 + }, + { + "epoch": 0.18, + "grad_norm": 0.146484375, + "learning_rate": 0.0001961078785849872, + "loss": 1.1828, + "step": 3290 + }, + { + "epoch": 0.18, + "grad_norm": 0.1533203125, + "learning_rate": 0.00019608142295681827, + "loss": 1.1878, + "step": 3295 + }, + { + "epoch": 0.18, + "grad_norm": 0.1533203125, + "learning_rate": 0.00019605487951719787, + "loss": 1.1571, + "step": 3300 + }, + { + "epoch": 0.18, + "grad_norm": 0.154296875, + "learning_rate": 0.00019602824829038473, + "loss": 1.1789, + "step": 3305 + }, + { + "epoch": 0.18, + "grad_norm": 0.1513671875, + "learning_rate": 0.00019600152930071788, + "loss": 1.1424, + "step": 3310 + }, + { + "epoch": 0.18, + "grad_norm": 0.158203125, + "learning_rate": 0.0001959747225726165, + "loss": 1.1929, + "step": 3315 + }, + { + "epoch": 0.18, + "grad_norm": 0.154296875, + "learning_rate": 0.0001959478281305801, + "loss": 1.2009, + "step": 3320 + }, + { + "epoch": 0.18, + "grad_norm": 0.1591796875, + "learning_rate": 0.00019592084599918814, + "loss": 1.1991, + "step": 3325 + }, + { + "epoch": 0.18, + "grad_norm": 0.15625, + "learning_rate": 0.00019589377620310035, + "loss": 1.141, + "step": 3330 + }, + { + "epoch": 0.18, + "grad_norm": 0.1572265625, + "learning_rate": 0.0001958666187670566, + "loss": 1.1432, + "step": 3335 + }, + { + "epoch": 0.18, + "grad_norm": 0.1513671875, + "learning_rate": 0.00019583937371587683, + "loss": 1.1486, + "step": 3340 + }, + { + "epoch": 0.18, + "grad_norm": 0.1630859375, + "learning_rate": 0.00019581204107446096, + "loss": 1.2263, + "step": 3345 + }, + { + "epoch": 0.18, + "grad_norm": 0.154296875, + "learning_rate": 0.0001957846208677891, + "loss": 1.2542, + "step": 3350 + }, + { + "epoch": 0.18, + "grad_norm": 0.15234375, + "learning_rate": 0.00019575711312092137, + "loss": 1.1678, + "step": 3355 + }, + { + "epoch": 0.18, + "grad_norm": 0.158203125, + "learning_rate": 0.00019572951785899775, + "loss": 1.1939, + "step": 3360 + }, + { + "epoch": 0.18, + "grad_norm": 0.15625, + "learning_rate": 0.00019570183510723838, + "loss": 1.1907, + "step": 3365 + }, + { + "epoch": 0.18, + "grad_norm": 0.1533203125, + "learning_rate": 0.00019567406489094327, + "loss": 1.1974, + "step": 3370 + }, + { + "epoch": 0.18, + "grad_norm": 0.154296875, + "learning_rate": 0.0001956462072354924, + "loss": 1.2286, + "step": 3375 + }, + { + "epoch": 0.19, + "grad_norm": 0.15625, + "learning_rate": 0.0001956182621663456, + "loss": 1.1975, + "step": 3380 + }, + { + "epoch": 0.19, + "grad_norm": 0.1552734375, + "learning_rate": 0.00019559022970904268, + "loss": 1.1666, + "step": 3385 + }, + { + "epoch": 0.19, + "grad_norm": 0.16015625, + "learning_rate": 0.00019556210988920327, + "loss": 1.2401, + "step": 3390 + }, + { + "epoch": 0.19, + "grad_norm": 0.158203125, + "learning_rate": 0.00019553390273252682, + "loss": 1.2396, + "step": 3395 + }, + { + "epoch": 0.19, + "grad_norm": 0.1669921875, + "learning_rate": 0.00019550560826479268, + "loss": 1.2139, + "step": 3400 + }, + { + "epoch": 0.19, + "grad_norm": 0.1396484375, + "learning_rate": 0.00019547722651185986, + "loss": 1.1671, + "step": 3405 + }, + { + "epoch": 0.19, + "grad_norm": 0.1630859375, + "learning_rate": 0.00019544875749966727, + "loss": 1.2247, + "step": 3410 + }, + { + "epoch": 0.19, + "grad_norm": 0.16015625, + "learning_rate": 0.00019542020125423352, + "loss": 1.1916, + "step": 3415 + }, + { + "epoch": 0.19, + "grad_norm": 0.1552734375, + "learning_rate": 0.00019539155780165697, + "loss": 1.1933, + "step": 3420 + }, + { + "epoch": 0.19, + "grad_norm": 0.1474609375, + "learning_rate": 0.0001953628271681156, + "loss": 1.1717, + "step": 3425 + }, + { + "epoch": 0.19, + "grad_norm": 0.1513671875, + "learning_rate": 0.00019533400937986716, + "loss": 1.1789, + "step": 3430 + }, + { + "epoch": 0.19, + "grad_norm": 0.158203125, + "learning_rate": 0.00019530510446324903, + "loss": 1.2876, + "step": 3435 + }, + { + "epoch": 0.19, + "grad_norm": 0.15625, + "learning_rate": 0.0001952761124446782, + "loss": 1.242, + "step": 3440 + }, + { + "epoch": 0.19, + "grad_norm": 0.1640625, + "learning_rate": 0.00019524703335065124, + "loss": 1.1854, + "step": 3445 + }, + { + "epoch": 0.19, + "grad_norm": 0.154296875, + "learning_rate": 0.0001952178672077444, + "loss": 1.278, + "step": 3450 + }, + { + "epoch": 0.19, + "grad_norm": 0.1611328125, + "learning_rate": 0.00019518861404261336, + "loss": 1.2237, + "step": 3455 + }, + { + "epoch": 0.19, + "grad_norm": 0.154296875, + "learning_rate": 0.00019515927388199337, + "loss": 1.1598, + "step": 3460 + }, + { + "epoch": 0.19, + "grad_norm": 0.1611328125, + "learning_rate": 0.00019512984675269928, + "loss": 1.2386, + "step": 3465 + }, + { + "epoch": 0.19, + "grad_norm": 0.158203125, + "learning_rate": 0.00019510033268162533, + "loss": 1.188, + "step": 3470 + }, + { + "epoch": 0.19, + "grad_norm": 0.166015625, + "learning_rate": 0.00019507073169574519, + "loss": 1.1833, + "step": 3475 + }, + { + "epoch": 0.19, + "grad_norm": 0.158203125, + "learning_rate": 0.0001950410438221121, + "loss": 1.1666, + "step": 3480 + }, + { + "epoch": 0.19, + "grad_norm": 0.1591796875, + "learning_rate": 0.00019501126908785853, + "loss": 1.1627, + "step": 3485 + }, + { + "epoch": 0.19, + "grad_norm": 0.16015625, + "learning_rate": 0.0001949814075201965, + "loss": 1.2412, + "step": 3490 + }, + { + "epoch": 0.19, + "grad_norm": 0.1572265625, + "learning_rate": 0.0001949514591464173, + "loss": 1.1465, + "step": 3495 + }, + { + "epoch": 0.19, + "grad_norm": 0.154296875, + "learning_rate": 0.00019492142399389154, + "loss": 1.1897, + "step": 3500 + }, + { + "epoch": 0.19, + "grad_norm": 0.1484375, + "learning_rate": 0.0001948913020900692, + "loss": 1.2094, + "step": 3505 + }, + { + "epoch": 0.19, + "grad_norm": 0.1533203125, + "learning_rate": 0.00019486109346247954, + "loss": 1.1988, + "step": 3510 + }, + { + "epoch": 0.19, + "grad_norm": 0.150390625, + "learning_rate": 0.000194830798138731, + "loss": 1.1563, + "step": 3515 + }, + { + "epoch": 0.19, + "grad_norm": 0.1533203125, + "learning_rate": 0.0001948004161465113, + "loss": 1.1494, + "step": 3520 + }, + { + "epoch": 0.19, + "grad_norm": 0.1474609375, + "learning_rate": 0.00019476994751358744, + "loss": 1.1626, + "step": 3525 + }, + { + "epoch": 0.19, + "grad_norm": 0.150390625, + "learning_rate": 0.00019473939226780548, + "loss": 1.1899, + "step": 3530 + }, + { + "epoch": 0.19, + "grad_norm": 0.1484375, + "learning_rate": 0.00019470875043709076, + "loss": 1.1181, + "step": 3535 + }, + { + "epoch": 0.19, + "grad_norm": 0.1611328125, + "learning_rate": 0.00019467802204944766, + "loss": 1.2443, + "step": 3540 + }, + { + "epoch": 0.19, + "grad_norm": 0.16015625, + "learning_rate": 0.0001946472071329597, + "loss": 1.1897, + "step": 3545 + }, + { + "epoch": 0.19, + "grad_norm": 0.1572265625, + "learning_rate": 0.00019461630571578943, + "loss": 1.1916, + "step": 3550 + }, + { + "epoch": 0.19, + "grad_norm": 0.1611328125, + "learning_rate": 0.0001945853178261786, + "loss": 1.2369, + "step": 3555 + }, + { + "epoch": 0.19, + "grad_norm": 0.15234375, + "learning_rate": 0.0001945542434924478, + "loss": 1.1876, + "step": 3560 + }, + { + "epoch": 0.2, + "grad_norm": 0.166015625, + "learning_rate": 0.00019452308274299677, + "loss": 1.2302, + "step": 3565 + }, + { + "epoch": 0.2, + "grad_norm": 0.162109375, + "learning_rate": 0.00019449183560630418, + "loss": 1.1644, + "step": 3570 + }, + { + "epoch": 0.2, + "grad_norm": 0.1591796875, + "learning_rate": 0.0001944605021109276, + "loss": 1.2396, + "step": 3575 + }, + { + "epoch": 0.2, + "grad_norm": 0.15234375, + "learning_rate": 0.00019442908228550364, + "loss": 1.2611, + "step": 3580 + }, + { + "epoch": 0.2, + "grad_norm": 0.1630859375, + "learning_rate": 0.00019439757615874768, + "loss": 1.1609, + "step": 3585 + }, + { + "epoch": 0.2, + "grad_norm": 0.154296875, + "learning_rate": 0.00019436598375945405, + "loss": 1.2111, + "step": 3590 + }, + { + "epoch": 0.2, + "grad_norm": 0.1484375, + "learning_rate": 0.00019433430511649593, + "loss": 1.1857, + "step": 3595 + }, + { + "epoch": 0.2, + "grad_norm": 0.15234375, + "learning_rate": 0.00019430254025882532, + "loss": 1.1455, + "step": 3600 + }, + { + "epoch": 0.2, + "grad_norm": 0.150390625, + "learning_rate": 0.0001942706892154729, + "loss": 1.1492, + "step": 3605 + }, + { + "epoch": 0.2, + "grad_norm": 0.16015625, + "learning_rate": 0.00019423875201554834, + "loss": 1.1672, + "step": 3610 + }, + { + "epoch": 0.2, + "grad_norm": 0.15625, + "learning_rate": 0.00019420672868823984, + "loss": 1.2263, + "step": 3615 + }, + { + "epoch": 0.2, + "grad_norm": 0.16015625, + "learning_rate": 0.00019417461926281442, + "loss": 1.1972, + "step": 3620 + }, + { + "epoch": 0.2, + "grad_norm": 0.1591796875, + "learning_rate": 0.00019414242376861778, + "loss": 1.2509, + "step": 3625 + }, + { + "epoch": 0.2, + "grad_norm": 0.158203125, + "learning_rate": 0.00019411014223507425, + "loss": 1.1969, + "step": 3630 + }, + { + "epoch": 0.2, + "grad_norm": 0.1416015625, + "learning_rate": 0.00019407777469168682, + "loss": 1.1543, + "step": 3635 + }, + { + "epoch": 0.2, + "grad_norm": 0.1572265625, + "learning_rate": 0.00019404532116803707, + "loss": 1.2066, + "step": 3640 + }, + { + "epoch": 0.2, + "grad_norm": 0.16015625, + "learning_rate": 0.00019401278169378517, + "loss": 1.0788, + "step": 3645 + }, + { + "epoch": 0.2, + "grad_norm": 0.1630859375, + "learning_rate": 0.00019398015629866984, + "loss": 1.1937, + "step": 3650 + }, + { + "epoch": 0.2, + "grad_norm": 0.150390625, + "learning_rate": 0.00019394744501250832, + "loss": 1.1784, + "step": 3655 + }, + { + "epoch": 0.2, + "grad_norm": 0.1533203125, + "learning_rate": 0.00019391464786519636, + "loss": 1.144, + "step": 3660 + }, + { + "epoch": 0.2, + "grad_norm": 0.154296875, + "learning_rate": 0.00019388176488670814, + "loss": 1.1592, + "step": 3665 + }, + { + "epoch": 0.2, + "grad_norm": 0.1572265625, + "learning_rate": 0.0001938487961070964, + "loss": 1.2506, + "step": 3670 + }, + { + "epoch": 0.2, + "grad_norm": 0.1552734375, + "learning_rate": 0.00019381574155649212, + "loss": 1.2269, + "step": 3675 + }, + { + "epoch": 0.2, + "grad_norm": 0.15625, + "learning_rate": 0.00019378260126510483, + "loss": 1.2301, + "step": 3680 + }, + { + "epoch": 0.2, + "grad_norm": 0.1611328125, + "learning_rate": 0.0001937493752632223, + "loss": 1.1464, + "step": 3685 + }, + { + "epoch": 0.2, + "grad_norm": 0.1650390625, + "learning_rate": 0.00019371606358121072, + "loss": 1.2216, + "step": 3690 + }, + { + "epoch": 0.2, + "grad_norm": 0.15625, + "learning_rate": 0.00019368266624951455, + "loss": 1.185, + "step": 3695 + }, + { + "epoch": 0.2, + "grad_norm": 0.158203125, + "learning_rate": 0.0001936491832986565, + "loss": 1.1557, + "step": 3700 + }, + { + "epoch": 0.2, + "grad_norm": 0.15625, + "learning_rate": 0.00019361561475923758, + "loss": 1.1541, + "step": 3705 + }, + { + "epoch": 0.2, + "grad_norm": 0.1513671875, + "learning_rate": 0.00019358196066193703, + "loss": 1.1495, + "step": 3710 + }, + { + "epoch": 0.2, + "grad_norm": 0.154296875, + "learning_rate": 0.0001935482210375122, + "loss": 1.1735, + "step": 3715 + }, + { + "epoch": 0.2, + "grad_norm": 0.1796875, + "learning_rate": 0.00019351439591679869, + "loss": 1.255, + "step": 3720 + }, + { + "epoch": 0.2, + "grad_norm": 0.1572265625, + "learning_rate": 0.0001934804853307102, + "loss": 1.2078, + "step": 3725 + }, + { + "epoch": 0.2, + "grad_norm": 0.1552734375, + "learning_rate": 0.00019344648931023852, + "loss": 1.0816, + "step": 3730 + }, + { + "epoch": 0.2, + "grad_norm": 0.1494140625, + "learning_rate": 0.0001934124078864536, + "loss": 1.1158, + "step": 3735 + }, + { + "epoch": 0.2, + "grad_norm": 0.150390625, + "learning_rate": 0.00019337824109050335, + "loss": 1.2213, + "step": 3740 + }, + { + "epoch": 0.21, + "grad_norm": 0.154296875, + "learning_rate": 0.00019334398895361375, + "loss": 1.2004, + "step": 3745 + }, + { + "epoch": 0.21, + "grad_norm": 0.154296875, + "learning_rate": 0.00019330965150708883, + "loss": 1.2926, + "step": 3750 + }, + { + "epoch": 0.21, + "grad_norm": 0.154296875, + "learning_rate": 0.0001932752287823104, + "loss": 1.1325, + "step": 3755 + }, + { + "epoch": 0.21, + "grad_norm": 0.193359375, + "learning_rate": 0.00019324072081073842, + "loss": 1.205, + "step": 3760 + }, + { + "epoch": 0.21, + "grad_norm": 0.1572265625, + "learning_rate": 0.00019320612762391067, + "loss": 1.183, + "step": 3765 + }, + { + "epoch": 0.21, + "grad_norm": 0.15625, + "learning_rate": 0.00019317144925344278, + "loss": 1.1753, + "step": 3770 + }, + { + "epoch": 0.21, + "grad_norm": 0.1591796875, + "learning_rate": 0.0001931366857310283, + "loss": 1.0924, + "step": 3775 + }, + { + "epoch": 0.21, + "grad_norm": 0.162109375, + "learning_rate": 0.00019310183708843854, + "loss": 1.211, + "step": 3780 + }, + { + "epoch": 0.21, + "grad_norm": 0.1552734375, + "learning_rate": 0.00019306690335752264, + "loss": 1.1834, + "step": 3785 + }, + { + "epoch": 0.21, + "grad_norm": 0.1552734375, + "learning_rate": 0.00019303188457020753, + "loss": 1.1375, + "step": 3790 + }, + { + "epoch": 0.21, + "grad_norm": 0.15625, + "learning_rate": 0.00019299678075849778, + "loss": 1.1875, + "step": 3795 + }, + { + "epoch": 0.21, + "grad_norm": 0.1552734375, + "learning_rate": 0.0001929615919544758, + "loss": 1.1639, + "step": 3800 + }, + { + "epoch": 0.21, + "grad_norm": 0.1611328125, + "learning_rate": 0.0001929263181903015, + "loss": 1.2174, + "step": 3805 + }, + { + "epoch": 0.21, + "grad_norm": 0.158203125, + "learning_rate": 0.00019289095949821266, + "loss": 1.1335, + "step": 3810 + }, + { + "epoch": 0.21, + "grad_norm": 0.16015625, + "learning_rate": 0.00019285551591052447, + "loss": 1.1638, + "step": 3815 + }, + { + "epoch": 0.21, + "grad_norm": 0.158203125, + "learning_rate": 0.00019281998745962988, + "loss": 1.2058, + "step": 3820 + }, + { + "epoch": 0.21, + "grad_norm": 0.1611328125, + "learning_rate": 0.00019278437417799923, + "loss": 1.1903, + "step": 3825 + }, + { + "epoch": 0.21, + "grad_norm": 0.1611328125, + "learning_rate": 0.00019274867609818052, + "loss": 1.2149, + "step": 3830 + }, + { + "epoch": 0.21, + "grad_norm": 0.166015625, + "learning_rate": 0.00019271289325279925, + "loss": 1.2248, + "step": 3835 + }, + { + "epoch": 0.21, + "grad_norm": 0.15234375, + "learning_rate": 0.00019267702567455825, + "loss": 1.1595, + "step": 3840 + }, + { + "epoch": 0.21, + "grad_norm": 0.1494140625, + "learning_rate": 0.00019264107339623797, + "loss": 1.0764, + "step": 3845 + }, + { + "epoch": 0.21, + "grad_norm": 0.1572265625, + "learning_rate": 0.0001926050364506961, + "loss": 1.1806, + "step": 3850 + }, + { + "epoch": 0.21, + "grad_norm": 0.158203125, + "learning_rate": 0.00019256891487086787, + "loss": 1.1941, + "step": 3855 + }, + { + "epoch": 0.21, + "grad_norm": 0.1611328125, + "learning_rate": 0.00019253270868976573, + "loss": 1.2064, + "step": 3860 + }, + { + "epoch": 0.21, + "grad_norm": 0.166015625, + "learning_rate": 0.00019249641794047957, + "loss": 1.2227, + "step": 3865 + }, + { + "epoch": 0.21, + "grad_norm": 0.162109375, + "learning_rate": 0.00019246004265617645, + "loss": 1.2342, + "step": 3870 + }, + { + "epoch": 0.21, + "grad_norm": 0.154296875, + "learning_rate": 0.00019242358287010076, + "loss": 1.1305, + "step": 3875 + }, + { + "epoch": 0.21, + "grad_norm": 0.1640625, + "learning_rate": 0.0001923870386155741, + "loss": 1.1803, + "step": 3880 + }, + { + "epoch": 0.21, + "grad_norm": 0.150390625, + "learning_rate": 0.00019235040992599523, + "loss": 1.2234, + "step": 3885 + }, + { + "epoch": 0.21, + "grad_norm": 0.1611328125, + "learning_rate": 0.00019231369683484018, + "loss": 1.192, + "step": 3890 + }, + { + "epoch": 0.21, + "grad_norm": 0.15625, + "learning_rate": 0.000192276899375662, + "loss": 1.2201, + "step": 3895 + }, + { + "epoch": 0.21, + "grad_norm": 0.1572265625, + "learning_rate": 0.0001922400175820909, + "loss": 1.1795, + "step": 3900 + }, + { + "epoch": 0.21, + "grad_norm": 0.1513671875, + "learning_rate": 0.0001922030514878342, + "loss": 1.1543, + "step": 3905 + }, + { + "epoch": 0.21, + "grad_norm": 0.1650390625, + "learning_rate": 0.00019216600112667624, + "loss": 1.2045, + "step": 3910 + }, + { + "epoch": 0.21, + "grad_norm": 0.150390625, + "learning_rate": 0.0001921288665324783, + "loss": 1.1899, + "step": 3915 + }, + { + "epoch": 0.21, + "grad_norm": 0.150390625, + "learning_rate": 0.00019209164773917876, + "loss": 1.1393, + "step": 3920 + }, + { + "epoch": 0.21, + "grad_norm": 0.158203125, + "learning_rate": 0.00019205434478079288, + "loss": 1.1836, + "step": 3925 + }, + { + "epoch": 0.22, + "grad_norm": 0.1552734375, + "learning_rate": 0.00019201695769141284, + "loss": 1.2185, + "step": 3930 + }, + { + "epoch": 0.22, + "grad_norm": 0.1630859375, + "learning_rate": 0.00019197948650520776, + "loss": 1.1442, + "step": 3935 + }, + { + "epoch": 0.22, + "grad_norm": 0.162109375, + "learning_rate": 0.00019194193125642354, + "loss": 1.1998, + "step": 3940 + }, + { + "epoch": 0.22, + "grad_norm": 0.1591796875, + "learning_rate": 0.000191904291979383, + "loss": 1.1879, + "step": 3945 + }, + { + "epoch": 0.22, + "grad_norm": 0.150390625, + "learning_rate": 0.0001918665687084857, + "loss": 1.1614, + "step": 3950 + }, + { + "epoch": 0.22, + "grad_norm": 0.154296875, + "learning_rate": 0.00019182876147820794, + "loss": 1.2431, + "step": 3955 + }, + { + "epoch": 0.22, + "grad_norm": 0.1484375, + "learning_rate": 0.00019179087032310284, + "loss": 1.2027, + "step": 3960 + }, + { + "epoch": 0.22, + "grad_norm": 0.1611328125, + "learning_rate": 0.00019175289527780014, + "loss": 1.1537, + "step": 3965 + }, + { + "epoch": 0.22, + "grad_norm": 0.1728515625, + "learning_rate": 0.00019171483637700626, + "loss": 1.1182, + "step": 3970 + }, + { + "epoch": 0.22, + "grad_norm": 0.154296875, + "learning_rate": 0.0001916766936555043, + "loss": 1.1563, + "step": 3975 + }, + { + "epoch": 0.22, + "grad_norm": 0.1640625, + "learning_rate": 0.00019163846714815392, + "loss": 1.1452, + "step": 3980 + }, + { + "epoch": 0.22, + "grad_norm": 0.1552734375, + "learning_rate": 0.0001916001568898914, + "loss": 1.17, + "step": 3985 + }, + { + "epoch": 0.22, + "grad_norm": 0.15625, + "learning_rate": 0.00019156176291572954, + "loss": 1.2051, + "step": 3990 + }, + { + "epoch": 0.22, + "grad_norm": 0.1513671875, + "learning_rate": 0.0001915232852607576, + "loss": 1.1872, + "step": 3995 + }, + { + "epoch": 0.22, + "grad_norm": 0.158203125, + "learning_rate": 0.0001914847239601414, + "loss": 1.1855, + "step": 4000 + }, + { + "epoch": 0.22, + "grad_norm": 0.154296875, + "learning_rate": 0.00019144607904912316, + "loss": 1.1743, + "step": 4005 + }, + { + "epoch": 0.22, + "grad_norm": 0.1591796875, + "learning_rate": 0.00019140735056302158, + "loss": 1.1908, + "step": 4010 + }, + { + "epoch": 0.22, + "grad_norm": 0.1572265625, + "learning_rate": 0.0001913685385372316, + "loss": 1.2331, + "step": 4015 + }, + { + "epoch": 0.22, + "grad_norm": 0.1552734375, + "learning_rate": 0.00019132964300722468, + "loss": 1.1411, + "step": 4020 + }, + { + "epoch": 0.22, + "grad_norm": 0.162109375, + "learning_rate": 0.00019129066400854842, + "loss": 1.1171, + "step": 4025 + }, + { + "epoch": 0.22, + "grad_norm": 0.1591796875, + "learning_rate": 0.00019125160157682688, + "loss": 1.1484, + "step": 4030 + }, + { + "epoch": 0.22, + "grad_norm": 0.1611328125, + "learning_rate": 0.00019121245574776026, + "loss": 1.1838, + "step": 4035 + }, + { + "epoch": 0.22, + "grad_norm": 0.1552734375, + "learning_rate": 0.000191173226557125, + "loss": 1.1931, + "step": 4040 + }, + { + "epoch": 0.22, + "grad_norm": 0.16015625, + "learning_rate": 0.00019113391404077375, + "loss": 1.1914, + "step": 4045 + }, + { + "epoch": 0.22, + "grad_norm": 0.162109375, + "learning_rate": 0.00019109451823463527, + "loss": 1.1788, + "step": 4050 + }, + { + "epoch": 0.22, + "grad_norm": 0.1552734375, + "learning_rate": 0.00019105503917471453, + "loss": 1.1897, + "step": 4055 + }, + { + "epoch": 0.22, + "grad_norm": 0.1591796875, + "learning_rate": 0.00019101547689709243, + "loss": 1.1927, + "step": 4060 + }, + { + "epoch": 0.22, + "grad_norm": 0.15625, + "learning_rate": 0.0001909758314379261, + "loss": 1.2086, + "step": 4065 + }, + { + "epoch": 0.22, + "grad_norm": 0.150390625, + "learning_rate": 0.0001909361028334486, + "loss": 1.2443, + "step": 4070 + }, + { + "epoch": 0.22, + "grad_norm": 0.181640625, + "learning_rate": 0.00019089629111996898, + "loss": 1.2293, + "step": 4075 + }, + { + "epoch": 0.22, + "grad_norm": 0.162109375, + "learning_rate": 0.0001908563963338723, + "loss": 1.2378, + "step": 4080 + }, + { + "epoch": 0.22, + "grad_norm": 0.15234375, + "learning_rate": 0.00019081641851161941, + "loss": 1.1847, + "step": 4085 + }, + { + "epoch": 0.22, + "grad_norm": 0.1591796875, + "learning_rate": 0.00019077635768974724, + "loss": 1.2248, + "step": 4090 + }, + { + "epoch": 0.22, + "grad_norm": 0.1650390625, + "learning_rate": 0.00019073621390486843, + "loss": 1.1274, + "step": 4095 + }, + { + "epoch": 0.22, + "grad_norm": 0.1572265625, + "learning_rate": 0.00019069598719367146, + "loss": 1.2443, + "step": 4100 + }, + { + "epoch": 0.22, + "grad_norm": 0.1591796875, + "learning_rate": 0.00019065567759292065, + "loss": 1.1931, + "step": 4105 + }, + { + "epoch": 0.23, + "grad_norm": 0.154296875, + "learning_rate": 0.0001906152851394561, + "loss": 1.1672, + "step": 4110 + }, + { + "epoch": 0.23, + "grad_norm": 0.1630859375, + "learning_rate": 0.00019057480987019347, + "loss": 1.1768, + "step": 4115 + }, + { + "epoch": 0.23, + "grad_norm": 0.154296875, + "learning_rate": 0.00019053425182212434, + "loss": 1.1816, + "step": 4120 + }, + { + "epoch": 0.23, + "grad_norm": 0.1611328125, + "learning_rate": 0.00019049361103231575, + "loss": 1.1541, + "step": 4125 + }, + { + "epoch": 0.23, + "grad_norm": 0.15234375, + "learning_rate": 0.0001904528875379105, + "loss": 1.2016, + "step": 4130 + }, + { + "epoch": 0.23, + "grad_norm": 0.1572265625, + "learning_rate": 0.0001904120813761268, + "loss": 1.1801, + "step": 4135 + }, + { + "epoch": 0.23, + "grad_norm": 0.1552734375, + "learning_rate": 0.00019037119258425862, + "loss": 1.1839, + "step": 4140 + }, + { + "epoch": 0.23, + "grad_norm": 0.1572265625, + "learning_rate": 0.0001903302211996753, + "loss": 1.1676, + "step": 4145 + }, + { + "epoch": 0.23, + "grad_norm": 0.1552734375, + "learning_rate": 0.00019028916725982178, + "loss": 1.2568, + "step": 4150 + }, + { + "epoch": 0.23, + "grad_norm": 0.16015625, + "learning_rate": 0.00019024803080221832, + "loss": 1.1593, + "step": 4155 + }, + { + "epoch": 0.23, + "grad_norm": 0.1533203125, + "learning_rate": 0.00019020681186446064, + "loss": 1.1637, + "step": 4160 + }, + { + "epoch": 0.23, + "grad_norm": 0.16015625, + "learning_rate": 0.00019016551048421992, + "loss": 1.1169, + "step": 4165 + }, + { + "epoch": 0.23, + "grad_norm": 0.1640625, + "learning_rate": 0.00019012412669924263, + "loss": 1.2356, + "step": 4170 + }, + { + "epoch": 0.23, + "grad_norm": 0.1572265625, + "learning_rate": 0.00019008266054735046, + "loss": 1.1956, + "step": 4175 + }, + { + "epoch": 0.23, + "grad_norm": 0.154296875, + "learning_rate": 0.00019004111206644055, + "loss": 1.1518, + "step": 4180 + }, + { + "epoch": 0.23, + "grad_norm": 0.162109375, + "learning_rate": 0.00018999948129448516, + "loss": 1.2697, + "step": 4185 + }, + { + "epoch": 0.23, + "grad_norm": 0.1650390625, + "learning_rate": 0.00018995776826953182, + "loss": 1.1719, + "step": 4190 + }, + { + "epoch": 0.23, + "grad_norm": 0.1533203125, + "learning_rate": 0.0001899159730297032, + "loss": 1.2134, + "step": 4195 + }, + { + "epoch": 0.23, + "grad_norm": 0.15625, + "learning_rate": 0.0001898740956131971, + "loss": 1.168, + "step": 4200 + }, + { + "epoch": 0.23, + "grad_norm": 0.1572265625, + "learning_rate": 0.00018983213605828646, + "loss": 1.1682, + "step": 4205 + }, + { + "epoch": 0.23, + "grad_norm": 0.158203125, + "learning_rate": 0.00018979009440331924, + "loss": 1.2043, + "step": 4210 + }, + { + "epoch": 0.23, + "grad_norm": 0.1630859375, + "learning_rate": 0.00018974797068671854, + "loss": 1.2089, + "step": 4215 + }, + { + "epoch": 0.23, + "grad_norm": 0.15625, + "learning_rate": 0.00018970576494698228, + "loss": 1.2662, + "step": 4220 + }, + { + "epoch": 0.23, + "grad_norm": 0.15625, + "learning_rate": 0.0001896634772226835, + "loss": 1.2474, + "step": 4225 + }, + { + "epoch": 0.23, + "grad_norm": 0.16015625, + "learning_rate": 0.00018962110755247014, + "loss": 1.2028, + "step": 4230 + }, + { + "epoch": 0.23, + "grad_norm": 0.1533203125, + "learning_rate": 0.00018957865597506496, + "loss": 1.2415, + "step": 4235 + }, + { + "epoch": 0.23, + "grad_norm": 0.158203125, + "learning_rate": 0.0001895361225292656, + "loss": 1.163, + "step": 4240 + }, + { + "epoch": 0.23, + "grad_norm": 0.1552734375, + "learning_rate": 0.00018949350725394463, + "loss": 1.2265, + "step": 4245 + }, + { + "epoch": 0.23, + "grad_norm": 0.16015625, + "learning_rate": 0.00018945081018804921, + "loss": 1.2452, + "step": 4250 + }, + { + "epoch": 0.23, + "grad_norm": 0.1572265625, + "learning_rate": 0.00018940803137060147, + "loss": 1.1117, + "step": 4255 + }, + { + "epoch": 0.23, + "grad_norm": 0.154296875, + "learning_rate": 0.0001893651708406981, + "loss": 1.1876, + "step": 4260 + }, + { + "epoch": 0.23, + "grad_norm": 0.15625, + "learning_rate": 0.00018932222863751047, + "loss": 1.1784, + "step": 4265 + }, + { + "epoch": 0.23, + "grad_norm": 0.1533203125, + "learning_rate": 0.0001892792048002847, + "loss": 1.1368, + "step": 4270 + }, + { + "epoch": 0.23, + "grad_norm": 0.1611328125, + "learning_rate": 0.00018923609936834148, + "loss": 1.15, + "step": 4275 + }, + { + "epoch": 0.23, + "grad_norm": 0.162109375, + "learning_rate": 0.00018919291238107596, + "loss": 1.1065, + "step": 4280 + }, + { + "epoch": 0.23, + "grad_norm": 0.1552734375, + "learning_rate": 0.000189149643877958, + "loss": 1.1503, + "step": 4285 + }, + { + "epoch": 0.23, + "grad_norm": 0.1630859375, + "learning_rate": 0.0001891062938985318, + "loss": 1.2226, + "step": 4290 + }, + { + "epoch": 0.24, + "grad_norm": 0.162109375, + "learning_rate": 0.00018906286248241615, + "loss": 1.1709, + "step": 4295 + }, + { + "epoch": 0.24, + "grad_norm": 0.1630859375, + "learning_rate": 0.00018901934966930416, + "loss": 1.1334, + "step": 4300 + }, + { + "epoch": 0.24, + "grad_norm": 0.166015625, + "learning_rate": 0.00018897575549896344, + "loss": 1.2418, + "step": 4305 + }, + { + "epoch": 0.24, + "grad_norm": 0.1630859375, + "learning_rate": 0.0001889320800112359, + "loss": 1.2129, + "step": 4310 + }, + { + "epoch": 0.24, + "grad_norm": 0.1591796875, + "learning_rate": 0.00018888832324603772, + "loss": 1.1138, + "step": 4315 + }, + { + "epoch": 0.24, + "grad_norm": 0.1669921875, + "learning_rate": 0.00018884448524335943, + "loss": 1.1346, + "step": 4320 + }, + { + "epoch": 0.24, + "grad_norm": 0.158203125, + "learning_rate": 0.00018880056604326582, + "loss": 1.1759, + "step": 4325 + }, + { + "epoch": 0.24, + "grad_norm": 0.1572265625, + "learning_rate": 0.0001887565656858958, + "loss": 1.2258, + "step": 4330 + }, + { + "epoch": 0.24, + "grad_norm": 0.158203125, + "learning_rate": 0.00018871248421146254, + "loss": 1.1465, + "step": 4335 + }, + { + "epoch": 0.24, + "grad_norm": 0.1611328125, + "learning_rate": 0.0001886683216602533, + "loss": 1.1522, + "step": 4340 + }, + { + "epoch": 0.24, + "grad_norm": 0.166015625, + "learning_rate": 0.00018862407807262947, + "loss": 1.0901, + "step": 4345 + }, + { + "epoch": 0.24, + "grad_norm": 0.1611328125, + "learning_rate": 0.00018857975348902643, + "loss": 1.1998, + "step": 4350 + }, + { + "epoch": 0.24, + "grad_norm": 0.15625, + "learning_rate": 0.0001885353479499537, + "loss": 1.2311, + "step": 4355 + }, + { + "epoch": 0.24, + "grad_norm": 0.15625, + "learning_rate": 0.00018849086149599464, + "loss": 1.1242, + "step": 4360 + }, + { + "epoch": 0.24, + "grad_norm": 0.1572265625, + "learning_rate": 0.0001884462941678067, + "loss": 1.1467, + "step": 4365 + }, + { + "epoch": 0.24, + "grad_norm": 0.16015625, + "learning_rate": 0.00018840164600612117, + "loss": 1.211, + "step": 4370 + }, + { + "epoch": 0.24, + "grad_norm": 0.1572265625, + "learning_rate": 0.00018835691705174328, + "loss": 1.1764, + "step": 4375 + }, + { + "epoch": 0.24, + "grad_norm": 0.166015625, + "learning_rate": 0.00018831210734555196, + "loss": 1.0915, + "step": 4380 + }, + { + "epoch": 0.24, + "grad_norm": 0.154296875, + "learning_rate": 0.00018826721692850006, + "loss": 1.1972, + "step": 4385 + }, + { + "epoch": 0.24, + "grad_norm": 0.1630859375, + "learning_rate": 0.0001882222458416142, + "loss": 1.2591, + "step": 4390 + }, + { + "epoch": 0.24, + "grad_norm": 0.1533203125, + "learning_rate": 0.0001881771941259947, + "loss": 1.1628, + "step": 4395 + }, + { + "epoch": 0.24, + "grad_norm": 0.1630859375, + "learning_rate": 0.0001881320618228155, + "loss": 1.1768, + "step": 4400 + }, + { + "epoch": 0.24, + "grad_norm": 0.1640625, + "learning_rate": 0.0001880868489733243, + "loss": 1.1984, + "step": 4405 + }, + { + "epoch": 0.24, + "grad_norm": 0.1552734375, + "learning_rate": 0.00018804155561884234, + "loss": 1.2011, + "step": 4410 + }, + { + "epoch": 0.24, + "grad_norm": 0.1611328125, + "learning_rate": 0.00018799618180076447, + "loss": 1.2258, + "step": 4415 + }, + { + "epoch": 0.24, + "grad_norm": 0.1591796875, + "learning_rate": 0.00018795072756055907, + "loss": 1.2561, + "step": 4420 + }, + { + "epoch": 0.24, + "grad_norm": 0.1591796875, + "learning_rate": 0.00018790519293976801, + "loss": 1.199, + "step": 4425 + }, + { + "epoch": 0.24, + "grad_norm": 0.1650390625, + "learning_rate": 0.00018785957798000667, + "loss": 1.2548, + "step": 4430 + }, + { + "epoch": 0.24, + "grad_norm": 0.15234375, + "learning_rate": 0.0001878138827229637, + "loss": 1.1135, + "step": 4435 + }, + { + "epoch": 0.24, + "grad_norm": 0.162109375, + "learning_rate": 0.00018776810721040144, + "loss": 1.1236, + "step": 4440 + }, + { + "epoch": 0.24, + "grad_norm": 0.173828125, + "learning_rate": 0.0001877222514841552, + "loss": 1.1822, + "step": 4445 + }, + { + "epoch": 0.24, + "grad_norm": 0.16015625, + "learning_rate": 0.0001876763155861339, + "loss": 1.1692, + "step": 4450 + }, + { + "epoch": 0.24, + "grad_norm": 0.146484375, + "learning_rate": 0.0001876302995583196, + "loss": 1.1848, + "step": 4455 + }, + { + "epoch": 0.24, + "grad_norm": 0.1572265625, + "learning_rate": 0.0001875842034427676, + "loss": 1.248, + "step": 4460 + }, + { + "epoch": 0.24, + "grad_norm": 0.1552734375, + "learning_rate": 0.00018753802728160637, + "loss": 1.2719, + "step": 4465 + }, + { + "epoch": 0.24, + "grad_norm": 0.162109375, + "learning_rate": 0.00018749177111703765, + "loss": 1.229, + "step": 4470 + }, + { + "epoch": 0.25, + "grad_norm": 0.1591796875, + "learning_rate": 0.00018744543499133614, + "loss": 1.1709, + "step": 4475 + }, + { + "epoch": 0.25, + "grad_norm": 0.1611328125, + "learning_rate": 0.00018739901894684974, + "loss": 1.1611, + "step": 4480 + }, + { + "epoch": 0.25, + "grad_norm": 0.158203125, + "learning_rate": 0.00018735252302599936, + "loss": 1.1862, + "step": 4485 + }, + { + "epoch": 0.25, + "grad_norm": 0.15234375, + "learning_rate": 0.00018730594727127886, + "loss": 1.1517, + "step": 4490 + }, + { + "epoch": 0.25, + "grad_norm": 0.1650390625, + "learning_rate": 0.00018725929172525515, + "loss": 1.1763, + "step": 4495 + }, + { + "epoch": 0.25, + "grad_norm": 0.1630859375, + "learning_rate": 0.00018721255643056796, + "loss": 1.1797, + "step": 4500 + }, + { + "epoch": 0.25, + "grad_norm": 0.1650390625, + "learning_rate": 0.00018716574142993003, + "loss": 1.1808, + "step": 4505 + }, + { + "epoch": 0.25, + "grad_norm": 0.1591796875, + "learning_rate": 0.0001871188467661268, + "loss": 1.2529, + "step": 4510 + }, + { + "epoch": 0.25, + "grad_norm": 0.16015625, + "learning_rate": 0.00018707187248201662, + "loss": 1.1896, + "step": 4515 + }, + { + "epoch": 0.25, + "grad_norm": 0.1640625, + "learning_rate": 0.0001870248186205306, + "loss": 1.1237, + "step": 4520 + }, + { + "epoch": 0.25, + "grad_norm": 0.162109375, + "learning_rate": 0.0001869776852246725, + "loss": 1.1672, + "step": 4525 + }, + { + "epoch": 0.25, + "grad_norm": 0.150390625, + "learning_rate": 0.0001869304723375189, + "loss": 1.1375, + "step": 4530 + }, + { + "epoch": 0.25, + "grad_norm": 0.1640625, + "learning_rate": 0.0001868831800022189, + "loss": 1.2534, + "step": 4535 + }, + { + "epoch": 0.25, + "grad_norm": 0.1572265625, + "learning_rate": 0.0001868358082619943, + "loss": 1.1793, + "step": 4540 + }, + { + "epoch": 0.25, + "grad_norm": 0.1630859375, + "learning_rate": 0.00018678835716013942, + "loss": 1.1798, + "step": 4545 + }, + { + "epoch": 0.25, + "grad_norm": 0.158203125, + "learning_rate": 0.0001867408267400211, + "loss": 1.1684, + "step": 4550 + }, + { + "epoch": 0.25, + "grad_norm": 0.1630859375, + "learning_rate": 0.00018669321704507875, + "loss": 1.1877, + "step": 4555 + }, + { + "epoch": 0.25, + "grad_norm": 0.16796875, + "learning_rate": 0.00018664552811882413, + "loss": 1.2018, + "step": 4560 + }, + { + "epoch": 0.25, + "grad_norm": 0.16015625, + "learning_rate": 0.0001865977600048415, + "loss": 1.1956, + "step": 4565 + }, + { + "epoch": 0.25, + "grad_norm": 0.1533203125, + "learning_rate": 0.0001865499127467874, + "loss": 1.1512, + "step": 4570 + }, + { + "epoch": 0.25, + "grad_norm": 0.1552734375, + "learning_rate": 0.00018650198638839078, + "loss": 1.1813, + "step": 4575 + }, + { + "epoch": 0.25, + "grad_norm": 0.1552734375, + "learning_rate": 0.00018645398097345286, + "loss": 1.1737, + "step": 4580 + }, + { + "epoch": 0.25, + "grad_norm": 0.1552734375, + "learning_rate": 0.0001864058965458471, + "loss": 1.1519, + "step": 4585 + }, + { + "epoch": 0.25, + "grad_norm": 0.1533203125, + "learning_rate": 0.0001863577331495192, + "loss": 1.1106, + "step": 4590 + }, + { + "epoch": 0.25, + "grad_norm": 0.1591796875, + "learning_rate": 0.000186309490828487, + "loss": 1.1294, + "step": 4595 + }, + { + "epoch": 0.25, + "grad_norm": 0.1591796875, + "learning_rate": 0.00018626116962684047, + "loss": 1.1882, + "step": 4600 + }, + { + "epoch": 0.25, + "grad_norm": 0.1572265625, + "learning_rate": 0.00018621276958874166, + "loss": 1.1629, + "step": 4605 + }, + { + "epoch": 0.25, + "grad_norm": 0.1572265625, + "learning_rate": 0.00018616429075842477, + "loss": 1.1658, + "step": 4610 + }, + { + "epoch": 0.25, + "grad_norm": 0.15625, + "learning_rate": 0.00018611573318019585, + "loss": 1.0996, + "step": 4615 + }, + { + "epoch": 0.25, + "grad_norm": 0.1474609375, + "learning_rate": 0.00018606709689843304, + "loss": 1.1976, + "step": 4620 + }, + { + "epoch": 0.25, + "grad_norm": 0.154296875, + "learning_rate": 0.0001860183819575864, + "loss": 1.2164, + "step": 4625 + }, + { + "epoch": 0.25, + "grad_norm": 0.1591796875, + "learning_rate": 0.0001859695884021778, + "loss": 1.2458, + "step": 4630 + }, + { + "epoch": 0.25, + "grad_norm": 0.1572265625, + "learning_rate": 0.00018592071627680106, + "loss": 1.1858, + "step": 4635 + }, + { + "epoch": 0.25, + "grad_norm": 0.1611328125, + "learning_rate": 0.00018587176562612174, + "loss": 1.1194, + "step": 4640 + }, + { + "epoch": 0.25, + "grad_norm": 0.1513671875, + "learning_rate": 0.0001858227364948772, + "loss": 1.218, + "step": 4645 + }, + { + "epoch": 0.25, + "grad_norm": 0.1494140625, + "learning_rate": 0.00018577362892787646, + "loss": 1.2114, + "step": 4650 + }, + { + "epoch": 0.25, + "grad_norm": 0.1552734375, + "learning_rate": 0.00018572444297000032, + "loss": 1.166, + "step": 4655 + }, + { + "epoch": 0.26, + "grad_norm": 0.1630859375, + "learning_rate": 0.00018567517866620116, + "loss": 1.1793, + "step": 4660 + }, + { + "epoch": 0.26, + "grad_norm": 0.1552734375, + "learning_rate": 0.00018562583606150296, + "loss": 1.1417, + "step": 4665 + }, + { + "epoch": 0.26, + "grad_norm": 0.162109375, + "learning_rate": 0.00018557641520100133, + "loss": 1.1991, + "step": 4670 + }, + { + "epoch": 0.26, + "grad_norm": 0.16015625, + "learning_rate": 0.00018552691612986334, + "loss": 1.1739, + "step": 4675 + }, + { + "epoch": 0.26, + "grad_norm": 0.15234375, + "learning_rate": 0.00018547733889332752, + "loss": 1.0675, + "step": 4680 + }, + { + "epoch": 0.26, + "grad_norm": 0.15625, + "learning_rate": 0.00018542768353670388, + "loss": 1.2482, + "step": 4685 + }, + { + "epoch": 0.26, + "grad_norm": 0.154296875, + "learning_rate": 0.00018537795010537385, + "loss": 1.1433, + "step": 4690 + }, + { + "epoch": 0.26, + "grad_norm": 0.1591796875, + "learning_rate": 0.00018532813864479013, + "loss": 1.18, + "step": 4695 + }, + { + "epoch": 0.26, + "grad_norm": 0.1669921875, + "learning_rate": 0.00018527824920047678, + "loss": 1.1855, + "step": 4700 + }, + { + "epoch": 0.26, + "grad_norm": 0.146484375, + "learning_rate": 0.00018522828181802918, + "loss": 1.1978, + "step": 4705 + }, + { + "epoch": 0.26, + "grad_norm": 0.154296875, + "learning_rate": 0.00018517823654311385, + "loss": 1.283, + "step": 4710 + }, + { + "epoch": 0.26, + "grad_norm": 0.1650390625, + "learning_rate": 0.0001851281134214686, + "loss": 1.1947, + "step": 4715 + }, + { + "epoch": 0.26, + "grad_norm": 0.169921875, + "learning_rate": 0.00018507791249890225, + "loss": 1.215, + "step": 4720 + }, + { + "epoch": 0.26, + "grad_norm": 0.1484375, + "learning_rate": 0.00018502763382129484, + "loss": 1.2102, + "step": 4725 + }, + { + "epoch": 0.26, + "grad_norm": 0.1708984375, + "learning_rate": 0.00018497727743459742, + "loss": 1.1311, + "step": 4730 + }, + { + "epoch": 0.26, + "grad_norm": 0.1572265625, + "learning_rate": 0.00018492684338483213, + "loss": 1.2421, + "step": 4735 + }, + { + "epoch": 0.26, + "grad_norm": 0.158203125, + "learning_rate": 0.00018487633171809198, + "loss": 1.1825, + "step": 4740 + }, + { + "epoch": 0.26, + "grad_norm": 0.166015625, + "learning_rate": 0.00018482574248054098, + "loss": 1.1921, + "step": 4745 + }, + { + "epoch": 0.26, + "grad_norm": 0.15625, + "learning_rate": 0.00018477507571841402, + "loss": 1.2143, + "step": 4750 + }, + { + "epoch": 0.26, + "grad_norm": 0.15625, + "learning_rate": 0.00018472433147801684, + "loss": 1.1863, + "step": 4755 + }, + { + "epoch": 0.26, + "grad_norm": 0.1591796875, + "learning_rate": 0.000184673509805726, + "loss": 1.2314, + "step": 4760 + }, + { + "epoch": 0.26, + "grad_norm": 0.162109375, + "learning_rate": 0.00018462261074798887, + "loss": 1.1281, + "step": 4765 + }, + { + "epoch": 0.26, + "grad_norm": 0.1533203125, + "learning_rate": 0.00018457163435132344, + "loss": 1.2231, + "step": 4770 + }, + { + "epoch": 0.26, + "grad_norm": 0.1611328125, + "learning_rate": 0.00018452058066231844, + "loss": 1.2033, + "step": 4775 + }, + { + "epoch": 0.26, + "grad_norm": 0.162109375, + "learning_rate": 0.00018446944972763324, + "loss": 1.2101, + "step": 4780 + }, + { + "epoch": 0.26, + "grad_norm": 0.1552734375, + "learning_rate": 0.00018441824159399784, + "loss": 1.1672, + "step": 4785 + }, + { + "epoch": 0.26, + "grad_norm": 0.154296875, + "learning_rate": 0.00018436695630821273, + "loss": 1.1587, + "step": 4790 + }, + { + "epoch": 0.26, + "grad_norm": 0.16015625, + "learning_rate": 0.00018431559391714894, + "loss": 1.1598, + "step": 4795 + }, + { + "epoch": 0.26, + "grad_norm": 0.1591796875, + "learning_rate": 0.000184264154467748, + "loss": 1.1382, + "step": 4800 + }, + { + "epoch": 0.26, + "grad_norm": 0.158203125, + "learning_rate": 0.0001842126380070218, + "loss": 1.0878, + "step": 4805 + }, + { + "epoch": 0.26, + "grad_norm": 0.1630859375, + "learning_rate": 0.00018416104458205266, + "loss": 1.2027, + "step": 4810 + }, + { + "epoch": 0.26, + "grad_norm": 0.154296875, + "learning_rate": 0.00018410937423999325, + "loss": 1.1148, + "step": 4815 + }, + { + "epoch": 0.26, + "grad_norm": 0.16015625, + "learning_rate": 0.00018405762702806652, + "loss": 1.2245, + "step": 4820 + }, + { + "epoch": 0.26, + "grad_norm": 0.1552734375, + "learning_rate": 0.00018400580299356557, + "loss": 1.241, + "step": 4825 + }, + { + "epoch": 0.26, + "grad_norm": 0.1611328125, + "learning_rate": 0.00018395390218385393, + "loss": 1.1644, + "step": 4830 + }, + { + "epoch": 0.26, + "grad_norm": 0.1611328125, + "learning_rate": 0.00018390192464636513, + "loss": 1.2225, + "step": 4835 + }, + { + "epoch": 0.27, + "grad_norm": 0.1611328125, + "learning_rate": 0.00018384987042860287, + "loss": 1.1297, + "step": 4840 + }, + { + "epoch": 0.27, + "grad_norm": 0.16796875, + "learning_rate": 0.0001837977395781409, + "loss": 1.2017, + "step": 4845 + }, + { + "epoch": 0.27, + "grad_norm": 0.16015625, + "learning_rate": 0.0001837455321426231, + "loss": 1.1781, + "step": 4850 + }, + { + "epoch": 0.27, + "grad_norm": 0.1552734375, + "learning_rate": 0.00018369324816976324, + "loss": 1.2031, + "step": 4855 + }, + { + "epoch": 0.27, + "grad_norm": 0.166015625, + "learning_rate": 0.00018364088770734507, + "loss": 1.275, + "step": 4860 + }, + { + "epoch": 0.27, + "grad_norm": 0.1494140625, + "learning_rate": 0.00018358845080322227, + "loss": 1.1411, + "step": 4865 + }, + { + "epoch": 0.27, + "grad_norm": 0.162109375, + "learning_rate": 0.00018353593750531836, + "loss": 1.2066, + "step": 4870 + }, + { + "epoch": 0.27, + "grad_norm": 0.1611328125, + "learning_rate": 0.00018348334786162668, + "loss": 1.1122, + "step": 4875 + }, + { + "epoch": 0.27, + "grad_norm": 0.15625, + "learning_rate": 0.00018343068192021037, + "loss": 1.2278, + "step": 4880 + }, + { + "epoch": 0.27, + "grad_norm": 0.1650390625, + "learning_rate": 0.00018337793972920228, + "loss": 1.1732, + "step": 4885 + }, + { + "epoch": 0.27, + "grad_norm": 0.1650390625, + "learning_rate": 0.00018332512133680494, + "loss": 1.1678, + "step": 4890 + }, + { + "epoch": 0.27, + "grad_norm": 0.1611328125, + "learning_rate": 0.00018327222679129048, + "loss": 1.1443, + "step": 4895 + }, + { + "epoch": 0.27, + "grad_norm": 0.16015625, + "learning_rate": 0.00018321925614100075, + "loss": 1.149, + "step": 4900 + }, + { + "epoch": 0.27, + "grad_norm": 0.162109375, + "learning_rate": 0.00018316620943434705, + "loss": 1.1907, + "step": 4905 + }, + { + "epoch": 0.27, + "grad_norm": 0.1591796875, + "learning_rate": 0.00018311308671981025, + "loss": 1.2136, + "step": 4910 + }, + { + "epoch": 0.27, + "grad_norm": 0.1611328125, + "learning_rate": 0.00018305988804594064, + "loss": 1.1774, + "step": 4915 + }, + { + "epoch": 0.27, + "grad_norm": 0.1611328125, + "learning_rate": 0.00018300661346135793, + "loss": 1.1947, + "step": 4920 + }, + { + "epoch": 0.27, + "grad_norm": 0.1591796875, + "learning_rate": 0.00018295326301475125, + "loss": 1.2368, + "step": 4925 + }, + { + "epoch": 0.27, + "grad_norm": 0.14453125, + "learning_rate": 0.000182899836754879, + "loss": 1.1278, + "step": 4930 + }, + { + "epoch": 0.27, + "grad_norm": 0.16796875, + "learning_rate": 0.00018284633473056896, + "loss": 1.2375, + "step": 4935 + }, + { + "epoch": 0.27, + "grad_norm": 0.1611328125, + "learning_rate": 0.00018279275699071808, + "loss": 1.1362, + "step": 4940 + }, + { + "epoch": 0.27, + "grad_norm": 0.1552734375, + "learning_rate": 0.00018273910358429252, + "loss": 1.1237, + "step": 4945 + }, + { + "epoch": 0.27, + "grad_norm": 0.1533203125, + "learning_rate": 0.00018268537456032758, + "loss": 1.1155, + "step": 4950 + }, + { + "epoch": 0.27, + "grad_norm": 0.16015625, + "learning_rate": 0.00018263156996792777, + "loss": 1.1875, + "step": 4955 + }, + { + "epoch": 0.27, + "grad_norm": 0.1611328125, + "learning_rate": 0.00018257768985626652, + "loss": 1.2704, + "step": 4960 + }, + { + "epoch": 0.27, + "grad_norm": 0.1611328125, + "learning_rate": 0.00018252373427458634, + "loss": 1.2003, + "step": 4965 + }, + { + "epoch": 0.27, + "grad_norm": 0.169921875, + "learning_rate": 0.0001824697032721987, + "loss": 1.2654, + "step": 4970 + }, + { + "epoch": 0.27, + "grad_norm": 0.1591796875, + "learning_rate": 0.0001824155968984841, + "loss": 1.1861, + "step": 4975 + }, + { + "epoch": 0.27, + "grad_norm": 0.162109375, + "learning_rate": 0.00018236141520289176, + "loss": 1.1147, + "step": 4980 + }, + { + "epoch": 0.27, + "grad_norm": 0.1572265625, + "learning_rate": 0.00018230715823493987, + "loss": 1.2168, + "step": 4985 + }, + { + "epoch": 0.27, + "grad_norm": 0.1552734375, + "learning_rate": 0.00018225282604421533, + "loss": 1.1606, + "step": 4990 + }, + { + "epoch": 0.27, + "grad_norm": 0.16015625, + "learning_rate": 0.00018219841868037385, + "loss": 1.2405, + "step": 4995 + }, + { + "epoch": 0.27, + "grad_norm": 0.1630859375, + "learning_rate": 0.0001821439361931398, + "loss": 1.2015, + "step": 5000 + }, + { + "epoch": 0.27, + "grad_norm": 0.166015625, + "learning_rate": 0.00018208937863230625, + "loss": 1.1589, + "step": 5005 + }, + { + "epoch": 0.27, + "grad_norm": 0.1572265625, + "learning_rate": 0.0001820347460477348, + "loss": 1.1751, + "step": 5010 + }, + { + "epoch": 0.27, + "grad_norm": 0.1669921875, + "learning_rate": 0.0001819800384893557, + "loss": 1.1279, + "step": 5015 + }, + { + "epoch": 0.27, + "grad_norm": 0.1630859375, + "learning_rate": 0.00018192525600716769, + "loss": 1.1365, + "step": 5020 + }, + { + "epoch": 0.28, + "grad_norm": 0.162109375, + "learning_rate": 0.00018187039865123793, + "loss": 1.1537, + "step": 5025 + }, + { + "epoch": 0.28, + "grad_norm": 0.1591796875, + "learning_rate": 0.00018181546647170216, + "loss": 1.2473, + "step": 5030 + }, + { + "epoch": 0.28, + "grad_norm": 0.1708984375, + "learning_rate": 0.00018176045951876434, + "loss": 1.2583, + "step": 5035 + }, + { + "epoch": 0.28, + "grad_norm": 0.16796875, + "learning_rate": 0.00018170537784269688, + "loss": 1.2115, + "step": 5040 + }, + { + "epoch": 0.28, + "grad_norm": 0.1640625, + "learning_rate": 0.00018165022149384044, + "loss": 1.227, + "step": 5045 + }, + { + "epoch": 0.28, + "grad_norm": 0.1572265625, + "learning_rate": 0.00018159499052260384, + "loss": 1.1894, + "step": 5050 + }, + { + "epoch": 0.28, + "grad_norm": 0.1611328125, + "learning_rate": 0.00018153968497946425, + "loss": 1.2358, + "step": 5055 + }, + { + "epoch": 0.28, + "grad_norm": 0.162109375, + "learning_rate": 0.00018148430491496696, + "loss": 1.1847, + "step": 5060 + }, + { + "epoch": 0.28, + "grad_norm": 0.1552734375, + "learning_rate": 0.00018142885037972524, + "loss": 1.1305, + "step": 5065 + }, + { + "epoch": 0.28, + "grad_norm": 0.1630859375, + "learning_rate": 0.00018137332142442055, + "loss": 1.156, + "step": 5070 + }, + { + "epoch": 0.28, + "grad_norm": 0.16015625, + "learning_rate": 0.00018131771809980237, + "loss": 1.191, + "step": 5075 + }, + { + "epoch": 0.28, + "grad_norm": 0.162109375, + "learning_rate": 0.00018126204045668804, + "loss": 1.1395, + "step": 5080 + }, + { + "epoch": 0.28, + "grad_norm": 0.1533203125, + "learning_rate": 0.00018120628854596295, + "loss": 1.1608, + "step": 5085 + }, + { + "epoch": 0.28, + "grad_norm": 0.15625, + "learning_rate": 0.00018115046241858025, + "loss": 1.1623, + "step": 5090 + }, + { + "epoch": 0.28, + "grad_norm": 0.1689453125, + "learning_rate": 0.000181094562125561, + "loss": 1.2233, + "step": 5095 + }, + { + "epoch": 0.28, + "grad_norm": 0.1640625, + "learning_rate": 0.000181038587717994, + "loss": 1.1292, + "step": 5100 + }, + { + "epoch": 0.28, + "grad_norm": 0.1611328125, + "learning_rate": 0.00018098253924703583, + "loss": 1.2846, + "step": 5105 + }, + { + "epoch": 0.28, + "grad_norm": 0.166015625, + "learning_rate": 0.0001809264167639107, + "loss": 1.1793, + "step": 5110 + }, + { + "epoch": 0.28, + "grad_norm": 0.1572265625, + "learning_rate": 0.00018087022031991045, + "loss": 1.2, + "step": 5115 + }, + { + "epoch": 0.28, + "grad_norm": 0.1650390625, + "learning_rate": 0.0001808139499663946, + "loss": 1.2053, + "step": 5120 + }, + { + "epoch": 0.28, + "grad_norm": 0.1650390625, + "learning_rate": 0.0001807576057547902, + "loss": 1.1382, + "step": 5125 + }, + { + "epoch": 0.28, + "grad_norm": 0.1640625, + "learning_rate": 0.0001807011877365917, + "loss": 1.2428, + "step": 5130 + }, + { + "epoch": 0.28, + "grad_norm": 0.158203125, + "learning_rate": 0.00018064469596336115, + "loss": 1.2063, + "step": 5135 + }, + { + "epoch": 0.28, + "grad_norm": 0.1630859375, + "learning_rate": 0.0001805881304867279, + "loss": 1.1638, + "step": 5140 + }, + { + "epoch": 0.28, + "grad_norm": 0.15625, + "learning_rate": 0.00018053149135838865, + "loss": 1.2003, + "step": 5145 + }, + { + "epoch": 0.28, + "grad_norm": 0.1630859375, + "learning_rate": 0.00018047477863010754, + "loss": 1.2658, + "step": 5150 + }, + { + "epoch": 0.28, + "grad_norm": 0.1669921875, + "learning_rate": 0.0001804179923537158, + "loss": 1.1726, + "step": 5155 + }, + { + "epoch": 0.28, + "grad_norm": 0.162109375, + "learning_rate": 0.00018036113258111205, + "loss": 1.1496, + "step": 5160 + }, + { + "epoch": 0.28, + "grad_norm": 0.166015625, + "learning_rate": 0.00018030419936426192, + "loss": 1.163, + "step": 5165 + }, + { + "epoch": 0.28, + "grad_norm": 0.166015625, + "learning_rate": 0.0001802471927551983, + "loss": 1.2435, + "step": 5170 + }, + { + "epoch": 0.28, + "grad_norm": 0.1591796875, + "learning_rate": 0.00018019011280602108, + "loss": 1.1734, + "step": 5175 + }, + { + "epoch": 0.28, + "grad_norm": 0.154296875, + "learning_rate": 0.00018013295956889722, + "loss": 1.1583, + "step": 5180 + }, + { + "epoch": 0.28, + "grad_norm": 0.16015625, + "learning_rate": 0.00018007573309606058, + "loss": 1.1795, + "step": 5185 + }, + { + "epoch": 0.28, + "grad_norm": 0.158203125, + "learning_rate": 0.00018001843343981203, + "loss": 1.1806, + "step": 5190 + }, + { + "epoch": 0.28, + "grad_norm": 0.1640625, + "learning_rate": 0.00017996106065251932, + "loss": 1.1773, + "step": 5195 + }, + { + "epoch": 0.28, + "grad_norm": 0.158203125, + "learning_rate": 0.00017990361478661696, + "loss": 1.1526, + "step": 5200 + }, + { + "epoch": 0.29, + "grad_norm": 0.1572265625, + "learning_rate": 0.00017984609589460636, + "loss": 1.1713, + "step": 5205 + }, + { + "epoch": 0.29, + "grad_norm": 0.1630859375, + "learning_rate": 0.0001797885040290556, + "loss": 1.227, + "step": 5210 + }, + { + "epoch": 0.29, + "grad_norm": 0.1650390625, + "learning_rate": 0.00017973083924259945, + "loss": 1.1794, + "step": 5215 + }, + { + "epoch": 0.29, + "grad_norm": 0.1611328125, + "learning_rate": 0.00017967310158793938, + "loss": 1.2075, + "step": 5220 + }, + { + "epoch": 0.29, + "grad_norm": 0.15625, + "learning_rate": 0.00017961529111784337, + "loss": 1.2101, + "step": 5225 + }, + { + "epoch": 0.29, + "grad_norm": 0.154296875, + "learning_rate": 0.00017955740788514604, + "loss": 1.1414, + "step": 5230 + }, + { + "epoch": 0.29, + "grad_norm": 0.162109375, + "learning_rate": 0.00017949945194274843, + "loss": 1.23, + "step": 5235 + }, + { + "epoch": 0.29, + "grad_norm": 0.1533203125, + "learning_rate": 0.00017944142334361807, + "loss": 1.1522, + "step": 5240 + }, + { + "epoch": 0.29, + "grad_norm": 0.1640625, + "learning_rate": 0.00017938332214078893, + "loss": 1.1904, + "step": 5245 + }, + { + "epoch": 0.29, + "grad_norm": 0.1533203125, + "learning_rate": 0.00017932514838736124, + "loss": 1.194, + "step": 5250 + }, + { + "epoch": 0.29, + "grad_norm": 0.158203125, + "learning_rate": 0.00017926690213650165, + "loss": 1.2166, + "step": 5255 + }, + { + "epoch": 0.29, + "grad_norm": 0.1591796875, + "learning_rate": 0.00017920858344144298, + "loss": 1.2755, + "step": 5260 + }, + { + "epoch": 0.29, + "grad_norm": 0.1572265625, + "learning_rate": 0.0001791501923554843, + "loss": 1.2374, + "step": 5265 + }, + { + "epoch": 0.29, + "grad_norm": 0.1533203125, + "learning_rate": 0.0001790917289319908, + "loss": 1.1829, + "step": 5270 + }, + { + "epoch": 0.29, + "grad_norm": 0.1552734375, + "learning_rate": 0.0001790331932243938, + "loss": 1.1124, + "step": 5275 + }, + { + "epoch": 0.29, + "grad_norm": 0.166015625, + "learning_rate": 0.00017897458528619073, + "loss": 1.1659, + "step": 5280 + }, + { + "epoch": 0.29, + "grad_norm": 0.1552734375, + "learning_rate": 0.00017891590517094498, + "loss": 1.2141, + "step": 5285 + }, + { + "epoch": 0.29, + "grad_norm": 0.1552734375, + "learning_rate": 0.00017885715293228593, + "loss": 1.1627, + "step": 5290 + }, + { + "epoch": 0.29, + "grad_norm": 0.173828125, + "learning_rate": 0.00017879832862390885, + "loss": 1.1381, + "step": 5295 + }, + { + "epoch": 0.29, + "grad_norm": 0.16015625, + "learning_rate": 0.00017873943229957488, + "loss": 1.2275, + "step": 5300 + }, + { + "epoch": 0.29, + "grad_norm": 0.1572265625, + "learning_rate": 0.000178680464013111, + "loss": 1.1676, + "step": 5305 + }, + { + "epoch": 0.29, + "grad_norm": 0.1650390625, + "learning_rate": 0.00017862142381840996, + "loss": 1.1738, + "step": 5310 + }, + { + "epoch": 0.29, + "grad_norm": 0.16015625, + "learning_rate": 0.0001785623117694302, + "loss": 1.1766, + "step": 5315 + }, + { + "epoch": 0.29, + "grad_norm": 0.1611328125, + "learning_rate": 0.00017850312792019586, + "loss": 1.1603, + "step": 5320 + }, + { + "epoch": 0.29, + "grad_norm": 0.1591796875, + "learning_rate": 0.00017844387232479665, + "loss": 1.1815, + "step": 5325 + }, + { + "epoch": 0.29, + "grad_norm": 0.1630859375, + "learning_rate": 0.0001783845450373879, + "loss": 1.1621, + "step": 5330 + }, + { + "epoch": 0.29, + "grad_norm": 0.1640625, + "learning_rate": 0.0001783251461121905, + "loss": 1.1822, + "step": 5335 + }, + { + "epoch": 0.29, + "grad_norm": 0.166015625, + "learning_rate": 0.0001782656756034907, + "loss": 1.1437, + "step": 5340 + }, + { + "epoch": 0.29, + "grad_norm": 0.1591796875, + "learning_rate": 0.00017820613356564025, + "loss": 1.1738, + "step": 5345 + }, + { + "epoch": 0.29, + "grad_norm": 0.1630859375, + "learning_rate": 0.00017814652005305624, + "loss": 1.1824, + "step": 5350 + }, + { + "epoch": 0.29, + "grad_norm": 0.16015625, + "learning_rate": 0.00017808683512022112, + "loss": 1.2294, + "step": 5355 + }, + { + "epoch": 0.29, + "grad_norm": 0.162109375, + "learning_rate": 0.0001780270788216826, + "loss": 1.1425, + "step": 5360 + }, + { + "epoch": 0.29, + "grad_norm": 0.162109375, + "learning_rate": 0.00017796725121205356, + "loss": 1.2155, + "step": 5365 + }, + { + "epoch": 0.29, + "grad_norm": 0.1552734375, + "learning_rate": 0.0001779073523460121, + "loss": 1.181, + "step": 5370 + }, + { + "epoch": 0.29, + "grad_norm": 0.15234375, + "learning_rate": 0.00017784738227830146, + "loss": 1.1912, + "step": 5375 + }, + { + "epoch": 0.29, + "grad_norm": 0.15625, + "learning_rate": 0.00017778734106372996, + "loss": 1.1595, + "step": 5380 + }, + { + "epoch": 0.29, + "grad_norm": 0.16015625, + "learning_rate": 0.00017772722875717087, + "loss": 1.1602, + "step": 5385 + }, + { + "epoch": 0.3, + "grad_norm": 0.16015625, + "learning_rate": 0.00017766704541356248, + "loss": 1.1615, + "step": 5390 + }, + { + "epoch": 0.3, + "grad_norm": 0.1669921875, + "learning_rate": 0.000177606791087908, + "loss": 1.1613, + "step": 5395 + }, + { + "epoch": 0.3, + "grad_norm": 0.1552734375, + "learning_rate": 0.00017754646583527554, + "loss": 1.1412, + "step": 5400 + }, + { + "epoch": 0.3, + "grad_norm": 0.171875, + "learning_rate": 0.000177486069710798, + "loss": 1.2031, + "step": 5405 + }, + { + "epoch": 0.3, + "grad_norm": 0.1611328125, + "learning_rate": 0.00017742560276967306, + "loss": 1.0884, + "step": 5410 + }, + { + "epoch": 0.3, + "grad_norm": 0.1630859375, + "learning_rate": 0.00017736506506716308, + "loss": 1.1752, + "step": 5415 + }, + { + "epoch": 0.3, + "grad_norm": 0.1630859375, + "learning_rate": 0.00017730445665859517, + "loss": 1.2408, + "step": 5420 + }, + { + "epoch": 0.3, + "grad_norm": 0.1650390625, + "learning_rate": 0.00017724377759936098, + "loss": 1.1372, + "step": 5425 + }, + { + "epoch": 0.3, + "grad_norm": 0.1591796875, + "learning_rate": 0.00017718302794491683, + "loss": 1.1929, + "step": 5430 + }, + { + "epoch": 0.3, + "grad_norm": 0.1728515625, + "learning_rate": 0.00017712220775078344, + "loss": 1.2121, + "step": 5435 + }, + { + "epoch": 0.3, + "grad_norm": 0.158203125, + "learning_rate": 0.00017706131707254607, + "loss": 1.2263, + "step": 5440 + }, + { + "epoch": 0.3, + "grad_norm": 0.1640625, + "learning_rate": 0.00017700035596585441, + "loss": 1.1945, + "step": 5445 + }, + { + "epoch": 0.3, + "grad_norm": 0.1611328125, + "learning_rate": 0.00017693932448642244, + "loss": 1.1754, + "step": 5450 + }, + { + "epoch": 0.3, + "grad_norm": 0.1650390625, + "learning_rate": 0.00017687822269002855, + "loss": 1.1458, + "step": 5455 + }, + { + "epoch": 0.3, + "grad_norm": 0.1591796875, + "learning_rate": 0.00017681705063251534, + "loss": 1.1236, + "step": 5460 + }, + { + "epoch": 0.3, + "grad_norm": 0.1591796875, + "learning_rate": 0.00017675580836978966, + "loss": 1.1502, + "step": 5465 + }, + { + "epoch": 0.3, + "grad_norm": 0.1572265625, + "learning_rate": 0.00017669449595782242, + "loss": 1.2136, + "step": 5470 + }, + { + "epoch": 0.3, + "grad_norm": 0.1640625, + "learning_rate": 0.0001766331134526488, + "loss": 1.1455, + "step": 5475 + }, + { + "epoch": 0.3, + "grad_norm": 0.166015625, + "learning_rate": 0.00017657166091036787, + "loss": 1.2175, + "step": 5480 + }, + { + "epoch": 0.3, + "grad_norm": 0.1630859375, + "learning_rate": 0.00017651013838714287, + "loss": 1.1593, + "step": 5485 + }, + { + "epoch": 0.3, + "grad_norm": 0.1748046875, + "learning_rate": 0.0001764485459392009, + "loss": 1.1017, + "step": 5490 + }, + { + "epoch": 0.3, + "grad_norm": 0.1591796875, + "learning_rate": 0.000176386883622833, + "loss": 1.1422, + "step": 5495 + }, + { + "epoch": 0.3, + "grad_norm": 0.1650390625, + "learning_rate": 0.00017632515149439403, + "loss": 1.1333, + "step": 5500 + }, + { + "epoch": 0.3, + "grad_norm": 0.1572265625, + "learning_rate": 0.00017626334961030272, + "loss": 1.1235, + "step": 5505 + }, + { + "epoch": 0.3, + "grad_norm": 0.158203125, + "learning_rate": 0.00017620147802704147, + "loss": 1.1761, + "step": 5510 + }, + { + "epoch": 0.3, + "grad_norm": 0.1533203125, + "learning_rate": 0.0001761395368011564, + "loss": 1.2288, + "step": 5515 + }, + { + "epoch": 0.3, + "grad_norm": 0.169921875, + "learning_rate": 0.00017607752598925737, + "loss": 1.1904, + "step": 5520 + }, + { + "epoch": 0.3, + "grad_norm": 0.16015625, + "learning_rate": 0.00017601544564801772, + "loss": 1.1194, + "step": 5525 + }, + { + "epoch": 0.3, + "grad_norm": 0.1640625, + "learning_rate": 0.00017595329583417434, + "loss": 1.2093, + "step": 5530 + }, + { + "epoch": 0.3, + "grad_norm": 0.1689453125, + "learning_rate": 0.00017589107660452772, + "loss": 1.1759, + "step": 5535 + }, + { + "epoch": 0.3, + "grad_norm": 0.1650390625, + "learning_rate": 0.00017582878801594172, + "loss": 1.1685, + "step": 5540 + }, + { + "epoch": 0.3, + "grad_norm": 0.1611328125, + "learning_rate": 0.00017576643012534352, + "loss": 1.106, + "step": 5545 + }, + { + "epoch": 0.3, + "grad_norm": 0.158203125, + "learning_rate": 0.0001757040029897238, + "loss": 1.2092, + "step": 5550 + }, + { + "epoch": 0.3, + "grad_norm": 0.162109375, + "learning_rate": 0.00017564150666613638, + "loss": 1.2147, + "step": 5555 + }, + { + "epoch": 0.3, + "grad_norm": 0.1630859375, + "learning_rate": 0.0001755789412116984, + "loss": 1.1028, + "step": 5560 + }, + { + "epoch": 0.3, + "grad_norm": 0.1640625, + "learning_rate": 0.00017551630668359008, + "loss": 1.2131, + "step": 5565 + }, + { + "epoch": 0.31, + "grad_norm": 0.1767578125, + "learning_rate": 0.00017545360313905492, + "loss": 1.1669, + "step": 5570 + }, + { + "epoch": 0.31, + "grad_norm": 0.1669921875, + "learning_rate": 0.00017539083063539934, + "loss": 1.2072, + "step": 5575 + }, + { + "epoch": 0.31, + "grad_norm": 0.1650390625, + "learning_rate": 0.00017532798922999292, + "loss": 1.1919, + "step": 5580 + }, + { + "epoch": 0.31, + "grad_norm": 0.1728515625, + "learning_rate": 0.0001752650789802681, + "loss": 1.2274, + "step": 5585 + }, + { + "epoch": 0.31, + "grad_norm": 0.1611328125, + "learning_rate": 0.0001752020999437203, + "loss": 1.2395, + "step": 5590 + }, + { + "epoch": 0.31, + "grad_norm": 0.158203125, + "learning_rate": 0.0001751390521779078, + "loss": 1.1656, + "step": 5595 + }, + { + "epoch": 0.31, + "grad_norm": 0.1533203125, + "learning_rate": 0.00017507593574045166, + "loss": 1.148, + "step": 5600 + }, + { + "epoch": 0.31, + "grad_norm": 0.16796875, + "learning_rate": 0.00017501275068903575, + "loss": 1.2134, + "step": 5605 + }, + { + "epoch": 0.31, + "grad_norm": 0.1591796875, + "learning_rate": 0.00017494949708140662, + "loss": 1.2303, + "step": 5610 + }, + { + "epoch": 0.31, + "grad_norm": 0.1650390625, + "learning_rate": 0.00017488617497537348, + "loss": 1.1557, + "step": 5615 + }, + { + "epoch": 0.31, + "grad_norm": 0.1640625, + "learning_rate": 0.00017482278442880814, + "loss": 1.2181, + "step": 5620 + }, + { + "epoch": 0.31, + "grad_norm": 0.1630859375, + "learning_rate": 0.00017475932549964492, + "loss": 1.1929, + "step": 5625 + }, + { + "epoch": 0.31, + "grad_norm": 0.173828125, + "learning_rate": 0.00017469579824588077, + "loss": 1.2005, + "step": 5630 + }, + { + "epoch": 0.31, + "grad_norm": 0.1630859375, + "learning_rate": 0.0001746322027255749, + "loss": 1.1688, + "step": 5635 + }, + { + "epoch": 0.31, + "grad_norm": 0.17578125, + "learning_rate": 0.00017456853899684905, + "loss": 1.1488, + "step": 5640 + }, + { + "epoch": 0.31, + "grad_norm": 0.1669921875, + "learning_rate": 0.00017450480711788728, + "loss": 1.1308, + "step": 5645 + }, + { + "epoch": 0.31, + "grad_norm": 0.1650390625, + "learning_rate": 0.00017444100714693583, + "loss": 1.1607, + "step": 5650 + }, + { + "epoch": 0.31, + "grad_norm": 0.1611328125, + "learning_rate": 0.00017437713914230328, + "loss": 1.195, + "step": 5655 + }, + { + "epoch": 0.31, + "grad_norm": 0.1640625, + "learning_rate": 0.00017431320316236038, + "loss": 1.2241, + "step": 5660 + }, + { + "epoch": 0.31, + "grad_norm": 0.15625, + "learning_rate": 0.00017424919926553993, + "loss": 1.1902, + "step": 5665 + }, + { + "epoch": 0.31, + "grad_norm": 0.1611328125, + "learning_rate": 0.0001741851275103369, + "loss": 1.1647, + "step": 5670 + }, + { + "epoch": 0.31, + "grad_norm": 0.1630859375, + "learning_rate": 0.0001741209879553082, + "loss": 1.1406, + "step": 5675 + }, + { + "epoch": 0.31, + "grad_norm": 0.16015625, + "learning_rate": 0.00017405678065907272, + "loss": 1.1755, + "step": 5680 + }, + { + "epoch": 0.31, + "grad_norm": 0.173828125, + "learning_rate": 0.0001739925056803113, + "loss": 1.1539, + "step": 5685 + }, + { + "epoch": 0.31, + "grad_norm": 0.162109375, + "learning_rate": 0.00017392816307776655, + "loss": 1.2184, + "step": 5690 + }, + { + "epoch": 0.31, + "grad_norm": 0.166015625, + "learning_rate": 0.000173863752910243, + "loss": 1.1078, + "step": 5695 + }, + { + "epoch": 0.31, + "grad_norm": 0.1591796875, + "learning_rate": 0.0001737992752366069, + "loss": 1.1312, + "step": 5700 + }, + { + "epoch": 0.31, + "grad_norm": 0.16796875, + "learning_rate": 0.00017373473011578607, + "loss": 1.2622, + "step": 5705 + }, + { + "epoch": 0.31, + "grad_norm": 0.171875, + "learning_rate": 0.00017367011760677014, + "loss": 1.1566, + "step": 5710 + }, + { + "epoch": 0.31, + "grad_norm": 0.1572265625, + "learning_rate": 0.00017360543776861028, + "loss": 1.1736, + "step": 5715 + }, + { + "epoch": 0.31, + "grad_norm": 0.1591796875, + "learning_rate": 0.00017354069066041907, + "loss": 1.1516, + "step": 5720 + }, + { + "epoch": 0.31, + "grad_norm": 0.1591796875, + "learning_rate": 0.00017347587634137076, + "loss": 1.0988, + "step": 5725 + }, + { + "epoch": 0.31, + "grad_norm": 0.1611328125, + "learning_rate": 0.00017341099487070088, + "loss": 1.1291, + "step": 5730 + }, + { + "epoch": 0.31, + "grad_norm": 0.1650390625, + "learning_rate": 0.00017334604630770643, + "loss": 1.1378, + "step": 5735 + }, + { + "epoch": 0.31, + "grad_norm": 0.1640625, + "learning_rate": 0.00017328103071174566, + "loss": 1.1181, + "step": 5740 + }, + { + "epoch": 0.31, + "grad_norm": 0.162109375, + "learning_rate": 0.00017321594814223814, + "loss": 1.2002, + "step": 5745 + }, + { + "epoch": 0.31, + "grad_norm": 0.1630859375, + "learning_rate": 0.00017315079865866457, + "loss": 1.2042, + "step": 5750 + }, + { + "epoch": 0.32, + "grad_norm": 0.16015625, + "learning_rate": 0.0001730855823205669, + "loss": 1.1689, + "step": 5755 + }, + { + "epoch": 0.32, + "grad_norm": 0.158203125, + "learning_rate": 0.00017302029918754807, + "loss": 1.2722, + "step": 5760 + }, + { + "epoch": 0.32, + "grad_norm": 0.1689453125, + "learning_rate": 0.00017295494931927224, + "loss": 1.2428, + "step": 5765 + }, + { + "epoch": 0.32, + "grad_norm": 0.166015625, + "learning_rate": 0.0001728895327754643, + "loss": 1.1916, + "step": 5770 + }, + { + "epoch": 0.32, + "grad_norm": 0.1630859375, + "learning_rate": 0.00017282404961591036, + "loss": 1.1339, + "step": 5775 + }, + { + "epoch": 0.32, + "grad_norm": 0.158203125, + "learning_rate": 0.00017275849990045723, + "loss": 1.1971, + "step": 5780 + }, + { + "epoch": 0.32, + "grad_norm": 0.1572265625, + "learning_rate": 0.00017269288368901257, + "loss": 1.1746, + "step": 5785 + }, + { + "epoch": 0.32, + "grad_norm": 0.1630859375, + "learning_rate": 0.00017262720104154483, + "loss": 1.1285, + "step": 5790 + }, + { + "epoch": 0.32, + "grad_norm": 0.162109375, + "learning_rate": 0.00017256145201808326, + "loss": 1.2119, + "step": 5795 + }, + { + "epoch": 0.32, + "grad_norm": 0.1650390625, + "learning_rate": 0.00017249563667871764, + "loss": 1.1001, + "step": 5800 + }, + { + "epoch": 0.32, + "grad_norm": 0.1669921875, + "learning_rate": 0.0001724297550835984, + "loss": 1.1578, + "step": 5805 + }, + { + "epoch": 0.32, + "grad_norm": 0.1640625, + "learning_rate": 0.0001723638072929366, + "loss": 1.2169, + "step": 5810 + }, + { + "epoch": 0.32, + "grad_norm": 0.1552734375, + "learning_rate": 0.00017229779336700368, + "loss": 1.2047, + "step": 5815 + }, + { + "epoch": 0.32, + "grad_norm": 0.158203125, + "learning_rate": 0.0001722317133661316, + "loss": 1.2149, + "step": 5820 + }, + { + "epoch": 0.32, + "grad_norm": 0.16015625, + "learning_rate": 0.00017216556735071272, + "loss": 1.1894, + "step": 5825 + }, + { + "epoch": 0.32, + "grad_norm": 0.166015625, + "learning_rate": 0.00017209935538119967, + "loss": 1.1614, + "step": 5830 + }, + { + "epoch": 0.32, + "grad_norm": 0.1650390625, + "learning_rate": 0.0001720330775181054, + "loss": 1.1752, + "step": 5835 + }, + { + "epoch": 0.32, + "grad_norm": 0.1689453125, + "learning_rate": 0.000171966733822003, + "loss": 1.2206, + "step": 5840 + }, + { + "epoch": 0.32, + "grad_norm": 0.1611328125, + "learning_rate": 0.0001719003243535259, + "loss": 1.1617, + "step": 5845 + }, + { + "epoch": 0.32, + "grad_norm": 0.15625, + "learning_rate": 0.00017183384917336748, + "loss": 1.1625, + "step": 5850 + }, + { + "epoch": 0.32, + "grad_norm": 0.1591796875, + "learning_rate": 0.00017176730834228123, + "loss": 1.052, + "step": 5855 + }, + { + "epoch": 0.32, + "grad_norm": 0.1650390625, + "learning_rate": 0.0001717007019210807, + "loss": 1.1625, + "step": 5860 + }, + { + "epoch": 0.32, + "grad_norm": 0.166015625, + "learning_rate": 0.00017163402997063927, + "loss": 1.1776, + "step": 5865 + }, + { + "epoch": 0.32, + "grad_norm": 0.1650390625, + "learning_rate": 0.00017156729255189028, + "loss": 1.1873, + "step": 5870 + }, + { + "epoch": 0.32, + "grad_norm": 0.1728515625, + "learning_rate": 0.0001715004897258269, + "loss": 1.1842, + "step": 5875 + }, + { + "epoch": 0.32, + "grad_norm": 0.158203125, + "learning_rate": 0.00017143362155350205, + "loss": 1.1114, + "step": 5880 + }, + { + "epoch": 0.32, + "grad_norm": 0.1708984375, + "learning_rate": 0.0001713666880960284, + "loss": 1.1181, + "step": 5885 + }, + { + "epoch": 0.32, + "grad_norm": 0.1669921875, + "learning_rate": 0.00017129968941457827, + "loss": 1.1281, + "step": 5890 + }, + { + "epoch": 0.32, + "grad_norm": 0.1669921875, + "learning_rate": 0.0001712326255703836, + "loss": 1.2166, + "step": 5895 + }, + { + "epoch": 0.32, + "grad_norm": 0.166015625, + "learning_rate": 0.00017116549662473594, + "loss": 1.1349, + "step": 5900 + }, + { + "epoch": 0.32, + "grad_norm": 0.154296875, + "learning_rate": 0.00017109830263898617, + "loss": 1.1106, + "step": 5905 + }, + { + "epoch": 0.32, + "grad_norm": 0.173828125, + "learning_rate": 0.00017103104367454476, + "loss": 1.1913, + "step": 5910 + }, + { + "epoch": 0.32, + "grad_norm": 0.1640625, + "learning_rate": 0.00017096371979288154, + "loss": 1.2241, + "step": 5915 + }, + { + "epoch": 0.32, + "grad_norm": 0.1572265625, + "learning_rate": 0.00017089633105552566, + "loss": 1.1808, + "step": 5920 + }, + { + "epoch": 0.32, + "grad_norm": 0.1650390625, + "learning_rate": 0.0001708288775240655, + "loss": 1.1804, + "step": 5925 + }, + { + "epoch": 0.32, + "grad_norm": 0.162109375, + "learning_rate": 0.00017076135926014874, + "loss": 1.1897, + "step": 5930 + }, + { + "epoch": 0.33, + "grad_norm": 0.162109375, + "learning_rate": 0.00017069377632548212, + "loss": 1.1233, + "step": 5935 + }, + { + "epoch": 0.33, + "grad_norm": 0.16015625, + "learning_rate": 0.0001706261287818316, + "loss": 1.2036, + "step": 5940 + }, + { + "epoch": 0.33, + "grad_norm": 0.1640625, + "learning_rate": 0.00017055841669102212, + "loss": 1.2187, + "step": 5945 + }, + { + "epoch": 0.33, + "grad_norm": 0.154296875, + "learning_rate": 0.0001704906401149376, + "loss": 1.1052, + "step": 5950 + }, + { + "epoch": 0.33, + "grad_norm": 0.162109375, + "learning_rate": 0.0001704227991155209, + "loss": 1.103, + "step": 5955 + }, + { + "epoch": 0.33, + "grad_norm": 0.1689453125, + "learning_rate": 0.00017035489375477383, + "loss": 1.224, + "step": 5960 + }, + { + "epoch": 0.33, + "grad_norm": 0.1572265625, + "learning_rate": 0.0001702869240947569, + "loss": 1.17, + "step": 5965 + }, + { + "epoch": 0.33, + "grad_norm": 0.162109375, + "learning_rate": 0.00017021889019758945, + "loss": 1.1957, + "step": 5970 + }, + { + "epoch": 0.33, + "grad_norm": 0.171875, + "learning_rate": 0.00017015079212544956, + "loss": 1.1352, + "step": 5975 + }, + { + "epoch": 0.33, + "grad_norm": 0.1611328125, + "learning_rate": 0.00017008262994057394, + "loss": 1.1536, + "step": 5980 + }, + { + "epoch": 0.33, + "grad_norm": 0.1630859375, + "learning_rate": 0.00017001440370525782, + "loss": 1.157, + "step": 5985 + }, + { + "epoch": 0.33, + "grad_norm": 0.162109375, + "learning_rate": 0.00016994611348185513, + "loss": 1.1929, + "step": 5990 + }, + { + "epoch": 0.33, + "grad_norm": 0.158203125, + "learning_rate": 0.0001698777593327781, + "loss": 1.1516, + "step": 5995 + }, + { + "epoch": 0.33, + "grad_norm": 0.162109375, + "learning_rate": 0.00016980934132049751, + "loss": 1.1501, + "step": 6000 + }, + { + "epoch": 0.33, + "grad_norm": 0.16796875, + "learning_rate": 0.00016974085950754238, + "loss": 1.1923, + "step": 6005 + }, + { + "epoch": 0.33, + "grad_norm": 0.1611328125, + "learning_rate": 0.00016967231395650028, + "loss": 1.2744, + "step": 6010 + }, + { + "epoch": 0.33, + "grad_norm": 0.1689453125, + "learning_rate": 0.0001696037047300167, + "loss": 1.1086, + "step": 6015 + }, + { + "epoch": 0.33, + "grad_norm": 0.1708984375, + "learning_rate": 0.00016953503189079557, + "loss": 1.2266, + "step": 6020 + }, + { + "epoch": 0.33, + "grad_norm": 0.1650390625, + "learning_rate": 0.0001694662955015989, + "loss": 1.1767, + "step": 6025 + }, + { + "epoch": 0.33, + "grad_norm": 0.1572265625, + "learning_rate": 0.00016939749562524674, + "loss": 1.2378, + "step": 6030 + }, + { + "epoch": 0.33, + "grad_norm": 0.1650390625, + "learning_rate": 0.00016932863232461718, + "loss": 1.1885, + "step": 6035 + }, + { + "epoch": 0.33, + "grad_norm": 0.1640625, + "learning_rate": 0.00016925970566264632, + "loss": 1.1633, + "step": 6040 + }, + { + "epoch": 0.33, + "grad_norm": 0.1650390625, + "learning_rate": 0.00016919071570232808, + "loss": 1.1965, + "step": 6045 + }, + { + "epoch": 0.33, + "grad_norm": 0.162109375, + "learning_rate": 0.0001691216625067143, + "loss": 1.1941, + "step": 6050 + }, + { + "epoch": 0.33, + "grad_norm": 0.162109375, + "learning_rate": 0.0001690525461389146, + "loss": 1.1419, + "step": 6055 + }, + { + "epoch": 0.33, + "grad_norm": 0.158203125, + "learning_rate": 0.00016898336666209635, + "loss": 1.1146, + "step": 6060 + }, + { + "epoch": 0.33, + "grad_norm": 0.1572265625, + "learning_rate": 0.00016891412413948454, + "loss": 1.2513, + "step": 6065 + }, + { + "epoch": 0.33, + "grad_norm": 0.1650390625, + "learning_rate": 0.00016884481863436182, + "loss": 1.1577, + "step": 6070 + }, + { + "epoch": 0.33, + "grad_norm": 0.1591796875, + "learning_rate": 0.00016877545021006839, + "loss": 1.1731, + "step": 6075 + }, + { + "epoch": 0.33, + "grad_norm": 0.166015625, + "learning_rate": 0.00016870601893000193, + "loss": 1.1596, + "step": 6080 + }, + { + "epoch": 0.33, + "grad_norm": 0.1640625, + "learning_rate": 0.00016863652485761766, + "loss": 1.2516, + "step": 6085 + }, + { + "epoch": 0.33, + "grad_norm": 0.1552734375, + "learning_rate": 0.00016856696805642808, + "loss": 1.1978, + "step": 6090 + }, + { + "epoch": 0.33, + "grad_norm": 0.1689453125, + "learning_rate": 0.0001684973485900031, + "loss": 1.1111, + "step": 6095 + }, + { + "epoch": 0.33, + "grad_norm": 0.1630859375, + "learning_rate": 0.00016842766652196977, + "loss": 1.1747, + "step": 6100 + }, + { + "epoch": 0.33, + "grad_norm": 0.1650390625, + "learning_rate": 0.00016835792191601255, + "loss": 1.1575, + "step": 6105 + }, + { + "epoch": 0.33, + "grad_norm": 0.1650390625, + "learning_rate": 0.0001682881148358729, + "loss": 1.1989, + "step": 6110 + }, + { + "epoch": 0.33, + "grad_norm": 0.158203125, + "learning_rate": 0.00016821824534534946, + "loss": 1.1933, + "step": 6115 + }, + { + "epoch": 0.34, + "grad_norm": 0.15234375, + "learning_rate": 0.00016814831350829782, + "loss": 1.1758, + "step": 6120 + }, + { + "epoch": 0.34, + "grad_norm": 0.1669921875, + "learning_rate": 0.00016807831938863067, + "loss": 1.2157, + "step": 6125 + }, + { + "epoch": 0.34, + "grad_norm": 0.1640625, + "learning_rate": 0.0001680082630503175, + "loss": 1.2277, + "step": 6130 + }, + { + "epoch": 0.34, + "grad_norm": 0.1689453125, + "learning_rate": 0.0001679381445573848, + "loss": 1.1707, + "step": 6135 + }, + { + "epoch": 0.34, + "grad_norm": 0.158203125, + "learning_rate": 0.00016786796397391568, + "loss": 1.1909, + "step": 6140 + }, + { + "epoch": 0.34, + "grad_norm": 0.166015625, + "learning_rate": 0.0001677977213640502, + "loss": 1.182, + "step": 6145 + }, + { + "epoch": 0.34, + "grad_norm": 0.16796875, + "learning_rate": 0.00016772741679198493, + "loss": 1.1349, + "step": 6150 + }, + { + "epoch": 0.34, + "grad_norm": 0.1640625, + "learning_rate": 0.00016765705032197324, + "loss": 1.1642, + "step": 6155 + }, + { + "epoch": 0.34, + "grad_norm": 0.1689453125, + "learning_rate": 0.00016758662201832487, + "loss": 1.1889, + "step": 6160 + }, + { + "epoch": 0.34, + "grad_norm": 0.1669921875, + "learning_rate": 0.0001675161319454063, + "loss": 1.2314, + "step": 6165 + }, + { + "epoch": 0.34, + "grad_norm": 0.1650390625, + "learning_rate": 0.00016744558016764028, + "loss": 1.1359, + "step": 6170 + }, + { + "epoch": 0.34, + "grad_norm": 0.1572265625, + "learning_rate": 0.00016737496674950597, + "loss": 1.0452, + "step": 6175 + }, + { + "epoch": 0.34, + "grad_norm": 0.162109375, + "learning_rate": 0.000167304291755539, + "loss": 1.2646, + "step": 6180 + }, + { + "epoch": 0.34, + "grad_norm": 0.171875, + "learning_rate": 0.00016723355525033115, + "loss": 1.2203, + "step": 6185 + }, + { + "epoch": 0.34, + "grad_norm": 0.1669921875, + "learning_rate": 0.00016716275729853047, + "loss": 1.1943, + "step": 6190 + }, + { + "epoch": 0.34, + "grad_norm": 0.1630859375, + "learning_rate": 0.00016709189796484112, + "loss": 1.1427, + "step": 6195 + }, + { + "epoch": 0.34, + "grad_norm": 0.1552734375, + "learning_rate": 0.00016702097731402345, + "loss": 1.2121, + "step": 6200 + }, + { + "epoch": 0.34, + "grad_norm": 0.1650390625, + "learning_rate": 0.0001669499954108937, + "loss": 1.1631, + "step": 6205 + }, + { + "epoch": 0.34, + "grad_norm": 0.1591796875, + "learning_rate": 0.00016687895232032424, + "loss": 1.1884, + "step": 6210 + }, + { + "epoch": 0.34, + "grad_norm": 0.158203125, + "learning_rate": 0.00016680784810724333, + "loss": 1.1243, + "step": 6215 + }, + { + "epoch": 0.34, + "grad_norm": 0.16796875, + "learning_rate": 0.000166736682836635, + "loss": 1.2107, + "step": 6220 + }, + { + "epoch": 0.34, + "grad_norm": 0.1552734375, + "learning_rate": 0.0001666654565735392, + "loss": 1.1739, + "step": 6225 + }, + { + "epoch": 0.34, + "grad_norm": 0.162109375, + "learning_rate": 0.00016659416938305155, + "loss": 1.147, + "step": 6230 + }, + { + "epoch": 0.34, + "grad_norm": 0.16015625, + "learning_rate": 0.00016652282133032334, + "loss": 1.144, + "step": 6235 + }, + { + "epoch": 0.34, + "grad_norm": 0.1572265625, + "learning_rate": 0.00016645141248056153, + "loss": 1.171, + "step": 6240 + }, + { + "epoch": 0.34, + "grad_norm": 0.166015625, + "learning_rate": 0.00016637994289902866, + "loss": 1.1352, + "step": 6245 + }, + { + "epoch": 0.34, + "grad_norm": 0.1640625, + "learning_rate": 0.0001663084126510427, + "loss": 1.1661, + "step": 6250 + }, + { + "epoch": 0.34, + "grad_norm": 0.1640625, + "learning_rate": 0.00016623682180197716, + "loss": 1.1711, + "step": 6255 + }, + { + "epoch": 0.34, + "grad_norm": 0.1591796875, + "learning_rate": 0.00016616517041726083, + "loss": 1.1735, + "step": 6260 + }, + { + "epoch": 0.34, + "grad_norm": 0.1689453125, + "learning_rate": 0.00016609345856237793, + "loss": 1.1531, + "step": 6265 + }, + { + "epoch": 0.34, + "grad_norm": 0.173828125, + "learning_rate": 0.00016602168630286784, + "loss": 1.1722, + "step": 6270 + }, + { + "epoch": 0.34, + "grad_norm": 0.1640625, + "learning_rate": 0.00016594985370432523, + "loss": 1.1805, + "step": 6275 + }, + { + "epoch": 0.34, + "grad_norm": 0.1611328125, + "learning_rate": 0.00016587796083239987, + "loss": 1.2066, + "step": 6280 + }, + { + "epoch": 0.34, + "grad_norm": 0.169921875, + "learning_rate": 0.00016580600775279667, + "loss": 1.1717, + "step": 6285 + }, + { + "epoch": 0.34, + "grad_norm": 0.162109375, + "learning_rate": 0.00016573399453127547, + "loss": 1.1868, + "step": 6290 + }, + { + "epoch": 0.34, + "grad_norm": 0.1611328125, + "learning_rate": 0.00016566192123365116, + "loss": 1.1424, + "step": 6295 + }, + { + "epoch": 0.35, + "grad_norm": 0.1591796875, + "learning_rate": 0.00016558978792579352, + "loss": 1.1829, + "step": 6300 + }, + { + "epoch": 0.35, + "grad_norm": 0.171875, + "learning_rate": 0.00016551759467362714, + "loss": 1.1477, + "step": 6305 + }, + { + "epoch": 0.35, + "grad_norm": 0.1650390625, + "learning_rate": 0.0001654453415431314, + "loss": 1.1573, + "step": 6310 + }, + { + "epoch": 0.35, + "grad_norm": 0.162109375, + "learning_rate": 0.00016537302860034048, + "loss": 1.1836, + "step": 6315 + }, + { + "epoch": 0.35, + "grad_norm": 0.1611328125, + "learning_rate": 0.0001653006559113431, + "loss": 1.2053, + "step": 6320 + }, + { + "epoch": 0.35, + "grad_norm": 0.162109375, + "learning_rate": 0.00016522822354228268, + "loss": 1.2679, + "step": 6325 + }, + { + "epoch": 0.35, + "grad_norm": 0.162109375, + "learning_rate": 0.00016515573155935712, + "loss": 1.1288, + "step": 6330 + }, + { + "epoch": 0.35, + "grad_norm": 0.1640625, + "learning_rate": 0.00016508318002881892, + "loss": 1.1524, + "step": 6335 + }, + { + "epoch": 0.35, + "grad_norm": 0.1630859375, + "learning_rate": 0.0001650105690169748, + "loss": 1.1965, + "step": 6340 + }, + { + "epoch": 0.35, + "grad_norm": 0.1630859375, + "learning_rate": 0.00016493789859018605, + "loss": 1.1372, + "step": 6345 + }, + { + "epoch": 0.35, + "grad_norm": 0.1630859375, + "learning_rate": 0.00016486516881486813, + "loss": 1.1815, + "step": 6350 + }, + { + "epoch": 0.35, + "grad_norm": 0.1669921875, + "learning_rate": 0.00016479237975749078, + "loss": 1.2207, + "step": 6355 + }, + { + "epoch": 0.35, + "grad_norm": 0.15625, + "learning_rate": 0.0001647195314845779, + "loss": 1.0963, + "step": 6360 + }, + { + "epoch": 0.35, + "grad_norm": 0.1591796875, + "learning_rate": 0.00016464662406270758, + "loss": 1.0869, + "step": 6365 + }, + { + "epoch": 0.35, + "grad_norm": 0.16796875, + "learning_rate": 0.0001645736575585119, + "loss": 1.1557, + "step": 6370 + }, + { + "epoch": 0.35, + "grad_norm": 0.16796875, + "learning_rate": 0.0001645006320386769, + "loss": 1.248, + "step": 6375 + }, + { + "epoch": 0.35, + "grad_norm": 0.1572265625, + "learning_rate": 0.00016442754756994265, + "loss": 1.1437, + "step": 6380 + }, + { + "epoch": 0.35, + "grad_norm": 0.16015625, + "learning_rate": 0.00016435440421910306, + "loss": 1.1352, + "step": 6385 + }, + { + "epoch": 0.35, + "grad_norm": 0.162109375, + "learning_rate": 0.00016428120205300585, + "loss": 1.1734, + "step": 6390 + }, + { + "epoch": 0.35, + "grad_norm": 0.1669921875, + "learning_rate": 0.00016420794113855243, + "loss": 1.2053, + "step": 6395 + }, + { + "epoch": 0.35, + "grad_norm": 0.16015625, + "learning_rate": 0.00016413462154269808, + "loss": 1.1419, + "step": 6400 + }, + { + "epoch": 0.35, + "grad_norm": 0.1806640625, + "learning_rate": 0.00016406124333245145, + "loss": 1.1326, + "step": 6405 + }, + { + "epoch": 0.35, + "grad_norm": 0.1689453125, + "learning_rate": 0.00016398780657487505, + "loss": 1.1899, + "step": 6410 + }, + { + "epoch": 0.35, + "grad_norm": 0.16796875, + "learning_rate": 0.00016391431133708462, + "loss": 1.1499, + "step": 6415 + }, + { + "epoch": 0.35, + "grad_norm": 0.158203125, + "learning_rate": 0.00016384075768624955, + "loss": 1.1841, + "step": 6420 + }, + { + "epoch": 0.35, + "grad_norm": 0.1611328125, + "learning_rate": 0.0001637671456895925, + "loss": 1.0846, + "step": 6425 + }, + { + "epoch": 0.35, + "grad_norm": 0.1669921875, + "learning_rate": 0.00016369347541438954, + "loss": 1.1874, + "step": 6430 + }, + { + "epoch": 0.35, + "grad_norm": 0.1630859375, + "learning_rate": 0.0001636197469279699, + "loss": 1.1702, + "step": 6435 + }, + { + "epoch": 0.35, + "grad_norm": 0.1591796875, + "learning_rate": 0.00016354596029771615, + "loss": 1.2064, + "step": 6440 + }, + { + "epoch": 0.35, + "grad_norm": 0.1650390625, + "learning_rate": 0.00016347211559106383, + "loss": 1.1969, + "step": 6445 + }, + { + "epoch": 0.35, + "grad_norm": 0.16015625, + "learning_rate": 0.00016339821287550171, + "loss": 1.1114, + "step": 6450 + }, + { + "epoch": 0.35, + "grad_norm": 0.16796875, + "learning_rate": 0.00016332425221857148, + "loss": 1.1589, + "step": 6455 + }, + { + "epoch": 0.35, + "grad_norm": 0.181640625, + "learning_rate": 0.00016325023368786784, + "loss": 1.1911, + "step": 6460 + }, + { + "epoch": 0.35, + "grad_norm": 0.1640625, + "learning_rate": 0.00016317615735103838, + "loss": 1.1119, + "step": 6465 + }, + { + "epoch": 0.35, + "grad_norm": 0.1650390625, + "learning_rate": 0.00016310202327578338, + "loss": 1.2206, + "step": 6470 + }, + { + "epoch": 0.35, + "grad_norm": 0.1669921875, + "learning_rate": 0.00016302783152985614, + "loss": 1.22, + "step": 6475 + }, + { + "epoch": 0.35, + "grad_norm": 0.1630859375, + "learning_rate": 0.00016295358218106247, + "loss": 1.1452, + "step": 6480 + }, + { + "epoch": 0.36, + "grad_norm": 0.1728515625, + "learning_rate": 0.0001628792752972609, + "loss": 1.2454, + "step": 6485 + }, + { + "epoch": 0.36, + "grad_norm": 0.162109375, + "learning_rate": 0.00016280491094636257, + "loss": 1.1665, + "step": 6490 + }, + { + "epoch": 0.36, + "grad_norm": 0.1689453125, + "learning_rate": 0.00016273048919633105, + "loss": 1.1854, + "step": 6495 + }, + { + "epoch": 0.36, + "grad_norm": 0.1630859375, + "learning_rate": 0.0001626560101151824, + "loss": 1.1945, + "step": 6500 + }, + { + "epoch": 0.36, + "grad_norm": 0.16796875, + "learning_rate": 0.0001625814737709852, + "loss": 1.172, + "step": 6505 + }, + { + "epoch": 0.36, + "grad_norm": 0.16015625, + "learning_rate": 0.00016250688023186017, + "loss": 1.2228, + "step": 6510 + }, + { + "epoch": 0.36, + "grad_norm": 0.162109375, + "learning_rate": 0.0001624322295659804, + "loss": 1.2405, + "step": 6515 + }, + { + "epoch": 0.36, + "grad_norm": 0.166015625, + "learning_rate": 0.00016235752184157125, + "loss": 1.1174, + "step": 6520 + }, + { + "epoch": 0.36, + "grad_norm": 0.1669921875, + "learning_rate": 0.0001622827571269101, + "loss": 1.1757, + "step": 6525 + }, + { + "epoch": 0.36, + "grad_norm": 0.1640625, + "learning_rate": 0.0001622079354903265, + "loss": 1.1761, + "step": 6530 + }, + { + "epoch": 0.36, + "grad_norm": 0.1708984375, + "learning_rate": 0.000162133057000202, + "loss": 1.1007, + "step": 6535 + }, + { + "epoch": 0.36, + "grad_norm": 0.1630859375, + "learning_rate": 0.0001620581217249701, + "loss": 1.1801, + "step": 6540 + }, + { + "epoch": 0.36, + "grad_norm": 0.166015625, + "learning_rate": 0.0001619831297331162, + "loss": 1.1957, + "step": 6545 + }, + { + "epoch": 0.36, + "grad_norm": 0.158203125, + "learning_rate": 0.00016190808109317755, + "loss": 1.1597, + "step": 6550 + }, + { + "epoch": 0.36, + "grad_norm": 0.162109375, + "learning_rate": 0.0001618329758737432, + "loss": 1.1873, + "step": 6555 + }, + { + "epoch": 0.36, + "grad_norm": 0.169921875, + "learning_rate": 0.0001617578141434538, + "loss": 1.1642, + "step": 6560 + }, + { + "epoch": 0.36, + "grad_norm": 0.1591796875, + "learning_rate": 0.00016168259597100178, + "loss": 1.1846, + "step": 6565 + }, + { + "epoch": 0.36, + "grad_norm": 0.1640625, + "learning_rate": 0.00016160732142513106, + "loss": 1.1687, + "step": 6570 + }, + { + "epoch": 0.36, + "grad_norm": 0.1611328125, + "learning_rate": 0.00016153199057463712, + "loss": 1.123, + "step": 6575 + }, + { + "epoch": 0.36, + "grad_norm": 0.1708984375, + "learning_rate": 0.00016145660348836693, + "loss": 1.1444, + "step": 6580 + }, + { + "epoch": 0.36, + "grad_norm": 0.1611328125, + "learning_rate": 0.00016138116023521875, + "loss": 1.1467, + "step": 6585 + }, + { + "epoch": 0.36, + "grad_norm": 0.1611328125, + "learning_rate": 0.0001613056608841423, + "loss": 1.1102, + "step": 6590 + }, + { + "epoch": 0.36, + "grad_norm": 0.169921875, + "learning_rate": 0.0001612301055041385, + "loss": 1.1791, + "step": 6595 + }, + { + "epoch": 0.36, + "grad_norm": 0.1748046875, + "learning_rate": 0.00016115449416425945, + "loss": 1.2453, + "step": 6600 + }, + { + "epoch": 0.36, + "grad_norm": 0.162109375, + "learning_rate": 0.00016107882693360844, + "loss": 1.1349, + "step": 6605 + }, + { + "epoch": 0.36, + "grad_norm": 0.166015625, + "learning_rate": 0.0001610031038813399, + "loss": 1.0905, + "step": 6610 + }, + { + "epoch": 0.36, + "grad_norm": 0.1640625, + "learning_rate": 0.00016092732507665914, + "loss": 1.1994, + "step": 6615 + }, + { + "epoch": 0.36, + "grad_norm": 0.16015625, + "learning_rate": 0.0001608514905888225, + "loss": 1.1689, + "step": 6620 + }, + { + "epoch": 0.36, + "grad_norm": 0.1611328125, + "learning_rate": 0.0001607756004871372, + "loss": 1.1518, + "step": 6625 + }, + { + "epoch": 0.36, + "grad_norm": 0.1708984375, + "learning_rate": 0.0001606996548409613, + "loss": 1.1862, + "step": 6630 + }, + { + "epoch": 0.36, + "grad_norm": 0.166015625, + "learning_rate": 0.00016062365371970362, + "loss": 1.2025, + "step": 6635 + }, + { + "epoch": 0.36, + "grad_norm": 0.1611328125, + "learning_rate": 0.00016054759719282365, + "loss": 1.228, + "step": 6640 + }, + { + "epoch": 0.36, + "grad_norm": 0.1591796875, + "learning_rate": 0.00016047148532983156, + "loss": 1.1946, + "step": 6645 + }, + { + "epoch": 0.36, + "grad_norm": 0.162109375, + "learning_rate": 0.00016039531820028806, + "loss": 1.1171, + "step": 6650 + }, + { + "epoch": 0.36, + "grad_norm": 0.1572265625, + "learning_rate": 0.0001603190958738044, + "loss": 1.1555, + "step": 6655 + }, + { + "epoch": 0.36, + "grad_norm": 0.162109375, + "learning_rate": 0.00016024281842004221, + "loss": 1.1259, + "step": 6660 + }, + { + "epoch": 0.37, + "grad_norm": 0.15234375, + "learning_rate": 0.0001601664859087136, + "loss": 1.1639, + "step": 6665 + }, + { + "epoch": 0.37, + "grad_norm": 0.1630859375, + "learning_rate": 0.0001600900984095809, + "loss": 1.1768, + "step": 6670 + }, + { + "epoch": 0.37, + "grad_norm": 0.205078125, + "learning_rate": 0.0001600136559924568, + "loss": 1.1152, + "step": 6675 + }, + { + "epoch": 0.37, + "grad_norm": 0.1640625, + "learning_rate": 0.0001599371587272041, + "loss": 1.2518, + "step": 6680 + }, + { + "epoch": 0.37, + "grad_norm": 0.439453125, + "learning_rate": 0.00015986060668373575, + "loss": 1.2053, + "step": 6685 + }, + { + "epoch": 0.37, + "grad_norm": 0.1650390625, + "learning_rate": 0.00015978399993201475, + "loss": 1.1711, + "step": 6690 + }, + { + "epoch": 0.37, + "grad_norm": 0.171875, + "learning_rate": 0.00015970733854205413, + "loss": 1.2553, + "step": 6695 + }, + { + "epoch": 0.37, + "grad_norm": 0.169921875, + "learning_rate": 0.00015963062258391686, + "loss": 1.1595, + "step": 6700 + }, + { + "epoch": 0.37, + "grad_norm": 0.1640625, + "learning_rate": 0.00015955385212771572, + "loss": 1.2224, + "step": 6705 + }, + { + "epoch": 0.37, + "grad_norm": 0.169921875, + "learning_rate": 0.00015947702724361336, + "loss": 1.1493, + "step": 6710 + }, + { + "epoch": 0.37, + "grad_norm": 0.169921875, + "learning_rate": 0.00015940014800182217, + "loss": 1.1706, + "step": 6715 + }, + { + "epoch": 0.37, + "grad_norm": 0.1689453125, + "learning_rate": 0.00015932321447260418, + "loss": 1.1524, + "step": 6720 + }, + { + "epoch": 0.37, + "grad_norm": 0.1689453125, + "learning_rate": 0.00015924622672627104, + "loss": 1.1213, + "step": 6725 + }, + { + "epoch": 0.37, + "grad_norm": 0.16796875, + "learning_rate": 0.000159169184833184, + "loss": 1.1586, + "step": 6730 + }, + { + "epoch": 0.37, + "grad_norm": 0.16015625, + "learning_rate": 0.00015909208886375376, + "loss": 1.0952, + "step": 6735 + }, + { + "epoch": 0.37, + "grad_norm": 0.1630859375, + "learning_rate": 0.00015901493888844037, + "loss": 1.1649, + "step": 6740 + }, + { + "epoch": 0.37, + "grad_norm": 0.16796875, + "learning_rate": 0.00015893773497775344, + "loss": 1.0904, + "step": 6745 + }, + { + "epoch": 0.37, + "grad_norm": 0.16796875, + "learning_rate": 0.00015886047720225163, + "loss": 1.1368, + "step": 6750 + }, + { + "epoch": 0.37, + "grad_norm": 0.1640625, + "learning_rate": 0.00015878316563254301, + "loss": 1.1242, + "step": 6755 + }, + { + "epoch": 0.37, + "grad_norm": 0.169921875, + "learning_rate": 0.0001587058003392847, + "loss": 1.137, + "step": 6760 + }, + { + "epoch": 0.37, + "grad_norm": 0.16796875, + "learning_rate": 0.00015862838139318303, + "loss": 1.2024, + "step": 6765 + }, + { + "epoch": 0.37, + "grad_norm": 0.158203125, + "learning_rate": 0.00015855090886499325, + "loss": 1.2072, + "step": 6770 + }, + { + "epoch": 0.37, + "grad_norm": 0.166015625, + "learning_rate": 0.00015847338282551964, + "loss": 1.2086, + "step": 6775 + }, + { + "epoch": 0.37, + "grad_norm": 0.1630859375, + "learning_rate": 0.0001583958033456154, + "loss": 1.0903, + "step": 6780 + }, + { + "epoch": 0.37, + "grad_norm": 0.166015625, + "learning_rate": 0.0001583181704961825, + "loss": 1.2337, + "step": 6785 + }, + { + "epoch": 0.37, + "grad_norm": 0.171875, + "learning_rate": 0.0001582404843481718, + "loss": 1.2071, + "step": 6790 + }, + { + "epoch": 0.37, + "grad_norm": 0.1669921875, + "learning_rate": 0.0001581627449725827, + "loss": 1.1655, + "step": 6795 + }, + { + "epoch": 0.37, + "grad_norm": 0.1630859375, + "learning_rate": 0.00015808495244046345, + "loss": 1.1426, + "step": 6800 + }, + { + "epoch": 0.37, + "grad_norm": 0.1630859375, + "learning_rate": 0.00015800710682291078, + "loss": 1.1939, + "step": 6805 + }, + { + "epoch": 0.37, + "grad_norm": 0.169921875, + "learning_rate": 0.0001579292081910699, + "loss": 1.1837, + "step": 6810 + }, + { + "epoch": 0.37, + "grad_norm": 0.1650390625, + "learning_rate": 0.0001578512566161345, + "loss": 1.2064, + "step": 6815 + }, + { + "epoch": 0.37, + "grad_norm": 0.1640625, + "learning_rate": 0.00015777325216934668, + "loss": 1.1863, + "step": 6820 + }, + { + "epoch": 0.37, + "grad_norm": 0.1650390625, + "learning_rate": 0.00015769519492199688, + "loss": 1.099, + "step": 6825 + }, + { + "epoch": 0.37, + "grad_norm": 0.169921875, + "learning_rate": 0.00015761708494542372, + "loss": 1.1181, + "step": 6830 + }, + { + "epoch": 0.37, + "grad_norm": 0.162109375, + "learning_rate": 0.00015753892231101403, + "loss": 1.1693, + "step": 6835 + }, + { + "epoch": 0.37, + "grad_norm": 0.1689453125, + "learning_rate": 0.00015746070709020285, + "loss": 1.1695, + "step": 6840 + }, + { + "epoch": 0.37, + "grad_norm": 0.1689453125, + "learning_rate": 0.00015738243935447318, + "loss": 1.0942, + "step": 6845 + }, + { + "epoch": 0.38, + "grad_norm": 0.1650390625, + "learning_rate": 0.0001573041191753561, + "loss": 1.1937, + "step": 6850 + }, + { + "epoch": 0.38, + "grad_norm": 0.1611328125, + "learning_rate": 0.0001572257466244305, + "loss": 1.1738, + "step": 6855 + }, + { + "epoch": 0.38, + "grad_norm": 0.15625, + "learning_rate": 0.00015714732177332324, + "loss": 1.189, + "step": 6860 + }, + { + "epoch": 0.38, + "grad_norm": 0.169921875, + "learning_rate": 0.00015706884469370898, + "loss": 1.1427, + "step": 6865 + }, + { + "epoch": 0.38, + "grad_norm": 0.1689453125, + "learning_rate": 0.00015699031545731004, + "loss": 1.1307, + "step": 6870 + }, + { + "epoch": 0.38, + "grad_norm": 0.166015625, + "learning_rate": 0.00015691173413589647, + "loss": 1.1661, + "step": 6875 + }, + { + "epoch": 0.38, + "grad_norm": 0.1689453125, + "learning_rate": 0.00015683310080128585, + "loss": 1.1778, + "step": 6880 + }, + { + "epoch": 0.38, + "grad_norm": 0.169921875, + "learning_rate": 0.0001567544155253434, + "loss": 1.2428, + "step": 6885 + }, + { + "epoch": 0.38, + "grad_norm": 0.1982421875, + "learning_rate": 0.00015667567837998175, + "loss": 1.1149, + "step": 6890 + }, + { + "epoch": 0.38, + "grad_norm": 0.16796875, + "learning_rate": 0.00015659688943716086, + "loss": 1.2712, + "step": 6895 + }, + { + "epoch": 0.38, + "grad_norm": 0.1728515625, + "learning_rate": 0.0001565180487688882, + "loss": 1.1571, + "step": 6900 + }, + { + "epoch": 0.38, + "grad_norm": 0.1669921875, + "learning_rate": 0.00015643915644721843, + "loss": 1.2464, + "step": 6905 + }, + { + "epoch": 0.38, + "grad_norm": 0.166015625, + "learning_rate": 0.00015636021254425333, + "loss": 1.1519, + "step": 6910 + }, + { + "epoch": 0.38, + "grad_norm": 0.1650390625, + "learning_rate": 0.000156281217132142, + "loss": 1.1258, + "step": 6915 + }, + { + "epoch": 0.38, + "grad_norm": 0.166015625, + "learning_rate": 0.0001562021702830804, + "loss": 1.154, + "step": 6920 + }, + { + "epoch": 0.38, + "grad_norm": 0.16015625, + "learning_rate": 0.00015612307206931168, + "loss": 1.1526, + "step": 6925 + }, + { + "epoch": 0.38, + "grad_norm": 0.1630859375, + "learning_rate": 0.0001560439225631259, + "loss": 1.1815, + "step": 6930 + }, + { + "epoch": 0.38, + "grad_norm": 0.16796875, + "learning_rate": 0.00015596472183685992, + "loss": 1.1635, + "step": 6935 + }, + { + "epoch": 0.38, + "grad_norm": 0.1640625, + "learning_rate": 0.00015588546996289749, + "loss": 1.237, + "step": 6940 + }, + { + "epoch": 0.38, + "grad_norm": 0.177734375, + "learning_rate": 0.00015580616701366904, + "loss": 1.1708, + "step": 6945 + }, + { + "epoch": 0.38, + "grad_norm": 0.1689453125, + "learning_rate": 0.00015572681306165174, + "loss": 1.1548, + "step": 6950 + }, + { + "epoch": 0.38, + "grad_norm": 0.169921875, + "learning_rate": 0.00015564740817936934, + "loss": 1.191, + "step": 6955 + }, + { + "epoch": 0.38, + "grad_norm": 0.169921875, + "learning_rate": 0.00015556795243939212, + "loss": 1.1753, + "step": 6960 + }, + { + "epoch": 0.38, + "grad_norm": 0.16015625, + "learning_rate": 0.0001554884459143369, + "loss": 1.1277, + "step": 6965 + }, + { + "epoch": 0.38, + "grad_norm": 0.16015625, + "learning_rate": 0.0001554088886768669, + "loss": 1.1678, + "step": 6970 + }, + { + "epoch": 0.38, + "grad_norm": 0.1689453125, + "learning_rate": 0.00015532928079969158, + "loss": 1.2238, + "step": 6975 + }, + { + "epoch": 0.38, + "grad_norm": 0.166015625, + "learning_rate": 0.00015524962235556684, + "loss": 1.2124, + "step": 6980 + }, + { + "epoch": 0.38, + "grad_norm": 0.1611328125, + "learning_rate": 0.0001551699134172947, + "loss": 1.1457, + "step": 6985 + }, + { + "epoch": 0.38, + "grad_norm": 0.162109375, + "learning_rate": 0.0001550901540577233, + "loss": 1.1325, + "step": 6990 + }, + { + "epoch": 0.38, + "grad_norm": 0.169921875, + "learning_rate": 0.00015501034434974695, + "loss": 1.1362, + "step": 6995 + }, + { + "epoch": 0.38, + "grad_norm": 0.1650390625, + "learning_rate": 0.00015493048436630597, + "loss": 1.1841, + "step": 7000 + }, + { + "epoch": 0.38, + "grad_norm": 0.1669921875, + "learning_rate": 0.0001548505741803865, + "loss": 1.1923, + "step": 7005 + }, + { + "epoch": 0.38, + "grad_norm": 0.158203125, + "learning_rate": 0.00015477061386502078, + "loss": 1.2032, + "step": 7010 + }, + { + "epoch": 0.38, + "grad_norm": 0.1806640625, + "learning_rate": 0.00015469060349328665, + "loss": 1.1615, + "step": 7015 + }, + { + "epoch": 0.38, + "grad_norm": 0.1640625, + "learning_rate": 0.00015461054313830778, + "loss": 1.1173, + "step": 7020 + }, + { + "epoch": 0.38, + "grad_norm": 0.166015625, + "learning_rate": 0.00015453043287325357, + "loss": 1.1551, + "step": 7025 + }, + { + "epoch": 0.39, + "grad_norm": 0.171875, + "learning_rate": 0.00015445027277133898, + "loss": 1.1771, + "step": 7030 + }, + { + "epoch": 0.39, + "grad_norm": 0.158203125, + "learning_rate": 0.00015437006290582448, + "loss": 1.1434, + "step": 7035 + }, + { + "epoch": 0.39, + "grad_norm": 0.1650390625, + "learning_rate": 0.00015428980335001618, + "loss": 1.1717, + "step": 7040 + }, + { + "epoch": 0.39, + "grad_norm": 0.16796875, + "learning_rate": 0.00015420949417726537, + "loss": 1.2549, + "step": 7045 + }, + { + "epoch": 0.39, + "grad_norm": 0.1650390625, + "learning_rate": 0.00015412913546096889, + "loss": 1.2945, + "step": 7050 + }, + { + "epoch": 0.39, + "grad_norm": 0.1689453125, + "learning_rate": 0.00015404872727456875, + "loss": 1.1672, + "step": 7055 + }, + { + "epoch": 0.39, + "grad_norm": 0.169921875, + "learning_rate": 0.00015396826969155217, + "loss": 1.2175, + "step": 7060 + }, + { + "epoch": 0.39, + "grad_norm": 0.17578125, + "learning_rate": 0.00015388776278545162, + "loss": 1.1626, + "step": 7065 + }, + { + "epoch": 0.39, + "grad_norm": 0.166015625, + "learning_rate": 0.0001538072066298445, + "loss": 1.1752, + "step": 7070 + }, + { + "epoch": 0.39, + "grad_norm": 0.1630859375, + "learning_rate": 0.00015372660129835337, + "loss": 1.1765, + "step": 7075 + }, + { + "epoch": 0.39, + "grad_norm": 0.1591796875, + "learning_rate": 0.00015364594686464558, + "loss": 1.1739, + "step": 7080 + }, + { + "epoch": 0.39, + "grad_norm": 0.16796875, + "learning_rate": 0.00015356524340243345, + "loss": 1.1457, + "step": 7085 + }, + { + "epoch": 0.39, + "grad_norm": 0.169921875, + "learning_rate": 0.0001534844909854741, + "loss": 1.1647, + "step": 7090 + }, + { + "epoch": 0.39, + "grad_norm": 0.16015625, + "learning_rate": 0.00015340368968756938, + "loss": 1.1566, + "step": 7095 + }, + { + "epoch": 0.39, + "grad_norm": 0.1630859375, + "learning_rate": 0.00015332283958256583, + "loss": 1.233, + "step": 7100 + }, + { + "epoch": 0.39, + "grad_norm": 0.16796875, + "learning_rate": 0.00015324194074435457, + "loss": 1.1282, + "step": 7105 + }, + { + "epoch": 0.39, + "grad_norm": 0.1611328125, + "learning_rate": 0.0001531609932468713, + "loss": 1.1809, + "step": 7110 + }, + { + "epoch": 0.39, + "grad_norm": 0.1669921875, + "learning_rate": 0.00015307999716409612, + "loss": 1.1903, + "step": 7115 + }, + { + "epoch": 0.39, + "grad_norm": 0.16796875, + "learning_rate": 0.00015299895257005357, + "loss": 1.2071, + "step": 7120 + }, + { + "epoch": 0.39, + "grad_norm": 0.166015625, + "learning_rate": 0.00015291785953881255, + "loss": 1.0904, + "step": 7125 + }, + { + "epoch": 0.39, + "grad_norm": 0.1572265625, + "learning_rate": 0.0001528367181444862, + "loss": 1.183, + "step": 7130 + }, + { + "epoch": 0.39, + "grad_norm": 0.169921875, + "learning_rate": 0.0001527555284612319, + "loss": 1.1049, + "step": 7135 + }, + { + "epoch": 0.39, + "grad_norm": 0.15625, + "learning_rate": 0.00015267429056325108, + "loss": 1.0839, + "step": 7140 + }, + { + "epoch": 0.39, + "grad_norm": 0.16015625, + "learning_rate": 0.00015259300452478934, + "loss": 1.1634, + "step": 7145 + }, + { + "epoch": 0.39, + "grad_norm": 0.1611328125, + "learning_rate": 0.00015251167042013619, + "loss": 1.1184, + "step": 7150 + }, + { + "epoch": 0.39, + "grad_norm": 0.1591796875, + "learning_rate": 0.0001524302883236251, + "loss": 1.1734, + "step": 7155 + }, + { + "epoch": 0.39, + "grad_norm": 0.1728515625, + "learning_rate": 0.00015234885830963342, + "loss": 1.1759, + "step": 7160 + }, + { + "epoch": 0.39, + "grad_norm": 0.162109375, + "learning_rate": 0.00015226738045258226, + "loss": 1.1699, + "step": 7165 + }, + { + "epoch": 0.39, + "grad_norm": 0.1630859375, + "learning_rate": 0.0001521858548269365, + "loss": 1.1255, + "step": 7170 + }, + { + "epoch": 0.39, + "grad_norm": 0.1630859375, + "learning_rate": 0.0001521042815072046, + "loss": 1.2116, + "step": 7175 + }, + { + "epoch": 0.39, + "grad_norm": 0.169921875, + "learning_rate": 0.00015202266056793873, + "loss": 1.2304, + "step": 7180 + }, + { + "epoch": 0.39, + "grad_norm": 0.1611328125, + "learning_rate": 0.00015194099208373443, + "loss": 1.1679, + "step": 7185 + }, + { + "epoch": 0.39, + "grad_norm": 0.16796875, + "learning_rate": 0.0001518592761292308, + "loss": 1.1427, + "step": 7190 + }, + { + "epoch": 0.39, + "grad_norm": 0.166015625, + "learning_rate": 0.0001517775127791103, + "loss": 1.1441, + "step": 7195 + }, + { + "epoch": 0.39, + "grad_norm": 0.166015625, + "learning_rate": 0.00015169570210809874, + "loss": 1.1659, + "step": 7200 + }, + { + "epoch": 0.39, + "grad_norm": 0.166015625, + "learning_rate": 0.00015161384419096506, + "loss": 1.2178, + "step": 7205 + }, + { + "epoch": 0.39, + "grad_norm": 0.1669921875, + "learning_rate": 0.00015153193910252152, + "loss": 1.1661, + "step": 7210 + }, + { + "epoch": 0.4, + "grad_norm": 0.16015625, + "learning_rate": 0.0001514499869176234, + "loss": 1.1505, + "step": 7215 + }, + { + "epoch": 0.4, + "grad_norm": 0.169921875, + "learning_rate": 0.00015136798771116908, + "loss": 1.1764, + "step": 7220 + }, + { + "epoch": 0.4, + "grad_norm": 0.16796875, + "learning_rate": 0.00015128594155809988, + "loss": 1.1145, + "step": 7225 + }, + { + "epoch": 0.4, + "grad_norm": 0.171875, + "learning_rate": 0.00015120384853340002, + "loss": 1.1814, + "step": 7230 + }, + { + "epoch": 0.4, + "grad_norm": 0.16796875, + "learning_rate": 0.00015112170871209654, + "loss": 1.1457, + "step": 7235 + }, + { + "epoch": 0.4, + "grad_norm": 0.171875, + "learning_rate": 0.00015103952216925933, + "loss": 1.2408, + "step": 7240 + }, + { + "epoch": 0.4, + "grad_norm": 0.1640625, + "learning_rate": 0.0001509572889800009, + "loss": 1.11, + "step": 7245 + }, + { + "epoch": 0.4, + "grad_norm": 0.1572265625, + "learning_rate": 0.00015087500921947647, + "loss": 1.1904, + "step": 7250 + }, + { + "epoch": 0.4, + "grad_norm": 0.1591796875, + "learning_rate": 0.0001507926829628837, + "loss": 1.1597, + "step": 7255 + }, + { + "epoch": 0.4, + "grad_norm": 0.1728515625, + "learning_rate": 0.00015071031028546288, + "loss": 1.1375, + "step": 7260 + }, + { + "epoch": 0.4, + "grad_norm": 0.1728515625, + "learning_rate": 0.00015062789126249658, + "loss": 1.2153, + "step": 7265 + }, + { + "epoch": 0.4, + "grad_norm": 0.166015625, + "learning_rate": 0.0001505454259693099, + "loss": 1.2355, + "step": 7270 + }, + { + "epoch": 0.4, + "grad_norm": 0.1640625, + "learning_rate": 0.0001504629144812701, + "loss": 1.1851, + "step": 7275 + }, + { + "epoch": 0.4, + "grad_norm": 0.16796875, + "learning_rate": 0.0001503803568737867, + "loss": 1.1688, + "step": 7280 + }, + { + "epoch": 0.4, + "grad_norm": 0.1611328125, + "learning_rate": 0.00015029775322231135, + "loss": 1.0928, + "step": 7285 + }, + { + "epoch": 0.4, + "grad_norm": 0.1650390625, + "learning_rate": 0.0001502151036023378, + "loss": 1.2275, + "step": 7290 + }, + { + "epoch": 0.4, + "grad_norm": 0.1572265625, + "learning_rate": 0.00015013240808940182, + "loss": 1.1867, + "step": 7295 + }, + { + "epoch": 0.4, + "grad_norm": 0.162109375, + "learning_rate": 0.0001500496667590811, + "loss": 1.2231, + "step": 7300 + }, + { + "epoch": 0.4, + "grad_norm": 0.1708984375, + "learning_rate": 0.00014996687968699522, + "loss": 1.18, + "step": 7305 + }, + { + "epoch": 0.4, + "grad_norm": 0.1640625, + "learning_rate": 0.00014988404694880555, + "loss": 1.1973, + "step": 7310 + }, + { + "epoch": 0.4, + "grad_norm": 0.1708984375, + "learning_rate": 0.00014980116862021525, + "loss": 1.1616, + "step": 7315 + }, + { + "epoch": 0.4, + "grad_norm": 0.169921875, + "learning_rate": 0.00014971824477696903, + "loss": 1.2154, + "step": 7320 + }, + { + "epoch": 0.4, + "grad_norm": 0.1748046875, + "learning_rate": 0.00014963527549485332, + "loss": 1.2213, + "step": 7325 + }, + { + "epoch": 0.4, + "grad_norm": 0.1640625, + "learning_rate": 0.000149552260849696, + "loss": 1.0926, + "step": 7330 + }, + { + "epoch": 0.4, + "grad_norm": 0.173828125, + "learning_rate": 0.00014946920091736643, + "loss": 1.1684, + "step": 7335 + }, + { + "epoch": 0.4, + "grad_norm": 0.1640625, + "learning_rate": 0.00014938609577377538, + "loss": 1.1545, + "step": 7340 + }, + { + "epoch": 0.4, + "grad_norm": 0.173828125, + "learning_rate": 0.00014930294549487494, + "loss": 1.2216, + "step": 7345 + }, + { + "epoch": 0.4, + "grad_norm": 0.1669921875, + "learning_rate": 0.00014921975015665836, + "loss": 1.2078, + "step": 7350 + }, + { + "epoch": 0.4, + "grad_norm": 0.1591796875, + "learning_rate": 0.00014913650983516024, + "loss": 1.2067, + "step": 7355 + }, + { + "epoch": 0.4, + "grad_norm": 0.16796875, + "learning_rate": 0.00014905322460645614, + "loss": 1.123, + "step": 7360 + }, + { + "epoch": 0.4, + "grad_norm": 0.169921875, + "learning_rate": 0.00014896989454666275, + "loss": 1.1822, + "step": 7365 + }, + { + "epoch": 0.4, + "grad_norm": 0.169921875, + "learning_rate": 0.00014888651973193765, + "loss": 1.1361, + "step": 7370 + }, + { + "epoch": 0.4, + "grad_norm": 0.166015625, + "learning_rate": 0.00014880310023847941, + "loss": 1.224, + "step": 7375 + }, + { + "epoch": 0.4, + "grad_norm": 0.1650390625, + "learning_rate": 0.00014871963614252742, + "loss": 1.1574, + "step": 7380 + }, + { + "epoch": 0.4, + "grad_norm": 0.16015625, + "learning_rate": 0.00014863612752036176, + "loss": 1.1223, + "step": 7385 + }, + { + "epoch": 0.4, + "grad_norm": 0.1669921875, + "learning_rate": 0.00014855257444830328, + "loss": 1.1701, + "step": 7390 + }, + { + "epoch": 0.41, + "grad_norm": 0.1669921875, + "learning_rate": 0.0001484689770027134, + "loss": 1.1517, + "step": 7395 + }, + { + "epoch": 0.41, + "grad_norm": 0.15625, + "learning_rate": 0.00014838533525999417, + "loss": 1.1162, + "step": 7400 + }, + { + "epoch": 0.41, + "grad_norm": 0.1611328125, + "learning_rate": 0.00014830164929658806, + "loss": 1.1335, + "step": 7405 + }, + { + "epoch": 0.41, + "grad_norm": 0.1669921875, + "learning_rate": 0.00014821791918897794, + "loss": 1.1782, + "step": 7410 + }, + { + "epoch": 0.41, + "grad_norm": 0.1728515625, + "learning_rate": 0.00014813414501368708, + "loss": 1.1286, + "step": 7415 + }, + { + "epoch": 0.41, + "grad_norm": 0.162109375, + "learning_rate": 0.000148050326847279, + "loss": 1.1696, + "step": 7420 + }, + { + "epoch": 0.41, + "grad_norm": 0.1630859375, + "learning_rate": 0.0001479664647663574, + "loss": 1.0777, + "step": 7425 + }, + { + "epoch": 0.41, + "grad_norm": 0.1591796875, + "learning_rate": 0.00014788255884756618, + "loss": 1.1785, + "step": 7430 + }, + { + "epoch": 0.41, + "grad_norm": 0.16015625, + "learning_rate": 0.00014779860916758924, + "loss": 1.2023, + "step": 7435 + }, + { + "epoch": 0.41, + "grad_norm": 0.162109375, + "learning_rate": 0.00014771461580315049, + "loss": 1.1364, + "step": 7440 + }, + { + "epoch": 0.41, + "grad_norm": 0.166015625, + "learning_rate": 0.0001476305788310138, + "loss": 1.0836, + "step": 7445 + }, + { + "epoch": 0.41, + "grad_norm": 0.1591796875, + "learning_rate": 0.00014754649832798283, + "loss": 1.1177, + "step": 7450 + }, + { + "epoch": 0.41, + "grad_norm": 0.1591796875, + "learning_rate": 0.0001474623743709011, + "loss": 1.1556, + "step": 7455 + }, + { + "epoch": 0.41, + "grad_norm": 0.166015625, + "learning_rate": 0.00014737820703665178, + "loss": 1.1634, + "step": 7460 + }, + { + "epoch": 0.41, + "grad_norm": 0.1650390625, + "learning_rate": 0.0001472939964021577, + "loss": 1.1871, + "step": 7465 + }, + { + "epoch": 0.41, + "grad_norm": 0.1689453125, + "learning_rate": 0.00014720974254438132, + "loss": 1.209, + "step": 7470 + }, + { + "epoch": 0.41, + "grad_norm": 0.158203125, + "learning_rate": 0.0001471254455403245, + "loss": 1.0775, + "step": 7475 + }, + { + "epoch": 0.41, + "grad_norm": 0.1669921875, + "learning_rate": 0.0001470411054670286, + "loss": 1.2501, + "step": 7480 + }, + { + "epoch": 0.41, + "grad_norm": 0.1767578125, + "learning_rate": 0.00014695672240157439, + "loss": 1.1184, + "step": 7485 + }, + { + "epoch": 0.41, + "grad_norm": 0.1669921875, + "learning_rate": 0.00014687229642108176, + "loss": 1.149, + "step": 7490 + }, + { + "epoch": 0.41, + "grad_norm": 0.1650390625, + "learning_rate": 0.00014678782760271003, + "loss": 1.1579, + "step": 7495 + }, + { + "epoch": 0.41, + "grad_norm": 0.166015625, + "learning_rate": 0.00014670331602365753, + "loss": 1.1178, + "step": 7500 + }, + { + "epoch": 0.41, + "grad_norm": 0.169921875, + "learning_rate": 0.00014661876176116172, + "loss": 1.1766, + "step": 7505 + }, + { + "epoch": 0.41, + "grad_norm": 0.166015625, + "learning_rate": 0.00014653416489249905, + "loss": 1.2319, + "step": 7510 + }, + { + "epoch": 0.41, + "grad_norm": 0.162109375, + "learning_rate": 0.00014644952549498495, + "loss": 1.2514, + "step": 7515 + }, + { + "epoch": 0.41, + "grad_norm": 0.1669921875, + "learning_rate": 0.00014636484364597367, + "loss": 1.1396, + "step": 7520 + }, + { + "epoch": 0.41, + "grad_norm": 0.1708984375, + "learning_rate": 0.00014628011942285828, + "loss": 1.153, + "step": 7525 + }, + { + "epoch": 0.41, + "grad_norm": 0.1611328125, + "learning_rate": 0.00014619535290307057, + "loss": 1.276, + "step": 7530 + }, + { + "epoch": 0.41, + "grad_norm": 0.158203125, + "learning_rate": 0.00014611054416408103, + "loss": 1.2446, + "step": 7535 + }, + { + "epoch": 0.41, + "grad_norm": 0.1728515625, + "learning_rate": 0.00014602569328339864, + "loss": 1.2219, + "step": 7540 + }, + { + "epoch": 0.41, + "grad_norm": 0.1630859375, + "learning_rate": 0.00014594080033857097, + "loss": 1.1463, + "step": 7545 + }, + { + "epoch": 0.41, + "grad_norm": 0.1630859375, + "learning_rate": 0.000145855865407184, + "loss": 1.1184, + "step": 7550 + }, + { + "epoch": 0.41, + "grad_norm": 0.1669921875, + "learning_rate": 0.00014577088856686213, + "loss": 1.1214, + "step": 7555 + }, + { + "epoch": 0.41, + "grad_norm": 0.166015625, + "learning_rate": 0.000145685869895268, + "loss": 1.2583, + "step": 7560 + }, + { + "epoch": 0.41, + "grad_norm": 0.1650390625, + "learning_rate": 0.00014560080947010256, + "loss": 1.2364, + "step": 7565 + }, + { + "epoch": 0.41, + "grad_norm": 0.1650390625, + "learning_rate": 0.00014551570736910484, + "loss": 1.2237, + "step": 7570 + }, + { + "epoch": 0.41, + "grad_norm": 0.166015625, + "learning_rate": 0.00014543056367005195, + "loss": 1.1011, + "step": 7575 + }, + { + "epoch": 0.42, + "grad_norm": 0.169921875, + "learning_rate": 0.0001453453784507591, + "loss": 1.2022, + "step": 7580 + }, + { + "epoch": 0.42, + "grad_norm": 0.1689453125, + "learning_rate": 0.00014526015178907935, + "loss": 1.1686, + "step": 7585 + }, + { + "epoch": 0.42, + "grad_norm": 0.1689453125, + "learning_rate": 0.00014517488376290376, + "loss": 1.1226, + "step": 7590 + }, + { + "epoch": 0.42, + "grad_norm": 0.1650390625, + "learning_rate": 0.00014508957445016107, + "loss": 1.1691, + "step": 7595 + }, + { + "epoch": 0.42, + "grad_norm": 0.1630859375, + "learning_rate": 0.00014500422392881783, + "loss": 1.1732, + "step": 7600 + }, + { + "epoch": 0.42, + "grad_norm": 0.166015625, + "learning_rate": 0.0001449188322768782, + "loss": 1.1826, + "step": 7605 + }, + { + "epoch": 0.42, + "grad_norm": 0.1650390625, + "learning_rate": 0.00014483339957238397, + "loss": 1.2111, + "step": 7610 + }, + { + "epoch": 0.42, + "grad_norm": 0.1640625, + "learning_rate": 0.00014474792589341443, + "loss": 1.1708, + "step": 7615 + }, + { + "epoch": 0.42, + "grad_norm": 0.1669921875, + "learning_rate": 0.00014466241131808635, + "loss": 1.1544, + "step": 7620 + }, + { + "epoch": 0.42, + "grad_norm": 0.162109375, + "learning_rate": 0.0001445768559245538, + "loss": 1.1789, + "step": 7625 + }, + { + "epoch": 0.42, + "grad_norm": 0.1689453125, + "learning_rate": 0.0001444912597910083, + "loss": 1.1848, + "step": 7630 + }, + { + "epoch": 0.42, + "grad_norm": 0.16015625, + "learning_rate": 0.00014440562299567836, + "loss": 1.1384, + "step": 7635 + }, + { + "epoch": 0.42, + "grad_norm": 0.1669921875, + "learning_rate": 0.0001443199456168299, + "loss": 1.1512, + "step": 7640 + }, + { + "epoch": 0.42, + "grad_norm": 0.1689453125, + "learning_rate": 0.0001442342277327658, + "loss": 1.1939, + "step": 7645 + }, + { + "epoch": 0.42, + "grad_norm": 0.169921875, + "learning_rate": 0.000144148469421826, + "loss": 1.1397, + "step": 7650 + }, + { + "epoch": 0.42, + "grad_norm": 0.162109375, + "learning_rate": 0.00014406267076238737, + "loss": 1.2094, + "step": 7655 + }, + { + "epoch": 0.42, + "grad_norm": 0.169921875, + "learning_rate": 0.0001439768318328637, + "loss": 1.1209, + "step": 7660 + }, + { + "epoch": 0.42, + "grad_norm": 0.1845703125, + "learning_rate": 0.00014389095271170549, + "loss": 1.2254, + "step": 7665 + }, + { + "epoch": 0.42, + "grad_norm": 0.1650390625, + "learning_rate": 0.00014380503347740005, + "loss": 1.1778, + "step": 7670 + }, + { + "epoch": 0.42, + "grad_norm": 0.16796875, + "learning_rate": 0.00014371907420847132, + "loss": 1.1528, + "step": 7675 + }, + { + "epoch": 0.42, + "grad_norm": 0.1669921875, + "learning_rate": 0.00014363307498347985, + "loss": 1.1539, + "step": 7680 + }, + { + "epoch": 0.42, + "grad_norm": 0.173828125, + "learning_rate": 0.00014354703588102268, + "loss": 1.1372, + "step": 7685 + }, + { + "epoch": 0.42, + "grad_norm": 0.1767578125, + "learning_rate": 0.0001434609569797333, + "loss": 1.1385, + "step": 7690 + }, + { + "epoch": 0.42, + "grad_norm": 0.1689453125, + "learning_rate": 0.00014337483835828163, + "loss": 1.2349, + "step": 7695 + }, + { + "epoch": 0.42, + "grad_norm": 0.169921875, + "learning_rate": 0.00014328868009537378, + "loss": 1.1939, + "step": 7700 + }, + { + "epoch": 0.42, + "grad_norm": 0.166015625, + "learning_rate": 0.00014320248226975223, + "loss": 1.1573, + "step": 7705 + }, + { + "epoch": 0.42, + "grad_norm": 0.1728515625, + "learning_rate": 0.00014311624496019546, + "loss": 1.283, + "step": 7710 + }, + { + "epoch": 0.42, + "grad_norm": 0.1708984375, + "learning_rate": 0.00014302996824551813, + "loss": 1.2353, + "step": 7715 + }, + { + "epoch": 0.42, + "grad_norm": 0.1630859375, + "learning_rate": 0.00014294365220457093, + "loss": 1.1532, + "step": 7720 + }, + { + "epoch": 0.42, + "grad_norm": 0.171875, + "learning_rate": 0.0001428572969162405, + "loss": 1.1465, + "step": 7725 + }, + { + "epoch": 0.42, + "grad_norm": 0.16796875, + "learning_rate": 0.0001427709024594492, + "loss": 1.1481, + "step": 7730 + }, + { + "epoch": 0.42, + "grad_norm": 0.1640625, + "learning_rate": 0.00014268446891315537, + "loss": 1.1494, + "step": 7735 + }, + { + "epoch": 0.42, + "grad_norm": 0.169921875, + "learning_rate": 0.00014259799635635298, + "loss": 1.1859, + "step": 7740 + }, + { + "epoch": 0.42, + "grad_norm": 0.1630859375, + "learning_rate": 0.00014251148486807172, + "loss": 1.1282, + "step": 7745 + }, + { + "epoch": 0.42, + "grad_norm": 0.169921875, + "learning_rate": 0.00014242493452737675, + "loss": 1.2092, + "step": 7750 + }, + { + "epoch": 0.42, + "grad_norm": 0.171875, + "learning_rate": 0.00014233834541336888, + "loss": 1.1788, + "step": 7755 + }, + { + "epoch": 0.43, + "grad_norm": 0.1650390625, + "learning_rate": 0.00014225171760518415, + "loss": 1.1056, + "step": 7760 + }, + { + "epoch": 0.43, + "grad_norm": 0.1728515625, + "learning_rate": 0.00014216505118199425, + "loss": 1.176, + "step": 7765 + }, + { + "epoch": 0.43, + "grad_norm": 0.166015625, + "learning_rate": 0.0001420783462230059, + "loss": 1.1368, + "step": 7770 + }, + { + "epoch": 0.43, + "grad_norm": 0.166015625, + "learning_rate": 0.0001419916028074612, + "loss": 1.1946, + "step": 7775 + }, + { + "epoch": 0.43, + "grad_norm": 0.16796875, + "learning_rate": 0.0001419048210146373, + "loss": 1.1036, + "step": 7780 + }, + { + "epoch": 0.43, + "grad_norm": 0.169921875, + "learning_rate": 0.00014181800092384647, + "loss": 1.2529, + "step": 7785 + }, + { + "epoch": 0.43, + "grad_norm": 0.162109375, + "learning_rate": 0.000141731142614436, + "loss": 1.1413, + "step": 7790 + }, + { + "epoch": 0.43, + "grad_norm": 0.1572265625, + "learning_rate": 0.00014164424616578805, + "loss": 1.1956, + "step": 7795 + }, + { + "epoch": 0.43, + "grad_norm": 0.1669921875, + "learning_rate": 0.00014155731165731972, + "loss": 1.1081, + "step": 7800 + }, + { + "epoch": 0.43, + "grad_norm": 0.181640625, + "learning_rate": 0.00014147033916848275, + "loss": 1.2275, + "step": 7805 + }, + { + "epoch": 0.43, + "grad_norm": 0.162109375, + "learning_rate": 0.00014138332877876381, + "loss": 1.2397, + "step": 7810 + }, + { + "epoch": 0.43, + "grad_norm": 0.1630859375, + "learning_rate": 0.000141296280567684, + "loss": 1.229, + "step": 7815 + }, + { + "epoch": 0.43, + "grad_norm": 0.169921875, + "learning_rate": 0.0001412091946147991, + "loss": 1.1537, + "step": 7820 + }, + { + "epoch": 0.43, + "grad_norm": 0.169921875, + "learning_rate": 0.00014112207099969937, + "loss": 1.1069, + "step": 7825 + }, + { + "epoch": 0.43, + "grad_norm": 0.162109375, + "learning_rate": 0.00014103490980200945, + "loss": 1.2241, + "step": 7830 + }, + { + "epoch": 0.43, + "grad_norm": 0.16015625, + "learning_rate": 0.00014094771110138835, + "loss": 1.2112, + "step": 7835 + }, + { + "epoch": 0.43, + "grad_norm": 0.169921875, + "learning_rate": 0.00014086047497752937, + "loss": 1.1526, + "step": 7840 + }, + { + "epoch": 0.43, + "grad_norm": 0.162109375, + "learning_rate": 0.00014077320151015997, + "loss": 1.1632, + "step": 7845 + }, + { + "epoch": 0.43, + "grad_norm": 0.1630859375, + "learning_rate": 0.00014068589077904185, + "loss": 1.1236, + "step": 7850 + }, + { + "epoch": 0.43, + "grad_norm": 0.1708984375, + "learning_rate": 0.00014059854286397057, + "loss": 1.1998, + "step": 7855 + }, + { + "epoch": 0.43, + "grad_norm": 0.162109375, + "learning_rate": 0.00014051115784477594, + "loss": 1.2108, + "step": 7860 + }, + { + "epoch": 0.43, + "grad_norm": 0.1630859375, + "learning_rate": 0.00014042373580132136, + "loss": 1.1749, + "step": 7865 + }, + { + "epoch": 0.43, + "grad_norm": 0.1669921875, + "learning_rate": 0.00014033627681350435, + "loss": 1.173, + "step": 7870 + }, + { + "epoch": 0.43, + "grad_norm": 0.1533203125, + "learning_rate": 0.00014024878096125605, + "loss": 1.1113, + "step": 7875 + }, + { + "epoch": 0.43, + "grad_norm": 0.169921875, + "learning_rate": 0.0001401612483245413, + "loss": 1.214, + "step": 7880 + }, + { + "epoch": 0.43, + "grad_norm": 0.166015625, + "learning_rate": 0.0001400736789833586, + "loss": 1.1316, + "step": 7885 + }, + { + "epoch": 0.43, + "grad_norm": 0.16015625, + "learning_rate": 0.00013998607301773998, + "loss": 1.1805, + "step": 7890 + }, + { + "epoch": 0.43, + "grad_norm": 0.15625, + "learning_rate": 0.00013989843050775096, + "loss": 1.096, + "step": 7895 + }, + { + "epoch": 0.43, + "grad_norm": 0.1767578125, + "learning_rate": 0.00013981075153349037, + "loss": 1.2747, + "step": 7900 + }, + { + "epoch": 0.43, + "grad_norm": 0.1640625, + "learning_rate": 0.0001397230361750905, + "loss": 1.1922, + "step": 7905 + }, + { + "epoch": 0.43, + "grad_norm": 0.1611328125, + "learning_rate": 0.00013963528451271679, + "loss": 1.2204, + "step": 7910 + }, + { + "epoch": 0.43, + "grad_norm": 0.171875, + "learning_rate": 0.0001395474966265679, + "loss": 1.2171, + "step": 7915 + }, + { + "epoch": 0.43, + "grad_norm": 0.1708984375, + "learning_rate": 0.00013945967259687558, + "loss": 1.1588, + "step": 7920 + }, + { + "epoch": 0.43, + "grad_norm": 0.1640625, + "learning_rate": 0.00013937181250390468, + "loss": 1.2271, + "step": 7925 + }, + { + "epoch": 0.43, + "grad_norm": 0.1689453125, + "learning_rate": 0.0001392839164279529, + "loss": 1.2605, + "step": 7930 + }, + { + "epoch": 0.43, + "grad_norm": 0.1640625, + "learning_rate": 0.00013919598444935087, + "loss": 1.1003, + "step": 7935 + }, + { + "epoch": 0.43, + "grad_norm": 0.1708984375, + "learning_rate": 0.00013910801664846208, + "loss": 1.221, + "step": 7940 + }, + { + "epoch": 0.44, + "grad_norm": 0.1748046875, + "learning_rate": 0.0001390200131056827, + "loss": 1.1455, + "step": 7945 + }, + { + "epoch": 0.44, + "grad_norm": 0.1630859375, + "learning_rate": 0.00013893197390144155, + "loss": 1.1203, + "step": 7950 + }, + { + "epoch": 0.44, + "grad_norm": 0.1669921875, + "learning_rate": 0.00013884389911620012, + "loss": 1.1516, + "step": 7955 + }, + { + "epoch": 0.44, + "grad_norm": 0.171875, + "learning_rate": 0.00013875578883045238, + "loss": 1.1548, + "step": 7960 + }, + { + "epoch": 0.44, + "grad_norm": 0.171875, + "learning_rate": 0.00013866764312472473, + "loss": 1.1942, + "step": 7965 + }, + { + "epoch": 0.44, + "grad_norm": 0.166015625, + "learning_rate": 0.00013857946207957594, + "loss": 1.1953, + "step": 7970 + }, + { + "epoch": 0.44, + "grad_norm": 0.1640625, + "learning_rate": 0.0001384912457755971, + "loss": 1.0613, + "step": 7975 + }, + { + "epoch": 0.44, + "grad_norm": 0.16796875, + "learning_rate": 0.00013840299429341153, + "loss": 1.1485, + "step": 7980 + }, + { + "epoch": 0.44, + "grad_norm": 0.16796875, + "learning_rate": 0.00013831470771367464, + "loss": 1.2536, + "step": 7985 + }, + { + "epoch": 0.44, + "grad_norm": 0.1611328125, + "learning_rate": 0.00013822638611707397, + "loss": 1.1073, + "step": 7990 + }, + { + "epoch": 0.44, + "grad_norm": 0.1708984375, + "learning_rate": 0.0001381380295843291, + "loss": 1.1628, + "step": 7995 + }, + { + "epoch": 0.44, + "grad_norm": 0.166015625, + "learning_rate": 0.0001380496381961914, + "loss": 1.1307, + "step": 8000 + }, + { + "epoch": 0.44, + "grad_norm": 0.1689453125, + "learning_rate": 0.0001379612120334443, + "loss": 1.2052, + "step": 8005 + }, + { + "epoch": 0.44, + "grad_norm": 0.1689453125, + "learning_rate": 0.0001378727511769028, + "loss": 1.1994, + "step": 8010 + }, + { + "epoch": 0.44, + "grad_norm": 0.17578125, + "learning_rate": 0.00013778425570741377, + "loss": 1.1662, + "step": 8015 + }, + { + "epoch": 0.44, + "grad_norm": 0.1611328125, + "learning_rate": 0.00013769572570585567, + "loss": 1.1763, + "step": 8020 + }, + { + "epoch": 0.44, + "grad_norm": 0.169921875, + "learning_rate": 0.0001376071612531384, + "loss": 1.1969, + "step": 8025 + }, + { + "epoch": 0.44, + "grad_norm": 0.169921875, + "learning_rate": 0.00013751856243020359, + "loss": 1.1872, + "step": 8030 + }, + { + "epoch": 0.44, + "grad_norm": 0.1640625, + "learning_rate": 0.00013742992931802398, + "loss": 1.1696, + "step": 8035 + }, + { + "epoch": 0.44, + "grad_norm": 0.158203125, + "learning_rate": 0.00013734126199760392, + "loss": 1.1599, + "step": 8040 + }, + { + "epoch": 0.44, + "grad_norm": 0.16796875, + "learning_rate": 0.00013725256054997886, + "loss": 1.1414, + "step": 8045 + }, + { + "epoch": 0.44, + "grad_norm": 0.16796875, + "learning_rate": 0.0001371638250562155, + "loss": 1.1845, + "step": 8050 + }, + { + "epoch": 0.44, + "grad_norm": 0.1640625, + "learning_rate": 0.00013707505559741164, + "loss": 1.2117, + "step": 8055 + }, + { + "epoch": 0.44, + "grad_norm": 0.1708984375, + "learning_rate": 0.00013698625225469615, + "loss": 1.154, + "step": 8060 + }, + { + "epoch": 0.44, + "grad_norm": 0.171875, + "learning_rate": 0.0001368974151092288, + "loss": 1.1619, + "step": 8065 + }, + { + "epoch": 0.44, + "grad_norm": 0.1689453125, + "learning_rate": 0.00013680854424220036, + "loss": 1.1804, + "step": 8070 + }, + { + "epoch": 0.44, + "grad_norm": 0.166015625, + "learning_rate": 0.0001367196397348323, + "loss": 1.2434, + "step": 8075 + }, + { + "epoch": 0.44, + "grad_norm": 0.1708984375, + "learning_rate": 0.0001366307016683769, + "loss": 1.1144, + "step": 8080 + }, + { + "epoch": 0.44, + "grad_norm": 0.1689453125, + "learning_rate": 0.00013654173012411712, + "loss": 1.1867, + "step": 8085 + }, + { + "epoch": 0.44, + "grad_norm": 0.1640625, + "learning_rate": 0.0001364527251833665, + "loss": 1.1751, + "step": 8090 + }, + { + "epoch": 0.44, + "grad_norm": 0.162109375, + "learning_rate": 0.00013636368692746906, + "loss": 1.1101, + "step": 8095 + }, + { + "epoch": 0.44, + "grad_norm": 0.16796875, + "learning_rate": 0.00013627461543779936, + "loss": 1.1906, + "step": 8100 + }, + { + "epoch": 0.44, + "grad_norm": 0.1640625, + "learning_rate": 0.00013618551079576228, + "loss": 1.1638, + "step": 8105 + }, + { + "epoch": 0.44, + "grad_norm": 0.1669921875, + "learning_rate": 0.00013609637308279292, + "loss": 1.2165, + "step": 8110 + }, + { + "epoch": 0.44, + "grad_norm": 0.1640625, + "learning_rate": 0.00013600720238035683, + "loss": 1.1542, + "step": 8115 + }, + { + "epoch": 0.44, + "grad_norm": 0.16015625, + "learning_rate": 0.0001359179987699494, + "loss": 1.1143, + "step": 8120 + }, + { + "epoch": 0.45, + "grad_norm": 0.1708984375, + "learning_rate": 0.0001358287623330964, + "loss": 1.1966, + "step": 8125 + }, + { + "epoch": 0.45, + "grad_norm": 0.1689453125, + "learning_rate": 0.00013573949315135334, + "loss": 1.1892, + "step": 8130 + }, + { + "epoch": 0.45, + "grad_norm": 0.169921875, + "learning_rate": 0.00013565019130630588, + "loss": 1.2578, + "step": 8135 + }, + { + "epoch": 0.45, + "grad_norm": 0.1591796875, + "learning_rate": 0.00013556085687956934, + "loss": 1.0711, + "step": 8140 + }, + { + "epoch": 0.45, + "grad_norm": 0.171875, + "learning_rate": 0.000135471489952789, + "loss": 1.1693, + "step": 8145 + }, + { + "epoch": 0.45, + "grad_norm": 0.16796875, + "learning_rate": 0.00013538209060763966, + "loss": 1.1988, + "step": 8150 + }, + { + "epoch": 0.45, + "grad_norm": 0.1650390625, + "learning_rate": 0.0001352926589258259, + "loss": 1.1489, + "step": 8155 + }, + { + "epoch": 0.45, + "grad_norm": 0.173828125, + "learning_rate": 0.00013520319498908177, + "loss": 1.1932, + "step": 8160 + }, + { + "epoch": 0.45, + "grad_norm": 0.1669921875, + "learning_rate": 0.0001351136988791708, + "loss": 1.2381, + "step": 8165 + }, + { + "epoch": 0.45, + "grad_norm": 0.1689453125, + "learning_rate": 0.00013502417067788594, + "loss": 1.0664, + "step": 8170 + }, + { + "epoch": 0.45, + "grad_norm": 0.1552734375, + "learning_rate": 0.00013493461046704954, + "loss": 1.1675, + "step": 8175 + }, + { + "epoch": 0.45, + "grad_norm": 0.166015625, + "learning_rate": 0.00013484501832851305, + "loss": 1.2074, + "step": 8180 + }, + { + "epoch": 0.45, + "grad_norm": 0.17578125, + "learning_rate": 0.00013475539434415726, + "loss": 1.1126, + "step": 8185 + }, + { + "epoch": 0.45, + "grad_norm": 0.16796875, + "learning_rate": 0.0001346657385958919, + "loss": 1.2108, + "step": 8190 + }, + { + "epoch": 0.45, + "grad_norm": 0.1640625, + "learning_rate": 0.00013457605116565593, + "loss": 1.0783, + "step": 8195 + }, + { + "epoch": 0.45, + "grad_norm": 0.173828125, + "learning_rate": 0.00013448633213541708, + "loss": 1.2537, + "step": 8200 + }, + { + "epoch": 0.45, + "grad_norm": 0.1640625, + "learning_rate": 0.00013439658158717206, + "loss": 1.2103, + "step": 8205 + }, + { + "epoch": 0.45, + "grad_norm": 0.1669921875, + "learning_rate": 0.00013430679960294635, + "loss": 1.2296, + "step": 8210 + }, + { + "epoch": 0.45, + "grad_norm": 0.1650390625, + "learning_rate": 0.0001342169862647942, + "loss": 1.1953, + "step": 8215 + }, + { + "epoch": 0.45, + "grad_norm": 0.173828125, + "learning_rate": 0.00013412714165479846, + "loss": 1.1904, + "step": 8220 + }, + { + "epoch": 0.45, + "grad_norm": 0.1650390625, + "learning_rate": 0.0001340372658550706, + "loss": 1.154, + "step": 8225 + }, + { + "epoch": 0.45, + "grad_norm": 0.17578125, + "learning_rate": 0.00013394735894775057, + "loss": 1.2235, + "step": 8230 + }, + { + "epoch": 0.45, + "grad_norm": 0.1669921875, + "learning_rate": 0.00013385742101500677, + "loss": 1.2637, + "step": 8235 + }, + { + "epoch": 0.45, + "grad_norm": 0.173828125, + "learning_rate": 0.00013376745213903592, + "loss": 1.1311, + "step": 8240 + }, + { + "epoch": 0.45, + "grad_norm": 0.1591796875, + "learning_rate": 0.0001336774524020631, + "loss": 1.1393, + "step": 8245 + }, + { + "epoch": 0.45, + "grad_norm": 0.1611328125, + "learning_rate": 0.0001335874218863415, + "loss": 1.1448, + "step": 8250 + }, + { + "epoch": 0.45, + "grad_norm": 0.1640625, + "learning_rate": 0.00013349736067415252, + "loss": 1.2296, + "step": 8255 + }, + { + "epoch": 0.45, + "grad_norm": 0.158203125, + "learning_rate": 0.00013340726884780553, + "loss": 1.178, + "step": 8260 + }, + { + "epoch": 0.45, + "grad_norm": 0.1669921875, + "learning_rate": 0.00013331714648963792, + "loss": 1.1577, + "step": 8265 + }, + { + "epoch": 0.45, + "grad_norm": 0.1708984375, + "learning_rate": 0.00013322699368201508, + "loss": 1.1838, + "step": 8270 + }, + { + "epoch": 0.45, + "grad_norm": 0.1640625, + "learning_rate": 0.00013313681050733001, + "loss": 1.1692, + "step": 8275 + }, + { + "epoch": 0.45, + "grad_norm": 0.1728515625, + "learning_rate": 0.00013304659704800367, + "loss": 1.2479, + "step": 8280 + }, + { + "epoch": 0.45, + "grad_norm": 0.16796875, + "learning_rate": 0.00013295635338648462, + "loss": 1.2184, + "step": 8285 + }, + { + "epoch": 0.45, + "grad_norm": 0.1591796875, + "learning_rate": 0.00013286607960524897, + "loss": 1.2417, + "step": 8290 + }, + { + "epoch": 0.45, + "grad_norm": 0.1640625, + "learning_rate": 0.00013277577578680048, + "loss": 1.1967, + "step": 8295 + }, + { + "epoch": 0.45, + "grad_norm": 0.1611328125, + "learning_rate": 0.0001326854420136702, + "loss": 1.1231, + "step": 8300 + }, + { + "epoch": 0.45, + "grad_norm": 0.1669921875, + "learning_rate": 0.0001325950783684167, + "loss": 1.185, + "step": 8305 + }, + { + "epoch": 0.46, + "grad_norm": 0.1640625, + "learning_rate": 0.00013250468493362583, + "loss": 1.1716, + "step": 8310 + }, + { + "epoch": 0.46, + "grad_norm": 0.1669921875, + "learning_rate": 0.00013241426179191057, + "loss": 1.1983, + "step": 8315 + }, + { + "epoch": 0.46, + "grad_norm": 0.169921875, + "learning_rate": 0.00013232380902591115, + "loss": 1.1448, + "step": 8320 + }, + { + "epoch": 0.46, + "grad_norm": 0.1689453125, + "learning_rate": 0.00013223332671829483, + "loss": 1.271, + "step": 8325 + }, + { + "epoch": 0.46, + "grad_norm": 0.169921875, + "learning_rate": 0.00013214281495175584, + "loss": 1.2046, + "step": 8330 + }, + { + "epoch": 0.46, + "grad_norm": 0.16015625, + "learning_rate": 0.00013205227380901543, + "loss": 1.1249, + "step": 8335 + }, + { + "epoch": 0.46, + "grad_norm": 0.1572265625, + "learning_rate": 0.00013196170337282162, + "loss": 1.1973, + "step": 8340 + }, + { + "epoch": 0.46, + "grad_norm": 0.1669921875, + "learning_rate": 0.0001318711037259492, + "loss": 1.0632, + "step": 8345 + }, + { + "epoch": 0.46, + "grad_norm": 0.17578125, + "learning_rate": 0.00013178047495119964, + "loss": 1.1831, + "step": 8350 + }, + { + "epoch": 0.46, + "grad_norm": 0.185546875, + "learning_rate": 0.00013168981713140116, + "loss": 1.1754, + "step": 8355 + }, + { + "epoch": 0.46, + "grad_norm": 0.16015625, + "learning_rate": 0.00013159913034940836, + "loss": 1.142, + "step": 8360 + }, + { + "epoch": 0.46, + "grad_norm": 0.169921875, + "learning_rate": 0.00013150841468810242, + "loss": 1.2121, + "step": 8365 + }, + { + "epoch": 0.46, + "grad_norm": 0.16796875, + "learning_rate": 0.00013141767023039085, + "loss": 1.1684, + "step": 8370 + }, + { + "epoch": 0.46, + "grad_norm": 0.166015625, + "learning_rate": 0.00013132689705920748, + "loss": 1.1947, + "step": 8375 + }, + { + "epoch": 0.46, + "grad_norm": 0.1640625, + "learning_rate": 0.00013123609525751248, + "loss": 1.175, + "step": 8380 + }, + { + "epoch": 0.46, + "grad_norm": 0.169921875, + "learning_rate": 0.00013114526490829204, + "loss": 1.1841, + "step": 8385 + }, + { + "epoch": 0.46, + "grad_norm": 0.17578125, + "learning_rate": 0.00013105440609455856, + "loss": 1.2425, + "step": 8390 + }, + { + "epoch": 0.46, + "grad_norm": 0.1669921875, + "learning_rate": 0.00013096351889935033, + "loss": 1.2554, + "step": 8395 + }, + { + "epoch": 0.46, + "grad_norm": 0.1630859375, + "learning_rate": 0.00013087260340573174, + "loss": 1.1532, + "step": 8400 + }, + { + "epoch": 0.46, + "grad_norm": 0.166015625, + "learning_rate": 0.00013078165969679291, + "loss": 1.1028, + "step": 8405 + }, + { + "epoch": 0.46, + "grad_norm": 0.1669921875, + "learning_rate": 0.00013069068785564984, + "loss": 1.1521, + "step": 8410 + }, + { + "epoch": 0.46, + "grad_norm": 0.1650390625, + "learning_rate": 0.00013059968796544412, + "loss": 1.1432, + "step": 8415 + }, + { + "epoch": 0.46, + "grad_norm": 0.171875, + "learning_rate": 0.00013050866010934313, + "loss": 1.2014, + "step": 8420 + }, + { + "epoch": 0.46, + "grad_norm": 0.1650390625, + "learning_rate": 0.00013041760437053964, + "loss": 1.1032, + "step": 8425 + }, + { + "epoch": 0.46, + "grad_norm": 0.169921875, + "learning_rate": 0.00013032652083225208, + "loss": 1.208, + "step": 8430 + }, + { + "epoch": 0.46, + "grad_norm": 0.166015625, + "learning_rate": 0.00013023540957772414, + "loss": 1.1257, + "step": 8435 + }, + { + "epoch": 0.46, + "grad_norm": 0.166015625, + "learning_rate": 0.0001301442706902249, + "loss": 1.1944, + "step": 8440 + }, + { + "epoch": 0.46, + "grad_norm": 0.1640625, + "learning_rate": 0.0001300531042530487, + "loss": 1.228, + "step": 8445 + }, + { + "epoch": 0.46, + "grad_norm": 0.1728515625, + "learning_rate": 0.00012996191034951512, + "loss": 1.1473, + "step": 8450 + }, + { + "epoch": 0.46, + "grad_norm": 0.166015625, + "learning_rate": 0.00012987068906296863, + "loss": 1.2271, + "step": 8455 + }, + { + "epoch": 0.46, + "grad_norm": 0.1689453125, + "learning_rate": 0.00012977944047677897, + "loss": 1.238, + "step": 8460 + }, + { + "epoch": 0.46, + "grad_norm": 0.171875, + "learning_rate": 0.00012968816467434074, + "loss": 1.1768, + "step": 8465 + }, + { + "epoch": 0.46, + "grad_norm": 0.1640625, + "learning_rate": 0.00012959686173907333, + "loss": 1.1546, + "step": 8470 + }, + { + "epoch": 0.46, + "grad_norm": 0.166015625, + "learning_rate": 0.00012950553175442107, + "loss": 1.2455, + "step": 8475 + }, + { + "epoch": 0.46, + "grad_norm": 0.16015625, + "learning_rate": 0.00012941417480385293, + "loss": 1.2709, + "step": 8480 + }, + { + "epoch": 0.46, + "grad_norm": 0.169921875, + "learning_rate": 0.00012932279097086248, + "loss": 1.1166, + "step": 8485 + }, + { + "epoch": 0.47, + "grad_norm": 0.1630859375, + "learning_rate": 0.00012923138033896797, + "loss": 1.1345, + "step": 8490 + }, + { + "epoch": 0.47, + "grad_norm": 0.166015625, + "learning_rate": 0.00012913994299171207, + "loss": 1.1113, + "step": 8495 + }, + { + "epoch": 0.47, + "grad_norm": 0.173828125, + "learning_rate": 0.00012904847901266188, + "loss": 1.2091, + "step": 8500 + }, + { + "epoch": 0.47, + "grad_norm": 0.1611328125, + "learning_rate": 0.0001289569884854089, + "loss": 1.214, + "step": 8505 + }, + { + "epoch": 0.47, + "grad_norm": 0.1728515625, + "learning_rate": 0.00012886547149356872, + "loss": 1.1845, + "step": 8510 + }, + { + "epoch": 0.47, + "grad_norm": 0.17578125, + "learning_rate": 0.00012877392812078134, + "loss": 1.232, + "step": 8515 + }, + { + "epoch": 0.47, + "grad_norm": 0.173828125, + "learning_rate": 0.00012868235845071068, + "loss": 1.1696, + "step": 8520 + }, + { + "epoch": 0.47, + "grad_norm": 0.1572265625, + "learning_rate": 0.00012859076256704484, + "loss": 1.1541, + "step": 8525 + }, + { + "epoch": 0.47, + "grad_norm": 0.166015625, + "learning_rate": 0.00012849914055349578, + "loss": 1.1485, + "step": 8530 + }, + { + "epoch": 0.47, + "grad_norm": 0.17578125, + "learning_rate": 0.00012840749249379938, + "loss": 1.116, + "step": 8535 + }, + { + "epoch": 0.47, + "grad_norm": 0.1708984375, + "learning_rate": 0.00012831581847171534, + "loss": 1.1878, + "step": 8540 + }, + { + "epoch": 0.47, + "grad_norm": 0.166015625, + "learning_rate": 0.00012822411857102702, + "loss": 1.083, + "step": 8545 + }, + { + "epoch": 0.47, + "grad_norm": 0.16796875, + "learning_rate": 0.00012813239287554155, + "loss": 1.2502, + "step": 8550 + }, + { + "epoch": 0.47, + "grad_norm": 0.1728515625, + "learning_rate": 0.00012804064146908946, + "loss": 1.1709, + "step": 8555 + }, + { + "epoch": 0.47, + "grad_norm": 0.171875, + "learning_rate": 0.000127948864435525, + "loss": 1.1941, + "step": 8560 + }, + { + "epoch": 0.47, + "grad_norm": 0.171875, + "learning_rate": 0.00012785706185872565, + "loss": 1.2039, + "step": 8565 + }, + { + "epoch": 0.47, + "grad_norm": 0.16796875, + "learning_rate": 0.00012776523382259232, + "loss": 1.2117, + "step": 8570 + }, + { + "epoch": 0.47, + "grad_norm": 0.169921875, + "learning_rate": 0.00012767338041104917, + "loss": 1.1662, + "step": 8575 + }, + { + "epoch": 0.47, + "grad_norm": 0.1630859375, + "learning_rate": 0.00012758150170804358, + "loss": 1.0912, + "step": 8580 + }, + { + "epoch": 0.47, + "grad_norm": 0.1630859375, + "learning_rate": 0.00012748959779754597, + "loss": 1.2631, + "step": 8585 + }, + { + "epoch": 0.47, + "grad_norm": 0.1650390625, + "learning_rate": 0.0001273976687635499, + "loss": 1.154, + "step": 8590 + }, + { + "epoch": 0.47, + "grad_norm": 0.1708984375, + "learning_rate": 0.0001273057146900718, + "loss": 1.1527, + "step": 8595 + }, + { + "epoch": 0.47, + "grad_norm": 0.1640625, + "learning_rate": 0.00012721373566115105, + "loss": 1.1861, + "step": 8600 + }, + { + "epoch": 0.47, + "grad_norm": 0.16796875, + "learning_rate": 0.00012712173176084978, + "loss": 1.1469, + "step": 8605 + }, + { + "epoch": 0.47, + "grad_norm": 0.1689453125, + "learning_rate": 0.00012702970307325293, + "loss": 1.2622, + "step": 8610 + }, + { + "epoch": 0.47, + "grad_norm": 0.1787109375, + "learning_rate": 0.000126937649682468, + "loss": 1.198, + "step": 8615 + }, + { + "epoch": 0.47, + "grad_norm": 0.166015625, + "learning_rate": 0.00012684557167262517, + "loss": 1.1417, + "step": 8620 + }, + { + "epoch": 0.47, + "grad_norm": 0.1669921875, + "learning_rate": 0.000126753469127877, + "loss": 1.2068, + "step": 8625 + }, + { + "epoch": 0.47, + "grad_norm": 0.1591796875, + "learning_rate": 0.00012666134213239853, + "loss": 1.2055, + "step": 8630 + }, + { + "epoch": 0.47, + "grad_norm": 0.166015625, + "learning_rate": 0.0001265691907703872, + "loss": 1.0553, + "step": 8635 + }, + { + "epoch": 0.47, + "grad_norm": 0.1728515625, + "learning_rate": 0.00012647701512606262, + "loss": 1.082, + "step": 8640 + }, + { + "epoch": 0.47, + "grad_norm": 0.1669921875, + "learning_rate": 0.00012638481528366675, + "loss": 1.186, + "step": 8645 + }, + { + "epoch": 0.47, + "grad_norm": 0.1748046875, + "learning_rate": 0.00012629259132746344, + "loss": 1.199, + "step": 8650 + }, + { + "epoch": 0.47, + "grad_norm": 0.1669921875, + "learning_rate": 0.00012620034334173873, + "loss": 1.1182, + "step": 8655 + }, + { + "epoch": 0.47, + "grad_norm": 0.1640625, + "learning_rate": 0.00012610807141080058, + "loss": 1.1438, + "step": 8660 + }, + { + "epoch": 0.47, + "grad_norm": 0.171875, + "learning_rate": 0.0001260157756189789, + "loss": 1.1269, + "step": 8665 + }, + { + "epoch": 0.47, + "grad_norm": 0.16796875, + "learning_rate": 0.0001259234560506253, + "loss": 1.202, + "step": 8670 + }, + { + "epoch": 0.48, + "grad_norm": 0.1708984375, + "learning_rate": 0.00012583111279011318, + "loss": 1.1333, + "step": 8675 + }, + { + "epoch": 0.48, + "grad_norm": 0.171875, + "learning_rate": 0.00012573874592183756, + "loss": 1.213, + "step": 8680 + }, + { + "epoch": 0.48, + "grad_norm": 0.169921875, + "learning_rate": 0.00012564635553021511, + "loss": 1.1436, + "step": 8685 + }, + { + "epoch": 0.48, + "grad_norm": 0.169921875, + "learning_rate": 0.0001255539416996839, + "loss": 1.1571, + "step": 8690 + }, + { + "epoch": 0.48, + "grad_norm": 0.169921875, + "learning_rate": 0.00012546150451470348, + "loss": 1.2428, + "step": 8695 + }, + { + "epoch": 0.48, + "grad_norm": 0.1689453125, + "learning_rate": 0.00012536904405975472, + "loss": 1.2165, + "step": 8700 + }, + { + "epoch": 0.48, + "grad_norm": 0.1650390625, + "learning_rate": 0.00012527656041933983, + "loss": 1.1764, + "step": 8705 + }, + { + "epoch": 0.48, + "grad_norm": 0.171875, + "learning_rate": 0.00012518405367798206, + "loss": 1.1994, + "step": 8710 + }, + { + "epoch": 0.48, + "grad_norm": 0.1708984375, + "learning_rate": 0.00012509152392022591, + "loss": 1.2052, + "step": 8715 + }, + { + "epoch": 0.48, + "grad_norm": 0.1787109375, + "learning_rate": 0.00012499897123063687, + "loss": 1.1536, + "step": 8720 + }, + { + "epoch": 0.48, + "grad_norm": 0.166015625, + "learning_rate": 0.0001249063956938014, + "loss": 1.1935, + "step": 8725 + }, + { + "epoch": 0.48, + "grad_norm": 0.169921875, + "learning_rate": 0.00012481379739432674, + "loss": 1.1697, + "step": 8730 + }, + { + "epoch": 0.48, + "grad_norm": 0.1767578125, + "learning_rate": 0.00012472117641684112, + "loss": 1.1128, + "step": 8735 + }, + { + "epoch": 0.48, + "grad_norm": 0.1669921875, + "learning_rate": 0.00012462853284599335, + "loss": 1.1837, + "step": 8740 + }, + { + "epoch": 0.48, + "grad_norm": 0.16796875, + "learning_rate": 0.00012453586676645296, + "loss": 1.1359, + "step": 8745 + }, + { + "epoch": 0.48, + "grad_norm": 0.169921875, + "learning_rate": 0.00012444317826290998, + "loss": 1.1335, + "step": 8750 + }, + { + "epoch": 0.48, + "grad_norm": 0.1650390625, + "learning_rate": 0.00012435046742007504, + "loss": 1.1713, + "step": 8755 + }, + { + "epoch": 0.48, + "grad_norm": 0.1640625, + "learning_rate": 0.0001242577343226791, + "loss": 1.0868, + "step": 8760 + }, + { + "epoch": 0.48, + "grad_norm": 0.1611328125, + "learning_rate": 0.00012416497905547345, + "loss": 1.2478, + "step": 8765 + }, + { + "epoch": 0.48, + "grad_norm": 0.1728515625, + "learning_rate": 0.0001240722017032297, + "loss": 1.1887, + "step": 8770 + }, + { + "epoch": 0.48, + "grad_norm": 0.171875, + "learning_rate": 0.00012397940235073964, + "loss": 1.1038, + "step": 8775 + }, + { + "epoch": 0.48, + "grad_norm": 0.1650390625, + "learning_rate": 0.0001238865810828151, + "loss": 1.1008, + "step": 8780 + }, + { + "epoch": 0.48, + "grad_norm": 0.1767578125, + "learning_rate": 0.00012379373798428807, + "loss": 1.1604, + "step": 8785 + }, + { + "epoch": 0.48, + "grad_norm": 0.1689453125, + "learning_rate": 0.00012370087314001035, + "loss": 1.1985, + "step": 8790 + }, + { + "epoch": 0.48, + "grad_norm": 0.1689453125, + "learning_rate": 0.00012360798663485365, + "loss": 1.1248, + "step": 8795 + }, + { + "epoch": 0.48, + "grad_norm": 0.1650390625, + "learning_rate": 0.00012351507855370953, + "loss": 1.1265, + "step": 8800 + }, + { + "epoch": 0.48, + "grad_norm": 0.16796875, + "learning_rate": 0.00012342214898148924, + "loss": 1.1833, + "step": 8805 + }, + { + "epoch": 0.48, + "grad_norm": 0.173828125, + "learning_rate": 0.00012332919800312365, + "loss": 1.2389, + "step": 8810 + }, + { + "epoch": 0.48, + "grad_norm": 0.1650390625, + "learning_rate": 0.00012323622570356324, + "loss": 1.1159, + "step": 8815 + }, + { + "epoch": 0.48, + "grad_norm": 0.1689453125, + "learning_rate": 0.0001231432321677779, + "loss": 1.1641, + "step": 8820 + }, + { + "epoch": 0.48, + "grad_norm": 0.1640625, + "learning_rate": 0.00012305021748075705, + "loss": 1.1587, + "step": 8825 + }, + { + "epoch": 0.48, + "grad_norm": 0.15625, + "learning_rate": 0.0001229571817275093, + "loss": 1.1954, + "step": 8830 + }, + { + "epoch": 0.48, + "grad_norm": 0.166015625, + "learning_rate": 0.0001228641249930626, + "loss": 1.1489, + "step": 8835 + }, + { + "epoch": 0.48, + "grad_norm": 0.169921875, + "learning_rate": 0.00012277104736246408, + "loss": 1.095, + "step": 8840 + }, + { + "epoch": 0.48, + "grad_norm": 0.173828125, + "learning_rate": 0.00012267794892077992, + "loss": 1.1478, + "step": 8845 + }, + { + "epoch": 0.48, + "grad_norm": 0.1669921875, + "learning_rate": 0.00012258482975309535, + "loss": 1.1428, + "step": 8850 + }, + { + "epoch": 0.49, + "grad_norm": 0.1640625, + "learning_rate": 0.00012249168994451451, + "loss": 1.1707, + "step": 8855 + }, + { + "epoch": 0.49, + "grad_norm": 0.1728515625, + "learning_rate": 0.0001223985295801605, + "loss": 1.1852, + "step": 8860 + }, + { + "epoch": 0.49, + "grad_norm": 0.1689453125, + "learning_rate": 0.00012230534874517506, + "loss": 1.1895, + "step": 8865 + }, + { + "epoch": 0.49, + "grad_norm": 0.173828125, + "learning_rate": 0.00012221214752471875, + "loss": 1.2153, + "step": 8870 + }, + { + "epoch": 0.49, + "grad_norm": 0.166015625, + "learning_rate": 0.0001221189260039707, + "loss": 1.1326, + "step": 8875 + }, + { + "epoch": 0.49, + "grad_norm": 0.169921875, + "learning_rate": 0.00012202568426812866, + "loss": 1.1269, + "step": 8880 + }, + { + "epoch": 0.49, + "grad_norm": 0.169921875, + "learning_rate": 0.00012193242240240878, + "loss": 1.2114, + "step": 8885 + }, + { + "epoch": 0.49, + "grad_norm": 0.166015625, + "learning_rate": 0.00012183914049204563, + "loss": 1.2026, + "step": 8890 + }, + { + "epoch": 0.49, + "grad_norm": 0.1689453125, + "learning_rate": 0.00012174583862229217, + "loss": 1.0864, + "step": 8895 + }, + { + "epoch": 0.49, + "grad_norm": 0.169921875, + "learning_rate": 0.00012165251687841948, + "loss": 1.1961, + "step": 8900 + }, + { + "epoch": 0.49, + "grad_norm": 0.1650390625, + "learning_rate": 0.00012155917534571688, + "loss": 1.231, + "step": 8905 + }, + { + "epoch": 0.49, + "grad_norm": 0.1650390625, + "learning_rate": 0.00012146581410949176, + "loss": 1.1196, + "step": 8910 + }, + { + "epoch": 0.49, + "grad_norm": 0.181640625, + "learning_rate": 0.00012137243325506955, + "loss": 1.1796, + "step": 8915 + }, + { + "epoch": 0.49, + "grad_norm": 0.1689453125, + "learning_rate": 0.00012127903286779352, + "loss": 1.1415, + "step": 8920 + }, + { + "epoch": 0.49, + "grad_norm": 0.1669921875, + "learning_rate": 0.00012118561303302491, + "loss": 1.1687, + "step": 8925 + }, + { + "epoch": 0.49, + "grad_norm": 0.1748046875, + "learning_rate": 0.00012109217383614264, + "loss": 1.1745, + "step": 8930 + }, + { + "epoch": 0.49, + "grad_norm": 0.171875, + "learning_rate": 0.00012099871536254337, + "loss": 1.1503, + "step": 8935 + }, + { + "epoch": 0.49, + "grad_norm": 0.1728515625, + "learning_rate": 0.00012090523769764135, + "loss": 1.1252, + "step": 8940 + }, + { + "epoch": 0.49, + "grad_norm": 0.158203125, + "learning_rate": 0.00012081174092686842, + "loss": 1.0908, + "step": 8945 + }, + { + "epoch": 0.49, + "grad_norm": 0.166015625, + "learning_rate": 0.0001207182251356738, + "loss": 1.1352, + "step": 8950 + }, + { + "epoch": 0.49, + "grad_norm": 0.1689453125, + "learning_rate": 0.00012062469040952415, + "loss": 1.1196, + "step": 8955 + }, + { + "epoch": 0.49, + "grad_norm": 0.1787109375, + "learning_rate": 0.00012053113683390346, + "loss": 1.2844, + "step": 8960 + }, + { + "epoch": 0.49, + "grad_norm": 0.1728515625, + "learning_rate": 0.00012043756449431289, + "loss": 1.2494, + "step": 8965 + }, + { + "epoch": 0.49, + "grad_norm": 0.169921875, + "learning_rate": 0.00012034397347627075, + "loss": 1.2029, + "step": 8970 + }, + { + "epoch": 0.49, + "grad_norm": 0.166015625, + "learning_rate": 0.0001202503638653125, + "loss": 1.1702, + "step": 8975 + }, + { + "epoch": 0.49, + "grad_norm": 0.1728515625, + "learning_rate": 0.00012015673574699047, + "loss": 1.1436, + "step": 8980 + }, + { + "epoch": 0.49, + "grad_norm": 0.169921875, + "learning_rate": 0.00012006308920687402, + "loss": 1.1993, + "step": 8985 + }, + { + "epoch": 0.49, + "grad_norm": 0.1611328125, + "learning_rate": 0.00011996942433054927, + "loss": 1.0998, + "step": 8990 + }, + { + "epoch": 0.49, + "grad_norm": 0.1640625, + "learning_rate": 0.00011987574120361911, + "loss": 1.112, + "step": 8995 + }, + { + "epoch": 0.49, + "grad_norm": 0.162109375, + "learning_rate": 0.00011978203991170313, + "loss": 1.1976, + "step": 9000 + }, + { + "epoch": 0.49, + "grad_norm": 0.16015625, + "learning_rate": 0.00011968832054043755, + "loss": 1.1803, + "step": 9005 + }, + { + "epoch": 0.49, + "grad_norm": 0.162109375, + "learning_rate": 0.00011959458317547503, + "loss": 1.1088, + "step": 9010 + }, + { + "epoch": 0.49, + "grad_norm": 0.1650390625, + "learning_rate": 0.00011950082790248473, + "loss": 1.1638, + "step": 9015 + }, + { + "epoch": 0.49, + "grad_norm": 0.1796875, + "learning_rate": 0.00011940705480715222, + "loss": 1.1852, + "step": 9020 + }, + { + "epoch": 0.49, + "grad_norm": 0.16796875, + "learning_rate": 0.00011931326397517924, + "loss": 1.1152, + "step": 9025 + }, + { + "epoch": 0.49, + "grad_norm": 0.169921875, + "learning_rate": 0.00011921945549228386, + "loss": 1.1023, + "step": 9030 + }, + { + "epoch": 0.49, + "grad_norm": 0.1728515625, + "learning_rate": 0.00011912562944420018, + "loss": 1.191, + "step": 9035 + }, + { + "epoch": 0.5, + "grad_norm": 0.16796875, + "learning_rate": 0.00011903178591667845, + "loss": 1.1539, + "step": 9040 + }, + { + "epoch": 0.5, + "grad_norm": 0.1708984375, + "learning_rate": 0.00011893792499548482, + "loss": 1.1156, + "step": 9045 + }, + { + "epoch": 0.5, + "grad_norm": 0.1748046875, + "learning_rate": 0.00011884404676640137, + "loss": 1.2176, + "step": 9050 + }, + { + "epoch": 0.5, + "grad_norm": 0.1572265625, + "learning_rate": 0.000118750151315226, + "loss": 1.2067, + "step": 9055 + }, + { + "epoch": 0.5, + "grad_norm": 0.16796875, + "learning_rate": 0.00011865623872777232, + "loss": 1.1671, + "step": 9060 + }, + { + "epoch": 0.5, + "grad_norm": 0.1640625, + "learning_rate": 0.00011856230908986965, + "loss": 1.1284, + "step": 9065 + }, + { + "epoch": 0.5, + "grad_norm": 0.171875, + "learning_rate": 0.00011846836248736286, + "loss": 1.1997, + "step": 9070 + }, + { + "epoch": 0.5, + "grad_norm": 0.1640625, + "learning_rate": 0.00011837439900611229, + "loss": 1.1906, + "step": 9075 + }, + { + "epoch": 0.5, + "grad_norm": 0.1689453125, + "learning_rate": 0.00011828041873199382, + "loss": 1.1339, + "step": 9080 + }, + { + "epoch": 0.5, + "grad_norm": 0.1669921875, + "learning_rate": 0.00011818642175089855, + "loss": 1.2474, + "step": 9085 + }, + { + "epoch": 0.5, + "grad_norm": 0.162109375, + "learning_rate": 0.0001180924081487329, + "loss": 1.1568, + "step": 9090 + }, + { + "epoch": 0.5, + "grad_norm": 0.171875, + "learning_rate": 0.0001179983780114185, + "loss": 1.1465, + "step": 9095 + }, + { + "epoch": 0.5, + "grad_norm": 0.173828125, + "learning_rate": 0.00011790433142489208, + "loss": 1.2031, + "step": 9100 + }, + { + "epoch": 0.5, + "grad_norm": 0.1767578125, + "learning_rate": 0.00011781026847510541, + "loss": 1.1925, + "step": 9105 + }, + { + "epoch": 0.5, + "grad_norm": 0.162109375, + "learning_rate": 0.00011771618924802516, + "loss": 1.158, + "step": 9110 + }, + { + "epoch": 0.5, + "grad_norm": 0.166015625, + "learning_rate": 0.00011762209382963298, + "loss": 1.2164, + "step": 9115 + }, + { + "epoch": 0.5, + "grad_norm": 0.162109375, + "learning_rate": 0.00011752798230592521, + "loss": 1.1389, + "step": 9120 + }, + { + "epoch": 0.5, + "grad_norm": 0.1728515625, + "learning_rate": 0.00011743385476291298, + "loss": 1.137, + "step": 9125 + }, + { + "epoch": 0.5, + "grad_norm": 0.171875, + "learning_rate": 0.00011733971128662202, + "loss": 1.1422, + "step": 9130 + }, + { + "epoch": 0.5, + "grad_norm": 0.1650390625, + "learning_rate": 0.00011724555196309266, + "loss": 1.172, + "step": 9135 + }, + { + "epoch": 0.5, + "grad_norm": 0.162109375, + "learning_rate": 0.00011715137687837965, + "loss": 1.1292, + "step": 9140 + }, + { + "epoch": 0.5, + "grad_norm": 0.1728515625, + "learning_rate": 0.00011705718611855223, + "loss": 1.164, + "step": 9145 + }, + { + "epoch": 0.5, + "grad_norm": 0.173828125, + "learning_rate": 0.0001169629797696939, + "loss": 1.1617, + "step": 9150 + }, + { + "epoch": 0.5, + "grad_norm": 0.1689453125, + "learning_rate": 0.00011686875791790246, + "loss": 1.1447, + "step": 9155 + }, + { + "epoch": 0.5, + "grad_norm": 0.1748046875, + "learning_rate": 0.0001167745206492898, + "loss": 1.1206, + "step": 9160 + }, + { + "epoch": 0.5, + "grad_norm": 0.16796875, + "learning_rate": 0.000116680268049982, + "loss": 1.0839, + "step": 9165 + }, + { + "epoch": 0.5, + "grad_norm": 0.177734375, + "learning_rate": 0.00011658600020611906, + "loss": 1.1799, + "step": 9170 + }, + { + "epoch": 0.5, + "grad_norm": 0.1728515625, + "learning_rate": 0.00011649171720385496, + "loss": 1.1548, + "step": 9175 + }, + { + "epoch": 0.5, + "grad_norm": 0.17578125, + "learning_rate": 0.00011639741912935753, + "loss": 1.1429, + "step": 9180 + }, + { + "epoch": 0.5, + "grad_norm": 0.1630859375, + "learning_rate": 0.00011630310606880839, + "loss": 1.1355, + "step": 9185 + }, + { + "epoch": 0.5, + "grad_norm": 0.1630859375, + "learning_rate": 0.0001162087781084028, + "loss": 1.1415, + "step": 9190 + }, + { + "epoch": 0.5, + "grad_norm": 0.16796875, + "learning_rate": 0.0001161144353343497, + "loss": 1.1226, + "step": 9195 + }, + { + "epoch": 0.5, + "grad_norm": 0.1689453125, + "learning_rate": 0.00011602007783287153, + "loss": 1.1547, + "step": 9200 + }, + { + "epoch": 0.5, + "grad_norm": 0.171875, + "learning_rate": 0.0001159257056902042, + "loss": 1.277, + "step": 9205 + }, + { + "epoch": 0.5, + "grad_norm": 0.1845703125, + "learning_rate": 0.00011583131899259702, + "loss": 1.1556, + "step": 9210 + }, + { + "epoch": 0.5, + "grad_norm": 0.1650390625, + "learning_rate": 0.00011573691782631256, + "loss": 1.1532, + "step": 9215 + }, + { + "epoch": 0.51, + "grad_norm": 0.171875, + "learning_rate": 0.00011564250227762668, + "loss": 1.1602, + "step": 9220 + }, + { + "epoch": 0.51, + "grad_norm": 0.166015625, + "learning_rate": 0.00011554807243282828, + "loss": 1.1712, + "step": 9225 + }, + { + "epoch": 0.51, + "grad_norm": 0.16796875, + "learning_rate": 0.00011545362837821947, + "loss": 1.1379, + "step": 9230 + }, + { + "epoch": 0.51, + "grad_norm": 0.17578125, + "learning_rate": 0.0001153591702001152, + "loss": 1.2032, + "step": 9235 + }, + { + "epoch": 0.51, + "grad_norm": 0.1748046875, + "learning_rate": 0.00011526469798484341, + "loss": 1.1966, + "step": 9240 + }, + { + "epoch": 0.51, + "grad_norm": 0.1669921875, + "learning_rate": 0.00011517021181874486, + "loss": 1.1637, + "step": 9245 + }, + { + "epoch": 0.51, + "grad_norm": 0.1728515625, + "learning_rate": 0.00011507571178817307, + "loss": 1.1833, + "step": 9250 + }, + { + "epoch": 0.51, + "grad_norm": 0.1669921875, + "learning_rate": 0.00011498119797949416, + "loss": 1.1807, + "step": 9255 + }, + { + "epoch": 0.51, + "grad_norm": 0.1630859375, + "learning_rate": 0.00011488667047908695, + "loss": 1.0899, + "step": 9260 + }, + { + "epoch": 0.51, + "grad_norm": 0.1640625, + "learning_rate": 0.00011479212937334267, + "loss": 1.1128, + "step": 9265 + }, + { + "epoch": 0.51, + "grad_norm": 0.173828125, + "learning_rate": 0.00011469757474866511, + "loss": 1.2238, + "step": 9270 + }, + { + "epoch": 0.51, + "grad_norm": 0.1787109375, + "learning_rate": 0.00011460300669147024, + "loss": 1.1462, + "step": 9275 + }, + { + "epoch": 0.51, + "grad_norm": 0.173828125, + "learning_rate": 0.00011450842528818649, + "loss": 1.2737, + "step": 9280 + }, + { + "epoch": 0.51, + "grad_norm": 0.162109375, + "learning_rate": 0.00011441383062525435, + "loss": 1.1189, + "step": 9285 + }, + { + "epoch": 0.51, + "grad_norm": 0.177734375, + "learning_rate": 0.0001143192227891265, + "loss": 1.1053, + "step": 9290 + }, + { + "epoch": 0.51, + "grad_norm": 0.1669921875, + "learning_rate": 0.00011422460186626765, + "loss": 1.2605, + "step": 9295 + }, + { + "epoch": 0.51, + "grad_norm": 0.16796875, + "learning_rate": 0.00011412996794315445, + "loss": 1.1205, + "step": 9300 + }, + { + "epoch": 0.51, + "grad_norm": 0.1689453125, + "learning_rate": 0.00011403532110627541, + "loss": 1.1824, + "step": 9305 + }, + { + "epoch": 0.51, + "grad_norm": 0.169921875, + "learning_rate": 0.00011394066144213094, + "loss": 1.1099, + "step": 9310 + }, + { + "epoch": 0.51, + "grad_norm": 0.16796875, + "learning_rate": 0.00011384598903723307, + "loss": 1.0561, + "step": 9315 + }, + { + "epoch": 0.51, + "grad_norm": 0.1591796875, + "learning_rate": 0.00011375130397810547, + "loss": 1.1908, + "step": 9320 + }, + { + "epoch": 0.51, + "grad_norm": 0.1708984375, + "learning_rate": 0.0001136566063512835, + "loss": 1.1312, + "step": 9325 + }, + { + "epoch": 0.51, + "grad_norm": 0.1650390625, + "learning_rate": 0.00011356189624331386, + "loss": 1.1794, + "step": 9330 + }, + { + "epoch": 0.51, + "grad_norm": 0.16796875, + "learning_rate": 0.00011346717374075478, + "loss": 1.1816, + "step": 9335 + }, + { + "epoch": 0.51, + "grad_norm": 0.1689453125, + "learning_rate": 0.00011337243893017566, + "loss": 1.229, + "step": 9340 + }, + { + "epoch": 0.51, + "grad_norm": 0.1689453125, + "learning_rate": 0.00011327769189815733, + "loss": 1.1432, + "step": 9345 + }, + { + "epoch": 0.51, + "grad_norm": 0.173828125, + "learning_rate": 0.00011318293273129166, + "loss": 1.1915, + "step": 9350 + }, + { + "epoch": 0.51, + "grad_norm": 0.1650390625, + "learning_rate": 0.00011308816151618171, + "loss": 1.1272, + "step": 9355 + }, + { + "epoch": 0.51, + "grad_norm": 0.1708984375, + "learning_rate": 0.00011299337833944141, + "loss": 1.2189, + "step": 9360 + }, + { + "epoch": 0.51, + "grad_norm": 0.1630859375, + "learning_rate": 0.0001128985832876958, + "loss": 1.1982, + "step": 9365 + }, + { + "epoch": 0.51, + "grad_norm": 0.1669921875, + "learning_rate": 0.00011280377644758063, + "loss": 1.0604, + "step": 9370 + }, + { + "epoch": 0.51, + "grad_norm": 0.169921875, + "learning_rate": 0.00011270895790574246, + "loss": 1.2464, + "step": 9375 + }, + { + "epoch": 0.51, + "grad_norm": 0.1728515625, + "learning_rate": 0.00011261412774883857, + "loss": 1.149, + "step": 9380 + }, + { + "epoch": 0.51, + "grad_norm": 0.171875, + "learning_rate": 0.0001125192860635369, + "loss": 1.1785, + "step": 9385 + }, + { + "epoch": 0.51, + "grad_norm": 0.177734375, + "learning_rate": 0.00011242443293651583, + "loss": 1.1137, + "step": 9390 + }, + { + "epoch": 0.51, + "grad_norm": 0.169921875, + "learning_rate": 0.0001123295684544642, + "loss": 1.1899, + "step": 9395 + }, + { + "epoch": 0.51, + "grad_norm": 0.1689453125, + "learning_rate": 0.00011223469270408136, + "loss": 1.1728, + "step": 9400 + }, + { + "epoch": 0.52, + "grad_norm": 0.162109375, + "learning_rate": 0.00011213980577207677, + "loss": 1.1643, + "step": 9405 + }, + { + "epoch": 0.52, + "grad_norm": 0.1689453125, + "learning_rate": 0.00011204490774517026, + "loss": 1.1044, + "step": 9410 + }, + { + "epoch": 0.52, + "grad_norm": 0.16796875, + "learning_rate": 0.00011194999871009171, + "loss": 1.199, + "step": 9415 + }, + { + "epoch": 0.52, + "grad_norm": 0.1669921875, + "learning_rate": 0.00011185507875358112, + "loss": 1.1379, + "step": 9420 + }, + { + "epoch": 0.52, + "grad_norm": 0.1689453125, + "learning_rate": 0.00011176014796238842, + "loss": 1.2192, + "step": 9425 + }, + { + "epoch": 0.52, + "grad_norm": 0.169921875, + "learning_rate": 0.00011166520642327352, + "loss": 1.1892, + "step": 9430 + }, + { + "epoch": 0.52, + "grad_norm": 0.1689453125, + "learning_rate": 0.00011157025422300603, + "loss": 1.1677, + "step": 9435 + }, + { + "epoch": 0.52, + "grad_norm": 0.1708984375, + "learning_rate": 0.00011147529144836541, + "loss": 1.2184, + "step": 9440 + }, + { + "epoch": 0.52, + "grad_norm": 0.1708984375, + "learning_rate": 0.00011138031818614072, + "loss": 1.1363, + "step": 9445 + }, + { + "epoch": 0.52, + "grad_norm": 0.1708984375, + "learning_rate": 0.0001112853345231307, + "loss": 1.2203, + "step": 9450 + }, + { + "epoch": 0.52, + "grad_norm": 0.173828125, + "learning_rate": 0.00011119034054614346, + "loss": 1.1162, + "step": 9455 + }, + { + "epoch": 0.52, + "grad_norm": 0.1591796875, + "learning_rate": 0.00011109533634199662, + "loss": 1.0821, + "step": 9460 + }, + { + "epoch": 0.52, + "grad_norm": 0.1767578125, + "learning_rate": 0.00011100032199751718, + "loss": 1.2297, + "step": 9465 + }, + { + "epoch": 0.52, + "grad_norm": 0.16015625, + "learning_rate": 0.00011090529759954129, + "loss": 1.1898, + "step": 9470 + }, + { + "epoch": 0.52, + "grad_norm": 0.173828125, + "learning_rate": 0.0001108102632349144, + "loss": 1.2135, + "step": 9475 + }, + { + "epoch": 0.52, + "grad_norm": 0.1630859375, + "learning_rate": 0.00011071521899049102, + "loss": 1.2227, + "step": 9480 + }, + { + "epoch": 0.52, + "grad_norm": 0.1650390625, + "learning_rate": 0.0001106201649531347, + "loss": 1.1815, + "step": 9485 + }, + { + "epoch": 0.52, + "grad_norm": 0.1669921875, + "learning_rate": 0.00011052510120971791, + "loss": 1.2157, + "step": 9490 + }, + { + "epoch": 0.52, + "grad_norm": 0.1630859375, + "learning_rate": 0.00011043002784712204, + "loss": 1.1603, + "step": 9495 + }, + { + "epoch": 0.52, + "grad_norm": 0.169921875, + "learning_rate": 0.00011033494495223724, + "loss": 1.2134, + "step": 9500 + }, + { + "epoch": 0.52, + "grad_norm": 0.166015625, + "learning_rate": 0.0001102398526119624, + "loss": 1.1588, + "step": 9505 + }, + { + "epoch": 0.52, + "grad_norm": 0.171875, + "learning_rate": 0.00011014475091320496, + "loss": 1.1676, + "step": 9510 + }, + { + "epoch": 0.52, + "grad_norm": 0.1689453125, + "learning_rate": 0.00011004963994288102, + "loss": 1.2596, + "step": 9515 + }, + { + "epoch": 0.52, + "grad_norm": 0.1669921875, + "learning_rate": 0.00010995451978791511, + "loss": 1.2051, + "step": 9520 + }, + { + "epoch": 0.52, + "grad_norm": 0.169921875, + "learning_rate": 0.00010985939053524017, + "loss": 1.1736, + "step": 9525 + }, + { + "epoch": 0.52, + "grad_norm": 0.173828125, + "learning_rate": 0.00010976425227179738, + "loss": 1.2479, + "step": 9530 + }, + { + "epoch": 0.52, + "grad_norm": 0.1708984375, + "learning_rate": 0.00010966910508453627, + "loss": 1.1076, + "step": 9535 + }, + { + "epoch": 0.52, + "grad_norm": 0.1669921875, + "learning_rate": 0.00010957394906041442, + "loss": 1.1039, + "step": 9540 + }, + { + "epoch": 0.52, + "grad_norm": 0.1650390625, + "learning_rate": 0.00010947878428639757, + "loss": 1.2402, + "step": 9545 + }, + { + "epoch": 0.52, + "grad_norm": 0.1689453125, + "learning_rate": 0.0001093836108494594, + "loss": 1.1539, + "step": 9550 + }, + { + "epoch": 0.52, + "grad_norm": 0.173828125, + "learning_rate": 0.00010928842883658152, + "loss": 1.2417, + "step": 9555 + }, + { + "epoch": 0.52, + "grad_norm": 0.169921875, + "learning_rate": 0.00010919323833475343, + "loss": 1.1806, + "step": 9560 + }, + { + "epoch": 0.52, + "grad_norm": 0.1591796875, + "learning_rate": 0.00010909803943097233, + "loss": 1.1364, + "step": 9565 + }, + { + "epoch": 0.52, + "grad_norm": 0.173828125, + "learning_rate": 0.00010900283221224306, + "loss": 1.2511, + "step": 9570 + }, + { + "epoch": 0.52, + "grad_norm": 0.17578125, + "learning_rate": 0.00010890761676557816, + "loss": 1.1083, + "step": 9575 + }, + { + "epoch": 0.52, + "grad_norm": 0.17578125, + "learning_rate": 0.00010881239317799765, + "loss": 1.0718, + "step": 9580 + }, + { + "epoch": 0.53, + "grad_norm": 0.1728515625, + "learning_rate": 0.00010871716153652895, + "loss": 1.1785, + "step": 9585 + }, + { + "epoch": 0.53, + "grad_norm": 0.169921875, + "learning_rate": 0.00010862192192820693, + "loss": 1.1352, + "step": 9590 + }, + { + "epoch": 0.53, + "grad_norm": 0.1796875, + "learning_rate": 0.00010852667444007358, + "loss": 1.1345, + "step": 9595 + }, + { + "epoch": 0.53, + "grad_norm": 0.1728515625, + "learning_rate": 0.00010843141915917829, + "loss": 1.1186, + "step": 9600 + }, + { + "epoch": 0.53, + "grad_norm": 0.1640625, + "learning_rate": 0.00010833615617257743, + "loss": 1.1213, + "step": 9605 + }, + { + "epoch": 0.53, + "grad_norm": 0.1748046875, + "learning_rate": 0.00010824088556733452, + "loss": 1.1469, + "step": 9610 + }, + { + "epoch": 0.53, + "grad_norm": 0.169921875, + "learning_rate": 0.0001081456074305199, + "loss": 1.1345, + "step": 9615 + }, + { + "epoch": 0.53, + "grad_norm": 0.1708984375, + "learning_rate": 0.00010805032184921091, + "loss": 1.1589, + "step": 9620 + }, + { + "epoch": 0.53, + "grad_norm": 0.16796875, + "learning_rate": 0.00010795502891049167, + "loss": 1.1469, + "step": 9625 + }, + { + "epoch": 0.53, + "grad_norm": 0.1748046875, + "learning_rate": 0.00010785972870145302, + "loss": 1.1811, + "step": 9630 + }, + { + "epoch": 0.53, + "grad_norm": 0.1689453125, + "learning_rate": 0.00010776442130919241, + "loss": 1.1993, + "step": 9635 + }, + { + "epoch": 0.53, + "grad_norm": 0.1787109375, + "learning_rate": 0.0001076691068208139, + "loss": 1.1724, + "step": 9640 + }, + { + "epoch": 0.53, + "grad_norm": 0.1640625, + "learning_rate": 0.00010757378532342803, + "loss": 1.1881, + "step": 9645 + }, + { + "epoch": 0.53, + "grad_norm": 0.171875, + "learning_rate": 0.00010747845690415172, + "loss": 1.1064, + "step": 9650 + }, + { + "epoch": 0.53, + "grad_norm": 0.1669921875, + "learning_rate": 0.00010738312165010824, + "loss": 1.1816, + "step": 9655 + }, + { + "epoch": 0.53, + "grad_norm": 0.171875, + "learning_rate": 0.0001072877796484271, + "loss": 1.1331, + "step": 9660 + }, + { + "epoch": 0.53, + "grad_norm": 0.1689453125, + "learning_rate": 0.00010719243098624394, + "loss": 1.215, + "step": 9665 + }, + { + "epoch": 0.53, + "grad_norm": 0.1630859375, + "learning_rate": 0.00010709707575070059, + "loss": 1.1443, + "step": 9670 + }, + { + "epoch": 0.53, + "grad_norm": 0.1689453125, + "learning_rate": 0.00010700171402894472, + "loss": 1.0988, + "step": 9675 + }, + { + "epoch": 0.53, + "grad_norm": 0.16796875, + "learning_rate": 0.00010690634590813014, + "loss": 1.157, + "step": 9680 + }, + { + "epoch": 0.53, + "grad_norm": 0.16796875, + "learning_rate": 0.0001068109714754163, + "loss": 1.1724, + "step": 9685 + }, + { + "epoch": 0.53, + "grad_norm": 0.1748046875, + "learning_rate": 0.00010671559081796857, + "loss": 1.1435, + "step": 9690 + }, + { + "epoch": 0.53, + "grad_norm": 0.173828125, + "learning_rate": 0.00010662020402295796, + "loss": 1.1928, + "step": 9695 + }, + { + "epoch": 0.53, + "grad_norm": 0.1728515625, + "learning_rate": 0.000106524811177561, + "loss": 1.1886, + "step": 9700 + }, + { + "epoch": 0.53, + "grad_norm": 0.1884765625, + "learning_rate": 0.00010642941236895994, + "loss": 1.1617, + "step": 9705 + }, + { + "epoch": 0.53, + "grad_norm": 0.1708984375, + "learning_rate": 0.0001063340076843423, + "loss": 1.1931, + "step": 9710 + }, + { + "epoch": 0.53, + "grad_norm": 0.1591796875, + "learning_rate": 0.00010623859721090106, + "loss": 1.2535, + "step": 9715 + }, + { + "epoch": 0.53, + "grad_norm": 0.1689453125, + "learning_rate": 0.00010614318103583445, + "loss": 1.1818, + "step": 9720 + }, + { + "epoch": 0.53, + "grad_norm": 0.171875, + "learning_rate": 0.000106047759246346, + "loss": 1.1468, + "step": 9725 + }, + { + "epoch": 0.53, + "grad_norm": 0.1728515625, + "learning_rate": 0.00010595233192964422, + "loss": 1.194, + "step": 9730 + }, + { + "epoch": 0.53, + "grad_norm": 0.173828125, + "learning_rate": 0.00010585689917294282, + "loss": 1.1362, + "step": 9735 + }, + { + "epoch": 0.53, + "grad_norm": 0.162109375, + "learning_rate": 0.00010576146106346037, + "loss": 1.178, + "step": 9740 + }, + { + "epoch": 0.53, + "grad_norm": 0.1806640625, + "learning_rate": 0.00010566601768842043, + "loss": 1.1273, + "step": 9745 + }, + { + "epoch": 0.53, + "grad_norm": 0.166015625, + "learning_rate": 0.0001055705691350513, + "loss": 1.1585, + "step": 9750 + }, + { + "epoch": 0.53, + "grad_norm": 0.1708984375, + "learning_rate": 0.00010547511549058604, + "loss": 1.2623, + "step": 9755 + }, + { + "epoch": 0.53, + "grad_norm": 0.166015625, + "learning_rate": 0.00010537965684226233, + "loss": 1.1143, + "step": 9760 + }, + { + "epoch": 0.53, + "grad_norm": 0.1748046875, + "learning_rate": 0.00010528419327732247, + "loss": 1.1588, + "step": 9765 + }, + { + "epoch": 0.54, + "grad_norm": 0.1650390625, + "learning_rate": 0.00010518872488301323, + "loss": 1.0327, + "step": 9770 + }, + { + "epoch": 0.54, + "grad_norm": 0.1611328125, + "learning_rate": 0.00010509325174658582, + "loss": 1.0866, + "step": 9775 + }, + { + "epoch": 0.54, + "grad_norm": 0.16796875, + "learning_rate": 0.00010499777395529571, + "loss": 1.1541, + "step": 9780 + }, + { + "epoch": 0.54, + "grad_norm": 0.1650390625, + "learning_rate": 0.00010490229159640275, + "loss": 1.1544, + "step": 9785 + }, + { + "epoch": 0.54, + "grad_norm": 0.1669921875, + "learning_rate": 0.00010480680475717084, + "loss": 1.1667, + "step": 9790 + }, + { + "epoch": 0.54, + "grad_norm": 0.162109375, + "learning_rate": 0.00010471131352486801, + "loss": 1.137, + "step": 9795 + }, + { + "epoch": 0.54, + "grad_norm": 0.1689453125, + "learning_rate": 0.00010461581798676636, + "loss": 1.171, + "step": 9800 + }, + { + "epoch": 0.54, + "grad_norm": 0.1728515625, + "learning_rate": 0.00010452031823014186, + "loss": 1.1982, + "step": 9805 + }, + { + "epoch": 0.54, + "grad_norm": 0.1689453125, + "learning_rate": 0.00010442481434227437, + "loss": 1.1579, + "step": 9810 + }, + { + "epoch": 0.54, + "grad_norm": 0.1708984375, + "learning_rate": 0.00010432930641044754, + "loss": 1.1684, + "step": 9815 + }, + { + "epoch": 0.54, + "grad_norm": 0.171875, + "learning_rate": 0.00010423379452194863, + "loss": 1.1205, + "step": 9820 + }, + { + "epoch": 0.54, + "grad_norm": 0.1748046875, + "learning_rate": 0.00010413827876406866, + "loss": 1.2203, + "step": 9825 + }, + { + "epoch": 0.54, + "grad_norm": 0.16796875, + "learning_rate": 0.00010404275922410202, + "loss": 1.2415, + "step": 9830 + }, + { + "epoch": 0.54, + "grad_norm": 0.173828125, + "learning_rate": 0.00010394723598934668, + "loss": 1.1107, + "step": 9835 + }, + { + "epoch": 0.54, + "grad_norm": 0.162109375, + "learning_rate": 0.000103851709147104, + "loss": 1.1226, + "step": 9840 + }, + { + "epoch": 0.54, + "grad_norm": 0.169921875, + "learning_rate": 0.0001037561787846785, + "loss": 1.1235, + "step": 9845 + }, + { + "epoch": 0.54, + "grad_norm": 0.1650390625, + "learning_rate": 0.00010366064498937806, + "loss": 1.1031, + "step": 9850 + }, + { + "epoch": 0.54, + "grad_norm": 0.1689453125, + "learning_rate": 0.00010356510784851358, + "loss": 1.1934, + "step": 9855 + }, + { + "epoch": 0.54, + "grad_norm": 0.16796875, + "learning_rate": 0.00010346956744939918, + "loss": 1.1664, + "step": 9860 + }, + { + "epoch": 0.54, + "grad_norm": 0.1708984375, + "learning_rate": 0.00010337402387935173, + "loss": 1.2113, + "step": 9865 + }, + { + "epoch": 0.54, + "grad_norm": 0.1669921875, + "learning_rate": 0.00010327847722569121, + "loss": 1.1374, + "step": 9870 + }, + { + "epoch": 0.54, + "grad_norm": 0.1748046875, + "learning_rate": 0.00010318292757574029, + "loss": 1.1662, + "step": 9875 + }, + { + "epoch": 0.54, + "grad_norm": 0.1669921875, + "learning_rate": 0.00010308737501682444, + "loss": 1.2542, + "step": 9880 + }, + { + "epoch": 0.54, + "grad_norm": 0.173828125, + "learning_rate": 0.00010299181963627175, + "loss": 1.1446, + "step": 9885 + }, + { + "epoch": 0.54, + "grad_norm": 0.1708984375, + "learning_rate": 0.00010289626152141294, + "loss": 1.1309, + "step": 9890 + }, + { + "epoch": 0.54, + "grad_norm": 0.1787109375, + "learning_rate": 0.00010280070075958119, + "loss": 1.2414, + "step": 9895 + }, + { + "epoch": 0.54, + "grad_norm": 0.169921875, + "learning_rate": 0.00010270513743811205, + "loss": 1.1768, + "step": 9900 + }, + { + "epoch": 0.54, + "grad_norm": 0.1767578125, + "learning_rate": 0.00010260957164434352, + "loss": 1.1825, + "step": 9905 + }, + { + "epoch": 0.54, + "grad_norm": 0.1748046875, + "learning_rate": 0.00010251400346561578, + "loss": 1.1436, + "step": 9910 + }, + { + "epoch": 0.54, + "grad_norm": 0.1640625, + "learning_rate": 0.00010241843298927121, + "loss": 1.0725, + "step": 9915 + }, + { + "epoch": 0.54, + "grad_norm": 0.1796875, + "learning_rate": 0.0001023228603026543, + "loss": 1.136, + "step": 9920 + }, + { + "epoch": 0.54, + "grad_norm": 0.1640625, + "learning_rate": 0.0001022272854931116, + "loss": 1.1164, + "step": 9925 + }, + { + "epoch": 0.54, + "grad_norm": 0.1640625, + "learning_rate": 0.00010213170864799146, + "loss": 1.1769, + "step": 9930 + }, + { + "epoch": 0.54, + "grad_norm": 0.171875, + "learning_rate": 0.00010203612985464429, + "loss": 1.2069, + "step": 9935 + }, + { + "epoch": 0.54, + "grad_norm": 0.162109375, + "learning_rate": 0.00010194054920042208, + "loss": 1.1555, + "step": 9940 + }, + { + "epoch": 0.54, + "grad_norm": 0.1708984375, + "learning_rate": 0.00010184496677267867, + "loss": 1.1883, + "step": 9945 + }, + { + "epoch": 0.54, + "grad_norm": 0.1708984375, + "learning_rate": 0.00010174938265876945, + "loss": 1.1809, + "step": 9950 + }, + { + "epoch": 0.55, + "grad_norm": 0.169921875, + "learning_rate": 0.00010165379694605137, + "loss": 1.2525, + "step": 9955 + }, + { + "epoch": 0.55, + "grad_norm": 0.169921875, + "learning_rate": 0.00010155820972188284, + "loss": 1.1263, + "step": 9960 + }, + { + "epoch": 0.55, + "grad_norm": 0.1806640625, + "learning_rate": 0.00010146262107362363, + "loss": 1.1874, + "step": 9965 + }, + { + "epoch": 0.55, + "grad_norm": 0.1806640625, + "learning_rate": 0.00010136703108863486, + "loss": 1.2121, + "step": 9970 + }, + { + "epoch": 0.55, + "grad_norm": 0.173828125, + "learning_rate": 0.00010127143985427883, + "loss": 1.1992, + "step": 9975 + }, + { + "epoch": 0.55, + "grad_norm": 0.2109375, + "learning_rate": 0.00010117584745791898, + "loss": 1.166, + "step": 9980 + }, + { + "epoch": 0.55, + "grad_norm": 0.166015625, + "learning_rate": 0.00010108025398691988, + "loss": 1.1574, + "step": 9985 + }, + { + "epoch": 0.55, + "grad_norm": 0.173828125, + "learning_rate": 0.00010098465952864693, + "loss": 1.1448, + "step": 9990 + }, + { + "epoch": 0.55, + "grad_norm": 0.171875, + "learning_rate": 0.00010088906417046665, + "loss": 1.1529, + "step": 9995 + }, + { + "epoch": 0.55, + "grad_norm": 0.1669921875, + "learning_rate": 0.00010079346799974616, + "loss": 1.1718, + "step": 10000 + }, + { + "epoch": 0.55, + "grad_norm": 0.1708984375, + "learning_rate": 0.0001006978711038535, + "loss": 1.093, + "step": 10005 + }, + { + "epoch": 0.55, + "grad_norm": 0.177734375, + "learning_rate": 0.00010060227357015723, + "loss": 1.1666, + "step": 10010 + }, + { + "epoch": 0.55, + "grad_norm": 0.16015625, + "learning_rate": 0.00010050667548602664, + "loss": 1.1145, + "step": 10015 + }, + { + "epoch": 0.55, + "grad_norm": 0.1689453125, + "learning_rate": 0.0001004110769388314, + "loss": 1.137, + "step": 10020 + }, + { + "epoch": 0.55, + "grad_norm": 0.1669921875, + "learning_rate": 0.00010031547801594164, + "loss": 1.1741, + "step": 10025 + }, + { + "epoch": 0.55, + "grad_norm": 0.1728515625, + "learning_rate": 0.00010021987880472788, + "loss": 1.161, + "step": 10030 + }, + { + "epoch": 0.55, + "grad_norm": 0.169921875, + "learning_rate": 0.00010012427939256081, + "loss": 1.1705, + "step": 10035 + }, + { + "epoch": 0.55, + "grad_norm": 0.1796875, + "learning_rate": 0.00010002867986681142, + "loss": 1.2379, + "step": 10040 + }, + { + "epoch": 0.55, + "grad_norm": 0.166015625, + "learning_rate": 9.993308031485071e-05, + "loss": 1.1709, + "step": 10045 + }, + { + "epoch": 0.55, + "grad_norm": 0.17578125, + "learning_rate": 9.98374808240497e-05, + "loss": 1.2326, + "step": 10050 + }, + { + "epoch": 0.55, + "grad_norm": 0.1748046875, + "learning_rate": 9.974188148177946e-05, + "loss": 1.2494, + "step": 10055 + }, + { + "epoch": 0.55, + "grad_norm": 0.1640625, + "learning_rate": 9.964628237541084e-05, + "loss": 1.0937, + "step": 10060 + }, + { + "epoch": 0.55, + "grad_norm": 0.171875, + "learning_rate": 9.955068359231446e-05, + "loss": 1.166, + "step": 10065 + }, + { + "epoch": 0.55, + "grad_norm": 0.169921875, + "learning_rate": 9.945508521986066e-05, + "loss": 1.1593, + "step": 10070 + }, + { + "epoch": 0.55, + "grad_norm": 0.16796875, + "learning_rate": 9.935948734541947e-05, + "loss": 1.1114, + "step": 10075 + }, + { + "epoch": 0.55, + "grad_norm": 0.1630859375, + "learning_rate": 9.926389005636039e-05, + "loss": 1.2054, + "step": 10080 + }, + { + "epoch": 0.55, + "grad_norm": 0.1669921875, + "learning_rate": 9.91682934400524e-05, + "loss": 1.1198, + "step": 10085 + }, + { + "epoch": 0.55, + "grad_norm": 0.171875, + "learning_rate": 9.907269758386386e-05, + "loss": 1.1255, + "step": 10090 + }, + { + "epoch": 0.55, + "grad_norm": 0.1806640625, + "learning_rate": 9.89771025751625e-05, + "loss": 1.1594, + "step": 10095 + }, + { + "epoch": 0.55, + "grad_norm": 0.1767578125, + "learning_rate": 9.888150850131516e-05, + "loss": 1.1785, + "step": 10100 + }, + { + "epoch": 0.55, + "grad_norm": 0.1767578125, + "learning_rate": 9.878591544968795e-05, + "loss": 1.1016, + "step": 10105 + }, + { + "epoch": 0.55, + "grad_norm": 0.1630859375, + "learning_rate": 9.869032350764597e-05, + "loss": 1.1373, + "step": 10110 + }, + { + "epoch": 0.55, + "grad_norm": 0.1669921875, + "learning_rate": 9.859473276255324e-05, + "loss": 1.1844, + "step": 10115 + }, + { + "epoch": 0.55, + "grad_norm": 0.185546875, + "learning_rate": 9.849914330177291e-05, + "loss": 1.1619, + "step": 10120 + }, + { + "epoch": 0.55, + "grad_norm": 0.162109375, + "learning_rate": 9.840355521266672e-05, + "loss": 1.1239, + "step": 10125 + }, + { + "epoch": 0.55, + "grad_norm": 0.1640625, + "learning_rate": 9.830796858259528e-05, + "loss": 1.0935, + "step": 10130 + }, + { + "epoch": 0.56, + "grad_norm": 0.171875, + "learning_rate": 9.821238349891778e-05, + "loss": 1.1757, + "step": 10135 + }, + { + "epoch": 0.56, + "grad_norm": 0.16796875, + "learning_rate": 9.811680004899214e-05, + "loss": 1.081, + "step": 10140 + }, + { + "epoch": 0.56, + "grad_norm": 0.1708984375, + "learning_rate": 9.802121832017466e-05, + "loss": 1.1609, + "step": 10145 + }, + { + "epoch": 0.56, + "grad_norm": 0.1669921875, + "learning_rate": 9.792563839982012e-05, + "loss": 1.2551, + "step": 10150 + }, + { + "epoch": 0.56, + "grad_norm": 0.17578125, + "learning_rate": 9.783006037528157e-05, + "loss": 1.2193, + "step": 10155 + }, + { + "epoch": 0.56, + "grad_norm": 0.1689453125, + "learning_rate": 9.77344843339105e-05, + "loss": 1.1624, + "step": 10160 + }, + { + "epoch": 0.56, + "grad_norm": 0.1669921875, + "learning_rate": 9.76389103630564e-05, + "loss": 1.1218, + "step": 10165 + }, + { + "epoch": 0.56, + "grad_norm": 0.1787109375, + "learning_rate": 9.754333855006699e-05, + "loss": 1.1217, + "step": 10170 + }, + { + "epoch": 0.56, + "grad_norm": 0.1728515625, + "learning_rate": 9.74477689822879e-05, + "loss": 1.1508, + "step": 10175 + }, + { + "epoch": 0.56, + "grad_norm": 0.173828125, + "learning_rate": 9.73522017470629e-05, + "loss": 1.1326, + "step": 10180 + }, + { + "epoch": 0.56, + "grad_norm": 0.1806640625, + "learning_rate": 9.725663693173343e-05, + "loss": 1.1927, + "step": 10185 + }, + { + "epoch": 0.56, + "grad_norm": 0.1630859375, + "learning_rate": 9.716107462363881e-05, + "loss": 1.1657, + "step": 10190 + }, + { + "epoch": 0.56, + "grad_norm": 0.1669921875, + "learning_rate": 9.70655149101161e-05, + "loss": 1.2035, + "step": 10195 + }, + { + "epoch": 0.56, + "grad_norm": 0.1689453125, + "learning_rate": 9.696995787849985e-05, + "loss": 1.1384, + "step": 10200 + }, + { + "epoch": 0.56, + "grad_norm": 0.177734375, + "learning_rate": 9.687440361612238e-05, + "loss": 1.1734, + "step": 10205 + }, + { + "epoch": 0.56, + "grad_norm": 0.1640625, + "learning_rate": 9.677885221031329e-05, + "loss": 1.2824, + "step": 10210 + }, + { + "epoch": 0.56, + "grad_norm": 0.1787109375, + "learning_rate": 9.668330374839965e-05, + "loss": 1.0996, + "step": 10215 + }, + { + "epoch": 0.56, + "grad_norm": 0.1689453125, + "learning_rate": 9.658775831770575e-05, + "loss": 1.1382, + "step": 10220 + }, + { + "epoch": 0.56, + "grad_norm": 0.1708984375, + "learning_rate": 9.64922160055533e-05, + "loss": 1.1684, + "step": 10225 + }, + { + "epoch": 0.56, + "grad_norm": 0.1728515625, + "learning_rate": 9.6396676899261e-05, + "loss": 1.1403, + "step": 10230 + }, + { + "epoch": 0.56, + "grad_norm": 0.1689453125, + "learning_rate": 9.630114108614464e-05, + "loss": 1.1393, + "step": 10235 + }, + { + "epoch": 0.56, + "grad_norm": 0.1630859375, + "learning_rate": 9.6205608653517e-05, + "loss": 1.1194, + "step": 10240 + }, + { + "epoch": 0.56, + "grad_norm": 0.162109375, + "learning_rate": 9.611007968868787e-05, + "loss": 1.1387, + "step": 10245 + }, + { + "epoch": 0.56, + "grad_norm": 0.169921875, + "learning_rate": 9.601455427896377e-05, + "loss": 1.1942, + "step": 10250 + }, + { + "epoch": 0.56, + "grad_norm": 0.1728515625, + "learning_rate": 9.591903251164797e-05, + "loss": 1.1733, + "step": 10255 + }, + { + "epoch": 0.56, + "grad_norm": 0.16796875, + "learning_rate": 9.582351447404042e-05, + "loss": 1.1244, + "step": 10260 + }, + { + "epoch": 0.56, + "grad_norm": 0.169921875, + "learning_rate": 9.572800025343774e-05, + "loss": 1.2431, + "step": 10265 + }, + { + "epoch": 0.56, + "grad_norm": 0.16796875, + "learning_rate": 9.563248993713296e-05, + "loss": 1.177, + "step": 10270 + }, + { + "epoch": 0.56, + "grad_norm": 0.16796875, + "learning_rate": 9.553698361241557e-05, + "loss": 1.1976, + "step": 10275 + }, + { + "epoch": 0.56, + "grad_norm": 0.1767578125, + "learning_rate": 9.544148136657145e-05, + "loss": 1.1354, + "step": 10280 + }, + { + "epoch": 0.56, + "grad_norm": 0.1728515625, + "learning_rate": 9.534598328688273e-05, + "loss": 1.1658, + "step": 10285 + }, + { + "epoch": 0.56, + "grad_norm": 0.177734375, + "learning_rate": 9.52504894606277e-05, + "loss": 1.1143, + "step": 10290 + }, + { + "epoch": 0.56, + "grad_norm": 0.1748046875, + "learning_rate": 9.515499997508083e-05, + "loss": 1.1975, + "step": 10295 + }, + { + "epoch": 0.56, + "grad_norm": 0.1787109375, + "learning_rate": 9.505951491751256e-05, + "loss": 1.1698, + "step": 10300 + }, + { + "epoch": 0.56, + "grad_norm": 0.169921875, + "learning_rate": 9.496403437518925e-05, + "loss": 1.1516, + "step": 10305 + }, + { + "epoch": 0.56, + "grad_norm": 0.173828125, + "learning_rate": 9.486855843537332e-05, + "loss": 1.1572, + "step": 10310 + }, + { + "epoch": 0.56, + "grad_norm": 0.171875, + "learning_rate": 9.477308718532275e-05, + "loss": 1.182, + "step": 10315 + }, + { + "epoch": 0.57, + "grad_norm": 0.173828125, + "learning_rate": 9.467762071229138e-05, + "loss": 1.0752, + "step": 10320 + }, + { + "epoch": 0.57, + "grad_norm": 0.16796875, + "learning_rate": 9.45821591035286e-05, + "loss": 1.2191, + "step": 10325 + }, + { + "epoch": 0.57, + "grad_norm": 0.169921875, + "learning_rate": 9.448670244627945e-05, + "loss": 1.1375, + "step": 10330 + }, + { + "epoch": 0.57, + "grad_norm": 0.1630859375, + "learning_rate": 9.439125082778438e-05, + "loss": 1.1327, + "step": 10335 + }, + { + "epoch": 0.57, + "grad_norm": 0.169921875, + "learning_rate": 9.429580433527923e-05, + "loss": 1.1228, + "step": 10340 + }, + { + "epoch": 0.57, + "grad_norm": 0.1767578125, + "learning_rate": 9.420036305599512e-05, + "loss": 1.1423, + "step": 10345 + }, + { + "epoch": 0.57, + "grad_norm": 0.16796875, + "learning_rate": 9.410492707715856e-05, + "loss": 1.1457, + "step": 10350 + }, + { + "epoch": 0.57, + "grad_norm": 0.1728515625, + "learning_rate": 9.400949648599107e-05, + "loss": 1.1912, + "step": 10355 + }, + { + "epoch": 0.57, + "grad_norm": 0.1669921875, + "learning_rate": 9.391407136970926e-05, + "loss": 1.1711, + "step": 10360 + }, + { + "epoch": 0.57, + "grad_norm": 0.166015625, + "learning_rate": 9.381865181552479e-05, + "loss": 1.1608, + "step": 10365 + }, + { + "epoch": 0.57, + "grad_norm": 0.162109375, + "learning_rate": 9.372323791064423e-05, + "loss": 1.1022, + "step": 10370 + }, + { + "epoch": 0.57, + "grad_norm": 0.1728515625, + "learning_rate": 9.362782974226891e-05, + "loss": 1.1634, + "step": 10375 + }, + { + "epoch": 0.57, + "grad_norm": 0.1767578125, + "learning_rate": 9.353242739759506e-05, + "loss": 1.1867, + "step": 10380 + }, + { + "epoch": 0.57, + "grad_norm": 0.177734375, + "learning_rate": 9.343703096381342e-05, + "loss": 1.1356, + "step": 10385 + }, + { + "epoch": 0.57, + "grad_norm": 0.171875, + "learning_rate": 9.334164052810945e-05, + "loss": 1.1598, + "step": 10390 + }, + { + "epoch": 0.57, + "grad_norm": 0.1796875, + "learning_rate": 9.324625617766309e-05, + "loss": 1.2104, + "step": 10395 + }, + { + "epoch": 0.57, + "grad_norm": 0.1689453125, + "learning_rate": 9.315087799964873e-05, + "loss": 1.1648, + "step": 10400 + }, + { + "epoch": 0.57, + "grad_norm": 0.1787109375, + "learning_rate": 9.305550608123509e-05, + "loss": 1.2151, + "step": 10405 + }, + { + "epoch": 0.57, + "grad_norm": 0.1669921875, + "learning_rate": 9.296014050958513e-05, + "loss": 1.1961, + "step": 10410 + }, + { + "epoch": 0.57, + "grad_norm": 0.185546875, + "learning_rate": 9.286478137185619e-05, + "loss": 1.1608, + "step": 10415 + }, + { + "epoch": 0.57, + "grad_norm": 0.169921875, + "learning_rate": 9.276942875519952e-05, + "loss": 1.2612, + "step": 10420 + }, + { + "epoch": 0.57, + "grad_norm": 0.16796875, + "learning_rate": 9.267408274676051e-05, + "loss": 1.1399, + "step": 10425 + }, + { + "epoch": 0.57, + "grad_norm": 0.171875, + "learning_rate": 9.257874343367846e-05, + "loss": 1.2151, + "step": 10430 + }, + { + "epoch": 0.57, + "grad_norm": 0.173828125, + "learning_rate": 9.248341090308666e-05, + "loss": 1.1745, + "step": 10435 + }, + { + "epoch": 0.57, + "grad_norm": 0.1708984375, + "learning_rate": 9.238808524211211e-05, + "loss": 1.1334, + "step": 10440 + }, + { + "epoch": 0.57, + "grad_norm": 0.1787109375, + "learning_rate": 9.229276653787552e-05, + "loss": 1.1511, + "step": 10445 + }, + { + "epoch": 0.57, + "grad_norm": 0.1689453125, + "learning_rate": 9.219745487749125e-05, + "loss": 1.1586, + "step": 10450 + }, + { + "epoch": 0.57, + "grad_norm": 0.1787109375, + "learning_rate": 9.210215034806732e-05, + "loss": 1.1586, + "step": 10455 + }, + { + "epoch": 0.57, + "grad_norm": 0.1689453125, + "learning_rate": 9.200685303670509e-05, + "loss": 1.1163, + "step": 10460 + }, + { + "epoch": 0.57, + "grad_norm": 0.1669921875, + "learning_rate": 9.191156303049944e-05, + "loss": 1.0988, + "step": 10465 + }, + { + "epoch": 0.57, + "grad_norm": 0.1767578125, + "learning_rate": 9.181628041653848e-05, + "loss": 1.1843, + "step": 10470 + }, + { + "epoch": 0.57, + "grad_norm": 0.1787109375, + "learning_rate": 9.172100528190363e-05, + "loss": 1.2074, + "step": 10475 + }, + { + "epoch": 0.57, + "grad_norm": 0.17578125, + "learning_rate": 9.162573771366947e-05, + "loss": 1.1822, + "step": 10480 + }, + { + "epoch": 0.57, + "grad_norm": 0.1767578125, + "learning_rate": 9.153047779890363e-05, + "loss": 1.1655, + "step": 10485 + }, + { + "epoch": 0.57, + "grad_norm": 0.1728515625, + "learning_rate": 9.143522562466672e-05, + "loss": 1.2011, + "step": 10490 + }, + { + "epoch": 0.57, + "grad_norm": 0.166015625, + "learning_rate": 9.133998127801239e-05, + "loss": 1.1183, + "step": 10495 + }, + { + "epoch": 0.58, + "grad_norm": 0.173828125, + "learning_rate": 9.124474484598706e-05, + "loss": 1.1215, + "step": 10500 + }, + { + "epoch": 0.58, + "grad_norm": 0.171875, + "learning_rate": 9.114951641562988e-05, + "loss": 1.1731, + "step": 10505 + }, + { + "epoch": 0.58, + "grad_norm": 0.1640625, + "learning_rate": 9.105429607397274e-05, + "loss": 1.1641, + "step": 10510 + }, + { + "epoch": 0.58, + "grad_norm": 0.1728515625, + "learning_rate": 9.09590839080401e-05, + "loss": 1.2396, + "step": 10515 + }, + { + "epoch": 0.58, + "grad_norm": 0.1748046875, + "learning_rate": 9.086388000484903e-05, + "loss": 1.1676, + "step": 10520 + }, + { + "epoch": 0.58, + "grad_norm": 0.1728515625, + "learning_rate": 9.076868445140897e-05, + "loss": 1.1063, + "step": 10525 + }, + { + "epoch": 0.58, + "grad_norm": 0.17578125, + "learning_rate": 9.067349733472175e-05, + "loss": 1.237, + "step": 10530 + }, + { + "epoch": 0.58, + "grad_norm": 0.1689453125, + "learning_rate": 9.057831874178145e-05, + "loss": 1.2237, + "step": 10535 + }, + { + "epoch": 0.58, + "grad_norm": 0.173828125, + "learning_rate": 9.048314875957445e-05, + "loss": 1.1732, + "step": 10540 + }, + { + "epoch": 0.58, + "grad_norm": 0.173828125, + "learning_rate": 9.038798747507923e-05, + "loss": 1.2999, + "step": 10545 + }, + { + "epoch": 0.58, + "grad_norm": 0.17578125, + "learning_rate": 9.029283497526622e-05, + "loss": 1.1659, + "step": 10550 + }, + { + "epoch": 0.58, + "grad_norm": 0.1748046875, + "learning_rate": 9.019769134709797e-05, + "loss": 1.0748, + "step": 10555 + }, + { + "epoch": 0.58, + "grad_norm": 0.1650390625, + "learning_rate": 9.010255667752883e-05, + "loss": 1.2317, + "step": 10560 + }, + { + "epoch": 0.58, + "grad_norm": 0.1689453125, + "learning_rate": 9.000743105350502e-05, + "loss": 1.2144, + "step": 10565 + }, + { + "epoch": 0.58, + "grad_norm": 0.1787109375, + "learning_rate": 8.991231456196444e-05, + "loss": 1.1363, + "step": 10570 + }, + { + "epoch": 0.58, + "grad_norm": 0.16796875, + "learning_rate": 8.981720728983662e-05, + "loss": 1.092, + "step": 10575 + }, + { + "epoch": 0.58, + "grad_norm": 0.1787109375, + "learning_rate": 8.972210932404277e-05, + "loss": 1.1072, + "step": 10580 + }, + { + "epoch": 0.58, + "grad_norm": 0.169921875, + "learning_rate": 8.962702075149552e-05, + "loss": 1.1435, + "step": 10585 + }, + { + "epoch": 0.58, + "grad_norm": 0.181640625, + "learning_rate": 8.953194165909893e-05, + "loss": 1.1958, + "step": 10590 + }, + { + "epoch": 0.58, + "grad_norm": 0.1728515625, + "learning_rate": 8.943687213374833e-05, + "loss": 1.1728, + "step": 10595 + }, + { + "epoch": 0.58, + "grad_norm": 0.1728515625, + "learning_rate": 8.934181226233046e-05, + "loss": 1.1884, + "step": 10600 + }, + { + "epoch": 0.58, + "grad_norm": 0.16796875, + "learning_rate": 8.924676213172312e-05, + "loss": 1.1324, + "step": 10605 + }, + { + "epoch": 0.58, + "grad_norm": 0.1689453125, + "learning_rate": 8.915172182879524e-05, + "loss": 1.149, + "step": 10610 + }, + { + "epoch": 0.58, + "grad_norm": 0.169921875, + "learning_rate": 8.905669144040673e-05, + "loss": 1.1547, + "step": 10615 + }, + { + "epoch": 0.58, + "grad_norm": 0.173828125, + "learning_rate": 8.896167105340844e-05, + "loss": 1.2046, + "step": 10620 + }, + { + "epoch": 0.58, + "grad_norm": 0.166015625, + "learning_rate": 8.886666075464223e-05, + "loss": 1.1899, + "step": 10625 + }, + { + "epoch": 0.58, + "grad_norm": 0.1748046875, + "learning_rate": 8.877166063094053e-05, + "loss": 1.1937, + "step": 10630 + }, + { + "epoch": 0.58, + "grad_norm": 0.171875, + "learning_rate": 8.867667076912659e-05, + "loss": 1.1743, + "step": 10635 + }, + { + "epoch": 0.58, + "grad_norm": 0.1826171875, + "learning_rate": 8.858169125601423e-05, + "loss": 1.2328, + "step": 10640 + }, + { + "epoch": 0.58, + "grad_norm": 0.173828125, + "learning_rate": 8.84867221784079e-05, + "loss": 1.2201, + "step": 10645 + }, + { + "epoch": 0.58, + "grad_norm": 0.1728515625, + "learning_rate": 8.839176362310237e-05, + "loss": 1.1055, + "step": 10650 + }, + { + "epoch": 0.58, + "grad_norm": 0.16796875, + "learning_rate": 8.829681567688293e-05, + "loss": 1.1566, + "step": 10655 + }, + { + "epoch": 0.58, + "grad_norm": 0.1787109375, + "learning_rate": 8.820187842652505e-05, + "loss": 1.141, + "step": 10660 + }, + { + "epoch": 0.58, + "grad_norm": 0.1748046875, + "learning_rate": 8.810695195879458e-05, + "loss": 1.1052, + "step": 10665 + }, + { + "epoch": 0.58, + "grad_norm": 0.1728515625, + "learning_rate": 8.801203636044738e-05, + "loss": 1.0451, + "step": 10670 + }, + { + "epoch": 0.58, + "grad_norm": 0.1689453125, + "learning_rate": 8.791713171822943e-05, + "loss": 1.1743, + "step": 10675 + }, + { + "epoch": 0.58, + "grad_norm": 0.16796875, + "learning_rate": 8.782223811887663e-05, + "loss": 1.0914, + "step": 10680 + }, + { + "epoch": 0.59, + "grad_norm": 0.1640625, + "learning_rate": 8.772735564911494e-05, + "loss": 1.1892, + "step": 10685 + }, + { + "epoch": 0.59, + "grad_norm": 0.1630859375, + "learning_rate": 8.763248439566002e-05, + "loss": 1.2377, + "step": 10690 + }, + { + "epoch": 0.59, + "grad_norm": 0.16796875, + "learning_rate": 8.753762444521731e-05, + "loss": 1.16, + "step": 10695 + }, + { + "epoch": 0.59, + "grad_norm": 0.16796875, + "learning_rate": 8.744277588448191e-05, + "loss": 1.2214, + "step": 10700 + }, + { + "epoch": 0.59, + "grad_norm": 0.1728515625, + "learning_rate": 8.73479388001385e-05, + "loss": 1.1693, + "step": 10705 + }, + { + "epoch": 0.59, + "grad_norm": 0.1669921875, + "learning_rate": 8.725311327886139e-05, + "loss": 1.1424, + "step": 10710 + }, + { + "epoch": 0.59, + "grad_norm": 0.1669921875, + "learning_rate": 8.715829940731415e-05, + "loss": 1.2334, + "step": 10715 + }, + { + "epoch": 0.59, + "grad_norm": 0.1748046875, + "learning_rate": 8.70634972721498e-05, + "loss": 1.1767, + "step": 10720 + }, + { + "epoch": 0.59, + "grad_norm": 0.69921875, + "learning_rate": 8.69687069600106e-05, + "loss": 1.1229, + "step": 10725 + }, + { + "epoch": 0.59, + "grad_norm": 0.169921875, + "learning_rate": 8.687392855752808e-05, + "loss": 1.2189, + "step": 10730 + }, + { + "epoch": 0.59, + "grad_norm": 0.16796875, + "learning_rate": 8.677916215132275e-05, + "loss": 1.1704, + "step": 10735 + }, + { + "epoch": 0.59, + "grad_norm": 0.177734375, + "learning_rate": 8.66844078280043e-05, + "loss": 1.122, + "step": 10740 + }, + { + "epoch": 0.59, + "grad_norm": 0.1708984375, + "learning_rate": 8.658966567417124e-05, + "loss": 1.1558, + "step": 10745 + }, + { + "epoch": 0.59, + "grad_norm": 0.1796875, + "learning_rate": 8.649493577641107e-05, + "loss": 1.2495, + "step": 10750 + }, + { + "epoch": 0.59, + "grad_norm": 0.166015625, + "learning_rate": 8.640021822130004e-05, + "loss": 1.1238, + "step": 10755 + }, + { + "epoch": 0.59, + "grad_norm": 0.1572265625, + "learning_rate": 8.630551309540314e-05, + "loss": 1.1335, + "step": 10760 + }, + { + "epoch": 0.59, + "grad_norm": 0.1796875, + "learning_rate": 8.621082048527392e-05, + "loss": 1.2267, + "step": 10765 + }, + { + "epoch": 0.59, + "grad_norm": 0.1767578125, + "learning_rate": 8.611614047745466e-05, + "loss": 1.1267, + "step": 10770 + }, + { + "epoch": 0.59, + "grad_norm": 0.1708984375, + "learning_rate": 8.602147315847595e-05, + "loss": 1.1962, + "step": 10775 + }, + { + "epoch": 0.59, + "grad_norm": 0.177734375, + "learning_rate": 8.592681861485687e-05, + "loss": 1.1619, + "step": 10780 + }, + { + "epoch": 0.59, + "grad_norm": 0.1689453125, + "learning_rate": 8.583217693310475e-05, + "loss": 1.1219, + "step": 10785 + }, + { + "epoch": 0.59, + "grad_norm": 0.177734375, + "learning_rate": 8.573754819971533e-05, + "loss": 1.1878, + "step": 10790 + }, + { + "epoch": 0.59, + "grad_norm": 0.169921875, + "learning_rate": 8.564293250117237e-05, + "loss": 1.1166, + "step": 10795 + }, + { + "epoch": 0.59, + "grad_norm": 0.1640625, + "learning_rate": 8.554832992394773e-05, + "loss": 1.1859, + "step": 10800 + }, + { + "epoch": 0.59, + "grad_norm": 0.1689453125, + "learning_rate": 8.54537405545013e-05, + "loss": 1.1452, + "step": 10805 + }, + { + "epoch": 0.59, + "grad_norm": 0.1728515625, + "learning_rate": 8.53591644792809e-05, + "loss": 1.1658, + "step": 10810 + }, + { + "epoch": 0.59, + "grad_norm": 0.16796875, + "learning_rate": 8.526460178472225e-05, + "loss": 1.1467, + "step": 10815 + }, + { + "epoch": 0.59, + "grad_norm": 0.17578125, + "learning_rate": 8.517005255724874e-05, + "loss": 1.2163, + "step": 10820 + }, + { + "epoch": 0.59, + "grad_norm": 0.177734375, + "learning_rate": 8.507551688327157e-05, + "loss": 1.1833, + "step": 10825 + }, + { + "epoch": 0.59, + "grad_norm": 0.17578125, + "learning_rate": 8.498099484918943e-05, + "loss": 1.1367, + "step": 10830 + }, + { + "epoch": 0.59, + "grad_norm": 0.162109375, + "learning_rate": 8.48864865413886e-05, + "loss": 1.1606, + "step": 10835 + }, + { + "epoch": 0.59, + "grad_norm": 0.169921875, + "learning_rate": 8.479199204624288e-05, + "loss": 1.1066, + "step": 10840 + }, + { + "epoch": 0.59, + "grad_norm": 0.16796875, + "learning_rate": 8.469751145011333e-05, + "loss": 1.1006, + "step": 10845 + }, + { + "epoch": 0.59, + "grad_norm": 0.171875, + "learning_rate": 8.460304483934834e-05, + "loss": 1.189, + "step": 10850 + }, + { + "epoch": 0.59, + "grad_norm": 0.1640625, + "learning_rate": 8.450859230028363e-05, + "loss": 1.153, + "step": 10855 + }, + { + "epoch": 0.59, + "grad_norm": 0.169921875, + "learning_rate": 8.441415391924192e-05, + "loss": 1.159, + "step": 10860 + }, + { + "epoch": 0.6, + "grad_norm": 0.1689453125, + "learning_rate": 8.431972978253303e-05, + "loss": 1.1256, + "step": 10865 + }, + { + "epoch": 0.6, + "grad_norm": 0.162109375, + "learning_rate": 8.422531997645374e-05, + "loss": 1.2179, + "step": 10870 + }, + { + "epoch": 0.6, + "grad_norm": 0.1728515625, + "learning_rate": 8.413092458728786e-05, + "loss": 1.175, + "step": 10875 + }, + { + "epoch": 0.6, + "grad_norm": 0.1708984375, + "learning_rate": 8.403654370130585e-05, + "loss": 1.1468, + "step": 10880 + }, + { + "epoch": 0.6, + "grad_norm": 0.17578125, + "learning_rate": 8.394217740476505e-05, + "loss": 1.1275, + "step": 10885 + }, + { + "epoch": 0.6, + "grad_norm": 0.1767578125, + "learning_rate": 8.384782578390931e-05, + "loss": 1.1314, + "step": 10890 + }, + { + "epoch": 0.6, + "grad_norm": 0.1728515625, + "learning_rate": 8.375348892496929e-05, + "loss": 1.2109, + "step": 10895 + }, + { + "epoch": 0.6, + "grad_norm": 0.1689453125, + "learning_rate": 8.365916691416197e-05, + "loss": 1.1558, + "step": 10900 + }, + { + "epoch": 0.6, + "grad_norm": 0.166015625, + "learning_rate": 8.356485983769085e-05, + "loss": 1.1935, + "step": 10905 + }, + { + "epoch": 0.6, + "grad_norm": 0.1728515625, + "learning_rate": 8.347056778174572e-05, + "loss": 1.1658, + "step": 10910 + }, + { + "epoch": 0.6, + "grad_norm": 0.1708984375, + "learning_rate": 8.337629083250272e-05, + "loss": 1.1647, + "step": 10915 + }, + { + "epoch": 0.6, + "grad_norm": 0.1748046875, + "learning_rate": 8.328202907612412e-05, + "loss": 1.1896, + "step": 10920 + }, + { + "epoch": 0.6, + "grad_norm": 0.1728515625, + "learning_rate": 8.318778259875836e-05, + "loss": 1.1991, + "step": 10925 + }, + { + "epoch": 0.6, + "grad_norm": 0.1845703125, + "learning_rate": 8.309355148653986e-05, + "loss": 1.1244, + "step": 10930 + }, + { + "epoch": 0.6, + "grad_norm": 0.171875, + "learning_rate": 8.299933582558899e-05, + "loss": 1.2094, + "step": 10935 + }, + { + "epoch": 0.6, + "grad_norm": 0.1748046875, + "learning_rate": 8.290513570201211e-05, + "loss": 1.1765, + "step": 10940 + }, + { + "epoch": 0.6, + "grad_norm": 0.1748046875, + "learning_rate": 8.281095120190125e-05, + "loss": 1.102, + "step": 10945 + }, + { + "epoch": 0.6, + "grad_norm": 0.1728515625, + "learning_rate": 8.27167824113342e-05, + "loss": 1.2151, + "step": 10950 + }, + { + "epoch": 0.6, + "grad_norm": 0.1689453125, + "learning_rate": 8.26226294163744e-05, + "loss": 1.1786, + "step": 10955 + }, + { + "epoch": 0.6, + "grad_norm": 0.173828125, + "learning_rate": 8.252849230307092e-05, + "loss": 1.1476, + "step": 10960 + }, + { + "epoch": 0.6, + "grad_norm": 0.1728515625, + "learning_rate": 8.24343711574582e-05, + "loss": 1.1859, + "step": 10965 + }, + { + "epoch": 0.6, + "grad_norm": 0.1767578125, + "learning_rate": 8.234026606555617e-05, + "loss": 1.2477, + "step": 10970 + }, + { + "epoch": 0.6, + "grad_norm": 0.1640625, + "learning_rate": 8.224617711336999e-05, + "loss": 1.141, + "step": 10975 + }, + { + "epoch": 0.6, + "grad_norm": 0.1689453125, + "learning_rate": 8.215210438689023e-05, + "loss": 1.112, + "step": 10980 + }, + { + "epoch": 0.6, + "grad_norm": 0.1796875, + "learning_rate": 8.20580479720925e-05, + "loss": 1.1282, + "step": 10985 + }, + { + "epoch": 0.6, + "grad_norm": 0.177734375, + "learning_rate": 8.196400795493753e-05, + "loss": 1.1125, + "step": 10990 + }, + { + "epoch": 0.6, + "grad_norm": 0.1669921875, + "learning_rate": 8.186998442137106e-05, + "loss": 1.1781, + "step": 10995 + }, + { + "epoch": 0.6, + "grad_norm": 0.1826171875, + "learning_rate": 8.177597745732384e-05, + "loss": 1.1558, + "step": 11000 + }, + { + "epoch": 0.6, + "grad_norm": 0.1708984375, + "learning_rate": 8.168198714871137e-05, + "loss": 1.1321, + "step": 11005 + }, + { + "epoch": 0.6, + "grad_norm": 0.1748046875, + "learning_rate": 8.158801358143397e-05, + "loss": 1.2147, + "step": 11010 + }, + { + "epoch": 0.6, + "grad_norm": 0.177734375, + "learning_rate": 8.149405684137669e-05, + "loss": 1.2106, + "step": 11015 + }, + { + "epoch": 0.6, + "grad_norm": 0.1708984375, + "learning_rate": 8.140011701440912e-05, + "loss": 1.1424, + "step": 11020 + }, + { + "epoch": 0.6, + "grad_norm": 0.169921875, + "learning_rate": 8.130619418638554e-05, + "loss": 1.1261, + "step": 11025 + }, + { + "epoch": 0.6, + "grad_norm": 0.1767578125, + "learning_rate": 8.121228844314457e-05, + "loss": 1.1197, + "step": 11030 + }, + { + "epoch": 0.6, + "grad_norm": 0.166015625, + "learning_rate": 8.111839987050923e-05, + "loss": 1.2139, + "step": 11035 + }, + { + "epoch": 0.6, + "grad_norm": 0.177734375, + "learning_rate": 8.10245285542868e-05, + "loss": 1.2004, + "step": 11040 + }, + { + "epoch": 0.6, + "grad_norm": 0.1767578125, + "learning_rate": 8.093067458026898e-05, + "loss": 1.1045, + "step": 11045 + }, + { + "epoch": 0.61, + "grad_norm": 0.1748046875, + "learning_rate": 8.083683803423143e-05, + "loss": 1.1802, + "step": 11050 + }, + { + "epoch": 0.61, + "grad_norm": 0.177734375, + "learning_rate": 8.074301900193396e-05, + "loss": 1.1863, + "step": 11055 + }, + { + "epoch": 0.61, + "grad_norm": 0.171875, + "learning_rate": 8.064921756912032e-05, + "loss": 1.1537, + "step": 11060 + }, + { + "epoch": 0.61, + "grad_norm": 0.173828125, + "learning_rate": 8.055543382151828e-05, + "loss": 1.2008, + "step": 11065 + }, + { + "epoch": 0.61, + "grad_norm": 0.1767578125, + "learning_rate": 8.046166784483935e-05, + "loss": 1.15, + "step": 11070 + }, + { + "epoch": 0.61, + "grad_norm": 0.1708984375, + "learning_rate": 8.036791972477884e-05, + "loss": 1.2001, + "step": 11075 + }, + { + "epoch": 0.61, + "grad_norm": 0.1650390625, + "learning_rate": 8.02741895470157e-05, + "loss": 1.2361, + "step": 11080 + }, + { + "epoch": 0.61, + "grad_norm": 0.17578125, + "learning_rate": 8.018047739721255e-05, + "loss": 1.2323, + "step": 11085 + }, + { + "epoch": 0.61, + "grad_norm": 0.169921875, + "learning_rate": 8.00867833610155e-05, + "loss": 1.2584, + "step": 11090 + }, + { + "epoch": 0.61, + "grad_norm": 0.1640625, + "learning_rate": 7.999310752405408e-05, + "loss": 1.159, + "step": 11095 + }, + { + "epoch": 0.61, + "grad_norm": 0.1708984375, + "learning_rate": 7.989944997194123e-05, + "loss": 1.1296, + "step": 11100 + }, + { + "epoch": 0.61, + "grad_norm": 0.171875, + "learning_rate": 7.980581079027314e-05, + "loss": 1.1494, + "step": 11105 + }, + { + "epoch": 0.61, + "grad_norm": 0.1787109375, + "learning_rate": 7.971219006462925e-05, + "loss": 1.1472, + "step": 11110 + }, + { + "epoch": 0.61, + "grad_norm": 0.1689453125, + "learning_rate": 7.96185878805721e-05, + "loss": 1.0716, + "step": 11115 + }, + { + "epoch": 0.61, + "grad_norm": 0.1728515625, + "learning_rate": 7.952500432364731e-05, + "loss": 1.1093, + "step": 11120 + }, + { + "epoch": 0.61, + "grad_norm": 0.1728515625, + "learning_rate": 7.943143947938342e-05, + "loss": 1.1564, + "step": 11125 + }, + { + "epoch": 0.61, + "grad_norm": 0.1806640625, + "learning_rate": 7.933789343329199e-05, + "loss": 1.1545, + "step": 11130 + }, + { + "epoch": 0.61, + "grad_norm": 0.17578125, + "learning_rate": 7.924436627086726e-05, + "loss": 1.093, + "step": 11135 + }, + { + "epoch": 0.61, + "grad_norm": 0.1728515625, + "learning_rate": 7.91508580775863e-05, + "loss": 1.1415, + "step": 11140 + }, + { + "epoch": 0.61, + "grad_norm": 0.1748046875, + "learning_rate": 7.905736893890875e-05, + "loss": 1.1978, + "step": 11145 + }, + { + "epoch": 0.61, + "grad_norm": 0.1787109375, + "learning_rate": 7.8963898940277e-05, + "loss": 1.1466, + "step": 11150 + }, + { + "epoch": 0.61, + "grad_norm": 0.169921875, + "learning_rate": 7.887044816711581e-05, + "loss": 1.1666, + "step": 11155 + }, + { + "epoch": 0.61, + "grad_norm": 0.16796875, + "learning_rate": 7.877701670483242e-05, + "loss": 1.0945, + "step": 11160 + }, + { + "epoch": 0.61, + "grad_norm": 0.1767578125, + "learning_rate": 7.868360463881633e-05, + "loss": 1.1524, + "step": 11165 + }, + { + "epoch": 0.61, + "grad_norm": 0.171875, + "learning_rate": 7.859021205443951e-05, + "loss": 1.1879, + "step": 11170 + }, + { + "epoch": 0.61, + "grad_norm": 0.1689453125, + "learning_rate": 7.849683903705595e-05, + "loss": 1.1988, + "step": 11175 + }, + { + "epoch": 0.61, + "grad_norm": 0.16796875, + "learning_rate": 7.84034856720018e-05, + "loss": 1.1913, + "step": 11180 + }, + { + "epoch": 0.61, + "grad_norm": 0.1728515625, + "learning_rate": 7.831015204459527e-05, + "loss": 1.148, + "step": 11185 + }, + { + "epoch": 0.61, + "grad_norm": 0.171875, + "learning_rate": 7.821683824013658e-05, + "loss": 1.1996, + "step": 11190 + }, + { + "epoch": 0.61, + "grad_norm": 0.1650390625, + "learning_rate": 7.812354434390771e-05, + "loss": 1.1253, + "step": 11195 + }, + { + "epoch": 0.61, + "grad_norm": 0.169921875, + "learning_rate": 7.803027044117257e-05, + "loss": 1.1595, + "step": 11200 + }, + { + "epoch": 0.61, + "grad_norm": 0.171875, + "learning_rate": 7.79370166171767e-05, + "loss": 1.2007, + "step": 11205 + }, + { + "epoch": 0.61, + "grad_norm": 0.1708984375, + "learning_rate": 7.784378295714731e-05, + "loss": 1.216, + "step": 11210 + }, + { + "epoch": 0.61, + "grad_norm": 0.169921875, + "learning_rate": 7.77505695462933e-05, + "loss": 1.1905, + "step": 11215 + }, + { + "epoch": 0.61, + "grad_norm": 0.17578125, + "learning_rate": 7.76573764698049e-05, + "loss": 1.1776, + "step": 11220 + }, + { + "epoch": 0.61, + "grad_norm": 0.173828125, + "learning_rate": 7.756420381285379e-05, + "loss": 1.222, + "step": 11225 + }, + { + "epoch": 0.62, + "grad_norm": 0.1708984375, + "learning_rate": 7.747105166059306e-05, + "loss": 1.2169, + "step": 11230 + }, + { + "epoch": 0.62, + "grad_norm": 0.1728515625, + "learning_rate": 7.737792009815704e-05, + "loss": 1.1233, + "step": 11235 + }, + { + "epoch": 0.62, + "grad_norm": 0.1748046875, + "learning_rate": 7.72848092106612e-05, + "loss": 1.2118, + "step": 11240 + }, + { + "epoch": 0.62, + "grad_norm": 0.17578125, + "learning_rate": 7.719171908320214e-05, + "loss": 1.2159, + "step": 11245 + }, + { + "epoch": 0.62, + "grad_norm": 0.16796875, + "learning_rate": 7.709864980085744e-05, + "loss": 1.1048, + "step": 11250 + }, + { + "epoch": 0.62, + "grad_norm": 0.1923828125, + "learning_rate": 7.700560144868576e-05, + "loss": 1.2757, + "step": 11255 + }, + { + "epoch": 0.62, + "grad_norm": 0.1640625, + "learning_rate": 7.69125741117265e-05, + "loss": 1.1914, + "step": 11260 + }, + { + "epoch": 0.62, + "grad_norm": 0.1767578125, + "learning_rate": 7.681956787499991e-05, + "loss": 1.1629, + "step": 11265 + }, + { + "epoch": 0.62, + "grad_norm": 0.181640625, + "learning_rate": 7.672658282350687e-05, + "loss": 1.1249, + "step": 11270 + }, + { + "epoch": 0.62, + "grad_norm": 0.171875, + "learning_rate": 7.66336190422291e-05, + "loss": 1.1994, + "step": 11275 + }, + { + "epoch": 0.62, + "grad_norm": 0.1669921875, + "learning_rate": 7.654067661612865e-05, + "loss": 1.181, + "step": 11280 + }, + { + "epoch": 0.62, + "grad_norm": 0.166015625, + "learning_rate": 7.644775563014824e-05, + "loss": 1.0829, + "step": 11285 + }, + { + "epoch": 0.62, + "grad_norm": 0.169921875, + "learning_rate": 7.635485616921083e-05, + "loss": 1.1677, + "step": 11290 + }, + { + "epoch": 0.62, + "grad_norm": 0.1708984375, + "learning_rate": 7.626197831821985e-05, + "loss": 1.0531, + "step": 11295 + }, + { + "epoch": 0.62, + "grad_norm": 0.1787109375, + "learning_rate": 7.616912216205888e-05, + "loss": 1.254, + "step": 11300 + }, + { + "epoch": 0.62, + "grad_norm": 0.169921875, + "learning_rate": 7.607628778559175e-05, + "loss": 1.2373, + "step": 11305 + }, + { + "epoch": 0.62, + "grad_norm": 0.1630859375, + "learning_rate": 7.598347527366233e-05, + "loss": 1.1418, + "step": 11310 + }, + { + "epoch": 0.62, + "grad_norm": 0.1708984375, + "learning_rate": 7.589068471109446e-05, + "loss": 1.2046, + "step": 11315 + }, + { + "epoch": 0.62, + "grad_norm": 0.17578125, + "learning_rate": 7.579791618269207e-05, + "loss": 1.1768, + "step": 11320 + }, + { + "epoch": 0.62, + "grad_norm": 0.1669921875, + "learning_rate": 7.570516977323886e-05, + "loss": 1.1717, + "step": 11325 + }, + { + "epoch": 0.62, + "grad_norm": 0.173828125, + "learning_rate": 7.56124455674983e-05, + "loss": 1.2207, + "step": 11330 + }, + { + "epoch": 0.62, + "grad_norm": 0.16796875, + "learning_rate": 7.551974365021353e-05, + "loss": 1.1449, + "step": 11335 + }, + { + "epoch": 0.62, + "grad_norm": 0.1767578125, + "learning_rate": 7.542706410610747e-05, + "loss": 1.2054, + "step": 11340 + }, + { + "epoch": 0.62, + "grad_norm": 0.17578125, + "learning_rate": 7.533440701988249e-05, + "loss": 1.1281, + "step": 11345 + }, + { + "epoch": 0.62, + "grad_norm": 0.16796875, + "learning_rate": 7.524177247622042e-05, + "loss": 1.0928, + "step": 11350 + }, + { + "epoch": 0.62, + "grad_norm": 0.17578125, + "learning_rate": 7.514916055978246e-05, + "loss": 1.19, + "step": 11355 + }, + { + "epoch": 0.62, + "grad_norm": 0.166015625, + "learning_rate": 7.505657135520929e-05, + "loss": 1.1097, + "step": 11360 + }, + { + "epoch": 0.62, + "grad_norm": 0.1689453125, + "learning_rate": 7.496400494712068e-05, + "loss": 1.1422, + "step": 11365 + }, + { + "epoch": 0.62, + "grad_norm": 0.1767578125, + "learning_rate": 7.487146142011559e-05, + "loss": 1.2908, + "step": 11370 + }, + { + "epoch": 0.62, + "grad_norm": 0.1728515625, + "learning_rate": 7.477894085877211e-05, + "loss": 1.2214, + "step": 11375 + }, + { + "epoch": 0.62, + "grad_norm": 0.171875, + "learning_rate": 7.468644334764731e-05, + "loss": 1.137, + "step": 11380 + }, + { + "epoch": 0.62, + "grad_norm": 0.1748046875, + "learning_rate": 7.459396897127726e-05, + "loss": 1.1791, + "step": 11385 + }, + { + "epoch": 0.62, + "grad_norm": 0.1708984375, + "learning_rate": 7.450151781417677e-05, + "loss": 1.1886, + "step": 11390 + }, + { + "epoch": 0.62, + "grad_norm": 0.171875, + "learning_rate": 7.44090899608395e-05, + "loss": 1.1103, + "step": 11395 + }, + { + "epoch": 0.62, + "grad_norm": 0.1669921875, + "learning_rate": 7.431668549573785e-05, + "loss": 1.1789, + "step": 11400 + }, + { + "epoch": 0.62, + "grad_norm": 0.1767578125, + "learning_rate": 7.422430450332276e-05, + "loss": 1.1381, + "step": 11405 + }, + { + "epoch": 0.62, + "grad_norm": 0.16796875, + "learning_rate": 7.413194706802378e-05, + "loss": 1.1534, + "step": 11410 + }, + { + "epoch": 0.63, + "grad_norm": 0.169921875, + "learning_rate": 7.403961327424888e-05, + "loss": 1.1222, + "step": 11415 + }, + { + "epoch": 0.63, + "grad_norm": 0.1728515625, + "learning_rate": 7.394730320638441e-05, + "loss": 1.1583, + "step": 11420 + }, + { + "epoch": 0.63, + "grad_norm": 0.171875, + "learning_rate": 7.385501694879518e-05, + "loss": 1.1558, + "step": 11425 + }, + { + "epoch": 0.63, + "grad_norm": 0.1728515625, + "learning_rate": 7.376275458582409e-05, + "loss": 1.2163, + "step": 11430 + }, + { + "epoch": 0.63, + "grad_norm": 0.169921875, + "learning_rate": 7.367051620179221e-05, + "loss": 1.134, + "step": 11435 + }, + { + "epoch": 0.63, + "grad_norm": 0.1640625, + "learning_rate": 7.357830188099875e-05, + "loss": 1.1114, + "step": 11440 + }, + { + "epoch": 0.63, + "grad_norm": 0.1650390625, + "learning_rate": 7.348611170772095e-05, + "loss": 1.1102, + "step": 11445 + }, + { + "epoch": 0.63, + "grad_norm": 0.173828125, + "learning_rate": 7.339394576621393e-05, + "loss": 1.188, + "step": 11450 + }, + { + "epoch": 0.63, + "grad_norm": 0.173828125, + "learning_rate": 7.330180414071063e-05, + "loss": 1.1481, + "step": 11455 + }, + { + "epoch": 0.63, + "grad_norm": 0.1650390625, + "learning_rate": 7.320968691542185e-05, + "loss": 1.1679, + "step": 11460 + }, + { + "epoch": 0.63, + "grad_norm": 0.169921875, + "learning_rate": 7.311759417453605e-05, + "loss": 1.1306, + "step": 11465 + }, + { + "epoch": 0.63, + "grad_norm": 0.1767578125, + "learning_rate": 7.30255260022193e-05, + "loss": 1.1473, + "step": 11470 + }, + { + "epoch": 0.63, + "grad_norm": 0.1630859375, + "learning_rate": 7.293348248261526e-05, + "loss": 1.1137, + "step": 11475 + }, + { + "epoch": 0.63, + "grad_norm": 0.181640625, + "learning_rate": 7.284146369984498e-05, + "loss": 1.2396, + "step": 11480 + }, + { + "epoch": 0.63, + "grad_norm": 0.1787109375, + "learning_rate": 7.274946973800703e-05, + "loss": 1.1465, + "step": 11485 + }, + { + "epoch": 0.63, + "grad_norm": 0.1728515625, + "learning_rate": 7.265750068117716e-05, + "loss": 1.1559, + "step": 11490 + }, + { + "epoch": 0.63, + "grad_norm": 0.1767578125, + "learning_rate": 7.256555661340843e-05, + "loss": 1.201, + "step": 11495 + }, + { + "epoch": 0.63, + "grad_norm": 0.1640625, + "learning_rate": 7.247363761873103e-05, + "loss": 1.101, + "step": 11500 + }, + { + "epoch": 0.63, + "grad_norm": 0.171875, + "learning_rate": 7.23817437811523e-05, + "loss": 1.1014, + "step": 11505 + }, + { + "epoch": 0.63, + "grad_norm": 0.171875, + "learning_rate": 7.228987518465652e-05, + "loss": 1.1576, + "step": 11510 + }, + { + "epoch": 0.63, + "grad_norm": 0.16796875, + "learning_rate": 7.219803191320492e-05, + "loss": 1.1553, + "step": 11515 + }, + { + "epoch": 0.63, + "grad_norm": 0.17578125, + "learning_rate": 7.210621405073558e-05, + "loss": 1.1841, + "step": 11520 + }, + { + "epoch": 0.63, + "grad_norm": 0.169921875, + "learning_rate": 7.201442168116335e-05, + "loss": 1.1712, + "step": 11525 + }, + { + "epoch": 0.63, + "grad_norm": 0.1806640625, + "learning_rate": 7.192265488837986e-05, + "loss": 1.0964, + "step": 11530 + }, + { + "epoch": 0.63, + "grad_norm": 0.173828125, + "learning_rate": 7.183091375625327e-05, + "loss": 1.1773, + "step": 11535 + }, + { + "epoch": 0.63, + "grad_norm": 0.1767578125, + "learning_rate": 7.173919836862829e-05, + "loss": 1.1961, + "step": 11540 + }, + { + "epoch": 0.63, + "grad_norm": 0.17578125, + "learning_rate": 7.164750880932616e-05, + "loss": 1.1477, + "step": 11545 + }, + { + "epoch": 0.63, + "grad_norm": 0.1767578125, + "learning_rate": 7.155584516214451e-05, + "loss": 1.2039, + "step": 11550 + }, + { + "epoch": 0.63, + "grad_norm": 0.1767578125, + "learning_rate": 7.14642075108572e-05, + "loss": 1.2151, + "step": 11555 + }, + { + "epoch": 0.63, + "grad_norm": 0.173828125, + "learning_rate": 7.137259593921447e-05, + "loss": 1.1193, + "step": 11560 + }, + { + "epoch": 0.63, + "grad_norm": 0.1748046875, + "learning_rate": 7.128101053094261e-05, + "loss": 1.1474, + "step": 11565 + }, + { + "epoch": 0.63, + "grad_norm": 0.17578125, + "learning_rate": 7.118945136974402e-05, + "loss": 1.1494, + "step": 11570 + }, + { + "epoch": 0.63, + "grad_norm": 0.1689453125, + "learning_rate": 7.10979185392972e-05, + "loss": 1.1307, + "step": 11575 + }, + { + "epoch": 0.63, + "grad_norm": 0.1630859375, + "learning_rate": 7.100641212325648e-05, + "loss": 1.102, + "step": 11580 + }, + { + "epoch": 0.63, + "grad_norm": 0.185546875, + "learning_rate": 7.091493220525203e-05, + "loss": 1.1107, + "step": 11585 + }, + { + "epoch": 0.63, + "grad_norm": 0.1708984375, + "learning_rate": 7.082347886888996e-05, + "loss": 1.1833, + "step": 11590 + }, + { + "epoch": 0.64, + "grad_norm": 0.193359375, + "learning_rate": 7.073205219775194e-05, + "loss": 1.0751, + "step": 11595 + }, + { + "epoch": 0.64, + "grad_norm": 0.173828125, + "learning_rate": 7.064065227539532e-05, + "loss": 1.14, + "step": 11600 + }, + { + "epoch": 0.64, + "grad_norm": 0.171875, + "learning_rate": 7.0549279185353e-05, + "loss": 1.1199, + "step": 11605 + }, + { + "epoch": 0.64, + "grad_norm": 0.1767578125, + "learning_rate": 7.04579330111333e-05, + "loss": 1.177, + "step": 11610 + }, + { + "epoch": 0.64, + "grad_norm": 0.1689453125, + "learning_rate": 7.036661383622012e-05, + "loss": 1.175, + "step": 11615 + }, + { + "epoch": 0.64, + "grad_norm": 0.166015625, + "learning_rate": 7.027532174407248e-05, + "loss": 1.1929, + "step": 11620 + }, + { + "epoch": 0.64, + "grad_norm": 0.1708984375, + "learning_rate": 7.018405681812473e-05, + "loss": 1.1093, + "step": 11625 + }, + { + "epoch": 0.64, + "grad_norm": 0.1650390625, + "learning_rate": 7.009281914178636e-05, + "loss": 1.1812, + "step": 11630 + }, + { + "epoch": 0.64, + "grad_norm": 0.17578125, + "learning_rate": 7.000160879844209e-05, + "loss": 1.2149, + "step": 11635 + }, + { + "epoch": 0.64, + "grad_norm": 0.1708984375, + "learning_rate": 6.99104258714515e-05, + "loss": 1.1322, + "step": 11640 + }, + { + "epoch": 0.64, + "grad_norm": 0.166015625, + "learning_rate": 6.981927044414915e-05, + "loss": 1.1873, + "step": 11645 + }, + { + "epoch": 0.64, + "grad_norm": 0.1669921875, + "learning_rate": 6.972814259984452e-05, + "loss": 1.1271, + "step": 11650 + }, + { + "epoch": 0.64, + "grad_norm": 0.1806640625, + "learning_rate": 6.963704242182184e-05, + "loss": 1.1403, + "step": 11655 + }, + { + "epoch": 0.64, + "grad_norm": 0.16796875, + "learning_rate": 6.954596999334008e-05, + "loss": 1.1593, + "step": 11660 + }, + { + "epoch": 0.64, + "grad_norm": 0.169921875, + "learning_rate": 6.945492539763285e-05, + "loss": 1.1941, + "step": 11665 + }, + { + "epoch": 0.64, + "grad_norm": 0.1689453125, + "learning_rate": 6.93639087179082e-05, + "loss": 1.1829, + "step": 11670 + }, + { + "epoch": 0.64, + "grad_norm": 0.1689453125, + "learning_rate": 6.927292003734892e-05, + "loss": 1.1681, + "step": 11675 + }, + { + "epoch": 0.64, + "grad_norm": 0.1806640625, + "learning_rate": 6.9181959439112e-05, + "loss": 1.325, + "step": 11680 + }, + { + "epoch": 0.64, + "grad_norm": 0.166015625, + "learning_rate": 6.90910270063288e-05, + "loss": 1.1048, + "step": 11685 + }, + { + "epoch": 0.64, + "grad_norm": 0.1728515625, + "learning_rate": 6.900012282210495e-05, + "loss": 1.1311, + "step": 11690 + }, + { + "epoch": 0.64, + "grad_norm": 0.16796875, + "learning_rate": 6.890924696952038e-05, + "loss": 1.1954, + "step": 11695 + }, + { + "epoch": 0.64, + "grad_norm": 0.173828125, + "learning_rate": 6.881839953162896e-05, + "loss": 1.2225, + "step": 11700 + }, + { + "epoch": 0.64, + "grad_norm": 0.1748046875, + "learning_rate": 6.87275805914587e-05, + "loss": 1.1493, + "step": 11705 + }, + { + "epoch": 0.64, + "grad_norm": 0.169921875, + "learning_rate": 6.863679023201146e-05, + "loss": 1.1678, + "step": 11710 + }, + { + "epoch": 0.64, + "grad_norm": 0.169921875, + "learning_rate": 6.854602853626305e-05, + "loss": 1.1627, + "step": 11715 + }, + { + "epoch": 0.64, + "grad_norm": 0.171875, + "learning_rate": 6.845529558716315e-05, + "loss": 1.1878, + "step": 11720 + }, + { + "epoch": 0.64, + "grad_norm": 0.1806640625, + "learning_rate": 6.836459146763504e-05, + "loss": 1.1604, + "step": 11725 + }, + { + "epoch": 0.64, + "grad_norm": 0.173828125, + "learning_rate": 6.827391626057571e-05, + "loss": 1.1919, + "step": 11730 + }, + { + "epoch": 0.64, + "grad_norm": 0.171875, + "learning_rate": 6.818327004885571e-05, + "loss": 1.1219, + "step": 11735 + }, + { + "epoch": 0.64, + "grad_norm": 0.171875, + "learning_rate": 6.809265291531913e-05, + "loss": 1.1519, + "step": 11740 + }, + { + "epoch": 0.64, + "grad_norm": 0.1689453125, + "learning_rate": 6.800206494278345e-05, + "loss": 1.1143, + "step": 11745 + }, + { + "epoch": 0.64, + "grad_norm": 0.16796875, + "learning_rate": 6.791150621403953e-05, + "loss": 1.2425, + "step": 11750 + }, + { + "epoch": 0.64, + "grad_norm": 0.16796875, + "learning_rate": 6.78209768118514e-05, + "loss": 1.0954, + "step": 11755 + }, + { + "epoch": 0.64, + "grad_norm": 0.1748046875, + "learning_rate": 6.773047681895646e-05, + "loss": 1.1626, + "step": 11760 + }, + { + "epoch": 0.64, + "grad_norm": 0.1845703125, + "learning_rate": 6.764000631806511e-05, + "loss": 1.1976, + "step": 11765 + }, + { + "epoch": 0.64, + "grad_norm": 0.1728515625, + "learning_rate": 6.75495653918608e-05, + "loss": 1.1826, + "step": 11770 + }, + { + "epoch": 0.64, + "grad_norm": 0.1611328125, + "learning_rate": 6.745915412299994e-05, + "loss": 1.1087, + "step": 11775 + }, + { + "epoch": 0.65, + "grad_norm": 0.173828125, + "learning_rate": 6.7368772594112e-05, + "loss": 1.1276, + "step": 11780 + }, + { + "epoch": 0.65, + "grad_norm": 0.1728515625, + "learning_rate": 6.727842088779905e-05, + "loss": 1.2056, + "step": 11785 + }, + { + "epoch": 0.65, + "grad_norm": 0.16796875, + "learning_rate": 6.7188099086636e-05, + "loss": 1.198, + "step": 11790 + }, + { + "epoch": 0.65, + "grad_norm": 0.1669921875, + "learning_rate": 6.709780727317041e-05, + "loss": 1.1536, + "step": 11795 + }, + { + "epoch": 0.65, + "grad_norm": 0.162109375, + "learning_rate": 6.70075455299225e-05, + "loss": 1.1171, + "step": 11800 + }, + { + "epoch": 0.65, + "grad_norm": 0.1708984375, + "learning_rate": 6.691731393938494e-05, + "loss": 1.1072, + "step": 11805 + }, + { + "epoch": 0.65, + "grad_norm": 0.1748046875, + "learning_rate": 6.682711258402285e-05, + "loss": 1.0588, + "step": 11810 + }, + { + "epoch": 0.65, + "grad_norm": 0.1728515625, + "learning_rate": 6.673694154627372e-05, + "loss": 1.1593, + "step": 11815 + }, + { + "epoch": 0.65, + "grad_norm": 0.1630859375, + "learning_rate": 6.664680090854734e-05, + "loss": 1.155, + "step": 11820 + }, + { + "epoch": 0.65, + "grad_norm": 0.1826171875, + "learning_rate": 6.655669075322573e-05, + "loss": 1.1108, + "step": 11825 + }, + { + "epoch": 0.65, + "grad_norm": 0.171875, + "learning_rate": 6.6466611162663e-05, + "loss": 1.2183, + "step": 11830 + }, + { + "epoch": 0.65, + "grad_norm": 0.17578125, + "learning_rate": 6.637656221918541e-05, + "loss": 1.2246, + "step": 11835 + }, + { + "epoch": 0.65, + "grad_norm": 0.1640625, + "learning_rate": 6.62865440050911e-05, + "loss": 1.1802, + "step": 11840 + }, + { + "epoch": 0.65, + "grad_norm": 0.1708984375, + "learning_rate": 6.619655660265028e-05, + "loss": 1.1221, + "step": 11845 + }, + { + "epoch": 0.65, + "grad_norm": 0.17578125, + "learning_rate": 6.610660009410485e-05, + "loss": 1.1762, + "step": 11850 + }, + { + "epoch": 0.65, + "grad_norm": 0.1689453125, + "learning_rate": 6.601667456166854e-05, + "loss": 1.0455, + "step": 11855 + }, + { + "epoch": 0.65, + "grad_norm": 0.1748046875, + "learning_rate": 6.592678008752673e-05, + "loss": 1.1638, + "step": 11860 + }, + { + "epoch": 0.65, + "grad_norm": 0.1669921875, + "learning_rate": 6.583691675383652e-05, + "loss": 1.1619, + "step": 11865 + }, + { + "epoch": 0.65, + "grad_norm": 0.1611328125, + "learning_rate": 6.574708464272645e-05, + "loss": 1.1493, + "step": 11870 + }, + { + "epoch": 0.65, + "grad_norm": 0.173828125, + "learning_rate": 6.565728383629655e-05, + "loss": 1.2081, + "step": 11875 + }, + { + "epoch": 0.65, + "grad_norm": 0.1748046875, + "learning_rate": 6.556751441661818e-05, + "loss": 1.1543, + "step": 11880 + }, + { + "epoch": 0.65, + "grad_norm": 0.1787109375, + "learning_rate": 6.54777764657342e-05, + "loss": 1.2102, + "step": 11885 + }, + { + "epoch": 0.65, + "grad_norm": 0.173828125, + "learning_rate": 6.538807006565851e-05, + "loss": 1.2417, + "step": 11890 + }, + { + "epoch": 0.65, + "grad_norm": 0.171875, + "learning_rate": 6.529839529837629e-05, + "loss": 1.1262, + "step": 11895 + }, + { + "epoch": 0.65, + "grad_norm": 0.1728515625, + "learning_rate": 6.520875224584369e-05, + "loss": 1.1561, + "step": 11900 + }, + { + "epoch": 0.65, + "grad_norm": 0.1767578125, + "learning_rate": 6.511914098998809e-05, + "loss": 1.2026, + "step": 11905 + }, + { + "epoch": 0.65, + "grad_norm": 0.1630859375, + "learning_rate": 6.502956161270758e-05, + "loss": 1.1308, + "step": 11910 + }, + { + "epoch": 0.65, + "grad_norm": 0.1708984375, + "learning_rate": 6.494001419587123e-05, + "loss": 1.1639, + "step": 11915 + }, + { + "epoch": 0.65, + "grad_norm": 0.1748046875, + "learning_rate": 6.485049882131893e-05, + "loss": 1.1152, + "step": 11920 + }, + { + "epoch": 0.65, + "grad_norm": 0.1767578125, + "learning_rate": 6.476101557086117e-05, + "loss": 1.164, + "step": 11925 + }, + { + "epoch": 0.65, + "grad_norm": 0.1689453125, + "learning_rate": 6.467156452627919e-05, + "loss": 1.1221, + "step": 11930 + }, + { + "epoch": 0.65, + "grad_norm": 0.171875, + "learning_rate": 6.458214576932478e-05, + "loss": 1.1723, + "step": 11935 + }, + { + "epoch": 0.65, + "grad_norm": 0.1708984375, + "learning_rate": 6.449275938172016e-05, + "loss": 1.1294, + "step": 11940 + }, + { + "epoch": 0.65, + "grad_norm": 0.1728515625, + "learning_rate": 6.440340544515798e-05, + "loss": 1.1893, + "step": 11945 + }, + { + "epoch": 0.65, + "grad_norm": 0.177734375, + "learning_rate": 6.431408404130134e-05, + "loss": 1.2588, + "step": 11950 + }, + { + "epoch": 0.65, + "grad_norm": 0.1796875, + "learning_rate": 6.422479525178347e-05, + "loss": 1.1218, + "step": 11955 + }, + { + "epoch": 0.66, + "grad_norm": 0.173828125, + "learning_rate": 6.413553915820785e-05, + "loss": 1.1684, + "step": 11960 + }, + { + "epoch": 0.66, + "grad_norm": 0.1796875, + "learning_rate": 6.404631584214804e-05, + "loss": 1.1561, + "step": 11965 + }, + { + "epoch": 0.66, + "grad_norm": 0.1689453125, + "learning_rate": 6.395712538514778e-05, + "loss": 1.2067, + "step": 11970 + }, + { + "epoch": 0.66, + "grad_norm": 0.1689453125, + "learning_rate": 6.386796786872058e-05, + "loss": 1.188, + "step": 11975 + }, + { + "epoch": 0.66, + "grad_norm": 0.173828125, + "learning_rate": 6.377884337435001e-05, + "loss": 1.2199, + "step": 11980 + }, + { + "epoch": 0.66, + "grad_norm": 0.173828125, + "learning_rate": 6.368975198348932e-05, + "loss": 1.2381, + "step": 11985 + }, + { + "epoch": 0.66, + "grad_norm": 0.16796875, + "learning_rate": 6.360069377756166e-05, + "loss": 1.1386, + "step": 11990 + }, + { + "epoch": 0.66, + "grad_norm": 0.166015625, + "learning_rate": 6.351166883795973e-05, + "loss": 1.1694, + "step": 11995 + }, + { + "epoch": 0.66, + "grad_norm": 0.171875, + "learning_rate": 6.342267724604584e-05, + "loss": 1.1833, + "step": 12000 + }, + { + "epoch": 0.66, + "grad_norm": 0.1728515625, + "learning_rate": 6.33337190831519e-05, + "loss": 1.1712, + "step": 12005 + }, + { + "epoch": 0.66, + "grad_norm": 0.169921875, + "learning_rate": 6.324479443057922e-05, + "loss": 1.1911, + "step": 12010 + }, + { + "epoch": 0.66, + "grad_norm": 0.1748046875, + "learning_rate": 6.315590336959842e-05, + "loss": 1.1977, + "step": 12015 + }, + { + "epoch": 0.66, + "grad_norm": 0.1728515625, + "learning_rate": 6.306704598144956e-05, + "loss": 1.2255, + "step": 12020 + }, + { + "epoch": 0.66, + "grad_norm": 0.1728515625, + "learning_rate": 6.297822234734182e-05, + "loss": 1.0571, + "step": 12025 + }, + { + "epoch": 0.66, + "grad_norm": 0.1728515625, + "learning_rate": 6.28894325484535e-05, + "loss": 1.1637, + "step": 12030 + }, + { + "epoch": 0.66, + "grad_norm": 0.169921875, + "learning_rate": 6.280067666593213e-05, + "loss": 1.2216, + "step": 12035 + }, + { + "epoch": 0.66, + "grad_norm": 0.1826171875, + "learning_rate": 6.271195478089411e-05, + "loss": 1.1278, + "step": 12040 + }, + { + "epoch": 0.66, + "grad_norm": 0.173828125, + "learning_rate": 6.26232669744248e-05, + "loss": 1.21, + "step": 12045 + }, + { + "epoch": 0.66, + "grad_norm": 0.16796875, + "learning_rate": 6.253461332757838e-05, + "loss": 1.1584, + "step": 12050 + }, + { + "epoch": 0.66, + "grad_norm": 0.17578125, + "learning_rate": 6.244599392137794e-05, + "loss": 1.2096, + "step": 12055 + }, + { + "epoch": 0.66, + "grad_norm": 0.181640625, + "learning_rate": 6.235740883681515e-05, + "loss": 1.1499, + "step": 12060 + }, + { + "epoch": 0.66, + "grad_norm": 0.171875, + "learning_rate": 6.226885815485036e-05, + "loss": 1.2646, + "step": 12065 + }, + { + "epoch": 0.66, + "grad_norm": 0.1708984375, + "learning_rate": 6.21803419564124e-05, + "loss": 1.1436, + "step": 12070 + }, + { + "epoch": 0.66, + "grad_norm": 0.1748046875, + "learning_rate": 6.209186032239877e-05, + "loss": 1.0668, + "step": 12075 + }, + { + "epoch": 0.66, + "grad_norm": 0.1689453125, + "learning_rate": 6.200341333367521e-05, + "loss": 1.1911, + "step": 12080 + }, + { + "epoch": 0.66, + "grad_norm": 0.171875, + "learning_rate": 6.191500107107586e-05, + "loss": 1.1395, + "step": 12085 + }, + { + "epoch": 0.66, + "grad_norm": 0.169921875, + "learning_rate": 6.182662361540305e-05, + "loss": 1.2165, + "step": 12090 + }, + { + "epoch": 0.66, + "grad_norm": 0.1708984375, + "learning_rate": 6.173828104742749e-05, + "loss": 1.2104, + "step": 12095 + }, + { + "epoch": 0.66, + "grad_norm": 0.1845703125, + "learning_rate": 6.164997344788776e-05, + "loss": 1.2337, + "step": 12100 + }, + { + "epoch": 0.66, + "grad_norm": 0.1708984375, + "learning_rate": 6.15617008974907e-05, + "loss": 1.1937, + "step": 12105 + }, + { + "epoch": 0.66, + "grad_norm": 0.169921875, + "learning_rate": 6.147346347691097e-05, + "loss": 1.1332, + "step": 12110 + }, + { + "epoch": 0.66, + "grad_norm": 0.166015625, + "learning_rate": 6.138526126679112e-05, + "loss": 1.1138, + "step": 12115 + }, + { + "epoch": 0.66, + "grad_norm": 0.1669921875, + "learning_rate": 6.129709434774166e-05, + "loss": 1.1369, + "step": 12120 + }, + { + "epoch": 0.66, + "grad_norm": 0.1728515625, + "learning_rate": 6.120896280034074e-05, + "loss": 1.1501, + "step": 12125 + }, + { + "epoch": 0.66, + "grad_norm": 0.173828125, + "learning_rate": 6.112086670513419e-05, + "loss": 1.1227, + "step": 12130 + }, + { + "epoch": 0.66, + "grad_norm": 0.173828125, + "learning_rate": 6.103280614263539e-05, + "loss": 1.0718, + "step": 12135 + }, + { + "epoch": 0.66, + "grad_norm": 0.181640625, + "learning_rate": 6.094478119332542e-05, + "loss": 1.2111, + "step": 12140 + }, + { + "epoch": 0.67, + "grad_norm": 0.173828125, + "learning_rate": 6.085679193765264e-05, + "loss": 1.12, + "step": 12145 + }, + { + "epoch": 0.67, + "grad_norm": 0.17578125, + "learning_rate": 6.076883845603285e-05, + "loss": 1.1479, + "step": 12150 + }, + { + "epoch": 0.67, + "grad_norm": 0.17578125, + "learning_rate": 6.068092082884912e-05, + "loss": 1.1859, + "step": 12155 + }, + { + "epoch": 0.67, + "grad_norm": 0.1748046875, + "learning_rate": 6.0593039136451845e-05, + "loss": 1.2051, + "step": 12160 + }, + { + "epoch": 0.67, + "grad_norm": 0.1767578125, + "learning_rate": 6.050519345915851e-05, + "loss": 1.1908, + "step": 12165 + }, + { + "epoch": 0.67, + "grad_norm": 0.1767578125, + "learning_rate": 6.041738387725366e-05, + "loss": 1.137, + "step": 12170 + }, + { + "epoch": 0.67, + "grad_norm": 0.16796875, + "learning_rate": 6.032961047098886e-05, + "loss": 1.1643, + "step": 12175 + }, + { + "epoch": 0.67, + "grad_norm": 0.1669921875, + "learning_rate": 6.024187332058271e-05, + "loss": 1.1522, + "step": 12180 + }, + { + "epoch": 0.67, + "grad_norm": 0.17578125, + "learning_rate": 6.015417250622057e-05, + "loss": 1.1674, + "step": 12185 + }, + { + "epoch": 0.67, + "grad_norm": 0.169921875, + "learning_rate": 6.006650810805461e-05, + "loss": 1.146, + "step": 12190 + }, + { + "epoch": 0.67, + "grad_norm": 0.177734375, + "learning_rate": 5.997888020620373e-05, + "loss": 1.1807, + "step": 12195 + }, + { + "epoch": 0.67, + "grad_norm": 0.171875, + "learning_rate": 5.9891288880753486e-05, + "loss": 1.1484, + "step": 12200 + }, + { + "epoch": 0.67, + "grad_norm": 0.169921875, + "learning_rate": 5.980373421175601e-05, + "loss": 1.1014, + "step": 12205 + }, + { + "epoch": 0.67, + "grad_norm": 0.1728515625, + "learning_rate": 5.971621627922992e-05, + "loss": 1.1192, + "step": 12210 + }, + { + "epoch": 0.67, + "grad_norm": 0.1748046875, + "learning_rate": 5.962873516316023e-05, + "loss": 1.1931, + "step": 12215 + }, + { + "epoch": 0.67, + "grad_norm": 0.171875, + "learning_rate": 5.9541290943498316e-05, + "loss": 1.1835, + "step": 12220 + }, + { + "epoch": 0.67, + "grad_norm": 0.16796875, + "learning_rate": 5.945388370016192e-05, + "loss": 1.1538, + "step": 12225 + }, + { + "epoch": 0.67, + "grad_norm": 0.1728515625, + "learning_rate": 5.9366513513034883e-05, + "loss": 1.1747, + "step": 12230 + }, + { + "epoch": 0.67, + "grad_norm": 0.185546875, + "learning_rate": 5.9279180461967235e-05, + "loss": 1.1316, + "step": 12235 + }, + { + "epoch": 0.67, + "grad_norm": 0.1728515625, + "learning_rate": 5.9191884626774993e-05, + "loss": 1.2002, + "step": 12240 + }, + { + "epoch": 0.67, + "grad_norm": 0.17578125, + "learning_rate": 5.91046260872403e-05, + "loss": 1.1811, + "step": 12245 + }, + { + "epoch": 0.67, + "grad_norm": 0.1708984375, + "learning_rate": 5.901740492311111e-05, + "loss": 1.1516, + "step": 12250 + }, + { + "epoch": 0.67, + "grad_norm": 0.17578125, + "learning_rate": 5.893022121410121e-05, + "loss": 1.1061, + "step": 12255 + }, + { + "epoch": 0.67, + "grad_norm": 0.1650390625, + "learning_rate": 5.8843075039890174e-05, + "loss": 1.0782, + "step": 12260 + }, + { + "epoch": 0.67, + "grad_norm": 0.1748046875, + "learning_rate": 5.8755966480123345e-05, + "loss": 1.2757, + "step": 12265 + }, + { + "epoch": 0.67, + "grad_norm": 0.1650390625, + "learning_rate": 5.86688956144116e-05, + "loss": 1.1038, + "step": 12270 + }, + { + "epoch": 0.67, + "grad_norm": 0.17578125, + "learning_rate": 5.8581862522331376e-05, + "loss": 1.1952, + "step": 12275 + }, + { + "epoch": 0.67, + "grad_norm": 0.1728515625, + "learning_rate": 5.8494867283424615e-05, + "loss": 1.2524, + "step": 12280 + }, + { + "epoch": 0.67, + "grad_norm": 0.1787109375, + "learning_rate": 5.840790997719868e-05, + "loss": 1.1178, + "step": 12285 + }, + { + "epoch": 0.67, + "grad_norm": 0.1787109375, + "learning_rate": 5.832099068312623e-05, + "loss": 1.1701, + "step": 12290 + }, + { + "epoch": 0.67, + "grad_norm": 0.1748046875, + "learning_rate": 5.823410948064516e-05, + "loss": 1.0914, + "step": 12295 + }, + { + "epoch": 0.67, + "grad_norm": 0.1728515625, + "learning_rate": 5.814726644915862e-05, + "loss": 1.1672, + "step": 12300 + }, + { + "epoch": 0.67, + "grad_norm": 0.177734375, + "learning_rate": 5.806046166803485e-05, + "loss": 1.1787, + "step": 12305 + }, + { + "epoch": 0.67, + "grad_norm": 0.17578125, + "learning_rate": 5.797369521660714e-05, + "loss": 1.1533, + "step": 12310 + }, + { + "epoch": 0.67, + "grad_norm": 0.166015625, + "learning_rate": 5.788696717417366e-05, + "loss": 1.2159, + "step": 12315 + }, + { + "epoch": 0.67, + "grad_norm": 0.1689453125, + "learning_rate": 5.78002776199976e-05, + "loss": 1.205, + "step": 12320 + }, + { + "epoch": 0.68, + "grad_norm": 0.173828125, + "learning_rate": 5.7713626633306924e-05, + "loss": 1.1475, + "step": 12325 + }, + { + "epoch": 0.68, + "grad_norm": 0.171875, + "learning_rate": 5.7627014293294334e-05, + "loss": 1.1225, + "step": 12330 + }, + { + "epoch": 0.68, + "grad_norm": 0.1650390625, + "learning_rate": 5.754044067911728e-05, + "loss": 1.2068, + "step": 12335 + }, + { + "epoch": 0.68, + "grad_norm": 0.166015625, + "learning_rate": 5.7453905869897696e-05, + "loss": 1.1058, + "step": 12340 + }, + { + "epoch": 0.68, + "grad_norm": 0.16796875, + "learning_rate": 5.736740994472214e-05, + "loss": 1.2131, + "step": 12345 + }, + { + "epoch": 0.68, + "grad_norm": 0.16796875, + "learning_rate": 5.728095298264161e-05, + "loss": 1.0874, + "step": 12350 + }, + { + "epoch": 0.68, + "grad_norm": 0.173828125, + "learning_rate": 5.719453506267156e-05, + "loss": 1.1569, + "step": 12355 + }, + { + "epoch": 0.68, + "grad_norm": 0.181640625, + "learning_rate": 5.710815626379161e-05, + "loss": 1.1321, + "step": 12360 + }, + { + "epoch": 0.68, + "grad_norm": 0.169921875, + "learning_rate": 5.7021816664945747e-05, + "loss": 1.0889, + "step": 12365 + }, + { + "epoch": 0.68, + "grad_norm": 0.1669921875, + "learning_rate": 5.6935516345042125e-05, + "loss": 1.1424, + "step": 12370 + }, + { + "epoch": 0.68, + "grad_norm": 0.171875, + "learning_rate": 5.6849255382953e-05, + "loss": 1.1572, + "step": 12375 + }, + { + "epoch": 0.68, + "grad_norm": 0.171875, + "learning_rate": 5.6763033857514555e-05, + "loss": 1.1449, + "step": 12380 + }, + { + "epoch": 0.68, + "grad_norm": 0.1982421875, + "learning_rate": 5.667685184752707e-05, + "loss": 1.1553, + "step": 12385 + }, + { + "epoch": 0.68, + "grad_norm": 0.16796875, + "learning_rate": 5.659070943175462e-05, + "loss": 1.1084, + "step": 12390 + }, + { + "epoch": 0.68, + "grad_norm": 0.17578125, + "learning_rate": 5.6504606688925185e-05, + "loss": 1.1901, + "step": 12395 + }, + { + "epoch": 0.68, + "grad_norm": 0.17578125, + "learning_rate": 5.641854369773034e-05, + "loss": 1.1956, + "step": 12400 + }, + { + "epoch": 0.68, + "grad_norm": 0.1767578125, + "learning_rate": 5.633252053682545e-05, + "loss": 1.1692, + "step": 12405 + }, + { + "epoch": 0.68, + "grad_norm": 0.1708984375, + "learning_rate": 5.624653728482947e-05, + "loss": 1.1563, + "step": 12410 + }, + { + "epoch": 0.68, + "grad_norm": 0.1787109375, + "learning_rate": 5.616059402032485e-05, + "loss": 1.1523, + "step": 12415 + }, + { + "epoch": 0.68, + "grad_norm": 0.1748046875, + "learning_rate": 5.607469082185748e-05, + "loss": 1.1506, + "step": 12420 + }, + { + "epoch": 0.68, + "grad_norm": 0.1640625, + "learning_rate": 5.5988827767936633e-05, + "loss": 1.1812, + "step": 12425 + }, + { + "epoch": 0.68, + "grad_norm": 0.1748046875, + "learning_rate": 5.590300493703496e-05, + "loss": 1.0443, + "step": 12430 + }, + { + "epoch": 0.68, + "grad_norm": 0.171875, + "learning_rate": 5.581722240758829e-05, + "loss": 1.2583, + "step": 12435 + }, + { + "epoch": 0.68, + "grad_norm": 0.16796875, + "learning_rate": 5.5731480257995636e-05, + "loss": 1.1145, + "step": 12440 + }, + { + "epoch": 0.68, + "grad_norm": 0.1748046875, + "learning_rate": 5.564577856661908e-05, + "loss": 1.1461, + "step": 12445 + }, + { + "epoch": 0.68, + "grad_norm": 0.1806640625, + "learning_rate": 5.556011741178376e-05, + "loss": 1.2273, + "step": 12450 + }, + { + "epoch": 0.68, + "grad_norm": 0.181640625, + "learning_rate": 5.547449687177777e-05, + "loss": 1.1375, + "step": 12455 + }, + { + "epoch": 0.68, + "grad_norm": 0.1728515625, + "learning_rate": 5.5388917024852096e-05, + "loss": 1.1263, + "step": 12460 + }, + { + "epoch": 0.68, + "grad_norm": 0.166015625, + "learning_rate": 5.5303377949220446e-05, + "loss": 1.1205, + "step": 12465 + }, + { + "epoch": 0.68, + "grad_norm": 0.1689453125, + "learning_rate": 5.521787972305936e-05, + "loss": 1.1976, + "step": 12470 + }, + { + "epoch": 0.68, + "grad_norm": 0.1669921875, + "learning_rate": 5.5132422424508025e-05, + "loss": 1.1398, + "step": 12475 + }, + { + "epoch": 0.68, + "grad_norm": 0.1650390625, + "learning_rate": 5.504700613166827e-05, + "loss": 1.1556, + "step": 12480 + }, + { + "epoch": 0.68, + "grad_norm": 0.16796875, + "learning_rate": 5.4961630922604266e-05, + "loss": 1.2524, + "step": 12485 + }, + { + "epoch": 0.68, + "grad_norm": 0.1953125, + "learning_rate": 5.487629687534284e-05, + "loss": 1.2138, + "step": 12490 + }, + { + "epoch": 0.68, + "grad_norm": 0.1767578125, + "learning_rate": 5.47910040678731e-05, + "loss": 1.1351, + "step": 12495 + }, + { + "epoch": 0.68, + "grad_norm": 0.173828125, + "learning_rate": 5.470575257814654e-05, + "loss": 1.2218, + "step": 12500 + }, + { + "epoch": 0.68, + "grad_norm": 0.16796875, + "learning_rate": 5.462054248407675e-05, + "loss": 1.19, + "step": 12505 + }, + { + "epoch": 0.69, + "grad_norm": 0.1787109375, + "learning_rate": 5.453537386353962e-05, + "loss": 1.1435, + "step": 12510 + }, + { + "epoch": 0.69, + "grad_norm": 0.1728515625, + "learning_rate": 5.4450246794373094e-05, + "loss": 1.2136, + "step": 12515 + }, + { + "epoch": 0.69, + "grad_norm": 0.171875, + "learning_rate": 5.4365161354377124e-05, + "loss": 1.2098, + "step": 12520 + }, + { + "epoch": 0.69, + "grad_norm": 0.177734375, + "learning_rate": 5.4280117621313685e-05, + "loss": 1.1028, + "step": 12525 + }, + { + "epoch": 0.69, + "grad_norm": 0.171875, + "learning_rate": 5.4195115672906494e-05, + "loss": 1.1161, + "step": 12530 + }, + { + "epoch": 0.69, + "grad_norm": 0.1748046875, + "learning_rate": 5.41101555868412e-05, + "loss": 1.2346, + "step": 12535 + }, + { + "epoch": 0.69, + "grad_norm": 0.171875, + "learning_rate": 5.4025237440765195e-05, + "loss": 1.1362, + "step": 12540 + }, + { + "epoch": 0.69, + "grad_norm": 0.177734375, + "learning_rate": 5.3940361312287505e-05, + "loss": 1.1449, + "step": 12545 + }, + { + "epoch": 0.69, + "grad_norm": 0.166015625, + "learning_rate": 5.385552727897868e-05, + "loss": 1.0648, + "step": 12550 + }, + { + "epoch": 0.69, + "grad_norm": 0.1748046875, + "learning_rate": 5.3770735418370924e-05, + "loss": 1.0629, + "step": 12555 + }, + { + "epoch": 0.69, + "grad_norm": 0.1728515625, + "learning_rate": 5.368598580795785e-05, + "loss": 1.1927, + "step": 12560 + }, + { + "epoch": 0.69, + "grad_norm": 0.1708984375, + "learning_rate": 5.3601278525194476e-05, + "loss": 1.1831, + "step": 12565 + }, + { + "epoch": 0.69, + "grad_norm": 0.177734375, + "learning_rate": 5.3516613647497075e-05, + "loss": 1.1503, + "step": 12570 + }, + { + "epoch": 0.69, + "grad_norm": 0.1787109375, + "learning_rate": 5.343199125224319e-05, + "loss": 1.1571, + "step": 12575 + }, + { + "epoch": 0.69, + "grad_norm": 0.1669921875, + "learning_rate": 5.33474114167716e-05, + "loss": 1.209, + "step": 12580 + }, + { + "epoch": 0.69, + "grad_norm": 0.1748046875, + "learning_rate": 5.326287421838214e-05, + "loss": 1.1659, + "step": 12585 + }, + { + "epoch": 0.69, + "grad_norm": 0.1728515625, + "learning_rate": 5.317837973433563e-05, + "loss": 1.1821, + "step": 12590 + }, + { + "epoch": 0.69, + "grad_norm": 0.177734375, + "learning_rate": 5.309392804185391e-05, + "loss": 1.1669, + "step": 12595 + }, + { + "epoch": 0.69, + "grad_norm": 0.1806640625, + "learning_rate": 5.300951921811974e-05, + "loss": 1.1642, + "step": 12600 + }, + { + "epoch": 0.69, + "grad_norm": 0.17578125, + "learning_rate": 5.2925153340276626e-05, + "loss": 1.1552, + "step": 12605 + }, + { + "epoch": 0.69, + "grad_norm": 0.173828125, + "learning_rate": 5.2840830485428917e-05, + "loss": 1.2297, + "step": 12610 + }, + { + "epoch": 0.69, + "grad_norm": 0.1767578125, + "learning_rate": 5.2756550730641505e-05, + "loss": 1.1281, + "step": 12615 + }, + { + "epoch": 0.69, + "grad_norm": 0.181640625, + "learning_rate": 5.267231415294002e-05, + "loss": 1.1497, + "step": 12620 + }, + { + "epoch": 0.69, + "grad_norm": 0.16796875, + "learning_rate": 5.258812082931057e-05, + "loss": 1.1669, + "step": 12625 + }, + { + "epoch": 0.69, + "grad_norm": 0.1806640625, + "learning_rate": 5.250397083669979e-05, + "loss": 1.1949, + "step": 12630 + }, + { + "epoch": 0.69, + "grad_norm": 0.1669921875, + "learning_rate": 5.241986425201459e-05, + "loss": 1.1882, + "step": 12635 + }, + { + "epoch": 0.69, + "grad_norm": 0.1728515625, + "learning_rate": 5.233580115212233e-05, + "loss": 1.1623, + "step": 12640 + }, + { + "epoch": 0.69, + "grad_norm": 0.1748046875, + "learning_rate": 5.2251781613850606e-05, + "loss": 1.1773, + "step": 12645 + }, + { + "epoch": 0.69, + "grad_norm": 0.169921875, + "learning_rate": 5.2167805713987184e-05, + "loss": 1.16, + "step": 12650 + }, + { + "epoch": 0.69, + "grad_norm": 0.1689453125, + "learning_rate": 5.208387352927988e-05, + "loss": 1.1388, + "step": 12655 + }, + { + "epoch": 0.69, + "grad_norm": 0.1796875, + "learning_rate": 5.199998513643667e-05, + "loss": 1.1086, + "step": 12660 + }, + { + "epoch": 0.69, + "grad_norm": 0.1689453125, + "learning_rate": 5.191614061212546e-05, + "loss": 1.1496, + "step": 12665 + }, + { + "epoch": 0.69, + "grad_norm": 0.171875, + "learning_rate": 5.18323400329741e-05, + "loss": 1.1491, + "step": 12670 + }, + { + "epoch": 0.69, + "grad_norm": 0.1728515625, + "learning_rate": 5.174858347557015e-05, + "loss": 1.1115, + "step": 12675 + }, + { + "epoch": 0.69, + "grad_norm": 0.16796875, + "learning_rate": 5.166487101646109e-05, + "loss": 1.2119, + "step": 12680 + }, + { + "epoch": 0.69, + "grad_norm": 0.1650390625, + "learning_rate": 5.1581202732154035e-05, + "loss": 1.2117, + "step": 12685 + }, + { + "epoch": 0.7, + "grad_norm": 0.1767578125, + "learning_rate": 5.14975786991157e-05, + "loss": 1.2826, + "step": 12690 + }, + { + "epoch": 0.7, + "grad_norm": 0.17578125, + "learning_rate": 5.141399899377245e-05, + "loss": 1.1498, + "step": 12695 + }, + { + "epoch": 0.7, + "grad_norm": 0.1767578125, + "learning_rate": 5.133046369250998e-05, + "loss": 1.1839, + "step": 12700 + }, + { + "epoch": 0.7, + "grad_norm": 0.1748046875, + "learning_rate": 5.124697287167355e-05, + "loss": 1.2456, + "step": 12705 + }, + { + "epoch": 0.7, + "grad_norm": 0.17578125, + "learning_rate": 5.116352660756771e-05, + "loss": 1.1911, + "step": 12710 + }, + { + "epoch": 0.7, + "grad_norm": 0.16796875, + "learning_rate": 5.1080124976456334e-05, + "loss": 1.2231, + "step": 12715 + }, + { + "epoch": 0.7, + "grad_norm": 0.1708984375, + "learning_rate": 5.099676805456237e-05, + "loss": 1.2012, + "step": 12720 + }, + { + "epoch": 0.7, + "grad_norm": 0.1640625, + "learning_rate": 5.091345591806807e-05, + "loss": 1.1584, + "step": 12725 + }, + { + "epoch": 0.7, + "grad_norm": 0.1708984375, + "learning_rate": 5.083018864311464e-05, + "loss": 1.1039, + "step": 12730 + }, + { + "epoch": 0.7, + "grad_norm": 0.1728515625, + "learning_rate": 5.07469663058024e-05, + "loss": 1.205, + "step": 12735 + }, + { + "epoch": 0.7, + "grad_norm": 0.177734375, + "learning_rate": 5.066378898219044e-05, + "loss": 1.1802, + "step": 12740 + }, + { + "epoch": 0.7, + "grad_norm": 0.1689453125, + "learning_rate": 5.058065674829685e-05, + "loss": 1.1798, + "step": 12745 + }, + { + "epoch": 0.7, + "grad_norm": 0.1669921875, + "learning_rate": 5.0497569680098445e-05, + "loss": 1.1172, + "step": 12750 + }, + { + "epoch": 0.7, + "grad_norm": 0.1669921875, + "learning_rate": 5.0414527853530846e-05, + "loss": 1.0927, + "step": 12755 + }, + { + "epoch": 0.7, + "grad_norm": 0.171875, + "learning_rate": 5.033153134448814e-05, + "loss": 1.1358, + "step": 12760 + }, + { + "epoch": 0.7, + "grad_norm": 0.173828125, + "learning_rate": 5.02485802288232e-05, + "loss": 1.1605, + "step": 12765 + }, + { + "epoch": 0.7, + "grad_norm": 0.1767578125, + "learning_rate": 5.016567458234729e-05, + "loss": 1.225, + "step": 12770 + }, + { + "epoch": 0.7, + "grad_norm": 0.1806640625, + "learning_rate": 5.008281448083021e-05, + "loss": 1.2231, + "step": 12775 + }, + { + "epoch": 0.7, + "grad_norm": 0.1728515625, + "learning_rate": 5.000000000000002e-05, + "loss": 1.143, + "step": 12780 + }, + { + "epoch": 0.7, + "grad_norm": 0.1748046875, + "learning_rate": 4.991723121554318e-05, + "loss": 1.1943, + "step": 12785 + }, + { + "epoch": 0.7, + "grad_norm": 0.1728515625, + "learning_rate": 4.983450820310433e-05, + "loss": 1.1777, + "step": 12790 + }, + { + "epoch": 0.7, + "grad_norm": 0.1640625, + "learning_rate": 4.975183103828635e-05, + "loss": 1.1359, + "step": 12795 + }, + { + "epoch": 0.7, + "grad_norm": 0.169921875, + "learning_rate": 4.966919979665018e-05, + "loss": 1.1229, + "step": 12800 + }, + { + "epoch": 0.7, + "grad_norm": 0.1748046875, + "learning_rate": 4.958661455371466e-05, + "loss": 1.1743, + "step": 12805 + }, + { + "epoch": 0.7, + "grad_norm": 0.1728515625, + "learning_rate": 4.950407538495688e-05, + "loss": 1.1673, + "step": 12810 + }, + { + "epoch": 0.7, + "grad_norm": 0.1767578125, + "learning_rate": 4.942158236581153e-05, + "loss": 1.2264, + "step": 12815 + }, + { + "epoch": 0.7, + "grad_norm": 0.1650390625, + "learning_rate": 4.9339135571671344e-05, + "loss": 1.1127, + "step": 12820 + }, + { + "epoch": 0.7, + "grad_norm": 0.1728515625, + "learning_rate": 4.9256735077886605e-05, + "loss": 1.1564, + "step": 12825 + }, + { + "epoch": 0.7, + "grad_norm": 0.1669921875, + "learning_rate": 4.917438095976544e-05, + "loss": 1.1626, + "step": 12830 + }, + { + "epoch": 0.7, + "grad_norm": 0.169921875, + "learning_rate": 4.9092073292573545e-05, + "loss": 1.2197, + "step": 12835 + }, + { + "epoch": 0.7, + "grad_norm": 0.169921875, + "learning_rate": 4.900981215153419e-05, + "loss": 1.1686, + "step": 12840 + }, + { + "epoch": 0.7, + "grad_norm": 0.16796875, + "learning_rate": 4.892759761182801e-05, + "loss": 1.1162, + "step": 12845 + }, + { + "epoch": 0.7, + "grad_norm": 0.169921875, + "learning_rate": 4.884542974859319e-05, + "loss": 1.1793, + "step": 12850 + }, + { + "epoch": 0.7, + "grad_norm": 0.1689453125, + "learning_rate": 4.8763308636925154e-05, + "loss": 1.1517, + "step": 12855 + }, + { + "epoch": 0.7, + "grad_norm": 0.1728515625, + "learning_rate": 4.868123435187673e-05, + "loss": 1.1833, + "step": 12860 + }, + { + "epoch": 0.7, + "grad_norm": 0.1640625, + "learning_rate": 4.8599206968457746e-05, + "loss": 1.09, + "step": 12865 + }, + { + "epoch": 0.7, + "grad_norm": 0.166015625, + "learning_rate": 4.851722656163534e-05, + "loss": 1.2365, + "step": 12870 + }, + { + "epoch": 0.71, + "grad_norm": 0.1689453125, + "learning_rate": 4.8435293206333656e-05, + "loss": 1.2035, + "step": 12875 + }, + { + "epoch": 0.71, + "grad_norm": 0.1728515625, + "learning_rate": 4.835340697743383e-05, + "loss": 1.2277, + "step": 12880 + }, + { + "epoch": 0.71, + "grad_norm": 0.1767578125, + "learning_rate": 4.827156794977399e-05, + "loss": 1.115, + "step": 12885 + }, + { + "epoch": 0.71, + "grad_norm": 0.1748046875, + "learning_rate": 4.818977619814893e-05, + "loss": 1.1453, + "step": 12890 + }, + { + "epoch": 0.71, + "grad_norm": 0.169921875, + "learning_rate": 4.810803179731056e-05, + "loss": 1.1685, + "step": 12895 + }, + { + "epoch": 0.71, + "grad_norm": 0.1787109375, + "learning_rate": 4.802633482196719e-05, + "loss": 1.1615, + "step": 12900 + }, + { + "epoch": 0.71, + "grad_norm": 0.173828125, + "learning_rate": 4.7944685346784024e-05, + "loss": 1.1596, + "step": 12905 + }, + { + "epoch": 0.71, + "grad_norm": 0.17578125, + "learning_rate": 4.786308344638265e-05, + "loss": 1.1528, + "step": 12910 + }, + { + "epoch": 0.71, + "grad_norm": 0.173828125, + "learning_rate": 4.778152919534143e-05, + "loss": 1.1603, + "step": 12915 + }, + { + "epoch": 0.71, + "grad_norm": 0.1748046875, + "learning_rate": 4.7700022668194925e-05, + "loss": 1.1256, + "step": 12920 + }, + { + "epoch": 0.71, + "grad_norm": 0.1689453125, + "learning_rate": 4.7618563939434244e-05, + "loss": 1.1293, + "step": 12925 + }, + { + "epoch": 0.71, + "grad_norm": 0.1669921875, + "learning_rate": 4.7537153083506716e-05, + "loss": 1.1085, + "step": 12930 + }, + { + "epoch": 0.71, + "grad_norm": 0.1689453125, + "learning_rate": 4.745579017481596e-05, + "loss": 1.2238, + "step": 12935 + }, + { + "epoch": 0.71, + "grad_norm": 0.173828125, + "learning_rate": 4.737447528772181e-05, + "loss": 1.1304, + "step": 12940 + }, + { + "epoch": 0.71, + "grad_norm": 0.1787109375, + "learning_rate": 4.729320849654018e-05, + "loss": 1.2142, + "step": 12945 + }, + { + "epoch": 0.71, + "grad_norm": 0.1708984375, + "learning_rate": 4.721198987554296e-05, + "loss": 1.2238, + "step": 12950 + }, + { + "epoch": 0.71, + "grad_norm": 0.1787109375, + "learning_rate": 4.7130819498958125e-05, + "loss": 1.1509, + "step": 12955 + }, + { + "epoch": 0.71, + "grad_norm": 0.16796875, + "learning_rate": 4.7049697440969496e-05, + "loss": 1.2085, + "step": 12960 + }, + { + "epoch": 0.71, + "grad_norm": 0.1787109375, + "learning_rate": 4.696862377571676e-05, + "loss": 1.2026, + "step": 12965 + }, + { + "epoch": 0.71, + "grad_norm": 0.1748046875, + "learning_rate": 4.68875985772954e-05, + "loss": 1.1621, + "step": 12970 + }, + { + "epoch": 0.71, + "grad_norm": 0.181640625, + "learning_rate": 4.68066219197565e-05, + "loss": 1.128, + "step": 12975 + }, + { + "epoch": 0.71, + "grad_norm": 0.1650390625, + "learning_rate": 4.672569387710689e-05, + "loss": 1.2856, + "step": 12980 + }, + { + "epoch": 0.71, + "grad_norm": 0.1689453125, + "learning_rate": 4.6644814523308924e-05, + "loss": 1.1252, + "step": 12985 + }, + { + "epoch": 0.71, + "grad_norm": 0.173828125, + "learning_rate": 4.6563983932280495e-05, + "loss": 1.1572, + "step": 12990 + }, + { + "epoch": 0.71, + "grad_norm": 0.171875, + "learning_rate": 4.64832021778948e-05, + "loss": 1.1686, + "step": 12995 + }, + { + "epoch": 0.71, + "grad_norm": 0.169921875, + "learning_rate": 4.6402469333980656e-05, + "loss": 1.2313, + "step": 13000 + }, + { + "epoch": 0.71, + "grad_norm": 0.1689453125, + "learning_rate": 4.632178547432192e-05, + "loss": 1.1801, + "step": 13005 + }, + { + "epoch": 0.71, + "grad_norm": 0.1669921875, + "learning_rate": 4.624115067265784e-05, + "loss": 1.2349, + "step": 13010 + }, + { + "epoch": 0.71, + "grad_norm": 0.1728515625, + "learning_rate": 4.616056500268268e-05, + "loss": 1.1818, + "step": 13015 + }, + { + "epoch": 0.71, + "grad_norm": 0.1767578125, + "learning_rate": 4.608002853804604e-05, + "loss": 1.1842, + "step": 13020 + }, + { + "epoch": 0.71, + "grad_norm": 0.1689453125, + "learning_rate": 4.5999541352352294e-05, + "loss": 1.1208, + "step": 13025 + }, + { + "epoch": 0.71, + "grad_norm": 0.1728515625, + "learning_rate": 4.591910351916097e-05, + "loss": 1.0402, + "step": 13030 + }, + { + "epoch": 0.71, + "grad_norm": 0.173828125, + "learning_rate": 4.583871511198634e-05, + "loss": 1.187, + "step": 13035 + }, + { + "epoch": 0.71, + "grad_norm": 0.1728515625, + "learning_rate": 4.575837620429762e-05, + "loss": 1.2104, + "step": 13040 + }, + { + "epoch": 0.71, + "grad_norm": 0.1806640625, + "learning_rate": 4.567808686951872e-05, + "loss": 1.1241, + "step": 13045 + }, + { + "epoch": 0.71, + "grad_norm": 0.166015625, + "learning_rate": 4.5597847181028296e-05, + "loss": 1.1456, + "step": 13050 + }, + { + "epoch": 0.72, + "grad_norm": 0.1728515625, + "learning_rate": 4.551765721215964e-05, + "loss": 1.1684, + "step": 13055 + }, + { + "epoch": 0.72, + "grad_norm": 0.17578125, + "learning_rate": 4.5437517036200474e-05, + "loss": 1.1856, + "step": 13060 + }, + { + "epoch": 0.72, + "grad_norm": 0.1728515625, + "learning_rate": 4.5357426726393146e-05, + "loss": 1.1849, + "step": 13065 + }, + { + "epoch": 0.72, + "grad_norm": 0.1708984375, + "learning_rate": 4.52773863559344e-05, + "loss": 1.1626, + "step": 13070 + }, + { + "epoch": 0.72, + "grad_norm": 0.1640625, + "learning_rate": 4.5197395997975365e-05, + "loss": 1.1048, + "step": 13075 + }, + { + "epoch": 0.72, + "grad_norm": 0.173828125, + "learning_rate": 4.5117455725621304e-05, + "loss": 1.267, + "step": 13080 + }, + { + "epoch": 0.72, + "grad_norm": 0.173828125, + "learning_rate": 4.5037565611931995e-05, + "loss": 1.2042, + "step": 13085 + }, + { + "epoch": 0.72, + "grad_norm": 0.173828125, + "learning_rate": 4.4957725729921074e-05, + "loss": 1.144, + "step": 13090 + }, + { + "epoch": 0.72, + "grad_norm": 0.177734375, + "learning_rate": 4.48779361525565e-05, + "loss": 1.1252, + "step": 13095 + }, + { + "epoch": 0.72, + "grad_norm": 0.1767578125, + "learning_rate": 4.479819695276003e-05, + "loss": 1.2143, + "step": 13100 + }, + { + "epoch": 0.72, + "grad_norm": 0.166015625, + "learning_rate": 4.471850820340766e-05, + "loss": 1.0556, + "step": 13105 + }, + { + "epoch": 0.72, + "grad_norm": 0.16796875, + "learning_rate": 4.463886997732901e-05, + "loss": 1.1739, + "step": 13110 + }, + { + "epoch": 0.72, + "grad_norm": 0.169921875, + "learning_rate": 4.455928234730774e-05, + "loss": 1.0889, + "step": 13115 + }, + { + "epoch": 0.72, + "grad_norm": 0.1748046875, + "learning_rate": 4.447974538608107e-05, + "loss": 1.1132, + "step": 13120 + }, + { + "epoch": 0.72, + "grad_norm": 0.1748046875, + "learning_rate": 4.4400259166340076e-05, + "loss": 1.1859, + "step": 13125 + }, + { + "epoch": 0.72, + "grad_norm": 0.173828125, + "learning_rate": 4.432082376072937e-05, + "loss": 1.1718, + "step": 13130 + }, + { + "epoch": 0.72, + "grad_norm": 0.1630859375, + "learning_rate": 4.4241439241847205e-05, + "loss": 1.1603, + "step": 13135 + }, + { + "epoch": 0.72, + "grad_norm": 0.17578125, + "learning_rate": 4.416210568224519e-05, + "loss": 1.179, + "step": 13140 + }, + { + "epoch": 0.72, + "grad_norm": 0.1767578125, + "learning_rate": 4.40828231544285e-05, + "loss": 1.2242, + "step": 13145 + }, + { + "epoch": 0.72, + "grad_norm": 0.169921875, + "learning_rate": 4.40035917308556e-05, + "loss": 1.1504, + "step": 13150 + }, + { + "epoch": 0.72, + "grad_norm": 0.1728515625, + "learning_rate": 4.3924411483938254e-05, + "loss": 1.17, + "step": 13155 + }, + { + "epoch": 0.72, + "grad_norm": 0.1806640625, + "learning_rate": 4.384528248604153e-05, + "loss": 1.1896, + "step": 13160 + }, + { + "epoch": 0.72, + "grad_norm": 0.1787109375, + "learning_rate": 4.376620480948345e-05, + "loss": 1.1076, + "step": 13165 + }, + { + "epoch": 0.72, + "grad_norm": 0.1689453125, + "learning_rate": 4.3687178526535444e-05, + "loss": 1.2358, + "step": 13170 + }, + { + "epoch": 0.72, + "grad_norm": 0.18359375, + "learning_rate": 4.360820370942168e-05, + "loss": 1.133, + "step": 13175 + }, + { + "epoch": 0.72, + "grad_norm": 0.1767578125, + "learning_rate": 4.35292804303195e-05, + "loss": 1.1723, + "step": 13180 + }, + { + "epoch": 0.72, + "grad_norm": 0.1708984375, + "learning_rate": 4.345040876135894e-05, + "loss": 1.1845, + "step": 13185 + }, + { + "epoch": 0.72, + "grad_norm": 0.171875, + "learning_rate": 4.337158877462312e-05, + "loss": 1.1915, + "step": 13190 + }, + { + "epoch": 0.72, + "grad_norm": 0.177734375, + "learning_rate": 4.3292820542147695e-05, + "loss": 1.1755, + "step": 13195 + }, + { + "epoch": 0.72, + "grad_norm": 0.1708984375, + "learning_rate": 4.321410413592118e-05, + "loss": 1.1164, + "step": 13200 + }, + { + "epoch": 0.72, + "grad_norm": 0.166015625, + "learning_rate": 4.3135439627884565e-05, + "loss": 1.1593, + "step": 13205 + }, + { + "epoch": 0.72, + "grad_norm": 0.169921875, + "learning_rate": 4.305682708993164e-05, + "loss": 1.1905, + "step": 13210 + }, + { + "epoch": 0.72, + "grad_norm": 0.17578125, + "learning_rate": 4.297826659390844e-05, + "loss": 1.2257, + "step": 13215 + }, + { + "epoch": 0.72, + "grad_norm": 0.1748046875, + "learning_rate": 4.289975821161366e-05, + "loss": 1.041, + "step": 13220 + }, + { + "epoch": 0.72, + "grad_norm": 0.16796875, + "learning_rate": 4.282130201479818e-05, + "loss": 1.1574, + "step": 13225 + }, + { + "epoch": 0.72, + "grad_norm": 0.1708984375, + "learning_rate": 4.274289807516532e-05, + "loss": 1.1792, + "step": 13230 + }, + { + "epoch": 0.72, + "grad_norm": 0.1767578125, + "learning_rate": 4.266454646437057e-05, + "loss": 1.2165, + "step": 13235 + }, + { + "epoch": 0.73, + "grad_norm": 0.1748046875, + "learning_rate": 4.258624725402165e-05, + "loss": 1.1101, + "step": 13240 + }, + { + "epoch": 0.73, + "grad_norm": 0.1787109375, + "learning_rate": 4.250800051567837e-05, + "loss": 1.1152, + "step": 13245 + }, + { + "epoch": 0.73, + "grad_norm": 0.166015625, + "learning_rate": 4.242980632085247e-05, + "loss": 1.0985, + "step": 13250 + }, + { + "epoch": 0.73, + "grad_norm": 0.17578125, + "learning_rate": 4.235166474100793e-05, + "loss": 1.1884, + "step": 13255 + }, + { + "epoch": 0.73, + "grad_norm": 0.169921875, + "learning_rate": 4.2273575847560364e-05, + "loss": 1.1598, + "step": 13260 + }, + { + "epoch": 0.73, + "grad_norm": 0.1787109375, + "learning_rate": 4.219553971187744e-05, + "loss": 1.1828, + "step": 13265 + }, + { + "epoch": 0.73, + "grad_norm": 0.173828125, + "learning_rate": 4.21175564052784e-05, + "loss": 1.142, + "step": 13270 + }, + { + "epoch": 0.73, + "grad_norm": 0.169921875, + "learning_rate": 4.2039625999034505e-05, + "loss": 1.2091, + "step": 13275 + }, + { + "epoch": 0.73, + "grad_norm": 0.18359375, + "learning_rate": 4.1961748564368356e-05, + "loss": 1.0797, + "step": 13280 + }, + { + "epoch": 0.73, + "grad_norm": 0.1767578125, + "learning_rate": 4.1883924172454356e-05, + "loss": 1.2071, + "step": 13285 + }, + { + "epoch": 0.73, + "grad_norm": 0.1767578125, + "learning_rate": 4.180615289441826e-05, + "loss": 1.1632, + "step": 13290 + }, + { + "epoch": 0.73, + "grad_norm": 0.1748046875, + "learning_rate": 4.17284348013375e-05, + "loss": 1.1654, + "step": 13295 + }, + { + "epoch": 0.73, + "grad_norm": 0.1630859375, + "learning_rate": 4.16507699642407e-05, + "loss": 1.1511, + "step": 13300 + }, + { + "epoch": 0.73, + "grad_norm": 0.171875, + "learning_rate": 4.157315845410792e-05, + "loss": 1.2286, + "step": 13305 + }, + { + "epoch": 0.73, + "grad_norm": 0.1767578125, + "learning_rate": 4.1495600341870375e-05, + "loss": 1.2273, + "step": 13310 + }, + { + "epoch": 0.73, + "grad_norm": 0.1728515625, + "learning_rate": 4.141809569841068e-05, + "loss": 1.1825, + "step": 13315 + }, + { + "epoch": 0.73, + "grad_norm": 0.1767578125, + "learning_rate": 4.134064459456235e-05, + "loss": 1.1725, + "step": 13320 + }, + { + "epoch": 0.73, + "grad_norm": 0.171875, + "learning_rate": 4.126324710111012e-05, + "loss": 1.0658, + "step": 13325 + }, + { + "epoch": 0.73, + "grad_norm": 0.171875, + "learning_rate": 4.118590328878973e-05, + "loss": 1.2339, + "step": 13330 + }, + { + "epoch": 0.73, + "grad_norm": 0.1787109375, + "learning_rate": 4.1108613228287716e-05, + "loss": 1.1675, + "step": 13335 + }, + { + "epoch": 0.73, + "grad_norm": 0.1728515625, + "learning_rate": 4.1031376990241644e-05, + "loss": 1.1443, + "step": 13340 + }, + { + "epoch": 0.73, + "grad_norm": 0.169921875, + "learning_rate": 4.095419464523982e-05, + "loss": 1.1169, + "step": 13345 + }, + { + "epoch": 0.73, + "grad_norm": 0.1826171875, + "learning_rate": 4.087706626382136e-05, + "loss": 1.1747, + "step": 13350 + }, + { + "epoch": 0.73, + "grad_norm": 0.1787109375, + "learning_rate": 4.079999191647589e-05, + "loss": 1.1475, + "step": 13355 + }, + { + "epoch": 0.73, + "grad_norm": 0.1728515625, + "learning_rate": 4.072297167364393e-05, + "loss": 1.108, + "step": 13360 + }, + { + "epoch": 0.73, + "grad_norm": 0.1689453125, + "learning_rate": 4.0646005605716266e-05, + "loss": 1.1033, + "step": 13365 + }, + { + "epoch": 0.73, + "grad_norm": 0.1669921875, + "learning_rate": 4.0569093783034396e-05, + "loss": 1.1738, + "step": 13370 + }, + { + "epoch": 0.73, + "grad_norm": 0.16796875, + "learning_rate": 4.049223627589002e-05, + "loss": 1.1878, + "step": 13375 + }, + { + "epoch": 0.73, + "grad_norm": 0.1708984375, + "learning_rate": 4.0415433154525484e-05, + "loss": 1.1707, + "step": 13380 + }, + { + "epoch": 0.73, + "grad_norm": 0.1669921875, + "learning_rate": 4.033868448913316e-05, + "loss": 1.1536, + "step": 13385 + }, + { + "epoch": 0.73, + "grad_norm": 0.177734375, + "learning_rate": 4.0261990349855827e-05, + "loss": 1.1218, + "step": 13390 + }, + { + "epoch": 0.73, + "grad_norm": 0.1767578125, + "learning_rate": 4.018535080678626e-05, + "loss": 1.1605, + "step": 13395 + }, + { + "epoch": 0.73, + "grad_norm": 0.1748046875, + "learning_rate": 4.0108765929967585e-05, + "loss": 1.0793, + "step": 13400 + }, + { + "epoch": 0.73, + "grad_norm": 0.1708984375, + "learning_rate": 4.0032235789392716e-05, + "loss": 1.2144, + "step": 13405 + }, + { + "epoch": 0.73, + "grad_norm": 0.166015625, + "learning_rate": 3.99557604550047e-05, + "loss": 1.1918, + "step": 13410 + }, + { + "epoch": 0.73, + "grad_norm": 0.1826171875, + "learning_rate": 3.987933999669642e-05, + "loss": 1.1431, + "step": 13415 + }, + { + "epoch": 0.74, + "grad_norm": 0.169921875, + "learning_rate": 3.98029744843107e-05, + "loss": 1.1728, + "step": 13420 + }, + { + "epoch": 0.74, + "grad_norm": 0.1728515625, + "learning_rate": 3.972666398763997e-05, + "loss": 1.1354, + "step": 13425 + }, + { + "epoch": 0.74, + "grad_norm": 0.171875, + "learning_rate": 3.965040857642656e-05, + "loss": 1.1052, + "step": 13430 + }, + { + "epoch": 0.74, + "grad_norm": 0.17578125, + "learning_rate": 3.9574208320362396e-05, + "loss": 1.2923, + "step": 13435 + }, + { + "epoch": 0.74, + "grad_norm": 0.1728515625, + "learning_rate": 3.949806328908888e-05, + "loss": 1.1323, + "step": 13440 + }, + { + "epoch": 0.74, + "grad_norm": 0.181640625, + "learning_rate": 3.942197355219721e-05, + "loss": 1.116, + "step": 13445 + }, + { + "epoch": 0.74, + "grad_norm": 0.177734375, + "learning_rate": 3.9345939179227766e-05, + "loss": 1.2529, + "step": 13450 + }, + { + "epoch": 0.74, + "grad_norm": 0.173828125, + "learning_rate": 3.926996023967052e-05, + "loss": 1.2199, + "step": 13455 + }, + { + "epoch": 0.74, + "grad_norm": 0.16796875, + "learning_rate": 3.9194036802964604e-05, + "loss": 1.0931, + "step": 13460 + }, + { + "epoch": 0.74, + "grad_norm": 0.1708984375, + "learning_rate": 3.911816893849869e-05, + "loss": 1.1403, + "step": 13465 + }, + { + "epoch": 0.74, + "grad_norm": 0.1650390625, + "learning_rate": 3.9042356715610375e-05, + "loss": 1.1292, + "step": 13470 + }, + { + "epoch": 0.74, + "grad_norm": 0.1796875, + "learning_rate": 3.89666002035866e-05, + "loss": 1.2141, + "step": 13475 + }, + { + "epoch": 0.74, + "grad_norm": 0.1787109375, + "learning_rate": 3.8890899471663244e-05, + "loss": 1.2205, + "step": 13480 + }, + { + "epoch": 0.74, + "grad_norm": 0.1767578125, + "learning_rate": 3.8815254589025405e-05, + "loss": 1.1424, + "step": 13485 + }, + { + "epoch": 0.74, + "grad_norm": 0.169921875, + "learning_rate": 3.873966562480692e-05, + "loss": 1.1041, + "step": 13490 + }, + { + "epoch": 0.74, + "grad_norm": 0.17578125, + "learning_rate": 3.866413264809069e-05, + "loss": 1.1941, + "step": 13495 + }, + { + "epoch": 0.74, + "grad_norm": 0.1728515625, + "learning_rate": 3.858865572790826e-05, + "loss": 1.1208, + "step": 13500 + }, + { + "epoch": 0.74, + "grad_norm": 0.16796875, + "learning_rate": 3.8513234933240216e-05, + "loss": 1.2222, + "step": 13505 + }, + { + "epoch": 0.74, + "grad_norm": 0.1640625, + "learning_rate": 3.843787033301559e-05, + "loss": 1.1738, + "step": 13510 + }, + { + "epoch": 0.74, + "grad_norm": 0.17578125, + "learning_rate": 3.836256199611218e-05, + "loss": 1.1422, + "step": 13515 + }, + { + "epoch": 0.74, + "grad_norm": 0.1689453125, + "learning_rate": 3.828730999135639e-05, + "loss": 1.2219, + "step": 13520 + }, + { + "epoch": 0.74, + "grad_norm": 0.1748046875, + "learning_rate": 3.821211438752299e-05, + "loss": 1.1706, + "step": 13525 + }, + { + "epoch": 0.74, + "grad_norm": 0.171875, + "learning_rate": 3.813697525333544e-05, + "loss": 1.0983, + "step": 13530 + }, + { + "epoch": 0.74, + "grad_norm": 0.1650390625, + "learning_rate": 3.806189265746536e-05, + "loss": 1.2105, + "step": 13535 + }, + { + "epoch": 0.74, + "grad_norm": 0.1689453125, + "learning_rate": 3.7986866668532864e-05, + "loss": 1.2181, + "step": 13540 + }, + { + "epoch": 0.74, + "grad_norm": 0.1796875, + "learning_rate": 3.791189735510616e-05, + "loss": 1.2017, + "step": 13545 + }, + { + "epoch": 0.74, + "grad_norm": 0.1728515625, + "learning_rate": 3.78369847857019e-05, + "loss": 1.1555, + "step": 13550 + }, + { + "epoch": 0.74, + "grad_norm": 0.173828125, + "learning_rate": 3.7762129028784634e-05, + "loss": 1.2115, + "step": 13555 + }, + { + "epoch": 0.74, + "grad_norm": 0.173828125, + "learning_rate": 3.768733015276717e-05, + "loss": 1.2153, + "step": 13560 + }, + { + "epoch": 0.74, + "grad_norm": 0.1650390625, + "learning_rate": 3.761258822601015e-05, + "loss": 1.1868, + "step": 13565 + }, + { + "epoch": 0.74, + "grad_norm": 0.1767578125, + "learning_rate": 3.75379033168224e-05, + "loss": 1.2226, + "step": 13570 + }, + { + "epoch": 0.74, + "grad_norm": 0.169921875, + "learning_rate": 3.7463275493460425e-05, + "loss": 1.2322, + "step": 13575 + }, + { + "epoch": 0.74, + "grad_norm": 0.1708984375, + "learning_rate": 3.7388704824128696e-05, + "loss": 1.1601, + "step": 13580 + }, + { + "epoch": 0.74, + "grad_norm": 0.166015625, + "learning_rate": 3.73141913769793e-05, + "loss": 1.2101, + "step": 13585 + }, + { + "epoch": 0.74, + "grad_norm": 0.166015625, + "learning_rate": 3.723973522011226e-05, + "loss": 1.1101, + "step": 13590 + }, + { + "epoch": 0.74, + "grad_norm": 0.1845703125, + "learning_rate": 3.7165336421575006e-05, + "loss": 1.1107, + "step": 13595 + }, + { + "epoch": 0.74, + "grad_norm": 0.1669921875, + "learning_rate": 3.709099504936266e-05, + "loss": 1.1833, + "step": 13600 + }, + { + "epoch": 0.75, + "grad_norm": 0.1748046875, + "learning_rate": 3.701671117141786e-05, + "loss": 1.1344, + "step": 13605 + }, + { + "epoch": 0.75, + "grad_norm": 0.177734375, + "learning_rate": 3.6942484855630674e-05, + "loss": 1.2202, + "step": 13610 + }, + { + "epoch": 0.75, + "grad_norm": 0.1826171875, + "learning_rate": 3.6868316169838615e-05, + "loss": 1.1184, + "step": 13615 + }, + { + "epoch": 0.75, + "grad_norm": 0.173828125, + "learning_rate": 3.6794205181826404e-05, + "loss": 1.0868, + "step": 13620 + }, + { + "epoch": 0.75, + "grad_norm": 0.171875, + "learning_rate": 3.672015195932618e-05, + "loss": 1.1645, + "step": 13625 + }, + { + "epoch": 0.75, + "grad_norm": 0.1748046875, + "learning_rate": 3.664615657001711e-05, + "loss": 1.1464, + "step": 13630 + }, + { + "epoch": 0.75, + "grad_norm": 0.1708984375, + "learning_rate": 3.657221908152576e-05, + "loss": 1.2247, + "step": 13635 + }, + { + "epoch": 0.75, + "grad_norm": 0.1728515625, + "learning_rate": 3.649833956142552e-05, + "loss": 1.1753, + "step": 13640 + }, + { + "epoch": 0.75, + "grad_norm": 0.177734375, + "learning_rate": 3.642451807723699e-05, + "loss": 1.1876, + "step": 13645 + }, + { + "epoch": 0.75, + "grad_norm": 0.173828125, + "learning_rate": 3.635075469642753e-05, + "loss": 1.1502, + "step": 13650 + }, + { + "epoch": 0.75, + "grad_norm": 0.169921875, + "learning_rate": 3.6277049486411685e-05, + "loss": 1.1478, + "step": 13655 + }, + { + "epoch": 0.75, + "grad_norm": 0.169921875, + "learning_rate": 3.6203402514550545e-05, + "loss": 1.1439, + "step": 13660 + }, + { + "epoch": 0.75, + "grad_norm": 0.1787109375, + "learning_rate": 3.6129813848152186e-05, + "loss": 1.1978, + "step": 13665 + }, + { + "epoch": 0.75, + "grad_norm": 0.1748046875, + "learning_rate": 3.6056283554471185e-05, + "loss": 1.1789, + "step": 13670 + }, + { + "epoch": 0.75, + "grad_norm": 0.166015625, + "learning_rate": 3.598281170070905e-05, + "loss": 1.1936, + "step": 13675 + }, + { + "epoch": 0.75, + "grad_norm": 0.1728515625, + "learning_rate": 3.590939835401363e-05, + "loss": 1.0905, + "step": 13680 + }, + { + "epoch": 0.75, + "grad_norm": 0.177734375, + "learning_rate": 3.583604358147943e-05, + "loss": 1.1509, + "step": 13685 + }, + { + "epoch": 0.75, + "grad_norm": 0.1796875, + "learning_rate": 3.5762747450147384e-05, + "loss": 1.1356, + "step": 13690 + }, + { + "epoch": 0.75, + "grad_norm": 0.1748046875, + "learning_rate": 3.5689510027004866e-05, + "loss": 1.2413, + "step": 13695 + }, + { + "epoch": 0.75, + "grad_norm": 0.169921875, + "learning_rate": 3.5616331378985524e-05, + "loss": 1.201, + "step": 13700 + }, + { + "epoch": 0.75, + "grad_norm": 0.1796875, + "learning_rate": 3.554321157296936e-05, + "loss": 1.0684, + "step": 13705 + }, + { + "epoch": 0.75, + "grad_norm": 0.1640625, + "learning_rate": 3.547015067578255e-05, + "loss": 1.0633, + "step": 13710 + }, + { + "epoch": 0.75, + "grad_norm": 0.17578125, + "learning_rate": 3.53971487541975e-05, + "loss": 1.1793, + "step": 13715 + }, + { + "epoch": 0.75, + "grad_norm": 0.1806640625, + "learning_rate": 3.532420587493266e-05, + "loss": 1.178, + "step": 13720 + }, + { + "epoch": 0.75, + "grad_norm": 0.1611328125, + "learning_rate": 3.5251322104652485e-05, + "loss": 1.1171, + "step": 13725 + }, + { + "epoch": 0.75, + "grad_norm": 0.1806640625, + "learning_rate": 3.517849750996751e-05, + "loss": 1.2193, + "step": 13730 + }, + { + "epoch": 0.75, + "grad_norm": 0.169921875, + "learning_rate": 3.5105732157434056e-05, + "loss": 1.126, + "step": 13735 + }, + { + "epoch": 0.75, + "grad_norm": 0.1708984375, + "learning_rate": 3.503302611355449e-05, + "loss": 1.1239, + "step": 13740 + }, + { + "epoch": 0.75, + "grad_norm": 0.1767578125, + "learning_rate": 3.496037944477676e-05, + "loss": 1.0908, + "step": 13745 + }, + { + "epoch": 0.75, + "grad_norm": 0.1767578125, + "learning_rate": 3.4887792217494743e-05, + "loss": 1.2219, + "step": 13750 + }, + { + "epoch": 0.75, + "grad_norm": 0.17578125, + "learning_rate": 3.481526449804777e-05, + "loss": 1.1626, + "step": 13755 + }, + { + "epoch": 0.75, + "grad_norm": 0.173828125, + "learning_rate": 3.4742796352721065e-05, + "loss": 1.1822, + "step": 13760 + }, + { + "epoch": 0.75, + "grad_norm": 0.1708984375, + "learning_rate": 3.467038784774516e-05, + "loss": 1.1902, + "step": 13765 + }, + { + "epoch": 0.75, + "grad_norm": 0.169921875, + "learning_rate": 3.45980390492962e-05, + "loss": 1.1101, + "step": 13770 + }, + { + "epoch": 0.75, + "grad_norm": 0.17578125, + "learning_rate": 3.452575002349574e-05, + "loss": 1.137, + "step": 13775 + }, + { + "epoch": 0.75, + "grad_norm": 0.1708984375, + "learning_rate": 3.445352083641076e-05, + "loss": 1.1581, + "step": 13780 + }, + { + "epoch": 0.76, + "grad_norm": 0.177734375, + "learning_rate": 3.43813515540534e-05, + "loss": 1.1684, + "step": 13785 + }, + { + "epoch": 0.76, + "grad_norm": 0.1669921875, + "learning_rate": 3.430924224238119e-05, + "loss": 1.1619, + "step": 13790 + }, + { + "epoch": 0.76, + "grad_norm": 0.1728515625, + "learning_rate": 3.423719296729683e-05, + "loss": 1.1864, + "step": 13795 + }, + { + "epoch": 0.76, + "grad_norm": 0.1796875, + "learning_rate": 3.416520379464812e-05, + "loss": 1.1556, + "step": 13800 + }, + { + "epoch": 0.76, + "grad_norm": 0.1787109375, + "learning_rate": 3.409327479022796e-05, + "loss": 1.173, + "step": 13805 + }, + { + "epoch": 0.76, + "grad_norm": 0.1611328125, + "learning_rate": 3.40214060197742e-05, + "loss": 1.1378, + "step": 13810 + }, + { + "epoch": 0.76, + "grad_norm": 0.1689453125, + "learning_rate": 3.39495975489697e-05, + "loss": 1.1369, + "step": 13815 + }, + { + "epoch": 0.76, + "grad_norm": 0.1728515625, + "learning_rate": 3.387784944344221e-05, + "loss": 1.1497, + "step": 13820 + }, + { + "epoch": 0.76, + "grad_norm": 0.1826171875, + "learning_rate": 3.380616176876432e-05, + "loss": 1.214, + "step": 13825 + }, + { + "epoch": 0.76, + "grad_norm": 0.1708984375, + "learning_rate": 3.373453459045329e-05, + "loss": 1.1564, + "step": 13830 + }, + { + "epoch": 0.76, + "grad_norm": 0.1708984375, + "learning_rate": 3.366296797397125e-05, + "loss": 1.1412, + "step": 13835 + }, + { + "epoch": 0.76, + "grad_norm": 0.1640625, + "learning_rate": 3.359146198472478e-05, + "loss": 1.2285, + "step": 13840 + }, + { + "epoch": 0.76, + "grad_norm": 0.18359375, + "learning_rate": 3.35200166880653e-05, + "loss": 1.1703, + "step": 13845 + }, + { + "epoch": 0.76, + "grad_norm": 0.177734375, + "learning_rate": 3.344863214928855e-05, + "loss": 1.1571, + "step": 13850 + }, + { + "epoch": 0.76, + "grad_norm": 0.1708984375, + "learning_rate": 3.3377308433634856e-05, + "loss": 1.1294, + "step": 13855 + }, + { + "epoch": 0.76, + "grad_norm": 0.17578125, + "learning_rate": 3.330604560628885e-05, + "loss": 1.085, + "step": 13860 + }, + { + "epoch": 0.76, + "grad_norm": 0.1728515625, + "learning_rate": 3.3234843732379706e-05, + "loss": 1.2972, + "step": 13865 + }, + { + "epoch": 0.76, + "grad_norm": 0.181640625, + "learning_rate": 3.316370287698067e-05, + "loss": 1.1964, + "step": 13870 + }, + { + "epoch": 0.76, + "grad_norm": 0.177734375, + "learning_rate": 3.309262310510937e-05, + "loss": 1.1041, + "step": 13875 + }, + { + "epoch": 0.76, + "grad_norm": 0.1748046875, + "learning_rate": 3.302160448172755e-05, + "loss": 1.237, + "step": 13880 + }, + { + "epoch": 0.76, + "grad_norm": 0.1650390625, + "learning_rate": 3.295064707174108e-05, + "loss": 1.1725, + "step": 13885 + }, + { + "epoch": 0.76, + "grad_norm": 0.1787109375, + "learning_rate": 3.287975093999993e-05, + "loss": 1.1727, + "step": 13890 + }, + { + "epoch": 0.76, + "grad_norm": 0.1708984375, + "learning_rate": 3.2808916151297954e-05, + "loss": 1.1962, + "step": 13895 + }, + { + "epoch": 0.76, + "grad_norm": 0.171875, + "learning_rate": 3.273814277037303e-05, + "loss": 1.0735, + "step": 13900 + }, + { + "epoch": 0.76, + "grad_norm": 0.1689453125, + "learning_rate": 3.26674308619069e-05, + "loss": 1.2206, + "step": 13905 + }, + { + "epoch": 0.76, + "grad_norm": 0.177734375, + "learning_rate": 3.259678049052517e-05, + "loss": 1.1535, + "step": 13910 + }, + { + "epoch": 0.76, + "grad_norm": 0.1689453125, + "learning_rate": 3.252619172079706e-05, + "loss": 1.1734, + "step": 13915 + }, + { + "epoch": 0.76, + "grad_norm": 0.177734375, + "learning_rate": 3.245566461723565e-05, + "loss": 1.1578, + "step": 13920 + }, + { + "epoch": 0.76, + "grad_norm": 0.1728515625, + "learning_rate": 3.238519924429757e-05, + "loss": 1.119, + "step": 13925 + }, + { + "epoch": 0.76, + "grad_norm": 0.166015625, + "learning_rate": 3.231479566638311e-05, + "loss": 1.157, + "step": 13930 + }, + { + "epoch": 0.76, + "grad_norm": 0.1689453125, + "learning_rate": 3.2244453947835964e-05, + "loss": 1.176, + "step": 13935 + }, + { + "epoch": 0.76, + "grad_norm": 0.1806640625, + "learning_rate": 3.2174174152943416e-05, + "loss": 1.1568, + "step": 13940 + }, + { + "epoch": 0.76, + "grad_norm": 0.16796875, + "learning_rate": 3.210395634593602e-05, + "loss": 1.1072, + "step": 13945 + }, + { + "epoch": 0.76, + "grad_norm": 0.171875, + "learning_rate": 3.203380059098786e-05, + "loss": 1.1266, + "step": 13950 + }, + { + "epoch": 0.76, + "grad_norm": 0.17578125, + "learning_rate": 3.196370695221615e-05, + "loss": 1.1422, + "step": 13955 + }, + { + "epoch": 0.76, + "grad_norm": 0.1748046875, + "learning_rate": 3.18936754936814e-05, + "loss": 1.2068, + "step": 13960 + }, + { + "epoch": 0.76, + "grad_norm": 0.169921875, + "learning_rate": 3.182370627938728e-05, + "loss": 1.131, + "step": 13965 + }, + { + "epoch": 0.77, + "grad_norm": 0.171875, + "learning_rate": 3.175379937328059e-05, + "loss": 1.2384, + "step": 13970 + }, + { + "epoch": 0.77, + "grad_norm": 0.1796875, + "learning_rate": 3.168395483925119e-05, + "loss": 1.2203, + "step": 13975 + }, + { + "epoch": 0.77, + "grad_norm": 0.18359375, + "learning_rate": 3.161417274113188e-05, + "loss": 1.1862, + "step": 13980 + }, + { + "epoch": 0.77, + "grad_norm": 0.17578125, + "learning_rate": 3.154445314269843e-05, + "loss": 1.1876, + "step": 13985 + }, + { + "epoch": 0.77, + "grad_norm": 0.1787109375, + "learning_rate": 3.1474796107669524e-05, + "loss": 1.2134, + "step": 13990 + }, + { + "epoch": 0.77, + "grad_norm": 0.171875, + "learning_rate": 3.140520169970667e-05, + "loss": 1.1462, + "step": 13995 + }, + { + "epoch": 0.77, + "grad_norm": 0.171875, + "learning_rate": 3.133566998241403e-05, + "loss": 1.1342, + "step": 14000 + }, + { + "epoch": 0.77, + "grad_norm": 0.169921875, + "learning_rate": 3.1266201019338594e-05, + "loss": 1.1929, + "step": 14005 + }, + { + "epoch": 0.77, + "grad_norm": 0.173828125, + "learning_rate": 3.1196794873969946e-05, + "loss": 1.1846, + "step": 14010 + }, + { + "epoch": 0.77, + "grad_norm": 0.1728515625, + "learning_rate": 3.112745160974031e-05, + "loss": 1.2214, + "step": 14015 + }, + { + "epoch": 0.77, + "grad_norm": 0.1748046875, + "learning_rate": 3.1058171290024326e-05, + "loss": 1.2702, + "step": 14020 + }, + { + "epoch": 0.77, + "grad_norm": 0.1767578125, + "learning_rate": 3.098895397813927e-05, + "loss": 1.2535, + "step": 14025 + }, + { + "epoch": 0.77, + "grad_norm": 0.189453125, + "learning_rate": 3.091979973734461e-05, + "loss": 1.1729, + "step": 14030 + }, + { + "epoch": 0.77, + "grad_norm": 0.1669921875, + "learning_rate": 3.085070863084247e-05, + "loss": 1.1687, + "step": 14035 + }, + { + "epoch": 0.77, + "grad_norm": 0.169921875, + "learning_rate": 3.0781680721777e-05, + "loss": 1.1672, + "step": 14040 + }, + { + "epoch": 0.77, + "grad_norm": 0.1708984375, + "learning_rate": 3.071271607323472e-05, + "loss": 1.1815, + "step": 14045 + }, + { + "epoch": 0.77, + "grad_norm": 0.171875, + "learning_rate": 3.064381474824434e-05, + "loss": 1.2, + "step": 14050 + }, + { + "epoch": 0.77, + "grad_norm": 0.1669921875, + "learning_rate": 3.0574976809776676e-05, + "loss": 1.1494, + "step": 14055 + }, + { + "epoch": 0.77, + "grad_norm": 0.1689453125, + "learning_rate": 3.0506202320744582e-05, + "loss": 1.1557, + "step": 14060 + }, + { + "epoch": 0.77, + "grad_norm": 0.1708984375, + "learning_rate": 3.0437491344002967e-05, + "loss": 1.0796, + "step": 14065 + }, + { + "epoch": 0.77, + "grad_norm": 0.17578125, + "learning_rate": 3.0368843942348668e-05, + "loss": 1.2548, + "step": 14070 + }, + { + "epoch": 0.77, + "grad_norm": 0.173828125, + "learning_rate": 3.0300260178520456e-05, + "loss": 1.1587, + "step": 14075 + }, + { + "epoch": 0.77, + "grad_norm": 0.1708984375, + "learning_rate": 3.0231740115198927e-05, + "loss": 1.1738, + "step": 14080 + }, + { + "epoch": 0.77, + "grad_norm": 0.169921875, + "learning_rate": 3.0163283815006404e-05, + "loss": 1.1717, + "step": 14085 + }, + { + "epoch": 0.77, + "grad_norm": 0.181640625, + "learning_rate": 3.009489134050699e-05, + "loss": 1.1571, + "step": 14090 + }, + { + "epoch": 0.77, + "grad_norm": 0.173828125, + "learning_rate": 3.0026562754206465e-05, + "loss": 1.2267, + "step": 14095 + }, + { + "epoch": 0.77, + "grad_norm": 0.1650390625, + "learning_rate": 2.995829811855223e-05, + "loss": 1.183, + "step": 14100 + }, + { + "epoch": 0.77, + "grad_norm": 0.171875, + "learning_rate": 2.9890097495933134e-05, + "loss": 1.1724, + "step": 14105 + }, + { + "epoch": 0.77, + "grad_norm": 0.171875, + "learning_rate": 2.982196094867965e-05, + "loss": 1.1948, + "step": 14110 + }, + { + "epoch": 0.77, + "grad_norm": 0.1748046875, + "learning_rate": 2.9753888539063623e-05, + "loss": 1.212, + "step": 14115 + }, + { + "epoch": 0.77, + "grad_norm": 0.1787109375, + "learning_rate": 2.968588032929833e-05, + "loss": 1.1881, + "step": 14120 + }, + { + "epoch": 0.77, + "grad_norm": 0.171875, + "learning_rate": 2.961793638153829e-05, + "loss": 1.1855, + "step": 14125 + }, + { + "epoch": 0.77, + "grad_norm": 0.177734375, + "learning_rate": 2.9550056757879375e-05, + "loss": 1.1817, + "step": 14130 + }, + { + "epoch": 0.77, + "grad_norm": 0.1806640625, + "learning_rate": 2.9482241520358624e-05, + "loss": 1.1128, + "step": 14135 + }, + { + "epoch": 0.77, + "grad_norm": 0.171875, + "learning_rate": 2.9414490730954292e-05, + "loss": 1.2449, + "step": 14140 + }, + { + "epoch": 0.77, + "grad_norm": 0.1767578125, + "learning_rate": 2.9346804451585607e-05, + "loss": 1.1301, + "step": 14145 + }, + { + "epoch": 0.78, + "grad_norm": 0.1826171875, + "learning_rate": 2.9279182744112965e-05, + "loss": 1.1794, + "step": 14150 + }, + { + "epoch": 0.78, + "grad_norm": 0.177734375, + "learning_rate": 2.9211625670337695e-05, + "loss": 1.2097, + "step": 14155 + }, + { + "epoch": 0.78, + "grad_norm": 0.185546875, + "learning_rate": 2.9144133292002064e-05, + "loss": 1.2003, + "step": 14160 + }, + { + "epoch": 0.78, + "grad_norm": 0.1796875, + "learning_rate": 2.907670567078924e-05, + "loss": 1.1768, + "step": 14165 + }, + { + "epoch": 0.78, + "grad_norm": 0.1748046875, + "learning_rate": 2.900934286832312e-05, + "loss": 1.1687, + "step": 14170 + }, + { + "epoch": 0.78, + "grad_norm": 0.16796875, + "learning_rate": 2.8942044946168445e-05, + "loss": 1.1567, + "step": 14175 + }, + { + "epoch": 0.78, + "grad_norm": 0.173828125, + "learning_rate": 2.887481196583063e-05, + "loss": 1.1866, + "step": 14180 + }, + { + "epoch": 0.78, + "grad_norm": 0.169921875, + "learning_rate": 2.8807643988755782e-05, + "loss": 1.11, + "step": 14185 + }, + { + "epoch": 0.78, + "grad_norm": 0.177734375, + "learning_rate": 2.874054107633051e-05, + "loss": 1.1675, + "step": 14190 + }, + { + "epoch": 0.78, + "grad_norm": 0.1767578125, + "learning_rate": 2.8673503289882007e-05, + "loss": 1.1883, + "step": 14195 + }, + { + "epoch": 0.78, + "grad_norm": 0.1796875, + "learning_rate": 2.860653069067798e-05, + "loss": 1.1704, + "step": 14200 + }, + { + "epoch": 0.78, + "grad_norm": 0.1728515625, + "learning_rate": 2.8539623339926546e-05, + "loss": 1.1541, + "step": 14205 + }, + { + "epoch": 0.78, + "grad_norm": 0.1806640625, + "learning_rate": 2.8472781298776097e-05, + "loss": 1.1938, + "step": 14210 + }, + { + "epoch": 0.78, + "grad_norm": 0.16796875, + "learning_rate": 2.8406004628315464e-05, + "loss": 1.1602, + "step": 14215 + }, + { + "epoch": 0.78, + "grad_norm": 0.1748046875, + "learning_rate": 2.8339293389573664e-05, + "loss": 1.0625, + "step": 14220 + }, + { + "epoch": 0.78, + "grad_norm": 0.1865234375, + "learning_rate": 2.8272647643519956e-05, + "loss": 1.1587, + "step": 14225 + }, + { + "epoch": 0.78, + "grad_norm": 0.1728515625, + "learning_rate": 2.8206067451063666e-05, + "loss": 1.109, + "step": 14230 + }, + { + "epoch": 0.78, + "grad_norm": 0.1630859375, + "learning_rate": 2.8139552873054297e-05, + "loss": 1.1802, + "step": 14235 + }, + { + "epoch": 0.78, + "grad_norm": 0.1689453125, + "learning_rate": 2.8073103970281346e-05, + "loss": 1.1886, + "step": 14240 + }, + { + "epoch": 0.78, + "grad_norm": 0.166015625, + "learning_rate": 2.8006720803474273e-05, + "loss": 1.1782, + "step": 14245 + }, + { + "epoch": 0.78, + "grad_norm": 0.1708984375, + "learning_rate": 2.7940403433302532e-05, + "loss": 1.1051, + "step": 14250 + }, + { + "epoch": 0.78, + "grad_norm": 0.171875, + "learning_rate": 2.7874151920375325e-05, + "loss": 1.1943, + "step": 14255 + }, + { + "epoch": 0.78, + "grad_norm": 0.1767578125, + "learning_rate": 2.780796632524174e-05, + "loss": 1.1166, + "step": 14260 + }, + { + "epoch": 0.78, + "grad_norm": 0.166015625, + "learning_rate": 2.774184670839063e-05, + "loss": 1.1605, + "step": 14265 + }, + { + "epoch": 0.78, + "grad_norm": 0.1728515625, + "learning_rate": 2.7675793130250548e-05, + "loss": 1.2724, + "step": 14270 + }, + { + "epoch": 0.78, + "grad_norm": 0.1728515625, + "learning_rate": 2.7609805651189614e-05, + "loss": 1.1806, + "step": 14275 + }, + { + "epoch": 0.78, + "grad_norm": 0.1708984375, + "learning_rate": 2.7543884331515636e-05, + "loss": 1.1059, + "step": 14280 + }, + { + "epoch": 0.78, + "grad_norm": 0.169921875, + "learning_rate": 2.747802923147591e-05, + "loss": 1.1892, + "step": 14285 + }, + { + "epoch": 0.78, + "grad_norm": 0.1640625, + "learning_rate": 2.7412240411257262e-05, + "loss": 1.1662, + "step": 14290 + }, + { + "epoch": 0.78, + "grad_norm": 0.1708984375, + "learning_rate": 2.7346517930985838e-05, + "loss": 1.1538, + "step": 14295 + }, + { + "epoch": 0.78, + "grad_norm": 0.1708984375, + "learning_rate": 2.7280861850727236e-05, + "loss": 1.1386, + "step": 14300 + }, + { + "epoch": 0.78, + "grad_norm": 0.1787109375, + "learning_rate": 2.7215272230486356e-05, + "loss": 1.1457, + "step": 14305 + }, + { + "epoch": 0.78, + "grad_norm": 0.18359375, + "learning_rate": 2.714974913020739e-05, + "loss": 1.0917, + "step": 14310 + }, + { + "epoch": 0.78, + "grad_norm": 0.1806640625, + "learning_rate": 2.708429260977363e-05, + "loss": 1.1846, + "step": 14315 + }, + { + "epoch": 0.78, + "grad_norm": 0.169921875, + "learning_rate": 2.7018902729007632e-05, + "loss": 1.265, + "step": 14320 + }, + { + "epoch": 0.78, + "grad_norm": 0.1748046875, + "learning_rate": 2.6953579547671004e-05, + "loss": 1.1437, + "step": 14325 + }, + { + "epoch": 0.78, + "grad_norm": 0.17578125, + "learning_rate": 2.6888323125464378e-05, + "loss": 1.1078, + "step": 14330 + }, + { + "epoch": 0.79, + "grad_norm": 0.1689453125, + "learning_rate": 2.6823133522027423e-05, + "loss": 1.209, + "step": 14335 + }, + { + "epoch": 0.79, + "grad_norm": 0.1708984375, + "learning_rate": 2.675801079693867e-05, + "loss": 1.1419, + "step": 14340 + }, + { + "epoch": 0.79, + "grad_norm": 0.1708984375, + "learning_rate": 2.669295500971557e-05, + "loss": 1.1218, + "step": 14345 + }, + { + "epoch": 0.79, + "grad_norm": 0.1787109375, + "learning_rate": 2.6627966219814405e-05, + "loss": 1.0754, + "step": 14350 + }, + { + "epoch": 0.79, + "grad_norm": 0.1748046875, + "learning_rate": 2.656304448663025e-05, + "loss": 1.1922, + "step": 14355 + }, + { + "epoch": 0.79, + "grad_norm": 0.1806640625, + "learning_rate": 2.6498189869496803e-05, + "loss": 1.1682, + "step": 14360 + }, + { + "epoch": 0.79, + "grad_norm": 0.17578125, + "learning_rate": 2.6433402427686494e-05, + "loss": 1.1014, + "step": 14365 + }, + { + "epoch": 0.79, + "grad_norm": 0.173828125, + "learning_rate": 2.6368682220410358e-05, + "loss": 1.0514, + "step": 14370 + }, + { + "epoch": 0.79, + "grad_norm": 0.1767578125, + "learning_rate": 2.630402930681799e-05, + "loss": 1.2006, + "step": 14375 + }, + { + "epoch": 0.79, + "grad_norm": 0.1787109375, + "learning_rate": 2.6239443745997406e-05, + "loss": 1.1759, + "step": 14380 + }, + { + "epoch": 0.79, + "grad_norm": 0.1875, + "learning_rate": 2.6174925596975164e-05, + "loss": 1.1712, + "step": 14385 + }, + { + "epoch": 0.79, + "grad_norm": 0.17578125, + "learning_rate": 2.6110474918716176e-05, + "loss": 1.0893, + "step": 14390 + }, + { + "epoch": 0.79, + "grad_norm": 0.177734375, + "learning_rate": 2.6046091770123694e-05, + "loss": 1.1495, + "step": 14395 + }, + { + "epoch": 0.79, + "grad_norm": 0.166015625, + "learning_rate": 2.598177621003921e-05, + "loss": 1.2217, + "step": 14400 + }, + { + "epoch": 0.79, + "grad_norm": 0.1767578125, + "learning_rate": 2.591752829724249e-05, + "loss": 1.1603, + "step": 14405 + }, + { + "epoch": 0.79, + "grad_norm": 0.169921875, + "learning_rate": 2.585334809045148e-05, + "loss": 1.09, + "step": 14410 + }, + { + "epoch": 0.79, + "grad_norm": 0.1767578125, + "learning_rate": 2.5789235648322263e-05, + "loss": 1.14, + "step": 14415 + }, + { + "epoch": 0.79, + "grad_norm": 0.1650390625, + "learning_rate": 2.572519102944889e-05, + "loss": 1.2157, + "step": 14420 + }, + { + "epoch": 0.79, + "grad_norm": 0.173828125, + "learning_rate": 2.5661214292363545e-05, + "loss": 1.151, + "step": 14425 + }, + { + "epoch": 0.79, + "grad_norm": 0.1796875, + "learning_rate": 2.5597305495536317e-05, + "loss": 1.135, + "step": 14430 + }, + { + "epoch": 0.79, + "grad_norm": 0.1728515625, + "learning_rate": 2.55334646973752e-05, + "loss": 1.1895, + "step": 14435 + }, + { + "epoch": 0.79, + "grad_norm": 0.1630859375, + "learning_rate": 2.5469691956226095e-05, + "loss": 1.1144, + "step": 14440 + }, + { + "epoch": 0.79, + "grad_norm": 0.173828125, + "learning_rate": 2.5405987330372605e-05, + "loss": 1.1324, + "step": 14445 + }, + { + "epoch": 0.79, + "grad_norm": 0.171875, + "learning_rate": 2.534235087803616e-05, + "loss": 1.2282, + "step": 14450 + }, + { + "epoch": 0.79, + "grad_norm": 0.173828125, + "learning_rate": 2.5278782657375853e-05, + "loss": 1.1355, + "step": 14455 + }, + { + "epoch": 0.79, + "grad_norm": 0.1748046875, + "learning_rate": 2.5215282726488476e-05, + "loss": 1.1699, + "step": 14460 + }, + { + "epoch": 0.79, + "grad_norm": 0.1640625, + "learning_rate": 2.515185114340829e-05, + "loss": 1.2136, + "step": 14465 + }, + { + "epoch": 0.79, + "grad_norm": 0.1826171875, + "learning_rate": 2.50884879661072e-05, + "loss": 1.2011, + "step": 14470 + }, + { + "epoch": 0.79, + "grad_norm": 0.1787109375, + "learning_rate": 2.5025193252494538e-05, + "loss": 1.122, + "step": 14475 + }, + { + "epoch": 0.79, + "grad_norm": 0.171875, + "learning_rate": 2.4961967060417114e-05, + "loss": 1.2412, + "step": 14480 + }, + { + "epoch": 0.79, + "grad_norm": 0.17578125, + "learning_rate": 2.4898809447659034e-05, + "loss": 1.1561, + "step": 14485 + }, + { + "epoch": 0.79, + "grad_norm": 0.171875, + "learning_rate": 2.4835720471941803e-05, + "loss": 1.1655, + "step": 14490 + }, + { + "epoch": 0.79, + "grad_norm": 0.17578125, + "learning_rate": 2.4772700190924158e-05, + "loss": 1.1939, + "step": 14495 + }, + { + "epoch": 0.79, + "grad_norm": 0.1728515625, + "learning_rate": 2.4709748662202094e-05, + "loss": 1.1419, + "step": 14500 + }, + { + "epoch": 0.79, + "grad_norm": 0.1689453125, + "learning_rate": 2.4646865943308694e-05, + "loss": 1.1741, + "step": 14505 + }, + { + "epoch": 0.79, + "grad_norm": 0.1748046875, + "learning_rate": 2.4584052091714237e-05, + "loss": 1.1634, + "step": 14510 + }, + { + "epoch": 0.8, + "grad_norm": 0.177734375, + "learning_rate": 2.4521307164826002e-05, + "loss": 1.167, + "step": 14515 + }, + { + "epoch": 0.8, + "grad_norm": 0.17578125, + "learning_rate": 2.4458631219988325e-05, + "loss": 1.1621, + "step": 14520 + }, + { + "epoch": 0.8, + "grad_norm": 0.171875, + "learning_rate": 2.439602431448249e-05, + "loss": 1.1669, + "step": 14525 + }, + { + "epoch": 0.8, + "grad_norm": 0.1884765625, + "learning_rate": 2.4333486505526615e-05, + "loss": 1.1708, + "step": 14530 + }, + { + "epoch": 0.8, + "grad_norm": 0.1728515625, + "learning_rate": 2.4271017850275757e-05, + "loss": 1.1187, + "step": 14535 + }, + { + "epoch": 0.8, + "grad_norm": 0.1767578125, + "learning_rate": 2.4208618405821715e-05, + "loss": 1.1206, + "step": 14540 + }, + { + "epoch": 0.8, + "grad_norm": 0.173828125, + "learning_rate": 2.4146288229193102e-05, + "loss": 1.1602, + "step": 14545 + }, + { + "epoch": 0.8, + "grad_norm": 0.1728515625, + "learning_rate": 2.408402737735511e-05, + "loss": 1.2055, + "step": 14550 + }, + { + "epoch": 0.8, + "grad_norm": 0.166015625, + "learning_rate": 2.4021835907209688e-05, + "loss": 1.1385, + "step": 14555 + }, + { + "epoch": 0.8, + "grad_norm": 0.169921875, + "learning_rate": 2.39597138755953e-05, + "loss": 1.1764, + "step": 14560 + }, + { + "epoch": 0.8, + "grad_norm": 0.177734375, + "learning_rate": 2.3897661339287036e-05, + "loss": 1.1861, + "step": 14565 + }, + { + "epoch": 0.8, + "grad_norm": 0.17578125, + "learning_rate": 2.383567835499634e-05, + "loss": 1.1928, + "step": 14570 + }, + { + "epoch": 0.8, + "grad_norm": 0.1806640625, + "learning_rate": 2.3773764979371194e-05, + "loss": 1.194, + "step": 14575 + }, + { + "epoch": 0.8, + "grad_norm": 0.1767578125, + "learning_rate": 2.371192126899593e-05, + "loss": 1.1139, + "step": 14580 + }, + { + "epoch": 0.8, + "grad_norm": 0.1708984375, + "learning_rate": 2.3650147280391244e-05, + "loss": 1.1799, + "step": 14585 + }, + { + "epoch": 0.8, + "grad_norm": 0.177734375, + "learning_rate": 2.3588443070014023e-05, + "loss": 1.1999, + "step": 14590 + }, + { + "epoch": 0.8, + "grad_norm": 0.173828125, + "learning_rate": 2.3526808694257475e-05, + "loss": 1.1243, + "step": 14595 + }, + { + "epoch": 0.8, + "grad_norm": 0.169921875, + "learning_rate": 2.3465244209450943e-05, + "loss": 1.181, + "step": 14600 + }, + { + "epoch": 0.8, + "grad_norm": 0.1728515625, + "learning_rate": 2.3403749671859908e-05, + "loss": 1.143, + "step": 14605 + }, + { + "epoch": 0.8, + "grad_norm": 0.1796875, + "learning_rate": 2.3342325137685938e-05, + "loss": 1.1431, + "step": 14610 + }, + { + "epoch": 0.8, + "grad_norm": 0.1806640625, + "learning_rate": 2.3280970663066536e-05, + "loss": 1.1875, + "step": 14615 + }, + { + "epoch": 0.8, + "grad_norm": 0.16796875, + "learning_rate": 2.3219686304075293e-05, + "loss": 1.0855, + "step": 14620 + }, + { + "epoch": 0.8, + "grad_norm": 0.171875, + "learning_rate": 2.3158472116721654e-05, + "loss": 1.1312, + "step": 14625 + }, + { + "epoch": 0.8, + "grad_norm": 0.173828125, + "learning_rate": 2.309732815695096e-05, + "loss": 1.1876, + "step": 14630 + }, + { + "epoch": 0.8, + "grad_norm": 0.1767578125, + "learning_rate": 2.3036254480644325e-05, + "loss": 1.0694, + "step": 14635 + }, + { + "epoch": 0.8, + "grad_norm": 0.1767578125, + "learning_rate": 2.297525114361868e-05, + "loss": 1.1991, + "step": 14640 + }, + { + "epoch": 0.8, + "grad_norm": 0.173828125, + "learning_rate": 2.291431820162665e-05, + "loss": 1.1271, + "step": 14645 + }, + { + "epoch": 0.8, + "grad_norm": 0.189453125, + "learning_rate": 2.2853455710356542e-05, + "loss": 1.2456, + "step": 14650 + }, + { + "epoch": 0.8, + "grad_norm": 0.1669921875, + "learning_rate": 2.2792663725432218e-05, + "loss": 1.2064, + "step": 14655 + }, + { + "epoch": 0.8, + "grad_norm": 0.173828125, + "learning_rate": 2.2731942302413158e-05, + "loss": 1.1134, + "step": 14660 + }, + { + "epoch": 0.8, + "grad_norm": 0.17578125, + "learning_rate": 2.2671291496794344e-05, + "loss": 1.145, + "step": 14665 + }, + { + "epoch": 0.8, + "grad_norm": 0.1689453125, + "learning_rate": 2.2610711364006243e-05, + "loss": 1.1989, + "step": 14670 + }, + { + "epoch": 0.8, + "grad_norm": 0.1708984375, + "learning_rate": 2.2550201959414642e-05, + "loss": 1.1864, + "step": 14675 + }, + { + "epoch": 0.8, + "grad_norm": 0.16796875, + "learning_rate": 2.248976333832078e-05, + "loss": 1.1518, + "step": 14680 + }, + { + "epoch": 0.8, + "grad_norm": 0.17578125, + "learning_rate": 2.2429395555961163e-05, + "loss": 1.1685, + "step": 14685 + }, + { + "epoch": 0.8, + "grad_norm": 0.1748046875, + "learning_rate": 2.236909866750758e-05, + "loss": 1.2221, + "step": 14690 + }, + { + "epoch": 0.8, + "grad_norm": 0.1748046875, + "learning_rate": 2.230887272806703e-05, + "loss": 1.1326, + "step": 14695 + }, + { + "epoch": 0.81, + "grad_norm": 0.1767578125, + "learning_rate": 2.2248717792681607e-05, + "loss": 1.1789, + "step": 14700 + }, + { + "epoch": 0.81, + "grad_norm": 0.177734375, + "learning_rate": 2.218863391632857e-05, + "loss": 1.1708, + "step": 14705 + }, + { + "epoch": 0.81, + "grad_norm": 0.1767578125, + "learning_rate": 2.212862115392025e-05, + "loss": 1.1773, + "step": 14710 + }, + { + "epoch": 0.81, + "grad_norm": 0.1767578125, + "learning_rate": 2.2068679560303963e-05, + "loss": 1.1225, + "step": 14715 + }, + { + "epoch": 0.81, + "grad_norm": 0.173828125, + "learning_rate": 2.2008809190261916e-05, + "loss": 1.2139, + "step": 14720 + }, + { + "epoch": 0.81, + "grad_norm": 0.1767578125, + "learning_rate": 2.194901009851137e-05, + "loss": 1.1995, + "step": 14725 + }, + { + "epoch": 0.81, + "grad_norm": 0.166015625, + "learning_rate": 2.1889282339704286e-05, + "loss": 1.1509, + "step": 14730 + }, + { + "epoch": 0.81, + "grad_norm": 0.1748046875, + "learning_rate": 2.1829625968427568e-05, + "loss": 1.0717, + "step": 14735 + }, + { + "epoch": 0.81, + "grad_norm": 0.1708984375, + "learning_rate": 2.177004103920275e-05, + "loss": 1.1712, + "step": 14740 + }, + { + "epoch": 0.81, + "grad_norm": 0.171875, + "learning_rate": 2.1710527606486153e-05, + "loss": 1.1454, + "step": 14745 + }, + { + "epoch": 0.81, + "grad_norm": 0.1708984375, + "learning_rate": 2.1651085724668764e-05, + "loss": 1.1789, + "step": 14750 + }, + { + "epoch": 0.81, + "grad_norm": 0.166015625, + "learning_rate": 2.1591715448076143e-05, + "loss": 1.1542, + "step": 14755 + }, + { + "epoch": 0.81, + "grad_norm": 0.1767578125, + "learning_rate": 2.1532416830968393e-05, + "loss": 1.2123, + "step": 14760 + }, + { + "epoch": 0.81, + "grad_norm": 0.173828125, + "learning_rate": 2.147318992754016e-05, + "loss": 1.108, + "step": 14765 + }, + { + "epoch": 0.81, + "grad_norm": 0.173828125, + "learning_rate": 2.1414034791920547e-05, + "loss": 1.1693, + "step": 14770 + }, + { + "epoch": 0.81, + "grad_norm": 0.169921875, + "learning_rate": 2.1354951478173068e-05, + "loss": 1.1721, + "step": 14775 + }, + { + "epoch": 0.81, + "grad_norm": 0.1708984375, + "learning_rate": 2.129594004029555e-05, + "loss": 1.1436, + "step": 14780 + }, + { + "epoch": 0.81, + "grad_norm": 0.1669921875, + "learning_rate": 2.1237000532220186e-05, + "loss": 1.0732, + "step": 14785 + }, + { + "epoch": 0.81, + "grad_norm": 0.17578125, + "learning_rate": 2.1178133007813418e-05, + "loss": 1.1218, + "step": 14790 + }, + { + "epoch": 0.81, + "grad_norm": 0.17578125, + "learning_rate": 2.111933752087588e-05, + "loss": 1.174, + "step": 14795 + }, + { + "epoch": 0.81, + "grad_norm": 0.17578125, + "learning_rate": 2.106061412514242e-05, + "loss": 1.2131, + "step": 14800 + }, + { + "epoch": 0.81, + "grad_norm": 0.1728515625, + "learning_rate": 2.1001962874281878e-05, + "loss": 1.1411, + "step": 14805 + }, + { + "epoch": 0.81, + "grad_norm": 0.173828125, + "learning_rate": 2.0943383821897357e-05, + "loss": 1.0683, + "step": 14810 + }, + { + "epoch": 0.81, + "grad_norm": 0.177734375, + "learning_rate": 2.088487702152577e-05, + "loss": 1.1771, + "step": 14815 + }, + { + "epoch": 0.81, + "grad_norm": 0.1787109375, + "learning_rate": 2.0826442526638147e-05, + "loss": 1.1603, + "step": 14820 + }, + { + "epoch": 0.81, + "grad_norm": 0.1767578125, + "learning_rate": 2.07680803906393e-05, + "loss": 1.1607, + "step": 14825 + }, + { + "epoch": 0.81, + "grad_norm": 0.169921875, + "learning_rate": 2.07097906668681e-05, + "loss": 1.076, + "step": 14830 + }, + { + "epoch": 0.81, + "grad_norm": 0.181640625, + "learning_rate": 2.0651573408597035e-05, + "loss": 1.1833, + "step": 14835 + }, + { + "epoch": 0.81, + "grad_norm": 0.17578125, + "learning_rate": 2.0593428669032522e-05, + "loss": 1.2107, + "step": 14840 + }, + { + "epoch": 0.81, + "grad_norm": 0.17578125, + "learning_rate": 2.0535356501314572e-05, + "loss": 1.1851, + "step": 14845 + }, + { + "epoch": 0.81, + "grad_norm": 0.177734375, + "learning_rate": 2.047735695851697e-05, + "loss": 1.1028, + "step": 14850 + }, + { + "epoch": 0.81, + "grad_norm": 0.177734375, + "learning_rate": 2.0419430093647073e-05, + "loss": 1.1625, + "step": 14855 + }, + { + "epoch": 0.81, + "grad_norm": 0.1669921875, + "learning_rate": 2.0361575959645886e-05, + "loss": 1.144, + "step": 14860 + }, + { + "epoch": 0.81, + "grad_norm": 0.1767578125, + "learning_rate": 2.0303794609387815e-05, + "loss": 1.148, + "step": 14865 + }, + { + "epoch": 0.81, + "grad_norm": 0.1748046875, + "learning_rate": 2.024608609568086e-05, + "loss": 1.1555, + "step": 14870 + }, + { + "epoch": 0.81, + "grad_norm": 0.17578125, + "learning_rate": 2.018845047126642e-05, + "loss": 1.1315, + "step": 14875 + }, + { + "epoch": 0.82, + "grad_norm": 0.1591796875, + "learning_rate": 2.0130887788819275e-05, + "loss": 1.1403, + "step": 14880 + }, + { + "epoch": 0.82, + "grad_norm": 0.169921875, + "learning_rate": 2.007339810094756e-05, + "loss": 1.1378, + "step": 14885 + }, + { + "epoch": 0.82, + "grad_norm": 0.1708984375, + "learning_rate": 2.001598146019261e-05, + "loss": 1.1418, + "step": 14890 + }, + { + "epoch": 0.82, + "grad_norm": 0.173828125, + "learning_rate": 1.9958637919029166e-05, + "loss": 1.2768, + "step": 14895 + }, + { + "epoch": 0.82, + "grad_norm": 0.1787109375, + "learning_rate": 1.990136752986499e-05, + "loss": 1.1747, + "step": 14900 + }, + { + "epoch": 0.82, + "grad_norm": 0.166015625, + "learning_rate": 1.9844170345041124e-05, + "loss": 1.2055, + "step": 14905 + }, + { + "epoch": 0.82, + "grad_norm": 0.16796875, + "learning_rate": 1.9787046416831555e-05, + "loss": 1.135, + "step": 14910 + }, + { + "epoch": 0.82, + "grad_norm": 0.171875, + "learning_rate": 1.9729995797443524e-05, + "loss": 1.2799, + "step": 14915 + }, + { + "epoch": 0.82, + "grad_norm": 0.1748046875, + "learning_rate": 1.9673018539017075e-05, + "loss": 1.1108, + "step": 14920 + }, + { + "epoch": 0.82, + "grad_norm": 0.1728515625, + "learning_rate": 1.9616114693625355e-05, + "loss": 1.1695, + "step": 14925 + }, + { + "epoch": 0.82, + "grad_norm": 0.1728515625, + "learning_rate": 1.9559284313274252e-05, + "loss": 1.1855, + "step": 14930 + }, + { + "epoch": 0.82, + "grad_norm": 0.1728515625, + "learning_rate": 1.9502527449902718e-05, + "loss": 1.1635, + "step": 14935 + }, + { + "epoch": 0.82, + "grad_norm": 0.1748046875, + "learning_rate": 1.9445844155382353e-05, + "loss": 1.1343, + "step": 14940 + }, + { + "epoch": 0.82, + "grad_norm": 0.173828125, + "learning_rate": 1.9389234481517604e-05, + "loss": 1.098, + "step": 14945 + }, + { + "epoch": 0.82, + "grad_norm": 0.173828125, + "learning_rate": 1.9332698480045575e-05, + "loss": 1.186, + "step": 14950 + }, + { + "epoch": 0.82, + "grad_norm": 0.1748046875, + "learning_rate": 1.9276236202636077e-05, + "loss": 1.1687, + "step": 14955 + }, + { + "epoch": 0.82, + "grad_norm": 0.17578125, + "learning_rate": 1.9219847700891558e-05, + "loss": 1.1899, + "step": 14960 + }, + { + "epoch": 0.82, + "grad_norm": 0.177734375, + "learning_rate": 1.9163533026347e-05, + "loss": 1.2169, + "step": 14965 + }, + { + "epoch": 0.82, + "grad_norm": 0.1748046875, + "learning_rate": 1.9107292230469975e-05, + "loss": 1.2982, + "step": 14970 + }, + { + "epoch": 0.82, + "grad_norm": 0.17578125, + "learning_rate": 1.9051125364660416e-05, + "loss": 1.1029, + "step": 14975 + }, + { + "epoch": 0.82, + "grad_norm": 0.1708984375, + "learning_rate": 1.8995032480250806e-05, + "loss": 1.1488, + "step": 14980 + }, + { + "epoch": 0.82, + "grad_norm": 0.1748046875, + "learning_rate": 1.8939013628505963e-05, + "loss": 1.1138, + "step": 14985 + }, + { + "epoch": 0.82, + "grad_norm": 0.177734375, + "learning_rate": 1.8883068860623098e-05, + "loss": 1.1779, + "step": 14990 + }, + { + "epoch": 0.82, + "grad_norm": 0.169921875, + "learning_rate": 1.882719822773158e-05, + "loss": 1.2502, + "step": 14995 + }, + { + "epoch": 0.82, + "grad_norm": 0.171875, + "learning_rate": 1.8771401780893217e-05, + "loss": 1.1273, + "step": 15000 + }, + { + "epoch": 0.82, + "grad_norm": 0.1748046875, + "learning_rate": 1.871567957110183e-05, + "loss": 1.1555, + "step": 15005 + }, + { + "epoch": 0.82, + "grad_norm": 0.1767578125, + "learning_rate": 1.866003164928355e-05, + "loss": 1.1827, + "step": 15010 + }, + { + "epoch": 0.82, + "grad_norm": 0.171875, + "learning_rate": 1.860445806629645e-05, + "loss": 1.1184, + "step": 15015 + }, + { + "epoch": 0.82, + "grad_norm": 0.1689453125, + "learning_rate": 1.8548958872930868e-05, + "loss": 1.1143, + "step": 15020 + }, + { + "epoch": 0.82, + "grad_norm": 0.1767578125, + "learning_rate": 1.849353411990894e-05, + "loss": 1.1244, + "step": 15025 + }, + { + "epoch": 0.82, + "grad_norm": 0.1884765625, + "learning_rate": 1.843818385788495e-05, + "loss": 1.1501, + "step": 15030 + }, + { + "epoch": 0.82, + "grad_norm": 0.181640625, + "learning_rate": 1.8382908137444955e-05, + "loss": 1.1973, + "step": 15035 + }, + { + "epoch": 0.82, + "grad_norm": 0.1748046875, + "learning_rate": 1.8327707009106976e-05, + "loss": 1.1341, + "step": 15040 + }, + { + "epoch": 0.82, + "grad_norm": 0.16796875, + "learning_rate": 1.827258052332085e-05, + "loss": 1.1807, + "step": 15045 + }, + { + "epoch": 0.82, + "grad_norm": 0.169921875, + "learning_rate": 1.8217528730468193e-05, + "loss": 1.1366, + "step": 15050 + }, + { + "epoch": 0.82, + "grad_norm": 0.1728515625, + "learning_rate": 1.816255168086236e-05, + "loss": 1.1123, + "step": 15055 + }, + { + "epoch": 0.82, + "grad_norm": 0.1748046875, + "learning_rate": 1.810764942474834e-05, + "loss": 1.1146, + "step": 15060 + }, + { + "epoch": 0.83, + "grad_norm": 0.1708984375, + "learning_rate": 1.805282201230285e-05, + "loss": 1.2433, + "step": 15065 + }, + { + "epoch": 0.83, + "grad_norm": 0.1669921875, + "learning_rate": 1.799806949363415e-05, + "loss": 1.1296, + "step": 15070 + }, + { + "epoch": 0.83, + "grad_norm": 0.166015625, + "learning_rate": 1.7943391918782117e-05, + "loss": 1.1524, + "step": 15075 + }, + { + "epoch": 0.83, + "grad_norm": 0.17578125, + "learning_rate": 1.7888789337717993e-05, + "loss": 1.2408, + "step": 15080 + }, + { + "epoch": 0.83, + "grad_norm": 0.173828125, + "learning_rate": 1.7834261800344687e-05, + "loss": 1.0967, + "step": 15085 + }, + { + "epoch": 0.83, + "grad_norm": 0.1796875, + "learning_rate": 1.7779809356496337e-05, + "loss": 1.1405, + "step": 15090 + }, + { + "epoch": 0.83, + "grad_norm": 0.1689453125, + "learning_rate": 1.7725432055938573e-05, + "loss": 1.1318, + "step": 15095 + }, + { + "epoch": 0.83, + "grad_norm": 0.171875, + "learning_rate": 1.7671129948368236e-05, + "loss": 1.2091, + "step": 15100 + }, + { + "epoch": 0.83, + "grad_norm": 0.1865234375, + "learning_rate": 1.7616903083413615e-05, + "loss": 1.2158, + "step": 15105 + }, + { + "epoch": 0.83, + "grad_norm": 0.169921875, + "learning_rate": 1.7562751510634046e-05, + "loss": 1.138, + "step": 15110 + }, + { + "epoch": 0.83, + "grad_norm": 0.1650390625, + "learning_rate": 1.7508675279520194e-05, + "loss": 1.1527, + "step": 15115 + }, + { + "epoch": 0.83, + "grad_norm": 0.173828125, + "learning_rate": 1.7454674439493734e-05, + "loss": 1.2086, + "step": 15120 + }, + { + "epoch": 0.83, + "grad_norm": 0.1689453125, + "learning_rate": 1.7400749039907617e-05, + "loss": 1.1983, + "step": 15125 + }, + { + "epoch": 0.83, + "grad_norm": 0.1728515625, + "learning_rate": 1.734689913004568e-05, + "loss": 1.1715, + "step": 15130 + }, + { + "epoch": 0.83, + "grad_norm": 0.1728515625, + "learning_rate": 1.7293124759122882e-05, + "loss": 1.1548, + "step": 15135 + }, + { + "epoch": 0.83, + "grad_norm": 0.1748046875, + "learning_rate": 1.723942597628504e-05, + "loss": 1.1605, + "step": 15140 + }, + { + "epoch": 0.83, + "grad_norm": 0.1806640625, + "learning_rate": 1.7185802830608978e-05, + "loss": 1.1958, + "step": 15145 + }, + { + "epoch": 0.83, + "grad_norm": 0.1708984375, + "learning_rate": 1.7132255371102358e-05, + "loss": 1.1007, + "step": 15150 + }, + { + "epoch": 0.83, + "grad_norm": 0.1669921875, + "learning_rate": 1.707878364670368e-05, + "loss": 1.08, + "step": 15155 + }, + { + "epoch": 0.83, + "grad_norm": 0.1787109375, + "learning_rate": 1.7025387706282237e-05, + "loss": 1.1766, + "step": 15160 + }, + { + "epoch": 0.83, + "grad_norm": 0.1669921875, + "learning_rate": 1.6972067598637973e-05, + "loss": 1.1762, + "step": 15165 + }, + { + "epoch": 0.83, + "grad_norm": 0.177734375, + "learning_rate": 1.6918823372501713e-05, + "loss": 1.1726, + "step": 15170 + }, + { + "epoch": 0.83, + "grad_norm": 0.1748046875, + "learning_rate": 1.6865655076534726e-05, + "loss": 1.1205, + "step": 15175 + }, + { + "epoch": 0.83, + "grad_norm": 0.1796875, + "learning_rate": 1.681256275932903e-05, + "loss": 1.1836, + "step": 15180 + }, + { + "epoch": 0.83, + "grad_norm": 0.1748046875, + "learning_rate": 1.6759546469407082e-05, + "loss": 1.1649, + "step": 15185 + }, + { + "epoch": 0.83, + "grad_norm": 0.1689453125, + "learning_rate": 1.6706606255222035e-05, + "loss": 1.1409, + "step": 15190 + }, + { + "epoch": 0.83, + "grad_norm": 0.1767578125, + "learning_rate": 1.6653742165157304e-05, + "loss": 1.1462, + "step": 15195 + }, + { + "epoch": 0.83, + "grad_norm": 0.1640625, + "learning_rate": 1.6600954247526913e-05, + "loss": 1.1258, + "step": 15200 + }, + { + "epoch": 0.83, + "grad_norm": 0.16796875, + "learning_rate": 1.654824255057509e-05, + "loss": 1.0984, + "step": 15205 + }, + { + "epoch": 0.83, + "grad_norm": 0.17578125, + "learning_rate": 1.6495607122476618e-05, + "loss": 1.2316, + "step": 15210 + }, + { + "epoch": 0.83, + "grad_norm": 0.1640625, + "learning_rate": 1.644304801133638e-05, + "loss": 1.164, + "step": 15215 + }, + { + "epoch": 0.83, + "grad_norm": 0.18359375, + "learning_rate": 1.639056526518964e-05, + "loss": 1.146, + "step": 15220 + }, + { + "epoch": 0.83, + "grad_norm": 0.1806640625, + "learning_rate": 1.6338158932001747e-05, + "loss": 1.1302, + "step": 15225 + }, + { + "epoch": 0.83, + "grad_norm": 0.1708984375, + "learning_rate": 1.628582905966839e-05, + "loss": 1.1973, + "step": 15230 + }, + { + "epoch": 0.83, + "grad_norm": 0.1796875, + "learning_rate": 1.62335756960152e-05, + "loss": 1.1351, + "step": 15235 + }, + { + "epoch": 0.83, + "grad_norm": 0.173828125, + "learning_rate": 1.6181398888797982e-05, + "loss": 1.1337, + "step": 15240 + }, + { + "epoch": 0.84, + "grad_norm": 0.181640625, + "learning_rate": 1.6129298685702586e-05, + "loss": 1.0967, + "step": 15245 + }, + { + "epoch": 0.84, + "grad_norm": 0.1796875, + "learning_rate": 1.607727513434475e-05, + "loss": 1.1385, + "step": 15250 + }, + { + "epoch": 0.84, + "grad_norm": 0.1787109375, + "learning_rate": 1.6025328282270315e-05, + "loss": 1.1293, + "step": 15255 + }, + { + "epoch": 0.84, + "grad_norm": 0.17578125, + "learning_rate": 1.597345817695486e-05, + "loss": 1.2423, + "step": 15260 + }, + { + "epoch": 0.84, + "grad_norm": 0.16796875, + "learning_rate": 1.5921664865803955e-05, + "loss": 1.1544, + "step": 15265 + }, + { + "epoch": 0.84, + "grad_norm": 0.1748046875, + "learning_rate": 1.5869948396152844e-05, + "loss": 1.1054, + "step": 15270 + }, + { + "epoch": 0.84, + "grad_norm": 0.17578125, + "learning_rate": 1.5818308815266734e-05, + "loss": 1.1825, + "step": 15275 + }, + { + "epoch": 0.84, + "grad_norm": 0.18359375, + "learning_rate": 1.5766746170340398e-05, + "loss": 1.1645, + "step": 15280 + }, + { + "epoch": 0.84, + "grad_norm": 0.1748046875, + "learning_rate": 1.5715260508498387e-05, + "loss": 1.1658, + "step": 15285 + }, + { + "epoch": 0.84, + "grad_norm": 0.1806640625, + "learning_rate": 1.566385187679479e-05, + "loss": 1.1489, + "step": 15290 + }, + { + "epoch": 0.84, + "grad_norm": 0.166015625, + "learning_rate": 1.5612520322213466e-05, + "loss": 1.1312, + "step": 15295 + }, + { + "epoch": 0.84, + "grad_norm": 0.17578125, + "learning_rate": 1.5561265891667654e-05, + "loss": 1.1652, + "step": 15300 + }, + { + "epoch": 0.84, + "grad_norm": 0.1767578125, + "learning_rate": 1.551008863200024e-05, + "loss": 1.1594, + "step": 15305 + }, + { + "epoch": 0.84, + "grad_norm": 0.1728515625, + "learning_rate": 1.545898858998346e-05, + "loss": 1.1512, + "step": 15310 + }, + { + "epoch": 0.84, + "grad_norm": 0.1767578125, + "learning_rate": 1.5407965812319125e-05, + "loss": 1.1489, + "step": 15315 + }, + { + "epoch": 0.84, + "grad_norm": 0.173828125, + "learning_rate": 1.5357020345638306e-05, + "loss": 1.1412, + "step": 15320 + }, + { + "epoch": 0.84, + "grad_norm": 0.181640625, + "learning_rate": 1.530615223650147e-05, + "loss": 1.107, + "step": 15325 + }, + { + "epoch": 0.84, + "grad_norm": 0.1748046875, + "learning_rate": 1.5255361531398382e-05, + "loss": 1.2239, + "step": 15330 + }, + { + "epoch": 0.84, + "grad_norm": 0.1689453125, + "learning_rate": 1.5204648276748101e-05, + "loss": 1.1057, + "step": 15335 + }, + { + "epoch": 0.84, + "grad_norm": 0.1728515625, + "learning_rate": 1.5154012518898786e-05, + "loss": 1.1529, + "step": 15340 + }, + { + "epoch": 0.84, + "grad_norm": 0.173828125, + "learning_rate": 1.5103454304127896e-05, + "loss": 1.1616, + "step": 15345 + }, + { + "epoch": 0.84, + "grad_norm": 0.169921875, + "learning_rate": 1.505297367864199e-05, + "loss": 1.1154, + "step": 15350 + }, + { + "epoch": 0.84, + "grad_norm": 0.1787109375, + "learning_rate": 1.5002570688576622e-05, + "loss": 1.1565, + "step": 15355 + }, + { + "epoch": 0.84, + "grad_norm": 0.1669921875, + "learning_rate": 1.4952245379996565e-05, + "loss": 1.1727, + "step": 15360 + }, + { + "epoch": 0.84, + "grad_norm": 0.1689453125, + "learning_rate": 1.4901997798895439e-05, + "loss": 1.2091, + "step": 15365 + }, + { + "epoch": 0.84, + "grad_norm": 0.17578125, + "learning_rate": 1.4851827991195922e-05, + "loss": 1.1739, + "step": 15370 + }, + { + "epoch": 0.84, + "grad_norm": 0.1826171875, + "learning_rate": 1.480173600274951e-05, + "loss": 1.1919, + "step": 15375 + }, + { + "epoch": 0.84, + "grad_norm": 0.177734375, + "learning_rate": 1.4751721879336755e-05, + "loss": 1.1953, + "step": 15380 + }, + { + "epoch": 0.84, + "grad_norm": 0.1728515625, + "learning_rate": 1.4701785666666857e-05, + "loss": 1.1684, + "step": 15385 + }, + { + "epoch": 0.84, + "grad_norm": 0.17578125, + "learning_rate": 1.465192741037793e-05, + "loss": 1.173, + "step": 15390 + }, + { + "epoch": 0.84, + "grad_norm": 0.1708984375, + "learning_rate": 1.4602147156036761e-05, + "loss": 1.1794, + "step": 15395 + }, + { + "epoch": 0.84, + "grad_norm": 0.162109375, + "learning_rate": 1.4552444949138954e-05, + "loss": 1.0663, + "step": 15400 + }, + { + "epoch": 0.84, + "grad_norm": 0.1728515625, + "learning_rate": 1.4502820835108677e-05, + "loss": 1.2011, + "step": 15405 + }, + { + "epoch": 0.84, + "grad_norm": 0.1796875, + "learning_rate": 1.445327485929877e-05, + "loss": 1.2647, + "step": 15410 + }, + { + "epoch": 0.84, + "grad_norm": 0.177734375, + "learning_rate": 1.4403807066990694e-05, + "loss": 1.1713, + "step": 15415 + }, + { + "epoch": 0.84, + "grad_norm": 0.17578125, + "learning_rate": 1.4354417503394412e-05, + "loss": 1.1608, + "step": 15420 + }, + { + "epoch": 0.84, + "grad_norm": 0.171875, + "learning_rate": 1.4305106213648366e-05, + "loss": 1.1372, + "step": 15425 + }, + { + "epoch": 0.85, + "grad_norm": 0.173828125, + "learning_rate": 1.425587324281954e-05, + "loss": 1.1375, + "step": 15430 + }, + { + "epoch": 0.85, + "grad_norm": 0.17578125, + "learning_rate": 1.4206718635903304e-05, + "loss": 1.1521, + "step": 15435 + }, + { + "epoch": 0.85, + "grad_norm": 0.1767578125, + "learning_rate": 1.4157642437823337e-05, + "loss": 1.1572, + "step": 15440 + }, + { + "epoch": 0.85, + "grad_norm": 0.1708984375, + "learning_rate": 1.4108644693431816e-05, + "loss": 1.1404, + "step": 15445 + }, + { + "epoch": 0.85, + "grad_norm": 0.1826171875, + "learning_rate": 1.4059725447509053e-05, + "loss": 1.1601, + "step": 15450 + }, + { + "epoch": 0.85, + "grad_norm": 0.169921875, + "learning_rate": 1.4010884744763742e-05, + "loss": 1.131, + "step": 15455 + }, + { + "epoch": 0.85, + "grad_norm": 0.1689453125, + "learning_rate": 1.3962122629832674e-05, + "loss": 1.1325, + "step": 15460 + }, + { + "epoch": 0.85, + "grad_norm": 0.171875, + "learning_rate": 1.3913439147280982e-05, + "loss": 1.1895, + "step": 15465 + }, + { + "epoch": 0.85, + "grad_norm": 0.1708984375, + "learning_rate": 1.3864834341601762e-05, + "loss": 1.1374, + "step": 15470 + }, + { + "epoch": 0.85, + "grad_norm": 0.1748046875, + "learning_rate": 1.3816308257216325e-05, + "loss": 1.1611, + "step": 15475 + }, + { + "epoch": 0.85, + "grad_norm": 0.1806640625, + "learning_rate": 1.376786093847392e-05, + "loss": 1.1853, + "step": 15480 + }, + { + "epoch": 0.85, + "grad_norm": 0.173828125, + "learning_rate": 1.3719492429651992e-05, + "loss": 1.1183, + "step": 15485 + }, + { + "epoch": 0.85, + "grad_norm": 0.16796875, + "learning_rate": 1.367120277495576e-05, + "loss": 1.2144, + "step": 15490 + }, + { + "epoch": 0.85, + "grad_norm": 0.17578125, + "learning_rate": 1.3622992018518532e-05, + "loss": 1.2231, + "step": 15495 + }, + { + "epoch": 0.85, + "grad_norm": 0.1728515625, + "learning_rate": 1.3574860204401352e-05, + "loss": 1.1949, + "step": 15500 + }, + { + "epoch": 0.85, + "grad_norm": 0.1689453125, + "learning_rate": 1.352680737659331e-05, + "loss": 1.1698, + "step": 15505 + }, + { + "epoch": 0.85, + "grad_norm": 0.1708984375, + "learning_rate": 1.347883357901114e-05, + "loss": 1.1875, + "step": 15510 + }, + { + "epoch": 0.85, + "grad_norm": 0.169921875, + "learning_rate": 1.343093885549943e-05, + "loss": 1.2028, + "step": 15515 + }, + { + "epoch": 0.85, + "grad_norm": 0.1787109375, + "learning_rate": 1.3383123249830488e-05, + "loss": 1.2279, + "step": 15520 + }, + { + "epoch": 0.85, + "grad_norm": 0.1787109375, + "learning_rate": 1.3335386805704308e-05, + "loss": 1.1933, + "step": 15525 + }, + { + "epoch": 0.85, + "grad_norm": 0.181640625, + "learning_rate": 1.3287729566748553e-05, + "loss": 1.0972, + "step": 15530 + }, + { + "epoch": 0.85, + "grad_norm": 0.169921875, + "learning_rate": 1.3240151576518446e-05, + "loss": 1.2459, + "step": 15535 + }, + { + "epoch": 0.85, + "grad_norm": 0.1708984375, + "learning_rate": 1.3192652878496869e-05, + "loss": 1.1849, + "step": 15540 + }, + { + "epoch": 0.85, + "grad_norm": 0.1806640625, + "learning_rate": 1.3145233516094103e-05, + "loss": 1.1269, + "step": 15545 + }, + { + "epoch": 0.85, + "grad_norm": 0.171875, + "learning_rate": 1.3097893532648108e-05, + "loss": 1.153, + "step": 15550 + }, + { + "epoch": 0.85, + "grad_norm": 0.16796875, + "learning_rate": 1.3050632971424116e-05, + "loss": 1.2444, + "step": 15555 + }, + { + "epoch": 0.85, + "grad_norm": 0.181640625, + "learning_rate": 1.3003451875614913e-05, + "loss": 1.178, + "step": 15560 + }, + { + "epoch": 0.85, + "grad_norm": 0.173828125, + "learning_rate": 1.2956350288340524e-05, + "loss": 1.145, + "step": 15565 + }, + { + "epoch": 0.85, + "grad_norm": 0.1689453125, + "learning_rate": 1.2909328252648468e-05, + "loss": 1.132, + "step": 15570 + }, + { + "epoch": 0.85, + "grad_norm": 0.18359375, + "learning_rate": 1.2862385811513411e-05, + "loss": 1.2389, + "step": 15575 + }, + { + "epoch": 0.85, + "grad_norm": 0.169921875, + "learning_rate": 1.2815523007837393e-05, + "loss": 1.1711, + "step": 15580 + }, + { + "epoch": 0.85, + "grad_norm": 0.1669921875, + "learning_rate": 1.2768739884449532e-05, + "loss": 1.1258, + "step": 15585 + }, + { + "epoch": 0.85, + "grad_norm": 0.1826171875, + "learning_rate": 1.2722036484106314e-05, + "loss": 1.1635, + "step": 15590 + }, + { + "epoch": 0.85, + "grad_norm": 0.1767578125, + "learning_rate": 1.2675412849491198e-05, + "loss": 1.1924, + "step": 15595 + }, + { + "epoch": 0.85, + "grad_norm": 0.177734375, + "learning_rate": 1.2628869023214807e-05, + "loss": 1.1779, + "step": 15600 + }, + { + "epoch": 0.85, + "grad_norm": 0.1728515625, + "learning_rate": 1.2582405047814827e-05, + "loss": 1.1109, + "step": 15605 + }, + { + "epoch": 0.86, + "grad_norm": 0.16796875, + "learning_rate": 1.2536020965755968e-05, + "loss": 1.1628, + "step": 15610 + }, + { + "epoch": 0.86, + "grad_norm": 0.1640625, + "learning_rate": 1.2489716819429941e-05, + "loss": 1.1961, + "step": 15615 + }, + { + "epoch": 0.86, + "grad_norm": 0.17578125, + "learning_rate": 1.2443492651155332e-05, + "loss": 1.2238, + "step": 15620 + }, + { + "epoch": 0.86, + "grad_norm": 0.171875, + "learning_rate": 1.2397348503177686e-05, + "loss": 1.1817, + "step": 15625 + }, + { + "epoch": 0.86, + "grad_norm": 0.17578125, + "learning_rate": 1.2351284417669428e-05, + "loss": 1.2217, + "step": 15630 + }, + { + "epoch": 0.86, + "grad_norm": 0.1669921875, + "learning_rate": 1.2305300436729793e-05, + "loss": 1.1884, + "step": 15635 + }, + { + "epoch": 0.86, + "grad_norm": 0.1708984375, + "learning_rate": 1.2259396602384776e-05, + "loss": 1.1059, + "step": 15640 + }, + { + "epoch": 0.86, + "grad_norm": 0.173828125, + "learning_rate": 1.2213572956587182e-05, + "loss": 1.1857, + "step": 15645 + }, + { + "epoch": 0.86, + "grad_norm": 0.1708984375, + "learning_rate": 1.2167829541216459e-05, + "loss": 1.2348, + "step": 15650 + }, + { + "epoch": 0.86, + "grad_norm": 0.1767578125, + "learning_rate": 1.2122166398078827e-05, + "loss": 1.1035, + "step": 15655 + }, + { + "epoch": 0.86, + "grad_norm": 0.1748046875, + "learning_rate": 1.207658356890704e-05, + "loss": 1.2017, + "step": 15660 + }, + { + "epoch": 0.86, + "grad_norm": 0.166015625, + "learning_rate": 1.2031081095360542e-05, + "loss": 1.14, + "step": 15665 + }, + { + "epoch": 0.86, + "grad_norm": 0.173828125, + "learning_rate": 1.1985659019025208e-05, + "loss": 1.1444, + "step": 15670 + }, + { + "epoch": 0.86, + "grad_norm": 0.1728515625, + "learning_rate": 1.1940317381413635e-05, + "loss": 1.1573, + "step": 15675 + }, + { + "epoch": 0.86, + "grad_norm": 0.169921875, + "learning_rate": 1.1895056223964707e-05, + "loss": 1.1853, + "step": 15680 + }, + { + "epoch": 0.86, + "grad_norm": 0.1669921875, + "learning_rate": 1.1849875588043879e-05, + "loss": 1.1688, + "step": 15685 + }, + { + "epoch": 0.86, + "grad_norm": 0.177734375, + "learning_rate": 1.1804775514942946e-05, + "loss": 1.1349, + "step": 15690 + }, + { + "epoch": 0.86, + "grad_norm": 0.1728515625, + "learning_rate": 1.1759756045880155e-05, + "loss": 1.2029, + "step": 15695 + }, + { + "epoch": 0.86, + "grad_norm": 0.1787109375, + "learning_rate": 1.1714817221999975e-05, + "loss": 1.1874, + "step": 15700 + }, + { + "epoch": 0.86, + "grad_norm": 0.1728515625, + "learning_rate": 1.1669959084373261e-05, + "loss": 1.1484, + "step": 15705 + }, + { + "epoch": 0.86, + "grad_norm": 0.173828125, + "learning_rate": 1.1625181673997087e-05, + "loss": 1.1444, + "step": 15710 + }, + { + "epoch": 0.86, + "grad_norm": 0.1748046875, + "learning_rate": 1.1580485031794775e-05, + "loss": 1.2489, + "step": 15715 + }, + { + "epoch": 0.86, + "grad_norm": 0.1728515625, + "learning_rate": 1.1535869198615824e-05, + "loss": 1.2006, + "step": 15720 + }, + { + "epoch": 0.86, + "grad_norm": 0.1728515625, + "learning_rate": 1.1491334215235826e-05, + "loss": 1.1459, + "step": 15725 + }, + { + "epoch": 0.86, + "grad_norm": 0.1767578125, + "learning_rate": 1.1446880122356541e-05, + "loss": 1.1095, + "step": 15730 + }, + { + "epoch": 0.86, + "grad_norm": 0.1748046875, + "learning_rate": 1.1402506960605808e-05, + "loss": 1.2269, + "step": 15735 + }, + { + "epoch": 0.86, + "grad_norm": 0.1708984375, + "learning_rate": 1.1358214770537479e-05, + "loss": 1.1456, + "step": 15740 + }, + { + "epoch": 0.86, + "grad_norm": 0.1728515625, + "learning_rate": 1.1314003592631362e-05, + "loss": 1.1032, + "step": 15745 + }, + { + "epoch": 0.86, + "grad_norm": 0.1689453125, + "learning_rate": 1.1269873467293313e-05, + "loss": 1.1541, + "step": 15750 + }, + { + "epoch": 0.86, + "grad_norm": 0.171875, + "learning_rate": 1.1225824434855015e-05, + "loss": 1.1225, + "step": 15755 + }, + { + "epoch": 0.86, + "grad_norm": 0.171875, + "learning_rate": 1.1181856535574154e-05, + "loss": 1.1481, + "step": 15760 + }, + { + "epoch": 0.86, + "grad_norm": 0.1748046875, + "learning_rate": 1.1137969809634141e-05, + "loss": 1.1179, + "step": 15765 + }, + { + "epoch": 0.86, + "grad_norm": 0.17578125, + "learning_rate": 1.1094164297144294e-05, + "loss": 1.1666, + "step": 15770 + }, + { + "epoch": 0.86, + "grad_norm": 0.185546875, + "learning_rate": 1.1050440038139664e-05, + "loss": 1.2062, + "step": 15775 + }, + { + "epoch": 0.86, + "grad_norm": 0.1787109375, + "learning_rate": 1.1006797072581065e-05, + "loss": 1.1807, + "step": 15780 + }, + { + "epoch": 0.86, + "grad_norm": 0.169921875, + "learning_rate": 1.096323544035498e-05, + "loss": 1.2034, + "step": 15785 + }, + { + "epoch": 0.86, + "grad_norm": 0.16015625, + "learning_rate": 1.0919755181273584e-05, + "loss": 1.0936, + "step": 15790 + }, + { + "epoch": 0.87, + "grad_norm": 0.166015625, + "learning_rate": 1.0876356335074688e-05, + "loss": 1.2038, + "step": 15795 + }, + { + "epoch": 0.87, + "grad_norm": 0.1787109375, + "learning_rate": 1.0833038941421691e-05, + "loss": 1.1436, + "step": 15800 + }, + { + "epoch": 0.87, + "grad_norm": 0.171875, + "learning_rate": 1.078980303990358e-05, + "loss": 1.1527, + "step": 15805 + }, + { + "epoch": 0.87, + "grad_norm": 0.169921875, + "learning_rate": 1.0746648670034765e-05, + "loss": 1.1403, + "step": 15810 + }, + { + "epoch": 0.87, + "grad_norm": 0.177734375, + "learning_rate": 1.0703575871255245e-05, + "loss": 1.1163, + "step": 15815 + }, + { + "epoch": 0.87, + "grad_norm": 0.1728515625, + "learning_rate": 1.0660584682930441e-05, + "loss": 1.1601, + "step": 15820 + }, + { + "epoch": 0.87, + "grad_norm": 0.18359375, + "learning_rate": 1.061767514435119e-05, + "loss": 1.1993, + "step": 15825 + }, + { + "epoch": 0.87, + "grad_norm": 0.1796875, + "learning_rate": 1.057484729473367e-05, + "loss": 1.1943, + "step": 15830 + }, + { + "epoch": 0.87, + "grad_norm": 0.1708984375, + "learning_rate": 1.0532101173219433e-05, + "loss": 1.2041, + "step": 15835 + }, + { + "epoch": 0.87, + "grad_norm": 0.17578125, + "learning_rate": 1.0489436818875331e-05, + "loss": 1.1697, + "step": 15840 + }, + { + "epoch": 0.87, + "grad_norm": 0.197265625, + "learning_rate": 1.044685427069354e-05, + "loss": 1.1648, + "step": 15845 + }, + { + "epoch": 0.87, + "grad_norm": 0.166015625, + "learning_rate": 1.0404353567591351e-05, + "loss": 1.1325, + "step": 15850 + }, + { + "epoch": 0.87, + "grad_norm": 0.17578125, + "learning_rate": 1.0361934748411373e-05, + "loss": 1.2276, + "step": 15855 + }, + { + "epoch": 0.87, + "grad_norm": 0.173828125, + "learning_rate": 1.031959785192127e-05, + "loss": 1.1691, + "step": 15860 + }, + { + "epoch": 0.87, + "grad_norm": 0.1708984375, + "learning_rate": 1.027734291681397e-05, + "loss": 1.1603, + "step": 15865 + }, + { + "epoch": 0.87, + "grad_norm": 0.1767578125, + "learning_rate": 1.0235169981707371e-05, + "loss": 1.2138, + "step": 15870 + }, + { + "epoch": 0.87, + "grad_norm": 0.1845703125, + "learning_rate": 1.0193079085144485e-05, + "loss": 1.2319, + "step": 15875 + }, + { + "epoch": 0.87, + "grad_norm": 0.1875, + "learning_rate": 1.0151070265593343e-05, + "loss": 1.192, + "step": 15880 + }, + { + "epoch": 0.87, + "grad_norm": 0.1796875, + "learning_rate": 1.0109143561446955e-05, + "loss": 1.1216, + "step": 15885 + }, + { + "epoch": 0.87, + "grad_norm": 0.1845703125, + "learning_rate": 1.0067299011023312e-05, + "loss": 1.146, + "step": 15890 + }, + { + "epoch": 0.87, + "grad_norm": 0.177734375, + "learning_rate": 1.002553665256526e-05, + "loss": 1.1802, + "step": 15895 + }, + { + "epoch": 0.87, + "grad_norm": 0.169921875, + "learning_rate": 9.983856524240588e-06, + "loss": 1.1755, + "step": 15900 + }, + { + "epoch": 0.87, + "grad_norm": 0.173828125, + "learning_rate": 9.942258664141901e-06, + "loss": 1.1626, + "step": 15905 + }, + { + "epoch": 0.87, + "grad_norm": 0.1796875, + "learning_rate": 9.900743110286648e-06, + "loss": 1.177, + "step": 15910 + }, + { + "epoch": 0.87, + "grad_norm": 0.171875, + "learning_rate": 9.859309900616987e-06, + "loss": 1.1246, + "step": 15915 + }, + { + "epoch": 0.87, + "grad_norm": 0.177734375, + "learning_rate": 9.817959072999904e-06, + "loss": 1.2089, + "step": 15920 + }, + { + "epoch": 0.87, + "grad_norm": 0.1767578125, + "learning_rate": 9.77669066522705e-06, + "loss": 1.1896, + "step": 15925 + }, + { + "epoch": 0.87, + "grad_norm": 0.177734375, + "learning_rate": 9.735504715014764e-06, + "loss": 1.186, + "step": 15930 + }, + { + "epoch": 0.87, + "grad_norm": 0.1787109375, + "learning_rate": 9.694401260003993e-06, + "loss": 1.1843, + "step": 15935 + }, + { + "epoch": 0.87, + "grad_norm": 0.16796875, + "learning_rate": 9.653380337760354e-06, + "loss": 1.1994, + "step": 15940 + }, + { + "epoch": 0.87, + "grad_norm": 0.1748046875, + "learning_rate": 9.612441985773912e-06, + "loss": 1.2127, + "step": 15945 + }, + { + "epoch": 0.87, + "grad_norm": 0.1826171875, + "learning_rate": 9.571586241459462e-06, + "loss": 1.1564, + "step": 15950 + }, + { + "epoch": 0.87, + "grad_norm": 0.171875, + "learning_rate": 9.530813142156104e-06, + "loss": 1.1474, + "step": 15955 + }, + { + "epoch": 0.87, + "grad_norm": 0.171875, + "learning_rate": 9.490122725127538e-06, + "loss": 1.1543, + "step": 15960 + }, + { + "epoch": 0.87, + "grad_norm": 0.1728515625, + "learning_rate": 9.449515027561828e-06, + "loss": 1.1735, + "step": 15965 + }, + { + "epoch": 0.87, + "grad_norm": 0.166015625, + "learning_rate": 9.408990086571479e-06, + "loss": 1.153, + "step": 15970 + }, + { + "epoch": 0.88, + "grad_norm": 0.1767578125, + "learning_rate": 9.368547939193373e-06, + "loss": 1.1329, + "step": 15975 + }, + { + "epoch": 0.88, + "grad_norm": 0.1748046875, + "learning_rate": 9.328188622388644e-06, + "loss": 1.1603, + "step": 15980 + }, + { + "epoch": 0.88, + "grad_norm": 0.16796875, + "learning_rate": 9.287912173042811e-06, + "loss": 1.1533, + "step": 15985 + }, + { + "epoch": 0.88, + "grad_norm": 0.17578125, + "learning_rate": 9.24771862796563e-06, + "loss": 1.1527, + "step": 15990 + }, + { + "epoch": 0.88, + "grad_norm": 0.1728515625, + "learning_rate": 9.207608023891101e-06, + "loss": 1.1962, + "step": 15995 + }, + { + "epoch": 0.88, + "grad_norm": 0.1787109375, + "learning_rate": 9.16758039747736e-06, + "loss": 1.1322, + "step": 16000 + }, + { + "epoch": 0.88, + "grad_norm": 0.1748046875, + "learning_rate": 9.127635785306788e-06, + "loss": 1.1952, + "step": 16005 + }, + { + "epoch": 0.88, + "grad_norm": 0.1767578125, + "learning_rate": 9.087774223885848e-06, + "loss": 1.1636, + "step": 16010 + }, + { + "epoch": 0.88, + "grad_norm": 0.169921875, + "learning_rate": 9.047995749645145e-06, + "loss": 1.1521, + "step": 16015 + }, + { + "epoch": 0.88, + "grad_norm": 0.171875, + "learning_rate": 9.008300398939274e-06, + "loss": 1.27, + "step": 16020 + }, + { + "epoch": 0.88, + "grad_norm": 0.1728515625, + "learning_rate": 8.968688208046938e-06, + "loss": 1.1969, + "step": 16025 + }, + { + "epoch": 0.88, + "grad_norm": 0.185546875, + "learning_rate": 8.929159213170802e-06, + "loss": 1.177, + "step": 16030 + }, + { + "epoch": 0.88, + "grad_norm": 0.171875, + "learning_rate": 8.88971345043751e-06, + "loss": 1.0645, + "step": 16035 + }, + { + "epoch": 0.88, + "grad_norm": 0.177734375, + "learning_rate": 8.85035095589759e-06, + "loss": 1.0861, + "step": 16040 + }, + { + "epoch": 0.88, + "grad_norm": 0.181640625, + "learning_rate": 8.811071765525537e-06, + "loss": 1.1567, + "step": 16045 + }, + { + "epoch": 0.88, + "grad_norm": 0.1748046875, + "learning_rate": 8.771875915219674e-06, + "loss": 1.1757, + "step": 16050 + }, + { + "epoch": 0.88, + "grad_norm": 0.1728515625, + "learning_rate": 8.732763440802183e-06, + "loss": 1.1666, + "step": 16055 + }, + { + "epoch": 0.88, + "grad_norm": 0.1748046875, + "learning_rate": 8.693734378019003e-06, + "loss": 1.1707, + "step": 16060 + }, + { + "epoch": 0.88, + "grad_norm": 0.173828125, + "learning_rate": 8.654788762539879e-06, + "loss": 1.1437, + "step": 16065 + }, + { + "epoch": 0.88, + "grad_norm": 0.1708984375, + "learning_rate": 8.61592662995827e-06, + "loss": 1.1796, + "step": 16070 + }, + { + "epoch": 0.88, + "grad_norm": 0.1689453125, + "learning_rate": 8.577148015791348e-06, + "loss": 1.161, + "step": 16075 + }, + { + "epoch": 0.88, + "grad_norm": 0.1787109375, + "learning_rate": 8.538452955479982e-06, + "loss": 1.1462, + "step": 16080 + }, + { + "epoch": 0.88, + "grad_norm": 0.1640625, + "learning_rate": 8.499841484388615e-06, + "loss": 1.1826, + "step": 16085 + }, + { + "epoch": 0.88, + "grad_norm": 0.16796875, + "learning_rate": 8.461313637805335e-06, + "loss": 1.094, + "step": 16090 + }, + { + "epoch": 0.88, + "grad_norm": 0.17578125, + "learning_rate": 8.422869450941807e-06, + "loss": 1.2613, + "step": 16095 + }, + { + "epoch": 0.88, + "grad_norm": 0.1728515625, + "learning_rate": 8.384508958933256e-06, + "loss": 1.1028, + "step": 16100 + }, + { + "epoch": 0.88, + "grad_norm": 0.1708984375, + "learning_rate": 8.346232196838333e-06, + "loss": 1.1893, + "step": 16105 + }, + { + "epoch": 0.88, + "grad_norm": 0.173828125, + "learning_rate": 8.308039199639261e-06, + "loss": 1.2048, + "step": 16110 + }, + { + "epoch": 0.88, + "grad_norm": 0.173828125, + "learning_rate": 8.269930002241655e-06, + "loss": 1.1932, + "step": 16115 + }, + { + "epoch": 0.88, + "grad_norm": 0.173828125, + "learning_rate": 8.231904639474585e-06, + "loss": 1.2113, + "step": 16120 + }, + { + "epoch": 0.88, + "grad_norm": 0.181640625, + "learning_rate": 8.193963146090432e-06, + "loss": 1.2568, + "step": 16125 + }, + { + "epoch": 0.88, + "grad_norm": 0.1767578125, + "learning_rate": 8.156105556764982e-06, + "loss": 1.1633, + "step": 16130 + }, + { + "epoch": 0.88, + "grad_norm": 0.1669921875, + "learning_rate": 8.118331906097342e-06, + "loss": 1.1578, + "step": 16135 + }, + { + "epoch": 0.88, + "grad_norm": 0.169921875, + "learning_rate": 8.080642228609891e-06, + "loss": 1.175, + "step": 16140 + }, + { + "epoch": 0.88, + "grad_norm": 0.1708984375, + "learning_rate": 8.04303655874824e-06, + "loss": 1.228, + "step": 16145 + }, + { + "epoch": 0.88, + "grad_norm": 0.181640625, + "learning_rate": 8.005514930881242e-06, + "loss": 1.1441, + "step": 16150 + }, + { + "epoch": 0.88, + "grad_norm": 0.171875, + "learning_rate": 7.968077379300964e-06, + "loss": 1.1286, + "step": 16155 + }, + { + "epoch": 0.89, + "grad_norm": 0.1845703125, + "learning_rate": 7.930723938222595e-06, + "loss": 1.1959, + "step": 16160 + }, + { + "epoch": 0.89, + "grad_norm": 0.1708984375, + "learning_rate": 7.893454641784493e-06, + "loss": 1.2265, + "step": 16165 + }, + { + "epoch": 0.89, + "grad_norm": 0.1767578125, + "learning_rate": 7.856269524048066e-06, + "loss": 1.2004, + "step": 16170 + }, + { + "epoch": 0.89, + "grad_norm": 0.1689453125, + "learning_rate": 7.819168618997819e-06, + "loss": 1.2231, + "step": 16175 + }, + { + "epoch": 0.89, + "grad_norm": 0.1728515625, + "learning_rate": 7.782151960541306e-06, + "loss": 1.185, + "step": 16180 + }, + { + "epoch": 0.89, + "grad_norm": 0.177734375, + "learning_rate": 7.745219582509066e-06, + "loss": 1.1759, + "step": 16185 + }, + { + "epoch": 0.89, + "grad_norm": 0.1748046875, + "learning_rate": 7.708371518654589e-06, + "loss": 1.2174, + "step": 16190 + }, + { + "epoch": 0.89, + "grad_norm": 0.1748046875, + "learning_rate": 7.671607802654345e-06, + "loss": 1.1044, + "step": 16195 + }, + { + "epoch": 0.89, + "grad_norm": 0.19140625, + "learning_rate": 7.634928468107705e-06, + "loss": 1.2369, + "step": 16200 + }, + { + "epoch": 0.89, + "grad_norm": 0.1689453125, + "learning_rate": 7.59833354853694e-06, + "loss": 1.1629, + "step": 16205 + }, + { + "epoch": 0.89, + "grad_norm": 0.1767578125, + "learning_rate": 7.56182307738712e-06, + "loss": 1.2227, + "step": 16210 + }, + { + "epoch": 0.89, + "grad_norm": 0.1748046875, + "learning_rate": 7.525397088026187e-06, + "loss": 1.2655, + "step": 16215 + }, + { + "epoch": 0.89, + "grad_norm": 0.177734375, + "learning_rate": 7.489055613744833e-06, + "loss": 1.1773, + "step": 16220 + }, + { + "epoch": 0.89, + "grad_norm": 0.169921875, + "learning_rate": 7.452798687756568e-06, + "loss": 1.1289, + "step": 16225 + }, + { + "epoch": 0.89, + "grad_norm": 0.169921875, + "learning_rate": 7.4166263431975635e-06, + "loss": 1.1562, + "step": 16230 + }, + { + "epoch": 0.89, + "grad_norm": 0.166015625, + "learning_rate": 7.380538613126709e-06, + "loss": 1.1724, + "step": 16235 + }, + { + "epoch": 0.89, + "grad_norm": 0.173828125, + "learning_rate": 7.344535530525598e-06, + "loss": 1.2103, + "step": 16240 + }, + { + "epoch": 0.89, + "grad_norm": 0.1796875, + "learning_rate": 7.3086171282984115e-06, + "loss": 1.243, + "step": 16245 + }, + { + "epoch": 0.89, + "grad_norm": 0.1708984375, + "learning_rate": 7.272783439272013e-06, + "loss": 1.2094, + "step": 16250 + }, + { + "epoch": 0.89, + "grad_norm": 0.173828125, + "learning_rate": 7.237034496195727e-06, + "loss": 1.1357, + "step": 16255 + }, + { + "epoch": 0.89, + "grad_norm": 0.16796875, + "learning_rate": 7.201370331741541e-06, + "loss": 1.1915, + "step": 16260 + }, + { + "epoch": 0.89, + "grad_norm": 0.1767578125, + "learning_rate": 7.165790978503895e-06, + "loss": 1.183, + "step": 16265 + }, + { + "epoch": 0.89, + "grad_norm": 0.166015625, + "learning_rate": 7.1302964689997535e-06, + "loss": 1.132, + "step": 16270 + }, + { + "epoch": 0.89, + "grad_norm": 0.1748046875, + "learning_rate": 7.0948868356685125e-06, + "loss": 1.2421, + "step": 16275 + }, + { + "epoch": 0.89, + "grad_norm": 0.171875, + "learning_rate": 7.059562110871998e-06, + "loss": 1.0783, + "step": 16280 + }, + { + "epoch": 0.89, + "grad_norm": 0.171875, + "learning_rate": 7.024322326894461e-06, + "loss": 1.2316, + "step": 16285 + }, + { + "epoch": 0.89, + "grad_norm": 0.1708984375, + "learning_rate": 6.989167515942519e-06, + "loss": 1.2171, + "step": 16290 + }, + { + "epoch": 0.89, + "grad_norm": 0.173828125, + "learning_rate": 6.954097710145091e-06, + "loss": 1.2011, + "step": 16295 + }, + { + "epoch": 0.89, + "grad_norm": 0.1689453125, + "learning_rate": 6.919112941553452e-06, + "loss": 1.2027, + "step": 16300 + }, + { + "epoch": 0.89, + "grad_norm": 0.181640625, + "learning_rate": 6.884213242141136e-06, + "loss": 1.1424, + "step": 16305 + }, + { + "epoch": 0.89, + "grad_norm": 0.1767578125, + "learning_rate": 6.849398643803962e-06, + "loss": 1.181, + "step": 16310 + }, + { + "epoch": 0.89, + "grad_norm": 0.1748046875, + "learning_rate": 6.814669178359912e-06, + "loss": 1.1685, + "step": 16315 + }, + { + "epoch": 0.89, + "grad_norm": 0.171875, + "learning_rate": 6.7800248775492185e-06, + "loss": 1.1517, + "step": 16320 + }, + { + "epoch": 0.89, + "grad_norm": 0.1796875, + "learning_rate": 6.745465773034243e-06, + "loss": 1.1752, + "step": 16325 + }, + { + "epoch": 0.89, + "grad_norm": 0.16796875, + "learning_rate": 6.710991896399521e-06, + "loss": 1.1186, + "step": 16330 + }, + { + "epoch": 0.89, + "grad_norm": 0.1806640625, + "learning_rate": 6.676603279151683e-06, + "loss": 1.1278, + "step": 16335 + }, + { + "epoch": 0.89, + "grad_norm": 0.1708984375, + "learning_rate": 6.642299952719388e-06, + "loss": 1.2376, + "step": 16340 + }, + { + "epoch": 0.9, + "grad_norm": 0.1767578125, + "learning_rate": 6.6080819484534265e-06, + "loss": 1.1657, + "step": 16345 + }, + { + "epoch": 0.9, + "grad_norm": 0.1728515625, + "learning_rate": 6.573949297626558e-06, + "loss": 1.1629, + "step": 16350 + }, + { + "epoch": 0.9, + "grad_norm": 0.1787109375, + "learning_rate": 6.539902031433565e-06, + "loss": 1.1644, + "step": 16355 + }, + { + "epoch": 0.9, + "grad_norm": 0.16796875, + "learning_rate": 6.505940180991155e-06, + "loss": 1.1675, + "step": 16360 + }, + { + "epoch": 0.9, + "grad_norm": 0.1728515625, + "learning_rate": 6.472063777338e-06, + "loss": 1.1836, + "step": 16365 + }, + { + "epoch": 0.9, + "grad_norm": 0.1689453125, + "learning_rate": 6.43827285143469e-06, + "loss": 1.1358, + "step": 16370 + }, + { + "epoch": 0.9, + "grad_norm": 0.1826171875, + "learning_rate": 6.404567434163688e-06, + "loss": 1.1718, + "step": 16375 + }, + { + "epoch": 0.9, + "grad_norm": 0.1728515625, + "learning_rate": 6.370947556329265e-06, + "loss": 1.1502, + "step": 16380 + }, + { + "epoch": 0.9, + "grad_norm": 0.1787109375, + "learning_rate": 6.337413248657576e-06, + "loss": 1.1677, + "step": 16385 + }, + { + "epoch": 0.9, + "grad_norm": 0.1728515625, + "learning_rate": 6.303964541796547e-06, + "loss": 1.1864, + "step": 16390 + }, + { + "epoch": 0.9, + "grad_norm": 0.17578125, + "learning_rate": 6.270601466315873e-06, + "loss": 1.1558, + "step": 16395 + }, + { + "epoch": 0.9, + "grad_norm": 0.1708984375, + "learning_rate": 6.237324052706961e-06, + "loss": 1.1839, + "step": 16400 + }, + { + "epoch": 0.9, + "grad_norm": 0.1650390625, + "learning_rate": 6.2041323313829725e-06, + "loss": 1.1195, + "step": 16405 + }, + { + "epoch": 0.9, + "grad_norm": 0.171875, + "learning_rate": 6.171026332678731e-06, + "loss": 1.1968, + "step": 16410 + }, + { + "epoch": 0.9, + "grad_norm": 0.1669921875, + "learning_rate": 6.138006086850734e-06, + "loss": 1.1062, + "step": 16415 + }, + { + "epoch": 0.9, + "grad_norm": 0.1767578125, + "learning_rate": 6.105071624077053e-06, + "loss": 1.1991, + "step": 16420 + }, + { + "epoch": 0.9, + "grad_norm": 0.171875, + "learning_rate": 6.07222297445742e-06, + "loss": 1.1643, + "step": 16425 + }, + { + "epoch": 0.9, + "grad_norm": 0.171875, + "learning_rate": 6.039460168013122e-06, + "loss": 1.1206, + "step": 16430 + }, + { + "epoch": 0.9, + "grad_norm": 0.173828125, + "learning_rate": 6.006783234686975e-06, + "loss": 1.1193, + "step": 16435 + }, + { + "epoch": 0.9, + "grad_norm": 0.1767578125, + "learning_rate": 5.974192204343354e-06, + "loss": 1.2072, + "step": 16440 + }, + { + "epoch": 0.9, + "grad_norm": 0.1728515625, + "learning_rate": 5.941687106768068e-06, + "loss": 1.1326, + "step": 16445 + }, + { + "epoch": 0.9, + "grad_norm": 0.1689453125, + "learning_rate": 5.909267971668419e-06, + "loss": 1.1982, + "step": 16450 + }, + { + "epoch": 0.9, + "grad_norm": 0.1728515625, + "learning_rate": 5.876934828673153e-06, + "loss": 1.1706, + "step": 16455 + }, + { + "epoch": 0.9, + "grad_norm": 0.1728515625, + "learning_rate": 5.8446877073324305e-06, + "loss": 1.1214, + "step": 16460 + }, + { + "epoch": 0.9, + "grad_norm": 0.1650390625, + "learning_rate": 5.812526637117765e-06, + "loss": 1.1113, + "step": 16465 + }, + { + "epoch": 0.9, + "grad_norm": 0.171875, + "learning_rate": 5.780451647422036e-06, + "loss": 1.2201, + "step": 16470 + }, + { + "epoch": 0.9, + "grad_norm": 0.1728515625, + "learning_rate": 5.748462767559471e-06, + "loss": 1.1673, + "step": 16475 + }, + { + "epoch": 0.9, + "grad_norm": 0.17578125, + "learning_rate": 5.716560026765594e-06, + "loss": 1.1072, + "step": 16480 + }, + { + "epoch": 0.9, + "grad_norm": 0.1748046875, + "learning_rate": 5.684743454197173e-06, + "loss": 1.1669, + "step": 16485 + }, + { + "epoch": 0.9, + "grad_norm": 0.169921875, + "learning_rate": 5.6530130789322565e-06, + "loss": 1.2506, + "step": 16490 + }, + { + "epoch": 0.9, + "grad_norm": 0.169921875, + "learning_rate": 5.6213689299701055e-06, + "loss": 1.2053, + "step": 16495 + }, + { + "epoch": 0.9, + "grad_norm": 0.173828125, + "learning_rate": 5.589811036231196e-06, + "loss": 1.1743, + "step": 16500 + }, + { + "epoch": 0.9, + "grad_norm": 0.17578125, + "learning_rate": 5.558339426557124e-06, + "loss": 1.1414, + "step": 16505 + }, + { + "epoch": 0.9, + "grad_norm": 0.1689453125, + "learning_rate": 5.5269541297106705e-06, + "loss": 1.1321, + "step": 16510 + }, + { + "epoch": 0.9, + "grad_norm": 0.169921875, + "learning_rate": 5.495655174375724e-06, + "loss": 1.0863, + "step": 16515 + }, + { + "epoch": 0.9, + "grad_norm": 0.16796875, + "learning_rate": 5.464442589157259e-06, + "loss": 1.1304, + "step": 16520 + }, + { + "epoch": 0.91, + "grad_norm": 0.169921875, + "learning_rate": 5.433316402581323e-06, + "loss": 1.1899, + "step": 16525 + }, + { + "epoch": 0.91, + "grad_norm": 0.173828125, + "learning_rate": 5.402276643094961e-06, + "loss": 1.1078, + "step": 16530 + }, + { + "epoch": 0.91, + "grad_norm": 0.166015625, + "learning_rate": 5.371323339066303e-06, + "loss": 1.1161, + "step": 16535 + }, + { + "epoch": 0.91, + "grad_norm": 0.177734375, + "learning_rate": 5.340456518784387e-06, + "loss": 1.2131, + "step": 16540 + }, + { + "epoch": 0.91, + "grad_norm": 0.1669921875, + "learning_rate": 5.30967621045928e-06, + "loss": 1.1079, + "step": 16545 + }, + { + "epoch": 0.91, + "grad_norm": 0.169921875, + "learning_rate": 5.2789824422219246e-06, + "loss": 1.2354, + "step": 16550 + }, + { + "epoch": 0.91, + "grad_norm": 0.1748046875, + "learning_rate": 5.248375242124215e-06, + "loss": 1.1894, + "step": 16555 + }, + { + "epoch": 0.91, + "grad_norm": 0.16796875, + "learning_rate": 5.217854638138897e-06, + "loss": 1.1514, + "step": 16560 + }, + { + "epoch": 0.91, + "grad_norm": 0.1767578125, + "learning_rate": 5.187420658159614e-06, + "loss": 1.1632, + "step": 16565 + }, + { + "epoch": 0.91, + "grad_norm": 0.1728515625, + "learning_rate": 5.157073330000794e-06, + "loss": 1.1349, + "step": 16570 + }, + { + "epoch": 0.91, + "grad_norm": 0.18359375, + "learning_rate": 5.126812681397697e-06, + "loss": 1.1648, + "step": 16575 + }, + { + "epoch": 0.91, + "grad_norm": 0.1767578125, + "learning_rate": 5.096638740006365e-06, + "loss": 1.2359, + "step": 16580 + }, + { + "epoch": 0.91, + "grad_norm": 0.1708984375, + "learning_rate": 5.066551533403607e-06, + "loss": 1.1126, + "step": 16585 + }, + { + "epoch": 0.91, + "grad_norm": 0.171875, + "learning_rate": 5.036551089086916e-06, + "loss": 1.1309, + "step": 16590 + }, + { + "epoch": 0.91, + "grad_norm": 0.1767578125, + "learning_rate": 5.006637434474537e-06, + "loss": 1.1842, + "step": 16595 + }, + { + "epoch": 0.91, + "grad_norm": 0.1796875, + "learning_rate": 4.976810596905379e-06, + "loss": 1.1839, + "step": 16600 + }, + { + "epoch": 0.91, + "grad_norm": 0.1669921875, + "learning_rate": 4.947070603639015e-06, + "loss": 1.1687, + "step": 16605 + }, + { + "epoch": 0.91, + "grad_norm": 0.17578125, + "learning_rate": 4.917417481855657e-06, + "loss": 1.2529, + "step": 16610 + }, + { + "epoch": 0.91, + "grad_norm": 0.185546875, + "learning_rate": 4.887851258656062e-06, + "loss": 1.2056, + "step": 16615 + }, + { + "epoch": 0.91, + "grad_norm": 0.1689453125, + "learning_rate": 4.858371961061647e-06, + "loss": 1.1376, + "step": 16620 + }, + { + "epoch": 0.91, + "grad_norm": 0.1767578125, + "learning_rate": 4.828979616014351e-06, + "loss": 1.2936, + "step": 16625 + }, + { + "epoch": 0.91, + "grad_norm": 0.1728515625, + "learning_rate": 4.79967425037664e-06, + "loss": 1.1438, + "step": 16630 + }, + { + "epoch": 0.91, + "grad_norm": 0.19921875, + "learning_rate": 4.770455890931469e-06, + "loss": 1.1084, + "step": 16635 + }, + { + "epoch": 0.91, + "grad_norm": 0.16796875, + "learning_rate": 4.7413245643823435e-06, + "loss": 1.2458, + "step": 16640 + }, + { + "epoch": 0.91, + "grad_norm": 0.1748046875, + "learning_rate": 4.712280297353156e-06, + "loss": 1.1415, + "step": 16645 + }, + { + "epoch": 0.91, + "grad_norm": 0.1650390625, + "learning_rate": 4.6833231163882716e-06, + "loss": 1.1359, + "step": 16650 + }, + { + "epoch": 0.91, + "grad_norm": 0.1728515625, + "learning_rate": 4.654453047952434e-06, + "loss": 1.1625, + "step": 16655 + }, + { + "epoch": 0.91, + "grad_norm": 0.1728515625, + "learning_rate": 4.6256701184308135e-06, + "loss": 1.103, + "step": 16660 + }, + { + "epoch": 0.91, + "grad_norm": 0.1767578125, + "learning_rate": 4.5969743541289025e-06, + "loss": 1.1005, + "step": 16665 + }, + { + "epoch": 0.91, + "grad_norm": 0.173828125, + "learning_rate": 4.5683657812725655e-06, + "loss": 1.1641, + "step": 16670 + }, + { + "epoch": 0.91, + "grad_norm": 0.16796875, + "learning_rate": 4.5398444260079555e-06, + "loss": 1.1248, + "step": 16675 + }, + { + "epoch": 0.91, + "grad_norm": 0.1787109375, + "learning_rate": 4.511410314401509e-06, + "loss": 1.235, + "step": 16680 + }, + { + "epoch": 0.91, + "grad_norm": 0.169921875, + "learning_rate": 4.483063472439952e-06, + "loss": 1.1876, + "step": 16685 + }, + { + "epoch": 0.91, + "grad_norm": 0.1806640625, + "learning_rate": 4.4548039260302486e-06, + "loss": 1.1554, + "step": 16690 + }, + { + "epoch": 0.91, + "grad_norm": 0.181640625, + "learning_rate": 4.426631700999584e-06, + "loss": 1.1349, + "step": 16695 + }, + { + "epoch": 0.91, + "grad_norm": 0.1767578125, + "learning_rate": 4.398546823095295e-06, + "loss": 1.1381, + "step": 16700 + }, + { + "epoch": 0.91, + "grad_norm": 0.1787109375, + "learning_rate": 4.370549317984951e-06, + "loss": 1.163, + "step": 16705 + }, + { + "epoch": 0.92, + "grad_norm": 0.1796875, + "learning_rate": 4.342639211256227e-06, + "loss": 1.1566, + "step": 16710 + }, + { + "epoch": 0.92, + "grad_norm": 0.173828125, + "learning_rate": 4.314816528416943e-06, + "loss": 1.223, + "step": 16715 + }, + { + "epoch": 0.92, + "grad_norm": 0.1650390625, + "learning_rate": 4.287081294895001e-06, + "loss": 1.1395, + "step": 16720 + }, + { + "epoch": 0.92, + "grad_norm": 0.1787109375, + "learning_rate": 4.2594335360384264e-06, + "loss": 1.1159, + "step": 16725 + }, + { + "epoch": 0.92, + "grad_norm": 0.171875, + "learning_rate": 4.231873277115217e-06, + "loss": 1.169, + "step": 16730 + }, + { + "epoch": 0.92, + "grad_norm": 0.169921875, + "learning_rate": 4.204400543313503e-06, + "loss": 1.1806, + "step": 16735 + }, + { + "epoch": 0.92, + "grad_norm": 0.1748046875, + "learning_rate": 4.17701535974131e-06, + "loss": 1.1525, + "step": 16740 + }, + { + "epoch": 0.92, + "grad_norm": 0.1748046875, + "learning_rate": 4.149717751426762e-06, + "loss": 1.2027, + "step": 16745 + }, + { + "epoch": 0.92, + "grad_norm": 0.1767578125, + "learning_rate": 4.122507743317861e-06, + "loss": 1.1434, + "step": 16750 + }, + { + "epoch": 0.92, + "grad_norm": 0.1767578125, + "learning_rate": 4.095385360282611e-06, + "loss": 1.1302, + "step": 16755 + }, + { + "epoch": 0.92, + "grad_norm": 0.1748046875, + "learning_rate": 4.068350627108875e-06, + "loss": 1.0971, + "step": 16760 + }, + { + "epoch": 0.92, + "grad_norm": 0.1650390625, + "learning_rate": 4.041403568504443e-06, + "loss": 1.231, + "step": 16765 + }, + { + "epoch": 0.92, + "grad_norm": 0.166015625, + "learning_rate": 4.014544209096971e-06, + "loss": 1.2159, + "step": 16770 + }, + { + "epoch": 0.92, + "grad_norm": 0.1728515625, + "learning_rate": 3.98777257343399e-06, + "loss": 1.1385, + "step": 16775 + }, + { + "epoch": 0.92, + "grad_norm": 0.1787109375, + "learning_rate": 3.961088685982806e-06, + "loss": 1.1849, + "step": 16780 + }, + { + "epoch": 0.92, + "grad_norm": 0.1708984375, + "learning_rate": 3.9344925711305655e-06, + "loss": 1.1613, + "step": 16785 + }, + { + "epoch": 0.92, + "grad_norm": 0.169921875, + "learning_rate": 3.907984253184182e-06, + "loss": 1.1847, + "step": 16790 + }, + { + "epoch": 0.92, + "grad_norm": 0.1767578125, + "learning_rate": 3.881563756370344e-06, + "loss": 1.1473, + "step": 16795 + }, + { + "epoch": 0.92, + "grad_norm": 0.16015625, + "learning_rate": 3.855231104835477e-06, + "loss": 1.0595, + "step": 16800 + }, + { + "epoch": 0.92, + "grad_norm": 0.16796875, + "learning_rate": 3.828986322645678e-06, + "loss": 1.1727, + "step": 16805 + }, + { + "epoch": 0.92, + "grad_norm": 0.1748046875, + "learning_rate": 3.802829433786825e-06, + "loss": 1.1347, + "step": 16810 + }, + { + "epoch": 0.92, + "grad_norm": 0.173828125, + "learning_rate": 3.7767604621643747e-06, + "loss": 1.2203, + "step": 16815 + }, + { + "epoch": 0.92, + "grad_norm": 0.16796875, + "learning_rate": 3.750779431603513e-06, + "loss": 1.1079, + "step": 16820 + }, + { + "epoch": 0.92, + "grad_norm": 0.1708984375, + "learning_rate": 3.7248863658489787e-06, + "loss": 1.2136, + "step": 16825 + }, + { + "epoch": 0.92, + "grad_norm": 0.1708984375, + "learning_rate": 3.699081288565187e-06, + "loss": 1.1902, + "step": 16830 + }, + { + "epoch": 0.92, + "grad_norm": 0.1748046875, + "learning_rate": 3.6733642233360952e-06, + "loss": 1.1588, + "step": 16835 + }, + { + "epoch": 0.92, + "grad_norm": 0.1708984375, + "learning_rate": 3.647735193665247e-06, + "loss": 1.2034, + "step": 16840 + }, + { + "epoch": 0.92, + "grad_norm": 0.173828125, + "learning_rate": 3.622194222975672e-06, + "loss": 1.2295, + "step": 16845 + }, + { + "epoch": 0.92, + "grad_norm": 0.17578125, + "learning_rate": 3.596741334610032e-06, + "loss": 1.1129, + "step": 16850 + }, + { + "epoch": 0.92, + "grad_norm": 0.1748046875, + "learning_rate": 3.5713765518303743e-06, + "loss": 1.1785, + "step": 16855 + }, + { + "epoch": 0.92, + "grad_norm": 0.177734375, + "learning_rate": 3.546099897818289e-06, + "loss": 1.243, + "step": 16860 + }, + { + "epoch": 0.92, + "grad_norm": 0.1767578125, + "learning_rate": 3.5209113956747862e-06, + "loss": 1.1932, + "step": 16865 + }, + { + "epoch": 0.92, + "grad_norm": 0.1748046875, + "learning_rate": 3.4958110684203405e-06, + "loss": 1.1639, + "step": 16870 + }, + { + "epoch": 0.92, + "grad_norm": 0.177734375, + "learning_rate": 3.4707989389948457e-06, + "loss": 1.1287, + "step": 16875 + }, + { + "epoch": 0.92, + "grad_norm": 0.1767578125, + "learning_rate": 3.4458750302575503e-06, + "loss": 1.1221, + "step": 16880 + }, + { + "epoch": 0.92, + "grad_norm": 0.1748046875, + "learning_rate": 3.421039364987122e-06, + "loss": 1.0995, + "step": 16885 + }, + { + "epoch": 0.93, + "grad_norm": 0.17578125, + "learning_rate": 3.396291965881515e-06, + "loss": 1.1231, + "step": 16890 + }, + { + "epoch": 0.93, + "grad_norm": 0.166015625, + "learning_rate": 3.3716328555581154e-06, + "loss": 1.2138, + "step": 16895 + }, + { + "epoch": 0.93, + "grad_norm": 0.1728515625, + "learning_rate": 3.3470620565535294e-06, + "loss": 1.2367, + "step": 16900 + }, + { + "epoch": 0.93, + "grad_norm": 0.1806640625, + "learning_rate": 3.322579591323682e-06, + "loss": 1.2178, + "step": 16905 + }, + { + "epoch": 0.93, + "grad_norm": 0.1806640625, + "learning_rate": 3.298185482243765e-06, + "loss": 1.1914, + "step": 16910 + }, + { + "epoch": 0.93, + "grad_norm": 0.1689453125, + "learning_rate": 3.2738797516082663e-06, + "loss": 1.183, + "step": 16915 + }, + { + "epoch": 0.93, + "grad_norm": 0.1796875, + "learning_rate": 3.2496624216308057e-06, + "loss": 1.204, + "step": 16920 + }, + { + "epoch": 0.93, + "grad_norm": 0.1748046875, + "learning_rate": 3.225533514444312e-06, + "loss": 1.17, + "step": 16925 + }, + { + "epoch": 0.93, + "grad_norm": 0.1767578125, + "learning_rate": 3.2014930521008126e-06, + "loss": 1.0692, + "step": 16930 + }, + { + "epoch": 0.93, + "grad_norm": 0.1669921875, + "learning_rate": 3.1775410565715866e-06, + "loss": 1.0283, + "step": 16935 + }, + { + "epoch": 0.93, + "grad_norm": 0.17578125, + "learning_rate": 3.1536775497470028e-06, + "loss": 1.1455, + "step": 16940 + }, + { + "epoch": 0.93, + "grad_norm": 0.173828125, + "learning_rate": 3.1299025534365813e-06, + "loss": 1.1927, + "step": 16945 + }, + { + "epoch": 0.93, + "grad_norm": 0.1748046875, + "learning_rate": 3.1062160893689297e-06, + "loss": 1.19, + "step": 16950 + }, + { + "epoch": 0.93, + "grad_norm": 0.1708984375, + "learning_rate": 3.082618179191765e-06, + "loss": 1.1795, + "step": 16955 + }, + { + "epoch": 0.93, + "grad_norm": 0.1728515625, + "learning_rate": 3.0591088444718697e-06, + "loss": 1.155, + "step": 16960 + }, + { + "epoch": 0.93, + "grad_norm": 0.1787109375, + "learning_rate": 3.035688106695067e-06, + "loss": 1.1472, + "step": 16965 + }, + { + "epoch": 0.93, + "grad_norm": 0.1748046875, + "learning_rate": 3.0123559872662244e-06, + "loss": 1.1495, + "step": 16970 + }, + { + "epoch": 0.93, + "grad_norm": 0.1787109375, + "learning_rate": 2.989112507509195e-06, + "loss": 1.2443, + "step": 16975 + }, + { + "epoch": 0.93, + "grad_norm": 0.1669921875, + "learning_rate": 2.9659576886668317e-06, + "loss": 1.2309, + "step": 16980 + }, + { + "epoch": 0.93, + "grad_norm": 0.1689453125, + "learning_rate": 2.9428915519009616e-06, + "loss": 1.1494, + "step": 16985 + }, + { + "epoch": 0.93, + "grad_norm": 0.1728515625, + "learning_rate": 2.919914118292366e-06, + "loss": 1.1678, + "step": 16990 + }, + { + "epoch": 0.93, + "grad_norm": 0.169921875, + "learning_rate": 2.8970254088407368e-06, + "loss": 1.1647, + "step": 16995 + }, + { + "epoch": 0.93, + "grad_norm": 0.166015625, + "learning_rate": 2.8742254444647178e-06, + "loss": 1.1213, + "step": 17000 + }, + { + "epoch": 0.93, + "grad_norm": 0.1748046875, + "learning_rate": 2.8515142460018075e-06, + "loss": 1.1535, + "step": 17005 + }, + { + "epoch": 0.93, + "grad_norm": 0.177734375, + "learning_rate": 2.8288918342084027e-06, + "loss": 1.1511, + "step": 17010 + }, + { + "epoch": 0.93, + "grad_norm": 0.185546875, + "learning_rate": 2.8063582297597313e-06, + "loss": 1.1657, + "step": 17015 + }, + { + "epoch": 0.93, + "grad_norm": 0.169921875, + "learning_rate": 2.7839134532498977e-06, + "loss": 1.2301, + "step": 17020 + }, + { + "epoch": 0.93, + "grad_norm": 0.1708984375, + "learning_rate": 2.761557525191771e-06, + "loss": 1.0988, + "step": 17025 + }, + { + "epoch": 0.93, + "grad_norm": 0.169921875, + "learning_rate": 2.739290466017075e-06, + "loss": 1.2522, + "step": 17030 + }, + { + "epoch": 0.93, + "grad_norm": 0.1728515625, + "learning_rate": 2.717112296076263e-06, + "loss": 1.1256, + "step": 17035 + }, + { + "epoch": 0.93, + "grad_norm": 0.17578125, + "learning_rate": 2.6950230356385996e-06, + "loss": 1.1679, + "step": 17040 + }, + { + "epoch": 0.93, + "grad_norm": 0.1689453125, + "learning_rate": 2.6730227048920365e-06, + "loss": 1.1421, + "step": 17045 + }, + { + "epoch": 0.93, + "grad_norm": 0.1728515625, + "learning_rate": 2.6511113239433004e-06, + "loss": 1.1202, + "step": 17050 + }, + { + "epoch": 0.93, + "grad_norm": 0.169921875, + "learning_rate": 2.6292889128178065e-06, + "loss": 1.1374, + "step": 17055 + }, + { + "epoch": 0.93, + "grad_norm": 0.1787109375, + "learning_rate": 2.6075554914596456e-06, + "loss": 1.1719, + "step": 17060 + }, + { + "epoch": 0.93, + "grad_norm": 0.16796875, + "learning_rate": 2.5859110797315846e-06, + "loss": 1.2577, + "step": 17065 + }, + { + "epoch": 0.93, + "grad_norm": 0.1787109375, + "learning_rate": 2.564355697415055e-06, + "loss": 1.2182, + "step": 17070 + }, + { + "epoch": 0.94, + "grad_norm": 0.173828125, + "learning_rate": 2.5428893642101216e-06, + "loss": 1.1502, + "step": 17075 + }, + { + "epoch": 0.94, + "grad_norm": 0.1708984375, + "learning_rate": 2.5215120997354235e-06, + "loss": 1.2016, + "step": 17080 + }, + { + "epoch": 0.94, + "grad_norm": 0.1669921875, + "learning_rate": 2.5002239235282664e-06, + "loss": 1.1663, + "step": 17085 + }, + { + "epoch": 0.94, + "grad_norm": 0.177734375, + "learning_rate": 2.479024855044476e-06, + "loss": 1.1409, + "step": 17090 + }, + { + "epoch": 0.94, + "grad_norm": 0.1796875, + "learning_rate": 2.4579149136584876e-06, + "loss": 1.2097, + "step": 17095 + }, + { + "epoch": 0.94, + "grad_norm": 0.16796875, + "learning_rate": 2.436894118663213e-06, + "loss": 1.1563, + "step": 17100 + }, + { + "epoch": 0.94, + "grad_norm": 0.1689453125, + "learning_rate": 2.415962489270196e-06, + "loss": 1.1454, + "step": 17105 + }, + { + "epoch": 0.94, + "grad_norm": 0.171875, + "learning_rate": 2.395120044609367e-06, + "loss": 1.125, + "step": 17110 + }, + { + "epoch": 0.94, + "grad_norm": 0.171875, + "learning_rate": 2.374366803729244e-06, + "loss": 1.1371, + "step": 17115 + }, + { + "epoch": 0.94, + "grad_norm": 0.173828125, + "learning_rate": 2.3537027855967676e-06, + "loss": 1.1338, + "step": 17120 + }, + { + "epoch": 0.94, + "grad_norm": 0.16796875, + "learning_rate": 2.333128009097374e-06, + "loss": 1.2073, + "step": 17125 + }, + { + "epoch": 0.94, + "grad_norm": 0.177734375, + "learning_rate": 2.312642493034889e-06, + "loss": 1.0865, + "step": 17130 + }, + { + "epoch": 0.94, + "grad_norm": 0.1689453125, + "learning_rate": 2.2922462561316026e-06, + "loss": 1.1977, + "step": 17135 + }, + { + "epoch": 0.94, + "grad_norm": 0.169921875, + "learning_rate": 2.271939317028182e-06, + "loss": 1.1992, + "step": 17140 + }, + { + "epoch": 0.94, + "grad_norm": 0.1708984375, + "learning_rate": 2.251721694283726e-06, + "loss": 1.1972, + "step": 17145 + }, + { + "epoch": 0.94, + "grad_norm": 0.1728515625, + "learning_rate": 2.231593406375654e-06, + "loss": 1.2196, + "step": 17150 + }, + { + "epoch": 0.94, + "grad_norm": 0.173828125, + "learning_rate": 2.2115544716997617e-06, + "loss": 1.2578, + "step": 17155 + }, + { + "epoch": 0.94, + "grad_norm": 0.1767578125, + "learning_rate": 2.1916049085701886e-06, + "loss": 1.1821, + "step": 17160 + }, + { + "epoch": 0.94, + "grad_norm": 0.171875, + "learning_rate": 2.171744735219372e-06, + "loss": 1.1414, + "step": 17165 + }, + { + "epoch": 0.94, + "grad_norm": 0.1650390625, + "learning_rate": 2.1519739697980933e-06, + "loss": 1.1677, + "step": 17170 + }, + { + "epoch": 0.94, + "grad_norm": 0.1669921875, + "learning_rate": 2.1322926303753877e-06, + "loss": 1.1222, + "step": 17175 + }, + { + "epoch": 0.94, + "grad_norm": 0.177734375, + "learning_rate": 2.1127007349385773e-06, + "loss": 1.1948, + "step": 17180 + }, + { + "epoch": 0.94, + "grad_norm": 0.1728515625, + "learning_rate": 2.0931983013932065e-06, + "loss": 1.1847, + "step": 17185 + }, + { + "epoch": 0.94, + "grad_norm": 0.171875, + "learning_rate": 2.0737853475631285e-06, + "loss": 1.1168, + "step": 17190 + }, + { + "epoch": 0.94, + "grad_norm": 0.1689453125, + "learning_rate": 2.054461891190351e-06, + "loss": 1.1159, + "step": 17195 + }, + { + "epoch": 0.94, + "grad_norm": 0.16796875, + "learning_rate": 2.0352279499351147e-06, + "loss": 1.2277, + "step": 17200 + }, + { + "epoch": 0.94, + "grad_norm": 0.1728515625, + "learning_rate": 2.016083541375824e-06, + "loss": 1.0641, + "step": 17205 + }, + { + "epoch": 0.94, + "grad_norm": 0.1767578125, + "learning_rate": 1.9970286830091167e-06, + "loss": 1.2393, + "step": 17210 + }, + { + "epoch": 0.94, + "grad_norm": 0.1611328125, + "learning_rate": 1.9780633922497404e-06, + "loss": 1.1617, + "step": 17215 + }, + { + "epoch": 0.94, + "grad_norm": 0.177734375, + "learning_rate": 1.9591876864305856e-06, + "loss": 1.159, + "step": 17220 + }, + { + "epoch": 0.94, + "grad_norm": 0.177734375, + "learning_rate": 1.9404015828026756e-06, + "loss": 1.0684, + "step": 17225 + }, + { + "epoch": 0.94, + "grad_norm": 0.1796875, + "learning_rate": 1.921705098535165e-06, + "loss": 1.1426, + "step": 17230 + }, + { + "epoch": 0.94, + "grad_norm": 0.1708984375, + "learning_rate": 1.9030982507152628e-06, + "loss": 1.1149, + "step": 17235 + }, + { + "epoch": 0.94, + "grad_norm": 0.1640625, + "learning_rate": 1.8845810563483002e-06, + "loss": 1.0976, + "step": 17240 + }, + { + "epoch": 0.94, + "grad_norm": 0.1640625, + "learning_rate": 1.8661535323576286e-06, + "loss": 1.0789, + "step": 17245 + }, + { + "epoch": 0.94, + "grad_norm": 0.1669921875, + "learning_rate": 1.8478156955846871e-06, + "loss": 1.1253, + "step": 17250 + }, + { + "epoch": 0.95, + "grad_norm": 0.17578125, + "learning_rate": 1.829567562788914e-06, + "loss": 1.1965, + "step": 17255 + }, + { + "epoch": 0.95, + "grad_norm": 0.1689453125, + "learning_rate": 1.8114091506477914e-06, + "loss": 1.1672, + "step": 17260 + }, + { + "epoch": 0.95, + "grad_norm": 0.1767578125, + "learning_rate": 1.7933404757567884e-06, + "loss": 1.1776, + "step": 17265 + }, + { + "epoch": 0.95, + "grad_norm": 0.1669921875, + "learning_rate": 1.77536155462934e-06, + "loss": 1.1156, + "step": 17270 + }, + { + "epoch": 0.95, + "grad_norm": 0.1669921875, + "learning_rate": 1.7574724036969026e-06, + "loss": 1.1717, + "step": 17275 + }, + { + "epoch": 0.95, + "grad_norm": 0.1767578125, + "learning_rate": 1.7396730393088644e-06, + "loss": 1.1301, + "step": 17280 + }, + { + "epoch": 0.95, + "grad_norm": 0.169921875, + "learning_rate": 1.7219634777325354e-06, + "loss": 1.2481, + "step": 17285 + }, + { + "epoch": 0.95, + "grad_norm": 0.1708984375, + "learning_rate": 1.7043437351531798e-06, + "loss": 1.1498, + "step": 17290 + }, + { + "epoch": 0.95, + "grad_norm": 0.1748046875, + "learning_rate": 1.686813827673961e-06, + "loss": 1.1509, + "step": 17295 + }, + { + "epoch": 0.95, + "grad_norm": 0.166015625, + "learning_rate": 1.6693737713159519e-06, + "loss": 1.1795, + "step": 17300 + }, + { + "epoch": 0.95, + "grad_norm": 0.1865234375, + "learning_rate": 1.6520235820180918e-06, + "loss": 1.2174, + "step": 17305 + }, + { + "epoch": 0.95, + "grad_norm": 0.1708984375, + "learning_rate": 1.6347632756371855e-06, + "loss": 1.1921, + "step": 17310 + }, + { + "epoch": 0.95, + "grad_norm": 0.169921875, + "learning_rate": 1.6175928679479258e-06, + "loss": 1.1467, + "step": 17315 + }, + { + "epoch": 0.95, + "grad_norm": 0.169921875, + "learning_rate": 1.6005123746428042e-06, + "loss": 1.1252, + "step": 17320 + }, + { + "epoch": 0.95, + "grad_norm": 0.173828125, + "learning_rate": 1.583521811332145e-06, + "loss": 1.216, + "step": 17325 + }, + { + "epoch": 0.95, + "grad_norm": 0.1728515625, + "learning_rate": 1.5666211935441044e-06, + "loss": 1.1271, + "step": 17330 + }, + { + "epoch": 0.95, + "grad_norm": 0.1708984375, + "learning_rate": 1.5498105367246274e-06, + "loss": 1.1873, + "step": 17335 + }, + { + "epoch": 0.95, + "grad_norm": 0.171875, + "learning_rate": 1.533089856237413e-06, + "loss": 1.2123, + "step": 17340 + }, + { + "epoch": 0.95, + "grad_norm": 0.171875, + "learning_rate": 1.5164591673639595e-06, + "loss": 1.267, + "step": 17345 + }, + { + "epoch": 0.95, + "grad_norm": 0.1806640625, + "learning_rate": 1.4999184853035197e-06, + "loss": 1.1982, + "step": 17350 + }, + { + "epoch": 0.95, + "grad_norm": 0.171875, + "learning_rate": 1.4834678251730572e-06, + "loss": 1.1584, + "step": 17355 + }, + { + "epoch": 0.95, + "grad_norm": 0.1728515625, + "learning_rate": 1.4671072020073118e-06, + "loss": 1.2699, + "step": 17360 + }, + { + "epoch": 0.95, + "grad_norm": 0.173828125, + "learning_rate": 1.4508366307586786e-06, + "loss": 1.2251, + "step": 17365 + }, + { + "epoch": 0.95, + "grad_norm": 0.16796875, + "learning_rate": 1.4346561262972957e-06, + "loss": 1.1924, + "step": 17370 + }, + { + "epoch": 0.95, + "grad_norm": 0.1728515625, + "learning_rate": 1.4185657034109567e-06, + "loss": 1.2191, + "step": 17375 + }, + { + "epoch": 0.95, + "grad_norm": 0.171875, + "learning_rate": 1.402565376805165e-06, + "loss": 1.1823, + "step": 17380 + }, + { + "epoch": 0.95, + "grad_norm": 0.1767578125, + "learning_rate": 1.3866551611030342e-06, + "loss": 1.1107, + "step": 17385 + }, + { + "epoch": 0.95, + "grad_norm": 0.177734375, + "learning_rate": 1.3708350708453554e-06, + "loss": 1.2043, + "step": 17390 + }, + { + "epoch": 0.95, + "grad_norm": 0.1748046875, + "learning_rate": 1.3551051204905408e-06, + "loss": 1.1392, + "step": 17395 + }, + { + "epoch": 0.95, + "grad_norm": 0.1689453125, + "learning_rate": 1.339465324414635e-06, + "loss": 1.122, + "step": 17400 + }, + { + "epoch": 0.95, + "grad_norm": 0.17578125, + "learning_rate": 1.323915696911271e-06, + "loss": 1.1663, + "step": 17405 + }, + { + "epoch": 0.95, + "grad_norm": 0.1806640625, + "learning_rate": 1.30845625219167e-06, + "loss": 1.2134, + "step": 17410 + }, + { + "epoch": 0.95, + "grad_norm": 0.1748046875, + "learning_rate": 1.2930870043846633e-06, + "loss": 1.1754, + "step": 17415 + }, + { + "epoch": 0.95, + "grad_norm": 0.169921875, + "learning_rate": 1.277807967536615e-06, + "loss": 1.1718, + "step": 17420 + }, + { + "epoch": 0.95, + "grad_norm": 0.1640625, + "learning_rate": 1.2626191556114664e-06, + "loss": 1.1119, + "step": 17425 + }, + { + "epoch": 0.95, + "grad_norm": 0.173828125, + "learning_rate": 1.2475205824906689e-06, + "loss": 1.1386, + "step": 17430 + }, + { + "epoch": 0.95, + "grad_norm": 0.1689453125, + "learning_rate": 1.23251226197324e-06, + "loss": 1.2457, + "step": 17435 + }, + { + "epoch": 0.96, + "grad_norm": 0.1787109375, + "learning_rate": 1.2175942077756963e-06, + "loss": 1.1837, + "step": 17440 + }, + { + "epoch": 0.96, + "grad_norm": 0.1728515625, + "learning_rate": 1.2027664335320655e-06, + "loss": 1.2327, + "step": 17445 + }, + { + "epoch": 0.96, + "grad_norm": 0.1767578125, + "learning_rate": 1.1880289527938404e-06, + "loss": 1.2002, + "step": 17450 + }, + { + "epoch": 0.96, + "grad_norm": 0.177734375, + "learning_rate": 1.173381779030014e-06, + "loss": 1.1953, + "step": 17455 + }, + { + "epoch": 0.96, + "grad_norm": 0.16796875, + "learning_rate": 1.1588249256270334e-06, + "loss": 1.2091, + "step": 17460 + }, + { + "epoch": 0.96, + "grad_norm": 0.169921875, + "learning_rate": 1.1443584058888347e-06, + "loss": 1.1761, + "step": 17465 + }, + { + "epoch": 0.96, + "grad_norm": 0.173828125, + "learning_rate": 1.1299822330367305e-06, + "loss": 1.1212, + "step": 17470 + }, + { + "epoch": 0.96, + "grad_norm": 0.1748046875, + "learning_rate": 1.1156964202095332e-06, + "loss": 1.2025, + "step": 17475 + }, + { + "epoch": 0.96, + "grad_norm": 0.171875, + "learning_rate": 1.1015009804634102e-06, + "loss": 1.2314, + "step": 17480 + }, + { + "epoch": 0.96, + "grad_norm": 0.1767578125, + "learning_rate": 1.0873959267719947e-06, + "loss": 1.1906, + "step": 17485 + }, + { + "epoch": 0.96, + "grad_norm": 0.169921875, + "learning_rate": 1.0733812720262525e-06, + "loss": 1.2927, + "step": 17490 + }, + { + "epoch": 0.96, + "grad_norm": 0.177734375, + "learning_rate": 1.059457029034583e-06, + "loss": 1.1541, + "step": 17495 + }, + { + "epoch": 0.96, + "grad_norm": 0.1689453125, + "learning_rate": 1.045623210522717e-06, + "loss": 1.1553, + "step": 17500 + }, + { + "epoch": 0.96, + "grad_norm": 0.1708984375, + "learning_rate": 1.0318798291337861e-06, + "loss": 1.1507, + "step": 17505 + }, + { + "epoch": 0.96, + "grad_norm": 0.173828125, + "learning_rate": 1.0182268974282206e-06, + "loss": 1.1781, + "step": 17510 + }, + { + "epoch": 0.96, + "grad_norm": 0.1640625, + "learning_rate": 1.0046644278838057e-06, + "loss": 1.1769, + "step": 17515 + }, + { + "epoch": 0.96, + "grad_norm": 0.171875, + "learning_rate": 9.911924328956823e-07, + "loss": 1.1506, + "step": 17520 + }, + { + "epoch": 0.96, + "grad_norm": 0.173828125, + "learning_rate": 9.778109247762568e-07, + "loss": 1.1506, + "step": 17525 + }, + { + "epoch": 0.96, + "grad_norm": 0.1748046875, + "learning_rate": 9.645199157552688e-07, + "loss": 1.2123, + "step": 17530 + }, + { + "epoch": 0.96, + "grad_norm": 0.17578125, + "learning_rate": 9.513194179797235e-07, + "loss": 1.1271, + "step": 17535 + }, + { + "epoch": 0.96, + "grad_norm": 0.171875, + "learning_rate": 9.382094435139155e-07, + "loss": 1.1028, + "step": 17540 + }, + { + "epoch": 0.96, + "grad_norm": 0.16796875, + "learning_rate": 9.251900043394269e-07, + "loss": 1.1772, + "step": 17545 + }, + { + "epoch": 0.96, + "grad_norm": 0.1728515625, + "learning_rate": 9.122611123550839e-07, + "loss": 1.1611, + "step": 17550 + }, + { + "epoch": 0.96, + "grad_norm": 0.1728515625, + "learning_rate": 8.99422779376935e-07, + "loss": 1.1994, + "step": 17555 + }, + { + "epoch": 0.96, + "grad_norm": 0.1728515625, + "learning_rate": 8.866750171382942e-07, + "loss": 1.1474, + "step": 17560 + }, + { + "epoch": 0.96, + "grad_norm": 0.1669921875, + "learning_rate": 8.740178372896979e-07, + "loss": 1.1702, + "step": 17565 + }, + { + "epoch": 0.96, + "grad_norm": 0.1787109375, + "learning_rate": 8.614512513988815e-07, + "loss": 1.1506, + "step": 17570 + }, + { + "epoch": 0.96, + "grad_norm": 0.1806640625, + "learning_rate": 8.489752709508025e-07, + "loss": 1.1921, + "step": 17575 + }, + { + "epoch": 0.96, + "grad_norm": 0.1708984375, + "learning_rate": 8.365899073475958e-07, + "loss": 1.132, + "step": 17580 + }, + { + "epoch": 0.96, + "grad_norm": 0.173828125, + "learning_rate": 8.242951719085734e-07, + "loss": 1.2136, + "step": 17585 + }, + { + "epoch": 0.96, + "grad_norm": 0.1630859375, + "learning_rate": 8.120910758702582e-07, + "loss": 1.1054, + "step": 17590 + }, + { + "epoch": 0.96, + "grad_norm": 0.1669921875, + "learning_rate": 7.999776303862838e-07, + "loss": 1.0366, + "step": 17595 + }, + { + "epoch": 0.96, + "grad_norm": 0.173828125, + "learning_rate": 7.879548465274833e-07, + "loss": 1.1015, + "step": 17600 + }, + { + "epoch": 0.96, + "grad_norm": 0.17578125, + "learning_rate": 7.760227352817895e-07, + "loss": 1.2512, + "step": 17605 + }, + { + "epoch": 0.96, + "grad_norm": 0.173828125, + "learning_rate": 7.641813075543014e-07, + "loss": 1.1721, + "step": 17610 + }, + { + "epoch": 0.96, + "grad_norm": 0.1748046875, + "learning_rate": 7.524305741672178e-07, + "loss": 1.194, + "step": 17615 + }, + { + "epoch": 0.97, + "grad_norm": 0.1708984375, + "learning_rate": 7.40770545859848e-07, + "loss": 1.158, + "step": 17620 + }, + { + "epoch": 0.97, + "grad_norm": 0.16796875, + "learning_rate": 7.292012332886344e-07, + "loss": 1.0983, + "step": 17625 + }, + { + "epoch": 0.97, + "grad_norm": 0.162109375, + "learning_rate": 7.177226470270748e-07, + "loss": 1.1755, + "step": 17630 + }, + { + "epoch": 0.97, + "grad_norm": 0.1728515625, + "learning_rate": 7.063347975657664e-07, + "loss": 1.1139, + "step": 17635 + }, + { + "epoch": 0.97, + "grad_norm": 0.1708984375, + "learning_rate": 6.950376953123727e-07, + "loss": 1.1781, + "step": 17640 + }, + { + "epoch": 0.97, + "grad_norm": 0.166015625, + "learning_rate": 6.838313505916238e-07, + "loss": 1.2069, + "step": 17645 + }, + { + "epoch": 0.97, + "grad_norm": 0.169921875, + "learning_rate": 6.727157736453272e-07, + "loss": 1.215, + "step": 17650 + }, + { + "epoch": 0.97, + "grad_norm": 0.1767578125, + "learning_rate": 6.616909746322897e-07, + "loss": 1.1989, + "step": 17655 + }, + { + "epoch": 0.97, + "grad_norm": 0.1796875, + "learning_rate": 6.507569636283961e-07, + "loss": 1.1836, + "step": 17660 + }, + { + "epoch": 0.97, + "grad_norm": 0.1796875, + "learning_rate": 6.399137506265307e-07, + "loss": 1.214, + "step": 17665 + }, + { + "epoch": 0.97, + "grad_norm": 0.17578125, + "learning_rate": 6.291613455365886e-07, + "loss": 1.2227, + "step": 17670 + }, + { + "epoch": 0.97, + "grad_norm": 0.1708984375, + "learning_rate": 6.184997581855201e-07, + "loss": 1.0723, + "step": 17675 + }, + { + "epoch": 0.97, + "grad_norm": 0.166015625, + "learning_rate": 6.079289983172199e-07, + "loss": 1.152, + "step": 17680 + }, + { + "epoch": 0.97, + "grad_norm": 0.17578125, + "learning_rate": 5.974490755926043e-07, + "loss": 1.2155, + "step": 17685 + }, + { + "epoch": 0.97, + "grad_norm": 0.169921875, + "learning_rate": 5.870599995895564e-07, + "loss": 1.1577, + "step": 17690 + }, + { + "epoch": 0.97, + "grad_norm": 0.1806640625, + "learning_rate": 5.767617798029589e-07, + "loss": 1.0966, + "step": 17695 + }, + { + "epoch": 0.97, + "grad_norm": 0.166015625, + "learning_rate": 5.665544256446054e-07, + "loss": 1.1387, + "step": 17700 + }, + { + "epoch": 0.97, + "grad_norm": 0.173828125, + "learning_rate": 5.56437946443289e-07, + "loss": 1.1982, + "step": 17705 + }, + { + "epoch": 0.97, + "grad_norm": 0.1728515625, + "learning_rate": 5.464123514447472e-07, + "loss": 1.1425, + "step": 17710 + }, + { + "epoch": 0.97, + "grad_norm": 0.18359375, + "learning_rate": 5.364776498116398e-07, + "loss": 1.1517, + "step": 17715 + }, + { + "epoch": 0.97, + "grad_norm": 0.16796875, + "learning_rate": 5.266338506235701e-07, + "loss": 1.1926, + "step": 17720 + }, + { + "epoch": 0.97, + "grad_norm": 0.1728515625, + "learning_rate": 5.168809628770532e-07, + "loss": 1.1041, + "step": 17725 + }, + { + "epoch": 0.97, + "grad_norm": 0.1689453125, + "learning_rate": 5.072189954855034e-07, + "loss": 1.0947, + "step": 17730 + }, + { + "epoch": 0.97, + "grad_norm": 0.171875, + "learning_rate": 4.976479572792903e-07, + "loss": 1.0862, + "step": 17735 + }, + { + "epoch": 0.97, + "grad_norm": 0.1865234375, + "learning_rate": 4.88167857005628e-07, + "loss": 1.2524, + "step": 17740 + }, + { + "epoch": 0.97, + "grad_norm": 0.173828125, + "learning_rate": 4.787787033286417e-07, + "loss": 1.2265, + "step": 17745 + }, + { + "epoch": 0.97, + "grad_norm": 0.181640625, + "learning_rate": 4.694805048293338e-07, + "loss": 1.2045, + "step": 17750 + }, + { + "epoch": 0.97, + "grad_norm": 0.1689453125, + "learning_rate": 4.602732700055956e-07, + "loss": 1.1682, + "step": 17755 + }, + { + "epoch": 0.97, + "grad_norm": 0.181640625, + "learning_rate": 4.511570072721738e-07, + "loss": 1.2312, + "step": 17760 + }, + { + "epoch": 0.97, + "grad_norm": 0.171875, + "learning_rate": 4.4213172496065936e-07, + "loss": 1.1783, + "step": 17765 + }, + { + "epoch": 0.97, + "grad_norm": 0.16796875, + "learning_rate": 4.3319743131949864e-07, + "loss": 1.1469, + "step": 17770 + }, + { + "epoch": 0.97, + "grad_norm": 0.1748046875, + "learning_rate": 4.243541345140045e-07, + "loss": 1.1705, + "step": 17775 + }, + { + "epoch": 0.97, + "grad_norm": 0.16796875, + "learning_rate": 4.15601842626312e-07, + "loss": 1.0916, + "step": 17780 + }, + { + "epoch": 0.97, + "grad_norm": 0.169921875, + "learning_rate": 4.0694056365535584e-07, + "loss": 1.1413, + "step": 17785 + }, + { + "epoch": 0.97, + "grad_norm": 0.19140625, + "learning_rate": 3.9837030551694853e-07, + "loss": 1.2364, + "step": 17790 + }, + { + "epoch": 0.97, + "grad_norm": 0.1708984375, + "learning_rate": 3.8989107604365803e-07, + "loss": 1.1832, + "step": 17795 + }, + { + "epoch": 0.97, + "grad_norm": 0.171875, + "learning_rate": 3.8150288298489655e-07, + "loss": 1.1379, + "step": 17800 + }, + { + "epoch": 0.98, + "grad_norm": 0.1806640625, + "learning_rate": 3.732057340068762e-07, + "loss": 1.0899, + "step": 17805 + }, + { + "epoch": 0.98, + "grad_norm": 0.1728515625, + "learning_rate": 3.6499963669256453e-07, + "loss": 1.0986, + "step": 17810 + }, + { + "epoch": 0.98, + "grad_norm": 0.173828125, + "learning_rate": 3.5688459854175127e-07, + "loss": 1.1834, + "step": 17815 + }, + { + "epoch": 0.98, + "grad_norm": 0.173828125, + "learning_rate": 3.4886062697100375e-07, + "loss": 1.1115, + "step": 17820 + }, + { + "epoch": 0.98, + "grad_norm": 0.17578125, + "learning_rate": 3.409277293136337e-07, + "loss": 1.1083, + "step": 17825 + }, + { + "epoch": 0.98, + "grad_norm": 0.1884765625, + "learning_rate": 3.3308591281973055e-07, + "loss": 1.1524, + "step": 17830 + }, + { + "epoch": 0.98, + "grad_norm": 0.1787109375, + "learning_rate": 3.253351846561614e-07, + "loss": 1.1753, + "step": 17835 + }, + { + "epoch": 0.98, + "grad_norm": 0.1689453125, + "learning_rate": 3.1767555190652667e-07, + "loss": 1.1314, + "step": 17840 + }, + { + "epoch": 0.98, + "grad_norm": 0.1748046875, + "learning_rate": 3.10107021571171e-07, + "loss": 1.1214, + "step": 17845 + }, + { + "epoch": 0.98, + "grad_norm": 0.1748046875, + "learning_rate": 3.026296005671725e-07, + "loss": 1.1708, + "step": 17850 + }, + { + "epoch": 0.98, + "grad_norm": 0.1787109375, + "learning_rate": 2.9524329572837573e-07, + "loss": 1.1913, + "step": 17855 + }, + { + "epoch": 0.98, + "grad_norm": 0.1748046875, + "learning_rate": 2.8794811380530304e-07, + "loss": 1.2385, + "step": 17860 + }, + { + "epoch": 0.98, + "grad_norm": 0.1708984375, + "learning_rate": 2.8074406146523234e-07, + "loss": 1.1682, + "step": 17865 + }, + { + "epoch": 0.98, + "grad_norm": 0.169921875, + "learning_rate": 2.736311452921525e-07, + "loss": 1.1836, + "step": 17870 + }, + { + "epoch": 0.98, + "grad_norm": 0.171875, + "learning_rate": 2.6660937178674127e-07, + "loss": 1.127, + "step": 17875 + }, + { + "epoch": 0.98, + "grad_norm": 0.177734375, + "learning_rate": 2.5967874736638756e-07, + "loss": 1.1737, + "step": 17880 + }, + { + "epoch": 0.98, + "grad_norm": 0.1748046875, + "learning_rate": 2.5283927836519117e-07, + "loss": 1.1194, + "step": 17885 + }, + { + "epoch": 0.98, + "grad_norm": 0.1787109375, + "learning_rate": 2.4609097103392984e-07, + "loss": 1.1573, + "step": 17890 + }, + { + "epoch": 0.98, + "grad_norm": 0.1689453125, + "learning_rate": 2.3943383154005904e-07, + "loss": 1.1017, + "step": 17895 + }, + { + "epoch": 0.98, + "grad_norm": 0.169921875, + "learning_rate": 2.3286786596772302e-07, + "loss": 1.2186, + "step": 17900 + }, + { + "epoch": 0.98, + "grad_norm": 0.16796875, + "learning_rate": 2.2639308031774388e-07, + "loss": 1.1778, + "step": 17905 + }, + { + "epoch": 0.98, + "grad_norm": 0.177734375, + "learning_rate": 2.2000948050759917e-07, + "loss": 1.0945, + "step": 17910 + }, + { + "epoch": 0.98, + "grad_norm": 0.171875, + "learning_rate": 2.1371707237144433e-07, + "loss": 1.1886, + "step": 17915 + }, + { + "epoch": 0.98, + "grad_norm": 0.17578125, + "learning_rate": 2.0751586166007919e-07, + "loss": 1.1772, + "step": 17920 + }, + { + "epoch": 0.98, + "grad_norm": 0.181640625, + "learning_rate": 2.0140585404094803e-07, + "loss": 1.2029, + "step": 17925 + }, + { + "epoch": 0.98, + "grad_norm": 0.16796875, + "learning_rate": 1.95387055098184e-07, + "loss": 1.0444, + "step": 17930 + }, + { + "epoch": 0.98, + "grad_norm": 0.1767578125, + "learning_rate": 1.894594703324981e-07, + "loss": 1.1908, + "step": 17935 + }, + { + "epoch": 0.98, + "grad_norm": 0.177734375, + "learning_rate": 1.8362310516130132e-07, + "loss": 1.1938, + "step": 17940 + }, + { + "epoch": 0.98, + "grad_norm": 0.1669921875, + "learning_rate": 1.7787796491859355e-07, + "loss": 1.1436, + "step": 17945 + }, + { + "epoch": 0.98, + "grad_norm": 0.169921875, + "learning_rate": 1.7222405485501912e-07, + "loss": 1.2447, + "step": 17950 + }, + { + "epoch": 0.98, + "grad_norm": 0.1650390625, + "learning_rate": 1.666613801378447e-07, + "loss": 1.1373, + "step": 17955 + }, + { + "epoch": 0.98, + "grad_norm": 0.1728515625, + "learning_rate": 1.6118994585094805e-07, + "loss": 1.2357, + "step": 17960 + }, + { + "epoch": 0.98, + "grad_norm": 0.1708984375, + "learning_rate": 1.5580975699482914e-07, + "loss": 1.1555, + "step": 17965 + }, + { + "epoch": 0.98, + "grad_norm": 0.1806640625, + "learning_rate": 1.5052081848657694e-07, + "loss": 1.1874, + "step": 17970 + }, + { + "epoch": 0.98, + "grad_norm": 0.1689453125, + "learning_rate": 1.453231351599138e-07, + "loss": 1.2045, + "step": 17975 + }, + { + "epoch": 0.98, + "grad_norm": 0.1748046875, + "learning_rate": 1.4021671176513984e-07, + "loss": 1.2113, + "step": 17980 + }, + { + "epoch": 0.99, + "grad_norm": 0.1708984375, + "learning_rate": 1.3520155296914417e-07, + "loss": 1.2882, + "step": 17985 + }, + { + "epoch": 0.99, + "grad_norm": 0.1708984375, + "learning_rate": 1.3027766335543812e-07, + "loss": 1.2075, + "step": 17990 + }, + { + "epoch": 0.99, + "grad_norm": 0.169921875, + "learning_rate": 1.254450474240776e-07, + "loss": 1.2115, + "step": 17995 + }, + { + "epoch": 0.99, + "grad_norm": 0.1748046875, + "learning_rate": 1.207037095917407e-07, + "loss": 1.279, + "step": 18000 + }, + { + "epoch": 0.99, + "grad_norm": 0.1708984375, + "learning_rate": 1.1605365419166126e-07, + "loss": 1.163, + "step": 18005 + }, + { + "epoch": 0.99, + "grad_norm": 0.1728515625, + "learning_rate": 1.1149488547365083e-07, + "loss": 1.1874, + "step": 18010 + }, + { + "epoch": 0.99, + "grad_norm": 0.1728515625, + "learning_rate": 1.0702740760409891e-07, + "loss": 1.1259, + "step": 18015 + }, + { + "epoch": 0.99, + "grad_norm": 0.171875, + "learning_rate": 1.0265122466593946e-07, + "loss": 1.1281, + "step": 18020 + }, + { + "epoch": 0.99, + "grad_norm": 0.171875, + "learning_rate": 9.836634065870654e-08, + "loss": 1.2164, + "step": 18025 + }, + { + "epoch": 0.99, + "grad_norm": 0.1767578125, + "learning_rate": 9.41727594984676e-08, + "loss": 1.1758, + "step": 18030 + }, + { + "epoch": 0.99, + "grad_norm": 0.17578125, + "learning_rate": 9.007048501784576e-08, + "loss": 1.1462, + "step": 18035 + }, + { + "epoch": 0.99, + "grad_norm": 0.1708984375, + "learning_rate": 8.605952096601977e-08, + "loss": 1.1391, + "step": 18040 + }, + { + "epoch": 0.99, + "grad_norm": 0.169921875, + "learning_rate": 8.213987100873511e-08, + "loss": 1.1093, + "step": 18045 + }, + { + "epoch": 0.99, + "grad_norm": 0.166015625, + "learning_rate": 7.831153872824847e-08, + "loss": 1.1492, + "step": 18050 + }, + { + "epoch": 0.99, + "grad_norm": 0.1728515625, + "learning_rate": 7.457452762339445e-08, + "loss": 1.1813, + "step": 18055 + }, + { + "epoch": 0.99, + "grad_norm": 0.166015625, + "learning_rate": 7.092884110951881e-08, + "loss": 1.144, + "step": 18060 + }, + { + "epoch": 0.99, + "grad_norm": 0.1689453125, + "learning_rate": 6.737448251852296e-08, + "loss": 1.2227, + "step": 18065 + }, + { + "epoch": 0.99, + "grad_norm": 0.1787109375, + "learning_rate": 6.391145509883067e-08, + "loss": 1.1563, + "step": 18070 + }, + { + "epoch": 0.99, + "grad_norm": 0.1748046875, + "learning_rate": 6.053976201538803e-08, + "loss": 1.2125, + "step": 18075 + }, + { + "epoch": 0.99, + "grad_norm": 0.169921875, + "learning_rate": 5.725940634969673e-08, + "loss": 1.0353, + "step": 18080 + }, + { + "epoch": 0.99, + "grad_norm": 0.1708984375, + "learning_rate": 5.407039109974754e-08, + "loss": 1.1594, + "step": 18085 + }, + { + "epoch": 0.99, + "grad_norm": 0.169921875, + "learning_rate": 5.097271918007573e-08, + "loss": 1.186, + "step": 18090 + }, + { + "epoch": 0.99, + "grad_norm": 0.173828125, + "learning_rate": 4.7966393421727815e-08, + "loss": 1.2099, + "step": 18095 + }, + { + "epoch": 0.99, + "grad_norm": 0.171875, + "learning_rate": 4.505141657226153e-08, + "loss": 1.1668, + "step": 18100 + }, + { + "epoch": 0.99, + "grad_norm": 0.17578125, + "learning_rate": 4.222779129576804e-08, + "loss": 1.1367, + "step": 18105 + }, + { + "epoch": 0.99, + "grad_norm": 0.1669921875, + "learning_rate": 3.949552017282754e-08, + "loss": 1.1588, + "step": 18110 + }, + { + "epoch": 0.99, + "grad_norm": 0.16796875, + "learning_rate": 3.685460570053145e-08, + "loss": 1.1962, + "step": 18115 + }, + { + "epoch": 0.99, + "grad_norm": 0.169921875, + "learning_rate": 3.430505029249354e-08, + "loss": 1.1058, + "step": 18120 + }, + { + "epoch": 0.99, + "grad_norm": 0.1728515625, + "learning_rate": 3.184685627882766e-08, + "loss": 1.2007, + "step": 18125 + }, + { + "epoch": 0.99, + "grad_norm": 0.1826171875, + "learning_rate": 2.948002590612564e-08, + "loss": 1.1123, + "step": 18130 + }, + { + "epoch": 0.99, + "grad_norm": 0.171875, + "learning_rate": 2.7204561337523803e-08, + "loss": 1.1671, + "step": 18135 + }, + { + "epoch": 0.99, + "grad_norm": 0.177734375, + "learning_rate": 2.5020464652603103e-08, + "loss": 1.2248, + "step": 18140 + }, + { + "epoch": 0.99, + "grad_norm": 0.16796875, + "learning_rate": 2.292773784748903e-08, + "loss": 1.1082, + "step": 18145 + }, + { + "epoch": 0.99, + "grad_norm": 0.173828125, + "learning_rate": 2.0926382834784984e-08, + "loss": 1.1874, + "step": 18150 + }, + { + "epoch": 0.99, + "grad_norm": 0.171875, + "learning_rate": 1.9016401443572306e-08, + "loss": 1.0811, + "step": 18155 + }, + { + "epoch": 0.99, + "grad_norm": 0.1669921875, + "learning_rate": 1.7197795419443552e-08, + "loss": 1.2035, + "step": 18160 + }, + { + "epoch": 0.99, + "grad_norm": 0.1787109375, + "learning_rate": 1.54705664244581e-08, + "loss": 1.2233, + "step": 18165 + }, + { + "epoch": 1.0, + "grad_norm": 0.171875, + "learning_rate": 1.3834716037197659e-08, + "loss": 1.1751, + "step": 18170 + }, + { + "epoch": 1.0, + "grad_norm": 0.1728515625, + "learning_rate": 1.2290245752699659e-08, + "loss": 1.2549, + "step": 18175 + }, + { + "epoch": 1.0, + "grad_norm": 0.1787109375, + "learning_rate": 1.0837156982501651e-08, + "loss": 1.1124, + "step": 18180 + }, + { + "epoch": 1.0, + "grad_norm": 0.1796875, + "learning_rate": 9.475451054619112e-09, + "loss": 1.1638, + "step": 18185 + }, + { + "epoch": 1.0, + "grad_norm": 0.18359375, + "learning_rate": 8.20512921355654e-09, + "loss": 1.1538, + "step": 18190 + }, + { + "epoch": 1.0, + "grad_norm": 0.1748046875, + "learning_rate": 7.026192620285254e-09, + "loss": 1.1048, + "step": 18195 + }, + { + "epoch": 1.0, + "grad_norm": 0.173828125, + "learning_rate": 5.938642352287804e-09, + "loss": 1.1768, + "step": 18200 + }, + { + "epoch": 1.0, + "grad_norm": 0.1748046875, + "learning_rate": 4.942479403491351e-09, + "loss": 1.1869, + "step": 18205 + }, + { + "epoch": 1.0, + "grad_norm": 0.16796875, + "learning_rate": 4.037704684312083e-09, + "loss": 1.1892, + "step": 18210 + }, + { + "epoch": 1.0, + "grad_norm": 0.1748046875, + "learning_rate": 3.2243190216552087e-09, + "loss": 1.1961, + "step": 18215 + }, + { + "epoch": 1.0, + "grad_norm": 0.177734375, + "learning_rate": 2.502323158903863e-09, + "loss": 1.0843, + "step": 18220 + }, + { + "epoch": 1.0, + "grad_norm": 0.1767578125, + "learning_rate": 1.8717177558968957e-09, + "loss": 1.1736, + "step": 18225 + }, + { + "epoch": 1.0, + "grad_norm": 0.173828125, + "learning_rate": 1.3325033889621806e-09, + "loss": 1.1493, + "step": 18230 + }, + { + "epoch": 1.0, + "grad_norm": 0.1796875, + "learning_rate": 8.846805509166167e-10, + "loss": 1.1426, + "step": 18235 + }, + { + "epoch": 1.0, + "grad_norm": 0.1767578125, + "learning_rate": 5.282496510328195e-10, + "loss": 1.216, + "step": 18240 + }, + { + "epoch": 1.0, + "grad_norm": 0.171875, + "learning_rate": 2.6321101505022427e-10, + "loss": 1.1377, + "step": 18245 + }, + { + "epoch": 1.0, + "grad_norm": 0.177734375, + "learning_rate": 8.956488520839302e-11, + "loss": 1.1538, + "step": 18250 + }, + { + "epoch": 1.0, + "grad_norm": 0.1728515625, + "learning_rate": 7.31142020260478e-12, + "loss": 1.1785, + "step": 18255 + }, + { + "epoch": 1.0, + "eval_loss": 1.1695406436920166, + "eval_runtime": 1922.8575, + "eval_samples_per_second": 8.407, + "eval_steps_per_second": 1.051, + "step": 18257 + }, + { + "epoch": 1.0, + "step": 18257, + "total_flos": 1.8793056999854572e+18, + "train_loss": 1.1836543822331609, + "train_runtime": 65654.1144, + "train_samples_per_second": 2.225, + "train_steps_per_second": 0.278 + } + ], + "logging_steps": 5, + "max_steps": 18257, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 1.8793056999854572e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}