{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9740777666999003, "eval_steps": 125, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 1.8329473944625845, "learning_rate": 1.0000000000000002e-06, "loss": 0.9189, "step": 1 }, { "epoch": 0.0, "eval_loss": 0.8840048313140869, "eval_runtime": 99.9262, "eval_samples_per_second": 17.693, "eval_steps_per_second": 0.37, "step": 1 }, { "epoch": 0.01, "grad_norm": 1.7916344264608899, "learning_rate": 2.0000000000000003e-06, "loss": 0.8962, "step": 2 }, { "epoch": 0.01, "grad_norm": 1.8909931480365287, "learning_rate": 3e-06, "loss": 0.8805, "step": 3 }, { "epoch": 0.02, "grad_norm": 1.6318273112027453, "learning_rate": 4.000000000000001e-06, "loss": 0.913, "step": 4 }, { "epoch": 0.02, "grad_norm": 1.2463401136319747, "learning_rate": 5e-06, "loss": 0.908, "step": 5 }, { "epoch": 0.02, "grad_norm": 1.1463980681106876, "learning_rate": 6e-06, "loss": 0.8729, "step": 6 }, { "epoch": 0.03, "grad_norm": 0.9477573494094379, "learning_rate": 7e-06, "loss": 0.8411, "step": 7 }, { "epoch": 0.03, "grad_norm": 4.0165120162042935, "learning_rate": 8.000000000000001e-06, "loss": 1.0541, "step": 8 }, { "epoch": 0.04, "grad_norm": 1.0713771331971476, "learning_rate": 9e-06, "loss": 0.8495, "step": 9 }, { "epoch": 0.04, "grad_norm": 0.8667235558943894, "learning_rate": 1e-05, "loss": 0.8199, "step": 10 }, { "epoch": 0.04, "grad_norm": 0.7411429457268661, "learning_rate": 9.999897234791831e-06, "loss": 0.7964, "step": 11 }, { "epoch": 0.05, "grad_norm": 0.5729968036750446, "learning_rate": 9.999588943391597e-06, "loss": 0.8039, "step": 12 }, { "epoch": 0.05, "grad_norm": 0.5402964207486183, "learning_rate": 9.99907513847195e-06, "loss": 0.8287, "step": 13 }, { "epoch": 0.06, "grad_norm": 0.5633328266442124, "learning_rate": 9.9983558411534e-06, "loss": 0.7842, "step": 14 }, { "epoch": 0.06, "grad_norm": 0.5412290905686791, "learning_rate": 9.99743108100344e-06, "loss": 0.8345, "step": 15 }, { "epoch": 0.06, "grad_norm": 0.4895379189634968, "learning_rate": 9.99630089603534e-06, "loss": 0.7922, "step": 16 }, { "epoch": 0.07, "grad_norm": 0.5088260537094976, "learning_rate": 9.994965332706574e-06, "loss": 0.7969, "step": 17 }, { "epoch": 0.07, "grad_norm": 0.47075507205524136, "learning_rate": 9.993424445916923e-06, "loss": 0.7931, "step": 18 }, { "epoch": 0.08, "grad_norm": 0.3878407143429931, "learning_rate": 9.991678299006206e-06, "loss": 0.8041, "step": 19 }, { "epoch": 0.08, "grad_norm": 0.3873731682942636, "learning_rate": 9.989726963751683e-06, "loss": 0.8107, "step": 20 }, { "epoch": 0.08, "grad_norm": 0.42417629604043083, "learning_rate": 9.987570520365105e-06, "loss": 0.7874, "step": 21 }, { "epoch": 0.09, "grad_norm": 0.4324680733617199, "learning_rate": 9.98520905748941e-06, "loss": 0.8025, "step": 22 }, { "epoch": 0.09, "grad_norm": 0.34546199757993784, "learning_rate": 9.982642672195093e-06, "loss": 0.8048, "step": 23 }, { "epoch": 0.1, "grad_norm": 0.35958771648273496, "learning_rate": 9.979871469976197e-06, "loss": 0.831, "step": 24 }, { "epoch": 0.1, "grad_norm": 0.3940074197908444, "learning_rate": 9.976895564745993e-06, "loss": 0.7944, "step": 25 }, { "epoch": 0.1, "grad_norm": 0.3818406187889153, "learning_rate": 9.973715078832288e-06, "loss": 0.7936, "step": 26 }, { "epoch": 0.11, "grad_norm": 10.237346743255186, "learning_rate": 9.970330142972403e-06, "loss": 1.0017, "step": 27 }, { "epoch": 0.11, "grad_norm": 6.504690612414681, "learning_rate": 9.966740896307791e-06, "loss": 1.0329, "step": 28 }, { "epoch": 0.12, "grad_norm": 0.4028775109425473, "learning_rate": 9.962947486378325e-06, "loss": 0.7702, "step": 29 }, { "epoch": 0.12, "grad_norm": 0.39810277536967076, "learning_rate": 9.95895006911623e-06, "loss": 0.771, "step": 30 }, { "epoch": 0.12, "grad_norm": 0.29862663396811506, "learning_rate": 9.954748808839675e-06, "loss": 0.7767, "step": 31 }, { "epoch": 0.13, "grad_norm": 0.3106188272362696, "learning_rate": 9.950343878246011e-06, "loss": 0.7943, "step": 32 }, { "epoch": 0.13, "grad_norm": 0.34702364911134964, "learning_rate": 9.945735458404681e-06, "loss": 0.7972, "step": 33 }, { "epoch": 0.14, "grad_norm": 0.3216448960978253, "learning_rate": 9.94092373874978e-06, "loss": 0.7847, "step": 34 }, { "epoch": 0.14, "grad_norm": 0.31232207978504006, "learning_rate": 9.935908917072253e-06, "loss": 0.7738, "step": 35 }, { "epoch": 0.14, "grad_norm": 0.3004886604892709, "learning_rate": 9.930691199511775e-06, "loss": 0.7877, "step": 36 }, { "epoch": 0.15, "grad_norm": 0.2870013960815822, "learning_rate": 9.925270800548285e-06, "loss": 0.754, "step": 37 }, { "epoch": 0.15, "grad_norm": 0.28322113595593756, "learning_rate": 9.91964794299315e-06, "loss": 0.7445, "step": 38 }, { "epoch": 0.16, "grad_norm": 0.3065117198934518, "learning_rate": 9.91382285798002e-06, "loss": 0.787, "step": 39 }, { "epoch": 0.16, "grad_norm": 0.2727693466806482, "learning_rate": 9.907795784955327e-06, "loss": 0.7865, "step": 40 }, { "epoch": 0.16, "grad_norm": 0.2746198009076503, "learning_rate": 9.901566971668437e-06, "loss": 0.7755, "step": 41 }, { "epoch": 0.17, "grad_norm": 0.2888207750688948, "learning_rate": 9.895136674161466e-06, "loss": 0.7789, "step": 42 }, { "epoch": 0.17, "grad_norm": 0.26218141394209254, "learning_rate": 9.888505156758758e-06, "loss": 0.7781, "step": 43 }, { "epoch": 0.18, "grad_norm": 0.27028128323788914, "learning_rate": 9.881672692056022e-06, "loss": 0.7596, "step": 44 }, { "epoch": 0.18, "grad_norm": 0.301432889634355, "learning_rate": 9.874639560909118e-06, "loss": 0.7746, "step": 45 }, { "epoch": 0.18, "grad_norm": 0.27768870163187315, "learning_rate": 9.867406052422525e-06, "loss": 0.7751, "step": 46 }, { "epoch": 0.19, "grad_norm": 0.2638230079020965, "learning_rate": 9.85997246393744e-06, "loss": 0.8085, "step": 47 }, { "epoch": 0.19, "grad_norm": 0.2826098837962784, "learning_rate": 9.852339101019574e-06, "loss": 0.7878, "step": 48 }, { "epoch": 0.2, "grad_norm": 0.2673052298412088, "learning_rate": 9.844506277446577e-06, "loss": 0.7747, "step": 49 }, { "epoch": 0.2, "grad_norm": 0.2589820555015507, "learning_rate": 9.836474315195148e-06, "loss": 0.7491, "step": 50 }, { "epoch": 0.2, "grad_norm": 0.27744141325372174, "learning_rate": 9.828243544427795e-06, "loss": 0.771, "step": 51 }, { "epoch": 0.21, "grad_norm": 0.25617202776049003, "learning_rate": 9.819814303479268e-06, "loss": 0.789, "step": 52 }, { "epoch": 0.21, "grad_norm": 0.25777796417187593, "learning_rate": 9.811186938842645e-06, "loss": 0.7498, "step": 53 }, { "epoch": 0.22, "grad_norm": 0.26356120702424557, "learning_rate": 9.802361805155097e-06, "loss": 0.7618, "step": 54 }, { "epoch": 0.22, "grad_norm": 0.24594116238284844, "learning_rate": 9.793339265183303e-06, "loss": 0.7647, "step": 55 }, { "epoch": 0.22, "grad_norm": 0.2766331605712476, "learning_rate": 9.784119689808545e-06, "loss": 0.7757, "step": 56 }, { "epoch": 0.23, "grad_norm": 0.2674205732918615, "learning_rate": 9.774703458011453e-06, "loss": 0.7479, "step": 57 }, { "epoch": 0.23, "grad_norm": 0.25100414008068433, "learning_rate": 9.765090956856437e-06, "loss": 0.7629, "step": 58 }, { "epoch": 0.24, "grad_norm": 0.2558976905977626, "learning_rate": 9.755282581475769e-06, "loss": 0.7368, "step": 59 }, { "epoch": 0.24, "grad_norm": 0.2816597522453804, "learning_rate": 9.745278735053345e-06, "loss": 0.7675, "step": 60 }, { "epoch": 0.24, "grad_norm": 0.27864046582364604, "learning_rate": 9.735079828808107e-06, "loss": 0.7693, "step": 61 }, { "epoch": 0.25, "grad_norm": 0.2537495381298166, "learning_rate": 9.724686281977146e-06, "loss": 0.7612, "step": 62 }, { "epoch": 0.25, "grad_norm": 0.27161360619636454, "learning_rate": 9.714098521798466e-06, "loss": 0.7659, "step": 63 }, { "epoch": 0.26, "grad_norm": 0.257282261183055, "learning_rate": 9.703316983493414e-06, "loss": 0.77, "step": 64 }, { "epoch": 0.26, "grad_norm": 0.2598148868150837, "learning_rate": 9.692342110248802e-06, "loss": 0.7637, "step": 65 }, { "epoch": 0.26, "grad_norm": 0.25319486577746536, "learning_rate": 9.681174353198687e-06, "loss": 0.7529, "step": 66 }, { "epoch": 0.27, "grad_norm": 0.2616187230129625, "learning_rate": 9.669814171405818e-06, "loss": 0.7482, "step": 67 }, { "epoch": 0.27, "grad_norm": 0.2531735015293101, "learning_rate": 9.658262031842772e-06, "loss": 0.7507, "step": 68 }, { "epoch": 0.28, "grad_norm": 0.2540031125746497, "learning_rate": 9.64651840937276e-06, "loss": 0.7573, "step": 69 }, { "epoch": 0.28, "grad_norm": 0.26251145119756225, "learning_rate": 9.63458378673011e-06, "loss": 0.7617, "step": 70 }, { "epoch": 0.28, "grad_norm": 11.659656865913, "learning_rate": 9.622458654500408e-06, "loss": 0.9807, "step": 71 }, { "epoch": 0.29, "grad_norm": 43.33218979251603, "learning_rate": 9.610143511100354e-06, "loss": 1.0213, "step": 72 }, { "epoch": 0.29, "grad_norm": 0.29661440448786996, "learning_rate": 9.597638862757255e-06, "loss": 0.7597, "step": 73 }, { "epoch": 0.3, "grad_norm": 0.2674359864711363, "learning_rate": 9.584945223488227e-06, "loss": 0.7716, "step": 74 }, { "epoch": 0.3, "grad_norm": 0.2587397735578842, "learning_rate": 9.572063115079063e-06, "loss": 0.7654, "step": 75 }, { "epoch": 0.3, "grad_norm": 0.27326638279450294, "learning_rate": 9.558993067062785e-06, "loss": 0.7832, "step": 76 }, { "epoch": 0.31, "grad_norm": 0.26424783232216553, "learning_rate": 9.545735616697875e-06, "loss": 0.7509, "step": 77 }, { "epoch": 0.31, "grad_norm": 0.26894661215694415, "learning_rate": 9.532291308946191e-06, "loss": 0.7638, "step": 78 }, { "epoch": 0.32, "grad_norm": 12.381149099110814, "learning_rate": 9.518660696450567e-06, "loss": 0.9726, "step": 79 }, { "epoch": 0.32, "grad_norm": 63.276974873593076, "learning_rate": 9.504844339512096e-06, "loss": 0.961, "step": 80 }, { "epoch": 0.32, "grad_norm": 0.34404347007223207, "learning_rate": 9.490842806067095e-06, "loss": 0.7366, "step": 81 }, { "epoch": 0.33, "grad_norm": 0.2761892805994169, "learning_rate": 9.476656671663766e-06, "loss": 0.7565, "step": 82 }, { "epoch": 0.33, "grad_norm": 0.2938700568825168, "learning_rate": 9.462286519438531e-06, "loss": 0.7586, "step": 83 }, { "epoch": 0.33, "grad_norm": 0.30998708104141814, "learning_rate": 9.44773294009206e-06, "loss": 0.747, "step": 84 }, { "epoch": 0.34, "grad_norm": 0.2789622879446074, "learning_rate": 9.432996531865001e-06, "loss": 0.7381, "step": 85 }, { "epoch": 0.34, "grad_norm": 0.3043211841621936, "learning_rate": 9.418077900513377e-06, "loss": 0.7648, "step": 86 }, { "epoch": 0.35, "grad_norm": 0.27269347275749684, "learning_rate": 9.40297765928369e-06, "loss": 0.7287, "step": 87 }, { "epoch": 0.35, "grad_norm": 0.29165683068711035, "learning_rate": 9.387696428887715e-06, "loss": 0.7714, "step": 88 }, { "epoch": 0.35, "grad_norm": 0.29093659611546846, "learning_rate": 9.372234837476979e-06, "loss": 0.754, "step": 89 }, { "epoch": 0.36, "grad_norm": 0.2622000520062877, "learning_rate": 9.356593520616948e-06, "loss": 0.7604, "step": 90 }, { "epoch": 0.36, "grad_norm": 0.29648676556314774, "learning_rate": 9.340773121260893e-06, "loss": 0.7677, "step": 91 }, { "epoch": 0.37, "grad_norm": 0.2971691719126809, "learning_rate": 9.324774289723469e-06, "loss": 0.7826, "step": 92 }, { "epoch": 0.37, "grad_norm": 0.2695147958164756, "learning_rate": 9.308597683653976e-06, "loss": 0.7675, "step": 93 }, { "epoch": 0.37, "grad_norm": 0.2947547264550856, "learning_rate": 9.292243968009332e-06, "loss": 0.7611, "step": 94 }, { "epoch": 0.38, "grad_norm": 0.2698951119585888, "learning_rate": 9.275713815026732e-06, "loss": 0.7437, "step": 95 }, { "epoch": 0.38, "grad_norm": 0.2922871464880811, "learning_rate": 9.259007904196023e-06, "loss": 0.7716, "step": 96 }, { "epoch": 0.39, "grad_norm": 13.342043215041077, "learning_rate": 9.242126922231763e-06, "loss": 1.0262, "step": 97 }, { "epoch": 0.39, "grad_norm": 3.3348436860369577, "learning_rate": 9.225071563045007e-06, "loss": 0.9733, "step": 98 }, { "epoch": 0.39, "grad_norm": 0.30515152013617763, "learning_rate": 9.207842527714767e-06, "loss": 0.7491, "step": 99 }, { "epoch": 0.4, "grad_norm": 0.2797372956926758, "learning_rate": 9.190440524459203e-06, "loss": 0.7658, "step": 100 }, { "epoch": 0.4, "grad_norm": 0.295442681103485, "learning_rate": 9.172866268606514e-06, "loss": 0.7359, "step": 101 }, { "epoch": 0.41, "grad_norm": 0.2890934213055238, "learning_rate": 9.15512048256552e-06, "loss": 0.7783, "step": 102 }, { "epoch": 0.41, "grad_norm": 0.2698602196909741, "learning_rate": 9.137203895795983e-06, "loss": 0.7476, "step": 103 }, { "epoch": 0.41, "grad_norm": 0.30586672847809404, "learning_rate": 9.119117244778609e-06, "loss": 0.7494, "step": 104 }, { "epoch": 0.42, "grad_norm": 0.28256137853789653, "learning_rate": 9.10086127298478e-06, "loss": 0.7347, "step": 105 }, { "epoch": 0.42, "grad_norm": 0.2654565204147507, "learning_rate": 9.082436730845993e-06, "loss": 0.7282, "step": 106 }, { "epoch": 0.43, "grad_norm": 0.3110313172517606, "learning_rate": 9.063844375723014e-06, "loss": 0.7442, "step": 107 }, { "epoch": 0.43, "grad_norm": 0.30406144580285027, "learning_rate": 9.045084971874738e-06, "loss": 0.7365, "step": 108 }, { "epoch": 0.43, "grad_norm": 0.2578097986689305, "learning_rate": 9.026159290426782e-06, "loss": 0.7644, "step": 109 }, { "epoch": 0.44, "grad_norm": 0.2875228334986064, "learning_rate": 9.007068109339783e-06, "loss": 0.7359, "step": 110 }, { "epoch": 0.44, "grad_norm": 0.3007292657335934, "learning_rate": 8.987812213377423e-06, "loss": 0.7571, "step": 111 }, { "epoch": 0.45, "grad_norm": 0.2705781647990633, "learning_rate": 8.968392394074164e-06, "loss": 0.7321, "step": 112 }, { "epoch": 0.45, "grad_norm": 7.709717121399015, "learning_rate": 8.948809449702712e-06, "loss": 1.0663, "step": 113 }, { "epoch": 0.45, "grad_norm": 0.3159530858994423, "learning_rate": 8.929064185241214e-06, "loss": 0.7594, "step": 114 }, { "epoch": 0.46, "grad_norm": 0.3001925080979955, "learning_rate": 8.90915741234015e-06, "loss": 0.7486, "step": 115 }, { "epoch": 0.46, "grad_norm": 2.7719217922453914, "learning_rate": 8.889089949288986e-06, "loss": 1.0014, "step": 116 }, { "epoch": 0.47, "grad_norm": 0.2980097580186808, "learning_rate": 8.868862620982534e-06, "loss": 0.7302, "step": 117 }, { "epoch": 0.47, "grad_norm": 0.2674175783919389, "learning_rate": 8.84847625888703e-06, "loss": 0.7515, "step": 118 }, { "epoch": 0.47, "grad_norm": 0.33089914745986465, "learning_rate": 8.827931701005974e-06, "loss": 0.7452, "step": 119 }, { "epoch": 0.48, "grad_norm": 0.2657873873108844, "learning_rate": 8.807229791845673e-06, "loss": 0.7565, "step": 120 }, { "epoch": 0.48, "grad_norm": 0.26891569095038903, "learning_rate": 8.786371382380527e-06, "loss": 0.7525, "step": 121 }, { "epoch": 0.49, "grad_norm": 0.30828049821760906, "learning_rate": 8.765357330018056e-06, "loss": 0.7395, "step": 122 }, { "epoch": 0.49, "grad_norm": 0.2703956947723179, "learning_rate": 8.74418849856364e-06, "loss": 0.7762, "step": 123 }, { "epoch": 0.49, "grad_norm": 0.27665743831770573, "learning_rate": 8.722865758185036e-06, "loss": 0.7499, "step": 124 }, { "epoch": 0.5, "grad_norm": 0.31328445024397394, "learning_rate": 8.701389985376578e-06, "loss": 0.7368, "step": 125 }, { "epoch": 0.5, "eval_loss": 0.7192811369895935, "eval_runtime": 97.0775, "eval_samples_per_second": 18.212, "eval_steps_per_second": 0.381, "step": 125 }, { "epoch": 0.5, "grad_norm": 0.26565830658390915, "learning_rate": 8.679762062923176e-06, "loss": 0.7727, "step": 126 }, { "epoch": 0.51, "grad_norm": 0.2768705145101062, "learning_rate": 8.657982879864007e-06, "loss": 0.7178, "step": 127 }, { "epoch": 0.51, "grad_norm": 0.27658899618203814, "learning_rate": 8.636053331455986e-06, "loss": 0.7521, "step": 128 }, { "epoch": 0.51, "grad_norm": 0.2687326456666238, "learning_rate": 8.613974319136959e-06, "loss": 0.7411, "step": 129 }, { "epoch": 0.52, "grad_norm": 0.2618083386724651, "learning_rate": 8.591746750488639e-06, "loss": 0.7306, "step": 130 }, { "epoch": 0.52, "grad_norm": 0.25666246646393165, "learning_rate": 8.569371539199316e-06, "loss": 0.7505, "step": 131 }, { "epoch": 0.53, "grad_norm": 0.3203048983481449, "learning_rate": 8.54684960502629e-06, "loss": 0.7515, "step": 132 }, { "epoch": 0.53, "grad_norm": 0.2521993776332652, "learning_rate": 8.52418187375806e-06, "loss": 0.7505, "step": 133 }, { "epoch": 0.53, "grad_norm": 0.26591933789428035, "learning_rate": 8.501369277176275e-06, "loss": 0.7353, "step": 134 }, { "epoch": 0.54, "grad_norm": 0.27603393300812845, "learning_rate": 8.478412753017433e-06, "loss": 0.7609, "step": 135 }, { "epoch": 0.54, "grad_norm": 0.2668194745887302, "learning_rate": 8.455313244934324e-06, "loss": 0.7141, "step": 136 }, { "epoch": 0.55, "grad_norm": 0.2707242484249978, "learning_rate": 8.432071702457253e-06, "loss": 0.7223, "step": 137 }, { "epoch": 0.55, "grad_norm": 0.25511126409448154, "learning_rate": 8.408689080954997e-06, "loss": 0.7153, "step": 138 }, { "epoch": 0.55, "grad_norm": 15.420395873510664, "learning_rate": 8.38516634159555e-06, "loss": 1.0042, "step": 139 }, { "epoch": 0.56, "grad_norm": 0.3012234081880187, "learning_rate": 8.361504451306585e-06, "loss": 0.7713, "step": 140 }, { "epoch": 0.56, "grad_norm": 0.2598491249346932, "learning_rate": 8.337704382735741e-06, "loss": 0.7288, "step": 141 }, { "epoch": 0.57, "grad_norm": 0.26256298238822373, "learning_rate": 8.313767114210615e-06, "loss": 0.7379, "step": 142 }, { "epoch": 0.57, "grad_norm": 0.2945113973208366, "learning_rate": 8.289693629698564e-06, "loss": 0.7401, "step": 143 }, { "epoch": 0.57, "grad_norm": 0.24981458420819586, "learning_rate": 8.265484918766243e-06, "loss": 0.7512, "step": 144 }, { "epoch": 0.58, "grad_norm": 0.2678413297548206, "learning_rate": 8.241141976538944e-06, "loss": 0.7449, "step": 145 }, { "epoch": 0.58, "grad_norm": 0.2623417103083203, "learning_rate": 8.216665803659671e-06, "loss": 0.7647, "step": 146 }, { "epoch": 0.59, "grad_norm": 0.263785777979794, "learning_rate": 8.192057406248028e-06, "loss": 0.7725, "step": 147 }, { "epoch": 0.59, "grad_norm": 0.2519540317965661, "learning_rate": 8.16731779585885e-06, "loss": 0.715, "step": 148 }, { "epoch": 0.59, "grad_norm": 0.27015785362121375, "learning_rate": 8.142447989440618e-06, "loss": 0.7532, "step": 149 }, { "epoch": 0.6, "grad_norm": 0.25863328277564784, "learning_rate": 8.117449009293668e-06, "loss": 0.7335, "step": 150 }, { "epoch": 0.6, "grad_norm": 0.2550714525590909, "learning_rate": 8.092321883028157e-06, "loss": 0.7182, "step": 151 }, { "epoch": 0.61, "grad_norm": 0.2752483825047847, "learning_rate": 8.067067643521834e-06, "loss": 0.772, "step": 152 }, { "epoch": 0.61, "grad_norm": 0.2582002365542859, "learning_rate": 8.041687328877566e-06, "loss": 0.7284, "step": 153 }, { "epoch": 0.61, "grad_norm": 0.24823845365447103, "learning_rate": 8.016181982380682e-06, "loss": 0.7467, "step": 154 }, { "epoch": 0.62, "grad_norm": 0.25644169647568194, "learning_rate": 7.99055265245608e-06, "loss": 0.7191, "step": 155 }, { "epoch": 0.62, "grad_norm": 0.27463144458405375, "learning_rate": 7.96480039262513e-06, "loss": 0.7375, "step": 156 }, { "epoch": 0.63, "grad_norm": 0.2642553744129046, "learning_rate": 7.938926261462366e-06, "loss": 0.7587, "step": 157 }, { "epoch": 0.63, "grad_norm": 0.25679436337027056, "learning_rate": 7.912931322551981e-06, "loss": 0.7312, "step": 158 }, { "epoch": 0.63, "grad_norm": 0.2784563408983632, "learning_rate": 7.886816644444099e-06, "loss": 0.7213, "step": 159 }, { "epoch": 0.64, "grad_norm": 0.2638439438037005, "learning_rate": 7.860583300610849e-06, "loss": 0.7286, "step": 160 }, { "epoch": 0.64, "grad_norm": 0.2599487424469649, "learning_rate": 7.83423236940225e-06, "loss": 0.7211, "step": 161 }, { "epoch": 0.65, "grad_norm": 0.2620169279227201, "learning_rate": 7.807764934001875e-06, "loss": 0.7243, "step": 162 }, { "epoch": 0.65, "grad_norm": 0.3455142385888707, "learning_rate": 7.781182082382325e-06, "loss": 0.7709, "step": 163 }, { "epoch": 0.65, "grad_norm": 0.25797958121628417, "learning_rate": 7.754484907260513e-06, "loss": 0.7371, "step": 164 }, { "epoch": 0.66, "grad_norm": 0.26245124486082283, "learning_rate": 7.727674506052744e-06, "loss": 0.7625, "step": 165 }, { "epoch": 0.66, "grad_norm": 0.259525851556208, "learning_rate": 7.700751980829601e-06, "loss": 0.7785, "step": 166 }, { "epoch": 0.67, "grad_norm": 0.25578616887441574, "learning_rate": 7.673718438270649e-06, "loss": 0.7349, "step": 167 }, { "epoch": 0.67, "grad_norm": 0.26880908011952676, "learning_rate": 7.646574989618938e-06, "loss": 0.7423, "step": 168 }, { "epoch": 0.67, "grad_norm": 0.28268284846763475, "learning_rate": 7.619322750635327e-06, "loss": 0.8089, "step": 169 }, { "epoch": 0.68, "grad_norm": 0.2565025440158926, "learning_rate": 7.591962841552627e-06, "loss": 0.708, "step": 170 }, { "epoch": 0.68, "grad_norm": 0.2589330645555015, "learning_rate": 7.564496387029532e-06, "loss": 0.7276, "step": 171 }, { "epoch": 0.69, "grad_norm": 4.072287518324262, "learning_rate": 7.536924516104411e-06, "loss": 0.963, "step": 172 }, { "epoch": 0.69, "grad_norm": 5.236195816930884, "learning_rate": 7.509248362148889e-06, "loss": 0.9786, "step": 173 }, { "epoch": 0.69, "grad_norm": 0.3208811366448085, "learning_rate": 7.481469062821252e-06, "loss": 0.7417, "step": 174 }, { "epoch": 0.7, "grad_norm": 0.280850125752099, "learning_rate": 7.453587760019691e-06, "loss": 0.7249, "step": 175 }, { "epoch": 0.7, "grad_norm": 0.26692675004354643, "learning_rate": 7.42560559983536e-06, "loss": 0.727, "step": 176 }, { "epoch": 0.71, "grad_norm": 0.27369623001787907, "learning_rate": 7.39752373250527e-06, "loss": 0.7617, "step": 177 }, { "epoch": 0.71, "grad_norm": 0.2784142807735342, "learning_rate": 7.369343312364994e-06, "loss": 0.7466, "step": 178 }, { "epoch": 0.71, "grad_norm": 37.27250895412763, "learning_rate": 7.34106549780123e-06, "loss": 1.0667, "step": 179 }, { "epoch": 0.72, "grad_norm": 0.2760061039631398, "learning_rate": 7.312691451204178e-06, "loss": 0.7244, "step": 180 }, { "epoch": 0.72, "grad_norm": 0.25459829854169064, "learning_rate": 7.284222338919758e-06, "loss": 0.7364, "step": 181 }, { "epoch": 0.73, "grad_norm": 0.26179305555253735, "learning_rate": 7.255659331201673e-06, "loss": 0.733, "step": 182 }, { "epoch": 0.73, "grad_norm": 0.2604418741829541, "learning_rate": 7.227003602163296e-06, "loss": 0.7209, "step": 183 }, { "epoch": 0.73, "grad_norm": 0.26185681215109163, "learning_rate": 7.198256329729412e-06, "loss": 0.7164, "step": 184 }, { "epoch": 0.74, "grad_norm": 16.152387103951856, "learning_rate": 7.169418695587791e-06, "loss": 1.0372, "step": 185 }, { "epoch": 0.74, "grad_norm": 21.228735850953576, "learning_rate": 7.140491885140629e-06, "loss": 1.0402, "step": 186 }, { "epoch": 0.75, "grad_norm": 0.28404810159037286, "learning_rate": 7.1114770874558e-06, "loss": 0.7006, "step": 187 }, { "epoch": 0.75, "grad_norm": 0.26609034714435736, "learning_rate": 7.082375495217996e-06, "loss": 0.7537, "step": 188 }, { "epoch": 0.75, "grad_norm": 0.25765084929276133, "learning_rate": 7.053188304679691e-06, "loss": 0.7302, "step": 189 }, { "epoch": 0.76, "grad_norm": 0.26384158886834463, "learning_rate": 7.023916715611969e-06, "loss": 0.712, "step": 190 }, { "epoch": 0.76, "grad_norm": 0.27151931787506317, "learning_rate": 6.994561931255209e-06, "loss": 0.7502, "step": 191 }, { "epoch": 0.77, "grad_norm": 0.27031492068457535, "learning_rate": 6.965125158269619e-06, "loss": 0.7179, "step": 192 }, { "epoch": 0.77, "grad_norm": 0.26995073084719196, "learning_rate": 6.935607606685642e-06, "loss": 0.7624, "step": 193 }, { "epoch": 0.77, "grad_norm": 0.25666755324587454, "learning_rate": 6.906010489854209e-06, "loss": 0.7426, "step": 194 }, { "epoch": 0.78, "grad_norm": 0.2764461509009301, "learning_rate": 6.876335024396872e-06, "loss": 0.723, "step": 195 }, { "epoch": 0.78, "grad_norm": 0.2597906002555833, "learning_rate": 6.846582430155783e-06, "loss": 0.7407, "step": 196 }, { "epoch": 0.79, "grad_norm": 0.26409742487438864, "learning_rate": 6.816753930143558e-06, "loss": 0.7206, "step": 197 }, { "epoch": 0.79, "grad_norm": 0.25320169233675405, "learning_rate": 6.786850750493006e-06, "loss": 0.7437, "step": 198 }, { "epoch": 0.79, "grad_norm": 0.2708696048462205, "learning_rate": 6.7568741204067145e-06, "loss": 0.7422, "step": 199 }, { "epoch": 0.8, "grad_norm": 0.26542323915181154, "learning_rate": 6.726825272106539e-06, "loss": 0.7514, "step": 200 }, { "epoch": 0.8, "grad_norm": 0.26307166597433396, "learning_rate": 6.696705440782939e-06, "loss": 0.7509, "step": 201 }, { "epoch": 0.81, "grad_norm": 0.26671446872754456, "learning_rate": 6.66651586454421e-06, "loss": 0.7465, "step": 202 }, { "epoch": 0.81, "grad_norm": 0.2720083369272757, "learning_rate": 6.636257784365585e-06, "loss": 0.7349, "step": 203 }, { "epoch": 0.81, "grad_norm": 0.2652218770116059, "learning_rate": 6.605932444038229e-06, "loss": 0.7348, "step": 204 }, { "epoch": 0.82, "grad_norm": 0.26402314109149694, "learning_rate": 6.575541090118105e-06, "loss": 0.7495, "step": 205 }, { "epoch": 0.82, "grad_norm": 0.2639803511821082, "learning_rate": 6.545084971874738e-06, "loss": 0.7138, "step": 206 }, { "epoch": 0.83, "grad_norm": 0.2673567043268493, "learning_rate": 6.514565341239861e-06, "loss": 0.7341, "step": 207 }, { "epoch": 0.83, "grad_norm": 74.0236556664021, "learning_rate": 6.483983452755953e-06, "loss": 1.084, "step": 208 }, { "epoch": 0.83, "grad_norm": 0.2694930198888902, "learning_rate": 6.4533405635246696e-06, "loss": 0.7422, "step": 209 }, { "epoch": 0.84, "grad_norm": 0.2808089325365656, "learning_rate": 6.4226379331551625e-06, "loss": 0.7543, "step": 210 }, { "epoch": 0.84, "grad_norm": 0.24629087243802783, "learning_rate": 6.3918768237123175e-06, "loss": 0.7088, "step": 211 }, { "epoch": 0.85, "grad_norm": 0.2605148206784209, "learning_rate": 6.361058499664856e-06, "loss": 0.7434, "step": 212 }, { "epoch": 0.85, "grad_norm": 0.26199687024127344, "learning_rate": 6.330184227833376e-06, "loss": 0.7369, "step": 213 }, { "epoch": 0.85, "grad_norm": 0.2693080928270376, "learning_rate": 6.299255277338265e-06, "loss": 0.7337, "step": 214 }, { "epoch": 0.86, "grad_norm": 0.2573571779304293, "learning_rate": 6.268272919547537e-06, "loss": 0.7366, "step": 215 }, { "epoch": 0.86, "grad_norm": 0.25347655388671, "learning_rate": 6.237238428024573e-06, "loss": 0.7392, "step": 216 }, { "epoch": 0.87, "grad_norm": 0.254807709796356, "learning_rate": 6.2061530784757625e-06, "loss": 0.7709, "step": 217 }, { "epoch": 0.87, "grad_norm": 0.25435065962054804, "learning_rate": 6.175018148698077e-06, "loss": 0.7472, "step": 218 }, { "epoch": 0.87, "grad_norm": 0.25856868944475736, "learning_rate": 6.143834918526528e-06, "loss": 0.7442, "step": 219 }, { "epoch": 0.88, "grad_norm": 0.24960062893507637, "learning_rate": 6.112604669781572e-06, "loss": 0.7163, "step": 220 }, { "epoch": 0.88, "grad_norm": 0.2544024553733407, "learning_rate": 6.0813286862164175e-06, "loss": 0.7236, "step": 221 }, { "epoch": 0.89, "grad_norm": 0.2532920039697931, "learning_rate": 6.050008253464247e-06, "loss": 0.7427, "step": 222 }, { "epoch": 0.89, "grad_norm": 0.25372808971698796, "learning_rate": 6.018644658985378e-06, "loss": 0.7286, "step": 223 }, { "epoch": 0.89, "grad_norm": 0.2570514856547558, "learning_rate": 5.987239192014336e-06, "loss": 0.7349, "step": 224 }, { "epoch": 0.9, "grad_norm": 0.2578576277551542, "learning_rate": 5.955793143506863e-06, "loss": 0.7266, "step": 225 }, { "epoch": 0.9, "grad_norm": 0.26312215832145636, "learning_rate": 5.9243078060868445e-06, "loss": 0.7389, "step": 226 }, { "epoch": 0.91, "grad_norm": 0.26518617358808877, "learning_rate": 5.892784473993184e-06, "loss": 0.7108, "step": 227 }, { "epoch": 0.91, "grad_norm": 0.25620517627113376, "learning_rate": 5.861224443026595e-06, "loss": 0.7232, "step": 228 }, { "epoch": 0.91, "grad_norm": 28.36402580586963, "learning_rate": 5.82962901049634e-06, "loss": 0.9734, "step": 229 }, { "epoch": 0.92, "grad_norm": 0.2807037939514787, "learning_rate": 5.797999475166897e-06, "loss": 0.7341, "step": 230 }, { "epoch": 0.92, "grad_norm": 8.208344028346868, "learning_rate": 5.766337137204579e-06, "loss": 0.938, "step": 231 }, { "epoch": 0.93, "grad_norm": 0.26445130385050997, "learning_rate": 5.734643298124091e-06, "loss": 0.7316, "step": 232 }, { "epoch": 0.93, "grad_norm": 0.251567451954335, "learning_rate": 5.702919260735015e-06, "loss": 0.6966, "step": 233 }, { "epoch": 0.93, "grad_norm": 0.26329080787916564, "learning_rate": 5.671166329088278e-06, "loss": 0.7319, "step": 234 }, { "epoch": 0.94, "grad_norm": 0.2566777339661679, "learning_rate": 5.6393858084225305e-06, "loss": 0.7529, "step": 235 }, { "epoch": 0.94, "grad_norm": 0.2710815554700812, "learning_rate": 5.6075790051105025e-06, "loss": 0.7515, "step": 236 }, { "epoch": 0.95, "grad_norm": 0.27096961550302734, "learning_rate": 5.575747226605298e-06, "loss": 0.7073, "step": 237 }, { "epoch": 0.95, "grad_norm": 0.2509131795738037, "learning_rate": 5.543891781386655e-06, "loss": 0.7513, "step": 238 }, { "epoch": 0.95, "grad_norm": 0.26210506205941153, "learning_rate": 5.512013978907157e-06, "loss": 0.7569, "step": 239 }, { "epoch": 0.96, "grad_norm": 0.25123130642497177, "learning_rate": 5.480115129538409e-06, "loss": 0.7239, "step": 240 }, { "epoch": 0.96, "grad_norm": 0.2596821607229612, "learning_rate": 5.448196544517168e-06, "loss": 0.7256, "step": 241 }, { "epoch": 0.97, "grad_norm": 0.2714818563550966, "learning_rate": 5.4162595358914475e-06, "loss": 0.7329, "step": 242 }, { "epoch": 0.97, "grad_norm": 0.260503064108439, "learning_rate": 5.384305416466584e-06, "loss": 0.7112, "step": 243 }, { "epoch": 0.97, "grad_norm": 0.2661267608396215, "learning_rate": 5.35233549975127e-06, "loss": 0.7534, "step": 244 }, { "epoch": 0.98, "grad_norm": 0.27502743208671454, "learning_rate": 5.320351099903565e-06, "loss": 0.7355, "step": 245 }, { "epoch": 0.98, "grad_norm": 0.2598641343680277, "learning_rate": 5.288353531676873e-06, "loss": 0.7476, "step": 246 }, { "epoch": 0.99, "grad_norm": 0.2629788348419056, "learning_rate": 5.256344110365896e-06, "loss": 0.7523, "step": 247 }, { "epoch": 0.99, "grad_norm": 0.256123432185156, "learning_rate": 5.224324151752575e-06, "loss": 0.7479, "step": 248 }, { "epoch": 0.99, "grad_norm": 0.2592695095071067, "learning_rate": 5.192294972051992e-06, "loss": 0.7586, "step": 249 }, { "epoch": 1.0, "grad_norm": 0.26264999139615697, "learning_rate": 5.160257887858278e-06, "loss": 0.7406, "step": 250 }, { "epoch": 1.0, "eval_loss": 0.7036678791046143, "eval_runtime": 96.3087, "eval_samples_per_second": 18.358, "eval_steps_per_second": 0.384, "step": 250 }, { "epoch": 1.0, "grad_norm": 0.2614921961545108, "learning_rate": 5.128214216090478e-06, "loss": 0.7488, "step": 251 }, { "epoch": 1.0, "grad_norm": 0.2605743808221771, "learning_rate": 5.0961652739384356e-06, "loss": 0.7338, "step": 252 }, { "epoch": 1.01, "grad_norm": 2.904417203670962, "learning_rate": 5.064112378808636e-06, "loss": 0.9738, "step": 253 }, { "epoch": 1.01, "grad_norm": 0.2581985759494367, "learning_rate": 5.032056848270056e-06, "loss": 0.7693, "step": 254 }, { "epoch": 1.02, "grad_norm": 0.25102446332314765, "learning_rate": 5e-06, "loss": 0.7213, "step": 255 }, { "epoch": 1.0, "grad_norm": 1.4216598983588058, "learning_rate": 4.967943151729945e-06, "loss": 0.9193, "step": 256 }, { "epoch": 1.0, "grad_norm": 0.32982276331099014, "learning_rate": 4.935887621191364e-06, "loss": 0.6842, "step": 257 }, { "epoch": 1.01, "grad_norm": 0.29043411467478625, "learning_rate": 4.903834726061565e-06, "loss": 0.7087, "step": 258 }, { "epoch": 1.01, "grad_norm": 0.25986254756592664, "learning_rate": 4.871785783909523e-06, "loss": 0.6741, "step": 259 }, { "epoch": 1.02, "grad_norm": 0.30049816553828484, "learning_rate": 4.839742112141725e-06, "loss": 0.7063, "step": 260 }, { "epoch": 1.02, "grad_norm": 0.2895999616622155, "learning_rate": 4.807705027948008e-06, "loss": 0.7146, "step": 261 }, { "epoch": 1.02, "grad_norm": 0.30041272052643164, "learning_rate": 4.775675848247427e-06, "loss": 0.7134, "step": 262 }, { "epoch": 1.03, "grad_norm": 0.27518299819790887, "learning_rate": 4.743655889634105e-06, "loss": 0.692, "step": 263 }, { "epoch": 1.03, "grad_norm": 0.26955160521446175, "learning_rate": 4.711646468323129e-06, "loss": 0.658, "step": 264 }, { "epoch": 1.04, "grad_norm": 0.27535739664976405, "learning_rate": 4.679648900096436e-06, "loss": 0.6908, "step": 265 }, { "epoch": 1.04, "grad_norm": 0.26763089124769246, "learning_rate": 4.64766450024873e-06, "loss": 0.6861, "step": 266 }, { "epoch": 1.04, "grad_norm": 74.38254133611925, "learning_rate": 4.615694583533418e-06, "loss": 0.9994, "step": 267 }, { "epoch": 1.05, "grad_norm": 0.2986551656205794, "learning_rate": 4.583740464108554e-06, "loss": 0.7075, "step": 268 }, { "epoch": 1.05, "grad_norm": 0.27619714658975547, "learning_rate": 4.551803455482833e-06, "loss": 0.6596, "step": 269 }, { "epoch": 1.06, "grad_norm": 0.25242413092583954, "learning_rate": 4.5198848704615915e-06, "loss": 0.6628, "step": 270 }, { "epoch": 1.06, "grad_norm": 0.26017582720690735, "learning_rate": 4.487986021092844e-06, "loss": 0.6916, "step": 271 }, { "epoch": 1.06, "grad_norm": 0.2719334401383232, "learning_rate": 4.456108218613346e-06, "loss": 0.6935, "step": 272 }, { "epoch": 1.07, "grad_norm": 0.2874168693732095, "learning_rate": 4.424252773394704e-06, "loss": 0.7013, "step": 273 }, { "epoch": 1.07, "grad_norm": 0.27088215577908004, "learning_rate": 4.392420994889498e-06, "loss": 0.693, "step": 274 }, { "epoch": 1.08, "grad_norm": 0.2532812233498042, "learning_rate": 4.3606141915774695e-06, "loss": 0.6762, "step": 275 }, { "epoch": 1.08, "grad_norm": 0.263089719520046, "learning_rate": 4.3288336709117246e-06, "loss": 0.6677, "step": 276 }, { "epoch": 1.08, "grad_norm": 0.2544227492529696, "learning_rate": 4.297080739264987e-06, "loss": 0.6744, "step": 277 }, { "epoch": 1.09, "grad_norm": 0.2645751047762965, "learning_rate": 4.265356701875911e-06, "loss": 0.7047, "step": 278 }, { "epoch": 1.09, "grad_norm": 0.2602397068309733, "learning_rate": 4.23366286279542e-06, "loss": 0.6792, "step": 279 }, { "epoch": 1.1, "grad_norm": 0.27012218470061766, "learning_rate": 4.2020005248331056e-06, "loss": 0.6914, "step": 280 }, { "epoch": 1.1, "grad_norm": 0.2645729945558582, "learning_rate": 4.170370989503662e-06, "loss": 0.6812, "step": 281 }, { "epoch": 1.1, "grad_norm": 0.26158176244234604, "learning_rate": 4.138775556973406e-06, "loss": 0.6545, "step": 282 }, { "epoch": 1.11, "grad_norm": 0.2609966416888788, "learning_rate": 4.107215526006818e-06, "loss": 0.6534, "step": 283 }, { "epoch": 1.11, "grad_norm": 0.2633177456953443, "learning_rate": 4.075692193913156e-06, "loss": 0.6617, "step": 284 }, { "epoch": 1.12, "grad_norm": 0.2711366514748812, "learning_rate": 4.04420685649314e-06, "loss": 0.7026, "step": 285 }, { "epoch": 1.12, "grad_norm": 0.26016920693531187, "learning_rate": 4.012760807985665e-06, "loss": 0.685, "step": 286 }, { "epoch": 1.12, "grad_norm": 0.2634077734164485, "learning_rate": 3.9813553410146225e-06, "loss": 0.6732, "step": 287 }, { "epoch": 1.13, "grad_norm": 0.2630739058989318, "learning_rate": 3.949991746535753e-06, "loss": 0.6898, "step": 288 }, { "epoch": 1.13, "grad_norm": 0.26810735877032293, "learning_rate": 3.918671313783583e-06, "loss": 0.6739, "step": 289 }, { "epoch": 1.14, "grad_norm": 0.2667138269733132, "learning_rate": 3.887395330218429e-06, "loss": 0.6649, "step": 290 }, { "epoch": 1.14, "grad_norm": 0.2563222658817468, "learning_rate": 3.856165081473474e-06, "loss": 0.708, "step": 291 }, { "epoch": 1.14, "grad_norm": 0.26451201369218524, "learning_rate": 3.824981851301924e-06, "loss": 0.6715, "step": 292 }, { "epoch": 1.15, "grad_norm": 0.2598494927634439, "learning_rate": 3.7938469215242374e-06, "loss": 0.6955, "step": 293 }, { "epoch": 1.15, "grad_norm": 0.25396403307478727, "learning_rate": 3.7627615719754294e-06, "loss": 0.6676, "step": 294 }, { "epoch": 1.16, "grad_norm": 0.2532765659210287, "learning_rate": 3.731727080452464e-06, "loss": 0.6748, "step": 295 }, { "epoch": 1.16, "grad_norm": 0.26061271523616963, "learning_rate": 3.7007447226617367e-06, "loss": 0.7058, "step": 296 }, { "epoch": 1.16, "grad_norm": 0.25801226716086006, "learning_rate": 3.669815772166625e-06, "loss": 0.6717, "step": 297 }, { "epoch": 1.17, "grad_norm": 0.2678578983084559, "learning_rate": 3.638941500335145e-06, "loss": 0.6785, "step": 298 }, { "epoch": 1.17, "grad_norm": 0.2629273311111566, "learning_rate": 3.608123176287685e-06, "loss": 0.6846, "step": 299 }, { "epoch": 1.18, "grad_norm": 0.26372287416738016, "learning_rate": 3.5773620668448384e-06, "loss": 0.7155, "step": 300 }, { "epoch": 1.18, "grad_norm": 0.2705437923133382, "learning_rate": 3.5466594364753325e-06, "loss": 0.6723, "step": 301 }, { "epoch": 1.18, "grad_norm": 0.2839291053250124, "learning_rate": 3.516016547244047e-06, "loss": 0.7035, "step": 302 }, { "epoch": 1.19, "grad_norm": 0.2760313377640414, "learning_rate": 3.48543465876014e-06, "loss": 0.6751, "step": 303 }, { "epoch": 1.19, "grad_norm": 0.25394626546919435, "learning_rate": 3.4549150281252635e-06, "loss": 0.6795, "step": 304 }, { "epoch": 1.2, "grad_norm": 0.2591221216055477, "learning_rate": 3.424458909881897e-06, "loss": 0.6766, "step": 305 }, { "epoch": 1.2, "grad_norm": 0.2691782924782941, "learning_rate": 3.3940675559617724e-06, "loss": 0.6895, "step": 306 }, { "epoch": 1.2, "grad_norm": 0.26815145183562566, "learning_rate": 3.363742215634416e-06, "loss": 0.6671, "step": 307 }, { "epoch": 1.21, "grad_norm": 0.26601253229287064, "learning_rate": 3.3334841354557923e-06, "loss": 0.6902, "step": 308 }, { "epoch": 1.21, "grad_norm": 0.2759140499002526, "learning_rate": 3.303294559217063e-06, "loss": 0.7011, "step": 309 }, { "epoch": 1.22, "grad_norm": 0.2532152571509874, "learning_rate": 3.273174727893463e-06, "loss": 0.6631, "step": 310 }, { "epoch": 1.22, "grad_norm": 0.2587732106097895, "learning_rate": 3.2431258795932863e-06, "loss": 0.6964, "step": 311 }, { "epoch": 1.22, "grad_norm": 0.26154429819114283, "learning_rate": 3.213149249506997e-06, "loss": 0.7018, "step": 312 }, { "epoch": 1.23, "grad_norm": 0.2640699829932556, "learning_rate": 3.183246069856443e-06, "loss": 0.6809, "step": 313 }, { "epoch": 1.23, "grad_norm": 0.26350081604751235, "learning_rate": 3.1534175698442194e-06, "loss": 0.655, "step": 314 }, { "epoch": 1.24, "grad_norm": 0.2620810766791341, "learning_rate": 3.12366497560313e-06, "loss": 0.7034, "step": 315 }, { "epoch": 1.24, "grad_norm": 0.26759545817238656, "learning_rate": 3.093989510145792e-06, "loss": 0.7238, "step": 316 }, { "epoch": 1.24, "grad_norm": 0.26779990918516644, "learning_rate": 3.0643923933143603e-06, "loss": 0.6733, "step": 317 }, { "epoch": 1.25, "grad_norm": 0.266460909749917, "learning_rate": 3.0348748417303826e-06, "loss": 0.6708, "step": 318 }, { "epoch": 1.25, "grad_norm": 0.26133722668937404, "learning_rate": 3.005438068744792e-06, "loss": 0.6838, "step": 319 }, { "epoch": 1.26, "grad_norm": 0.26893527319559857, "learning_rate": 2.976083284388031e-06, "loss": 0.662, "step": 320 }, { "epoch": 1.26, "grad_norm": 0.2699706439018165, "learning_rate": 2.9468116953203107e-06, "loss": 0.6867, "step": 321 }, { "epoch": 1.26, "grad_norm": 0.2660357193246072, "learning_rate": 2.9176245047820064e-06, "loss": 0.6802, "step": 322 }, { "epoch": 1.27, "grad_norm": 0.2676372441332764, "learning_rate": 2.8885229125442022e-06, "loss": 0.7143, "step": 323 }, { "epoch": 1.27, "grad_norm": 0.26172654998349437, "learning_rate": 2.859508114859374e-06, "loss": 0.6688, "step": 324 }, { "epoch": 1.28, "grad_norm": 0.26622148926216194, "learning_rate": 2.83058130441221e-06, "loss": 0.671, "step": 325 }, { "epoch": 1.28, "grad_norm": 0.28436198488003145, "learning_rate": 2.80174367027059e-06, "loss": 0.7157, "step": 326 }, { "epoch": 1.28, "grad_norm": 0.2637348527869296, "learning_rate": 2.772996397836704e-06, "loss": 0.6893, "step": 327 }, { "epoch": 1.29, "grad_norm": 0.27094660677035604, "learning_rate": 2.7443406687983267e-06, "loss": 0.7149, "step": 328 }, { "epoch": 1.29, "grad_norm": 0.26943858696681655, "learning_rate": 2.7157776610802416e-06, "loss": 0.6756, "step": 329 }, { "epoch": 1.3, "grad_norm": 0.2637431948145577, "learning_rate": 2.687308548795825e-06, "loss": 0.6731, "step": 330 }, { "epoch": 1.3, "grad_norm": 0.2605764257900338, "learning_rate": 2.6589345021987725e-06, "loss": 0.6601, "step": 331 }, { "epoch": 1.3, "grad_norm": 0.2691370320929689, "learning_rate": 2.6306566876350072e-06, "loss": 0.6747, "step": 332 }, { "epoch": 1.31, "grad_norm": 0.26505353096492007, "learning_rate": 2.6024762674947313e-06, "loss": 0.6355, "step": 333 }, { "epoch": 1.31, "grad_norm": 0.2771254154185314, "learning_rate": 2.5743944001646394e-06, "loss": 0.6679, "step": 334 }, { "epoch": 1.32, "grad_norm": 0.2734747453519109, "learning_rate": 2.5464122399803126e-06, "loss": 0.6842, "step": 335 }, { "epoch": 1.32, "grad_norm": 0.26644275829657593, "learning_rate": 2.5185309371787515e-06, "loss": 0.6986, "step": 336 }, { "epoch": 1.32, "grad_norm": 2.921169787601039, "learning_rate": 2.4907516378511137e-06, "loss": 0.9339, "step": 337 }, { "epoch": 1.33, "grad_norm": 0.25960894123414624, "learning_rate": 2.46307548389559e-06, "loss": 0.6743, "step": 338 }, { "epoch": 1.33, "grad_norm": 0.2637156946948773, "learning_rate": 2.43550361297047e-06, "loss": 0.682, "step": 339 }, { "epoch": 1.34, "grad_norm": 0.26481304373722364, "learning_rate": 2.408037158447375e-06, "loss": 0.6838, "step": 340 }, { "epoch": 1.34, "grad_norm": 0.3098032445823631, "learning_rate": 2.3806772493646725e-06, "loss": 0.6569, "step": 341 }, { "epoch": 1.34, "grad_norm": 0.26219269959571206, "learning_rate": 2.353425010381063e-06, "loss": 0.6761, "step": 342 }, { "epoch": 1.35, "grad_norm": 0.2595963563694489, "learning_rate": 2.3262815617293517e-06, "loss": 0.6705, "step": 343 }, { "epoch": 1.35, "grad_norm": 6.834107729255299, "learning_rate": 2.2992480191704003e-06, "loss": 0.9304, "step": 344 }, { "epoch": 1.36, "grad_norm": 0.27845487671808766, "learning_rate": 2.272325493947257e-06, "loss": 0.7032, "step": 345 }, { "epoch": 1.36, "grad_norm": 0.27640918477115356, "learning_rate": 2.245515092739488e-06, "loss": 0.6782, "step": 346 }, { "epoch": 1.36, "grad_norm": 0.2807587758407648, "learning_rate": 2.2188179176176767e-06, "loss": 0.6932, "step": 347 }, { "epoch": 1.37, "grad_norm": 0.27767259525555504, "learning_rate": 2.1922350659981262e-06, "loss": 0.6466, "step": 348 }, { "epoch": 1.37, "grad_norm": 0.267658673895331, "learning_rate": 2.165767630597752e-06, "loss": 0.7089, "step": 349 }, { "epoch": 1.38, "grad_norm": 0.2745228363539692, "learning_rate": 2.139416699389153e-06, "loss": 0.6778, "step": 350 }, { "epoch": 1.38, "grad_norm": 0.26408657536920843, "learning_rate": 2.1131833555559037e-06, "loss": 0.693, "step": 351 }, { "epoch": 1.38, "grad_norm": 0.27582062474841573, "learning_rate": 2.08706867744802e-06, "loss": 0.6896, "step": 352 }, { "epoch": 1.39, "grad_norm": 0.26826511006390147, "learning_rate": 2.061073738537635e-06, "loss": 0.6796, "step": 353 }, { "epoch": 1.39, "grad_norm": 0.26630216372896987, "learning_rate": 2.0351996073748713e-06, "loss": 0.664, "step": 354 }, { "epoch": 1.4, "grad_norm": 0.2809694093215496, "learning_rate": 2.00944734754392e-06, "loss": 0.7044, "step": 355 }, { "epoch": 1.4, "grad_norm": 1.2984772108139377, "learning_rate": 1.983818017619318e-06, "loss": 0.9348, "step": 356 }, { "epoch": 1.4, "grad_norm": 0.2721943586746493, "learning_rate": 1.9583126711224342e-06, "loss": 0.6918, "step": 357 }, { "epoch": 1.41, "grad_norm": 0.27480703238103743, "learning_rate": 1.932932356478168e-06, "loss": 0.6854, "step": 358 }, { "epoch": 1.41, "grad_norm": 0.27867368393846137, "learning_rate": 1.9076781169718426e-06, "loss": 0.6892, "step": 359 }, { "epoch": 1.42, "grad_norm": 0.2747868621778029, "learning_rate": 1.8825509907063328e-06, "loss": 0.6947, "step": 360 }, { "epoch": 1.42, "grad_norm": 0.2819831023326237, "learning_rate": 1.857552010559382e-06, "loss": 0.7059, "step": 361 }, { "epoch": 1.42, "grad_norm": 0.27392909848006375, "learning_rate": 1.8326822041411524e-06, "loss": 0.6909, "step": 362 }, { "epoch": 1.43, "grad_norm": 0.2743865258535757, "learning_rate": 1.8079425937519729e-06, "loss": 0.679, "step": 363 }, { "epoch": 1.43, "grad_norm": 2.5387012983068686, "learning_rate": 1.7833341963403312e-06, "loss": 0.8855, "step": 364 }, { "epoch": 1.44, "grad_norm": 0.265533513891752, "learning_rate": 1.7588580234610592e-06, "loss": 0.6915, "step": 365 }, { "epoch": 1.44, "grad_norm": 0.27359948413194535, "learning_rate": 1.7345150812337564e-06, "loss": 0.6983, "step": 366 }, { "epoch": 1.44, "grad_norm": 0.2742529435490673, "learning_rate": 1.7103063703014372e-06, "loss": 0.6712, "step": 367 }, { "epoch": 1.45, "grad_norm": 0.27199602044830407, "learning_rate": 1.6862328857893856e-06, "loss": 0.6929, "step": 368 }, { "epoch": 1.45, "grad_norm": 0.26752081718353027, "learning_rate": 1.6622956172642601e-06, "loss": 0.6693, "step": 369 }, { "epoch": 1.46, "grad_norm": 0.2729904749450422, "learning_rate": 1.6384955486934157e-06, "loss": 0.6545, "step": 370 }, { "epoch": 1.46, "grad_norm": 0.27686659493721505, "learning_rate": 1.6148336584044539e-06, "loss": 0.6957, "step": 371 }, { "epoch": 1.46, "grad_norm": 0.27203211099316454, "learning_rate": 1.5913109190450033e-06, "loss": 0.6709, "step": 372 }, { "epoch": 1.47, "grad_norm": 0.2748872025880921, "learning_rate": 1.567928297542749e-06, "loss": 0.6648, "step": 373 }, { "epoch": 1.47, "grad_norm": 0.28544562477258, "learning_rate": 1.544686755065677e-06, "loss": 0.6937, "step": 374 }, { "epoch": 1.48, "grad_norm": 0.27110159404128226, "learning_rate": 1.5215872469825682e-06, "loss": 0.6593, "step": 375 }, { "epoch": 1.48, "eval_loss": 0.6996302008628845, "eval_runtime": 96.9399, "eval_samples_per_second": 18.238, "eval_steps_per_second": 0.382, "step": 375 }, { "epoch": 1.48, "grad_norm": 0.28648432605335455, "learning_rate": 1.4986307228237268e-06, "loss": 0.6883, "step": 376 }, { "epoch": 1.48, "grad_norm": 0.2695264027977092, "learning_rate": 1.4758181262419425e-06, "loss": 0.6696, "step": 377 }, { "epoch": 1.49, "grad_norm": 0.2786040566891135, "learning_rate": 1.4531503949737107e-06, "loss": 0.6768, "step": 378 }, { "epoch": 1.49, "grad_norm": 0.2730138945863401, "learning_rate": 1.4306284608006837e-06, "loss": 0.699, "step": 379 }, { "epoch": 1.5, "grad_norm": 0.28711818986138005, "learning_rate": 1.4082532495113627e-06, "loss": 0.6961, "step": 380 }, { "epoch": 1.5, "grad_norm": 0.27838100192134935, "learning_rate": 1.3860256808630429e-06, "loss": 0.6589, "step": 381 }, { "epoch": 1.5, "grad_norm": 0.2798399005913698, "learning_rate": 1.3639466685440133e-06, "loss": 0.6924, "step": 382 }, { "epoch": 1.51, "grad_norm": 0.2801056534585314, "learning_rate": 1.3420171201359933e-06, "loss": 0.7047, "step": 383 }, { "epoch": 1.51, "grad_norm": 0.28576658015544487, "learning_rate": 1.3202379370768254e-06, "loss": 0.6617, "step": 384 }, { "epoch": 1.52, "grad_norm": 1.3100321738765892, "learning_rate": 1.298610014623423e-06, "loss": 0.9236, "step": 385 }, { "epoch": 1.52, "grad_norm": 0.303191205875695, "learning_rate": 1.2771342418149658e-06, "loss": 0.6896, "step": 386 }, { "epoch": 1.52, "grad_norm": 0.29469211064765755, "learning_rate": 1.2558115014363592e-06, "loss": 0.6804, "step": 387 }, { "epoch": 1.53, "grad_norm": 0.2772656639598833, "learning_rate": 1.234642669981946e-06, "loss": 0.7043, "step": 388 }, { "epoch": 1.53, "grad_norm": 0.28874341170670975, "learning_rate": 1.2136286176194744e-06, "loss": 0.6839, "step": 389 }, { "epoch": 1.54, "grad_norm": 0.29443238526351323, "learning_rate": 1.1927702081543279e-06, "loss": 0.6852, "step": 390 }, { "epoch": 1.54, "grad_norm": 0.28011985365824127, "learning_rate": 1.1720682989940264e-06, "loss": 0.7019, "step": 391 }, { "epoch": 1.54, "grad_norm": 0.2987249208096274, "learning_rate": 1.1515237411129698e-06, "loss": 0.6625, "step": 392 }, { "epoch": 1.55, "grad_norm": 0.30150125882304396, "learning_rate": 1.1311373790174656e-06, "loss": 0.7102, "step": 393 }, { "epoch": 1.55, "grad_norm": 0.28396138619493894, "learning_rate": 1.1109100507110133e-06, "loss": 0.6538, "step": 394 }, { "epoch": 1.56, "grad_norm": 0.28445333602874173, "learning_rate": 1.0908425876598512e-06, "loss": 0.6719, "step": 395 }, { "epoch": 1.56, "grad_norm": 0.2914051878027514, "learning_rate": 1.0709358147587883e-06, "loss": 0.6803, "step": 396 }, { "epoch": 1.56, "grad_norm": 0.2969873740157493, "learning_rate": 1.0511905502972885e-06, "loss": 0.6845, "step": 397 }, { "epoch": 1.57, "grad_norm": 0.27955221055069657, "learning_rate": 1.031607605925839e-06, "loss": 0.6819, "step": 398 }, { "epoch": 1.57, "grad_norm": 0.2840904640426781, "learning_rate": 1.0121877866225783e-06, "loss": 0.6685, "step": 399 }, { "epoch": 1.58, "grad_norm": 0.2866769315662431, "learning_rate": 9.929318906602176e-07, "loss": 0.7126, "step": 400 }, { "epoch": 1.58, "grad_norm": 0.28635331619928306, "learning_rate": 9.738407095732195e-07, "loss": 0.6825, "step": 401 }, { "epoch": 1.58, "grad_norm": 0.29612030431665504, "learning_rate": 9.549150281252633e-07, "loss": 0.6889, "step": 402 }, { "epoch": 1.59, "grad_norm": 0.2800350134356635, "learning_rate": 9.361556242769871e-07, "loss": 0.6902, "step": 403 }, { "epoch": 1.59, "grad_norm": 0.303825841465723, "learning_rate": 9.175632691540065e-07, "loss": 0.6949, "step": 404 }, { "epoch": 1.6, "grad_norm": 0.2869303466691903, "learning_rate": 8.991387270152202e-07, "loss": 0.6953, "step": 405 }, { "epoch": 1.6, "grad_norm": 0.2822567891211309, "learning_rate": 8.808827552213917e-07, "loss": 0.6733, "step": 406 }, { "epoch": 1.6, "grad_norm": 0.29613652418881947, "learning_rate": 8.627961042040183e-07, "loss": 0.6721, "step": 407 }, { "epoch": 1.61, "grad_norm": 0.2904383828418472, "learning_rate": 8.448795174344803e-07, "loss": 0.6849, "step": 408 }, { "epoch": 1.61, "grad_norm": 0.28422518396116003, "learning_rate": 8.271337313934869e-07, "loss": 0.676, "step": 409 }, { "epoch": 1.62, "grad_norm": 0.2998884374161429, "learning_rate": 8.095594755407971e-07, "loss": 0.72, "step": 410 }, { "epoch": 1.62, "grad_norm": 0.2877123964038153, "learning_rate": 7.921574722852343e-07, "loss": 0.686, "step": 411 }, { "epoch": 1.62, "grad_norm": 0.2848351263069502, "learning_rate": 7.749284369549954e-07, "loss": 0.6755, "step": 412 }, { "epoch": 1.63, "grad_norm": 3.762805837262192, "learning_rate": 7.578730777682386e-07, "loss": 0.9037, "step": 413 }, { "epoch": 1.63, "grad_norm": 0.2782769592021476, "learning_rate": 7.409920958039795e-07, "loss": 0.6686, "step": 414 }, { "epoch": 1.64, "grad_norm": 0.28369386272785363, "learning_rate": 7.242861849732696e-07, "loss": 0.6772, "step": 415 }, { "epoch": 1.64, "grad_norm": 0.28870985589458403, "learning_rate": 7.077560319906696e-07, "loss": 0.6665, "step": 416 }, { "epoch": 1.64, "grad_norm": 0.2880267458624612, "learning_rate": 6.914023163460248e-07, "loss": 0.6767, "step": 417 }, { "epoch": 1.65, "grad_norm": 0.2879073116640725, "learning_rate": 6.752257102765325e-07, "loss": 0.6733, "step": 418 }, { "epoch": 1.65, "grad_norm": 0.2978223401759706, "learning_rate": 6.592268787391077e-07, "loss": 0.707, "step": 419 }, { "epoch": 1.66, "grad_norm": 0.2781074725093229, "learning_rate": 6.43406479383053e-07, "loss": 0.6962, "step": 420 }, { "epoch": 1.66, "grad_norm": 0.29577562012306474, "learning_rate": 6.277651625230219e-07, "loss": 0.6772, "step": 421 }, { "epoch": 1.66, "grad_norm": 0.2848699679509908, "learning_rate": 6.12303571112286e-07, "loss": 0.7008, "step": 422 }, { "epoch": 1.67, "grad_norm": 0.2728708533046375, "learning_rate": 5.9702234071631e-07, "loss": 0.6994, "step": 423 }, { "epoch": 1.67, "grad_norm": 0.2971147397482144, "learning_rate": 5.819220994866237e-07, "loss": 0.6784, "step": 424 }, { "epoch": 1.67, "grad_norm": 0.2918077307247773, "learning_rate": 5.670034681349995e-07, "loss": 0.6798, "step": 425 }, { "epoch": 1.68, "grad_norm": 0.2766527969263755, "learning_rate": 5.522670599079416e-07, "loss": 0.692, "step": 426 }, { "epoch": 1.68, "grad_norm": 0.2896023594106076, "learning_rate": 5.377134805614714e-07, "loss": 0.6885, "step": 427 }, { "epoch": 1.69, "grad_norm": 0.29226174571780184, "learning_rate": 5.233433283362349e-07, "loss": 0.6609, "step": 428 }, { "epoch": 1.69, "grad_norm": 0.30313380575685006, "learning_rate": 5.091571939329049e-07, "loss": 0.6559, "step": 429 }, { "epoch": 1.69, "grad_norm": 0.2851579977806079, "learning_rate": 4.951556604879049e-07, "loss": 0.6862, "step": 430 }, { "epoch": 1.7, "grad_norm": 0.28828722620682673, "learning_rate": 4.813393035494329e-07, "loss": 0.673, "step": 431 }, { "epoch": 1.7, "grad_norm": 0.283083619090625, "learning_rate": 4.677086910538092e-07, "loss": 0.6477, "step": 432 }, { "epoch": 1.71, "grad_norm": 0.29809618255032516, "learning_rate": 4.542643833021254e-07, "loss": 0.7054, "step": 433 }, { "epoch": 1.71, "grad_norm": 0.3086409841744957, "learning_rate": 4.410069329372152e-07, "loss": 0.6763, "step": 434 }, { "epoch": 1.71, "grad_norm": 0.3016981542599131, "learning_rate": 4.279368849209381e-07, "loss": 0.6843, "step": 435 }, { "epoch": 1.72, "grad_norm": 0.27698126686942587, "learning_rate": 4.150547765117746e-07, "loss": 0.6891, "step": 436 }, { "epoch": 1.72, "grad_norm": 0.2793586730481018, "learning_rate": 4.0236113724274716e-07, "loss": 0.6796, "step": 437 }, { "epoch": 1.73, "grad_norm": 0.28666974827878056, "learning_rate": 3.8985648889964755e-07, "loss": 0.6648, "step": 438 }, { "epoch": 1.73, "grad_norm": 0.28527293041641727, "learning_rate": 3.77541345499593e-07, "loss": 0.7071, "step": 439 }, { "epoch": 1.73, "grad_norm": 0.29843069047155474, "learning_rate": 3.6541621326989183e-07, "loss": 0.6803, "step": 440 }, { "epoch": 1.74, "grad_norm": 0.3001228333782996, "learning_rate": 3.534815906272404e-07, "loss": 0.7176, "step": 441 }, { "epoch": 1.74, "grad_norm": 0.2928174202718284, "learning_rate": 3.417379681572297e-07, "loss": 0.6747, "step": 442 }, { "epoch": 1.75, "grad_norm": 0.2951128501223636, "learning_rate": 3.301858285941845e-07, "loss": 0.7046, "step": 443 }, { "epoch": 1.75, "grad_norm": 0.2931876464433515, "learning_rate": 3.18825646801314e-07, "loss": 0.6734, "step": 444 }, { "epoch": 1.75, "grad_norm": 0.29349560436630445, "learning_rate": 3.076578897511978e-07, "loss": 0.6852, "step": 445 }, { "epoch": 1.76, "grad_norm": 0.7457047514934827, "learning_rate": 2.966830165065876e-07, "loss": 0.9017, "step": 446 }, { "epoch": 1.76, "grad_norm": 0.2813462622367945, "learning_rate": 2.8590147820153513e-07, "loss": 0.6969, "step": 447 }, { "epoch": 1.77, "grad_norm": 0.30904433658187347, "learning_rate": 2.7531371802285436e-07, "loss": 0.6829, "step": 448 }, { "epoch": 1.77, "grad_norm": 0.28837856935691286, "learning_rate": 2.6492017119189415e-07, "loss": 0.6527, "step": 449 }, { "epoch": 1.77, "grad_norm": 0.2940858357017207, "learning_rate": 2.547212649466568e-07, "loss": 0.6696, "step": 450 }, { "epoch": 1.78, "grad_norm": 0.27956534271888384, "learning_rate": 2.447174185242324e-07, "loss": 0.7048, "step": 451 }, { "epoch": 1.78, "grad_norm": 0.28259810422391984, "learning_rate": 2.3490904314356412e-07, "loss": 0.6772, "step": 452 }, { "epoch": 1.79, "grad_norm": 0.2846763408984384, "learning_rate": 2.2529654198854834e-07, "loss": 0.7507, "step": 453 }, { "epoch": 1.79, "grad_norm": 0.2784656918785677, "learning_rate": 2.1588031019145638e-07, "loss": 0.7072, "step": 454 }, { "epoch": 1.79, "grad_norm": 0.28593045031546543, "learning_rate": 2.0666073481669714e-07, "loss": 0.6944, "step": 455 }, { "epoch": 1.8, "grad_norm": 0.296176705173677, "learning_rate": 1.9763819484490353e-07, "loss": 0.6691, "step": 456 }, { "epoch": 1.8, "grad_norm": 0.2923471158891105, "learning_rate": 1.8881306115735632e-07, "loss": 0.705, "step": 457 }, { "epoch": 1.81, "grad_norm": 0.2769134760770555, "learning_rate": 1.801856965207338e-07, "loss": 0.6845, "step": 458 }, { "epoch": 1.81, "grad_norm": 0.30153347516541434, "learning_rate": 1.7175645557220567e-07, "loss": 0.6935, "step": 459 }, { "epoch": 1.81, "grad_norm": 0.29938078931561396, "learning_rate": 1.6352568480485277e-07, "loss": 0.6822, "step": 460 }, { "epoch": 1.82, "grad_norm": 0.2909845977727433, "learning_rate": 1.5549372255342367e-07, "loss": 0.6959, "step": 461 }, { "epoch": 1.82, "grad_norm": 0.2851155920589762, "learning_rate": 1.4766089898042678e-07, "loss": 0.6909, "step": 462 }, { "epoch": 1.83, "grad_norm": 3.590844633307425, "learning_rate": 1.4002753606256082e-07, "loss": 0.9279, "step": 463 }, { "epoch": 1.83, "grad_norm": 0.289769942223222, "learning_rate": 1.3259394757747678e-07, "loss": 0.6664, "step": 464 }, { "epoch": 1.83, "grad_norm": 1.4756345980091514, "learning_rate": 1.253604390908819e-07, "loss": 0.9066, "step": 465 }, { "epoch": 1.84, "grad_norm": 0.2905542054012534, "learning_rate": 1.1832730794397951e-07, "loss": 0.6989, "step": 466 }, { "epoch": 1.84, "grad_norm": 0.3056790622208962, "learning_rate": 1.1149484324124326e-07, "loss": 0.64, "step": 467 }, { "epoch": 1.85, "grad_norm": 0.2915224050071343, "learning_rate": 1.0486332583853565e-07, "loss": 0.6411, "step": 468 }, { "epoch": 1.85, "grad_norm": 0.2947477867782055, "learning_rate": 9.843302833156377e-08, "loss": 0.6901, "step": 469 }, { "epoch": 1.85, "grad_norm": 0.6968434274277037, "learning_rate": 9.22042150446728e-08, "loss": 0.9234, "step": 470 }, { "epoch": 1.86, "grad_norm": 0.28365980605268926, "learning_rate": 8.617714201998084e-08, "loss": 0.6871, "step": 471 }, { "epoch": 1.86, "grad_norm": 0.29456041125148436, "learning_rate": 8.035205700685167e-08, "loss": 0.6841, "step": 472 }, { "epoch": 1.87, "grad_norm": 0.3143240746562965, "learning_rate": 7.47291994517163e-08, "loss": 0.6793, "step": 473 }, { "epoch": 1.87, "grad_norm": 0.3002190997085016, "learning_rate": 6.930880048822531e-08, "loss": 0.6909, "step": 474 }, { "epoch": 1.87, "grad_norm": 0.28198233683666907, "learning_rate": 6.409108292774912e-08, "loss": 0.6677, "step": 475 }, { "epoch": 1.88, "grad_norm": 0.2974351663902672, "learning_rate": 5.907626125022159e-08, "loss": 0.6863, "step": 476 }, { "epoch": 1.88, "grad_norm": 0.3167342201027022, "learning_rate": 5.426454159531913e-08, "loss": 0.6728, "step": 477 }, { "epoch": 1.89, "grad_norm": 0.2838121481556639, "learning_rate": 4.9656121753990924e-08, "loss": 0.6765, "step": 478 }, { "epoch": 1.89, "grad_norm": 0.28117138518414553, "learning_rate": 4.52511911603265e-08, "loss": 0.6827, "step": 479 }, { "epoch": 1.89, "grad_norm": 0.29399576903346636, "learning_rate": 4.104993088376974e-08, "loss": 0.6933, "step": 480 }, { "epoch": 1.9, "grad_norm": 0.29636385882946953, "learning_rate": 3.705251362167484e-08, "loss": 0.6641, "step": 481 }, { "epoch": 1.9, "grad_norm": 0.2913085312105263, "learning_rate": 3.325910369220975e-08, "loss": 0.6973, "step": 482 }, { "epoch": 1.91, "grad_norm": 0.29080057862930364, "learning_rate": 2.966985702759828e-08, "loss": 0.6678, "step": 483 }, { "epoch": 1.91, "grad_norm": 0.2952306166117519, "learning_rate": 2.6284921167712975e-08, "loss": 0.7017, "step": 484 }, { "epoch": 1.91, "grad_norm": 0.2959396595629797, "learning_rate": 2.3104435254008852e-08, "loss": 0.6569, "step": 485 }, { "epoch": 1.92, "grad_norm": 0.28791733733513286, "learning_rate": 2.012853002380466e-08, "loss": 0.6573, "step": 486 }, { "epoch": 1.92, "grad_norm": 0.2923526176448832, "learning_rate": 1.735732780490884e-08, "loss": 0.6903, "step": 487 }, { "epoch": 1.93, "grad_norm": 0.2908268098513957, "learning_rate": 1.4790942510590767e-08, "loss": 0.6756, "step": 488 }, { "epoch": 1.93, "grad_norm": 0.29069043095745606, "learning_rate": 1.2429479634897268e-08, "loss": 0.6722, "step": 489 }, { "epoch": 1.93, "grad_norm": 0.28912174671892155, "learning_rate": 1.0273036248318325e-08, "loss": 0.6927, "step": 490 }, { "epoch": 1.94, "grad_norm": 0.29773909318477504, "learning_rate": 8.321700993795812e-09, "loss": 0.6703, "step": 491 }, { "epoch": 1.94, "grad_norm": 0.2846360921300275, "learning_rate": 6.575554083078084e-09, "loss": 0.6915, "step": 492 }, { "epoch": 1.95, "grad_norm": 0.3040183654289367, "learning_rate": 5.034667293427053e-09, "loss": 0.6836, "step": 493 }, { "epoch": 1.95, "grad_norm": 0.29012455377167307, "learning_rate": 3.6991039646616657e-09, "loss": 0.6844, "step": 494 }, { "epoch": 1.95, "grad_norm": 0.2778518633390048, "learning_rate": 2.568918996560532e-09, "loss": 0.6779, "step": 495 }, { "epoch": 1.96, "grad_norm": 0.29541155663074187, "learning_rate": 1.6441588466009627e-09, "loss": 0.6979, "step": 496 }, { "epoch": 1.96, "grad_norm": 0.28364315270086676, "learning_rate": 9.248615280499362e-10, "loss": 0.6792, "step": 497 }, { "epoch": 1.97, "grad_norm": 0.2865142006406564, "learning_rate": 4.1105660840368154e-10, "loss": 0.7034, "step": 498 }, { "epoch": 1.97, "grad_norm": 0.2854424675916699, "learning_rate": 1.0276520816976388e-10, "loss": 0.6747, "step": 499 }, { "epoch": 1.97, "grad_norm": 0.29666466368391803, "learning_rate": 0.0, "loss": 0.6754, "step": 500 }, { "epoch": 1.97, "eval_loss": 0.6983408331871033, "eval_runtime": 93.907, "eval_samples_per_second": 18.827, "eval_steps_per_second": 0.394, "step": 500 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 250, "total_flos": 1571976955035648.0, "train_batch_size": 6, "trial_name": null, "trial_params": null }