|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.9740777666999003, |
|
"eval_steps": 125, |
|
"global_step": 500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 1.8329473944625845, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 0.9189, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"eval_loss": 0.8840048313140869, |
|
"eval_runtime": 99.9262, |
|
"eval_samples_per_second": 17.693, |
|
"eval_steps_per_second": 0.37, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.7916344264608899, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 0.8962, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.8909931480365287, |
|
"learning_rate": 3e-06, |
|
"loss": 0.8805, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.6318273112027453, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 0.913, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.2463401136319747, |
|
"learning_rate": 5e-06, |
|
"loss": 0.908, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.1463980681106876, |
|
"learning_rate": 6e-06, |
|
"loss": 0.8729, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.9477573494094379, |
|
"learning_rate": 7e-06, |
|
"loss": 0.8411, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 4.0165120162042935, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 1.0541, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.0713771331971476, |
|
"learning_rate": 9e-06, |
|
"loss": 0.8495, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.8667235558943894, |
|
"learning_rate": 1e-05, |
|
"loss": 0.8199, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.7411429457268661, |
|
"learning_rate": 9.999897234791831e-06, |
|
"loss": 0.7964, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.5729968036750446, |
|
"learning_rate": 9.999588943391597e-06, |
|
"loss": 0.8039, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.5402964207486183, |
|
"learning_rate": 9.99907513847195e-06, |
|
"loss": 0.8287, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.5633328266442124, |
|
"learning_rate": 9.9983558411534e-06, |
|
"loss": 0.7842, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.5412290905686791, |
|
"learning_rate": 9.99743108100344e-06, |
|
"loss": 0.8345, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.4895379189634968, |
|
"learning_rate": 9.99630089603534e-06, |
|
"loss": 0.7922, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.5088260537094976, |
|
"learning_rate": 9.994965332706574e-06, |
|
"loss": 0.7969, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.47075507205524136, |
|
"learning_rate": 9.993424445916923e-06, |
|
"loss": 0.7931, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.3878407143429931, |
|
"learning_rate": 9.991678299006206e-06, |
|
"loss": 0.8041, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.3873731682942636, |
|
"learning_rate": 9.989726963751683e-06, |
|
"loss": 0.8107, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.42417629604043083, |
|
"learning_rate": 9.987570520365105e-06, |
|
"loss": 0.7874, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.4324680733617199, |
|
"learning_rate": 9.98520905748941e-06, |
|
"loss": 0.8025, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.34546199757993784, |
|
"learning_rate": 9.982642672195093e-06, |
|
"loss": 0.8048, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.35958771648273496, |
|
"learning_rate": 9.979871469976197e-06, |
|
"loss": 0.831, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.3940074197908444, |
|
"learning_rate": 9.976895564745993e-06, |
|
"loss": 0.7944, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.3818406187889153, |
|
"learning_rate": 9.973715078832288e-06, |
|
"loss": 0.7936, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 10.237346743255186, |
|
"learning_rate": 9.970330142972403e-06, |
|
"loss": 1.0017, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 6.504690612414681, |
|
"learning_rate": 9.966740896307791e-06, |
|
"loss": 1.0329, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.4028775109425473, |
|
"learning_rate": 9.962947486378325e-06, |
|
"loss": 0.7702, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.39810277536967076, |
|
"learning_rate": 9.95895006911623e-06, |
|
"loss": 0.771, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.29862663396811506, |
|
"learning_rate": 9.954748808839675e-06, |
|
"loss": 0.7767, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.3106188272362696, |
|
"learning_rate": 9.950343878246011e-06, |
|
"loss": 0.7943, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.34702364911134964, |
|
"learning_rate": 9.945735458404681e-06, |
|
"loss": 0.7972, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.3216448960978253, |
|
"learning_rate": 9.94092373874978e-06, |
|
"loss": 0.7847, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.31232207978504006, |
|
"learning_rate": 9.935908917072253e-06, |
|
"loss": 0.7738, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.3004886604892709, |
|
"learning_rate": 9.930691199511775e-06, |
|
"loss": 0.7877, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.2870013960815822, |
|
"learning_rate": 9.925270800548285e-06, |
|
"loss": 0.754, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.28322113595593756, |
|
"learning_rate": 9.91964794299315e-06, |
|
"loss": 0.7445, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.3065117198934518, |
|
"learning_rate": 9.91382285798002e-06, |
|
"loss": 0.787, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.2727693466806482, |
|
"learning_rate": 9.907795784955327e-06, |
|
"loss": 0.7865, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.2746198009076503, |
|
"learning_rate": 9.901566971668437e-06, |
|
"loss": 0.7755, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.2888207750688948, |
|
"learning_rate": 9.895136674161466e-06, |
|
"loss": 0.7789, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.26218141394209254, |
|
"learning_rate": 9.888505156758758e-06, |
|
"loss": 0.7781, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.27028128323788914, |
|
"learning_rate": 9.881672692056022e-06, |
|
"loss": 0.7596, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.301432889634355, |
|
"learning_rate": 9.874639560909118e-06, |
|
"loss": 0.7746, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.27768870163187315, |
|
"learning_rate": 9.867406052422525e-06, |
|
"loss": 0.7751, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.2638230079020965, |
|
"learning_rate": 9.85997246393744e-06, |
|
"loss": 0.8085, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.2826098837962784, |
|
"learning_rate": 9.852339101019574e-06, |
|
"loss": 0.7878, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.2673052298412088, |
|
"learning_rate": 9.844506277446577e-06, |
|
"loss": 0.7747, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.2589820555015507, |
|
"learning_rate": 9.836474315195148e-06, |
|
"loss": 0.7491, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.27744141325372174, |
|
"learning_rate": 9.828243544427795e-06, |
|
"loss": 0.771, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.25617202776049003, |
|
"learning_rate": 9.819814303479268e-06, |
|
"loss": 0.789, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.25777796417187593, |
|
"learning_rate": 9.811186938842645e-06, |
|
"loss": 0.7498, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.26356120702424557, |
|
"learning_rate": 9.802361805155097e-06, |
|
"loss": 0.7618, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.24594116238284844, |
|
"learning_rate": 9.793339265183303e-06, |
|
"loss": 0.7647, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.2766331605712476, |
|
"learning_rate": 9.784119689808545e-06, |
|
"loss": 0.7757, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.2674205732918615, |
|
"learning_rate": 9.774703458011453e-06, |
|
"loss": 0.7479, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.25100414008068433, |
|
"learning_rate": 9.765090956856437e-06, |
|
"loss": 0.7629, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.2558976905977626, |
|
"learning_rate": 9.755282581475769e-06, |
|
"loss": 0.7368, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.2816597522453804, |
|
"learning_rate": 9.745278735053345e-06, |
|
"loss": 0.7675, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.27864046582364604, |
|
"learning_rate": 9.735079828808107e-06, |
|
"loss": 0.7693, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.2537495381298166, |
|
"learning_rate": 9.724686281977146e-06, |
|
"loss": 0.7612, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.27161360619636454, |
|
"learning_rate": 9.714098521798466e-06, |
|
"loss": 0.7659, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.257282261183055, |
|
"learning_rate": 9.703316983493414e-06, |
|
"loss": 0.77, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.2598148868150837, |
|
"learning_rate": 9.692342110248802e-06, |
|
"loss": 0.7637, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.25319486577746536, |
|
"learning_rate": 9.681174353198687e-06, |
|
"loss": 0.7529, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.2616187230129625, |
|
"learning_rate": 9.669814171405818e-06, |
|
"loss": 0.7482, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.2531735015293101, |
|
"learning_rate": 9.658262031842772e-06, |
|
"loss": 0.7507, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.2540031125746497, |
|
"learning_rate": 9.64651840937276e-06, |
|
"loss": 0.7573, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.26251145119756225, |
|
"learning_rate": 9.63458378673011e-06, |
|
"loss": 0.7617, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 11.659656865913, |
|
"learning_rate": 9.622458654500408e-06, |
|
"loss": 0.9807, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 43.33218979251603, |
|
"learning_rate": 9.610143511100354e-06, |
|
"loss": 1.0213, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.29661440448786996, |
|
"learning_rate": 9.597638862757255e-06, |
|
"loss": 0.7597, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.2674359864711363, |
|
"learning_rate": 9.584945223488227e-06, |
|
"loss": 0.7716, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.2587397735578842, |
|
"learning_rate": 9.572063115079063e-06, |
|
"loss": 0.7654, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.27326638279450294, |
|
"learning_rate": 9.558993067062785e-06, |
|
"loss": 0.7832, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.26424783232216553, |
|
"learning_rate": 9.545735616697875e-06, |
|
"loss": 0.7509, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.26894661215694415, |
|
"learning_rate": 9.532291308946191e-06, |
|
"loss": 0.7638, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 12.381149099110814, |
|
"learning_rate": 9.518660696450567e-06, |
|
"loss": 0.9726, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 63.276974873593076, |
|
"learning_rate": 9.504844339512096e-06, |
|
"loss": 0.961, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.34404347007223207, |
|
"learning_rate": 9.490842806067095e-06, |
|
"loss": 0.7366, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.2761892805994169, |
|
"learning_rate": 9.476656671663766e-06, |
|
"loss": 0.7565, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.2938700568825168, |
|
"learning_rate": 9.462286519438531e-06, |
|
"loss": 0.7586, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.30998708104141814, |
|
"learning_rate": 9.44773294009206e-06, |
|
"loss": 0.747, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.2789622879446074, |
|
"learning_rate": 9.432996531865001e-06, |
|
"loss": 0.7381, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.3043211841621936, |
|
"learning_rate": 9.418077900513377e-06, |
|
"loss": 0.7648, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.27269347275749684, |
|
"learning_rate": 9.40297765928369e-06, |
|
"loss": 0.7287, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.29165683068711035, |
|
"learning_rate": 9.387696428887715e-06, |
|
"loss": 0.7714, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.29093659611546846, |
|
"learning_rate": 9.372234837476979e-06, |
|
"loss": 0.754, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.2622000520062877, |
|
"learning_rate": 9.356593520616948e-06, |
|
"loss": 0.7604, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.29648676556314774, |
|
"learning_rate": 9.340773121260893e-06, |
|
"loss": 0.7677, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.2971691719126809, |
|
"learning_rate": 9.324774289723469e-06, |
|
"loss": 0.7826, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.2695147958164756, |
|
"learning_rate": 9.308597683653976e-06, |
|
"loss": 0.7675, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.2947547264550856, |
|
"learning_rate": 9.292243968009332e-06, |
|
"loss": 0.7611, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.2698951119585888, |
|
"learning_rate": 9.275713815026732e-06, |
|
"loss": 0.7437, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.2922871464880811, |
|
"learning_rate": 9.259007904196023e-06, |
|
"loss": 0.7716, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 13.342043215041077, |
|
"learning_rate": 9.242126922231763e-06, |
|
"loss": 1.0262, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 3.3348436860369577, |
|
"learning_rate": 9.225071563045007e-06, |
|
"loss": 0.9733, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.30515152013617763, |
|
"learning_rate": 9.207842527714767e-06, |
|
"loss": 0.7491, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.2797372956926758, |
|
"learning_rate": 9.190440524459203e-06, |
|
"loss": 0.7658, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.295442681103485, |
|
"learning_rate": 9.172866268606514e-06, |
|
"loss": 0.7359, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.2890934213055238, |
|
"learning_rate": 9.15512048256552e-06, |
|
"loss": 0.7783, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.2698602196909741, |
|
"learning_rate": 9.137203895795983e-06, |
|
"loss": 0.7476, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.30586672847809404, |
|
"learning_rate": 9.119117244778609e-06, |
|
"loss": 0.7494, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.28256137853789653, |
|
"learning_rate": 9.10086127298478e-06, |
|
"loss": 0.7347, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.2654565204147507, |
|
"learning_rate": 9.082436730845993e-06, |
|
"loss": 0.7282, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.3110313172517606, |
|
"learning_rate": 9.063844375723014e-06, |
|
"loss": 0.7442, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.30406144580285027, |
|
"learning_rate": 9.045084971874738e-06, |
|
"loss": 0.7365, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.2578097986689305, |
|
"learning_rate": 9.026159290426782e-06, |
|
"loss": 0.7644, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.2875228334986064, |
|
"learning_rate": 9.007068109339783e-06, |
|
"loss": 0.7359, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.3007292657335934, |
|
"learning_rate": 8.987812213377423e-06, |
|
"loss": 0.7571, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.2705781647990633, |
|
"learning_rate": 8.968392394074164e-06, |
|
"loss": 0.7321, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 7.709717121399015, |
|
"learning_rate": 8.948809449702712e-06, |
|
"loss": 1.0663, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.3159530858994423, |
|
"learning_rate": 8.929064185241214e-06, |
|
"loss": 0.7594, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.3001925080979955, |
|
"learning_rate": 8.90915741234015e-06, |
|
"loss": 0.7486, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 2.7719217922453914, |
|
"learning_rate": 8.889089949288986e-06, |
|
"loss": 1.0014, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.2980097580186808, |
|
"learning_rate": 8.868862620982534e-06, |
|
"loss": 0.7302, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.2674175783919389, |
|
"learning_rate": 8.84847625888703e-06, |
|
"loss": 0.7515, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.33089914745986465, |
|
"learning_rate": 8.827931701005974e-06, |
|
"loss": 0.7452, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.2657873873108844, |
|
"learning_rate": 8.807229791845673e-06, |
|
"loss": 0.7565, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.26891569095038903, |
|
"learning_rate": 8.786371382380527e-06, |
|
"loss": 0.7525, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.30828049821760906, |
|
"learning_rate": 8.765357330018056e-06, |
|
"loss": 0.7395, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.2703956947723179, |
|
"learning_rate": 8.74418849856364e-06, |
|
"loss": 0.7762, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.27665743831770573, |
|
"learning_rate": 8.722865758185036e-06, |
|
"loss": 0.7499, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.31328445024397394, |
|
"learning_rate": 8.701389985376578e-06, |
|
"loss": 0.7368, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"eval_loss": 0.7192811369895935, |
|
"eval_runtime": 97.0775, |
|
"eval_samples_per_second": 18.212, |
|
"eval_steps_per_second": 0.381, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.26565830658390915, |
|
"learning_rate": 8.679762062923176e-06, |
|
"loss": 0.7727, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.2768705145101062, |
|
"learning_rate": 8.657982879864007e-06, |
|
"loss": 0.7178, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.27658899618203814, |
|
"learning_rate": 8.636053331455986e-06, |
|
"loss": 0.7521, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.2687326456666238, |
|
"learning_rate": 8.613974319136959e-06, |
|
"loss": 0.7411, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.2618083386724651, |
|
"learning_rate": 8.591746750488639e-06, |
|
"loss": 0.7306, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.25666246646393165, |
|
"learning_rate": 8.569371539199316e-06, |
|
"loss": 0.7505, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.3203048983481449, |
|
"learning_rate": 8.54684960502629e-06, |
|
"loss": 0.7515, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.2521993776332652, |
|
"learning_rate": 8.52418187375806e-06, |
|
"loss": 0.7505, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.26591933789428035, |
|
"learning_rate": 8.501369277176275e-06, |
|
"loss": 0.7353, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.27603393300812845, |
|
"learning_rate": 8.478412753017433e-06, |
|
"loss": 0.7609, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.2668194745887302, |
|
"learning_rate": 8.455313244934324e-06, |
|
"loss": 0.7141, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.2707242484249978, |
|
"learning_rate": 8.432071702457253e-06, |
|
"loss": 0.7223, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.25511126409448154, |
|
"learning_rate": 8.408689080954997e-06, |
|
"loss": 0.7153, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 15.420395873510664, |
|
"learning_rate": 8.38516634159555e-06, |
|
"loss": 1.0042, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.3012234081880187, |
|
"learning_rate": 8.361504451306585e-06, |
|
"loss": 0.7713, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.2598491249346932, |
|
"learning_rate": 8.337704382735741e-06, |
|
"loss": 0.7288, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.26256298238822373, |
|
"learning_rate": 8.313767114210615e-06, |
|
"loss": 0.7379, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.2945113973208366, |
|
"learning_rate": 8.289693629698564e-06, |
|
"loss": 0.7401, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.24981458420819586, |
|
"learning_rate": 8.265484918766243e-06, |
|
"loss": 0.7512, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.2678413297548206, |
|
"learning_rate": 8.241141976538944e-06, |
|
"loss": 0.7449, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.2623417103083203, |
|
"learning_rate": 8.216665803659671e-06, |
|
"loss": 0.7647, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.263785777979794, |
|
"learning_rate": 8.192057406248028e-06, |
|
"loss": 0.7725, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.2519540317965661, |
|
"learning_rate": 8.16731779585885e-06, |
|
"loss": 0.715, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.27015785362121375, |
|
"learning_rate": 8.142447989440618e-06, |
|
"loss": 0.7532, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.25863328277564784, |
|
"learning_rate": 8.117449009293668e-06, |
|
"loss": 0.7335, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.2550714525590909, |
|
"learning_rate": 8.092321883028157e-06, |
|
"loss": 0.7182, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.2752483825047847, |
|
"learning_rate": 8.067067643521834e-06, |
|
"loss": 0.772, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.2582002365542859, |
|
"learning_rate": 8.041687328877566e-06, |
|
"loss": 0.7284, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.24823845365447103, |
|
"learning_rate": 8.016181982380682e-06, |
|
"loss": 0.7467, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.25644169647568194, |
|
"learning_rate": 7.99055265245608e-06, |
|
"loss": 0.7191, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.27463144458405375, |
|
"learning_rate": 7.96480039262513e-06, |
|
"loss": 0.7375, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.2642553744129046, |
|
"learning_rate": 7.938926261462366e-06, |
|
"loss": 0.7587, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.25679436337027056, |
|
"learning_rate": 7.912931322551981e-06, |
|
"loss": 0.7312, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.2784563408983632, |
|
"learning_rate": 7.886816644444099e-06, |
|
"loss": 0.7213, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.2638439438037005, |
|
"learning_rate": 7.860583300610849e-06, |
|
"loss": 0.7286, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.2599487424469649, |
|
"learning_rate": 7.83423236940225e-06, |
|
"loss": 0.7211, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.2620169279227201, |
|
"learning_rate": 7.807764934001875e-06, |
|
"loss": 0.7243, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.3455142385888707, |
|
"learning_rate": 7.781182082382325e-06, |
|
"loss": 0.7709, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.25797958121628417, |
|
"learning_rate": 7.754484907260513e-06, |
|
"loss": 0.7371, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.26245124486082283, |
|
"learning_rate": 7.727674506052744e-06, |
|
"loss": 0.7625, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.259525851556208, |
|
"learning_rate": 7.700751980829601e-06, |
|
"loss": 0.7785, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.25578616887441574, |
|
"learning_rate": 7.673718438270649e-06, |
|
"loss": 0.7349, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.26880908011952676, |
|
"learning_rate": 7.646574989618938e-06, |
|
"loss": 0.7423, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.28268284846763475, |
|
"learning_rate": 7.619322750635327e-06, |
|
"loss": 0.8089, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.2565025440158926, |
|
"learning_rate": 7.591962841552627e-06, |
|
"loss": 0.708, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.2589330645555015, |
|
"learning_rate": 7.564496387029532e-06, |
|
"loss": 0.7276, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 4.072287518324262, |
|
"learning_rate": 7.536924516104411e-06, |
|
"loss": 0.963, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 5.236195816930884, |
|
"learning_rate": 7.509248362148889e-06, |
|
"loss": 0.9786, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.3208811366448085, |
|
"learning_rate": 7.481469062821252e-06, |
|
"loss": 0.7417, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.280850125752099, |
|
"learning_rate": 7.453587760019691e-06, |
|
"loss": 0.7249, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.26692675004354643, |
|
"learning_rate": 7.42560559983536e-06, |
|
"loss": 0.727, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.27369623001787907, |
|
"learning_rate": 7.39752373250527e-06, |
|
"loss": 0.7617, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.2784142807735342, |
|
"learning_rate": 7.369343312364994e-06, |
|
"loss": 0.7466, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 37.27250895412763, |
|
"learning_rate": 7.34106549780123e-06, |
|
"loss": 1.0667, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.2760061039631398, |
|
"learning_rate": 7.312691451204178e-06, |
|
"loss": 0.7244, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.25459829854169064, |
|
"learning_rate": 7.284222338919758e-06, |
|
"loss": 0.7364, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.26179305555253735, |
|
"learning_rate": 7.255659331201673e-06, |
|
"loss": 0.733, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.2604418741829541, |
|
"learning_rate": 7.227003602163296e-06, |
|
"loss": 0.7209, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.26185681215109163, |
|
"learning_rate": 7.198256329729412e-06, |
|
"loss": 0.7164, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 16.152387103951856, |
|
"learning_rate": 7.169418695587791e-06, |
|
"loss": 1.0372, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 21.228735850953576, |
|
"learning_rate": 7.140491885140629e-06, |
|
"loss": 1.0402, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.28404810159037286, |
|
"learning_rate": 7.1114770874558e-06, |
|
"loss": 0.7006, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.26609034714435736, |
|
"learning_rate": 7.082375495217996e-06, |
|
"loss": 0.7537, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.25765084929276133, |
|
"learning_rate": 7.053188304679691e-06, |
|
"loss": 0.7302, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.26384158886834463, |
|
"learning_rate": 7.023916715611969e-06, |
|
"loss": 0.712, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.27151931787506317, |
|
"learning_rate": 6.994561931255209e-06, |
|
"loss": 0.7502, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.27031492068457535, |
|
"learning_rate": 6.965125158269619e-06, |
|
"loss": 0.7179, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.26995073084719196, |
|
"learning_rate": 6.935607606685642e-06, |
|
"loss": 0.7624, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.25666755324587454, |
|
"learning_rate": 6.906010489854209e-06, |
|
"loss": 0.7426, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.2764461509009301, |
|
"learning_rate": 6.876335024396872e-06, |
|
"loss": 0.723, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.2597906002555833, |
|
"learning_rate": 6.846582430155783e-06, |
|
"loss": 0.7407, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.26409742487438864, |
|
"learning_rate": 6.816753930143558e-06, |
|
"loss": 0.7206, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.25320169233675405, |
|
"learning_rate": 6.786850750493006e-06, |
|
"loss": 0.7437, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.2708696048462205, |
|
"learning_rate": 6.7568741204067145e-06, |
|
"loss": 0.7422, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.26542323915181154, |
|
"learning_rate": 6.726825272106539e-06, |
|
"loss": 0.7514, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.26307166597433396, |
|
"learning_rate": 6.696705440782939e-06, |
|
"loss": 0.7509, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.26671446872754456, |
|
"learning_rate": 6.66651586454421e-06, |
|
"loss": 0.7465, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.2720083369272757, |
|
"learning_rate": 6.636257784365585e-06, |
|
"loss": 0.7349, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.2652218770116059, |
|
"learning_rate": 6.605932444038229e-06, |
|
"loss": 0.7348, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.26402314109149694, |
|
"learning_rate": 6.575541090118105e-06, |
|
"loss": 0.7495, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.2639803511821082, |
|
"learning_rate": 6.545084971874738e-06, |
|
"loss": 0.7138, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.2673567043268493, |
|
"learning_rate": 6.514565341239861e-06, |
|
"loss": 0.7341, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 74.0236556664021, |
|
"learning_rate": 6.483983452755953e-06, |
|
"loss": 1.084, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.2694930198888902, |
|
"learning_rate": 6.4533405635246696e-06, |
|
"loss": 0.7422, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.2808089325365656, |
|
"learning_rate": 6.4226379331551625e-06, |
|
"loss": 0.7543, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.24629087243802783, |
|
"learning_rate": 6.3918768237123175e-06, |
|
"loss": 0.7088, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.2605148206784209, |
|
"learning_rate": 6.361058499664856e-06, |
|
"loss": 0.7434, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.26199687024127344, |
|
"learning_rate": 6.330184227833376e-06, |
|
"loss": 0.7369, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.2693080928270376, |
|
"learning_rate": 6.299255277338265e-06, |
|
"loss": 0.7337, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.2573571779304293, |
|
"learning_rate": 6.268272919547537e-06, |
|
"loss": 0.7366, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.25347655388671, |
|
"learning_rate": 6.237238428024573e-06, |
|
"loss": 0.7392, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.254807709796356, |
|
"learning_rate": 6.2061530784757625e-06, |
|
"loss": 0.7709, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.25435065962054804, |
|
"learning_rate": 6.175018148698077e-06, |
|
"loss": 0.7472, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.25856868944475736, |
|
"learning_rate": 6.143834918526528e-06, |
|
"loss": 0.7442, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.24960062893507637, |
|
"learning_rate": 6.112604669781572e-06, |
|
"loss": 0.7163, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.2544024553733407, |
|
"learning_rate": 6.0813286862164175e-06, |
|
"loss": 0.7236, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.2532920039697931, |
|
"learning_rate": 6.050008253464247e-06, |
|
"loss": 0.7427, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.25372808971698796, |
|
"learning_rate": 6.018644658985378e-06, |
|
"loss": 0.7286, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.2570514856547558, |
|
"learning_rate": 5.987239192014336e-06, |
|
"loss": 0.7349, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.2578576277551542, |
|
"learning_rate": 5.955793143506863e-06, |
|
"loss": 0.7266, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.26312215832145636, |
|
"learning_rate": 5.9243078060868445e-06, |
|
"loss": 0.7389, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.26518617358808877, |
|
"learning_rate": 5.892784473993184e-06, |
|
"loss": 0.7108, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.25620517627113376, |
|
"learning_rate": 5.861224443026595e-06, |
|
"loss": 0.7232, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 28.36402580586963, |
|
"learning_rate": 5.82962901049634e-06, |
|
"loss": 0.9734, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.2807037939514787, |
|
"learning_rate": 5.797999475166897e-06, |
|
"loss": 0.7341, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 8.208344028346868, |
|
"learning_rate": 5.766337137204579e-06, |
|
"loss": 0.938, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.26445130385050997, |
|
"learning_rate": 5.734643298124091e-06, |
|
"loss": 0.7316, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.251567451954335, |
|
"learning_rate": 5.702919260735015e-06, |
|
"loss": 0.6966, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.26329080787916564, |
|
"learning_rate": 5.671166329088278e-06, |
|
"loss": 0.7319, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.2566777339661679, |
|
"learning_rate": 5.6393858084225305e-06, |
|
"loss": 0.7529, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.2710815554700812, |
|
"learning_rate": 5.6075790051105025e-06, |
|
"loss": 0.7515, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.27096961550302734, |
|
"learning_rate": 5.575747226605298e-06, |
|
"loss": 0.7073, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.2509131795738037, |
|
"learning_rate": 5.543891781386655e-06, |
|
"loss": 0.7513, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.26210506205941153, |
|
"learning_rate": 5.512013978907157e-06, |
|
"loss": 0.7569, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.25123130642497177, |
|
"learning_rate": 5.480115129538409e-06, |
|
"loss": 0.7239, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.2596821607229612, |
|
"learning_rate": 5.448196544517168e-06, |
|
"loss": 0.7256, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.2714818563550966, |
|
"learning_rate": 5.4162595358914475e-06, |
|
"loss": 0.7329, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.260503064108439, |
|
"learning_rate": 5.384305416466584e-06, |
|
"loss": 0.7112, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.2661267608396215, |
|
"learning_rate": 5.35233549975127e-06, |
|
"loss": 0.7534, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.27502743208671454, |
|
"learning_rate": 5.320351099903565e-06, |
|
"loss": 0.7355, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.2598641343680277, |
|
"learning_rate": 5.288353531676873e-06, |
|
"loss": 0.7476, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.2629788348419056, |
|
"learning_rate": 5.256344110365896e-06, |
|
"loss": 0.7523, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.256123432185156, |
|
"learning_rate": 5.224324151752575e-06, |
|
"loss": 0.7479, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.2592695095071067, |
|
"learning_rate": 5.192294972051992e-06, |
|
"loss": 0.7586, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.26264999139615697, |
|
"learning_rate": 5.160257887858278e-06, |
|
"loss": 0.7406, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 0.7036678791046143, |
|
"eval_runtime": 96.3087, |
|
"eval_samples_per_second": 18.358, |
|
"eval_steps_per_second": 0.384, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.2614921961545108, |
|
"learning_rate": 5.128214216090478e-06, |
|
"loss": 0.7488, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.2605743808221771, |
|
"learning_rate": 5.0961652739384356e-06, |
|
"loss": 0.7338, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 2.904417203670962, |
|
"learning_rate": 5.064112378808636e-06, |
|
"loss": 0.9738, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 0.2581985759494367, |
|
"learning_rate": 5.032056848270056e-06, |
|
"loss": 0.7693, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 0.25102446332314765, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7213, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 1.4216598983588058, |
|
"learning_rate": 4.967943151729945e-06, |
|
"loss": 0.9193, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.32982276331099014, |
|
"learning_rate": 4.935887621191364e-06, |
|
"loss": 0.6842, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 0.29043411467478625, |
|
"learning_rate": 4.903834726061565e-06, |
|
"loss": 0.7087, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 0.25986254756592664, |
|
"learning_rate": 4.871785783909523e-06, |
|
"loss": 0.6741, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 0.30049816553828484, |
|
"learning_rate": 4.839742112141725e-06, |
|
"loss": 0.7063, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 0.2895999616622155, |
|
"learning_rate": 4.807705027948008e-06, |
|
"loss": 0.7146, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 0.30041272052643164, |
|
"learning_rate": 4.775675848247427e-06, |
|
"loss": 0.7134, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 0.27518299819790887, |
|
"learning_rate": 4.743655889634105e-06, |
|
"loss": 0.692, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 0.26955160521446175, |
|
"learning_rate": 4.711646468323129e-06, |
|
"loss": 0.658, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 0.27535739664976405, |
|
"learning_rate": 4.679648900096436e-06, |
|
"loss": 0.6908, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 0.26763089124769246, |
|
"learning_rate": 4.64766450024873e-06, |
|
"loss": 0.6861, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 74.38254133611925, |
|
"learning_rate": 4.615694583533418e-06, |
|
"loss": 0.9994, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 0.2986551656205794, |
|
"learning_rate": 4.583740464108554e-06, |
|
"loss": 0.7075, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 0.27619714658975547, |
|
"learning_rate": 4.551803455482833e-06, |
|
"loss": 0.6596, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"grad_norm": 0.25242413092583954, |
|
"learning_rate": 4.5198848704615915e-06, |
|
"loss": 0.6628, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"grad_norm": 0.26017582720690735, |
|
"learning_rate": 4.487986021092844e-06, |
|
"loss": 0.6916, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"grad_norm": 0.2719334401383232, |
|
"learning_rate": 4.456108218613346e-06, |
|
"loss": 0.6935, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 0.2874168693732095, |
|
"learning_rate": 4.424252773394704e-06, |
|
"loss": 0.7013, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 0.27088215577908004, |
|
"learning_rate": 4.392420994889498e-06, |
|
"loss": 0.693, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 0.2532812233498042, |
|
"learning_rate": 4.3606141915774695e-06, |
|
"loss": 0.6762, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 0.263089719520046, |
|
"learning_rate": 4.3288336709117246e-06, |
|
"loss": 0.6677, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 0.2544227492529696, |
|
"learning_rate": 4.297080739264987e-06, |
|
"loss": 0.6744, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 0.2645751047762965, |
|
"learning_rate": 4.265356701875911e-06, |
|
"loss": 0.7047, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 0.2602397068309733, |
|
"learning_rate": 4.23366286279542e-06, |
|
"loss": 0.6792, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 0.27012218470061766, |
|
"learning_rate": 4.2020005248331056e-06, |
|
"loss": 0.6914, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 0.2645729945558582, |
|
"learning_rate": 4.170370989503662e-06, |
|
"loss": 0.6812, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 0.26158176244234604, |
|
"learning_rate": 4.138775556973406e-06, |
|
"loss": 0.6545, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"grad_norm": 0.2609966416888788, |
|
"learning_rate": 4.107215526006818e-06, |
|
"loss": 0.6534, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"grad_norm": 0.2633177456953443, |
|
"learning_rate": 4.075692193913156e-06, |
|
"loss": 0.6617, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 0.2711366514748812, |
|
"learning_rate": 4.04420685649314e-06, |
|
"loss": 0.7026, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 0.26016920693531187, |
|
"learning_rate": 4.012760807985665e-06, |
|
"loss": 0.685, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 0.2634077734164485, |
|
"learning_rate": 3.9813553410146225e-06, |
|
"loss": 0.6732, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"grad_norm": 0.2630739058989318, |
|
"learning_rate": 3.949991746535753e-06, |
|
"loss": 0.6898, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"grad_norm": 0.26810735877032293, |
|
"learning_rate": 3.918671313783583e-06, |
|
"loss": 0.6739, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"grad_norm": 0.2667138269733132, |
|
"learning_rate": 3.887395330218429e-06, |
|
"loss": 0.6649, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"grad_norm": 0.2563222658817468, |
|
"learning_rate": 3.856165081473474e-06, |
|
"loss": 0.708, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"grad_norm": 0.26451201369218524, |
|
"learning_rate": 3.824981851301924e-06, |
|
"loss": 0.6715, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"grad_norm": 0.2598494927634439, |
|
"learning_rate": 3.7938469215242374e-06, |
|
"loss": 0.6955, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"grad_norm": 0.25396403307478727, |
|
"learning_rate": 3.7627615719754294e-06, |
|
"loss": 0.6676, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 0.2532765659210287, |
|
"learning_rate": 3.731727080452464e-06, |
|
"loss": 0.6748, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 0.26061271523616963, |
|
"learning_rate": 3.7007447226617367e-06, |
|
"loss": 0.7058, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 0.25801226716086006, |
|
"learning_rate": 3.669815772166625e-06, |
|
"loss": 0.6717, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"grad_norm": 0.2678578983084559, |
|
"learning_rate": 3.638941500335145e-06, |
|
"loss": 0.6785, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"grad_norm": 0.2629273311111566, |
|
"learning_rate": 3.608123176287685e-06, |
|
"loss": 0.6846, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 0.26372287416738016, |
|
"learning_rate": 3.5773620668448384e-06, |
|
"loss": 0.7155, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 0.2705437923133382, |
|
"learning_rate": 3.5466594364753325e-06, |
|
"loss": 0.6723, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 0.2839291053250124, |
|
"learning_rate": 3.516016547244047e-06, |
|
"loss": 0.7035, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"grad_norm": 0.2760313377640414, |
|
"learning_rate": 3.48543465876014e-06, |
|
"loss": 0.6751, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"grad_norm": 0.25394626546919435, |
|
"learning_rate": 3.4549150281252635e-06, |
|
"loss": 0.6795, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 0.2591221216055477, |
|
"learning_rate": 3.424458909881897e-06, |
|
"loss": 0.6766, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 0.2691782924782941, |
|
"learning_rate": 3.3940675559617724e-06, |
|
"loss": 0.6895, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 0.26815145183562566, |
|
"learning_rate": 3.363742215634416e-06, |
|
"loss": 0.6671, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"grad_norm": 0.26601253229287064, |
|
"learning_rate": 3.3334841354557923e-06, |
|
"loss": 0.6902, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"grad_norm": 0.2759140499002526, |
|
"learning_rate": 3.303294559217063e-06, |
|
"loss": 0.7011, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"grad_norm": 0.2532152571509874, |
|
"learning_rate": 3.273174727893463e-06, |
|
"loss": 0.6631, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"grad_norm": 0.2587732106097895, |
|
"learning_rate": 3.2431258795932863e-06, |
|
"loss": 0.6964, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"grad_norm": 0.26154429819114283, |
|
"learning_rate": 3.213149249506997e-06, |
|
"loss": 0.7018, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 0.2640699829932556, |
|
"learning_rate": 3.183246069856443e-06, |
|
"loss": 0.6809, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 0.26350081604751235, |
|
"learning_rate": 3.1534175698442194e-06, |
|
"loss": 0.655, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 0.2620810766791341, |
|
"learning_rate": 3.12366497560313e-06, |
|
"loss": 0.7034, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 0.26759545817238656, |
|
"learning_rate": 3.093989510145792e-06, |
|
"loss": 0.7238, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 0.26779990918516644, |
|
"learning_rate": 3.0643923933143603e-06, |
|
"loss": 0.6733, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 0.266460909749917, |
|
"learning_rate": 3.0348748417303826e-06, |
|
"loss": 0.6708, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 0.26133722668937404, |
|
"learning_rate": 3.005438068744792e-06, |
|
"loss": 0.6838, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"grad_norm": 0.26893527319559857, |
|
"learning_rate": 2.976083284388031e-06, |
|
"loss": 0.662, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"grad_norm": 0.2699706439018165, |
|
"learning_rate": 2.9468116953203107e-06, |
|
"loss": 0.6867, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"grad_norm": 0.2660357193246072, |
|
"learning_rate": 2.9176245047820064e-06, |
|
"loss": 0.6802, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 0.2676372441332764, |
|
"learning_rate": 2.8885229125442022e-06, |
|
"loss": 0.7143, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 0.26172654998349437, |
|
"learning_rate": 2.859508114859374e-06, |
|
"loss": 0.6688, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 0.26622148926216194, |
|
"learning_rate": 2.83058130441221e-06, |
|
"loss": 0.671, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 0.28436198488003145, |
|
"learning_rate": 2.80174367027059e-06, |
|
"loss": 0.7157, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 0.2637348527869296, |
|
"learning_rate": 2.772996397836704e-06, |
|
"loss": 0.6893, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 0.27094660677035604, |
|
"learning_rate": 2.7443406687983267e-06, |
|
"loss": 0.7149, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 0.26943858696681655, |
|
"learning_rate": 2.7157776610802416e-06, |
|
"loss": 0.6756, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 0.2637431948145577, |
|
"learning_rate": 2.687308548795825e-06, |
|
"loss": 0.6731, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 0.2605764257900338, |
|
"learning_rate": 2.6589345021987725e-06, |
|
"loss": 0.6601, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 0.2691370320929689, |
|
"learning_rate": 2.6306566876350072e-06, |
|
"loss": 0.6747, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"grad_norm": 0.26505353096492007, |
|
"learning_rate": 2.6024762674947313e-06, |
|
"loss": 0.6355, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"grad_norm": 0.2771254154185314, |
|
"learning_rate": 2.5743944001646394e-06, |
|
"loss": 0.6679, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 0.2734747453519109, |
|
"learning_rate": 2.5464122399803126e-06, |
|
"loss": 0.6842, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 0.26644275829657593, |
|
"learning_rate": 2.5185309371787515e-06, |
|
"loss": 0.6986, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 2.921169787601039, |
|
"learning_rate": 2.4907516378511137e-06, |
|
"loss": 0.9339, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"grad_norm": 0.25960894123414624, |
|
"learning_rate": 2.46307548389559e-06, |
|
"loss": 0.6743, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"grad_norm": 0.2637156946948773, |
|
"learning_rate": 2.43550361297047e-06, |
|
"loss": 0.682, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 0.26481304373722364, |
|
"learning_rate": 2.408037158447375e-06, |
|
"loss": 0.6838, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 0.3098032445823631, |
|
"learning_rate": 2.3806772493646725e-06, |
|
"loss": 0.6569, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 0.26219269959571206, |
|
"learning_rate": 2.353425010381063e-06, |
|
"loss": 0.6761, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"grad_norm": 0.2595963563694489, |
|
"learning_rate": 2.3262815617293517e-06, |
|
"loss": 0.6705, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"grad_norm": 6.834107729255299, |
|
"learning_rate": 2.2992480191704003e-06, |
|
"loss": 0.9304, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"grad_norm": 0.27845487671808766, |
|
"learning_rate": 2.272325493947257e-06, |
|
"loss": 0.7032, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"grad_norm": 0.27640918477115356, |
|
"learning_rate": 2.245515092739488e-06, |
|
"loss": 0.6782, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"grad_norm": 0.2807587758407648, |
|
"learning_rate": 2.2188179176176767e-06, |
|
"loss": 0.6932, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"grad_norm": 0.27767259525555504, |
|
"learning_rate": 2.1922350659981262e-06, |
|
"loss": 0.6466, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"grad_norm": 0.267658673895331, |
|
"learning_rate": 2.165767630597752e-06, |
|
"loss": 0.7089, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"grad_norm": 0.2745228363539692, |
|
"learning_rate": 2.139416699389153e-06, |
|
"loss": 0.6778, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"grad_norm": 0.26408657536920843, |
|
"learning_rate": 2.1131833555559037e-06, |
|
"loss": 0.693, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"grad_norm": 0.27582062474841573, |
|
"learning_rate": 2.08706867744802e-06, |
|
"loss": 0.6896, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"grad_norm": 0.26826511006390147, |
|
"learning_rate": 2.061073738537635e-06, |
|
"loss": 0.6796, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"grad_norm": 0.26630216372896987, |
|
"learning_rate": 2.0351996073748713e-06, |
|
"loss": 0.664, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 0.2809694093215496, |
|
"learning_rate": 2.00944734754392e-06, |
|
"loss": 0.7044, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 1.2984772108139377, |
|
"learning_rate": 1.983818017619318e-06, |
|
"loss": 0.9348, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 0.2721943586746493, |
|
"learning_rate": 1.9583126711224342e-06, |
|
"loss": 0.6918, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"grad_norm": 0.27480703238103743, |
|
"learning_rate": 1.932932356478168e-06, |
|
"loss": 0.6854, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"grad_norm": 0.27867368393846137, |
|
"learning_rate": 1.9076781169718426e-06, |
|
"loss": 0.6892, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 0.2747868621778029, |
|
"learning_rate": 1.8825509907063328e-06, |
|
"loss": 0.6947, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 0.2819831023326237, |
|
"learning_rate": 1.857552010559382e-06, |
|
"loss": 0.7059, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 0.27392909848006375, |
|
"learning_rate": 1.8326822041411524e-06, |
|
"loss": 0.6909, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"grad_norm": 0.2743865258535757, |
|
"learning_rate": 1.8079425937519729e-06, |
|
"loss": 0.679, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"grad_norm": 2.5387012983068686, |
|
"learning_rate": 1.7833341963403312e-06, |
|
"loss": 0.8855, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 0.265533513891752, |
|
"learning_rate": 1.7588580234610592e-06, |
|
"loss": 0.6915, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 0.27359948413194535, |
|
"learning_rate": 1.7345150812337564e-06, |
|
"loss": 0.6983, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 0.2742529435490673, |
|
"learning_rate": 1.7103063703014372e-06, |
|
"loss": 0.6712, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 0.27199602044830407, |
|
"learning_rate": 1.6862328857893856e-06, |
|
"loss": 0.6929, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 0.26752081718353027, |
|
"learning_rate": 1.6622956172642601e-06, |
|
"loss": 0.6693, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 0.2729904749450422, |
|
"learning_rate": 1.6384955486934157e-06, |
|
"loss": 0.6545, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 0.27686659493721505, |
|
"learning_rate": 1.6148336584044539e-06, |
|
"loss": 0.6957, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 0.27203211099316454, |
|
"learning_rate": 1.5913109190450033e-06, |
|
"loss": 0.6709, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"grad_norm": 0.2748872025880921, |
|
"learning_rate": 1.567928297542749e-06, |
|
"loss": 0.6648, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"grad_norm": 0.28544562477258, |
|
"learning_rate": 1.544686755065677e-06, |
|
"loss": 0.6937, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 0.27110159404128226, |
|
"learning_rate": 1.5215872469825682e-06, |
|
"loss": 0.6593, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"eval_loss": 0.6996302008628845, |
|
"eval_runtime": 96.9399, |
|
"eval_samples_per_second": 18.238, |
|
"eval_steps_per_second": 0.382, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 0.28648432605335455, |
|
"learning_rate": 1.4986307228237268e-06, |
|
"loss": 0.6883, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 0.2695264027977092, |
|
"learning_rate": 1.4758181262419425e-06, |
|
"loss": 0.6696, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 0.2786040566891135, |
|
"learning_rate": 1.4531503949737107e-06, |
|
"loss": 0.6768, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 0.2730138945863401, |
|
"learning_rate": 1.4306284608006837e-06, |
|
"loss": 0.699, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 0.28711818986138005, |
|
"learning_rate": 1.4082532495113627e-06, |
|
"loss": 0.6961, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 0.27838100192134935, |
|
"learning_rate": 1.3860256808630429e-06, |
|
"loss": 0.6589, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 0.2798399005913698, |
|
"learning_rate": 1.3639466685440133e-06, |
|
"loss": 0.6924, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"grad_norm": 0.2801056534585314, |
|
"learning_rate": 1.3420171201359933e-06, |
|
"loss": 0.7047, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"grad_norm": 0.28576658015544487, |
|
"learning_rate": 1.3202379370768254e-06, |
|
"loss": 0.6617, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 1.3100321738765892, |
|
"learning_rate": 1.298610014623423e-06, |
|
"loss": 0.9236, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 0.303191205875695, |
|
"learning_rate": 1.2771342418149658e-06, |
|
"loss": 0.6896, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 0.29469211064765755, |
|
"learning_rate": 1.2558115014363592e-06, |
|
"loss": 0.6804, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"grad_norm": 0.2772656639598833, |
|
"learning_rate": 1.234642669981946e-06, |
|
"loss": 0.7043, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"grad_norm": 0.28874341170670975, |
|
"learning_rate": 1.2136286176194744e-06, |
|
"loss": 0.6839, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 0.29443238526351323, |
|
"learning_rate": 1.1927702081543279e-06, |
|
"loss": 0.6852, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 0.28011985365824127, |
|
"learning_rate": 1.1720682989940264e-06, |
|
"loss": 0.7019, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 0.2987249208096274, |
|
"learning_rate": 1.1515237411129698e-06, |
|
"loss": 0.6625, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 0.30150125882304396, |
|
"learning_rate": 1.1311373790174656e-06, |
|
"loss": 0.7102, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 0.28396138619493894, |
|
"learning_rate": 1.1109100507110133e-06, |
|
"loss": 0.6538, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 0.28445333602874173, |
|
"learning_rate": 1.0908425876598512e-06, |
|
"loss": 0.6719, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 0.2914051878027514, |
|
"learning_rate": 1.0709358147587883e-06, |
|
"loss": 0.6803, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 0.2969873740157493, |
|
"learning_rate": 1.0511905502972885e-06, |
|
"loss": 0.6845, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 0.27955221055069657, |
|
"learning_rate": 1.031607605925839e-06, |
|
"loss": 0.6819, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 0.2840904640426781, |
|
"learning_rate": 1.0121877866225783e-06, |
|
"loss": 0.6685, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"grad_norm": 0.2866769315662431, |
|
"learning_rate": 9.929318906602176e-07, |
|
"loss": 0.7126, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"grad_norm": 0.28635331619928306, |
|
"learning_rate": 9.738407095732195e-07, |
|
"loss": 0.6825, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"grad_norm": 0.29612030431665504, |
|
"learning_rate": 9.549150281252633e-07, |
|
"loss": 0.6889, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"grad_norm": 0.2800350134356635, |
|
"learning_rate": 9.361556242769871e-07, |
|
"loss": 0.6902, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"grad_norm": 0.303825841465723, |
|
"learning_rate": 9.175632691540065e-07, |
|
"loss": 0.6949, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 0.2869303466691903, |
|
"learning_rate": 8.991387270152202e-07, |
|
"loss": 0.6953, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 0.2822567891211309, |
|
"learning_rate": 8.808827552213917e-07, |
|
"loss": 0.6733, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 0.29613652418881947, |
|
"learning_rate": 8.627961042040183e-07, |
|
"loss": 0.6721, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"grad_norm": 0.2904383828418472, |
|
"learning_rate": 8.448795174344803e-07, |
|
"loss": 0.6849, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"grad_norm": 0.28422518396116003, |
|
"learning_rate": 8.271337313934869e-07, |
|
"loss": 0.676, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 0.2998884374161429, |
|
"learning_rate": 8.095594755407971e-07, |
|
"loss": 0.72, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 0.2877123964038153, |
|
"learning_rate": 7.921574722852343e-07, |
|
"loss": 0.686, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 0.2848351263069502, |
|
"learning_rate": 7.749284369549954e-07, |
|
"loss": 0.6755, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"grad_norm": 3.762805837262192, |
|
"learning_rate": 7.578730777682386e-07, |
|
"loss": 0.9037, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"grad_norm": 0.2782769592021476, |
|
"learning_rate": 7.409920958039795e-07, |
|
"loss": 0.6686, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 0.28369386272785363, |
|
"learning_rate": 7.242861849732696e-07, |
|
"loss": 0.6772, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 0.28870985589458403, |
|
"learning_rate": 7.077560319906696e-07, |
|
"loss": 0.6665, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 0.2880267458624612, |
|
"learning_rate": 6.914023163460248e-07, |
|
"loss": 0.6767, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 0.2879073116640725, |
|
"learning_rate": 6.752257102765325e-07, |
|
"loss": 0.6733, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 0.2978223401759706, |
|
"learning_rate": 6.592268787391077e-07, |
|
"loss": 0.707, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"grad_norm": 0.2781074725093229, |
|
"learning_rate": 6.43406479383053e-07, |
|
"loss": 0.6962, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"grad_norm": 0.29577562012306474, |
|
"learning_rate": 6.277651625230219e-07, |
|
"loss": 0.6772, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"grad_norm": 0.2848699679509908, |
|
"learning_rate": 6.12303571112286e-07, |
|
"loss": 0.7008, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 0.2728708533046375, |
|
"learning_rate": 5.9702234071631e-07, |
|
"loss": 0.6994, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 0.2971147397482144, |
|
"learning_rate": 5.819220994866237e-07, |
|
"loss": 0.6784, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 0.2918077307247773, |
|
"learning_rate": 5.670034681349995e-07, |
|
"loss": 0.6798, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"grad_norm": 0.2766527969263755, |
|
"learning_rate": 5.522670599079416e-07, |
|
"loss": 0.692, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"grad_norm": 0.2896023594106076, |
|
"learning_rate": 5.377134805614714e-07, |
|
"loss": 0.6885, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"grad_norm": 0.29226174571780184, |
|
"learning_rate": 5.233433283362349e-07, |
|
"loss": 0.6609, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"grad_norm": 0.30313380575685006, |
|
"learning_rate": 5.091571939329049e-07, |
|
"loss": 0.6559, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"grad_norm": 0.2851579977806079, |
|
"learning_rate": 4.951556604879049e-07, |
|
"loss": 0.6862, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"grad_norm": 0.28828722620682673, |
|
"learning_rate": 4.813393035494329e-07, |
|
"loss": 0.673, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"grad_norm": 0.283083619090625, |
|
"learning_rate": 4.677086910538092e-07, |
|
"loss": 0.6477, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 0.29809618255032516, |
|
"learning_rate": 4.542643833021254e-07, |
|
"loss": 0.7054, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 0.3086409841744957, |
|
"learning_rate": 4.410069329372152e-07, |
|
"loss": 0.6763, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 0.3016981542599131, |
|
"learning_rate": 4.279368849209381e-07, |
|
"loss": 0.6843, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 0.27698126686942587, |
|
"learning_rate": 4.150547765117746e-07, |
|
"loss": 0.6891, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 0.2793586730481018, |
|
"learning_rate": 4.0236113724274716e-07, |
|
"loss": 0.6796, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 0.28666974827878056, |
|
"learning_rate": 3.8985648889964755e-07, |
|
"loss": 0.6648, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 0.28527293041641727, |
|
"learning_rate": 3.77541345499593e-07, |
|
"loss": 0.7071, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 0.29843069047155474, |
|
"learning_rate": 3.6541621326989183e-07, |
|
"loss": 0.6803, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"grad_norm": 0.3001228333782996, |
|
"learning_rate": 3.534815906272404e-07, |
|
"loss": 0.7176, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"grad_norm": 0.2928174202718284, |
|
"learning_rate": 3.417379681572297e-07, |
|
"loss": 0.6747, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 0.2951128501223636, |
|
"learning_rate": 3.301858285941845e-07, |
|
"loss": 0.7046, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 0.2931876464433515, |
|
"learning_rate": 3.18825646801314e-07, |
|
"loss": 0.6734, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 0.29349560436630445, |
|
"learning_rate": 3.076578897511978e-07, |
|
"loss": 0.6852, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 0.7457047514934827, |
|
"learning_rate": 2.966830165065876e-07, |
|
"loss": 0.9017, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 0.2813462622367945, |
|
"learning_rate": 2.8590147820153513e-07, |
|
"loss": 0.6969, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 0.30904433658187347, |
|
"learning_rate": 2.7531371802285436e-07, |
|
"loss": 0.6829, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 0.28837856935691286, |
|
"learning_rate": 2.6492017119189415e-07, |
|
"loss": 0.6527, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 0.2940858357017207, |
|
"learning_rate": 2.547212649466568e-07, |
|
"loss": 0.6696, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"grad_norm": 0.27956534271888384, |
|
"learning_rate": 2.447174185242324e-07, |
|
"loss": 0.7048, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"grad_norm": 0.28259810422391984, |
|
"learning_rate": 2.3490904314356412e-07, |
|
"loss": 0.6772, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 0.2846763408984384, |
|
"learning_rate": 2.2529654198854834e-07, |
|
"loss": 0.7507, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 0.2784656918785677, |
|
"learning_rate": 2.1588031019145638e-07, |
|
"loss": 0.7072, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 0.28593045031546543, |
|
"learning_rate": 2.0666073481669714e-07, |
|
"loss": 0.6944, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 0.296176705173677, |
|
"learning_rate": 1.9763819484490353e-07, |
|
"loss": 0.6691, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 0.2923471158891105, |
|
"learning_rate": 1.8881306115735632e-07, |
|
"loss": 0.705, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"grad_norm": 0.2769134760770555, |
|
"learning_rate": 1.801856965207338e-07, |
|
"loss": 0.6845, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"grad_norm": 0.30153347516541434, |
|
"learning_rate": 1.7175645557220567e-07, |
|
"loss": 0.6935, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"grad_norm": 0.29938078931561396, |
|
"learning_rate": 1.6352568480485277e-07, |
|
"loss": 0.6822, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 0.2909845977727433, |
|
"learning_rate": 1.5549372255342367e-07, |
|
"loss": 0.6959, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 0.2851155920589762, |
|
"learning_rate": 1.4766089898042678e-07, |
|
"loss": 0.6909, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"grad_norm": 3.590844633307425, |
|
"learning_rate": 1.4002753606256082e-07, |
|
"loss": 0.9279, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"grad_norm": 0.289769942223222, |
|
"learning_rate": 1.3259394757747678e-07, |
|
"loss": 0.6664, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"grad_norm": 1.4756345980091514, |
|
"learning_rate": 1.253604390908819e-07, |
|
"loss": 0.9066, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"grad_norm": 0.2905542054012534, |
|
"learning_rate": 1.1832730794397951e-07, |
|
"loss": 0.6989, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"grad_norm": 0.3056790622208962, |
|
"learning_rate": 1.1149484324124326e-07, |
|
"loss": 0.64, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 0.2915224050071343, |
|
"learning_rate": 1.0486332583853565e-07, |
|
"loss": 0.6411, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 0.2947477867782055, |
|
"learning_rate": 9.843302833156377e-08, |
|
"loss": 0.6901, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 0.6968434274277037, |
|
"learning_rate": 9.22042150446728e-08, |
|
"loss": 0.9234, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"grad_norm": 0.28365980605268926, |
|
"learning_rate": 8.617714201998084e-08, |
|
"loss": 0.6871, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"grad_norm": 0.29456041125148436, |
|
"learning_rate": 8.035205700685167e-08, |
|
"loss": 0.6841, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"grad_norm": 0.3143240746562965, |
|
"learning_rate": 7.47291994517163e-08, |
|
"loss": 0.6793, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"grad_norm": 0.3002190997085016, |
|
"learning_rate": 6.930880048822531e-08, |
|
"loss": 0.6909, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"grad_norm": 0.28198233683666907, |
|
"learning_rate": 6.409108292774912e-08, |
|
"loss": 0.6677, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 0.2974351663902672, |
|
"learning_rate": 5.907626125022159e-08, |
|
"loss": 0.6863, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 0.3167342201027022, |
|
"learning_rate": 5.426454159531913e-08, |
|
"loss": 0.6728, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 0.2838121481556639, |
|
"learning_rate": 4.9656121753990924e-08, |
|
"loss": 0.6765, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 0.28117138518414553, |
|
"learning_rate": 4.52511911603265e-08, |
|
"loss": 0.6827, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 0.29399576903346636, |
|
"learning_rate": 4.104993088376974e-08, |
|
"loss": 0.6933, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"grad_norm": 0.29636385882946953, |
|
"learning_rate": 3.705251362167484e-08, |
|
"loss": 0.6641, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"grad_norm": 0.2913085312105263, |
|
"learning_rate": 3.325910369220975e-08, |
|
"loss": 0.6973, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 0.29080057862930364, |
|
"learning_rate": 2.966985702759828e-08, |
|
"loss": 0.6678, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 0.2952306166117519, |
|
"learning_rate": 2.6284921167712975e-08, |
|
"loss": 0.7017, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 0.2959396595629797, |
|
"learning_rate": 2.3104435254008852e-08, |
|
"loss": 0.6569, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 0.28791733733513286, |
|
"learning_rate": 2.012853002380466e-08, |
|
"loss": 0.6573, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 0.2923526176448832, |
|
"learning_rate": 1.735732780490884e-08, |
|
"loss": 0.6903, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 1.93, |
|
"grad_norm": 0.2908268098513957, |
|
"learning_rate": 1.4790942510590767e-08, |
|
"loss": 0.6756, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 1.93, |
|
"grad_norm": 0.29069043095745606, |
|
"learning_rate": 1.2429479634897268e-08, |
|
"loss": 0.6722, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 1.93, |
|
"grad_norm": 0.28912174671892155, |
|
"learning_rate": 1.0273036248318325e-08, |
|
"loss": 0.6927, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 0.29773909318477504, |
|
"learning_rate": 8.321700993795812e-09, |
|
"loss": 0.6703, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 0.2846360921300275, |
|
"learning_rate": 6.575554083078084e-09, |
|
"loss": 0.6915, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 0.3040183654289367, |
|
"learning_rate": 5.034667293427053e-09, |
|
"loss": 0.6836, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 0.29012455377167307, |
|
"learning_rate": 3.6991039646616657e-09, |
|
"loss": 0.6844, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 0.2778518633390048, |
|
"learning_rate": 2.568918996560532e-09, |
|
"loss": 0.6779, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 0.29541155663074187, |
|
"learning_rate": 1.6441588466009627e-09, |
|
"loss": 0.6979, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 0.28364315270086676, |
|
"learning_rate": 9.248615280499362e-10, |
|
"loss": 0.6792, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"grad_norm": 0.2865142006406564, |
|
"learning_rate": 4.1105660840368154e-10, |
|
"loss": 0.7034, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"grad_norm": 0.2854424675916699, |
|
"learning_rate": 1.0276520816976388e-10, |
|
"loss": 0.6747, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"grad_norm": 0.29666466368391803, |
|
"learning_rate": 0.0, |
|
"loss": 0.6754, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"eval_loss": 0.6983408331871033, |
|
"eval_runtime": 93.907, |
|
"eval_samples_per_second": 18.827, |
|
"eval_steps_per_second": 0.394, |
|
"step": 500 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 250, |
|
"total_flos": 1571976955035648.0, |
|
"train_batch_size": 6, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|