{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9879518072289155, "eval_steps": 500, "global_step": 664, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0030120481927710845, "grad_norm": 0.4040583074092865, "learning_rate": 1.0000000000000002e-06, "loss": 1.684, "step": 1 }, { "epoch": 0.006024096385542169, "grad_norm": 0.4095500409603119, "learning_rate": 2.0000000000000003e-06, "loss": 1.6744, "step": 2 }, { "epoch": 0.009036144578313253, "grad_norm": 0.43334975838661194, "learning_rate": 3e-06, "loss": 1.5956, "step": 3 }, { "epoch": 0.012048192771084338, "grad_norm": 0.39737147092819214, "learning_rate": 4.000000000000001e-06, "loss": 1.6404, "step": 4 }, { "epoch": 0.015060240963855422, "grad_norm": 0.41804248094558716, "learning_rate": 5e-06, "loss": 1.6398, "step": 5 }, { "epoch": 0.018072289156626505, "grad_norm": 0.41640806198120117, "learning_rate": 6e-06, "loss": 1.6439, "step": 6 }, { "epoch": 0.02108433734939759, "grad_norm": 0.41058269143104553, "learning_rate": 7e-06, "loss": 1.595, "step": 7 }, { "epoch": 0.024096385542168676, "grad_norm": 0.3926224708557129, "learning_rate": 8.000000000000001e-06, "loss": 1.6345, "step": 8 }, { "epoch": 0.02710843373493976, "grad_norm": 0.4289781451225281, "learning_rate": 9e-06, "loss": 1.644, "step": 9 }, { "epoch": 0.030120481927710843, "grad_norm": 0.40193450450897217, "learning_rate": 1e-05, "loss": 1.5528, "step": 10 }, { "epoch": 0.03313253012048193, "grad_norm": 0.4374159574508667, "learning_rate": 9.999942312273667e-06, "loss": 1.6132, "step": 11 }, { "epoch": 0.03614457831325301, "grad_norm": 0.42067164182662964, "learning_rate": 9.999769250425817e-06, "loss": 1.5811, "step": 12 }, { "epoch": 0.0391566265060241, "grad_norm": 0.45214423537254333, "learning_rate": 9.999480818449868e-06, "loss": 1.5985, "step": 13 }, { "epoch": 0.04216867469879518, "grad_norm": 0.42575493454933167, "learning_rate": 9.999077023001411e-06, "loss": 1.6494, "step": 14 }, { "epoch": 0.045180722891566265, "grad_norm": 0.4473470151424408, "learning_rate": 9.998557873398066e-06, "loss": 1.6773, "step": 15 }, { "epoch": 0.04819277108433735, "grad_norm": 0.4473958909511566, "learning_rate": 9.997923381619257e-06, "loss": 1.6342, "step": 16 }, { "epoch": 0.05120481927710843, "grad_norm": 0.46099603176116943, "learning_rate": 9.997173562305937e-06, "loss": 1.5608, "step": 17 }, { "epoch": 0.05421686746987952, "grad_norm": 0.44749438762664795, "learning_rate": 9.996308432760257e-06, "loss": 1.6081, "step": 18 }, { "epoch": 0.0572289156626506, "grad_norm": 0.46720796823501587, "learning_rate": 9.995328012945158e-06, "loss": 1.595, "step": 19 }, { "epoch": 0.060240963855421686, "grad_norm": 0.436519056558609, "learning_rate": 9.994232325483917e-06, "loss": 1.6167, "step": 20 }, { "epoch": 0.06325301204819277, "grad_norm": 0.42265141010284424, "learning_rate": 9.99302139565962e-06, "loss": 1.6195, "step": 21 }, { "epoch": 0.06626506024096386, "grad_norm": 0.3948360085487366, "learning_rate": 9.991695251414584e-06, "loss": 1.5915, "step": 22 }, { "epoch": 0.06927710843373494, "grad_norm": 0.4320020377635956, "learning_rate": 9.990253923349706e-06, "loss": 1.5906, "step": 23 }, { "epoch": 0.07228915662650602, "grad_norm": 0.44574347138404846, "learning_rate": 9.988697444723763e-06, "loss": 1.5712, "step": 24 }, { "epoch": 0.07530120481927711, "grad_norm": 0.41239818930625916, "learning_rate": 9.98702585145264e-06, "loss": 1.5752, "step": 25 }, { "epoch": 0.0783132530120482, "grad_norm": 0.42800942063331604, "learning_rate": 9.9852391821085e-06, "loss": 1.5531, "step": 26 }, { "epoch": 0.08132530120481928, "grad_norm": 0.41428813338279724, "learning_rate": 9.983337477918904e-06, "loss": 1.5882, "step": 27 }, { "epoch": 0.08433734939759036, "grad_norm": 0.4180893898010254, "learning_rate": 9.981320782765847e-06, "loss": 1.5694, "step": 28 }, { "epoch": 0.08734939759036145, "grad_norm": 0.4115102291107178, "learning_rate": 9.97918914318475e-06, "loss": 1.5347, "step": 29 }, { "epoch": 0.09036144578313253, "grad_norm": 0.42680180072784424, "learning_rate": 9.976942608363394e-06, "loss": 1.5275, "step": 30 }, { "epoch": 0.09337349397590361, "grad_norm": 0.39122140407562256, "learning_rate": 9.97458123014077e-06, "loss": 1.4961, "step": 31 }, { "epoch": 0.0963855421686747, "grad_norm": 0.39151236414909363, "learning_rate": 9.972105063005895e-06, "loss": 1.5359, "step": 32 }, { "epoch": 0.09939759036144578, "grad_norm": 0.38214412331581116, "learning_rate": 9.969514164096548e-06, "loss": 1.5103, "step": 33 }, { "epoch": 0.10240963855421686, "grad_norm": 0.39758872985839844, "learning_rate": 9.966808593197959e-06, "loss": 1.4839, "step": 34 }, { "epoch": 0.10542168674698796, "grad_norm": 0.3730682134628296, "learning_rate": 9.96398841274142e-06, "loss": 1.4908, "step": 35 }, { "epoch": 0.10843373493975904, "grad_norm": 0.367106556892395, "learning_rate": 9.96105368780285e-06, "loss": 1.4372, "step": 36 }, { "epoch": 0.11144578313253012, "grad_norm": 0.3657713532447815, "learning_rate": 9.958004486101293e-06, "loss": 1.4791, "step": 37 }, { "epoch": 0.1144578313253012, "grad_norm": 0.3604431450366974, "learning_rate": 9.954840877997356e-06, "loss": 1.443, "step": 38 }, { "epoch": 0.11746987951807229, "grad_norm": 0.3567802906036377, "learning_rate": 9.95156293649158e-06, "loss": 1.4589, "step": 39 }, { "epoch": 0.12048192771084337, "grad_norm": 0.3510221838951111, "learning_rate": 9.948170737222763e-06, "loss": 1.378, "step": 40 }, { "epoch": 0.12349397590361445, "grad_norm": 0.35590696334838867, "learning_rate": 9.94466435846621e-06, "loss": 1.4064, "step": 41 }, { "epoch": 0.12650602409638553, "grad_norm": 0.3688894510269165, "learning_rate": 9.941043881131928e-06, "loss": 1.3728, "step": 42 }, { "epoch": 0.12951807228915663, "grad_norm": 0.35922420024871826, "learning_rate": 9.93730938876276e-06, "loss": 1.3809, "step": 43 }, { "epoch": 0.13253012048192772, "grad_norm": 0.34513840079307556, "learning_rate": 9.933460967532454e-06, "loss": 1.4276, "step": 44 }, { "epoch": 0.1355421686746988, "grad_norm": 0.34767019748687744, "learning_rate": 9.929498706243681e-06, "loss": 1.3542, "step": 45 }, { "epoch": 0.13855421686746988, "grad_norm": 0.3442816138267517, "learning_rate": 9.925422696325976e-06, "loss": 1.3512, "step": 46 }, { "epoch": 0.14156626506024098, "grad_norm": 0.36368539929389954, "learning_rate": 9.921233031833639e-06, "loss": 1.3736, "step": 47 }, { "epoch": 0.14457831325301204, "grad_norm": 0.33587586879730225, "learning_rate": 9.916929809443555e-06, "loss": 1.3906, "step": 48 }, { "epoch": 0.14759036144578314, "grad_norm": 0.34448426961898804, "learning_rate": 9.912513128452974e-06, "loss": 1.362, "step": 49 }, { "epoch": 0.15060240963855423, "grad_norm": 0.3427204489707947, "learning_rate": 9.907983090777206e-06, "loss": 1.3292, "step": 50 }, { "epoch": 0.1536144578313253, "grad_norm": 0.38191652297973633, "learning_rate": 9.903339800947284e-06, "loss": 1.3735, "step": 51 }, { "epoch": 0.1566265060240964, "grad_norm": 0.3497113287448883, "learning_rate": 9.898583366107539e-06, "loss": 1.3776, "step": 52 }, { "epoch": 0.15963855421686746, "grad_norm": 0.31867995858192444, "learning_rate": 9.893713896013134e-06, "loss": 1.3058, "step": 53 }, { "epoch": 0.16265060240963855, "grad_norm": 0.3136507272720337, "learning_rate": 9.888731503027535e-06, "loss": 1.3463, "step": 54 }, { "epoch": 0.16566265060240964, "grad_norm": 0.3268043100833893, "learning_rate": 9.883636302119911e-06, "loss": 1.3189, "step": 55 }, { "epoch": 0.1686746987951807, "grad_norm": 0.3265782296657562, "learning_rate": 9.878428410862484e-06, "loss": 1.2983, "step": 56 }, { "epoch": 0.1716867469879518, "grad_norm": 0.30159902572631836, "learning_rate": 9.873107949427815e-06, "loss": 1.3545, "step": 57 }, { "epoch": 0.1746987951807229, "grad_norm": 0.3109259307384491, "learning_rate": 9.867675040586035e-06, "loss": 1.3894, "step": 58 }, { "epoch": 0.17771084337349397, "grad_norm": 0.30744504928588867, "learning_rate": 9.862129809702006e-06, "loss": 1.3777, "step": 59 }, { "epoch": 0.18072289156626506, "grad_norm": 0.3043947219848633, "learning_rate": 9.856472384732432e-06, "loss": 1.354, "step": 60 }, { "epoch": 0.18373493975903615, "grad_norm": 0.3052617013454437, "learning_rate": 9.850702896222908e-06, "loss": 1.3074, "step": 61 }, { "epoch": 0.18674698795180722, "grad_norm": 0.3007952570915222, "learning_rate": 9.844821477304904e-06, "loss": 1.2124, "step": 62 }, { "epoch": 0.1897590361445783, "grad_norm": 0.2832448482513428, "learning_rate": 9.838828263692693e-06, "loss": 1.2841, "step": 63 }, { "epoch": 0.1927710843373494, "grad_norm": 0.27628499269485474, "learning_rate": 9.832723393680222e-06, "loss": 1.2425, "step": 64 }, { "epoch": 0.19578313253012047, "grad_norm": 0.2700969874858856, "learning_rate": 9.826507008137919e-06, "loss": 1.2543, "step": 65 }, { "epoch": 0.19879518072289157, "grad_norm": 0.2948736548423767, "learning_rate": 9.820179250509442e-06, "loss": 1.2708, "step": 66 }, { "epoch": 0.20180722891566266, "grad_norm": 0.29045990109443665, "learning_rate": 9.813740266808375e-06, "loss": 1.3043, "step": 67 }, { "epoch": 0.20481927710843373, "grad_norm": 0.27807915210723877, "learning_rate": 9.807190205614847e-06, "loss": 1.206, "step": 68 }, { "epoch": 0.20783132530120482, "grad_norm": 0.267451673746109, "learning_rate": 9.800529218072112e-06, "loss": 1.2255, "step": 69 }, { "epoch": 0.21084337349397592, "grad_norm": 0.2782948613166809, "learning_rate": 9.793757457883062e-06, "loss": 1.2236, "step": 70 }, { "epoch": 0.21385542168674698, "grad_norm": 0.276692271232605, "learning_rate": 9.786875081306677e-06, "loss": 1.2588, "step": 71 }, { "epoch": 0.21686746987951808, "grad_norm": 0.2745719254016876, "learning_rate": 9.779882247154419e-06, "loss": 1.215, "step": 72 }, { "epoch": 0.21987951807228914, "grad_norm": 0.2591319978237152, "learning_rate": 9.772779116786568e-06, "loss": 1.2833, "step": 73 }, { "epoch": 0.22289156626506024, "grad_norm": 0.27248722314834595, "learning_rate": 9.765565854108503e-06, "loss": 1.2575, "step": 74 }, { "epoch": 0.22590361445783133, "grad_norm": 0.273562490940094, "learning_rate": 9.758242625566912e-06, "loss": 1.2134, "step": 75 }, { "epoch": 0.2289156626506024, "grad_norm": 0.29504141211509705, "learning_rate": 9.750809600145955e-06, "loss": 1.2222, "step": 76 }, { "epoch": 0.2319277108433735, "grad_norm": 0.2649330496788025, "learning_rate": 9.743266949363368e-06, "loss": 1.1992, "step": 77 }, { "epoch": 0.23493975903614459, "grad_norm": 0.26566869020462036, "learning_rate": 9.735614847266502e-06, "loss": 1.2432, "step": 78 }, { "epoch": 0.23795180722891565, "grad_norm": 0.25488752126693726, "learning_rate": 9.727853470428301e-06, "loss": 1.1646, "step": 79 }, { "epoch": 0.24096385542168675, "grad_norm": 0.280771404504776, "learning_rate": 9.719982997943245e-06, "loss": 1.2075, "step": 80 }, { "epoch": 0.24397590361445784, "grad_norm": 0.3047221899032593, "learning_rate": 9.712003611423194e-06, "loss": 1.2378, "step": 81 }, { "epoch": 0.2469879518072289, "grad_norm": 0.2466488480567932, "learning_rate": 9.703915494993215e-06, "loss": 1.2169, "step": 82 }, { "epoch": 0.25, "grad_norm": 0.25351837277412415, "learning_rate": 9.695718835287328e-06, "loss": 1.1995, "step": 83 }, { "epoch": 0.25301204819277107, "grad_norm": 0.26343056559562683, "learning_rate": 9.6874138214442e-06, "loss": 1.2329, "step": 84 }, { "epoch": 0.2560240963855422, "grad_norm": 0.2685820460319519, "learning_rate": 9.679000645102771e-06, "loss": 1.2054, "step": 85 }, { "epoch": 0.25903614457831325, "grad_norm": 0.2713760733604431, "learning_rate": 9.670479500397854e-06, "loss": 1.2417, "step": 86 }, { "epoch": 0.2620481927710843, "grad_norm": 0.2718029320240021, "learning_rate": 9.66185058395563e-06, "loss": 1.2587, "step": 87 }, { "epoch": 0.26506024096385544, "grad_norm": 0.26352736353874207, "learning_rate": 9.653114094889128e-06, "loss": 1.2541, "step": 88 }, { "epoch": 0.2680722891566265, "grad_norm": 0.2755894362926483, "learning_rate": 9.644270234793625e-06, "loss": 1.2091, "step": 89 }, { "epoch": 0.2710843373493976, "grad_norm": 0.25112101435661316, "learning_rate": 9.63531920774199e-06, "loss": 1.2425, "step": 90 }, { "epoch": 0.2740963855421687, "grad_norm": 0.26138511300086975, "learning_rate": 9.62626122027999e-06, "loss": 1.2252, "step": 91 }, { "epoch": 0.27710843373493976, "grad_norm": 0.2561100721359253, "learning_rate": 9.617096481421498e-06, "loss": 1.2206, "step": 92 }, { "epoch": 0.28012048192771083, "grad_norm": 0.26238083839416504, "learning_rate": 9.607825202643696e-06, "loss": 1.1859, "step": 93 }, { "epoch": 0.28313253012048195, "grad_norm": 0.27538710832595825, "learning_rate": 9.598447597882181e-06, "loss": 1.2062, "step": 94 }, { "epoch": 0.286144578313253, "grad_norm": 0.25858640670776367, "learning_rate": 9.588963883526033e-06, "loss": 1.2354, "step": 95 }, { "epoch": 0.2891566265060241, "grad_norm": 0.2807197570800781, "learning_rate": 9.579374278412819e-06, "loss": 1.2433, "step": 96 }, { "epoch": 0.2921686746987952, "grad_norm": 0.28452298045158386, "learning_rate": 9.569679003823542e-06, "loss": 1.2191, "step": 97 }, { "epoch": 0.29518072289156627, "grad_norm": 0.25671708583831787, "learning_rate": 9.559878283477546e-06, "loss": 1.2095, "step": 98 }, { "epoch": 0.29819277108433734, "grad_norm": 0.25289785861968994, "learning_rate": 9.549972343527336e-06, "loss": 1.2033, "step": 99 }, { "epoch": 0.30120481927710846, "grad_norm": 0.27585139870643616, "learning_rate": 9.539961412553375e-06, "loss": 1.149, "step": 100 }, { "epoch": 0.3042168674698795, "grad_norm": 0.2492348849773407, "learning_rate": 9.529845721558802e-06, "loss": 1.1271, "step": 101 }, { "epoch": 0.3072289156626506, "grad_norm": 0.254409521818161, "learning_rate": 9.5196255039641e-06, "loss": 1.2528, "step": 102 }, { "epoch": 0.3102409638554217, "grad_norm": 0.3059585690498352, "learning_rate": 9.50930099560172e-06, "loss": 1.2058, "step": 103 }, { "epoch": 0.3132530120481928, "grad_norm": 0.2655487656593323, "learning_rate": 9.498872434710624e-06, "loss": 1.1311, "step": 104 }, { "epoch": 0.31626506024096385, "grad_norm": 0.271914005279541, "learning_rate": 9.488340061930797e-06, "loss": 1.1831, "step": 105 }, { "epoch": 0.3192771084337349, "grad_norm": 0.29053163528442383, "learning_rate": 9.477704120297698e-06, "loss": 1.1585, "step": 106 }, { "epoch": 0.32228915662650603, "grad_norm": 0.26874732971191406, "learning_rate": 9.46696485523664e-06, "loss": 1.2012, "step": 107 }, { "epoch": 0.3253012048192771, "grad_norm": 0.25582486391067505, "learning_rate": 9.45612251455714e-06, "loss": 1.1397, "step": 108 }, { "epoch": 0.32831325301204817, "grad_norm": 0.26407524943351746, "learning_rate": 9.445177348447187e-06, "loss": 1.1887, "step": 109 }, { "epoch": 0.3313253012048193, "grad_norm": 0.2506115734577179, "learning_rate": 9.434129609467484e-06, "loss": 1.2219, "step": 110 }, { "epoch": 0.33433734939759036, "grad_norm": 0.2572745084762573, "learning_rate": 9.422979552545604e-06, "loss": 1.1362, "step": 111 }, { "epoch": 0.3373493975903614, "grad_norm": 0.28277891874313354, "learning_rate": 9.411727434970121e-06, "loss": 1.1409, "step": 112 }, { "epoch": 0.34036144578313254, "grad_norm": 0.30223405361175537, "learning_rate": 9.400373516384671e-06, "loss": 1.1546, "step": 113 }, { "epoch": 0.3433734939759036, "grad_norm": 0.2697835862636566, "learning_rate": 9.388918058781947e-06, "loss": 1.2384, "step": 114 }, { "epoch": 0.3463855421686747, "grad_norm": 0.2695978283882141, "learning_rate": 9.377361326497673e-06, "loss": 1.1364, "step": 115 }, { "epoch": 0.3493975903614458, "grad_norm": 0.25360485911369324, "learning_rate": 9.365703586204495e-06, "loss": 1.178, "step": 116 }, { "epoch": 0.35240963855421686, "grad_norm": 0.27773186564445496, "learning_rate": 9.353945106905822e-06, "loss": 1.1682, "step": 117 }, { "epoch": 0.35542168674698793, "grad_norm": 0.27416011691093445, "learning_rate": 9.342086159929629e-06, "loss": 1.1477, "step": 118 }, { "epoch": 0.35843373493975905, "grad_norm": 0.27577441930770874, "learning_rate": 9.330127018922195e-06, "loss": 1.1497, "step": 119 }, { "epoch": 0.3614457831325301, "grad_norm": 0.2513567805290222, "learning_rate": 9.318067959841776e-06, "loss": 1.1339, "step": 120 }, { "epoch": 0.3644578313253012, "grad_norm": 0.27182286977767944, "learning_rate": 9.305909260952255e-06, "loss": 1.1362, "step": 121 }, { "epoch": 0.3674698795180723, "grad_norm": 0.26553475856781006, "learning_rate": 9.29365120281671e-06, "loss": 1.1576, "step": 122 }, { "epoch": 0.3704819277108434, "grad_norm": 0.25539693236351013, "learning_rate": 9.28129406829094e-06, "loss": 1.1384, "step": 123 }, { "epoch": 0.37349397590361444, "grad_norm": 0.2685853838920593, "learning_rate": 9.268838142516943e-06, "loss": 1.1842, "step": 124 }, { "epoch": 0.37650602409638556, "grad_norm": 0.2612561881542206, "learning_rate": 9.256283712916337e-06, "loss": 1.1578, "step": 125 }, { "epoch": 0.3795180722891566, "grad_norm": 0.26739126443862915, "learning_rate": 9.24363106918372e-06, "loss": 1.1477, "step": 126 }, { "epoch": 0.3825301204819277, "grad_norm": 0.2942097783088684, "learning_rate": 9.230880503279991e-06, "loss": 1.1747, "step": 127 }, { "epoch": 0.3855421686746988, "grad_norm": 0.2746829688549042, "learning_rate": 9.218032309425613e-06, "loss": 1.1651, "step": 128 }, { "epoch": 0.3885542168674699, "grad_norm": 0.27550533413887024, "learning_rate": 9.205086784093823e-06, "loss": 1.1361, "step": 129 }, { "epoch": 0.39156626506024095, "grad_norm": 0.31240707635879517, "learning_rate": 9.19204422600379e-06, "loss": 1.1933, "step": 130 }, { "epoch": 0.39457831325301207, "grad_norm": 0.24383339285850525, "learning_rate": 9.178904936113719e-06, "loss": 1.1739, "step": 131 }, { "epoch": 0.39759036144578314, "grad_norm": 0.3256170153617859, "learning_rate": 9.165669217613919e-06, "loss": 1.1709, "step": 132 }, { "epoch": 0.4006024096385542, "grad_norm": 0.2967703938484192, "learning_rate": 9.152337375919792e-06, "loss": 1.1379, "step": 133 }, { "epoch": 0.4036144578313253, "grad_norm": 0.2854821979999542, "learning_rate": 9.138909718664788e-06, "loss": 1.1741, "step": 134 }, { "epoch": 0.4066265060240964, "grad_norm": 0.33066266775131226, "learning_rate": 9.125386555693316e-06, "loss": 1.1779, "step": 135 }, { "epoch": 0.40963855421686746, "grad_norm": 0.27965742349624634, "learning_rate": 9.111768199053588e-06, "loss": 1.1717, "step": 136 }, { "epoch": 0.4126506024096386, "grad_norm": 0.29010990262031555, "learning_rate": 9.098054962990415e-06, "loss": 1.1526, "step": 137 }, { "epoch": 0.41566265060240964, "grad_norm": 0.2726079523563385, "learning_rate": 9.084247163937959e-06, "loss": 1.1136, "step": 138 }, { "epoch": 0.4186746987951807, "grad_norm": 0.2590181231498718, "learning_rate": 9.070345120512436e-06, "loss": 1.1267, "step": 139 }, { "epoch": 0.42168674698795183, "grad_norm": 0.291429340839386, "learning_rate": 9.056349153504753e-06, "loss": 1.1429, "step": 140 }, { "epoch": 0.4246987951807229, "grad_norm": 0.2864663004875183, "learning_rate": 9.042259585873119e-06, "loss": 1.1161, "step": 141 }, { "epoch": 0.42771084337349397, "grad_norm": 0.29812097549438477, "learning_rate": 9.028076742735583e-06, "loss": 1.157, "step": 142 }, { "epoch": 0.4307228915662651, "grad_norm": 0.29142752289772034, "learning_rate": 9.013800951362532e-06, "loss": 1.0919, "step": 143 }, { "epoch": 0.43373493975903615, "grad_norm": 0.2857559621334076, "learning_rate": 8.999432541169145e-06, "loss": 1.1391, "step": 144 }, { "epoch": 0.4367469879518072, "grad_norm": 0.29825499653816223, "learning_rate": 8.984971843707787e-06, "loss": 1.1589, "step": 145 }, { "epoch": 0.4397590361445783, "grad_norm": 0.26081719994544983, "learning_rate": 8.970419192660366e-06, "loss": 1.1411, "step": 146 }, { "epoch": 0.4427710843373494, "grad_norm": 0.3022754490375519, "learning_rate": 8.955774923830618e-06, "loss": 1.1528, "step": 147 }, { "epoch": 0.4457831325301205, "grad_norm": 0.28860539197921753, "learning_rate": 8.94103937513637e-06, "loss": 1.1784, "step": 148 }, { "epoch": 0.44879518072289154, "grad_norm": 0.25238746404647827, "learning_rate": 8.92621288660175e-06, "loss": 1.1447, "step": 149 }, { "epoch": 0.45180722891566266, "grad_norm": 0.2728082239627838, "learning_rate": 8.911295800349316e-06, "loss": 1.0984, "step": 150 }, { "epoch": 0.45481927710843373, "grad_norm": 0.26758912205696106, "learning_rate": 8.896288460592187e-06, "loss": 1.0918, "step": 151 }, { "epoch": 0.4578313253012048, "grad_norm": 0.27047985792160034, "learning_rate": 8.881191213626084e-06, "loss": 1.1279, "step": 152 }, { "epoch": 0.4608433734939759, "grad_norm": 0.309121698141098, "learning_rate": 8.86600440782135e-06, "loss": 1.1366, "step": 153 }, { "epoch": 0.463855421686747, "grad_norm": 0.2778535485267639, "learning_rate": 8.850728393614903e-06, "loss": 1.1423, "step": 154 }, { "epoch": 0.46686746987951805, "grad_norm": 0.2797792851924896, "learning_rate": 8.835363523502154e-06, "loss": 1.1664, "step": 155 }, { "epoch": 0.46987951807228917, "grad_norm": 0.3094732463359833, "learning_rate": 8.819910152028872e-06, "loss": 1.1295, "step": 156 }, { "epoch": 0.47289156626506024, "grad_norm": 0.2910013496875763, "learning_rate": 8.804368635783002e-06, "loss": 1.0793, "step": 157 }, { "epoch": 0.4759036144578313, "grad_norm": 0.26490893959999084, "learning_rate": 8.788739333386443e-06, "loss": 1.092, "step": 158 }, { "epoch": 0.4789156626506024, "grad_norm": 0.25550705194473267, "learning_rate": 8.773022605486755e-06, "loss": 1.1325, "step": 159 }, { "epoch": 0.4819277108433735, "grad_norm": 0.2488010972738266, "learning_rate": 8.75721881474886e-06, "loss": 1.0885, "step": 160 }, { "epoch": 0.48493975903614456, "grad_norm": 0.3159677982330322, "learning_rate": 8.741328325846663e-06, "loss": 1.1544, "step": 161 }, { "epoch": 0.4879518072289157, "grad_norm": 0.30506086349487305, "learning_rate": 8.725351505454631e-06, "loss": 1.1716, "step": 162 }, { "epoch": 0.49096385542168675, "grad_norm": 0.29045408964157104, "learning_rate": 8.709288722239345e-06, "loss": 1.1199, "step": 163 }, { "epoch": 0.4939759036144578, "grad_norm": 0.2709057033061981, "learning_rate": 8.693140346850975e-06, "loss": 1.113, "step": 164 }, { "epoch": 0.49698795180722893, "grad_norm": 0.28410249948501587, "learning_rate": 8.67690675191475e-06, "loss": 1.1383, "step": 165 }, { "epoch": 0.5, "grad_norm": 0.29826584458351135, "learning_rate": 8.660588312022345e-06, "loss": 1.0619, "step": 166 }, { "epoch": 0.5030120481927711, "grad_norm": 0.3092498779296875, "learning_rate": 8.644185403723231e-06, "loss": 1.1101, "step": 167 }, { "epoch": 0.5060240963855421, "grad_norm": 0.30253866314888, "learning_rate": 8.627698405516007e-06, "loss": 1.0649, "step": 168 }, { "epoch": 0.5090361445783133, "grad_norm": 0.2906908690929413, "learning_rate": 8.611127697839649e-06, "loss": 1.1436, "step": 169 }, { "epoch": 0.5120481927710844, "grad_norm": 0.30768147110939026, "learning_rate": 8.594473663064735e-06, "loss": 1.1116, "step": 170 }, { "epoch": 0.5150602409638554, "grad_norm": 0.3316003680229187, "learning_rate": 8.577736685484626e-06, "loss": 1.1484, "step": 171 }, { "epoch": 0.5180722891566265, "grad_norm": 0.3070067763328552, "learning_rate": 8.560917151306594e-06, "loss": 1.144, "step": 172 }, { "epoch": 0.5210843373493976, "grad_norm": 0.27163851261138916, "learning_rate": 8.544015448642916e-06, "loss": 1.1071, "step": 173 }, { "epoch": 0.5240963855421686, "grad_norm": 0.2992447316646576, "learning_rate": 8.527031967501906e-06, "loss": 1.1647, "step": 174 }, { "epoch": 0.5271084337349398, "grad_norm": 0.262173593044281, "learning_rate": 8.509967099778934e-06, "loss": 1.2107, "step": 175 }, { "epoch": 0.5301204819277109, "grad_norm": 0.33722054958343506, "learning_rate": 8.492821239247365e-06, "loss": 1.0553, "step": 176 }, { "epoch": 0.5331325301204819, "grad_norm": 0.27636295557022095, "learning_rate": 8.475594781549483e-06, "loss": 1.1275, "step": 177 }, { "epoch": 0.536144578313253, "grad_norm": 0.2799915671348572, "learning_rate": 8.45828812418736e-06, "loss": 1.0764, "step": 178 }, { "epoch": 0.5391566265060241, "grad_norm": 0.27971795201301575, "learning_rate": 8.44090166651368e-06, "loss": 1.0634, "step": 179 }, { "epoch": 0.5421686746987951, "grad_norm": 0.3047524690628052, "learning_rate": 8.42343580972253e-06, "loss": 1.1203, "step": 180 }, { "epoch": 0.5451807228915663, "grad_norm": 0.3009694218635559, "learning_rate": 8.405890956840136e-06, "loss": 1.1168, "step": 181 }, { "epoch": 0.5481927710843374, "grad_norm": 0.30559536814689636, "learning_rate": 8.388267512715565e-06, "loss": 1.113, "step": 182 }, { "epoch": 0.5512048192771084, "grad_norm": 0.3450864851474762, "learning_rate": 8.370565884011389e-06, "loss": 1.0621, "step": 183 }, { "epoch": 0.5542168674698795, "grad_norm": 0.3391083776950836, "learning_rate": 8.352786479194288e-06, "loss": 1.1276, "step": 184 }, { "epoch": 0.5572289156626506, "grad_norm": 0.3621962070465088, "learning_rate": 8.33492970852564e-06, "loss": 1.081, "step": 185 }, { "epoch": 0.5602409638554217, "grad_norm": 0.28517264127731323, "learning_rate": 8.316995984052048e-06, "loss": 1.0723, "step": 186 }, { "epoch": 0.5632530120481928, "grad_norm": 0.3252887427806854, "learning_rate": 8.298985719595824e-06, "loss": 1.0727, "step": 187 }, { "epoch": 0.5662650602409639, "grad_norm": 0.3289787769317627, "learning_rate": 8.280899330745452e-06, "loss": 1.0726, "step": 188 }, { "epoch": 0.5692771084337349, "grad_norm": 0.3431254029273987, "learning_rate": 8.262737234845993e-06, "loss": 1.0908, "step": 189 }, { "epoch": 0.572289156626506, "grad_norm": 0.295175164937973, "learning_rate": 8.244499850989453e-06, "loss": 1.1408, "step": 190 }, { "epoch": 0.5753012048192772, "grad_norm": 0.3039282262325287, "learning_rate": 8.226187600005116e-06, "loss": 1.1105, "step": 191 }, { "epoch": 0.5783132530120482, "grad_norm": 0.30349868535995483, "learning_rate": 8.207800904449829e-06, "loss": 1.109, "step": 192 }, { "epoch": 0.5813253012048193, "grad_norm": 0.3329324722290039, "learning_rate": 8.189340188598263e-06, "loss": 1.0828, "step": 193 }, { "epoch": 0.5843373493975904, "grad_norm": 0.32696786522865295, "learning_rate": 8.1708058784331e-06, "loss": 1.116, "step": 194 }, { "epoch": 0.5873493975903614, "grad_norm": 0.30085158348083496, "learning_rate": 8.15219840163523e-06, "loss": 1.141, "step": 195 }, { "epoch": 0.5903614457831325, "grad_norm": 0.30034953355789185, "learning_rate": 8.133518187573864e-06, "loss": 1.1254, "step": 196 }, { "epoch": 0.5933734939759037, "grad_norm": 0.35607779026031494, "learning_rate": 8.114765667296628e-06, "loss": 1.0621, "step": 197 }, { "epoch": 0.5963855421686747, "grad_norm": 0.30774402618408203, "learning_rate": 8.095941273519634e-06, "loss": 1.0462, "step": 198 }, { "epoch": 0.5993975903614458, "grad_norm": 0.3458847999572754, "learning_rate": 8.077045440617465e-06, "loss": 1.0695, "step": 199 }, { "epoch": 0.6024096385542169, "grad_norm": 0.3302537202835083, "learning_rate": 8.058078604613178e-06, "loss": 1.1314, "step": 200 }, { "epoch": 0.6054216867469879, "grad_norm": 0.32025519013404846, "learning_rate": 8.039041203168233e-06, "loss": 1.1179, "step": 201 }, { "epoch": 0.608433734939759, "grad_norm": 0.32808589935302734, "learning_rate": 8.019933675572389e-06, "loss": 1.1393, "step": 202 }, { "epoch": 0.6114457831325302, "grad_norm": 0.31607547402381897, "learning_rate": 8.000756462733577e-06, "loss": 1.1027, "step": 203 }, { "epoch": 0.6144578313253012, "grad_norm": 0.33204394578933716, "learning_rate": 7.981510007167719e-06, "loss": 1.0795, "step": 204 }, { "epoch": 0.6174698795180723, "grad_norm": 0.3012982904911041, "learning_rate": 7.962194752988519e-06, "loss": 1.104, "step": 205 }, { "epoch": 0.6204819277108434, "grad_norm": 0.28379830718040466, "learning_rate": 7.942811145897215e-06, "loss": 1.1108, "step": 206 }, { "epoch": 0.6234939759036144, "grad_norm": 0.3218439817428589, "learning_rate": 7.923359633172299e-06, "loss": 1.0856, "step": 207 }, { "epoch": 0.6265060240963856, "grad_norm": 0.2985135614871979, "learning_rate": 7.903840663659186e-06, "loss": 1.1621, "step": 208 }, { "epoch": 0.6295180722891566, "grad_norm": 0.3362099528312683, "learning_rate": 7.884254687759863e-06, "loss": 1.1173, "step": 209 }, { "epoch": 0.6325301204819277, "grad_norm": 0.32187989354133606, "learning_rate": 7.864602157422501e-06, "loss": 1.1293, "step": 210 }, { "epoch": 0.6355421686746988, "grad_norm": 0.34748998284339905, "learning_rate": 7.844883526131014e-06, "loss": 1.1501, "step": 211 }, { "epoch": 0.6385542168674698, "grad_norm": 0.2776443660259247, "learning_rate": 7.8250992488946e-06, "loss": 1.1272, "step": 212 }, { "epoch": 0.641566265060241, "grad_norm": 0.34776571393013, "learning_rate": 7.805249782237256e-06, "loss": 1.0993, "step": 213 }, { "epoch": 0.6445783132530121, "grad_norm": 0.3251356780529022, "learning_rate": 7.78533558418722e-06, "loss": 1.0717, "step": 214 }, { "epoch": 0.6475903614457831, "grad_norm": 0.32606494426727295, "learning_rate": 7.765357114266409e-06, "loss": 1.1061, "step": 215 }, { "epoch": 0.6506024096385542, "grad_norm": 0.32897332310676575, "learning_rate": 7.745314833479834e-06, "loss": 1.065, "step": 216 }, { "epoch": 0.6536144578313253, "grad_norm": 0.34086140990257263, "learning_rate": 7.72520920430493e-06, "loss": 1.1221, "step": 217 }, { "epoch": 0.6566265060240963, "grad_norm": 0.395309180021286, "learning_rate": 7.705040690680915e-06, "loss": 1.0839, "step": 218 }, { "epoch": 0.6596385542168675, "grad_norm": 0.3107753396034241, "learning_rate": 7.684809757998066e-06, "loss": 1.0287, "step": 219 }, { "epoch": 0.6626506024096386, "grad_norm": 0.32579633593559265, "learning_rate": 7.664516873086987e-06, "loss": 1.0925, "step": 220 }, { "epoch": 0.6656626506024096, "grad_norm": 0.32496118545532227, "learning_rate": 7.644162504207834e-06, "loss": 1.0225, "step": 221 }, { "epoch": 0.6686746987951807, "grad_norm": 0.34487584233283997, "learning_rate": 7.623747121039512e-06, "loss": 1.1216, "step": 222 }, { "epoch": 0.6716867469879518, "grad_norm": 0.28649845719337463, "learning_rate": 7.603271194668835e-06, "loss": 1.0989, "step": 223 }, { "epoch": 0.6746987951807228, "grad_norm": 0.3071340024471283, "learning_rate": 7.582735197579657e-06, "loss": 1.0908, "step": 224 }, { "epoch": 0.677710843373494, "grad_norm": 0.33348020911216736, "learning_rate": 7.562139603641971e-06, "loss": 1.0497, "step": 225 }, { "epoch": 0.6807228915662651, "grad_norm": 0.3527333736419678, "learning_rate": 7.541484888100974e-06, "loss": 1.1121, "step": 226 }, { "epoch": 0.6837349397590361, "grad_norm": 0.3623991310596466, "learning_rate": 7.520771527566093e-06, "loss": 1.0675, "step": 227 }, { "epoch": 0.6867469879518072, "grad_norm": 0.3683350384235382, "learning_rate": 7.500000000000001e-06, "loss": 1.082, "step": 228 }, { "epoch": 0.6897590361445783, "grad_norm": 0.30012479424476624, "learning_rate": 7.479170784707574e-06, "loss": 1.1421, "step": 229 }, { "epoch": 0.6927710843373494, "grad_norm": 0.32032355666160583, "learning_rate": 7.458284362324844e-06, "loss": 1.0996, "step": 230 }, { "epoch": 0.6957831325301205, "grad_norm": 0.30791187286376953, "learning_rate": 7.437341214807895e-06, "loss": 1.1221, "step": 231 }, { "epoch": 0.6987951807228916, "grad_norm": 0.3271755576133728, "learning_rate": 7.416341825421755e-06, "loss": 1.0937, "step": 232 }, { "epoch": 0.7018072289156626, "grad_norm": 0.320961594581604, "learning_rate": 7.395286678729232e-06, "loss": 1.0727, "step": 233 }, { "epoch": 0.7048192771084337, "grad_norm": 0.31970059871673584, "learning_rate": 7.374176260579746e-06, "loss": 1.104, "step": 234 }, { "epoch": 0.7078313253012049, "grad_norm": 0.3410727083683014, "learning_rate": 7.353011058098104e-06, "loss": 1.0866, "step": 235 }, { "epoch": 0.7108433734939759, "grad_norm": 0.3979572653770447, "learning_rate": 7.33179155967327e-06, "loss": 1.0773, "step": 236 }, { "epoch": 0.713855421686747, "grad_norm": 0.3386681079864502, "learning_rate": 7.310518254947092e-06, "loss": 1.0943, "step": 237 }, { "epoch": 0.7168674698795181, "grad_norm": 0.32043731212615967, "learning_rate": 7.289191634803002e-06, "loss": 1.1104, "step": 238 }, { "epoch": 0.7198795180722891, "grad_norm": 0.3244670331478119, "learning_rate": 7.267812191354691e-06, "loss": 1.1137, "step": 239 }, { "epoch": 0.7228915662650602, "grad_norm": 0.32313597202301025, "learning_rate": 7.246380417934752e-06, "loss": 1.1296, "step": 240 }, { "epoch": 0.7259036144578314, "grad_norm": 0.4050733149051666, "learning_rate": 7.224896809083297e-06, "loss": 1.0725, "step": 241 }, { "epoch": 0.7289156626506024, "grad_norm": 0.2902127206325531, "learning_rate": 7.203361860536544e-06, "loss": 1.119, "step": 242 }, { "epoch": 0.7319277108433735, "grad_norm": 0.31548964977264404, "learning_rate": 7.181776069215382e-06, "loss": 1.0712, "step": 243 }, { "epoch": 0.7349397590361446, "grad_norm": 0.31955307722091675, "learning_rate": 7.160139933213899e-06, "loss": 1.0925, "step": 244 }, { "epoch": 0.7379518072289156, "grad_norm": 0.37396878004074097, "learning_rate": 7.138453951787894e-06, "loss": 1.1029, "step": 245 }, { "epoch": 0.7409638554216867, "grad_norm": 0.3100704550743103, "learning_rate": 7.1167186253433474e-06, "loss": 1.1001, "step": 246 }, { "epoch": 0.7439759036144579, "grad_norm": 0.32318195700645447, "learning_rate": 7.094934455424889e-06, "loss": 1.0909, "step": 247 }, { "epoch": 0.7469879518072289, "grad_norm": 0.40869641304016113, "learning_rate": 7.073101944704209e-06, "loss": 1.0925, "step": 248 }, { "epoch": 0.75, "grad_norm": 0.31567490100860596, "learning_rate": 7.051221596968471e-06, "loss": 1.0973, "step": 249 }, { "epoch": 0.7530120481927711, "grad_norm": 0.32018548250198364, "learning_rate": 7.029293917108678e-06, "loss": 1.0222, "step": 250 }, { "epoch": 0.7560240963855421, "grad_norm": 0.3648555874824524, "learning_rate": 7.0073194111080315e-06, "loss": 1.075, "step": 251 }, { "epoch": 0.7590361445783133, "grad_norm": 0.3636914789676666, "learning_rate": 6.985298586030241e-06, "loss": 1.1419, "step": 252 }, { "epoch": 0.7620481927710844, "grad_norm": 0.337217777967453, "learning_rate": 6.963231950007845e-06, "loss": 1.0848, "step": 253 }, { "epoch": 0.7650602409638554, "grad_norm": 0.34433966875076294, "learning_rate": 6.941120012230464e-06, "loss": 1.0675, "step": 254 }, { "epoch": 0.7680722891566265, "grad_norm": 0.31864967942237854, "learning_rate": 6.918963282933063e-06, "loss": 1.0576, "step": 255 }, { "epoch": 0.7710843373493976, "grad_norm": 0.37333500385284424, "learning_rate": 6.896762273384179e-06, "loss": 1.0536, "step": 256 }, { "epoch": 0.7740963855421686, "grad_norm": 0.389068603515625, "learning_rate": 6.8745174958741164e-06, "loss": 1.0992, "step": 257 }, { "epoch": 0.7771084337349398, "grad_norm": 0.3978760540485382, "learning_rate": 6.852229463703131e-06, "loss": 1.124, "step": 258 }, { "epoch": 0.7801204819277109, "grad_norm": 0.37109795212745667, "learning_rate": 6.829898691169581e-06, "loss": 1.065, "step": 259 }, { "epoch": 0.7831325301204819, "grad_norm": 0.3223637044429779, "learning_rate": 6.8075256935580655e-06, "loss": 1.0475, "step": 260 }, { "epoch": 0.786144578313253, "grad_norm": 0.32434552907943726, "learning_rate": 6.78511098712753e-06, "loss": 1.0797, "step": 261 }, { "epoch": 0.7891566265060241, "grad_norm": 0.3082960546016693, "learning_rate": 6.762655089099353e-06, "loss": 1.0889, "step": 262 }, { "epoch": 0.7921686746987951, "grad_norm": 0.33072763681411743, "learning_rate": 6.740158517645418e-06, "loss": 1.0575, "step": 263 }, { "epoch": 0.7951807228915663, "grad_norm": 0.3404625952243805, "learning_rate": 6.717621791876147e-06, "loss": 1.0192, "step": 264 }, { "epoch": 0.7981927710843374, "grad_norm": 0.31751227378845215, "learning_rate": 6.695045431828524e-06, "loss": 1.105, "step": 265 }, { "epoch": 0.8012048192771084, "grad_norm": 0.3528308868408203, "learning_rate": 6.672429958454103e-06, "loss": 1.0803, "step": 266 }, { "epoch": 0.8042168674698795, "grad_norm": 0.3395234942436218, "learning_rate": 6.649775893606982e-06, "loss": 1.1057, "step": 267 }, { "epoch": 0.8072289156626506, "grad_norm": 0.37763112783432007, "learning_rate": 6.627083760031755e-06, "loss": 1.0911, "step": 268 }, { "epoch": 0.8102409638554217, "grad_norm": 0.3695107400417328, "learning_rate": 6.604354081351461e-06, "loss": 1.1105, "step": 269 }, { "epoch": 0.8132530120481928, "grad_norm": 0.3498575687408447, "learning_rate": 6.5815873820554925e-06, "loss": 1.0347, "step": 270 }, { "epoch": 0.8162650602409639, "grad_norm": 0.3670216202735901, "learning_rate": 6.558784187487495e-06, "loss": 1.009, "step": 271 }, { "epoch": 0.8192771084337349, "grad_norm": 0.38344910740852356, "learning_rate": 6.535945023833249e-06, "loss": 1.0132, "step": 272 }, { "epoch": 0.822289156626506, "grad_norm": 0.3509382903575897, "learning_rate": 6.513070418108525e-06, "loss": 1.0768, "step": 273 }, { "epoch": 0.8253012048192772, "grad_norm": 0.37638577818870544, "learning_rate": 6.490160898146919e-06, "loss": 1.0435, "step": 274 }, { "epoch": 0.8283132530120482, "grad_norm": 0.36278653144836426, "learning_rate": 6.467216992587679e-06, "loss": 1.1227, "step": 275 }, { "epoch": 0.8313253012048193, "grad_norm": 0.34076735377311707, "learning_rate": 6.444239230863505e-06, "loss": 1.042, "step": 276 }, { "epoch": 0.8343373493975904, "grad_norm": 0.3733161687850952, "learning_rate": 6.421228143188325e-06, "loss": 1.0266, "step": 277 }, { "epoch": 0.8373493975903614, "grad_norm": 0.3508923351764679, "learning_rate": 6.398184260545072e-06, "loss": 1.0716, "step": 278 }, { "epoch": 0.8403614457831325, "grad_norm": 0.38440215587615967, "learning_rate": 6.375108114673425e-06, "loss": 1.1266, "step": 279 }, { "epoch": 0.8433734939759037, "grad_norm": 0.38378679752349854, "learning_rate": 6.3520002380575395e-06, "loss": 1.1126, "step": 280 }, { "epoch": 0.8463855421686747, "grad_norm": 0.36522167921066284, "learning_rate": 6.32886116391376e-06, "loss": 1.1011, "step": 281 }, { "epoch": 0.8493975903614458, "grad_norm": 0.345027357339859, "learning_rate": 6.305691426178316e-06, "loss": 1.1179, "step": 282 }, { "epoch": 0.8524096385542169, "grad_norm": 0.3252032697200775, "learning_rate": 6.282491559495005e-06, "loss": 1.0666, "step": 283 }, { "epoch": 0.8554216867469879, "grad_norm": 0.3353135585784912, "learning_rate": 6.259262099202849e-06, "loss": 1.045, "step": 284 }, { "epoch": 0.858433734939759, "grad_norm": 0.35613197088241577, "learning_rate": 6.23600358132375e-06, "loss": 1.0619, "step": 285 }, { "epoch": 0.8614457831325302, "grad_norm": 0.3165310025215149, "learning_rate": 6.212716542550112e-06, "loss": 1.0846, "step": 286 }, { "epoch": 0.8644578313253012, "grad_norm": 0.3935311436653137, "learning_rate": 6.189401520232464e-06, "loss": 1.0634, "step": 287 }, { "epoch": 0.8674698795180723, "grad_norm": 0.3519918620586395, "learning_rate": 6.166059052367055e-06, "loss": 1.1106, "step": 288 }, { "epoch": 0.8704819277108434, "grad_norm": 0.34923064708709717, "learning_rate": 6.142689677583447e-06, "loss": 1.0479, "step": 289 }, { "epoch": 0.8734939759036144, "grad_norm": 0.327006459236145, "learning_rate": 6.119293935132076e-06, "loss": 1.0652, "step": 290 }, { "epoch": 0.8765060240963856, "grad_norm": 0.3877696692943573, "learning_rate": 6.095872364871818e-06, "loss": 1.0686, "step": 291 }, { "epoch": 0.8795180722891566, "grad_norm": 0.3664681613445282, "learning_rate": 6.072425507257528e-06, "loss": 1.0205, "step": 292 }, { "epoch": 0.8825301204819277, "grad_norm": 0.35179319977760315, "learning_rate": 6.048953903327568e-06, "loss": 1.0839, "step": 293 }, { "epoch": 0.8855421686746988, "grad_norm": 0.3436523675918579, "learning_rate": 6.025458094691323e-06, "loss": 1.1028, "step": 294 }, { "epoch": 0.8885542168674698, "grad_norm": 0.3567025363445282, "learning_rate": 6.0019386235167055e-06, "loss": 1.0638, "step": 295 }, { "epoch": 0.891566265060241, "grad_norm": 0.34170979261398315, "learning_rate": 5.978396032517641e-06, "loss": 1.1007, "step": 296 }, { "epoch": 0.8945783132530121, "grad_norm": 0.32985955476760864, "learning_rate": 5.9548308649415486e-06, "loss": 1.1342, "step": 297 }, { "epoch": 0.8975903614457831, "grad_norm": 0.3722776174545288, "learning_rate": 5.931243664556803e-06, "loss": 1.1253, "step": 298 }, { "epoch": 0.9006024096385542, "grad_norm": 0.3619896173477173, "learning_rate": 5.90763497564019e-06, "loss": 1.0155, "step": 299 }, { "epoch": 0.9036144578313253, "grad_norm": 0.39154163002967834, "learning_rate": 5.884005342964343e-06, "loss": 1.151, "step": 300 }, { "epoch": 0.9066265060240963, "grad_norm": 0.35082048177719116, "learning_rate": 5.860355311785175e-06, "loss": 1.0529, "step": 301 }, { "epoch": 0.9096385542168675, "grad_norm": 0.3391878306865692, "learning_rate": 5.836685427829296e-06, "loss": 1.1057, "step": 302 }, { "epoch": 0.9126506024096386, "grad_norm": 0.34184518456459045, "learning_rate": 5.812996237281423e-06, "loss": 1.0481, "step": 303 }, { "epoch": 0.9156626506024096, "grad_norm": 0.3681842088699341, "learning_rate": 5.7892882867717705e-06, "loss": 1.0455, "step": 304 }, { "epoch": 0.9186746987951807, "grad_norm": 0.3326142728328705, "learning_rate": 5.765562123363445e-06, "loss": 1.071, "step": 305 }, { "epoch": 0.9216867469879518, "grad_norm": 0.36516857147216797, "learning_rate": 5.7418182945398136e-06, "loss": 1.0701, "step": 306 }, { "epoch": 0.9246987951807228, "grad_norm": 0.3817295730113983, "learning_rate": 5.718057348191874e-06, "loss": 1.0718, "step": 307 }, { "epoch": 0.927710843373494, "grad_norm": 0.3265933096408844, "learning_rate": 5.6942798326056205e-06, "loss": 1.0765, "step": 308 }, { "epoch": 0.9307228915662651, "grad_norm": 0.3510778248310089, "learning_rate": 5.670486296449373e-06, "loss": 1.1283, "step": 309 }, { "epoch": 0.9337349397590361, "grad_norm": 0.32118940353393555, "learning_rate": 5.646677288761132e-06, "loss": 1.0592, "step": 310 }, { "epoch": 0.9367469879518072, "grad_norm": 0.3270410895347595, "learning_rate": 5.622853358935908e-06, "loss": 1.0876, "step": 311 }, { "epoch": 0.9397590361445783, "grad_norm": 0.35170766711235046, "learning_rate": 5.599015056713037e-06, "loss": 1.0684, "step": 312 }, { "epoch": 0.9427710843373494, "grad_norm": 0.3354102671146393, "learning_rate": 5.575162932163501e-06, "loss": 1.0861, "step": 313 }, { "epoch": 0.9457831325301205, "grad_norm": 0.37484198808670044, "learning_rate": 5.551297535677236e-06, "loss": 1.0697, "step": 314 }, { "epoch": 0.9487951807228916, "grad_norm": 0.3812544047832489, "learning_rate": 5.527419417950424e-06, "loss": 1.0526, "step": 315 }, { "epoch": 0.9518072289156626, "grad_norm": 0.3424987196922302, "learning_rate": 5.503529129972792e-06, "loss": 1.0456, "step": 316 }, { "epoch": 0.9548192771084337, "grad_norm": 0.37978166341781616, "learning_rate": 5.479627223014902e-06, "loss": 1.0712, "step": 317 }, { "epoch": 0.9578313253012049, "grad_norm": 0.37075453996658325, "learning_rate": 5.455714248615417e-06, "loss": 1.0659, "step": 318 }, { "epoch": 0.9608433734939759, "grad_norm": 0.3791234791278839, "learning_rate": 5.431790758568388e-06, "loss": 1.0408, "step": 319 }, { "epoch": 0.963855421686747, "grad_norm": 0.3565094769001007, "learning_rate": 5.4078573049105135e-06, "loss": 1.0777, "step": 320 }, { "epoch": 0.9668674698795181, "grad_norm": 0.3704342246055603, "learning_rate": 5.383914439908403e-06, "loss": 1.1454, "step": 321 }, { "epoch": 0.9698795180722891, "grad_norm": 0.3541310429573059, "learning_rate": 5.359962716045836e-06, "loss": 1.0392, "step": 322 }, { "epoch": 0.9728915662650602, "grad_norm": 0.380628377199173, "learning_rate": 5.336002686011007e-06, "loss": 1.137, "step": 323 }, { "epoch": 0.9759036144578314, "grad_norm": 0.39858028292655945, "learning_rate": 5.312034902683779e-06, "loss": 1.1154, "step": 324 }, { "epoch": 0.9789156626506024, "grad_norm": 0.3649790585041046, "learning_rate": 5.288059919122922e-06, "loss": 0.9955, "step": 325 }, { "epoch": 0.9819277108433735, "grad_norm": 0.41761839389801025, "learning_rate": 5.2640782885533515e-06, "loss": 1.0635, "step": 326 }, { "epoch": 0.9849397590361446, "grad_norm": 0.43014606833457947, "learning_rate": 5.240090564353365e-06, "loss": 1.0369, "step": 327 }, { "epoch": 0.9879518072289156, "grad_norm": 0.36708715558052063, "learning_rate": 5.21609730004187e-06, "loss": 1.0698, "step": 328 }, { "epoch": 0.9909638554216867, "grad_norm": 0.34585151076316833, "learning_rate": 5.1920990492656135e-06, "loss": 1.1109, "step": 329 }, { "epoch": 0.9939759036144579, "grad_norm": 0.3957839906215668, "learning_rate": 5.168096365786402e-06, "loss": 1.0439, "step": 330 }, { "epoch": 0.9969879518072289, "grad_norm": 0.35115697979927063, "learning_rate": 5.144089803468333e-06, "loss": 1.1163, "step": 331 }, { "epoch": 1.0, "grad_norm": 0.38435807824134827, "learning_rate": 5.1200799162650035e-06, "loss": 1.0951, "step": 332 }, { "epoch": 1.0030120481927711, "grad_norm": 0.31511375308036804, "learning_rate": 5.096067258206735e-06, "loss": 1.1165, "step": 333 }, { "epoch": 1.0060240963855422, "grad_norm": 0.41014838218688965, "learning_rate": 5.072052383387787e-06, "loss": 1.1078, "step": 334 }, { "epoch": 1.0090361445783131, "grad_norm": 0.36247673630714417, "learning_rate": 5.048035845953569e-06, "loss": 0.9971, "step": 335 }, { "epoch": 1.0120481927710843, "grad_norm": 0.3278728127479553, "learning_rate": 5.024018200087855e-06, "loss": 1.1189, "step": 336 }, { "epoch": 1.0030120481927711, "grad_norm": 0.3709251880645752, "learning_rate": 5e-06, "loss": 1.0844, "step": 337 }, { "epoch": 1.0060240963855422, "grad_norm": 0.3526400029659271, "learning_rate": 4.975981799912147e-06, "loss": 1.0526, "step": 338 }, { "epoch": 1.0090361445783131, "grad_norm": 0.3683416545391083, "learning_rate": 4.951964154046432e-06, "loss": 1.0687, "step": 339 }, { "epoch": 1.0120481927710843, "grad_norm": 0.3674441874027252, "learning_rate": 4.927947616612216e-06, "loss": 1.048, "step": 340 }, { "epoch": 1.0150602409638554, "grad_norm": 0.33722299337387085, "learning_rate": 4.903932741793266e-06, "loss": 0.9881, "step": 341 }, { "epoch": 1.0180722891566265, "grad_norm": 0.38287389278411865, "learning_rate": 4.879920083734997e-06, "loss": 1.0368, "step": 342 }, { "epoch": 1.0210843373493976, "grad_norm": 0.38486406207084656, "learning_rate": 4.855910196531669e-06, "loss": 1.036, "step": 343 }, { "epoch": 1.0240963855421688, "grad_norm": 0.3734920620918274, "learning_rate": 4.8319036342135985e-06, "loss": 1.0488, "step": 344 }, { "epoch": 1.0271084337349397, "grad_norm": 0.40039393305778503, "learning_rate": 4.807900950734388e-06, "loss": 1.0315, "step": 345 }, { "epoch": 1.0301204819277108, "grad_norm": 0.3681272566318512, "learning_rate": 4.78390269995813e-06, "loss": 1.0515, "step": 346 }, { "epoch": 1.033132530120482, "grad_norm": 0.3793656826019287, "learning_rate": 4.759909435646636e-06, "loss": 1.1554, "step": 347 }, { "epoch": 1.036144578313253, "grad_norm": 0.3956240117549896, "learning_rate": 4.735921711446649e-06, "loss": 1.0764, "step": 348 }, { "epoch": 1.0391566265060241, "grad_norm": 0.3506197929382324, "learning_rate": 4.711940080877079e-06, "loss": 1.0664, "step": 349 }, { "epoch": 1.0421686746987953, "grad_norm": 0.33289358019828796, "learning_rate": 4.687965097316223e-06, "loss": 1.0989, "step": 350 }, { "epoch": 1.0451807228915662, "grad_norm": 0.38886559009552, "learning_rate": 4.6639973139889944e-06, "loss": 1.0367, "step": 351 }, { "epoch": 1.0481927710843373, "grad_norm": 0.36474519968032837, "learning_rate": 4.640037283954165e-06, "loss": 1.041, "step": 352 }, { "epoch": 1.0512048192771084, "grad_norm": 0.3719565272331238, "learning_rate": 4.616085560091596e-06, "loss": 1.0481, "step": 353 }, { "epoch": 1.0542168674698795, "grad_norm": 0.38648533821105957, "learning_rate": 4.592142695089489e-06, "loss": 1.0783, "step": 354 }, { "epoch": 1.0572289156626506, "grad_norm": 0.4134596884250641, "learning_rate": 4.568209241431615e-06, "loss": 1.0206, "step": 355 }, { "epoch": 1.0602409638554218, "grad_norm": 0.35198330879211426, "learning_rate": 4.544285751384585e-06, "loss": 1.0578, "step": 356 }, { "epoch": 1.0632530120481927, "grad_norm": 0.4130623936653137, "learning_rate": 4.520372776985101e-06, "loss": 1.0467, "step": 357 }, { "epoch": 1.0662650602409638, "grad_norm": 0.35723182559013367, "learning_rate": 4.496470870027209e-06, "loss": 1.0781, "step": 358 }, { "epoch": 1.069277108433735, "grad_norm": 0.35406294465065, "learning_rate": 4.472580582049578e-06, "loss": 1.001, "step": 359 }, { "epoch": 1.072289156626506, "grad_norm": 0.38317278027534485, "learning_rate": 4.448702464322764e-06, "loss": 1.0656, "step": 360 }, { "epoch": 1.0753012048192772, "grad_norm": 0.338810533285141, "learning_rate": 4.4248370678364995e-06, "loss": 0.9687, "step": 361 }, { "epoch": 1.0783132530120483, "grad_norm": 0.33876633644104004, "learning_rate": 4.400984943286965e-06, "loss": 1.0505, "step": 362 }, { "epoch": 1.0813253012048192, "grad_norm": 0.3846857249736786, "learning_rate": 4.377146641064093e-06, "loss": 1.0058, "step": 363 }, { "epoch": 1.0843373493975903, "grad_norm": 0.4660666286945343, "learning_rate": 4.3533227112388694e-06, "loss": 1.0146, "step": 364 }, { "epoch": 1.0873493975903614, "grad_norm": 0.4065142869949341, "learning_rate": 4.329513703550628e-06, "loss": 1.0294, "step": 365 }, { "epoch": 1.0903614457831325, "grad_norm": 0.39198002219200134, "learning_rate": 4.305720167394381e-06, "loss": 1.0866, "step": 366 }, { "epoch": 1.0933734939759037, "grad_norm": 0.35108157992362976, "learning_rate": 4.2819426518081265e-06, "loss": 1.0525, "step": 367 }, { "epoch": 1.0963855421686748, "grad_norm": 0.34058380126953125, "learning_rate": 4.258181705460188e-06, "loss": 1.0815, "step": 368 }, { "epoch": 1.0993975903614457, "grad_norm": 0.4144446849822998, "learning_rate": 4.234437876636557e-06, "loss": 1.0305, "step": 369 }, { "epoch": 1.1024096385542168, "grad_norm": 0.3802807927131653, "learning_rate": 4.21071171322823e-06, "loss": 1.0463, "step": 370 }, { "epoch": 1.105421686746988, "grad_norm": 0.3633134067058563, "learning_rate": 4.1870037627185785e-06, "loss": 1.0386, "step": 371 }, { "epoch": 1.108433734939759, "grad_norm": 0.4094638526439667, "learning_rate": 4.163314572170704e-06, "loss": 1.0414, "step": 372 }, { "epoch": 1.1114457831325302, "grad_norm": 0.37921878695487976, "learning_rate": 4.139644688214827e-06, "loss": 1.0142, "step": 373 }, { "epoch": 1.1144578313253013, "grad_norm": 0.413327157497406, "learning_rate": 4.115994657035659e-06, "loss": 1.0886, "step": 374 }, { "epoch": 1.1174698795180722, "grad_norm": 0.37856829166412354, "learning_rate": 4.0923650243598104e-06, "loss": 1.084, "step": 375 }, { "epoch": 1.1204819277108433, "grad_norm": 0.41401001811027527, "learning_rate": 4.0687563354431986e-06, "loss": 1.118, "step": 376 }, { "epoch": 1.1234939759036144, "grad_norm": 0.3299630582332611, "learning_rate": 4.045169135058452e-06, "loss": 0.9993, "step": 377 }, { "epoch": 1.1265060240963856, "grad_norm": 0.40372124314308167, "learning_rate": 4.021603967482361e-06, "loss": 0.9855, "step": 378 }, { "epoch": 1.1295180722891567, "grad_norm": 0.360078364610672, "learning_rate": 3.998061376483298e-06, "loss": 1.0382, "step": 379 }, { "epoch": 1.1325301204819278, "grad_norm": 0.3652278184890747, "learning_rate": 3.974541905308679e-06, "loss": 1.0232, "step": 380 }, { "epoch": 1.1355421686746987, "grad_norm": 0.3333640396595001, "learning_rate": 3.951046096672434e-06, "loss": 1.0304, "step": 381 }, { "epoch": 1.1385542168674698, "grad_norm": 0.3765230178833008, "learning_rate": 3.927574492742473e-06, "loss": 1.0738, "step": 382 }, { "epoch": 1.141566265060241, "grad_norm": 0.3517187833786011, "learning_rate": 3.904127635128184e-06, "loss": 1.0491, "step": 383 }, { "epoch": 1.144578313253012, "grad_norm": 0.35913482308387756, "learning_rate": 3.880706064867927e-06, "loss": 1.0509, "step": 384 }, { "epoch": 1.1475903614457832, "grad_norm": 0.3901945650577545, "learning_rate": 3.857310322416555e-06, "loss": 1.0653, "step": 385 }, { "epoch": 1.1506024096385543, "grad_norm": 0.3298746347427368, "learning_rate": 3.833940947632947e-06, "loss": 0.9943, "step": 386 }, { "epoch": 1.1536144578313252, "grad_norm": 0.3933880031108856, "learning_rate": 3.8105984797675364e-06, "loss": 1.06, "step": 387 }, { "epoch": 1.1566265060240963, "grad_norm": 0.42192092537879944, "learning_rate": 3.7872834574498894e-06, "loss": 1.0453, "step": 388 }, { "epoch": 1.1596385542168675, "grad_norm": 0.38652369379997253, "learning_rate": 3.7639964186762506e-06, "loss": 1.035, "step": 389 }, { "epoch": 1.1626506024096386, "grad_norm": 0.44319620728492737, "learning_rate": 3.740737900797151e-06, "loss": 1.1098, "step": 390 }, { "epoch": 1.1656626506024097, "grad_norm": 0.3664276599884033, "learning_rate": 3.7175084405049978e-06, "loss": 0.991, "step": 391 }, { "epoch": 1.1686746987951806, "grad_norm": 0.3838660717010498, "learning_rate": 3.6943085738216855e-06, "loss": 1.092, "step": 392 }, { "epoch": 1.1716867469879517, "grad_norm": 0.38596200942993164, "learning_rate": 3.6711388360862417e-06, "loss": 1.077, "step": 393 }, { "epoch": 1.1746987951807228, "grad_norm": 0.337519109249115, "learning_rate": 3.6479997619424605e-06, "loss": 1.0932, "step": 394 }, { "epoch": 1.177710843373494, "grad_norm": 0.350619912147522, "learning_rate": 3.6248918853265756e-06, "loss": 1.0796, "step": 395 }, { "epoch": 1.180722891566265, "grad_norm": 0.38858625292778015, "learning_rate": 3.6018157394549287e-06, "loss": 1.0689, "step": 396 }, { "epoch": 1.1837349397590362, "grad_norm": 0.38901758193969727, "learning_rate": 3.5787718568116764e-06, "loss": 1.1019, "step": 397 }, { "epoch": 1.1867469879518073, "grad_norm": 0.34919285774230957, "learning_rate": 3.5557607691364983e-06, "loss": 1.0646, "step": 398 }, { "epoch": 1.1897590361445782, "grad_norm": 0.41810017824172974, "learning_rate": 3.5327830074123214e-06, "loss": 1.0429, "step": 399 }, { "epoch": 1.1927710843373494, "grad_norm": 0.3683408796787262, "learning_rate": 3.509839101853082e-06, "loss": 0.9905, "step": 400 }, { "epoch": 1.1957831325301205, "grad_norm": 0.3720911741256714, "learning_rate": 3.486929581891476e-06, "loss": 1.0213, "step": 401 }, { "epoch": 1.1987951807228916, "grad_norm": 0.3495194911956787, "learning_rate": 3.464054976166753e-06, "loss": 1.0386, "step": 402 }, { "epoch": 1.2018072289156627, "grad_norm": 0.36551299691200256, "learning_rate": 3.441215812512508e-06, "loss": 1.0043, "step": 403 }, { "epoch": 1.2048192771084336, "grad_norm": 0.3687341809272766, "learning_rate": 3.41841261794451e-06, "loss": 1.0313, "step": 404 }, { "epoch": 1.2078313253012047, "grad_norm": 0.3739585280418396, "learning_rate": 3.3956459186485414e-06, "loss": 1.0326, "step": 405 }, { "epoch": 1.2108433734939759, "grad_norm": 0.38974305987358093, "learning_rate": 3.372916239968246e-06, "loss": 1.0665, "step": 406 }, { "epoch": 1.213855421686747, "grad_norm": 0.4061500132083893, "learning_rate": 3.3502241063930196e-06, "loss": 1.114, "step": 407 }, { "epoch": 1.216867469879518, "grad_norm": 0.398306280374527, "learning_rate": 3.327570041545897e-06, "loss": 1.0584, "step": 408 }, { "epoch": 1.2198795180722892, "grad_norm": 0.36864137649536133, "learning_rate": 3.304954568171478e-06, "loss": 1.081, "step": 409 }, { "epoch": 1.2228915662650603, "grad_norm": 0.3283785581588745, "learning_rate": 3.282378208123856e-06, "loss": 1.0605, "step": 410 }, { "epoch": 1.2259036144578312, "grad_norm": 0.38243263959884644, "learning_rate": 3.259841482354582e-06, "loss": 1.0161, "step": 411 }, { "epoch": 1.2289156626506024, "grad_norm": 0.38818714022636414, "learning_rate": 3.2373449109006476e-06, "loss": 1.0602, "step": 412 }, { "epoch": 1.2319277108433735, "grad_norm": 0.3809143304824829, "learning_rate": 3.21488901287247e-06, "loss": 1.0088, "step": 413 }, { "epoch": 1.2349397590361446, "grad_norm": 0.37948790192604065, "learning_rate": 3.192474306441936e-06, "loss": 1.0532, "step": 414 }, { "epoch": 1.2379518072289157, "grad_norm": 0.44067418575286865, "learning_rate": 3.170101308830421e-06, "loss": 1.0377, "step": 415 }, { "epoch": 1.2409638554216866, "grad_norm": 0.3667253255844116, "learning_rate": 3.1477705362968702e-06, "loss": 1.0234, "step": 416 }, { "epoch": 1.2439759036144578, "grad_norm": 0.37526583671569824, "learning_rate": 3.1254825041258852e-06, "loss": 1.0344, "step": 417 }, { "epoch": 1.2469879518072289, "grad_norm": 0.42664584517478943, "learning_rate": 3.103237726615822e-06, "loss": 1.0439, "step": 418 }, { "epoch": 1.25, "grad_norm": 0.3878503441810608, "learning_rate": 3.081036717066938e-06, "loss": 1.1294, "step": 419 }, { "epoch": 1.2530120481927711, "grad_norm": 0.4370405972003937, "learning_rate": 3.0588799877695375e-06, "loss": 1.0563, "step": 420 }, { "epoch": 1.2560240963855422, "grad_norm": 0.38727104663848877, "learning_rate": 3.036768049992157e-06, "loss": 1.0561, "step": 421 }, { "epoch": 1.2590361445783134, "grad_norm": 0.3639293909072876, "learning_rate": 3.0147014139697596e-06, "loss": 1.0747, "step": 422 }, { "epoch": 1.2620481927710843, "grad_norm": 0.3889468014240265, "learning_rate": 2.99268058889197e-06, "loss": 1.0575, "step": 423 }, { "epoch": 1.2650602409638554, "grad_norm": 0.3735024929046631, "learning_rate": 2.9707060828913226e-06, "loss": 1.0432, "step": 424 }, { "epoch": 1.2680722891566265, "grad_norm": 0.3623259365558624, "learning_rate": 2.9487784030315297e-06, "loss": 1.0929, "step": 425 }, { "epoch": 1.2710843373493976, "grad_norm": 0.38363751769065857, "learning_rate": 2.9268980552957917e-06, "loss": 1.018, "step": 426 }, { "epoch": 1.2740963855421688, "grad_norm": 0.36796835064888, "learning_rate": 2.905065544575114e-06, "loss": 1.0636, "step": 427 }, { "epoch": 1.2771084337349397, "grad_norm": 0.3460337817668915, "learning_rate": 2.8832813746566546e-06, "loss": 1.1039, "step": 428 }, { "epoch": 1.2801204819277108, "grad_norm": 0.37609270215034485, "learning_rate": 2.86154604821211e-06, "loss": 1.0746, "step": 429 }, { "epoch": 1.283132530120482, "grad_norm": 0.39871373772621155, "learning_rate": 2.8398600667861032e-06, "loss": 1.0095, "step": 430 }, { "epoch": 1.286144578313253, "grad_norm": 0.38184547424316406, "learning_rate": 2.8182239307846195e-06, "loss": 1.0278, "step": 431 }, { "epoch": 1.2891566265060241, "grad_norm": 0.40051835775375366, "learning_rate": 2.796638139463456e-06, "loss": 1.0261, "step": 432 }, { "epoch": 1.2921686746987953, "grad_norm": 0.38206747174263, "learning_rate": 2.7751031909167046e-06, "loss": 1.0817, "step": 433 }, { "epoch": 1.2951807228915664, "grad_norm": 0.42132294178009033, "learning_rate": 2.7536195820652506e-06, "loss": 1.0253, "step": 434 }, { "epoch": 1.2981927710843373, "grad_norm": 0.37671446800231934, "learning_rate": 2.73218780864531e-06, "loss": 1.0555, "step": 435 }, { "epoch": 1.3012048192771084, "grad_norm": 0.405241459608078, "learning_rate": 2.710808365197e-06, "loss": 1.0957, "step": 436 }, { "epoch": 1.3042168674698795, "grad_norm": 0.3754029870033264, "learning_rate": 2.689481745052908e-06, "loss": 0.9929, "step": 437 }, { "epoch": 1.3072289156626506, "grad_norm": 0.3823848068714142, "learning_rate": 2.6682084403267305e-06, "loss": 1.0884, "step": 438 }, { "epoch": 1.3102409638554218, "grad_norm": 0.3721786439418793, "learning_rate": 2.6469889419018985e-06, "loss": 1.0173, "step": 439 }, { "epoch": 1.3132530120481927, "grad_norm": 0.3947805166244507, "learning_rate": 2.6258237394202556e-06, "loss": 1.0628, "step": 440 }, { "epoch": 1.3162650602409638, "grad_norm": 0.3939521908760071, "learning_rate": 2.60471332127077e-06, "loss": 1.0576, "step": 441 }, { "epoch": 1.319277108433735, "grad_norm": 0.40392783284187317, "learning_rate": 2.5836581745782474e-06, "loss": 1.0515, "step": 442 }, { "epoch": 1.322289156626506, "grad_norm": 0.39871639013290405, "learning_rate": 2.5626587851921053e-06, "loss": 1.0039, "step": 443 }, { "epoch": 1.3253012048192772, "grad_norm": 0.409939706325531, "learning_rate": 2.541715637675156e-06, "loss": 1.0394, "step": 444 }, { "epoch": 1.3283132530120483, "grad_norm": 0.3738921880722046, "learning_rate": 2.520829215292426e-06, "loss": 1.0766, "step": 445 }, { "epoch": 1.3313253012048194, "grad_norm": 0.3500833213329315, "learning_rate": 2.5000000000000015e-06, "loss": 1.0533, "step": 446 }, { "epoch": 1.3343373493975903, "grad_norm": 0.3490578532218933, "learning_rate": 2.4792284724339077e-06, "loss": 1.0512, "step": 447 }, { "epoch": 1.3373493975903614, "grad_norm": 0.32971325516700745, "learning_rate": 2.4585151118990286e-06, "loss": 1.0417, "step": 448 }, { "epoch": 1.3403614457831325, "grad_norm": 0.36310428380966187, "learning_rate": 2.4378603963580293e-06, "loss": 1.122, "step": 449 }, { "epoch": 1.3433734939759037, "grad_norm": 0.40908730030059814, "learning_rate": 2.417264802420343e-06, "loss": 1.0535, "step": 450 }, { "epoch": 1.3463855421686746, "grad_norm": 0.3725447654724121, "learning_rate": 2.396728805331167e-06, "loss": 1.0547, "step": 451 }, { "epoch": 1.3493975903614457, "grad_norm": 0.39754316210746765, "learning_rate": 2.3762528789604887e-06, "loss": 1.0292, "step": 452 }, { "epoch": 1.3524096385542168, "grad_norm": 0.39532670378685, "learning_rate": 2.3558374957921678e-06, "loss": 1.0182, "step": 453 }, { "epoch": 1.355421686746988, "grad_norm": 0.40215209126472473, "learning_rate": 2.3354831269130133e-06, "loss": 1.0495, "step": 454 }, { "epoch": 1.358433734939759, "grad_norm": 0.421367347240448, "learning_rate": 2.3151902420019357e-06, "loss": 1.0389, "step": 455 }, { "epoch": 1.3614457831325302, "grad_norm": 0.38005751371383667, "learning_rate": 2.2949593093190863e-06, "loss": 1.0681, "step": 456 }, { "epoch": 1.3644578313253013, "grad_norm": 0.3765680193901062, "learning_rate": 2.274790795695071e-06, "loss": 1.0338, "step": 457 }, { "epoch": 1.3674698795180724, "grad_norm": 0.34579628705978394, "learning_rate": 2.2546851665201692e-06, "loss": 1.0749, "step": 458 }, { "epoch": 1.3704819277108433, "grad_norm": 0.3837708830833435, "learning_rate": 2.2346428857335904e-06, "loss": 1.0642, "step": 459 }, { "epoch": 1.3734939759036144, "grad_norm": 0.3635129928588867, "learning_rate": 2.2146644158127827e-06, "loss": 1.0432, "step": 460 }, { "epoch": 1.3765060240963856, "grad_norm": 0.40961354970932007, "learning_rate": 2.1947502177627437e-06, "loss": 1.0437, "step": 461 }, { "epoch": 1.3795180722891567, "grad_norm": 0.37368935346603394, "learning_rate": 2.1749007511054005e-06, "loss": 1.0578, "step": 462 }, { "epoch": 1.3825301204819276, "grad_norm": 0.40420466661453247, "learning_rate": 2.1551164738689896e-06, "loss": 1.0743, "step": 463 }, { "epoch": 1.3855421686746987, "grad_norm": 0.3825657069683075, "learning_rate": 2.1353978425775006e-06, "loss": 1.0327, "step": 464 }, { "epoch": 1.3885542168674698, "grad_norm": 0.39921796321868896, "learning_rate": 2.1157453122401385e-06, "loss": 1.0576, "step": 465 }, { "epoch": 1.391566265060241, "grad_norm": 0.36656901240348816, "learning_rate": 2.0961593363408154e-06, "loss": 1.0264, "step": 466 }, { "epoch": 1.394578313253012, "grad_norm": 0.3587695360183716, "learning_rate": 2.076640366827703e-06, "loss": 1.071, "step": 467 }, { "epoch": 1.3975903614457832, "grad_norm": 0.3668745756149292, "learning_rate": 2.0571888541027857e-06, "loss": 0.9852, "step": 468 }, { "epoch": 1.4006024096385543, "grad_norm": 0.41092541813850403, "learning_rate": 2.0378052470114822e-06, "loss": 1.0234, "step": 469 }, { "epoch": 1.4036144578313254, "grad_norm": 0.42871734499931335, "learning_rate": 2.018489992832283e-06, "loss": 1.0427, "step": 470 }, { "epoch": 1.4066265060240963, "grad_norm": 0.3699125349521637, "learning_rate": 1.999243537266424e-06, "loss": 1.0422, "step": 471 }, { "epoch": 1.4096385542168675, "grad_norm": 0.36434075236320496, "learning_rate": 1.980066324427613e-06, "loss": 1.0588, "step": 472 }, { "epoch": 1.4126506024096386, "grad_norm": 0.4026855528354645, "learning_rate": 1.960958796831769e-06, "loss": 1.0295, "step": 473 }, { "epoch": 1.4156626506024097, "grad_norm": 0.3882656395435333, "learning_rate": 1.9419213953868236e-06, "loss": 1.0366, "step": 474 }, { "epoch": 1.4186746987951806, "grad_norm": 0.40121057629585266, "learning_rate": 1.9229545593825367e-06, "loss": 1.0806, "step": 475 }, { "epoch": 1.4216867469879517, "grad_norm": 0.3884546756744385, "learning_rate": 1.9040587264803673e-06, "loss": 1.1063, "step": 476 }, { "epoch": 1.4246987951807228, "grad_norm": 0.3452583849430084, "learning_rate": 1.8852343327033717e-06, "loss": 1.0373, "step": 477 }, { "epoch": 1.427710843373494, "grad_norm": 0.39576640725135803, "learning_rate": 1.8664818124261375e-06, "loss": 1.0804, "step": 478 }, { "epoch": 1.430722891566265, "grad_norm": 0.40806901454925537, "learning_rate": 1.8478015983647718e-06, "loss": 1.0341, "step": 479 }, { "epoch": 1.4337349397590362, "grad_norm": 0.37504813075065613, "learning_rate": 1.8291941215669024e-06, "loss": 1.0557, "step": 480 }, { "epoch": 1.4367469879518073, "grad_norm": 0.39833274483680725, "learning_rate": 1.8106598114017398e-06, "loss": 1.0336, "step": 481 }, { "epoch": 1.4397590361445782, "grad_norm": 0.39540019631385803, "learning_rate": 1.7921990955501705e-06, "loss": 1.0473, "step": 482 }, { "epoch": 1.4427710843373494, "grad_norm": 0.40363839268684387, "learning_rate": 1.7738123999948853e-06, "loss": 1.0193, "step": 483 }, { "epoch": 1.4457831325301205, "grad_norm": 0.37323495745658875, "learning_rate": 1.755500149010549e-06, "loss": 0.9827, "step": 484 }, { "epoch": 1.4487951807228916, "grad_norm": 0.41902491450309753, "learning_rate": 1.737262765154008e-06, "loss": 1.069, "step": 485 }, { "epoch": 1.4518072289156627, "grad_norm": 0.40718671679496765, "learning_rate": 1.7191006692545493e-06, "loss": 1.0873, "step": 486 }, { "epoch": 1.4548192771084336, "grad_norm": 0.4020282030105591, "learning_rate": 1.7010142804041785e-06, "loss": 1.0425, "step": 487 }, { "epoch": 1.4578313253012047, "grad_norm": 0.3684733510017395, "learning_rate": 1.6830040159479521e-06, "loss": 1.0121, "step": 488 }, { "epoch": 1.4608433734939759, "grad_norm": 0.3506666421890259, "learning_rate": 1.66507029147436e-06, "loss": 1.0484, "step": 489 }, { "epoch": 1.463855421686747, "grad_norm": 0.468654602766037, "learning_rate": 1.6472135208057128e-06, "loss": 1.0682, "step": 490 }, { "epoch": 1.466867469879518, "grad_norm": 0.4075433313846588, "learning_rate": 1.629434115988614e-06, "loss": 1.0589, "step": 491 }, { "epoch": 1.4698795180722892, "grad_norm": 0.3535695970058441, "learning_rate": 1.611732487284437e-06, "loss": 1.0628, "step": 492 }, { "epoch": 1.4728915662650603, "grad_norm": 0.37299081683158875, "learning_rate": 1.5941090431598654e-06, "loss": 1.019, "step": 493 }, { "epoch": 1.4759036144578312, "grad_norm": 0.34906429052352905, "learning_rate": 1.5765641902774704e-06, "loss": 1.0281, "step": 494 }, { "epoch": 1.4789156626506024, "grad_norm": 0.4228847920894623, "learning_rate": 1.5590983334863191e-06, "loss": 1.0176, "step": 495 }, { "epoch": 1.4819277108433735, "grad_norm": 0.4109274446964264, "learning_rate": 1.5417118758126408e-06, "loss": 1.0818, "step": 496 }, { "epoch": 1.4849397590361446, "grad_norm": 0.3916458189487457, "learning_rate": 1.524405218450517e-06, "loss": 1.0299, "step": 497 }, { "epoch": 1.4879518072289157, "grad_norm": 0.3761802911758423, "learning_rate": 1.5071787607526366e-06, "loss": 1.0152, "step": 498 }, { "epoch": 1.4909638554216866, "grad_norm": 0.3690100610256195, "learning_rate": 1.4900329002210684e-06, "loss": 1.0818, "step": 499 }, { "epoch": 1.4939759036144578, "grad_norm": 0.36231639981269836, "learning_rate": 1.472968032498095e-06, "loss": 1.0708, "step": 500 }, { "epoch": 1.4969879518072289, "grad_norm": 0.3943842053413391, "learning_rate": 1.4559845513570859e-06, "loss": 1.0399, "step": 501 }, { "epoch": 1.5, "grad_norm": 0.380312979221344, "learning_rate": 1.439082848693406e-06, "loss": 0.9593, "step": 502 }, { "epoch": 1.5030120481927711, "grad_norm": 0.43198204040527344, "learning_rate": 1.4222633145153758e-06, "loss": 0.9807, "step": 503 }, { "epoch": 1.5060240963855422, "grad_norm": 0.3783879578113556, "learning_rate": 1.4055263369352673e-06, "loss": 1.0255, "step": 504 }, { "epoch": 1.5090361445783134, "grad_norm": 0.3918922543525696, "learning_rate": 1.388872302160353e-06, "loss": 1.0401, "step": 505 }, { "epoch": 1.5120481927710845, "grad_norm": 0.39092695713043213, "learning_rate": 1.3723015944839947e-06, "loss": 1.0715, "step": 506 }, { "epoch": 1.5150602409638554, "grad_norm": 0.33539846539497375, "learning_rate": 1.35581459627677e-06, "loss": 1.0185, "step": 507 }, { "epoch": 1.5180722891566265, "grad_norm": 0.3622112572193146, "learning_rate": 1.339411687977657e-06, "loss": 1.0932, "step": 508 }, { "epoch": 1.5210843373493976, "grad_norm": 0.3799549341201782, "learning_rate": 1.3230932480852487e-06, "loss": 1.0413, "step": 509 }, { "epoch": 1.5240963855421685, "grad_norm": 0.37101662158966064, "learning_rate": 1.3068596531490253e-06, "loss": 1.0402, "step": 510 }, { "epoch": 1.5271084337349397, "grad_norm": 0.3901662826538086, "learning_rate": 1.290711277760658e-06, "loss": 1.0245, "step": 511 }, { "epoch": 1.5301204819277108, "grad_norm": 0.37363961338996887, "learning_rate": 1.2746484945453691e-06, "loss": 1.0387, "step": 512 }, { "epoch": 1.533132530120482, "grad_norm": 0.376298725605011, "learning_rate": 1.2586716741533389e-06, "loss": 1.0305, "step": 513 }, { "epoch": 1.536144578313253, "grad_norm": 0.35384973883628845, "learning_rate": 1.2427811852511396e-06, "loss": 1.0001, "step": 514 }, { "epoch": 1.5391566265060241, "grad_norm": 0.3355305790901184, "learning_rate": 1.226977394513247e-06, "loss": 1.0756, "step": 515 }, { "epoch": 1.5421686746987953, "grad_norm": 0.3982202112674713, "learning_rate": 1.2112606666135602e-06, "loss": 1.0102, "step": 516 }, { "epoch": 1.5451807228915664, "grad_norm": 0.33996695280075073, "learning_rate": 1.1956313642169974e-06, "loss": 1.0388, "step": 517 }, { "epoch": 1.5481927710843375, "grad_norm": 0.3969401717185974, "learning_rate": 1.1800898479711293e-06, "loss": 1.0541, "step": 518 }, { "epoch": 1.5512048192771084, "grad_norm": 0.3649154603481293, "learning_rate": 1.1646364764978468e-06, "loss": 1.0625, "step": 519 }, { "epoch": 1.5542168674698795, "grad_norm": 0.39856594800949097, "learning_rate": 1.1492716063850973e-06, "loss": 1.0405, "step": 520 }, { "epoch": 1.5572289156626506, "grad_norm": 0.3574175238609314, "learning_rate": 1.1339955921786504e-06, "loss": 1.0486, "step": 521 }, { "epoch": 1.5602409638554215, "grad_norm": 0.36913472414016724, "learning_rate": 1.1188087863739173e-06, "loss": 0.9595, "step": 522 }, { "epoch": 1.5632530120481927, "grad_norm": 0.32440900802612305, "learning_rate": 1.1037115394078162e-06, "loss": 1.0586, "step": 523 }, { "epoch": 1.5662650602409638, "grad_norm": 0.41809505224227905, "learning_rate": 1.0887041996506858e-06, "loss": 1.0959, "step": 524 }, { "epoch": 1.569277108433735, "grad_norm": 0.3481323719024658, "learning_rate": 1.0737871133982524e-06, "loss": 1.0388, "step": 525 }, { "epoch": 1.572289156626506, "grad_norm": 0.3880089223384857, "learning_rate": 1.0589606248636291e-06, "loss": 1.0153, "step": 526 }, { "epoch": 1.5753012048192772, "grad_norm": 0.3808007836341858, "learning_rate": 1.0442250761693829e-06, "loss": 1.0111, "step": 527 }, { "epoch": 1.5783132530120483, "grad_norm": 0.38831576704978943, "learning_rate": 1.0295808073396352e-06, "loss": 0.9816, "step": 528 }, { "epoch": 1.5813253012048194, "grad_norm": 0.41834479570388794, "learning_rate": 1.015028156292212e-06, "loss": 1.0189, "step": 529 }, { "epoch": 1.5843373493975905, "grad_norm": 0.3809266984462738, "learning_rate": 1.0005674588308566e-06, "loss": 1.0146, "step": 530 }, { "epoch": 1.5873493975903614, "grad_norm": 0.4059775471687317, "learning_rate": 9.861990486374695e-07, "loss": 0.9792, "step": 531 }, { "epoch": 1.5903614457831325, "grad_norm": 0.36427873373031616, "learning_rate": 9.719232572644189e-07, "loss": 1.0827, "step": 532 }, { "epoch": 1.5933734939759037, "grad_norm": 0.3794417679309845, "learning_rate": 9.577404141268815e-07, "loss": 1.0314, "step": 533 }, { "epoch": 1.5963855421686746, "grad_norm": 0.40571263432502747, "learning_rate": 9.436508464952471e-07, "loss": 1.0521, "step": 534 }, { "epoch": 1.5993975903614457, "grad_norm": 0.36858484148979187, "learning_rate": 9.296548794875659e-07, "loss": 1.0314, "step": 535 }, { "epoch": 1.6024096385542168, "grad_norm": 0.35998910665512085, "learning_rate": 9.157528360620416e-07, "loss": 1.0451, "step": 536 }, { "epoch": 1.605421686746988, "grad_norm": 0.3696284294128418, "learning_rate": 9.019450370095867e-07, "loss": 0.9977, "step": 537 }, { "epoch": 1.608433734939759, "grad_norm": 0.4475997984409332, "learning_rate": 8.882318009464124e-07, "loss": 1.0073, "step": 538 }, { "epoch": 1.6114457831325302, "grad_norm": 0.40017929673194885, "learning_rate": 8.74613444306684e-07, "loss": 0.9603, "step": 539 }, { "epoch": 1.6144578313253013, "grad_norm": 0.3758133053779602, "learning_rate": 8.61090281335214e-07, "loss": 0.9584, "step": 540 }, { "epoch": 1.6174698795180724, "grad_norm": 0.35535839200019836, "learning_rate": 8.476626240802099e-07, "loss": 1.1102, "step": 541 }, { "epoch": 1.6204819277108435, "grad_norm": 0.43646156787872314, "learning_rate": 8.343307823860819e-07, "loss": 1.0792, "step": 542 }, { "epoch": 1.6234939759036144, "grad_norm": 0.39517444372177124, "learning_rate": 8.210950638862813e-07, "loss": 1.0216, "step": 543 }, { "epoch": 1.6265060240963856, "grad_norm": 0.42866745591163635, "learning_rate": 8.079557739962129e-07, "loss": 1.0596, "step": 544 }, { "epoch": 1.6295180722891565, "grad_norm": 0.3550488352775574, "learning_rate": 7.949132159061784e-07, "loss": 1.0535, "step": 545 }, { "epoch": 1.6325301204819276, "grad_norm": 0.3993145823478699, "learning_rate": 7.819676905743872e-07, "loss": 1.008, "step": 546 }, { "epoch": 1.6355421686746987, "grad_norm": 0.39240461587905884, "learning_rate": 7.691194967200099e-07, "loss": 1.0231, "step": 547 }, { "epoch": 1.6385542168674698, "grad_norm": 0.356810063123703, "learning_rate": 7.563689308162803e-07, "loss": 1.0048, "step": 548 }, { "epoch": 1.641566265060241, "grad_norm": 0.36379274725914, "learning_rate": 7.43716287083664e-07, "loss": 1.0925, "step": 549 }, { "epoch": 1.644578313253012, "grad_norm": 0.4245232045650482, "learning_rate": 7.31161857483057e-07, "loss": 1.0368, "step": 550 }, { "epoch": 1.6475903614457832, "grad_norm": 0.3779962658882141, "learning_rate": 7.187059317090622e-07, "loss": 1.1019, "step": 551 }, { "epoch": 1.6506024096385543, "grad_norm": 0.41444671154022217, "learning_rate": 7.063487971832922e-07, "loss": 1.084, "step": 552 }, { "epoch": 1.6536144578313254, "grad_norm": 0.369693398475647, "learning_rate": 6.940907390477458e-07, "loss": 1.0164, "step": 553 }, { "epoch": 1.6566265060240963, "grad_norm": 0.43131789565086365, "learning_rate": 6.819320401582258e-07, "loss": 1.0915, "step": 554 }, { "epoch": 1.6596385542168675, "grad_norm": 0.41402745246887207, "learning_rate": 6.698729810778065e-07, "loss": 1.0014, "step": 555 }, { "epoch": 1.6626506024096386, "grad_norm": 0.38247060775756836, "learning_rate": 6.579138400703716e-07, "loss": 1.0127, "step": 556 }, { "epoch": 1.6656626506024095, "grad_norm": 0.45507028698921204, "learning_rate": 6.460548930941801e-07, "loss": 1.0202, "step": 557 }, { "epoch": 1.6686746987951806, "grad_norm": 0.381002813577652, "learning_rate": 6.342964137955071e-07, "loss": 1.035, "step": 558 }, { "epoch": 1.6716867469879517, "grad_norm": 0.4605034291744232, "learning_rate": 6.226386735023271e-07, "loss": 1.0472, "step": 559 }, { "epoch": 1.6746987951807228, "grad_norm": 0.3616805970668793, "learning_rate": 6.110819412180535e-07, "loss": 1.0302, "step": 560 }, { "epoch": 1.677710843373494, "grad_norm": 0.3862994313240051, "learning_rate": 5.99626483615331e-07, "loss": 1.024, "step": 561 }, { "epoch": 1.680722891566265, "grad_norm": 0.406364381313324, "learning_rate": 5.882725650298787e-07, "loss": 1.0184, "step": 562 }, { "epoch": 1.6837349397590362, "grad_norm": 0.42682695388793945, "learning_rate": 5.770204474543978e-07, "loss": 1.0347, "step": 563 }, { "epoch": 1.6867469879518073, "grad_norm": 0.4065680503845215, "learning_rate": 5.658703905325186e-07, "loss": 1.0352, "step": 564 }, { "epoch": 1.6897590361445785, "grad_norm": 0.402649462223053, "learning_rate": 5.548226515528133e-07, "loss": 1.0293, "step": 565 }, { "epoch": 1.6927710843373494, "grad_norm": 0.38777557015419006, "learning_rate": 5.438774854428614e-07, "loss": 1.0521, "step": 566 }, { "epoch": 1.6957831325301205, "grad_norm": 0.42119914293289185, "learning_rate": 5.330351447633603e-07, "loss": 1.0862, "step": 567 }, { "epoch": 1.6987951807228916, "grad_norm": 0.3981137275695801, "learning_rate": 5.222958797023036e-07, "loss": 1.0312, "step": 568 }, { "epoch": 1.7018072289156625, "grad_norm": 0.40969544649124146, "learning_rate": 5.11659938069205e-07, "loss": 1.0397, "step": 569 }, { "epoch": 1.7048192771084336, "grad_norm": 0.373832643032074, "learning_rate": 5.011275652893782e-07, "loss": 1.0546, "step": 570 }, { "epoch": 1.7078313253012047, "grad_norm": 0.4301709532737732, "learning_rate": 4.906990043982813e-07, "loss": 1.0475, "step": 571 }, { "epoch": 1.7108433734939759, "grad_norm": 0.4075815975666046, "learning_rate": 4.803744960358992e-07, "loss": 0.9895, "step": 572 }, { "epoch": 1.713855421686747, "grad_norm": 0.41060760617256165, "learning_rate": 4.701542784411994e-07, "loss": 1.032, "step": 573 }, { "epoch": 1.716867469879518, "grad_norm": 0.38388729095458984, "learning_rate": 4.6003858744662564e-07, "loss": 1.0629, "step": 574 }, { "epoch": 1.7198795180722892, "grad_norm": 0.37711286544799805, "learning_rate": 4.500276564726652e-07, "loss": 1.0032, "step": 575 }, { "epoch": 1.7228915662650603, "grad_norm": 0.4005860388278961, "learning_rate": 4.401217165224564e-07, "loss": 1.0953, "step": 576 }, { "epoch": 1.7259036144578315, "grad_norm": 0.39737778902053833, "learning_rate": 4.3032099617645874e-07, "loss": 1.0731, "step": 577 }, { "epoch": 1.7289156626506024, "grad_norm": 0.39624249935150146, "learning_rate": 4.2062572158718284e-07, "loss": 1.0633, "step": 578 }, { "epoch": 1.7319277108433735, "grad_norm": 0.3743440508842468, "learning_rate": 4.1103611647396734e-07, "loss": 1.0415, "step": 579 }, { "epoch": 1.7349397590361446, "grad_norm": 0.3983217477798462, "learning_rate": 4.0155240211781966e-07, "loss": 1.0129, "step": 580 }, { "epoch": 1.7379518072289155, "grad_norm": 0.4027600586414337, "learning_rate": 3.921747973563056e-07, "loss": 1.0909, "step": 581 }, { "epoch": 1.7409638554216866, "grad_norm": 0.4163624942302704, "learning_rate": 3.829035185785035e-07, "loss": 1.0766, "step": 582 }, { "epoch": 1.7439759036144578, "grad_norm": 0.3989628255367279, "learning_rate": 3.737387797200126e-07, "loss": 1.0506, "step": 583 }, { "epoch": 1.7469879518072289, "grad_norm": 0.339167058467865, "learning_rate": 3.646807922580098e-07, "loss": 1.027, "step": 584 }, { "epoch": 1.75, "grad_norm": 0.44778549671173096, "learning_rate": 3.557297652063768e-07, "loss": 1.0107, "step": 585 }, { "epoch": 1.7530120481927711, "grad_norm": 0.43992355465888977, "learning_rate": 3.4688590511087304e-07, "loss": 1.0068, "step": 586 }, { "epoch": 1.7560240963855422, "grad_norm": 0.36404716968536377, "learning_rate": 3.3814941604437155e-07, "loss": 1.0631, "step": 587 }, { "epoch": 1.7590361445783134, "grad_norm": 0.39936619997024536, "learning_rate": 3.2952049960214785e-07, "loss": 0.9933, "step": 588 }, { "epoch": 1.7620481927710845, "grad_norm": 0.42165055871009827, "learning_rate": 3.20999354897229e-07, "loss": 1.0362, "step": 589 }, { "epoch": 1.7650602409638554, "grad_norm": 0.41388002038002014, "learning_rate": 3.1258617855580155e-07, "loss": 1.048, "step": 590 }, { "epoch": 1.7680722891566265, "grad_norm": 0.37040451169013977, "learning_rate": 3.0428116471267146e-07, "loss": 1.073, "step": 591 }, { "epoch": 1.7710843373493976, "grad_norm": 0.4236885607242584, "learning_rate": 2.9608450500678566e-07, "loss": 1.076, "step": 592 }, { "epoch": 1.7740963855421685, "grad_norm": 0.442690908908844, "learning_rate": 2.879963885768083e-07, "loss": 1.0546, "step": 593 }, { "epoch": 1.7771084337349397, "grad_norm": 0.4121167063713074, "learning_rate": 2.800170020567566e-07, "loss": 1.0169, "step": 594 }, { "epoch": 1.7801204819277108, "grad_norm": 0.37887606024742126, "learning_rate": 2.721465295716996e-07, "loss": 1.0828, "step": 595 }, { "epoch": 1.783132530120482, "grad_norm": 0.380744993686676, "learning_rate": 2.643851527335006e-07, "loss": 1.0376, "step": 596 }, { "epoch": 1.786144578313253, "grad_norm": 0.3593333065509796, "learning_rate": 2.5673305063663335e-07, "loss": 0.9841, "step": 597 }, { "epoch": 1.7891566265060241, "grad_norm": 0.40235635638237, "learning_rate": 2.4919039985404626e-07, "loss": 1.0454, "step": 598 }, { "epoch": 1.7921686746987953, "grad_norm": 0.3604947030544281, "learning_rate": 2.4175737443308976e-07, "loss": 1.0195, "step": 599 }, { "epoch": 1.7951807228915664, "grad_norm": 0.3955729007720947, "learning_rate": 2.3443414589149838e-07, "loss": 1.0324, "step": 600 }, { "epoch": 1.7981927710843375, "grad_norm": 0.3765583038330078, "learning_rate": 2.272208832134326e-07, "loss": 1.0905, "step": 601 }, { "epoch": 1.8012048192771084, "grad_norm": 0.3759413957595825, "learning_rate": 2.201177528455828e-07, "loss": 1.0711, "step": 602 }, { "epoch": 1.8042168674698795, "grad_norm": 0.3818155825138092, "learning_rate": 2.131249186933243e-07, "loss": 1.0911, "step": 603 }, { "epoch": 1.8072289156626506, "grad_norm": 0.39899012446403503, "learning_rate": 2.0624254211693894e-07, "loss": 1.0562, "step": 604 }, { "epoch": 1.8102409638554215, "grad_norm": 0.390240341424942, "learning_rate": 1.994707819278896e-07, "loss": 1.02, "step": 605 }, { "epoch": 1.8132530120481927, "grad_norm": 0.4145658016204834, "learning_rate": 1.9280979438515479e-07, "loss": 1.022, "step": 606 }, { "epoch": 1.8162650602409638, "grad_norm": 0.4315582811832428, "learning_rate": 1.8625973319162605e-07, "loss": 1.0332, "step": 607 }, { "epoch": 1.819277108433735, "grad_norm": 0.40108925104141235, "learning_rate": 1.7982074949055794e-07, "loss": 1.0494, "step": 608 }, { "epoch": 1.822289156626506, "grad_norm": 0.37406808137893677, "learning_rate": 1.7349299186208258e-07, "loss": 1.0744, "step": 609 }, { "epoch": 1.8253012048192772, "grad_norm": 0.49371138215065, "learning_rate": 1.6727660631977894e-07, "loss": 1.0319, "step": 610 }, { "epoch": 1.8283132530120483, "grad_norm": 0.3623702824115753, "learning_rate": 1.6117173630730787e-07, "loss": 1.1106, "step": 611 }, { "epoch": 1.8313253012048194, "grad_norm": 0.3635117709636688, "learning_rate": 1.5517852269509692e-07, "loss": 1.0454, "step": 612 }, { "epoch": 1.8343373493975905, "grad_norm": 0.3568932116031647, "learning_rate": 1.492971037770924e-07, "loss": 1.0008, "step": 613 }, { "epoch": 1.8373493975903614, "grad_norm": 0.3618745803833008, "learning_rate": 1.435276152675691e-07, "loss": 1.0265, "step": 614 }, { "epoch": 1.8403614457831325, "grad_norm": 0.3803001940250397, "learning_rate": 1.378701902979962e-07, "loss": 1.0643, "step": 615 }, { "epoch": 1.8433734939759037, "grad_norm": 0.459064781665802, "learning_rate": 1.323249594139664e-07, "loss": 1.0108, "step": 616 }, { "epoch": 1.8463855421686746, "grad_norm": 0.3967600166797638, "learning_rate": 1.2689205057218602e-07, "loss": 0.9983, "step": 617 }, { "epoch": 1.8493975903614457, "grad_norm": 0.3921276926994324, "learning_rate": 1.2157158913751687e-07, "loss": 0.9829, "step": 618 }, { "epoch": 1.8524096385542168, "grad_norm": 0.3510358929634094, "learning_rate": 1.1636369788008973e-07, "loss": 1.0848, "step": 619 }, { "epoch": 1.855421686746988, "grad_norm": 0.33366602659225464, "learning_rate": 1.1126849697246533e-07, "loss": 1.0474, "step": 620 }, { "epoch": 1.858433734939759, "grad_norm": 0.3874680697917938, "learning_rate": 1.0628610398686679e-07, "loss": 1.0968, "step": 621 }, { "epoch": 1.8614457831325302, "grad_norm": 0.3490632474422455, "learning_rate": 1.014166338924627e-07, "loss": 1.0689, "step": 622 }, { "epoch": 1.8644578313253013, "grad_norm": 0.44556349515914917, "learning_rate": 9.666019905271662e-08, "loss": 1.0402, "step": 623 }, { "epoch": 1.8674698795180724, "grad_norm": 0.4002796411514282, "learning_rate": 9.201690922279405e-08, "loss": 1.0333, "step": 624 }, { "epoch": 1.8704819277108435, "grad_norm": 0.4069937467575073, "learning_rate": 8.748687154702673e-08, "loss": 1.1043, "step": 625 }, { "epoch": 1.8734939759036144, "grad_norm": 0.4305992126464844, "learning_rate": 8.307019055644517e-08, "loss": 1.0116, "step": 626 }, { "epoch": 1.8765060240963856, "grad_norm": 0.36993542313575745, "learning_rate": 7.876696816636276e-08, "loss": 1.0075, "step": 627 }, { "epoch": 1.8795180722891565, "grad_norm": 0.3679683208465576, "learning_rate": 7.45773036740255e-08, "loss": 1.0131, "step": 628 }, { "epoch": 1.8825301204819276, "grad_norm": 0.39285150170326233, "learning_rate": 7.050129375632098e-08, "loss": 1.0376, "step": 629 }, { "epoch": 1.8855421686746987, "grad_norm": 0.4058956801891327, "learning_rate": 6.65390324675469e-08, "loss": 1.0214, "step": 630 }, { "epoch": 1.8885542168674698, "grad_norm": 0.3686175048351288, "learning_rate": 6.269061123724163e-08, "loss": 1.0229, "step": 631 }, { "epoch": 1.891566265060241, "grad_norm": 0.37797847390174866, "learning_rate": 5.895611886807317e-08, "loss": 1.0389, "step": 632 }, { "epoch": 1.894578313253012, "grad_norm": 0.4053489565849304, "learning_rate": 5.533564153379134e-08, "loss": 1.0475, "step": 633 }, { "epoch": 1.8975903614457832, "grad_norm": 0.4105575382709503, "learning_rate": 5.182926277723821e-08, "loss": 1.029, "step": 634 }, { "epoch": 1.9006024096385543, "grad_norm": 0.3795925974845886, "learning_rate": 4.843706350842081e-08, "loss": 1.0502, "step": 635 }, { "epoch": 1.9036144578313254, "grad_norm": 0.3822070062160492, "learning_rate": 4.515912200264427e-08, "loss": 1.025, "step": 636 }, { "epoch": 1.9066265060240963, "grad_norm": 0.389304518699646, "learning_rate": 4.19955138987066e-08, "loss": 1.0387, "step": 637 }, { "epoch": 1.9096385542168675, "grad_norm": 0.4106118381023407, "learning_rate": 3.894631219715006e-08, "loss": 1.0442, "step": 638 }, { "epoch": 1.9126506024096386, "grad_norm": 0.39749521017074585, "learning_rate": 3.601158725858034e-08, "loss": 1.0183, "step": 639 }, { "epoch": 1.9156626506024095, "grad_norm": 0.37224337458610535, "learning_rate": 3.3191406802041693e-08, "loss": 0.9505, "step": 640 }, { "epoch": 1.9186746987951806, "grad_norm": 0.4012593924999237, "learning_rate": 3.048583590345266e-08, "loss": 0.9986, "step": 641 }, { "epoch": 1.9216867469879517, "grad_norm": 0.39663243293762207, "learning_rate": 2.7894936994106724e-08, "loss": 1.0163, "step": 642 }, { "epoch": 1.9246987951807228, "grad_norm": 0.467464804649353, "learning_rate": 2.5418769859231194e-08, "loss": 1.0142, "step": 643 }, { "epoch": 1.927710843373494, "grad_norm": 0.3819372355937958, "learning_rate": 2.3057391636606698e-08, "loss": 0.993, "step": 644 }, { "epoch": 1.930722891566265, "grad_norm": 0.3579060733318329, "learning_rate": 2.081085681524986e-08, "loss": 1.0213, "step": 645 }, { "epoch": 1.9337349397590362, "grad_norm": 0.33856555819511414, "learning_rate": 1.8679217234154335e-08, "loss": 1.0315, "step": 646 }, { "epoch": 1.9367469879518073, "grad_norm": 0.36222004890441895, "learning_rate": 1.6662522081097308e-08, "loss": 0.9624, "step": 647 }, { "epoch": 1.9397590361445785, "grad_norm": 0.3850827217102051, "learning_rate": 1.4760817891500966e-08, "loss": 1.0241, "step": 648 }, { "epoch": 1.9427710843373494, "grad_norm": 0.42454764246940613, "learning_rate": 1.2974148547362231e-08, "loss": 0.96, "step": 649 }, { "epoch": 1.9457831325301205, "grad_norm": 0.3735847473144531, "learning_rate": 1.1302555276238581e-08, "loss": 1.007, "step": 650 }, { "epoch": 1.9487951807228916, "grad_norm": 0.3734501302242279, "learning_rate": 9.746076650294922e-09, "loss": 1.0119, "step": 651 }, { "epoch": 1.9518072289156625, "grad_norm": 0.431612491607666, "learning_rate": 8.304748585417077e-09, "loss": 1.0564, "step": 652 }, { "epoch": 1.9548192771084336, "grad_norm": 0.3824908137321472, "learning_rate": 6.978604340380779e-09, "loss": 0.9928, "step": 653 }, { "epoch": 1.9578313253012047, "grad_norm": 0.38838639855384827, "learning_rate": 5.767674516083954e-09, "loss": 1.0136, "step": 654 }, { "epoch": 1.9608433734939759, "grad_norm": 0.3842463493347168, "learning_rate": 4.671987054842842e-09, "loss": 1.1033, "step": 655 }, { "epoch": 1.963855421686747, "grad_norm": 0.3982362449169159, "learning_rate": 3.6915672397436208e-09, "loss": 0.9286, "step": 656 }, { "epoch": 1.966867469879518, "grad_norm": 0.4161483645439148, "learning_rate": 2.8264376940634332e-09, "loss": 1.0393, "step": 657 }, { "epoch": 1.9698795180722892, "grad_norm": 0.37065836787223816, "learning_rate": 2.076618380744133e-09, "loss": 1.0168, "step": 658 }, { "epoch": 1.9728915662650603, "grad_norm": 0.3696196973323822, "learning_rate": 1.4421266019348789e-09, "loss": 1.0211, "step": 659 }, { "epoch": 1.9759036144578315, "grad_norm": 0.4084506630897522, "learning_rate": 9.229769985902304e-10, "loss": 1.0667, "step": 660 }, { "epoch": 1.9789156626506024, "grad_norm": 0.4110511839389801, "learning_rate": 5.191815501343067e-10, "loss": 1.044, "step": 661 }, { "epoch": 1.9819277108433735, "grad_norm": 0.36027991771698, "learning_rate": 2.307495741843413e-10, "loss": 1.0415, "step": 662 }, { "epoch": 1.9849397590361446, "grad_norm": 0.3866373598575592, "learning_rate": 5.768772633363284e-11, "loss": 1.0336, "step": 663 }, { "epoch": 1.9879518072289155, "grad_norm": 0.3646948039531708, "learning_rate": 0.0, "loss": 1.0516, "step": 664 } ], "logging_steps": 1, "max_steps": 664, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 166, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.6730319840173097e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }