diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,143794 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999756531054464, + "eval_steps": 500, + "global_step": 20536, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 4.869378910719938e-05, + "grad_norm": 14.938457489013672, + "learning_rate": 6.482982171799027e-08, + "loss": 1.7073, + "step": 1 + }, + { + "epoch": 9.738757821439876e-05, + "grad_norm": 10.37699031829834, + "learning_rate": 1.2965964343598055e-07, + "loss": 1.6562, + "step": 2 + }, + { + "epoch": 0.00014608136732159813, + "grad_norm": 17.40507698059082, + "learning_rate": 1.9448946515397085e-07, + "loss": 1.7121, + "step": 3 + }, + { + "epoch": 0.00019477515642879752, + "grad_norm": 9.880698204040527, + "learning_rate": 2.593192868719611e-07, + "loss": 1.636, + "step": 4 + }, + { + "epoch": 0.00024346894553599688, + "grad_norm": 0.9481208324432373, + "learning_rate": 3.241491085899514e-07, + "loss": 0.8258, + "step": 5 + }, + { + "epoch": 0.00029216273464319626, + "grad_norm": 10.632257461547852, + "learning_rate": 3.889789303079417e-07, + "loss": 1.5371, + "step": 6 + }, + { + "epoch": 0.00034085652375039565, + "grad_norm": 11.937743186950684, + "learning_rate": 4.5380875202593195e-07, + "loss": 1.6412, + "step": 7 + }, + { + "epoch": 0.00038955031285759504, + "grad_norm": 11.181463241577148, + "learning_rate": 5.186385737439222e-07, + "loss": 1.591, + "step": 8 + }, + { + "epoch": 0.00043824410196479437, + "grad_norm": 18.661169052124023, + "learning_rate": 5.834683954619125e-07, + "loss": 1.6372, + "step": 9 + }, + { + "epoch": 0.00048693789107199376, + "grad_norm": 48.73066711425781, + "learning_rate": 6.482982171799028e-07, + "loss": 1.6522, + "step": 10 + }, + { + "epoch": 0.0005356316801791932, + "grad_norm": 19.319507598876953, + "learning_rate": 7.131280388978931e-07, + "loss": 1.5885, + "step": 11 + }, + { + "epoch": 0.0005843254692863925, + "grad_norm": 27.77176856994629, + "learning_rate": 7.779578606158834e-07, + "loss": 1.6645, + "step": 12 + }, + { + "epoch": 0.0006330192583935919, + "grad_norm": 17.04899024963379, + "learning_rate": 8.427876823338736e-07, + "loss": 1.6989, + "step": 13 + }, + { + "epoch": 0.0006817130475007913, + "grad_norm": 10.058494567871094, + "learning_rate": 9.076175040518639e-07, + "loss": 1.5851, + "step": 14 + }, + { + "epoch": 0.0007304068366079906, + "grad_norm": 14.594865798950195, + "learning_rate": 9.724473257698543e-07, + "loss": 1.6077, + "step": 15 + }, + { + "epoch": 0.0007791006257151901, + "grad_norm": 16.03203010559082, + "learning_rate": 1.0372771474878444e-06, + "loss": 1.6259, + "step": 16 + }, + { + "epoch": 0.0008277944148223894, + "grad_norm": 13.072065353393555, + "learning_rate": 1.1021069692058347e-06, + "loss": 1.5192, + "step": 17 + }, + { + "epoch": 0.0008764882039295887, + "grad_norm": 23.050539016723633, + "learning_rate": 1.166936790923825e-06, + "loss": 1.6509, + "step": 18 + }, + { + "epoch": 0.0009251819930367882, + "grad_norm": 15.03418254852295, + "learning_rate": 1.2317666126418153e-06, + "loss": 1.4912, + "step": 19 + }, + { + "epoch": 0.0009738757821439875, + "grad_norm": 14.599044799804688, + "learning_rate": 1.2965964343598056e-06, + "loss": 1.5889, + "step": 20 + }, + { + "epoch": 0.0010225695712511868, + "grad_norm": 14.713092803955078, + "learning_rate": 1.3614262560777957e-06, + "loss": 1.5643, + "step": 21 + }, + { + "epoch": 0.0010712633603583864, + "grad_norm": 10.631231307983398, + "learning_rate": 1.4262560777957862e-06, + "loss": 1.4935, + "step": 22 + }, + { + "epoch": 0.0011199571494655857, + "grad_norm": 10.128983497619629, + "learning_rate": 1.4910858995137765e-06, + "loss": 1.5175, + "step": 23 + }, + { + "epoch": 0.001168650938572785, + "grad_norm": 8.249070167541504, + "learning_rate": 1.5559157212317668e-06, + "loss": 1.4488, + "step": 24 + }, + { + "epoch": 0.0012173447276799844, + "grad_norm": 10.228593826293945, + "learning_rate": 1.6207455429497571e-06, + "loss": 1.4544, + "step": 25 + }, + { + "epoch": 0.0012660385167871837, + "grad_norm": 9.103757858276367, + "learning_rate": 1.6855753646677472e-06, + "loss": 1.4424, + "step": 26 + }, + { + "epoch": 0.0013147323058943833, + "grad_norm": 7.3896260261535645, + "learning_rate": 1.7504051863857377e-06, + "loss": 1.3459, + "step": 27 + }, + { + "epoch": 0.0013634260950015826, + "grad_norm": 7.2700886726379395, + "learning_rate": 1.8152350081037278e-06, + "loss": 1.3951, + "step": 28 + }, + { + "epoch": 0.001412119884108782, + "grad_norm": 11.91407299041748, + "learning_rate": 1.8800648298217181e-06, + "loss": 1.3878, + "step": 29 + }, + { + "epoch": 0.0014608136732159813, + "grad_norm": 7.582723617553711, + "learning_rate": 1.9448946515397086e-06, + "loss": 1.4431, + "step": 30 + }, + { + "epoch": 0.0015095074623231806, + "grad_norm": 5.829516410827637, + "learning_rate": 2.0097244732576985e-06, + "loss": 1.4062, + "step": 31 + }, + { + "epoch": 0.0015582012514303801, + "grad_norm": 10.774565696716309, + "learning_rate": 2.074554294975689e-06, + "loss": 1.3868, + "step": 32 + }, + { + "epoch": 0.0016068950405375795, + "grad_norm": 13.08377742767334, + "learning_rate": 2.1393841166936795e-06, + "loss": 1.4133, + "step": 33 + }, + { + "epoch": 0.0016555888296447788, + "grad_norm": 0.7290104627609253, + "learning_rate": 2.2042139384116694e-06, + "loss": 0.8838, + "step": 34 + }, + { + "epoch": 0.0017042826187519781, + "grad_norm": 5.386821746826172, + "learning_rate": 2.2690437601296597e-06, + "loss": 1.3546, + "step": 35 + }, + { + "epoch": 0.0017529764078591775, + "grad_norm": 11.15325927734375, + "learning_rate": 2.33387358184765e-06, + "loss": 1.3567, + "step": 36 + }, + { + "epoch": 0.001801670196966377, + "grad_norm": 5.735583782196045, + "learning_rate": 2.3987034035656403e-06, + "loss": 1.3756, + "step": 37 + }, + { + "epoch": 0.0018503639860735764, + "grad_norm": 5.347208499908447, + "learning_rate": 2.4635332252836306e-06, + "loss": 1.1982, + "step": 38 + }, + { + "epoch": 0.0018990577751807757, + "grad_norm": 4.520090103149414, + "learning_rate": 2.528363047001621e-06, + "loss": 1.3719, + "step": 39 + }, + { + "epoch": 0.001947751564287975, + "grad_norm": 9.583420753479004, + "learning_rate": 2.593192868719611e-06, + "loss": 1.2422, + "step": 40 + }, + { + "epoch": 0.0019964453533951746, + "grad_norm": 4.523978233337402, + "learning_rate": 2.6580226904376015e-06, + "loss": 1.2754, + "step": 41 + }, + { + "epoch": 0.0020451391425023737, + "grad_norm": 7.033502101898193, + "learning_rate": 2.7228525121555914e-06, + "loss": 1.1466, + "step": 42 + }, + { + "epoch": 0.0020938329316095732, + "grad_norm": 6.324263095855713, + "learning_rate": 2.787682333873582e-06, + "loss": 1.3316, + "step": 43 + }, + { + "epoch": 0.002142526720716773, + "grad_norm": 6.586194038391113, + "learning_rate": 2.8525121555915724e-06, + "loss": 1.3247, + "step": 44 + }, + { + "epoch": 0.002191220509823972, + "grad_norm": 3.474792003631592, + "learning_rate": 2.9173419773095623e-06, + "loss": 1.3032, + "step": 45 + }, + { + "epoch": 0.0022399142989311715, + "grad_norm": 4.8373823165893555, + "learning_rate": 2.982171799027553e-06, + "loss": 1.2938, + "step": 46 + }, + { + "epoch": 0.0022886080880383706, + "grad_norm": 4.593636512756348, + "learning_rate": 3.0470016207455433e-06, + "loss": 1.3274, + "step": 47 + }, + { + "epoch": 0.00233730187714557, + "grad_norm": 3.5691211223602295, + "learning_rate": 3.1118314424635336e-06, + "loss": 1.2213, + "step": 48 + }, + { + "epoch": 0.0023859956662527697, + "grad_norm": 3.3933768272399902, + "learning_rate": 3.1766612641815235e-06, + "loss": 1.2247, + "step": 49 + }, + { + "epoch": 0.0024346894553599688, + "grad_norm": 3.5371007919311523, + "learning_rate": 3.2414910858995142e-06, + "loss": 1.3008, + "step": 50 + }, + { + "epoch": 0.0024833832444671683, + "grad_norm": 3.5540590286254883, + "learning_rate": 3.3063209076175045e-06, + "loss": 1.3844, + "step": 51 + }, + { + "epoch": 0.0025320770335743674, + "grad_norm": 5.930482387542725, + "learning_rate": 3.3711507293354944e-06, + "loss": 1.2844, + "step": 52 + }, + { + "epoch": 0.002580770822681567, + "grad_norm": 0.6217969059944153, + "learning_rate": 3.435980551053485e-06, + "loss": 0.8075, + "step": 53 + }, + { + "epoch": 0.0026294646117887665, + "grad_norm": 4.198158264160156, + "learning_rate": 3.5008103727714754e-06, + "loss": 1.2364, + "step": 54 + }, + { + "epoch": 0.0026781584008959657, + "grad_norm": 3.3262763023376465, + "learning_rate": 3.5656401944894653e-06, + "loss": 1.2393, + "step": 55 + }, + { + "epoch": 0.002726852190003165, + "grad_norm": 0.6489346027374268, + "learning_rate": 3.6304700162074556e-06, + "loss": 0.8508, + "step": 56 + }, + { + "epoch": 0.0027755459791103643, + "grad_norm": 4.508919715881348, + "learning_rate": 3.6952998379254463e-06, + "loss": 1.2064, + "step": 57 + }, + { + "epoch": 0.002824239768217564, + "grad_norm": 5.464781761169434, + "learning_rate": 3.7601296596434362e-06, + "loss": 1.1282, + "step": 58 + }, + { + "epoch": 0.0028729335573247634, + "grad_norm": 3.1992220878601074, + "learning_rate": 3.824959481361427e-06, + "loss": 1.2085, + "step": 59 + }, + { + "epoch": 0.0029216273464319625, + "grad_norm": 4.650223731994629, + "learning_rate": 3.889789303079417e-06, + "loss": 1.1991, + "step": 60 + }, + { + "epoch": 0.002970321135539162, + "grad_norm": 4.478381156921387, + "learning_rate": 3.954619124797407e-06, + "loss": 1.1694, + "step": 61 + }, + { + "epoch": 0.003019014924646361, + "grad_norm": 3.5059022903442383, + "learning_rate": 4.019448946515397e-06, + "loss": 1.1233, + "step": 62 + }, + { + "epoch": 0.0030677087137535607, + "grad_norm": 2.770831346511841, + "learning_rate": 4.084278768233388e-06, + "loss": 1.285, + "step": 63 + }, + { + "epoch": 0.0031164025028607603, + "grad_norm": 3.2876157760620117, + "learning_rate": 4.149108589951378e-06, + "loss": 1.1901, + "step": 64 + }, + { + "epoch": 0.0031650962919679594, + "grad_norm": 4.4894585609436035, + "learning_rate": 4.213938411669368e-06, + "loss": 1.1291, + "step": 65 + }, + { + "epoch": 0.003213790081075159, + "grad_norm": 4.754753589630127, + "learning_rate": 4.278768233387359e-06, + "loss": 1.1862, + "step": 66 + }, + { + "epoch": 0.003262483870182358, + "grad_norm": 3.949821949005127, + "learning_rate": 4.3435980551053485e-06, + "loss": 1.1801, + "step": 67 + }, + { + "epoch": 0.0033111776592895576, + "grad_norm": 2.9675307273864746, + "learning_rate": 4.408427876823339e-06, + "loss": 1.0941, + "step": 68 + }, + { + "epoch": 0.003359871448396757, + "grad_norm": 4.766266345977783, + "learning_rate": 4.473257698541329e-06, + "loss": 1.1773, + "step": 69 + }, + { + "epoch": 0.0034085652375039563, + "grad_norm": 5.057507038116455, + "learning_rate": 4.538087520259319e-06, + "loss": 1.1201, + "step": 70 + }, + { + "epoch": 0.003457259026611156, + "grad_norm": 2.2419261932373047, + "learning_rate": 4.60291734197731e-06, + "loss": 1.2571, + "step": 71 + }, + { + "epoch": 0.003505952815718355, + "grad_norm": 6.629289627075195, + "learning_rate": 4.6677471636953e-06, + "loss": 1.2254, + "step": 72 + }, + { + "epoch": 0.0035546466048255545, + "grad_norm": 3.7402637004852295, + "learning_rate": 4.73257698541329e-06, + "loss": 1.1414, + "step": 73 + }, + { + "epoch": 0.003603340393932754, + "grad_norm": 2.5614731311798096, + "learning_rate": 4.797406807131281e-06, + "loss": 1.1708, + "step": 74 + }, + { + "epoch": 0.003652034183039953, + "grad_norm": 2.4759161472320557, + "learning_rate": 4.862236628849271e-06, + "loss": 1.1322, + "step": 75 + }, + { + "epoch": 0.0037007279721471527, + "grad_norm": 3.6032660007476807, + "learning_rate": 4.927066450567261e-06, + "loss": 1.2291, + "step": 76 + }, + { + "epoch": 0.003749421761254352, + "grad_norm": 6.088449954986572, + "learning_rate": 4.9918962722852515e-06, + "loss": 1.1193, + "step": 77 + }, + { + "epoch": 0.0037981155503615514, + "grad_norm": 2.640449047088623, + "learning_rate": 5.056726094003242e-06, + "loss": 1.2179, + "step": 78 + }, + { + "epoch": 0.003846809339468751, + "grad_norm": 2.6539082527160645, + "learning_rate": 5.121555915721232e-06, + "loss": 1.1666, + "step": 79 + }, + { + "epoch": 0.00389550312857595, + "grad_norm": 3.3687117099761963, + "learning_rate": 5.186385737439222e-06, + "loss": 1.2371, + "step": 80 + }, + { + "epoch": 0.00394419691768315, + "grad_norm": 3.5197246074676514, + "learning_rate": 5.251215559157212e-06, + "loss": 1.1909, + "step": 81 + }, + { + "epoch": 0.003992890706790349, + "grad_norm": 2.6676502227783203, + "learning_rate": 5.316045380875203e-06, + "loss": 1.1945, + "step": 82 + }, + { + "epoch": 0.004041584495897549, + "grad_norm": 2.264862298965454, + "learning_rate": 5.380875202593193e-06, + "loss": 1.1534, + "step": 83 + }, + { + "epoch": 0.004090278285004747, + "grad_norm": 4.73082971572876, + "learning_rate": 5.445705024311183e-06, + "loss": 1.1786, + "step": 84 + }, + { + "epoch": 0.004138972074111947, + "grad_norm": 2.13845157623291, + "learning_rate": 5.510534846029174e-06, + "loss": 1.1484, + "step": 85 + }, + { + "epoch": 0.0041876658632191465, + "grad_norm": 3.358384847640991, + "learning_rate": 5.575364667747164e-06, + "loss": 1.1818, + "step": 86 + }, + { + "epoch": 0.004236359652326346, + "grad_norm": 2.8769824504852295, + "learning_rate": 5.640194489465154e-06, + "loss": 1.1395, + "step": 87 + }, + { + "epoch": 0.004285053441433546, + "grad_norm": 3.4804019927978516, + "learning_rate": 5.705024311183145e-06, + "loss": 1.0937, + "step": 88 + }, + { + "epoch": 0.004333747230540744, + "grad_norm": 2.4652771949768066, + "learning_rate": 5.769854132901135e-06, + "loss": 1.2397, + "step": 89 + }, + { + "epoch": 0.004382441019647944, + "grad_norm": 2.615896224975586, + "learning_rate": 5.834683954619125e-06, + "loss": 1.1642, + "step": 90 + }, + { + "epoch": 0.004431134808755143, + "grad_norm": 2.9985616207122803, + "learning_rate": 5.899513776337116e-06, + "loss": 1.107, + "step": 91 + }, + { + "epoch": 0.004479828597862343, + "grad_norm": 3.004094123840332, + "learning_rate": 5.964343598055106e-06, + "loss": 1.0836, + "step": 92 + }, + { + "epoch": 0.0045285223869695425, + "grad_norm": 5.234726428985596, + "learning_rate": 6.029173419773096e-06, + "loss": 1.2091, + "step": 93 + }, + { + "epoch": 0.004577216176076741, + "grad_norm": 3.727107286453247, + "learning_rate": 6.094003241491087e-06, + "loss": 1.1198, + "step": 94 + }, + { + "epoch": 0.004625909965183941, + "grad_norm": 6.085718631744385, + "learning_rate": 6.158833063209076e-06, + "loss": 1.0883, + "step": 95 + }, + { + "epoch": 0.00467460375429114, + "grad_norm": 2.4184916019439697, + "learning_rate": 6.223662884927067e-06, + "loss": 1.1968, + "step": 96 + }, + { + "epoch": 0.00472329754339834, + "grad_norm": 3.070183038711548, + "learning_rate": 6.2884927066450575e-06, + "loss": 1.0861, + "step": 97 + }, + { + "epoch": 0.004771991332505539, + "grad_norm": 2.2245736122131348, + "learning_rate": 6.353322528363047e-06, + "loss": 1.1956, + "step": 98 + }, + { + "epoch": 0.004820685121612738, + "grad_norm": 2.8125691413879395, + "learning_rate": 6.418152350081038e-06, + "loss": 1.0803, + "step": 99 + }, + { + "epoch": 0.0048693789107199376, + "grad_norm": 6.708660125732422, + "learning_rate": 6.4829821717990284e-06, + "loss": 1.1889, + "step": 100 + }, + { + "epoch": 0.004918072699827137, + "grad_norm": 2.5404975414276123, + "learning_rate": 6.547811993517018e-06, + "loss": 1.1614, + "step": 101 + }, + { + "epoch": 0.004966766488934337, + "grad_norm": 2.9992356300354004, + "learning_rate": 6.612641815235009e-06, + "loss": 1.1605, + "step": 102 + }, + { + "epoch": 0.005015460278041536, + "grad_norm": 5.147703170776367, + "learning_rate": 6.677471636952999e-06, + "loss": 1.1071, + "step": 103 + }, + { + "epoch": 0.005064154067148735, + "grad_norm": 2.2262613773345947, + "learning_rate": 6.742301458670989e-06, + "loss": 1.0895, + "step": 104 + }, + { + "epoch": 0.0051128478562559344, + "grad_norm": 2.9136710166931152, + "learning_rate": 6.80713128038898e-06, + "loss": 1.1127, + "step": 105 + }, + { + "epoch": 0.005161541645363134, + "grad_norm": 2.7122507095336914, + "learning_rate": 6.87196110210697e-06, + "loss": 1.1272, + "step": 106 + }, + { + "epoch": 0.0052102354344703335, + "grad_norm": 2.2721667289733887, + "learning_rate": 6.93679092382496e-06, + "loss": 1.1904, + "step": 107 + }, + { + "epoch": 0.005258929223577533, + "grad_norm": 2.019613265991211, + "learning_rate": 7.001620745542951e-06, + "loss": 1.139, + "step": 108 + }, + { + "epoch": 0.005307623012684732, + "grad_norm": 2.6866297721862793, + "learning_rate": 7.06645056726094e-06, + "loss": 1.1329, + "step": 109 + }, + { + "epoch": 0.005356316801791931, + "grad_norm": 2.7020046710968018, + "learning_rate": 7.131280388978931e-06, + "loss": 1.173, + "step": 110 + }, + { + "epoch": 0.005405010590899131, + "grad_norm": 2.4910926818847656, + "learning_rate": 7.196110210696922e-06, + "loss": 1.2321, + "step": 111 + }, + { + "epoch": 0.00545370438000633, + "grad_norm": 7.341420650482178, + "learning_rate": 7.260940032414911e-06, + "loss": 1.1336, + "step": 112 + }, + { + "epoch": 0.00550239816911353, + "grad_norm": 6.999799728393555, + "learning_rate": 7.3257698541329015e-06, + "loss": 1.1741, + "step": 113 + }, + { + "epoch": 0.005551091958220729, + "grad_norm": 2.117766857147217, + "learning_rate": 7.390599675850893e-06, + "loss": 1.199, + "step": 114 + }, + { + "epoch": 0.005599785747327928, + "grad_norm": 2.689647674560547, + "learning_rate": 7.455429497568882e-06, + "loss": 1.151, + "step": 115 + }, + { + "epoch": 0.005648479536435128, + "grad_norm": 3.8767874240875244, + "learning_rate": 7.5202593192868724e-06, + "loss": 1.0924, + "step": 116 + }, + { + "epoch": 0.005697173325542327, + "grad_norm": 2.6393229961395264, + "learning_rate": 7.5850891410048636e-06, + "loss": 1.0594, + "step": 117 + }, + { + "epoch": 0.005745867114649527, + "grad_norm": 2.808015823364258, + "learning_rate": 7.649918962722854e-06, + "loss": 1.2646, + "step": 118 + }, + { + "epoch": 0.0057945609037567255, + "grad_norm": 3.7151336669921875, + "learning_rate": 7.714748784440843e-06, + "loss": 1.2085, + "step": 119 + }, + { + "epoch": 0.005843254692863925, + "grad_norm": 3.604806661605835, + "learning_rate": 7.779578606158834e-06, + "loss": 1.1355, + "step": 120 + }, + { + "epoch": 0.005891948481971125, + "grad_norm": 0.5509645938873291, + "learning_rate": 7.844408427876824e-06, + "loss": 0.7708, + "step": 121 + }, + { + "epoch": 0.005940642271078324, + "grad_norm": 3.1226603984832764, + "learning_rate": 7.909238249594813e-06, + "loss": 1.1888, + "step": 122 + }, + { + "epoch": 0.005989336060185524, + "grad_norm": 3.1639978885650635, + "learning_rate": 7.974068071312805e-06, + "loss": 1.071, + "step": 123 + }, + { + "epoch": 0.006038029849292722, + "grad_norm": 2.228502035140991, + "learning_rate": 8.038897893030794e-06, + "loss": 1.0879, + "step": 124 + }, + { + "epoch": 0.006086723638399922, + "grad_norm": 4.803535461425781, + "learning_rate": 8.103727714748785e-06, + "loss": 1.1379, + "step": 125 + }, + { + "epoch": 0.0061354174275071215, + "grad_norm": 2.4562809467315674, + "learning_rate": 8.168557536466776e-06, + "loss": 1.1988, + "step": 126 + }, + { + "epoch": 0.006184111216614321, + "grad_norm": 3.4044525623321533, + "learning_rate": 8.233387358184766e-06, + "loss": 1.1629, + "step": 127 + }, + { + "epoch": 0.006232805005721521, + "grad_norm": 3.727553129196167, + "learning_rate": 8.298217179902755e-06, + "loss": 1.1324, + "step": 128 + }, + { + "epoch": 0.006281498794828719, + "grad_norm": 2.520841121673584, + "learning_rate": 8.363047001620746e-06, + "loss": 1.232, + "step": 129 + }, + { + "epoch": 0.006330192583935919, + "grad_norm": 2.4541027545928955, + "learning_rate": 8.427876823338736e-06, + "loss": 1.0949, + "step": 130 + }, + { + "epoch": 0.006378886373043118, + "grad_norm": 7.470241069793701, + "learning_rate": 8.492706645056727e-06, + "loss": 1.1685, + "step": 131 + }, + { + "epoch": 0.006427580162150318, + "grad_norm": 2.4426560401916504, + "learning_rate": 8.557536466774718e-06, + "loss": 1.2203, + "step": 132 + }, + { + "epoch": 0.0064762739512575175, + "grad_norm": 3.658627986907959, + "learning_rate": 8.622366288492708e-06, + "loss": 1.1788, + "step": 133 + }, + { + "epoch": 0.006524967740364716, + "grad_norm": 2.760282516479492, + "learning_rate": 8.687196110210697e-06, + "loss": 1.191, + "step": 134 + }, + { + "epoch": 0.006573661529471916, + "grad_norm": 2.261939764022827, + "learning_rate": 8.752025931928688e-06, + "loss": 1.2233, + "step": 135 + }, + { + "epoch": 0.006622355318579115, + "grad_norm": 2.2327353954315186, + "learning_rate": 8.816855753646678e-06, + "loss": 1.079, + "step": 136 + }, + { + "epoch": 0.006671049107686315, + "grad_norm": 2.6800625324249268, + "learning_rate": 8.881685575364669e-06, + "loss": 1.1346, + "step": 137 + }, + { + "epoch": 0.006719742896793514, + "grad_norm": 2.9521918296813965, + "learning_rate": 8.946515397082658e-06, + "loss": 1.1614, + "step": 138 + }, + { + "epoch": 0.006768436685900713, + "grad_norm": 5.614945888519287, + "learning_rate": 9.01134521880065e-06, + "loss": 0.9742, + "step": 139 + }, + { + "epoch": 0.006817130475007913, + "grad_norm": 2.114039182662964, + "learning_rate": 9.076175040518639e-06, + "loss": 1.1891, + "step": 140 + }, + { + "epoch": 0.006865824264115112, + "grad_norm": 2.024221897125244, + "learning_rate": 9.14100486223663e-06, + "loss": 1.0544, + "step": 141 + }, + { + "epoch": 0.006914518053222312, + "grad_norm": 2.7922492027282715, + "learning_rate": 9.20583468395462e-06, + "loss": 1.1083, + "step": 142 + }, + { + "epoch": 0.006963211842329511, + "grad_norm": 2.3637173175811768, + "learning_rate": 9.270664505672609e-06, + "loss": 1.1497, + "step": 143 + }, + { + "epoch": 0.00701190563143671, + "grad_norm": 2.367253065109253, + "learning_rate": 9.3354943273906e-06, + "loss": 1.1416, + "step": 144 + }, + { + "epoch": 0.0070605994205439095, + "grad_norm": 2.8511059284210205, + "learning_rate": 9.400324149108591e-06, + "loss": 1.1308, + "step": 145 + }, + { + "epoch": 0.007109293209651109, + "grad_norm": 2.6169631481170654, + "learning_rate": 9.46515397082658e-06, + "loss": 1.0756, + "step": 146 + }, + { + "epoch": 0.0071579869987583086, + "grad_norm": 2.426828384399414, + "learning_rate": 9.529983792544572e-06, + "loss": 1.2314, + "step": 147 + }, + { + "epoch": 0.007206680787865508, + "grad_norm": 1.8587294816970825, + "learning_rate": 9.594813614262561e-06, + "loss": 1.0331, + "step": 148 + }, + { + "epoch": 0.007255374576972707, + "grad_norm": 2.458409547805786, + "learning_rate": 9.65964343598055e-06, + "loss": 1.0934, + "step": 149 + }, + { + "epoch": 0.007304068366079906, + "grad_norm": 3.034083127975464, + "learning_rate": 9.724473257698542e-06, + "loss": 1.0759, + "step": 150 + }, + { + "epoch": 0.007352762155187106, + "grad_norm": 2.8500349521636963, + "learning_rate": 9.789303079416533e-06, + "loss": 0.9936, + "step": 151 + }, + { + "epoch": 0.0074014559442943054, + "grad_norm": 5.1554975509643555, + "learning_rate": 9.854132901134522e-06, + "loss": 1.169, + "step": 152 + }, + { + "epoch": 0.007450149733401505, + "grad_norm": 2.112546443939209, + "learning_rate": 9.918962722852514e-06, + "loss": 1.0796, + "step": 153 + }, + { + "epoch": 0.007498843522508704, + "grad_norm": 3.0289175510406494, + "learning_rate": 9.983792544570503e-06, + "loss": 1.0737, + "step": 154 + }, + { + "epoch": 0.007547537311615903, + "grad_norm": 2.4024620056152344, + "learning_rate": 1.0048622366288494e-05, + "loss": 1.0991, + "step": 155 + }, + { + "epoch": 0.007596231100723103, + "grad_norm": 3.1477301120758057, + "learning_rate": 1.0113452188006484e-05, + "loss": 1.1701, + "step": 156 + }, + { + "epoch": 0.007644924889830302, + "grad_norm": 2.4802632331848145, + "learning_rate": 1.0178282009724473e-05, + "loss": 1.0132, + "step": 157 + }, + { + "epoch": 0.007693618678937502, + "grad_norm": 3.534938335418701, + "learning_rate": 1.0243111831442464e-05, + "loss": 1.1791, + "step": 158 + }, + { + "epoch": 0.0077423124680447005, + "grad_norm": 2.7250072956085205, + "learning_rate": 1.0307941653160454e-05, + "loss": 1.1171, + "step": 159 + }, + { + "epoch": 0.0077910062571519, + "grad_norm": 2.7443771362304688, + "learning_rate": 1.0372771474878445e-05, + "loss": 0.9985, + "step": 160 + }, + { + "epoch": 0.0078397000462591, + "grad_norm": 2.3715553283691406, + "learning_rate": 1.0437601296596436e-05, + "loss": 1.1705, + "step": 161 + }, + { + "epoch": 0.0078883938353663, + "grad_norm": 1.9851881265640259, + "learning_rate": 1.0502431118314424e-05, + "loss": 1.1517, + "step": 162 + }, + { + "epoch": 0.007937087624473498, + "grad_norm": 3.1289663314819336, + "learning_rate": 1.0567260940032415e-05, + "loss": 1.1062, + "step": 163 + }, + { + "epoch": 0.007985781413580698, + "grad_norm": 2.5561187267303467, + "learning_rate": 1.0632090761750406e-05, + "loss": 1.0665, + "step": 164 + }, + { + "epoch": 0.008034475202687897, + "grad_norm": 2.488168478012085, + "learning_rate": 1.0696920583468395e-05, + "loss": 1.1329, + "step": 165 + }, + { + "epoch": 0.008083168991795097, + "grad_norm": 2.0521299839019775, + "learning_rate": 1.0761750405186387e-05, + "loss": 1.1426, + "step": 166 + }, + { + "epoch": 0.008131862780902296, + "grad_norm": 6.507519245147705, + "learning_rate": 1.0826580226904378e-05, + "loss": 1.0229, + "step": 167 + }, + { + "epoch": 0.008180556570009495, + "grad_norm": 2.412785768508911, + "learning_rate": 1.0891410048622366e-05, + "loss": 1.1399, + "step": 168 + }, + { + "epoch": 0.008229250359116695, + "grad_norm": 2.9695041179656982, + "learning_rate": 1.0956239870340357e-05, + "loss": 1.0327, + "step": 169 + }, + { + "epoch": 0.008277944148223894, + "grad_norm": 2.4780497550964355, + "learning_rate": 1.1021069692058348e-05, + "loss": 1.0295, + "step": 170 + }, + { + "epoch": 0.008326637937331094, + "grad_norm": 0.4250452220439911, + "learning_rate": 1.1085899513776337e-05, + "loss": 0.7082, + "step": 171 + }, + { + "epoch": 0.008375331726438293, + "grad_norm": 3.982174873352051, + "learning_rate": 1.1150729335494328e-05, + "loss": 1.1901, + "step": 172 + }, + { + "epoch": 0.008424025515545492, + "grad_norm": 3.2423527240753174, + "learning_rate": 1.121555915721232e-05, + "loss": 1.1358, + "step": 173 + }, + { + "epoch": 0.008472719304652692, + "grad_norm": 2.557166576385498, + "learning_rate": 1.1280388978930307e-05, + "loss": 1.0747, + "step": 174 + }, + { + "epoch": 0.00852141309375989, + "grad_norm": 2.5642740726470947, + "learning_rate": 1.1345218800648299e-05, + "loss": 1.0534, + "step": 175 + }, + { + "epoch": 0.008570106882867091, + "grad_norm": 2.169997453689575, + "learning_rate": 1.141004862236629e-05, + "loss": 1.0662, + "step": 176 + }, + { + "epoch": 0.00861880067197429, + "grad_norm": 2.721982002258301, + "learning_rate": 1.1474878444084279e-05, + "loss": 1.1648, + "step": 177 + }, + { + "epoch": 0.008667494461081489, + "grad_norm": 2.1389870643615723, + "learning_rate": 1.153970826580227e-05, + "loss": 1.1079, + "step": 178 + }, + { + "epoch": 0.008716188250188689, + "grad_norm": 3.254185438156128, + "learning_rate": 1.1604538087520261e-05, + "loss": 1.0735, + "step": 179 + }, + { + "epoch": 0.008764882039295888, + "grad_norm": 0.3111902177333832, + "learning_rate": 1.166936790923825e-05, + "loss": 0.6636, + "step": 180 + }, + { + "epoch": 0.008813575828403088, + "grad_norm": 3.0268805027008057, + "learning_rate": 1.173419773095624e-05, + "loss": 1.2027, + "step": 181 + }, + { + "epoch": 0.008862269617510287, + "grad_norm": 2.5411858558654785, + "learning_rate": 1.1799027552674231e-05, + "loss": 1.0497, + "step": 182 + }, + { + "epoch": 0.008910963406617485, + "grad_norm": 2.8545658588409424, + "learning_rate": 1.1863857374392221e-05, + "loss": 1.0791, + "step": 183 + }, + { + "epoch": 0.008959657195724686, + "grad_norm": 2.446444272994995, + "learning_rate": 1.1928687196110212e-05, + "loss": 1.0183, + "step": 184 + }, + { + "epoch": 0.009008350984831884, + "grad_norm": 1.7995147705078125, + "learning_rate": 1.1993517017828202e-05, + "loss": 1.0978, + "step": 185 + }, + { + "epoch": 0.009057044773939085, + "grad_norm": 3.2176294326782227, + "learning_rate": 1.2058346839546193e-05, + "loss": 1.104, + "step": 186 + }, + { + "epoch": 0.009105738563046284, + "grad_norm": 3.111555576324463, + "learning_rate": 1.2123176661264182e-05, + "loss": 1.1059, + "step": 187 + }, + { + "epoch": 0.009154432352153482, + "grad_norm": 0.2686788737773895, + "learning_rate": 1.2188006482982173e-05, + "loss": 0.684, + "step": 188 + }, + { + "epoch": 0.009203126141260683, + "grad_norm": 2.025911331176758, + "learning_rate": 1.2252836304700164e-05, + "loss": 1.038, + "step": 189 + }, + { + "epoch": 0.009251819930367881, + "grad_norm": 3.6916260719299316, + "learning_rate": 1.2317666126418152e-05, + "loss": 1.0621, + "step": 190 + }, + { + "epoch": 0.009300513719475082, + "grad_norm": 2.2412023544311523, + "learning_rate": 1.2382495948136143e-05, + "loss": 1.1515, + "step": 191 + }, + { + "epoch": 0.00934920750858228, + "grad_norm": 2.6090991497039795, + "learning_rate": 1.2447325769854134e-05, + "loss": 1.1406, + "step": 192 + }, + { + "epoch": 0.00939790129768948, + "grad_norm": 3.5606086254119873, + "learning_rate": 1.2512155591572124e-05, + "loss": 1.185, + "step": 193 + }, + { + "epoch": 0.00944659508679668, + "grad_norm": 2.2085511684417725, + "learning_rate": 1.2576985413290115e-05, + "loss": 1.0781, + "step": 194 + }, + { + "epoch": 0.009495288875903878, + "grad_norm": 2.469919443130493, + "learning_rate": 1.2641815235008106e-05, + "loss": 1.0245, + "step": 195 + }, + { + "epoch": 0.009543982665011079, + "grad_norm": 2.218496799468994, + "learning_rate": 1.2706645056726094e-05, + "loss": 1.1353, + "step": 196 + }, + { + "epoch": 0.009592676454118277, + "grad_norm": 3.613008975982666, + "learning_rate": 1.2771474878444085e-05, + "loss": 1.1573, + "step": 197 + }, + { + "epoch": 0.009641370243225476, + "grad_norm": 3.371553659439087, + "learning_rate": 1.2836304700162076e-05, + "loss": 1.0183, + "step": 198 + }, + { + "epoch": 0.009690064032332676, + "grad_norm": 2.6804678440093994, + "learning_rate": 1.2901134521880066e-05, + "loss": 1.0317, + "step": 199 + }, + { + "epoch": 0.009738757821439875, + "grad_norm": 2.9396567344665527, + "learning_rate": 1.2965964343598057e-05, + "loss": 0.9882, + "step": 200 + }, + { + "epoch": 0.009787451610547076, + "grad_norm": 2.9930872917175293, + "learning_rate": 1.3030794165316048e-05, + "loss": 1.0365, + "step": 201 + }, + { + "epoch": 0.009836145399654274, + "grad_norm": 2.534083366394043, + "learning_rate": 1.3095623987034036e-05, + "loss": 1.1633, + "step": 202 + }, + { + "epoch": 0.009884839188761473, + "grad_norm": 3.1834232807159424, + "learning_rate": 1.3160453808752027e-05, + "loss": 1.0576, + "step": 203 + }, + { + "epoch": 0.009933532977868673, + "grad_norm": 0.20488938689231873, + "learning_rate": 1.3225283630470018e-05, + "loss": 0.6515, + "step": 204 + }, + { + "epoch": 0.009982226766975872, + "grad_norm": 2.9953932762145996, + "learning_rate": 1.3290113452188008e-05, + "loss": 1.1632, + "step": 205 + }, + { + "epoch": 0.010030920556083072, + "grad_norm": 1.8575705289840698, + "learning_rate": 1.3354943273905999e-05, + "loss": 1.1129, + "step": 206 + }, + { + "epoch": 0.010079614345190271, + "grad_norm": 3.806241035461426, + "learning_rate": 1.341977309562399e-05, + "loss": 0.9946, + "step": 207 + }, + { + "epoch": 0.01012830813429747, + "grad_norm": 5.708776950836182, + "learning_rate": 1.3484602917341978e-05, + "loss": 1.0471, + "step": 208 + }, + { + "epoch": 0.01017700192340467, + "grad_norm": 3.151057243347168, + "learning_rate": 1.3549432739059969e-05, + "loss": 1.1817, + "step": 209 + }, + { + "epoch": 0.010225695712511869, + "grad_norm": 3.7722485065460205, + "learning_rate": 1.361426256077796e-05, + "loss": 1.0592, + "step": 210 + }, + { + "epoch": 0.01027438950161907, + "grad_norm": 2.5222666263580322, + "learning_rate": 1.367909238249595e-05, + "loss": 1.0635, + "step": 211 + }, + { + "epoch": 0.010323083290726268, + "grad_norm": 0.16624125838279724, + "learning_rate": 1.374392220421394e-05, + "loss": 0.7407, + "step": 212 + }, + { + "epoch": 0.010371777079833467, + "grad_norm": 4.5698628425598145, + "learning_rate": 1.380875202593193e-05, + "loss": 1.0675, + "step": 213 + }, + { + "epoch": 0.010420470868940667, + "grad_norm": 2.713099718093872, + "learning_rate": 1.387358184764992e-05, + "loss": 1.123, + "step": 214 + }, + { + "epoch": 0.010469164658047866, + "grad_norm": 2.9813456535339355, + "learning_rate": 1.393841166936791e-05, + "loss": 1.0738, + "step": 215 + }, + { + "epoch": 0.010517858447155066, + "grad_norm": 2.3989267349243164, + "learning_rate": 1.4003241491085902e-05, + "loss": 0.9982, + "step": 216 + }, + { + "epoch": 0.010566552236262265, + "grad_norm": 2.2782974243164062, + "learning_rate": 1.4068071312803891e-05, + "loss": 1.0408, + "step": 217 + }, + { + "epoch": 0.010615246025369464, + "grad_norm": 2.307957172393799, + "learning_rate": 1.413290113452188e-05, + "loss": 1.0706, + "step": 218 + }, + { + "epoch": 0.010663939814476664, + "grad_norm": 2.1506807804107666, + "learning_rate": 1.4197730956239872e-05, + "loss": 1.0412, + "step": 219 + }, + { + "epoch": 0.010712633603583863, + "grad_norm": 4.56396484375, + "learning_rate": 1.4262560777957861e-05, + "loss": 1.0004, + "step": 220 + }, + { + "epoch": 0.010761327392691063, + "grad_norm": 3.3647029399871826, + "learning_rate": 1.4327390599675852e-05, + "loss": 1.0585, + "step": 221 + }, + { + "epoch": 0.010810021181798262, + "grad_norm": 3.7195286750793457, + "learning_rate": 1.4392220421393844e-05, + "loss": 1.0944, + "step": 222 + }, + { + "epoch": 0.01085871497090546, + "grad_norm": 2.0134363174438477, + "learning_rate": 1.4457050243111831e-05, + "loss": 1.1346, + "step": 223 + }, + { + "epoch": 0.01090740876001266, + "grad_norm": 2.450629949569702, + "learning_rate": 1.4521880064829822e-05, + "loss": 1.1617, + "step": 224 + }, + { + "epoch": 0.01095610254911986, + "grad_norm": 4.74463415145874, + "learning_rate": 1.4586709886547814e-05, + "loss": 1.0322, + "step": 225 + }, + { + "epoch": 0.01100479633822706, + "grad_norm": 3.707082509994507, + "learning_rate": 1.4651539708265803e-05, + "loss": 1.1262, + "step": 226 + }, + { + "epoch": 0.011053490127334259, + "grad_norm": 2.8056089878082275, + "learning_rate": 1.4716369529983794e-05, + "loss": 1.028, + "step": 227 + }, + { + "epoch": 0.011102183916441457, + "grad_norm": 2.8268918991088867, + "learning_rate": 1.4781199351701785e-05, + "loss": 1.0812, + "step": 228 + }, + { + "epoch": 0.011150877705548658, + "grad_norm": 2.378094434738159, + "learning_rate": 1.4846029173419773e-05, + "loss": 1.1124, + "step": 229 + }, + { + "epoch": 0.011199571494655856, + "grad_norm": 1.9641963243484497, + "learning_rate": 1.4910858995137764e-05, + "loss": 1.0663, + "step": 230 + }, + { + "epoch": 0.011248265283763057, + "grad_norm": 2.565169095993042, + "learning_rate": 1.4975688816855755e-05, + "loss": 0.9493, + "step": 231 + }, + { + "epoch": 0.011296959072870255, + "grad_norm": 3.1285223960876465, + "learning_rate": 1.5040518638573745e-05, + "loss": 1.1087, + "step": 232 + }, + { + "epoch": 0.011345652861977454, + "grad_norm": 2.4043843746185303, + "learning_rate": 1.5105348460291736e-05, + "loss": 1.2115, + "step": 233 + }, + { + "epoch": 0.011394346651084655, + "grad_norm": 5.855169773101807, + "learning_rate": 1.5170178282009727e-05, + "loss": 0.9697, + "step": 234 + }, + { + "epoch": 0.011443040440191853, + "grad_norm": 2.358520984649658, + "learning_rate": 1.5235008103727715e-05, + "loss": 1.0528, + "step": 235 + }, + { + "epoch": 0.011491734229299054, + "grad_norm": 6.016727447509766, + "learning_rate": 1.5299837925445708e-05, + "loss": 1.0259, + "step": 236 + }, + { + "epoch": 0.011540428018406252, + "grad_norm": 2.409496784210205, + "learning_rate": 1.53646677471637e-05, + "loss": 1.0274, + "step": 237 + }, + { + "epoch": 0.011589121807513451, + "grad_norm": 2.4742085933685303, + "learning_rate": 1.5429497568881687e-05, + "loss": 0.9636, + "step": 238 + }, + { + "epoch": 0.011637815596620651, + "grad_norm": 3.759424924850464, + "learning_rate": 1.5494327390599678e-05, + "loss": 1.0392, + "step": 239 + }, + { + "epoch": 0.01168650938572785, + "grad_norm": 2.851156711578369, + "learning_rate": 1.555915721231767e-05, + "loss": 1.1106, + "step": 240 + }, + { + "epoch": 0.01173520317483505, + "grad_norm": 2.2877724170684814, + "learning_rate": 1.5623987034035657e-05, + "loss": 0.9906, + "step": 241 + }, + { + "epoch": 0.01178389696394225, + "grad_norm": 3.827793598175049, + "learning_rate": 1.5688816855753648e-05, + "loss": 1.0964, + "step": 242 + }, + { + "epoch": 0.011832590753049448, + "grad_norm": 2.575880765914917, + "learning_rate": 1.575364667747164e-05, + "loss": 0.9803, + "step": 243 + }, + { + "epoch": 0.011881284542156648, + "grad_norm": 2.4090559482574463, + "learning_rate": 1.5818476499189627e-05, + "loss": 1.028, + "step": 244 + }, + { + "epoch": 0.011929978331263847, + "grad_norm": 2.5369837284088135, + "learning_rate": 1.5883306320907618e-05, + "loss": 1.0663, + "step": 245 + }, + { + "epoch": 0.011978672120371047, + "grad_norm": 2.016309976577759, + "learning_rate": 1.594813614262561e-05, + "loss": 1.0759, + "step": 246 + }, + { + "epoch": 0.012027365909478246, + "grad_norm": 2.271496295928955, + "learning_rate": 1.60129659643436e-05, + "loss": 1.0171, + "step": 247 + }, + { + "epoch": 0.012076059698585445, + "grad_norm": 2.535283327102661, + "learning_rate": 1.6077795786061588e-05, + "loss": 1.1746, + "step": 248 + }, + { + "epoch": 0.012124753487692645, + "grad_norm": 2.241523265838623, + "learning_rate": 1.614262560777958e-05, + "loss": 1.0977, + "step": 249 + }, + { + "epoch": 0.012173447276799844, + "grad_norm": 3.008208751678467, + "learning_rate": 1.620745542949757e-05, + "loss": 1.0253, + "step": 250 + }, + { + "epoch": 0.012222141065907044, + "grad_norm": 2.952908754348755, + "learning_rate": 1.627228525121556e-05, + "loss": 0.989, + "step": 251 + }, + { + "epoch": 0.012270834855014243, + "grad_norm": 2.2950775623321533, + "learning_rate": 1.6337115072933553e-05, + "loss": 0.9699, + "step": 252 + }, + { + "epoch": 0.012319528644121442, + "grad_norm": 2.772942066192627, + "learning_rate": 1.640194489465154e-05, + "loss": 0.943, + "step": 253 + }, + { + "epoch": 0.012368222433228642, + "grad_norm": 1.9105616807937622, + "learning_rate": 1.646677471636953e-05, + "loss": 1.0752, + "step": 254 + }, + { + "epoch": 0.01241691622233584, + "grad_norm": 1.7819758653640747, + "learning_rate": 1.6531604538087523e-05, + "loss": 1.0516, + "step": 255 + }, + { + "epoch": 0.012465610011443041, + "grad_norm": 2.2907509803771973, + "learning_rate": 1.659643435980551e-05, + "loss": 1.0311, + "step": 256 + }, + { + "epoch": 0.01251430380055024, + "grad_norm": 4.741451263427734, + "learning_rate": 1.66612641815235e-05, + "loss": 1.0737, + "step": 257 + }, + { + "epoch": 0.012562997589657439, + "grad_norm": 2.3769335746765137, + "learning_rate": 1.6726094003241493e-05, + "loss": 1.1514, + "step": 258 + }, + { + "epoch": 0.012611691378764639, + "grad_norm": 2.785268783569336, + "learning_rate": 1.679092382495948e-05, + "loss": 1.0131, + "step": 259 + }, + { + "epoch": 0.012660385167871838, + "grad_norm": 1.979073166847229, + "learning_rate": 1.685575364667747e-05, + "loss": 1.0521, + "step": 260 + }, + { + "epoch": 0.012709078956979038, + "grad_norm": 2.2048535346984863, + "learning_rate": 1.6920583468395463e-05, + "loss": 1.0798, + "step": 261 + }, + { + "epoch": 0.012757772746086237, + "grad_norm": 2.9464733600616455, + "learning_rate": 1.6985413290113454e-05, + "loss": 1.0396, + "step": 262 + }, + { + "epoch": 0.012806466535193435, + "grad_norm": 2.2687127590179443, + "learning_rate": 1.7050243111831445e-05, + "loss": 1.0889, + "step": 263 + }, + { + "epoch": 0.012855160324300636, + "grad_norm": 2.555537223815918, + "learning_rate": 1.7115072933549436e-05, + "loss": 1.0391, + "step": 264 + }, + { + "epoch": 0.012903854113407835, + "grad_norm": 2.208693265914917, + "learning_rate": 1.7179902755267424e-05, + "loss": 1.0113, + "step": 265 + }, + { + "epoch": 0.012952547902515035, + "grad_norm": 2.2434377670288086, + "learning_rate": 1.7244732576985415e-05, + "loss": 1.014, + "step": 266 + }, + { + "epoch": 0.013001241691622234, + "grad_norm": 1.8683985471725464, + "learning_rate": 1.7309562398703406e-05, + "loss": 1.1864, + "step": 267 + }, + { + "epoch": 0.013049935480729432, + "grad_norm": 2.992035388946533, + "learning_rate": 1.7374392220421394e-05, + "loss": 0.9591, + "step": 268 + }, + { + "epoch": 0.013098629269836633, + "grad_norm": 2.2664005756378174, + "learning_rate": 1.7439222042139385e-05, + "loss": 1.0657, + "step": 269 + }, + { + "epoch": 0.013147323058943831, + "grad_norm": 2.917034864425659, + "learning_rate": 1.7504051863857376e-05, + "loss": 1.1115, + "step": 270 + }, + { + "epoch": 0.013196016848051032, + "grad_norm": 3.6751809120178223, + "learning_rate": 1.7568881685575364e-05, + "loss": 1.1002, + "step": 271 + }, + { + "epoch": 0.01324471063715823, + "grad_norm": 4.402239799499512, + "learning_rate": 1.7633711507293355e-05, + "loss": 0.9688, + "step": 272 + }, + { + "epoch": 0.01329340442626543, + "grad_norm": 2.3622117042541504, + "learning_rate": 1.7698541329011346e-05, + "loss": 1.0885, + "step": 273 + }, + { + "epoch": 0.01334209821537263, + "grad_norm": 3.9420201778411865, + "learning_rate": 1.7763371150729338e-05, + "loss": 1.1, + "step": 274 + }, + { + "epoch": 0.013390792004479828, + "grad_norm": 2.860931396484375, + "learning_rate": 1.782820097244733e-05, + "loss": 1.0893, + "step": 275 + }, + { + "epoch": 0.013439485793587029, + "grad_norm": 3.028212547302246, + "learning_rate": 1.7893030794165316e-05, + "loss": 0.9109, + "step": 276 + }, + { + "epoch": 0.013488179582694227, + "grad_norm": 2.3869855403900146, + "learning_rate": 1.7957860615883308e-05, + "loss": 1.0039, + "step": 277 + }, + { + "epoch": 0.013536873371801426, + "grad_norm": 1.9145023822784424, + "learning_rate": 1.80226904376013e-05, + "loss": 1.0849, + "step": 278 + }, + { + "epoch": 0.013585567160908626, + "grad_norm": 2.466167688369751, + "learning_rate": 1.808752025931929e-05, + "loss": 1.0949, + "step": 279 + }, + { + "epoch": 0.013634260950015825, + "grad_norm": 1.817857027053833, + "learning_rate": 1.8152350081037278e-05, + "loss": 1.0013, + "step": 280 + }, + { + "epoch": 0.013682954739123026, + "grad_norm": 2.5072271823883057, + "learning_rate": 1.821717990275527e-05, + "loss": 1.0618, + "step": 281 + }, + { + "epoch": 0.013731648528230224, + "grad_norm": 4.120292663574219, + "learning_rate": 1.828200972447326e-05, + "loss": 1.0199, + "step": 282 + }, + { + "epoch": 0.013780342317337423, + "grad_norm": 2.699310541152954, + "learning_rate": 1.8346839546191248e-05, + "loss": 1.0498, + "step": 283 + }, + { + "epoch": 0.013829036106444623, + "grad_norm": 1.9382245540618896, + "learning_rate": 1.841166936790924e-05, + "loss": 1.0127, + "step": 284 + }, + { + "epoch": 0.013877729895551822, + "grad_norm": 1.8312773704528809, + "learning_rate": 1.847649918962723e-05, + "loss": 1.1863, + "step": 285 + }, + { + "epoch": 0.013926423684659022, + "grad_norm": 3.203509569168091, + "learning_rate": 1.8541329011345218e-05, + "loss": 0.9787, + "step": 286 + }, + { + "epoch": 0.013975117473766221, + "grad_norm": 2.851487874984741, + "learning_rate": 1.860615883306321e-05, + "loss": 1.0552, + "step": 287 + }, + { + "epoch": 0.01402381126287342, + "grad_norm": 2.416335344314575, + "learning_rate": 1.86709886547812e-05, + "loss": 0.9536, + "step": 288 + }, + { + "epoch": 0.01407250505198062, + "grad_norm": 3.2487123012542725, + "learning_rate": 1.873581847649919e-05, + "loss": 0.9719, + "step": 289 + }, + { + "epoch": 0.014121198841087819, + "grad_norm": 2.227816343307495, + "learning_rate": 1.8800648298217182e-05, + "loss": 1.0793, + "step": 290 + }, + { + "epoch": 0.01416989263019502, + "grad_norm": 1.9918168783187866, + "learning_rate": 1.8865478119935173e-05, + "loss": 1.0274, + "step": 291 + }, + { + "epoch": 0.014218586419302218, + "grad_norm": 2.6595070362091064, + "learning_rate": 1.893030794165316e-05, + "loss": 1.1017, + "step": 292 + }, + { + "epoch": 0.014267280208409417, + "grad_norm": 3.2851130962371826, + "learning_rate": 1.8995137763371152e-05, + "loss": 1.0624, + "step": 293 + }, + { + "epoch": 0.014315973997516617, + "grad_norm": 1.874655842781067, + "learning_rate": 1.9059967585089144e-05, + "loss": 1.1108, + "step": 294 + }, + { + "epoch": 0.014364667786623816, + "grad_norm": 2.060924530029297, + "learning_rate": 1.912479740680713e-05, + "loss": 1.0172, + "step": 295 + }, + { + "epoch": 0.014413361575731016, + "grad_norm": 3.367705821990967, + "learning_rate": 1.9189627228525122e-05, + "loss": 1.0455, + "step": 296 + }, + { + "epoch": 0.014462055364838215, + "grad_norm": 1.8530776500701904, + "learning_rate": 1.9254457050243114e-05, + "loss": 1.023, + "step": 297 + }, + { + "epoch": 0.014510749153945414, + "grad_norm": 0.1530066281557083, + "learning_rate": 1.93192868719611e-05, + "loss": 0.7296, + "step": 298 + }, + { + "epoch": 0.014559442943052614, + "grad_norm": 1.968161940574646, + "learning_rate": 1.9384116693679093e-05, + "loss": 0.9718, + "step": 299 + }, + { + "epoch": 0.014608136732159813, + "grad_norm": 3.917356014251709, + "learning_rate": 1.9448946515397084e-05, + "loss": 1.0007, + "step": 300 + }, + { + "epoch": 0.014656830521267013, + "grad_norm": 2.148250102996826, + "learning_rate": 1.9513776337115075e-05, + "loss": 1.1034, + "step": 301 + }, + { + "epoch": 0.014705524310374212, + "grad_norm": 0.15658627450466156, + "learning_rate": 1.9578606158833066e-05, + "loss": 0.7137, + "step": 302 + }, + { + "epoch": 0.01475421809948141, + "grad_norm": 0.14732518792152405, + "learning_rate": 1.9643435980551057e-05, + "loss": 0.6552, + "step": 303 + }, + { + "epoch": 0.014802911888588611, + "grad_norm": 2.823254346847534, + "learning_rate": 1.9708265802269045e-05, + "loss": 1.021, + "step": 304 + }, + { + "epoch": 0.01485160567769581, + "grad_norm": 2.281395673751831, + "learning_rate": 1.9773095623987036e-05, + "loss": 1.0445, + "step": 305 + }, + { + "epoch": 0.01490029946680301, + "grad_norm": 2.5205490589141846, + "learning_rate": 1.9837925445705027e-05, + "loss": 0.9933, + "step": 306 + }, + { + "epoch": 0.014948993255910209, + "grad_norm": 1.9979664087295532, + "learning_rate": 1.9902755267423015e-05, + "loss": 1.0895, + "step": 307 + }, + { + "epoch": 0.014997687045017407, + "grad_norm": 1.7522437572479248, + "learning_rate": 1.9967585089141006e-05, + "loss": 0.9593, + "step": 308 + }, + { + "epoch": 0.015046380834124608, + "grad_norm": 2.6025302410125732, + "learning_rate": 2.0032414910858997e-05, + "loss": 1.1096, + "step": 309 + }, + { + "epoch": 0.015095074623231806, + "grad_norm": 2.6606249809265137, + "learning_rate": 2.009724473257699e-05, + "loss": 0.9069, + "step": 310 + }, + { + "epoch": 0.015143768412339007, + "grad_norm": 2.0834555625915527, + "learning_rate": 2.016207455429498e-05, + "loss": 0.9936, + "step": 311 + }, + { + "epoch": 0.015192462201446206, + "grad_norm": 1.9502568244934082, + "learning_rate": 2.0226904376012967e-05, + "loss": 1.078, + "step": 312 + }, + { + "epoch": 0.015241155990553404, + "grad_norm": 2.2026150226593018, + "learning_rate": 2.029173419773096e-05, + "loss": 0.9959, + "step": 313 + }, + { + "epoch": 0.015289849779660605, + "grad_norm": 2.208299398422241, + "learning_rate": 2.0356564019448946e-05, + "loss": 1.1049, + "step": 314 + }, + { + "epoch": 0.015338543568767803, + "grad_norm": 1.7123290300369263, + "learning_rate": 2.0421393841166937e-05, + "loss": 1.0079, + "step": 315 + }, + { + "epoch": 0.015387237357875004, + "grad_norm": 2.0317881107330322, + "learning_rate": 2.048622366288493e-05, + "loss": 1.0515, + "step": 316 + }, + { + "epoch": 0.015435931146982202, + "grad_norm": 2.72285795211792, + "learning_rate": 2.055105348460292e-05, + "loss": 0.9326, + "step": 317 + }, + { + "epoch": 0.015484624936089401, + "grad_norm": 2.1823229789733887, + "learning_rate": 2.0615883306320907e-05, + "loss": 1.0472, + "step": 318 + }, + { + "epoch": 0.015533318725196602, + "grad_norm": 1.8123681545257568, + "learning_rate": 2.06807131280389e-05, + "loss": 1.0534, + "step": 319 + }, + { + "epoch": 0.0155820125143038, + "grad_norm": 4.037994384765625, + "learning_rate": 2.074554294975689e-05, + "loss": 1.0694, + "step": 320 + }, + { + "epoch": 0.015630706303411, + "grad_norm": 1.9442471265792847, + "learning_rate": 2.081037277147488e-05, + "loss": 0.9666, + "step": 321 + }, + { + "epoch": 0.0156794000925182, + "grad_norm": 2.1183595657348633, + "learning_rate": 2.0875202593192872e-05, + "loss": 1.0274, + "step": 322 + }, + { + "epoch": 0.0157280938816254, + "grad_norm": 2.6239407062530518, + "learning_rate": 2.0940032414910863e-05, + "loss": 0.9864, + "step": 323 + }, + { + "epoch": 0.0157767876707326, + "grad_norm": 3.11822772026062, + "learning_rate": 2.1004862236628847e-05, + "loss": 0.9717, + "step": 324 + }, + { + "epoch": 0.015825481459839797, + "grad_norm": 1.7127619981765747, + "learning_rate": 2.106969205834684e-05, + "loss": 1.0076, + "step": 325 + }, + { + "epoch": 0.015874175248946996, + "grad_norm": 1.7988882064819336, + "learning_rate": 2.113452188006483e-05, + "loss": 1.0024, + "step": 326 + }, + { + "epoch": 0.015922869038054198, + "grad_norm": 2.059793710708618, + "learning_rate": 2.119935170178282e-05, + "loss": 1.0197, + "step": 327 + }, + { + "epoch": 0.015971562827161397, + "grad_norm": 2.5717921257019043, + "learning_rate": 2.1264181523500812e-05, + "loss": 1.0725, + "step": 328 + }, + { + "epoch": 0.016020256616268595, + "grad_norm": 2.0654585361480713, + "learning_rate": 2.1329011345218803e-05, + "loss": 1.0836, + "step": 329 + }, + { + "epoch": 0.016068950405375794, + "grad_norm": 3.4460630416870117, + "learning_rate": 2.139384116693679e-05, + "loss": 0.9938, + "step": 330 + }, + { + "epoch": 0.016117644194482993, + "grad_norm": 2.852635383605957, + "learning_rate": 2.1458670988654782e-05, + "loss": 1.027, + "step": 331 + }, + { + "epoch": 0.016166337983590195, + "grad_norm": 2.286125898361206, + "learning_rate": 2.1523500810372773e-05, + "loss": 1.0429, + "step": 332 + }, + { + "epoch": 0.016215031772697393, + "grad_norm": 2.56862735748291, + "learning_rate": 2.1588330632090764e-05, + "loss": 1.0973, + "step": 333 + }, + { + "epoch": 0.016263725561804592, + "grad_norm": 2.4518861770629883, + "learning_rate": 2.1653160453808756e-05, + "loss": 1.0894, + "step": 334 + }, + { + "epoch": 0.01631241935091179, + "grad_norm": 1.6010596752166748, + "learning_rate": 2.1717990275526747e-05, + "loss": 0.8765, + "step": 335 + }, + { + "epoch": 0.01636111314001899, + "grad_norm": 0.12348173558712006, + "learning_rate": 2.178282009724473e-05, + "loss": 0.649, + "step": 336 + }, + { + "epoch": 0.01640980692912619, + "grad_norm": 2.3185741901397705, + "learning_rate": 2.1847649918962722e-05, + "loss": 1.0467, + "step": 337 + }, + { + "epoch": 0.01645850071823339, + "grad_norm": 2.5788393020629883, + "learning_rate": 2.1912479740680713e-05, + "loss": 0.9805, + "step": 338 + }, + { + "epoch": 0.01650719450734059, + "grad_norm": 2.417008638381958, + "learning_rate": 2.1977309562398705e-05, + "loss": 0.9499, + "step": 339 + }, + { + "epoch": 0.016555888296447788, + "grad_norm": 3.1746816635131836, + "learning_rate": 2.2042139384116696e-05, + "loss": 1.0349, + "step": 340 + }, + { + "epoch": 0.016604582085554986, + "grad_norm": 2.357459783554077, + "learning_rate": 2.2106969205834687e-05, + "loss": 1.1505, + "step": 341 + }, + { + "epoch": 0.01665327587466219, + "grad_norm": 2.3088343143463135, + "learning_rate": 2.2171799027552675e-05, + "loss": 0.9584, + "step": 342 + }, + { + "epoch": 0.016701969663769387, + "grad_norm": 0.10663726180791855, + "learning_rate": 2.2236628849270666e-05, + "loss": 0.6014, + "step": 343 + }, + { + "epoch": 0.016750663452876586, + "grad_norm": 3.3853352069854736, + "learning_rate": 2.2301458670988657e-05, + "loss": 0.9837, + "step": 344 + }, + { + "epoch": 0.016799357241983785, + "grad_norm": 2.140716075897217, + "learning_rate": 2.2366288492706648e-05, + "loss": 1.01, + "step": 345 + }, + { + "epoch": 0.016848051031090983, + "grad_norm": 2.7914907932281494, + "learning_rate": 2.243111831442464e-05, + "loss": 1.0369, + "step": 346 + }, + { + "epoch": 0.016896744820198185, + "grad_norm": 2.5873281955718994, + "learning_rate": 2.249594813614263e-05, + "loss": 1.0348, + "step": 347 + }, + { + "epoch": 0.016945438609305384, + "grad_norm": 2.7999351024627686, + "learning_rate": 2.2560777957860615e-05, + "loss": 0.9678, + "step": 348 + }, + { + "epoch": 0.016994132398412583, + "grad_norm": 0.1282394975423813, + "learning_rate": 2.2625607779578606e-05, + "loss": 0.6991, + "step": 349 + }, + { + "epoch": 0.01704282618751978, + "grad_norm": 3.0318660736083984, + "learning_rate": 2.2690437601296597e-05, + "loss": 1.0788, + "step": 350 + }, + { + "epoch": 0.01709151997662698, + "grad_norm": 2.737649440765381, + "learning_rate": 2.2755267423014588e-05, + "loss": 1.0362, + "step": 351 + }, + { + "epoch": 0.017140213765734182, + "grad_norm": 1.6683567762374878, + "learning_rate": 2.282009724473258e-05, + "loss": 1.0878, + "step": 352 + }, + { + "epoch": 0.01718890755484138, + "grad_norm": 2.2829537391662598, + "learning_rate": 2.288492706645057e-05, + "loss": 1.0761, + "step": 353 + }, + { + "epoch": 0.01723760134394858, + "grad_norm": 2.312715530395508, + "learning_rate": 2.2949756888168558e-05, + "loss": 1.0435, + "step": 354 + }, + { + "epoch": 0.01728629513305578, + "grad_norm": 0.1267106831073761, + "learning_rate": 2.301458670988655e-05, + "loss": 0.6236, + "step": 355 + }, + { + "epoch": 0.017334988922162977, + "grad_norm": 2.1496944427490234, + "learning_rate": 2.307941653160454e-05, + "loss": 1.0602, + "step": 356 + }, + { + "epoch": 0.01738368271127018, + "grad_norm": 2.276364326477051, + "learning_rate": 2.314424635332253e-05, + "loss": 1.0366, + "step": 357 + }, + { + "epoch": 0.017432376500377378, + "grad_norm": 4.8686017990112305, + "learning_rate": 2.3209076175040523e-05, + "loss": 0.965, + "step": 358 + }, + { + "epoch": 0.017481070289484577, + "grad_norm": 3.8798017501831055, + "learning_rate": 2.3273905996758514e-05, + "loss": 1.0482, + "step": 359 + }, + { + "epoch": 0.017529764078591775, + "grad_norm": 1.491219401359558, + "learning_rate": 2.33387358184765e-05, + "loss": 1.1032, + "step": 360 + }, + { + "epoch": 0.017578457867698974, + "grad_norm": 1.4921164512634277, + "learning_rate": 2.340356564019449e-05, + "loss": 1.0363, + "step": 361 + }, + { + "epoch": 0.017627151656806176, + "grad_norm": 2.2991740703582764, + "learning_rate": 2.346839546191248e-05, + "loss": 1.0161, + "step": 362 + }, + { + "epoch": 0.017675845445913375, + "grad_norm": 2.739652633666992, + "learning_rate": 2.3533225283630472e-05, + "loss": 0.9509, + "step": 363 + }, + { + "epoch": 0.017724539235020573, + "grad_norm": 2.227057695388794, + "learning_rate": 2.3598055105348463e-05, + "loss": 1.0052, + "step": 364 + }, + { + "epoch": 0.017773233024127772, + "grad_norm": 2.2680976390838623, + "learning_rate": 2.3662884927066454e-05, + "loss": 1.0191, + "step": 365 + }, + { + "epoch": 0.01782192681323497, + "grad_norm": 2.2497191429138184, + "learning_rate": 2.3727714748784442e-05, + "loss": 0.9143, + "step": 366 + }, + { + "epoch": 0.017870620602342173, + "grad_norm": 2.0163612365722656, + "learning_rate": 2.3792544570502433e-05, + "loss": 0.9785, + "step": 367 + }, + { + "epoch": 0.01791931439144937, + "grad_norm": 1.9305003881454468, + "learning_rate": 2.3857374392220424e-05, + "loss": 1.0568, + "step": 368 + }, + { + "epoch": 0.01796800818055657, + "grad_norm": 0.10355480760335922, + "learning_rate": 2.3922204213938415e-05, + "loss": 0.6089, + "step": 369 + }, + { + "epoch": 0.01801670196966377, + "grad_norm": 1.6975165605545044, + "learning_rate": 2.3987034035656403e-05, + "loss": 1.0184, + "step": 370 + }, + { + "epoch": 0.018065395758770968, + "grad_norm": 1.7285752296447754, + "learning_rate": 2.4051863857374394e-05, + "loss": 0.9815, + "step": 371 + }, + { + "epoch": 0.01811408954787817, + "grad_norm": 2.7775766849517822, + "learning_rate": 2.4116693679092385e-05, + "loss": 1.0871, + "step": 372 + }, + { + "epoch": 0.01816278333698537, + "grad_norm": 1.860260009765625, + "learning_rate": 2.4181523500810373e-05, + "loss": 1.005, + "step": 373 + }, + { + "epoch": 0.018211477126092567, + "grad_norm": 1.8739532232284546, + "learning_rate": 2.4246353322528364e-05, + "loss": 1.0489, + "step": 374 + }, + { + "epoch": 0.018260170915199766, + "grad_norm": 4.175334453582764, + "learning_rate": 2.4311183144246355e-05, + "loss": 1.0473, + "step": 375 + }, + { + "epoch": 0.018308864704306965, + "grad_norm": 1.8349510431289673, + "learning_rate": 2.4376012965964347e-05, + "loss": 0.9284, + "step": 376 + }, + { + "epoch": 0.018357558493414167, + "grad_norm": 2.2791526317596436, + "learning_rate": 2.4440842787682338e-05, + "loss": 1.0378, + "step": 377 + }, + { + "epoch": 0.018406252282521365, + "grad_norm": 0.10789645463228226, + "learning_rate": 2.450567260940033e-05, + "loss": 0.6288, + "step": 378 + }, + { + "epoch": 0.018454946071628564, + "grad_norm": 2.355968713760376, + "learning_rate": 2.4570502431118313e-05, + "loss": 1.0016, + "step": 379 + }, + { + "epoch": 0.018503639860735763, + "grad_norm": 0.0989394411444664, + "learning_rate": 2.4635332252836304e-05, + "loss": 0.6527, + "step": 380 + }, + { + "epoch": 0.01855233364984296, + "grad_norm": 1.9441018104553223, + "learning_rate": 2.4700162074554296e-05, + "loss": 1.0952, + "step": 381 + }, + { + "epoch": 0.018601027438950164, + "grad_norm": 2.583024024963379, + "learning_rate": 2.4764991896272287e-05, + "loss": 1.0353, + "step": 382 + }, + { + "epoch": 0.018649721228057362, + "grad_norm": 1.8415536880493164, + "learning_rate": 2.4829821717990278e-05, + "loss": 1.0322, + "step": 383 + }, + { + "epoch": 0.01869841501716456, + "grad_norm": 2.204754114151001, + "learning_rate": 2.489465153970827e-05, + "loss": 1.0529, + "step": 384 + }, + { + "epoch": 0.01874710880627176, + "grad_norm": 1.685062289237976, + "learning_rate": 2.4959481361426257e-05, + "loss": 0.9564, + "step": 385 + }, + { + "epoch": 0.01879580259537896, + "grad_norm": 2.1638951301574707, + "learning_rate": 2.5024311183144248e-05, + "loss": 0.9501, + "step": 386 + }, + { + "epoch": 0.01884449638448616, + "grad_norm": 2.086672782897949, + "learning_rate": 2.508914100486224e-05, + "loss": 0.9887, + "step": 387 + }, + { + "epoch": 0.01889319017359336, + "grad_norm": 2.5284039974212646, + "learning_rate": 2.515397082658023e-05, + "loss": 1.0149, + "step": 388 + }, + { + "epoch": 0.018941883962700558, + "grad_norm": 2.130089282989502, + "learning_rate": 2.521880064829822e-05, + "loss": 0.9365, + "step": 389 + }, + { + "epoch": 0.018990577751807756, + "grad_norm": 0.09775998443365097, + "learning_rate": 2.5283630470016212e-05, + "loss": 0.665, + "step": 390 + }, + { + "epoch": 0.019039271540914955, + "grad_norm": 2.1528971195220947, + "learning_rate": 2.5348460291734197e-05, + "loss": 1.0663, + "step": 391 + }, + { + "epoch": 0.019087965330022157, + "grad_norm": 1.9917851686477661, + "learning_rate": 2.5413290113452188e-05, + "loss": 1.0206, + "step": 392 + }, + { + "epoch": 0.019136659119129356, + "grad_norm": 4.262947082519531, + "learning_rate": 2.547811993517018e-05, + "loss": 0.9966, + "step": 393 + }, + { + "epoch": 0.019185352908236555, + "grad_norm": 4.160139560699463, + "learning_rate": 2.554294975688817e-05, + "loss": 1.0468, + "step": 394 + }, + { + "epoch": 0.019234046697343753, + "grad_norm": 1.8787238597869873, + "learning_rate": 2.560777957860616e-05, + "loss": 1.1045, + "step": 395 + }, + { + "epoch": 0.019282740486450952, + "grad_norm": 1.6416929960250854, + "learning_rate": 2.5672609400324153e-05, + "loss": 0.9993, + "step": 396 + }, + { + "epoch": 0.019331434275558154, + "grad_norm": 2.3458948135375977, + "learning_rate": 2.573743922204214e-05, + "loss": 0.9181, + "step": 397 + }, + { + "epoch": 0.019380128064665353, + "grad_norm": 2.441420316696167, + "learning_rate": 2.580226904376013e-05, + "loss": 0.992, + "step": 398 + }, + { + "epoch": 0.01942882185377255, + "grad_norm": 2.6267755031585693, + "learning_rate": 2.5867098865478123e-05, + "loss": 1.05, + "step": 399 + }, + { + "epoch": 0.01947751564287975, + "grad_norm": 0.09385080635547638, + "learning_rate": 2.5931928687196114e-05, + "loss": 0.5956, + "step": 400 + }, + { + "epoch": 0.01952620943198695, + "grad_norm": 2.7307708263397217, + "learning_rate": 2.5996758508914105e-05, + "loss": 1.092, + "step": 401 + }, + { + "epoch": 0.01957490322109415, + "grad_norm": 2.4982681274414062, + "learning_rate": 2.6061588330632096e-05, + "loss": 1.083, + "step": 402 + }, + { + "epoch": 0.01962359701020135, + "grad_norm": 3.502535581588745, + "learning_rate": 2.612641815235008e-05, + "loss": 1.0452, + "step": 403 + }, + { + "epoch": 0.01967229079930855, + "grad_norm": 2.557565450668335, + "learning_rate": 2.619124797406807e-05, + "loss": 1.0181, + "step": 404 + }, + { + "epoch": 0.019720984588415747, + "grad_norm": 1.9502094984054565, + "learning_rate": 2.6256077795786063e-05, + "loss": 0.9891, + "step": 405 + }, + { + "epoch": 0.019769678377522946, + "grad_norm": 1.7901406288146973, + "learning_rate": 2.6320907617504054e-05, + "loss": 1.0196, + "step": 406 + }, + { + "epoch": 0.019818372166630148, + "grad_norm": 2.7601139545440674, + "learning_rate": 2.6385737439222045e-05, + "loss": 1.0375, + "step": 407 + }, + { + "epoch": 0.019867065955737347, + "grad_norm": 1.8666871786117554, + "learning_rate": 2.6450567260940036e-05, + "loss": 1.0212, + "step": 408 + }, + { + "epoch": 0.019915759744844545, + "grad_norm": 2.1622893810272217, + "learning_rate": 2.6515397082658024e-05, + "loss": 1.0447, + "step": 409 + }, + { + "epoch": 0.019964453533951744, + "grad_norm": 2.30372953414917, + "learning_rate": 2.6580226904376015e-05, + "loss": 1.0087, + "step": 410 + }, + { + "epoch": 0.020013147323058943, + "grad_norm": 1.990332841873169, + "learning_rate": 2.6645056726094006e-05, + "loss": 1.0748, + "step": 411 + }, + { + "epoch": 0.020061841112166145, + "grad_norm": 2.2521791458129883, + "learning_rate": 2.6709886547811997e-05, + "loss": 1.0827, + "step": 412 + }, + { + "epoch": 0.020110534901273344, + "grad_norm": 2.370309829711914, + "learning_rate": 2.677471636952999e-05, + "loss": 0.9177, + "step": 413 + }, + { + "epoch": 0.020159228690380542, + "grad_norm": 2.2251389026641846, + "learning_rate": 2.683954619124798e-05, + "loss": 0.9174, + "step": 414 + }, + { + "epoch": 0.02020792247948774, + "grad_norm": 2.338634967803955, + "learning_rate": 2.6904376012965964e-05, + "loss": 1.0322, + "step": 415 + }, + { + "epoch": 0.02025661626859494, + "grad_norm": 27.788888931274414, + "learning_rate": 2.6969205834683955e-05, + "loss": 0.9932, + "step": 416 + }, + { + "epoch": 0.02030531005770214, + "grad_norm": 1.952962875366211, + "learning_rate": 2.7034035656401946e-05, + "loss": 0.9823, + "step": 417 + }, + { + "epoch": 0.02035400384680934, + "grad_norm": 1.7657105922698975, + "learning_rate": 2.7098865478119938e-05, + "loss": 0.9003, + "step": 418 + }, + { + "epoch": 0.02040269763591654, + "grad_norm": 2.043911933898926, + "learning_rate": 2.716369529983793e-05, + "loss": 0.9977, + "step": 419 + }, + { + "epoch": 0.020451391425023738, + "grad_norm": 2.934741497039795, + "learning_rate": 2.722852512155592e-05, + "loss": 1.018, + "step": 420 + }, + { + "epoch": 0.020500085214130936, + "grad_norm": 2.291511297225952, + "learning_rate": 2.7293354943273908e-05, + "loss": 0.9866, + "step": 421 + }, + { + "epoch": 0.02054877900323814, + "grad_norm": 2.5935235023498535, + "learning_rate": 2.73581847649919e-05, + "loss": 1.046, + "step": 422 + }, + { + "epoch": 0.020597472792345337, + "grad_norm": 2.6763882637023926, + "learning_rate": 2.742301458670989e-05, + "loss": 1.1014, + "step": 423 + }, + { + "epoch": 0.020646166581452536, + "grad_norm": 3.480491876602173, + "learning_rate": 2.748784440842788e-05, + "loss": 0.97, + "step": 424 + }, + { + "epoch": 0.020694860370559735, + "grad_norm": 2.07953143119812, + "learning_rate": 2.7552674230145872e-05, + "loss": 1.0643, + "step": 425 + }, + { + "epoch": 0.020743554159666933, + "grad_norm": 3.085841178894043, + "learning_rate": 2.761750405186386e-05, + "loss": 0.9475, + "step": 426 + }, + { + "epoch": 0.020792247948774135, + "grad_norm": 0.1051601767539978, + "learning_rate": 2.7682333873581848e-05, + "loss": 0.6687, + "step": 427 + }, + { + "epoch": 0.020840941737881334, + "grad_norm": 2.686763048171997, + "learning_rate": 2.774716369529984e-05, + "loss": 0.9535, + "step": 428 + }, + { + "epoch": 0.020889635526988533, + "grad_norm": 2.6260788440704346, + "learning_rate": 2.781199351701783e-05, + "loss": 0.9969, + "step": 429 + }, + { + "epoch": 0.02093832931609573, + "grad_norm": 1.9896270036697388, + "learning_rate": 2.787682333873582e-05, + "loss": 1.0102, + "step": 430 + }, + { + "epoch": 0.02098702310520293, + "grad_norm": 2.074465751647949, + "learning_rate": 2.7941653160453812e-05, + "loss": 1.0622, + "step": 431 + }, + { + "epoch": 0.021035716894310132, + "grad_norm": 2.2959980964660645, + "learning_rate": 2.8006482982171803e-05, + "loss": 1.0511, + "step": 432 + }, + { + "epoch": 0.02108441068341733, + "grad_norm": 2.310297727584839, + "learning_rate": 2.807131280388979e-05, + "loss": 0.9952, + "step": 433 + }, + { + "epoch": 0.02113310447252453, + "grad_norm": 2.7813801765441895, + "learning_rate": 2.8136142625607782e-05, + "loss": 1.0256, + "step": 434 + }, + { + "epoch": 0.02118179826163173, + "grad_norm": 1.7151693105697632, + "learning_rate": 2.820097244732577e-05, + "loss": 1.0046, + "step": 435 + }, + { + "epoch": 0.021230492050738927, + "grad_norm": 2.42338228225708, + "learning_rate": 2.826580226904376e-05, + "loss": 0.91, + "step": 436 + }, + { + "epoch": 0.02127918583984613, + "grad_norm": 1.7467331886291504, + "learning_rate": 2.8330632090761752e-05, + "loss": 0.9487, + "step": 437 + }, + { + "epoch": 0.021327879628953328, + "grad_norm": 2.086745500564575, + "learning_rate": 2.8395461912479744e-05, + "loss": 1.0028, + "step": 438 + }, + { + "epoch": 0.021376573418060527, + "grad_norm": 1.9011980295181274, + "learning_rate": 2.846029173419773e-05, + "loss": 0.9751, + "step": 439 + }, + { + "epoch": 0.021425267207167725, + "grad_norm": 2.4477837085723877, + "learning_rate": 2.8525121555915722e-05, + "loss": 1.18, + "step": 440 + }, + { + "epoch": 0.021473960996274924, + "grad_norm": 2.125051975250244, + "learning_rate": 2.8589951377633714e-05, + "loss": 1.0878, + "step": 441 + }, + { + "epoch": 0.021522654785382126, + "grad_norm": 2.2513673305511475, + "learning_rate": 2.8654781199351705e-05, + "loss": 0.9413, + "step": 442 + }, + { + "epoch": 0.021571348574489325, + "grad_norm": 2.9314498901367188, + "learning_rate": 2.8719611021069696e-05, + "loss": 1.0035, + "step": 443 + }, + { + "epoch": 0.021620042363596523, + "grad_norm": 5.616748809814453, + "learning_rate": 2.8784440842787687e-05, + "loss": 1.0338, + "step": 444 + }, + { + "epoch": 0.021668736152703722, + "grad_norm": 3.5767030715942383, + "learning_rate": 2.884927066450567e-05, + "loss": 0.9238, + "step": 445 + }, + { + "epoch": 0.02171742994181092, + "grad_norm": 2.1487019062042236, + "learning_rate": 2.8914100486223663e-05, + "loss": 1.005, + "step": 446 + }, + { + "epoch": 0.021766123730918123, + "grad_norm": 2.6322174072265625, + "learning_rate": 2.8978930307941654e-05, + "loss": 1.0691, + "step": 447 + }, + { + "epoch": 0.02181481752002532, + "grad_norm": 3.060429811477661, + "learning_rate": 2.9043760129659645e-05, + "loss": 1.0196, + "step": 448 + }, + { + "epoch": 0.02186351130913252, + "grad_norm": 2.821092367172241, + "learning_rate": 2.9108589951377636e-05, + "loss": 0.9535, + "step": 449 + }, + { + "epoch": 0.02191220509823972, + "grad_norm": 1.9808021783828735, + "learning_rate": 2.9173419773095627e-05, + "loss": 1.0442, + "step": 450 + }, + { + "epoch": 0.021960898887346918, + "grad_norm": 3.903930187225342, + "learning_rate": 2.9238249594813615e-05, + "loss": 0.9834, + "step": 451 + }, + { + "epoch": 0.02200959267645412, + "grad_norm": 4.109078884124756, + "learning_rate": 2.9303079416531606e-05, + "loss": 1.0624, + "step": 452 + }, + { + "epoch": 0.02205828646556132, + "grad_norm": 3.0043177604675293, + "learning_rate": 2.9367909238249597e-05, + "loss": 1.003, + "step": 453 + }, + { + "epoch": 0.022106980254668517, + "grad_norm": 2.590132713317871, + "learning_rate": 2.943273905996759e-05, + "loss": 1.0359, + "step": 454 + }, + { + "epoch": 0.022155674043775716, + "grad_norm": 2.6366941928863525, + "learning_rate": 2.949756888168558e-05, + "loss": 1.0096, + "step": 455 + }, + { + "epoch": 0.022204367832882915, + "grad_norm": 2.4326822757720947, + "learning_rate": 2.956239870340357e-05, + "loss": 1.0298, + "step": 456 + }, + { + "epoch": 0.022253061621990117, + "grad_norm": 2.8013861179351807, + "learning_rate": 2.9627228525121555e-05, + "loss": 1.0821, + "step": 457 + }, + { + "epoch": 0.022301755411097315, + "grad_norm": 2.0566751956939697, + "learning_rate": 2.9692058346839546e-05, + "loss": 0.837, + "step": 458 + }, + { + "epoch": 0.022350449200204514, + "grad_norm": 2.1780924797058105, + "learning_rate": 2.9756888168557537e-05, + "loss": 1.0101, + "step": 459 + }, + { + "epoch": 0.022399142989311713, + "grad_norm": 5.737508773803711, + "learning_rate": 2.982171799027553e-05, + "loss": 1.0074, + "step": 460 + }, + { + "epoch": 0.02244783677841891, + "grad_norm": 2.197871446609497, + "learning_rate": 2.988654781199352e-05, + "loss": 1.0215, + "step": 461 + }, + { + "epoch": 0.022496530567526114, + "grad_norm": 2.164320230484009, + "learning_rate": 2.995137763371151e-05, + "loss": 1.0088, + "step": 462 + }, + { + "epoch": 0.022545224356633312, + "grad_norm": 0.10526752471923828, + "learning_rate": 3.00162074554295e-05, + "loss": 0.629, + "step": 463 + }, + { + "epoch": 0.02259391814574051, + "grad_norm": 2.1289801597595215, + "learning_rate": 3.008103727714749e-05, + "loss": 1.034, + "step": 464 + }, + { + "epoch": 0.02264261193484771, + "grad_norm": 2.3047211170196533, + "learning_rate": 3.014586709886548e-05, + "loss": 0.9849, + "step": 465 + }, + { + "epoch": 0.02269130572395491, + "grad_norm": 2.2475645542144775, + "learning_rate": 3.0210696920583472e-05, + "loss": 1.0254, + "step": 466 + }, + { + "epoch": 0.02273999951306211, + "grad_norm": 1.5767109394073486, + "learning_rate": 3.0275526742301463e-05, + "loss": 1.0834, + "step": 467 + }, + { + "epoch": 0.02278869330216931, + "grad_norm": 1.8482333421707153, + "learning_rate": 3.0340356564019454e-05, + "loss": 0.8911, + "step": 468 + }, + { + "epoch": 0.022837387091276508, + "grad_norm": 1.7968910932540894, + "learning_rate": 3.040518638573744e-05, + "loss": 1.0764, + "step": 469 + }, + { + "epoch": 0.022886080880383707, + "grad_norm": 1.717691421508789, + "learning_rate": 3.047001620745543e-05, + "loss": 1.0176, + "step": 470 + }, + { + "epoch": 0.022934774669490905, + "grad_norm": 4.378477096557617, + "learning_rate": 3.0534846029173424e-05, + "loss": 0.9868, + "step": 471 + }, + { + "epoch": 0.022983468458598107, + "grad_norm": 1.7011104822158813, + "learning_rate": 3.0599675850891415e-05, + "loss": 0.9988, + "step": 472 + }, + { + "epoch": 0.023032162247705306, + "grad_norm": 1.8283088207244873, + "learning_rate": 3.066450567260941e-05, + "loss": 0.9807, + "step": 473 + }, + { + "epoch": 0.023080856036812505, + "grad_norm": 2.1688098907470703, + "learning_rate": 3.07293354943274e-05, + "loss": 1.0135, + "step": 474 + }, + { + "epoch": 0.023129549825919703, + "grad_norm": 1.8126914501190186, + "learning_rate": 3.079416531604538e-05, + "loss": 1.0573, + "step": 475 + }, + { + "epoch": 0.023178243615026902, + "grad_norm": 2.2765097618103027, + "learning_rate": 3.085899513776337e-05, + "loss": 0.9293, + "step": 476 + }, + { + "epoch": 0.023226937404134104, + "grad_norm": 5.261617660522461, + "learning_rate": 3.0923824959481364e-05, + "loss": 1.0552, + "step": 477 + }, + { + "epoch": 0.023275631193241303, + "grad_norm": 1.7710902690887451, + "learning_rate": 3.0988654781199356e-05, + "loss": 1.0774, + "step": 478 + }, + { + "epoch": 0.0233243249823485, + "grad_norm": 2.3287806510925293, + "learning_rate": 3.105348460291735e-05, + "loss": 0.9497, + "step": 479 + }, + { + "epoch": 0.0233730187714557, + "grad_norm": 2.51962947845459, + "learning_rate": 3.111831442463534e-05, + "loss": 1.0924, + "step": 480 + }, + { + "epoch": 0.0234217125605629, + "grad_norm": 1.9604449272155762, + "learning_rate": 3.118314424635332e-05, + "loss": 1.0317, + "step": 481 + }, + { + "epoch": 0.0234704063496701, + "grad_norm": 3.0046353340148926, + "learning_rate": 3.1247974068071313e-05, + "loss": 1.0184, + "step": 482 + }, + { + "epoch": 0.0235191001387773, + "grad_norm": 2.2795732021331787, + "learning_rate": 3.1312803889789305e-05, + "loss": 0.9243, + "step": 483 + }, + { + "epoch": 0.0235677939278845, + "grad_norm": 2.790226936340332, + "learning_rate": 3.1377633711507296e-05, + "loss": 1.0143, + "step": 484 + }, + { + "epoch": 0.023616487716991697, + "grad_norm": 1.7765182256698608, + "learning_rate": 3.144246353322529e-05, + "loss": 0.9888, + "step": 485 + }, + { + "epoch": 0.023665181506098896, + "grad_norm": 2.8066537380218506, + "learning_rate": 3.150729335494328e-05, + "loss": 1.0123, + "step": 486 + }, + { + "epoch": 0.023713875295206098, + "grad_norm": 2.1740822792053223, + "learning_rate": 3.157212317666126e-05, + "loss": 0.9771, + "step": 487 + }, + { + "epoch": 0.023762569084313297, + "grad_norm": 3.4194021224975586, + "learning_rate": 3.1636952998379254e-05, + "loss": 1.0868, + "step": 488 + }, + { + "epoch": 0.023811262873420495, + "grad_norm": 1.806457281112671, + "learning_rate": 3.1701782820097245e-05, + "loss": 1.0025, + "step": 489 + }, + { + "epoch": 0.023859956662527694, + "grad_norm": 1.7321624755859375, + "learning_rate": 3.1766612641815236e-05, + "loss": 1.0261, + "step": 490 + }, + { + "epoch": 0.023908650451634893, + "grad_norm": 2.302947521209717, + "learning_rate": 3.183144246353323e-05, + "loss": 1.0851, + "step": 491 + }, + { + "epoch": 0.023957344240742095, + "grad_norm": 4.102329730987549, + "learning_rate": 3.189627228525122e-05, + "loss": 1.0824, + "step": 492 + }, + { + "epoch": 0.024006038029849294, + "grad_norm": 2.1122288703918457, + "learning_rate": 3.196110210696921e-05, + "loss": 1.0467, + "step": 493 + }, + { + "epoch": 0.024054731818956492, + "grad_norm": 5.404764175415039, + "learning_rate": 3.20259319286872e-05, + "loss": 0.9741, + "step": 494 + }, + { + "epoch": 0.02410342560806369, + "grad_norm": 1.7255363464355469, + "learning_rate": 3.209076175040519e-05, + "loss": 1.0172, + "step": 495 + }, + { + "epoch": 0.02415211939717089, + "grad_norm": 2.9701128005981445, + "learning_rate": 3.2155591572123176e-05, + "loss": 1.0812, + "step": 496 + }, + { + "epoch": 0.024200813186278092, + "grad_norm": 2.557631492614746, + "learning_rate": 3.222042139384117e-05, + "loss": 1.0151, + "step": 497 + }, + { + "epoch": 0.02424950697538529, + "grad_norm": 1.6299247741699219, + "learning_rate": 3.228525121555916e-05, + "loss": 1.0256, + "step": 498 + }, + { + "epoch": 0.02429820076449249, + "grad_norm": 2.7617287635803223, + "learning_rate": 3.235008103727715e-05, + "loss": 0.9363, + "step": 499 + }, + { + "epoch": 0.024346894553599688, + "grad_norm": 2.1531519889831543, + "learning_rate": 3.241491085899514e-05, + "loss": 1.0183, + "step": 500 + }, + { + "epoch": 0.024395588342706886, + "grad_norm": 2.0843493938446045, + "learning_rate": 3.247974068071313e-05, + "loss": 0.9649, + "step": 501 + }, + { + "epoch": 0.02444428213181409, + "grad_norm": 2.1867334842681885, + "learning_rate": 3.254457050243112e-05, + "loss": 1.0133, + "step": 502 + }, + { + "epoch": 0.024492975920921287, + "grad_norm": 1.8980716466903687, + "learning_rate": 3.2609400324149114e-05, + "loss": 1.0488, + "step": 503 + }, + { + "epoch": 0.024541669710028486, + "grad_norm": 2.8974719047546387, + "learning_rate": 3.2674230145867105e-05, + "loss": 1.0124, + "step": 504 + }, + { + "epoch": 0.024590363499135685, + "grad_norm": 3.0150625705718994, + "learning_rate": 3.2739059967585096e-05, + "loss": 1.0115, + "step": 505 + }, + { + "epoch": 0.024639057288242883, + "grad_norm": 3.6080873012542725, + "learning_rate": 3.280388978930308e-05, + "loss": 0.9839, + "step": 506 + }, + { + "epoch": 0.024687751077350086, + "grad_norm": 2.4524624347686768, + "learning_rate": 3.286871961102107e-05, + "loss": 0.9994, + "step": 507 + }, + { + "epoch": 0.024736444866457284, + "grad_norm": 2.110025644302368, + "learning_rate": 3.293354943273906e-05, + "loss": 1.0538, + "step": 508 + }, + { + "epoch": 0.024785138655564483, + "grad_norm": 3.64152193069458, + "learning_rate": 3.2998379254457054e-05, + "loss": 1.0734, + "step": 509 + }, + { + "epoch": 0.02483383244467168, + "grad_norm": 0.1006433367729187, + "learning_rate": 3.3063209076175045e-05, + "loss": 0.6398, + "step": 510 + }, + { + "epoch": 0.02488252623377888, + "grad_norm": 2.2548813819885254, + "learning_rate": 3.3128038897893036e-05, + "loss": 1.0217, + "step": 511 + }, + { + "epoch": 0.024931220022886082, + "grad_norm": 2.664238452911377, + "learning_rate": 3.319286871961102e-05, + "loss": 0.9335, + "step": 512 + }, + { + "epoch": 0.02497991381199328, + "grad_norm": 1.8910140991210938, + "learning_rate": 3.325769854132901e-05, + "loss": 0.962, + "step": 513 + }, + { + "epoch": 0.02502860760110048, + "grad_norm": 2.0702168941497803, + "learning_rate": 3.3322528363047e-05, + "loss": 0.9755, + "step": 514 + }, + { + "epoch": 0.02507730139020768, + "grad_norm": 1.7760716676712036, + "learning_rate": 3.3387358184764994e-05, + "loss": 1.0019, + "step": 515 + }, + { + "epoch": 0.025125995179314877, + "grad_norm": 3.7862322330474854, + "learning_rate": 3.3452188006482985e-05, + "loss": 1.0549, + "step": 516 + }, + { + "epoch": 0.02517468896842208, + "grad_norm": 2.3980329036712646, + "learning_rate": 3.3517017828200977e-05, + "loss": 1.014, + "step": 517 + }, + { + "epoch": 0.025223382757529278, + "grad_norm": 1.642289161682129, + "learning_rate": 3.358184764991896e-05, + "loss": 1.0988, + "step": 518 + }, + { + "epoch": 0.025272076546636477, + "grad_norm": 2.3299777507781982, + "learning_rate": 3.364667747163695e-05, + "loss": 1.0718, + "step": 519 + }, + { + "epoch": 0.025320770335743675, + "grad_norm": 2.7846076488494873, + "learning_rate": 3.371150729335494e-05, + "loss": 1.0171, + "step": 520 + }, + { + "epoch": 0.025369464124850874, + "grad_norm": 1.7091236114501953, + "learning_rate": 3.3776337115072934e-05, + "loss": 1.0252, + "step": 521 + }, + { + "epoch": 0.025418157913958076, + "grad_norm": 1.6958582401275635, + "learning_rate": 3.3841166936790925e-05, + "loss": 0.9351, + "step": 522 + }, + { + "epoch": 0.025466851703065275, + "grad_norm": 1.9740880727767944, + "learning_rate": 3.390599675850892e-05, + "loss": 0.8865, + "step": 523 + }, + { + "epoch": 0.025515545492172474, + "grad_norm": 1.6591501235961914, + "learning_rate": 3.397082658022691e-05, + "loss": 0.9478, + "step": 524 + }, + { + "epoch": 0.025564239281279672, + "grad_norm": 2.239241600036621, + "learning_rate": 3.40356564019449e-05, + "loss": 1.0018, + "step": 525 + }, + { + "epoch": 0.02561293307038687, + "grad_norm": 2.735506057739258, + "learning_rate": 3.410048622366289e-05, + "loss": 0.9481, + "step": 526 + }, + { + "epoch": 0.025661626859494073, + "grad_norm": 2.435372829437256, + "learning_rate": 3.416531604538088e-05, + "loss": 1.076, + "step": 527 + }, + { + "epoch": 0.02571032064860127, + "grad_norm": 2.261003255844116, + "learning_rate": 3.423014586709887e-05, + "loss": 0.9814, + "step": 528 + }, + { + "epoch": 0.02575901443770847, + "grad_norm": 2.5261242389678955, + "learning_rate": 3.4294975688816864e-05, + "loss": 0.9609, + "step": 529 + }, + { + "epoch": 0.02580770822681567, + "grad_norm": 2.371645927429199, + "learning_rate": 3.435980551053485e-05, + "loss": 1.0273, + "step": 530 + }, + { + "epoch": 0.025856402015922868, + "grad_norm": 2.715780735015869, + "learning_rate": 3.442463533225284e-05, + "loss": 1.004, + "step": 531 + }, + { + "epoch": 0.02590509580503007, + "grad_norm": 0.13597795367240906, + "learning_rate": 3.448946515397083e-05, + "loss": 0.6707, + "step": 532 + }, + { + "epoch": 0.02595378959413727, + "grad_norm": 2.350999116897583, + "learning_rate": 3.455429497568882e-05, + "loss": 1.1069, + "step": 533 + }, + { + "epoch": 0.026002483383244467, + "grad_norm": 2.2074766159057617, + "learning_rate": 3.461912479740681e-05, + "loss": 0.9725, + "step": 534 + }, + { + "epoch": 0.026051177172351666, + "grad_norm": 2.490415334701538, + "learning_rate": 3.4683954619124804e-05, + "loss": 0.9399, + "step": 535 + }, + { + "epoch": 0.026099870961458865, + "grad_norm": 2.4299426078796387, + "learning_rate": 3.474878444084279e-05, + "loss": 0.9321, + "step": 536 + }, + { + "epoch": 0.026148564750566067, + "grad_norm": 1.711242437362671, + "learning_rate": 3.481361426256078e-05, + "loss": 1.0455, + "step": 537 + }, + { + "epoch": 0.026197258539673265, + "grad_norm": 1.785469889640808, + "learning_rate": 3.487844408427877e-05, + "loss": 1.0346, + "step": 538 + }, + { + "epoch": 0.026245952328780464, + "grad_norm": 2.1605918407440186, + "learning_rate": 3.494327390599676e-05, + "loss": 0.9684, + "step": 539 + }, + { + "epoch": 0.026294646117887663, + "grad_norm": 2.203432083129883, + "learning_rate": 3.500810372771475e-05, + "loss": 0.9412, + "step": 540 + }, + { + "epoch": 0.02634333990699486, + "grad_norm": 1.9612529277801514, + "learning_rate": 3.5072933549432744e-05, + "loss": 1.0341, + "step": 541 + }, + { + "epoch": 0.026392033696102064, + "grad_norm": 9.225504875183105, + "learning_rate": 3.513776337115073e-05, + "loss": 0.9874, + "step": 542 + }, + { + "epoch": 0.026440727485209262, + "grad_norm": 3.08673357963562, + "learning_rate": 3.520259319286872e-05, + "loss": 1.1279, + "step": 543 + }, + { + "epoch": 0.02648942127431646, + "grad_norm": 1.5902131795883179, + "learning_rate": 3.526742301458671e-05, + "loss": 0.9576, + "step": 544 + }, + { + "epoch": 0.02653811506342366, + "grad_norm": 2.3041839599609375, + "learning_rate": 3.53322528363047e-05, + "loss": 0.9785, + "step": 545 + }, + { + "epoch": 0.02658680885253086, + "grad_norm": 2.7498135566711426, + "learning_rate": 3.539708265802269e-05, + "loss": 1.0252, + "step": 546 + }, + { + "epoch": 0.02663550264163806, + "grad_norm": 0.10858599096536636, + "learning_rate": 3.5461912479740684e-05, + "loss": 0.6414, + "step": 547 + }, + { + "epoch": 0.02668419643074526, + "grad_norm": 3.5636260509490967, + "learning_rate": 3.5526742301458675e-05, + "loss": 1.0274, + "step": 548 + }, + { + "epoch": 0.026732890219852458, + "grad_norm": 2.368516683578491, + "learning_rate": 3.5591572123176666e-05, + "loss": 1.0831, + "step": 549 + }, + { + "epoch": 0.026781584008959657, + "grad_norm": 2.059257984161377, + "learning_rate": 3.565640194489466e-05, + "loss": 1.0788, + "step": 550 + }, + { + "epoch": 0.026830277798066855, + "grad_norm": 2.1013729572296143, + "learning_rate": 3.572123176661265e-05, + "loss": 0.9301, + "step": 551 + }, + { + "epoch": 0.026878971587174057, + "grad_norm": 1.7301254272460938, + "learning_rate": 3.578606158833063e-05, + "loss": 0.9624, + "step": 552 + }, + { + "epoch": 0.026927665376281256, + "grad_norm": 3.9803719520568848, + "learning_rate": 3.5850891410048624e-05, + "loss": 0.9784, + "step": 553 + }, + { + "epoch": 0.026976359165388455, + "grad_norm": 2.3213491439819336, + "learning_rate": 3.5915721231766615e-05, + "loss": 0.9969, + "step": 554 + }, + { + "epoch": 0.027025052954495653, + "grad_norm": 2.339308977127075, + "learning_rate": 3.5980551053484606e-05, + "loss": 0.9818, + "step": 555 + }, + { + "epoch": 0.027073746743602852, + "grad_norm": 2.5330801010131836, + "learning_rate": 3.60453808752026e-05, + "loss": 0.9752, + "step": 556 + }, + { + "epoch": 0.027122440532710054, + "grad_norm": 2.460414409637451, + "learning_rate": 3.611021069692059e-05, + "loss": 1.0687, + "step": 557 + }, + { + "epoch": 0.027171134321817253, + "grad_norm": 2.1701488494873047, + "learning_rate": 3.617504051863858e-05, + "loss": 1.0404, + "step": 558 + }, + { + "epoch": 0.02721982811092445, + "grad_norm": 1.5528171062469482, + "learning_rate": 3.623987034035657e-05, + "loss": 0.8884, + "step": 559 + }, + { + "epoch": 0.02726852190003165, + "grad_norm": 1.7984468936920166, + "learning_rate": 3.6304700162074555e-05, + "loss": 0.9284, + "step": 560 + }, + { + "epoch": 0.02731721568913885, + "grad_norm": 4.170769214630127, + "learning_rate": 3.6369529983792546e-05, + "loss": 0.9267, + "step": 561 + }, + { + "epoch": 0.02736590947824605, + "grad_norm": 2.660585403442383, + "learning_rate": 3.643435980551054e-05, + "loss": 1.0474, + "step": 562 + }, + { + "epoch": 0.02741460326735325, + "grad_norm": 1.970781922340393, + "learning_rate": 3.649918962722853e-05, + "loss": 0.9521, + "step": 563 + }, + { + "epoch": 0.02746329705646045, + "grad_norm": 2.127364158630371, + "learning_rate": 3.656401944894652e-05, + "loss": 0.9989, + "step": 564 + }, + { + "epoch": 0.027511990845567647, + "grad_norm": 2.374283790588379, + "learning_rate": 3.662884927066451e-05, + "loss": 1.0808, + "step": 565 + }, + { + "epoch": 0.027560684634674846, + "grad_norm": 1.5529088973999023, + "learning_rate": 3.6693679092382495e-05, + "loss": 1.0504, + "step": 566 + }, + { + "epoch": 0.027609378423782048, + "grad_norm": 2.288280725479126, + "learning_rate": 3.6758508914100487e-05, + "loss": 1.0148, + "step": 567 + }, + { + "epoch": 0.027658072212889247, + "grad_norm": 1.8117592334747314, + "learning_rate": 3.682333873581848e-05, + "loss": 0.9802, + "step": 568 + }, + { + "epoch": 0.027706766001996445, + "grad_norm": 3.9279286861419678, + "learning_rate": 3.688816855753647e-05, + "loss": 1.0003, + "step": 569 + }, + { + "epoch": 0.027755459791103644, + "grad_norm": 1.9738463163375854, + "learning_rate": 3.695299837925446e-05, + "loss": 1.099, + "step": 570 + }, + { + "epoch": 0.027804153580210843, + "grad_norm": 1.9521723985671997, + "learning_rate": 3.701782820097245e-05, + "loss": 0.9077, + "step": 571 + }, + { + "epoch": 0.027852847369318045, + "grad_norm": 1.7752403020858765, + "learning_rate": 3.7082658022690435e-05, + "loss": 1.0526, + "step": 572 + }, + { + "epoch": 0.027901541158425244, + "grad_norm": 1.9197717905044556, + "learning_rate": 3.714748784440843e-05, + "loss": 1.0214, + "step": 573 + }, + { + "epoch": 0.027950234947532442, + "grad_norm": 2.5880606174468994, + "learning_rate": 3.721231766612642e-05, + "loss": 1.0614, + "step": 574 + }, + { + "epoch": 0.02799892873663964, + "grad_norm": 8.818861961364746, + "learning_rate": 3.727714748784441e-05, + "loss": 1.106, + "step": 575 + }, + { + "epoch": 0.02804762252574684, + "grad_norm": 1.9681780338287354, + "learning_rate": 3.73419773095624e-05, + "loss": 1.0308, + "step": 576 + }, + { + "epoch": 0.028096316314854042, + "grad_norm": 2.5822384357452393, + "learning_rate": 3.740680713128039e-05, + "loss": 0.9029, + "step": 577 + }, + { + "epoch": 0.02814501010396124, + "grad_norm": 3.987283706665039, + "learning_rate": 3.747163695299838e-05, + "loss": 0.9832, + "step": 578 + }, + { + "epoch": 0.02819370389306844, + "grad_norm": 2.553811550140381, + "learning_rate": 3.7536466774716374e-05, + "loss": 0.9231, + "step": 579 + }, + { + "epoch": 0.028242397682175638, + "grad_norm": 1.913416862487793, + "learning_rate": 3.7601296596434365e-05, + "loss": 1.0636, + "step": 580 + }, + { + "epoch": 0.028291091471282837, + "grad_norm": 3.7639288902282715, + "learning_rate": 3.7666126418152356e-05, + "loss": 0.9292, + "step": 581 + }, + { + "epoch": 0.02833978526039004, + "grad_norm": 3.648477792739868, + "learning_rate": 3.773095623987035e-05, + "loss": 1.0083, + "step": 582 + }, + { + "epoch": 0.028388479049497237, + "grad_norm": 2.3596363067626953, + "learning_rate": 3.779578606158834e-05, + "loss": 1.1034, + "step": 583 + }, + { + "epoch": 0.028437172838604436, + "grad_norm": 0.09372120350599289, + "learning_rate": 3.786061588330632e-05, + "loss": 0.6195, + "step": 584 + }, + { + "epoch": 0.028485866627711635, + "grad_norm": 2.1756207942962646, + "learning_rate": 3.7925445705024314e-05, + "loss": 0.9396, + "step": 585 + }, + { + "epoch": 0.028534560416818833, + "grad_norm": 1.8068351745605469, + "learning_rate": 3.7990275526742305e-05, + "loss": 0.9919, + "step": 586 + }, + { + "epoch": 0.028583254205926036, + "grad_norm": 2.400648832321167, + "learning_rate": 3.8055105348460296e-05, + "loss": 1.0196, + "step": 587 + }, + { + "epoch": 0.028631947995033234, + "grad_norm": 2.0348987579345703, + "learning_rate": 3.811993517017829e-05, + "loss": 1.1034, + "step": 588 + }, + { + "epoch": 0.028680641784140433, + "grad_norm": 2.0021495819091797, + "learning_rate": 3.818476499189628e-05, + "loss": 0.9409, + "step": 589 + }, + { + "epoch": 0.02872933557324763, + "grad_norm": 2.3837149143218994, + "learning_rate": 3.824959481361426e-05, + "loss": 0.9021, + "step": 590 + }, + { + "epoch": 0.02877802936235483, + "grad_norm": 2.05210542678833, + "learning_rate": 3.8314424635332254e-05, + "loss": 0.9551, + "step": 591 + }, + { + "epoch": 0.028826723151462032, + "grad_norm": 2.4614205360412598, + "learning_rate": 3.8379254457050245e-05, + "loss": 1.0534, + "step": 592 + }, + { + "epoch": 0.02887541694056923, + "grad_norm": 0.09540872275829315, + "learning_rate": 3.8444084278768236e-05, + "loss": 0.6679, + "step": 593 + }, + { + "epoch": 0.02892411072967643, + "grad_norm": 2.1147046089172363, + "learning_rate": 3.850891410048623e-05, + "loss": 0.9599, + "step": 594 + }, + { + "epoch": 0.02897280451878363, + "grad_norm": 2.596844434738159, + "learning_rate": 3.857374392220422e-05, + "loss": 0.9706, + "step": 595 + }, + { + "epoch": 0.029021498307890827, + "grad_norm": 0.09304942935705185, + "learning_rate": 3.86385737439222e-05, + "loss": 0.6598, + "step": 596 + }, + { + "epoch": 0.02907019209699803, + "grad_norm": 1.9215260744094849, + "learning_rate": 3.8703403565640194e-05, + "loss": 0.8982, + "step": 597 + }, + { + "epoch": 0.029118885886105228, + "grad_norm": 3.2562789916992188, + "learning_rate": 3.8768233387358185e-05, + "loss": 0.9643, + "step": 598 + }, + { + "epoch": 0.029167579675212427, + "grad_norm": 1.9583642482757568, + "learning_rate": 3.8833063209076176e-05, + "loss": 1.0235, + "step": 599 + }, + { + "epoch": 0.029216273464319625, + "grad_norm": 2.0927844047546387, + "learning_rate": 3.889789303079417e-05, + "loss": 0.9471, + "step": 600 + }, + { + "epoch": 0.029264967253426824, + "grad_norm": 2.589385986328125, + "learning_rate": 3.896272285251216e-05, + "loss": 1.0183, + "step": 601 + }, + { + "epoch": 0.029313661042534026, + "grad_norm": 4.183237552642822, + "learning_rate": 3.902755267423015e-05, + "loss": 1.0151, + "step": 602 + }, + { + "epoch": 0.029362354831641225, + "grad_norm": 1.520853042602539, + "learning_rate": 3.909238249594814e-05, + "loss": 0.926, + "step": 603 + }, + { + "epoch": 0.029411048620748424, + "grad_norm": 1.8899506330490112, + "learning_rate": 3.915721231766613e-05, + "loss": 1.0292, + "step": 604 + }, + { + "epoch": 0.029459742409855622, + "grad_norm": 2.2169172763824463, + "learning_rate": 3.922204213938412e-05, + "loss": 1.0297, + "step": 605 + }, + { + "epoch": 0.02950843619896282, + "grad_norm": 2.2810416221618652, + "learning_rate": 3.9286871961102114e-05, + "loss": 1.032, + "step": 606 + }, + { + "epoch": 0.029557129988070023, + "grad_norm": 1.814902663230896, + "learning_rate": 3.9351701782820105e-05, + "loss": 0.9721, + "step": 607 + }, + { + "epoch": 0.029605823777177222, + "grad_norm": 0.10144108533859253, + "learning_rate": 3.941653160453809e-05, + "loss": 0.6472, + "step": 608 + }, + { + "epoch": 0.02965451756628442, + "grad_norm": 0.0859297513961792, + "learning_rate": 3.948136142625608e-05, + "loss": 0.622, + "step": 609 + }, + { + "epoch": 0.02970321135539162, + "grad_norm": 1.7854193449020386, + "learning_rate": 3.954619124797407e-05, + "loss": 1.0527, + "step": 610 + }, + { + "epoch": 0.029751905144498818, + "grad_norm": 2.156127691268921, + "learning_rate": 3.961102106969206e-05, + "loss": 0.968, + "step": 611 + }, + { + "epoch": 0.02980059893360602, + "grad_norm": 2.0210020542144775, + "learning_rate": 3.9675850891410054e-05, + "loss": 1.0438, + "step": 612 + }, + { + "epoch": 0.02984929272271322, + "grad_norm": 2.3017585277557373, + "learning_rate": 3.9740680713128045e-05, + "loss": 1.0082, + "step": 613 + }, + { + "epoch": 0.029897986511820417, + "grad_norm": 2.2427332401275635, + "learning_rate": 3.980551053484603e-05, + "loss": 1.008, + "step": 614 + }, + { + "epoch": 0.029946680300927616, + "grad_norm": 1.7297391891479492, + "learning_rate": 3.987034035656402e-05, + "loss": 0.9424, + "step": 615 + }, + { + "epoch": 0.029995374090034815, + "grad_norm": 2.073776960372925, + "learning_rate": 3.993517017828201e-05, + "loss": 0.8871, + "step": 616 + }, + { + "epoch": 0.030044067879142017, + "grad_norm": 2.3680577278137207, + "learning_rate": 4e-05, + "loss": 1.0633, + "step": 617 + }, + { + "epoch": 0.030092761668249216, + "grad_norm": 2.2992799282073975, + "learning_rate": 3.999999975124909e-05, + "loss": 0.9436, + "step": 618 + }, + { + "epoch": 0.030141455457356414, + "grad_norm": 1.9338417053222656, + "learning_rate": 3.9999999004996356e-05, + "loss": 1.0351, + "step": 619 + }, + { + "epoch": 0.030190149246463613, + "grad_norm": 2.0336754322052, + "learning_rate": 3.999999776124184e-05, + "loss": 1.0004, + "step": 620 + }, + { + "epoch": 0.03023884303557081, + "grad_norm": 1.5492404699325562, + "learning_rate": 3.999999601998554e-05, + "loss": 0.9624, + "step": 621 + }, + { + "epoch": 0.030287536824678014, + "grad_norm": 4.426306247711182, + "learning_rate": 3.999999378122752e-05, + "loss": 0.9634, + "step": 622 + }, + { + "epoch": 0.030336230613785212, + "grad_norm": 1.9836819171905518, + "learning_rate": 3.999999104496783e-05, + "loss": 1.0315, + "step": 623 + }, + { + "epoch": 0.03038492440289241, + "grad_norm": 1.8197855949401855, + "learning_rate": 3.999998781120654e-05, + "loss": 1.0623, + "step": 624 + }, + { + "epoch": 0.03043361819199961, + "grad_norm": 2.1574320793151855, + "learning_rate": 3.999998407994373e-05, + "loss": 1.0018, + "step": 625 + }, + { + "epoch": 0.03048231198110681, + "grad_norm": 2.7250208854675293, + "learning_rate": 3.999997985117949e-05, + "loss": 0.9591, + "step": 626 + }, + { + "epoch": 0.03053100577021401, + "grad_norm": 2.6366496086120605, + "learning_rate": 3.999997512491393e-05, + "loss": 0.9501, + "step": 627 + }, + { + "epoch": 0.03057969955932121, + "grad_norm": 2.5917725563049316, + "learning_rate": 3.999996990114716e-05, + "loss": 0.9711, + "step": 628 + }, + { + "epoch": 0.030628393348428408, + "grad_norm": 5.484663009643555, + "learning_rate": 3.999996417987933e-05, + "loss": 0.9632, + "step": 629 + }, + { + "epoch": 0.030677087137535607, + "grad_norm": 2.8008430004119873, + "learning_rate": 3.999995796111056e-05, + "loss": 0.9546, + "step": 630 + }, + { + "epoch": 0.030725780926642805, + "grad_norm": 2.403695583343506, + "learning_rate": 3.9999951244841e-05, + "loss": 1.0032, + "step": 631 + }, + { + "epoch": 0.030774474715750007, + "grad_norm": 2.3413405418395996, + "learning_rate": 3.9999944031070846e-05, + "loss": 1.0839, + "step": 632 + }, + { + "epoch": 0.030823168504857206, + "grad_norm": 2.2895522117614746, + "learning_rate": 3.999993631980025e-05, + "loss": 1.0135, + "step": 633 + }, + { + "epoch": 0.030871862293964405, + "grad_norm": 3.7820241451263428, + "learning_rate": 3.9999928111029424e-05, + "loss": 0.8999, + "step": 634 + }, + { + "epoch": 0.030920556083071603, + "grad_norm": 2.07338547706604, + "learning_rate": 3.999991940475855e-05, + "loss": 0.9786, + "step": 635 + }, + { + "epoch": 0.030969249872178802, + "grad_norm": 1.813260793685913, + "learning_rate": 3.9999910200987864e-05, + "loss": 1.0517, + "step": 636 + }, + { + "epoch": 0.031017943661286004, + "grad_norm": 1.5120881795883179, + "learning_rate": 3.9999900499717593e-05, + "loss": 0.972, + "step": 637 + }, + { + "epoch": 0.031066637450393203, + "grad_norm": 2.588925838470459, + "learning_rate": 3.999989030094797e-05, + "loss": 0.8761, + "step": 638 + }, + { + "epoch": 0.0311153312395004, + "grad_norm": 2.837277889251709, + "learning_rate": 3.999987960467925e-05, + "loss": 1.0315, + "step": 639 + }, + { + "epoch": 0.0311640250286076, + "grad_norm": 1.9785462617874146, + "learning_rate": 3.9999868410911704e-05, + "loss": 0.8944, + "step": 640 + }, + { + "epoch": 0.0312127188177148, + "grad_norm": 2.176551342010498, + "learning_rate": 3.9999856719645605e-05, + "loss": 1.0487, + "step": 641 + }, + { + "epoch": 0.031261412606822, + "grad_norm": 2.3603646755218506, + "learning_rate": 3.999984453088125e-05, + "loss": 1.0226, + "step": 642 + }, + { + "epoch": 0.031310106395929196, + "grad_norm": 2.0503005981445312, + "learning_rate": 3.999983184461894e-05, + "loss": 0.8876, + "step": 643 + }, + { + "epoch": 0.0313588001850364, + "grad_norm": 3.259406566619873, + "learning_rate": 3.999981866085899e-05, + "loss": 0.848, + "step": 644 + }, + { + "epoch": 0.0314074939741436, + "grad_norm": 2.8436594009399414, + "learning_rate": 3.999980497960172e-05, + "loss": 0.9574, + "step": 645 + }, + { + "epoch": 0.0314561877632508, + "grad_norm": 2.030836820602417, + "learning_rate": 3.9999790800847475e-05, + "loss": 0.9383, + "step": 646 + }, + { + "epoch": 0.031504881552358, + "grad_norm": 2.0337610244750977, + "learning_rate": 3.9999776124596626e-05, + "loss": 0.8912, + "step": 647 + }, + { + "epoch": 0.0315535753414652, + "grad_norm": 2.899083137512207, + "learning_rate": 3.999976095084951e-05, + "loss": 0.9518, + "step": 648 + }, + { + "epoch": 0.031602269130572395, + "grad_norm": 2.378239154815674, + "learning_rate": 3.999974527960652e-05, + "loss": 0.9186, + "step": 649 + }, + { + "epoch": 0.031650962919679594, + "grad_norm": 4.0041823387146, + "learning_rate": 3.999972911086804e-05, + "loss": 1.0315, + "step": 650 + }, + { + "epoch": 0.03169965670878679, + "grad_norm": 3.026069164276123, + "learning_rate": 3.999971244463448e-05, + "loss": 0.9897, + "step": 651 + }, + { + "epoch": 0.03174835049789399, + "grad_norm": 2.787048101425171, + "learning_rate": 3.999969528090625e-05, + "loss": 0.9622, + "step": 652 + }, + { + "epoch": 0.03179704428700119, + "grad_norm": 2.330690860748291, + "learning_rate": 3.999967761968377e-05, + "loss": 0.9504, + "step": 653 + }, + { + "epoch": 0.031845738076108396, + "grad_norm": 3.110393762588501, + "learning_rate": 3.9999659460967484e-05, + "loss": 0.9767, + "step": 654 + }, + { + "epoch": 0.031894431865215594, + "grad_norm": 1.850142002105713, + "learning_rate": 3.9999640804757857e-05, + "loss": 1.0209, + "step": 655 + }, + { + "epoch": 0.03194312565432279, + "grad_norm": 1.5889180898666382, + "learning_rate": 3.9999621651055334e-05, + "loss": 1.0438, + "step": 656 + }, + { + "epoch": 0.03199181944342999, + "grad_norm": 2.0931310653686523, + "learning_rate": 3.9999601999860397e-05, + "loss": 0.9937, + "step": 657 + }, + { + "epoch": 0.03204051323253719, + "grad_norm": 1.7083806991577148, + "learning_rate": 3.999958185117354e-05, + "loss": 0.9997, + "step": 658 + }, + { + "epoch": 0.03208920702164439, + "grad_norm": 2.358295202255249, + "learning_rate": 3.9999561204995264e-05, + "loss": 0.9506, + "step": 659 + }, + { + "epoch": 0.03213790081075159, + "grad_norm": 1.5826209783554077, + "learning_rate": 3.999954006132607e-05, + "loss": 0.9662, + "step": 660 + }, + { + "epoch": 0.03218659459985879, + "grad_norm": 2.33335280418396, + "learning_rate": 3.99995184201665e-05, + "loss": 0.986, + "step": 661 + }, + { + "epoch": 0.032235288388965985, + "grad_norm": 1.6521079540252686, + "learning_rate": 3.999949628151709e-05, + "loss": 0.9978, + "step": 662 + }, + { + "epoch": 0.032283982178073184, + "grad_norm": 2.4852280616760254, + "learning_rate": 3.999947364537838e-05, + "loss": 1.0092, + "step": 663 + }, + { + "epoch": 0.03233267596718039, + "grad_norm": 2.106867551803589, + "learning_rate": 3.999945051175094e-05, + "loss": 0.9757, + "step": 664 + }, + { + "epoch": 0.03238136975628759, + "grad_norm": 1.8355432748794556, + "learning_rate": 3.999942688063534e-05, + "loss": 0.9983, + "step": 665 + }, + { + "epoch": 0.03243006354539479, + "grad_norm": 1.8878296613693237, + "learning_rate": 3.9999402752032186e-05, + "loss": 1.0098, + "step": 666 + }, + { + "epoch": 0.032478757334501986, + "grad_norm": 2.0784735679626465, + "learning_rate": 3.999937812594206e-05, + "loss": 0.9271, + "step": 667 + }, + { + "epoch": 0.032527451123609184, + "grad_norm": 2.6394457817077637, + "learning_rate": 3.999935300236557e-05, + "loss": 0.9876, + "step": 668 + }, + { + "epoch": 0.03257614491271638, + "grad_norm": 1.816990852355957, + "learning_rate": 3.999932738130336e-05, + "loss": 0.9886, + "step": 669 + }, + { + "epoch": 0.03262483870182358, + "grad_norm": 2.030911684036255, + "learning_rate": 3.999930126275606e-05, + "loss": 1.0494, + "step": 670 + }, + { + "epoch": 0.03267353249093078, + "grad_norm": 2.2473974227905273, + "learning_rate": 3.9999274646724316e-05, + "loss": 0.9407, + "step": 671 + }, + { + "epoch": 0.03272222628003798, + "grad_norm": 1.9996330738067627, + "learning_rate": 3.999924753320879e-05, + "loss": 1.0536, + "step": 672 + }, + { + "epoch": 0.03277092006914518, + "grad_norm": 2.4639780521392822, + "learning_rate": 3.999921992221016e-05, + "loss": 0.9633, + "step": 673 + }, + { + "epoch": 0.03281961385825238, + "grad_norm": 1.8107469081878662, + "learning_rate": 3.9999191813729106e-05, + "loss": 1.0106, + "step": 674 + }, + { + "epoch": 0.03286830764735958, + "grad_norm": 3.2257697582244873, + "learning_rate": 3.999916320776634e-05, + "loss": 0.9523, + "step": 675 + }, + { + "epoch": 0.03291700143646678, + "grad_norm": 1.7901618480682373, + "learning_rate": 3.999913410432256e-05, + "loss": 1.0258, + "step": 676 + }, + { + "epoch": 0.03296569522557398, + "grad_norm": 1.5760072469711304, + "learning_rate": 3.999910450339849e-05, + "loss": 0.8775, + "step": 677 + }, + { + "epoch": 0.03301438901468118, + "grad_norm": 6.6868367195129395, + "learning_rate": 3.999907440499488e-05, + "loss": 0.8949, + "step": 678 + }, + { + "epoch": 0.03306308280378838, + "grad_norm": 1.8491085767745972, + "learning_rate": 3.9999043809112473e-05, + "loss": 0.9589, + "step": 679 + }, + { + "epoch": 0.033111776592895575, + "grad_norm": 2.3121273517608643, + "learning_rate": 3.999901271575202e-05, + "loss": 0.9116, + "step": 680 + }, + { + "epoch": 0.033160470382002774, + "grad_norm": 2.22660756111145, + "learning_rate": 3.999898112491431e-05, + "loss": 0.9846, + "step": 681 + }, + { + "epoch": 0.03320916417110997, + "grad_norm": 6.5877275466918945, + "learning_rate": 3.999894903660011e-05, + "loss": 0.8751, + "step": 682 + }, + { + "epoch": 0.03325785796021717, + "grad_norm": 1.6290162801742554, + "learning_rate": 3.9998916450810245e-05, + "loss": 0.9557, + "step": 683 + }, + { + "epoch": 0.03330655174932438, + "grad_norm": 1.9151227474212646, + "learning_rate": 3.99988833675455e-05, + "loss": 0.9895, + "step": 684 + }, + { + "epoch": 0.033355245538431576, + "grad_norm": 1.8291139602661133, + "learning_rate": 3.999884978680671e-05, + "loss": 0.9654, + "step": 685 + }, + { + "epoch": 0.033403939327538774, + "grad_norm": 1.8716026544570923, + "learning_rate": 3.9998815708594704e-05, + "loss": 1.0116, + "step": 686 + }, + { + "epoch": 0.03345263311664597, + "grad_norm": 2.4750871658325195, + "learning_rate": 3.999878113291034e-05, + "loss": 1.0137, + "step": 687 + }, + { + "epoch": 0.03350132690575317, + "grad_norm": 2.6397197246551514, + "learning_rate": 3.9998746059754474e-05, + "loss": 1.0605, + "step": 688 + }, + { + "epoch": 0.03355002069486037, + "grad_norm": 1.8601336479187012, + "learning_rate": 3.9998710489127965e-05, + "loss": 1.0292, + "step": 689 + }, + { + "epoch": 0.03359871448396757, + "grad_norm": 2.152353048324585, + "learning_rate": 3.999867442103172e-05, + "loss": 1.055, + "step": 690 + }, + { + "epoch": 0.03364740827307477, + "grad_norm": 2.22940993309021, + "learning_rate": 3.9998637855466625e-05, + "loss": 0.9585, + "step": 691 + }, + { + "epoch": 0.033696102062181967, + "grad_norm": 2.9358773231506348, + "learning_rate": 3.9998600792433576e-05, + "loss": 0.9213, + "step": 692 + }, + { + "epoch": 0.033744795851289165, + "grad_norm": 4.691249847412109, + "learning_rate": 3.9998563231933524e-05, + "loss": 1.0872, + "step": 693 + }, + { + "epoch": 0.03379348964039637, + "grad_norm": 2.140336513519287, + "learning_rate": 3.999852517396738e-05, + "loss": 0.9981, + "step": 694 + }, + { + "epoch": 0.03384218342950357, + "grad_norm": 3.000537157058716, + "learning_rate": 3.999848661853611e-05, + "loss": 0.9714, + "step": 695 + }, + { + "epoch": 0.03389087721861077, + "grad_norm": 1.6998554468154907, + "learning_rate": 3.999844756564065e-05, + "loss": 0.9545, + "step": 696 + }, + { + "epoch": 0.03393957100771797, + "grad_norm": 1.792333960533142, + "learning_rate": 3.999840801528199e-05, + "loss": 0.9929, + "step": 697 + }, + { + "epoch": 0.033988264796825166, + "grad_norm": 2.4072153568267822, + "learning_rate": 3.9998367967461106e-05, + "loss": 1.0684, + "step": 698 + }, + { + "epoch": 0.034036958585932364, + "grad_norm": 1.950757384300232, + "learning_rate": 3.999832742217899e-05, + "loss": 1.0411, + "step": 699 + }, + { + "epoch": 0.03408565237503956, + "grad_norm": 4.8072967529296875, + "learning_rate": 3.9998286379436663e-05, + "loss": 0.9454, + "step": 700 + }, + { + "epoch": 0.03413434616414676, + "grad_norm": 4.44407844543457, + "learning_rate": 3.9998244839235135e-05, + "loss": 0.9217, + "step": 701 + }, + { + "epoch": 0.03418303995325396, + "grad_norm": 2.793130397796631, + "learning_rate": 3.9998202801575445e-05, + "loss": 1.064, + "step": 702 + }, + { + "epoch": 0.03423173374236116, + "grad_norm": 3.2079267501831055, + "learning_rate": 3.999816026645864e-05, + "loss": 0.9769, + "step": 703 + }, + { + "epoch": 0.034280427531468365, + "grad_norm": 2.64158034324646, + "learning_rate": 3.9998117233885765e-05, + "loss": 0.9966, + "step": 704 + }, + { + "epoch": 0.03432912132057556, + "grad_norm": 2.4090073108673096, + "learning_rate": 3.999807370385791e-05, + "loss": 0.8645, + "step": 705 + }, + { + "epoch": 0.03437781510968276, + "grad_norm": 2.6604692935943604, + "learning_rate": 3.999802967637615e-05, + "loss": 0.8361, + "step": 706 + }, + { + "epoch": 0.03442650889878996, + "grad_norm": 2.942528247833252, + "learning_rate": 3.999798515144157e-05, + "loss": 0.8848, + "step": 707 + }, + { + "epoch": 0.03447520268789716, + "grad_norm": 7.804299354553223, + "learning_rate": 3.999794012905529e-05, + "loss": 0.9511, + "step": 708 + }, + { + "epoch": 0.03452389647700436, + "grad_norm": 2.4837582111358643, + "learning_rate": 3.999789460921842e-05, + "loss": 0.995, + "step": 709 + }, + { + "epoch": 0.03457259026611156, + "grad_norm": 2.2688095569610596, + "learning_rate": 3.9997848591932105e-05, + "loss": 0.9976, + "step": 710 + }, + { + "epoch": 0.034621284055218755, + "grad_norm": 2.977428674697876, + "learning_rate": 3.999780207719748e-05, + "loss": 1.0215, + "step": 711 + }, + { + "epoch": 0.034669977844325954, + "grad_norm": 0.09618327021598816, + "learning_rate": 3.999775506501571e-05, + "loss": 0.6095, + "step": 712 + }, + { + "epoch": 0.03471867163343315, + "grad_norm": 3.1787948608398438, + "learning_rate": 3.9997707555387954e-05, + "loss": 0.9682, + "step": 713 + }, + { + "epoch": 0.03476736542254036, + "grad_norm": 3.3310015201568604, + "learning_rate": 3.999765954831541e-05, + "loss": 0.9465, + "step": 714 + }, + { + "epoch": 0.03481605921164756, + "grad_norm": 1.8069260120391846, + "learning_rate": 3.999761104379925e-05, + "loss": 0.966, + "step": 715 + }, + { + "epoch": 0.034864753000754756, + "grad_norm": 2.30326509475708, + "learning_rate": 3.999756204184069e-05, + "loss": 0.9598, + "step": 716 + }, + { + "epoch": 0.034913446789861954, + "grad_norm": 4.542076587677002, + "learning_rate": 3.9997512542440954e-05, + "loss": 0.9878, + "step": 717 + }, + { + "epoch": 0.03496214057896915, + "grad_norm": 1.8626340627670288, + "learning_rate": 3.999746254560128e-05, + "loss": 1.0437, + "step": 718 + }, + { + "epoch": 0.03501083436807635, + "grad_norm": 2.787985324859619, + "learning_rate": 3.999741205132289e-05, + "loss": 0.9811, + "step": 719 + }, + { + "epoch": 0.03505952815718355, + "grad_norm": 1.9141076803207397, + "learning_rate": 3.9997361059607055e-05, + "loss": 1.0294, + "step": 720 + }, + { + "epoch": 0.03510822194629075, + "grad_norm": 2.0706894397735596, + "learning_rate": 3.999730957045504e-05, + "loss": 1.0128, + "step": 721 + }, + { + "epoch": 0.03515691573539795, + "grad_norm": 3.47106671333313, + "learning_rate": 3.999725758386813e-05, + "loss": 0.9189, + "step": 722 + }, + { + "epoch": 0.035205609524505146, + "grad_norm": 1.8696225881576538, + "learning_rate": 3.999720509984761e-05, + "loss": 1.0246, + "step": 723 + }, + { + "epoch": 0.03525430331361235, + "grad_norm": 1.8819262981414795, + "learning_rate": 3.999715211839479e-05, + "loss": 1.0537, + "step": 724 + }, + { + "epoch": 0.03530299710271955, + "grad_norm": 2.5511720180511475, + "learning_rate": 3.9997098639510985e-05, + "loss": 1.0149, + "step": 725 + }, + { + "epoch": 0.03535169089182675, + "grad_norm": 2.5106287002563477, + "learning_rate": 3.9997044663197535e-05, + "loss": 1.0169, + "step": 726 + }, + { + "epoch": 0.03540038468093395, + "grad_norm": 1.6424604654312134, + "learning_rate": 3.9996990189455774e-05, + "loss": 1.0844, + "step": 727 + }, + { + "epoch": 0.03544907847004115, + "grad_norm": 2.1245460510253906, + "learning_rate": 3.9996935218287064e-05, + "loss": 1.0938, + "step": 728 + }, + { + "epoch": 0.035497772259148345, + "grad_norm": 3.12711238861084, + "learning_rate": 3.9996879749692754e-05, + "loss": 1.0086, + "step": 729 + }, + { + "epoch": 0.035546466048255544, + "grad_norm": 3.014852285385132, + "learning_rate": 3.9996823783674245e-05, + "loss": 0.9627, + "step": 730 + }, + { + "epoch": 0.03559515983736274, + "grad_norm": 2.566087245941162, + "learning_rate": 3.999676732023292e-05, + "loss": 0.8396, + "step": 731 + }, + { + "epoch": 0.03564385362646994, + "grad_norm": 2.543382167816162, + "learning_rate": 3.999671035937019e-05, + "loss": 0.9727, + "step": 732 + }, + { + "epoch": 0.03569254741557714, + "grad_norm": 2.37815260887146, + "learning_rate": 3.999665290108746e-05, + "loss": 0.8696, + "step": 733 + }, + { + "epoch": 0.035741241204684346, + "grad_norm": 1.7921174764633179, + "learning_rate": 3.9996594945386164e-05, + "loss": 0.9358, + "step": 734 + }, + { + "epoch": 0.035789934993791545, + "grad_norm": 1.8493930101394653, + "learning_rate": 3.9996536492267754e-05, + "loss": 1.0176, + "step": 735 + }, + { + "epoch": 0.03583862878289874, + "grad_norm": 1.9211663007736206, + "learning_rate": 3.999647754173367e-05, + "loss": 1.0493, + "step": 736 + }, + { + "epoch": 0.03588732257200594, + "grad_norm": 1.8278419971466064, + "learning_rate": 3.999641809378538e-05, + "loss": 0.9526, + "step": 737 + }, + { + "epoch": 0.03593601636111314, + "grad_norm": 2.6988987922668457, + "learning_rate": 3.999635814842437e-05, + "loss": 0.9668, + "step": 738 + }, + { + "epoch": 0.03598471015022034, + "grad_norm": 2.6907498836517334, + "learning_rate": 3.999629770565214e-05, + "loss": 0.9759, + "step": 739 + }, + { + "epoch": 0.03603340393932754, + "grad_norm": 2.0799081325531006, + "learning_rate": 3.999623676547016e-05, + "loss": 0.9111, + "step": 740 + }, + { + "epoch": 0.03608209772843474, + "grad_norm": 2.434253692626953, + "learning_rate": 3.9996175327879976e-05, + "loss": 1.0412, + "step": 741 + }, + { + "epoch": 0.036130791517541935, + "grad_norm": 0.09167376905679703, + "learning_rate": 3.999611339288311e-05, + "loss": 0.6833, + "step": 742 + }, + { + "epoch": 0.036179485306649134, + "grad_norm": 1.8889789581298828, + "learning_rate": 3.99960509604811e-05, + "loss": 1.1, + "step": 743 + }, + { + "epoch": 0.03622817909575634, + "grad_norm": 2.2089459896087646, + "learning_rate": 3.9995988030675496e-05, + "loss": 1.0782, + "step": 744 + }, + { + "epoch": 0.03627687288486354, + "grad_norm": 1.8784133195877075, + "learning_rate": 3.999592460346788e-05, + "loss": 1.0774, + "step": 745 + }, + { + "epoch": 0.03632556667397074, + "grad_norm": 2.558651924133301, + "learning_rate": 3.99958606788598e-05, + "loss": 1.0092, + "step": 746 + }, + { + "epoch": 0.036374260463077936, + "grad_norm": 1.9977201223373413, + "learning_rate": 3.9995796256852865e-05, + "loss": 1.0606, + "step": 747 + }, + { + "epoch": 0.036422954252185134, + "grad_norm": 3.2444007396698, + "learning_rate": 3.9995731337448675e-05, + "loss": 1.0969, + "step": 748 + }, + { + "epoch": 0.03647164804129233, + "grad_norm": 1.8269562721252441, + "learning_rate": 3.999566592064885e-05, + "loss": 0.9696, + "step": 749 + }, + { + "epoch": 0.03652034183039953, + "grad_norm": 4.334853172302246, + "learning_rate": 3.999560000645501e-05, + "loss": 1.1304, + "step": 750 + }, + { + "epoch": 0.03656903561950673, + "grad_norm": 2.016094207763672, + "learning_rate": 3.9995533594868796e-05, + "loss": 1.0335, + "step": 751 + }, + { + "epoch": 0.03661772940861393, + "grad_norm": 2.7659404277801514, + "learning_rate": 3.999546668589186e-05, + "loss": 0.9501, + "step": 752 + }, + { + "epoch": 0.03666642319772113, + "grad_norm": 2.1481642723083496, + "learning_rate": 3.999539927952587e-05, + "loss": 0.8862, + "step": 753 + }, + { + "epoch": 0.03671511698682833, + "grad_norm": 2.1126461029052734, + "learning_rate": 3.99953313757725e-05, + "loss": 1.0012, + "step": 754 + }, + { + "epoch": 0.03676381077593553, + "grad_norm": 1.7901240587234497, + "learning_rate": 3.9995262974633435e-05, + "loss": 0.9778, + "step": 755 + }, + { + "epoch": 0.03681250456504273, + "grad_norm": 2.094231128692627, + "learning_rate": 3.999519407611038e-05, + "loss": 0.9932, + "step": 756 + }, + { + "epoch": 0.03686119835414993, + "grad_norm": 1.9873002767562866, + "learning_rate": 3.9995124680205056e-05, + "loss": 0.9612, + "step": 757 + }, + { + "epoch": 0.03690989214325713, + "grad_norm": 2.4629194736480713, + "learning_rate": 3.9995054786919175e-05, + "loss": 1.0104, + "step": 758 + }, + { + "epoch": 0.03695858593236433, + "grad_norm": 1.6584515571594238, + "learning_rate": 3.999498439625448e-05, + "loss": 0.9508, + "step": 759 + }, + { + "epoch": 0.037007279721471525, + "grad_norm": 2.5225820541381836, + "learning_rate": 3.9994913508212736e-05, + "loss": 1.0202, + "step": 760 + }, + { + "epoch": 0.037055973510578724, + "grad_norm": 2.7062911987304688, + "learning_rate": 3.9994842122795695e-05, + "loss": 0.9946, + "step": 761 + }, + { + "epoch": 0.03710466729968592, + "grad_norm": 2.19594144821167, + "learning_rate": 3.999477024000513e-05, + "loss": 1.0112, + "step": 762 + }, + { + "epoch": 0.03715336108879312, + "grad_norm": 2.1105244159698486, + "learning_rate": 3.999469785984283e-05, + "loss": 0.9865, + "step": 763 + }, + { + "epoch": 0.03720205487790033, + "grad_norm": 2.2300286293029785, + "learning_rate": 3.99946249823106e-05, + "loss": 0.8719, + "step": 764 + }, + { + "epoch": 0.037250748667007526, + "grad_norm": 0.08645277470350266, + "learning_rate": 3.9994551607410245e-05, + "loss": 0.6185, + "step": 765 + }, + { + "epoch": 0.037299442456114724, + "grad_norm": 4.942261695861816, + "learning_rate": 3.9994477735143597e-05, + "loss": 1.0249, + "step": 766 + }, + { + "epoch": 0.03734813624522192, + "grad_norm": 2.254749298095703, + "learning_rate": 3.9994403365512504e-05, + "loss": 1.0497, + "step": 767 + }, + { + "epoch": 0.03739683003432912, + "grad_norm": 2.3632755279541016, + "learning_rate": 3.999432849851879e-05, + "loss": 1.0475, + "step": 768 + }, + { + "epoch": 0.03744552382343632, + "grad_norm": 2.1688451766967773, + "learning_rate": 3.999425313416434e-05, + "loss": 1.0401, + "step": 769 + }, + { + "epoch": 0.03749421761254352, + "grad_norm": 2.1116342544555664, + "learning_rate": 3.9994177272451015e-05, + "loss": 0.979, + "step": 770 + }, + { + "epoch": 0.03754291140165072, + "grad_norm": 2.18056583404541, + "learning_rate": 3.999410091338071e-05, + "loss": 0.8302, + "step": 771 + }, + { + "epoch": 0.03759160519075792, + "grad_norm": 4.189380645751953, + "learning_rate": 3.999402405695532e-05, + "loss": 1.0483, + "step": 772 + }, + { + "epoch": 0.037640298979865115, + "grad_norm": 1.9835344552993774, + "learning_rate": 3.9993946703176764e-05, + "loss": 0.9176, + "step": 773 + }, + { + "epoch": 0.03768899276897232, + "grad_norm": 6.428050518035889, + "learning_rate": 3.999386885204696e-05, + "loss": 0.9923, + "step": 774 + }, + { + "epoch": 0.03773768655807952, + "grad_norm": 2.593562364578247, + "learning_rate": 3.999379050356784e-05, + "loss": 1.0041, + "step": 775 + }, + { + "epoch": 0.03778638034718672, + "grad_norm": 6.517501354217529, + "learning_rate": 3.999371165774136e-05, + "loss": 0.9424, + "step": 776 + }, + { + "epoch": 0.03783507413629392, + "grad_norm": 3.2792651653289795, + "learning_rate": 3.999363231456948e-05, + "loss": 0.9479, + "step": 777 + }, + { + "epoch": 0.037883767925401116, + "grad_norm": 4.873302459716797, + "learning_rate": 3.999355247405417e-05, + "loss": 1.0011, + "step": 778 + }, + { + "epoch": 0.037932461714508314, + "grad_norm": 2.653005838394165, + "learning_rate": 3.999347213619742e-05, + "loss": 0.9913, + "step": 779 + }, + { + "epoch": 0.03798115550361551, + "grad_norm": 2.4232587814331055, + "learning_rate": 3.999339130100123e-05, + "loss": 1.0357, + "step": 780 + }, + { + "epoch": 0.03802984929272271, + "grad_norm": 2.142749547958374, + "learning_rate": 3.9993309968467616e-05, + "loss": 1.053, + "step": 781 + }, + { + "epoch": 0.03807854308182991, + "grad_norm": 2.0103063583374023, + "learning_rate": 3.999322813859858e-05, + "loss": 0.9463, + "step": 782 + }, + { + "epoch": 0.03812723687093711, + "grad_norm": 1.8877925872802734, + "learning_rate": 3.999314581139618e-05, + "loss": 0.8836, + "step": 783 + }, + { + "epoch": 0.038175930660044315, + "grad_norm": 1.8503371477127075, + "learning_rate": 3.999306298686245e-05, + "loss": 1.0234, + "step": 784 + }, + { + "epoch": 0.03822462444915151, + "grad_norm": 4.029489994049072, + "learning_rate": 3.999297966499946e-05, + "loss": 0.9753, + "step": 785 + }, + { + "epoch": 0.03827331823825871, + "grad_norm": 1.7001214027404785, + "learning_rate": 3.999289584580928e-05, + "loss": 0.9253, + "step": 786 + }, + { + "epoch": 0.03832201202736591, + "grad_norm": 2.4941039085388184, + "learning_rate": 3.999281152929398e-05, + "loss": 1.0518, + "step": 787 + }, + { + "epoch": 0.03837070581647311, + "grad_norm": 1.8200424909591675, + "learning_rate": 3.999272671545568e-05, + "loss": 1.022, + "step": 788 + }, + { + "epoch": 0.03841939960558031, + "grad_norm": 2.770552635192871, + "learning_rate": 3.999264140429648e-05, + "loss": 0.9527, + "step": 789 + }, + { + "epoch": 0.03846809339468751, + "grad_norm": 1.8825627565383911, + "learning_rate": 3.9992555595818504e-05, + "loss": 0.9152, + "step": 790 + }, + { + "epoch": 0.038516787183794705, + "grad_norm": 2.1728334426879883, + "learning_rate": 3.999246929002389e-05, + "loss": 0.9176, + "step": 791 + }, + { + "epoch": 0.038565480972901904, + "grad_norm": 1.8064653873443604, + "learning_rate": 3.9992382486914766e-05, + "loss": 0.979, + "step": 792 + }, + { + "epoch": 0.0386141747620091, + "grad_norm": 2.052483558654785, + "learning_rate": 3.9992295186493304e-05, + "loss": 0.8875, + "step": 793 + }, + { + "epoch": 0.03866286855111631, + "grad_norm": 2.0836257934570312, + "learning_rate": 3.999220738876169e-05, + "loss": 1.087, + "step": 794 + }, + { + "epoch": 0.03871156234022351, + "grad_norm": 3.4332990646362305, + "learning_rate": 3.9992119093722086e-05, + "loss": 0.9647, + "step": 795 + }, + { + "epoch": 0.038760256129330706, + "grad_norm": 2.7646801471710205, + "learning_rate": 3.99920303013767e-05, + "loss": 1.0366, + "step": 796 + }, + { + "epoch": 0.038808949918437904, + "grad_norm": 1.9408352375030518, + "learning_rate": 3.999194101172773e-05, + "loss": 0.9372, + "step": 797 + }, + { + "epoch": 0.0388576437075451, + "grad_norm": 2.6122021675109863, + "learning_rate": 3.9991851224777416e-05, + "loss": 1.0148, + "step": 798 + }, + { + "epoch": 0.0389063374966523, + "grad_norm": 2.2062854766845703, + "learning_rate": 3.999176094052797e-05, + "loss": 0.9118, + "step": 799 + }, + { + "epoch": 0.0389550312857595, + "grad_norm": 1.762821912765503, + "learning_rate": 3.999167015898165e-05, + "loss": 0.9962, + "step": 800 + }, + { + "epoch": 0.0390037250748667, + "grad_norm": 1.7290245294570923, + "learning_rate": 3.999157888014072e-05, + "loss": 0.9195, + "step": 801 + }, + { + "epoch": 0.0390524188639739, + "grad_norm": 2.2654662132263184, + "learning_rate": 3.9991487104007436e-05, + "loss": 0.9421, + "step": 802 + }, + { + "epoch": 0.039101112653081097, + "grad_norm": 2.0389068126678467, + "learning_rate": 3.9991394830584084e-05, + "loss": 1.0201, + "step": 803 + }, + { + "epoch": 0.0391498064421883, + "grad_norm": 1.9215362071990967, + "learning_rate": 3.999130205987297e-05, + "loss": 1.034, + "step": 804 + }, + { + "epoch": 0.0391985002312955, + "grad_norm": 1.4395695924758911, + "learning_rate": 3.999120879187639e-05, + "loss": 0.9906, + "step": 805 + }, + { + "epoch": 0.0392471940204027, + "grad_norm": 3.3704235553741455, + "learning_rate": 3.999111502659667e-05, + "loss": 0.955, + "step": 806 + }, + { + "epoch": 0.0392958878095099, + "grad_norm": 2.328274726867676, + "learning_rate": 3.9991020764036145e-05, + "loss": 0.9304, + "step": 807 + }, + { + "epoch": 0.0393445815986171, + "grad_norm": 2.668591022491455, + "learning_rate": 3.9990926004197145e-05, + "loss": 0.9268, + "step": 808 + }, + { + "epoch": 0.039393275387724296, + "grad_norm": 3.0677740573883057, + "learning_rate": 3.999083074708205e-05, + "loss": 0.9534, + "step": 809 + }, + { + "epoch": 0.039441969176831494, + "grad_norm": 1.8926215171813965, + "learning_rate": 3.999073499269321e-05, + "loss": 0.9212, + "step": 810 + }, + { + "epoch": 0.03949066296593869, + "grad_norm": 2.4400269985198975, + "learning_rate": 3.9990638741033015e-05, + "loss": 1.0066, + "step": 811 + }, + { + "epoch": 0.03953935675504589, + "grad_norm": 2.1602413654327393, + "learning_rate": 3.999054199210386e-05, + "loss": 1.0175, + "step": 812 + }, + { + "epoch": 0.03958805054415309, + "grad_norm": 2.2643842697143555, + "learning_rate": 3.999044474590815e-05, + "loss": 0.9831, + "step": 813 + }, + { + "epoch": 0.039636744333260296, + "grad_norm": 1.529510736465454, + "learning_rate": 3.99903470024483e-05, + "loss": 0.9607, + "step": 814 + }, + { + "epoch": 0.039685438122367495, + "grad_norm": 5.547833442687988, + "learning_rate": 3.9990248761726755e-05, + "loss": 0.9235, + "step": 815 + }, + { + "epoch": 0.03973413191147469, + "grad_norm": 2.502899169921875, + "learning_rate": 3.9990150023745944e-05, + "loss": 0.9407, + "step": 816 + }, + { + "epoch": 0.03978282570058189, + "grad_norm": 0.10272713005542755, + "learning_rate": 3.999005078850833e-05, + "loss": 0.6539, + "step": 817 + }, + { + "epoch": 0.03983151948968909, + "grad_norm": 1.899046540260315, + "learning_rate": 3.998995105601638e-05, + "loss": 0.9758, + "step": 818 + }, + { + "epoch": 0.03988021327879629, + "grad_norm": 2.8249711990356445, + "learning_rate": 3.9989850826272574e-05, + "loss": 0.9877, + "step": 819 + }, + { + "epoch": 0.03992890706790349, + "grad_norm": 4.631659507751465, + "learning_rate": 3.998975009927941e-05, + "loss": 0.9514, + "step": 820 + }, + { + "epoch": 0.03997760085701069, + "grad_norm": 3.1162097454071045, + "learning_rate": 3.998964887503938e-05, + "loss": 0.9403, + "step": 821 + }, + { + "epoch": 0.040026294646117885, + "grad_norm": 1.8213539123535156, + "learning_rate": 3.9989547153555014e-05, + "loss": 0.9681, + "step": 822 + }, + { + "epoch": 0.040074988435225084, + "grad_norm": 1.962171196937561, + "learning_rate": 3.998944493482885e-05, + "loss": 0.9681, + "step": 823 + }, + { + "epoch": 0.04012368222433229, + "grad_norm": 2.444854497909546, + "learning_rate": 3.9989342218863415e-05, + "loss": 0.9914, + "step": 824 + }, + { + "epoch": 0.04017237601343949, + "grad_norm": 1.984133243560791, + "learning_rate": 3.998923900566127e-05, + "loss": 1.0625, + "step": 825 + }, + { + "epoch": 0.04022106980254669, + "grad_norm": 2.4743292331695557, + "learning_rate": 3.998913529522498e-05, + "loss": 0.9602, + "step": 826 + }, + { + "epoch": 0.040269763591653886, + "grad_norm": 3.0082356929779053, + "learning_rate": 3.998903108755713e-05, + "loss": 0.9209, + "step": 827 + }, + { + "epoch": 0.040318457380761084, + "grad_norm": 2.7015397548675537, + "learning_rate": 3.9988926382660304e-05, + "loss": 0.9553, + "step": 828 + }, + { + "epoch": 0.04036715116986828, + "grad_norm": 2.3959407806396484, + "learning_rate": 3.9988821180537114e-05, + "loss": 0.9496, + "step": 829 + }, + { + "epoch": 0.04041584495897548, + "grad_norm": 2.422945976257324, + "learning_rate": 3.9988715481190174e-05, + "loss": 1.0051, + "step": 830 + }, + { + "epoch": 0.04046453874808268, + "grad_norm": 3.4876081943511963, + "learning_rate": 3.998860928462212e-05, + "loss": 0.8498, + "step": 831 + }, + { + "epoch": 0.04051323253718988, + "grad_norm": 2.1478726863861084, + "learning_rate": 3.9988502590835585e-05, + "loss": 0.9589, + "step": 832 + }, + { + "epoch": 0.04056192632629708, + "grad_norm": 3.4130032062530518, + "learning_rate": 3.998839539983322e-05, + "loss": 0.9738, + "step": 833 + }, + { + "epoch": 0.04061062011540428, + "grad_norm": 4.848374843597412, + "learning_rate": 3.99882877116177e-05, + "loss": 1.0279, + "step": 834 + }, + { + "epoch": 0.04065931390451148, + "grad_norm": 2.2555534839630127, + "learning_rate": 3.998817952619171e-05, + "loss": 0.9768, + "step": 835 + }, + { + "epoch": 0.04070800769361868, + "grad_norm": 2.154771089553833, + "learning_rate": 3.998807084355791e-05, + "loss": 0.972, + "step": 836 + }, + { + "epoch": 0.04075670148272588, + "grad_norm": 3.4666831493377686, + "learning_rate": 3.998796166371904e-05, + "loss": 0.9878, + "step": 837 + }, + { + "epoch": 0.04080539527183308, + "grad_norm": 2.428501605987549, + "learning_rate": 3.9987851986677804e-05, + "loss": 1.0314, + "step": 838 + }, + { + "epoch": 0.04085408906094028, + "grad_norm": 2.162482500076294, + "learning_rate": 3.9987741812436926e-05, + "loss": 1.0227, + "step": 839 + }, + { + "epoch": 0.040902782850047475, + "grad_norm": 4.111578464508057, + "learning_rate": 3.998763114099914e-05, + "loss": 0.9598, + "step": 840 + }, + { + "epoch": 0.040951476639154674, + "grad_norm": 3.900606632232666, + "learning_rate": 3.9987519972367215e-05, + "loss": 0.9879, + "step": 841 + }, + { + "epoch": 0.04100017042826187, + "grad_norm": 3.1210923194885254, + "learning_rate": 3.9987408306543905e-05, + "loss": 0.9537, + "step": 842 + }, + { + "epoch": 0.04104886421736907, + "grad_norm": 2.486419677734375, + "learning_rate": 3.9987296143531994e-05, + "loss": 0.8922, + "step": 843 + }, + { + "epoch": 0.04109755800647628, + "grad_norm": 1.7388612031936646, + "learning_rate": 3.998718348333427e-05, + "loss": 1.0298, + "step": 844 + }, + { + "epoch": 0.041146251795583476, + "grad_norm": 2.198850393295288, + "learning_rate": 3.9987070325953525e-05, + "loss": 0.9549, + "step": 845 + }, + { + "epoch": 0.041194945584690675, + "grad_norm": 2.652733325958252, + "learning_rate": 3.998695667139259e-05, + "loss": 0.8821, + "step": 846 + }, + { + "epoch": 0.04124363937379787, + "grad_norm": 0.1024961918592453, + "learning_rate": 3.998684251965428e-05, + "loss": 0.6788, + "step": 847 + }, + { + "epoch": 0.04129233316290507, + "grad_norm": 2.9409339427948, + "learning_rate": 3.998672787074145e-05, + "loss": 0.9506, + "step": 848 + }, + { + "epoch": 0.04134102695201227, + "grad_norm": 1.9731327295303345, + "learning_rate": 3.9986612724656934e-05, + "loss": 1.0044, + "step": 849 + }, + { + "epoch": 0.04138972074111947, + "grad_norm": 2.4185009002685547, + "learning_rate": 3.9986497081403606e-05, + "loss": 0.9905, + "step": 850 + }, + { + "epoch": 0.04143841453022667, + "grad_norm": 2.4818522930145264, + "learning_rate": 3.998638094098434e-05, + "loss": 0.8991, + "step": 851 + }, + { + "epoch": 0.04148710831933387, + "grad_norm": 2.5989389419555664, + "learning_rate": 3.998626430340203e-05, + "loss": 0.9823, + "step": 852 + }, + { + "epoch": 0.041535802108441065, + "grad_norm": 2.9242520332336426, + "learning_rate": 3.998614716865957e-05, + "loss": 1.0215, + "step": 853 + }, + { + "epoch": 0.04158449589754827, + "grad_norm": 2.5811288356781006, + "learning_rate": 3.9986029536759885e-05, + "loss": 0.9428, + "step": 854 + }, + { + "epoch": 0.04163318968665547, + "grad_norm": 2.1001250743865967, + "learning_rate": 3.9985911407705875e-05, + "loss": 0.9535, + "step": 855 + }, + { + "epoch": 0.04168188347576267, + "grad_norm": 2.4282584190368652, + "learning_rate": 3.998579278150051e-05, + "loss": 0.9135, + "step": 856 + }, + { + "epoch": 0.04173057726486987, + "grad_norm": 2.469447612762451, + "learning_rate": 3.9985673658146726e-05, + "loss": 0.9272, + "step": 857 + }, + { + "epoch": 0.041779271053977066, + "grad_norm": 2.456944465637207, + "learning_rate": 3.998555403764749e-05, + "loss": 0.9006, + "step": 858 + }, + { + "epoch": 0.041827964843084264, + "grad_norm": 2.0872433185577393, + "learning_rate": 3.998543392000577e-05, + "loss": 0.9149, + "step": 859 + }, + { + "epoch": 0.04187665863219146, + "grad_norm": 3.4838128089904785, + "learning_rate": 3.9985313305224555e-05, + "loss": 0.9181, + "step": 860 + }, + { + "epoch": 0.04192535242129866, + "grad_norm": 2.3444252014160156, + "learning_rate": 3.998519219330685e-05, + "loss": 1.0168, + "step": 861 + }, + { + "epoch": 0.04197404621040586, + "grad_norm": 2.0540075302124023, + "learning_rate": 3.9985070584255675e-05, + "loss": 0.9667, + "step": 862 + }, + { + "epoch": 0.04202273999951306, + "grad_norm": 2.2010884284973145, + "learning_rate": 3.9984948478074044e-05, + "loss": 0.8895, + "step": 863 + }, + { + "epoch": 0.042071433788620265, + "grad_norm": 2.1200852394104004, + "learning_rate": 3.9984825874765e-05, + "loss": 0.9262, + "step": 864 + }, + { + "epoch": 0.04212012757772746, + "grad_norm": 2.032454013824463, + "learning_rate": 3.998470277433159e-05, + "loss": 1.0664, + "step": 865 + }, + { + "epoch": 0.04216882136683466, + "grad_norm": 2.5952184200286865, + "learning_rate": 3.998457917677687e-05, + "loss": 1.0301, + "step": 866 + }, + { + "epoch": 0.04221751515594186, + "grad_norm": 2.7539303302764893, + "learning_rate": 3.998445508210393e-05, + "loss": 0.9245, + "step": 867 + }, + { + "epoch": 0.04226620894504906, + "grad_norm": 2.461909532546997, + "learning_rate": 3.998433049031584e-05, + "loss": 0.9367, + "step": 868 + }, + { + "epoch": 0.04231490273415626, + "grad_norm": 2.491039276123047, + "learning_rate": 3.9984205401415706e-05, + "loss": 0.9914, + "step": 869 + }, + { + "epoch": 0.04236359652326346, + "grad_norm": 2.264907121658325, + "learning_rate": 3.9984079815406646e-05, + "loss": 0.9851, + "step": 870 + }, + { + "epoch": 0.042412290312370655, + "grad_norm": 2.7687060832977295, + "learning_rate": 3.998395373229177e-05, + "loss": 0.9306, + "step": 871 + }, + { + "epoch": 0.042460984101477854, + "grad_norm": 2.044282913208008, + "learning_rate": 3.998382715207423e-05, + "loss": 1.0652, + "step": 872 + }, + { + "epoch": 0.04250967789058505, + "grad_norm": 1.7178844213485718, + "learning_rate": 3.998370007475717e-05, + "loss": 0.9631, + "step": 873 + }, + { + "epoch": 0.04255837167969226, + "grad_norm": 2.328803777694702, + "learning_rate": 3.998357250034374e-05, + "loss": 0.9843, + "step": 874 + }, + { + "epoch": 0.04260706546879946, + "grad_norm": 1.6560077667236328, + "learning_rate": 3.998344442883713e-05, + "loss": 0.8762, + "step": 875 + }, + { + "epoch": 0.042655759257906656, + "grad_norm": 1.9222662448883057, + "learning_rate": 3.998331586024051e-05, + "loss": 1.0187, + "step": 876 + }, + { + "epoch": 0.042704453047013854, + "grad_norm": 2.0421366691589355, + "learning_rate": 3.9983186794557097e-05, + "loss": 1.0542, + "step": 877 + }, + { + "epoch": 0.04275314683612105, + "grad_norm": 3.9065544605255127, + "learning_rate": 3.998305723179008e-05, + "loss": 0.9619, + "step": 878 + }, + { + "epoch": 0.04280184062522825, + "grad_norm": 4.657001495361328, + "learning_rate": 3.99829271719427e-05, + "loss": 0.9996, + "step": 879 + }, + { + "epoch": 0.04285053441433545, + "grad_norm": 2.4569926261901855, + "learning_rate": 3.9982796615018184e-05, + "loss": 0.9551, + "step": 880 + }, + { + "epoch": 0.04289922820344265, + "grad_norm": 2.34306001663208, + "learning_rate": 3.9982665561019774e-05, + "loss": 1.0199, + "step": 881 + }, + { + "epoch": 0.04294792199254985, + "grad_norm": 2.9945569038391113, + "learning_rate": 3.998253400995074e-05, + "loss": 0.8879, + "step": 882 + }, + { + "epoch": 0.04299661578165705, + "grad_norm": 2.4207112789154053, + "learning_rate": 3.998240196181435e-05, + "loss": 1.054, + "step": 883 + }, + { + "epoch": 0.04304530957076425, + "grad_norm": 1.9702421426773071, + "learning_rate": 3.998226941661389e-05, + "loss": 0.9891, + "step": 884 + }, + { + "epoch": 0.04309400335987145, + "grad_norm": 6.526185512542725, + "learning_rate": 3.998213637435266e-05, + "loss": 0.9503, + "step": 885 + }, + { + "epoch": 0.04314269714897865, + "grad_norm": 2.427323818206787, + "learning_rate": 3.9982002835033956e-05, + "loss": 0.9172, + "step": 886 + }, + { + "epoch": 0.04319139093808585, + "grad_norm": 2.5160024166107178, + "learning_rate": 3.9981868798661115e-05, + "loss": 0.9747, + "step": 887 + }, + { + "epoch": 0.04324008472719305, + "grad_norm": 1.7901735305786133, + "learning_rate": 3.998173426523747e-05, + "loss": 0.9663, + "step": 888 + }, + { + "epoch": 0.043288778516300246, + "grad_norm": 2.2615668773651123, + "learning_rate": 3.9981599234766356e-05, + "loss": 1.0472, + "step": 889 + }, + { + "epoch": 0.043337472305407444, + "grad_norm": 1.824877142906189, + "learning_rate": 3.998146370725114e-05, + "loss": 0.964, + "step": 890 + }, + { + "epoch": 0.04338616609451464, + "grad_norm": 2.2559025287628174, + "learning_rate": 3.9981327682695196e-05, + "loss": 0.965, + "step": 891 + }, + { + "epoch": 0.04343485988362184, + "grad_norm": 1.6734575033187866, + "learning_rate": 3.9981191161101904e-05, + "loss": 1.099, + "step": 892 + }, + { + "epoch": 0.04348355367272904, + "grad_norm": 1.69194495677948, + "learning_rate": 3.9981054142474665e-05, + "loss": 1.0583, + "step": 893 + }, + { + "epoch": 0.043532247461836246, + "grad_norm": 2.4512100219726562, + "learning_rate": 3.9980916626816876e-05, + "loss": 0.9136, + "step": 894 + }, + { + "epoch": 0.043580941250943445, + "grad_norm": 1.5096309185028076, + "learning_rate": 3.998077861413196e-05, + "loss": 1.0136, + "step": 895 + }, + { + "epoch": 0.04362963504005064, + "grad_norm": 1.7432252168655396, + "learning_rate": 3.998064010442336e-05, + "loss": 0.9858, + "step": 896 + }, + { + "epoch": 0.04367832882915784, + "grad_norm": 1.8704626560211182, + "learning_rate": 3.9980501097694514e-05, + "loss": 1.0965, + "step": 897 + }, + { + "epoch": 0.04372702261826504, + "grad_norm": 2.253603935241699, + "learning_rate": 3.998036159394889e-05, + "loss": 0.9606, + "step": 898 + }, + { + "epoch": 0.04377571640737224, + "grad_norm": 2.241831064224243, + "learning_rate": 3.998022159318994e-05, + "loss": 0.9278, + "step": 899 + }, + { + "epoch": 0.04382441019647944, + "grad_norm": 3.1804559230804443, + "learning_rate": 3.998008109542115e-05, + "loss": 0.9303, + "step": 900 + }, + { + "epoch": 0.04387310398558664, + "grad_norm": 3.040424108505249, + "learning_rate": 3.997994010064603e-05, + "loss": 0.9563, + "step": 901 + }, + { + "epoch": 0.043921797774693835, + "grad_norm": 1.975946068763733, + "learning_rate": 3.997979860886808e-05, + "loss": 0.9134, + "step": 902 + }, + { + "epoch": 0.043970491563801034, + "grad_norm": 2.3027617931365967, + "learning_rate": 3.997965662009082e-05, + "loss": 0.9589, + "step": 903 + }, + { + "epoch": 0.04401918535290824, + "grad_norm": 1.6265320777893066, + "learning_rate": 3.9979514134317766e-05, + "loss": 1.0579, + "step": 904 + }, + { + "epoch": 0.04406787914201544, + "grad_norm": 1.634824514389038, + "learning_rate": 3.997937115155249e-05, + "loss": 1.0662, + "step": 905 + }, + { + "epoch": 0.04411657293112264, + "grad_norm": 2.0072667598724365, + "learning_rate": 3.9979227671798526e-05, + "loss": 0.9964, + "step": 906 + }, + { + "epoch": 0.044165266720229836, + "grad_norm": 4.866993427276611, + "learning_rate": 3.997908369505946e-05, + "loss": 1.0228, + "step": 907 + }, + { + "epoch": 0.044213960509337034, + "grad_norm": 2.0668439865112305, + "learning_rate": 3.997893922133886e-05, + "loss": 0.9774, + "step": 908 + }, + { + "epoch": 0.04426265429844423, + "grad_norm": 1.942483901977539, + "learning_rate": 3.997879425064032e-05, + "loss": 0.9062, + "step": 909 + }, + { + "epoch": 0.04431134808755143, + "grad_norm": 2.206077814102173, + "learning_rate": 3.997864878296747e-05, + "loss": 1.0169, + "step": 910 + }, + { + "epoch": 0.04436004187665863, + "grad_norm": 1.7478222846984863, + "learning_rate": 3.997850281832389e-05, + "loss": 1.0111, + "step": 911 + }, + { + "epoch": 0.04440873566576583, + "grad_norm": 0.09630844742059708, + "learning_rate": 3.9978356356713245e-05, + "loss": 0.686, + "step": 912 + }, + { + "epoch": 0.04445742945487303, + "grad_norm": 0.10166051983833313, + "learning_rate": 3.997820939813916e-05, + "loss": 0.6772, + "step": 913 + }, + { + "epoch": 0.04450612324398023, + "grad_norm": 1.8913036584854126, + "learning_rate": 3.997806194260528e-05, + "loss": 0.9804, + "step": 914 + }, + { + "epoch": 0.04455481703308743, + "grad_norm": 2.6712756156921387, + "learning_rate": 3.997791399011531e-05, + "loss": 0.955, + "step": 915 + }, + { + "epoch": 0.04460351082219463, + "grad_norm": 2.302018880844116, + "learning_rate": 3.99777655406729e-05, + "loss": 0.9067, + "step": 916 + }, + { + "epoch": 0.04465220461130183, + "grad_norm": 1.715843915939331, + "learning_rate": 3.997761659428174e-05, + "loss": 0.9515, + "step": 917 + }, + { + "epoch": 0.04470089840040903, + "grad_norm": 1.8908183574676514, + "learning_rate": 3.997746715094556e-05, + "loss": 1.0002, + "step": 918 + }, + { + "epoch": 0.04474959218951623, + "grad_norm": 1.8703879117965698, + "learning_rate": 3.997731721066806e-05, + "loss": 1.0574, + "step": 919 + }, + { + "epoch": 0.044798285978623426, + "grad_norm": 1.7840477228164673, + "learning_rate": 3.997716677345297e-05, + "loss": 1.0592, + "step": 920 + }, + { + "epoch": 0.044846979767730624, + "grad_norm": 2.1077258586883545, + "learning_rate": 3.997701583930403e-05, + "loss": 0.8925, + "step": 921 + }, + { + "epoch": 0.04489567355683782, + "grad_norm": 2.262209177017212, + "learning_rate": 3.9976864408225017e-05, + "loss": 0.8427, + "step": 922 + }, + { + "epoch": 0.04494436734594502, + "grad_norm": 1.540063500404358, + "learning_rate": 3.9976712480219664e-05, + "loss": 0.9846, + "step": 923 + }, + { + "epoch": 0.04499306113505223, + "grad_norm": 2.0189359188079834, + "learning_rate": 3.9976560055291766e-05, + "loss": 1.0137, + "step": 924 + }, + { + "epoch": 0.045041754924159426, + "grad_norm": 2.04514741897583, + "learning_rate": 3.9976407133445126e-05, + "loss": 0.9859, + "step": 925 + }, + { + "epoch": 0.045090448713266625, + "grad_norm": 1.945496678352356, + "learning_rate": 3.997625371468353e-05, + "loss": 0.9877, + "step": 926 + }, + { + "epoch": 0.04513914250237382, + "grad_norm": 2.3777377605438232, + "learning_rate": 3.99760997990108e-05, + "loss": 1.0088, + "step": 927 + }, + { + "epoch": 0.04518783629148102, + "grad_norm": 1.535273551940918, + "learning_rate": 3.997594538643078e-05, + "loss": 0.9975, + "step": 928 + }, + { + "epoch": 0.04523653008058822, + "grad_norm": 2.1278417110443115, + "learning_rate": 3.997579047694728e-05, + "loss": 0.9037, + "step": 929 + }, + { + "epoch": 0.04528522386969542, + "grad_norm": 2.401639223098755, + "learning_rate": 3.997563507056418e-05, + "loss": 0.9432, + "step": 930 + }, + { + "epoch": 0.04533391765880262, + "grad_norm": 2.6899142265319824, + "learning_rate": 3.9975479167285334e-05, + "loss": 0.9561, + "step": 931 + }, + { + "epoch": 0.04538261144790982, + "grad_norm": 1.7908822298049927, + "learning_rate": 3.997532276711462e-05, + "loss": 0.8996, + "step": 932 + }, + { + "epoch": 0.045431305237017015, + "grad_norm": 2.193138599395752, + "learning_rate": 3.997516587005593e-05, + "loss": 0.9638, + "step": 933 + }, + { + "epoch": 0.04547999902612422, + "grad_norm": 2.2890141010284424, + "learning_rate": 3.997500847611317e-05, + "loss": 0.9955, + "step": 934 + }, + { + "epoch": 0.04552869281523142, + "grad_norm": 2.2443575859069824, + "learning_rate": 3.997485058529026e-05, + "loss": 0.9999, + "step": 935 + }, + { + "epoch": 0.04557738660433862, + "grad_norm": 1.6461360454559326, + "learning_rate": 3.9974692197591116e-05, + "loss": 0.9423, + "step": 936 + }, + { + "epoch": 0.04562608039344582, + "grad_norm": 2.1834778785705566, + "learning_rate": 3.9974533313019676e-05, + "loss": 0.8658, + "step": 937 + }, + { + "epoch": 0.045674774182553016, + "grad_norm": 3.3046953678131104, + "learning_rate": 3.99743739315799e-05, + "loss": 1.0239, + "step": 938 + }, + { + "epoch": 0.045723467971660214, + "grad_norm": 2.2729976177215576, + "learning_rate": 3.9974214053275756e-05, + "loss": 0.9419, + "step": 939 + }, + { + "epoch": 0.04577216176076741, + "grad_norm": 2.8468871116638184, + "learning_rate": 3.997405367811122e-05, + "loss": 1.0096, + "step": 940 + }, + { + "epoch": 0.04582085554987461, + "grad_norm": 5.151241302490234, + "learning_rate": 3.9973892806090266e-05, + "loss": 0.8742, + "step": 941 + }, + { + "epoch": 0.04586954933898181, + "grad_norm": 1.684903621673584, + "learning_rate": 3.997373143721691e-05, + "loss": 1.0165, + "step": 942 + }, + { + "epoch": 0.04591824312808901, + "grad_norm": 2.3800103664398193, + "learning_rate": 3.997356957149517e-05, + "loss": 0.8584, + "step": 943 + }, + { + "epoch": 0.045966936917196215, + "grad_norm": 2.0992918014526367, + "learning_rate": 3.9973407208929064e-05, + "loss": 0.9512, + "step": 944 + }, + { + "epoch": 0.04601563070630341, + "grad_norm": 2.589118242263794, + "learning_rate": 3.997324434952263e-05, + "loss": 0.924, + "step": 945 + }, + { + "epoch": 0.04606432449541061, + "grad_norm": 2.8197453022003174, + "learning_rate": 3.9973080993279925e-05, + "loss": 0.96, + "step": 946 + }, + { + "epoch": 0.04611301828451781, + "grad_norm": 2.8476243019104004, + "learning_rate": 3.9972917140205005e-05, + "loss": 0.9572, + "step": 947 + }, + { + "epoch": 0.04616171207362501, + "grad_norm": 5.673343181610107, + "learning_rate": 3.997275279030195e-05, + "loss": 1.0241, + "step": 948 + }, + { + "epoch": 0.04621040586273221, + "grad_norm": 2.5605273246765137, + "learning_rate": 3.997258794357486e-05, + "loss": 1.0153, + "step": 949 + }, + { + "epoch": 0.04625909965183941, + "grad_norm": 2.6237993240356445, + "learning_rate": 3.997242260002781e-05, + "loss": 0.8588, + "step": 950 + }, + { + "epoch": 0.046307793440946605, + "grad_norm": 2.57322096824646, + "learning_rate": 3.997225675966493e-05, + "loss": 0.9643, + "step": 951 + }, + { + "epoch": 0.046356487230053804, + "grad_norm": 1.8304353952407837, + "learning_rate": 3.9972090422490346e-05, + "loss": 1.0651, + "step": 952 + }, + { + "epoch": 0.046405181019161, + "grad_norm": 2.242814540863037, + "learning_rate": 3.997192358850819e-05, + "loss": 0.9979, + "step": 953 + }, + { + "epoch": 0.04645387480826821, + "grad_norm": 2.1902987957000732, + "learning_rate": 3.997175625772261e-05, + "loss": 1.066, + "step": 954 + }, + { + "epoch": 0.04650256859737541, + "grad_norm": 1.4579628705978394, + "learning_rate": 3.997158843013777e-05, + "loss": 0.9425, + "step": 955 + }, + { + "epoch": 0.046551262386482606, + "grad_norm": 1.9696391820907593, + "learning_rate": 3.997142010575785e-05, + "loss": 0.906, + "step": 956 + }, + { + "epoch": 0.046599956175589805, + "grad_norm": 2.168837785720825, + "learning_rate": 3.997125128458704e-05, + "loss": 0.9382, + "step": 957 + }, + { + "epoch": 0.046648649964697, + "grad_norm": 2.0061326026916504, + "learning_rate": 3.997108196662953e-05, + "loss": 0.9477, + "step": 958 + }, + { + "epoch": 0.0466973437538042, + "grad_norm": 1.9584897756576538, + "learning_rate": 3.9970912151889535e-05, + "loss": 1.0559, + "step": 959 + }, + { + "epoch": 0.0467460375429114, + "grad_norm": 1.6554275751113892, + "learning_rate": 3.9970741840371276e-05, + "loss": 0.936, + "step": 960 + }, + { + "epoch": 0.0467947313320186, + "grad_norm": 2.775860071182251, + "learning_rate": 3.9970571032078995e-05, + "loss": 0.9179, + "step": 961 + }, + { + "epoch": 0.0468434251211258, + "grad_norm": 2.3871560096740723, + "learning_rate": 3.997039972701694e-05, + "loss": 0.9601, + "step": 962 + }, + { + "epoch": 0.046892118910233, + "grad_norm": 2.670888900756836, + "learning_rate": 3.9970227925189375e-05, + "loss": 0.8971, + "step": 963 + }, + { + "epoch": 0.0469408126993402, + "grad_norm": 1.955356478691101, + "learning_rate": 3.997005562660057e-05, + "loss": 1.059, + "step": 964 + }, + { + "epoch": 0.0469895064884474, + "grad_norm": 3.8139243125915527, + "learning_rate": 3.99698828312548e-05, + "loss": 1.0458, + "step": 965 + }, + { + "epoch": 0.0470382002775546, + "grad_norm": 2.295198917388916, + "learning_rate": 3.9969709539156375e-05, + "loss": 1.0372, + "step": 966 + }, + { + "epoch": 0.0470868940666618, + "grad_norm": 2.9068050384521484, + "learning_rate": 3.996953575030961e-05, + "loss": 0.9122, + "step": 967 + }, + { + "epoch": 0.047135587855769, + "grad_norm": 2.9795284271240234, + "learning_rate": 3.9969361464718816e-05, + "loss": 0.966, + "step": 968 + }, + { + "epoch": 0.047184281644876196, + "grad_norm": 1.6801156997680664, + "learning_rate": 3.996918668238834e-05, + "loss": 0.8779, + "step": 969 + }, + { + "epoch": 0.047232975433983394, + "grad_norm": 1.8691587448120117, + "learning_rate": 3.996901140332253e-05, + "loss": 0.8981, + "step": 970 + }, + { + "epoch": 0.04728166922309059, + "grad_norm": 3.157672882080078, + "learning_rate": 3.996883562752573e-05, + "loss": 0.9538, + "step": 971 + }, + { + "epoch": 0.04733036301219779, + "grad_norm": 2.271566390991211, + "learning_rate": 3.9968659355002333e-05, + "loss": 0.9566, + "step": 972 + }, + { + "epoch": 0.04737905680130499, + "grad_norm": 1.9127609729766846, + "learning_rate": 3.996848258575671e-05, + "loss": 1.0912, + "step": 973 + }, + { + "epoch": 0.047427750590412196, + "grad_norm": 1.9243837594985962, + "learning_rate": 3.996830531979326e-05, + "loss": 0.9617, + "step": 974 + }, + { + "epoch": 0.047476444379519395, + "grad_norm": 2.7351038455963135, + "learning_rate": 3.9968127557116393e-05, + "loss": 0.9851, + "step": 975 + }, + { + "epoch": 0.04752513816862659, + "grad_norm": 2.3132126331329346, + "learning_rate": 3.996794929773054e-05, + "loss": 0.9026, + "step": 976 + }, + { + "epoch": 0.04757383195773379, + "grad_norm": 2.1806576251983643, + "learning_rate": 3.996777054164012e-05, + "loss": 1.0363, + "step": 977 + }, + { + "epoch": 0.04762252574684099, + "grad_norm": 1.9240658283233643, + "learning_rate": 3.996759128884959e-05, + "loss": 1.0103, + "step": 978 + }, + { + "epoch": 0.04767121953594819, + "grad_norm": 2.6040306091308594, + "learning_rate": 3.9967411539363404e-05, + "loss": 0.945, + "step": 979 + }, + { + "epoch": 0.04771991332505539, + "grad_norm": 0.0946655422449112, + "learning_rate": 3.996723129318603e-05, + "loss": 0.6932, + "step": 980 + }, + { + "epoch": 0.04776860711416259, + "grad_norm": 1.923803448677063, + "learning_rate": 3.996705055032197e-05, + "loss": 0.9107, + "step": 981 + }, + { + "epoch": 0.047817300903269785, + "grad_norm": 2.0288379192352295, + "learning_rate": 3.9966869310775704e-05, + "loss": 0.9792, + "step": 982 + }, + { + "epoch": 0.047865994692376984, + "grad_norm": 2.352795362472534, + "learning_rate": 3.996668757455174e-05, + "loss": 0.9895, + "step": 983 + }, + { + "epoch": 0.04791468848148419, + "grad_norm": 2.3473455905914307, + "learning_rate": 3.9966505341654606e-05, + "loss": 1.0546, + "step": 984 + }, + { + "epoch": 0.04796338227059139, + "grad_norm": 0.08904188126325607, + "learning_rate": 3.996632261208882e-05, + "loss": 0.5846, + "step": 985 + }, + { + "epoch": 0.04801207605969859, + "grad_norm": 2.81023907661438, + "learning_rate": 3.996613938585895e-05, + "loss": 0.8632, + "step": 986 + }, + { + "epoch": 0.048060769848805786, + "grad_norm": 0.09220428019762039, + "learning_rate": 3.996595566296954e-05, + "loss": 0.6579, + "step": 987 + }, + { + "epoch": 0.048109463637912984, + "grad_norm": 0.08847545832395554, + "learning_rate": 3.996577144342516e-05, + "loss": 0.6408, + "step": 988 + }, + { + "epoch": 0.04815815742702018, + "grad_norm": 2.517857789993286, + "learning_rate": 3.9965586727230393e-05, + "loss": 0.9036, + "step": 989 + }, + { + "epoch": 0.04820685121612738, + "grad_norm": 2.165005683898926, + "learning_rate": 3.9965401514389846e-05, + "loss": 1.0204, + "step": 990 + }, + { + "epoch": 0.04825554500523458, + "grad_norm": 2.194004774093628, + "learning_rate": 3.996521580490811e-05, + "loss": 0.8239, + "step": 991 + }, + { + "epoch": 0.04830423879434178, + "grad_norm": 0.09169045835733414, + "learning_rate": 3.9965029598789803e-05, + "loss": 0.6436, + "step": 992 + }, + { + "epoch": 0.04835293258344898, + "grad_norm": 3.051811456680298, + "learning_rate": 3.996484289603958e-05, + "loss": 1.0003, + "step": 993 + }, + { + "epoch": 0.048401626372556183, + "grad_norm": 2.8993980884552, + "learning_rate": 3.996465569666205e-05, + "loss": 0.8946, + "step": 994 + }, + { + "epoch": 0.04845032016166338, + "grad_norm": 1.8785117864608765, + "learning_rate": 3.9964468000661905e-05, + "loss": 0.8477, + "step": 995 + }, + { + "epoch": 0.04849901395077058, + "grad_norm": 1.6702016592025757, + "learning_rate": 3.996427980804379e-05, + "loss": 1.0363, + "step": 996 + }, + { + "epoch": 0.04854770773987778, + "grad_norm": 3.990635871887207, + "learning_rate": 3.99640911188124e-05, + "loss": 0.9826, + "step": 997 + }, + { + "epoch": 0.04859640152898498, + "grad_norm": 2.276948928833008, + "learning_rate": 3.996390193297242e-05, + "loss": 0.8756, + "step": 998 + }, + { + "epoch": 0.04864509531809218, + "grad_norm": 2.2450151443481445, + "learning_rate": 3.996371225052856e-05, + "loss": 0.917, + "step": 999 + }, + { + "epoch": 0.048693789107199376, + "grad_norm": 2.0066983699798584, + "learning_rate": 3.996352207148553e-05, + "loss": 0.9267, + "step": 1000 + }, + { + "epoch": 0.048742482896306574, + "grad_norm": 2.469078302383423, + "learning_rate": 3.996333139584808e-05, + "loss": 0.8849, + "step": 1001 + }, + { + "epoch": 0.04879117668541377, + "grad_norm": 1.8887038230895996, + "learning_rate": 3.996314022362094e-05, + "loss": 0.8299, + "step": 1002 + }, + { + "epoch": 0.04883987047452097, + "grad_norm": 2.1025283336639404, + "learning_rate": 3.9962948554808865e-05, + "loss": 0.9897, + "step": 1003 + }, + { + "epoch": 0.04888856426362818, + "grad_norm": 2.7846827507019043, + "learning_rate": 3.9962756389416624e-05, + "loss": 0.9007, + "step": 1004 + }, + { + "epoch": 0.048937258052735376, + "grad_norm": 3.100847005844116, + "learning_rate": 3.996256372744899e-05, + "loss": 0.9193, + "step": 1005 + }, + { + "epoch": 0.048985951841842575, + "grad_norm": 0.0999951884150505, + "learning_rate": 3.996237056891078e-05, + "loss": 0.6296, + "step": 1006 + }, + { + "epoch": 0.04903464563094977, + "grad_norm": 2.5349619388580322, + "learning_rate": 3.9962176913806766e-05, + "loss": 0.885, + "step": 1007 + }, + { + "epoch": 0.04908333942005697, + "grad_norm": 2.0738234519958496, + "learning_rate": 3.9961982762141786e-05, + "loss": 0.9955, + "step": 1008 + }, + { + "epoch": 0.04913203320916417, + "grad_norm": 2.525279998779297, + "learning_rate": 3.996178811392067e-05, + "loss": 0.9471, + "step": 1009 + }, + { + "epoch": 0.04918072699827137, + "grad_norm": 2.08406925201416, + "learning_rate": 3.996159296914825e-05, + "loss": 0.9651, + "step": 1010 + }, + { + "epoch": 0.04922942078737857, + "grad_norm": 0.08902523666620255, + "learning_rate": 3.996139732782938e-05, + "loss": 0.62, + "step": 1011 + }, + { + "epoch": 0.04927811457648577, + "grad_norm": 2.0606415271759033, + "learning_rate": 3.996120118996894e-05, + "loss": 0.906, + "step": 1012 + }, + { + "epoch": 0.049326808365592965, + "grad_norm": 1.9627857208251953, + "learning_rate": 3.99610045555718e-05, + "loss": 0.9783, + "step": 1013 + }, + { + "epoch": 0.04937550215470017, + "grad_norm": 0.09432227164506912, + "learning_rate": 3.996080742464285e-05, + "loss": 0.6018, + "step": 1014 + }, + { + "epoch": 0.04942419594380737, + "grad_norm": 1.944543719291687, + "learning_rate": 3.996060979718699e-05, + "loss": 0.8988, + "step": 1015 + }, + { + "epoch": 0.04947288973291457, + "grad_norm": 2.2160253524780273, + "learning_rate": 3.996041167320915e-05, + "loss": 1.0213, + "step": 1016 + }, + { + "epoch": 0.04952158352202177, + "grad_norm": 1.8128306865692139, + "learning_rate": 3.996021305271424e-05, + "loss": 0.9199, + "step": 1017 + }, + { + "epoch": 0.049570277311128966, + "grad_norm": 1.8413043022155762, + "learning_rate": 3.996001393570722e-05, + "loss": 0.923, + "step": 1018 + }, + { + "epoch": 0.049618971100236164, + "grad_norm": 3.5488193035125732, + "learning_rate": 3.995981432219303e-05, + "loss": 0.9933, + "step": 1019 + }, + { + "epoch": 0.04966766488934336, + "grad_norm": 1.733197808265686, + "learning_rate": 3.995961421217664e-05, + "loss": 1.0117, + "step": 1020 + }, + { + "epoch": 0.04971635867845056, + "grad_norm": 1.9791741371154785, + "learning_rate": 3.995941360566302e-05, + "loss": 1.0105, + "step": 1021 + }, + { + "epoch": 0.04976505246755776, + "grad_norm": 2.6253161430358887, + "learning_rate": 3.995921250265717e-05, + "loss": 1.0564, + "step": 1022 + }, + { + "epoch": 0.04981374625666496, + "grad_norm": 2.416565418243408, + "learning_rate": 3.9959010903164095e-05, + "loss": 0.9216, + "step": 1023 + }, + { + "epoch": 0.049862440045772165, + "grad_norm": 1.6967726945877075, + "learning_rate": 3.99588088071888e-05, + "loss": 0.8877, + "step": 1024 + }, + { + "epoch": 0.04991113383487936, + "grad_norm": 2.3600332736968994, + "learning_rate": 3.9958606214736314e-05, + "loss": 0.9849, + "step": 1025 + }, + { + "epoch": 0.04995982762398656, + "grad_norm": 2.412046432495117, + "learning_rate": 3.995840312581168e-05, + "loss": 0.9937, + "step": 1026 + }, + { + "epoch": 0.05000852141309376, + "grad_norm": 2.8074216842651367, + "learning_rate": 3.995819954041995e-05, + "loss": 0.9806, + "step": 1027 + }, + { + "epoch": 0.05005721520220096, + "grad_norm": 1.9050984382629395, + "learning_rate": 3.995799545856619e-05, + "loss": 0.9575, + "step": 1028 + }, + { + "epoch": 0.05010590899130816, + "grad_norm": 2.149553060531616, + "learning_rate": 3.995779088025547e-05, + "loss": 1.0378, + "step": 1029 + }, + { + "epoch": 0.05015460278041536, + "grad_norm": 2.60648775100708, + "learning_rate": 3.995758580549288e-05, + "loss": 0.9693, + "step": 1030 + }, + { + "epoch": 0.050203296569522556, + "grad_norm": 2.044553279876709, + "learning_rate": 3.995738023428352e-05, + "loss": 1.0269, + "step": 1031 + }, + { + "epoch": 0.050251990358629754, + "grad_norm": 2.0103187561035156, + "learning_rate": 3.9957174166632516e-05, + "loss": 1.0386, + "step": 1032 + }, + { + "epoch": 0.05030068414773695, + "grad_norm": 3.039008378982544, + "learning_rate": 3.995696760254498e-05, + "loss": 1.0342, + "step": 1033 + }, + { + "epoch": 0.05034937793684416, + "grad_norm": 2.4044454097747803, + "learning_rate": 3.995676054202605e-05, + "loss": 0.9202, + "step": 1034 + }, + { + "epoch": 0.05039807172595136, + "grad_norm": 2.5986082553863525, + "learning_rate": 3.995655298508089e-05, + "loss": 1.0352, + "step": 1035 + }, + { + "epoch": 0.050446765515058556, + "grad_norm": 0.0968092605471611, + "learning_rate": 3.9956344931714655e-05, + "loss": 0.6145, + "step": 1036 + }, + { + "epoch": 0.050495459304165755, + "grad_norm": 2.4326717853546143, + "learning_rate": 3.995613638193252e-05, + "loss": 0.957, + "step": 1037 + }, + { + "epoch": 0.05054415309327295, + "grad_norm": 2.39840030670166, + "learning_rate": 3.9955927335739664e-05, + "loss": 0.9784, + "step": 1038 + }, + { + "epoch": 0.05059284688238015, + "grad_norm": 2.0391151905059814, + "learning_rate": 3.995571779314131e-05, + "loss": 0.9547, + "step": 1039 + }, + { + "epoch": 0.05064154067148735, + "grad_norm": 2.2211215496063232, + "learning_rate": 3.9955507754142645e-05, + "loss": 1.0211, + "step": 1040 + }, + { + "epoch": 0.05069023446059455, + "grad_norm": 3.436243772506714, + "learning_rate": 3.9955297218748913e-05, + "loss": 1.0656, + "step": 1041 + }, + { + "epoch": 0.05073892824970175, + "grad_norm": 2.5587880611419678, + "learning_rate": 3.995508618696533e-05, + "loss": 0.8754, + "step": 1042 + }, + { + "epoch": 0.05078762203880895, + "grad_norm": 4.6186323165893555, + "learning_rate": 3.9954874658797176e-05, + "loss": 1.0693, + "step": 1043 + }, + { + "epoch": 0.05083631582791615, + "grad_norm": 2.167182445526123, + "learning_rate": 3.995466263424969e-05, + "loss": 0.9201, + "step": 1044 + }, + { + "epoch": 0.05088500961702335, + "grad_norm": 2.2633845806121826, + "learning_rate": 3.995445011332815e-05, + "loss": 1.0483, + "step": 1045 + }, + { + "epoch": 0.05093370340613055, + "grad_norm": 2.1516194343566895, + "learning_rate": 3.995423709603784e-05, + "loss": 0.9413, + "step": 1046 + }, + { + "epoch": 0.05098239719523775, + "grad_norm": 1.880817174911499, + "learning_rate": 3.995402358238407e-05, + "loss": 0.9725, + "step": 1047 + }, + { + "epoch": 0.05103109098434495, + "grad_norm": 1.6311495304107666, + "learning_rate": 3.995380957237214e-05, + "loss": 0.9508, + "step": 1048 + }, + { + "epoch": 0.051079784773452146, + "grad_norm": 2.9207749366760254, + "learning_rate": 3.9953595066007376e-05, + "loss": 1.0529, + "step": 1049 + }, + { + "epoch": 0.051128478562559344, + "grad_norm": 3.3380355834960938, + "learning_rate": 3.9953380063295114e-05, + "loss": 1.0661, + "step": 1050 + }, + { + "epoch": 0.05117717235166654, + "grad_norm": 14.268030166625977, + "learning_rate": 3.995316456424071e-05, + "loss": 0.9417, + "step": 1051 + }, + { + "epoch": 0.05122586614077374, + "grad_norm": 3.4133830070495605, + "learning_rate": 3.995294856884952e-05, + "loss": 0.9383, + "step": 1052 + }, + { + "epoch": 0.05127455992988094, + "grad_norm": 2.9957242012023926, + "learning_rate": 3.995273207712691e-05, + "loss": 0.877, + "step": 1053 + }, + { + "epoch": 0.051323253718988146, + "grad_norm": 4.871799468994141, + "learning_rate": 3.995251508907827e-05, + "loss": 0.9053, + "step": 1054 + }, + { + "epoch": 0.051371947508095345, + "grad_norm": 2.96791672706604, + "learning_rate": 3.9952297604709004e-05, + "loss": 0.7809, + "step": 1055 + }, + { + "epoch": 0.05142064129720254, + "grad_norm": 2.1366114616394043, + "learning_rate": 3.99520796240245e-05, + "loss": 1.0319, + "step": 1056 + }, + { + "epoch": 0.05146933508630974, + "grad_norm": 2.971513032913208, + "learning_rate": 3.995186114703021e-05, + "loss": 0.9827, + "step": 1057 + }, + { + "epoch": 0.05151802887541694, + "grad_norm": 1.5462775230407715, + "learning_rate": 3.9951642173731554e-05, + "loss": 0.9266, + "step": 1058 + }, + { + "epoch": 0.05156672266452414, + "grad_norm": 3.1607279777526855, + "learning_rate": 3.9951422704133975e-05, + "loss": 1.0385, + "step": 1059 + }, + { + "epoch": 0.05161541645363134, + "grad_norm": 2.855938196182251, + "learning_rate": 3.995120273824294e-05, + "loss": 0.9338, + "step": 1060 + }, + { + "epoch": 0.05166411024273854, + "grad_norm": 2.174220561981201, + "learning_rate": 3.9950982276063916e-05, + "loss": 0.868, + "step": 1061 + }, + { + "epoch": 0.051712804031845735, + "grad_norm": 0.08816041797399521, + "learning_rate": 3.9950761317602393e-05, + "loss": 0.5787, + "step": 1062 + }, + { + "epoch": 0.051761497820952934, + "grad_norm": 2.961894989013672, + "learning_rate": 3.995053986286386e-05, + "loss": 0.9129, + "step": 1063 + }, + { + "epoch": 0.05181019161006014, + "grad_norm": 2.0020980834960938, + "learning_rate": 3.995031791185383e-05, + "loss": 0.9206, + "step": 1064 + }, + { + "epoch": 0.05185888539916734, + "grad_norm": 2.200814962387085, + "learning_rate": 3.995009546457782e-05, + "loss": 0.9399, + "step": 1065 + }, + { + "epoch": 0.05190757918827454, + "grad_norm": 2.4845046997070312, + "learning_rate": 3.994987252104137e-05, + "loss": 0.9127, + "step": 1066 + }, + { + "epoch": 0.051956272977381736, + "grad_norm": 1.8597851991653442, + "learning_rate": 3.9949649081250025e-05, + "loss": 0.9336, + "step": 1067 + }, + { + "epoch": 0.052004966766488935, + "grad_norm": 1.9491777420043945, + "learning_rate": 3.9949425145209334e-05, + "loss": 0.9329, + "step": 1068 + }, + { + "epoch": 0.05205366055559613, + "grad_norm": 1.9103612899780273, + "learning_rate": 3.994920071292487e-05, + "loss": 0.9094, + "step": 1069 + }, + { + "epoch": 0.05210235434470333, + "grad_norm": 2.619983196258545, + "learning_rate": 3.994897578440223e-05, + "loss": 0.88, + "step": 1070 + }, + { + "epoch": 0.05215104813381053, + "grad_norm": 2.918424367904663, + "learning_rate": 3.9948750359646994e-05, + "loss": 0.9607, + "step": 1071 + }, + { + "epoch": 0.05219974192291773, + "grad_norm": 4.0751752853393555, + "learning_rate": 3.994852443866477e-05, + "loss": 0.9764, + "step": 1072 + }, + { + "epoch": 0.05224843571202493, + "grad_norm": 4.643908500671387, + "learning_rate": 3.994829802146119e-05, + "loss": 0.9513, + "step": 1073 + }, + { + "epoch": 0.052297129501132134, + "grad_norm": 2.2394700050354004, + "learning_rate": 3.994807110804187e-05, + "loss": 0.9681, + "step": 1074 + }, + { + "epoch": 0.05234582329023933, + "grad_norm": 2.7548160552978516, + "learning_rate": 3.9947843698412467e-05, + "loss": 0.9569, + "step": 1075 + }, + { + "epoch": 0.05239451707934653, + "grad_norm": 2.511773109436035, + "learning_rate": 3.994761579257863e-05, + "loss": 0.9266, + "step": 1076 + }, + { + "epoch": 0.05244321086845373, + "grad_norm": 2.384650945663452, + "learning_rate": 3.9947387390546036e-05, + "loss": 0.9355, + "step": 1077 + }, + { + "epoch": 0.05249190465756093, + "grad_norm": 0.1135876402258873, + "learning_rate": 3.994715849232036e-05, + "loss": 0.6388, + "step": 1078 + }, + { + "epoch": 0.05254059844666813, + "grad_norm": 3.5922868251800537, + "learning_rate": 3.994692909790729e-05, + "loss": 1.004, + "step": 1079 + }, + { + "epoch": 0.052589292235775326, + "grad_norm": 1.7445158958435059, + "learning_rate": 3.994669920731255e-05, + "loss": 0.9768, + "step": 1080 + }, + { + "epoch": 0.052637986024882524, + "grad_norm": 2.8852462768554688, + "learning_rate": 3.994646882054184e-05, + "loss": 0.9335, + "step": 1081 + }, + { + "epoch": 0.05268667981398972, + "grad_norm": 1.7798532247543335, + "learning_rate": 3.994623793760091e-05, + "loss": 0.9494, + "step": 1082 + }, + { + "epoch": 0.05273537360309692, + "grad_norm": 3.133805274963379, + "learning_rate": 3.994600655849549e-05, + "loss": 0.9584, + "step": 1083 + }, + { + "epoch": 0.05278406739220413, + "grad_norm": 1.889966368675232, + "learning_rate": 3.994577468323133e-05, + "loss": 0.9193, + "step": 1084 + }, + { + "epoch": 0.052832761181311326, + "grad_norm": 0.08795332163572311, + "learning_rate": 3.9945542311814214e-05, + "loss": 0.6772, + "step": 1085 + }, + { + "epoch": 0.052881454970418525, + "grad_norm": 1.987379789352417, + "learning_rate": 3.994530944424991e-05, + "loss": 0.9232, + "step": 1086 + }, + { + "epoch": 0.05293014875952572, + "grad_norm": 2.1473820209503174, + "learning_rate": 3.994507608054422e-05, + "loss": 0.9609, + "step": 1087 + }, + { + "epoch": 0.05297884254863292, + "grad_norm": 1.7860733270645142, + "learning_rate": 3.994484222070294e-05, + "loss": 0.9787, + "step": 1088 + }, + { + "epoch": 0.05302753633774012, + "grad_norm": 3.936624526977539, + "learning_rate": 3.994460786473189e-05, + "loss": 1.0298, + "step": 1089 + }, + { + "epoch": 0.05307623012684732, + "grad_norm": 1.8190745115280151, + "learning_rate": 3.994437301263691e-05, + "loss": 0.9481, + "step": 1090 + }, + { + "epoch": 0.05312492391595452, + "grad_norm": 1.892386794090271, + "learning_rate": 3.994413766442383e-05, + "loss": 1.0188, + "step": 1091 + }, + { + "epoch": 0.05317361770506172, + "grad_norm": 1.9980231523513794, + "learning_rate": 3.99439018200985e-05, + "loss": 0.8996, + "step": 1092 + }, + { + "epoch": 0.053222311494168915, + "grad_norm": 2.054861545562744, + "learning_rate": 3.9943665479666795e-05, + "loss": 0.9581, + "step": 1093 + }, + { + "epoch": 0.05327100528327612, + "grad_norm": 1.9728755950927734, + "learning_rate": 3.994342864313459e-05, + "loss": 0.9332, + "step": 1094 + }, + { + "epoch": 0.05331969907238332, + "grad_norm": 2.2268905639648438, + "learning_rate": 3.994319131050779e-05, + "loss": 0.8535, + "step": 1095 + }, + { + "epoch": 0.05336839286149052, + "grad_norm": 2.4327969551086426, + "learning_rate": 3.9942953481792285e-05, + "loss": 0.9387, + "step": 1096 + }, + { + "epoch": 0.05341708665059772, + "grad_norm": 1.840247631072998, + "learning_rate": 3.9942715156993986e-05, + "loss": 0.9457, + "step": 1097 + }, + { + "epoch": 0.053465780439704916, + "grad_norm": 2.1309010982513428, + "learning_rate": 3.9942476336118835e-05, + "loss": 0.9814, + "step": 1098 + }, + { + "epoch": 0.053514474228812114, + "grad_norm": 2.352128267288208, + "learning_rate": 3.994223701917277e-05, + "loss": 0.9824, + "step": 1099 + }, + { + "epoch": 0.05356316801791931, + "grad_norm": 1.7394278049468994, + "learning_rate": 3.994199720616173e-05, + "loss": 0.9765, + "step": 1100 + }, + { + "epoch": 0.05361186180702651, + "grad_norm": 1.765277624130249, + "learning_rate": 3.99417568970917e-05, + "loss": 0.9809, + "step": 1101 + }, + { + "epoch": 0.05366055559613371, + "grad_norm": 1.553444266319275, + "learning_rate": 3.994151609196864e-05, + "loss": 0.9016, + "step": 1102 + }, + { + "epoch": 0.05370924938524091, + "grad_norm": 2.795088768005371, + "learning_rate": 3.9941274790798564e-05, + "loss": 1.0043, + "step": 1103 + }, + { + "epoch": 0.053757943174348115, + "grad_norm": 2.3274574279785156, + "learning_rate": 3.994103299358745e-05, + "loss": 0.8713, + "step": 1104 + }, + { + "epoch": 0.053806636963455313, + "grad_norm": 2.3475122451782227, + "learning_rate": 3.994079070034132e-05, + "loss": 0.9987, + "step": 1105 + }, + { + "epoch": 0.05385533075256251, + "grad_norm": 1.8091734647750854, + "learning_rate": 3.994054791106621e-05, + "loss": 1.0495, + "step": 1106 + }, + { + "epoch": 0.05390402454166971, + "grad_norm": 2.450005292892456, + "learning_rate": 3.994030462576815e-05, + "loss": 0.9572, + "step": 1107 + }, + { + "epoch": 0.05395271833077691, + "grad_norm": 3.2016143798828125, + "learning_rate": 3.9940060844453194e-05, + "loss": 0.9553, + "step": 1108 + }, + { + "epoch": 0.05400141211988411, + "grad_norm": 2.2643864154815674, + "learning_rate": 3.9939816567127404e-05, + "loss": 1.0537, + "step": 1109 + }, + { + "epoch": 0.05405010590899131, + "grad_norm": 4.736966133117676, + "learning_rate": 3.9939571793796866e-05, + "loss": 0.9626, + "step": 1110 + }, + { + "epoch": 0.054098799698098506, + "grad_norm": 4.073451042175293, + "learning_rate": 3.993932652446765e-05, + "loss": 0.9274, + "step": 1111 + }, + { + "epoch": 0.054147493487205704, + "grad_norm": 7.708310604095459, + "learning_rate": 3.993908075914588e-05, + "loss": 0.9264, + "step": 1112 + }, + { + "epoch": 0.0541961872763129, + "grad_norm": 1.735755443572998, + "learning_rate": 3.993883449783765e-05, + "loss": 0.9537, + "step": 1113 + }, + { + "epoch": 0.05424488106542011, + "grad_norm": 1.6547961235046387, + "learning_rate": 3.99385877405491e-05, + "loss": 1.0244, + "step": 1114 + }, + { + "epoch": 0.05429357485452731, + "grad_norm": 0.0889412984251976, + "learning_rate": 3.993834048728636e-05, + "loss": 0.5646, + "step": 1115 + }, + { + "epoch": 0.054342268643634506, + "grad_norm": 1.9347020387649536, + "learning_rate": 3.993809273805558e-05, + "loss": 0.9349, + "step": 1116 + }, + { + "epoch": 0.054390962432741705, + "grad_norm": 2.0958728790283203, + "learning_rate": 3.993784449286293e-05, + "loss": 0.9473, + "step": 1117 + }, + { + "epoch": 0.0544396562218489, + "grad_norm": 1.5054552555084229, + "learning_rate": 3.9937595751714576e-05, + "loss": 0.9827, + "step": 1118 + }, + { + "epoch": 0.0544883500109561, + "grad_norm": 2.4576239585876465, + "learning_rate": 3.993734651461671e-05, + "loss": 0.9713, + "step": 1119 + }, + { + "epoch": 0.0545370438000633, + "grad_norm": 2.6142654418945312, + "learning_rate": 3.993709678157553e-05, + "loss": 0.8987, + "step": 1120 + }, + { + "epoch": 0.0545857375891705, + "grad_norm": 1.9256272315979004, + "learning_rate": 3.9936846552597254e-05, + "loss": 0.988, + "step": 1121 + }, + { + "epoch": 0.0546344313782777, + "grad_norm": 0.08586999028921127, + "learning_rate": 3.99365958276881e-05, + "loss": 0.6108, + "step": 1122 + }, + { + "epoch": 0.0546831251673849, + "grad_norm": 1.707668423652649, + "learning_rate": 3.993634460685431e-05, + "loss": 0.8974, + "step": 1123 + }, + { + "epoch": 0.0547318189564921, + "grad_norm": 2.5068929195404053, + "learning_rate": 3.993609289010213e-05, + "loss": 0.9179, + "step": 1124 + }, + { + "epoch": 0.0547805127455993, + "grad_norm": 0.08876845240592957, + "learning_rate": 3.9935840677437816e-05, + "loss": 0.6987, + "step": 1125 + }, + { + "epoch": 0.0548292065347065, + "grad_norm": 2.2096593379974365, + "learning_rate": 3.993558796886765e-05, + "loss": 0.9542, + "step": 1126 + }, + { + "epoch": 0.0548779003238137, + "grad_norm": 2.044084310531616, + "learning_rate": 3.993533476439792e-05, + "loss": 0.9184, + "step": 1127 + }, + { + "epoch": 0.0549265941129209, + "grad_norm": 3.053807020187378, + "learning_rate": 3.9935081064034916e-05, + "loss": 0.919, + "step": 1128 + }, + { + "epoch": 0.054975287902028096, + "grad_norm": 2.5066545009613037, + "learning_rate": 3.993482686778496e-05, + "loss": 0.968, + "step": 1129 + }, + { + "epoch": 0.055023981691135294, + "grad_norm": 2.5527939796447754, + "learning_rate": 3.993457217565436e-05, + "loss": 0.9193, + "step": 1130 + }, + { + "epoch": 0.05507267548024249, + "grad_norm": 2.9377458095550537, + "learning_rate": 3.993431698764946e-05, + "loss": 0.9975, + "step": 1131 + }, + { + "epoch": 0.05512136926934969, + "grad_norm": 0.08906394243240356, + "learning_rate": 3.993406130377661e-05, + "loss": 0.72, + "step": 1132 + }, + { + "epoch": 0.05517006305845689, + "grad_norm": 2.6708319187164307, + "learning_rate": 3.993380512404216e-05, + "loss": 0.9338, + "step": 1133 + }, + { + "epoch": 0.055218756847564096, + "grad_norm": 1.9896891117095947, + "learning_rate": 3.9933548448452505e-05, + "loss": 0.9317, + "step": 1134 + }, + { + "epoch": 0.055267450636671295, + "grad_norm": 2.4108681678771973, + "learning_rate": 3.9933291277014e-05, + "loss": 1.0416, + "step": 1135 + }, + { + "epoch": 0.05531614442577849, + "grad_norm": 0.08767108619213104, + "learning_rate": 3.9933033609733064e-05, + "loss": 0.658, + "step": 1136 + }, + { + "epoch": 0.05536483821488569, + "grad_norm": 2.338120698928833, + "learning_rate": 3.99327754466161e-05, + "loss": 0.9881, + "step": 1137 + }, + { + "epoch": 0.05541353200399289, + "grad_norm": 3.224930763244629, + "learning_rate": 3.993251678766953e-05, + "loss": 0.9778, + "step": 1138 + }, + { + "epoch": 0.05546222579310009, + "grad_norm": 1.6657689809799194, + "learning_rate": 3.9932257632899785e-05, + "loss": 1.0193, + "step": 1139 + }, + { + "epoch": 0.05551091958220729, + "grad_norm": 1.9647952318191528, + "learning_rate": 3.993199798231332e-05, + "loss": 0.8976, + "step": 1140 + }, + { + "epoch": 0.05555961337131449, + "grad_norm": 2.166674852371216, + "learning_rate": 3.993173783591658e-05, + "loss": 1.0084, + "step": 1141 + }, + { + "epoch": 0.055608307160421686, + "grad_norm": 2.343904972076416, + "learning_rate": 3.993147719371604e-05, + "loss": 0.9262, + "step": 1142 + }, + { + "epoch": 0.055657000949528884, + "grad_norm": 2.106142520904541, + "learning_rate": 3.99312160557182e-05, + "loss": 0.988, + "step": 1143 + }, + { + "epoch": 0.05570569473863609, + "grad_norm": 2.042377471923828, + "learning_rate": 3.9930954421929536e-05, + "loss": 0.9707, + "step": 1144 + }, + { + "epoch": 0.05575438852774329, + "grad_norm": 0.09479460120201111, + "learning_rate": 3.9930692292356564e-05, + "loss": 0.7194, + "step": 1145 + }, + { + "epoch": 0.05580308231685049, + "grad_norm": 2.5254263877868652, + "learning_rate": 3.9930429667005804e-05, + "loss": 1.033, + "step": 1146 + }, + { + "epoch": 0.055851776105957686, + "grad_norm": 2.1396737098693848, + "learning_rate": 3.993016654588379e-05, + "loss": 1.0141, + "step": 1147 + }, + { + "epoch": 0.055900469895064885, + "grad_norm": 2.1584620475769043, + "learning_rate": 3.992990292899707e-05, + "loss": 0.9324, + "step": 1148 + }, + { + "epoch": 0.05594916368417208, + "grad_norm": 2.1781678199768066, + "learning_rate": 3.992963881635219e-05, + "loss": 0.8809, + "step": 1149 + }, + { + "epoch": 0.05599785747327928, + "grad_norm": 3.545761823654175, + "learning_rate": 3.9929374207955736e-05, + "loss": 1.0301, + "step": 1150 + }, + { + "epoch": 0.05604655126238648, + "grad_norm": 2.011821985244751, + "learning_rate": 3.992910910381428e-05, + "loss": 1.018, + "step": 1151 + }, + { + "epoch": 0.05609524505149368, + "grad_norm": 0.13306941092014313, + "learning_rate": 3.992884350393441e-05, + "loss": 0.6122, + "step": 1152 + }, + { + "epoch": 0.05614393884060088, + "grad_norm": 2.2026994228363037, + "learning_rate": 3.992857740832274e-05, + "loss": 1.0214, + "step": 1153 + }, + { + "epoch": 0.056192632629708084, + "grad_norm": 2.0144379138946533, + "learning_rate": 3.99283108169859e-05, + "loss": 0.9208, + "step": 1154 + }, + { + "epoch": 0.05624132641881528, + "grad_norm": 2.099607229232788, + "learning_rate": 3.9928043729930506e-05, + "loss": 0.8764, + "step": 1155 + }, + { + "epoch": 0.05629002020792248, + "grad_norm": 1.4486440420150757, + "learning_rate": 3.992777614716321e-05, + "loss": 0.8568, + "step": 1156 + }, + { + "epoch": 0.05633871399702968, + "grad_norm": 2.289851665496826, + "learning_rate": 3.992750806869066e-05, + "loss": 0.9248, + "step": 1157 + }, + { + "epoch": 0.05638740778613688, + "grad_norm": 1.9191092252731323, + "learning_rate": 3.992723949451953e-05, + "loss": 0.9873, + "step": 1158 + }, + { + "epoch": 0.05643610157524408, + "grad_norm": 2.3094072341918945, + "learning_rate": 3.992697042465651e-05, + "loss": 1.0241, + "step": 1159 + }, + { + "epoch": 0.056484795364351276, + "grad_norm": 1.7471145391464233, + "learning_rate": 3.9926700859108274e-05, + "loss": 0.8702, + "step": 1160 + }, + { + "epoch": 0.056533489153458474, + "grad_norm": 1.8021066188812256, + "learning_rate": 3.992643079788154e-05, + "loss": 0.9444, + "step": 1161 + }, + { + "epoch": 0.05658218294256567, + "grad_norm": 2.1272873878479004, + "learning_rate": 3.992616024098302e-05, + "loss": 0.8658, + "step": 1162 + }, + { + "epoch": 0.05663087673167287, + "grad_norm": 2.176103115081787, + "learning_rate": 3.992588918841946e-05, + "loss": 0.9603, + "step": 1163 + }, + { + "epoch": 0.05667957052078008, + "grad_norm": 2.2505109310150146, + "learning_rate": 3.9925617640197574e-05, + "loss": 0.9504, + "step": 1164 + }, + { + "epoch": 0.056728264309887276, + "grad_norm": 2.1710152626037598, + "learning_rate": 3.992534559632414e-05, + "loss": 0.9401, + "step": 1165 + }, + { + "epoch": 0.056776958098994475, + "grad_norm": 1.728882074356079, + "learning_rate": 3.992507305680592e-05, + "loss": 0.9481, + "step": 1166 + }, + { + "epoch": 0.05682565188810167, + "grad_norm": 2.6200523376464844, + "learning_rate": 3.992480002164969e-05, + "loss": 0.9086, + "step": 1167 + }, + { + "epoch": 0.05687434567720887, + "grad_norm": 1.5524619817733765, + "learning_rate": 3.992452649086223e-05, + "loss": 1.0009, + "step": 1168 + }, + { + "epoch": 0.05692303946631607, + "grad_norm": 2.237855911254883, + "learning_rate": 3.992425246445037e-05, + "loss": 0.8965, + "step": 1169 + }, + { + "epoch": 0.05697173325542327, + "grad_norm": 2.0646419525146484, + "learning_rate": 3.9923977942420915e-05, + "loss": 0.9696, + "step": 1170 + }, + { + "epoch": 0.05702042704453047, + "grad_norm": 2.158893585205078, + "learning_rate": 3.992370292478069e-05, + "loss": 0.8539, + "step": 1171 + }, + { + "epoch": 0.05706912083363767, + "grad_norm": 2.5064499378204346, + "learning_rate": 3.992342741153653e-05, + "loss": 0.8477, + "step": 1172 + }, + { + "epoch": 0.057117814622744865, + "grad_norm": 2.3487355709075928, + "learning_rate": 3.992315140269531e-05, + "loss": 0.9605, + "step": 1173 + }, + { + "epoch": 0.05716650841185207, + "grad_norm": 2.3337254524230957, + "learning_rate": 3.992287489826387e-05, + "loss": 0.9304, + "step": 1174 + }, + { + "epoch": 0.05721520220095927, + "grad_norm": 3.8319690227508545, + "learning_rate": 3.992259789824911e-05, + "loss": 0.9636, + "step": 1175 + }, + { + "epoch": 0.05726389599006647, + "grad_norm": 13.1787691116333, + "learning_rate": 3.9922320402657905e-05, + "loss": 0.9575, + "step": 1176 + }, + { + "epoch": 0.05731258977917367, + "grad_norm": 2.6429638862609863, + "learning_rate": 3.992204241149717e-05, + "loss": 0.9672, + "step": 1177 + }, + { + "epoch": 0.057361283568280866, + "grad_norm": 2.5477232933044434, + "learning_rate": 3.992176392477381e-05, + "loss": 0.969, + "step": 1178 + }, + { + "epoch": 0.057409977357388065, + "grad_norm": 2.501253843307495, + "learning_rate": 3.992148494249475e-05, + "loss": 0.9131, + "step": 1179 + }, + { + "epoch": 0.05745867114649526, + "grad_norm": 2.012984037399292, + "learning_rate": 3.9921205464666934e-05, + "loss": 1.0033, + "step": 1180 + }, + { + "epoch": 0.05750736493560246, + "grad_norm": 2.212390661239624, + "learning_rate": 3.9920925491297325e-05, + "loss": 0.879, + "step": 1181 + }, + { + "epoch": 0.05755605872470966, + "grad_norm": 2.5811855792999268, + "learning_rate": 3.9920645022392876e-05, + "loss": 0.9608, + "step": 1182 + }, + { + "epoch": 0.05760475251381686, + "grad_norm": 2.0330910682678223, + "learning_rate": 3.992036405796056e-05, + "loss": 1.0286, + "step": 1183 + }, + { + "epoch": 0.057653446302924065, + "grad_norm": 2.2284724712371826, + "learning_rate": 3.9920082598007377e-05, + "loss": 0.9013, + "step": 1184 + }, + { + "epoch": 0.057702140092031264, + "grad_norm": 2.8543500900268555, + "learning_rate": 3.9919800642540325e-05, + "loss": 1.0552, + "step": 1185 + }, + { + "epoch": 0.05775083388113846, + "grad_norm": 1.8044755458831787, + "learning_rate": 3.9919518191566413e-05, + "loss": 0.9729, + "step": 1186 + }, + { + "epoch": 0.05779952767024566, + "grad_norm": 2.411282539367676, + "learning_rate": 3.9919235245092674e-05, + "loss": 0.9012, + "step": 1187 + }, + { + "epoch": 0.05784822145935286, + "grad_norm": 1.8088088035583496, + "learning_rate": 3.9918951803126135e-05, + "loss": 0.9295, + "step": 1188 + }, + { + "epoch": 0.05789691524846006, + "grad_norm": 3.069361925125122, + "learning_rate": 3.9918667865673856e-05, + "loss": 0.9346, + "step": 1189 + }, + { + "epoch": 0.05794560903756726, + "grad_norm": 1.9869799613952637, + "learning_rate": 3.99183834327429e-05, + "loss": 0.9367, + "step": 1190 + }, + { + "epoch": 0.057994302826674456, + "grad_norm": 2.645369291305542, + "learning_rate": 3.991809850434034e-05, + "loss": 1.0375, + "step": 1191 + }, + { + "epoch": 0.058042996615781654, + "grad_norm": 2.435629367828369, + "learning_rate": 3.991781308047326e-05, + "loss": 1.0295, + "step": 1192 + }, + { + "epoch": 0.05809169040488885, + "grad_norm": 2.6817376613616943, + "learning_rate": 3.991752716114877e-05, + "loss": 0.8869, + "step": 1193 + }, + { + "epoch": 0.05814038419399606, + "grad_norm": 3.6405067443847656, + "learning_rate": 3.9917240746373965e-05, + "loss": 1.0171, + "step": 1194 + }, + { + "epoch": 0.05818907798310326, + "grad_norm": 2.2462780475616455, + "learning_rate": 3.991695383615599e-05, + "loss": 0.984, + "step": 1195 + }, + { + "epoch": 0.058237771772210456, + "grad_norm": 2.395639181137085, + "learning_rate": 3.991666643050197e-05, + "loss": 0.9477, + "step": 1196 + }, + { + "epoch": 0.058286465561317655, + "grad_norm": 1.9207558631896973, + "learning_rate": 3.9916378529419054e-05, + "loss": 0.9415, + "step": 1197 + }, + { + "epoch": 0.05833515935042485, + "grad_norm": 4.173513412475586, + "learning_rate": 3.991609013291441e-05, + "loss": 0.8401, + "step": 1198 + }, + { + "epoch": 0.05838385313953205, + "grad_norm": 1.8284785747528076, + "learning_rate": 3.991580124099521e-05, + "loss": 1.078, + "step": 1199 + }, + { + "epoch": 0.05843254692863925, + "grad_norm": 2.195582628250122, + "learning_rate": 3.9915511853668635e-05, + "loss": 0.9317, + "step": 1200 + }, + { + "epoch": 0.05848124071774645, + "grad_norm": 2.1512813568115234, + "learning_rate": 3.9915221970941885e-05, + "loss": 0.973, + "step": 1201 + }, + { + "epoch": 0.05852993450685365, + "grad_norm": 2.3986127376556396, + "learning_rate": 3.9914931592822176e-05, + "loss": 0.9491, + "step": 1202 + }, + { + "epoch": 0.05857862829596085, + "grad_norm": 2.1100637912750244, + "learning_rate": 3.9914640719316725e-05, + "loss": 1.001, + "step": 1203 + }, + { + "epoch": 0.05862732208506805, + "grad_norm": 0.08897548913955688, + "learning_rate": 3.991434935043277e-05, + "loss": 0.593, + "step": 1204 + }, + { + "epoch": 0.05867601587417525, + "grad_norm": 1.9628560543060303, + "learning_rate": 3.991405748617756e-05, + "loss": 0.9431, + "step": 1205 + }, + { + "epoch": 0.05872470966328245, + "grad_norm": 2.6323795318603516, + "learning_rate": 3.991376512655836e-05, + "loss": 0.9209, + "step": 1206 + }, + { + "epoch": 0.05877340345238965, + "grad_norm": 2.42576003074646, + "learning_rate": 3.991347227158243e-05, + "loss": 0.9364, + "step": 1207 + }, + { + "epoch": 0.05882209724149685, + "grad_norm": 1.852513313293457, + "learning_rate": 3.991317892125706e-05, + "loss": 0.9912, + "step": 1208 + }, + { + "epoch": 0.058870791030604046, + "grad_norm": 1.6037890911102295, + "learning_rate": 3.991288507558955e-05, + "loss": 0.9645, + "step": 1209 + }, + { + "epoch": 0.058919484819711244, + "grad_norm": 3.134801149368286, + "learning_rate": 3.991259073458722e-05, + "loss": 0.9805, + "step": 1210 + }, + { + "epoch": 0.05896817860881844, + "grad_norm": 1.7431213855743408, + "learning_rate": 3.991229589825737e-05, + "loss": 0.8528, + "step": 1211 + }, + { + "epoch": 0.05901687239792564, + "grad_norm": 2.436408758163452, + "learning_rate": 3.9912000566607335e-05, + "loss": 1.0213, + "step": 1212 + }, + { + "epoch": 0.05906556618703284, + "grad_norm": 2.1320130825042725, + "learning_rate": 3.991170473964448e-05, + "loss": 0.932, + "step": 1213 + }, + { + "epoch": 0.059114259976140046, + "grad_norm": 1.7586932182312012, + "learning_rate": 3.9911408417376156e-05, + "loss": 0.9562, + "step": 1214 + }, + { + "epoch": 0.059162953765247245, + "grad_norm": 0.09310396015644073, + "learning_rate": 3.991111159980973e-05, + "loss": 0.6774, + "step": 1215 + }, + { + "epoch": 0.059211647554354443, + "grad_norm": 2.5198776721954346, + "learning_rate": 3.9910814286952586e-05, + "loss": 1.0249, + "step": 1216 + }, + { + "epoch": 0.05926034134346164, + "grad_norm": 4.142366886138916, + "learning_rate": 3.991051647881213e-05, + "loss": 0.9111, + "step": 1217 + }, + { + "epoch": 0.05930903513256884, + "grad_norm": 2.4143576622009277, + "learning_rate": 3.991021817539575e-05, + "loss": 0.8466, + "step": 1218 + }, + { + "epoch": 0.05935772892167604, + "grad_norm": 4.648275852203369, + "learning_rate": 3.990991937671088e-05, + "loss": 0.9075, + "step": 1219 + }, + { + "epoch": 0.05940642271078324, + "grad_norm": 2.8007986545562744, + "learning_rate": 3.990962008276495e-05, + "loss": 0.9851, + "step": 1220 + }, + { + "epoch": 0.05945511649989044, + "grad_norm": 2.0425093173980713, + "learning_rate": 3.990932029356541e-05, + "loss": 0.9742, + "step": 1221 + }, + { + "epoch": 0.059503810288997636, + "grad_norm": 2.0514864921569824, + "learning_rate": 3.990902000911971e-05, + "loss": 0.8641, + "step": 1222 + }, + { + "epoch": 0.059552504078104834, + "grad_norm": 3.1195054054260254, + "learning_rate": 3.990871922943532e-05, + "loss": 0.9485, + "step": 1223 + }, + { + "epoch": 0.05960119786721204, + "grad_norm": 2.551828145980835, + "learning_rate": 3.9908417954519726e-05, + "loss": 0.9618, + "step": 1224 + }, + { + "epoch": 0.05964989165631924, + "grad_norm": 1.7895680665969849, + "learning_rate": 3.990811618438042e-05, + "loss": 1.0601, + "step": 1225 + }, + { + "epoch": 0.05969858544542644, + "grad_norm": 2.3355672359466553, + "learning_rate": 3.9907813919024907e-05, + "loss": 0.9496, + "step": 1226 + }, + { + "epoch": 0.059747279234533636, + "grad_norm": 3.7686994075775146, + "learning_rate": 3.9907511158460715e-05, + "loss": 0.9127, + "step": 1227 + }, + { + "epoch": 0.059795973023640835, + "grad_norm": 2.209653854370117, + "learning_rate": 3.9907207902695354e-05, + "loss": 1.0351, + "step": 1228 + }, + { + "epoch": 0.05984466681274803, + "grad_norm": 2.150063991546631, + "learning_rate": 3.9906904151736384e-05, + "loss": 1.0457, + "step": 1229 + }, + { + "epoch": 0.05989336060185523, + "grad_norm": 3.6178998947143555, + "learning_rate": 3.990659990559137e-05, + "loss": 1.0098, + "step": 1230 + }, + { + "epoch": 0.05994205439096243, + "grad_norm": 2.127527952194214, + "learning_rate": 3.9906295164267865e-05, + "loss": 1.0032, + "step": 1231 + }, + { + "epoch": 0.05999074818006963, + "grad_norm": 3.8126025199890137, + "learning_rate": 3.9905989927773446e-05, + "loss": 0.9569, + "step": 1232 + }, + { + "epoch": 0.060039441969176835, + "grad_norm": 2.0553276538848877, + "learning_rate": 3.990568419611571e-05, + "loss": 0.9809, + "step": 1233 + }, + { + "epoch": 0.060088135758284034, + "grad_norm": 2.5788657665252686, + "learning_rate": 3.990537796930228e-05, + "loss": 1.0333, + "step": 1234 + }, + { + "epoch": 0.06013682954739123, + "grad_norm": 4.130887985229492, + "learning_rate": 3.990507124734075e-05, + "loss": 0.9319, + "step": 1235 + }, + { + "epoch": 0.06018552333649843, + "grad_norm": 1.8881797790527344, + "learning_rate": 3.990476403023875e-05, + "loss": 1.0945, + "step": 1236 + }, + { + "epoch": 0.06023421712560563, + "grad_norm": 2.7623136043548584, + "learning_rate": 3.990445631800394e-05, + "loss": 1.0608, + "step": 1237 + }, + { + "epoch": 0.06028291091471283, + "grad_norm": 2.6882481575012207, + "learning_rate": 3.990414811064396e-05, + "loss": 0.9082, + "step": 1238 + }, + { + "epoch": 0.06033160470382003, + "grad_norm": 3.0170235633850098, + "learning_rate": 3.990383940816648e-05, + "loss": 0.9404, + "step": 1239 + }, + { + "epoch": 0.060380298492927226, + "grad_norm": 2.361527681350708, + "learning_rate": 3.9903530210579184e-05, + "loss": 0.8573, + "step": 1240 + }, + { + "epoch": 0.060428992282034424, + "grad_norm": 2.0018980503082275, + "learning_rate": 3.9903220517889756e-05, + "loss": 0.9067, + "step": 1241 + }, + { + "epoch": 0.06047768607114162, + "grad_norm": 1.604238510131836, + "learning_rate": 3.9902910330105905e-05, + "loss": 0.8508, + "step": 1242 + }, + { + "epoch": 0.06052637986024883, + "grad_norm": 2.4273669719696045, + "learning_rate": 3.990259964723534e-05, + "loss": 0.9944, + "step": 1243 + }, + { + "epoch": 0.06057507364935603, + "grad_norm": 2.0984716415405273, + "learning_rate": 3.9902288469285804e-05, + "loss": 0.8764, + "step": 1244 + }, + { + "epoch": 0.060623767438463226, + "grad_norm": 3.1203105449676514, + "learning_rate": 3.990197679626502e-05, + "loss": 0.9741, + "step": 1245 + }, + { + "epoch": 0.060672461227570425, + "grad_norm": 0.0913635641336441, + "learning_rate": 3.990166462818075e-05, + "loss": 0.6244, + "step": 1246 + }, + { + "epoch": 0.06072115501667762, + "grad_norm": 1.9064691066741943, + "learning_rate": 3.9901351965040754e-05, + "loss": 0.883, + "step": 1247 + }, + { + "epoch": 0.06076984880578482, + "grad_norm": 2.024393320083618, + "learning_rate": 3.990103880685282e-05, + "loss": 0.9743, + "step": 1248 + }, + { + "epoch": 0.06081854259489202, + "grad_norm": 2.618483066558838, + "learning_rate": 3.990072515362472e-05, + "loss": 1.0065, + "step": 1249 + }, + { + "epoch": 0.06086723638399922, + "grad_norm": 1.9478973150253296, + "learning_rate": 3.990041100536428e-05, + "loss": 0.9313, + "step": 1250 + }, + { + "epoch": 0.06091593017310642, + "grad_norm": 1.599401593208313, + "learning_rate": 3.99000963620793e-05, + "loss": 0.9591, + "step": 1251 + }, + { + "epoch": 0.06096462396221362, + "grad_norm": 1.3602787256240845, + "learning_rate": 3.98997812237776e-05, + "loss": 1.0217, + "step": 1252 + }, + { + "epoch": 0.06101331775132082, + "grad_norm": 2.6844136714935303, + "learning_rate": 3.9899465590467044e-05, + "loss": 0.8701, + "step": 1253 + }, + { + "epoch": 0.06106201154042802, + "grad_norm": 2.3563497066497803, + "learning_rate": 3.989914946215545e-05, + "loss": 0.8262, + "step": 1254 + }, + { + "epoch": 0.06111070532953522, + "grad_norm": 2.0402917861938477, + "learning_rate": 3.989883283885071e-05, + "loss": 0.9319, + "step": 1255 + }, + { + "epoch": 0.06115939911864242, + "grad_norm": 1.9389634132385254, + "learning_rate": 3.989851572056068e-05, + "loss": 0.9408, + "step": 1256 + }, + { + "epoch": 0.06120809290774962, + "grad_norm": 2.519333600997925, + "learning_rate": 3.989819810729326e-05, + "loss": 0.9188, + "step": 1257 + }, + { + "epoch": 0.061256786696856816, + "grad_norm": 2.7221972942352295, + "learning_rate": 3.989787999905636e-05, + "loss": 1.0549, + "step": 1258 + }, + { + "epoch": 0.061305480485964015, + "grad_norm": 2.0805206298828125, + "learning_rate": 3.989756139585787e-05, + "loss": 0.9436, + "step": 1259 + }, + { + "epoch": 0.06135417427507121, + "grad_norm": 2.2243411540985107, + "learning_rate": 3.9897242297705724e-05, + "loss": 0.9067, + "step": 1260 + }, + { + "epoch": 0.06140286806417841, + "grad_norm": 1.688602089881897, + "learning_rate": 3.989692270460786e-05, + "loss": 1.0173, + "step": 1261 + }, + { + "epoch": 0.06145156185328561, + "grad_norm": 1.8561224937438965, + "learning_rate": 3.989660261657224e-05, + "loss": 0.8804, + "step": 1262 + }, + { + "epoch": 0.061500255642392816, + "grad_norm": 1.9425071477890015, + "learning_rate": 3.98962820336068e-05, + "loss": 0.9281, + "step": 1263 + }, + { + "epoch": 0.061548949431500015, + "grad_norm": 1.8541650772094727, + "learning_rate": 3.989596095571955e-05, + "loss": 1.0473, + "step": 1264 + }, + { + "epoch": 0.061597643220607214, + "grad_norm": 2.1750097274780273, + "learning_rate": 3.989563938291844e-05, + "loss": 1.0317, + "step": 1265 + }, + { + "epoch": 0.06164633700971441, + "grad_norm": 2.6602842807769775, + "learning_rate": 3.98953173152115e-05, + "loss": 0.8621, + "step": 1266 + }, + { + "epoch": 0.06169503079882161, + "grad_norm": 2.069213390350342, + "learning_rate": 3.989499475260672e-05, + "loss": 0.9312, + "step": 1267 + }, + { + "epoch": 0.06174372458792881, + "grad_norm": 1.9860707521438599, + "learning_rate": 3.989467169511213e-05, + "loss": 0.9741, + "step": 1268 + }, + { + "epoch": 0.06179241837703601, + "grad_norm": 1.988297462463379, + "learning_rate": 3.989434814273577e-05, + "loss": 0.9438, + "step": 1269 + }, + { + "epoch": 0.06184111216614321, + "grad_norm": 2.7201974391937256, + "learning_rate": 3.9894024095485686e-05, + "loss": 0.9986, + "step": 1270 + }, + { + "epoch": 0.061889805955250406, + "grad_norm": 2.4198646545410156, + "learning_rate": 3.989369955336994e-05, + "loss": 0.9501, + "step": 1271 + }, + { + "epoch": 0.061938499744357604, + "grad_norm": 2.1943976879119873, + "learning_rate": 3.9893374516396606e-05, + "loss": 0.9073, + "step": 1272 + }, + { + "epoch": 0.06198719353346481, + "grad_norm": 2.5974884033203125, + "learning_rate": 3.989304898457376e-05, + "loss": 0.87, + "step": 1273 + }, + { + "epoch": 0.06203588732257201, + "grad_norm": 2.9105143547058105, + "learning_rate": 3.989272295790951e-05, + "loss": 0.8842, + "step": 1274 + }, + { + "epoch": 0.06208458111167921, + "grad_norm": 1.9315766096115112, + "learning_rate": 3.989239643641197e-05, + "loss": 0.9053, + "step": 1275 + }, + { + "epoch": 0.062133274900786406, + "grad_norm": 5.30185604095459, + "learning_rate": 3.989206942008925e-05, + "loss": 0.8107, + "step": 1276 + }, + { + "epoch": 0.062181968689893605, + "grad_norm": 2.1179699897766113, + "learning_rate": 3.9891741908949485e-05, + "loss": 0.947, + "step": 1277 + }, + { + "epoch": 0.0622306624790008, + "grad_norm": 1.7617990970611572, + "learning_rate": 3.989141390300083e-05, + "loss": 1.0357, + "step": 1278 + }, + { + "epoch": 0.062279356268108, + "grad_norm": 2.418574571609497, + "learning_rate": 3.989108540225144e-05, + "loss": 0.934, + "step": 1279 + }, + { + "epoch": 0.0623280500572152, + "grad_norm": 3.6130478382110596, + "learning_rate": 3.9890756406709484e-05, + "loss": 0.891, + "step": 1280 + }, + { + "epoch": 0.0623767438463224, + "grad_norm": 1.6696856021881104, + "learning_rate": 3.989042691638315e-05, + "loss": 0.9299, + "step": 1281 + }, + { + "epoch": 0.0624254376354296, + "grad_norm": 1.975967526435852, + "learning_rate": 3.989009693128064e-05, + "loss": 1.0036, + "step": 1282 + }, + { + "epoch": 0.062474131424536804, + "grad_norm": 0.09141549468040466, + "learning_rate": 3.988976645141015e-05, + "loss": 0.6485, + "step": 1283 + }, + { + "epoch": 0.062522825213644, + "grad_norm": 2.068049907684326, + "learning_rate": 3.9889435476779905e-05, + "loss": 0.9206, + "step": 1284 + }, + { + "epoch": 0.0625715190027512, + "grad_norm": 1.6464755535125732, + "learning_rate": 3.9889104007398136e-05, + "loss": 0.9197, + "step": 1285 + }, + { + "epoch": 0.06262021279185839, + "grad_norm": 1.8743621110916138, + "learning_rate": 3.98887720432731e-05, + "loss": 0.9674, + "step": 1286 + }, + { + "epoch": 0.06266890658096559, + "grad_norm": 1.8216361999511719, + "learning_rate": 3.9888439584413034e-05, + "loss": 0.9196, + "step": 1287 + }, + { + "epoch": 0.0627176003700728, + "grad_norm": 1.6812840700149536, + "learning_rate": 3.9888106630826234e-05, + "loss": 0.9491, + "step": 1288 + }, + { + "epoch": 0.06276629415918, + "grad_norm": 2.3238587379455566, + "learning_rate": 3.988777318252096e-05, + "loss": 0.9053, + "step": 1289 + }, + { + "epoch": 0.0628149879482872, + "grad_norm": 1.8065152168273926, + "learning_rate": 3.988743923950551e-05, + "loss": 0.848, + "step": 1290 + }, + { + "epoch": 0.0628636817373944, + "grad_norm": 2.1315786838531494, + "learning_rate": 3.9887104801788205e-05, + "loss": 0.8687, + "step": 1291 + }, + { + "epoch": 0.0629123755265016, + "grad_norm": 1.9612033367156982, + "learning_rate": 3.988676986937735e-05, + "loss": 1.0104, + "step": 1292 + }, + { + "epoch": 0.0629610693156088, + "grad_norm": 2.1168365478515625, + "learning_rate": 3.988643444228129e-05, + "loss": 0.9274, + "step": 1293 + }, + { + "epoch": 0.063009763104716, + "grad_norm": 1.770053505897522, + "learning_rate": 3.988609852050835e-05, + "loss": 0.9547, + "step": 1294 + }, + { + "epoch": 0.0630584568938232, + "grad_norm": 1.732008695602417, + "learning_rate": 3.98857621040669e-05, + "loss": 0.9671, + "step": 1295 + }, + { + "epoch": 0.0631071506829304, + "grad_norm": 2.6759822368621826, + "learning_rate": 3.9885425192965304e-05, + "loss": 0.8982, + "step": 1296 + }, + { + "epoch": 0.06315584447203759, + "grad_norm": 0.09152437001466751, + "learning_rate": 3.988508778721194e-05, + "loss": 0.6159, + "step": 1297 + }, + { + "epoch": 0.06320453826114479, + "grad_norm": 1.8812358379364014, + "learning_rate": 3.9884749886815214e-05, + "loss": 0.9491, + "step": 1298 + }, + { + "epoch": 0.06325323205025199, + "grad_norm": 1.7548471689224243, + "learning_rate": 3.988441149178352e-05, + "loss": 0.981, + "step": 1299 + }, + { + "epoch": 0.06330192583935919, + "grad_norm": 1.4924743175506592, + "learning_rate": 3.9884072602125266e-05, + "loss": 0.8958, + "step": 1300 + }, + { + "epoch": 0.06335061962846639, + "grad_norm": 2.3024861812591553, + "learning_rate": 3.98837332178489e-05, + "loss": 0.8273, + "step": 1301 + }, + { + "epoch": 0.06339931341757359, + "grad_norm": 1.675376057624817, + "learning_rate": 3.988339333896286e-05, + "loss": 0.9306, + "step": 1302 + }, + { + "epoch": 0.06344800720668078, + "grad_norm": 2.5164008140563965, + "learning_rate": 3.988305296547559e-05, + "loss": 0.8412, + "step": 1303 + }, + { + "epoch": 0.06349670099578798, + "grad_norm": 1.6492184400558472, + "learning_rate": 3.988271209739557e-05, + "loss": 1.021, + "step": 1304 + }, + { + "epoch": 0.06354539478489518, + "grad_norm": 3.066559314727783, + "learning_rate": 3.9882370734731273e-05, + "loss": 0.8586, + "step": 1305 + }, + { + "epoch": 0.06359408857400238, + "grad_norm": 1.7397115230560303, + "learning_rate": 3.9882028877491194e-05, + "loss": 0.9265, + "step": 1306 + }, + { + "epoch": 0.06364278236310958, + "grad_norm": 1.6229678392410278, + "learning_rate": 3.988168652568383e-05, + "loss": 0.9855, + "step": 1307 + }, + { + "epoch": 0.06369147615221679, + "grad_norm": 1.7877734899520874, + "learning_rate": 3.98813436793177e-05, + "loss": 0.8211, + "step": 1308 + }, + { + "epoch": 0.06374016994132399, + "grad_norm": 2.270341634750366, + "learning_rate": 3.988100033840134e-05, + "loss": 0.9968, + "step": 1309 + }, + { + "epoch": 0.06378886373043119, + "grad_norm": 2.3643479347229004, + "learning_rate": 3.9880656502943274e-05, + "loss": 0.987, + "step": 1310 + }, + { + "epoch": 0.06383755751953839, + "grad_norm": 1.9946069717407227, + "learning_rate": 3.988031217295207e-05, + "loss": 0.9732, + "step": 1311 + }, + { + "epoch": 0.06388625130864559, + "grad_norm": 1.779789924621582, + "learning_rate": 3.987996734843629e-05, + "loss": 1.0542, + "step": 1312 + }, + { + "epoch": 0.06393494509775279, + "grad_norm": 2.0077402591705322, + "learning_rate": 3.98796220294045e-05, + "loss": 0.9433, + "step": 1313 + }, + { + "epoch": 0.06398363888685998, + "grad_norm": 4.152279853820801, + "learning_rate": 3.987927621586531e-05, + "loss": 0.8968, + "step": 1314 + }, + { + "epoch": 0.06403233267596718, + "grad_norm": 1.9373815059661865, + "learning_rate": 3.987892990782731e-05, + "loss": 0.8971, + "step": 1315 + }, + { + "epoch": 0.06408102646507438, + "grad_norm": 2.0938944816589355, + "learning_rate": 3.987858310529911e-05, + "loss": 0.9097, + "step": 1316 + }, + { + "epoch": 0.06412972025418158, + "grad_norm": 1.5414518117904663, + "learning_rate": 3.987823580828935e-05, + "loss": 0.9575, + "step": 1317 + }, + { + "epoch": 0.06417841404328878, + "grad_norm": 1.7396928071975708, + "learning_rate": 3.9877888016806665e-05, + "loss": 0.978, + "step": 1318 + }, + { + "epoch": 0.06422710783239598, + "grad_norm": 2.024301290512085, + "learning_rate": 3.9877539730859694e-05, + "loss": 0.8506, + "step": 1319 + }, + { + "epoch": 0.06427580162150318, + "grad_norm": 2.654562473297119, + "learning_rate": 3.987719095045711e-05, + "loss": 0.9115, + "step": 1320 + }, + { + "epoch": 0.06432449541061037, + "grad_norm": 2.157076597213745, + "learning_rate": 3.987684167560759e-05, + "loss": 0.9082, + "step": 1321 + }, + { + "epoch": 0.06437318919971757, + "grad_norm": 3.4727072715759277, + "learning_rate": 3.9876491906319825e-05, + "loss": 0.961, + "step": 1322 + }, + { + "epoch": 0.06442188298882477, + "grad_norm": 1.8409218788146973, + "learning_rate": 3.9876141642602516e-05, + "loss": 0.8902, + "step": 1323 + }, + { + "epoch": 0.06447057677793197, + "grad_norm": 2.0496652126312256, + "learning_rate": 3.987579088446436e-05, + "loss": 0.8877, + "step": 1324 + }, + { + "epoch": 0.06451927056703917, + "grad_norm": 2.061161756515503, + "learning_rate": 3.98754396319141e-05, + "loss": 0.9668, + "step": 1325 + }, + { + "epoch": 0.06456796435614637, + "grad_norm": 2.6562438011169434, + "learning_rate": 3.9875087884960465e-05, + "loss": 0.9847, + "step": 1326 + }, + { + "epoch": 0.06461665814525357, + "grad_norm": 2.0042238235473633, + "learning_rate": 3.987473564361221e-05, + "loss": 1.0002, + "step": 1327 + }, + { + "epoch": 0.06466535193436078, + "grad_norm": 2.1422126293182373, + "learning_rate": 3.987438290787809e-05, + "loss": 0.9841, + "step": 1328 + }, + { + "epoch": 0.06471404572346798, + "grad_norm": 0.08804892748594284, + "learning_rate": 3.987402967776688e-05, + "loss": 0.6774, + "step": 1329 + }, + { + "epoch": 0.06476273951257518, + "grad_norm": 1.8558787107467651, + "learning_rate": 3.987367595328738e-05, + "loss": 0.9462, + "step": 1330 + }, + { + "epoch": 0.06481143330168238, + "grad_norm": 1.8337055444717407, + "learning_rate": 3.987332173444837e-05, + "loss": 1.0128, + "step": 1331 + }, + { + "epoch": 0.06486012709078957, + "grad_norm": 1.7356560230255127, + "learning_rate": 3.987296702125867e-05, + "loss": 0.9743, + "step": 1332 + }, + { + "epoch": 0.06490882087989677, + "grad_norm": 2.283435344696045, + "learning_rate": 3.987261181372711e-05, + "loss": 0.931, + "step": 1333 + }, + { + "epoch": 0.06495751466900397, + "grad_norm": 2.010319232940674, + "learning_rate": 3.987225611186251e-05, + "loss": 0.9138, + "step": 1334 + }, + { + "epoch": 0.06500620845811117, + "grad_norm": 1.6035798788070679, + "learning_rate": 3.987189991567373e-05, + "loss": 0.9433, + "step": 1335 + }, + { + "epoch": 0.06505490224721837, + "grad_norm": 4.150123596191406, + "learning_rate": 3.987154322516963e-05, + "loss": 0.9699, + "step": 1336 + }, + { + "epoch": 0.06510359603632557, + "grad_norm": 1.940488338470459, + "learning_rate": 3.987118604035909e-05, + "loss": 0.8942, + "step": 1337 + }, + { + "epoch": 0.06515228982543277, + "grad_norm": 2.2575442790985107, + "learning_rate": 3.987082836125097e-05, + "loss": 0.9596, + "step": 1338 + }, + { + "epoch": 0.06520098361453996, + "grad_norm": 2.0970704555511475, + "learning_rate": 3.987047018785419e-05, + "loss": 1.0305, + "step": 1339 + }, + { + "epoch": 0.06524967740364716, + "grad_norm": 3.109987258911133, + "learning_rate": 3.987011152017765e-05, + "loss": 1.0135, + "step": 1340 + }, + { + "epoch": 0.06529837119275436, + "grad_norm": 0.08258938789367676, + "learning_rate": 3.9869752358230275e-05, + "loss": 0.6629, + "step": 1341 + }, + { + "epoch": 0.06534706498186156, + "grad_norm": 1.7971018552780151, + "learning_rate": 3.9869392702020994e-05, + "loss": 0.9091, + "step": 1342 + }, + { + "epoch": 0.06539575877096876, + "grad_norm": 2.4285953044891357, + "learning_rate": 3.986903255155877e-05, + "loss": 0.9212, + "step": 1343 + }, + { + "epoch": 0.06544445256007596, + "grad_norm": 1.912719964981079, + "learning_rate": 3.9868671906852536e-05, + "loss": 1.0048, + "step": 1344 + }, + { + "epoch": 0.06549314634918316, + "grad_norm": 2.1571261882781982, + "learning_rate": 3.986831076791129e-05, + "loss": 0.9773, + "step": 1345 + }, + { + "epoch": 0.06554184013829036, + "grad_norm": 1.5238367319107056, + "learning_rate": 3.9867949134743994e-05, + "loss": 0.8404, + "step": 1346 + }, + { + "epoch": 0.06559053392739755, + "grad_norm": 2.289560556411743, + "learning_rate": 3.9867587007359655e-05, + "loss": 1.0019, + "step": 1347 + }, + { + "epoch": 0.06563922771650477, + "grad_norm": 0.08545619994401932, + "learning_rate": 3.9867224385767274e-05, + "loss": 0.577, + "step": 1348 + }, + { + "epoch": 0.06568792150561197, + "grad_norm": 1.6473093032836914, + "learning_rate": 3.986686126997588e-05, + "loss": 1.0721, + "step": 1349 + }, + { + "epoch": 0.06573661529471916, + "grad_norm": 2.365379810333252, + "learning_rate": 3.986649765999449e-05, + "loss": 0.9315, + "step": 1350 + }, + { + "epoch": 0.06578530908382636, + "grad_norm": 1.7186682224273682, + "learning_rate": 3.9866133555832166e-05, + "loss": 0.942, + "step": 1351 + }, + { + "epoch": 0.06583400287293356, + "grad_norm": 3.419349193572998, + "learning_rate": 3.986576895749796e-05, + "loss": 0.9686, + "step": 1352 + }, + { + "epoch": 0.06588269666204076, + "grad_norm": 1.831440806388855, + "learning_rate": 3.986540386500094e-05, + "loss": 0.9438, + "step": 1353 + }, + { + "epoch": 0.06593139045114796, + "grad_norm": 2.1167705059051514, + "learning_rate": 3.986503827835019e-05, + "loss": 0.8768, + "step": 1354 + }, + { + "epoch": 0.06598008424025516, + "grad_norm": 1.9682785272598267, + "learning_rate": 3.98646721975548e-05, + "loss": 0.9903, + "step": 1355 + }, + { + "epoch": 0.06602877802936236, + "grad_norm": 1.655375361442566, + "learning_rate": 3.986430562262388e-05, + "loss": 0.9464, + "step": 1356 + }, + { + "epoch": 0.06607747181846955, + "grad_norm": 1.5574262142181396, + "learning_rate": 3.986393855356654e-05, + "loss": 0.9478, + "step": 1357 + }, + { + "epoch": 0.06612616560757675, + "grad_norm": 1.762222170829773, + "learning_rate": 3.986357099039191e-05, + "loss": 0.9303, + "step": 1358 + }, + { + "epoch": 0.06617485939668395, + "grad_norm": 3.557171106338501, + "learning_rate": 3.9863202933109154e-05, + "loss": 0.9704, + "step": 1359 + }, + { + "epoch": 0.06622355318579115, + "grad_norm": 2.265258312225342, + "learning_rate": 3.986283438172741e-05, + "loss": 0.9455, + "step": 1360 + }, + { + "epoch": 0.06627224697489835, + "grad_norm": 1.6772483587265015, + "learning_rate": 3.9862465336255855e-05, + "loss": 1.0291, + "step": 1361 + }, + { + "epoch": 0.06632094076400555, + "grad_norm": 2.0517537593841553, + "learning_rate": 3.986209579670366e-05, + "loss": 0.9633, + "step": 1362 + }, + { + "epoch": 0.06636963455311275, + "grad_norm": 2.2075390815734863, + "learning_rate": 3.9861725763080015e-05, + "loss": 0.9525, + "step": 1363 + }, + { + "epoch": 0.06641832834221995, + "grad_norm": 1.9235806465148926, + "learning_rate": 3.986135523539414e-05, + "loss": 0.9342, + "step": 1364 + }, + { + "epoch": 0.06646702213132714, + "grad_norm": 1.863725185394287, + "learning_rate": 3.986098421365523e-05, + "loss": 0.9383, + "step": 1365 + }, + { + "epoch": 0.06651571592043434, + "grad_norm": 1.6338635683059692, + "learning_rate": 3.986061269787255e-05, + "loss": 0.9027, + "step": 1366 + }, + { + "epoch": 0.06656440970954154, + "grad_norm": 2.1415436267852783, + "learning_rate": 3.9860240688055296e-05, + "loss": 0.8926, + "step": 1367 + }, + { + "epoch": 0.06661310349864875, + "grad_norm": 3.404616355895996, + "learning_rate": 3.985986818421275e-05, + "loss": 0.9574, + "step": 1368 + }, + { + "epoch": 0.06666179728775595, + "grad_norm": 2.150717258453369, + "learning_rate": 3.9859495186354174e-05, + "loss": 1.0833, + "step": 1369 + }, + { + "epoch": 0.06671049107686315, + "grad_norm": 3.6969330310821533, + "learning_rate": 3.985912169448884e-05, + "loss": 0.9154, + "step": 1370 + }, + { + "epoch": 0.06675918486597035, + "grad_norm": 1.787434458732605, + "learning_rate": 3.985874770862605e-05, + "loss": 0.9538, + "step": 1371 + }, + { + "epoch": 0.06680787865507755, + "grad_norm": 1.7769776582717896, + "learning_rate": 3.98583732287751e-05, + "loss": 0.9454, + "step": 1372 + }, + { + "epoch": 0.06685657244418475, + "grad_norm": 1.8247519731521606, + "learning_rate": 3.98579982549453e-05, + "loss": 0.8949, + "step": 1373 + }, + { + "epoch": 0.06690526623329195, + "grad_norm": 1.7900925874710083, + "learning_rate": 3.9857622787145986e-05, + "loss": 0.8999, + "step": 1374 + }, + { + "epoch": 0.06695396002239914, + "grad_norm": 1.9814966917037964, + "learning_rate": 3.98572468253865e-05, + "loss": 1.0449, + "step": 1375 + }, + { + "epoch": 0.06700265381150634, + "grad_norm": 2.3213729858398438, + "learning_rate": 3.985687036967618e-05, + "loss": 0.9181, + "step": 1376 + }, + { + "epoch": 0.06705134760061354, + "grad_norm": 2.8417301177978516, + "learning_rate": 3.98564934200244e-05, + "loss": 0.9566, + "step": 1377 + }, + { + "epoch": 0.06710004138972074, + "grad_norm": 1.6795320510864258, + "learning_rate": 3.985611597644054e-05, + "loss": 1.0334, + "step": 1378 + }, + { + "epoch": 0.06714873517882794, + "grad_norm": 2.344240427017212, + "learning_rate": 3.985573803893399e-05, + "loss": 0.8765, + "step": 1379 + }, + { + "epoch": 0.06719742896793514, + "grad_norm": 1.7039798498153687, + "learning_rate": 3.985535960751413e-05, + "loss": 0.9139, + "step": 1380 + }, + { + "epoch": 0.06724612275704234, + "grad_norm": 1.5991291999816895, + "learning_rate": 3.9854980682190404e-05, + "loss": 0.998, + "step": 1381 + }, + { + "epoch": 0.06729481654614954, + "grad_norm": 3.504502058029175, + "learning_rate": 3.985460126297221e-05, + "loss": 0.9231, + "step": 1382 + }, + { + "epoch": 0.06734351033525673, + "grad_norm": 1.7792129516601562, + "learning_rate": 3.985422134986901e-05, + "loss": 0.9538, + "step": 1383 + }, + { + "epoch": 0.06739220412436393, + "grad_norm": 2.2459421157836914, + "learning_rate": 3.985384094289024e-05, + "loss": 0.9077, + "step": 1384 + }, + { + "epoch": 0.06744089791347113, + "grad_norm": 2.3492653369903564, + "learning_rate": 3.9853460042045365e-05, + "loss": 0.8277, + "step": 1385 + }, + { + "epoch": 0.06748959170257833, + "grad_norm": 4.5477094650268555, + "learning_rate": 3.9853078647343866e-05, + "loss": 0.9224, + "step": 1386 + }, + { + "epoch": 0.06753828549168553, + "grad_norm": 2.049046277999878, + "learning_rate": 3.985269675879522e-05, + "loss": 0.8722, + "step": 1387 + }, + { + "epoch": 0.06758697928079274, + "grad_norm": 1.3792240619659424, + "learning_rate": 3.9852314376408936e-05, + "loss": 0.9627, + "step": 1388 + }, + { + "epoch": 0.06763567306989994, + "grad_norm": 1.5248092412948608, + "learning_rate": 3.985193150019452e-05, + "loss": 0.9098, + "step": 1389 + }, + { + "epoch": 0.06768436685900714, + "grad_norm": 3.470261573791504, + "learning_rate": 3.9851548130161494e-05, + "loss": 0.9316, + "step": 1390 + }, + { + "epoch": 0.06773306064811434, + "grad_norm": 2.5862174034118652, + "learning_rate": 3.985116426631941e-05, + "loss": 1.0046, + "step": 1391 + }, + { + "epoch": 0.06778175443722154, + "grad_norm": 2.5430338382720947, + "learning_rate": 3.985077990867779e-05, + "loss": 0.9788, + "step": 1392 + }, + { + "epoch": 0.06783044822632874, + "grad_norm": 2.3784492015838623, + "learning_rate": 3.985039505724622e-05, + "loss": 0.9461, + "step": 1393 + }, + { + "epoch": 0.06787914201543593, + "grad_norm": 1.5657154321670532, + "learning_rate": 3.985000971203426e-05, + "loss": 0.9411, + "step": 1394 + }, + { + "epoch": 0.06792783580454313, + "grad_norm": 2.2090206146240234, + "learning_rate": 3.98496238730515e-05, + "loss": 0.9618, + "step": 1395 + }, + { + "epoch": 0.06797652959365033, + "grad_norm": 2.1874918937683105, + "learning_rate": 3.984923754030753e-05, + "loss": 0.9864, + "step": 1396 + }, + { + "epoch": 0.06802522338275753, + "grad_norm": 1.922232985496521, + "learning_rate": 3.984885071381198e-05, + "loss": 0.9678, + "step": 1397 + }, + { + "epoch": 0.06807391717186473, + "grad_norm": 2.0655930042266846, + "learning_rate": 3.984846339357445e-05, + "loss": 0.9193, + "step": 1398 + }, + { + "epoch": 0.06812261096097193, + "grad_norm": 2.831402540206909, + "learning_rate": 3.9848075579604575e-05, + "loss": 0.9694, + "step": 1399 + }, + { + "epoch": 0.06817130475007913, + "grad_norm": 1.631203293800354, + "learning_rate": 3.984768727191203e-05, + "loss": 0.9581, + "step": 1400 + }, + { + "epoch": 0.06821999853918632, + "grad_norm": 2.5365755558013916, + "learning_rate": 3.984729847050644e-05, + "loss": 0.9299, + "step": 1401 + }, + { + "epoch": 0.06826869232829352, + "grad_norm": 1.9931282997131348, + "learning_rate": 3.984690917539749e-05, + "loss": 1.0166, + "step": 1402 + }, + { + "epoch": 0.06831738611740072, + "grad_norm": 2.0627691745758057, + "learning_rate": 3.984651938659487e-05, + "loss": 0.9089, + "step": 1403 + }, + { + "epoch": 0.06836607990650792, + "grad_norm": 2.1400818824768066, + "learning_rate": 3.984612910410827e-05, + "loss": 0.9268, + "step": 1404 + }, + { + "epoch": 0.06841477369561512, + "grad_norm": 0.08511119335889816, + "learning_rate": 3.98457383279474e-05, + "loss": 0.6471, + "step": 1405 + }, + { + "epoch": 0.06846346748472232, + "grad_norm": 1.8158535957336426, + "learning_rate": 3.9845347058121976e-05, + "loss": 0.8938, + "step": 1406 + }, + { + "epoch": 0.06851216127382952, + "grad_norm": 2.005856513977051, + "learning_rate": 3.984495529464174e-05, + "loss": 0.9274, + "step": 1407 + }, + { + "epoch": 0.06856085506293673, + "grad_norm": 2.1389708518981934, + "learning_rate": 3.9844563037516424e-05, + "loss": 1.0025, + "step": 1408 + }, + { + "epoch": 0.06860954885204393, + "grad_norm": 2.187363624572754, + "learning_rate": 3.98441702867558e-05, + "loss": 0.8586, + "step": 1409 + }, + { + "epoch": 0.06865824264115113, + "grad_norm": 1.9013200998306274, + "learning_rate": 3.984377704236963e-05, + "loss": 0.8718, + "step": 1410 + }, + { + "epoch": 0.06870693643025833, + "grad_norm": 4.587543487548828, + "learning_rate": 3.984338330436769e-05, + "loss": 0.948, + "step": 1411 + }, + { + "epoch": 0.06875563021936552, + "grad_norm": 1.7111835479736328, + "learning_rate": 3.984298907275979e-05, + "loss": 0.9454, + "step": 1412 + }, + { + "epoch": 0.06880432400847272, + "grad_norm": 1.684738278388977, + "learning_rate": 3.984259434755572e-05, + "loss": 0.9901, + "step": 1413 + }, + { + "epoch": 0.06885301779757992, + "grad_norm": 2.8962390422821045, + "learning_rate": 3.984219912876532e-05, + "loss": 1.0825, + "step": 1414 + }, + { + "epoch": 0.06890171158668712, + "grad_norm": 1.6918342113494873, + "learning_rate": 3.9841803416398385e-05, + "loss": 1.0092, + "step": 1415 + }, + { + "epoch": 0.06895040537579432, + "grad_norm": 3.1710972785949707, + "learning_rate": 3.98414072104648e-05, + "loss": 0.9376, + "step": 1416 + }, + { + "epoch": 0.06899909916490152, + "grad_norm": 5.328662872314453, + "learning_rate": 3.9841010510974395e-05, + "loss": 0.9422, + "step": 1417 + }, + { + "epoch": 0.06904779295400872, + "grad_norm": 1.5668474435806274, + "learning_rate": 3.9840613317937044e-05, + "loss": 0.8752, + "step": 1418 + }, + { + "epoch": 0.06909648674311591, + "grad_norm": 1.5692659616470337, + "learning_rate": 3.9840215631362624e-05, + "loss": 0.9975, + "step": 1419 + }, + { + "epoch": 0.06914518053222311, + "grad_norm": 1.900888204574585, + "learning_rate": 3.983981745126104e-05, + "loss": 0.825, + "step": 1420 + }, + { + "epoch": 0.06919387432133031, + "grad_norm": 2.178093671798706, + "learning_rate": 3.9839418777642185e-05, + "loss": 0.9116, + "step": 1421 + }, + { + "epoch": 0.06924256811043751, + "grad_norm": 2.726824998855591, + "learning_rate": 3.983901961051598e-05, + "loss": 1.0003, + "step": 1422 + }, + { + "epoch": 0.06929126189954471, + "grad_norm": 1.985106348991394, + "learning_rate": 3.9838619949892346e-05, + "loss": 0.9278, + "step": 1423 + }, + { + "epoch": 0.06933995568865191, + "grad_norm": 2.5491840839385986, + "learning_rate": 3.983821979578124e-05, + "loss": 0.9158, + "step": 1424 + }, + { + "epoch": 0.0693886494777591, + "grad_norm": 2.1859848499298096, + "learning_rate": 3.98378191481926e-05, + "loss": 0.9373, + "step": 1425 + }, + { + "epoch": 0.0694373432668663, + "grad_norm": 2.122009754180908, + "learning_rate": 3.983741800713641e-05, + "loss": 0.9532, + "step": 1426 + }, + { + "epoch": 0.0694860370559735, + "grad_norm": 2.8990023136138916, + "learning_rate": 3.983701637262263e-05, + "loss": 1.0467, + "step": 1427 + }, + { + "epoch": 0.06953473084508072, + "grad_norm": 2.4651505947113037, + "learning_rate": 3.983661424466126e-05, + "loss": 1.0684, + "step": 1428 + }, + { + "epoch": 0.06958342463418792, + "grad_norm": 3.0970277786254883, + "learning_rate": 3.9836211623262306e-05, + "loss": 0.9737, + "step": 1429 + }, + { + "epoch": 0.06963211842329511, + "grad_norm": 2.0433144569396973, + "learning_rate": 3.983580850843578e-05, + "loss": 0.9063, + "step": 1430 + }, + { + "epoch": 0.06968081221240231, + "grad_norm": 3.079200506210327, + "learning_rate": 3.9835404900191704e-05, + "loss": 0.9538, + "step": 1431 + }, + { + "epoch": 0.06972950600150951, + "grad_norm": 2.1079633235931396, + "learning_rate": 3.983500079854013e-05, + "loss": 0.9708, + "step": 1432 + }, + { + "epoch": 0.06977819979061671, + "grad_norm": 2.6690618991851807, + "learning_rate": 3.983459620349109e-05, + "loss": 1.0275, + "step": 1433 + }, + { + "epoch": 0.06982689357972391, + "grad_norm": 1.9216381311416626, + "learning_rate": 3.983419111505467e-05, + "loss": 0.9385, + "step": 1434 + }, + { + "epoch": 0.06987558736883111, + "grad_norm": 1.7890269756317139, + "learning_rate": 3.9833785533240936e-05, + "loss": 0.9332, + "step": 1435 + }, + { + "epoch": 0.0699242811579383, + "grad_norm": 3.1389517784118652, + "learning_rate": 3.983337945805998e-05, + "loss": 0.9623, + "step": 1436 + }, + { + "epoch": 0.0699729749470455, + "grad_norm": 2.809044361114502, + "learning_rate": 3.98329728895219e-05, + "loss": 0.868, + "step": 1437 + }, + { + "epoch": 0.0700216687361527, + "grad_norm": 1.9997198581695557, + "learning_rate": 3.9832565827636814e-05, + "loss": 0.96, + "step": 1438 + }, + { + "epoch": 0.0700703625252599, + "grad_norm": 2.324995994567871, + "learning_rate": 3.983215827241484e-05, + "loss": 0.9372, + "step": 1439 + }, + { + "epoch": 0.0701190563143671, + "grad_norm": 1.8339589834213257, + "learning_rate": 3.983175022386612e-05, + "loss": 0.8838, + "step": 1440 + }, + { + "epoch": 0.0701677501034743, + "grad_norm": 2.104477882385254, + "learning_rate": 3.9831341682000813e-05, + "loss": 0.9548, + "step": 1441 + }, + { + "epoch": 0.0702164438925815, + "grad_norm": 1.8045109510421753, + "learning_rate": 3.9830932646829066e-05, + "loss": 0.9821, + "step": 1442 + }, + { + "epoch": 0.0702651376816887, + "grad_norm": 1.906075358390808, + "learning_rate": 3.9830523118361064e-05, + "loss": 0.9282, + "step": 1443 + }, + { + "epoch": 0.0703138314707959, + "grad_norm": 1.7439191341400146, + "learning_rate": 3.983011309660699e-05, + "loss": 0.9926, + "step": 1444 + }, + { + "epoch": 0.0703625252599031, + "grad_norm": 2.3277804851531982, + "learning_rate": 3.9829702581577045e-05, + "loss": 0.8623, + "step": 1445 + }, + { + "epoch": 0.07041121904901029, + "grad_norm": 2.0107693672180176, + "learning_rate": 3.9829291573281443e-05, + "loss": 0.8504, + "step": 1446 + }, + { + "epoch": 0.07045991283811749, + "grad_norm": 2.4310808181762695, + "learning_rate": 3.98288800717304e-05, + "loss": 0.9782, + "step": 1447 + }, + { + "epoch": 0.0705086066272247, + "grad_norm": 2.4511759281158447, + "learning_rate": 3.9828468076934165e-05, + "loss": 0.92, + "step": 1448 + }, + { + "epoch": 0.0705573004163319, + "grad_norm": 1.799453854560852, + "learning_rate": 3.982805558890297e-05, + "loss": 0.91, + "step": 1449 + }, + { + "epoch": 0.0706059942054391, + "grad_norm": 2.8741161823272705, + "learning_rate": 3.982764260764709e-05, + "loss": 0.9539, + "step": 1450 + }, + { + "epoch": 0.0706546879945463, + "grad_norm": 1.931865930557251, + "learning_rate": 3.98272291331768e-05, + "loss": 0.9874, + "step": 1451 + }, + { + "epoch": 0.0707033817836535, + "grad_norm": 1.71570885181427, + "learning_rate": 3.982681516550236e-05, + "loss": 0.9086, + "step": 1452 + }, + { + "epoch": 0.0707520755727607, + "grad_norm": 3.982764482498169, + "learning_rate": 3.9826400704634094e-05, + "loss": 0.9984, + "step": 1453 + }, + { + "epoch": 0.0708007693618679, + "grad_norm": 2.045637369155884, + "learning_rate": 3.98259857505823e-05, + "loss": 0.9072, + "step": 1454 + }, + { + "epoch": 0.0708494631509751, + "grad_norm": 1.8974672555923462, + "learning_rate": 3.9825570303357306e-05, + "loss": 0.8946, + "step": 1455 + }, + { + "epoch": 0.0708981569400823, + "grad_norm": 2.665538787841797, + "learning_rate": 3.982515436296944e-05, + "loss": 0.992, + "step": 1456 + }, + { + "epoch": 0.07094685072918949, + "grad_norm": 1.5727874040603638, + "learning_rate": 3.982473792942905e-05, + "loss": 0.9735, + "step": 1457 + }, + { + "epoch": 0.07099554451829669, + "grad_norm": 2.2023580074310303, + "learning_rate": 3.98243210027465e-05, + "loss": 1.0535, + "step": 1458 + }, + { + "epoch": 0.07104423830740389, + "grad_norm": 1.974673867225647, + "learning_rate": 3.9823903582932165e-05, + "loss": 0.9053, + "step": 1459 + }, + { + "epoch": 0.07109293209651109, + "grad_norm": 2.4472150802612305, + "learning_rate": 3.9823485669996415e-05, + "loss": 0.8333, + "step": 1460 + }, + { + "epoch": 0.07114162588561829, + "grad_norm": 2.0355637073516846, + "learning_rate": 3.982306726394965e-05, + "loss": 0.9503, + "step": 1461 + }, + { + "epoch": 0.07119031967472549, + "grad_norm": 1.8428298234939575, + "learning_rate": 3.9822648364802276e-05, + "loss": 0.9604, + "step": 1462 + }, + { + "epoch": 0.07123901346383268, + "grad_norm": 2.1352691650390625, + "learning_rate": 3.9822228972564724e-05, + "loss": 0.9296, + "step": 1463 + }, + { + "epoch": 0.07128770725293988, + "grad_norm": 2.212313652038574, + "learning_rate": 3.982180908724741e-05, + "loss": 0.973, + "step": 1464 + }, + { + "epoch": 0.07133640104204708, + "grad_norm": 1.685816764831543, + "learning_rate": 3.98213887088608e-05, + "loss": 0.9654, + "step": 1465 + }, + { + "epoch": 0.07138509483115428, + "grad_norm": 4.495471954345703, + "learning_rate": 3.982096783741533e-05, + "loss": 0.8747, + "step": 1466 + }, + { + "epoch": 0.07143378862026148, + "grad_norm": 1.5522643327713013, + "learning_rate": 3.9820546472921475e-05, + "loss": 0.9622, + "step": 1467 + }, + { + "epoch": 0.07148248240936869, + "grad_norm": 2.401439666748047, + "learning_rate": 3.9820124615389725e-05, + "loss": 0.9133, + "step": 1468 + }, + { + "epoch": 0.07153117619847589, + "grad_norm": 2.3015997409820557, + "learning_rate": 3.981970226483056e-05, + "loss": 0.873, + "step": 1469 + }, + { + "epoch": 0.07157986998758309, + "grad_norm": 1.8968604803085327, + "learning_rate": 3.9819279421254505e-05, + "loss": 0.9723, + "step": 1470 + }, + { + "epoch": 0.07162856377669029, + "grad_norm": 1.5844018459320068, + "learning_rate": 3.981885608467206e-05, + "loss": 0.9659, + "step": 1471 + }, + { + "epoch": 0.07167725756579749, + "grad_norm": 1.625347375869751, + "learning_rate": 3.981843225509377e-05, + "loss": 0.998, + "step": 1472 + }, + { + "epoch": 0.07172595135490469, + "grad_norm": 1.5366250276565552, + "learning_rate": 3.9818007932530166e-05, + "loss": 0.9251, + "step": 1473 + }, + { + "epoch": 0.07177464514401188, + "grad_norm": 2.192600965499878, + "learning_rate": 3.981758311699181e-05, + "loss": 1.0035, + "step": 1474 + }, + { + "epoch": 0.07182333893311908, + "grad_norm": 1.9463640451431274, + "learning_rate": 3.9817157808489267e-05, + "loss": 0.9337, + "step": 1475 + }, + { + "epoch": 0.07187203272222628, + "grad_norm": 1.7505476474761963, + "learning_rate": 3.981673200703311e-05, + "loss": 1.015, + "step": 1476 + }, + { + "epoch": 0.07192072651133348, + "grad_norm": 2.0478203296661377, + "learning_rate": 3.9816305712633946e-05, + "loss": 0.9713, + "step": 1477 + }, + { + "epoch": 0.07196942030044068, + "grad_norm": 3.282183885574341, + "learning_rate": 3.981587892530236e-05, + "loss": 0.9069, + "step": 1478 + }, + { + "epoch": 0.07201811408954788, + "grad_norm": 3.0941929817199707, + "learning_rate": 3.981545164504899e-05, + "loss": 0.8026, + "step": 1479 + }, + { + "epoch": 0.07206680787865508, + "grad_norm": 1.654505968093872, + "learning_rate": 3.9815023871884454e-05, + "loss": 1.0405, + "step": 1480 + }, + { + "epoch": 0.07211550166776227, + "grad_norm": 2.3501365184783936, + "learning_rate": 3.981459560581939e-05, + "loss": 0.9179, + "step": 1481 + }, + { + "epoch": 0.07216419545686947, + "grad_norm": 1.5979043245315552, + "learning_rate": 3.9814166846864445e-05, + "loss": 0.9434, + "step": 1482 + }, + { + "epoch": 0.07221288924597667, + "grad_norm": 2.7889842987060547, + "learning_rate": 3.98137375950303e-05, + "loss": 0.895, + "step": 1483 + }, + { + "epoch": 0.07226158303508387, + "grad_norm": 2.1720592975616455, + "learning_rate": 3.981330785032763e-05, + "loss": 1.0415, + "step": 1484 + }, + { + "epoch": 0.07231027682419107, + "grad_norm": 1.6934711933135986, + "learning_rate": 3.981287761276711e-05, + "loss": 0.9184, + "step": 1485 + }, + { + "epoch": 0.07235897061329827, + "grad_norm": 1.4388458728790283, + "learning_rate": 3.9812446882359464e-05, + "loss": 0.9847, + "step": 1486 + }, + { + "epoch": 0.07240766440240547, + "grad_norm": 1.806349515914917, + "learning_rate": 3.9812015659115387e-05, + "loss": 0.9218, + "step": 1487 + }, + { + "epoch": 0.07245635819151268, + "grad_norm": 2.801020622253418, + "learning_rate": 3.9811583943045626e-05, + "loss": 0.9362, + "step": 1488 + }, + { + "epoch": 0.07250505198061988, + "grad_norm": 1.9960479736328125, + "learning_rate": 3.9811151734160896e-05, + "loss": 0.9282, + "step": 1489 + }, + { + "epoch": 0.07255374576972708, + "grad_norm": 2.1950747966766357, + "learning_rate": 3.9810719032471964e-05, + "loss": 0.974, + "step": 1490 + }, + { + "epoch": 0.07260243955883428, + "grad_norm": 1.8153836727142334, + "learning_rate": 3.981028583798959e-05, + "loss": 0.9198, + "step": 1491 + }, + { + "epoch": 0.07265113334794147, + "grad_norm": 1.6322532892227173, + "learning_rate": 3.980985215072455e-05, + "loss": 0.8828, + "step": 1492 + }, + { + "epoch": 0.07269982713704867, + "grad_norm": 1.8354130983352661, + "learning_rate": 3.9809417970687627e-05, + "loss": 0.9158, + "step": 1493 + }, + { + "epoch": 0.07274852092615587, + "grad_norm": 2.985565185546875, + "learning_rate": 3.9808983297889634e-05, + "loss": 0.95, + "step": 1494 + }, + { + "epoch": 0.07279721471526307, + "grad_norm": 1.5564452409744263, + "learning_rate": 3.980854813234137e-05, + "loss": 0.9343, + "step": 1495 + }, + { + "epoch": 0.07284590850437027, + "grad_norm": 1.8146610260009766, + "learning_rate": 3.9808112474053667e-05, + "loss": 0.9198, + "step": 1496 + }, + { + "epoch": 0.07289460229347747, + "grad_norm": 2.4224729537963867, + "learning_rate": 3.980767632303736e-05, + "loss": 0.9166, + "step": 1497 + }, + { + "epoch": 0.07294329608258467, + "grad_norm": 1.6419419050216675, + "learning_rate": 3.98072396793033e-05, + "loss": 0.8925, + "step": 1498 + }, + { + "epoch": 0.07299198987169186, + "grad_norm": 2.0063138008117676, + "learning_rate": 3.9806802542862344e-05, + "loss": 1.0439, + "step": 1499 + }, + { + "epoch": 0.07304068366079906, + "grad_norm": 1.6311625242233276, + "learning_rate": 3.980636491372537e-05, + "loss": 0.8891, + "step": 1500 + }, + { + "epoch": 0.07308937744990626, + "grad_norm": 2.114464521408081, + "learning_rate": 3.980592679190326e-05, + "loss": 0.9148, + "step": 1501 + }, + { + "epoch": 0.07313807123901346, + "grad_norm": 2.370678424835205, + "learning_rate": 3.980548817740692e-05, + "loss": 0.8894, + "step": 1502 + }, + { + "epoch": 0.07318676502812066, + "grad_norm": 2.198432207107544, + "learning_rate": 3.980504907024726e-05, + "loss": 0.92, + "step": 1503 + }, + { + "epoch": 0.07323545881722786, + "grad_norm": 2.2169787883758545, + "learning_rate": 3.980460947043519e-05, + "loss": 0.9486, + "step": 1504 + }, + { + "epoch": 0.07328415260633506, + "grad_norm": 1.7092363834381104, + "learning_rate": 3.980416937798165e-05, + "loss": 0.9697, + "step": 1505 + }, + { + "epoch": 0.07333284639544226, + "grad_norm": 2.8468704223632812, + "learning_rate": 3.98037287928976e-05, + "loss": 0.8981, + "step": 1506 + }, + { + "epoch": 0.07338154018454945, + "grad_norm": 2.363081693649292, + "learning_rate": 3.980328771519399e-05, + "loss": 0.877, + "step": 1507 + }, + { + "epoch": 0.07343023397365667, + "grad_norm": 1.7570911645889282, + "learning_rate": 3.980284614488179e-05, + "loss": 0.9036, + "step": 1508 + }, + { + "epoch": 0.07347892776276387, + "grad_norm": 0.08778484165668488, + "learning_rate": 3.980240408197198e-05, + "loss": 0.6732, + "step": 1509 + }, + { + "epoch": 0.07352762155187106, + "grad_norm": 2.8274424076080322, + "learning_rate": 3.9801961526475574e-05, + "loss": 0.9588, + "step": 1510 + }, + { + "epoch": 0.07357631534097826, + "grad_norm": 2.6947689056396484, + "learning_rate": 3.980151847840357e-05, + "loss": 0.9406, + "step": 1511 + }, + { + "epoch": 0.07362500913008546, + "grad_norm": 1.7072992324829102, + "learning_rate": 3.9801074937766984e-05, + "loss": 0.8784, + "step": 1512 + }, + { + "epoch": 0.07367370291919266, + "grad_norm": 1.5893027782440186, + "learning_rate": 3.980063090457685e-05, + "loss": 0.9655, + "step": 1513 + }, + { + "epoch": 0.07372239670829986, + "grad_norm": 1.810792088508606, + "learning_rate": 3.980018637884422e-05, + "loss": 0.835, + "step": 1514 + }, + { + "epoch": 0.07377109049740706, + "grad_norm": 1.8907088041305542, + "learning_rate": 3.979974136058015e-05, + "loss": 0.9675, + "step": 1515 + }, + { + "epoch": 0.07381978428651426, + "grad_norm": 1.9259780645370483, + "learning_rate": 3.979929584979571e-05, + "loss": 1.0028, + "step": 1516 + }, + { + "epoch": 0.07386847807562145, + "grad_norm": 2.5160505771636963, + "learning_rate": 3.9798849846501976e-05, + "loss": 0.8922, + "step": 1517 + }, + { + "epoch": 0.07391717186472865, + "grad_norm": 0.09102759510278702, + "learning_rate": 3.9798403350710044e-05, + "loss": 0.5805, + "step": 1518 + }, + { + "epoch": 0.07396586565383585, + "grad_norm": 1.8620085716247559, + "learning_rate": 3.979795636243103e-05, + "loss": 0.8887, + "step": 1519 + }, + { + "epoch": 0.07401455944294305, + "grad_norm": 1.718292474746704, + "learning_rate": 3.979750888167605e-05, + "loss": 1.0087, + "step": 1520 + }, + { + "epoch": 0.07406325323205025, + "grad_norm": 4.3682169914245605, + "learning_rate": 3.979706090845622e-05, + "loss": 0.9598, + "step": 1521 + }, + { + "epoch": 0.07411194702115745, + "grad_norm": 1.5964399576187134, + "learning_rate": 3.97966124427827e-05, + "loss": 1.0164, + "step": 1522 + }, + { + "epoch": 0.07416064081026465, + "grad_norm": 2.53011155128479, + "learning_rate": 3.9796163484666644e-05, + "loss": 0.8425, + "step": 1523 + }, + { + "epoch": 0.07420933459937185, + "grad_norm": 1.5396138429641724, + "learning_rate": 3.979571403411921e-05, + "loss": 0.9401, + "step": 1524 + }, + { + "epoch": 0.07425802838847904, + "grad_norm": 2.547114610671997, + "learning_rate": 3.979526409115159e-05, + "loss": 0.9748, + "step": 1525 + }, + { + "epoch": 0.07430672217758624, + "grad_norm": 2.359174966812134, + "learning_rate": 3.979481365577497e-05, + "loss": 0.8552, + "step": 1526 + }, + { + "epoch": 0.07435541596669344, + "grad_norm": 0.08367882668972015, + "learning_rate": 3.9794362728000555e-05, + "loss": 0.5467, + "step": 1527 + }, + { + "epoch": 0.07440410975580065, + "grad_norm": 1.7064436674118042, + "learning_rate": 3.979391130783956e-05, + "loss": 0.941, + "step": 1528 + }, + { + "epoch": 0.07445280354490785, + "grad_norm": 1.9163073301315308, + "learning_rate": 3.979345939530322e-05, + "loss": 0.9118, + "step": 1529 + }, + { + "epoch": 0.07450149733401505, + "grad_norm": 3.5725717544555664, + "learning_rate": 3.9793006990402774e-05, + "loss": 0.9741, + "step": 1530 + }, + { + "epoch": 0.07455019112312225, + "grad_norm": 3.734622001647949, + "learning_rate": 3.979255409314947e-05, + "loss": 0.9413, + "step": 1531 + }, + { + "epoch": 0.07459888491222945, + "grad_norm": 2.0407352447509766, + "learning_rate": 3.979210070355458e-05, + "loss": 0.9626, + "step": 1532 + }, + { + "epoch": 0.07464757870133665, + "grad_norm": 2.908519983291626, + "learning_rate": 3.979164682162938e-05, + "loss": 0.8974, + "step": 1533 + }, + { + "epoch": 0.07469627249044385, + "grad_norm": 1.469390630722046, + "learning_rate": 3.979119244738516e-05, + "loss": 0.9371, + "step": 1534 + }, + { + "epoch": 0.07474496627955104, + "grad_norm": 0.08797211945056915, + "learning_rate": 3.9790737580833226e-05, + "loss": 0.703, + "step": 1535 + }, + { + "epoch": 0.07479366006865824, + "grad_norm": 2.0115718841552734, + "learning_rate": 3.979028222198489e-05, + "loss": 0.925, + "step": 1536 + }, + { + "epoch": 0.07484235385776544, + "grad_norm": 1.5705665349960327, + "learning_rate": 3.9789826370851476e-05, + "loss": 0.8972, + "step": 1537 + }, + { + "epoch": 0.07489104764687264, + "grad_norm": 4.229894161224365, + "learning_rate": 3.978937002744433e-05, + "loss": 0.9319, + "step": 1538 + }, + { + "epoch": 0.07493974143597984, + "grad_norm": 2.1103556156158447, + "learning_rate": 3.97889131917748e-05, + "loss": 0.9208, + "step": 1539 + }, + { + "epoch": 0.07498843522508704, + "grad_norm": 1.9161006212234497, + "learning_rate": 3.9788455863854246e-05, + "loss": 0.9727, + "step": 1540 + }, + { + "epoch": 0.07503712901419424, + "grad_norm": 2.8551223278045654, + "learning_rate": 3.978799804369405e-05, + "loss": 0.9582, + "step": 1541 + }, + { + "epoch": 0.07508582280330144, + "grad_norm": 1.8952287435531616, + "learning_rate": 3.9787539731305605e-05, + "loss": 0.9135, + "step": 1542 + }, + { + "epoch": 0.07513451659240863, + "grad_norm": 2.6695945262908936, + "learning_rate": 3.97870809267003e-05, + "loss": 0.9674, + "step": 1543 + }, + { + "epoch": 0.07518321038151583, + "grad_norm": 1.8248817920684814, + "learning_rate": 3.978662162988955e-05, + "loss": 0.9414, + "step": 1544 + }, + { + "epoch": 0.07523190417062303, + "grad_norm": 2.2290585041046143, + "learning_rate": 3.9786161840884784e-05, + "loss": 0.8147, + "step": 1545 + }, + { + "epoch": 0.07528059795973023, + "grad_norm": 1.6257864236831665, + "learning_rate": 3.978570155969745e-05, + "loss": 0.9615, + "step": 1546 + }, + { + "epoch": 0.07532929174883743, + "grad_norm": 1.591310739517212, + "learning_rate": 3.9785240786338973e-05, + "loss": 0.9408, + "step": 1547 + }, + { + "epoch": 0.07537798553794464, + "grad_norm": 3.9384448528289795, + "learning_rate": 3.978477952082083e-05, + "loss": 0.9735, + "step": 1548 + }, + { + "epoch": 0.07542667932705184, + "grad_norm": 1.7417271137237549, + "learning_rate": 3.978431776315449e-05, + "loss": 1.0042, + "step": 1549 + }, + { + "epoch": 0.07547537311615904, + "grad_norm": 1.8124054670333862, + "learning_rate": 3.978385551335145e-05, + "loss": 0.9463, + "step": 1550 + }, + { + "epoch": 0.07552406690526624, + "grad_norm": 1.592585563659668, + "learning_rate": 3.978339277142319e-05, + "loss": 0.9152, + "step": 1551 + }, + { + "epoch": 0.07557276069437344, + "grad_norm": 1.6417580842971802, + "learning_rate": 3.978292953738123e-05, + "loss": 1.0221, + "step": 1552 + }, + { + "epoch": 0.07562145448348064, + "grad_norm": 1.494549036026001, + "learning_rate": 3.97824658112371e-05, + "loss": 1.0154, + "step": 1553 + }, + { + "epoch": 0.07567014827258783, + "grad_norm": 1.5763605833053589, + "learning_rate": 3.978200159300233e-05, + "loss": 0.9651, + "step": 1554 + }, + { + "epoch": 0.07571884206169503, + "grad_norm": 2.1900699138641357, + "learning_rate": 3.9781536882688464e-05, + "loss": 0.9052, + "step": 1555 + }, + { + "epoch": 0.07576753585080223, + "grad_norm": 1.8594640493392944, + "learning_rate": 3.9781071680307066e-05, + "loss": 1.021, + "step": 1556 + }, + { + "epoch": 0.07581622963990943, + "grad_norm": 0.09507814794778824, + "learning_rate": 3.978060598586971e-05, + "loss": 0.5663, + "step": 1557 + }, + { + "epoch": 0.07586492342901663, + "grad_norm": 1.5318063497543335, + "learning_rate": 3.9780139799387976e-05, + "loss": 0.8835, + "step": 1558 + }, + { + "epoch": 0.07591361721812383, + "grad_norm": 1.6417474746704102, + "learning_rate": 3.977967312087345e-05, + "loss": 0.9598, + "step": 1559 + }, + { + "epoch": 0.07596231100723103, + "grad_norm": 1.808650255203247, + "learning_rate": 3.9779205950337764e-05, + "loss": 0.987, + "step": 1560 + }, + { + "epoch": 0.07601100479633822, + "grad_norm": 2.359060287475586, + "learning_rate": 3.9778738287792525e-05, + "loss": 0.8389, + "step": 1561 + }, + { + "epoch": 0.07605969858544542, + "grad_norm": 2.1933836936950684, + "learning_rate": 3.9778270133249365e-05, + "loss": 0.9096, + "step": 1562 + }, + { + "epoch": 0.07610839237455262, + "grad_norm": 1.7976137399673462, + "learning_rate": 3.977780148671993e-05, + "loss": 0.9529, + "step": 1563 + }, + { + "epoch": 0.07615708616365982, + "grad_norm": 2.047119140625, + "learning_rate": 3.977733234821588e-05, + "loss": 0.9122, + "step": 1564 + }, + { + "epoch": 0.07620577995276702, + "grad_norm": 1.625171422958374, + "learning_rate": 3.9776862717748883e-05, + "loss": 0.9345, + "step": 1565 + }, + { + "epoch": 0.07625447374187422, + "grad_norm": 0.08886029571294785, + "learning_rate": 3.977639259533063e-05, + "loss": 0.6029, + "step": 1566 + }, + { + "epoch": 0.07630316753098142, + "grad_norm": 4.190988063812256, + "learning_rate": 3.977592198097281e-05, + "loss": 0.8805, + "step": 1567 + }, + { + "epoch": 0.07635186132008863, + "grad_norm": 2.801846504211426, + "learning_rate": 3.977545087468712e-05, + "loss": 0.9461, + "step": 1568 + }, + { + "epoch": 0.07640055510919583, + "grad_norm": 1.6550158262252808, + "learning_rate": 3.977497927648528e-05, + "loss": 0.9424, + "step": 1569 + }, + { + "epoch": 0.07644924889830303, + "grad_norm": 1.7448582649230957, + "learning_rate": 3.9774507186379045e-05, + "loss": 0.997, + "step": 1570 + }, + { + "epoch": 0.07649794268741023, + "grad_norm": 2.495096445083618, + "learning_rate": 3.977403460438013e-05, + "loss": 1.0055, + "step": 1571 + }, + { + "epoch": 0.07654663647651742, + "grad_norm": 1.5802537202835083, + "learning_rate": 3.97735615305003e-05, + "loss": 0.9432, + "step": 1572 + }, + { + "epoch": 0.07659533026562462, + "grad_norm": 1.5997530221939087, + "learning_rate": 3.977308796475133e-05, + "loss": 0.8417, + "step": 1573 + }, + { + "epoch": 0.07664402405473182, + "grad_norm": 2.0089731216430664, + "learning_rate": 3.9772613907144987e-05, + "loss": 0.9261, + "step": 1574 + }, + { + "epoch": 0.07669271784383902, + "grad_norm": 3.482668399810791, + "learning_rate": 3.9772139357693074e-05, + "loss": 0.9139, + "step": 1575 + }, + { + "epoch": 0.07674141163294622, + "grad_norm": 0.08593886345624924, + "learning_rate": 3.9771664316407395e-05, + "loss": 0.6703, + "step": 1576 + }, + { + "epoch": 0.07679010542205342, + "grad_norm": 1.9485174417495728, + "learning_rate": 3.977118878329976e-05, + "loss": 0.8359, + "step": 1577 + }, + { + "epoch": 0.07683879921116062, + "grad_norm": 3.278095006942749, + "learning_rate": 3.9770712758382e-05, + "loss": 0.8942, + "step": 1578 + }, + { + "epoch": 0.07688749300026781, + "grad_norm": 1.938035249710083, + "learning_rate": 3.977023624166596e-05, + "loss": 0.9878, + "step": 1579 + }, + { + "epoch": 0.07693618678937501, + "grad_norm": 2.2370169162750244, + "learning_rate": 3.976975923316349e-05, + "loss": 1.0576, + "step": 1580 + }, + { + "epoch": 0.07698488057848221, + "grad_norm": 3.2798423767089844, + "learning_rate": 3.9769281732886455e-05, + "loss": 0.8837, + "step": 1581 + }, + { + "epoch": 0.07703357436758941, + "grad_norm": 2.124622106552124, + "learning_rate": 3.9768803740846735e-05, + "loss": 0.8993, + "step": 1582 + }, + { + "epoch": 0.07708226815669661, + "grad_norm": 3.578850030899048, + "learning_rate": 3.976832525705622e-05, + "loss": 1.0008, + "step": 1583 + }, + { + "epoch": 0.07713096194580381, + "grad_norm": 2.564342975616455, + "learning_rate": 3.976784628152682e-05, + "loss": 0.9437, + "step": 1584 + }, + { + "epoch": 0.077179655734911, + "grad_norm": 2.270115375518799, + "learning_rate": 3.9767366814270426e-05, + "loss": 0.8366, + "step": 1585 + }, + { + "epoch": 0.0772283495240182, + "grad_norm": 3.3870315551757812, + "learning_rate": 3.9766886855298995e-05, + "loss": 0.9771, + "step": 1586 + }, + { + "epoch": 0.0772770433131254, + "grad_norm": 2.3755509853363037, + "learning_rate": 3.976640640462444e-05, + "loss": 0.9659, + "step": 1587 + }, + { + "epoch": 0.07732573710223262, + "grad_norm": 2.1639275550842285, + "learning_rate": 3.976592546225873e-05, + "loss": 0.9868, + "step": 1588 + }, + { + "epoch": 0.07737443089133982, + "grad_norm": 2.1672253608703613, + "learning_rate": 3.976544402821382e-05, + "loss": 0.8701, + "step": 1589 + }, + { + "epoch": 0.07742312468044701, + "grad_norm": 1.5540997982025146, + "learning_rate": 3.9764962102501685e-05, + "loss": 0.9362, + "step": 1590 + }, + { + "epoch": 0.07747181846955421, + "grad_norm": 2.1507842540740967, + "learning_rate": 3.976447968513432e-05, + "loss": 1.0008, + "step": 1591 + }, + { + "epoch": 0.07752051225866141, + "grad_norm": 3.345888614654541, + "learning_rate": 3.9763996776123716e-05, + "loss": 0.9641, + "step": 1592 + }, + { + "epoch": 0.07756920604776861, + "grad_norm": 2.535107135772705, + "learning_rate": 3.976351337548189e-05, + "loss": 0.8914, + "step": 1593 + }, + { + "epoch": 0.07761789983687581, + "grad_norm": 2.6314823627471924, + "learning_rate": 3.9763029483220876e-05, + "loss": 0.943, + "step": 1594 + }, + { + "epoch": 0.07766659362598301, + "grad_norm": 1.5174740552902222, + "learning_rate": 3.9762545099352696e-05, + "loss": 0.9156, + "step": 1595 + }, + { + "epoch": 0.0777152874150902, + "grad_norm": 1.2851943969726562, + "learning_rate": 3.9762060223889396e-05, + "loss": 0.9437, + "step": 1596 + }, + { + "epoch": 0.0777639812041974, + "grad_norm": 4.741919040679932, + "learning_rate": 3.976157485684306e-05, + "loss": 0.9874, + "step": 1597 + }, + { + "epoch": 0.0778126749933046, + "grad_norm": 2.0556957721710205, + "learning_rate": 3.976108899822574e-05, + "loss": 0.8564, + "step": 1598 + }, + { + "epoch": 0.0778613687824118, + "grad_norm": 1.496385931968689, + "learning_rate": 3.976060264804953e-05, + "loss": 0.9284, + "step": 1599 + }, + { + "epoch": 0.077910062571519, + "grad_norm": 1.638816475868225, + "learning_rate": 3.976011580632653e-05, + "loss": 0.9555, + "step": 1600 + }, + { + "epoch": 0.0779587563606262, + "grad_norm": 1.815242886543274, + "learning_rate": 3.9759628473068844e-05, + "loss": 1.0176, + "step": 1601 + }, + { + "epoch": 0.0780074501497334, + "grad_norm": 2.1306991577148438, + "learning_rate": 3.9759140648288606e-05, + "loss": 1.0194, + "step": 1602 + }, + { + "epoch": 0.0780561439388406, + "grad_norm": 1.9957033395767212, + "learning_rate": 3.9758652331997936e-05, + "loss": 0.9263, + "step": 1603 + }, + { + "epoch": 0.0781048377279478, + "grad_norm": 1.8656316995620728, + "learning_rate": 3.9758163524208986e-05, + "loss": 1.0173, + "step": 1604 + }, + { + "epoch": 0.078153531517055, + "grad_norm": 1.7941267490386963, + "learning_rate": 3.975767422493392e-05, + "loss": 0.891, + "step": 1605 + }, + { + "epoch": 0.07820222530616219, + "grad_norm": 1.919294834136963, + "learning_rate": 3.9757184434184903e-05, + "loss": 1.0345, + "step": 1606 + }, + { + "epoch": 0.07825091909526939, + "grad_norm": 1.896150827407837, + "learning_rate": 3.9756694151974125e-05, + "loss": 0.9267, + "step": 1607 + }, + { + "epoch": 0.0782996128843766, + "grad_norm": 4.438630104064941, + "learning_rate": 3.975620337831378e-05, + "loss": 0.9064, + "step": 1608 + }, + { + "epoch": 0.0783483066734838, + "grad_norm": 1.7076901197433472, + "learning_rate": 3.975571211321607e-05, + "loss": 0.9495, + "step": 1609 + }, + { + "epoch": 0.078397000462591, + "grad_norm": 3.3944671154022217, + "learning_rate": 3.975522035669323e-05, + "loss": 1.0579, + "step": 1610 + }, + { + "epoch": 0.0784456942516982, + "grad_norm": 1.8080686330795288, + "learning_rate": 3.975472810875747e-05, + "loss": 0.939, + "step": 1611 + }, + { + "epoch": 0.0784943880408054, + "grad_norm": 2.0832457542419434, + "learning_rate": 3.9754235369421055e-05, + "loss": 0.9842, + "step": 1612 + }, + { + "epoch": 0.0785430818299126, + "grad_norm": 1.5712230205535889, + "learning_rate": 3.975374213869623e-05, + "loss": 0.8372, + "step": 1613 + }, + { + "epoch": 0.0785917756190198, + "grad_norm": 1.3082884550094604, + "learning_rate": 3.975324841659527e-05, + "loss": 1.0201, + "step": 1614 + }, + { + "epoch": 0.078640469408127, + "grad_norm": 1.4082139730453491, + "learning_rate": 3.975275420313045e-05, + "loss": 0.9097, + "step": 1615 + }, + { + "epoch": 0.0786891631972342, + "grad_norm": 2.0507824420928955, + "learning_rate": 3.9752259498314074e-05, + "loss": 0.9732, + "step": 1616 + }, + { + "epoch": 0.07873785698634139, + "grad_norm": 1.9189480543136597, + "learning_rate": 3.975176430215845e-05, + "loss": 0.9726, + "step": 1617 + }, + { + "epoch": 0.07878655077544859, + "grad_norm": 1.6820317506790161, + "learning_rate": 3.975126861467587e-05, + "loss": 0.8933, + "step": 1618 + }, + { + "epoch": 0.07883524456455579, + "grad_norm": 1.6409237384796143, + "learning_rate": 3.97507724358787e-05, + "loss": 0.9905, + "step": 1619 + }, + { + "epoch": 0.07888393835366299, + "grad_norm": 2.465322494506836, + "learning_rate": 3.9750275765779255e-05, + "loss": 0.9046, + "step": 1620 + }, + { + "epoch": 0.07893263214277019, + "grad_norm": 1.886191964149475, + "learning_rate": 3.97497786043899e-05, + "loss": 0.9569, + "step": 1621 + }, + { + "epoch": 0.07898132593187739, + "grad_norm": 2.2668216228485107, + "learning_rate": 3.9749280951723e-05, + "loss": 0.9943, + "step": 1622 + }, + { + "epoch": 0.07903001972098458, + "grad_norm": 3.19876766204834, + "learning_rate": 3.974878280779093e-05, + "loss": 0.8764, + "step": 1623 + }, + { + "epoch": 0.07907871351009178, + "grad_norm": 0.08404159545898438, + "learning_rate": 3.97482841726061e-05, + "loss": 0.6298, + "step": 1624 + }, + { + "epoch": 0.07912740729919898, + "grad_norm": 2.108694314956665, + "learning_rate": 3.97477850461809e-05, + "loss": 0.8776, + "step": 1625 + }, + { + "epoch": 0.07917610108830618, + "grad_norm": 1.4217385053634644, + "learning_rate": 3.9747285428527734e-05, + "loss": 0.8886, + "step": 1626 + }, + { + "epoch": 0.07922479487741338, + "grad_norm": 2.1015560626983643, + "learning_rate": 3.9746785319659056e-05, + "loss": 0.8453, + "step": 1627 + }, + { + "epoch": 0.07927348866652059, + "grad_norm": 1.4213588237762451, + "learning_rate": 3.9746284719587285e-05, + "loss": 0.9244, + "step": 1628 + }, + { + "epoch": 0.07932218245562779, + "grad_norm": 1.6145020723342896, + "learning_rate": 3.974578362832488e-05, + "loss": 0.984, + "step": 1629 + }, + { + "epoch": 0.07937087624473499, + "grad_norm": 1.613423466682434, + "learning_rate": 3.9745282045884315e-05, + "loss": 0.9884, + "step": 1630 + }, + { + "epoch": 0.07941957003384219, + "grad_norm": 2.189629077911377, + "learning_rate": 3.974477997227805e-05, + "loss": 0.9808, + "step": 1631 + }, + { + "epoch": 0.07946826382294939, + "grad_norm": 1.8477060794830322, + "learning_rate": 3.9744277407518585e-05, + "loss": 0.9947, + "step": 1632 + }, + { + "epoch": 0.07951695761205659, + "grad_norm": 2.0763378143310547, + "learning_rate": 3.9743774351618426e-05, + "loss": 0.9542, + "step": 1633 + }, + { + "epoch": 0.07956565140116378, + "grad_norm": 2.7218449115753174, + "learning_rate": 3.974327080459007e-05, + "loss": 0.9032, + "step": 1634 + }, + { + "epoch": 0.07961434519027098, + "grad_norm": 1.7323451042175293, + "learning_rate": 3.974276676644606e-05, + "loss": 0.9661, + "step": 1635 + }, + { + "epoch": 0.07966303897937818, + "grad_norm": 1.9947856664657593, + "learning_rate": 3.974226223719892e-05, + "loss": 1.0515, + "step": 1636 + }, + { + "epoch": 0.07971173276848538, + "grad_norm": 2.55903959274292, + "learning_rate": 3.974175721686122e-05, + "loss": 0.9471, + "step": 1637 + }, + { + "epoch": 0.07976042655759258, + "grad_norm": 1.7640072107315063, + "learning_rate": 3.97412517054455e-05, + "loss": 0.9841, + "step": 1638 + }, + { + "epoch": 0.07980912034669978, + "grad_norm": 1.556603193283081, + "learning_rate": 3.9740745702964346e-05, + "loss": 0.836, + "step": 1639 + }, + { + "epoch": 0.07985781413580698, + "grad_norm": 1.4318816661834717, + "learning_rate": 3.974023920943034e-05, + "loss": 0.9949, + "step": 1640 + }, + { + "epoch": 0.07990650792491417, + "grad_norm": 1.9383190870285034, + "learning_rate": 3.973973222485608e-05, + "loss": 0.8974, + "step": 1641 + }, + { + "epoch": 0.07995520171402137, + "grad_norm": 1.5905128717422485, + "learning_rate": 3.973922474925419e-05, + "loss": 0.9659, + "step": 1642 + }, + { + "epoch": 0.08000389550312857, + "grad_norm": 2.3443355560302734, + "learning_rate": 3.9738716782637284e-05, + "loss": 0.9363, + "step": 1643 + }, + { + "epoch": 0.08005258929223577, + "grad_norm": 1.5625262260437012, + "learning_rate": 3.973820832501799e-05, + "loss": 0.9112, + "step": 1644 + }, + { + "epoch": 0.08010128308134297, + "grad_norm": 1.9225983619689941, + "learning_rate": 3.973769937640897e-05, + "loss": 0.8924, + "step": 1645 + }, + { + "epoch": 0.08014997687045017, + "grad_norm": 1.8256434202194214, + "learning_rate": 3.973718993682288e-05, + "loss": 0.9916, + "step": 1646 + }, + { + "epoch": 0.08019867065955737, + "grad_norm": 2.0297276973724365, + "learning_rate": 3.973668000627238e-05, + "loss": 0.9526, + "step": 1647 + }, + { + "epoch": 0.08024736444866458, + "grad_norm": 1.8462074995040894, + "learning_rate": 3.973616958477018e-05, + "loss": 0.9696, + "step": 1648 + }, + { + "epoch": 0.08029605823777178, + "grad_norm": 2.049905776977539, + "learning_rate": 3.973565867232895e-05, + "loss": 0.9499, + "step": 1649 + }, + { + "epoch": 0.08034475202687898, + "grad_norm": 2.0520505905151367, + "learning_rate": 3.973514726896142e-05, + "loss": 0.9331, + "step": 1650 + }, + { + "epoch": 0.08039344581598618, + "grad_norm": 1.9391809701919556, + "learning_rate": 3.97346353746803e-05, + "loss": 0.8786, + "step": 1651 + }, + { + "epoch": 0.08044213960509337, + "grad_norm": 1.8013746738433838, + "learning_rate": 3.973412298949832e-05, + "loss": 1.0054, + "step": 1652 + }, + { + "epoch": 0.08049083339420057, + "grad_norm": 1.6594072580337524, + "learning_rate": 3.973361011342824e-05, + "loss": 0.9648, + "step": 1653 + }, + { + "epoch": 0.08053952718330777, + "grad_norm": 2.3335394859313965, + "learning_rate": 3.97330967464828e-05, + "loss": 0.961, + "step": 1654 + }, + { + "epoch": 0.08058822097241497, + "grad_norm": 0.08553311228752136, + "learning_rate": 3.973258288867479e-05, + "loss": 0.5866, + "step": 1655 + }, + { + "epoch": 0.08063691476152217, + "grad_norm": 2.1454484462738037, + "learning_rate": 3.9732068540016976e-05, + "loss": 0.9597, + "step": 1656 + }, + { + "epoch": 0.08068560855062937, + "grad_norm": 2.365342617034912, + "learning_rate": 3.973155370052216e-05, + "loss": 0.8508, + "step": 1657 + }, + { + "epoch": 0.08073430233973657, + "grad_norm": 2.1626365184783936, + "learning_rate": 3.973103837020314e-05, + "loss": 0.9796, + "step": 1658 + }, + { + "epoch": 0.08078299612884376, + "grad_norm": 1.6044269800186157, + "learning_rate": 3.9730522549072745e-05, + "loss": 0.8796, + "step": 1659 + }, + { + "epoch": 0.08083168991795096, + "grad_norm": 1.8383053541183472, + "learning_rate": 3.9730006237143805e-05, + "loss": 0.9035, + "step": 1660 + }, + { + "epoch": 0.08088038370705816, + "grad_norm": 0.08471373468637466, + "learning_rate": 3.972948943442916e-05, + "loss": 0.6158, + "step": 1661 + }, + { + "epoch": 0.08092907749616536, + "grad_norm": 1.4343973398208618, + "learning_rate": 3.972897214094167e-05, + "loss": 0.9577, + "step": 1662 + }, + { + "epoch": 0.08097777128527256, + "grad_norm": 1.9814256429672241, + "learning_rate": 3.972845435669419e-05, + "loss": 0.9812, + "step": 1663 + }, + { + "epoch": 0.08102646507437976, + "grad_norm": 1.3092743158340454, + "learning_rate": 3.972793608169962e-05, + "loss": 0.8683, + "step": 1664 + }, + { + "epoch": 0.08107515886348696, + "grad_norm": 1.5304659605026245, + "learning_rate": 3.9727417315970837e-05, + "loss": 0.8978, + "step": 1665 + }, + { + "epoch": 0.08112385265259416, + "grad_norm": 2.214567184448242, + "learning_rate": 3.972689805952074e-05, + "loss": 0.9257, + "step": 1666 + }, + { + "epoch": 0.08117254644170135, + "grad_norm": 1.7765133380889893, + "learning_rate": 3.9726378312362264e-05, + "loss": 0.981, + "step": 1667 + }, + { + "epoch": 0.08122124023080857, + "grad_norm": 2.140258312225342, + "learning_rate": 3.9725858074508335e-05, + "loss": 1.0369, + "step": 1668 + }, + { + "epoch": 0.08126993401991577, + "grad_norm": 1.7145593166351318, + "learning_rate": 3.972533734597188e-05, + "loss": 1.0015, + "step": 1669 + }, + { + "epoch": 0.08131862780902296, + "grad_norm": 2.2430360317230225, + "learning_rate": 3.972481612676587e-05, + "loss": 0.953, + "step": 1670 + }, + { + "epoch": 0.08136732159813016, + "grad_norm": 1.9417176246643066, + "learning_rate": 3.972429441690324e-05, + "loss": 0.9362, + "step": 1671 + }, + { + "epoch": 0.08141601538723736, + "grad_norm": 2.0044145584106445, + "learning_rate": 3.972377221639701e-05, + "loss": 0.9229, + "step": 1672 + }, + { + "epoch": 0.08146470917634456, + "grad_norm": 2.1238856315612793, + "learning_rate": 3.972324952526013e-05, + "loss": 0.9074, + "step": 1673 + }, + { + "epoch": 0.08151340296545176, + "grad_norm": 1.274779200553894, + "learning_rate": 3.972272634350563e-05, + "loss": 0.9367, + "step": 1674 + }, + { + "epoch": 0.08156209675455896, + "grad_norm": 1.7485605478286743, + "learning_rate": 3.972220267114651e-05, + "loss": 0.9933, + "step": 1675 + }, + { + "epoch": 0.08161079054366616, + "grad_norm": 1.918784737586975, + "learning_rate": 3.9721678508195803e-05, + "loss": 0.9658, + "step": 1676 + }, + { + "epoch": 0.08165948433277335, + "grad_norm": 1.4390437602996826, + "learning_rate": 3.9721153854666544e-05, + "loss": 1.0249, + "step": 1677 + }, + { + "epoch": 0.08170817812188055, + "grad_norm": 1.6105680465698242, + "learning_rate": 3.972062871057178e-05, + "loss": 0.9788, + "step": 1678 + }, + { + "epoch": 0.08175687191098775, + "grad_norm": 1.5772614479064941, + "learning_rate": 3.9720103075924585e-05, + "loss": 0.8926, + "step": 1679 + }, + { + "epoch": 0.08180556570009495, + "grad_norm": 1.7557884454727173, + "learning_rate": 3.971957695073802e-05, + "loss": 0.9054, + "step": 1680 + }, + { + "epoch": 0.08185425948920215, + "grad_norm": 1.6462934017181396, + "learning_rate": 3.971905033502519e-05, + "loss": 0.9465, + "step": 1681 + }, + { + "epoch": 0.08190295327830935, + "grad_norm": 1.7265397310256958, + "learning_rate": 3.9718523228799177e-05, + "loss": 0.9156, + "step": 1682 + }, + { + "epoch": 0.08195164706741655, + "grad_norm": 1.6120632886886597, + "learning_rate": 3.971799563207311e-05, + "loss": 0.9412, + "step": 1683 + }, + { + "epoch": 0.08200034085652375, + "grad_norm": 1.5733020305633545, + "learning_rate": 3.97174675448601e-05, + "loss": 0.9882, + "step": 1684 + }, + { + "epoch": 0.08204903464563094, + "grad_norm": 2.1134352684020996, + "learning_rate": 3.9716938967173285e-05, + "loss": 0.8356, + "step": 1685 + }, + { + "epoch": 0.08209772843473814, + "grad_norm": 1.8125361204147339, + "learning_rate": 3.971640989902581e-05, + "loss": 0.9042, + "step": 1686 + }, + { + "epoch": 0.08214642222384534, + "grad_norm": 2.2883262634277344, + "learning_rate": 3.971588034043086e-05, + "loss": 0.8228, + "step": 1687 + }, + { + "epoch": 0.08219511601295255, + "grad_norm": 1.4993102550506592, + "learning_rate": 3.971535029140157e-05, + "loss": 0.923, + "step": 1688 + }, + { + "epoch": 0.08224380980205975, + "grad_norm": 1.8425590991973877, + "learning_rate": 3.971481975195115e-05, + "loss": 0.8855, + "step": 1689 + }, + { + "epoch": 0.08229250359116695, + "grad_norm": 1.3063679933547974, + "learning_rate": 3.971428872209279e-05, + "loss": 0.9884, + "step": 1690 + }, + { + "epoch": 0.08234119738027415, + "grad_norm": 1.6891602277755737, + "learning_rate": 3.9713757201839706e-05, + "loss": 0.9706, + "step": 1691 + }, + { + "epoch": 0.08238989116938135, + "grad_norm": 1.8490121364593506, + "learning_rate": 3.971322519120511e-05, + "loss": 0.9191, + "step": 1692 + }, + { + "epoch": 0.08243858495848855, + "grad_norm": 2.005462408065796, + "learning_rate": 3.971269269020224e-05, + "loss": 1.0378, + "step": 1693 + }, + { + "epoch": 0.08248727874759575, + "grad_norm": 1.7108545303344727, + "learning_rate": 3.971215969884435e-05, + "loss": 0.9004, + "step": 1694 + }, + { + "epoch": 0.08253597253670295, + "grad_norm": 1.517677664756775, + "learning_rate": 3.9711626217144674e-05, + "loss": 0.9432, + "step": 1695 + }, + { + "epoch": 0.08258466632581014, + "grad_norm": 1.9387449026107788, + "learning_rate": 3.9711092245116507e-05, + "loss": 0.9397, + "step": 1696 + }, + { + "epoch": 0.08263336011491734, + "grad_norm": 1.7840105295181274, + "learning_rate": 3.9710557782773126e-05, + "loss": 0.9464, + "step": 1697 + }, + { + "epoch": 0.08268205390402454, + "grad_norm": 1.540916085243225, + "learning_rate": 3.971002283012782e-05, + "loss": 0.9347, + "step": 1698 + }, + { + "epoch": 0.08273074769313174, + "grad_norm": 1.663948893547058, + "learning_rate": 3.97094873871939e-05, + "loss": 0.8949, + "step": 1699 + }, + { + "epoch": 0.08277944148223894, + "grad_norm": 2.555504083633423, + "learning_rate": 3.9708951453984684e-05, + "loss": 0.8597, + "step": 1700 + }, + { + "epoch": 0.08282813527134614, + "grad_norm": 1.5908477306365967, + "learning_rate": 3.97084150305135e-05, + "loss": 0.944, + "step": 1701 + }, + { + "epoch": 0.08287682906045334, + "grad_norm": 2.015178918838501, + "learning_rate": 3.97078781167937e-05, + "loss": 0.9135, + "step": 1702 + }, + { + "epoch": 0.08292552284956053, + "grad_norm": 1.6407601833343506, + "learning_rate": 3.970734071283864e-05, + "loss": 0.967, + "step": 1703 + }, + { + "epoch": 0.08297421663866773, + "grad_norm": 2.1186866760253906, + "learning_rate": 3.970680281866167e-05, + "loss": 0.9347, + "step": 1704 + }, + { + "epoch": 0.08302291042777493, + "grad_norm": 1.8333083391189575, + "learning_rate": 3.970626443427619e-05, + "loss": 0.9575, + "step": 1705 + }, + { + "epoch": 0.08307160421688213, + "grad_norm": 1.6389726400375366, + "learning_rate": 3.9705725559695584e-05, + "loss": 0.9694, + "step": 1706 + }, + { + "epoch": 0.08312029800598933, + "grad_norm": 1.5862984657287598, + "learning_rate": 3.970518619493326e-05, + "loss": 0.8665, + "step": 1707 + }, + { + "epoch": 0.08316899179509654, + "grad_norm": 1.4871631860733032, + "learning_rate": 3.970464634000263e-05, + "loss": 0.9817, + "step": 1708 + }, + { + "epoch": 0.08321768558420374, + "grad_norm": 1.656226396560669, + "learning_rate": 3.9704105994917126e-05, + "loss": 0.9761, + "step": 1709 + }, + { + "epoch": 0.08326637937331094, + "grad_norm": 0.08905807882547379, + "learning_rate": 3.9703565159690195e-05, + "loss": 0.6745, + "step": 1710 + }, + { + "epoch": 0.08331507316241814, + "grad_norm": 1.5361015796661377, + "learning_rate": 3.970302383433527e-05, + "loss": 0.9756, + "step": 1711 + }, + { + "epoch": 0.08336376695152534, + "grad_norm": 0.08304490149021149, + "learning_rate": 3.970248201886584e-05, + "loss": 0.6336, + "step": 1712 + }, + { + "epoch": 0.08341246074063254, + "grad_norm": 2.985065221786499, + "learning_rate": 3.970193971329538e-05, + "loss": 0.8662, + "step": 1713 + }, + { + "epoch": 0.08346115452973973, + "grad_norm": 1.5218862295150757, + "learning_rate": 3.9701396917637367e-05, + "loss": 0.9169, + "step": 1714 + }, + { + "epoch": 0.08350984831884693, + "grad_norm": 2.1124460697174072, + "learning_rate": 3.9700853631905304e-05, + "loss": 0.9658, + "step": 1715 + }, + { + "epoch": 0.08355854210795413, + "grad_norm": 1.5379149913787842, + "learning_rate": 3.9700309856112715e-05, + "loss": 0.8722, + "step": 1716 + }, + { + "epoch": 0.08360723589706133, + "grad_norm": 2.1375207901000977, + "learning_rate": 3.969976559027313e-05, + "loss": 0.9838, + "step": 1717 + }, + { + "epoch": 0.08365592968616853, + "grad_norm": 2.129624366760254, + "learning_rate": 3.969922083440007e-05, + "loss": 0.9335, + "step": 1718 + }, + { + "epoch": 0.08370462347527573, + "grad_norm": 3.5300824642181396, + "learning_rate": 3.96986755885071e-05, + "loss": 0.9255, + "step": 1719 + }, + { + "epoch": 0.08375331726438293, + "grad_norm": 1.8555042743682861, + "learning_rate": 3.969812985260778e-05, + "loss": 0.9502, + "step": 1720 + }, + { + "epoch": 0.08380201105349012, + "grad_norm": 2.4101076126098633, + "learning_rate": 3.969758362671568e-05, + "loss": 0.9574, + "step": 1721 + }, + { + "epoch": 0.08385070484259732, + "grad_norm": 2.148587465286255, + "learning_rate": 3.9697036910844386e-05, + "loss": 0.9139, + "step": 1722 + }, + { + "epoch": 0.08389939863170452, + "grad_norm": 2.271026372909546, + "learning_rate": 3.969648970500751e-05, + "loss": 0.9558, + "step": 1723 + }, + { + "epoch": 0.08394809242081172, + "grad_norm": 1.9227393865585327, + "learning_rate": 3.9695942009218655e-05, + "loss": 0.919, + "step": 1724 + }, + { + "epoch": 0.08399678620991892, + "grad_norm": 2.616018772125244, + "learning_rate": 3.969539382349145e-05, + "loss": 0.9185, + "step": 1725 + }, + { + "epoch": 0.08404547999902612, + "grad_norm": 2.3923914432525635, + "learning_rate": 3.969484514783952e-05, + "loss": 0.8856, + "step": 1726 + }, + { + "epoch": 0.08409417378813332, + "grad_norm": 1.6606091260910034, + "learning_rate": 3.9694295982276526e-05, + "loss": 0.9259, + "step": 1727 + }, + { + "epoch": 0.08414286757724053, + "grad_norm": 2.330881118774414, + "learning_rate": 3.9693746326816124e-05, + "loss": 0.9311, + "step": 1728 + }, + { + "epoch": 0.08419156136634773, + "grad_norm": 1.9003549814224243, + "learning_rate": 3.969319618147199e-05, + "loss": 0.9091, + "step": 1729 + }, + { + "epoch": 0.08424025515545493, + "grad_norm": 1.9758920669555664, + "learning_rate": 3.96926455462578e-05, + "loss": 0.9564, + "step": 1730 + }, + { + "epoch": 0.08428894894456213, + "grad_norm": 2.6179792881011963, + "learning_rate": 3.9692094421187264e-05, + "loss": 0.8992, + "step": 1731 + }, + { + "epoch": 0.08433764273366932, + "grad_norm": 1.8456647396087646, + "learning_rate": 3.9691542806274073e-05, + "loss": 1.0393, + "step": 1732 + }, + { + "epoch": 0.08438633652277652, + "grad_norm": 1.6937841176986694, + "learning_rate": 3.969099070153197e-05, + "loss": 1.0629, + "step": 1733 + }, + { + "epoch": 0.08443503031188372, + "grad_norm": 2.058748722076416, + "learning_rate": 3.969043810697467e-05, + "loss": 0.8969, + "step": 1734 + }, + { + "epoch": 0.08448372410099092, + "grad_norm": 2.3305788040161133, + "learning_rate": 3.968988502261593e-05, + "loss": 0.9302, + "step": 1735 + }, + { + "epoch": 0.08453241789009812, + "grad_norm": 2.041966676712036, + "learning_rate": 3.9689331448469506e-05, + "loss": 1.0325, + "step": 1736 + }, + { + "epoch": 0.08458111167920532, + "grad_norm": 1.6178909540176392, + "learning_rate": 3.968877738454916e-05, + "loss": 0.9758, + "step": 1737 + }, + { + "epoch": 0.08462980546831252, + "grad_norm": 1.3927521705627441, + "learning_rate": 3.968822283086869e-05, + "loss": 1.0402, + "step": 1738 + }, + { + "epoch": 0.08467849925741971, + "grad_norm": 2.5667104721069336, + "learning_rate": 3.968766778744188e-05, + "loss": 0.8393, + "step": 1739 + }, + { + "epoch": 0.08472719304652691, + "grad_norm": 2.055882453918457, + "learning_rate": 3.968711225428254e-05, + "loss": 0.9737, + "step": 1740 + }, + { + "epoch": 0.08477588683563411, + "grad_norm": 1.9959357976913452, + "learning_rate": 3.968655623140448e-05, + "loss": 0.8972, + "step": 1741 + }, + { + "epoch": 0.08482458062474131, + "grad_norm": 3.137855052947998, + "learning_rate": 3.968599971882155e-05, + "loss": 0.9489, + "step": 1742 + }, + { + "epoch": 0.08487327441384851, + "grad_norm": 2.0565242767333984, + "learning_rate": 3.9685442716547575e-05, + "loss": 0.9987, + "step": 1743 + }, + { + "epoch": 0.08492196820295571, + "grad_norm": 2.0082342624664307, + "learning_rate": 3.968488522459642e-05, + "loss": 0.8757, + "step": 1744 + }, + { + "epoch": 0.0849706619920629, + "grad_norm": 1.5610839128494263, + "learning_rate": 3.968432724298195e-05, + "loss": 0.9999, + "step": 1745 + }, + { + "epoch": 0.0850193557811701, + "grad_norm": 1.7759900093078613, + "learning_rate": 3.9683768771718044e-05, + "loss": 0.8629, + "step": 1746 + }, + { + "epoch": 0.0850680495702773, + "grad_norm": 2.137946128845215, + "learning_rate": 3.9683209810818593e-05, + "loss": 0.9167, + "step": 1747 + }, + { + "epoch": 0.08511674335938452, + "grad_norm": 1.8428113460540771, + "learning_rate": 3.96826503602975e-05, + "loss": 0.8927, + "step": 1748 + }, + { + "epoch": 0.08516543714849172, + "grad_norm": 6.994509696960449, + "learning_rate": 3.968209042016869e-05, + "loss": 0.964, + "step": 1749 + }, + { + "epoch": 0.08521413093759891, + "grad_norm": 1.85788094997406, + "learning_rate": 3.968152999044609e-05, + "loss": 0.8914, + "step": 1750 + }, + { + "epoch": 0.08526282472670611, + "grad_norm": 2.0158417224884033, + "learning_rate": 3.9680969071143636e-05, + "loss": 0.9072, + "step": 1751 + }, + { + "epoch": 0.08531151851581331, + "grad_norm": 2.078378915786743, + "learning_rate": 3.9680407662275274e-05, + "loss": 1.0714, + "step": 1752 + }, + { + "epoch": 0.08536021230492051, + "grad_norm": 2.2801871299743652, + "learning_rate": 3.9679845763854985e-05, + "loss": 0.9824, + "step": 1753 + }, + { + "epoch": 0.08540890609402771, + "grad_norm": 2.6047041416168213, + "learning_rate": 3.9679283375896734e-05, + "loss": 0.898, + "step": 1754 + }, + { + "epoch": 0.08545759988313491, + "grad_norm": 2.3704943656921387, + "learning_rate": 3.967872049841452e-05, + "loss": 0.9189, + "step": 1755 + }, + { + "epoch": 0.0855062936722421, + "grad_norm": 1.6981102228164673, + "learning_rate": 3.967815713142234e-05, + "loss": 0.8507, + "step": 1756 + }, + { + "epoch": 0.0855549874613493, + "grad_norm": 2.2093656063079834, + "learning_rate": 3.967759327493419e-05, + "loss": 1.0112, + "step": 1757 + }, + { + "epoch": 0.0856036812504565, + "grad_norm": 2.150756597518921, + "learning_rate": 3.967702892896413e-05, + "loss": 0.9328, + "step": 1758 + }, + { + "epoch": 0.0856523750395637, + "grad_norm": 3.2946510314941406, + "learning_rate": 3.967646409352618e-05, + "loss": 0.9691, + "step": 1759 + }, + { + "epoch": 0.0857010688286709, + "grad_norm": 1.4608372449874878, + "learning_rate": 3.967589876863439e-05, + "loss": 1.0383, + "step": 1760 + }, + { + "epoch": 0.0857497626177781, + "grad_norm": 1.8512965440750122, + "learning_rate": 3.967533295430282e-05, + "loss": 0.9604, + "step": 1761 + }, + { + "epoch": 0.0857984564068853, + "grad_norm": 2.776689291000366, + "learning_rate": 3.967476665054555e-05, + "loss": 0.9828, + "step": 1762 + }, + { + "epoch": 0.0858471501959925, + "grad_norm": 2.9028115272521973, + "learning_rate": 3.9674199857376674e-05, + "loss": 0.9063, + "step": 1763 + }, + { + "epoch": 0.0858958439850997, + "grad_norm": 1.7390062808990479, + "learning_rate": 3.967363257481028e-05, + "loss": 0.9544, + "step": 1764 + }, + { + "epoch": 0.0859445377742069, + "grad_norm": 1.8607391119003296, + "learning_rate": 3.967306480286048e-05, + "loss": 0.8332, + "step": 1765 + }, + { + "epoch": 0.0859932315633141, + "grad_norm": 1.5500760078430176, + "learning_rate": 3.967249654154139e-05, + "loss": 0.9177, + "step": 1766 + }, + { + "epoch": 0.08604192535242129, + "grad_norm": 2.872150421142578, + "learning_rate": 3.9671927790867164e-05, + "loss": 0.8465, + "step": 1767 + }, + { + "epoch": 0.0860906191415285, + "grad_norm": 6.669178009033203, + "learning_rate": 3.967135855085194e-05, + "loss": 0.9965, + "step": 1768 + }, + { + "epoch": 0.0861393129306357, + "grad_norm": 1.6369608640670776, + "learning_rate": 3.967078882150987e-05, + "loss": 0.8547, + "step": 1769 + }, + { + "epoch": 0.0861880067197429, + "grad_norm": 0.08773259818553925, + "learning_rate": 3.967021860285515e-05, + "loss": 0.6674, + "step": 1770 + }, + { + "epoch": 0.0862367005088501, + "grad_norm": 2.709003210067749, + "learning_rate": 3.9669647894901936e-05, + "loss": 0.9414, + "step": 1771 + }, + { + "epoch": 0.0862853942979573, + "grad_norm": 1.536565899848938, + "learning_rate": 3.9669076697664444e-05, + "loss": 0.9548, + "step": 1772 + }, + { + "epoch": 0.0863340880870645, + "grad_norm": 3.462799072265625, + "learning_rate": 3.9668505011156876e-05, + "loss": 0.9686, + "step": 1773 + }, + { + "epoch": 0.0863827818761717, + "grad_norm": 1.749321699142456, + "learning_rate": 3.9667932835393455e-05, + "loss": 0.9538, + "step": 1774 + }, + { + "epoch": 0.0864314756652789, + "grad_norm": 2.057295322418213, + "learning_rate": 3.96673601703884e-05, + "loss": 0.897, + "step": 1775 + }, + { + "epoch": 0.0864801694543861, + "grad_norm": 1.701014518737793, + "learning_rate": 3.966678701615598e-05, + "loss": 0.8848, + "step": 1776 + }, + { + "epoch": 0.08652886324349329, + "grad_norm": 1.8185138702392578, + "learning_rate": 3.966621337271043e-05, + "loss": 1.0337, + "step": 1777 + }, + { + "epoch": 0.08657755703260049, + "grad_norm": 2.083761215209961, + "learning_rate": 3.966563924006603e-05, + "loss": 0.8733, + "step": 1778 + }, + { + "epoch": 0.08662625082170769, + "grad_norm": 1.9900093078613281, + "learning_rate": 3.966506461823707e-05, + "loss": 0.9379, + "step": 1779 + }, + { + "epoch": 0.08667494461081489, + "grad_norm": 1.9579403400421143, + "learning_rate": 3.966448950723783e-05, + "loss": 0.8892, + "step": 1780 + }, + { + "epoch": 0.08672363839992209, + "grad_norm": 0.08880641311407089, + "learning_rate": 3.966391390708262e-05, + "loss": 0.6579, + "step": 1781 + }, + { + "epoch": 0.08677233218902929, + "grad_norm": 1.4449909925460815, + "learning_rate": 3.966333781778576e-05, + "loss": 0.9755, + "step": 1782 + }, + { + "epoch": 0.08682102597813648, + "grad_norm": 2.724149227142334, + "learning_rate": 3.966276123936157e-05, + "loss": 0.83, + "step": 1783 + }, + { + "epoch": 0.08686971976724368, + "grad_norm": 1.4757548570632935, + "learning_rate": 3.966218417182441e-05, + "loss": 0.9757, + "step": 1784 + }, + { + "epoch": 0.08691841355635088, + "grad_norm": 2.5173075199127197, + "learning_rate": 3.966160661518863e-05, + "loss": 0.8731, + "step": 1785 + }, + { + "epoch": 0.08696710734545808, + "grad_norm": 2.3975107669830322, + "learning_rate": 3.966102856946859e-05, + "loss": 0.94, + "step": 1786 + }, + { + "epoch": 0.08701580113456528, + "grad_norm": 6.356967926025391, + "learning_rate": 3.966045003467867e-05, + "loss": 0.9714, + "step": 1787 + }, + { + "epoch": 0.08706449492367249, + "grad_norm": 3.4088501930236816, + "learning_rate": 3.965987101083325e-05, + "loss": 0.8657, + "step": 1788 + }, + { + "epoch": 0.08711318871277969, + "grad_norm": 2.269458770751953, + "learning_rate": 3.965929149794677e-05, + "loss": 0.9175, + "step": 1789 + }, + { + "epoch": 0.08716188250188689, + "grad_norm": 2.1730685234069824, + "learning_rate": 3.9658711496033604e-05, + "loss": 0.9215, + "step": 1790 + }, + { + "epoch": 0.08721057629099409, + "grad_norm": 1.6691046953201294, + "learning_rate": 3.96581310051082e-05, + "loss": 0.9931, + "step": 1791 + }, + { + "epoch": 0.08725927008010129, + "grad_norm": 2.996520519256592, + "learning_rate": 3.9657550025185e-05, + "loss": 0.9328, + "step": 1792 + }, + { + "epoch": 0.08730796386920849, + "grad_norm": 2.9745914936065674, + "learning_rate": 3.965696855627845e-05, + "loss": 0.8404, + "step": 1793 + }, + { + "epoch": 0.08735665765831568, + "grad_norm": 1.6212012767791748, + "learning_rate": 3.965638659840301e-05, + "loss": 0.9876, + "step": 1794 + }, + { + "epoch": 0.08740535144742288, + "grad_norm": 1.8813316822052002, + "learning_rate": 3.965580415157316e-05, + "loss": 0.9709, + "step": 1795 + }, + { + "epoch": 0.08745404523653008, + "grad_norm": 1.6656997203826904, + "learning_rate": 3.96552212158034e-05, + "loss": 0.8529, + "step": 1796 + }, + { + "epoch": 0.08750273902563728, + "grad_norm": 1.8769890069961548, + "learning_rate": 3.965463779110821e-05, + "loss": 0.92, + "step": 1797 + }, + { + "epoch": 0.08755143281474448, + "grad_norm": 2.0553784370422363, + "learning_rate": 3.965405387750212e-05, + "loss": 1.0227, + "step": 1798 + }, + { + "epoch": 0.08760012660385168, + "grad_norm": 3.608152389526367, + "learning_rate": 3.965346947499964e-05, + "loss": 0.9793, + "step": 1799 + }, + { + "epoch": 0.08764882039295888, + "grad_norm": 1.7788783311843872, + "learning_rate": 3.965288458361533e-05, + "loss": 0.9554, + "step": 1800 + }, + { + "epoch": 0.08769751418206607, + "grad_norm": 6.493870735168457, + "learning_rate": 3.965229920336371e-05, + "loss": 0.952, + "step": 1801 + }, + { + "epoch": 0.08774620797117327, + "grad_norm": 2.0072033405303955, + "learning_rate": 3.965171333425936e-05, + "loss": 0.8966, + "step": 1802 + }, + { + "epoch": 0.08779490176028047, + "grad_norm": 2.834061861038208, + "learning_rate": 3.965112697631685e-05, + "loss": 0.8863, + "step": 1803 + }, + { + "epoch": 0.08784359554938767, + "grad_norm": 1.685839056968689, + "learning_rate": 3.965054012955076e-05, + "loss": 0.9382, + "step": 1804 + }, + { + "epoch": 0.08789228933849487, + "grad_norm": 1.4879993200302124, + "learning_rate": 3.9649952793975695e-05, + "loss": 0.9466, + "step": 1805 + }, + { + "epoch": 0.08794098312760207, + "grad_norm": 0.08629678189754486, + "learning_rate": 3.964936496960627e-05, + "loss": 0.6222, + "step": 1806 + }, + { + "epoch": 0.08798967691670927, + "grad_norm": 3.0659096240997314, + "learning_rate": 3.9648776656457093e-05, + "loss": 0.897, + "step": 1807 + }, + { + "epoch": 0.08803837070581648, + "grad_norm": 1.8232628107070923, + "learning_rate": 3.96481878545428e-05, + "loss": 0.9464, + "step": 1808 + }, + { + "epoch": 0.08808706449492368, + "grad_norm": 1.619948148727417, + "learning_rate": 3.964759856387805e-05, + "loss": 0.9575, + "step": 1809 + }, + { + "epoch": 0.08813575828403088, + "grad_norm": 5.513167381286621, + "learning_rate": 3.9647008784477494e-05, + "loss": 0.9172, + "step": 1810 + }, + { + "epoch": 0.08818445207313808, + "grad_norm": 3.0125374794006348, + "learning_rate": 3.96464185163558e-05, + "loss": 0.9641, + "step": 1811 + }, + { + "epoch": 0.08823314586224527, + "grad_norm": 2.976986885070801, + "learning_rate": 3.9645827759527654e-05, + "loss": 0.8566, + "step": 1812 + }, + { + "epoch": 0.08828183965135247, + "grad_norm": 2.607534408569336, + "learning_rate": 3.964523651400775e-05, + "loss": 0.9624, + "step": 1813 + }, + { + "epoch": 0.08833053344045967, + "grad_norm": 1.8660787343978882, + "learning_rate": 3.9644644779810803e-05, + "loss": 0.9798, + "step": 1814 + }, + { + "epoch": 0.08837922722956687, + "grad_norm": 0.08213158696889877, + "learning_rate": 3.964405255695153e-05, + "loss": 0.6462, + "step": 1815 + }, + { + "epoch": 0.08842792101867407, + "grad_norm": 1.9868606328964233, + "learning_rate": 3.964345984544465e-05, + "loss": 0.9471, + "step": 1816 + }, + { + "epoch": 0.08847661480778127, + "grad_norm": 2.827853202819824, + "learning_rate": 3.964286664530491e-05, + "loss": 0.9289, + "step": 1817 + }, + { + "epoch": 0.08852530859688847, + "grad_norm": 1.8451992273330688, + "learning_rate": 3.964227295654708e-05, + "loss": 0.9909, + "step": 1818 + }, + { + "epoch": 0.08857400238599566, + "grad_norm": 1.638253092765808, + "learning_rate": 3.964167877918591e-05, + "loss": 1.0147, + "step": 1819 + }, + { + "epoch": 0.08862269617510286, + "grad_norm": 1.8381695747375488, + "learning_rate": 3.96410841132362e-05, + "loss": 1.0012, + "step": 1820 + }, + { + "epoch": 0.08867138996421006, + "grad_norm": 1.7888543605804443, + "learning_rate": 3.964048895871273e-05, + "loss": 0.9286, + "step": 1821 + }, + { + "epoch": 0.08872008375331726, + "grad_norm": 2.9149303436279297, + "learning_rate": 3.963989331563031e-05, + "loss": 0.9453, + "step": 1822 + }, + { + "epoch": 0.08876877754242446, + "grad_norm": 2.2252488136291504, + "learning_rate": 3.963929718400374e-05, + "loss": 0.9973, + "step": 1823 + }, + { + "epoch": 0.08881747133153166, + "grad_norm": 0.09457433223724365, + "learning_rate": 3.963870056384787e-05, + "loss": 0.6961, + "step": 1824 + }, + { + "epoch": 0.08886616512063886, + "grad_norm": 0.08408184349536896, + "learning_rate": 3.963810345517753e-05, + "loss": 0.6091, + "step": 1825 + }, + { + "epoch": 0.08891485890974606, + "grad_norm": 1.4556725025177002, + "learning_rate": 3.963750585800759e-05, + "loss": 0.8893, + "step": 1826 + }, + { + "epoch": 0.08896355269885325, + "grad_norm": 1.5927430391311646, + "learning_rate": 3.9636907772352884e-05, + "loss": 0.8855, + "step": 1827 + }, + { + "epoch": 0.08901224648796047, + "grad_norm": 1.5509600639343262, + "learning_rate": 3.963630919822831e-05, + "loss": 0.8949, + "step": 1828 + }, + { + "epoch": 0.08906094027706767, + "grad_norm": 1.3797178268432617, + "learning_rate": 3.963571013564876e-05, + "loss": 0.9781, + "step": 1829 + }, + { + "epoch": 0.08910963406617486, + "grad_norm": 1.6509730815887451, + "learning_rate": 3.963511058462912e-05, + "loss": 0.9167, + "step": 1830 + }, + { + "epoch": 0.08915832785528206, + "grad_norm": 1.7362157106399536, + "learning_rate": 3.963451054518433e-05, + "loss": 0.9631, + "step": 1831 + }, + { + "epoch": 0.08920702164438926, + "grad_norm": 1.8732643127441406, + "learning_rate": 3.9633910017329285e-05, + "loss": 0.857, + "step": 1832 + }, + { + "epoch": 0.08925571543349646, + "grad_norm": 1.7916291952133179, + "learning_rate": 3.963330900107895e-05, + "loss": 1.0161, + "step": 1833 + }, + { + "epoch": 0.08930440922260366, + "grad_norm": 1.9052008390426636, + "learning_rate": 3.963270749644825e-05, + "loss": 1.0155, + "step": 1834 + }, + { + "epoch": 0.08935310301171086, + "grad_norm": 1.7790898084640503, + "learning_rate": 3.9632105503452176e-05, + "loss": 0.9133, + "step": 1835 + }, + { + "epoch": 0.08940179680081806, + "grad_norm": 2.1502668857574463, + "learning_rate": 3.963150302210568e-05, + "loss": 0.9098, + "step": 1836 + }, + { + "epoch": 0.08945049058992526, + "grad_norm": 1.890602946281433, + "learning_rate": 3.963090005242375e-05, + "loss": 0.8861, + "step": 1837 + }, + { + "epoch": 0.08949918437903245, + "grad_norm": 3.3068811893463135, + "learning_rate": 3.96302965944214e-05, + "loss": 0.8957, + "step": 1838 + }, + { + "epoch": 0.08954787816813965, + "grad_norm": 1.8160626888275146, + "learning_rate": 3.962969264811363e-05, + "loss": 0.8931, + "step": 1839 + }, + { + "epoch": 0.08959657195724685, + "grad_norm": 1.7330399751663208, + "learning_rate": 3.962908821351547e-05, + "loss": 0.8737, + "step": 1840 + }, + { + "epoch": 0.08964526574635405, + "grad_norm": 1.6197034120559692, + "learning_rate": 3.962848329064194e-05, + "loss": 0.9575, + "step": 1841 + }, + { + "epoch": 0.08969395953546125, + "grad_norm": 1.7198506593704224, + "learning_rate": 3.9627877879508113e-05, + "loss": 0.9043, + "step": 1842 + }, + { + "epoch": 0.08974265332456845, + "grad_norm": 2.47883939743042, + "learning_rate": 3.962727198012902e-05, + "loss": 0.8749, + "step": 1843 + }, + { + "epoch": 0.08979134711367565, + "grad_norm": 1.8190276622772217, + "learning_rate": 3.9626665592519756e-05, + "loss": 0.8778, + "step": 1844 + }, + { + "epoch": 0.08984004090278284, + "grad_norm": 1.9436943531036377, + "learning_rate": 3.9626058716695395e-05, + "loss": 0.8939, + "step": 1845 + }, + { + "epoch": 0.08988873469189004, + "grad_norm": 1.4850233793258667, + "learning_rate": 3.962545135267103e-05, + "loss": 0.8661, + "step": 1846 + }, + { + "epoch": 0.08993742848099726, + "grad_norm": 2.2247323989868164, + "learning_rate": 3.962484350046178e-05, + "loss": 0.8831, + "step": 1847 + }, + { + "epoch": 0.08998612227010445, + "grad_norm": 1.8555008172988892, + "learning_rate": 3.962423516008275e-05, + "loss": 0.9245, + "step": 1848 + }, + { + "epoch": 0.09003481605921165, + "grad_norm": 2.051429510116577, + "learning_rate": 3.962362633154909e-05, + "loss": 0.8246, + "step": 1849 + }, + { + "epoch": 0.09008350984831885, + "grad_norm": 2.1463451385498047, + "learning_rate": 3.962301701487593e-05, + "loss": 0.9567, + "step": 1850 + }, + { + "epoch": 0.09013220363742605, + "grad_norm": 1.3196420669555664, + "learning_rate": 3.9622407210078436e-05, + "loss": 0.9176, + "step": 1851 + }, + { + "epoch": 0.09018089742653325, + "grad_norm": 2.2814836502075195, + "learning_rate": 3.962179691717177e-05, + "loss": 0.9011, + "step": 1852 + }, + { + "epoch": 0.09022959121564045, + "grad_norm": 1.9968727827072144, + "learning_rate": 3.962118613617111e-05, + "loss": 0.9277, + "step": 1853 + }, + { + "epoch": 0.09027828500474765, + "grad_norm": 1.6749035120010376, + "learning_rate": 3.962057486709167e-05, + "loss": 1.0133, + "step": 1854 + }, + { + "epoch": 0.09032697879385485, + "grad_norm": 1.6563862562179565, + "learning_rate": 3.961996310994863e-05, + "loss": 0.9416, + "step": 1855 + }, + { + "epoch": 0.09037567258296204, + "grad_norm": 2.341991424560547, + "learning_rate": 3.9619350864757226e-05, + "loss": 1.0312, + "step": 1856 + }, + { + "epoch": 0.09042436637206924, + "grad_norm": 1.4865049123764038, + "learning_rate": 3.961873813153268e-05, + "loss": 1.0314, + "step": 1857 + }, + { + "epoch": 0.09047306016117644, + "grad_norm": 4.002986907958984, + "learning_rate": 3.9618124910290233e-05, + "loss": 0.9935, + "step": 1858 + }, + { + "epoch": 0.09052175395028364, + "grad_norm": 2.1202383041381836, + "learning_rate": 3.9617511201045135e-05, + "loss": 0.8793, + "step": 1859 + }, + { + "epoch": 0.09057044773939084, + "grad_norm": 2.276987075805664, + "learning_rate": 3.961689700381267e-05, + "loss": 0.921, + "step": 1860 + }, + { + "epoch": 0.09061914152849804, + "grad_norm": 3.5636940002441406, + "learning_rate": 3.9616282318608095e-05, + "loss": 0.9779, + "step": 1861 + }, + { + "epoch": 0.09066783531760524, + "grad_norm": 1.3018258810043335, + "learning_rate": 3.96156671454467e-05, + "loss": 0.9302, + "step": 1862 + }, + { + "epoch": 0.09071652910671243, + "grad_norm": 1.7429611682891846, + "learning_rate": 3.961505148434381e-05, + "loss": 0.9466, + "step": 1863 + }, + { + "epoch": 0.09076522289581963, + "grad_norm": 2.7070982456207275, + "learning_rate": 3.961443533531472e-05, + "loss": 0.8651, + "step": 1864 + }, + { + "epoch": 0.09081391668492683, + "grad_norm": 1.8958280086517334, + "learning_rate": 3.961381869837476e-05, + "loss": 0.9682, + "step": 1865 + }, + { + "epoch": 0.09086261047403403, + "grad_norm": 3.2743215560913086, + "learning_rate": 3.961320157353928e-05, + "loss": 1.0181, + "step": 1866 + }, + { + "epoch": 0.09091130426314124, + "grad_norm": 1.7619637250900269, + "learning_rate": 3.961258396082362e-05, + "loss": 1.0405, + "step": 1867 + }, + { + "epoch": 0.09095999805224844, + "grad_norm": 2.020181655883789, + "learning_rate": 3.961196586024314e-05, + "loss": 0.9597, + "step": 1868 + }, + { + "epoch": 0.09100869184135564, + "grad_norm": 1.479326844215393, + "learning_rate": 3.961134727181323e-05, + "loss": 1.0637, + "step": 1869 + }, + { + "epoch": 0.09105738563046284, + "grad_norm": 1.7465124130249023, + "learning_rate": 3.961072819554926e-05, + "loss": 0.8939, + "step": 1870 + }, + { + "epoch": 0.09110607941957004, + "grad_norm": 1.4465434551239014, + "learning_rate": 3.9610108631466644e-05, + "loss": 0.9089, + "step": 1871 + }, + { + "epoch": 0.09115477320867724, + "grad_norm": 2.460724353790283, + "learning_rate": 3.9609488579580784e-05, + "loss": 1.0278, + "step": 1872 + }, + { + "epoch": 0.09120346699778444, + "grad_norm": 1.4311165809631348, + "learning_rate": 3.960886803990711e-05, + "loss": 0.9439, + "step": 1873 + }, + { + "epoch": 0.09125216078689163, + "grad_norm": 0.09240886569023132, + "learning_rate": 3.9608247012461054e-05, + "loss": 0.6284, + "step": 1874 + }, + { + "epoch": 0.09130085457599883, + "grad_norm": 1.7425836324691772, + "learning_rate": 3.960762549725806e-05, + "loss": 0.9488, + "step": 1875 + }, + { + "epoch": 0.09134954836510603, + "grad_norm": 1.5771132707595825, + "learning_rate": 3.9607003494313605e-05, + "loss": 0.9419, + "step": 1876 + }, + { + "epoch": 0.09139824215421323, + "grad_norm": 1.3963121175765991, + "learning_rate": 3.9606381003643145e-05, + "loss": 0.8981, + "step": 1877 + }, + { + "epoch": 0.09144693594332043, + "grad_norm": 1.568121314048767, + "learning_rate": 3.960575802526217e-05, + "loss": 0.8856, + "step": 1878 + }, + { + "epoch": 0.09149562973242763, + "grad_norm": 1.573369026184082, + "learning_rate": 3.960513455918618e-05, + "loss": 0.9238, + "step": 1879 + }, + { + "epoch": 0.09154432352153483, + "grad_norm": 1.606703758239746, + "learning_rate": 3.9604510605430674e-05, + "loss": 1.0317, + "step": 1880 + }, + { + "epoch": 0.09159301731064202, + "grad_norm": 1.9560449123382568, + "learning_rate": 3.960388616401118e-05, + "loss": 0.895, + "step": 1881 + }, + { + "epoch": 0.09164171109974922, + "grad_norm": 1.7368727922439575, + "learning_rate": 3.960326123494324e-05, + "loss": 0.9886, + "step": 1882 + }, + { + "epoch": 0.09169040488885642, + "grad_norm": 1.7215055227279663, + "learning_rate": 3.960263581824238e-05, + "loss": 0.9236, + "step": 1883 + }, + { + "epoch": 0.09173909867796362, + "grad_norm": 1.7324211597442627, + "learning_rate": 3.960200991392417e-05, + "loss": 0.9417, + "step": 1884 + }, + { + "epoch": 0.09178779246707082, + "grad_norm": 1.5384470224380493, + "learning_rate": 3.9601383522004176e-05, + "loss": 0.9462, + "step": 1885 + }, + { + "epoch": 0.09183648625617802, + "grad_norm": 2.7282602787017822, + "learning_rate": 3.960075664249797e-05, + "loss": 0.9326, + "step": 1886 + }, + { + "epoch": 0.09188518004528523, + "grad_norm": 2.0040886402130127, + "learning_rate": 3.960012927542117e-05, + "loss": 0.9182, + "step": 1887 + }, + { + "epoch": 0.09193387383439243, + "grad_norm": 1.518573522567749, + "learning_rate": 3.959950142078937e-05, + "loss": 0.8753, + "step": 1888 + }, + { + "epoch": 0.09198256762349963, + "grad_norm": 1.9441832304000854, + "learning_rate": 3.9598873078618174e-05, + "loss": 0.8484, + "step": 1889 + }, + { + "epoch": 0.09203126141260683, + "grad_norm": 1.6007537841796875, + "learning_rate": 3.959824424892322e-05, + "loss": 0.9679, + "step": 1890 + }, + { + "epoch": 0.09207995520171403, + "grad_norm": 2.10038423538208, + "learning_rate": 3.959761493172016e-05, + "loss": 0.9497, + "step": 1891 + }, + { + "epoch": 0.09212864899082122, + "grad_norm": 1.489881157875061, + "learning_rate": 3.9596985127024645e-05, + "loss": 0.9353, + "step": 1892 + }, + { + "epoch": 0.09217734277992842, + "grad_norm": 2.752655267715454, + "learning_rate": 3.959635483485233e-05, + "loss": 0.9846, + "step": 1893 + }, + { + "epoch": 0.09222603656903562, + "grad_norm": 2.4219870567321777, + "learning_rate": 3.959572405521891e-05, + "loss": 0.8798, + "step": 1894 + }, + { + "epoch": 0.09227473035814282, + "grad_norm": 1.6354162693023682, + "learning_rate": 3.959509278814006e-05, + "loss": 0.9183, + "step": 1895 + }, + { + "epoch": 0.09232342414725002, + "grad_norm": 1.6699397563934326, + "learning_rate": 3.959446103363149e-05, + "loss": 0.8827, + "step": 1896 + }, + { + "epoch": 0.09237211793635722, + "grad_norm": 1.7985703945159912, + "learning_rate": 3.9593828791708926e-05, + "loss": 0.9755, + "step": 1897 + }, + { + "epoch": 0.09242081172546442, + "grad_norm": 2.5263237953186035, + "learning_rate": 3.959319606238808e-05, + "loss": 0.8995, + "step": 1898 + }, + { + "epoch": 0.09246950551457161, + "grad_norm": 1.4790904521942139, + "learning_rate": 3.959256284568469e-05, + "loss": 0.9779, + "step": 1899 + }, + { + "epoch": 0.09251819930367881, + "grad_norm": 2.0694286823272705, + "learning_rate": 3.959192914161452e-05, + "loss": 0.9311, + "step": 1900 + }, + { + "epoch": 0.09256689309278601, + "grad_norm": 2.1097302436828613, + "learning_rate": 3.959129495019331e-05, + "loss": 0.9508, + "step": 1901 + }, + { + "epoch": 0.09261558688189321, + "grad_norm": 4.592859268188477, + "learning_rate": 3.959066027143687e-05, + "loss": 0.8913, + "step": 1902 + }, + { + "epoch": 0.09266428067100041, + "grad_norm": 2.119067668914795, + "learning_rate": 3.9590025105360965e-05, + "loss": 0.9621, + "step": 1903 + }, + { + "epoch": 0.09271297446010761, + "grad_norm": 2.655308485031128, + "learning_rate": 3.9589389451981396e-05, + "loss": 0.9293, + "step": 1904 + }, + { + "epoch": 0.09276166824921481, + "grad_norm": 1.712912917137146, + "learning_rate": 3.9588753311313984e-05, + "loss": 0.8921, + "step": 1905 + }, + { + "epoch": 0.092810362038322, + "grad_norm": 2.080949068069458, + "learning_rate": 3.958811668337454e-05, + "loss": 0.8355, + "step": 1906 + }, + { + "epoch": 0.09285905582742922, + "grad_norm": 1.9860455989837646, + "learning_rate": 3.958747956817891e-05, + "loss": 0.9194, + "step": 1907 + }, + { + "epoch": 0.09290774961653642, + "grad_norm": 1.4540945291519165, + "learning_rate": 3.958684196574294e-05, + "loss": 0.9355, + "step": 1908 + }, + { + "epoch": 0.09295644340564362, + "grad_norm": 2.0347342491149902, + "learning_rate": 3.958620387608249e-05, + "loss": 0.8661, + "step": 1909 + }, + { + "epoch": 0.09300513719475081, + "grad_norm": 2.0772902965545654, + "learning_rate": 3.958556529921343e-05, + "loss": 0.913, + "step": 1910 + }, + { + "epoch": 0.09305383098385801, + "grad_norm": 1.631919026374817, + "learning_rate": 3.958492623515166e-05, + "loss": 0.895, + "step": 1911 + }, + { + "epoch": 0.09310252477296521, + "grad_norm": 1.8386447429656982, + "learning_rate": 3.958428668391305e-05, + "loss": 0.9367, + "step": 1912 + }, + { + "epoch": 0.09315121856207241, + "grad_norm": 4.627694606781006, + "learning_rate": 3.958364664551353e-05, + "loss": 1.0066, + "step": 1913 + }, + { + "epoch": 0.09319991235117961, + "grad_norm": 0.09096984565258026, + "learning_rate": 3.958300611996901e-05, + "loss": 0.7695, + "step": 1914 + }, + { + "epoch": 0.09324860614028681, + "grad_norm": 1.4400404691696167, + "learning_rate": 3.958236510729543e-05, + "loss": 1.0282, + "step": 1915 + }, + { + "epoch": 0.093297299929394, + "grad_norm": 1.591435432434082, + "learning_rate": 3.958172360750873e-05, + "loss": 0.8863, + "step": 1916 + }, + { + "epoch": 0.0933459937185012, + "grad_norm": 2.0800936222076416, + "learning_rate": 3.958108162062488e-05, + "loss": 0.9686, + "step": 1917 + }, + { + "epoch": 0.0933946875076084, + "grad_norm": 3.0156242847442627, + "learning_rate": 3.958043914665983e-05, + "loss": 0.8708, + "step": 1918 + }, + { + "epoch": 0.0934433812967156, + "grad_norm": 1.853438377380371, + "learning_rate": 3.9579796185629573e-05, + "loss": 0.9914, + "step": 1919 + }, + { + "epoch": 0.0934920750858228, + "grad_norm": 2.00252103805542, + "learning_rate": 3.95791527375501e-05, + "loss": 0.9386, + "step": 1920 + }, + { + "epoch": 0.09354076887493, + "grad_norm": 1.7038322687149048, + "learning_rate": 3.957850880243742e-05, + "loss": 0.8149, + "step": 1921 + }, + { + "epoch": 0.0935894626640372, + "grad_norm": 1.572433352470398, + "learning_rate": 3.957786438030754e-05, + "loss": 0.9274, + "step": 1922 + }, + { + "epoch": 0.0936381564531444, + "grad_norm": 1.6059534549713135, + "learning_rate": 3.957721947117651e-05, + "loss": 0.9345, + "step": 1923 + }, + { + "epoch": 0.0936868502422516, + "grad_norm": 3.72151780128479, + "learning_rate": 3.957657407506035e-05, + "loss": 0.9618, + "step": 1924 + }, + { + "epoch": 0.0937355440313588, + "grad_norm": 1.4060771465301514, + "learning_rate": 3.957592819197513e-05, + "loss": 0.9012, + "step": 1925 + }, + { + "epoch": 0.093784237820466, + "grad_norm": 0.09242890030145645, + "learning_rate": 3.9575281821936915e-05, + "loss": 0.6801, + "step": 1926 + }, + { + "epoch": 0.0938329316095732, + "grad_norm": 1.6144672632217407, + "learning_rate": 3.957463496496178e-05, + "loss": 0.9639, + "step": 1927 + }, + { + "epoch": 0.0938816253986804, + "grad_norm": 1.8148374557495117, + "learning_rate": 3.95739876210658e-05, + "loss": 0.899, + "step": 1928 + }, + { + "epoch": 0.0939303191877876, + "grad_norm": 2.1010947227478027, + "learning_rate": 3.9573339790265114e-05, + "loss": 0.989, + "step": 1929 + }, + { + "epoch": 0.0939790129768948, + "grad_norm": 1.8888217210769653, + "learning_rate": 3.9572691472575806e-05, + "loss": 0.8796, + "step": 1930 + }, + { + "epoch": 0.094027706766002, + "grad_norm": 1.4458516836166382, + "learning_rate": 3.957204266801402e-05, + "loss": 0.8955, + "step": 1931 + }, + { + "epoch": 0.0940764005551092, + "grad_norm": 2.414480686187744, + "learning_rate": 3.9571393376595876e-05, + "loss": 0.9217, + "step": 1932 + }, + { + "epoch": 0.0941250943442164, + "grad_norm": 2.188567638397217, + "learning_rate": 3.957074359833755e-05, + "loss": 0.9359, + "step": 1933 + }, + { + "epoch": 0.0941737881333236, + "grad_norm": 0.08284497261047363, + "learning_rate": 3.957009333325518e-05, + "loss": 0.6117, + "step": 1934 + }, + { + "epoch": 0.0942224819224308, + "grad_norm": 1.9844944477081299, + "learning_rate": 3.956944258136496e-05, + "loss": 0.9781, + "step": 1935 + }, + { + "epoch": 0.094271175711538, + "grad_norm": 2.039872169494629, + "learning_rate": 3.956879134268307e-05, + "loss": 0.8456, + "step": 1936 + }, + { + "epoch": 0.09431986950064519, + "grad_norm": 2.0792903900146484, + "learning_rate": 3.956813961722572e-05, + "loss": 0.9512, + "step": 1937 + }, + { + "epoch": 0.09436856328975239, + "grad_norm": 1.9258757829666138, + "learning_rate": 3.9567487405009096e-05, + "loss": 0.9011, + "step": 1938 + }, + { + "epoch": 0.09441725707885959, + "grad_norm": 2.2206451892852783, + "learning_rate": 3.956683470604946e-05, + "loss": 1.0571, + "step": 1939 + }, + { + "epoch": 0.09446595086796679, + "grad_norm": 1.4962248802185059, + "learning_rate": 3.956618152036301e-05, + "loss": 0.8993, + "step": 1940 + }, + { + "epoch": 0.09451464465707399, + "grad_norm": 1.9621692895889282, + "learning_rate": 3.956552784796602e-05, + "loss": 0.8755, + "step": 1941 + }, + { + "epoch": 0.09456333844618119, + "grad_norm": 3.8203721046447754, + "learning_rate": 3.956487368887473e-05, + "loss": 0.9195, + "step": 1942 + }, + { + "epoch": 0.09461203223528838, + "grad_norm": 2.330573320388794, + "learning_rate": 3.956421904310543e-05, + "loss": 0.9711, + "step": 1943 + }, + { + "epoch": 0.09466072602439558, + "grad_norm": 1.7578293085098267, + "learning_rate": 3.95635639106744e-05, + "loss": 0.8628, + "step": 1944 + }, + { + "epoch": 0.09470941981350278, + "grad_norm": 2.117642402648926, + "learning_rate": 3.956290829159793e-05, + "loss": 0.9579, + "step": 1945 + }, + { + "epoch": 0.09475811360260998, + "grad_norm": 2.4137940406799316, + "learning_rate": 3.956225218589234e-05, + "loss": 0.8379, + "step": 1946 + }, + { + "epoch": 0.0948068073917172, + "grad_norm": 2.0070931911468506, + "learning_rate": 3.9561595593573934e-05, + "loss": 1.0172, + "step": 1947 + }, + { + "epoch": 0.09485550118082439, + "grad_norm": 1.3819822072982788, + "learning_rate": 3.9560938514659055e-05, + "loss": 0.8547, + "step": 1948 + }, + { + "epoch": 0.09490419496993159, + "grad_norm": 1.95283043384552, + "learning_rate": 3.956028094916405e-05, + "loss": 0.857, + "step": 1949 + }, + { + "epoch": 0.09495288875903879, + "grad_norm": 1.8917683362960815, + "learning_rate": 3.955962289710527e-05, + "loss": 0.888, + "step": 1950 + }, + { + "epoch": 0.09500158254814599, + "grad_norm": 1.699350118637085, + "learning_rate": 3.955896435849909e-05, + "loss": 0.8976, + "step": 1951 + }, + { + "epoch": 0.09505027633725319, + "grad_norm": 1.6850616931915283, + "learning_rate": 3.955830533336189e-05, + "loss": 0.9177, + "step": 1952 + }, + { + "epoch": 0.09509897012636039, + "grad_norm": 1.7995411157608032, + "learning_rate": 3.9557645821710056e-05, + "loss": 0.9956, + "step": 1953 + }, + { + "epoch": 0.09514766391546758, + "grad_norm": 1.6432489156723022, + "learning_rate": 3.9556985823560004e-05, + "loss": 0.9569, + "step": 1954 + }, + { + "epoch": 0.09519635770457478, + "grad_norm": 1.7408568859100342, + "learning_rate": 3.9556325338928146e-05, + "loss": 0.9189, + "step": 1955 + }, + { + "epoch": 0.09524505149368198, + "grad_norm": 2.1862452030181885, + "learning_rate": 3.955566436783091e-05, + "loss": 0.875, + "step": 1956 + }, + { + "epoch": 0.09529374528278918, + "grad_norm": 1.6633388996124268, + "learning_rate": 3.955500291028474e-05, + "loss": 1.0187, + "step": 1957 + }, + { + "epoch": 0.09534243907189638, + "grad_norm": 2.062755823135376, + "learning_rate": 3.9554340966306094e-05, + "loss": 0.9131, + "step": 1958 + }, + { + "epoch": 0.09539113286100358, + "grad_norm": 1.591483235359192, + "learning_rate": 3.955367853591143e-05, + "loss": 0.9292, + "step": 1959 + }, + { + "epoch": 0.09543982665011078, + "grad_norm": 1.4442996978759766, + "learning_rate": 3.9553015619117225e-05, + "loss": 0.8716, + "step": 1960 + }, + { + "epoch": 0.09548852043921797, + "grad_norm": 1.5923727750778198, + "learning_rate": 3.955235221593998e-05, + "loss": 0.8892, + "step": 1961 + }, + { + "epoch": 0.09553721422832517, + "grad_norm": 1.3709079027175903, + "learning_rate": 3.955168832639619e-05, + "loss": 0.8291, + "step": 1962 + }, + { + "epoch": 0.09558590801743237, + "grad_norm": 1.911008358001709, + "learning_rate": 3.955102395050236e-05, + "loss": 0.8567, + "step": 1963 + }, + { + "epoch": 0.09563460180653957, + "grad_norm": 1.3820276260375977, + "learning_rate": 3.9550359088275044e-05, + "loss": 0.9004, + "step": 1964 + }, + { + "epoch": 0.09568329559564677, + "grad_norm": 2.3099524974823, + "learning_rate": 3.954969373973075e-05, + "loss": 0.9013, + "step": 1965 + }, + { + "epoch": 0.09573198938475397, + "grad_norm": 1.8434088230133057, + "learning_rate": 3.9549027904886054e-05, + "loss": 0.9235, + "step": 1966 + }, + { + "epoch": 0.09578068317386118, + "grad_norm": 1.9649611711502075, + "learning_rate": 3.95483615837575e-05, + "loss": 0.9174, + "step": 1967 + }, + { + "epoch": 0.09582937696296838, + "grad_norm": 1.3398497104644775, + "learning_rate": 3.9547694776361666e-05, + "loss": 0.8693, + "step": 1968 + }, + { + "epoch": 0.09587807075207558, + "grad_norm": 0.0892953872680664, + "learning_rate": 3.9547027482715145e-05, + "loss": 0.7113, + "step": 1969 + }, + { + "epoch": 0.09592676454118278, + "grad_norm": 6.74742317199707, + "learning_rate": 3.954635970283453e-05, + "loss": 0.914, + "step": 1970 + }, + { + "epoch": 0.09597545833028998, + "grad_norm": 1.6558008193969727, + "learning_rate": 3.954569143673644e-05, + "loss": 0.9514, + "step": 1971 + }, + { + "epoch": 0.09602415211939717, + "grad_norm": 1.8051583766937256, + "learning_rate": 3.9545022684437494e-05, + "loss": 0.9699, + "step": 1972 + }, + { + "epoch": 0.09607284590850437, + "grad_norm": 2.220749855041504, + "learning_rate": 3.954435344595432e-05, + "loss": 0.9082, + "step": 1973 + }, + { + "epoch": 0.09612153969761157, + "grad_norm": 2.10673451423645, + "learning_rate": 3.954368372130358e-05, + "loss": 0.8784, + "step": 1974 + }, + { + "epoch": 0.09617023348671877, + "grad_norm": 0.08623719960451126, + "learning_rate": 3.954301351050192e-05, + "loss": 0.6157, + "step": 1975 + }, + { + "epoch": 0.09621892727582597, + "grad_norm": 1.9800817966461182, + "learning_rate": 3.954234281356602e-05, + "loss": 0.8815, + "step": 1976 + }, + { + "epoch": 0.09626762106493317, + "grad_norm": 2.243124008178711, + "learning_rate": 3.954167163051256e-05, + "loss": 0.9436, + "step": 1977 + }, + { + "epoch": 0.09631631485404037, + "grad_norm": 1.5135035514831543, + "learning_rate": 3.9540999961358235e-05, + "loss": 0.8701, + "step": 1978 + }, + { + "epoch": 0.09636500864314756, + "grad_norm": 2.1727755069732666, + "learning_rate": 3.954032780611975e-05, + "loss": 0.9172, + "step": 1979 + }, + { + "epoch": 0.09641370243225476, + "grad_norm": 1.877419114112854, + "learning_rate": 3.9539655164813835e-05, + "loss": 0.8447, + "step": 1980 + }, + { + "epoch": 0.09646239622136196, + "grad_norm": 2.391598701477051, + "learning_rate": 3.953898203745722e-05, + "loss": 0.8967, + "step": 1981 + }, + { + "epoch": 0.09651109001046916, + "grad_norm": 1.7272597551345825, + "learning_rate": 3.953830842406664e-05, + "loss": 0.8785, + "step": 1982 + }, + { + "epoch": 0.09655978379957636, + "grad_norm": 0.08253034949302673, + "learning_rate": 3.953763432465886e-05, + "loss": 0.601, + "step": 1983 + }, + { + "epoch": 0.09660847758868356, + "grad_norm": 1.4505306482315063, + "learning_rate": 3.953695973925064e-05, + "loss": 0.8942, + "step": 1984 + }, + { + "epoch": 0.09665717137779076, + "grad_norm": 2.8768556118011475, + "learning_rate": 3.9536284667858766e-05, + "loss": 0.9684, + "step": 1985 + }, + { + "epoch": 0.09670586516689796, + "grad_norm": 2.2474756240844727, + "learning_rate": 3.9535609110500035e-05, + "loss": 0.9377, + "step": 1986 + }, + { + "epoch": 0.09675455895600517, + "grad_norm": 2.4972569942474365, + "learning_rate": 3.9534933067191244e-05, + "loss": 0.9064, + "step": 1987 + }, + { + "epoch": 0.09680325274511237, + "grad_norm": 2.1838254928588867, + "learning_rate": 3.953425653794921e-05, + "loss": 0.9042, + "step": 1988 + }, + { + "epoch": 0.09685194653421957, + "grad_norm": 2.1695055961608887, + "learning_rate": 3.953357952279076e-05, + "loss": 0.988, + "step": 1989 + }, + { + "epoch": 0.09690064032332676, + "grad_norm": 2.0041041374206543, + "learning_rate": 3.953290202173274e-05, + "loss": 0.8786, + "step": 1990 + }, + { + "epoch": 0.09694933411243396, + "grad_norm": 1.7167332172393799, + "learning_rate": 3.953222403479201e-05, + "loss": 0.9248, + "step": 1991 + }, + { + "epoch": 0.09699802790154116, + "grad_norm": 2.2657039165496826, + "learning_rate": 3.953154556198541e-05, + "loss": 0.9886, + "step": 1992 + }, + { + "epoch": 0.09704672169064836, + "grad_norm": 1.7240862846374512, + "learning_rate": 3.953086660332985e-05, + "loss": 0.9288, + "step": 1993 + }, + { + "epoch": 0.09709541547975556, + "grad_norm": 2.2198734283447266, + "learning_rate": 3.953018715884219e-05, + "loss": 0.8936, + "step": 1994 + }, + { + "epoch": 0.09714410926886276, + "grad_norm": 2.2118401527404785, + "learning_rate": 3.952950722853935e-05, + "loss": 0.9243, + "step": 1995 + }, + { + "epoch": 0.09719280305796996, + "grad_norm": 1.9997262954711914, + "learning_rate": 3.9528826812438236e-05, + "loss": 0.896, + "step": 1996 + }, + { + "epoch": 0.09724149684707716, + "grad_norm": 1.6915853023529053, + "learning_rate": 3.952814591055577e-05, + "loss": 0.834, + "step": 1997 + }, + { + "epoch": 0.09729019063618435, + "grad_norm": 1.7702677249908447, + "learning_rate": 3.9527464522908904e-05, + "loss": 0.8664, + "step": 1998 + }, + { + "epoch": 0.09733888442529155, + "grad_norm": 1.857364296913147, + "learning_rate": 3.9526782649514574e-05, + "loss": 0.9299, + "step": 1999 + }, + { + "epoch": 0.09738757821439875, + "grad_norm": 1.4227253198623657, + "learning_rate": 3.952610029038974e-05, + "loss": 0.8907, + "step": 2000 + }, + { + "epoch": 0.09743627200350595, + "grad_norm": 1.9342684745788574, + "learning_rate": 3.952541744555139e-05, + "loss": 0.8681, + "step": 2001 + }, + { + "epoch": 0.09748496579261315, + "grad_norm": 1.7360230684280396, + "learning_rate": 3.9524734115016494e-05, + "loss": 0.8927, + "step": 2002 + }, + { + "epoch": 0.09753365958172035, + "grad_norm": 1.8327610492706299, + "learning_rate": 3.952405029880205e-05, + "loss": 0.9122, + "step": 2003 + }, + { + "epoch": 0.09758235337082755, + "grad_norm": 3.119948387145996, + "learning_rate": 3.952336599692509e-05, + "loss": 0.922, + "step": 2004 + }, + { + "epoch": 0.09763104715993474, + "grad_norm": 0.08396008610725403, + "learning_rate": 3.9522681209402606e-05, + "loss": 0.6188, + "step": 2005 + }, + { + "epoch": 0.09767974094904194, + "grad_norm": 1.670361876487732, + "learning_rate": 3.952199593625166e-05, + "loss": 0.9775, + "step": 2006 + }, + { + "epoch": 0.09772843473814916, + "grad_norm": 0.08727312088012695, + "learning_rate": 3.9521310177489274e-05, + "loss": 0.5658, + "step": 2007 + }, + { + "epoch": 0.09777712852725635, + "grad_norm": 1.9300869703292847, + "learning_rate": 3.952062393313253e-05, + "loss": 0.8977, + "step": 2008 + }, + { + "epoch": 0.09782582231636355, + "grad_norm": 2.3740146160125732, + "learning_rate": 3.9519937203198476e-05, + "loss": 0.9627, + "step": 2009 + }, + { + "epoch": 0.09787451610547075, + "grad_norm": 1.9362905025482178, + "learning_rate": 3.95192499877042e-05, + "loss": 0.8272, + "step": 2010 + }, + { + "epoch": 0.09792320989457795, + "grad_norm": 3.110717535018921, + "learning_rate": 3.9518562286666816e-05, + "loss": 0.9151, + "step": 2011 + }, + { + "epoch": 0.09797190368368515, + "grad_norm": 1.8715312480926514, + "learning_rate": 3.95178741001034e-05, + "loss": 0.9644, + "step": 2012 + }, + { + "epoch": 0.09802059747279235, + "grad_norm": 1.8744604587554932, + "learning_rate": 3.95171854280311e-05, + "loss": 0.9329, + "step": 2013 + }, + { + "epoch": 0.09806929126189955, + "grad_norm": 2.064594030380249, + "learning_rate": 3.951649627046703e-05, + "loss": 0.913, + "step": 2014 + }, + { + "epoch": 0.09811798505100675, + "grad_norm": 3.1504311561584473, + "learning_rate": 3.951580662742833e-05, + "loss": 1.0214, + "step": 2015 + }, + { + "epoch": 0.09816667884011394, + "grad_norm": 1.6380974054336548, + "learning_rate": 3.951511649893216e-05, + "loss": 0.9089, + "step": 2016 + }, + { + "epoch": 0.09821537262922114, + "grad_norm": 1.4184125661849976, + "learning_rate": 3.9514425884995694e-05, + "loss": 0.8687, + "step": 2017 + }, + { + "epoch": 0.09826406641832834, + "grad_norm": 1.6450722217559814, + "learning_rate": 3.9513734785636097e-05, + "loss": 1.0017, + "step": 2018 + }, + { + "epoch": 0.09831276020743554, + "grad_norm": 1.9599946737289429, + "learning_rate": 3.9513043200870574e-05, + "loss": 0.9483, + "step": 2019 + }, + { + "epoch": 0.09836145399654274, + "grad_norm": 0.08194491267204285, + "learning_rate": 3.951235113071632e-05, + "loss": 0.5376, + "step": 2020 + }, + { + "epoch": 0.09841014778564994, + "grad_norm": 2.112320899963379, + "learning_rate": 3.951165857519056e-05, + "loss": 0.9497, + "step": 2021 + }, + { + "epoch": 0.09845884157475714, + "grad_norm": 2.045133590698242, + "learning_rate": 3.95109655343105e-05, + "loss": 0.8823, + "step": 2022 + }, + { + "epoch": 0.09850753536386433, + "grad_norm": 2.267242193222046, + "learning_rate": 3.95102720080934e-05, + "loss": 0.9634, + "step": 2023 + }, + { + "epoch": 0.09855622915297153, + "grad_norm": 2.3285882472991943, + "learning_rate": 3.9509577996556505e-05, + "loss": 0.9759, + "step": 2024 + }, + { + "epoch": 0.09860492294207873, + "grad_norm": 1.834794521331787, + "learning_rate": 3.950888349971708e-05, + "loss": 0.9841, + "step": 2025 + }, + { + "epoch": 0.09865361673118593, + "grad_norm": 1.8828628063201904, + "learning_rate": 3.95081885175924e-05, + "loss": 0.8985, + "step": 2026 + }, + { + "epoch": 0.09870231052029314, + "grad_norm": 1.7402472496032715, + "learning_rate": 3.950749305019975e-05, + "loss": 0.8773, + "step": 2027 + }, + { + "epoch": 0.09875100430940034, + "grad_norm": 2.0373711585998535, + "learning_rate": 3.950679709755643e-05, + "loss": 0.8761, + "step": 2028 + }, + { + "epoch": 0.09879969809850754, + "grad_norm": 1.7337172031402588, + "learning_rate": 3.9506100659679753e-05, + "loss": 0.9599, + "step": 2029 + }, + { + "epoch": 0.09884839188761474, + "grad_norm": 1.956188440322876, + "learning_rate": 3.950540373658705e-05, + "loss": 0.8843, + "step": 2030 + }, + { + "epoch": 0.09889708567672194, + "grad_norm": 1.8859926462173462, + "learning_rate": 3.950470632829565e-05, + "loss": 0.9953, + "step": 2031 + }, + { + "epoch": 0.09894577946582914, + "grad_norm": 1.7395325899124146, + "learning_rate": 3.9504008434822895e-05, + "loss": 0.9391, + "step": 2032 + }, + { + "epoch": 0.09899447325493634, + "grad_norm": 1.8578529357910156, + "learning_rate": 3.9503310056186154e-05, + "loss": 0.8732, + "step": 2033 + }, + { + "epoch": 0.09904316704404353, + "grad_norm": 1.5169990062713623, + "learning_rate": 3.95026111924028e-05, + "loss": 0.9512, + "step": 2034 + }, + { + "epoch": 0.09909186083315073, + "grad_norm": 2.1416923999786377, + "learning_rate": 3.9501911843490214e-05, + "loss": 0.8318, + "step": 2035 + }, + { + "epoch": 0.09914055462225793, + "grad_norm": 1.808195948600769, + "learning_rate": 3.950121200946579e-05, + "loss": 0.9498, + "step": 2036 + }, + { + "epoch": 0.09918924841136513, + "grad_norm": 2.1158671379089355, + "learning_rate": 3.9500511690346945e-05, + "loss": 0.9451, + "step": 2037 + }, + { + "epoch": 0.09923794220047233, + "grad_norm": 3.2971532344818115, + "learning_rate": 3.949981088615109e-05, + "loss": 0.9219, + "step": 2038 + }, + { + "epoch": 0.09928663598957953, + "grad_norm": 1.9359285831451416, + "learning_rate": 3.949910959689566e-05, + "loss": 0.8655, + "step": 2039 + }, + { + "epoch": 0.09933532977868673, + "grad_norm": 2.978400945663452, + "learning_rate": 3.94984078225981e-05, + "loss": 0.9155, + "step": 2040 + }, + { + "epoch": 0.09938402356779392, + "grad_norm": 2.6946680545806885, + "learning_rate": 3.949770556327587e-05, + "loss": 0.8927, + "step": 2041 + }, + { + "epoch": 0.09943271735690112, + "grad_norm": 1.6053799390792847, + "learning_rate": 3.9497002818946436e-05, + "loss": 1.0103, + "step": 2042 + }, + { + "epoch": 0.09948141114600832, + "grad_norm": 2.06199312210083, + "learning_rate": 3.949629958962728e-05, + "loss": 0.9188, + "step": 2043 + }, + { + "epoch": 0.09953010493511552, + "grad_norm": 0.08952584117650986, + "learning_rate": 3.94955958753359e-05, + "loss": 0.6494, + "step": 2044 + }, + { + "epoch": 0.09957879872422272, + "grad_norm": 4.8421311378479, + "learning_rate": 3.949489167608978e-05, + "loss": 0.9389, + "step": 2045 + }, + { + "epoch": 0.09962749251332992, + "grad_norm": 2.935642957687378, + "learning_rate": 3.949418699190646e-05, + "loss": 0.9463, + "step": 2046 + }, + { + "epoch": 0.09967618630243713, + "grad_norm": 3.3449668884277344, + "learning_rate": 3.949348182280346e-05, + "loss": 0.8859, + "step": 2047 + }, + { + "epoch": 0.09972488009154433, + "grad_norm": 0.07873675227165222, + "learning_rate": 3.949277616879833e-05, + "loss": 0.6089, + "step": 2048 + }, + { + "epoch": 0.09977357388065153, + "grad_norm": 1.798781156539917, + "learning_rate": 3.949207002990861e-05, + "loss": 0.9412, + "step": 2049 + }, + { + "epoch": 0.09982226766975873, + "grad_norm": 2.0300798416137695, + "learning_rate": 3.949136340615187e-05, + "loss": 0.932, + "step": 2050 + }, + { + "epoch": 0.09987096145886593, + "grad_norm": 1.8099098205566406, + "learning_rate": 3.949065629754569e-05, + "loss": 0.9927, + "step": 2051 + }, + { + "epoch": 0.09991965524797312, + "grad_norm": 1.462530255317688, + "learning_rate": 3.948994870410765e-05, + "loss": 0.9515, + "step": 2052 + }, + { + "epoch": 0.09996834903708032, + "grad_norm": 2.062880039215088, + "learning_rate": 3.948924062585537e-05, + "loss": 0.9223, + "step": 2053 + }, + { + "epoch": 0.10001704282618752, + "grad_norm": 2.4907729625701904, + "learning_rate": 3.948853206280644e-05, + "loss": 0.9723, + "step": 2054 + }, + { + "epoch": 0.10006573661529472, + "grad_norm": 0.0850888341665268, + "learning_rate": 3.948782301497851e-05, + "loss": 0.6467, + "step": 2055 + }, + { + "epoch": 0.10011443040440192, + "grad_norm": 3.2489707469940186, + "learning_rate": 3.94871134823892e-05, + "loss": 0.9634, + "step": 2056 + }, + { + "epoch": 0.10016312419350912, + "grad_norm": 3.1363539695739746, + "learning_rate": 3.9486403465056166e-05, + "loss": 0.9575, + "step": 2057 + }, + { + "epoch": 0.10021181798261632, + "grad_norm": 2.6852195262908936, + "learning_rate": 3.948569296299707e-05, + "loss": 0.8928, + "step": 2058 + }, + { + "epoch": 0.10026051177172352, + "grad_norm": 2.1946828365325928, + "learning_rate": 3.948498197622958e-05, + "loss": 1.0057, + "step": 2059 + }, + { + "epoch": 0.10030920556083071, + "grad_norm": 1.299736499786377, + "learning_rate": 3.94842705047714e-05, + "loss": 0.9267, + "step": 2060 + }, + { + "epoch": 0.10035789934993791, + "grad_norm": 2.262009859085083, + "learning_rate": 3.94835585486402e-05, + "loss": 0.963, + "step": 2061 + }, + { + "epoch": 0.10040659313904511, + "grad_norm": 0.0895622968673706, + "learning_rate": 3.9482846107853715e-05, + "loss": 0.6371, + "step": 2062 + }, + { + "epoch": 0.10045528692815231, + "grad_norm": 1.9551748037338257, + "learning_rate": 3.948213318242965e-05, + "loss": 1.0044, + "step": 2063 + }, + { + "epoch": 0.10050398071725951, + "grad_norm": 3.0476415157318115, + "learning_rate": 3.9481419772385746e-05, + "loss": 0.8872, + "step": 2064 + }, + { + "epoch": 0.10055267450636671, + "grad_norm": 1.688635230064392, + "learning_rate": 3.948070587773975e-05, + "loss": 0.9952, + "step": 2065 + }, + { + "epoch": 0.1006013682954739, + "grad_norm": 2.328505754470825, + "learning_rate": 3.947999149850942e-05, + "loss": 0.8869, + "step": 2066 + }, + { + "epoch": 0.10065006208458112, + "grad_norm": 1.8742868900299072, + "learning_rate": 3.9479276634712525e-05, + "loss": 0.9589, + "step": 2067 + }, + { + "epoch": 0.10069875587368832, + "grad_norm": 1.806807518005371, + "learning_rate": 3.947856128636685e-05, + "loss": 0.9679, + "step": 2068 + }, + { + "epoch": 0.10074744966279552, + "grad_norm": 1.857726812362671, + "learning_rate": 3.9477845453490184e-05, + "loss": 0.8734, + "step": 2069 + }, + { + "epoch": 0.10079614345190271, + "grad_norm": 1.721701741218567, + "learning_rate": 3.947712913610034e-05, + "loss": 0.9587, + "step": 2070 + }, + { + "epoch": 0.10084483724100991, + "grad_norm": 1.7502093315124512, + "learning_rate": 3.9476412334215125e-05, + "loss": 0.87, + "step": 2071 + }, + { + "epoch": 0.10089353103011711, + "grad_norm": 2.4869675636291504, + "learning_rate": 3.9475695047852386e-05, + "loss": 0.9415, + "step": 2072 + }, + { + "epoch": 0.10094222481922431, + "grad_norm": 1.859038233757019, + "learning_rate": 3.947497727702995e-05, + "loss": 0.8685, + "step": 2073 + }, + { + "epoch": 0.10099091860833151, + "grad_norm": 1.4160138368606567, + "learning_rate": 3.947425902176568e-05, + "loss": 0.9507, + "step": 2074 + }, + { + "epoch": 0.10103961239743871, + "grad_norm": 1.9763356447219849, + "learning_rate": 3.9473540282077445e-05, + "loss": 0.9899, + "step": 2075 + }, + { + "epoch": 0.1010883061865459, + "grad_norm": 2.372020721435547, + "learning_rate": 3.947282105798312e-05, + "loss": 0.9694, + "step": 2076 + }, + { + "epoch": 0.1011369999756531, + "grad_norm": 1.9002689123153687, + "learning_rate": 3.9472101349500594e-05, + "loss": 0.9509, + "step": 2077 + }, + { + "epoch": 0.1011856937647603, + "grad_norm": 1.6374051570892334, + "learning_rate": 3.947138115664778e-05, + "loss": 0.9673, + "step": 2078 + }, + { + "epoch": 0.1012343875538675, + "grad_norm": 2.507394790649414, + "learning_rate": 3.9470660479442575e-05, + "loss": 0.848, + "step": 2079 + }, + { + "epoch": 0.1012830813429747, + "grad_norm": 4.359569549560547, + "learning_rate": 3.946993931790292e-05, + "loss": 0.8996, + "step": 2080 + }, + { + "epoch": 0.1013317751320819, + "grad_norm": 2.603015661239624, + "learning_rate": 3.946921767204674e-05, + "loss": 0.8345, + "step": 2081 + }, + { + "epoch": 0.1013804689211891, + "grad_norm": 1.8721859455108643, + "learning_rate": 3.9468495541892e-05, + "loss": 0.9406, + "step": 2082 + }, + { + "epoch": 0.1014291627102963, + "grad_norm": 2.144721031188965, + "learning_rate": 3.946777292745666e-05, + "loss": 0.948, + "step": 2083 + }, + { + "epoch": 0.1014778564994035, + "grad_norm": 1.7326546907424927, + "learning_rate": 3.946704982875869e-05, + "loss": 0.9021, + "step": 2084 + }, + { + "epoch": 0.1015265502885107, + "grad_norm": 1.721088171005249, + "learning_rate": 3.946632624581608e-05, + "loss": 0.9637, + "step": 2085 + }, + { + "epoch": 0.1015752440776179, + "grad_norm": 2.2637288570404053, + "learning_rate": 3.946560217864684e-05, + "loss": 0.9604, + "step": 2086 + }, + { + "epoch": 0.1016239378667251, + "grad_norm": 1.7337347269058228, + "learning_rate": 3.946487762726897e-05, + "loss": 0.868, + "step": 2087 + }, + { + "epoch": 0.1016726316558323, + "grad_norm": 1.8402683734893799, + "learning_rate": 3.946415259170048e-05, + "loss": 0.8892, + "step": 2088 + }, + { + "epoch": 0.1017213254449395, + "grad_norm": 3.4744982719421387, + "learning_rate": 3.9463427071959426e-05, + "loss": 0.9333, + "step": 2089 + }, + { + "epoch": 0.1017700192340467, + "grad_norm": 1.5646774768829346, + "learning_rate": 3.9462701068063854e-05, + "loss": 0.9331, + "step": 2090 + }, + { + "epoch": 0.1018187130231539, + "grad_norm": 2.1203935146331787, + "learning_rate": 3.946197458003181e-05, + "loss": 0.9708, + "step": 2091 + }, + { + "epoch": 0.1018674068122611, + "grad_norm": 2.3687729835510254, + "learning_rate": 3.946124760788138e-05, + "loss": 1.0541, + "step": 2092 + }, + { + "epoch": 0.1019161006013683, + "grad_norm": 1.905217170715332, + "learning_rate": 3.946052015163064e-05, + "loss": 1.0058, + "step": 2093 + }, + { + "epoch": 0.1019647943904755, + "grad_norm": 1.9619389772415161, + "learning_rate": 3.945979221129769e-05, + "loss": 0.9464, + "step": 2094 + }, + { + "epoch": 0.1020134881795827, + "grad_norm": 1.9032012224197388, + "learning_rate": 3.945906378690064e-05, + "loss": 0.9515, + "step": 2095 + }, + { + "epoch": 0.1020621819686899, + "grad_norm": 1.8253191709518433, + "learning_rate": 3.945833487845759e-05, + "loss": 0.8948, + "step": 2096 + }, + { + "epoch": 0.10211087575779709, + "grad_norm": 1.9926567077636719, + "learning_rate": 3.945760548598669e-05, + "loss": 0.9329, + "step": 2097 + }, + { + "epoch": 0.10215956954690429, + "grad_norm": 3.731867790222168, + "learning_rate": 3.945687560950609e-05, + "loss": 0.9503, + "step": 2098 + }, + { + "epoch": 0.10220826333601149, + "grad_norm": 1.4331456422805786, + "learning_rate": 3.945614524903392e-05, + "loss": 0.9477, + "step": 2099 + }, + { + "epoch": 0.10225695712511869, + "grad_norm": 1.5364928245544434, + "learning_rate": 3.945541440458837e-05, + "loss": 0.9012, + "step": 2100 + }, + { + "epoch": 0.10230565091422589, + "grad_norm": 1.8690587282180786, + "learning_rate": 3.945468307618761e-05, + "loss": 0.8054, + "step": 2101 + }, + { + "epoch": 0.10235434470333309, + "grad_norm": 3.042936086654663, + "learning_rate": 3.945395126384984e-05, + "loss": 0.9408, + "step": 2102 + }, + { + "epoch": 0.10240303849244028, + "grad_norm": 6.011319637298584, + "learning_rate": 3.9453218967593254e-05, + "loss": 0.9267, + "step": 2103 + }, + { + "epoch": 0.10245173228154748, + "grad_norm": 1.8756024837493896, + "learning_rate": 3.945248618743607e-05, + "loss": 1.0003, + "step": 2104 + }, + { + "epoch": 0.10250042607065468, + "grad_norm": 3.5530331134796143, + "learning_rate": 3.945175292339652e-05, + "loss": 0.8337, + "step": 2105 + }, + { + "epoch": 0.10254911985976188, + "grad_norm": 1.7877181768417358, + "learning_rate": 3.945101917549285e-05, + "loss": 0.8896, + "step": 2106 + }, + { + "epoch": 0.1025978136488691, + "grad_norm": 1.7960820198059082, + "learning_rate": 3.94502849437433e-05, + "loss": 0.9226, + "step": 2107 + }, + { + "epoch": 0.10264650743797629, + "grad_norm": 2.0304932594299316, + "learning_rate": 3.944955022816614e-05, + "loss": 0.803, + "step": 2108 + }, + { + "epoch": 0.10269520122708349, + "grad_norm": 2.5673179626464844, + "learning_rate": 3.944881502877964e-05, + "loss": 0.9705, + "step": 2109 + }, + { + "epoch": 0.10274389501619069, + "grad_norm": 8.183040618896484, + "learning_rate": 3.9448079345602096e-05, + "loss": 0.9239, + "step": 2110 + }, + { + "epoch": 0.10279258880529789, + "grad_norm": 1.5730286836624146, + "learning_rate": 3.94473431786518e-05, + "loss": 0.8928, + "step": 2111 + }, + { + "epoch": 0.10284128259440509, + "grad_norm": 2.6962625980377197, + "learning_rate": 3.944660652794708e-05, + "loss": 0.9057, + "step": 2112 + }, + { + "epoch": 0.10288997638351229, + "grad_norm": 1.8725252151489258, + "learning_rate": 3.944586939350624e-05, + "loss": 0.8854, + "step": 2113 + }, + { + "epoch": 0.10293867017261948, + "grad_norm": 2.538309097290039, + "learning_rate": 3.944513177534763e-05, + "loss": 0.9027, + "step": 2114 + }, + { + "epoch": 0.10298736396172668, + "grad_norm": 2.0968692302703857, + "learning_rate": 3.94443936734896e-05, + "loss": 0.8128, + "step": 2115 + }, + { + "epoch": 0.10303605775083388, + "grad_norm": 2.5053305625915527, + "learning_rate": 3.9443655087950494e-05, + "loss": 0.8934, + "step": 2116 + }, + { + "epoch": 0.10308475153994108, + "grad_norm": 2.4418861865997314, + "learning_rate": 3.94429160187487e-05, + "loss": 0.9325, + "step": 2117 + }, + { + "epoch": 0.10313344532904828, + "grad_norm": 2.6940982341766357, + "learning_rate": 3.944217646590259e-05, + "loss": 0.921, + "step": 2118 + }, + { + "epoch": 0.10318213911815548, + "grad_norm": 1.6643105745315552, + "learning_rate": 3.9441436429430576e-05, + "loss": 0.9688, + "step": 2119 + }, + { + "epoch": 0.10323083290726268, + "grad_norm": 2.048874855041504, + "learning_rate": 3.9440695909351055e-05, + "loss": 0.8512, + "step": 2120 + }, + { + "epoch": 0.10327952669636987, + "grad_norm": 1.6013410091400146, + "learning_rate": 3.943995490568245e-05, + "loss": 0.8722, + "step": 2121 + }, + { + "epoch": 0.10332822048547707, + "grad_norm": 1.5178931951522827, + "learning_rate": 3.9439213418443196e-05, + "loss": 0.877, + "step": 2122 + }, + { + "epoch": 0.10337691427458427, + "grad_norm": 1.7146650552749634, + "learning_rate": 3.943847144765174e-05, + "loss": 0.8549, + "step": 2123 + }, + { + "epoch": 0.10342560806369147, + "grad_norm": 2.0165393352508545, + "learning_rate": 3.943772899332653e-05, + "loss": 0.9783, + "step": 2124 + }, + { + "epoch": 0.10347430185279867, + "grad_norm": 2.0154924392700195, + "learning_rate": 3.943698605548604e-05, + "loss": 1.0211, + "step": 2125 + }, + { + "epoch": 0.10352299564190587, + "grad_norm": 1.959122896194458, + "learning_rate": 3.943624263414875e-05, + "loss": 0.9763, + "step": 2126 + }, + { + "epoch": 0.10357168943101308, + "grad_norm": 1.7147399187088013, + "learning_rate": 3.943549872933315e-05, + "loss": 1.0734, + "step": 2127 + }, + { + "epoch": 0.10362038322012028, + "grad_norm": 1.8630530834197998, + "learning_rate": 3.943475434105775e-05, + "loss": 0.9503, + "step": 2128 + }, + { + "epoch": 0.10366907700922748, + "grad_norm": 2.1487488746643066, + "learning_rate": 3.9434009469341065e-05, + "loss": 1.0211, + "step": 2129 + }, + { + "epoch": 0.10371777079833468, + "grad_norm": 1.8703763484954834, + "learning_rate": 3.943326411420162e-05, + "loss": 0.888, + "step": 2130 + }, + { + "epoch": 0.10376646458744188, + "grad_norm": 2.8099606037139893, + "learning_rate": 3.943251827565796e-05, + "loss": 0.8333, + "step": 2131 + }, + { + "epoch": 0.10381515837654907, + "grad_norm": 1.8446967601776123, + "learning_rate": 3.943177195372863e-05, + "loss": 0.968, + "step": 2132 + }, + { + "epoch": 0.10386385216565627, + "grad_norm": 2.2069666385650635, + "learning_rate": 3.943102514843221e-05, + "loss": 0.8803, + "step": 2133 + }, + { + "epoch": 0.10391254595476347, + "grad_norm": 2.1205313205718994, + "learning_rate": 3.9430277859787256e-05, + "loss": 0.9562, + "step": 2134 + }, + { + "epoch": 0.10396123974387067, + "grad_norm": 2.1577866077423096, + "learning_rate": 3.9429530087812374e-05, + "loss": 0.8559, + "step": 2135 + }, + { + "epoch": 0.10400993353297787, + "grad_norm": 2.21105694770813, + "learning_rate": 3.9428781832526165e-05, + "loss": 0.9249, + "step": 2136 + }, + { + "epoch": 0.10405862732208507, + "grad_norm": 2.0965569019317627, + "learning_rate": 3.942803309394723e-05, + "loss": 0.9326, + "step": 2137 + }, + { + "epoch": 0.10410732111119227, + "grad_norm": 2.0878825187683105, + "learning_rate": 3.94272838720942e-05, + "loss": 0.9042, + "step": 2138 + }, + { + "epoch": 0.10415601490029947, + "grad_norm": 3.881133556365967, + "learning_rate": 3.942653416698572e-05, + "loss": 0.8943, + "step": 2139 + }, + { + "epoch": 0.10420470868940666, + "grad_norm": 2.677551507949829, + "learning_rate": 3.9425783978640425e-05, + "loss": 0.9594, + "step": 2140 + }, + { + "epoch": 0.10425340247851386, + "grad_norm": 2.1326351165771484, + "learning_rate": 3.942503330707698e-05, + "loss": 0.9957, + "step": 2141 + }, + { + "epoch": 0.10430209626762106, + "grad_norm": 2.372316598892212, + "learning_rate": 3.9424282152314066e-05, + "loss": 0.9644, + "step": 2142 + }, + { + "epoch": 0.10435079005672826, + "grad_norm": 1.6893590688705444, + "learning_rate": 3.942353051437037e-05, + "loss": 0.9684, + "step": 2143 + }, + { + "epoch": 0.10439948384583546, + "grad_norm": 2.3067731857299805, + "learning_rate": 3.942277839326457e-05, + "loss": 0.8527, + "step": 2144 + }, + { + "epoch": 0.10444817763494266, + "grad_norm": 1.6853747367858887, + "learning_rate": 3.942202578901538e-05, + "loss": 0.9293, + "step": 2145 + }, + { + "epoch": 0.10449687142404986, + "grad_norm": 1.5121114253997803, + "learning_rate": 3.942127270164155e-05, + "loss": 0.9604, + "step": 2146 + }, + { + "epoch": 0.10454556521315707, + "grad_norm": 2.778066873550415, + "learning_rate": 3.942051913116177e-05, + "loss": 0.9443, + "step": 2147 + }, + { + "epoch": 0.10459425900226427, + "grad_norm": 1.7704404592514038, + "learning_rate": 3.9419765077594814e-05, + "loss": 0.9353, + "step": 2148 + }, + { + "epoch": 0.10464295279137147, + "grad_norm": 1.7435951232910156, + "learning_rate": 3.9419010540959425e-05, + "loss": 0.9296, + "step": 2149 + }, + { + "epoch": 0.10469164658047866, + "grad_norm": 1.8185473680496216, + "learning_rate": 3.941825552127439e-05, + "loss": 0.9486, + "step": 2150 + }, + { + "epoch": 0.10474034036958586, + "grad_norm": 1.7920050621032715, + "learning_rate": 3.941750001855847e-05, + "loss": 0.8832, + "step": 2151 + }, + { + "epoch": 0.10478903415869306, + "grad_norm": 2.0108044147491455, + "learning_rate": 3.9416744032830474e-05, + "loss": 0.9872, + "step": 2152 + }, + { + "epoch": 0.10483772794780026, + "grad_norm": 1.8954741954803467, + "learning_rate": 3.941598756410919e-05, + "loss": 0.8613, + "step": 2153 + }, + { + "epoch": 0.10488642173690746, + "grad_norm": 1.451817274093628, + "learning_rate": 3.941523061241345e-05, + "loss": 0.9859, + "step": 2154 + }, + { + "epoch": 0.10493511552601466, + "grad_norm": 2.3060483932495117, + "learning_rate": 3.9414473177762074e-05, + "loss": 0.907, + "step": 2155 + }, + { + "epoch": 0.10498380931512186, + "grad_norm": 1.7565621137619019, + "learning_rate": 3.9413715260173914e-05, + "loss": 0.9385, + "step": 2156 + }, + { + "epoch": 0.10503250310422906, + "grad_norm": 3.237231969833374, + "learning_rate": 3.9412956859667816e-05, + "loss": 0.9195, + "step": 2157 + }, + { + "epoch": 0.10508119689333625, + "grad_norm": 2.1131582260131836, + "learning_rate": 3.941219797626264e-05, + "loss": 0.951, + "step": 2158 + }, + { + "epoch": 0.10512989068244345, + "grad_norm": 2.0774929523468018, + "learning_rate": 3.941143860997728e-05, + "loss": 1.002, + "step": 2159 + }, + { + "epoch": 0.10517858447155065, + "grad_norm": 2.092543601989746, + "learning_rate": 3.9410678760830606e-05, + "loss": 0.968, + "step": 2160 + }, + { + "epoch": 0.10522727826065785, + "grad_norm": 1.7542644739151, + "learning_rate": 3.940991842884153e-05, + "loss": 0.9521, + "step": 2161 + }, + { + "epoch": 0.10527597204976505, + "grad_norm": 2.1175551414489746, + "learning_rate": 3.940915761402896e-05, + "loss": 0.904, + "step": 2162 + }, + { + "epoch": 0.10532466583887225, + "grad_norm": 2.076897621154785, + "learning_rate": 3.940839631641183e-05, + "loss": 0.9129, + "step": 2163 + }, + { + "epoch": 0.10537335962797945, + "grad_norm": 1.3526393175125122, + "learning_rate": 3.9407634536009074e-05, + "loss": 0.9232, + "step": 2164 + }, + { + "epoch": 0.10542205341708664, + "grad_norm": 4.879053592681885, + "learning_rate": 3.940687227283963e-05, + "loss": 0.9798, + "step": 2165 + }, + { + "epoch": 0.10547074720619384, + "grad_norm": 2.1182467937469482, + "learning_rate": 3.940610952692248e-05, + "loss": 0.8742, + "step": 2166 + }, + { + "epoch": 0.10551944099530106, + "grad_norm": 1.6557505130767822, + "learning_rate": 3.940534629827658e-05, + "loss": 0.9917, + "step": 2167 + }, + { + "epoch": 0.10556813478440825, + "grad_norm": 1.8339358568191528, + "learning_rate": 3.940458258692092e-05, + "loss": 0.9081, + "step": 2168 + }, + { + "epoch": 0.10561682857351545, + "grad_norm": 2.010690212249756, + "learning_rate": 3.940381839287451e-05, + "loss": 0.9455, + "step": 2169 + }, + { + "epoch": 0.10566552236262265, + "grad_norm": 1.6582609415054321, + "learning_rate": 3.9403053716156335e-05, + "loss": 0.86, + "step": 2170 + }, + { + "epoch": 0.10571421615172985, + "grad_norm": 2.370112657546997, + "learning_rate": 3.9402288556785433e-05, + "loss": 0.8628, + "step": 2171 + }, + { + "epoch": 0.10576290994083705, + "grad_norm": 2.038123369216919, + "learning_rate": 3.940152291478084e-05, + "loss": 0.9701, + "step": 2172 + }, + { + "epoch": 0.10581160372994425, + "grad_norm": 2.2066218852996826, + "learning_rate": 3.940075679016159e-05, + "loss": 1.0498, + "step": 2173 + }, + { + "epoch": 0.10586029751905145, + "grad_norm": 1.8929340839385986, + "learning_rate": 3.9399990182946745e-05, + "loss": 0.8624, + "step": 2174 + }, + { + "epoch": 0.10590899130815865, + "grad_norm": 1.8439632654190063, + "learning_rate": 3.939922309315538e-05, + "loss": 0.9705, + "step": 2175 + }, + { + "epoch": 0.10595768509726584, + "grad_norm": 3.202101945877075, + "learning_rate": 3.9398455520806565e-05, + "loss": 0.9217, + "step": 2176 + }, + { + "epoch": 0.10600637888637304, + "grad_norm": 1.9284578561782837, + "learning_rate": 3.9397687465919405e-05, + "loss": 0.8255, + "step": 2177 + }, + { + "epoch": 0.10605507267548024, + "grad_norm": 2.2748141288757324, + "learning_rate": 3.9396918928512995e-05, + "loss": 0.9323, + "step": 2178 + }, + { + "epoch": 0.10610376646458744, + "grad_norm": 1.9327253103256226, + "learning_rate": 3.939614990860647e-05, + "loss": 0.9125, + "step": 2179 + }, + { + "epoch": 0.10615246025369464, + "grad_norm": 5.042523384094238, + "learning_rate": 3.939538040621894e-05, + "loss": 0.9425, + "step": 2180 + }, + { + "epoch": 0.10620115404280184, + "grad_norm": 1.7748196125030518, + "learning_rate": 3.9394610421369555e-05, + "loss": 0.8687, + "step": 2181 + }, + { + "epoch": 0.10624984783190904, + "grad_norm": 2.1767239570617676, + "learning_rate": 3.939383995407746e-05, + "loss": 0.9159, + "step": 2182 + }, + { + "epoch": 0.10629854162101623, + "grad_norm": 2.774317741394043, + "learning_rate": 3.9393069004361836e-05, + "loss": 0.9348, + "step": 2183 + }, + { + "epoch": 0.10634723541012343, + "grad_norm": 2.5428059101104736, + "learning_rate": 3.9392297572241857e-05, + "loss": 0.8286, + "step": 2184 + }, + { + "epoch": 0.10639592919923063, + "grad_norm": 2.176149368286133, + "learning_rate": 3.93915256577367e-05, + "loss": 0.9368, + "step": 2185 + }, + { + "epoch": 0.10644462298833783, + "grad_norm": 2.2385642528533936, + "learning_rate": 3.939075326086558e-05, + "loss": 0.9077, + "step": 2186 + }, + { + "epoch": 0.10649331677744504, + "grad_norm": 2.02262806892395, + "learning_rate": 3.93899803816477e-05, + "loss": 0.9525, + "step": 2187 + }, + { + "epoch": 0.10654201056655224, + "grad_norm": 6.715932369232178, + "learning_rate": 3.938920702010229e-05, + "loss": 0.9007, + "step": 2188 + }, + { + "epoch": 0.10659070435565944, + "grad_norm": 2.067504405975342, + "learning_rate": 3.9388433176248584e-05, + "loss": 0.9009, + "step": 2189 + }, + { + "epoch": 0.10663939814476664, + "grad_norm": 2.32965087890625, + "learning_rate": 3.938765885010584e-05, + "loss": 0.9085, + "step": 2190 + }, + { + "epoch": 0.10668809193387384, + "grad_norm": 3.279771089553833, + "learning_rate": 3.938688404169332e-05, + "loss": 0.8859, + "step": 2191 + }, + { + "epoch": 0.10673678572298104, + "grad_norm": 1.536041021347046, + "learning_rate": 3.938610875103028e-05, + "loss": 0.9075, + "step": 2192 + }, + { + "epoch": 0.10678547951208824, + "grad_norm": 1.7957696914672852, + "learning_rate": 3.938533297813603e-05, + "loss": 0.8626, + "step": 2193 + }, + { + "epoch": 0.10683417330119543, + "grad_norm": 1.9222562313079834, + "learning_rate": 3.9384556723029847e-05, + "loss": 0.9243, + "step": 2194 + }, + { + "epoch": 0.10688286709030263, + "grad_norm": 2.2770750522613525, + "learning_rate": 3.9383779985731045e-05, + "loss": 0.9822, + "step": 2195 + }, + { + "epoch": 0.10693156087940983, + "grad_norm": 2.6438732147216797, + "learning_rate": 3.9383002766258955e-05, + "loss": 0.9072, + "step": 2196 + }, + { + "epoch": 0.10698025466851703, + "grad_norm": 2.0136125087738037, + "learning_rate": 3.93822250646329e-05, + "loss": 0.9568, + "step": 2197 + }, + { + "epoch": 0.10702894845762423, + "grad_norm": 1.337708830833435, + "learning_rate": 3.938144688087223e-05, + "loss": 0.939, + "step": 2198 + }, + { + "epoch": 0.10707764224673143, + "grad_norm": 2.1352148056030273, + "learning_rate": 3.938066821499631e-05, + "loss": 0.7566, + "step": 2199 + }, + { + "epoch": 0.10712633603583863, + "grad_norm": 1.9525690078735352, + "learning_rate": 3.9379889067024486e-05, + "loss": 0.9359, + "step": 2200 + }, + { + "epoch": 0.10717502982494582, + "grad_norm": 2.6344950199127197, + "learning_rate": 3.937910943697616e-05, + "loss": 0.8131, + "step": 2201 + }, + { + "epoch": 0.10722372361405302, + "grad_norm": 2.0088770389556885, + "learning_rate": 3.937832932487073e-05, + "loss": 0.9097, + "step": 2202 + }, + { + "epoch": 0.10727241740316022, + "grad_norm": 3.4907402992248535, + "learning_rate": 3.9377548730727577e-05, + "loss": 1.0086, + "step": 2203 + }, + { + "epoch": 0.10732111119226742, + "grad_norm": 1.9188032150268555, + "learning_rate": 3.937676765456614e-05, + "loss": 0.9074, + "step": 2204 + }, + { + "epoch": 0.10736980498137462, + "grad_norm": 1.856777310371399, + "learning_rate": 3.937598609640584e-05, + "loss": 0.8913, + "step": 2205 + }, + { + "epoch": 0.10741849877048182, + "grad_norm": 1.671587586402893, + "learning_rate": 3.937520405626612e-05, + "loss": 0.9862, + "step": 2206 + }, + { + "epoch": 0.10746719255958903, + "grad_norm": 1.6678463220596313, + "learning_rate": 3.9374421534166425e-05, + "loss": 0.9555, + "step": 2207 + }, + { + "epoch": 0.10751588634869623, + "grad_norm": 2.2032570838928223, + "learning_rate": 3.937363853012624e-05, + "loss": 0.8931, + "step": 2208 + }, + { + "epoch": 0.10756458013780343, + "grad_norm": 1.8015177249908447, + "learning_rate": 3.937285504416502e-05, + "loss": 0.9471, + "step": 2209 + }, + { + "epoch": 0.10761327392691063, + "grad_norm": 2.446798801422119, + "learning_rate": 3.937207107630227e-05, + "loss": 0.909, + "step": 2210 + }, + { + "epoch": 0.10766196771601783, + "grad_norm": 1.679948091506958, + "learning_rate": 3.937128662655748e-05, + "loss": 0.9636, + "step": 2211 + }, + { + "epoch": 0.10771066150512502, + "grad_norm": 1.9753698110580444, + "learning_rate": 3.9370501694950175e-05, + "loss": 0.9527, + "step": 2212 + }, + { + "epoch": 0.10775935529423222, + "grad_norm": 1.8815526962280273, + "learning_rate": 3.9369716281499874e-05, + "loss": 0.8263, + "step": 2213 + }, + { + "epoch": 0.10780804908333942, + "grad_norm": 1.8394943475723267, + "learning_rate": 3.9368930386226116e-05, + "loss": 0.9827, + "step": 2214 + }, + { + "epoch": 0.10785674287244662, + "grad_norm": 1.4512666463851929, + "learning_rate": 3.9368144009148444e-05, + "loss": 0.9008, + "step": 2215 + }, + { + "epoch": 0.10790543666155382, + "grad_norm": 1.9394229650497437, + "learning_rate": 3.936735715028643e-05, + "loss": 0.8825, + "step": 2216 + }, + { + "epoch": 0.10795413045066102, + "grad_norm": 1.9255614280700684, + "learning_rate": 3.936656980965963e-05, + "loss": 0.9471, + "step": 2217 + }, + { + "epoch": 0.10800282423976822, + "grad_norm": 2.5422463417053223, + "learning_rate": 3.936578198728765e-05, + "loss": 0.8325, + "step": 2218 + }, + { + "epoch": 0.10805151802887542, + "grad_norm": 2.087916135787964, + "learning_rate": 3.936499368319007e-05, + "loss": 0.9592, + "step": 2219 + }, + { + "epoch": 0.10810021181798261, + "grad_norm": 2.0652663707733154, + "learning_rate": 3.936420489738651e-05, + "loss": 0.8537, + "step": 2220 + }, + { + "epoch": 0.10814890560708981, + "grad_norm": 2.0266225337982178, + "learning_rate": 3.936341562989659e-05, + "loss": 0.9576, + "step": 2221 + }, + { + "epoch": 0.10819759939619701, + "grad_norm": 1.5427318811416626, + "learning_rate": 3.936262588073994e-05, + "loss": 0.8553, + "step": 2222 + }, + { + "epoch": 0.10824629318530421, + "grad_norm": 2.021988868713379, + "learning_rate": 3.93618356499362e-05, + "loss": 0.9169, + "step": 2223 + }, + { + "epoch": 0.10829498697441141, + "grad_norm": 2.0822784900665283, + "learning_rate": 3.936104493750504e-05, + "loss": 1.0, + "step": 2224 + }, + { + "epoch": 0.10834368076351861, + "grad_norm": 2.1452019214630127, + "learning_rate": 3.936025374346612e-05, + "loss": 0.9174, + "step": 2225 + }, + { + "epoch": 0.1083923745526258, + "grad_norm": 2.196627616882324, + "learning_rate": 3.9359462067839124e-05, + "loss": 0.7724, + "step": 2226 + }, + { + "epoch": 0.10844106834173302, + "grad_norm": 1.6625765562057495, + "learning_rate": 3.935866991064374e-05, + "loss": 0.9048, + "step": 2227 + }, + { + "epoch": 0.10848976213084022, + "grad_norm": 2.2528128623962402, + "learning_rate": 3.935787727189968e-05, + "loss": 0.8371, + "step": 2228 + }, + { + "epoch": 0.10853845591994742, + "grad_norm": 3.7530269622802734, + "learning_rate": 3.9357084151626656e-05, + "loss": 0.9616, + "step": 2229 + }, + { + "epoch": 0.10858714970905461, + "grad_norm": 2.348738193511963, + "learning_rate": 3.93562905498444e-05, + "loss": 0.8983, + "step": 2230 + }, + { + "epoch": 0.10863584349816181, + "grad_norm": 3.602384567260742, + "learning_rate": 3.935549646657265e-05, + "loss": 0.9258, + "step": 2231 + }, + { + "epoch": 0.10868453728726901, + "grad_norm": 1.736953854560852, + "learning_rate": 3.935470190183116e-05, + "loss": 0.9718, + "step": 2232 + }, + { + "epoch": 0.10873323107637621, + "grad_norm": 1.5664894580841064, + "learning_rate": 3.93539068556397e-05, + "loss": 0.9362, + "step": 2233 + }, + { + "epoch": 0.10878192486548341, + "grad_norm": 1.9106131792068481, + "learning_rate": 3.935311132801804e-05, + "loss": 0.9331, + "step": 2234 + }, + { + "epoch": 0.10883061865459061, + "grad_norm": 4.631856918334961, + "learning_rate": 3.9352315318985974e-05, + "loss": 0.8467, + "step": 2235 + }, + { + "epoch": 0.1088793124436978, + "grad_norm": 2.378754138946533, + "learning_rate": 3.93515188285633e-05, + "loss": 0.9479, + "step": 2236 + }, + { + "epoch": 0.108928006232805, + "grad_norm": 1.6902800798416138, + "learning_rate": 3.935072185676982e-05, + "loss": 0.9212, + "step": 2237 + }, + { + "epoch": 0.1089767000219122, + "grad_norm": 2.2373416423797607, + "learning_rate": 3.934992440362538e-05, + "loss": 0.9409, + "step": 2238 + }, + { + "epoch": 0.1090253938110194, + "grad_norm": 2.189028739929199, + "learning_rate": 3.93491264691498e-05, + "loss": 0.9644, + "step": 2239 + }, + { + "epoch": 0.1090740876001266, + "grad_norm": 2.457606792449951, + "learning_rate": 3.934832805336295e-05, + "loss": 0.889, + "step": 2240 + }, + { + "epoch": 0.1091227813892338, + "grad_norm": 2.4515583515167236, + "learning_rate": 3.934752915628466e-05, + "loss": 0.8959, + "step": 2241 + }, + { + "epoch": 0.109171475178341, + "grad_norm": 2.0849413871765137, + "learning_rate": 3.934672977793482e-05, + "loss": 0.8947, + "step": 2242 + }, + { + "epoch": 0.1092201689674482, + "grad_norm": 2.974731922149658, + "learning_rate": 3.934592991833332e-05, + "loss": 1.008, + "step": 2243 + }, + { + "epoch": 0.1092688627565554, + "grad_norm": 1.6128205060958862, + "learning_rate": 3.934512957750004e-05, + "loss": 0.876, + "step": 2244 + }, + { + "epoch": 0.1093175565456626, + "grad_norm": 2.429762363433838, + "learning_rate": 3.93443287554549e-05, + "loss": 0.8845, + "step": 2245 + }, + { + "epoch": 0.1093662503347698, + "grad_norm": 2.068937301635742, + "learning_rate": 3.934352745221783e-05, + "loss": 1.029, + "step": 2246 + }, + { + "epoch": 0.109414944123877, + "grad_norm": 1.8847299814224243, + "learning_rate": 3.934272566780874e-05, + "loss": 0.7599, + "step": 2247 + }, + { + "epoch": 0.1094636379129842, + "grad_norm": 1.4853311777114868, + "learning_rate": 3.9341923402247585e-05, + "loss": 0.9188, + "step": 2248 + }, + { + "epoch": 0.1095123317020914, + "grad_norm": 2.0188701152801514, + "learning_rate": 3.934112065555432e-05, + "loss": 0.9227, + "step": 2249 + }, + { + "epoch": 0.1095610254911986, + "grad_norm": 1.2716563940048218, + "learning_rate": 3.934031742774892e-05, + "loss": 0.8774, + "step": 2250 + }, + { + "epoch": 0.1096097192803058, + "grad_norm": 1.8236562013626099, + "learning_rate": 3.933951371885136e-05, + "loss": 0.9521, + "step": 2251 + }, + { + "epoch": 0.109658413069413, + "grad_norm": 2.61781907081604, + "learning_rate": 3.9338709528881634e-05, + "loss": 0.9915, + "step": 2252 + }, + { + "epoch": 0.1097071068585202, + "grad_norm": 1.435347557067871, + "learning_rate": 3.933790485785975e-05, + "loss": 0.982, + "step": 2253 + }, + { + "epoch": 0.1097558006476274, + "grad_norm": 1.5740338563919067, + "learning_rate": 3.933709970580571e-05, + "loss": 0.9413, + "step": 2254 + }, + { + "epoch": 0.1098044944367346, + "grad_norm": 1.8763031959533691, + "learning_rate": 3.933629407273955e-05, + "loss": 0.9402, + "step": 2255 + }, + { + "epoch": 0.1098531882258418, + "grad_norm": 1.4509806632995605, + "learning_rate": 3.9335487958681316e-05, + "loss": 0.9021, + "step": 2256 + }, + { + "epoch": 0.10990188201494899, + "grad_norm": 2.119558811187744, + "learning_rate": 3.933468136365106e-05, + "loss": 1.016, + "step": 2257 + }, + { + "epoch": 0.10995057580405619, + "grad_norm": 1.9600720405578613, + "learning_rate": 3.9333874287668844e-05, + "loss": 0.871, + "step": 2258 + }, + { + "epoch": 0.10999926959316339, + "grad_norm": 1.687902808189392, + "learning_rate": 3.9333066730754736e-05, + "loss": 0.9335, + "step": 2259 + }, + { + "epoch": 0.11004796338227059, + "grad_norm": 2.1632978916168213, + "learning_rate": 3.9332258692928836e-05, + "loss": 0.9836, + "step": 2260 + }, + { + "epoch": 0.11009665717137779, + "grad_norm": 9.944726943969727, + "learning_rate": 3.933145017421123e-05, + "loss": 0.9031, + "step": 2261 + }, + { + "epoch": 0.11014535096048499, + "grad_norm": 2.5000288486480713, + "learning_rate": 3.933064117462205e-05, + "loss": 0.8585, + "step": 2262 + }, + { + "epoch": 0.11019404474959218, + "grad_norm": 2.5130276679992676, + "learning_rate": 3.9329831694181395e-05, + "loss": 0.925, + "step": 2263 + }, + { + "epoch": 0.11024273853869938, + "grad_norm": 2.1974008083343506, + "learning_rate": 3.932902173290942e-05, + "loss": 0.896, + "step": 2264 + }, + { + "epoch": 0.11029143232780658, + "grad_norm": 0.08574719727039337, + "learning_rate": 3.9328211290826266e-05, + "loss": 0.6162, + "step": 2265 + }, + { + "epoch": 0.11034012611691378, + "grad_norm": 1.4542006254196167, + "learning_rate": 3.93274003679521e-05, + "loss": 0.9601, + "step": 2266 + }, + { + "epoch": 0.110388819906021, + "grad_norm": 2.2152702808380127, + "learning_rate": 3.9326588964307076e-05, + "loss": 0.864, + "step": 2267 + }, + { + "epoch": 0.11043751369512819, + "grad_norm": 2.0347347259521484, + "learning_rate": 3.93257770799114e-05, + "loss": 0.9269, + "step": 2268 + }, + { + "epoch": 0.11048620748423539, + "grad_norm": 1.6223654747009277, + "learning_rate": 3.9324964714785255e-05, + "loss": 0.9512, + "step": 2269 + }, + { + "epoch": 0.11053490127334259, + "grad_norm": 2.2050304412841797, + "learning_rate": 3.9324151868948845e-05, + "loss": 0.8784, + "step": 2270 + }, + { + "epoch": 0.11058359506244979, + "grad_norm": 1.4958754777908325, + "learning_rate": 3.9323338542422404e-05, + "loss": 0.9216, + "step": 2271 + }, + { + "epoch": 0.11063228885155699, + "grad_norm": 2.4719982147216797, + "learning_rate": 3.932252473522615e-05, + "loss": 0.853, + "step": 2272 + }, + { + "epoch": 0.11068098264066419, + "grad_norm": 1.3214364051818848, + "learning_rate": 3.932171044738033e-05, + "loss": 1.0188, + "step": 2273 + }, + { + "epoch": 0.11072967642977138, + "grad_norm": 1.6879396438598633, + "learning_rate": 3.93208956789052e-05, + "loss": 0.8239, + "step": 2274 + }, + { + "epoch": 0.11077837021887858, + "grad_norm": 2.2751872539520264, + "learning_rate": 3.932008042982103e-05, + "loss": 0.8657, + "step": 2275 + }, + { + "epoch": 0.11082706400798578, + "grad_norm": 1.9617704153060913, + "learning_rate": 3.9319264700148106e-05, + "loss": 0.8558, + "step": 2276 + }, + { + "epoch": 0.11087575779709298, + "grad_norm": 1.9704035520553589, + "learning_rate": 3.93184484899067e-05, + "loss": 1.0506, + "step": 2277 + }, + { + "epoch": 0.11092445158620018, + "grad_norm": 2.179535388946533, + "learning_rate": 3.9317631799117136e-05, + "loss": 0.8724, + "step": 2278 + }, + { + "epoch": 0.11097314537530738, + "grad_norm": 3.2273497581481934, + "learning_rate": 3.9316814627799713e-05, + "loss": 0.9557, + "step": 2279 + }, + { + "epoch": 0.11102183916441458, + "grad_norm": 1.761169672012329, + "learning_rate": 3.9315996975974764e-05, + "loss": 0.8694, + "step": 2280 + }, + { + "epoch": 0.11107053295352178, + "grad_norm": 1.9048676490783691, + "learning_rate": 3.931517884366263e-05, + "loss": 0.9795, + "step": 2281 + }, + { + "epoch": 0.11111922674262897, + "grad_norm": 1.6467690467834473, + "learning_rate": 3.9314360230883666e-05, + "loss": 0.8992, + "step": 2282 + }, + { + "epoch": 0.11116792053173617, + "grad_norm": 1.6192342042922974, + "learning_rate": 3.9313541137658224e-05, + "loss": 0.8816, + "step": 2283 + }, + { + "epoch": 0.11121661432084337, + "grad_norm": 2.250032424926758, + "learning_rate": 3.9312721564006684e-05, + "loss": 1.0073, + "step": 2284 + }, + { + "epoch": 0.11126530810995057, + "grad_norm": 1.6710883378982544, + "learning_rate": 3.931190150994944e-05, + "loss": 0.9601, + "step": 2285 + }, + { + "epoch": 0.11131400189905777, + "grad_norm": 2.585340738296509, + "learning_rate": 3.931108097550689e-05, + "loss": 0.8931, + "step": 2286 + }, + { + "epoch": 0.11136269568816498, + "grad_norm": 2.2347378730773926, + "learning_rate": 3.9310259960699426e-05, + "loss": 0.8614, + "step": 2287 + }, + { + "epoch": 0.11141138947727218, + "grad_norm": 2.1165950298309326, + "learning_rate": 3.930943846554749e-05, + "loss": 0.9026, + "step": 2288 + }, + { + "epoch": 0.11146008326637938, + "grad_norm": 1.8601316213607788, + "learning_rate": 3.9308616490071514e-05, + "loss": 0.9241, + "step": 2289 + }, + { + "epoch": 0.11150877705548658, + "grad_norm": 3.115370035171509, + "learning_rate": 3.9307794034291946e-05, + "loss": 0.9774, + "step": 2290 + }, + { + "epoch": 0.11155747084459378, + "grad_norm": 1.4266993999481201, + "learning_rate": 3.930697109822924e-05, + "loss": 0.9946, + "step": 2291 + }, + { + "epoch": 0.11160616463370097, + "grad_norm": 2.4609529972076416, + "learning_rate": 3.930614768190386e-05, + "loss": 0.9127, + "step": 2292 + }, + { + "epoch": 0.11165485842280817, + "grad_norm": 1.344106674194336, + "learning_rate": 3.93053237853363e-05, + "loss": 0.9371, + "step": 2293 + }, + { + "epoch": 0.11170355221191537, + "grad_norm": 1.9896397590637207, + "learning_rate": 3.930449940854706e-05, + "loss": 0.9668, + "step": 2294 + }, + { + "epoch": 0.11175224600102257, + "grad_norm": 2.317401170730591, + "learning_rate": 3.930367455155662e-05, + "loss": 0.863, + "step": 2295 + }, + { + "epoch": 0.11180093979012977, + "grad_norm": 1.7381983995437622, + "learning_rate": 3.930284921438553e-05, + "loss": 0.8498, + "step": 2296 + }, + { + "epoch": 0.11184963357923697, + "grad_norm": 2.131706714630127, + "learning_rate": 3.93020233970543e-05, + "loss": 0.8559, + "step": 2297 + }, + { + "epoch": 0.11189832736834417, + "grad_norm": 1.9878513813018799, + "learning_rate": 3.930119709958347e-05, + "loss": 0.9197, + "step": 2298 + }, + { + "epoch": 0.11194702115745137, + "grad_norm": 1.6574774980545044, + "learning_rate": 3.930037032199361e-05, + "loss": 1.0299, + "step": 2299 + }, + { + "epoch": 0.11199571494655856, + "grad_norm": 2.282226800918579, + "learning_rate": 3.929954306430528e-05, + "loss": 0.9013, + "step": 2300 + }, + { + "epoch": 0.11204440873566576, + "grad_norm": 1.6432594060897827, + "learning_rate": 3.929871532653906e-05, + "loss": 0.9489, + "step": 2301 + }, + { + "epoch": 0.11209310252477296, + "grad_norm": 0.08061659336090088, + "learning_rate": 3.9297887108715527e-05, + "loss": 0.5877, + "step": 2302 + }, + { + "epoch": 0.11214179631388016, + "grad_norm": 0.0875086858868599, + "learning_rate": 3.9297058410855295e-05, + "loss": 0.6748, + "step": 2303 + }, + { + "epoch": 0.11219049010298736, + "grad_norm": 1.8087373971939087, + "learning_rate": 3.9296229232978985e-05, + "loss": 0.9694, + "step": 2304 + }, + { + "epoch": 0.11223918389209456, + "grad_norm": 1.5310497283935547, + "learning_rate": 3.92953995751072e-05, + "loss": 0.8844, + "step": 2305 + }, + { + "epoch": 0.11228787768120176, + "grad_norm": 2.319188117980957, + "learning_rate": 3.92945694372606e-05, + "loss": 0.9708, + "step": 2306 + }, + { + "epoch": 0.11233657147030897, + "grad_norm": 2.2033417224884033, + "learning_rate": 3.9293738819459826e-05, + "loss": 0.8483, + "step": 2307 + }, + { + "epoch": 0.11238526525941617, + "grad_norm": 1.3406341075897217, + "learning_rate": 3.929290772172554e-05, + "loss": 0.9488, + "step": 2308 + }, + { + "epoch": 0.11243395904852337, + "grad_norm": 2.8092689514160156, + "learning_rate": 3.929207614407841e-05, + "loss": 0.9313, + "step": 2309 + }, + { + "epoch": 0.11248265283763056, + "grad_norm": 2.2632572650909424, + "learning_rate": 3.9291244086539135e-05, + "loss": 0.9921, + "step": 2310 + }, + { + "epoch": 0.11253134662673776, + "grad_norm": 2.670820474624634, + "learning_rate": 3.92904115491284e-05, + "loss": 0.8238, + "step": 2311 + }, + { + "epoch": 0.11258004041584496, + "grad_norm": 1.6674273014068604, + "learning_rate": 3.928957853186692e-05, + "loss": 0.8416, + "step": 2312 + }, + { + "epoch": 0.11262873420495216, + "grad_norm": 3.4398324489593506, + "learning_rate": 3.928874503477541e-05, + "loss": 0.9051, + "step": 2313 + }, + { + "epoch": 0.11267742799405936, + "grad_norm": 1.5467902421951294, + "learning_rate": 3.9287911057874614e-05, + "loss": 0.9046, + "step": 2314 + }, + { + "epoch": 0.11272612178316656, + "grad_norm": 1.6367987394332886, + "learning_rate": 3.9287076601185266e-05, + "loss": 0.9536, + "step": 2315 + }, + { + "epoch": 0.11277481557227376, + "grad_norm": 2.2450003623962402, + "learning_rate": 3.9286241664728135e-05, + "loss": 0.9753, + "step": 2316 + }, + { + "epoch": 0.11282350936138096, + "grad_norm": 2.4732167720794678, + "learning_rate": 3.9285406248523985e-05, + "loss": 0.9691, + "step": 2317 + }, + { + "epoch": 0.11287220315048815, + "grad_norm": 1.8913747072219849, + "learning_rate": 3.928457035259359e-05, + "loss": 1.0225, + "step": 2318 + }, + { + "epoch": 0.11292089693959535, + "grad_norm": 1.7818537950515747, + "learning_rate": 3.928373397695775e-05, + "loss": 0.9337, + "step": 2319 + }, + { + "epoch": 0.11296959072870255, + "grad_norm": 1.545114278793335, + "learning_rate": 3.928289712163728e-05, + "loss": 0.9008, + "step": 2320 + }, + { + "epoch": 0.11301828451780975, + "grad_norm": 1.8761504888534546, + "learning_rate": 3.928205978665297e-05, + "loss": 0.9671, + "step": 2321 + }, + { + "epoch": 0.11306697830691695, + "grad_norm": 1.4963972568511963, + "learning_rate": 3.928122197202568e-05, + "loss": 1.0265, + "step": 2322 + }, + { + "epoch": 0.11311567209602415, + "grad_norm": 2.6522462368011475, + "learning_rate": 3.928038367777622e-05, + "loss": 0.9331, + "step": 2323 + }, + { + "epoch": 0.11316436588513135, + "grad_norm": 2.6758275032043457, + "learning_rate": 3.927954490392547e-05, + "loss": 0.832, + "step": 2324 + }, + { + "epoch": 0.11321305967423854, + "grad_norm": 1.707042932510376, + "learning_rate": 3.9278705650494274e-05, + "loss": 0.8988, + "step": 2325 + }, + { + "epoch": 0.11326175346334574, + "grad_norm": 1.8652055263519287, + "learning_rate": 3.927786591750352e-05, + "loss": 0.8683, + "step": 2326 + }, + { + "epoch": 0.11331044725245296, + "grad_norm": 1.549020528793335, + "learning_rate": 3.92770257049741e-05, + "loss": 1.0016, + "step": 2327 + }, + { + "epoch": 0.11335914104156015, + "grad_norm": 1.6168110370635986, + "learning_rate": 3.92761850129269e-05, + "loss": 1.0152, + "step": 2328 + }, + { + "epoch": 0.11340783483066735, + "grad_norm": 2.492668628692627, + "learning_rate": 3.927534384138285e-05, + "loss": 0.9105, + "step": 2329 + }, + { + "epoch": 0.11345652861977455, + "grad_norm": 1.7550413608551025, + "learning_rate": 3.927450219036286e-05, + "loss": 0.9756, + "step": 2330 + }, + { + "epoch": 0.11350522240888175, + "grad_norm": 2.6313579082489014, + "learning_rate": 3.9273660059887866e-05, + "loss": 0.9604, + "step": 2331 + }, + { + "epoch": 0.11355391619798895, + "grad_norm": 1.9255552291870117, + "learning_rate": 3.927281744997882e-05, + "loss": 0.8941, + "step": 2332 + }, + { + "epoch": 0.11360260998709615, + "grad_norm": 1.9230581521987915, + "learning_rate": 3.927197436065669e-05, + "loss": 0.9207, + "step": 2333 + }, + { + "epoch": 0.11365130377620335, + "grad_norm": 1.5493788719177246, + "learning_rate": 3.927113079194244e-05, + "loss": 0.8936, + "step": 2334 + }, + { + "epoch": 0.11369999756531055, + "grad_norm": 1.7888270616531372, + "learning_rate": 3.927028674385705e-05, + "loss": 0.8877, + "step": 2335 + }, + { + "epoch": 0.11374869135441774, + "grad_norm": 2.319643497467041, + "learning_rate": 3.926944221642152e-05, + "loss": 0.9312, + "step": 2336 + }, + { + "epoch": 0.11379738514352494, + "grad_norm": 1.5508555173873901, + "learning_rate": 3.926859720965686e-05, + "loss": 0.8693, + "step": 2337 + }, + { + "epoch": 0.11384607893263214, + "grad_norm": 2.085088014602661, + "learning_rate": 3.9267751723584085e-05, + "loss": 0.9408, + "step": 2338 + }, + { + "epoch": 0.11389477272173934, + "grad_norm": 1.9241701364517212, + "learning_rate": 3.926690575822423e-05, + "loss": 0.9396, + "step": 2339 + }, + { + "epoch": 0.11394346651084654, + "grad_norm": 2.1833081245422363, + "learning_rate": 3.926605931359834e-05, + "loss": 0.921, + "step": 2340 + }, + { + "epoch": 0.11399216029995374, + "grad_norm": 0.08150351047515869, + "learning_rate": 3.9265212389727466e-05, + "loss": 0.6044, + "step": 2341 + }, + { + "epoch": 0.11404085408906094, + "grad_norm": 1.725400686264038, + "learning_rate": 3.926436498663268e-05, + "loss": 0.9541, + "step": 2342 + }, + { + "epoch": 0.11408954787816813, + "grad_norm": 1.436397910118103, + "learning_rate": 3.926351710433505e-05, + "loss": 1.0422, + "step": 2343 + }, + { + "epoch": 0.11413824166727533, + "grad_norm": 1.8246515989303589, + "learning_rate": 3.9262668742855686e-05, + "loss": 0.9356, + "step": 2344 + }, + { + "epoch": 0.11418693545638253, + "grad_norm": 1.4388693571090698, + "learning_rate": 3.9261819902215676e-05, + "loss": 1.0119, + "step": 2345 + }, + { + "epoch": 0.11423562924548973, + "grad_norm": 1.8131089210510254, + "learning_rate": 3.9260970582436145e-05, + "loss": 0.9008, + "step": 2346 + }, + { + "epoch": 0.11428432303459694, + "grad_norm": 3.2800536155700684, + "learning_rate": 3.926012078353821e-05, + "loss": 0.9114, + "step": 2347 + }, + { + "epoch": 0.11433301682370414, + "grad_norm": 1.2692104578018188, + "learning_rate": 3.925927050554301e-05, + "loss": 0.9095, + "step": 2348 + }, + { + "epoch": 0.11438171061281134, + "grad_norm": 2.4017980098724365, + "learning_rate": 3.925841974847171e-05, + "loss": 0.8931, + "step": 2349 + }, + { + "epoch": 0.11443040440191854, + "grad_norm": 3.8743932247161865, + "learning_rate": 3.925756851234545e-05, + "loss": 0.8227, + "step": 2350 + }, + { + "epoch": 0.11447909819102574, + "grad_norm": 1.4084439277648926, + "learning_rate": 3.925671679718543e-05, + "loss": 0.9519, + "step": 2351 + }, + { + "epoch": 0.11452779198013294, + "grad_norm": 0.08655998855829239, + "learning_rate": 3.925586460301282e-05, + "loss": 0.6838, + "step": 2352 + }, + { + "epoch": 0.11457648576924014, + "grad_norm": 1.8081001043319702, + "learning_rate": 3.925501192984882e-05, + "loss": 0.8518, + "step": 2353 + }, + { + "epoch": 0.11462517955834733, + "grad_norm": 1.7443662881851196, + "learning_rate": 3.9254158777714643e-05, + "loss": 0.8754, + "step": 2354 + }, + { + "epoch": 0.11467387334745453, + "grad_norm": 1.6506941318511963, + "learning_rate": 3.925330514663151e-05, + "loss": 0.9626, + "step": 2355 + }, + { + "epoch": 0.11472256713656173, + "grad_norm": 2.108294725418091, + "learning_rate": 3.925245103662066e-05, + "loss": 0.9593, + "step": 2356 + }, + { + "epoch": 0.11477126092566893, + "grad_norm": 1.4475643634796143, + "learning_rate": 3.9251596447703335e-05, + "loss": 0.9247, + "step": 2357 + }, + { + "epoch": 0.11481995471477613, + "grad_norm": 2.375767707824707, + "learning_rate": 3.925074137990079e-05, + "loss": 0.8937, + "step": 2358 + }, + { + "epoch": 0.11486864850388333, + "grad_norm": 1.6373018026351929, + "learning_rate": 3.92498858332343e-05, + "loss": 0.9573, + "step": 2359 + }, + { + "epoch": 0.11491734229299053, + "grad_norm": 1.4245909452438354, + "learning_rate": 3.9249029807725146e-05, + "loss": 0.9358, + "step": 2360 + }, + { + "epoch": 0.11496603608209773, + "grad_norm": 0.08396143466234207, + "learning_rate": 3.924817330339461e-05, + "loss": 0.6254, + "step": 2361 + }, + { + "epoch": 0.11501472987120492, + "grad_norm": 1.948344111442566, + "learning_rate": 3.924731632026402e-05, + "loss": 0.7807, + "step": 2362 + }, + { + "epoch": 0.11506342366031212, + "grad_norm": 1.8289319276809692, + "learning_rate": 3.924645885835468e-05, + "loss": 0.9829, + "step": 2363 + }, + { + "epoch": 0.11511211744941932, + "grad_norm": 35.602298736572266, + "learning_rate": 3.924560091768792e-05, + "loss": 0.9078, + "step": 2364 + }, + { + "epoch": 0.11516081123852652, + "grad_norm": 2.6751999855041504, + "learning_rate": 3.924474249828508e-05, + "loss": 0.8699, + "step": 2365 + }, + { + "epoch": 0.11520950502763372, + "grad_norm": 1.9314261674880981, + "learning_rate": 3.924388360016751e-05, + "loss": 0.9868, + "step": 2366 + }, + { + "epoch": 0.11525819881674093, + "grad_norm": 1.8284547328948975, + "learning_rate": 3.9243024223356594e-05, + "loss": 0.8852, + "step": 2367 + }, + { + "epoch": 0.11530689260584813, + "grad_norm": 2.226447105407715, + "learning_rate": 3.924216436787369e-05, + "loss": 0.9259, + "step": 2368 + }, + { + "epoch": 0.11535558639495533, + "grad_norm": 2.306403398513794, + "learning_rate": 3.924130403374019e-05, + "loss": 0.9842, + "step": 2369 + }, + { + "epoch": 0.11540428018406253, + "grad_norm": 1.5615653991699219, + "learning_rate": 3.92404432209775e-05, + "loss": 0.8774, + "step": 2370 + }, + { + "epoch": 0.11545297397316973, + "grad_norm": 2.3168892860412598, + "learning_rate": 3.9239581929607036e-05, + "loss": 0.8635, + "step": 2371 + }, + { + "epoch": 0.11550166776227692, + "grad_norm": 8.98386287689209, + "learning_rate": 3.923872015965021e-05, + "loss": 0.8683, + "step": 2372 + }, + { + "epoch": 0.11555036155138412, + "grad_norm": 2.05804181098938, + "learning_rate": 3.9237857911128474e-05, + "loss": 0.8797, + "step": 2373 + }, + { + "epoch": 0.11559905534049132, + "grad_norm": 1.6908037662506104, + "learning_rate": 3.9236995184063265e-05, + "loss": 0.929, + "step": 2374 + }, + { + "epoch": 0.11564774912959852, + "grad_norm": 1.756598711013794, + "learning_rate": 3.923613197847604e-05, + "loss": 0.9217, + "step": 2375 + }, + { + "epoch": 0.11569644291870572, + "grad_norm": 1.823745846748352, + "learning_rate": 3.923526829438828e-05, + "loss": 0.9375, + "step": 2376 + }, + { + "epoch": 0.11574513670781292, + "grad_norm": 1.7479798793792725, + "learning_rate": 3.923440413182147e-05, + "loss": 0.9762, + "step": 2377 + }, + { + "epoch": 0.11579383049692012, + "grad_norm": 2.412898540496826, + "learning_rate": 3.923353949079711e-05, + "loss": 0.9982, + "step": 2378 + }, + { + "epoch": 0.11584252428602732, + "grad_norm": 1.9959741830825806, + "learning_rate": 3.923267437133669e-05, + "loss": 0.9407, + "step": 2379 + }, + { + "epoch": 0.11589121807513451, + "grad_norm": 1.9480974674224854, + "learning_rate": 3.923180877346175e-05, + "loss": 1.0102, + "step": 2380 + }, + { + "epoch": 0.11593991186424171, + "grad_norm": 1.5260547399520874, + "learning_rate": 3.923094269719381e-05, + "loss": 0.8532, + "step": 2381 + }, + { + "epoch": 0.11598860565334891, + "grad_norm": 1.7339831590652466, + "learning_rate": 3.9230076142554415e-05, + "loss": 0.8932, + "step": 2382 + }, + { + "epoch": 0.11603729944245611, + "grad_norm": 2.46323299407959, + "learning_rate": 3.9229209109565126e-05, + "loss": 0.8683, + "step": 2383 + }, + { + "epoch": 0.11608599323156331, + "grad_norm": 2.3097474575042725, + "learning_rate": 3.92283415982475e-05, + "loss": 0.9959, + "step": 2384 + }, + { + "epoch": 0.11613468702067051, + "grad_norm": 1.8167115449905396, + "learning_rate": 3.9227473608623133e-05, + "loss": 0.89, + "step": 2385 + }, + { + "epoch": 0.1161833808097777, + "grad_norm": 1.4516816139221191, + "learning_rate": 3.922660514071361e-05, + "loss": 0.9582, + "step": 2386 + }, + { + "epoch": 0.11623207459888492, + "grad_norm": 0.0897081196308136, + "learning_rate": 3.9225736194540515e-05, + "loss": 0.6042, + "step": 2387 + }, + { + "epoch": 0.11628076838799212, + "grad_norm": 5.0840325355529785, + "learning_rate": 3.922486677012549e-05, + "loss": 0.9528, + "step": 2388 + }, + { + "epoch": 0.11632946217709932, + "grad_norm": 1.989454746246338, + "learning_rate": 3.922399686749015e-05, + "loss": 0.9715, + "step": 2389 + }, + { + "epoch": 0.11637815596620651, + "grad_norm": 1.7912514209747314, + "learning_rate": 3.922312648665614e-05, + "loss": 0.9214, + "step": 2390 + }, + { + "epoch": 0.11642684975531371, + "grad_norm": 1.7742291688919067, + "learning_rate": 3.92222556276451e-05, + "loss": 0.8817, + "step": 2391 + }, + { + "epoch": 0.11647554354442091, + "grad_norm": 2.429487466812134, + "learning_rate": 3.92213842904787e-05, + "loss": 0.789, + "step": 2392 + }, + { + "epoch": 0.11652423733352811, + "grad_norm": 2.9357008934020996, + "learning_rate": 3.9220512475178613e-05, + "loss": 0.8683, + "step": 2393 + }, + { + "epoch": 0.11657293112263531, + "grad_norm": 2.133873224258423, + "learning_rate": 3.9219640181766524e-05, + "loss": 0.9957, + "step": 2394 + }, + { + "epoch": 0.11662162491174251, + "grad_norm": 2.2195653915405273, + "learning_rate": 3.921876741026413e-05, + "loss": 0.847, + "step": 2395 + }, + { + "epoch": 0.1166703187008497, + "grad_norm": 1.8841317892074585, + "learning_rate": 3.9217894160693154e-05, + "loss": 1.0355, + "step": 2396 + }, + { + "epoch": 0.1167190124899569, + "grad_norm": 2.163655996322632, + "learning_rate": 3.9217020433075304e-05, + "loss": 0.9266, + "step": 2397 + }, + { + "epoch": 0.1167677062790641, + "grad_norm": 2.181317090988159, + "learning_rate": 3.9216146227432315e-05, + "loss": 0.8544, + "step": 2398 + }, + { + "epoch": 0.1168164000681713, + "grad_norm": 5.246673107147217, + "learning_rate": 3.921527154378594e-05, + "loss": 0.9684, + "step": 2399 + }, + { + "epoch": 0.1168650938572785, + "grad_norm": 2.251110792160034, + "learning_rate": 3.921439638215793e-05, + "loss": 0.9146, + "step": 2400 + }, + { + "epoch": 0.1169137876463857, + "grad_norm": 2.499446153640747, + "learning_rate": 3.921352074257007e-05, + "loss": 0.906, + "step": 2401 + }, + { + "epoch": 0.1169624814354929, + "grad_norm": 3.2601211071014404, + "learning_rate": 3.9212644625044116e-05, + "loss": 0.9566, + "step": 2402 + }, + { + "epoch": 0.1170111752246001, + "grad_norm": 2.01857852935791, + "learning_rate": 3.921176802960188e-05, + "loss": 0.9646, + "step": 2403 + }, + { + "epoch": 0.1170598690137073, + "grad_norm": 1.502017855644226, + "learning_rate": 3.9210890956265164e-05, + "loss": 0.9843, + "step": 2404 + }, + { + "epoch": 0.1171085628028145, + "grad_norm": 3.451629161834717, + "learning_rate": 3.9210013405055785e-05, + "loss": 0.8873, + "step": 2405 + }, + { + "epoch": 0.1171572565919217, + "grad_norm": 1.6591225862503052, + "learning_rate": 3.9209135375995567e-05, + "loss": 0.8613, + "step": 2406 + }, + { + "epoch": 0.1172059503810289, + "grad_norm": 1.6489335298538208, + "learning_rate": 3.920825686910635e-05, + "loss": 0.9104, + "step": 2407 + }, + { + "epoch": 0.1172546441701361, + "grad_norm": 1.8395642042160034, + "learning_rate": 3.920737788441e-05, + "loss": 0.9305, + "step": 2408 + }, + { + "epoch": 0.1173033379592433, + "grad_norm": 2.6858112812042236, + "learning_rate": 3.9206498421928374e-05, + "loss": 0.9326, + "step": 2409 + }, + { + "epoch": 0.1173520317483505, + "grad_norm": 2.128028392791748, + "learning_rate": 3.920561848168335e-05, + "loss": 0.9783, + "step": 2410 + }, + { + "epoch": 0.1174007255374577, + "grad_norm": 1.6336702108383179, + "learning_rate": 3.920473806369681e-05, + "loss": 0.9154, + "step": 2411 + }, + { + "epoch": 0.1174494193265649, + "grad_norm": 1.5996146202087402, + "learning_rate": 3.9203857167990664e-05, + "loss": 0.9466, + "step": 2412 + }, + { + "epoch": 0.1174981131156721, + "grad_norm": 1.7101997137069702, + "learning_rate": 3.920297579458682e-05, + "loss": 0.8654, + "step": 2413 + }, + { + "epoch": 0.1175468069047793, + "grad_norm": 0.08616022020578384, + "learning_rate": 3.92020939435072e-05, + "loss": 0.7057, + "step": 2414 + }, + { + "epoch": 0.1175955006938865, + "grad_norm": 2.0908737182617188, + "learning_rate": 3.920121161477374e-05, + "loss": 0.9431, + "step": 2415 + }, + { + "epoch": 0.1176441944829937, + "grad_norm": 1.626871943473816, + "learning_rate": 3.9200328808408395e-05, + "loss": 0.9135, + "step": 2416 + }, + { + "epoch": 0.11769288827210089, + "grad_norm": 1.4676251411437988, + "learning_rate": 3.9199445524433117e-05, + "loss": 1.0622, + "step": 2417 + }, + { + "epoch": 0.11774158206120809, + "grad_norm": 1.6961441040039062, + "learning_rate": 3.9198561762869884e-05, + "loss": 0.9263, + "step": 2418 + }, + { + "epoch": 0.11779027585031529, + "grad_norm": 1.8847601413726807, + "learning_rate": 3.9197677523740674e-05, + "loss": 0.914, + "step": 2419 + }, + { + "epoch": 0.11783896963942249, + "grad_norm": 2.5973074436187744, + "learning_rate": 3.919679280706748e-05, + "loss": 0.9204, + "step": 2420 + }, + { + "epoch": 0.11788766342852969, + "grad_norm": 3.222126007080078, + "learning_rate": 3.919590761287232e-05, + "loss": 0.9324, + "step": 2421 + }, + { + "epoch": 0.11793635721763689, + "grad_norm": 1.9503382444381714, + "learning_rate": 3.9195021941177214e-05, + "loss": 0.9015, + "step": 2422 + }, + { + "epoch": 0.11798505100674408, + "grad_norm": 2.368367910385132, + "learning_rate": 3.9194135792004185e-05, + "loss": 0.9682, + "step": 2423 + }, + { + "epoch": 0.11803374479585128, + "grad_norm": 1.7499765157699585, + "learning_rate": 3.919324916537528e-05, + "loss": 0.9102, + "step": 2424 + }, + { + "epoch": 0.11808243858495848, + "grad_norm": 1.8613367080688477, + "learning_rate": 3.919236206131254e-05, + "loss": 0.9257, + "step": 2425 + }, + { + "epoch": 0.11813113237406568, + "grad_norm": 3.2293248176574707, + "learning_rate": 3.919147447983805e-05, + "loss": 0.9279, + "step": 2426 + }, + { + "epoch": 0.1181798261631729, + "grad_norm": 1.7024245262145996, + "learning_rate": 3.919058642097388e-05, + "loss": 0.956, + "step": 2427 + }, + { + "epoch": 0.11822851995228009, + "grad_norm": 1.7690844535827637, + "learning_rate": 3.918969788474213e-05, + "loss": 0.9966, + "step": 2428 + }, + { + "epoch": 0.11827721374138729, + "grad_norm": 2.3424692153930664, + "learning_rate": 3.918880887116489e-05, + "loss": 0.9015, + "step": 2429 + }, + { + "epoch": 0.11832590753049449, + "grad_norm": 2.124089241027832, + "learning_rate": 3.918791938026428e-05, + "loss": 0.9494, + "step": 2430 + }, + { + "epoch": 0.11837460131960169, + "grad_norm": 1.5924476385116577, + "learning_rate": 3.918702941206243e-05, + "loss": 0.9689, + "step": 2431 + }, + { + "epoch": 0.11842329510870889, + "grad_norm": 1.8818162679672241, + "learning_rate": 3.9186138966581475e-05, + "loss": 0.8343, + "step": 2432 + }, + { + "epoch": 0.11847198889781609, + "grad_norm": 1.8454493284225464, + "learning_rate": 3.9185248043843554e-05, + "loss": 0.9376, + "step": 2433 + }, + { + "epoch": 0.11852068268692328, + "grad_norm": 1.8027873039245605, + "learning_rate": 3.918435664387085e-05, + "loss": 0.9143, + "step": 2434 + }, + { + "epoch": 0.11856937647603048, + "grad_norm": 1.6728553771972656, + "learning_rate": 3.918346476668552e-05, + "loss": 0.9397, + "step": 2435 + }, + { + "epoch": 0.11861807026513768, + "grad_norm": 13.267712593078613, + "learning_rate": 3.918257241230976e-05, + "loss": 0.9618, + "step": 2436 + }, + { + "epoch": 0.11866676405424488, + "grad_norm": 0.09029881656169891, + "learning_rate": 3.9181679580765756e-05, + "loss": 0.6443, + "step": 2437 + }, + { + "epoch": 0.11871545784335208, + "grad_norm": 1.5404185056686401, + "learning_rate": 3.918078627207572e-05, + "loss": 0.9141, + "step": 2438 + }, + { + "epoch": 0.11876415163245928, + "grad_norm": 1.7696328163146973, + "learning_rate": 3.917989248626188e-05, + "loss": 0.8394, + "step": 2439 + }, + { + "epoch": 0.11881284542156648, + "grad_norm": 1.7340912818908691, + "learning_rate": 3.9178998223346474e-05, + "loss": 0.9742, + "step": 2440 + }, + { + "epoch": 0.11886153921067368, + "grad_norm": 1.8869835138320923, + "learning_rate": 3.9178103483351726e-05, + "loss": 0.95, + "step": 2441 + }, + { + "epoch": 0.11891023299978087, + "grad_norm": 1.5684356689453125, + "learning_rate": 3.917720826629992e-05, + "loss": 0.8943, + "step": 2442 + }, + { + "epoch": 0.11895892678888807, + "grad_norm": 1.888967752456665, + "learning_rate": 3.9176312572213296e-05, + "loss": 0.9268, + "step": 2443 + }, + { + "epoch": 0.11900762057799527, + "grad_norm": 2.2187726497650146, + "learning_rate": 3.9175416401114154e-05, + "loss": 0.9613, + "step": 2444 + }, + { + "epoch": 0.11905631436710247, + "grad_norm": 2.0506491661071777, + "learning_rate": 3.917451975302478e-05, + "loss": 0.9088, + "step": 2445 + }, + { + "epoch": 0.11910500815620967, + "grad_norm": 2.2964391708374023, + "learning_rate": 3.917362262796747e-05, + "loss": 1.0237, + "step": 2446 + }, + { + "epoch": 0.11915370194531688, + "grad_norm": 1.605613112449646, + "learning_rate": 3.917272502596456e-05, + "loss": 0.9394, + "step": 2447 + }, + { + "epoch": 0.11920239573442408, + "grad_norm": 2.000481128692627, + "learning_rate": 3.917182694703837e-05, + "loss": 0.9051, + "step": 2448 + }, + { + "epoch": 0.11925108952353128, + "grad_norm": 2.138721466064453, + "learning_rate": 3.917092839121123e-05, + "loss": 0.8912, + "step": 2449 + }, + { + "epoch": 0.11929978331263848, + "grad_norm": 1.9239829778671265, + "learning_rate": 3.91700293585055e-05, + "loss": 0.8513, + "step": 2450 + }, + { + "epoch": 0.11934847710174568, + "grad_norm": 4.158959865570068, + "learning_rate": 3.916912984894354e-05, + "loss": 0.9436, + "step": 2451 + }, + { + "epoch": 0.11939717089085287, + "grad_norm": 2.3039164543151855, + "learning_rate": 3.916822986254773e-05, + "loss": 0.947, + "step": 2452 + }, + { + "epoch": 0.11944586467996007, + "grad_norm": 2.3523528575897217, + "learning_rate": 3.916732939934045e-05, + "loss": 0.8222, + "step": 2453 + }, + { + "epoch": 0.11949455846906727, + "grad_norm": 6.87133264541626, + "learning_rate": 3.916642845934411e-05, + "loss": 0.9025, + "step": 2454 + }, + { + "epoch": 0.11954325225817447, + "grad_norm": 1.8092471361160278, + "learning_rate": 3.916552704258111e-05, + "loss": 0.8748, + "step": 2455 + }, + { + "epoch": 0.11959194604728167, + "grad_norm": 1.8288336992263794, + "learning_rate": 3.9164625149073885e-05, + "loss": 0.8909, + "step": 2456 + }, + { + "epoch": 0.11964063983638887, + "grad_norm": 6.177211284637451, + "learning_rate": 3.916372277884485e-05, + "loss": 0.8511, + "step": 2457 + }, + { + "epoch": 0.11968933362549607, + "grad_norm": 2.2741072177886963, + "learning_rate": 3.916281993191647e-05, + "loss": 0.8289, + "step": 2458 + }, + { + "epoch": 0.11973802741460327, + "grad_norm": 2.6773509979248047, + "learning_rate": 3.91619166083112e-05, + "loss": 0.9082, + "step": 2459 + }, + { + "epoch": 0.11978672120371046, + "grad_norm": 3.562204599380493, + "learning_rate": 3.91610128080515e-05, + "loss": 0.9492, + "step": 2460 + }, + { + "epoch": 0.11983541499281766, + "grad_norm": 1.9618535041809082, + "learning_rate": 3.916010853115986e-05, + "loss": 0.9461, + "step": 2461 + }, + { + "epoch": 0.11988410878192486, + "grad_norm": 3.601186513900757, + "learning_rate": 3.9159203777658785e-05, + "loss": 0.9516, + "step": 2462 + }, + { + "epoch": 0.11993280257103206, + "grad_norm": 3.1400701999664307, + "learning_rate": 3.915829854757076e-05, + "loss": 0.9729, + "step": 2463 + }, + { + "epoch": 0.11998149636013926, + "grad_norm": 1.6459918022155762, + "learning_rate": 3.915739284091831e-05, + "loss": 0.8802, + "step": 2464 + }, + { + "epoch": 0.12003019014924646, + "grad_norm": 1.9367629289627075, + "learning_rate": 3.915648665772397e-05, + "loss": 0.8771, + "step": 2465 + }, + { + "epoch": 0.12007888393835367, + "grad_norm": 2.2718002796173096, + "learning_rate": 3.915557999801027e-05, + "loss": 0.9032, + "step": 2466 + }, + { + "epoch": 0.12012757772746087, + "grad_norm": 2.0168282985687256, + "learning_rate": 3.9154672861799784e-05, + "loss": 0.9616, + "step": 2467 + }, + { + "epoch": 0.12017627151656807, + "grad_norm": 2.1013894081115723, + "learning_rate": 3.9153765249115055e-05, + "loss": 0.8662, + "step": 2468 + }, + { + "epoch": 0.12022496530567527, + "grad_norm": 2.31990122795105, + "learning_rate": 3.915285715997867e-05, + "loss": 0.9034, + "step": 2469 + }, + { + "epoch": 0.12027365909478246, + "grad_norm": 1.8219412565231323, + "learning_rate": 3.915194859441322e-05, + "loss": 0.9777, + "step": 2470 + }, + { + "epoch": 0.12032235288388966, + "grad_norm": 1.958619236946106, + "learning_rate": 3.9151039552441304e-05, + "loss": 0.7894, + "step": 2471 + }, + { + "epoch": 0.12037104667299686, + "grad_norm": 2.4238274097442627, + "learning_rate": 3.915013003408553e-05, + "loss": 0.9167, + "step": 2472 + }, + { + "epoch": 0.12041974046210406, + "grad_norm": 0.08455900102853775, + "learning_rate": 3.914922003936852e-05, + "loss": 0.6171, + "step": 2473 + }, + { + "epoch": 0.12046843425121126, + "grad_norm": 2.0243546962738037, + "learning_rate": 3.914830956831292e-05, + "loss": 1.0664, + "step": 2474 + }, + { + "epoch": 0.12051712804031846, + "grad_norm": 18.718048095703125, + "learning_rate": 3.914739862094138e-05, + "loss": 0.7255, + "step": 2475 + }, + { + "epoch": 0.12056582182942566, + "grad_norm": 3.2686314582824707, + "learning_rate": 3.914648719727654e-05, + "loss": 0.9046, + "step": 2476 + }, + { + "epoch": 0.12061451561853286, + "grad_norm": 2.751373767852783, + "learning_rate": 3.9145575297341096e-05, + "loss": 0.9325, + "step": 2477 + }, + { + "epoch": 0.12066320940764005, + "grad_norm": 4.147889137268066, + "learning_rate": 3.9144662921157716e-05, + "loss": 0.8459, + "step": 2478 + }, + { + "epoch": 0.12071190319674725, + "grad_norm": 2.0370864868164062, + "learning_rate": 3.91437500687491e-05, + "loss": 0.9039, + "step": 2479 + }, + { + "epoch": 0.12076059698585445, + "grad_norm": 1.9848806858062744, + "learning_rate": 3.9142836740137964e-05, + "loss": 0.8907, + "step": 2480 + }, + { + "epoch": 0.12080929077496165, + "grad_norm": 2.1451914310455322, + "learning_rate": 3.9141922935347014e-05, + "loss": 1.0049, + "step": 2481 + }, + { + "epoch": 0.12085798456406885, + "grad_norm": 2.0296154022216797, + "learning_rate": 3.9141008654398985e-05, + "loss": 0.987, + "step": 2482 + }, + { + "epoch": 0.12090667835317605, + "grad_norm": 2.366224527359009, + "learning_rate": 3.914009389731662e-05, + "loss": 1.0402, + "step": 2483 + }, + { + "epoch": 0.12095537214228325, + "grad_norm": 8.613788604736328, + "learning_rate": 3.913917866412267e-05, + "loss": 0.9066, + "step": 2484 + }, + { + "epoch": 0.12100406593139044, + "grad_norm": 2.7736823558807373, + "learning_rate": 3.913826295483991e-05, + "loss": 0.9455, + "step": 2485 + }, + { + "epoch": 0.12105275972049766, + "grad_norm": 10.516756057739258, + "learning_rate": 3.913734676949112e-05, + "loss": 0.8886, + "step": 2486 + }, + { + "epoch": 0.12110145350960486, + "grad_norm": 4.960543155670166, + "learning_rate": 3.913643010809907e-05, + "loss": 1.0215, + "step": 2487 + }, + { + "epoch": 0.12115014729871205, + "grad_norm": 3.2486250400543213, + "learning_rate": 3.913551297068659e-05, + "loss": 0.8236, + "step": 2488 + }, + { + "epoch": 0.12119884108781925, + "grad_norm": 0.09220041334629059, + "learning_rate": 3.9134595357276474e-05, + "loss": 0.6358, + "step": 2489 + }, + { + "epoch": 0.12124753487692645, + "grad_norm": 2.474050998687744, + "learning_rate": 3.9133677267891556e-05, + "loss": 0.8149, + "step": 2490 + }, + { + "epoch": 0.12129622866603365, + "grad_norm": 2.76619291305542, + "learning_rate": 3.913275870255467e-05, + "loss": 1.0499, + "step": 2491 + }, + { + "epoch": 0.12134492245514085, + "grad_norm": 4.640392303466797, + "learning_rate": 3.913183966128867e-05, + "loss": 0.9174, + "step": 2492 + }, + { + "epoch": 0.12139361624424805, + "grad_norm": 2.775735855102539, + "learning_rate": 3.913092014411641e-05, + "loss": 0.8996, + "step": 2493 + }, + { + "epoch": 0.12144231003335525, + "grad_norm": 2.7114038467407227, + "learning_rate": 3.9130000151060764e-05, + "loss": 0.8828, + "step": 2494 + }, + { + "epoch": 0.12149100382246245, + "grad_norm": 1.5987454652786255, + "learning_rate": 3.912907968214462e-05, + "loss": 0.977, + "step": 2495 + }, + { + "epoch": 0.12153969761156964, + "grad_norm": 1.8025768995285034, + "learning_rate": 3.912815873739089e-05, + "loss": 0.9995, + "step": 2496 + }, + { + "epoch": 0.12158839140067684, + "grad_norm": 2.1831552982330322, + "learning_rate": 3.912723731682245e-05, + "loss": 0.9573, + "step": 2497 + }, + { + "epoch": 0.12163708518978404, + "grad_norm": 2.899121046066284, + "learning_rate": 3.912631542046224e-05, + "loss": 0.805, + "step": 2498 + }, + { + "epoch": 0.12168577897889124, + "grad_norm": 2.5413331985473633, + "learning_rate": 3.9125393048333195e-05, + "loss": 0.873, + "step": 2499 + }, + { + "epoch": 0.12173447276799844, + "grad_norm": 2.3909060955047607, + "learning_rate": 3.9124470200458254e-05, + "loss": 0.904, + "step": 2500 + }, + { + "epoch": 0.12178316655710564, + "grad_norm": 2.5053539276123047, + "learning_rate": 3.912354687686038e-05, + "loss": 0.9575, + "step": 2501 + }, + { + "epoch": 0.12183186034621284, + "grad_norm": 3.2642688751220703, + "learning_rate": 3.912262307756252e-05, + "loss": 0.936, + "step": 2502 + }, + { + "epoch": 0.12188055413532004, + "grad_norm": 2.339442491531372, + "learning_rate": 3.912169880258767e-05, + "loss": 1.059, + "step": 2503 + }, + { + "epoch": 0.12192924792442723, + "grad_norm": 2.2671332359313965, + "learning_rate": 3.9120774051958826e-05, + "loss": 0.877, + "step": 2504 + }, + { + "epoch": 0.12197794171353443, + "grad_norm": 2.6284546852111816, + "learning_rate": 3.911984882569898e-05, + "loss": 0.9214, + "step": 2505 + }, + { + "epoch": 0.12202663550264164, + "grad_norm": 1.9471485614776611, + "learning_rate": 3.9118923123831154e-05, + "loss": 1.0192, + "step": 2506 + }, + { + "epoch": 0.12207532929174884, + "grad_norm": 1.540876865386963, + "learning_rate": 3.911799694637837e-05, + "loss": 0.9273, + "step": 2507 + }, + { + "epoch": 0.12212402308085604, + "grad_norm": 2.5857021808624268, + "learning_rate": 3.9117070293363664e-05, + "loss": 0.9545, + "step": 2508 + }, + { + "epoch": 0.12217271686996324, + "grad_norm": 1.977889895439148, + "learning_rate": 3.91161431648101e-05, + "loss": 0.955, + "step": 2509 + }, + { + "epoch": 0.12222141065907044, + "grad_norm": 2.339787244796753, + "learning_rate": 3.9115215560740726e-05, + "loss": 0.8846, + "step": 2510 + }, + { + "epoch": 0.12227010444817764, + "grad_norm": 2.9772543907165527, + "learning_rate": 3.911428748117862e-05, + "loss": 0.8999, + "step": 2511 + }, + { + "epoch": 0.12231879823728484, + "grad_norm": 2.7304036617279053, + "learning_rate": 3.911335892614688e-05, + "loss": 0.8259, + "step": 2512 + }, + { + "epoch": 0.12236749202639204, + "grad_norm": 2.0659286975860596, + "learning_rate": 3.9112429895668584e-05, + "loss": 0.9338, + "step": 2513 + }, + { + "epoch": 0.12241618581549923, + "grad_norm": 2.2737205028533936, + "learning_rate": 3.911150038976685e-05, + "loss": 0.9454, + "step": 2514 + }, + { + "epoch": 0.12246487960460643, + "grad_norm": 2.2763988971710205, + "learning_rate": 3.911057040846481e-05, + "loss": 0.9267, + "step": 2515 + }, + { + "epoch": 0.12251357339371363, + "grad_norm": 0.08752261847257614, + "learning_rate": 3.910963995178559e-05, + "loss": 0.6275, + "step": 2516 + }, + { + "epoch": 0.12256226718282083, + "grad_norm": 2.485682964324951, + "learning_rate": 3.9108709019752326e-05, + "loss": 0.9253, + "step": 2517 + }, + { + "epoch": 0.12261096097192803, + "grad_norm": 2.0783801078796387, + "learning_rate": 3.910777761238819e-05, + "loss": 0.9961, + "step": 2518 + }, + { + "epoch": 0.12265965476103523, + "grad_norm": 1.9877676963806152, + "learning_rate": 3.9106845729716336e-05, + "loss": 0.9549, + "step": 2519 + }, + { + "epoch": 0.12270834855014243, + "grad_norm": 2.3219263553619385, + "learning_rate": 3.910591337175996e-05, + "loss": 0.9856, + "step": 2520 + }, + { + "epoch": 0.12275704233924963, + "grad_norm": 2.2467875480651855, + "learning_rate": 3.910498053854224e-05, + "loss": 0.8345, + "step": 2521 + }, + { + "epoch": 0.12280573612835682, + "grad_norm": 2.220287561416626, + "learning_rate": 3.9104047230086396e-05, + "loss": 0.9757, + "step": 2522 + }, + { + "epoch": 0.12285442991746402, + "grad_norm": 2.087543249130249, + "learning_rate": 3.910311344641563e-05, + "loss": 0.9135, + "step": 2523 + }, + { + "epoch": 0.12290312370657122, + "grad_norm": 1.6435860395431519, + "learning_rate": 3.910217918755317e-05, + "loss": 0.9043, + "step": 2524 + }, + { + "epoch": 0.12295181749567842, + "grad_norm": 1.62649405002594, + "learning_rate": 3.910124445352227e-05, + "loss": 0.9401, + "step": 2525 + }, + { + "epoch": 0.12300051128478563, + "grad_norm": 1.698807716369629, + "learning_rate": 3.910030924434616e-05, + "loss": 0.9201, + "step": 2526 + }, + { + "epoch": 0.12304920507389283, + "grad_norm": 1.7356668710708618, + "learning_rate": 3.909937356004813e-05, + "loss": 0.9549, + "step": 2527 + }, + { + "epoch": 0.12309789886300003, + "grad_norm": 2.6492443084716797, + "learning_rate": 3.9098437400651435e-05, + "loss": 0.9293, + "step": 2528 + }, + { + "epoch": 0.12314659265210723, + "grad_norm": 2.494450807571411, + "learning_rate": 3.909750076617937e-05, + "loss": 0.9552, + "step": 2529 + }, + { + "epoch": 0.12319528644121443, + "grad_norm": 1.7490144968032837, + "learning_rate": 3.909656365665523e-05, + "loss": 0.9043, + "step": 2530 + }, + { + "epoch": 0.12324398023032163, + "grad_norm": 1.9826583862304688, + "learning_rate": 3.909562607210233e-05, + "loss": 0.9578, + "step": 2531 + }, + { + "epoch": 0.12329267401942882, + "grad_norm": 2.2720422744750977, + "learning_rate": 3.909468801254399e-05, + "loss": 0.905, + "step": 2532 + }, + { + "epoch": 0.12334136780853602, + "grad_norm": 1.7031941413879395, + "learning_rate": 3.9093749478003544e-05, + "loss": 0.8911, + "step": 2533 + }, + { + "epoch": 0.12339006159764322, + "grad_norm": 0.08795371651649475, + "learning_rate": 3.909281046850434e-05, + "loss": 0.6718, + "step": 2534 + }, + { + "epoch": 0.12343875538675042, + "grad_norm": 1.6818506717681885, + "learning_rate": 3.9091870984069734e-05, + "loss": 0.9131, + "step": 2535 + }, + { + "epoch": 0.12348744917585762, + "grad_norm": 1.7893836498260498, + "learning_rate": 3.9090931024723096e-05, + "loss": 0.8746, + "step": 2536 + }, + { + "epoch": 0.12353614296496482, + "grad_norm": 0.0885804146528244, + "learning_rate": 3.908999059048781e-05, + "loss": 0.6614, + "step": 2537 + }, + { + "epoch": 0.12358483675407202, + "grad_norm": 1.8569494485855103, + "learning_rate": 3.9089049681387264e-05, + "loss": 0.9704, + "step": 2538 + }, + { + "epoch": 0.12363353054317922, + "grad_norm": 1.7480436563491821, + "learning_rate": 3.9088108297444874e-05, + "loss": 0.9381, + "step": 2539 + }, + { + "epoch": 0.12368222433228641, + "grad_norm": 1.8558554649353027, + "learning_rate": 3.908716643868404e-05, + "loss": 0.9939, + "step": 2540 + }, + { + "epoch": 0.12373091812139361, + "grad_norm": 2.086632490158081, + "learning_rate": 3.9086224105128204e-05, + "loss": 0.88, + "step": 2541 + }, + { + "epoch": 0.12377961191050081, + "grad_norm": 1.8219391107559204, + "learning_rate": 3.908528129680081e-05, + "loss": 0.9479, + "step": 2542 + }, + { + "epoch": 0.12382830569960801, + "grad_norm": 2.5697214603424072, + "learning_rate": 3.908433801372529e-05, + "loss": 0.8891, + "step": 2543 + }, + { + "epoch": 0.12387699948871521, + "grad_norm": 2.394219398498535, + "learning_rate": 3.908339425592513e-05, + "loss": 0.9533, + "step": 2544 + }, + { + "epoch": 0.12392569327782241, + "grad_norm": 1.7899158000946045, + "learning_rate": 3.90824500234238e-05, + "loss": 0.874, + "step": 2545 + }, + { + "epoch": 0.12397438706692962, + "grad_norm": 2.0014023780822754, + "learning_rate": 3.908150531624479e-05, + "loss": 0.9738, + "step": 2546 + }, + { + "epoch": 0.12402308085603682, + "grad_norm": 2.2333176136016846, + "learning_rate": 3.908056013441159e-05, + "loss": 0.8739, + "step": 2547 + }, + { + "epoch": 0.12407177464514402, + "grad_norm": 1.7134497165679932, + "learning_rate": 3.9079614477947715e-05, + "loss": 0.9489, + "step": 2548 + }, + { + "epoch": 0.12412046843425122, + "grad_norm": 1.8547935485839844, + "learning_rate": 3.907866834687669e-05, + "loss": 0.9411, + "step": 2549 + }, + { + "epoch": 0.12416916222335841, + "grad_norm": 2.236755609512329, + "learning_rate": 3.907772174122205e-05, + "loss": 0.903, + "step": 2550 + }, + { + "epoch": 0.12421785601246561, + "grad_norm": 2.17854380607605, + "learning_rate": 3.907677466100734e-05, + "loss": 0.8946, + "step": 2551 + }, + { + "epoch": 0.12426654980157281, + "grad_norm": 2.8363966941833496, + "learning_rate": 3.907582710625613e-05, + "loss": 0.9046, + "step": 2552 + }, + { + "epoch": 0.12431524359068001, + "grad_norm": 2.1208252906799316, + "learning_rate": 3.907487907699198e-05, + "loss": 0.944, + "step": 2553 + }, + { + "epoch": 0.12436393737978721, + "grad_norm": 1.7830966711044312, + "learning_rate": 3.9073930573238465e-05, + "loss": 0.88, + "step": 2554 + }, + { + "epoch": 0.12441263116889441, + "grad_norm": 2.2409098148345947, + "learning_rate": 3.90729815950192e-05, + "loss": 0.902, + "step": 2555 + }, + { + "epoch": 0.1244613249580016, + "grad_norm": 0.07913366705179214, + "learning_rate": 3.907203214235777e-05, + "loss": 0.5475, + "step": 2556 + }, + { + "epoch": 0.1245100187471088, + "grad_norm": 3.2285189628601074, + "learning_rate": 3.90710822152778e-05, + "loss": 0.8428, + "step": 2557 + }, + { + "epoch": 0.124558712536216, + "grad_norm": 1.9924969673156738, + "learning_rate": 3.9070131813802925e-05, + "loss": 0.8493, + "step": 2558 + }, + { + "epoch": 0.1246074063253232, + "grad_norm": 1.9364335536956787, + "learning_rate": 3.9069180937956785e-05, + "loss": 0.8533, + "step": 2559 + }, + { + "epoch": 0.1246561001144304, + "grad_norm": 1.685900330543518, + "learning_rate": 3.9068229587763026e-05, + "loss": 0.8146, + "step": 2560 + }, + { + "epoch": 0.1247047939035376, + "grad_norm": 1.8369694948196411, + "learning_rate": 3.906727776324531e-05, + "loss": 0.8977, + "step": 2561 + }, + { + "epoch": 0.1247534876926448, + "grad_norm": 1.4544559717178345, + "learning_rate": 3.9066325464427336e-05, + "loss": 0.9585, + "step": 2562 + }, + { + "epoch": 0.124802181481752, + "grad_norm": 3.495358467102051, + "learning_rate": 3.906537269133277e-05, + "loss": 0.9977, + "step": 2563 + }, + { + "epoch": 0.1248508752708592, + "grad_norm": 2.0776569843292236, + "learning_rate": 3.906441944398532e-05, + "loss": 0.9016, + "step": 2564 + }, + { + "epoch": 0.1248995690599664, + "grad_norm": 1.8028879165649414, + "learning_rate": 3.90634657224087e-05, + "loss": 1.0623, + "step": 2565 + }, + { + "epoch": 0.12494826284907361, + "grad_norm": 2.1173081398010254, + "learning_rate": 3.906251152662663e-05, + "loss": 0.896, + "step": 2566 + }, + { + "epoch": 0.1249969566381808, + "grad_norm": 3.253389835357666, + "learning_rate": 3.9061556856662857e-05, + "loss": 0.9121, + "step": 2567 + }, + { + "epoch": 0.125045650427288, + "grad_norm": 1.9527336359024048, + "learning_rate": 3.906060171254111e-05, + "loss": 0.9431, + "step": 2568 + }, + { + "epoch": 0.1250943442163952, + "grad_norm": 1.8088417053222656, + "learning_rate": 3.905964609428516e-05, + "loss": 0.9301, + "step": 2569 + }, + { + "epoch": 0.1251430380055024, + "grad_norm": 2.109006643295288, + "learning_rate": 3.905869000191877e-05, + "loss": 0.85, + "step": 2570 + }, + { + "epoch": 0.1251917317946096, + "grad_norm": 2.2623374462127686, + "learning_rate": 3.905773343546574e-05, + "loss": 0.8911, + "step": 2571 + }, + { + "epoch": 0.12524042558371679, + "grad_norm": 2.2593297958374023, + "learning_rate": 3.905677639494984e-05, + "loss": 0.952, + "step": 2572 + }, + { + "epoch": 0.125289119372824, + "grad_norm": 1.9969738721847534, + "learning_rate": 3.90558188803949e-05, + "loss": 0.9116, + "step": 2573 + }, + { + "epoch": 0.12533781316193118, + "grad_norm": 2.360032558441162, + "learning_rate": 3.905486089182472e-05, + "loss": 1.0023, + "step": 2574 + }, + { + "epoch": 0.1253865069510384, + "grad_norm": 1.9510000944137573, + "learning_rate": 3.9053902429263146e-05, + "loss": 0.954, + "step": 2575 + }, + { + "epoch": 0.1254352007401456, + "grad_norm": 0.08736281841993332, + "learning_rate": 3.9052943492734005e-05, + "loss": 0.6644, + "step": 2576 + }, + { + "epoch": 0.1254838945292528, + "grad_norm": 2.373641014099121, + "learning_rate": 3.905198408226116e-05, + "loss": 0.9259, + "step": 2577 + }, + { + "epoch": 0.12553258831836, + "grad_norm": 2.089446544647217, + "learning_rate": 3.905102419786848e-05, + "loss": 0.9211, + "step": 2578 + }, + { + "epoch": 0.1255812821074672, + "grad_norm": 1.7911237478256226, + "learning_rate": 3.905006383957982e-05, + "loss": 0.8685, + "step": 2579 + }, + { + "epoch": 0.1256299758965744, + "grad_norm": 2.132662057876587, + "learning_rate": 3.90491030074191e-05, + "loss": 0.9243, + "step": 2580 + }, + { + "epoch": 0.1256786696856816, + "grad_norm": 2.034055709838867, + "learning_rate": 3.90481417014102e-05, + "loss": 0.9203, + "step": 2581 + }, + { + "epoch": 0.1257273634747888, + "grad_norm": 1.6302345991134644, + "learning_rate": 3.904717992157704e-05, + "loss": 0.9394, + "step": 2582 + }, + { + "epoch": 0.12577605726389599, + "grad_norm": 2.7833008766174316, + "learning_rate": 3.9046217667943543e-05, + "loss": 0.9129, + "step": 2583 + }, + { + "epoch": 0.1258247510530032, + "grad_norm": 2.564828872680664, + "learning_rate": 3.9045254940533646e-05, + "loss": 0.8403, + "step": 2584 + }, + { + "epoch": 0.12587344484211038, + "grad_norm": 2.264920473098755, + "learning_rate": 3.9044291739371296e-05, + "loss": 0.9166, + "step": 2585 + }, + { + "epoch": 0.1259221386312176, + "grad_norm": 2.5881214141845703, + "learning_rate": 3.904332806448045e-05, + "loss": 0.8843, + "step": 2586 + }, + { + "epoch": 0.12597083242032478, + "grad_norm": 2.115584135055542, + "learning_rate": 3.904236391588508e-05, + "loss": 0.9405, + "step": 2587 + }, + { + "epoch": 0.126019526209432, + "grad_norm": 2.123992681503296, + "learning_rate": 3.904139929360917e-05, + "loss": 0.9838, + "step": 2588 + }, + { + "epoch": 0.12606821999853918, + "grad_norm": 1.4842668771743774, + "learning_rate": 3.904043419767672e-05, + "loss": 1.0626, + "step": 2589 + }, + { + "epoch": 0.1261169137876464, + "grad_norm": 1.9256969690322876, + "learning_rate": 3.9039468628111736e-05, + "loss": 0.8824, + "step": 2590 + }, + { + "epoch": 0.12616560757675357, + "grad_norm": 1.9475114345550537, + "learning_rate": 3.9038502584938234e-05, + "loss": 0.8776, + "step": 2591 + }, + { + "epoch": 0.1262143013658608, + "grad_norm": 2.6063311100006104, + "learning_rate": 3.903753606818025e-05, + "loss": 0.9213, + "step": 2592 + }, + { + "epoch": 0.12626299515496797, + "grad_norm": 1.6702747344970703, + "learning_rate": 3.90365690778618e-05, + "loss": 0.9272, + "step": 2593 + }, + { + "epoch": 0.12631168894407518, + "grad_norm": 3.3467893600463867, + "learning_rate": 3.903560161400698e-05, + "loss": 0.964, + "step": 2594 + }, + { + "epoch": 0.12636038273318237, + "grad_norm": 1.7890475988388062, + "learning_rate": 3.903463367663982e-05, + "loss": 0.9665, + "step": 2595 + }, + { + "epoch": 0.12640907652228958, + "grad_norm": 2.3218071460723877, + "learning_rate": 3.903366526578442e-05, + "loss": 0.8519, + "step": 2596 + }, + { + "epoch": 0.1264577703113968, + "grad_norm": 6.0928730964660645, + "learning_rate": 3.903269638146486e-05, + "loss": 0.9484, + "step": 2597 + }, + { + "epoch": 0.12650646410050398, + "grad_norm": 2.0750625133514404, + "learning_rate": 3.903172702370524e-05, + "loss": 1.0163, + "step": 2598 + }, + { + "epoch": 0.1265551578896112, + "grad_norm": 2.5876715183258057, + "learning_rate": 3.903075719252967e-05, + "loss": 0.9046, + "step": 2599 + }, + { + "epoch": 0.12660385167871838, + "grad_norm": 2.0151326656341553, + "learning_rate": 3.902978688796229e-05, + "loss": 0.9873, + "step": 2600 + }, + { + "epoch": 0.1266525454678256, + "grad_norm": 2.405048131942749, + "learning_rate": 3.902881611002722e-05, + "loss": 0.9215, + "step": 2601 + }, + { + "epoch": 0.12670123925693277, + "grad_norm": 1.5544079542160034, + "learning_rate": 3.902784485874862e-05, + "loss": 0.8935, + "step": 2602 + }, + { + "epoch": 0.12674993304604, + "grad_norm": 5.014412879943848, + "learning_rate": 3.9026873134150635e-05, + "loss": 0.9089, + "step": 2603 + }, + { + "epoch": 0.12679862683514717, + "grad_norm": 1.5183336734771729, + "learning_rate": 3.9025900936257455e-05, + "loss": 0.882, + "step": 2604 + }, + { + "epoch": 0.12684732062425438, + "grad_norm": 1.3723821640014648, + "learning_rate": 3.902492826509325e-05, + "loss": 0.8085, + "step": 2605 + }, + { + "epoch": 0.12689601441336157, + "grad_norm": 2.1840033531188965, + "learning_rate": 3.902395512068222e-05, + "loss": 0.9274, + "step": 2606 + }, + { + "epoch": 0.12694470820246878, + "grad_norm": 1.9723597764968872, + "learning_rate": 3.902298150304857e-05, + "loss": 0.9128, + "step": 2607 + }, + { + "epoch": 0.12699340199157597, + "grad_norm": 2.232809066772461, + "learning_rate": 3.902200741221652e-05, + "loss": 0.9427, + "step": 2608 + }, + { + "epoch": 0.12704209578068318, + "grad_norm": 2.812958240509033, + "learning_rate": 3.902103284821031e-05, + "loss": 0.8417, + "step": 2609 + }, + { + "epoch": 0.12709078956979036, + "grad_norm": 1.8100533485412598, + "learning_rate": 3.9020057811054164e-05, + "loss": 0.9315, + "step": 2610 + }, + { + "epoch": 0.12713948335889758, + "grad_norm": 2.387932300567627, + "learning_rate": 3.9019082300772344e-05, + "loss": 0.936, + "step": 2611 + }, + { + "epoch": 0.12718817714800476, + "grad_norm": 2.3163511753082275, + "learning_rate": 3.901810631738913e-05, + "loss": 0.8707, + "step": 2612 + }, + { + "epoch": 0.12723687093711197, + "grad_norm": 4.757569789886475, + "learning_rate": 3.9017129860928776e-05, + "loss": 0.9586, + "step": 2613 + }, + { + "epoch": 0.12728556472621916, + "grad_norm": 1.8127809762954712, + "learning_rate": 3.901615293141559e-05, + "loss": 0.937, + "step": 2614 + }, + { + "epoch": 0.12733425851532637, + "grad_norm": 1.9527400732040405, + "learning_rate": 3.9015175528873864e-05, + "loss": 0.9217, + "step": 2615 + }, + { + "epoch": 0.12738295230443358, + "grad_norm": 2.35623836517334, + "learning_rate": 3.901419765332791e-05, + "loss": 0.9592, + "step": 2616 + }, + { + "epoch": 0.12743164609354077, + "grad_norm": 0.08749840408563614, + "learning_rate": 3.901321930480206e-05, + "loss": 0.6145, + "step": 2617 + }, + { + "epoch": 0.12748033988264798, + "grad_norm": 2.3617851734161377, + "learning_rate": 3.901224048332064e-05, + "loss": 0.9379, + "step": 2618 + }, + { + "epoch": 0.12752903367175517, + "grad_norm": 3.6566877365112305, + "learning_rate": 3.901126118890801e-05, + "loss": 0.9223, + "step": 2619 + }, + { + "epoch": 0.12757772746086238, + "grad_norm": 0.08631247282028198, + "learning_rate": 3.901028142158852e-05, + "loss": 0.6356, + "step": 2620 + }, + { + "epoch": 0.12762642124996956, + "grad_norm": 3.1313250064849854, + "learning_rate": 3.900930118138655e-05, + "loss": 0.8081, + "step": 2621 + }, + { + "epoch": 0.12767511503907678, + "grad_norm": 2.6071746349334717, + "learning_rate": 3.900832046832649e-05, + "loss": 0.9041, + "step": 2622 + }, + { + "epoch": 0.12772380882818396, + "grad_norm": 1.599671721458435, + "learning_rate": 3.900733928243271e-05, + "loss": 0.8112, + "step": 2623 + }, + { + "epoch": 0.12777250261729117, + "grad_norm": 2.3259165287017822, + "learning_rate": 3.900635762372963e-05, + "loss": 0.9467, + "step": 2624 + }, + { + "epoch": 0.12782119640639836, + "grad_norm": 8.404571533203125, + "learning_rate": 3.900537549224169e-05, + "loss": 0.9338, + "step": 2625 + }, + { + "epoch": 0.12786989019550557, + "grad_norm": 1.9286364316940308, + "learning_rate": 3.9004392887993286e-05, + "loss": 1.015, + "step": 2626 + }, + { + "epoch": 0.12791858398461275, + "grad_norm": 3.1840767860412598, + "learning_rate": 3.900340981100888e-05, + "loss": 0.839, + "step": 2627 + }, + { + "epoch": 0.12796727777371997, + "grad_norm": 1.790573000907898, + "learning_rate": 3.900242626131292e-05, + "loss": 0.8477, + "step": 2628 + }, + { + "epoch": 0.12801597156282715, + "grad_norm": 2.0517477989196777, + "learning_rate": 3.900144223892988e-05, + "loss": 0.9562, + "step": 2629 + }, + { + "epoch": 0.12806466535193436, + "grad_norm": 2.030795097351074, + "learning_rate": 3.9000457743884225e-05, + "loss": 0.946, + "step": 2630 + }, + { + "epoch": 0.12811335914104155, + "grad_norm": 2.3648390769958496, + "learning_rate": 3.8999472776200456e-05, + "loss": 0.8496, + "step": 2631 + }, + { + "epoch": 0.12816205293014876, + "grad_norm": 1.9340505599975586, + "learning_rate": 3.8998487335903066e-05, + "loss": 0.9025, + "step": 2632 + }, + { + "epoch": 0.12821074671925595, + "grad_norm": 1.4545619487762451, + "learning_rate": 3.899750142301657e-05, + "loss": 0.9235, + "step": 2633 + }, + { + "epoch": 0.12825944050836316, + "grad_norm": 2.475773334503174, + "learning_rate": 3.8996515037565504e-05, + "loss": 0.8057, + "step": 2634 + }, + { + "epoch": 0.12830813429747034, + "grad_norm": 2.0139777660369873, + "learning_rate": 3.8995528179574386e-05, + "loss": 0.9503, + "step": 2635 + }, + { + "epoch": 0.12835682808657756, + "grad_norm": 5.992467403411865, + "learning_rate": 3.8994540849067775e-05, + "loss": 0.8663, + "step": 2636 + }, + { + "epoch": 0.12840552187568477, + "grad_norm": 1.662103295326233, + "learning_rate": 3.899355304607023e-05, + "loss": 0.9319, + "step": 2637 + }, + { + "epoch": 0.12845421566479195, + "grad_norm": 2.816788673400879, + "learning_rate": 3.8992564770606315e-05, + "loss": 1.0254, + "step": 2638 + }, + { + "epoch": 0.12850290945389917, + "grad_norm": 1.9576823711395264, + "learning_rate": 3.899157602270062e-05, + "loss": 0.9063, + "step": 2639 + }, + { + "epoch": 0.12855160324300635, + "grad_norm": 1.7554972171783447, + "learning_rate": 3.899058680237775e-05, + "loss": 0.9575, + "step": 2640 + }, + { + "epoch": 0.12860029703211356, + "grad_norm": 1.7088232040405273, + "learning_rate": 3.898959710966229e-05, + "loss": 0.8886, + "step": 2641 + }, + { + "epoch": 0.12864899082122075, + "grad_norm": 1.622133493423462, + "learning_rate": 3.898860694457888e-05, + "loss": 0.903, + "step": 2642 + }, + { + "epoch": 0.12869768461032796, + "grad_norm": 1.5604056119918823, + "learning_rate": 3.8987616307152134e-05, + "loss": 0.9743, + "step": 2643 + }, + { + "epoch": 0.12874637839943515, + "grad_norm": 2.0218467712402344, + "learning_rate": 3.8986625197406706e-05, + "loss": 0.8732, + "step": 2644 + }, + { + "epoch": 0.12879507218854236, + "grad_norm": 2.3358402252197266, + "learning_rate": 3.898563361536724e-05, + "loss": 0.8612, + "step": 2645 + }, + { + "epoch": 0.12884376597764954, + "grad_norm": 1.6369057893753052, + "learning_rate": 3.8984641561058414e-05, + "loss": 0.8792, + "step": 2646 + }, + { + "epoch": 0.12889245976675676, + "grad_norm": 2.171429395675659, + "learning_rate": 3.898364903450489e-05, + "loss": 1.0135, + "step": 2647 + }, + { + "epoch": 0.12894115355586394, + "grad_norm": 1.8914964199066162, + "learning_rate": 3.898265603573137e-05, + "loss": 0.8856, + "step": 2648 + }, + { + "epoch": 0.12898984734497115, + "grad_norm": 1.430293083190918, + "learning_rate": 3.898166256476255e-05, + "loss": 0.9319, + "step": 2649 + }, + { + "epoch": 0.12903854113407834, + "grad_norm": 2.097424030303955, + "learning_rate": 3.8980668621623145e-05, + "loss": 0.9128, + "step": 2650 + }, + { + "epoch": 0.12908723492318555, + "grad_norm": 2.321275234222412, + "learning_rate": 3.8979674206337874e-05, + "loss": 0.8675, + "step": 2651 + }, + { + "epoch": 0.12913592871229274, + "grad_norm": 1.7319103479385376, + "learning_rate": 3.8978679318931475e-05, + "loss": 1.0014, + "step": 2652 + }, + { + "epoch": 0.12918462250139995, + "grad_norm": 1.5372016429901123, + "learning_rate": 3.89776839594287e-05, + "loss": 0.8288, + "step": 2653 + }, + { + "epoch": 0.12923331629050713, + "grad_norm": 2.1972415447235107, + "learning_rate": 3.897668812785431e-05, + "loss": 0.8911, + "step": 2654 + }, + { + "epoch": 0.12928201007961435, + "grad_norm": 3.4451167583465576, + "learning_rate": 3.8975691824233066e-05, + "loss": 0.858, + "step": 2655 + }, + { + "epoch": 0.12933070386872156, + "grad_norm": 2.0502169132232666, + "learning_rate": 3.8974695048589765e-05, + "loss": 0.983, + "step": 2656 + }, + { + "epoch": 0.12937939765782874, + "grad_norm": 2.0915112495422363, + "learning_rate": 3.897369780094919e-05, + "loss": 0.9555, + "step": 2657 + }, + { + "epoch": 0.12942809144693596, + "grad_norm": 3.0504844188690186, + "learning_rate": 3.897270008133615e-05, + "loss": 0.9238, + "step": 2658 + }, + { + "epoch": 0.12947678523604314, + "grad_norm": 1.6228984594345093, + "learning_rate": 3.897170188977547e-05, + "loss": 0.9205, + "step": 2659 + }, + { + "epoch": 0.12952547902515035, + "grad_norm": 1.71782386302948, + "learning_rate": 3.897070322629198e-05, + "loss": 0.9496, + "step": 2660 + }, + { + "epoch": 0.12957417281425754, + "grad_norm": 2.942811965942383, + "learning_rate": 3.8969704090910506e-05, + "loss": 0.937, + "step": 2661 + }, + { + "epoch": 0.12962286660336475, + "grad_norm": 0.0890936553478241, + "learning_rate": 3.896870448365592e-05, + "loss": 0.6278, + "step": 2662 + }, + { + "epoch": 0.12967156039247194, + "grad_norm": 1.69044029712677, + "learning_rate": 3.896770440455308e-05, + "loss": 0.9329, + "step": 2663 + }, + { + "epoch": 0.12972025418157915, + "grad_norm": 1.7471873760223389, + "learning_rate": 3.8966703853626864e-05, + "loss": 0.9177, + "step": 2664 + }, + { + "epoch": 0.12976894797068633, + "grad_norm": 1.7454286813735962, + "learning_rate": 3.896570283090216e-05, + "loss": 0.9143, + "step": 2665 + }, + { + "epoch": 0.12981764175979355, + "grad_norm": 1.9780184030532837, + "learning_rate": 3.896470133640386e-05, + "loss": 0.9772, + "step": 2666 + }, + { + "epoch": 0.12986633554890073, + "grad_norm": 1.8507428169250488, + "learning_rate": 3.89636993701569e-05, + "loss": 0.8639, + "step": 2667 + }, + { + "epoch": 0.12991502933800794, + "grad_norm": 1.9928265810012817, + "learning_rate": 3.896269693218618e-05, + "loss": 0.8991, + "step": 2668 + }, + { + "epoch": 0.12996372312711513, + "grad_norm": 2.2989277839660645, + "learning_rate": 3.896169402251665e-05, + "loss": 0.9782, + "step": 2669 + }, + { + "epoch": 0.13001241691622234, + "grad_norm": 1.7701008319854736, + "learning_rate": 3.8960690641173245e-05, + "loss": 0.8364, + "step": 2670 + }, + { + "epoch": 0.13006111070532952, + "grad_norm": 1.777485966682434, + "learning_rate": 3.8959686788180936e-05, + "loss": 1.0104, + "step": 2671 + }, + { + "epoch": 0.13010980449443674, + "grad_norm": 1.624770998954773, + "learning_rate": 3.895868246356469e-05, + "loss": 0.9767, + "step": 2672 + }, + { + "epoch": 0.13015849828354392, + "grad_norm": 2.2442946434020996, + "learning_rate": 3.895767766734949e-05, + "loss": 0.9144, + "step": 2673 + }, + { + "epoch": 0.13020719207265113, + "grad_norm": 1.9536354541778564, + "learning_rate": 3.8956672399560325e-05, + "loss": 0.9009, + "step": 2674 + }, + { + "epoch": 0.13025588586175832, + "grad_norm": 2.215972661972046, + "learning_rate": 3.8955666660222207e-05, + "loss": 0.9542, + "step": 2675 + }, + { + "epoch": 0.13030457965086553, + "grad_norm": 2.054690361022949, + "learning_rate": 3.895466044936016e-05, + "loss": 0.988, + "step": 2676 + }, + { + "epoch": 0.13035327343997274, + "grad_norm": 1.7731753587722778, + "learning_rate": 3.8953653766999194e-05, + "loss": 1.0071, + "step": 2677 + }, + { + "epoch": 0.13040196722907993, + "grad_norm": 1.926314353942871, + "learning_rate": 3.8952646613164375e-05, + "loss": 1.0467, + "step": 2678 + }, + { + "epoch": 0.13045066101818714, + "grad_norm": 1.8112635612487793, + "learning_rate": 3.8951638987880736e-05, + "loss": 0.8756, + "step": 2679 + }, + { + "epoch": 0.13049935480729433, + "grad_norm": 1.6371151208877563, + "learning_rate": 3.895063089117335e-05, + "loss": 0.9278, + "step": 2680 + }, + { + "epoch": 0.13054804859640154, + "grad_norm": 0.08357325196266174, + "learning_rate": 3.8949622323067295e-05, + "loss": 0.5704, + "step": 2681 + }, + { + "epoch": 0.13059674238550872, + "grad_norm": 2.249171018600464, + "learning_rate": 3.894861328358765e-05, + "loss": 0.8913, + "step": 2682 + }, + { + "epoch": 0.13064543617461594, + "grad_norm": 2.575000762939453, + "learning_rate": 3.894760377275953e-05, + "loss": 0.9301, + "step": 2683 + }, + { + "epoch": 0.13069412996372312, + "grad_norm": 1.5483405590057373, + "learning_rate": 3.8946593790608035e-05, + "loss": 0.8715, + "step": 2684 + }, + { + "epoch": 0.13074282375283033, + "grad_norm": 1.5563099384307861, + "learning_rate": 3.8945583337158296e-05, + "loss": 0.8976, + "step": 2685 + }, + { + "epoch": 0.13079151754193752, + "grad_norm": 2.3484838008880615, + "learning_rate": 3.8944572412435436e-05, + "loss": 0.8548, + "step": 2686 + }, + { + "epoch": 0.13084021133104473, + "grad_norm": 2.010267734527588, + "learning_rate": 3.894356101646462e-05, + "loss": 0.8422, + "step": 2687 + }, + { + "epoch": 0.13088890512015192, + "grad_norm": 1.7445054054260254, + "learning_rate": 3.894254914927099e-05, + "loss": 0.9859, + "step": 2688 + }, + { + "epoch": 0.13093759890925913, + "grad_norm": 1.967688798904419, + "learning_rate": 3.894153681087972e-05, + "loss": 0.9265, + "step": 2689 + }, + { + "epoch": 0.1309862926983663, + "grad_norm": 1.8235220909118652, + "learning_rate": 3.8940524001316e-05, + "loss": 0.9648, + "step": 2690 + }, + { + "epoch": 0.13103498648747353, + "grad_norm": 2.297973871231079, + "learning_rate": 3.8939510720605016e-05, + "loss": 0.8998, + "step": 2691 + }, + { + "epoch": 0.1310836802765807, + "grad_norm": 2.1101033687591553, + "learning_rate": 3.893849696877198e-05, + "loss": 0.9363, + "step": 2692 + }, + { + "epoch": 0.13113237406568792, + "grad_norm": 2.058849573135376, + "learning_rate": 3.8937482745842104e-05, + "loss": 0.9924, + "step": 2693 + }, + { + "epoch": 0.1311810678547951, + "grad_norm": 1.5543729066848755, + "learning_rate": 3.893646805184061e-05, + "loss": 0.9014, + "step": 2694 + }, + { + "epoch": 0.13122976164390232, + "grad_norm": 1.593369483947754, + "learning_rate": 3.893545288679276e-05, + "loss": 0.9103, + "step": 2695 + }, + { + "epoch": 0.13127845543300953, + "grad_norm": 1.6636239290237427, + "learning_rate": 3.893443725072378e-05, + "loss": 0.9096, + "step": 2696 + }, + { + "epoch": 0.13132714922211672, + "grad_norm": 1.637783408164978, + "learning_rate": 3.8933421143658956e-05, + "loss": 0.8701, + "step": 2697 + }, + { + "epoch": 0.13137584301122393, + "grad_norm": 2.7014856338500977, + "learning_rate": 3.893240456562355e-05, + "loss": 0.8135, + "step": 2698 + }, + { + "epoch": 0.13142453680033112, + "grad_norm": 0.09068896621465683, + "learning_rate": 3.8931387516642856e-05, + "loss": 0.5684, + "step": 2699 + }, + { + "epoch": 0.13147323058943833, + "grad_norm": 1.8514317274093628, + "learning_rate": 3.8930369996742176e-05, + "loss": 0.8618, + "step": 2700 + }, + { + "epoch": 0.1315219243785455, + "grad_norm": 1.8663301467895508, + "learning_rate": 3.892935200594681e-05, + "loss": 0.9135, + "step": 2701 + }, + { + "epoch": 0.13157061816765273, + "grad_norm": 2.000364303588867, + "learning_rate": 3.892833354428209e-05, + "loss": 0.9698, + "step": 2702 + }, + { + "epoch": 0.1316193119567599, + "grad_norm": 1.7898515462875366, + "learning_rate": 3.892731461177334e-05, + "loss": 0.982, + "step": 2703 + }, + { + "epoch": 0.13166800574586712, + "grad_norm": 1.652204990386963, + "learning_rate": 3.892629520844592e-05, + "loss": 0.8886, + "step": 2704 + }, + { + "epoch": 0.1317166995349743, + "grad_norm": 2.245002508163452, + "learning_rate": 3.892527533432518e-05, + "loss": 0.9837, + "step": 2705 + }, + { + "epoch": 0.13176539332408152, + "grad_norm": 1.7576863765716553, + "learning_rate": 3.8924254989436496e-05, + "loss": 0.8671, + "step": 2706 + }, + { + "epoch": 0.1318140871131887, + "grad_norm": 1.9641836881637573, + "learning_rate": 3.892323417380524e-05, + "loss": 0.8557, + "step": 2707 + }, + { + "epoch": 0.13186278090229592, + "grad_norm": 1.9623771905899048, + "learning_rate": 3.89222128874568e-05, + "loss": 0.9144, + "step": 2708 + }, + { + "epoch": 0.1319114746914031, + "grad_norm": 2.929952383041382, + "learning_rate": 3.89211911304166e-05, + "loss": 1.0322, + "step": 2709 + }, + { + "epoch": 0.13196016848051031, + "grad_norm": 1.9412157535552979, + "learning_rate": 3.892016890271005e-05, + "loss": 0.7969, + "step": 2710 + }, + { + "epoch": 0.1320088622696175, + "grad_norm": 1.920602798461914, + "learning_rate": 3.891914620436256e-05, + "loss": 0.9097, + "step": 2711 + }, + { + "epoch": 0.1320575560587247, + "grad_norm": 2.652672529220581, + "learning_rate": 3.8918123035399586e-05, + "loss": 0.8528, + "step": 2712 + }, + { + "epoch": 0.1321062498478319, + "grad_norm": 2.711508274078369, + "learning_rate": 3.891709939584658e-05, + "loss": 0.8147, + "step": 2713 + }, + { + "epoch": 0.1321549436369391, + "grad_norm": 2.0813543796539307, + "learning_rate": 3.8916075285729e-05, + "loss": 0.9084, + "step": 2714 + }, + { + "epoch": 0.1322036374260463, + "grad_norm": 1.9280505180358887, + "learning_rate": 3.8915050705072325e-05, + "loss": 0.8746, + "step": 2715 + }, + { + "epoch": 0.1322523312151535, + "grad_norm": 1.819758653640747, + "learning_rate": 3.891402565390204e-05, + "loss": 0.808, + "step": 2716 + }, + { + "epoch": 0.13230102500426072, + "grad_norm": 1.4735158681869507, + "learning_rate": 3.8913000132243634e-05, + "loss": 0.968, + "step": 2717 + }, + { + "epoch": 0.1323497187933679, + "grad_norm": 1.8682612180709839, + "learning_rate": 3.891197414012263e-05, + "loss": 0.9478, + "step": 2718 + }, + { + "epoch": 0.13239841258247512, + "grad_norm": 2.379423141479492, + "learning_rate": 3.891094767756455e-05, + "loss": 0.957, + "step": 2719 + }, + { + "epoch": 0.1324471063715823, + "grad_norm": 1.7852516174316406, + "learning_rate": 3.890992074459492e-05, + "loss": 0.9642, + "step": 2720 + }, + { + "epoch": 0.13249580016068951, + "grad_norm": 1.825500249862671, + "learning_rate": 3.890889334123928e-05, + "loss": 0.9319, + "step": 2721 + }, + { + "epoch": 0.1325444939497967, + "grad_norm": 1.9738357067108154, + "learning_rate": 3.890786546752321e-05, + "loss": 0.9115, + "step": 2722 + }, + { + "epoch": 0.1325931877389039, + "grad_norm": 0.08385580033063889, + "learning_rate": 3.890683712347225e-05, + "loss": 0.6496, + "step": 2723 + }, + { + "epoch": 0.1326418815280111, + "grad_norm": 0.08448290824890137, + "learning_rate": 3.8905808309111994e-05, + "loss": 0.6092, + "step": 2724 + }, + { + "epoch": 0.1326905753171183, + "grad_norm": 1.877523422241211, + "learning_rate": 3.8904779024468034e-05, + "loss": 0.9607, + "step": 2725 + }, + { + "epoch": 0.1327392691062255, + "grad_norm": 1.731677532196045, + "learning_rate": 3.8903749269565964e-05, + "loss": 1.0012, + "step": 2726 + }, + { + "epoch": 0.1327879628953327, + "grad_norm": 2.9837987422943115, + "learning_rate": 3.890271904443142e-05, + "loss": 0.8398, + "step": 2727 + }, + { + "epoch": 0.1328366566844399, + "grad_norm": 2.7138519287109375, + "learning_rate": 3.890168834909001e-05, + "loss": 0.8395, + "step": 2728 + }, + { + "epoch": 0.1328853504735471, + "grad_norm": 2.6081488132476807, + "learning_rate": 3.890065718356738e-05, + "loss": 0.8665, + "step": 2729 + }, + { + "epoch": 0.1329340442626543, + "grad_norm": 2.2039055824279785, + "learning_rate": 3.889962554788918e-05, + "loss": 0.8981, + "step": 2730 + }, + { + "epoch": 0.1329827380517615, + "grad_norm": 1.8532227277755737, + "learning_rate": 3.889859344208107e-05, + "loss": 0.8665, + "step": 2731 + }, + { + "epoch": 0.13303143184086869, + "grad_norm": 1.9300063848495483, + "learning_rate": 3.889756086616873e-05, + "loss": 0.8649, + "step": 2732 + }, + { + "epoch": 0.1330801256299759, + "grad_norm": 3.1647274494171143, + "learning_rate": 3.889652782017784e-05, + "loss": 0.9366, + "step": 2733 + }, + { + "epoch": 0.13312881941908308, + "grad_norm": 2.0372183322906494, + "learning_rate": 3.889549430413409e-05, + "loss": 0.9509, + "step": 2734 + }, + { + "epoch": 0.1331775132081903, + "grad_norm": 2.2061641216278076, + "learning_rate": 3.88944603180632e-05, + "loss": 0.8484, + "step": 2735 + }, + { + "epoch": 0.1332262069972975, + "grad_norm": 2.8239378929138184, + "learning_rate": 3.889342586199089e-05, + "loss": 1.0442, + "step": 2736 + }, + { + "epoch": 0.1332749007864047, + "grad_norm": 2.429605484008789, + "learning_rate": 3.889239093594288e-05, + "loss": 0.8826, + "step": 2737 + }, + { + "epoch": 0.1333235945755119, + "grad_norm": 1.879680871963501, + "learning_rate": 3.8891355539944935e-05, + "loss": 0.9852, + "step": 2738 + }, + { + "epoch": 0.1333722883646191, + "grad_norm": 2.921060562133789, + "learning_rate": 3.889031967402279e-05, + "loss": 0.8697, + "step": 2739 + }, + { + "epoch": 0.1334209821537263, + "grad_norm": 1.6163582801818848, + "learning_rate": 3.8889283338202225e-05, + "loss": 0.9213, + "step": 2740 + }, + { + "epoch": 0.1334696759428335, + "grad_norm": 1.6364326477050781, + "learning_rate": 3.888824653250901e-05, + "loss": 0.8394, + "step": 2741 + }, + { + "epoch": 0.1335183697319407, + "grad_norm": 2.606445789337158, + "learning_rate": 3.8887209256968946e-05, + "loss": 0.8981, + "step": 2742 + }, + { + "epoch": 0.13356706352104789, + "grad_norm": 4.843095302581787, + "learning_rate": 3.888617151160783e-05, + "loss": 0.9578, + "step": 2743 + }, + { + "epoch": 0.1336157573101551, + "grad_norm": 2.1625750064849854, + "learning_rate": 3.888513329645147e-05, + "loss": 0.9613, + "step": 2744 + }, + { + "epoch": 0.13366445109926228, + "grad_norm": 1.8837342262268066, + "learning_rate": 3.88840946115257e-05, + "loss": 0.9109, + "step": 2745 + }, + { + "epoch": 0.1337131448883695, + "grad_norm": 2.6179916858673096, + "learning_rate": 3.888305545685636e-05, + "loss": 0.9357, + "step": 2746 + }, + { + "epoch": 0.13376183867747668, + "grad_norm": 1.3758771419525146, + "learning_rate": 3.888201583246928e-05, + "loss": 0.9296, + "step": 2747 + }, + { + "epoch": 0.1338105324665839, + "grad_norm": 1.7650518417358398, + "learning_rate": 3.888097573839035e-05, + "loss": 0.9635, + "step": 2748 + }, + { + "epoch": 0.13385922625569108, + "grad_norm": 2.095273017883301, + "learning_rate": 3.8879935174645416e-05, + "loss": 0.8569, + "step": 2749 + }, + { + "epoch": 0.1339079200447983, + "grad_norm": 1.4424939155578613, + "learning_rate": 3.887889414126038e-05, + "loss": 0.9455, + "step": 2750 + }, + { + "epoch": 0.13395661383390547, + "grad_norm": 1.7915700674057007, + "learning_rate": 3.887785263826112e-05, + "loss": 0.8545, + "step": 2751 + }, + { + "epoch": 0.1340053076230127, + "grad_norm": 1.939785122871399, + "learning_rate": 3.887681066567357e-05, + "loss": 0.8993, + "step": 2752 + }, + { + "epoch": 0.13405400141211987, + "grad_norm": 1.7691454887390137, + "learning_rate": 3.887576822352363e-05, + "loss": 0.8062, + "step": 2753 + }, + { + "epoch": 0.13410269520122708, + "grad_norm": 2.2358648777008057, + "learning_rate": 3.887472531183722e-05, + "loss": 0.9167, + "step": 2754 + }, + { + "epoch": 0.13415138899033427, + "grad_norm": 2.622390031814575, + "learning_rate": 3.887368193064031e-05, + "loss": 0.944, + "step": 2755 + }, + { + "epoch": 0.13420008277944148, + "grad_norm": 1.9048831462860107, + "learning_rate": 3.8872638079958835e-05, + "loss": 0.9011, + "step": 2756 + }, + { + "epoch": 0.1342487765685487, + "grad_norm": 2.2316553592681885, + "learning_rate": 3.887159375981877e-05, + "loss": 0.8769, + "step": 2757 + }, + { + "epoch": 0.13429747035765588, + "grad_norm": 2.771988868713379, + "learning_rate": 3.88705489702461e-05, + "loss": 0.9564, + "step": 2758 + }, + { + "epoch": 0.1343461641467631, + "grad_norm": 1.6352014541625977, + "learning_rate": 3.886950371126679e-05, + "loss": 0.9621, + "step": 2759 + }, + { + "epoch": 0.13439485793587028, + "grad_norm": 2.4405758380889893, + "learning_rate": 3.886845798290686e-05, + "loss": 1.0272, + "step": 2760 + }, + { + "epoch": 0.1344435517249775, + "grad_norm": 1.9226188659667969, + "learning_rate": 3.8867411785192315e-05, + "loss": 0.9125, + "step": 2761 + }, + { + "epoch": 0.13449224551408467, + "grad_norm": 2.0080699920654297, + "learning_rate": 3.886636511814918e-05, + "loss": 1.0223, + "step": 2762 + }, + { + "epoch": 0.1345409393031919, + "grad_norm": 1.6512670516967773, + "learning_rate": 3.88653179818035e-05, + "loss": 0.9179, + "step": 2763 + }, + { + "epoch": 0.13458963309229907, + "grad_norm": 1.897402048110962, + "learning_rate": 3.886427037618131e-05, + "loss": 1.0102, + "step": 2764 + }, + { + "epoch": 0.13463832688140628, + "grad_norm": 1.9229998588562012, + "learning_rate": 3.886322230130867e-05, + "loss": 0.89, + "step": 2765 + }, + { + "epoch": 0.13468702067051347, + "grad_norm": 2.0189568996429443, + "learning_rate": 3.8862173757211664e-05, + "loss": 0.9394, + "step": 2766 + }, + { + "epoch": 0.13473571445962068, + "grad_norm": 1.8268442153930664, + "learning_rate": 3.886112474391636e-05, + "loss": 0.8659, + "step": 2767 + }, + { + "epoch": 0.13478440824872787, + "grad_norm": 1.9506474733352661, + "learning_rate": 3.8860075261448865e-05, + "loss": 0.8698, + "step": 2768 + }, + { + "epoch": 0.13483310203783508, + "grad_norm": 0.08436199277639389, + "learning_rate": 3.885902530983528e-05, + "loss": 0.62, + "step": 2769 + }, + { + "epoch": 0.13488179582694226, + "grad_norm": 1.7575911283493042, + "learning_rate": 3.885797488910171e-05, + "loss": 1.0018, + "step": 2770 + }, + { + "epoch": 0.13493048961604948, + "grad_norm": 2.0185599327087402, + "learning_rate": 3.88569239992743e-05, + "loss": 0.8979, + "step": 2771 + }, + { + "epoch": 0.13497918340515666, + "grad_norm": 1.9772696495056152, + "learning_rate": 3.885587264037918e-05, + "loss": 0.9035, + "step": 2772 + }, + { + "epoch": 0.13502787719426387, + "grad_norm": 1.7640019655227661, + "learning_rate": 3.885482081244252e-05, + "loss": 0.917, + "step": 2773 + }, + { + "epoch": 0.13507657098337106, + "grad_norm": 1.6882312297821045, + "learning_rate": 3.8853768515490465e-05, + "loss": 0.9911, + "step": 2774 + }, + { + "epoch": 0.13512526477247827, + "grad_norm": 5.148508548736572, + "learning_rate": 3.88527157495492e-05, + "loss": 0.9537, + "step": 2775 + }, + { + "epoch": 0.13517395856158548, + "grad_norm": 1.6550078392028809, + "learning_rate": 3.885166251464492e-05, + "loss": 0.9535, + "step": 2776 + }, + { + "epoch": 0.13522265235069267, + "grad_norm": 2.0204460620880127, + "learning_rate": 3.885060881080381e-05, + "loss": 0.8438, + "step": 2777 + }, + { + "epoch": 0.13527134613979988, + "grad_norm": 2.2736971378326416, + "learning_rate": 3.8849554638052086e-05, + "loss": 0.8807, + "step": 2778 + }, + { + "epoch": 0.13532003992890707, + "grad_norm": 1.400755524635315, + "learning_rate": 3.884849999641597e-05, + "loss": 0.819, + "step": 2779 + }, + { + "epoch": 0.13536873371801428, + "grad_norm": 2.000282049179077, + "learning_rate": 3.8847444885921704e-05, + "loss": 0.883, + "step": 2780 + }, + { + "epoch": 0.13541742750712146, + "grad_norm": 1.7805172204971313, + "learning_rate": 3.8846389306595526e-05, + "loss": 0.8081, + "step": 2781 + }, + { + "epoch": 0.13546612129622868, + "grad_norm": 2.0061798095703125, + "learning_rate": 3.88453332584637e-05, + "loss": 0.9259, + "step": 2782 + }, + { + "epoch": 0.13551481508533586, + "grad_norm": 1.8010090589523315, + "learning_rate": 3.884427674155248e-05, + "loss": 0.8892, + "step": 2783 + }, + { + "epoch": 0.13556350887444307, + "grad_norm": 1.6660436391830444, + "learning_rate": 3.884321975588816e-05, + "loss": 1.0104, + "step": 2784 + }, + { + "epoch": 0.13561220266355026, + "grad_norm": 2.1569483280181885, + "learning_rate": 3.884216230149704e-05, + "loss": 0.9417, + "step": 2785 + }, + { + "epoch": 0.13566089645265747, + "grad_norm": 1.876753568649292, + "learning_rate": 3.884110437840541e-05, + "loss": 0.8747, + "step": 2786 + }, + { + "epoch": 0.13570959024176465, + "grad_norm": 1.751196026802063, + "learning_rate": 3.884004598663959e-05, + "loss": 0.9284, + "step": 2787 + }, + { + "epoch": 0.13575828403087187, + "grad_norm": 1.535421371459961, + "learning_rate": 3.88389871262259e-05, + "loss": 0.9076, + "step": 2788 + }, + { + "epoch": 0.13580697781997905, + "grad_norm": 1.7863150835037231, + "learning_rate": 3.88379277971907e-05, + "loss": 0.8446, + "step": 2789 + }, + { + "epoch": 0.13585567160908626, + "grad_norm": 1.476058006286621, + "learning_rate": 3.883686799956032e-05, + "loss": 0.8157, + "step": 2790 + }, + { + "epoch": 0.13590436539819345, + "grad_norm": 2.258532762527466, + "learning_rate": 3.883580773336114e-05, + "loss": 0.8852, + "step": 2791 + }, + { + "epoch": 0.13595305918730066, + "grad_norm": 0.07918312400579453, + "learning_rate": 3.8834746998619524e-05, + "loss": 0.6092, + "step": 2792 + }, + { + "epoch": 0.13600175297640785, + "grad_norm": 1.9309260845184326, + "learning_rate": 3.883368579536186e-05, + "loss": 0.9344, + "step": 2793 + }, + { + "epoch": 0.13605044676551506, + "grad_norm": 1.7627367973327637, + "learning_rate": 3.883262412361453e-05, + "loss": 0.8737, + "step": 2794 + }, + { + "epoch": 0.13609914055462224, + "grad_norm": 10.10439395904541, + "learning_rate": 3.8831561983403974e-05, + "loss": 0.9379, + "step": 2795 + }, + { + "epoch": 0.13614783434372946, + "grad_norm": 0.08598050475120544, + "learning_rate": 3.883049937475659e-05, + "loss": 0.6411, + "step": 2796 + }, + { + "epoch": 0.13619652813283667, + "grad_norm": 1.678533911705017, + "learning_rate": 3.882943629769881e-05, + "loss": 1.0306, + "step": 2797 + }, + { + "epoch": 0.13624522192194385, + "grad_norm": 1.5457063913345337, + "learning_rate": 3.8828372752257095e-05, + "loss": 0.9261, + "step": 2798 + }, + { + "epoch": 0.13629391571105107, + "grad_norm": 2.447434902191162, + "learning_rate": 3.8827308738457894e-05, + "loss": 0.836, + "step": 2799 + }, + { + "epoch": 0.13634260950015825, + "grad_norm": 2.5023744106292725, + "learning_rate": 3.8826244256327665e-05, + "loss": 0.8702, + "step": 2800 + }, + { + "epoch": 0.13639130328926546, + "grad_norm": 2.161086320877075, + "learning_rate": 3.8825179305892896e-05, + "loss": 0.8509, + "step": 2801 + }, + { + "epoch": 0.13643999707837265, + "grad_norm": 3.3400442600250244, + "learning_rate": 3.8824113887180075e-05, + "loss": 0.8433, + "step": 2802 + }, + { + "epoch": 0.13648869086747986, + "grad_norm": 2.3562560081481934, + "learning_rate": 3.8823048000215704e-05, + "loss": 0.9746, + "step": 2803 + }, + { + "epoch": 0.13653738465658705, + "grad_norm": 2.5579352378845215, + "learning_rate": 3.882198164502629e-05, + "loss": 0.8961, + "step": 2804 + }, + { + "epoch": 0.13658607844569426, + "grad_norm": 1.53098464012146, + "learning_rate": 3.882091482163838e-05, + "loss": 0.8983, + "step": 2805 + }, + { + "epoch": 0.13663477223480144, + "grad_norm": 1.6211084127426147, + "learning_rate": 3.8819847530078496e-05, + "loss": 0.8915, + "step": 2806 + }, + { + "epoch": 0.13668346602390866, + "grad_norm": 1.717727541923523, + "learning_rate": 3.8818779770373183e-05, + "loss": 0.944, + "step": 2807 + }, + { + "epoch": 0.13673215981301584, + "grad_norm": 1.554137110710144, + "learning_rate": 3.8817711542549014e-05, + "loss": 0.9949, + "step": 2808 + }, + { + "epoch": 0.13678085360212305, + "grad_norm": 1.8952122926712036, + "learning_rate": 3.881664284663255e-05, + "loss": 0.9006, + "step": 2809 + }, + { + "epoch": 0.13682954739123024, + "grad_norm": 2.6266438961029053, + "learning_rate": 3.8815573682650375e-05, + "loss": 0.9751, + "step": 2810 + }, + { + "epoch": 0.13687824118033745, + "grad_norm": 1.5988166332244873, + "learning_rate": 3.88145040506291e-05, + "loss": 0.8726, + "step": 2811 + }, + { + "epoch": 0.13692693496944464, + "grad_norm": 1.7791951894760132, + "learning_rate": 3.8813433950595316e-05, + "loss": 0.8325, + "step": 2812 + }, + { + "epoch": 0.13697562875855185, + "grad_norm": 1.4120508432388306, + "learning_rate": 3.881236338257564e-05, + "loss": 0.8654, + "step": 2813 + }, + { + "epoch": 0.13702432254765903, + "grad_norm": 2.4622068405151367, + "learning_rate": 3.881129234659672e-05, + "loss": 0.9158, + "step": 2814 + }, + { + "epoch": 0.13707301633676625, + "grad_norm": 1.9219958782196045, + "learning_rate": 3.881022084268518e-05, + "loss": 0.8681, + "step": 2815 + }, + { + "epoch": 0.13712171012587346, + "grad_norm": 2.5683395862579346, + "learning_rate": 3.880914887086769e-05, + "loss": 0.8745, + "step": 2816 + }, + { + "epoch": 0.13717040391498064, + "grad_norm": 2.0883538722991943, + "learning_rate": 3.880807643117089e-05, + "loss": 0.8511, + "step": 2817 + }, + { + "epoch": 0.13721909770408786, + "grad_norm": 1.7402924299240112, + "learning_rate": 3.8807003523621495e-05, + "loss": 0.9367, + "step": 2818 + }, + { + "epoch": 0.13726779149319504, + "grad_norm": 1.7511265277862549, + "learning_rate": 3.8805930148246164e-05, + "loss": 0.7946, + "step": 2819 + }, + { + "epoch": 0.13731648528230225, + "grad_norm": 2.166886329650879, + "learning_rate": 3.8804856305071606e-05, + "loss": 0.8165, + "step": 2820 + }, + { + "epoch": 0.13736517907140944, + "grad_norm": 2.241929531097412, + "learning_rate": 3.880378199412453e-05, + "loss": 0.9334, + "step": 2821 + }, + { + "epoch": 0.13741387286051665, + "grad_norm": 1.7779300212860107, + "learning_rate": 3.880270721543166e-05, + "loss": 0.9301, + "step": 2822 + }, + { + "epoch": 0.13746256664962384, + "grad_norm": 1.7944968938827515, + "learning_rate": 3.880163196901975e-05, + "loss": 0.9838, + "step": 2823 + }, + { + "epoch": 0.13751126043873105, + "grad_norm": 1.8990486860275269, + "learning_rate": 3.8800556254915516e-05, + "loss": 0.8323, + "step": 2824 + }, + { + "epoch": 0.13755995422783823, + "grad_norm": 1.3923064470291138, + "learning_rate": 3.879948007314573e-05, + "loss": 0.9889, + "step": 2825 + }, + { + "epoch": 0.13760864801694545, + "grad_norm": 1.5602083206176758, + "learning_rate": 3.879840342373717e-05, + "loss": 0.9463, + "step": 2826 + }, + { + "epoch": 0.13765734180605263, + "grad_norm": 2.068596601486206, + "learning_rate": 3.879732630671661e-05, + "loss": 0.9302, + "step": 2827 + }, + { + "epoch": 0.13770603559515984, + "grad_norm": 2.251528024673462, + "learning_rate": 3.879624872211084e-05, + "loss": 0.9318, + "step": 2828 + }, + { + "epoch": 0.13775472938426703, + "grad_norm": 4.128345489501953, + "learning_rate": 3.879517066994667e-05, + "loss": 0.9003, + "step": 2829 + }, + { + "epoch": 0.13780342317337424, + "grad_norm": 1.4740852117538452, + "learning_rate": 3.879409215025092e-05, + "loss": 0.8863, + "step": 2830 + }, + { + "epoch": 0.13785211696248142, + "grad_norm": 1.7124063968658447, + "learning_rate": 3.879301316305041e-05, + "loss": 0.8573, + "step": 2831 + }, + { + "epoch": 0.13790081075158864, + "grad_norm": 2.2823314666748047, + "learning_rate": 3.879193370837199e-05, + "loss": 0.9944, + "step": 2832 + }, + { + "epoch": 0.13794950454069582, + "grad_norm": 2.193859100341797, + "learning_rate": 3.87908537862425e-05, + "loss": 0.9547, + "step": 2833 + }, + { + "epoch": 0.13799819832980303, + "grad_norm": 1.959105134010315, + "learning_rate": 3.878977339668881e-05, + "loss": 0.9016, + "step": 2834 + }, + { + "epoch": 0.13804689211891022, + "grad_norm": 2.053954601287842, + "learning_rate": 3.878869253973779e-05, + "loss": 0.8825, + "step": 2835 + }, + { + "epoch": 0.13809558590801743, + "grad_norm": 2.2112410068511963, + "learning_rate": 3.878761121541633e-05, + "loss": 0.9772, + "step": 2836 + }, + { + "epoch": 0.13814427969712464, + "grad_norm": 0.08892909437417984, + "learning_rate": 3.878652942375134e-05, + "loss": 0.7122, + "step": 2837 + }, + { + "epoch": 0.13819297348623183, + "grad_norm": 2.309988260269165, + "learning_rate": 3.8785447164769706e-05, + "loss": 0.947, + "step": 2838 + }, + { + "epoch": 0.13824166727533904, + "grad_norm": 1.9454286098480225, + "learning_rate": 3.878436443849836e-05, + "loss": 0.8663, + "step": 2839 + }, + { + "epoch": 0.13829036106444623, + "grad_norm": 1.6682673692703247, + "learning_rate": 3.878328124496424e-05, + "loss": 0.9232, + "step": 2840 + }, + { + "epoch": 0.13833905485355344, + "grad_norm": 1.8805674314498901, + "learning_rate": 3.878219758419429e-05, + "loss": 0.9272, + "step": 2841 + }, + { + "epoch": 0.13838774864266062, + "grad_norm": 2.084580421447754, + "learning_rate": 3.878111345621546e-05, + "loss": 0.9425, + "step": 2842 + }, + { + "epoch": 0.13843644243176784, + "grad_norm": 1.5859291553497314, + "learning_rate": 3.878002886105471e-05, + "loss": 0.9185, + "step": 2843 + }, + { + "epoch": 0.13848513622087502, + "grad_norm": 0.0824158787727356, + "learning_rate": 3.8778943798739045e-05, + "loss": 0.6245, + "step": 2844 + }, + { + "epoch": 0.13853383000998223, + "grad_norm": 2.1455156803131104, + "learning_rate": 3.8777858269295435e-05, + "loss": 0.9116, + "step": 2845 + }, + { + "epoch": 0.13858252379908942, + "grad_norm": 1.5093930959701538, + "learning_rate": 3.877677227275089e-05, + "loss": 0.9926, + "step": 2846 + }, + { + "epoch": 0.13863121758819663, + "grad_norm": 1.6444716453552246, + "learning_rate": 3.8775685809132425e-05, + "loss": 0.9524, + "step": 2847 + }, + { + "epoch": 0.13867991137730382, + "grad_norm": 1.871328592300415, + "learning_rate": 3.877459887846706e-05, + "loss": 0.9389, + "step": 2848 + }, + { + "epoch": 0.13872860516641103, + "grad_norm": 9.497575759887695, + "learning_rate": 3.877351148078183e-05, + "loss": 0.9402, + "step": 2849 + }, + { + "epoch": 0.1387772989555182, + "grad_norm": 1.8944588899612427, + "learning_rate": 3.87724236161038e-05, + "loss": 0.9822, + "step": 2850 + }, + { + "epoch": 0.13882599274462543, + "grad_norm": 2.328911781311035, + "learning_rate": 3.877133528446002e-05, + "loss": 0.933, + "step": 2851 + }, + { + "epoch": 0.1388746865337326, + "grad_norm": 2.6278209686279297, + "learning_rate": 3.8770246485877556e-05, + "loss": 0.9007, + "step": 2852 + }, + { + "epoch": 0.13892338032283982, + "grad_norm": 2.1076138019561768, + "learning_rate": 3.876915722038351e-05, + "loss": 0.8384, + "step": 2853 + }, + { + "epoch": 0.138972074111947, + "grad_norm": 2.005824565887451, + "learning_rate": 3.8768067488004957e-05, + "loss": 0.9158, + "step": 2854 + }, + { + "epoch": 0.13902076790105422, + "grad_norm": 1.9852670431137085, + "learning_rate": 3.8766977288769016e-05, + "loss": 0.9068, + "step": 2855 + }, + { + "epoch": 0.13906946169016143, + "grad_norm": 3.589794635772705, + "learning_rate": 3.8765886622702806e-05, + "loss": 0.8823, + "step": 2856 + }, + { + "epoch": 0.13911815547926862, + "grad_norm": 1.5547387599945068, + "learning_rate": 3.876479548983345e-05, + "loss": 0.8656, + "step": 2857 + }, + { + "epoch": 0.13916684926837583, + "grad_norm": 1.4707483053207397, + "learning_rate": 3.87637038901881e-05, + "loss": 0.85, + "step": 2858 + }, + { + "epoch": 0.13921554305748302, + "grad_norm": 1.8405382633209229, + "learning_rate": 3.876261182379391e-05, + "loss": 0.8953, + "step": 2859 + }, + { + "epoch": 0.13926423684659023, + "grad_norm": 2.0333399772644043, + "learning_rate": 3.8761519290678035e-05, + "loss": 0.8216, + "step": 2860 + }, + { + "epoch": 0.1393129306356974, + "grad_norm": 2.537536144256592, + "learning_rate": 3.876042629086766e-05, + "loss": 0.8427, + "step": 2861 + }, + { + "epoch": 0.13936162442480463, + "grad_norm": 2.406618118286133, + "learning_rate": 3.875933282438996e-05, + "loss": 0.8259, + "step": 2862 + }, + { + "epoch": 0.1394103182139118, + "grad_norm": 1.6365413665771484, + "learning_rate": 3.875823889127216e-05, + "loss": 0.8547, + "step": 2863 + }, + { + "epoch": 0.13945901200301902, + "grad_norm": 2.0219104290008545, + "learning_rate": 3.875714449154145e-05, + "loss": 0.8782, + "step": 2864 + }, + { + "epoch": 0.1395077057921262, + "grad_norm": 1.9393644332885742, + "learning_rate": 3.875604962522506e-05, + "loss": 0.985, + "step": 2865 + }, + { + "epoch": 0.13955639958123342, + "grad_norm": 2.0269711017608643, + "learning_rate": 3.875495429235022e-05, + "loss": 0.9855, + "step": 2866 + }, + { + "epoch": 0.1396050933703406, + "grad_norm": 1.7587348222732544, + "learning_rate": 3.8753858492944193e-05, + "loss": 0.8955, + "step": 2867 + }, + { + "epoch": 0.13965378715944782, + "grad_norm": 1.3142292499542236, + "learning_rate": 3.875276222703422e-05, + "loss": 0.8659, + "step": 2868 + }, + { + "epoch": 0.139702480948555, + "grad_norm": 2.025838613510132, + "learning_rate": 3.8751665494647585e-05, + "loss": 0.9323, + "step": 2869 + }, + { + "epoch": 0.13975117473766221, + "grad_norm": 1.701569676399231, + "learning_rate": 3.875056829581155e-05, + "loss": 0.8903, + "step": 2870 + }, + { + "epoch": 0.1397998685267694, + "grad_norm": 2.298746109008789, + "learning_rate": 3.874947063055342e-05, + "loss": 0.922, + "step": 2871 + }, + { + "epoch": 0.1398485623158766, + "grad_norm": 1.7263691425323486, + "learning_rate": 3.874837249890051e-05, + "loss": 0.9594, + "step": 2872 + }, + { + "epoch": 0.1398972561049838, + "grad_norm": 1.9834641218185425, + "learning_rate": 3.874727390088011e-05, + "loss": 0.9914, + "step": 2873 + }, + { + "epoch": 0.139945949894091, + "grad_norm": 1.8059418201446533, + "learning_rate": 3.8746174836519575e-05, + "loss": 0.9609, + "step": 2874 + }, + { + "epoch": 0.1399946436831982, + "grad_norm": 1.6813997030258179, + "learning_rate": 3.8745075305846224e-05, + "loss": 0.8392, + "step": 2875 + }, + { + "epoch": 0.1400433374723054, + "grad_norm": 0.08246814459562302, + "learning_rate": 3.8743975308887415e-05, + "loss": 0.6584, + "step": 2876 + }, + { + "epoch": 0.14009203126141262, + "grad_norm": 2.4454472064971924, + "learning_rate": 3.874287484567051e-05, + "loss": 0.9065, + "step": 2877 + }, + { + "epoch": 0.1401407250505198, + "grad_norm": 2.6319046020507812, + "learning_rate": 3.874177391622289e-05, + "loss": 0.9848, + "step": 2878 + }, + { + "epoch": 0.14018941883962702, + "grad_norm": 6.985170841217041, + "learning_rate": 3.874067252057193e-05, + "loss": 0.8871, + "step": 2879 + }, + { + "epoch": 0.1402381126287342, + "grad_norm": 2.056492567062378, + "learning_rate": 3.873957065874504e-05, + "loss": 0.9129, + "step": 2880 + }, + { + "epoch": 0.14028680641784141, + "grad_norm": 2.3395884037017822, + "learning_rate": 3.873846833076962e-05, + "loss": 0.9014, + "step": 2881 + }, + { + "epoch": 0.1403355002069486, + "grad_norm": 2.9366416931152344, + "learning_rate": 3.8737365536673086e-05, + "loss": 0.964, + "step": 2882 + }, + { + "epoch": 0.1403841939960558, + "grad_norm": 2.1093063354492188, + "learning_rate": 3.873626227648288e-05, + "loss": 0.9382, + "step": 2883 + }, + { + "epoch": 0.140432887785163, + "grad_norm": 2.711822986602783, + "learning_rate": 3.8735158550226436e-05, + "loss": 0.8991, + "step": 2884 + }, + { + "epoch": 0.1404815815742702, + "grad_norm": 2.1438374519348145, + "learning_rate": 3.873405435793122e-05, + "loss": 0.8285, + "step": 2885 + }, + { + "epoch": 0.1405302753633774, + "grad_norm": 2.7748701572418213, + "learning_rate": 3.8732949699624695e-05, + "loss": 0.8647, + "step": 2886 + }, + { + "epoch": 0.1405789691524846, + "grad_norm": 2.1960995197296143, + "learning_rate": 3.873184457533434e-05, + "loss": 0.8853, + "step": 2887 + }, + { + "epoch": 0.1406276629415918, + "grad_norm": 1.7636363506317139, + "learning_rate": 3.873073898508764e-05, + "loss": 0.9068, + "step": 2888 + }, + { + "epoch": 0.140676356730699, + "grad_norm": 1.8625893592834473, + "learning_rate": 3.8729632928912095e-05, + "loss": 0.8525, + "step": 2889 + }, + { + "epoch": 0.1407250505198062, + "grad_norm": 1.708134651184082, + "learning_rate": 3.872852640683523e-05, + "loss": 0.8647, + "step": 2890 + }, + { + "epoch": 0.1407737443089134, + "grad_norm": 2.0899834632873535, + "learning_rate": 3.8727419418884555e-05, + "loss": 1.0107, + "step": 2891 + }, + { + "epoch": 0.14082243809802059, + "grad_norm": 3.18601655960083, + "learning_rate": 3.8726311965087614e-05, + "loss": 0.9018, + "step": 2892 + }, + { + "epoch": 0.1408711318871278, + "grad_norm": 2.6303815841674805, + "learning_rate": 3.8725204045471965e-05, + "loss": 0.9219, + "step": 2893 + }, + { + "epoch": 0.14091982567623498, + "grad_norm": 2.762336015701294, + "learning_rate": 3.872409566006515e-05, + "loss": 0.9012, + "step": 2894 + }, + { + "epoch": 0.1409685194653422, + "grad_norm": 2.3684871196746826, + "learning_rate": 3.872298680889476e-05, + "loss": 0.8853, + "step": 2895 + }, + { + "epoch": 0.1410172132544494, + "grad_norm": 2.1393826007843018, + "learning_rate": 3.872187749198836e-05, + "loss": 0.9152, + "step": 2896 + }, + { + "epoch": 0.1410659070435566, + "grad_norm": 1.9204422235488892, + "learning_rate": 3.8720767709373546e-05, + "loss": 0.9253, + "step": 2897 + }, + { + "epoch": 0.1411146008326638, + "grad_norm": 2.0656259059906006, + "learning_rate": 3.871965746107793e-05, + "loss": 0.9248, + "step": 2898 + }, + { + "epoch": 0.141163294621771, + "grad_norm": 1.911329746246338, + "learning_rate": 3.871854674712913e-05, + "loss": 0.9443, + "step": 2899 + }, + { + "epoch": 0.1412119884108782, + "grad_norm": 2.4443652629852295, + "learning_rate": 3.871743556755477e-05, + "loss": 0.8757, + "step": 2900 + }, + { + "epoch": 0.1412606821999854, + "grad_norm": 1.8711029291152954, + "learning_rate": 3.8716323922382495e-05, + "loss": 0.9473, + "step": 2901 + }, + { + "epoch": 0.1413093759890926, + "grad_norm": 1.8927459716796875, + "learning_rate": 3.8715211811639955e-05, + "loss": 1.021, + "step": 2902 + }, + { + "epoch": 0.14135806977819979, + "grad_norm": 1.629165768623352, + "learning_rate": 3.871409923535482e-05, + "loss": 0.8299, + "step": 2903 + }, + { + "epoch": 0.141406763567307, + "grad_norm": 1.7766000032424927, + "learning_rate": 3.871298619355476e-05, + "loss": 0.8224, + "step": 2904 + }, + { + "epoch": 0.14145545735641418, + "grad_norm": 1.991484522819519, + "learning_rate": 3.871187268626746e-05, + "loss": 0.8833, + "step": 2905 + }, + { + "epoch": 0.1415041511455214, + "grad_norm": 2.1065146923065186, + "learning_rate": 3.871075871352061e-05, + "loss": 0.9139, + "step": 2906 + }, + { + "epoch": 0.14155284493462858, + "grad_norm": 3.1602706909179688, + "learning_rate": 3.870964427534195e-05, + "loss": 0.8771, + "step": 2907 + }, + { + "epoch": 0.1416015387237358, + "grad_norm": 1.4934494495391846, + "learning_rate": 3.870852937175917e-05, + "loss": 0.9266, + "step": 2908 + }, + { + "epoch": 0.14165023251284298, + "grad_norm": 1.875179409980774, + "learning_rate": 3.870741400280002e-05, + "loss": 0.837, + "step": 2909 + }, + { + "epoch": 0.1416989263019502, + "grad_norm": 2.535459518432617, + "learning_rate": 3.870629816849224e-05, + "loss": 0.9685, + "step": 2910 + }, + { + "epoch": 0.14174762009105737, + "grad_norm": 2.3300395011901855, + "learning_rate": 3.8705181868863586e-05, + "loss": 0.9159, + "step": 2911 + }, + { + "epoch": 0.1417963138801646, + "grad_norm": 1.7894090414047241, + "learning_rate": 3.870406510394183e-05, + "loss": 0.9825, + "step": 2912 + }, + { + "epoch": 0.14184500766927177, + "grad_norm": 2.2412755489349365, + "learning_rate": 3.870294787375475e-05, + "loss": 0.8922, + "step": 2913 + }, + { + "epoch": 0.14189370145837898, + "grad_norm": 1.5555047988891602, + "learning_rate": 3.8701830178330134e-05, + "loss": 0.8267, + "step": 2914 + }, + { + "epoch": 0.14194239524748617, + "grad_norm": 1.7448887825012207, + "learning_rate": 3.870071201769579e-05, + "loss": 0.8746, + "step": 2915 + }, + { + "epoch": 0.14199108903659338, + "grad_norm": 2.3618574142456055, + "learning_rate": 3.869959339187953e-05, + "loss": 0.9028, + "step": 2916 + }, + { + "epoch": 0.1420397828257006, + "grad_norm": 1.5558055639266968, + "learning_rate": 3.869847430090918e-05, + "loss": 0.8541, + "step": 2917 + }, + { + "epoch": 0.14208847661480778, + "grad_norm": 3.3863563537597656, + "learning_rate": 3.8697354744812565e-05, + "loss": 0.931, + "step": 2918 + }, + { + "epoch": 0.142137170403915, + "grad_norm": 0.08907461166381836, + "learning_rate": 3.869623472361756e-05, + "loss": 0.6088, + "step": 2919 + }, + { + "epoch": 0.14218586419302218, + "grad_norm": 2.0494229793548584, + "learning_rate": 3.869511423735201e-05, + "loss": 0.9872, + "step": 2920 + }, + { + "epoch": 0.1422345579821294, + "grad_norm": 1.7826344966888428, + "learning_rate": 3.8693993286043785e-05, + "loss": 0.9489, + "step": 2921 + }, + { + "epoch": 0.14228325177123657, + "grad_norm": 2.2555043697357178, + "learning_rate": 3.869287186972077e-05, + "loss": 0.9462, + "step": 2922 + }, + { + "epoch": 0.1423319455603438, + "grad_norm": 0.0892515480518341, + "learning_rate": 3.8691749988410866e-05, + "loss": 0.5899, + "step": 2923 + }, + { + "epoch": 0.14238063934945097, + "grad_norm": 3.97329044342041, + "learning_rate": 3.869062764214198e-05, + "loss": 0.948, + "step": 2924 + }, + { + "epoch": 0.14242933313855818, + "grad_norm": 1.682930827140808, + "learning_rate": 3.868950483094202e-05, + "loss": 0.9258, + "step": 2925 + }, + { + "epoch": 0.14247802692766537, + "grad_norm": 1.9182785749435425, + "learning_rate": 3.868838155483893e-05, + "loss": 0.9616, + "step": 2926 + }, + { + "epoch": 0.14252672071677258, + "grad_norm": 2.7618958950042725, + "learning_rate": 3.8687257813860644e-05, + "loss": 0.8343, + "step": 2927 + }, + { + "epoch": 0.14257541450587977, + "grad_norm": 2.2476987838745117, + "learning_rate": 3.8686133608035106e-05, + "loss": 0.9423, + "step": 2928 + }, + { + "epoch": 0.14262410829498698, + "grad_norm": 1.6008028984069824, + "learning_rate": 3.8685008937390304e-05, + "loss": 0.9573, + "step": 2929 + }, + { + "epoch": 0.14267280208409416, + "grad_norm": 2.803814649581909, + "learning_rate": 3.86838838019542e-05, + "loss": 0.871, + "step": 2930 + }, + { + "epoch": 0.14272149587320138, + "grad_norm": 2.771240472793579, + "learning_rate": 3.868275820175478e-05, + "loss": 0.9049, + "step": 2931 + }, + { + "epoch": 0.14277018966230856, + "grad_norm": 2.06653094291687, + "learning_rate": 3.868163213682004e-05, + "loss": 0.908, + "step": 2932 + }, + { + "epoch": 0.14281888345141577, + "grad_norm": 1.6757683753967285, + "learning_rate": 3.8680505607178e-05, + "loss": 0.9459, + "step": 2933 + }, + { + "epoch": 0.14286757724052296, + "grad_norm": 2.0241551399230957, + "learning_rate": 3.867937861285668e-05, + "loss": 0.9029, + "step": 2934 + }, + { + "epoch": 0.14291627102963017, + "grad_norm": 2.4843318462371826, + "learning_rate": 3.867825115388412e-05, + "loss": 0.9181, + "step": 2935 + }, + { + "epoch": 0.14296496481873738, + "grad_norm": 2.280367851257324, + "learning_rate": 3.867712323028836e-05, + "loss": 0.9963, + "step": 2936 + }, + { + "epoch": 0.14301365860784457, + "grad_norm": 2.1281845569610596, + "learning_rate": 3.8675994842097446e-05, + "loss": 0.9931, + "step": 2937 + }, + { + "epoch": 0.14306235239695178, + "grad_norm": 1.8986942768096924, + "learning_rate": 3.8674865989339465e-05, + "loss": 0.9236, + "step": 2938 + }, + { + "epoch": 0.14311104618605897, + "grad_norm": 3.563642978668213, + "learning_rate": 3.8673736672042494e-05, + "loss": 0.8848, + "step": 2939 + }, + { + "epoch": 0.14315973997516618, + "grad_norm": 2.1789650917053223, + "learning_rate": 3.867260689023461e-05, + "loss": 0.9131, + "step": 2940 + }, + { + "epoch": 0.14320843376427336, + "grad_norm": 1.6525990962982178, + "learning_rate": 3.8671476643943935e-05, + "loss": 0.9174, + "step": 2941 + }, + { + "epoch": 0.14325712755338058, + "grad_norm": 2.43733549118042, + "learning_rate": 3.867034593319857e-05, + "loss": 0.9313, + "step": 2942 + }, + { + "epoch": 0.14330582134248776, + "grad_norm": 2.1048593521118164, + "learning_rate": 3.8669214758026655e-05, + "loss": 0.8671, + "step": 2943 + }, + { + "epoch": 0.14335451513159497, + "grad_norm": 2.565091848373413, + "learning_rate": 3.8668083118456314e-05, + "loss": 0.9859, + "step": 2944 + }, + { + "epoch": 0.14340320892070216, + "grad_norm": 1.861079454421997, + "learning_rate": 3.866695101451571e-05, + "loss": 0.9118, + "step": 2945 + }, + { + "epoch": 0.14345190270980937, + "grad_norm": 1.579534888267517, + "learning_rate": 3.8665818446233e-05, + "loss": 0.9003, + "step": 2946 + }, + { + "epoch": 0.14350059649891655, + "grad_norm": 1.651119351387024, + "learning_rate": 3.8664685413636344e-05, + "loss": 0.8206, + "step": 2947 + }, + { + "epoch": 0.14354929028802377, + "grad_norm": 1.978809118270874, + "learning_rate": 3.8663551916753936e-05, + "loss": 0.8811, + "step": 2948 + }, + { + "epoch": 0.14359798407713095, + "grad_norm": 2.201920509338379, + "learning_rate": 3.866241795561398e-05, + "loss": 0.9362, + "step": 2949 + }, + { + "epoch": 0.14364667786623816, + "grad_norm": 2.503068208694458, + "learning_rate": 3.8661283530244674e-05, + "loss": 0.8636, + "step": 2950 + }, + { + "epoch": 0.14369537165534535, + "grad_norm": 2.5606584548950195, + "learning_rate": 3.866014864067424e-05, + "loss": 0.9711, + "step": 2951 + }, + { + "epoch": 0.14374406544445256, + "grad_norm": 2.5570530891418457, + "learning_rate": 3.86590132869309e-05, + "loss": 0.9027, + "step": 2952 + }, + { + "epoch": 0.14379275923355975, + "grad_norm": 2.4787018299102783, + "learning_rate": 3.865787746904291e-05, + "loss": 1.027, + "step": 2953 + }, + { + "epoch": 0.14384145302266696, + "grad_norm": 1.7960723638534546, + "learning_rate": 3.865674118703851e-05, + "loss": 0.734, + "step": 2954 + }, + { + "epoch": 0.14389014681177414, + "grad_norm": 2.817944049835205, + "learning_rate": 3.865560444094598e-05, + "loss": 0.8471, + "step": 2955 + }, + { + "epoch": 0.14393884060088136, + "grad_norm": 1.9618186950683594, + "learning_rate": 3.8654467230793585e-05, + "loss": 1.0184, + "step": 2956 + }, + { + "epoch": 0.14398753438998857, + "grad_norm": 5.37992000579834, + "learning_rate": 3.865332955660962e-05, + "loss": 0.9046, + "step": 2957 + }, + { + "epoch": 0.14403622817909575, + "grad_norm": 0.11135296523571014, + "learning_rate": 3.8652191418422375e-05, + "loss": 0.5869, + "step": 2958 + }, + { + "epoch": 0.14408492196820297, + "grad_norm": 1.9225904941558838, + "learning_rate": 3.8651052816260175e-05, + "loss": 0.8665, + "step": 2959 + }, + { + "epoch": 0.14413361575731015, + "grad_norm": 2.1285674571990967, + "learning_rate": 3.864991375015133e-05, + "loss": 0.8891, + "step": 2960 + }, + { + "epoch": 0.14418230954641736, + "grad_norm": 2.9893457889556885, + "learning_rate": 3.864877422012418e-05, + "loss": 0.9003, + "step": 2961 + }, + { + "epoch": 0.14423100333552455, + "grad_norm": 2.635221004486084, + "learning_rate": 3.864763422620707e-05, + "loss": 0.9033, + "step": 2962 + }, + { + "epoch": 0.14427969712463176, + "grad_norm": 3.0076375007629395, + "learning_rate": 3.864649376842836e-05, + "loss": 0.8882, + "step": 2963 + }, + { + "epoch": 0.14432839091373895, + "grad_norm": 2.3868255615234375, + "learning_rate": 3.864535284681642e-05, + "loss": 0.8269, + "step": 2964 + }, + { + "epoch": 0.14437708470284616, + "grad_norm": 1.9791022539138794, + "learning_rate": 3.864421146139963e-05, + "loss": 0.9102, + "step": 2965 + }, + { + "epoch": 0.14442577849195334, + "grad_norm": 1.4219818115234375, + "learning_rate": 3.8643069612206376e-05, + "loss": 0.8339, + "step": 2966 + }, + { + "epoch": 0.14447447228106056, + "grad_norm": 5.495380878448486, + "learning_rate": 3.864192729926507e-05, + "loss": 0.8921, + "step": 2967 + }, + { + "epoch": 0.14452316607016774, + "grad_norm": 2.193056583404541, + "learning_rate": 3.8640784522604116e-05, + "loss": 0.8852, + "step": 2968 + }, + { + "epoch": 0.14457185985927495, + "grad_norm": 2.8070456981658936, + "learning_rate": 3.863964128225195e-05, + "loss": 0.913, + "step": 2969 + }, + { + "epoch": 0.14462055364838214, + "grad_norm": 2.042855978012085, + "learning_rate": 3.8638497578237014e-05, + "loss": 0.9206, + "step": 2970 + }, + { + "epoch": 0.14466924743748935, + "grad_norm": 1.696752667427063, + "learning_rate": 3.8637353410587744e-05, + "loss": 0.8771, + "step": 2971 + }, + { + "epoch": 0.14471794122659654, + "grad_norm": 1.7729132175445557, + "learning_rate": 3.863620877933261e-05, + "loss": 0.9185, + "step": 2972 + }, + { + "epoch": 0.14476663501570375, + "grad_norm": 1.9894813299179077, + "learning_rate": 3.863506368450008e-05, + "loss": 0.8919, + "step": 2973 + }, + { + "epoch": 0.14481532880481093, + "grad_norm": 1.8227299451828003, + "learning_rate": 3.8633918126118644e-05, + "loss": 0.8946, + "step": 2974 + }, + { + "epoch": 0.14486402259391815, + "grad_norm": 1.7556087970733643, + "learning_rate": 3.86327721042168e-05, + "loss": 0.9203, + "step": 2975 + }, + { + "epoch": 0.14491271638302536, + "grad_norm": 2.3845374584198, + "learning_rate": 3.863162561882305e-05, + "loss": 0.971, + "step": 2976 + }, + { + "epoch": 0.14496141017213254, + "grad_norm": 2.286839246749878, + "learning_rate": 3.863047866996591e-05, + "loss": 0.8747, + "step": 2977 + }, + { + "epoch": 0.14501010396123976, + "grad_norm": 2.9270031452178955, + "learning_rate": 3.862933125767391e-05, + "loss": 0.9484, + "step": 2978 + }, + { + "epoch": 0.14505879775034694, + "grad_norm": 1.6865423917770386, + "learning_rate": 3.862818338197561e-05, + "loss": 0.8401, + "step": 2979 + }, + { + "epoch": 0.14510749153945415, + "grad_norm": 2.831042766571045, + "learning_rate": 3.862703504289954e-05, + "loss": 0.8351, + "step": 2980 + }, + { + "epoch": 0.14515618532856134, + "grad_norm": 4.662944316864014, + "learning_rate": 3.862588624047428e-05, + "loss": 0.8629, + "step": 2981 + }, + { + "epoch": 0.14520487911766855, + "grad_norm": 3.0307865142822266, + "learning_rate": 3.86247369747284e-05, + "loss": 0.9717, + "step": 2982 + }, + { + "epoch": 0.14525357290677574, + "grad_norm": 0.08771483600139618, + "learning_rate": 3.8623587245690483e-05, + "loss": 0.6141, + "step": 2983 + }, + { + "epoch": 0.14530226669588295, + "grad_norm": 2.685654640197754, + "learning_rate": 3.862243705338914e-05, + "loss": 0.8671, + "step": 2984 + }, + { + "epoch": 0.14535096048499013, + "grad_norm": 3.562067747116089, + "learning_rate": 3.862128639785298e-05, + "loss": 0.9, + "step": 2985 + }, + { + "epoch": 0.14539965427409735, + "grad_norm": 2.0407190322875977, + "learning_rate": 3.8620135279110626e-05, + "loss": 0.945, + "step": 2986 + }, + { + "epoch": 0.14544834806320453, + "grad_norm": 3.2591941356658936, + "learning_rate": 3.86189836971907e-05, + "loss": 0.9343, + "step": 2987 + }, + { + "epoch": 0.14549704185231174, + "grad_norm": 2.1856448650360107, + "learning_rate": 3.861783165212186e-05, + "loss": 0.9308, + "step": 2988 + }, + { + "epoch": 0.14554573564141893, + "grad_norm": 2.9473390579223633, + "learning_rate": 3.8616679143932766e-05, + "loss": 0.9571, + "step": 2989 + }, + { + "epoch": 0.14559442943052614, + "grad_norm": 2.1971349716186523, + "learning_rate": 3.8615526172652075e-05, + "loss": 0.8906, + "step": 2990 + }, + { + "epoch": 0.14564312321963332, + "grad_norm": 3.9246251583099365, + "learning_rate": 3.861437273830847e-05, + "loss": 0.854, + "step": 2991 + }, + { + "epoch": 0.14569181700874054, + "grad_norm": 3.8485960960388184, + "learning_rate": 3.861321884093066e-05, + "loss": 0.8915, + "step": 2992 + }, + { + "epoch": 0.14574051079784772, + "grad_norm": 2.918463945388794, + "learning_rate": 3.861206448054732e-05, + "loss": 0.8212, + "step": 2993 + }, + { + "epoch": 0.14578920458695493, + "grad_norm": 1.7045842409133911, + "learning_rate": 3.8610909657187186e-05, + "loss": 0.8955, + "step": 2994 + }, + { + "epoch": 0.14583789837606212, + "grad_norm": 2.044771909713745, + "learning_rate": 3.860975437087898e-05, + "loss": 0.9021, + "step": 2995 + }, + { + "epoch": 0.14588659216516933, + "grad_norm": 2.246666669845581, + "learning_rate": 3.860859862165143e-05, + "loss": 0.9399, + "step": 2996 + }, + { + "epoch": 0.14593528595427654, + "grad_norm": 1.5605096817016602, + "learning_rate": 3.860744240953329e-05, + "loss": 0.8427, + "step": 2997 + }, + { + "epoch": 0.14598397974338373, + "grad_norm": 4.837510585784912, + "learning_rate": 3.860628573455334e-05, + "loss": 0.8914, + "step": 2998 + }, + { + "epoch": 0.14603267353249094, + "grad_norm": 2.1504967212677, + "learning_rate": 3.860512859674032e-05, + "loss": 0.8893, + "step": 2999 + }, + { + "epoch": 0.14608136732159813, + "grad_norm": 2.7601778507232666, + "learning_rate": 3.860397099612303e-05, + "loss": 0.9447, + "step": 3000 + }, + { + "epoch": 0.14613006111070534, + "grad_norm": 2.301074266433716, + "learning_rate": 3.8602812932730274e-05, + "loss": 0.866, + "step": 3001 + }, + { + "epoch": 0.14617875489981252, + "grad_norm": 2.0528759956359863, + "learning_rate": 3.860165440659084e-05, + "loss": 0.867, + "step": 3002 + }, + { + "epoch": 0.14622744868891974, + "grad_norm": 1.941537857055664, + "learning_rate": 3.860049541773357e-05, + "loss": 0.9337, + "step": 3003 + }, + { + "epoch": 0.14627614247802692, + "grad_norm": 1.915689468383789, + "learning_rate": 3.859933596618727e-05, + "loss": 0.9366, + "step": 3004 + }, + { + "epoch": 0.14632483626713413, + "grad_norm": 1.7855762243270874, + "learning_rate": 3.859817605198079e-05, + "loss": 0.863, + "step": 3005 + }, + { + "epoch": 0.14637353005624132, + "grad_norm": 2.0419459342956543, + "learning_rate": 3.859701567514299e-05, + "loss": 0.8726, + "step": 3006 + }, + { + "epoch": 0.14642222384534853, + "grad_norm": 2.106553316116333, + "learning_rate": 3.859585483570273e-05, + "loss": 0.9143, + "step": 3007 + }, + { + "epoch": 0.14647091763445572, + "grad_norm": 1.6059107780456543, + "learning_rate": 3.859469353368889e-05, + "loss": 0.8557, + "step": 3008 + }, + { + "epoch": 0.14651961142356293, + "grad_norm": 6.475455284118652, + "learning_rate": 3.8593531769130345e-05, + "loss": 0.8995, + "step": 3009 + }, + { + "epoch": 0.1465683052126701, + "grad_norm": 2.6425962448120117, + "learning_rate": 3.8592369542056006e-05, + "loss": 0.8756, + "step": 3010 + }, + { + "epoch": 0.14661699900177733, + "grad_norm": 2.560028314590454, + "learning_rate": 3.859120685249478e-05, + "loss": 0.9487, + "step": 3011 + }, + { + "epoch": 0.1466656927908845, + "grad_norm": 6.631667137145996, + "learning_rate": 3.8590043700475586e-05, + "loss": 0.8629, + "step": 3012 + }, + { + "epoch": 0.14671438657999172, + "grad_norm": 1.8923836946487427, + "learning_rate": 3.858888008602736e-05, + "loss": 0.9928, + "step": 3013 + }, + { + "epoch": 0.1467630803690989, + "grad_norm": 1.911375641822815, + "learning_rate": 3.858771600917905e-05, + "loss": 0.926, + "step": 3014 + }, + { + "epoch": 0.14681177415820612, + "grad_norm": 4.506515026092529, + "learning_rate": 3.85865514699596e-05, + "loss": 0.8516, + "step": 3015 + }, + { + "epoch": 0.14686046794731333, + "grad_norm": 1.7886526584625244, + "learning_rate": 3.8585386468398e-05, + "loss": 0.929, + "step": 3016 + }, + { + "epoch": 0.14690916173642052, + "grad_norm": 2.1745235919952393, + "learning_rate": 3.858422100452321e-05, + "loss": 0.7775, + "step": 3017 + }, + { + "epoch": 0.14695785552552773, + "grad_norm": 2.3288581371307373, + "learning_rate": 3.858305507836423e-05, + "loss": 0.8797, + "step": 3018 + }, + { + "epoch": 0.14700654931463492, + "grad_norm": 1.9219025373458862, + "learning_rate": 3.858188868995006e-05, + "loss": 0.875, + "step": 3019 + }, + { + "epoch": 0.14705524310374213, + "grad_norm": 1.8432313203811646, + "learning_rate": 3.858072183930971e-05, + "loss": 0.8458, + "step": 3020 + }, + { + "epoch": 0.1471039368928493, + "grad_norm": 2.1920058727264404, + "learning_rate": 3.8579554526472224e-05, + "loss": 0.8029, + "step": 3021 + }, + { + "epoch": 0.14715263068195653, + "grad_norm": 8.074204444885254, + "learning_rate": 3.8578386751466616e-05, + "loss": 0.9564, + "step": 3022 + }, + { + "epoch": 0.1472013244710637, + "grad_norm": 2.635866165161133, + "learning_rate": 3.857721851432194e-05, + "loss": 0.9299, + "step": 3023 + }, + { + "epoch": 0.14725001826017092, + "grad_norm": 1.7916994094848633, + "learning_rate": 3.8576049815067264e-05, + "loss": 0.9829, + "step": 3024 + }, + { + "epoch": 0.1472987120492781, + "grad_norm": 7.818848609924316, + "learning_rate": 3.857488065373166e-05, + "loss": 0.8561, + "step": 3025 + }, + { + "epoch": 0.14734740583838532, + "grad_norm": 0.08203983306884766, + "learning_rate": 3.85737110303442e-05, + "loss": 0.591, + "step": 3026 + }, + { + "epoch": 0.1473960996274925, + "grad_norm": 4.414762496948242, + "learning_rate": 3.8572540944933984e-05, + "loss": 0.9593, + "step": 3027 + }, + { + "epoch": 0.14744479341659972, + "grad_norm": 2.177523612976074, + "learning_rate": 3.857137039753012e-05, + "loss": 0.8798, + "step": 3028 + }, + { + "epoch": 0.1474934872057069, + "grad_norm": 2.8176417350769043, + "learning_rate": 3.857019938816172e-05, + "loss": 0.9139, + "step": 3029 + }, + { + "epoch": 0.14754218099481411, + "grad_norm": 0.08530370891094208, + "learning_rate": 3.856902791685793e-05, + "loss": 0.6431, + "step": 3030 + }, + { + "epoch": 0.1475908747839213, + "grad_norm": 2.219982862472534, + "learning_rate": 3.8567855983647866e-05, + "loss": 0.8424, + "step": 3031 + }, + { + "epoch": 0.1476395685730285, + "grad_norm": 2.1448662281036377, + "learning_rate": 3.856668358856069e-05, + "loss": 0.929, + "step": 3032 + }, + { + "epoch": 0.1476882623621357, + "grad_norm": 2.2347664833068848, + "learning_rate": 3.856551073162557e-05, + "loss": 0.9077, + "step": 3033 + }, + { + "epoch": 0.1477369561512429, + "grad_norm": 2.1477787494659424, + "learning_rate": 3.856433741287168e-05, + "loss": 0.8591, + "step": 3034 + }, + { + "epoch": 0.1477856499403501, + "grad_norm": 2.2006020545959473, + "learning_rate": 3.8563163632328204e-05, + "loss": 0.9528, + "step": 3035 + }, + { + "epoch": 0.1478343437294573, + "grad_norm": 1.9954867362976074, + "learning_rate": 3.856198939002434e-05, + "loss": 0.9639, + "step": 3036 + }, + { + "epoch": 0.14788303751856452, + "grad_norm": 2.3189826011657715, + "learning_rate": 3.85608146859893e-05, + "loss": 0.9357, + "step": 3037 + }, + { + "epoch": 0.1479317313076717, + "grad_norm": 2.441176414489746, + "learning_rate": 3.8559639520252294e-05, + "loss": 0.9294, + "step": 3038 + }, + { + "epoch": 0.14798042509677892, + "grad_norm": 3.3506205081939697, + "learning_rate": 3.8558463892842575e-05, + "loss": 0.9617, + "step": 3039 + }, + { + "epoch": 0.1480291188858861, + "grad_norm": 2.4027516841888428, + "learning_rate": 3.855728780378936e-05, + "loss": 0.8798, + "step": 3040 + }, + { + "epoch": 0.14807781267499331, + "grad_norm": 2.9945929050445557, + "learning_rate": 3.855611125312193e-05, + "loss": 0.9787, + "step": 3041 + }, + { + "epoch": 0.1481265064641005, + "grad_norm": 2.088174343109131, + "learning_rate": 3.855493424086953e-05, + "loss": 0.8556, + "step": 3042 + }, + { + "epoch": 0.1481752002532077, + "grad_norm": 2.7314629554748535, + "learning_rate": 3.855375676706146e-05, + "loss": 0.9355, + "step": 3043 + }, + { + "epoch": 0.1482238940423149, + "grad_norm": 2.4868457317352295, + "learning_rate": 3.8552578831726995e-05, + "loss": 0.826, + "step": 3044 + }, + { + "epoch": 0.1482725878314221, + "grad_norm": 0.08326037973165512, + "learning_rate": 3.8551400434895445e-05, + "loss": 0.6189, + "step": 3045 + }, + { + "epoch": 0.1483212816205293, + "grad_norm": 2.308243989944458, + "learning_rate": 3.855022157659611e-05, + "loss": 0.8431, + "step": 3046 + }, + { + "epoch": 0.1483699754096365, + "grad_norm": 1.7927861213684082, + "learning_rate": 3.8549042256858324e-05, + "loss": 0.8447, + "step": 3047 + }, + { + "epoch": 0.1484186691987437, + "grad_norm": 2.822445869445801, + "learning_rate": 3.854786247571142e-05, + "loss": 0.9085, + "step": 3048 + }, + { + "epoch": 0.1484673629878509, + "grad_norm": 2.9954473972320557, + "learning_rate": 3.854668223318475e-05, + "loss": 0.8881, + "step": 3049 + }, + { + "epoch": 0.1485160567769581, + "grad_norm": 0.08698277920484543, + "learning_rate": 3.8545501529307665e-05, + "loss": 0.6103, + "step": 3050 + }, + { + "epoch": 0.1485647505660653, + "grad_norm": 2.5498664379119873, + "learning_rate": 3.854432036410954e-05, + "loss": 0.8653, + "step": 3051 + }, + { + "epoch": 0.14861344435517249, + "grad_norm": 2.395994186401367, + "learning_rate": 3.8543138737619754e-05, + "loss": 0.812, + "step": 3052 + }, + { + "epoch": 0.1486621381442797, + "grad_norm": 2.0134294033050537, + "learning_rate": 3.8541956649867697e-05, + "loss": 0.8476, + "step": 3053 + }, + { + "epoch": 0.14871083193338688, + "grad_norm": 2.975060224533081, + "learning_rate": 3.854077410088279e-05, + "loss": 0.8805, + "step": 3054 + }, + { + "epoch": 0.1487595257224941, + "grad_norm": 2.222491502761841, + "learning_rate": 3.853959109069442e-05, + "loss": 0.8987, + "step": 3055 + }, + { + "epoch": 0.1488082195116013, + "grad_norm": 5.618241310119629, + "learning_rate": 3.853840761933205e-05, + "loss": 0.8754, + "step": 3056 + }, + { + "epoch": 0.1488569133007085, + "grad_norm": 2.0542478561401367, + "learning_rate": 3.8537223686825085e-05, + "loss": 0.9511, + "step": 3057 + }, + { + "epoch": 0.1489056070898157, + "grad_norm": 1.733279824256897, + "learning_rate": 3.8536039293203e-05, + "loss": 0.8614, + "step": 3058 + }, + { + "epoch": 0.1489543008789229, + "grad_norm": 1.6358494758605957, + "learning_rate": 3.853485443849524e-05, + "loss": 0.8989, + "step": 3059 + }, + { + "epoch": 0.1490029946680301, + "grad_norm": 1.8966971635818481, + "learning_rate": 3.853366912273129e-05, + "loss": 0.9287, + "step": 3060 + }, + { + "epoch": 0.1490516884571373, + "grad_norm": 2.029972791671753, + "learning_rate": 3.853248334594063e-05, + "loss": 0.9303, + "step": 3061 + }, + { + "epoch": 0.1491003822462445, + "grad_norm": 2.7715768814086914, + "learning_rate": 3.853129710815275e-05, + "loss": 0.8922, + "step": 3062 + }, + { + "epoch": 0.14914907603535169, + "grad_norm": 1.614369511604309, + "learning_rate": 3.853011040939718e-05, + "loss": 0.9999, + "step": 3063 + }, + { + "epoch": 0.1491977698244589, + "grad_norm": 1.960653305053711, + "learning_rate": 3.8528923249703415e-05, + "loss": 0.925, + "step": 3064 + }, + { + "epoch": 0.14924646361356608, + "grad_norm": 2.5072121620178223, + "learning_rate": 3.8527735629100995e-05, + "loss": 0.9453, + "step": 3065 + }, + { + "epoch": 0.1492951574026733, + "grad_norm": 2.435729742050171, + "learning_rate": 3.852654754761946e-05, + "loss": 0.9191, + "step": 3066 + }, + { + "epoch": 0.14934385119178048, + "grad_norm": 1.8601502180099487, + "learning_rate": 3.852535900528837e-05, + "loss": 0.9177, + "step": 3067 + }, + { + "epoch": 0.1493925449808877, + "grad_norm": 2.2559659481048584, + "learning_rate": 3.8524170002137285e-05, + "loss": 0.9312, + "step": 3068 + }, + { + "epoch": 0.14944123876999488, + "grad_norm": 2.2532167434692383, + "learning_rate": 3.852298053819578e-05, + "loss": 0.9464, + "step": 3069 + }, + { + "epoch": 0.1494899325591021, + "grad_norm": 2.6928234100341797, + "learning_rate": 3.852179061349345e-05, + "loss": 0.8981, + "step": 3070 + }, + { + "epoch": 0.14953862634820927, + "grad_norm": 2.0132508277893066, + "learning_rate": 3.8520600228059886e-05, + "loss": 0.9502, + "step": 3071 + }, + { + "epoch": 0.1495873201373165, + "grad_norm": 1.9984766244888306, + "learning_rate": 3.85194093819247e-05, + "loss": 0.8465, + "step": 3072 + }, + { + "epoch": 0.14963601392642367, + "grad_norm": 2.29756498336792, + "learning_rate": 3.851821807511752e-05, + "loss": 0.8958, + "step": 3073 + }, + { + "epoch": 0.14968470771553088, + "grad_norm": 2.387885332107544, + "learning_rate": 3.8517026307667976e-05, + "loss": 0.8643, + "step": 3074 + }, + { + "epoch": 0.1497334015046381, + "grad_norm": 2.2452778816223145, + "learning_rate": 3.8515834079605716e-05, + "loss": 0.9433, + "step": 3075 + }, + { + "epoch": 0.14978209529374528, + "grad_norm": 2.338212013244629, + "learning_rate": 3.8514641390960394e-05, + "loss": 0.8757, + "step": 3076 + }, + { + "epoch": 0.1498307890828525, + "grad_norm": 2.431015729904175, + "learning_rate": 3.8513448241761676e-05, + "loss": 0.9256, + "step": 3077 + }, + { + "epoch": 0.14987948287195968, + "grad_norm": 2.453559398651123, + "learning_rate": 3.851225463203925e-05, + "loss": 0.7837, + "step": 3078 + }, + { + "epoch": 0.1499281766610669, + "grad_norm": 2.7143325805664062, + "learning_rate": 3.85110605618228e-05, + "loss": 0.8592, + "step": 3079 + }, + { + "epoch": 0.14997687045017408, + "grad_norm": 2.1302335262298584, + "learning_rate": 3.850986603114203e-05, + "loss": 0.89, + "step": 3080 + }, + { + "epoch": 0.1500255642392813, + "grad_norm": 1.6562402248382568, + "learning_rate": 3.850867104002666e-05, + "loss": 0.845, + "step": 3081 + }, + { + "epoch": 0.15007425802838847, + "grad_norm": 2.447218179702759, + "learning_rate": 3.850747558850641e-05, + "loss": 0.8897, + "step": 3082 + }, + { + "epoch": 0.1501229518174957, + "grad_norm": 2.5899202823638916, + "learning_rate": 3.850627967661101e-05, + "loss": 0.9925, + "step": 3083 + }, + { + "epoch": 0.15017164560660287, + "grad_norm": 1.8737895488739014, + "learning_rate": 3.8505083304370216e-05, + "loss": 0.8628, + "step": 3084 + }, + { + "epoch": 0.15022033939571008, + "grad_norm": 2.1589279174804688, + "learning_rate": 3.8503886471813795e-05, + "loss": 0.8869, + "step": 3085 + }, + { + "epoch": 0.15026903318481727, + "grad_norm": 1.9726250171661377, + "learning_rate": 3.8502689178971505e-05, + "loss": 0.9641, + "step": 3086 + }, + { + "epoch": 0.15031772697392448, + "grad_norm": 1.9184199571609497, + "learning_rate": 3.8501491425873135e-05, + "loss": 0.9783, + "step": 3087 + }, + { + "epoch": 0.15036642076303167, + "grad_norm": 1.6321665048599243, + "learning_rate": 3.850029321254849e-05, + "loss": 0.9488, + "step": 3088 + }, + { + "epoch": 0.15041511455213888, + "grad_norm": 1.6509793996810913, + "learning_rate": 3.849909453902735e-05, + "loss": 0.8309, + "step": 3089 + }, + { + "epoch": 0.15046380834124606, + "grad_norm": 1.9921327829360962, + "learning_rate": 3.849789540533955e-05, + "loss": 0.8956, + "step": 3090 + }, + { + "epoch": 0.15051250213035328, + "grad_norm": 1.3839141130447388, + "learning_rate": 3.849669581151492e-05, + "loss": 0.9544, + "step": 3091 + }, + { + "epoch": 0.15056119591946046, + "grad_norm": 2.336092233657837, + "learning_rate": 3.849549575758329e-05, + "loss": 0.9416, + "step": 3092 + }, + { + "epoch": 0.15060988970856767, + "grad_norm": 1.978743076324463, + "learning_rate": 3.849429524357452e-05, + "loss": 0.8708, + "step": 3093 + }, + { + "epoch": 0.15065858349767486, + "grad_norm": 1.7744255065917969, + "learning_rate": 3.8493094269518467e-05, + "loss": 0.8218, + "step": 3094 + }, + { + "epoch": 0.15070727728678207, + "grad_norm": 1.767282485961914, + "learning_rate": 3.849189283544501e-05, + "loss": 0.8638, + "step": 3095 + }, + { + "epoch": 0.15075597107588928, + "grad_norm": 1.8080731630325317, + "learning_rate": 3.8490690941384035e-05, + "loss": 0.8005, + "step": 3096 + }, + { + "epoch": 0.15080466486499647, + "grad_norm": 3.4996142387390137, + "learning_rate": 3.8489488587365434e-05, + "loss": 0.9044, + "step": 3097 + }, + { + "epoch": 0.15085335865410368, + "grad_norm": 2.0813956260681152, + "learning_rate": 3.848828577341912e-05, + "loss": 0.8548, + "step": 3098 + }, + { + "epoch": 0.15090205244321087, + "grad_norm": 2.544788360595703, + "learning_rate": 3.848708249957501e-05, + "loss": 0.8408, + "step": 3099 + }, + { + "epoch": 0.15095074623231808, + "grad_norm": 1.9864442348480225, + "learning_rate": 3.848587876586304e-05, + "loss": 0.87, + "step": 3100 + }, + { + "epoch": 0.15099944002142526, + "grad_norm": 1.9551231861114502, + "learning_rate": 3.848467457231314e-05, + "loss": 0.885, + "step": 3101 + }, + { + "epoch": 0.15104813381053248, + "grad_norm": 5.318953990936279, + "learning_rate": 3.848346991895528e-05, + "loss": 0.9488, + "step": 3102 + }, + { + "epoch": 0.15109682759963966, + "grad_norm": 2.020350694656372, + "learning_rate": 3.848226480581943e-05, + "loss": 0.9375, + "step": 3103 + }, + { + "epoch": 0.15114552138874687, + "grad_norm": 2.088073968887329, + "learning_rate": 3.8481059232935546e-05, + "loss": 0.958, + "step": 3104 + }, + { + "epoch": 0.15119421517785406, + "grad_norm": 1.8713141679763794, + "learning_rate": 3.8479853200333635e-05, + "loss": 0.9326, + "step": 3105 + }, + { + "epoch": 0.15124290896696127, + "grad_norm": 2.118400812149048, + "learning_rate": 3.847864670804369e-05, + "loss": 0.9738, + "step": 3106 + }, + { + "epoch": 0.15129160275606846, + "grad_norm": 2.4056973457336426, + "learning_rate": 3.847743975609572e-05, + "loss": 0.8681, + "step": 3107 + }, + { + "epoch": 0.15134029654517567, + "grad_norm": 1.852080225944519, + "learning_rate": 3.847623234451975e-05, + "loss": 0.8997, + "step": 3108 + }, + { + "epoch": 0.15138899033428285, + "grad_norm": 2.702953815460205, + "learning_rate": 3.847502447334583e-05, + "loss": 0.8524, + "step": 3109 + }, + { + "epoch": 0.15143768412339007, + "grad_norm": 2.0478808879852295, + "learning_rate": 3.8473816142603975e-05, + "loss": 0.9108, + "step": 3110 + }, + { + "epoch": 0.15148637791249725, + "grad_norm": 2.128063201904297, + "learning_rate": 3.847260735232427e-05, + "loss": 0.938, + "step": 3111 + }, + { + "epoch": 0.15153507170160446, + "grad_norm": 2.1030454635620117, + "learning_rate": 3.847139810253676e-05, + "loss": 0.8405, + "step": 3112 + }, + { + "epoch": 0.15158376549071165, + "grad_norm": 1.7725507020950317, + "learning_rate": 3.847018839327155e-05, + "loss": 0.8783, + "step": 3113 + }, + { + "epoch": 0.15163245927981886, + "grad_norm": 2.199514389038086, + "learning_rate": 3.8468978224558714e-05, + "loss": 0.8618, + "step": 3114 + }, + { + "epoch": 0.15168115306892607, + "grad_norm": 1.6246001720428467, + "learning_rate": 3.8467767596428364e-05, + "loss": 0.9389, + "step": 3115 + }, + { + "epoch": 0.15172984685803326, + "grad_norm": 1.911083698272705, + "learning_rate": 3.84665565089106e-05, + "loss": 0.871, + "step": 3116 + }, + { + "epoch": 0.15177854064714047, + "grad_norm": 2.019747257232666, + "learning_rate": 3.846534496203558e-05, + "loss": 0.8939, + "step": 3117 + }, + { + "epoch": 0.15182723443624765, + "grad_norm": 1.5838791131973267, + "learning_rate": 3.846413295583341e-05, + "loss": 0.8888, + "step": 3118 + }, + { + "epoch": 0.15187592822535487, + "grad_norm": 1.7348116636276245, + "learning_rate": 3.846292049033425e-05, + "loss": 0.9439, + "step": 3119 + }, + { + "epoch": 0.15192462201446205, + "grad_norm": 1.7363474369049072, + "learning_rate": 3.846170756556825e-05, + "loss": 0.8465, + "step": 3120 + }, + { + "epoch": 0.15197331580356926, + "grad_norm": 4.236232757568359, + "learning_rate": 3.8460494181565604e-05, + "loss": 0.8493, + "step": 3121 + }, + { + "epoch": 0.15202200959267645, + "grad_norm": 2.7234315872192383, + "learning_rate": 3.845928033835648e-05, + "loss": 1.0196, + "step": 3122 + }, + { + "epoch": 0.15207070338178366, + "grad_norm": 1.63206946849823, + "learning_rate": 3.845806603597107e-05, + "loss": 0.9221, + "step": 3123 + }, + { + "epoch": 0.15211939717089085, + "grad_norm": 2.390916585922241, + "learning_rate": 3.845685127443958e-05, + "loss": 0.9042, + "step": 3124 + }, + { + "epoch": 0.15216809095999806, + "grad_norm": 1.9913020133972168, + "learning_rate": 3.845563605379224e-05, + "loss": 0.9007, + "step": 3125 + }, + { + "epoch": 0.15221678474910524, + "grad_norm": 2.867300510406494, + "learning_rate": 3.845442037405927e-05, + "loss": 0.8259, + "step": 3126 + }, + { + "epoch": 0.15226547853821246, + "grad_norm": 1.8410791158676147, + "learning_rate": 3.845320423527091e-05, + "loss": 0.884, + "step": 3127 + }, + { + "epoch": 0.15231417232731964, + "grad_norm": 2.165870189666748, + "learning_rate": 3.845198763745741e-05, + "loss": 1.0485, + "step": 3128 + }, + { + "epoch": 0.15236286611642685, + "grad_norm": 2.0112335681915283, + "learning_rate": 3.845077058064903e-05, + "loss": 0.9032, + "step": 3129 + }, + { + "epoch": 0.15241155990553404, + "grad_norm": 2.4826483726501465, + "learning_rate": 3.844955306487606e-05, + "loss": 0.8539, + "step": 3130 + }, + { + "epoch": 0.15246025369464125, + "grad_norm": 1.5990484952926636, + "learning_rate": 3.8448335090168776e-05, + "loss": 0.9552, + "step": 3131 + }, + { + "epoch": 0.15250894748374844, + "grad_norm": 2.3992624282836914, + "learning_rate": 3.844711665655747e-05, + "loss": 0.8543, + "step": 3132 + }, + { + "epoch": 0.15255764127285565, + "grad_norm": 2.151740550994873, + "learning_rate": 3.844589776407245e-05, + "loss": 0.8322, + "step": 3133 + }, + { + "epoch": 0.15260633506196283, + "grad_norm": 3.1136744022369385, + "learning_rate": 3.844467841274405e-05, + "loss": 0.834, + "step": 3134 + }, + { + "epoch": 0.15265502885107005, + "grad_norm": 1.9575284719467163, + "learning_rate": 3.844345860260259e-05, + "loss": 0.8494, + "step": 3135 + }, + { + "epoch": 0.15270372264017726, + "grad_norm": 1.5570485591888428, + "learning_rate": 3.844223833367841e-05, + "loss": 0.9281, + "step": 3136 + }, + { + "epoch": 0.15275241642928444, + "grad_norm": 1.580567479133606, + "learning_rate": 3.8441017606001875e-05, + "loss": 0.9547, + "step": 3137 + }, + { + "epoch": 0.15280111021839166, + "grad_norm": 2.6370909214019775, + "learning_rate": 3.8439796419603344e-05, + "loss": 0.8623, + "step": 3138 + }, + { + "epoch": 0.15284980400749884, + "grad_norm": 1.5544346570968628, + "learning_rate": 3.8438574774513195e-05, + "loss": 0.9618, + "step": 3139 + }, + { + "epoch": 0.15289849779660605, + "grad_norm": 1.7918628454208374, + "learning_rate": 3.843735267076183e-05, + "loss": 0.8038, + "step": 3140 + }, + { + "epoch": 0.15294719158571324, + "grad_norm": 4.117266654968262, + "learning_rate": 3.843613010837961e-05, + "loss": 0.8548, + "step": 3141 + }, + { + "epoch": 0.15299588537482045, + "grad_norm": 0.08451442420482635, + "learning_rate": 3.8434907087396994e-05, + "loss": 0.6218, + "step": 3142 + }, + { + "epoch": 0.15304457916392764, + "grad_norm": 1.2924871444702148, + "learning_rate": 3.843368360784438e-05, + "loss": 0.8781, + "step": 3143 + }, + { + "epoch": 0.15309327295303485, + "grad_norm": 2.137242078781128, + "learning_rate": 3.84324596697522e-05, + "loss": 0.848, + "step": 3144 + }, + { + "epoch": 0.15314196674214203, + "grad_norm": 0.08181753009557724, + "learning_rate": 3.843123527315091e-05, + "loss": 0.6066, + "step": 3145 + }, + { + "epoch": 0.15319066053124925, + "grad_norm": 2.0175235271453857, + "learning_rate": 3.843001041807096e-05, + "loss": 0.9085, + "step": 3146 + }, + { + "epoch": 0.15323935432035643, + "grad_norm": 2.248257875442505, + "learning_rate": 3.842878510454282e-05, + "loss": 0.98, + "step": 3147 + }, + { + "epoch": 0.15328804810946364, + "grad_norm": 2.5714809894561768, + "learning_rate": 3.8427559332596966e-05, + "loss": 0.9451, + "step": 3148 + }, + { + "epoch": 0.15333674189857083, + "grad_norm": 2.9369072914123535, + "learning_rate": 3.8426333102263906e-05, + "loss": 0.7805, + "step": 3149 + }, + { + "epoch": 0.15338543568767804, + "grad_norm": 2.3097784519195557, + "learning_rate": 3.842510641357412e-05, + "loss": 0.8314, + "step": 3150 + }, + { + "epoch": 0.15343412947678522, + "grad_norm": 1.871610164642334, + "learning_rate": 3.842387926655813e-05, + "loss": 0.7865, + "step": 3151 + }, + { + "epoch": 0.15348282326589244, + "grad_norm": 0.08764834702014923, + "learning_rate": 3.842265166124647e-05, + "loss": 0.6459, + "step": 3152 + }, + { + "epoch": 0.15353151705499962, + "grad_norm": 0.08064448833465576, + "learning_rate": 3.8421423597669674e-05, + "loss": 0.6343, + "step": 3153 + }, + { + "epoch": 0.15358021084410683, + "grad_norm": 1.617505431175232, + "learning_rate": 3.842019507585827e-05, + "loss": 0.7663, + "step": 3154 + }, + { + "epoch": 0.15362890463321405, + "grad_norm": 2.4286744594573975, + "learning_rate": 3.841896609584285e-05, + "loss": 0.8983, + "step": 3155 + }, + { + "epoch": 0.15367759842232123, + "grad_norm": 1.7920119762420654, + "learning_rate": 3.8417736657653966e-05, + "loss": 0.9348, + "step": 3156 + }, + { + "epoch": 0.15372629221142844, + "grad_norm": 0.08376876264810562, + "learning_rate": 3.84165067613222e-05, + "loss": 0.5994, + "step": 3157 + }, + { + "epoch": 0.15377498600053563, + "grad_norm": 1.6254721879959106, + "learning_rate": 3.841527640687815e-05, + "loss": 0.8856, + "step": 3158 + }, + { + "epoch": 0.15382367978964284, + "grad_norm": 2.721738815307617, + "learning_rate": 3.8414045594352426e-05, + "loss": 0.9039, + "step": 3159 + }, + { + "epoch": 0.15387237357875003, + "grad_norm": 1.5967910289764404, + "learning_rate": 3.841281432377563e-05, + "loss": 0.8677, + "step": 3160 + }, + { + "epoch": 0.15392106736785724, + "grad_norm": 1.5395190715789795, + "learning_rate": 3.84115825951784e-05, + "loss": 0.9638, + "step": 3161 + }, + { + "epoch": 0.15396976115696442, + "grad_norm": 1.6552520990371704, + "learning_rate": 3.841035040859138e-05, + "loss": 0.8657, + "step": 3162 + }, + { + "epoch": 0.15401845494607164, + "grad_norm": 1.955211877822876, + "learning_rate": 3.840911776404521e-05, + "loss": 0.9771, + "step": 3163 + }, + { + "epoch": 0.15406714873517882, + "grad_norm": 2.0820515155792236, + "learning_rate": 3.8407884661570555e-05, + "loss": 0.9803, + "step": 3164 + }, + { + "epoch": 0.15411584252428603, + "grad_norm": 1.9839696884155273, + "learning_rate": 3.8406651101198095e-05, + "loss": 0.8731, + "step": 3165 + }, + { + "epoch": 0.15416453631339322, + "grad_norm": 1.7197905778884888, + "learning_rate": 3.8405417082958505e-05, + "loss": 0.8571, + "step": 3166 + }, + { + "epoch": 0.15421323010250043, + "grad_norm": 1.7715256214141846, + "learning_rate": 3.8404182606882496e-05, + "loss": 0.8994, + "step": 3167 + }, + { + "epoch": 0.15426192389160762, + "grad_norm": 1.6554375886917114, + "learning_rate": 3.8402947673000765e-05, + "loss": 0.9671, + "step": 3168 + }, + { + "epoch": 0.15431061768071483, + "grad_norm": 1.707890510559082, + "learning_rate": 3.840171228134403e-05, + "loss": 0.875, + "step": 3169 + }, + { + "epoch": 0.154359311469822, + "grad_norm": 2.103640079498291, + "learning_rate": 3.840047643194302e-05, + "loss": 0.8623, + "step": 3170 + }, + { + "epoch": 0.15440800525892923, + "grad_norm": 2.6900389194488525, + "learning_rate": 3.8399240124828485e-05, + "loss": 0.8364, + "step": 3171 + }, + { + "epoch": 0.1544566990480364, + "grad_norm": 1.611356258392334, + "learning_rate": 3.8398003360031173e-05, + "loss": 0.8802, + "step": 3172 + }, + { + "epoch": 0.15450539283714362, + "grad_norm": 1.6523083448410034, + "learning_rate": 3.839676613758186e-05, + "loss": 0.9845, + "step": 3173 + }, + { + "epoch": 0.1545540866262508, + "grad_norm": 1.791200876235962, + "learning_rate": 3.83955284575113e-05, + "loss": 0.9029, + "step": 3174 + }, + { + "epoch": 0.15460278041535802, + "grad_norm": 1.644973635673523, + "learning_rate": 3.83942903198503e-05, + "loss": 0.9002, + "step": 3175 + }, + { + "epoch": 0.15465147420446523, + "grad_norm": 3.5844969749450684, + "learning_rate": 3.839305172462964e-05, + "loss": 0.9438, + "step": 3176 + }, + { + "epoch": 0.15470016799357242, + "grad_norm": 2.4020893573760986, + "learning_rate": 3.8391812671880154e-05, + "loss": 0.8193, + "step": 3177 + }, + { + "epoch": 0.15474886178267963, + "grad_norm": 6.044140815734863, + "learning_rate": 3.839057316163264e-05, + "loss": 0.8921, + "step": 3178 + }, + { + "epoch": 0.15479755557178682, + "grad_norm": 1.723496913909912, + "learning_rate": 3.838933319391795e-05, + "loss": 0.9305, + "step": 3179 + }, + { + "epoch": 0.15484624936089403, + "grad_norm": 2.2046027183532715, + "learning_rate": 3.838809276876692e-05, + "loss": 0.8807, + "step": 3180 + }, + { + "epoch": 0.1548949431500012, + "grad_norm": 1.9614605903625488, + "learning_rate": 3.83868518862104e-05, + "loss": 0.9445, + "step": 3181 + }, + { + "epoch": 0.15494363693910843, + "grad_norm": 1.8564164638519287, + "learning_rate": 3.838561054627927e-05, + "loss": 0.8144, + "step": 3182 + }, + { + "epoch": 0.1549923307282156, + "grad_norm": 2.1158390045166016, + "learning_rate": 3.838436874900439e-05, + "loss": 0.9159, + "step": 3183 + }, + { + "epoch": 0.15504102451732282, + "grad_norm": 2.27836012840271, + "learning_rate": 3.838312649441667e-05, + "loss": 0.867, + "step": 3184 + }, + { + "epoch": 0.15508971830643, + "grad_norm": 4.216747283935547, + "learning_rate": 3.838188378254701e-05, + "loss": 1.0406, + "step": 3185 + }, + { + "epoch": 0.15513841209553722, + "grad_norm": 1.5040488243103027, + "learning_rate": 3.83806406134263e-05, + "loss": 1.0014, + "step": 3186 + }, + { + "epoch": 0.1551871058846444, + "grad_norm": 2.6796271800994873, + "learning_rate": 3.8379396987085484e-05, + "loss": 0.9168, + "step": 3187 + }, + { + "epoch": 0.15523579967375162, + "grad_norm": 2.447343349456787, + "learning_rate": 3.837815290355549e-05, + "loss": 0.8914, + "step": 3188 + }, + { + "epoch": 0.1552844934628588, + "grad_norm": 1.4755889177322388, + "learning_rate": 3.837690836286727e-05, + "loss": 0.7691, + "step": 3189 + }, + { + "epoch": 0.15533318725196602, + "grad_norm": 1.8034977912902832, + "learning_rate": 3.837566336505178e-05, + "loss": 0.8334, + "step": 3190 + }, + { + "epoch": 0.1553818810410732, + "grad_norm": 1.4861273765563965, + "learning_rate": 3.837441791013999e-05, + "loss": 0.9026, + "step": 3191 + }, + { + "epoch": 0.1554305748301804, + "grad_norm": 2.3897249698638916, + "learning_rate": 3.837317199816287e-05, + "loss": 0.9338, + "step": 3192 + }, + { + "epoch": 0.1554792686192876, + "grad_norm": 1.8076069355010986, + "learning_rate": 3.8371925629151434e-05, + "loss": 0.9867, + "step": 3193 + }, + { + "epoch": 0.1555279624083948, + "grad_norm": 1.7257959842681885, + "learning_rate": 3.837067880313666e-05, + "loss": 0.9109, + "step": 3194 + }, + { + "epoch": 0.15557665619750202, + "grad_norm": 1.789156436920166, + "learning_rate": 3.836943152014959e-05, + "loss": 0.8492, + "step": 3195 + }, + { + "epoch": 0.1556253499866092, + "grad_norm": 1.8749624490737915, + "learning_rate": 3.836818378022122e-05, + "loss": 0.8408, + "step": 3196 + }, + { + "epoch": 0.15567404377571642, + "grad_norm": 1.6049293279647827, + "learning_rate": 3.8366935583382615e-05, + "loss": 0.8669, + "step": 3197 + }, + { + "epoch": 0.1557227375648236, + "grad_norm": 1.7717989683151245, + "learning_rate": 3.8365686929664805e-05, + "loss": 0.8703, + "step": 3198 + }, + { + "epoch": 0.15577143135393082, + "grad_norm": 1.5173003673553467, + "learning_rate": 3.836443781909887e-05, + "loss": 0.8005, + "step": 3199 + }, + { + "epoch": 0.155820125143038, + "grad_norm": 1.5300129652023315, + "learning_rate": 3.836318825171586e-05, + "loss": 0.9413, + "step": 3200 + }, + { + "epoch": 0.15586881893214521, + "grad_norm": 1.7076201438903809, + "learning_rate": 3.836193822754687e-05, + "loss": 0.859, + "step": 3201 + }, + { + "epoch": 0.1559175127212524, + "grad_norm": 1.4576188325881958, + "learning_rate": 3.836068774662299e-05, + "loss": 0.9663, + "step": 3202 + }, + { + "epoch": 0.1559662065103596, + "grad_norm": 0.09293831139802933, + "learning_rate": 3.835943680897534e-05, + "loss": 0.6236, + "step": 3203 + }, + { + "epoch": 0.1560149002994668, + "grad_norm": 1.7405471801757812, + "learning_rate": 3.8358185414635015e-05, + "loss": 0.8903, + "step": 3204 + }, + { + "epoch": 0.156063594088574, + "grad_norm": 1.8781485557556152, + "learning_rate": 3.8356933563633154e-05, + "loss": 0.829, + "step": 3205 + }, + { + "epoch": 0.1561122878776812, + "grad_norm": 1.5254980325698853, + "learning_rate": 3.83556812560009e-05, + "loss": 1.0051, + "step": 3206 + }, + { + "epoch": 0.1561609816667884, + "grad_norm": 2.0916759967803955, + "learning_rate": 3.8354428491769396e-05, + "loss": 0.8668, + "step": 3207 + }, + { + "epoch": 0.1562096754558956, + "grad_norm": 1.5629360675811768, + "learning_rate": 3.835317527096981e-05, + "loss": 1.0262, + "step": 3208 + }, + { + "epoch": 0.1562583692450028, + "grad_norm": 2.5960493087768555, + "learning_rate": 3.835192159363332e-05, + "loss": 0.8478, + "step": 3209 + }, + { + "epoch": 0.15630706303411, + "grad_norm": 0.07996024936437607, + "learning_rate": 3.835066745979111e-05, + "loss": 0.6453, + "step": 3210 + }, + { + "epoch": 0.1563557568232172, + "grad_norm": 1.6389110088348389, + "learning_rate": 3.834941286947437e-05, + "loss": 0.9424, + "step": 3211 + }, + { + "epoch": 0.15640445061232439, + "grad_norm": 1.3174759149551392, + "learning_rate": 3.834815782271431e-05, + "loss": 0.8638, + "step": 3212 + }, + { + "epoch": 0.1564531444014316, + "grad_norm": 2.2641758918762207, + "learning_rate": 3.834690231954215e-05, + "loss": 0.8387, + "step": 3213 + }, + { + "epoch": 0.15650183819053878, + "grad_norm": 1.4363384246826172, + "learning_rate": 3.834564635998913e-05, + "loss": 0.8533, + "step": 3214 + }, + { + "epoch": 0.156550531979646, + "grad_norm": 1.9631317853927612, + "learning_rate": 3.834438994408648e-05, + "loss": 0.8311, + "step": 3215 + }, + { + "epoch": 0.1565992257687532, + "grad_norm": 1.2019137144088745, + "learning_rate": 3.834313307186546e-05, + "loss": 0.9153, + "step": 3216 + }, + { + "epoch": 0.1566479195578604, + "grad_norm": 1.663886308670044, + "learning_rate": 3.834187574335732e-05, + "loss": 0.8461, + "step": 3217 + }, + { + "epoch": 0.1566966133469676, + "grad_norm": 1.6918143033981323, + "learning_rate": 3.834061795859336e-05, + "loss": 0.9517, + "step": 3218 + }, + { + "epoch": 0.1567453071360748, + "grad_norm": 1.839123249053955, + "learning_rate": 3.833935971760486e-05, + "loss": 0.929, + "step": 3219 + }, + { + "epoch": 0.156794000925182, + "grad_norm": 1.6459797620773315, + "learning_rate": 3.8338101020423104e-05, + "loss": 0.9345, + "step": 3220 + }, + { + "epoch": 0.1568426947142892, + "grad_norm": 2.2779595851898193, + "learning_rate": 3.833684186707942e-05, + "loss": 0.8094, + "step": 3221 + }, + { + "epoch": 0.1568913885033964, + "grad_norm": 2.5352039337158203, + "learning_rate": 3.833558225760512e-05, + "loss": 0.9209, + "step": 3222 + }, + { + "epoch": 0.15694008229250359, + "grad_norm": 1.7961094379425049, + "learning_rate": 3.833432219203154e-05, + "loss": 0.901, + "step": 3223 + }, + { + "epoch": 0.1569887760816108, + "grad_norm": 1.5232406854629517, + "learning_rate": 3.833306167039002e-05, + "loss": 0.8699, + "step": 3224 + }, + { + "epoch": 0.15703746987071798, + "grad_norm": 0.08517056703567505, + "learning_rate": 3.8331800692711924e-05, + "loss": 0.5771, + "step": 3225 + }, + { + "epoch": 0.1570861636598252, + "grad_norm": 1.9687680006027222, + "learning_rate": 3.833053925902862e-05, + "loss": 0.811, + "step": 3226 + }, + { + "epoch": 0.15713485744893238, + "grad_norm": 1.3859869241714478, + "learning_rate": 3.832927736937147e-05, + "loss": 0.8966, + "step": 3227 + }, + { + "epoch": 0.1571835512380396, + "grad_norm": 1.405287742614746, + "learning_rate": 3.832801502377188e-05, + "loss": 0.9566, + "step": 3228 + }, + { + "epoch": 0.15723224502714678, + "grad_norm": 2.0524206161499023, + "learning_rate": 3.832675222226124e-05, + "loss": 0.8532, + "step": 3229 + }, + { + "epoch": 0.157280938816254, + "grad_norm": 1.8056230545043945, + "learning_rate": 3.8325488964870976e-05, + "loss": 0.8751, + "step": 3230 + }, + { + "epoch": 0.15732963260536117, + "grad_norm": 1.4648962020874023, + "learning_rate": 3.832422525163249e-05, + "loss": 0.9083, + "step": 3231 + }, + { + "epoch": 0.1573783263944684, + "grad_norm": 2.833481550216675, + "learning_rate": 3.832296108257724e-05, + "loss": 0.9244, + "step": 3232 + }, + { + "epoch": 0.15742702018357557, + "grad_norm": 1.3762909173965454, + "learning_rate": 3.8321696457736664e-05, + "loss": 0.9587, + "step": 3233 + }, + { + "epoch": 0.15747571397268278, + "grad_norm": 1.4943053722381592, + "learning_rate": 3.832043137714222e-05, + "loss": 0.8505, + "step": 3234 + }, + { + "epoch": 0.15752440776179, + "grad_norm": 3.7618765830993652, + "learning_rate": 3.831916584082537e-05, + "loss": 0.8668, + "step": 3235 + }, + { + "epoch": 0.15757310155089718, + "grad_norm": 1.4041181802749634, + "learning_rate": 3.83178998488176e-05, + "loss": 0.8994, + "step": 3236 + }, + { + "epoch": 0.1576217953400044, + "grad_norm": 1.6697412729263306, + "learning_rate": 3.8316633401150394e-05, + "loss": 0.9536, + "step": 3237 + }, + { + "epoch": 0.15767048912911158, + "grad_norm": 2.244136333465576, + "learning_rate": 3.8315366497855266e-05, + "loss": 0.8613, + "step": 3238 + }, + { + "epoch": 0.1577191829182188, + "grad_norm": 1.6550461053848267, + "learning_rate": 3.8314099138963734e-05, + "loss": 0.8941, + "step": 3239 + }, + { + "epoch": 0.15776787670732598, + "grad_norm": 1.7286547422409058, + "learning_rate": 3.831283132450731e-05, + "loss": 0.9973, + "step": 3240 + }, + { + "epoch": 0.1578165704964332, + "grad_norm": 1.776915192604065, + "learning_rate": 3.831156305451754e-05, + "loss": 0.7898, + "step": 3241 + }, + { + "epoch": 0.15786526428554037, + "grad_norm": 3.068192720413208, + "learning_rate": 3.8310294329025966e-05, + "loss": 0.9088, + "step": 3242 + }, + { + "epoch": 0.1579139580746476, + "grad_norm": 1.979953646659851, + "learning_rate": 3.8309025148064155e-05, + "loss": 0.895, + "step": 3243 + }, + { + "epoch": 0.15796265186375477, + "grad_norm": 2.2229654788970947, + "learning_rate": 3.830775551166367e-05, + "loss": 0.8675, + "step": 3244 + }, + { + "epoch": 0.15801134565286198, + "grad_norm": 0.09201498329639435, + "learning_rate": 3.83064854198561e-05, + "loss": 0.6147, + "step": 3245 + }, + { + "epoch": 0.15806003944196917, + "grad_norm": 0.08545300364494324, + "learning_rate": 3.830521487267304e-05, + "loss": 0.6079, + "step": 3246 + }, + { + "epoch": 0.15810873323107638, + "grad_norm": 2.3974735736846924, + "learning_rate": 3.8303943870146086e-05, + "loss": 0.9076, + "step": 3247 + }, + { + "epoch": 0.15815742702018357, + "grad_norm": 1.5113539695739746, + "learning_rate": 3.830267241230686e-05, + "loss": 0.9211, + "step": 3248 + }, + { + "epoch": 0.15820612080929078, + "grad_norm": 4.442067623138428, + "learning_rate": 3.830140049918699e-05, + "loss": 0.9704, + "step": 3249 + }, + { + "epoch": 0.15825481459839796, + "grad_norm": 1.5672944784164429, + "learning_rate": 3.830012813081811e-05, + "loss": 0.9306, + "step": 3250 + }, + { + "epoch": 0.15830350838750518, + "grad_norm": 1.5763261318206787, + "learning_rate": 3.829885530723188e-05, + "loss": 0.8334, + "step": 3251 + }, + { + "epoch": 0.15835220217661236, + "grad_norm": 1.4432915449142456, + "learning_rate": 3.829758202845995e-05, + "loss": 0.9368, + "step": 3252 + }, + { + "epoch": 0.15840089596571957, + "grad_norm": 1.8372210264205933, + "learning_rate": 3.8296308294534006e-05, + "loss": 0.9246, + "step": 3253 + }, + { + "epoch": 0.15844958975482676, + "grad_norm": 1.3442751169204712, + "learning_rate": 3.8295034105485714e-05, + "loss": 0.8569, + "step": 3254 + }, + { + "epoch": 0.15849828354393397, + "grad_norm": 3.8954761028289795, + "learning_rate": 3.829375946134679e-05, + "loss": 0.8997, + "step": 3255 + }, + { + "epoch": 0.15854697733304118, + "grad_norm": 2.235121011734009, + "learning_rate": 3.8292484362148926e-05, + "loss": 0.8813, + "step": 3256 + }, + { + "epoch": 0.15859567112214837, + "grad_norm": 2.159118175506592, + "learning_rate": 3.829120880792385e-05, + "loss": 0.9175, + "step": 3257 + }, + { + "epoch": 0.15864436491125558, + "grad_norm": 3.769258499145508, + "learning_rate": 3.828993279870329e-05, + "loss": 0.9692, + "step": 3258 + }, + { + "epoch": 0.15869305870036277, + "grad_norm": 1.4948008060455322, + "learning_rate": 3.828865633451898e-05, + "loss": 0.8872, + "step": 3259 + }, + { + "epoch": 0.15874175248946998, + "grad_norm": 2.1334939002990723, + "learning_rate": 3.828737941540267e-05, + "loss": 0.8673, + "step": 3260 + }, + { + "epoch": 0.15879044627857716, + "grad_norm": 1.6945931911468506, + "learning_rate": 3.828610204138613e-05, + "loss": 0.8717, + "step": 3261 + }, + { + "epoch": 0.15883914006768438, + "grad_norm": 4.705633640289307, + "learning_rate": 3.828482421250114e-05, + "loss": 0.8216, + "step": 3262 + }, + { + "epoch": 0.15888783385679156, + "grad_norm": 1.9211463928222656, + "learning_rate": 3.8283545928779484e-05, + "loss": 0.937, + "step": 3263 + }, + { + "epoch": 0.15893652764589877, + "grad_norm": 1.9974459409713745, + "learning_rate": 3.828226719025295e-05, + "loss": 0.9438, + "step": 3264 + }, + { + "epoch": 0.15898522143500596, + "grad_norm": 1.745340347290039, + "learning_rate": 3.828098799695336e-05, + "loss": 0.9624, + "step": 3265 + }, + { + "epoch": 0.15903391522411317, + "grad_norm": 2.366518259048462, + "learning_rate": 3.827970834891252e-05, + "loss": 0.9362, + "step": 3266 + }, + { + "epoch": 0.15908260901322036, + "grad_norm": 1.568316102027893, + "learning_rate": 3.827842824616227e-05, + "loss": 0.9929, + "step": 3267 + }, + { + "epoch": 0.15913130280232757, + "grad_norm": 2.0845470428466797, + "learning_rate": 3.8277147688734445e-05, + "loss": 0.9857, + "step": 3268 + }, + { + "epoch": 0.15917999659143475, + "grad_norm": 2.0555224418640137, + "learning_rate": 3.827586667666092e-05, + "loss": 0.8642, + "step": 3269 + }, + { + "epoch": 0.15922869038054197, + "grad_norm": 1.5809895992279053, + "learning_rate": 3.827458520997353e-05, + "loss": 0.8843, + "step": 3270 + }, + { + "epoch": 0.15927738416964915, + "grad_norm": 0.08355922251939774, + "learning_rate": 3.827330328870417e-05, + "loss": 0.5889, + "step": 3271 + }, + { + "epoch": 0.15932607795875636, + "grad_norm": 1.7426117658615112, + "learning_rate": 3.827202091288473e-05, + "loss": 0.8227, + "step": 3272 + }, + { + "epoch": 0.15937477174786355, + "grad_norm": 1.6361709833145142, + "learning_rate": 3.82707380825471e-05, + "loss": 0.9026, + "step": 3273 + }, + { + "epoch": 0.15942346553697076, + "grad_norm": 2.372680425643921, + "learning_rate": 3.826945479772319e-05, + "loss": 0.9416, + "step": 3274 + }, + { + "epoch": 0.15947215932607797, + "grad_norm": 3.7160089015960693, + "learning_rate": 3.8268171058444936e-05, + "loss": 0.8801, + "step": 3275 + }, + { + "epoch": 0.15952085311518516, + "grad_norm": 2.2324934005737305, + "learning_rate": 3.826688686474426e-05, + "loss": 0.9901, + "step": 3276 + }, + { + "epoch": 0.15956954690429237, + "grad_norm": 1.4869364500045776, + "learning_rate": 3.82656022166531e-05, + "loss": 0.8937, + "step": 3277 + }, + { + "epoch": 0.15961824069339955, + "grad_norm": 1.9204672574996948, + "learning_rate": 3.8264317114203425e-05, + "loss": 0.9162, + "step": 3278 + }, + { + "epoch": 0.15966693448250677, + "grad_norm": 1.9907203912734985, + "learning_rate": 3.82630315574272e-05, + "loss": 0.8844, + "step": 3279 + }, + { + "epoch": 0.15971562827161395, + "grad_norm": 1.8330707550048828, + "learning_rate": 3.826174554635639e-05, + "loss": 0.9429, + "step": 3280 + }, + { + "epoch": 0.15976432206072116, + "grad_norm": 1.8808491230010986, + "learning_rate": 3.8260459081023e-05, + "loss": 0.8541, + "step": 3281 + }, + { + "epoch": 0.15981301584982835, + "grad_norm": 1.8036738634109497, + "learning_rate": 3.8259172161459026e-05, + "loss": 0.8426, + "step": 3282 + }, + { + "epoch": 0.15986170963893556, + "grad_norm": 2.1503491401672363, + "learning_rate": 3.8257884787696475e-05, + "loss": 0.8449, + "step": 3283 + }, + { + "epoch": 0.15991040342804275, + "grad_norm": 2.169985055923462, + "learning_rate": 3.825659695976738e-05, + "loss": 0.9272, + "step": 3284 + }, + { + "epoch": 0.15995909721714996, + "grad_norm": 1.6796475648880005, + "learning_rate": 3.825530867770377e-05, + "loss": 0.9316, + "step": 3285 + }, + { + "epoch": 0.16000779100625714, + "grad_norm": 1.8858428001403809, + "learning_rate": 3.825401994153769e-05, + "loss": 0.9185, + "step": 3286 + }, + { + "epoch": 0.16005648479536436, + "grad_norm": 1.6756991147994995, + "learning_rate": 3.8252730751301205e-05, + "loss": 0.9684, + "step": 3287 + }, + { + "epoch": 0.16010517858447154, + "grad_norm": 2.5861523151397705, + "learning_rate": 3.825144110702638e-05, + "loss": 0.8962, + "step": 3288 + }, + { + "epoch": 0.16015387237357875, + "grad_norm": 2.257742404937744, + "learning_rate": 3.8250151008745284e-05, + "loss": 0.7698, + "step": 3289 + }, + { + "epoch": 0.16020256616268594, + "grad_norm": 2.334388256072998, + "learning_rate": 3.8248860456490026e-05, + "loss": 0.8236, + "step": 3290 + }, + { + "epoch": 0.16025125995179315, + "grad_norm": 1.845710277557373, + "learning_rate": 3.8247569450292694e-05, + "loss": 0.9268, + "step": 3291 + }, + { + "epoch": 0.16029995374090034, + "grad_norm": 3.865511417388916, + "learning_rate": 3.824627799018541e-05, + "loss": 0.9558, + "step": 3292 + }, + { + "epoch": 0.16034864753000755, + "grad_norm": 2.5194592475891113, + "learning_rate": 3.8244986076200296e-05, + "loss": 0.8153, + "step": 3293 + }, + { + "epoch": 0.16039734131911473, + "grad_norm": 1.4590986967086792, + "learning_rate": 3.824369370836949e-05, + "loss": 0.8074, + "step": 3294 + }, + { + "epoch": 0.16044603510822195, + "grad_norm": 2.119534730911255, + "learning_rate": 3.824240088672515e-05, + "loss": 0.915, + "step": 3295 + }, + { + "epoch": 0.16049472889732916, + "grad_norm": 2.669180154800415, + "learning_rate": 3.824110761129941e-05, + "loss": 0.8776, + "step": 3296 + }, + { + "epoch": 0.16054342268643634, + "grad_norm": 1.604539394378662, + "learning_rate": 3.823981388212446e-05, + "loss": 0.8334, + "step": 3297 + }, + { + "epoch": 0.16059211647554356, + "grad_norm": 1.5690181255340576, + "learning_rate": 3.8238519699232476e-05, + "loss": 0.9574, + "step": 3298 + }, + { + "epoch": 0.16064081026465074, + "grad_norm": 1.4286491870880127, + "learning_rate": 3.823722506265565e-05, + "loss": 0.9187, + "step": 3299 + }, + { + "epoch": 0.16068950405375795, + "grad_norm": 1.9048707485198975, + "learning_rate": 3.82359299724262e-05, + "loss": 0.8674, + "step": 3300 + }, + { + "epoch": 0.16073819784286514, + "grad_norm": 1.665668249130249, + "learning_rate": 3.823463442857632e-05, + "loss": 0.8716, + "step": 3301 + }, + { + "epoch": 0.16078689163197235, + "grad_norm": 0.08319531381130219, + "learning_rate": 3.823333843113824e-05, + "loss": 0.5472, + "step": 3302 + }, + { + "epoch": 0.16083558542107954, + "grad_norm": 0.09345278888940811, + "learning_rate": 3.823204198014421e-05, + "loss": 0.6651, + "step": 3303 + }, + { + "epoch": 0.16088427921018675, + "grad_norm": 3.948566198348999, + "learning_rate": 3.823074507562647e-05, + "loss": 0.8611, + "step": 3304 + }, + { + "epoch": 0.16093297299929393, + "grad_norm": 2.9601902961730957, + "learning_rate": 3.822944771761729e-05, + "loss": 0.8714, + "step": 3305 + }, + { + "epoch": 0.16098166678840115, + "grad_norm": 2.3497862815856934, + "learning_rate": 3.822814990614894e-05, + "loss": 0.8634, + "step": 3306 + }, + { + "epoch": 0.16103036057750833, + "grad_norm": 2.6923506259918213, + "learning_rate": 3.822685164125369e-05, + "loss": 0.9038, + "step": 3307 + }, + { + "epoch": 0.16107905436661554, + "grad_norm": 2.262845039367676, + "learning_rate": 3.8225552922963844e-05, + "loss": 0.9246, + "step": 3308 + }, + { + "epoch": 0.16112774815572273, + "grad_norm": 1.4196271896362305, + "learning_rate": 3.822425375131171e-05, + "loss": 0.8694, + "step": 3309 + }, + { + "epoch": 0.16117644194482994, + "grad_norm": 1.8096843957901, + "learning_rate": 3.822295412632961e-05, + "loss": 0.9398, + "step": 3310 + }, + { + "epoch": 0.16122513573393712, + "grad_norm": 1.7728649377822876, + "learning_rate": 3.822165404804985e-05, + "loss": 0.9948, + "step": 3311 + }, + { + "epoch": 0.16127382952304434, + "grad_norm": 1.5031492710113525, + "learning_rate": 3.8220353516504796e-05, + "loss": 0.8554, + "step": 3312 + }, + { + "epoch": 0.16132252331215152, + "grad_norm": 2.2094647884368896, + "learning_rate": 3.821905253172678e-05, + "loss": 0.9219, + "step": 3313 + }, + { + "epoch": 0.16137121710125873, + "grad_norm": 2.26796555519104, + "learning_rate": 3.821775109374817e-05, + "loss": 0.8927, + "step": 3314 + }, + { + "epoch": 0.16141991089036595, + "grad_norm": 1.8653466701507568, + "learning_rate": 3.821644920260134e-05, + "loss": 0.8581, + "step": 3315 + }, + { + "epoch": 0.16146860467947313, + "grad_norm": 1.5817570686340332, + "learning_rate": 3.8215146858318684e-05, + "loss": 0.8, + "step": 3316 + }, + { + "epoch": 0.16151729846858034, + "grad_norm": 2.736990213394165, + "learning_rate": 3.821384406093258e-05, + "loss": 0.8259, + "step": 3317 + }, + { + "epoch": 0.16156599225768753, + "grad_norm": 1.714983582496643, + "learning_rate": 3.821254081047545e-05, + "loss": 0.9212, + "step": 3318 + }, + { + "epoch": 0.16161468604679474, + "grad_norm": 1.5841422080993652, + "learning_rate": 3.8211237106979704e-05, + "loss": 0.8534, + "step": 3319 + }, + { + "epoch": 0.16166337983590193, + "grad_norm": 1.8778258562088013, + "learning_rate": 3.820993295047778e-05, + "loss": 0.9106, + "step": 3320 + }, + { + "epoch": 0.16171207362500914, + "grad_norm": 1.9570541381835938, + "learning_rate": 3.8208628341002104e-05, + "loss": 0.8156, + "step": 3321 + }, + { + "epoch": 0.16176076741411632, + "grad_norm": 1.2808815240859985, + "learning_rate": 3.8207323278585144e-05, + "loss": 0.8824, + "step": 3322 + }, + { + "epoch": 0.16180946120322354, + "grad_norm": 2.1794748306274414, + "learning_rate": 3.8206017763259356e-05, + "loss": 0.8975, + "step": 3323 + }, + { + "epoch": 0.16185815499233072, + "grad_norm": 2.8468055725097656, + "learning_rate": 3.820471179505722e-05, + "loss": 0.8739, + "step": 3324 + }, + { + "epoch": 0.16190684878143793, + "grad_norm": 1.7538925409317017, + "learning_rate": 3.8203405374011214e-05, + "loss": 0.9711, + "step": 3325 + }, + { + "epoch": 0.16195554257054512, + "grad_norm": 2.0404021739959717, + "learning_rate": 3.820209850015384e-05, + "loss": 0.9499, + "step": 3326 + }, + { + "epoch": 0.16200423635965233, + "grad_norm": 1.6509203910827637, + "learning_rate": 3.8200791173517603e-05, + "loss": 0.8627, + "step": 3327 + }, + { + "epoch": 0.16205293014875952, + "grad_norm": 2.162757396697998, + "learning_rate": 3.819948339413503e-05, + "loss": 0.9127, + "step": 3328 + }, + { + "epoch": 0.16210162393786673, + "grad_norm": 1.7124602794647217, + "learning_rate": 3.8198175162038644e-05, + "loss": 0.9784, + "step": 3329 + }, + { + "epoch": 0.1621503177269739, + "grad_norm": 1.8207350969314575, + "learning_rate": 3.8196866477261e-05, + "loss": 0.9848, + "step": 3330 + }, + { + "epoch": 0.16219901151608113, + "grad_norm": 1.754645586013794, + "learning_rate": 3.819555733983464e-05, + "loss": 0.9359, + "step": 3331 + }, + { + "epoch": 0.1622477053051883, + "grad_norm": 2.402888059616089, + "learning_rate": 3.819424774979213e-05, + "loss": 0.9662, + "step": 3332 + }, + { + "epoch": 0.16229639909429552, + "grad_norm": 1.4891139268875122, + "learning_rate": 3.819293770716605e-05, + "loss": 0.8624, + "step": 3333 + }, + { + "epoch": 0.1623450928834027, + "grad_norm": 2.585334062576294, + "learning_rate": 3.8191627211988985e-05, + "loss": 0.9994, + "step": 3334 + }, + { + "epoch": 0.16239378667250992, + "grad_norm": 0.08232834935188293, + "learning_rate": 3.819031626429353e-05, + "loss": 0.5772, + "step": 3335 + }, + { + "epoch": 0.16244248046161713, + "grad_norm": 1.8828556537628174, + "learning_rate": 3.8189004864112303e-05, + "loss": 0.8909, + "step": 3336 + }, + { + "epoch": 0.16249117425072432, + "grad_norm": 1.5139325857162476, + "learning_rate": 3.818769301147792e-05, + "loss": 0.8594, + "step": 3337 + }, + { + "epoch": 0.16253986803983153, + "grad_norm": 2.446944236755371, + "learning_rate": 3.818638070642302e-05, + "loss": 0.9272, + "step": 3338 + }, + { + "epoch": 0.16258856182893872, + "grad_norm": 2.5409958362579346, + "learning_rate": 3.818506794898024e-05, + "loss": 0.8589, + "step": 3339 + }, + { + "epoch": 0.16263725561804593, + "grad_norm": 1.709255576133728, + "learning_rate": 3.818375473918224e-05, + "loss": 0.9095, + "step": 3340 + }, + { + "epoch": 0.1626859494071531, + "grad_norm": 1.5112909078598022, + "learning_rate": 3.8182441077061676e-05, + "loss": 0.8386, + "step": 3341 + }, + { + "epoch": 0.16273464319626033, + "grad_norm": 1.9068349599838257, + "learning_rate": 3.8181126962651234e-05, + "loss": 0.8202, + "step": 3342 + }, + { + "epoch": 0.1627833369853675, + "grad_norm": 1.7196135520935059, + "learning_rate": 3.8179812395983596e-05, + "loss": 0.8573, + "step": 3343 + }, + { + "epoch": 0.16283203077447472, + "grad_norm": 2.3843746185302734, + "learning_rate": 3.817849737709148e-05, + "loss": 0.9052, + "step": 3344 + }, + { + "epoch": 0.1628807245635819, + "grad_norm": 2.0715174674987793, + "learning_rate": 3.8177181906007575e-05, + "loss": 0.8766, + "step": 3345 + }, + { + "epoch": 0.16292941835268912, + "grad_norm": 1.737356185913086, + "learning_rate": 3.8175865982764616e-05, + "loss": 0.8455, + "step": 3346 + }, + { + "epoch": 0.1629781121417963, + "grad_norm": 1.8936116695404053, + "learning_rate": 3.817454960739533e-05, + "loss": 0.8131, + "step": 3347 + }, + { + "epoch": 0.16302680593090352, + "grad_norm": 1.8091868162155151, + "learning_rate": 3.817323277993247e-05, + "loss": 0.9904, + "step": 3348 + }, + { + "epoch": 0.1630754997200107, + "grad_norm": 1.667230248451233, + "learning_rate": 3.817191550040879e-05, + "loss": 0.9751, + "step": 3349 + }, + { + "epoch": 0.16312419350911792, + "grad_norm": 2.843592405319214, + "learning_rate": 3.8170597768857046e-05, + "loss": 0.8755, + "step": 3350 + }, + { + "epoch": 0.1631728872982251, + "grad_norm": 2.1971590518951416, + "learning_rate": 3.816927958531003e-05, + "loss": 0.8276, + "step": 3351 + }, + { + "epoch": 0.1632215810873323, + "grad_norm": 1.5428982973098755, + "learning_rate": 3.816796094980053e-05, + "loss": 0.947, + "step": 3352 + }, + { + "epoch": 0.1632702748764395, + "grad_norm": 6.833925247192383, + "learning_rate": 3.8166641862361344e-05, + "loss": 0.7664, + "step": 3353 + }, + { + "epoch": 0.1633189686655467, + "grad_norm": 0.09110007435083389, + "learning_rate": 3.816532232302528e-05, + "loss": 0.6165, + "step": 3354 + }, + { + "epoch": 0.16336766245465392, + "grad_norm": 2.602351188659668, + "learning_rate": 3.816400233182518e-05, + "loss": 0.9553, + "step": 3355 + }, + { + "epoch": 0.1634163562437611, + "grad_norm": 2.489453077316284, + "learning_rate": 3.8162681888793855e-05, + "loss": 0.9116, + "step": 3356 + }, + { + "epoch": 0.16346505003286832, + "grad_norm": 2.599431037902832, + "learning_rate": 3.816136099396416e-05, + "loss": 0.8998, + "step": 3357 + }, + { + "epoch": 0.1635137438219755, + "grad_norm": 1.6023929119110107, + "learning_rate": 3.8160039647368966e-05, + "loss": 0.896, + "step": 3358 + }, + { + "epoch": 0.16356243761108272, + "grad_norm": 1.5139453411102295, + "learning_rate": 3.815871784904112e-05, + "loss": 0.8915, + "step": 3359 + }, + { + "epoch": 0.1636111314001899, + "grad_norm": 1.5927537679672241, + "learning_rate": 3.815739559901351e-05, + "loss": 0.8801, + "step": 3360 + }, + { + "epoch": 0.16365982518929711, + "grad_norm": 1.663485050201416, + "learning_rate": 3.815607289731904e-05, + "loss": 0.8665, + "step": 3361 + }, + { + "epoch": 0.1637085189784043, + "grad_norm": 1.833832859992981, + "learning_rate": 3.815474974399059e-05, + "loss": 0.884, + "step": 3362 + }, + { + "epoch": 0.1637572127675115, + "grad_norm": 1.4967169761657715, + "learning_rate": 3.815342613906109e-05, + "loss": 0.8901, + "step": 3363 + }, + { + "epoch": 0.1638059065566187, + "grad_norm": 1.6687426567077637, + "learning_rate": 3.815210208256346e-05, + "loss": 0.8815, + "step": 3364 + }, + { + "epoch": 0.1638546003457259, + "grad_norm": 1.6322526931762695, + "learning_rate": 3.8150777574530635e-05, + "loss": 0.806, + "step": 3365 + }, + { + "epoch": 0.1639032941348331, + "grad_norm": 1.5059568881988525, + "learning_rate": 3.814945261499556e-05, + "loss": 0.8434, + "step": 3366 + }, + { + "epoch": 0.1639519879239403, + "grad_norm": 0.08356091380119324, + "learning_rate": 3.8148127203991204e-05, + "loss": 0.5773, + "step": 3367 + }, + { + "epoch": 0.1640006817130475, + "grad_norm": 1.5744034051895142, + "learning_rate": 3.814680134155052e-05, + "loss": 0.9523, + "step": 3368 + }, + { + "epoch": 0.1640493755021547, + "grad_norm": 1.7474322319030762, + "learning_rate": 3.81454750277065e-05, + "loss": 0.9183, + "step": 3369 + }, + { + "epoch": 0.1640980692912619, + "grad_norm": 2.2539563179016113, + "learning_rate": 3.814414826249214e-05, + "loss": 0.928, + "step": 3370 + }, + { + "epoch": 0.1641467630803691, + "grad_norm": 1.9647938013076782, + "learning_rate": 3.814282104594043e-05, + "loss": 0.935, + "step": 3371 + }, + { + "epoch": 0.1641954568694763, + "grad_norm": 2.7875614166259766, + "learning_rate": 3.81414933780844e-05, + "loss": 0.8302, + "step": 3372 + }, + { + "epoch": 0.1642441506585835, + "grad_norm": 1.8109581470489502, + "learning_rate": 3.814016525895706e-05, + "loss": 0.8578, + "step": 3373 + }, + { + "epoch": 0.16429284444769068, + "grad_norm": 1.802760124206543, + "learning_rate": 3.813883668859147e-05, + "loss": 0.9019, + "step": 3374 + }, + { + "epoch": 0.1643415382367979, + "grad_norm": 2.2185003757476807, + "learning_rate": 3.813750766702065e-05, + "loss": 0.9141, + "step": 3375 + }, + { + "epoch": 0.1643902320259051, + "grad_norm": 99.68492126464844, + "learning_rate": 3.8136178194277675e-05, + "loss": 0.8239, + "step": 3376 + }, + { + "epoch": 0.1644389258150123, + "grad_norm": 1.6780154705047607, + "learning_rate": 3.8134848270395615e-05, + "loss": 0.8547, + "step": 3377 + }, + { + "epoch": 0.1644876196041195, + "grad_norm": 1.6504201889038086, + "learning_rate": 3.813351789540755e-05, + "loss": 0.7923, + "step": 3378 + }, + { + "epoch": 0.1645363133932267, + "grad_norm": 1.6184109449386597, + "learning_rate": 3.813218706934658e-05, + "loss": 0.918, + "step": 3379 + }, + { + "epoch": 0.1645850071823339, + "grad_norm": 1.9367132186889648, + "learning_rate": 3.81308557922458e-05, + "loss": 0.8397, + "step": 3380 + }, + { + "epoch": 0.1646337009714411, + "grad_norm": 2.0521750450134277, + "learning_rate": 3.812952406413833e-05, + "loss": 0.8497, + "step": 3381 + }, + { + "epoch": 0.1646823947605483, + "grad_norm": 2.427762031555176, + "learning_rate": 3.812819188505729e-05, + "loss": 0.9231, + "step": 3382 + }, + { + "epoch": 0.16473108854965549, + "grad_norm": 1.5068572759628296, + "learning_rate": 3.8126859255035824e-05, + "loss": 0.7888, + "step": 3383 + }, + { + "epoch": 0.1647797823387627, + "grad_norm": 1.6660276651382446, + "learning_rate": 3.812552617410709e-05, + "loss": 1.0118, + "step": 3384 + }, + { + "epoch": 0.16482847612786988, + "grad_norm": 1.9696917533874512, + "learning_rate": 3.812419264230423e-05, + "loss": 0.9343, + "step": 3385 + }, + { + "epoch": 0.1648771699169771, + "grad_norm": 3.3822762966156006, + "learning_rate": 3.8122858659660436e-05, + "loss": 0.8775, + "step": 3386 + }, + { + "epoch": 0.16492586370608428, + "grad_norm": 0.08329527080059052, + "learning_rate": 3.812152422620887e-05, + "loss": 0.6003, + "step": 3387 + }, + { + "epoch": 0.1649745574951915, + "grad_norm": 2.2349276542663574, + "learning_rate": 3.812018934198274e-05, + "loss": 0.9367, + "step": 3388 + }, + { + "epoch": 0.16502325128429868, + "grad_norm": 2.1630775928497314, + "learning_rate": 3.811885400701525e-05, + "loss": 0.8941, + "step": 3389 + }, + { + "epoch": 0.1650719450734059, + "grad_norm": 2.007024049758911, + "learning_rate": 3.8117518221339614e-05, + "loss": 0.7761, + "step": 3390 + }, + { + "epoch": 0.16512063886251307, + "grad_norm": 2.6042070388793945, + "learning_rate": 3.8116181984989055e-05, + "loss": 0.9249, + "step": 3391 + }, + { + "epoch": 0.1651693326516203, + "grad_norm": 2.2107017040252686, + "learning_rate": 3.811484529799683e-05, + "loss": 0.9069, + "step": 3392 + }, + { + "epoch": 0.16521802644072747, + "grad_norm": 1.9659935235977173, + "learning_rate": 3.811350816039617e-05, + "loss": 0.9148, + "step": 3393 + }, + { + "epoch": 0.16526672022983468, + "grad_norm": 2.749785900115967, + "learning_rate": 3.811217057222034e-05, + "loss": 0.9325, + "step": 3394 + }, + { + "epoch": 0.1653154140189419, + "grad_norm": 4.5255279541015625, + "learning_rate": 3.811083253350262e-05, + "loss": 0.9278, + "step": 3395 + }, + { + "epoch": 0.16536410780804908, + "grad_norm": 2.2066922187805176, + "learning_rate": 3.8109494044276284e-05, + "loss": 0.7823, + "step": 3396 + }, + { + "epoch": 0.1654128015971563, + "grad_norm": 3.0092389583587646, + "learning_rate": 3.810815510457464e-05, + "loss": 0.8753, + "step": 3397 + }, + { + "epoch": 0.16546149538626348, + "grad_norm": 2.72347092628479, + "learning_rate": 3.810681571443098e-05, + "loss": 0.9029, + "step": 3398 + }, + { + "epoch": 0.1655101891753707, + "grad_norm": 2.4657392501831055, + "learning_rate": 3.810547587387863e-05, + "loss": 0.9695, + "step": 3399 + }, + { + "epoch": 0.16555888296447788, + "grad_norm": 3.270580768585205, + "learning_rate": 3.810413558295092e-05, + "loss": 0.9058, + "step": 3400 + }, + { + "epoch": 0.1656075767535851, + "grad_norm": 2.7388477325439453, + "learning_rate": 3.810279484168118e-05, + "loss": 0.8441, + "step": 3401 + }, + { + "epoch": 0.16565627054269227, + "grad_norm": 2.1918299198150635, + "learning_rate": 3.8101453650102773e-05, + "loss": 0.9594, + "step": 3402 + }, + { + "epoch": 0.1657049643317995, + "grad_norm": 2.7136261463165283, + "learning_rate": 3.8100112008249055e-05, + "loss": 0.882, + "step": 3403 + }, + { + "epoch": 0.16575365812090667, + "grad_norm": 3.9984920024871826, + "learning_rate": 3.80987699161534e-05, + "loss": 0.8474, + "step": 3404 + }, + { + "epoch": 0.16580235191001388, + "grad_norm": 1.898828148841858, + "learning_rate": 3.80974273738492e-05, + "loss": 0.8735, + "step": 3405 + }, + { + "epoch": 0.16585104569912107, + "grad_norm": 2.0523107051849365, + "learning_rate": 3.809608438136983e-05, + "loss": 0.8903, + "step": 3406 + }, + { + "epoch": 0.16589973948822828, + "grad_norm": 3.082563638687134, + "learning_rate": 3.809474093874872e-05, + "loss": 0.8986, + "step": 3407 + }, + { + "epoch": 0.16594843327733547, + "grad_norm": 2.3901565074920654, + "learning_rate": 3.8093397046019275e-05, + "loss": 0.8892, + "step": 3408 + }, + { + "epoch": 0.16599712706644268, + "grad_norm": 1.802704095840454, + "learning_rate": 3.809205270321494e-05, + "loss": 0.8369, + "step": 3409 + }, + { + "epoch": 0.16604582085554986, + "grad_norm": 2.436800241470337, + "learning_rate": 3.809070791036913e-05, + "loss": 0.9619, + "step": 3410 + }, + { + "epoch": 0.16609451464465708, + "grad_norm": 0.09016004204750061, + "learning_rate": 3.8089362667515316e-05, + "loss": 0.622, + "step": 3411 + }, + { + "epoch": 0.16614320843376426, + "grad_norm": 1.9335960149765015, + "learning_rate": 3.808801697468696e-05, + "loss": 0.8953, + "step": 3412 + }, + { + "epoch": 0.16619190222287147, + "grad_norm": 2.66751766204834, + "learning_rate": 3.808667083191753e-05, + "loss": 0.9675, + "step": 3413 + }, + { + "epoch": 0.16624059601197866, + "grad_norm": 4.415495872497559, + "learning_rate": 3.808532423924051e-05, + "loss": 0.9271, + "step": 3414 + }, + { + "epoch": 0.16628928980108587, + "grad_norm": 2.0835208892822266, + "learning_rate": 3.808397719668941e-05, + "loss": 0.8571, + "step": 3415 + }, + { + "epoch": 0.16633798359019308, + "grad_norm": 2.9967458248138428, + "learning_rate": 3.8082629704297724e-05, + "loss": 0.888, + "step": 3416 + }, + { + "epoch": 0.16638667737930027, + "grad_norm": 2.409465789794922, + "learning_rate": 3.8081281762098974e-05, + "loss": 0.9155, + "step": 3417 + }, + { + "epoch": 0.16643537116840748, + "grad_norm": 2.4255623817443848, + "learning_rate": 3.807993337012669e-05, + "loss": 0.9323, + "step": 3418 + }, + { + "epoch": 0.16648406495751467, + "grad_norm": 2.6051135063171387, + "learning_rate": 3.8078584528414424e-05, + "loss": 0.917, + "step": 3419 + }, + { + "epoch": 0.16653275874662188, + "grad_norm": 4.829205513000488, + "learning_rate": 3.807723523699571e-05, + "loss": 0.9063, + "step": 3420 + }, + { + "epoch": 0.16658145253572906, + "grad_norm": 3.443040370941162, + "learning_rate": 3.8075885495904124e-05, + "loss": 0.9226, + "step": 3421 + }, + { + "epoch": 0.16663014632483628, + "grad_norm": 2.7374980449676514, + "learning_rate": 3.807453530517324e-05, + "loss": 0.9023, + "step": 3422 + }, + { + "epoch": 0.16667884011394346, + "grad_norm": 1.921110987663269, + "learning_rate": 3.807318466483664e-05, + "loss": 0.8624, + "step": 3423 + }, + { + "epoch": 0.16672753390305067, + "grad_norm": 1.9431352615356445, + "learning_rate": 3.807183357492792e-05, + "loss": 0.9222, + "step": 3424 + }, + { + "epoch": 0.16677622769215786, + "grad_norm": 2.737046480178833, + "learning_rate": 3.80704820354807e-05, + "loss": 0.8139, + "step": 3425 + }, + { + "epoch": 0.16682492148126507, + "grad_norm": 2.6055774688720703, + "learning_rate": 3.8069130046528584e-05, + "loss": 0.9183, + "step": 3426 + }, + { + "epoch": 0.16687361527037226, + "grad_norm": 2.997952461242676, + "learning_rate": 3.806777760810522e-05, + "loss": 0.9077, + "step": 3427 + }, + { + "epoch": 0.16692230905947947, + "grad_norm": 2.579937219619751, + "learning_rate": 3.8066424720244235e-05, + "loss": 0.965, + "step": 3428 + }, + { + "epoch": 0.16697100284858665, + "grad_norm": 3.8350162506103516, + "learning_rate": 3.806507138297929e-05, + "loss": 0.8734, + "step": 3429 + }, + { + "epoch": 0.16701969663769387, + "grad_norm": 2.570761203765869, + "learning_rate": 3.806371759634404e-05, + "loss": 0.9034, + "step": 3430 + }, + { + "epoch": 0.16706839042680105, + "grad_norm": 0.09666602313518524, + "learning_rate": 3.8062363360372175e-05, + "loss": 0.6401, + "step": 3431 + }, + { + "epoch": 0.16711708421590826, + "grad_norm": 2.191190242767334, + "learning_rate": 3.806100867509737e-05, + "loss": 0.8939, + "step": 3432 + }, + { + "epoch": 0.16716577800501545, + "grad_norm": 5.733526229858398, + "learning_rate": 3.8059653540553336e-05, + "loss": 0.9775, + "step": 3433 + }, + { + "epoch": 0.16721447179412266, + "grad_norm": 2.1525063514709473, + "learning_rate": 3.805829795677376e-05, + "loss": 1.0016, + "step": 3434 + }, + { + "epoch": 0.16726316558322987, + "grad_norm": 2.385629892349243, + "learning_rate": 3.805694192379239e-05, + "loss": 0.8209, + "step": 3435 + }, + { + "epoch": 0.16731185937233706, + "grad_norm": 4.443943023681641, + "learning_rate": 3.805558544164293e-05, + "loss": 0.8838, + "step": 3436 + }, + { + "epoch": 0.16736055316144427, + "grad_norm": 3.0911293029785156, + "learning_rate": 3.805422851035915e-05, + "loss": 0.9609, + "step": 3437 + }, + { + "epoch": 0.16740924695055145, + "grad_norm": 4.814721584320068, + "learning_rate": 3.8052871129974783e-05, + "loss": 0.9918, + "step": 3438 + }, + { + "epoch": 0.16745794073965867, + "grad_norm": 2.7926673889160156, + "learning_rate": 3.805151330052359e-05, + "loss": 0.8721, + "step": 3439 + }, + { + "epoch": 0.16750663452876585, + "grad_norm": 2.8511033058166504, + "learning_rate": 3.805015502203938e-05, + "loss": 0.8549, + "step": 3440 + }, + { + "epoch": 0.16755532831787306, + "grad_norm": 3.264427423477173, + "learning_rate": 3.80487962945559e-05, + "loss": 0.8504, + "step": 3441 + }, + { + "epoch": 0.16760402210698025, + "grad_norm": 2.682961940765381, + "learning_rate": 3.804743711810697e-05, + "loss": 0.9059, + "step": 3442 + }, + { + "epoch": 0.16765271589608746, + "grad_norm": 2.681403398513794, + "learning_rate": 3.80460774927264e-05, + "loss": 0.9261, + "step": 3443 + }, + { + "epoch": 0.16770140968519465, + "grad_norm": 2.604113817214966, + "learning_rate": 3.804471741844801e-05, + "loss": 0.8804, + "step": 3444 + }, + { + "epoch": 0.16775010347430186, + "grad_norm": 20.755399703979492, + "learning_rate": 3.804335689530562e-05, + "loss": 0.9099, + "step": 3445 + }, + { + "epoch": 0.16779879726340904, + "grad_norm": 2.329228639602661, + "learning_rate": 3.804199592333309e-05, + "loss": 0.948, + "step": 3446 + }, + { + "epoch": 0.16784749105251626, + "grad_norm": 0.09016521275043488, + "learning_rate": 3.804063450256427e-05, + "loss": 0.7056, + "step": 3447 + }, + { + "epoch": 0.16789618484162344, + "grad_norm": 3.3477392196655273, + "learning_rate": 3.803927263303301e-05, + "loss": 0.7962, + "step": 3448 + }, + { + "epoch": 0.16794487863073065, + "grad_norm": 2.0941498279571533, + "learning_rate": 3.80379103147732e-05, + "loss": 0.966, + "step": 3449 + }, + { + "epoch": 0.16799357241983784, + "grad_norm": 1.9191575050354004, + "learning_rate": 3.803654754781873e-05, + "loss": 0.9362, + "step": 3450 + }, + { + "epoch": 0.16804226620894505, + "grad_norm": 2.311866044998169, + "learning_rate": 3.80351843322035e-05, + "loss": 0.8217, + "step": 3451 + }, + { + "epoch": 0.16809095999805224, + "grad_norm": 0.08456870913505554, + "learning_rate": 3.80338206679614e-05, + "loss": 0.6477, + "step": 3452 + }, + { + "epoch": 0.16813965378715945, + "grad_norm": 1.995431661605835, + "learning_rate": 3.8032456555126385e-05, + "loss": 0.9211, + "step": 3453 + }, + { + "epoch": 0.16818834757626663, + "grad_norm": 2.5426101684570312, + "learning_rate": 3.803109199373236e-05, + "loss": 0.9008, + "step": 3454 + }, + { + "epoch": 0.16823704136537385, + "grad_norm": 1.5546940565109253, + "learning_rate": 3.8029726983813275e-05, + "loss": 0.9051, + "step": 3455 + }, + { + "epoch": 0.16828573515448106, + "grad_norm": 2.972602605819702, + "learning_rate": 3.8028361525403095e-05, + "loss": 0.8947, + "step": 3456 + }, + { + "epoch": 0.16833442894358824, + "grad_norm": 2.6047332286834717, + "learning_rate": 3.802699561853578e-05, + "loss": 0.8902, + "step": 3457 + }, + { + "epoch": 0.16838312273269546, + "grad_norm": 2.6277480125427246, + "learning_rate": 3.8025629263245294e-05, + "loss": 0.8147, + "step": 3458 + }, + { + "epoch": 0.16843181652180264, + "grad_norm": 2.229006290435791, + "learning_rate": 3.8024262459565644e-05, + "loss": 0.9897, + "step": 3459 + }, + { + "epoch": 0.16848051031090985, + "grad_norm": 1.6640535593032837, + "learning_rate": 3.802289520753082e-05, + "loss": 0.8319, + "step": 3460 + }, + { + "epoch": 0.16852920410001704, + "grad_norm": 2.0480358600616455, + "learning_rate": 3.802152750717484e-05, + "loss": 0.9594, + "step": 3461 + }, + { + "epoch": 0.16857789788912425, + "grad_norm": 1.854600191116333, + "learning_rate": 3.802015935853171e-05, + "loss": 0.8178, + "step": 3462 + }, + { + "epoch": 0.16862659167823144, + "grad_norm": 2.1832199096679688, + "learning_rate": 3.801879076163548e-05, + "loss": 0.8473, + "step": 3463 + }, + { + "epoch": 0.16867528546733865, + "grad_norm": 2.071291446685791, + "learning_rate": 3.801742171652018e-05, + "loss": 0.9228, + "step": 3464 + }, + { + "epoch": 0.16872397925644583, + "grad_norm": 2.27302885055542, + "learning_rate": 3.801605222321988e-05, + "loss": 0.9378, + "step": 3465 + }, + { + "epoch": 0.16877267304555305, + "grad_norm": 2.0358612537384033, + "learning_rate": 3.801468228176863e-05, + "loss": 0.846, + "step": 3466 + }, + { + "epoch": 0.16882136683466023, + "grad_norm": 1.7577110528945923, + "learning_rate": 3.8013311892200526e-05, + "loss": 0.8464, + "step": 3467 + }, + { + "epoch": 0.16887006062376744, + "grad_norm": 2.333216428756714, + "learning_rate": 3.801194105454964e-05, + "loss": 0.8846, + "step": 3468 + }, + { + "epoch": 0.16891875441287463, + "grad_norm": 2.203336000442505, + "learning_rate": 3.801056976885008e-05, + "loss": 1.0264, + "step": 3469 + }, + { + "epoch": 0.16896744820198184, + "grad_norm": 1.5475820302963257, + "learning_rate": 3.800919803513595e-05, + "loss": 0.8502, + "step": 3470 + }, + { + "epoch": 0.16901614199108903, + "grad_norm": 2.336240768432617, + "learning_rate": 3.800782585344138e-05, + "loss": 0.9142, + "step": 3471 + }, + { + "epoch": 0.16906483578019624, + "grad_norm": 2.037229061126709, + "learning_rate": 3.8006453223800505e-05, + "loss": 0.904, + "step": 3472 + }, + { + "epoch": 0.16911352956930342, + "grad_norm": 3.4457507133483887, + "learning_rate": 3.800508014624746e-05, + "loss": 0.9301, + "step": 3473 + }, + { + "epoch": 0.16916222335841063, + "grad_norm": 1.5098267793655396, + "learning_rate": 3.800370662081641e-05, + "loss": 0.8578, + "step": 3474 + }, + { + "epoch": 0.16921091714751785, + "grad_norm": 2.6917026042938232, + "learning_rate": 3.8002332647541504e-05, + "loss": 0.7991, + "step": 3475 + }, + { + "epoch": 0.16925961093662503, + "grad_norm": 2.313234329223633, + "learning_rate": 3.800095822645694e-05, + "loss": 0.9146, + "step": 3476 + }, + { + "epoch": 0.16930830472573224, + "grad_norm": 2.185182809829712, + "learning_rate": 3.79995833575969e-05, + "loss": 0.9135, + "step": 3477 + }, + { + "epoch": 0.16935699851483943, + "grad_norm": 1.8336641788482666, + "learning_rate": 3.799820804099558e-05, + "loss": 0.9038, + "step": 3478 + }, + { + "epoch": 0.16940569230394664, + "grad_norm": 2.747786521911621, + "learning_rate": 3.79968322766872e-05, + "loss": 0.9146, + "step": 3479 + }, + { + "epoch": 0.16945438609305383, + "grad_norm": 4.998786926269531, + "learning_rate": 3.7995456064705964e-05, + "loss": 0.8525, + "step": 3480 + }, + { + "epoch": 0.16950307988216104, + "grad_norm": 0.08292108029127121, + "learning_rate": 3.7994079405086126e-05, + "loss": 0.6433, + "step": 3481 + }, + { + "epoch": 0.16955177367126822, + "grad_norm": 1.5748846530914307, + "learning_rate": 3.799270229786192e-05, + "loss": 1.0582, + "step": 3482 + }, + { + "epoch": 0.16960046746037544, + "grad_norm": 1.890930414199829, + "learning_rate": 3.7991324743067604e-05, + "loss": 0.8819, + "step": 3483 + }, + { + "epoch": 0.16964916124948262, + "grad_norm": 1.4829840660095215, + "learning_rate": 3.798994674073745e-05, + "loss": 0.8462, + "step": 3484 + }, + { + "epoch": 0.16969785503858983, + "grad_norm": 1.8486214876174927, + "learning_rate": 3.7988568290905724e-05, + "loss": 0.8262, + "step": 3485 + }, + { + "epoch": 0.16974654882769702, + "grad_norm": 0.08179692178964615, + "learning_rate": 3.798718939360672e-05, + "loss": 0.6402, + "step": 3486 + }, + { + "epoch": 0.16979524261680423, + "grad_norm": 1.7924606800079346, + "learning_rate": 3.798581004887474e-05, + "loss": 0.8762, + "step": 3487 + }, + { + "epoch": 0.16984393640591142, + "grad_norm": 1.756725549697876, + "learning_rate": 3.79844302567441e-05, + "loss": 0.8737, + "step": 3488 + }, + { + "epoch": 0.16989263019501863, + "grad_norm": 3.930821418762207, + "learning_rate": 3.7983050017249123e-05, + "loss": 0.9305, + "step": 3489 + }, + { + "epoch": 0.1699413239841258, + "grad_norm": 2.492892265319824, + "learning_rate": 3.798166933042413e-05, + "loss": 0.8457, + "step": 3490 + }, + { + "epoch": 0.16999001777323303, + "grad_norm": 4.238207817077637, + "learning_rate": 3.798028819630347e-05, + "loss": 0.8601, + "step": 3491 + }, + { + "epoch": 0.1700387115623402, + "grad_norm": 2.0118842124938965, + "learning_rate": 3.7978906614921504e-05, + "loss": 0.884, + "step": 3492 + }, + { + "epoch": 0.17008740535144742, + "grad_norm": 1.5588157176971436, + "learning_rate": 3.79775245863126e-05, + "loss": 0.9155, + "step": 3493 + }, + { + "epoch": 0.1701360991405546, + "grad_norm": 2.6871984004974365, + "learning_rate": 3.7976142110511134e-05, + "loss": 0.8467, + "step": 3494 + }, + { + "epoch": 0.17018479292966182, + "grad_norm": 2.0642523765563965, + "learning_rate": 3.797475918755148e-05, + "loss": 0.9058, + "step": 3495 + }, + { + "epoch": 0.17023348671876903, + "grad_norm": 1.890182614326477, + "learning_rate": 3.7973375817468065e-05, + "loss": 0.9265, + "step": 3496 + }, + { + "epoch": 0.17028218050787622, + "grad_norm": 2.061173915863037, + "learning_rate": 3.797199200029529e-05, + "loss": 0.977, + "step": 3497 + }, + { + "epoch": 0.17033087429698343, + "grad_norm": 2.771052598953247, + "learning_rate": 3.797060773606757e-05, + "loss": 0.7688, + "step": 3498 + }, + { + "epoch": 0.17037956808609062, + "grad_norm": 3.7841622829437256, + "learning_rate": 3.7969223024819345e-05, + "loss": 0.9068, + "step": 3499 + }, + { + "epoch": 0.17042826187519783, + "grad_norm": 2.214447498321533, + "learning_rate": 3.796783786658506e-05, + "loss": 0.8488, + "step": 3500 + }, + { + "epoch": 0.170476955664305, + "grad_norm": 2.5737953186035156, + "learning_rate": 3.796645226139917e-05, + "loss": 0.7874, + "step": 3501 + }, + { + "epoch": 0.17052564945341223, + "grad_norm": 2.2653651237487793, + "learning_rate": 3.7965066209296136e-05, + "loss": 0.9096, + "step": 3502 + }, + { + "epoch": 0.1705743432425194, + "grad_norm": 1.5478971004486084, + "learning_rate": 3.7963679710310445e-05, + "loss": 0.92, + "step": 3503 + }, + { + "epoch": 0.17062303703162662, + "grad_norm": 0.09240386635065079, + "learning_rate": 3.796229276447659e-05, + "loss": 0.6389, + "step": 3504 + }, + { + "epoch": 0.1706717308207338, + "grad_norm": 3.5047078132629395, + "learning_rate": 3.796090537182906e-05, + "loss": 0.9272, + "step": 3505 + }, + { + "epoch": 0.17072042460984102, + "grad_norm": 2.215667247772217, + "learning_rate": 3.795951753240237e-05, + "loss": 0.9351, + "step": 3506 + }, + { + "epoch": 0.1707691183989482, + "grad_norm": 1.7440156936645508, + "learning_rate": 3.7958129246231046e-05, + "loss": 0.8884, + "step": 3507 + }, + { + "epoch": 0.17081781218805542, + "grad_norm": 2.9811365604400635, + "learning_rate": 3.7956740513349615e-05, + "loss": 0.9073, + "step": 3508 + }, + { + "epoch": 0.1708665059771626, + "grad_norm": 1.962432861328125, + "learning_rate": 3.795535133379263e-05, + "loss": 0.9149, + "step": 3509 + }, + { + "epoch": 0.17091519976626982, + "grad_norm": 2.734747886657715, + "learning_rate": 3.7953961707594644e-05, + "loss": 0.9208, + "step": 3510 + }, + { + "epoch": 0.170963893555377, + "grad_norm": 2.436309576034546, + "learning_rate": 3.795257163479022e-05, + "loss": 0.822, + "step": 3511 + }, + { + "epoch": 0.1710125873444842, + "grad_norm": 2.4433186054229736, + "learning_rate": 3.795118111541395e-05, + "loss": 0.8972, + "step": 3512 + }, + { + "epoch": 0.1710612811335914, + "grad_norm": 2.046776533126831, + "learning_rate": 3.79497901495004e-05, + "loss": 0.8835, + "step": 3513 + }, + { + "epoch": 0.1711099749226986, + "grad_norm": 2.1334903240203857, + "learning_rate": 3.794839873708419e-05, + "loss": 0.9022, + "step": 3514 + }, + { + "epoch": 0.17115866871180582, + "grad_norm": 1.8056224584579468, + "learning_rate": 3.794700687819993e-05, + "loss": 0.9382, + "step": 3515 + }, + { + "epoch": 0.171207362500913, + "grad_norm": 2.0634360313415527, + "learning_rate": 3.7945614572882226e-05, + "loss": 0.9025, + "step": 3516 + }, + { + "epoch": 0.17125605629002022, + "grad_norm": 2.465346097946167, + "learning_rate": 3.7944221821165726e-05, + "loss": 0.8692, + "step": 3517 + }, + { + "epoch": 0.1713047500791274, + "grad_norm": 2.030376672744751, + "learning_rate": 3.794282862308508e-05, + "loss": 0.9428, + "step": 3518 + }, + { + "epoch": 0.17135344386823462, + "grad_norm": 2.28037428855896, + "learning_rate": 3.7941434978674934e-05, + "loss": 0.9809, + "step": 3519 + }, + { + "epoch": 0.1714021376573418, + "grad_norm": 1.8605048656463623, + "learning_rate": 3.794004088796995e-05, + "loss": 0.8657, + "step": 3520 + }, + { + "epoch": 0.17145083144644901, + "grad_norm": 2.071256399154663, + "learning_rate": 3.793864635100482e-05, + "loss": 0.853, + "step": 3521 + }, + { + "epoch": 0.1714995252355562, + "grad_norm": 2.273263692855835, + "learning_rate": 3.7937251367814224e-05, + "loss": 0.9558, + "step": 3522 + }, + { + "epoch": 0.1715482190246634, + "grad_norm": 2.028886079788208, + "learning_rate": 3.7935855938432865e-05, + "loss": 1.0001, + "step": 3523 + }, + { + "epoch": 0.1715969128137706, + "grad_norm": 2.498122215270996, + "learning_rate": 3.7934460062895454e-05, + "loss": 0.8917, + "step": 3524 + }, + { + "epoch": 0.1716456066028778, + "grad_norm": 2.4198644161224365, + "learning_rate": 3.793306374123671e-05, + "loss": 0.8262, + "step": 3525 + }, + { + "epoch": 0.171694300391985, + "grad_norm": 2.6426191329956055, + "learning_rate": 3.793166697349137e-05, + "loss": 0.9219, + "step": 3526 + }, + { + "epoch": 0.1717429941810922, + "grad_norm": 2.607760429382324, + "learning_rate": 3.793026975969419e-05, + "loss": 0.9084, + "step": 3527 + }, + { + "epoch": 0.1717916879701994, + "grad_norm": 2.2988839149475098, + "learning_rate": 3.79288720998799e-05, + "loss": 0.9436, + "step": 3528 + }, + { + "epoch": 0.1718403817593066, + "grad_norm": 2.2405917644500732, + "learning_rate": 3.792747399408329e-05, + "loss": 0.8373, + "step": 3529 + }, + { + "epoch": 0.1718890755484138, + "grad_norm": 1.6241463422775269, + "learning_rate": 3.792607544233914e-05, + "loss": 0.8042, + "step": 3530 + }, + { + "epoch": 0.171937769337521, + "grad_norm": 2.7389235496520996, + "learning_rate": 3.7924676444682215e-05, + "loss": 0.8331, + "step": 3531 + }, + { + "epoch": 0.1719864631266282, + "grad_norm": 1.7148513793945312, + "learning_rate": 3.792327700114733e-05, + "loss": 0.8651, + "step": 3532 + }, + { + "epoch": 0.1720351569157354, + "grad_norm": 1.4122604131698608, + "learning_rate": 3.79218771117693e-05, + "loss": 0.9206, + "step": 3533 + }, + { + "epoch": 0.17208385070484258, + "grad_norm": 2.1819725036621094, + "learning_rate": 3.792047677658294e-05, + "loss": 0.8865, + "step": 3534 + }, + { + "epoch": 0.1721325444939498, + "grad_norm": 2.4450836181640625, + "learning_rate": 3.7919075995623085e-05, + "loss": 1.024, + "step": 3535 + }, + { + "epoch": 0.172181238283057, + "grad_norm": 2.3717193603515625, + "learning_rate": 3.791767476892458e-05, + "loss": 0.8877, + "step": 3536 + }, + { + "epoch": 0.1722299320721642, + "grad_norm": 3.8041341304779053, + "learning_rate": 3.7916273096522285e-05, + "loss": 0.8595, + "step": 3537 + }, + { + "epoch": 0.1722786258612714, + "grad_norm": 2.5782763957977295, + "learning_rate": 3.791487097845107e-05, + "loss": 0.9297, + "step": 3538 + }, + { + "epoch": 0.1723273196503786, + "grad_norm": 3.4367547035217285, + "learning_rate": 3.7913468414745794e-05, + "loss": 0.973, + "step": 3539 + }, + { + "epoch": 0.1723760134394858, + "grad_norm": 2.1741790771484375, + "learning_rate": 3.791206540544137e-05, + "loss": 0.8636, + "step": 3540 + }, + { + "epoch": 0.172424707228593, + "grad_norm": 3.4595866203308105, + "learning_rate": 3.791066195057267e-05, + "loss": 0.9373, + "step": 3541 + }, + { + "epoch": 0.1724734010177002, + "grad_norm": 2.2905304431915283, + "learning_rate": 3.790925805017463e-05, + "loss": 0.9162, + "step": 3542 + }, + { + "epoch": 0.17252209480680739, + "grad_norm": 2.2062244415283203, + "learning_rate": 3.790785370428217e-05, + "loss": 0.9644, + "step": 3543 + }, + { + "epoch": 0.1725707885959146, + "grad_norm": 3.9650909900665283, + "learning_rate": 3.7906448912930204e-05, + "loss": 0.9502, + "step": 3544 + }, + { + "epoch": 0.17261948238502178, + "grad_norm": 2.7993979454040527, + "learning_rate": 3.790504367615369e-05, + "loss": 0.8638, + "step": 3545 + }, + { + "epoch": 0.172668176174129, + "grad_norm": 2.6106772422790527, + "learning_rate": 3.790363799398759e-05, + "loss": 0.9105, + "step": 3546 + }, + { + "epoch": 0.17271686996323618, + "grad_norm": 1.611109733581543, + "learning_rate": 3.7902231866466854e-05, + "loss": 0.8111, + "step": 3547 + }, + { + "epoch": 0.1727655637523434, + "grad_norm": 2.567850351333618, + "learning_rate": 3.7900825293626475e-05, + "loss": 0.8681, + "step": 3548 + }, + { + "epoch": 0.17281425754145058, + "grad_norm": 1.9104212522506714, + "learning_rate": 3.789941827550143e-05, + "loss": 0.8682, + "step": 3549 + }, + { + "epoch": 0.1728629513305578, + "grad_norm": 1.8910645246505737, + "learning_rate": 3.789801081212672e-05, + "loss": 0.7543, + "step": 3550 + }, + { + "epoch": 0.17291164511966498, + "grad_norm": 3.365590810775757, + "learning_rate": 3.789660290353737e-05, + "loss": 0.8261, + "step": 3551 + }, + { + "epoch": 0.1729603389087722, + "grad_norm": 1.8534090518951416, + "learning_rate": 3.789519454976839e-05, + "loss": 0.8934, + "step": 3552 + }, + { + "epoch": 0.17300903269787937, + "grad_norm": 2.3754682540893555, + "learning_rate": 3.789378575085481e-05, + "loss": 0.8217, + "step": 3553 + }, + { + "epoch": 0.17305772648698659, + "grad_norm": 1.6143736839294434, + "learning_rate": 3.789237650683167e-05, + "loss": 0.9578, + "step": 3554 + }, + { + "epoch": 0.1731064202760938, + "grad_norm": 2.3633153438568115, + "learning_rate": 3.789096681773404e-05, + "loss": 0.8416, + "step": 3555 + }, + { + "epoch": 0.17315511406520098, + "grad_norm": 1.8202956914901733, + "learning_rate": 3.788955668359698e-05, + "loss": 0.9059, + "step": 3556 + }, + { + "epoch": 0.1732038078543082, + "grad_norm": 2.2533655166625977, + "learning_rate": 3.788814610445557e-05, + "loss": 0.8722, + "step": 3557 + }, + { + "epoch": 0.17325250164341538, + "grad_norm": 2.1639511585235596, + "learning_rate": 3.788673508034489e-05, + "loss": 0.9255, + "step": 3558 + }, + { + "epoch": 0.1733011954325226, + "grad_norm": 1.627066731452942, + "learning_rate": 3.7885323611300043e-05, + "loss": 0.9349, + "step": 3559 + }, + { + "epoch": 0.17334988922162978, + "grad_norm": 0.09060794860124588, + "learning_rate": 3.788391169735614e-05, + "loss": 0.6339, + "step": 3560 + }, + { + "epoch": 0.173398583010737, + "grad_norm": 2.0888946056365967, + "learning_rate": 3.788249933854831e-05, + "loss": 0.9884, + "step": 3561 + }, + { + "epoch": 0.17344727679984417, + "grad_norm": 3.618955135345459, + "learning_rate": 3.788108653491167e-05, + "loss": 0.9152, + "step": 3562 + }, + { + "epoch": 0.1734959705889514, + "grad_norm": 2.8411483764648438, + "learning_rate": 3.787967328648138e-05, + "loss": 0.8884, + "step": 3563 + }, + { + "epoch": 0.17354466437805857, + "grad_norm": 3.3035316467285156, + "learning_rate": 3.787825959329258e-05, + "loss": 0.861, + "step": 3564 + }, + { + "epoch": 0.17359335816716578, + "grad_norm": 2.0976920127868652, + "learning_rate": 3.787684545538044e-05, + "loss": 0.8538, + "step": 3565 + }, + { + "epoch": 0.17364205195627297, + "grad_norm": 3.0395615100860596, + "learning_rate": 3.787543087278015e-05, + "loss": 0.8632, + "step": 3566 + }, + { + "epoch": 0.17369074574538018, + "grad_norm": 2.0325818061828613, + "learning_rate": 3.787401584552687e-05, + "loss": 0.8412, + "step": 3567 + }, + { + "epoch": 0.17373943953448737, + "grad_norm": 0.0845375508069992, + "learning_rate": 3.787260037365584e-05, + "loss": 0.5727, + "step": 3568 + }, + { + "epoch": 0.17378813332359458, + "grad_norm": 2.42956805229187, + "learning_rate": 3.787118445720223e-05, + "loss": 0.8607, + "step": 3569 + }, + { + "epoch": 0.17383682711270176, + "grad_norm": 2.396881580352783, + "learning_rate": 3.786976809620128e-05, + "loss": 0.8417, + "step": 3570 + }, + { + "epoch": 0.17388552090180898, + "grad_norm": 1.4465495347976685, + "learning_rate": 3.786835129068821e-05, + "loss": 0.8591, + "step": 3571 + }, + { + "epoch": 0.17393421469091616, + "grad_norm": 2.9474973678588867, + "learning_rate": 3.786693404069829e-05, + "loss": 0.9311, + "step": 3572 + }, + { + "epoch": 0.17398290848002337, + "grad_norm": 5.9428887367248535, + "learning_rate": 3.786551634626675e-05, + "loss": 0.9341, + "step": 3573 + }, + { + "epoch": 0.17403160226913056, + "grad_norm": 2.1938230991363525, + "learning_rate": 3.7864098207428865e-05, + "loss": 0.851, + "step": 3574 + }, + { + "epoch": 0.17408029605823777, + "grad_norm": 1.8046749830245972, + "learning_rate": 3.78626796242199e-05, + "loss": 0.7902, + "step": 3575 + }, + { + "epoch": 0.17412898984734498, + "grad_norm": 1.8151664733886719, + "learning_rate": 3.786126059667516e-05, + "loss": 0.8639, + "step": 3576 + }, + { + "epoch": 0.17417768363645217, + "grad_norm": 3.2234416007995605, + "learning_rate": 3.785984112482993e-05, + "loss": 0.8978, + "step": 3577 + }, + { + "epoch": 0.17422637742555938, + "grad_norm": 2.486841917037964, + "learning_rate": 3.785842120871952e-05, + "loss": 0.879, + "step": 3578 + }, + { + "epoch": 0.17427507121466657, + "grad_norm": 2.25813889503479, + "learning_rate": 3.7857000848379256e-05, + "loss": 0.8818, + "step": 3579 + }, + { + "epoch": 0.17432376500377378, + "grad_norm": 0.08585474640130997, + "learning_rate": 3.785558004384447e-05, + "loss": 0.5503, + "step": 3580 + }, + { + "epoch": 0.17437245879288096, + "grad_norm": 3.4523534774780273, + "learning_rate": 3.78541587951505e-05, + "loss": 0.94, + "step": 3581 + }, + { + "epoch": 0.17442115258198818, + "grad_norm": 2.522725820541382, + "learning_rate": 3.7852737102332705e-05, + "loss": 0.8705, + "step": 3582 + }, + { + "epoch": 0.17446984637109536, + "grad_norm": 2.867509603500366, + "learning_rate": 3.785131496542644e-05, + "loss": 0.7987, + "step": 3583 + }, + { + "epoch": 0.17451854016020257, + "grad_norm": 2.484941005706787, + "learning_rate": 3.78498923844671e-05, + "loss": 0.8501, + "step": 3584 + }, + { + "epoch": 0.17456723394930976, + "grad_norm": 4.859030246734619, + "learning_rate": 3.784846935949005e-05, + "loss": 0.9099, + "step": 3585 + }, + { + "epoch": 0.17461592773841697, + "grad_norm": 3.003612518310547, + "learning_rate": 3.78470458905307e-05, + "loss": 0.8954, + "step": 3586 + }, + { + "epoch": 0.17466462152752416, + "grad_norm": 1.8822566270828247, + "learning_rate": 3.7845621977624455e-05, + "loss": 0.7796, + "step": 3587 + }, + { + "epoch": 0.17471331531663137, + "grad_norm": 1.9841660261154175, + "learning_rate": 3.7844197620806735e-05, + "loss": 0.9028, + "step": 3588 + }, + { + "epoch": 0.17476200910573855, + "grad_norm": 4.043862342834473, + "learning_rate": 3.784277282011298e-05, + "loss": 0.8381, + "step": 3589 + }, + { + "epoch": 0.17481070289484577, + "grad_norm": 1.977994441986084, + "learning_rate": 3.7841347575578615e-05, + "loss": 0.9704, + "step": 3590 + }, + { + "epoch": 0.17485939668395295, + "grad_norm": 1.9147237539291382, + "learning_rate": 3.783992188723911e-05, + "loss": 0.8849, + "step": 3591 + }, + { + "epoch": 0.17490809047306016, + "grad_norm": 2.5048015117645264, + "learning_rate": 3.783849575512991e-05, + "loss": 0.9559, + "step": 3592 + }, + { + "epoch": 0.17495678426216735, + "grad_norm": 2.0819616317749023, + "learning_rate": 3.783706917928651e-05, + "loss": 0.9376, + "step": 3593 + }, + { + "epoch": 0.17500547805127456, + "grad_norm": 2.1105823516845703, + "learning_rate": 3.7835642159744385e-05, + "loss": 0.9497, + "step": 3594 + }, + { + "epoch": 0.17505417184038177, + "grad_norm": 2.4524919986724854, + "learning_rate": 3.783421469653904e-05, + "loss": 0.8604, + "step": 3595 + }, + { + "epoch": 0.17510286562948896, + "grad_norm": 0.07836595177650452, + "learning_rate": 3.783278678970598e-05, + "loss": 0.5533, + "step": 3596 + }, + { + "epoch": 0.17515155941859617, + "grad_norm": 2.8294901847839355, + "learning_rate": 3.7831358439280716e-05, + "loss": 0.8822, + "step": 3597 + }, + { + "epoch": 0.17520025320770335, + "grad_norm": 1.998356580734253, + "learning_rate": 3.7829929645298784e-05, + "loss": 0.9387, + "step": 3598 + }, + { + "epoch": 0.17524894699681057, + "grad_norm": 2.060455560684204, + "learning_rate": 3.782850040779573e-05, + "loss": 1.0072, + "step": 3599 + }, + { + "epoch": 0.17529764078591775, + "grad_norm": 2.2943060398101807, + "learning_rate": 3.7827070726807104e-05, + "loss": 0.9001, + "step": 3600 + }, + { + "epoch": 0.17534633457502496, + "grad_norm": 3.5569112300872803, + "learning_rate": 3.7825640602368465e-05, + "loss": 0.8492, + "step": 3601 + }, + { + "epoch": 0.17539502836413215, + "grad_norm": 2.7593023777008057, + "learning_rate": 3.782421003451539e-05, + "loss": 0.9185, + "step": 3602 + }, + { + "epoch": 0.17544372215323936, + "grad_norm": 2.132014036178589, + "learning_rate": 3.7822779023283474e-05, + "loss": 0.8442, + "step": 3603 + }, + { + "epoch": 0.17549241594234655, + "grad_norm": 1.6416244506835938, + "learning_rate": 3.7821347568708294e-05, + "loss": 0.9093, + "step": 3604 + }, + { + "epoch": 0.17554110973145376, + "grad_norm": 2.4203712940216064, + "learning_rate": 3.781991567082548e-05, + "loss": 0.9265, + "step": 3605 + }, + { + "epoch": 0.17558980352056094, + "grad_norm": 2.502929925918579, + "learning_rate": 3.781848332967063e-05, + "loss": 0.875, + "step": 3606 + }, + { + "epoch": 0.17563849730966816, + "grad_norm": 2.182687520980835, + "learning_rate": 3.781705054527938e-05, + "loss": 0.8782, + "step": 3607 + }, + { + "epoch": 0.17568719109877534, + "grad_norm": 2.8291306495666504, + "learning_rate": 3.781561731768738e-05, + "loss": 1.0334, + "step": 3608 + }, + { + "epoch": 0.17573588488788255, + "grad_norm": 2.214611291885376, + "learning_rate": 3.7814183646930265e-05, + "loss": 0.87, + "step": 3609 + }, + { + "epoch": 0.17578457867698974, + "grad_norm": 2.3695247173309326, + "learning_rate": 3.781274953304372e-05, + "loss": 0.9919, + "step": 3610 + }, + { + "epoch": 0.17583327246609695, + "grad_norm": 1.595757246017456, + "learning_rate": 3.78113149760634e-05, + "loss": 0.8351, + "step": 3611 + }, + { + "epoch": 0.17588196625520414, + "grad_norm": 2.0774178504943848, + "learning_rate": 3.7809879976025e-05, + "loss": 0.9156, + "step": 3612 + }, + { + "epoch": 0.17593066004431135, + "grad_norm": 2.2526490688323975, + "learning_rate": 3.7808444532964205e-05, + "loss": 0.8468, + "step": 3613 + }, + { + "epoch": 0.17597935383341853, + "grad_norm": 2.3010172843933105, + "learning_rate": 3.780700864691674e-05, + "loss": 0.8885, + "step": 3614 + }, + { + "epoch": 0.17602804762252575, + "grad_norm": 4.491694450378418, + "learning_rate": 3.78055723179183e-05, + "loss": 0.9262, + "step": 3615 + }, + { + "epoch": 0.17607674141163296, + "grad_norm": 1.6490821838378906, + "learning_rate": 3.780413554600462e-05, + "loss": 0.8858, + "step": 3616 + }, + { + "epoch": 0.17612543520074014, + "grad_norm": 2.238105058670044, + "learning_rate": 3.780269833121146e-05, + "loss": 0.9188, + "step": 3617 + }, + { + "epoch": 0.17617412898984736, + "grad_norm": 2.4528539180755615, + "learning_rate": 3.780126067357455e-05, + "loss": 0.8548, + "step": 3618 + }, + { + "epoch": 0.17622282277895454, + "grad_norm": 3.1983461380004883, + "learning_rate": 3.7799822573129655e-05, + "loss": 0.8624, + "step": 3619 + }, + { + "epoch": 0.17627151656806175, + "grad_norm": 2.242125988006592, + "learning_rate": 3.779838402991256e-05, + "loss": 0.8205, + "step": 3620 + }, + { + "epoch": 0.17632021035716894, + "grad_norm": 2.1801016330718994, + "learning_rate": 3.7796945043959025e-05, + "loss": 0.9031, + "step": 3621 + }, + { + "epoch": 0.17636890414627615, + "grad_norm": 2.246039867401123, + "learning_rate": 3.7795505615304866e-05, + "loss": 0.9282, + "step": 3622 + }, + { + "epoch": 0.17641759793538334, + "grad_norm": 1.746948480606079, + "learning_rate": 3.779406574398588e-05, + "loss": 0.9695, + "step": 3623 + }, + { + "epoch": 0.17646629172449055, + "grad_norm": 2.1675596237182617, + "learning_rate": 3.779262543003789e-05, + "loss": 0.9442, + "step": 3624 + }, + { + "epoch": 0.17651498551359773, + "grad_norm": 3.316455125808716, + "learning_rate": 3.779118467349673e-05, + "loss": 0.8644, + "step": 3625 + }, + { + "epoch": 0.17656367930270495, + "grad_norm": 1.8521876335144043, + "learning_rate": 3.778974347439821e-05, + "loss": 0.7378, + "step": 3626 + }, + { + "epoch": 0.17661237309181213, + "grad_norm": 1.9408308267593384, + "learning_rate": 3.778830183277821e-05, + "loss": 0.8717, + "step": 3627 + }, + { + "epoch": 0.17666106688091934, + "grad_norm": 2.179318904876709, + "learning_rate": 3.778685974867258e-05, + "loss": 0.9275, + "step": 3628 + }, + { + "epoch": 0.17670976067002653, + "grad_norm": 1.789421558380127, + "learning_rate": 3.778541722211719e-05, + "loss": 0.8729, + "step": 3629 + }, + { + "epoch": 0.17675845445913374, + "grad_norm": 2.7534942626953125, + "learning_rate": 3.7783974253147924e-05, + "loss": 0.8237, + "step": 3630 + }, + { + "epoch": 0.17680714824824093, + "grad_norm": 3.7180068492889404, + "learning_rate": 3.778253084180068e-05, + "loss": 0.8741, + "step": 3631 + }, + { + "epoch": 0.17685584203734814, + "grad_norm": 3.1879942417144775, + "learning_rate": 3.778108698811135e-05, + "loss": 1.0343, + "step": 3632 + }, + { + "epoch": 0.17690453582645532, + "grad_norm": 3.6619856357574463, + "learning_rate": 3.777964269211587e-05, + "loss": 0.9713, + "step": 3633 + }, + { + "epoch": 0.17695322961556254, + "grad_norm": 2.7806572914123535, + "learning_rate": 3.777819795385015e-05, + "loss": 0.9442, + "step": 3634 + }, + { + "epoch": 0.17700192340466975, + "grad_norm": 2.485682964324951, + "learning_rate": 3.777675277335014e-05, + "loss": 0.976, + "step": 3635 + }, + { + "epoch": 0.17705061719377693, + "grad_norm": 3.378490447998047, + "learning_rate": 3.7775307150651786e-05, + "loss": 0.9204, + "step": 3636 + }, + { + "epoch": 0.17709931098288414, + "grad_norm": 2.432147741317749, + "learning_rate": 3.777386108579104e-05, + "loss": 0.9196, + "step": 3637 + }, + { + "epoch": 0.17714800477199133, + "grad_norm": 2.3013410568237305, + "learning_rate": 3.777241457880389e-05, + "loss": 0.9864, + "step": 3638 + }, + { + "epoch": 0.17719669856109854, + "grad_norm": 2.012889862060547, + "learning_rate": 3.77709676297263e-05, + "loss": 0.9562, + "step": 3639 + }, + { + "epoch": 0.17724539235020573, + "grad_norm": 2.769057273864746, + "learning_rate": 3.7769520238594266e-05, + "loss": 0.8404, + "step": 3640 + }, + { + "epoch": 0.17729408613931294, + "grad_norm": 3.882733106613159, + "learning_rate": 3.77680724054438e-05, + "loss": 0.971, + "step": 3641 + }, + { + "epoch": 0.17734277992842012, + "grad_norm": 2.490007162094116, + "learning_rate": 3.776662413031091e-05, + "loss": 0.9939, + "step": 3642 + }, + { + "epoch": 0.17739147371752734, + "grad_norm": 2.38495135307312, + "learning_rate": 3.776517541323163e-05, + "loss": 0.9551, + "step": 3643 + }, + { + "epoch": 0.17744016750663452, + "grad_norm": 2.0948009490966797, + "learning_rate": 3.7763726254241984e-05, + "loss": 0.8866, + "step": 3644 + }, + { + "epoch": 0.17748886129574173, + "grad_norm": 3.436328649520874, + "learning_rate": 3.776227665337803e-05, + "loss": 0.855, + "step": 3645 + }, + { + "epoch": 0.17753755508484892, + "grad_norm": 2.12176251411438, + "learning_rate": 3.776082661067583e-05, + "loss": 0.9096, + "step": 3646 + }, + { + "epoch": 0.17758624887395613, + "grad_norm": 2.741316080093384, + "learning_rate": 3.775937612617145e-05, + "loss": 0.8703, + "step": 3647 + }, + { + "epoch": 0.17763494266306332, + "grad_norm": 1.8925461769104004, + "learning_rate": 3.7757925199900956e-05, + "loss": 0.8407, + "step": 3648 + }, + { + "epoch": 0.17768363645217053, + "grad_norm": 3.35758638381958, + "learning_rate": 3.775647383190046e-05, + "loss": 0.8716, + "step": 3649 + }, + { + "epoch": 0.17773233024127771, + "grad_norm": 1.988482117652893, + "learning_rate": 3.7755022022206074e-05, + "loss": 0.8324, + "step": 3650 + }, + { + "epoch": 0.17778102403038493, + "grad_norm": 2.3239831924438477, + "learning_rate": 3.775356977085388e-05, + "loss": 0.94, + "step": 3651 + }, + { + "epoch": 0.1778297178194921, + "grad_norm": 2.701188087463379, + "learning_rate": 3.7752117077880025e-05, + "loss": 0.8746, + "step": 3652 + }, + { + "epoch": 0.17787841160859932, + "grad_norm": 3.4914839267730713, + "learning_rate": 3.7750663943320634e-05, + "loss": 0.8663, + "step": 3653 + }, + { + "epoch": 0.1779271053977065, + "grad_norm": 1.8389759063720703, + "learning_rate": 3.7749210367211864e-05, + "loss": 0.8138, + "step": 3654 + }, + { + "epoch": 0.17797579918681372, + "grad_norm": 0.0874597579240799, + "learning_rate": 3.774775634958986e-05, + "loss": 0.6351, + "step": 3655 + }, + { + "epoch": 0.17802449297592093, + "grad_norm": 0.08894257247447968, + "learning_rate": 3.774630189049081e-05, + "loss": 0.653, + "step": 3656 + }, + { + "epoch": 0.17807318676502812, + "grad_norm": 2.713336706161499, + "learning_rate": 3.774484698995087e-05, + "loss": 0.8758, + "step": 3657 + }, + { + "epoch": 0.17812188055413533, + "grad_norm": 2.533693552017212, + "learning_rate": 3.7743391648006257e-05, + "loss": 0.9169, + "step": 3658 + }, + { + "epoch": 0.17817057434324252, + "grad_norm": 0.08487717062234879, + "learning_rate": 3.7741935864693145e-05, + "loss": 0.6029, + "step": 3659 + }, + { + "epoch": 0.17821926813234973, + "grad_norm": 1.8125543594360352, + "learning_rate": 3.774047964004777e-05, + "loss": 1.0171, + "step": 3660 + }, + { + "epoch": 0.1782679619214569, + "grad_norm": 2.597339391708374, + "learning_rate": 3.773902297410635e-05, + "loss": 0.8896, + "step": 3661 + }, + { + "epoch": 0.17831665571056413, + "grad_norm": 2.009178638458252, + "learning_rate": 3.7737565866905106e-05, + "loss": 0.7932, + "step": 3662 + }, + { + "epoch": 0.1783653494996713, + "grad_norm": 2.9407639503479004, + "learning_rate": 3.77361083184803e-05, + "loss": 0.8278, + "step": 3663 + }, + { + "epoch": 0.17841404328877852, + "grad_norm": 2.15598726272583, + "learning_rate": 3.773465032886818e-05, + "loss": 0.7927, + "step": 3664 + }, + { + "epoch": 0.1784627370778857, + "grad_norm": 3.033159017562866, + "learning_rate": 3.773319189810502e-05, + "loss": 0.813, + "step": 3665 + }, + { + "epoch": 0.17851143086699292, + "grad_norm": 2.037446975708008, + "learning_rate": 3.773173302622708e-05, + "loss": 0.9278, + "step": 3666 + }, + { + "epoch": 0.1785601246561001, + "grad_norm": 1.9664207696914673, + "learning_rate": 3.773027371327068e-05, + "loss": 0.8625, + "step": 3667 + }, + { + "epoch": 0.17860881844520732, + "grad_norm": 2.5083534717559814, + "learning_rate": 3.77288139592721e-05, + "loss": 0.9155, + "step": 3668 + }, + { + "epoch": 0.1786575122343145, + "grad_norm": 1.917129397392273, + "learning_rate": 3.772735376426766e-05, + "loss": 0.9511, + "step": 3669 + }, + { + "epoch": 0.17870620602342172, + "grad_norm": 2.9392433166503906, + "learning_rate": 3.7725893128293675e-05, + "loss": 0.9087, + "step": 3670 + }, + { + "epoch": 0.1787548998125289, + "grad_norm": 2.2450504302978516, + "learning_rate": 3.772443205138649e-05, + "loss": 0.8828, + "step": 3671 + }, + { + "epoch": 0.1788035936016361, + "grad_norm": 2.1796047687530518, + "learning_rate": 3.7722970533582424e-05, + "loss": 0.95, + "step": 3672 + }, + { + "epoch": 0.1788522873907433, + "grad_norm": 1.724765419960022, + "learning_rate": 3.772150857491786e-05, + "loss": 0.9783, + "step": 3673 + }, + { + "epoch": 0.1789009811798505, + "grad_norm": 0.08409173786640167, + "learning_rate": 3.772004617542916e-05, + "loss": 0.6501, + "step": 3674 + }, + { + "epoch": 0.17894967496895772, + "grad_norm": 2.0244364738464355, + "learning_rate": 3.771858333515269e-05, + "loss": 0.8837, + "step": 3675 + }, + { + "epoch": 0.1789983687580649, + "grad_norm": 3.6882224082946777, + "learning_rate": 3.771712005412484e-05, + "loss": 0.821, + "step": 3676 + }, + { + "epoch": 0.17904706254717212, + "grad_norm": 1.963250756263733, + "learning_rate": 3.7715656332382025e-05, + "loss": 0.8529, + "step": 3677 + }, + { + "epoch": 0.1790957563362793, + "grad_norm": 1.619019627571106, + "learning_rate": 3.7714192169960634e-05, + "loss": 0.9337, + "step": 3678 + }, + { + "epoch": 0.17914445012538652, + "grad_norm": 2.682145833969116, + "learning_rate": 3.77127275668971e-05, + "loss": 0.8931, + "step": 3679 + }, + { + "epoch": 0.1791931439144937, + "grad_norm": 2.443787097930908, + "learning_rate": 3.771126252322785e-05, + "loss": 0.9566, + "step": 3680 + }, + { + "epoch": 0.17924183770360091, + "grad_norm": 3.9663712978363037, + "learning_rate": 3.7709797038989334e-05, + "loss": 0.935, + "step": 3681 + }, + { + "epoch": 0.1792905314927081, + "grad_norm": 2.1718220710754395, + "learning_rate": 3.7708331114218006e-05, + "loss": 0.8534, + "step": 3682 + }, + { + "epoch": 0.1793392252818153, + "grad_norm": 2.126495361328125, + "learning_rate": 3.770686474895032e-05, + "loss": 0.8319, + "step": 3683 + }, + { + "epoch": 0.1793879190709225, + "grad_norm": 2.551632881164551, + "learning_rate": 3.770539794322276e-05, + "loss": 0.7839, + "step": 3684 + }, + { + "epoch": 0.1794366128600297, + "grad_norm": 2.2946853637695312, + "learning_rate": 3.7703930697071814e-05, + "loss": 0.9779, + "step": 3685 + }, + { + "epoch": 0.1794853066491369, + "grad_norm": 2.5288782119750977, + "learning_rate": 3.770246301053397e-05, + "loss": 0.8588, + "step": 3686 + }, + { + "epoch": 0.1795340004382441, + "grad_norm": 2.179842710494995, + "learning_rate": 3.770099488364576e-05, + "loss": 0.9268, + "step": 3687 + }, + { + "epoch": 0.1795826942273513, + "grad_norm": 2.181314468383789, + "learning_rate": 3.7699526316443674e-05, + "loss": 0.8615, + "step": 3688 + }, + { + "epoch": 0.1796313880164585, + "grad_norm": 2.8175182342529297, + "learning_rate": 3.769805730896426e-05, + "loss": 1.0171, + "step": 3689 + }, + { + "epoch": 0.1796800818055657, + "grad_norm": 2.0334577560424805, + "learning_rate": 3.769658786124406e-05, + "loss": 0.913, + "step": 3690 + }, + { + "epoch": 0.1797287755946729, + "grad_norm": 1.8756651878356934, + "learning_rate": 3.769511797331962e-05, + "loss": 0.934, + "step": 3691 + }, + { + "epoch": 0.1797774693837801, + "grad_norm": 2.58923602104187, + "learning_rate": 3.769364764522751e-05, + "loss": 0.9542, + "step": 3692 + }, + { + "epoch": 0.1798261631728873, + "grad_norm": 3.1622183322906494, + "learning_rate": 3.7692176877004306e-05, + "loss": 0.9681, + "step": 3693 + }, + { + "epoch": 0.1798748569619945, + "grad_norm": 2.937138080596924, + "learning_rate": 3.7690705668686584e-05, + "loss": 0.8744, + "step": 3694 + }, + { + "epoch": 0.1799235507511017, + "grad_norm": 3.442244291305542, + "learning_rate": 3.768923402031094e-05, + "loss": 0.8723, + "step": 3695 + }, + { + "epoch": 0.1799722445402089, + "grad_norm": 1.8494025468826294, + "learning_rate": 3.7687761931914e-05, + "loss": 0.9869, + "step": 3696 + }, + { + "epoch": 0.1800209383293161, + "grad_norm": 1.734902262687683, + "learning_rate": 3.768628940353236e-05, + "loss": 0.9535, + "step": 3697 + }, + { + "epoch": 0.1800696321184233, + "grad_norm": 2.2895309925079346, + "learning_rate": 3.768481643520266e-05, + "loss": 0.9077, + "step": 3698 + }, + { + "epoch": 0.1801183259075305, + "grad_norm": 2.0829756259918213, + "learning_rate": 3.768334302696154e-05, + "loss": 0.88, + "step": 3699 + }, + { + "epoch": 0.1801670196966377, + "grad_norm": 1.6342785358428955, + "learning_rate": 3.7681869178845644e-05, + "loss": 0.9294, + "step": 3700 + }, + { + "epoch": 0.1802157134857449, + "grad_norm": 2.033007860183716, + "learning_rate": 3.768039489089165e-05, + "loss": 0.9242, + "step": 3701 + }, + { + "epoch": 0.1802644072748521, + "grad_norm": 2.1424219608306885, + "learning_rate": 3.767892016313622e-05, + "loss": 0.8346, + "step": 3702 + }, + { + "epoch": 0.18031310106395929, + "grad_norm": 1.9671332836151123, + "learning_rate": 3.7677444995616024e-05, + "loss": 0.9039, + "step": 3703 + }, + { + "epoch": 0.1803617948530665, + "grad_norm": 2.4719269275665283, + "learning_rate": 3.767596938836778e-05, + "loss": 0.9069, + "step": 3704 + }, + { + "epoch": 0.18041048864217368, + "grad_norm": 3.1551406383514404, + "learning_rate": 3.7674493341428194e-05, + "loss": 0.8465, + "step": 3705 + }, + { + "epoch": 0.1804591824312809, + "grad_norm": 1.7295043468475342, + "learning_rate": 3.767301685483397e-05, + "loss": 0.885, + "step": 3706 + }, + { + "epoch": 0.18050787622038808, + "grad_norm": 1.7549481391906738, + "learning_rate": 3.7671539928621844e-05, + "loss": 0.9067, + "step": 3707 + }, + { + "epoch": 0.1805565700094953, + "grad_norm": 2.0391173362731934, + "learning_rate": 3.7670062562828545e-05, + "loss": 0.9137, + "step": 3708 + }, + { + "epoch": 0.18060526379860248, + "grad_norm": 1.7817753553390503, + "learning_rate": 3.7668584757490835e-05, + "loss": 0.8811, + "step": 3709 + }, + { + "epoch": 0.1806539575877097, + "grad_norm": 1.7872532606124878, + "learning_rate": 3.766710651264546e-05, + "loss": 0.8663, + "step": 3710 + }, + { + "epoch": 0.18070265137681688, + "grad_norm": 2.082721710205078, + "learning_rate": 3.76656278283292e-05, + "loss": 0.8832, + "step": 3711 + }, + { + "epoch": 0.1807513451659241, + "grad_norm": 1.760756492614746, + "learning_rate": 3.766414870457884e-05, + "loss": 0.9194, + "step": 3712 + }, + { + "epoch": 0.18080003895503127, + "grad_norm": 1.711624264717102, + "learning_rate": 3.7662669141431175e-05, + "loss": 0.9844, + "step": 3713 + }, + { + "epoch": 0.18084873274413849, + "grad_norm": 5.2983784675598145, + "learning_rate": 3.7661189138923e-05, + "loss": 0.8162, + "step": 3714 + }, + { + "epoch": 0.1808974265332457, + "grad_norm": 1.518457293510437, + "learning_rate": 3.765970869709113e-05, + "loss": 0.9345, + "step": 3715 + }, + { + "epoch": 0.18094612032235288, + "grad_norm": 3.8952081203460693, + "learning_rate": 3.7658227815972403e-05, + "loss": 0.9325, + "step": 3716 + }, + { + "epoch": 0.1809948141114601, + "grad_norm": 1.7932560443878174, + "learning_rate": 3.7656746495603654e-05, + "loss": 0.9469, + "step": 3717 + }, + { + "epoch": 0.18104350790056728, + "grad_norm": 1.8157411813735962, + "learning_rate": 3.765526473602172e-05, + "loss": 0.8756, + "step": 3718 + }, + { + "epoch": 0.1810922016896745, + "grad_norm": 2.3891847133636475, + "learning_rate": 3.7653782537263464e-05, + "loss": 0.8681, + "step": 3719 + }, + { + "epoch": 0.18114089547878168, + "grad_norm": 3.8120274543762207, + "learning_rate": 3.7652299899365756e-05, + "loss": 0.8068, + "step": 3720 + }, + { + "epoch": 0.1811895892678889, + "grad_norm": 0.08490631729364395, + "learning_rate": 3.7650816822365484e-05, + "loss": 0.6313, + "step": 3721 + }, + { + "epoch": 0.18123828305699607, + "grad_norm": 2.7728705406188965, + "learning_rate": 3.764933330629953e-05, + "loss": 0.9626, + "step": 3722 + }, + { + "epoch": 0.1812869768461033, + "grad_norm": 1.9256772994995117, + "learning_rate": 3.7647849351204806e-05, + "loss": 0.9871, + "step": 3723 + }, + { + "epoch": 0.18133567063521047, + "grad_norm": 2.746669292449951, + "learning_rate": 3.7646364957118215e-05, + "loss": 0.8626, + "step": 3724 + }, + { + "epoch": 0.18138436442431768, + "grad_norm": 1.7005881071090698, + "learning_rate": 3.764488012407669e-05, + "loss": 0.9222, + "step": 3725 + }, + { + "epoch": 0.18143305821342487, + "grad_norm": 2.817873477935791, + "learning_rate": 3.764339485211716e-05, + "loss": 0.9488, + "step": 3726 + }, + { + "epoch": 0.18148175200253208, + "grad_norm": 1.654260516166687, + "learning_rate": 3.764190914127658e-05, + "loss": 0.9011, + "step": 3727 + }, + { + "epoch": 0.18153044579163927, + "grad_norm": 1.4899849891662598, + "learning_rate": 3.764042299159189e-05, + "loss": 0.8299, + "step": 3728 + }, + { + "epoch": 0.18157913958074648, + "grad_norm": 1.8317779302597046, + "learning_rate": 3.763893640310008e-05, + "loss": 0.9233, + "step": 3729 + }, + { + "epoch": 0.18162783336985366, + "grad_norm": 1.8520530462265015, + "learning_rate": 3.7637449375838114e-05, + "loss": 0.8835, + "step": 3730 + }, + { + "epoch": 0.18167652715896088, + "grad_norm": 1.951323390007019, + "learning_rate": 3.7635961909843e-05, + "loss": 0.9115, + "step": 3731 + }, + { + "epoch": 0.18172522094806806, + "grad_norm": 1.771371841430664, + "learning_rate": 3.763447400515171e-05, + "loss": 0.884, + "step": 3732 + }, + { + "epoch": 0.18177391473717527, + "grad_norm": 1.4970324039459229, + "learning_rate": 3.763298566180128e-05, + "loss": 0.9219, + "step": 3733 + }, + { + "epoch": 0.1818226085262825, + "grad_norm": 1.749834418296814, + "learning_rate": 3.763149687982872e-05, + "loss": 0.9462, + "step": 3734 + }, + { + "epoch": 0.18187130231538967, + "grad_norm": 2.2597317695617676, + "learning_rate": 3.763000765927106e-05, + "loss": 0.8353, + "step": 3735 + }, + { + "epoch": 0.18191999610449688, + "grad_norm": 1.5100579261779785, + "learning_rate": 3.7628518000165364e-05, + "loss": 0.8745, + "step": 3736 + }, + { + "epoch": 0.18196868989360407, + "grad_norm": 3.3786020278930664, + "learning_rate": 3.762702790254867e-05, + "loss": 0.9716, + "step": 3737 + }, + { + "epoch": 0.18201738368271128, + "grad_norm": 2.574855089187622, + "learning_rate": 3.762553736645805e-05, + "loss": 0.933, + "step": 3738 + }, + { + "epoch": 0.18206607747181847, + "grad_norm": 0.08625306934118271, + "learning_rate": 3.762404639193058e-05, + "loss": 0.6032, + "step": 3739 + }, + { + "epoch": 0.18211477126092568, + "grad_norm": 2.3829922676086426, + "learning_rate": 3.762255497900335e-05, + "loss": 0.8598, + "step": 3740 + }, + { + "epoch": 0.18216346505003286, + "grad_norm": 2.4079082012176514, + "learning_rate": 3.762106312771346e-05, + "loss": 0.8986, + "step": 3741 + }, + { + "epoch": 0.18221215883914008, + "grad_norm": 2.9928030967712402, + "learning_rate": 3.7619570838098016e-05, + "loss": 0.8666, + "step": 3742 + }, + { + "epoch": 0.18226085262824726, + "grad_norm": 3.9013195037841797, + "learning_rate": 3.7618078110194146e-05, + "loss": 0.6814, + "step": 3743 + }, + { + "epoch": 0.18230954641735447, + "grad_norm": 1.973329782485962, + "learning_rate": 3.761658494403897e-05, + "loss": 0.9161, + "step": 3744 + }, + { + "epoch": 0.18235824020646166, + "grad_norm": 2.0819718837738037, + "learning_rate": 3.7615091339669645e-05, + "loss": 0.8811, + "step": 3745 + }, + { + "epoch": 0.18240693399556887, + "grad_norm": 5.971359729766846, + "learning_rate": 3.761359729712331e-05, + "loss": 0.8446, + "step": 3746 + }, + { + "epoch": 0.18245562778467606, + "grad_norm": 1.7549083232879639, + "learning_rate": 3.761210281643713e-05, + "loss": 0.8214, + "step": 3747 + }, + { + "epoch": 0.18250432157378327, + "grad_norm": 2.211028814315796, + "learning_rate": 3.76106078976483e-05, + "loss": 0.8564, + "step": 3748 + }, + { + "epoch": 0.18255301536289045, + "grad_norm": 1.859364628791809, + "learning_rate": 3.760911254079399e-05, + "loss": 0.8772, + "step": 3749 + }, + { + "epoch": 0.18260170915199767, + "grad_norm": 1.8322662115097046, + "learning_rate": 3.7607616745911395e-05, + "loss": 0.9348, + "step": 3750 + }, + { + "epoch": 0.18265040294110485, + "grad_norm": 9.052844047546387, + "learning_rate": 3.760612051303773e-05, + "loss": 0.8844, + "step": 3751 + }, + { + "epoch": 0.18269909673021206, + "grad_norm": 1.8937134742736816, + "learning_rate": 3.760462384221021e-05, + "loss": 0.9586, + "step": 3752 + }, + { + "epoch": 0.18274779051931925, + "grad_norm": 2.898442268371582, + "learning_rate": 3.7603126733466073e-05, + "loss": 0.8418, + "step": 3753 + }, + { + "epoch": 0.18279648430842646, + "grad_norm": 1.6796109676361084, + "learning_rate": 3.760162918684255e-05, + "loss": 0.7984, + "step": 3754 + }, + { + "epoch": 0.18284517809753367, + "grad_norm": 1.6859967708587646, + "learning_rate": 3.760013120237689e-05, + "loss": 0.9357, + "step": 3755 + }, + { + "epoch": 0.18289387188664086, + "grad_norm": 3.507540464401245, + "learning_rate": 3.759863278010637e-05, + "loss": 0.9548, + "step": 3756 + }, + { + "epoch": 0.18294256567574807, + "grad_norm": 2.177098274230957, + "learning_rate": 3.759713392006825e-05, + "loss": 0.8868, + "step": 3757 + }, + { + "epoch": 0.18299125946485525, + "grad_norm": 1.6348267793655396, + "learning_rate": 3.759563462229983e-05, + "loss": 0.88, + "step": 3758 + }, + { + "epoch": 0.18303995325396247, + "grad_norm": 1.936997413635254, + "learning_rate": 3.759413488683838e-05, + "loss": 0.8951, + "step": 3759 + }, + { + "epoch": 0.18308864704306965, + "grad_norm": 1.7012956142425537, + "learning_rate": 3.7592634713721226e-05, + "loss": 0.9626, + "step": 3760 + }, + { + "epoch": 0.18313734083217686, + "grad_norm": 1.979579210281372, + "learning_rate": 3.759113410298569e-05, + "loss": 0.8038, + "step": 3761 + }, + { + "epoch": 0.18318603462128405, + "grad_norm": 2.0539798736572266, + "learning_rate": 3.758963305466908e-05, + "loss": 0.8715, + "step": 3762 + }, + { + "epoch": 0.18323472841039126, + "grad_norm": 1.703603744506836, + "learning_rate": 3.758813156880874e-05, + "loss": 0.876, + "step": 3763 + }, + { + "epoch": 0.18328342219949845, + "grad_norm": 2.024336576461792, + "learning_rate": 3.758662964544202e-05, + "loss": 0.9796, + "step": 3764 + }, + { + "epoch": 0.18333211598860566, + "grad_norm": 1.9544517993927002, + "learning_rate": 3.75851272846063e-05, + "loss": 0.9975, + "step": 3765 + }, + { + "epoch": 0.18338080977771284, + "grad_norm": 1.9280353784561157, + "learning_rate": 3.758362448633892e-05, + "loss": 0.9343, + "step": 3766 + }, + { + "epoch": 0.18342950356682006, + "grad_norm": 1.6466468572616577, + "learning_rate": 3.7582121250677285e-05, + "loss": 0.8403, + "step": 3767 + }, + { + "epoch": 0.18347819735592724, + "grad_norm": 2.1791892051696777, + "learning_rate": 3.758061757765879e-05, + "loss": 0.9108, + "step": 3768 + }, + { + "epoch": 0.18352689114503445, + "grad_norm": 2.394098997116089, + "learning_rate": 3.757911346732081e-05, + "loss": 0.9001, + "step": 3769 + }, + { + "epoch": 0.18357558493414164, + "grad_norm": 1.8843799829483032, + "learning_rate": 3.757760891970079e-05, + "loss": 0.8742, + "step": 3770 + }, + { + "epoch": 0.18362427872324885, + "grad_norm": 2.022941827774048, + "learning_rate": 3.757610393483615e-05, + "loss": 1.0051, + "step": 3771 + }, + { + "epoch": 0.18367297251235604, + "grad_norm": 1.8049248456954956, + "learning_rate": 3.7574598512764316e-05, + "loss": 0.9176, + "step": 3772 + }, + { + "epoch": 0.18372166630146325, + "grad_norm": 2.920403480529785, + "learning_rate": 3.757309265352274e-05, + "loss": 0.9074, + "step": 3773 + }, + { + "epoch": 0.18377036009057046, + "grad_norm": 2.3606276512145996, + "learning_rate": 3.757158635714889e-05, + "loss": 0.926, + "step": 3774 + }, + { + "epoch": 0.18381905387967765, + "grad_norm": 0.08992356061935425, + "learning_rate": 3.7570079623680216e-05, + "loss": 0.6482, + "step": 3775 + }, + { + "epoch": 0.18386774766878486, + "grad_norm": 1.757559895515442, + "learning_rate": 3.756857245315422e-05, + "loss": 0.852, + "step": 3776 + }, + { + "epoch": 0.18391644145789204, + "grad_norm": 3.151250123977661, + "learning_rate": 3.7567064845608376e-05, + "loss": 0.9007, + "step": 3777 + }, + { + "epoch": 0.18396513524699926, + "grad_norm": 2.363691806793213, + "learning_rate": 3.7565556801080195e-05, + "loss": 0.903, + "step": 3778 + }, + { + "epoch": 0.18401382903610644, + "grad_norm": 2.1095468997955322, + "learning_rate": 3.756404831960718e-05, + "loss": 0.8844, + "step": 3779 + }, + { + "epoch": 0.18406252282521365, + "grad_norm": 1.5137505531311035, + "learning_rate": 3.7562539401226874e-05, + "loss": 0.9003, + "step": 3780 + }, + { + "epoch": 0.18411121661432084, + "grad_norm": 1.7372426986694336, + "learning_rate": 3.7561030045976796e-05, + "loss": 0.9474, + "step": 3781 + }, + { + "epoch": 0.18415991040342805, + "grad_norm": 1.5598642826080322, + "learning_rate": 3.755952025389449e-05, + "loss": 0.9675, + "step": 3782 + }, + { + "epoch": 0.18420860419253524, + "grad_norm": 2.0456671714782715, + "learning_rate": 3.7558010025017514e-05, + "loss": 0.8828, + "step": 3783 + }, + { + "epoch": 0.18425729798164245, + "grad_norm": 1.770937442779541, + "learning_rate": 3.755649935938344e-05, + "loss": 0.8096, + "step": 3784 + }, + { + "epoch": 0.18430599177074963, + "grad_norm": 0.08234208822250366, + "learning_rate": 3.7554988257029847e-05, + "loss": 0.5558, + "step": 3785 + }, + { + "epoch": 0.18435468555985685, + "grad_norm": 1.49691641330719, + "learning_rate": 3.755347671799432e-05, + "loss": 0.8521, + "step": 3786 + }, + { + "epoch": 0.18440337934896403, + "grad_norm": 2.0711166858673096, + "learning_rate": 3.755196474231446e-05, + "loss": 0.902, + "step": 3787 + }, + { + "epoch": 0.18445207313807124, + "grad_norm": 2.1979598999023438, + "learning_rate": 3.755045233002787e-05, + "loss": 0.9537, + "step": 3788 + }, + { + "epoch": 0.18450076692717843, + "grad_norm": 2.0003457069396973, + "learning_rate": 3.7548939481172186e-05, + "loss": 0.8972, + "step": 3789 + }, + { + "epoch": 0.18454946071628564, + "grad_norm": 2.1859488487243652, + "learning_rate": 3.754742619578503e-05, + "loss": 0.8612, + "step": 3790 + }, + { + "epoch": 0.18459815450539283, + "grad_norm": 2.226048231124878, + "learning_rate": 3.7545912473904047e-05, + "loss": 0.8129, + "step": 3791 + }, + { + "epoch": 0.18464684829450004, + "grad_norm": 1.5936369895935059, + "learning_rate": 3.7544398315566895e-05, + "loss": 0.9856, + "step": 3792 + }, + { + "epoch": 0.18469554208360722, + "grad_norm": 2.1859130859375, + "learning_rate": 3.7542883720811224e-05, + "loss": 0.7957, + "step": 3793 + }, + { + "epoch": 0.18474423587271444, + "grad_norm": 1.7181761264801025, + "learning_rate": 3.7541368689674736e-05, + "loss": 0.8116, + "step": 3794 + }, + { + "epoch": 0.18479292966182165, + "grad_norm": 1.8662410974502563, + "learning_rate": 3.753985322219509e-05, + "loss": 0.8748, + "step": 3795 + }, + { + "epoch": 0.18484162345092883, + "grad_norm": 2.0914947986602783, + "learning_rate": 3.753833731841e-05, + "loss": 1.0082, + "step": 3796 + }, + { + "epoch": 0.18489031724003605, + "grad_norm": 2.479015588760376, + "learning_rate": 3.753682097835717e-05, + "loss": 0.8622, + "step": 3797 + }, + { + "epoch": 0.18493901102914323, + "grad_norm": 2.025160789489746, + "learning_rate": 3.753530420207432e-05, + "loss": 0.9275, + "step": 3798 + }, + { + "epoch": 0.18498770481825044, + "grad_norm": 1.7829663753509521, + "learning_rate": 3.753378698959918e-05, + "loss": 0.9162, + "step": 3799 + }, + { + "epoch": 0.18503639860735763, + "grad_norm": 2.4539806842803955, + "learning_rate": 3.753226934096949e-05, + "loss": 0.8513, + "step": 3800 + }, + { + "epoch": 0.18508509239646484, + "grad_norm": 2.059645652770996, + "learning_rate": 3.7530751256223004e-05, + "loss": 0.8817, + "step": 3801 + }, + { + "epoch": 0.18513378618557202, + "grad_norm": 2.0484654903411865, + "learning_rate": 3.7529232735397475e-05, + "loss": 0.8708, + "step": 3802 + }, + { + "epoch": 0.18518247997467924, + "grad_norm": 1.7511929273605347, + "learning_rate": 3.7527713778530695e-05, + "loss": 0.9313, + "step": 3803 + }, + { + "epoch": 0.18523117376378642, + "grad_norm": 1.4887022972106934, + "learning_rate": 3.752619438566043e-05, + "loss": 0.9353, + "step": 3804 + }, + { + "epoch": 0.18527986755289363, + "grad_norm": 1.9643269777297974, + "learning_rate": 3.7524674556824476e-05, + "loss": 0.7937, + "step": 3805 + }, + { + "epoch": 0.18532856134200082, + "grad_norm": 1.4366600513458252, + "learning_rate": 3.752315429206065e-05, + "loss": 1.0069, + "step": 3806 + }, + { + "epoch": 0.18537725513110803, + "grad_norm": 1.455572485923767, + "learning_rate": 3.752163359140676e-05, + "loss": 0.8964, + "step": 3807 + }, + { + "epoch": 0.18542594892021522, + "grad_norm": 2.2866930961608887, + "learning_rate": 3.7520112454900636e-05, + "loss": 0.8829, + "step": 3808 + }, + { + "epoch": 0.18547464270932243, + "grad_norm": 1.7539907693862915, + "learning_rate": 3.7518590882580124e-05, + "loss": 0.9318, + "step": 3809 + }, + { + "epoch": 0.18552333649842961, + "grad_norm": 1.8218133449554443, + "learning_rate": 3.751706887448307e-05, + "loss": 0.8922, + "step": 3810 + }, + { + "epoch": 0.18557203028753683, + "grad_norm": 0.08875096589326859, + "learning_rate": 3.7515546430647315e-05, + "loss": 0.6502, + "step": 3811 + }, + { + "epoch": 0.185620724076644, + "grad_norm": 2.231095314025879, + "learning_rate": 3.7514023551110755e-05, + "loss": 0.872, + "step": 3812 + }, + { + "epoch": 0.18566941786575122, + "grad_norm": 1.9317196607589722, + "learning_rate": 3.751250023591126e-05, + "loss": 0.9425, + "step": 3813 + }, + { + "epoch": 0.18571811165485844, + "grad_norm": 2.2608802318573, + "learning_rate": 3.751097648508673e-05, + "loss": 0.9852, + "step": 3814 + }, + { + "epoch": 0.18576680544396562, + "grad_norm": 0.12442035228013992, + "learning_rate": 3.750945229867506e-05, + "loss": 0.6717, + "step": 3815 + }, + { + "epoch": 0.18581549923307283, + "grad_norm": 1.5896633863449097, + "learning_rate": 3.7507927676714165e-05, + "loss": 0.9135, + "step": 3816 + }, + { + "epoch": 0.18586419302218002, + "grad_norm": 1.5505976676940918, + "learning_rate": 3.750640261924198e-05, + "loss": 0.8905, + "step": 3817 + }, + { + "epoch": 0.18591288681128723, + "grad_norm": 2.2468395233154297, + "learning_rate": 3.7504877126296425e-05, + "loss": 0.9056, + "step": 3818 + }, + { + "epoch": 0.18596158060039442, + "grad_norm": 1.5770418643951416, + "learning_rate": 3.750335119791546e-05, + "loss": 0.9854, + "step": 3819 + }, + { + "epoch": 0.18601027438950163, + "grad_norm": 1.7520451545715332, + "learning_rate": 3.750182483413704e-05, + "loss": 0.9114, + "step": 3820 + }, + { + "epoch": 0.1860589681786088, + "grad_norm": 2.0127389430999756, + "learning_rate": 3.750029803499913e-05, + "loss": 0.8847, + "step": 3821 + }, + { + "epoch": 0.18610766196771603, + "grad_norm": 1.7605156898498535, + "learning_rate": 3.749877080053971e-05, + "loss": 0.8075, + "step": 3822 + }, + { + "epoch": 0.1861563557568232, + "grad_norm": 1.9470795392990112, + "learning_rate": 3.749724313079677e-05, + "loss": 0.85, + "step": 3823 + }, + { + "epoch": 0.18620504954593042, + "grad_norm": 2.076155424118042, + "learning_rate": 3.749571502580831e-05, + "loss": 0.9516, + "step": 3824 + }, + { + "epoch": 0.1862537433350376, + "grad_norm": 1.5718388557434082, + "learning_rate": 3.749418648561235e-05, + "loss": 0.8643, + "step": 3825 + }, + { + "epoch": 0.18630243712414482, + "grad_norm": 1.7071493864059448, + "learning_rate": 3.74926575102469e-05, + "loss": 0.9161, + "step": 3826 + }, + { + "epoch": 0.186351130913252, + "grad_norm": 3.020092010498047, + "learning_rate": 3.749112809975e-05, + "loss": 0.8857, + "step": 3827 + }, + { + "epoch": 0.18639982470235922, + "grad_norm": 1.6277496814727783, + "learning_rate": 3.74895982541597e-05, + "loss": 0.8918, + "step": 3828 + }, + { + "epoch": 0.1864485184914664, + "grad_norm": 1.7873846292495728, + "learning_rate": 3.748806797351404e-05, + "loss": 0.8259, + "step": 3829 + }, + { + "epoch": 0.18649721228057362, + "grad_norm": 1.8980507850646973, + "learning_rate": 3.7486537257851095e-05, + "loss": 0.8552, + "step": 3830 + }, + { + "epoch": 0.1865459060696808, + "grad_norm": 3.4379565715789795, + "learning_rate": 3.748500610720895e-05, + "loss": 0.9153, + "step": 3831 + }, + { + "epoch": 0.186594599858788, + "grad_norm": 1.6560262441635132, + "learning_rate": 3.748347452162568e-05, + "loss": 0.7839, + "step": 3832 + }, + { + "epoch": 0.1866432936478952, + "grad_norm": 2.8178329467773438, + "learning_rate": 3.7481942501139385e-05, + "loss": 0.8703, + "step": 3833 + }, + { + "epoch": 0.1866919874370024, + "grad_norm": 1.742787480354309, + "learning_rate": 3.748041004578817e-05, + "loss": 0.8264, + "step": 3834 + }, + { + "epoch": 0.18674068122610962, + "grad_norm": 1.888703465461731, + "learning_rate": 3.747887715561017e-05, + "loss": 0.7978, + "step": 3835 + }, + { + "epoch": 0.1867893750152168, + "grad_norm": 1.8070783615112305, + "learning_rate": 3.747734383064351e-05, + "loss": 0.8312, + "step": 3836 + }, + { + "epoch": 0.18683806880432402, + "grad_norm": 0.08739141374826431, + "learning_rate": 3.747581007092633e-05, + "loss": 0.6349, + "step": 3837 + }, + { + "epoch": 0.1868867625934312, + "grad_norm": 1.7692198753356934, + "learning_rate": 3.747427587649677e-05, + "loss": 0.9324, + "step": 3838 + }, + { + "epoch": 0.18693545638253842, + "grad_norm": 1.7832413911819458, + "learning_rate": 3.747274124739301e-05, + "loss": 0.8796, + "step": 3839 + }, + { + "epoch": 0.1869841501716456, + "grad_norm": 1.8170057535171509, + "learning_rate": 3.7471206183653214e-05, + "loss": 0.7715, + "step": 3840 + }, + { + "epoch": 0.18703284396075281, + "grad_norm": 1.8293732404708862, + "learning_rate": 3.746967068531558e-05, + "loss": 0.8517, + "step": 3841 + }, + { + "epoch": 0.18708153774986, + "grad_norm": 2.7927918434143066, + "learning_rate": 3.746813475241829e-05, + "loss": 0.8159, + "step": 3842 + }, + { + "epoch": 0.1871302315389672, + "grad_norm": 1.7355413436889648, + "learning_rate": 3.746659838499956e-05, + "loss": 0.8813, + "step": 3843 + }, + { + "epoch": 0.1871789253280744, + "grad_norm": 2.006958484649658, + "learning_rate": 3.74650615830976e-05, + "loss": 0.8959, + "step": 3844 + }, + { + "epoch": 0.1872276191171816, + "grad_norm": 0.08225816488265991, + "learning_rate": 3.7463524346750634e-05, + "loss": 0.6024, + "step": 3845 + }, + { + "epoch": 0.1872763129062888, + "grad_norm": 1.615657091140747, + "learning_rate": 3.7461986675996914e-05, + "loss": 0.8541, + "step": 3846 + }, + { + "epoch": 0.187325006695396, + "grad_norm": 5.31710958480835, + "learning_rate": 3.746044857087468e-05, + "loss": 0.8618, + "step": 3847 + }, + { + "epoch": 0.1873737004845032, + "grad_norm": 2.274784564971924, + "learning_rate": 3.74589100314222e-05, + "loss": 0.9201, + "step": 3848 + }, + { + "epoch": 0.1874223942736104, + "grad_norm": 1.5655792951583862, + "learning_rate": 3.745737105767774e-05, + "loss": 0.9394, + "step": 3849 + }, + { + "epoch": 0.1874710880627176, + "grad_norm": 1.5790150165557861, + "learning_rate": 3.745583164967958e-05, + "loss": 0.8836, + "step": 3850 + }, + { + "epoch": 0.1875197818518248, + "grad_norm": 1.6782690286636353, + "learning_rate": 3.745429180746602e-05, + "loss": 0.8173, + "step": 3851 + }, + { + "epoch": 0.187568475640932, + "grad_norm": 2.070636034011841, + "learning_rate": 3.745275153107536e-05, + "loss": 0.8802, + "step": 3852 + }, + { + "epoch": 0.1876171694300392, + "grad_norm": 1.559451699256897, + "learning_rate": 3.7451210820545914e-05, + "loss": 0.9309, + "step": 3853 + }, + { + "epoch": 0.1876658632191464, + "grad_norm": 1.8596405982971191, + "learning_rate": 3.7449669675916005e-05, + "loss": 0.9072, + "step": 3854 + }, + { + "epoch": 0.1877145570082536, + "grad_norm": 2.4529244899749756, + "learning_rate": 3.7448128097223975e-05, + "loss": 0.8805, + "step": 3855 + }, + { + "epoch": 0.1877632507973608, + "grad_norm": 1.6860300302505493, + "learning_rate": 3.744658608450816e-05, + "loss": 0.8236, + "step": 3856 + }, + { + "epoch": 0.187811944586468, + "grad_norm": 1.7261362075805664, + "learning_rate": 3.7445043637806935e-05, + "loss": 0.8759, + "step": 3857 + }, + { + "epoch": 0.1878606383755752, + "grad_norm": 0.09533445537090302, + "learning_rate": 3.744350075715866e-05, + "loss": 0.5725, + "step": 3858 + }, + { + "epoch": 0.1879093321646824, + "grad_norm": 1.7942560911178589, + "learning_rate": 3.7441957442601705e-05, + "loss": 0.9422, + "step": 3859 + }, + { + "epoch": 0.1879580259537896, + "grad_norm": 0.08034489303827286, + "learning_rate": 3.744041369417447e-05, + "loss": 0.6039, + "step": 3860 + }, + { + "epoch": 0.1880067197428968, + "grad_norm": 0.08889125287532806, + "learning_rate": 3.743886951191535e-05, + "loss": 0.6618, + "step": 3861 + }, + { + "epoch": 0.188055413532004, + "grad_norm": 2.1519017219543457, + "learning_rate": 3.743732489586277e-05, + "loss": 1.0164, + "step": 3862 + }, + { + "epoch": 0.18810410732111119, + "grad_norm": 1.7818812131881714, + "learning_rate": 3.743577984605514e-05, + "loss": 0.8761, + "step": 3863 + }, + { + "epoch": 0.1881528011102184, + "grad_norm": 2.0495541095733643, + "learning_rate": 3.74342343625309e-05, + "loss": 0.9593, + "step": 3864 + }, + { + "epoch": 0.18820149489932558, + "grad_norm": 1.8083723783493042, + "learning_rate": 3.7432688445328486e-05, + "loss": 0.9066, + "step": 3865 + }, + { + "epoch": 0.1882501886884328, + "grad_norm": 2.119205951690674, + "learning_rate": 3.743114209448636e-05, + "loss": 0.8351, + "step": 3866 + }, + { + "epoch": 0.18829888247753998, + "grad_norm": 2.106989860534668, + "learning_rate": 3.7429595310042984e-05, + "loss": 0.8889, + "step": 3867 + }, + { + "epoch": 0.1883475762666472, + "grad_norm": 1.2947561740875244, + "learning_rate": 3.742804809203684e-05, + "loss": 0.8855, + "step": 3868 + }, + { + "epoch": 0.18839627005575438, + "grad_norm": 1.5274553298950195, + "learning_rate": 3.74265004405064e-05, + "loss": 0.8765, + "step": 3869 + }, + { + "epoch": 0.1884449638448616, + "grad_norm": 1.7111992835998535, + "learning_rate": 3.7424952355490184e-05, + "loss": 0.8627, + "step": 3870 + }, + { + "epoch": 0.18849365763396878, + "grad_norm": 2.166473627090454, + "learning_rate": 3.742340383702668e-05, + "loss": 0.7649, + "step": 3871 + }, + { + "epoch": 0.188542351423076, + "grad_norm": 1.4911928176879883, + "learning_rate": 3.742185488515442e-05, + "loss": 0.8523, + "step": 3872 + }, + { + "epoch": 0.18859104521218317, + "grad_norm": 2.205199956893921, + "learning_rate": 3.742030549991193e-05, + "loss": 0.9544, + "step": 3873 + }, + { + "epoch": 0.18863973900129039, + "grad_norm": 2.9589219093322754, + "learning_rate": 3.741875568133776e-05, + "loss": 0.7623, + "step": 3874 + }, + { + "epoch": 0.1886884327903976, + "grad_norm": 1.8828768730163574, + "learning_rate": 3.741720542947045e-05, + "loss": 0.8775, + "step": 3875 + }, + { + "epoch": 0.18873712657950478, + "grad_norm": 2.1617751121520996, + "learning_rate": 3.741565474434857e-05, + "loss": 0.9347, + "step": 3876 + }, + { + "epoch": 0.188785820368612, + "grad_norm": 6.615904331207275, + "learning_rate": 3.7414103626010684e-05, + "loss": 0.8962, + "step": 3877 + }, + { + "epoch": 0.18883451415771918, + "grad_norm": 4.047945499420166, + "learning_rate": 3.7412552074495386e-05, + "loss": 0.7996, + "step": 3878 + }, + { + "epoch": 0.1888832079468264, + "grad_norm": 2.06672739982605, + "learning_rate": 3.7411000089841266e-05, + "loss": 0.9543, + "step": 3879 + }, + { + "epoch": 0.18893190173593358, + "grad_norm": 2.3452682495117188, + "learning_rate": 3.740944767208693e-05, + "loss": 0.833, + "step": 3880 + }, + { + "epoch": 0.1889805955250408, + "grad_norm": 2.330583333969116, + "learning_rate": 3.7407894821271e-05, + "loss": 0.8853, + "step": 3881 + }, + { + "epoch": 0.18902928931414797, + "grad_norm": 3.1881585121154785, + "learning_rate": 3.74063415374321e-05, + "loss": 0.9656, + "step": 3882 + }, + { + "epoch": 0.1890779831032552, + "grad_norm": 1.9220008850097656, + "learning_rate": 3.7404787820608856e-05, + "loss": 0.9147, + "step": 3883 + }, + { + "epoch": 0.18912667689236237, + "grad_norm": 2.752659797668457, + "learning_rate": 3.7403233670839935e-05, + "loss": 0.875, + "step": 3884 + }, + { + "epoch": 0.18917537068146958, + "grad_norm": 1.8944437503814697, + "learning_rate": 3.7401679088163994e-05, + "loss": 0.8923, + "step": 3885 + }, + { + "epoch": 0.18922406447057677, + "grad_norm": 1.5726172924041748, + "learning_rate": 3.740012407261969e-05, + "loss": 0.8851, + "step": 3886 + }, + { + "epoch": 0.18927275825968398, + "grad_norm": 2.308211326599121, + "learning_rate": 3.739856862424573e-05, + "loss": 0.8193, + "step": 3887 + }, + { + "epoch": 0.18932145204879117, + "grad_norm": 1.8817510604858398, + "learning_rate": 3.739701274308077e-05, + "loss": 0.8354, + "step": 3888 + }, + { + "epoch": 0.18937014583789838, + "grad_norm": 1.741493582725525, + "learning_rate": 3.7395456429163545e-05, + "loss": 0.8967, + "step": 3889 + }, + { + "epoch": 0.18941883962700556, + "grad_norm": 1.5960105657577515, + "learning_rate": 3.739389968253275e-05, + "loss": 0.8673, + "step": 3890 + }, + { + "epoch": 0.18946753341611278, + "grad_norm": 3.0849595069885254, + "learning_rate": 3.739234250322711e-05, + "loss": 0.8225, + "step": 3891 + }, + { + "epoch": 0.18951622720521996, + "grad_norm": 2.864272117614746, + "learning_rate": 3.7390784891285375e-05, + "loss": 0.8617, + "step": 3892 + }, + { + "epoch": 0.18956492099432717, + "grad_norm": 1.597131371498108, + "learning_rate": 3.738922684674627e-05, + "loss": 0.9457, + "step": 3893 + }, + { + "epoch": 0.1896136147834344, + "grad_norm": 2.654629945755005, + "learning_rate": 3.738766836964857e-05, + "loss": 0.869, + "step": 3894 + }, + { + "epoch": 0.18966230857254157, + "grad_norm": 0.08642509579658508, + "learning_rate": 3.738610946003102e-05, + "loss": 0.6241, + "step": 3895 + }, + { + "epoch": 0.18971100236164878, + "grad_norm": 4.384332180023193, + "learning_rate": 3.738455011793243e-05, + "loss": 0.9025, + "step": 3896 + }, + { + "epoch": 0.18975969615075597, + "grad_norm": 1.8746716976165771, + "learning_rate": 3.7382990343391565e-05, + "loss": 0.8534, + "step": 3897 + }, + { + "epoch": 0.18980838993986318, + "grad_norm": 2.186039686203003, + "learning_rate": 3.738143013644722e-05, + "loss": 0.8921, + "step": 3898 + }, + { + "epoch": 0.18985708372897037, + "grad_norm": 1.7860528230667114, + "learning_rate": 3.7379869497138226e-05, + "loss": 0.8964, + "step": 3899 + }, + { + "epoch": 0.18990577751807758, + "grad_norm": 2.4006757736206055, + "learning_rate": 3.737830842550339e-05, + "loss": 0.8467, + "step": 3900 + }, + { + "epoch": 0.18995447130718476, + "grad_norm": 2.278902530670166, + "learning_rate": 3.7376746921581554e-05, + "loss": 0.9535, + "step": 3901 + }, + { + "epoch": 0.19000316509629198, + "grad_norm": 1.5632364749908447, + "learning_rate": 3.7375184985411545e-05, + "loss": 0.9303, + "step": 3902 + }, + { + "epoch": 0.19005185888539916, + "grad_norm": 1.6996246576309204, + "learning_rate": 3.737362261703223e-05, + "loss": 0.949, + "step": 3903 + }, + { + "epoch": 0.19010055267450637, + "grad_norm": 0.08478222787380219, + "learning_rate": 3.737205981648247e-05, + "loss": 0.6637, + "step": 3904 + }, + { + "epoch": 0.19014924646361356, + "grad_norm": 1.840696096420288, + "learning_rate": 3.7370496583801135e-05, + "loss": 0.8145, + "step": 3905 + }, + { + "epoch": 0.19019794025272077, + "grad_norm": 1.9574917554855347, + "learning_rate": 3.736893291902712e-05, + "loss": 0.9182, + "step": 3906 + }, + { + "epoch": 0.19024663404182796, + "grad_norm": 4.1789960861206055, + "learning_rate": 3.73673688221993e-05, + "loss": 0.9361, + "step": 3907 + }, + { + "epoch": 0.19029532783093517, + "grad_norm": 2.0015501976013184, + "learning_rate": 3.7365804293356605e-05, + "loss": 0.8774, + "step": 3908 + }, + { + "epoch": 0.19034402162004235, + "grad_norm": 1.8983337879180908, + "learning_rate": 3.736423933253795e-05, + "loss": 0.9602, + "step": 3909 + }, + { + "epoch": 0.19039271540914957, + "grad_norm": 2.2785253524780273, + "learning_rate": 3.7362673939782255e-05, + "loss": 0.8788, + "step": 3910 + }, + { + "epoch": 0.19044140919825675, + "grad_norm": 1.982807993888855, + "learning_rate": 3.736110811512846e-05, + "loss": 1.0573, + "step": 3911 + }, + { + "epoch": 0.19049010298736396, + "grad_norm": 1.9447470903396606, + "learning_rate": 3.735954185861552e-05, + "loss": 0.9084, + "step": 3912 + }, + { + "epoch": 0.19053879677647115, + "grad_norm": 1.8920248746871948, + "learning_rate": 3.735797517028239e-05, + "loss": 0.8901, + "step": 3913 + }, + { + "epoch": 0.19058749056557836, + "grad_norm": 1.2761781215667725, + "learning_rate": 3.735640805016805e-05, + "loss": 0.9448, + "step": 3914 + }, + { + "epoch": 0.19063618435468557, + "grad_norm": 0.08298051357269287, + "learning_rate": 3.7354840498311475e-05, + "loss": 0.5887, + "step": 3915 + }, + { + "epoch": 0.19068487814379276, + "grad_norm": 1.7387431859970093, + "learning_rate": 3.735327251475166e-05, + "loss": 0.8884, + "step": 3916 + }, + { + "epoch": 0.19073357193289997, + "grad_norm": 2.039057493209839, + "learning_rate": 3.735170409952761e-05, + "loss": 1.0216, + "step": 3917 + }, + { + "epoch": 0.19078226572200715, + "grad_norm": 2.1027626991271973, + "learning_rate": 3.735013525267834e-05, + "loss": 0.8268, + "step": 3918 + }, + { + "epoch": 0.19083095951111437, + "grad_norm": 11.425809860229492, + "learning_rate": 3.734856597424287e-05, + "loss": 0.941, + "step": 3919 + }, + { + "epoch": 0.19087965330022155, + "grad_norm": 1.7905277013778687, + "learning_rate": 3.734699626426024e-05, + "loss": 0.8678, + "step": 3920 + }, + { + "epoch": 0.19092834708932876, + "grad_norm": 3.8740293979644775, + "learning_rate": 3.73454261227695e-05, + "loss": 0.9496, + "step": 3921 + }, + { + "epoch": 0.19097704087843595, + "grad_norm": 2.5337750911712646, + "learning_rate": 3.7343855549809706e-05, + "loss": 0.8234, + "step": 3922 + }, + { + "epoch": 0.19102573466754316, + "grad_norm": 1.739972710609436, + "learning_rate": 3.734228454541992e-05, + "loss": 0.8954, + "step": 3923 + }, + { + "epoch": 0.19107442845665035, + "grad_norm": 1.5905241966247559, + "learning_rate": 3.7340713109639215e-05, + "loss": 0.9648, + "step": 3924 + }, + { + "epoch": 0.19112312224575756, + "grad_norm": 4.486367702484131, + "learning_rate": 3.733914124250671e-05, + "loss": 0.9815, + "step": 3925 + }, + { + "epoch": 0.19117181603486474, + "grad_norm": 5.603412628173828, + "learning_rate": 3.733756894406147e-05, + "loss": 0.8452, + "step": 3926 + }, + { + "epoch": 0.19122050982397196, + "grad_norm": 1.802276849746704, + "learning_rate": 3.733599621434263e-05, + "loss": 0.8575, + "step": 3927 + }, + { + "epoch": 0.19126920361307914, + "grad_norm": 1.4846950769424438, + "learning_rate": 3.7334423053389296e-05, + "loss": 0.8591, + "step": 3928 + }, + { + "epoch": 0.19131789740218635, + "grad_norm": 1.5802052021026611, + "learning_rate": 3.7332849461240614e-05, + "loss": 0.9856, + "step": 3929 + }, + { + "epoch": 0.19136659119129354, + "grad_norm": 1.7512506246566772, + "learning_rate": 3.7331275437935717e-05, + "loss": 0.9368, + "step": 3930 + }, + { + "epoch": 0.19141528498040075, + "grad_norm": 1.8727933168411255, + "learning_rate": 3.732970098351376e-05, + "loss": 0.8872, + "step": 3931 + }, + { + "epoch": 0.19146397876950794, + "grad_norm": 1.9891972541809082, + "learning_rate": 3.732812609801393e-05, + "loss": 0.8269, + "step": 3932 + }, + { + "epoch": 0.19151267255861515, + "grad_norm": 1.581730842590332, + "learning_rate": 3.732655078147537e-05, + "loss": 0.8887, + "step": 3933 + }, + { + "epoch": 0.19156136634772236, + "grad_norm": 0.0906965509057045, + "learning_rate": 3.7324975033937274e-05, + "loss": 0.7087, + "step": 3934 + }, + { + "epoch": 0.19161006013682955, + "grad_norm": 1.7526415586471558, + "learning_rate": 3.732339885543885e-05, + "loss": 0.9195, + "step": 3935 + }, + { + "epoch": 0.19165875392593676, + "grad_norm": 2.5180513858795166, + "learning_rate": 3.73218222460193e-05, + "loss": 0.8705, + "step": 3936 + }, + { + "epoch": 0.19170744771504394, + "grad_norm": 1.6814167499542236, + "learning_rate": 3.732024520571784e-05, + "loss": 0.9354, + "step": 3937 + }, + { + "epoch": 0.19175614150415116, + "grad_norm": 2.274474859237671, + "learning_rate": 3.7318667734573706e-05, + "loss": 0.8355, + "step": 3938 + }, + { + "epoch": 0.19180483529325834, + "grad_norm": 0.08849556744098663, + "learning_rate": 3.7317089832626135e-05, + "loss": 0.5957, + "step": 3939 + }, + { + "epoch": 0.19185352908236555, + "grad_norm": 2.0374796390533447, + "learning_rate": 3.731551149991437e-05, + "loss": 0.9492, + "step": 3940 + }, + { + "epoch": 0.19190222287147274, + "grad_norm": 2.7624917030334473, + "learning_rate": 3.7313932736477675e-05, + "loss": 0.9068, + "step": 3941 + }, + { + "epoch": 0.19195091666057995, + "grad_norm": 6.126926898956299, + "learning_rate": 3.731235354235533e-05, + "loss": 0.9136, + "step": 3942 + }, + { + "epoch": 0.19199961044968714, + "grad_norm": 1.685504674911499, + "learning_rate": 3.731077391758661e-05, + "loss": 0.8774, + "step": 3943 + }, + { + "epoch": 0.19204830423879435, + "grad_norm": 2.175595760345459, + "learning_rate": 3.7309193862210815e-05, + "loss": 0.8256, + "step": 3944 + }, + { + "epoch": 0.19209699802790153, + "grad_norm": 1.917484164237976, + "learning_rate": 3.730761337626724e-05, + "loss": 0.91, + "step": 3945 + }, + { + "epoch": 0.19214569181700875, + "grad_norm": 1.4737317562103271, + "learning_rate": 3.73060324597952e-05, + "loss": 0.8988, + "step": 3946 + }, + { + "epoch": 0.19219438560611593, + "grad_norm": 1.68598473072052, + "learning_rate": 3.730445111283404e-05, + "loss": 0.9071, + "step": 3947 + }, + { + "epoch": 0.19224307939522314, + "grad_norm": 2.091818332672119, + "learning_rate": 3.730286933542306e-05, + "loss": 0.9295, + "step": 3948 + }, + { + "epoch": 0.19229177318433033, + "grad_norm": 1.7596523761749268, + "learning_rate": 3.7301287127601643e-05, + "loss": 0.9301, + "step": 3949 + }, + { + "epoch": 0.19234046697343754, + "grad_norm": 1.8993675708770752, + "learning_rate": 3.729970448940913e-05, + "loss": 0.8552, + "step": 3950 + }, + { + "epoch": 0.19238916076254473, + "grad_norm": 1.7316235303878784, + "learning_rate": 3.729812142088488e-05, + "loss": 0.8988, + "step": 3951 + }, + { + "epoch": 0.19243785455165194, + "grad_norm": 2.3086044788360596, + "learning_rate": 3.7296537922068295e-05, + "loss": 0.8785, + "step": 3952 + }, + { + "epoch": 0.19248654834075912, + "grad_norm": 1.9640522003173828, + "learning_rate": 3.7294953992998746e-05, + "loss": 0.8347, + "step": 3953 + }, + { + "epoch": 0.19253524212986634, + "grad_norm": 2.0606377124786377, + "learning_rate": 3.7293369633715634e-05, + "loss": 0.8683, + "step": 3954 + }, + { + "epoch": 0.19258393591897355, + "grad_norm": 1.6825504302978516, + "learning_rate": 3.729178484425838e-05, + "loss": 0.817, + "step": 3955 + }, + { + "epoch": 0.19263262970808073, + "grad_norm": 1.5014859437942505, + "learning_rate": 3.729019962466641e-05, + "loss": 0.9303, + "step": 3956 + }, + { + "epoch": 0.19268132349718795, + "grad_norm": 2.314737319946289, + "learning_rate": 3.728861397497914e-05, + "loss": 0.9187, + "step": 3957 + }, + { + "epoch": 0.19273001728629513, + "grad_norm": 1.6752618551254272, + "learning_rate": 3.728702789523602e-05, + "loss": 0.9119, + "step": 3958 + }, + { + "epoch": 0.19277871107540234, + "grad_norm": 1.7518306970596313, + "learning_rate": 3.72854413854765e-05, + "loss": 0.918, + "step": 3959 + }, + { + "epoch": 0.19282740486450953, + "grad_norm": 1.5249801874160767, + "learning_rate": 3.728385444574006e-05, + "loss": 0.9112, + "step": 3960 + }, + { + "epoch": 0.19287609865361674, + "grad_norm": 1.884717583656311, + "learning_rate": 3.7282267076066164e-05, + "loss": 0.8989, + "step": 3961 + }, + { + "epoch": 0.19292479244272392, + "grad_norm": 1.7713005542755127, + "learning_rate": 3.72806792764943e-05, + "loss": 0.9283, + "step": 3962 + }, + { + "epoch": 0.19297348623183114, + "grad_norm": 1.7499372959136963, + "learning_rate": 3.727909104706395e-05, + "loss": 0.8445, + "step": 3963 + }, + { + "epoch": 0.19302218002093832, + "grad_norm": 1.530694842338562, + "learning_rate": 3.727750238781465e-05, + "loss": 0.8782, + "step": 3964 + }, + { + "epoch": 0.19307087381004553, + "grad_norm": 2.636190891265869, + "learning_rate": 3.72759132987859e-05, + "loss": 0.8984, + "step": 3965 + }, + { + "epoch": 0.19311956759915272, + "grad_norm": 1.7805168628692627, + "learning_rate": 3.7274323780017225e-05, + "loss": 0.8267, + "step": 3966 + }, + { + "epoch": 0.19316826138825993, + "grad_norm": 2.0840442180633545, + "learning_rate": 3.727273383154817e-05, + "loss": 0.9242, + "step": 3967 + }, + { + "epoch": 0.19321695517736712, + "grad_norm": 1.6512812376022339, + "learning_rate": 3.7271143453418296e-05, + "loss": 0.9218, + "step": 3968 + }, + { + "epoch": 0.19326564896647433, + "grad_norm": 2.6735105514526367, + "learning_rate": 3.726955264566715e-05, + "loss": 0.8366, + "step": 3969 + }, + { + "epoch": 0.19331434275558151, + "grad_norm": 2.5083837509155273, + "learning_rate": 3.7267961408334305e-05, + "loss": 0.7995, + "step": 3970 + }, + { + "epoch": 0.19336303654468873, + "grad_norm": 1.690268635749817, + "learning_rate": 3.7266369741459345e-05, + "loss": 0.9103, + "step": 3971 + }, + { + "epoch": 0.1934117303337959, + "grad_norm": 1.7870128154754639, + "learning_rate": 3.726477764508186e-05, + "loss": 0.8102, + "step": 3972 + }, + { + "epoch": 0.19346042412290312, + "grad_norm": 1.4742798805236816, + "learning_rate": 3.726318511924147e-05, + "loss": 0.8645, + "step": 3973 + }, + { + "epoch": 0.19350911791201034, + "grad_norm": 1.891880989074707, + "learning_rate": 3.7261592163977766e-05, + "loss": 0.9422, + "step": 3974 + }, + { + "epoch": 0.19355781170111752, + "grad_norm": 1.7561542987823486, + "learning_rate": 3.725999877933038e-05, + "loss": 0.9291, + "step": 3975 + }, + { + "epoch": 0.19360650549022473, + "grad_norm": 1.7785016298294067, + "learning_rate": 3.725840496533896e-05, + "loss": 0.9181, + "step": 3976 + }, + { + "epoch": 0.19365519927933192, + "grad_norm": 5.472272872924805, + "learning_rate": 3.725681072204314e-05, + "loss": 0.9906, + "step": 3977 + }, + { + "epoch": 0.19370389306843913, + "grad_norm": 1.9146860837936401, + "learning_rate": 3.725521604948258e-05, + "loss": 1.0018, + "step": 3978 + }, + { + "epoch": 0.19375258685754632, + "grad_norm": 3.881866693496704, + "learning_rate": 3.7253620947696945e-05, + "loss": 0.8222, + "step": 3979 + }, + { + "epoch": 0.19380128064665353, + "grad_norm": 1.9558072090148926, + "learning_rate": 3.725202541672591e-05, + "loss": 0.8625, + "step": 3980 + }, + { + "epoch": 0.1938499744357607, + "grad_norm": 2.382537603378296, + "learning_rate": 3.725042945660918e-05, + "loss": 0.9484, + "step": 3981 + }, + { + "epoch": 0.19389866822486793, + "grad_norm": 1.8945958614349365, + "learning_rate": 3.7248833067386436e-05, + "loss": 0.9444, + "step": 3982 + }, + { + "epoch": 0.1939473620139751, + "grad_norm": 0.09233897179365158, + "learning_rate": 3.7247236249097404e-05, + "loss": 0.6095, + "step": 3983 + }, + { + "epoch": 0.19399605580308232, + "grad_norm": 2.760970115661621, + "learning_rate": 3.724563900178179e-05, + "loss": 0.8804, + "step": 3984 + }, + { + "epoch": 0.1940447495921895, + "grad_norm": 1.857904076576233, + "learning_rate": 3.7244041325479345e-05, + "loss": 0.8497, + "step": 3985 + }, + { + "epoch": 0.19409344338129672, + "grad_norm": 1.5872946977615356, + "learning_rate": 3.724244322022979e-05, + "loss": 0.8547, + "step": 3986 + }, + { + "epoch": 0.1941421371704039, + "grad_norm": 1.6455637216567993, + "learning_rate": 3.7240844686072894e-05, + "loss": 0.9482, + "step": 3987 + }, + { + "epoch": 0.19419083095951112, + "grad_norm": 2.049403667449951, + "learning_rate": 3.7239245723048415e-05, + "loss": 0.9579, + "step": 3988 + }, + { + "epoch": 0.1942395247486183, + "grad_norm": 2.2360219955444336, + "learning_rate": 3.723764633119612e-05, + "loss": 0.881, + "step": 3989 + }, + { + "epoch": 0.19428821853772552, + "grad_norm": 1.9816739559173584, + "learning_rate": 3.7236046510555804e-05, + "loss": 0.8709, + "step": 3990 + }, + { + "epoch": 0.1943369123268327, + "grad_norm": 1.9766119718551636, + "learning_rate": 3.7234446261167264e-05, + "loss": 0.9625, + "step": 3991 + }, + { + "epoch": 0.1943856061159399, + "grad_norm": 0.08508434146642685, + "learning_rate": 3.723284558307029e-05, + "loss": 0.6104, + "step": 3992 + }, + { + "epoch": 0.1944342999050471, + "grad_norm": 1.946776032447815, + "learning_rate": 3.723124447630472e-05, + "loss": 0.9121, + "step": 3993 + }, + { + "epoch": 0.1944829936941543, + "grad_norm": 2.205904483795166, + "learning_rate": 3.7229642940910376e-05, + "loss": 0.8929, + "step": 3994 + }, + { + "epoch": 0.19453168748326152, + "grad_norm": 1.934114933013916, + "learning_rate": 3.722804097692708e-05, + "loss": 0.8744, + "step": 3995 + }, + { + "epoch": 0.1945803812723687, + "grad_norm": 2.496234178543091, + "learning_rate": 3.7226438584394706e-05, + "loss": 0.9503, + "step": 3996 + }, + { + "epoch": 0.19462907506147592, + "grad_norm": 1.497297763824463, + "learning_rate": 3.72248357633531e-05, + "loss": 0.8875, + "step": 3997 + }, + { + "epoch": 0.1946777688505831, + "grad_norm": 1.535130262374878, + "learning_rate": 3.7223232513842136e-05, + "loss": 0.9763, + "step": 3998 + }, + { + "epoch": 0.19472646263969032, + "grad_norm": 1.916399598121643, + "learning_rate": 3.722162883590169e-05, + "loss": 0.9382, + "step": 3999 + }, + { + "epoch": 0.1947751564287975, + "grad_norm": 2.156188488006592, + "learning_rate": 3.722002472957165e-05, + "loss": 0.8964, + "step": 4000 + }, + { + "epoch": 0.19482385021790471, + "grad_norm": 1.9316259622573853, + "learning_rate": 3.721842019489193e-05, + "loss": 0.854, + "step": 4001 + }, + { + "epoch": 0.1948725440070119, + "grad_norm": 2.3570148944854736, + "learning_rate": 3.7216815231902436e-05, + "loss": 0.9848, + "step": 4002 + }, + { + "epoch": 0.1949212377961191, + "grad_norm": 1.7745072841644287, + "learning_rate": 3.7215209840643094e-05, + "loss": 0.8662, + "step": 4003 + }, + { + "epoch": 0.1949699315852263, + "grad_norm": 1.6347572803497314, + "learning_rate": 3.721360402115384e-05, + "loss": 1.017, + "step": 4004 + }, + { + "epoch": 0.1950186253743335, + "grad_norm": 9.445150375366211, + "learning_rate": 3.7211997773474615e-05, + "loss": 0.8081, + "step": 4005 + }, + { + "epoch": 0.1950673191634407, + "grad_norm": 0.08272973448038101, + "learning_rate": 3.7210391097645376e-05, + "loss": 0.5779, + "step": 4006 + }, + { + "epoch": 0.1951160129525479, + "grad_norm": 1.974456548690796, + "learning_rate": 3.720878399370609e-05, + "loss": 0.9056, + "step": 4007 + }, + { + "epoch": 0.1951647067416551, + "grad_norm": 2.210252285003662, + "learning_rate": 3.7207176461696734e-05, + "loss": 0.8098, + "step": 4008 + }, + { + "epoch": 0.1952134005307623, + "grad_norm": 2.0865278244018555, + "learning_rate": 3.720556850165729e-05, + "loss": 0.9449, + "step": 4009 + }, + { + "epoch": 0.1952620943198695, + "grad_norm": 1.78068208694458, + "learning_rate": 3.7203960113627764e-05, + "loss": 0.9701, + "step": 4010 + }, + { + "epoch": 0.1953107881089767, + "grad_norm": 1.6679236888885498, + "learning_rate": 3.7202351297648164e-05, + "loss": 0.9273, + "step": 4011 + }, + { + "epoch": 0.1953594818980839, + "grad_norm": 1.6368011236190796, + "learning_rate": 3.7200742053758504e-05, + "loss": 0.9539, + "step": 4012 + }, + { + "epoch": 0.1954081756871911, + "grad_norm": 2.5270001888275146, + "learning_rate": 3.719913238199882e-05, + "loss": 0.942, + "step": 4013 + }, + { + "epoch": 0.1954568694762983, + "grad_norm": 2.000950336456299, + "learning_rate": 3.7197522282409144e-05, + "loss": 0.8816, + "step": 4014 + }, + { + "epoch": 0.1955055632654055, + "grad_norm": 4.254615783691406, + "learning_rate": 3.7195911755029534e-05, + "loss": 0.9404, + "step": 4015 + }, + { + "epoch": 0.1955542570545127, + "grad_norm": 2.1067702770233154, + "learning_rate": 3.7194300799900054e-05, + "loss": 0.876, + "step": 4016 + }, + { + "epoch": 0.1956029508436199, + "grad_norm": 1.6833124160766602, + "learning_rate": 3.719268941706077e-05, + "loss": 0.9416, + "step": 4017 + }, + { + "epoch": 0.1956516446327271, + "grad_norm": 2.6370129585266113, + "learning_rate": 3.7191077606551777e-05, + "loss": 0.9222, + "step": 4018 + }, + { + "epoch": 0.1957003384218343, + "grad_norm": 1.8260117769241333, + "learning_rate": 3.7189465368413156e-05, + "loss": 0.9122, + "step": 4019 + }, + { + "epoch": 0.1957490322109415, + "grad_norm": 1.523635983467102, + "learning_rate": 3.718785270268502e-05, + "loss": 0.8625, + "step": 4020 + }, + { + "epoch": 0.1957977260000487, + "grad_norm": 1.777377963066101, + "learning_rate": 3.7186239609407476e-05, + "loss": 1.0409, + "step": 4021 + }, + { + "epoch": 0.1958464197891559, + "grad_norm": 1.3564857244491577, + "learning_rate": 3.7184626088620654e-05, + "loss": 0.9638, + "step": 4022 + }, + { + "epoch": 0.19589511357826309, + "grad_norm": 3.6817400455474854, + "learning_rate": 3.71830121403647e-05, + "loss": 0.8771, + "step": 4023 + }, + { + "epoch": 0.1959438073673703, + "grad_norm": 2.1159844398498535, + "learning_rate": 3.718139776467974e-05, + "loss": 0.8991, + "step": 4024 + }, + { + "epoch": 0.19599250115647748, + "grad_norm": 1.6644021272659302, + "learning_rate": 3.7179782961605954e-05, + "loss": 0.9404, + "step": 4025 + }, + { + "epoch": 0.1960411949455847, + "grad_norm": 1.5135982036590576, + "learning_rate": 3.71781677311835e-05, + "loss": 0.8687, + "step": 4026 + }, + { + "epoch": 0.19608988873469188, + "grad_norm": 2.534468412399292, + "learning_rate": 3.717655207345256e-05, + "loss": 0.8654, + "step": 4027 + }, + { + "epoch": 0.1961385825237991, + "grad_norm": 1.6998950242996216, + "learning_rate": 3.717493598845331e-05, + "loss": 0.9039, + "step": 4028 + }, + { + "epoch": 0.19618727631290628, + "grad_norm": 1.8349347114562988, + "learning_rate": 3.717331947622597e-05, + "loss": 0.9153, + "step": 4029 + }, + { + "epoch": 0.1962359701020135, + "grad_norm": 2.2955551147460938, + "learning_rate": 3.717170253681074e-05, + "loss": 1.0158, + "step": 4030 + }, + { + "epoch": 0.19628466389112068, + "grad_norm": 8.966780662536621, + "learning_rate": 3.717008517024784e-05, + "loss": 0.89, + "step": 4031 + }, + { + "epoch": 0.1963333576802279, + "grad_norm": 0.09055303782224655, + "learning_rate": 3.716846737657751e-05, + "loss": 0.7103, + "step": 4032 + }, + { + "epoch": 0.19638205146933507, + "grad_norm": 2.233656644821167, + "learning_rate": 3.7166849155839993e-05, + "loss": 0.9219, + "step": 4033 + }, + { + "epoch": 0.19643074525844229, + "grad_norm": 4.338805675506592, + "learning_rate": 3.7165230508075525e-05, + "loss": 0.8277, + "step": 4034 + }, + { + "epoch": 0.1964794390475495, + "grad_norm": 2.4099137783050537, + "learning_rate": 3.71636114333244e-05, + "loss": 0.9179, + "step": 4035 + }, + { + "epoch": 0.19652813283665668, + "grad_norm": 2.050842046737671, + "learning_rate": 3.7161991931626856e-05, + "loss": 0.9185, + "step": 4036 + }, + { + "epoch": 0.1965768266257639, + "grad_norm": 2.1564595699310303, + "learning_rate": 3.716037200302321e-05, + "loss": 0.9305, + "step": 4037 + }, + { + "epoch": 0.19662552041487108, + "grad_norm": 2.3632237911224365, + "learning_rate": 3.715875164755375e-05, + "loss": 0.9838, + "step": 4038 + }, + { + "epoch": 0.1966742142039783, + "grad_norm": 3.3910505771636963, + "learning_rate": 3.7157130865258764e-05, + "loss": 0.9526, + "step": 4039 + }, + { + "epoch": 0.19672290799308548, + "grad_norm": 1.4134598970413208, + "learning_rate": 3.7155509656178594e-05, + "loss": 0.9209, + "step": 4040 + }, + { + "epoch": 0.1967716017821927, + "grad_norm": 1.6634048223495483, + "learning_rate": 3.7153888020353555e-05, + "loss": 0.9197, + "step": 4041 + }, + { + "epoch": 0.19682029557129987, + "grad_norm": 7.81432580947876, + "learning_rate": 3.7152265957823994e-05, + "loss": 0.8467, + "step": 4042 + }, + { + "epoch": 0.1968689893604071, + "grad_norm": 1.963071584701538, + "learning_rate": 3.715064346863024e-05, + "loss": 0.9174, + "step": 4043 + }, + { + "epoch": 0.19691768314951427, + "grad_norm": 2.3069310188293457, + "learning_rate": 3.714902055281267e-05, + "loss": 0.8191, + "step": 4044 + }, + { + "epoch": 0.19696637693862148, + "grad_norm": 2.202279567718506, + "learning_rate": 3.714739721041165e-05, + "loss": 0.9832, + "step": 4045 + }, + { + "epoch": 0.19701507072772867, + "grad_norm": 1.926649808883667, + "learning_rate": 3.7145773441467574e-05, + "loss": 0.8682, + "step": 4046 + }, + { + "epoch": 0.19706376451683588, + "grad_norm": 1.9650298357009888, + "learning_rate": 3.714414924602081e-05, + "loss": 0.8513, + "step": 4047 + }, + { + "epoch": 0.19711245830594307, + "grad_norm": 1.8504153490066528, + "learning_rate": 3.7142524624111776e-05, + "loss": 0.9738, + "step": 4048 + }, + { + "epoch": 0.19716115209505028, + "grad_norm": 1.9836727380752563, + "learning_rate": 3.714089957578087e-05, + "loss": 0.9415, + "step": 4049 + }, + { + "epoch": 0.19720984588415746, + "grad_norm": 2.6317059993743896, + "learning_rate": 3.713927410106853e-05, + "loss": 0.8912, + "step": 4050 + }, + { + "epoch": 0.19725853967326468, + "grad_norm": 3.4153640270233154, + "learning_rate": 3.713764820001518e-05, + "loss": 0.7999, + "step": 4051 + }, + { + "epoch": 0.19730723346237186, + "grad_norm": 2.966853141784668, + "learning_rate": 3.7136021872661266e-05, + "loss": 0.8841, + "step": 4052 + }, + { + "epoch": 0.19735592725147907, + "grad_norm": 1.3485785722732544, + "learning_rate": 3.713439511904725e-05, + "loss": 0.8629, + "step": 4053 + }, + { + "epoch": 0.1974046210405863, + "grad_norm": 1.6117662191390991, + "learning_rate": 3.71327679392136e-05, + "loss": 0.9214, + "step": 4054 + }, + { + "epoch": 0.19745331482969347, + "grad_norm": 1.9967679977416992, + "learning_rate": 3.713114033320077e-05, + "loss": 0.8975, + "step": 4055 + }, + { + "epoch": 0.19750200861880068, + "grad_norm": 2.125709295272827, + "learning_rate": 3.7129512301049276e-05, + "loss": 0.8815, + "step": 4056 + }, + { + "epoch": 0.19755070240790787, + "grad_norm": 1.932671308517456, + "learning_rate": 3.712788384279959e-05, + "loss": 0.9784, + "step": 4057 + }, + { + "epoch": 0.19759939619701508, + "grad_norm": 4.004705905914307, + "learning_rate": 3.712625495849224e-05, + "loss": 0.8531, + "step": 4058 + }, + { + "epoch": 0.19764808998612227, + "grad_norm": 1.5900218486785889, + "learning_rate": 3.712462564816773e-05, + "loss": 0.9329, + "step": 4059 + }, + { + "epoch": 0.19769678377522948, + "grad_norm": 1.7821043729782104, + "learning_rate": 3.712299591186661e-05, + "loss": 0.9015, + "step": 4060 + }, + { + "epoch": 0.19774547756433666, + "grad_norm": 1.8050429821014404, + "learning_rate": 3.712136574962939e-05, + "loss": 0.8323, + "step": 4061 + }, + { + "epoch": 0.19779417135344388, + "grad_norm": 2.046506643295288, + "learning_rate": 3.711973516149664e-05, + "loss": 0.9928, + "step": 4062 + }, + { + "epoch": 0.19784286514255106, + "grad_norm": 1.518162727355957, + "learning_rate": 3.711810414750892e-05, + "loss": 0.8909, + "step": 4063 + }, + { + "epoch": 0.19789155893165827, + "grad_norm": 1.5718108415603638, + "learning_rate": 3.71164727077068e-05, + "loss": 0.9073, + "step": 4064 + }, + { + "epoch": 0.19794025272076546, + "grad_norm": 1.39528489112854, + "learning_rate": 3.711484084213086e-05, + "loss": 0.8292, + "step": 4065 + }, + { + "epoch": 0.19798894650987267, + "grad_norm": 1.984727382659912, + "learning_rate": 3.711320855082169e-05, + "loss": 0.8771, + "step": 4066 + }, + { + "epoch": 0.19803764029897986, + "grad_norm": 2.3051295280456543, + "learning_rate": 3.7111575833819894e-05, + "loss": 0.945, + "step": 4067 + }, + { + "epoch": 0.19808633408808707, + "grad_norm": 2.220778465270996, + "learning_rate": 3.7109942691166095e-05, + "loss": 0.8179, + "step": 4068 + }, + { + "epoch": 0.19813502787719425, + "grad_norm": 1.5152004957199097, + "learning_rate": 3.710830912290091e-05, + "loss": 0.8485, + "step": 4069 + }, + { + "epoch": 0.19818372166630147, + "grad_norm": 2.8651089668273926, + "learning_rate": 3.710667512906498e-05, + "loss": 0.8887, + "step": 4070 + }, + { + "epoch": 0.19823241545540865, + "grad_norm": 1.7604786157608032, + "learning_rate": 3.710504070969894e-05, + "loss": 0.9079, + "step": 4071 + }, + { + "epoch": 0.19828110924451586, + "grad_norm": 1.3390181064605713, + "learning_rate": 3.7103405864843456e-05, + "loss": 0.9038, + "step": 4072 + }, + { + "epoch": 0.19832980303362305, + "grad_norm": 5.134512901306152, + "learning_rate": 3.710177059453919e-05, + "loss": 0.8498, + "step": 4073 + }, + { + "epoch": 0.19837849682273026, + "grad_norm": 1.9284257888793945, + "learning_rate": 3.7100134898826824e-05, + "loss": 0.8988, + "step": 4074 + }, + { + "epoch": 0.19842719061183747, + "grad_norm": 1.689728856086731, + "learning_rate": 3.709849877774704e-05, + "loss": 0.8633, + "step": 4075 + }, + { + "epoch": 0.19847588440094466, + "grad_norm": 3.323564052581787, + "learning_rate": 3.709686223134054e-05, + "loss": 0.8612, + "step": 4076 + }, + { + "epoch": 0.19852457819005187, + "grad_norm": 2.1065869331359863, + "learning_rate": 3.7095225259648035e-05, + "loss": 0.8247, + "step": 4077 + }, + { + "epoch": 0.19857327197915906, + "grad_norm": 2.636730194091797, + "learning_rate": 3.709358786271024e-05, + "loss": 0.8458, + "step": 4078 + }, + { + "epoch": 0.19862196576826627, + "grad_norm": 1.4487488269805908, + "learning_rate": 3.709195004056789e-05, + "loss": 0.8623, + "step": 4079 + }, + { + "epoch": 0.19867065955737345, + "grad_norm": 1.636948585510254, + "learning_rate": 3.709031179326173e-05, + "loss": 0.9053, + "step": 4080 + }, + { + "epoch": 0.19871935334648066, + "grad_norm": 2.266115427017212, + "learning_rate": 3.70886731208325e-05, + "loss": 0.8179, + "step": 4081 + }, + { + "epoch": 0.19876804713558785, + "grad_norm": 1.5311158895492554, + "learning_rate": 3.7087034023320964e-05, + "loss": 0.9573, + "step": 4082 + }, + { + "epoch": 0.19881674092469506, + "grad_norm": 1.7813076972961426, + "learning_rate": 3.7085394500767904e-05, + "loss": 0.9533, + "step": 4083 + }, + { + "epoch": 0.19886543471380225, + "grad_norm": 1.6860449314117432, + "learning_rate": 3.70837545532141e-05, + "loss": 0.8064, + "step": 4084 + }, + { + "epoch": 0.19891412850290946, + "grad_norm": 1.6032253503799438, + "learning_rate": 3.708211418070034e-05, + "loss": 0.9007, + "step": 4085 + }, + { + "epoch": 0.19896282229201664, + "grad_norm": 1.8428791761398315, + "learning_rate": 3.708047338326744e-05, + "loss": 0.9366, + "step": 4086 + }, + { + "epoch": 0.19901151608112386, + "grad_norm": 2.0310373306274414, + "learning_rate": 3.707883216095619e-05, + "loss": 0.8739, + "step": 4087 + }, + { + "epoch": 0.19906020987023104, + "grad_norm": 2.142522096633911, + "learning_rate": 3.7077190513807445e-05, + "loss": 0.9053, + "step": 4088 + }, + { + "epoch": 0.19910890365933825, + "grad_norm": 6.375394821166992, + "learning_rate": 3.707554844186203e-05, + "loss": 0.9298, + "step": 4089 + }, + { + "epoch": 0.19915759744844544, + "grad_norm": 2.36881947517395, + "learning_rate": 3.7073905945160784e-05, + "loss": 0.9107, + "step": 4090 + }, + { + "epoch": 0.19920629123755265, + "grad_norm": 2.091432809829712, + "learning_rate": 3.7072263023744576e-05, + "loss": 0.8489, + "step": 4091 + }, + { + "epoch": 0.19925498502665984, + "grad_norm": 0.08812034875154495, + "learning_rate": 3.707061967765427e-05, + "loss": 0.6317, + "step": 4092 + }, + { + "epoch": 0.19930367881576705, + "grad_norm": 1.720378041267395, + "learning_rate": 3.706897590693074e-05, + "loss": 0.9555, + "step": 4093 + }, + { + "epoch": 0.19935237260487426, + "grad_norm": 1.6082528829574585, + "learning_rate": 3.706733171161488e-05, + "loss": 0.8811, + "step": 4094 + }, + { + "epoch": 0.19940106639398145, + "grad_norm": 1.5412973165512085, + "learning_rate": 3.7065687091747584e-05, + "loss": 0.8812, + "step": 4095 + }, + { + "epoch": 0.19944976018308866, + "grad_norm": 2.0239508152008057, + "learning_rate": 3.706404204736977e-05, + "loss": 0.9411, + "step": 4096 + }, + { + "epoch": 0.19949845397219584, + "grad_norm": 1.7078051567077637, + "learning_rate": 3.706239657852235e-05, + "loss": 0.825, + "step": 4097 + }, + { + "epoch": 0.19954714776130306, + "grad_norm": 1.898362636566162, + "learning_rate": 3.7060750685246265e-05, + "loss": 0.8611, + "step": 4098 + }, + { + "epoch": 0.19959584155041024, + "grad_norm": 1.554080843925476, + "learning_rate": 3.7059104367582446e-05, + "loss": 0.8138, + "step": 4099 + }, + { + "epoch": 0.19964453533951745, + "grad_norm": 2.5101919174194336, + "learning_rate": 3.705745762557185e-05, + "loss": 0.8331, + "step": 4100 + }, + { + "epoch": 0.19969322912862464, + "grad_norm": 1.893354058265686, + "learning_rate": 3.705581045925544e-05, + "loss": 0.9429, + "step": 4101 + }, + { + "epoch": 0.19974192291773185, + "grad_norm": 1.6033051013946533, + "learning_rate": 3.7054162868674196e-05, + "loss": 0.8412, + "step": 4102 + }, + { + "epoch": 0.19979061670683904, + "grad_norm": 1.5124577283859253, + "learning_rate": 3.7052514853869096e-05, + "loss": 0.9369, + "step": 4103 + }, + { + "epoch": 0.19983931049594625, + "grad_norm": 5.430034160614014, + "learning_rate": 3.705086641488113e-05, + "loss": 0.8938, + "step": 4104 + }, + { + "epoch": 0.19988800428505343, + "grad_norm": 1.6391701698303223, + "learning_rate": 3.70492175517513e-05, + "loss": 0.9346, + "step": 4105 + }, + { + "epoch": 0.19993669807416065, + "grad_norm": 1.5571424961090088, + "learning_rate": 3.7047568264520644e-05, + "loss": 0.9107, + "step": 4106 + }, + { + "epoch": 0.19998539186326783, + "grad_norm": 1.748121738433838, + "learning_rate": 3.7045918553230164e-05, + "loss": 0.9382, + "step": 4107 + }, + { + "epoch": 0.20003408565237504, + "grad_norm": 2.7877047061920166, + "learning_rate": 3.7044268417920906e-05, + "loss": 0.9174, + "step": 4108 + }, + { + "epoch": 0.20008277944148223, + "grad_norm": 1.8493938446044922, + "learning_rate": 3.704261785863392e-05, + "loss": 0.9293, + "step": 4109 + }, + { + "epoch": 0.20013147323058944, + "grad_norm": 1.9114032983779907, + "learning_rate": 3.704096687541026e-05, + "loss": 0.7714, + "step": 4110 + }, + { + "epoch": 0.20018016701969663, + "grad_norm": 1.6007084846496582, + "learning_rate": 3.7039315468291e-05, + "loss": 0.858, + "step": 4111 + }, + { + "epoch": 0.20022886080880384, + "grad_norm": 0.0900985449552536, + "learning_rate": 3.70376636373172e-05, + "loss": 0.5883, + "step": 4112 + }, + { + "epoch": 0.20027755459791102, + "grad_norm": 1.8183711767196655, + "learning_rate": 3.703601138252998e-05, + "loss": 0.9209, + "step": 4113 + }, + { + "epoch": 0.20032624838701824, + "grad_norm": 1.5718868970870972, + "learning_rate": 3.7034358703970415e-05, + "loss": 0.8995, + "step": 4114 + }, + { + "epoch": 0.20037494217612545, + "grad_norm": 2.3611831665039062, + "learning_rate": 3.703270560167962e-05, + "loss": 0.8964, + "step": 4115 + }, + { + "epoch": 0.20042363596523263, + "grad_norm": 2.174574375152588, + "learning_rate": 3.7031052075698725e-05, + "loss": 0.9531, + "step": 4116 + }, + { + "epoch": 0.20047232975433985, + "grad_norm": 1.8044205904006958, + "learning_rate": 3.7029398126068856e-05, + "loss": 0.9248, + "step": 4117 + }, + { + "epoch": 0.20052102354344703, + "grad_norm": 1.5723223686218262, + "learning_rate": 3.7027743752831156e-05, + "loss": 0.9126, + "step": 4118 + }, + { + "epoch": 0.20056971733255424, + "grad_norm": 1.8978554010391235, + "learning_rate": 3.7026088956026784e-05, + "loss": 0.8916, + "step": 4119 + }, + { + "epoch": 0.20061841112166143, + "grad_norm": 2.061206817626953, + "learning_rate": 3.702443373569689e-05, + "loss": 0.8659, + "step": 4120 + }, + { + "epoch": 0.20066710491076864, + "grad_norm": 1.8806742429733276, + "learning_rate": 3.702277809188266e-05, + "loss": 0.8961, + "step": 4121 + }, + { + "epoch": 0.20071579869987582, + "grad_norm": 2.3559539318084717, + "learning_rate": 3.7021122024625264e-05, + "loss": 0.8864, + "step": 4122 + }, + { + "epoch": 0.20076449248898304, + "grad_norm": 2.9678633213043213, + "learning_rate": 3.701946553396591e-05, + "loss": 0.9005, + "step": 4123 + }, + { + "epoch": 0.20081318627809022, + "grad_norm": 2.8717041015625, + "learning_rate": 3.7017808619945805e-05, + "loss": 0.8288, + "step": 4124 + }, + { + "epoch": 0.20086188006719743, + "grad_norm": 1.966295599937439, + "learning_rate": 3.701615128260615e-05, + "loss": 0.8678, + "step": 4125 + }, + { + "epoch": 0.20091057385630462, + "grad_norm": 1.7444425821304321, + "learning_rate": 3.7014493521988184e-05, + "loss": 0.803, + "step": 4126 + }, + { + "epoch": 0.20095926764541183, + "grad_norm": 1.8632392883300781, + "learning_rate": 3.701283533813314e-05, + "loss": 0.9682, + "step": 4127 + }, + { + "epoch": 0.20100796143451902, + "grad_norm": 1.8309695720672607, + "learning_rate": 3.7011176731082265e-05, + "loss": 0.8807, + "step": 4128 + }, + { + "epoch": 0.20105665522362623, + "grad_norm": 1.6074920892715454, + "learning_rate": 3.700951770087682e-05, + "loss": 0.9633, + "step": 4129 + }, + { + "epoch": 0.20110534901273341, + "grad_norm": 2.4568428993225098, + "learning_rate": 3.700785824755807e-05, + "loss": 0.8467, + "step": 4130 + }, + { + "epoch": 0.20115404280184063, + "grad_norm": 1.7636226415634155, + "learning_rate": 3.7006198371167296e-05, + "loss": 0.9695, + "step": 4131 + }, + { + "epoch": 0.2012027365909478, + "grad_norm": 1.2985217571258545, + "learning_rate": 3.700453807174578e-05, + "loss": 0.9483, + "step": 4132 + }, + { + "epoch": 0.20125143038005502, + "grad_norm": 0.1007290706038475, + "learning_rate": 3.700287734933485e-05, + "loss": 0.629, + "step": 4133 + }, + { + "epoch": 0.20130012416916224, + "grad_norm": 1.775067925453186, + "learning_rate": 3.700121620397578e-05, + "loss": 0.8176, + "step": 4134 + }, + { + "epoch": 0.20134881795826942, + "grad_norm": 1.4538136720657349, + "learning_rate": 3.69995546357099e-05, + "loss": 0.9982, + "step": 4135 + }, + { + "epoch": 0.20139751174737663, + "grad_norm": 2.240016460418701, + "learning_rate": 3.699789264457856e-05, + "loss": 0.9458, + "step": 4136 + }, + { + "epoch": 0.20144620553648382, + "grad_norm": 1.8594794273376465, + "learning_rate": 3.699623023062309e-05, + "loss": 0.8948, + "step": 4137 + }, + { + "epoch": 0.20149489932559103, + "grad_norm": 1.4639703035354614, + "learning_rate": 3.699456739388484e-05, + "loss": 0.8325, + "step": 4138 + }, + { + "epoch": 0.20154359311469822, + "grad_norm": 1.657399296760559, + "learning_rate": 3.699290413440517e-05, + "loss": 0.9503, + "step": 4139 + }, + { + "epoch": 0.20159228690380543, + "grad_norm": 1.528869867324829, + "learning_rate": 3.699124045222547e-05, + "loss": 0.9019, + "step": 4140 + }, + { + "epoch": 0.2016409806929126, + "grad_norm": 1.5109845399856567, + "learning_rate": 3.698957634738711e-05, + "loss": 0.7862, + "step": 4141 + }, + { + "epoch": 0.20168967448201983, + "grad_norm": 1.7324631214141846, + "learning_rate": 3.698791181993149e-05, + "loss": 0.9718, + "step": 4142 + }, + { + "epoch": 0.201738368271127, + "grad_norm": 1.4543025493621826, + "learning_rate": 3.698624686990002e-05, + "loss": 0.9683, + "step": 4143 + }, + { + "epoch": 0.20178706206023422, + "grad_norm": 1.833907961845398, + "learning_rate": 3.698458149733411e-05, + "loss": 0.9413, + "step": 4144 + }, + { + "epoch": 0.2018357558493414, + "grad_norm": 1.8642361164093018, + "learning_rate": 3.6982915702275175e-05, + "loss": 0.967, + "step": 4145 + }, + { + "epoch": 0.20188444963844862, + "grad_norm": 2.2520265579223633, + "learning_rate": 3.698124948476467e-05, + "loss": 0.9274, + "step": 4146 + }, + { + "epoch": 0.2019331434275558, + "grad_norm": 0.1259528547525406, + "learning_rate": 3.697958284484404e-05, + "loss": 0.6706, + "step": 4147 + }, + { + "epoch": 0.20198183721666302, + "grad_norm": 2.0390515327453613, + "learning_rate": 3.697791578255474e-05, + "loss": 0.8916, + "step": 4148 + }, + { + "epoch": 0.2020305310057702, + "grad_norm": 1.9068329334259033, + "learning_rate": 3.6976248297938227e-05, + "loss": 0.9005, + "step": 4149 + }, + { + "epoch": 0.20207922479487742, + "grad_norm": 1.6887003183364868, + "learning_rate": 3.6974580391036e-05, + "loss": 0.8723, + "step": 4150 + }, + { + "epoch": 0.2021279185839846, + "grad_norm": 1.5495599508285522, + "learning_rate": 3.697291206188953e-05, + "loss": 0.9345, + "step": 4151 + }, + { + "epoch": 0.2021766123730918, + "grad_norm": 1.7912437915802002, + "learning_rate": 3.697124331054033e-05, + "loss": 0.8858, + "step": 4152 + }, + { + "epoch": 0.202225306162199, + "grad_norm": 1.8067117929458618, + "learning_rate": 3.696957413702991e-05, + "loss": 0.9823, + "step": 4153 + }, + { + "epoch": 0.2022739999513062, + "grad_norm": 2.561493158340454, + "learning_rate": 3.696790454139978e-05, + "loss": 0.854, + "step": 4154 + }, + { + "epoch": 0.20232269374041342, + "grad_norm": 3.3625733852386475, + "learning_rate": 3.6966234523691474e-05, + "loss": 0.8672, + "step": 4155 + }, + { + "epoch": 0.2023713875295206, + "grad_norm": 2.1405835151672363, + "learning_rate": 3.696456408394654e-05, + "loss": 0.8529, + "step": 4156 + }, + { + "epoch": 0.20242008131862782, + "grad_norm": 2.0455989837646484, + "learning_rate": 3.696289322220652e-05, + "loss": 0.9491, + "step": 4157 + }, + { + "epoch": 0.202468775107735, + "grad_norm": 1.8213303089141846, + "learning_rate": 3.6961221938513e-05, + "loss": 0.8803, + "step": 4158 + }, + { + "epoch": 0.20251746889684222, + "grad_norm": 2.1976513862609863, + "learning_rate": 3.6959550232907526e-05, + "loss": 0.9127, + "step": 4159 + }, + { + "epoch": 0.2025661626859494, + "grad_norm": 1.9255667924880981, + "learning_rate": 3.69578781054317e-05, + "loss": 0.9356, + "step": 4160 + }, + { + "epoch": 0.20261485647505662, + "grad_norm": 1.8217549324035645, + "learning_rate": 3.695620555612711e-05, + "loss": 0.8323, + "step": 4161 + }, + { + "epoch": 0.2026635502641638, + "grad_norm": 1.81253981590271, + "learning_rate": 3.695453258503536e-05, + "loss": 0.8832, + "step": 4162 + }, + { + "epoch": 0.202712244053271, + "grad_norm": 2.2361648082733154, + "learning_rate": 3.695285919219806e-05, + "loss": 0.8307, + "step": 4163 + }, + { + "epoch": 0.2027609378423782, + "grad_norm": 2.2057294845581055, + "learning_rate": 3.695118537765685e-05, + "loss": 0.7779, + "step": 4164 + }, + { + "epoch": 0.2028096316314854, + "grad_norm": 1.5404622554779053, + "learning_rate": 3.6949511141453356e-05, + "loss": 1.0397, + "step": 4165 + }, + { + "epoch": 0.2028583254205926, + "grad_norm": 2.543957471847534, + "learning_rate": 3.694783648362923e-05, + "loss": 0.8985, + "step": 4166 + }, + { + "epoch": 0.2029070192096998, + "grad_norm": 1.6648526191711426, + "learning_rate": 3.6946161404226114e-05, + "loss": 0.8707, + "step": 4167 + }, + { + "epoch": 0.202955712998807, + "grad_norm": 2.583540439605713, + "learning_rate": 3.69444859032857e-05, + "loss": 0.8409, + "step": 4168 + }, + { + "epoch": 0.2030044067879142, + "grad_norm": 2.240683078765869, + "learning_rate": 3.694280998084965e-05, + "loss": 0.9117, + "step": 4169 + }, + { + "epoch": 0.2030531005770214, + "grad_norm": 1.7229938507080078, + "learning_rate": 3.694113363695966e-05, + "loss": 0.8464, + "step": 4170 + }, + { + "epoch": 0.2031017943661286, + "grad_norm": 0.0858774334192276, + "learning_rate": 3.693945687165742e-05, + "loss": 0.5426, + "step": 4171 + }, + { + "epoch": 0.2031504881552358, + "grad_norm": 1.149283528327942, + "learning_rate": 3.6937779684984655e-05, + "loss": 0.7834, + "step": 4172 + }, + { + "epoch": 0.203199181944343, + "grad_norm": 0.08365233242511749, + "learning_rate": 3.693610207698306e-05, + "loss": 0.6343, + "step": 4173 + }, + { + "epoch": 0.2032478757334502, + "grad_norm": 2.069829225540161, + "learning_rate": 3.6934424047694395e-05, + "loss": 0.8225, + "step": 4174 + }, + { + "epoch": 0.2032965695225574, + "grad_norm": 1.43738853931427, + "learning_rate": 3.6932745597160385e-05, + "loss": 0.9136, + "step": 4175 + }, + { + "epoch": 0.2033452633116646, + "grad_norm": 2.1955461502075195, + "learning_rate": 3.693106672542279e-05, + "loss": 0.8082, + "step": 4176 + }, + { + "epoch": 0.2033939571007718, + "grad_norm": 2.8433847427368164, + "learning_rate": 3.692938743252336e-05, + "loss": 0.904, + "step": 4177 + }, + { + "epoch": 0.203442650889879, + "grad_norm": 1.9983378648757935, + "learning_rate": 3.692770771850387e-05, + "loss": 0.8427, + "step": 4178 + }, + { + "epoch": 0.2034913446789862, + "grad_norm": 1.8150204420089722, + "learning_rate": 3.692602758340612e-05, + "loss": 0.8418, + "step": 4179 + }, + { + "epoch": 0.2035400384680934, + "grad_norm": 2.0966131687164307, + "learning_rate": 3.6924347027271876e-05, + "loss": 0.9063, + "step": 4180 + }, + { + "epoch": 0.2035887322572006, + "grad_norm": 1.941766619682312, + "learning_rate": 3.6922666050142966e-05, + "loss": 0.95, + "step": 4181 + }, + { + "epoch": 0.2036374260463078, + "grad_norm": 2.2388176918029785, + "learning_rate": 3.6920984652061186e-05, + "loss": 0.895, + "step": 4182 + }, + { + "epoch": 0.20368611983541499, + "grad_norm": 2.5872113704681396, + "learning_rate": 3.691930283306838e-05, + "loss": 0.8392, + "step": 4183 + }, + { + "epoch": 0.2037348136245222, + "grad_norm": 2.1121771335601807, + "learning_rate": 3.6917620593206374e-05, + "loss": 0.8753, + "step": 4184 + }, + { + "epoch": 0.20378350741362938, + "grad_norm": 3.1391029357910156, + "learning_rate": 3.6915937932517e-05, + "loss": 0.9533, + "step": 4185 + }, + { + "epoch": 0.2038322012027366, + "grad_norm": 1.3477057218551636, + "learning_rate": 3.6914254851042145e-05, + "loss": 0.8564, + "step": 4186 + }, + { + "epoch": 0.20388089499184378, + "grad_norm": 0.08565180003643036, + "learning_rate": 3.691257134882364e-05, + "loss": 0.6405, + "step": 4187 + }, + { + "epoch": 0.203929588780951, + "grad_norm": 3.7424004077911377, + "learning_rate": 3.6910887425903396e-05, + "loss": 0.8587, + "step": 4188 + }, + { + "epoch": 0.20397828257005818, + "grad_norm": 1.669873595237732, + "learning_rate": 3.690920308232328e-05, + "loss": 0.831, + "step": 4189 + }, + { + "epoch": 0.2040269763591654, + "grad_norm": 1.613584041595459, + "learning_rate": 3.69075183181252e-05, + "loss": 0.8694, + "step": 4190 + }, + { + "epoch": 0.20407567014827258, + "grad_norm": 1.8264760971069336, + "learning_rate": 3.690583313335105e-05, + "loss": 0.876, + "step": 4191 + }, + { + "epoch": 0.2041243639373798, + "grad_norm": 1.6347951889038086, + "learning_rate": 3.6904147528042765e-05, + "loss": 0.7036, + "step": 4192 + }, + { + "epoch": 0.20417305772648697, + "grad_norm": 1.938388705253601, + "learning_rate": 3.690246150224227e-05, + "loss": 0.7941, + "step": 4193 + }, + { + "epoch": 0.20422175151559419, + "grad_norm": 1.4859590530395508, + "learning_rate": 3.690077505599151e-05, + "loss": 0.8273, + "step": 4194 + }, + { + "epoch": 0.2042704453047014, + "grad_norm": 1.6893059015274048, + "learning_rate": 3.689908818933242e-05, + "loss": 0.8335, + "step": 4195 + }, + { + "epoch": 0.20431913909380858, + "grad_norm": 5.452273368835449, + "learning_rate": 3.6897400902306973e-05, + "loss": 0.8952, + "step": 4196 + }, + { + "epoch": 0.2043678328829158, + "grad_norm": 0.08661586046218872, + "learning_rate": 3.689571319495714e-05, + "loss": 0.6455, + "step": 4197 + }, + { + "epoch": 0.20441652667202298, + "grad_norm": 1.786808967590332, + "learning_rate": 3.6894025067324894e-05, + "loss": 0.8626, + "step": 4198 + }, + { + "epoch": 0.2044652204611302, + "grad_norm": 1.8874410390853882, + "learning_rate": 3.689233651945224e-05, + "loss": 0.9014, + "step": 4199 + }, + { + "epoch": 0.20451391425023738, + "grad_norm": 1.5660686492919922, + "learning_rate": 3.6890647551381177e-05, + "loss": 0.821, + "step": 4200 + }, + { + "epoch": 0.2045626080393446, + "grad_norm": 1.69298255443573, + "learning_rate": 3.6888958163153715e-05, + "loss": 0.8864, + "step": 4201 + }, + { + "epoch": 0.20461130182845177, + "grad_norm": 1.8372029066085815, + "learning_rate": 3.688726835481187e-05, + "loss": 0.8241, + "step": 4202 + }, + { + "epoch": 0.204659995617559, + "grad_norm": 1.6411552429199219, + "learning_rate": 3.6885578126397697e-05, + "loss": 0.8487, + "step": 4203 + }, + { + "epoch": 0.20470868940666617, + "grad_norm": 1.5524924993515015, + "learning_rate": 3.688388747795322e-05, + "loss": 0.8888, + "step": 4204 + }, + { + "epoch": 0.20475738319577338, + "grad_norm": 2.4618263244628906, + "learning_rate": 3.688219640952051e-05, + "loss": 0.9045, + "step": 4205 + }, + { + "epoch": 0.20480607698488057, + "grad_norm": 2.4727346897125244, + "learning_rate": 3.688050492114162e-05, + "loss": 0.9088, + "step": 4206 + }, + { + "epoch": 0.20485477077398778, + "grad_norm": 1.678496241569519, + "learning_rate": 3.687881301285863e-05, + "loss": 0.8747, + "step": 4207 + }, + { + "epoch": 0.20490346456309497, + "grad_norm": 1.4933611154556274, + "learning_rate": 3.6877120684713626e-05, + "loss": 0.8697, + "step": 4208 + }, + { + "epoch": 0.20495215835220218, + "grad_norm": 2.2609081268310547, + "learning_rate": 3.6875427936748715e-05, + "loss": 0.9151, + "step": 4209 + }, + { + "epoch": 0.20500085214130936, + "grad_norm": 2.3293702602386475, + "learning_rate": 3.6873734769005985e-05, + "loss": 0.9611, + "step": 4210 + }, + { + "epoch": 0.20504954593041658, + "grad_norm": 2.0310745239257812, + "learning_rate": 3.687204118152757e-05, + "loss": 0.9034, + "step": 4211 + }, + { + "epoch": 0.20509823971952376, + "grad_norm": 1.345181941986084, + "learning_rate": 3.6870347174355586e-05, + "loss": 0.8947, + "step": 4212 + }, + { + "epoch": 0.20514693350863097, + "grad_norm": 2.171654462814331, + "learning_rate": 3.686865274753219e-05, + "loss": 0.9014, + "step": 4213 + }, + { + "epoch": 0.2051956272977382, + "grad_norm": 1.9380685091018677, + "learning_rate": 3.6866957901099515e-05, + "loss": 0.9207, + "step": 4214 + }, + { + "epoch": 0.20524432108684537, + "grad_norm": 5.446167469024658, + "learning_rate": 3.6865262635099724e-05, + "loss": 0.9169, + "step": 4215 + }, + { + "epoch": 0.20529301487595258, + "grad_norm": 3.8334126472473145, + "learning_rate": 3.6863566949574984e-05, + "loss": 0.8898, + "step": 4216 + }, + { + "epoch": 0.20534170866505977, + "grad_norm": 1.7884249687194824, + "learning_rate": 3.686187084456747e-05, + "loss": 0.8336, + "step": 4217 + }, + { + "epoch": 0.20539040245416698, + "grad_norm": 1.7662419080734253, + "learning_rate": 3.686017432011939e-05, + "loss": 0.8334, + "step": 4218 + }, + { + "epoch": 0.20543909624327417, + "grad_norm": 1.6934245824813843, + "learning_rate": 3.685847737627294e-05, + "loss": 0.8319, + "step": 4219 + }, + { + "epoch": 0.20548779003238138, + "grad_norm": 1.8720605373382568, + "learning_rate": 3.685678001307033e-05, + "loss": 0.8285, + "step": 4220 + }, + { + "epoch": 0.20553648382148856, + "grad_norm": 1.8875783681869507, + "learning_rate": 3.685508223055377e-05, + "loss": 0.9327, + "step": 4221 + }, + { + "epoch": 0.20558517761059578, + "grad_norm": 1.8173115253448486, + "learning_rate": 3.685338402876551e-05, + "loss": 0.9841, + "step": 4222 + }, + { + "epoch": 0.20563387139970296, + "grad_norm": 1.9456473588943481, + "learning_rate": 3.685168540774778e-05, + "loss": 0.8748, + "step": 4223 + }, + { + "epoch": 0.20568256518881017, + "grad_norm": 1.4660381078720093, + "learning_rate": 3.6849986367542845e-05, + "loss": 0.9043, + "step": 4224 + }, + { + "epoch": 0.20573125897791736, + "grad_norm": 1.296981930732727, + "learning_rate": 3.684828690819296e-05, + "loss": 0.7883, + "step": 4225 + }, + { + "epoch": 0.20577995276702457, + "grad_norm": 1.444622278213501, + "learning_rate": 3.684658702974041e-05, + "loss": 0.8982, + "step": 4226 + }, + { + "epoch": 0.20582864655613176, + "grad_norm": 1.7534449100494385, + "learning_rate": 3.684488673222746e-05, + "loss": 0.8782, + "step": 4227 + }, + { + "epoch": 0.20587734034523897, + "grad_norm": 1.56904935836792, + "learning_rate": 3.684318601569642e-05, + "loss": 0.8532, + "step": 4228 + }, + { + "epoch": 0.20592603413434615, + "grad_norm": 1.4503700733184814, + "learning_rate": 3.684148488018959e-05, + "loss": 0.8957, + "step": 4229 + }, + { + "epoch": 0.20597472792345337, + "grad_norm": 1.5025368928909302, + "learning_rate": 3.6839783325749294e-05, + "loss": 0.8559, + "step": 4230 + }, + { + "epoch": 0.20602342171256055, + "grad_norm": 1.88258957862854, + "learning_rate": 3.683808135241785e-05, + "loss": 0.8572, + "step": 4231 + }, + { + "epoch": 0.20607211550166776, + "grad_norm": 1.8452240228652954, + "learning_rate": 3.68363789602376e-05, + "loss": 0.8581, + "step": 4232 + }, + { + "epoch": 0.20612080929077495, + "grad_norm": 1.4494342803955078, + "learning_rate": 3.683467614925088e-05, + "loss": 0.8576, + "step": 4233 + }, + { + "epoch": 0.20616950307988216, + "grad_norm": 2.1080427169799805, + "learning_rate": 3.683297291950006e-05, + "loss": 0.9174, + "step": 4234 + }, + { + "epoch": 0.20621819686898937, + "grad_norm": 2.143838882446289, + "learning_rate": 3.683126927102751e-05, + "loss": 0.8125, + "step": 4235 + }, + { + "epoch": 0.20626689065809656, + "grad_norm": 1.57454252243042, + "learning_rate": 3.6829565203875594e-05, + "loss": 0.8241, + "step": 4236 + }, + { + "epoch": 0.20631558444720377, + "grad_norm": 2.4071779251098633, + "learning_rate": 3.6827860718086714e-05, + "loss": 0.9671, + "step": 4237 + }, + { + "epoch": 0.20636427823631096, + "grad_norm": 1.4003244638442993, + "learning_rate": 3.6826155813703265e-05, + "loss": 0.9138, + "step": 4238 + }, + { + "epoch": 0.20641297202541817, + "grad_norm": 2.9058077335357666, + "learning_rate": 3.682445049076765e-05, + "loss": 0.8364, + "step": 4239 + }, + { + "epoch": 0.20646166581452535, + "grad_norm": 4.7647786140441895, + "learning_rate": 3.68227447493223e-05, + "loss": 0.9719, + "step": 4240 + }, + { + "epoch": 0.20651035960363257, + "grad_norm": 1.603175163269043, + "learning_rate": 3.6821038589409634e-05, + "loss": 0.8057, + "step": 4241 + }, + { + "epoch": 0.20655905339273975, + "grad_norm": 1.7093700170516968, + "learning_rate": 3.6819332011072106e-05, + "loss": 0.8676, + "step": 4242 + }, + { + "epoch": 0.20660774718184696, + "grad_norm": 1.6617438793182373, + "learning_rate": 3.681762501435216e-05, + "loss": 1.0548, + "step": 4243 + }, + { + "epoch": 0.20665644097095415, + "grad_norm": 1.841536045074463, + "learning_rate": 3.681591759929226e-05, + "loss": 0.994, + "step": 4244 + }, + { + "epoch": 0.20670513476006136, + "grad_norm": 1.7638022899627686, + "learning_rate": 3.681420976593487e-05, + "loss": 0.9208, + "step": 4245 + }, + { + "epoch": 0.20675382854916854, + "grad_norm": 2.0270352363586426, + "learning_rate": 3.681250151432249e-05, + "loss": 0.9452, + "step": 4246 + }, + { + "epoch": 0.20680252233827576, + "grad_norm": 2.2629644870758057, + "learning_rate": 3.681079284449759e-05, + "loss": 0.9054, + "step": 4247 + }, + { + "epoch": 0.20685121612738294, + "grad_norm": 1.4073336124420166, + "learning_rate": 3.680908375650269e-05, + "loss": 0.9218, + "step": 4248 + }, + { + "epoch": 0.20689990991649015, + "grad_norm": 3.0787973403930664, + "learning_rate": 3.6807374250380295e-05, + "loss": 0.7861, + "step": 4249 + }, + { + "epoch": 0.20694860370559734, + "grad_norm": 2.0099730491638184, + "learning_rate": 3.680566432617294e-05, + "loss": 0.7829, + "step": 4250 + }, + { + "epoch": 0.20699729749470455, + "grad_norm": 1.816170573234558, + "learning_rate": 3.680395398392315e-05, + "loss": 0.8132, + "step": 4251 + }, + { + "epoch": 0.20704599128381174, + "grad_norm": 1.5771206617355347, + "learning_rate": 3.680224322367347e-05, + "loss": 0.8635, + "step": 4252 + }, + { + "epoch": 0.20709468507291895, + "grad_norm": 0.08840233087539673, + "learning_rate": 3.680053204546647e-05, + "loss": 0.6631, + "step": 4253 + }, + { + "epoch": 0.20714337886202616, + "grad_norm": 1.8078951835632324, + "learning_rate": 3.679882044934468e-05, + "loss": 0.9676, + "step": 4254 + }, + { + "epoch": 0.20719207265113335, + "grad_norm": 3.253864288330078, + "learning_rate": 3.6797108435350726e-05, + "loss": 0.8428, + "step": 4255 + }, + { + "epoch": 0.20724076644024056, + "grad_norm": 2.4967007637023926, + "learning_rate": 3.6795396003527146e-05, + "loss": 0.888, + "step": 4256 + }, + { + "epoch": 0.20728946022934774, + "grad_norm": 1.7172833681106567, + "learning_rate": 3.679368315391657e-05, + "loss": 0.8698, + "step": 4257 + }, + { + "epoch": 0.20733815401845496, + "grad_norm": 1.5202815532684326, + "learning_rate": 3.6791969886561594e-05, + "loss": 0.9324, + "step": 4258 + }, + { + "epoch": 0.20738684780756214, + "grad_norm": 1.3282289505004883, + "learning_rate": 3.6790256201504834e-05, + "loss": 0.786, + "step": 4259 + }, + { + "epoch": 0.20743554159666935, + "grad_norm": 1.904396653175354, + "learning_rate": 3.678854209878893e-05, + "loss": 0.9177, + "step": 4260 + }, + { + "epoch": 0.20748423538577654, + "grad_norm": 1.4416778087615967, + "learning_rate": 3.678682757845649e-05, + "loss": 0.9493, + "step": 4261 + }, + { + "epoch": 0.20753292917488375, + "grad_norm": 1.8591153621673584, + "learning_rate": 3.67851126405502e-05, + "loss": 0.9161, + "step": 4262 + }, + { + "epoch": 0.20758162296399094, + "grad_norm": 1.6561977863311768, + "learning_rate": 3.6783397285112695e-05, + "loss": 0.8563, + "step": 4263 + }, + { + "epoch": 0.20763031675309815, + "grad_norm": 1.491792917251587, + "learning_rate": 3.6781681512186656e-05, + "loss": 0.931, + "step": 4264 + }, + { + "epoch": 0.20767901054220533, + "grad_norm": 1.9080020189285278, + "learning_rate": 3.677996532181476e-05, + "loss": 0.9749, + "step": 4265 + }, + { + "epoch": 0.20772770433131255, + "grad_norm": 1.7090425491333008, + "learning_rate": 3.677824871403969e-05, + "loss": 0.9249, + "step": 4266 + }, + { + "epoch": 0.20777639812041973, + "grad_norm": 2.451184034347534, + "learning_rate": 3.677653168890416e-05, + "loss": 0.91, + "step": 4267 + }, + { + "epoch": 0.20782509190952694, + "grad_norm": 2.432547092437744, + "learning_rate": 3.6774814246450874e-05, + "loss": 0.8538, + "step": 4268 + }, + { + "epoch": 0.20787378569863413, + "grad_norm": 1.808622121810913, + "learning_rate": 3.677309638672255e-05, + "loss": 0.8685, + "step": 4269 + }, + { + "epoch": 0.20792247948774134, + "grad_norm": 1.9613600969314575, + "learning_rate": 3.6771378109761936e-05, + "loss": 0.9036, + "step": 4270 + }, + { + "epoch": 0.20797117327684853, + "grad_norm": 1.4846938848495483, + "learning_rate": 3.6769659415611755e-05, + "loss": 0.9357, + "step": 4271 + }, + { + "epoch": 0.20801986706595574, + "grad_norm": 1.762290120124817, + "learning_rate": 3.676794030431477e-05, + "loss": 0.8612, + "step": 4272 + }, + { + "epoch": 0.20806856085506292, + "grad_norm": 1.533685326576233, + "learning_rate": 3.6766220775913725e-05, + "loss": 0.7769, + "step": 4273 + }, + { + "epoch": 0.20811725464417014, + "grad_norm": 1.962541937828064, + "learning_rate": 3.676450083045143e-05, + "loss": 0.8845, + "step": 4274 + }, + { + "epoch": 0.20816594843327735, + "grad_norm": 1.761714220046997, + "learning_rate": 3.676278046797063e-05, + "loss": 0.8729, + "step": 4275 + }, + { + "epoch": 0.20821464222238453, + "grad_norm": 1.345515489578247, + "learning_rate": 3.676105968851415e-05, + "loss": 0.8955, + "step": 4276 + }, + { + "epoch": 0.20826333601149175, + "grad_norm": 1.73859703540802, + "learning_rate": 3.675933849212478e-05, + "loss": 0.9768, + "step": 4277 + }, + { + "epoch": 0.20831202980059893, + "grad_norm": 1.7908457517623901, + "learning_rate": 3.6757616878845335e-05, + "loss": 0.8896, + "step": 4278 + }, + { + "epoch": 0.20836072358970614, + "grad_norm": 3.0119619369506836, + "learning_rate": 3.675589484871864e-05, + "loss": 0.7913, + "step": 4279 + }, + { + "epoch": 0.20840941737881333, + "grad_norm": 10.958431243896484, + "learning_rate": 3.675417240178754e-05, + "loss": 0.8115, + "step": 4280 + }, + { + "epoch": 0.20845811116792054, + "grad_norm": 2.3248963356018066, + "learning_rate": 3.675244953809487e-05, + "loss": 0.8883, + "step": 4281 + }, + { + "epoch": 0.20850680495702772, + "grad_norm": 2.584007978439331, + "learning_rate": 3.6750726257683494e-05, + "loss": 0.9188, + "step": 4282 + }, + { + "epoch": 0.20855549874613494, + "grad_norm": 1.7932918071746826, + "learning_rate": 3.674900256059627e-05, + "loss": 0.9091, + "step": 4283 + }, + { + "epoch": 0.20860419253524212, + "grad_norm": 1.5833611488342285, + "learning_rate": 3.674727844687608e-05, + "loss": 0.8877, + "step": 4284 + }, + { + "epoch": 0.20865288632434933, + "grad_norm": 2.6032469272613525, + "learning_rate": 3.6745553916565815e-05, + "loss": 0.8123, + "step": 4285 + }, + { + "epoch": 0.20870158011345652, + "grad_norm": 1.661515235900879, + "learning_rate": 3.674382896970837e-05, + "loss": 0.8649, + "step": 4286 + }, + { + "epoch": 0.20875027390256373, + "grad_norm": 5.470763206481934, + "learning_rate": 3.674210360634665e-05, + "loss": 0.8664, + "step": 4287 + }, + { + "epoch": 0.20879896769167092, + "grad_norm": 1.7478991746902466, + "learning_rate": 3.674037782652358e-05, + "loss": 0.9921, + "step": 4288 + }, + { + "epoch": 0.20884766148077813, + "grad_norm": 1.754202961921692, + "learning_rate": 3.6738651630282075e-05, + "loss": 0.9107, + "step": 4289 + }, + { + "epoch": 0.20889635526988531, + "grad_norm": 1.8290835618972778, + "learning_rate": 3.6736925017665095e-05, + "loss": 1.0194, + "step": 4290 + }, + { + "epoch": 0.20894504905899253, + "grad_norm": 2.704437017440796, + "learning_rate": 3.673519798871557e-05, + "loss": 0.9278, + "step": 4291 + }, + { + "epoch": 0.2089937428480997, + "grad_norm": 1.6617377996444702, + "learning_rate": 3.673347054347648e-05, + "loss": 0.8036, + "step": 4292 + }, + { + "epoch": 0.20904243663720692, + "grad_norm": 2.114872455596924, + "learning_rate": 3.673174268199078e-05, + "loss": 0.9126, + "step": 4293 + }, + { + "epoch": 0.20909113042631414, + "grad_norm": 1.4631009101867676, + "learning_rate": 3.673001440430145e-05, + "loss": 0.8588, + "step": 4294 + }, + { + "epoch": 0.20913982421542132, + "grad_norm": 1.5214636325836182, + "learning_rate": 3.67282857104515e-05, + "loss": 0.8367, + "step": 4295 + }, + { + "epoch": 0.20918851800452853, + "grad_norm": 1.7632675170898438, + "learning_rate": 3.6726556600483904e-05, + "loss": 0.9327, + "step": 4296 + }, + { + "epoch": 0.20923721179363572, + "grad_norm": 1.59052574634552, + "learning_rate": 3.672482707444169e-05, + "loss": 0.8041, + "step": 4297 + }, + { + "epoch": 0.20928590558274293, + "grad_norm": 1.7716376781463623, + "learning_rate": 3.6723097132367884e-05, + "loss": 0.8893, + "step": 4298 + }, + { + "epoch": 0.20933459937185012, + "grad_norm": 3.797945022583008, + "learning_rate": 3.672136677430551e-05, + "loss": 0.8313, + "step": 4299 + }, + { + "epoch": 0.20938329316095733, + "grad_norm": 1.764341950416565, + "learning_rate": 3.671963600029761e-05, + "loss": 0.8064, + "step": 4300 + }, + { + "epoch": 0.2094319869500645, + "grad_norm": 1.6311180591583252, + "learning_rate": 3.671790481038724e-05, + "loss": 0.9178, + "step": 4301 + }, + { + "epoch": 0.20948068073917173, + "grad_norm": 3.233121871948242, + "learning_rate": 3.671617320461747e-05, + "loss": 0.8232, + "step": 4302 + }, + { + "epoch": 0.2095293745282789, + "grad_norm": 1.491626262664795, + "learning_rate": 3.671444118303136e-05, + "loss": 0.9221, + "step": 4303 + }, + { + "epoch": 0.20957806831738612, + "grad_norm": 1.9138150215148926, + "learning_rate": 3.6712708745672e-05, + "loss": 0.8339, + "step": 4304 + }, + { + "epoch": 0.2096267621064933, + "grad_norm": 2.6063528060913086, + "learning_rate": 3.671097589258249e-05, + "loss": 0.9598, + "step": 4305 + }, + { + "epoch": 0.20967545589560052, + "grad_norm": 1.8552273511886597, + "learning_rate": 3.670924262380593e-05, + "loss": 0.994, + "step": 4306 + }, + { + "epoch": 0.2097241496847077, + "grad_norm": 1.840241551399231, + "learning_rate": 3.670750893938544e-05, + "loss": 0.8766, + "step": 4307 + }, + { + "epoch": 0.20977284347381492, + "grad_norm": 2.348893165588379, + "learning_rate": 3.6705774839364134e-05, + "loss": 0.8628, + "step": 4308 + }, + { + "epoch": 0.2098215372629221, + "grad_norm": 1.376869559288025, + "learning_rate": 3.6704040323785163e-05, + "loss": 0.9139, + "step": 4309 + }, + { + "epoch": 0.20987023105202932, + "grad_norm": 2.346578598022461, + "learning_rate": 3.670230539269167e-05, + "loss": 0.9005, + "step": 4310 + }, + { + "epoch": 0.2099189248411365, + "grad_norm": 1.6009215116500854, + "learning_rate": 3.6700570046126795e-05, + "loss": 0.9759, + "step": 4311 + }, + { + "epoch": 0.2099676186302437, + "grad_norm": 2.0674595832824707, + "learning_rate": 3.669883428413372e-05, + "loss": 0.9187, + "step": 4312 + }, + { + "epoch": 0.21001631241935093, + "grad_norm": 2.2072901725769043, + "learning_rate": 3.669709810675563e-05, + "loss": 0.8633, + "step": 4313 + }, + { + "epoch": 0.2100650062084581, + "grad_norm": 1.5161359310150146, + "learning_rate": 3.669536151403569e-05, + "loss": 0.8207, + "step": 4314 + }, + { + "epoch": 0.21011369999756532, + "grad_norm": 2.4254915714263916, + "learning_rate": 3.669362450601711e-05, + "loss": 0.8926, + "step": 4315 + }, + { + "epoch": 0.2101623937866725, + "grad_norm": 1.572859525680542, + "learning_rate": 3.669188708274311e-05, + "loss": 0.7863, + "step": 4316 + }, + { + "epoch": 0.21021108757577972, + "grad_norm": 2.6781091690063477, + "learning_rate": 3.669014924425689e-05, + "loss": 0.9428, + "step": 4317 + }, + { + "epoch": 0.2102597813648869, + "grad_norm": 2.1244378089904785, + "learning_rate": 3.668841099060169e-05, + "loss": 0.9177, + "step": 4318 + }, + { + "epoch": 0.21030847515399412, + "grad_norm": 1.6858717203140259, + "learning_rate": 3.668667232182074e-05, + "loss": 0.9698, + "step": 4319 + }, + { + "epoch": 0.2103571689431013, + "grad_norm": 0.08762975037097931, + "learning_rate": 3.66849332379573e-05, + "loss": 0.6088, + "step": 4320 + }, + { + "epoch": 0.21040586273220852, + "grad_norm": 1.5435940027236938, + "learning_rate": 3.668319373905461e-05, + "loss": 0.8677, + "step": 4321 + }, + { + "epoch": 0.2104545565213157, + "grad_norm": 2.8607280254364014, + "learning_rate": 3.668145382515597e-05, + "loss": 0.8969, + "step": 4322 + }, + { + "epoch": 0.2105032503104229, + "grad_norm": 2.2459919452667236, + "learning_rate": 3.667971349630464e-05, + "loss": 0.9179, + "step": 4323 + }, + { + "epoch": 0.2105519440995301, + "grad_norm": 1.575905680656433, + "learning_rate": 3.6677972752543915e-05, + "loss": 0.8384, + "step": 4324 + }, + { + "epoch": 0.2106006378886373, + "grad_norm": 0.0822463184595108, + "learning_rate": 3.6676231593917105e-05, + "loss": 0.5646, + "step": 4325 + }, + { + "epoch": 0.2106493316777445, + "grad_norm": 2.7165210247039795, + "learning_rate": 3.66744900204675e-05, + "loss": 0.9346, + "step": 4326 + }, + { + "epoch": 0.2106980254668517, + "grad_norm": 7.781564235687256, + "learning_rate": 3.667274803223844e-05, + "loss": 0.8774, + "step": 4327 + }, + { + "epoch": 0.2107467192559589, + "grad_norm": 2.3442840576171875, + "learning_rate": 3.667100562927326e-05, + "loss": 0.9201, + "step": 4328 + }, + { + "epoch": 0.2107954130450661, + "grad_norm": 2.3128466606140137, + "learning_rate": 3.666926281161529e-05, + "loss": 0.9164, + "step": 4329 + }, + { + "epoch": 0.2108441068341733, + "grad_norm": 1.7455776929855347, + "learning_rate": 3.666751957930789e-05, + "loss": 0.952, + "step": 4330 + }, + { + "epoch": 0.2108928006232805, + "grad_norm": 2.3914988040924072, + "learning_rate": 3.666577593239441e-05, + "loss": 0.9073, + "step": 4331 + }, + { + "epoch": 0.2109414944123877, + "grad_norm": 1.7746065855026245, + "learning_rate": 3.666403187091825e-05, + "loss": 0.8768, + "step": 4332 + }, + { + "epoch": 0.2109901882014949, + "grad_norm": 0.08629701286554337, + "learning_rate": 3.666228739492277e-05, + "loss": 0.5756, + "step": 4333 + }, + { + "epoch": 0.2110388819906021, + "grad_norm": 1.4904873371124268, + "learning_rate": 3.666054250445137e-05, + "loss": 0.8224, + "step": 4334 + }, + { + "epoch": 0.2110875757797093, + "grad_norm": 2.9543540477752686, + "learning_rate": 3.6658797199547465e-05, + "loss": 0.8983, + "step": 4335 + }, + { + "epoch": 0.2111362695688165, + "grad_norm": 3.4055628776550293, + "learning_rate": 3.6657051480254445e-05, + "loss": 0.9304, + "step": 4336 + }, + { + "epoch": 0.2111849633579237, + "grad_norm": 1.869614839553833, + "learning_rate": 3.665530534661576e-05, + "loss": 0.8548, + "step": 4337 + }, + { + "epoch": 0.2112336571470309, + "grad_norm": 2.2917609214782715, + "learning_rate": 3.665355879867484e-05, + "loss": 0.8295, + "step": 4338 + }, + { + "epoch": 0.2112823509361381, + "grad_norm": 1.8481993675231934, + "learning_rate": 3.665181183647512e-05, + "loss": 0.9265, + "step": 4339 + }, + { + "epoch": 0.2113310447252453, + "grad_norm": 1.7534191608428955, + "learning_rate": 3.665006446006007e-05, + "loss": 0.8681, + "step": 4340 + }, + { + "epoch": 0.2113797385143525, + "grad_norm": 1.3689826726913452, + "learning_rate": 3.664831666947314e-05, + "loss": 0.8104, + "step": 4341 + }, + { + "epoch": 0.2114284323034597, + "grad_norm": 1.7327194213867188, + "learning_rate": 3.664656846475782e-05, + "loss": 0.8493, + "step": 4342 + }, + { + "epoch": 0.21147712609256689, + "grad_norm": 2.394169330596924, + "learning_rate": 3.6644819845957585e-05, + "loss": 0.9726, + "step": 4343 + }, + { + "epoch": 0.2115258198816741, + "grad_norm": 2.2895772457122803, + "learning_rate": 3.6643070813115945e-05, + "loss": 0.7882, + "step": 4344 + }, + { + "epoch": 0.21157451367078128, + "grad_norm": 2.2209455966949463, + "learning_rate": 3.6641321366276394e-05, + "loss": 0.8835, + "step": 4345 + }, + { + "epoch": 0.2116232074598885, + "grad_norm": 1.5467314720153809, + "learning_rate": 3.663957150548247e-05, + "loss": 0.9026, + "step": 4346 + }, + { + "epoch": 0.21167190124899568, + "grad_norm": 1.7083077430725098, + "learning_rate": 3.663782123077768e-05, + "loss": 0.8664, + "step": 4347 + }, + { + "epoch": 0.2117205950381029, + "grad_norm": 1.582352638244629, + "learning_rate": 3.663607054220557e-05, + "loss": 0.8142, + "step": 4348 + }, + { + "epoch": 0.21176928882721008, + "grad_norm": 2.050028085708618, + "learning_rate": 3.663431943980968e-05, + "loss": 0.8941, + "step": 4349 + }, + { + "epoch": 0.2118179826163173, + "grad_norm": 1.6072864532470703, + "learning_rate": 3.663256792363359e-05, + "loss": 0.8792, + "step": 4350 + }, + { + "epoch": 0.21186667640542448, + "grad_norm": 1.818084955215454, + "learning_rate": 3.663081599372085e-05, + "loss": 0.9289, + "step": 4351 + }, + { + "epoch": 0.2119153701945317, + "grad_norm": 1.5994004011154175, + "learning_rate": 3.662906365011505e-05, + "loss": 0.8341, + "step": 4352 + }, + { + "epoch": 0.2119640639836389, + "grad_norm": 1.4925841093063354, + "learning_rate": 3.662731089285977e-05, + "loss": 0.9222, + "step": 4353 + }, + { + "epoch": 0.21201275777274609, + "grad_norm": 0.08019321411848068, + "learning_rate": 3.662555772199862e-05, + "loss": 0.6325, + "step": 4354 + }, + { + "epoch": 0.2120614515618533, + "grad_norm": 2.1873998641967773, + "learning_rate": 3.6623804137575204e-05, + "loss": 0.8572, + "step": 4355 + }, + { + "epoch": 0.21211014535096048, + "grad_norm": 1.8792918920516968, + "learning_rate": 3.6622050139633144e-05, + "loss": 0.807, + "step": 4356 + }, + { + "epoch": 0.2121588391400677, + "grad_norm": 1.8240809440612793, + "learning_rate": 3.662029572821608e-05, + "loss": 0.9432, + "step": 4357 + }, + { + "epoch": 0.21220753292917488, + "grad_norm": 1.5617492198944092, + "learning_rate": 3.661854090336763e-05, + "loss": 0.9361, + "step": 4358 + }, + { + "epoch": 0.2122562267182821, + "grad_norm": 1.7076560258865356, + "learning_rate": 3.661678566513147e-05, + "loss": 0.8283, + "step": 4359 + }, + { + "epoch": 0.21230492050738928, + "grad_norm": 1.7952711582183838, + "learning_rate": 3.661503001355124e-05, + "loss": 0.8447, + "step": 4360 + }, + { + "epoch": 0.2123536142964965, + "grad_norm": 1.701356291770935, + "learning_rate": 3.6613273948670636e-05, + "loss": 0.9479, + "step": 4361 + }, + { + "epoch": 0.21240230808560367, + "grad_norm": 2.778043031692505, + "learning_rate": 3.661151747053333e-05, + "loss": 0.8325, + "step": 4362 + }, + { + "epoch": 0.2124510018747109, + "grad_norm": 2.6507835388183594, + "learning_rate": 3.6609760579182996e-05, + "loss": 0.9619, + "step": 4363 + }, + { + "epoch": 0.21249969566381807, + "grad_norm": 2.6080448627471924, + "learning_rate": 3.660800327466336e-05, + "loss": 0.9432, + "step": 4364 + }, + { + "epoch": 0.21254838945292528, + "grad_norm": 1.747962236404419, + "learning_rate": 3.6606245557018134e-05, + "loss": 0.8297, + "step": 4365 + }, + { + "epoch": 0.21259708324203247, + "grad_norm": 1.8502092361450195, + "learning_rate": 3.660448742629104e-05, + "loss": 0.9707, + "step": 4366 + }, + { + "epoch": 0.21264577703113968, + "grad_norm": 3.5004050731658936, + "learning_rate": 3.6602728882525794e-05, + "loss": 0.8884, + "step": 4367 + }, + { + "epoch": 0.21269447082024687, + "grad_norm": 2.2015786170959473, + "learning_rate": 3.660096992576616e-05, + "loss": 0.8665, + "step": 4368 + }, + { + "epoch": 0.21274316460935408, + "grad_norm": 0.09624004364013672, + "learning_rate": 3.659921055605589e-05, + "loss": 0.7271, + "step": 4369 + }, + { + "epoch": 0.21279185839846126, + "grad_norm": 2.0409350395202637, + "learning_rate": 3.6597450773438727e-05, + "loss": 0.8869, + "step": 4370 + }, + { + "epoch": 0.21284055218756848, + "grad_norm": 5.18552303314209, + "learning_rate": 3.6595690577958476e-05, + "loss": 0.8176, + "step": 4371 + }, + { + "epoch": 0.21288924597667566, + "grad_norm": 2.02213978767395, + "learning_rate": 3.659392996965891e-05, + "loss": 0.9051, + "step": 4372 + }, + { + "epoch": 0.21293793976578287, + "grad_norm": 2.0049142837524414, + "learning_rate": 3.659216894858382e-05, + "loss": 0.7698, + "step": 4373 + }, + { + "epoch": 0.2129866335548901, + "grad_norm": 1.9970359802246094, + "learning_rate": 3.659040751477702e-05, + "loss": 0.8398, + "step": 4374 + }, + { + "epoch": 0.21303532734399727, + "grad_norm": 2.0835065841674805, + "learning_rate": 3.658864566828231e-05, + "loss": 0.8529, + "step": 4375 + }, + { + "epoch": 0.21308402113310448, + "grad_norm": 1.740605354309082, + "learning_rate": 3.658688340914354e-05, + "loss": 0.8931, + "step": 4376 + }, + { + "epoch": 0.21313271492221167, + "grad_norm": 1.8547611236572266, + "learning_rate": 3.658512073740452e-05, + "loss": 0.8847, + "step": 4377 + }, + { + "epoch": 0.21318140871131888, + "grad_norm": 0.08764064311981201, + "learning_rate": 3.658335765310911e-05, + "loss": 0.633, + "step": 4378 + }, + { + "epoch": 0.21323010250042607, + "grad_norm": 1.4507102966308594, + "learning_rate": 3.658159415630117e-05, + "loss": 0.8596, + "step": 4379 + }, + { + "epoch": 0.21327879628953328, + "grad_norm": 1.7369980812072754, + "learning_rate": 3.657983024702457e-05, + "loss": 0.8453, + "step": 4380 + }, + { + "epoch": 0.21332749007864046, + "grad_norm": 1.627745270729065, + "learning_rate": 3.6578065925323165e-05, + "loss": 0.9459, + "step": 4381 + }, + { + "epoch": 0.21337618386774768, + "grad_norm": 1.408606767654419, + "learning_rate": 3.6576301191240876e-05, + "loss": 0.8821, + "step": 4382 + }, + { + "epoch": 0.21342487765685486, + "grad_norm": 1.7045750617980957, + "learning_rate": 3.6574536044821574e-05, + "loss": 0.896, + "step": 4383 + }, + { + "epoch": 0.21347357144596207, + "grad_norm": 1.5820380449295044, + "learning_rate": 3.657277048610918e-05, + "loss": 0.9315, + "step": 4384 + }, + { + "epoch": 0.21352226523506926, + "grad_norm": 2.896852731704712, + "learning_rate": 3.657100451514761e-05, + "loss": 0.8928, + "step": 4385 + }, + { + "epoch": 0.21357095902417647, + "grad_norm": 1.8369916677474976, + "learning_rate": 3.656923813198079e-05, + "loss": 0.9338, + "step": 4386 + }, + { + "epoch": 0.21361965281328366, + "grad_norm": 2.2339024543762207, + "learning_rate": 3.6567471336652654e-05, + "loss": 0.9745, + "step": 4387 + }, + { + "epoch": 0.21366834660239087, + "grad_norm": 0.08264290541410446, + "learning_rate": 3.656570412920717e-05, + "loss": 0.5747, + "step": 4388 + }, + { + "epoch": 0.21371704039149805, + "grad_norm": 1.7427737712860107, + "learning_rate": 3.6563936509688275e-05, + "loss": 0.8924, + "step": 4389 + }, + { + "epoch": 0.21376573418060527, + "grad_norm": 1.3520781993865967, + "learning_rate": 3.6562168478139954e-05, + "loss": 0.7872, + "step": 4390 + }, + { + "epoch": 0.21381442796971245, + "grad_norm": 2.51175594329834, + "learning_rate": 3.6560400034606185e-05, + "loss": 0.8724, + "step": 4391 + }, + { + "epoch": 0.21386312175881966, + "grad_norm": 2.037461757659912, + "learning_rate": 3.6558631179130945e-05, + "loss": 0.8381, + "step": 4392 + }, + { + "epoch": 0.21391181554792688, + "grad_norm": 1.9183403253555298, + "learning_rate": 3.6556861911758254e-05, + "loss": 0.9583, + "step": 4393 + }, + { + "epoch": 0.21396050933703406, + "grad_norm": 2.23017954826355, + "learning_rate": 3.655509223253211e-05, + "loss": 0.9222, + "step": 4394 + }, + { + "epoch": 0.21400920312614127, + "grad_norm": 1.6299118995666504, + "learning_rate": 3.6553322141496544e-05, + "loss": 0.8498, + "step": 4395 + }, + { + "epoch": 0.21405789691524846, + "grad_norm": 2.545903205871582, + "learning_rate": 3.655155163869557e-05, + "loss": 0.8995, + "step": 4396 + }, + { + "epoch": 0.21410659070435567, + "grad_norm": 2.17325496673584, + "learning_rate": 3.654978072417325e-05, + "loss": 0.8279, + "step": 4397 + }, + { + "epoch": 0.21415528449346286, + "grad_norm": 2.486006498336792, + "learning_rate": 3.654800939797362e-05, + "loss": 0.9252, + "step": 4398 + }, + { + "epoch": 0.21420397828257007, + "grad_norm": 1.4934169054031372, + "learning_rate": 3.6546237660140745e-05, + "loss": 0.933, + "step": 4399 + }, + { + "epoch": 0.21425267207167725, + "grad_norm": 1.416372299194336, + "learning_rate": 3.65444655107187e-05, + "loss": 0.9917, + "step": 4400 + }, + { + "epoch": 0.21430136586078447, + "grad_norm": 1.3850486278533936, + "learning_rate": 3.6542692949751565e-05, + "loss": 0.8961, + "step": 4401 + }, + { + "epoch": 0.21435005964989165, + "grad_norm": 1.6500091552734375, + "learning_rate": 3.654091997728344e-05, + "loss": 0.9038, + "step": 4402 + }, + { + "epoch": 0.21439875343899886, + "grad_norm": 1.7272528409957886, + "learning_rate": 3.653914659335843e-05, + "loss": 0.8541, + "step": 4403 + }, + { + "epoch": 0.21444744722810605, + "grad_norm": 1.630928635597229, + "learning_rate": 3.653737279802062e-05, + "loss": 0.8262, + "step": 4404 + }, + { + "epoch": 0.21449614101721326, + "grad_norm": 2.9350974559783936, + "learning_rate": 3.653559859131417e-05, + "loss": 0.8965, + "step": 4405 + }, + { + "epoch": 0.21454483480632044, + "grad_norm": 1.4548090696334839, + "learning_rate": 3.653382397328319e-05, + "loss": 0.9006, + "step": 4406 + }, + { + "epoch": 0.21459352859542766, + "grad_norm": 2.617136240005493, + "learning_rate": 3.6532048943971836e-05, + "loss": 0.7618, + "step": 4407 + }, + { + "epoch": 0.21464222238453484, + "grad_norm": 1.9052491188049316, + "learning_rate": 3.653027350342425e-05, + "loss": 0.8631, + "step": 4408 + }, + { + "epoch": 0.21469091617364205, + "grad_norm": 1.914258599281311, + "learning_rate": 3.652849765168462e-05, + "loss": 0.8453, + "step": 4409 + }, + { + "epoch": 0.21473960996274924, + "grad_norm": 2.1309664249420166, + "learning_rate": 3.652672138879709e-05, + "loss": 0.8995, + "step": 4410 + }, + { + "epoch": 0.21478830375185645, + "grad_norm": 2.167938232421875, + "learning_rate": 3.652494471480585e-05, + "loss": 0.8605, + "step": 4411 + }, + { + "epoch": 0.21483699754096364, + "grad_norm": 2.584530830383301, + "learning_rate": 3.6523167629755125e-05, + "loss": 0.8478, + "step": 4412 + }, + { + "epoch": 0.21488569133007085, + "grad_norm": 2.1600654125213623, + "learning_rate": 3.6521390133689085e-05, + "loss": 0.8562, + "step": 4413 + }, + { + "epoch": 0.21493438511917806, + "grad_norm": 2.2216851711273193, + "learning_rate": 3.651961222665196e-05, + "loss": 0.8685, + "step": 4414 + }, + { + "epoch": 0.21498307890828525, + "grad_norm": 1.6638221740722656, + "learning_rate": 3.651783390868798e-05, + "loss": 0.9703, + "step": 4415 + }, + { + "epoch": 0.21503177269739246, + "grad_norm": 2.0473880767822266, + "learning_rate": 3.651605517984137e-05, + "loss": 0.8357, + "step": 4416 + }, + { + "epoch": 0.21508046648649964, + "grad_norm": 2.498112916946411, + "learning_rate": 3.651427604015639e-05, + "loss": 0.8457, + "step": 4417 + }, + { + "epoch": 0.21512916027560686, + "grad_norm": 1.9994069337844849, + "learning_rate": 3.651249648967728e-05, + "loss": 0.8963, + "step": 4418 + }, + { + "epoch": 0.21517785406471404, + "grad_norm": 2.0420029163360596, + "learning_rate": 3.651071652844831e-05, + "loss": 0.9349, + "step": 4419 + }, + { + "epoch": 0.21522654785382125, + "grad_norm": 2.2552387714385986, + "learning_rate": 3.650893615651377e-05, + "loss": 0.8637, + "step": 4420 + }, + { + "epoch": 0.21527524164292844, + "grad_norm": 1.7718279361724854, + "learning_rate": 3.6507155373917936e-05, + "loss": 0.8483, + "step": 4421 + }, + { + "epoch": 0.21532393543203565, + "grad_norm": 1.9089759588241577, + "learning_rate": 3.65053741807051e-05, + "loss": 0.8572, + "step": 4422 + }, + { + "epoch": 0.21537262922114284, + "grad_norm": 2.0838401317596436, + "learning_rate": 3.650359257691959e-05, + "loss": 0.9025, + "step": 4423 + }, + { + "epoch": 0.21542132301025005, + "grad_norm": 0.08785957843065262, + "learning_rate": 3.65018105626057e-05, + "loss": 0.6459, + "step": 4424 + }, + { + "epoch": 0.21547001679935723, + "grad_norm": 6.984577178955078, + "learning_rate": 3.650002813780777e-05, + "loss": 0.8359, + "step": 4425 + }, + { + "epoch": 0.21551871058846445, + "grad_norm": 1.792027235031128, + "learning_rate": 3.6498245302570135e-05, + "loss": 0.799, + "step": 4426 + }, + { + "epoch": 0.21556740437757163, + "grad_norm": 2.257126808166504, + "learning_rate": 3.649646205693714e-05, + "loss": 0.9102, + "step": 4427 + }, + { + "epoch": 0.21561609816667884, + "grad_norm": 1.9583617448806763, + "learning_rate": 3.649467840095316e-05, + "loss": 0.9051, + "step": 4428 + }, + { + "epoch": 0.21566479195578603, + "grad_norm": 2.100942611694336, + "learning_rate": 3.6492894334662536e-05, + "loss": 0.8951, + "step": 4429 + }, + { + "epoch": 0.21571348574489324, + "grad_norm": 3.3743157386779785, + "learning_rate": 3.649110985810967e-05, + "loss": 0.8938, + "step": 4430 + }, + { + "epoch": 0.21576217953400043, + "grad_norm": 1.5023890733718872, + "learning_rate": 3.6489324971338944e-05, + "loss": 0.8041, + "step": 4431 + }, + { + "epoch": 0.21581087332310764, + "grad_norm": 1.3777159452438354, + "learning_rate": 3.648753967439475e-05, + "loss": 0.8705, + "step": 4432 + }, + { + "epoch": 0.21585956711221485, + "grad_norm": 1.7614414691925049, + "learning_rate": 3.6485753967321506e-05, + "loss": 0.8884, + "step": 4433 + }, + { + "epoch": 0.21590826090132204, + "grad_norm": 2.0921852588653564, + "learning_rate": 3.648396785016363e-05, + "loss": 0.8347, + "step": 4434 + }, + { + "epoch": 0.21595695469042925, + "grad_norm": 1.5252304077148438, + "learning_rate": 3.648218132296555e-05, + "loss": 0.856, + "step": 4435 + }, + { + "epoch": 0.21600564847953643, + "grad_norm": 1.7473461627960205, + "learning_rate": 3.6480394385771704e-05, + "loss": 0.868, + "step": 4436 + }, + { + "epoch": 0.21605434226864365, + "grad_norm": 2.844672679901123, + "learning_rate": 3.6478607038626545e-05, + "loss": 0.8318, + "step": 4437 + }, + { + "epoch": 0.21610303605775083, + "grad_norm": 0.08277241140604019, + "learning_rate": 3.6476819281574536e-05, + "loss": 0.6504, + "step": 4438 + }, + { + "epoch": 0.21615172984685804, + "grad_norm": 3.3618974685668945, + "learning_rate": 3.647503111466015e-05, + "loss": 0.823, + "step": 4439 + }, + { + "epoch": 0.21620042363596523, + "grad_norm": 2.239453077316284, + "learning_rate": 3.647324253792785e-05, + "loss": 0.7609, + "step": 4440 + }, + { + "epoch": 0.21624911742507244, + "grad_norm": 4.016051769256592, + "learning_rate": 3.647145355142216e-05, + "loss": 0.8562, + "step": 4441 + }, + { + "epoch": 0.21629781121417962, + "grad_norm": 1.8269758224487305, + "learning_rate": 3.6469664155187545e-05, + "loss": 0.7884, + "step": 4442 + }, + { + "epoch": 0.21634650500328684, + "grad_norm": 5.856226444244385, + "learning_rate": 3.646787434926854e-05, + "loss": 0.9582, + "step": 4443 + }, + { + "epoch": 0.21639519879239402, + "grad_norm": 2.3072926998138428, + "learning_rate": 3.646608413370965e-05, + "loss": 0.8846, + "step": 4444 + }, + { + "epoch": 0.21644389258150123, + "grad_norm": 1.6501400470733643, + "learning_rate": 3.646429350855542e-05, + "loss": 0.9032, + "step": 4445 + }, + { + "epoch": 0.21649258637060842, + "grad_norm": 2.411881446838379, + "learning_rate": 3.6462502473850394e-05, + "loss": 0.9043, + "step": 4446 + }, + { + "epoch": 0.21654128015971563, + "grad_norm": 0.08324775099754333, + "learning_rate": 3.6460711029639104e-05, + "loss": 0.5816, + "step": 4447 + }, + { + "epoch": 0.21658997394882282, + "grad_norm": 5.955951690673828, + "learning_rate": 3.645891917596614e-05, + "loss": 0.8925, + "step": 4448 + }, + { + "epoch": 0.21663866773793003, + "grad_norm": 3.6997642517089844, + "learning_rate": 3.6457126912876055e-05, + "loss": 0.9269, + "step": 4449 + }, + { + "epoch": 0.21668736152703721, + "grad_norm": 1.4912108182907104, + "learning_rate": 3.645533424041344e-05, + "loss": 0.8918, + "step": 4450 + }, + { + "epoch": 0.21673605531614443, + "grad_norm": 1.7180818319320679, + "learning_rate": 3.645354115862288e-05, + "loss": 0.8984, + "step": 4451 + }, + { + "epoch": 0.2167847491052516, + "grad_norm": 1.5317152738571167, + "learning_rate": 3.6451747667548994e-05, + "loss": 0.9254, + "step": 4452 + }, + { + "epoch": 0.21683344289435882, + "grad_norm": 1.635852336883545, + "learning_rate": 3.6449953767236375e-05, + "loss": 0.9207, + "step": 4453 + }, + { + "epoch": 0.21688213668346604, + "grad_norm": 1.7095438241958618, + "learning_rate": 3.644815945772966e-05, + "loss": 0.8301, + "step": 4454 + }, + { + "epoch": 0.21693083047257322, + "grad_norm": 1.7325973510742188, + "learning_rate": 3.644636473907348e-05, + "loss": 0.8743, + "step": 4455 + }, + { + "epoch": 0.21697952426168043, + "grad_norm": 2.6432390213012695, + "learning_rate": 3.6444569611312476e-05, + "loss": 0.8656, + "step": 4456 + }, + { + "epoch": 0.21702821805078762, + "grad_norm": 1.8866318464279175, + "learning_rate": 3.644277407449131e-05, + "loss": 0.8871, + "step": 4457 + }, + { + "epoch": 0.21707691183989483, + "grad_norm": 1.8281736373901367, + "learning_rate": 3.644097812865463e-05, + "loss": 0.8592, + "step": 4458 + }, + { + "epoch": 0.21712560562900202, + "grad_norm": 2.237212896347046, + "learning_rate": 3.643918177384712e-05, + "loss": 0.9259, + "step": 4459 + }, + { + "epoch": 0.21717429941810923, + "grad_norm": 7.0260515213012695, + "learning_rate": 3.643738501011347e-05, + "loss": 0.8948, + "step": 4460 + }, + { + "epoch": 0.2172229932072164, + "grad_norm": 2.110260486602783, + "learning_rate": 3.643558783749836e-05, + "loss": 0.7925, + "step": 4461 + }, + { + "epoch": 0.21727168699632363, + "grad_norm": 2.0082805156707764, + "learning_rate": 3.6433790256046516e-05, + "loss": 0.9116, + "step": 4462 + }, + { + "epoch": 0.2173203807854308, + "grad_norm": 1.3003160953521729, + "learning_rate": 3.6431992265802636e-05, + "loss": 0.8534, + "step": 4463 + }, + { + "epoch": 0.21736907457453802, + "grad_norm": 1.3222246170043945, + "learning_rate": 3.643019386681145e-05, + "loss": 0.9627, + "step": 4464 + }, + { + "epoch": 0.2174177683636452, + "grad_norm": 1.6165351867675781, + "learning_rate": 3.6428395059117694e-05, + "loss": 0.8064, + "step": 4465 + }, + { + "epoch": 0.21746646215275242, + "grad_norm": 1.8202532529830933, + "learning_rate": 3.642659584276612e-05, + "loss": 0.9746, + "step": 4466 + }, + { + "epoch": 0.2175151559418596, + "grad_norm": 1.4616191387176514, + "learning_rate": 3.6424796217801464e-05, + "loss": 0.8482, + "step": 4467 + }, + { + "epoch": 0.21756384973096682, + "grad_norm": 1.931198239326477, + "learning_rate": 3.642299618426851e-05, + "loss": 0.9237, + "step": 4468 + }, + { + "epoch": 0.217612543520074, + "grad_norm": 1.6880102157592773, + "learning_rate": 3.6421195742212026e-05, + "loss": 0.8003, + "step": 4469 + }, + { + "epoch": 0.21766123730918122, + "grad_norm": 1.4855492115020752, + "learning_rate": 3.6419394891676804e-05, + "loss": 0.8283, + "step": 4470 + }, + { + "epoch": 0.2177099310982884, + "grad_norm": 1.930121898651123, + "learning_rate": 3.641759363270764e-05, + "loss": 0.8788, + "step": 4471 + }, + { + "epoch": 0.2177586248873956, + "grad_norm": 2.344174861907959, + "learning_rate": 3.6415791965349335e-05, + "loss": 0.9342, + "step": 4472 + }, + { + "epoch": 0.21780731867650283, + "grad_norm": 2.0181941986083984, + "learning_rate": 3.6413989889646704e-05, + "loss": 0.8082, + "step": 4473 + }, + { + "epoch": 0.21785601246561, + "grad_norm": 2.2775278091430664, + "learning_rate": 3.641218740564458e-05, + "loss": 0.9066, + "step": 4474 + }, + { + "epoch": 0.21790470625471722, + "grad_norm": 1.6425511837005615, + "learning_rate": 3.64103845133878e-05, + "loss": 0.8051, + "step": 4475 + }, + { + "epoch": 0.2179534000438244, + "grad_norm": 2.168531656265259, + "learning_rate": 3.64085812129212e-05, + "loss": 0.9369, + "step": 4476 + }, + { + "epoch": 0.21800209383293162, + "grad_norm": 3.702645778656006, + "learning_rate": 3.640677750428966e-05, + "loss": 0.8697, + "step": 4477 + }, + { + "epoch": 0.2180507876220388, + "grad_norm": 1.3738694190979004, + "learning_rate": 3.640497338753803e-05, + "loss": 0.9594, + "step": 4478 + }, + { + "epoch": 0.21809948141114602, + "grad_norm": 1.6994898319244385, + "learning_rate": 3.6403168862711185e-05, + "loss": 0.7786, + "step": 4479 + }, + { + "epoch": 0.2181481752002532, + "grad_norm": 1.4948806762695312, + "learning_rate": 3.6401363929854025e-05, + "loss": 0.9205, + "step": 4480 + }, + { + "epoch": 0.21819686898936042, + "grad_norm": 1.8080536127090454, + "learning_rate": 3.639955858901144e-05, + "loss": 0.9391, + "step": 4481 + }, + { + "epoch": 0.2182455627784676, + "grad_norm": 1.6313260793685913, + "learning_rate": 3.639775284022834e-05, + "loss": 0.8487, + "step": 4482 + }, + { + "epoch": 0.2182942565675748, + "grad_norm": 1.3522164821624756, + "learning_rate": 3.6395946683549646e-05, + "loss": 0.8701, + "step": 4483 + }, + { + "epoch": 0.218342950356682, + "grad_norm": 1.8332459926605225, + "learning_rate": 3.639414011902027e-05, + "loss": 0.8751, + "step": 4484 + }, + { + "epoch": 0.2183916441457892, + "grad_norm": 1.6933164596557617, + "learning_rate": 3.639233314668518e-05, + "loss": 0.8613, + "step": 4485 + }, + { + "epoch": 0.2184403379348964, + "grad_norm": 2.0124316215515137, + "learning_rate": 3.6390525766589305e-05, + "loss": 0.9475, + "step": 4486 + }, + { + "epoch": 0.2184890317240036, + "grad_norm": 0.08501318097114563, + "learning_rate": 3.63887179787776e-05, + "loss": 0.5803, + "step": 4487 + }, + { + "epoch": 0.2185377255131108, + "grad_norm": 1.7565892934799194, + "learning_rate": 3.6386909783295046e-05, + "loss": 0.8539, + "step": 4488 + }, + { + "epoch": 0.218586419302218, + "grad_norm": 2.2619993686676025, + "learning_rate": 3.6385101180186616e-05, + "loss": 0.9239, + "step": 4489 + }, + { + "epoch": 0.2186351130913252, + "grad_norm": 2.8189375400543213, + "learning_rate": 3.63832921694973e-05, + "loss": 0.7866, + "step": 4490 + }, + { + "epoch": 0.2186838068804324, + "grad_norm": 4.207200527191162, + "learning_rate": 3.638148275127209e-05, + "loss": 0.8778, + "step": 4491 + }, + { + "epoch": 0.2187325006695396, + "grad_norm": 3.2257046699523926, + "learning_rate": 3.637967292555602e-05, + "loss": 0.9653, + "step": 4492 + }, + { + "epoch": 0.2187811944586468, + "grad_norm": 1.442470908164978, + "learning_rate": 3.6377862692394085e-05, + "loss": 0.867, + "step": 4493 + }, + { + "epoch": 0.218829888247754, + "grad_norm": 2.0284488201141357, + "learning_rate": 3.6376052051831316e-05, + "loss": 0.7746, + "step": 4494 + }, + { + "epoch": 0.2188785820368612, + "grad_norm": 1.4463790655136108, + "learning_rate": 3.6374241003912764e-05, + "loss": 0.8568, + "step": 4495 + }, + { + "epoch": 0.2189272758259684, + "grad_norm": 1.5654553174972534, + "learning_rate": 3.637242954868348e-05, + "loss": 0.9412, + "step": 4496 + }, + { + "epoch": 0.2189759696150756, + "grad_norm": 1.7653363943099976, + "learning_rate": 3.6370617686188506e-05, + "loss": 0.9453, + "step": 4497 + }, + { + "epoch": 0.2190246634041828, + "grad_norm": 1.8703677654266357, + "learning_rate": 3.636880541647293e-05, + "loss": 0.8897, + "step": 4498 + }, + { + "epoch": 0.21907335719329, + "grad_norm": 2.433931827545166, + "learning_rate": 3.636699273958183e-05, + "loss": 0.9196, + "step": 4499 + }, + { + "epoch": 0.2191220509823972, + "grad_norm": 1.8301204442977905, + "learning_rate": 3.636517965556029e-05, + "loss": 0.894, + "step": 4500 + }, + { + "epoch": 0.2191707447715044, + "grad_norm": 2.148613691329956, + "learning_rate": 3.636336616445341e-05, + "loss": 0.807, + "step": 4501 + }, + { + "epoch": 0.2192194385606116, + "grad_norm": 1.645344614982605, + "learning_rate": 3.6361552266306315e-05, + "loss": 0.8551, + "step": 4502 + }, + { + "epoch": 0.2192681323497188, + "grad_norm": 4.180613994598389, + "learning_rate": 3.635973796116411e-05, + "loss": 0.844, + "step": 4503 + }, + { + "epoch": 0.219316826138826, + "grad_norm": 1.1959846019744873, + "learning_rate": 3.635792324907193e-05, + "loss": 0.8368, + "step": 4504 + }, + { + "epoch": 0.21936551992793318, + "grad_norm": 1.7437371015548706, + "learning_rate": 3.635610813007492e-05, + "loss": 0.9124, + "step": 4505 + }, + { + "epoch": 0.2194142137170404, + "grad_norm": 1.8526735305786133, + "learning_rate": 3.6354292604218224e-05, + "loss": 0.8403, + "step": 4506 + }, + { + "epoch": 0.21946290750614758, + "grad_norm": 1.3483972549438477, + "learning_rate": 3.6352476671547015e-05, + "loss": 0.8516, + "step": 4507 + }, + { + "epoch": 0.2195116012952548, + "grad_norm": 2.0221409797668457, + "learning_rate": 3.6350660332106455e-05, + "loss": 0.8764, + "step": 4508 + }, + { + "epoch": 0.21956029508436198, + "grad_norm": 2.4406144618988037, + "learning_rate": 3.634884358594173e-05, + "loss": 0.9481, + "step": 4509 + }, + { + "epoch": 0.2196089888734692, + "grad_norm": 2.0860655307769775, + "learning_rate": 3.634702643309803e-05, + "loss": 0.9042, + "step": 4510 + }, + { + "epoch": 0.21965768266257638, + "grad_norm": 1.8701386451721191, + "learning_rate": 3.634520887362055e-05, + "loss": 0.9441, + "step": 4511 + }, + { + "epoch": 0.2197063764516836, + "grad_norm": 2.450546979904175, + "learning_rate": 3.634339090755452e-05, + "loss": 0.7805, + "step": 4512 + }, + { + "epoch": 0.2197550702407908, + "grad_norm": 1.4989277124404907, + "learning_rate": 3.634157253494515e-05, + "loss": 0.8737, + "step": 4513 + }, + { + "epoch": 0.21980376402989799, + "grad_norm": 0.09250632673501968, + "learning_rate": 3.6339753755837665e-05, + "loss": 0.63, + "step": 4514 + }, + { + "epoch": 0.2198524578190052, + "grad_norm": 2.283071756362915, + "learning_rate": 3.6337934570277315e-05, + "loss": 0.9605, + "step": 4515 + }, + { + "epoch": 0.21990115160811238, + "grad_norm": 2.004622220993042, + "learning_rate": 3.6336114978309364e-05, + "loss": 0.9518, + "step": 4516 + }, + { + "epoch": 0.2199498453972196, + "grad_norm": 2.1435697078704834, + "learning_rate": 3.633429497997906e-05, + "loss": 0.9175, + "step": 4517 + }, + { + "epoch": 0.21999853918632678, + "grad_norm": 1.3602653741836548, + "learning_rate": 3.633247457533167e-05, + "loss": 0.8561, + "step": 4518 + }, + { + "epoch": 0.220047232975434, + "grad_norm": 2.1580810546875, + "learning_rate": 3.633065376441249e-05, + "loss": 0.954, + "step": 4519 + }, + { + "epoch": 0.22009592676454118, + "grad_norm": 1.795380711555481, + "learning_rate": 3.632883254726681e-05, + "loss": 0.8952, + "step": 4520 + }, + { + "epoch": 0.2201446205536484, + "grad_norm": 1.6063721179962158, + "learning_rate": 3.6327010923939934e-05, + "loss": 0.8516, + "step": 4521 + }, + { + "epoch": 0.22019331434275558, + "grad_norm": 0.08542740345001221, + "learning_rate": 3.632518889447717e-05, + "loss": 0.6197, + "step": 4522 + }, + { + "epoch": 0.2202420081318628, + "grad_norm": 2.3546218872070312, + "learning_rate": 3.6323366458923846e-05, + "loss": 0.8796, + "step": 4523 + }, + { + "epoch": 0.22029070192096997, + "grad_norm": 3.0989511013031006, + "learning_rate": 3.6321543617325295e-05, + "loss": 0.9089, + "step": 4524 + }, + { + "epoch": 0.22033939571007718, + "grad_norm": 2.07474946975708, + "learning_rate": 3.631972036972685e-05, + "loss": 0.9253, + "step": 4525 + }, + { + "epoch": 0.22038808949918437, + "grad_norm": 2.1646759510040283, + "learning_rate": 3.631789671617388e-05, + "loss": 0.9428, + "step": 4526 + }, + { + "epoch": 0.22043678328829158, + "grad_norm": 0.0838804543018341, + "learning_rate": 3.631607265671174e-05, + "loss": 0.536, + "step": 4527 + }, + { + "epoch": 0.22048547707739877, + "grad_norm": 1.6166610717773438, + "learning_rate": 3.6314248191385804e-05, + "loss": 0.8193, + "step": 4528 + }, + { + "epoch": 0.22053417086650598, + "grad_norm": 3.8060367107391357, + "learning_rate": 3.631242332024146e-05, + "loss": 0.866, + "step": 4529 + }, + { + "epoch": 0.22058286465561316, + "grad_norm": 1.9916479587554932, + "learning_rate": 3.631059804332409e-05, + "loss": 0.7441, + "step": 4530 + }, + { + "epoch": 0.22063155844472038, + "grad_norm": 1.3267548084259033, + "learning_rate": 3.6308772360679115e-05, + "loss": 0.9638, + "step": 4531 + }, + { + "epoch": 0.22068025223382756, + "grad_norm": 1.826101303100586, + "learning_rate": 3.630694627235194e-05, + "loss": 0.9463, + "step": 4532 + }, + { + "epoch": 0.22072894602293477, + "grad_norm": 1.8168621063232422, + "learning_rate": 3.630511977838798e-05, + "loss": 0.8315, + "step": 4533 + }, + { + "epoch": 0.220777639812042, + "grad_norm": 1.6994853019714355, + "learning_rate": 3.630329287883269e-05, + "loss": 0.8765, + "step": 4534 + }, + { + "epoch": 0.22082633360114917, + "grad_norm": 3.849287748336792, + "learning_rate": 3.6301465573731494e-05, + "loss": 0.8386, + "step": 4535 + }, + { + "epoch": 0.22087502739025638, + "grad_norm": 1.4525222778320312, + "learning_rate": 3.629963786312987e-05, + "loss": 0.8406, + "step": 4536 + }, + { + "epoch": 0.22092372117936357, + "grad_norm": 0.08302707970142365, + "learning_rate": 3.629780974707325e-05, + "loss": 0.5868, + "step": 4537 + }, + { + "epoch": 0.22097241496847078, + "grad_norm": 2.413804054260254, + "learning_rate": 3.629598122560714e-05, + "loss": 0.8682, + "step": 4538 + }, + { + "epoch": 0.22102110875757797, + "grad_norm": 1.4526333808898926, + "learning_rate": 3.629415229877701e-05, + "loss": 0.8397, + "step": 4539 + }, + { + "epoch": 0.22106980254668518, + "grad_norm": 1.5666217803955078, + "learning_rate": 3.629232296662835e-05, + "loss": 0.9233, + "step": 4540 + }, + { + "epoch": 0.22111849633579236, + "grad_norm": 2.292985200881958, + "learning_rate": 3.629049322920668e-05, + "loss": 0.9329, + "step": 4541 + }, + { + "epoch": 0.22116719012489958, + "grad_norm": 1.5789239406585693, + "learning_rate": 3.628866308655749e-05, + "loss": 0.9502, + "step": 4542 + }, + { + "epoch": 0.22121588391400676, + "grad_norm": 1.8638317584991455, + "learning_rate": 3.628683253872633e-05, + "loss": 0.8143, + "step": 4543 + }, + { + "epoch": 0.22126457770311397, + "grad_norm": 2.6644160747528076, + "learning_rate": 3.628500158575873e-05, + "loss": 0.9186, + "step": 4544 + }, + { + "epoch": 0.22131327149222116, + "grad_norm": 4.005434036254883, + "learning_rate": 3.628317022770022e-05, + "loss": 0.8876, + "step": 4545 + }, + { + "epoch": 0.22136196528132837, + "grad_norm": 2.045041799545288, + "learning_rate": 3.628133846459638e-05, + "loss": 0.9304, + "step": 4546 + }, + { + "epoch": 0.22141065907043556, + "grad_norm": 2.3641669750213623, + "learning_rate": 3.627950629649274e-05, + "loss": 0.8863, + "step": 4547 + }, + { + "epoch": 0.22145935285954277, + "grad_norm": 1.6965452432632446, + "learning_rate": 3.6277673723434915e-05, + "loss": 0.8818, + "step": 4548 + }, + { + "epoch": 0.22150804664864995, + "grad_norm": 3.767761468887329, + "learning_rate": 3.627584074546847e-05, + "loss": 0.9544, + "step": 4549 + }, + { + "epoch": 0.22155674043775717, + "grad_norm": 1.8000335693359375, + "learning_rate": 3.6274007362638994e-05, + "loss": 0.8998, + "step": 4550 + }, + { + "epoch": 0.22160543422686435, + "grad_norm": 2.108473062515259, + "learning_rate": 3.627217357499211e-05, + "loss": 0.8882, + "step": 4551 + }, + { + "epoch": 0.22165412801597156, + "grad_norm": 1.672225832939148, + "learning_rate": 3.6270339382573426e-05, + "loss": 0.8552, + "step": 4552 + }, + { + "epoch": 0.22170282180507878, + "grad_norm": 1.6844115257263184, + "learning_rate": 3.626850478542856e-05, + "loss": 0.919, + "step": 4553 + }, + { + "epoch": 0.22175151559418596, + "grad_norm": 3.785792350769043, + "learning_rate": 3.626666978360315e-05, + "loss": 0.8031, + "step": 4554 + }, + { + "epoch": 0.22180020938329317, + "grad_norm": 1.623430609703064, + "learning_rate": 3.626483437714285e-05, + "loss": 0.8269, + "step": 4555 + }, + { + "epoch": 0.22184890317240036, + "grad_norm": 1.800370693206787, + "learning_rate": 3.626299856609332e-05, + "loss": 0.9925, + "step": 4556 + }, + { + "epoch": 0.22189759696150757, + "grad_norm": 0.08679993450641632, + "learning_rate": 3.626116235050021e-05, + "loss": 0.6121, + "step": 4557 + }, + { + "epoch": 0.22194629075061476, + "grad_norm": 2.1097025871276855, + "learning_rate": 3.6259325730409204e-05, + "loss": 0.7998, + "step": 4558 + }, + { + "epoch": 0.22199498453972197, + "grad_norm": 1.874176025390625, + "learning_rate": 3.6257488705865986e-05, + "loss": 0.8959, + "step": 4559 + }, + { + "epoch": 0.22204367832882915, + "grad_norm": 4.409998416900635, + "learning_rate": 3.625565127691626e-05, + "loss": 0.8159, + "step": 4560 + }, + { + "epoch": 0.22209237211793637, + "grad_norm": 2.0258195400238037, + "learning_rate": 3.625381344360572e-05, + "loss": 0.8475, + "step": 4561 + }, + { + "epoch": 0.22214106590704355, + "grad_norm": 1.4811097383499146, + "learning_rate": 3.625197520598009e-05, + "loss": 0.8626, + "step": 4562 + }, + { + "epoch": 0.22218975969615076, + "grad_norm": 1.9918177127838135, + "learning_rate": 3.62501365640851e-05, + "loss": 0.9137, + "step": 4563 + }, + { + "epoch": 0.22223845348525795, + "grad_norm": 2.138864278793335, + "learning_rate": 3.6248297517966477e-05, + "loss": 0.8782, + "step": 4564 + }, + { + "epoch": 0.22228714727436516, + "grad_norm": 0.15065553784370422, + "learning_rate": 3.6246458067669975e-05, + "loss": 0.6074, + "step": 4565 + }, + { + "epoch": 0.22233584106347234, + "grad_norm": 2.49973726272583, + "learning_rate": 3.624461821324134e-05, + "loss": 0.8533, + "step": 4566 + }, + { + "epoch": 0.22238453485257956, + "grad_norm": 1.7295652627944946, + "learning_rate": 3.624277795472636e-05, + "loss": 1.0111, + "step": 4567 + }, + { + "epoch": 0.22243322864168674, + "grad_norm": 4.025268077850342, + "learning_rate": 3.624093729217078e-05, + "loss": 0.8187, + "step": 4568 + }, + { + "epoch": 0.22248192243079395, + "grad_norm": 2.5719473361968994, + "learning_rate": 3.623909622562042e-05, + "loss": 0.9491, + "step": 4569 + }, + { + "epoch": 0.22253061621990114, + "grad_norm": 0.08109584450721741, + "learning_rate": 3.623725475512105e-05, + "loss": 0.6409, + "step": 4570 + }, + { + "epoch": 0.22257931000900835, + "grad_norm": 2.4057836532592773, + "learning_rate": 3.6235412880718494e-05, + "loss": 0.8917, + "step": 4571 + }, + { + "epoch": 0.22262800379811554, + "grad_norm": 2.903411388397217, + "learning_rate": 3.623357060245856e-05, + "loss": 0.9575, + "step": 4572 + }, + { + "epoch": 0.22267669758722275, + "grad_norm": 1.927666425704956, + "learning_rate": 3.623172792038707e-05, + "loss": 0.8644, + "step": 4573 + }, + { + "epoch": 0.22272539137632996, + "grad_norm": 9.08356761932373, + "learning_rate": 3.622988483454988e-05, + "loss": 0.9306, + "step": 4574 + }, + { + "epoch": 0.22277408516543715, + "grad_norm": 2.2056539058685303, + "learning_rate": 3.622804134499282e-05, + "loss": 0.9208, + "step": 4575 + }, + { + "epoch": 0.22282277895454436, + "grad_norm": 1.5880838632583618, + "learning_rate": 3.622619745176175e-05, + "loss": 0.8982, + "step": 4576 + }, + { + "epoch": 0.22287147274365154, + "grad_norm": 1.630527138710022, + "learning_rate": 3.622435315490254e-05, + "loss": 0.8285, + "step": 4577 + }, + { + "epoch": 0.22292016653275876, + "grad_norm": 1.6849991083145142, + "learning_rate": 3.622250845446106e-05, + "loss": 0.8836, + "step": 4578 + }, + { + "epoch": 0.22296886032186594, + "grad_norm": 1.419697642326355, + "learning_rate": 3.6220663350483215e-05, + "loss": 0.9232, + "step": 4579 + }, + { + "epoch": 0.22301755411097315, + "grad_norm": 1.467543125152588, + "learning_rate": 3.621881784301489e-05, + "loss": 0.899, + "step": 4580 + }, + { + "epoch": 0.22306624790008034, + "grad_norm": 1.449865698814392, + "learning_rate": 3.621697193210199e-05, + "loss": 0.8634, + "step": 4581 + }, + { + "epoch": 0.22311494168918755, + "grad_norm": 1.3476004600524902, + "learning_rate": 3.6215125617790426e-05, + "loss": 0.9344, + "step": 4582 + }, + { + "epoch": 0.22316363547829474, + "grad_norm": 2.163207769393921, + "learning_rate": 3.621327890012614e-05, + "loss": 0.8214, + "step": 4583 + }, + { + "epoch": 0.22321232926740195, + "grad_norm": 2.306140899658203, + "learning_rate": 3.6211431779155056e-05, + "loss": 0.8668, + "step": 4584 + }, + { + "epoch": 0.22326102305650913, + "grad_norm": 2.2374119758605957, + "learning_rate": 3.620958425492314e-05, + "loss": 0.9795, + "step": 4585 + }, + { + "epoch": 0.22330971684561635, + "grad_norm": 1.827734351158142, + "learning_rate": 3.620773632747633e-05, + "loss": 0.8229, + "step": 4586 + }, + { + "epoch": 0.22335841063472353, + "grad_norm": 1.7655839920043945, + "learning_rate": 3.6205887996860603e-05, + "loss": 0.8181, + "step": 4587 + }, + { + "epoch": 0.22340710442383074, + "grad_norm": 1.617902398109436, + "learning_rate": 3.6204039263121936e-05, + "loss": 0.9648, + "step": 4588 + }, + { + "epoch": 0.22345579821293793, + "grad_norm": 1.628030776977539, + "learning_rate": 3.620219012630632e-05, + "loss": 0.8323, + "step": 4589 + }, + { + "epoch": 0.22350449200204514, + "grad_norm": 1.6171636581420898, + "learning_rate": 3.620034058645974e-05, + "loss": 0.7824, + "step": 4590 + }, + { + "epoch": 0.22355318579115233, + "grad_norm": 1.7592724561691284, + "learning_rate": 3.619849064362821e-05, + "loss": 0.885, + "step": 4591 + }, + { + "epoch": 0.22360187958025954, + "grad_norm": 1.6092674732208252, + "learning_rate": 3.6196640297857755e-05, + "loss": 0.8195, + "step": 4592 + }, + { + "epoch": 0.22365057336936675, + "grad_norm": 2.0136330127716064, + "learning_rate": 3.619478954919439e-05, + "loss": 0.8339, + "step": 4593 + }, + { + "epoch": 0.22369926715847394, + "grad_norm": 2.3517563343048096, + "learning_rate": 3.619293839768416e-05, + "loss": 0.8589, + "step": 4594 + }, + { + "epoch": 0.22374796094758115, + "grad_norm": 1.4963265657424927, + "learning_rate": 3.619108684337311e-05, + "loss": 0.9444, + "step": 4595 + }, + { + "epoch": 0.22379665473668833, + "grad_norm": 3.9175078868865967, + "learning_rate": 3.61892348863073e-05, + "loss": 0.8798, + "step": 4596 + }, + { + "epoch": 0.22384534852579555, + "grad_norm": 0.07576202601194382, + "learning_rate": 3.618738252653279e-05, + "loss": 0.6005, + "step": 4597 + }, + { + "epoch": 0.22389404231490273, + "grad_norm": 0.08635574579238892, + "learning_rate": 3.618552976409567e-05, + "loss": 0.6713, + "step": 4598 + }, + { + "epoch": 0.22394273610400994, + "grad_norm": 1.731321096420288, + "learning_rate": 3.618367659904202e-05, + "loss": 0.9474, + "step": 4599 + }, + { + "epoch": 0.22399142989311713, + "grad_norm": 1.7615723609924316, + "learning_rate": 3.618182303141794e-05, + "loss": 0.8605, + "step": 4600 + }, + { + "epoch": 0.22404012368222434, + "grad_norm": 2.0088696479797363, + "learning_rate": 3.6179969061269536e-05, + "loss": 0.9129, + "step": 4601 + }, + { + "epoch": 0.22408881747133153, + "grad_norm": 2.251563549041748, + "learning_rate": 3.617811468864293e-05, + "loss": 0.9213, + "step": 4602 + }, + { + "epoch": 0.22413751126043874, + "grad_norm": 3.2181150913238525, + "learning_rate": 3.617625991358425e-05, + "loss": 0.8444, + "step": 4603 + }, + { + "epoch": 0.22418620504954592, + "grad_norm": 1.709924340248108, + "learning_rate": 3.617440473613961e-05, + "loss": 0.9283, + "step": 4604 + }, + { + "epoch": 0.22423489883865313, + "grad_norm": 1.6122965812683105, + "learning_rate": 3.6172549156355194e-05, + "loss": 0.7831, + "step": 4605 + }, + { + "epoch": 0.22428359262776032, + "grad_norm": 1.4542242288589478, + "learning_rate": 3.6170693174277134e-05, + "loss": 0.954, + "step": 4606 + }, + { + "epoch": 0.22433228641686753, + "grad_norm": 2.0291316509246826, + "learning_rate": 3.616883678995162e-05, + "loss": 0.9218, + "step": 4607 + }, + { + "epoch": 0.22438098020597472, + "grad_norm": 2.1912906169891357, + "learning_rate": 3.6166980003424805e-05, + "loss": 0.8424, + "step": 4608 + }, + { + "epoch": 0.22442967399508193, + "grad_norm": 1.494773507118225, + "learning_rate": 3.6165122814742897e-05, + "loss": 0.8926, + "step": 4609 + }, + { + "epoch": 0.22447836778418911, + "grad_norm": 1.606200933456421, + "learning_rate": 3.616326522395207e-05, + "loss": 0.9772, + "step": 4610 + }, + { + "epoch": 0.22452706157329633, + "grad_norm": 1.966639518737793, + "learning_rate": 3.616140723109856e-05, + "loss": 0.9256, + "step": 4611 + }, + { + "epoch": 0.2245757553624035, + "grad_norm": 1.8037673234939575, + "learning_rate": 3.615954883622857e-05, + "loss": 0.8147, + "step": 4612 + }, + { + "epoch": 0.22462444915151072, + "grad_norm": 1.722983956336975, + "learning_rate": 3.615769003938833e-05, + "loss": 0.8853, + "step": 4613 + }, + { + "epoch": 0.22467314294061794, + "grad_norm": 1.957232117652893, + "learning_rate": 3.615583084062407e-05, + "loss": 0.7992, + "step": 4614 + }, + { + "epoch": 0.22472183672972512, + "grad_norm": 1.5172545909881592, + "learning_rate": 3.615397123998205e-05, + "loss": 0.9267, + "step": 4615 + }, + { + "epoch": 0.22477053051883233, + "grad_norm": 1.8611398935317993, + "learning_rate": 3.615211123750853e-05, + "loss": 0.8998, + "step": 4616 + }, + { + "epoch": 0.22481922430793952, + "grad_norm": 1.5981357097625732, + "learning_rate": 3.615025083324976e-05, + "loss": 0.8568, + "step": 4617 + }, + { + "epoch": 0.22486791809704673, + "grad_norm": 2.658914089202881, + "learning_rate": 3.614839002725203e-05, + "loss": 0.8472, + "step": 4618 + }, + { + "epoch": 0.22491661188615392, + "grad_norm": 1.628898024559021, + "learning_rate": 3.614652881956163e-05, + "loss": 0.9015, + "step": 4619 + }, + { + "epoch": 0.22496530567526113, + "grad_norm": 1.636614203453064, + "learning_rate": 3.6144667210224846e-05, + "loss": 0.887, + "step": 4620 + }, + { + "epoch": 0.22501399946436831, + "grad_norm": 2.0175254344940186, + "learning_rate": 3.6142805199288e-05, + "loss": 0.8194, + "step": 4621 + }, + { + "epoch": 0.22506269325347553, + "grad_norm": 1.522363543510437, + "learning_rate": 3.61409427867974e-05, + "loss": 0.9655, + "step": 4622 + }, + { + "epoch": 0.2251113870425827, + "grad_norm": 1.9694855213165283, + "learning_rate": 3.613907997279937e-05, + "loss": 0.9287, + "step": 4623 + }, + { + "epoch": 0.22516008083168992, + "grad_norm": 1.6772866249084473, + "learning_rate": 3.613721675734026e-05, + "loss": 0.9748, + "step": 4624 + }, + { + "epoch": 0.2252087746207971, + "grad_norm": 1.4715759754180908, + "learning_rate": 3.613535314046642e-05, + "loss": 0.9218, + "step": 4625 + }, + { + "epoch": 0.22525746840990432, + "grad_norm": 1.9913805723190308, + "learning_rate": 3.613348912222419e-05, + "loss": 0.8725, + "step": 4626 + }, + { + "epoch": 0.2253061621990115, + "grad_norm": 2.013695001602173, + "learning_rate": 3.6131624702659946e-05, + "loss": 0.9374, + "step": 4627 + }, + { + "epoch": 0.22535485598811872, + "grad_norm": 1.510591983795166, + "learning_rate": 3.612975988182007e-05, + "loss": 0.8868, + "step": 4628 + }, + { + "epoch": 0.2254035497772259, + "grad_norm": 2.7074291706085205, + "learning_rate": 3.612789465975095e-05, + "loss": 0.9918, + "step": 4629 + }, + { + "epoch": 0.22545224356633312, + "grad_norm": 1.7062251567840576, + "learning_rate": 3.6126029036498974e-05, + "loss": 0.9535, + "step": 4630 + }, + { + "epoch": 0.2255009373554403, + "grad_norm": 1.6999002695083618, + "learning_rate": 3.612416301211055e-05, + "loss": 0.9055, + "step": 4631 + }, + { + "epoch": 0.2255496311445475, + "grad_norm": 1.6510621309280396, + "learning_rate": 3.612229658663211e-05, + "loss": 0.7884, + "step": 4632 + }, + { + "epoch": 0.22559832493365473, + "grad_norm": 1.5684483051300049, + "learning_rate": 3.6120429760110074e-05, + "loss": 0.879, + "step": 4633 + }, + { + "epoch": 0.2256470187227619, + "grad_norm": 2.0888214111328125, + "learning_rate": 3.611856253259087e-05, + "loss": 0.8552, + "step": 4634 + }, + { + "epoch": 0.22569571251186912, + "grad_norm": 1.7534964084625244, + "learning_rate": 3.611669490412096e-05, + "loss": 0.9268, + "step": 4635 + }, + { + "epoch": 0.2257444063009763, + "grad_norm": 2.3449642658233643, + "learning_rate": 3.6114826874746794e-05, + "loss": 0.7798, + "step": 4636 + }, + { + "epoch": 0.22579310009008352, + "grad_norm": 2.0878047943115234, + "learning_rate": 3.611295844451484e-05, + "loss": 0.8971, + "step": 4637 + }, + { + "epoch": 0.2258417938791907, + "grad_norm": 1.5033711194992065, + "learning_rate": 3.611108961347157e-05, + "loss": 0.8268, + "step": 4638 + }, + { + "epoch": 0.22589048766829792, + "grad_norm": 1.434290885925293, + "learning_rate": 3.610922038166348e-05, + "loss": 0.8676, + "step": 4639 + }, + { + "epoch": 0.2259391814574051, + "grad_norm": 1.3789218664169312, + "learning_rate": 3.610735074913707e-05, + "loss": 0.9206, + "step": 4640 + }, + { + "epoch": 0.22598787524651232, + "grad_norm": 0.08750265091657639, + "learning_rate": 3.610548071593884e-05, + "loss": 0.6272, + "step": 4641 + }, + { + "epoch": 0.2260365690356195, + "grad_norm": 1.520607590675354, + "learning_rate": 3.61036102821153e-05, + "loss": 0.9702, + "step": 4642 + }, + { + "epoch": 0.2260852628247267, + "grad_norm": 1.5487334728240967, + "learning_rate": 3.610173944771299e-05, + "loss": 0.851, + "step": 4643 + }, + { + "epoch": 0.2261339566138339, + "grad_norm": 1.8452216386795044, + "learning_rate": 3.6099868212778446e-05, + "loss": 0.8388, + "step": 4644 + }, + { + "epoch": 0.2261826504029411, + "grad_norm": 2.155693531036377, + "learning_rate": 3.609799657735822e-05, + "loss": 0.877, + "step": 4645 + }, + { + "epoch": 0.2262313441920483, + "grad_norm": 1.5290136337280273, + "learning_rate": 3.6096124541498846e-05, + "loss": 0.9063, + "step": 4646 + }, + { + "epoch": 0.2262800379811555, + "grad_norm": 1.3064590692520142, + "learning_rate": 3.609425210524691e-05, + "loss": 0.9658, + "step": 4647 + }, + { + "epoch": 0.2263287317702627, + "grad_norm": 1.7521506547927856, + "learning_rate": 3.609237926864899e-05, + "loss": 0.9493, + "step": 4648 + }, + { + "epoch": 0.2263774255593699, + "grad_norm": 1.8510760068893433, + "learning_rate": 3.6090506031751666e-05, + "loss": 0.8602, + "step": 4649 + }, + { + "epoch": 0.2264261193484771, + "grad_norm": 1.8222259283065796, + "learning_rate": 3.608863239460154e-05, + "loss": 0.9797, + "step": 4650 + }, + { + "epoch": 0.2264748131375843, + "grad_norm": 1.4252735376358032, + "learning_rate": 3.608675835724521e-05, + "loss": 0.8812, + "step": 4651 + }, + { + "epoch": 0.2265235069266915, + "grad_norm": 1.5921751260757446, + "learning_rate": 3.6084883919729304e-05, + "loss": 0.9253, + "step": 4652 + }, + { + "epoch": 0.2265722007157987, + "grad_norm": 1.2497845888137817, + "learning_rate": 3.608300908210044e-05, + "loss": 0.8253, + "step": 4653 + }, + { + "epoch": 0.2266208945049059, + "grad_norm": 0.08656560629606247, + "learning_rate": 3.608113384440527e-05, + "loss": 0.6083, + "step": 4654 + }, + { + "epoch": 0.2266695882940131, + "grad_norm": 1.4642361402511597, + "learning_rate": 3.607925820669042e-05, + "loss": 0.8341, + "step": 4655 + }, + { + "epoch": 0.2267182820831203, + "grad_norm": 2.075528144836426, + "learning_rate": 3.607738216900256e-05, + "loss": 0.944, + "step": 4656 + }, + { + "epoch": 0.2267669758722275, + "grad_norm": 1.7677685022354126, + "learning_rate": 3.607550573138834e-05, + "loss": 0.8729, + "step": 4657 + }, + { + "epoch": 0.2268156696613347, + "grad_norm": 1.2050074338912964, + "learning_rate": 3.6073628893894465e-05, + "loss": 0.8992, + "step": 4658 + }, + { + "epoch": 0.2268643634504419, + "grad_norm": 2.0213608741760254, + "learning_rate": 3.60717516565676e-05, + "loss": 0.9911, + "step": 4659 + }, + { + "epoch": 0.2269130572395491, + "grad_norm": 1.6789032220840454, + "learning_rate": 3.606987401945444e-05, + "loss": 0.9428, + "step": 4660 + }, + { + "epoch": 0.2269617510286563, + "grad_norm": 1.88783860206604, + "learning_rate": 3.6067995982601716e-05, + "loss": 0.9566, + "step": 4661 + }, + { + "epoch": 0.2270104448177635, + "grad_norm": 1.8336213827133179, + "learning_rate": 3.606611754605611e-05, + "loss": 0.8018, + "step": 4662 + }, + { + "epoch": 0.2270591386068707, + "grad_norm": 2.03149676322937, + "learning_rate": 3.606423870986438e-05, + "loss": 0.8554, + "step": 4663 + }, + { + "epoch": 0.2271078323959779, + "grad_norm": 1.6909551620483398, + "learning_rate": 3.606235947407323e-05, + "loss": 0.8812, + "step": 4664 + }, + { + "epoch": 0.22715652618508508, + "grad_norm": 2.217754364013672, + "learning_rate": 3.606047983872944e-05, + "loss": 0.8922, + "step": 4665 + }, + { + "epoch": 0.2272052199741923, + "grad_norm": 1.8247078657150269, + "learning_rate": 3.605859980387974e-05, + "loss": 0.8524, + "step": 4666 + }, + { + "epoch": 0.22725391376329948, + "grad_norm": 1.526257038116455, + "learning_rate": 3.605671936957091e-05, + "loss": 0.8727, + "step": 4667 + }, + { + "epoch": 0.2273026075524067, + "grad_norm": 0.08865982294082642, + "learning_rate": 3.605483853584972e-05, + "loss": 0.6158, + "step": 4668 + }, + { + "epoch": 0.22735130134151388, + "grad_norm": 2.180887222290039, + "learning_rate": 3.605295730276296e-05, + "loss": 0.7556, + "step": 4669 + }, + { + "epoch": 0.2273999951306211, + "grad_norm": 1.829630732536316, + "learning_rate": 3.605107567035742e-05, + "loss": 0.8677, + "step": 4670 + }, + { + "epoch": 0.22744868891972828, + "grad_norm": 2.749272108078003, + "learning_rate": 3.6049193638679906e-05, + "loss": 0.9183, + "step": 4671 + }, + { + "epoch": 0.2274973827088355, + "grad_norm": 1.4675554037094116, + "learning_rate": 3.604731120777724e-05, + "loss": 0.9346, + "step": 4672 + }, + { + "epoch": 0.2275460764979427, + "grad_norm": 1.841580867767334, + "learning_rate": 3.604542837769625e-05, + "loss": 0.865, + "step": 4673 + }, + { + "epoch": 0.22759477028704989, + "grad_norm": 1.512270212173462, + "learning_rate": 3.604354514848376e-05, + "loss": 0.8622, + "step": 4674 + }, + { + "epoch": 0.2276434640761571, + "grad_norm": 1.6304755210876465, + "learning_rate": 3.6041661520186634e-05, + "loss": 0.9395, + "step": 4675 + }, + { + "epoch": 0.22769215786526428, + "grad_norm": 1.7709189653396606, + "learning_rate": 3.6039777492851694e-05, + "loss": 0.7726, + "step": 4676 + }, + { + "epoch": 0.2277408516543715, + "grad_norm": 1.4382578134536743, + "learning_rate": 3.603789306652584e-05, + "loss": 0.7702, + "step": 4677 + }, + { + "epoch": 0.22778954544347868, + "grad_norm": 1.3346737623214722, + "learning_rate": 3.603600824125593e-05, + "loss": 0.8254, + "step": 4678 + }, + { + "epoch": 0.2278382392325859, + "grad_norm": 2.0071117877960205, + "learning_rate": 3.603412301708885e-05, + "loss": 0.8406, + "step": 4679 + }, + { + "epoch": 0.22788693302169308, + "grad_norm": 1.853682041168213, + "learning_rate": 3.6032237394071506e-05, + "loss": 0.8574, + "step": 4680 + }, + { + "epoch": 0.2279356268108003, + "grad_norm": 1.6953608989715576, + "learning_rate": 3.603035137225079e-05, + "loss": 0.8123, + "step": 4681 + }, + { + "epoch": 0.22798432059990748, + "grad_norm": 2.56276273727417, + "learning_rate": 3.6028464951673626e-05, + "loss": 0.8301, + "step": 4682 + }, + { + "epoch": 0.2280330143890147, + "grad_norm": 1.5246186256408691, + "learning_rate": 3.602657813238693e-05, + "loss": 0.8806, + "step": 4683 + }, + { + "epoch": 0.22808170817812187, + "grad_norm": 1.6140083074569702, + "learning_rate": 3.602469091443765e-05, + "loss": 0.8545, + "step": 4684 + }, + { + "epoch": 0.22813040196722909, + "grad_norm": 2.906959056854248, + "learning_rate": 3.6022803297872714e-05, + "loss": 0.8676, + "step": 4685 + }, + { + "epoch": 0.22817909575633627, + "grad_norm": 1.7225068807601929, + "learning_rate": 3.602091528273908e-05, + "loss": 0.9107, + "step": 4686 + }, + { + "epoch": 0.22822778954544348, + "grad_norm": 2.453850269317627, + "learning_rate": 3.601902686908373e-05, + "loss": 0.8224, + "step": 4687 + }, + { + "epoch": 0.22827648333455067, + "grad_norm": 1.832035779953003, + "learning_rate": 3.601713805695362e-05, + "loss": 0.817, + "step": 4688 + }, + { + "epoch": 0.22832517712365788, + "grad_norm": 1.8592184782028198, + "learning_rate": 3.601524884639574e-05, + "loss": 0.8114, + "step": 4689 + }, + { + "epoch": 0.22837387091276506, + "grad_norm": 0.08727116882801056, + "learning_rate": 3.6013359237457085e-05, + "loss": 0.6859, + "step": 4690 + }, + { + "epoch": 0.22842256470187228, + "grad_norm": 2.577348470687866, + "learning_rate": 3.601146923018466e-05, + "loss": 0.9092, + "step": 4691 + }, + { + "epoch": 0.22847125849097946, + "grad_norm": 1.311655044555664, + "learning_rate": 3.600957882462548e-05, + "loss": 0.8536, + "step": 4692 + }, + { + "epoch": 0.22851995228008667, + "grad_norm": 1.8624045848846436, + "learning_rate": 3.600768802082656e-05, + "loss": 0.9308, + "step": 4693 + }, + { + "epoch": 0.2285686460691939, + "grad_norm": 1.0899406671524048, + "learning_rate": 3.6005796818834945e-05, + "loss": 0.8876, + "step": 4694 + }, + { + "epoch": 0.22861733985830107, + "grad_norm": 1.4551255702972412, + "learning_rate": 3.600390521869768e-05, + "loss": 0.9332, + "step": 4695 + }, + { + "epoch": 0.22866603364740828, + "grad_norm": 2.009662628173828, + "learning_rate": 3.6002013220461806e-05, + "loss": 0.9111, + "step": 4696 + }, + { + "epoch": 0.22871472743651547, + "grad_norm": 2.077397108078003, + "learning_rate": 3.60001208241744e-05, + "loss": 0.8221, + "step": 4697 + }, + { + "epoch": 0.22876342122562268, + "grad_norm": 1.5321872234344482, + "learning_rate": 3.599822802988253e-05, + "loss": 0.9414, + "step": 4698 + }, + { + "epoch": 0.22881211501472987, + "grad_norm": 1.625946044921875, + "learning_rate": 3.599633483763327e-05, + "loss": 0.9886, + "step": 4699 + }, + { + "epoch": 0.22886080880383708, + "grad_norm": 1.5556588172912598, + "learning_rate": 3.599444124747373e-05, + "loss": 0.8141, + "step": 4700 + }, + { + "epoch": 0.22890950259294426, + "grad_norm": 1.6091958284378052, + "learning_rate": 3.5992547259451e-05, + "loss": 0.9728, + "step": 4701 + }, + { + "epoch": 0.22895819638205148, + "grad_norm": 2.388606548309326, + "learning_rate": 3.599065287361221e-05, + "loss": 0.8403, + "step": 4702 + }, + { + "epoch": 0.22900689017115866, + "grad_norm": 1.493594765663147, + "learning_rate": 3.598875809000446e-05, + "loss": 0.8544, + "step": 4703 + }, + { + "epoch": 0.22905558396026587, + "grad_norm": 1.5166858434677124, + "learning_rate": 3.5986862908674905e-05, + "loss": 0.8899, + "step": 4704 + }, + { + "epoch": 0.22910427774937306, + "grad_norm": 1.1689602136611938, + "learning_rate": 3.598496732967067e-05, + "loss": 0.9359, + "step": 4705 + }, + { + "epoch": 0.22915297153848027, + "grad_norm": 2.49539852142334, + "learning_rate": 3.598307135303892e-05, + "loss": 0.8579, + "step": 4706 + }, + { + "epoch": 0.22920166532758746, + "grad_norm": 1.3764985799789429, + "learning_rate": 3.5981174978826804e-05, + "loss": 0.8317, + "step": 4707 + }, + { + "epoch": 0.22925035911669467, + "grad_norm": 2.1524345874786377, + "learning_rate": 3.5979278207081506e-05, + "loss": 0.8108, + "step": 4708 + }, + { + "epoch": 0.22929905290580185, + "grad_norm": 1.5859034061431885, + "learning_rate": 3.5977381037850216e-05, + "loss": 0.9199, + "step": 4709 + }, + { + "epoch": 0.22934774669490907, + "grad_norm": 3.696542501449585, + "learning_rate": 3.597548347118011e-05, + "loss": 1.0084, + "step": 4710 + }, + { + "epoch": 0.22939644048401625, + "grad_norm": 1.8656859397888184, + "learning_rate": 3.597358550711839e-05, + "loss": 0.8977, + "step": 4711 + }, + { + "epoch": 0.22944513427312346, + "grad_norm": 1.6826168298721313, + "learning_rate": 3.5971687145712284e-05, + "loss": 0.9347, + "step": 4712 + }, + { + "epoch": 0.22949382806223068, + "grad_norm": 1.666409969329834, + "learning_rate": 3.596978838700899e-05, + "loss": 0.9276, + "step": 4713 + }, + { + "epoch": 0.22954252185133786, + "grad_norm": 2.2121102809906006, + "learning_rate": 3.596788923105577e-05, + "loss": 0.793, + "step": 4714 + }, + { + "epoch": 0.22959121564044507, + "grad_norm": 1.5666965246200562, + "learning_rate": 3.596598967789984e-05, + "loss": 0.9083, + "step": 4715 + }, + { + "epoch": 0.22963990942955226, + "grad_norm": 1.8881006240844727, + "learning_rate": 3.596408972758847e-05, + "loss": 0.8817, + "step": 4716 + }, + { + "epoch": 0.22968860321865947, + "grad_norm": 1.9361469745635986, + "learning_rate": 3.5962189380168894e-05, + "loss": 0.9126, + "step": 4717 + }, + { + "epoch": 0.22973729700776666, + "grad_norm": 2.208406686782837, + "learning_rate": 3.596028863568842e-05, + "loss": 0.9306, + "step": 4718 + }, + { + "epoch": 0.22978599079687387, + "grad_norm": 1.9014830589294434, + "learning_rate": 3.595838749419431e-05, + "loss": 0.8771, + "step": 4719 + }, + { + "epoch": 0.22983468458598105, + "grad_norm": 1.9952986240386963, + "learning_rate": 3.5956485955733843e-05, + "loss": 0.9626, + "step": 4720 + }, + { + "epoch": 0.22988337837508827, + "grad_norm": 2.3128280639648438, + "learning_rate": 3.595458402035434e-05, + "loss": 0.8994, + "step": 4721 + }, + { + "epoch": 0.22993207216419545, + "grad_norm": 1.6622560024261475, + "learning_rate": 3.5952681688103105e-05, + "loss": 0.7974, + "step": 4722 + }, + { + "epoch": 0.22998076595330266, + "grad_norm": 1.278242588043213, + "learning_rate": 3.595077895902746e-05, + "loss": 0.8385, + "step": 4723 + }, + { + "epoch": 0.23002945974240985, + "grad_norm": 6.593469619750977, + "learning_rate": 3.594887583317474e-05, + "loss": 0.9513, + "step": 4724 + }, + { + "epoch": 0.23007815353151706, + "grad_norm": 3.825223207473755, + "learning_rate": 3.594697231059227e-05, + "loss": 0.9413, + "step": 4725 + }, + { + "epoch": 0.23012684732062424, + "grad_norm": 2.1387813091278076, + "learning_rate": 3.594506839132741e-05, + "loss": 0.8862, + "step": 4726 + }, + { + "epoch": 0.23017554110973146, + "grad_norm": 1.8409680128097534, + "learning_rate": 3.594316407542752e-05, + "loss": 0.8783, + "step": 4727 + }, + { + "epoch": 0.23022423489883864, + "grad_norm": 1.518779993057251, + "learning_rate": 3.594125936293997e-05, + "loss": 0.8396, + "step": 4728 + }, + { + "epoch": 0.23027292868794585, + "grad_norm": 2.328809976577759, + "learning_rate": 3.5939354253912144e-05, + "loss": 0.9709, + "step": 4729 + }, + { + "epoch": 0.23032162247705304, + "grad_norm": 1.9743497371673584, + "learning_rate": 3.593744874839143e-05, + "loss": 0.8834, + "step": 4730 + }, + { + "epoch": 0.23037031626616025, + "grad_norm": 1.2945410013198853, + "learning_rate": 3.5935542846425215e-05, + "loss": 0.8815, + "step": 4731 + }, + { + "epoch": 0.23041901005526744, + "grad_norm": 5.15849494934082, + "learning_rate": 3.593363654806092e-05, + "loss": 0.8756, + "step": 4732 + }, + { + "epoch": 0.23046770384437465, + "grad_norm": 1.9171640872955322, + "learning_rate": 3.593172985334597e-05, + "loss": 0.8584, + "step": 4733 + }, + { + "epoch": 0.23051639763348186, + "grad_norm": 0.08439955115318298, + "learning_rate": 3.592982276232779e-05, + "loss": 0.5852, + "step": 4734 + }, + { + "epoch": 0.23056509142258905, + "grad_norm": 0.0839647576212883, + "learning_rate": 3.592791527505381e-05, + "loss": 0.6362, + "step": 4735 + }, + { + "epoch": 0.23061378521169626, + "grad_norm": 1.259678602218628, + "learning_rate": 3.5926007391571494e-05, + "loss": 0.753, + "step": 4736 + }, + { + "epoch": 0.23066247900080344, + "grad_norm": 1.6084096431732178, + "learning_rate": 3.592409911192828e-05, + "loss": 0.8662, + "step": 4737 + }, + { + "epoch": 0.23071117278991066, + "grad_norm": 1.469208002090454, + "learning_rate": 3.592219043617165e-05, + "loss": 0.8971, + "step": 4738 + }, + { + "epoch": 0.23075986657901784, + "grad_norm": 5.433036804199219, + "learning_rate": 3.592028136434909e-05, + "loss": 0.8697, + "step": 4739 + }, + { + "epoch": 0.23080856036812505, + "grad_norm": 0.08399005234241486, + "learning_rate": 3.591837189650807e-05, + "loss": 0.6115, + "step": 4740 + }, + { + "epoch": 0.23085725415723224, + "grad_norm": 2.2139780521392822, + "learning_rate": 3.5916462032696105e-05, + "loss": 0.8933, + "step": 4741 + }, + { + "epoch": 0.23090594794633945, + "grad_norm": 1.5754637718200684, + "learning_rate": 3.591455177296069e-05, + "loss": 0.8328, + "step": 4742 + }, + { + "epoch": 0.23095464173544664, + "grad_norm": 1.7930742502212524, + "learning_rate": 3.591264111734935e-05, + "loss": 0.9145, + "step": 4743 + }, + { + "epoch": 0.23100333552455385, + "grad_norm": 1.6082308292388916, + "learning_rate": 3.5910730065909616e-05, + "loss": 0.9296, + "step": 4744 + }, + { + "epoch": 0.23105202931366103, + "grad_norm": 3.507901191711426, + "learning_rate": 3.590881861868901e-05, + "loss": 0.9593, + "step": 4745 + }, + { + "epoch": 0.23110072310276825, + "grad_norm": 1.3692359924316406, + "learning_rate": 3.5906906775735104e-05, + "loss": 0.9072, + "step": 4746 + }, + { + "epoch": 0.23114941689187543, + "grad_norm": 1.8114116191864014, + "learning_rate": 3.590499453709543e-05, + "loss": 0.8682, + "step": 4747 + }, + { + "epoch": 0.23119811068098264, + "grad_norm": 1.1154041290283203, + "learning_rate": 3.5903081902817575e-05, + "loss": 0.8364, + "step": 4748 + }, + { + "epoch": 0.23124680447008983, + "grad_norm": 0.08523135632276535, + "learning_rate": 3.5901168872949106e-05, + "loss": 0.5322, + "step": 4749 + }, + { + "epoch": 0.23129549825919704, + "grad_norm": 1.8556667566299438, + "learning_rate": 3.589925544753762e-05, + "loss": 0.9891, + "step": 4750 + }, + { + "epoch": 0.23134419204830423, + "grad_norm": 2.3147404193878174, + "learning_rate": 3.589734162663068e-05, + "loss": 0.9284, + "step": 4751 + }, + { + "epoch": 0.23139288583741144, + "grad_norm": 2.601395606994629, + "learning_rate": 3.589542741027594e-05, + "loss": 0.8029, + "step": 4752 + }, + { + "epoch": 0.23144157962651865, + "grad_norm": 1.6574326753616333, + "learning_rate": 3.589351279852099e-05, + "loss": 0.8791, + "step": 4753 + }, + { + "epoch": 0.23149027341562584, + "grad_norm": 1.7475886344909668, + "learning_rate": 3.5891597791413456e-05, + "loss": 0.894, + "step": 4754 + }, + { + "epoch": 0.23153896720473305, + "grad_norm": 1.434953212738037, + "learning_rate": 3.588968238900098e-05, + "loss": 0.8785, + "step": 4755 + }, + { + "epoch": 0.23158766099384023, + "grad_norm": 5.659756183624268, + "learning_rate": 3.58877665913312e-05, + "loss": 0.8912, + "step": 4756 + }, + { + "epoch": 0.23163635478294745, + "grad_norm": 2.628997564315796, + "learning_rate": 3.5885850398451784e-05, + "loss": 0.9496, + "step": 4757 + }, + { + "epoch": 0.23168504857205463, + "grad_norm": 1.6332374811172485, + "learning_rate": 3.588393381041039e-05, + "loss": 0.8684, + "step": 4758 + }, + { + "epoch": 0.23173374236116184, + "grad_norm": 1.2888818979263306, + "learning_rate": 3.588201682725469e-05, + "loss": 0.8938, + "step": 4759 + }, + { + "epoch": 0.23178243615026903, + "grad_norm": 1.9843313694000244, + "learning_rate": 3.588009944903238e-05, + "loss": 0.8863, + "step": 4760 + }, + { + "epoch": 0.23183112993937624, + "grad_norm": 2.401379346847534, + "learning_rate": 3.587818167579114e-05, + "loss": 0.8545, + "step": 4761 + }, + { + "epoch": 0.23187982372848343, + "grad_norm": 2.0488619804382324, + "learning_rate": 3.587626350757869e-05, + "loss": 0.8629, + "step": 4762 + }, + { + "epoch": 0.23192851751759064, + "grad_norm": 1.6917304992675781, + "learning_rate": 3.5874344944442736e-05, + "loss": 0.8756, + "step": 4763 + }, + { + "epoch": 0.23197721130669782, + "grad_norm": 1.5978848934173584, + "learning_rate": 3.5872425986431e-05, + "loss": 0.903, + "step": 4764 + }, + { + "epoch": 0.23202590509580504, + "grad_norm": 1.575252652168274, + "learning_rate": 3.5870506633591234e-05, + "loss": 0.8598, + "step": 4765 + }, + { + "epoch": 0.23207459888491222, + "grad_norm": 2.7182352542877197, + "learning_rate": 3.586858688597115e-05, + "loss": 0.8502, + "step": 4766 + }, + { + "epoch": 0.23212329267401943, + "grad_norm": 2.8254647254943848, + "learning_rate": 3.586666674361853e-05, + "loss": 0.9357, + "step": 4767 + }, + { + "epoch": 0.23217198646312662, + "grad_norm": 3.072267770767212, + "learning_rate": 3.586474620658113e-05, + "loss": 0.891, + "step": 4768 + }, + { + "epoch": 0.23222068025223383, + "grad_norm": 1.5385558605194092, + "learning_rate": 3.586282527490672e-05, + "loss": 0.8832, + "step": 4769 + }, + { + "epoch": 0.23226937404134101, + "grad_norm": 1.5235724449157715, + "learning_rate": 3.586090394864308e-05, + "loss": 0.864, + "step": 4770 + }, + { + "epoch": 0.23231806783044823, + "grad_norm": 1.5280897617340088, + "learning_rate": 3.585898222783802e-05, + "loss": 0.9141, + "step": 4771 + }, + { + "epoch": 0.2323667616195554, + "grad_norm": 1.9174152612686157, + "learning_rate": 3.585706011253932e-05, + "loss": 0.9643, + "step": 4772 + }, + { + "epoch": 0.23241545540866262, + "grad_norm": 1.7384134531021118, + "learning_rate": 3.58551376027948e-05, + "loss": 0.894, + "step": 4773 + }, + { + "epoch": 0.23246414919776984, + "grad_norm": 1.9332562685012817, + "learning_rate": 3.58532146986523e-05, + "loss": 0.897, + "step": 4774 + }, + { + "epoch": 0.23251284298687702, + "grad_norm": 0.09213743358850479, + "learning_rate": 3.585129140015963e-05, + "loss": 0.629, + "step": 4775 + }, + { + "epoch": 0.23256153677598423, + "grad_norm": 1.695266604423523, + "learning_rate": 3.584936770736465e-05, + "loss": 0.7463, + "step": 4776 + }, + { + "epoch": 0.23261023056509142, + "grad_norm": 2.3232369422912598, + "learning_rate": 3.5847443620315197e-05, + "loss": 1.027, + "step": 4777 + }, + { + "epoch": 0.23265892435419863, + "grad_norm": 2.358128309249878, + "learning_rate": 3.584551913905913e-05, + "loss": 0.8715, + "step": 4778 + }, + { + "epoch": 0.23270761814330582, + "grad_norm": 1.8389488458633423, + "learning_rate": 3.5843594263644345e-05, + "loss": 0.9031, + "step": 4779 + }, + { + "epoch": 0.23275631193241303, + "grad_norm": 1.402652382850647, + "learning_rate": 3.58416689941187e-05, + "loss": 0.8936, + "step": 4780 + }, + { + "epoch": 0.23280500572152021, + "grad_norm": 2.2683918476104736, + "learning_rate": 3.58397433305301e-05, + "loss": 0.8743, + "step": 4781 + }, + { + "epoch": 0.23285369951062743, + "grad_norm": 1.3353006839752197, + "learning_rate": 3.5837817272926434e-05, + "loss": 0.8102, + "step": 4782 + }, + { + "epoch": 0.2329023932997346, + "grad_norm": 2.045799732208252, + "learning_rate": 3.5835890821355624e-05, + "loss": 0.8719, + "step": 4783 + }, + { + "epoch": 0.23295108708884182, + "grad_norm": 1.2810636758804321, + "learning_rate": 3.583396397586558e-05, + "loss": 0.8962, + "step": 4784 + }, + { + "epoch": 0.232999780877949, + "grad_norm": 1.743659257888794, + "learning_rate": 3.5832036736504245e-05, + "loss": 0.8648, + "step": 4785 + }, + { + "epoch": 0.23304847466705622, + "grad_norm": 2.2404890060424805, + "learning_rate": 3.5830109103319546e-05, + "loss": 0.9154, + "step": 4786 + }, + { + "epoch": 0.2330971684561634, + "grad_norm": 1.4443395137786865, + "learning_rate": 3.582818107635944e-05, + "loss": 0.9431, + "step": 4787 + }, + { + "epoch": 0.23314586224527062, + "grad_norm": 2.855459451675415, + "learning_rate": 3.582625265567189e-05, + "loss": 0.9055, + "step": 4788 + }, + { + "epoch": 0.2331945560343778, + "grad_norm": 1.6018401384353638, + "learning_rate": 3.582432384130487e-05, + "loss": 0.8468, + "step": 4789 + }, + { + "epoch": 0.23324324982348502, + "grad_norm": 1.9331802129745483, + "learning_rate": 3.582239463330634e-05, + "loss": 0.7865, + "step": 4790 + }, + { + "epoch": 0.2332919436125922, + "grad_norm": 2.6842522621154785, + "learning_rate": 3.58204650317243e-05, + "loss": 0.8597, + "step": 4791 + }, + { + "epoch": 0.2333406374016994, + "grad_norm": 1.7440751791000366, + "learning_rate": 3.581853503660676e-05, + "loss": 0.9041, + "step": 4792 + }, + { + "epoch": 0.23338933119080663, + "grad_norm": 1.424506425857544, + "learning_rate": 3.5816604648001715e-05, + "loss": 0.8682, + "step": 4793 + }, + { + "epoch": 0.2334380249799138, + "grad_norm": 3.6939666271209717, + "learning_rate": 3.581467386595718e-05, + "loss": 0.9103, + "step": 4794 + }, + { + "epoch": 0.23348671876902102, + "grad_norm": 1.9547196626663208, + "learning_rate": 3.58127426905212e-05, + "loss": 0.8965, + "step": 4795 + }, + { + "epoch": 0.2335354125581282, + "grad_norm": 1.7998913526535034, + "learning_rate": 3.58108111217418e-05, + "loss": 0.8176, + "step": 4796 + }, + { + "epoch": 0.23358410634723542, + "grad_norm": 2.052396297454834, + "learning_rate": 3.5808879159667037e-05, + "loss": 0.8698, + "step": 4797 + }, + { + "epoch": 0.2336328001363426, + "grad_norm": 1.4127893447875977, + "learning_rate": 3.580694680434495e-05, + "loss": 0.8682, + "step": 4798 + }, + { + "epoch": 0.23368149392544982, + "grad_norm": 2.6128830909729004, + "learning_rate": 3.580501405582364e-05, + "loss": 0.9154, + "step": 4799 + }, + { + "epoch": 0.233730187714557, + "grad_norm": 1.5097804069519043, + "learning_rate": 3.580308091415114e-05, + "loss": 0.7553, + "step": 4800 + }, + { + "epoch": 0.23377888150366422, + "grad_norm": 1.4880337715148926, + "learning_rate": 3.580114737937558e-05, + "loss": 0.9675, + "step": 4801 + }, + { + "epoch": 0.2338275752927714, + "grad_norm": 3.4354727268218994, + "learning_rate": 3.5799213451545034e-05, + "loss": 0.9054, + "step": 4802 + }, + { + "epoch": 0.2338762690818786, + "grad_norm": 1.408486247062683, + "learning_rate": 3.5797279130707605e-05, + "loss": 0.8848, + "step": 4803 + }, + { + "epoch": 0.2339249628709858, + "grad_norm": 2.1660702228546143, + "learning_rate": 3.5795344416911426e-05, + "loss": 0.8557, + "step": 4804 + }, + { + "epoch": 0.233973656660093, + "grad_norm": 4.566685676574707, + "learning_rate": 3.5793409310204614e-05, + "loss": 0.8987, + "step": 4805 + }, + { + "epoch": 0.2340223504492002, + "grad_norm": 1.8770511150360107, + "learning_rate": 3.57914738106353e-05, + "loss": 0.8968, + "step": 4806 + }, + { + "epoch": 0.2340710442383074, + "grad_norm": 1.6124825477600098, + "learning_rate": 3.578953791825164e-05, + "loss": 0.959, + "step": 4807 + }, + { + "epoch": 0.2341197380274146, + "grad_norm": 5.113442420959473, + "learning_rate": 3.5787601633101785e-05, + "loss": 0.9061, + "step": 4808 + }, + { + "epoch": 0.2341684318165218, + "grad_norm": 2.177797555923462, + "learning_rate": 3.57856649552339e-05, + "loss": 0.8139, + "step": 4809 + }, + { + "epoch": 0.234217125605629, + "grad_norm": 3.3044214248657227, + "learning_rate": 3.578372788469616e-05, + "loss": 0.8349, + "step": 4810 + }, + { + "epoch": 0.2342658193947362, + "grad_norm": 1.6680400371551514, + "learning_rate": 3.5781790421536744e-05, + "loss": 0.9578, + "step": 4811 + }, + { + "epoch": 0.2343145131838434, + "grad_norm": 1.856634497642517, + "learning_rate": 3.5779852565803855e-05, + "loss": 0.9251, + "step": 4812 + }, + { + "epoch": 0.2343632069729506, + "grad_norm": 1.5930171012878418, + "learning_rate": 3.57779143175457e-05, + "loss": 0.95, + "step": 4813 + }, + { + "epoch": 0.2344119007620578, + "grad_norm": 2.5486912727355957, + "learning_rate": 3.5775975676810485e-05, + "loss": 0.9384, + "step": 4814 + }, + { + "epoch": 0.234460594551165, + "grad_norm": 2.075803279876709, + "learning_rate": 3.577403664364643e-05, + "loss": 0.855, + "step": 4815 + }, + { + "epoch": 0.2345092883402722, + "grad_norm": 9.406580924987793, + "learning_rate": 3.577209721810178e-05, + "loss": 0.9228, + "step": 4816 + }, + { + "epoch": 0.2345579821293794, + "grad_norm": 1.8590824604034424, + "learning_rate": 3.5770157400224776e-05, + "loss": 0.955, + "step": 4817 + }, + { + "epoch": 0.2346066759184866, + "grad_norm": 2.7213821411132812, + "learning_rate": 3.5768217190063664e-05, + "loss": 0.9156, + "step": 4818 + }, + { + "epoch": 0.2346553697075938, + "grad_norm": 1.3981095552444458, + "learning_rate": 3.576627658766672e-05, + "loss": 0.9167, + "step": 4819 + }, + { + "epoch": 0.234704063496701, + "grad_norm": 1.9649964570999146, + "learning_rate": 3.5764335593082205e-05, + "loss": 0.811, + "step": 4820 + }, + { + "epoch": 0.2347527572858082, + "grad_norm": 2.103376865386963, + "learning_rate": 3.57623942063584e-05, + "loss": 0.8415, + "step": 4821 + }, + { + "epoch": 0.2348014510749154, + "grad_norm": 2.0804879665374756, + "learning_rate": 3.576045242754361e-05, + "loss": 0.9696, + "step": 4822 + }, + { + "epoch": 0.2348501448640226, + "grad_norm": 9.373592376708984, + "learning_rate": 3.575851025668612e-05, + "loss": 0.9862, + "step": 4823 + }, + { + "epoch": 0.2348988386531298, + "grad_norm": 1.5964056253433228, + "learning_rate": 3.575656769383425e-05, + "loss": 0.8446, + "step": 4824 + }, + { + "epoch": 0.23494753244223698, + "grad_norm": 1.6761033535003662, + "learning_rate": 3.575462473903633e-05, + "loss": 0.8579, + "step": 4825 + }, + { + "epoch": 0.2349962262313442, + "grad_norm": 1.631098985671997, + "learning_rate": 3.5752681392340676e-05, + "loss": 0.8119, + "step": 4826 + }, + { + "epoch": 0.23504492002045138, + "grad_norm": 2.2290804386138916, + "learning_rate": 3.575073765379564e-05, + "loss": 0.8267, + "step": 4827 + }, + { + "epoch": 0.2350936138095586, + "grad_norm": 1.6149508953094482, + "learning_rate": 3.574879352344957e-05, + "loss": 0.8953, + "step": 4828 + }, + { + "epoch": 0.23514230759866578, + "grad_norm": 1.7424107789993286, + "learning_rate": 3.574684900135083e-05, + "loss": 0.9281, + "step": 4829 + }, + { + "epoch": 0.235191001387773, + "grad_norm": 1.3664065599441528, + "learning_rate": 3.5744904087547775e-05, + "loss": 0.9124, + "step": 4830 + }, + { + "epoch": 0.23523969517688018, + "grad_norm": 2.621995210647583, + "learning_rate": 3.5742958782088795e-05, + "loss": 0.8192, + "step": 4831 + }, + { + "epoch": 0.2352883889659874, + "grad_norm": 1.6100192070007324, + "learning_rate": 3.5741013085022285e-05, + "loss": 0.8434, + "step": 4832 + }, + { + "epoch": 0.2353370827550946, + "grad_norm": 3.0415031909942627, + "learning_rate": 3.573906699639664e-05, + "loss": 0.8791, + "step": 4833 + }, + { + "epoch": 0.23538577654420179, + "grad_norm": 2.1114799976348877, + "learning_rate": 3.573712051626027e-05, + "loss": 0.8628, + "step": 4834 + }, + { + "epoch": 0.235434470333309, + "grad_norm": 5.138707160949707, + "learning_rate": 3.5735173644661586e-05, + "loss": 0.9674, + "step": 4835 + }, + { + "epoch": 0.23548316412241618, + "grad_norm": 3.617619276046753, + "learning_rate": 3.573322638164903e-05, + "loss": 0.8786, + "step": 4836 + }, + { + "epoch": 0.2355318579115234, + "grad_norm": 2.110171318054199, + "learning_rate": 3.573127872727103e-05, + "loss": 0.9247, + "step": 4837 + }, + { + "epoch": 0.23558055170063058, + "grad_norm": 1.6928825378417969, + "learning_rate": 3.572933068157603e-05, + "loss": 0.912, + "step": 4838 + }, + { + "epoch": 0.2356292454897378, + "grad_norm": 1.8037396669387817, + "learning_rate": 3.57273822446125e-05, + "loss": 0.8581, + "step": 4839 + }, + { + "epoch": 0.23567793927884498, + "grad_norm": 4.126638412475586, + "learning_rate": 3.572543341642891e-05, + "loss": 0.797, + "step": 4840 + }, + { + "epoch": 0.2357266330679522, + "grad_norm": 1.8872430324554443, + "learning_rate": 3.572348419707372e-05, + "loss": 0.9307, + "step": 4841 + }, + { + "epoch": 0.23577532685705938, + "grad_norm": 3.1186201572418213, + "learning_rate": 3.5721534586595435e-05, + "loss": 0.8734, + "step": 4842 + }, + { + "epoch": 0.2358240206461666, + "grad_norm": 1.5029959678649902, + "learning_rate": 3.571958458504254e-05, + "loss": 0.9895, + "step": 4843 + }, + { + "epoch": 0.23587271443527377, + "grad_norm": 1.5418835878372192, + "learning_rate": 3.571763419246354e-05, + "loss": 0.8389, + "step": 4844 + }, + { + "epoch": 0.23592140822438099, + "grad_norm": 1.9475315809249878, + "learning_rate": 3.5715683408906966e-05, + "loss": 0.943, + "step": 4845 + }, + { + "epoch": 0.23597010201348817, + "grad_norm": 2.2072689533233643, + "learning_rate": 3.571373223442133e-05, + "loss": 0.9388, + "step": 4846 + }, + { + "epoch": 0.23601879580259538, + "grad_norm": 1.7484956979751587, + "learning_rate": 3.5711780669055175e-05, + "loss": 0.9267, + "step": 4847 + }, + { + "epoch": 0.23606748959170257, + "grad_norm": 0.07785547524690628, + "learning_rate": 3.570982871285704e-05, + "loss": 0.514, + "step": 4848 + }, + { + "epoch": 0.23611618338080978, + "grad_norm": 1.6962753534317017, + "learning_rate": 3.570787636587548e-05, + "loss": 0.8183, + "step": 4849 + }, + { + "epoch": 0.23616487716991696, + "grad_norm": 1.5969672203063965, + "learning_rate": 3.570592362815907e-05, + "loss": 0.8894, + "step": 4850 + }, + { + "epoch": 0.23621357095902418, + "grad_norm": 1.8577914237976074, + "learning_rate": 3.570397049975637e-05, + "loss": 0.8738, + "step": 4851 + }, + { + "epoch": 0.23626226474813136, + "grad_norm": 1.3987029790878296, + "learning_rate": 3.570201698071598e-05, + "loss": 0.9156, + "step": 4852 + }, + { + "epoch": 0.23631095853723857, + "grad_norm": 1.9318325519561768, + "learning_rate": 3.570006307108648e-05, + "loss": 0.8699, + "step": 4853 + }, + { + "epoch": 0.2363596523263458, + "grad_norm": 1.738493800163269, + "learning_rate": 3.5698108770916484e-05, + "loss": 0.7684, + "step": 4854 + }, + { + "epoch": 0.23640834611545297, + "grad_norm": 2.009685754776001, + "learning_rate": 3.5696154080254596e-05, + "loss": 0.9075, + "step": 4855 + }, + { + "epoch": 0.23645703990456018, + "grad_norm": 1.6244299411773682, + "learning_rate": 3.569419899914945e-05, + "loss": 0.9447, + "step": 4856 + }, + { + "epoch": 0.23650573369366737, + "grad_norm": 1.6734215021133423, + "learning_rate": 3.569224352764967e-05, + "loss": 0.7761, + "step": 4857 + }, + { + "epoch": 0.23655442748277458, + "grad_norm": 1.8561137914657593, + "learning_rate": 3.56902876658039e-05, + "loss": 0.9, + "step": 4858 + }, + { + "epoch": 0.23660312127188177, + "grad_norm": 1.5220032930374146, + "learning_rate": 3.56883314136608e-05, + "loss": 0.7854, + "step": 4859 + }, + { + "epoch": 0.23665181506098898, + "grad_norm": 1.459359884262085, + "learning_rate": 3.5686374771269015e-05, + "loss": 0.8643, + "step": 4860 + }, + { + "epoch": 0.23670050885009616, + "grad_norm": 2.0358169078826904, + "learning_rate": 3.568441773867724e-05, + "loss": 0.9201, + "step": 4861 + }, + { + "epoch": 0.23674920263920338, + "grad_norm": 1.72407865524292, + "learning_rate": 3.5682460315934134e-05, + "loss": 0.8233, + "step": 4862 + }, + { + "epoch": 0.23679789642831056, + "grad_norm": 1.8055596351623535, + "learning_rate": 3.56805025030884e-05, + "loss": 0.8401, + "step": 4863 + }, + { + "epoch": 0.23684659021741777, + "grad_norm": 1.438356876373291, + "learning_rate": 3.5678544300188735e-05, + "loss": 0.9494, + "step": 4864 + }, + { + "epoch": 0.23689528400652496, + "grad_norm": 2.0057153701782227, + "learning_rate": 3.567658570728386e-05, + "loss": 0.8902, + "step": 4865 + }, + { + "epoch": 0.23694397779563217, + "grad_norm": 2.615795135498047, + "learning_rate": 3.567462672442248e-05, + "loss": 0.8847, + "step": 4866 + }, + { + "epoch": 0.23699267158473936, + "grad_norm": 1.5442924499511719, + "learning_rate": 3.567266735165332e-05, + "loss": 0.8049, + "step": 4867 + }, + { + "epoch": 0.23704136537384657, + "grad_norm": 1.4322975873947144, + "learning_rate": 3.567070758902515e-05, + "loss": 0.8659, + "step": 4868 + }, + { + "epoch": 0.23709005916295375, + "grad_norm": 1.463891625404358, + "learning_rate": 3.5668747436586695e-05, + "loss": 0.8916, + "step": 4869 + }, + { + "epoch": 0.23713875295206097, + "grad_norm": 1.2395954132080078, + "learning_rate": 3.5666786894386714e-05, + "loss": 0.8191, + "step": 4870 + }, + { + "epoch": 0.23718744674116815, + "grad_norm": 1.7899677753448486, + "learning_rate": 3.566482596247398e-05, + "loss": 0.8851, + "step": 4871 + }, + { + "epoch": 0.23723614053027536, + "grad_norm": 1.6462531089782715, + "learning_rate": 3.5662864640897276e-05, + "loss": 0.9877, + "step": 4872 + }, + { + "epoch": 0.23728483431938258, + "grad_norm": 2.20424485206604, + "learning_rate": 3.566090292970538e-05, + "loss": 0.8881, + "step": 4873 + }, + { + "epoch": 0.23733352810848976, + "grad_norm": 1.5640156269073486, + "learning_rate": 3.56589408289471e-05, + "loss": 0.9835, + "step": 4874 + }, + { + "epoch": 0.23738222189759697, + "grad_norm": 1.337203860282898, + "learning_rate": 3.565697833867124e-05, + "loss": 0.85, + "step": 4875 + }, + { + "epoch": 0.23743091568670416, + "grad_norm": 1.5114188194274902, + "learning_rate": 3.5655015458926616e-05, + "loss": 0.9813, + "step": 4876 + }, + { + "epoch": 0.23747960947581137, + "grad_norm": 1.9434791803359985, + "learning_rate": 3.565305218976205e-05, + "loss": 0.9411, + "step": 4877 + }, + { + "epoch": 0.23752830326491856, + "grad_norm": 1.6396393775939941, + "learning_rate": 3.565108853122639e-05, + "loss": 0.9264, + "step": 4878 + }, + { + "epoch": 0.23757699705402577, + "grad_norm": 1.4358805418014526, + "learning_rate": 3.5649124483368475e-05, + "loss": 0.9196, + "step": 4879 + }, + { + "epoch": 0.23762569084313295, + "grad_norm": 1.8389525413513184, + "learning_rate": 3.5647160046237154e-05, + "loss": 0.8934, + "step": 4880 + }, + { + "epoch": 0.23767438463224017, + "grad_norm": 4.719482898712158, + "learning_rate": 3.564519521988131e-05, + "loss": 0.8409, + "step": 4881 + }, + { + "epoch": 0.23772307842134735, + "grad_norm": 1.4566092491149902, + "learning_rate": 3.5643230004349805e-05, + "loss": 0.9229, + "step": 4882 + }, + { + "epoch": 0.23777177221045456, + "grad_norm": 2.3000543117523193, + "learning_rate": 3.5641264399691526e-05, + "loss": 0.8178, + "step": 4883 + }, + { + "epoch": 0.23782046599956175, + "grad_norm": 1.6700382232666016, + "learning_rate": 3.563929840595537e-05, + "loss": 1.0188, + "step": 4884 + }, + { + "epoch": 0.23786915978866896, + "grad_norm": 0.08105674386024475, + "learning_rate": 3.5637332023190244e-05, + "loss": 0.6106, + "step": 4885 + }, + { + "epoch": 0.23791785357777614, + "grad_norm": 1.5723997354507446, + "learning_rate": 3.563536525144505e-05, + "loss": 0.8788, + "step": 4886 + }, + { + "epoch": 0.23796654736688336, + "grad_norm": 1.754774808883667, + "learning_rate": 3.5633398090768726e-05, + "loss": 0.8711, + "step": 4887 + }, + { + "epoch": 0.23801524115599054, + "grad_norm": 1.7021819353103638, + "learning_rate": 3.56314305412102e-05, + "loss": 0.9662, + "step": 4888 + }, + { + "epoch": 0.23806393494509775, + "grad_norm": 1.2655034065246582, + "learning_rate": 3.562946260281841e-05, + "loss": 0.9472, + "step": 4889 + }, + { + "epoch": 0.23811262873420494, + "grad_norm": 2.6478171348571777, + "learning_rate": 3.562749427564231e-05, + "loss": 0.9028, + "step": 4890 + }, + { + "epoch": 0.23816132252331215, + "grad_norm": 1.5380712747573853, + "learning_rate": 3.562552555973087e-05, + "loss": 0.8659, + "step": 4891 + }, + { + "epoch": 0.23821001631241934, + "grad_norm": 1.5762004852294922, + "learning_rate": 3.562355645513306e-05, + "loss": 0.8991, + "step": 4892 + }, + { + "epoch": 0.23825871010152655, + "grad_norm": 1.9045133590698242, + "learning_rate": 3.562158696189785e-05, + "loss": 0.731, + "step": 4893 + }, + { + "epoch": 0.23830740389063376, + "grad_norm": 4.217004776000977, + "learning_rate": 3.5619617080074246e-05, + "loss": 0.7632, + "step": 4894 + }, + { + "epoch": 0.23835609767974095, + "grad_norm": 1.594035029411316, + "learning_rate": 3.561764680971124e-05, + "loss": 0.7814, + "step": 4895 + }, + { + "epoch": 0.23840479146884816, + "grad_norm": 1.8688359260559082, + "learning_rate": 3.5615676150857855e-05, + "loss": 0.8029, + "step": 4896 + }, + { + "epoch": 0.23845348525795534, + "grad_norm": 1.58407461643219, + "learning_rate": 3.561370510356309e-05, + "loss": 0.801, + "step": 4897 + }, + { + "epoch": 0.23850217904706256, + "grad_norm": 2.6645596027374268, + "learning_rate": 3.561173366787599e-05, + "loss": 0.9246, + "step": 4898 + }, + { + "epoch": 0.23855087283616974, + "grad_norm": 1.6094694137573242, + "learning_rate": 3.5609761843845595e-05, + "loss": 0.8951, + "step": 4899 + }, + { + "epoch": 0.23859956662527695, + "grad_norm": 1.5551440715789795, + "learning_rate": 3.560778963152094e-05, + "loss": 0.7916, + "step": 4900 + }, + { + "epoch": 0.23864826041438414, + "grad_norm": 3.3021552562713623, + "learning_rate": 3.5605817030951116e-05, + "loss": 0.7826, + "step": 4901 + }, + { + "epoch": 0.23869695420349135, + "grad_norm": 0.08670466393232346, + "learning_rate": 3.5603844042185154e-05, + "loss": 0.6142, + "step": 4902 + }, + { + "epoch": 0.23874564799259854, + "grad_norm": 1.573188066482544, + "learning_rate": 3.560187066527215e-05, + "loss": 0.9071, + "step": 4903 + }, + { + "epoch": 0.23879434178170575, + "grad_norm": 1.8017780780792236, + "learning_rate": 3.55998969002612e-05, + "loss": 1.0282, + "step": 4904 + }, + { + "epoch": 0.23884303557081293, + "grad_norm": 3.270169496536255, + "learning_rate": 3.559792274720138e-05, + "loss": 0.8885, + "step": 4905 + }, + { + "epoch": 0.23889172935992015, + "grad_norm": 3.257910966873169, + "learning_rate": 3.559594820614182e-05, + "loss": 0.859, + "step": 4906 + }, + { + "epoch": 0.23894042314902733, + "grad_norm": 2.582942008972168, + "learning_rate": 3.5593973277131614e-05, + "loss": 0.8783, + "step": 4907 + }, + { + "epoch": 0.23898911693813454, + "grad_norm": 1.648250937461853, + "learning_rate": 3.559199796021991e-05, + "loss": 0.9582, + "step": 4908 + }, + { + "epoch": 0.23903781072724173, + "grad_norm": 1.9585825204849243, + "learning_rate": 3.5590022255455836e-05, + "loss": 0.8299, + "step": 4909 + }, + { + "epoch": 0.23908650451634894, + "grad_norm": 1.5057895183563232, + "learning_rate": 3.558804616288853e-05, + "loss": 0.8675, + "step": 4910 + }, + { + "epoch": 0.23913519830545613, + "grad_norm": 1.525079369544983, + "learning_rate": 3.558606968256716e-05, + "loss": 0.9596, + "step": 4911 + }, + { + "epoch": 0.23918389209456334, + "grad_norm": 1.6631721258163452, + "learning_rate": 3.558409281454089e-05, + "loss": 0.8255, + "step": 4912 + }, + { + "epoch": 0.23923258588367055, + "grad_norm": 1.4737622737884521, + "learning_rate": 3.558211555885889e-05, + "loss": 0.9224, + "step": 4913 + }, + { + "epoch": 0.23928127967277774, + "grad_norm": 1.5131160020828247, + "learning_rate": 3.5580137915570337e-05, + "loss": 0.8429, + "step": 4914 + }, + { + "epoch": 0.23932997346188495, + "grad_norm": 1.4323188066482544, + "learning_rate": 3.5578159884724435e-05, + "loss": 0.8516, + "step": 4915 + }, + { + "epoch": 0.23937866725099213, + "grad_norm": 2.038001537322998, + "learning_rate": 3.5576181466370386e-05, + "loss": 0.8751, + "step": 4916 + }, + { + "epoch": 0.23942736104009935, + "grad_norm": 4.844418048858643, + "learning_rate": 3.55742026605574e-05, + "loss": 0.9011, + "step": 4917 + }, + { + "epoch": 0.23947605482920653, + "grad_norm": 1.612578272819519, + "learning_rate": 3.557222346733471e-05, + "loss": 0.9625, + "step": 4918 + }, + { + "epoch": 0.23952474861831374, + "grad_norm": 1.803871750831604, + "learning_rate": 3.557024388675153e-05, + "loss": 0.8833, + "step": 4919 + }, + { + "epoch": 0.23957344240742093, + "grad_norm": 1.6451194286346436, + "learning_rate": 3.556826391885713e-05, + "loss": 0.8024, + "step": 4920 + }, + { + "epoch": 0.23962213619652814, + "grad_norm": 1.931444764137268, + "learning_rate": 3.556628356370073e-05, + "loss": 0.8465, + "step": 4921 + }, + { + "epoch": 0.23967082998563533, + "grad_norm": 1.9980823993682861, + "learning_rate": 3.5564302821331616e-05, + "loss": 0.8909, + "step": 4922 + }, + { + "epoch": 0.23971952377474254, + "grad_norm": 1.5705573558807373, + "learning_rate": 3.556232169179905e-05, + "loss": 0.8972, + "step": 4923 + }, + { + "epoch": 0.23976821756384972, + "grad_norm": 2.2376976013183594, + "learning_rate": 3.556034017515231e-05, + "loss": 0.892, + "step": 4924 + }, + { + "epoch": 0.23981691135295694, + "grad_norm": 1.8514227867126465, + "learning_rate": 3.5558358271440687e-05, + "loss": 0.8144, + "step": 4925 + }, + { + "epoch": 0.23986560514206412, + "grad_norm": 1.4202319383621216, + "learning_rate": 3.5556375980713485e-05, + "loss": 0.8816, + "step": 4926 + }, + { + "epoch": 0.23991429893117133, + "grad_norm": 1.4181240797042847, + "learning_rate": 3.555439330302002e-05, + "loss": 0.8588, + "step": 4927 + }, + { + "epoch": 0.23996299272027852, + "grad_norm": 1.5535969734191895, + "learning_rate": 3.5552410238409596e-05, + "loss": 0.9133, + "step": 4928 + }, + { + "epoch": 0.24001168650938573, + "grad_norm": 1.476812720298767, + "learning_rate": 3.5550426786931544e-05, + "loss": 0.9551, + "step": 4929 + }, + { + "epoch": 0.24006038029849291, + "grad_norm": 1.9043498039245605, + "learning_rate": 3.5548442948635215e-05, + "loss": 0.9214, + "step": 4930 + }, + { + "epoch": 0.24010907408760013, + "grad_norm": 0.0884600356221199, + "learning_rate": 3.5546458723569954e-05, + "loss": 0.639, + "step": 4931 + }, + { + "epoch": 0.24015776787670734, + "grad_norm": 1.406969666481018, + "learning_rate": 3.554447411178511e-05, + "loss": 0.8635, + "step": 4932 + }, + { + "epoch": 0.24020646166581452, + "grad_norm": 1.4984627962112427, + "learning_rate": 3.554248911333005e-05, + "loss": 0.9149, + "step": 4933 + }, + { + "epoch": 0.24025515545492174, + "grad_norm": 1.3849482536315918, + "learning_rate": 3.5540503728254165e-05, + "loss": 0.9268, + "step": 4934 + }, + { + "epoch": 0.24030384924402892, + "grad_norm": 2.1162235736846924, + "learning_rate": 3.553851795660683e-05, + "loss": 0.8438, + "step": 4935 + }, + { + "epoch": 0.24035254303313613, + "grad_norm": 1.3138412237167358, + "learning_rate": 3.5536531798437446e-05, + "loss": 0.9233, + "step": 4936 + }, + { + "epoch": 0.24040123682224332, + "grad_norm": 0.08503580838441849, + "learning_rate": 3.553454525379541e-05, + "loss": 0.6138, + "step": 4937 + }, + { + "epoch": 0.24044993061135053, + "grad_norm": 2.393826723098755, + "learning_rate": 3.553255832273015e-05, + "loss": 0.8857, + "step": 4938 + }, + { + "epoch": 0.24049862440045772, + "grad_norm": 1.2398625612258911, + "learning_rate": 3.5530571005291085e-05, + "loss": 0.8518, + "step": 4939 + }, + { + "epoch": 0.24054731818956493, + "grad_norm": 1.8513973951339722, + "learning_rate": 3.5528583301527647e-05, + "loss": 0.8808, + "step": 4940 + }, + { + "epoch": 0.24059601197867211, + "grad_norm": 0.08483439683914185, + "learning_rate": 3.5526595211489286e-05, + "loss": 0.5941, + "step": 4941 + }, + { + "epoch": 0.24064470576777933, + "grad_norm": 1.7295658588409424, + "learning_rate": 3.552460673522546e-05, + "loss": 0.9526, + "step": 4942 + }, + { + "epoch": 0.2406933995568865, + "grad_norm": 1.782311201095581, + "learning_rate": 3.552261787278561e-05, + "loss": 0.8512, + "step": 4943 + }, + { + "epoch": 0.24074209334599372, + "grad_norm": 2.0458226203918457, + "learning_rate": 3.552062862421924e-05, + "loss": 0.941, + "step": 4944 + }, + { + "epoch": 0.2407907871351009, + "grad_norm": 1.4995006322860718, + "learning_rate": 3.5518638989575805e-05, + "loss": 0.8505, + "step": 4945 + }, + { + "epoch": 0.24083948092420812, + "grad_norm": 1.9831770658493042, + "learning_rate": 3.551664896890482e-05, + "loss": 0.8702, + "step": 4946 + }, + { + "epoch": 0.2408881747133153, + "grad_norm": 1.3767768144607544, + "learning_rate": 3.551465856225578e-05, + "loss": 0.8929, + "step": 4947 + }, + { + "epoch": 0.24093686850242252, + "grad_norm": 1.6842906475067139, + "learning_rate": 3.5512667769678185e-05, + "loss": 0.9088, + "step": 4948 + }, + { + "epoch": 0.2409855622915297, + "grad_norm": 1.5230886936187744, + "learning_rate": 3.551067659122157e-05, + "loss": 0.8748, + "step": 4949 + }, + { + "epoch": 0.24103425608063692, + "grad_norm": 1.6408029794692993, + "learning_rate": 3.550868502693546e-05, + "loss": 0.9612, + "step": 4950 + }, + { + "epoch": 0.2410829498697441, + "grad_norm": 2.767662525177002, + "learning_rate": 3.5506693076869396e-05, + "loss": 0.8886, + "step": 4951 + }, + { + "epoch": 0.2411316436588513, + "grad_norm": 1.765924096107483, + "learning_rate": 3.5504700741072926e-05, + "loss": 0.959, + "step": 4952 + }, + { + "epoch": 0.24118033744795853, + "grad_norm": 1.3215062618255615, + "learning_rate": 3.550270801959561e-05, + "loss": 0.8257, + "step": 4953 + }, + { + "epoch": 0.2412290312370657, + "grad_norm": 1.6673725843429565, + "learning_rate": 3.550071491248702e-05, + "loss": 0.825, + "step": 4954 + }, + { + "epoch": 0.24127772502617292, + "grad_norm": 1.3917789459228516, + "learning_rate": 3.5498721419796735e-05, + "loss": 0.9137, + "step": 4955 + }, + { + "epoch": 0.2413264188152801, + "grad_norm": 1.7628508806228638, + "learning_rate": 3.5496727541574336e-05, + "loss": 0.8465, + "step": 4956 + }, + { + "epoch": 0.24137511260438732, + "grad_norm": 1.5738275051116943, + "learning_rate": 3.549473327786944e-05, + "loss": 0.8552, + "step": 4957 + }, + { + "epoch": 0.2414238063934945, + "grad_norm": 1.4102717638015747, + "learning_rate": 3.549273862873162e-05, + "loss": 0.9245, + "step": 4958 + }, + { + "epoch": 0.24147250018260172, + "grad_norm": 3.901277780532837, + "learning_rate": 3.549074359421053e-05, + "loss": 0.8454, + "step": 4959 + }, + { + "epoch": 0.2415211939717089, + "grad_norm": 1.5281037092208862, + "learning_rate": 3.548874817435578e-05, + "loss": 0.8475, + "step": 4960 + }, + { + "epoch": 0.24156988776081612, + "grad_norm": 1.7439460754394531, + "learning_rate": 3.5486752369217e-05, + "loss": 0.8881, + "step": 4961 + }, + { + "epoch": 0.2416185815499233, + "grad_norm": 2.231931686401367, + "learning_rate": 3.548475617884385e-05, + "loss": 0.9186, + "step": 4962 + }, + { + "epoch": 0.2416672753390305, + "grad_norm": 1.6282830238342285, + "learning_rate": 3.5482759603285976e-05, + "loss": 0.9029, + "step": 4963 + }, + { + "epoch": 0.2417159691281377, + "grad_norm": 1.5748413801193237, + "learning_rate": 3.548076264259304e-05, + "loss": 0.9562, + "step": 4964 + }, + { + "epoch": 0.2417646629172449, + "grad_norm": 2.099306106567383, + "learning_rate": 3.5478765296814725e-05, + "loss": 0.8826, + "step": 4965 + }, + { + "epoch": 0.2418133567063521, + "grad_norm": 1.546350121498108, + "learning_rate": 3.547676756600071e-05, + "loss": 0.931, + "step": 4966 + }, + { + "epoch": 0.2418620504954593, + "grad_norm": 1.678848385810852, + "learning_rate": 3.54747694502007e-05, + "loss": 0.9532, + "step": 4967 + }, + { + "epoch": 0.2419107442845665, + "grad_norm": 1.3356812000274658, + "learning_rate": 3.547277094946438e-05, + "loss": 0.9461, + "step": 4968 + }, + { + "epoch": 0.2419594380736737, + "grad_norm": 2.465646505355835, + "learning_rate": 3.547077206384147e-05, + "loss": 0.9305, + "step": 4969 + }, + { + "epoch": 0.2420081318627809, + "grad_norm": 1.7410598993301392, + "learning_rate": 3.54687727933817e-05, + "loss": 0.8921, + "step": 4970 + }, + { + "epoch": 0.2420568256518881, + "grad_norm": 1.4531581401824951, + "learning_rate": 3.5466773138134794e-05, + "loss": 0.8431, + "step": 4971 + }, + { + "epoch": 0.24210551944099531, + "grad_norm": 1.9764299392700195, + "learning_rate": 3.54647730981505e-05, + "loss": 0.8873, + "step": 4972 + }, + { + "epoch": 0.2421542132301025, + "grad_norm": 3.967972755432129, + "learning_rate": 3.546277267347856e-05, + "loss": 0.9347, + "step": 4973 + }, + { + "epoch": 0.2422029070192097, + "grad_norm": 1.425205945968628, + "learning_rate": 3.5460771864168734e-05, + "loss": 0.9115, + "step": 4974 + }, + { + "epoch": 0.2422516008083169, + "grad_norm": 1.3246726989746094, + "learning_rate": 3.545877067027081e-05, + "loss": 0.8263, + "step": 4975 + }, + { + "epoch": 0.2423002945974241, + "grad_norm": 1.6431527137756348, + "learning_rate": 3.545676909183455e-05, + "loss": 0.8798, + "step": 4976 + }, + { + "epoch": 0.2423489883865313, + "grad_norm": 1.4519541263580322, + "learning_rate": 3.5454767128909756e-05, + "loss": 0.9916, + "step": 4977 + }, + { + "epoch": 0.2423976821756385, + "grad_norm": 1.2371035814285278, + "learning_rate": 3.5452764781546216e-05, + "loss": 0.8265, + "step": 4978 + }, + { + "epoch": 0.2424463759647457, + "grad_norm": 1.3490962982177734, + "learning_rate": 3.545076204979374e-05, + "loss": 0.8665, + "step": 4979 + }, + { + "epoch": 0.2424950697538529, + "grad_norm": 1.8630926609039307, + "learning_rate": 3.5448758933702155e-05, + "loss": 0.822, + "step": 4980 + }, + { + "epoch": 0.2425437635429601, + "grad_norm": 1.560499906539917, + "learning_rate": 3.5446755433321284e-05, + "loss": 0.9087, + "step": 4981 + }, + { + "epoch": 0.2425924573320673, + "grad_norm": 1.629143238067627, + "learning_rate": 3.544475154870096e-05, + "loss": 1.0065, + "step": 4982 + }, + { + "epoch": 0.2426411511211745, + "grad_norm": 1.753584861755371, + "learning_rate": 3.5442747279891027e-05, + "loss": 0.9279, + "step": 4983 + }, + { + "epoch": 0.2426898449102817, + "grad_norm": 1.6491596698760986, + "learning_rate": 3.544074262694135e-05, + "loss": 0.9192, + "step": 4984 + }, + { + "epoch": 0.24273853869938888, + "grad_norm": 1.4942553043365479, + "learning_rate": 3.5438737589901796e-05, + "loss": 0.9231, + "step": 4985 + }, + { + "epoch": 0.2427872324884961, + "grad_norm": 1.6950188875198364, + "learning_rate": 3.5436732168822245e-05, + "loss": 0.9125, + "step": 4986 + }, + { + "epoch": 0.24283592627760328, + "grad_norm": 1.7080804109573364, + "learning_rate": 3.5434726363752566e-05, + "loss": 0.7877, + "step": 4987 + }, + { + "epoch": 0.2428846200667105, + "grad_norm": 1.4212864637374878, + "learning_rate": 3.543272017474266e-05, + "loss": 0.944, + "step": 4988 + }, + { + "epoch": 0.24293331385581768, + "grad_norm": 1.690633773803711, + "learning_rate": 3.543071360184243e-05, + "loss": 0.8521, + "step": 4989 + }, + { + "epoch": 0.2429820076449249, + "grad_norm": 1.6626158952713013, + "learning_rate": 3.5428706645101795e-05, + "loss": 0.8055, + "step": 4990 + }, + { + "epoch": 0.24303070143403208, + "grad_norm": 1.3777738809585571, + "learning_rate": 3.5426699304570674e-05, + "loss": 0.8023, + "step": 4991 + }, + { + "epoch": 0.2430793952231393, + "grad_norm": 1.7313159704208374, + "learning_rate": 3.542469158029901e-05, + "loss": 0.8188, + "step": 4992 + }, + { + "epoch": 0.2431280890122465, + "grad_norm": 1.5667991638183594, + "learning_rate": 3.5422683472336724e-05, + "loss": 0.889, + "step": 4993 + }, + { + "epoch": 0.24317678280135369, + "grad_norm": 1.260711431503296, + "learning_rate": 3.5420674980733784e-05, + "loss": 0.8883, + "step": 4994 + }, + { + "epoch": 0.2432254765904609, + "grad_norm": 1.4760414361953735, + "learning_rate": 3.541866610554016e-05, + "loss": 0.8814, + "step": 4995 + }, + { + "epoch": 0.24327417037956808, + "grad_norm": 2.967082977294922, + "learning_rate": 3.5416656846805796e-05, + "loss": 0.8379, + "step": 4996 + }, + { + "epoch": 0.2433228641686753, + "grad_norm": 1.7587518692016602, + "learning_rate": 3.541464720458069e-05, + "loss": 0.8812, + "step": 4997 + }, + { + "epoch": 0.24337155795778248, + "grad_norm": 1.8735666275024414, + "learning_rate": 3.5412637178914834e-05, + "loss": 0.8887, + "step": 4998 + }, + { + "epoch": 0.2434202517468897, + "grad_norm": 0.08797744661569595, + "learning_rate": 3.541062676985822e-05, + "loss": 0.7047, + "step": 4999 + }, + { + "epoch": 0.24346894553599688, + "grad_norm": 2.2064855098724365, + "learning_rate": 3.5408615977460855e-05, + "loss": 0.8313, + "step": 5000 + }, + { + "epoch": 0.2435176393251041, + "grad_norm": 1.9819085597991943, + "learning_rate": 3.540660480177277e-05, + "loss": 0.9066, + "step": 5001 + }, + { + "epoch": 0.24356633311421128, + "grad_norm": 1.5864790678024292, + "learning_rate": 3.5404593242843984e-05, + "loss": 0.8569, + "step": 5002 + }, + { + "epoch": 0.2436150269033185, + "grad_norm": 1.6128597259521484, + "learning_rate": 3.540258130072453e-05, + "loss": 0.8757, + "step": 5003 + }, + { + "epoch": 0.24366372069242567, + "grad_norm": 1.8001736402511597, + "learning_rate": 3.5400568975464474e-05, + "loss": 0.8622, + "step": 5004 + }, + { + "epoch": 0.24371241448153289, + "grad_norm": 2.5746567249298096, + "learning_rate": 3.5398556267113856e-05, + "loss": 0.9258, + "step": 5005 + }, + { + "epoch": 0.24376110827064007, + "grad_norm": 1.5843905210494995, + "learning_rate": 3.539654317572274e-05, + "loss": 0.8419, + "step": 5006 + }, + { + "epoch": 0.24380980205974728, + "grad_norm": 1.6685768365859985, + "learning_rate": 3.539452970134122e-05, + "loss": 0.9584, + "step": 5007 + }, + { + "epoch": 0.24385849584885447, + "grad_norm": 1.6178741455078125, + "learning_rate": 3.5392515844019365e-05, + "loss": 0.8271, + "step": 5008 + }, + { + "epoch": 0.24390718963796168, + "grad_norm": 3.3397417068481445, + "learning_rate": 3.539050160380727e-05, + "loss": 0.788, + "step": 5009 + }, + { + "epoch": 0.24395588342706886, + "grad_norm": 1.3550503253936768, + "learning_rate": 3.538848698075505e-05, + "loss": 0.9509, + "step": 5010 + }, + { + "epoch": 0.24400457721617608, + "grad_norm": 1.3979045152664185, + "learning_rate": 3.5386471974912816e-05, + "loss": 0.9372, + "step": 5011 + }, + { + "epoch": 0.2440532710052833, + "grad_norm": 1.8677674531936646, + "learning_rate": 3.538445658633068e-05, + "loss": 0.8075, + "step": 5012 + }, + { + "epoch": 0.24410196479439047, + "grad_norm": 1.6562087535858154, + "learning_rate": 3.5382440815058786e-05, + "loss": 0.9503, + "step": 5013 + }, + { + "epoch": 0.2441506585834977, + "grad_norm": 1.7041195631027222, + "learning_rate": 3.538042466114728e-05, + "loss": 0.9824, + "step": 5014 + }, + { + "epoch": 0.24419935237260487, + "grad_norm": 1.3142198324203491, + "learning_rate": 3.537840812464631e-05, + "loss": 0.9029, + "step": 5015 + }, + { + "epoch": 0.24424804616171208, + "grad_norm": 1.689681887626648, + "learning_rate": 3.5376391205606024e-05, + "loss": 0.9048, + "step": 5016 + }, + { + "epoch": 0.24429673995081927, + "grad_norm": 1.6347638368606567, + "learning_rate": 3.5374373904076616e-05, + "loss": 0.9116, + "step": 5017 + }, + { + "epoch": 0.24434543373992648, + "grad_norm": 2.024197816848755, + "learning_rate": 3.537235622010825e-05, + "loss": 0.9648, + "step": 5018 + }, + { + "epoch": 0.24439412752903367, + "grad_norm": 1.7931016683578491, + "learning_rate": 3.5370338153751116e-05, + "loss": 0.8045, + "step": 5019 + }, + { + "epoch": 0.24444282131814088, + "grad_norm": 1.4597891569137573, + "learning_rate": 3.536831970505543e-05, + "loss": 0.9051, + "step": 5020 + }, + { + "epoch": 0.24449151510724806, + "grad_norm": 1.7301338911056519, + "learning_rate": 3.5366300874071385e-05, + "loss": 0.8842, + "step": 5021 + }, + { + "epoch": 0.24454020889635528, + "grad_norm": 1.2715070247650146, + "learning_rate": 3.53642816608492e-05, + "loss": 0.883, + "step": 5022 + }, + { + "epoch": 0.24458890268546246, + "grad_norm": 1.603096604347229, + "learning_rate": 3.536226206543912e-05, + "loss": 0.8947, + "step": 5023 + }, + { + "epoch": 0.24463759647456967, + "grad_norm": 1.628390908241272, + "learning_rate": 3.536024208789136e-05, + "loss": 0.831, + "step": 5024 + }, + { + "epoch": 0.24468629026367686, + "grad_norm": 2.136474370956421, + "learning_rate": 3.535822172825618e-05, + "loss": 0.8035, + "step": 5025 + }, + { + "epoch": 0.24473498405278407, + "grad_norm": 0.08211500197649002, + "learning_rate": 3.535620098658384e-05, + "loss": 0.5566, + "step": 5026 + }, + { + "epoch": 0.24478367784189126, + "grad_norm": 0.09064331650733948, + "learning_rate": 3.53541798629246e-05, + "loss": 0.6987, + "step": 5027 + }, + { + "epoch": 0.24483237163099847, + "grad_norm": 1.4764478206634521, + "learning_rate": 3.5352158357328725e-05, + "loss": 0.8085, + "step": 5028 + }, + { + "epoch": 0.24488106542010565, + "grad_norm": 2.148128032684326, + "learning_rate": 3.535013646984652e-05, + "loss": 0.813, + "step": 5029 + }, + { + "epoch": 0.24492975920921287, + "grad_norm": 0.08634394407272339, + "learning_rate": 3.534811420052827e-05, + "loss": 0.6517, + "step": 5030 + }, + { + "epoch": 0.24497845299832005, + "grad_norm": 1.4001595973968506, + "learning_rate": 3.534609154942427e-05, + "loss": 0.8283, + "step": 5031 + }, + { + "epoch": 0.24502714678742726, + "grad_norm": 1.4509005546569824, + "learning_rate": 3.5344068516584854e-05, + "loss": 0.8369, + "step": 5032 + }, + { + "epoch": 0.24507584057653448, + "grad_norm": 2.154876232147217, + "learning_rate": 3.534204510206033e-05, + "loss": 1.0315, + "step": 5033 + }, + { + "epoch": 0.24512453436564166, + "grad_norm": 1.953691840171814, + "learning_rate": 3.534002130590104e-05, + "loss": 0.8496, + "step": 5034 + }, + { + "epoch": 0.24517322815474887, + "grad_norm": 1.5233854055404663, + "learning_rate": 3.5337997128157316e-05, + "loss": 0.9774, + "step": 5035 + }, + { + "epoch": 0.24522192194385606, + "grad_norm": 2.3911654949188232, + "learning_rate": 3.5335972568879516e-05, + "loss": 0.9368, + "step": 5036 + }, + { + "epoch": 0.24527061573296327, + "grad_norm": 2.515530824661255, + "learning_rate": 3.5333947628118006e-05, + "loss": 0.8667, + "step": 5037 + }, + { + "epoch": 0.24531930952207046, + "grad_norm": 2.416656970977783, + "learning_rate": 3.533192230592315e-05, + "loss": 0.8169, + "step": 5038 + }, + { + "epoch": 0.24536800331117767, + "grad_norm": 2.319998025894165, + "learning_rate": 3.532989660234532e-05, + "loss": 0.8199, + "step": 5039 + }, + { + "epoch": 0.24541669710028485, + "grad_norm": 1.5097843408584595, + "learning_rate": 3.532787051743493e-05, + "loss": 0.8516, + "step": 5040 + }, + { + "epoch": 0.24546539088939207, + "grad_norm": 1.2369790077209473, + "learning_rate": 3.532584405124234e-05, + "loss": 0.8422, + "step": 5041 + }, + { + "epoch": 0.24551408467849925, + "grad_norm": 1.8760030269622803, + "learning_rate": 3.5323817203817996e-05, + "loss": 0.9358, + "step": 5042 + }, + { + "epoch": 0.24556277846760646, + "grad_norm": 0.08611597120761871, + "learning_rate": 3.53217899752123e-05, + "loss": 0.5728, + "step": 5043 + }, + { + "epoch": 0.24561147225671365, + "grad_norm": 1.474442720413208, + "learning_rate": 3.5319762365475685e-05, + "loss": 0.9824, + "step": 5044 + }, + { + "epoch": 0.24566016604582086, + "grad_norm": 1.6513166427612305, + "learning_rate": 3.531773437465858e-05, + "loss": 0.9488, + "step": 5045 + }, + { + "epoch": 0.24570885983492805, + "grad_norm": 2.3573198318481445, + "learning_rate": 3.531570600281144e-05, + "loss": 0.8465, + "step": 5046 + }, + { + "epoch": 0.24575755362403526, + "grad_norm": 1.541433334350586, + "learning_rate": 3.531367724998471e-05, + "loss": 0.8697, + "step": 5047 + }, + { + "epoch": 0.24580624741314244, + "grad_norm": 2.0324342250823975, + "learning_rate": 3.531164811622887e-05, + "loss": 0.7756, + "step": 5048 + }, + { + "epoch": 0.24585494120224965, + "grad_norm": 2.8964433670043945, + "learning_rate": 3.530961860159438e-05, + "loss": 0.8844, + "step": 5049 + }, + { + "epoch": 0.24590363499135684, + "grad_norm": 1.4209377765655518, + "learning_rate": 3.5307588706131735e-05, + "loss": 0.8156, + "step": 5050 + }, + { + "epoch": 0.24595232878046405, + "grad_norm": 2.500190019607544, + "learning_rate": 3.530555842989142e-05, + "loss": 0.8867, + "step": 5051 + }, + { + "epoch": 0.24600102256957126, + "grad_norm": 1.523607611656189, + "learning_rate": 3.530352777292395e-05, + "loss": 0.9166, + "step": 5052 + }, + { + "epoch": 0.24604971635867845, + "grad_norm": 1.6044269800186157, + "learning_rate": 3.5301496735279835e-05, + "loss": 0.7922, + "step": 5053 + }, + { + "epoch": 0.24609841014778566, + "grad_norm": 1.4423328638076782, + "learning_rate": 3.529946531700959e-05, + "loss": 0.8479, + "step": 5054 + }, + { + "epoch": 0.24614710393689285, + "grad_norm": 2.214351177215576, + "learning_rate": 3.529743351816374e-05, + "loss": 0.9274, + "step": 5055 + }, + { + "epoch": 0.24619579772600006, + "grad_norm": 1.5660738945007324, + "learning_rate": 3.529540133879285e-05, + "loss": 0.9428, + "step": 5056 + }, + { + "epoch": 0.24624449151510724, + "grad_norm": 1.8573095798492432, + "learning_rate": 3.529336877894745e-05, + "loss": 0.8699, + "step": 5057 + }, + { + "epoch": 0.24629318530421446, + "grad_norm": 1.5731306076049805, + "learning_rate": 3.5291335838678114e-05, + "loss": 1.0038, + "step": 5058 + }, + { + "epoch": 0.24634187909332164, + "grad_norm": 1.4273111820220947, + "learning_rate": 3.52893025180354e-05, + "loss": 0.8649, + "step": 5059 + }, + { + "epoch": 0.24639057288242885, + "grad_norm": 1.7856732606887817, + "learning_rate": 3.528726881706989e-05, + "loss": 0.8359, + "step": 5060 + }, + { + "epoch": 0.24643926667153604, + "grad_norm": 0.08684445172548294, + "learning_rate": 3.528523473583218e-05, + "loss": 0.5602, + "step": 5061 + }, + { + "epoch": 0.24648796046064325, + "grad_norm": 1.4561047554016113, + "learning_rate": 3.528320027437286e-05, + "loss": 0.8663, + "step": 5062 + }, + { + "epoch": 0.24653665424975044, + "grad_norm": 1.3807753324508667, + "learning_rate": 3.5281165432742534e-05, + "loss": 0.82, + "step": 5063 + }, + { + "epoch": 0.24658534803885765, + "grad_norm": 1.9164769649505615, + "learning_rate": 3.527913021099183e-05, + "loss": 0.9694, + "step": 5064 + }, + { + "epoch": 0.24663404182796483, + "grad_norm": 1.7763508558273315, + "learning_rate": 3.527709460917136e-05, + "loss": 0.8517, + "step": 5065 + }, + { + "epoch": 0.24668273561707205, + "grad_norm": 1.8833070993423462, + "learning_rate": 3.527505862733178e-05, + "loss": 0.8856, + "step": 5066 + }, + { + "epoch": 0.24673142940617923, + "grad_norm": 1.7643767595291138, + "learning_rate": 3.527302226552373e-05, + "loss": 0.8836, + "step": 5067 + }, + { + "epoch": 0.24678012319528644, + "grad_norm": 1.4176799058914185, + "learning_rate": 3.527098552379784e-05, + "loss": 0.953, + "step": 5068 + }, + { + "epoch": 0.24682881698439363, + "grad_norm": 1.5714389085769653, + "learning_rate": 3.526894840220481e-05, + "loss": 0.8124, + "step": 5069 + }, + { + "epoch": 0.24687751077350084, + "grad_norm": 0.0831168070435524, + "learning_rate": 3.526691090079528e-05, + "loss": 0.58, + "step": 5070 + }, + { + "epoch": 0.24692620456260803, + "grad_norm": 1.5719588994979858, + "learning_rate": 3.5264873019619954e-05, + "loss": 0.9038, + "step": 5071 + }, + { + "epoch": 0.24697489835171524, + "grad_norm": 1.9124213457107544, + "learning_rate": 3.526283475872953e-05, + "loss": 0.912, + "step": 5072 + }, + { + "epoch": 0.24702359214082245, + "grad_norm": 2.954113245010376, + "learning_rate": 3.526079611817469e-05, + "loss": 0.8517, + "step": 5073 + }, + { + "epoch": 0.24707228592992964, + "grad_norm": 1.5820714235305786, + "learning_rate": 3.525875709800616e-05, + "loss": 0.8816, + "step": 5074 + }, + { + "epoch": 0.24712097971903685, + "grad_norm": 1.903366208076477, + "learning_rate": 3.525671769827465e-05, + "loss": 0.8334, + "step": 5075 + }, + { + "epoch": 0.24716967350814403, + "grad_norm": 1.4469746351242065, + "learning_rate": 3.525467791903091e-05, + "loss": 0.8435, + "step": 5076 + }, + { + "epoch": 0.24721836729725125, + "grad_norm": 1.286222219467163, + "learning_rate": 3.525263776032566e-05, + "loss": 0.8464, + "step": 5077 + }, + { + "epoch": 0.24726706108635843, + "grad_norm": 1.246751070022583, + "learning_rate": 3.525059722220964e-05, + "loss": 0.8982, + "step": 5078 + }, + { + "epoch": 0.24731575487546564, + "grad_norm": 1.3581820726394653, + "learning_rate": 3.5248556304733635e-05, + "loss": 0.879, + "step": 5079 + }, + { + "epoch": 0.24736444866457283, + "grad_norm": 2.106516122817993, + "learning_rate": 3.52465150079484e-05, + "loss": 0.8773, + "step": 5080 + }, + { + "epoch": 0.24741314245368004, + "grad_norm": 1.6224194765090942, + "learning_rate": 3.524447333190472e-05, + "loss": 0.8056, + "step": 5081 + }, + { + "epoch": 0.24746183624278723, + "grad_norm": 2.192328929901123, + "learning_rate": 3.524243127665337e-05, + "loss": 0.8824, + "step": 5082 + }, + { + "epoch": 0.24751053003189444, + "grad_norm": 1.5040724277496338, + "learning_rate": 3.5240388842245154e-05, + "loss": 0.8399, + "step": 5083 + }, + { + "epoch": 0.24755922382100162, + "grad_norm": 1.8073723316192627, + "learning_rate": 3.523834602873087e-05, + "loss": 0.8224, + "step": 5084 + }, + { + "epoch": 0.24760791761010884, + "grad_norm": 1.5529686212539673, + "learning_rate": 3.523630283616135e-05, + "loss": 0.8991, + "step": 5085 + }, + { + "epoch": 0.24765661139921602, + "grad_norm": 1.220744013786316, + "learning_rate": 3.52342592645874e-05, + "loss": 0.8566, + "step": 5086 + }, + { + "epoch": 0.24770530518832323, + "grad_norm": 1.5506293773651123, + "learning_rate": 3.5232215314059866e-05, + "loss": 0.9172, + "step": 5087 + }, + { + "epoch": 0.24775399897743042, + "grad_norm": 1.390471339225769, + "learning_rate": 3.5230170984629585e-05, + "loss": 0.8553, + "step": 5088 + }, + { + "epoch": 0.24780269276653763, + "grad_norm": 1.3027499914169312, + "learning_rate": 3.522812627634741e-05, + "loss": 0.9292, + "step": 5089 + }, + { + "epoch": 0.24785138655564481, + "grad_norm": 0.0839904174208641, + "learning_rate": 3.522608118926421e-05, + "loss": 0.5587, + "step": 5090 + }, + { + "epoch": 0.24790008034475203, + "grad_norm": 1.843827486038208, + "learning_rate": 3.522403572343085e-05, + "loss": 0.9413, + "step": 5091 + }, + { + "epoch": 0.24794877413385924, + "grad_norm": 4.818015098571777, + "learning_rate": 3.522198987889822e-05, + "loss": 0.8527, + "step": 5092 + }, + { + "epoch": 0.24799746792296642, + "grad_norm": 1.645634651184082, + "learning_rate": 3.5219943655717196e-05, + "loss": 0.8657, + "step": 5093 + }, + { + "epoch": 0.24804616171207364, + "grad_norm": 1.356925368309021, + "learning_rate": 3.5217897053938684e-05, + "loss": 0.7581, + "step": 5094 + }, + { + "epoch": 0.24809485550118082, + "grad_norm": 1.3815271854400635, + "learning_rate": 3.52158500736136e-05, + "loss": 0.8556, + "step": 5095 + }, + { + "epoch": 0.24814354929028803, + "grad_norm": 1.7195260524749756, + "learning_rate": 3.521380271479286e-05, + "loss": 0.8209, + "step": 5096 + }, + { + "epoch": 0.24819224307939522, + "grad_norm": 2.3851630687713623, + "learning_rate": 3.521175497752739e-05, + "loss": 0.9166, + "step": 5097 + }, + { + "epoch": 0.24824093686850243, + "grad_norm": 1.6064502000808716, + "learning_rate": 3.520970686186812e-05, + "loss": 0.8158, + "step": 5098 + }, + { + "epoch": 0.24828963065760962, + "grad_norm": 1.4725842475891113, + "learning_rate": 3.520765836786601e-05, + "loss": 0.9547, + "step": 5099 + }, + { + "epoch": 0.24833832444671683, + "grad_norm": 1.541660189628601, + "learning_rate": 3.5205609495572006e-05, + "loss": 0.8774, + "step": 5100 + }, + { + "epoch": 0.24838701823582401, + "grad_norm": 3.7423672676086426, + "learning_rate": 3.520356024503709e-05, + "loss": 0.8438, + "step": 5101 + }, + { + "epoch": 0.24843571202493123, + "grad_norm": 1.6819158792495728, + "learning_rate": 3.520151061631222e-05, + "loss": 0.86, + "step": 5102 + }, + { + "epoch": 0.2484844058140384, + "grad_norm": 2.2968974113464355, + "learning_rate": 3.519946060944839e-05, + "loss": 0.9493, + "step": 5103 + }, + { + "epoch": 0.24853309960314562, + "grad_norm": 0.08508770912885666, + "learning_rate": 3.519741022449659e-05, + "loss": 0.6174, + "step": 5104 + }, + { + "epoch": 0.2485817933922528, + "grad_norm": 1.7336755990982056, + "learning_rate": 3.5195359461507825e-05, + "loss": 0.809, + "step": 5105 + }, + { + "epoch": 0.24863048718136002, + "grad_norm": 1.978164792060852, + "learning_rate": 3.519330832053311e-05, + "loss": 0.8382, + "step": 5106 + }, + { + "epoch": 0.2486791809704672, + "grad_norm": 1.4237085580825806, + "learning_rate": 3.519125680162346e-05, + "loss": 0.7924, + "step": 5107 + }, + { + "epoch": 0.24872787475957442, + "grad_norm": 1.3905526399612427, + "learning_rate": 3.518920490482992e-05, + "loss": 0.8162, + "step": 5108 + }, + { + "epoch": 0.2487765685486816, + "grad_norm": 2.5725505352020264, + "learning_rate": 3.518715263020352e-05, + "loss": 0.941, + "step": 5109 + }, + { + "epoch": 0.24882526233778882, + "grad_norm": 1.8936500549316406, + "learning_rate": 3.518509997779531e-05, + "loss": 0.8873, + "step": 5110 + }, + { + "epoch": 0.248873956126896, + "grad_norm": 1.4560166597366333, + "learning_rate": 3.518304694765636e-05, + "loss": 0.9072, + "step": 5111 + }, + { + "epoch": 0.2489226499160032, + "grad_norm": 1.513098120689392, + "learning_rate": 3.518099353983773e-05, + "loss": 0.8942, + "step": 5112 + }, + { + "epoch": 0.24897134370511043, + "grad_norm": 1.416110634803772, + "learning_rate": 3.51789397543905e-05, + "loss": 0.8256, + "step": 5113 + }, + { + "epoch": 0.2490200374942176, + "grad_norm": 1.9260385036468506, + "learning_rate": 3.517688559136576e-05, + "loss": 0.874, + "step": 5114 + }, + { + "epoch": 0.24906873128332482, + "grad_norm": 1.5425950288772583, + "learning_rate": 3.51748310508146e-05, + "loss": 0.9096, + "step": 5115 + }, + { + "epoch": 0.249117425072432, + "grad_norm": 1.3467885255813599, + "learning_rate": 3.5172776132788146e-05, + "loss": 0.8356, + "step": 5116 + }, + { + "epoch": 0.24916611886153922, + "grad_norm": 1.2892005443572998, + "learning_rate": 3.5170720837337496e-05, + "loss": 0.8807, + "step": 5117 + }, + { + "epoch": 0.2492148126506464, + "grad_norm": 1.4829133749008179, + "learning_rate": 3.516866516451378e-05, + "loss": 0.8823, + "step": 5118 + }, + { + "epoch": 0.24926350643975362, + "grad_norm": 2.373661994934082, + "learning_rate": 3.5166609114368145e-05, + "loss": 0.8862, + "step": 5119 + }, + { + "epoch": 0.2493122002288608, + "grad_norm": 4.021313667297363, + "learning_rate": 3.516455268695172e-05, + "loss": 0.8781, + "step": 5120 + }, + { + "epoch": 0.24936089401796802, + "grad_norm": 2.2963597774505615, + "learning_rate": 3.5162495882315665e-05, + "loss": 0.8942, + "step": 5121 + }, + { + "epoch": 0.2494095878070752, + "grad_norm": 1.8514158725738525, + "learning_rate": 3.5160438700511135e-05, + "loss": 0.9398, + "step": 5122 + }, + { + "epoch": 0.2494582815961824, + "grad_norm": 1.8592714071273804, + "learning_rate": 3.515838114158932e-05, + "loss": 0.8109, + "step": 5123 + }, + { + "epoch": 0.2495069753852896, + "grad_norm": 1.3989429473876953, + "learning_rate": 3.515632320560139e-05, + "loss": 0.8914, + "step": 5124 + }, + { + "epoch": 0.2495556691743968, + "grad_norm": 1.2417569160461426, + "learning_rate": 3.5154264892598534e-05, + "loss": 0.9093, + "step": 5125 + }, + { + "epoch": 0.249604362963504, + "grad_norm": 1.949302077293396, + "learning_rate": 3.515220620263195e-05, + "loss": 0.911, + "step": 5126 + }, + { + "epoch": 0.2496530567526112, + "grad_norm": 1.3945518732070923, + "learning_rate": 3.515014713575287e-05, + "loss": 0.9458, + "step": 5127 + }, + { + "epoch": 0.2497017505417184, + "grad_norm": 1.5654674768447876, + "learning_rate": 3.514808769201249e-05, + "loss": 0.9746, + "step": 5128 + }, + { + "epoch": 0.2497504443308256, + "grad_norm": 1.5048637390136719, + "learning_rate": 3.5146027871462054e-05, + "loss": 0.778, + "step": 5129 + }, + { + "epoch": 0.2497991381199328, + "grad_norm": 1.5040111541748047, + "learning_rate": 3.514396767415279e-05, + "loss": 0.8288, + "step": 5130 + }, + { + "epoch": 0.24984783190904, + "grad_norm": 3.25056529045105, + "learning_rate": 3.514190710013595e-05, + "loss": 0.8651, + "step": 5131 + }, + { + "epoch": 0.24989652569814721, + "grad_norm": 1.752362608909607, + "learning_rate": 3.513984614946279e-05, + "loss": 0.9471, + "step": 5132 + }, + { + "epoch": 0.2499452194872544, + "grad_norm": 4.416263580322266, + "learning_rate": 3.5137784822184576e-05, + "loss": 0.8876, + "step": 5133 + }, + { + "epoch": 0.2499939132763616, + "grad_norm": 10.139444351196289, + "learning_rate": 3.5135723118352586e-05, + "loss": 0.8972, + "step": 5134 + }, + { + "epoch": 0.2500426070654688, + "grad_norm": 1.8239442110061646, + "learning_rate": 3.5133661038018104e-05, + "loss": 0.806, + "step": 5135 + }, + { + "epoch": 0.250091300854576, + "grad_norm": 1.5553903579711914, + "learning_rate": 3.513159858123242e-05, + "loss": 0.9291, + "step": 5136 + }, + { + "epoch": 0.2501399946436832, + "grad_norm": 1.8092776536941528, + "learning_rate": 3.5129535748046846e-05, + "loss": 0.9905, + "step": 5137 + }, + { + "epoch": 0.2501886884327904, + "grad_norm": 1.9704703092575073, + "learning_rate": 3.5127472538512684e-05, + "loss": 0.8586, + "step": 5138 + }, + { + "epoch": 0.2502373822218976, + "grad_norm": 1.4214069843292236, + "learning_rate": 3.512540895268127e-05, + "loss": 0.8875, + "step": 5139 + }, + { + "epoch": 0.2502860760110048, + "grad_norm": 0.09012671560049057, + "learning_rate": 3.5123344990603924e-05, + "loss": 0.6491, + "step": 5140 + }, + { + "epoch": 0.250334769800112, + "grad_norm": 1.4612668752670288, + "learning_rate": 3.5121280652332e-05, + "loss": 0.8796, + "step": 5141 + }, + { + "epoch": 0.2503834635892192, + "grad_norm": 1.6427234411239624, + "learning_rate": 3.511921593791684e-05, + "loss": 0.9531, + "step": 5142 + }, + { + "epoch": 0.2504321573783264, + "grad_norm": 1.6112172603607178, + "learning_rate": 3.5117150847409795e-05, + "loss": 0.9119, + "step": 5143 + }, + { + "epoch": 0.25048085116743357, + "grad_norm": 2.335231304168701, + "learning_rate": 3.511508538086225e-05, + "loss": 0.8454, + "step": 5144 + }, + { + "epoch": 0.2505295449565408, + "grad_norm": 1.377394676208496, + "learning_rate": 3.511301953832559e-05, + "loss": 0.8966, + "step": 5145 + }, + { + "epoch": 0.250578238745648, + "grad_norm": 2.500929594039917, + "learning_rate": 3.511095331985117e-05, + "loss": 0.8467, + "step": 5146 + }, + { + "epoch": 0.2506269325347552, + "grad_norm": 5.872000217437744, + "learning_rate": 3.510888672549042e-05, + "loss": 0.9356, + "step": 5147 + }, + { + "epoch": 0.25067562632386237, + "grad_norm": 3.2452311515808105, + "learning_rate": 3.510681975529473e-05, + "loss": 0.8229, + "step": 5148 + }, + { + "epoch": 0.2507243201129696, + "grad_norm": 2.496715545654297, + "learning_rate": 3.510475240931553e-05, + "loss": 0.8731, + "step": 5149 + }, + { + "epoch": 0.2507730139020768, + "grad_norm": 1.6946979761123657, + "learning_rate": 3.510268468760423e-05, + "loss": 0.8723, + "step": 5150 + }, + { + "epoch": 0.250821707691184, + "grad_norm": 2.6794991493225098, + "learning_rate": 3.510061659021228e-05, + "loss": 0.8632, + "step": 5151 + }, + { + "epoch": 0.2508704014802912, + "grad_norm": 0.08114537596702576, + "learning_rate": 3.50985481171911e-05, + "loss": 0.623, + "step": 5152 + }, + { + "epoch": 0.2509190952693984, + "grad_norm": 1.4423872232437134, + "learning_rate": 3.509647926859217e-05, + "loss": 0.9529, + "step": 5153 + }, + { + "epoch": 0.2509677890585056, + "grad_norm": 1.968706727027893, + "learning_rate": 3.509441004446694e-05, + "loss": 0.8229, + "step": 5154 + }, + { + "epoch": 0.2510164828476128, + "grad_norm": 1.716015338897705, + "learning_rate": 3.509234044486688e-05, + "loss": 0.8916, + "step": 5155 + }, + { + "epoch": 0.25106517663672, + "grad_norm": 3.1599364280700684, + "learning_rate": 3.509027046984348e-05, + "loss": 0.8671, + "step": 5156 + }, + { + "epoch": 0.25111387042582717, + "grad_norm": 1.8876546621322632, + "learning_rate": 3.508820011944822e-05, + "loss": 0.8214, + "step": 5157 + }, + { + "epoch": 0.2511625642149344, + "grad_norm": 1.3621803522109985, + "learning_rate": 3.5086129393732615e-05, + "loss": 0.8586, + "step": 5158 + }, + { + "epoch": 0.2512112580040416, + "grad_norm": 1.8853187561035156, + "learning_rate": 3.508405829274816e-05, + "loss": 0.9511, + "step": 5159 + }, + { + "epoch": 0.2512599517931488, + "grad_norm": 1.4066691398620605, + "learning_rate": 3.5081986816546384e-05, + "loss": 0.9064, + "step": 5160 + }, + { + "epoch": 0.25130864558225596, + "grad_norm": 1.7967562675476074, + "learning_rate": 3.507991496517881e-05, + "loss": 0.8491, + "step": 5161 + }, + { + "epoch": 0.2513573393713632, + "grad_norm": 0.08674035221338272, + "learning_rate": 3.507784273869697e-05, + "loss": 0.6227, + "step": 5162 + }, + { + "epoch": 0.2514060331604704, + "grad_norm": 1.4165093898773193, + "learning_rate": 3.507577013715242e-05, + "loss": 0.9339, + "step": 5163 + }, + { + "epoch": 0.2514547269495776, + "grad_norm": 0.08650912344455719, + "learning_rate": 3.507369716059672e-05, + "loss": 0.6528, + "step": 5164 + }, + { + "epoch": 0.25150342073868476, + "grad_norm": 2.3228533267974854, + "learning_rate": 3.507162380908142e-05, + "loss": 0.8954, + "step": 5165 + }, + { + "epoch": 0.25155211452779197, + "grad_norm": 1.4108824729919434, + "learning_rate": 3.5069550082658114e-05, + "loss": 0.9303, + "step": 5166 + }, + { + "epoch": 0.2516008083168992, + "grad_norm": 1.6097666025161743, + "learning_rate": 3.5067475981378364e-05, + "loss": 0.7068, + "step": 5167 + }, + { + "epoch": 0.2516495021060064, + "grad_norm": 2.359144449234009, + "learning_rate": 3.506540150529378e-05, + "loss": 0.8989, + "step": 5168 + }, + { + "epoch": 0.25169819589511355, + "grad_norm": 1.6273572444915771, + "learning_rate": 3.506332665445596e-05, + "loss": 0.7936, + "step": 5169 + }, + { + "epoch": 0.25174688968422076, + "grad_norm": 2.3054630756378174, + "learning_rate": 3.5061251428916514e-05, + "loss": 0.8686, + "step": 5170 + }, + { + "epoch": 0.251795583473328, + "grad_norm": 1.6039798259735107, + "learning_rate": 3.505917582872707e-05, + "loss": 0.8591, + "step": 5171 + }, + { + "epoch": 0.2518442772624352, + "grad_norm": 1.2934986352920532, + "learning_rate": 3.505709985393925e-05, + "loss": 0.8846, + "step": 5172 + }, + { + "epoch": 0.2518929710515424, + "grad_norm": 2.244096040725708, + "learning_rate": 3.5055023504604705e-05, + "loss": 0.8888, + "step": 5173 + }, + { + "epoch": 0.25194166484064956, + "grad_norm": 2.581587791442871, + "learning_rate": 3.505294678077507e-05, + "loss": 0.8548, + "step": 5174 + }, + { + "epoch": 0.25199035862975677, + "grad_norm": 1.9176146984100342, + "learning_rate": 3.505086968250202e-05, + "loss": 0.9035, + "step": 5175 + }, + { + "epoch": 0.252039052418864, + "grad_norm": 1.900381326675415, + "learning_rate": 3.50487922098372e-05, + "loss": 0.8484, + "step": 5176 + }, + { + "epoch": 0.2520877462079712, + "grad_norm": 1.7479031085968018, + "learning_rate": 3.504671436283231e-05, + "loss": 0.9082, + "step": 5177 + }, + { + "epoch": 0.25213643999707835, + "grad_norm": 1.4308030605316162, + "learning_rate": 3.5044636141539034e-05, + "loss": 0.8251, + "step": 5178 + }, + { + "epoch": 0.25218513378618557, + "grad_norm": 1.8830713033676147, + "learning_rate": 3.504255754600905e-05, + "loss": 0.9316, + "step": 5179 + }, + { + "epoch": 0.2522338275752928, + "grad_norm": 1.5673174858093262, + "learning_rate": 3.504047857629408e-05, + "loss": 0.887, + "step": 5180 + }, + { + "epoch": 0.2522825213644, + "grad_norm": 1.5210931301116943, + "learning_rate": 3.503839923244584e-05, + "loss": 0.9134, + "step": 5181 + }, + { + "epoch": 0.25233121515350715, + "grad_norm": 4.9121575355529785, + "learning_rate": 3.503631951451604e-05, + "loss": 0.8878, + "step": 5182 + }, + { + "epoch": 0.25237990894261436, + "grad_norm": 1.4857087135314941, + "learning_rate": 3.503423942255643e-05, + "loss": 0.8414, + "step": 5183 + }, + { + "epoch": 0.2524286027317216, + "grad_norm": 2.448613405227661, + "learning_rate": 3.503215895661874e-05, + "loss": 0.872, + "step": 5184 + }, + { + "epoch": 0.2524772965208288, + "grad_norm": 3.7350101470947266, + "learning_rate": 3.5030078116754725e-05, + "loss": 0.8952, + "step": 5185 + }, + { + "epoch": 0.25252599030993594, + "grad_norm": 1.5777686834335327, + "learning_rate": 3.5027996903016146e-05, + "loss": 0.8583, + "step": 5186 + }, + { + "epoch": 0.25257468409904316, + "grad_norm": 2.3115172386169434, + "learning_rate": 3.502591531545477e-05, + "loss": 0.8322, + "step": 5187 + }, + { + "epoch": 0.25262337788815037, + "grad_norm": 1.923102617263794, + "learning_rate": 3.502383335412239e-05, + "loss": 0.9213, + "step": 5188 + }, + { + "epoch": 0.2526720716772576, + "grad_norm": 1.6352673768997192, + "learning_rate": 3.502175101907078e-05, + "loss": 0.8728, + "step": 5189 + }, + { + "epoch": 0.25272076546636474, + "grad_norm": 2.2158560752868652, + "learning_rate": 3.501966831035174e-05, + "loss": 0.8322, + "step": 5190 + }, + { + "epoch": 0.25276945925547195, + "grad_norm": 1.6429823637008667, + "learning_rate": 3.501758522801709e-05, + "loss": 0.7615, + "step": 5191 + }, + { + "epoch": 0.25281815304457916, + "grad_norm": 1.676511287689209, + "learning_rate": 3.501550177211863e-05, + "loss": 0.9817, + "step": 5192 + }, + { + "epoch": 0.2528668468336864, + "grad_norm": 1.53475022315979, + "learning_rate": 3.50134179427082e-05, + "loss": 0.9582, + "step": 5193 + }, + { + "epoch": 0.2529155406227936, + "grad_norm": 1.4557836055755615, + "learning_rate": 3.5011333739837626e-05, + "loss": 0.9171, + "step": 5194 + }, + { + "epoch": 0.25296423441190075, + "grad_norm": 1.7311408519744873, + "learning_rate": 3.500924916355875e-05, + "loss": 0.9209, + "step": 5195 + }, + { + "epoch": 0.25301292820100796, + "grad_norm": 1.7107199430465698, + "learning_rate": 3.5007164213923443e-05, + "loss": 0.9799, + "step": 5196 + }, + { + "epoch": 0.25306162199011517, + "grad_norm": 1.7685842514038086, + "learning_rate": 3.500507889098356e-05, + "loss": 0.9105, + "step": 5197 + }, + { + "epoch": 0.2531103157792224, + "grad_norm": 2.231205701828003, + "learning_rate": 3.5002993194790965e-05, + "loss": 0.7717, + "step": 5198 + }, + { + "epoch": 0.25315900956832954, + "grad_norm": 1.4988956451416016, + "learning_rate": 3.500090712539755e-05, + "loss": 0.8985, + "step": 5199 + }, + { + "epoch": 0.25320770335743675, + "grad_norm": 1.6954387426376343, + "learning_rate": 3.49988206828552e-05, + "loss": 0.9928, + "step": 5200 + }, + { + "epoch": 0.25325639714654397, + "grad_norm": 1.4071773290634155, + "learning_rate": 3.499673386721582e-05, + "loss": 0.9358, + "step": 5201 + }, + { + "epoch": 0.2533050909356512, + "grad_norm": 2.7929558753967285, + "learning_rate": 3.499464667853131e-05, + "loss": 0.929, + "step": 5202 + }, + { + "epoch": 0.25335378472475834, + "grad_norm": 2.2012438774108887, + "learning_rate": 3.4992559116853605e-05, + "loss": 0.8401, + "step": 5203 + }, + { + "epoch": 0.25340247851386555, + "grad_norm": 1.387035608291626, + "learning_rate": 3.499047118223462e-05, + "loss": 0.9008, + "step": 5204 + }, + { + "epoch": 0.25345117230297276, + "grad_norm": 3.465606451034546, + "learning_rate": 3.49883828747263e-05, + "loss": 0.8636, + "step": 5205 + }, + { + "epoch": 0.25349986609208, + "grad_norm": 2.1369948387145996, + "learning_rate": 3.498629419438059e-05, + "loss": 0.8432, + "step": 5206 + }, + { + "epoch": 0.25354855988118713, + "grad_norm": 2.1122899055480957, + "learning_rate": 3.498420514124945e-05, + "loss": 0.8903, + "step": 5207 + }, + { + "epoch": 0.25359725367029434, + "grad_norm": 1.9968390464782715, + "learning_rate": 3.4982115715384836e-05, + "loss": 0.8477, + "step": 5208 + }, + { + "epoch": 0.25364594745940156, + "grad_norm": 5.799667835235596, + "learning_rate": 3.4980025916838724e-05, + "loss": 0.9514, + "step": 5209 + }, + { + "epoch": 0.25369464124850877, + "grad_norm": 1.3180742263793945, + "learning_rate": 3.497793574566311e-05, + "loss": 0.9418, + "step": 5210 + }, + { + "epoch": 0.2537433350376159, + "grad_norm": 1.4960429668426514, + "learning_rate": 3.497584520190997e-05, + "loss": 0.9567, + "step": 5211 + }, + { + "epoch": 0.25379202882672314, + "grad_norm": 1.6316348314285278, + "learning_rate": 3.4973754285631315e-05, + "loss": 0.7988, + "step": 5212 + }, + { + "epoch": 0.25384072261583035, + "grad_norm": 9.575799942016602, + "learning_rate": 3.4971662996879166e-05, + "loss": 0.8167, + "step": 5213 + }, + { + "epoch": 0.25388941640493756, + "grad_norm": 1.5470693111419678, + "learning_rate": 3.496957133570553e-05, + "loss": 0.8214, + "step": 5214 + }, + { + "epoch": 0.2539381101940448, + "grad_norm": 1.22503662109375, + "learning_rate": 3.4967479302162446e-05, + "loss": 0.9164, + "step": 5215 + }, + { + "epoch": 0.25398680398315193, + "grad_norm": 1.287968397140503, + "learning_rate": 3.4965386896301944e-05, + "loss": 0.8454, + "step": 5216 + }, + { + "epoch": 0.25403549777225914, + "grad_norm": 2.298339605331421, + "learning_rate": 3.496329411817608e-05, + "loss": 0.8927, + "step": 5217 + }, + { + "epoch": 0.25408419156136636, + "grad_norm": 1.580763578414917, + "learning_rate": 3.496120096783691e-05, + "loss": 0.8732, + "step": 5218 + }, + { + "epoch": 0.25413288535047357, + "grad_norm": 1.3181753158569336, + "learning_rate": 3.4959107445336504e-05, + "loss": 0.9041, + "step": 5219 + }, + { + "epoch": 0.2541815791395807, + "grad_norm": 2.2325356006622314, + "learning_rate": 3.495701355072694e-05, + "loss": 0.8486, + "step": 5220 + }, + { + "epoch": 0.25423027292868794, + "grad_norm": 1.9919273853302002, + "learning_rate": 3.495491928406029e-05, + "loss": 0.9031, + "step": 5221 + }, + { + "epoch": 0.25427896671779515, + "grad_norm": 1.994774580001831, + "learning_rate": 3.4952824645388664e-05, + "loss": 0.8579, + "step": 5222 + }, + { + "epoch": 0.25432766050690236, + "grad_norm": 1.612059473991394, + "learning_rate": 3.4950729634764156e-05, + "loss": 0.8376, + "step": 5223 + }, + { + "epoch": 0.2543763542960095, + "grad_norm": 1.8455740213394165, + "learning_rate": 3.4948634252238896e-05, + "loss": 0.8597, + "step": 5224 + }, + { + "epoch": 0.25442504808511673, + "grad_norm": 1.2888797521591187, + "learning_rate": 3.494653849786499e-05, + "loss": 0.9312, + "step": 5225 + }, + { + "epoch": 0.25447374187422395, + "grad_norm": 1.5204174518585205, + "learning_rate": 3.494444237169458e-05, + "loss": 0.8807, + "step": 5226 + }, + { + "epoch": 0.25452243566333116, + "grad_norm": 4.3043622970581055, + "learning_rate": 3.49423458737798e-05, + "loss": 0.8165, + "step": 5227 + }, + { + "epoch": 0.2545711294524383, + "grad_norm": 2.2990775108337402, + "learning_rate": 3.4940249004172805e-05, + "loss": 0.863, + "step": 5228 + }, + { + "epoch": 0.25461982324154553, + "grad_norm": 0.08993002772331238, + "learning_rate": 3.493815176292575e-05, + "loss": 0.686, + "step": 5229 + }, + { + "epoch": 0.25466851703065274, + "grad_norm": 3.877833604812622, + "learning_rate": 3.493605415009081e-05, + "loss": 0.9273, + "step": 5230 + }, + { + "epoch": 0.25471721081975995, + "grad_norm": 1.113810658454895, + "learning_rate": 3.493395616572017e-05, + "loss": 0.9107, + "step": 5231 + }, + { + "epoch": 0.25476590460886717, + "grad_norm": 2.526521921157837, + "learning_rate": 3.4931857809865994e-05, + "loss": 0.9912, + "step": 5232 + }, + { + "epoch": 0.2548145983979743, + "grad_norm": 1.9749606847763062, + "learning_rate": 3.492975908258051e-05, + "loss": 0.9675, + "step": 5233 + }, + { + "epoch": 0.25486329218708154, + "grad_norm": 1.8423616886138916, + "learning_rate": 3.492765998391589e-05, + "loss": 0.9232, + "step": 5234 + }, + { + "epoch": 0.25491198597618875, + "grad_norm": 2.322960138320923, + "learning_rate": 3.492556051392438e-05, + "loss": 0.8261, + "step": 5235 + }, + { + "epoch": 0.25496067976529596, + "grad_norm": 1.5406721830368042, + "learning_rate": 3.492346067265819e-05, + "loss": 0.8835, + "step": 5236 + }, + { + "epoch": 0.2550093735544031, + "grad_norm": 1.6713893413543701, + "learning_rate": 3.4921360460169546e-05, + "loss": 0.8933, + "step": 5237 + }, + { + "epoch": 0.25505806734351033, + "grad_norm": 1.5848307609558105, + "learning_rate": 3.491925987651071e-05, + "loss": 0.8715, + "step": 5238 + }, + { + "epoch": 0.25510676113261754, + "grad_norm": 1.7995939254760742, + "learning_rate": 3.491715892173392e-05, + "loss": 0.9438, + "step": 5239 + }, + { + "epoch": 0.25515545492172476, + "grad_norm": 1.6078025102615356, + "learning_rate": 3.491505759589144e-05, + "loss": 0.921, + "step": 5240 + }, + { + "epoch": 0.2552041487108319, + "grad_norm": 1.6027395725250244, + "learning_rate": 3.4912955899035545e-05, + "loss": 0.9604, + "step": 5241 + }, + { + "epoch": 0.2552528424999391, + "grad_norm": 7.081265926361084, + "learning_rate": 3.491085383121851e-05, + "loss": 0.9164, + "step": 5242 + }, + { + "epoch": 0.25530153628904634, + "grad_norm": 1.5537582635879517, + "learning_rate": 3.490875139249263e-05, + "loss": 0.8377, + "step": 5243 + }, + { + "epoch": 0.25535023007815355, + "grad_norm": 1.9959430694580078, + "learning_rate": 3.49066485829102e-05, + "loss": 0.8667, + "step": 5244 + }, + { + "epoch": 0.2553989238672607, + "grad_norm": 1.9208636283874512, + "learning_rate": 3.490454540252352e-05, + "loss": 0.8713, + "step": 5245 + }, + { + "epoch": 0.2554476176563679, + "grad_norm": 2.0306005477905273, + "learning_rate": 3.490244185138492e-05, + "loss": 0.8638, + "step": 5246 + }, + { + "epoch": 0.25549631144547513, + "grad_norm": 0.08820263296365738, + "learning_rate": 3.4900337929546726e-05, + "loss": 0.6265, + "step": 5247 + }, + { + "epoch": 0.25554500523458235, + "grad_norm": 1.3834583759307861, + "learning_rate": 3.489823363706125e-05, + "loss": 0.8661, + "step": 5248 + }, + { + "epoch": 0.2555936990236895, + "grad_norm": 1.6571916341781616, + "learning_rate": 3.489612897398087e-05, + "loss": 0.9124, + "step": 5249 + }, + { + "epoch": 0.2556423928127967, + "grad_norm": 0.08956944197416306, + "learning_rate": 3.489402394035792e-05, + "loss": 0.5922, + "step": 5250 + }, + { + "epoch": 0.2556910866019039, + "grad_norm": 1.559242844581604, + "learning_rate": 3.489191853624476e-05, + "loss": 0.8528, + "step": 5251 + }, + { + "epoch": 0.25573978039101114, + "grad_norm": 2.1064138412475586, + "learning_rate": 3.488981276169378e-05, + "loss": 0.8874, + "step": 5252 + }, + { + "epoch": 0.25578847418011835, + "grad_norm": 1.4313335418701172, + "learning_rate": 3.488770661675734e-05, + "loss": 0.971, + "step": 5253 + }, + { + "epoch": 0.2558371679692255, + "grad_norm": 1.5345131158828735, + "learning_rate": 3.4885600101487846e-05, + "loss": 0.9444, + "step": 5254 + }, + { + "epoch": 0.2558858617583327, + "grad_norm": 6.788299560546875, + "learning_rate": 3.4883493215937694e-05, + "loss": 0.8578, + "step": 5255 + }, + { + "epoch": 0.25593455554743993, + "grad_norm": 1.8686636686325073, + "learning_rate": 3.4881385960159286e-05, + "loss": 0.8775, + "step": 5256 + }, + { + "epoch": 0.25598324933654715, + "grad_norm": 0.08700042963027954, + "learning_rate": 3.4879278334205045e-05, + "loss": 0.653, + "step": 5257 + }, + { + "epoch": 0.2560319431256543, + "grad_norm": 1.6930770874023438, + "learning_rate": 3.4877170338127404e-05, + "loss": 0.9653, + "step": 5258 + }, + { + "epoch": 0.2560806369147615, + "grad_norm": 1.3898351192474365, + "learning_rate": 3.4875061971978785e-05, + "loss": 0.8771, + "step": 5259 + }, + { + "epoch": 0.25612933070386873, + "grad_norm": 2.415515899658203, + "learning_rate": 3.4872953235811656e-05, + "loss": 0.9225, + "step": 5260 + }, + { + "epoch": 0.25617802449297594, + "grad_norm": 2.5658111572265625, + "learning_rate": 3.487084412967845e-05, + "loss": 0.9048, + "step": 5261 + }, + { + "epoch": 0.2562267182820831, + "grad_norm": 1.9072586297988892, + "learning_rate": 3.486873465363164e-05, + "loss": 0.8624, + "step": 5262 + }, + { + "epoch": 0.2562754120711903, + "grad_norm": 1.4750019311904907, + "learning_rate": 3.486662480772371e-05, + "loss": 0.8722, + "step": 5263 + }, + { + "epoch": 0.2563241058602975, + "grad_norm": 1.4129395484924316, + "learning_rate": 3.486451459200712e-05, + "loss": 0.8784, + "step": 5264 + }, + { + "epoch": 0.25637279964940474, + "grad_norm": 1.89109206199646, + "learning_rate": 3.486240400653438e-05, + "loss": 0.9473, + "step": 5265 + }, + { + "epoch": 0.2564214934385119, + "grad_norm": 1.4311184883117676, + "learning_rate": 3.486029305135798e-05, + "loss": 0.8836, + "step": 5266 + }, + { + "epoch": 0.2564701872276191, + "grad_norm": 2.0560691356658936, + "learning_rate": 3.485818172653044e-05, + "loss": 0.8683, + "step": 5267 + }, + { + "epoch": 0.2565188810167263, + "grad_norm": 2.1587777137756348, + "learning_rate": 3.4856070032104274e-05, + "loss": 0.8984, + "step": 5268 + }, + { + "epoch": 0.25656757480583353, + "grad_norm": 0.08546245843172073, + "learning_rate": 3.485395796813202e-05, + "loss": 0.6448, + "step": 5269 + }, + { + "epoch": 0.2566162685949407, + "grad_norm": 2.136796712875366, + "learning_rate": 3.48518455346662e-05, + "loss": 0.8228, + "step": 5270 + }, + { + "epoch": 0.2566649623840479, + "grad_norm": 1.7398911714553833, + "learning_rate": 3.4849732731759366e-05, + "loss": 0.8865, + "step": 5271 + }, + { + "epoch": 0.2567136561731551, + "grad_norm": 1.5484566688537598, + "learning_rate": 3.4847619559464076e-05, + "loss": 0.826, + "step": 5272 + }, + { + "epoch": 0.2567623499622623, + "grad_norm": 3.7137291431427, + "learning_rate": 3.4845506017832896e-05, + "loss": 0.8961, + "step": 5273 + }, + { + "epoch": 0.25681104375136954, + "grad_norm": 1.5601136684417725, + "learning_rate": 3.4843392106918405e-05, + "loss": 0.8627, + "step": 5274 + }, + { + "epoch": 0.2568597375404767, + "grad_norm": 1.7131943702697754, + "learning_rate": 3.484127782677318e-05, + "loss": 0.8599, + "step": 5275 + }, + { + "epoch": 0.2569084313295839, + "grad_norm": 1.9954533576965332, + "learning_rate": 3.483916317744981e-05, + "loss": 0.8707, + "step": 5276 + }, + { + "epoch": 0.2569571251186911, + "grad_norm": 2.566002368927002, + "learning_rate": 3.4837048159000915e-05, + "loss": 0.8374, + "step": 5277 + }, + { + "epoch": 0.25700581890779833, + "grad_norm": 1.9096999168395996, + "learning_rate": 3.483493277147909e-05, + "loss": 0.8988, + "step": 5278 + }, + { + "epoch": 0.2570545126969055, + "grad_norm": 1.5922212600708008, + "learning_rate": 3.4832817014936955e-05, + "loss": 0.8131, + "step": 5279 + }, + { + "epoch": 0.2571032064860127, + "grad_norm": 0.08933573216199875, + "learning_rate": 3.483070088942715e-05, + "loss": 0.6201, + "step": 5280 + }, + { + "epoch": 0.2571519002751199, + "grad_norm": 2.6641430854797363, + "learning_rate": 3.48285843950023e-05, + "loss": 0.8806, + "step": 5281 + }, + { + "epoch": 0.25720059406422713, + "grad_norm": 1.535174012184143, + "learning_rate": 3.4826467531715065e-05, + "loss": 0.8563, + "step": 5282 + }, + { + "epoch": 0.2572492878533343, + "grad_norm": 2.055992364883423, + "learning_rate": 3.48243502996181e-05, + "loss": 0.8832, + "step": 5283 + }, + { + "epoch": 0.2572979816424415, + "grad_norm": 2.032000780105591, + "learning_rate": 3.4822232698764066e-05, + "loss": 0.7981, + "step": 5284 + }, + { + "epoch": 0.2573466754315487, + "grad_norm": 1.4458215236663818, + "learning_rate": 3.4820114729205644e-05, + "loss": 0.8826, + "step": 5285 + }, + { + "epoch": 0.2573953692206559, + "grad_norm": 1.8021080493927002, + "learning_rate": 3.481799639099552e-05, + "loss": 0.9476, + "step": 5286 + }, + { + "epoch": 0.2574440630097631, + "grad_norm": 1.8413426876068115, + "learning_rate": 3.481587768418638e-05, + "loss": 0.7918, + "step": 5287 + }, + { + "epoch": 0.2574927567988703, + "grad_norm": 2.0125832557678223, + "learning_rate": 3.481375860883093e-05, + "loss": 0.844, + "step": 5288 + }, + { + "epoch": 0.2575414505879775, + "grad_norm": 1.7345815896987915, + "learning_rate": 3.481163916498189e-05, + "loss": 0.9006, + "step": 5289 + }, + { + "epoch": 0.2575901443770847, + "grad_norm": 2.4992902278900146, + "learning_rate": 3.4809519352691974e-05, + "loss": 0.8908, + "step": 5290 + }, + { + "epoch": 0.2576388381661919, + "grad_norm": 2.4951605796813965, + "learning_rate": 3.480739917201391e-05, + "loss": 0.9066, + "step": 5291 + }, + { + "epoch": 0.2576875319552991, + "grad_norm": 0.09223374724388123, + "learning_rate": 3.480527862300045e-05, + "loss": 0.6546, + "step": 5292 + }, + { + "epoch": 0.2577362257444063, + "grad_norm": 1.7395268678665161, + "learning_rate": 3.480315770570433e-05, + "loss": 0.8209, + "step": 5293 + }, + { + "epoch": 0.2577849195335135, + "grad_norm": 1.6622263193130493, + "learning_rate": 3.480103642017831e-05, + "loss": 0.9136, + "step": 5294 + }, + { + "epoch": 0.2578336133226207, + "grad_norm": 2.4133660793304443, + "learning_rate": 3.4798914766475155e-05, + "loss": 0.8383, + "step": 5295 + }, + { + "epoch": 0.2578823071117279, + "grad_norm": 1.6146868467330933, + "learning_rate": 3.479679274464766e-05, + "loss": 0.9151, + "step": 5296 + }, + { + "epoch": 0.2579310009008351, + "grad_norm": 2.5753111839294434, + "learning_rate": 3.4794670354748585e-05, + "loss": 0.8938, + "step": 5297 + }, + { + "epoch": 0.2579796946899423, + "grad_norm": 1.7388873100280762, + "learning_rate": 3.479254759683074e-05, + "loss": 0.8698, + "step": 5298 + }, + { + "epoch": 0.2580283884790495, + "grad_norm": 1.759015679359436, + "learning_rate": 3.479042447094692e-05, + "loss": 0.9061, + "step": 5299 + }, + { + "epoch": 0.2580770822681567, + "grad_norm": 2.0148377418518066, + "learning_rate": 3.478830097714995e-05, + "loss": 0.8379, + "step": 5300 + }, + { + "epoch": 0.2581257760572639, + "grad_norm": 1.845625638961792, + "learning_rate": 3.4786177115492644e-05, + "loss": 0.8252, + "step": 5301 + }, + { + "epoch": 0.2581744698463711, + "grad_norm": 2.256758451461792, + "learning_rate": 3.478405288602783e-05, + "loss": 0.8749, + "step": 5302 + }, + { + "epoch": 0.2582231636354783, + "grad_norm": 2.1471152305603027, + "learning_rate": 3.4781928288808355e-05, + "loss": 0.8532, + "step": 5303 + }, + { + "epoch": 0.25827185742458547, + "grad_norm": 1.9675776958465576, + "learning_rate": 3.4779803323887066e-05, + "loss": 0.8228, + "step": 5304 + }, + { + "epoch": 0.2583205512136927, + "grad_norm": 1.740460753440857, + "learning_rate": 3.477767799131682e-05, + "loss": 0.8297, + "step": 5305 + }, + { + "epoch": 0.2583692450027999, + "grad_norm": 3.9583723545074463, + "learning_rate": 3.477555229115049e-05, + "loss": 0.9222, + "step": 5306 + }, + { + "epoch": 0.2584179387919071, + "grad_norm": 1.6283539533615112, + "learning_rate": 3.477342622344094e-05, + "loss": 0.8586, + "step": 5307 + }, + { + "epoch": 0.25846663258101427, + "grad_norm": 1.4067091941833496, + "learning_rate": 3.477129978824108e-05, + "loss": 0.8828, + "step": 5308 + }, + { + "epoch": 0.2585153263701215, + "grad_norm": 1.2624518871307373, + "learning_rate": 3.476917298560378e-05, + "loss": 0.8507, + "step": 5309 + }, + { + "epoch": 0.2585640201592287, + "grad_norm": 7.82004976272583, + "learning_rate": 3.476704581558196e-05, + "loss": 0.861, + "step": 5310 + }, + { + "epoch": 0.2586127139483359, + "grad_norm": 2.0708396434783936, + "learning_rate": 3.476491827822852e-05, + "loss": 0.7791, + "step": 5311 + }, + { + "epoch": 0.2586614077374431, + "grad_norm": 3.2178845405578613, + "learning_rate": 3.476279037359641e-05, + "loss": 0.9049, + "step": 5312 + }, + { + "epoch": 0.2587101015265503, + "grad_norm": 1.4122499227523804, + "learning_rate": 3.4760662101738534e-05, + "loss": 0.8515, + "step": 5313 + }, + { + "epoch": 0.2587587953156575, + "grad_norm": 1.9012833833694458, + "learning_rate": 3.475853346270784e-05, + "loss": 0.9179, + "step": 5314 + }, + { + "epoch": 0.2588074891047647, + "grad_norm": 2.4933106899261475, + "learning_rate": 3.475640445655728e-05, + "loss": 0.8588, + "step": 5315 + }, + { + "epoch": 0.2588561828938719, + "grad_norm": 1.7553081512451172, + "learning_rate": 3.4754275083339814e-05, + "loss": 0.9095, + "step": 5316 + }, + { + "epoch": 0.25890487668297907, + "grad_norm": 1.5901654958724976, + "learning_rate": 3.4752145343108414e-05, + "loss": 0.8991, + "step": 5317 + }, + { + "epoch": 0.2589535704720863, + "grad_norm": 1.643349528312683, + "learning_rate": 3.475001523591605e-05, + "loss": 0.9505, + "step": 5318 + }, + { + "epoch": 0.2590022642611935, + "grad_norm": 2.0386719703674316, + "learning_rate": 3.4747884761815716e-05, + "loss": 0.7766, + "step": 5319 + }, + { + "epoch": 0.2590509580503007, + "grad_norm": 1.6823525428771973, + "learning_rate": 3.47457539208604e-05, + "loss": 0.9029, + "step": 5320 + }, + { + "epoch": 0.25909965183940786, + "grad_norm": 1.3926008939743042, + "learning_rate": 3.474362271310311e-05, + "loss": 0.8501, + "step": 5321 + }, + { + "epoch": 0.2591483456285151, + "grad_norm": 1.5080822706222534, + "learning_rate": 3.474149113859686e-05, + "loss": 0.801, + "step": 5322 + }, + { + "epoch": 0.2591970394176223, + "grad_norm": 1.8227379322052002, + "learning_rate": 3.473935919739468e-05, + "loss": 0.9443, + "step": 5323 + }, + { + "epoch": 0.2592457332067295, + "grad_norm": 1.4470884799957275, + "learning_rate": 3.47372268895496e-05, + "loss": 0.8009, + "step": 5324 + }, + { + "epoch": 0.25929442699583666, + "grad_norm": 0.08319484442472458, + "learning_rate": 3.4735094215114645e-05, + "loss": 0.6137, + "step": 5325 + }, + { + "epoch": 0.25934312078494387, + "grad_norm": 1.6483641862869263, + "learning_rate": 3.473296117414288e-05, + "loss": 0.9183, + "step": 5326 + }, + { + "epoch": 0.2593918145740511, + "grad_norm": 1.6614456176757812, + "learning_rate": 3.473082776668737e-05, + "loss": 0.8511, + "step": 5327 + }, + { + "epoch": 0.2594405083631583, + "grad_norm": 2.029189109802246, + "learning_rate": 3.472869399280117e-05, + "loss": 0.8958, + "step": 5328 + }, + { + "epoch": 0.25948920215226545, + "grad_norm": 2.285594940185547, + "learning_rate": 3.472655985253737e-05, + "loss": 0.8094, + "step": 5329 + }, + { + "epoch": 0.25953789594137266, + "grad_norm": 1.5153751373291016, + "learning_rate": 3.472442534594904e-05, + "loss": 0.9487, + "step": 5330 + }, + { + "epoch": 0.2595865897304799, + "grad_norm": 1.6232469081878662, + "learning_rate": 3.47222904730893e-05, + "loss": 0.8208, + "step": 5331 + }, + { + "epoch": 0.2596352835195871, + "grad_norm": 2.8261032104492188, + "learning_rate": 3.472015523401123e-05, + "loss": 0.8527, + "step": 5332 + }, + { + "epoch": 0.2596839773086943, + "grad_norm": 1.410072922706604, + "learning_rate": 3.4718019628767967e-05, + "loss": 0.935, + "step": 5333 + }, + { + "epoch": 0.25973267109780146, + "grad_norm": 1.6453694105148315, + "learning_rate": 3.471588365741262e-05, + "loss": 0.8889, + "step": 5334 + }, + { + "epoch": 0.25978136488690867, + "grad_norm": 1.4887094497680664, + "learning_rate": 3.471374731999832e-05, + "loss": 0.918, + "step": 5335 + }, + { + "epoch": 0.2598300586760159, + "grad_norm": 2.197188377380371, + "learning_rate": 3.4711610616578215e-05, + "loss": 0.7977, + "step": 5336 + }, + { + "epoch": 0.2598787524651231, + "grad_norm": 1.7407135963439941, + "learning_rate": 3.4709473547205466e-05, + "loss": 0.884, + "step": 5337 + }, + { + "epoch": 0.25992744625423025, + "grad_norm": 1.4714707136154175, + "learning_rate": 3.470733611193321e-05, + "loss": 0.8794, + "step": 5338 + }, + { + "epoch": 0.25997614004333747, + "grad_norm": 1.799352765083313, + "learning_rate": 3.470519831081463e-05, + "loss": 0.8093, + "step": 5339 + }, + { + "epoch": 0.2600248338324447, + "grad_norm": 4.112773895263672, + "learning_rate": 3.47030601439029e-05, + "loss": 0.8605, + "step": 5340 + }, + { + "epoch": 0.2600735276215519, + "grad_norm": 1.7001231908798218, + "learning_rate": 3.470092161125121e-05, + "loss": 0.914, + "step": 5341 + }, + { + "epoch": 0.26012222141065905, + "grad_norm": 1.8523004055023193, + "learning_rate": 3.469878271291275e-05, + "loss": 0.9642, + "step": 5342 + }, + { + "epoch": 0.26017091519976626, + "grad_norm": 1.4798961877822876, + "learning_rate": 3.4696643448940744e-05, + "loss": 0.927, + "step": 5343 + }, + { + "epoch": 0.2602196089888735, + "grad_norm": 2.042161226272583, + "learning_rate": 3.469450381938838e-05, + "loss": 0.8315, + "step": 5344 + }, + { + "epoch": 0.2602683027779807, + "grad_norm": 8.804361343383789, + "learning_rate": 3.4692363824308895e-05, + "loss": 0.8717, + "step": 5345 + }, + { + "epoch": 0.26031699656708784, + "grad_norm": 1.4809894561767578, + "learning_rate": 3.4690223463755524e-05, + "loss": 0.8681, + "step": 5346 + }, + { + "epoch": 0.26036569035619506, + "grad_norm": 2.2789723873138428, + "learning_rate": 3.46880827377815e-05, + "loss": 0.8325, + "step": 5347 + }, + { + "epoch": 0.26041438414530227, + "grad_norm": 2.1972200870513916, + "learning_rate": 3.468594164644009e-05, + "loss": 0.8297, + "step": 5348 + }, + { + "epoch": 0.2604630779344095, + "grad_norm": 1.7659801244735718, + "learning_rate": 3.468380018978453e-05, + "loss": 0.9311, + "step": 5349 + }, + { + "epoch": 0.26051177172351664, + "grad_norm": 2.250352621078491, + "learning_rate": 3.4681658367868105e-05, + "loss": 0.8086, + "step": 5350 + }, + { + "epoch": 0.26056046551262385, + "grad_norm": 1.5017427206039429, + "learning_rate": 3.467951618074409e-05, + "loss": 0.8814, + "step": 5351 + }, + { + "epoch": 0.26060915930173106, + "grad_norm": 1.9361040592193604, + "learning_rate": 3.4677373628465765e-05, + "loss": 0.9882, + "step": 5352 + }, + { + "epoch": 0.2606578530908383, + "grad_norm": 2.5933640003204346, + "learning_rate": 3.467523071108645e-05, + "loss": 0.8776, + "step": 5353 + }, + { + "epoch": 0.2607065468799455, + "grad_norm": 1.690596580505371, + "learning_rate": 3.467308742865942e-05, + "loss": 0.9201, + "step": 5354 + }, + { + "epoch": 0.26075524066905265, + "grad_norm": 8.841075897216797, + "learning_rate": 3.4670943781238e-05, + "loss": 0.9333, + "step": 5355 + }, + { + "epoch": 0.26080393445815986, + "grad_norm": 1.7677135467529297, + "learning_rate": 3.466879976887552e-05, + "loss": 0.9376, + "step": 5356 + }, + { + "epoch": 0.26085262824726707, + "grad_norm": 1.422427773475647, + "learning_rate": 3.4666655391625305e-05, + "loss": 0.8157, + "step": 5357 + }, + { + "epoch": 0.2609013220363743, + "grad_norm": 1.3111506700515747, + "learning_rate": 3.466451064954071e-05, + "loss": 0.8387, + "step": 5358 + }, + { + "epoch": 0.26095001582548144, + "grad_norm": 1.6519176959991455, + "learning_rate": 3.466236554267507e-05, + "loss": 0.9526, + "step": 5359 + }, + { + "epoch": 0.26099870961458865, + "grad_norm": 1.873461365699768, + "learning_rate": 3.466022007108174e-05, + "loss": 0.8766, + "step": 5360 + }, + { + "epoch": 0.26104740340369587, + "grad_norm": 0.0889022946357727, + "learning_rate": 3.4658074234814116e-05, + "loss": 0.6239, + "step": 5361 + }, + { + "epoch": 0.2610960971928031, + "grad_norm": 1.632836103439331, + "learning_rate": 3.465592803392555e-05, + "loss": 0.7993, + "step": 5362 + }, + { + "epoch": 0.26114479098191024, + "grad_norm": 1.4185450077056885, + "learning_rate": 3.465378146846944e-05, + "loss": 0.8212, + "step": 5363 + }, + { + "epoch": 0.26119348477101745, + "grad_norm": 5.606350421905518, + "learning_rate": 3.465163453849918e-05, + "loss": 0.8093, + "step": 5364 + }, + { + "epoch": 0.26124217856012466, + "grad_norm": 1.4228764772415161, + "learning_rate": 3.464948724406817e-05, + "loss": 0.912, + "step": 5365 + }, + { + "epoch": 0.2612908723492319, + "grad_norm": 1.9301632642745972, + "learning_rate": 3.464733958522984e-05, + "loss": 0.8606, + "step": 5366 + }, + { + "epoch": 0.26133956613833903, + "grad_norm": 1.508737564086914, + "learning_rate": 3.464519156203759e-05, + "loss": 0.9117, + "step": 5367 + }, + { + "epoch": 0.26138825992744624, + "grad_norm": 1.2138410806655884, + "learning_rate": 3.464304317454487e-05, + "loss": 0.9926, + "step": 5368 + }, + { + "epoch": 0.26143695371655346, + "grad_norm": 1.7559527158737183, + "learning_rate": 3.4640894422805114e-05, + "loss": 0.8526, + "step": 5369 + }, + { + "epoch": 0.26148564750566067, + "grad_norm": 1.926725149154663, + "learning_rate": 3.463874530687178e-05, + "loss": 0.881, + "step": 5370 + }, + { + "epoch": 0.2615343412947678, + "grad_norm": 1.7501999139785767, + "learning_rate": 3.463659582679832e-05, + "loss": 0.7808, + "step": 5371 + }, + { + "epoch": 0.26158303508387504, + "grad_norm": 1.658724308013916, + "learning_rate": 3.4634445982638204e-05, + "loss": 0.8563, + "step": 5372 + }, + { + "epoch": 0.26163172887298225, + "grad_norm": 1.5194271802902222, + "learning_rate": 3.4632295774444904e-05, + "loss": 0.9323, + "step": 5373 + }, + { + "epoch": 0.26168042266208946, + "grad_norm": 2.0293195247650146, + "learning_rate": 3.463014520227192e-05, + "loss": 0.877, + "step": 5374 + }, + { + "epoch": 0.2617291164511967, + "grad_norm": 1.3919992446899414, + "learning_rate": 3.4627994266172736e-05, + "loss": 0.8613, + "step": 5375 + }, + { + "epoch": 0.26177781024030383, + "grad_norm": 1.3204396963119507, + "learning_rate": 3.462584296620086e-05, + "loss": 0.8386, + "step": 5376 + }, + { + "epoch": 0.26182650402941104, + "grad_norm": 2.0855722427368164, + "learning_rate": 3.462369130240981e-05, + "loss": 0.8401, + "step": 5377 + }, + { + "epoch": 0.26187519781851826, + "grad_norm": 2.00567364692688, + "learning_rate": 3.4621539274853105e-05, + "loss": 0.8947, + "step": 5378 + }, + { + "epoch": 0.26192389160762547, + "grad_norm": 1.53973388671875, + "learning_rate": 3.461938688358427e-05, + "loss": 0.874, + "step": 5379 + }, + { + "epoch": 0.2619725853967326, + "grad_norm": 2.2327518463134766, + "learning_rate": 3.461723412865686e-05, + "loss": 0.9536, + "step": 5380 + }, + { + "epoch": 0.26202127918583984, + "grad_norm": 0.08956944942474365, + "learning_rate": 3.4615081010124424e-05, + "loss": 0.5825, + "step": 5381 + }, + { + "epoch": 0.26206997297494705, + "grad_norm": 1.391662836074829, + "learning_rate": 3.4612927528040504e-05, + "loss": 0.8409, + "step": 5382 + }, + { + "epoch": 0.26211866676405426, + "grad_norm": 1.672426462173462, + "learning_rate": 3.4610773682458684e-05, + "loss": 0.8469, + "step": 5383 + }, + { + "epoch": 0.2621673605531614, + "grad_norm": 1.5104413032531738, + "learning_rate": 3.460861947343254e-05, + "loss": 0.8832, + "step": 5384 + }, + { + "epoch": 0.26221605434226863, + "grad_norm": 1.8571515083312988, + "learning_rate": 3.4606464901015654e-05, + "loss": 0.8489, + "step": 5385 + }, + { + "epoch": 0.26226474813137585, + "grad_norm": 1.3110475540161133, + "learning_rate": 3.460430996526162e-05, + "loss": 0.8921, + "step": 5386 + }, + { + "epoch": 0.26231344192048306, + "grad_norm": 1.2733893394470215, + "learning_rate": 3.460215466622404e-05, + "loss": 0.8533, + "step": 5387 + }, + { + "epoch": 0.2623621357095902, + "grad_norm": 1.6971051692962646, + "learning_rate": 3.459999900395654e-05, + "loss": 1.0083, + "step": 5388 + }, + { + "epoch": 0.26241082949869743, + "grad_norm": 2.7021608352661133, + "learning_rate": 3.4597842978512726e-05, + "loss": 0.7838, + "step": 5389 + }, + { + "epoch": 0.26245952328780464, + "grad_norm": 1.3104157447814941, + "learning_rate": 3.4595686589946244e-05, + "loss": 0.8437, + "step": 5390 + }, + { + "epoch": 0.26250821707691185, + "grad_norm": 1.625125527381897, + "learning_rate": 3.459352983831072e-05, + "loss": 0.8695, + "step": 5391 + }, + { + "epoch": 0.26255691086601907, + "grad_norm": 2.189526319503784, + "learning_rate": 3.459137272365981e-05, + "loss": 0.9108, + "step": 5392 + }, + { + "epoch": 0.2626056046551262, + "grad_norm": 1.6116604804992676, + "learning_rate": 3.4589215246047186e-05, + "loss": 0.8225, + "step": 5393 + }, + { + "epoch": 0.26265429844423344, + "grad_norm": 1.4313548803329468, + "learning_rate": 3.4587057405526485e-05, + "loss": 0.9465, + "step": 5394 + }, + { + "epoch": 0.26270299223334065, + "grad_norm": 1.217131495475769, + "learning_rate": 3.458489920215141e-05, + "loss": 0.7999, + "step": 5395 + }, + { + "epoch": 0.26275168602244786, + "grad_norm": 1.8717087507247925, + "learning_rate": 3.458274063597564e-05, + "loss": 0.8679, + "step": 5396 + }, + { + "epoch": 0.262800379811555, + "grad_norm": 1.5287399291992188, + "learning_rate": 3.458058170705286e-05, + "loss": 0.7665, + "step": 5397 + }, + { + "epoch": 0.26284907360066223, + "grad_norm": 1.4877318143844604, + "learning_rate": 3.457842241543679e-05, + "loss": 0.7975, + "step": 5398 + }, + { + "epoch": 0.26289776738976944, + "grad_norm": 3.2805464267730713, + "learning_rate": 3.457626276118112e-05, + "loss": 0.8656, + "step": 5399 + }, + { + "epoch": 0.26294646117887666, + "grad_norm": 1.2334773540496826, + "learning_rate": 3.45741027443396e-05, + "loss": 0.9715, + "step": 5400 + }, + { + "epoch": 0.2629951549679838, + "grad_norm": 1.4447071552276611, + "learning_rate": 3.457194236496593e-05, + "loss": 0.783, + "step": 5401 + }, + { + "epoch": 0.263043848757091, + "grad_norm": 1.5796126127243042, + "learning_rate": 3.456978162311388e-05, + "loss": 0.8698, + "step": 5402 + }, + { + "epoch": 0.26309254254619824, + "grad_norm": 3.6148452758789062, + "learning_rate": 3.4567620518837176e-05, + "loss": 0.8655, + "step": 5403 + }, + { + "epoch": 0.26314123633530545, + "grad_norm": 2.47087025642395, + "learning_rate": 3.4565459052189584e-05, + "loss": 0.791, + "step": 5404 + }, + { + "epoch": 0.2631899301244126, + "grad_norm": 1.3852711915969849, + "learning_rate": 3.456329722322487e-05, + "loss": 0.8317, + "step": 5405 + }, + { + "epoch": 0.2632386239135198, + "grad_norm": 1.338911771774292, + "learning_rate": 3.4561135031996804e-05, + "loss": 0.8539, + "step": 5406 + }, + { + "epoch": 0.26328731770262703, + "grad_norm": 2.397632122039795, + "learning_rate": 3.4558972478559184e-05, + "loss": 0.89, + "step": 5407 + }, + { + "epoch": 0.26333601149173425, + "grad_norm": 1.4886229038238525, + "learning_rate": 3.4556809562965804e-05, + "loss": 0.9892, + "step": 5408 + }, + { + "epoch": 0.2633847052808414, + "grad_norm": 1.788725733757019, + "learning_rate": 3.455464628527044e-05, + "loss": 0.8566, + "step": 5409 + }, + { + "epoch": 0.2634333990699486, + "grad_norm": 2.018982410430908, + "learning_rate": 3.4552482645526935e-05, + "loss": 0.8431, + "step": 5410 + }, + { + "epoch": 0.2634820928590558, + "grad_norm": 1.7008204460144043, + "learning_rate": 3.4550318643789095e-05, + "loss": 0.9714, + "step": 5411 + }, + { + "epoch": 0.26353078664816304, + "grad_norm": 1.7494478225708008, + "learning_rate": 3.454815428011075e-05, + "loss": 0.9439, + "step": 5412 + }, + { + "epoch": 0.26357948043727025, + "grad_norm": 1.453468680381775, + "learning_rate": 3.454598955454574e-05, + "loss": 0.8484, + "step": 5413 + }, + { + "epoch": 0.2636281742263774, + "grad_norm": 1.5316241979599, + "learning_rate": 3.454382446714792e-05, + "loss": 0.9596, + "step": 5414 + }, + { + "epoch": 0.2636768680154846, + "grad_norm": 1.431597113609314, + "learning_rate": 3.4541659017971135e-05, + "loss": 0.8017, + "step": 5415 + }, + { + "epoch": 0.26372556180459183, + "grad_norm": 2.013871908187866, + "learning_rate": 3.453949320706926e-05, + "loss": 0.8139, + "step": 5416 + }, + { + "epoch": 0.26377425559369905, + "grad_norm": 1.4081007242202759, + "learning_rate": 3.453732703449616e-05, + "loss": 0.8361, + "step": 5417 + }, + { + "epoch": 0.2638229493828062, + "grad_norm": 1.3283381462097168, + "learning_rate": 3.4535160500305734e-05, + "loss": 0.8569, + "step": 5418 + }, + { + "epoch": 0.2638716431719134, + "grad_norm": 1.7479770183563232, + "learning_rate": 3.4532993604551853e-05, + "loss": 0.9213, + "step": 5419 + }, + { + "epoch": 0.26392033696102063, + "grad_norm": 1.2328684329986572, + "learning_rate": 3.453082634728844e-05, + "loss": 0.9075, + "step": 5420 + }, + { + "epoch": 0.26396903075012784, + "grad_norm": 2.3743574619293213, + "learning_rate": 3.452865872856939e-05, + "loss": 0.837, + "step": 5421 + }, + { + "epoch": 0.264017724539235, + "grad_norm": 1.8500454425811768, + "learning_rate": 3.4526490748448634e-05, + "loss": 0.9215, + "step": 5422 + }, + { + "epoch": 0.2640664183283422, + "grad_norm": 1.971989631652832, + "learning_rate": 3.45243224069801e-05, + "loss": 0.8624, + "step": 5423 + }, + { + "epoch": 0.2641151121174494, + "grad_norm": 0.09178344160318375, + "learning_rate": 3.4522153704217715e-05, + "loss": 0.6239, + "step": 5424 + }, + { + "epoch": 0.26416380590655664, + "grad_norm": 2.140650749206543, + "learning_rate": 3.451998464021543e-05, + "loss": 0.8314, + "step": 5425 + }, + { + "epoch": 0.2642124996956638, + "grad_norm": 11.343098640441895, + "learning_rate": 3.451781521502721e-05, + "loss": 0.8084, + "step": 5426 + }, + { + "epoch": 0.264261193484771, + "grad_norm": 1.5919406414031982, + "learning_rate": 3.451564542870701e-05, + "loss": 0.8436, + "step": 5427 + }, + { + "epoch": 0.2643098872738782, + "grad_norm": 1.217739224433899, + "learning_rate": 3.451347528130881e-05, + "loss": 0.9175, + "step": 5428 + }, + { + "epoch": 0.26435858106298543, + "grad_norm": 1.6020811796188354, + "learning_rate": 3.4511304772886584e-05, + "loss": 0.9231, + "step": 5429 + }, + { + "epoch": 0.2644072748520926, + "grad_norm": 0.08858952671289444, + "learning_rate": 3.450913390349433e-05, + "loss": 0.5394, + "step": 5430 + }, + { + "epoch": 0.2644559686411998, + "grad_norm": 1.370850920677185, + "learning_rate": 3.450696267318605e-05, + "loss": 0.8651, + "step": 5431 + }, + { + "epoch": 0.264504662430307, + "grad_norm": 1.4216312170028687, + "learning_rate": 3.4504791082015745e-05, + "loss": 0.7408, + "step": 5432 + }, + { + "epoch": 0.2645533562194142, + "grad_norm": 1.4957929849624634, + "learning_rate": 3.450261913003745e-05, + "loss": 0.8388, + "step": 5433 + }, + { + "epoch": 0.26460205000852144, + "grad_norm": 1.3571230173110962, + "learning_rate": 3.450044681730517e-05, + "loss": 0.8267, + "step": 5434 + }, + { + "epoch": 0.2646507437976286, + "grad_norm": 2.4169576168060303, + "learning_rate": 3.449827414387296e-05, + "loss": 0.79, + "step": 5435 + }, + { + "epoch": 0.2646994375867358, + "grad_norm": 1.5257872343063354, + "learning_rate": 3.449610110979486e-05, + "loss": 0.9606, + "step": 5436 + }, + { + "epoch": 0.264748131375843, + "grad_norm": 1.4200222492218018, + "learning_rate": 3.449392771512492e-05, + "loss": 0.8532, + "step": 5437 + }, + { + "epoch": 0.26479682516495023, + "grad_norm": 1.4027833938598633, + "learning_rate": 3.449175395991721e-05, + "loss": 0.8872, + "step": 5438 + }, + { + "epoch": 0.2648455189540574, + "grad_norm": 1.4832649230957031, + "learning_rate": 3.448957984422579e-05, + "loss": 0.8229, + "step": 5439 + }, + { + "epoch": 0.2648942127431646, + "grad_norm": 2.5146265029907227, + "learning_rate": 3.448740536810476e-05, + "loss": 0.8249, + "step": 5440 + }, + { + "epoch": 0.2649429065322718, + "grad_norm": 2.279512643814087, + "learning_rate": 3.44852305316082e-05, + "loss": 0.9774, + "step": 5441 + }, + { + "epoch": 0.26499160032137903, + "grad_norm": 1.7998931407928467, + "learning_rate": 3.4483055334790205e-05, + "loss": 0.9715, + "step": 5442 + }, + { + "epoch": 0.2650402941104862, + "grad_norm": 1.3497869968414307, + "learning_rate": 3.448087977770489e-05, + "loss": 0.8121, + "step": 5443 + }, + { + "epoch": 0.2650889878995934, + "grad_norm": 1.3160725831985474, + "learning_rate": 3.447870386040637e-05, + "loss": 0.9362, + "step": 5444 + }, + { + "epoch": 0.2651376816887006, + "grad_norm": 1.2324904203414917, + "learning_rate": 3.4476527582948764e-05, + "loss": 0.9596, + "step": 5445 + }, + { + "epoch": 0.2651863754778078, + "grad_norm": 1.511333703994751, + "learning_rate": 3.447435094538623e-05, + "loss": 0.896, + "step": 5446 + }, + { + "epoch": 0.265235069266915, + "grad_norm": 1.797391414642334, + "learning_rate": 3.4472173947772884e-05, + "loss": 0.6959, + "step": 5447 + }, + { + "epoch": 0.2652837630560222, + "grad_norm": 1.5826585292816162, + "learning_rate": 3.44699965901629e-05, + "loss": 0.8476, + "step": 5448 + }, + { + "epoch": 0.2653324568451294, + "grad_norm": 1.4857629537582397, + "learning_rate": 3.4467818872610427e-05, + "loss": 0.8703, + "step": 5449 + }, + { + "epoch": 0.2653811506342366, + "grad_norm": 1.6342229843139648, + "learning_rate": 3.446564079516964e-05, + "loss": 0.84, + "step": 5450 + }, + { + "epoch": 0.2654298444233438, + "grad_norm": 1.6262859106063843, + "learning_rate": 3.446346235789472e-05, + "loss": 0.8972, + "step": 5451 + }, + { + "epoch": 0.265478538212451, + "grad_norm": 2.6014397144317627, + "learning_rate": 3.4461283560839856e-05, + "loss": 0.8748, + "step": 5452 + }, + { + "epoch": 0.2655272320015582, + "grad_norm": 1.814479947090149, + "learning_rate": 3.445910440405924e-05, + "loss": 0.8631, + "step": 5453 + }, + { + "epoch": 0.2655759257906654, + "grad_norm": 1.6192600727081299, + "learning_rate": 3.445692488760709e-05, + "loss": 0.8589, + "step": 5454 + }, + { + "epoch": 0.2656246195797726, + "grad_norm": 2.2724359035491943, + "learning_rate": 3.4454745011537616e-05, + "loss": 0.8506, + "step": 5455 + }, + { + "epoch": 0.2656733133688798, + "grad_norm": 1.5215917825698853, + "learning_rate": 3.4452564775905034e-05, + "loss": 0.8208, + "step": 5456 + }, + { + "epoch": 0.265722007157987, + "grad_norm": 0.09256207942962646, + "learning_rate": 3.445038418076359e-05, + "loss": 0.6377, + "step": 5457 + }, + { + "epoch": 0.2657707009470942, + "grad_norm": 5.607672691345215, + "learning_rate": 3.444820322616752e-05, + "loss": 0.8659, + "step": 5458 + }, + { + "epoch": 0.2658193947362014, + "grad_norm": 1.4579463005065918, + "learning_rate": 3.444602191217108e-05, + "loss": 0.8826, + "step": 5459 + }, + { + "epoch": 0.2658680885253086, + "grad_norm": 2.1090192794799805, + "learning_rate": 3.444384023882853e-05, + "loss": 0.9048, + "step": 5460 + }, + { + "epoch": 0.2659167823144158, + "grad_norm": 1.8446370363235474, + "learning_rate": 3.4441658206194136e-05, + "loss": 0.8201, + "step": 5461 + }, + { + "epoch": 0.265965476103523, + "grad_norm": 1.5180784463882446, + "learning_rate": 3.443947581432218e-05, + "loss": 0.8755, + "step": 5462 + }, + { + "epoch": 0.2660141698926302, + "grad_norm": 2.2598323822021484, + "learning_rate": 3.443729306326694e-05, + "loss": 0.8891, + "step": 5463 + }, + { + "epoch": 0.26606286368173737, + "grad_norm": 1.4329397678375244, + "learning_rate": 3.443510995308272e-05, + "loss": 0.9194, + "step": 5464 + }, + { + "epoch": 0.2661115574708446, + "grad_norm": 1.3671637773513794, + "learning_rate": 3.443292648382382e-05, + "loss": 0.799, + "step": 5465 + }, + { + "epoch": 0.2661602512599518, + "grad_norm": 1.3640490770339966, + "learning_rate": 3.4430742655544565e-05, + "loss": 0.8854, + "step": 5466 + }, + { + "epoch": 0.266208945049059, + "grad_norm": 2.1035103797912598, + "learning_rate": 3.442855846829927e-05, + "loss": 0.8734, + "step": 5467 + }, + { + "epoch": 0.26625763883816617, + "grad_norm": 1.6021759510040283, + "learning_rate": 3.4426373922142265e-05, + "loss": 0.8627, + "step": 5468 + }, + { + "epoch": 0.2663063326272734, + "grad_norm": 1.4355974197387695, + "learning_rate": 3.44241890171279e-05, + "loss": 0.8245, + "step": 5469 + }, + { + "epoch": 0.2663550264163806, + "grad_norm": 1.4678987264633179, + "learning_rate": 3.4422003753310505e-05, + "loss": 0.8212, + "step": 5470 + }, + { + "epoch": 0.2664037202054878, + "grad_norm": 1.3747467994689941, + "learning_rate": 3.441981813074446e-05, + "loss": 0.9691, + "step": 5471 + }, + { + "epoch": 0.266452413994595, + "grad_norm": 1.4572744369506836, + "learning_rate": 3.441763214948413e-05, + "loss": 0.7702, + "step": 5472 + }, + { + "epoch": 0.2665011077837022, + "grad_norm": 1.4890387058258057, + "learning_rate": 3.441544580958388e-05, + "loss": 0.8557, + "step": 5473 + }, + { + "epoch": 0.2665498015728094, + "grad_norm": 1.513120412826538, + "learning_rate": 3.4413259111098094e-05, + "loss": 0.8083, + "step": 5474 + }, + { + "epoch": 0.2665984953619166, + "grad_norm": 0.09246379137039185, + "learning_rate": 3.441107205408118e-05, + "loss": 0.6895, + "step": 5475 + }, + { + "epoch": 0.2666471891510238, + "grad_norm": 2.1307079792022705, + "learning_rate": 3.4408884638587536e-05, + "loss": 0.8416, + "step": 5476 + }, + { + "epoch": 0.26669588294013097, + "grad_norm": 0.2145867645740509, + "learning_rate": 3.440669686467157e-05, + "loss": 0.5657, + "step": 5477 + }, + { + "epoch": 0.2667445767292382, + "grad_norm": 1.5755739212036133, + "learning_rate": 3.440450873238771e-05, + "loss": 0.8692, + "step": 5478 + }, + { + "epoch": 0.2667932705183454, + "grad_norm": 1.455088496208191, + "learning_rate": 3.4402320241790375e-05, + "loss": 0.918, + "step": 5479 + }, + { + "epoch": 0.2668419643074526, + "grad_norm": 2.0905346870422363, + "learning_rate": 3.440013139293402e-05, + "loss": 0.8542, + "step": 5480 + }, + { + "epoch": 0.26689065809655976, + "grad_norm": 1.4565232992172241, + "learning_rate": 3.439794218587308e-05, + "loss": 0.8441, + "step": 5481 + }, + { + "epoch": 0.266939351885667, + "grad_norm": 1.5949070453643799, + "learning_rate": 3.439575262066201e-05, + "loss": 0.9619, + "step": 5482 + }, + { + "epoch": 0.2669880456747742, + "grad_norm": 1.9257302284240723, + "learning_rate": 3.4393562697355285e-05, + "loss": 0.8684, + "step": 5483 + }, + { + "epoch": 0.2670367394638814, + "grad_norm": 0.09501426666975021, + "learning_rate": 3.439137241600738e-05, + "loss": 0.6566, + "step": 5484 + }, + { + "epoch": 0.26708543325298856, + "grad_norm": 2.277876138687134, + "learning_rate": 3.438918177667277e-05, + "loss": 0.8398, + "step": 5485 + }, + { + "epoch": 0.26713412704209577, + "grad_norm": 1.949620008468628, + "learning_rate": 3.4386990779405946e-05, + "loss": 0.8092, + "step": 5486 + }, + { + "epoch": 0.267182820831203, + "grad_norm": 1.3544822931289673, + "learning_rate": 3.4384799424261416e-05, + "loss": 0.875, + "step": 5487 + }, + { + "epoch": 0.2672315146203102, + "grad_norm": 2.598083734512329, + "learning_rate": 3.43826077112937e-05, + "loss": 0.8874, + "step": 5488 + }, + { + "epoch": 0.26728020840941735, + "grad_norm": 1.8317325115203857, + "learning_rate": 3.43804156405573e-05, + "loss": 1.0123, + "step": 5489 + }, + { + "epoch": 0.26732890219852457, + "grad_norm": 0.09783480316400528, + "learning_rate": 3.437822321210674e-05, + "loss": 0.6391, + "step": 5490 + }, + { + "epoch": 0.2673775959876318, + "grad_norm": 2.2124838829040527, + "learning_rate": 3.437603042599658e-05, + "loss": 0.9111, + "step": 5491 + }, + { + "epoch": 0.267426289776739, + "grad_norm": 1.9581559896469116, + "learning_rate": 3.437383728228135e-05, + "loss": 0.9025, + "step": 5492 + }, + { + "epoch": 0.2674749835658462, + "grad_norm": 1.904869794845581, + "learning_rate": 3.43716437810156e-05, + "loss": 0.9277, + "step": 5493 + }, + { + "epoch": 0.26752367735495336, + "grad_norm": 1.603618860244751, + "learning_rate": 3.436944992225391e-05, + "loss": 0.9473, + "step": 5494 + }, + { + "epoch": 0.2675723711440606, + "grad_norm": 2.913524866104126, + "learning_rate": 3.4367255706050834e-05, + "loss": 0.8694, + "step": 5495 + }, + { + "epoch": 0.2676210649331678, + "grad_norm": 1.6035761833190918, + "learning_rate": 3.4365061132460965e-05, + "loss": 0.7472, + "step": 5496 + }, + { + "epoch": 0.267669758722275, + "grad_norm": 2.065477132797241, + "learning_rate": 3.436286620153889e-05, + "loss": 0.916, + "step": 5497 + }, + { + "epoch": 0.26771845251138215, + "grad_norm": 1.8229188919067383, + "learning_rate": 3.4360670913339204e-05, + "loss": 0.8397, + "step": 5498 + }, + { + "epoch": 0.26776714630048937, + "grad_norm": 1.63886559009552, + "learning_rate": 3.435847526791653e-05, + "loss": 0.8764, + "step": 5499 + }, + { + "epoch": 0.2678158400895966, + "grad_norm": 1.7464985847473145, + "learning_rate": 3.435627926532547e-05, + "loss": 0.8375, + "step": 5500 + }, + { + "epoch": 0.2678645338787038, + "grad_norm": 1.6543458700180054, + "learning_rate": 3.435408290562066e-05, + "loss": 0.8255, + "step": 5501 + }, + { + "epoch": 0.26791322766781095, + "grad_norm": 1.3141112327575684, + "learning_rate": 3.435188618885672e-05, + "loss": 0.9129, + "step": 5502 + }, + { + "epoch": 0.26796192145691816, + "grad_norm": 1.7689331769943237, + "learning_rate": 3.434968911508831e-05, + "loss": 0.8995, + "step": 5503 + }, + { + "epoch": 0.2680106152460254, + "grad_norm": 1.733655571937561, + "learning_rate": 3.4347491684370066e-05, + "loss": 0.9613, + "step": 5504 + }, + { + "epoch": 0.2680593090351326, + "grad_norm": 1.473551869392395, + "learning_rate": 3.434529389675667e-05, + "loss": 0.8237, + "step": 5505 + }, + { + "epoch": 0.26810800282423974, + "grad_norm": 1.7543355226516724, + "learning_rate": 3.434309575230277e-05, + "loss": 0.7944, + "step": 5506 + }, + { + "epoch": 0.26815669661334696, + "grad_norm": 1.3920776844024658, + "learning_rate": 3.434089725106306e-05, + "loss": 0.9028, + "step": 5507 + }, + { + "epoch": 0.26820539040245417, + "grad_norm": 1.6883831024169922, + "learning_rate": 3.433869839309222e-05, + "loss": 0.9014, + "step": 5508 + }, + { + "epoch": 0.2682540841915614, + "grad_norm": 1.2942240238189697, + "learning_rate": 3.4336499178444956e-05, + "loss": 0.9639, + "step": 5509 + }, + { + "epoch": 0.26830277798066854, + "grad_norm": 1.8691563606262207, + "learning_rate": 3.433429960717596e-05, + "loss": 0.822, + "step": 5510 + }, + { + "epoch": 0.26835147176977575, + "grad_norm": 1.6076997518539429, + "learning_rate": 3.433209967933996e-05, + "loss": 0.837, + "step": 5511 + }, + { + "epoch": 0.26840016555888296, + "grad_norm": 2.3787200450897217, + "learning_rate": 3.432989939499168e-05, + "loss": 0.8575, + "step": 5512 + }, + { + "epoch": 0.2684488593479902, + "grad_norm": 2.4989173412323, + "learning_rate": 3.432769875418583e-05, + "loss": 0.8556, + "step": 5513 + }, + { + "epoch": 0.2684975531370974, + "grad_norm": 1.7138357162475586, + "learning_rate": 3.432549775697718e-05, + "loss": 0.9391, + "step": 5514 + }, + { + "epoch": 0.26854624692620455, + "grad_norm": 1.6661655902862549, + "learning_rate": 3.432329640342046e-05, + "loss": 0.8835, + "step": 5515 + }, + { + "epoch": 0.26859494071531176, + "grad_norm": 1.4647046327590942, + "learning_rate": 3.432109469357044e-05, + "loss": 0.8851, + "step": 5516 + }, + { + "epoch": 0.26864363450441897, + "grad_norm": 3.419440269470215, + "learning_rate": 3.431889262748188e-05, + "loss": 0.8002, + "step": 5517 + }, + { + "epoch": 0.2686923282935262, + "grad_norm": 1.2864264249801636, + "learning_rate": 3.431669020520956e-05, + "loss": 0.8466, + "step": 5518 + }, + { + "epoch": 0.26874102208263334, + "grad_norm": 1.7179484367370605, + "learning_rate": 3.431448742680828e-05, + "loss": 0.7502, + "step": 5519 + }, + { + "epoch": 0.26878971587174055, + "grad_norm": 1.7686936855316162, + "learning_rate": 3.4312284292332805e-05, + "loss": 0.9382, + "step": 5520 + }, + { + "epoch": 0.26883840966084777, + "grad_norm": 2.165844678878784, + "learning_rate": 3.431008080183795e-05, + "loss": 0.8248, + "step": 5521 + }, + { + "epoch": 0.268887103449955, + "grad_norm": 1.784669280052185, + "learning_rate": 3.430787695537854e-05, + "loss": 0.8666, + "step": 5522 + }, + { + "epoch": 0.26893579723906214, + "grad_norm": 1.6039961576461792, + "learning_rate": 3.4305672753009386e-05, + "loss": 0.851, + "step": 5523 + }, + { + "epoch": 0.26898449102816935, + "grad_norm": 5.542195796966553, + "learning_rate": 3.4303468194785316e-05, + "loss": 0.8038, + "step": 5524 + }, + { + "epoch": 0.26903318481727656, + "grad_norm": 1.8479530811309814, + "learning_rate": 3.430126328076117e-05, + "loss": 0.8424, + "step": 5525 + }, + { + "epoch": 0.2690818786063838, + "grad_norm": 1.633669137954712, + "learning_rate": 3.4299058010991794e-05, + "loss": 0.8821, + "step": 5526 + }, + { + "epoch": 0.26913057239549093, + "grad_norm": 1.3210973739624023, + "learning_rate": 3.429685238553205e-05, + "loss": 0.9293, + "step": 5527 + }, + { + "epoch": 0.26917926618459814, + "grad_norm": 2.2887158393859863, + "learning_rate": 3.42946464044368e-05, + "loss": 0.932, + "step": 5528 + }, + { + "epoch": 0.26922795997370536, + "grad_norm": 2.5928330421447754, + "learning_rate": 3.429244006776091e-05, + "loss": 0.8524, + "step": 5529 + }, + { + "epoch": 0.26927665376281257, + "grad_norm": 1.873937726020813, + "learning_rate": 3.4290233375559273e-05, + "loss": 0.881, + "step": 5530 + }, + { + "epoch": 0.2693253475519198, + "grad_norm": 1.4712387323379517, + "learning_rate": 3.4288026327886785e-05, + "loss": 0.8819, + "step": 5531 + }, + { + "epoch": 0.26937404134102694, + "grad_norm": 2.400489568710327, + "learning_rate": 3.4285818924798336e-05, + "loss": 0.9056, + "step": 5532 + }, + { + "epoch": 0.26942273513013415, + "grad_norm": 1.65860915184021, + "learning_rate": 3.428361116634883e-05, + "loss": 0.8786, + "step": 5533 + }, + { + "epoch": 0.26947142891924136, + "grad_norm": 3.115673065185547, + "learning_rate": 3.4281403052593206e-05, + "loss": 0.9254, + "step": 5534 + }, + { + "epoch": 0.2695201227083486, + "grad_norm": 3.0832369327545166, + "learning_rate": 3.4279194583586374e-05, + "loss": 0.8813, + "step": 5535 + }, + { + "epoch": 0.26956881649745573, + "grad_norm": 1.5324885845184326, + "learning_rate": 3.4276985759383275e-05, + "loss": 0.7911, + "step": 5536 + }, + { + "epoch": 0.26961751028656294, + "grad_norm": 1.7436387538909912, + "learning_rate": 3.427477658003885e-05, + "loss": 0.9751, + "step": 5537 + }, + { + "epoch": 0.26966620407567016, + "grad_norm": 1.4421707391738892, + "learning_rate": 3.427256704560806e-05, + "loss": 0.8898, + "step": 5538 + }, + { + "epoch": 0.26971489786477737, + "grad_norm": 1.5081608295440674, + "learning_rate": 3.427035715614587e-05, + "loss": 0.8918, + "step": 5539 + }, + { + "epoch": 0.2697635916538845, + "grad_norm": 1.7613410949707031, + "learning_rate": 3.426814691170723e-05, + "loss": 0.9208, + "step": 5540 + }, + { + "epoch": 0.26981228544299174, + "grad_norm": 1.4517093896865845, + "learning_rate": 3.426593631234714e-05, + "loss": 0.8958, + "step": 5541 + }, + { + "epoch": 0.26986097923209895, + "grad_norm": 1.8609200716018677, + "learning_rate": 3.426372535812059e-05, + "loss": 0.9093, + "step": 5542 + }, + { + "epoch": 0.26990967302120616, + "grad_norm": 1.644770860671997, + "learning_rate": 3.4261514049082566e-05, + "loss": 0.9672, + "step": 5543 + }, + { + "epoch": 0.2699583668103133, + "grad_norm": 1.311665654182434, + "learning_rate": 3.425930238528808e-05, + "loss": 0.9426, + "step": 5544 + }, + { + "epoch": 0.27000706059942053, + "grad_norm": 1.6386561393737793, + "learning_rate": 3.425709036679215e-05, + "loss": 0.8936, + "step": 5545 + }, + { + "epoch": 0.27005575438852775, + "grad_norm": 1.746039867401123, + "learning_rate": 3.425487799364979e-05, + "loss": 0.8061, + "step": 5546 + }, + { + "epoch": 0.27010444817763496, + "grad_norm": 2.1278347969055176, + "learning_rate": 3.425266526591605e-05, + "loss": 0.9001, + "step": 5547 + }, + { + "epoch": 0.2701531419667421, + "grad_norm": 1.8273684978485107, + "learning_rate": 3.425045218364595e-05, + "loss": 0.905, + "step": 5548 + }, + { + "epoch": 0.27020183575584933, + "grad_norm": 1.5104343891143799, + "learning_rate": 3.424823874689456e-05, + "loss": 0.8765, + "step": 5549 + }, + { + "epoch": 0.27025052954495654, + "grad_norm": 1.6262284517288208, + "learning_rate": 3.424602495571693e-05, + "loss": 0.929, + "step": 5550 + }, + { + "epoch": 0.27029922333406375, + "grad_norm": 1.3021985292434692, + "learning_rate": 3.4243810810168135e-05, + "loss": 0.8478, + "step": 5551 + }, + { + "epoch": 0.27034791712317097, + "grad_norm": 1.8221874237060547, + "learning_rate": 3.4241596310303246e-05, + "loss": 0.7944, + "step": 5552 + }, + { + "epoch": 0.2703966109122781, + "grad_norm": 1.2587729692459106, + "learning_rate": 3.423938145617735e-05, + "loss": 0.8807, + "step": 5553 + }, + { + "epoch": 0.27044530470138534, + "grad_norm": 1.7282265424728394, + "learning_rate": 3.423716624784554e-05, + "loss": 0.8436, + "step": 5554 + }, + { + "epoch": 0.27049399849049255, + "grad_norm": 1.5417158603668213, + "learning_rate": 3.423495068536292e-05, + "loss": 0.8862, + "step": 5555 + }, + { + "epoch": 0.27054269227959976, + "grad_norm": 2.065176010131836, + "learning_rate": 3.423273476878461e-05, + "loss": 0.9185, + "step": 5556 + }, + { + "epoch": 0.2705913860687069, + "grad_norm": 1.3576852083206177, + "learning_rate": 3.423051849816572e-05, + "loss": 0.8277, + "step": 5557 + }, + { + "epoch": 0.27064007985781413, + "grad_norm": 1.858152985572815, + "learning_rate": 3.4228301873561384e-05, + "loss": 0.9175, + "step": 5558 + }, + { + "epoch": 0.27068877364692134, + "grad_norm": 1.860136866569519, + "learning_rate": 3.4226084895026746e-05, + "loss": 0.8521, + "step": 5559 + }, + { + "epoch": 0.27073746743602856, + "grad_norm": 1.7789051532745361, + "learning_rate": 3.422386756261694e-05, + "loss": 1.0152, + "step": 5560 + }, + { + "epoch": 0.2707861612251357, + "grad_norm": 1.3896726369857788, + "learning_rate": 3.422164987638714e-05, + "loss": 0.7592, + "step": 5561 + }, + { + "epoch": 0.2708348550142429, + "grad_norm": 1.564003825187683, + "learning_rate": 3.4219431836392504e-05, + "loss": 0.8595, + "step": 5562 + }, + { + "epoch": 0.27088354880335014, + "grad_norm": 1.825620412826538, + "learning_rate": 3.4217213442688194e-05, + "loss": 0.9361, + "step": 5563 + }, + { + "epoch": 0.27093224259245735, + "grad_norm": 2.2690975666046143, + "learning_rate": 3.4214994695329415e-05, + "loss": 0.9075, + "step": 5564 + }, + { + "epoch": 0.2709809363815645, + "grad_norm": 1.2442076206207275, + "learning_rate": 3.4212775594371344e-05, + "loss": 0.8659, + "step": 5565 + }, + { + "epoch": 0.2710296301706717, + "grad_norm": 1.7241023778915405, + "learning_rate": 3.421055613986918e-05, + "loss": 0.9049, + "step": 5566 + }, + { + "epoch": 0.27107832395977893, + "grad_norm": 1.617288589477539, + "learning_rate": 3.420833633187814e-05, + "loss": 0.9457, + "step": 5567 + }, + { + "epoch": 0.27112701774888615, + "grad_norm": 1.5002846717834473, + "learning_rate": 3.4206116170453436e-05, + "loss": 0.8061, + "step": 5568 + }, + { + "epoch": 0.2711757115379933, + "grad_norm": 1.4878264665603638, + "learning_rate": 3.42038956556503e-05, + "loss": 0.8764, + "step": 5569 + }, + { + "epoch": 0.2712244053271005, + "grad_norm": 1.7449982166290283, + "learning_rate": 3.420167478752397e-05, + "loss": 0.8939, + "step": 5570 + }, + { + "epoch": 0.2712730991162077, + "grad_norm": 1.6280368566513062, + "learning_rate": 3.419945356612967e-05, + "loss": 0.9434, + "step": 5571 + }, + { + "epoch": 0.27132179290531494, + "grad_norm": 0.08672907203435898, + "learning_rate": 3.419723199152268e-05, + "loss": 0.5714, + "step": 5572 + }, + { + "epoch": 0.27137048669442215, + "grad_norm": 3.53439998626709, + "learning_rate": 3.419501006375824e-05, + "loss": 0.8556, + "step": 5573 + }, + { + "epoch": 0.2714191804835293, + "grad_norm": 1.6381263732910156, + "learning_rate": 3.4192787782891635e-05, + "loss": 0.9096, + "step": 5574 + }, + { + "epoch": 0.2714678742726365, + "grad_norm": 1.5118446350097656, + "learning_rate": 3.419056514897814e-05, + "loss": 0.8996, + "step": 5575 + }, + { + "epoch": 0.27151656806174373, + "grad_norm": 1.3900797367095947, + "learning_rate": 3.418834216207304e-05, + "loss": 0.9233, + "step": 5576 + }, + { + "epoch": 0.27156526185085095, + "grad_norm": 1.6933374404907227, + "learning_rate": 3.4186118822231643e-05, + "loss": 0.8948, + "step": 5577 + }, + { + "epoch": 0.2716139556399581, + "grad_norm": 1.5964608192443848, + "learning_rate": 3.418389512950924e-05, + "loss": 0.9311, + "step": 5578 + }, + { + "epoch": 0.2716626494290653, + "grad_norm": 1.8542687892913818, + "learning_rate": 3.418167108396116e-05, + "loss": 0.8566, + "step": 5579 + }, + { + "epoch": 0.27171134321817253, + "grad_norm": 1.3137741088867188, + "learning_rate": 3.417944668564271e-05, + "loss": 0.8101, + "step": 5580 + }, + { + "epoch": 0.27176003700727974, + "grad_norm": 1.408619999885559, + "learning_rate": 3.4177221934609235e-05, + "loss": 0.947, + "step": 5581 + }, + { + "epoch": 0.2718087307963869, + "grad_norm": 1.8230513334274292, + "learning_rate": 3.417499683091607e-05, + "loss": 0.9212, + "step": 5582 + }, + { + "epoch": 0.2718574245854941, + "grad_norm": 1.5822267532348633, + "learning_rate": 3.417277137461857e-05, + "loss": 0.8768, + "step": 5583 + }, + { + "epoch": 0.2719061183746013, + "grad_norm": 1.3550375699996948, + "learning_rate": 3.417054556577209e-05, + "loss": 0.8066, + "step": 5584 + }, + { + "epoch": 0.27195481216370854, + "grad_norm": 0.08513061702251434, + "learning_rate": 3.4168319404431996e-05, + "loss": 0.5402, + "step": 5585 + }, + { + "epoch": 0.2720035059528157, + "grad_norm": 1.624119520187378, + "learning_rate": 3.4166092890653656e-05, + "loss": 0.9507, + "step": 5586 + }, + { + "epoch": 0.2720521997419229, + "grad_norm": 1.2415469884872437, + "learning_rate": 3.416386602449247e-05, + "loss": 0.8197, + "step": 5587 + }, + { + "epoch": 0.2721008935310301, + "grad_norm": 1.7845089435577393, + "learning_rate": 3.416163880600383e-05, + "loss": 0.8843, + "step": 5588 + }, + { + "epoch": 0.27214958732013733, + "grad_norm": 1.5340558290481567, + "learning_rate": 3.415941123524313e-05, + "loss": 0.9259, + "step": 5589 + }, + { + "epoch": 0.2721982811092445, + "grad_norm": 1.547601342201233, + "learning_rate": 3.415718331226578e-05, + "loss": 0.9061, + "step": 5590 + }, + { + "epoch": 0.2722469748983517, + "grad_norm": 2.0185375213623047, + "learning_rate": 3.4154955037127214e-05, + "loss": 0.804, + "step": 5591 + }, + { + "epoch": 0.2722956686874589, + "grad_norm": 1.8894652128219604, + "learning_rate": 3.415272640988285e-05, + "loss": 0.806, + "step": 5592 + }, + { + "epoch": 0.2723443624765661, + "grad_norm": 1.459290862083435, + "learning_rate": 3.415049743058812e-05, + "loss": 0.9094, + "step": 5593 + }, + { + "epoch": 0.27239305626567334, + "grad_norm": 0.09725281596183777, + "learning_rate": 3.414826809929848e-05, + "loss": 0.6537, + "step": 5594 + }, + { + "epoch": 0.2724417500547805, + "grad_norm": 3.254255771636963, + "learning_rate": 3.414603841606938e-05, + "loss": 0.927, + "step": 5595 + }, + { + "epoch": 0.2724904438438877, + "grad_norm": 1.8818089962005615, + "learning_rate": 3.414380838095628e-05, + "loss": 0.8402, + "step": 5596 + }, + { + "epoch": 0.2725391376329949, + "grad_norm": 1.139886498451233, + "learning_rate": 3.4141577994014653e-05, + "loss": 0.9018, + "step": 5597 + }, + { + "epoch": 0.27258783142210213, + "grad_norm": 1.524448037147522, + "learning_rate": 3.41393472553e-05, + "loss": 0.8473, + "step": 5598 + }, + { + "epoch": 0.2726365252112093, + "grad_norm": 1.520925760269165, + "learning_rate": 3.4137116164867786e-05, + "loss": 0.906, + "step": 5599 + }, + { + "epoch": 0.2726852190003165, + "grad_norm": 1.52205491065979, + "learning_rate": 3.413488472277352e-05, + "loss": 0.9294, + "step": 5600 + }, + { + "epoch": 0.2727339127894237, + "grad_norm": 2.2202625274658203, + "learning_rate": 3.4132652929072704e-05, + "loss": 0.8714, + "step": 5601 + }, + { + "epoch": 0.27278260657853093, + "grad_norm": 1.9840425252914429, + "learning_rate": 3.413042078382086e-05, + "loss": 0.8169, + "step": 5602 + }, + { + "epoch": 0.2728313003676381, + "grad_norm": 1.5182623863220215, + "learning_rate": 3.412818828707351e-05, + "loss": 0.8148, + "step": 5603 + }, + { + "epoch": 0.2728799941567453, + "grad_norm": 1.2206804752349854, + "learning_rate": 3.412595543888619e-05, + "loss": 0.8706, + "step": 5604 + }, + { + "epoch": 0.2729286879458525, + "grad_norm": 1.4928019046783447, + "learning_rate": 3.412372223931444e-05, + "loss": 0.9323, + "step": 5605 + }, + { + "epoch": 0.2729773817349597, + "grad_norm": 1.3792803287506104, + "learning_rate": 3.412148868841381e-05, + "loss": 0.9299, + "step": 5606 + }, + { + "epoch": 0.2730260755240669, + "grad_norm": 1.4774061441421509, + "learning_rate": 3.4119254786239866e-05, + "loss": 0.8922, + "step": 5607 + }, + { + "epoch": 0.2730747693131741, + "grad_norm": 1.404848337173462, + "learning_rate": 3.4117020532848166e-05, + "loss": 0.8915, + "step": 5608 + }, + { + "epoch": 0.2731234631022813, + "grad_norm": 1.8731118440628052, + "learning_rate": 3.41147859282943e-05, + "loss": 0.9139, + "step": 5609 + }, + { + "epoch": 0.2731721568913885, + "grad_norm": 1.3804775476455688, + "learning_rate": 3.411255097263384e-05, + "loss": 0.8825, + "step": 5610 + }, + { + "epoch": 0.27322085068049573, + "grad_norm": 2.9959561824798584, + "learning_rate": 3.411031566592239e-05, + "loss": 0.8829, + "step": 5611 + }, + { + "epoch": 0.2732695444696029, + "grad_norm": 1.2051410675048828, + "learning_rate": 3.410808000821555e-05, + "loss": 0.9728, + "step": 5612 + }, + { + "epoch": 0.2733182382587101, + "grad_norm": 1.3052421808242798, + "learning_rate": 3.4105843999568936e-05, + "loss": 0.8731, + "step": 5613 + }, + { + "epoch": 0.2733669320478173, + "grad_norm": 2.77514910697937, + "learning_rate": 3.410360764003817e-05, + "loss": 0.8336, + "step": 5614 + }, + { + "epoch": 0.2734156258369245, + "grad_norm": 1.423567533493042, + "learning_rate": 3.410137092967887e-05, + "loss": 0.7867, + "step": 5615 + }, + { + "epoch": 0.2734643196260317, + "grad_norm": 1.4503278732299805, + "learning_rate": 3.409913386854669e-05, + "loss": 0.8711, + "step": 5616 + }, + { + "epoch": 0.2735130134151389, + "grad_norm": 1.3622188568115234, + "learning_rate": 3.409689645669726e-05, + "loss": 0.869, + "step": 5617 + }, + { + "epoch": 0.2735617072042461, + "grad_norm": 0.085499607026577, + "learning_rate": 3.4094658694186254e-05, + "loss": 0.6045, + "step": 5618 + }, + { + "epoch": 0.2736104009933533, + "grad_norm": 1.6754189729690552, + "learning_rate": 3.4092420581069325e-05, + "loss": 0.8492, + "step": 5619 + }, + { + "epoch": 0.2736590947824605, + "grad_norm": 2.161893129348755, + "learning_rate": 3.4090182117402146e-05, + "loss": 0.8797, + "step": 5620 + }, + { + "epoch": 0.2737077885715677, + "grad_norm": 1.728068232536316, + "learning_rate": 3.4087943303240404e-05, + "loss": 0.8959, + "step": 5621 + }, + { + "epoch": 0.2737564823606749, + "grad_norm": 1.5825968980789185, + "learning_rate": 3.408570413863979e-05, + "loss": 0.8972, + "step": 5622 + }, + { + "epoch": 0.2738051761497821, + "grad_norm": 1.4223840236663818, + "learning_rate": 3.4083464623656e-05, + "loss": 0.9164, + "step": 5623 + }, + { + "epoch": 0.27385386993888927, + "grad_norm": 1.4740999937057495, + "learning_rate": 3.408122475834475e-05, + "loss": 0.9153, + "step": 5624 + }, + { + "epoch": 0.2739025637279965, + "grad_norm": 1.530795693397522, + "learning_rate": 3.4078984542761745e-05, + "loss": 0.8109, + "step": 5625 + }, + { + "epoch": 0.2739512575171037, + "grad_norm": 1.9455161094665527, + "learning_rate": 3.407674397696271e-05, + "loss": 0.7987, + "step": 5626 + }, + { + "epoch": 0.2739999513062109, + "grad_norm": 2.55332350730896, + "learning_rate": 3.407450306100339e-05, + "loss": 0.9092, + "step": 5627 + }, + { + "epoch": 0.27404864509531807, + "grad_norm": 4.13974666595459, + "learning_rate": 3.407226179493952e-05, + "loss": 0.8643, + "step": 5628 + }, + { + "epoch": 0.2740973388844253, + "grad_norm": 1.349045753479004, + "learning_rate": 3.407002017882686e-05, + "loss": 0.9153, + "step": 5629 + }, + { + "epoch": 0.2741460326735325, + "grad_norm": 1.8039360046386719, + "learning_rate": 3.406777821272116e-05, + "loss": 0.8286, + "step": 5630 + }, + { + "epoch": 0.2741947264626397, + "grad_norm": 1.8663257360458374, + "learning_rate": 3.40655358966782e-05, + "loss": 0.9619, + "step": 5631 + }, + { + "epoch": 0.2742434202517469, + "grad_norm": 1.5782314538955688, + "learning_rate": 3.4063293230753754e-05, + "loss": 0.9105, + "step": 5632 + }, + { + "epoch": 0.2742921140408541, + "grad_norm": 1.554702639579773, + "learning_rate": 3.40610502150036e-05, + "loss": 0.8187, + "step": 5633 + }, + { + "epoch": 0.2743408078299613, + "grad_norm": 1.2129613161087036, + "learning_rate": 3.405880684948354e-05, + "loss": 0.8932, + "step": 5634 + }, + { + "epoch": 0.2743895016190685, + "grad_norm": 2.3872830867767334, + "learning_rate": 3.405656313424938e-05, + "loss": 0.8219, + "step": 5635 + }, + { + "epoch": 0.2744381954081757, + "grad_norm": 6.8058247566223145, + "learning_rate": 3.405431906935693e-05, + "loss": 0.8914, + "step": 5636 + }, + { + "epoch": 0.27448688919728287, + "grad_norm": 2.1876566410064697, + "learning_rate": 3.405207465486201e-05, + "loss": 0.8496, + "step": 5637 + }, + { + "epoch": 0.2745355829863901, + "grad_norm": 1.443015456199646, + "learning_rate": 3.404982989082046e-05, + "loss": 0.9086, + "step": 5638 + }, + { + "epoch": 0.2745842767754973, + "grad_norm": 1.547873854637146, + "learning_rate": 3.40475847772881e-05, + "loss": 0.8338, + "step": 5639 + }, + { + "epoch": 0.2746329705646045, + "grad_norm": 1.4382537603378296, + "learning_rate": 3.404533931432079e-05, + "loss": 0.7894, + "step": 5640 + }, + { + "epoch": 0.27468166435371166, + "grad_norm": 3.398186445236206, + "learning_rate": 3.404309350197439e-05, + "loss": 0.9525, + "step": 5641 + }, + { + "epoch": 0.2747303581428189, + "grad_norm": 1.2342852354049683, + "learning_rate": 3.404084734030475e-05, + "loss": 0.8293, + "step": 5642 + }, + { + "epoch": 0.2747790519319261, + "grad_norm": 1.3290635347366333, + "learning_rate": 3.403860082936776e-05, + "loss": 0.9574, + "step": 5643 + }, + { + "epoch": 0.2748277457210333, + "grad_norm": 1.4480866193771362, + "learning_rate": 3.403635396921929e-05, + "loss": 0.9288, + "step": 5644 + }, + { + "epoch": 0.27487643951014046, + "grad_norm": 1.7700469493865967, + "learning_rate": 3.403410675991523e-05, + "loss": 0.7859, + "step": 5645 + }, + { + "epoch": 0.27492513329924767, + "grad_norm": 0.09005729854106903, + "learning_rate": 3.40318592015115e-05, + "loss": 0.6317, + "step": 5646 + }, + { + "epoch": 0.2749738270883549, + "grad_norm": 2.330008029937744, + "learning_rate": 3.402961129406398e-05, + "loss": 0.8617, + "step": 5647 + }, + { + "epoch": 0.2750225208774621, + "grad_norm": 1.253724217414856, + "learning_rate": 3.40273630376286e-05, + "loss": 0.819, + "step": 5648 + }, + { + "epoch": 0.27507121466656925, + "grad_norm": 0.0897132158279419, + "learning_rate": 3.4025114432261295e-05, + "loss": 0.6619, + "step": 5649 + }, + { + "epoch": 0.27511990845567647, + "grad_norm": 0.0864357054233551, + "learning_rate": 3.402286547801798e-05, + "loss": 0.6173, + "step": 5650 + }, + { + "epoch": 0.2751686022447837, + "grad_norm": 3.7630727291107178, + "learning_rate": 3.402061617495462e-05, + "loss": 0.8429, + "step": 5651 + }, + { + "epoch": 0.2752172960338909, + "grad_norm": 4.361048698425293, + "learning_rate": 3.401836652312714e-05, + "loss": 0.9618, + "step": 5652 + }, + { + "epoch": 0.2752659898229981, + "grad_norm": 1.358451247215271, + "learning_rate": 3.4016116522591524e-05, + "loss": 0.8375, + "step": 5653 + }, + { + "epoch": 0.27531468361210526, + "grad_norm": 2.82161808013916, + "learning_rate": 3.401386617340373e-05, + "loss": 0.8975, + "step": 5654 + }, + { + "epoch": 0.2753633774012125, + "grad_norm": 1.462958574295044, + "learning_rate": 3.401161547561973e-05, + "loss": 0.8993, + "step": 5655 + }, + { + "epoch": 0.2754120711903197, + "grad_norm": 2.3812506198883057, + "learning_rate": 3.4009364429295523e-05, + "loss": 0.9801, + "step": 5656 + }, + { + "epoch": 0.2754607649794269, + "grad_norm": 1.1594634056091309, + "learning_rate": 3.40071130344871e-05, + "loss": 0.86, + "step": 5657 + }, + { + "epoch": 0.27550945876853405, + "grad_norm": 1.8263674974441528, + "learning_rate": 3.4004861291250467e-05, + "loss": 0.841, + "step": 5658 + }, + { + "epoch": 0.27555815255764127, + "grad_norm": 1.7374768257141113, + "learning_rate": 3.4002609199641624e-05, + "loss": 0.8611, + "step": 5659 + }, + { + "epoch": 0.2756068463467485, + "grad_norm": 1.206705093383789, + "learning_rate": 3.4000356759716604e-05, + "loss": 0.8683, + "step": 5660 + }, + { + "epoch": 0.2756555401358557, + "grad_norm": 1.4385467767715454, + "learning_rate": 3.399810397153143e-05, + "loss": 0.8633, + "step": 5661 + }, + { + "epoch": 0.27570423392496285, + "grad_norm": 1.624280571937561, + "learning_rate": 3.399585083514215e-05, + "loss": 0.8842, + "step": 5662 + }, + { + "epoch": 0.27575292771407006, + "grad_norm": 1.386143445968628, + "learning_rate": 3.39935973506048e-05, + "loss": 0.8771, + "step": 5663 + }, + { + "epoch": 0.2758016215031773, + "grad_norm": 1.3224631547927856, + "learning_rate": 3.3991343517975445e-05, + "loss": 0.8338, + "step": 5664 + }, + { + "epoch": 0.2758503152922845, + "grad_norm": 2.4276490211486816, + "learning_rate": 3.3989089337310135e-05, + "loss": 0.839, + "step": 5665 + }, + { + "epoch": 0.27589900908139164, + "grad_norm": 1.1213175058364868, + "learning_rate": 3.398683480866496e-05, + "loss": 0.8677, + "step": 5666 + }, + { + "epoch": 0.27594770287049886, + "grad_norm": 1.816042184829712, + "learning_rate": 3.398457993209599e-05, + "loss": 0.8027, + "step": 5667 + }, + { + "epoch": 0.27599639665960607, + "grad_norm": 1.8189165592193604, + "learning_rate": 3.398232470765932e-05, + "loss": 0.8838, + "step": 5668 + }, + { + "epoch": 0.2760450904487133, + "grad_norm": 2.0954160690307617, + "learning_rate": 3.398006913541105e-05, + "loss": 0.8767, + "step": 5669 + }, + { + "epoch": 0.27609378423782044, + "grad_norm": 1.766269326210022, + "learning_rate": 3.3977813215407276e-05, + "loss": 0.7773, + "step": 5670 + }, + { + "epoch": 0.27614247802692765, + "grad_norm": 1.7614784240722656, + "learning_rate": 3.3975556947704134e-05, + "loss": 0.9254, + "step": 5671 + }, + { + "epoch": 0.27619117181603486, + "grad_norm": 2.5510127544403076, + "learning_rate": 3.397330033235774e-05, + "loss": 0.7739, + "step": 5672 + }, + { + "epoch": 0.2762398656051421, + "grad_norm": 2.34763765335083, + "learning_rate": 3.3971043369424226e-05, + "loss": 0.9163, + "step": 5673 + }, + { + "epoch": 0.2762885593942493, + "grad_norm": 1.6879663467407227, + "learning_rate": 3.396878605895973e-05, + "loss": 0.879, + "step": 5674 + }, + { + "epoch": 0.27633725318335645, + "grad_norm": 1.56617271900177, + "learning_rate": 3.396652840102041e-05, + "loss": 0.7128, + "step": 5675 + }, + { + "epoch": 0.27638594697246366, + "grad_norm": 1.6114704608917236, + "learning_rate": 3.396427039566242e-05, + "loss": 0.9151, + "step": 5676 + }, + { + "epoch": 0.27643464076157087, + "grad_norm": 1.6002568006515503, + "learning_rate": 3.396201204294193e-05, + "loss": 0.9025, + "step": 5677 + }, + { + "epoch": 0.2764833345506781, + "grad_norm": 0.09629467874765396, + "learning_rate": 3.395975334291511e-05, + "loss": 0.6112, + "step": 5678 + }, + { + "epoch": 0.27653202833978524, + "grad_norm": 1.1938419342041016, + "learning_rate": 3.3957494295638166e-05, + "loss": 0.8818, + "step": 5679 + }, + { + "epoch": 0.27658072212889245, + "grad_norm": 1.615164875984192, + "learning_rate": 3.395523490116728e-05, + "loss": 0.8077, + "step": 5680 + }, + { + "epoch": 0.27662941591799967, + "grad_norm": 1.9627254009246826, + "learning_rate": 3.395297515955864e-05, + "loss": 0.814, + "step": 5681 + }, + { + "epoch": 0.2766781097071069, + "grad_norm": 2.2299959659576416, + "learning_rate": 3.395071507086848e-05, + "loss": 0.9735, + "step": 5682 + }, + { + "epoch": 0.27672680349621404, + "grad_norm": 1.5057791471481323, + "learning_rate": 3.394845463515301e-05, + "loss": 0.768, + "step": 5683 + }, + { + "epoch": 0.27677549728532125, + "grad_norm": 1.3430116176605225, + "learning_rate": 3.394619385246846e-05, + "loss": 0.8787, + "step": 5684 + }, + { + "epoch": 0.27682419107442846, + "grad_norm": 1.6242892742156982, + "learning_rate": 3.394393272287106e-05, + "loss": 0.9086, + "step": 5685 + }, + { + "epoch": 0.2768728848635357, + "grad_norm": 1.682465672492981, + "learning_rate": 3.394167124641707e-05, + "loss": 0.8426, + "step": 5686 + }, + { + "epoch": 0.27692157865264283, + "grad_norm": 1.5012542009353638, + "learning_rate": 3.393940942316273e-05, + "loss": 0.8409, + "step": 5687 + }, + { + "epoch": 0.27697027244175004, + "grad_norm": 1.3125414848327637, + "learning_rate": 3.393714725316432e-05, + "loss": 0.8933, + "step": 5688 + }, + { + "epoch": 0.27701896623085726, + "grad_norm": 1.3175300359725952, + "learning_rate": 3.393488473647809e-05, + "loss": 0.8085, + "step": 5689 + }, + { + "epoch": 0.27706766001996447, + "grad_norm": 1.9162801504135132, + "learning_rate": 3.3932621873160334e-05, + "loss": 0.8897, + "step": 5690 + }, + { + "epoch": 0.2771163538090717, + "grad_norm": 1.9477550983428955, + "learning_rate": 3.393035866326734e-05, + "loss": 0.8876, + "step": 5691 + }, + { + "epoch": 0.27716504759817884, + "grad_norm": 1.6321394443511963, + "learning_rate": 3.392809510685541e-05, + "loss": 0.8629, + "step": 5692 + }, + { + "epoch": 0.27721374138728605, + "grad_norm": 1.6558045148849487, + "learning_rate": 3.392583120398084e-05, + "loss": 0.9533, + "step": 5693 + }, + { + "epoch": 0.27726243517639326, + "grad_norm": 1.557594895362854, + "learning_rate": 3.3923566954699944e-05, + "loss": 0.9106, + "step": 5694 + }, + { + "epoch": 0.2773111289655005, + "grad_norm": 1.3854477405548096, + "learning_rate": 3.392130235906906e-05, + "loss": 0.8654, + "step": 5695 + }, + { + "epoch": 0.27735982275460763, + "grad_norm": 1.574309229850769, + "learning_rate": 3.39190374171445e-05, + "loss": 0.9747, + "step": 5696 + }, + { + "epoch": 0.27740851654371484, + "grad_norm": 1.4163775444030762, + "learning_rate": 3.3916772128982623e-05, + "loss": 0.9004, + "step": 5697 + }, + { + "epoch": 0.27745721033282206, + "grad_norm": 1.4121582508087158, + "learning_rate": 3.391450649463977e-05, + "loss": 0.9724, + "step": 5698 + }, + { + "epoch": 0.27750590412192927, + "grad_norm": 1.5292713642120361, + "learning_rate": 3.391224051417229e-05, + "loss": 0.8558, + "step": 5699 + }, + { + "epoch": 0.2775545979110364, + "grad_norm": 2.264925241470337, + "learning_rate": 3.390997418763657e-05, + "loss": 0.8774, + "step": 5700 + }, + { + "epoch": 0.27760329170014364, + "grad_norm": 1.353453278541565, + "learning_rate": 3.390770751508897e-05, + "loss": 0.8904, + "step": 5701 + }, + { + "epoch": 0.27765198548925085, + "grad_norm": 1.7056249380111694, + "learning_rate": 3.390544049658588e-05, + "loss": 0.7867, + "step": 5702 + }, + { + "epoch": 0.27770067927835806, + "grad_norm": 1.5053763389587402, + "learning_rate": 3.390317313218368e-05, + "loss": 0.8708, + "step": 5703 + }, + { + "epoch": 0.2777493730674652, + "grad_norm": 2.129538059234619, + "learning_rate": 3.390090542193879e-05, + "loss": 0.9763, + "step": 5704 + }, + { + "epoch": 0.27779806685657243, + "grad_norm": 2.2087347507476807, + "learning_rate": 3.389863736590761e-05, + "loss": 0.922, + "step": 5705 + }, + { + "epoch": 0.27784676064567965, + "grad_norm": 2.168929100036621, + "learning_rate": 3.389636896414656e-05, + "loss": 0.9002, + "step": 5706 + }, + { + "epoch": 0.27789545443478686, + "grad_norm": 2.3177907466888428, + "learning_rate": 3.3894100216712056e-05, + "loss": 0.8207, + "step": 5707 + }, + { + "epoch": 0.277944148223894, + "grad_norm": 1.2864842414855957, + "learning_rate": 3.389183112366055e-05, + "loss": 0.9414, + "step": 5708 + }, + { + "epoch": 0.27799284201300123, + "grad_norm": 1.6140896081924438, + "learning_rate": 3.388956168504847e-05, + "loss": 0.9291, + "step": 5709 + }, + { + "epoch": 0.27804153580210844, + "grad_norm": 1.4474356174468994, + "learning_rate": 3.388729190093228e-05, + "loss": 0.8359, + "step": 5710 + }, + { + "epoch": 0.27809022959121565, + "grad_norm": 1.524732232093811, + "learning_rate": 3.3885021771368445e-05, + "loss": 0.8113, + "step": 5711 + }, + { + "epoch": 0.27813892338032287, + "grad_norm": 0.08539924025535583, + "learning_rate": 3.3882751296413426e-05, + "loss": 0.5761, + "step": 5712 + }, + { + "epoch": 0.27818761716943, + "grad_norm": 1.493424415588379, + "learning_rate": 3.38804804761237e-05, + "loss": 0.9612, + "step": 5713 + }, + { + "epoch": 0.27823631095853724, + "grad_norm": 2.0972564220428467, + "learning_rate": 3.3878209310555755e-05, + "loss": 0.8137, + "step": 5714 + }, + { + "epoch": 0.27828500474764445, + "grad_norm": 1.5905065536499023, + "learning_rate": 3.387593779976609e-05, + "loss": 0.8136, + "step": 5715 + }, + { + "epoch": 0.27833369853675166, + "grad_norm": 1.531798243522644, + "learning_rate": 3.387366594381121e-05, + "loss": 0.8793, + "step": 5716 + }, + { + "epoch": 0.2783823923258588, + "grad_norm": 1.859464406967163, + "learning_rate": 3.387139374274761e-05, + "loss": 0.8243, + "step": 5717 + }, + { + "epoch": 0.27843108611496603, + "grad_norm": 1.623781442642212, + "learning_rate": 3.3869121196631836e-05, + "loss": 0.8675, + "step": 5718 + }, + { + "epoch": 0.27847977990407324, + "grad_norm": 1.8461339473724365, + "learning_rate": 3.386684830552041e-05, + "loss": 0.9019, + "step": 5719 + }, + { + "epoch": 0.27852847369318046, + "grad_norm": 1.7386634349822998, + "learning_rate": 3.386457506946987e-05, + "loss": 0.8214, + "step": 5720 + }, + { + "epoch": 0.2785771674822876, + "grad_norm": 1.4837676286697388, + "learning_rate": 3.3862301488536754e-05, + "loss": 0.9314, + "step": 5721 + }, + { + "epoch": 0.2786258612713948, + "grad_norm": 1.6985371112823486, + "learning_rate": 3.3860027562777626e-05, + "loss": 0.8331, + "step": 5722 + }, + { + "epoch": 0.27867455506050204, + "grad_norm": 2.2470879554748535, + "learning_rate": 3.385775329224905e-05, + "loss": 0.8635, + "step": 5723 + }, + { + "epoch": 0.27872324884960925, + "grad_norm": 1.3144127130508423, + "learning_rate": 3.3855478677007605e-05, + "loss": 0.8821, + "step": 5724 + }, + { + "epoch": 0.2787719426387164, + "grad_norm": 1.9399477243423462, + "learning_rate": 3.3853203717109855e-05, + "loss": 0.9297, + "step": 5725 + }, + { + "epoch": 0.2788206364278236, + "grad_norm": 1.523086667060852, + "learning_rate": 3.38509284126124e-05, + "loss": 0.872, + "step": 5726 + }, + { + "epoch": 0.27886933021693083, + "grad_norm": 1.2366408109664917, + "learning_rate": 3.384865276357184e-05, + "loss": 0.8878, + "step": 5727 + }, + { + "epoch": 0.27891802400603805, + "grad_norm": 1.2529075145721436, + "learning_rate": 3.3846376770044784e-05, + "loss": 0.8107, + "step": 5728 + }, + { + "epoch": 0.2789667177951452, + "grad_norm": 1.8375462293624878, + "learning_rate": 3.3844100432087834e-05, + "loss": 0.8677, + "step": 5729 + }, + { + "epoch": 0.2790154115842524, + "grad_norm": 1.874247670173645, + "learning_rate": 3.3841823749757625e-05, + "loss": 0.8363, + "step": 5730 + }, + { + "epoch": 0.2790641053733596, + "grad_norm": 2.6926472187042236, + "learning_rate": 3.38395467231108e-05, + "loss": 0.9288, + "step": 5731 + }, + { + "epoch": 0.27911279916246684, + "grad_norm": 1.6635229587554932, + "learning_rate": 3.3837269352203974e-05, + "loss": 0.8423, + "step": 5732 + }, + { + "epoch": 0.27916149295157405, + "grad_norm": 0.08425331860780716, + "learning_rate": 3.383499163709382e-05, + "loss": 0.5967, + "step": 5733 + }, + { + "epoch": 0.2792101867406812, + "grad_norm": 0.08710834383964539, + "learning_rate": 3.383271357783699e-05, + "loss": 0.6163, + "step": 5734 + }, + { + "epoch": 0.2792588805297884, + "grad_norm": 1.380542516708374, + "learning_rate": 3.383043517449014e-05, + "loss": 0.8006, + "step": 5735 + }, + { + "epoch": 0.27930757431889564, + "grad_norm": 1.4818446636199951, + "learning_rate": 3.3828156427109955e-05, + "loss": 0.7978, + "step": 5736 + }, + { + "epoch": 0.27935626810800285, + "grad_norm": 2.2557830810546875, + "learning_rate": 3.382587733575312e-05, + "loss": 0.8926, + "step": 5737 + }, + { + "epoch": 0.27940496189711, + "grad_norm": 0.08698280900716782, + "learning_rate": 3.382359790047633e-05, + "loss": 0.6658, + "step": 5738 + }, + { + "epoch": 0.2794536556862172, + "grad_norm": 0.08570413291454315, + "learning_rate": 3.3821318121336274e-05, + "loss": 0.5704, + "step": 5739 + }, + { + "epoch": 0.27950234947532443, + "grad_norm": 1.407818078994751, + "learning_rate": 3.3819037998389675e-05, + "loss": 0.8786, + "step": 5740 + }, + { + "epoch": 0.27955104326443164, + "grad_norm": 1.9253668785095215, + "learning_rate": 3.381675753169325e-05, + "loss": 0.8781, + "step": 5741 + }, + { + "epoch": 0.2795997370535388, + "grad_norm": 1.5460280179977417, + "learning_rate": 3.3814476721303714e-05, + "loss": 0.8783, + "step": 5742 + }, + { + "epoch": 0.279648430842646, + "grad_norm": 1.4563391208648682, + "learning_rate": 3.3812195567277813e-05, + "loss": 0.8821, + "step": 5743 + }, + { + "epoch": 0.2796971246317532, + "grad_norm": 1.5695642232894897, + "learning_rate": 3.380991406967228e-05, + "loss": 0.8329, + "step": 5744 + }, + { + "epoch": 0.27974581842086044, + "grad_norm": 1.8874127864837646, + "learning_rate": 3.380763222854389e-05, + "loss": 0.8383, + "step": 5745 + }, + { + "epoch": 0.2797945122099676, + "grad_norm": 1.3908052444458008, + "learning_rate": 3.380535004394938e-05, + "loss": 0.8906, + "step": 5746 + }, + { + "epoch": 0.2798432059990748, + "grad_norm": 1.4322259426116943, + "learning_rate": 3.3803067515945534e-05, + "loss": 0.9296, + "step": 5747 + }, + { + "epoch": 0.279891899788182, + "grad_norm": 1.177517294883728, + "learning_rate": 3.380078464458912e-05, + "loss": 0.8235, + "step": 5748 + }, + { + "epoch": 0.27994059357728923, + "grad_norm": 2.1925477981567383, + "learning_rate": 3.379850142993693e-05, + "loss": 0.9088, + "step": 5749 + }, + { + "epoch": 0.2799892873663964, + "grad_norm": 1.1978734731674194, + "learning_rate": 3.3796217872045756e-05, + "loss": 0.8985, + "step": 5750 + }, + { + "epoch": 0.2800379811555036, + "grad_norm": 1.8044954538345337, + "learning_rate": 3.379393397097241e-05, + "loss": 0.9424, + "step": 5751 + }, + { + "epoch": 0.2800866749446108, + "grad_norm": 1.2441281080245972, + "learning_rate": 3.37916497267737e-05, + "loss": 0.8492, + "step": 5752 + }, + { + "epoch": 0.280135368733718, + "grad_norm": 1.4718780517578125, + "learning_rate": 3.378936513950644e-05, + "loss": 0.894, + "step": 5753 + }, + { + "epoch": 0.28018406252282524, + "grad_norm": 1.386401653289795, + "learning_rate": 3.378708020922747e-05, + "loss": 0.8953, + "step": 5754 + }, + { + "epoch": 0.2802327563119324, + "grad_norm": 1.425950288772583, + "learning_rate": 3.378479493599362e-05, + "loss": 0.7904, + "step": 5755 + }, + { + "epoch": 0.2802814501010396, + "grad_norm": 1.3762016296386719, + "learning_rate": 3.378250931986174e-05, + "loss": 0.8649, + "step": 5756 + }, + { + "epoch": 0.2803301438901468, + "grad_norm": 2.6809194087982178, + "learning_rate": 3.378022336088869e-05, + "loss": 0.8655, + "step": 5757 + }, + { + "epoch": 0.28037883767925403, + "grad_norm": 1.599238634109497, + "learning_rate": 3.377793705913131e-05, + "loss": 0.9199, + "step": 5758 + }, + { + "epoch": 0.2804275314683612, + "grad_norm": 1.3623648881912231, + "learning_rate": 3.3775650414646506e-05, + "loss": 0.864, + "step": 5759 + }, + { + "epoch": 0.2804762252574684, + "grad_norm": 1.2168912887573242, + "learning_rate": 3.3773363427491135e-05, + "loss": 0.8665, + "step": 5760 + }, + { + "epoch": 0.2805249190465756, + "grad_norm": 1.646056056022644, + "learning_rate": 3.3771076097722097e-05, + "loss": 0.8524, + "step": 5761 + }, + { + "epoch": 0.28057361283568283, + "grad_norm": 1.5711387395858765, + "learning_rate": 3.376878842539628e-05, + "loss": 0.8591, + "step": 5762 + }, + { + "epoch": 0.28062230662479, + "grad_norm": 1.5022621154785156, + "learning_rate": 3.37665004105706e-05, + "loss": 0.8367, + "step": 5763 + }, + { + "epoch": 0.2806710004138972, + "grad_norm": 2.013455629348755, + "learning_rate": 3.376421205330197e-05, + "loss": 0.8172, + "step": 5764 + }, + { + "epoch": 0.2807196942030044, + "grad_norm": 1.2840049266815186, + "learning_rate": 3.37619233536473e-05, + "loss": 0.9504, + "step": 5765 + }, + { + "epoch": 0.2807683879921116, + "grad_norm": 0.08782102167606354, + "learning_rate": 3.375963431166354e-05, + "loss": 0.6193, + "step": 5766 + }, + { + "epoch": 0.2808170817812188, + "grad_norm": 1.5093663930892944, + "learning_rate": 3.375734492740762e-05, + "loss": 0.8947, + "step": 5767 + }, + { + "epoch": 0.280865775570326, + "grad_norm": 2.273113965988159, + "learning_rate": 3.375505520093649e-05, + "loss": 0.8815, + "step": 5768 + }, + { + "epoch": 0.2809144693594332, + "grad_norm": 1.2737560272216797, + "learning_rate": 3.375276513230711e-05, + "loss": 0.8187, + "step": 5769 + }, + { + "epoch": 0.2809631631485404, + "grad_norm": 1.2444877624511719, + "learning_rate": 3.375047472157644e-05, + "loss": 0.9141, + "step": 5770 + }, + { + "epoch": 0.28101185693764763, + "grad_norm": 1.4860042333602905, + "learning_rate": 3.3748183968801465e-05, + "loss": 0.874, + "step": 5771 + }, + { + "epoch": 0.2810605507267548, + "grad_norm": 1.4371167421340942, + "learning_rate": 3.374589287403915e-05, + "loss": 0.9342, + "step": 5772 + }, + { + "epoch": 0.281109244515862, + "grad_norm": 1.4656486511230469, + "learning_rate": 3.37436014373465e-05, + "loss": 0.9293, + "step": 5773 + }, + { + "epoch": 0.2811579383049692, + "grad_norm": 1.6107072830200195, + "learning_rate": 3.374130965878052e-05, + "loss": 0.9487, + "step": 5774 + }, + { + "epoch": 0.2812066320940764, + "grad_norm": 2.018982172012329, + "learning_rate": 3.3739017538398193e-05, + "loss": 0.9506, + "step": 5775 + }, + { + "epoch": 0.2812553258831836, + "grad_norm": 2.347630023956299, + "learning_rate": 3.373672507625657e-05, + "loss": 0.8326, + "step": 5776 + }, + { + "epoch": 0.2813040196722908, + "grad_norm": 1.581398606300354, + "learning_rate": 3.373443227241264e-05, + "loss": 0.9172, + "step": 5777 + }, + { + "epoch": 0.281352713461398, + "grad_norm": 1.6590708494186401, + "learning_rate": 3.3732139126923464e-05, + "loss": 0.8461, + "step": 5778 + }, + { + "epoch": 0.2814014072505052, + "grad_norm": 7.721889972686768, + "learning_rate": 3.372984563984607e-05, + "loss": 0.8396, + "step": 5779 + }, + { + "epoch": 0.2814501010396124, + "grad_norm": 2.617438554763794, + "learning_rate": 3.372755181123752e-05, + "loss": 0.8849, + "step": 5780 + }, + { + "epoch": 0.2814987948287196, + "grad_norm": 1.28712797164917, + "learning_rate": 3.372525764115487e-05, + "loss": 0.838, + "step": 5781 + }, + { + "epoch": 0.2815474886178268, + "grad_norm": 1.3794587850570679, + "learning_rate": 3.372296312965518e-05, + "loss": 0.9181, + "step": 5782 + }, + { + "epoch": 0.281596182406934, + "grad_norm": 1.5385302305221558, + "learning_rate": 3.372066827679553e-05, + "loss": 0.8361, + "step": 5783 + }, + { + "epoch": 0.28164487619604117, + "grad_norm": 1.7470604181289673, + "learning_rate": 3.371837308263301e-05, + "loss": 0.9374, + "step": 5784 + }, + { + "epoch": 0.2816935699851484, + "grad_norm": 1.8363261222839355, + "learning_rate": 3.3716077547224705e-05, + "loss": 0.7972, + "step": 5785 + }, + { + "epoch": 0.2817422637742556, + "grad_norm": 1.4720603227615356, + "learning_rate": 3.371378167062772e-05, + "loss": 0.8351, + "step": 5786 + }, + { + "epoch": 0.2817909575633628, + "grad_norm": 1.691348910331726, + "learning_rate": 3.371148545289917e-05, + "loss": 0.8123, + "step": 5787 + }, + { + "epoch": 0.28183965135246997, + "grad_norm": 1.5390105247497559, + "learning_rate": 3.3709188894096164e-05, + "loss": 1.0063, + "step": 5788 + }, + { + "epoch": 0.2818883451415772, + "grad_norm": 1.6606587171554565, + "learning_rate": 3.370689199427584e-05, + "loss": 0.8357, + "step": 5789 + }, + { + "epoch": 0.2819370389306844, + "grad_norm": 1.8432228565216064, + "learning_rate": 3.370459475349532e-05, + "loss": 0.8936, + "step": 5790 + }, + { + "epoch": 0.2819857327197916, + "grad_norm": 2.7576043605804443, + "learning_rate": 3.370229717181176e-05, + "loss": 0.8917, + "step": 5791 + }, + { + "epoch": 0.2820344265088988, + "grad_norm": 1.19974684715271, + "learning_rate": 3.369999924928231e-05, + "loss": 0.8262, + "step": 5792 + }, + { + "epoch": 0.282083120298006, + "grad_norm": 2.614095687866211, + "learning_rate": 3.3697700985964126e-05, + "loss": 0.9318, + "step": 5793 + }, + { + "epoch": 0.2821318140871132, + "grad_norm": 1.7430989742279053, + "learning_rate": 3.3695402381914384e-05, + "loss": 0.8669, + "step": 5794 + }, + { + "epoch": 0.2821805078762204, + "grad_norm": 2.323256731033325, + "learning_rate": 3.3693103437190254e-05, + "loss": 0.8696, + "step": 5795 + }, + { + "epoch": 0.2822292016653276, + "grad_norm": 1.416235089302063, + "learning_rate": 3.3690804151848935e-05, + "loss": 0.8965, + "step": 5796 + }, + { + "epoch": 0.28227789545443477, + "grad_norm": 1.4134596586227417, + "learning_rate": 3.368850452594761e-05, + "loss": 0.8135, + "step": 5797 + }, + { + "epoch": 0.282326589243542, + "grad_norm": 1.6392499208450317, + "learning_rate": 3.3686204559543494e-05, + "loss": 0.9047, + "step": 5798 + }, + { + "epoch": 0.2823752830326492, + "grad_norm": 2.3244125843048096, + "learning_rate": 3.368390425269378e-05, + "loss": 0.9599, + "step": 5799 + }, + { + "epoch": 0.2824239768217564, + "grad_norm": 1.5376231670379639, + "learning_rate": 3.368160360545571e-05, + "loss": 0.7498, + "step": 5800 + }, + { + "epoch": 0.28247267061086356, + "grad_norm": 1.1763149499893188, + "learning_rate": 3.367930261788649e-05, + "loss": 0.7629, + "step": 5801 + }, + { + "epoch": 0.2825213643999708, + "grad_norm": 1.6946529150009155, + "learning_rate": 3.3677001290043386e-05, + "loss": 0.9001, + "step": 5802 + }, + { + "epoch": 0.282570058189078, + "grad_norm": 1.7430791854858398, + "learning_rate": 3.367469962198362e-05, + "loss": 0.8181, + "step": 5803 + }, + { + "epoch": 0.2826187519781852, + "grad_norm": 1.4821592569351196, + "learning_rate": 3.3672397613764456e-05, + "loss": 0.8814, + "step": 5804 + }, + { + "epoch": 0.28266744576729236, + "grad_norm": 1.7752538919448853, + "learning_rate": 3.367009526544315e-05, + "loss": 0.8186, + "step": 5805 + }, + { + "epoch": 0.28271613955639957, + "grad_norm": 1.905436396598816, + "learning_rate": 3.366779257707698e-05, + "loss": 0.8509, + "step": 5806 + }, + { + "epoch": 0.2827648333455068, + "grad_norm": 1.957634687423706, + "learning_rate": 3.366548954872323e-05, + "loss": 0.9099, + "step": 5807 + }, + { + "epoch": 0.282813527134614, + "grad_norm": 1.849560022354126, + "learning_rate": 3.3663186180439175e-05, + "loss": 0.9514, + "step": 5808 + }, + { + "epoch": 0.28286222092372115, + "grad_norm": 1.3116607666015625, + "learning_rate": 3.366088247228212e-05, + "loss": 0.7917, + "step": 5809 + }, + { + "epoch": 0.28291091471282837, + "grad_norm": 1.8965883255004883, + "learning_rate": 3.365857842430937e-05, + "loss": 0.8994, + "step": 5810 + }, + { + "epoch": 0.2829596085019356, + "grad_norm": 2.2573177814483643, + "learning_rate": 3.365627403657823e-05, + "loss": 0.852, + "step": 5811 + }, + { + "epoch": 0.2830083022910428, + "grad_norm": 1.4957010746002197, + "learning_rate": 3.365396930914603e-05, + "loss": 0.845, + "step": 5812 + }, + { + "epoch": 0.28305699608015, + "grad_norm": 1.7208778858184814, + "learning_rate": 3.365166424207011e-05, + "loss": 0.9387, + "step": 5813 + }, + { + "epoch": 0.28310568986925716, + "grad_norm": 1.368437647819519, + "learning_rate": 3.364935883540779e-05, + "loss": 0.8887, + "step": 5814 + }, + { + "epoch": 0.2831543836583644, + "grad_norm": 0.09484897553920746, + "learning_rate": 3.364705308921641e-05, + "loss": 0.6881, + "step": 5815 + }, + { + "epoch": 0.2832030774474716, + "grad_norm": 1.2087054252624512, + "learning_rate": 3.364474700355336e-05, + "loss": 0.847, + "step": 5816 + }, + { + "epoch": 0.2832517712365788, + "grad_norm": 0.08750533312559128, + "learning_rate": 3.364244057847597e-05, + "loss": 0.6093, + "step": 5817 + }, + { + "epoch": 0.28330046502568595, + "grad_norm": 1.6161013841629028, + "learning_rate": 3.364013381404164e-05, + "loss": 0.8816, + "step": 5818 + }, + { + "epoch": 0.28334915881479317, + "grad_norm": 1.2147464752197266, + "learning_rate": 3.363782671030773e-05, + "loss": 0.9535, + "step": 5819 + }, + { + "epoch": 0.2833978526039004, + "grad_norm": 1.3128046989440918, + "learning_rate": 3.3635519267331634e-05, + "loss": 0.8532, + "step": 5820 + }, + { + "epoch": 0.2834465463930076, + "grad_norm": 1.8260552883148193, + "learning_rate": 3.363321148517076e-05, + "loss": 0.8518, + "step": 5821 + }, + { + "epoch": 0.28349524018211475, + "grad_norm": 1.835051417350769, + "learning_rate": 3.3630903363882506e-05, + "loss": 0.8568, + "step": 5822 + }, + { + "epoch": 0.28354393397122196, + "grad_norm": 1.443536639213562, + "learning_rate": 3.362859490352429e-05, + "loss": 0.8479, + "step": 5823 + }, + { + "epoch": 0.2835926277603292, + "grad_norm": 1.6230734586715698, + "learning_rate": 3.3626286104153525e-05, + "loss": 0.8189, + "step": 5824 + }, + { + "epoch": 0.2836413215494364, + "grad_norm": 1.4451181888580322, + "learning_rate": 3.3623976965827654e-05, + "loss": 1.0498, + "step": 5825 + }, + { + "epoch": 0.28369001533854354, + "grad_norm": 1.913422703742981, + "learning_rate": 3.362166748860411e-05, + "loss": 0.888, + "step": 5826 + }, + { + "epoch": 0.28373870912765076, + "grad_norm": 3.0698235034942627, + "learning_rate": 3.361935767254035e-05, + "loss": 0.7907, + "step": 5827 + }, + { + "epoch": 0.28378740291675797, + "grad_norm": 1.2342307567596436, + "learning_rate": 3.3617047517693834e-05, + "loss": 0.9942, + "step": 5828 + }, + { + "epoch": 0.2838360967058652, + "grad_norm": 1.7012540102005005, + "learning_rate": 3.361473702412201e-05, + "loss": 0.884, + "step": 5829 + }, + { + "epoch": 0.28388479049497234, + "grad_norm": 1.5157275199890137, + "learning_rate": 3.3612426191882366e-05, + "loss": 0.9528, + "step": 5830 + }, + { + "epoch": 0.28393348428407955, + "grad_norm": 1.8408430814743042, + "learning_rate": 3.3610115021032377e-05, + "loss": 0.9163, + "step": 5831 + }, + { + "epoch": 0.28398217807318676, + "grad_norm": 1.7910579442977905, + "learning_rate": 3.360780351162953e-05, + "loss": 0.8332, + "step": 5832 + }, + { + "epoch": 0.284030871862294, + "grad_norm": 4.233884334564209, + "learning_rate": 3.360549166373134e-05, + "loss": 0.8905, + "step": 5833 + }, + { + "epoch": 0.2840795656514012, + "grad_norm": 1.5587167739868164, + "learning_rate": 3.36031794773953e-05, + "loss": 0.8024, + "step": 5834 + }, + { + "epoch": 0.28412825944050835, + "grad_norm": 1.1770573854446411, + "learning_rate": 3.3600866952678934e-05, + "loss": 0.8078, + "step": 5835 + }, + { + "epoch": 0.28417695322961556, + "grad_norm": 1.6866440773010254, + "learning_rate": 3.359855408963976e-05, + "loss": 0.848, + "step": 5836 + }, + { + "epoch": 0.28422564701872277, + "grad_norm": 2.2624356746673584, + "learning_rate": 3.35962408883353e-05, + "loss": 0.9533, + "step": 5837 + }, + { + "epoch": 0.28427434080783, + "grad_norm": 1.803169846534729, + "learning_rate": 3.359392734882312e-05, + "loss": 0.8968, + "step": 5838 + }, + { + "epoch": 0.28432303459693714, + "grad_norm": 1.936150074005127, + "learning_rate": 3.359161347116076e-05, + "loss": 0.9164, + "step": 5839 + }, + { + "epoch": 0.28437172838604435, + "grad_norm": 2.3894639015197754, + "learning_rate": 3.3589299255405774e-05, + "loss": 0.9953, + "step": 5840 + }, + { + "epoch": 0.28442042217515157, + "grad_norm": 1.5466638803482056, + "learning_rate": 3.358698470161573e-05, + "loss": 0.9383, + "step": 5841 + }, + { + "epoch": 0.2844691159642588, + "grad_norm": 2.0568294525146484, + "learning_rate": 3.358466980984821e-05, + "loss": 0.9249, + "step": 5842 + }, + { + "epoch": 0.28451780975336594, + "grad_norm": 1.8839397430419922, + "learning_rate": 3.358235458016078e-05, + "loss": 0.8588, + "step": 5843 + }, + { + "epoch": 0.28456650354247315, + "grad_norm": 1.967342495918274, + "learning_rate": 3.358003901261104e-05, + "loss": 0.8945, + "step": 5844 + }, + { + "epoch": 0.28461519733158036, + "grad_norm": 1.8676233291625977, + "learning_rate": 3.3577723107256596e-05, + "loss": 0.8617, + "step": 5845 + }, + { + "epoch": 0.2846638911206876, + "grad_norm": 2.5233349800109863, + "learning_rate": 3.357540686415505e-05, + "loss": 0.8947, + "step": 5846 + }, + { + "epoch": 0.28471258490979473, + "grad_norm": 3.0466036796569824, + "learning_rate": 3.3573090283364015e-05, + "loss": 0.8207, + "step": 5847 + }, + { + "epoch": 0.28476127869890194, + "grad_norm": 1.7698677778244019, + "learning_rate": 3.357077336494113e-05, + "loss": 0.7538, + "step": 5848 + }, + { + "epoch": 0.28480997248800916, + "grad_norm": 1.4727628231048584, + "learning_rate": 3.356845610894402e-05, + "loss": 0.8408, + "step": 5849 + }, + { + "epoch": 0.28485866627711637, + "grad_norm": 1.5681096315383911, + "learning_rate": 3.3566138515430324e-05, + "loss": 0.9534, + "step": 5850 + }, + { + "epoch": 0.2849073600662236, + "grad_norm": 1.9785932302474976, + "learning_rate": 3.3563820584457694e-05, + "loss": 0.8829, + "step": 5851 + }, + { + "epoch": 0.28495605385533074, + "grad_norm": 1.8632310628890991, + "learning_rate": 3.3561502316083794e-05, + "loss": 0.8805, + "step": 5852 + }, + { + "epoch": 0.28500474764443795, + "grad_norm": 2.934103488922119, + "learning_rate": 3.355918371036629e-05, + "loss": 0.8567, + "step": 5853 + }, + { + "epoch": 0.28505344143354516, + "grad_norm": 1.4523992538452148, + "learning_rate": 3.355686476736285e-05, + "loss": 0.8005, + "step": 5854 + }, + { + "epoch": 0.2851021352226524, + "grad_norm": 1.4746747016906738, + "learning_rate": 3.3554545487131164e-05, + "loss": 0.8796, + "step": 5855 + }, + { + "epoch": 0.28515082901175953, + "grad_norm": 2.440025568008423, + "learning_rate": 3.355222586972892e-05, + "loss": 0.8569, + "step": 5856 + }, + { + "epoch": 0.28519952280086674, + "grad_norm": 1.5070805549621582, + "learning_rate": 3.354990591521383e-05, + "loss": 0.9505, + "step": 5857 + }, + { + "epoch": 0.28524821658997396, + "grad_norm": 1.2828761339187622, + "learning_rate": 3.354758562364359e-05, + "loss": 0.891, + "step": 5858 + }, + { + "epoch": 0.28529691037908117, + "grad_norm": 1.8105113506317139, + "learning_rate": 3.3545264995075916e-05, + "loss": 0.7865, + "step": 5859 + }, + { + "epoch": 0.2853456041681883, + "grad_norm": 2.9896240234375, + "learning_rate": 3.354294402956855e-05, + "loss": 0.798, + "step": 5860 + }, + { + "epoch": 0.28539429795729554, + "grad_norm": 2.4394378662109375, + "learning_rate": 3.354062272717921e-05, + "loss": 0.8024, + "step": 5861 + }, + { + "epoch": 0.28544299174640275, + "grad_norm": 1.771012544631958, + "learning_rate": 3.353830108796565e-05, + "loss": 0.7657, + "step": 5862 + }, + { + "epoch": 0.28549168553550996, + "grad_norm": 1.4571571350097656, + "learning_rate": 3.353597911198562e-05, + "loss": 0.7891, + "step": 5863 + }, + { + "epoch": 0.2855403793246171, + "grad_norm": 2.145420551300049, + "learning_rate": 3.353365679929686e-05, + "loss": 0.8932, + "step": 5864 + }, + { + "epoch": 0.28558907311372433, + "grad_norm": 2.1076202392578125, + "learning_rate": 3.353133414995717e-05, + "loss": 0.8391, + "step": 5865 + }, + { + "epoch": 0.28563776690283155, + "grad_norm": 2.5292601585388184, + "learning_rate": 3.3529011164024305e-05, + "loss": 0.8317, + "step": 5866 + }, + { + "epoch": 0.28568646069193876, + "grad_norm": 1.3170169591903687, + "learning_rate": 3.352668784155604e-05, + "loss": 0.9643, + "step": 5867 + }, + { + "epoch": 0.2857351544810459, + "grad_norm": 1.7233155965805054, + "learning_rate": 3.3524364182610195e-05, + "loss": 0.8912, + "step": 5868 + }, + { + "epoch": 0.28578384827015313, + "grad_norm": 1.3551372289657593, + "learning_rate": 3.352204018724455e-05, + "loss": 0.882, + "step": 5869 + }, + { + "epoch": 0.28583254205926034, + "grad_norm": 1.4488675594329834, + "learning_rate": 3.3519715855516936e-05, + "loss": 0.8853, + "step": 5870 + }, + { + "epoch": 0.28588123584836755, + "grad_norm": 1.4978725910186768, + "learning_rate": 3.3517391187485146e-05, + "loss": 0.8148, + "step": 5871 + }, + { + "epoch": 0.28592992963747477, + "grad_norm": 1.5284607410430908, + "learning_rate": 3.351506618320703e-05, + "loss": 0.9403, + "step": 5872 + }, + { + "epoch": 0.2859786234265819, + "grad_norm": 1.2224687337875366, + "learning_rate": 3.35127408427404e-05, + "loss": 0.8516, + "step": 5873 + }, + { + "epoch": 0.28602731721568914, + "grad_norm": 3.444101572036743, + "learning_rate": 3.3510415166143105e-05, + "loss": 0.8512, + "step": 5874 + }, + { + "epoch": 0.28607601100479635, + "grad_norm": 1.5433833599090576, + "learning_rate": 3.3508089153473014e-05, + "loss": 0.9136, + "step": 5875 + }, + { + "epoch": 0.28612470479390356, + "grad_norm": 2.2495832443237305, + "learning_rate": 3.350576280478796e-05, + "loss": 0.919, + "step": 5876 + }, + { + "epoch": 0.2861733985830107, + "grad_norm": 0.0899752750992775, + "learning_rate": 3.350343612014583e-05, + "loss": 0.6033, + "step": 5877 + }, + { + "epoch": 0.28622209237211793, + "grad_norm": 1.874923586845398, + "learning_rate": 3.3501109099604496e-05, + "loss": 0.8916, + "step": 5878 + }, + { + "epoch": 0.28627078616122514, + "grad_norm": 1.7769775390625, + "learning_rate": 3.349878174322184e-05, + "loss": 0.8338, + "step": 5879 + }, + { + "epoch": 0.28631947995033236, + "grad_norm": 1.7164748907089233, + "learning_rate": 3.3496454051055766e-05, + "loss": 0.8856, + "step": 5880 + }, + { + "epoch": 0.2863681737394395, + "grad_norm": 1.851365566253662, + "learning_rate": 3.3494126023164155e-05, + "loss": 0.8756, + "step": 5881 + }, + { + "epoch": 0.2864168675285467, + "grad_norm": 1.7299197912216187, + "learning_rate": 3.349179765960494e-05, + "loss": 0.9005, + "step": 5882 + }, + { + "epoch": 0.28646556131765394, + "grad_norm": 1.4191241264343262, + "learning_rate": 3.3489468960436026e-05, + "loss": 0.8429, + "step": 5883 + }, + { + "epoch": 0.28651425510676115, + "grad_norm": 1.6864755153656006, + "learning_rate": 3.3487139925715335e-05, + "loss": 0.9297, + "step": 5884 + }, + { + "epoch": 0.2865629488958683, + "grad_norm": 2.118129253387451, + "learning_rate": 3.348481055550081e-05, + "loss": 0.9965, + "step": 5885 + }, + { + "epoch": 0.2866116426849755, + "grad_norm": 1.2363520860671997, + "learning_rate": 3.34824808498504e-05, + "loss": 0.8968, + "step": 5886 + }, + { + "epoch": 0.28666033647408273, + "grad_norm": 1.2479382753372192, + "learning_rate": 3.348015080882204e-05, + "loss": 0.8349, + "step": 5887 + }, + { + "epoch": 0.28670903026318995, + "grad_norm": 1.8377697467803955, + "learning_rate": 3.3477820432473706e-05, + "loss": 0.7759, + "step": 5888 + }, + { + "epoch": 0.2867577240522971, + "grad_norm": 1.4386770725250244, + "learning_rate": 3.3475489720863356e-05, + "loss": 0.7975, + "step": 5889 + }, + { + "epoch": 0.2868064178414043, + "grad_norm": 1.6264770030975342, + "learning_rate": 3.3473158674048974e-05, + "loss": 0.8921, + "step": 5890 + }, + { + "epoch": 0.28685511163051153, + "grad_norm": 2.159773826599121, + "learning_rate": 3.3470827292088536e-05, + "loss": 0.9021, + "step": 5891 + }, + { + "epoch": 0.28690380541961874, + "grad_norm": 2.371209144592285, + "learning_rate": 3.346849557504005e-05, + "loss": 0.8861, + "step": 5892 + }, + { + "epoch": 0.28695249920872595, + "grad_norm": 1.6358155012130737, + "learning_rate": 3.346616352296151e-05, + "loss": 0.8499, + "step": 5893 + }, + { + "epoch": 0.2870011929978331, + "grad_norm": 0.08586463332176208, + "learning_rate": 3.346383113591092e-05, + "loss": 0.6182, + "step": 5894 + }, + { + "epoch": 0.2870498867869403, + "grad_norm": 2.5173134803771973, + "learning_rate": 3.3461498413946295e-05, + "loss": 0.8042, + "step": 5895 + }, + { + "epoch": 0.28709858057604754, + "grad_norm": 2.3320090770721436, + "learning_rate": 3.3459165357125676e-05, + "loss": 0.8698, + "step": 5896 + }, + { + "epoch": 0.28714727436515475, + "grad_norm": 1.8942360877990723, + "learning_rate": 3.3456831965507086e-05, + "loss": 0.9358, + "step": 5897 + }, + { + "epoch": 0.2871959681542619, + "grad_norm": 1.7434000968933105, + "learning_rate": 3.345449823914858e-05, + "loss": 0.9763, + "step": 5898 + }, + { + "epoch": 0.2872446619433691, + "grad_norm": 1.324797511100769, + "learning_rate": 3.3452164178108206e-05, + "loss": 0.9379, + "step": 5899 + }, + { + "epoch": 0.28729335573247633, + "grad_norm": 1.5262539386749268, + "learning_rate": 3.344982978244402e-05, + "loss": 0.8638, + "step": 5900 + }, + { + "epoch": 0.28734204952158354, + "grad_norm": 1.8242881298065186, + "learning_rate": 3.344749505221409e-05, + "loss": 0.862, + "step": 5901 + }, + { + "epoch": 0.2873907433106907, + "grad_norm": 1.9342447519302368, + "learning_rate": 3.3445159987476496e-05, + "loss": 0.9227, + "step": 5902 + }, + { + "epoch": 0.2874394370997979, + "grad_norm": 2.0133752822875977, + "learning_rate": 3.3442824588289315e-05, + "loss": 0.868, + "step": 5903 + }, + { + "epoch": 0.2874881308889051, + "grad_norm": 1.983154535293579, + "learning_rate": 3.344048885471066e-05, + "loss": 0.8828, + "step": 5904 + }, + { + "epoch": 0.28753682467801234, + "grad_norm": 1.6136054992675781, + "learning_rate": 3.343815278679861e-05, + "loss": 0.8981, + "step": 5905 + }, + { + "epoch": 0.2875855184671195, + "grad_norm": 1.6588083505630493, + "learning_rate": 3.343581638461128e-05, + "loss": 0.8989, + "step": 5906 + }, + { + "epoch": 0.2876342122562267, + "grad_norm": 0.09309374541044235, + "learning_rate": 3.3433479648206796e-05, + "loss": 0.6135, + "step": 5907 + }, + { + "epoch": 0.2876829060453339, + "grad_norm": 1.6578412055969238, + "learning_rate": 3.343114257764328e-05, + "loss": 0.8087, + "step": 5908 + }, + { + "epoch": 0.28773159983444113, + "grad_norm": 1.763477087020874, + "learning_rate": 3.3428805172978874e-05, + "loss": 0.92, + "step": 5909 + }, + { + "epoch": 0.2877802936235483, + "grad_norm": 2.1615211963653564, + "learning_rate": 3.342646743427171e-05, + "loss": 0.8431, + "step": 5910 + }, + { + "epoch": 0.2878289874126555, + "grad_norm": 1.5119718313217163, + "learning_rate": 3.342412936157995e-05, + "loss": 0.8736, + "step": 5911 + }, + { + "epoch": 0.2878776812017627, + "grad_norm": 1.870600938796997, + "learning_rate": 3.342179095496175e-05, + "loss": 0.8272, + "step": 5912 + }, + { + "epoch": 0.2879263749908699, + "grad_norm": 1.8566926717758179, + "learning_rate": 3.3419452214475264e-05, + "loss": 0.8683, + "step": 5913 + }, + { + "epoch": 0.28797506877997714, + "grad_norm": 1.6056584119796753, + "learning_rate": 3.341711314017869e-05, + "loss": 0.8382, + "step": 5914 + }, + { + "epoch": 0.2880237625690843, + "grad_norm": 1.7351760864257812, + "learning_rate": 3.3414773732130194e-05, + "loss": 0.8918, + "step": 5915 + }, + { + "epoch": 0.2880724563581915, + "grad_norm": 1.5175045728683472, + "learning_rate": 3.3412433990387984e-05, + "loss": 0.8551, + "step": 5916 + }, + { + "epoch": 0.2881211501472987, + "grad_norm": 2.611391067504883, + "learning_rate": 3.3410093915010256e-05, + "loss": 0.8457, + "step": 5917 + }, + { + "epoch": 0.28816984393640593, + "grad_norm": 1.7279767990112305, + "learning_rate": 3.340775350605522e-05, + "loss": 0.8466, + "step": 5918 + }, + { + "epoch": 0.2882185377255131, + "grad_norm": 1.68513822555542, + "learning_rate": 3.340541276358109e-05, + "loss": 0.8153, + "step": 5919 + }, + { + "epoch": 0.2882672315146203, + "grad_norm": 12.482368469238281, + "learning_rate": 3.34030716876461e-05, + "loss": 0.8581, + "step": 5920 + }, + { + "epoch": 0.2883159253037275, + "grad_norm": 1.3485918045043945, + "learning_rate": 3.340073027830847e-05, + "loss": 0.8117, + "step": 5921 + }, + { + "epoch": 0.28836461909283473, + "grad_norm": 1.4509416818618774, + "learning_rate": 3.339838853562647e-05, + "loss": 0.9336, + "step": 5922 + }, + { + "epoch": 0.2884133128819419, + "grad_norm": 1.9120033979415894, + "learning_rate": 3.339604645965832e-05, + "loss": 0.8492, + "step": 5923 + }, + { + "epoch": 0.2884620066710491, + "grad_norm": 1.481939673423767, + "learning_rate": 3.3393704050462296e-05, + "loss": 0.8809, + "step": 5924 + }, + { + "epoch": 0.2885107004601563, + "grad_norm": 2.2778983116149902, + "learning_rate": 3.339136130809666e-05, + "loss": 0.8589, + "step": 5925 + }, + { + "epoch": 0.2885593942492635, + "grad_norm": 1.9880543947219849, + "learning_rate": 3.33890182326197e-05, + "loss": 0.8487, + "step": 5926 + }, + { + "epoch": 0.2886080880383707, + "grad_norm": 1.3271942138671875, + "learning_rate": 3.338667482408968e-05, + "loss": 0.853, + "step": 5927 + }, + { + "epoch": 0.2886567818274779, + "grad_norm": 2.37186336517334, + "learning_rate": 3.3384331082564905e-05, + "loss": 0.9602, + "step": 5928 + }, + { + "epoch": 0.2887054756165851, + "grad_norm": 1.3546384572982788, + "learning_rate": 3.338198700810368e-05, + "loss": 0.8585, + "step": 5929 + }, + { + "epoch": 0.2887541694056923, + "grad_norm": 1.5007636547088623, + "learning_rate": 3.33796426007643e-05, + "loss": 0.9102, + "step": 5930 + }, + { + "epoch": 0.28880286319479953, + "grad_norm": 3.390490770339966, + "learning_rate": 3.33772978606051e-05, + "loss": 0.9636, + "step": 5931 + }, + { + "epoch": 0.2888515569839067, + "grad_norm": 1.5128588676452637, + "learning_rate": 3.337495278768439e-05, + "loss": 0.8842, + "step": 5932 + }, + { + "epoch": 0.2889002507730139, + "grad_norm": 2.0678839683532715, + "learning_rate": 3.337260738206051e-05, + "loss": 0.8143, + "step": 5933 + }, + { + "epoch": 0.2889489445621211, + "grad_norm": 1.6144739389419556, + "learning_rate": 3.3370261643791804e-05, + "loss": 0.8353, + "step": 5934 + }, + { + "epoch": 0.2889976383512283, + "grad_norm": 1.840793251991272, + "learning_rate": 3.336791557293662e-05, + "loss": 0.8198, + "step": 5935 + }, + { + "epoch": 0.2890463321403355, + "grad_norm": 2.0980758666992188, + "learning_rate": 3.3365569169553316e-05, + "loss": 0.8561, + "step": 5936 + }, + { + "epoch": 0.2890950259294427, + "grad_norm": 2.377185106277466, + "learning_rate": 3.3363222433700255e-05, + "loss": 0.8113, + "step": 5937 + }, + { + "epoch": 0.2891437197185499, + "grad_norm": 1.9807488918304443, + "learning_rate": 3.336087536543583e-05, + "loss": 0.8881, + "step": 5938 + }, + { + "epoch": 0.2891924135076571, + "grad_norm": 1.6249759197235107, + "learning_rate": 3.33585279648184e-05, + "loss": 0.7845, + "step": 5939 + }, + { + "epoch": 0.2892411072967643, + "grad_norm": 1.8735476732254028, + "learning_rate": 3.3356180231906375e-05, + "loss": 0.8188, + "step": 5940 + }, + { + "epoch": 0.2892898010858715, + "grad_norm": 1.6677614450454712, + "learning_rate": 3.335383216675815e-05, + "loss": 0.8952, + "step": 5941 + }, + { + "epoch": 0.2893384948749787, + "grad_norm": 1.9762965440750122, + "learning_rate": 3.335148376943213e-05, + "loss": 0.7876, + "step": 5942 + }, + { + "epoch": 0.2893871886640859, + "grad_norm": 1.5277191400527954, + "learning_rate": 3.3349135039986736e-05, + "loss": 0.8891, + "step": 5943 + }, + { + "epoch": 0.28943588245319307, + "grad_norm": 2.2120513916015625, + "learning_rate": 3.334678597848039e-05, + "loss": 0.8734, + "step": 5944 + }, + { + "epoch": 0.2894845762423003, + "grad_norm": 2.140350103378296, + "learning_rate": 3.334443658497152e-05, + "loss": 0.9098, + "step": 5945 + }, + { + "epoch": 0.2895332700314075, + "grad_norm": 1.9367111921310425, + "learning_rate": 3.334208685951858e-05, + "loss": 0.8765, + "step": 5946 + }, + { + "epoch": 0.2895819638205147, + "grad_norm": 1.5716229677200317, + "learning_rate": 3.333973680218001e-05, + "loss": 0.8806, + "step": 5947 + }, + { + "epoch": 0.28963065760962187, + "grad_norm": 1.887700080871582, + "learning_rate": 3.3337386413014273e-05, + "loss": 0.8976, + "step": 5948 + }, + { + "epoch": 0.2896793513987291, + "grad_norm": 1.7452428340911865, + "learning_rate": 3.333503569207984e-05, + "loss": 0.9182, + "step": 5949 + }, + { + "epoch": 0.2897280451878363, + "grad_norm": 1.481374979019165, + "learning_rate": 3.333268463943516e-05, + "loss": 0.8317, + "step": 5950 + }, + { + "epoch": 0.2897767389769435, + "grad_norm": 0.091675765812397, + "learning_rate": 3.333033325513875e-05, + "loss": 0.6328, + "step": 5951 + }, + { + "epoch": 0.2898254327660507, + "grad_norm": 2.1220672130584717, + "learning_rate": 3.3327981539249084e-05, + "loss": 0.8124, + "step": 5952 + }, + { + "epoch": 0.2898741265551579, + "grad_norm": 1.911847710609436, + "learning_rate": 3.332562949182465e-05, + "loss": 0.8974, + "step": 5953 + }, + { + "epoch": 0.2899228203442651, + "grad_norm": 2.042120933532715, + "learning_rate": 3.3323277112923976e-05, + "loss": 0.8146, + "step": 5954 + }, + { + "epoch": 0.2899715141333723, + "grad_norm": 2.8524868488311768, + "learning_rate": 3.332092440260557e-05, + "loss": 0.8059, + "step": 5955 + }, + { + "epoch": 0.2900202079224795, + "grad_norm": 1.5328704118728638, + "learning_rate": 3.331857136092795e-05, + "loss": 0.8641, + "step": 5956 + }, + { + "epoch": 0.29006890171158667, + "grad_norm": 1.6574389934539795, + "learning_rate": 3.3316217987949656e-05, + "loss": 0.8692, + "step": 5957 + }, + { + "epoch": 0.2901175955006939, + "grad_norm": 1.5357003211975098, + "learning_rate": 3.3313864283729225e-05, + "loss": 0.9163, + "step": 5958 + }, + { + "epoch": 0.2901662892898011, + "grad_norm": 2.213242769241333, + "learning_rate": 3.331151024832521e-05, + "loss": 0.8427, + "step": 5959 + }, + { + "epoch": 0.2902149830789083, + "grad_norm": 1.546213984489441, + "learning_rate": 3.3309155881796155e-05, + "loss": 0.8278, + "step": 5960 + }, + { + "epoch": 0.29026367686801546, + "grad_norm": 1.5443638563156128, + "learning_rate": 3.330680118420064e-05, + "loss": 0.8226, + "step": 5961 + }, + { + "epoch": 0.2903123706571227, + "grad_norm": 2.3884952068328857, + "learning_rate": 3.330444615559723e-05, + "loss": 0.9314, + "step": 5962 + }, + { + "epoch": 0.2903610644462299, + "grad_norm": 1.6643273830413818, + "learning_rate": 3.330209079604451e-05, + "loss": 0.8114, + "step": 5963 + }, + { + "epoch": 0.2904097582353371, + "grad_norm": 1.3709323406219482, + "learning_rate": 3.3299735105601066e-05, + "loss": 0.881, + "step": 5964 + }, + { + "epoch": 0.29045845202444426, + "grad_norm": 1.5826765298843384, + "learning_rate": 3.3297379084325504e-05, + "loss": 0.8029, + "step": 5965 + }, + { + "epoch": 0.29050714581355147, + "grad_norm": 2.1202948093414307, + "learning_rate": 3.329502273227642e-05, + "loss": 0.9378, + "step": 5966 + }, + { + "epoch": 0.2905558396026587, + "grad_norm": 1.7752817869186401, + "learning_rate": 3.329266604951244e-05, + "loss": 0.9137, + "step": 5967 + }, + { + "epoch": 0.2906045333917659, + "grad_norm": 3.023425579071045, + "learning_rate": 3.329030903609217e-05, + "loss": 0.8176, + "step": 5968 + }, + { + "epoch": 0.29065322718087305, + "grad_norm": 1.5064020156860352, + "learning_rate": 3.3287951692074256e-05, + "loss": 0.7949, + "step": 5969 + }, + { + "epoch": 0.29070192096998027, + "grad_norm": 2.7986323833465576, + "learning_rate": 3.3285594017517335e-05, + "loss": 0.9282, + "step": 5970 + }, + { + "epoch": 0.2907506147590875, + "grad_norm": 1.6432335376739502, + "learning_rate": 3.3283236012480046e-05, + "loss": 0.8677, + "step": 5971 + }, + { + "epoch": 0.2907993085481947, + "grad_norm": 1.4716795682907104, + "learning_rate": 3.328087767702106e-05, + "loss": 0.9146, + "step": 5972 + }, + { + "epoch": 0.2908480023373019, + "grad_norm": 2.2649624347686768, + "learning_rate": 3.327851901119902e-05, + "loss": 0.9403, + "step": 5973 + }, + { + "epoch": 0.29089669612640906, + "grad_norm": 2.627318859100342, + "learning_rate": 3.3276160015072615e-05, + "loss": 0.8968, + "step": 5974 + }, + { + "epoch": 0.2909453899155163, + "grad_norm": 1.6213408708572388, + "learning_rate": 3.3273800688700514e-05, + "loss": 0.8299, + "step": 5975 + }, + { + "epoch": 0.2909940837046235, + "grad_norm": 1.995453119277954, + "learning_rate": 3.327144103214141e-05, + "loss": 0.8707, + "step": 5976 + }, + { + "epoch": 0.2910427774937307, + "grad_norm": 2.0680325031280518, + "learning_rate": 3.326908104545401e-05, + "loss": 0.902, + "step": 5977 + }, + { + "epoch": 0.29109147128283785, + "grad_norm": 0.08611857146024704, + "learning_rate": 3.3266720728697e-05, + "loss": 0.5383, + "step": 5978 + }, + { + "epoch": 0.29114016507194507, + "grad_norm": 8.303750991821289, + "learning_rate": 3.326436008192911e-05, + "loss": 0.9224, + "step": 5979 + }, + { + "epoch": 0.2911888588610523, + "grad_norm": 2.6481525897979736, + "learning_rate": 3.326199910520904e-05, + "loss": 0.8661, + "step": 5980 + }, + { + "epoch": 0.2912375526501595, + "grad_norm": 1.8895236253738403, + "learning_rate": 3.325963779859555e-05, + "loss": 0.8166, + "step": 5981 + }, + { + "epoch": 0.29128624643926665, + "grad_norm": 7.813030242919922, + "learning_rate": 3.325727616214735e-05, + "loss": 0.8792, + "step": 5982 + }, + { + "epoch": 0.29133494022837386, + "grad_norm": 1.6553714275360107, + "learning_rate": 3.32549141959232e-05, + "loss": 0.8934, + "step": 5983 + }, + { + "epoch": 0.2913836340174811, + "grad_norm": 1.5758332014083862, + "learning_rate": 3.325255189998185e-05, + "loss": 0.9274, + "step": 5984 + }, + { + "epoch": 0.2914323278065883, + "grad_norm": 1.8265888690948486, + "learning_rate": 3.3250189274382065e-05, + "loss": 0.8659, + "step": 5985 + }, + { + "epoch": 0.29148102159569544, + "grad_norm": 1.7262822389602661, + "learning_rate": 3.324782631918261e-05, + "loss": 0.8457, + "step": 5986 + }, + { + "epoch": 0.29152971538480266, + "grad_norm": 1.371204137802124, + "learning_rate": 3.324546303444227e-05, + "loss": 0.9417, + "step": 5987 + }, + { + "epoch": 0.29157840917390987, + "grad_norm": 1.6276341676712036, + "learning_rate": 3.3243099420219826e-05, + "loss": 0.8183, + "step": 5988 + }, + { + "epoch": 0.2916271029630171, + "grad_norm": 2.027500867843628, + "learning_rate": 3.324073547657408e-05, + "loss": 0.8546, + "step": 5989 + }, + { + "epoch": 0.29167579675212424, + "grad_norm": 1.9230066537857056, + "learning_rate": 3.323837120356383e-05, + "loss": 0.8998, + "step": 5990 + }, + { + "epoch": 0.29172449054123145, + "grad_norm": 1.5240771770477295, + "learning_rate": 3.3236006601247894e-05, + "loss": 0.8623, + "step": 5991 + }, + { + "epoch": 0.29177318433033866, + "grad_norm": 2.3690335750579834, + "learning_rate": 3.3233641669685084e-05, + "loss": 0.9312, + "step": 5992 + }, + { + "epoch": 0.2918218781194459, + "grad_norm": 2.825338363647461, + "learning_rate": 3.323127640893423e-05, + "loss": 0.9201, + "step": 5993 + }, + { + "epoch": 0.2918705719085531, + "grad_norm": 22.41427230834961, + "learning_rate": 3.322891081905417e-05, + "loss": 0.8085, + "step": 5994 + }, + { + "epoch": 0.29191926569766025, + "grad_norm": 2.0704898834228516, + "learning_rate": 3.322654490010375e-05, + "loss": 0.8851, + "step": 5995 + }, + { + "epoch": 0.29196795948676746, + "grad_norm": 2.6165931224823, + "learning_rate": 3.322417865214181e-05, + "loss": 0.7997, + "step": 5996 + }, + { + "epoch": 0.29201665327587467, + "grad_norm": 1.5391619205474854, + "learning_rate": 3.3221812075227226e-05, + "loss": 0.9163, + "step": 5997 + }, + { + "epoch": 0.2920653470649819, + "grad_norm": 8.2843599319458, + "learning_rate": 3.321944516941886e-05, + "loss": 0.8928, + "step": 5998 + }, + { + "epoch": 0.29211404085408904, + "grad_norm": 0.08779750019311905, + "learning_rate": 3.3217077934775594e-05, + "loss": 0.6414, + "step": 5999 + }, + { + "epoch": 0.29216273464319625, + "grad_norm": 1.7340840101242065, + "learning_rate": 3.3214710371356306e-05, + "loss": 0.8559, + "step": 6000 + }, + { + "epoch": 0.29221142843230347, + "grad_norm": 1.3716795444488525, + "learning_rate": 3.32123424792199e-05, + "loss": 0.7882, + "step": 6001 + }, + { + "epoch": 0.2922601222214107, + "grad_norm": 1.4260910749435425, + "learning_rate": 3.3209974258425264e-05, + "loss": 0.9603, + "step": 6002 + }, + { + "epoch": 0.29230881601051784, + "grad_norm": 1.6395409107208252, + "learning_rate": 3.320760570903131e-05, + "loss": 0.9491, + "step": 6003 + }, + { + "epoch": 0.29235750979962505, + "grad_norm": 1.7470455169677734, + "learning_rate": 3.320523683109696e-05, + "loss": 0.8946, + "step": 6004 + }, + { + "epoch": 0.29240620358873226, + "grad_norm": 2.1234071254730225, + "learning_rate": 3.3202867624681144e-05, + "loss": 0.7242, + "step": 6005 + }, + { + "epoch": 0.2924548973778395, + "grad_norm": 2.878518581390381, + "learning_rate": 3.320049808984279e-05, + "loss": 0.8201, + "step": 6006 + }, + { + "epoch": 0.29250359116694663, + "grad_norm": 1.4175729751586914, + "learning_rate": 3.319812822664084e-05, + "loss": 0.8541, + "step": 6007 + }, + { + "epoch": 0.29255228495605384, + "grad_norm": 1.368153691291809, + "learning_rate": 3.319575803513425e-05, + "loss": 0.9292, + "step": 6008 + }, + { + "epoch": 0.29260097874516106, + "grad_norm": 3.57430362701416, + "learning_rate": 3.3193387515381985e-05, + "loss": 0.8073, + "step": 6009 + }, + { + "epoch": 0.29264967253426827, + "grad_norm": 1.6634443998336792, + "learning_rate": 3.319101666744299e-05, + "loss": 0.7931, + "step": 6010 + }, + { + "epoch": 0.2926983663233755, + "grad_norm": 1.4950392246246338, + "learning_rate": 3.318864549137626e-05, + "loss": 0.8672, + "step": 6011 + }, + { + "epoch": 0.29274706011248264, + "grad_norm": 2.794163942337036, + "learning_rate": 3.318627398724076e-05, + "loss": 0.8652, + "step": 6012 + }, + { + "epoch": 0.29279575390158985, + "grad_norm": 1.321109414100647, + "learning_rate": 3.318390215509551e-05, + "loss": 0.7912, + "step": 6013 + }, + { + "epoch": 0.29284444769069706, + "grad_norm": 2.183377981185913, + "learning_rate": 3.318152999499948e-05, + "loss": 0.9066, + "step": 6014 + }, + { + "epoch": 0.2928931414798043, + "grad_norm": 1.69114089012146, + "learning_rate": 3.3179157507011694e-05, + "loss": 0.7821, + "step": 6015 + }, + { + "epoch": 0.29294183526891143, + "grad_norm": 2.0644752979278564, + "learning_rate": 3.317678469119115e-05, + "loss": 0.8674, + "step": 6016 + }, + { + "epoch": 0.29299052905801864, + "grad_norm": 2.1264519691467285, + "learning_rate": 3.31744115475969e-05, + "loss": 0.8723, + "step": 6017 + }, + { + "epoch": 0.29303922284712586, + "grad_norm": 2.6180636882781982, + "learning_rate": 3.3172038076287966e-05, + "loss": 0.914, + "step": 6018 + }, + { + "epoch": 0.29308791663623307, + "grad_norm": 1.6419966220855713, + "learning_rate": 3.316966427732337e-05, + "loss": 0.9158, + "step": 6019 + }, + { + "epoch": 0.2931366104253402, + "grad_norm": 4.6089911460876465, + "learning_rate": 3.316729015076218e-05, + "loss": 0.9604, + "step": 6020 + }, + { + "epoch": 0.29318530421444744, + "grad_norm": 1.5847876071929932, + "learning_rate": 3.316491569666345e-05, + "loss": 0.8472, + "step": 6021 + }, + { + "epoch": 0.29323399800355465, + "grad_norm": 1.4323499202728271, + "learning_rate": 3.316254091508623e-05, + "loss": 0.8483, + "step": 6022 + }, + { + "epoch": 0.29328269179266186, + "grad_norm": 1.5304324626922607, + "learning_rate": 3.3160165806089617e-05, + "loss": 0.913, + "step": 6023 + }, + { + "epoch": 0.293331385581769, + "grad_norm": 1.5793637037277222, + "learning_rate": 3.3157790369732676e-05, + "loss": 0.8386, + "step": 6024 + }, + { + "epoch": 0.29338007937087623, + "grad_norm": 2.013375997543335, + "learning_rate": 3.31554146060745e-05, + "loss": 0.7964, + "step": 6025 + }, + { + "epoch": 0.29342877315998345, + "grad_norm": 1.8855795860290527, + "learning_rate": 3.315303851517419e-05, + "loss": 0.8286, + "step": 6026 + }, + { + "epoch": 0.29347746694909066, + "grad_norm": 1.558244228363037, + "learning_rate": 3.315066209709084e-05, + "loss": 0.819, + "step": 6027 + }, + { + "epoch": 0.2935261607381978, + "grad_norm": 1.6413675546646118, + "learning_rate": 3.3148285351883576e-05, + "loss": 0.8781, + "step": 6028 + }, + { + "epoch": 0.29357485452730503, + "grad_norm": 2.4772655963897705, + "learning_rate": 3.314590827961151e-05, + "loss": 0.8273, + "step": 6029 + }, + { + "epoch": 0.29362354831641224, + "grad_norm": 1.7521953582763672, + "learning_rate": 3.3143530880333784e-05, + "loss": 0.8502, + "step": 6030 + }, + { + "epoch": 0.29367224210551945, + "grad_norm": 1.2460750341415405, + "learning_rate": 3.314115315410953e-05, + "loss": 0.8275, + "step": 6031 + }, + { + "epoch": 0.29372093589462667, + "grad_norm": 1.6969454288482666, + "learning_rate": 3.3138775100997885e-05, + "loss": 0.9073, + "step": 6032 + }, + { + "epoch": 0.2937696296837338, + "grad_norm": 1.5814403295516968, + "learning_rate": 3.313639672105802e-05, + "loss": 0.8219, + "step": 6033 + }, + { + "epoch": 0.29381832347284104, + "grad_norm": 1.4609347581863403, + "learning_rate": 3.313401801434908e-05, + "loss": 0.8668, + "step": 6034 + }, + { + "epoch": 0.29386701726194825, + "grad_norm": 1.5846458673477173, + "learning_rate": 3.3131638980930246e-05, + "loss": 0.9096, + "step": 6035 + }, + { + "epoch": 0.29391571105105546, + "grad_norm": 1.4096286296844482, + "learning_rate": 3.31292596208607e-05, + "loss": 0.901, + "step": 6036 + }, + { + "epoch": 0.2939644048401626, + "grad_norm": 2.312185287475586, + "learning_rate": 3.312687993419962e-05, + "loss": 0.8698, + "step": 6037 + }, + { + "epoch": 0.29401309862926983, + "grad_norm": 0.08823508769273758, + "learning_rate": 3.3124499921006206e-05, + "loss": 0.5894, + "step": 6038 + }, + { + "epoch": 0.29406179241837704, + "grad_norm": 1.5492810010910034, + "learning_rate": 3.312211958133966e-05, + "loss": 0.8914, + "step": 6039 + }, + { + "epoch": 0.29411048620748426, + "grad_norm": 6.562582492828369, + "learning_rate": 3.311973891525919e-05, + "loss": 0.914, + "step": 6040 + }, + { + "epoch": 0.2941591799965914, + "grad_norm": 1.9004974365234375, + "learning_rate": 3.311735792282403e-05, + "loss": 0.8822, + "step": 6041 + }, + { + "epoch": 0.2942078737856986, + "grad_norm": 2.2490220069885254, + "learning_rate": 3.3114976604093386e-05, + "loss": 0.9885, + "step": 6042 + }, + { + "epoch": 0.29425656757480584, + "grad_norm": 0.0867455005645752, + "learning_rate": 3.31125949591265e-05, + "loss": 0.6561, + "step": 6043 + }, + { + "epoch": 0.29430526136391305, + "grad_norm": 1.3098020553588867, + "learning_rate": 3.3110212987982624e-05, + "loss": 0.9501, + "step": 6044 + }, + { + "epoch": 0.2943539551530202, + "grad_norm": 1.0831400156021118, + "learning_rate": 3.3107830690721e-05, + "loss": 0.945, + "step": 6045 + }, + { + "epoch": 0.2944026489421274, + "grad_norm": 1.7442346811294556, + "learning_rate": 3.31054480674009e-05, + "loss": 0.9176, + "step": 6046 + }, + { + "epoch": 0.29445134273123463, + "grad_norm": 1.8193556070327759, + "learning_rate": 3.310306511808158e-05, + "loss": 0.8209, + "step": 6047 + }, + { + "epoch": 0.29450003652034185, + "grad_norm": 1.5590887069702148, + "learning_rate": 3.310068184282232e-05, + "loss": 0.9328, + "step": 6048 + }, + { + "epoch": 0.294548730309449, + "grad_norm": 2.143649101257324, + "learning_rate": 3.309829824168241e-05, + "loss": 0.7914, + "step": 6049 + }, + { + "epoch": 0.2945974240985562, + "grad_norm": 1.2062100172042847, + "learning_rate": 3.309591431472113e-05, + "loss": 0.8523, + "step": 6050 + }, + { + "epoch": 0.29464611788766343, + "grad_norm": 1.6227210760116577, + "learning_rate": 3.309353006199779e-05, + "loss": 0.8631, + "step": 6051 + }, + { + "epoch": 0.29469481167677064, + "grad_norm": 7.947082996368408, + "learning_rate": 3.30911454835717e-05, + "loss": 0.833, + "step": 6052 + }, + { + "epoch": 0.29474350546587785, + "grad_norm": 1.9733288288116455, + "learning_rate": 3.308876057950216e-05, + "loss": 0.9575, + "step": 6053 + }, + { + "epoch": 0.294792199254985, + "grad_norm": 1.571847915649414, + "learning_rate": 3.308637534984852e-05, + "loss": 0.8289, + "step": 6054 + }, + { + "epoch": 0.2948408930440922, + "grad_norm": 1.3483209609985352, + "learning_rate": 3.3083989794670096e-05, + "loss": 0.8449, + "step": 6055 + }, + { + "epoch": 0.29488958683319944, + "grad_norm": 9.37887191772461, + "learning_rate": 3.308160391402623e-05, + "loss": 0.8438, + "step": 6056 + }, + { + "epoch": 0.29493828062230665, + "grad_norm": 1.7575026750564575, + "learning_rate": 3.307921770797628e-05, + "loss": 0.9127, + "step": 6057 + }, + { + "epoch": 0.2949869744114138, + "grad_norm": 1.2065889835357666, + "learning_rate": 3.307683117657959e-05, + "loss": 0.881, + "step": 6058 + }, + { + "epoch": 0.295035668200521, + "grad_norm": 1.8647383451461792, + "learning_rate": 3.307444431989554e-05, + "loss": 0.9419, + "step": 6059 + }, + { + "epoch": 0.29508436198962823, + "grad_norm": 1.7861868143081665, + "learning_rate": 3.307205713798349e-05, + "loss": 0.7594, + "step": 6060 + }, + { + "epoch": 0.29513305577873544, + "grad_norm": 1.5357680320739746, + "learning_rate": 3.306966963090282e-05, + "loss": 0.8746, + "step": 6061 + }, + { + "epoch": 0.2951817495678426, + "grad_norm": 1.2037748098373413, + "learning_rate": 3.3067281798712946e-05, + "loss": 0.8522, + "step": 6062 + }, + { + "epoch": 0.2952304433569498, + "grad_norm": 1.6096305847167969, + "learning_rate": 3.306489364147323e-05, + "loss": 0.9215, + "step": 6063 + }, + { + "epoch": 0.295279137146057, + "grad_norm": 1.3863600492477417, + "learning_rate": 3.30625051592431e-05, + "loss": 0.843, + "step": 6064 + }, + { + "epoch": 0.29532783093516424, + "grad_norm": 1.729541301727295, + "learning_rate": 3.306011635208195e-05, + "loss": 0.8378, + "step": 6065 + }, + { + "epoch": 0.2953765247242714, + "grad_norm": 1.8502718210220337, + "learning_rate": 3.3057727220049227e-05, + "loss": 0.8996, + "step": 6066 + }, + { + "epoch": 0.2954252185133786, + "grad_norm": 1.3233263492584229, + "learning_rate": 3.3055337763204356e-05, + "loss": 0.8771, + "step": 6067 + }, + { + "epoch": 0.2954739123024858, + "grad_norm": 5.0149312019348145, + "learning_rate": 3.3052947981606755e-05, + "loss": 0.9021, + "step": 6068 + }, + { + "epoch": 0.29552260609159303, + "grad_norm": 1.5748050212860107, + "learning_rate": 3.3050557875315886e-05, + "loss": 0.8941, + "step": 6069 + }, + { + "epoch": 0.2955712998807002, + "grad_norm": 2.5239830017089844, + "learning_rate": 3.3048167444391204e-05, + "loss": 0.8444, + "step": 6070 + }, + { + "epoch": 0.2956199936698074, + "grad_norm": 1.7164225578308105, + "learning_rate": 3.304577668889216e-05, + "loss": 0.855, + "step": 6071 + }, + { + "epoch": 0.2956686874589146, + "grad_norm": 1.904147744178772, + "learning_rate": 3.304338560887824e-05, + "loss": 0.8583, + "step": 6072 + }, + { + "epoch": 0.2957173812480218, + "grad_norm": 2.947483777999878, + "learning_rate": 3.304099420440891e-05, + "loss": 0.8948, + "step": 6073 + }, + { + "epoch": 0.29576607503712904, + "grad_norm": 1.6714293956756592, + "learning_rate": 3.303860247554366e-05, + "loss": 0.8952, + "step": 6074 + }, + { + "epoch": 0.2958147688262362, + "grad_norm": 1.505373477935791, + "learning_rate": 3.3036210422341986e-05, + "loss": 0.9323, + "step": 6075 + }, + { + "epoch": 0.2958634626153434, + "grad_norm": 1.578953504562378, + "learning_rate": 3.303381804486338e-05, + "loss": 0.8168, + "step": 6076 + }, + { + "epoch": 0.2959121564044506, + "grad_norm": 1.6213529109954834, + "learning_rate": 3.3031425343167374e-05, + "loss": 0.7945, + "step": 6077 + }, + { + "epoch": 0.29596085019355783, + "grad_norm": 2.003342390060425, + "learning_rate": 3.302903231731347e-05, + "loss": 0.9592, + "step": 6078 + }, + { + "epoch": 0.296009543982665, + "grad_norm": 1.6318942308425903, + "learning_rate": 3.30266389673612e-05, + "loss": 0.9554, + "step": 6079 + }, + { + "epoch": 0.2960582377717722, + "grad_norm": 1.5101640224456787, + "learning_rate": 3.3024245293370096e-05, + "loss": 0.8236, + "step": 6080 + }, + { + "epoch": 0.2961069315608794, + "grad_norm": 1.6588096618652344, + "learning_rate": 3.30218512953997e-05, + "loss": 0.8661, + "step": 6081 + }, + { + "epoch": 0.29615562534998663, + "grad_norm": 1.6645193099975586, + "learning_rate": 3.3019456973509573e-05, + "loss": 0.941, + "step": 6082 + }, + { + "epoch": 0.2962043191390938, + "grad_norm": 2.2116894721984863, + "learning_rate": 3.301706232775926e-05, + "loss": 0.9712, + "step": 6083 + }, + { + "epoch": 0.296253012928201, + "grad_norm": 2.544556140899658, + "learning_rate": 3.301466735820834e-05, + "loss": 0.8809, + "step": 6084 + }, + { + "epoch": 0.2963017067173082, + "grad_norm": 1.8584237098693848, + "learning_rate": 3.301227206491638e-05, + "loss": 0.8397, + "step": 6085 + }, + { + "epoch": 0.2963504005064154, + "grad_norm": 2.006981372833252, + "learning_rate": 3.3009876447942964e-05, + "loss": 0.8014, + "step": 6086 + }, + { + "epoch": 0.2963990942955226, + "grad_norm": 1.5391805171966553, + "learning_rate": 3.3007480507347694e-05, + "loss": 0.9513, + "step": 6087 + }, + { + "epoch": 0.2964477880846298, + "grad_norm": 1.5570893287658691, + "learning_rate": 3.300508424319015e-05, + "loss": 0.8587, + "step": 6088 + }, + { + "epoch": 0.296496481873737, + "grad_norm": 3.4815409183502197, + "learning_rate": 3.300268765552996e-05, + "loss": 0.8628, + "step": 6089 + }, + { + "epoch": 0.2965451756628442, + "grad_norm": 1.8896373510360718, + "learning_rate": 3.300029074442671e-05, + "loss": 0.8663, + "step": 6090 + }, + { + "epoch": 0.29659386945195143, + "grad_norm": 2.2360219955444336, + "learning_rate": 3.2997893509940064e-05, + "loss": 0.9468, + "step": 6091 + }, + { + "epoch": 0.2966425632410586, + "grad_norm": 0.09126083552837372, + "learning_rate": 3.2995495952129616e-05, + "loss": 0.667, + "step": 6092 + }, + { + "epoch": 0.2966912570301658, + "grad_norm": 1.4897822141647339, + "learning_rate": 3.2993098071055035e-05, + "loss": 0.8058, + "step": 6093 + }, + { + "epoch": 0.296739950819273, + "grad_norm": 1.675536870956421, + "learning_rate": 3.299069986677594e-05, + "loss": 0.8448, + "step": 6094 + }, + { + "epoch": 0.2967886446083802, + "grad_norm": 1.6804596185684204, + "learning_rate": 3.2988301339352014e-05, + "loss": 0.9242, + "step": 6095 + }, + { + "epoch": 0.2968373383974874, + "grad_norm": 1.7693413496017456, + "learning_rate": 3.2985902488842905e-05, + "loss": 0.8351, + "step": 6096 + }, + { + "epoch": 0.2968860321865946, + "grad_norm": 1.5127900838851929, + "learning_rate": 3.298350331530828e-05, + "loss": 0.8813, + "step": 6097 + }, + { + "epoch": 0.2969347259757018, + "grad_norm": 3.715470552444458, + "learning_rate": 3.2981103818807834e-05, + "loss": 0.9345, + "step": 6098 + }, + { + "epoch": 0.296983419764809, + "grad_norm": 1.53611159324646, + "learning_rate": 3.2978703999401243e-05, + "loss": 0.8753, + "step": 6099 + }, + { + "epoch": 0.2970321135539162, + "grad_norm": 5.146892070770264, + "learning_rate": 3.297630385714821e-05, + "loss": 0.8907, + "step": 6100 + }, + { + "epoch": 0.2970808073430234, + "grad_norm": 1.4955440759658813, + "learning_rate": 3.297390339210844e-05, + "loss": 0.8583, + "step": 6101 + }, + { + "epoch": 0.2971295011321306, + "grad_norm": 1.8907291889190674, + "learning_rate": 3.297150260434163e-05, + "loss": 0.8399, + "step": 6102 + }, + { + "epoch": 0.2971781949212378, + "grad_norm": 2.258697509765625, + "learning_rate": 3.296910149390752e-05, + "loss": 0.8795, + "step": 6103 + }, + { + "epoch": 0.29722688871034497, + "grad_norm": 1.692116618156433, + "learning_rate": 3.296670006086582e-05, + "loss": 0.8532, + "step": 6104 + }, + { + "epoch": 0.2972755824994522, + "grad_norm": 2.577571153640747, + "learning_rate": 3.296429830527627e-05, + "loss": 0.7776, + "step": 6105 + }, + { + "epoch": 0.2973242762885594, + "grad_norm": 1.989997148513794, + "learning_rate": 3.296189622719863e-05, + "loss": 0.8727, + "step": 6106 + }, + { + "epoch": 0.2973729700776666, + "grad_norm": 1.1636359691619873, + "learning_rate": 3.295949382669263e-05, + "loss": 0.8708, + "step": 6107 + }, + { + "epoch": 0.29742166386677377, + "grad_norm": 1.557752251625061, + "learning_rate": 3.295709110381804e-05, + "loss": 0.8522, + "step": 6108 + }, + { + "epoch": 0.297470357655881, + "grad_norm": 1.5851950645446777, + "learning_rate": 3.2954688058634635e-05, + "loss": 0.8689, + "step": 6109 + }, + { + "epoch": 0.2975190514449882, + "grad_norm": 2.014822006225586, + "learning_rate": 3.295228469120218e-05, + "loss": 0.8316, + "step": 6110 + }, + { + "epoch": 0.2975677452340954, + "grad_norm": 1.3744105100631714, + "learning_rate": 3.294988100158046e-05, + "loss": 0.8578, + "step": 6111 + }, + { + "epoch": 0.2976164390232026, + "grad_norm": 0.09317810088396072, + "learning_rate": 3.294747698982927e-05, + "loss": 0.6381, + "step": 6112 + }, + { + "epoch": 0.2976651328123098, + "grad_norm": 1.4898078441619873, + "learning_rate": 3.294507265600841e-05, + "loss": 0.7316, + "step": 6113 + }, + { + "epoch": 0.297713826601417, + "grad_norm": 1.5927016735076904, + "learning_rate": 3.2942668000177685e-05, + "loss": 0.8096, + "step": 6114 + }, + { + "epoch": 0.2977625203905242, + "grad_norm": 1.8206920623779297, + "learning_rate": 3.294026302239691e-05, + "loss": 0.8559, + "step": 6115 + }, + { + "epoch": 0.2978112141796314, + "grad_norm": 1.6159757375717163, + "learning_rate": 3.293785772272593e-05, + "loss": 0.8233, + "step": 6116 + }, + { + "epoch": 0.29785990796873857, + "grad_norm": 2.3959109783172607, + "learning_rate": 3.2935452101224536e-05, + "loss": 0.8595, + "step": 6117 + }, + { + "epoch": 0.2979086017578458, + "grad_norm": 1.4817885160446167, + "learning_rate": 3.293304615795261e-05, + "loss": 0.9877, + "step": 6118 + }, + { + "epoch": 0.297957295546953, + "grad_norm": 2.271169900894165, + "learning_rate": 3.2930639892969976e-05, + "loss": 0.8435, + "step": 6119 + }, + { + "epoch": 0.2980059893360602, + "grad_norm": 1.6933541297912598, + "learning_rate": 3.292823330633649e-05, + "loss": 0.8177, + "step": 6120 + }, + { + "epoch": 0.29805468312516736, + "grad_norm": 1.733593225479126, + "learning_rate": 3.292582639811203e-05, + "loss": 0.9279, + "step": 6121 + }, + { + "epoch": 0.2981033769142746, + "grad_norm": 2.547558546066284, + "learning_rate": 3.2923419168356454e-05, + "loss": 0.768, + "step": 6122 + }, + { + "epoch": 0.2981520707033818, + "grad_norm": 2.833815336227417, + "learning_rate": 3.292101161712965e-05, + "loss": 0.8234, + "step": 6123 + }, + { + "epoch": 0.298200764492489, + "grad_norm": 1.4390662908554077, + "learning_rate": 3.29186037444915e-05, + "loss": 0.9262, + "step": 6124 + }, + { + "epoch": 0.29824945828159616, + "grad_norm": 2.3164243698120117, + "learning_rate": 3.2916195550501906e-05, + "loss": 0.9765, + "step": 6125 + }, + { + "epoch": 0.29829815207070337, + "grad_norm": 2.1027984619140625, + "learning_rate": 3.291378703522077e-05, + "loss": 0.9585, + "step": 6126 + }, + { + "epoch": 0.2983468458598106, + "grad_norm": 1.8240253925323486, + "learning_rate": 3.291137819870801e-05, + "loss": 0.7257, + "step": 6127 + }, + { + "epoch": 0.2983955396489178, + "grad_norm": 2.544149398803711, + "learning_rate": 3.2908969041023535e-05, + "loss": 0.8387, + "step": 6128 + }, + { + "epoch": 0.29844423343802495, + "grad_norm": 1.7446463108062744, + "learning_rate": 3.290655956222728e-05, + "loss": 0.8369, + "step": 6129 + }, + { + "epoch": 0.29849292722713217, + "grad_norm": 1.4706730842590332, + "learning_rate": 3.290414976237918e-05, + "loss": 0.9415, + "step": 6130 + }, + { + "epoch": 0.2985416210162394, + "grad_norm": 1.6579715013504028, + "learning_rate": 3.2901739641539173e-05, + "loss": 0.7957, + "step": 6131 + }, + { + "epoch": 0.2985903148053466, + "grad_norm": 1.5876940488815308, + "learning_rate": 3.289932919976722e-05, + "loss": 0.7982, + "step": 6132 + }, + { + "epoch": 0.2986390085944538, + "grad_norm": 3.6068637371063232, + "learning_rate": 3.289691843712328e-05, + "loss": 0.9891, + "step": 6133 + }, + { + "epoch": 0.29868770238356096, + "grad_norm": 2.7850089073181152, + "learning_rate": 3.289450735366731e-05, + "loss": 0.9927, + "step": 6134 + }, + { + "epoch": 0.2987363961726682, + "grad_norm": 1.9991559982299805, + "learning_rate": 3.28920959494593e-05, + "loss": 0.8629, + "step": 6135 + }, + { + "epoch": 0.2987850899617754, + "grad_norm": 1.6473149061203003, + "learning_rate": 3.288968422455922e-05, + "loss": 0.9209, + "step": 6136 + }, + { + "epoch": 0.2988337837508826, + "grad_norm": 1.6022660732269287, + "learning_rate": 3.2887272179027084e-05, + "loss": 0.8378, + "step": 6137 + }, + { + "epoch": 0.29888247753998975, + "grad_norm": 1.3413877487182617, + "learning_rate": 3.2884859812922865e-05, + "loss": 0.7908, + "step": 6138 + }, + { + "epoch": 0.29893117132909697, + "grad_norm": 4.132745742797852, + "learning_rate": 3.288244712630659e-05, + "loss": 0.8332, + "step": 6139 + }, + { + "epoch": 0.2989798651182042, + "grad_norm": 1.8223168849945068, + "learning_rate": 3.288003411923827e-05, + "loss": 0.9133, + "step": 6140 + }, + { + "epoch": 0.2990285589073114, + "grad_norm": 1.602270245552063, + "learning_rate": 3.287762079177792e-05, + "loss": 0.9544, + "step": 6141 + }, + { + "epoch": 0.29907725269641855, + "grad_norm": 2.3866026401519775, + "learning_rate": 3.287520714398558e-05, + "loss": 0.8471, + "step": 6142 + }, + { + "epoch": 0.29912594648552576, + "grad_norm": 1.8191190958023071, + "learning_rate": 3.2872793175921285e-05, + "loss": 0.9376, + "step": 6143 + }, + { + "epoch": 0.299174640274633, + "grad_norm": 0.08715606480836868, + "learning_rate": 3.287037888764509e-05, + "loss": 0.6023, + "step": 6144 + }, + { + "epoch": 0.2992233340637402, + "grad_norm": 1.8962393999099731, + "learning_rate": 3.286796427921705e-05, + "loss": 0.9259, + "step": 6145 + }, + { + "epoch": 0.29927202785284734, + "grad_norm": 1.7748216390609741, + "learning_rate": 3.286554935069722e-05, + "loss": 0.9278, + "step": 6146 + }, + { + "epoch": 0.29932072164195456, + "grad_norm": 1.931921362876892, + "learning_rate": 3.286313410214568e-05, + "loss": 0.8436, + "step": 6147 + }, + { + "epoch": 0.29936941543106177, + "grad_norm": 2.1975536346435547, + "learning_rate": 3.286071853362251e-05, + "loss": 0.8135, + "step": 6148 + }, + { + "epoch": 0.299418109220169, + "grad_norm": 0.08541274815797806, + "learning_rate": 3.285830264518779e-05, + "loss": 0.6167, + "step": 6149 + }, + { + "epoch": 0.2994668030092762, + "grad_norm": 2.0576469898223877, + "learning_rate": 3.2855886436901614e-05, + "loss": 0.8298, + "step": 6150 + }, + { + "epoch": 0.29951549679838335, + "grad_norm": 1.3159679174423218, + "learning_rate": 3.2853469908824104e-05, + "loss": 0.9458, + "step": 6151 + }, + { + "epoch": 0.29956419058749056, + "grad_norm": 1.3273906707763672, + "learning_rate": 3.285105306101535e-05, + "loss": 0.8675, + "step": 6152 + }, + { + "epoch": 0.2996128843765978, + "grad_norm": 1.720333456993103, + "learning_rate": 3.2848635893535474e-05, + "loss": 0.8217, + "step": 6153 + }, + { + "epoch": 0.299661578165705, + "grad_norm": 1.5431865453720093, + "learning_rate": 3.284621840644461e-05, + "loss": 0.8731, + "step": 6154 + }, + { + "epoch": 0.29971027195481215, + "grad_norm": 2.3367574214935303, + "learning_rate": 3.28438005998029e-05, + "loss": 0.7624, + "step": 6155 + }, + { + "epoch": 0.29975896574391936, + "grad_norm": 1.701305627822876, + "learning_rate": 3.284138247367048e-05, + "loss": 0.9017, + "step": 6156 + }, + { + "epoch": 0.29980765953302657, + "grad_norm": 2.087297201156616, + "learning_rate": 3.283896402810749e-05, + "loss": 0.9277, + "step": 6157 + }, + { + "epoch": 0.2998563533221338, + "grad_norm": 1.5411958694458008, + "learning_rate": 3.2836545263174105e-05, + "loss": 0.8466, + "step": 6158 + }, + { + "epoch": 0.29990504711124094, + "grad_norm": 1.4001003503799438, + "learning_rate": 3.283412617893049e-05, + "loss": 0.8535, + "step": 6159 + }, + { + "epoch": 0.29995374090034815, + "grad_norm": 1.7044399976730347, + "learning_rate": 3.283170677543681e-05, + "loss": 0.9175, + "step": 6160 + }, + { + "epoch": 0.30000243468945537, + "grad_norm": 1.492347002029419, + "learning_rate": 3.282928705275326e-05, + "loss": 0.9145, + "step": 6161 + }, + { + "epoch": 0.3000511284785626, + "grad_norm": 1.4050724506378174, + "learning_rate": 3.282686701094002e-05, + "loss": 0.7973, + "step": 6162 + }, + { + "epoch": 0.30009982226766974, + "grad_norm": 2.043665647506714, + "learning_rate": 3.282444665005729e-05, + "loss": 0.8395, + "step": 6163 + }, + { + "epoch": 0.30014851605677695, + "grad_norm": 1.7127765417099, + "learning_rate": 3.282202597016528e-05, + "loss": 0.9191, + "step": 6164 + }, + { + "epoch": 0.30019720984588416, + "grad_norm": 1.6476532220840454, + "learning_rate": 3.281960497132421e-05, + "loss": 0.8253, + "step": 6165 + }, + { + "epoch": 0.3002459036349914, + "grad_norm": 1.5025161504745483, + "learning_rate": 3.28171836535943e-05, + "loss": 0.7723, + "step": 6166 + }, + { + "epoch": 0.30029459742409853, + "grad_norm": 1.6937154531478882, + "learning_rate": 3.281476201703577e-05, + "loss": 0.8734, + "step": 6167 + }, + { + "epoch": 0.30034329121320574, + "grad_norm": 1.6661521196365356, + "learning_rate": 3.281234006170888e-05, + "loss": 0.8066, + "step": 6168 + }, + { + "epoch": 0.30039198500231296, + "grad_norm": 2.197848320007324, + "learning_rate": 3.2809917787673846e-05, + "loss": 0.8857, + "step": 6169 + }, + { + "epoch": 0.30044067879142017, + "grad_norm": 1.4509981870651245, + "learning_rate": 3.280749519499095e-05, + "loss": 0.9724, + "step": 6170 + }, + { + "epoch": 0.3004893725805274, + "grad_norm": 1.35728120803833, + "learning_rate": 3.280507228372044e-05, + "loss": 0.9918, + "step": 6171 + }, + { + "epoch": 0.30053806636963454, + "grad_norm": 1.323588490486145, + "learning_rate": 3.280264905392259e-05, + "loss": 0.8698, + "step": 6172 + }, + { + "epoch": 0.30058676015874175, + "grad_norm": 1.3174086809158325, + "learning_rate": 3.280022550565768e-05, + "loss": 0.8797, + "step": 6173 + }, + { + "epoch": 0.30063545394784896, + "grad_norm": 1.5688554048538208, + "learning_rate": 3.2797801638985984e-05, + "loss": 0.8892, + "step": 6174 + }, + { + "epoch": 0.3006841477369562, + "grad_norm": 1.6088021993637085, + "learning_rate": 3.2795377453967814e-05, + "loss": 0.8293, + "step": 6175 + }, + { + "epoch": 0.30073284152606333, + "grad_norm": 1.201480746269226, + "learning_rate": 3.279295295066346e-05, + "loss": 0.9289, + "step": 6176 + }, + { + "epoch": 0.30078153531517055, + "grad_norm": 1.2541568279266357, + "learning_rate": 3.2790528129133236e-05, + "loss": 0.8752, + "step": 6177 + }, + { + "epoch": 0.30083022910427776, + "grad_norm": 1.8514822721481323, + "learning_rate": 3.278810298943746e-05, + "loss": 0.8337, + "step": 6178 + }, + { + "epoch": 0.30087892289338497, + "grad_norm": 1.49708890914917, + "learning_rate": 3.278567753163646e-05, + "loss": 0.8103, + "step": 6179 + }, + { + "epoch": 0.3009276166824921, + "grad_norm": 0.09074685722589493, + "learning_rate": 3.278325175579056e-05, + "loss": 0.6889, + "step": 6180 + }, + { + "epoch": 0.30097631047159934, + "grad_norm": 0.0907093957066536, + "learning_rate": 3.278082566196011e-05, + "loss": 0.6496, + "step": 6181 + }, + { + "epoch": 0.30102500426070655, + "grad_norm": 2.145643711090088, + "learning_rate": 3.277839925020545e-05, + "loss": 0.8717, + "step": 6182 + }, + { + "epoch": 0.30107369804981376, + "grad_norm": 2.526291608810425, + "learning_rate": 3.277597252058695e-05, + "loss": 0.8463, + "step": 6183 + }, + { + "epoch": 0.3011223918389209, + "grad_norm": 3.5470995903015137, + "learning_rate": 3.277354547316496e-05, + "loss": 0.909, + "step": 6184 + }, + { + "epoch": 0.30117108562802813, + "grad_norm": 1.8647563457489014, + "learning_rate": 3.277111810799987e-05, + "loss": 0.8271, + "step": 6185 + }, + { + "epoch": 0.30121977941713535, + "grad_norm": 1.8085157871246338, + "learning_rate": 3.276869042515205e-05, + "loss": 0.8362, + "step": 6186 + }, + { + "epoch": 0.30126847320624256, + "grad_norm": 1.5826096534729004, + "learning_rate": 3.276626242468189e-05, + "loss": 0.9146, + "step": 6187 + }, + { + "epoch": 0.3013171669953497, + "grad_norm": 1.6206227540969849, + "learning_rate": 3.2763834106649796e-05, + "loss": 0.8974, + "step": 6188 + }, + { + "epoch": 0.30136586078445693, + "grad_norm": 0.08773033320903778, + "learning_rate": 3.2761405471116156e-05, + "loss": 0.5712, + "step": 6189 + }, + { + "epoch": 0.30141455457356414, + "grad_norm": 1.5892045497894287, + "learning_rate": 3.2758976518141394e-05, + "loss": 0.9136, + "step": 6190 + }, + { + "epoch": 0.30146324836267135, + "grad_norm": 1.4346643686294556, + "learning_rate": 3.2756547247785935e-05, + "loss": 0.9277, + "step": 6191 + }, + { + "epoch": 0.30151194215177857, + "grad_norm": 3.34033203125, + "learning_rate": 3.275411766011018e-05, + "loss": 0.8576, + "step": 6192 + }, + { + "epoch": 0.3015606359408857, + "grad_norm": 1.8720821142196655, + "learning_rate": 3.27516877551746e-05, + "loss": 0.7767, + "step": 6193 + }, + { + "epoch": 0.30160932972999294, + "grad_norm": 1.6981111764907837, + "learning_rate": 3.274925753303963e-05, + "loss": 0.9314, + "step": 6194 + }, + { + "epoch": 0.30165802351910015, + "grad_norm": 1.8006120920181274, + "learning_rate": 3.2746826993765706e-05, + "loss": 0.9829, + "step": 6195 + }, + { + "epoch": 0.30170671730820736, + "grad_norm": 0.08801954984664917, + "learning_rate": 3.274439613741331e-05, + "loss": 0.5636, + "step": 6196 + }, + { + "epoch": 0.3017554110973145, + "grad_norm": 1.3651463985443115, + "learning_rate": 3.2741964964042886e-05, + "loss": 0.7711, + "step": 6197 + }, + { + "epoch": 0.30180410488642173, + "grad_norm": 1.5678844451904297, + "learning_rate": 3.273953347371492e-05, + "loss": 0.9111, + "step": 6198 + }, + { + "epoch": 0.30185279867552894, + "grad_norm": 1.662546157836914, + "learning_rate": 3.2737101666489905e-05, + "loss": 0.9661, + "step": 6199 + }, + { + "epoch": 0.30190149246463616, + "grad_norm": 1.6755205392837524, + "learning_rate": 3.2734669542428324e-05, + "loss": 0.8725, + "step": 6200 + }, + { + "epoch": 0.3019501862537433, + "grad_norm": 1.4134531021118164, + "learning_rate": 3.273223710159068e-05, + "loss": 0.9785, + "step": 6201 + }, + { + "epoch": 0.3019988800428505, + "grad_norm": 1.7612959146499634, + "learning_rate": 3.272980434403747e-05, + "loss": 0.8618, + "step": 6202 + }, + { + "epoch": 0.30204757383195774, + "grad_norm": 0.08598532527685165, + "learning_rate": 3.272737126982922e-05, + "loss": 0.5653, + "step": 6203 + }, + { + "epoch": 0.30209626762106495, + "grad_norm": 1.304276943206787, + "learning_rate": 3.272493787902645e-05, + "loss": 0.9353, + "step": 6204 + }, + { + "epoch": 0.3021449614101721, + "grad_norm": 1.7031681537628174, + "learning_rate": 3.2722504171689684e-05, + "loss": 0.9337, + "step": 6205 + }, + { + "epoch": 0.3021936551992793, + "grad_norm": 1.7460625171661377, + "learning_rate": 3.2720070147879474e-05, + "loss": 0.9116, + "step": 6206 + }, + { + "epoch": 0.30224234898838653, + "grad_norm": 0.08783352375030518, + "learning_rate": 3.271763580765635e-05, + "loss": 0.6009, + "step": 6207 + }, + { + "epoch": 0.30229104277749375, + "grad_norm": 2.2749478816986084, + "learning_rate": 3.271520115108089e-05, + "loss": 0.9038, + "step": 6208 + }, + { + "epoch": 0.3023397365666009, + "grad_norm": 1.593660831451416, + "learning_rate": 3.2712766178213635e-05, + "loss": 0.851, + "step": 6209 + }, + { + "epoch": 0.3023884303557081, + "grad_norm": 1.4308545589447021, + "learning_rate": 3.2710330889115156e-05, + "loss": 0.9158, + "step": 6210 + }, + { + "epoch": 0.30243712414481533, + "grad_norm": 1.696258544921875, + "learning_rate": 3.2707895283846045e-05, + "loss": 0.8103, + "step": 6211 + }, + { + "epoch": 0.30248581793392254, + "grad_norm": 1.4707002639770508, + "learning_rate": 3.270545936246688e-05, + "loss": 0.8228, + "step": 6212 + }, + { + "epoch": 0.30253451172302975, + "grad_norm": 1.4051514863967896, + "learning_rate": 3.270302312503825e-05, + "loss": 0.831, + "step": 6213 + }, + { + "epoch": 0.3025832055121369, + "grad_norm": 1.508169412612915, + "learning_rate": 3.2700586571620765e-05, + "loss": 0.8383, + "step": 6214 + }, + { + "epoch": 0.3026318993012441, + "grad_norm": 2.7917661666870117, + "learning_rate": 3.2698149702275035e-05, + "loss": 0.8455, + "step": 6215 + }, + { + "epoch": 0.30268059309035134, + "grad_norm": 0.08883000165224075, + "learning_rate": 3.2695712517061666e-05, + "loss": 0.6634, + "step": 6216 + }, + { + "epoch": 0.30272928687945855, + "grad_norm": 1.8660454750061035, + "learning_rate": 3.269327501604129e-05, + "loss": 0.7864, + "step": 6217 + }, + { + "epoch": 0.3027779806685657, + "grad_norm": 1.5724250078201294, + "learning_rate": 3.269083719927455e-05, + "loss": 0.9391, + "step": 6218 + }, + { + "epoch": 0.3028266744576729, + "grad_norm": 1.4825714826583862, + "learning_rate": 3.2688399066822064e-05, + "loss": 0.9241, + "step": 6219 + }, + { + "epoch": 0.30287536824678013, + "grad_norm": 0.09400640428066254, + "learning_rate": 3.2685960618744506e-05, + "loss": 0.6513, + "step": 6220 + }, + { + "epoch": 0.30292406203588734, + "grad_norm": 1.5484191179275513, + "learning_rate": 3.268352185510252e-05, + "loss": 0.8772, + "step": 6221 + }, + { + "epoch": 0.3029727558249945, + "grad_norm": 2.1017463207244873, + "learning_rate": 3.2681082775956765e-05, + "loss": 0.9417, + "step": 6222 + }, + { + "epoch": 0.3030214496141017, + "grad_norm": 2.326986074447632, + "learning_rate": 3.267864338136792e-05, + "loss": 0.8421, + "step": 6223 + }, + { + "epoch": 0.3030701434032089, + "grad_norm": 1.5804048776626587, + "learning_rate": 3.267620367139667e-05, + "loss": 0.8515, + "step": 6224 + }, + { + "epoch": 0.30311883719231614, + "grad_norm": 1.4433563947677612, + "learning_rate": 3.26737636461037e-05, + "loss": 0.9163, + "step": 6225 + }, + { + "epoch": 0.3031675309814233, + "grad_norm": 1.4903215169906616, + "learning_rate": 3.26713233055497e-05, + "loss": 0.8652, + "step": 6226 + }, + { + "epoch": 0.3032162247705305, + "grad_norm": 3.0742805004119873, + "learning_rate": 3.2668882649795375e-05, + "loss": 0.7794, + "step": 6227 + }, + { + "epoch": 0.3032649185596377, + "grad_norm": 2.8261990547180176, + "learning_rate": 3.266644167890144e-05, + "loss": 0.769, + "step": 6228 + }, + { + "epoch": 0.30331361234874493, + "grad_norm": 2.1504154205322266, + "learning_rate": 3.2664000392928614e-05, + "loss": 0.9218, + "step": 6229 + }, + { + "epoch": 0.30336230613785214, + "grad_norm": 2.536525249481201, + "learning_rate": 3.2661558791937626e-05, + "loss": 0.7157, + "step": 6230 + }, + { + "epoch": 0.3034109999269593, + "grad_norm": 3.1893725395202637, + "learning_rate": 3.2659116875989206e-05, + "loss": 0.8904, + "step": 6231 + }, + { + "epoch": 0.3034596937160665, + "grad_norm": 1.9375782012939453, + "learning_rate": 3.26566746451441e-05, + "loss": 0.8712, + "step": 6232 + }, + { + "epoch": 0.3035083875051737, + "grad_norm": 3.6378703117370605, + "learning_rate": 3.265423209946306e-05, + "loss": 0.8245, + "step": 6233 + }, + { + "epoch": 0.30355708129428094, + "grad_norm": 2.745964288711548, + "learning_rate": 3.265178923900684e-05, + "loss": 0.8317, + "step": 6234 + }, + { + "epoch": 0.3036057750833881, + "grad_norm": 2.025148868560791, + "learning_rate": 3.264934606383621e-05, + "loss": 0.8275, + "step": 6235 + }, + { + "epoch": 0.3036544688724953, + "grad_norm": 1.7973195314407349, + "learning_rate": 3.264690257401194e-05, + "loss": 0.8563, + "step": 6236 + }, + { + "epoch": 0.3037031626616025, + "grad_norm": 1.8740402460098267, + "learning_rate": 3.2644458769594824e-05, + "loss": 0.8907, + "step": 6237 + }, + { + "epoch": 0.30375185645070973, + "grad_norm": 1.9187898635864258, + "learning_rate": 3.2642014650645636e-05, + "loss": 0.8118, + "step": 6238 + }, + { + "epoch": 0.3038005502398169, + "grad_norm": 1.4651539325714111, + "learning_rate": 3.263957021722519e-05, + "loss": 0.8491, + "step": 6239 + }, + { + "epoch": 0.3038492440289241, + "grad_norm": 1.4772090911865234, + "learning_rate": 3.2637125469394266e-05, + "loss": 0.8153, + "step": 6240 + }, + { + "epoch": 0.3038979378180313, + "grad_norm": 1.3239927291870117, + "learning_rate": 3.2634680407213705e-05, + "loss": 0.859, + "step": 6241 + }, + { + "epoch": 0.30394663160713853, + "grad_norm": 1.5318745374679565, + "learning_rate": 3.263223503074432e-05, + "loss": 0.9242, + "step": 6242 + }, + { + "epoch": 0.3039953253962457, + "grad_norm": 1.8255255222320557, + "learning_rate": 3.262978934004693e-05, + "loss": 0.9644, + "step": 6243 + }, + { + "epoch": 0.3040440191853529, + "grad_norm": 2.2375826835632324, + "learning_rate": 3.262734333518238e-05, + "loss": 1.017, + "step": 6244 + }, + { + "epoch": 0.3040927129744601, + "grad_norm": 2.1226632595062256, + "learning_rate": 3.262489701621151e-05, + "loss": 0.8882, + "step": 6245 + }, + { + "epoch": 0.3041414067635673, + "grad_norm": 1.694399356842041, + "learning_rate": 3.262245038319518e-05, + "loss": 0.9102, + "step": 6246 + }, + { + "epoch": 0.3041901005526745, + "grad_norm": 1.7249053716659546, + "learning_rate": 3.2620003436194244e-05, + "loss": 0.9475, + "step": 6247 + }, + { + "epoch": 0.3042387943417817, + "grad_norm": 2.3049838542938232, + "learning_rate": 3.261755617526958e-05, + "loss": 0.7223, + "step": 6248 + }, + { + "epoch": 0.3042874881308889, + "grad_norm": 1.9421275854110718, + "learning_rate": 3.261510860048205e-05, + "loss": 0.8702, + "step": 6249 + }, + { + "epoch": 0.3043361819199961, + "grad_norm": 1.4558436870574951, + "learning_rate": 3.261266071189254e-05, + "loss": 0.8429, + "step": 6250 + }, + { + "epoch": 0.30438487570910333, + "grad_norm": 4.16795539855957, + "learning_rate": 3.2610212509561944e-05, + "loss": 0.9186, + "step": 6251 + }, + { + "epoch": 0.3044335694982105, + "grad_norm": 0.08920388668775558, + "learning_rate": 3.260776399355117e-05, + "loss": 0.6857, + "step": 6252 + }, + { + "epoch": 0.3044822632873177, + "grad_norm": 1.4918174743652344, + "learning_rate": 3.260531516392111e-05, + "loss": 0.8493, + "step": 6253 + }, + { + "epoch": 0.3045309570764249, + "grad_norm": 1.646787405014038, + "learning_rate": 3.260286602073269e-05, + "loss": 0.8236, + "step": 6254 + }, + { + "epoch": 0.3045796508655321, + "grad_norm": 1.8879985809326172, + "learning_rate": 3.260041656404682e-05, + "loss": 0.9244, + "step": 6255 + }, + { + "epoch": 0.3046283446546393, + "grad_norm": 0.09547162055969238, + "learning_rate": 3.2597966793924445e-05, + "loss": 0.661, + "step": 6256 + }, + { + "epoch": 0.3046770384437465, + "grad_norm": 1.814868450164795, + "learning_rate": 3.25955167104265e-05, + "loss": 0.8627, + "step": 6257 + }, + { + "epoch": 0.3047257322328537, + "grad_norm": 2.5167911052703857, + "learning_rate": 3.259306631361393e-05, + "loss": 0.8867, + "step": 6258 + }, + { + "epoch": 0.3047744260219609, + "grad_norm": 1.7767819166183472, + "learning_rate": 3.259061560354768e-05, + "loss": 0.9009, + "step": 6259 + }, + { + "epoch": 0.3048231198110681, + "grad_norm": 2.85331392288208, + "learning_rate": 3.258816458028872e-05, + "loss": 0.8212, + "step": 6260 + }, + { + "epoch": 0.3048718136001753, + "grad_norm": 2.362710475921631, + "learning_rate": 3.2585713243898025e-05, + "loss": 0.9265, + "step": 6261 + }, + { + "epoch": 0.3049205073892825, + "grad_norm": 3.246980667114258, + "learning_rate": 3.258326159443656e-05, + "loss": 0.9705, + "step": 6262 + }, + { + "epoch": 0.3049692011783897, + "grad_norm": 2.9391441345214844, + "learning_rate": 3.258080963196532e-05, + "loss": 0.91, + "step": 6263 + }, + { + "epoch": 0.30501789496749687, + "grad_norm": 2.184272527694702, + "learning_rate": 3.2578357356545294e-05, + "loss": 0.8842, + "step": 6264 + }, + { + "epoch": 0.3050665887566041, + "grad_norm": 1.8720924854278564, + "learning_rate": 3.2575904768237475e-05, + "loss": 0.8575, + "step": 6265 + }, + { + "epoch": 0.3051152825457113, + "grad_norm": 1.5796966552734375, + "learning_rate": 3.2573451867102886e-05, + "loss": 0.9112, + "step": 6266 + }, + { + "epoch": 0.3051639763348185, + "grad_norm": 1.5543047189712524, + "learning_rate": 3.257099865320253e-05, + "loss": 0.8784, + "step": 6267 + }, + { + "epoch": 0.30521267012392567, + "grad_norm": 1.9384888410568237, + "learning_rate": 3.256854512659744e-05, + "loss": 0.8886, + "step": 6268 + }, + { + "epoch": 0.3052613639130329, + "grad_norm": 1.926950454711914, + "learning_rate": 3.256609128734864e-05, + "loss": 0.9068, + "step": 6269 + }, + { + "epoch": 0.3053100577021401, + "grad_norm": 0.08933144062757492, + "learning_rate": 3.2563637135517176e-05, + "loss": 0.6195, + "step": 6270 + }, + { + "epoch": 0.3053587514912473, + "grad_norm": 1.7383242845535278, + "learning_rate": 3.256118267116409e-05, + "loss": 0.9333, + "step": 6271 + }, + { + "epoch": 0.3054074452803545, + "grad_norm": 1.9445805549621582, + "learning_rate": 3.255872789435044e-05, + "loss": 0.8918, + "step": 6272 + }, + { + "epoch": 0.3054561390694617, + "grad_norm": 1.5198590755462646, + "learning_rate": 3.2556272805137284e-05, + "loss": 0.7726, + "step": 6273 + }, + { + "epoch": 0.3055048328585689, + "grad_norm": 1.8732242584228516, + "learning_rate": 3.255381740358571e-05, + "loss": 0.8703, + "step": 6274 + }, + { + "epoch": 0.3055535266476761, + "grad_norm": 6.0243377685546875, + "learning_rate": 3.255136168975677e-05, + "loss": 0.7711, + "step": 6275 + }, + { + "epoch": 0.3056022204367833, + "grad_norm": 2.537463426589966, + "learning_rate": 3.254890566371158e-05, + "loss": 0.8771, + "step": 6276 + }, + { + "epoch": 0.30565091422589047, + "grad_norm": 3.4741692543029785, + "learning_rate": 3.25464493255112e-05, + "loss": 0.8964, + "step": 6277 + }, + { + "epoch": 0.3056996080149977, + "grad_norm": 2.5169198513031006, + "learning_rate": 3.254399267521676e-05, + "loss": 0.7627, + "step": 6278 + }, + { + "epoch": 0.3057483018041049, + "grad_norm": 1.6912751197814941, + "learning_rate": 3.254153571288936e-05, + "loss": 0.9349, + "step": 6279 + }, + { + "epoch": 0.3057969955932121, + "grad_norm": 0.09061574935913086, + "learning_rate": 3.253907843859011e-05, + "loss": 0.6548, + "step": 6280 + }, + { + "epoch": 0.30584568938231926, + "grad_norm": 1.6405385732650757, + "learning_rate": 3.253662085238014e-05, + "loss": 0.774, + "step": 6281 + }, + { + "epoch": 0.3058943831714265, + "grad_norm": 1.7092703580856323, + "learning_rate": 3.253416295432058e-05, + "loss": 0.8181, + "step": 6282 + }, + { + "epoch": 0.3059430769605337, + "grad_norm": 0.09397532790899277, + "learning_rate": 3.253170474447258e-05, + "loss": 0.5979, + "step": 6283 + }, + { + "epoch": 0.3059917707496409, + "grad_norm": 0.08664640039205551, + "learning_rate": 3.252924622289728e-05, + "loss": 0.6472, + "step": 6284 + }, + { + "epoch": 0.30604046453874806, + "grad_norm": 1.4957079887390137, + "learning_rate": 3.252678738965584e-05, + "loss": 0.8552, + "step": 6285 + }, + { + "epoch": 0.30608915832785527, + "grad_norm": 2.1188619136810303, + "learning_rate": 3.252432824480942e-05, + "loss": 0.7654, + "step": 6286 + }, + { + "epoch": 0.3061378521169625, + "grad_norm": 4.591380596160889, + "learning_rate": 3.252186878841919e-05, + "loss": 0.8992, + "step": 6287 + }, + { + "epoch": 0.3061865459060697, + "grad_norm": 1.8150136470794678, + "learning_rate": 3.251940902054634e-05, + "loss": 0.8394, + "step": 6288 + }, + { + "epoch": 0.30623523969517685, + "grad_norm": 3.399573802947998, + "learning_rate": 3.2516948941252046e-05, + "loss": 0.8258, + "step": 6289 + }, + { + "epoch": 0.30628393348428407, + "grad_norm": 1.615576148033142, + "learning_rate": 3.25144885505975e-05, + "loss": 0.9093, + "step": 6290 + }, + { + "epoch": 0.3063326272733913, + "grad_norm": 2.3337364196777344, + "learning_rate": 3.251202784864392e-05, + "loss": 0.9319, + "step": 6291 + }, + { + "epoch": 0.3063813210624985, + "grad_norm": 2.38492488861084, + "learning_rate": 3.2509566835452494e-05, + "loss": 0.865, + "step": 6292 + }, + { + "epoch": 0.3064300148516057, + "grad_norm": 0.08375289291143417, + "learning_rate": 3.250710551108446e-05, + "loss": 0.5446, + "step": 6293 + }, + { + "epoch": 0.30647870864071286, + "grad_norm": 2.305651903152466, + "learning_rate": 3.2504643875601036e-05, + "loss": 0.9031, + "step": 6294 + }, + { + "epoch": 0.3065274024298201, + "grad_norm": 0.0909961685538292, + "learning_rate": 3.2502181929063457e-05, + "loss": 0.7091, + "step": 6295 + }, + { + "epoch": 0.3065760962189273, + "grad_norm": 2.2318050861358643, + "learning_rate": 3.249971967153296e-05, + "loss": 0.8168, + "step": 6296 + }, + { + "epoch": 0.3066247900080345, + "grad_norm": 1.521903157234192, + "learning_rate": 3.249725710307079e-05, + "loss": 0.8999, + "step": 6297 + }, + { + "epoch": 0.30667348379714165, + "grad_norm": 1.7773241996765137, + "learning_rate": 3.2494794223738227e-05, + "loss": 0.8378, + "step": 6298 + }, + { + "epoch": 0.30672217758624887, + "grad_norm": 1.5699068307876587, + "learning_rate": 3.2492331033596503e-05, + "loss": 0.8599, + "step": 6299 + }, + { + "epoch": 0.3067708713753561, + "grad_norm": 1.4042317867279053, + "learning_rate": 3.2489867532706907e-05, + "loss": 0.8965, + "step": 6300 + }, + { + "epoch": 0.3068195651644633, + "grad_norm": 1.5050262212753296, + "learning_rate": 3.248740372113073e-05, + "loss": 0.8358, + "step": 6301 + }, + { + "epoch": 0.30686825895357045, + "grad_norm": 1.4673340320587158, + "learning_rate": 3.248493959892923e-05, + "loss": 0.8563, + "step": 6302 + }, + { + "epoch": 0.30691695274267766, + "grad_norm": 1.4616020917892456, + "learning_rate": 3.2482475166163725e-05, + "loss": 0.917, + "step": 6303 + }, + { + "epoch": 0.3069656465317849, + "grad_norm": 1.8331834077835083, + "learning_rate": 3.248001042289552e-05, + "loss": 0.9046, + "step": 6304 + }, + { + "epoch": 0.3070143403208921, + "grad_norm": 2.1155669689178467, + "learning_rate": 3.2477545369185915e-05, + "loss": 0.9165, + "step": 6305 + }, + { + "epoch": 0.30706303410999924, + "grad_norm": 2.1063003540039062, + "learning_rate": 3.247508000509623e-05, + "loss": 0.8878, + "step": 6306 + }, + { + "epoch": 0.30711172789910646, + "grad_norm": 1.988053560256958, + "learning_rate": 3.2472614330687785e-05, + "loss": 0.8403, + "step": 6307 + }, + { + "epoch": 0.30716042168821367, + "grad_norm": 2.384852409362793, + "learning_rate": 3.247014834602193e-05, + "loss": 0.9948, + "step": 6308 + }, + { + "epoch": 0.3072091154773209, + "grad_norm": 1.9788442850112915, + "learning_rate": 3.2467682051159995e-05, + "loss": 0.9634, + "step": 6309 + }, + { + "epoch": 0.3072578092664281, + "grad_norm": 1.652535319328308, + "learning_rate": 3.246521544616333e-05, + "loss": 0.8284, + "step": 6310 + }, + { + "epoch": 0.30730650305553525, + "grad_norm": 1.832073450088501, + "learning_rate": 3.24627485310933e-05, + "loss": 0.8877, + "step": 6311 + }, + { + "epoch": 0.30735519684464246, + "grad_norm": 5.22359561920166, + "learning_rate": 3.246028130601126e-05, + "loss": 0.9323, + "step": 6312 + }, + { + "epoch": 0.3074038906337497, + "grad_norm": 1.2896400690078735, + "learning_rate": 3.245781377097858e-05, + "loss": 0.8607, + "step": 6313 + }, + { + "epoch": 0.3074525844228569, + "grad_norm": 1.3362705707550049, + "learning_rate": 3.245534592605665e-05, + "loss": 0.7576, + "step": 6314 + }, + { + "epoch": 0.30750127821196405, + "grad_norm": 2.242527961730957, + "learning_rate": 3.2452877771306866e-05, + "loss": 0.8585, + "step": 6315 + }, + { + "epoch": 0.30754997200107126, + "grad_norm": 1.5705922842025757, + "learning_rate": 3.2450409306790605e-05, + "loss": 0.7792, + "step": 6316 + }, + { + "epoch": 0.30759866579017847, + "grad_norm": 1.8605866432189941, + "learning_rate": 3.2447940532569276e-05, + "loss": 0.8914, + "step": 6317 + }, + { + "epoch": 0.3076473595792857, + "grad_norm": 1.7341268062591553, + "learning_rate": 3.244547144870429e-05, + "loss": 0.8182, + "step": 6318 + }, + { + "epoch": 0.30769605336839284, + "grad_norm": 0.08901891857385635, + "learning_rate": 3.244300205525707e-05, + "loss": 0.7112, + "step": 6319 + }, + { + "epoch": 0.30774474715750005, + "grad_norm": 1.9267306327819824, + "learning_rate": 3.2440532352289036e-05, + "loss": 0.9811, + "step": 6320 + }, + { + "epoch": 0.30779344094660727, + "grad_norm": 1.5749508142471313, + "learning_rate": 3.243806233986163e-05, + "loss": 0.8721, + "step": 6321 + }, + { + "epoch": 0.3078421347357145, + "grad_norm": 1.3496533632278442, + "learning_rate": 3.243559201803629e-05, + "loss": 0.8315, + "step": 6322 + }, + { + "epoch": 0.30789082852482164, + "grad_norm": 2.088078260421753, + "learning_rate": 3.2433121386874455e-05, + "loss": 0.9446, + "step": 6323 + }, + { + "epoch": 0.30793952231392885, + "grad_norm": 0.08756835758686066, + "learning_rate": 3.2430650446437596e-05, + "loss": 0.684, + "step": 6324 + }, + { + "epoch": 0.30798821610303606, + "grad_norm": 1.8142043352127075, + "learning_rate": 3.242817919678718e-05, + "loss": 0.919, + "step": 6325 + }, + { + "epoch": 0.3080369098921433, + "grad_norm": 2.100675344467163, + "learning_rate": 3.242570763798467e-05, + "loss": 0.8399, + "step": 6326 + }, + { + "epoch": 0.30808560368125043, + "grad_norm": 0.0827786847949028, + "learning_rate": 3.2423235770091545e-05, + "loss": 0.5976, + "step": 6327 + }, + { + "epoch": 0.30813429747035764, + "grad_norm": 1.874454379081726, + "learning_rate": 3.2420763593169304e-05, + "loss": 0.8628, + "step": 6328 + }, + { + "epoch": 0.30818299125946486, + "grad_norm": 1.7802321910858154, + "learning_rate": 3.2418291107279436e-05, + "loss": 0.8192, + "step": 6329 + }, + { + "epoch": 0.30823168504857207, + "grad_norm": 2.3448197841644287, + "learning_rate": 3.241581831248344e-05, + "loss": 0.9896, + "step": 6330 + }, + { + "epoch": 0.3082803788376793, + "grad_norm": 2.711371898651123, + "learning_rate": 3.241334520884284e-05, + "loss": 0.8123, + "step": 6331 + }, + { + "epoch": 0.30832907262678644, + "grad_norm": 1.7148300409317017, + "learning_rate": 3.2410871796419136e-05, + "loss": 0.8547, + "step": 6332 + }, + { + "epoch": 0.30837776641589365, + "grad_norm": 2.0837879180908203, + "learning_rate": 3.240839807527386e-05, + "loss": 0.8773, + "step": 6333 + }, + { + "epoch": 0.30842646020500086, + "grad_norm": 2.2284398078918457, + "learning_rate": 3.240592404546856e-05, + "loss": 0.8409, + "step": 6334 + }, + { + "epoch": 0.3084751539941081, + "grad_norm": 2.146404981613159, + "learning_rate": 3.240344970706477e-05, + "loss": 0.8934, + "step": 6335 + }, + { + "epoch": 0.30852384778321523, + "grad_norm": 3.212886095046997, + "learning_rate": 3.2400975060124034e-05, + "loss": 0.8421, + "step": 6336 + }, + { + "epoch": 0.30857254157232245, + "grad_norm": 1.5590522289276123, + "learning_rate": 3.2398500104707914e-05, + "loss": 0.989, + "step": 6337 + }, + { + "epoch": 0.30862123536142966, + "grad_norm": 2.0875916481018066, + "learning_rate": 3.239602484087797e-05, + "loss": 0.8505, + "step": 6338 + }, + { + "epoch": 0.30866992915053687, + "grad_norm": 1.5620441436767578, + "learning_rate": 3.2393549268695786e-05, + "loss": 0.8364, + "step": 6339 + }, + { + "epoch": 0.308718622939644, + "grad_norm": 1.4187489748001099, + "learning_rate": 3.239107338822293e-05, + "loss": 0.9316, + "step": 6340 + }, + { + "epoch": 0.30876731672875124, + "grad_norm": 1.5629583597183228, + "learning_rate": 3.238859719952099e-05, + "loss": 0.8214, + "step": 6341 + }, + { + "epoch": 0.30881601051785845, + "grad_norm": 1.6543502807617188, + "learning_rate": 3.2386120702651566e-05, + "loss": 0.9813, + "step": 6342 + }, + { + "epoch": 0.30886470430696567, + "grad_norm": 1.519180417060852, + "learning_rate": 3.2383643897676265e-05, + "loss": 0.8725, + "step": 6343 + }, + { + "epoch": 0.3089133980960728, + "grad_norm": 1.433348536491394, + "learning_rate": 3.238116678465668e-05, + "loss": 0.8147, + "step": 6344 + }, + { + "epoch": 0.30896209188518003, + "grad_norm": 1.7936652898788452, + "learning_rate": 3.237868936365446e-05, + "loss": 0.8656, + "step": 6345 + }, + { + "epoch": 0.30901078567428725, + "grad_norm": 1.6294307708740234, + "learning_rate": 3.237621163473121e-05, + "loss": 0.8354, + "step": 6346 + }, + { + "epoch": 0.30905947946339446, + "grad_norm": 1.4054100513458252, + "learning_rate": 3.237373359794855e-05, + "loss": 0.8021, + "step": 6347 + }, + { + "epoch": 0.3091081732525016, + "grad_norm": 1.7817317247390747, + "learning_rate": 3.237125525336816e-05, + "loss": 0.7947, + "step": 6348 + }, + { + "epoch": 0.30915686704160883, + "grad_norm": 1.7664858102798462, + "learning_rate": 3.2368776601051654e-05, + "loss": 0.8609, + "step": 6349 + }, + { + "epoch": 0.30920556083071604, + "grad_norm": 1.4223809242248535, + "learning_rate": 3.236629764106071e-05, + "loss": 0.9121, + "step": 6350 + }, + { + "epoch": 0.30925425461982325, + "grad_norm": 2.08174204826355, + "learning_rate": 3.236381837345698e-05, + "loss": 0.8896, + "step": 6351 + }, + { + "epoch": 0.30930294840893047, + "grad_norm": 1.8010668754577637, + "learning_rate": 3.236133879830214e-05, + "loss": 0.7908, + "step": 6352 + }, + { + "epoch": 0.3093516421980376, + "grad_norm": 1.756407618522644, + "learning_rate": 3.2358858915657874e-05, + "loss": 0.8214, + "step": 6353 + }, + { + "epoch": 0.30940033598714484, + "grad_norm": 1.573499321937561, + "learning_rate": 3.235637872558586e-05, + "loss": 0.8311, + "step": 6354 + }, + { + "epoch": 0.30944902977625205, + "grad_norm": 1.844972014427185, + "learning_rate": 3.2353898228147806e-05, + "loss": 0.9077, + "step": 6355 + }, + { + "epoch": 0.30949772356535926, + "grad_norm": 1.8255001306533813, + "learning_rate": 3.235141742340541e-05, + "loss": 0.9528, + "step": 6356 + }, + { + "epoch": 0.3095464173544664, + "grad_norm": 1.7379251718521118, + "learning_rate": 3.234893631142037e-05, + "loss": 0.8988, + "step": 6357 + }, + { + "epoch": 0.30959511114357363, + "grad_norm": 1.5639348030090332, + "learning_rate": 3.234645489225441e-05, + "loss": 0.8314, + "step": 6358 + }, + { + "epoch": 0.30964380493268084, + "grad_norm": 2.0767648220062256, + "learning_rate": 3.234397316596927e-05, + "loss": 0.7422, + "step": 6359 + }, + { + "epoch": 0.30969249872178806, + "grad_norm": 0.09086822718381882, + "learning_rate": 3.234149113262667e-05, + "loss": 0.6607, + "step": 6360 + }, + { + "epoch": 0.3097411925108952, + "grad_norm": 1.4137376546859741, + "learning_rate": 3.233900879228835e-05, + "loss": 0.9481, + "step": 6361 + }, + { + "epoch": 0.3097898863000024, + "grad_norm": 1.4685912132263184, + "learning_rate": 3.2336526145016054e-05, + "loss": 0.806, + "step": 6362 + }, + { + "epoch": 0.30983858008910964, + "grad_norm": 1.361401915550232, + "learning_rate": 3.233404319087156e-05, + "loss": 0.7069, + "step": 6363 + }, + { + "epoch": 0.30988727387821685, + "grad_norm": 1.7685003280639648, + "learning_rate": 3.233155992991661e-05, + "loss": 0.8426, + "step": 6364 + }, + { + "epoch": 0.309935967667324, + "grad_norm": 0.08561636507511139, + "learning_rate": 3.232907636221298e-05, + "loss": 0.6197, + "step": 6365 + }, + { + "epoch": 0.3099846614564312, + "grad_norm": 1.4859020709991455, + "learning_rate": 3.232659248782246e-05, + "loss": 0.7855, + "step": 6366 + }, + { + "epoch": 0.31003335524553843, + "grad_norm": 1.5036662817001343, + "learning_rate": 3.232410830680682e-05, + "loss": 0.8117, + "step": 6367 + }, + { + "epoch": 0.31008204903464565, + "grad_norm": 1.7357239723205566, + "learning_rate": 3.232162381922786e-05, + "loss": 0.8469, + "step": 6368 + }, + { + "epoch": 0.3101307428237528, + "grad_norm": 1.412755012512207, + "learning_rate": 3.231913902514739e-05, + "loss": 0.8517, + "step": 6369 + }, + { + "epoch": 0.31017943661286, + "grad_norm": 0.08912481367588043, + "learning_rate": 3.231665392462721e-05, + "loss": 0.6767, + "step": 6370 + }, + { + "epoch": 0.31022813040196723, + "grad_norm": 1.3099993467330933, + "learning_rate": 3.231416851772914e-05, + "loss": 0.9432, + "step": 6371 + }, + { + "epoch": 0.31027682419107444, + "grad_norm": 1.3779618740081787, + "learning_rate": 3.2311682804515005e-05, + "loss": 0.8747, + "step": 6372 + }, + { + "epoch": 0.31032551798018165, + "grad_norm": 1.1896666288375854, + "learning_rate": 3.2309196785046644e-05, + "loss": 0.9156, + "step": 6373 + }, + { + "epoch": 0.3103742117692888, + "grad_norm": 1.3667094707489014, + "learning_rate": 3.230671045938589e-05, + "loss": 0.8731, + "step": 6374 + }, + { + "epoch": 0.310422905558396, + "grad_norm": 1.293972373008728, + "learning_rate": 3.230422382759458e-05, + "loss": 0.7877, + "step": 6375 + }, + { + "epoch": 0.31047159934750324, + "grad_norm": 1.4329643249511719, + "learning_rate": 3.23017368897346e-05, + "loss": 0.9511, + "step": 6376 + }, + { + "epoch": 0.31052029313661045, + "grad_norm": 1.3732330799102783, + "learning_rate": 3.229924964586778e-05, + "loss": 0.8755, + "step": 6377 + }, + { + "epoch": 0.3105689869257176, + "grad_norm": 1.4339854717254639, + "learning_rate": 3.2296762096056004e-05, + "loss": 0.8059, + "step": 6378 + }, + { + "epoch": 0.3106176807148248, + "grad_norm": 1.741490125656128, + "learning_rate": 3.229427424036115e-05, + "loss": 0.8687, + "step": 6379 + }, + { + "epoch": 0.31066637450393203, + "grad_norm": 1.3057609796524048, + "learning_rate": 3.229178607884511e-05, + "loss": 0.8051, + "step": 6380 + }, + { + "epoch": 0.31071506829303924, + "grad_norm": 1.454505443572998, + "learning_rate": 3.2289297611569756e-05, + "loss": 0.9013, + "step": 6381 + }, + { + "epoch": 0.3107637620821464, + "grad_norm": 2.5145139694213867, + "learning_rate": 3.228680883859701e-05, + "loss": 0.9345, + "step": 6382 + }, + { + "epoch": 0.3108124558712536, + "grad_norm": 1.284759521484375, + "learning_rate": 3.228431975998878e-05, + "loss": 0.8419, + "step": 6383 + }, + { + "epoch": 0.3108611496603608, + "grad_norm": 8.422813415527344, + "learning_rate": 3.228183037580697e-05, + "loss": 0.9397, + "step": 6384 + }, + { + "epoch": 0.31090984344946804, + "grad_norm": 1.2630057334899902, + "learning_rate": 3.227934068611352e-05, + "loss": 0.8689, + "step": 6385 + }, + { + "epoch": 0.3109585372385752, + "grad_norm": 1.3088160753250122, + "learning_rate": 3.227685069097034e-05, + "loss": 0.8295, + "step": 6386 + }, + { + "epoch": 0.3110072310276824, + "grad_norm": 0.08951398730278015, + "learning_rate": 3.2274360390439384e-05, + "loss": 0.7325, + "step": 6387 + }, + { + "epoch": 0.3110559248167896, + "grad_norm": 1.2878366708755493, + "learning_rate": 3.227186978458259e-05, + "loss": 0.8083, + "step": 6388 + }, + { + "epoch": 0.31110461860589683, + "grad_norm": 1.4217666387557983, + "learning_rate": 3.226937887346192e-05, + "loss": 0.7344, + "step": 6389 + }, + { + "epoch": 0.31115331239500404, + "grad_norm": 3.195338249206543, + "learning_rate": 3.2266887657139327e-05, + "loss": 0.9159, + "step": 6390 + }, + { + "epoch": 0.3112020061841112, + "grad_norm": 2.916114568710327, + "learning_rate": 3.226439613567679e-05, + "loss": 0.8901, + "step": 6391 + }, + { + "epoch": 0.3112506999732184, + "grad_norm": 1.4213366508483887, + "learning_rate": 3.226190430913628e-05, + "loss": 0.893, + "step": 6392 + }, + { + "epoch": 0.3112993937623256, + "grad_norm": 1.5205531120300293, + "learning_rate": 3.225941217757978e-05, + "loss": 0.8866, + "step": 6393 + }, + { + "epoch": 0.31134808755143284, + "grad_norm": 2.6493043899536133, + "learning_rate": 3.225691974106929e-05, + "loss": 0.8562, + "step": 6394 + }, + { + "epoch": 0.31139678134054, + "grad_norm": 1.6829872131347656, + "learning_rate": 3.22544269996668e-05, + "loss": 0.7627, + "step": 6395 + }, + { + "epoch": 0.3114454751296472, + "grad_norm": 2.0018792152404785, + "learning_rate": 3.225193395343432e-05, + "loss": 0.9569, + "step": 6396 + }, + { + "epoch": 0.3114941689187544, + "grad_norm": 1.5137474536895752, + "learning_rate": 3.224944060243387e-05, + "loss": 0.8733, + "step": 6397 + }, + { + "epoch": 0.31154286270786163, + "grad_norm": 1.6113617420196533, + "learning_rate": 3.224694694672746e-05, + "loss": 0.8596, + "step": 6398 + }, + { + "epoch": 0.3115915564969688, + "grad_norm": 1.598188042640686, + "learning_rate": 3.224445298637713e-05, + "loss": 0.9413, + "step": 6399 + }, + { + "epoch": 0.311640250286076, + "grad_norm": 1.7307538986206055, + "learning_rate": 3.224195872144493e-05, + "loss": 0.8373, + "step": 6400 + }, + { + "epoch": 0.3116889440751832, + "grad_norm": 1.6916998624801636, + "learning_rate": 3.2239464151992876e-05, + "loss": 0.8945, + "step": 6401 + }, + { + "epoch": 0.31173763786429043, + "grad_norm": 1.422325611114502, + "learning_rate": 3.2236969278083036e-05, + "loss": 0.9722, + "step": 6402 + }, + { + "epoch": 0.3117863316533976, + "grad_norm": 1.5705186128616333, + "learning_rate": 3.223447409977747e-05, + "loss": 0.9229, + "step": 6403 + }, + { + "epoch": 0.3118350254425048, + "grad_norm": 2.328580379486084, + "learning_rate": 3.223197861713825e-05, + "loss": 0.8187, + "step": 6404 + }, + { + "epoch": 0.311883719231612, + "grad_norm": 1.6390224695205688, + "learning_rate": 3.2229482830227445e-05, + "loss": 0.8458, + "step": 6405 + }, + { + "epoch": 0.3119324130207192, + "grad_norm": 2.120892286300659, + "learning_rate": 3.2226986739107136e-05, + "loss": 0.8576, + "step": 6406 + }, + { + "epoch": 0.3119811068098264, + "grad_norm": 1.9622786045074463, + "learning_rate": 3.222449034383942e-05, + "loss": 0.8826, + "step": 6407 + }, + { + "epoch": 0.3120298005989336, + "grad_norm": 1.3071339130401611, + "learning_rate": 3.2221993644486395e-05, + "loss": 0.9073, + "step": 6408 + }, + { + "epoch": 0.3120784943880408, + "grad_norm": 1.4769097566604614, + "learning_rate": 3.221949664111016e-05, + "loss": 0.8666, + "step": 6409 + }, + { + "epoch": 0.312127188177148, + "grad_norm": 2.1414742469787598, + "learning_rate": 3.2216999333772835e-05, + "loss": 0.8708, + "step": 6410 + }, + { + "epoch": 0.31217588196625523, + "grad_norm": 0.11390435695648193, + "learning_rate": 3.221450172253653e-05, + "loss": 0.6097, + "step": 6411 + }, + { + "epoch": 0.3122245757553624, + "grad_norm": 2.2250590324401855, + "learning_rate": 3.221200380746338e-05, + "loss": 0.872, + "step": 6412 + }, + { + "epoch": 0.3122732695444696, + "grad_norm": 1.5933973789215088, + "learning_rate": 3.220950558861553e-05, + "loss": 0.8587, + "step": 6413 + }, + { + "epoch": 0.3123219633335768, + "grad_norm": 1.6671555042266846, + "learning_rate": 3.220700706605512e-05, + "loss": 0.9024, + "step": 6414 + }, + { + "epoch": 0.312370657122684, + "grad_norm": 1.4697835445404053, + "learning_rate": 3.220450823984429e-05, + "loss": 0.8641, + "step": 6415 + }, + { + "epoch": 0.3124193509117912, + "grad_norm": 3.8730766773223877, + "learning_rate": 3.2202009110045205e-05, + "loss": 0.8597, + "step": 6416 + }, + { + "epoch": 0.3124680447008984, + "grad_norm": 1.2876591682434082, + "learning_rate": 3.219950967672003e-05, + "loss": 0.9115, + "step": 6417 + }, + { + "epoch": 0.3125167384900056, + "grad_norm": 6.303310871124268, + "learning_rate": 3.219700993993094e-05, + "loss": 0.8705, + "step": 6418 + }, + { + "epoch": 0.3125654322791128, + "grad_norm": 1.763029932975769, + "learning_rate": 3.219450989974012e-05, + "loss": 0.8832, + "step": 6419 + }, + { + "epoch": 0.31261412606822, + "grad_norm": 1.7531205415725708, + "learning_rate": 3.2192009556209745e-05, + "loss": 0.8872, + "step": 6420 + }, + { + "epoch": 0.3126628198573272, + "grad_norm": 1.8152034282684326, + "learning_rate": 3.2189508909402025e-05, + "loss": 0.8954, + "step": 6421 + }, + { + "epoch": 0.3127115136464344, + "grad_norm": 1.3791718482971191, + "learning_rate": 3.2187007959379157e-05, + "loss": 0.78, + "step": 6422 + }, + { + "epoch": 0.3127602074355416, + "grad_norm": 2.1651196479797363, + "learning_rate": 3.218450670620335e-05, + "loss": 0.9092, + "step": 6423 + }, + { + "epoch": 0.31280890122464877, + "grad_norm": 1.3029810190200806, + "learning_rate": 3.2182005149936844e-05, + "loss": 0.9384, + "step": 6424 + }, + { + "epoch": 0.312857595013756, + "grad_norm": 2.7808637619018555, + "learning_rate": 3.217950329064184e-05, + "loss": 0.8527, + "step": 6425 + }, + { + "epoch": 0.3129062888028632, + "grad_norm": 1.707185983657837, + "learning_rate": 3.217700112838058e-05, + "loss": 0.8653, + "step": 6426 + }, + { + "epoch": 0.3129549825919704, + "grad_norm": 1.7138798236846924, + "learning_rate": 3.217449866321531e-05, + "loss": 0.9093, + "step": 6427 + }, + { + "epoch": 0.31300367638107757, + "grad_norm": 1.3659038543701172, + "learning_rate": 3.2171995895208274e-05, + "loss": 0.9218, + "step": 6428 + }, + { + "epoch": 0.3130523701701848, + "grad_norm": 2.0881428718566895, + "learning_rate": 3.216949282442173e-05, + "loss": 0.7827, + "step": 6429 + }, + { + "epoch": 0.313101063959292, + "grad_norm": 1.3339691162109375, + "learning_rate": 3.2166989450917944e-05, + "loss": 0.8905, + "step": 6430 + }, + { + "epoch": 0.3131497577483992, + "grad_norm": 2.943380832672119, + "learning_rate": 3.216448577475918e-05, + "loss": 0.8942, + "step": 6431 + }, + { + "epoch": 0.3131984515375064, + "grad_norm": 1.6109930276870728, + "learning_rate": 3.216198179600773e-05, + "loss": 0.8377, + "step": 6432 + }, + { + "epoch": 0.3132471453266136, + "grad_norm": 0.0850551426410675, + "learning_rate": 3.2159477514725876e-05, + "loss": 0.6019, + "step": 6433 + }, + { + "epoch": 0.3132958391157208, + "grad_norm": 1.766411542892456, + "learning_rate": 3.2156972930975905e-05, + "loss": 0.8241, + "step": 6434 + }, + { + "epoch": 0.313344532904828, + "grad_norm": 1.7401669025421143, + "learning_rate": 3.215446804482012e-05, + "loss": 0.8679, + "step": 6435 + }, + { + "epoch": 0.3133932266939352, + "grad_norm": 1.3774818181991577, + "learning_rate": 3.215196285632084e-05, + "loss": 0.8848, + "step": 6436 + }, + { + "epoch": 0.31344192048304237, + "grad_norm": 1.3089730739593506, + "learning_rate": 3.214945736554038e-05, + "loss": 0.9296, + "step": 6437 + }, + { + "epoch": 0.3134906142721496, + "grad_norm": 4.114285469055176, + "learning_rate": 3.214695157254106e-05, + "loss": 0.838, + "step": 6438 + }, + { + "epoch": 0.3135393080612568, + "grad_norm": 1.8190351724624634, + "learning_rate": 3.21444454773852e-05, + "loss": 0.915, + "step": 6439 + }, + { + "epoch": 0.313588001850364, + "grad_norm": 1.959370732307434, + "learning_rate": 3.214193908013516e-05, + "loss": 0.8355, + "step": 6440 + }, + { + "epoch": 0.31363669563947116, + "grad_norm": 1.3306936025619507, + "learning_rate": 3.2139432380853274e-05, + "loss": 0.8778, + "step": 6441 + }, + { + "epoch": 0.3136853894285784, + "grad_norm": 1.6767159700393677, + "learning_rate": 3.213692537960191e-05, + "loss": 0.9107, + "step": 6442 + }, + { + "epoch": 0.3137340832176856, + "grad_norm": 1.4272596836090088, + "learning_rate": 3.213441807644341e-05, + "loss": 0.8333, + "step": 6443 + }, + { + "epoch": 0.3137827770067928, + "grad_norm": 1.4000778198242188, + "learning_rate": 3.2131910471440165e-05, + "loss": 0.8097, + "step": 6444 + }, + { + "epoch": 0.31383147079589996, + "grad_norm": 1.6116448640823364, + "learning_rate": 3.2129402564654534e-05, + "loss": 0.8593, + "step": 6445 + }, + { + "epoch": 0.31388016458500717, + "grad_norm": 1.4117108583450317, + "learning_rate": 3.212689435614891e-05, + "loss": 0.846, + "step": 6446 + }, + { + "epoch": 0.3139288583741144, + "grad_norm": 2.310046672821045, + "learning_rate": 3.2124385845985675e-05, + "loss": 0.9212, + "step": 6447 + }, + { + "epoch": 0.3139775521632216, + "grad_norm": 1.6718214750289917, + "learning_rate": 3.212187703422725e-05, + "loss": 0.9311, + "step": 6448 + }, + { + "epoch": 0.31402624595232875, + "grad_norm": 1.6763801574707031, + "learning_rate": 3.211936792093602e-05, + "loss": 0.8338, + "step": 6449 + }, + { + "epoch": 0.31407493974143597, + "grad_norm": 1.927468180656433, + "learning_rate": 3.211685850617441e-05, + "loss": 0.8359, + "step": 6450 + }, + { + "epoch": 0.3141236335305432, + "grad_norm": 3.196119785308838, + "learning_rate": 3.211434879000484e-05, + "loss": 0.827, + "step": 6451 + }, + { + "epoch": 0.3141723273196504, + "grad_norm": 1.5747452974319458, + "learning_rate": 3.2111838772489735e-05, + "loss": 0.8108, + "step": 6452 + }, + { + "epoch": 0.3142210211087576, + "grad_norm": 1.522264838218689, + "learning_rate": 3.2109328453691546e-05, + "loss": 0.8909, + "step": 6453 + }, + { + "epoch": 0.31426971489786476, + "grad_norm": 1.9258226156234741, + "learning_rate": 3.21068178336727e-05, + "loss": 0.8854, + "step": 6454 + }, + { + "epoch": 0.314318408686972, + "grad_norm": 1.3505266904830933, + "learning_rate": 3.210430691249566e-05, + "loss": 0.8402, + "step": 6455 + }, + { + "epoch": 0.3143671024760792, + "grad_norm": 1.306746244430542, + "learning_rate": 3.210179569022288e-05, + "loss": 0.9014, + "step": 6456 + }, + { + "epoch": 0.3144157962651864, + "grad_norm": 1.1361095905303955, + "learning_rate": 3.209928416691683e-05, + "loss": 0.8034, + "step": 6457 + }, + { + "epoch": 0.31446449005429356, + "grad_norm": 1.3641780614852905, + "learning_rate": 3.209677234263999e-05, + "loss": 0.8811, + "step": 6458 + }, + { + "epoch": 0.31451318384340077, + "grad_norm": 3.457120895385742, + "learning_rate": 3.2094260217454824e-05, + "loss": 0.8892, + "step": 6459 + }, + { + "epoch": 0.314561877632508, + "grad_norm": 1.4795522689819336, + "learning_rate": 3.209174779142384e-05, + "loss": 0.74, + "step": 6460 + }, + { + "epoch": 0.3146105714216152, + "grad_norm": 1.7408801317214966, + "learning_rate": 3.208923506460952e-05, + "loss": 0.8503, + "step": 6461 + }, + { + "epoch": 0.31465926521072235, + "grad_norm": 1.717056393623352, + "learning_rate": 3.208672203707438e-05, + "loss": 0.9038, + "step": 6462 + }, + { + "epoch": 0.31470795899982956, + "grad_norm": 1.3380780220031738, + "learning_rate": 3.208420870888092e-05, + "loss": 0.8544, + "step": 6463 + }, + { + "epoch": 0.3147566527889368, + "grad_norm": 2.113694429397583, + "learning_rate": 3.2081695080091676e-05, + "loss": 0.8161, + "step": 6464 + }, + { + "epoch": 0.314805346578044, + "grad_norm": 8.213414192199707, + "learning_rate": 3.207918115076916e-05, + "loss": 0.8164, + "step": 6465 + }, + { + "epoch": 0.31485404036715114, + "grad_norm": 1.817699909210205, + "learning_rate": 3.207666692097592e-05, + "loss": 0.9662, + "step": 6466 + }, + { + "epoch": 0.31490273415625836, + "grad_norm": 1.3293910026550293, + "learning_rate": 3.207415239077448e-05, + "loss": 0.8765, + "step": 6467 + }, + { + "epoch": 0.31495142794536557, + "grad_norm": 3.1554620265960693, + "learning_rate": 3.20716375602274e-05, + "loss": 0.854, + "step": 6468 + }, + { + "epoch": 0.3150001217344728, + "grad_norm": 1.4111140966415405, + "learning_rate": 3.2069122429397235e-05, + "loss": 0.8824, + "step": 6469 + }, + { + "epoch": 0.31504881552358, + "grad_norm": 1.138880968093872, + "learning_rate": 3.2066606998346546e-05, + "loss": 0.9425, + "step": 6470 + }, + { + "epoch": 0.31509750931268715, + "grad_norm": 1.3899890184402466, + "learning_rate": 3.2064091267137914e-05, + "loss": 0.9021, + "step": 6471 + }, + { + "epoch": 0.31514620310179436, + "grad_norm": 1.6542397737503052, + "learning_rate": 3.206157523583391e-05, + "loss": 0.8471, + "step": 6472 + }, + { + "epoch": 0.3151948968909016, + "grad_norm": 1.2346539497375488, + "learning_rate": 3.2059058904497124e-05, + "loss": 0.9344, + "step": 6473 + }, + { + "epoch": 0.3152435906800088, + "grad_norm": 1.2428706884384155, + "learning_rate": 3.2056542273190144e-05, + "loss": 0.8757, + "step": 6474 + }, + { + "epoch": 0.31529228446911595, + "grad_norm": 1.478273630142212, + "learning_rate": 3.205402534197558e-05, + "loss": 0.8208, + "step": 6475 + }, + { + "epoch": 0.31534097825822316, + "grad_norm": 2.056952476501465, + "learning_rate": 3.2051508110916035e-05, + "loss": 0.8814, + "step": 6476 + }, + { + "epoch": 0.31538967204733037, + "grad_norm": 1.2653084993362427, + "learning_rate": 3.2048990580074126e-05, + "loss": 0.9555, + "step": 6477 + }, + { + "epoch": 0.3154383658364376, + "grad_norm": 1.4509042501449585, + "learning_rate": 3.204647274951248e-05, + "loss": 0.837, + "step": 6478 + }, + { + "epoch": 0.31548705962554474, + "grad_norm": 1.518113613128662, + "learning_rate": 3.204395461929372e-05, + "loss": 0.7741, + "step": 6479 + }, + { + "epoch": 0.31553575341465195, + "grad_norm": 1.9744443893432617, + "learning_rate": 3.20414361894805e-05, + "loss": 0.8239, + "step": 6480 + }, + { + "epoch": 0.31558444720375917, + "grad_norm": 0.09210744500160217, + "learning_rate": 3.203891746013545e-05, + "loss": 0.5275, + "step": 6481 + }, + { + "epoch": 0.3156331409928664, + "grad_norm": 1.436382532119751, + "learning_rate": 3.203639843132124e-05, + "loss": 0.8437, + "step": 6482 + }, + { + "epoch": 0.31568183478197354, + "grad_norm": 1.3597370386123657, + "learning_rate": 3.203387910310051e-05, + "loss": 0.8764, + "step": 6483 + }, + { + "epoch": 0.31573052857108075, + "grad_norm": 1.5140976905822754, + "learning_rate": 3.203135947553594e-05, + "loss": 0.8456, + "step": 6484 + }, + { + "epoch": 0.31577922236018796, + "grad_norm": 2.261563777923584, + "learning_rate": 3.202883954869021e-05, + "loss": 0.8134, + "step": 6485 + }, + { + "epoch": 0.3158279161492952, + "grad_norm": 1.7896943092346191, + "learning_rate": 3.202631932262601e-05, + "loss": 0.915, + "step": 6486 + }, + { + "epoch": 0.31587660993840233, + "grad_norm": 1.9898428916931152, + "learning_rate": 3.2023798797406e-05, + "loss": 0.8759, + "step": 6487 + }, + { + "epoch": 0.31592530372750954, + "grad_norm": 1.2983936071395874, + "learning_rate": 3.202127797309291e-05, + "loss": 0.7943, + "step": 6488 + }, + { + "epoch": 0.31597399751661676, + "grad_norm": 1.8422276973724365, + "learning_rate": 3.201875684974944e-05, + "loss": 0.8289, + "step": 6489 + }, + { + "epoch": 0.31602269130572397, + "grad_norm": 3.0740013122558594, + "learning_rate": 3.2016235427438286e-05, + "loss": 0.8655, + "step": 6490 + }, + { + "epoch": 0.3160713850948312, + "grad_norm": 1.5986043214797974, + "learning_rate": 3.201371370622219e-05, + "loss": 0.8713, + "step": 6491 + }, + { + "epoch": 0.31612007888393834, + "grad_norm": 1.7302792072296143, + "learning_rate": 3.201119168616386e-05, + "loss": 0.8805, + "step": 6492 + }, + { + "epoch": 0.31616877267304555, + "grad_norm": 1.5752679109573364, + "learning_rate": 3.200866936732605e-05, + "loss": 0.7668, + "step": 6493 + }, + { + "epoch": 0.31621746646215276, + "grad_norm": 1.3503470420837402, + "learning_rate": 3.2006146749771485e-05, + "loss": 0.8831, + "step": 6494 + }, + { + "epoch": 0.31626616025126, + "grad_norm": 1.479201316833496, + "learning_rate": 3.200362383356293e-05, + "loss": 0.8243, + "step": 6495 + }, + { + "epoch": 0.31631485404036713, + "grad_norm": 1.3534525632858276, + "learning_rate": 3.2001100618763146e-05, + "loss": 0.8958, + "step": 6496 + }, + { + "epoch": 0.31636354782947435, + "grad_norm": 1.675706148147583, + "learning_rate": 3.199857710543488e-05, + "loss": 0.8289, + "step": 6497 + }, + { + "epoch": 0.31641224161858156, + "grad_norm": 2.4024479389190674, + "learning_rate": 3.199605329364092e-05, + "loss": 0.8264, + "step": 6498 + }, + { + "epoch": 0.31646093540768877, + "grad_norm": 1.6894069910049438, + "learning_rate": 3.199352918344404e-05, + "loss": 0.9151, + "step": 6499 + }, + { + "epoch": 0.3165096291967959, + "grad_norm": 0.09781631082296371, + "learning_rate": 3.199100477490702e-05, + "loss": 0.6713, + "step": 6500 + }, + { + "epoch": 0.31655832298590314, + "grad_norm": 3.671069860458374, + "learning_rate": 3.198848006809267e-05, + "loss": 0.9233, + "step": 6501 + }, + { + "epoch": 0.31660701677501035, + "grad_norm": 1.4582061767578125, + "learning_rate": 3.1985955063063783e-05, + "loss": 0.8083, + "step": 6502 + }, + { + "epoch": 0.31665571056411757, + "grad_norm": 1.9888463020324707, + "learning_rate": 3.198342975988317e-05, + "loss": 0.8868, + "step": 6503 + }, + { + "epoch": 0.3167044043532247, + "grad_norm": 1.8787086009979248, + "learning_rate": 3.198090415861365e-05, + "loss": 0.7962, + "step": 6504 + }, + { + "epoch": 0.31675309814233193, + "grad_norm": 1.338149070739746, + "learning_rate": 3.197837825931805e-05, + "loss": 0.8351, + "step": 6505 + }, + { + "epoch": 0.31680179193143915, + "grad_norm": 5.4360737800598145, + "learning_rate": 3.197585206205919e-05, + "loss": 0.895, + "step": 6506 + }, + { + "epoch": 0.31685048572054636, + "grad_norm": 2.320403575897217, + "learning_rate": 3.197332556689993e-05, + "loss": 0.8041, + "step": 6507 + }, + { + "epoch": 0.3168991795096535, + "grad_norm": 0.09172143042087555, + "learning_rate": 3.19707987739031e-05, + "loss": 0.5796, + "step": 6508 + }, + { + "epoch": 0.31694787329876073, + "grad_norm": 1.3863710165023804, + "learning_rate": 3.196827168313156e-05, + "loss": 0.9044, + "step": 6509 + }, + { + "epoch": 0.31699656708786794, + "grad_norm": 1.167282223701477, + "learning_rate": 3.196574429464817e-05, + "loss": 0.8546, + "step": 6510 + }, + { + "epoch": 0.31704526087697515, + "grad_norm": 1.7600349187850952, + "learning_rate": 3.196321660851579e-05, + "loss": 0.7859, + "step": 6511 + }, + { + "epoch": 0.31709395466608237, + "grad_norm": 1.5299729108810425, + "learning_rate": 3.196068862479732e-05, + "loss": 0.8265, + "step": 6512 + }, + { + "epoch": 0.3171426484551895, + "grad_norm": 1.9940305948257446, + "learning_rate": 3.195816034355563e-05, + "loss": 0.8192, + "step": 6513 + }, + { + "epoch": 0.31719134224429674, + "grad_norm": 1.6364901065826416, + "learning_rate": 3.19556317648536e-05, + "loss": 0.9472, + "step": 6514 + }, + { + "epoch": 0.31724003603340395, + "grad_norm": 0.08786557614803314, + "learning_rate": 3.195310288875415e-05, + "loss": 0.6364, + "step": 6515 + }, + { + "epoch": 0.31728872982251116, + "grad_norm": 1.2186275720596313, + "learning_rate": 3.195057371532017e-05, + "loss": 0.8128, + "step": 6516 + }, + { + "epoch": 0.3173374236116183, + "grad_norm": 1.607718825340271, + "learning_rate": 3.194804424461458e-05, + "loss": 0.9054, + "step": 6517 + }, + { + "epoch": 0.31738611740072553, + "grad_norm": 1.577868103981018, + "learning_rate": 3.1945514476700296e-05, + "loss": 0.8583, + "step": 6518 + }, + { + "epoch": 0.31743481118983274, + "grad_norm": 1.67570161819458, + "learning_rate": 3.194298441164026e-05, + "loss": 0.8787, + "step": 6519 + }, + { + "epoch": 0.31748350497893996, + "grad_norm": 1.6949692964553833, + "learning_rate": 3.194045404949739e-05, + "loss": 0.8012, + "step": 6520 + }, + { + "epoch": 0.3175321987680471, + "grad_norm": 1.7072304487228394, + "learning_rate": 3.1937923390334636e-05, + "loss": 0.8384, + "step": 6521 + }, + { + "epoch": 0.3175808925571543, + "grad_norm": 1.3399778604507446, + "learning_rate": 3.193539243421496e-05, + "loss": 0.8669, + "step": 6522 + }, + { + "epoch": 0.31762958634626154, + "grad_norm": 2.275290012359619, + "learning_rate": 3.193286118120129e-05, + "loss": 0.9136, + "step": 6523 + }, + { + "epoch": 0.31767828013536875, + "grad_norm": 2.0688889026641846, + "learning_rate": 3.193032963135663e-05, + "loss": 0.9414, + "step": 6524 + }, + { + "epoch": 0.3177269739244759, + "grad_norm": 1.8000891208648682, + "learning_rate": 3.192779778474393e-05, + "loss": 0.8781, + "step": 6525 + }, + { + "epoch": 0.3177756677135831, + "grad_norm": 2.390510320663452, + "learning_rate": 3.1925265641426164e-05, + "loss": 0.8634, + "step": 6526 + }, + { + "epoch": 0.31782436150269033, + "grad_norm": 1.4210008382797241, + "learning_rate": 3.1922733201466335e-05, + "loss": 0.8574, + "step": 6527 + }, + { + "epoch": 0.31787305529179755, + "grad_norm": 2.070957899093628, + "learning_rate": 3.192020046492742e-05, + "loss": 0.9516, + "step": 6528 + }, + { + "epoch": 0.3179217490809047, + "grad_norm": 2.6788511276245117, + "learning_rate": 3.191766743187244e-05, + "loss": 0.8133, + "step": 6529 + }, + { + "epoch": 0.3179704428700119, + "grad_norm": 3.2978711128234863, + "learning_rate": 3.1915134102364394e-05, + "loss": 0.8479, + "step": 6530 + }, + { + "epoch": 0.31801913665911913, + "grad_norm": 1.2469407320022583, + "learning_rate": 3.1912600476466296e-05, + "loss": 0.8694, + "step": 6531 + }, + { + "epoch": 0.31806783044822634, + "grad_norm": 1.9222898483276367, + "learning_rate": 3.191006655424119e-05, + "loss": 0.7447, + "step": 6532 + }, + { + "epoch": 0.31811652423733355, + "grad_norm": 1.7175607681274414, + "learning_rate": 3.1907532335752076e-05, + "loss": 0.8793, + "step": 6533 + }, + { + "epoch": 0.3181652180264407, + "grad_norm": 1.9145941734313965, + "learning_rate": 3.1904997821062015e-05, + "loss": 0.8393, + "step": 6534 + }, + { + "epoch": 0.3182139118155479, + "grad_norm": 1.3800008296966553, + "learning_rate": 3.190246301023405e-05, + "loss": 0.8997, + "step": 6535 + }, + { + "epoch": 0.31826260560465514, + "grad_norm": 1.2798552513122559, + "learning_rate": 3.189992790333123e-05, + "loss": 0.8503, + "step": 6536 + }, + { + "epoch": 0.31831129939376235, + "grad_norm": 1.3880431652069092, + "learning_rate": 3.1897392500416625e-05, + "loss": 0.9105, + "step": 6537 + }, + { + "epoch": 0.3183599931828695, + "grad_norm": 2.335691213607788, + "learning_rate": 3.1894856801553293e-05, + "loss": 0.8927, + "step": 6538 + }, + { + "epoch": 0.3184086869719767, + "grad_norm": 1.247641682624817, + "learning_rate": 3.189232080680431e-05, + "loss": 0.7574, + "step": 6539 + }, + { + "epoch": 0.31845738076108393, + "grad_norm": 0.08964787423610687, + "learning_rate": 3.188978451623276e-05, + "loss": 0.5833, + "step": 6540 + }, + { + "epoch": 0.31850607455019114, + "grad_norm": 1.9668775796890259, + "learning_rate": 3.188724792990174e-05, + "loss": 0.8093, + "step": 6541 + }, + { + "epoch": 0.3185547683392983, + "grad_norm": 1.6958091259002686, + "learning_rate": 3.188471104787435e-05, + "loss": 0.9053, + "step": 6542 + }, + { + "epoch": 0.3186034621284055, + "grad_norm": 1.9188885688781738, + "learning_rate": 3.188217387021368e-05, + "loss": 0.8293, + "step": 6543 + }, + { + "epoch": 0.3186521559175127, + "grad_norm": 1.6699310541152954, + "learning_rate": 3.187963639698286e-05, + "loss": 0.7964, + "step": 6544 + }, + { + "epoch": 0.31870084970661994, + "grad_norm": 1.1765202283859253, + "learning_rate": 3.187709862824499e-05, + "loss": 0.8397, + "step": 6545 + }, + { + "epoch": 0.3187495434957271, + "grad_norm": 1.4161595106124878, + "learning_rate": 3.187456056406322e-05, + "loss": 0.8531, + "step": 6546 + }, + { + "epoch": 0.3187982372848343, + "grad_norm": 2.0155675411224365, + "learning_rate": 3.1872022204500665e-05, + "loss": 0.8626, + "step": 6547 + }, + { + "epoch": 0.3188469310739415, + "grad_norm": 4.3754448890686035, + "learning_rate": 3.186948354962048e-05, + "loss": 0.8098, + "step": 6548 + }, + { + "epoch": 0.31889562486304873, + "grad_norm": 1.7271322011947632, + "learning_rate": 3.186694459948581e-05, + "loss": 0.8847, + "step": 6549 + }, + { + "epoch": 0.31894431865215594, + "grad_norm": 1.3686087131500244, + "learning_rate": 3.18644053541598e-05, + "loss": 0.8067, + "step": 6550 + }, + { + "epoch": 0.3189930124412631, + "grad_norm": 1.7956490516662598, + "learning_rate": 3.186186581370564e-05, + "loss": 0.9132, + "step": 6551 + }, + { + "epoch": 0.3190417062303703, + "grad_norm": 1.5389785766601562, + "learning_rate": 3.185932597818648e-05, + "loss": 0.877, + "step": 6552 + }, + { + "epoch": 0.3190904000194775, + "grad_norm": 2.0372819900512695, + "learning_rate": 3.185678584766551e-05, + "loss": 0.8359, + "step": 6553 + }, + { + "epoch": 0.31913909380858474, + "grad_norm": 1.7331911325454712, + "learning_rate": 3.1854245422205904e-05, + "loss": 0.901, + "step": 6554 + }, + { + "epoch": 0.3191877875976919, + "grad_norm": 1.461622714996338, + "learning_rate": 3.185170470187087e-05, + "loss": 0.9422, + "step": 6555 + }, + { + "epoch": 0.3192364813867991, + "grad_norm": 2.174774408340454, + "learning_rate": 3.184916368672359e-05, + "loss": 0.8624, + "step": 6556 + }, + { + "epoch": 0.3192851751759063, + "grad_norm": 1.4160430431365967, + "learning_rate": 3.1846622376827294e-05, + "loss": 0.8858, + "step": 6557 + }, + { + "epoch": 0.31933386896501353, + "grad_norm": 1.3859583139419556, + "learning_rate": 3.1844080772245184e-05, + "loss": 0.9253, + "step": 6558 + }, + { + "epoch": 0.3193825627541207, + "grad_norm": 1.6433717012405396, + "learning_rate": 3.184153887304048e-05, + "loss": 0.863, + "step": 6559 + }, + { + "epoch": 0.3194312565432279, + "grad_norm": 3.560943841934204, + "learning_rate": 3.183899667927642e-05, + "loss": 0.9164, + "step": 6560 + }, + { + "epoch": 0.3194799503323351, + "grad_norm": 1.647711157798767, + "learning_rate": 3.183645419101623e-05, + "loss": 0.8592, + "step": 6561 + }, + { + "epoch": 0.31952864412144233, + "grad_norm": 1.31351900100708, + "learning_rate": 3.183391140832317e-05, + "loss": 0.8111, + "step": 6562 + }, + { + "epoch": 0.3195773379105495, + "grad_norm": 1.247298240661621, + "learning_rate": 3.1831368331260485e-05, + "loss": 0.8572, + "step": 6563 + }, + { + "epoch": 0.3196260316996567, + "grad_norm": 1.6107439994812012, + "learning_rate": 3.1828824959891436e-05, + "loss": 0.8906, + "step": 6564 + }, + { + "epoch": 0.3196747254887639, + "grad_norm": 1.4432486295700073, + "learning_rate": 3.182628129427928e-05, + "loss": 0.8219, + "step": 6565 + }, + { + "epoch": 0.3197234192778711, + "grad_norm": 1.5935029983520508, + "learning_rate": 3.182373733448731e-05, + "loss": 0.9314, + "step": 6566 + }, + { + "epoch": 0.3197721130669783, + "grad_norm": 1.812090277671814, + "learning_rate": 3.1821193080578794e-05, + "loss": 0.9006, + "step": 6567 + }, + { + "epoch": 0.3198208068560855, + "grad_norm": 1.9075114727020264, + "learning_rate": 3.181864853261701e-05, + "loss": 0.9989, + "step": 6568 + }, + { + "epoch": 0.3198695006451927, + "grad_norm": 1.8086403608322144, + "learning_rate": 3.181610369066528e-05, + "loss": 0.8607, + "step": 6569 + }, + { + "epoch": 0.3199181944342999, + "grad_norm": 2.442981481552124, + "learning_rate": 3.1813558554786883e-05, + "loss": 0.862, + "step": 6570 + }, + { + "epoch": 0.31996688822340713, + "grad_norm": 2.4527909755706787, + "learning_rate": 3.181101312504514e-05, + "loss": 0.8528, + "step": 6571 + }, + { + "epoch": 0.3200155820125143, + "grad_norm": 1.631928563117981, + "learning_rate": 3.180846740150338e-05, + "loss": 0.8625, + "step": 6572 + }, + { + "epoch": 0.3200642758016215, + "grad_norm": 1.8801109790802002, + "learning_rate": 3.180592138422491e-05, + "loss": 0.9101, + "step": 6573 + }, + { + "epoch": 0.3201129695907287, + "grad_norm": 1.6854352951049805, + "learning_rate": 3.1803375073273066e-05, + "loss": 0.8495, + "step": 6574 + }, + { + "epoch": 0.3201616633798359, + "grad_norm": 0.09032802283763885, + "learning_rate": 3.1800828468711196e-05, + "loss": 0.6335, + "step": 6575 + }, + { + "epoch": 0.3202103571689431, + "grad_norm": 2.8968968391418457, + "learning_rate": 3.1798281570602645e-05, + "loss": 0.8945, + "step": 6576 + }, + { + "epoch": 0.3202590509580503, + "grad_norm": 2.085297107696533, + "learning_rate": 3.179573437901075e-05, + "loss": 0.8272, + "step": 6577 + }, + { + "epoch": 0.3203077447471575, + "grad_norm": 1.5540181398391724, + "learning_rate": 3.17931868939989e-05, + "loss": 0.926, + "step": 6578 + }, + { + "epoch": 0.3203564385362647, + "grad_norm": 2.4475557804107666, + "learning_rate": 3.179063911563044e-05, + "loss": 0.8064, + "step": 6579 + }, + { + "epoch": 0.3204051323253719, + "grad_norm": 1.4275943040847778, + "learning_rate": 3.1788091043968766e-05, + "loss": 0.9382, + "step": 6580 + }, + { + "epoch": 0.3204538261144791, + "grad_norm": 1.7336812019348145, + "learning_rate": 3.178554267907725e-05, + "loss": 0.8056, + "step": 6581 + }, + { + "epoch": 0.3205025199035863, + "grad_norm": 0.08753366023302078, + "learning_rate": 3.1782994021019286e-05, + "loss": 0.5843, + "step": 6582 + }, + { + "epoch": 0.3205512136926935, + "grad_norm": 1.374848484992981, + "learning_rate": 3.178044506985827e-05, + "loss": 0.8487, + "step": 6583 + }, + { + "epoch": 0.32059990748180067, + "grad_norm": 1.7552258968353271, + "learning_rate": 3.177789582565761e-05, + "loss": 0.8557, + "step": 6584 + }, + { + "epoch": 0.3206486012709079, + "grad_norm": 1.460379958152771, + "learning_rate": 3.177534628848072e-05, + "loss": 0.8627, + "step": 6585 + }, + { + "epoch": 0.3206972950600151, + "grad_norm": 1.9531389474868774, + "learning_rate": 3.177279645839101e-05, + "loss": 0.8407, + "step": 6586 + }, + { + "epoch": 0.3207459888491223, + "grad_norm": 1.299349069595337, + "learning_rate": 3.177024633545192e-05, + "loss": 0.7784, + "step": 6587 + }, + { + "epoch": 0.32079468263822947, + "grad_norm": 1.342753291130066, + "learning_rate": 3.176769591972688e-05, + "loss": 0.8594, + "step": 6588 + }, + { + "epoch": 0.3208433764273367, + "grad_norm": 1.7334123849868774, + "learning_rate": 3.176514521127932e-05, + "loss": 0.9176, + "step": 6589 + }, + { + "epoch": 0.3208920702164439, + "grad_norm": 4.056656837463379, + "learning_rate": 3.1762594210172717e-05, + "loss": 0.8785, + "step": 6590 + }, + { + "epoch": 0.3209407640055511, + "grad_norm": 1.7550737857818604, + "learning_rate": 3.17600429164705e-05, + "loss": 0.9247, + "step": 6591 + }, + { + "epoch": 0.3209894577946583, + "grad_norm": 1.9097501039505005, + "learning_rate": 3.175749133023615e-05, + "loss": 0.7309, + "step": 6592 + }, + { + "epoch": 0.3210381515837655, + "grad_norm": 1.2635191679000854, + "learning_rate": 3.175493945153313e-05, + "loss": 0.9183, + "step": 6593 + }, + { + "epoch": 0.3210868453728727, + "grad_norm": 2.0214810371398926, + "learning_rate": 3.175238728042491e-05, + "loss": 0.8739, + "step": 6594 + }, + { + "epoch": 0.3211355391619799, + "grad_norm": 2.0379509925842285, + "learning_rate": 3.1749834816975e-05, + "loss": 0.962, + "step": 6595 + }, + { + "epoch": 0.3211842329510871, + "grad_norm": 1.7711312770843506, + "learning_rate": 3.174728206124686e-05, + "loss": 0.8132, + "step": 6596 + }, + { + "epoch": 0.32123292674019427, + "grad_norm": 1.7567107677459717, + "learning_rate": 3.174472901330402e-05, + "loss": 0.8748, + "step": 6597 + }, + { + "epoch": 0.3212816205293015, + "grad_norm": 1.4980756044387817, + "learning_rate": 3.174217567320997e-05, + "loss": 0.8439, + "step": 6598 + }, + { + "epoch": 0.3213303143184087, + "grad_norm": 1.6848161220550537, + "learning_rate": 3.173962204102823e-05, + "loss": 0.8317, + "step": 6599 + }, + { + "epoch": 0.3213790081075159, + "grad_norm": 1.3821547031402588, + "learning_rate": 3.173706811682232e-05, + "loss": 0.7725, + "step": 6600 + }, + { + "epoch": 0.32142770189662306, + "grad_norm": 1.4694141149520874, + "learning_rate": 3.173451390065577e-05, + "loss": 0.8123, + "step": 6601 + }, + { + "epoch": 0.3214763956857303, + "grad_norm": 1.3192139863967896, + "learning_rate": 3.173195939259213e-05, + "loss": 0.8088, + "step": 6602 + }, + { + "epoch": 0.3215250894748375, + "grad_norm": 2.1130521297454834, + "learning_rate": 3.172940459269492e-05, + "loss": 0.9372, + "step": 6603 + }, + { + "epoch": 0.3215737832639447, + "grad_norm": 3.034015655517578, + "learning_rate": 3.1726849501027696e-05, + "loss": 0.9068, + "step": 6604 + }, + { + "epoch": 0.32162247705305186, + "grad_norm": 1.983691930770874, + "learning_rate": 3.172429411765403e-05, + "loss": 0.8788, + "step": 6605 + }, + { + "epoch": 0.32167117084215907, + "grad_norm": 1.5112581253051758, + "learning_rate": 3.172173844263748e-05, + "loss": 0.9312, + "step": 6606 + }, + { + "epoch": 0.3217198646312663, + "grad_norm": 1.7135006189346313, + "learning_rate": 3.1719182476041614e-05, + "loss": 0.8098, + "step": 6607 + }, + { + "epoch": 0.3217685584203735, + "grad_norm": 3.8551011085510254, + "learning_rate": 3.1716626217930015e-05, + "loss": 0.8579, + "step": 6608 + }, + { + "epoch": 0.32181725220948065, + "grad_norm": 1.3381634950637817, + "learning_rate": 3.1714069668366265e-05, + "loss": 0.8286, + "step": 6609 + }, + { + "epoch": 0.32186594599858787, + "grad_norm": 1.391709804534912, + "learning_rate": 3.1711512827413966e-05, + "loss": 0.9571, + "step": 6610 + }, + { + "epoch": 0.3219146397876951, + "grad_norm": 1.967717170715332, + "learning_rate": 3.1708955695136725e-05, + "loss": 0.771, + "step": 6611 + }, + { + "epoch": 0.3219633335768023, + "grad_norm": 1.2198318243026733, + "learning_rate": 3.170639827159814e-05, + "loss": 0.8907, + "step": 6612 + }, + { + "epoch": 0.3220120273659095, + "grad_norm": 1.2461308240890503, + "learning_rate": 3.170384055686183e-05, + "loss": 0.8492, + "step": 6613 + }, + { + "epoch": 0.32206072115501666, + "grad_norm": 1.3434689044952393, + "learning_rate": 3.170128255099142e-05, + "loss": 0.9154, + "step": 6614 + }, + { + "epoch": 0.3221094149441239, + "grad_norm": 2.032625675201416, + "learning_rate": 3.169872425405053e-05, + "loss": 0.864, + "step": 6615 + }, + { + "epoch": 0.3221581087332311, + "grad_norm": 1.531942367553711, + "learning_rate": 3.169616566610283e-05, + "loss": 0.8161, + "step": 6616 + }, + { + "epoch": 0.3222068025223383, + "grad_norm": 1.5048131942749023, + "learning_rate": 3.1693606787211924e-05, + "loss": 0.9205, + "step": 6617 + }, + { + "epoch": 0.32225549631144546, + "grad_norm": 1.3765552043914795, + "learning_rate": 3.169104761744149e-05, + "loss": 0.8894, + "step": 6618 + }, + { + "epoch": 0.32230419010055267, + "grad_norm": 1.8727883100509644, + "learning_rate": 3.1688488156855174e-05, + "loss": 0.8772, + "step": 6619 + }, + { + "epoch": 0.3223528838896599, + "grad_norm": 1.268607497215271, + "learning_rate": 3.1685928405516655e-05, + "loss": 0.818, + "step": 6620 + }, + { + "epoch": 0.3224015776787671, + "grad_norm": 1.592905044555664, + "learning_rate": 3.16833683634896e-05, + "loss": 0.8149, + "step": 6621 + }, + { + "epoch": 0.32245027146787425, + "grad_norm": 1.6642502546310425, + "learning_rate": 3.168080803083769e-05, + "loss": 0.8041, + "step": 6622 + }, + { + "epoch": 0.32249896525698146, + "grad_norm": 1.8051581382751465, + "learning_rate": 3.167824740762462e-05, + "loss": 0.7676, + "step": 6623 + }, + { + "epoch": 0.3225476590460887, + "grad_norm": 1.7080379724502563, + "learning_rate": 3.167568649391408e-05, + "loss": 0.8579, + "step": 6624 + }, + { + "epoch": 0.3225963528351959, + "grad_norm": 9.766200065612793, + "learning_rate": 3.167312528976977e-05, + "loss": 0.8072, + "step": 6625 + }, + { + "epoch": 0.32264504662430304, + "grad_norm": 1.991890549659729, + "learning_rate": 3.167056379525541e-05, + "loss": 0.8671, + "step": 6626 + }, + { + "epoch": 0.32269374041341026, + "grad_norm": 1.3734138011932373, + "learning_rate": 3.16680020104347e-05, + "loss": 0.8686, + "step": 6627 + }, + { + "epoch": 0.32274243420251747, + "grad_norm": 2.3557240962982178, + "learning_rate": 3.166543993537139e-05, + "loss": 0.7647, + "step": 6628 + }, + { + "epoch": 0.3227911279916247, + "grad_norm": 1.1572239398956299, + "learning_rate": 3.166287757012919e-05, + "loss": 0.7763, + "step": 6629 + }, + { + "epoch": 0.3228398217807319, + "grad_norm": 1.7453733682632446, + "learning_rate": 3.166031491477185e-05, + "loss": 0.8281, + "step": 6630 + }, + { + "epoch": 0.32288851556983905, + "grad_norm": 1.346706748008728, + "learning_rate": 3.165775196936311e-05, + "loss": 0.826, + "step": 6631 + }, + { + "epoch": 0.32293720935894626, + "grad_norm": 3.0862438678741455, + "learning_rate": 3.165518873396673e-05, + "loss": 0.8863, + "step": 6632 + }, + { + "epoch": 0.3229859031480535, + "grad_norm": 1.7284021377563477, + "learning_rate": 3.1652625208646466e-05, + "loss": 0.9414, + "step": 6633 + }, + { + "epoch": 0.3230345969371607, + "grad_norm": 1.5639209747314453, + "learning_rate": 3.1650061393466086e-05, + "loss": 0.9284, + "step": 6634 + }, + { + "epoch": 0.32308329072626785, + "grad_norm": 2.057659864425659, + "learning_rate": 3.164749728848937e-05, + "loss": 0.8347, + "step": 6635 + }, + { + "epoch": 0.32313198451537506, + "grad_norm": 1.6950459480285645, + "learning_rate": 3.164493289378009e-05, + "loss": 0.7928, + "step": 6636 + }, + { + "epoch": 0.32318067830448227, + "grad_norm": 1.3989629745483398, + "learning_rate": 3.1642368209402055e-05, + "loss": 0.8166, + "step": 6637 + }, + { + "epoch": 0.3232293720935895, + "grad_norm": 1.6841671466827393, + "learning_rate": 3.163980323541904e-05, + "loss": 0.9049, + "step": 6638 + }, + { + "epoch": 0.32327806588269664, + "grad_norm": 1.4125896692276, + "learning_rate": 3.1637237971894864e-05, + "loss": 0.845, + "step": 6639 + }, + { + "epoch": 0.32332675967180385, + "grad_norm": 2.0824010372161865, + "learning_rate": 3.163467241889333e-05, + "loss": 0.8448, + "step": 6640 + }, + { + "epoch": 0.32337545346091107, + "grad_norm": 1.300235629081726, + "learning_rate": 3.163210657647826e-05, + "loss": 0.8802, + "step": 6641 + }, + { + "epoch": 0.3234241472500183, + "grad_norm": 1.443047046661377, + "learning_rate": 3.162954044471347e-05, + "loss": 0.9283, + "step": 6642 + }, + { + "epoch": 0.32347284103912544, + "grad_norm": 1.3783786296844482, + "learning_rate": 3.16269740236628e-05, + "loss": 0.894, + "step": 6643 + }, + { + "epoch": 0.32352153482823265, + "grad_norm": 2.247636556625366, + "learning_rate": 3.162440731339011e-05, + "loss": 0.9345, + "step": 6644 + }, + { + "epoch": 0.32357022861733986, + "grad_norm": 1.7661700248718262, + "learning_rate": 3.162184031395922e-05, + "loss": 0.8101, + "step": 6645 + }, + { + "epoch": 0.3236189224064471, + "grad_norm": 2.243375301361084, + "learning_rate": 3.1619273025433984e-05, + "loss": 0.9038, + "step": 6646 + }, + { + "epoch": 0.32366761619555423, + "grad_norm": 3.1466665267944336, + "learning_rate": 3.161670544787828e-05, + "loss": 0.9915, + "step": 6647 + }, + { + "epoch": 0.32371630998466144, + "grad_norm": 0.08933845907449722, + "learning_rate": 3.161413758135596e-05, + "loss": 0.5848, + "step": 6648 + }, + { + "epoch": 0.32376500377376866, + "grad_norm": 2.0551528930664062, + "learning_rate": 3.161156942593092e-05, + "loss": 0.7699, + "step": 6649 + }, + { + "epoch": 0.32381369756287587, + "grad_norm": 1.987022876739502, + "learning_rate": 3.160900098166703e-05, + "loss": 0.8311, + "step": 6650 + }, + { + "epoch": 0.3238623913519831, + "grad_norm": 1.268905520439148, + "learning_rate": 3.160643224862817e-05, + "loss": 0.8737, + "step": 6651 + }, + { + "epoch": 0.32391108514109024, + "grad_norm": 1.769850492477417, + "learning_rate": 3.160386322687826e-05, + "loss": 0.8358, + "step": 6652 + }, + { + "epoch": 0.32395977893019745, + "grad_norm": 1.8954436779022217, + "learning_rate": 3.16012939164812e-05, + "loss": 0.8502, + "step": 6653 + }, + { + "epoch": 0.32400847271930466, + "grad_norm": 1.3362302780151367, + "learning_rate": 3.1598724317500887e-05, + "loss": 0.9022, + "step": 6654 + }, + { + "epoch": 0.3240571665084119, + "grad_norm": 1.5802267789840698, + "learning_rate": 3.1596154430001244e-05, + "loss": 0.88, + "step": 6655 + }, + { + "epoch": 0.32410586029751903, + "grad_norm": 1.7309455871582031, + "learning_rate": 3.159358425404621e-05, + "loss": 0.9046, + "step": 6656 + }, + { + "epoch": 0.32415455408662625, + "grad_norm": 1.3302216529846191, + "learning_rate": 3.159101378969971e-05, + "loss": 0.8365, + "step": 6657 + }, + { + "epoch": 0.32420324787573346, + "grad_norm": 2.4564647674560547, + "learning_rate": 3.158844303702568e-05, + "loss": 0.743, + "step": 6658 + }, + { + "epoch": 0.32425194166484067, + "grad_norm": 1.2783093452453613, + "learning_rate": 3.1585871996088085e-05, + "loss": 0.8726, + "step": 6659 + }, + { + "epoch": 0.3243006354539478, + "grad_norm": 1.6421149969100952, + "learning_rate": 3.1583300666950856e-05, + "loss": 0.8873, + "step": 6660 + }, + { + "epoch": 0.32434932924305504, + "grad_norm": 1.8848103284835815, + "learning_rate": 3.1580729049677974e-05, + "loss": 0.8869, + "step": 6661 + }, + { + "epoch": 0.32439802303216225, + "grad_norm": 1.7615197896957397, + "learning_rate": 3.1578157144333395e-05, + "loss": 0.9103, + "step": 6662 + }, + { + "epoch": 0.32444671682126947, + "grad_norm": 2.506169557571411, + "learning_rate": 3.157558495098111e-05, + "loss": 0.9622, + "step": 6663 + }, + { + "epoch": 0.3244954106103766, + "grad_norm": 1.3947837352752686, + "learning_rate": 3.1573012469685085e-05, + "loss": 0.7728, + "step": 6664 + }, + { + "epoch": 0.32454410439948383, + "grad_norm": 0.08296692371368408, + "learning_rate": 3.1570439700509324e-05, + "loss": 0.5672, + "step": 6665 + }, + { + "epoch": 0.32459279818859105, + "grad_norm": 0.08361419290304184, + "learning_rate": 3.156786664351782e-05, + "loss": 0.6464, + "step": 6666 + }, + { + "epoch": 0.32464149197769826, + "grad_norm": 2.037299633026123, + "learning_rate": 3.156529329877458e-05, + "loss": 0.8282, + "step": 6667 + }, + { + "epoch": 0.3246901857668054, + "grad_norm": 1.659567952156067, + "learning_rate": 3.156271966634361e-05, + "loss": 0.8703, + "step": 6668 + }, + { + "epoch": 0.32473887955591263, + "grad_norm": 2.1685848236083984, + "learning_rate": 3.1560145746288935e-05, + "loss": 0.8521, + "step": 6669 + }, + { + "epoch": 0.32478757334501984, + "grad_norm": 1.666147232055664, + "learning_rate": 3.155757153867458e-05, + "loss": 0.7859, + "step": 6670 + }, + { + "epoch": 0.32483626713412705, + "grad_norm": 3.0230696201324463, + "learning_rate": 3.155499704356459e-05, + "loss": 0.8327, + "step": 6671 + }, + { + "epoch": 0.32488496092323427, + "grad_norm": 1.2615134716033936, + "learning_rate": 3.155242226102298e-05, + "loss": 0.9227, + "step": 6672 + }, + { + "epoch": 0.3249336547123414, + "grad_norm": 1.8950589895248413, + "learning_rate": 3.154984719111382e-05, + "loss": 0.8321, + "step": 6673 + }, + { + "epoch": 0.32498234850144864, + "grad_norm": 1.4312695264816284, + "learning_rate": 3.154727183390116e-05, + "loss": 0.7616, + "step": 6674 + }, + { + "epoch": 0.32503104229055585, + "grad_norm": 1.3586229085922241, + "learning_rate": 3.154469618944906e-05, + "loss": 0.8493, + "step": 6675 + }, + { + "epoch": 0.32507973607966306, + "grad_norm": 1.404473900794983, + "learning_rate": 3.154212025782159e-05, + "loss": 0.9303, + "step": 6676 + }, + { + "epoch": 0.3251284298687702, + "grad_norm": 0.08799708634614944, + "learning_rate": 3.153954403908282e-05, + "loss": 0.6267, + "step": 6677 + }, + { + "epoch": 0.32517712365787743, + "grad_norm": 1.728157639503479, + "learning_rate": 3.1536967533296846e-05, + "loss": 0.8898, + "step": 6678 + }, + { + "epoch": 0.32522581744698464, + "grad_norm": 1.628537654876709, + "learning_rate": 3.153439074052774e-05, + "loss": 0.7691, + "step": 6679 + }, + { + "epoch": 0.32527451123609186, + "grad_norm": 1.9406235218048096, + "learning_rate": 3.153181366083963e-05, + "loss": 0.8839, + "step": 6680 + }, + { + "epoch": 0.325323205025199, + "grad_norm": 1.584962010383606, + "learning_rate": 3.15292362942966e-05, + "loss": 0.846, + "step": 6681 + }, + { + "epoch": 0.3253718988143062, + "grad_norm": 1.5787243843078613, + "learning_rate": 3.1526658640962756e-05, + "loss": 0.8713, + "step": 6682 + }, + { + "epoch": 0.32542059260341344, + "grad_norm": 1.3668243885040283, + "learning_rate": 3.152408070090224e-05, + "loss": 0.9278, + "step": 6683 + }, + { + "epoch": 0.32546928639252065, + "grad_norm": 1.3168278932571411, + "learning_rate": 3.152150247417916e-05, + "loss": 0.858, + "step": 6684 + }, + { + "epoch": 0.3255179801816278, + "grad_norm": 2.094820499420166, + "learning_rate": 3.151892396085765e-05, + "loss": 0.9275, + "step": 6685 + }, + { + "epoch": 0.325566673970735, + "grad_norm": 2.424050807952881, + "learning_rate": 3.1516345161001866e-05, + "loss": 0.9413, + "step": 6686 + }, + { + "epoch": 0.32561536775984223, + "grad_norm": 0.0908690020442009, + "learning_rate": 3.151376607467594e-05, + "loss": 0.6452, + "step": 6687 + }, + { + "epoch": 0.32566406154894945, + "grad_norm": 1.5253779888153076, + "learning_rate": 3.1511186701944036e-05, + "loss": 0.8251, + "step": 6688 + }, + { + "epoch": 0.3257127553380566, + "grad_norm": 3.1522762775421143, + "learning_rate": 3.1508607042870314e-05, + "loss": 0.8589, + "step": 6689 + }, + { + "epoch": 0.3257614491271638, + "grad_norm": 1.9650297164916992, + "learning_rate": 3.1506027097518944e-05, + "loss": 0.846, + "step": 6690 + }, + { + "epoch": 0.32581014291627103, + "grad_norm": 1.949411153793335, + "learning_rate": 3.150344686595409e-05, + "loss": 0.8814, + "step": 6691 + }, + { + "epoch": 0.32585883670537824, + "grad_norm": 1.8923295736312866, + "learning_rate": 3.150086634823995e-05, + "loss": 0.8935, + "step": 6692 + }, + { + "epoch": 0.32590753049448545, + "grad_norm": 1.5506548881530762, + "learning_rate": 3.149828554444072e-05, + "loss": 0.8999, + "step": 6693 + }, + { + "epoch": 0.3259562242835926, + "grad_norm": 1.8126059770584106, + "learning_rate": 3.149570445462058e-05, + "loss": 0.9015, + "step": 6694 + }, + { + "epoch": 0.3260049180726998, + "grad_norm": 1.6141685247421265, + "learning_rate": 3.149312307884375e-05, + "loss": 0.894, + "step": 6695 + }, + { + "epoch": 0.32605361186180704, + "grad_norm": 1.5959880352020264, + "learning_rate": 3.149054141717444e-05, + "loss": 0.8216, + "step": 6696 + }, + { + "epoch": 0.32610230565091425, + "grad_norm": 2.28363299369812, + "learning_rate": 3.148795946967685e-05, + "loss": 0.8882, + "step": 6697 + }, + { + "epoch": 0.3261509994400214, + "grad_norm": 1.9506628513336182, + "learning_rate": 3.148537723641523e-05, + "loss": 0.8309, + "step": 6698 + }, + { + "epoch": 0.3261996932291286, + "grad_norm": 1.5649038553237915, + "learning_rate": 3.14827947174538e-05, + "loss": 0.9103, + "step": 6699 + }, + { + "epoch": 0.32624838701823583, + "grad_norm": 2.540811061859131, + "learning_rate": 3.1480211912856807e-05, + "loss": 0.9833, + "step": 6700 + }, + { + "epoch": 0.32629708080734304, + "grad_norm": 1.5926240682601929, + "learning_rate": 3.14776288226885e-05, + "loss": 0.9283, + "step": 6701 + }, + { + "epoch": 0.3263457745964502, + "grad_norm": 2.3723666667938232, + "learning_rate": 3.147504544701312e-05, + "loss": 0.8249, + "step": 6702 + }, + { + "epoch": 0.3263944683855574, + "grad_norm": 1.7258912324905396, + "learning_rate": 3.147246178589494e-05, + "loss": 0.8759, + "step": 6703 + }, + { + "epoch": 0.3264431621746646, + "grad_norm": 1.590468168258667, + "learning_rate": 3.146987783939824e-05, + "loss": 0.8086, + "step": 6704 + }, + { + "epoch": 0.32649185596377184, + "grad_norm": 1.815885305404663, + "learning_rate": 3.146729360758727e-05, + "loss": 0.8483, + "step": 6705 + }, + { + "epoch": 0.326540549752879, + "grad_norm": 3.37587308883667, + "learning_rate": 3.146470909052633e-05, + "loss": 0.7877, + "step": 6706 + }, + { + "epoch": 0.3265892435419862, + "grad_norm": 1.274954080581665, + "learning_rate": 3.1462124288279705e-05, + "loss": 0.8104, + "step": 6707 + }, + { + "epoch": 0.3266379373310934, + "grad_norm": 2.613670587539673, + "learning_rate": 3.1459539200911695e-05, + "loss": 0.8993, + "step": 6708 + }, + { + "epoch": 0.32668663112020063, + "grad_norm": 2.6192822456359863, + "learning_rate": 3.14569538284866e-05, + "loss": 0.8124, + "step": 6709 + }, + { + "epoch": 0.32673532490930784, + "grad_norm": 1.7249181270599365, + "learning_rate": 3.145436817106873e-05, + "loss": 0.8344, + "step": 6710 + }, + { + "epoch": 0.326784018698415, + "grad_norm": 1.2923803329467773, + "learning_rate": 3.1451782228722416e-05, + "loss": 0.9273, + "step": 6711 + }, + { + "epoch": 0.3268327124875222, + "grad_norm": 1.9016242027282715, + "learning_rate": 3.144919600151197e-05, + "loss": 0.8626, + "step": 6712 + }, + { + "epoch": 0.3268814062766294, + "grad_norm": 1.7961137294769287, + "learning_rate": 3.144660948950173e-05, + "loss": 0.8905, + "step": 6713 + }, + { + "epoch": 0.32693010006573664, + "grad_norm": 1.4896254539489746, + "learning_rate": 3.144402269275604e-05, + "loss": 0.9404, + "step": 6714 + }, + { + "epoch": 0.3269787938548438, + "grad_norm": 3.7562429904937744, + "learning_rate": 3.1441435611339236e-05, + "loss": 0.8735, + "step": 6715 + }, + { + "epoch": 0.327027487643951, + "grad_norm": 0.0922982320189476, + "learning_rate": 3.1438848245315676e-05, + "loss": 0.623, + "step": 6716 + }, + { + "epoch": 0.3270761814330582, + "grad_norm": 2.2387475967407227, + "learning_rate": 3.1436260594749733e-05, + "loss": 0.9383, + "step": 6717 + }, + { + "epoch": 0.32712487522216543, + "grad_norm": 2.329988956451416, + "learning_rate": 3.143367265970575e-05, + "loss": 0.8247, + "step": 6718 + }, + { + "epoch": 0.3271735690112726, + "grad_norm": 1.6179461479187012, + "learning_rate": 3.143108444024813e-05, + "loss": 0.8629, + "step": 6719 + }, + { + "epoch": 0.3272222628003798, + "grad_norm": 1.8399889469146729, + "learning_rate": 3.142849593644124e-05, + "loss": 0.916, + "step": 6720 + }, + { + "epoch": 0.327270956589487, + "grad_norm": 3.095237970352173, + "learning_rate": 3.142590714834947e-05, + "loss": 0.8167, + "step": 6721 + }, + { + "epoch": 0.32731965037859423, + "grad_norm": 1.2220675945281982, + "learning_rate": 3.1423318076037216e-05, + "loss": 0.8181, + "step": 6722 + }, + { + "epoch": 0.3273683441677014, + "grad_norm": 1.6236677169799805, + "learning_rate": 3.142072871956889e-05, + "loss": 0.7625, + "step": 6723 + }, + { + "epoch": 0.3274170379568086, + "grad_norm": 1.5411229133605957, + "learning_rate": 3.141813907900889e-05, + "loss": 0.7408, + "step": 6724 + }, + { + "epoch": 0.3274657317459158, + "grad_norm": 1.7121602296829224, + "learning_rate": 3.141554915442164e-05, + "loss": 0.8116, + "step": 6725 + }, + { + "epoch": 0.327514425535023, + "grad_norm": 2.0123608112335205, + "learning_rate": 3.141295894587156e-05, + "loss": 0.7136, + "step": 6726 + }, + { + "epoch": 0.3275631193241302, + "grad_norm": 1.780104160308838, + "learning_rate": 3.141036845342309e-05, + "loss": 0.8107, + "step": 6727 + }, + { + "epoch": 0.3276118131132374, + "grad_norm": 1.2570877075195312, + "learning_rate": 3.140777767714067e-05, + "loss": 0.9028, + "step": 6728 + }, + { + "epoch": 0.3276605069023446, + "grad_norm": 1.937677264213562, + "learning_rate": 3.140518661708873e-05, + "loss": 0.8341, + "step": 6729 + }, + { + "epoch": 0.3277092006914518, + "grad_norm": 1.299021601676941, + "learning_rate": 3.140259527333174e-05, + "loss": 0.799, + "step": 6730 + }, + { + "epoch": 0.32775789448055903, + "grad_norm": 1.671284794807434, + "learning_rate": 3.140000364593415e-05, + "loss": 0.8546, + "step": 6731 + }, + { + "epoch": 0.3278065882696662, + "grad_norm": 1.602909803390503, + "learning_rate": 3.1397411734960434e-05, + "loss": 0.8772, + "step": 6732 + }, + { + "epoch": 0.3278552820587734, + "grad_norm": 2.2906951904296875, + "learning_rate": 3.139481954047506e-05, + "loss": 0.8709, + "step": 6733 + }, + { + "epoch": 0.3279039758478806, + "grad_norm": 1.1674443483352661, + "learning_rate": 3.139222706254251e-05, + "loss": 0.9626, + "step": 6734 + }, + { + "epoch": 0.3279526696369878, + "grad_norm": 1.2207669019699097, + "learning_rate": 3.1389634301227266e-05, + "loss": 0.8684, + "step": 6735 + }, + { + "epoch": 0.328001363426095, + "grad_norm": 1.8845243453979492, + "learning_rate": 3.138704125659384e-05, + "loss": 1.029, + "step": 6736 + }, + { + "epoch": 0.3280500572152022, + "grad_norm": 1.7723641395568848, + "learning_rate": 3.138444792870672e-05, + "loss": 0.8118, + "step": 6737 + }, + { + "epoch": 0.3280987510043094, + "grad_norm": 2.0404210090637207, + "learning_rate": 3.1381854317630416e-05, + "loss": 0.8967, + "step": 6738 + }, + { + "epoch": 0.3281474447934166, + "grad_norm": 1.7392836809158325, + "learning_rate": 3.1379260423429456e-05, + "loss": 0.8285, + "step": 6739 + }, + { + "epoch": 0.3281961385825238, + "grad_norm": 1.640287160873413, + "learning_rate": 3.1376666246168355e-05, + "loss": 0.9189, + "step": 6740 + }, + { + "epoch": 0.328244832371631, + "grad_norm": 1.5304036140441895, + "learning_rate": 3.137407178591164e-05, + "loss": 0.8126, + "step": 6741 + }, + { + "epoch": 0.3282935261607382, + "grad_norm": 1.5583372116088867, + "learning_rate": 3.1371477042723854e-05, + "loss": 0.9258, + "step": 6742 + }, + { + "epoch": 0.3283422199498454, + "grad_norm": 2.1835997104644775, + "learning_rate": 3.136888201666954e-05, + "loss": 0.866, + "step": 6743 + }, + { + "epoch": 0.3283909137389526, + "grad_norm": 1.6854585409164429, + "learning_rate": 3.136628670781325e-05, + "loss": 0.9208, + "step": 6744 + }, + { + "epoch": 0.3284396075280598, + "grad_norm": 1.2305881977081299, + "learning_rate": 3.136369111621954e-05, + "loss": 0.8333, + "step": 6745 + }, + { + "epoch": 0.328488301317167, + "grad_norm": 4.230301856994629, + "learning_rate": 3.1361095241952976e-05, + "loss": 0.7742, + "step": 6746 + }, + { + "epoch": 0.3285369951062742, + "grad_norm": 1.5845783948898315, + "learning_rate": 3.1358499085078136e-05, + "loss": 0.8963, + "step": 6747 + }, + { + "epoch": 0.32858568889538137, + "grad_norm": 2.1348228454589844, + "learning_rate": 3.135590264565959e-05, + "loss": 0.9392, + "step": 6748 + }, + { + "epoch": 0.3286343826844886, + "grad_norm": 2.584657907485962, + "learning_rate": 3.135330592376193e-05, + "loss": 0.8841, + "step": 6749 + }, + { + "epoch": 0.3286830764735958, + "grad_norm": 1.5858392715454102, + "learning_rate": 3.135070891944976e-05, + "loss": 0.8043, + "step": 6750 + }, + { + "epoch": 0.328731770262703, + "grad_norm": 2.004112958908081, + "learning_rate": 3.134811163278767e-05, + "loss": 0.9125, + "step": 6751 + }, + { + "epoch": 0.3287804640518102, + "grad_norm": 1.3978185653686523, + "learning_rate": 3.1345514063840255e-05, + "loss": 0.7476, + "step": 6752 + }, + { + "epoch": 0.3288291578409174, + "grad_norm": 1.8689191341400146, + "learning_rate": 3.1342916212672156e-05, + "loss": 0.8134, + "step": 6753 + }, + { + "epoch": 0.3288778516300246, + "grad_norm": 1.8693161010742188, + "learning_rate": 3.134031807934798e-05, + "loss": 0.7753, + "step": 6754 + }, + { + "epoch": 0.3289265454191318, + "grad_norm": 2.5568249225616455, + "learning_rate": 3.133771966393235e-05, + "loss": 0.8119, + "step": 6755 + }, + { + "epoch": 0.328975239208239, + "grad_norm": 2.3341846466064453, + "learning_rate": 3.133512096648992e-05, + "loss": 0.8729, + "step": 6756 + }, + { + "epoch": 0.32902393299734617, + "grad_norm": 2.541746139526367, + "learning_rate": 3.133252198708531e-05, + "loss": 0.8415, + "step": 6757 + }, + { + "epoch": 0.3290726267864534, + "grad_norm": 1.210425615310669, + "learning_rate": 3.132992272578319e-05, + "loss": 0.8642, + "step": 6758 + }, + { + "epoch": 0.3291213205755606, + "grad_norm": 1.6054998636245728, + "learning_rate": 3.1327323182648205e-05, + "loss": 0.8728, + "step": 6759 + }, + { + "epoch": 0.3291700143646678, + "grad_norm": 2.2314696311950684, + "learning_rate": 3.132472335774503e-05, + "loss": 0.7728, + "step": 6760 + }, + { + "epoch": 0.32921870815377496, + "grad_norm": 2.0650830268859863, + "learning_rate": 3.132212325113832e-05, + "loss": 0.8254, + "step": 6761 + }, + { + "epoch": 0.3292674019428822, + "grad_norm": 1.756251335144043, + "learning_rate": 3.1319522862892776e-05, + "loss": 0.8566, + "step": 6762 + }, + { + "epoch": 0.3293160957319894, + "grad_norm": 1.6282141208648682, + "learning_rate": 3.131692219307306e-05, + "loss": 0.8935, + "step": 6763 + }, + { + "epoch": 0.3293647895210966, + "grad_norm": 1.826479196548462, + "learning_rate": 3.131432124174387e-05, + "loss": 0.8433, + "step": 6764 + }, + { + "epoch": 0.32941348331020376, + "grad_norm": 1.776387333869934, + "learning_rate": 3.131172000896991e-05, + "loss": 0.9406, + "step": 6765 + }, + { + "epoch": 0.32946217709931097, + "grad_norm": 1.4162495136260986, + "learning_rate": 3.130911849481589e-05, + "loss": 0.8699, + "step": 6766 + }, + { + "epoch": 0.3295108708884182, + "grad_norm": 1.4741119146347046, + "learning_rate": 3.1306516699346505e-05, + "loss": 0.9511, + "step": 6767 + }, + { + "epoch": 0.3295595646775254, + "grad_norm": 0.08683761954307556, + "learning_rate": 3.130391462262649e-05, + "loss": 0.5983, + "step": 6768 + }, + { + "epoch": 0.3296082584666326, + "grad_norm": 1.7149204015731812, + "learning_rate": 3.130131226472057e-05, + "loss": 0.9035, + "step": 6769 + }, + { + "epoch": 0.32965695225573977, + "grad_norm": 2.313877582550049, + "learning_rate": 3.129870962569348e-05, + "loss": 0.7847, + "step": 6770 + }, + { + "epoch": 0.329705646044847, + "grad_norm": 1.8303027153015137, + "learning_rate": 3.129610670560995e-05, + "loss": 0.8415, + "step": 6771 + }, + { + "epoch": 0.3297543398339542, + "grad_norm": 1.2771053314208984, + "learning_rate": 3.1293503504534744e-05, + "loss": 0.9211, + "step": 6772 + }, + { + "epoch": 0.3298030336230614, + "grad_norm": 1.543686866760254, + "learning_rate": 3.1290900022532597e-05, + "loss": 0.8372, + "step": 6773 + }, + { + "epoch": 0.32985172741216856, + "grad_norm": 1.9677023887634277, + "learning_rate": 3.1288296259668296e-05, + "loss": 0.817, + "step": 6774 + }, + { + "epoch": 0.3299004212012758, + "grad_norm": 1.4282634258270264, + "learning_rate": 3.1285692216006584e-05, + "loss": 0.8975, + "step": 6775 + }, + { + "epoch": 0.329949114990383, + "grad_norm": 2.034269332885742, + "learning_rate": 3.128308789161226e-05, + "loss": 0.8251, + "step": 6776 + }, + { + "epoch": 0.3299978087794902, + "grad_norm": 1.8255960941314697, + "learning_rate": 3.128048328655009e-05, + "loss": 0.9126, + "step": 6777 + }, + { + "epoch": 0.33004650256859736, + "grad_norm": 2.1742095947265625, + "learning_rate": 3.1277878400884865e-05, + "loss": 0.8866, + "step": 6778 + }, + { + "epoch": 0.33009519635770457, + "grad_norm": 1.626583218574524, + "learning_rate": 3.1275273234681396e-05, + "loss": 0.8868, + "step": 6779 + }, + { + "epoch": 0.3301438901468118, + "grad_norm": 1.727569818496704, + "learning_rate": 3.127266778800448e-05, + "loss": 0.8649, + "step": 6780 + }, + { + "epoch": 0.330192583935919, + "grad_norm": 1.5482685565948486, + "learning_rate": 3.1270062060918915e-05, + "loss": 0.9145, + "step": 6781 + }, + { + "epoch": 0.33024127772502615, + "grad_norm": 2.1319637298583984, + "learning_rate": 3.126745605348953e-05, + "loss": 0.8758, + "step": 6782 + }, + { + "epoch": 0.33028997151413336, + "grad_norm": 1.6260184049606323, + "learning_rate": 3.1264849765781156e-05, + "loss": 0.834, + "step": 6783 + }, + { + "epoch": 0.3303386653032406, + "grad_norm": 0.09179456532001495, + "learning_rate": 3.126224319785861e-05, + "loss": 0.6667, + "step": 6784 + }, + { + "epoch": 0.3303873590923478, + "grad_norm": 1.406809687614441, + "learning_rate": 3.125963634978674e-05, + "loss": 0.8626, + "step": 6785 + }, + { + "epoch": 0.33043605288145494, + "grad_norm": 2.0595366954803467, + "learning_rate": 3.125702922163039e-05, + "loss": 0.8577, + "step": 6786 + }, + { + "epoch": 0.33048474667056216, + "grad_norm": 3.8033533096313477, + "learning_rate": 3.125442181345441e-05, + "loss": 0.8494, + "step": 6787 + }, + { + "epoch": 0.33053344045966937, + "grad_norm": 2.8555994033813477, + "learning_rate": 3.1251814125323664e-05, + "loss": 0.8129, + "step": 6788 + }, + { + "epoch": 0.3305821342487766, + "grad_norm": 1.3315074443817139, + "learning_rate": 3.124920615730301e-05, + "loss": 0.876, + "step": 6789 + }, + { + "epoch": 0.3306308280378838, + "grad_norm": 2.3970515727996826, + "learning_rate": 3.124659790945733e-05, + "loss": 0.9605, + "step": 6790 + }, + { + "epoch": 0.33067952182699095, + "grad_norm": 2.103196859359741, + "learning_rate": 3.12439893818515e-05, + "loss": 0.8535, + "step": 6791 + }, + { + "epoch": 0.33072821561609816, + "grad_norm": 1.4332464933395386, + "learning_rate": 3.124138057455041e-05, + "loss": 0.8023, + "step": 6792 + }, + { + "epoch": 0.3307769094052054, + "grad_norm": 1.8674852848052979, + "learning_rate": 3.1238771487618955e-05, + "loss": 0.8297, + "step": 6793 + }, + { + "epoch": 0.3308256031943126, + "grad_norm": 1.3152867555618286, + "learning_rate": 3.123616212112204e-05, + "loss": 0.8945, + "step": 6794 + }, + { + "epoch": 0.33087429698341975, + "grad_norm": 2.1517558097839355, + "learning_rate": 3.123355247512456e-05, + "loss": 0.8894, + "step": 6795 + }, + { + "epoch": 0.33092299077252696, + "grad_norm": 1.515198826789856, + "learning_rate": 3.123094254969144e-05, + "loss": 0.7994, + "step": 6796 + }, + { + "epoch": 0.33097168456163417, + "grad_norm": 1.7886542081832886, + "learning_rate": 3.122833234488759e-05, + "loss": 0.8141, + "step": 6797 + }, + { + "epoch": 0.3310203783507414, + "grad_norm": 1.353299617767334, + "learning_rate": 3.122572186077797e-05, + "loss": 0.8956, + "step": 6798 + }, + { + "epoch": 0.33106907213984854, + "grad_norm": 1.997409462928772, + "learning_rate": 3.122311109742748e-05, + "loss": 0.8054, + "step": 6799 + }, + { + "epoch": 0.33111776592895575, + "grad_norm": 1.4409445524215698, + "learning_rate": 3.122050005490108e-05, + "loss": 0.8416, + "step": 6800 + }, + { + "epoch": 0.33116645971806297, + "grad_norm": 1.76932692527771, + "learning_rate": 3.121788873326372e-05, + "loss": 0.9118, + "step": 6801 + }, + { + "epoch": 0.3312151535071702, + "grad_norm": 1.9192246198654175, + "learning_rate": 3.121527713258036e-05, + "loss": 0.7335, + "step": 6802 + }, + { + "epoch": 0.33126384729627734, + "grad_norm": 2.806427478790283, + "learning_rate": 3.121266525291595e-05, + "loss": 0.9063, + "step": 6803 + }, + { + "epoch": 0.33131254108538455, + "grad_norm": 1.530207872390747, + "learning_rate": 3.1210053094335475e-05, + "loss": 0.8885, + "step": 6804 + }, + { + "epoch": 0.33136123487449176, + "grad_norm": 1.453681468963623, + "learning_rate": 3.1207440656903905e-05, + "loss": 0.824, + "step": 6805 + }, + { + "epoch": 0.331409928663599, + "grad_norm": 2.5998623371124268, + "learning_rate": 3.1204827940686226e-05, + "loss": 0.8642, + "step": 6806 + }, + { + "epoch": 0.33145862245270613, + "grad_norm": 2.303009271621704, + "learning_rate": 3.120221494574743e-05, + "loss": 0.7983, + "step": 6807 + }, + { + "epoch": 0.33150731624181334, + "grad_norm": 1.5691465139389038, + "learning_rate": 3.119960167215253e-05, + "loss": 0.8105, + "step": 6808 + }, + { + "epoch": 0.33155601003092056, + "grad_norm": 1.9549765586853027, + "learning_rate": 3.1196988119966496e-05, + "loss": 0.9158, + "step": 6809 + }, + { + "epoch": 0.33160470382002777, + "grad_norm": 1.5138086080551147, + "learning_rate": 3.1194374289254375e-05, + "loss": 0.9134, + "step": 6810 + }, + { + "epoch": 0.331653397609135, + "grad_norm": 1.3507335186004639, + "learning_rate": 3.119176018008117e-05, + "loss": 0.7917, + "step": 6811 + }, + { + "epoch": 0.33170209139824214, + "grad_norm": 1.4591280221939087, + "learning_rate": 3.118914579251191e-05, + "loss": 0.831, + "step": 6812 + }, + { + "epoch": 0.33175078518734935, + "grad_norm": 1.800097107887268, + "learning_rate": 3.1186531126611614e-05, + "loss": 0.8661, + "step": 6813 + }, + { + "epoch": 0.33179947897645656, + "grad_norm": 1.4650321006774902, + "learning_rate": 3.1183916182445356e-05, + "loss": 0.911, + "step": 6814 + }, + { + "epoch": 0.3318481727655638, + "grad_norm": 1.6606929302215576, + "learning_rate": 3.118130096007815e-05, + "loss": 0.8969, + "step": 6815 + }, + { + "epoch": 0.33189686655467093, + "grad_norm": 1.5634698867797852, + "learning_rate": 3.117868545957508e-05, + "loss": 0.9019, + "step": 6816 + }, + { + "epoch": 0.33194556034377815, + "grad_norm": 1.5697656869888306, + "learning_rate": 3.117606968100117e-05, + "loss": 0.8609, + "step": 6817 + }, + { + "epoch": 0.33199425413288536, + "grad_norm": 1.9877463579177856, + "learning_rate": 3.117345362442152e-05, + "loss": 0.884, + "step": 6818 + }, + { + "epoch": 0.33204294792199257, + "grad_norm": 1.6866304874420166, + "learning_rate": 3.117083728990119e-05, + "loss": 0.8513, + "step": 6819 + }, + { + "epoch": 0.3320916417110997, + "grad_norm": 1.527584433555603, + "learning_rate": 3.1168220677505265e-05, + "loss": 0.8796, + "step": 6820 + }, + { + "epoch": 0.33214033550020694, + "grad_norm": 1.4004005193710327, + "learning_rate": 3.116560378729883e-05, + "loss": 0.9299, + "step": 6821 + }, + { + "epoch": 0.33218902928931415, + "grad_norm": 1.9552849531173706, + "learning_rate": 3.116298661934698e-05, + "loss": 0.9042, + "step": 6822 + }, + { + "epoch": 0.33223772307842137, + "grad_norm": 1.4834091663360596, + "learning_rate": 3.1160369173714825e-05, + "loss": 0.819, + "step": 6823 + }, + { + "epoch": 0.3322864168675285, + "grad_norm": 1.6701711416244507, + "learning_rate": 3.115775145046747e-05, + "loss": 0.8673, + "step": 6824 + }, + { + "epoch": 0.33233511065663573, + "grad_norm": 2.168562412261963, + "learning_rate": 3.1155133449670025e-05, + "loss": 0.8564, + "step": 6825 + }, + { + "epoch": 0.33238380444574295, + "grad_norm": 1.9017735719680786, + "learning_rate": 3.115251517138763e-05, + "loss": 0.9219, + "step": 6826 + }, + { + "epoch": 0.33243249823485016, + "grad_norm": 2.2986607551574707, + "learning_rate": 3.114989661568539e-05, + "loss": 0.8317, + "step": 6827 + }, + { + "epoch": 0.3324811920239573, + "grad_norm": 1.7426207065582275, + "learning_rate": 3.114727778262846e-05, + "loss": 0.8647, + "step": 6828 + }, + { + "epoch": 0.33252988581306453, + "grad_norm": 0.08522705733776093, + "learning_rate": 3.114465867228198e-05, + "loss": 0.5234, + "step": 6829 + }, + { + "epoch": 0.33257857960217174, + "grad_norm": 2.1682655811309814, + "learning_rate": 3.114203928471109e-05, + "loss": 0.826, + "step": 6830 + }, + { + "epoch": 0.33262727339127895, + "grad_norm": 1.3546864986419678, + "learning_rate": 3.113941961998097e-05, + "loss": 0.8656, + "step": 6831 + }, + { + "epoch": 0.33267596718038617, + "grad_norm": 1.3325904607772827, + "learning_rate": 3.1136799678156763e-05, + "loss": 0.836, + "step": 6832 + }, + { + "epoch": 0.3327246609694933, + "grad_norm": 1.8872172832489014, + "learning_rate": 3.1134179459303656e-05, + "loss": 0.7873, + "step": 6833 + }, + { + "epoch": 0.33277335475860054, + "grad_norm": 1.508305549621582, + "learning_rate": 3.1131558963486815e-05, + "loss": 0.9989, + "step": 6834 + }, + { + "epoch": 0.33282204854770775, + "grad_norm": 4.47947359085083, + "learning_rate": 3.112893819077143e-05, + "loss": 0.8439, + "step": 6835 + }, + { + "epoch": 0.33287074233681496, + "grad_norm": 1.730529546737671, + "learning_rate": 3.112631714122269e-05, + "loss": 0.7392, + "step": 6836 + }, + { + "epoch": 0.3329194361259221, + "grad_norm": 1.4978193044662476, + "learning_rate": 3.11236958149058e-05, + "loss": 0.8456, + "step": 6837 + }, + { + "epoch": 0.33296812991502933, + "grad_norm": 1.5342159271240234, + "learning_rate": 3.112107421188596e-05, + "loss": 0.911, + "step": 6838 + }, + { + "epoch": 0.33301682370413654, + "grad_norm": 1.3968839645385742, + "learning_rate": 3.111845233222839e-05, + "loss": 0.8622, + "step": 6839 + }, + { + "epoch": 0.33306551749324376, + "grad_norm": 1.7484644651412964, + "learning_rate": 3.11158301759983e-05, + "loss": 0.8645, + "step": 6840 + }, + { + "epoch": 0.3331142112823509, + "grad_norm": 1.4446078538894653, + "learning_rate": 3.1113207743260926e-05, + "loss": 0.8908, + "step": 6841 + }, + { + "epoch": 0.3331629050714581, + "grad_norm": 1.9443520307540894, + "learning_rate": 3.111058503408149e-05, + "loss": 0.8873, + "step": 6842 + }, + { + "epoch": 0.33321159886056534, + "grad_norm": 1.8281856775283813, + "learning_rate": 3.110796204852524e-05, + "loss": 0.7906, + "step": 6843 + }, + { + "epoch": 0.33326029264967255, + "grad_norm": 1.749860167503357, + "learning_rate": 3.110533878665743e-05, + "loss": 0.7632, + "step": 6844 + }, + { + "epoch": 0.3333089864387797, + "grad_norm": 0.09043965488672256, + "learning_rate": 3.1102715248543296e-05, + "loss": 0.6634, + "step": 6845 + }, + { + "epoch": 0.3333576802278869, + "grad_norm": 8.898030281066895, + "learning_rate": 3.110009143424811e-05, + "loss": 0.9161, + "step": 6846 + }, + { + "epoch": 0.33340637401699413, + "grad_norm": 1.4067844152450562, + "learning_rate": 3.109746734383713e-05, + "loss": 0.8255, + "step": 6847 + }, + { + "epoch": 0.33345506780610135, + "grad_norm": 1.4183754920959473, + "learning_rate": 3.1094842977375645e-05, + "loss": 0.9009, + "step": 6848 + }, + { + "epoch": 0.33350376159520856, + "grad_norm": 1.941115140914917, + "learning_rate": 3.109221833492893e-05, + "loss": 0.9064, + "step": 6849 + }, + { + "epoch": 0.3335524553843157, + "grad_norm": 1.5277769565582275, + "learning_rate": 3.1089593416562274e-05, + "loss": 0.8414, + "step": 6850 + }, + { + "epoch": 0.33360114917342293, + "grad_norm": 1.2949583530426025, + "learning_rate": 3.1086968222340966e-05, + "loss": 0.9391, + "step": 6851 + }, + { + "epoch": 0.33364984296253014, + "grad_norm": 1.6172374486923218, + "learning_rate": 3.108434275233032e-05, + "loss": 0.9147, + "step": 6852 + }, + { + "epoch": 0.33369853675163735, + "grad_norm": 0.08948086202144623, + "learning_rate": 3.108171700659563e-05, + "loss": 0.5835, + "step": 6853 + }, + { + "epoch": 0.3337472305407445, + "grad_norm": 1.827307105064392, + "learning_rate": 3.107909098520222e-05, + "loss": 0.8145, + "step": 6854 + }, + { + "epoch": 0.3337959243298517, + "grad_norm": 0.09010087698698044, + "learning_rate": 3.1076464688215416e-05, + "loss": 0.572, + "step": 6855 + }, + { + "epoch": 0.33384461811895894, + "grad_norm": 1.5600299835205078, + "learning_rate": 3.107383811570054e-05, + "loss": 0.9035, + "step": 6856 + }, + { + "epoch": 0.33389331190806615, + "grad_norm": 3.156623601913452, + "learning_rate": 3.107121126772293e-05, + "loss": 0.8025, + "step": 6857 + }, + { + "epoch": 0.3339420056971733, + "grad_norm": 1.5759202241897583, + "learning_rate": 3.106858414434793e-05, + "loss": 0.8057, + "step": 6858 + }, + { + "epoch": 0.3339906994862805, + "grad_norm": 1.73370361328125, + "learning_rate": 3.106595674564089e-05, + "loss": 0.8053, + "step": 6859 + }, + { + "epoch": 0.33403939327538773, + "grad_norm": 1.6052863597869873, + "learning_rate": 3.106332907166717e-05, + "loss": 0.8638, + "step": 6860 + }, + { + "epoch": 0.33408808706449494, + "grad_norm": 1.5000522136688232, + "learning_rate": 3.1060701122492126e-05, + "loss": 0.8987, + "step": 6861 + }, + { + "epoch": 0.3341367808536021, + "grad_norm": 2.224658489227295, + "learning_rate": 3.1058072898181144e-05, + "loss": 0.9464, + "step": 6862 + }, + { + "epoch": 0.3341854746427093, + "grad_norm": 1.3151888847351074, + "learning_rate": 3.105544439879958e-05, + "loss": 0.8422, + "step": 6863 + }, + { + "epoch": 0.3342341684318165, + "grad_norm": 1.7946662902832031, + "learning_rate": 3.105281562441283e-05, + "loss": 0.7711, + "step": 6864 + }, + { + "epoch": 0.33428286222092374, + "grad_norm": 1.5239486694335938, + "learning_rate": 3.105018657508628e-05, + "loss": 0.8533, + "step": 6865 + }, + { + "epoch": 0.3343315560100309, + "grad_norm": 2.4997165203094482, + "learning_rate": 3.1047557250885345e-05, + "loss": 0.819, + "step": 6866 + }, + { + "epoch": 0.3343802497991381, + "grad_norm": 1.4740145206451416, + "learning_rate": 3.10449276518754e-05, + "loss": 0.8832, + "step": 6867 + }, + { + "epoch": 0.3344289435882453, + "grad_norm": 1.4272856712341309, + "learning_rate": 3.104229777812188e-05, + "loss": 0.8675, + "step": 6868 + }, + { + "epoch": 0.33447763737735253, + "grad_norm": 1.2791657447814941, + "learning_rate": 3.10396676296902e-05, + "loss": 0.8191, + "step": 6869 + }, + { + "epoch": 0.33452633116645974, + "grad_norm": 1.8567856550216675, + "learning_rate": 3.103703720664577e-05, + "loss": 0.8207, + "step": 6870 + }, + { + "epoch": 0.3345750249555669, + "grad_norm": 1.6595195531845093, + "learning_rate": 3.103440650905405e-05, + "loss": 0.9008, + "step": 6871 + }, + { + "epoch": 0.3346237187446741, + "grad_norm": 2.3482632637023926, + "learning_rate": 3.103177553698045e-05, + "loss": 1.0139, + "step": 6872 + }, + { + "epoch": 0.3346724125337813, + "grad_norm": 1.472521185874939, + "learning_rate": 3.102914429049043e-05, + "loss": 0.9316, + "step": 6873 + }, + { + "epoch": 0.33472110632288854, + "grad_norm": 1.5945574045181274, + "learning_rate": 3.1026512769649445e-05, + "loss": 0.867, + "step": 6874 + }, + { + "epoch": 0.3347698001119957, + "grad_norm": 1.056992530822754, + "learning_rate": 3.102388097452294e-05, + "loss": 0.9114, + "step": 6875 + }, + { + "epoch": 0.3348184939011029, + "grad_norm": 1.4547191858291626, + "learning_rate": 3.102124890517639e-05, + "loss": 0.8966, + "step": 6876 + }, + { + "epoch": 0.3348671876902101, + "grad_norm": 2.4209816455841064, + "learning_rate": 3.1018616561675284e-05, + "loss": 0.7888, + "step": 6877 + }, + { + "epoch": 0.33491588147931733, + "grad_norm": 2.310779094696045, + "learning_rate": 3.101598394408507e-05, + "loss": 0.8495, + "step": 6878 + }, + { + "epoch": 0.3349645752684245, + "grad_norm": 2.0257186889648438, + "learning_rate": 3.1013351052471255e-05, + "loss": 0.7781, + "step": 6879 + }, + { + "epoch": 0.3350132690575317, + "grad_norm": 1.6523996591567993, + "learning_rate": 3.101071788689933e-05, + "loss": 0.8966, + "step": 6880 + }, + { + "epoch": 0.3350619628466389, + "grad_norm": 4.7568769454956055, + "learning_rate": 3.10080844474348e-05, + "loss": 0.9094, + "step": 6881 + }, + { + "epoch": 0.33511065663574613, + "grad_norm": 2.409848690032959, + "learning_rate": 3.1005450734143164e-05, + "loss": 0.8564, + "step": 6882 + }, + { + "epoch": 0.3351593504248533, + "grad_norm": 1.6324728727340698, + "learning_rate": 3.100281674708993e-05, + "loss": 0.7895, + "step": 6883 + }, + { + "epoch": 0.3352080442139605, + "grad_norm": 1.539298176765442, + "learning_rate": 3.1000182486340635e-05, + "loss": 0.8341, + "step": 6884 + }, + { + "epoch": 0.3352567380030677, + "grad_norm": 1.616054892539978, + "learning_rate": 3.099754795196079e-05, + "loss": 0.9414, + "step": 6885 + }, + { + "epoch": 0.3353054317921749, + "grad_norm": 2.7237346172332764, + "learning_rate": 3.0994913144015936e-05, + "loss": 0.9609, + "step": 6886 + }, + { + "epoch": 0.3353541255812821, + "grad_norm": 2.0691397190093994, + "learning_rate": 3.0992278062571616e-05, + "loss": 0.8622, + "step": 6887 + }, + { + "epoch": 0.3354028193703893, + "grad_norm": 2.2101640701293945, + "learning_rate": 3.0989642707693385e-05, + "loss": 0.934, + "step": 6888 + }, + { + "epoch": 0.3354515131594965, + "grad_norm": 1.458616852760315, + "learning_rate": 3.0987007079446784e-05, + "loss": 0.8844, + "step": 6889 + }, + { + "epoch": 0.3355002069486037, + "grad_norm": 1.296049952507019, + "learning_rate": 3.098437117789738e-05, + "loss": 0.9571, + "step": 6890 + }, + { + "epoch": 0.33554890073771093, + "grad_norm": 2.2398595809936523, + "learning_rate": 3.0981735003110736e-05, + "loss": 0.8728, + "step": 6891 + }, + { + "epoch": 0.3355975945268181, + "grad_norm": 1.4219951629638672, + "learning_rate": 3.0979098555152445e-05, + "loss": 0.8247, + "step": 6892 + }, + { + "epoch": 0.3356462883159253, + "grad_norm": 2.436232566833496, + "learning_rate": 3.0976461834088065e-05, + "loss": 0.8134, + "step": 6893 + }, + { + "epoch": 0.3356949821050325, + "grad_norm": 1.6119369268417358, + "learning_rate": 3.0973824839983204e-05, + "loss": 0.8802, + "step": 6894 + }, + { + "epoch": 0.3357436758941397, + "grad_norm": 1.5611389875411987, + "learning_rate": 3.097118757290344e-05, + "loss": 0.8292, + "step": 6895 + }, + { + "epoch": 0.3357923696832469, + "grad_norm": 2.118506669998169, + "learning_rate": 3.09685500329144e-05, + "loss": 0.7614, + "step": 6896 + }, + { + "epoch": 0.3358410634723541, + "grad_norm": 1.5393821001052856, + "learning_rate": 3.096591222008167e-05, + "loss": 0.8812, + "step": 6897 + }, + { + "epoch": 0.3358897572614613, + "grad_norm": 3.549177885055542, + "learning_rate": 3.096327413447087e-05, + "loss": 0.9225, + "step": 6898 + }, + { + "epoch": 0.3359384510505685, + "grad_norm": 1.6369855403900146, + "learning_rate": 3.0960635776147625e-05, + "loss": 0.8174, + "step": 6899 + }, + { + "epoch": 0.3359871448396757, + "grad_norm": 1.9003030061721802, + "learning_rate": 3.0957997145177574e-05, + "loss": 0.8943, + "step": 6900 + }, + { + "epoch": 0.3360358386287829, + "grad_norm": 1.5659024715423584, + "learning_rate": 3.0955358241626337e-05, + "loss": 0.8605, + "step": 6901 + }, + { + "epoch": 0.3360845324178901, + "grad_norm": 1.5784778594970703, + "learning_rate": 3.095271906555957e-05, + "loss": 0.8929, + "step": 6902 + }, + { + "epoch": 0.3361332262069973, + "grad_norm": 1.595982313156128, + "learning_rate": 3.095007961704292e-05, + "loss": 0.8181, + "step": 6903 + }, + { + "epoch": 0.3361819199961045, + "grad_norm": 3.5126748085021973, + "learning_rate": 3.0947439896142035e-05, + "loss": 0.8747, + "step": 6904 + }, + { + "epoch": 0.3362306137852117, + "grad_norm": 2.176659107208252, + "learning_rate": 3.0944799902922586e-05, + "loss": 0.8068, + "step": 6905 + }, + { + "epoch": 0.3362793075743189, + "grad_norm": 1.859199047088623, + "learning_rate": 3.094215963745024e-05, + "loss": 0.7935, + "step": 6906 + }, + { + "epoch": 0.3363280013634261, + "grad_norm": 1.8976303339004517, + "learning_rate": 3.093951909979068e-05, + "loss": 0.8468, + "step": 6907 + }, + { + "epoch": 0.33637669515253327, + "grad_norm": 1.3692915439605713, + "learning_rate": 3.093687829000958e-05, + "loss": 0.8433, + "step": 6908 + }, + { + "epoch": 0.3364253889416405, + "grad_norm": 2.602722644805908, + "learning_rate": 3.093423720817264e-05, + "loss": 0.8569, + "step": 6909 + }, + { + "epoch": 0.3364740827307477, + "grad_norm": 2.3788890838623047, + "learning_rate": 3.093159585434555e-05, + "loss": 0.9062, + "step": 6910 + }, + { + "epoch": 0.3365227765198549, + "grad_norm": 2.297074794769287, + "learning_rate": 3.092895422859402e-05, + "loss": 0.9187, + "step": 6911 + }, + { + "epoch": 0.3365714703089621, + "grad_norm": 1.4334434270858765, + "learning_rate": 3.092631233098376e-05, + "loss": 0.8351, + "step": 6912 + }, + { + "epoch": 0.3366201640980693, + "grad_norm": 1.9076513051986694, + "learning_rate": 3.092367016158047e-05, + "loss": 0.8973, + "step": 6913 + }, + { + "epoch": 0.3366688578871765, + "grad_norm": 1.5142267942428589, + "learning_rate": 3.09210277204499e-05, + "loss": 0.9022, + "step": 6914 + }, + { + "epoch": 0.3367175516762837, + "grad_norm": 1.481038212776184, + "learning_rate": 3.0918385007657765e-05, + "loss": 0.9424, + "step": 6915 + }, + { + "epoch": 0.3367662454653909, + "grad_norm": 1.6103131771087646, + "learning_rate": 3.0915742023269805e-05, + "loss": 0.8475, + "step": 6916 + }, + { + "epoch": 0.33681493925449807, + "grad_norm": 1.5756934881210327, + "learning_rate": 3.0913098767351774e-05, + "loss": 0.882, + "step": 6917 + }, + { + "epoch": 0.3368636330436053, + "grad_norm": 1.4963719844818115, + "learning_rate": 3.091045523996941e-05, + "loss": 0.9635, + "step": 6918 + }, + { + "epoch": 0.3369123268327125, + "grad_norm": 0.09151280671358109, + "learning_rate": 3.090781144118849e-05, + "loss": 0.6431, + "step": 6919 + }, + { + "epoch": 0.3369610206218197, + "grad_norm": 1.7768219709396362, + "learning_rate": 3.0905167371074746e-05, + "loss": 0.796, + "step": 6920 + }, + { + "epoch": 0.33700971441092686, + "grad_norm": 3.58695650100708, + "learning_rate": 3.090252302969399e-05, + "loss": 0.82, + "step": 6921 + }, + { + "epoch": 0.3370584082000341, + "grad_norm": 0.08872755616903305, + "learning_rate": 3.089987841711197e-05, + "loss": 0.6342, + "step": 6922 + }, + { + "epoch": 0.3371071019891413, + "grad_norm": 1.558573603630066, + "learning_rate": 3.089723353339447e-05, + "loss": 0.8268, + "step": 6923 + }, + { + "epoch": 0.3371557957782485, + "grad_norm": 2.399371862411499, + "learning_rate": 3.089458837860731e-05, + "loss": 0.8372, + "step": 6924 + }, + { + "epoch": 0.33720448956735566, + "grad_norm": 1.5282074213027954, + "learning_rate": 3.089194295281626e-05, + "loss": 0.924, + "step": 6925 + }, + { + "epoch": 0.33725318335646287, + "grad_norm": 2.7636892795562744, + "learning_rate": 3.088929725608714e-05, + "loss": 0.7821, + "step": 6926 + }, + { + "epoch": 0.3373018771455701, + "grad_norm": 1.6688164472579956, + "learning_rate": 3.088665128848576e-05, + "loss": 0.8256, + "step": 6927 + }, + { + "epoch": 0.3373505709346773, + "grad_norm": 1.2765789031982422, + "learning_rate": 3.0884005050077934e-05, + "loss": 0.8514, + "step": 6928 + }, + { + "epoch": 0.3373992647237845, + "grad_norm": 1.3100336790084839, + "learning_rate": 3.088135854092949e-05, + "loss": 0.7902, + "step": 6929 + }, + { + "epoch": 0.33744795851289167, + "grad_norm": 2.4536406993865967, + "learning_rate": 3.0878711761106255e-05, + "loss": 0.8985, + "step": 6930 + }, + { + "epoch": 0.3374966523019989, + "grad_norm": 1.4623838663101196, + "learning_rate": 3.0876064710674076e-05, + "loss": 0.7802, + "step": 6931 + }, + { + "epoch": 0.3375453460911061, + "grad_norm": 0.0900527834892273, + "learning_rate": 3.08734173896988e-05, + "loss": 0.665, + "step": 6932 + }, + { + "epoch": 0.3375940398802133, + "grad_norm": 5.38840913772583, + "learning_rate": 3.087076979824627e-05, + "loss": 0.8947, + "step": 6933 + }, + { + "epoch": 0.33764273366932046, + "grad_norm": 1.5338704586029053, + "learning_rate": 3.0868121936382354e-05, + "loss": 0.8263, + "step": 6934 + }, + { + "epoch": 0.3376914274584277, + "grad_norm": 1.6171462535858154, + "learning_rate": 3.0865473804172915e-05, + "loss": 0.9053, + "step": 6935 + }, + { + "epoch": 0.3377401212475349, + "grad_norm": 2.274195909500122, + "learning_rate": 3.086282540168382e-05, + "loss": 0.8199, + "step": 6936 + }, + { + "epoch": 0.3377888150366421, + "grad_norm": 1.5581668615341187, + "learning_rate": 3.086017672898095e-05, + "loss": 0.9132, + "step": 6937 + }, + { + "epoch": 0.33783750882574926, + "grad_norm": 0.08418422937393188, + "learning_rate": 3.08575277861302e-05, + "loss": 0.6536, + "step": 6938 + }, + { + "epoch": 0.33788620261485647, + "grad_norm": 1.7230662107467651, + "learning_rate": 3.0854878573197456e-05, + "loss": 0.8028, + "step": 6939 + }, + { + "epoch": 0.3379348964039637, + "grad_norm": 1.9315427541732788, + "learning_rate": 3.085222909024862e-05, + "loss": 0.8336, + "step": 6940 + }, + { + "epoch": 0.3379835901930709, + "grad_norm": 1.6188241243362427, + "learning_rate": 3.0849579337349593e-05, + "loss": 0.7917, + "step": 6941 + }, + { + "epoch": 0.33803228398217805, + "grad_norm": 1.2649489641189575, + "learning_rate": 3.084692931456629e-05, + "loss": 0.8575, + "step": 6942 + }, + { + "epoch": 0.33808097777128526, + "grad_norm": 2.353517532348633, + "learning_rate": 3.084427902196463e-05, + "loss": 0.795, + "step": 6943 + }, + { + "epoch": 0.3381296715603925, + "grad_norm": 1.6518913507461548, + "learning_rate": 3.084162845961054e-05, + "loss": 0.8675, + "step": 6944 + }, + { + "epoch": 0.3381783653494997, + "grad_norm": 1.899348497390747, + "learning_rate": 3.083897762756995e-05, + "loss": 0.8577, + "step": 6945 + }, + { + "epoch": 0.33822705913860684, + "grad_norm": 1.272692084312439, + "learning_rate": 3.083632652590881e-05, + "loss": 0.7694, + "step": 6946 + }, + { + "epoch": 0.33827575292771406, + "grad_norm": 0.08945668488740921, + "learning_rate": 3.083367515469306e-05, + "loss": 0.6012, + "step": 6947 + }, + { + "epoch": 0.33832444671682127, + "grad_norm": 1.3274586200714111, + "learning_rate": 3.083102351398865e-05, + "loss": 0.9499, + "step": 6948 + }, + { + "epoch": 0.3383731405059285, + "grad_norm": 1.3884226083755493, + "learning_rate": 3.082837160386155e-05, + "loss": 0.8255, + "step": 6949 + }, + { + "epoch": 0.3384218342950357, + "grad_norm": 1.2939399480819702, + "learning_rate": 3.0825719424377714e-05, + "loss": 0.8196, + "step": 6950 + }, + { + "epoch": 0.33847052808414285, + "grad_norm": 2.1315817832946777, + "learning_rate": 3.0823066975603116e-05, + "loss": 0.8985, + "step": 6951 + }, + { + "epoch": 0.33851922187325006, + "grad_norm": 1.8183382749557495, + "learning_rate": 3.082041425760374e-05, + "loss": 0.8774, + "step": 6952 + }, + { + "epoch": 0.3385679156623573, + "grad_norm": 0.09040579944849014, + "learning_rate": 3.081776127044558e-05, + "loss": 0.6154, + "step": 6953 + }, + { + "epoch": 0.3386166094514645, + "grad_norm": 1.5236403942108154, + "learning_rate": 3.0815108014194616e-05, + "loss": 0.8809, + "step": 6954 + }, + { + "epoch": 0.33866530324057165, + "grad_norm": 1.1873621940612793, + "learning_rate": 3.081245448891686e-05, + "loss": 0.8382, + "step": 6955 + }, + { + "epoch": 0.33871399702967886, + "grad_norm": 1.37347412109375, + "learning_rate": 3.08098006946783e-05, + "loss": 0.7762, + "step": 6956 + }, + { + "epoch": 0.33876269081878607, + "grad_norm": 1.329093337059021, + "learning_rate": 3.080714663154497e-05, + "loss": 0.8933, + "step": 6957 + }, + { + "epoch": 0.3388113846078933, + "grad_norm": 1.2621709108352661, + "learning_rate": 3.0804492299582886e-05, + "loss": 0.8989, + "step": 6958 + }, + { + "epoch": 0.33886007839700044, + "grad_norm": 1.5571553707122803, + "learning_rate": 3.0801837698858074e-05, + "loss": 0.8877, + "step": 6959 + }, + { + "epoch": 0.33890877218610765, + "grad_norm": 1.583874225616455, + "learning_rate": 3.0799182829436555e-05, + "loss": 0.8998, + "step": 6960 + }, + { + "epoch": 0.33895746597521487, + "grad_norm": 1.7714096307754517, + "learning_rate": 3.0796527691384385e-05, + "loss": 0.9618, + "step": 6961 + }, + { + "epoch": 0.3390061597643221, + "grad_norm": 1.5102837085723877, + "learning_rate": 3.07938722847676e-05, + "loss": 0.7465, + "step": 6962 + }, + { + "epoch": 0.33905485355342924, + "grad_norm": 1.0372997522354126, + "learning_rate": 3.0791216609652257e-05, + "loss": 0.838, + "step": 6963 + }, + { + "epoch": 0.33910354734253645, + "grad_norm": 1.356805682182312, + "learning_rate": 3.078856066610442e-05, + "loss": 0.8962, + "step": 6964 + }, + { + "epoch": 0.33915224113164366, + "grad_norm": 1.3844980001449585, + "learning_rate": 3.078590445419015e-05, + "loss": 0.8572, + "step": 6965 + }, + { + "epoch": 0.3392009349207509, + "grad_norm": 1.1271659135818481, + "learning_rate": 3.078324797397553e-05, + "loss": 0.982, + "step": 6966 + }, + { + "epoch": 0.33924962870985803, + "grad_norm": 1.8883213996887207, + "learning_rate": 3.078059122552663e-05, + "loss": 0.8383, + "step": 6967 + }, + { + "epoch": 0.33929832249896524, + "grad_norm": 1.9489392042160034, + "learning_rate": 3.077793420890953e-05, + "loss": 0.8859, + "step": 6968 + }, + { + "epoch": 0.33934701628807246, + "grad_norm": 1.56117844581604, + "learning_rate": 3.077527692419035e-05, + "loss": 0.8658, + "step": 6969 + }, + { + "epoch": 0.33939571007717967, + "grad_norm": 1.6371394395828247, + "learning_rate": 3.0772619371435165e-05, + "loss": 0.9188, + "step": 6970 + }, + { + "epoch": 0.3394444038662869, + "grad_norm": 1.0949656963348389, + "learning_rate": 3.076996155071009e-05, + "loss": 0.8872, + "step": 6971 + }, + { + "epoch": 0.33949309765539404, + "grad_norm": 1.9340147972106934, + "learning_rate": 3.076730346208125e-05, + "loss": 0.8958, + "step": 6972 + }, + { + "epoch": 0.33954179144450125, + "grad_norm": 1.41252601146698, + "learning_rate": 3.0764645105614746e-05, + "loss": 0.8086, + "step": 6973 + }, + { + "epoch": 0.33959048523360846, + "grad_norm": 1.6253461837768555, + "learning_rate": 3.076198648137671e-05, + "loss": 0.7902, + "step": 6974 + }, + { + "epoch": 0.3396391790227157, + "grad_norm": 1.8825490474700928, + "learning_rate": 3.075932758943328e-05, + "loss": 0.7414, + "step": 6975 + }, + { + "epoch": 0.33968787281182283, + "grad_norm": 1.2773972749710083, + "learning_rate": 3.0756668429850595e-05, + "loss": 0.9167, + "step": 6976 + }, + { + "epoch": 0.33973656660093005, + "grad_norm": 0.09012977033853531, + "learning_rate": 3.0754009002694814e-05, + "loss": 0.5687, + "step": 6977 + }, + { + "epoch": 0.33978526039003726, + "grad_norm": 1.3203984498977661, + "learning_rate": 3.075134930803207e-05, + "loss": 0.8272, + "step": 6978 + }, + { + "epoch": 0.33983395417914447, + "grad_norm": 3.7880001068115234, + "learning_rate": 3.074868934592853e-05, + "loss": 0.8487, + "step": 6979 + }, + { + "epoch": 0.3398826479682516, + "grad_norm": 1.9600887298583984, + "learning_rate": 3.074602911645037e-05, + "loss": 0.9327, + "step": 6980 + }, + { + "epoch": 0.33993134175735884, + "grad_norm": 1.9296364784240723, + "learning_rate": 3.0743368619663744e-05, + "loss": 0.8544, + "step": 6981 + }, + { + "epoch": 0.33998003554646605, + "grad_norm": 2.4349372386932373, + "learning_rate": 3.074070785563485e-05, + "loss": 0.884, + "step": 6982 + }, + { + "epoch": 0.34002872933557327, + "grad_norm": 1.5869287252426147, + "learning_rate": 3.073804682442987e-05, + "loss": 0.8493, + "step": 6983 + }, + { + "epoch": 0.3400774231246804, + "grad_norm": 2.0150742530822754, + "learning_rate": 3.0735385526114996e-05, + "loss": 0.8929, + "step": 6984 + }, + { + "epoch": 0.34012611691378764, + "grad_norm": 2.13663911819458, + "learning_rate": 3.0732723960756425e-05, + "loss": 0.8687, + "step": 6985 + }, + { + "epoch": 0.34017481070289485, + "grad_norm": 1.417819857597351, + "learning_rate": 3.073006212842037e-05, + "loss": 0.8597, + "step": 6986 + }, + { + "epoch": 0.34022350449200206, + "grad_norm": 1.1505210399627686, + "learning_rate": 3.072740002917305e-05, + "loss": 0.7819, + "step": 6987 + }, + { + "epoch": 0.3402721982811092, + "grad_norm": 2.0177266597747803, + "learning_rate": 3.0724737663080656e-05, + "loss": 0.8844, + "step": 6988 + }, + { + "epoch": 0.34032089207021643, + "grad_norm": 1.4561972618103027, + "learning_rate": 3.072207503020945e-05, + "loss": 0.781, + "step": 6989 + }, + { + "epoch": 0.34036958585932364, + "grad_norm": 1.489884853363037, + "learning_rate": 3.071941213062564e-05, + "loss": 0.8075, + "step": 6990 + }, + { + "epoch": 0.34041827964843085, + "grad_norm": 1.850589632987976, + "learning_rate": 3.071674896439548e-05, + "loss": 0.9185, + "step": 6991 + }, + { + "epoch": 0.34046697343753807, + "grad_norm": 0.08746518939733505, + "learning_rate": 3.0714085531585216e-05, + "loss": 0.6072, + "step": 6992 + }, + { + "epoch": 0.3405156672266452, + "grad_norm": 1.4793909788131714, + "learning_rate": 3.07114218322611e-05, + "loss": 0.8187, + "step": 6993 + }, + { + "epoch": 0.34056436101575244, + "grad_norm": 1.6952747106552124, + "learning_rate": 3.0708757866489384e-05, + "loss": 0.8152, + "step": 6994 + }, + { + "epoch": 0.34061305480485965, + "grad_norm": 3.2022013664245605, + "learning_rate": 3.0706093634336335e-05, + "loss": 0.845, + "step": 6995 + }, + { + "epoch": 0.34066174859396686, + "grad_norm": 1.8594558238983154, + "learning_rate": 3.0703429135868246e-05, + "loss": 0.849, + "step": 6996 + }, + { + "epoch": 0.340710442383074, + "grad_norm": 1.3238532543182373, + "learning_rate": 3.070076437115136e-05, + "loss": 0.9123, + "step": 6997 + }, + { + "epoch": 0.34075913617218123, + "grad_norm": 2.6438443660736084, + "learning_rate": 3.0698099340252004e-05, + "loss": 0.8615, + "step": 6998 + }, + { + "epoch": 0.34080782996128844, + "grad_norm": 1.407429575920105, + "learning_rate": 3.069543404323645e-05, + "loss": 0.9118, + "step": 6999 + }, + { + "epoch": 0.34085652375039566, + "grad_norm": 2.3815510272979736, + "learning_rate": 3.069276848017099e-05, + "loss": 0.8129, + "step": 7000 + }, + { + "epoch": 0.3409052175395028, + "grad_norm": 2.4511826038360596, + "learning_rate": 3.069010265112194e-05, + "loss": 0.8409, + "step": 7001 + }, + { + "epoch": 0.34095391132861, + "grad_norm": 3.75469708442688, + "learning_rate": 3.068743655615562e-05, + "loss": 0.8943, + "step": 7002 + }, + { + "epoch": 0.34100260511771724, + "grad_norm": 1.4868111610412598, + "learning_rate": 3.068477019533833e-05, + "loss": 0.7482, + "step": 7003 + }, + { + "epoch": 0.34105129890682445, + "grad_norm": 2.6245391368865967, + "learning_rate": 3.068210356873641e-05, + "loss": 0.7682, + "step": 7004 + }, + { + "epoch": 0.3410999926959316, + "grad_norm": 1.3377870321273804, + "learning_rate": 3.0679436676416186e-05, + "loss": 0.8562, + "step": 7005 + }, + { + "epoch": 0.3411486864850388, + "grad_norm": 1.4680461883544922, + "learning_rate": 3.067676951844401e-05, + "loss": 0.8265, + "step": 7006 + }, + { + "epoch": 0.34119738027414603, + "grad_norm": 1.3944449424743652, + "learning_rate": 3.0674102094886213e-05, + "loss": 0.743, + "step": 7007 + }, + { + "epoch": 0.34124607406325325, + "grad_norm": 1.533453106880188, + "learning_rate": 3.067143440580916e-05, + "loss": 0.8289, + "step": 7008 + }, + { + "epoch": 0.34129476785236046, + "grad_norm": 1.4609506130218506, + "learning_rate": 3.06687664512792e-05, + "loss": 0.7839, + "step": 7009 + }, + { + "epoch": 0.3413434616414676, + "grad_norm": 1.41975998878479, + "learning_rate": 3.06660982313627e-05, + "loss": 0.8215, + "step": 7010 + }, + { + "epoch": 0.34139215543057483, + "grad_norm": 1.6115074157714844, + "learning_rate": 3.066342974612603e-05, + "loss": 0.8459, + "step": 7011 + }, + { + "epoch": 0.34144084921968204, + "grad_norm": 2.189192056655884, + "learning_rate": 3.066076099563558e-05, + "loss": 0.7795, + "step": 7012 + }, + { + "epoch": 0.34148954300878925, + "grad_norm": 1.5204375982284546, + "learning_rate": 3.0658091979957725e-05, + "loss": 0.8828, + "step": 7013 + }, + { + "epoch": 0.3415382367978964, + "grad_norm": 1.4750176668167114, + "learning_rate": 3.065542269915886e-05, + "loss": 0.8994, + "step": 7014 + }, + { + "epoch": 0.3415869305870036, + "grad_norm": 2.355104923248291, + "learning_rate": 3.0652753153305385e-05, + "loss": 0.8592, + "step": 7015 + }, + { + "epoch": 0.34163562437611084, + "grad_norm": 1.8296301364898682, + "learning_rate": 3.065008334246371e-05, + "loss": 0.874, + "step": 7016 + }, + { + "epoch": 0.34168431816521805, + "grad_norm": 2.1304733753204346, + "learning_rate": 3.064741326670023e-05, + "loss": 0.718, + "step": 7017 + }, + { + "epoch": 0.3417330119543252, + "grad_norm": 4.0656023025512695, + "learning_rate": 3.064474292608138e-05, + "loss": 0.907, + "step": 7018 + }, + { + "epoch": 0.3417817057434324, + "grad_norm": 1.3348016738891602, + "learning_rate": 3.0642072320673575e-05, + "loss": 0.8346, + "step": 7019 + }, + { + "epoch": 0.34183039953253963, + "grad_norm": 1.525451421737671, + "learning_rate": 3.0639401450543255e-05, + "loss": 0.8483, + "step": 7020 + }, + { + "epoch": 0.34187909332164684, + "grad_norm": 1.71163809299469, + "learning_rate": 3.0636730315756854e-05, + "loss": 0.7519, + "step": 7021 + }, + { + "epoch": 0.341927787110754, + "grad_norm": 1.7053965330123901, + "learning_rate": 3.063405891638081e-05, + "loss": 0.9146, + "step": 7022 + }, + { + "epoch": 0.3419764808998612, + "grad_norm": 1.6221081018447876, + "learning_rate": 3.063138725248159e-05, + "loss": 0.8028, + "step": 7023 + }, + { + "epoch": 0.3420251746889684, + "grad_norm": 1.4984056949615479, + "learning_rate": 3.062871532412564e-05, + "loss": 0.8063, + "step": 7024 + }, + { + "epoch": 0.34207386847807564, + "grad_norm": 2.172104835510254, + "learning_rate": 3.062604313137943e-05, + "loss": 0.8776, + "step": 7025 + }, + { + "epoch": 0.3421225622671828, + "grad_norm": 1.883201003074646, + "learning_rate": 3.062337067430942e-05, + "loss": 0.8777, + "step": 7026 + }, + { + "epoch": 0.34217125605629, + "grad_norm": 1.2374963760375977, + "learning_rate": 3.06206979529821e-05, + "loss": 0.8376, + "step": 7027 + }, + { + "epoch": 0.3422199498453972, + "grad_norm": 1.3989028930664062, + "learning_rate": 3.0618024967463946e-05, + "loss": 0.8418, + "step": 7028 + }, + { + "epoch": 0.34226864363450443, + "grad_norm": 1.7278438806533813, + "learning_rate": 3.0615351717821464e-05, + "loss": 0.9334, + "step": 7029 + }, + { + "epoch": 0.34231733742361165, + "grad_norm": 0.0873318761587143, + "learning_rate": 3.061267820412113e-05, + "loss": 0.5478, + "step": 7030 + }, + { + "epoch": 0.3423660312127188, + "grad_norm": 1.6029413938522339, + "learning_rate": 3.0610004426429465e-05, + "loss": 0.8611, + "step": 7031 + }, + { + "epoch": 0.342414725001826, + "grad_norm": 0.09724070131778717, + "learning_rate": 3.0607330384812966e-05, + "loss": 0.6485, + "step": 7032 + }, + { + "epoch": 0.3424634187909332, + "grad_norm": 0.09118398278951645, + "learning_rate": 3.0604656079338156e-05, + "loss": 0.5742, + "step": 7033 + }, + { + "epoch": 0.34251211258004044, + "grad_norm": 1.3687939643859863, + "learning_rate": 3.060198151007156e-05, + "loss": 0.9159, + "step": 7034 + }, + { + "epoch": 0.3425608063691476, + "grad_norm": 1.2631956338882446, + "learning_rate": 3.059930667707972e-05, + "loss": 0.9233, + "step": 7035 + }, + { + "epoch": 0.3426095001582548, + "grad_norm": 1.3099164962768555, + "learning_rate": 3.059663158042915e-05, + "loss": 0.9244, + "step": 7036 + }, + { + "epoch": 0.342658193947362, + "grad_norm": 2.0907561779022217, + "learning_rate": 3.0593956220186396e-05, + "loss": 0.8318, + "step": 7037 + }, + { + "epoch": 0.34270688773646923, + "grad_norm": 1.6432911157608032, + "learning_rate": 3.059128059641802e-05, + "loss": 0.8472, + "step": 7038 + }, + { + "epoch": 0.3427555815255764, + "grad_norm": 2.0484344959259033, + "learning_rate": 3.058860470919058e-05, + "loss": 0.9145, + "step": 7039 + }, + { + "epoch": 0.3428042753146836, + "grad_norm": 1.2995927333831787, + "learning_rate": 3.058592855857063e-05, + "loss": 0.8065, + "step": 7040 + }, + { + "epoch": 0.3428529691037908, + "grad_norm": 1.3747788667678833, + "learning_rate": 3.058325214462474e-05, + "loss": 0.8425, + "step": 7041 + }, + { + "epoch": 0.34290166289289803, + "grad_norm": 6.431619644165039, + "learning_rate": 3.058057546741949e-05, + "loss": 0.9167, + "step": 7042 + }, + { + "epoch": 0.3429503566820052, + "grad_norm": 1.772801160812378, + "learning_rate": 3.057789852702145e-05, + "loss": 0.7987, + "step": 7043 + }, + { + "epoch": 0.3429990504711124, + "grad_norm": 0.08546256273984909, + "learning_rate": 3.057522132349723e-05, + "loss": 0.6366, + "step": 7044 + }, + { + "epoch": 0.3430477442602196, + "grad_norm": 2.241992473602295, + "learning_rate": 3.0572543856913414e-05, + "loss": 0.855, + "step": 7045 + }, + { + "epoch": 0.3430964380493268, + "grad_norm": 1.7548309564590454, + "learning_rate": 3.0569866127336605e-05, + "loss": 0.8403, + "step": 7046 + }, + { + "epoch": 0.343145131838434, + "grad_norm": 1.844980239868164, + "learning_rate": 3.0567188134833417e-05, + "loss": 0.9048, + "step": 7047 + }, + { + "epoch": 0.3431938256275412, + "grad_norm": 1.5218135118484497, + "learning_rate": 3.056450987947045e-05, + "loss": 0.8431, + "step": 7048 + }, + { + "epoch": 0.3432425194166484, + "grad_norm": 1.7101157903671265, + "learning_rate": 3.0561831361314346e-05, + "loss": 0.8246, + "step": 7049 + }, + { + "epoch": 0.3432912132057556, + "grad_norm": 1.7846934795379639, + "learning_rate": 3.055915258043172e-05, + "loss": 0.9416, + "step": 7050 + }, + { + "epoch": 0.34333990699486283, + "grad_norm": 0.08284781873226166, + "learning_rate": 3.055647353688921e-05, + "loss": 0.5475, + "step": 7051 + }, + { + "epoch": 0.34338860078397, + "grad_norm": 1.7989153861999512, + "learning_rate": 3.055379423075346e-05, + "loss": 0.8988, + "step": 7052 + }, + { + "epoch": 0.3434372945730772, + "grad_norm": 1.7800530195236206, + "learning_rate": 3.0551114662091114e-05, + "loss": 0.8581, + "step": 7053 + }, + { + "epoch": 0.3434859883621844, + "grad_norm": 2.1712419986724854, + "learning_rate": 3.054843483096883e-05, + "loss": 0.8637, + "step": 7054 + }, + { + "epoch": 0.3435346821512916, + "grad_norm": 1.614356279373169, + "learning_rate": 3.054575473745327e-05, + "loss": 0.7628, + "step": 7055 + }, + { + "epoch": 0.3435833759403988, + "grad_norm": 1.4981482028961182, + "learning_rate": 3.05430743816111e-05, + "loss": 0.921, + "step": 7056 + }, + { + "epoch": 0.343632069729506, + "grad_norm": 1.3418222665786743, + "learning_rate": 3.0540393763508984e-05, + "loss": 0.8634, + "step": 7057 + }, + { + "epoch": 0.3436807635186132, + "grad_norm": 0.0887012779712677, + "learning_rate": 3.053771288321363e-05, + "loss": 0.5931, + "step": 7058 + }, + { + "epoch": 0.3437294573077204, + "grad_norm": 2.1900923252105713, + "learning_rate": 3.0535031740791695e-05, + "loss": 0.7914, + "step": 7059 + }, + { + "epoch": 0.3437781510968276, + "grad_norm": 1.9508211612701416, + "learning_rate": 3.053235033630988e-05, + "loss": 0.8636, + "step": 7060 + }, + { + "epoch": 0.3438268448859348, + "grad_norm": 4.188014030456543, + "learning_rate": 3.05296686698349e-05, + "loss": 0.8625, + "step": 7061 + }, + { + "epoch": 0.343875538675042, + "grad_norm": 1.6727057695388794, + "learning_rate": 3.052698674143345e-05, + "loss": 0.8375, + "step": 7062 + }, + { + "epoch": 0.3439242324641492, + "grad_norm": 1.6749855279922485, + "learning_rate": 3.0524304551172246e-05, + "loss": 0.8609, + "step": 7063 + }, + { + "epoch": 0.3439729262532564, + "grad_norm": 2.003047466278076, + "learning_rate": 3.0521622099117996e-05, + "loss": 0.8308, + "step": 7064 + }, + { + "epoch": 0.3440216200423636, + "grad_norm": 1.5555996894836426, + "learning_rate": 3.051893938533745e-05, + "loss": 0.9126, + "step": 7065 + }, + { + "epoch": 0.3440703138314708, + "grad_norm": 1.5305455923080444, + "learning_rate": 3.0516256409897323e-05, + "loss": 0.831, + "step": 7066 + }, + { + "epoch": 0.344119007620578, + "grad_norm": 1.284664511680603, + "learning_rate": 3.051357317286436e-05, + "loss": 0.8954, + "step": 7067 + }, + { + "epoch": 0.34416770140968517, + "grad_norm": 1.576209306716919, + "learning_rate": 3.05108896743053e-05, + "loss": 0.8536, + "step": 7068 + }, + { + "epoch": 0.3442163951987924, + "grad_norm": 0.09388328343629837, + "learning_rate": 3.050820591428691e-05, + "loss": 0.6492, + "step": 7069 + }, + { + "epoch": 0.3442650889878996, + "grad_norm": 1.7073097229003906, + "learning_rate": 3.050552189287593e-05, + "loss": 0.8191, + "step": 7070 + }, + { + "epoch": 0.3443137827770068, + "grad_norm": 3.1165928840637207, + "learning_rate": 3.0502837610139143e-05, + "loss": 0.9113, + "step": 7071 + }, + { + "epoch": 0.344362476566114, + "grad_norm": 1.5608251094818115, + "learning_rate": 3.0500153066143307e-05, + "loss": 0.8069, + "step": 7072 + }, + { + "epoch": 0.3444111703552212, + "grad_norm": 1.8416359424591064, + "learning_rate": 3.0497468260955215e-05, + "loss": 0.8835, + "step": 7073 + }, + { + "epoch": 0.3444598641443284, + "grad_norm": 1.5955950021743774, + "learning_rate": 3.0494783194641634e-05, + "loss": 0.9465, + "step": 7074 + }, + { + "epoch": 0.3445085579334356, + "grad_norm": 0.08619093894958496, + "learning_rate": 3.0492097867269365e-05, + "loss": 0.5814, + "step": 7075 + }, + { + "epoch": 0.3445572517225428, + "grad_norm": 1.6206848621368408, + "learning_rate": 3.048941227890521e-05, + "loss": 0.9291, + "step": 7076 + }, + { + "epoch": 0.34460594551164997, + "grad_norm": 1.7230613231658936, + "learning_rate": 3.048672642961596e-05, + "loss": 0.8577, + "step": 7077 + }, + { + "epoch": 0.3446546393007572, + "grad_norm": 4.5074782371521, + "learning_rate": 3.048404031946844e-05, + "loss": 0.9034, + "step": 7078 + }, + { + "epoch": 0.3447033330898644, + "grad_norm": 2.760999917984009, + "learning_rate": 3.0481353948529464e-05, + "loss": 0.7868, + "step": 7079 + }, + { + "epoch": 0.3447520268789716, + "grad_norm": 2.6534414291381836, + "learning_rate": 3.0478667316865854e-05, + "loss": 0.8011, + "step": 7080 + }, + { + "epoch": 0.34480072066807876, + "grad_norm": 1.6902409791946411, + "learning_rate": 3.0475980424544432e-05, + "loss": 0.9178, + "step": 7081 + }, + { + "epoch": 0.344849414457186, + "grad_norm": 1.9116487503051758, + "learning_rate": 3.0473293271632047e-05, + "loss": 0.7945, + "step": 7082 + }, + { + "epoch": 0.3448981082462932, + "grad_norm": 1.9671603441238403, + "learning_rate": 3.0470605858195533e-05, + "loss": 0.7754, + "step": 7083 + }, + { + "epoch": 0.3449468020354004, + "grad_norm": 1.4864165782928467, + "learning_rate": 3.0467918184301744e-05, + "loss": 0.7739, + "step": 7084 + }, + { + "epoch": 0.34499549582450756, + "grad_norm": 1.5399596691131592, + "learning_rate": 3.0465230250017536e-05, + "loss": 0.8961, + "step": 7085 + }, + { + "epoch": 0.34504418961361477, + "grad_norm": 2.25974178314209, + "learning_rate": 3.0462542055409768e-05, + "loss": 0.8569, + "step": 7086 + }, + { + "epoch": 0.345092883402722, + "grad_norm": 2.240539312362671, + "learning_rate": 3.0459853600545317e-05, + "loss": 0.8059, + "step": 7087 + }, + { + "epoch": 0.3451415771918292, + "grad_norm": 2.4817075729370117, + "learning_rate": 3.0457164885491054e-05, + "loss": 0.8779, + "step": 7088 + }, + { + "epoch": 0.3451902709809364, + "grad_norm": 1.793389081954956, + "learning_rate": 3.045447591031386e-05, + "loss": 0.8217, + "step": 7089 + }, + { + "epoch": 0.34523896477004357, + "grad_norm": 1.379409909248352, + "learning_rate": 3.0451786675080628e-05, + "loss": 0.8661, + "step": 7090 + }, + { + "epoch": 0.3452876585591508, + "grad_norm": 1.683284878730774, + "learning_rate": 3.0449097179858246e-05, + "loss": 0.842, + "step": 7091 + }, + { + "epoch": 0.345336352348258, + "grad_norm": 1.5220979452133179, + "learning_rate": 3.044640742471362e-05, + "loss": 0.8317, + "step": 7092 + }, + { + "epoch": 0.3453850461373652, + "grad_norm": 1.5309059619903564, + "learning_rate": 3.0443717409713653e-05, + "loss": 0.8469, + "step": 7093 + }, + { + "epoch": 0.34543373992647236, + "grad_norm": 1.8687251806259155, + "learning_rate": 3.0441027134925263e-05, + "loss": 0.8605, + "step": 7094 + }, + { + "epoch": 0.3454824337155796, + "grad_norm": 1.770143747329712, + "learning_rate": 3.0438336600415376e-05, + "loss": 0.8998, + "step": 7095 + }, + { + "epoch": 0.3455311275046868, + "grad_norm": 1.334631085395813, + "learning_rate": 3.0435645806250913e-05, + "loss": 0.86, + "step": 7096 + }, + { + "epoch": 0.345579821293794, + "grad_norm": 0.09008653461933136, + "learning_rate": 3.043295475249881e-05, + "loss": 0.6315, + "step": 7097 + }, + { + "epoch": 0.34562851508290116, + "grad_norm": 1.5113136768341064, + "learning_rate": 3.0430263439226005e-05, + "loss": 0.8395, + "step": 7098 + }, + { + "epoch": 0.34567720887200837, + "grad_norm": 2.3043839931488037, + "learning_rate": 3.0427571866499445e-05, + "loss": 0.8722, + "step": 7099 + }, + { + "epoch": 0.3457259026611156, + "grad_norm": 2.091679573059082, + "learning_rate": 3.0424880034386084e-05, + "loss": 0.7894, + "step": 7100 + }, + { + "epoch": 0.3457745964502228, + "grad_norm": 1.2439653873443604, + "learning_rate": 3.0422187942952885e-05, + "loss": 0.9232, + "step": 7101 + }, + { + "epoch": 0.34582329023932995, + "grad_norm": 2.809112787246704, + "learning_rate": 3.041949559226681e-05, + "loss": 0.8817, + "step": 7102 + }, + { + "epoch": 0.34587198402843716, + "grad_norm": 1.2447707653045654, + "learning_rate": 3.0416802982394835e-05, + "loss": 0.9253, + "step": 7103 + }, + { + "epoch": 0.3459206778175444, + "grad_norm": 1.6368032693862915, + "learning_rate": 3.0414110113403927e-05, + "loss": 0.8698, + "step": 7104 + }, + { + "epoch": 0.3459693716066516, + "grad_norm": 1.3566702604293823, + "learning_rate": 3.0411416985361083e-05, + "loss": 0.8227, + "step": 7105 + }, + { + "epoch": 0.34601806539575874, + "grad_norm": 1.3715626001358032, + "learning_rate": 3.0408723598333288e-05, + "loss": 0.8571, + "step": 7106 + }, + { + "epoch": 0.34606675918486596, + "grad_norm": 1.6167665719985962, + "learning_rate": 3.0406029952387555e-05, + "loss": 0.882, + "step": 7107 + }, + { + "epoch": 0.34611545297397317, + "grad_norm": 1.4355857372283936, + "learning_rate": 3.0403336047590877e-05, + "loss": 0.942, + "step": 7108 + }, + { + "epoch": 0.3461641467630804, + "grad_norm": 1.6482162475585938, + "learning_rate": 3.0400641884010262e-05, + "loss": 0.8666, + "step": 7109 + }, + { + "epoch": 0.3462128405521876, + "grad_norm": 2.357405424118042, + "learning_rate": 3.0397947461712734e-05, + "loss": 0.8556, + "step": 7110 + }, + { + "epoch": 0.34626153434129475, + "grad_norm": 1.3468739986419678, + "learning_rate": 3.0395252780765317e-05, + "loss": 0.9111, + "step": 7111 + }, + { + "epoch": 0.34631022813040196, + "grad_norm": 2.3457188606262207, + "learning_rate": 3.0392557841235034e-05, + "loss": 0.8553, + "step": 7112 + }, + { + "epoch": 0.3463589219195092, + "grad_norm": 0.09096286445856094, + "learning_rate": 3.0389862643188927e-05, + "loss": 0.6131, + "step": 7113 + }, + { + "epoch": 0.3464076157086164, + "grad_norm": 1.3695790767669678, + "learning_rate": 3.0387167186694043e-05, + "loss": 0.8606, + "step": 7114 + }, + { + "epoch": 0.34645630949772355, + "grad_norm": 1.6379740238189697, + "learning_rate": 3.0384471471817424e-05, + "loss": 0.9074, + "step": 7115 + }, + { + "epoch": 0.34650500328683076, + "grad_norm": 1.8332465887069702, + "learning_rate": 3.0381775498626137e-05, + "loss": 0.8531, + "step": 7116 + }, + { + "epoch": 0.34655369707593797, + "grad_norm": 1.6885874271392822, + "learning_rate": 3.037907926718723e-05, + "loss": 0.862, + "step": 7117 + }, + { + "epoch": 0.3466023908650452, + "grad_norm": 1.3351577520370483, + "learning_rate": 3.037638277756779e-05, + "loss": 0.855, + "step": 7118 + }, + { + "epoch": 0.34665108465415234, + "grad_norm": 1.8451281785964966, + "learning_rate": 3.037368602983487e-05, + "loss": 0.7502, + "step": 7119 + }, + { + "epoch": 0.34669977844325955, + "grad_norm": 1.5078045129776, + "learning_rate": 3.037098902405557e-05, + "loss": 0.8606, + "step": 7120 + }, + { + "epoch": 0.34674847223236677, + "grad_norm": 1.840044379234314, + "learning_rate": 3.0368291760296977e-05, + "loss": 0.8487, + "step": 7121 + }, + { + "epoch": 0.346797166021474, + "grad_norm": 1.89882493019104, + "learning_rate": 3.0365594238626176e-05, + "loss": 0.8539, + "step": 7122 + }, + { + "epoch": 0.34684585981058114, + "grad_norm": 2.001816749572754, + "learning_rate": 3.0362896459110275e-05, + "loss": 0.9262, + "step": 7123 + }, + { + "epoch": 0.34689455359968835, + "grad_norm": 1.3571555614471436, + "learning_rate": 3.0360198421816376e-05, + "loss": 0.8281, + "step": 7124 + }, + { + "epoch": 0.34694324738879556, + "grad_norm": 2.18814754486084, + "learning_rate": 3.03575001268116e-05, + "loss": 0.79, + "step": 7125 + }, + { + "epoch": 0.3469919411779028, + "grad_norm": 2.008976936340332, + "learning_rate": 3.0354801574163068e-05, + "loss": 0.899, + "step": 7126 + }, + { + "epoch": 0.34704063496700993, + "grad_norm": 1.9231300354003906, + "learning_rate": 3.0352102763937896e-05, + "loss": 0.8126, + "step": 7127 + }, + { + "epoch": 0.34708932875611714, + "grad_norm": 1.7785048484802246, + "learning_rate": 3.0349403696203223e-05, + "loss": 0.7618, + "step": 7128 + }, + { + "epoch": 0.34713802254522436, + "grad_norm": 1.3127555847167969, + "learning_rate": 3.0346704371026194e-05, + "loss": 0.7953, + "step": 7129 + }, + { + "epoch": 0.34718671633433157, + "grad_norm": 1.7492876052856445, + "learning_rate": 3.0344004788473952e-05, + "loss": 0.7635, + "step": 7130 + }, + { + "epoch": 0.3472354101234388, + "grad_norm": 2.076050043106079, + "learning_rate": 3.0341304948613642e-05, + "loss": 0.8978, + "step": 7131 + }, + { + "epoch": 0.34728410391254594, + "grad_norm": 1.5089768171310425, + "learning_rate": 3.0338604851512436e-05, + "loss": 0.8586, + "step": 7132 + }, + { + "epoch": 0.34733279770165315, + "grad_norm": 1.526922583580017, + "learning_rate": 3.0335904497237484e-05, + "loss": 0.8404, + "step": 7133 + }, + { + "epoch": 0.34738149149076036, + "grad_norm": 1.1642392873764038, + "learning_rate": 3.0333203885855965e-05, + "loss": 0.7541, + "step": 7134 + }, + { + "epoch": 0.3474301852798676, + "grad_norm": 1.6820347309112549, + "learning_rate": 3.0330503017435064e-05, + "loss": 0.8872, + "step": 7135 + }, + { + "epoch": 0.34747887906897473, + "grad_norm": 1.6665549278259277, + "learning_rate": 3.0327801892041954e-05, + "loss": 0.8854, + "step": 7136 + }, + { + "epoch": 0.34752757285808195, + "grad_norm": 2.239842176437378, + "learning_rate": 3.032510050974383e-05, + "loss": 0.86, + "step": 7137 + }, + { + "epoch": 0.34757626664718916, + "grad_norm": 1.7923924922943115, + "learning_rate": 3.032239887060789e-05, + "loss": 0.8289, + "step": 7138 + }, + { + "epoch": 0.34762496043629637, + "grad_norm": 1.7493987083435059, + "learning_rate": 3.031969697470134e-05, + "loss": 0.9192, + "step": 7139 + }, + { + "epoch": 0.34767365422540353, + "grad_norm": 1.5586806535720825, + "learning_rate": 3.0316994822091383e-05, + "loss": 0.8009, + "step": 7140 + }, + { + "epoch": 0.34772234801451074, + "grad_norm": 1.4283621311187744, + "learning_rate": 3.0314292412845244e-05, + "loss": 0.842, + "step": 7141 + }, + { + "epoch": 0.34777104180361795, + "grad_norm": 1.2894811630249023, + "learning_rate": 3.0311589747030133e-05, + "loss": 0.8643, + "step": 7142 + }, + { + "epoch": 0.34781973559272517, + "grad_norm": 1.4280649423599243, + "learning_rate": 3.0308886824713286e-05, + "loss": 0.7964, + "step": 7143 + }, + { + "epoch": 0.3478684293818323, + "grad_norm": 1.6031991243362427, + "learning_rate": 3.0306183645961947e-05, + "loss": 0.8112, + "step": 7144 + }, + { + "epoch": 0.34791712317093954, + "grad_norm": 1.6128193140029907, + "learning_rate": 3.030348021084335e-05, + "loss": 0.9056, + "step": 7145 + }, + { + "epoch": 0.34796581696004675, + "grad_norm": 1.2661057710647583, + "learning_rate": 3.030077651942474e-05, + "loss": 0.8825, + "step": 7146 + }, + { + "epoch": 0.34801451074915396, + "grad_norm": 2.2435309886932373, + "learning_rate": 3.029807257177338e-05, + "loss": 0.9186, + "step": 7147 + }, + { + "epoch": 0.3480632045382611, + "grad_norm": 1.9152703285217285, + "learning_rate": 3.029536836795652e-05, + "loss": 0.8218, + "step": 7148 + }, + { + "epoch": 0.34811189832736833, + "grad_norm": 2.4273126125335693, + "learning_rate": 3.0292663908041435e-05, + "loss": 0.9383, + "step": 7149 + }, + { + "epoch": 0.34816059211647554, + "grad_norm": 0.0896347314119339, + "learning_rate": 3.0289959192095398e-05, + "loss": 0.5822, + "step": 7150 + }, + { + "epoch": 0.34820928590558275, + "grad_norm": 1.305305004119873, + "learning_rate": 3.0287254220185684e-05, + "loss": 0.8757, + "step": 7151 + }, + { + "epoch": 0.34825797969468997, + "grad_norm": 2.167941093444824, + "learning_rate": 3.028454899237959e-05, + "loss": 0.9677, + "step": 7152 + }, + { + "epoch": 0.3483066734837971, + "grad_norm": 2.226029872894287, + "learning_rate": 3.0281843508744396e-05, + "loss": 0.9699, + "step": 7153 + }, + { + "epoch": 0.34835536727290434, + "grad_norm": 8.134791374206543, + "learning_rate": 3.0279137769347406e-05, + "loss": 0.8851, + "step": 7154 + }, + { + "epoch": 0.34840406106201155, + "grad_norm": 1.6273990869522095, + "learning_rate": 3.027643177425593e-05, + "loss": 0.8766, + "step": 7155 + }, + { + "epoch": 0.34845275485111876, + "grad_norm": 1.7545616626739502, + "learning_rate": 3.027372552353728e-05, + "loss": 0.8263, + "step": 7156 + }, + { + "epoch": 0.3485014486402259, + "grad_norm": 1.4167073965072632, + "learning_rate": 3.0271019017258765e-05, + "loss": 0.8482, + "step": 7157 + }, + { + "epoch": 0.34855014242933313, + "grad_norm": 1.7330601215362549, + "learning_rate": 3.026831225548772e-05, + "loss": 0.8252, + "step": 7158 + }, + { + "epoch": 0.34859883621844034, + "grad_norm": 0.08559243381023407, + "learning_rate": 3.026560523829147e-05, + "loss": 0.6091, + "step": 7159 + }, + { + "epoch": 0.34864753000754756, + "grad_norm": 2.0615577697753906, + "learning_rate": 3.0262897965737354e-05, + "loss": 0.938, + "step": 7160 + }, + { + "epoch": 0.3486962237966547, + "grad_norm": 3.1258904933929443, + "learning_rate": 3.0260190437892718e-05, + "loss": 0.9417, + "step": 7161 + }, + { + "epoch": 0.3487449175857619, + "grad_norm": 1.5295528173446655, + "learning_rate": 3.0257482654824905e-05, + "loss": 0.8388, + "step": 7162 + }, + { + "epoch": 0.34879361137486914, + "grad_norm": 0.10660693794488907, + "learning_rate": 3.0254774616601272e-05, + "loss": 0.6511, + "step": 7163 + }, + { + "epoch": 0.34884230516397635, + "grad_norm": 1.749577283859253, + "learning_rate": 3.025206632328919e-05, + "loss": 0.8874, + "step": 7164 + }, + { + "epoch": 0.3488909989530835, + "grad_norm": 1.5405371189117432, + "learning_rate": 3.0249357774956025e-05, + "loss": 0.8378, + "step": 7165 + }, + { + "epoch": 0.3489396927421907, + "grad_norm": 0.08401545137166977, + "learning_rate": 3.0246648971669147e-05, + "loss": 0.597, + "step": 7166 + }, + { + "epoch": 0.34898838653129793, + "grad_norm": 1.178398609161377, + "learning_rate": 3.0243939913495947e-05, + "loss": 0.9628, + "step": 7167 + }, + { + "epoch": 0.34903708032040515, + "grad_norm": 2.776005983352661, + "learning_rate": 3.0241230600503804e-05, + "loss": 0.9539, + "step": 7168 + }, + { + "epoch": 0.34908577410951236, + "grad_norm": 2.122610569000244, + "learning_rate": 3.0238521032760117e-05, + "loss": 0.9151, + "step": 7169 + }, + { + "epoch": 0.3491344678986195, + "grad_norm": 1.3182400465011597, + "learning_rate": 3.0235811210332287e-05, + "loss": 0.9526, + "step": 7170 + }, + { + "epoch": 0.34918316168772673, + "grad_norm": 2.111849546432495, + "learning_rate": 3.0233101133287715e-05, + "loss": 0.7924, + "step": 7171 + }, + { + "epoch": 0.34923185547683394, + "grad_norm": 2.924433946609497, + "learning_rate": 3.0230390801693822e-05, + "loss": 0.8129, + "step": 7172 + }, + { + "epoch": 0.34928054926594115, + "grad_norm": 1.371024250984192, + "learning_rate": 3.0227680215618024e-05, + "loss": 1.005, + "step": 7173 + }, + { + "epoch": 0.3493292430550483, + "grad_norm": 1.4775396585464478, + "learning_rate": 3.022496937512775e-05, + "loss": 0.8374, + "step": 7174 + }, + { + "epoch": 0.3493779368441555, + "grad_norm": 1.5873510837554932, + "learning_rate": 3.0222258280290435e-05, + "loss": 0.9274, + "step": 7175 + }, + { + "epoch": 0.34942663063326274, + "grad_norm": 2.308117628097534, + "learning_rate": 3.0219546931173504e-05, + "loss": 0.9685, + "step": 7176 + }, + { + "epoch": 0.34947532442236995, + "grad_norm": 1.9851114749908447, + "learning_rate": 3.021683532784442e-05, + "loss": 0.8149, + "step": 7177 + }, + { + "epoch": 0.3495240182114771, + "grad_norm": 1.1924909353256226, + "learning_rate": 3.021412347037062e-05, + "loss": 0.8714, + "step": 7178 + }, + { + "epoch": 0.3495727120005843, + "grad_norm": 1.7799590826034546, + "learning_rate": 3.0211411358819565e-05, + "loss": 0.793, + "step": 7179 + }, + { + "epoch": 0.34962140578969153, + "grad_norm": 1.3803396224975586, + "learning_rate": 3.020869899325873e-05, + "loss": 0.8329, + "step": 7180 + }, + { + "epoch": 0.34967009957879874, + "grad_norm": 2.2025644779205322, + "learning_rate": 3.0205986373755572e-05, + "loss": 0.8066, + "step": 7181 + }, + { + "epoch": 0.3497187933679059, + "grad_norm": 2.432642936706543, + "learning_rate": 3.0203273500377574e-05, + "loss": 0.9318, + "step": 7182 + }, + { + "epoch": 0.3497674871570131, + "grad_norm": 1.367743730545044, + "learning_rate": 3.020056037319221e-05, + "loss": 0.8598, + "step": 7183 + }, + { + "epoch": 0.3498161809461203, + "grad_norm": 1.4959460496902466, + "learning_rate": 3.0197846992266985e-05, + "loss": 0.8315, + "step": 7184 + }, + { + "epoch": 0.34986487473522754, + "grad_norm": 1.723250150680542, + "learning_rate": 3.0195133357669388e-05, + "loss": 0.8633, + "step": 7185 + }, + { + "epoch": 0.3499135685243347, + "grad_norm": 1.6419601440429688, + "learning_rate": 3.019241946946691e-05, + "loss": 0.8337, + "step": 7186 + }, + { + "epoch": 0.3499622623134419, + "grad_norm": 1.514683723449707, + "learning_rate": 3.0189705327727073e-05, + "loss": 0.8506, + "step": 7187 + }, + { + "epoch": 0.3500109561025491, + "grad_norm": 1.6379337310791016, + "learning_rate": 3.018699093251739e-05, + "loss": 0.8626, + "step": 7188 + }, + { + "epoch": 0.35005964989165633, + "grad_norm": 1.4916216135025024, + "learning_rate": 3.0184276283905375e-05, + "loss": 0.8692, + "step": 7189 + }, + { + "epoch": 0.35010834368076355, + "grad_norm": 1.7413305044174194, + "learning_rate": 3.0181561381958563e-05, + "loss": 0.8622, + "step": 7190 + }, + { + "epoch": 0.3501570374698707, + "grad_norm": 1.2926533222198486, + "learning_rate": 3.0178846226744478e-05, + "loss": 0.8807, + "step": 7191 + }, + { + "epoch": 0.3502057312589779, + "grad_norm": 1.8932126760482788, + "learning_rate": 3.0176130818330666e-05, + "loss": 0.8684, + "step": 7192 + }, + { + "epoch": 0.3502544250480851, + "grad_norm": 1.4823620319366455, + "learning_rate": 3.0173415156784673e-05, + "loss": 0.9186, + "step": 7193 + }, + { + "epoch": 0.35030311883719234, + "grad_norm": 3.2287299633026123, + "learning_rate": 3.0170699242174048e-05, + "loss": 0.8312, + "step": 7194 + }, + { + "epoch": 0.3503518126262995, + "grad_norm": 1.1943764686584473, + "learning_rate": 3.0167983074566357e-05, + "loss": 0.9336, + "step": 7195 + }, + { + "epoch": 0.3504005064154067, + "grad_norm": 1.8003923892974854, + "learning_rate": 3.016526665402916e-05, + "loss": 0.8083, + "step": 7196 + }, + { + "epoch": 0.3504492002045139, + "grad_norm": 2.3693666458129883, + "learning_rate": 3.016254998063002e-05, + "loss": 0.8377, + "step": 7197 + }, + { + "epoch": 0.35049789399362113, + "grad_norm": 1.348308801651001, + "learning_rate": 3.015983305443653e-05, + "loss": 0.9216, + "step": 7198 + }, + { + "epoch": 0.3505465877827283, + "grad_norm": 0.08893930166959763, + "learning_rate": 3.0157115875516267e-05, + "loss": 0.6303, + "step": 7199 + }, + { + "epoch": 0.3505952815718355, + "grad_norm": 5.360852241516113, + "learning_rate": 3.015439844393682e-05, + "loss": 0.8337, + "step": 7200 + }, + { + "epoch": 0.3506439753609427, + "grad_norm": 1.8192384243011475, + "learning_rate": 3.0151680759765782e-05, + "loss": 0.9237, + "step": 7201 + }, + { + "epoch": 0.35069266915004993, + "grad_norm": 2.3911399841308594, + "learning_rate": 3.0148962823070763e-05, + "loss": 0.9201, + "step": 7202 + }, + { + "epoch": 0.3507413629391571, + "grad_norm": 3.1237874031066895, + "learning_rate": 3.0146244633919366e-05, + "loss": 0.8925, + "step": 7203 + }, + { + "epoch": 0.3507900567282643, + "grad_norm": 1.4253326654434204, + "learning_rate": 3.0143526192379212e-05, + "loss": 0.8282, + "step": 7204 + }, + { + "epoch": 0.3508387505173715, + "grad_norm": 1.4386451244354248, + "learning_rate": 3.0140807498517914e-05, + "loss": 0.7875, + "step": 7205 + }, + { + "epoch": 0.3508874443064787, + "grad_norm": 1.7355332374572754, + "learning_rate": 3.013808855240311e-05, + "loss": 0.8683, + "step": 7206 + }, + { + "epoch": 0.3509361380955859, + "grad_norm": 1.5269246101379395, + "learning_rate": 3.013536935410243e-05, + "loss": 0.8515, + "step": 7207 + }, + { + "epoch": 0.3509848318846931, + "grad_norm": 1.2821414470672607, + "learning_rate": 3.0132649903683508e-05, + "loss": 0.8159, + "step": 7208 + }, + { + "epoch": 0.3510335256738003, + "grad_norm": 1.6276624202728271, + "learning_rate": 3.0129930201213998e-05, + "loss": 0.7485, + "step": 7209 + }, + { + "epoch": 0.3510822194629075, + "grad_norm": 0.08767115324735641, + "learning_rate": 3.0127210246761547e-05, + "loss": 0.5809, + "step": 7210 + }, + { + "epoch": 0.35113091325201473, + "grad_norm": 3.645777463912964, + "learning_rate": 3.0124490040393824e-05, + "loss": 0.9161, + "step": 7211 + }, + { + "epoch": 0.3511796070411219, + "grad_norm": 1.3512399196624756, + "learning_rate": 3.0121769582178486e-05, + "loss": 0.8633, + "step": 7212 + }, + { + "epoch": 0.3512283008302291, + "grad_norm": 1.910003423690796, + "learning_rate": 3.0119048872183204e-05, + "loss": 0.8518, + "step": 7213 + }, + { + "epoch": 0.3512769946193363, + "grad_norm": 1.3027640581130981, + "learning_rate": 3.011632791047566e-05, + "loss": 0.8241, + "step": 7214 + }, + { + "epoch": 0.3513256884084435, + "grad_norm": 1.7063101530075073, + "learning_rate": 3.0113606697123537e-05, + "loss": 0.7927, + "step": 7215 + }, + { + "epoch": 0.3513743821975507, + "grad_norm": 1.7983897924423218, + "learning_rate": 3.0110885232194525e-05, + "loss": 0.8866, + "step": 7216 + }, + { + "epoch": 0.3514230759866579, + "grad_norm": 1.2769019603729248, + "learning_rate": 3.010816351575632e-05, + "loss": 0.9066, + "step": 7217 + }, + { + "epoch": 0.3514717697757651, + "grad_norm": 7.307628154754639, + "learning_rate": 3.010544154787663e-05, + "loss": 0.8733, + "step": 7218 + }, + { + "epoch": 0.3515204635648723, + "grad_norm": 1.7979545593261719, + "learning_rate": 3.010271932862316e-05, + "loss": 0.9368, + "step": 7219 + }, + { + "epoch": 0.3515691573539795, + "grad_norm": 1.5711243152618408, + "learning_rate": 3.0099996858063626e-05, + "loss": 0.8476, + "step": 7220 + }, + { + "epoch": 0.3516178511430867, + "grad_norm": 1.4363539218902588, + "learning_rate": 3.009727413626574e-05, + "loss": 0.8689, + "step": 7221 + }, + { + "epoch": 0.3516665449321939, + "grad_norm": 0.09318363666534424, + "learning_rate": 3.0094551163297247e-05, + "loss": 0.634, + "step": 7222 + }, + { + "epoch": 0.3517152387213011, + "grad_norm": 1.453199028968811, + "learning_rate": 3.0091827939225876e-05, + "loss": 0.8392, + "step": 7223 + }, + { + "epoch": 0.3517639325104083, + "grad_norm": 1.2712112665176392, + "learning_rate": 3.0089104464119358e-05, + "loss": 0.8254, + "step": 7224 + }, + { + "epoch": 0.3518126262995155, + "grad_norm": 1.409361481666565, + "learning_rate": 3.0086380738045454e-05, + "loss": 0.8994, + "step": 7225 + }, + { + "epoch": 0.3518613200886227, + "grad_norm": 1.6760592460632324, + "learning_rate": 3.0083656761071904e-05, + "loss": 0.8108, + "step": 7226 + }, + { + "epoch": 0.3519100138777299, + "grad_norm": 1.156382441520691, + "learning_rate": 3.0080932533266474e-05, + "loss": 0.8097, + "step": 7227 + }, + { + "epoch": 0.35195870766683707, + "grad_norm": 1.3838913440704346, + "learning_rate": 3.0078208054696927e-05, + "loss": 0.8271, + "step": 7228 + }, + { + "epoch": 0.3520074014559443, + "grad_norm": 2.3573598861694336, + "learning_rate": 3.0075483325431035e-05, + "loss": 1.004, + "step": 7229 + }, + { + "epoch": 0.3520560952450515, + "grad_norm": 1.4996984004974365, + "learning_rate": 3.007275834553658e-05, + "loss": 0.8823, + "step": 7230 + }, + { + "epoch": 0.3521047890341587, + "grad_norm": 1.3842015266418457, + "learning_rate": 3.0070033115081344e-05, + "loss": 0.8841, + "step": 7231 + }, + { + "epoch": 0.3521534828232659, + "grad_norm": 1.287760853767395, + "learning_rate": 3.006730763413311e-05, + "loss": 0.9076, + "step": 7232 + }, + { + "epoch": 0.3522021766123731, + "grad_norm": 1.383858561515808, + "learning_rate": 3.0064581902759683e-05, + "loss": 0.8862, + "step": 7233 + }, + { + "epoch": 0.3522508704014803, + "grad_norm": 1.4790525436401367, + "learning_rate": 3.0061855921028862e-05, + "loss": 0.8888, + "step": 7234 + }, + { + "epoch": 0.3522995641905875, + "grad_norm": 1.2991830110549927, + "learning_rate": 3.0059129689008462e-05, + "loss": 0.7648, + "step": 7235 + }, + { + "epoch": 0.3523482579796947, + "grad_norm": 1.3791767358779907, + "learning_rate": 3.0056403206766296e-05, + "loss": 0.7664, + "step": 7236 + }, + { + "epoch": 0.35239695176880187, + "grad_norm": 1.9493987560272217, + "learning_rate": 3.0053676474370178e-05, + "loss": 0.8352, + "step": 7237 + }, + { + "epoch": 0.3524456455579091, + "grad_norm": 1.5951416492462158, + "learning_rate": 3.005094949188794e-05, + "loss": 0.967, + "step": 7238 + }, + { + "epoch": 0.3524943393470163, + "grad_norm": 1.3668369054794312, + "learning_rate": 3.004822225938742e-05, + "loss": 0.8763, + "step": 7239 + }, + { + "epoch": 0.3525430331361235, + "grad_norm": 1.3602544069290161, + "learning_rate": 3.0045494776936457e-05, + "loss": 0.8287, + "step": 7240 + }, + { + "epoch": 0.35259172692523066, + "grad_norm": 1.6204124689102173, + "learning_rate": 3.0042767044602894e-05, + "loss": 0.8159, + "step": 7241 + }, + { + "epoch": 0.3526404207143379, + "grad_norm": 1.22701096534729, + "learning_rate": 3.004003906245458e-05, + "loss": 0.8843, + "step": 7242 + }, + { + "epoch": 0.3526891145034451, + "grad_norm": 1.9357818365097046, + "learning_rate": 3.0037310830559385e-05, + "loss": 0.8884, + "step": 7243 + }, + { + "epoch": 0.3527378082925523, + "grad_norm": 1.5729446411132812, + "learning_rate": 3.0034582348985163e-05, + "loss": 0.8444, + "step": 7244 + }, + { + "epoch": 0.35278650208165946, + "grad_norm": 1.7286555767059326, + "learning_rate": 3.003185361779979e-05, + "loss": 0.8428, + "step": 7245 + }, + { + "epoch": 0.35283519587076667, + "grad_norm": 1.8105945587158203, + "learning_rate": 3.002912463707115e-05, + "loss": 0.9124, + "step": 7246 + }, + { + "epoch": 0.3528838896598739, + "grad_norm": 1.2774982452392578, + "learning_rate": 3.002639540686712e-05, + "loss": 0.8954, + "step": 7247 + }, + { + "epoch": 0.3529325834489811, + "grad_norm": 1.3214746713638306, + "learning_rate": 3.0023665927255586e-05, + "loss": 0.8544, + "step": 7248 + }, + { + "epoch": 0.3529812772380883, + "grad_norm": 1.3070635795593262, + "learning_rate": 3.0020936198304448e-05, + "loss": 0.9019, + "step": 7249 + }, + { + "epoch": 0.35302997102719547, + "grad_norm": 2.1194441318511963, + "learning_rate": 3.0018206220081613e-05, + "loss": 0.8464, + "step": 7250 + }, + { + "epoch": 0.3530786648163027, + "grad_norm": 2.138925313949585, + "learning_rate": 3.0015475992654976e-05, + "loss": 0.8422, + "step": 7251 + }, + { + "epoch": 0.3531273586054099, + "grad_norm": 1.5699639320373535, + "learning_rate": 3.0012745516092467e-05, + "loss": 0.8835, + "step": 7252 + }, + { + "epoch": 0.3531760523945171, + "grad_norm": 1.3933359384536743, + "learning_rate": 3.0010014790462e-05, + "loss": 0.9069, + "step": 7253 + }, + { + "epoch": 0.35322474618362426, + "grad_norm": 2.2558228969573975, + "learning_rate": 3.00072838158315e-05, + "loss": 0.8431, + "step": 7254 + }, + { + "epoch": 0.3532734399727315, + "grad_norm": 1.6920676231384277, + "learning_rate": 3.0004552592268906e-05, + "loss": 0.9372, + "step": 7255 + }, + { + "epoch": 0.3533221337618387, + "grad_norm": 1.7243849039077759, + "learning_rate": 3.000182111984215e-05, + "loss": 0.8641, + "step": 7256 + }, + { + "epoch": 0.3533708275509459, + "grad_norm": 2.0206851959228516, + "learning_rate": 2.999908939861919e-05, + "loss": 0.8336, + "step": 7257 + }, + { + "epoch": 0.35341952134005306, + "grad_norm": 1.9629853963851929, + "learning_rate": 2.9996357428667954e-05, + "loss": 0.8225, + "step": 7258 + }, + { + "epoch": 0.35346821512916027, + "grad_norm": 1.6884024143218994, + "learning_rate": 2.999362521005643e-05, + "loss": 0.8323, + "step": 7259 + }, + { + "epoch": 0.3535169089182675, + "grad_norm": 1.8885222673416138, + "learning_rate": 2.9990892742852563e-05, + "loss": 0.8831, + "step": 7260 + }, + { + "epoch": 0.3535656027073747, + "grad_norm": 1.6387042999267578, + "learning_rate": 2.998816002712433e-05, + "loss": 0.8169, + "step": 7261 + }, + { + "epoch": 0.35361429649648185, + "grad_norm": 1.5511085987091064, + "learning_rate": 2.9985427062939698e-05, + "loss": 0.8861, + "step": 7262 + }, + { + "epoch": 0.35366299028558906, + "grad_norm": 2.4933371543884277, + "learning_rate": 2.9982693850366663e-05, + "loss": 0.8919, + "step": 7263 + }, + { + "epoch": 0.3537116840746963, + "grad_norm": 1.6339378356933594, + "learning_rate": 2.997996038947321e-05, + "loss": 0.804, + "step": 7264 + }, + { + "epoch": 0.3537603778638035, + "grad_norm": 2.1528189182281494, + "learning_rate": 2.9977226680327335e-05, + "loss": 0.9018, + "step": 7265 + }, + { + "epoch": 0.35380907165291064, + "grad_norm": 3.354257583618164, + "learning_rate": 2.9974492722997028e-05, + "loss": 0.9308, + "step": 7266 + }, + { + "epoch": 0.35385776544201786, + "grad_norm": 5.028486728668213, + "learning_rate": 2.9971758517550306e-05, + "loss": 0.9016, + "step": 7267 + }, + { + "epoch": 0.35390645923112507, + "grad_norm": 1.7573509216308594, + "learning_rate": 2.996902406405519e-05, + "loss": 0.8148, + "step": 7268 + }, + { + "epoch": 0.3539551530202323, + "grad_norm": 1.9247735738754272, + "learning_rate": 2.9966289362579685e-05, + "loss": 0.8021, + "step": 7269 + }, + { + "epoch": 0.3540038468093395, + "grad_norm": 2.0818469524383545, + "learning_rate": 2.9963554413191826e-05, + "loss": 0.8679, + "step": 7270 + }, + { + "epoch": 0.35405254059844665, + "grad_norm": 1.9858007431030273, + "learning_rate": 2.9960819215959644e-05, + "loss": 0.8479, + "step": 7271 + }, + { + "epoch": 0.35410123438755386, + "grad_norm": 1.2366701364517212, + "learning_rate": 2.995808377095117e-05, + "loss": 0.8686, + "step": 7272 + }, + { + "epoch": 0.3541499281766611, + "grad_norm": 1.8503711223602295, + "learning_rate": 2.9955348078234456e-05, + "loss": 0.8482, + "step": 7273 + }, + { + "epoch": 0.3541986219657683, + "grad_norm": 1.5356957912445068, + "learning_rate": 2.995261213787755e-05, + "loss": 0.8775, + "step": 7274 + }, + { + "epoch": 0.35424731575487545, + "grad_norm": 1.445306420326233, + "learning_rate": 2.9949875949948515e-05, + "loss": 0.8652, + "step": 7275 + }, + { + "epoch": 0.35429600954398266, + "grad_norm": 1.7520672082901, + "learning_rate": 2.9947139514515405e-05, + "loss": 0.8488, + "step": 7276 + }, + { + "epoch": 0.35434470333308987, + "grad_norm": 2.98608660697937, + "learning_rate": 2.9944402831646292e-05, + "loss": 0.8882, + "step": 7277 + }, + { + "epoch": 0.3543933971221971, + "grad_norm": 1.4982362985610962, + "learning_rate": 2.9941665901409257e-05, + "loss": 0.8642, + "step": 7278 + }, + { + "epoch": 0.35444209091130424, + "grad_norm": 1.4022345542907715, + "learning_rate": 2.993892872387237e-05, + "loss": 0.8529, + "step": 7279 + }, + { + "epoch": 0.35449078470041145, + "grad_norm": 1.3290046453475952, + "learning_rate": 2.9936191299103727e-05, + "loss": 0.8084, + "step": 7280 + }, + { + "epoch": 0.35453947848951867, + "grad_norm": 1.7066049575805664, + "learning_rate": 2.993345362717142e-05, + "loss": 0.8636, + "step": 7281 + }, + { + "epoch": 0.3545881722786259, + "grad_norm": 1.3736616373062134, + "learning_rate": 2.9930715708143548e-05, + "loss": 0.8243, + "step": 7282 + }, + { + "epoch": 0.35463686606773304, + "grad_norm": 1.2820571660995483, + "learning_rate": 2.9927977542088214e-05, + "loss": 0.8774, + "step": 7283 + }, + { + "epoch": 0.35468555985684025, + "grad_norm": 1.4571821689605713, + "learning_rate": 2.9925239129073536e-05, + "loss": 0.9241, + "step": 7284 + }, + { + "epoch": 0.35473425364594746, + "grad_norm": 1.5653718709945679, + "learning_rate": 2.9922500469167633e-05, + "loss": 0.8342, + "step": 7285 + }, + { + "epoch": 0.3547829474350547, + "grad_norm": 1.2005434036254883, + "learning_rate": 2.9919761562438626e-05, + "loss": 0.9662, + "step": 7286 + }, + { + "epoch": 0.35483164122416183, + "grad_norm": 1.6383041143417358, + "learning_rate": 2.9917022408954637e-05, + "loss": 0.8116, + "step": 7287 + }, + { + "epoch": 0.35488033501326904, + "grad_norm": 1.5849155187606812, + "learning_rate": 2.991428300878382e-05, + "loss": 0.9173, + "step": 7288 + }, + { + "epoch": 0.35492902880237626, + "grad_norm": 1.3017468452453613, + "learning_rate": 2.991154336199431e-05, + "loss": 0.8429, + "step": 7289 + }, + { + "epoch": 0.35497772259148347, + "grad_norm": 1.5149375200271606, + "learning_rate": 2.9908803468654252e-05, + "loss": 0.9056, + "step": 7290 + }, + { + "epoch": 0.3550264163805907, + "grad_norm": 1.754961371421814, + "learning_rate": 2.9906063328831804e-05, + "loss": 0.8145, + "step": 7291 + }, + { + "epoch": 0.35507511016969784, + "grad_norm": 1.4036673307418823, + "learning_rate": 2.9903322942595125e-05, + "loss": 0.8016, + "step": 7292 + }, + { + "epoch": 0.35512380395880505, + "grad_norm": 1.9777811765670776, + "learning_rate": 2.990058231001239e-05, + "loss": 0.8191, + "step": 7293 + }, + { + "epoch": 0.35517249774791226, + "grad_norm": 1.6821339130401611, + "learning_rate": 2.9897841431151763e-05, + "loss": 0.913, + "step": 7294 + }, + { + "epoch": 0.3552211915370195, + "grad_norm": 1.2714171409606934, + "learning_rate": 2.989510030608143e-05, + "loss": 0.8514, + "step": 7295 + }, + { + "epoch": 0.35526988532612663, + "grad_norm": 1.6454944610595703, + "learning_rate": 2.9892358934869573e-05, + "loss": 0.9277, + "step": 7296 + }, + { + "epoch": 0.35531857911523385, + "grad_norm": 1.7004649639129639, + "learning_rate": 2.9889617317584388e-05, + "loss": 0.8694, + "step": 7297 + }, + { + "epoch": 0.35536727290434106, + "grad_norm": 0.09093301743268967, + "learning_rate": 2.9886875454294075e-05, + "loss": 0.6054, + "step": 7298 + }, + { + "epoch": 0.35541596669344827, + "grad_norm": 1.4453492164611816, + "learning_rate": 2.9884133345066824e-05, + "loss": 0.7957, + "step": 7299 + }, + { + "epoch": 0.35546466048255543, + "grad_norm": 1.7803846597671509, + "learning_rate": 2.9881390989970863e-05, + "loss": 0.9257, + "step": 7300 + }, + { + "epoch": 0.35551335427166264, + "grad_norm": 1.4720220565795898, + "learning_rate": 2.987864838907439e-05, + "loss": 0.8921, + "step": 7301 + }, + { + "epoch": 0.35556204806076985, + "grad_norm": 2.5807888507843018, + "learning_rate": 2.9875905542445646e-05, + "loss": 0.8168, + "step": 7302 + }, + { + "epoch": 0.35561074184987707, + "grad_norm": 1.1565240621566772, + "learning_rate": 2.9873162450152848e-05, + "loss": 0.8525, + "step": 7303 + }, + { + "epoch": 0.3556594356389842, + "grad_norm": 1.4820913076400757, + "learning_rate": 2.9870419112264232e-05, + "loss": 0.8781, + "step": 7304 + }, + { + "epoch": 0.35570812942809144, + "grad_norm": 2.1529476642608643, + "learning_rate": 2.9867675528848043e-05, + "loss": 0.909, + "step": 7305 + }, + { + "epoch": 0.35575682321719865, + "grad_norm": 1.6290090084075928, + "learning_rate": 2.9864931699972528e-05, + "loss": 0.9259, + "step": 7306 + }, + { + "epoch": 0.35580551700630586, + "grad_norm": 1.3012720346450806, + "learning_rate": 2.9862187625705936e-05, + "loss": 0.7874, + "step": 7307 + }, + { + "epoch": 0.355854210795413, + "grad_norm": 1.4855273962020874, + "learning_rate": 2.985944330611653e-05, + "loss": 0.8958, + "step": 7308 + }, + { + "epoch": 0.35590290458452023, + "grad_norm": 1.604838252067566, + "learning_rate": 2.985669874127257e-05, + "loss": 0.8949, + "step": 7309 + }, + { + "epoch": 0.35595159837362744, + "grad_norm": 1.7954354286193848, + "learning_rate": 2.9853953931242324e-05, + "loss": 0.7985, + "step": 7310 + }, + { + "epoch": 0.35600029216273466, + "grad_norm": 1.364338755607605, + "learning_rate": 2.9851208876094087e-05, + "loss": 0.9068, + "step": 7311 + }, + { + "epoch": 0.35604898595184187, + "grad_norm": 1.8202769756317139, + "learning_rate": 2.984846357589612e-05, + "loss": 0.9607, + "step": 7312 + }, + { + "epoch": 0.356097679740949, + "grad_norm": 1.3521102666854858, + "learning_rate": 2.9845718030716732e-05, + "loss": 0.9282, + "step": 7313 + }, + { + "epoch": 0.35614637353005624, + "grad_norm": 1.9805740118026733, + "learning_rate": 2.984297224062421e-05, + "loss": 0.7679, + "step": 7314 + }, + { + "epoch": 0.35619506731916345, + "grad_norm": 1.5570932626724243, + "learning_rate": 2.984022620568685e-05, + "loss": 0.8841, + "step": 7315 + }, + { + "epoch": 0.35624376110827066, + "grad_norm": 1.4478071928024292, + "learning_rate": 2.983747992597297e-05, + "loss": 0.8872, + "step": 7316 + }, + { + "epoch": 0.3562924548973778, + "grad_norm": 2.4399189949035645, + "learning_rate": 2.983473340155088e-05, + "loss": 0.8412, + "step": 7317 + }, + { + "epoch": 0.35634114868648503, + "grad_norm": 2.4003379344940186, + "learning_rate": 2.98319866324889e-05, + "loss": 0.8379, + "step": 7318 + }, + { + "epoch": 0.35638984247559224, + "grad_norm": 1.3476781845092773, + "learning_rate": 2.9829239618855355e-05, + "loss": 0.8784, + "step": 7319 + }, + { + "epoch": 0.35643853626469946, + "grad_norm": 1.497481346130371, + "learning_rate": 2.982649236071858e-05, + "loss": 0.8016, + "step": 7320 + }, + { + "epoch": 0.3564872300538066, + "grad_norm": 0.08920703828334808, + "learning_rate": 2.982374485814691e-05, + "loss": 0.6452, + "step": 7321 + }, + { + "epoch": 0.3565359238429138, + "grad_norm": 1.8178179264068604, + "learning_rate": 2.982099711120869e-05, + "loss": 0.7726, + "step": 7322 + }, + { + "epoch": 0.35658461763202104, + "grad_norm": 1.419736623764038, + "learning_rate": 2.9818249119972272e-05, + "loss": 0.8956, + "step": 7323 + }, + { + "epoch": 0.35663331142112825, + "grad_norm": 1.7776979207992554, + "learning_rate": 2.9815500884506013e-05, + "loss": 0.8785, + "step": 7324 + }, + { + "epoch": 0.3566820052102354, + "grad_norm": 1.2819771766662598, + "learning_rate": 2.9812752404878275e-05, + "loss": 0.8733, + "step": 7325 + }, + { + "epoch": 0.3567306989993426, + "grad_norm": 1.4816126823425293, + "learning_rate": 2.9810003681157424e-05, + "loss": 0.875, + "step": 7326 + }, + { + "epoch": 0.35677939278844983, + "grad_norm": 1.6618014574050903, + "learning_rate": 2.980725471341184e-05, + "loss": 0.9289, + "step": 7327 + }, + { + "epoch": 0.35682808657755705, + "grad_norm": 2.60223126411438, + "learning_rate": 2.9804505501709903e-05, + "loss": 0.8079, + "step": 7328 + }, + { + "epoch": 0.35687678036666426, + "grad_norm": 1.9796268939971924, + "learning_rate": 2.9801756046119988e-05, + "loss": 0.7861, + "step": 7329 + }, + { + "epoch": 0.3569254741557714, + "grad_norm": 1.878883719444275, + "learning_rate": 2.9799006346710504e-05, + "loss": 0.7855, + "step": 7330 + }, + { + "epoch": 0.35697416794487863, + "grad_norm": 1.5205663442611694, + "learning_rate": 2.9796256403549845e-05, + "loss": 0.855, + "step": 7331 + }, + { + "epoch": 0.35702286173398584, + "grad_norm": 1.4377951622009277, + "learning_rate": 2.979350621670641e-05, + "loss": 0.8745, + "step": 7332 + }, + { + "epoch": 0.35707155552309305, + "grad_norm": 1.7740839719772339, + "learning_rate": 2.979075578624862e-05, + "loss": 0.8406, + "step": 7333 + }, + { + "epoch": 0.3571202493122002, + "grad_norm": 1.561195731163025, + "learning_rate": 2.9788005112244883e-05, + "loss": 0.9222, + "step": 7334 + }, + { + "epoch": 0.3571689431013074, + "grad_norm": 1.4617496728897095, + "learning_rate": 2.978525419476363e-05, + "loss": 0.8568, + "step": 7335 + }, + { + "epoch": 0.35721763689041464, + "grad_norm": 1.4014384746551514, + "learning_rate": 2.9782503033873288e-05, + "loss": 0.9521, + "step": 7336 + }, + { + "epoch": 0.35726633067952185, + "grad_norm": 4.299050331115723, + "learning_rate": 2.977975162964229e-05, + "loss": 0.8271, + "step": 7337 + }, + { + "epoch": 0.357315024468629, + "grad_norm": 1.7800023555755615, + "learning_rate": 2.9776999982139073e-05, + "loss": 0.8852, + "step": 7338 + }, + { + "epoch": 0.3573637182577362, + "grad_norm": 2.5449917316436768, + "learning_rate": 2.9774248091432092e-05, + "loss": 0.93, + "step": 7339 + }, + { + "epoch": 0.35741241204684343, + "grad_norm": 1.705072283744812, + "learning_rate": 2.97714959575898e-05, + "loss": 0.8958, + "step": 7340 + }, + { + "epoch": 0.35746110583595064, + "grad_norm": 2.514775037765503, + "learning_rate": 2.9768743580680658e-05, + "loss": 0.8071, + "step": 7341 + }, + { + "epoch": 0.3575097996250578, + "grad_norm": 2.293503761291504, + "learning_rate": 2.9765990960773123e-05, + "loss": 0.8554, + "step": 7342 + }, + { + "epoch": 0.357558493414165, + "grad_norm": 2.6999905109405518, + "learning_rate": 2.9763238097935678e-05, + "loss": 0.86, + "step": 7343 + }, + { + "epoch": 0.3576071872032722, + "grad_norm": 1.5204927921295166, + "learning_rate": 2.976048499223679e-05, + "loss": 0.8146, + "step": 7344 + }, + { + "epoch": 0.35765588099237944, + "grad_norm": 1.6173292398452759, + "learning_rate": 2.9757731643744955e-05, + "loss": 0.8931, + "step": 7345 + }, + { + "epoch": 0.3577045747814866, + "grad_norm": 1.407442569732666, + "learning_rate": 2.975497805252865e-05, + "loss": 0.8483, + "step": 7346 + }, + { + "epoch": 0.3577532685705938, + "grad_norm": 3.0708420276641846, + "learning_rate": 2.9752224218656377e-05, + "loss": 0.8572, + "step": 7347 + }, + { + "epoch": 0.357801962359701, + "grad_norm": 1.6144042015075684, + "learning_rate": 2.974947014219664e-05, + "loss": 0.9461, + "step": 7348 + }, + { + "epoch": 0.35785065614880823, + "grad_norm": 1.2899596691131592, + "learning_rate": 2.9746715823217944e-05, + "loss": 0.8916, + "step": 7349 + }, + { + "epoch": 0.35789934993791545, + "grad_norm": 1.7488905191421509, + "learning_rate": 2.9743961261788802e-05, + "loss": 0.8636, + "step": 7350 + }, + { + "epoch": 0.3579480437270226, + "grad_norm": 1.2712748050689697, + "learning_rate": 2.974120645797773e-05, + "loss": 0.808, + "step": 7351 + }, + { + "epoch": 0.3579967375161298, + "grad_norm": 1.7517677545547485, + "learning_rate": 2.9738451411853266e-05, + "loss": 0.8492, + "step": 7352 + }, + { + "epoch": 0.358045431305237, + "grad_norm": 1.8540980815887451, + "learning_rate": 2.973569612348393e-05, + "loss": 0.8416, + "step": 7353 + }, + { + "epoch": 0.35809412509434424, + "grad_norm": 3.067214250564575, + "learning_rate": 2.973294059293827e-05, + "loss": 0.8052, + "step": 7354 + }, + { + "epoch": 0.3581428188834514, + "grad_norm": 1.2972898483276367, + "learning_rate": 2.9730184820284824e-05, + "loss": 0.906, + "step": 7355 + }, + { + "epoch": 0.3581915126725586, + "grad_norm": 1.4946882724761963, + "learning_rate": 2.9727428805592144e-05, + "loss": 0.8605, + "step": 7356 + }, + { + "epoch": 0.3582402064616658, + "grad_norm": 1.5289710760116577, + "learning_rate": 2.9724672548928785e-05, + "loss": 0.8264, + "step": 7357 + }, + { + "epoch": 0.35828890025077303, + "grad_norm": 2.030226707458496, + "learning_rate": 2.972191605036331e-05, + "loss": 0.7727, + "step": 7358 + }, + { + "epoch": 0.3583375940398802, + "grad_norm": 1.9215481281280518, + "learning_rate": 2.9719159309964293e-05, + "loss": 0.7845, + "step": 7359 + }, + { + "epoch": 0.3583862878289874, + "grad_norm": 1.899243712425232, + "learning_rate": 2.971640232780029e-05, + "loss": 0.8074, + "step": 7360 + }, + { + "epoch": 0.3584349816180946, + "grad_norm": 1.4914820194244385, + "learning_rate": 2.97136451039399e-05, + "loss": 0.7967, + "step": 7361 + }, + { + "epoch": 0.35848367540720183, + "grad_norm": 1.692693829536438, + "learning_rate": 2.9710887638451695e-05, + "loss": 0.8585, + "step": 7362 + }, + { + "epoch": 0.358532369196309, + "grad_norm": 2.2287991046905518, + "learning_rate": 2.9708129931404283e-05, + "loss": 0.8048, + "step": 7363 + }, + { + "epoch": 0.3585810629854162, + "grad_norm": 1.2709336280822754, + "learning_rate": 2.9705371982866253e-05, + "loss": 0.8499, + "step": 7364 + }, + { + "epoch": 0.3586297567745234, + "grad_norm": 1.2973670959472656, + "learning_rate": 2.9702613792906212e-05, + "loss": 0.8707, + "step": 7365 + }, + { + "epoch": 0.3586784505636306, + "grad_norm": 1.6733273267745972, + "learning_rate": 2.9699855361592765e-05, + "loss": 0.9246, + "step": 7366 + }, + { + "epoch": 0.3587271443527378, + "grad_norm": 1.3932487964630127, + "learning_rate": 2.969709668899453e-05, + "loss": 0.8027, + "step": 7367 + }, + { + "epoch": 0.358775838141845, + "grad_norm": 1.3331178426742554, + "learning_rate": 2.9694337775180127e-05, + "loss": 0.8181, + "step": 7368 + }, + { + "epoch": 0.3588245319309522, + "grad_norm": 1.1286982297897339, + "learning_rate": 2.9691578620218197e-05, + "loss": 0.8815, + "step": 7369 + }, + { + "epoch": 0.3588732257200594, + "grad_norm": 1.5359282493591309, + "learning_rate": 2.9688819224177364e-05, + "loss": 0.7245, + "step": 7370 + }, + { + "epoch": 0.35892191950916663, + "grad_norm": 1.28134024143219, + "learning_rate": 2.9686059587126262e-05, + "loss": 0.7942, + "step": 7371 + }, + { + "epoch": 0.3589706132982738, + "grad_norm": 1.6389288902282715, + "learning_rate": 2.9683299709133552e-05, + "loss": 0.7361, + "step": 7372 + }, + { + "epoch": 0.359019307087381, + "grad_norm": 1.5506200790405273, + "learning_rate": 2.9680539590267874e-05, + "loss": 0.7977, + "step": 7373 + }, + { + "epoch": 0.3590680008764882, + "grad_norm": 1.139785647392273, + "learning_rate": 2.9677779230597898e-05, + "loss": 0.8395, + "step": 7374 + }, + { + "epoch": 0.3591166946655954, + "grad_norm": 1.4092413187026978, + "learning_rate": 2.9675018630192268e-05, + "loss": 0.7862, + "step": 7375 + }, + { + "epoch": 0.3591653884547026, + "grad_norm": 1.275537371635437, + "learning_rate": 2.967225778911968e-05, + "loss": 0.8243, + "step": 7376 + }, + { + "epoch": 0.3592140822438098, + "grad_norm": 2.225680351257324, + "learning_rate": 2.966949670744879e-05, + "loss": 0.8722, + "step": 7377 + }, + { + "epoch": 0.359262776032917, + "grad_norm": 1.581636667251587, + "learning_rate": 2.9666735385248295e-05, + "loss": 0.8796, + "step": 7378 + }, + { + "epoch": 0.3593114698220242, + "grad_norm": 2.4983835220336914, + "learning_rate": 2.9663973822586874e-05, + "loss": 0.8464, + "step": 7379 + }, + { + "epoch": 0.3593601636111314, + "grad_norm": 1.6558911800384521, + "learning_rate": 2.966121201953322e-05, + "loss": 0.894, + "step": 7380 + }, + { + "epoch": 0.3594088574002386, + "grad_norm": 1.5072009563446045, + "learning_rate": 2.9658449976156033e-05, + "loss": 0.976, + "step": 7381 + }, + { + "epoch": 0.3594575511893458, + "grad_norm": 1.3369210958480835, + "learning_rate": 2.9655687692524027e-05, + "loss": 0.8583, + "step": 7382 + }, + { + "epoch": 0.359506244978453, + "grad_norm": 2.8315675258636475, + "learning_rate": 2.9652925168705907e-05, + "loss": 0.8353, + "step": 7383 + }, + { + "epoch": 0.3595549387675602, + "grad_norm": 1.8456720113754272, + "learning_rate": 2.9650162404770397e-05, + "loss": 0.9041, + "step": 7384 + }, + { + "epoch": 0.3596036325566674, + "grad_norm": 1.1842544078826904, + "learning_rate": 2.9647399400786216e-05, + "loss": 0.7676, + "step": 7385 + }, + { + "epoch": 0.3596523263457746, + "grad_norm": 1.1642937660217285, + "learning_rate": 2.9644636156822098e-05, + "loss": 0.85, + "step": 7386 + }, + { + "epoch": 0.3597010201348818, + "grad_norm": 1.1236004829406738, + "learning_rate": 2.9641872672946773e-05, + "loss": 0.746, + "step": 7387 + }, + { + "epoch": 0.359749713923989, + "grad_norm": 2.1395437717437744, + "learning_rate": 2.9639108949228984e-05, + "loss": 0.8178, + "step": 7388 + }, + { + "epoch": 0.3597984077130962, + "grad_norm": 1.9467769861221313, + "learning_rate": 2.9636344985737482e-05, + "loss": 0.9088, + "step": 7389 + }, + { + "epoch": 0.3598471015022034, + "grad_norm": 1.604640007019043, + "learning_rate": 2.963358078254102e-05, + "loss": 0.8474, + "step": 7390 + }, + { + "epoch": 0.3598957952913106, + "grad_norm": 1.4049410820007324, + "learning_rate": 2.9630816339708355e-05, + "loss": 0.8308, + "step": 7391 + }, + { + "epoch": 0.3599444890804178, + "grad_norm": 1.2757883071899414, + "learning_rate": 2.962805165730826e-05, + "loss": 0.9269, + "step": 7392 + }, + { + "epoch": 0.359993182869525, + "grad_norm": 1.836981177330017, + "learning_rate": 2.9625286735409503e-05, + "loss": 0.8689, + "step": 7393 + }, + { + "epoch": 0.3600418766586322, + "grad_norm": 1.9779179096221924, + "learning_rate": 2.962252157408086e-05, + "loss": 0.8736, + "step": 7394 + }, + { + "epoch": 0.3600905704477394, + "grad_norm": 1.8624593019485474, + "learning_rate": 2.9619756173391114e-05, + "loss": 0.8717, + "step": 7395 + }, + { + "epoch": 0.3601392642368466, + "grad_norm": 1.2612820863723755, + "learning_rate": 2.9616990533409055e-05, + "loss": 0.9247, + "step": 7396 + }, + { + "epoch": 0.36018795802595377, + "grad_norm": 1.6016712188720703, + "learning_rate": 2.9614224654203482e-05, + "loss": 0.8969, + "step": 7397 + }, + { + "epoch": 0.360236651815061, + "grad_norm": 1.5085657835006714, + "learning_rate": 2.961145853584319e-05, + "loss": 0.8219, + "step": 7398 + }, + { + "epoch": 0.3602853456041682, + "grad_norm": 1.4718097448349, + "learning_rate": 2.9608692178397002e-05, + "loss": 0.9618, + "step": 7399 + }, + { + "epoch": 0.3603340393932754, + "grad_norm": 2.0103347301483154, + "learning_rate": 2.960592558193371e-05, + "loss": 0.8533, + "step": 7400 + }, + { + "epoch": 0.36038273318238256, + "grad_norm": 1.6202315092086792, + "learning_rate": 2.9603158746522144e-05, + "loss": 0.7516, + "step": 7401 + }, + { + "epoch": 0.3604314269714898, + "grad_norm": 1.9608646631240845, + "learning_rate": 2.960039167223113e-05, + "loss": 0.8073, + "step": 7402 + }, + { + "epoch": 0.360480120760597, + "grad_norm": 2.563385486602783, + "learning_rate": 2.95976243591295e-05, + "loss": 0.8965, + "step": 7403 + }, + { + "epoch": 0.3605288145497042, + "grad_norm": 0.08916612714529037, + "learning_rate": 2.959485680728608e-05, + "loss": 0.5996, + "step": 7404 + }, + { + "epoch": 0.36057750833881136, + "grad_norm": 1.3181915283203125, + "learning_rate": 2.9592089016769732e-05, + "loss": 0.9122, + "step": 7405 + }, + { + "epoch": 0.36062620212791857, + "grad_norm": 1.4348152875900269, + "learning_rate": 2.9589320987649293e-05, + "loss": 0.9044, + "step": 7406 + }, + { + "epoch": 0.3606748959170258, + "grad_norm": 2.6658716201782227, + "learning_rate": 2.958655271999362e-05, + "loss": 0.8465, + "step": 7407 + }, + { + "epoch": 0.360723589706133, + "grad_norm": 1.9239108562469482, + "learning_rate": 2.9583784213871573e-05, + "loss": 0.9218, + "step": 7408 + }, + { + "epoch": 0.3607722834952402, + "grad_norm": 1.4365527629852295, + "learning_rate": 2.958101546935202e-05, + "loss": 0.9074, + "step": 7409 + }, + { + "epoch": 0.36082097728434737, + "grad_norm": 1.4563064575195312, + "learning_rate": 2.957824648650383e-05, + "loss": 0.8886, + "step": 7410 + }, + { + "epoch": 0.3608696710734546, + "grad_norm": 1.402625560760498, + "learning_rate": 2.9575477265395892e-05, + "loss": 0.8674, + "step": 7411 + }, + { + "epoch": 0.3609183648625618, + "grad_norm": 2.3581504821777344, + "learning_rate": 2.9572707806097076e-05, + "loss": 0.86, + "step": 7412 + }, + { + "epoch": 0.360967058651669, + "grad_norm": 4.022506237030029, + "learning_rate": 2.9569938108676286e-05, + "loss": 0.906, + "step": 7413 + }, + { + "epoch": 0.36101575244077616, + "grad_norm": 1.4346472024917603, + "learning_rate": 2.956716817320241e-05, + "loss": 0.8974, + "step": 7414 + }, + { + "epoch": 0.3610644462298834, + "grad_norm": 1.1725537776947021, + "learning_rate": 2.9564397999744358e-05, + "loss": 0.8715, + "step": 7415 + }, + { + "epoch": 0.3611131400189906, + "grad_norm": 1.3946170806884766, + "learning_rate": 2.9561627588371036e-05, + "loss": 0.848, + "step": 7416 + }, + { + "epoch": 0.3611618338080978, + "grad_norm": 1.4882586002349854, + "learning_rate": 2.9558856939151354e-05, + "loss": 0.9307, + "step": 7417 + }, + { + "epoch": 0.36121052759720496, + "grad_norm": 4.570128440856934, + "learning_rate": 2.9556086052154226e-05, + "loss": 0.8942, + "step": 7418 + }, + { + "epoch": 0.36125922138631217, + "grad_norm": 1.4817134141921997, + "learning_rate": 2.9553314927448594e-05, + "loss": 0.8288, + "step": 7419 + }, + { + "epoch": 0.3613079151754194, + "grad_norm": 1.7027087211608887, + "learning_rate": 2.9550543565103376e-05, + "loss": 0.8887, + "step": 7420 + }, + { + "epoch": 0.3613566089645266, + "grad_norm": 1.2958799600601196, + "learning_rate": 2.954777196518752e-05, + "loss": 0.7866, + "step": 7421 + }, + { + "epoch": 0.36140530275363375, + "grad_norm": 2.349121332168579, + "learning_rate": 2.954500012776997e-05, + "loss": 0.7635, + "step": 7422 + }, + { + "epoch": 0.36145399654274096, + "grad_norm": 3.138784646987915, + "learning_rate": 2.9542228052919664e-05, + "loss": 0.7783, + "step": 7423 + }, + { + "epoch": 0.3615026903318482, + "grad_norm": 1.9086101055145264, + "learning_rate": 2.9539455740705573e-05, + "loss": 0.7818, + "step": 7424 + }, + { + "epoch": 0.3615513841209554, + "grad_norm": 1.4071186780929565, + "learning_rate": 2.953668319119664e-05, + "loss": 0.7604, + "step": 7425 + }, + { + "epoch": 0.36160007791006255, + "grad_norm": 1.2179752588272095, + "learning_rate": 2.9533910404461853e-05, + "loss": 0.9105, + "step": 7426 + }, + { + "epoch": 0.36164877169916976, + "grad_norm": 1.398889422416687, + "learning_rate": 2.953113738057017e-05, + "loss": 0.8096, + "step": 7427 + }, + { + "epoch": 0.36169746548827697, + "grad_norm": 1.3078198432922363, + "learning_rate": 2.9528364119590578e-05, + "loss": 0.849, + "step": 7428 + }, + { + "epoch": 0.3617461592773842, + "grad_norm": 2.832859516143799, + "learning_rate": 2.952559062159206e-05, + "loss": 0.806, + "step": 7429 + }, + { + "epoch": 0.3617948530664914, + "grad_norm": 2.0036821365356445, + "learning_rate": 2.95228168866436e-05, + "loss": 0.922, + "step": 7430 + }, + { + "epoch": 0.36184354685559855, + "grad_norm": 1.4217169284820557, + "learning_rate": 2.952004291481421e-05, + "loss": 0.8448, + "step": 7431 + }, + { + "epoch": 0.36189224064470576, + "grad_norm": 1.978959321975708, + "learning_rate": 2.9517268706172885e-05, + "loss": 0.8381, + "step": 7432 + }, + { + "epoch": 0.361940934433813, + "grad_norm": 1.9909504652023315, + "learning_rate": 2.9514494260788624e-05, + "loss": 0.8503, + "step": 7433 + }, + { + "epoch": 0.3619896282229202, + "grad_norm": 3.2817442417144775, + "learning_rate": 2.951171957873046e-05, + "loss": 0.8375, + "step": 7434 + }, + { + "epoch": 0.36203832201202735, + "grad_norm": 1.4222092628479004, + "learning_rate": 2.95089446600674e-05, + "loss": 0.8925, + "step": 7435 + }, + { + "epoch": 0.36208701580113456, + "grad_norm": 1.958625316619873, + "learning_rate": 2.950616950486848e-05, + "loss": 0.9642, + "step": 7436 + }, + { + "epoch": 0.36213570959024177, + "grad_norm": 1.4717578887939453, + "learning_rate": 2.9503394113202717e-05, + "loss": 0.89, + "step": 7437 + }, + { + "epoch": 0.362184403379349, + "grad_norm": 1.3264737129211426, + "learning_rate": 2.950061848513917e-05, + "loss": 0.8474, + "step": 7438 + }, + { + "epoch": 0.36223309716845614, + "grad_norm": 1.5014897584915161, + "learning_rate": 2.9497842620746864e-05, + "loss": 0.8719, + "step": 7439 + }, + { + "epoch": 0.36228179095756335, + "grad_norm": 1.4996217489242554, + "learning_rate": 2.949506652009486e-05, + "loss": 0.8914, + "step": 7440 + }, + { + "epoch": 0.36233048474667057, + "grad_norm": 1.3043708801269531, + "learning_rate": 2.9492290183252205e-05, + "loss": 0.8595, + "step": 7441 + }, + { + "epoch": 0.3623791785357778, + "grad_norm": 1.5964702367782593, + "learning_rate": 2.9489513610287975e-05, + "loss": 0.7511, + "step": 7442 + }, + { + "epoch": 0.36242787232488494, + "grad_norm": 1.275428056716919, + "learning_rate": 2.9486736801271222e-05, + "loss": 0.8943, + "step": 7443 + }, + { + "epoch": 0.36247656611399215, + "grad_norm": 1.6251215934753418, + "learning_rate": 2.9483959756271035e-05, + "loss": 0.8198, + "step": 7444 + }, + { + "epoch": 0.36252525990309936, + "grad_norm": 1.4902926683425903, + "learning_rate": 2.9481182475356475e-05, + "loss": 0.9332, + "step": 7445 + }, + { + "epoch": 0.3625739536922066, + "grad_norm": 1.9491057395935059, + "learning_rate": 2.947840495859664e-05, + "loss": 0.8307, + "step": 7446 + }, + { + "epoch": 0.36262264748131373, + "grad_norm": 1.425919771194458, + "learning_rate": 2.947562720606062e-05, + "loss": 0.9204, + "step": 7447 + }, + { + "epoch": 0.36267134127042094, + "grad_norm": 1.497913122177124, + "learning_rate": 2.9472849217817505e-05, + "loss": 0.9017, + "step": 7448 + }, + { + "epoch": 0.36272003505952816, + "grad_norm": 2.1804535388946533, + "learning_rate": 2.9470070993936404e-05, + "loss": 0.9283, + "step": 7449 + }, + { + "epoch": 0.36276872884863537, + "grad_norm": 1.50754714012146, + "learning_rate": 2.946729253448642e-05, + "loss": 0.8751, + "step": 7450 + }, + { + "epoch": 0.3628174226377426, + "grad_norm": 1.5447529554367065, + "learning_rate": 2.9464513839536676e-05, + "loss": 0.8654, + "step": 7451 + }, + { + "epoch": 0.36286611642684974, + "grad_norm": 2.561645269393921, + "learning_rate": 2.9461734909156284e-05, + "loss": 0.8732, + "step": 7452 + }, + { + "epoch": 0.36291481021595695, + "grad_norm": 1.4941892623901367, + "learning_rate": 2.9458955743414374e-05, + "loss": 0.8422, + "step": 7453 + }, + { + "epoch": 0.36296350400506416, + "grad_norm": 1.2834928035736084, + "learning_rate": 2.945617634238008e-05, + "loss": 0.9422, + "step": 7454 + }, + { + "epoch": 0.3630121977941714, + "grad_norm": 1.3767855167388916, + "learning_rate": 2.9453396706122532e-05, + "loss": 0.8253, + "step": 7455 + }, + { + "epoch": 0.36306089158327853, + "grad_norm": 1.9681761264801025, + "learning_rate": 2.9450616834710884e-05, + "loss": 0.845, + "step": 7456 + }, + { + "epoch": 0.36310958537238575, + "grad_norm": 1.5412368774414062, + "learning_rate": 2.9447836728214284e-05, + "loss": 0.8204, + "step": 7457 + }, + { + "epoch": 0.36315827916149296, + "grad_norm": 1.2808811664581299, + "learning_rate": 2.9445056386701877e-05, + "loss": 0.781, + "step": 7458 + }, + { + "epoch": 0.36320697295060017, + "grad_norm": 1.7974497079849243, + "learning_rate": 2.9442275810242833e-05, + "loss": 0.9006, + "step": 7459 + }, + { + "epoch": 0.36325566673970733, + "grad_norm": 1.6025722026824951, + "learning_rate": 2.9439494998906316e-05, + "loss": 0.9142, + "step": 7460 + }, + { + "epoch": 0.36330436052881454, + "grad_norm": 1.5094887018203735, + "learning_rate": 2.9436713952761498e-05, + "loss": 0.7745, + "step": 7461 + }, + { + "epoch": 0.36335305431792175, + "grad_norm": 1.3806124925613403, + "learning_rate": 2.9433932671877566e-05, + "loss": 0.8996, + "step": 7462 + }, + { + "epoch": 0.36340174810702897, + "grad_norm": 1.665513277053833, + "learning_rate": 2.9431151156323694e-05, + "loss": 0.8401, + "step": 7463 + }, + { + "epoch": 0.3634504418961361, + "grad_norm": 1.6560131311416626, + "learning_rate": 2.9428369406169083e-05, + "loss": 0.8132, + "step": 7464 + }, + { + "epoch": 0.36349913568524334, + "grad_norm": 0.08510851114988327, + "learning_rate": 2.942558742148292e-05, + "loss": 0.577, + "step": 7465 + }, + { + "epoch": 0.36354782947435055, + "grad_norm": 1.7286758422851562, + "learning_rate": 2.9422805202334412e-05, + "loss": 0.9234, + "step": 7466 + }, + { + "epoch": 0.36359652326345776, + "grad_norm": 2.1674997806549072, + "learning_rate": 2.9420022748792765e-05, + "loss": 0.8359, + "step": 7467 + }, + { + "epoch": 0.363645217052565, + "grad_norm": 2.0580713748931885, + "learning_rate": 2.9417240060927188e-05, + "loss": 0.8859, + "step": 7468 + }, + { + "epoch": 0.36369391084167213, + "grad_norm": 1.1873382329940796, + "learning_rate": 2.941445713880691e-05, + "loss": 0.8735, + "step": 7469 + }, + { + "epoch": 0.36374260463077934, + "grad_norm": 2.4335110187530518, + "learning_rate": 2.941167398250115e-05, + "loss": 0.8465, + "step": 7470 + }, + { + "epoch": 0.36379129841988656, + "grad_norm": 1.7211710214614868, + "learning_rate": 2.9408890592079145e-05, + "loss": 0.8408, + "step": 7471 + }, + { + "epoch": 0.36383999220899377, + "grad_norm": 2.3529162406921387, + "learning_rate": 2.940610696761013e-05, + "loss": 0.8883, + "step": 7472 + }, + { + "epoch": 0.3638886859981009, + "grad_norm": 2.027043104171753, + "learning_rate": 2.9403323109163343e-05, + "loss": 0.844, + "step": 7473 + }, + { + "epoch": 0.36393737978720814, + "grad_norm": 1.5554252862930298, + "learning_rate": 2.9400539016808038e-05, + "loss": 0.8815, + "step": 7474 + }, + { + "epoch": 0.36398607357631535, + "grad_norm": 2.1036734580993652, + "learning_rate": 2.939775469061347e-05, + "loss": 0.8405, + "step": 7475 + }, + { + "epoch": 0.36403476736542256, + "grad_norm": 1.619423270225525, + "learning_rate": 2.9394970130648893e-05, + "loss": 0.7903, + "step": 7476 + }, + { + "epoch": 0.3640834611545297, + "grad_norm": 1.626872181892395, + "learning_rate": 2.939218533698358e-05, + "loss": 0.7363, + "step": 7477 + }, + { + "epoch": 0.36413215494363693, + "grad_norm": 1.563092827796936, + "learning_rate": 2.93894003096868e-05, + "loss": 0.9176, + "step": 7478 + }, + { + "epoch": 0.36418084873274414, + "grad_norm": 1.4107311964035034, + "learning_rate": 2.9386615048827833e-05, + "loss": 0.9061, + "step": 7479 + }, + { + "epoch": 0.36422954252185136, + "grad_norm": 1.5468690395355225, + "learning_rate": 2.9383829554475954e-05, + "loss": 0.8526, + "step": 7480 + }, + { + "epoch": 0.3642782363109585, + "grad_norm": 1.411694884300232, + "learning_rate": 2.9381043826700463e-05, + "loss": 0.8398, + "step": 7481 + }, + { + "epoch": 0.3643269301000657, + "grad_norm": 1.5409841537475586, + "learning_rate": 2.937825786557065e-05, + "loss": 0.8204, + "step": 7482 + }, + { + "epoch": 0.36437562388917294, + "grad_norm": 1.6633716821670532, + "learning_rate": 2.9375471671155823e-05, + "loss": 0.8202, + "step": 7483 + }, + { + "epoch": 0.36442431767828015, + "grad_norm": 1.3893777132034302, + "learning_rate": 2.937268524352528e-05, + "loss": 0.7587, + "step": 7484 + }, + { + "epoch": 0.3644730114673873, + "grad_norm": 1.2668821811676025, + "learning_rate": 2.9369898582748333e-05, + "loss": 0.7784, + "step": 7485 + }, + { + "epoch": 0.3645217052564945, + "grad_norm": 1.3345355987548828, + "learning_rate": 2.936711168889431e-05, + "loss": 0.7291, + "step": 7486 + }, + { + "epoch": 0.36457039904560173, + "grad_norm": 1.6089640855789185, + "learning_rate": 2.9364324562032532e-05, + "loss": 0.8728, + "step": 7487 + }, + { + "epoch": 0.36461909283470895, + "grad_norm": 1.4430501461029053, + "learning_rate": 2.9361537202232323e-05, + "loss": 0.8078, + "step": 7488 + }, + { + "epoch": 0.36466778662381616, + "grad_norm": 1.5353788137435913, + "learning_rate": 2.9358749609563023e-05, + "loss": 0.959, + "step": 7489 + }, + { + "epoch": 0.3647164804129233, + "grad_norm": 1.3803379535675049, + "learning_rate": 2.9355961784093966e-05, + "loss": 0.8092, + "step": 7490 + }, + { + "epoch": 0.36476517420203053, + "grad_norm": 2.018817186355591, + "learning_rate": 2.9353173725894515e-05, + "loss": 0.8774, + "step": 7491 + }, + { + "epoch": 0.36481386799113774, + "grad_norm": 0.08914240449666977, + "learning_rate": 2.9350385435034013e-05, + "loss": 0.6106, + "step": 7492 + }, + { + "epoch": 0.36486256178024495, + "grad_norm": 1.5408434867858887, + "learning_rate": 2.934759691158182e-05, + "loss": 0.8162, + "step": 7493 + }, + { + "epoch": 0.3649112555693521, + "grad_norm": 3.8375070095062256, + "learning_rate": 2.93448081556073e-05, + "loss": 0.894, + "step": 7494 + }, + { + "epoch": 0.3649599493584593, + "grad_norm": 1.3301749229431152, + "learning_rate": 2.9342019167179828e-05, + "loss": 0.8049, + "step": 7495 + }, + { + "epoch": 0.36500864314756654, + "grad_norm": 1.733147144317627, + "learning_rate": 2.9339229946368778e-05, + "loss": 0.6977, + "step": 7496 + }, + { + "epoch": 0.36505733693667375, + "grad_norm": 0.08783411979675293, + "learning_rate": 2.9336440493243533e-05, + "loss": 0.6182, + "step": 7497 + }, + { + "epoch": 0.3651060307257809, + "grad_norm": 1.602879524230957, + "learning_rate": 2.9333650807873476e-05, + "loss": 0.7824, + "step": 7498 + }, + { + "epoch": 0.3651547245148881, + "grad_norm": 1.2871955633163452, + "learning_rate": 2.9330860890328004e-05, + "loss": 0.7912, + "step": 7499 + }, + { + "epoch": 0.36520341830399533, + "grad_norm": 1.4680490493774414, + "learning_rate": 2.932807074067651e-05, + "loss": 0.9342, + "step": 7500 + }, + { + "epoch": 0.36525211209310254, + "grad_norm": 1.6928421258926392, + "learning_rate": 2.932528035898842e-05, + "loss": 0.8384, + "step": 7501 + }, + { + "epoch": 0.3653008058822097, + "grad_norm": 1.52773118019104, + "learning_rate": 2.9322489745333126e-05, + "loss": 0.8107, + "step": 7502 + }, + { + "epoch": 0.3653494996713169, + "grad_norm": 1.9899853467941284, + "learning_rate": 2.9319698899780045e-05, + "loss": 0.7777, + "step": 7503 + }, + { + "epoch": 0.3653981934604241, + "grad_norm": 1.601992130279541, + "learning_rate": 2.931690782239861e-05, + "loss": 0.8047, + "step": 7504 + }, + { + "epoch": 0.36544688724953134, + "grad_norm": 1.4392045736312866, + "learning_rate": 2.9314116513258234e-05, + "loss": 0.7907, + "step": 7505 + }, + { + "epoch": 0.3654955810386385, + "grad_norm": 0.08878283947706223, + "learning_rate": 2.9311324972428366e-05, + "loss": 0.68, + "step": 7506 + }, + { + "epoch": 0.3655442748277457, + "grad_norm": 1.7291699647903442, + "learning_rate": 2.9308533199978444e-05, + "loss": 0.8273, + "step": 7507 + }, + { + "epoch": 0.3655929686168529, + "grad_norm": 0.09073606133460999, + "learning_rate": 2.930574119597791e-05, + "loss": 0.6009, + "step": 7508 + }, + { + "epoch": 0.36564166240596013, + "grad_norm": 1.7225825786590576, + "learning_rate": 2.930294896049621e-05, + "loss": 0.8497, + "step": 7509 + }, + { + "epoch": 0.36569035619506735, + "grad_norm": 1.6663004159927368, + "learning_rate": 2.930015649360281e-05, + "loss": 0.8986, + "step": 7510 + }, + { + "epoch": 0.3657390499841745, + "grad_norm": 1.72050142288208, + "learning_rate": 2.9297363795367167e-05, + "loss": 0.8778, + "step": 7511 + }, + { + "epoch": 0.3657877437732817, + "grad_norm": 1.5144274234771729, + "learning_rate": 2.9294570865858753e-05, + "loss": 0.894, + "step": 7512 + }, + { + "epoch": 0.3658364375623889, + "grad_norm": 2.1413724422454834, + "learning_rate": 2.929177770514704e-05, + "loss": 0.7908, + "step": 7513 + }, + { + "epoch": 0.36588513135149614, + "grad_norm": 2.067387819290161, + "learning_rate": 2.9288984313301512e-05, + "loss": 0.8888, + "step": 7514 + }, + { + "epoch": 0.3659338251406033, + "grad_norm": 1.465440034866333, + "learning_rate": 2.9286190690391653e-05, + "loss": 0.8258, + "step": 7515 + }, + { + "epoch": 0.3659825189297105, + "grad_norm": 3.881568670272827, + "learning_rate": 2.928339683648695e-05, + "loss": 0.7522, + "step": 7516 + }, + { + "epoch": 0.3660312127188177, + "grad_norm": 1.465804934501648, + "learning_rate": 2.928060275165691e-05, + "loss": 0.8858, + "step": 7517 + }, + { + "epoch": 0.36607990650792493, + "grad_norm": 1.5225117206573486, + "learning_rate": 2.9277808435971024e-05, + "loss": 0.7723, + "step": 7518 + }, + { + "epoch": 0.3661286002970321, + "grad_norm": 2.26520037651062, + "learning_rate": 2.9275013889498807e-05, + "loss": 0.8627, + "step": 7519 + }, + { + "epoch": 0.3661772940861393, + "grad_norm": 1.4919975996017456, + "learning_rate": 2.9272219112309775e-05, + "loss": 0.8253, + "step": 7520 + }, + { + "epoch": 0.3662259878752465, + "grad_norm": 1.3043864965438843, + "learning_rate": 2.9269424104473447e-05, + "loss": 0.8247, + "step": 7521 + }, + { + "epoch": 0.36627468166435373, + "grad_norm": 1.4968663454055786, + "learning_rate": 2.926662886605935e-05, + "loss": 0.8704, + "step": 7522 + }, + { + "epoch": 0.3663233754534609, + "grad_norm": 1.84417724609375, + "learning_rate": 2.9263833397137018e-05, + "loss": 0.945, + "step": 7523 + }, + { + "epoch": 0.3663720692425681, + "grad_norm": 2.4721767902374268, + "learning_rate": 2.9261037697775985e-05, + "loss": 0.7874, + "step": 7524 + }, + { + "epoch": 0.3664207630316753, + "grad_norm": 2.2528326511383057, + "learning_rate": 2.9258241768045796e-05, + "loss": 0.8624, + "step": 7525 + }, + { + "epoch": 0.3664694568207825, + "grad_norm": 1.409589409828186, + "learning_rate": 2.9255445608015995e-05, + "loss": 0.8176, + "step": 7526 + }, + { + "epoch": 0.3665181506098897, + "grad_norm": 1.9780672788619995, + "learning_rate": 2.9252649217756137e-05, + "loss": 0.9368, + "step": 7527 + }, + { + "epoch": 0.3665668443989969, + "grad_norm": 1.8522770404815674, + "learning_rate": 2.9249852597335794e-05, + "loss": 0.8028, + "step": 7528 + }, + { + "epoch": 0.3666155381881041, + "grad_norm": 2.305150270462036, + "learning_rate": 2.924705574682452e-05, + "loss": 0.7587, + "step": 7529 + }, + { + "epoch": 0.3666642319772113, + "grad_norm": 1.5481621026992798, + "learning_rate": 2.9244258666291894e-05, + "loss": 0.8563, + "step": 7530 + }, + { + "epoch": 0.36671292576631853, + "grad_norm": 1.6407994031906128, + "learning_rate": 2.924146135580749e-05, + "loss": 0.894, + "step": 7531 + }, + { + "epoch": 0.3667616195554257, + "grad_norm": 1.9153594970703125, + "learning_rate": 2.923866381544089e-05, + "loss": 0.8908, + "step": 7532 + }, + { + "epoch": 0.3668103133445329, + "grad_norm": 1.5718220472335815, + "learning_rate": 2.9235866045261686e-05, + "loss": 0.7773, + "step": 7533 + }, + { + "epoch": 0.3668590071336401, + "grad_norm": 2.092836618423462, + "learning_rate": 2.923306804533947e-05, + "loss": 0.8438, + "step": 7534 + }, + { + "epoch": 0.3669077009227473, + "grad_norm": 1.1484344005584717, + "learning_rate": 2.9230269815743847e-05, + "loss": 0.8091, + "step": 7535 + }, + { + "epoch": 0.3669563947118545, + "grad_norm": 1.535320520401001, + "learning_rate": 2.9227471356544423e-05, + "loss": 0.8641, + "step": 7536 + }, + { + "epoch": 0.3670050885009617, + "grad_norm": 1.5407696962356567, + "learning_rate": 2.9224672667810806e-05, + "loss": 0.9307, + "step": 7537 + }, + { + "epoch": 0.3670537822900689, + "grad_norm": 4.028205871582031, + "learning_rate": 2.922187374961262e-05, + "loss": 0.7617, + "step": 7538 + }, + { + "epoch": 0.3671024760791761, + "grad_norm": 1.5675265789031982, + "learning_rate": 2.9219074602019474e-05, + "loss": 0.8999, + "step": 7539 + }, + { + "epoch": 0.3671511698682833, + "grad_norm": 1.8227239847183228, + "learning_rate": 2.9216275225101013e-05, + "loss": 0.8537, + "step": 7540 + }, + { + "epoch": 0.3671998636573905, + "grad_norm": 6.568049430847168, + "learning_rate": 2.9213475618926862e-05, + "loss": 0.8509, + "step": 7541 + }, + { + "epoch": 0.3672485574464977, + "grad_norm": 1.4282530546188354, + "learning_rate": 2.9210675783566666e-05, + "loss": 0.8042, + "step": 7542 + }, + { + "epoch": 0.3672972512356049, + "grad_norm": 1.5102033615112305, + "learning_rate": 2.9207875719090074e-05, + "loss": 0.8768, + "step": 7543 + }, + { + "epoch": 0.3673459450247121, + "grad_norm": 1.5167834758758545, + "learning_rate": 2.9205075425566728e-05, + "loss": 0.8555, + "step": 7544 + }, + { + "epoch": 0.3673946388138193, + "grad_norm": 1.3063275814056396, + "learning_rate": 2.9202274903066293e-05, + "loss": 0.8462, + "step": 7545 + }, + { + "epoch": 0.3674433326029265, + "grad_norm": 3.861778974533081, + "learning_rate": 2.9199474151658436e-05, + "loss": 0.8026, + "step": 7546 + }, + { + "epoch": 0.3674920263920337, + "grad_norm": 1.2824703454971313, + "learning_rate": 2.9196673171412818e-05, + "loss": 0.8576, + "step": 7547 + }, + { + "epoch": 0.3675407201811409, + "grad_norm": 0.08950687199831009, + "learning_rate": 2.919387196239911e-05, + "loss": 0.6852, + "step": 7548 + }, + { + "epoch": 0.3675894139702481, + "grad_norm": 1.7739298343658447, + "learning_rate": 2.9191070524687007e-05, + "loss": 0.8163, + "step": 7549 + }, + { + "epoch": 0.3676381077593553, + "grad_norm": 0.09165691584348679, + "learning_rate": 2.918826885834618e-05, + "loss": 0.6513, + "step": 7550 + }, + { + "epoch": 0.3676868015484625, + "grad_norm": 1.3937093019485474, + "learning_rate": 2.918546696344633e-05, + "loss": 0.8562, + "step": 7551 + }, + { + "epoch": 0.3677354953375697, + "grad_norm": 1.3478422164916992, + "learning_rate": 2.9182664840057152e-05, + "loss": 0.8391, + "step": 7552 + }, + { + "epoch": 0.3677841891266769, + "grad_norm": 2.849152088165283, + "learning_rate": 2.9179862488248352e-05, + "loss": 0.8891, + "step": 7553 + }, + { + "epoch": 0.3678328829157841, + "grad_norm": 0.0916784331202507, + "learning_rate": 2.9177059908089633e-05, + "loss": 0.5325, + "step": 7554 + }, + { + "epoch": 0.3678815767048913, + "grad_norm": 1.4520162343978882, + "learning_rate": 2.9174257099650714e-05, + "loss": 0.902, + "step": 7555 + }, + { + "epoch": 0.3679302704939985, + "grad_norm": 1.5340572595596313, + "learning_rate": 2.917145406300131e-05, + "loss": 0.9167, + "step": 7556 + }, + { + "epoch": 0.36797896428310567, + "grad_norm": 1.7091063261032104, + "learning_rate": 2.9168650798211148e-05, + "loss": 0.831, + "step": 7557 + }, + { + "epoch": 0.3680276580722129, + "grad_norm": 0.09425556659698486, + "learning_rate": 2.916584730534997e-05, + "loss": 0.659, + "step": 7558 + }, + { + "epoch": 0.3680763518613201, + "grad_norm": 2.393775701522827, + "learning_rate": 2.9163043584487495e-05, + "loss": 0.8922, + "step": 7559 + }, + { + "epoch": 0.3681250456504273, + "grad_norm": 3.2550106048583984, + "learning_rate": 2.9160239635693483e-05, + "loss": 0.8781, + "step": 7560 + }, + { + "epoch": 0.36817373943953446, + "grad_norm": 1.5222498178482056, + "learning_rate": 2.915743545903767e-05, + "loss": 0.871, + "step": 7561 + }, + { + "epoch": 0.3682224332286417, + "grad_norm": 1.1853373050689697, + "learning_rate": 2.915463105458982e-05, + "loss": 0.8563, + "step": 7562 + }, + { + "epoch": 0.3682711270177489, + "grad_norm": 1.8324989080429077, + "learning_rate": 2.915182642241968e-05, + "loss": 0.8139, + "step": 7563 + }, + { + "epoch": 0.3683198208068561, + "grad_norm": 1.537330150604248, + "learning_rate": 2.914902156259703e-05, + "loss": 0.8925, + "step": 7564 + }, + { + "epoch": 0.36836851459596326, + "grad_norm": 1.5200190544128418, + "learning_rate": 2.914621647519163e-05, + "loss": 0.8736, + "step": 7565 + }, + { + "epoch": 0.36841720838507047, + "grad_norm": 1.4650343656539917, + "learning_rate": 2.9143411160273267e-05, + "loss": 0.8354, + "step": 7566 + }, + { + "epoch": 0.3684659021741777, + "grad_norm": 1.861336350440979, + "learning_rate": 2.914060561791171e-05, + "loss": 0.8242, + "step": 7567 + }, + { + "epoch": 0.3685145959632849, + "grad_norm": 0.08875852823257446, + "learning_rate": 2.9137799848176765e-05, + "loss": 0.553, + "step": 7568 + }, + { + "epoch": 0.3685632897523921, + "grad_norm": 1.8287334442138672, + "learning_rate": 2.9134993851138203e-05, + "loss": 0.8091, + "step": 7569 + }, + { + "epoch": 0.36861198354149927, + "grad_norm": 1.0913145542144775, + "learning_rate": 2.9132187626865845e-05, + "loss": 0.8612, + "step": 7570 + }, + { + "epoch": 0.3686606773306065, + "grad_norm": 1.833274483680725, + "learning_rate": 2.9129381175429482e-05, + "loss": 0.8731, + "step": 7571 + }, + { + "epoch": 0.3687093711197137, + "grad_norm": 1.388552188873291, + "learning_rate": 2.9126574496898935e-05, + "loss": 0.808, + "step": 7572 + }, + { + "epoch": 0.3687580649088209, + "grad_norm": 1.583502173423767, + "learning_rate": 2.912376759134401e-05, + "loss": 0.872, + "step": 7573 + }, + { + "epoch": 0.36880675869792806, + "grad_norm": 2.6228368282318115, + "learning_rate": 2.9120960458834537e-05, + "loss": 0.852, + "step": 7574 + }, + { + "epoch": 0.3688554524870353, + "grad_norm": 1.7961384057998657, + "learning_rate": 2.9118153099440347e-05, + "loss": 0.8431, + "step": 7575 + }, + { + "epoch": 0.3689041462761425, + "grad_norm": 3.2291066646575928, + "learning_rate": 2.911534551323126e-05, + "loss": 0.9021, + "step": 7576 + }, + { + "epoch": 0.3689528400652497, + "grad_norm": 1.939008355140686, + "learning_rate": 2.9112537700277124e-05, + "loss": 0.806, + "step": 7577 + }, + { + "epoch": 0.36900153385435686, + "grad_norm": 1.4403574466705322, + "learning_rate": 2.910972966064778e-05, + "loss": 0.7367, + "step": 7578 + }, + { + "epoch": 0.36905022764346407, + "grad_norm": 1.9874340295791626, + "learning_rate": 2.9106921394413077e-05, + "loss": 0.9415, + "step": 7579 + }, + { + "epoch": 0.3690989214325713, + "grad_norm": 1.4477200508117676, + "learning_rate": 2.9104112901642883e-05, + "loss": 0.8071, + "step": 7580 + }, + { + "epoch": 0.3691476152216785, + "grad_norm": 1.6211647987365723, + "learning_rate": 2.9101304182407047e-05, + "loss": 0.8262, + "step": 7581 + }, + { + "epoch": 0.36919630901078565, + "grad_norm": 1.7025232315063477, + "learning_rate": 2.909849523677544e-05, + "loss": 0.8993, + "step": 7582 + }, + { + "epoch": 0.36924500279989286, + "grad_norm": 1.4979385137557983, + "learning_rate": 2.909568606481793e-05, + "loss": 0.8828, + "step": 7583 + }, + { + "epoch": 0.3692936965890001, + "grad_norm": 0.08907867223024368, + "learning_rate": 2.9092876666604408e-05, + "loss": 0.6115, + "step": 7584 + }, + { + "epoch": 0.3693423903781073, + "grad_norm": 1.9611014127731323, + "learning_rate": 2.909006704220474e-05, + "loss": 0.892, + "step": 7585 + }, + { + "epoch": 0.36939108416721445, + "grad_norm": 1.5022449493408203, + "learning_rate": 2.908725719168883e-05, + "loss": 0.8817, + "step": 7586 + }, + { + "epoch": 0.36943977795632166, + "grad_norm": 1.8834749460220337, + "learning_rate": 2.9084447115126574e-05, + "loss": 0.7808, + "step": 7587 + }, + { + "epoch": 0.36948847174542887, + "grad_norm": 1.9485341310501099, + "learning_rate": 2.9081636812587863e-05, + "loss": 0.8719, + "step": 7588 + }, + { + "epoch": 0.3695371655345361, + "grad_norm": 1.4774837493896484, + "learning_rate": 2.9078826284142604e-05, + "loss": 0.8324, + "step": 7589 + }, + { + "epoch": 0.3695858593236433, + "grad_norm": 1.5825210809707642, + "learning_rate": 2.907601552986072e-05, + "loss": 0.8024, + "step": 7590 + }, + { + "epoch": 0.36963455311275045, + "grad_norm": 2.1759421825408936, + "learning_rate": 2.907320454981212e-05, + "loss": 0.9782, + "step": 7591 + }, + { + "epoch": 0.36968324690185767, + "grad_norm": 1.3593602180480957, + "learning_rate": 2.907039334406673e-05, + "loss": 0.8569, + "step": 7592 + }, + { + "epoch": 0.3697319406909649, + "grad_norm": 1.7517569065093994, + "learning_rate": 2.9067581912694474e-05, + "loss": 0.8057, + "step": 7593 + }, + { + "epoch": 0.3697806344800721, + "grad_norm": 2.0042247772216797, + "learning_rate": 2.9064770255765293e-05, + "loss": 0.8554, + "step": 7594 + }, + { + "epoch": 0.36982932826917925, + "grad_norm": 1.9074653387069702, + "learning_rate": 2.906195837334913e-05, + "loss": 0.8118, + "step": 7595 + }, + { + "epoch": 0.36987802205828646, + "grad_norm": 1.5677764415740967, + "learning_rate": 2.905914626551592e-05, + "loss": 0.8845, + "step": 7596 + }, + { + "epoch": 0.3699267158473937, + "grad_norm": 1.6692774295806885, + "learning_rate": 2.905633393233563e-05, + "loss": 0.8731, + "step": 7597 + }, + { + "epoch": 0.3699754096365009, + "grad_norm": 1.7350798845291138, + "learning_rate": 2.90535213738782e-05, + "loss": 0.9463, + "step": 7598 + }, + { + "epoch": 0.37002410342560804, + "grad_norm": 1.5011506080627441, + "learning_rate": 2.9050708590213603e-05, + "loss": 0.7681, + "step": 7599 + }, + { + "epoch": 0.37007279721471525, + "grad_norm": 2.176513910293579, + "learning_rate": 2.9047895581411804e-05, + "loss": 0.8154, + "step": 7600 + }, + { + "epoch": 0.37012149100382247, + "grad_norm": 1.7737107276916504, + "learning_rate": 2.9045082347542773e-05, + "loss": 0.8207, + "step": 7601 + }, + { + "epoch": 0.3701701847929297, + "grad_norm": 1.3674620389938354, + "learning_rate": 2.9042268888676503e-05, + "loss": 0.8638, + "step": 7602 + }, + { + "epoch": 0.37021887858203684, + "grad_norm": 1.9902127981185913, + "learning_rate": 2.9039455204882964e-05, + "loss": 0.8258, + "step": 7603 + }, + { + "epoch": 0.37026757237114405, + "grad_norm": 1.5626752376556396, + "learning_rate": 2.9036641296232156e-05, + "loss": 0.9034, + "step": 7604 + }, + { + "epoch": 0.37031626616025126, + "grad_norm": 2.3140339851379395, + "learning_rate": 2.903382716279407e-05, + "loss": 0.8446, + "step": 7605 + }, + { + "epoch": 0.3703649599493585, + "grad_norm": 1.7330683469772339, + "learning_rate": 2.9031012804638704e-05, + "loss": 0.8103, + "step": 7606 + }, + { + "epoch": 0.37041365373846563, + "grad_norm": 1.5940589904785156, + "learning_rate": 2.902819822183608e-05, + "loss": 0.7763, + "step": 7607 + }, + { + "epoch": 0.37046234752757284, + "grad_norm": 3.8976447582244873, + "learning_rate": 2.9025383414456194e-05, + "loss": 0.8677, + "step": 7608 + }, + { + "epoch": 0.37051104131668006, + "grad_norm": 2.0718438625335693, + "learning_rate": 2.902256838256908e-05, + "loss": 0.8633, + "step": 7609 + }, + { + "epoch": 0.37055973510578727, + "grad_norm": 2.100450277328491, + "learning_rate": 2.9019753126244753e-05, + "loss": 0.8104, + "step": 7610 + }, + { + "epoch": 0.3706084288948945, + "grad_norm": 1.9280474185943604, + "learning_rate": 2.9016937645553246e-05, + "loss": 0.8248, + "step": 7611 + }, + { + "epoch": 0.37065712268400164, + "grad_norm": 1.561492919921875, + "learning_rate": 2.9014121940564587e-05, + "loss": 0.8597, + "step": 7612 + }, + { + "epoch": 0.37070581647310885, + "grad_norm": 1.8328553438186646, + "learning_rate": 2.9011306011348833e-05, + "loss": 0.8776, + "step": 7613 + }, + { + "epoch": 0.37075451026221606, + "grad_norm": 0.09089460968971252, + "learning_rate": 2.9008489857976008e-05, + "loss": 0.5875, + "step": 7614 + }, + { + "epoch": 0.3708032040513233, + "grad_norm": 2.345215320587158, + "learning_rate": 2.9005673480516186e-05, + "loss": 0.8717, + "step": 7615 + }, + { + "epoch": 0.37085189784043043, + "grad_norm": 1.773477554321289, + "learning_rate": 2.9002856879039408e-05, + "loss": 0.9556, + "step": 7616 + }, + { + "epoch": 0.37090059162953765, + "grad_norm": 1.5398386716842651, + "learning_rate": 2.900004005361575e-05, + "loss": 0.7733, + "step": 7617 + }, + { + "epoch": 0.37094928541864486, + "grad_norm": 1.6888363361358643, + "learning_rate": 2.899722300431527e-05, + "loss": 0.9612, + "step": 7618 + }, + { + "epoch": 0.37099797920775207, + "grad_norm": 1.6509605646133423, + "learning_rate": 2.899440573120805e-05, + "loss": 0.8183, + "step": 7619 + }, + { + "epoch": 0.37104667299685923, + "grad_norm": 2.3282833099365234, + "learning_rate": 2.899158823436417e-05, + "loss": 0.7918, + "step": 7620 + }, + { + "epoch": 0.37109536678596644, + "grad_norm": 1.782280683517456, + "learning_rate": 2.898877051385371e-05, + "loss": 0.7693, + "step": 7621 + }, + { + "epoch": 0.37114406057507365, + "grad_norm": 1.3994182348251343, + "learning_rate": 2.8985952569746763e-05, + "loss": 0.8477, + "step": 7622 + }, + { + "epoch": 0.37119275436418087, + "grad_norm": 2.04839825630188, + "learning_rate": 2.8983134402113425e-05, + "loss": 0.8173, + "step": 7623 + }, + { + "epoch": 0.371241448153288, + "grad_norm": 2.6747570037841797, + "learning_rate": 2.8980316011023807e-05, + "loss": 0.9538, + "step": 7624 + }, + { + "epoch": 0.37129014194239524, + "grad_norm": 2.2199840545654297, + "learning_rate": 2.8977497396548005e-05, + "loss": 0.8221, + "step": 7625 + }, + { + "epoch": 0.37133883573150245, + "grad_norm": 1.4124387502670288, + "learning_rate": 2.8974678558756135e-05, + "loss": 0.887, + "step": 7626 + }, + { + "epoch": 0.37138752952060966, + "grad_norm": 1.9184811115264893, + "learning_rate": 2.8971859497718326e-05, + "loss": 0.8494, + "step": 7627 + }, + { + "epoch": 0.3714362233097169, + "grad_norm": 1.8527179956436157, + "learning_rate": 2.896904021350468e-05, + "loss": 0.845, + "step": 7628 + }, + { + "epoch": 0.37148491709882403, + "grad_norm": 1.3651080131530762, + "learning_rate": 2.8966220706185346e-05, + "loss": 0.8445, + "step": 7629 + }, + { + "epoch": 0.37153361088793124, + "grad_norm": 1.915097713470459, + "learning_rate": 2.8963400975830458e-05, + "loss": 0.8192, + "step": 7630 + }, + { + "epoch": 0.37158230467703846, + "grad_norm": 1.5498888492584229, + "learning_rate": 2.896058102251015e-05, + "loss": 0.8799, + "step": 7631 + }, + { + "epoch": 0.37163099846614567, + "grad_norm": 2.289808511734009, + "learning_rate": 2.8957760846294575e-05, + "loss": 0.8832, + "step": 7632 + }, + { + "epoch": 0.3716796922552528, + "grad_norm": 1.2503783702850342, + "learning_rate": 2.8954940447253883e-05, + "loss": 0.9385, + "step": 7633 + }, + { + "epoch": 0.37172838604436004, + "grad_norm": 1.465721845626831, + "learning_rate": 2.895211982545823e-05, + "loss": 0.8594, + "step": 7634 + }, + { + "epoch": 0.37177707983346725, + "grad_norm": 1.6590279340744019, + "learning_rate": 2.894929898097778e-05, + "loss": 0.8602, + "step": 7635 + }, + { + "epoch": 0.37182577362257446, + "grad_norm": 2.0709547996520996, + "learning_rate": 2.8946477913882697e-05, + "loss": 0.8441, + "step": 7636 + }, + { + "epoch": 0.3718744674116816, + "grad_norm": 1.1879312992095947, + "learning_rate": 2.8943656624243165e-05, + "loss": 0.7947, + "step": 7637 + }, + { + "epoch": 0.37192316120078883, + "grad_norm": 1.8166033029556274, + "learning_rate": 2.8940835112129356e-05, + "loss": 0.8833, + "step": 7638 + }, + { + "epoch": 0.37197185498989604, + "grad_norm": 1.4625914096832275, + "learning_rate": 2.8938013377611464e-05, + "loss": 0.8182, + "step": 7639 + }, + { + "epoch": 0.37202054877900326, + "grad_norm": 1.6090573072433472, + "learning_rate": 2.8935191420759672e-05, + "loss": 0.9201, + "step": 7640 + }, + { + "epoch": 0.3720692425681104, + "grad_norm": 1.9404048919677734, + "learning_rate": 2.8932369241644174e-05, + "loss": 0.8559, + "step": 7641 + }, + { + "epoch": 0.3721179363572176, + "grad_norm": 1.723430871963501, + "learning_rate": 2.8929546840335184e-05, + "loss": 0.8253, + "step": 7642 + }, + { + "epoch": 0.37216663014632484, + "grad_norm": 1.7341312170028687, + "learning_rate": 2.8926724216902896e-05, + "loss": 0.7965, + "step": 7643 + }, + { + "epoch": 0.37221532393543205, + "grad_norm": 1.4762084484100342, + "learning_rate": 2.8923901371417532e-05, + "loss": 0.9112, + "step": 7644 + }, + { + "epoch": 0.3722640177245392, + "grad_norm": 0.0965404361486435, + "learning_rate": 2.892107830394931e-05, + "loss": 0.6142, + "step": 7645 + }, + { + "epoch": 0.3723127115136464, + "grad_norm": 1.7921897172927856, + "learning_rate": 2.891825501456845e-05, + "loss": 0.9164, + "step": 7646 + }, + { + "epoch": 0.37236140530275363, + "grad_norm": 1.3811542987823486, + "learning_rate": 2.8915431503345184e-05, + "loss": 0.8061, + "step": 7647 + }, + { + "epoch": 0.37241009909186085, + "grad_norm": 1.6391687393188477, + "learning_rate": 2.8912607770349744e-05, + "loss": 0.8567, + "step": 7648 + }, + { + "epoch": 0.37245879288096806, + "grad_norm": 0.08965198695659637, + "learning_rate": 2.890978381565237e-05, + "loss": 0.6208, + "step": 7649 + }, + { + "epoch": 0.3725074866700752, + "grad_norm": 1.4941298961639404, + "learning_rate": 2.890695963932332e-05, + "loss": 0.8577, + "step": 7650 + }, + { + "epoch": 0.37255618045918243, + "grad_norm": 1.832680583000183, + "learning_rate": 2.890413524143283e-05, + "loss": 0.8097, + "step": 7651 + }, + { + "epoch": 0.37260487424828964, + "grad_norm": 1.6313436031341553, + "learning_rate": 2.8901310622051165e-05, + "loss": 0.9062, + "step": 7652 + }, + { + "epoch": 0.37265356803739685, + "grad_norm": 1.369235634803772, + "learning_rate": 2.889848578124859e-05, + "loss": 0.9239, + "step": 7653 + }, + { + "epoch": 0.372702261826504, + "grad_norm": 2.2326645851135254, + "learning_rate": 2.8895660719095365e-05, + "loss": 0.824, + "step": 7654 + }, + { + "epoch": 0.3727509556156112, + "grad_norm": 1.722570776939392, + "learning_rate": 2.889283543566178e-05, + "loss": 0.9239, + "step": 7655 + }, + { + "epoch": 0.37279964940471844, + "grad_norm": 0.08860983699560165, + "learning_rate": 2.889000993101809e-05, + "loss": 0.5508, + "step": 7656 + }, + { + "epoch": 0.37284834319382565, + "grad_norm": 0.09361648559570312, + "learning_rate": 2.88871842052346e-05, + "loss": 0.6404, + "step": 7657 + }, + { + "epoch": 0.3728970369829328, + "grad_norm": 1.5450469255447388, + "learning_rate": 2.8884358258381582e-05, + "loss": 0.8335, + "step": 7658 + }, + { + "epoch": 0.37294573077204, + "grad_norm": 1.5640873908996582, + "learning_rate": 2.8881532090529352e-05, + "loss": 0.7878, + "step": 7659 + }, + { + "epoch": 0.37299442456114723, + "grad_norm": 0.09212277829647064, + "learning_rate": 2.8878705701748198e-05, + "loss": 0.6008, + "step": 7660 + }, + { + "epoch": 0.37304311835025444, + "grad_norm": 1.9178684949874878, + "learning_rate": 2.887587909210843e-05, + "loss": 0.9075, + "step": 7661 + }, + { + "epoch": 0.3730918121393616, + "grad_norm": 1.2945773601531982, + "learning_rate": 2.8873052261680365e-05, + "loss": 0.8156, + "step": 7662 + }, + { + "epoch": 0.3731405059284688, + "grad_norm": 1.9103456735610962, + "learning_rate": 2.8870225210534313e-05, + "loss": 0.9091, + "step": 7663 + }, + { + "epoch": 0.373189199717576, + "grad_norm": 1.9092161655426025, + "learning_rate": 2.88673979387406e-05, + "loss": 0.8119, + "step": 7664 + }, + { + "epoch": 0.37323789350668324, + "grad_norm": 1.76851224899292, + "learning_rate": 2.8864570446369547e-05, + "loss": 0.8565, + "step": 7665 + }, + { + "epoch": 0.3732865872957904, + "grad_norm": 1.9193453788757324, + "learning_rate": 2.8861742733491508e-05, + "loss": 1.0062, + "step": 7666 + }, + { + "epoch": 0.3733352810848976, + "grad_norm": 1.7900996208190918, + "learning_rate": 2.885891480017681e-05, + "loss": 0.8005, + "step": 7667 + }, + { + "epoch": 0.3733839748740048, + "grad_norm": 1.8621439933776855, + "learning_rate": 2.8856086646495788e-05, + "loss": 0.8337, + "step": 7668 + }, + { + "epoch": 0.37343266866311203, + "grad_norm": 1.5386234521865845, + "learning_rate": 2.885325827251881e-05, + "loss": 0.8455, + "step": 7669 + }, + { + "epoch": 0.37348136245221925, + "grad_norm": 1.3981554508209229, + "learning_rate": 2.8850429678316226e-05, + "loss": 0.7482, + "step": 7670 + }, + { + "epoch": 0.3735300562413264, + "grad_norm": 1.479670524597168, + "learning_rate": 2.8847600863958395e-05, + "loss": 0.857, + "step": 7671 + }, + { + "epoch": 0.3735787500304336, + "grad_norm": 1.3220449686050415, + "learning_rate": 2.8844771829515682e-05, + "loss": 0.8641, + "step": 7672 + }, + { + "epoch": 0.3736274438195408, + "grad_norm": 2.086000680923462, + "learning_rate": 2.884194257505847e-05, + "loss": 0.8913, + "step": 7673 + }, + { + "epoch": 0.37367613760864804, + "grad_norm": 1.9965044260025024, + "learning_rate": 2.8839113100657132e-05, + "loss": 0.8938, + "step": 7674 + }, + { + "epoch": 0.3737248313977552, + "grad_norm": 1.457700490951538, + "learning_rate": 2.8836283406382045e-05, + "loss": 0.7808, + "step": 7675 + }, + { + "epoch": 0.3737735251868624, + "grad_norm": 10.847090721130371, + "learning_rate": 2.883345349230361e-05, + "loss": 0.8366, + "step": 7676 + }, + { + "epoch": 0.3738222189759696, + "grad_norm": 2.8889248371124268, + "learning_rate": 2.8830623358492205e-05, + "loss": 0.8878, + "step": 7677 + }, + { + "epoch": 0.37387091276507683, + "grad_norm": 2.100944757461548, + "learning_rate": 2.882779300501824e-05, + "loss": 0.7791, + "step": 7678 + }, + { + "epoch": 0.373919606554184, + "grad_norm": 2.0787360668182373, + "learning_rate": 2.8824962431952114e-05, + "loss": 0.8649, + "step": 7679 + }, + { + "epoch": 0.3739683003432912, + "grad_norm": 1.3192940950393677, + "learning_rate": 2.8822131639364248e-05, + "loss": 0.8303, + "step": 7680 + }, + { + "epoch": 0.3740169941323984, + "grad_norm": 1.7444154024124146, + "learning_rate": 2.8819300627325056e-05, + "loss": 0.8363, + "step": 7681 + }, + { + "epoch": 0.37406568792150563, + "grad_norm": 1.4580157995224, + "learning_rate": 2.8816469395904954e-05, + "loss": 0.8221, + "step": 7682 + }, + { + "epoch": 0.3741143817106128, + "grad_norm": 2.248670816421509, + "learning_rate": 2.8813637945174378e-05, + "loss": 0.78, + "step": 7683 + }, + { + "epoch": 0.37416307549972, + "grad_norm": 2.3266329765319824, + "learning_rate": 2.8810806275203747e-05, + "loss": 0.8639, + "step": 7684 + }, + { + "epoch": 0.3742117692888272, + "grad_norm": 2.987694501876831, + "learning_rate": 2.8807974386063507e-05, + "loss": 0.8892, + "step": 7685 + }, + { + "epoch": 0.3742604630779344, + "grad_norm": 1.5551750659942627, + "learning_rate": 2.8805142277824104e-05, + "loss": 0.8211, + "step": 7686 + }, + { + "epoch": 0.3743091568670416, + "grad_norm": 1.3634177446365356, + "learning_rate": 2.880230995055598e-05, + "loss": 0.8608, + "step": 7687 + }, + { + "epoch": 0.3743578506561488, + "grad_norm": 1.339920997619629, + "learning_rate": 2.8799477404329597e-05, + "loss": 0.9148, + "step": 7688 + }, + { + "epoch": 0.374406544445256, + "grad_norm": 1.5721256732940674, + "learning_rate": 2.8796644639215414e-05, + "loss": 0.9566, + "step": 7689 + }, + { + "epoch": 0.3744552382343632, + "grad_norm": 1.9387816190719604, + "learning_rate": 2.879381165528389e-05, + "loss": 0.8841, + "step": 7690 + }, + { + "epoch": 0.37450393202347043, + "grad_norm": 1.5600558519363403, + "learning_rate": 2.8790978452605502e-05, + "loss": 0.8708, + "step": 7691 + }, + { + "epoch": 0.3745526258125776, + "grad_norm": 1.348275065422058, + "learning_rate": 2.8788145031250718e-05, + "loss": 0.8513, + "step": 7692 + }, + { + "epoch": 0.3746013196016848, + "grad_norm": 2.4594833850860596, + "learning_rate": 2.8785311391290035e-05, + "loss": 0.8766, + "step": 7693 + }, + { + "epoch": 0.374650013390792, + "grad_norm": 1.4757699966430664, + "learning_rate": 2.8782477532793917e-05, + "loss": 0.8387, + "step": 7694 + }, + { + "epoch": 0.3746987071798992, + "grad_norm": 1.8523088693618774, + "learning_rate": 2.8779643455832877e-05, + "loss": 0.8596, + "step": 7695 + }, + { + "epoch": 0.3747474009690064, + "grad_norm": 2.07964825630188, + "learning_rate": 2.8776809160477405e-05, + "loss": 0.7398, + "step": 7696 + }, + { + "epoch": 0.3747960947581136, + "grad_norm": 1.6535437107086182, + "learning_rate": 2.8773974646798006e-05, + "loss": 0.7805, + "step": 7697 + }, + { + "epoch": 0.3748447885472208, + "grad_norm": 2.1928327083587646, + "learning_rate": 2.8771139914865182e-05, + "loss": 0.8191, + "step": 7698 + }, + { + "epoch": 0.374893482336328, + "grad_norm": 1.8577011823654175, + "learning_rate": 2.8768304964749457e-05, + "loss": 0.7698, + "step": 7699 + }, + { + "epoch": 0.3749421761254352, + "grad_norm": 2.450967311859131, + "learning_rate": 2.876546979652135e-05, + "loss": 0.7974, + "step": 7700 + }, + { + "epoch": 0.3749908699145424, + "grad_norm": 1.702972173690796, + "learning_rate": 2.8762634410251373e-05, + "loss": 0.762, + "step": 7701 + }, + { + "epoch": 0.3750395637036496, + "grad_norm": 2.024007558822632, + "learning_rate": 2.8759798806010074e-05, + "loss": 0.8563, + "step": 7702 + }, + { + "epoch": 0.3750882574927568, + "grad_norm": 1.5203765630722046, + "learning_rate": 2.8756962983867978e-05, + "loss": 0.8106, + "step": 7703 + }, + { + "epoch": 0.375136951281864, + "grad_norm": 1.2502034902572632, + "learning_rate": 2.875412694389563e-05, + "loss": 0.8047, + "step": 7704 + }, + { + "epoch": 0.3751856450709712, + "grad_norm": 1.2919434309005737, + "learning_rate": 2.875129068616358e-05, + "loss": 0.8523, + "step": 7705 + }, + { + "epoch": 0.3752343388600784, + "grad_norm": 1.5054577589035034, + "learning_rate": 2.8748454210742375e-05, + "loss": 0.8418, + "step": 7706 + }, + { + "epoch": 0.3752830326491856, + "grad_norm": 0.08787993341684341, + "learning_rate": 2.8745617517702568e-05, + "loss": 0.568, + "step": 7707 + }, + { + "epoch": 0.3753317264382928, + "grad_norm": 0.09933266043663025, + "learning_rate": 2.874278060711473e-05, + "loss": 0.6535, + "step": 7708 + }, + { + "epoch": 0.3753804202274, + "grad_norm": 1.53825044631958, + "learning_rate": 2.873994347904943e-05, + "loss": 0.8452, + "step": 7709 + }, + { + "epoch": 0.3754291140165072, + "grad_norm": 4.534036636352539, + "learning_rate": 2.873710613357724e-05, + "loss": 0.8992, + "step": 7710 + }, + { + "epoch": 0.3754778078056144, + "grad_norm": 1.4106916189193726, + "learning_rate": 2.8734268570768736e-05, + "loss": 0.877, + "step": 7711 + }, + { + "epoch": 0.3755265015947216, + "grad_norm": 2.220492124557495, + "learning_rate": 2.8731430790694505e-05, + "loss": 0.8544, + "step": 7712 + }, + { + "epoch": 0.3755751953838288, + "grad_norm": 1.2499710321426392, + "learning_rate": 2.872859279342514e-05, + "loss": 0.8209, + "step": 7713 + }, + { + "epoch": 0.375623889172936, + "grad_norm": 3.6589016914367676, + "learning_rate": 2.872575457903123e-05, + "loss": 0.9411, + "step": 7714 + }, + { + "epoch": 0.3756725829620432, + "grad_norm": 1.201972246170044, + "learning_rate": 2.8722916147583375e-05, + "loss": 0.8135, + "step": 7715 + }, + { + "epoch": 0.3757212767511504, + "grad_norm": 0.08784816414117813, + "learning_rate": 2.872007749915219e-05, + "loss": 0.5417, + "step": 7716 + }, + { + "epoch": 0.37576997054025757, + "grad_norm": 1.457698941230774, + "learning_rate": 2.8717238633808284e-05, + "loss": 0.792, + "step": 7717 + }, + { + "epoch": 0.3758186643293648, + "grad_norm": 1.93050217628479, + "learning_rate": 2.871439955162227e-05, + "loss": 0.8449, + "step": 7718 + }, + { + "epoch": 0.375867358118472, + "grad_norm": 1.927047848701477, + "learning_rate": 2.8711560252664773e-05, + "loss": 0.7391, + "step": 7719 + }, + { + "epoch": 0.3759160519075792, + "grad_norm": 1.5753141641616821, + "learning_rate": 2.870872073700642e-05, + "loss": 0.8001, + "step": 7720 + }, + { + "epoch": 0.37596474569668636, + "grad_norm": 1.7504302263259888, + "learning_rate": 2.8705881004717845e-05, + "loss": 0.8608, + "step": 7721 + }, + { + "epoch": 0.3760134394857936, + "grad_norm": 1.6450868844985962, + "learning_rate": 2.8703041055869692e-05, + "loss": 0.9177, + "step": 7722 + }, + { + "epoch": 0.3760621332749008, + "grad_norm": 1.5825510025024414, + "learning_rate": 2.870020089053259e-05, + "loss": 0.8979, + "step": 7723 + }, + { + "epoch": 0.376110827064008, + "grad_norm": 1.4321268796920776, + "learning_rate": 2.8697360508777202e-05, + "loss": 0.7924, + "step": 7724 + }, + { + "epoch": 0.37615952085311516, + "grad_norm": 5.267017364501953, + "learning_rate": 2.8694519910674178e-05, + "loss": 0.8489, + "step": 7725 + }, + { + "epoch": 0.37620821464222237, + "grad_norm": 1.3758224248886108, + "learning_rate": 2.869167909629418e-05, + "loss": 0.79, + "step": 7726 + }, + { + "epoch": 0.3762569084313296, + "grad_norm": 1.728488802909851, + "learning_rate": 2.8688838065707865e-05, + "loss": 0.8893, + "step": 7727 + }, + { + "epoch": 0.3763056022204368, + "grad_norm": 0.08761042356491089, + "learning_rate": 2.8685996818985916e-05, + "loss": 0.6139, + "step": 7728 + }, + { + "epoch": 0.376354296009544, + "grad_norm": 1.450270414352417, + "learning_rate": 2.8683155356199005e-05, + "loss": 0.8794, + "step": 7729 + }, + { + "epoch": 0.37640298979865117, + "grad_norm": 1.8957569599151611, + "learning_rate": 2.8680313677417814e-05, + "loss": 0.8879, + "step": 7730 + }, + { + "epoch": 0.3764516835877584, + "grad_norm": 0.09732075035572052, + "learning_rate": 2.8677471782713024e-05, + "loss": 0.5663, + "step": 7731 + }, + { + "epoch": 0.3765003773768656, + "grad_norm": 1.5677207708358765, + "learning_rate": 2.867462967215534e-05, + "loss": 0.9103, + "step": 7732 + }, + { + "epoch": 0.3765490711659728, + "grad_norm": 1.3062429428100586, + "learning_rate": 2.8671787345815448e-05, + "loss": 0.8531, + "step": 7733 + }, + { + "epoch": 0.37659776495507996, + "grad_norm": 1.6706854104995728, + "learning_rate": 2.8668944803764054e-05, + "loss": 0.8011, + "step": 7734 + }, + { + "epoch": 0.3766464587441872, + "grad_norm": 3.5281221866607666, + "learning_rate": 2.8666102046071872e-05, + "loss": 0.8841, + "step": 7735 + }, + { + "epoch": 0.3766951525332944, + "grad_norm": 1.7923072576522827, + "learning_rate": 2.8663259072809605e-05, + "loss": 0.9495, + "step": 7736 + }, + { + "epoch": 0.3767438463224016, + "grad_norm": 1.7884429693222046, + "learning_rate": 2.8660415884047977e-05, + "loss": 0.7899, + "step": 7737 + }, + { + "epoch": 0.37679254011150876, + "grad_norm": 2.2743518352508545, + "learning_rate": 2.8657572479857715e-05, + "loss": 0.8188, + "step": 7738 + }, + { + "epoch": 0.37684123390061597, + "grad_norm": 2.0727851390838623, + "learning_rate": 2.865472886030955e-05, + "loss": 0.7723, + "step": 7739 + }, + { + "epoch": 0.3768899276897232, + "grad_norm": 1.8165236711502075, + "learning_rate": 2.865188502547422e-05, + "loss": 0.7936, + "step": 7740 + }, + { + "epoch": 0.3769386214788304, + "grad_norm": 1.4243358373641968, + "learning_rate": 2.8649040975422457e-05, + "loss": 0.8119, + "step": 7741 + }, + { + "epoch": 0.37698731526793755, + "grad_norm": 2.1545701026916504, + "learning_rate": 2.8646196710225016e-05, + "loss": 0.899, + "step": 7742 + }, + { + "epoch": 0.37703600905704476, + "grad_norm": 0.08815399557352066, + "learning_rate": 2.864335222995264e-05, + "loss": 0.6297, + "step": 7743 + }, + { + "epoch": 0.377084702846152, + "grad_norm": 2.126782178878784, + "learning_rate": 2.864050753467609e-05, + "loss": 0.9792, + "step": 7744 + }, + { + "epoch": 0.3771333966352592, + "grad_norm": 1.5571002960205078, + "learning_rate": 2.8637662624466124e-05, + "loss": 0.8733, + "step": 7745 + }, + { + "epoch": 0.37718209042436635, + "grad_norm": 2.0271010398864746, + "learning_rate": 2.8634817499393514e-05, + "loss": 0.9195, + "step": 7746 + }, + { + "epoch": 0.37723078421347356, + "grad_norm": 1.2275670766830444, + "learning_rate": 2.8631972159529033e-05, + "loss": 0.8466, + "step": 7747 + }, + { + "epoch": 0.37727947800258077, + "grad_norm": 2.425549030303955, + "learning_rate": 2.8629126604943455e-05, + "loss": 0.8448, + "step": 7748 + }, + { + "epoch": 0.377328171791688, + "grad_norm": 3.755560874938965, + "learning_rate": 2.8626280835707567e-05, + "loss": 0.7512, + "step": 7749 + }, + { + "epoch": 0.3773768655807952, + "grad_norm": 1.8598575592041016, + "learning_rate": 2.8623434851892163e-05, + "loss": 0.8873, + "step": 7750 + }, + { + "epoch": 0.37742555936990235, + "grad_norm": 1.5662018060684204, + "learning_rate": 2.8620588653568022e-05, + "loss": 0.8177, + "step": 7751 + }, + { + "epoch": 0.37747425315900957, + "grad_norm": 1.7726696729660034, + "learning_rate": 2.8617742240805958e-05, + "loss": 0.8786, + "step": 7752 + }, + { + "epoch": 0.3775229469481168, + "grad_norm": 1.9987375736236572, + "learning_rate": 2.8614895613676762e-05, + "loss": 0.8954, + "step": 7753 + }, + { + "epoch": 0.377571640737224, + "grad_norm": 1.2473291158676147, + "learning_rate": 2.861204877225126e-05, + "loss": 0.8847, + "step": 7754 + }, + { + "epoch": 0.37762033452633115, + "grad_norm": 1.5330097675323486, + "learning_rate": 2.860920171660026e-05, + "loss": 0.7872, + "step": 7755 + }, + { + "epoch": 0.37766902831543836, + "grad_norm": 1.4792319536209106, + "learning_rate": 2.860635444679458e-05, + "loss": 0.8014, + "step": 7756 + }, + { + "epoch": 0.3777177221045456, + "grad_norm": 1.29299795627594, + "learning_rate": 2.8603506962905047e-05, + "loss": 0.7921, + "step": 7757 + }, + { + "epoch": 0.3777664158936528, + "grad_norm": 1.5286619663238525, + "learning_rate": 2.8600659265002494e-05, + "loss": 0.8003, + "step": 7758 + }, + { + "epoch": 0.37781510968275994, + "grad_norm": 2.074130058288574, + "learning_rate": 2.859781135315776e-05, + "loss": 0.8718, + "step": 7759 + }, + { + "epoch": 0.37786380347186715, + "grad_norm": 1.4686583280563354, + "learning_rate": 2.8594963227441677e-05, + "loss": 0.8028, + "step": 7760 + }, + { + "epoch": 0.37791249726097437, + "grad_norm": 1.403725504875183, + "learning_rate": 2.8592114887925108e-05, + "loss": 0.9092, + "step": 7761 + }, + { + "epoch": 0.3779611910500816, + "grad_norm": 2.9982433319091797, + "learning_rate": 2.8589266334678897e-05, + "loss": 0.8738, + "step": 7762 + }, + { + "epoch": 0.37800988483918874, + "grad_norm": 1.6597980260849, + "learning_rate": 2.8586417567773904e-05, + "loss": 0.8893, + "step": 7763 + }, + { + "epoch": 0.37805857862829595, + "grad_norm": 2.334120512008667, + "learning_rate": 2.858356858728099e-05, + "loss": 0.8681, + "step": 7764 + }, + { + "epoch": 0.37810727241740316, + "grad_norm": 1.2742834091186523, + "learning_rate": 2.8580719393271025e-05, + "loss": 0.7855, + "step": 7765 + }, + { + "epoch": 0.3781559662065104, + "grad_norm": 1.7590466737747192, + "learning_rate": 2.857786998581488e-05, + "loss": 0.8634, + "step": 7766 + }, + { + "epoch": 0.37820465999561753, + "grad_norm": 2.2831015586853027, + "learning_rate": 2.8575020364983436e-05, + "loss": 0.8521, + "step": 7767 + }, + { + "epoch": 0.37825335378472474, + "grad_norm": 1.5927050113677979, + "learning_rate": 2.8572170530847574e-05, + "loss": 0.8986, + "step": 7768 + }, + { + "epoch": 0.37830204757383196, + "grad_norm": 1.358123540878296, + "learning_rate": 2.8569320483478203e-05, + "loss": 0.778, + "step": 7769 + }, + { + "epoch": 0.37835074136293917, + "grad_norm": 1.3692247867584229, + "learning_rate": 2.8566470222946193e-05, + "loss": 0.911, + "step": 7770 + }, + { + "epoch": 0.3783994351520464, + "grad_norm": 1.701195240020752, + "learning_rate": 2.856361974932246e-05, + "loss": 0.8123, + "step": 7771 + }, + { + "epoch": 0.37844812894115354, + "grad_norm": 1.9167296886444092, + "learning_rate": 2.8560769062677905e-05, + "loss": 0.8626, + "step": 7772 + }, + { + "epoch": 0.37849682273026075, + "grad_norm": 1.1616493463516235, + "learning_rate": 2.8557918163083437e-05, + "loss": 0.781, + "step": 7773 + }, + { + "epoch": 0.37854551651936796, + "grad_norm": 1.5773837566375732, + "learning_rate": 2.8555067050609975e-05, + "loss": 0.8392, + "step": 7774 + }, + { + "epoch": 0.3785942103084752, + "grad_norm": 1.3524798154830933, + "learning_rate": 2.855221572532844e-05, + "loss": 0.8743, + "step": 7775 + }, + { + "epoch": 0.37864290409758233, + "grad_norm": 1.55650794506073, + "learning_rate": 2.854936418730976e-05, + "loss": 0.8453, + "step": 7776 + }, + { + "epoch": 0.37869159788668955, + "grad_norm": 2.2838423252105713, + "learning_rate": 2.854651243662486e-05, + "loss": 0.8457, + "step": 7777 + }, + { + "epoch": 0.37874029167579676, + "grad_norm": 1.781203031539917, + "learning_rate": 2.8543660473344687e-05, + "loss": 0.84, + "step": 7778 + }, + { + "epoch": 0.37878898546490397, + "grad_norm": 1.4875352382659912, + "learning_rate": 2.854080829754019e-05, + "loss": 0.8698, + "step": 7779 + }, + { + "epoch": 0.37883767925401113, + "grad_norm": 1.4823042154312134, + "learning_rate": 2.85379559092823e-05, + "loss": 0.7973, + "step": 7780 + }, + { + "epoch": 0.37888637304311834, + "grad_norm": 1.3500452041625977, + "learning_rate": 2.8535103308641976e-05, + "loss": 0.7949, + "step": 7781 + }, + { + "epoch": 0.37893506683222555, + "grad_norm": 1.8560004234313965, + "learning_rate": 2.853225049569018e-05, + "loss": 0.9016, + "step": 7782 + }, + { + "epoch": 0.37898376062133277, + "grad_norm": 1.4903485774993896, + "learning_rate": 2.8529397470497878e-05, + "loss": 0.8949, + "step": 7783 + }, + { + "epoch": 0.3790324544104399, + "grad_norm": 1.2902978658676147, + "learning_rate": 2.8526544233136036e-05, + "loss": 0.7956, + "step": 7784 + }, + { + "epoch": 0.37908114819954714, + "grad_norm": 0.08710569143295288, + "learning_rate": 2.8523690783675627e-05, + "loss": 0.6149, + "step": 7785 + }, + { + "epoch": 0.37912984198865435, + "grad_norm": 1.7402973175048828, + "learning_rate": 2.8520837122187638e-05, + "loss": 0.8909, + "step": 7786 + }, + { + "epoch": 0.37917853577776156, + "grad_norm": 1.8059196472167969, + "learning_rate": 2.8517983248743043e-05, + "loss": 0.8531, + "step": 7787 + }, + { + "epoch": 0.3792272295668688, + "grad_norm": 2.5387582778930664, + "learning_rate": 2.8515129163412835e-05, + "loss": 0.8825, + "step": 7788 + }, + { + "epoch": 0.37927592335597593, + "grad_norm": 1.4011861085891724, + "learning_rate": 2.8512274866268015e-05, + "loss": 0.781, + "step": 7789 + }, + { + "epoch": 0.37932461714508314, + "grad_norm": 2.327171802520752, + "learning_rate": 2.8509420357379585e-05, + "loss": 0.9508, + "step": 7790 + }, + { + "epoch": 0.37937331093419036, + "grad_norm": 1.5435771942138672, + "learning_rate": 2.850656563681855e-05, + "loss": 0.8152, + "step": 7791 + }, + { + "epoch": 0.37942200472329757, + "grad_norm": 1.4636722803115845, + "learning_rate": 2.8503710704655915e-05, + "loss": 0.821, + "step": 7792 + }, + { + "epoch": 0.3794706985124047, + "grad_norm": 3.0491456985473633, + "learning_rate": 2.85008555609627e-05, + "loss": 0.855, + "step": 7793 + }, + { + "epoch": 0.37951939230151194, + "grad_norm": 1.6907098293304443, + "learning_rate": 2.849800020580993e-05, + "loss": 0.8697, + "step": 7794 + }, + { + "epoch": 0.37956808609061915, + "grad_norm": 1.4060590267181396, + "learning_rate": 2.8495144639268625e-05, + "loss": 0.8754, + "step": 7795 + }, + { + "epoch": 0.37961677987972636, + "grad_norm": 1.4307098388671875, + "learning_rate": 2.8492288861409833e-05, + "loss": 0.7765, + "step": 7796 + }, + { + "epoch": 0.3796654736688335, + "grad_norm": 1.866998553276062, + "learning_rate": 2.8489432872304568e-05, + "loss": 0.8462, + "step": 7797 + }, + { + "epoch": 0.37971416745794073, + "grad_norm": 1.301499605178833, + "learning_rate": 2.84865766720239e-05, + "loss": 0.8884, + "step": 7798 + }, + { + "epoch": 0.37976286124704794, + "grad_norm": 1.4763953685760498, + "learning_rate": 2.8483720260638853e-05, + "loss": 0.8442, + "step": 7799 + }, + { + "epoch": 0.37981155503615516, + "grad_norm": 1.288808822631836, + "learning_rate": 2.8480863638220494e-05, + "loss": 0.897, + "step": 7800 + }, + { + "epoch": 0.3798602488252623, + "grad_norm": 1.796863079071045, + "learning_rate": 2.8478006804839885e-05, + "loss": 0.8234, + "step": 7801 + }, + { + "epoch": 0.3799089426143695, + "grad_norm": 1.4605014324188232, + "learning_rate": 2.8475149760568073e-05, + "loss": 0.9414, + "step": 7802 + }, + { + "epoch": 0.37995763640347674, + "grad_norm": 1.1120854616165161, + "learning_rate": 2.8472292505476144e-05, + "loss": 0.9164, + "step": 7803 + }, + { + "epoch": 0.38000633019258395, + "grad_norm": 1.750766396522522, + "learning_rate": 2.846943503963517e-05, + "loss": 0.8583, + "step": 7804 + }, + { + "epoch": 0.3800550239816911, + "grad_norm": 1.3003476858139038, + "learning_rate": 2.846657736311622e-05, + "loss": 0.8648, + "step": 7805 + }, + { + "epoch": 0.3801037177707983, + "grad_norm": 2.10872745513916, + "learning_rate": 2.8463719475990387e-05, + "loss": 0.836, + "step": 7806 + }, + { + "epoch": 0.38015241155990553, + "grad_norm": 2.493170738220215, + "learning_rate": 2.8460861378328767e-05, + "loss": 0.8746, + "step": 7807 + }, + { + "epoch": 0.38020110534901275, + "grad_norm": 1.4864981174468994, + "learning_rate": 2.8458003070202442e-05, + "loss": 0.9529, + "step": 7808 + }, + { + "epoch": 0.38024979913811996, + "grad_norm": 1.2768694162368774, + "learning_rate": 2.8455144551682525e-05, + "loss": 0.8807, + "step": 7809 + }, + { + "epoch": 0.3802984929272271, + "grad_norm": 1.2520126104354858, + "learning_rate": 2.8452285822840105e-05, + "loss": 0.852, + "step": 7810 + }, + { + "epoch": 0.38034718671633433, + "grad_norm": 1.5296595096588135, + "learning_rate": 2.8449426883746313e-05, + "loss": 0.8137, + "step": 7811 + }, + { + "epoch": 0.38039588050544154, + "grad_norm": 1.354884147644043, + "learning_rate": 2.8446567734472263e-05, + "loss": 0.8397, + "step": 7812 + }, + { + "epoch": 0.38044457429454875, + "grad_norm": 2.024913787841797, + "learning_rate": 2.844370837508906e-05, + "loss": 0.9083, + "step": 7813 + }, + { + "epoch": 0.3804932680836559, + "grad_norm": 1.3734794855117798, + "learning_rate": 2.8440848805667848e-05, + "loss": 0.8531, + "step": 7814 + }, + { + "epoch": 0.3805419618727631, + "grad_norm": 1.621413230895996, + "learning_rate": 2.8437989026279747e-05, + "loss": 0.8934, + "step": 7815 + }, + { + "epoch": 0.38059065566187034, + "grad_norm": 1.5505571365356445, + "learning_rate": 2.84351290369959e-05, + "loss": 0.7994, + "step": 7816 + }, + { + "epoch": 0.38063934945097755, + "grad_norm": 1.2536770105361938, + "learning_rate": 2.8432268837887456e-05, + "loss": 0.9057, + "step": 7817 + }, + { + "epoch": 0.3806880432400847, + "grad_norm": 1.7369664907455444, + "learning_rate": 2.8429408429025547e-05, + "loss": 0.8743, + "step": 7818 + }, + { + "epoch": 0.3807367370291919, + "grad_norm": 1.9838459491729736, + "learning_rate": 2.842654781048134e-05, + "loss": 0.8584, + "step": 7819 + }, + { + "epoch": 0.38078543081829913, + "grad_norm": 1.1214925050735474, + "learning_rate": 2.8423686982325988e-05, + "loss": 0.8967, + "step": 7820 + }, + { + "epoch": 0.38083412460740634, + "grad_norm": 1.1957290172576904, + "learning_rate": 2.842082594463065e-05, + "loss": 0.7285, + "step": 7821 + }, + { + "epoch": 0.3808828183965135, + "grad_norm": 1.85651433467865, + "learning_rate": 2.8417964697466508e-05, + "loss": 0.8226, + "step": 7822 + }, + { + "epoch": 0.3809315121856207, + "grad_norm": 1.393196702003479, + "learning_rate": 2.841510324090472e-05, + "loss": 0.8715, + "step": 7823 + }, + { + "epoch": 0.3809802059747279, + "grad_norm": 1.1409492492675781, + "learning_rate": 2.841224157501647e-05, + "loss": 0.8065, + "step": 7824 + }, + { + "epoch": 0.38102889976383514, + "grad_norm": 1.6122952699661255, + "learning_rate": 2.8409379699872945e-05, + "loss": 0.8464, + "step": 7825 + }, + { + "epoch": 0.3810775935529423, + "grad_norm": 2.2896499633789062, + "learning_rate": 2.8406517615545334e-05, + "loss": 0.8579, + "step": 7826 + }, + { + "epoch": 0.3811262873420495, + "grad_norm": 1.6566886901855469, + "learning_rate": 2.8403655322104828e-05, + "loss": 0.8325, + "step": 7827 + }, + { + "epoch": 0.3811749811311567, + "grad_norm": 1.7276071310043335, + "learning_rate": 2.8400792819622635e-05, + "loss": 0.8531, + "step": 7828 + }, + { + "epoch": 0.38122367492026393, + "grad_norm": 1.5190061330795288, + "learning_rate": 2.8397930108169953e-05, + "loss": 0.9352, + "step": 7829 + }, + { + "epoch": 0.38127236870937115, + "grad_norm": 1.3872915506362915, + "learning_rate": 2.8395067187817994e-05, + "loss": 0.8237, + "step": 7830 + }, + { + "epoch": 0.3813210624984783, + "grad_norm": 1.7565979957580566, + "learning_rate": 2.839220405863797e-05, + "loss": 0.7966, + "step": 7831 + }, + { + "epoch": 0.3813697562875855, + "grad_norm": 1.4491231441497803, + "learning_rate": 2.8389340720701106e-05, + "loss": 0.7736, + "step": 7832 + }, + { + "epoch": 0.3814184500766927, + "grad_norm": 2.038215160369873, + "learning_rate": 2.838647717407863e-05, + "loss": 0.8256, + "step": 7833 + }, + { + "epoch": 0.38146714386579994, + "grad_norm": 1.6268579959869385, + "learning_rate": 2.8383613418841773e-05, + "loss": 0.6941, + "step": 7834 + }, + { + "epoch": 0.3815158376549071, + "grad_norm": 1.5090171098709106, + "learning_rate": 2.838074945506176e-05, + "loss": 0.8784, + "step": 7835 + }, + { + "epoch": 0.3815645314440143, + "grad_norm": 2.0031578540802, + "learning_rate": 2.8377885282809838e-05, + "loss": 0.8784, + "step": 7836 + }, + { + "epoch": 0.3816132252331215, + "grad_norm": 3.6098203659057617, + "learning_rate": 2.8375020902157265e-05, + "loss": 0.7458, + "step": 7837 + }, + { + "epoch": 0.38166191902222873, + "grad_norm": 2.6279141902923584, + "learning_rate": 2.837215631317528e-05, + "loss": 0.7755, + "step": 7838 + }, + { + "epoch": 0.3817106128113359, + "grad_norm": 2.938798427581787, + "learning_rate": 2.836929151593514e-05, + "loss": 0.8638, + "step": 7839 + }, + { + "epoch": 0.3817593066004431, + "grad_norm": 1.8860574960708618, + "learning_rate": 2.836642651050811e-05, + "loss": 0.8721, + "step": 7840 + }, + { + "epoch": 0.3818080003895503, + "grad_norm": 2.0039966106414795, + "learning_rate": 2.836356129696546e-05, + "loss": 0.9033, + "step": 7841 + }, + { + "epoch": 0.38185669417865753, + "grad_norm": 1.633696436882019, + "learning_rate": 2.8360695875378462e-05, + "loss": 0.8179, + "step": 7842 + }, + { + "epoch": 0.3819053879677647, + "grad_norm": 1.411072850227356, + "learning_rate": 2.835783024581839e-05, + "loss": 0.829, + "step": 7843 + }, + { + "epoch": 0.3819540817568719, + "grad_norm": 1.975454568862915, + "learning_rate": 2.8354964408356532e-05, + "loss": 0.8172, + "step": 7844 + }, + { + "epoch": 0.3820027755459791, + "grad_norm": 1.3518579006195068, + "learning_rate": 2.835209836306417e-05, + "loss": 0.8643, + "step": 7845 + }, + { + "epoch": 0.3820514693350863, + "grad_norm": 1.276013970375061, + "learning_rate": 2.8349232110012596e-05, + "loss": 0.8766, + "step": 7846 + }, + { + "epoch": 0.3821001631241935, + "grad_norm": 1.391140341758728, + "learning_rate": 2.834636564927311e-05, + "loss": 0.873, + "step": 7847 + }, + { + "epoch": 0.3821488569133007, + "grad_norm": 2.457468271255493, + "learning_rate": 2.8343498980917027e-05, + "loss": 0.8135, + "step": 7848 + }, + { + "epoch": 0.3821975507024079, + "grad_norm": 1.582584261894226, + "learning_rate": 2.8340632105015645e-05, + "loss": 0.7919, + "step": 7849 + }, + { + "epoch": 0.3822462444915151, + "grad_norm": 1.673534631729126, + "learning_rate": 2.8337765021640275e-05, + "loss": 0.7215, + "step": 7850 + }, + { + "epoch": 0.38229493828062233, + "grad_norm": 1.839046597480774, + "learning_rate": 2.833489773086224e-05, + "loss": 0.9388, + "step": 7851 + }, + { + "epoch": 0.3823436320697295, + "grad_norm": 1.8405288457870483, + "learning_rate": 2.8332030232752864e-05, + "loss": 0.8267, + "step": 7852 + }, + { + "epoch": 0.3823923258588367, + "grad_norm": 3.5040202140808105, + "learning_rate": 2.8329162527383478e-05, + "loss": 0.804, + "step": 7853 + }, + { + "epoch": 0.3824410196479439, + "grad_norm": 1.1966089010238647, + "learning_rate": 2.832629461482541e-05, + "loss": 0.7904, + "step": 7854 + }, + { + "epoch": 0.3824897134370511, + "grad_norm": 1.627663493156433, + "learning_rate": 2.832342649515001e-05, + "loss": 0.8425, + "step": 7855 + }, + { + "epoch": 0.3825384072261583, + "grad_norm": 1.333345651626587, + "learning_rate": 2.8320558168428615e-05, + "loss": 0.8496, + "step": 7856 + }, + { + "epoch": 0.3825871010152655, + "grad_norm": 1.4352569580078125, + "learning_rate": 2.8317689634732577e-05, + "loss": 0.8578, + "step": 7857 + }, + { + "epoch": 0.3826357948043727, + "grad_norm": 1.7285773754119873, + "learning_rate": 2.831482089413325e-05, + "loss": 0.8518, + "step": 7858 + }, + { + "epoch": 0.3826844885934799, + "grad_norm": 1.5604653358459473, + "learning_rate": 2.8311951946701998e-05, + "loss": 0.9436, + "step": 7859 + }, + { + "epoch": 0.3827331823825871, + "grad_norm": 1.801157832145691, + "learning_rate": 2.8309082792510182e-05, + "loss": 0.8206, + "step": 7860 + }, + { + "epoch": 0.3827818761716943, + "grad_norm": 1.3591485023498535, + "learning_rate": 2.8306213431629168e-05, + "loss": 0.9448, + "step": 7861 + }, + { + "epoch": 0.3828305699608015, + "grad_norm": 1.9046218395233154, + "learning_rate": 2.830334386413035e-05, + "loss": 0.7747, + "step": 7862 + }, + { + "epoch": 0.3828792637499087, + "grad_norm": 1.4950491189956665, + "learning_rate": 2.8300474090085085e-05, + "loss": 0.8528, + "step": 7863 + }, + { + "epoch": 0.3829279575390159, + "grad_norm": 2.6571474075317383, + "learning_rate": 2.8297604109564778e-05, + "loss": 0.8616, + "step": 7864 + }, + { + "epoch": 0.3829766513281231, + "grad_norm": 1.5451966524124146, + "learning_rate": 2.829473392264081e-05, + "loss": 0.8237, + "step": 7865 + }, + { + "epoch": 0.3830253451172303, + "grad_norm": 1.8825700283050537, + "learning_rate": 2.8291863529384572e-05, + "loss": 0.8993, + "step": 7866 + }, + { + "epoch": 0.3830740389063375, + "grad_norm": 1.523960828781128, + "learning_rate": 2.8288992929867482e-05, + "loss": 0.7519, + "step": 7867 + }, + { + "epoch": 0.3831227326954447, + "grad_norm": 1.4241918325424194, + "learning_rate": 2.8286122124160936e-05, + "loss": 0.7847, + "step": 7868 + }, + { + "epoch": 0.3831714264845519, + "grad_norm": 1.5965808629989624, + "learning_rate": 2.8283251112336342e-05, + "loss": 0.8298, + "step": 7869 + }, + { + "epoch": 0.3832201202736591, + "grad_norm": 1.388731598854065, + "learning_rate": 2.8280379894465122e-05, + "loss": 0.8897, + "step": 7870 + }, + { + "epoch": 0.3832688140627663, + "grad_norm": 4.125836372375488, + "learning_rate": 2.8277508470618702e-05, + "loss": 0.7659, + "step": 7871 + }, + { + "epoch": 0.3833175078518735, + "grad_norm": 2.2144532203674316, + "learning_rate": 2.82746368408685e-05, + "loss": 0.8171, + "step": 7872 + }, + { + "epoch": 0.3833662016409807, + "grad_norm": 1.3425158262252808, + "learning_rate": 2.8271765005285953e-05, + "loss": 0.8243, + "step": 7873 + }, + { + "epoch": 0.3834148954300879, + "grad_norm": 2.3053603172302246, + "learning_rate": 2.8268892963942503e-05, + "loss": 0.882, + "step": 7874 + }, + { + "epoch": 0.3834635892191951, + "grad_norm": 1.3155759572982788, + "learning_rate": 2.8266020716909576e-05, + "loss": 0.94, + "step": 7875 + }, + { + "epoch": 0.3835122830083023, + "grad_norm": 1.814414143562317, + "learning_rate": 2.826314826425863e-05, + "loss": 0.8377, + "step": 7876 + }, + { + "epoch": 0.38356097679740947, + "grad_norm": 2.6503713130950928, + "learning_rate": 2.8260275606061126e-05, + "loss": 0.7974, + "step": 7877 + }, + { + "epoch": 0.3836096705865167, + "grad_norm": 1.4600344896316528, + "learning_rate": 2.8257402742388512e-05, + "loss": 0.8436, + "step": 7878 + }, + { + "epoch": 0.3836583643756239, + "grad_norm": 1.5718671083450317, + "learning_rate": 2.825452967331225e-05, + "loss": 0.8094, + "step": 7879 + }, + { + "epoch": 0.3837070581647311, + "grad_norm": 2.1706650257110596, + "learning_rate": 2.825165639890381e-05, + "loss": 0.8256, + "step": 7880 + }, + { + "epoch": 0.38375575195383826, + "grad_norm": 1.6261889934539795, + "learning_rate": 2.8248782919234666e-05, + "loss": 0.8733, + "step": 7881 + }, + { + "epoch": 0.3838044457429455, + "grad_norm": 1.7823717594146729, + "learning_rate": 2.8245909234376295e-05, + "loss": 0.8421, + "step": 7882 + }, + { + "epoch": 0.3838531395320527, + "grad_norm": 2.188302993774414, + "learning_rate": 2.8243035344400174e-05, + "loss": 0.9497, + "step": 7883 + }, + { + "epoch": 0.3839018333211599, + "grad_norm": 1.7312742471694946, + "learning_rate": 2.8240161249377797e-05, + "loss": 0.8671, + "step": 7884 + }, + { + "epoch": 0.38395052711026706, + "grad_norm": 1.7992844581604004, + "learning_rate": 2.8237286949380663e-05, + "loss": 0.8088, + "step": 7885 + }, + { + "epoch": 0.38399922089937427, + "grad_norm": 1.6053191423416138, + "learning_rate": 2.8234412444480263e-05, + "loss": 0.8541, + "step": 7886 + }, + { + "epoch": 0.3840479146884815, + "grad_norm": 2.208097457885742, + "learning_rate": 2.8231537734748102e-05, + "loss": 0.8445, + "step": 7887 + }, + { + "epoch": 0.3840966084775887, + "grad_norm": 2.194242477416992, + "learning_rate": 2.8228662820255697e-05, + "loss": 0.834, + "step": 7888 + }, + { + "epoch": 0.3841453022666959, + "grad_norm": 1.6310187578201294, + "learning_rate": 2.8225787701074545e-05, + "loss": 0.9077, + "step": 7889 + }, + { + "epoch": 0.38419399605580307, + "grad_norm": 1.1595585346221924, + "learning_rate": 2.8222912377276176e-05, + "loss": 0.8289, + "step": 7890 + }, + { + "epoch": 0.3842426898449103, + "grad_norm": 0.09793799370527267, + "learning_rate": 2.8220036848932113e-05, + "loss": 0.6194, + "step": 7891 + }, + { + "epoch": 0.3842913836340175, + "grad_norm": 3.750913143157959, + "learning_rate": 2.8217161116113883e-05, + "loss": 0.9228, + "step": 7892 + }, + { + "epoch": 0.3843400774231247, + "grad_norm": 1.2287977933883667, + "learning_rate": 2.8214285178893024e-05, + "loss": 0.9069, + "step": 7893 + }, + { + "epoch": 0.38438877121223186, + "grad_norm": 1.7367221117019653, + "learning_rate": 2.821140903734107e-05, + "loss": 0.7922, + "step": 7894 + }, + { + "epoch": 0.3844374650013391, + "grad_norm": 2.3821089267730713, + "learning_rate": 2.8208532691529567e-05, + "loss": 0.7785, + "step": 7895 + }, + { + "epoch": 0.3844861587904463, + "grad_norm": 2.727332353591919, + "learning_rate": 2.8205656141530064e-05, + "loss": 0.8967, + "step": 7896 + }, + { + "epoch": 0.3845348525795535, + "grad_norm": 1.7435524463653564, + "learning_rate": 2.820277938741412e-05, + "loss": 0.8221, + "step": 7897 + }, + { + "epoch": 0.38458354636866066, + "grad_norm": 1.8841724395751953, + "learning_rate": 2.819990242925329e-05, + "loss": 0.7645, + "step": 7898 + }, + { + "epoch": 0.38463224015776787, + "grad_norm": 1.592365026473999, + "learning_rate": 2.8197025267119134e-05, + "loss": 0.8085, + "step": 7899 + }, + { + "epoch": 0.3846809339468751, + "grad_norm": 1.6226933002471924, + "learning_rate": 2.819414790108323e-05, + "loss": 0.7775, + "step": 7900 + }, + { + "epoch": 0.3847296277359823, + "grad_norm": 1.837186574935913, + "learning_rate": 2.8191270331217154e-05, + "loss": 0.8903, + "step": 7901 + }, + { + "epoch": 0.38477832152508945, + "grad_norm": 1.6480903625488281, + "learning_rate": 2.818839255759248e-05, + "loss": 0.8776, + "step": 7902 + }, + { + "epoch": 0.38482701531419666, + "grad_norm": 1.4737118482589722, + "learning_rate": 2.818551458028079e-05, + "loss": 0.9147, + "step": 7903 + }, + { + "epoch": 0.3848757091033039, + "grad_norm": 1.556964635848999, + "learning_rate": 2.818263639935368e-05, + "loss": 0.8425, + "step": 7904 + }, + { + "epoch": 0.3849244028924111, + "grad_norm": 5.389280796051025, + "learning_rate": 2.8179758014882744e-05, + "loss": 0.7798, + "step": 7905 + }, + { + "epoch": 0.38497309668151825, + "grad_norm": 1.5764849185943604, + "learning_rate": 2.817687942693958e-05, + "loss": 0.8191, + "step": 7906 + }, + { + "epoch": 0.38502179047062546, + "grad_norm": 1.8091360330581665, + "learning_rate": 2.8174000635595802e-05, + "loss": 0.88, + "step": 7907 + }, + { + "epoch": 0.38507048425973267, + "grad_norm": 2.361194133758545, + "learning_rate": 2.8171121640923007e-05, + "loss": 0.8694, + "step": 7908 + }, + { + "epoch": 0.3851191780488399, + "grad_norm": 1.654784917831421, + "learning_rate": 2.8168242442992818e-05, + "loss": 0.8938, + "step": 7909 + }, + { + "epoch": 0.3851678718379471, + "grad_norm": 2.1513864994049072, + "learning_rate": 2.8165363041876857e-05, + "loss": 0.8931, + "step": 7910 + }, + { + "epoch": 0.38521656562705425, + "grad_norm": 2.429192304611206, + "learning_rate": 2.8162483437646742e-05, + "loss": 0.9039, + "step": 7911 + }, + { + "epoch": 0.38526525941616147, + "grad_norm": 2.0354645252227783, + "learning_rate": 2.8159603630374105e-05, + "loss": 0.8558, + "step": 7912 + }, + { + "epoch": 0.3853139532052687, + "grad_norm": 0.10137780755758286, + "learning_rate": 2.8156723620130585e-05, + "loss": 0.6467, + "step": 7913 + }, + { + "epoch": 0.3853626469943759, + "grad_norm": 0.09570161253213882, + "learning_rate": 2.815384340698782e-05, + "loss": 0.6551, + "step": 7914 + }, + { + "epoch": 0.38541134078348305, + "grad_norm": 0.08993691951036453, + "learning_rate": 2.815096299101746e-05, + "loss": 0.6008, + "step": 7915 + }, + { + "epoch": 0.38546003457259026, + "grad_norm": 0.08896803855895996, + "learning_rate": 2.814808237229115e-05, + "loss": 0.6213, + "step": 7916 + }, + { + "epoch": 0.3855087283616975, + "grad_norm": 2.196057081222534, + "learning_rate": 2.814520155088055e-05, + "loss": 0.7845, + "step": 7917 + }, + { + "epoch": 0.3855574221508047, + "grad_norm": 1.9747647047042847, + "learning_rate": 2.814232052685732e-05, + "loss": 0.8582, + "step": 7918 + }, + { + "epoch": 0.38560611593991184, + "grad_norm": 1.2842406034469604, + "learning_rate": 2.8139439300293123e-05, + "loss": 0.8215, + "step": 7919 + }, + { + "epoch": 0.38565480972901905, + "grad_norm": 2.1299123764038086, + "learning_rate": 2.8136557871259628e-05, + "loss": 0.9044, + "step": 7920 + }, + { + "epoch": 0.38570350351812627, + "grad_norm": 1.3461976051330566, + "learning_rate": 2.8133676239828518e-05, + "loss": 0.7806, + "step": 7921 + }, + { + "epoch": 0.3857521973072335, + "grad_norm": 1.9228030443191528, + "learning_rate": 2.813079440607147e-05, + "loss": 0.8028, + "step": 7922 + }, + { + "epoch": 0.38580089109634064, + "grad_norm": 1.7670353651046753, + "learning_rate": 2.8127912370060172e-05, + "loss": 0.744, + "step": 7923 + }, + { + "epoch": 0.38584958488544785, + "grad_norm": 1.7056940793991089, + "learning_rate": 2.812503013186631e-05, + "loss": 0.8736, + "step": 7924 + }, + { + "epoch": 0.38589827867455506, + "grad_norm": 1.6497055292129517, + "learning_rate": 2.8122147691561576e-05, + "loss": 0.8476, + "step": 7925 + }, + { + "epoch": 0.3859469724636623, + "grad_norm": 3.2596325874328613, + "learning_rate": 2.8119265049217687e-05, + "loss": 0.7656, + "step": 7926 + }, + { + "epoch": 0.38599566625276943, + "grad_norm": 2.5694992542266846, + "learning_rate": 2.8116382204906335e-05, + "loss": 0.8692, + "step": 7927 + }, + { + "epoch": 0.38604436004187664, + "grad_norm": 1.584769606590271, + "learning_rate": 2.8113499158699234e-05, + "loss": 0.896, + "step": 7928 + }, + { + "epoch": 0.38609305383098386, + "grad_norm": 2.0240347385406494, + "learning_rate": 2.8110615910668106e-05, + "loss": 0.8434, + "step": 7929 + }, + { + "epoch": 0.38614174762009107, + "grad_norm": 1.715072751045227, + "learning_rate": 2.810773246088467e-05, + "loss": 0.8515, + "step": 7930 + }, + { + "epoch": 0.3861904414091983, + "grad_norm": 4.898405075073242, + "learning_rate": 2.8104848809420646e-05, + "loss": 0.8675, + "step": 7931 + }, + { + "epoch": 0.38623913519830544, + "grad_norm": 2.6880228519439697, + "learning_rate": 2.8101964956347768e-05, + "loss": 0.8864, + "step": 7932 + }, + { + "epoch": 0.38628782898741265, + "grad_norm": 3.9648845195770264, + "learning_rate": 2.8099080901737773e-05, + "loss": 0.8478, + "step": 7933 + }, + { + "epoch": 0.38633652277651986, + "grad_norm": 1.3644696474075317, + "learning_rate": 2.80961966456624e-05, + "loss": 0.8705, + "step": 7934 + }, + { + "epoch": 0.3863852165656271, + "grad_norm": 2.1134791374206543, + "learning_rate": 2.80933121881934e-05, + "loss": 0.8352, + "step": 7935 + }, + { + "epoch": 0.38643391035473423, + "grad_norm": 1.522156000137329, + "learning_rate": 2.8090427529402523e-05, + "loss": 0.86, + "step": 7936 + }, + { + "epoch": 0.38648260414384145, + "grad_norm": 1.8870229721069336, + "learning_rate": 2.8087542669361525e-05, + "loss": 0.8689, + "step": 7937 + }, + { + "epoch": 0.38653129793294866, + "grad_norm": 1.4216859340667725, + "learning_rate": 2.8084657608142163e-05, + "loss": 0.8109, + "step": 7938 + }, + { + "epoch": 0.38657999172205587, + "grad_norm": 0.09305957704782486, + "learning_rate": 2.8081772345816206e-05, + "loss": 0.6881, + "step": 7939 + }, + { + "epoch": 0.38662868551116303, + "grad_norm": 1.8090050220489502, + "learning_rate": 2.8078886882455426e-05, + "loss": 0.7564, + "step": 7940 + }, + { + "epoch": 0.38667737930027024, + "grad_norm": 2.254188299179077, + "learning_rate": 2.8076001218131597e-05, + "loss": 0.9128, + "step": 7941 + }, + { + "epoch": 0.38672607308937745, + "grad_norm": 2.0758914947509766, + "learning_rate": 2.80731153529165e-05, + "loss": 0.7314, + "step": 7942 + }, + { + "epoch": 0.38677476687848467, + "grad_norm": 1.5161317586898804, + "learning_rate": 2.807022928688192e-05, + "loss": 0.8617, + "step": 7943 + }, + { + "epoch": 0.3868234606675918, + "grad_norm": 2.355351448059082, + "learning_rate": 2.806734302009966e-05, + "loss": 0.7899, + "step": 7944 + }, + { + "epoch": 0.38687215445669904, + "grad_norm": 1.7446006536483765, + "learning_rate": 2.8064456552641498e-05, + "loss": 0.8582, + "step": 7945 + }, + { + "epoch": 0.38692084824580625, + "grad_norm": 1.718522548675537, + "learning_rate": 2.8061569884579252e-05, + "loss": 0.775, + "step": 7946 + }, + { + "epoch": 0.38696954203491346, + "grad_norm": 1.8304154872894287, + "learning_rate": 2.8058683015984714e-05, + "loss": 0.8426, + "step": 7947 + }, + { + "epoch": 0.3870182358240207, + "grad_norm": 1.573499083518982, + "learning_rate": 2.8055795946929702e-05, + "loss": 0.8652, + "step": 7948 + }, + { + "epoch": 0.38706692961312783, + "grad_norm": 2.228606700897217, + "learning_rate": 2.8052908677486034e-05, + "loss": 0.8537, + "step": 7949 + }, + { + "epoch": 0.38711562340223504, + "grad_norm": 1.7768319845199585, + "learning_rate": 2.8050021207725525e-05, + "loss": 0.7883, + "step": 7950 + }, + { + "epoch": 0.38716431719134226, + "grad_norm": 1.7858768701553345, + "learning_rate": 2.804713353772001e-05, + "loss": 0.7804, + "step": 7951 + }, + { + "epoch": 0.38721301098044947, + "grad_norm": 1.389914631843567, + "learning_rate": 2.804424566754131e-05, + "loss": 0.8807, + "step": 7952 + }, + { + "epoch": 0.3872617047695566, + "grad_norm": 2.2849366664886475, + "learning_rate": 2.8041357597261262e-05, + "loss": 0.9137, + "step": 7953 + }, + { + "epoch": 0.38731039855866384, + "grad_norm": 1.454217791557312, + "learning_rate": 2.803846932695172e-05, + "loss": 0.9235, + "step": 7954 + }, + { + "epoch": 0.38735909234777105, + "grad_norm": 1.3127572536468506, + "learning_rate": 2.803558085668451e-05, + "loss": 0.9073, + "step": 7955 + }, + { + "epoch": 0.38740778613687826, + "grad_norm": 1.5363215208053589, + "learning_rate": 2.8032692186531497e-05, + "loss": 0.8043, + "step": 7956 + }, + { + "epoch": 0.3874564799259854, + "grad_norm": 2.7521450519561768, + "learning_rate": 2.8029803316564536e-05, + "loss": 0.9127, + "step": 7957 + }, + { + "epoch": 0.38750517371509263, + "grad_norm": 1.967816710472107, + "learning_rate": 2.8026914246855487e-05, + "loss": 0.8436, + "step": 7958 + }, + { + "epoch": 0.38755386750419984, + "grad_norm": 1.7145715951919556, + "learning_rate": 2.802402497747621e-05, + "loss": 0.8633, + "step": 7959 + }, + { + "epoch": 0.38760256129330706, + "grad_norm": 1.5254733562469482, + "learning_rate": 2.8021135508498585e-05, + "loss": 0.831, + "step": 7960 + }, + { + "epoch": 0.3876512550824142, + "grad_norm": 2.1293880939483643, + "learning_rate": 2.801824583999448e-05, + "loss": 0.8469, + "step": 7961 + }, + { + "epoch": 0.3876999488715214, + "grad_norm": 2.508301019668579, + "learning_rate": 2.801535597203578e-05, + "loss": 0.8219, + "step": 7962 + }, + { + "epoch": 0.38774864266062864, + "grad_norm": 2.066037178039551, + "learning_rate": 2.8012465904694367e-05, + "loss": 0.8791, + "step": 7963 + }, + { + "epoch": 0.38779733644973585, + "grad_norm": 1.3311322927474976, + "learning_rate": 2.800957563804213e-05, + "loss": 0.9079, + "step": 7964 + }, + { + "epoch": 0.387846030238843, + "grad_norm": 1.8478578329086304, + "learning_rate": 2.8006685172150976e-05, + "loss": 0.8736, + "step": 7965 + }, + { + "epoch": 0.3878947240279502, + "grad_norm": 2.11765718460083, + "learning_rate": 2.8003794507092796e-05, + "loss": 0.8208, + "step": 7966 + }, + { + "epoch": 0.38794341781705743, + "grad_norm": 1.8684407472610474, + "learning_rate": 2.8000903642939498e-05, + "loss": 0.8393, + "step": 7967 + }, + { + "epoch": 0.38799211160616465, + "grad_norm": 1.2069605588912964, + "learning_rate": 2.7998012579762995e-05, + "loss": 0.8772, + "step": 7968 + }, + { + "epoch": 0.38804080539527186, + "grad_norm": 2.073631763458252, + "learning_rate": 2.7995121317635193e-05, + "loss": 0.9013, + "step": 7969 + }, + { + "epoch": 0.388089499184379, + "grad_norm": 2.1286604404449463, + "learning_rate": 2.7992229856628027e-05, + "loss": 0.8069, + "step": 7970 + }, + { + "epoch": 0.38813819297348623, + "grad_norm": 2.5813348293304443, + "learning_rate": 2.7989338196813408e-05, + "loss": 0.8185, + "step": 7971 + }, + { + "epoch": 0.38818688676259344, + "grad_norm": 1.780743956565857, + "learning_rate": 2.798644633826328e-05, + "loss": 0.7711, + "step": 7972 + }, + { + "epoch": 0.38823558055170065, + "grad_norm": 2.133759021759033, + "learning_rate": 2.7983554281049566e-05, + "loss": 0.9122, + "step": 7973 + }, + { + "epoch": 0.3882842743408078, + "grad_norm": 1.7465883493423462, + "learning_rate": 2.7980662025244215e-05, + "loss": 0.8025, + "step": 7974 + }, + { + "epoch": 0.388332968129915, + "grad_norm": 1.2934247255325317, + "learning_rate": 2.797776957091916e-05, + "loss": 0.8599, + "step": 7975 + }, + { + "epoch": 0.38838166191902224, + "grad_norm": 0.09333737194538116, + "learning_rate": 2.7974876918146367e-05, + "loss": 0.6401, + "step": 7976 + }, + { + "epoch": 0.38843035570812945, + "grad_norm": 1.980623483657837, + "learning_rate": 2.7971984066997784e-05, + "loss": 0.9033, + "step": 7977 + }, + { + "epoch": 0.3884790494972366, + "grad_norm": 1.9014095067977905, + "learning_rate": 2.7969091017545364e-05, + "loss": 0.8463, + "step": 7978 + }, + { + "epoch": 0.3885277432863438, + "grad_norm": 1.9867887496948242, + "learning_rate": 2.796619776986108e-05, + "loss": 0.8505, + "step": 7979 + }, + { + "epoch": 0.38857643707545103, + "grad_norm": 1.9287118911743164, + "learning_rate": 2.7963304324016907e-05, + "loss": 0.8826, + "step": 7980 + }, + { + "epoch": 0.38862513086455824, + "grad_norm": 2.3100333213806152, + "learning_rate": 2.796041068008481e-05, + "loss": 0.9, + "step": 7981 + }, + { + "epoch": 0.3886738246536654, + "grad_norm": 1.9533928632736206, + "learning_rate": 2.7957516838136772e-05, + "loss": 0.867, + "step": 7982 + }, + { + "epoch": 0.3887225184427726, + "grad_norm": 1.7633978128433228, + "learning_rate": 2.7954622798244776e-05, + "loss": 0.8595, + "step": 7983 + }, + { + "epoch": 0.3887712122318798, + "grad_norm": 1.5376123189926147, + "learning_rate": 2.7951728560480812e-05, + "loss": 0.7885, + "step": 7984 + }, + { + "epoch": 0.38881990602098704, + "grad_norm": 1.7781448364257812, + "learning_rate": 2.7948834124916867e-05, + "loss": 0.9112, + "step": 7985 + }, + { + "epoch": 0.3888685998100942, + "grad_norm": 3.0465235710144043, + "learning_rate": 2.7945939491624963e-05, + "loss": 0.8853, + "step": 7986 + }, + { + "epoch": 0.3889172935992014, + "grad_norm": 2.2641634941101074, + "learning_rate": 2.794304466067708e-05, + "loss": 0.8429, + "step": 7987 + }, + { + "epoch": 0.3889659873883086, + "grad_norm": 1.7814304828643799, + "learning_rate": 2.794014963214524e-05, + "loss": 0.8261, + "step": 7988 + }, + { + "epoch": 0.38901468117741583, + "grad_norm": 1.3190165758132935, + "learning_rate": 2.7937254406101458e-05, + "loss": 0.7603, + "step": 7989 + }, + { + "epoch": 0.38906337496652305, + "grad_norm": 1.4130462408065796, + "learning_rate": 2.7934358982617745e-05, + "loss": 0.8864, + "step": 7990 + }, + { + "epoch": 0.3891120687556302, + "grad_norm": 3.3027045726776123, + "learning_rate": 2.7931463361766126e-05, + "loss": 0.8025, + "step": 7991 + }, + { + "epoch": 0.3891607625447374, + "grad_norm": 4.661304950714111, + "learning_rate": 2.792856754361864e-05, + "loss": 0.9124, + "step": 7992 + }, + { + "epoch": 0.38920945633384463, + "grad_norm": 1.5793654918670654, + "learning_rate": 2.792567152824731e-05, + "loss": 0.8669, + "step": 7993 + }, + { + "epoch": 0.38925815012295184, + "grad_norm": 1.6750692129135132, + "learning_rate": 2.7922775315724173e-05, + "loss": 0.9026, + "step": 7994 + }, + { + "epoch": 0.389306843912059, + "grad_norm": 1.725797414779663, + "learning_rate": 2.791987890612128e-05, + "loss": 0.8423, + "step": 7995 + }, + { + "epoch": 0.3893555377011662, + "grad_norm": 1.523350715637207, + "learning_rate": 2.791698229951068e-05, + "loss": 0.8156, + "step": 7996 + }, + { + "epoch": 0.3894042314902734, + "grad_norm": 3.541461229324341, + "learning_rate": 2.7914085495964425e-05, + "loss": 0.8311, + "step": 7997 + }, + { + "epoch": 0.38945292527938064, + "grad_norm": 2.1255993843078613, + "learning_rate": 2.7911188495554568e-05, + "loss": 0.8644, + "step": 7998 + }, + { + "epoch": 0.3895016190684878, + "grad_norm": 1.4779927730560303, + "learning_rate": 2.7908291298353173e-05, + "loss": 0.8214, + "step": 7999 + }, + { + "epoch": 0.389550312857595, + "grad_norm": 1.8900291919708252, + "learning_rate": 2.7905393904432318e-05, + "loss": 0.7545, + "step": 8000 + }, + { + "epoch": 0.3895990066467022, + "grad_norm": 3.016350507736206, + "learning_rate": 2.7902496313864066e-05, + "loss": 0.9334, + "step": 8001 + }, + { + "epoch": 0.38964770043580943, + "grad_norm": 0.09141139686107635, + "learning_rate": 2.78995985267205e-05, + "loss": 0.6044, + "step": 8002 + }, + { + "epoch": 0.3896963942249166, + "grad_norm": 1.4121041297912598, + "learning_rate": 2.7896700543073697e-05, + "loss": 0.8765, + "step": 8003 + }, + { + "epoch": 0.3897450880140238, + "grad_norm": 0.09133404493331909, + "learning_rate": 2.789380236299575e-05, + "loss": 0.6009, + "step": 8004 + }, + { + "epoch": 0.389793781803131, + "grad_norm": 1.8947142362594604, + "learning_rate": 2.7890903986558748e-05, + "loss": 0.8967, + "step": 8005 + }, + { + "epoch": 0.3898424755922382, + "grad_norm": 1.5075973272323608, + "learning_rate": 2.788800541383479e-05, + "loss": 0.9015, + "step": 8006 + }, + { + "epoch": 0.38989116938134544, + "grad_norm": 1.4924931526184082, + "learning_rate": 2.7885106644895973e-05, + "loss": 0.8734, + "step": 8007 + }, + { + "epoch": 0.3899398631704526, + "grad_norm": 2.0080478191375732, + "learning_rate": 2.7882207679814418e-05, + "loss": 0.8851, + "step": 8008 + }, + { + "epoch": 0.3899885569595598, + "grad_norm": 1.602561116218567, + "learning_rate": 2.7879308518662225e-05, + "loss": 0.9597, + "step": 8009 + }, + { + "epoch": 0.390037250748667, + "grad_norm": 1.3179905414581299, + "learning_rate": 2.787640916151151e-05, + "loss": 0.8252, + "step": 8010 + }, + { + "epoch": 0.39008594453777423, + "grad_norm": 2.1588096618652344, + "learning_rate": 2.7873509608434406e-05, + "loss": 0.9353, + "step": 8011 + }, + { + "epoch": 0.3901346383268814, + "grad_norm": 1.774181842803955, + "learning_rate": 2.7870609859503032e-05, + "loss": 0.8301, + "step": 8012 + }, + { + "epoch": 0.3901833321159886, + "grad_norm": 1.682141661643982, + "learning_rate": 2.7867709914789514e-05, + "loss": 0.7617, + "step": 8013 + }, + { + "epoch": 0.3902320259050958, + "grad_norm": 1.125512957572937, + "learning_rate": 2.7864809774365997e-05, + "loss": 0.8463, + "step": 8014 + }, + { + "epoch": 0.390280719694203, + "grad_norm": 2.060209035873413, + "learning_rate": 2.7861909438304618e-05, + "loss": 0.8139, + "step": 8015 + }, + { + "epoch": 0.3903294134833102, + "grad_norm": 1.3168288469314575, + "learning_rate": 2.785900890667753e-05, + "loss": 0.8275, + "step": 8016 + }, + { + "epoch": 0.3903781072724174, + "grad_norm": 4.472959995269775, + "learning_rate": 2.785610817955688e-05, + "loss": 0.864, + "step": 8017 + }, + { + "epoch": 0.3904268010615246, + "grad_norm": 2.0097532272338867, + "learning_rate": 2.785320725701482e-05, + "loss": 0.8513, + "step": 8018 + }, + { + "epoch": 0.3904754948506318, + "grad_norm": 1.4711389541625977, + "learning_rate": 2.7850306139123515e-05, + "loss": 0.9649, + "step": 8019 + }, + { + "epoch": 0.390524188639739, + "grad_norm": 1.731521487236023, + "learning_rate": 2.7847404825955123e-05, + "loss": 0.8791, + "step": 8020 + }, + { + "epoch": 0.3905728824288462, + "grad_norm": 1.491119384765625, + "learning_rate": 2.7844503317581826e-05, + "loss": 0.7692, + "step": 8021 + }, + { + "epoch": 0.3906215762179534, + "grad_norm": 1.358786940574646, + "learning_rate": 2.784160161407579e-05, + "loss": 0.7793, + "step": 8022 + }, + { + "epoch": 0.3906702700070606, + "grad_norm": 1.218949317932129, + "learning_rate": 2.7838699715509204e-05, + "loss": 0.9138, + "step": 8023 + }, + { + "epoch": 0.3907189637961678, + "grad_norm": 1.860122799873352, + "learning_rate": 2.7835797621954244e-05, + "loss": 0.885, + "step": 8024 + }, + { + "epoch": 0.390767657585275, + "grad_norm": 2.9703457355499268, + "learning_rate": 2.7832895333483104e-05, + "loss": 0.8191, + "step": 8025 + }, + { + "epoch": 0.3908163513743822, + "grad_norm": 2.1818902492523193, + "learning_rate": 2.7829992850167983e-05, + "loss": 0.8765, + "step": 8026 + }, + { + "epoch": 0.3908650451634894, + "grad_norm": 1.4366835355758667, + "learning_rate": 2.7827090172081068e-05, + "loss": 0.7988, + "step": 8027 + }, + { + "epoch": 0.3909137389525966, + "grad_norm": 2.3842685222625732, + "learning_rate": 2.7824187299294574e-05, + "loss": 0.9884, + "step": 8028 + }, + { + "epoch": 0.3909624327417038, + "grad_norm": 1.6528962850570679, + "learning_rate": 2.7821284231880708e-05, + "loss": 0.8994, + "step": 8029 + }, + { + "epoch": 0.391011126530811, + "grad_norm": 1.9348093271255493, + "learning_rate": 2.7818380969911688e-05, + "loss": 0.8215, + "step": 8030 + }, + { + "epoch": 0.3910598203199182, + "grad_norm": 1.55563223361969, + "learning_rate": 2.7815477513459726e-05, + "loss": 0.8491, + "step": 8031 + }, + { + "epoch": 0.3911085141090254, + "grad_norm": 3.1046526432037354, + "learning_rate": 2.7812573862597047e-05, + "loss": 0.7797, + "step": 8032 + }, + { + "epoch": 0.3911572078981326, + "grad_norm": 1.233474850654602, + "learning_rate": 2.7809670017395882e-05, + "loss": 0.7926, + "step": 8033 + }, + { + "epoch": 0.3912059016872398, + "grad_norm": 1.7470158338546753, + "learning_rate": 2.780676597792846e-05, + "loss": 0.8418, + "step": 8034 + }, + { + "epoch": 0.391254595476347, + "grad_norm": 1.6510875225067139, + "learning_rate": 2.7803861744267027e-05, + "loss": 0.8575, + "step": 8035 + }, + { + "epoch": 0.3913032892654542, + "grad_norm": 1.419337272644043, + "learning_rate": 2.7800957316483814e-05, + "loss": 0.9547, + "step": 8036 + }, + { + "epoch": 0.39135198305456137, + "grad_norm": 1.7722829580307007, + "learning_rate": 2.7798052694651082e-05, + "loss": 0.8243, + "step": 8037 + }, + { + "epoch": 0.3914006768436686, + "grad_norm": 1.614212155342102, + "learning_rate": 2.7795147878841084e-05, + "loss": 0.8644, + "step": 8038 + }, + { + "epoch": 0.3914493706327758, + "grad_norm": 1.5991188287734985, + "learning_rate": 2.7792242869126066e-05, + "loss": 0.8649, + "step": 8039 + }, + { + "epoch": 0.391498064421883, + "grad_norm": 2.4787867069244385, + "learning_rate": 2.7789337665578293e-05, + "loss": 0.8742, + "step": 8040 + }, + { + "epoch": 0.39154675821099016, + "grad_norm": 1.4408749341964722, + "learning_rate": 2.7786432268270042e-05, + "loss": 0.8681, + "step": 8041 + }, + { + "epoch": 0.3915954520000974, + "grad_norm": 1.7786595821380615, + "learning_rate": 2.7783526677273572e-05, + "loss": 0.801, + "step": 8042 + }, + { + "epoch": 0.3916441457892046, + "grad_norm": 7.632328987121582, + "learning_rate": 2.7780620892661166e-05, + "loss": 0.8303, + "step": 8043 + }, + { + "epoch": 0.3916928395783118, + "grad_norm": 1.6324411630630493, + "learning_rate": 2.777771491450511e-05, + "loss": 0.867, + "step": 8044 + }, + { + "epoch": 0.39174153336741896, + "grad_norm": 0.09378799796104431, + "learning_rate": 2.7774808742877682e-05, + "loss": 0.5788, + "step": 8045 + }, + { + "epoch": 0.39179022715652617, + "grad_norm": 1.650563359260559, + "learning_rate": 2.7771902377851186e-05, + "loss": 0.8443, + "step": 8046 + }, + { + "epoch": 0.3918389209456334, + "grad_norm": 1.8331454992294312, + "learning_rate": 2.7768995819497906e-05, + "loss": 0.9004, + "step": 8047 + }, + { + "epoch": 0.3918876147347406, + "grad_norm": 2.712599992752075, + "learning_rate": 2.7766089067890143e-05, + "loss": 0.8547, + "step": 8048 + }, + { + "epoch": 0.3919363085238478, + "grad_norm": 2.1296491622924805, + "learning_rate": 2.7763182123100213e-05, + "loss": 0.8931, + "step": 8049 + }, + { + "epoch": 0.39198500231295497, + "grad_norm": 1.776768445968628, + "learning_rate": 2.776027498520041e-05, + "loss": 0.8802, + "step": 8050 + }, + { + "epoch": 0.3920336961020622, + "grad_norm": 1.3438526391983032, + "learning_rate": 2.7757367654263067e-05, + "loss": 0.883, + "step": 8051 + }, + { + "epoch": 0.3920823898911694, + "grad_norm": 1.5940428972244263, + "learning_rate": 2.7754460130360496e-05, + "loss": 0.9052, + "step": 8052 + }, + { + "epoch": 0.3921310836802766, + "grad_norm": 0.09297056496143341, + "learning_rate": 2.7751552413565018e-05, + "loss": 0.6622, + "step": 8053 + }, + { + "epoch": 0.39217977746938376, + "grad_norm": 2.1347076892852783, + "learning_rate": 2.7748644503948968e-05, + "loss": 0.9164, + "step": 8054 + }, + { + "epoch": 0.392228471258491, + "grad_norm": 2.4874799251556396, + "learning_rate": 2.7745736401584683e-05, + "loss": 0.9026, + "step": 8055 + }, + { + "epoch": 0.3922771650475982, + "grad_norm": 1.4789806604385376, + "learning_rate": 2.7742828106544498e-05, + "loss": 0.8165, + "step": 8056 + }, + { + "epoch": 0.3923258588367054, + "grad_norm": 1.5019195079803467, + "learning_rate": 2.773991961890075e-05, + "loss": 0.791, + "step": 8057 + }, + { + "epoch": 0.39237455262581256, + "grad_norm": 3.15205454826355, + "learning_rate": 2.7737010938725805e-05, + "loss": 0.8826, + "step": 8058 + }, + { + "epoch": 0.39242324641491977, + "grad_norm": 1.5078997611999512, + "learning_rate": 2.7734102066092004e-05, + "loss": 0.8272, + "step": 8059 + }, + { + "epoch": 0.392471940204027, + "grad_norm": 1.6321394443511963, + "learning_rate": 2.773119300107171e-05, + "loss": 0.817, + "step": 8060 + }, + { + "epoch": 0.3925206339931342, + "grad_norm": 1.4444210529327393, + "learning_rate": 2.7728283743737286e-05, + "loss": 0.8261, + "step": 8061 + }, + { + "epoch": 0.39256932778224135, + "grad_norm": 1.3788522481918335, + "learning_rate": 2.77253742941611e-05, + "loss": 0.809, + "step": 8062 + }, + { + "epoch": 0.39261802157134856, + "grad_norm": 1.7143429517745972, + "learning_rate": 2.7722464652415526e-05, + "loss": 0.8762, + "step": 8063 + }, + { + "epoch": 0.3926667153604558, + "grad_norm": 0.08550520986318588, + "learning_rate": 2.771955481857293e-05, + "loss": 0.6446, + "step": 8064 + }, + { + "epoch": 0.392715409149563, + "grad_norm": 2.0118846893310547, + "learning_rate": 2.771664479270571e-05, + "loss": 0.8509, + "step": 8065 + }, + { + "epoch": 0.39276410293867015, + "grad_norm": 2.188119888305664, + "learning_rate": 2.7713734574886247e-05, + "loss": 0.7478, + "step": 8066 + }, + { + "epoch": 0.39281279672777736, + "grad_norm": 1.9986164569854736, + "learning_rate": 2.7710824165186934e-05, + "loss": 0.8172, + "step": 8067 + }, + { + "epoch": 0.39286149051688457, + "grad_norm": 1.592031478881836, + "learning_rate": 2.7707913563680167e-05, + "loss": 0.8271, + "step": 8068 + }, + { + "epoch": 0.3929101843059918, + "grad_norm": 1.6895488500595093, + "learning_rate": 2.7705002770438345e-05, + "loss": 0.8182, + "step": 8069 + }, + { + "epoch": 0.392958878095099, + "grad_norm": 1.5647169351577759, + "learning_rate": 2.770209178553388e-05, + "loss": 0.8963, + "step": 8070 + }, + { + "epoch": 0.39300757188420615, + "grad_norm": 1.5099838972091675, + "learning_rate": 2.7699180609039172e-05, + "loss": 0.828, + "step": 8071 + }, + { + "epoch": 0.39305626567331337, + "grad_norm": 1.6556317806243896, + "learning_rate": 2.7696269241026654e-05, + "loss": 0.9316, + "step": 8072 + }, + { + "epoch": 0.3931049594624206, + "grad_norm": 1.6200038194656372, + "learning_rate": 2.7693357681568734e-05, + "loss": 0.8722, + "step": 8073 + }, + { + "epoch": 0.3931536532515278, + "grad_norm": 1.8467748165130615, + "learning_rate": 2.7690445930737832e-05, + "loss": 0.819, + "step": 8074 + }, + { + "epoch": 0.39320234704063495, + "grad_norm": 1.3835322856903076, + "learning_rate": 2.7687533988606396e-05, + "loss": 0.8293, + "step": 8075 + }, + { + "epoch": 0.39325104082974216, + "grad_norm": 3.3749310970306396, + "learning_rate": 2.7684621855246853e-05, + "loss": 0.7555, + "step": 8076 + }, + { + "epoch": 0.3932997346188494, + "grad_norm": 1.7368242740631104, + "learning_rate": 2.768170953073163e-05, + "loss": 0.8414, + "step": 8077 + }, + { + "epoch": 0.3933484284079566, + "grad_norm": 2.482961416244507, + "learning_rate": 2.7678797015133194e-05, + "loss": 0.8494, + "step": 8078 + }, + { + "epoch": 0.39339712219706374, + "grad_norm": 1.542686939239502, + "learning_rate": 2.7675884308523975e-05, + "loss": 0.796, + "step": 8079 + }, + { + "epoch": 0.39344581598617095, + "grad_norm": 1.6067806482315063, + "learning_rate": 2.7672971410976434e-05, + "loss": 0.8735, + "step": 8080 + }, + { + "epoch": 0.39349450977527817, + "grad_norm": 3.624664068222046, + "learning_rate": 2.7670058322563034e-05, + "loss": 0.8126, + "step": 8081 + }, + { + "epoch": 0.3935432035643854, + "grad_norm": 1.4810441732406616, + "learning_rate": 2.7667145043356226e-05, + "loss": 0.7924, + "step": 8082 + }, + { + "epoch": 0.39359189735349254, + "grad_norm": 1.4890161752700806, + "learning_rate": 2.7664231573428488e-05, + "loss": 0.8774, + "step": 8083 + }, + { + "epoch": 0.39364059114259975, + "grad_norm": 1.9387580156326294, + "learning_rate": 2.76613179128523e-05, + "loss": 0.8251, + "step": 8084 + }, + { + "epoch": 0.39368928493170696, + "grad_norm": 1.5329389572143555, + "learning_rate": 2.7658404061700123e-05, + "loss": 0.8008, + "step": 8085 + }, + { + "epoch": 0.3937379787208142, + "grad_norm": 0.09009300172328949, + "learning_rate": 2.7655490020044448e-05, + "loss": 0.5668, + "step": 8086 + }, + { + "epoch": 0.3937866725099214, + "grad_norm": 1.7402923107147217, + "learning_rate": 2.7652575787957758e-05, + "loss": 0.9296, + "step": 8087 + }, + { + "epoch": 0.39383536629902854, + "grad_norm": 1.8066290616989136, + "learning_rate": 2.764966136551255e-05, + "loss": 0.8505, + "step": 8088 + }, + { + "epoch": 0.39388406008813576, + "grad_norm": 2.0088891983032227, + "learning_rate": 2.7646746752781318e-05, + "loss": 0.7993, + "step": 8089 + }, + { + "epoch": 0.39393275387724297, + "grad_norm": 1.6279864311218262, + "learning_rate": 2.7643831949836566e-05, + "loss": 0.7837, + "step": 8090 + }, + { + "epoch": 0.3939814476663502, + "grad_norm": 1.6822251081466675, + "learning_rate": 2.764091695675079e-05, + "loss": 0.9247, + "step": 8091 + }, + { + "epoch": 0.39403014145545734, + "grad_norm": 1.2570159435272217, + "learning_rate": 2.7638001773596514e-05, + "loss": 0.8175, + "step": 8092 + }, + { + "epoch": 0.39407883524456455, + "grad_norm": 6.585788249969482, + "learning_rate": 2.7635086400446245e-05, + "loss": 0.8988, + "step": 8093 + }, + { + "epoch": 0.39412752903367176, + "grad_norm": 1.6611698865890503, + "learning_rate": 2.7632170837372503e-05, + "loss": 0.8053, + "step": 8094 + }, + { + "epoch": 0.394176222822779, + "grad_norm": 1.563685417175293, + "learning_rate": 2.7629255084447823e-05, + "loss": 0.8524, + "step": 8095 + }, + { + "epoch": 0.39422491661188613, + "grad_norm": 1.2778809070587158, + "learning_rate": 2.7626339141744726e-05, + "loss": 0.777, + "step": 8096 + }, + { + "epoch": 0.39427361040099335, + "grad_norm": 1.3744988441467285, + "learning_rate": 2.762342300933574e-05, + "loss": 0.8311, + "step": 8097 + }, + { + "epoch": 0.39432230419010056, + "grad_norm": 1.374714732170105, + "learning_rate": 2.7620506687293417e-05, + "loss": 0.8107, + "step": 8098 + }, + { + "epoch": 0.39437099797920777, + "grad_norm": 1.293127179145813, + "learning_rate": 2.7617590175690297e-05, + "loss": 0.8882, + "step": 8099 + }, + { + "epoch": 0.39441969176831493, + "grad_norm": 2.2377853393554688, + "learning_rate": 2.7614673474598924e-05, + "loss": 0.9454, + "step": 8100 + }, + { + "epoch": 0.39446838555742214, + "grad_norm": 2.6657800674438477, + "learning_rate": 2.7611756584091852e-05, + "loss": 0.8343, + "step": 8101 + }, + { + "epoch": 0.39451707934652935, + "grad_norm": 1.2555959224700928, + "learning_rate": 2.7608839504241647e-05, + "loss": 0.9092, + "step": 8102 + }, + { + "epoch": 0.39456577313563657, + "grad_norm": 2.1442806720733643, + "learning_rate": 2.760592223512086e-05, + "loss": 0.9094, + "step": 8103 + }, + { + "epoch": 0.3946144669247437, + "grad_norm": 1.6947786808013916, + "learning_rate": 2.760300477680207e-05, + "loss": 0.8613, + "step": 8104 + }, + { + "epoch": 0.39466316071385094, + "grad_norm": 1.503467082977295, + "learning_rate": 2.7600087129357836e-05, + "loss": 0.7951, + "step": 8105 + }, + { + "epoch": 0.39471185450295815, + "grad_norm": 1.3632495403289795, + "learning_rate": 2.7597169292860746e-05, + "loss": 0.8107, + "step": 8106 + }, + { + "epoch": 0.39476054829206536, + "grad_norm": 1.4185327291488647, + "learning_rate": 2.7594251267383375e-05, + "loss": 0.8238, + "step": 8107 + }, + { + "epoch": 0.3948092420811726, + "grad_norm": 1.5440726280212402, + "learning_rate": 2.759133305299831e-05, + "loss": 0.7931, + "step": 8108 + }, + { + "epoch": 0.39485793587027973, + "grad_norm": 1.771425485610962, + "learning_rate": 2.758841464977815e-05, + "loss": 0.8924, + "step": 8109 + }, + { + "epoch": 0.39490662965938694, + "grad_norm": 1.8991845846176147, + "learning_rate": 2.758549605779548e-05, + "loss": 0.834, + "step": 8110 + }, + { + "epoch": 0.39495532344849416, + "grad_norm": 1.7584831714630127, + "learning_rate": 2.75825772771229e-05, + "loss": 0.8924, + "step": 8111 + }, + { + "epoch": 0.39500401723760137, + "grad_norm": 1.6851065158843994, + "learning_rate": 2.7579658307833025e-05, + "loss": 0.7612, + "step": 8112 + }, + { + "epoch": 0.3950527110267085, + "grad_norm": 1.5804861783981323, + "learning_rate": 2.7576739149998456e-05, + "loss": 0.8215, + "step": 8113 + }, + { + "epoch": 0.39510140481581574, + "grad_norm": 1.5451661348342896, + "learning_rate": 2.757381980369181e-05, + "loss": 0.8699, + "step": 8114 + }, + { + "epoch": 0.39515009860492295, + "grad_norm": 1.4894050359725952, + "learning_rate": 2.7570900268985704e-05, + "loss": 0.8644, + "step": 8115 + }, + { + "epoch": 0.39519879239403016, + "grad_norm": 1.2676132917404175, + "learning_rate": 2.7567980545952758e-05, + "loss": 0.9094, + "step": 8116 + }, + { + "epoch": 0.3952474861831373, + "grad_norm": 1.4170200824737549, + "learning_rate": 2.7565060634665614e-05, + "loss": 0.8158, + "step": 8117 + }, + { + "epoch": 0.39529617997224453, + "grad_norm": 1.455958366394043, + "learning_rate": 2.7562140535196898e-05, + "loss": 0.8344, + "step": 8118 + }, + { + "epoch": 0.39534487376135174, + "grad_norm": 1.596574306488037, + "learning_rate": 2.7559220247619244e-05, + "loss": 0.8513, + "step": 8119 + }, + { + "epoch": 0.39539356755045896, + "grad_norm": 1.4171229600906372, + "learning_rate": 2.75562997720053e-05, + "loss": 0.8365, + "step": 8120 + }, + { + "epoch": 0.3954422613395661, + "grad_norm": 1.8989452123641968, + "learning_rate": 2.7553379108427707e-05, + "loss": 0.8118, + "step": 8121 + }, + { + "epoch": 0.3954909551286733, + "grad_norm": 1.7199883460998535, + "learning_rate": 2.755045825695912e-05, + "loss": 0.8434, + "step": 8122 + }, + { + "epoch": 0.39553964891778054, + "grad_norm": 1.3398644924163818, + "learning_rate": 2.754753721767219e-05, + "loss": 0.8, + "step": 8123 + }, + { + "epoch": 0.39558834270688775, + "grad_norm": 1.558698058128357, + "learning_rate": 2.7544615990639592e-05, + "loss": 0.928, + "step": 8124 + }, + { + "epoch": 0.3956370364959949, + "grad_norm": 1.5348619222640991, + "learning_rate": 2.7541694575933982e-05, + "loss": 0.8596, + "step": 8125 + }, + { + "epoch": 0.3956857302851021, + "grad_norm": 1.864781141281128, + "learning_rate": 2.7538772973628033e-05, + "loss": 0.899, + "step": 8126 + }, + { + "epoch": 0.39573442407420933, + "grad_norm": 1.430323600769043, + "learning_rate": 2.7535851183794418e-05, + "loss": 0.9862, + "step": 8127 + }, + { + "epoch": 0.39578311786331655, + "grad_norm": 1.3262956142425537, + "learning_rate": 2.7532929206505814e-05, + "loss": 0.9091, + "step": 8128 + }, + { + "epoch": 0.39583181165242376, + "grad_norm": 2.0999932289123535, + "learning_rate": 2.7530007041834915e-05, + "loss": 0.8, + "step": 8129 + }, + { + "epoch": 0.3958805054415309, + "grad_norm": 1.3235492706298828, + "learning_rate": 2.7527084689854392e-05, + "loss": 0.8338, + "step": 8130 + }, + { + "epoch": 0.39592919923063813, + "grad_norm": 2.033724069595337, + "learning_rate": 2.7524162150636965e-05, + "loss": 0.7698, + "step": 8131 + }, + { + "epoch": 0.39597789301974534, + "grad_norm": 1.755784511566162, + "learning_rate": 2.7521239424255313e-05, + "loss": 0.8744, + "step": 8132 + }, + { + "epoch": 0.39602658680885255, + "grad_norm": 2.6333179473876953, + "learning_rate": 2.7518316510782142e-05, + "loss": 0.8689, + "step": 8133 + }, + { + "epoch": 0.3960752805979597, + "grad_norm": 1.4816526174545288, + "learning_rate": 2.7515393410290164e-05, + "loss": 0.8058, + "step": 8134 + }, + { + "epoch": 0.3961239743870669, + "grad_norm": 1.593711256980896, + "learning_rate": 2.7512470122852093e-05, + "loss": 0.8684, + "step": 8135 + }, + { + "epoch": 0.39617266817617414, + "grad_norm": 0.08824921399354935, + "learning_rate": 2.7509546648540643e-05, + "loss": 0.6633, + "step": 8136 + }, + { + "epoch": 0.39622136196528135, + "grad_norm": 1.578614592552185, + "learning_rate": 2.750662298742853e-05, + "loss": 0.8227, + "step": 8137 + }, + { + "epoch": 0.3962700557543885, + "grad_norm": 1.7711389064788818, + "learning_rate": 2.7503699139588488e-05, + "loss": 0.8506, + "step": 8138 + }, + { + "epoch": 0.3963187495434957, + "grad_norm": 1.235784649848938, + "learning_rate": 2.750077510509325e-05, + "loss": 0.8539, + "step": 8139 + }, + { + "epoch": 0.39636744333260293, + "grad_norm": 1.3917571306228638, + "learning_rate": 2.7497850884015544e-05, + "loss": 0.8756, + "step": 8140 + }, + { + "epoch": 0.39641613712171014, + "grad_norm": 1.4029608964920044, + "learning_rate": 2.7494926476428112e-05, + "loss": 0.9271, + "step": 8141 + }, + { + "epoch": 0.3964648309108173, + "grad_norm": 1.6675095558166504, + "learning_rate": 2.74920018824037e-05, + "loss": 0.7901, + "step": 8142 + }, + { + "epoch": 0.3965135246999245, + "grad_norm": 1.3965349197387695, + "learning_rate": 2.7489077102015064e-05, + "loss": 0.8625, + "step": 8143 + }, + { + "epoch": 0.3965622184890317, + "grad_norm": 1.7496904134750366, + "learning_rate": 2.7486152135334947e-05, + "loss": 0.7849, + "step": 8144 + }, + { + "epoch": 0.39661091227813894, + "grad_norm": 1.7649708986282349, + "learning_rate": 2.748322698243611e-05, + "loss": 0.6477, + "step": 8145 + }, + { + "epoch": 0.3966596060672461, + "grad_norm": 1.500646948814392, + "learning_rate": 2.7480301643391327e-05, + "loss": 0.8503, + "step": 8146 + }, + { + "epoch": 0.3967082998563533, + "grad_norm": 1.3057799339294434, + "learning_rate": 2.7477376118273355e-05, + "loss": 0.8962, + "step": 8147 + }, + { + "epoch": 0.3967569936454605, + "grad_norm": 0.08730665594339371, + "learning_rate": 2.7474450407154974e-05, + "loss": 0.6317, + "step": 8148 + }, + { + "epoch": 0.39680568743456773, + "grad_norm": 1.6086280345916748, + "learning_rate": 2.7471524510108952e-05, + "loss": 0.8918, + "step": 8149 + }, + { + "epoch": 0.39685438122367495, + "grad_norm": 1.3757001161575317, + "learning_rate": 2.746859842720808e-05, + "loss": 0.8819, + "step": 8150 + }, + { + "epoch": 0.3969030750127821, + "grad_norm": 1.8191670179367065, + "learning_rate": 2.7465672158525137e-05, + "loss": 0.8345, + "step": 8151 + }, + { + "epoch": 0.3969517688018893, + "grad_norm": 0.08990241587162018, + "learning_rate": 2.7462745704132926e-05, + "loss": 0.5423, + "step": 8152 + }, + { + "epoch": 0.39700046259099653, + "grad_norm": 1.4354254007339478, + "learning_rate": 2.7459819064104225e-05, + "loss": 0.8544, + "step": 8153 + }, + { + "epoch": 0.39704915638010374, + "grad_norm": 1.2835975885391235, + "learning_rate": 2.7456892238511853e-05, + "loss": 0.8102, + "step": 8154 + }, + { + "epoch": 0.3970978501692109, + "grad_norm": 1.4547202587127686, + "learning_rate": 2.7453965227428603e-05, + "loss": 0.8182, + "step": 8155 + }, + { + "epoch": 0.3971465439583181, + "grad_norm": 1.6330407857894897, + "learning_rate": 2.7451038030927293e-05, + "loss": 0.8716, + "step": 8156 + }, + { + "epoch": 0.3971952377474253, + "grad_norm": 1.226080298423767, + "learning_rate": 2.7448110649080727e-05, + "loss": 0.7666, + "step": 8157 + }, + { + "epoch": 0.39724393153653254, + "grad_norm": 1.9900598526000977, + "learning_rate": 2.7445183081961733e-05, + "loss": 0.8393, + "step": 8158 + }, + { + "epoch": 0.3972926253256397, + "grad_norm": 2.526404857635498, + "learning_rate": 2.7442255329643126e-05, + "loss": 0.812, + "step": 8159 + }, + { + "epoch": 0.3973413191147469, + "grad_norm": 1.4385967254638672, + "learning_rate": 2.7439327392197746e-05, + "loss": 0.7836, + "step": 8160 + }, + { + "epoch": 0.3973900129038541, + "grad_norm": 1.5595283508300781, + "learning_rate": 2.7436399269698415e-05, + "loss": 0.8063, + "step": 8161 + }, + { + "epoch": 0.39743870669296133, + "grad_norm": 1.4780514240264893, + "learning_rate": 2.743347096221798e-05, + "loss": 0.8598, + "step": 8162 + }, + { + "epoch": 0.3974874004820685, + "grad_norm": 1.1746704578399658, + "learning_rate": 2.7430542469829266e-05, + "loss": 0.8526, + "step": 8163 + }, + { + "epoch": 0.3975360942711757, + "grad_norm": 1.3834960460662842, + "learning_rate": 2.7427613792605138e-05, + "loss": 0.8634, + "step": 8164 + }, + { + "epoch": 0.3975847880602829, + "grad_norm": 1.342678427696228, + "learning_rate": 2.7424684930618438e-05, + "loss": 0.92, + "step": 8165 + }, + { + "epoch": 0.3976334818493901, + "grad_norm": 1.9688050746917725, + "learning_rate": 2.7421755883942017e-05, + "loss": 0.9408, + "step": 8166 + }, + { + "epoch": 0.39768217563849734, + "grad_norm": 1.4718574285507202, + "learning_rate": 2.741882665264875e-05, + "loss": 0.8957, + "step": 8167 + }, + { + "epoch": 0.3977308694276045, + "grad_norm": 1.4004969596862793, + "learning_rate": 2.7415897236811494e-05, + "loss": 0.7657, + "step": 8168 + }, + { + "epoch": 0.3977795632167117, + "grad_norm": 1.333207607269287, + "learning_rate": 2.7412967636503115e-05, + "loss": 0.9146, + "step": 8169 + }, + { + "epoch": 0.3978282570058189, + "grad_norm": 2.4461331367492676, + "learning_rate": 2.7410037851796493e-05, + "loss": 0.9609, + "step": 8170 + }, + { + "epoch": 0.39787695079492613, + "grad_norm": 2.622971534729004, + "learning_rate": 2.74071078827645e-05, + "loss": 0.9438, + "step": 8171 + }, + { + "epoch": 0.3979256445840333, + "grad_norm": 1.432220697402954, + "learning_rate": 2.7404177729480026e-05, + "loss": 0.7328, + "step": 8172 + }, + { + "epoch": 0.3979743383731405, + "grad_norm": 1.3546569347381592, + "learning_rate": 2.740124739201595e-05, + "loss": 0.7477, + "step": 8173 + }, + { + "epoch": 0.3980230321622477, + "grad_norm": 0.09149825572967529, + "learning_rate": 2.7398316870445177e-05, + "loss": 0.6043, + "step": 8174 + }, + { + "epoch": 0.3980717259513549, + "grad_norm": 1.1930586099624634, + "learning_rate": 2.739538616484059e-05, + "loss": 0.9073, + "step": 8175 + }, + { + "epoch": 0.3981204197404621, + "grad_norm": 1.4211792945861816, + "learning_rate": 2.7392455275275106e-05, + "loss": 0.8465, + "step": 8176 + }, + { + "epoch": 0.3981691135295693, + "grad_norm": 1.2766607999801636, + "learning_rate": 2.7389524201821624e-05, + "loss": 0.7254, + "step": 8177 + }, + { + "epoch": 0.3982178073186765, + "grad_norm": 1.1695442199707031, + "learning_rate": 2.738659294455305e-05, + "loss": 0.8206, + "step": 8178 + }, + { + "epoch": 0.3982665011077837, + "grad_norm": 1.0236047506332397, + "learning_rate": 2.7383661503542303e-05, + "loss": 0.9173, + "step": 8179 + }, + { + "epoch": 0.3983151948968909, + "grad_norm": 1.6597343683242798, + "learning_rate": 2.7380729878862297e-05, + "loss": 0.8691, + "step": 8180 + }, + { + "epoch": 0.3983638886859981, + "grad_norm": 1.3221012353897095, + "learning_rate": 2.737779807058597e-05, + "loss": 0.917, + "step": 8181 + }, + { + "epoch": 0.3984125824751053, + "grad_norm": 1.763620138168335, + "learning_rate": 2.7374866078786242e-05, + "loss": 0.8519, + "step": 8182 + }, + { + "epoch": 0.3984612762642125, + "grad_norm": 1.8822062015533447, + "learning_rate": 2.7371933903536047e-05, + "loss": 0.7901, + "step": 8183 + }, + { + "epoch": 0.3985099700533197, + "grad_norm": 1.3418803215026855, + "learning_rate": 2.7369001544908324e-05, + "loss": 0.7852, + "step": 8184 + }, + { + "epoch": 0.3985586638424269, + "grad_norm": 1.5113061666488647, + "learning_rate": 2.7366069002976017e-05, + "loss": 0.8882, + "step": 8185 + }, + { + "epoch": 0.3986073576315341, + "grad_norm": 1.1739674806594849, + "learning_rate": 2.7363136277812073e-05, + "loss": 0.865, + "step": 8186 + }, + { + "epoch": 0.3986560514206413, + "grad_norm": 1.1761854887008667, + "learning_rate": 2.736020336948944e-05, + "loss": 0.9462, + "step": 8187 + }, + { + "epoch": 0.3987047452097485, + "grad_norm": 1.5190951824188232, + "learning_rate": 2.7357270278081077e-05, + "loss": 0.8358, + "step": 8188 + }, + { + "epoch": 0.3987534389988557, + "grad_norm": 1.7778440713882446, + "learning_rate": 2.7354337003659942e-05, + "loss": 0.8154, + "step": 8189 + }, + { + "epoch": 0.3988021327879629, + "grad_norm": 1.4302369356155396, + "learning_rate": 2.735140354629901e-05, + "loss": 0.8757, + "step": 8190 + }, + { + "epoch": 0.3988508265770701, + "grad_norm": 1.4910098314285278, + "learning_rate": 2.7348469906071248e-05, + "loss": 0.8223, + "step": 8191 + }, + { + "epoch": 0.3988995203661773, + "grad_norm": 1.5513662099838257, + "learning_rate": 2.734553608304961e-05, + "loss": 0.9062, + "step": 8192 + }, + { + "epoch": 0.3989482141552845, + "grad_norm": 2.4078316688537598, + "learning_rate": 2.7342602077307108e-05, + "loss": 0.734, + "step": 8193 + }, + { + "epoch": 0.3989969079443917, + "grad_norm": 1.7908395528793335, + "learning_rate": 2.7339667888916705e-05, + "loss": 0.8679, + "step": 8194 + }, + { + "epoch": 0.3990456017334989, + "grad_norm": 1.518319845199585, + "learning_rate": 2.733673351795139e-05, + "loss": 0.8289, + "step": 8195 + }, + { + "epoch": 0.3990942955226061, + "grad_norm": 1.4906244277954102, + "learning_rate": 2.7333798964484162e-05, + "loss": 0.8678, + "step": 8196 + }, + { + "epoch": 0.39914298931171327, + "grad_norm": 2.23309063911438, + "learning_rate": 2.7330864228588027e-05, + "loss": 0.7926, + "step": 8197 + }, + { + "epoch": 0.3991916831008205, + "grad_norm": 1.6603989601135254, + "learning_rate": 2.732792931033596e-05, + "loss": 0.7616, + "step": 8198 + }, + { + "epoch": 0.3992403768899277, + "grad_norm": 1.4664371013641357, + "learning_rate": 2.7324994209800998e-05, + "loss": 0.8585, + "step": 8199 + }, + { + "epoch": 0.3992890706790349, + "grad_norm": 1.8933137655258179, + "learning_rate": 2.7322058927056126e-05, + "loss": 0.823, + "step": 8200 + }, + { + "epoch": 0.39933776446814206, + "grad_norm": 1.4257022142410278, + "learning_rate": 2.7319123462174377e-05, + "loss": 0.8479, + "step": 8201 + }, + { + "epoch": 0.3993864582572493, + "grad_norm": 1.3208361864089966, + "learning_rate": 2.7316187815228756e-05, + "loss": 0.9524, + "step": 8202 + }, + { + "epoch": 0.3994351520463565, + "grad_norm": 1.4177043437957764, + "learning_rate": 2.7313251986292297e-05, + "loss": 0.8504, + "step": 8203 + }, + { + "epoch": 0.3994838458354637, + "grad_norm": 1.4532054662704468, + "learning_rate": 2.731031597543804e-05, + "loss": 0.78, + "step": 8204 + }, + { + "epoch": 0.39953253962457086, + "grad_norm": 2.099043369293213, + "learning_rate": 2.7307379782738994e-05, + "loss": 0.7222, + "step": 8205 + }, + { + "epoch": 0.39958123341367807, + "grad_norm": 2.380662202835083, + "learning_rate": 2.7304443408268214e-05, + "loss": 0.8221, + "step": 8206 + }, + { + "epoch": 0.3996299272027853, + "grad_norm": 1.599145531654358, + "learning_rate": 2.730150685209874e-05, + "loss": 0.7286, + "step": 8207 + }, + { + "epoch": 0.3996786209918925, + "grad_norm": 1.6236608028411865, + "learning_rate": 2.7298570114303618e-05, + "loss": 0.8199, + "step": 8208 + }, + { + "epoch": 0.3997273147809997, + "grad_norm": 0.09091418981552124, + "learning_rate": 2.7295633194955895e-05, + "loss": 0.6238, + "step": 8209 + }, + { + "epoch": 0.39977600857010687, + "grad_norm": 1.2304375171661377, + "learning_rate": 2.7292696094128633e-05, + "loss": 0.8462, + "step": 8210 + }, + { + "epoch": 0.3998247023592141, + "grad_norm": 1.4287195205688477, + "learning_rate": 2.7289758811894893e-05, + "loss": 0.8581, + "step": 8211 + }, + { + "epoch": 0.3998733961483213, + "grad_norm": 2.33557391166687, + "learning_rate": 2.7286821348327732e-05, + "loss": 0.9093, + "step": 8212 + }, + { + "epoch": 0.3999220899374285, + "grad_norm": 1.4625738859176636, + "learning_rate": 2.728388370350023e-05, + "loss": 0.8672, + "step": 8213 + }, + { + "epoch": 0.39997078372653566, + "grad_norm": 2.4916763305664062, + "learning_rate": 2.7280945877485457e-05, + "loss": 0.8369, + "step": 8214 + }, + { + "epoch": 0.4000194775156429, + "grad_norm": 1.7325576543807983, + "learning_rate": 2.7278007870356495e-05, + "loss": 0.7847, + "step": 8215 + }, + { + "epoch": 0.4000681713047501, + "grad_norm": 1.637642741203308, + "learning_rate": 2.7275069682186418e-05, + "loss": 0.9866, + "step": 8216 + }, + { + "epoch": 0.4001168650938573, + "grad_norm": 1.6121951341629028, + "learning_rate": 2.727213131304832e-05, + "loss": 0.8903, + "step": 8217 + }, + { + "epoch": 0.40016555888296446, + "grad_norm": 2.0042903423309326, + "learning_rate": 2.726919276301529e-05, + "loss": 0.882, + "step": 8218 + }, + { + "epoch": 0.40021425267207167, + "grad_norm": 2.1436214447021484, + "learning_rate": 2.7266254032160433e-05, + "loss": 0.8887, + "step": 8219 + }, + { + "epoch": 0.4002629464611789, + "grad_norm": 1.4794602394104004, + "learning_rate": 2.7263315120556842e-05, + "loss": 0.789, + "step": 8220 + }, + { + "epoch": 0.4003116402502861, + "grad_norm": 2.0859217643737793, + "learning_rate": 2.7260376028277628e-05, + "loss": 0.8964, + "step": 8221 + }, + { + "epoch": 0.40036033403939325, + "grad_norm": 1.7593212127685547, + "learning_rate": 2.725743675539589e-05, + "loss": 0.8273, + "step": 8222 + }, + { + "epoch": 0.40040902782850046, + "grad_norm": 1.7797644138336182, + "learning_rate": 2.725449730198476e-05, + "loss": 0.8899, + "step": 8223 + }, + { + "epoch": 0.4004577216176077, + "grad_norm": 1.6228784322738647, + "learning_rate": 2.7251557668117343e-05, + "loss": 0.8158, + "step": 8224 + }, + { + "epoch": 0.4005064154067149, + "grad_norm": 1.6243619918823242, + "learning_rate": 2.7248617853866767e-05, + "loss": 0.8223, + "step": 8225 + }, + { + "epoch": 0.40055510919582205, + "grad_norm": 2.910403251647949, + "learning_rate": 2.7245677859306163e-05, + "loss": 0.8772, + "step": 8226 + }, + { + "epoch": 0.40060380298492926, + "grad_norm": 1.33454430103302, + "learning_rate": 2.7242737684508664e-05, + "loss": 0.8559, + "step": 8227 + }, + { + "epoch": 0.40065249677403647, + "grad_norm": 1.34198796749115, + "learning_rate": 2.7239797329547406e-05, + "loss": 0.9502, + "step": 8228 + }, + { + "epoch": 0.4007011905631437, + "grad_norm": 1.8850641250610352, + "learning_rate": 2.7236856794495525e-05, + "loss": 0.9236, + "step": 8229 + }, + { + "epoch": 0.4007498843522509, + "grad_norm": 1.5991383790969849, + "learning_rate": 2.7233916079426172e-05, + "loss": 0.8261, + "step": 8230 + }, + { + "epoch": 0.40079857814135805, + "grad_norm": 1.3598577976226807, + "learning_rate": 2.7230975184412495e-05, + "loss": 0.7573, + "step": 8231 + }, + { + "epoch": 0.40084727193046527, + "grad_norm": 1.2992604970932007, + "learning_rate": 2.7228034109527652e-05, + "loss": 0.821, + "step": 8232 + }, + { + "epoch": 0.4008959657195725, + "grad_norm": 1.8016350269317627, + "learning_rate": 2.7225092854844803e-05, + "loss": 0.8601, + "step": 8233 + }, + { + "epoch": 0.4009446595086797, + "grad_norm": 1.3782798051834106, + "learning_rate": 2.722215142043711e-05, + "loss": 0.8785, + "step": 8234 + }, + { + "epoch": 0.40099335329778685, + "grad_norm": 1.182951807975769, + "learning_rate": 2.721920980637774e-05, + "loss": 0.8163, + "step": 8235 + }, + { + "epoch": 0.40104204708689406, + "grad_norm": 1.8330780267715454, + "learning_rate": 2.7216268012739875e-05, + "loss": 0.8525, + "step": 8236 + }, + { + "epoch": 0.4010907408760013, + "grad_norm": 0.09096072614192963, + "learning_rate": 2.721332603959668e-05, + "loss": 0.675, + "step": 8237 + }, + { + "epoch": 0.4011394346651085, + "grad_norm": 1.6169862747192383, + "learning_rate": 2.721038388702134e-05, + "loss": 0.8982, + "step": 8238 + }, + { + "epoch": 0.40118812845421564, + "grad_norm": 1.6682543754577637, + "learning_rate": 2.7207441555087045e-05, + "loss": 0.8628, + "step": 8239 + }, + { + "epoch": 0.40123682224332285, + "grad_norm": 2.245190143585205, + "learning_rate": 2.7204499043866987e-05, + "loss": 0.8964, + "step": 8240 + }, + { + "epoch": 0.40128551603243007, + "grad_norm": 2.0826687812805176, + "learning_rate": 2.720155635343436e-05, + "loss": 0.8911, + "step": 8241 + }, + { + "epoch": 0.4013342098215373, + "grad_norm": 1.6725901365280151, + "learning_rate": 2.7198613483862356e-05, + "loss": 0.8141, + "step": 8242 + }, + { + "epoch": 0.40138290361064444, + "grad_norm": 1.393888235092163, + "learning_rate": 2.7195670435224182e-05, + "loss": 0.7866, + "step": 8243 + }, + { + "epoch": 0.40143159739975165, + "grad_norm": 2.133195161819458, + "learning_rate": 2.719272720759306e-05, + "loss": 0.8303, + "step": 8244 + }, + { + "epoch": 0.40148029118885886, + "grad_norm": 1.3743349313735962, + "learning_rate": 2.7189783801042187e-05, + "loss": 0.7448, + "step": 8245 + }, + { + "epoch": 0.4015289849779661, + "grad_norm": 2.253507614135742, + "learning_rate": 2.7186840215644785e-05, + "loss": 0.781, + "step": 8246 + }, + { + "epoch": 0.4015776787670733, + "grad_norm": 1.3148884773254395, + "learning_rate": 2.718389645147408e-05, + "loss": 0.8671, + "step": 8247 + }, + { + "epoch": 0.40162637255618044, + "grad_norm": 2.1828830242156982, + "learning_rate": 2.71809525086033e-05, + "loss": 0.8981, + "step": 8248 + }, + { + "epoch": 0.40167506634528766, + "grad_norm": 7.699026107788086, + "learning_rate": 2.717800838710567e-05, + "loss": 0.8422, + "step": 8249 + }, + { + "epoch": 0.40172376013439487, + "grad_norm": 2.13332462310791, + "learning_rate": 2.7175064087054423e-05, + "loss": 0.8075, + "step": 8250 + }, + { + "epoch": 0.4017724539235021, + "grad_norm": 1.5767288208007812, + "learning_rate": 2.7172119608522803e-05, + "loss": 0.887, + "step": 8251 + }, + { + "epoch": 0.40182114771260924, + "grad_norm": 0.0924324318766594, + "learning_rate": 2.7169174951584056e-05, + "loss": 0.6068, + "step": 8252 + }, + { + "epoch": 0.40186984150171645, + "grad_norm": 0.0861765593290329, + "learning_rate": 2.7166230116311427e-05, + "loss": 0.5635, + "step": 8253 + }, + { + "epoch": 0.40191853529082366, + "grad_norm": 1.7087889909744263, + "learning_rate": 2.7163285102778165e-05, + "loss": 0.8147, + "step": 8254 + }, + { + "epoch": 0.4019672290799309, + "grad_norm": 1.4931721687316895, + "learning_rate": 2.7160339911057543e-05, + "loss": 0.7876, + "step": 8255 + }, + { + "epoch": 0.40201592286903803, + "grad_norm": 1.4716687202453613, + "learning_rate": 2.715739454122281e-05, + "loss": 0.8221, + "step": 8256 + }, + { + "epoch": 0.40206461665814525, + "grad_norm": 1.4103041887283325, + "learning_rate": 2.715444899334723e-05, + "loss": 0.8848, + "step": 8257 + }, + { + "epoch": 0.40211331044725246, + "grad_norm": 1.5588725805282593, + "learning_rate": 2.7151503267504088e-05, + "loss": 0.9195, + "step": 8258 + }, + { + "epoch": 0.40216200423635967, + "grad_norm": 1.544580340385437, + "learning_rate": 2.7148557363766646e-05, + "loss": 0.8139, + "step": 8259 + }, + { + "epoch": 0.40221069802546683, + "grad_norm": 1.322623372077942, + "learning_rate": 2.7145611282208183e-05, + "loss": 0.89, + "step": 8260 + }, + { + "epoch": 0.40225939181457404, + "grad_norm": 1.7109949588775635, + "learning_rate": 2.7142665022901993e-05, + "loss": 0.8881, + "step": 8261 + }, + { + "epoch": 0.40230808560368125, + "grad_norm": 1.314803123474121, + "learning_rate": 2.7139718585921353e-05, + "loss": 0.8553, + "step": 8262 + }, + { + "epoch": 0.40235677939278847, + "grad_norm": 1.3011393547058105, + "learning_rate": 2.713677197133957e-05, + "loss": 0.8659, + "step": 8263 + }, + { + "epoch": 0.4024054731818956, + "grad_norm": 2.664891242980957, + "learning_rate": 2.7133825179229928e-05, + "loss": 0.8283, + "step": 8264 + }, + { + "epoch": 0.40245416697100284, + "grad_norm": 1.3547821044921875, + "learning_rate": 2.7130878209665738e-05, + "loss": 0.9001, + "step": 8265 + }, + { + "epoch": 0.40250286076011005, + "grad_norm": 2.3309035301208496, + "learning_rate": 2.7127931062720306e-05, + "loss": 0.8344, + "step": 8266 + }, + { + "epoch": 0.40255155454921726, + "grad_norm": 1.3251805305480957, + "learning_rate": 2.7124983738466934e-05, + "loss": 0.8259, + "step": 8267 + }, + { + "epoch": 0.4026002483383245, + "grad_norm": 1.8502558469772339, + "learning_rate": 2.712203623697894e-05, + "loss": 0.7415, + "step": 8268 + }, + { + "epoch": 0.40264894212743163, + "grad_norm": 1.801647663116455, + "learning_rate": 2.7119088558329648e-05, + "loss": 0.8143, + "step": 8269 + }, + { + "epoch": 0.40269763591653884, + "grad_norm": 2.049309253692627, + "learning_rate": 2.711614070259238e-05, + "loss": 0.8905, + "step": 8270 + }, + { + "epoch": 0.40274632970564606, + "grad_norm": 1.861258864402771, + "learning_rate": 2.711319266984046e-05, + "loss": 0.8763, + "step": 8271 + }, + { + "epoch": 0.40279502349475327, + "grad_norm": 3.79648756980896, + "learning_rate": 2.7110244460147224e-05, + "loss": 0.8561, + "step": 8272 + }, + { + "epoch": 0.4028437172838604, + "grad_norm": 1.698339581489563, + "learning_rate": 2.710729607358601e-05, + "loss": 0.825, + "step": 8273 + }, + { + "epoch": 0.40289241107296764, + "grad_norm": 1.516753911972046, + "learning_rate": 2.7104347510230156e-05, + "loss": 0.9421, + "step": 8274 + }, + { + "epoch": 0.40294110486207485, + "grad_norm": 1.8754123449325562, + "learning_rate": 2.7101398770153014e-05, + "loss": 0.8644, + "step": 8275 + }, + { + "epoch": 0.40298979865118206, + "grad_norm": 1.4387789964675903, + "learning_rate": 2.709844985342793e-05, + "loss": 0.9701, + "step": 8276 + }, + { + "epoch": 0.4030384924402892, + "grad_norm": 4.082260608673096, + "learning_rate": 2.7095500760128256e-05, + "loss": 0.7889, + "step": 8277 + }, + { + "epoch": 0.40308718622939643, + "grad_norm": 1.6156964302062988, + "learning_rate": 2.7092551490327354e-05, + "loss": 0.8405, + "step": 8278 + }, + { + "epoch": 0.40313588001850365, + "grad_norm": 1.3414232730865479, + "learning_rate": 2.708960204409859e-05, + "loss": 0.7578, + "step": 8279 + }, + { + "epoch": 0.40318457380761086, + "grad_norm": 1.5635372400283813, + "learning_rate": 2.7086652421515326e-05, + "loss": 0.8495, + "step": 8280 + }, + { + "epoch": 0.403233267596718, + "grad_norm": 1.4812133312225342, + "learning_rate": 2.7083702622650933e-05, + "loss": 0.8697, + "step": 8281 + }, + { + "epoch": 0.4032819613858252, + "grad_norm": 1.6470011472702026, + "learning_rate": 2.7080752647578798e-05, + "loss": 0.8467, + "step": 8282 + }, + { + "epoch": 0.40333065517493244, + "grad_norm": 2.170515775680542, + "learning_rate": 2.707780249637229e-05, + "loss": 0.8349, + "step": 8283 + }, + { + "epoch": 0.40337934896403965, + "grad_norm": 1.544174313545227, + "learning_rate": 2.70748521691048e-05, + "loss": 0.7809, + "step": 8284 + }, + { + "epoch": 0.4034280427531468, + "grad_norm": 1.467068076133728, + "learning_rate": 2.7071901665849723e-05, + "loss": 0.9248, + "step": 8285 + }, + { + "epoch": 0.403476736542254, + "grad_norm": 1.5375665426254272, + "learning_rate": 2.7068950986680445e-05, + "loss": 0.8129, + "step": 8286 + }, + { + "epoch": 0.40352543033136123, + "grad_norm": 1.3126322031021118, + "learning_rate": 2.7066000131670365e-05, + "loss": 0.8706, + "step": 8287 + }, + { + "epoch": 0.40357412412046845, + "grad_norm": 3.432560443878174, + "learning_rate": 2.706304910089289e-05, + "loss": 0.8498, + "step": 8288 + }, + { + "epoch": 0.40362281790957566, + "grad_norm": 1.668662667274475, + "learning_rate": 2.706009789442142e-05, + "loss": 0.8181, + "step": 8289 + }, + { + "epoch": 0.4036715116986828, + "grad_norm": 2.3357162475585938, + "learning_rate": 2.7057146512329374e-05, + "loss": 0.7958, + "step": 8290 + }, + { + "epoch": 0.40372020548779003, + "grad_norm": 1.5399070978164673, + "learning_rate": 2.7054194954690164e-05, + "loss": 0.8154, + "step": 8291 + }, + { + "epoch": 0.40376889927689724, + "grad_norm": 1.6201505661010742, + "learning_rate": 2.7051243221577213e-05, + "loss": 0.8551, + "step": 8292 + }, + { + "epoch": 0.40381759306600445, + "grad_norm": 1.6072287559509277, + "learning_rate": 2.7048291313063945e-05, + "loss": 0.8033, + "step": 8293 + }, + { + "epoch": 0.4038662868551116, + "grad_norm": 1.2085816860198975, + "learning_rate": 2.7045339229223785e-05, + "loss": 0.7775, + "step": 8294 + }, + { + "epoch": 0.4039149806442188, + "grad_norm": 1.1806766986846924, + "learning_rate": 2.7042386970130174e-05, + "loss": 0.8089, + "step": 8295 + }, + { + "epoch": 0.40396367443332604, + "grad_norm": 2.002960443496704, + "learning_rate": 2.7039434535856543e-05, + "loss": 0.7645, + "step": 8296 + }, + { + "epoch": 0.40401236822243325, + "grad_norm": 2.095162868499756, + "learning_rate": 2.703648192647633e-05, + "loss": 0.9123, + "step": 8297 + }, + { + "epoch": 0.4040610620115404, + "grad_norm": 1.3048665523529053, + "learning_rate": 2.703352914206299e-05, + "loss": 0.8944, + "step": 8298 + }, + { + "epoch": 0.4041097558006476, + "grad_norm": 2.0601022243499756, + "learning_rate": 2.703057618268998e-05, + "loss": 0.7903, + "step": 8299 + }, + { + "epoch": 0.40415844958975483, + "grad_norm": 0.09406398236751556, + "learning_rate": 2.7027623048430737e-05, + "loss": 0.5897, + "step": 8300 + }, + { + "epoch": 0.40420714337886204, + "grad_norm": 1.5874543190002441, + "learning_rate": 2.702466973935873e-05, + "loss": 0.7826, + "step": 8301 + }, + { + "epoch": 0.4042558371679692, + "grad_norm": 1.4797773361206055, + "learning_rate": 2.7021716255547427e-05, + "loss": 0.8214, + "step": 8302 + }, + { + "epoch": 0.4043045309570764, + "grad_norm": 1.8058546781539917, + "learning_rate": 2.7018762597070288e-05, + "loss": 0.7863, + "step": 8303 + }, + { + "epoch": 0.4043532247461836, + "grad_norm": 1.5323458909988403, + "learning_rate": 2.7015808764000793e-05, + "loss": 0.8553, + "step": 8304 + }, + { + "epoch": 0.40440191853529084, + "grad_norm": 2.2543935775756836, + "learning_rate": 2.7012854756412417e-05, + "loss": 0.7878, + "step": 8305 + }, + { + "epoch": 0.404450612324398, + "grad_norm": 1.7775238752365112, + "learning_rate": 2.7009900574378636e-05, + "loss": 0.8346, + "step": 8306 + }, + { + "epoch": 0.4044993061135052, + "grad_norm": 2.172761917114258, + "learning_rate": 2.7006946217972938e-05, + "loss": 0.7985, + "step": 8307 + }, + { + "epoch": 0.4045479999026124, + "grad_norm": 1.9751285314559937, + "learning_rate": 2.7003991687268815e-05, + "loss": 0.8017, + "step": 8308 + }, + { + "epoch": 0.40459669369171963, + "grad_norm": 1.9370198249816895, + "learning_rate": 2.7001036982339765e-05, + "loss": 0.8451, + "step": 8309 + }, + { + "epoch": 0.40464538748082685, + "grad_norm": 1.43328857421875, + "learning_rate": 2.699808210325928e-05, + "loss": 0.9212, + "step": 8310 + }, + { + "epoch": 0.404694081269934, + "grad_norm": 1.613964557647705, + "learning_rate": 2.6995127050100856e-05, + "loss": 0.8484, + "step": 8311 + }, + { + "epoch": 0.4047427750590412, + "grad_norm": 2.4412670135498047, + "learning_rate": 2.6992171822938017e-05, + "loss": 0.8416, + "step": 8312 + }, + { + "epoch": 0.40479146884814843, + "grad_norm": 1.4950710535049438, + "learning_rate": 2.698921642184426e-05, + "loss": 0.8979, + "step": 8313 + }, + { + "epoch": 0.40484016263725564, + "grad_norm": 1.3360570669174194, + "learning_rate": 2.698626084689312e-05, + "loss": 0.8809, + "step": 8314 + }, + { + "epoch": 0.4048888564263628, + "grad_norm": 2.430452346801758, + "learning_rate": 2.698330509815809e-05, + "loss": 0.8793, + "step": 8315 + }, + { + "epoch": 0.40493755021547, + "grad_norm": 0.08884581178426743, + "learning_rate": 2.698034917571272e-05, + "loss": 0.5871, + "step": 8316 + }, + { + "epoch": 0.4049862440045772, + "grad_norm": 0.0922030359506607, + "learning_rate": 2.697739307963052e-05, + "loss": 0.5845, + "step": 8317 + }, + { + "epoch": 0.40503493779368444, + "grad_norm": 1.4638108015060425, + "learning_rate": 2.6974436809985033e-05, + "loss": 0.7983, + "step": 8318 + }, + { + "epoch": 0.4050836315827916, + "grad_norm": 1.5215303897857666, + "learning_rate": 2.6971480366849796e-05, + "loss": 0.8681, + "step": 8319 + }, + { + "epoch": 0.4051323253718988, + "grad_norm": 1.432372808456421, + "learning_rate": 2.6968523750298346e-05, + "loss": 0.8461, + "step": 8320 + }, + { + "epoch": 0.405181019161006, + "grad_norm": 1.258187174797058, + "learning_rate": 2.6965566960404233e-05, + "loss": 0.7852, + "step": 8321 + }, + { + "epoch": 0.40522971295011323, + "grad_norm": 1.521830439567566, + "learning_rate": 2.6962609997241013e-05, + "loss": 0.8939, + "step": 8322 + }, + { + "epoch": 0.4052784067392204, + "grad_norm": 1.8699671030044556, + "learning_rate": 2.695965286088223e-05, + "loss": 0.7956, + "step": 8323 + }, + { + "epoch": 0.4053271005283276, + "grad_norm": 2.9355199337005615, + "learning_rate": 2.6956695551401452e-05, + "loss": 0.7928, + "step": 8324 + }, + { + "epoch": 0.4053757943174348, + "grad_norm": 1.4521420001983643, + "learning_rate": 2.6953738068872232e-05, + "loss": 0.8815, + "step": 8325 + }, + { + "epoch": 0.405424488106542, + "grad_norm": 2.181938648223877, + "learning_rate": 2.6950780413368143e-05, + "loss": 0.8572, + "step": 8326 + }, + { + "epoch": 0.40547318189564924, + "grad_norm": 1.7115545272827148, + "learning_rate": 2.6947822584962763e-05, + "loss": 0.7484, + "step": 8327 + }, + { + "epoch": 0.4055218756847564, + "grad_norm": 1.2540394067764282, + "learning_rate": 2.694486458372966e-05, + "loss": 0.9287, + "step": 8328 + }, + { + "epoch": 0.4055705694738636, + "grad_norm": 1.4640767574310303, + "learning_rate": 2.6941906409742417e-05, + "loss": 0.887, + "step": 8329 + }, + { + "epoch": 0.4056192632629708, + "grad_norm": 1.6952636241912842, + "learning_rate": 2.693894806307462e-05, + "loss": 0.9225, + "step": 8330 + }, + { + "epoch": 0.40566795705207803, + "grad_norm": 1.3932478427886963, + "learning_rate": 2.6935989543799852e-05, + "loss": 0.7773, + "step": 8331 + }, + { + "epoch": 0.4057166508411852, + "grad_norm": 1.3010717630386353, + "learning_rate": 2.693303085199172e-05, + "loss": 0.7303, + "step": 8332 + }, + { + "epoch": 0.4057653446302924, + "grad_norm": 2.628725290298462, + "learning_rate": 2.6930071987723804e-05, + "loss": 0.8782, + "step": 8333 + }, + { + "epoch": 0.4058140384193996, + "grad_norm": 1.7057108879089355, + "learning_rate": 2.692711295106972e-05, + "loss": 0.889, + "step": 8334 + }, + { + "epoch": 0.4058627322085068, + "grad_norm": 1.8263722658157349, + "learning_rate": 2.692415374210307e-05, + "loss": 0.9163, + "step": 8335 + }, + { + "epoch": 0.405911425997614, + "grad_norm": 2.2377877235412598, + "learning_rate": 2.692119436089747e-05, + "loss": 0.9064, + "step": 8336 + }, + { + "epoch": 0.4059601197867212, + "grad_norm": 2.2454583644866943, + "learning_rate": 2.691823480752652e-05, + "loss": 0.8581, + "step": 8337 + }, + { + "epoch": 0.4060088135758284, + "grad_norm": 1.8611252307891846, + "learning_rate": 2.6915275082063853e-05, + "loss": 0.8905, + "step": 8338 + }, + { + "epoch": 0.4060575073649356, + "grad_norm": 1.426903486251831, + "learning_rate": 2.691231518458309e-05, + "loss": 0.8267, + "step": 8339 + }, + { + "epoch": 0.4061062011540428, + "grad_norm": 1.9396440982818604, + "learning_rate": 2.690935511515785e-05, + "loss": 0.9335, + "step": 8340 + }, + { + "epoch": 0.40615489494315, + "grad_norm": 1.763663649559021, + "learning_rate": 2.690639487386177e-05, + "loss": 0.8442, + "step": 8341 + }, + { + "epoch": 0.4062035887322572, + "grad_norm": 1.705317735671997, + "learning_rate": 2.69034344607685e-05, + "loss": 0.9281, + "step": 8342 + }, + { + "epoch": 0.4062522825213644, + "grad_norm": 1.9794481992721558, + "learning_rate": 2.690047387595166e-05, + "loss": 0.8062, + "step": 8343 + }, + { + "epoch": 0.4063009763104716, + "grad_norm": 1.3608225584030151, + "learning_rate": 2.6897513119484907e-05, + "loss": 0.8953, + "step": 8344 + }, + { + "epoch": 0.4063496700995788, + "grad_norm": 3.54133939743042, + "learning_rate": 2.689455219144188e-05, + "loss": 0.8492, + "step": 8345 + }, + { + "epoch": 0.406398363888686, + "grad_norm": 1.5580974817276, + "learning_rate": 2.6891591091896248e-05, + "loss": 0.8038, + "step": 8346 + }, + { + "epoch": 0.4064470576777932, + "grad_norm": 1.3120155334472656, + "learning_rate": 2.6888629820921647e-05, + "loss": 0.7739, + "step": 8347 + }, + { + "epoch": 0.4064957514669004, + "grad_norm": 2.1264090538024902, + "learning_rate": 2.688566837859176e-05, + "loss": 0.8543, + "step": 8348 + }, + { + "epoch": 0.4065444452560076, + "grad_norm": 1.6068642139434814, + "learning_rate": 2.688270676498024e-05, + "loss": 0.8839, + "step": 8349 + }, + { + "epoch": 0.4065931390451148, + "grad_norm": 1.6333250999450684, + "learning_rate": 2.6879744980160766e-05, + "loss": 0.8104, + "step": 8350 + }, + { + "epoch": 0.406641832834222, + "grad_norm": 1.3040298223495483, + "learning_rate": 2.6876783024207002e-05, + "loss": 0.8358, + "step": 8351 + }, + { + "epoch": 0.4066905266233292, + "grad_norm": 1.3991966247558594, + "learning_rate": 2.687382089719264e-05, + "loss": 0.7528, + "step": 8352 + }, + { + "epoch": 0.4067392204124364, + "grad_norm": 1.181322693824768, + "learning_rate": 2.687085859919136e-05, + "loss": 0.8295, + "step": 8353 + }, + { + "epoch": 0.4067879142015436, + "grad_norm": 1.9219956398010254, + "learning_rate": 2.6867896130276835e-05, + "loss": 0.8214, + "step": 8354 + }, + { + "epoch": 0.4068366079906508, + "grad_norm": 1.5106332302093506, + "learning_rate": 2.686493349052277e-05, + "loss": 0.8018, + "step": 8355 + }, + { + "epoch": 0.406885301779758, + "grad_norm": 1.2670210599899292, + "learning_rate": 2.6861970680002864e-05, + "loss": 0.7605, + "step": 8356 + }, + { + "epoch": 0.40693399556886517, + "grad_norm": 1.6441291570663452, + "learning_rate": 2.6859007698790808e-05, + "loss": 0.8533, + "step": 8357 + }, + { + "epoch": 0.4069826893579724, + "grad_norm": 4.525288105010986, + "learning_rate": 2.685604454696031e-05, + "loss": 0.9196, + "step": 8358 + }, + { + "epoch": 0.4070313831470796, + "grad_norm": 1.4731688499450684, + "learning_rate": 2.685308122458508e-05, + "loss": 0.8556, + "step": 8359 + }, + { + "epoch": 0.4070800769361868, + "grad_norm": 1.3803755044937134, + "learning_rate": 2.6850117731738826e-05, + "loss": 0.805, + "step": 8360 + }, + { + "epoch": 0.40712877072529396, + "grad_norm": 1.6813819408416748, + "learning_rate": 2.6847154068495272e-05, + "loss": 0.8107, + "step": 8361 + }, + { + "epoch": 0.4071774645144012, + "grad_norm": 1.77072012424469, + "learning_rate": 2.684419023492814e-05, + "loss": 0.88, + "step": 8362 + }, + { + "epoch": 0.4072261583035084, + "grad_norm": 1.5983251333236694, + "learning_rate": 2.6841226231111145e-05, + "loss": 0.7918, + "step": 8363 + }, + { + "epoch": 0.4072748520926156, + "grad_norm": 1.6932235956192017, + "learning_rate": 2.6838262057118026e-05, + "loss": 0.8997, + "step": 8364 + }, + { + "epoch": 0.40732354588172276, + "grad_norm": 1.5622800588607788, + "learning_rate": 2.683529771302252e-05, + "loss": 0.7648, + "step": 8365 + }, + { + "epoch": 0.40737223967082997, + "grad_norm": 4.351684093475342, + "learning_rate": 2.6832333198898353e-05, + "loss": 0.7715, + "step": 8366 + }, + { + "epoch": 0.4074209334599372, + "grad_norm": 1.4523377418518066, + "learning_rate": 2.682936851481928e-05, + "loss": 0.8656, + "step": 8367 + }, + { + "epoch": 0.4074696272490444, + "grad_norm": 3.2914135456085205, + "learning_rate": 2.6826403660859042e-05, + "loss": 0.8793, + "step": 8368 + }, + { + "epoch": 0.4075183210381516, + "grad_norm": 1.7859067916870117, + "learning_rate": 2.6823438637091385e-05, + "loss": 0.7953, + "step": 8369 + }, + { + "epoch": 0.40756701482725877, + "grad_norm": 1.4805561304092407, + "learning_rate": 2.682047344359007e-05, + "loss": 0.8048, + "step": 8370 + }, + { + "epoch": 0.407615708616366, + "grad_norm": 1.6731648445129395, + "learning_rate": 2.6817508080428862e-05, + "loss": 0.8049, + "step": 8371 + }, + { + "epoch": 0.4076644024054732, + "grad_norm": 1.5701426267623901, + "learning_rate": 2.6814542547681516e-05, + "loss": 0.8593, + "step": 8372 + }, + { + "epoch": 0.4077130961945804, + "grad_norm": 1.4364346265792847, + "learning_rate": 2.6811576845421803e-05, + "loss": 0.8182, + "step": 8373 + }, + { + "epoch": 0.40776178998368756, + "grad_norm": 1.5628186464309692, + "learning_rate": 2.6808610973723496e-05, + "loss": 0.8515, + "step": 8374 + }, + { + "epoch": 0.4078104837727948, + "grad_norm": 1.7244545221328735, + "learning_rate": 2.6805644932660372e-05, + "loss": 0.8439, + "step": 8375 + }, + { + "epoch": 0.407859177561902, + "grad_norm": 2.1872403621673584, + "learning_rate": 2.680267872230621e-05, + "loss": 0.8953, + "step": 8376 + }, + { + "epoch": 0.4079078713510092, + "grad_norm": 1.8331416845321655, + "learning_rate": 2.679971234273479e-05, + "loss": 0.7857, + "step": 8377 + }, + { + "epoch": 0.40795656514011636, + "grad_norm": 2.5359325408935547, + "learning_rate": 2.6796745794019906e-05, + "loss": 0.9238, + "step": 8378 + }, + { + "epoch": 0.40800525892922357, + "grad_norm": 2.4045679569244385, + "learning_rate": 2.6793779076235354e-05, + "loss": 0.7018, + "step": 8379 + }, + { + "epoch": 0.4080539527183308, + "grad_norm": 3.1887500286102295, + "learning_rate": 2.679081218945492e-05, + "loss": 0.827, + "step": 8380 + }, + { + "epoch": 0.408102646507438, + "grad_norm": 4.171352863311768, + "learning_rate": 2.6787845133752422e-05, + "loss": 0.813, + "step": 8381 + }, + { + "epoch": 0.40815134029654515, + "grad_norm": 1.7649405002593994, + "learning_rate": 2.6784877909201658e-05, + "loss": 0.883, + "step": 8382 + }, + { + "epoch": 0.40820003408565236, + "grad_norm": 1.2382205724716187, + "learning_rate": 2.6781910515876434e-05, + "loss": 0.8845, + "step": 8383 + }, + { + "epoch": 0.4082487278747596, + "grad_norm": 2.037372350692749, + "learning_rate": 2.6778942953850565e-05, + "loss": 0.7616, + "step": 8384 + }, + { + "epoch": 0.4082974216638668, + "grad_norm": 1.4422895908355713, + "learning_rate": 2.677597522319787e-05, + "loss": 0.839, + "step": 8385 + }, + { + "epoch": 0.40834611545297395, + "grad_norm": 1.5240942239761353, + "learning_rate": 2.6773007323992188e-05, + "loss": 0.8811, + "step": 8386 + }, + { + "epoch": 0.40839480924208116, + "grad_norm": 1.525321364402771, + "learning_rate": 2.6770039256307318e-05, + "loss": 0.8506, + "step": 8387 + }, + { + "epoch": 0.40844350303118837, + "grad_norm": 2.1665899753570557, + "learning_rate": 2.676707102021711e-05, + "loss": 0.8707, + "step": 8388 + }, + { + "epoch": 0.4084921968202956, + "grad_norm": 0.09293504804372787, + "learning_rate": 2.6764102615795394e-05, + "loss": 0.6044, + "step": 8389 + }, + { + "epoch": 0.4085408906094028, + "grad_norm": 1.8633975982666016, + "learning_rate": 2.6761134043116e-05, + "loss": 0.8708, + "step": 8390 + }, + { + "epoch": 0.40858958439850995, + "grad_norm": 2.2720704078674316, + "learning_rate": 2.675816530225279e-05, + "loss": 0.8897, + "step": 8391 + }, + { + "epoch": 0.40863827818761717, + "grad_norm": 1.8378870487213135, + "learning_rate": 2.6755196393279593e-05, + "loss": 0.8244, + "step": 8392 + }, + { + "epoch": 0.4086869719767244, + "grad_norm": 2.789961576461792, + "learning_rate": 2.6752227316270273e-05, + "loss": 0.8395, + "step": 8393 + }, + { + "epoch": 0.4087356657658316, + "grad_norm": 1.9890395402908325, + "learning_rate": 2.674925807129869e-05, + "loss": 0.8632, + "step": 8394 + }, + { + "epoch": 0.40878435955493875, + "grad_norm": 1.5187584161758423, + "learning_rate": 2.6746288658438692e-05, + "loss": 0.7547, + "step": 8395 + }, + { + "epoch": 0.40883305334404596, + "grad_norm": 1.4081848859786987, + "learning_rate": 2.6743319077764147e-05, + "loss": 0.9209, + "step": 8396 + }, + { + "epoch": 0.4088817471331532, + "grad_norm": 2.3063466548919678, + "learning_rate": 2.674034932934893e-05, + "loss": 0.833, + "step": 8397 + }, + { + "epoch": 0.4089304409222604, + "grad_norm": 3.1959407329559326, + "learning_rate": 2.6737379413266902e-05, + "loss": 0.8446, + "step": 8398 + }, + { + "epoch": 0.40897913471136754, + "grad_norm": 1.5559316873550415, + "learning_rate": 2.673440932959195e-05, + "loss": 0.7007, + "step": 8399 + }, + { + "epoch": 0.40902782850047475, + "grad_norm": 1.5878055095672607, + "learning_rate": 2.673143907839795e-05, + "loss": 0.8417, + "step": 8400 + }, + { + "epoch": 0.40907652228958197, + "grad_norm": 1.4770944118499756, + "learning_rate": 2.672846865975879e-05, + "loss": 0.9252, + "step": 8401 + }, + { + "epoch": 0.4091252160786892, + "grad_norm": 1.35516357421875, + "learning_rate": 2.672549807374836e-05, + "loss": 0.8362, + "step": 8402 + }, + { + "epoch": 0.40917390986779634, + "grad_norm": 1.941485047340393, + "learning_rate": 2.6722527320440557e-05, + "loss": 0.9099, + "step": 8403 + }, + { + "epoch": 0.40922260365690355, + "grad_norm": 1.93259859085083, + "learning_rate": 2.671955639990927e-05, + "loss": 0.8269, + "step": 8404 + }, + { + "epoch": 0.40927129744601076, + "grad_norm": 1.2206854820251465, + "learning_rate": 2.6716585312228406e-05, + "loss": 0.7747, + "step": 8405 + }, + { + "epoch": 0.409319991235118, + "grad_norm": 2.3448455333709717, + "learning_rate": 2.6713614057471863e-05, + "loss": 0.9669, + "step": 8406 + }, + { + "epoch": 0.4093686850242252, + "grad_norm": 1.7582335472106934, + "learning_rate": 2.6710642635713565e-05, + "loss": 0.8196, + "step": 8407 + }, + { + "epoch": 0.40941737881333234, + "grad_norm": 1.6558265686035156, + "learning_rate": 2.670767104702742e-05, + "loss": 0.8387, + "step": 8408 + }, + { + "epoch": 0.40946607260243956, + "grad_norm": 6.856170177459717, + "learning_rate": 2.6704699291487343e-05, + "loss": 0.8048, + "step": 8409 + }, + { + "epoch": 0.40951476639154677, + "grad_norm": 1.5083256959915161, + "learning_rate": 2.6701727369167258e-05, + "loss": 0.8917, + "step": 8410 + }, + { + "epoch": 0.409563460180654, + "grad_norm": 1.4255387783050537, + "learning_rate": 2.6698755280141097e-05, + "loss": 0.9444, + "step": 8411 + }, + { + "epoch": 0.40961215396976114, + "grad_norm": 1.970474123954773, + "learning_rate": 2.6695783024482786e-05, + "loss": 0.9548, + "step": 8412 + }, + { + "epoch": 0.40966084775886835, + "grad_norm": 2.086235284805298, + "learning_rate": 2.6692810602266264e-05, + "loss": 0.8389, + "step": 8413 + }, + { + "epoch": 0.40970954154797556, + "grad_norm": 1.6808727979660034, + "learning_rate": 2.6689838013565467e-05, + "loss": 0.8038, + "step": 8414 + }, + { + "epoch": 0.4097582353370828, + "grad_norm": 2.2407455444335938, + "learning_rate": 2.6686865258454338e-05, + "loss": 0.867, + "step": 8415 + }, + { + "epoch": 0.40980692912618993, + "grad_norm": 1.8958194255828857, + "learning_rate": 2.668389233700683e-05, + "loss": 0.7264, + "step": 8416 + }, + { + "epoch": 0.40985562291529715, + "grad_norm": 0.08951272815465927, + "learning_rate": 2.6680919249296886e-05, + "loss": 0.6642, + "step": 8417 + }, + { + "epoch": 0.40990431670440436, + "grad_norm": 1.5953565835952759, + "learning_rate": 2.667794599539847e-05, + "loss": 0.9802, + "step": 8418 + }, + { + "epoch": 0.40995301049351157, + "grad_norm": 1.635690450668335, + "learning_rate": 2.667497257538554e-05, + "loss": 0.8913, + "step": 8419 + }, + { + "epoch": 0.41000170428261873, + "grad_norm": 1.6924457550048828, + "learning_rate": 2.6671998989332052e-05, + "loss": 0.8172, + "step": 8420 + }, + { + "epoch": 0.41005039807172594, + "grad_norm": 1.8784006834030151, + "learning_rate": 2.666902523731198e-05, + "loss": 0.88, + "step": 8421 + }, + { + "epoch": 0.41009909186083315, + "grad_norm": 1.371732234954834, + "learning_rate": 2.66660513193993e-05, + "loss": 0.8238, + "step": 8422 + }, + { + "epoch": 0.41014778564994037, + "grad_norm": 1.7217763662338257, + "learning_rate": 2.6663077235667987e-05, + "loss": 0.8927, + "step": 8423 + }, + { + "epoch": 0.4101964794390475, + "grad_norm": 1.660207748413086, + "learning_rate": 2.6660102986192025e-05, + "loss": 0.8223, + "step": 8424 + }, + { + "epoch": 0.41024517322815474, + "grad_norm": 0.08976351469755173, + "learning_rate": 2.665712857104539e-05, + "loss": 0.6111, + "step": 8425 + }, + { + "epoch": 0.41029386701726195, + "grad_norm": 1.521308183670044, + "learning_rate": 2.6654153990302076e-05, + "loss": 1.0188, + "step": 8426 + }, + { + "epoch": 0.41034256080636916, + "grad_norm": 1.738014578819275, + "learning_rate": 2.6651179244036066e-05, + "loss": 0.9079, + "step": 8427 + }, + { + "epoch": 0.4103912545954764, + "grad_norm": 1.34385085105896, + "learning_rate": 2.6648204332321377e-05, + "loss": 0.8808, + "step": 8428 + }, + { + "epoch": 0.41043994838458353, + "grad_norm": 1.5609232187271118, + "learning_rate": 2.6645229255231996e-05, + "loss": 0.7815, + "step": 8429 + }, + { + "epoch": 0.41048864217369074, + "grad_norm": 1.7137601375579834, + "learning_rate": 2.664225401284193e-05, + "loss": 0.8197, + "step": 8430 + }, + { + "epoch": 0.41053733596279796, + "grad_norm": 1.4088969230651855, + "learning_rate": 2.663927860522519e-05, + "loss": 0.9114, + "step": 8431 + }, + { + "epoch": 0.41058602975190517, + "grad_norm": 1.7725062370300293, + "learning_rate": 2.663630303245579e-05, + "loss": 0.8829, + "step": 8432 + }, + { + "epoch": 0.4106347235410123, + "grad_norm": 1.5569357872009277, + "learning_rate": 2.6633327294607748e-05, + "loss": 0.9611, + "step": 8433 + }, + { + "epoch": 0.41068341733011954, + "grad_norm": 1.753043532371521, + "learning_rate": 2.6630351391755084e-05, + "loss": 0.8792, + "step": 8434 + }, + { + "epoch": 0.41073211111922675, + "grad_norm": 5.004899501800537, + "learning_rate": 2.662737532397182e-05, + "loss": 0.8597, + "step": 8435 + }, + { + "epoch": 0.41078080490833396, + "grad_norm": 2.1812713146209717, + "learning_rate": 2.6624399091331998e-05, + "loss": 0.8351, + "step": 8436 + }, + { + "epoch": 0.4108294986974411, + "grad_norm": 1.5010417699813843, + "learning_rate": 2.6621422693909637e-05, + "loss": 0.9193, + "step": 8437 + }, + { + "epoch": 0.41087819248654833, + "grad_norm": 2.9217207431793213, + "learning_rate": 2.661844613177879e-05, + "loss": 0.8207, + "step": 8438 + }, + { + "epoch": 0.41092688627565555, + "grad_norm": 1.4137144088745117, + "learning_rate": 2.661546940501349e-05, + "loss": 0.9181, + "step": 8439 + }, + { + "epoch": 0.41097558006476276, + "grad_norm": 1.2470252513885498, + "learning_rate": 2.661249251368778e-05, + "loss": 0.7808, + "step": 8440 + }, + { + "epoch": 0.4110242738538699, + "grad_norm": 3.4668352603912354, + "learning_rate": 2.6609515457875722e-05, + "loss": 0.811, + "step": 8441 + }, + { + "epoch": 0.4110729676429771, + "grad_norm": 1.8305193185806274, + "learning_rate": 2.6606538237651357e-05, + "loss": 0.905, + "step": 8442 + }, + { + "epoch": 0.41112166143208434, + "grad_norm": 1.7161105871200562, + "learning_rate": 2.660356085308876e-05, + "loss": 0.7474, + "step": 8443 + }, + { + "epoch": 0.41117035522119155, + "grad_norm": 2.1568145751953125, + "learning_rate": 2.660058330426198e-05, + "loss": 0.7743, + "step": 8444 + }, + { + "epoch": 0.4112190490102987, + "grad_norm": 1.7227604389190674, + "learning_rate": 2.659760559124509e-05, + "loss": 0.84, + "step": 8445 + }, + { + "epoch": 0.4112677427994059, + "grad_norm": 1.7857996225357056, + "learning_rate": 2.659462771411216e-05, + "loss": 0.9123, + "step": 8446 + }, + { + "epoch": 0.41131643658851313, + "grad_norm": 1.4669158458709717, + "learning_rate": 2.6591649672937267e-05, + "loss": 0.9307, + "step": 8447 + }, + { + "epoch": 0.41136513037762035, + "grad_norm": 1.7521398067474365, + "learning_rate": 2.6588671467794483e-05, + "loss": 0.861, + "step": 8448 + }, + { + "epoch": 0.41141382416672756, + "grad_norm": 1.56252920627594, + "learning_rate": 2.658569309875789e-05, + "loss": 0.828, + "step": 8449 + }, + { + "epoch": 0.4114625179558347, + "grad_norm": 2.0467021465301514, + "learning_rate": 2.6582714565901588e-05, + "loss": 0.8006, + "step": 8450 + }, + { + "epoch": 0.41151121174494193, + "grad_norm": 1.676580786705017, + "learning_rate": 2.657973586929966e-05, + "loss": 0.9788, + "step": 8451 + }, + { + "epoch": 0.41155990553404914, + "grad_norm": 1.3856621980667114, + "learning_rate": 2.6576757009026205e-05, + "loss": 0.941, + "step": 8452 + }, + { + "epoch": 0.41160859932315635, + "grad_norm": 1.8353711366653442, + "learning_rate": 2.6573777985155317e-05, + "loss": 0.9449, + "step": 8453 + }, + { + "epoch": 0.4116572931122635, + "grad_norm": 1.8264187574386597, + "learning_rate": 2.6570798797761103e-05, + "loss": 0.8096, + "step": 8454 + }, + { + "epoch": 0.4117059869013707, + "grad_norm": 0.09181281179189682, + "learning_rate": 2.6567819446917668e-05, + "loss": 0.6522, + "step": 8455 + }, + { + "epoch": 0.41175468069047794, + "grad_norm": 1.2604304552078247, + "learning_rate": 2.6564839932699126e-05, + "loss": 0.8607, + "step": 8456 + }, + { + "epoch": 0.41180337447958515, + "grad_norm": 1.7109959125518799, + "learning_rate": 2.656186025517959e-05, + "loss": 0.7992, + "step": 8457 + }, + { + "epoch": 0.4118520682686923, + "grad_norm": 2.8128409385681152, + "learning_rate": 2.6558880414433186e-05, + "loss": 0.8387, + "step": 8458 + }, + { + "epoch": 0.4119007620577995, + "grad_norm": 2.4320123195648193, + "learning_rate": 2.6555900410534028e-05, + "loss": 0.8323, + "step": 8459 + }, + { + "epoch": 0.41194945584690673, + "grad_norm": 1.8988838195800781, + "learning_rate": 2.6552920243556256e-05, + "loss": 0.756, + "step": 8460 + }, + { + "epoch": 0.41199814963601394, + "grad_norm": 1.8170433044433594, + "learning_rate": 2.6549939913573994e-05, + "loss": 0.8412, + "step": 8461 + }, + { + "epoch": 0.4120468434251211, + "grad_norm": 2.051222324371338, + "learning_rate": 2.654695942066138e-05, + "loss": 0.8314, + "step": 8462 + }, + { + "epoch": 0.4120955372142283, + "grad_norm": 2.6579935550689697, + "learning_rate": 2.654397876489255e-05, + "loss": 0.9011, + "step": 8463 + }, + { + "epoch": 0.4121442310033355, + "grad_norm": 2.7005608081817627, + "learning_rate": 2.6540997946341652e-05, + "loss": 0.9133, + "step": 8464 + }, + { + "epoch": 0.41219292479244274, + "grad_norm": 1.7191705703735352, + "learning_rate": 2.6538016965082838e-05, + "loss": 0.8794, + "step": 8465 + }, + { + "epoch": 0.4122416185815499, + "grad_norm": 2.4863333702087402, + "learning_rate": 2.6535035821190255e-05, + "loss": 0.8452, + "step": 8466 + }, + { + "epoch": 0.4122903123706571, + "grad_norm": 1.5367393493652344, + "learning_rate": 2.653205451473806e-05, + "loss": 0.8065, + "step": 8467 + }, + { + "epoch": 0.4123390061597643, + "grad_norm": 1.5601376295089722, + "learning_rate": 2.652907304580041e-05, + "loss": 0.8361, + "step": 8468 + }, + { + "epoch": 0.41238769994887153, + "grad_norm": 1.2525328397750854, + "learning_rate": 2.6526091414451475e-05, + "loss": 0.8167, + "step": 8469 + }, + { + "epoch": 0.41243639373797875, + "grad_norm": 2.1462395191192627, + "learning_rate": 2.6523109620765417e-05, + "loss": 0.7892, + "step": 8470 + }, + { + "epoch": 0.4124850875270859, + "grad_norm": 1.4946976900100708, + "learning_rate": 2.652012766481642e-05, + "loss": 0.915, + "step": 8471 + }, + { + "epoch": 0.4125337813161931, + "grad_norm": 2.1396844387054443, + "learning_rate": 2.651714554667864e-05, + "loss": 0.8321, + "step": 8472 + }, + { + "epoch": 0.41258247510530033, + "grad_norm": 5.849621295928955, + "learning_rate": 2.6514163266426282e-05, + "loss": 0.8389, + "step": 8473 + }, + { + "epoch": 0.41263116889440754, + "grad_norm": 1.8248099088668823, + "learning_rate": 2.6511180824133516e-05, + "loss": 0.8229, + "step": 8474 + }, + { + "epoch": 0.4126798626835147, + "grad_norm": 1.4698896408081055, + "learning_rate": 2.6508198219874536e-05, + "loss": 0.8784, + "step": 8475 + }, + { + "epoch": 0.4127285564726219, + "grad_norm": 1.7649985551834106, + "learning_rate": 2.650521545372353e-05, + "loss": 0.8812, + "step": 8476 + }, + { + "epoch": 0.4127772502617291, + "grad_norm": 0.09280276298522949, + "learning_rate": 2.6502232525754695e-05, + "loss": 0.6856, + "step": 8477 + }, + { + "epoch": 0.41282594405083634, + "grad_norm": 2.534531831741333, + "learning_rate": 2.649924943604223e-05, + "loss": 0.8278, + "step": 8478 + }, + { + "epoch": 0.4128746378399435, + "grad_norm": 1.4271596670150757, + "learning_rate": 2.6496266184660348e-05, + "loss": 0.8872, + "step": 8479 + }, + { + "epoch": 0.4129233316290507, + "grad_norm": 1.6842867136001587, + "learning_rate": 2.6493282771683248e-05, + "loss": 0.887, + "step": 8480 + }, + { + "epoch": 0.4129720254181579, + "grad_norm": 1.2091842889785767, + "learning_rate": 2.649029919718515e-05, + "loss": 0.8039, + "step": 8481 + }, + { + "epoch": 0.41302071920726513, + "grad_norm": 1.4869905710220337, + "learning_rate": 2.648731546124027e-05, + "loss": 0.9724, + "step": 8482 + }, + { + "epoch": 0.4130694129963723, + "grad_norm": 5.036444187164307, + "learning_rate": 2.648433156392282e-05, + "loss": 0.8207, + "step": 8483 + }, + { + "epoch": 0.4131181067854795, + "grad_norm": 2.8294789791107178, + "learning_rate": 2.6481347505307037e-05, + "loss": 0.9144, + "step": 8484 + }, + { + "epoch": 0.4131668005745867, + "grad_norm": 1.7899435758590698, + "learning_rate": 2.6478363285467138e-05, + "loss": 0.9099, + "step": 8485 + }, + { + "epoch": 0.4132154943636939, + "grad_norm": 1.680227279663086, + "learning_rate": 2.647537890447737e-05, + "loss": 0.8646, + "step": 8486 + }, + { + "epoch": 0.41326418815280114, + "grad_norm": 0.09055079519748688, + "learning_rate": 2.647239436241196e-05, + "loss": 0.6582, + "step": 8487 + }, + { + "epoch": 0.4133128819419083, + "grad_norm": 0.09186185896396637, + "learning_rate": 2.6469409659345143e-05, + "loss": 0.5906, + "step": 8488 + }, + { + "epoch": 0.4133615757310155, + "grad_norm": 1.5216258764266968, + "learning_rate": 2.646642479535117e-05, + "loss": 0.8916, + "step": 8489 + }, + { + "epoch": 0.4134102695201227, + "grad_norm": 2.463364839553833, + "learning_rate": 2.64634397705043e-05, + "loss": 0.868, + "step": 8490 + }, + { + "epoch": 0.41345896330922993, + "grad_norm": 1.4657706022262573, + "learning_rate": 2.6460454584878774e-05, + "loss": 0.8276, + "step": 8491 + }, + { + "epoch": 0.4135076570983371, + "grad_norm": 1.4130691289901733, + "learning_rate": 2.6457469238548846e-05, + "loss": 0.861, + "step": 8492 + }, + { + "epoch": 0.4135563508874443, + "grad_norm": 1.5920237302780151, + "learning_rate": 2.6454483731588783e-05, + "loss": 0.8704, + "step": 8493 + }, + { + "epoch": 0.4136050446765515, + "grad_norm": 1.5758217573165894, + "learning_rate": 2.6451498064072852e-05, + "loss": 0.8678, + "step": 8494 + }, + { + "epoch": 0.4136537384656587, + "grad_norm": 1.207223892211914, + "learning_rate": 2.6448512236075318e-05, + "loss": 0.8702, + "step": 8495 + }, + { + "epoch": 0.4137024322547659, + "grad_norm": 0.09235357493162155, + "learning_rate": 2.6445526247670456e-05, + "loss": 0.6716, + "step": 8496 + }, + { + "epoch": 0.4137511260438731, + "grad_norm": 1.2185550928115845, + "learning_rate": 2.6442540098932536e-05, + "loss": 0.9046, + "step": 8497 + }, + { + "epoch": 0.4137998198329803, + "grad_norm": 2.3205065727233887, + "learning_rate": 2.6439553789935844e-05, + "loss": 0.958, + "step": 8498 + }, + { + "epoch": 0.4138485136220875, + "grad_norm": 1.8847427368164062, + "learning_rate": 2.6436567320754663e-05, + "loss": 0.8507, + "step": 8499 + }, + { + "epoch": 0.4138972074111947, + "grad_norm": 1.5652492046356201, + "learning_rate": 2.643358069146329e-05, + "loss": 0.7788, + "step": 8500 + }, + { + "epoch": 0.4139459012003019, + "grad_norm": 1.4097849130630493, + "learning_rate": 2.6430593902136003e-05, + "loss": 0.9353, + "step": 8501 + }, + { + "epoch": 0.4139945949894091, + "grad_norm": 1.7062445878982544, + "learning_rate": 2.642760695284711e-05, + "loss": 0.8904, + "step": 8502 + }, + { + "epoch": 0.4140432887785163, + "grad_norm": 1.4673891067504883, + "learning_rate": 2.642461984367091e-05, + "loss": 0.8369, + "step": 8503 + }, + { + "epoch": 0.4140919825676235, + "grad_norm": 1.7629517316818237, + "learning_rate": 2.6421632574681706e-05, + "loss": 0.8334, + "step": 8504 + }, + { + "epoch": 0.4141406763567307, + "grad_norm": 1.2727620601654053, + "learning_rate": 2.6418645145953804e-05, + "loss": 0.8736, + "step": 8505 + }, + { + "epoch": 0.4141893701458379, + "grad_norm": 1.320972204208374, + "learning_rate": 2.641565755756152e-05, + "loss": 0.8979, + "step": 8506 + }, + { + "epoch": 0.4142380639349451, + "grad_norm": 1.608024001121521, + "learning_rate": 2.6412669809579163e-05, + "loss": 0.8345, + "step": 8507 + }, + { + "epoch": 0.4142867577240523, + "grad_norm": 1.4586389064788818, + "learning_rate": 2.6409681902081063e-05, + "loss": 0.9103, + "step": 8508 + }, + { + "epoch": 0.4143354515131595, + "grad_norm": 5.859705448150635, + "learning_rate": 2.640669383514154e-05, + "loss": 0.8965, + "step": 8509 + }, + { + "epoch": 0.4143841453022667, + "grad_norm": 1.3448201417922974, + "learning_rate": 2.6403705608834932e-05, + "loss": 0.8998, + "step": 8510 + }, + { + "epoch": 0.4144328390913739, + "grad_norm": 1.61177659034729, + "learning_rate": 2.6400717223235556e-05, + "loss": 0.939, + "step": 8511 + }, + { + "epoch": 0.4144815328804811, + "grad_norm": 1.290376901626587, + "learning_rate": 2.6397728678417757e-05, + "loss": 0.9547, + "step": 8512 + }, + { + "epoch": 0.4145302266695883, + "grad_norm": 1.4586893320083618, + "learning_rate": 2.6394739974455874e-05, + "loss": 0.8038, + "step": 8513 + }, + { + "epoch": 0.4145789204586955, + "grad_norm": 1.7297769784927368, + "learning_rate": 2.6391751111424256e-05, + "loss": 0.7996, + "step": 8514 + }, + { + "epoch": 0.4146276142478027, + "grad_norm": 1.699670672416687, + "learning_rate": 2.6388762089397236e-05, + "loss": 0.7443, + "step": 8515 + }, + { + "epoch": 0.4146763080369099, + "grad_norm": 6.719211578369141, + "learning_rate": 2.6385772908449183e-05, + "loss": 0.887, + "step": 8516 + }, + { + "epoch": 0.41472500182601707, + "grad_norm": 1.8486039638519287, + "learning_rate": 2.6382783568654445e-05, + "loss": 0.8086, + "step": 8517 + }, + { + "epoch": 0.4147736956151243, + "grad_norm": 4.692688941955566, + "learning_rate": 2.6379794070087387e-05, + "loss": 0.8099, + "step": 8518 + }, + { + "epoch": 0.4148223894042315, + "grad_norm": 1.138315200805664, + "learning_rate": 2.6376804412822363e-05, + "loss": 0.807, + "step": 8519 + }, + { + "epoch": 0.4148710831933387, + "grad_norm": 1.5630733966827393, + "learning_rate": 2.6373814596933757e-05, + "loss": 0.8527, + "step": 8520 + }, + { + "epoch": 0.41491977698244586, + "grad_norm": 1.4775844812393188, + "learning_rate": 2.6370824622495925e-05, + "loss": 0.9139, + "step": 8521 + }, + { + "epoch": 0.4149684707715531, + "grad_norm": 1.4056227207183838, + "learning_rate": 2.636783448958325e-05, + "loss": 0.8973, + "step": 8522 + }, + { + "epoch": 0.4150171645606603, + "grad_norm": 2.1451847553253174, + "learning_rate": 2.6364844198270114e-05, + "loss": 0.891, + "step": 8523 + }, + { + "epoch": 0.4150658583497675, + "grad_norm": 2.0628035068511963, + "learning_rate": 2.6361853748630894e-05, + "loss": 0.8747, + "step": 8524 + }, + { + "epoch": 0.41511455213887466, + "grad_norm": 1.5827716588974, + "learning_rate": 2.635886314073999e-05, + "loss": 0.9076, + "step": 8525 + }, + { + "epoch": 0.41516324592798187, + "grad_norm": 2.3222084045410156, + "learning_rate": 2.6355872374671783e-05, + "loss": 0.9517, + "step": 8526 + }, + { + "epoch": 0.4152119397170891, + "grad_norm": 1.5324517488479614, + "learning_rate": 2.6352881450500668e-05, + "loss": 0.8903, + "step": 8527 + }, + { + "epoch": 0.4152606335061963, + "grad_norm": 1.8228951692581177, + "learning_rate": 2.6349890368301045e-05, + "loss": 0.7575, + "step": 8528 + }, + { + "epoch": 0.4153093272953035, + "grad_norm": 1.4792823791503906, + "learning_rate": 2.6346899128147324e-05, + "loss": 0.881, + "step": 8529 + }, + { + "epoch": 0.41535802108441067, + "grad_norm": 1.3683063983917236, + "learning_rate": 2.6343907730113906e-05, + "loss": 0.8556, + "step": 8530 + }, + { + "epoch": 0.4154067148735179, + "grad_norm": 2.404540777206421, + "learning_rate": 2.6340916174275207e-05, + "loss": 0.8497, + "step": 8531 + }, + { + "epoch": 0.4154554086626251, + "grad_norm": 2.0891082286834717, + "learning_rate": 2.6337924460705645e-05, + "loss": 0.783, + "step": 8532 + }, + { + "epoch": 0.4155041024517323, + "grad_norm": 1.2108142375946045, + "learning_rate": 2.6334932589479625e-05, + "loss": 0.8292, + "step": 8533 + }, + { + "epoch": 0.41555279624083946, + "grad_norm": 1.398285984992981, + "learning_rate": 2.6331940560671586e-05, + "loss": 0.9244, + "step": 8534 + }, + { + "epoch": 0.4156014900299467, + "grad_norm": 1.3025314807891846, + "learning_rate": 2.6328948374355948e-05, + "loss": 0.791, + "step": 8535 + }, + { + "epoch": 0.4156501838190539, + "grad_norm": 1.2749875783920288, + "learning_rate": 2.6325956030607137e-05, + "loss": 0.9024, + "step": 8536 + }, + { + "epoch": 0.4156988776081611, + "grad_norm": 1.913223385810852, + "learning_rate": 2.6322963529499595e-05, + "loss": 0.8004, + "step": 8537 + }, + { + "epoch": 0.41574757139726826, + "grad_norm": 1.2895324230194092, + "learning_rate": 2.631997087110776e-05, + "loss": 0.8635, + "step": 8538 + }, + { + "epoch": 0.41579626518637547, + "grad_norm": 2.140942335128784, + "learning_rate": 2.6316978055506068e-05, + "loss": 0.9022, + "step": 8539 + }, + { + "epoch": 0.4158449589754827, + "grad_norm": 1.805548071861267, + "learning_rate": 2.631398508276898e-05, + "loss": 0.7876, + "step": 8540 + }, + { + "epoch": 0.4158936527645899, + "grad_norm": 1.1939642429351807, + "learning_rate": 2.6310991952970932e-05, + "loss": 0.8719, + "step": 8541 + }, + { + "epoch": 0.41594234655369705, + "grad_norm": 1.8202913999557495, + "learning_rate": 2.630799866618639e-05, + "loss": 0.7716, + "step": 8542 + }, + { + "epoch": 0.41599104034280426, + "grad_norm": 1.887616515159607, + "learning_rate": 2.63050052224898e-05, + "loss": 0.8758, + "step": 8543 + }, + { + "epoch": 0.4160397341319115, + "grad_norm": 1.7957878112792969, + "learning_rate": 2.6302011621955626e-05, + "loss": 0.8185, + "step": 8544 + }, + { + "epoch": 0.4160884279210187, + "grad_norm": 2.512225389480591, + "learning_rate": 2.6299017864658348e-05, + "loss": 0.9294, + "step": 8545 + }, + { + "epoch": 0.41613712171012585, + "grad_norm": 3.0063183307647705, + "learning_rate": 2.629602395067242e-05, + "loss": 0.775, + "step": 8546 + }, + { + "epoch": 0.41618581549923306, + "grad_norm": 1.6201430559158325, + "learning_rate": 2.6293029880072328e-05, + "loss": 0.7763, + "step": 8547 + }, + { + "epoch": 0.41623450928834027, + "grad_norm": 1.4245089292526245, + "learning_rate": 2.629003565293253e-05, + "loss": 0.8478, + "step": 8548 + }, + { + "epoch": 0.4162832030774475, + "grad_norm": 1.7728487253189087, + "learning_rate": 2.628704126932754e-05, + "loss": 0.9088, + "step": 8549 + }, + { + "epoch": 0.4163318968665547, + "grad_norm": 2.424159288406372, + "learning_rate": 2.6284046729331817e-05, + "loss": 0.9076, + "step": 8550 + }, + { + "epoch": 0.41638059065566185, + "grad_norm": 1.3369022607803345, + "learning_rate": 2.6281052033019855e-05, + "loss": 0.8114, + "step": 8551 + }, + { + "epoch": 0.41642928444476907, + "grad_norm": 1.5825034379959106, + "learning_rate": 2.6278057180466154e-05, + "loss": 0.94, + "step": 8552 + }, + { + "epoch": 0.4164779782338763, + "grad_norm": 1.2675178050994873, + "learning_rate": 2.6275062171745208e-05, + "loss": 0.7721, + "step": 8553 + }, + { + "epoch": 0.4165266720229835, + "grad_norm": 1.7917002439498901, + "learning_rate": 2.627206700693152e-05, + "loss": 0.8354, + "step": 8554 + }, + { + "epoch": 0.41657536581209065, + "grad_norm": 0.10666380822658539, + "learning_rate": 2.626907168609959e-05, + "loss": 0.6516, + "step": 8555 + }, + { + "epoch": 0.41662405960119786, + "grad_norm": 1.873788595199585, + "learning_rate": 2.6266076209323935e-05, + "loss": 0.8117, + "step": 8556 + }, + { + "epoch": 0.4166727533903051, + "grad_norm": 1.3108153343200684, + "learning_rate": 2.626308057667906e-05, + "loss": 0.7694, + "step": 8557 + }, + { + "epoch": 0.4167214471794123, + "grad_norm": 1.341067910194397, + "learning_rate": 2.6260084788239485e-05, + "loss": 0.7716, + "step": 8558 + }, + { + "epoch": 0.41677014096851944, + "grad_norm": 1.2405139207839966, + "learning_rate": 2.6257088844079726e-05, + "loss": 0.8505, + "step": 8559 + }, + { + "epoch": 0.41681883475762666, + "grad_norm": 1.5861270427703857, + "learning_rate": 2.6254092744274317e-05, + "loss": 0.8486, + "step": 8560 + }, + { + "epoch": 0.41686752854673387, + "grad_norm": 1.5851235389709473, + "learning_rate": 2.6251096488897784e-05, + "loss": 0.8775, + "step": 8561 + }, + { + "epoch": 0.4169162223358411, + "grad_norm": 1.3031712770462036, + "learning_rate": 2.6248100078024647e-05, + "loss": 0.8757, + "step": 8562 + }, + { + "epoch": 0.41696491612494824, + "grad_norm": 1.6469404697418213, + "learning_rate": 2.6245103511729458e-05, + "loss": 0.9086, + "step": 8563 + }, + { + "epoch": 0.41701360991405545, + "grad_norm": 1.2571948766708374, + "learning_rate": 2.624210679008675e-05, + "loss": 0.9416, + "step": 8564 + }, + { + "epoch": 0.41706230370316266, + "grad_norm": 3.4421029090881348, + "learning_rate": 2.623910991317106e-05, + "loss": 0.7685, + "step": 8565 + }, + { + "epoch": 0.4171109974922699, + "grad_norm": 1.5316438674926758, + "learning_rate": 2.6236112881056947e-05, + "loss": 0.8703, + "step": 8566 + }, + { + "epoch": 0.4171596912813771, + "grad_norm": 2.922008752822876, + "learning_rate": 2.6233115693818957e-05, + "loss": 0.7608, + "step": 8567 + }, + { + "epoch": 0.41720838507048424, + "grad_norm": 1.8681260347366333, + "learning_rate": 2.623011835153164e-05, + "loss": 0.8852, + "step": 8568 + }, + { + "epoch": 0.41725707885959146, + "grad_norm": 1.3934191465377808, + "learning_rate": 2.6227120854269566e-05, + "loss": 0.8128, + "step": 8569 + }, + { + "epoch": 0.41730577264869867, + "grad_norm": 2.2449002265930176, + "learning_rate": 2.6224123202107294e-05, + "loss": 0.8292, + "step": 8570 + }, + { + "epoch": 0.4173544664378059, + "grad_norm": 2.218815565109253, + "learning_rate": 2.622112539511939e-05, + "loss": 0.8435, + "step": 8571 + }, + { + "epoch": 0.41740316022691304, + "grad_norm": 3.361023426055908, + "learning_rate": 2.6218127433380422e-05, + "loss": 0.8344, + "step": 8572 + }, + { + "epoch": 0.41745185401602025, + "grad_norm": 2.195683002471924, + "learning_rate": 2.6215129316964965e-05, + "loss": 0.8798, + "step": 8573 + }, + { + "epoch": 0.41750054780512746, + "grad_norm": 1.3738361597061157, + "learning_rate": 2.6212131045947608e-05, + "loss": 0.8671, + "step": 8574 + }, + { + "epoch": 0.4175492415942347, + "grad_norm": 1.836690902709961, + "learning_rate": 2.6209132620402916e-05, + "loss": 0.7872, + "step": 8575 + }, + { + "epoch": 0.41759793538334183, + "grad_norm": 2.1880834102630615, + "learning_rate": 2.6206134040405494e-05, + "loss": 0.7777, + "step": 8576 + }, + { + "epoch": 0.41764662917244905, + "grad_norm": 1.929219365119934, + "learning_rate": 2.6203135306029913e-05, + "loss": 0.776, + "step": 8577 + }, + { + "epoch": 0.41769532296155626, + "grad_norm": 2.4213075637817383, + "learning_rate": 2.6200136417350774e-05, + "loss": 0.8593, + "step": 8578 + }, + { + "epoch": 0.41774401675066347, + "grad_norm": 0.09139756113290787, + "learning_rate": 2.6197137374442686e-05, + "loss": 0.6346, + "step": 8579 + }, + { + "epoch": 0.41779271053977063, + "grad_norm": 2.600393533706665, + "learning_rate": 2.619413817738023e-05, + "loss": 0.8369, + "step": 8580 + }, + { + "epoch": 0.41784140432887784, + "grad_norm": 0.088526152074337, + "learning_rate": 2.619113882623803e-05, + "loss": 0.6159, + "step": 8581 + }, + { + "epoch": 0.41789009811798505, + "grad_norm": 1.5194141864776611, + "learning_rate": 2.6188139321090682e-05, + "loss": 0.9083, + "step": 8582 + }, + { + "epoch": 0.41793879190709227, + "grad_norm": 2.144850492477417, + "learning_rate": 2.618513966201281e-05, + "loss": 0.8733, + "step": 8583 + }, + { + "epoch": 0.4179874856961994, + "grad_norm": 2.115579605102539, + "learning_rate": 2.618213984907902e-05, + "loss": 0.8696, + "step": 8584 + }, + { + "epoch": 0.41803617948530664, + "grad_norm": 1.5241944789886475, + "learning_rate": 2.617913988236394e-05, + "loss": 0.8684, + "step": 8585 + }, + { + "epoch": 0.41808487327441385, + "grad_norm": 1.515454649925232, + "learning_rate": 2.6176139761942194e-05, + "loss": 0.873, + "step": 8586 + }, + { + "epoch": 0.41813356706352106, + "grad_norm": 1.5095010995864868, + "learning_rate": 2.6173139487888405e-05, + "loss": 0.8779, + "step": 8587 + }, + { + "epoch": 0.4181822608526283, + "grad_norm": 1.6228991746902466, + "learning_rate": 2.61701390602772e-05, + "loss": 0.8835, + "step": 8588 + }, + { + "epoch": 0.41823095464173543, + "grad_norm": 1.6306260824203491, + "learning_rate": 2.6167138479183234e-05, + "loss": 0.8583, + "step": 8589 + }, + { + "epoch": 0.41827964843084264, + "grad_norm": 1.7401435375213623, + "learning_rate": 2.616413774468114e-05, + "loss": 0.9019, + "step": 8590 + }, + { + "epoch": 0.41832834221994986, + "grad_norm": 1.8294930458068848, + "learning_rate": 2.616113685684555e-05, + "loss": 0.9223, + "step": 8591 + }, + { + "epoch": 0.41837703600905707, + "grad_norm": 2.2157764434814453, + "learning_rate": 2.6158135815751125e-05, + "loss": 0.8847, + "step": 8592 + }, + { + "epoch": 0.4184257297981642, + "grad_norm": 2.4096503257751465, + "learning_rate": 2.6155134621472504e-05, + "loss": 0.8378, + "step": 8593 + }, + { + "epoch": 0.41847442358727144, + "grad_norm": 1.3076746463775635, + "learning_rate": 2.6152133274084344e-05, + "loss": 0.9319, + "step": 8594 + }, + { + "epoch": 0.41852311737637865, + "grad_norm": 1.4430367946624756, + "learning_rate": 2.614913177366132e-05, + "loss": 0.8094, + "step": 8595 + }, + { + "epoch": 0.41857181116548586, + "grad_norm": 1.6074358224868774, + "learning_rate": 2.6146130120278078e-05, + "loss": 0.8631, + "step": 8596 + }, + { + "epoch": 0.418620504954593, + "grad_norm": 1.346307635307312, + "learning_rate": 2.6143128314009286e-05, + "loss": 0.8767, + "step": 8597 + }, + { + "epoch": 0.41866919874370023, + "grad_norm": 1.7362208366394043, + "learning_rate": 2.6140126354929612e-05, + "loss": 0.8196, + "step": 8598 + }, + { + "epoch": 0.41871789253280745, + "grad_norm": 3.0862648487091064, + "learning_rate": 2.6137124243113746e-05, + "loss": 0.9276, + "step": 8599 + }, + { + "epoch": 0.41876658632191466, + "grad_norm": 1.2599247694015503, + "learning_rate": 2.6134121978636353e-05, + "loss": 0.9404, + "step": 8600 + }, + { + "epoch": 0.4188152801110218, + "grad_norm": 1.6634758710861206, + "learning_rate": 2.6131119561572114e-05, + "loss": 0.8523, + "step": 8601 + }, + { + "epoch": 0.418863973900129, + "grad_norm": 1.5670374631881714, + "learning_rate": 2.6128116991995713e-05, + "loss": 0.8744, + "step": 8602 + }, + { + "epoch": 0.41891266768923624, + "grad_norm": 1.2203161716461182, + "learning_rate": 2.6125114269981852e-05, + "loss": 0.8135, + "step": 8603 + }, + { + "epoch": 0.41896136147834345, + "grad_norm": 2.0010416507720947, + "learning_rate": 2.612211139560521e-05, + "loss": 0.9121, + "step": 8604 + }, + { + "epoch": 0.4190100552674506, + "grad_norm": 1.355129361152649, + "learning_rate": 2.6119108368940485e-05, + "loss": 0.8446, + "step": 8605 + }, + { + "epoch": 0.4190587490565578, + "grad_norm": 1.4595234394073486, + "learning_rate": 2.611610519006239e-05, + "loss": 0.885, + "step": 8606 + }, + { + "epoch": 0.41910744284566503, + "grad_norm": 0.08962520211935043, + "learning_rate": 2.6113101859045618e-05, + "loss": 0.5979, + "step": 8607 + }, + { + "epoch": 0.41915613663477225, + "grad_norm": 1.2975749969482422, + "learning_rate": 2.611009837596487e-05, + "loss": 0.8144, + "step": 8608 + }, + { + "epoch": 0.41920483042387946, + "grad_norm": 1.777230978012085, + "learning_rate": 2.6107094740894877e-05, + "loss": 0.8559, + "step": 8609 + }, + { + "epoch": 0.4192535242129866, + "grad_norm": 2.1435110569000244, + "learning_rate": 2.610409095391034e-05, + "loss": 0.868, + "step": 8610 + }, + { + "epoch": 0.41930221800209383, + "grad_norm": 1.5500874519348145, + "learning_rate": 2.6101087015085993e-05, + "loss": 0.8411, + "step": 8611 + }, + { + "epoch": 0.41935091179120104, + "grad_norm": 1.3886653184890747, + "learning_rate": 2.609808292449654e-05, + "loss": 0.9228, + "step": 8612 + }, + { + "epoch": 0.41939960558030825, + "grad_norm": 1.7301751375198364, + "learning_rate": 2.6095078682216728e-05, + "loss": 0.8152, + "step": 8613 + }, + { + "epoch": 0.4194482993694154, + "grad_norm": 1.3563131093978882, + "learning_rate": 2.6092074288321273e-05, + "loss": 0.8309, + "step": 8614 + }, + { + "epoch": 0.4194969931585226, + "grad_norm": 0.09300381690263748, + "learning_rate": 2.608906974288492e-05, + "loss": 0.5875, + "step": 8615 + }, + { + "epoch": 0.41954568694762984, + "grad_norm": 1.305108666419983, + "learning_rate": 2.6086065045982395e-05, + "loss": 0.8228, + "step": 8616 + }, + { + "epoch": 0.41959438073673705, + "grad_norm": 1.4573211669921875, + "learning_rate": 2.608306019768845e-05, + "loss": 0.7985, + "step": 8617 + }, + { + "epoch": 0.4196430745258442, + "grad_norm": 1.1837066411972046, + "learning_rate": 2.6080055198077823e-05, + "loss": 0.893, + "step": 8618 + }, + { + "epoch": 0.4196917683149514, + "grad_norm": 1.3663214445114136, + "learning_rate": 2.6077050047225273e-05, + "loss": 0.8236, + "step": 8619 + }, + { + "epoch": 0.41974046210405863, + "grad_norm": 1.503859281539917, + "learning_rate": 2.607404474520555e-05, + "loss": 0.8119, + "step": 8620 + }, + { + "epoch": 0.41978915589316584, + "grad_norm": 2.3042001724243164, + "learning_rate": 2.607103929209341e-05, + "loss": 0.8599, + "step": 8621 + }, + { + "epoch": 0.419837849682273, + "grad_norm": 2.310373306274414, + "learning_rate": 2.6068033687963614e-05, + "loss": 0.8185, + "step": 8622 + }, + { + "epoch": 0.4198865434713802, + "grad_norm": 1.5963332653045654, + "learning_rate": 2.6065027932890923e-05, + "loss": 0.8458, + "step": 8623 + }, + { + "epoch": 0.4199352372604874, + "grad_norm": 1.262819766998291, + "learning_rate": 2.6062022026950114e-05, + "loss": 0.7623, + "step": 8624 + }, + { + "epoch": 0.41998393104959464, + "grad_norm": 1.670032262802124, + "learning_rate": 2.6059015970215953e-05, + "loss": 0.8197, + "step": 8625 + }, + { + "epoch": 0.42003262483870185, + "grad_norm": 1.1361846923828125, + "learning_rate": 2.6056009762763215e-05, + "loss": 0.8409, + "step": 8626 + }, + { + "epoch": 0.420081318627809, + "grad_norm": 1.3982417583465576, + "learning_rate": 2.6053003404666685e-05, + "loss": 0.9036, + "step": 8627 + }, + { + "epoch": 0.4201300124169162, + "grad_norm": 2.2537410259246826, + "learning_rate": 2.604999689600114e-05, + "loss": 0.8716, + "step": 8628 + }, + { + "epoch": 0.42017870620602343, + "grad_norm": 1.523045301437378, + "learning_rate": 2.6046990236841372e-05, + "loss": 0.8275, + "step": 8629 + }, + { + "epoch": 0.42022739999513065, + "grad_norm": 0.0863293781876564, + "learning_rate": 2.6043983427262172e-05, + "loss": 0.5729, + "step": 8630 + }, + { + "epoch": 0.4202760937842378, + "grad_norm": 3.411896228790283, + "learning_rate": 2.604097646733833e-05, + "loss": 0.8242, + "step": 8631 + }, + { + "epoch": 0.420324787573345, + "grad_norm": 0.09069307893514633, + "learning_rate": 2.603796935714465e-05, + "loss": 0.6477, + "step": 8632 + }, + { + "epoch": 0.42037348136245223, + "grad_norm": 2.7236411571502686, + "learning_rate": 2.6034962096755933e-05, + "loss": 0.8738, + "step": 8633 + }, + { + "epoch": 0.42042217515155944, + "grad_norm": 1.147840976715088, + "learning_rate": 2.6031954686246983e-05, + "loss": 0.8892, + "step": 8634 + }, + { + "epoch": 0.4204708689406666, + "grad_norm": 1.763400912284851, + "learning_rate": 2.602894712569261e-05, + "loss": 0.8613, + "step": 8635 + }, + { + "epoch": 0.4205195627297738, + "grad_norm": 8.379782676696777, + "learning_rate": 2.602593941516763e-05, + "loss": 0.9197, + "step": 8636 + }, + { + "epoch": 0.420568256518881, + "grad_norm": 1.7192862033843994, + "learning_rate": 2.6022931554746854e-05, + "loss": 0.7965, + "step": 8637 + }, + { + "epoch": 0.42061695030798824, + "grad_norm": 1.2087140083312988, + "learning_rate": 2.6019923544505106e-05, + "loss": 0.7826, + "step": 8638 + }, + { + "epoch": 0.4206656440970954, + "grad_norm": 1.3254287242889404, + "learning_rate": 2.6016915384517206e-05, + "loss": 0.7957, + "step": 8639 + }, + { + "epoch": 0.4207143378862026, + "grad_norm": 1.701978087425232, + "learning_rate": 2.6013907074857995e-05, + "loss": 0.8829, + "step": 8640 + }, + { + "epoch": 0.4207630316753098, + "grad_norm": 1.8924320936203003, + "learning_rate": 2.6010898615602296e-05, + "loss": 0.7705, + "step": 8641 + }, + { + "epoch": 0.42081172546441703, + "grad_norm": 1.7368252277374268, + "learning_rate": 2.600789000682495e-05, + "loss": 0.7946, + "step": 8642 + }, + { + "epoch": 0.4208604192535242, + "grad_norm": 1.2374742031097412, + "learning_rate": 2.6004881248600786e-05, + "loss": 0.8147, + "step": 8643 + }, + { + "epoch": 0.4209091130426314, + "grad_norm": 1.3135515451431274, + "learning_rate": 2.6001872341004656e-05, + "loss": 0.8556, + "step": 8644 + }, + { + "epoch": 0.4209578068317386, + "grad_norm": 1.815068244934082, + "learning_rate": 2.59988632841114e-05, + "loss": 0.9332, + "step": 8645 + }, + { + "epoch": 0.4210065006208458, + "grad_norm": 7.632833003997803, + "learning_rate": 2.5995854077995875e-05, + "loss": 0.9282, + "step": 8646 + }, + { + "epoch": 0.42105519440995304, + "grad_norm": 0.0903763547539711, + "learning_rate": 2.5992844722732936e-05, + "loss": 0.6383, + "step": 8647 + }, + { + "epoch": 0.4211038881990602, + "grad_norm": 1.7743250131607056, + "learning_rate": 2.5989835218397436e-05, + "loss": 0.9227, + "step": 8648 + }, + { + "epoch": 0.4211525819881674, + "grad_norm": 1.5577588081359863, + "learning_rate": 2.598682556506424e-05, + "loss": 0.7846, + "step": 8649 + }, + { + "epoch": 0.4212012757772746, + "grad_norm": 2.9613592624664307, + "learning_rate": 2.5983815762808214e-05, + "loss": 0.8304, + "step": 8650 + }, + { + "epoch": 0.42124996956638183, + "grad_norm": 1.4653520584106445, + "learning_rate": 2.598080581170422e-05, + "loss": 0.7735, + "step": 8651 + }, + { + "epoch": 0.421298663355489, + "grad_norm": 1.851017951965332, + "learning_rate": 2.597779571182714e-05, + "loss": 0.9545, + "step": 8652 + }, + { + "epoch": 0.4213473571445962, + "grad_norm": 1.5059289932250977, + "learning_rate": 2.5974785463251843e-05, + "loss": 0.8035, + "step": 8653 + }, + { + "epoch": 0.4213960509337034, + "grad_norm": 1.2386096715927124, + "learning_rate": 2.5971775066053217e-05, + "loss": 0.8181, + "step": 8654 + }, + { + "epoch": 0.4214447447228106, + "grad_norm": 1.938045859336853, + "learning_rate": 2.596876452030614e-05, + "loss": 0.8101, + "step": 8655 + }, + { + "epoch": 0.4214934385119178, + "grad_norm": 1.6046993732452393, + "learning_rate": 2.59657538260855e-05, + "loss": 0.9312, + "step": 8656 + }, + { + "epoch": 0.421542132301025, + "grad_norm": 1.3329460620880127, + "learning_rate": 2.596274298346619e-05, + "loss": 0.845, + "step": 8657 + }, + { + "epoch": 0.4215908260901322, + "grad_norm": 1.3464446067810059, + "learning_rate": 2.5959731992523107e-05, + "loss": 0.8624, + "step": 8658 + }, + { + "epoch": 0.4216395198792394, + "grad_norm": 1.6386756896972656, + "learning_rate": 2.5956720853331145e-05, + "loss": 0.7478, + "step": 8659 + }, + { + "epoch": 0.4216882136683466, + "grad_norm": 1.6011890172958374, + "learning_rate": 2.595370956596521e-05, + "loss": 0.8309, + "step": 8660 + }, + { + "epoch": 0.4217369074574538, + "grad_norm": 1.4877859354019165, + "learning_rate": 2.5950698130500204e-05, + "loss": 0.9236, + "step": 8661 + }, + { + "epoch": 0.421785601246561, + "grad_norm": 1.700684905052185, + "learning_rate": 2.5947686547011046e-05, + "loss": 0.8952, + "step": 8662 + }, + { + "epoch": 0.4218342950356682, + "grad_norm": 1.2553133964538574, + "learning_rate": 2.594467481557264e-05, + "loss": 0.963, + "step": 8663 + }, + { + "epoch": 0.4218829888247754, + "grad_norm": 2.527055025100708, + "learning_rate": 2.5941662936259905e-05, + "loss": 0.7907, + "step": 8664 + }, + { + "epoch": 0.4219316826138826, + "grad_norm": 1.9650452136993408, + "learning_rate": 2.5938650909147763e-05, + "loss": 0.8749, + "step": 8665 + }, + { + "epoch": 0.4219803764029898, + "grad_norm": 1.5052976608276367, + "learning_rate": 2.5935638734311134e-05, + "loss": 0.7725, + "step": 8666 + }, + { + "epoch": 0.422029070192097, + "grad_norm": 0.08825750648975372, + "learning_rate": 2.5932626411824953e-05, + "loss": 0.5796, + "step": 8667 + }, + { + "epoch": 0.4220777639812042, + "grad_norm": 1.2427923679351807, + "learning_rate": 2.5929613941764145e-05, + "loss": 0.8023, + "step": 8668 + }, + { + "epoch": 0.4221264577703114, + "grad_norm": 1.572989821434021, + "learning_rate": 2.592660132420366e-05, + "loss": 0.752, + "step": 8669 + }, + { + "epoch": 0.4221751515594186, + "grad_norm": 1.5212064981460571, + "learning_rate": 2.592358855921842e-05, + "loss": 0.808, + "step": 8670 + }, + { + "epoch": 0.4222238453485258, + "grad_norm": 1.609656572341919, + "learning_rate": 2.5920575646883377e-05, + "loss": 0.8718, + "step": 8671 + }, + { + "epoch": 0.422272539137633, + "grad_norm": 1.5985993146896362, + "learning_rate": 2.5917562587273475e-05, + "loss": 0.9428, + "step": 8672 + }, + { + "epoch": 0.4223212329267402, + "grad_norm": 1.156406283378601, + "learning_rate": 2.5914549380463664e-05, + "loss": 0.8288, + "step": 8673 + }, + { + "epoch": 0.4223699267158474, + "grad_norm": 1.6529207229614258, + "learning_rate": 2.5911536026528895e-05, + "loss": 0.8918, + "step": 8674 + }, + { + "epoch": 0.4224186205049546, + "grad_norm": 1.823912262916565, + "learning_rate": 2.590852252554413e-05, + "loss": 0.8515, + "step": 8675 + }, + { + "epoch": 0.4224673142940618, + "grad_norm": 1.3617968559265137, + "learning_rate": 2.5905508877584328e-05, + "loss": 0.8832, + "step": 8676 + }, + { + "epoch": 0.42251600808316897, + "grad_norm": 1.5117892026901245, + "learning_rate": 2.5902495082724454e-05, + "loss": 0.9687, + "step": 8677 + }, + { + "epoch": 0.4225647018722762, + "grad_norm": 1.9837232828140259, + "learning_rate": 2.5899481141039476e-05, + "loss": 0.7758, + "step": 8678 + }, + { + "epoch": 0.4226133956613834, + "grad_norm": 2.4270541667938232, + "learning_rate": 2.589646705260437e-05, + "loss": 0.8026, + "step": 8679 + }, + { + "epoch": 0.4226620894504906, + "grad_norm": 2.1790356636047363, + "learning_rate": 2.5893452817494104e-05, + "loss": 0.7549, + "step": 8680 + }, + { + "epoch": 0.42271078323959776, + "grad_norm": 1.8292030096054077, + "learning_rate": 2.5890438435783668e-05, + "loss": 0.8545, + "step": 8681 + }, + { + "epoch": 0.422759477028705, + "grad_norm": 1.3142204284667969, + "learning_rate": 2.588742390754803e-05, + "loss": 0.7924, + "step": 8682 + }, + { + "epoch": 0.4228081708178122, + "grad_norm": 1.8058924674987793, + "learning_rate": 2.5884409232862192e-05, + "loss": 0.8259, + "step": 8683 + }, + { + "epoch": 0.4228568646069194, + "grad_norm": 1.2167036533355713, + "learning_rate": 2.588139441180114e-05, + "loss": 0.8275, + "step": 8684 + }, + { + "epoch": 0.42290555839602656, + "grad_norm": 1.6955546140670776, + "learning_rate": 2.587837944443986e-05, + "loss": 0.8504, + "step": 8685 + }, + { + "epoch": 0.42295425218513377, + "grad_norm": 1.3363593816757202, + "learning_rate": 2.5875364330853357e-05, + "loss": 0.8209, + "step": 8686 + }, + { + "epoch": 0.423002945974241, + "grad_norm": 1.6673274040222168, + "learning_rate": 2.587234907111663e-05, + "loss": 0.7818, + "step": 8687 + }, + { + "epoch": 0.4230516397633482, + "grad_norm": 1.7697950601577759, + "learning_rate": 2.5869333665304685e-05, + "loss": 0.7943, + "step": 8688 + }, + { + "epoch": 0.4231003335524554, + "grad_norm": 1.2722774744033813, + "learning_rate": 2.586631811349253e-05, + "loss": 0.841, + "step": 8689 + }, + { + "epoch": 0.42314902734156257, + "grad_norm": 1.24715256690979, + "learning_rate": 2.5863302415755177e-05, + "loss": 0.7946, + "step": 8690 + }, + { + "epoch": 0.4231977211306698, + "grad_norm": 1.5158358812332153, + "learning_rate": 2.5860286572167644e-05, + "loss": 0.8427, + "step": 8691 + }, + { + "epoch": 0.423246414919777, + "grad_norm": 1.2567466497421265, + "learning_rate": 2.5857270582804948e-05, + "loss": 0.8199, + "step": 8692 + }, + { + "epoch": 0.4232951087088842, + "grad_norm": 3.1992642879486084, + "learning_rate": 2.5854254447742112e-05, + "loss": 0.8388, + "step": 8693 + }, + { + "epoch": 0.42334380249799136, + "grad_norm": 1.3282867670059204, + "learning_rate": 2.5851238167054164e-05, + "loss": 0.8243, + "step": 8694 + }, + { + "epoch": 0.4233924962870986, + "grad_norm": 1.4409615993499756, + "learning_rate": 2.584822174081613e-05, + "loss": 0.7819, + "step": 8695 + }, + { + "epoch": 0.4234411900762058, + "grad_norm": 1.2521389722824097, + "learning_rate": 2.5845205169103045e-05, + "loss": 0.8751, + "step": 8696 + }, + { + "epoch": 0.423489883865313, + "grad_norm": 1.9713917970657349, + "learning_rate": 2.5842188451989945e-05, + "loss": 0.7619, + "step": 8697 + }, + { + "epoch": 0.42353857765442016, + "grad_norm": 1.3672422170639038, + "learning_rate": 2.5839171589551884e-05, + "loss": 0.8724, + "step": 8698 + }, + { + "epoch": 0.42358727144352737, + "grad_norm": 1.4940276145935059, + "learning_rate": 2.5836154581863895e-05, + "loss": 0.8928, + "step": 8699 + }, + { + "epoch": 0.4236359652326346, + "grad_norm": 1.2295262813568115, + "learning_rate": 2.583313742900103e-05, + "loss": 0.9118, + "step": 8700 + }, + { + "epoch": 0.4236846590217418, + "grad_norm": 1.4445645809173584, + "learning_rate": 2.5830120131038338e-05, + "loss": 0.8709, + "step": 8701 + }, + { + "epoch": 0.42373335281084895, + "grad_norm": 1.3768993616104126, + "learning_rate": 2.5827102688050875e-05, + "loss": 0.8837, + "step": 8702 + }, + { + "epoch": 0.42378204659995616, + "grad_norm": 5.372384071350098, + "learning_rate": 2.58240851001137e-05, + "loss": 0.7937, + "step": 8703 + }, + { + "epoch": 0.4238307403890634, + "grad_norm": 1.7484283447265625, + "learning_rate": 2.5821067367301877e-05, + "loss": 0.8718, + "step": 8704 + }, + { + "epoch": 0.4238794341781706, + "grad_norm": 1.6162101030349731, + "learning_rate": 2.5818049489690475e-05, + "loss": 0.8588, + "step": 8705 + }, + { + "epoch": 0.4239281279672778, + "grad_norm": 0.09190258383750916, + "learning_rate": 2.5815031467354566e-05, + "loss": 0.5845, + "step": 8706 + }, + { + "epoch": 0.42397682175638496, + "grad_norm": 2.2125837802886963, + "learning_rate": 2.5812013300369208e-05, + "loss": 0.8655, + "step": 8707 + }, + { + "epoch": 0.42402551554549217, + "grad_norm": 3.335747718811035, + "learning_rate": 2.5808994988809498e-05, + "loss": 0.8407, + "step": 8708 + }, + { + "epoch": 0.4240742093345994, + "grad_norm": 1.953608512878418, + "learning_rate": 2.5805976532750504e-05, + "loss": 0.8652, + "step": 8709 + }, + { + "epoch": 0.4241229031237066, + "grad_norm": 1.6569732427597046, + "learning_rate": 2.5802957932267317e-05, + "loss": 0.8398, + "step": 8710 + }, + { + "epoch": 0.42417159691281375, + "grad_norm": 2.0325026512145996, + "learning_rate": 2.579993918743502e-05, + "loss": 0.8919, + "step": 8711 + }, + { + "epoch": 0.42422029070192097, + "grad_norm": 1.6816538572311401, + "learning_rate": 2.5796920298328706e-05, + "loss": 0.8554, + "step": 8712 + }, + { + "epoch": 0.4242689844910282, + "grad_norm": 1.5326074361801147, + "learning_rate": 2.5793901265023477e-05, + "loss": 0.8404, + "step": 8713 + }, + { + "epoch": 0.4243176782801354, + "grad_norm": 2.20477294921875, + "learning_rate": 2.579088208759442e-05, + "loss": 0.9518, + "step": 8714 + }, + { + "epoch": 0.42436637206924255, + "grad_norm": 1.2281485795974731, + "learning_rate": 2.5787862766116646e-05, + "loss": 0.8173, + "step": 8715 + }, + { + "epoch": 0.42441506585834976, + "grad_norm": 2.2932822704315186, + "learning_rate": 2.578484330066526e-05, + "loss": 0.8731, + "step": 8716 + }, + { + "epoch": 0.424463759647457, + "grad_norm": 1.993538737297058, + "learning_rate": 2.5781823691315363e-05, + "loss": 0.7608, + "step": 8717 + }, + { + "epoch": 0.4245124534365642, + "grad_norm": 1.3615530729293823, + "learning_rate": 2.577880393814207e-05, + "loss": 0.8294, + "step": 8718 + }, + { + "epoch": 0.42456114722567134, + "grad_norm": 1.4385329484939575, + "learning_rate": 2.5775784041220514e-05, + "loss": 0.8381, + "step": 8719 + }, + { + "epoch": 0.42460984101477856, + "grad_norm": 1.877288579940796, + "learning_rate": 2.57727640006258e-05, + "loss": 0.7757, + "step": 8720 + }, + { + "epoch": 0.42465853480388577, + "grad_norm": 1.2829616069793701, + "learning_rate": 2.5769743816433055e-05, + "loss": 0.8699, + "step": 8721 + }, + { + "epoch": 0.424707228592993, + "grad_norm": 1.771808385848999, + "learning_rate": 2.576672348871741e-05, + "loss": 0.8495, + "step": 8722 + }, + { + "epoch": 0.42475592238210014, + "grad_norm": 1.4692808389663696, + "learning_rate": 2.5763703017553986e-05, + "loss": 0.8052, + "step": 8723 + }, + { + "epoch": 0.42480461617120735, + "grad_norm": 1.3654474020004272, + "learning_rate": 2.5760682403017927e-05, + "loss": 0.8142, + "step": 8724 + }, + { + "epoch": 0.42485330996031456, + "grad_norm": 2.194626569747925, + "learning_rate": 2.575766164518436e-05, + "loss": 0.79, + "step": 8725 + }, + { + "epoch": 0.4249020037494218, + "grad_norm": 1.8127259016036987, + "learning_rate": 2.5754640744128448e-05, + "loss": 0.7915, + "step": 8726 + }, + { + "epoch": 0.424950697538529, + "grad_norm": 1.3149943351745605, + "learning_rate": 2.5751619699925315e-05, + "loss": 0.8794, + "step": 8727 + }, + { + "epoch": 0.42499939132763614, + "grad_norm": 1.4512830972671509, + "learning_rate": 2.5748598512650117e-05, + "loss": 0.7504, + "step": 8728 + }, + { + "epoch": 0.42504808511674336, + "grad_norm": 1.3017783164978027, + "learning_rate": 2.574557718237801e-05, + "loss": 0.8162, + "step": 8729 + }, + { + "epoch": 0.42509677890585057, + "grad_norm": 1.5265095233917236, + "learning_rate": 2.5742555709184146e-05, + "loss": 0.9159, + "step": 8730 + }, + { + "epoch": 0.4251454726949578, + "grad_norm": 1.7975267171859741, + "learning_rate": 2.5739534093143686e-05, + "loss": 0.8503, + "step": 8731 + }, + { + "epoch": 0.42519416648406494, + "grad_norm": 2.137363910675049, + "learning_rate": 2.5736512334331788e-05, + "loss": 0.8462, + "step": 8732 + }, + { + "epoch": 0.42524286027317215, + "grad_norm": 1.746098518371582, + "learning_rate": 2.5733490432823627e-05, + "loss": 0.7723, + "step": 8733 + }, + { + "epoch": 0.42529155406227936, + "grad_norm": 0.08724527806043625, + "learning_rate": 2.573046838869437e-05, + "loss": 0.6187, + "step": 8734 + }, + { + "epoch": 0.4253402478513866, + "grad_norm": 0.09137669950723648, + "learning_rate": 2.5727446202019187e-05, + "loss": 0.6211, + "step": 8735 + }, + { + "epoch": 0.42538894164049373, + "grad_norm": 1.84645414352417, + "learning_rate": 2.5724423872873253e-05, + "loss": 0.8137, + "step": 8736 + }, + { + "epoch": 0.42543763542960095, + "grad_norm": 1.3668798208236694, + "learning_rate": 2.5721401401331756e-05, + "loss": 0.8188, + "step": 8737 + }, + { + "epoch": 0.42548632921870816, + "grad_norm": 0.0950702652335167, + "learning_rate": 2.5718378787469883e-05, + "loss": 0.6153, + "step": 8738 + }, + { + "epoch": 0.42553502300781537, + "grad_norm": 1.5280829668045044, + "learning_rate": 2.5715356031362815e-05, + "loss": 0.9153, + "step": 8739 + }, + { + "epoch": 0.42558371679692253, + "grad_norm": 4.090211868286133, + "learning_rate": 2.5712333133085733e-05, + "loss": 0.8381, + "step": 8740 + }, + { + "epoch": 0.42563241058602974, + "grad_norm": 0.09863752126693726, + "learning_rate": 2.5709310092713853e-05, + "loss": 0.6259, + "step": 8741 + }, + { + "epoch": 0.42568110437513695, + "grad_norm": 1.8097773790359497, + "learning_rate": 2.5706286910322368e-05, + "loss": 0.8402, + "step": 8742 + }, + { + "epoch": 0.42572979816424417, + "grad_norm": 2.2643163204193115, + "learning_rate": 2.570326358598647e-05, + "loss": 0.9745, + "step": 8743 + }, + { + "epoch": 0.4257784919533513, + "grad_norm": 1.2775495052337646, + "learning_rate": 2.5700240119781373e-05, + "loss": 0.8453, + "step": 8744 + }, + { + "epoch": 0.42582718574245854, + "grad_norm": 1.4581667184829712, + "learning_rate": 2.5697216511782283e-05, + "loss": 0.8585, + "step": 8745 + }, + { + "epoch": 0.42587587953156575, + "grad_norm": 2.107599973678589, + "learning_rate": 2.569419276206441e-05, + "loss": 0.7711, + "step": 8746 + }, + { + "epoch": 0.42592457332067296, + "grad_norm": 1.7420532703399658, + "learning_rate": 2.5691168870702976e-05, + "loss": 0.8458, + "step": 8747 + }, + { + "epoch": 0.4259732671097802, + "grad_norm": 2.11181640625, + "learning_rate": 2.568814483777319e-05, + "loss": 0.8179, + "step": 8748 + }, + { + "epoch": 0.42602196089888733, + "grad_norm": 1.5930684804916382, + "learning_rate": 2.568512066335029e-05, + "loss": 0.8116, + "step": 8749 + }, + { + "epoch": 0.42607065468799454, + "grad_norm": 2.0061893463134766, + "learning_rate": 2.5682096347509492e-05, + "loss": 0.824, + "step": 8750 + }, + { + "epoch": 0.42611934847710176, + "grad_norm": 1.6058001518249512, + "learning_rate": 2.5679071890326035e-05, + "loss": 0.8936, + "step": 8751 + }, + { + "epoch": 0.42616804226620897, + "grad_norm": 1.4647743701934814, + "learning_rate": 2.5676047291875144e-05, + "loss": 0.8122, + "step": 8752 + }, + { + "epoch": 0.4262167360553161, + "grad_norm": 1.653992772102356, + "learning_rate": 2.5673022552232058e-05, + "loss": 0.896, + "step": 8753 + }, + { + "epoch": 0.42626542984442334, + "grad_norm": 1.3980939388275146, + "learning_rate": 2.566999767147201e-05, + "loss": 0.8183, + "step": 8754 + }, + { + "epoch": 0.42631412363353055, + "grad_norm": 1.5259907245635986, + "learning_rate": 2.5666972649670266e-05, + "loss": 0.8522, + "step": 8755 + }, + { + "epoch": 0.42636281742263776, + "grad_norm": 1.687159776687622, + "learning_rate": 2.5663947486902055e-05, + "loss": 0.8535, + "step": 8756 + }, + { + "epoch": 0.4264115112117449, + "grad_norm": 2.5924456119537354, + "learning_rate": 2.5660922183242633e-05, + "loss": 0.8308, + "step": 8757 + }, + { + "epoch": 0.42646020500085213, + "grad_norm": 1.3265626430511475, + "learning_rate": 2.565789673876726e-05, + "loss": 0.8712, + "step": 8758 + }, + { + "epoch": 0.42650889878995935, + "grad_norm": 1.5699504613876343, + "learning_rate": 2.5654871153551188e-05, + "loss": 0.8789, + "step": 8759 + }, + { + "epoch": 0.42655759257906656, + "grad_norm": 1.484735369682312, + "learning_rate": 2.565184542766968e-05, + "loss": 0.8256, + "step": 8760 + }, + { + "epoch": 0.4266062863681737, + "grad_norm": 1.5750600099563599, + "learning_rate": 2.5648819561198003e-05, + "loss": 0.8153, + "step": 8761 + }, + { + "epoch": 0.4266549801572809, + "grad_norm": 1.3841803073883057, + "learning_rate": 2.5645793554211416e-05, + "loss": 0.8488, + "step": 8762 + }, + { + "epoch": 0.42670367394638814, + "grad_norm": 1.8198885917663574, + "learning_rate": 2.564276740678521e-05, + "loss": 0.757, + "step": 8763 + }, + { + "epoch": 0.42675236773549535, + "grad_norm": 1.3741201162338257, + "learning_rate": 2.563974111899465e-05, + "loss": 0.8759, + "step": 8764 + }, + { + "epoch": 0.4268010615246025, + "grad_norm": 1.4361002445220947, + "learning_rate": 2.5636714690915014e-05, + "loss": 0.8823, + "step": 8765 + }, + { + "epoch": 0.4268497553137097, + "grad_norm": 1.8140004873275757, + "learning_rate": 2.5633688122621577e-05, + "loss": 0.9311, + "step": 8766 + }, + { + "epoch": 0.42689844910281693, + "grad_norm": 1.6949687004089355, + "learning_rate": 2.5630661414189643e-05, + "loss": 0.7886, + "step": 8767 + }, + { + "epoch": 0.42694714289192415, + "grad_norm": 1.8665717840194702, + "learning_rate": 2.5627634565694496e-05, + "loss": 0.9004, + "step": 8768 + }, + { + "epoch": 0.42699583668103136, + "grad_norm": 1.4190952777862549, + "learning_rate": 2.5624607577211416e-05, + "loss": 0.8443, + "step": 8769 + }, + { + "epoch": 0.4270445304701385, + "grad_norm": 1.4545632600784302, + "learning_rate": 2.562158044881572e-05, + "loss": 0.8147, + "step": 8770 + }, + { + "epoch": 0.42709322425924573, + "grad_norm": 1.3187298774719238, + "learning_rate": 2.5618553180582688e-05, + "loss": 0.7792, + "step": 8771 + }, + { + "epoch": 0.42714191804835294, + "grad_norm": 0.09451797604560852, + "learning_rate": 2.561552577258764e-05, + "loss": 0.5765, + "step": 8772 + }, + { + "epoch": 0.42719061183746015, + "grad_norm": 1.711283564567566, + "learning_rate": 2.5612498224905876e-05, + "loss": 0.859, + "step": 8773 + }, + { + "epoch": 0.4272393056265673, + "grad_norm": 1.3518469333648682, + "learning_rate": 2.5609470537612707e-05, + "loss": 0.7914, + "step": 8774 + }, + { + "epoch": 0.4272879994156745, + "grad_norm": 1.2607300281524658, + "learning_rate": 2.5606442710783443e-05, + "loss": 0.8391, + "step": 8775 + }, + { + "epoch": 0.42733669320478174, + "grad_norm": 6.630612373352051, + "learning_rate": 2.5603414744493406e-05, + "loss": 0.8051, + "step": 8776 + }, + { + "epoch": 0.42738538699388895, + "grad_norm": 2.240861177444458, + "learning_rate": 2.5600386638817915e-05, + "loss": 0.9434, + "step": 8777 + }, + { + "epoch": 0.4274340807829961, + "grad_norm": 1.7536643743515015, + "learning_rate": 2.55973583938323e-05, + "loss": 0.7114, + "step": 8778 + }, + { + "epoch": 0.4274827745721033, + "grad_norm": 1.815700650215149, + "learning_rate": 2.5594330009611888e-05, + "loss": 0.8063, + "step": 8779 + }, + { + "epoch": 0.42753146836121053, + "grad_norm": 2.0552237033843994, + "learning_rate": 2.5591301486232002e-05, + "loss": 0.8293, + "step": 8780 + }, + { + "epoch": 0.42758016215031774, + "grad_norm": 0.09225796163082123, + "learning_rate": 2.5588272823767985e-05, + "loss": 0.6182, + "step": 8781 + }, + { + "epoch": 0.4276288559394249, + "grad_norm": 1.5434621572494507, + "learning_rate": 2.5585244022295168e-05, + "loss": 0.8083, + "step": 8782 + }, + { + "epoch": 0.4276775497285321, + "grad_norm": 0.09253474324941635, + "learning_rate": 2.5582215081888894e-05, + "loss": 0.6538, + "step": 8783 + }, + { + "epoch": 0.4277262435176393, + "grad_norm": 1.4557461738586426, + "learning_rate": 2.5579186002624517e-05, + "loss": 0.7873, + "step": 8784 + }, + { + "epoch": 0.42777493730674654, + "grad_norm": 1.4903979301452637, + "learning_rate": 2.557615678457738e-05, + "loss": 0.8227, + "step": 8785 + }, + { + "epoch": 0.42782363109585375, + "grad_norm": 1.5718008279800415, + "learning_rate": 2.5573127427822824e-05, + "loss": 0.8287, + "step": 8786 + }, + { + "epoch": 0.4278723248849609, + "grad_norm": 1.3408503532409668, + "learning_rate": 2.557009793243623e-05, + "loss": 0.8173, + "step": 8787 + }, + { + "epoch": 0.4279210186740681, + "grad_norm": 2.4195194244384766, + "learning_rate": 2.5567068298492937e-05, + "loss": 0.9911, + "step": 8788 + }, + { + "epoch": 0.42796971246317533, + "grad_norm": 1.8393492698669434, + "learning_rate": 2.5564038526068306e-05, + "loss": 0.9154, + "step": 8789 + }, + { + "epoch": 0.42801840625228255, + "grad_norm": 3.363135814666748, + "learning_rate": 2.5561008615237716e-05, + "loss": 0.8182, + "step": 8790 + }, + { + "epoch": 0.4280671000413897, + "grad_norm": 1.8112949132919312, + "learning_rate": 2.5557978566076528e-05, + "loss": 0.8895, + "step": 8791 + }, + { + "epoch": 0.4281157938304969, + "grad_norm": 1.4307904243469238, + "learning_rate": 2.5554948378660115e-05, + "loss": 0.849, + "step": 8792 + }, + { + "epoch": 0.42816448761960413, + "grad_norm": 6.7971510887146, + "learning_rate": 2.5551918053063857e-05, + "loss": 0.7943, + "step": 8793 + }, + { + "epoch": 0.42821318140871134, + "grad_norm": 1.9590535163879395, + "learning_rate": 2.5548887589363132e-05, + "loss": 0.9447, + "step": 8794 + }, + { + "epoch": 0.4282618751978185, + "grad_norm": 1.6697156429290771, + "learning_rate": 2.554585698763332e-05, + "loss": 0.7776, + "step": 8795 + }, + { + "epoch": 0.4283105689869257, + "grad_norm": 1.5004316568374634, + "learning_rate": 2.5542826247949806e-05, + "loss": 0.8751, + "step": 8796 + }, + { + "epoch": 0.4283592627760329, + "grad_norm": 1.5444902181625366, + "learning_rate": 2.553979537038799e-05, + "loss": 0.8605, + "step": 8797 + }, + { + "epoch": 0.42840795656514014, + "grad_norm": 1.6768392324447632, + "learning_rate": 2.5536764355023252e-05, + "loss": 0.7891, + "step": 8798 + }, + { + "epoch": 0.4284566503542473, + "grad_norm": 1.3623613119125366, + "learning_rate": 2.5533733201931004e-05, + "loss": 0.8233, + "step": 8799 + }, + { + "epoch": 0.4285053441433545, + "grad_norm": 2.4738402366638184, + "learning_rate": 2.5530701911186632e-05, + "loss": 0.9499, + "step": 8800 + }, + { + "epoch": 0.4285540379324617, + "grad_norm": 1.811847448348999, + "learning_rate": 2.552767048286555e-05, + "loss": 0.813, + "step": 8801 + }, + { + "epoch": 0.42860273172156893, + "grad_norm": 1.2560476064682007, + "learning_rate": 2.5524638917043158e-05, + "loss": 0.8511, + "step": 8802 + }, + { + "epoch": 0.4286514255106761, + "grad_norm": 1.4043450355529785, + "learning_rate": 2.5521607213794872e-05, + "loss": 0.8623, + "step": 8803 + }, + { + "epoch": 0.4287001192997833, + "grad_norm": 0.08567678183317184, + "learning_rate": 2.5518575373196104e-05, + "loss": 0.5939, + "step": 8804 + }, + { + "epoch": 0.4287488130888905, + "grad_norm": 1.6947399377822876, + "learning_rate": 2.5515543395322265e-05, + "loss": 0.8909, + "step": 8805 + }, + { + "epoch": 0.4287975068779977, + "grad_norm": 1.8249790668487549, + "learning_rate": 2.5512511280248776e-05, + "loss": 0.8635, + "step": 8806 + }, + { + "epoch": 0.42884620066710494, + "grad_norm": 1.304720163345337, + "learning_rate": 2.550947902805108e-05, + "loss": 0.8094, + "step": 8807 + }, + { + "epoch": 0.4288948944562121, + "grad_norm": 1.5157935619354248, + "learning_rate": 2.5506446638804584e-05, + "loss": 0.9044, + "step": 8808 + }, + { + "epoch": 0.4289435882453193, + "grad_norm": 1.569462776184082, + "learning_rate": 2.5503414112584728e-05, + "loss": 0.7897, + "step": 8809 + }, + { + "epoch": 0.4289922820344265, + "grad_norm": 1.6917450428009033, + "learning_rate": 2.5500381449466946e-05, + "loss": 0.8337, + "step": 8810 + }, + { + "epoch": 0.42904097582353373, + "grad_norm": 1.6711941957473755, + "learning_rate": 2.549734864952667e-05, + "loss": 0.8477, + "step": 8811 + }, + { + "epoch": 0.4290896696126409, + "grad_norm": 1.3657015562057495, + "learning_rate": 2.5494315712839342e-05, + "loss": 0.9074, + "step": 8812 + }, + { + "epoch": 0.4291383634017481, + "grad_norm": 1.3608561754226685, + "learning_rate": 2.549128263948042e-05, + "loss": 0.8557, + "step": 8813 + }, + { + "epoch": 0.4291870571908553, + "grad_norm": 1.6411038637161255, + "learning_rate": 2.5488249429525334e-05, + "loss": 0.9432, + "step": 8814 + }, + { + "epoch": 0.4292357509799625, + "grad_norm": 1.6151937246322632, + "learning_rate": 2.5485216083049542e-05, + "loss": 0.818, + "step": 8815 + }, + { + "epoch": 0.4292844447690697, + "grad_norm": 2.076350450515747, + "learning_rate": 2.54821826001285e-05, + "loss": 0.8051, + "step": 8816 + }, + { + "epoch": 0.4293331385581769, + "grad_norm": 1.5295758247375488, + "learning_rate": 2.547914898083767e-05, + "loss": 0.8136, + "step": 8817 + }, + { + "epoch": 0.4293818323472841, + "grad_norm": 2.912964105606079, + "learning_rate": 2.5476115225252508e-05, + "loss": 0.9358, + "step": 8818 + }, + { + "epoch": 0.4294305261363913, + "grad_norm": 1.9962294101715088, + "learning_rate": 2.5473081333448485e-05, + "loss": 0.836, + "step": 8819 + }, + { + "epoch": 0.4294792199254985, + "grad_norm": 1.4240939617156982, + "learning_rate": 2.5470047305501054e-05, + "loss": 0.8694, + "step": 8820 + }, + { + "epoch": 0.4295279137146057, + "grad_norm": 1.7148338556289673, + "learning_rate": 2.5467013141485704e-05, + "loss": 0.8893, + "step": 8821 + }, + { + "epoch": 0.4295766075037129, + "grad_norm": 2.7604849338531494, + "learning_rate": 2.5463978841477904e-05, + "loss": 0.8987, + "step": 8822 + }, + { + "epoch": 0.4296253012928201, + "grad_norm": 1.2962324619293213, + "learning_rate": 2.5460944405553133e-05, + "loss": 0.7989, + "step": 8823 + }, + { + "epoch": 0.4296739950819273, + "grad_norm": 1.5771980285644531, + "learning_rate": 2.5457909833786874e-05, + "loss": 0.8139, + "step": 8824 + }, + { + "epoch": 0.4297226888710345, + "grad_norm": 1.7433308362960815, + "learning_rate": 2.5454875126254605e-05, + "loss": 0.8821, + "step": 8825 + }, + { + "epoch": 0.4297713826601417, + "grad_norm": 1.3272699117660522, + "learning_rate": 2.5451840283031823e-05, + "loss": 0.7972, + "step": 8826 + }, + { + "epoch": 0.4298200764492489, + "grad_norm": 1.444543719291687, + "learning_rate": 2.544880530419401e-05, + "loss": 0.904, + "step": 8827 + }, + { + "epoch": 0.4298687702383561, + "grad_norm": 2.1000185012817383, + "learning_rate": 2.5445770189816676e-05, + "loss": 0.8034, + "step": 8828 + }, + { + "epoch": 0.4299174640274633, + "grad_norm": 1.257603645324707, + "learning_rate": 2.5442734939975307e-05, + "loss": 0.8401, + "step": 8829 + }, + { + "epoch": 0.4299661578165705, + "grad_norm": 1.7120968103408813, + "learning_rate": 2.5439699554745415e-05, + "loss": 0.8477, + "step": 8830 + }, + { + "epoch": 0.4300148516056777, + "grad_norm": 1.4992529153823853, + "learning_rate": 2.5436664034202497e-05, + "loss": 0.8964, + "step": 8831 + }, + { + "epoch": 0.4300635453947849, + "grad_norm": 1.2794009447097778, + "learning_rate": 2.543362837842207e-05, + "loss": 0.8811, + "step": 8832 + }, + { + "epoch": 0.4301122391838921, + "grad_norm": 1.6210306882858276, + "learning_rate": 2.543059258747964e-05, + "loss": 0.7618, + "step": 8833 + }, + { + "epoch": 0.4301609329729993, + "grad_norm": 1.9201561212539673, + "learning_rate": 2.542755666145072e-05, + "loss": 0.9406, + "step": 8834 + }, + { + "epoch": 0.4302096267621065, + "grad_norm": 1.4698357582092285, + "learning_rate": 2.5424520600410833e-05, + "loss": 0.7748, + "step": 8835 + }, + { + "epoch": 0.4302583205512137, + "grad_norm": 1.8394055366516113, + "learning_rate": 2.5421484404435505e-05, + "loss": 0.8773, + "step": 8836 + }, + { + "epoch": 0.43030701434032087, + "grad_norm": 1.53221595287323, + "learning_rate": 2.541844807360026e-05, + "loss": 0.8297, + "step": 8837 + }, + { + "epoch": 0.4303557081294281, + "grad_norm": 1.3300440311431885, + "learning_rate": 2.5415411607980623e-05, + "loss": 0.8553, + "step": 8838 + }, + { + "epoch": 0.4304044019185353, + "grad_norm": 1.5161519050598145, + "learning_rate": 2.541237500765213e-05, + "loss": 0.9062, + "step": 8839 + }, + { + "epoch": 0.4304530957076425, + "grad_norm": 1.6228229999542236, + "learning_rate": 2.5409338272690312e-05, + "loss": 0.8566, + "step": 8840 + }, + { + "epoch": 0.43050178949674966, + "grad_norm": 1.6773267984390259, + "learning_rate": 2.540630140317071e-05, + "loss": 0.7352, + "step": 8841 + }, + { + "epoch": 0.4305504832858569, + "grad_norm": 1.2857354879379272, + "learning_rate": 2.540326439916887e-05, + "loss": 0.8002, + "step": 8842 + }, + { + "epoch": 0.4305991770749641, + "grad_norm": 2.272439479827881, + "learning_rate": 2.5400227260760338e-05, + "loss": 0.8509, + "step": 8843 + }, + { + "epoch": 0.4306478708640713, + "grad_norm": 1.6282216310501099, + "learning_rate": 2.5397189988020658e-05, + "loss": 0.8603, + "step": 8844 + }, + { + "epoch": 0.43069656465317846, + "grad_norm": 1.544716715812683, + "learning_rate": 2.539415258102538e-05, + "loss": 0.7432, + "step": 8845 + }, + { + "epoch": 0.4307452584422857, + "grad_norm": 2.8639633655548096, + "learning_rate": 2.5391115039850074e-05, + "loss": 0.7183, + "step": 8846 + }, + { + "epoch": 0.4307939522313929, + "grad_norm": 1.3915135860443115, + "learning_rate": 2.5388077364570288e-05, + "loss": 0.7429, + "step": 8847 + }, + { + "epoch": 0.4308426460205001, + "grad_norm": 1.6849963665008545, + "learning_rate": 2.5385039555261585e-05, + "loss": 0.8469, + "step": 8848 + }, + { + "epoch": 0.4308913398096073, + "grad_norm": 0.09336161613464355, + "learning_rate": 2.5382001611999524e-05, + "loss": 0.6937, + "step": 8849 + }, + { + "epoch": 0.43094003359871447, + "grad_norm": 1.2369632720947266, + "learning_rate": 2.5378963534859694e-05, + "loss": 0.903, + "step": 8850 + }, + { + "epoch": 0.4309887273878217, + "grad_norm": 1.5873889923095703, + "learning_rate": 2.537592532391765e-05, + "loss": 0.9263, + "step": 8851 + }, + { + "epoch": 0.4310374211769289, + "grad_norm": 1.6812551021575928, + "learning_rate": 2.5372886979248977e-05, + "loss": 0.7782, + "step": 8852 + }, + { + "epoch": 0.4310861149660361, + "grad_norm": 3.978860378265381, + "learning_rate": 2.5369848500929248e-05, + "loss": 0.8692, + "step": 8853 + }, + { + "epoch": 0.43113480875514326, + "grad_norm": 1.4252278804779053, + "learning_rate": 2.536680988903405e-05, + "loss": 0.8406, + "step": 8854 + }, + { + "epoch": 0.4311835025442505, + "grad_norm": 1.3216665983200073, + "learning_rate": 2.5363771143638963e-05, + "loss": 0.8666, + "step": 8855 + }, + { + "epoch": 0.4312321963333577, + "grad_norm": 1.19400155544281, + "learning_rate": 2.536073226481958e-05, + "loss": 0.8602, + "step": 8856 + }, + { + "epoch": 0.4312808901224649, + "grad_norm": 1.2435723543167114, + "learning_rate": 2.535769325265149e-05, + "loss": 0.8526, + "step": 8857 + }, + { + "epoch": 0.43132958391157206, + "grad_norm": 1.4686042070388794, + "learning_rate": 2.5354654107210296e-05, + "loss": 0.8181, + "step": 8858 + }, + { + "epoch": 0.43137827770067927, + "grad_norm": 1.2935283184051514, + "learning_rate": 2.5351614828571594e-05, + "loss": 0.8111, + "step": 8859 + }, + { + "epoch": 0.4314269714897865, + "grad_norm": 0.08980964124202728, + "learning_rate": 2.5348575416810986e-05, + "loss": 0.5626, + "step": 8860 + }, + { + "epoch": 0.4314756652788937, + "grad_norm": 1.3982094526290894, + "learning_rate": 2.5345535872004074e-05, + "loss": 0.8473, + "step": 8861 + }, + { + "epoch": 0.43152435906800085, + "grad_norm": 1.5679198503494263, + "learning_rate": 2.5342496194226477e-05, + "loss": 0.9485, + "step": 8862 + }, + { + "epoch": 0.43157305285710806, + "grad_norm": 1.4318735599517822, + "learning_rate": 2.5339456383553788e-05, + "loss": 0.7483, + "step": 8863 + }, + { + "epoch": 0.4316217466462153, + "grad_norm": 1.767418384552002, + "learning_rate": 2.5336416440061643e-05, + "loss": 0.871, + "step": 8864 + }, + { + "epoch": 0.4316704404353225, + "grad_norm": 1.6835427284240723, + "learning_rate": 2.5333376363825647e-05, + "loss": 0.8136, + "step": 8865 + }, + { + "epoch": 0.4317191342244297, + "grad_norm": 0.08598041534423828, + "learning_rate": 2.5330336154921434e-05, + "loss": 0.6434, + "step": 8866 + }, + { + "epoch": 0.43176782801353686, + "grad_norm": 1.8922709226608276, + "learning_rate": 2.5327295813424618e-05, + "loss": 0.925, + "step": 8867 + }, + { + "epoch": 0.43181652180264407, + "grad_norm": 1.3414884805679321, + "learning_rate": 2.5324255339410836e-05, + "loss": 0.8374, + "step": 8868 + }, + { + "epoch": 0.4318652155917513, + "grad_norm": 1.1794679164886475, + "learning_rate": 2.5321214732955718e-05, + "loss": 0.8155, + "step": 8869 + }, + { + "epoch": 0.4319139093808585, + "grad_norm": 1.1338295936584473, + "learning_rate": 2.5318173994134893e-05, + "loss": 0.8251, + "step": 8870 + }, + { + "epoch": 0.43196260316996565, + "grad_norm": 0.09184038639068604, + "learning_rate": 2.531513312302401e-05, + "loss": 0.6081, + "step": 8871 + }, + { + "epoch": 0.43201129695907287, + "grad_norm": 0.09097485989332199, + "learning_rate": 2.5312092119698704e-05, + "loss": 0.5992, + "step": 8872 + }, + { + "epoch": 0.4320599907481801, + "grad_norm": 1.6251972913742065, + "learning_rate": 2.5309050984234624e-05, + "loss": 0.7661, + "step": 8873 + }, + { + "epoch": 0.4321086845372873, + "grad_norm": 1.8025087118148804, + "learning_rate": 2.5306009716707417e-05, + "loss": 0.8053, + "step": 8874 + }, + { + "epoch": 0.43215737832639445, + "grad_norm": 1.3085553646087646, + "learning_rate": 2.5302968317192728e-05, + "loss": 0.9687, + "step": 8875 + }, + { + "epoch": 0.43220607211550166, + "grad_norm": 1.4116843938827515, + "learning_rate": 2.5299926785766223e-05, + "loss": 0.8363, + "step": 8876 + }, + { + "epoch": 0.4322547659046089, + "grad_norm": 1.1679974794387817, + "learning_rate": 2.5296885122503557e-05, + "loss": 0.7926, + "step": 8877 + }, + { + "epoch": 0.4323034596937161, + "grad_norm": 1.571160078048706, + "learning_rate": 2.5293843327480386e-05, + "loss": 0.9495, + "step": 8878 + }, + { + "epoch": 0.43235215348282324, + "grad_norm": 1.860607385635376, + "learning_rate": 2.5290801400772383e-05, + "loss": 0.8561, + "step": 8879 + }, + { + "epoch": 0.43240084727193046, + "grad_norm": 1.466031789779663, + "learning_rate": 2.5287759342455217e-05, + "loss": 0.8257, + "step": 8880 + }, + { + "epoch": 0.43244954106103767, + "grad_norm": 1.362182378768921, + "learning_rate": 2.528471715260455e-05, + "loss": 0.809, + "step": 8881 + }, + { + "epoch": 0.4324982348501449, + "grad_norm": 1.5760611295700073, + "learning_rate": 2.5281674831296066e-05, + "loss": 0.7543, + "step": 8882 + }, + { + "epoch": 0.43254692863925204, + "grad_norm": 1.4113860130310059, + "learning_rate": 2.5278632378605438e-05, + "loss": 0.8459, + "step": 8883 + }, + { + "epoch": 0.43259562242835925, + "grad_norm": 1.2933356761932373, + "learning_rate": 2.5275589794608342e-05, + "loss": 0.8093, + "step": 8884 + }, + { + "epoch": 0.43264431621746646, + "grad_norm": 2.934589385986328, + "learning_rate": 2.5272547079380476e-05, + "loss": 0.7174, + "step": 8885 + }, + { + "epoch": 0.4326930100065737, + "grad_norm": 0.09152834862470627, + "learning_rate": 2.5269504232997514e-05, + "loss": 0.6139, + "step": 8886 + }, + { + "epoch": 0.4327417037956809, + "grad_norm": 1.467466950416565, + "learning_rate": 2.526646125553516e-05, + "loss": 0.9141, + "step": 8887 + }, + { + "epoch": 0.43279039758478804, + "grad_norm": 1.4984197616577148, + "learning_rate": 2.5263418147069096e-05, + "loss": 0.9501, + "step": 8888 + }, + { + "epoch": 0.43283909137389526, + "grad_norm": 2.477306604385376, + "learning_rate": 2.5260374907675033e-05, + "loss": 0.9693, + "step": 8889 + }, + { + "epoch": 0.43288778516300247, + "grad_norm": 2.6987526416778564, + "learning_rate": 2.5257331537428664e-05, + "loss": 0.8161, + "step": 8890 + }, + { + "epoch": 0.4329364789521097, + "grad_norm": 2.0845859050750732, + "learning_rate": 2.5254288036405693e-05, + "loss": 0.8528, + "step": 8891 + }, + { + "epoch": 0.43298517274121684, + "grad_norm": 1.2539469003677368, + "learning_rate": 2.5251244404681822e-05, + "loss": 0.8033, + "step": 8892 + }, + { + "epoch": 0.43303386653032405, + "grad_norm": 1.8321186304092407, + "learning_rate": 2.524820064233277e-05, + "loss": 0.9155, + "step": 8893 + }, + { + "epoch": 0.43308256031943126, + "grad_norm": 1.891882061958313, + "learning_rate": 2.524515674943425e-05, + "loss": 0.7982, + "step": 8894 + }, + { + "epoch": 0.4331312541085385, + "grad_norm": 1.990443468093872, + "learning_rate": 2.5242112726061975e-05, + "loss": 0.8708, + "step": 8895 + }, + { + "epoch": 0.43317994789764563, + "grad_norm": 1.586868405342102, + "learning_rate": 2.523906857229167e-05, + "loss": 0.8276, + "step": 8896 + }, + { + "epoch": 0.43322864168675285, + "grad_norm": 1.8245576620101929, + "learning_rate": 2.5236024288199063e-05, + "loss": 0.8122, + "step": 8897 + }, + { + "epoch": 0.43327733547586006, + "grad_norm": 1.4855201244354248, + "learning_rate": 2.523297987385987e-05, + "loss": 0.9133, + "step": 8898 + }, + { + "epoch": 0.43332602926496727, + "grad_norm": 1.5146011114120483, + "learning_rate": 2.522993532934982e-05, + "loss": 0.8376, + "step": 8899 + }, + { + "epoch": 0.43337472305407443, + "grad_norm": 1.359104871749878, + "learning_rate": 2.5226890654744657e-05, + "loss": 0.8473, + "step": 8900 + }, + { + "epoch": 0.43342341684318164, + "grad_norm": 1.7259652614593506, + "learning_rate": 2.5223845850120112e-05, + "loss": 0.8116, + "step": 8901 + }, + { + "epoch": 0.43347211063228885, + "grad_norm": 1.5376554727554321, + "learning_rate": 2.5220800915551923e-05, + "loss": 0.8427, + "step": 8902 + }, + { + "epoch": 0.43352080442139607, + "grad_norm": 1.5314372777938843, + "learning_rate": 2.521775585111584e-05, + "loss": 0.9007, + "step": 8903 + }, + { + "epoch": 0.4335694982105032, + "grad_norm": 2.281812906265259, + "learning_rate": 2.5214710656887605e-05, + "loss": 0.7873, + "step": 8904 + }, + { + "epoch": 0.43361819199961044, + "grad_norm": 1.4548258781433105, + "learning_rate": 2.5211665332942955e-05, + "loss": 0.8715, + "step": 8905 + }, + { + "epoch": 0.43366688578871765, + "grad_norm": 1.7192388772964478, + "learning_rate": 2.520861987935767e-05, + "loss": 0.788, + "step": 8906 + }, + { + "epoch": 0.43371557957782486, + "grad_norm": 1.464473009109497, + "learning_rate": 2.520557429620748e-05, + "loss": 0.9633, + "step": 8907 + }, + { + "epoch": 0.4337642733669321, + "grad_norm": 1.4988652467727661, + "learning_rate": 2.5202528583568155e-05, + "loss": 0.7934, + "step": 8908 + }, + { + "epoch": 0.43381296715603923, + "grad_norm": 1.8793941736221313, + "learning_rate": 2.5199482741515463e-05, + "loss": 0.8108, + "step": 8909 + }, + { + "epoch": 0.43386166094514644, + "grad_norm": 1.8669925928115845, + "learning_rate": 2.5196436770125158e-05, + "loss": 0.9079, + "step": 8910 + }, + { + "epoch": 0.43391035473425366, + "grad_norm": 2.2970283031463623, + "learning_rate": 2.519339066947302e-05, + "loss": 0.792, + "step": 8911 + }, + { + "epoch": 0.43395904852336087, + "grad_norm": 7.836113452911377, + "learning_rate": 2.5190344439634818e-05, + "loss": 0.8073, + "step": 8912 + }, + { + "epoch": 0.434007742312468, + "grad_norm": 1.3576936721801758, + "learning_rate": 2.5187298080686312e-05, + "loss": 0.8167, + "step": 8913 + }, + { + "epoch": 0.43405643610157524, + "grad_norm": 2.1381096839904785, + "learning_rate": 2.5184251592703307e-05, + "loss": 0.8384, + "step": 8914 + }, + { + "epoch": 0.43410512989068245, + "grad_norm": 1.7258466482162476, + "learning_rate": 2.5181204975761562e-05, + "loss": 0.8019, + "step": 8915 + }, + { + "epoch": 0.43415382367978966, + "grad_norm": 1.8308218717575073, + "learning_rate": 2.5178158229936872e-05, + "loss": 0.912, + "step": 8916 + }, + { + "epoch": 0.4342025174688968, + "grad_norm": 1.374101996421814, + "learning_rate": 2.5175111355305028e-05, + "loss": 0.8521, + "step": 8917 + }, + { + "epoch": 0.43425121125800403, + "grad_norm": 1.8239331245422363, + "learning_rate": 2.5172064351941818e-05, + "loss": 0.8601, + "step": 8918 + }, + { + "epoch": 0.43429990504711125, + "grad_norm": 1.5263303518295288, + "learning_rate": 2.5169017219923034e-05, + "loss": 0.9609, + "step": 8919 + }, + { + "epoch": 0.43434859883621846, + "grad_norm": 1.347458839416504, + "learning_rate": 2.5165969959324472e-05, + "loss": 0.7743, + "step": 8920 + }, + { + "epoch": 0.4343972926253256, + "grad_norm": 1.5403072834014893, + "learning_rate": 2.516292257022194e-05, + "loss": 0.789, + "step": 8921 + }, + { + "epoch": 0.4344459864144328, + "grad_norm": 0.08998275548219681, + "learning_rate": 2.5159875052691237e-05, + "loss": 0.6478, + "step": 8922 + }, + { + "epoch": 0.43449468020354004, + "grad_norm": 1.7955695390701294, + "learning_rate": 2.515682740680817e-05, + "loss": 0.8822, + "step": 8923 + }, + { + "epoch": 0.43454337399264725, + "grad_norm": 2.1214122772216797, + "learning_rate": 2.5153779632648555e-05, + "loss": 0.7968, + "step": 8924 + }, + { + "epoch": 0.4345920677817544, + "grad_norm": 1.551060438156128, + "learning_rate": 2.5150731730288196e-05, + "loss": 0.8931, + "step": 8925 + }, + { + "epoch": 0.4346407615708616, + "grad_norm": 1.1999695301055908, + "learning_rate": 2.514768369980292e-05, + "loss": 0.8341, + "step": 8926 + }, + { + "epoch": 0.43468945535996883, + "grad_norm": 2.2331106662750244, + "learning_rate": 2.514463554126854e-05, + "loss": 0.8584, + "step": 8927 + }, + { + "epoch": 0.43473814914907605, + "grad_norm": 1.2639479637145996, + "learning_rate": 2.5141587254760884e-05, + "loss": 0.8705, + "step": 8928 + }, + { + "epoch": 0.43478684293818326, + "grad_norm": 1.3905547857284546, + "learning_rate": 2.5138538840355774e-05, + "loss": 0.8633, + "step": 8929 + }, + { + "epoch": 0.4348355367272904, + "grad_norm": 1.888069987297058, + "learning_rate": 2.5135490298129047e-05, + "loss": 0.8554, + "step": 8930 + }, + { + "epoch": 0.43488423051639763, + "grad_norm": 1.541144847869873, + "learning_rate": 2.5132441628156526e-05, + "loss": 0.8123, + "step": 8931 + }, + { + "epoch": 0.43493292430550484, + "grad_norm": 1.6242971420288086, + "learning_rate": 2.512939283051405e-05, + "loss": 0.7854, + "step": 8932 + }, + { + "epoch": 0.43498161809461205, + "grad_norm": 2.298992156982422, + "learning_rate": 2.5126343905277465e-05, + "loss": 0.7775, + "step": 8933 + }, + { + "epoch": 0.4350303118837192, + "grad_norm": 3.8806002140045166, + "learning_rate": 2.5123294852522602e-05, + "loss": 0.7932, + "step": 8934 + }, + { + "epoch": 0.4350790056728264, + "grad_norm": 1.375859022140503, + "learning_rate": 2.5120245672325318e-05, + "loss": 0.9091, + "step": 8935 + }, + { + "epoch": 0.43512769946193364, + "grad_norm": 2.3449501991271973, + "learning_rate": 2.511719636476145e-05, + "loss": 0.8478, + "step": 8936 + }, + { + "epoch": 0.43517639325104085, + "grad_norm": 1.2988572120666504, + "learning_rate": 2.5114146929906857e-05, + "loss": 0.8155, + "step": 8937 + }, + { + "epoch": 0.435225087040148, + "grad_norm": 1.2093335390090942, + "learning_rate": 2.51110973678374e-05, + "loss": 0.9013, + "step": 8938 + }, + { + "epoch": 0.4352737808292552, + "grad_norm": 2.8277230262756348, + "learning_rate": 2.5108047678628922e-05, + "loss": 0.8335, + "step": 8939 + }, + { + "epoch": 0.43532247461836243, + "grad_norm": 1.4797964096069336, + "learning_rate": 2.5104997862357297e-05, + "loss": 0.9095, + "step": 8940 + }, + { + "epoch": 0.43537116840746964, + "grad_norm": 2.084895372390747, + "learning_rate": 2.5101947919098385e-05, + "loss": 0.8962, + "step": 8941 + }, + { + "epoch": 0.4354198621965768, + "grad_norm": 1.587714433670044, + "learning_rate": 2.5098897848928054e-05, + "loss": 0.8926, + "step": 8942 + }, + { + "epoch": 0.435468555985684, + "grad_norm": 1.8292250633239746, + "learning_rate": 2.5095847651922167e-05, + "loss": 0.8489, + "step": 8943 + }, + { + "epoch": 0.4355172497747912, + "grad_norm": 1.4940950870513916, + "learning_rate": 2.509279732815661e-05, + "loss": 0.862, + "step": 8944 + }, + { + "epoch": 0.43556594356389844, + "grad_norm": 1.3503479957580566, + "learning_rate": 2.5089746877707253e-05, + "loss": 0.8513, + "step": 8945 + }, + { + "epoch": 0.43561463735300565, + "grad_norm": 3.1872785091400146, + "learning_rate": 2.5086696300649985e-05, + "loss": 0.8966, + "step": 8946 + }, + { + "epoch": 0.4356633311421128, + "grad_norm": 1.514809489250183, + "learning_rate": 2.5083645597060678e-05, + "loss": 0.8579, + "step": 8947 + }, + { + "epoch": 0.43571202493122, + "grad_norm": 1.5496888160705566, + "learning_rate": 2.5080594767015224e-05, + "loss": 0.822, + "step": 8948 + }, + { + "epoch": 0.43576071872032723, + "grad_norm": 1.9638248682022095, + "learning_rate": 2.507754381058951e-05, + "loss": 0.8534, + "step": 8949 + }, + { + "epoch": 0.43580941250943445, + "grad_norm": 3.3835606575012207, + "learning_rate": 2.5074492727859427e-05, + "loss": 0.8883, + "step": 8950 + }, + { + "epoch": 0.4358581062985416, + "grad_norm": 2.80151104927063, + "learning_rate": 2.5071441518900885e-05, + "loss": 0.7203, + "step": 8951 + }, + { + "epoch": 0.4359068000876488, + "grad_norm": 2.017306089401245, + "learning_rate": 2.5068390183789766e-05, + "loss": 0.8265, + "step": 8952 + }, + { + "epoch": 0.43595549387675603, + "grad_norm": 1.760183572769165, + "learning_rate": 2.506533872260198e-05, + "loss": 0.7766, + "step": 8953 + }, + { + "epoch": 0.43600418766586324, + "grad_norm": 1.5572973489761353, + "learning_rate": 2.5062287135413433e-05, + "loss": 0.7649, + "step": 8954 + }, + { + "epoch": 0.4360528814549704, + "grad_norm": 1.4951616525650024, + "learning_rate": 2.505923542230003e-05, + "loss": 0.7491, + "step": 8955 + }, + { + "epoch": 0.4361015752440776, + "grad_norm": 1.8415806293487549, + "learning_rate": 2.505618358333769e-05, + "loss": 0.8358, + "step": 8956 + }, + { + "epoch": 0.4361502690331848, + "grad_norm": 1.5185520648956299, + "learning_rate": 2.5053131618602314e-05, + "loss": 0.7713, + "step": 8957 + }, + { + "epoch": 0.43619896282229204, + "grad_norm": 1.4479020833969116, + "learning_rate": 2.505007952816983e-05, + "loss": 0.8832, + "step": 8958 + }, + { + "epoch": 0.4362476566113992, + "grad_norm": 1.8471105098724365, + "learning_rate": 2.504702731211616e-05, + "loss": 0.8368, + "step": 8959 + }, + { + "epoch": 0.4362963504005064, + "grad_norm": 0.09087859094142914, + "learning_rate": 2.5043974970517227e-05, + "loss": 0.6137, + "step": 8960 + }, + { + "epoch": 0.4363450441896136, + "grad_norm": 3.39837646484375, + "learning_rate": 2.5040922503448953e-05, + "loss": 0.82, + "step": 8961 + }, + { + "epoch": 0.43639373797872083, + "grad_norm": 1.9210209846496582, + "learning_rate": 2.5037869910987277e-05, + "loss": 0.8018, + "step": 8962 + }, + { + "epoch": 0.436442431767828, + "grad_norm": 2.1305325031280518, + "learning_rate": 2.5034817193208123e-05, + "loss": 0.8646, + "step": 8963 + }, + { + "epoch": 0.4364911255569352, + "grad_norm": 2.1951894760131836, + "learning_rate": 2.5031764350187433e-05, + "loss": 0.869, + "step": 8964 + }, + { + "epoch": 0.4365398193460424, + "grad_norm": 1.6869934797286987, + "learning_rate": 2.502871138200114e-05, + "loss": 0.8475, + "step": 8965 + }, + { + "epoch": 0.4365885131351496, + "grad_norm": 1.6267915964126587, + "learning_rate": 2.5025658288725202e-05, + "loss": 0.8914, + "step": 8966 + }, + { + "epoch": 0.43663720692425684, + "grad_norm": 1.3413550853729248, + "learning_rate": 2.5022605070435554e-05, + "loss": 0.8634, + "step": 8967 + }, + { + "epoch": 0.436685900713364, + "grad_norm": 1.4413172006607056, + "learning_rate": 2.501955172720815e-05, + "loss": 0.7909, + "step": 8968 + }, + { + "epoch": 0.4367345945024712, + "grad_norm": 1.3796155452728271, + "learning_rate": 2.5016498259118933e-05, + "loss": 0.7839, + "step": 8969 + }, + { + "epoch": 0.4367832882915784, + "grad_norm": 2.0255346298217773, + "learning_rate": 2.501344466624387e-05, + "loss": 0.9093, + "step": 8970 + }, + { + "epoch": 0.43683198208068563, + "grad_norm": 1.4001437425613403, + "learning_rate": 2.501039094865891e-05, + "loss": 0.8852, + "step": 8971 + }, + { + "epoch": 0.4368806758697928, + "grad_norm": 1.2783544063568115, + "learning_rate": 2.5007337106440015e-05, + "loss": 0.8557, + "step": 8972 + }, + { + "epoch": 0.4369293696589, + "grad_norm": 1.6450200080871582, + "learning_rate": 2.5004283139663157e-05, + "loss": 0.8699, + "step": 8973 + }, + { + "epoch": 0.4369780634480072, + "grad_norm": 2.805370330810547, + "learning_rate": 2.5001229048404297e-05, + "loss": 0.8626, + "step": 8974 + }, + { + "epoch": 0.4370267572371144, + "grad_norm": 1.6008268594741821, + "learning_rate": 2.499817483273941e-05, + "loss": 0.8019, + "step": 8975 + }, + { + "epoch": 0.4370754510262216, + "grad_norm": 1.3171576261520386, + "learning_rate": 2.499512049274447e-05, + "loss": 0.8967, + "step": 8976 + }, + { + "epoch": 0.4371241448153288, + "grad_norm": 1.6439231634140015, + "learning_rate": 2.4992066028495457e-05, + "loss": 0.9296, + "step": 8977 + }, + { + "epoch": 0.437172838604436, + "grad_norm": 1.7555506229400635, + "learning_rate": 2.4989011440068342e-05, + "loss": 0.8766, + "step": 8978 + }, + { + "epoch": 0.4372215323935432, + "grad_norm": 1.9857321977615356, + "learning_rate": 2.4985956727539103e-05, + "loss": 0.8206, + "step": 8979 + }, + { + "epoch": 0.4372702261826504, + "grad_norm": 1.9376883506774902, + "learning_rate": 2.498290189098375e-05, + "loss": 0.8409, + "step": 8980 + }, + { + "epoch": 0.4373189199717576, + "grad_norm": 1.5604819059371948, + "learning_rate": 2.4979846930478252e-05, + "loss": 0.8511, + "step": 8981 + }, + { + "epoch": 0.4373676137608648, + "grad_norm": 2.666748046875, + "learning_rate": 2.4976791846098605e-05, + "loss": 0.8047, + "step": 8982 + }, + { + "epoch": 0.437416307549972, + "grad_norm": 1.2319252490997314, + "learning_rate": 2.4973736637920817e-05, + "loss": 0.765, + "step": 8983 + }, + { + "epoch": 0.4374650013390792, + "grad_norm": 2.270679473876953, + "learning_rate": 2.4970681306020863e-05, + "loss": 0.8202, + "step": 8984 + }, + { + "epoch": 0.4375136951281864, + "grad_norm": 0.08891207724809647, + "learning_rate": 2.4967625850474767e-05, + "loss": 0.5696, + "step": 8985 + }, + { + "epoch": 0.4375623889172936, + "grad_norm": 1.6437829732894897, + "learning_rate": 2.4964570271358524e-05, + "loss": 0.8691, + "step": 8986 + }, + { + "epoch": 0.4376110827064008, + "grad_norm": 3.765866279602051, + "learning_rate": 2.496151456874814e-05, + "loss": 0.8858, + "step": 8987 + }, + { + "epoch": 0.437659776495508, + "grad_norm": 1.2472037076950073, + "learning_rate": 2.495845874271963e-05, + "loss": 0.8425, + "step": 8988 + }, + { + "epoch": 0.4377084702846152, + "grad_norm": 1.4893403053283691, + "learning_rate": 2.495540279334901e-05, + "loss": 0.8652, + "step": 8989 + }, + { + "epoch": 0.4377571640737224, + "grad_norm": 1.8206888437271118, + "learning_rate": 2.4952346720712292e-05, + "loss": 0.8805, + "step": 8990 + }, + { + "epoch": 0.4378058578628296, + "grad_norm": 7.140981674194336, + "learning_rate": 2.4949290524885495e-05, + "loss": 0.8484, + "step": 8991 + }, + { + "epoch": 0.4378545516519368, + "grad_norm": 1.7792607545852661, + "learning_rate": 2.494623420594465e-05, + "loss": 0.801, + "step": 8992 + }, + { + "epoch": 0.437903245441044, + "grad_norm": 4.212263107299805, + "learning_rate": 2.494317776396577e-05, + "loss": 0.8159, + "step": 8993 + }, + { + "epoch": 0.4379519392301512, + "grad_norm": 4.699007511138916, + "learning_rate": 2.4940121199024895e-05, + "loss": 0.8577, + "step": 8994 + }, + { + "epoch": 0.4380006330192584, + "grad_norm": 1.2491453886032104, + "learning_rate": 2.493706451119806e-05, + "loss": 0.7892, + "step": 8995 + }, + { + "epoch": 0.4380493268083656, + "grad_norm": 1.2606420516967773, + "learning_rate": 2.493400770056129e-05, + "loss": 0.8647, + "step": 8996 + }, + { + "epoch": 0.43809802059747277, + "grad_norm": 1.732311725616455, + "learning_rate": 2.493095076719063e-05, + "loss": 0.7675, + "step": 8997 + }, + { + "epoch": 0.43814671438658, + "grad_norm": 2.0076727867126465, + "learning_rate": 2.4927893711162123e-05, + "loss": 0.8512, + "step": 8998 + }, + { + "epoch": 0.4381954081756872, + "grad_norm": 1.8312913179397583, + "learning_rate": 2.4924836532551807e-05, + "loss": 0.8846, + "step": 8999 + }, + { + "epoch": 0.4382441019647944, + "grad_norm": 2.6192691326141357, + "learning_rate": 2.4921779231435742e-05, + "loss": 0.8721, + "step": 9000 + }, + { + "epoch": 0.43829279575390157, + "grad_norm": 1.3903234004974365, + "learning_rate": 2.4918721807889955e-05, + "loss": 0.7522, + "step": 9001 + }, + { + "epoch": 0.4383414895430088, + "grad_norm": 1.7267351150512695, + "learning_rate": 2.4915664261990524e-05, + "loss": 0.7774, + "step": 9002 + }, + { + "epoch": 0.438390183332116, + "grad_norm": 1.3656160831451416, + "learning_rate": 2.4912606593813494e-05, + "loss": 0.8187, + "step": 9003 + }, + { + "epoch": 0.4384388771212232, + "grad_norm": 1.5849777460098267, + "learning_rate": 2.4909548803434925e-05, + "loss": 0.8592, + "step": 9004 + }, + { + "epoch": 0.43848757091033036, + "grad_norm": 1.760000228881836, + "learning_rate": 2.4906490890930886e-05, + "loss": 0.828, + "step": 9005 + }, + { + "epoch": 0.4385362646994376, + "grad_norm": 1.6919995546340942, + "learning_rate": 2.4903432856377435e-05, + "loss": 0.8274, + "step": 9006 + }, + { + "epoch": 0.4385849584885448, + "grad_norm": 1.6776440143585205, + "learning_rate": 2.4900374699850648e-05, + "loss": 0.7978, + "step": 9007 + }, + { + "epoch": 0.438633652277652, + "grad_norm": 1.1191989183425903, + "learning_rate": 2.4897316421426593e-05, + "loss": 0.8883, + "step": 9008 + }, + { + "epoch": 0.4386823460667592, + "grad_norm": 1.2674870491027832, + "learning_rate": 2.4894258021181345e-05, + "loss": 0.8049, + "step": 9009 + }, + { + "epoch": 0.43873103985586637, + "grad_norm": 1.439787745475769, + "learning_rate": 2.4891199499190984e-05, + "loss": 0.7796, + "step": 9010 + }, + { + "epoch": 0.4387797336449736, + "grad_norm": 1.5848947763442993, + "learning_rate": 2.4888140855531587e-05, + "loss": 0.8457, + "step": 9011 + }, + { + "epoch": 0.4388284274340808, + "grad_norm": 1.616752028465271, + "learning_rate": 2.488508209027924e-05, + "loss": 0.8282, + "step": 9012 + }, + { + "epoch": 0.438877121223188, + "grad_norm": 1.475186824798584, + "learning_rate": 2.4882023203510036e-05, + "loss": 0.8796, + "step": 9013 + }, + { + "epoch": 0.43892581501229516, + "grad_norm": 1.6201812028884888, + "learning_rate": 2.487896419530005e-05, + "loss": 0.747, + "step": 9014 + }, + { + "epoch": 0.4389745088014024, + "grad_norm": 1.6968767642974854, + "learning_rate": 2.4875905065725393e-05, + "loss": 0.8562, + "step": 9015 + }, + { + "epoch": 0.4390232025905096, + "grad_norm": 1.8669495582580566, + "learning_rate": 2.4872845814862148e-05, + "loss": 0.8406, + "step": 9016 + }, + { + "epoch": 0.4390718963796168, + "grad_norm": 1.469435453414917, + "learning_rate": 2.4869786442786418e-05, + "loss": 0.9533, + "step": 9017 + }, + { + "epoch": 0.43912059016872396, + "grad_norm": 1.8270070552825928, + "learning_rate": 2.4866726949574313e-05, + "loss": 0.8509, + "step": 9018 + }, + { + "epoch": 0.43916928395783117, + "grad_norm": 1.4108151197433472, + "learning_rate": 2.4863667335301926e-05, + "loss": 0.884, + "step": 9019 + }, + { + "epoch": 0.4392179777469384, + "grad_norm": 1.5746203660964966, + "learning_rate": 2.4860607600045373e-05, + "loss": 0.8164, + "step": 9020 + }, + { + "epoch": 0.4392666715360456, + "grad_norm": 3.5298006534576416, + "learning_rate": 2.4857547743880766e-05, + "loss": 0.7763, + "step": 9021 + }, + { + "epoch": 0.43931536532515275, + "grad_norm": 1.4140030145645142, + "learning_rate": 2.4854487766884208e-05, + "loss": 0.8089, + "step": 9022 + }, + { + "epoch": 0.43936405911425996, + "grad_norm": 1.523736834526062, + "learning_rate": 2.485142766913183e-05, + "loss": 0.858, + "step": 9023 + }, + { + "epoch": 0.4394127529033672, + "grad_norm": 1.3199807405471802, + "learning_rate": 2.4848367450699744e-05, + "loss": 0.9372, + "step": 9024 + }, + { + "epoch": 0.4394614466924744, + "grad_norm": 1.9553011655807495, + "learning_rate": 2.4845307111664076e-05, + "loss": 0.8731, + "step": 9025 + }, + { + "epoch": 0.4395101404815816, + "grad_norm": 0.09144949913024902, + "learning_rate": 2.4842246652100955e-05, + "loss": 0.6721, + "step": 9026 + }, + { + "epoch": 0.43955883427068876, + "grad_norm": 1.2740076780319214, + "learning_rate": 2.4839186072086505e-05, + "loss": 0.8265, + "step": 9027 + }, + { + "epoch": 0.43960752805979597, + "grad_norm": 1.2713313102722168, + "learning_rate": 2.4836125371696857e-05, + "loss": 0.7486, + "step": 9028 + }, + { + "epoch": 0.4396562218489032, + "grad_norm": 1.5123826265335083, + "learning_rate": 2.4833064551008152e-05, + "loss": 0.7739, + "step": 9029 + }, + { + "epoch": 0.4397049156380104, + "grad_norm": 2.3421473503112793, + "learning_rate": 2.4830003610096523e-05, + "loss": 0.8174, + "step": 9030 + }, + { + "epoch": 0.43975360942711755, + "grad_norm": 1.5702356100082397, + "learning_rate": 2.482694254903812e-05, + "loss": 0.7922, + "step": 9031 + }, + { + "epoch": 0.43980230321622477, + "grad_norm": 2.5444700717926025, + "learning_rate": 2.482388136790908e-05, + "loss": 0.8389, + "step": 9032 + }, + { + "epoch": 0.439850997005332, + "grad_norm": 1.6016026735305786, + "learning_rate": 2.4820820066785542e-05, + "loss": 0.8593, + "step": 9033 + }, + { + "epoch": 0.4398996907944392, + "grad_norm": 1.3817603588104248, + "learning_rate": 2.481775864574367e-05, + "loss": 0.7409, + "step": 9034 + }, + { + "epoch": 0.43994838458354635, + "grad_norm": 0.09849559515714645, + "learning_rate": 2.481469710485962e-05, + "loss": 0.5962, + "step": 9035 + }, + { + "epoch": 0.43999707837265356, + "grad_norm": 1.1941465139389038, + "learning_rate": 2.4811635444209533e-05, + "loss": 0.8079, + "step": 9036 + }, + { + "epoch": 0.4400457721617608, + "grad_norm": 0.10008540004491806, + "learning_rate": 2.4808573663869578e-05, + "loss": 0.6716, + "step": 9037 + }, + { + "epoch": 0.440094465950868, + "grad_norm": 1.551591396331787, + "learning_rate": 2.480551176391591e-05, + "loss": 0.8324, + "step": 9038 + }, + { + "epoch": 0.44014315973997514, + "grad_norm": 2.19181227684021, + "learning_rate": 2.4802449744424706e-05, + "loss": 0.9629, + "step": 9039 + }, + { + "epoch": 0.44019185352908236, + "grad_norm": 1.4296915531158447, + "learning_rate": 2.4799387605472123e-05, + "loss": 0.7801, + "step": 9040 + }, + { + "epoch": 0.44024054731818957, + "grad_norm": 2.7752771377563477, + "learning_rate": 2.479632534713434e-05, + "loss": 0.7926, + "step": 9041 + }, + { + "epoch": 0.4402892411072968, + "grad_norm": 1.2704956531524658, + "learning_rate": 2.4793262969487523e-05, + "loss": 0.774, + "step": 9042 + }, + { + "epoch": 0.44033793489640394, + "grad_norm": 2.329562187194824, + "learning_rate": 2.479020047260785e-05, + "loss": 0.9098, + "step": 9043 + }, + { + "epoch": 0.44038662868551115, + "grad_norm": 1.2819061279296875, + "learning_rate": 2.4787137856571507e-05, + "loss": 0.8695, + "step": 9044 + }, + { + "epoch": 0.44043532247461836, + "grad_norm": 0.0980086624622345, + "learning_rate": 2.478407512145467e-05, + "loss": 0.6267, + "step": 9045 + }, + { + "epoch": 0.4404840162637256, + "grad_norm": 0.08944389224052429, + "learning_rate": 2.4781012267333527e-05, + "loss": 0.6161, + "step": 9046 + }, + { + "epoch": 0.4405327100528328, + "grad_norm": 2.37817645072937, + "learning_rate": 2.4777949294284277e-05, + "loss": 0.7713, + "step": 9047 + }, + { + "epoch": 0.44058140384193994, + "grad_norm": 1.5396784543991089, + "learning_rate": 2.4774886202383094e-05, + "loss": 0.7757, + "step": 9048 + }, + { + "epoch": 0.44063009763104716, + "grad_norm": 1.2896872758865356, + "learning_rate": 2.4771822991706187e-05, + "loss": 0.8862, + "step": 9049 + }, + { + "epoch": 0.44067879142015437, + "grad_norm": 1.4377695322036743, + "learning_rate": 2.476875966232975e-05, + "loss": 0.8111, + "step": 9050 + }, + { + "epoch": 0.4407274852092616, + "grad_norm": 1.4533452987670898, + "learning_rate": 2.4765696214329973e-05, + "loss": 0.7862, + "step": 9051 + }, + { + "epoch": 0.44077617899836874, + "grad_norm": 1.8535126447677612, + "learning_rate": 2.476263264778307e-05, + "loss": 0.8882, + "step": 9052 + }, + { + "epoch": 0.44082487278747595, + "grad_norm": 2.2626242637634277, + "learning_rate": 2.4759568962765243e-05, + "loss": 0.8918, + "step": 9053 + }, + { + "epoch": 0.44087356657658316, + "grad_norm": 1.87518310546875, + "learning_rate": 2.4756505159352715e-05, + "loss": 0.932, + "step": 9054 + }, + { + "epoch": 0.4409222603656904, + "grad_norm": 0.08949357271194458, + "learning_rate": 2.475344123762168e-05, + "loss": 0.6359, + "step": 9055 + }, + { + "epoch": 0.44097095415479753, + "grad_norm": 1.3193119764328003, + "learning_rate": 2.4750377197648362e-05, + "loss": 0.907, + "step": 9056 + }, + { + "epoch": 0.44101964794390475, + "grad_norm": 1.7864526510238647, + "learning_rate": 2.4747313039508983e-05, + "loss": 0.847, + "step": 9057 + }, + { + "epoch": 0.44106834173301196, + "grad_norm": 0.09147164970636368, + "learning_rate": 2.4744248763279753e-05, + "loss": 0.6412, + "step": 9058 + }, + { + "epoch": 0.44111703552211917, + "grad_norm": 1.4459267854690552, + "learning_rate": 2.4741184369036903e-05, + "loss": 0.8992, + "step": 9059 + }, + { + "epoch": 0.44116572931122633, + "grad_norm": 1.3497166633605957, + "learning_rate": 2.473811985685666e-05, + "loss": 0.7985, + "step": 9060 + }, + { + "epoch": 0.44121442310033354, + "grad_norm": 0.0974099412560463, + "learning_rate": 2.4735055226815257e-05, + "loss": 0.7033, + "step": 9061 + }, + { + "epoch": 0.44126311688944075, + "grad_norm": 1.346668004989624, + "learning_rate": 2.4731990478988918e-05, + "loss": 0.8428, + "step": 9062 + }, + { + "epoch": 0.44131181067854797, + "grad_norm": 1.6912873983383179, + "learning_rate": 2.4728925613453886e-05, + "loss": 0.8625, + "step": 9063 + }, + { + "epoch": 0.4413605044676551, + "grad_norm": 3.3929855823516846, + "learning_rate": 2.4725860630286402e-05, + "loss": 0.848, + "step": 9064 + }, + { + "epoch": 0.44140919825676234, + "grad_norm": 1.271359920501709, + "learning_rate": 2.4722795529562698e-05, + "loss": 0.8183, + "step": 9065 + }, + { + "epoch": 0.44145789204586955, + "grad_norm": 1.6019283533096313, + "learning_rate": 2.471973031135903e-05, + "loss": 0.8425, + "step": 9066 + }, + { + "epoch": 0.44150658583497676, + "grad_norm": 1.8242270946502686, + "learning_rate": 2.4716664975751633e-05, + "loss": 0.8335, + "step": 9067 + }, + { + "epoch": 0.441555279624084, + "grad_norm": 3.410320520401001, + "learning_rate": 2.4713599522816772e-05, + "loss": 0.8113, + "step": 9068 + }, + { + "epoch": 0.44160397341319113, + "grad_norm": 1.916599154472351, + "learning_rate": 2.4710533952630692e-05, + "loss": 0.8944, + "step": 9069 + }, + { + "epoch": 0.44165266720229834, + "grad_norm": 1.4695338010787964, + "learning_rate": 2.470746826526965e-05, + "loss": 0.929, + "step": 9070 + }, + { + "epoch": 0.44170136099140556, + "grad_norm": 1.362306833267212, + "learning_rate": 2.470440246080991e-05, + "loss": 0.8606, + "step": 9071 + }, + { + "epoch": 0.44175005478051277, + "grad_norm": 1.7981330156326294, + "learning_rate": 2.4701336539327723e-05, + "loss": 0.8414, + "step": 9072 + }, + { + "epoch": 0.4417987485696199, + "grad_norm": 1.4134128093719482, + "learning_rate": 2.469827050089936e-05, + "loss": 0.8506, + "step": 9073 + }, + { + "epoch": 0.44184744235872714, + "grad_norm": 1.7461187839508057, + "learning_rate": 2.4695204345601085e-05, + "loss": 0.8909, + "step": 9074 + }, + { + "epoch": 0.44189613614783435, + "grad_norm": 1.596090316772461, + "learning_rate": 2.4692138073509186e-05, + "loss": 0.9054, + "step": 9075 + }, + { + "epoch": 0.44194482993694156, + "grad_norm": 1.658018946647644, + "learning_rate": 2.468907168469992e-05, + "loss": 0.9017, + "step": 9076 + }, + { + "epoch": 0.4419935237260487, + "grad_norm": 1.1424392461776733, + "learning_rate": 2.4686005179249567e-05, + "loss": 0.8447, + "step": 9077 + }, + { + "epoch": 0.44204221751515593, + "grad_norm": 2.45904278755188, + "learning_rate": 2.4682938557234412e-05, + "loss": 0.7864, + "step": 9078 + }, + { + "epoch": 0.44209091130426315, + "grad_norm": 1.6011890172958374, + "learning_rate": 2.4679871818730732e-05, + "loss": 0.8129, + "step": 9079 + }, + { + "epoch": 0.44213960509337036, + "grad_norm": 1.5539844036102295, + "learning_rate": 2.4676804963814814e-05, + "loss": 0.918, + "step": 9080 + }, + { + "epoch": 0.4421882988824775, + "grad_norm": 1.4208110570907593, + "learning_rate": 2.467373799256294e-05, + "loss": 0.936, + "step": 9081 + }, + { + "epoch": 0.4422369926715847, + "grad_norm": 1.321543574333191, + "learning_rate": 2.4670670905051416e-05, + "loss": 0.8489, + "step": 9082 + }, + { + "epoch": 0.44228568646069194, + "grad_norm": 2.0933585166931152, + "learning_rate": 2.466760370135652e-05, + "loss": 0.7995, + "step": 9083 + }, + { + "epoch": 0.44233438024979915, + "grad_norm": 1.3829224109649658, + "learning_rate": 2.466453638155456e-05, + "loss": 0.8257, + "step": 9084 + }, + { + "epoch": 0.4423830740389063, + "grad_norm": 2.111374616622925, + "learning_rate": 2.4661468945721833e-05, + "loss": 0.7146, + "step": 9085 + }, + { + "epoch": 0.4424317678280135, + "grad_norm": 1.1737682819366455, + "learning_rate": 2.465840139393464e-05, + "loss": 0.8222, + "step": 9086 + }, + { + "epoch": 0.44248046161712073, + "grad_norm": 1.7061233520507812, + "learning_rate": 2.465533372626929e-05, + "loss": 0.819, + "step": 9087 + }, + { + "epoch": 0.44252915540622795, + "grad_norm": 5.4543962478637695, + "learning_rate": 2.4652265942802082e-05, + "loss": 0.86, + "step": 9088 + }, + { + "epoch": 0.44257784919533516, + "grad_norm": 46.74634552001953, + "learning_rate": 2.464919804360934e-05, + "loss": 0.7994, + "step": 9089 + }, + { + "epoch": 0.4426265429844423, + "grad_norm": 1.4874110221862793, + "learning_rate": 2.4646130028767373e-05, + "loss": 0.8621, + "step": 9090 + }, + { + "epoch": 0.44267523677354953, + "grad_norm": 1.2870755195617676, + "learning_rate": 2.4643061898352496e-05, + "loss": 0.8053, + "step": 9091 + }, + { + "epoch": 0.44272393056265674, + "grad_norm": 1.613077163696289, + "learning_rate": 2.4639993652441035e-05, + "loss": 0.8078, + "step": 9092 + }, + { + "epoch": 0.44277262435176395, + "grad_norm": 1.5460059642791748, + "learning_rate": 2.46369252911093e-05, + "loss": 0.7723, + "step": 9093 + }, + { + "epoch": 0.4428213181408711, + "grad_norm": 2.373013734817505, + "learning_rate": 2.4633856814433635e-05, + "loss": 0.803, + "step": 9094 + }, + { + "epoch": 0.4428700119299783, + "grad_norm": 1.8312501907348633, + "learning_rate": 2.4630788222490356e-05, + "loss": 0.8256, + "step": 9095 + }, + { + "epoch": 0.44291870571908554, + "grad_norm": 2.4609322547912598, + "learning_rate": 2.462771951535579e-05, + "loss": 0.8269, + "step": 9096 + }, + { + "epoch": 0.44296739950819275, + "grad_norm": 1.7649552822113037, + "learning_rate": 2.462465069310629e-05, + "loss": 0.8618, + "step": 9097 + }, + { + "epoch": 0.4430160932972999, + "grad_norm": 1.7214115858078003, + "learning_rate": 2.4621581755818178e-05, + "loss": 0.8174, + "step": 9098 + }, + { + "epoch": 0.4430647870864071, + "grad_norm": 2.3418614864349365, + "learning_rate": 2.46185127035678e-05, + "loss": 0.8084, + "step": 9099 + }, + { + "epoch": 0.44311348087551433, + "grad_norm": 1.56923508644104, + "learning_rate": 2.4615443536431497e-05, + "loss": 0.8342, + "step": 9100 + }, + { + "epoch": 0.44316217466462154, + "grad_norm": 1.5484017133712769, + "learning_rate": 2.4612374254485615e-05, + "loss": 0.687, + "step": 9101 + }, + { + "epoch": 0.4432108684537287, + "grad_norm": 0.09320226311683655, + "learning_rate": 2.46093048578065e-05, + "loss": 0.5819, + "step": 9102 + }, + { + "epoch": 0.4432595622428359, + "grad_norm": 1.577781081199646, + "learning_rate": 2.4606235346470513e-05, + "loss": 0.8328, + "step": 9103 + }, + { + "epoch": 0.4433082560319431, + "grad_norm": 1.5665405988693237, + "learning_rate": 2.460316572055399e-05, + "loss": 0.7941, + "step": 9104 + }, + { + "epoch": 0.44335694982105034, + "grad_norm": 2.4246068000793457, + "learning_rate": 2.460009598013331e-05, + "loss": 0.8593, + "step": 9105 + }, + { + "epoch": 0.44340564361015755, + "grad_norm": 1.8897265195846558, + "learning_rate": 2.4597026125284826e-05, + "loss": 0.9553, + "step": 9106 + }, + { + "epoch": 0.4434543373992647, + "grad_norm": 4.601324081420898, + "learning_rate": 2.459395615608489e-05, + "loss": 0.8067, + "step": 9107 + }, + { + "epoch": 0.4435030311883719, + "grad_norm": 1.5840392112731934, + "learning_rate": 2.4590886072609877e-05, + "loss": 0.838, + "step": 9108 + }, + { + "epoch": 0.44355172497747913, + "grad_norm": 1.492888331413269, + "learning_rate": 2.4587815874936157e-05, + "loss": 0.857, + "step": 9109 + }, + { + "epoch": 0.44360041876658635, + "grad_norm": 1.7467374801635742, + "learning_rate": 2.4584745563140096e-05, + "loss": 0.8774, + "step": 9110 + }, + { + "epoch": 0.4436491125556935, + "grad_norm": 2.5437519550323486, + "learning_rate": 2.4581675137298075e-05, + "loss": 0.846, + "step": 9111 + }, + { + "epoch": 0.4436978063448007, + "grad_norm": 0.09068446606397629, + "learning_rate": 2.4578604597486466e-05, + "loss": 0.629, + "step": 9112 + }, + { + "epoch": 0.44374650013390793, + "grad_norm": 0.09186334162950516, + "learning_rate": 2.4575533943781646e-05, + "loss": 0.6076, + "step": 9113 + }, + { + "epoch": 0.44379519392301514, + "grad_norm": 1.7267299890518188, + "learning_rate": 2.4572463176260005e-05, + "loss": 0.8078, + "step": 9114 + }, + { + "epoch": 0.4438438877121223, + "grad_norm": 1.4876281023025513, + "learning_rate": 2.4569392294997923e-05, + "loss": 0.8369, + "step": 9115 + }, + { + "epoch": 0.4438925815012295, + "grad_norm": 1.717596411705017, + "learning_rate": 2.45663213000718e-05, + "loss": 0.854, + "step": 9116 + }, + { + "epoch": 0.4439412752903367, + "grad_norm": 0.09149747341871262, + "learning_rate": 2.4563250191558008e-05, + "loss": 0.5713, + "step": 9117 + }, + { + "epoch": 0.44398996907944394, + "grad_norm": 1.622650384902954, + "learning_rate": 2.4560178969532955e-05, + "loss": 0.8438, + "step": 9118 + }, + { + "epoch": 0.4440386628685511, + "grad_norm": 1.807005763053894, + "learning_rate": 2.4557107634073037e-05, + "loss": 0.8951, + "step": 9119 + }, + { + "epoch": 0.4440873566576583, + "grad_norm": 1.483452558517456, + "learning_rate": 2.4554036185254648e-05, + "loss": 0.7916, + "step": 9120 + }, + { + "epoch": 0.4441360504467655, + "grad_norm": 1.4496629238128662, + "learning_rate": 2.4550964623154195e-05, + "loss": 0.8662, + "step": 9121 + }, + { + "epoch": 0.44418474423587273, + "grad_norm": 1.431618332862854, + "learning_rate": 2.4547892947848083e-05, + "loss": 0.8594, + "step": 9122 + }, + { + "epoch": 0.4442334380249799, + "grad_norm": 1.8517823219299316, + "learning_rate": 2.4544821159412715e-05, + "loss": 0.8523, + "step": 9123 + }, + { + "epoch": 0.4442821318140871, + "grad_norm": 0.08906524628400803, + "learning_rate": 2.454174925792451e-05, + "loss": 0.5745, + "step": 9124 + }, + { + "epoch": 0.4443308256031943, + "grad_norm": 1.4948725700378418, + "learning_rate": 2.4538677243459878e-05, + "loss": 0.9204, + "step": 9125 + }, + { + "epoch": 0.4443795193923015, + "grad_norm": 1.3144783973693848, + "learning_rate": 2.4535605116095233e-05, + "loss": 0.9014, + "step": 9126 + }, + { + "epoch": 0.44442821318140874, + "grad_norm": 2.548079490661621, + "learning_rate": 2.4532532875907e-05, + "loss": 0.8174, + "step": 9127 + }, + { + "epoch": 0.4444769069705159, + "grad_norm": 1.6005425453186035, + "learning_rate": 2.45294605229716e-05, + "loss": 0.8734, + "step": 9128 + }, + { + "epoch": 0.4445256007596231, + "grad_norm": 1.7531529664993286, + "learning_rate": 2.4526388057365455e-05, + "loss": 0.7555, + "step": 9129 + }, + { + "epoch": 0.4445742945487303, + "grad_norm": 1.466461181640625, + "learning_rate": 2.4523315479164994e-05, + "loss": 0.8322, + "step": 9130 + }, + { + "epoch": 0.44462298833783753, + "grad_norm": 6.327359676361084, + "learning_rate": 2.4520242788446647e-05, + "loss": 0.9352, + "step": 9131 + }, + { + "epoch": 0.4446716821269447, + "grad_norm": 2.135821580886841, + "learning_rate": 2.4517169985286846e-05, + "loss": 0.8501, + "step": 9132 + }, + { + "epoch": 0.4447203759160519, + "grad_norm": 0.09154050052165985, + "learning_rate": 2.451409706976203e-05, + "loss": 0.6302, + "step": 9133 + }, + { + "epoch": 0.4447690697051591, + "grad_norm": 1.944012999534607, + "learning_rate": 2.451102404194864e-05, + "loss": 0.8281, + "step": 9134 + }, + { + "epoch": 0.4448177634942663, + "grad_norm": 2.0846469402313232, + "learning_rate": 2.4507950901923118e-05, + "loss": 0.8833, + "step": 9135 + }, + { + "epoch": 0.4448664572833735, + "grad_norm": 1.7936536073684692, + "learning_rate": 2.4504877649761908e-05, + "loss": 0.801, + "step": 9136 + }, + { + "epoch": 0.4449151510724807, + "grad_norm": 1.9538031816482544, + "learning_rate": 2.450180428554145e-05, + "loss": 0.8842, + "step": 9137 + }, + { + "epoch": 0.4449638448615879, + "grad_norm": 1.8752574920654297, + "learning_rate": 2.4498730809338204e-05, + "loss": 0.7868, + "step": 9138 + }, + { + "epoch": 0.4450125386506951, + "grad_norm": 3.2695508003234863, + "learning_rate": 2.4495657221228616e-05, + "loss": 0.7141, + "step": 9139 + }, + { + "epoch": 0.4450612324398023, + "grad_norm": 1.4396483898162842, + "learning_rate": 2.4492583521289145e-05, + "loss": 0.8354, + "step": 9140 + }, + { + "epoch": 0.4451099262289095, + "grad_norm": 1.6623549461364746, + "learning_rate": 2.4489509709596252e-05, + "loss": 0.8645, + "step": 9141 + }, + { + "epoch": 0.4451586200180167, + "grad_norm": 1.989485502243042, + "learning_rate": 2.448643578622639e-05, + "loss": 0.7772, + "step": 9142 + }, + { + "epoch": 0.4452073138071239, + "grad_norm": 2.041114091873169, + "learning_rate": 2.4483361751256035e-05, + "loss": 0.8166, + "step": 9143 + }, + { + "epoch": 0.4452560075962311, + "grad_norm": 1.5210444927215576, + "learning_rate": 2.4480287604761647e-05, + "loss": 0.8618, + "step": 9144 + }, + { + "epoch": 0.4453047013853383, + "grad_norm": 2.1520111560821533, + "learning_rate": 2.4477213346819696e-05, + "loss": 0.7961, + "step": 9145 + }, + { + "epoch": 0.4453533951744455, + "grad_norm": 2.0735936164855957, + "learning_rate": 2.447413897750665e-05, + "loss": 0.848, + "step": 9146 + }, + { + "epoch": 0.4454020889635527, + "grad_norm": 1.7839066982269287, + "learning_rate": 2.4471064496898995e-05, + "loss": 0.8231, + "step": 9147 + }, + { + "epoch": 0.4454507827526599, + "grad_norm": 1.5678898096084595, + "learning_rate": 2.44679899050732e-05, + "loss": 0.8226, + "step": 9148 + }, + { + "epoch": 0.4454994765417671, + "grad_norm": 1.6617554426193237, + "learning_rate": 2.446491520210575e-05, + "loss": 0.8485, + "step": 9149 + }, + { + "epoch": 0.4455481703308743, + "grad_norm": 1.846137523651123, + "learning_rate": 2.4461840388073128e-05, + "loss": 0.8531, + "step": 9150 + }, + { + "epoch": 0.4455968641199815, + "grad_norm": 2.8947856426239014, + "learning_rate": 2.445876546305182e-05, + "loss": 0.8687, + "step": 9151 + }, + { + "epoch": 0.4456455579090887, + "grad_norm": 2.867511749267578, + "learning_rate": 2.4455690427118305e-05, + "loss": 0.7872, + "step": 9152 + }, + { + "epoch": 0.4456942516981959, + "grad_norm": 1.4620996713638306, + "learning_rate": 2.4452615280349096e-05, + "loss": 0.7948, + "step": 9153 + }, + { + "epoch": 0.4457429454873031, + "grad_norm": 1.353134036064148, + "learning_rate": 2.4449540022820666e-05, + "loss": 0.8112, + "step": 9154 + }, + { + "epoch": 0.4457916392764103, + "grad_norm": 1.2769508361816406, + "learning_rate": 2.444646465460953e-05, + "loss": 0.8513, + "step": 9155 + }, + { + "epoch": 0.4458403330655175, + "grad_norm": 1.5842006206512451, + "learning_rate": 2.4443389175792178e-05, + "loss": 0.7997, + "step": 9156 + }, + { + "epoch": 0.44588902685462467, + "grad_norm": 1.5884424448013306, + "learning_rate": 2.444031358644512e-05, + "loss": 0.9827, + "step": 9157 + }, + { + "epoch": 0.4459377206437319, + "grad_norm": 2.560382604598999, + "learning_rate": 2.4437237886644855e-05, + "loss": 0.8193, + "step": 9158 + }, + { + "epoch": 0.4459864144328391, + "grad_norm": 0.09049457311630249, + "learning_rate": 2.4434162076467888e-05, + "loss": 0.6492, + "step": 9159 + }, + { + "epoch": 0.4460351082219463, + "grad_norm": 3.0994930267333984, + "learning_rate": 2.4431086155990732e-05, + "loss": 0.8287, + "step": 9160 + }, + { + "epoch": 0.44608380201105347, + "grad_norm": 1.2950663566589355, + "learning_rate": 2.4428010125289913e-05, + "loss": 0.8738, + "step": 9161 + }, + { + "epoch": 0.4461324958001607, + "grad_norm": 3.297830104827881, + "learning_rate": 2.4424933984441926e-05, + "loss": 0.8166, + "step": 9162 + }, + { + "epoch": 0.4461811895892679, + "grad_norm": 1.5669482946395874, + "learning_rate": 2.4421857733523315e-05, + "loss": 0.8017, + "step": 9163 + }, + { + "epoch": 0.4462298833783751, + "grad_norm": 3.6504580974578857, + "learning_rate": 2.441878137261059e-05, + "loss": 0.8408, + "step": 9164 + }, + { + "epoch": 0.44627857716748226, + "grad_norm": 1.5713317394256592, + "learning_rate": 2.441570490178027e-05, + "loss": 0.7937, + "step": 9165 + }, + { + "epoch": 0.4463272709565895, + "grad_norm": 1.8161894083023071, + "learning_rate": 2.441262832110889e-05, + "loss": 0.8433, + "step": 9166 + }, + { + "epoch": 0.4463759647456967, + "grad_norm": 1.4101954698562622, + "learning_rate": 2.4409551630672977e-05, + "loss": 0.8247, + "step": 9167 + }, + { + "epoch": 0.4464246585348039, + "grad_norm": 1.318488359451294, + "learning_rate": 2.4406474830549062e-05, + "loss": 0.864, + "step": 9168 + }, + { + "epoch": 0.4464733523239111, + "grad_norm": 1.4406682252883911, + "learning_rate": 2.440339792081369e-05, + "loss": 0.784, + "step": 9169 + }, + { + "epoch": 0.44652204611301827, + "grad_norm": 1.6723933219909668, + "learning_rate": 2.4400320901543392e-05, + "loss": 0.8468, + "step": 9170 + }, + { + "epoch": 0.4465707399021255, + "grad_norm": 1.5128023624420166, + "learning_rate": 2.4397243772814713e-05, + "loss": 0.8262, + "step": 9171 + }, + { + "epoch": 0.4466194336912327, + "grad_norm": 1.8045099973678589, + "learning_rate": 2.4394166534704186e-05, + "loss": 0.8293, + "step": 9172 + }, + { + "epoch": 0.4466681274803399, + "grad_norm": 1.5904179811477661, + "learning_rate": 2.4391089187288367e-05, + "loss": 0.8547, + "step": 9173 + }, + { + "epoch": 0.44671682126944706, + "grad_norm": 1.8237533569335938, + "learning_rate": 2.438801173064381e-05, + "loss": 0.7879, + "step": 9174 + }, + { + "epoch": 0.4467655150585543, + "grad_norm": 4.225331783294678, + "learning_rate": 2.438493416484706e-05, + "loss": 0.8459, + "step": 9175 + }, + { + "epoch": 0.4468142088476615, + "grad_norm": 1.95465886592865, + "learning_rate": 2.438185648997467e-05, + "loss": 0.7615, + "step": 9176 + }, + { + "epoch": 0.4468629026367687, + "grad_norm": 1.7163259983062744, + "learning_rate": 2.43787787061032e-05, + "loss": 0.9571, + "step": 9177 + }, + { + "epoch": 0.44691159642587586, + "grad_norm": 1.4853460788726807, + "learning_rate": 2.4375700813309212e-05, + "loss": 0.9024, + "step": 9178 + }, + { + "epoch": 0.44696029021498307, + "grad_norm": 1.2353078126907349, + "learning_rate": 2.437262281166927e-05, + "loss": 0.8564, + "step": 9179 + }, + { + "epoch": 0.4470089840040903, + "grad_norm": 1.492863655090332, + "learning_rate": 2.4369544701259934e-05, + "loss": 0.8553, + "step": 9180 + }, + { + "epoch": 0.4470576777931975, + "grad_norm": 1.3039379119873047, + "learning_rate": 2.436646648215778e-05, + "loss": 0.8061, + "step": 9181 + }, + { + "epoch": 0.44710637158230465, + "grad_norm": 1.8610881567001343, + "learning_rate": 2.4363388154439363e-05, + "loss": 0.8786, + "step": 9182 + }, + { + "epoch": 0.44715506537141186, + "grad_norm": 3.165781021118164, + "learning_rate": 2.436030971818127e-05, + "loss": 0.8636, + "step": 9183 + }, + { + "epoch": 0.4472037591605191, + "grad_norm": 1.3667595386505127, + "learning_rate": 2.4357231173460082e-05, + "loss": 0.8065, + "step": 9184 + }, + { + "epoch": 0.4472524529496263, + "grad_norm": 1.9147906303405762, + "learning_rate": 2.4354152520352367e-05, + "loss": 0.9468, + "step": 9185 + }, + { + "epoch": 0.4473011467387335, + "grad_norm": 2.0956876277923584, + "learning_rate": 2.4351073758934715e-05, + "loss": 0.8498, + "step": 9186 + }, + { + "epoch": 0.44734984052784066, + "grad_norm": 2.5286900997161865, + "learning_rate": 2.4347994889283704e-05, + "loss": 0.8032, + "step": 9187 + }, + { + "epoch": 0.44739853431694787, + "grad_norm": 1.350241780281067, + "learning_rate": 2.4344915911475928e-05, + "loss": 0.7723, + "step": 9188 + }, + { + "epoch": 0.4474472281060551, + "grad_norm": 1.5669989585876465, + "learning_rate": 2.4341836825587967e-05, + "loss": 0.8369, + "step": 9189 + }, + { + "epoch": 0.4474959218951623, + "grad_norm": 1.2968326807022095, + "learning_rate": 2.433875763169642e-05, + "loss": 0.8068, + "step": 9190 + }, + { + "epoch": 0.44754461568426945, + "grad_norm": 1.7716009616851807, + "learning_rate": 2.4335678329877877e-05, + "loss": 0.8462, + "step": 9191 + }, + { + "epoch": 0.44759330947337667, + "grad_norm": 1.4710172414779663, + "learning_rate": 2.4332598920208943e-05, + "loss": 0.8365, + "step": 9192 + }, + { + "epoch": 0.4476420032624839, + "grad_norm": 1.3898262977600098, + "learning_rate": 2.4329519402766223e-05, + "loss": 0.8702, + "step": 9193 + }, + { + "epoch": 0.4476906970515911, + "grad_norm": 1.5578725337982178, + "learning_rate": 2.4326439777626307e-05, + "loss": 0.8129, + "step": 9194 + }, + { + "epoch": 0.44773939084069825, + "grad_norm": 1.3463332653045654, + "learning_rate": 2.432336004486581e-05, + "loss": 0.862, + "step": 9195 + }, + { + "epoch": 0.44778808462980546, + "grad_norm": 1.3893884420394897, + "learning_rate": 2.4320280204561337e-05, + "loss": 0.7927, + "step": 9196 + }, + { + "epoch": 0.4478367784189127, + "grad_norm": 1.384905457496643, + "learning_rate": 2.431720025678949e-05, + "loss": 0.8488, + "step": 9197 + }, + { + "epoch": 0.4478854722080199, + "grad_norm": 1.8960055112838745, + "learning_rate": 2.4314120201626906e-05, + "loss": 0.8939, + "step": 9198 + }, + { + "epoch": 0.44793416599712704, + "grad_norm": 1.4773749113082886, + "learning_rate": 2.4311040039150186e-05, + "loss": 0.9087, + "step": 9199 + }, + { + "epoch": 0.44798285978623426, + "grad_norm": 1.8415637016296387, + "learning_rate": 2.4307959769435954e-05, + "loss": 0.8991, + "step": 9200 + }, + { + "epoch": 0.44803155357534147, + "grad_norm": 1.308058500289917, + "learning_rate": 2.430487939256083e-05, + "loss": 0.8428, + "step": 9201 + }, + { + "epoch": 0.4480802473644487, + "grad_norm": 1.6182366609573364, + "learning_rate": 2.4301798908601427e-05, + "loss": 0.8096, + "step": 9202 + }, + { + "epoch": 0.44812894115355584, + "grad_norm": 1.3797025680541992, + "learning_rate": 2.429871831763439e-05, + "loss": 0.8824, + "step": 9203 + }, + { + "epoch": 0.44817763494266305, + "grad_norm": 1.6338286399841309, + "learning_rate": 2.429563761973635e-05, + "loss": 0.9382, + "step": 9204 + }, + { + "epoch": 0.44822632873177026, + "grad_norm": 1.3127055168151855, + "learning_rate": 2.4292556814983926e-05, + "loss": 0.8937, + "step": 9205 + }, + { + "epoch": 0.4482750225208775, + "grad_norm": 1.8682646751403809, + "learning_rate": 2.4289475903453758e-05, + "loss": 0.8678, + "step": 9206 + }, + { + "epoch": 0.4483237163099847, + "grad_norm": 0.0935816541314125, + "learning_rate": 2.428639488522249e-05, + "loss": 0.5979, + "step": 9207 + }, + { + "epoch": 0.44837241009909184, + "grad_norm": 1.3814295530319214, + "learning_rate": 2.428331376036676e-05, + "loss": 0.7776, + "step": 9208 + }, + { + "epoch": 0.44842110388819906, + "grad_norm": 0.09868499636650085, + "learning_rate": 2.4280232528963205e-05, + "loss": 0.6277, + "step": 9209 + }, + { + "epoch": 0.44846979767730627, + "grad_norm": 1.843115210533142, + "learning_rate": 2.4277151191088483e-05, + "loss": 0.9008, + "step": 9210 + }, + { + "epoch": 0.4485184914664135, + "grad_norm": 1.9571104049682617, + "learning_rate": 2.4274069746819223e-05, + "loss": 0.8389, + "step": 9211 + }, + { + "epoch": 0.44856718525552064, + "grad_norm": 2.1626710891723633, + "learning_rate": 2.4270988196232095e-05, + "loss": 0.7907, + "step": 9212 + }, + { + "epoch": 0.44861587904462785, + "grad_norm": 1.3804727792739868, + "learning_rate": 2.4267906539403745e-05, + "loss": 0.813, + "step": 9213 + }, + { + "epoch": 0.44866457283373506, + "grad_norm": 5.444220066070557, + "learning_rate": 2.4264824776410834e-05, + "loss": 0.8667, + "step": 9214 + }, + { + "epoch": 0.4487132666228423, + "grad_norm": 2.256767749786377, + "learning_rate": 2.4261742907330018e-05, + "loss": 0.8392, + "step": 9215 + }, + { + "epoch": 0.44876196041194943, + "grad_norm": 1.3571834564208984, + "learning_rate": 2.4258660932237956e-05, + "loss": 0.8864, + "step": 9216 + }, + { + "epoch": 0.44881065420105665, + "grad_norm": 1.5046601295471191, + "learning_rate": 2.4255578851211315e-05, + "loss": 0.7777, + "step": 9217 + }, + { + "epoch": 0.44885934799016386, + "grad_norm": 1.9074585437774658, + "learning_rate": 2.4252496664326763e-05, + "loss": 0.9582, + "step": 9218 + }, + { + "epoch": 0.44890804177927107, + "grad_norm": 1.4133628606796265, + "learning_rate": 2.4249414371660968e-05, + "loss": 0.7346, + "step": 9219 + }, + { + "epoch": 0.44895673556837823, + "grad_norm": 5.594919204711914, + "learning_rate": 2.4246331973290604e-05, + "loss": 0.8645, + "step": 9220 + }, + { + "epoch": 0.44900542935748544, + "grad_norm": 0.09016067534685135, + "learning_rate": 2.4243249469292342e-05, + "loss": 0.6163, + "step": 9221 + }, + { + "epoch": 0.44905412314659265, + "grad_norm": 0.088408924639225, + "learning_rate": 2.4240166859742865e-05, + "loss": 0.6346, + "step": 9222 + }, + { + "epoch": 0.44910281693569987, + "grad_norm": 1.2980231046676636, + "learning_rate": 2.423708414471885e-05, + "loss": 0.7596, + "step": 9223 + }, + { + "epoch": 0.449151510724807, + "grad_norm": 1.398883581161499, + "learning_rate": 2.4234001324296984e-05, + "loss": 0.8576, + "step": 9224 + }, + { + "epoch": 0.44920020451391424, + "grad_norm": 2.1098392009735107, + "learning_rate": 2.4230918398553946e-05, + "loss": 0.8106, + "step": 9225 + }, + { + "epoch": 0.44924889830302145, + "grad_norm": 1.260680913925171, + "learning_rate": 2.4227835367566428e-05, + "loss": 0.8751, + "step": 9226 + }, + { + "epoch": 0.44929759209212866, + "grad_norm": 1.7617590427398682, + "learning_rate": 2.422475223141112e-05, + "loss": 0.8152, + "step": 9227 + }, + { + "epoch": 0.4493462858812359, + "grad_norm": 2.042428731918335, + "learning_rate": 2.4221668990164715e-05, + "loss": 0.8989, + "step": 9228 + }, + { + "epoch": 0.44939497967034303, + "grad_norm": 1.4516383409500122, + "learning_rate": 2.421858564390391e-05, + "loss": 0.8565, + "step": 9229 + }, + { + "epoch": 0.44944367345945024, + "grad_norm": 1.8422460556030273, + "learning_rate": 2.42155021927054e-05, + "loss": 0.8934, + "step": 9230 + }, + { + "epoch": 0.44949236724855746, + "grad_norm": 1.6528196334838867, + "learning_rate": 2.4212418636645885e-05, + "loss": 0.8269, + "step": 9231 + }, + { + "epoch": 0.44954106103766467, + "grad_norm": 1.6632037162780762, + "learning_rate": 2.4209334975802076e-05, + "loss": 0.8284, + "step": 9232 + }, + { + "epoch": 0.4495897548267718, + "grad_norm": 1.6030197143554688, + "learning_rate": 2.420625121025068e-05, + "loss": 0.9072, + "step": 9233 + }, + { + "epoch": 0.44963844861587904, + "grad_norm": 3.162501335144043, + "learning_rate": 2.4203167340068392e-05, + "loss": 0.8835, + "step": 9234 + }, + { + "epoch": 0.44968714240498625, + "grad_norm": 2.036489725112915, + "learning_rate": 2.420008336533194e-05, + "loss": 0.7807, + "step": 9235 + }, + { + "epoch": 0.44973583619409346, + "grad_norm": 2.3031415939331055, + "learning_rate": 2.4196999286118032e-05, + "loss": 0.896, + "step": 9236 + }, + { + "epoch": 0.4497845299832006, + "grad_norm": 1.5760550498962402, + "learning_rate": 2.4193915102503384e-05, + "loss": 0.8041, + "step": 9237 + }, + { + "epoch": 0.44983322377230783, + "grad_norm": 2.0298008918762207, + "learning_rate": 2.4190830814564715e-05, + "loss": 0.803, + "step": 9238 + }, + { + "epoch": 0.44988191756141505, + "grad_norm": 1.4829109907150269, + "learning_rate": 2.4187746422378747e-05, + "loss": 0.7149, + "step": 9239 + }, + { + "epoch": 0.44993061135052226, + "grad_norm": 1.6144682168960571, + "learning_rate": 2.41846619260222e-05, + "loss": 0.7623, + "step": 9240 + }, + { + "epoch": 0.4499793051396294, + "grad_norm": 1.4991799592971802, + "learning_rate": 2.4181577325571814e-05, + "loss": 0.865, + "step": 9241 + }, + { + "epoch": 0.45002799892873663, + "grad_norm": 0.09423143416643143, + "learning_rate": 2.4178492621104304e-05, + "loss": 0.6514, + "step": 9242 + }, + { + "epoch": 0.45007669271784384, + "grad_norm": 1.4313665628433228, + "learning_rate": 2.4175407812696408e-05, + "loss": 0.8385, + "step": 9243 + }, + { + "epoch": 0.45012538650695105, + "grad_norm": 1.2380937337875366, + "learning_rate": 2.417232290042487e-05, + "loss": 0.8361, + "step": 9244 + }, + { + "epoch": 0.45017408029605827, + "grad_norm": 2.151728868484497, + "learning_rate": 2.4169237884366416e-05, + "loss": 0.8244, + "step": 9245 + }, + { + "epoch": 0.4502227740851654, + "grad_norm": 1.631047248840332, + "learning_rate": 2.416615276459779e-05, + "loss": 0.7811, + "step": 9246 + }, + { + "epoch": 0.45027146787427264, + "grad_norm": 1.2254252433776855, + "learning_rate": 2.4163067541195736e-05, + "loss": 0.831, + "step": 9247 + }, + { + "epoch": 0.45032016166337985, + "grad_norm": 1.3747464418411255, + "learning_rate": 2.4159982214236994e-05, + "loss": 0.8498, + "step": 9248 + }, + { + "epoch": 0.45036885545248706, + "grad_norm": 1.2384296655654907, + "learning_rate": 2.415689678379831e-05, + "loss": 0.7985, + "step": 9249 + }, + { + "epoch": 0.4504175492415942, + "grad_norm": 1.7538765668869019, + "learning_rate": 2.415381124995645e-05, + "loss": 0.7348, + "step": 9250 + }, + { + "epoch": 0.45046624303070143, + "grad_norm": 1.5536199808120728, + "learning_rate": 2.4150725612788145e-05, + "loss": 0.8234, + "step": 9251 + }, + { + "epoch": 0.45051493681980864, + "grad_norm": 2.1797525882720947, + "learning_rate": 2.4147639872370175e-05, + "loss": 0.8557, + "step": 9252 + }, + { + "epoch": 0.45056363060891585, + "grad_norm": 1.9803389310836792, + "learning_rate": 2.414455402877928e-05, + "loss": 0.8824, + "step": 9253 + }, + { + "epoch": 0.450612324398023, + "grad_norm": 1.721333622932434, + "learning_rate": 2.4141468082092222e-05, + "loss": 0.9617, + "step": 9254 + }, + { + "epoch": 0.4506610181871302, + "grad_norm": 1.6900955438613892, + "learning_rate": 2.4138382032385766e-05, + "loss": 0.7757, + "step": 9255 + }, + { + "epoch": 0.45070971197623744, + "grad_norm": 1.6345560550689697, + "learning_rate": 2.4135295879736685e-05, + "loss": 0.8082, + "step": 9256 + }, + { + "epoch": 0.45075840576534465, + "grad_norm": 1.7654565572738647, + "learning_rate": 2.413220962422174e-05, + "loss": 0.7638, + "step": 9257 + }, + { + "epoch": 0.4508070995544518, + "grad_norm": 2.26946759223938, + "learning_rate": 2.4129123265917705e-05, + "loss": 0.832, + "step": 9258 + }, + { + "epoch": 0.450855793343559, + "grad_norm": 1.4134918451309204, + "learning_rate": 2.4126036804901353e-05, + "loss": 0.8685, + "step": 9259 + }, + { + "epoch": 0.45090448713266623, + "grad_norm": 0.0933213010430336, + "learning_rate": 2.412295024124946e-05, + "loss": 0.6436, + "step": 9260 + }, + { + "epoch": 0.45095318092177344, + "grad_norm": 1.3957277536392212, + "learning_rate": 2.4119863575038797e-05, + "loss": 0.8715, + "step": 9261 + }, + { + "epoch": 0.4510018747108806, + "grad_norm": 1.6232985258102417, + "learning_rate": 2.411677680634616e-05, + "loss": 0.8068, + "step": 9262 + }, + { + "epoch": 0.4510505684999878, + "grad_norm": 1.5532264709472656, + "learning_rate": 2.4113689935248318e-05, + "loss": 0.872, + "step": 9263 + }, + { + "epoch": 0.451099262289095, + "grad_norm": 1.484448790550232, + "learning_rate": 2.411060296182207e-05, + "loss": 0.7773, + "step": 9264 + }, + { + "epoch": 0.45114795607820224, + "grad_norm": 1.484782099723816, + "learning_rate": 2.4107515886144196e-05, + "loss": 0.8671, + "step": 9265 + }, + { + "epoch": 0.45119664986730945, + "grad_norm": 4.031190395355225, + "learning_rate": 2.4104428708291487e-05, + "loss": 0.8214, + "step": 9266 + }, + { + "epoch": 0.4512453436564166, + "grad_norm": 1.780124545097351, + "learning_rate": 2.4101341428340748e-05, + "loss": 0.86, + "step": 9267 + }, + { + "epoch": 0.4512940374455238, + "grad_norm": 1.5838751792907715, + "learning_rate": 2.409825404636876e-05, + "loss": 0.9072, + "step": 9268 + }, + { + "epoch": 0.45134273123463103, + "grad_norm": 1.2485777139663696, + "learning_rate": 2.4095166562452326e-05, + "loss": 0.7863, + "step": 9269 + }, + { + "epoch": 0.45139142502373825, + "grad_norm": 1.378786563873291, + "learning_rate": 2.4092078976668252e-05, + "loss": 0.8818, + "step": 9270 + }, + { + "epoch": 0.4514401188128454, + "grad_norm": 1.303175926208496, + "learning_rate": 2.4088991289093346e-05, + "loss": 0.8509, + "step": 9271 + }, + { + "epoch": 0.4514888126019526, + "grad_norm": 1.4395406246185303, + "learning_rate": 2.4085903499804405e-05, + "loss": 0.8219, + "step": 9272 + }, + { + "epoch": 0.45153750639105983, + "grad_norm": 1.9092674255371094, + "learning_rate": 2.408281560887824e-05, + "loss": 0.8532, + "step": 9273 + }, + { + "epoch": 0.45158620018016704, + "grad_norm": 1.6959035396575928, + "learning_rate": 2.407972761639167e-05, + "loss": 0.8407, + "step": 9274 + }, + { + "epoch": 0.4516348939692742, + "grad_norm": 1.5053061246871948, + "learning_rate": 2.40766395224215e-05, + "loss": 0.9262, + "step": 9275 + }, + { + "epoch": 0.4516835877583814, + "grad_norm": 1.7496145963668823, + "learning_rate": 2.4073551327044556e-05, + "loss": 0.8571, + "step": 9276 + }, + { + "epoch": 0.4517322815474886, + "grad_norm": 1.5615291595458984, + "learning_rate": 2.407046303033764e-05, + "loss": 0.8125, + "step": 9277 + }, + { + "epoch": 0.45178097533659584, + "grad_norm": 1.979655385017395, + "learning_rate": 2.406737463237759e-05, + "loss": 0.8523, + "step": 9278 + }, + { + "epoch": 0.451829669125703, + "grad_norm": 4.61497163772583, + "learning_rate": 2.4064286133241228e-05, + "loss": 0.8524, + "step": 9279 + }, + { + "epoch": 0.4518783629148102, + "grad_norm": 2.4113681316375732, + "learning_rate": 2.406119753300538e-05, + "loss": 0.8316, + "step": 9280 + }, + { + "epoch": 0.4519270567039174, + "grad_norm": 1.576491355895996, + "learning_rate": 2.4058108831746867e-05, + "loss": 0.8463, + "step": 9281 + }, + { + "epoch": 0.45197575049302463, + "grad_norm": 2.1984519958496094, + "learning_rate": 2.4055020029542528e-05, + "loss": 0.8939, + "step": 9282 + }, + { + "epoch": 0.4520244442821318, + "grad_norm": 3.0392425060272217, + "learning_rate": 2.4051931126469196e-05, + "loss": 0.7707, + "step": 9283 + }, + { + "epoch": 0.452073138071239, + "grad_norm": 2.387341260910034, + "learning_rate": 2.4048842122603714e-05, + "loss": 0.7863, + "step": 9284 + }, + { + "epoch": 0.4521218318603462, + "grad_norm": 1.5187883377075195, + "learning_rate": 2.4045753018022904e-05, + "loss": 0.9075, + "step": 9285 + }, + { + "epoch": 0.4521705256494534, + "grad_norm": 1.823853611946106, + "learning_rate": 2.404266381280363e-05, + "loss": 0.8605, + "step": 9286 + }, + { + "epoch": 0.45221921943856064, + "grad_norm": 1.55734384059906, + "learning_rate": 2.403957450702272e-05, + "loss": 0.7986, + "step": 9287 + }, + { + "epoch": 0.4522679132276678, + "grad_norm": 1.4641165733337402, + "learning_rate": 2.4036485100757024e-05, + "loss": 0.8269, + "step": 9288 + }, + { + "epoch": 0.452316607016775, + "grad_norm": 1.733610987663269, + "learning_rate": 2.4033395594083392e-05, + "loss": 0.8351, + "step": 9289 + }, + { + "epoch": 0.4523653008058822, + "grad_norm": 1.5524230003356934, + "learning_rate": 2.4030305987078684e-05, + "loss": 0.81, + "step": 9290 + }, + { + "epoch": 0.45241399459498943, + "grad_norm": 1.9498555660247803, + "learning_rate": 2.402721627981974e-05, + "loss": 0.8843, + "step": 9291 + }, + { + "epoch": 0.4524626883840966, + "grad_norm": 1.1406787633895874, + "learning_rate": 2.402412647238342e-05, + "loss": 0.8779, + "step": 9292 + }, + { + "epoch": 0.4525113821732038, + "grad_norm": 1.4980521202087402, + "learning_rate": 2.4021036564846595e-05, + "loss": 0.861, + "step": 9293 + }, + { + "epoch": 0.452560075962311, + "grad_norm": 1.455414056777954, + "learning_rate": 2.401794655728612e-05, + "loss": 0.8352, + "step": 9294 + }, + { + "epoch": 0.4526087697514182, + "grad_norm": 1.7051533460617065, + "learning_rate": 2.4014856449778855e-05, + "loss": 0.7714, + "step": 9295 + }, + { + "epoch": 0.4526574635405254, + "grad_norm": 1.6244300603866577, + "learning_rate": 2.4011766242401673e-05, + "loss": 0.8887, + "step": 9296 + }, + { + "epoch": 0.4527061573296326, + "grad_norm": 2.63506817817688, + "learning_rate": 2.4008675935231436e-05, + "loss": 0.8213, + "step": 9297 + }, + { + "epoch": 0.4527548511187398, + "grad_norm": 3.0570478439331055, + "learning_rate": 2.4005585528345022e-05, + "loss": 0.8622, + "step": 9298 + }, + { + "epoch": 0.452803544907847, + "grad_norm": 1.6800730228424072, + "learning_rate": 2.40024950218193e-05, + "loss": 0.8682, + "step": 9299 + }, + { + "epoch": 0.4528522386969542, + "grad_norm": 3.3720641136169434, + "learning_rate": 2.399940441573115e-05, + "loss": 0.9061, + "step": 9300 + }, + { + "epoch": 0.4529009324860614, + "grad_norm": 1.7547898292541504, + "learning_rate": 2.399631371015745e-05, + "loss": 0.8032, + "step": 9301 + }, + { + "epoch": 0.4529496262751686, + "grad_norm": 1.8132939338684082, + "learning_rate": 2.3993222905175087e-05, + "loss": 0.8914, + "step": 9302 + }, + { + "epoch": 0.4529983200642758, + "grad_norm": 1.2245171070098877, + "learning_rate": 2.3990132000860938e-05, + "loss": 0.8281, + "step": 9303 + }, + { + "epoch": 0.453047013853383, + "grad_norm": 1.3936549425125122, + "learning_rate": 2.398704099729189e-05, + "loss": 0.8155, + "step": 9304 + }, + { + "epoch": 0.4530957076424902, + "grad_norm": 1.4479954242706299, + "learning_rate": 2.3983949894544834e-05, + "loss": 0.7967, + "step": 9305 + }, + { + "epoch": 0.4531444014315974, + "grad_norm": 2.117676258087158, + "learning_rate": 2.3980858692696657e-05, + "loss": 0.9155, + "step": 9306 + }, + { + "epoch": 0.4531930952207046, + "grad_norm": 1.9414749145507812, + "learning_rate": 2.397776739182426e-05, + "loss": 0.8756, + "step": 9307 + }, + { + "epoch": 0.4532417890098118, + "grad_norm": 1.8110743761062622, + "learning_rate": 2.397467599200454e-05, + "loss": 0.7907, + "step": 9308 + }, + { + "epoch": 0.453290482798919, + "grad_norm": 2.1351680755615234, + "learning_rate": 2.397158449331439e-05, + "loss": 0.8008, + "step": 9309 + }, + { + "epoch": 0.4533391765880262, + "grad_norm": 1.7643587589263916, + "learning_rate": 2.396849289583071e-05, + "loss": 0.8493, + "step": 9310 + }, + { + "epoch": 0.4533878703771334, + "grad_norm": 4.635244846343994, + "learning_rate": 2.3965401199630414e-05, + "loss": 0.8421, + "step": 9311 + }, + { + "epoch": 0.4534365641662406, + "grad_norm": 1.7459428310394287, + "learning_rate": 2.39623094047904e-05, + "loss": 0.761, + "step": 9312 + }, + { + "epoch": 0.4534852579553478, + "grad_norm": 1.3694219589233398, + "learning_rate": 2.395921751138758e-05, + "loss": 0.8432, + "step": 9313 + }, + { + "epoch": 0.453533951744455, + "grad_norm": 2.1104178428649902, + "learning_rate": 2.3956125519498853e-05, + "loss": 0.8107, + "step": 9314 + }, + { + "epoch": 0.4535826455335622, + "grad_norm": 2.0041887760162354, + "learning_rate": 2.3953033429201155e-05, + "loss": 0.8792, + "step": 9315 + }, + { + "epoch": 0.4536313393226694, + "grad_norm": 1.8069899082183838, + "learning_rate": 2.3949941240571385e-05, + "loss": 0.8435, + "step": 9316 + }, + { + "epoch": 0.45368003311177657, + "grad_norm": 1.5575246810913086, + "learning_rate": 2.3946848953686468e-05, + "loss": 0.8881, + "step": 9317 + }, + { + "epoch": 0.4537287269008838, + "grad_norm": 1.9434468746185303, + "learning_rate": 2.3943756568623323e-05, + "loss": 0.7373, + "step": 9318 + }, + { + "epoch": 0.453777420689991, + "grad_norm": 0.09351564198732376, + "learning_rate": 2.3940664085458876e-05, + "loss": 0.6096, + "step": 9319 + }, + { + "epoch": 0.4538261144790982, + "grad_norm": 3.0664849281311035, + "learning_rate": 2.3937571504270044e-05, + "loss": 0.8625, + "step": 9320 + }, + { + "epoch": 0.45387480826820537, + "grad_norm": 1.6752629280090332, + "learning_rate": 2.393447882513376e-05, + "loss": 0.781, + "step": 9321 + }, + { + "epoch": 0.4539235020573126, + "grad_norm": 1.7842018604278564, + "learning_rate": 2.393138604812696e-05, + "loss": 0.8614, + "step": 9322 + }, + { + "epoch": 0.4539721958464198, + "grad_norm": 1.743140459060669, + "learning_rate": 2.392829317332658e-05, + "loss": 0.8511, + "step": 9323 + }, + { + "epoch": 0.454020889635527, + "grad_norm": 1.8565815687179565, + "learning_rate": 2.3925200200809546e-05, + "loss": 0.8181, + "step": 9324 + }, + { + "epoch": 0.4540695834246342, + "grad_norm": 1.2134736776351929, + "learning_rate": 2.3922107130652798e-05, + "loss": 0.8317, + "step": 9325 + }, + { + "epoch": 0.4541182772137414, + "grad_norm": 1.5378667116165161, + "learning_rate": 2.391901396293328e-05, + "loss": 0.8478, + "step": 9326 + }, + { + "epoch": 0.4541669710028486, + "grad_norm": 1.4376322031021118, + "learning_rate": 2.3915920697727927e-05, + "loss": 0.8878, + "step": 9327 + }, + { + "epoch": 0.4542156647919558, + "grad_norm": 0.08724281936883926, + "learning_rate": 2.391282733511369e-05, + "loss": 0.6421, + "step": 9328 + }, + { + "epoch": 0.454264358581063, + "grad_norm": 0.09492353349924088, + "learning_rate": 2.390973387516752e-05, + "loss": 0.6416, + "step": 9329 + }, + { + "epoch": 0.45431305237017017, + "grad_norm": 1.3465051651000977, + "learning_rate": 2.3906640317966358e-05, + "loss": 0.8507, + "step": 9330 + }, + { + "epoch": 0.4543617461592774, + "grad_norm": 1.6022616624832153, + "learning_rate": 2.3903546663587168e-05, + "loss": 0.8048, + "step": 9331 + }, + { + "epoch": 0.4544104399483846, + "grad_norm": 1.9234970808029175, + "learning_rate": 2.39004529121069e-05, + "loss": 0.8344, + "step": 9332 + }, + { + "epoch": 0.4544591337374918, + "grad_norm": 2.5524022579193115, + "learning_rate": 2.3897359063602505e-05, + "loss": 0.8957, + "step": 9333 + }, + { + "epoch": 0.45450782752659896, + "grad_norm": 0.08721481263637543, + "learning_rate": 2.389426511815095e-05, + "loss": 0.6236, + "step": 9334 + }, + { + "epoch": 0.4545565213157062, + "grad_norm": 1.311015248298645, + "learning_rate": 2.3891171075829195e-05, + "loss": 0.9466, + "step": 9335 + }, + { + "epoch": 0.4546052151048134, + "grad_norm": 2.1666767597198486, + "learning_rate": 2.3888076936714204e-05, + "loss": 0.8987, + "step": 9336 + }, + { + "epoch": 0.4546539088939206, + "grad_norm": 1.4151486158370972, + "learning_rate": 2.3884982700882945e-05, + "loss": 0.7777, + "step": 9337 + }, + { + "epoch": 0.45470260268302776, + "grad_norm": 0.08474066853523254, + "learning_rate": 2.3881888368412386e-05, + "loss": 0.5578, + "step": 9338 + }, + { + "epoch": 0.45475129647213497, + "grad_norm": 1.5137206315994263, + "learning_rate": 2.3878793939379505e-05, + "loss": 0.8782, + "step": 9339 + }, + { + "epoch": 0.4547999902612422, + "grad_norm": 1.3185672760009766, + "learning_rate": 2.3875699413861263e-05, + "loss": 0.7672, + "step": 9340 + }, + { + "epoch": 0.4548486840503494, + "grad_norm": 1.4554592370986938, + "learning_rate": 2.387260479193465e-05, + "loss": 0.9414, + "step": 9341 + }, + { + "epoch": 0.45489737783945655, + "grad_norm": 1.5982229709625244, + "learning_rate": 2.386951007367664e-05, + "loss": 0.7854, + "step": 9342 + }, + { + "epoch": 0.45494607162856376, + "grad_norm": 1.628440499305725, + "learning_rate": 2.3866415259164208e-05, + "loss": 0.745, + "step": 9343 + }, + { + "epoch": 0.454994765417671, + "grad_norm": 1.4318500757217407, + "learning_rate": 2.3863320348474352e-05, + "loss": 0.964, + "step": 9344 + }, + { + "epoch": 0.4550434592067782, + "grad_norm": 1.5278774499893188, + "learning_rate": 2.3860225341684045e-05, + "loss": 0.9461, + "step": 9345 + }, + { + "epoch": 0.4550921529958854, + "grad_norm": 1.8027158975601196, + "learning_rate": 2.3857130238870285e-05, + "loss": 0.8655, + "step": 9346 + }, + { + "epoch": 0.45514084678499256, + "grad_norm": 1.6553531885147095, + "learning_rate": 2.3854035040110057e-05, + "loss": 0.8387, + "step": 9347 + }, + { + "epoch": 0.45518954057409977, + "grad_norm": 1.586249828338623, + "learning_rate": 2.3850939745480357e-05, + "loss": 0.8875, + "step": 9348 + }, + { + "epoch": 0.455238234363207, + "grad_norm": 2.5298268795013428, + "learning_rate": 2.3847844355058176e-05, + "loss": 0.7937, + "step": 9349 + }, + { + "epoch": 0.4552869281523142, + "grad_norm": 2.0459163188934326, + "learning_rate": 2.384474886892051e-05, + "loss": 0.868, + "step": 9350 + }, + { + "epoch": 0.45533562194142135, + "grad_norm": 1.8541353940963745, + "learning_rate": 2.3841653287144376e-05, + "loss": 0.8066, + "step": 9351 + }, + { + "epoch": 0.45538431573052857, + "grad_norm": 1.4973914623260498, + "learning_rate": 2.3838557609806766e-05, + "loss": 0.8315, + "step": 9352 + }, + { + "epoch": 0.4554330095196358, + "grad_norm": 2.112278699874878, + "learning_rate": 2.3835461836984683e-05, + "loss": 0.831, + "step": 9353 + }, + { + "epoch": 0.455481703308743, + "grad_norm": 1.4152041673660278, + "learning_rate": 2.383236596875514e-05, + "loss": 0.7744, + "step": 9354 + }, + { + "epoch": 0.45553039709785015, + "grad_norm": 1.461103081703186, + "learning_rate": 2.382927000519514e-05, + "loss": 0.8248, + "step": 9355 + }, + { + "epoch": 0.45557909088695736, + "grad_norm": 1.4296083450317383, + "learning_rate": 2.38261739463817e-05, + "loss": 0.9361, + "step": 9356 + }, + { + "epoch": 0.4556277846760646, + "grad_norm": 1.9797999858856201, + "learning_rate": 2.382307779239183e-05, + "loss": 0.7922, + "step": 9357 + }, + { + "epoch": 0.4556764784651718, + "grad_norm": 1.465471863746643, + "learning_rate": 2.381998154330256e-05, + "loss": 0.826, + "step": 9358 + }, + { + "epoch": 0.45572517225427894, + "grad_norm": 1.9835729598999023, + "learning_rate": 2.38168851991909e-05, + "loss": 0.8845, + "step": 9359 + }, + { + "epoch": 0.45577386604338616, + "grad_norm": 1.2896828651428223, + "learning_rate": 2.381378876013386e-05, + "loss": 0.8545, + "step": 9360 + }, + { + "epoch": 0.45582255983249337, + "grad_norm": 1.5559988021850586, + "learning_rate": 2.381069222620849e-05, + "loss": 0.9053, + "step": 9361 + }, + { + "epoch": 0.4558712536216006, + "grad_norm": 1.8672354221343994, + "learning_rate": 2.3807595597491798e-05, + "loss": 0.8304, + "step": 9362 + }, + { + "epoch": 0.45591994741070774, + "grad_norm": 1.9466060400009155, + "learning_rate": 2.3804498874060827e-05, + "loss": 0.9049, + "step": 9363 + }, + { + "epoch": 0.45596864119981495, + "grad_norm": 3.28164005279541, + "learning_rate": 2.3801402055992585e-05, + "loss": 0.8272, + "step": 9364 + }, + { + "epoch": 0.45601733498892216, + "grad_norm": 2.51106858253479, + "learning_rate": 2.379830514336413e-05, + "loss": 0.8368, + "step": 9365 + }, + { + "epoch": 0.4560660287780294, + "grad_norm": 0.09219799190759659, + "learning_rate": 2.3795208136252484e-05, + "loss": 0.5568, + "step": 9366 + }, + { + "epoch": 0.4561147225671366, + "grad_norm": 2.947798490524292, + "learning_rate": 2.3792111034734696e-05, + "loss": 0.7856, + "step": 9367 + }, + { + "epoch": 0.45616341635624374, + "grad_norm": 1.8437674045562744, + "learning_rate": 2.3789013838887795e-05, + "loss": 0.787, + "step": 9368 + }, + { + "epoch": 0.45621211014535096, + "grad_norm": 1.7924444675445557, + "learning_rate": 2.3785916548788834e-05, + "loss": 0.894, + "step": 9369 + }, + { + "epoch": 0.45626080393445817, + "grad_norm": 1.9265516996383667, + "learning_rate": 2.3782819164514846e-05, + "loss": 0.9443, + "step": 9370 + }, + { + "epoch": 0.4563094977235654, + "grad_norm": 1.9986103773117065, + "learning_rate": 2.377972168614289e-05, + "loss": 0.8358, + "step": 9371 + }, + { + "epoch": 0.45635819151267254, + "grad_norm": 1.873665452003479, + "learning_rate": 2.3776624113750014e-05, + "loss": 0.8273, + "step": 9372 + }, + { + "epoch": 0.45640688530177975, + "grad_norm": 1.4642781019210815, + "learning_rate": 2.3773526447413265e-05, + "loss": 0.8287, + "step": 9373 + }, + { + "epoch": 0.45645557909088696, + "grad_norm": 2.3006019592285156, + "learning_rate": 2.3770428687209707e-05, + "loss": 0.812, + "step": 9374 + }, + { + "epoch": 0.4565042728799942, + "grad_norm": 2.055706739425659, + "learning_rate": 2.376733083321639e-05, + "loss": 0.8829, + "step": 9375 + }, + { + "epoch": 0.45655296666910133, + "grad_norm": 0.10439599305391312, + "learning_rate": 2.3764232885510374e-05, + "loss": 0.6464, + "step": 9376 + }, + { + "epoch": 0.45660166045820855, + "grad_norm": 1.5668535232543945, + "learning_rate": 2.3761134844168724e-05, + "loss": 0.8695, + "step": 9377 + }, + { + "epoch": 0.45665035424731576, + "grad_norm": 1.6374201774597168, + "learning_rate": 2.3758036709268497e-05, + "loss": 0.733, + "step": 9378 + }, + { + "epoch": 0.45669904803642297, + "grad_norm": 2.21762752532959, + "learning_rate": 2.3754938480886763e-05, + "loss": 0.7941, + "step": 9379 + }, + { + "epoch": 0.45674774182553013, + "grad_norm": 2.1288564205169678, + "learning_rate": 2.3751840159100595e-05, + "loss": 0.8386, + "step": 9380 + }, + { + "epoch": 0.45679643561463734, + "grad_norm": 1.821272611618042, + "learning_rate": 2.3748741743987063e-05, + "loss": 0.9063, + "step": 9381 + }, + { + "epoch": 0.45684512940374455, + "grad_norm": 1.6330845355987549, + "learning_rate": 2.3745643235623237e-05, + "loss": 0.8696, + "step": 9382 + }, + { + "epoch": 0.45689382319285177, + "grad_norm": 1.508890151977539, + "learning_rate": 2.3742544634086197e-05, + "loss": 0.8568, + "step": 9383 + }, + { + "epoch": 0.4569425169819589, + "grad_norm": 1.9301843643188477, + "learning_rate": 2.3739445939453015e-05, + "loss": 0.8245, + "step": 9384 + }, + { + "epoch": 0.45699121077106614, + "grad_norm": 7.657761573791504, + "learning_rate": 2.3736347151800776e-05, + "loss": 0.8303, + "step": 9385 + }, + { + "epoch": 0.45703990456017335, + "grad_norm": 1.5539109706878662, + "learning_rate": 2.3733248271206556e-05, + "loss": 0.8258, + "step": 9386 + }, + { + "epoch": 0.45708859834928056, + "grad_norm": 2.287461996078491, + "learning_rate": 2.373014929774745e-05, + "loss": 0.9183, + "step": 9387 + }, + { + "epoch": 0.4571372921383878, + "grad_norm": 1.4506021738052368, + "learning_rate": 2.3727050231500534e-05, + "loss": 0.867, + "step": 9388 + }, + { + "epoch": 0.45718598592749493, + "grad_norm": 1.4510774612426758, + "learning_rate": 2.372395107254291e-05, + "loss": 0.8695, + "step": 9389 + }, + { + "epoch": 0.45723467971660214, + "grad_norm": 1.3983690738677979, + "learning_rate": 2.372085182095166e-05, + "loss": 0.7585, + "step": 9390 + }, + { + "epoch": 0.45728337350570936, + "grad_norm": 2.5425705909729004, + "learning_rate": 2.3717752476803883e-05, + "loss": 0.9558, + "step": 9391 + }, + { + "epoch": 0.45733206729481657, + "grad_norm": 1.3508349657058716, + "learning_rate": 2.3714653040176673e-05, + "loss": 0.9349, + "step": 9392 + }, + { + "epoch": 0.4573807610839237, + "grad_norm": 1.3788113594055176, + "learning_rate": 2.371155351114713e-05, + "loss": 0.8108, + "step": 9393 + }, + { + "epoch": 0.45742945487303094, + "grad_norm": 3.3966152667999268, + "learning_rate": 2.3708453889792356e-05, + "loss": 0.9426, + "step": 9394 + }, + { + "epoch": 0.45747814866213815, + "grad_norm": 1.4022003412246704, + "learning_rate": 2.3705354176189456e-05, + "loss": 0.8599, + "step": 9395 + }, + { + "epoch": 0.45752684245124536, + "grad_norm": 2.1840157508850098, + "learning_rate": 2.370225437041553e-05, + "loss": 0.844, + "step": 9396 + }, + { + "epoch": 0.4575755362403525, + "grad_norm": 2.064444065093994, + "learning_rate": 2.3699154472547693e-05, + "loss": 0.8556, + "step": 9397 + }, + { + "epoch": 0.45762423002945973, + "grad_norm": 1.3324882984161377, + "learning_rate": 2.3696054482663045e-05, + "loss": 0.7979, + "step": 9398 + }, + { + "epoch": 0.45767292381856695, + "grad_norm": 2.4002749919891357, + "learning_rate": 2.3692954400838707e-05, + "loss": 0.9209, + "step": 9399 + }, + { + "epoch": 0.45772161760767416, + "grad_norm": 1.4297295808792114, + "learning_rate": 2.3689854227151793e-05, + "loss": 0.8776, + "step": 9400 + }, + { + "epoch": 0.4577703113967813, + "grad_norm": 1.945505976676941, + "learning_rate": 2.3686753961679415e-05, + "loss": 0.8992, + "step": 9401 + }, + { + "epoch": 0.45781900518588853, + "grad_norm": 1.4579957723617554, + "learning_rate": 2.36836536044987e-05, + "loss": 0.8848, + "step": 9402 + }, + { + "epoch": 0.45786769897499574, + "grad_norm": 1.574660062789917, + "learning_rate": 2.3680553155686763e-05, + "loss": 0.7556, + "step": 9403 + }, + { + "epoch": 0.45791639276410295, + "grad_norm": 1.6605470180511475, + "learning_rate": 2.3677452615320732e-05, + "loss": 0.8358, + "step": 9404 + }, + { + "epoch": 0.45796508655321017, + "grad_norm": 1.667250156402588, + "learning_rate": 2.3674351983477735e-05, + "loss": 0.9027, + "step": 9405 + }, + { + "epoch": 0.4580137803423173, + "grad_norm": 1.8343521356582642, + "learning_rate": 2.36712512602349e-05, + "loss": 0.8514, + "step": 9406 + }, + { + "epoch": 0.45806247413142454, + "grad_norm": 1.6004366874694824, + "learning_rate": 2.366815044566934e-05, + "loss": 0.8122, + "step": 9407 + }, + { + "epoch": 0.45811116792053175, + "grad_norm": 1.4852635860443115, + "learning_rate": 2.3665049539858216e-05, + "loss": 0.8551, + "step": 9408 + }, + { + "epoch": 0.45815986170963896, + "grad_norm": 1.3709518909454346, + "learning_rate": 2.3661948542878645e-05, + "loss": 0.8136, + "step": 9409 + }, + { + "epoch": 0.4582085554987461, + "grad_norm": 1.714150071144104, + "learning_rate": 2.365884745480777e-05, + "loss": 0.9033, + "step": 9410 + }, + { + "epoch": 0.45825724928785333, + "grad_norm": 1.2029365301132202, + "learning_rate": 2.3655746275722737e-05, + "loss": 0.7731, + "step": 9411 + }, + { + "epoch": 0.45830594307696054, + "grad_norm": 1.4102766513824463, + "learning_rate": 2.3652645005700682e-05, + "loss": 0.8598, + "step": 9412 + }, + { + "epoch": 0.45835463686606776, + "grad_norm": 1.5227919816970825, + "learning_rate": 2.3649543644818748e-05, + "loss": 0.8386, + "step": 9413 + }, + { + "epoch": 0.4584033306551749, + "grad_norm": 3.2988839149475098, + "learning_rate": 2.364644219315408e-05, + "loss": 0.8355, + "step": 9414 + }, + { + "epoch": 0.4584520244442821, + "grad_norm": 1.523937463760376, + "learning_rate": 2.3643340650783827e-05, + "loss": 0.8471, + "step": 9415 + }, + { + "epoch": 0.45850071823338934, + "grad_norm": 2.2708141803741455, + "learning_rate": 2.364023901778515e-05, + "loss": 0.7639, + "step": 9416 + }, + { + "epoch": 0.45854941202249655, + "grad_norm": 1.5420384407043457, + "learning_rate": 2.3637137294235192e-05, + "loss": 0.7932, + "step": 9417 + }, + { + "epoch": 0.4585981058116037, + "grad_norm": 6.367826461791992, + "learning_rate": 2.3634035480211113e-05, + "loss": 0.8475, + "step": 9418 + }, + { + "epoch": 0.4586467996007109, + "grad_norm": 1.5280234813690186, + "learning_rate": 2.3630933575790068e-05, + "loss": 0.9018, + "step": 9419 + }, + { + "epoch": 0.45869549338981813, + "grad_norm": 2.213805913925171, + "learning_rate": 2.3627831581049222e-05, + "loss": 0.8448, + "step": 9420 + }, + { + "epoch": 0.45874418717892534, + "grad_norm": 3.2515015602111816, + "learning_rate": 2.3624729496065734e-05, + "loss": 0.7946, + "step": 9421 + }, + { + "epoch": 0.4587928809680325, + "grad_norm": 3.286517381668091, + "learning_rate": 2.362162732091677e-05, + "loss": 0.9453, + "step": 9422 + }, + { + "epoch": 0.4588415747571397, + "grad_norm": 5.120028972625732, + "learning_rate": 2.3618525055679493e-05, + "loss": 0.812, + "step": 9423 + }, + { + "epoch": 0.4588902685462469, + "grad_norm": 1.6786524057388306, + "learning_rate": 2.3615422700431078e-05, + "loss": 0.794, + "step": 9424 + }, + { + "epoch": 0.45893896233535414, + "grad_norm": 1.3864115476608276, + "learning_rate": 2.3612320255248692e-05, + "loss": 0.8722, + "step": 9425 + }, + { + "epoch": 0.45898765612446135, + "grad_norm": 2.747267723083496, + "learning_rate": 2.3609217720209517e-05, + "loss": 0.8084, + "step": 9426 + }, + { + "epoch": 0.4590363499135685, + "grad_norm": 1.6816377639770508, + "learning_rate": 2.3606115095390715e-05, + "loss": 0.8218, + "step": 9427 + }, + { + "epoch": 0.4590850437026757, + "grad_norm": 2.4299144744873047, + "learning_rate": 2.3603012380869473e-05, + "loss": 0.7923, + "step": 9428 + }, + { + "epoch": 0.45913373749178293, + "grad_norm": 1.8665484189987183, + "learning_rate": 2.3599909576722967e-05, + "loss": 0.8898, + "step": 9429 + }, + { + "epoch": 0.45918243128089015, + "grad_norm": 1.421526312828064, + "learning_rate": 2.3596806683028382e-05, + "loss": 0.8077, + "step": 9430 + }, + { + "epoch": 0.4592311250699973, + "grad_norm": 2.000256299972534, + "learning_rate": 2.3593703699862903e-05, + "loss": 0.7503, + "step": 9431 + }, + { + "epoch": 0.4592798188591045, + "grad_norm": 1.59358549118042, + "learning_rate": 2.359060062730372e-05, + "loss": 0.8227, + "step": 9432 + }, + { + "epoch": 0.45932851264821173, + "grad_norm": 2.7328367233276367, + "learning_rate": 2.3587497465428026e-05, + "loss": 0.8002, + "step": 9433 + }, + { + "epoch": 0.45937720643731894, + "grad_norm": 2.0130608081817627, + "learning_rate": 2.3584394214312994e-05, + "loss": 0.8632, + "step": 9434 + }, + { + "epoch": 0.4594259002264261, + "grad_norm": 1.6799945831298828, + "learning_rate": 2.3581290874035836e-05, + "loss": 0.8236, + "step": 9435 + }, + { + "epoch": 0.4594745940155333, + "grad_norm": 1.8224340677261353, + "learning_rate": 2.3578187444673736e-05, + "loss": 0.8976, + "step": 9436 + }, + { + "epoch": 0.4595232878046405, + "grad_norm": 0.09716988354921341, + "learning_rate": 2.35750839263039e-05, + "loss": 0.629, + "step": 9437 + }, + { + "epoch": 0.45957198159374774, + "grad_norm": 1.3764554262161255, + "learning_rate": 2.3571980319003524e-05, + "loss": 0.8072, + "step": 9438 + }, + { + "epoch": 0.4596206753828549, + "grad_norm": 2.2919869422912598, + "learning_rate": 2.356887662284981e-05, + "loss": 0.7858, + "step": 9439 + }, + { + "epoch": 0.4596693691719621, + "grad_norm": 0.09084028750658035, + "learning_rate": 2.3565772837919964e-05, + "loss": 0.6254, + "step": 9440 + }, + { + "epoch": 0.4597180629610693, + "grad_norm": 2.602638006210327, + "learning_rate": 2.3562668964291197e-05, + "loss": 0.8627, + "step": 9441 + }, + { + "epoch": 0.45976675675017653, + "grad_norm": 1.9873803853988647, + "learning_rate": 2.3559565002040717e-05, + "loss": 0.8439, + "step": 9442 + }, + { + "epoch": 0.4598154505392837, + "grad_norm": 2.2014787197113037, + "learning_rate": 2.355646095124573e-05, + "loss": 0.8655, + "step": 9443 + }, + { + "epoch": 0.4598641443283909, + "grad_norm": 1.5751808881759644, + "learning_rate": 2.355335681198345e-05, + "loss": 0.8738, + "step": 9444 + }, + { + "epoch": 0.4599128381174981, + "grad_norm": 1.486635446548462, + "learning_rate": 2.3550252584331095e-05, + "loss": 0.8408, + "step": 9445 + }, + { + "epoch": 0.4599615319066053, + "grad_norm": 1.911900281906128, + "learning_rate": 2.354714826836589e-05, + "loss": 0.7824, + "step": 9446 + }, + { + "epoch": 0.46001022569571254, + "grad_norm": 1.6266437768936157, + "learning_rate": 2.3544043864165044e-05, + "loss": 0.8581, + "step": 9447 + }, + { + "epoch": 0.4600589194848197, + "grad_norm": 1.5795944929122925, + "learning_rate": 2.3540939371805783e-05, + "loss": 0.7532, + "step": 9448 + }, + { + "epoch": 0.4601076132739269, + "grad_norm": 1.7724641561508179, + "learning_rate": 2.3537834791365332e-05, + "loss": 0.8306, + "step": 9449 + }, + { + "epoch": 0.4601563070630341, + "grad_norm": 2.0832114219665527, + "learning_rate": 2.3534730122920922e-05, + "loss": 0.7323, + "step": 9450 + }, + { + "epoch": 0.46020500085214133, + "grad_norm": 1.4830085039138794, + "learning_rate": 2.3531625366549776e-05, + "loss": 0.7656, + "step": 9451 + }, + { + "epoch": 0.4602536946412485, + "grad_norm": 1.5859402418136597, + "learning_rate": 2.3528520522329123e-05, + "loss": 0.8859, + "step": 9452 + }, + { + "epoch": 0.4603023884303557, + "grad_norm": 1.4562609195709229, + "learning_rate": 2.3525415590336205e-05, + "loss": 0.8536, + "step": 9453 + }, + { + "epoch": 0.4603510822194629, + "grad_norm": 1.497797966003418, + "learning_rate": 2.3522310570648252e-05, + "loss": 0.7754, + "step": 9454 + }, + { + "epoch": 0.4603997760085701, + "grad_norm": 1.449002742767334, + "learning_rate": 2.3519205463342503e-05, + "loss": 0.8861, + "step": 9455 + }, + { + "epoch": 0.4604484697976773, + "grad_norm": 1.6917130947113037, + "learning_rate": 2.3516100268496197e-05, + "loss": 0.8195, + "step": 9456 + }, + { + "epoch": 0.4604971635867845, + "grad_norm": 1.7988604307174683, + "learning_rate": 2.3512994986186576e-05, + "loss": 0.791, + "step": 9457 + }, + { + "epoch": 0.4605458573758917, + "grad_norm": 3.1326487064361572, + "learning_rate": 2.350988961649088e-05, + "loss": 0.8395, + "step": 9458 + }, + { + "epoch": 0.4605945511649989, + "grad_norm": 1.850110650062561, + "learning_rate": 2.350678415948636e-05, + "loss": 0.8545, + "step": 9459 + }, + { + "epoch": 0.4606432449541061, + "grad_norm": 0.092884361743927, + "learning_rate": 2.350367861525027e-05, + "loss": 0.5818, + "step": 9460 + }, + { + "epoch": 0.4606919387432133, + "grad_norm": 2.2177541255950928, + "learning_rate": 2.3500572983859854e-05, + "loss": 0.7392, + "step": 9461 + }, + { + "epoch": 0.4607406325323205, + "grad_norm": 1.1531239748001099, + "learning_rate": 2.3497467265392366e-05, + "loss": 0.8954, + "step": 9462 + }, + { + "epoch": 0.4607893263214277, + "grad_norm": 4.712494850158691, + "learning_rate": 2.349436145992506e-05, + "loss": 0.9016, + "step": 9463 + }, + { + "epoch": 0.4608380201105349, + "grad_norm": 1.131941795349121, + "learning_rate": 2.3491255567535195e-05, + "loss": 0.8725, + "step": 9464 + }, + { + "epoch": 0.4608867138996421, + "grad_norm": 0.9628004431724548, + "learning_rate": 2.3488149588300032e-05, + "loss": 0.8914, + "step": 9465 + }, + { + "epoch": 0.4609354076887493, + "grad_norm": 1.3775628805160522, + "learning_rate": 2.3485043522296816e-05, + "loss": 0.8198, + "step": 9466 + }, + { + "epoch": 0.4609841014778565, + "grad_norm": 1.378213882446289, + "learning_rate": 2.3481937369602838e-05, + "loss": 0.8876, + "step": 9467 + }, + { + "epoch": 0.4610327952669637, + "grad_norm": 2.380969285964966, + "learning_rate": 2.347883113029535e-05, + "loss": 0.8284, + "step": 9468 + }, + { + "epoch": 0.4610814890560709, + "grad_norm": 1.9711928367614746, + "learning_rate": 2.347572480445161e-05, + "loss": 0.8862, + "step": 9469 + }, + { + "epoch": 0.4611301828451781, + "grad_norm": 1.3604304790496826, + "learning_rate": 2.347261839214891e-05, + "loss": 0.845, + "step": 9470 + }, + { + "epoch": 0.4611788766342853, + "grad_norm": 2.1294939517974854, + "learning_rate": 2.3469511893464508e-05, + "loss": 0.7839, + "step": 9471 + }, + { + "epoch": 0.4612275704233925, + "grad_norm": 1.1836981773376465, + "learning_rate": 2.346640530847568e-05, + "loss": 0.946, + "step": 9472 + }, + { + "epoch": 0.4612762642124997, + "grad_norm": 3.252530097961426, + "learning_rate": 2.3463298637259702e-05, + "loss": 0.8096, + "step": 9473 + }, + { + "epoch": 0.4613249580016069, + "grad_norm": 1.5619081258773804, + "learning_rate": 2.3460191879893856e-05, + "loss": 0.6993, + "step": 9474 + }, + { + "epoch": 0.4613736517907141, + "grad_norm": 2.139279365539551, + "learning_rate": 2.3457085036455426e-05, + "loss": 0.8189, + "step": 9475 + }, + { + "epoch": 0.4614223455798213, + "grad_norm": 2.623940944671631, + "learning_rate": 2.3453978107021687e-05, + "loss": 0.8274, + "step": 9476 + }, + { + "epoch": 0.46147103936892847, + "grad_norm": 2.0330958366394043, + "learning_rate": 2.3450871091669927e-05, + "loss": 0.7634, + "step": 9477 + }, + { + "epoch": 0.4615197331580357, + "grad_norm": 1.5791987180709839, + "learning_rate": 2.344776399047744e-05, + "loss": 0.8569, + "step": 9478 + }, + { + "epoch": 0.4615684269471429, + "grad_norm": 4.942636489868164, + "learning_rate": 2.34446568035215e-05, + "loss": 0.806, + "step": 9479 + }, + { + "epoch": 0.4616171207362501, + "grad_norm": 1.5337042808532715, + "learning_rate": 2.3441549530879407e-05, + "loss": 0.7711, + "step": 9480 + }, + { + "epoch": 0.46166581452535727, + "grad_norm": 4.916550159454346, + "learning_rate": 2.343844217262846e-05, + "loss": 0.7885, + "step": 9481 + }, + { + "epoch": 0.4617145083144645, + "grad_norm": 1.0899600982666016, + "learning_rate": 2.343533472884595e-05, + "loss": 0.7753, + "step": 9482 + }, + { + "epoch": 0.4617632021035717, + "grad_norm": 1.1983942985534668, + "learning_rate": 2.3432227199609182e-05, + "loss": 0.8346, + "step": 9483 + }, + { + "epoch": 0.4618118958926789, + "grad_norm": 0.12359480559825897, + "learning_rate": 2.3429119584995442e-05, + "loss": 0.6301, + "step": 9484 + }, + { + "epoch": 0.4618605896817861, + "grad_norm": 1.4422645568847656, + "learning_rate": 2.3426011885082046e-05, + "loss": 0.7957, + "step": 9485 + }, + { + "epoch": 0.4619092834708933, + "grad_norm": 1.8003084659576416, + "learning_rate": 2.3422904099946283e-05, + "loss": 0.8011, + "step": 9486 + }, + { + "epoch": 0.4619579772600005, + "grad_norm": 1.4923721551895142, + "learning_rate": 2.3419796229665474e-05, + "loss": 0.8173, + "step": 9487 + }, + { + "epoch": 0.4620066710491077, + "grad_norm": 1.3506762981414795, + "learning_rate": 2.3416688274316922e-05, + "loss": 0.9307, + "step": 9488 + }, + { + "epoch": 0.4620553648382149, + "grad_norm": 1.3613203763961792, + "learning_rate": 2.341358023397793e-05, + "loss": 0.9163, + "step": 9489 + }, + { + "epoch": 0.46210405862732207, + "grad_norm": 3.139536142349243, + "learning_rate": 2.341047210872583e-05, + "loss": 0.8146, + "step": 9490 + }, + { + "epoch": 0.4621527524164293, + "grad_norm": 1.6729592084884644, + "learning_rate": 2.340736389863792e-05, + "loss": 0.9434, + "step": 9491 + }, + { + "epoch": 0.4622014462055365, + "grad_norm": 2.63322114944458, + "learning_rate": 2.3404255603791525e-05, + "loss": 0.8219, + "step": 9492 + }, + { + "epoch": 0.4622501399946437, + "grad_norm": 11.230531692504883, + "learning_rate": 2.3401147224263957e-05, + "loss": 0.8311, + "step": 9493 + }, + { + "epoch": 0.46229883378375086, + "grad_norm": 1.456547498703003, + "learning_rate": 2.3398038760132543e-05, + "loss": 0.8503, + "step": 9494 + }, + { + "epoch": 0.4623475275728581, + "grad_norm": 1.6420316696166992, + "learning_rate": 2.3394930211474603e-05, + "loss": 0.7742, + "step": 9495 + }, + { + "epoch": 0.4623962213619653, + "grad_norm": 1.244420051574707, + "learning_rate": 2.3391821578367472e-05, + "loss": 0.8047, + "step": 9496 + }, + { + "epoch": 0.4624449151510725, + "grad_norm": 0.08887434005737305, + "learning_rate": 2.3388712860888463e-05, + "loss": 0.6589, + "step": 9497 + }, + { + "epoch": 0.46249360894017966, + "grad_norm": 1.7536295652389526, + "learning_rate": 2.338560405911491e-05, + "loss": 0.8991, + "step": 9498 + }, + { + "epoch": 0.46254230272928687, + "grad_norm": 2.1700856685638428, + "learning_rate": 2.3382495173124154e-05, + "loss": 0.8722, + "step": 9499 + }, + { + "epoch": 0.4625909965183941, + "grad_norm": 1.6001110076904297, + "learning_rate": 2.3379386202993523e-05, + "loss": 0.7584, + "step": 9500 + }, + { + "epoch": 0.4626396903075013, + "grad_norm": 0.08843635022640228, + "learning_rate": 2.3376277148800347e-05, + "loss": 0.6258, + "step": 9501 + }, + { + "epoch": 0.46268838409660845, + "grad_norm": 1.4447318315505981, + "learning_rate": 2.337316801062197e-05, + "loss": 0.7939, + "step": 9502 + }, + { + "epoch": 0.46273707788571566, + "grad_norm": 1.1994638442993164, + "learning_rate": 2.3370058788535737e-05, + "loss": 0.8859, + "step": 9503 + }, + { + "epoch": 0.4627857716748229, + "grad_norm": 1.5606945753097534, + "learning_rate": 2.3366949482618982e-05, + "loss": 0.7572, + "step": 9504 + }, + { + "epoch": 0.4628344654639301, + "grad_norm": 1.4422789812088013, + "learning_rate": 2.336384009294905e-05, + "loss": 0.7812, + "step": 9505 + }, + { + "epoch": 0.4628831592530373, + "grad_norm": 0.10875290632247925, + "learning_rate": 2.336073061960329e-05, + "loss": 0.6009, + "step": 9506 + }, + { + "epoch": 0.46293185304214446, + "grad_norm": 2.005992889404297, + "learning_rate": 2.3357621062659048e-05, + "loss": 0.8296, + "step": 9507 + }, + { + "epoch": 0.46298054683125167, + "grad_norm": 1.77765691280365, + "learning_rate": 2.335451142219367e-05, + "loss": 0.8128, + "step": 9508 + }, + { + "epoch": 0.4630292406203589, + "grad_norm": 1.8829137086868286, + "learning_rate": 2.3351401698284523e-05, + "loss": 0.8974, + "step": 9509 + }, + { + "epoch": 0.4630779344094661, + "grad_norm": 7.666214942932129, + "learning_rate": 2.3348291891008946e-05, + "loss": 0.8522, + "step": 9510 + }, + { + "epoch": 0.46312662819857325, + "grad_norm": 1.3198137283325195, + "learning_rate": 2.334518200044431e-05, + "loss": 0.7668, + "step": 9511 + }, + { + "epoch": 0.46317532198768047, + "grad_norm": 1.936940312385559, + "learning_rate": 2.3342072026667966e-05, + "loss": 0.9552, + "step": 9512 + }, + { + "epoch": 0.4632240157767877, + "grad_norm": 1.3870010375976562, + "learning_rate": 2.3338961969757272e-05, + "loss": 0.8598, + "step": 9513 + }, + { + "epoch": 0.4632727095658949, + "grad_norm": 1.4836037158966064, + "learning_rate": 2.33358518297896e-05, + "loss": 0.851, + "step": 9514 + }, + { + "epoch": 0.46332140335500205, + "grad_norm": 1.7274901866912842, + "learning_rate": 2.3332741606842308e-05, + "loss": 0.8405, + "step": 9515 + }, + { + "epoch": 0.46337009714410926, + "grad_norm": 1.3026833534240723, + "learning_rate": 2.3329631300992757e-05, + "loss": 0.8525, + "step": 9516 + }, + { + "epoch": 0.4634187909332165, + "grad_norm": 0.08937790989875793, + "learning_rate": 2.3326520912318333e-05, + "loss": 0.5569, + "step": 9517 + }, + { + "epoch": 0.4634674847223237, + "grad_norm": 1.6540998220443726, + "learning_rate": 2.3323410440896392e-05, + "loss": 0.7762, + "step": 9518 + }, + { + "epoch": 0.46351617851143084, + "grad_norm": 1.5781750679016113, + "learning_rate": 2.3320299886804314e-05, + "loss": 0.766, + "step": 9519 + }, + { + "epoch": 0.46356487230053806, + "grad_norm": 1.9053242206573486, + "learning_rate": 2.3317189250119476e-05, + "loss": 0.8197, + "step": 9520 + }, + { + "epoch": 0.46361356608964527, + "grad_norm": 0.08858995139598846, + "learning_rate": 2.3314078530919256e-05, + "loss": 0.6242, + "step": 9521 + }, + { + "epoch": 0.4636622598787525, + "grad_norm": 2.242734670639038, + "learning_rate": 2.3310967729281026e-05, + "loss": 0.8389, + "step": 9522 + }, + { + "epoch": 0.46371095366785964, + "grad_norm": 1.4994584321975708, + "learning_rate": 2.3307856845282175e-05, + "loss": 0.8219, + "step": 9523 + }, + { + "epoch": 0.46375964745696685, + "grad_norm": 2.371572732925415, + "learning_rate": 2.3304745879000076e-05, + "loss": 0.8587, + "step": 9524 + }, + { + "epoch": 0.46380834124607406, + "grad_norm": 2.435760974884033, + "learning_rate": 2.330163483051213e-05, + "loss": 0.9063, + "step": 9525 + }, + { + "epoch": 0.4638570350351813, + "grad_norm": 2.0242862701416016, + "learning_rate": 2.3298523699895715e-05, + "loss": 0.9116, + "step": 9526 + }, + { + "epoch": 0.4639057288242885, + "grad_norm": 1.3633142709732056, + "learning_rate": 2.329541248722822e-05, + "loss": 0.7449, + "step": 9527 + }, + { + "epoch": 0.46395442261339565, + "grad_norm": 1.3699085712432861, + "learning_rate": 2.3292301192587035e-05, + "loss": 0.8321, + "step": 9528 + }, + { + "epoch": 0.46400311640250286, + "grad_norm": 1.6364020109176636, + "learning_rate": 2.3289189816049563e-05, + "loss": 0.9163, + "step": 9529 + }, + { + "epoch": 0.46405181019161007, + "grad_norm": 1.196271538734436, + "learning_rate": 2.3286078357693195e-05, + "loss": 0.9313, + "step": 9530 + }, + { + "epoch": 0.4641005039807173, + "grad_norm": 1.4158655405044556, + "learning_rate": 2.328296681759533e-05, + "loss": 0.8599, + "step": 9531 + }, + { + "epoch": 0.46414919776982444, + "grad_norm": 1.5900976657867432, + "learning_rate": 2.327985519583336e-05, + "loss": 0.7439, + "step": 9532 + }, + { + "epoch": 0.46419789155893165, + "grad_norm": 1.3199442625045776, + "learning_rate": 2.3276743492484696e-05, + "loss": 0.7777, + "step": 9533 + }, + { + "epoch": 0.46424658534803886, + "grad_norm": 2.5126795768737793, + "learning_rate": 2.3273631707626742e-05, + "loss": 0.7941, + "step": 9534 + }, + { + "epoch": 0.4642952791371461, + "grad_norm": 1.5555895566940308, + "learning_rate": 2.3270519841336897e-05, + "loss": 0.8376, + "step": 9535 + }, + { + "epoch": 0.46434397292625323, + "grad_norm": 1.4675194025039673, + "learning_rate": 2.3267407893692573e-05, + "loss": 0.8768, + "step": 9536 + }, + { + "epoch": 0.46439266671536045, + "grad_norm": 2.047128200531006, + "learning_rate": 2.3264295864771182e-05, + "loss": 0.8012, + "step": 9537 + }, + { + "epoch": 0.46444136050446766, + "grad_norm": 1.687175989151001, + "learning_rate": 2.326118375465013e-05, + "loss": 0.7933, + "step": 9538 + }, + { + "epoch": 0.46449005429357487, + "grad_norm": 1.4739466905593872, + "learning_rate": 2.3258071563406835e-05, + "loss": 0.8918, + "step": 9539 + }, + { + "epoch": 0.46453874808268203, + "grad_norm": 2.0756001472473145, + "learning_rate": 2.3254959291118713e-05, + "loss": 0.8126, + "step": 9540 + }, + { + "epoch": 0.46458744187178924, + "grad_norm": 1.4608166217803955, + "learning_rate": 2.3251846937863186e-05, + "loss": 0.8881, + "step": 9541 + }, + { + "epoch": 0.46463613566089645, + "grad_norm": 2.2879607677459717, + "learning_rate": 2.324873450371767e-05, + "loss": 0.8917, + "step": 9542 + }, + { + "epoch": 0.46468482945000367, + "grad_norm": 0.09847338497638702, + "learning_rate": 2.3245621988759585e-05, + "loss": 0.5791, + "step": 9543 + }, + { + "epoch": 0.4647335232391108, + "grad_norm": 1.8237576484680176, + "learning_rate": 2.3242509393066355e-05, + "loss": 0.9094, + "step": 9544 + }, + { + "epoch": 0.46478221702821804, + "grad_norm": 1.6300678253173828, + "learning_rate": 2.3239396716715406e-05, + "loss": 0.8444, + "step": 9545 + }, + { + "epoch": 0.46483091081732525, + "grad_norm": 2.2119240760803223, + "learning_rate": 2.3236283959784173e-05, + "loss": 0.8119, + "step": 9546 + }, + { + "epoch": 0.46487960460643246, + "grad_norm": 0.0946350246667862, + "learning_rate": 2.323317112235008e-05, + "loss": 0.5915, + "step": 9547 + }, + { + "epoch": 0.4649282983955397, + "grad_norm": 1.479537844657898, + "learning_rate": 2.3230058204490557e-05, + "loss": 0.8441, + "step": 9548 + }, + { + "epoch": 0.46497699218464683, + "grad_norm": 0.09253904968500137, + "learning_rate": 2.322694520628305e-05, + "loss": 0.6822, + "step": 9549 + }, + { + "epoch": 0.46502568597375404, + "grad_norm": 1.7157851457595825, + "learning_rate": 2.322383212780498e-05, + "loss": 0.9307, + "step": 9550 + }, + { + "epoch": 0.46507437976286126, + "grad_norm": 1.6522992849349976, + "learning_rate": 2.3220718969133796e-05, + "loss": 0.8805, + "step": 9551 + }, + { + "epoch": 0.46512307355196847, + "grad_norm": 1.713485836982727, + "learning_rate": 2.3217605730346935e-05, + "loss": 0.7774, + "step": 9552 + }, + { + "epoch": 0.4651717673410756, + "grad_norm": 3.4199841022491455, + "learning_rate": 2.321449241152183e-05, + "loss": 0.8121, + "step": 9553 + }, + { + "epoch": 0.46522046113018284, + "grad_norm": 1.8113363981246948, + "learning_rate": 2.321137901273594e-05, + "loss": 0.8488, + "step": 9554 + }, + { + "epoch": 0.46526915491929005, + "grad_norm": 1.6622239351272583, + "learning_rate": 2.32082655340667e-05, + "loss": 0.8595, + "step": 9555 + }, + { + "epoch": 0.46531784870839726, + "grad_norm": 1.7634592056274414, + "learning_rate": 2.320515197559157e-05, + "loss": 0.865, + "step": 9556 + }, + { + "epoch": 0.4653665424975044, + "grad_norm": 1.5301570892333984, + "learning_rate": 2.3202038337387988e-05, + "loss": 0.7697, + "step": 9557 + }, + { + "epoch": 0.46541523628661163, + "grad_norm": 2.1506083011627197, + "learning_rate": 2.3198924619533404e-05, + "loss": 0.8566, + "step": 9558 + }, + { + "epoch": 0.46546393007571885, + "grad_norm": 1.422890305519104, + "learning_rate": 2.3195810822105286e-05, + "loss": 0.8533, + "step": 9559 + }, + { + "epoch": 0.46551262386482606, + "grad_norm": 1.493672490119934, + "learning_rate": 2.3192696945181082e-05, + "loss": 0.9682, + "step": 9560 + }, + { + "epoch": 0.4655613176539332, + "grad_norm": 2.07556414604187, + "learning_rate": 2.318958298883825e-05, + "loss": 0.8368, + "step": 9561 + }, + { + "epoch": 0.46561001144304043, + "grad_norm": 0.0937376394867897, + "learning_rate": 2.3186468953154247e-05, + "loss": 0.6312, + "step": 9562 + }, + { + "epoch": 0.46565870523214764, + "grad_norm": 1.635290503501892, + "learning_rate": 2.3183354838206542e-05, + "loss": 0.8761, + "step": 9563 + }, + { + "epoch": 0.46570739902125485, + "grad_norm": 1.9309954643249512, + "learning_rate": 2.3180240644072596e-05, + "loss": 0.9029, + "step": 9564 + }, + { + "epoch": 0.46575609281036207, + "grad_norm": 1.6364561319351196, + "learning_rate": 2.3177126370829872e-05, + "loss": 0.8053, + "step": 9565 + }, + { + "epoch": 0.4658047865994692, + "grad_norm": 1.8176530599594116, + "learning_rate": 2.3174012018555844e-05, + "loss": 0.7849, + "step": 9566 + }, + { + "epoch": 0.46585348038857644, + "grad_norm": 1.708375334739685, + "learning_rate": 2.3170897587327964e-05, + "loss": 0.903, + "step": 9567 + }, + { + "epoch": 0.46590217417768365, + "grad_norm": 1.5659509897232056, + "learning_rate": 2.3167783077223725e-05, + "loss": 0.7924, + "step": 9568 + }, + { + "epoch": 0.46595086796679086, + "grad_norm": 1.540980339050293, + "learning_rate": 2.3164668488320597e-05, + "loss": 0.8047, + "step": 9569 + }, + { + "epoch": 0.465999561755898, + "grad_norm": 2.211533308029175, + "learning_rate": 2.3161553820696054e-05, + "loss": 0.8905, + "step": 9570 + }, + { + "epoch": 0.46604825554500523, + "grad_norm": 2.471097946166992, + "learning_rate": 2.3158439074427564e-05, + "loss": 0.8588, + "step": 9571 + }, + { + "epoch": 0.46609694933411244, + "grad_norm": 2.6666176319122314, + "learning_rate": 2.3155324249592618e-05, + "loss": 0.8999, + "step": 9572 + }, + { + "epoch": 0.46614564312321966, + "grad_norm": 0.09047599136829376, + "learning_rate": 2.3152209346268695e-05, + "loss": 0.5852, + "step": 9573 + }, + { + "epoch": 0.4661943369123268, + "grad_norm": 1.572757601737976, + "learning_rate": 2.3149094364533275e-05, + "loss": 0.8953, + "step": 9574 + }, + { + "epoch": 0.466243030701434, + "grad_norm": 1.4408204555511475, + "learning_rate": 2.3145979304463842e-05, + "loss": 0.7299, + "step": 9575 + }, + { + "epoch": 0.46629172449054124, + "grad_norm": 1.8908343315124512, + "learning_rate": 2.314286416613789e-05, + "loss": 0.827, + "step": 9576 + }, + { + "epoch": 0.46634041827964845, + "grad_norm": 1.6544030904769897, + "learning_rate": 2.313974894963291e-05, + "loss": 0.9056, + "step": 9577 + }, + { + "epoch": 0.4663891120687556, + "grad_norm": 1.6914520263671875, + "learning_rate": 2.3136633655026382e-05, + "loss": 0.9139, + "step": 9578 + }, + { + "epoch": 0.4664378058578628, + "grad_norm": 1.556460976600647, + "learning_rate": 2.3133518282395805e-05, + "loss": 0.8958, + "step": 9579 + }, + { + "epoch": 0.46648649964697003, + "grad_norm": 1.9352130889892578, + "learning_rate": 2.313040283181868e-05, + "loss": 0.6948, + "step": 9580 + }, + { + "epoch": 0.46653519343607724, + "grad_norm": 0.09271930158138275, + "learning_rate": 2.31272873033725e-05, + "loss": 0.6415, + "step": 9581 + }, + { + "epoch": 0.4665838872251844, + "grad_norm": 1.3895153999328613, + "learning_rate": 2.3124171697134755e-05, + "loss": 0.8133, + "step": 9582 + }, + { + "epoch": 0.4666325810142916, + "grad_norm": 1.7961499691009521, + "learning_rate": 2.312105601318296e-05, + "loss": 0.8349, + "step": 9583 + }, + { + "epoch": 0.4666812748033988, + "grad_norm": 1.7568795680999756, + "learning_rate": 2.3117940251594614e-05, + "loss": 0.7041, + "step": 9584 + }, + { + "epoch": 0.46672996859250604, + "grad_norm": 1.3496098518371582, + "learning_rate": 2.311482441244722e-05, + "loss": 0.8031, + "step": 9585 + }, + { + "epoch": 0.46677866238161325, + "grad_norm": 3.402045249938965, + "learning_rate": 2.3111708495818284e-05, + "loss": 0.7965, + "step": 9586 + }, + { + "epoch": 0.4668273561707204, + "grad_norm": 1.3730697631835938, + "learning_rate": 2.310859250178531e-05, + "loss": 0.8905, + "step": 9587 + }, + { + "epoch": 0.4668760499598276, + "grad_norm": 1.9534341096878052, + "learning_rate": 2.3105476430425818e-05, + "loss": 0.851, + "step": 9588 + }, + { + "epoch": 0.46692474374893483, + "grad_norm": 2.1514527797698975, + "learning_rate": 2.3102360281817316e-05, + "loss": 0.8717, + "step": 9589 + }, + { + "epoch": 0.46697343753804205, + "grad_norm": 1.5399136543273926, + "learning_rate": 2.309924405603732e-05, + "loss": 0.8852, + "step": 9590 + }, + { + "epoch": 0.4670221313271492, + "grad_norm": 2.548295259475708, + "learning_rate": 2.3096127753163344e-05, + "loss": 0.7805, + "step": 9591 + }, + { + "epoch": 0.4670708251162564, + "grad_norm": 1.582964539527893, + "learning_rate": 2.3093011373272913e-05, + "loss": 0.9285, + "step": 9592 + }, + { + "epoch": 0.46711951890536363, + "grad_norm": 2.4076342582702637, + "learning_rate": 2.3089894916443537e-05, + "loss": 0.738, + "step": 9593 + }, + { + "epoch": 0.46716821269447084, + "grad_norm": 1.541421890258789, + "learning_rate": 2.3086778382752746e-05, + "loss": 0.8309, + "step": 9594 + }, + { + "epoch": 0.467216906483578, + "grad_norm": 0.10037525743246078, + "learning_rate": 2.308366177227806e-05, + "loss": 0.6815, + "step": 9595 + }, + { + "epoch": 0.4672656002726852, + "grad_norm": 1.542281150817871, + "learning_rate": 2.3080545085097003e-05, + "loss": 0.8563, + "step": 9596 + }, + { + "epoch": 0.4673142940617924, + "grad_norm": 1.585633397102356, + "learning_rate": 2.307742832128711e-05, + "loss": 0.8357, + "step": 9597 + }, + { + "epoch": 0.46736298785089964, + "grad_norm": 1.505815863609314, + "learning_rate": 2.30743114809259e-05, + "loss": 0.8761, + "step": 9598 + }, + { + "epoch": 0.4674116816400068, + "grad_norm": 2.6466994285583496, + "learning_rate": 2.3071194564090923e-05, + "loss": 0.8529, + "step": 9599 + }, + { + "epoch": 0.467460375429114, + "grad_norm": 1.7588011026382446, + "learning_rate": 2.30680775708597e-05, + "loss": 0.822, + "step": 9600 + }, + { + "epoch": 0.4675090692182212, + "grad_norm": 3.2868385314941406, + "learning_rate": 2.3064960501309762e-05, + "loss": 0.8196, + "step": 9601 + }, + { + "epoch": 0.46755776300732843, + "grad_norm": 1.6794157028198242, + "learning_rate": 2.306184335551865e-05, + "loss": 0.9408, + "step": 9602 + }, + { + "epoch": 0.4676064567964356, + "grad_norm": 1.5129374265670776, + "learning_rate": 2.305872613356391e-05, + "loss": 0.8093, + "step": 9603 + }, + { + "epoch": 0.4676551505855428, + "grad_norm": 2.1048507690429688, + "learning_rate": 2.305560883552308e-05, + "loss": 0.8369, + "step": 9604 + }, + { + "epoch": 0.46770384437465, + "grad_norm": 1.4761366844177246, + "learning_rate": 2.30524914614737e-05, + "loss": 0.8602, + "step": 9605 + }, + { + "epoch": 0.4677525381637572, + "grad_norm": 1.769097924232483, + "learning_rate": 2.3049374011493314e-05, + "loss": 0.8625, + "step": 9606 + }, + { + "epoch": 0.46780123195286444, + "grad_norm": 2.3632912635803223, + "learning_rate": 2.3046256485659472e-05, + "loss": 0.8648, + "step": 9607 + }, + { + "epoch": 0.4678499257419716, + "grad_norm": 1.4909512996673584, + "learning_rate": 2.304313888404972e-05, + "loss": 0.8588, + "step": 9608 + }, + { + "epoch": 0.4678986195310788, + "grad_norm": 1.4919167757034302, + "learning_rate": 2.3040021206741617e-05, + "loss": 0.7524, + "step": 9609 + }, + { + "epoch": 0.467947313320186, + "grad_norm": 1.418560266494751, + "learning_rate": 2.303690345381271e-05, + "loss": 0.8503, + "step": 9610 + }, + { + "epoch": 0.46799600710929323, + "grad_norm": 3.7629761695861816, + "learning_rate": 2.3033785625340544e-05, + "loss": 0.8028, + "step": 9611 + }, + { + "epoch": 0.4680447008984004, + "grad_norm": 5.67594051361084, + "learning_rate": 2.303066772140269e-05, + "loss": 0.9146, + "step": 9612 + }, + { + "epoch": 0.4680933946875076, + "grad_norm": 1.5924655199050903, + "learning_rate": 2.3027549742076702e-05, + "loss": 0.865, + "step": 9613 + }, + { + "epoch": 0.4681420884766148, + "grad_norm": 3.371048927307129, + "learning_rate": 2.3024431687440133e-05, + "loss": 0.8034, + "step": 9614 + }, + { + "epoch": 0.468190782265722, + "grad_norm": 1.5895434617996216, + "learning_rate": 2.3021313557570552e-05, + "loss": 0.8201, + "step": 9615 + }, + { + "epoch": 0.4682394760548292, + "grad_norm": 1.471542477607727, + "learning_rate": 2.3018195352545523e-05, + "loss": 0.8122, + "step": 9616 + }, + { + "epoch": 0.4682881698439364, + "grad_norm": 1.1562198400497437, + "learning_rate": 2.3015077072442608e-05, + "loss": 0.8592, + "step": 9617 + }, + { + "epoch": 0.4683368636330436, + "grad_norm": 1.8100725412368774, + "learning_rate": 2.3011958717339377e-05, + "loss": 0.8203, + "step": 9618 + }, + { + "epoch": 0.4683855574221508, + "grad_norm": 1.664250135421753, + "learning_rate": 2.300884028731339e-05, + "loss": 0.8468, + "step": 9619 + }, + { + "epoch": 0.468434251211258, + "grad_norm": 1.6765714883804321, + "learning_rate": 2.3005721782442233e-05, + "loss": 0.8695, + "step": 9620 + }, + { + "epoch": 0.4684829450003652, + "grad_norm": 3.866168975830078, + "learning_rate": 2.3002603202803476e-05, + "loss": 0.7905, + "step": 9621 + }, + { + "epoch": 0.4685316387894724, + "grad_norm": 1.3538014888763428, + "learning_rate": 2.299948454847469e-05, + "loss": 0.8274, + "step": 9622 + }, + { + "epoch": 0.4685803325785796, + "grad_norm": 1.416087031364441, + "learning_rate": 2.299636581953345e-05, + "loss": 0.7719, + "step": 9623 + }, + { + "epoch": 0.4686290263676868, + "grad_norm": 1.9121413230895996, + "learning_rate": 2.2993247016057336e-05, + "loss": 0.8378, + "step": 9624 + }, + { + "epoch": 0.468677720156794, + "grad_norm": 1.7014873027801514, + "learning_rate": 2.2990128138123927e-05, + "loss": 0.8141, + "step": 9625 + }, + { + "epoch": 0.4687264139459012, + "grad_norm": 2.302557945251465, + "learning_rate": 2.2987009185810812e-05, + "loss": 0.8827, + "step": 9626 + }, + { + "epoch": 0.4687751077350084, + "grad_norm": 1.8943380117416382, + "learning_rate": 2.2983890159195568e-05, + "loss": 0.768, + "step": 9627 + }, + { + "epoch": 0.4688238015241156, + "grad_norm": 1.3582141399383545, + "learning_rate": 2.298077105835579e-05, + "loss": 0.9777, + "step": 9628 + }, + { + "epoch": 0.4688724953132228, + "grad_norm": 1.7614612579345703, + "learning_rate": 2.2977651883369056e-05, + "loss": 0.831, + "step": 9629 + }, + { + "epoch": 0.46892118910233, + "grad_norm": 1.4505056142807007, + "learning_rate": 2.297453263431296e-05, + "loss": 0.8406, + "step": 9630 + }, + { + "epoch": 0.4689698828914372, + "grad_norm": 1.6934411525726318, + "learning_rate": 2.29714133112651e-05, + "loss": 0.8028, + "step": 9631 + }, + { + "epoch": 0.4690185766805444, + "grad_norm": 1.5884618759155273, + "learning_rate": 2.2968293914303056e-05, + "loss": 0.7812, + "step": 9632 + }, + { + "epoch": 0.4690672704696516, + "grad_norm": 1.407713532447815, + "learning_rate": 2.2965174443504426e-05, + "loss": 0.8754, + "step": 9633 + }, + { + "epoch": 0.4691159642587588, + "grad_norm": 1.5650568008422852, + "learning_rate": 2.296205489894682e-05, + "loss": 0.8448, + "step": 9634 + }, + { + "epoch": 0.469164658047866, + "grad_norm": 1.393799901008606, + "learning_rate": 2.2958935280707824e-05, + "loss": 0.8605, + "step": 9635 + }, + { + "epoch": 0.4692133518369732, + "grad_norm": 1.5281119346618652, + "learning_rate": 2.2955815588865046e-05, + "loss": 0.8276, + "step": 9636 + }, + { + "epoch": 0.46926204562608037, + "grad_norm": 1.468758225440979, + "learning_rate": 2.295269582349608e-05, + "loss": 0.8873, + "step": 9637 + }, + { + "epoch": 0.4693107394151876, + "grad_norm": 1.9064159393310547, + "learning_rate": 2.2949575984678542e-05, + "loss": 0.8366, + "step": 9638 + }, + { + "epoch": 0.4693594332042948, + "grad_norm": 0.0909847766160965, + "learning_rate": 2.2946456072490033e-05, + "loss": 0.576, + "step": 9639 + }, + { + "epoch": 0.469408126993402, + "grad_norm": 2.7034494876861572, + "learning_rate": 2.2943336087008152e-05, + "loss": 0.8138, + "step": 9640 + }, + { + "epoch": 0.46945682078250917, + "grad_norm": 2.2520759105682373, + "learning_rate": 2.2940216028310526e-05, + "loss": 0.7925, + "step": 9641 + }, + { + "epoch": 0.4695055145716164, + "grad_norm": 1.1462126970291138, + "learning_rate": 2.2937095896474756e-05, + "loss": 0.8681, + "step": 9642 + }, + { + "epoch": 0.4695542083607236, + "grad_norm": 1.2915114164352417, + "learning_rate": 2.293397569157846e-05, + "loss": 0.7632, + "step": 9643 + }, + { + "epoch": 0.4696029021498308, + "grad_norm": 1.7976772785186768, + "learning_rate": 2.293085541369925e-05, + "loss": 0.9536, + "step": 9644 + }, + { + "epoch": 0.469651595938938, + "grad_norm": 1.6115326881408691, + "learning_rate": 2.292773506291474e-05, + "loss": 0.8427, + "step": 9645 + }, + { + "epoch": 0.4697002897280452, + "grad_norm": 0.08746649324893951, + "learning_rate": 2.292461463930256e-05, + "loss": 0.5997, + "step": 9646 + }, + { + "epoch": 0.4697489835171524, + "grad_norm": 1.5555933713912964, + "learning_rate": 2.292149414294032e-05, + "loss": 0.8246, + "step": 9647 + }, + { + "epoch": 0.4697976773062596, + "grad_norm": 1.938303828239441, + "learning_rate": 2.2918373573905645e-05, + "loss": 0.6928, + "step": 9648 + }, + { + "epoch": 0.4698463710953668, + "grad_norm": 1.5974328517913818, + "learning_rate": 2.291525293227616e-05, + "loss": 0.8015, + "step": 9649 + }, + { + "epoch": 0.46989506488447397, + "grad_norm": 1.9173171520233154, + "learning_rate": 2.2912132218129498e-05, + "loss": 0.8189, + "step": 9650 + }, + { + "epoch": 0.4699437586735812, + "grad_norm": 1.5077787637710571, + "learning_rate": 2.2909011431543284e-05, + "loss": 0.972, + "step": 9651 + }, + { + "epoch": 0.4699924524626884, + "grad_norm": 2.0351691246032715, + "learning_rate": 2.2905890572595143e-05, + "loss": 0.7832, + "step": 9652 + }, + { + "epoch": 0.4700411462517956, + "grad_norm": 1.6259636878967285, + "learning_rate": 2.290276964136271e-05, + "loss": 0.8432, + "step": 9653 + }, + { + "epoch": 0.47008984004090276, + "grad_norm": 1.6038362979888916, + "learning_rate": 2.2899648637923608e-05, + "loss": 0.9452, + "step": 9654 + }, + { + "epoch": 0.47013853383001, + "grad_norm": 2.8963778018951416, + "learning_rate": 2.289652756235549e-05, + "loss": 0.9046, + "step": 9655 + }, + { + "epoch": 0.4701872276191172, + "grad_norm": 4.5135955810546875, + "learning_rate": 2.2893406414735984e-05, + "loss": 0.9214, + "step": 9656 + }, + { + "epoch": 0.4702359214082244, + "grad_norm": 1.3112198114395142, + "learning_rate": 2.289028519514273e-05, + "loss": 0.7994, + "step": 9657 + }, + { + "epoch": 0.47028461519733156, + "grad_norm": 2.3807363510131836, + "learning_rate": 2.2887163903653368e-05, + "loss": 0.8254, + "step": 9658 + }, + { + "epoch": 0.47033330898643877, + "grad_norm": 1.1847233772277832, + "learning_rate": 2.288404254034554e-05, + "loss": 0.8856, + "step": 9659 + }, + { + "epoch": 0.470382002775546, + "grad_norm": 2.5870161056518555, + "learning_rate": 2.288092110529689e-05, + "loss": 0.8714, + "step": 9660 + }, + { + "epoch": 0.4704306965646532, + "grad_norm": 1.2928657531738281, + "learning_rate": 2.2877799598585065e-05, + "loss": 0.9033, + "step": 9661 + }, + { + "epoch": 0.47047939035376035, + "grad_norm": 1.671343445777893, + "learning_rate": 2.287467802028771e-05, + "loss": 0.9144, + "step": 9662 + }, + { + "epoch": 0.47052808414286756, + "grad_norm": 1.8442397117614746, + "learning_rate": 2.2871556370482483e-05, + "loss": 0.8009, + "step": 9663 + }, + { + "epoch": 0.4705767779319748, + "grad_norm": 1.3164799213409424, + "learning_rate": 2.2868434649247026e-05, + "loss": 0.8487, + "step": 9664 + }, + { + "epoch": 0.470625471721082, + "grad_norm": 1.6490577459335327, + "learning_rate": 2.2865312856659e-05, + "loss": 0.796, + "step": 9665 + }, + { + "epoch": 0.4706741655101892, + "grad_norm": 1.8118630647659302, + "learning_rate": 2.286219099279605e-05, + "loss": 0.9, + "step": 9666 + }, + { + "epoch": 0.47072285929929636, + "grad_norm": 1.7347781658172607, + "learning_rate": 2.285906905773584e-05, + "loss": 0.8266, + "step": 9667 + }, + { + "epoch": 0.47077155308840357, + "grad_norm": 1.2350109815597534, + "learning_rate": 2.2855947051556027e-05, + "loss": 0.768, + "step": 9668 + }, + { + "epoch": 0.4708202468775108, + "grad_norm": 1.4767705202102661, + "learning_rate": 2.2852824974334272e-05, + "loss": 0.8588, + "step": 9669 + }, + { + "epoch": 0.470868940666618, + "grad_norm": 2.2159295082092285, + "learning_rate": 2.2849702826148226e-05, + "loss": 0.8067, + "step": 9670 + }, + { + "epoch": 0.47091763445572515, + "grad_norm": 1.409261703491211, + "learning_rate": 2.2846580607075572e-05, + "loss": 0.8513, + "step": 9671 + }, + { + "epoch": 0.47096632824483237, + "grad_norm": 1.7590440511703491, + "learning_rate": 2.2843458317193968e-05, + "loss": 0.8182, + "step": 9672 + }, + { + "epoch": 0.4710150220339396, + "grad_norm": 4.938271522521973, + "learning_rate": 2.2840335956581075e-05, + "loss": 0.9914, + "step": 9673 + }, + { + "epoch": 0.4710637158230468, + "grad_norm": 3.1302547454833984, + "learning_rate": 2.2837213525314565e-05, + "loss": 0.8038, + "step": 9674 + }, + { + "epoch": 0.47111240961215395, + "grad_norm": 1.4660788774490356, + "learning_rate": 2.2834091023472117e-05, + "loss": 0.8273, + "step": 9675 + }, + { + "epoch": 0.47116110340126116, + "grad_norm": 1.8914251327514648, + "learning_rate": 2.2830968451131382e-05, + "loss": 0.8959, + "step": 9676 + }, + { + "epoch": 0.4712097971903684, + "grad_norm": 1.8374379873275757, + "learning_rate": 2.2827845808370056e-05, + "loss": 0.8864, + "step": 9677 + }, + { + "epoch": 0.4712584909794756, + "grad_norm": 4.1996049880981445, + "learning_rate": 2.2824723095265807e-05, + "loss": 0.7964, + "step": 9678 + }, + { + "epoch": 0.47130718476858274, + "grad_norm": 1.3955082893371582, + "learning_rate": 2.2821600311896317e-05, + "loss": 0.8027, + "step": 9679 + }, + { + "epoch": 0.47135587855768996, + "grad_norm": 4.784780979156494, + "learning_rate": 2.2818477458339264e-05, + "loss": 0.8173, + "step": 9680 + }, + { + "epoch": 0.47140457234679717, + "grad_norm": 3.160963773727417, + "learning_rate": 2.2815354534672322e-05, + "loss": 0.7764, + "step": 9681 + }, + { + "epoch": 0.4714532661359044, + "grad_norm": 1.151466727256775, + "learning_rate": 2.2812231540973184e-05, + "loss": 0.8168, + "step": 9682 + }, + { + "epoch": 0.47150195992501154, + "grad_norm": 1.393662691116333, + "learning_rate": 2.2809108477319522e-05, + "loss": 0.7583, + "step": 9683 + }, + { + "epoch": 0.47155065371411875, + "grad_norm": 2.050398111343384, + "learning_rate": 2.2805985343789037e-05, + "loss": 0.7657, + "step": 9684 + }, + { + "epoch": 0.47159934750322596, + "grad_norm": 1.442256212234497, + "learning_rate": 2.280286214045941e-05, + "loss": 0.8965, + "step": 9685 + }, + { + "epoch": 0.4716480412923332, + "grad_norm": 1.5825320482254028, + "learning_rate": 2.279973886740833e-05, + "loss": 0.7977, + "step": 9686 + }, + { + "epoch": 0.4716967350814404, + "grad_norm": 2.5936484336853027, + "learning_rate": 2.2796615524713488e-05, + "loss": 0.8663, + "step": 9687 + }, + { + "epoch": 0.47174542887054755, + "grad_norm": 2.072342872619629, + "learning_rate": 2.2793492112452588e-05, + "loss": 0.7545, + "step": 9688 + }, + { + "epoch": 0.47179412265965476, + "grad_norm": 1.552575945854187, + "learning_rate": 2.279036863070331e-05, + "loss": 0.8511, + "step": 9689 + }, + { + "epoch": 0.47184281644876197, + "grad_norm": 0.09129578620195389, + "learning_rate": 2.278724507954336e-05, + "loss": 0.6717, + "step": 9690 + }, + { + "epoch": 0.4718915102378692, + "grad_norm": 1.6451078653335571, + "learning_rate": 2.2784121459050432e-05, + "loss": 0.8214, + "step": 9691 + }, + { + "epoch": 0.47194020402697634, + "grad_norm": 1.390196681022644, + "learning_rate": 2.2780997769302234e-05, + "loss": 0.87, + "step": 9692 + }, + { + "epoch": 0.47198889781608355, + "grad_norm": 2.10080623626709, + "learning_rate": 2.2777874010376458e-05, + "loss": 0.841, + "step": 9693 + }, + { + "epoch": 0.47203759160519076, + "grad_norm": 0.09172099828720093, + "learning_rate": 2.277475018235082e-05, + "loss": 0.6312, + "step": 9694 + }, + { + "epoch": 0.472086285394298, + "grad_norm": 1.1206183433532715, + "learning_rate": 2.2771626285303016e-05, + "loss": 0.9441, + "step": 9695 + }, + { + "epoch": 0.47213497918340513, + "grad_norm": 1.789348840713501, + "learning_rate": 2.2768502319310746e-05, + "loss": 0.7479, + "step": 9696 + }, + { + "epoch": 0.47218367297251235, + "grad_norm": 2.057852029800415, + "learning_rate": 2.2765378284451733e-05, + "loss": 0.8494, + "step": 9697 + }, + { + "epoch": 0.47223236676161956, + "grad_norm": 10.594517707824707, + "learning_rate": 2.276225418080369e-05, + "loss": 0.8769, + "step": 9698 + }, + { + "epoch": 0.4722810605507268, + "grad_norm": 1.6463960409164429, + "learning_rate": 2.2759130008444314e-05, + "loss": 0.7982, + "step": 9699 + }, + { + "epoch": 0.47232975433983393, + "grad_norm": 1.1775310039520264, + "learning_rate": 2.275600576745133e-05, + "loss": 0.8356, + "step": 9700 + }, + { + "epoch": 0.47237844812894114, + "grad_norm": 0.09378410130739212, + "learning_rate": 2.2752881457902455e-05, + "loss": 0.6538, + "step": 9701 + }, + { + "epoch": 0.47242714191804835, + "grad_norm": 1.3425557613372803, + "learning_rate": 2.27497570798754e-05, + "loss": 0.8305, + "step": 9702 + }, + { + "epoch": 0.47247583570715557, + "grad_norm": 2.727139472961426, + "learning_rate": 2.2746632633447888e-05, + "loss": 0.7903, + "step": 9703 + }, + { + "epoch": 0.4725245294962627, + "grad_norm": 1.292462944984436, + "learning_rate": 2.2743508118697636e-05, + "loss": 0.8951, + "step": 9704 + }, + { + "epoch": 0.47257322328536994, + "grad_norm": 1.1972532272338867, + "learning_rate": 2.274038353570237e-05, + "loss": 0.8827, + "step": 9705 + }, + { + "epoch": 0.47262191707447715, + "grad_norm": 1.6334925889968872, + "learning_rate": 2.273725888453981e-05, + "loss": 0.7387, + "step": 9706 + }, + { + "epoch": 0.47267061086358436, + "grad_norm": 1.5649553537368774, + "learning_rate": 2.273413416528769e-05, + "loss": 0.8321, + "step": 9707 + }, + { + "epoch": 0.4727193046526916, + "grad_norm": 4.287748336791992, + "learning_rate": 2.2731009378023738e-05, + "loss": 0.8438, + "step": 9708 + }, + { + "epoch": 0.47276799844179873, + "grad_norm": 1.8742711544036865, + "learning_rate": 2.2727884522825675e-05, + "loss": 0.8089, + "step": 9709 + }, + { + "epoch": 0.47281669223090594, + "grad_norm": 1.8328032493591309, + "learning_rate": 2.2724759599771237e-05, + "loss": 0.7551, + "step": 9710 + }, + { + "epoch": 0.47286538602001316, + "grad_norm": 1.5785282850265503, + "learning_rate": 2.272163460893815e-05, + "loss": 0.8481, + "step": 9711 + }, + { + "epoch": 0.47291407980912037, + "grad_norm": 0.08940300345420837, + "learning_rate": 2.2718509550404165e-05, + "loss": 0.5738, + "step": 9712 + }, + { + "epoch": 0.4729627735982275, + "grad_norm": 1.202347755432129, + "learning_rate": 2.2715384424246994e-05, + "loss": 0.9158, + "step": 9713 + }, + { + "epoch": 0.47301146738733474, + "grad_norm": 3.1523220539093018, + "learning_rate": 2.2712259230544395e-05, + "loss": 0.8051, + "step": 9714 + }, + { + "epoch": 0.47306016117644195, + "grad_norm": 1.4183269739151, + "learning_rate": 2.27091339693741e-05, + "loss": 0.7689, + "step": 9715 + }, + { + "epoch": 0.47310885496554916, + "grad_norm": 1.1851211786270142, + "learning_rate": 2.2706008640813843e-05, + "loss": 0.8591, + "step": 9716 + }, + { + "epoch": 0.4731575487546563, + "grad_norm": 1.5106250047683716, + "learning_rate": 2.2702883244941382e-05, + "loss": 0.8267, + "step": 9717 + }, + { + "epoch": 0.47320624254376353, + "grad_norm": 1.9663232564926147, + "learning_rate": 2.2699757781834454e-05, + "loss": 0.8762, + "step": 9718 + }, + { + "epoch": 0.47325493633287075, + "grad_norm": 1.9259874820709229, + "learning_rate": 2.2696632251570808e-05, + "loss": 0.8111, + "step": 9719 + }, + { + "epoch": 0.47330363012197796, + "grad_norm": 1.1707576513290405, + "learning_rate": 2.2693506654228176e-05, + "loss": 0.8467, + "step": 9720 + }, + { + "epoch": 0.4733523239110851, + "grad_norm": 1.6225016117095947, + "learning_rate": 2.2690380989884333e-05, + "loss": 0.9617, + "step": 9721 + }, + { + "epoch": 0.47340101770019233, + "grad_norm": 2.4019808769226074, + "learning_rate": 2.2687255258617013e-05, + "loss": 0.8791, + "step": 9722 + }, + { + "epoch": 0.47344971148929954, + "grad_norm": 1.4452579021453857, + "learning_rate": 2.2684129460503978e-05, + "loss": 0.8557, + "step": 9723 + }, + { + "epoch": 0.47349840527840675, + "grad_norm": 2.1007139682769775, + "learning_rate": 2.2681003595622974e-05, + "loss": 0.8346, + "step": 9724 + }, + { + "epoch": 0.47354709906751397, + "grad_norm": 2.1625049114227295, + "learning_rate": 2.2677877664051763e-05, + "loss": 0.9496, + "step": 9725 + }, + { + "epoch": 0.4735957928566211, + "grad_norm": 2.0043342113494873, + "learning_rate": 2.2674751665868095e-05, + "loss": 0.7581, + "step": 9726 + }, + { + "epoch": 0.47364448664572834, + "grad_norm": 1.2917330265045166, + "learning_rate": 2.2671625601149742e-05, + "loss": 0.7833, + "step": 9727 + }, + { + "epoch": 0.47369318043483555, + "grad_norm": 1.2606230974197388, + "learning_rate": 2.2668499469974447e-05, + "loss": 0.8197, + "step": 9728 + }, + { + "epoch": 0.47374187422394276, + "grad_norm": 1.570878505706787, + "learning_rate": 2.2665373272419997e-05, + "loss": 0.8395, + "step": 9729 + }, + { + "epoch": 0.4737905680130499, + "grad_norm": 1.8824480772018433, + "learning_rate": 2.2662247008564143e-05, + "loss": 0.7814, + "step": 9730 + }, + { + "epoch": 0.47383926180215713, + "grad_norm": 1.7067524194717407, + "learning_rate": 2.265912067848465e-05, + "loss": 0.8542, + "step": 9731 + }, + { + "epoch": 0.47388795559126434, + "grad_norm": 1.7868350744247437, + "learning_rate": 2.2655994282259288e-05, + "loss": 0.7763, + "step": 9732 + }, + { + "epoch": 0.47393664938037156, + "grad_norm": 1.6138789653778076, + "learning_rate": 2.2652867819965824e-05, + "loss": 0.7206, + "step": 9733 + }, + { + "epoch": 0.4739853431694787, + "grad_norm": 1.3995002508163452, + "learning_rate": 2.264974129168203e-05, + "loss": 0.8657, + "step": 9734 + }, + { + "epoch": 0.4740340369585859, + "grad_norm": 1.437496304512024, + "learning_rate": 2.264661469748568e-05, + "loss": 0.8516, + "step": 9735 + }, + { + "epoch": 0.47408273074769314, + "grad_norm": 1.2997843027114868, + "learning_rate": 2.2643488037454548e-05, + "loss": 0.8232, + "step": 9736 + }, + { + "epoch": 0.47413142453680035, + "grad_norm": 1.408382534980774, + "learning_rate": 2.264036131166641e-05, + "loss": 0.8279, + "step": 9737 + }, + { + "epoch": 0.4741801183259075, + "grad_norm": 1.4301964044570923, + "learning_rate": 2.263723452019905e-05, + "loss": 0.8424, + "step": 9738 + }, + { + "epoch": 0.4742288121150147, + "grad_norm": 1.1121933460235596, + "learning_rate": 2.263410766313024e-05, + "loss": 0.919, + "step": 9739 + }, + { + "epoch": 0.47427750590412193, + "grad_norm": 0.1017412468791008, + "learning_rate": 2.2630980740537757e-05, + "loss": 0.6425, + "step": 9740 + }, + { + "epoch": 0.47432619969322914, + "grad_norm": 1.5987157821655273, + "learning_rate": 2.262785375249939e-05, + "loss": 0.9304, + "step": 9741 + }, + { + "epoch": 0.4743748934823363, + "grad_norm": 2.194049119949341, + "learning_rate": 2.2624726699092915e-05, + "loss": 0.8972, + "step": 9742 + }, + { + "epoch": 0.4744235872714435, + "grad_norm": 2.609971284866333, + "learning_rate": 2.262159958039613e-05, + "loss": 0.7566, + "step": 9743 + }, + { + "epoch": 0.4744722810605507, + "grad_norm": 2.1396842002868652, + "learning_rate": 2.261847239648682e-05, + "loss": 0.7548, + "step": 9744 + }, + { + "epoch": 0.47452097484965794, + "grad_norm": 1.4266473054885864, + "learning_rate": 2.2615345147442764e-05, + "loss": 0.9337, + "step": 9745 + }, + { + "epoch": 0.47456966863876515, + "grad_norm": 1.43192720413208, + "learning_rate": 2.2612217833341757e-05, + "loss": 0.7586, + "step": 9746 + }, + { + "epoch": 0.4746183624278723, + "grad_norm": 2.107076406478882, + "learning_rate": 2.26090904542616e-05, + "loss": 0.8579, + "step": 9747 + }, + { + "epoch": 0.4746670562169795, + "grad_norm": 1.7151730060577393, + "learning_rate": 2.2605963010280082e-05, + "loss": 0.8676, + "step": 9748 + }, + { + "epoch": 0.47471575000608673, + "grad_norm": 1.3816149234771729, + "learning_rate": 2.2602835501474986e-05, + "loss": 0.8077, + "step": 9749 + }, + { + "epoch": 0.47476444379519395, + "grad_norm": 1.5994521379470825, + "learning_rate": 2.259970792792413e-05, + "loss": 0.8765, + "step": 9750 + }, + { + "epoch": 0.4748131375843011, + "grad_norm": 1.4887796640396118, + "learning_rate": 2.2596580289705297e-05, + "loss": 0.7479, + "step": 9751 + }, + { + "epoch": 0.4748618313734083, + "grad_norm": 2.830352306365967, + "learning_rate": 2.2593452586896297e-05, + "loss": 0.8367, + "step": 9752 + }, + { + "epoch": 0.47491052516251553, + "grad_norm": 1.1811316013336182, + "learning_rate": 2.2590324819574923e-05, + "loss": 0.7852, + "step": 9753 + }, + { + "epoch": 0.47495921895162274, + "grad_norm": 2.052412748336792, + "learning_rate": 2.2587196987818983e-05, + "loss": 0.8937, + "step": 9754 + }, + { + "epoch": 0.4750079127407299, + "grad_norm": 1.4077237844467163, + "learning_rate": 2.2584069091706287e-05, + "loss": 0.8901, + "step": 9755 + }, + { + "epoch": 0.4750566065298371, + "grad_norm": 1.8814096450805664, + "learning_rate": 2.2580941131314628e-05, + "loss": 0.7687, + "step": 9756 + }, + { + "epoch": 0.4751053003189443, + "grad_norm": 1.9202933311462402, + "learning_rate": 2.2577813106721823e-05, + "loss": 0.8044, + "step": 9757 + }, + { + "epoch": 0.47515399410805154, + "grad_norm": 2.276768922805786, + "learning_rate": 2.2574685018005686e-05, + "loss": 0.7581, + "step": 9758 + }, + { + "epoch": 0.4752026878971587, + "grad_norm": 1.5821990966796875, + "learning_rate": 2.257155686524403e-05, + "loss": 0.818, + "step": 9759 + }, + { + "epoch": 0.4752513816862659, + "grad_norm": 1.6144301891326904, + "learning_rate": 2.256842864851466e-05, + "loss": 0.8551, + "step": 9760 + }, + { + "epoch": 0.4753000754753731, + "grad_norm": 1.3857789039611816, + "learning_rate": 2.256530036789539e-05, + "loss": 0.8636, + "step": 9761 + }, + { + "epoch": 0.47534876926448033, + "grad_norm": 1.7628313302993774, + "learning_rate": 2.256217202346404e-05, + "loss": 0.8158, + "step": 9762 + }, + { + "epoch": 0.4753974630535875, + "grad_norm": 1.638077735900879, + "learning_rate": 2.2559043615298424e-05, + "loss": 0.8708, + "step": 9763 + }, + { + "epoch": 0.4754461568426947, + "grad_norm": 1.6590723991394043, + "learning_rate": 2.255591514347637e-05, + "loss": 0.8078, + "step": 9764 + }, + { + "epoch": 0.4754948506318019, + "grad_norm": 1.4746639728546143, + "learning_rate": 2.2552786608075693e-05, + "loss": 0.8912, + "step": 9765 + }, + { + "epoch": 0.4755435444209091, + "grad_norm": 1.90211021900177, + "learning_rate": 2.2549658009174214e-05, + "loss": 0.8027, + "step": 9766 + }, + { + "epoch": 0.47559223821001634, + "grad_norm": 1.377245545387268, + "learning_rate": 2.254652934684976e-05, + "loss": 0.8365, + "step": 9767 + }, + { + "epoch": 0.4756409319991235, + "grad_norm": 1.5054395198822021, + "learning_rate": 2.254340062118016e-05, + "loss": 0.8446, + "step": 9768 + }, + { + "epoch": 0.4756896257882307, + "grad_norm": 1.3204174041748047, + "learning_rate": 2.2540271832243236e-05, + "loss": 0.848, + "step": 9769 + }, + { + "epoch": 0.4757383195773379, + "grad_norm": 2.478673219680786, + "learning_rate": 2.253714298011682e-05, + "loss": 0.7404, + "step": 9770 + }, + { + "epoch": 0.47578701336644513, + "grad_norm": 3.129880428314209, + "learning_rate": 2.2534014064878732e-05, + "loss": 0.8542, + "step": 9771 + }, + { + "epoch": 0.4758357071555523, + "grad_norm": 1.3694771528244019, + "learning_rate": 2.2530885086606822e-05, + "loss": 0.8607, + "step": 9772 + }, + { + "epoch": 0.4758844009446595, + "grad_norm": 1.391127347946167, + "learning_rate": 2.2527756045378914e-05, + "loss": 0.8071, + "step": 9773 + }, + { + "epoch": 0.4759330947337667, + "grad_norm": 1.501429557800293, + "learning_rate": 2.2524626941272843e-05, + "loss": 0.8593, + "step": 9774 + }, + { + "epoch": 0.4759817885228739, + "grad_norm": 1.6802833080291748, + "learning_rate": 2.2521497774366444e-05, + "loss": 0.7901, + "step": 9775 + }, + { + "epoch": 0.4760304823119811, + "grad_norm": 1.2378519773483276, + "learning_rate": 2.2518368544737558e-05, + "loss": 0.8183, + "step": 9776 + }, + { + "epoch": 0.4760791761010883, + "grad_norm": 1.2990925312042236, + "learning_rate": 2.251523925246403e-05, + "loss": 0.8033, + "step": 9777 + }, + { + "epoch": 0.4761278698901955, + "grad_norm": 1.257610559463501, + "learning_rate": 2.251210989762369e-05, + "loss": 0.8093, + "step": 9778 + }, + { + "epoch": 0.4761765636793027, + "grad_norm": 1.3422234058380127, + "learning_rate": 2.2508980480294386e-05, + "loss": 0.7874, + "step": 9779 + }, + { + "epoch": 0.4762252574684099, + "grad_norm": 1.5613921880722046, + "learning_rate": 2.250585100055397e-05, + "loss": 0.8105, + "step": 9780 + }, + { + "epoch": 0.4762739512575171, + "grad_norm": 13.05695915222168, + "learning_rate": 2.2502721458480284e-05, + "loss": 0.9134, + "step": 9781 + }, + { + "epoch": 0.4763226450466243, + "grad_norm": 1.8312504291534424, + "learning_rate": 2.2499591854151167e-05, + "loss": 0.8681, + "step": 9782 + }, + { + "epoch": 0.4763713388357315, + "grad_norm": 2.030749559402466, + "learning_rate": 2.249646218764448e-05, + "loss": 0.837, + "step": 9783 + }, + { + "epoch": 0.4764200326248387, + "grad_norm": 2.8216912746429443, + "learning_rate": 2.249333245903807e-05, + "loss": 0.8808, + "step": 9784 + }, + { + "epoch": 0.4764687264139459, + "grad_norm": 1.166398048400879, + "learning_rate": 2.2490202668409778e-05, + "loss": 0.7808, + "step": 9785 + }, + { + "epoch": 0.4765174202030531, + "grad_norm": 2.3508689403533936, + "learning_rate": 2.2487072815837466e-05, + "loss": 0.8806, + "step": 9786 + }, + { + "epoch": 0.4765661139921603, + "grad_norm": 1.4562841653823853, + "learning_rate": 2.2483942901399e-05, + "loss": 0.9074, + "step": 9787 + }, + { + "epoch": 0.4766148077812675, + "grad_norm": 1.8404817581176758, + "learning_rate": 2.248081292517223e-05, + "loss": 0.9012, + "step": 9788 + }, + { + "epoch": 0.4766635015703747, + "grad_norm": 3.7685558795928955, + "learning_rate": 2.2477682887235005e-05, + "loss": 0.9113, + "step": 9789 + }, + { + "epoch": 0.4767121953594819, + "grad_norm": 1.9390738010406494, + "learning_rate": 2.24745527876652e-05, + "loss": 0.8384, + "step": 9790 + }, + { + "epoch": 0.4767608891485891, + "grad_norm": 0.09880761802196503, + "learning_rate": 2.2471422626540666e-05, + "loss": 0.6241, + "step": 9791 + }, + { + "epoch": 0.4768095829376963, + "grad_norm": 1.7522140741348267, + "learning_rate": 2.2468292403939263e-05, + "loss": 0.8748, + "step": 9792 + }, + { + "epoch": 0.4768582767268035, + "grad_norm": 1.4000016450881958, + "learning_rate": 2.2465162119938865e-05, + "loss": 0.8726, + "step": 9793 + }, + { + "epoch": 0.4769069705159107, + "grad_norm": 1.5378201007843018, + "learning_rate": 2.2462031774617333e-05, + "loss": 0.8407, + "step": 9794 + }, + { + "epoch": 0.4769556643050179, + "grad_norm": 1.9660730361938477, + "learning_rate": 2.2458901368052538e-05, + "loss": 0.8464, + "step": 9795 + }, + { + "epoch": 0.4770043580941251, + "grad_norm": 1.598933219909668, + "learning_rate": 2.2455770900322345e-05, + "loss": 0.838, + "step": 9796 + }, + { + "epoch": 0.47705305188323227, + "grad_norm": 1.6779272556304932, + "learning_rate": 2.2452640371504634e-05, + "loss": 0.8252, + "step": 9797 + }, + { + "epoch": 0.4771017456723395, + "grad_norm": 1.9517186880111694, + "learning_rate": 2.2449509781677268e-05, + "loss": 0.8693, + "step": 9798 + }, + { + "epoch": 0.4771504394614467, + "grad_norm": 2.145148992538452, + "learning_rate": 2.2446379130918122e-05, + "loss": 0.8373, + "step": 9799 + }, + { + "epoch": 0.4771991332505539, + "grad_norm": 1.4593603610992432, + "learning_rate": 2.2443248419305067e-05, + "loss": 0.7858, + "step": 9800 + }, + { + "epoch": 0.47724782703966107, + "grad_norm": 1.6155633926391602, + "learning_rate": 2.2440117646915988e-05, + "loss": 0.9331, + "step": 9801 + }, + { + "epoch": 0.4772965208287683, + "grad_norm": 1.5660269260406494, + "learning_rate": 2.2436986813828758e-05, + "loss": 0.9054, + "step": 9802 + }, + { + "epoch": 0.4773452146178755, + "grad_norm": 1.3773101568222046, + "learning_rate": 2.243385592012126e-05, + "loss": 0.9239, + "step": 9803 + }, + { + "epoch": 0.4773939084069827, + "grad_norm": 1.8909944295883179, + "learning_rate": 2.243072496587138e-05, + "loss": 0.8236, + "step": 9804 + }, + { + "epoch": 0.4774426021960899, + "grad_norm": 12.998905181884766, + "learning_rate": 2.2427593951156986e-05, + "loss": 0.8012, + "step": 9805 + }, + { + "epoch": 0.4774912959851971, + "grad_norm": 2.53603458404541, + "learning_rate": 2.2424462876055973e-05, + "loss": 0.9423, + "step": 9806 + }, + { + "epoch": 0.4775399897743043, + "grad_norm": 1.5013048648834229, + "learning_rate": 2.242133174064623e-05, + "loss": 0.7982, + "step": 9807 + }, + { + "epoch": 0.4775886835634115, + "grad_norm": 1.9033695459365845, + "learning_rate": 2.2418200545005635e-05, + "loss": 0.8556, + "step": 9808 + }, + { + "epoch": 0.4776373773525187, + "grad_norm": 0.08966515958309174, + "learning_rate": 2.2415069289212078e-05, + "loss": 0.5634, + "step": 9809 + }, + { + "epoch": 0.47768607114162587, + "grad_norm": 2.2121334075927734, + "learning_rate": 2.241193797334346e-05, + "loss": 0.8011, + "step": 9810 + }, + { + "epoch": 0.4777347649307331, + "grad_norm": 1.75102698802948, + "learning_rate": 2.2408806597477662e-05, + "loss": 0.7862, + "step": 9811 + }, + { + "epoch": 0.4777834587198403, + "grad_norm": 2.0219850540161133, + "learning_rate": 2.2405675161692582e-05, + "loss": 0.727, + "step": 9812 + }, + { + "epoch": 0.4778321525089475, + "grad_norm": 1.6506075859069824, + "learning_rate": 2.2402543666066112e-05, + "loss": 0.8155, + "step": 9813 + }, + { + "epoch": 0.47788084629805466, + "grad_norm": 1.5410076379776, + "learning_rate": 2.239941211067614e-05, + "loss": 0.9306, + "step": 9814 + }, + { + "epoch": 0.4779295400871619, + "grad_norm": 1.5653671026229858, + "learning_rate": 2.2396280495600583e-05, + "loss": 0.749, + "step": 9815 + }, + { + "epoch": 0.4779782338762691, + "grad_norm": 2.2092370986938477, + "learning_rate": 2.2393148820917324e-05, + "loss": 0.7857, + "step": 9816 + }, + { + "epoch": 0.4780269276653763, + "grad_norm": 1.6458418369293213, + "learning_rate": 2.239001708670428e-05, + "loss": 0.8699, + "step": 9817 + }, + { + "epoch": 0.47807562145448346, + "grad_norm": 1.5118521451950073, + "learning_rate": 2.2386885293039335e-05, + "loss": 0.8951, + "step": 9818 + }, + { + "epoch": 0.47812431524359067, + "grad_norm": 1.6468470096588135, + "learning_rate": 2.23837534400004e-05, + "loss": 0.802, + "step": 9819 + }, + { + "epoch": 0.4781730090326979, + "grad_norm": 1.7853904962539673, + "learning_rate": 2.238062152766539e-05, + "loss": 0.917, + "step": 9820 + }, + { + "epoch": 0.4782217028218051, + "grad_norm": 1.8122186660766602, + "learning_rate": 2.2377489556112194e-05, + "loss": 0.8869, + "step": 9821 + }, + { + "epoch": 0.47827039661091225, + "grad_norm": 1.3963866233825684, + "learning_rate": 2.2374357525418728e-05, + "loss": 0.8231, + "step": 9822 + }, + { + "epoch": 0.47831909040001946, + "grad_norm": 1.5054101943969727, + "learning_rate": 2.2371225435662908e-05, + "loss": 0.804, + "step": 9823 + }, + { + "epoch": 0.4783677841891267, + "grad_norm": 2.152540683746338, + "learning_rate": 2.2368093286922634e-05, + "loss": 0.7763, + "step": 9824 + }, + { + "epoch": 0.4784164779782339, + "grad_norm": 3.0184290409088135, + "learning_rate": 2.236496107927582e-05, + "loss": 0.8979, + "step": 9825 + }, + { + "epoch": 0.4784651717673411, + "grad_norm": 1.6304590702056885, + "learning_rate": 2.2361828812800392e-05, + "loss": 0.8262, + "step": 9826 + }, + { + "epoch": 0.47851386555644826, + "grad_norm": 1.5510311126708984, + "learning_rate": 2.2358696487574254e-05, + "loss": 0.8711, + "step": 9827 + }, + { + "epoch": 0.47856255934555547, + "grad_norm": 1.2680799961090088, + "learning_rate": 2.2355564103675323e-05, + "loss": 0.8574, + "step": 9828 + }, + { + "epoch": 0.4786112531346627, + "grad_norm": 11.439925193786621, + "learning_rate": 2.2352431661181523e-05, + "loss": 0.7986, + "step": 9829 + }, + { + "epoch": 0.4786599469237699, + "grad_norm": 1.437476396560669, + "learning_rate": 2.2349299160170773e-05, + "loss": 0.8908, + "step": 9830 + }, + { + "epoch": 0.47870864071287705, + "grad_norm": 1.358357548713684, + "learning_rate": 2.2346166600720987e-05, + "loss": 0.8807, + "step": 9831 + }, + { + "epoch": 0.47875733450198427, + "grad_norm": 1.6510190963745117, + "learning_rate": 2.23430339829101e-05, + "loss": 0.8159, + "step": 9832 + }, + { + "epoch": 0.4788060282910915, + "grad_norm": 1.6565310955047607, + "learning_rate": 2.2339901306816027e-05, + "loss": 0.8172, + "step": 9833 + }, + { + "epoch": 0.4788547220801987, + "grad_norm": 1.1729686260223389, + "learning_rate": 2.233676857251669e-05, + "loss": 0.7968, + "step": 9834 + }, + { + "epoch": 0.47890341586930585, + "grad_norm": 2.2346620559692383, + "learning_rate": 2.2333635780090024e-05, + "loss": 0.8125, + "step": 9835 + }, + { + "epoch": 0.47895210965841306, + "grad_norm": 1.4751276969909668, + "learning_rate": 2.2330502929613958e-05, + "loss": 0.783, + "step": 9836 + }, + { + "epoch": 0.4790008034475203, + "grad_norm": 1.3741378784179688, + "learning_rate": 2.2327370021166412e-05, + "loss": 0.8741, + "step": 9837 + }, + { + "epoch": 0.4790494972366275, + "grad_norm": 2.969097375869751, + "learning_rate": 2.2324237054825332e-05, + "loss": 0.8053, + "step": 9838 + }, + { + "epoch": 0.47909819102573464, + "grad_norm": 2.6897122859954834, + "learning_rate": 2.232110403066864e-05, + "loss": 0.8901, + "step": 9839 + }, + { + "epoch": 0.47914688481484186, + "grad_norm": 1.469531536102295, + "learning_rate": 2.2317970948774276e-05, + "loss": 0.8216, + "step": 9840 + }, + { + "epoch": 0.47919557860394907, + "grad_norm": 1.607166051864624, + "learning_rate": 2.2314837809220173e-05, + "loss": 0.8151, + "step": 9841 + }, + { + "epoch": 0.4792442723930563, + "grad_norm": 2.6163814067840576, + "learning_rate": 2.231170461208427e-05, + "loss": 0.9199, + "step": 9842 + }, + { + "epoch": 0.47929296618216344, + "grad_norm": 2.050044536590576, + "learning_rate": 2.23085713574445e-05, + "loss": 0.7649, + "step": 9843 + }, + { + "epoch": 0.47934165997127065, + "grad_norm": 1.2888363599777222, + "learning_rate": 2.230543804537881e-05, + "loss": 0.8312, + "step": 9844 + }, + { + "epoch": 0.47939035376037786, + "grad_norm": 1.600109577178955, + "learning_rate": 2.230230467596513e-05, + "loss": 0.7829, + "step": 9845 + }, + { + "epoch": 0.4794390475494851, + "grad_norm": 1.5266963243484497, + "learning_rate": 2.2299171249281423e-05, + "loss": 0.8548, + "step": 9846 + }, + { + "epoch": 0.4794877413385923, + "grad_norm": 1.5837838649749756, + "learning_rate": 2.2296037765405615e-05, + "loss": 0.8533, + "step": 9847 + }, + { + "epoch": 0.47953643512769945, + "grad_norm": 1.7107449769973755, + "learning_rate": 2.2292904224415662e-05, + "loss": 0.8328, + "step": 9848 + }, + { + "epoch": 0.47958512891680666, + "grad_norm": 1.670634388923645, + "learning_rate": 2.2289770626389507e-05, + "loss": 0.7849, + "step": 9849 + }, + { + "epoch": 0.47963382270591387, + "grad_norm": 1.636206030845642, + "learning_rate": 2.2286636971405095e-05, + "loss": 0.7324, + "step": 9850 + }, + { + "epoch": 0.4796825164950211, + "grad_norm": 2.06451416015625, + "learning_rate": 2.228350325954038e-05, + "loss": 0.8743, + "step": 9851 + }, + { + "epoch": 0.47973121028412824, + "grad_norm": 2.62701678276062, + "learning_rate": 2.2280369490873316e-05, + "loss": 0.8336, + "step": 9852 + }, + { + "epoch": 0.47977990407323545, + "grad_norm": 3.3394315242767334, + "learning_rate": 2.2277235665481857e-05, + "loss": 0.7632, + "step": 9853 + }, + { + "epoch": 0.47982859786234267, + "grad_norm": 1.437740445137024, + "learning_rate": 2.2274101783443944e-05, + "loss": 0.8385, + "step": 9854 + }, + { + "epoch": 0.4798772916514499, + "grad_norm": 1.706630825996399, + "learning_rate": 2.2270967844837547e-05, + "loss": 0.7782, + "step": 9855 + }, + { + "epoch": 0.47992598544055703, + "grad_norm": 1.549991250038147, + "learning_rate": 2.2267833849740614e-05, + "loss": 0.816, + "step": 9856 + }, + { + "epoch": 0.47997467922966425, + "grad_norm": 1.4432908296585083, + "learning_rate": 2.2264699798231115e-05, + "loss": 0.8413, + "step": 9857 + }, + { + "epoch": 0.48002337301877146, + "grad_norm": 2.9112188816070557, + "learning_rate": 2.2261565690386995e-05, + "loss": 0.8517, + "step": 9858 + }, + { + "epoch": 0.4800720668078787, + "grad_norm": 1.8457361459732056, + "learning_rate": 2.225843152628622e-05, + "loss": 0.7719, + "step": 9859 + }, + { + "epoch": 0.48012076059698583, + "grad_norm": 2.438032865524292, + "learning_rate": 2.2255297306006765e-05, + "loss": 0.8129, + "step": 9860 + }, + { + "epoch": 0.48016945438609304, + "grad_norm": 1.5875325202941895, + "learning_rate": 2.225216302962658e-05, + "loss": 0.8393, + "step": 9861 + }, + { + "epoch": 0.48021814817520025, + "grad_norm": 3.3001272678375244, + "learning_rate": 2.2249028697223634e-05, + "loss": 0.7139, + "step": 9862 + }, + { + "epoch": 0.48026684196430747, + "grad_norm": 2.588870048522949, + "learning_rate": 2.2245894308875894e-05, + "loss": 0.7531, + "step": 9863 + }, + { + "epoch": 0.4803155357534147, + "grad_norm": 1.2779165506362915, + "learning_rate": 2.2242759864661327e-05, + "loss": 0.8788, + "step": 9864 + }, + { + "epoch": 0.48036422954252184, + "grad_norm": 0.09394566714763641, + "learning_rate": 2.22396253646579e-05, + "loss": 0.6223, + "step": 9865 + }, + { + "epoch": 0.48041292333162905, + "grad_norm": 2.205704689025879, + "learning_rate": 2.223649080894359e-05, + "loss": 0.7633, + "step": 9866 + }, + { + "epoch": 0.48046161712073626, + "grad_norm": 2.7206015586853027, + "learning_rate": 2.2233356197596367e-05, + "loss": 0.8446, + "step": 9867 + }, + { + "epoch": 0.4805103109098435, + "grad_norm": 1.9478543996810913, + "learning_rate": 2.2230221530694206e-05, + "loss": 0.8627, + "step": 9868 + }, + { + "epoch": 0.48055900469895063, + "grad_norm": 0.0989520475268364, + "learning_rate": 2.2227086808315086e-05, + "loss": 0.6616, + "step": 9869 + }, + { + "epoch": 0.48060769848805784, + "grad_norm": 1.6616753339767456, + "learning_rate": 2.2223952030536972e-05, + "loss": 0.8925, + "step": 9870 + }, + { + "epoch": 0.48065639227716506, + "grad_norm": 1.5136590003967285, + "learning_rate": 2.2220817197437853e-05, + "loss": 0.7369, + "step": 9871 + }, + { + "epoch": 0.48070508606627227, + "grad_norm": 1.8582344055175781, + "learning_rate": 2.22176823090957e-05, + "loss": 0.8774, + "step": 9872 + }, + { + "epoch": 0.4807537798553794, + "grad_norm": 1.3183255195617676, + "learning_rate": 2.22145473655885e-05, + "loss": 0.8293, + "step": 9873 + }, + { + "epoch": 0.48080247364448664, + "grad_norm": 2.4105451107025146, + "learning_rate": 2.2211412366994228e-05, + "loss": 0.8396, + "step": 9874 + }, + { + "epoch": 0.48085116743359385, + "grad_norm": 2.048081398010254, + "learning_rate": 2.2208277313390877e-05, + "loss": 0.7875, + "step": 9875 + }, + { + "epoch": 0.48089986122270106, + "grad_norm": 1.8270193338394165, + "learning_rate": 2.2205142204856426e-05, + "loss": 0.8462, + "step": 9876 + }, + { + "epoch": 0.4809485550118082, + "grad_norm": 3.0760676860809326, + "learning_rate": 2.220200704146886e-05, + "loss": 0.8584, + "step": 9877 + }, + { + "epoch": 0.48099724880091543, + "grad_norm": 3.2283921241760254, + "learning_rate": 2.219887182330617e-05, + "loss": 0.8674, + "step": 9878 + }, + { + "epoch": 0.48104594259002265, + "grad_norm": 3.7498273849487305, + "learning_rate": 2.2195736550446343e-05, + "loss": 0.8603, + "step": 9879 + }, + { + "epoch": 0.48109463637912986, + "grad_norm": 1.6494907140731812, + "learning_rate": 2.2192601222967364e-05, + "loss": 0.7833, + "step": 9880 + }, + { + "epoch": 0.481143330168237, + "grad_norm": 1.4583277702331543, + "learning_rate": 2.218946584094723e-05, + "loss": 0.8482, + "step": 9881 + }, + { + "epoch": 0.48119202395734423, + "grad_norm": 2.823967933654785, + "learning_rate": 2.218633040446394e-05, + "loss": 0.8312, + "step": 9882 + }, + { + "epoch": 0.48124071774645144, + "grad_norm": 1.3649364709854126, + "learning_rate": 2.218319491359548e-05, + "loss": 0.7236, + "step": 9883 + }, + { + "epoch": 0.48128941153555865, + "grad_norm": 4.774943828582764, + "learning_rate": 2.2180059368419843e-05, + "loss": 0.8014, + "step": 9884 + }, + { + "epoch": 0.48133810532466587, + "grad_norm": 1.5308588743209839, + "learning_rate": 2.2176923769015034e-05, + "loss": 0.8521, + "step": 9885 + }, + { + "epoch": 0.481386799113773, + "grad_norm": 1.734975814819336, + "learning_rate": 2.217378811545905e-05, + "loss": 0.8131, + "step": 9886 + }, + { + "epoch": 0.48143549290288024, + "grad_norm": 2.530327796936035, + "learning_rate": 2.2170652407829884e-05, + "loss": 0.8861, + "step": 9887 + }, + { + "epoch": 0.48148418669198745, + "grad_norm": 1.3384501934051514, + "learning_rate": 2.2167516646205543e-05, + "loss": 0.8143, + "step": 9888 + }, + { + "epoch": 0.48153288048109466, + "grad_norm": 1.5201046466827393, + "learning_rate": 2.2164380830664028e-05, + "loss": 0.8018, + "step": 9889 + }, + { + "epoch": 0.4815815742702018, + "grad_norm": 1.6458951234817505, + "learning_rate": 2.2161244961283344e-05, + "loss": 0.8254, + "step": 9890 + }, + { + "epoch": 0.48163026805930903, + "grad_norm": 1.6063779592514038, + "learning_rate": 2.2158109038141492e-05, + "loss": 0.8094, + "step": 9891 + }, + { + "epoch": 0.48167896184841624, + "grad_norm": 1.605696201324463, + "learning_rate": 2.215497306131648e-05, + "loss": 0.7909, + "step": 9892 + }, + { + "epoch": 0.48172765563752346, + "grad_norm": 7.0114336013793945, + "learning_rate": 2.2151837030886326e-05, + "loss": 0.9125, + "step": 9893 + }, + { + "epoch": 0.4817763494266306, + "grad_norm": 1.5629019737243652, + "learning_rate": 2.2148700946929015e-05, + "loss": 0.6949, + "step": 9894 + }, + { + "epoch": 0.4818250432157378, + "grad_norm": 1.6159683465957642, + "learning_rate": 2.2145564809522576e-05, + "loss": 0.8785, + "step": 9895 + }, + { + "epoch": 0.48187373700484504, + "grad_norm": 2.8955602645874023, + "learning_rate": 2.214242861874502e-05, + "loss": 0.8607, + "step": 9896 + }, + { + "epoch": 0.48192243079395225, + "grad_norm": 2.4611661434173584, + "learning_rate": 2.213929237467436e-05, + "loss": 0.8581, + "step": 9897 + }, + { + "epoch": 0.4819711245830594, + "grad_norm": 3.4400882720947266, + "learning_rate": 2.2136156077388602e-05, + "loss": 0.811, + "step": 9898 + }, + { + "epoch": 0.4820198183721666, + "grad_norm": 1.5957401990890503, + "learning_rate": 2.213301972696577e-05, + "loss": 0.8574, + "step": 9899 + }, + { + "epoch": 0.48206851216127383, + "grad_norm": 1.8027966022491455, + "learning_rate": 2.2129883323483878e-05, + "loss": 0.9015, + "step": 9900 + }, + { + "epoch": 0.48211720595038104, + "grad_norm": 2.622265100479126, + "learning_rate": 2.2126746867020938e-05, + "loss": 0.8302, + "step": 9901 + }, + { + "epoch": 0.4821658997394882, + "grad_norm": 2.801940441131592, + "learning_rate": 2.2123610357654983e-05, + "loss": 0.8338, + "step": 9902 + }, + { + "epoch": 0.4822145935285954, + "grad_norm": 4.370853900909424, + "learning_rate": 2.2120473795464025e-05, + "loss": 0.8595, + "step": 9903 + }, + { + "epoch": 0.4822632873177026, + "grad_norm": 1.9955518245697021, + "learning_rate": 2.2117337180526083e-05, + "loss": 0.9132, + "step": 9904 + }, + { + "epoch": 0.48231198110680984, + "grad_norm": 1.6404218673706055, + "learning_rate": 2.2114200512919194e-05, + "loss": 0.8351, + "step": 9905 + }, + { + "epoch": 0.48236067489591705, + "grad_norm": 0.09458629041910172, + "learning_rate": 2.211106379272137e-05, + "loss": 0.598, + "step": 9906 + }, + { + "epoch": 0.4824093686850242, + "grad_norm": 2.57954740524292, + "learning_rate": 2.210792702001064e-05, + "loss": 0.8507, + "step": 9907 + }, + { + "epoch": 0.4824580624741314, + "grad_norm": 2.179924726486206, + "learning_rate": 2.2104790194865037e-05, + "loss": 0.7872, + "step": 9908 + }, + { + "epoch": 0.48250675626323863, + "grad_norm": 2.9176745414733887, + "learning_rate": 2.210165331736258e-05, + "loss": 0.7593, + "step": 9909 + }, + { + "epoch": 0.48255545005234585, + "grad_norm": 1.7230478525161743, + "learning_rate": 2.209851638758131e-05, + "loss": 0.8525, + "step": 9910 + }, + { + "epoch": 0.482604143841453, + "grad_norm": 0.09059768915176392, + "learning_rate": 2.2095379405599255e-05, + "loss": 0.6477, + "step": 9911 + }, + { + "epoch": 0.4826528376305602, + "grad_norm": 2.473642587661743, + "learning_rate": 2.2092242371494444e-05, + "loss": 0.8341, + "step": 9912 + }, + { + "epoch": 0.48270153141966743, + "grad_norm": 2.229726552963257, + "learning_rate": 2.2089105285344914e-05, + "loss": 0.9611, + "step": 9913 + }, + { + "epoch": 0.48275022520877464, + "grad_norm": 1.3845605850219727, + "learning_rate": 2.2085968147228693e-05, + "loss": 0.8326, + "step": 9914 + }, + { + "epoch": 0.4827989189978818, + "grad_norm": 1.677579402923584, + "learning_rate": 2.2082830957223833e-05, + "loss": 0.8288, + "step": 9915 + }, + { + "epoch": 0.482847612786989, + "grad_norm": 1.6847530603408813, + "learning_rate": 2.207969371540836e-05, + "loss": 0.8203, + "step": 9916 + }, + { + "epoch": 0.4828963065760962, + "grad_norm": 1.7552798986434937, + "learning_rate": 2.2076556421860315e-05, + "loss": 0.7924, + "step": 9917 + }, + { + "epoch": 0.48294500036520344, + "grad_norm": 1.2698814868927002, + "learning_rate": 2.207341907665774e-05, + "loss": 0.9225, + "step": 9918 + }, + { + "epoch": 0.4829936941543106, + "grad_norm": 1.7338008880615234, + "learning_rate": 2.207028167987868e-05, + "loss": 0.8926, + "step": 9919 + }, + { + "epoch": 0.4830423879434178, + "grad_norm": 1.6142839193344116, + "learning_rate": 2.206714423160117e-05, + "loss": 0.8413, + "step": 9920 + }, + { + "epoch": 0.483091081732525, + "grad_norm": 1.7977126836776733, + "learning_rate": 2.2064006731903264e-05, + "loss": 0.9072, + "step": 9921 + }, + { + "epoch": 0.48313977552163223, + "grad_norm": 1.6487271785736084, + "learning_rate": 2.2060869180862997e-05, + "loss": 0.8954, + "step": 9922 + }, + { + "epoch": 0.4831884693107394, + "grad_norm": 1.7839173078536987, + "learning_rate": 2.205773157855842e-05, + "loss": 0.7828, + "step": 9923 + }, + { + "epoch": 0.4832371630998466, + "grad_norm": 1.9655333757400513, + "learning_rate": 2.2054593925067584e-05, + "loss": 0.9006, + "step": 9924 + }, + { + "epoch": 0.4832858568889538, + "grad_norm": 5.183908939361572, + "learning_rate": 2.2051456220468538e-05, + "loss": 0.8563, + "step": 9925 + }, + { + "epoch": 0.483334550678061, + "grad_norm": 1.7603646516799927, + "learning_rate": 2.2048318464839335e-05, + "loss": 0.8224, + "step": 9926 + }, + { + "epoch": 0.48338324446716824, + "grad_norm": 2.6336164474487305, + "learning_rate": 2.2045180658258017e-05, + "loss": 0.8013, + "step": 9927 + }, + { + "epoch": 0.4834319382562754, + "grad_norm": 1.2729942798614502, + "learning_rate": 2.2042042800802646e-05, + "loss": 0.8246, + "step": 9928 + }, + { + "epoch": 0.4834806320453826, + "grad_norm": 2.2559545040130615, + "learning_rate": 2.2038904892551276e-05, + "loss": 0.8621, + "step": 9929 + }, + { + "epoch": 0.4835293258344898, + "grad_norm": 1.9741131067276, + "learning_rate": 2.2035766933581958e-05, + "loss": 0.8697, + "step": 9930 + }, + { + "epoch": 0.48357801962359703, + "grad_norm": 2.4055628776550293, + "learning_rate": 2.2032628923972754e-05, + "loss": 0.89, + "step": 9931 + }, + { + "epoch": 0.4836267134127042, + "grad_norm": 0.09104962646961212, + "learning_rate": 2.2029490863801722e-05, + "loss": 0.6175, + "step": 9932 + }, + { + "epoch": 0.4836754072018114, + "grad_norm": 1.6245954036712646, + "learning_rate": 2.202635275314692e-05, + "loss": 0.8758, + "step": 9933 + }, + { + "epoch": 0.4837241009909186, + "grad_norm": 2.1730332374572754, + "learning_rate": 2.2023214592086402e-05, + "loss": 0.7136, + "step": 9934 + }, + { + "epoch": 0.4837727947800258, + "grad_norm": 1.769432783126831, + "learning_rate": 2.202007638069824e-05, + "loss": 0.9042, + "step": 9935 + }, + { + "epoch": 0.483821488569133, + "grad_norm": 1.7037193775177002, + "learning_rate": 2.2016938119060493e-05, + "loss": 0.9381, + "step": 9936 + }, + { + "epoch": 0.4838701823582402, + "grad_norm": 4.386973857879639, + "learning_rate": 2.201379980725123e-05, + "loss": 0.9066, + "step": 9937 + }, + { + "epoch": 0.4839188761473474, + "grad_norm": 1.378369927406311, + "learning_rate": 2.201066144534851e-05, + "loss": 0.8307, + "step": 9938 + }, + { + "epoch": 0.4839675699364546, + "grad_norm": 1.8037441968917847, + "learning_rate": 2.2007523033430403e-05, + "loss": 0.8301, + "step": 9939 + }, + { + "epoch": 0.4840162637255618, + "grad_norm": 1.4775290489196777, + "learning_rate": 2.200438457157498e-05, + "loss": 0.813, + "step": 9940 + }, + { + "epoch": 0.484064957514669, + "grad_norm": 3.0387680530548096, + "learning_rate": 2.2001246059860307e-05, + "loss": 0.9015, + "step": 9941 + }, + { + "epoch": 0.4841136513037762, + "grad_norm": 1.6493546962738037, + "learning_rate": 2.199810749836446e-05, + "loss": 0.9457, + "step": 9942 + }, + { + "epoch": 0.4841623450928834, + "grad_norm": 1.7455883026123047, + "learning_rate": 2.19949688871655e-05, + "loss": 0.8046, + "step": 9943 + }, + { + "epoch": 0.48421103888199063, + "grad_norm": 1.6046957969665527, + "learning_rate": 2.199183022634151e-05, + "loss": 0.9421, + "step": 9944 + }, + { + "epoch": 0.4842597326710978, + "grad_norm": 2.210702419281006, + "learning_rate": 2.1988691515970557e-05, + "loss": 0.7982, + "step": 9945 + }, + { + "epoch": 0.484308426460205, + "grad_norm": 1.918779969215393, + "learning_rate": 2.1985552756130724e-05, + "loss": 0.7818, + "step": 9946 + }, + { + "epoch": 0.4843571202493122, + "grad_norm": 2.1179559230804443, + "learning_rate": 2.1982413946900087e-05, + "loss": 0.8022, + "step": 9947 + }, + { + "epoch": 0.4844058140384194, + "grad_norm": 1.374267339706421, + "learning_rate": 2.1979275088356724e-05, + "loss": 0.87, + "step": 9948 + }, + { + "epoch": 0.4844545078275266, + "grad_norm": 1.7876893281936646, + "learning_rate": 2.197613618057871e-05, + "loss": 0.9319, + "step": 9949 + }, + { + "epoch": 0.4845032016166338, + "grad_norm": 2.4590606689453125, + "learning_rate": 2.197299722364413e-05, + "loss": 0.799, + "step": 9950 + }, + { + "epoch": 0.484551895405741, + "grad_norm": 1.8555102348327637, + "learning_rate": 2.1969858217631065e-05, + "loss": 0.8061, + "step": 9951 + }, + { + "epoch": 0.4846005891948482, + "grad_norm": 1.728420615196228, + "learning_rate": 2.196671916261759e-05, + "loss": 0.8463, + "step": 9952 + }, + { + "epoch": 0.4846492829839554, + "grad_norm": 3.801377296447754, + "learning_rate": 2.1963580058681805e-05, + "loss": 0.8301, + "step": 9953 + }, + { + "epoch": 0.4846979767730626, + "grad_norm": 3.7635304927825928, + "learning_rate": 2.1960440905901778e-05, + "loss": 0.9756, + "step": 9954 + }, + { + "epoch": 0.4847466705621698, + "grad_norm": 1.9757808446884155, + "learning_rate": 2.195730170435561e-05, + "loss": 0.9008, + "step": 9955 + }, + { + "epoch": 0.484795364351277, + "grad_norm": 1.4275072813034058, + "learning_rate": 2.1954162454121387e-05, + "loss": 0.8562, + "step": 9956 + }, + { + "epoch": 0.48484405814038417, + "grad_norm": 1.9576398134231567, + "learning_rate": 2.1951023155277192e-05, + "loss": 0.8495, + "step": 9957 + }, + { + "epoch": 0.4848927519294914, + "grad_norm": 2.9406468868255615, + "learning_rate": 2.1947883807901117e-05, + "loss": 0.7842, + "step": 9958 + }, + { + "epoch": 0.4849414457185986, + "grad_norm": 2.1321218013763428, + "learning_rate": 2.1944744412071257e-05, + "loss": 0.8694, + "step": 9959 + }, + { + "epoch": 0.4849901395077058, + "grad_norm": 1.4314664602279663, + "learning_rate": 2.1941604967865695e-05, + "loss": 0.9056, + "step": 9960 + }, + { + "epoch": 0.48503883329681297, + "grad_norm": 1.5615180730819702, + "learning_rate": 2.193846547536254e-05, + "loss": 0.8985, + "step": 9961 + }, + { + "epoch": 0.4850875270859202, + "grad_norm": 1.3901069164276123, + "learning_rate": 2.1935325934639883e-05, + "loss": 0.8207, + "step": 9962 + }, + { + "epoch": 0.4851362208750274, + "grad_norm": 3.130324602127075, + "learning_rate": 2.1932186345775808e-05, + "loss": 0.8016, + "step": 9963 + }, + { + "epoch": 0.4851849146641346, + "grad_norm": 2.0480916500091553, + "learning_rate": 2.1929046708848425e-05, + "loss": 0.8833, + "step": 9964 + }, + { + "epoch": 0.4852336084532418, + "grad_norm": 1.8783200979232788, + "learning_rate": 2.1925907023935832e-05, + "loss": 0.8426, + "step": 9965 + }, + { + "epoch": 0.485282302242349, + "grad_norm": 1.3803460597991943, + "learning_rate": 2.1922767291116123e-05, + "loss": 0.8624, + "step": 9966 + }, + { + "epoch": 0.4853309960314562, + "grad_norm": 1.7536393404006958, + "learning_rate": 2.1919627510467397e-05, + "loss": 0.9274, + "step": 9967 + }, + { + "epoch": 0.4853796898205634, + "grad_norm": 1.4029020071029663, + "learning_rate": 2.191648768206777e-05, + "loss": 0.8639, + "step": 9968 + }, + { + "epoch": 0.4854283836096706, + "grad_norm": 1.607354998588562, + "learning_rate": 2.1913347805995337e-05, + "loss": 0.7729, + "step": 9969 + }, + { + "epoch": 0.48547707739877777, + "grad_norm": 0.10199908912181854, + "learning_rate": 2.1910207882328202e-05, + "loss": 0.648, + "step": 9970 + }, + { + "epoch": 0.485525771187885, + "grad_norm": 1.4505565166473389, + "learning_rate": 2.190706791114447e-05, + "loss": 0.8606, + "step": 9971 + }, + { + "epoch": 0.4855744649769922, + "grad_norm": 1.3934061527252197, + "learning_rate": 2.190392789252225e-05, + "loss": 0.8407, + "step": 9972 + }, + { + "epoch": 0.4856231587660994, + "grad_norm": 1.9472620487213135, + "learning_rate": 2.1900787826539646e-05, + "loss": 0.9311, + "step": 9973 + }, + { + "epoch": 0.48567185255520656, + "grad_norm": 3.006418228149414, + "learning_rate": 2.189764771327478e-05, + "loss": 0.8557, + "step": 9974 + }, + { + "epoch": 0.4857205463443138, + "grad_norm": 1.361290454864502, + "learning_rate": 2.1894507552805747e-05, + "loss": 0.8957, + "step": 9975 + }, + { + "epoch": 0.485769240133421, + "grad_norm": 1.152561902999878, + "learning_rate": 2.1891367345210668e-05, + "loss": 0.7788, + "step": 9976 + }, + { + "epoch": 0.4858179339225282, + "grad_norm": 1.5562057495117188, + "learning_rate": 2.1888227090567655e-05, + "loss": 0.8623, + "step": 9977 + }, + { + "epoch": 0.48586662771163536, + "grad_norm": 1.4223066568374634, + "learning_rate": 2.1885086788954822e-05, + "loss": 0.8489, + "step": 9978 + }, + { + "epoch": 0.48591532150074257, + "grad_norm": 2.7278101444244385, + "learning_rate": 2.1881946440450282e-05, + "loss": 0.8007, + "step": 9979 + }, + { + "epoch": 0.4859640152898498, + "grad_norm": 2.207714080810547, + "learning_rate": 2.1878806045132156e-05, + "loss": 0.7451, + "step": 9980 + }, + { + "epoch": 0.486012709078957, + "grad_norm": 1.7118643522262573, + "learning_rate": 2.187566560307855e-05, + "loss": 0.8315, + "step": 9981 + }, + { + "epoch": 0.48606140286806415, + "grad_norm": 1.5217342376708984, + "learning_rate": 2.1872525114367598e-05, + "loss": 0.8471, + "step": 9982 + }, + { + "epoch": 0.48611009665717136, + "grad_norm": 1.5162156820297241, + "learning_rate": 2.186938457907741e-05, + "loss": 0.7846, + "step": 9983 + }, + { + "epoch": 0.4861587904462786, + "grad_norm": 2.4010024070739746, + "learning_rate": 2.186624399728611e-05, + "loss": 0.7651, + "step": 9984 + }, + { + "epoch": 0.4862074842353858, + "grad_norm": 1.8577553033828735, + "learning_rate": 2.186310336907182e-05, + "loss": 0.8482, + "step": 9985 + }, + { + "epoch": 0.486256178024493, + "grad_norm": 1.9319243431091309, + "learning_rate": 2.185996269451267e-05, + "loss": 0.8589, + "step": 9986 + }, + { + "epoch": 0.48630487181360016, + "grad_norm": 1.471447467803955, + "learning_rate": 2.1856821973686777e-05, + "loss": 0.8344, + "step": 9987 + }, + { + "epoch": 0.48635356560270737, + "grad_norm": 1.6552276611328125, + "learning_rate": 2.185368120667226e-05, + "loss": 0.8729, + "step": 9988 + }, + { + "epoch": 0.4864022593918146, + "grad_norm": 1.4283912181854248, + "learning_rate": 2.185054039354726e-05, + "loss": 0.802, + "step": 9989 + }, + { + "epoch": 0.4864509531809218, + "grad_norm": 1.815077543258667, + "learning_rate": 2.1847399534389896e-05, + "loss": 0.8223, + "step": 9990 + }, + { + "epoch": 0.48649964697002895, + "grad_norm": 2.6073665618896484, + "learning_rate": 2.1844258629278302e-05, + "loss": 0.8613, + "step": 9991 + }, + { + "epoch": 0.48654834075913617, + "grad_norm": 2.812556505203247, + "learning_rate": 2.184111767829061e-05, + "loss": 0.8132, + "step": 9992 + }, + { + "epoch": 0.4865970345482434, + "grad_norm": 1.9337526559829712, + "learning_rate": 2.183797668150494e-05, + "loss": 0.7846, + "step": 9993 + }, + { + "epoch": 0.4866457283373506, + "grad_norm": 2.2551586627960205, + "learning_rate": 2.1834835638999435e-05, + "loss": 0.9251, + "step": 9994 + }, + { + "epoch": 0.48669442212645775, + "grad_norm": 0.09116805344820023, + "learning_rate": 2.1831694550852234e-05, + "loss": 0.6163, + "step": 9995 + }, + { + "epoch": 0.48674311591556496, + "grad_norm": 5.567049026489258, + "learning_rate": 2.182855341714145e-05, + "loss": 0.8573, + "step": 9996 + }, + { + "epoch": 0.4867918097046722, + "grad_norm": 3.3078501224517822, + "learning_rate": 2.1825412237945245e-05, + "loss": 0.8537, + "step": 9997 + }, + { + "epoch": 0.4868405034937794, + "grad_norm": 1.8336677551269531, + "learning_rate": 2.182227101334174e-05, + "loss": 0.8853, + "step": 9998 + }, + { + "epoch": 0.48688919728288654, + "grad_norm": 1.672736406326294, + "learning_rate": 2.1819129743409083e-05, + "loss": 0.7887, + "step": 9999 + }, + { + "epoch": 0.48693789107199376, + "grad_norm": 3.0837759971618652, + "learning_rate": 2.18159884282254e-05, + "loss": 0.8672, + "step": 10000 + }, + { + "epoch": 0.48698658486110097, + "grad_norm": 1.7380439043045044, + "learning_rate": 2.1812847067868844e-05, + "loss": 0.9031, + "step": 10001 + }, + { + "epoch": 0.4870352786502082, + "grad_norm": 1.3584928512573242, + "learning_rate": 2.180970566241755e-05, + "loss": 0.7906, + "step": 10002 + }, + { + "epoch": 0.48708397243931534, + "grad_norm": 1.3840086460113525, + "learning_rate": 2.180656421194966e-05, + "loss": 0.8484, + "step": 10003 + }, + { + "epoch": 0.48713266622842255, + "grad_norm": 1.189534306526184, + "learning_rate": 2.1803422716543316e-05, + "loss": 0.8229, + "step": 10004 + }, + { + "epoch": 0.48718136001752976, + "grad_norm": 3.238485097885132, + "learning_rate": 2.1800281176276677e-05, + "loss": 1.0293, + "step": 10005 + }, + { + "epoch": 0.487230053806637, + "grad_norm": 1.7521779537200928, + "learning_rate": 2.179713959122788e-05, + "loss": 0.8975, + "step": 10006 + }, + { + "epoch": 0.4872787475957442, + "grad_norm": 1.360424280166626, + "learning_rate": 2.1793997961475067e-05, + "loss": 0.7897, + "step": 10007 + }, + { + "epoch": 0.48732744138485135, + "grad_norm": 1.9411882162094116, + "learning_rate": 2.1790856287096394e-05, + "loss": 0.801, + "step": 10008 + }, + { + "epoch": 0.48737613517395856, + "grad_norm": 0.09038794785737991, + "learning_rate": 2.1787714568170006e-05, + "loss": 0.6168, + "step": 10009 + }, + { + "epoch": 0.48742482896306577, + "grad_norm": 1.8810474872589111, + "learning_rate": 2.1784572804774055e-05, + "loss": 0.7749, + "step": 10010 + }, + { + "epoch": 0.487473522752173, + "grad_norm": 1.4451979398727417, + "learning_rate": 2.1781430996986693e-05, + "loss": 0.9026, + "step": 10011 + }, + { + "epoch": 0.48752221654128014, + "grad_norm": 2.837505340576172, + "learning_rate": 2.1778289144886074e-05, + "loss": 0.8317, + "step": 10012 + }, + { + "epoch": 0.48757091033038735, + "grad_norm": 2.21315336227417, + "learning_rate": 2.177514724855035e-05, + "loss": 0.7802, + "step": 10013 + }, + { + "epoch": 0.48761960411949457, + "grad_norm": 1.8540114164352417, + "learning_rate": 2.177200530805767e-05, + "loss": 0.7515, + "step": 10014 + }, + { + "epoch": 0.4876682979086018, + "grad_norm": 1.8233990669250488, + "learning_rate": 2.1768863323486205e-05, + "loss": 0.8486, + "step": 10015 + }, + { + "epoch": 0.48771699169770893, + "grad_norm": 1.6406927108764648, + "learning_rate": 2.1765721294914105e-05, + "loss": 0.8784, + "step": 10016 + }, + { + "epoch": 0.48776568548681615, + "grad_norm": 3.276991844177246, + "learning_rate": 2.176257922241952e-05, + "loss": 0.7406, + "step": 10017 + }, + { + "epoch": 0.48781437927592336, + "grad_norm": 1.768309235572815, + "learning_rate": 2.1759437106080616e-05, + "loss": 0.8711, + "step": 10018 + }, + { + "epoch": 0.4878630730650306, + "grad_norm": 0.09667297452688217, + "learning_rate": 2.1756294945975558e-05, + "loss": 0.5801, + "step": 10019 + }, + { + "epoch": 0.48791176685413773, + "grad_norm": 1.739390254020691, + "learning_rate": 2.17531527421825e-05, + "loss": 0.7475, + "step": 10020 + }, + { + "epoch": 0.48796046064324494, + "grad_norm": 1.8241841793060303, + "learning_rate": 2.175001049477961e-05, + "loss": 0.7776, + "step": 10021 + }, + { + "epoch": 0.48800915443235215, + "grad_norm": 1.4539343118667603, + "learning_rate": 2.1746868203845048e-05, + "loss": 0.8673, + "step": 10022 + }, + { + "epoch": 0.48805784822145937, + "grad_norm": 2.161168336868286, + "learning_rate": 2.174372586945698e-05, + "loss": 0.816, + "step": 10023 + }, + { + "epoch": 0.4881065420105666, + "grad_norm": 1.7131389379501343, + "learning_rate": 2.1740583491693573e-05, + "loss": 0.8296, + "step": 10024 + }, + { + "epoch": 0.48815523579967374, + "grad_norm": 2.7948458194732666, + "learning_rate": 2.1737441070632985e-05, + "loss": 0.9117, + "step": 10025 + }, + { + "epoch": 0.48820392958878095, + "grad_norm": 1.5064469575881958, + "learning_rate": 2.17342986063534e-05, + "loss": 0.8074, + "step": 10026 + }, + { + "epoch": 0.48825262337788816, + "grad_norm": 1.3989429473876953, + "learning_rate": 2.173115609893298e-05, + "loss": 0.9458, + "step": 10027 + }, + { + "epoch": 0.4883013171669954, + "grad_norm": 1.9296246767044067, + "learning_rate": 2.1728013548449895e-05, + "loss": 0.8681, + "step": 10028 + }, + { + "epoch": 0.48835001095610253, + "grad_norm": 1.7385221719741821, + "learning_rate": 2.1724870954982316e-05, + "loss": 0.8801, + "step": 10029 + }, + { + "epoch": 0.48839870474520974, + "grad_norm": 1.8635138273239136, + "learning_rate": 2.172172831860841e-05, + "loss": 0.8119, + "step": 10030 + }, + { + "epoch": 0.48844739853431696, + "grad_norm": 2.0840587615966797, + "learning_rate": 2.171858563940636e-05, + "loss": 0.882, + "step": 10031 + }, + { + "epoch": 0.48849609232342417, + "grad_norm": 1.6385891437530518, + "learning_rate": 2.171544291745433e-05, + "loss": 0.9015, + "step": 10032 + }, + { + "epoch": 0.4885447861125313, + "grad_norm": 1.4260010719299316, + "learning_rate": 2.1712300152830496e-05, + "loss": 0.9141, + "step": 10033 + }, + { + "epoch": 0.48859347990163854, + "grad_norm": 1.3814300298690796, + "learning_rate": 2.170915734561305e-05, + "loss": 0.867, + "step": 10034 + }, + { + "epoch": 0.48864217369074575, + "grad_norm": 2.227635145187378, + "learning_rate": 2.1706014495880155e-05, + "loss": 0.7878, + "step": 10035 + }, + { + "epoch": 0.48869086747985296, + "grad_norm": 1.732466697692871, + "learning_rate": 2.1702871603709993e-05, + "loss": 0.8646, + "step": 10036 + }, + { + "epoch": 0.4887395612689601, + "grad_norm": 2.2235121726989746, + "learning_rate": 2.169972866918075e-05, + "loss": 0.7522, + "step": 10037 + }, + { + "epoch": 0.48878825505806733, + "grad_norm": 3.407228469848633, + "learning_rate": 2.16965856923706e-05, + "loss": 0.7892, + "step": 10038 + }, + { + "epoch": 0.48883694884717455, + "grad_norm": 1.9777097702026367, + "learning_rate": 2.169344267335772e-05, + "loss": 0.8398, + "step": 10039 + }, + { + "epoch": 0.48888564263628176, + "grad_norm": 1.9037928581237793, + "learning_rate": 2.1690299612220302e-05, + "loss": 0.834, + "step": 10040 + }, + { + "epoch": 0.4889343364253889, + "grad_norm": 1.3329707384109497, + "learning_rate": 2.1687156509036533e-05, + "loss": 0.8328, + "step": 10041 + }, + { + "epoch": 0.48898303021449613, + "grad_norm": 1.9005366563796997, + "learning_rate": 2.168401336388459e-05, + "loss": 0.8914, + "step": 10042 + }, + { + "epoch": 0.48903172400360334, + "grad_norm": 2.4766743183135986, + "learning_rate": 2.1680870176842653e-05, + "loss": 0.7991, + "step": 10043 + }, + { + "epoch": 0.48908041779271055, + "grad_norm": 1.495896816253662, + "learning_rate": 2.1677726947988922e-05, + "loss": 0.794, + "step": 10044 + }, + { + "epoch": 0.48912911158181777, + "grad_norm": 1.8399049043655396, + "learning_rate": 2.167458367740158e-05, + "loss": 0.7487, + "step": 10045 + }, + { + "epoch": 0.4891778053709249, + "grad_norm": 3.199077844619751, + "learning_rate": 2.167144036515882e-05, + "loss": 0.8298, + "step": 10046 + }, + { + "epoch": 0.48922649916003214, + "grad_norm": 2.401934862136841, + "learning_rate": 2.1668297011338822e-05, + "loss": 0.8413, + "step": 10047 + }, + { + "epoch": 0.48927519294913935, + "grad_norm": 3.159712076187134, + "learning_rate": 2.1665153616019787e-05, + "loss": 0.7466, + "step": 10048 + }, + { + "epoch": 0.48932388673824656, + "grad_norm": 1.800097107887268, + "learning_rate": 2.1662010179279905e-05, + "loss": 0.9121, + "step": 10049 + }, + { + "epoch": 0.4893725805273537, + "grad_norm": 1.3772703409194946, + "learning_rate": 2.1658866701197368e-05, + "loss": 0.8359, + "step": 10050 + }, + { + "epoch": 0.48942127431646093, + "grad_norm": 1.405609130859375, + "learning_rate": 2.1655723181850374e-05, + "loss": 0.7846, + "step": 10051 + }, + { + "epoch": 0.48946996810556814, + "grad_norm": 1.5443058013916016, + "learning_rate": 2.1652579621317108e-05, + "loss": 0.8085, + "step": 10052 + }, + { + "epoch": 0.48951866189467536, + "grad_norm": 1.3651013374328613, + "learning_rate": 2.1649436019675774e-05, + "loss": 0.7546, + "step": 10053 + }, + { + "epoch": 0.4895673556837825, + "grad_norm": 1.9407942295074463, + "learning_rate": 2.164629237700457e-05, + "loss": 0.8218, + "step": 10054 + }, + { + "epoch": 0.4896160494728897, + "grad_norm": 1.290459394454956, + "learning_rate": 2.1643148693381693e-05, + "loss": 0.8109, + "step": 10055 + }, + { + "epoch": 0.48966474326199694, + "grad_norm": 1.9283695220947266, + "learning_rate": 2.1640004968885342e-05, + "loss": 0.7835, + "step": 10056 + }, + { + "epoch": 0.48971343705110415, + "grad_norm": 3.299213171005249, + "learning_rate": 2.1636861203593723e-05, + "loss": 0.8065, + "step": 10057 + }, + { + "epoch": 0.4897621308402113, + "grad_norm": 1.6311662197113037, + "learning_rate": 2.1633717397585027e-05, + "loss": 0.8033, + "step": 10058 + }, + { + "epoch": 0.4898108246293185, + "grad_norm": 1.4977003335952759, + "learning_rate": 2.1630573550937465e-05, + "loss": 0.8316, + "step": 10059 + }, + { + "epoch": 0.48985951841842573, + "grad_norm": 1.5039808750152588, + "learning_rate": 2.1627429663729232e-05, + "loss": 0.7978, + "step": 10060 + }, + { + "epoch": 0.48990821220753294, + "grad_norm": 1.6499953269958496, + "learning_rate": 2.162428573603854e-05, + "loss": 0.8501, + "step": 10061 + }, + { + "epoch": 0.4899569059966401, + "grad_norm": 1.7710257768630981, + "learning_rate": 2.162114176794359e-05, + "loss": 0.8018, + "step": 10062 + }, + { + "epoch": 0.4900055997857473, + "grad_norm": 1.585453987121582, + "learning_rate": 2.1617997759522596e-05, + "loss": 0.8698, + "step": 10063 + }, + { + "epoch": 0.4900542935748545, + "grad_norm": 4.04530143737793, + "learning_rate": 2.161485371085376e-05, + "loss": 0.8492, + "step": 10064 + }, + { + "epoch": 0.49010298736396174, + "grad_norm": 1.7529252767562866, + "learning_rate": 2.1611709622015287e-05, + "loss": 0.9539, + "step": 10065 + }, + { + "epoch": 0.49015168115306895, + "grad_norm": 2.1787424087524414, + "learning_rate": 2.1608565493085396e-05, + "loss": 0.8511, + "step": 10066 + }, + { + "epoch": 0.4902003749421761, + "grad_norm": 3.6772658824920654, + "learning_rate": 2.1605421324142286e-05, + "loss": 0.9118, + "step": 10067 + }, + { + "epoch": 0.4902490687312833, + "grad_norm": 0.09083420783281326, + "learning_rate": 2.1602277115264178e-05, + "loss": 0.6124, + "step": 10068 + }, + { + "epoch": 0.49029776252039053, + "grad_norm": 1.637117862701416, + "learning_rate": 2.1599132866529276e-05, + "loss": 0.8449, + "step": 10069 + }, + { + "epoch": 0.49034645630949775, + "grad_norm": 1.773407220840454, + "learning_rate": 2.15959885780158e-05, + "loss": 0.7855, + "step": 10070 + }, + { + "epoch": 0.4903951500986049, + "grad_norm": 3.7957239151000977, + "learning_rate": 2.1592844249801964e-05, + "loss": 0.7654, + "step": 10071 + }, + { + "epoch": 0.4904438438877121, + "grad_norm": 3.518948793411255, + "learning_rate": 2.158969988196598e-05, + "loss": 0.8335, + "step": 10072 + }, + { + "epoch": 0.49049253767681933, + "grad_norm": 1.7924598455429077, + "learning_rate": 2.158655547458607e-05, + "loss": 0.8203, + "step": 10073 + }, + { + "epoch": 0.49054123146592654, + "grad_norm": 0.09319984912872314, + "learning_rate": 2.1583411027740445e-05, + "loss": 0.6089, + "step": 10074 + }, + { + "epoch": 0.4905899252550337, + "grad_norm": 1.5162112712860107, + "learning_rate": 2.1580266541507327e-05, + "loss": 0.8451, + "step": 10075 + }, + { + "epoch": 0.4906386190441409, + "grad_norm": 1.8147865533828735, + "learning_rate": 2.1577122015964935e-05, + "loss": 0.8864, + "step": 10076 + }, + { + "epoch": 0.4906873128332481, + "grad_norm": 1.765421748161316, + "learning_rate": 2.1573977451191487e-05, + "loss": 0.8294, + "step": 10077 + }, + { + "epoch": 0.49073600662235534, + "grad_norm": 2.8844847679138184, + "learning_rate": 2.157083284726521e-05, + "loss": 0.8302, + "step": 10078 + }, + { + "epoch": 0.4907847004114625, + "grad_norm": 1.5026049613952637, + "learning_rate": 2.1567688204264324e-05, + "loss": 0.8016, + "step": 10079 + }, + { + "epoch": 0.4908333942005697, + "grad_norm": 2.4910497665405273, + "learning_rate": 2.156454352226705e-05, + "loss": 0.8147, + "step": 10080 + }, + { + "epoch": 0.4908820879896769, + "grad_norm": 1.706039547920227, + "learning_rate": 2.156139880135161e-05, + "loss": 0.8903, + "step": 10081 + }, + { + "epoch": 0.49093078177878413, + "grad_norm": 1.5350807905197144, + "learning_rate": 2.155825404159623e-05, + "loss": 0.8206, + "step": 10082 + }, + { + "epoch": 0.4909794755678913, + "grad_norm": 1.3529906272888184, + "learning_rate": 2.1555109243079144e-05, + "loss": 0.8285, + "step": 10083 + }, + { + "epoch": 0.4910281693569985, + "grad_norm": 1.5766313076019287, + "learning_rate": 2.1551964405878565e-05, + "loss": 0.8447, + "step": 10084 + }, + { + "epoch": 0.4910768631461057, + "grad_norm": 2.605604410171509, + "learning_rate": 2.1548819530072742e-05, + "loss": 0.8678, + "step": 10085 + }, + { + "epoch": 0.4911255569352129, + "grad_norm": 1.6116621494293213, + "learning_rate": 2.1545674615739886e-05, + "loss": 0.8752, + "step": 10086 + }, + { + "epoch": 0.49117425072432014, + "grad_norm": 3.454540491104126, + "learning_rate": 2.1542529662958236e-05, + "loss": 0.7754, + "step": 10087 + }, + { + "epoch": 0.4912229445134273, + "grad_norm": 6.755306243896484, + "learning_rate": 2.153938467180602e-05, + "loss": 0.744, + "step": 10088 + }, + { + "epoch": 0.4912716383025345, + "grad_norm": 1.7286313772201538, + "learning_rate": 2.1536239642361468e-05, + "loss": 0.7497, + "step": 10089 + }, + { + "epoch": 0.4913203320916417, + "grad_norm": 1.6219123601913452, + "learning_rate": 2.153309457470281e-05, + "loss": 0.898, + "step": 10090 + }, + { + "epoch": 0.49136902588074893, + "grad_norm": 1.7947356700897217, + "learning_rate": 2.152994946890829e-05, + "loss": 0.8197, + "step": 10091 + }, + { + "epoch": 0.4914177196698561, + "grad_norm": 1.445289134979248, + "learning_rate": 2.1526804325056134e-05, + "loss": 0.8485, + "step": 10092 + }, + { + "epoch": 0.4914664134589633, + "grad_norm": 1.7081363201141357, + "learning_rate": 2.1523659143224584e-05, + "loss": 0.76, + "step": 10093 + }, + { + "epoch": 0.4915151072480705, + "grad_norm": 1.6724380254745483, + "learning_rate": 2.1520513923491878e-05, + "loss": 0.9508, + "step": 10094 + }, + { + "epoch": 0.49156380103717773, + "grad_norm": 1.5564055442810059, + "learning_rate": 2.1517368665936247e-05, + "loss": 0.9439, + "step": 10095 + }, + { + "epoch": 0.4916124948262849, + "grad_norm": 1.8397536277770996, + "learning_rate": 2.151422337063593e-05, + "loss": 0.8323, + "step": 10096 + }, + { + "epoch": 0.4916611886153921, + "grad_norm": 1.4769164323806763, + "learning_rate": 2.1511078037669173e-05, + "loss": 0.9307, + "step": 10097 + }, + { + "epoch": 0.4917098824044993, + "grad_norm": 1.8271396160125732, + "learning_rate": 2.1507932667114203e-05, + "loss": 0.7893, + "step": 10098 + }, + { + "epoch": 0.4917585761936065, + "grad_norm": 1.7209590673446655, + "learning_rate": 2.1504787259049274e-05, + "loss": 0.8069, + "step": 10099 + }, + { + "epoch": 0.4918072699827137, + "grad_norm": 1.330989122390747, + "learning_rate": 2.150164181355263e-05, + "loss": 0.8053, + "step": 10100 + }, + { + "epoch": 0.4918559637718209, + "grad_norm": 1.4911965131759644, + "learning_rate": 2.1498496330702506e-05, + "loss": 0.8745, + "step": 10101 + }, + { + "epoch": 0.4919046575609281, + "grad_norm": 1.9575515985488892, + "learning_rate": 2.1495350810577148e-05, + "loss": 0.9548, + "step": 10102 + }, + { + "epoch": 0.4919533513500353, + "grad_norm": 2.112027168273926, + "learning_rate": 2.14922052532548e-05, + "loss": 0.8196, + "step": 10103 + }, + { + "epoch": 0.49200204513914253, + "grad_norm": 1.4112403392791748, + "learning_rate": 2.1489059658813717e-05, + "loss": 0.8446, + "step": 10104 + }, + { + "epoch": 0.4920507389282497, + "grad_norm": 2.8509039878845215, + "learning_rate": 2.148591402733213e-05, + "loss": 0.8236, + "step": 10105 + }, + { + "epoch": 0.4920994327173569, + "grad_norm": 2.5255532264709473, + "learning_rate": 2.1482768358888303e-05, + "loss": 0.8689, + "step": 10106 + }, + { + "epoch": 0.4921481265064641, + "grad_norm": 1.5016905069351196, + "learning_rate": 2.147962265356048e-05, + "loss": 0.7502, + "step": 10107 + }, + { + "epoch": 0.4921968202955713, + "grad_norm": 17.743383407592773, + "learning_rate": 2.14764769114269e-05, + "loss": 0.9133, + "step": 10108 + }, + { + "epoch": 0.4922455140846785, + "grad_norm": 1.6341500282287598, + "learning_rate": 2.147333113256583e-05, + "loss": 0.8172, + "step": 10109 + }, + { + "epoch": 0.4922942078737857, + "grad_norm": 2.593794345855713, + "learning_rate": 2.147018531705551e-05, + "loss": 0.8207, + "step": 10110 + }, + { + "epoch": 0.4923429016628929, + "grad_norm": 2.2864632606506348, + "learning_rate": 2.1467039464974196e-05, + "loss": 0.8252, + "step": 10111 + }, + { + "epoch": 0.4923915954520001, + "grad_norm": 1.5063928365707397, + "learning_rate": 2.1463893576400144e-05, + "loss": 0.8792, + "step": 10112 + }, + { + "epoch": 0.4924402892411073, + "grad_norm": 0.09272550791501999, + "learning_rate": 2.14607476514116e-05, + "loss": 0.6173, + "step": 10113 + }, + { + "epoch": 0.4924889830302145, + "grad_norm": 5.424806594848633, + "learning_rate": 2.1457601690086828e-05, + "loss": 0.9125, + "step": 10114 + }, + { + "epoch": 0.4925376768193217, + "grad_norm": 1.9424858093261719, + "learning_rate": 2.1454455692504085e-05, + "loss": 0.9167, + "step": 10115 + }, + { + "epoch": 0.4925863706084289, + "grad_norm": 1.5234098434448242, + "learning_rate": 2.1451309658741623e-05, + "loss": 0.7554, + "step": 10116 + }, + { + "epoch": 0.49263506439753607, + "grad_norm": 2.2717559337615967, + "learning_rate": 2.14481635888777e-05, + "loss": 0.8159, + "step": 10117 + }, + { + "epoch": 0.4926837581866433, + "grad_norm": 1.5388858318328857, + "learning_rate": 2.1445017482990576e-05, + "loss": 0.8769, + "step": 10118 + }, + { + "epoch": 0.4927324519757505, + "grad_norm": 0.09341145306825638, + "learning_rate": 2.1441871341158508e-05, + "loss": 0.6302, + "step": 10119 + }, + { + "epoch": 0.4927811457648577, + "grad_norm": 1.775015115737915, + "learning_rate": 2.1438725163459758e-05, + "loss": 0.9313, + "step": 10120 + }, + { + "epoch": 0.49282983955396487, + "grad_norm": 1.373549461364746, + "learning_rate": 2.1435578949972592e-05, + "loss": 0.9002, + "step": 10121 + }, + { + "epoch": 0.4928785333430721, + "grad_norm": 1.8471801280975342, + "learning_rate": 2.1432432700775263e-05, + "loss": 0.6787, + "step": 10122 + }, + { + "epoch": 0.4929272271321793, + "grad_norm": 1.32632315158844, + "learning_rate": 2.1429286415946046e-05, + "loss": 0.9078, + "step": 10123 + }, + { + "epoch": 0.4929759209212865, + "grad_norm": 2.1119089126586914, + "learning_rate": 2.14261400955632e-05, + "loss": 0.7906, + "step": 10124 + }, + { + "epoch": 0.4930246147103937, + "grad_norm": 1.7669239044189453, + "learning_rate": 2.1422993739704988e-05, + "loss": 0.9193, + "step": 10125 + }, + { + "epoch": 0.4930733084995009, + "grad_norm": 1.5074632167816162, + "learning_rate": 2.1419847348449676e-05, + "loss": 0.9581, + "step": 10126 + }, + { + "epoch": 0.4931220022886081, + "grad_norm": 1.6483420133590698, + "learning_rate": 2.141670092187553e-05, + "loss": 0.8442, + "step": 10127 + }, + { + "epoch": 0.4931706960777153, + "grad_norm": 2.1585099697113037, + "learning_rate": 2.1413554460060818e-05, + "loss": 0.7617, + "step": 10128 + }, + { + "epoch": 0.4932193898668225, + "grad_norm": 0.08916542679071426, + "learning_rate": 2.1410407963083813e-05, + "loss": 0.5737, + "step": 10129 + }, + { + "epoch": 0.49326808365592967, + "grad_norm": 1.975929617881775, + "learning_rate": 2.1407261431022786e-05, + "loss": 0.725, + "step": 10130 + }, + { + "epoch": 0.4933167774450369, + "grad_norm": 1.6140680313110352, + "learning_rate": 2.1404114863955997e-05, + "loss": 0.8583, + "step": 10131 + }, + { + "epoch": 0.4933654712341441, + "grad_norm": 1.6664117574691772, + "learning_rate": 2.140096826196172e-05, + "loss": 0.8848, + "step": 10132 + }, + { + "epoch": 0.4934141650232513, + "grad_norm": 2.588543176651001, + "learning_rate": 2.139782162511823e-05, + "loss": 0.833, + "step": 10133 + }, + { + "epoch": 0.49346285881235846, + "grad_norm": 0.09781093150377274, + "learning_rate": 2.1394674953503802e-05, + "loss": 0.6784, + "step": 10134 + }, + { + "epoch": 0.4935115526014657, + "grad_norm": 1.9730699062347412, + "learning_rate": 2.1391528247196706e-05, + "loss": 0.8347, + "step": 10135 + }, + { + "epoch": 0.4935602463905729, + "grad_norm": 1.796714425086975, + "learning_rate": 2.1388381506275223e-05, + "loss": 0.7525, + "step": 10136 + }, + { + "epoch": 0.4936089401796801, + "grad_norm": 2.078455924987793, + "learning_rate": 2.138523473081762e-05, + "loss": 0.8674, + "step": 10137 + }, + { + "epoch": 0.49365763396878726, + "grad_norm": 1.9667632579803467, + "learning_rate": 2.138208792090218e-05, + "loss": 0.861, + "step": 10138 + }, + { + "epoch": 0.49370632775789447, + "grad_norm": 1.3655861616134644, + "learning_rate": 2.1378941076607176e-05, + "loss": 0.848, + "step": 10139 + }, + { + "epoch": 0.4937550215470017, + "grad_norm": 3.0987472534179688, + "learning_rate": 2.1375794198010885e-05, + "loss": 0.8046, + "step": 10140 + }, + { + "epoch": 0.4938037153361089, + "grad_norm": 1.920538067817688, + "learning_rate": 2.1372647285191587e-05, + "loss": 0.8896, + "step": 10141 + }, + { + "epoch": 0.49385240912521605, + "grad_norm": 2.243333339691162, + "learning_rate": 2.136950033822756e-05, + "loss": 0.8427, + "step": 10142 + }, + { + "epoch": 0.49390110291432326, + "grad_norm": 2.731861114501953, + "learning_rate": 2.136635335719709e-05, + "loss": 0.7572, + "step": 10143 + }, + { + "epoch": 0.4939497967034305, + "grad_norm": 2.2238426208496094, + "learning_rate": 2.136320634217846e-05, + "loss": 0.8424, + "step": 10144 + }, + { + "epoch": 0.4939984904925377, + "grad_norm": 2.1198339462280273, + "learning_rate": 2.1360059293249945e-05, + "loss": 0.7467, + "step": 10145 + }, + { + "epoch": 0.4940471842816449, + "grad_norm": 1.7906829118728638, + "learning_rate": 2.1356912210489827e-05, + "loss": 0.8512, + "step": 10146 + }, + { + "epoch": 0.49409587807075206, + "grad_norm": 1.7168563604354858, + "learning_rate": 2.13537650939764e-05, + "loss": 0.765, + "step": 10147 + }, + { + "epoch": 0.49414457185985927, + "grad_norm": 1.9362008571624756, + "learning_rate": 2.135061794378794e-05, + "loss": 0.9049, + "step": 10148 + }, + { + "epoch": 0.4941932656489665, + "grad_norm": 2.6973211765289307, + "learning_rate": 2.1347470760002734e-05, + "loss": 0.7172, + "step": 10149 + }, + { + "epoch": 0.4942419594380737, + "grad_norm": 1.8176755905151367, + "learning_rate": 2.1344323542699072e-05, + "loss": 0.7866, + "step": 10150 + }, + { + "epoch": 0.49429065322718085, + "grad_norm": 3.52095103263855, + "learning_rate": 2.134117629195524e-05, + "loss": 0.8152, + "step": 10151 + }, + { + "epoch": 0.49433934701628807, + "grad_norm": 1.5838106870651245, + "learning_rate": 2.1338029007849523e-05, + "loss": 0.8157, + "step": 10152 + }, + { + "epoch": 0.4943880408053953, + "grad_norm": 1.689852237701416, + "learning_rate": 2.1334881690460217e-05, + "loss": 0.7902, + "step": 10153 + }, + { + "epoch": 0.4944367345945025, + "grad_norm": 5.357853889465332, + "learning_rate": 2.1331734339865605e-05, + "loss": 0.8377, + "step": 10154 + }, + { + "epoch": 0.49448542838360965, + "grad_norm": 2.0940985679626465, + "learning_rate": 2.132858695614398e-05, + "loss": 0.8113, + "step": 10155 + }, + { + "epoch": 0.49453412217271686, + "grad_norm": 2.0949113368988037, + "learning_rate": 2.1325439539373627e-05, + "loss": 0.8348, + "step": 10156 + }, + { + "epoch": 0.4945828159618241, + "grad_norm": 2.2914609909057617, + "learning_rate": 2.1322292089632853e-05, + "loss": 0.798, + "step": 10157 + }, + { + "epoch": 0.4946315097509313, + "grad_norm": 1.7584160566329956, + "learning_rate": 2.131914460699994e-05, + "loss": 0.7603, + "step": 10158 + }, + { + "epoch": 0.49468020354003844, + "grad_norm": 1.9617739915847778, + "learning_rate": 2.1315997091553188e-05, + "loss": 0.7529, + "step": 10159 + }, + { + "epoch": 0.49472889732914566, + "grad_norm": 1.3771297931671143, + "learning_rate": 2.131284954337088e-05, + "loss": 0.9541, + "step": 10160 + }, + { + "epoch": 0.49477759111825287, + "grad_norm": 1.8951469659805298, + "learning_rate": 2.1309701962531327e-05, + "loss": 0.8407, + "step": 10161 + }, + { + "epoch": 0.4948262849073601, + "grad_norm": 1.8744200468063354, + "learning_rate": 2.1306554349112813e-05, + "loss": 0.8092, + "step": 10162 + }, + { + "epoch": 0.49487497869646724, + "grad_norm": 1.6723337173461914, + "learning_rate": 2.1303406703193644e-05, + "loss": 0.8156, + "step": 10163 + }, + { + "epoch": 0.49492367248557445, + "grad_norm": 2.5852551460266113, + "learning_rate": 2.1300259024852114e-05, + "loss": 0.8661, + "step": 10164 + }, + { + "epoch": 0.49497236627468166, + "grad_norm": 2.519655704498291, + "learning_rate": 2.1297111314166522e-05, + "loss": 0.7925, + "step": 10165 + }, + { + "epoch": 0.4950210600637889, + "grad_norm": 1.7870737314224243, + "learning_rate": 2.129396357121517e-05, + "loss": 0.8394, + "step": 10166 + }, + { + "epoch": 0.4950697538528961, + "grad_norm": 1.6561652421951294, + "learning_rate": 2.129081579607635e-05, + "loss": 0.7885, + "step": 10167 + }, + { + "epoch": 0.49511844764200325, + "grad_norm": 2.693934440612793, + "learning_rate": 2.128766798882838e-05, + "loss": 0.846, + "step": 10168 + }, + { + "epoch": 0.49516714143111046, + "grad_norm": 2.119263172149658, + "learning_rate": 2.1284520149549546e-05, + "loss": 0.7915, + "step": 10169 + }, + { + "epoch": 0.49521583522021767, + "grad_norm": 4.513456344604492, + "learning_rate": 2.128137227831815e-05, + "loss": 0.8217, + "step": 10170 + }, + { + "epoch": 0.4952645290093249, + "grad_norm": 2.0569934844970703, + "learning_rate": 2.12782243752125e-05, + "loss": 0.9606, + "step": 10171 + }, + { + "epoch": 0.49531322279843204, + "grad_norm": 2.1367321014404297, + "learning_rate": 2.1275076440310914e-05, + "loss": 0.7443, + "step": 10172 + }, + { + "epoch": 0.49536191658753925, + "grad_norm": 1.9587489366531372, + "learning_rate": 2.127192847369168e-05, + "loss": 0.8909, + "step": 10173 + }, + { + "epoch": 0.49541061037664647, + "grad_norm": 1.781419038772583, + "learning_rate": 2.126878047543311e-05, + "loss": 0.9195, + "step": 10174 + }, + { + "epoch": 0.4954593041657537, + "grad_norm": 1.7726259231567383, + "learning_rate": 2.126563244561351e-05, + "loss": 0.8789, + "step": 10175 + }, + { + "epoch": 0.49550799795486083, + "grad_norm": 2.2739217281341553, + "learning_rate": 2.1262484384311186e-05, + "loss": 0.8956, + "step": 10176 + }, + { + "epoch": 0.49555669174396805, + "grad_norm": 1.8476272821426392, + "learning_rate": 2.1259336291604448e-05, + "loss": 0.8456, + "step": 10177 + }, + { + "epoch": 0.49560538553307526, + "grad_norm": 3.224858045578003, + "learning_rate": 2.1256188167571606e-05, + "loss": 0.873, + "step": 10178 + }, + { + "epoch": 0.4956540793221825, + "grad_norm": 1.632204532623291, + "learning_rate": 2.1253040012290967e-05, + "loss": 0.7808, + "step": 10179 + }, + { + "epoch": 0.49570277311128963, + "grad_norm": 1.5217829942703247, + "learning_rate": 2.1249891825840847e-05, + "loss": 0.894, + "step": 10180 + }, + { + "epoch": 0.49575146690039684, + "grad_norm": 1.650869369506836, + "learning_rate": 2.1246743608299552e-05, + "loss": 0.8029, + "step": 10181 + }, + { + "epoch": 0.49580016068950405, + "grad_norm": 2.7927184104919434, + "learning_rate": 2.1243595359745397e-05, + "loss": 0.8137, + "step": 10182 + }, + { + "epoch": 0.49584885447861127, + "grad_norm": 1.9990694522857666, + "learning_rate": 2.124044708025669e-05, + "loss": 0.8608, + "step": 10183 + }, + { + "epoch": 0.4958975482677185, + "grad_norm": 1.3889074325561523, + "learning_rate": 2.1237298769911755e-05, + "loss": 0.9144, + "step": 10184 + }, + { + "epoch": 0.49594624205682564, + "grad_norm": 1.5801217555999756, + "learning_rate": 2.1234150428788892e-05, + "loss": 0.878, + "step": 10185 + }, + { + "epoch": 0.49599493584593285, + "grad_norm": 2.753695011138916, + "learning_rate": 2.123100205696643e-05, + "loss": 0.8328, + "step": 10186 + }, + { + "epoch": 0.49604362963504006, + "grad_norm": 1.4437724351882935, + "learning_rate": 2.1227853654522682e-05, + "loss": 0.8804, + "step": 10187 + }, + { + "epoch": 0.4960923234241473, + "grad_norm": 4.036582946777344, + "learning_rate": 2.122470522153596e-05, + "loss": 0.8371, + "step": 10188 + }, + { + "epoch": 0.49614101721325443, + "grad_norm": 1.4034472703933716, + "learning_rate": 2.1221556758084584e-05, + "loss": 0.8308, + "step": 10189 + }, + { + "epoch": 0.49618971100236164, + "grad_norm": 1.8605636358261108, + "learning_rate": 2.121840826424687e-05, + "loss": 0.8493, + "step": 10190 + }, + { + "epoch": 0.49623840479146886, + "grad_norm": 1.9666436910629272, + "learning_rate": 2.121525974010114e-05, + "loss": 0.8402, + "step": 10191 + }, + { + "epoch": 0.49628709858057607, + "grad_norm": 3.411675214767456, + "learning_rate": 2.1212111185725716e-05, + "loss": 0.9367, + "step": 10192 + }, + { + "epoch": 0.4963357923696832, + "grad_norm": 1.8961328268051147, + "learning_rate": 2.1208962601198913e-05, + "loss": 0.9098, + "step": 10193 + }, + { + "epoch": 0.49638448615879044, + "grad_norm": 1.5094826221466064, + "learning_rate": 2.1205813986599052e-05, + "loss": 0.854, + "step": 10194 + }, + { + "epoch": 0.49643317994789765, + "grad_norm": 4.434866905212402, + "learning_rate": 2.1202665342004467e-05, + "loss": 0.8341, + "step": 10195 + }, + { + "epoch": 0.49648187373700486, + "grad_norm": 3.0348961353302, + "learning_rate": 2.1199516667493465e-05, + "loss": 0.8411, + "step": 10196 + }, + { + "epoch": 0.496530567526112, + "grad_norm": 2.5272507667541504, + "learning_rate": 2.119636796314438e-05, + "loss": 0.8084, + "step": 10197 + }, + { + "epoch": 0.49657926131521923, + "grad_norm": 1.65297269821167, + "learning_rate": 2.1193219229035535e-05, + "loss": 0.8372, + "step": 10198 + }, + { + "epoch": 0.49662795510432645, + "grad_norm": 1.5862846374511719, + "learning_rate": 2.1190070465245244e-05, + "loss": 0.75, + "step": 10199 + }, + { + "epoch": 0.49667664889343366, + "grad_norm": 2.222245693206787, + "learning_rate": 2.118692167185185e-05, + "loss": 0.8439, + "step": 10200 + }, + { + "epoch": 0.4967253426825408, + "grad_norm": 1.3917306661605835, + "learning_rate": 2.1183772848933664e-05, + "loss": 0.7952, + "step": 10201 + }, + { + "epoch": 0.49677403647164803, + "grad_norm": 1.7893309593200684, + "learning_rate": 2.1180623996569026e-05, + "loss": 0.8653, + "step": 10202 + }, + { + "epoch": 0.49682273026075524, + "grad_norm": 1.876174807548523, + "learning_rate": 2.1177475114836258e-05, + "loss": 0.8353, + "step": 10203 + }, + { + "epoch": 0.49687142404986245, + "grad_norm": 1.6817002296447754, + "learning_rate": 2.1174326203813687e-05, + "loss": 0.7874, + "step": 10204 + }, + { + "epoch": 0.49692011783896967, + "grad_norm": 2.0936379432678223, + "learning_rate": 2.117117726357965e-05, + "loss": 0.7444, + "step": 10205 + }, + { + "epoch": 0.4969688116280768, + "grad_norm": 1.8511242866516113, + "learning_rate": 2.1168028294212467e-05, + "loss": 0.8492, + "step": 10206 + }, + { + "epoch": 0.49701750541718404, + "grad_norm": 2.3191635608673096, + "learning_rate": 2.1164879295790472e-05, + "loss": 0.8571, + "step": 10207 + }, + { + "epoch": 0.49706619920629125, + "grad_norm": 1.628970980644226, + "learning_rate": 2.1161730268392e-05, + "loss": 0.8946, + "step": 10208 + }, + { + "epoch": 0.49711489299539846, + "grad_norm": 2.0230343341827393, + "learning_rate": 2.1158581212095385e-05, + "loss": 0.8693, + "step": 10209 + }, + { + "epoch": 0.4971635867845056, + "grad_norm": 2.1216723918914795, + "learning_rate": 2.1155432126978958e-05, + "loss": 0.8931, + "step": 10210 + }, + { + "epoch": 0.49721228057361283, + "grad_norm": 3.2461965084075928, + "learning_rate": 2.115228301312104e-05, + "loss": 0.8322, + "step": 10211 + }, + { + "epoch": 0.49726097436272004, + "grad_norm": 1.7271157503128052, + "learning_rate": 2.1149133870599986e-05, + "loss": 0.8728, + "step": 10212 + }, + { + "epoch": 0.49730966815182726, + "grad_norm": 0.10150136053562164, + "learning_rate": 2.114598469949412e-05, + "loss": 0.72, + "step": 10213 + }, + { + "epoch": 0.4973583619409344, + "grad_norm": 2.12453556060791, + "learning_rate": 2.114283549988178e-05, + "loss": 0.8804, + "step": 10214 + }, + { + "epoch": 0.4974070557300416, + "grad_norm": 1.866288423538208, + "learning_rate": 2.1139686271841305e-05, + "loss": 0.9233, + "step": 10215 + }, + { + "epoch": 0.49745574951914884, + "grad_norm": 1.3571240901947021, + "learning_rate": 2.1136537015451027e-05, + "loss": 0.8109, + "step": 10216 + }, + { + "epoch": 0.49750444330825605, + "grad_norm": 2.029916524887085, + "learning_rate": 2.113338773078929e-05, + "loss": 0.9981, + "step": 10217 + }, + { + "epoch": 0.4975531370973632, + "grad_norm": 3.2105679512023926, + "learning_rate": 2.1130238417934432e-05, + "loss": 0.8872, + "step": 10218 + }, + { + "epoch": 0.4976018308864704, + "grad_norm": 4.304175853729248, + "learning_rate": 2.1127089076964788e-05, + "loss": 0.781, + "step": 10219 + }, + { + "epoch": 0.49765052467557763, + "grad_norm": 0.10210153460502625, + "learning_rate": 2.1123939707958695e-05, + "loss": 0.6285, + "step": 10220 + }, + { + "epoch": 0.49769921846468484, + "grad_norm": 2.025179386138916, + "learning_rate": 2.1120790310994505e-05, + "loss": 0.8036, + "step": 10221 + }, + { + "epoch": 0.497747912253792, + "grad_norm": 2.2478389739990234, + "learning_rate": 2.111764088615055e-05, + "loss": 0.8217, + "step": 10222 + }, + { + "epoch": 0.4977966060428992, + "grad_norm": 2.5955564975738525, + "learning_rate": 2.111449143350518e-05, + "loss": 0.8794, + "step": 10223 + }, + { + "epoch": 0.4978452998320064, + "grad_norm": 2.6081831455230713, + "learning_rate": 2.1111341953136734e-05, + "loss": 0.8808, + "step": 10224 + }, + { + "epoch": 0.49789399362111364, + "grad_norm": 1.7255266904830933, + "learning_rate": 2.1108192445123553e-05, + "loss": 0.7977, + "step": 10225 + }, + { + "epoch": 0.49794268741022085, + "grad_norm": 2.7396602630615234, + "learning_rate": 2.1105042909543986e-05, + "loss": 0.8427, + "step": 10226 + }, + { + "epoch": 0.497991381199328, + "grad_norm": 2.0244674682617188, + "learning_rate": 2.1101893346476374e-05, + "loss": 0.9487, + "step": 10227 + }, + { + "epoch": 0.4980400749884352, + "grad_norm": 2.004211664199829, + "learning_rate": 2.1098743755999063e-05, + "loss": 0.9919, + "step": 10228 + }, + { + "epoch": 0.49808876877754243, + "grad_norm": 1.8528445959091187, + "learning_rate": 2.1095594138190405e-05, + "loss": 0.8148, + "step": 10229 + }, + { + "epoch": 0.49813746256664965, + "grad_norm": 2.0344138145446777, + "learning_rate": 2.109244449312874e-05, + "loss": 0.835, + "step": 10230 + }, + { + "epoch": 0.4981861563557568, + "grad_norm": 1.7928526401519775, + "learning_rate": 2.1089294820892418e-05, + "loss": 0.9084, + "step": 10231 + }, + { + "epoch": 0.498234850144864, + "grad_norm": 2.009474277496338, + "learning_rate": 2.108614512155979e-05, + "loss": 0.8588, + "step": 10232 + }, + { + "epoch": 0.49828354393397123, + "grad_norm": 6.427781581878662, + "learning_rate": 2.1082995395209203e-05, + "loss": 0.8353, + "step": 10233 + }, + { + "epoch": 0.49833223772307844, + "grad_norm": 2.163092851638794, + "learning_rate": 2.107984564191901e-05, + "loss": 0.7941, + "step": 10234 + }, + { + "epoch": 0.4983809315121856, + "grad_norm": 1.4964866638183594, + "learning_rate": 2.1076695861767554e-05, + "loss": 0.8007, + "step": 10235 + }, + { + "epoch": 0.4984296253012928, + "grad_norm": 1.5110934972763062, + "learning_rate": 2.1073546054833186e-05, + "loss": 0.8136, + "step": 10236 + }, + { + "epoch": 0.4984783190904, + "grad_norm": 0.09318230301141739, + "learning_rate": 2.1070396221194272e-05, + "loss": 0.5369, + "step": 10237 + }, + { + "epoch": 0.49852701287950724, + "grad_norm": 1.6337295770645142, + "learning_rate": 2.1067246360929148e-05, + "loss": 0.8241, + "step": 10238 + }, + { + "epoch": 0.4985757066686144, + "grad_norm": 2.221698045730591, + "learning_rate": 2.1064096474116175e-05, + "loss": 0.8022, + "step": 10239 + }, + { + "epoch": 0.4986244004577216, + "grad_norm": 2.3795089721679688, + "learning_rate": 2.106094656083371e-05, + "loss": 0.8403, + "step": 10240 + }, + { + "epoch": 0.4986730942468288, + "grad_norm": 1.9277724027633667, + "learning_rate": 2.1057796621160095e-05, + "loss": 0.7986, + "step": 10241 + }, + { + "epoch": 0.49872178803593603, + "grad_norm": 1.675261378288269, + "learning_rate": 2.1054646655173695e-05, + "loss": 0.8042, + "step": 10242 + }, + { + "epoch": 0.4987704818250432, + "grad_norm": 4.081839561462402, + "learning_rate": 2.1051496662952863e-05, + "loss": 0.8327, + "step": 10243 + }, + { + "epoch": 0.4988191756141504, + "grad_norm": 1.3141590356826782, + "learning_rate": 2.104834664457595e-05, + "loss": 0.8151, + "step": 10244 + }, + { + "epoch": 0.4988678694032576, + "grad_norm": 3.506815195083618, + "learning_rate": 2.104519660012133e-05, + "loss": 0.8248, + "step": 10245 + }, + { + "epoch": 0.4989165631923648, + "grad_norm": 2.2168397903442383, + "learning_rate": 2.1042046529667343e-05, + "loss": 0.7878, + "step": 10246 + }, + { + "epoch": 0.49896525698147204, + "grad_norm": 1.3155543804168701, + "learning_rate": 2.103889643329236e-05, + "loss": 0.7605, + "step": 10247 + }, + { + "epoch": 0.4990139507705792, + "grad_norm": 6.839803218841553, + "learning_rate": 2.103574631107473e-05, + "loss": 0.8308, + "step": 10248 + }, + { + "epoch": 0.4990626445596864, + "grad_norm": 4.718263626098633, + "learning_rate": 2.103259616309281e-05, + "loss": 0.8545, + "step": 10249 + }, + { + "epoch": 0.4991113383487936, + "grad_norm": 1.8859304189682007, + "learning_rate": 2.102944598942497e-05, + "loss": 0.8055, + "step": 10250 + }, + { + "epoch": 0.49916003213790083, + "grad_norm": 7.5615057945251465, + "learning_rate": 2.1026295790149564e-05, + "loss": 0.8407, + "step": 10251 + }, + { + "epoch": 0.499208725927008, + "grad_norm": 1.2953981161117554, + "learning_rate": 2.1023145565344963e-05, + "loss": 0.789, + "step": 10252 + }, + { + "epoch": 0.4992574197161152, + "grad_norm": 3.102997303009033, + "learning_rate": 2.101999531508952e-05, + "loss": 0.9311, + "step": 10253 + }, + { + "epoch": 0.4993061135052224, + "grad_norm": 2.191922426223755, + "learning_rate": 2.1016845039461603e-05, + "loss": 0.9106, + "step": 10254 + }, + { + "epoch": 0.49935480729432963, + "grad_norm": 1.8180363178253174, + "learning_rate": 2.1013694738539572e-05, + "loss": 0.7622, + "step": 10255 + }, + { + "epoch": 0.4994035010834368, + "grad_norm": 1.6249881982803345, + "learning_rate": 2.101054441240179e-05, + "loss": 0.7587, + "step": 10256 + }, + { + "epoch": 0.499452194872544, + "grad_norm": 1.799574851989746, + "learning_rate": 2.1007394061126622e-05, + "loss": 0.8678, + "step": 10257 + }, + { + "epoch": 0.4995008886616512, + "grad_norm": 1.7081284523010254, + "learning_rate": 2.1004243684792435e-05, + "loss": 0.7694, + "step": 10258 + }, + { + "epoch": 0.4995495824507584, + "grad_norm": 2.235445737838745, + "learning_rate": 2.10010932834776e-05, + "loss": 0.8705, + "step": 10259 + }, + { + "epoch": 0.4995982762398656, + "grad_norm": 1.740412950515747, + "learning_rate": 2.0997942857260473e-05, + "loss": 0.844, + "step": 10260 + }, + { + "epoch": 0.4996469700289728, + "grad_norm": 1.9652541875839233, + "learning_rate": 2.099479240621943e-05, + "loss": 0.7455, + "step": 10261 + }, + { + "epoch": 0.49969566381808, + "grad_norm": 1.4892168045043945, + "learning_rate": 2.0991641930432834e-05, + "loss": 0.8419, + "step": 10262 + }, + { + "epoch": 0.4997443576071872, + "grad_norm": 2.052311658859253, + "learning_rate": 2.0988491429979057e-05, + "loss": 0.909, + "step": 10263 + }, + { + "epoch": 0.49979305139629443, + "grad_norm": 2.2332706451416016, + "learning_rate": 2.0985340904936465e-05, + "loss": 0.7877, + "step": 10264 + }, + { + "epoch": 0.4998417451854016, + "grad_norm": 2.0843207836151123, + "learning_rate": 2.098219035538342e-05, + "loss": 0.9431, + "step": 10265 + }, + { + "epoch": 0.4998904389745088, + "grad_norm": 1.7641664743423462, + "learning_rate": 2.0979039781398306e-05, + "loss": 0.7976, + "step": 10266 + }, + { + "epoch": 0.499939132763616, + "grad_norm": 2.97615909576416, + "learning_rate": 2.0975889183059494e-05, + "loss": 0.8468, + "step": 10267 + }, + { + "epoch": 0.4999878265527232, + "grad_norm": 1.5989303588867188, + "learning_rate": 2.097273856044534e-05, + "loss": 0.9215, + "step": 10268 + }, + { + "epoch": 0.5000365203418304, + "grad_norm": 1.8196231126785278, + "learning_rate": 2.0969587913634234e-05, + "loss": 0.9076, + "step": 10269 + }, + { + "epoch": 0.5000852141309376, + "grad_norm": 1.6314165592193604, + "learning_rate": 2.0966437242704532e-05, + "loss": 0.7729, + "step": 10270 + }, + { + "epoch": 0.5001339079200448, + "grad_norm": 8.670584678649902, + "learning_rate": 2.096328654773462e-05, + "loss": 0.8481, + "step": 10271 + }, + { + "epoch": 0.500182601709152, + "grad_norm": 1.657462477684021, + "learning_rate": 2.0960135828802865e-05, + "loss": 0.9448, + "step": 10272 + }, + { + "epoch": 0.5002312954982592, + "grad_norm": 2.9483675956726074, + "learning_rate": 2.0956985085987646e-05, + "loss": 0.8393, + "step": 10273 + }, + { + "epoch": 0.5002799892873664, + "grad_norm": 1.3994760513305664, + "learning_rate": 2.0953834319367338e-05, + "loss": 0.8451, + "step": 10274 + }, + { + "epoch": 0.5003286830764736, + "grad_norm": 1.4657272100448608, + "learning_rate": 2.095068352902031e-05, + "loss": 0.7814, + "step": 10275 + }, + { + "epoch": 0.5003773768655808, + "grad_norm": 2.1284735202789307, + "learning_rate": 2.0947532715024946e-05, + "loss": 0.7932, + "step": 10276 + }, + { + "epoch": 0.500426070654688, + "grad_norm": 1.6226184368133545, + "learning_rate": 2.0944381877459618e-05, + "loss": 0.7952, + "step": 10277 + }, + { + "epoch": 0.5004747644437952, + "grad_norm": 2.1044962406158447, + "learning_rate": 2.09412310164027e-05, + "loss": 0.9301, + "step": 10278 + }, + { + "epoch": 0.5005234582329023, + "grad_norm": 2.3511648178100586, + "learning_rate": 2.093808013193258e-05, + "loss": 0.7944, + "step": 10279 + }, + { + "epoch": 0.5005721520220096, + "grad_norm": 1.4376139640808105, + "learning_rate": 2.0934929224127626e-05, + "loss": 0.7977, + "step": 10280 + }, + { + "epoch": 0.5006208458111168, + "grad_norm": 1.7413781881332397, + "learning_rate": 2.093177829306623e-05, + "loss": 0.8441, + "step": 10281 + }, + { + "epoch": 0.500669539600224, + "grad_norm": 2.616499423980713, + "learning_rate": 2.092862733882676e-05, + "loss": 0.8035, + "step": 10282 + }, + { + "epoch": 0.5007182333893312, + "grad_norm": 1.6951264142990112, + "learning_rate": 2.0925476361487603e-05, + "loss": 0.8371, + "step": 10283 + }, + { + "epoch": 0.5007669271784384, + "grad_norm": 0.09279341995716095, + "learning_rate": 2.0922325361127137e-05, + "loss": 0.6229, + "step": 10284 + }, + { + "epoch": 0.5008156209675456, + "grad_norm": 3.1199734210968018, + "learning_rate": 2.0919174337823736e-05, + "loss": 0.8072, + "step": 10285 + }, + { + "epoch": 0.5008643147566528, + "grad_norm": 2.2128844261169434, + "learning_rate": 2.0916023291655792e-05, + "loss": 0.8113, + "step": 10286 + }, + { + "epoch": 0.50091300854576, + "grad_norm": 1.4066544771194458, + "learning_rate": 2.091287222270169e-05, + "loss": 0.6777, + "step": 10287 + }, + { + "epoch": 0.5009617023348671, + "grad_norm": 1.4766149520874023, + "learning_rate": 2.090972113103981e-05, + "loss": 0.7917, + "step": 10288 + }, + { + "epoch": 0.5010103961239744, + "grad_norm": 1.5719106197357178, + "learning_rate": 2.090657001674853e-05, + "loss": 0.8285, + "step": 10289 + }, + { + "epoch": 0.5010590899130816, + "grad_norm": 1.5853114128112793, + "learning_rate": 2.0903418879906235e-05, + "loss": 0.8686, + "step": 10290 + }, + { + "epoch": 0.5011077837021888, + "grad_norm": 1.6395766735076904, + "learning_rate": 2.0900267720591318e-05, + "loss": 0.8286, + "step": 10291 + }, + { + "epoch": 0.501156477491296, + "grad_norm": 1.4349642992019653, + "learning_rate": 2.0897116538882155e-05, + "loss": 0.853, + "step": 10292 + }, + { + "epoch": 0.5012051712804032, + "grad_norm": 2.5011658668518066, + "learning_rate": 2.0893965334857143e-05, + "loss": 0.9021, + "step": 10293 + }, + { + "epoch": 0.5012538650695104, + "grad_norm": 1.6832417249679565, + "learning_rate": 2.0890814108594653e-05, + "loss": 0.6774, + "step": 10294 + }, + { + "epoch": 0.5013025588586176, + "grad_norm": 1.5608420372009277, + "learning_rate": 2.0887662860173087e-05, + "loss": 0.9256, + "step": 10295 + }, + { + "epoch": 0.5013512526477247, + "grad_norm": 1.494348406791687, + "learning_rate": 2.088451158967083e-05, + "loss": 0.8294, + "step": 10296 + }, + { + "epoch": 0.501399946436832, + "grad_norm": 4.7692670822143555, + "learning_rate": 2.0881360297166256e-05, + "loss": 0.8738, + "step": 10297 + }, + { + "epoch": 0.5014486402259392, + "grad_norm": 1.7358258962631226, + "learning_rate": 2.0878208982737774e-05, + "loss": 0.8401, + "step": 10298 + }, + { + "epoch": 0.5014973340150464, + "grad_norm": 1.732946515083313, + "learning_rate": 2.087505764646376e-05, + "loss": 0.8008, + "step": 10299 + }, + { + "epoch": 0.5015460278041536, + "grad_norm": 1.7717205286026, + "learning_rate": 2.08719062884226e-05, + "loss": 0.8288, + "step": 10300 + }, + { + "epoch": 0.5015947215932608, + "grad_norm": 1.710904836654663, + "learning_rate": 2.0868754908692704e-05, + "loss": 0.7872, + "step": 10301 + }, + { + "epoch": 0.501643415382368, + "grad_norm": 1.4982070922851562, + "learning_rate": 2.086560350735244e-05, + "loss": 0.8078, + "step": 10302 + }, + { + "epoch": 0.5016921091714752, + "grad_norm": 1.4449125528335571, + "learning_rate": 2.0862452084480214e-05, + "loss": 0.8772, + "step": 10303 + }, + { + "epoch": 0.5017408029605824, + "grad_norm": 0.09249988943338394, + "learning_rate": 2.0859300640154415e-05, + "loss": 0.6078, + "step": 10304 + }, + { + "epoch": 0.5017894967496895, + "grad_norm": 1.5389084815979004, + "learning_rate": 2.0856149174453434e-05, + "loss": 0.8181, + "step": 10305 + }, + { + "epoch": 0.5018381905387967, + "grad_norm": 1.617520809173584, + "learning_rate": 2.0852997687455664e-05, + "loss": 0.8139, + "step": 10306 + }, + { + "epoch": 0.501886884327904, + "grad_norm": 1.5196465253829956, + "learning_rate": 2.0849846179239504e-05, + "loss": 0.7539, + "step": 10307 + }, + { + "epoch": 0.5019355781170112, + "grad_norm": 1.499930739402771, + "learning_rate": 2.0846694649883334e-05, + "loss": 0.8624, + "step": 10308 + }, + { + "epoch": 0.5019842719061184, + "grad_norm": 3.2346432209014893, + "learning_rate": 2.084354309946556e-05, + "loss": 0.7966, + "step": 10309 + }, + { + "epoch": 0.5020329656952256, + "grad_norm": 9.285652160644531, + "learning_rate": 2.084039152806457e-05, + "loss": 0.8767, + "step": 10310 + }, + { + "epoch": 0.5020816594843328, + "grad_norm": 1.2204753160476685, + "learning_rate": 2.083723993575877e-05, + "loss": 0.9381, + "step": 10311 + }, + { + "epoch": 0.50213035327344, + "grad_norm": 1.9312087297439575, + "learning_rate": 2.083408832262655e-05, + "loss": 0.8065, + "step": 10312 + }, + { + "epoch": 0.5021790470625471, + "grad_norm": 1.5641545057296753, + "learning_rate": 2.0830936688746307e-05, + "loss": 0.8402, + "step": 10313 + }, + { + "epoch": 0.5022277408516543, + "grad_norm": 1.4012994766235352, + "learning_rate": 2.082778503419644e-05, + "loss": 0.7947, + "step": 10314 + }, + { + "epoch": 0.5022764346407615, + "grad_norm": 1.3154244422912598, + "learning_rate": 2.082463335905534e-05, + "loss": 0.8395, + "step": 10315 + }, + { + "epoch": 0.5023251284298688, + "grad_norm": 2.603179693222046, + "learning_rate": 2.082148166340141e-05, + "loss": 0.8741, + "step": 10316 + }, + { + "epoch": 0.502373822218976, + "grad_norm": 1.9943350553512573, + "learning_rate": 2.0818329947313055e-05, + "loss": 0.7354, + "step": 10317 + }, + { + "epoch": 0.5024225160080832, + "grad_norm": 3.477766513824463, + "learning_rate": 2.0815178210868666e-05, + "loss": 0.8961, + "step": 10318 + }, + { + "epoch": 0.5024712097971904, + "grad_norm": 1.7344129085540771, + "learning_rate": 2.081202645414664e-05, + "loss": 0.9522, + "step": 10319 + }, + { + "epoch": 0.5025199035862976, + "grad_norm": 2.125002145767212, + "learning_rate": 2.0808874677225385e-05, + "loss": 0.8705, + "step": 10320 + }, + { + "epoch": 0.5025685973754047, + "grad_norm": 2.0301597118377686, + "learning_rate": 2.0805722880183296e-05, + "loss": 0.7147, + "step": 10321 + }, + { + "epoch": 0.5026172911645119, + "grad_norm": 1.830962896347046, + "learning_rate": 2.080257106309878e-05, + "loss": 0.8361, + "step": 10322 + }, + { + "epoch": 0.5026659849536191, + "grad_norm": 2.3923285007476807, + "learning_rate": 2.0799419226050232e-05, + "loss": 0.8873, + "step": 10323 + }, + { + "epoch": 0.5027146787427264, + "grad_norm": 1.454615831375122, + "learning_rate": 2.079626736911606e-05, + "loss": 0.8605, + "step": 10324 + }, + { + "epoch": 0.5027633725318336, + "grad_norm": 1.6657171249389648, + "learning_rate": 2.079311549237467e-05, + "loss": 0.7691, + "step": 10325 + }, + { + "epoch": 0.5028120663209408, + "grad_norm": 1.7206813097000122, + "learning_rate": 2.078996359590445e-05, + "loss": 0.7708, + "step": 10326 + }, + { + "epoch": 0.502860760110048, + "grad_norm": 3.005622148513794, + "learning_rate": 2.078681167978382e-05, + "loss": 0.9768, + "step": 10327 + }, + { + "epoch": 0.5029094538991552, + "grad_norm": 2.2648136615753174, + "learning_rate": 2.0783659744091177e-05, + "loss": 0.7925, + "step": 10328 + }, + { + "epoch": 0.5029581476882624, + "grad_norm": 1.4950039386749268, + "learning_rate": 2.0780507788904926e-05, + "loss": 0.7771, + "step": 10329 + }, + { + "epoch": 0.5030068414773695, + "grad_norm": 1.6313194036483765, + "learning_rate": 2.077735581430347e-05, + "loss": 0.8577, + "step": 10330 + }, + { + "epoch": 0.5030555352664767, + "grad_norm": 8.7286376953125, + "learning_rate": 2.0774203820365216e-05, + "loss": 0.8522, + "step": 10331 + }, + { + "epoch": 0.5031042290555839, + "grad_norm": 1.5745967626571655, + "learning_rate": 2.0771051807168576e-05, + "loss": 0.7724, + "step": 10332 + }, + { + "epoch": 0.5031529228446912, + "grad_norm": 1.475486159324646, + "learning_rate": 2.0767899774791952e-05, + "loss": 0.8481, + "step": 10333 + }, + { + "epoch": 0.5032016166337984, + "grad_norm": 1.5094398260116577, + "learning_rate": 2.0764747723313753e-05, + "loss": 0.8122, + "step": 10334 + }, + { + "epoch": 0.5032503104229056, + "grad_norm": 1.504563808441162, + "learning_rate": 2.076159565281238e-05, + "loss": 0.8147, + "step": 10335 + }, + { + "epoch": 0.5032990042120128, + "grad_norm": 1.8140037059783936, + "learning_rate": 2.0758443563366247e-05, + "loss": 0.8471, + "step": 10336 + }, + { + "epoch": 0.50334769800112, + "grad_norm": 2.2155416011810303, + "learning_rate": 2.0755291455053763e-05, + "loss": 0.8542, + "step": 10337 + }, + { + "epoch": 0.5033963917902271, + "grad_norm": 1.9549740552902222, + "learning_rate": 2.0752139327953333e-05, + "loss": 0.8182, + "step": 10338 + }, + { + "epoch": 0.5034450855793343, + "grad_norm": 2.1055991649627686, + "learning_rate": 2.074898718214337e-05, + "loss": 0.8247, + "step": 10339 + }, + { + "epoch": 0.5034937793684415, + "grad_norm": 1.3641973733901978, + "learning_rate": 2.0745835017702277e-05, + "loss": 0.791, + "step": 10340 + }, + { + "epoch": 0.5035424731575487, + "grad_norm": 3.2676525115966797, + "learning_rate": 2.074268283470848e-05, + "loss": 0.8238, + "step": 10341 + }, + { + "epoch": 0.503591166946656, + "grad_norm": 1.9982354640960693, + "learning_rate": 2.0739530633240373e-05, + "loss": 0.837, + "step": 10342 + }, + { + "epoch": 0.5036398607357632, + "grad_norm": 1.6159714460372925, + "learning_rate": 2.0736378413376378e-05, + "loss": 0.9385, + "step": 10343 + }, + { + "epoch": 0.5036885545248704, + "grad_norm": 1.652278184890747, + "learning_rate": 2.0733226175194902e-05, + "loss": 0.8519, + "step": 10344 + }, + { + "epoch": 0.5037372483139776, + "grad_norm": 1.6765249967575073, + "learning_rate": 2.073007391877435e-05, + "loss": 0.7829, + "step": 10345 + }, + { + "epoch": 0.5037859421030848, + "grad_norm": 1.913467526435852, + "learning_rate": 2.072692164419315e-05, + "loss": 0.8381, + "step": 10346 + }, + { + "epoch": 0.5038346358921919, + "grad_norm": 2.9878146648406982, + "learning_rate": 2.072376935152971e-05, + "loss": 0.7642, + "step": 10347 + }, + { + "epoch": 0.5038833296812991, + "grad_norm": 1.5146435499191284, + "learning_rate": 2.0720617040862437e-05, + "loss": 0.7698, + "step": 10348 + }, + { + "epoch": 0.5039320234704063, + "grad_norm": 1.7617545127868652, + "learning_rate": 2.071746471226975e-05, + "loss": 0.8302, + "step": 10349 + }, + { + "epoch": 0.5039807172595135, + "grad_norm": 1.4358621835708618, + "learning_rate": 2.0714312365830063e-05, + "loss": 0.77, + "step": 10350 + }, + { + "epoch": 0.5040294110486208, + "grad_norm": 2.160325527191162, + "learning_rate": 2.0711160001621793e-05, + "loss": 0.7919, + "step": 10351 + }, + { + "epoch": 0.504078104837728, + "grad_norm": 1.4634720087051392, + "learning_rate": 2.070800761972335e-05, + "loss": 0.8432, + "step": 10352 + }, + { + "epoch": 0.5041267986268352, + "grad_norm": 2.3678596019744873, + "learning_rate": 2.0704855220213153e-05, + "loss": 0.9181, + "step": 10353 + }, + { + "epoch": 0.5041754924159424, + "grad_norm": 1.7194873094558716, + "learning_rate": 2.0701702803169625e-05, + "loss": 0.7965, + "step": 10354 + }, + { + "epoch": 0.5042241862050495, + "grad_norm": 1.7636483907699585, + "learning_rate": 2.0698550368671167e-05, + "loss": 0.8323, + "step": 10355 + }, + { + "epoch": 0.5042728799941567, + "grad_norm": 1.9704539775848389, + "learning_rate": 2.069539791679621e-05, + "loss": 0.8356, + "step": 10356 + }, + { + "epoch": 0.5043215737832639, + "grad_norm": 3.1647496223449707, + "learning_rate": 2.0692245447623165e-05, + "loss": 0.8211, + "step": 10357 + }, + { + "epoch": 0.5043702675723711, + "grad_norm": 0.09101087599992752, + "learning_rate": 2.068909296123045e-05, + "loss": 0.6605, + "step": 10358 + }, + { + "epoch": 0.5044189613614783, + "grad_norm": 4.061710357666016, + "learning_rate": 2.0685940457696487e-05, + "loss": 0.7963, + "step": 10359 + }, + { + "epoch": 0.5044676551505856, + "grad_norm": 1.6279903650283813, + "learning_rate": 2.0682787937099684e-05, + "loss": 0.768, + "step": 10360 + }, + { + "epoch": 0.5045163489396928, + "grad_norm": 1.4351977109909058, + "learning_rate": 2.0679635399518482e-05, + "loss": 0.8942, + "step": 10361 + }, + { + "epoch": 0.5045650427288, + "grad_norm": 1.4739201068878174, + "learning_rate": 2.067648284503128e-05, + "loss": 0.8554, + "step": 10362 + }, + { + "epoch": 0.5046137365179071, + "grad_norm": 2.5628414154052734, + "learning_rate": 2.067333027371651e-05, + "loss": 0.9345, + "step": 10363 + }, + { + "epoch": 0.5046624303070143, + "grad_norm": 2.0160717964172363, + "learning_rate": 2.0670177685652586e-05, + "loss": 0.7443, + "step": 10364 + }, + { + "epoch": 0.5047111240961215, + "grad_norm": 1.5907543897628784, + "learning_rate": 2.066702508091793e-05, + "loss": 0.8403, + "step": 10365 + }, + { + "epoch": 0.5047598178852287, + "grad_norm": 1.474138617515564, + "learning_rate": 2.066387245959096e-05, + "loss": 0.8122, + "step": 10366 + }, + { + "epoch": 0.5048085116743359, + "grad_norm": 1.8090177774429321, + "learning_rate": 2.0660719821750107e-05, + "loss": 0.7697, + "step": 10367 + }, + { + "epoch": 0.5048572054634431, + "grad_norm": 1.7860898971557617, + "learning_rate": 2.065756716747379e-05, + "loss": 0.8485, + "step": 10368 + }, + { + "epoch": 0.5049058992525504, + "grad_norm": 0.08782325685024261, + "learning_rate": 2.0654414496840424e-05, + "loss": 0.5935, + "step": 10369 + }, + { + "epoch": 0.5049545930416576, + "grad_norm": 1.3898475170135498, + "learning_rate": 2.0651261809928442e-05, + "loss": 0.7766, + "step": 10370 + }, + { + "epoch": 0.5050032868307648, + "grad_norm": 1.371213436126709, + "learning_rate": 2.0648109106816263e-05, + "loss": 0.8438, + "step": 10371 + }, + { + "epoch": 0.5050519806198719, + "grad_norm": 1.3437607288360596, + "learning_rate": 2.0644956387582313e-05, + "loss": 0.8745, + "step": 10372 + }, + { + "epoch": 0.5051006744089791, + "grad_norm": 2.0857417583465576, + "learning_rate": 2.064180365230501e-05, + "loss": 0.7638, + "step": 10373 + }, + { + "epoch": 0.5051493681980863, + "grad_norm": 2.487560987472534, + "learning_rate": 2.063865090106278e-05, + "loss": 0.8761, + "step": 10374 + }, + { + "epoch": 0.5051980619871935, + "grad_norm": 2.0205626487731934, + "learning_rate": 2.0635498133934056e-05, + "loss": 0.778, + "step": 10375 + }, + { + "epoch": 0.5052467557763007, + "grad_norm": 1.5234103202819824, + "learning_rate": 2.0632345350997262e-05, + "loss": 0.8164, + "step": 10376 + }, + { + "epoch": 0.505295449565408, + "grad_norm": 3.254525661468506, + "learning_rate": 2.0629192552330815e-05, + "loss": 0.7573, + "step": 10377 + }, + { + "epoch": 0.5053441433545152, + "grad_norm": 1.3736791610717773, + "learning_rate": 2.062603973801314e-05, + "loss": 0.8928, + "step": 10378 + }, + { + "epoch": 0.5053928371436224, + "grad_norm": 1.6613247394561768, + "learning_rate": 2.0622886908122678e-05, + "loss": 0.7955, + "step": 10379 + }, + { + "epoch": 0.5054415309327295, + "grad_norm": 1.5074434280395508, + "learning_rate": 2.061973406273784e-05, + "loss": 0.8012, + "step": 10380 + }, + { + "epoch": 0.5054902247218367, + "grad_norm": 1.9201788902282715, + "learning_rate": 2.0616581201937062e-05, + "loss": 0.7709, + "step": 10381 + }, + { + "epoch": 0.5055389185109439, + "grad_norm": 1.4067344665527344, + "learning_rate": 2.0613428325798773e-05, + "loss": 0.8516, + "step": 10382 + }, + { + "epoch": 0.5055876123000511, + "grad_norm": 1.5911494493484497, + "learning_rate": 2.0610275434401402e-05, + "loss": 0.877, + "step": 10383 + }, + { + "epoch": 0.5056363060891583, + "grad_norm": 2.017721652984619, + "learning_rate": 2.060712252782337e-05, + "loss": 0.8836, + "step": 10384 + }, + { + "epoch": 0.5056849998782655, + "grad_norm": 1.3608592748641968, + "learning_rate": 2.0603969606143113e-05, + "loss": 0.8529, + "step": 10385 + }, + { + "epoch": 0.5057336936673728, + "grad_norm": 2.044156074523926, + "learning_rate": 2.0600816669439054e-05, + "loss": 0.7922, + "step": 10386 + }, + { + "epoch": 0.50578238745648, + "grad_norm": 1.3486782312393188, + "learning_rate": 2.0597663717789628e-05, + "loss": 0.8325, + "step": 10387 + }, + { + "epoch": 0.5058310812455872, + "grad_norm": 1.6911468505859375, + "learning_rate": 2.0594510751273255e-05, + "loss": 0.8934, + "step": 10388 + }, + { + "epoch": 0.5058797750346943, + "grad_norm": 1.6662870645523071, + "learning_rate": 2.0591357769968375e-05, + "loss": 0.7883, + "step": 10389 + }, + { + "epoch": 0.5059284688238015, + "grad_norm": 1.4887090921401978, + "learning_rate": 2.058820477395342e-05, + "loss": 0.8608, + "step": 10390 + }, + { + "epoch": 0.5059771626129087, + "grad_norm": 1.9782596826553345, + "learning_rate": 2.058505176330682e-05, + "loss": 0.8889, + "step": 10391 + }, + { + "epoch": 0.5060258564020159, + "grad_norm": 3.0591578483581543, + "learning_rate": 2.0581898738107e-05, + "loss": 0.9084, + "step": 10392 + }, + { + "epoch": 0.5060745501911231, + "grad_norm": 0.09569216519594193, + "learning_rate": 2.0578745698432397e-05, + "loss": 0.6056, + "step": 10393 + }, + { + "epoch": 0.5061232439802303, + "grad_norm": 1.9574238061904907, + "learning_rate": 2.0575592644361448e-05, + "loss": 0.7444, + "step": 10394 + }, + { + "epoch": 0.5061719377693376, + "grad_norm": 1.599118709564209, + "learning_rate": 2.0572439575972568e-05, + "loss": 0.8895, + "step": 10395 + }, + { + "epoch": 0.5062206315584448, + "grad_norm": 1.6632834672927856, + "learning_rate": 2.0569286493344208e-05, + "loss": 0.8143, + "step": 10396 + }, + { + "epoch": 0.5062693253475519, + "grad_norm": 1.9635322093963623, + "learning_rate": 2.0566133396554795e-05, + "loss": 0.8561, + "step": 10397 + }, + { + "epoch": 0.5063180191366591, + "grad_norm": 2.1006247997283936, + "learning_rate": 2.0562980285682764e-05, + "loss": 0.7958, + "step": 10398 + }, + { + "epoch": 0.5063667129257663, + "grad_norm": 1.597285509109497, + "learning_rate": 2.055982716080654e-05, + "loss": 0.8595, + "step": 10399 + }, + { + "epoch": 0.5064154067148735, + "grad_norm": 1.8224904537200928, + "learning_rate": 2.055667402200457e-05, + "loss": 0.8862, + "step": 10400 + }, + { + "epoch": 0.5064641005039807, + "grad_norm": 1.476113200187683, + "learning_rate": 2.0553520869355278e-05, + "loss": 0.9325, + "step": 10401 + }, + { + "epoch": 0.5065127942930879, + "grad_norm": 1.6998119354248047, + "learning_rate": 2.0550367702937107e-05, + "loss": 0.903, + "step": 10402 + }, + { + "epoch": 0.5065614880821951, + "grad_norm": 0.0920393168926239, + "learning_rate": 2.054721452282849e-05, + "loss": 0.586, + "step": 10403 + }, + { + "epoch": 0.5066101818713024, + "grad_norm": 1.4512643814086914, + "learning_rate": 2.0544061329107857e-05, + "loss": 0.8027, + "step": 10404 + }, + { + "epoch": 0.5066588756604096, + "grad_norm": 3.6380813121795654, + "learning_rate": 2.054090812185366e-05, + "loss": 0.7966, + "step": 10405 + }, + { + "epoch": 0.5067075694495167, + "grad_norm": 3.2608017921447754, + "learning_rate": 2.0537754901144315e-05, + "loss": 0.7711, + "step": 10406 + }, + { + "epoch": 0.5067562632386239, + "grad_norm": 2.7474637031555176, + "learning_rate": 2.053460166705827e-05, + "loss": 0.7956, + "step": 10407 + }, + { + "epoch": 0.5068049570277311, + "grad_norm": 1.7286081314086914, + "learning_rate": 2.0531448419673962e-05, + "loss": 0.778, + "step": 10408 + }, + { + "epoch": 0.5068536508168383, + "grad_norm": 1.9348856210708618, + "learning_rate": 2.0528295159069817e-05, + "loss": 0.9244, + "step": 10409 + }, + { + "epoch": 0.5069023446059455, + "grad_norm": 1.518886923789978, + "learning_rate": 2.0525141885324284e-05, + "loss": 0.8232, + "step": 10410 + }, + { + "epoch": 0.5069510383950527, + "grad_norm": 2.565619707107544, + "learning_rate": 2.0521988598515806e-05, + "loss": 0.8015, + "step": 10411 + }, + { + "epoch": 0.50699973218416, + "grad_norm": 1.8709666728973389, + "learning_rate": 2.0518835298722815e-05, + "loss": 0.7708, + "step": 10412 + }, + { + "epoch": 0.5070484259732672, + "grad_norm": 0.09502305090427399, + "learning_rate": 2.0515681986023744e-05, + "loss": 0.6055, + "step": 10413 + }, + { + "epoch": 0.5070971197623743, + "grad_norm": 2.3238580226898193, + "learning_rate": 2.051252866049704e-05, + "loss": 0.8104, + "step": 10414 + }, + { + "epoch": 0.5071458135514815, + "grad_norm": 1.7441744804382324, + "learning_rate": 2.0509375322221134e-05, + "loss": 0.7814, + "step": 10415 + }, + { + "epoch": 0.5071945073405887, + "grad_norm": 1.6357474327087402, + "learning_rate": 2.0506221971274477e-05, + "loss": 0.8263, + "step": 10416 + }, + { + "epoch": 0.5072432011296959, + "grad_norm": 1.6252806186676025, + "learning_rate": 2.0503068607735494e-05, + "loss": 0.7518, + "step": 10417 + }, + { + "epoch": 0.5072918949188031, + "grad_norm": 1.5847349166870117, + "learning_rate": 2.0499915231682638e-05, + "loss": 0.8956, + "step": 10418 + }, + { + "epoch": 0.5073405887079103, + "grad_norm": 1.764894962310791, + "learning_rate": 2.049676184319434e-05, + "loss": 0.8625, + "step": 10419 + }, + { + "epoch": 0.5073892824970175, + "grad_norm": 1.8649712800979614, + "learning_rate": 2.0493608442349053e-05, + "loss": 0.8113, + "step": 10420 + }, + { + "epoch": 0.5074379762861247, + "grad_norm": 1.4761255979537964, + "learning_rate": 2.0490455029225208e-05, + "loss": 0.7902, + "step": 10421 + }, + { + "epoch": 0.5074866700752318, + "grad_norm": 1.4616445302963257, + "learning_rate": 2.048730160390125e-05, + "loss": 0.8743, + "step": 10422 + }, + { + "epoch": 0.5075353638643391, + "grad_norm": 1.4502487182617188, + "learning_rate": 2.0484148166455622e-05, + "loss": 0.853, + "step": 10423 + }, + { + "epoch": 0.5075840576534463, + "grad_norm": 1.5382187366485596, + "learning_rate": 2.048099471696676e-05, + "loss": 0.9375, + "step": 10424 + }, + { + "epoch": 0.5076327514425535, + "grad_norm": 2.108181953430176, + "learning_rate": 2.0477841255513114e-05, + "loss": 0.8349, + "step": 10425 + }, + { + "epoch": 0.5076814452316607, + "grad_norm": 2.4439570903778076, + "learning_rate": 2.047468778217312e-05, + "loss": 0.7504, + "step": 10426 + }, + { + "epoch": 0.5077301390207679, + "grad_norm": 1.741075873374939, + "learning_rate": 2.0471534297025227e-05, + "loss": 0.8497, + "step": 10427 + }, + { + "epoch": 0.5077788328098751, + "grad_norm": 1.9457271099090576, + "learning_rate": 2.0468380800147875e-05, + "loss": 0.8759, + "step": 10428 + }, + { + "epoch": 0.5078275265989823, + "grad_norm": 1.1546872854232788, + "learning_rate": 2.0465227291619504e-05, + "loss": 0.8744, + "step": 10429 + }, + { + "epoch": 0.5078762203880895, + "grad_norm": 1.693058729171753, + "learning_rate": 2.046207377151857e-05, + "loss": 0.8563, + "step": 10430 + }, + { + "epoch": 0.5079249141771967, + "grad_norm": 1.867823839187622, + "learning_rate": 2.04589202399235e-05, + "loss": 0.8298, + "step": 10431 + }, + { + "epoch": 0.5079736079663039, + "grad_norm": 1.6925132274627686, + "learning_rate": 2.045576669691275e-05, + "loss": 0.8918, + "step": 10432 + }, + { + "epoch": 0.5080223017554111, + "grad_norm": 1.7933390140533447, + "learning_rate": 2.0452613142564766e-05, + "loss": 0.779, + "step": 10433 + }, + { + "epoch": 0.5080709955445183, + "grad_norm": 1.531496524810791, + "learning_rate": 2.0449459576957985e-05, + "loss": 0.7887, + "step": 10434 + }, + { + "epoch": 0.5081196893336255, + "grad_norm": 1.5329198837280273, + "learning_rate": 2.0446306000170858e-05, + "loss": 0.7247, + "step": 10435 + }, + { + "epoch": 0.5081683831227327, + "grad_norm": 1.7509766817092896, + "learning_rate": 2.0443152412281825e-05, + "loss": 0.8501, + "step": 10436 + }, + { + "epoch": 0.5082170769118399, + "grad_norm": 7.409929275512695, + "learning_rate": 2.0439998813369343e-05, + "loss": 0.8007, + "step": 10437 + }, + { + "epoch": 0.5082657707009471, + "grad_norm": 1.5805184841156006, + "learning_rate": 2.043684520351184e-05, + "loss": 0.9, + "step": 10438 + }, + { + "epoch": 0.5083144644900542, + "grad_norm": 2.7079427242279053, + "learning_rate": 2.043369158278778e-05, + "loss": 0.8977, + "step": 10439 + }, + { + "epoch": 0.5083631582791615, + "grad_norm": 1.6588976383209229, + "learning_rate": 2.0430537951275596e-05, + "loss": 0.8953, + "step": 10440 + }, + { + "epoch": 0.5084118520682687, + "grad_norm": 1.5558280944824219, + "learning_rate": 2.0427384309053747e-05, + "loss": 0.8572, + "step": 10441 + }, + { + "epoch": 0.5084605458573759, + "grad_norm": 1.769722819328308, + "learning_rate": 2.0424230656200672e-05, + "loss": 0.8093, + "step": 10442 + }, + { + "epoch": 0.5085092396464831, + "grad_norm": 1.3728868961334229, + "learning_rate": 2.0421076992794823e-05, + "loss": 0.8619, + "step": 10443 + }, + { + "epoch": 0.5085579334355903, + "grad_norm": 2.919372797012329, + "learning_rate": 2.0417923318914643e-05, + "loss": 0.8699, + "step": 10444 + }, + { + "epoch": 0.5086066272246975, + "grad_norm": 1.7747374773025513, + "learning_rate": 2.0414769634638584e-05, + "loss": 0.9928, + "step": 10445 + }, + { + "epoch": 0.5086553210138047, + "grad_norm": 2.9372034072875977, + "learning_rate": 2.041161594004509e-05, + "loss": 0.8581, + "step": 10446 + }, + { + "epoch": 0.5087040148029119, + "grad_norm": 1.7375918626785278, + "learning_rate": 2.0408462235212615e-05, + "loss": 0.9004, + "step": 10447 + }, + { + "epoch": 0.508752708592019, + "grad_norm": 2.2845942974090576, + "learning_rate": 2.0405308520219597e-05, + "loss": 0.7514, + "step": 10448 + }, + { + "epoch": 0.5088014023811263, + "grad_norm": 1.5174524784088135, + "learning_rate": 2.04021547951445e-05, + "loss": 0.8322, + "step": 10449 + }, + { + "epoch": 0.5088500961702335, + "grad_norm": 4.555256366729736, + "learning_rate": 2.0399001060065768e-05, + "loss": 0.8535, + "step": 10450 + }, + { + "epoch": 0.5088987899593407, + "grad_norm": 1.256176233291626, + "learning_rate": 2.0395847315061847e-05, + "loss": 0.8295, + "step": 10451 + }, + { + "epoch": 0.5089474837484479, + "grad_norm": 1.7576416730880737, + "learning_rate": 2.0392693560211186e-05, + "loss": 0.8165, + "step": 10452 + }, + { + "epoch": 0.5089961775375551, + "grad_norm": 1.5003868341445923, + "learning_rate": 2.0389539795592235e-05, + "loss": 0.7981, + "step": 10453 + }, + { + "epoch": 0.5090448713266623, + "grad_norm": 1.824434518814087, + "learning_rate": 2.0386386021283445e-05, + "loss": 0.8176, + "step": 10454 + }, + { + "epoch": 0.5090935651157695, + "grad_norm": 1.5047850608825684, + "learning_rate": 2.038323223736327e-05, + "loss": 0.809, + "step": 10455 + }, + { + "epoch": 0.5091422589048766, + "grad_norm": 2.148472309112549, + "learning_rate": 2.0380078443910162e-05, + "loss": 0.825, + "step": 10456 + }, + { + "epoch": 0.5091909526939838, + "grad_norm": 4.01572847366333, + "learning_rate": 2.0376924641002564e-05, + "loss": 0.812, + "step": 10457 + }, + { + "epoch": 0.5092396464830911, + "grad_norm": 1.4611700773239136, + "learning_rate": 2.037377082871893e-05, + "loss": 0.9499, + "step": 10458 + }, + { + "epoch": 0.5092883402721983, + "grad_norm": 1.849955677986145, + "learning_rate": 2.0370617007137717e-05, + "loss": 0.7681, + "step": 10459 + }, + { + "epoch": 0.5093370340613055, + "grad_norm": 3.1442930698394775, + "learning_rate": 2.036746317633737e-05, + "loss": 0.8493, + "step": 10460 + }, + { + "epoch": 0.5093857278504127, + "grad_norm": 1.6685274839401245, + "learning_rate": 2.036430933639634e-05, + "loss": 0.8313, + "step": 10461 + }, + { + "epoch": 0.5094344216395199, + "grad_norm": 1.3328403234481812, + "learning_rate": 2.0361155487393086e-05, + "loss": 0.9073, + "step": 10462 + }, + { + "epoch": 0.5094831154286271, + "grad_norm": 1.8569597005844116, + "learning_rate": 2.0358001629406053e-05, + "loss": 0.8473, + "step": 10463 + }, + { + "epoch": 0.5095318092177343, + "grad_norm": 2.5609664916992188, + "learning_rate": 2.0354847762513698e-05, + "loss": 0.7358, + "step": 10464 + }, + { + "epoch": 0.5095805030068414, + "grad_norm": 1.3742231130599976, + "learning_rate": 2.0351693886794477e-05, + "loss": 0.7823, + "step": 10465 + }, + { + "epoch": 0.5096291967959486, + "grad_norm": 2.040370225906372, + "learning_rate": 2.0348540002326836e-05, + "loss": 0.7927, + "step": 10466 + }, + { + "epoch": 0.5096778905850559, + "grad_norm": 2.316932201385498, + "learning_rate": 2.0345386109189223e-05, + "loss": 0.7672, + "step": 10467 + }, + { + "epoch": 0.5097265843741631, + "grad_norm": 1.7849053144454956, + "learning_rate": 2.0342232207460108e-05, + "loss": 0.8261, + "step": 10468 + }, + { + "epoch": 0.5097752781632703, + "grad_norm": 2.729133367538452, + "learning_rate": 2.0339078297217928e-05, + "loss": 0.8587, + "step": 10469 + }, + { + "epoch": 0.5098239719523775, + "grad_norm": 2.403109073638916, + "learning_rate": 2.033592437854115e-05, + "loss": 0.8438, + "step": 10470 + }, + { + "epoch": 0.5098726657414847, + "grad_norm": 3.080472469329834, + "learning_rate": 2.0332770451508223e-05, + "loss": 0.8095, + "step": 10471 + }, + { + "epoch": 0.5099213595305919, + "grad_norm": 2.843071937561035, + "learning_rate": 2.0329616516197602e-05, + "loss": 0.7551, + "step": 10472 + }, + { + "epoch": 0.509970053319699, + "grad_norm": 1.421958327293396, + "learning_rate": 2.0326462572687736e-05, + "loss": 0.8668, + "step": 10473 + }, + { + "epoch": 0.5100187471088062, + "grad_norm": 3.182790756225586, + "learning_rate": 2.0323308621057087e-05, + "loss": 0.8326, + "step": 10474 + }, + { + "epoch": 0.5100674408979134, + "grad_norm": 2.2798209190368652, + "learning_rate": 2.0320154661384104e-05, + "loss": 0.8207, + "step": 10475 + }, + { + "epoch": 0.5101161346870207, + "grad_norm": 3.2266035079956055, + "learning_rate": 2.0317000693747247e-05, + "loss": 0.7885, + "step": 10476 + }, + { + "epoch": 0.5101648284761279, + "grad_norm": 2.0558342933654785, + "learning_rate": 2.031384671822497e-05, + "loss": 0.8585, + "step": 10477 + }, + { + "epoch": 0.5102135222652351, + "grad_norm": 1.8963428735733032, + "learning_rate": 2.031069273489572e-05, + "loss": 0.7912, + "step": 10478 + }, + { + "epoch": 0.5102622160543423, + "grad_norm": 1.886406660079956, + "learning_rate": 2.0307538743837963e-05, + "loss": 0.7465, + "step": 10479 + }, + { + "epoch": 0.5103109098434495, + "grad_norm": 4.77662992477417, + "learning_rate": 2.0304384745130155e-05, + "loss": 0.7829, + "step": 10480 + }, + { + "epoch": 0.5103596036325566, + "grad_norm": 1.6137417554855347, + "learning_rate": 2.0301230738850746e-05, + "loss": 0.8653, + "step": 10481 + }, + { + "epoch": 0.5104082974216638, + "grad_norm": 1.4044837951660156, + "learning_rate": 2.0298076725078203e-05, + "loss": 0.7765, + "step": 10482 + }, + { + "epoch": 0.510456991210771, + "grad_norm": 0.09407170861959457, + "learning_rate": 2.029492270389096e-05, + "loss": 0.5868, + "step": 10483 + }, + { + "epoch": 0.5105056849998783, + "grad_norm": 1.8964849710464478, + "learning_rate": 2.0291768675367497e-05, + "loss": 0.6921, + "step": 10484 + }, + { + "epoch": 0.5105543787889855, + "grad_norm": 1.5740019083023071, + "learning_rate": 2.028861463958626e-05, + "loss": 0.8736, + "step": 10485 + }, + { + "epoch": 0.5106030725780927, + "grad_norm": 0.09941089898347855, + "learning_rate": 2.028546059662571e-05, + "loss": 0.6194, + "step": 10486 + }, + { + "epoch": 0.5106517663671999, + "grad_norm": 1.449677586555481, + "learning_rate": 2.02823065465643e-05, + "loss": 0.8734, + "step": 10487 + }, + { + "epoch": 0.5107004601563071, + "grad_norm": 1.5016679763793945, + "learning_rate": 2.027915248948048e-05, + "loss": 0.9284, + "step": 10488 + }, + { + "epoch": 0.5107491539454143, + "grad_norm": 2.2979369163513184, + "learning_rate": 2.0275998425452728e-05, + "loss": 0.8681, + "step": 10489 + }, + { + "epoch": 0.5107978477345214, + "grad_norm": 1.948157548904419, + "learning_rate": 2.027284435455948e-05, + "loss": 0.7512, + "step": 10490 + }, + { + "epoch": 0.5108465415236286, + "grad_norm": 1.71587073802948, + "learning_rate": 2.0269690276879213e-05, + "loss": 0.8769, + "step": 10491 + }, + { + "epoch": 0.5108952353127358, + "grad_norm": 1.8207505941390991, + "learning_rate": 2.0266536192490377e-05, + "loss": 0.8877, + "step": 10492 + }, + { + "epoch": 0.510943929101843, + "grad_norm": 3.5094716548919678, + "learning_rate": 2.0263382101471424e-05, + "loss": 0.891, + "step": 10493 + }, + { + "epoch": 0.5109926228909503, + "grad_norm": 1.4090555906295776, + "learning_rate": 2.0260228003900817e-05, + "loss": 0.8243, + "step": 10494 + }, + { + "epoch": 0.5110413166800575, + "grad_norm": 1.437103271484375, + "learning_rate": 2.0257073899857018e-05, + "loss": 0.8244, + "step": 10495 + }, + { + "epoch": 0.5110900104691647, + "grad_norm": 1.344854474067688, + "learning_rate": 2.0253919789418482e-05, + "loss": 0.8609, + "step": 10496 + }, + { + "epoch": 0.5111387042582719, + "grad_norm": 2.016286611557007, + "learning_rate": 2.025076567266366e-05, + "loss": 0.7924, + "step": 10497 + }, + { + "epoch": 0.511187398047379, + "grad_norm": 2.4696195125579834, + "learning_rate": 2.0247611549671022e-05, + "loss": 0.8506, + "step": 10498 + }, + { + "epoch": 0.5112360918364862, + "grad_norm": 2.096959352493286, + "learning_rate": 2.024445742051903e-05, + "loss": 0.8501, + "step": 10499 + }, + { + "epoch": 0.5112847856255934, + "grad_norm": 1.4656401872634888, + "learning_rate": 2.0241303285286135e-05, + "loss": 0.8514, + "step": 10500 + }, + { + "epoch": 0.5113334794147006, + "grad_norm": 1.8605550527572632, + "learning_rate": 2.0238149144050796e-05, + "loss": 0.9339, + "step": 10501 + }, + { + "epoch": 0.5113821732038079, + "grad_norm": 1.4431930780410767, + "learning_rate": 2.0234994996891482e-05, + "loss": 0.8473, + "step": 10502 + }, + { + "epoch": 0.5114308669929151, + "grad_norm": 1.7897894382476807, + "learning_rate": 2.0231840843886638e-05, + "loss": 0.7892, + "step": 10503 + }, + { + "epoch": 0.5114795607820223, + "grad_norm": 1.4684447050094604, + "learning_rate": 2.0228686685114732e-05, + "loss": 0.7926, + "step": 10504 + }, + { + "epoch": 0.5115282545711295, + "grad_norm": 1.570710301399231, + "learning_rate": 2.022553252065423e-05, + "loss": 0.8462, + "step": 10505 + }, + { + "epoch": 0.5115769483602367, + "grad_norm": 1.4810590744018555, + "learning_rate": 2.0222378350583587e-05, + "loss": 0.8156, + "step": 10506 + }, + { + "epoch": 0.5116256421493438, + "grad_norm": 1.6811540126800537, + "learning_rate": 2.0219224174981256e-05, + "loss": 0.8512, + "step": 10507 + }, + { + "epoch": 0.511674335938451, + "grad_norm": 1.8436399698257446, + "learning_rate": 2.0216069993925706e-05, + "loss": 0.8507, + "step": 10508 + }, + { + "epoch": 0.5117230297275582, + "grad_norm": 1.3750025033950806, + "learning_rate": 2.0212915807495396e-05, + "loss": 0.8994, + "step": 10509 + }, + { + "epoch": 0.5117717235166654, + "grad_norm": 0.09668323397636414, + "learning_rate": 2.0209761615768783e-05, + "loss": 0.5822, + "step": 10510 + }, + { + "epoch": 0.5118204173057727, + "grad_norm": 2.0218207836151123, + "learning_rate": 2.0206607418824338e-05, + "loss": 0.8387, + "step": 10511 + }, + { + "epoch": 0.5118691110948799, + "grad_norm": 1.9959708452224731, + "learning_rate": 2.0203453216740504e-05, + "loss": 0.7626, + "step": 10512 + }, + { + "epoch": 0.5119178048839871, + "grad_norm": 4.241205215454102, + "learning_rate": 2.020029900959576e-05, + "loss": 0.8878, + "step": 10513 + }, + { + "epoch": 0.5119664986730943, + "grad_norm": 1.7172259092330933, + "learning_rate": 2.0197144797468557e-05, + "loss": 0.7801, + "step": 10514 + }, + { + "epoch": 0.5120151924622014, + "grad_norm": 2.3081743717193604, + "learning_rate": 2.0193990580437363e-05, + "loss": 0.8612, + "step": 10515 + }, + { + "epoch": 0.5120638862513086, + "grad_norm": 2.320871591567993, + "learning_rate": 2.0190836358580636e-05, + "loss": 0.7568, + "step": 10516 + }, + { + "epoch": 0.5121125800404158, + "grad_norm": 1.6175256967544556, + "learning_rate": 2.0187682131976834e-05, + "loss": 0.7823, + "step": 10517 + }, + { + "epoch": 0.512161273829523, + "grad_norm": 6.082548141479492, + "learning_rate": 2.0184527900704418e-05, + "loss": 0.8025, + "step": 10518 + }, + { + "epoch": 0.5122099676186302, + "grad_norm": 1.4373717308044434, + "learning_rate": 2.018137366484185e-05, + "loss": 0.8375, + "step": 10519 + }, + { + "epoch": 0.5122586614077375, + "grad_norm": 1.3840938806533813, + "learning_rate": 2.0178219424467608e-05, + "loss": 0.8967, + "step": 10520 + }, + { + "epoch": 0.5123073551968447, + "grad_norm": 1.6062326431274414, + "learning_rate": 2.0175065179660137e-05, + "loss": 0.8594, + "step": 10521 + }, + { + "epoch": 0.5123560489859519, + "grad_norm": 1.5294381380081177, + "learning_rate": 2.0171910930497907e-05, + "loss": 0.9095, + "step": 10522 + }, + { + "epoch": 0.5124047427750591, + "grad_norm": 1.9229427576065063, + "learning_rate": 2.016875667705937e-05, + "loss": 0.7828, + "step": 10523 + }, + { + "epoch": 0.5124534365641662, + "grad_norm": 1.3407785892486572, + "learning_rate": 2.0165602419423e-05, + "loss": 0.8838, + "step": 10524 + }, + { + "epoch": 0.5125021303532734, + "grad_norm": 2.838325023651123, + "learning_rate": 2.0162448157667258e-05, + "loss": 0.8128, + "step": 10525 + }, + { + "epoch": 0.5125508241423806, + "grad_norm": 1.9727591276168823, + "learning_rate": 2.0159293891870595e-05, + "loss": 0.839, + "step": 10526 + }, + { + "epoch": 0.5125995179314878, + "grad_norm": 2.9474778175354004, + "learning_rate": 2.0156139622111485e-05, + "loss": 0.824, + "step": 10527 + }, + { + "epoch": 0.512648211720595, + "grad_norm": 1.861742377281189, + "learning_rate": 2.0152985348468386e-05, + "loss": 0.7264, + "step": 10528 + }, + { + "epoch": 0.5126969055097023, + "grad_norm": 3.1044514179229736, + "learning_rate": 2.014983107101976e-05, + "loss": 0.7852, + "step": 10529 + }, + { + "epoch": 0.5127455992988095, + "grad_norm": 1.3885242938995361, + "learning_rate": 2.014667678984408e-05, + "loss": 0.8675, + "step": 10530 + }, + { + "epoch": 0.5127942930879167, + "grad_norm": 2.533205986022949, + "learning_rate": 2.0143522505019805e-05, + "loss": 0.8685, + "step": 10531 + }, + { + "epoch": 0.5128429868770238, + "grad_norm": 2.3551595211029053, + "learning_rate": 2.0140368216625387e-05, + "loss": 0.8012, + "step": 10532 + }, + { + "epoch": 0.512891680666131, + "grad_norm": 1.7702802419662476, + "learning_rate": 2.0137213924739292e-05, + "loss": 0.8646, + "step": 10533 + }, + { + "epoch": 0.5129403744552382, + "grad_norm": 1.223353624343872, + "learning_rate": 2.0134059629439994e-05, + "loss": 0.8565, + "step": 10534 + }, + { + "epoch": 0.5129890682443454, + "grad_norm": 1.908029556274414, + "learning_rate": 2.0130905330805954e-05, + "loss": 0.8116, + "step": 10535 + }, + { + "epoch": 0.5130377620334526, + "grad_norm": 1.6849884986877441, + "learning_rate": 2.012775102891563e-05, + "loss": 0.899, + "step": 10536 + }, + { + "epoch": 0.5130864558225599, + "grad_norm": 2.512784004211426, + "learning_rate": 2.012459672384748e-05, + "loss": 0.9021, + "step": 10537 + }, + { + "epoch": 0.5131351496116671, + "grad_norm": 4.357310771942139, + "learning_rate": 2.0121442415679984e-05, + "loss": 0.904, + "step": 10538 + }, + { + "epoch": 0.5131838434007743, + "grad_norm": 2.6490676403045654, + "learning_rate": 2.0118288104491594e-05, + "loss": 0.7879, + "step": 10539 + }, + { + "epoch": 0.5132325371898814, + "grad_norm": 2.160541296005249, + "learning_rate": 2.0115133790360782e-05, + "loss": 0.8798, + "step": 10540 + }, + { + "epoch": 0.5132812309789886, + "grad_norm": 1.4207489490509033, + "learning_rate": 2.0111979473365998e-05, + "loss": 0.7748, + "step": 10541 + }, + { + "epoch": 0.5133299247680958, + "grad_norm": 1.4512611627578735, + "learning_rate": 2.0108825153585716e-05, + "loss": 0.8122, + "step": 10542 + }, + { + "epoch": 0.513378618557203, + "grad_norm": 2.079350709915161, + "learning_rate": 2.0105670831098402e-05, + "loss": 0.8038, + "step": 10543 + }, + { + "epoch": 0.5134273123463102, + "grad_norm": 1.4024229049682617, + "learning_rate": 2.0102516505982514e-05, + "loss": 0.8277, + "step": 10544 + }, + { + "epoch": 0.5134760061354174, + "grad_norm": 1.6008243560791016, + "learning_rate": 2.009936217831652e-05, + "loss": 0.8783, + "step": 10545 + }, + { + "epoch": 0.5135246999245247, + "grad_norm": 1.742789387702942, + "learning_rate": 2.0096207848178883e-05, + "loss": 0.8107, + "step": 10546 + }, + { + "epoch": 0.5135733937136319, + "grad_norm": 7.753567218780518, + "learning_rate": 2.009305351564806e-05, + "loss": 0.8136, + "step": 10547 + }, + { + "epoch": 0.5136220875027391, + "grad_norm": 1.4234073162078857, + "learning_rate": 2.008989918080253e-05, + "loss": 0.8193, + "step": 10548 + }, + { + "epoch": 0.5136707812918462, + "grad_norm": 2.955402374267578, + "learning_rate": 2.0086744843720746e-05, + "loss": 0.7241, + "step": 10549 + }, + { + "epoch": 0.5137194750809534, + "grad_norm": 2.458949089050293, + "learning_rate": 2.008359050448118e-05, + "loss": 0.8581, + "step": 10550 + }, + { + "epoch": 0.5137681688700606, + "grad_norm": 1.4354753494262695, + "learning_rate": 2.008043616316229e-05, + "loss": 0.7936, + "step": 10551 + }, + { + "epoch": 0.5138168626591678, + "grad_norm": 1.6353894472122192, + "learning_rate": 2.0077281819842542e-05, + "loss": 0.855, + "step": 10552 + }, + { + "epoch": 0.513865556448275, + "grad_norm": 1.2892954349517822, + "learning_rate": 2.0074127474600406e-05, + "loss": 0.824, + "step": 10553 + }, + { + "epoch": 0.5139142502373822, + "grad_norm": 1.4499900341033936, + "learning_rate": 2.0070973127514343e-05, + "loss": 0.794, + "step": 10554 + }, + { + "epoch": 0.5139629440264895, + "grad_norm": 1.5008962154388428, + "learning_rate": 2.006781877866281e-05, + "loss": 0.902, + "step": 10555 + }, + { + "epoch": 0.5140116378155967, + "grad_norm": 2.1705641746520996, + "learning_rate": 2.0064664428124278e-05, + "loss": 0.8363, + "step": 10556 + }, + { + "epoch": 0.5140603316047038, + "grad_norm": 2.344129800796509, + "learning_rate": 2.0061510075977217e-05, + "loss": 0.9093, + "step": 10557 + }, + { + "epoch": 0.514109025393811, + "grad_norm": 1.7805593013763428, + "learning_rate": 2.0058355722300086e-05, + "loss": 0.9001, + "step": 10558 + }, + { + "epoch": 0.5141577191829182, + "grad_norm": 1.7800476551055908, + "learning_rate": 2.0055201367171354e-05, + "loss": 0.884, + "step": 10559 + }, + { + "epoch": 0.5142064129720254, + "grad_norm": 1.3133118152618408, + "learning_rate": 2.0052047010669484e-05, + "loss": 0.9244, + "step": 10560 + }, + { + "epoch": 0.5142551067611326, + "grad_norm": 2.1879677772521973, + "learning_rate": 2.0048892652872937e-05, + "loss": 0.8048, + "step": 10561 + }, + { + "epoch": 0.5143038005502398, + "grad_norm": 1.5659180879592896, + "learning_rate": 2.0045738293860176e-05, + "loss": 0.7877, + "step": 10562 + }, + { + "epoch": 0.514352494339347, + "grad_norm": 1.7546764612197876, + "learning_rate": 2.0042583933709677e-05, + "loss": 0.8008, + "step": 10563 + }, + { + "epoch": 0.5144011881284543, + "grad_norm": 1.4630006551742554, + "learning_rate": 2.0039429572499895e-05, + "loss": 0.869, + "step": 10564 + }, + { + "epoch": 0.5144498819175615, + "grad_norm": 1.1726101636886597, + "learning_rate": 2.0036275210309304e-05, + "loss": 0.8491, + "step": 10565 + }, + { + "epoch": 0.5144985757066686, + "grad_norm": 2.535120725631714, + "learning_rate": 2.003312084721636e-05, + "loss": 0.8572, + "step": 10566 + }, + { + "epoch": 0.5145472694957758, + "grad_norm": 1.6860861778259277, + "learning_rate": 2.002996648329953e-05, + "loss": 0.9405, + "step": 10567 + }, + { + "epoch": 0.514595963284883, + "grad_norm": 1.6010091304779053, + "learning_rate": 2.0026812118637283e-05, + "loss": 0.8208, + "step": 10568 + }, + { + "epoch": 0.5146446570739902, + "grad_norm": 3.800337314605713, + "learning_rate": 2.002365775330809e-05, + "loss": 0.9176, + "step": 10569 + }, + { + "epoch": 0.5146933508630974, + "grad_norm": 1.8652435541152954, + "learning_rate": 2.0020503387390394e-05, + "loss": 0.8545, + "step": 10570 + }, + { + "epoch": 0.5147420446522046, + "grad_norm": 0.09142090380191803, + "learning_rate": 2.001734902096268e-05, + "loss": 0.5837, + "step": 10571 + }, + { + "epoch": 0.5147907384413118, + "grad_norm": 1.8573904037475586, + "learning_rate": 2.0014194654103415e-05, + "loss": 0.8458, + "step": 10572 + }, + { + "epoch": 0.5148394322304191, + "grad_norm": 1.3554856777191162, + "learning_rate": 2.0011040286891047e-05, + "loss": 0.7828, + "step": 10573 + }, + { + "epoch": 0.5148881260195262, + "grad_norm": 1.8703608512878418, + "learning_rate": 2.0007885919404056e-05, + "loss": 0.8675, + "step": 10574 + }, + { + "epoch": 0.5149368198086334, + "grad_norm": 3.775244951248169, + "learning_rate": 2.0004731551720904e-05, + "loss": 0.8801, + "step": 10575 + }, + { + "epoch": 0.5149855135977406, + "grad_norm": 1.4761061668395996, + "learning_rate": 2.0001577183920044e-05, + "loss": 0.7209, + "step": 10576 + }, + { + "epoch": 0.5150342073868478, + "grad_norm": 1.4651808738708496, + "learning_rate": 1.999842281607996e-05, + "loss": 0.7815, + "step": 10577 + }, + { + "epoch": 0.515082901175955, + "grad_norm": 1.741759181022644, + "learning_rate": 1.9995268448279103e-05, + "loss": 0.7911, + "step": 10578 + }, + { + "epoch": 0.5151315949650622, + "grad_norm": 3.936380624771118, + "learning_rate": 1.9992114080595954e-05, + "loss": 0.8979, + "step": 10579 + }, + { + "epoch": 0.5151802887541694, + "grad_norm": 1.731166958808899, + "learning_rate": 1.9988959713108957e-05, + "loss": 0.8108, + "step": 10580 + }, + { + "epoch": 0.5152289825432766, + "grad_norm": 2.613898992538452, + "learning_rate": 1.9985805345896598e-05, + "loss": 0.7744, + "step": 10581 + }, + { + "epoch": 0.5152776763323837, + "grad_norm": 1.6886107921600342, + "learning_rate": 1.9982650979037324e-05, + "loss": 0.7565, + "step": 10582 + }, + { + "epoch": 0.515326370121491, + "grad_norm": 1.3201512098312378, + "learning_rate": 1.997949661260961e-05, + "loss": 0.9392, + "step": 10583 + }, + { + "epoch": 0.5153750639105982, + "grad_norm": 2.775747537612915, + "learning_rate": 1.997634224669192e-05, + "loss": 0.7847, + "step": 10584 + }, + { + "epoch": 0.5154237576997054, + "grad_norm": 1.8948864936828613, + "learning_rate": 1.9973187881362716e-05, + "loss": 0.8029, + "step": 10585 + }, + { + "epoch": 0.5154724514888126, + "grad_norm": 1.7922990322113037, + "learning_rate": 1.9970033516700476e-05, + "loss": 0.8412, + "step": 10586 + }, + { + "epoch": 0.5155211452779198, + "grad_norm": 1.6554104089736938, + "learning_rate": 1.9966879152783645e-05, + "loss": 0.9147, + "step": 10587 + }, + { + "epoch": 0.515569839067027, + "grad_norm": 2.7907536029815674, + "learning_rate": 1.9963724789690706e-05, + "loss": 0.8393, + "step": 10588 + }, + { + "epoch": 0.5156185328561342, + "grad_norm": 2.1083765029907227, + "learning_rate": 1.996057042750011e-05, + "loss": 0.8807, + "step": 10589 + }, + { + "epoch": 0.5156672266452414, + "grad_norm": 6.895627021789551, + "learning_rate": 1.995741606629033e-05, + "loss": 0.8674, + "step": 10590 + }, + { + "epoch": 0.5157159204343486, + "grad_norm": 1.6844993829727173, + "learning_rate": 1.995426170613983e-05, + "loss": 0.6714, + "step": 10591 + }, + { + "epoch": 0.5157646142234558, + "grad_norm": 1.4688611030578613, + "learning_rate": 1.995110734712707e-05, + "loss": 0.8061, + "step": 10592 + }, + { + "epoch": 0.515813308012563, + "grad_norm": 2.200904607772827, + "learning_rate": 1.9947952989330523e-05, + "loss": 0.796, + "step": 10593 + }, + { + "epoch": 0.5158620018016702, + "grad_norm": 1.5494922399520874, + "learning_rate": 1.994479863282865e-05, + "loss": 0.8841, + "step": 10594 + }, + { + "epoch": 0.5159106955907774, + "grad_norm": 2.35958194732666, + "learning_rate": 1.9941644277699917e-05, + "loss": 0.8076, + "step": 10595 + }, + { + "epoch": 0.5159593893798846, + "grad_norm": 1.4833756685256958, + "learning_rate": 1.9938489924022786e-05, + "loss": 0.816, + "step": 10596 + }, + { + "epoch": 0.5160080831689918, + "grad_norm": 1.6948528289794922, + "learning_rate": 1.993533557187573e-05, + "loss": 0.8457, + "step": 10597 + }, + { + "epoch": 0.516056776958099, + "grad_norm": 4.2094268798828125, + "learning_rate": 1.99321812213372e-05, + "loss": 0.8266, + "step": 10598 + }, + { + "epoch": 0.5161054707472061, + "grad_norm": 1.3045951128005981, + "learning_rate": 1.992902687248567e-05, + "loss": 0.7886, + "step": 10599 + }, + { + "epoch": 0.5161541645363134, + "grad_norm": 1.702445149421692, + "learning_rate": 1.9925872525399597e-05, + "loss": 0.8915, + "step": 10600 + }, + { + "epoch": 0.5162028583254206, + "grad_norm": 0.09864954650402069, + "learning_rate": 1.9922718180157464e-05, + "loss": 0.5913, + "step": 10601 + }, + { + "epoch": 0.5162515521145278, + "grad_norm": 1.63698410987854, + "learning_rate": 1.9919563836837717e-05, + "loss": 0.8928, + "step": 10602 + }, + { + "epoch": 0.516300245903635, + "grad_norm": 2.0660674571990967, + "learning_rate": 1.9916409495518824e-05, + "loss": 0.8945, + "step": 10603 + }, + { + "epoch": 0.5163489396927422, + "grad_norm": 2.2916197776794434, + "learning_rate": 1.991325515627926e-05, + "loss": 0.696, + "step": 10604 + }, + { + "epoch": 0.5163976334818494, + "grad_norm": 3.752423048019409, + "learning_rate": 1.9910100819197474e-05, + "loss": 0.8791, + "step": 10605 + }, + { + "epoch": 0.5164463272709566, + "grad_norm": 1.8036643266677856, + "learning_rate": 1.9906946484351942e-05, + "loss": 0.8451, + "step": 10606 + }, + { + "epoch": 0.5164950210600638, + "grad_norm": 3.050628185272217, + "learning_rate": 1.9903792151821124e-05, + "loss": 0.832, + "step": 10607 + }, + { + "epoch": 0.5165437148491709, + "grad_norm": 1.6793488264083862, + "learning_rate": 1.990063782168349e-05, + "loss": 0.8727, + "step": 10608 + }, + { + "epoch": 0.5165924086382782, + "grad_norm": 1.843079924583435, + "learning_rate": 1.989748349401749e-05, + "loss": 0.8372, + "step": 10609 + }, + { + "epoch": 0.5166411024273854, + "grad_norm": 2.303194284439087, + "learning_rate": 1.989432916890161e-05, + "loss": 0.818, + "step": 10610 + }, + { + "epoch": 0.5166897962164926, + "grad_norm": 2.826259136199951, + "learning_rate": 1.989117484641429e-05, + "loss": 0.8813, + "step": 10611 + }, + { + "epoch": 0.5167384900055998, + "grad_norm": 0.09097691625356674, + "learning_rate": 1.9888020526634005e-05, + "loss": 0.6819, + "step": 10612 + }, + { + "epoch": 0.516787183794707, + "grad_norm": 2.4203481674194336, + "learning_rate": 1.9884866209639225e-05, + "loss": 0.9255, + "step": 10613 + }, + { + "epoch": 0.5168358775838142, + "grad_norm": 1.7297706604003906, + "learning_rate": 1.9881711895508405e-05, + "loss": 0.9318, + "step": 10614 + }, + { + "epoch": 0.5168845713729214, + "grad_norm": 0.09329155087471008, + "learning_rate": 1.9878557584320023e-05, + "loss": 0.57, + "step": 10615 + }, + { + "epoch": 0.5169332651620285, + "grad_norm": 1.4980833530426025, + "learning_rate": 1.987540327615252e-05, + "loss": 0.923, + "step": 10616 + }, + { + "epoch": 0.5169819589511357, + "grad_norm": 1.7530605792999268, + "learning_rate": 1.987224897108438e-05, + "loss": 0.8318, + "step": 10617 + }, + { + "epoch": 0.517030652740243, + "grad_norm": 1.5789145231246948, + "learning_rate": 1.9869094669194053e-05, + "loss": 0.7569, + "step": 10618 + }, + { + "epoch": 0.5170793465293502, + "grad_norm": 2.7806570529937744, + "learning_rate": 1.986594037056001e-05, + "loss": 0.8317, + "step": 10619 + }, + { + "epoch": 0.5171280403184574, + "grad_norm": 1.9771612882614136, + "learning_rate": 1.986278607526071e-05, + "loss": 0.8322, + "step": 10620 + }, + { + "epoch": 0.5171767341075646, + "grad_norm": 1.5823475122451782, + "learning_rate": 1.9859631783374623e-05, + "loss": 0.8393, + "step": 10621 + }, + { + "epoch": 0.5172254278966718, + "grad_norm": 2.4444453716278076, + "learning_rate": 1.9856477494980205e-05, + "loss": 0.9166, + "step": 10622 + }, + { + "epoch": 0.517274121685779, + "grad_norm": 1.63729989528656, + "learning_rate": 1.985332321015592e-05, + "loss": 0.8521, + "step": 10623 + }, + { + "epoch": 0.5173228154748862, + "grad_norm": 2.2105844020843506, + "learning_rate": 1.985016892898024e-05, + "loss": 0.8679, + "step": 10624 + }, + { + "epoch": 0.5173715092639933, + "grad_norm": 1.4465835094451904, + "learning_rate": 1.9847014651531617e-05, + "loss": 0.8415, + "step": 10625 + }, + { + "epoch": 0.5174202030531005, + "grad_norm": 2.22373628616333, + "learning_rate": 1.9843860377888525e-05, + "loss": 0.8103, + "step": 10626 + }, + { + "epoch": 0.5174688968422078, + "grad_norm": 3.062486410140991, + "learning_rate": 1.984070610812941e-05, + "loss": 0.8046, + "step": 10627 + }, + { + "epoch": 0.517517590631315, + "grad_norm": 2.1913037300109863, + "learning_rate": 1.9837551842332755e-05, + "loss": 0.7913, + "step": 10628 + }, + { + "epoch": 0.5175662844204222, + "grad_norm": 0.09425997734069824, + "learning_rate": 1.9834397580577005e-05, + "loss": 0.6488, + "step": 10629 + }, + { + "epoch": 0.5176149782095294, + "grad_norm": 2.044870376586914, + "learning_rate": 1.9831243322940637e-05, + "loss": 0.8214, + "step": 10630 + }, + { + "epoch": 0.5176636719986366, + "grad_norm": 2.2282586097717285, + "learning_rate": 1.98280890695021e-05, + "loss": 0.8237, + "step": 10631 + }, + { + "epoch": 0.5177123657877438, + "grad_norm": 1.5865694284439087, + "learning_rate": 1.9824934820339866e-05, + "loss": 0.9014, + "step": 10632 + }, + { + "epoch": 0.5177610595768509, + "grad_norm": 1.6517142057418823, + "learning_rate": 1.98217805755324e-05, + "loss": 0.7556, + "step": 10633 + }, + { + "epoch": 0.5178097533659581, + "grad_norm": 1.8966846466064453, + "learning_rate": 1.981862633515815e-05, + "loss": 0.7746, + "step": 10634 + }, + { + "epoch": 0.5178584471550653, + "grad_norm": 6.251481533050537, + "learning_rate": 1.9815472099295586e-05, + "loss": 0.8643, + "step": 10635 + }, + { + "epoch": 0.5179071409441726, + "grad_norm": 2.646826982498169, + "learning_rate": 1.9812317868023172e-05, + "loss": 0.7676, + "step": 10636 + }, + { + "epoch": 0.5179558347332798, + "grad_norm": 1.986855149269104, + "learning_rate": 1.9809163641419374e-05, + "loss": 0.8458, + "step": 10637 + }, + { + "epoch": 0.518004528522387, + "grad_norm": 1.7857215404510498, + "learning_rate": 1.9806009419562643e-05, + "loss": 0.8539, + "step": 10638 + }, + { + "epoch": 0.5180532223114942, + "grad_norm": 1.9844889640808105, + "learning_rate": 1.9802855202531453e-05, + "loss": 0.8493, + "step": 10639 + }, + { + "epoch": 0.5181019161006014, + "grad_norm": 2.249985456466675, + "learning_rate": 1.9799700990404247e-05, + "loss": 0.8684, + "step": 10640 + }, + { + "epoch": 0.5181506098897085, + "grad_norm": 7.203118324279785, + "learning_rate": 1.97965467832595e-05, + "loss": 0.8148, + "step": 10641 + }, + { + "epoch": 0.5181993036788157, + "grad_norm": 2.4838082790374756, + "learning_rate": 1.979339258117567e-05, + "loss": 0.7649, + "step": 10642 + }, + { + "epoch": 0.5182479974679229, + "grad_norm": 2.9168715476989746, + "learning_rate": 1.979023838423122e-05, + "loss": 0.8702, + "step": 10643 + }, + { + "epoch": 0.5182966912570302, + "grad_norm": 5.998945236206055, + "learning_rate": 1.978708419250461e-05, + "loss": 0.8524, + "step": 10644 + }, + { + "epoch": 0.5183453850461374, + "grad_norm": 2.027130126953125, + "learning_rate": 1.9783930006074298e-05, + "loss": 0.8141, + "step": 10645 + }, + { + "epoch": 0.5183940788352446, + "grad_norm": 2.3152987957000732, + "learning_rate": 1.978077582501875e-05, + "loss": 0.8131, + "step": 10646 + }, + { + "epoch": 0.5184427726243518, + "grad_norm": 2.4081528186798096, + "learning_rate": 1.977762164941642e-05, + "loss": 0.8458, + "step": 10647 + }, + { + "epoch": 0.518491466413459, + "grad_norm": 0.09491817653179169, + "learning_rate": 1.9774467479345775e-05, + "loss": 0.6562, + "step": 10648 + }, + { + "epoch": 0.5185401602025662, + "grad_norm": 4.35409688949585, + "learning_rate": 1.977131331488527e-05, + "loss": 0.8523, + "step": 10649 + }, + { + "epoch": 0.5185888539916733, + "grad_norm": 2.940178871154785, + "learning_rate": 1.976815915611337e-05, + "loss": 0.8959, + "step": 10650 + }, + { + "epoch": 0.5186375477807805, + "grad_norm": 3.613450050354004, + "learning_rate": 1.9765005003108525e-05, + "loss": 0.96, + "step": 10651 + }, + { + "epoch": 0.5186862415698877, + "grad_norm": 1.4877055883407593, + "learning_rate": 1.9761850855949204e-05, + "loss": 0.7656, + "step": 10652 + }, + { + "epoch": 0.518734935358995, + "grad_norm": 0.10904218256473541, + "learning_rate": 1.9758696714713872e-05, + "loss": 0.6205, + "step": 10653 + }, + { + "epoch": 0.5187836291481022, + "grad_norm": 2.854374647140503, + "learning_rate": 1.9755542579480975e-05, + "loss": 0.7762, + "step": 10654 + }, + { + "epoch": 0.5188323229372094, + "grad_norm": 2.488739013671875, + "learning_rate": 1.9752388450328984e-05, + "loss": 0.8251, + "step": 10655 + }, + { + "epoch": 0.5188810167263166, + "grad_norm": 1.7082630395889282, + "learning_rate": 1.9749234327336344e-05, + "loss": 0.8867, + "step": 10656 + }, + { + "epoch": 0.5189297105154238, + "grad_norm": 1.5260165929794312, + "learning_rate": 1.974608021058153e-05, + "loss": 0.8321, + "step": 10657 + }, + { + "epoch": 0.5189784043045309, + "grad_norm": 1.5615932941436768, + "learning_rate": 1.974292610014299e-05, + "loss": 0.9252, + "step": 10658 + }, + { + "epoch": 0.5190270980936381, + "grad_norm": 1.872933030128479, + "learning_rate": 1.9739771996099193e-05, + "loss": 0.7914, + "step": 10659 + }, + { + "epoch": 0.5190757918827453, + "grad_norm": 2.451730251312256, + "learning_rate": 1.9736617898528583e-05, + "loss": 0.9089, + "step": 10660 + }, + { + "epoch": 0.5191244856718525, + "grad_norm": 3.1184353828430176, + "learning_rate": 1.9733463807509626e-05, + "loss": 0.9706, + "step": 10661 + }, + { + "epoch": 0.5191731794609598, + "grad_norm": 1.6063125133514404, + "learning_rate": 1.9730309723120793e-05, + "loss": 0.8671, + "step": 10662 + }, + { + "epoch": 0.519221873250067, + "grad_norm": 1.4527561664581299, + "learning_rate": 1.972715564544052e-05, + "loss": 0.8962, + "step": 10663 + }, + { + "epoch": 0.5192705670391742, + "grad_norm": 1.5893908739089966, + "learning_rate": 1.972400157454728e-05, + "loss": 0.8567, + "step": 10664 + }, + { + "epoch": 0.5193192608282814, + "grad_norm": 1.672325849533081, + "learning_rate": 1.9720847510519518e-05, + "loss": 0.7217, + "step": 10665 + }, + { + "epoch": 0.5193679546173886, + "grad_norm": 2.362896680831909, + "learning_rate": 1.971769345343571e-05, + "loss": 0.8553, + "step": 10666 + }, + { + "epoch": 0.5194166484064957, + "grad_norm": 2.1860768795013428, + "learning_rate": 1.9714539403374297e-05, + "loss": 0.8474, + "step": 10667 + }, + { + "epoch": 0.5194653421956029, + "grad_norm": 2.5883262157440186, + "learning_rate": 1.9711385360413747e-05, + "loss": 0.8596, + "step": 10668 + }, + { + "epoch": 0.5195140359847101, + "grad_norm": 1.8791584968566895, + "learning_rate": 1.970823132463251e-05, + "loss": 0.7755, + "step": 10669 + }, + { + "epoch": 0.5195627297738173, + "grad_norm": 1.805430293083191, + "learning_rate": 1.9705077296109046e-05, + "loss": 0.8233, + "step": 10670 + }, + { + "epoch": 0.5196114235629246, + "grad_norm": 0.09314437955617905, + "learning_rate": 1.9701923274921807e-05, + "loss": 0.5859, + "step": 10671 + }, + { + "epoch": 0.5196601173520318, + "grad_norm": 1.7689896821975708, + "learning_rate": 1.9698769261149257e-05, + "loss": 0.8895, + "step": 10672 + }, + { + "epoch": 0.519708811141139, + "grad_norm": 1.8807936906814575, + "learning_rate": 1.9695615254869848e-05, + "loss": 0.813, + "step": 10673 + }, + { + "epoch": 0.5197575049302462, + "grad_norm": 1.656768560409546, + "learning_rate": 1.9692461256162037e-05, + "loss": 0.8268, + "step": 10674 + }, + { + "epoch": 0.5198061987193533, + "grad_norm": 1.7593159675598145, + "learning_rate": 1.9689307265104288e-05, + "loss": 0.7057, + "step": 10675 + }, + { + "epoch": 0.5198548925084605, + "grad_norm": 0.094717837870121, + "learning_rate": 1.9686153281775037e-05, + "loss": 0.6458, + "step": 10676 + }, + { + "epoch": 0.5199035862975677, + "grad_norm": 1.781304955482483, + "learning_rate": 1.9682999306252763e-05, + "loss": 0.7317, + "step": 10677 + }, + { + "epoch": 0.5199522800866749, + "grad_norm": 1.2713406085968018, + "learning_rate": 1.9679845338615902e-05, + "loss": 0.8015, + "step": 10678 + }, + { + "epoch": 0.5200009738757821, + "grad_norm": 1.8640004396438599, + "learning_rate": 1.967669137894292e-05, + "loss": 0.8618, + "step": 10679 + }, + { + "epoch": 0.5200496676648894, + "grad_norm": 1.5304385423660278, + "learning_rate": 1.9673537427312267e-05, + "loss": 0.8205, + "step": 10680 + }, + { + "epoch": 0.5200983614539966, + "grad_norm": 2.2023565769195557, + "learning_rate": 1.96703834838024e-05, + "loss": 0.8184, + "step": 10681 + }, + { + "epoch": 0.5201470552431038, + "grad_norm": 1.9686278104782104, + "learning_rate": 1.9667229548491783e-05, + "loss": 0.777, + "step": 10682 + }, + { + "epoch": 0.520195749032211, + "grad_norm": 1.5132395029067993, + "learning_rate": 1.966407562145885e-05, + "loss": 0.8791, + "step": 10683 + }, + { + "epoch": 0.5202444428213181, + "grad_norm": 2.0709218978881836, + "learning_rate": 1.9660921702782075e-05, + "loss": 0.9104, + "step": 10684 + }, + { + "epoch": 0.5202931366104253, + "grad_norm": 3.5954184532165527, + "learning_rate": 1.96577677925399e-05, + "loss": 0.9489, + "step": 10685 + }, + { + "epoch": 0.5203418303995325, + "grad_norm": 5.264892578125, + "learning_rate": 1.965461389081078e-05, + "loss": 0.7416, + "step": 10686 + }, + { + "epoch": 0.5203905241886397, + "grad_norm": 2.899550676345825, + "learning_rate": 1.9651459997673174e-05, + "loss": 0.8698, + "step": 10687 + }, + { + "epoch": 0.520439217977747, + "grad_norm": 2.4489269256591797, + "learning_rate": 1.9648306113205536e-05, + "loss": 0.8344, + "step": 10688 + }, + { + "epoch": 0.5204879117668542, + "grad_norm": 2.0307328701019287, + "learning_rate": 1.9645152237486305e-05, + "loss": 0.7501, + "step": 10689 + }, + { + "epoch": 0.5205366055559614, + "grad_norm": 1.889377236366272, + "learning_rate": 1.9641998370593957e-05, + "loss": 0.7997, + "step": 10690 + }, + { + "epoch": 0.5205852993450686, + "grad_norm": 2.6661863327026367, + "learning_rate": 1.9638844512606924e-05, + "loss": 0.7988, + "step": 10691 + }, + { + "epoch": 0.5206339931341757, + "grad_norm": 1.416534423828125, + "learning_rate": 1.9635690663603665e-05, + "loss": 0.8122, + "step": 10692 + }, + { + "epoch": 0.5206826869232829, + "grad_norm": 2.3275842666625977, + "learning_rate": 1.9632536823662638e-05, + "loss": 0.869, + "step": 10693 + }, + { + "epoch": 0.5207313807123901, + "grad_norm": 2.019073963165283, + "learning_rate": 1.9629382992862286e-05, + "loss": 0.8298, + "step": 10694 + }, + { + "epoch": 0.5207800745014973, + "grad_norm": 2.9316089153289795, + "learning_rate": 1.9626229171281075e-05, + "loss": 0.8963, + "step": 10695 + }, + { + "epoch": 0.5208287682906045, + "grad_norm": 4.142888069152832, + "learning_rate": 1.9623075358997442e-05, + "loss": 0.8471, + "step": 10696 + }, + { + "epoch": 0.5208774620797118, + "grad_norm": 0.09744197875261307, + "learning_rate": 1.9619921556089848e-05, + "loss": 0.6045, + "step": 10697 + }, + { + "epoch": 0.520926155868819, + "grad_norm": 2.0503780841827393, + "learning_rate": 1.9616767762636734e-05, + "loss": 0.9047, + "step": 10698 + }, + { + "epoch": 0.5209748496579262, + "grad_norm": 1.7123240232467651, + "learning_rate": 1.961361397871656e-05, + "loss": 0.7885, + "step": 10699 + }, + { + "epoch": 0.5210235434470333, + "grad_norm": 2.3772218227386475, + "learning_rate": 1.9610460204407768e-05, + "loss": 0.8352, + "step": 10700 + }, + { + "epoch": 0.5210722372361405, + "grad_norm": 1.6081563234329224, + "learning_rate": 1.960730643978882e-05, + "loss": 0.7924, + "step": 10701 + }, + { + "epoch": 0.5211209310252477, + "grad_norm": 2.335894823074341, + "learning_rate": 1.960415268493816e-05, + "loss": 0.9134, + "step": 10702 + }, + { + "epoch": 0.5211696248143549, + "grad_norm": 1.791736364364624, + "learning_rate": 1.9600998939934236e-05, + "loss": 0.9051, + "step": 10703 + }, + { + "epoch": 0.5212183186034621, + "grad_norm": 1.3758511543273926, + "learning_rate": 1.9597845204855505e-05, + "loss": 0.9111, + "step": 10704 + }, + { + "epoch": 0.5212670123925693, + "grad_norm": 1.8283504247665405, + "learning_rate": 1.9594691479780403e-05, + "loss": 0.9422, + "step": 10705 + }, + { + "epoch": 0.5213157061816766, + "grad_norm": 1.5234321355819702, + "learning_rate": 1.9591537764787395e-05, + "loss": 0.7831, + "step": 10706 + }, + { + "epoch": 0.5213643999707838, + "grad_norm": 1.7663081884384155, + "learning_rate": 1.9588384059954916e-05, + "loss": 0.7834, + "step": 10707 + }, + { + "epoch": 0.521413093759891, + "grad_norm": 1.7101666927337646, + "learning_rate": 1.9585230365361426e-05, + "loss": 0.7973, + "step": 10708 + }, + { + "epoch": 0.5214617875489981, + "grad_norm": 1.7190883159637451, + "learning_rate": 1.9582076681085364e-05, + "loss": 0.8616, + "step": 10709 + }, + { + "epoch": 0.5215104813381053, + "grad_norm": 3.2015092372894287, + "learning_rate": 1.9578923007205187e-05, + "loss": 0.7733, + "step": 10710 + }, + { + "epoch": 0.5215591751272125, + "grad_norm": 2.4329824447631836, + "learning_rate": 1.9575769343799335e-05, + "loss": 0.8812, + "step": 10711 + }, + { + "epoch": 0.5216078689163197, + "grad_norm": 2.1897311210632324, + "learning_rate": 1.9572615690946256e-05, + "loss": 0.854, + "step": 10712 + }, + { + "epoch": 0.5216565627054269, + "grad_norm": 2.441209316253662, + "learning_rate": 1.956946204872441e-05, + "loss": 0.7338, + "step": 10713 + }, + { + "epoch": 0.5217052564945341, + "grad_norm": 1.4370249509811401, + "learning_rate": 1.956630841721223e-05, + "loss": 0.8462, + "step": 10714 + }, + { + "epoch": 0.5217539502836414, + "grad_norm": 1.1905920505523682, + "learning_rate": 1.9563154796488167e-05, + "loss": 0.786, + "step": 10715 + }, + { + "epoch": 0.5218026440727486, + "grad_norm": 3.4113340377807617, + "learning_rate": 1.9560001186630667e-05, + "loss": 0.7888, + "step": 10716 + }, + { + "epoch": 0.5218513378618557, + "grad_norm": 1.7015217542648315, + "learning_rate": 1.9556847587718185e-05, + "loss": 0.7653, + "step": 10717 + }, + { + "epoch": 0.5219000316509629, + "grad_norm": 1.958426594734192, + "learning_rate": 1.955369399982915e-05, + "loss": 0.8663, + "step": 10718 + }, + { + "epoch": 0.5219487254400701, + "grad_norm": 2.295361280441284, + "learning_rate": 1.9550540423042025e-05, + "loss": 0.7808, + "step": 10719 + }, + { + "epoch": 0.5219974192291773, + "grad_norm": 1.7372912168502808, + "learning_rate": 1.954738685743524e-05, + "loss": 0.8149, + "step": 10720 + }, + { + "epoch": 0.5220461130182845, + "grad_norm": 1.7516076564788818, + "learning_rate": 1.9544233303087252e-05, + "loss": 0.8513, + "step": 10721 + }, + { + "epoch": 0.5220948068073917, + "grad_norm": 13.76406478881836, + "learning_rate": 1.9541079760076505e-05, + "loss": 0.7719, + "step": 10722 + }, + { + "epoch": 0.5221435005964989, + "grad_norm": 1.8349330425262451, + "learning_rate": 1.9537926228481434e-05, + "loss": 0.7839, + "step": 10723 + }, + { + "epoch": 0.5221921943856062, + "grad_norm": 1.3719854354858398, + "learning_rate": 1.95347727083805e-05, + "loss": 0.8083, + "step": 10724 + }, + { + "epoch": 0.5222408881747134, + "grad_norm": 1.5673943758010864, + "learning_rate": 1.953161919985213e-05, + "loss": 0.7678, + "step": 10725 + }, + { + "epoch": 0.5222895819638205, + "grad_norm": 5.952001094818115, + "learning_rate": 1.9528465702974783e-05, + "loss": 0.8239, + "step": 10726 + }, + { + "epoch": 0.5223382757529277, + "grad_norm": 2.3951873779296875, + "learning_rate": 1.9525312217826883e-05, + "loss": 0.8932, + "step": 10727 + }, + { + "epoch": 0.5223869695420349, + "grad_norm": 1.4861434698104858, + "learning_rate": 1.9522158744486893e-05, + "loss": 0.8743, + "step": 10728 + }, + { + "epoch": 0.5224356633311421, + "grad_norm": 1.5930105447769165, + "learning_rate": 1.9519005283033248e-05, + "loss": 0.8632, + "step": 10729 + }, + { + "epoch": 0.5224843571202493, + "grad_norm": 1.714112401008606, + "learning_rate": 1.9515851833544385e-05, + "loss": 0.916, + "step": 10730 + }, + { + "epoch": 0.5225330509093565, + "grad_norm": 1.7522175312042236, + "learning_rate": 1.9512698396098754e-05, + "loss": 0.8124, + "step": 10731 + }, + { + "epoch": 0.5225817446984637, + "grad_norm": 1.5849686861038208, + "learning_rate": 1.9509544970774795e-05, + "loss": 0.8408, + "step": 10732 + }, + { + "epoch": 0.522630438487571, + "grad_norm": 2.042048215866089, + "learning_rate": 1.9506391557650953e-05, + "loss": 0.8722, + "step": 10733 + }, + { + "epoch": 0.5226791322766781, + "grad_norm": 2.8008604049682617, + "learning_rate": 1.950323815680566e-05, + "loss": 0.8439, + "step": 10734 + }, + { + "epoch": 0.5227278260657853, + "grad_norm": 2.6008896827697754, + "learning_rate": 1.9500084768317372e-05, + "loss": 0.7759, + "step": 10735 + }, + { + "epoch": 0.5227765198548925, + "grad_norm": 2.2651805877685547, + "learning_rate": 1.9496931392264513e-05, + "loss": 0.9015, + "step": 10736 + }, + { + "epoch": 0.5228252136439997, + "grad_norm": 2.8119630813598633, + "learning_rate": 1.9493778028725536e-05, + "loss": 0.8502, + "step": 10737 + }, + { + "epoch": 0.5228739074331069, + "grad_norm": 1.5941044092178345, + "learning_rate": 1.949062467777887e-05, + "loss": 0.8812, + "step": 10738 + }, + { + "epoch": 0.5229226012222141, + "grad_norm": 1.9163230657577515, + "learning_rate": 1.9487471339502972e-05, + "loss": 0.914, + "step": 10739 + }, + { + "epoch": 0.5229712950113213, + "grad_norm": 1.3762961626052856, + "learning_rate": 1.9484318013976262e-05, + "loss": 0.8433, + "step": 10740 + }, + { + "epoch": 0.5230199888004285, + "grad_norm": 1.625829815864563, + "learning_rate": 1.948116470127719e-05, + "loss": 0.8802, + "step": 10741 + }, + { + "epoch": 0.5230686825895356, + "grad_norm": 1.9438681602478027, + "learning_rate": 1.9478011401484197e-05, + "loss": 0.9415, + "step": 10742 + }, + { + "epoch": 0.5231173763786429, + "grad_norm": 1.656036376953125, + "learning_rate": 1.9474858114675715e-05, + "loss": 0.8459, + "step": 10743 + }, + { + "epoch": 0.5231660701677501, + "grad_norm": 3.631237745285034, + "learning_rate": 1.9471704840930186e-05, + "loss": 0.8537, + "step": 10744 + }, + { + "epoch": 0.5232147639568573, + "grad_norm": 1.2876616716384888, + "learning_rate": 1.9468551580326045e-05, + "loss": 0.8939, + "step": 10745 + }, + { + "epoch": 0.5232634577459645, + "grad_norm": 1.4123294353485107, + "learning_rate": 1.946539833294174e-05, + "loss": 0.7517, + "step": 10746 + }, + { + "epoch": 0.5233121515350717, + "grad_norm": 0.09545982629060745, + "learning_rate": 1.9462245098855692e-05, + "loss": 0.6376, + "step": 10747 + }, + { + "epoch": 0.5233608453241789, + "grad_norm": 1.5726807117462158, + "learning_rate": 1.9459091878146355e-05, + "loss": 0.9197, + "step": 10748 + }, + { + "epoch": 0.5234095391132861, + "grad_norm": 1.9650717973709106, + "learning_rate": 1.9455938670892146e-05, + "loss": 0.8045, + "step": 10749 + }, + { + "epoch": 0.5234582329023934, + "grad_norm": 1.9045813083648682, + "learning_rate": 1.9452785477171515e-05, + "loss": 0.8569, + "step": 10750 + }, + { + "epoch": 0.5235069266915005, + "grad_norm": 1.3822935819625854, + "learning_rate": 1.9449632297062896e-05, + "loss": 0.9619, + "step": 10751 + }, + { + "epoch": 0.5235556204806077, + "grad_norm": 7.367873668670654, + "learning_rate": 1.9446479130644722e-05, + "loss": 0.7931, + "step": 10752 + }, + { + "epoch": 0.5236043142697149, + "grad_norm": 1.517212152481079, + "learning_rate": 1.944332597799544e-05, + "loss": 0.7787, + "step": 10753 + }, + { + "epoch": 0.5236530080588221, + "grad_norm": 1.4021003246307373, + "learning_rate": 1.944017283919346e-05, + "loss": 0.8225, + "step": 10754 + }, + { + "epoch": 0.5237017018479293, + "grad_norm": 1.678759217262268, + "learning_rate": 1.943701971431725e-05, + "loss": 0.839, + "step": 10755 + }, + { + "epoch": 0.5237503956370365, + "grad_norm": 1.6108204126358032, + "learning_rate": 1.9433866603445212e-05, + "loss": 0.8339, + "step": 10756 + }, + { + "epoch": 0.5237990894261437, + "grad_norm": 1.6323800086975098, + "learning_rate": 1.94307135066558e-05, + "loss": 0.8294, + "step": 10757 + }, + { + "epoch": 0.5238477832152509, + "grad_norm": 1.9289963245391846, + "learning_rate": 1.942756042402744e-05, + "loss": 0.8547, + "step": 10758 + }, + { + "epoch": 0.523896477004358, + "grad_norm": 1.775679588317871, + "learning_rate": 1.9424407355638562e-05, + "loss": 0.797, + "step": 10759 + }, + { + "epoch": 0.5239451707934653, + "grad_norm": 1.6530194282531738, + "learning_rate": 1.9421254301567606e-05, + "loss": 0.7556, + "step": 10760 + }, + { + "epoch": 0.5239938645825725, + "grad_norm": 1.5409815311431885, + "learning_rate": 1.9418101261893e-05, + "loss": 0.8648, + "step": 10761 + }, + { + "epoch": 0.5240425583716797, + "grad_norm": 1.4008946418762207, + "learning_rate": 1.9414948236693187e-05, + "loss": 0.8957, + "step": 10762 + }, + { + "epoch": 0.5240912521607869, + "grad_norm": 1.1962244510650635, + "learning_rate": 1.941179522604658e-05, + "loss": 0.8224, + "step": 10763 + }, + { + "epoch": 0.5241399459498941, + "grad_norm": 1.689447283744812, + "learning_rate": 1.940864223003163e-05, + "loss": 0.7729, + "step": 10764 + }, + { + "epoch": 0.5241886397390013, + "grad_norm": 1.5387969017028809, + "learning_rate": 1.9405489248726748e-05, + "loss": 0.8617, + "step": 10765 + }, + { + "epoch": 0.5242373335281085, + "grad_norm": 2.4767396450042725, + "learning_rate": 1.9402336282210382e-05, + "loss": 0.874, + "step": 10766 + }, + { + "epoch": 0.5242860273172157, + "grad_norm": 2.3550684452056885, + "learning_rate": 1.9399183330560953e-05, + "loss": 0.886, + "step": 10767 + }, + { + "epoch": 0.5243347211063228, + "grad_norm": 1.3668701648712158, + "learning_rate": 1.9396030393856897e-05, + "loss": 0.9019, + "step": 10768 + }, + { + "epoch": 0.5243834148954301, + "grad_norm": 2.2581045627593994, + "learning_rate": 1.9392877472176635e-05, + "loss": 0.8073, + "step": 10769 + }, + { + "epoch": 0.5244321086845373, + "grad_norm": 1.47897469997406, + "learning_rate": 1.93897245655986e-05, + "loss": 0.8918, + "step": 10770 + }, + { + "epoch": 0.5244808024736445, + "grad_norm": 1.354517936706543, + "learning_rate": 1.938657167420123e-05, + "loss": 0.8998, + "step": 10771 + }, + { + "epoch": 0.5245294962627517, + "grad_norm": 2.2291271686553955, + "learning_rate": 1.9383418798062938e-05, + "loss": 0.8696, + "step": 10772 + }, + { + "epoch": 0.5245781900518589, + "grad_norm": 1.4921993017196655, + "learning_rate": 1.9380265937262165e-05, + "loss": 0.8212, + "step": 10773 + }, + { + "epoch": 0.5246268838409661, + "grad_norm": 0.09582458436489105, + "learning_rate": 1.9377113091877325e-05, + "loss": 0.6863, + "step": 10774 + }, + { + "epoch": 0.5246755776300733, + "grad_norm": 1.4021719694137573, + "learning_rate": 1.9373960261986865e-05, + "loss": 0.876, + "step": 10775 + }, + { + "epoch": 0.5247242714191804, + "grad_norm": 1.5613362789154053, + "learning_rate": 1.9370807447669195e-05, + "loss": 0.7599, + "step": 10776 + }, + { + "epoch": 0.5247729652082876, + "grad_norm": 3.1159045696258545, + "learning_rate": 1.936765464900275e-05, + "loss": 0.803, + "step": 10777 + }, + { + "epoch": 0.5248216589973949, + "grad_norm": 1.6427276134490967, + "learning_rate": 1.9364501866065947e-05, + "loss": 0.7953, + "step": 10778 + }, + { + "epoch": 0.5248703527865021, + "grad_norm": 1.411165714263916, + "learning_rate": 1.9361349098937224e-05, + "loss": 0.8489, + "step": 10779 + }, + { + "epoch": 0.5249190465756093, + "grad_norm": 1.6732217073440552, + "learning_rate": 1.9358196347694993e-05, + "loss": 0.9466, + "step": 10780 + }, + { + "epoch": 0.5249677403647165, + "grad_norm": 1.70326828956604, + "learning_rate": 1.9355043612417694e-05, + "loss": 0.9524, + "step": 10781 + }, + { + "epoch": 0.5250164341538237, + "grad_norm": 2.5441977977752686, + "learning_rate": 1.9351890893183744e-05, + "loss": 0.8522, + "step": 10782 + }, + { + "epoch": 0.5250651279429309, + "grad_norm": 1.9716072082519531, + "learning_rate": 1.9348738190071558e-05, + "loss": 0.8278, + "step": 10783 + }, + { + "epoch": 0.5251138217320381, + "grad_norm": 1.843206763267517, + "learning_rate": 1.9345585503159582e-05, + "loss": 0.8334, + "step": 10784 + }, + { + "epoch": 0.5251625155211452, + "grad_norm": 1.7725199460983276, + "learning_rate": 1.9342432832526214e-05, + "loss": 0.6925, + "step": 10785 + }, + { + "epoch": 0.5252112093102524, + "grad_norm": 1.724092721939087, + "learning_rate": 1.9339280178249903e-05, + "loss": 0.8561, + "step": 10786 + }, + { + "epoch": 0.5252599030993597, + "grad_norm": 1.7178444862365723, + "learning_rate": 1.9336127540409043e-05, + "loss": 0.785, + "step": 10787 + }, + { + "epoch": 0.5253085968884669, + "grad_norm": 2.882970094680786, + "learning_rate": 1.933297491908208e-05, + "loss": 0.8947, + "step": 10788 + }, + { + "epoch": 0.5253572906775741, + "grad_norm": 4.637271881103516, + "learning_rate": 1.932982231434742e-05, + "loss": 0.8543, + "step": 10789 + }, + { + "epoch": 0.5254059844666813, + "grad_norm": 2.885911703109741, + "learning_rate": 1.9326669726283492e-05, + "loss": 0.9016, + "step": 10790 + }, + { + "epoch": 0.5254546782557885, + "grad_norm": 1.5068070888519287, + "learning_rate": 1.9323517154968725e-05, + "loss": 0.7818, + "step": 10791 + }, + { + "epoch": 0.5255033720448957, + "grad_norm": 2.0122334957122803, + "learning_rate": 1.932036460048152e-05, + "loss": 0.8122, + "step": 10792 + }, + { + "epoch": 0.5255520658340028, + "grad_norm": 1.6369564533233643, + "learning_rate": 1.931721206290032e-05, + "loss": 0.7912, + "step": 10793 + }, + { + "epoch": 0.52560075962311, + "grad_norm": 1.4599074125289917, + "learning_rate": 1.931405954230352e-05, + "loss": 0.9367, + "step": 10794 + }, + { + "epoch": 0.5256494534122172, + "grad_norm": 2.469008207321167, + "learning_rate": 1.9310907038769556e-05, + "loss": 0.8334, + "step": 10795 + }, + { + "epoch": 0.5256981472013245, + "grad_norm": 3.572941541671753, + "learning_rate": 1.9307754552376842e-05, + "loss": 0.8857, + "step": 10796 + }, + { + "epoch": 0.5257468409904317, + "grad_norm": 1.3597500324249268, + "learning_rate": 1.93046020832038e-05, + "loss": 0.8699, + "step": 10797 + }, + { + "epoch": 0.5257955347795389, + "grad_norm": 2.6841719150543213, + "learning_rate": 1.9301449631328837e-05, + "loss": 0.8033, + "step": 10798 + }, + { + "epoch": 0.5258442285686461, + "grad_norm": 1.6071481704711914, + "learning_rate": 1.929829719683039e-05, + "loss": 0.8511, + "step": 10799 + }, + { + "epoch": 0.5258929223577533, + "grad_norm": 1.3441280126571655, + "learning_rate": 1.929514477978685e-05, + "loss": 0.7671, + "step": 10800 + }, + { + "epoch": 0.5259416161468604, + "grad_norm": 0.08773824572563171, + "learning_rate": 1.929199238027665e-05, + "loss": 0.6258, + "step": 10801 + }, + { + "epoch": 0.5259903099359676, + "grad_norm": 3.2002756595611572, + "learning_rate": 1.9288839998378213e-05, + "loss": 0.8556, + "step": 10802 + }, + { + "epoch": 0.5260390037250748, + "grad_norm": 1.4752155542373657, + "learning_rate": 1.9285687634169937e-05, + "loss": 0.8404, + "step": 10803 + }, + { + "epoch": 0.526087697514182, + "grad_norm": 1.5650662183761597, + "learning_rate": 1.9282535287730258e-05, + "loss": 0.9389, + "step": 10804 + }, + { + "epoch": 0.5261363913032893, + "grad_norm": 0.09063815325498581, + "learning_rate": 1.9279382959137566e-05, + "loss": 0.5908, + "step": 10805 + }, + { + "epoch": 0.5261850850923965, + "grad_norm": 1.322822093963623, + "learning_rate": 1.92762306484703e-05, + "loss": 0.6739, + "step": 10806 + }, + { + "epoch": 0.5262337788815037, + "grad_norm": 1.654691219329834, + "learning_rate": 1.9273078355806852e-05, + "loss": 0.7976, + "step": 10807 + }, + { + "epoch": 0.5262824726706109, + "grad_norm": 1.2167354822158813, + "learning_rate": 1.9269926081225653e-05, + "loss": 0.8418, + "step": 10808 + }, + { + "epoch": 0.5263311664597181, + "grad_norm": 2.0925498008728027, + "learning_rate": 1.9266773824805105e-05, + "loss": 0.7476, + "step": 10809 + }, + { + "epoch": 0.5263798602488252, + "grad_norm": 1.4722177982330322, + "learning_rate": 1.926362158662363e-05, + "loss": 0.8219, + "step": 10810 + }, + { + "epoch": 0.5264285540379324, + "grad_norm": 1.9890650510787964, + "learning_rate": 1.9260469366759633e-05, + "loss": 0.7541, + "step": 10811 + }, + { + "epoch": 0.5264772478270396, + "grad_norm": 1.8070396184921265, + "learning_rate": 1.9257317165291523e-05, + "loss": 0.7702, + "step": 10812 + }, + { + "epoch": 0.5265259416161469, + "grad_norm": 1.4421879053115845, + "learning_rate": 1.9254164982297727e-05, + "loss": 0.824, + "step": 10813 + }, + { + "epoch": 0.5265746354052541, + "grad_norm": 1.5128332376480103, + "learning_rate": 1.9251012817856635e-05, + "loss": 0.7863, + "step": 10814 + }, + { + "epoch": 0.5266233291943613, + "grad_norm": 2.789985418319702, + "learning_rate": 1.9247860672046677e-05, + "loss": 0.8073, + "step": 10815 + }, + { + "epoch": 0.5266720229834685, + "grad_norm": 2.5535311698913574, + "learning_rate": 1.9244708544946244e-05, + "loss": 0.8213, + "step": 10816 + }, + { + "epoch": 0.5267207167725757, + "grad_norm": 1.727964162826538, + "learning_rate": 1.924155643663376e-05, + "loss": 0.8143, + "step": 10817 + }, + { + "epoch": 0.5267694105616828, + "grad_norm": 1.6098692417144775, + "learning_rate": 1.9238404347187624e-05, + "loss": 0.8715, + "step": 10818 + }, + { + "epoch": 0.52681810435079, + "grad_norm": 1.7833421230316162, + "learning_rate": 1.923525227668625e-05, + "loss": 0.8444, + "step": 10819 + }, + { + "epoch": 0.5268667981398972, + "grad_norm": 1.677972674369812, + "learning_rate": 1.923210022520805e-05, + "loss": 0.8476, + "step": 10820 + }, + { + "epoch": 0.5269154919290044, + "grad_norm": 3.1962757110595703, + "learning_rate": 1.9228948192831424e-05, + "loss": 0.8479, + "step": 10821 + }, + { + "epoch": 0.5269641857181117, + "grad_norm": 1.7492362260818481, + "learning_rate": 1.9225796179634787e-05, + "loss": 0.8598, + "step": 10822 + }, + { + "epoch": 0.5270128795072189, + "grad_norm": 2.194535970687866, + "learning_rate": 1.9222644185696532e-05, + "loss": 0.7713, + "step": 10823 + }, + { + "epoch": 0.5270615732963261, + "grad_norm": 2.156646490097046, + "learning_rate": 1.921949221109508e-05, + "loss": 0.7921, + "step": 10824 + }, + { + "epoch": 0.5271102670854333, + "grad_norm": 2.077571392059326, + "learning_rate": 1.9216340255908827e-05, + "loss": 0.8065, + "step": 10825 + }, + { + "epoch": 0.5271589608745405, + "grad_norm": 1.6306424140930176, + "learning_rate": 1.9213188320216186e-05, + "loss": 0.8066, + "step": 10826 + }, + { + "epoch": 0.5272076546636476, + "grad_norm": 1.895326018333435, + "learning_rate": 1.9210036404095552e-05, + "loss": 0.8128, + "step": 10827 + }, + { + "epoch": 0.5272563484527548, + "grad_norm": 1.6018867492675781, + "learning_rate": 1.9206884507625345e-05, + "loss": 0.8149, + "step": 10828 + }, + { + "epoch": 0.527305042241862, + "grad_norm": 1.9216971397399902, + "learning_rate": 1.9203732630883944e-05, + "loss": 0.7802, + "step": 10829 + }, + { + "epoch": 0.5273537360309692, + "grad_norm": 1.680652379989624, + "learning_rate": 1.920058077394977e-05, + "loss": 0.8278, + "step": 10830 + }, + { + "epoch": 0.5274024298200765, + "grad_norm": 1.5821349620819092, + "learning_rate": 1.9197428936901225e-05, + "loss": 0.8446, + "step": 10831 + }, + { + "epoch": 0.5274511236091837, + "grad_norm": 1.492935061454773, + "learning_rate": 1.9194277119816704e-05, + "loss": 0.8802, + "step": 10832 + }, + { + "epoch": 0.5274998173982909, + "grad_norm": 1.73066246509552, + "learning_rate": 1.9191125322774625e-05, + "loss": 0.8643, + "step": 10833 + }, + { + "epoch": 0.5275485111873981, + "grad_norm": 1.9924821853637695, + "learning_rate": 1.9187973545853364e-05, + "loss": 0.775, + "step": 10834 + }, + { + "epoch": 0.5275972049765052, + "grad_norm": 1.754870057106018, + "learning_rate": 1.9184821789131348e-05, + "loss": 0.8067, + "step": 10835 + }, + { + "epoch": 0.5276458987656124, + "grad_norm": 1.6854692697525024, + "learning_rate": 1.918167005268695e-05, + "loss": 0.8058, + "step": 10836 + }, + { + "epoch": 0.5276945925547196, + "grad_norm": 1.7672679424285889, + "learning_rate": 1.9178518336598596e-05, + "loss": 0.8396, + "step": 10837 + }, + { + "epoch": 0.5277432863438268, + "grad_norm": 1.8015471696853638, + "learning_rate": 1.9175366640944667e-05, + "loss": 0.7795, + "step": 10838 + }, + { + "epoch": 0.527791980132934, + "grad_norm": 1.704663634300232, + "learning_rate": 1.9172214965803566e-05, + "loss": 0.7631, + "step": 10839 + }, + { + "epoch": 0.5278406739220413, + "grad_norm": 1.7878892421722412, + "learning_rate": 1.9169063311253696e-05, + "loss": 0.7803, + "step": 10840 + }, + { + "epoch": 0.5278893677111485, + "grad_norm": 1.4736616611480713, + "learning_rate": 1.916591167737345e-05, + "loss": 0.7956, + "step": 10841 + }, + { + "epoch": 0.5279380615002557, + "grad_norm": 0.09158240258693695, + "learning_rate": 1.9162760064241235e-05, + "loss": 0.5987, + "step": 10842 + }, + { + "epoch": 0.5279867552893629, + "grad_norm": 1.3011633157730103, + "learning_rate": 1.9159608471935432e-05, + "loss": 0.8808, + "step": 10843 + }, + { + "epoch": 0.52803544907847, + "grad_norm": 1.560744285583496, + "learning_rate": 1.9156456900534453e-05, + "loss": 0.8846, + "step": 10844 + }, + { + "epoch": 0.5280841428675772, + "grad_norm": 1.4716007709503174, + "learning_rate": 1.9153305350116676e-05, + "loss": 0.7915, + "step": 10845 + }, + { + "epoch": 0.5281328366566844, + "grad_norm": 2.032090187072754, + "learning_rate": 1.915015382076051e-05, + "loss": 0.8542, + "step": 10846 + }, + { + "epoch": 0.5281815304457916, + "grad_norm": 3.559635877609253, + "learning_rate": 1.914700231254434e-05, + "loss": 0.8192, + "step": 10847 + }, + { + "epoch": 0.5282302242348988, + "grad_norm": 1.4279465675354004, + "learning_rate": 1.9143850825546576e-05, + "loss": 0.884, + "step": 10848 + }, + { + "epoch": 0.5282789180240061, + "grad_norm": 1.7459344863891602, + "learning_rate": 1.9140699359845588e-05, + "loss": 0.7806, + "step": 10849 + }, + { + "epoch": 0.5283276118131133, + "grad_norm": 1.312450647354126, + "learning_rate": 1.913754791551979e-05, + "loss": 0.885, + "step": 10850 + }, + { + "epoch": 0.5283763056022205, + "grad_norm": 1.444946050643921, + "learning_rate": 1.9134396492647566e-05, + "loss": 0.9574, + "step": 10851 + }, + { + "epoch": 0.5284249993913276, + "grad_norm": 1.3529952764511108, + "learning_rate": 1.9131245091307303e-05, + "loss": 0.877, + "step": 10852 + }, + { + "epoch": 0.5284736931804348, + "grad_norm": 1.3410859107971191, + "learning_rate": 1.91280937115774e-05, + "loss": 0.7902, + "step": 10853 + }, + { + "epoch": 0.528522386969542, + "grad_norm": 1.9210833311080933, + "learning_rate": 1.9124942353536243e-05, + "loss": 0.8811, + "step": 10854 + }, + { + "epoch": 0.5285710807586492, + "grad_norm": 1.4491727352142334, + "learning_rate": 1.9121791017262236e-05, + "loss": 0.8253, + "step": 10855 + }, + { + "epoch": 0.5286197745477564, + "grad_norm": 0.09883523732423782, + "learning_rate": 1.9118639702833747e-05, + "loss": 0.662, + "step": 10856 + }, + { + "epoch": 0.5286684683368637, + "grad_norm": 0.08676255494356155, + "learning_rate": 1.9115488410329184e-05, + "loss": 0.5374, + "step": 10857 + }, + { + "epoch": 0.5287171621259709, + "grad_norm": 1.7230457067489624, + "learning_rate": 1.9112337139826916e-05, + "loss": 0.8452, + "step": 10858 + }, + { + "epoch": 0.5287658559150781, + "grad_norm": 0.0911078229546547, + "learning_rate": 1.9109185891405346e-05, + "loss": 0.5898, + "step": 10859 + }, + { + "epoch": 0.5288145497041852, + "grad_norm": 1.7430082559585571, + "learning_rate": 1.9106034665142864e-05, + "loss": 0.8566, + "step": 10860 + }, + { + "epoch": 0.5288632434932924, + "grad_norm": 1.367383360862732, + "learning_rate": 1.910288346111784e-05, + "loss": 0.8428, + "step": 10861 + }, + { + "epoch": 0.5289119372823996, + "grad_norm": 1.4946856498718262, + "learning_rate": 1.909973227940869e-05, + "loss": 0.8437, + "step": 10862 + }, + { + "epoch": 0.5289606310715068, + "grad_norm": 1.4969474077224731, + "learning_rate": 1.9096581120093765e-05, + "loss": 0.8566, + "step": 10863 + }, + { + "epoch": 0.529009324860614, + "grad_norm": 1.1885658502578735, + "learning_rate": 1.909342998325148e-05, + "loss": 0.856, + "step": 10864 + }, + { + "epoch": 0.5290580186497212, + "grad_norm": 1.2754346132278442, + "learning_rate": 1.9090278868960197e-05, + "loss": 0.8066, + "step": 10865 + }, + { + "epoch": 0.5291067124388285, + "grad_norm": 1.16449773311615, + "learning_rate": 1.9087127777298313e-05, + "loss": 0.8241, + "step": 10866 + }, + { + "epoch": 0.5291554062279357, + "grad_norm": 0.09464000165462494, + "learning_rate": 1.908397670834421e-05, + "loss": 0.5805, + "step": 10867 + }, + { + "epoch": 0.5292041000170429, + "grad_norm": 1.6602609157562256, + "learning_rate": 1.908082566217627e-05, + "loss": 0.7866, + "step": 10868 + }, + { + "epoch": 0.52925279380615, + "grad_norm": 1.584672451019287, + "learning_rate": 1.9077674638872873e-05, + "loss": 0.8187, + "step": 10869 + }, + { + "epoch": 0.5293014875952572, + "grad_norm": 2.464552640914917, + "learning_rate": 1.90745236385124e-05, + "loss": 0.8049, + "step": 10870 + }, + { + "epoch": 0.5293501813843644, + "grad_norm": 1.5872523784637451, + "learning_rate": 1.9071372661173246e-05, + "loss": 0.8604, + "step": 10871 + }, + { + "epoch": 0.5293988751734716, + "grad_norm": 1.4544024467468262, + "learning_rate": 1.9068221706933775e-05, + "loss": 0.8632, + "step": 10872 + }, + { + "epoch": 0.5294475689625788, + "grad_norm": 1.850771427154541, + "learning_rate": 1.9065070775872377e-05, + "loss": 0.8565, + "step": 10873 + }, + { + "epoch": 0.529496262751686, + "grad_norm": 1.2262229919433594, + "learning_rate": 1.9061919868067425e-05, + "loss": 0.8851, + "step": 10874 + }, + { + "epoch": 0.5295449565407933, + "grad_norm": 1.3839308023452759, + "learning_rate": 1.9058768983597305e-05, + "loss": 0.7479, + "step": 10875 + }, + { + "epoch": 0.5295936503299005, + "grad_norm": 1.586037278175354, + "learning_rate": 1.9055618122540385e-05, + "loss": 0.9001, + "step": 10876 + }, + { + "epoch": 0.5296423441190076, + "grad_norm": 2.1307480335235596, + "learning_rate": 1.9052467284975064e-05, + "loss": 0.8537, + "step": 10877 + }, + { + "epoch": 0.5296910379081148, + "grad_norm": 1.3849284648895264, + "learning_rate": 1.9049316470979695e-05, + "loss": 0.8419, + "step": 10878 + }, + { + "epoch": 0.529739731697222, + "grad_norm": 2.012294292449951, + "learning_rate": 1.9046165680632666e-05, + "loss": 0.8004, + "step": 10879 + }, + { + "epoch": 0.5297884254863292, + "grad_norm": 1.5785325765609741, + "learning_rate": 1.904301491401236e-05, + "loss": 0.8396, + "step": 10880 + }, + { + "epoch": 0.5298371192754364, + "grad_norm": 0.0961487889289856, + "learning_rate": 1.9039864171197138e-05, + "loss": 0.5695, + "step": 10881 + }, + { + "epoch": 0.5298858130645436, + "grad_norm": 4.367892742156982, + "learning_rate": 1.9036713452265384e-05, + "loss": 0.7159, + "step": 10882 + }, + { + "epoch": 0.5299345068536508, + "grad_norm": 0.09947004914283752, + "learning_rate": 1.903356275729547e-05, + "loss": 0.6718, + "step": 10883 + }, + { + "epoch": 0.5299832006427581, + "grad_norm": 1.5237292051315308, + "learning_rate": 1.9030412086365776e-05, + "loss": 0.8263, + "step": 10884 + }, + { + "epoch": 0.5300318944318653, + "grad_norm": 1.5018720626831055, + "learning_rate": 1.9027261439554663e-05, + "loss": 0.8321, + "step": 10885 + }, + { + "epoch": 0.5300805882209724, + "grad_norm": 1.6575512886047363, + "learning_rate": 1.902411081694052e-05, + "loss": 0.7423, + "step": 10886 + }, + { + "epoch": 0.5301292820100796, + "grad_norm": 1.5685598850250244, + "learning_rate": 1.9020960218601697e-05, + "loss": 0.8865, + "step": 10887 + }, + { + "epoch": 0.5301779757991868, + "grad_norm": 1.5136858224868774, + "learning_rate": 1.9017809644616587e-05, + "loss": 0.8352, + "step": 10888 + }, + { + "epoch": 0.530226669588294, + "grad_norm": 3.497743606567383, + "learning_rate": 1.9014659095063545e-05, + "loss": 0.8791, + "step": 10889 + }, + { + "epoch": 0.5302753633774012, + "grad_norm": 2.0676960945129395, + "learning_rate": 1.901150857002095e-05, + "loss": 0.8383, + "step": 10890 + }, + { + "epoch": 0.5303240571665084, + "grad_norm": 2.1857874393463135, + "learning_rate": 1.9008358069567173e-05, + "loss": 0.8677, + "step": 10891 + }, + { + "epoch": 0.5303727509556156, + "grad_norm": 1.3477579355239868, + "learning_rate": 1.9005207593780574e-05, + "loss": 0.7962, + "step": 10892 + }, + { + "epoch": 0.5304214447447229, + "grad_norm": 1.6282423734664917, + "learning_rate": 1.9002057142739534e-05, + "loss": 0.8444, + "step": 10893 + }, + { + "epoch": 0.53047013853383, + "grad_norm": 2.382211208343506, + "learning_rate": 1.8998906716522404e-05, + "loss": 0.815, + "step": 10894 + }, + { + "epoch": 0.5305188323229372, + "grad_norm": 1.5972191095352173, + "learning_rate": 1.8995756315207568e-05, + "loss": 0.8379, + "step": 10895 + }, + { + "epoch": 0.5305675261120444, + "grad_norm": 1.4678839445114136, + "learning_rate": 1.8992605938873384e-05, + "loss": 0.802, + "step": 10896 + }, + { + "epoch": 0.5306162199011516, + "grad_norm": 0.09001606702804565, + "learning_rate": 1.898945558759822e-05, + "loss": 0.5627, + "step": 10897 + }, + { + "epoch": 0.5306649136902588, + "grad_norm": 1.3922479152679443, + "learning_rate": 1.8986305261460435e-05, + "loss": 0.8458, + "step": 10898 + }, + { + "epoch": 0.530713607479366, + "grad_norm": 1.546690583229065, + "learning_rate": 1.89831549605384e-05, + "loss": 0.8616, + "step": 10899 + }, + { + "epoch": 0.5307623012684732, + "grad_norm": 1.5678592920303345, + "learning_rate": 1.8980004684910484e-05, + "loss": 0.8469, + "step": 10900 + }, + { + "epoch": 0.5308109950575804, + "grad_norm": 4.709558486938477, + "learning_rate": 1.897685443465504e-05, + "loss": 0.7224, + "step": 10901 + }, + { + "epoch": 0.5308596888466875, + "grad_norm": 1.5430246591567993, + "learning_rate": 1.897370420985044e-05, + "loss": 0.7273, + "step": 10902 + }, + { + "epoch": 0.5309083826357948, + "grad_norm": 1.4098129272460938, + "learning_rate": 1.8970554010575035e-05, + "loss": 0.9083, + "step": 10903 + }, + { + "epoch": 0.530957076424902, + "grad_norm": 1.6742467880249023, + "learning_rate": 1.8967403836907196e-05, + "loss": 0.8136, + "step": 10904 + }, + { + "epoch": 0.5310057702140092, + "grad_norm": 1.8827133178710938, + "learning_rate": 1.8964253688925278e-05, + "loss": 0.7721, + "step": 10905 + }, + { + "epoch": 0.5310544640031164, + "grad_norm": 1.40691339969635, + "learning_rate": 1.8961103566707653e-05, + "loss": 0.8645, + "step": 10906 + }, + { + "epoch": 0.5311031577922236, + "grad_norm": 1.713001012802124, + "learning_rate": 1.895795347033266e-05, + "loss": 0.8302, + "step": 10907 + }, + { + "epoch": 0.5311518515813308, + "grad_norm": 1.9033336639404297, + "learning_rate": 1.8954803399878673e-05, + "loss": 0.8671, + "step": 10908 + }, + { + "epoch": 0.531200545370438, + "grad_norm": 1.9126070737838745, + "learning_rate": 1.8951653355424052e-05, + "loss": 0.7885, + "step": 10909 + }, + { + "epoch": 0.5312492391595453, + "grad_norm": 2.2282731533050537, + "learning_rate": 1.894850333704714e-05, + "loss": 0.8867, + "step": 10910 + }, + { + "epoch": 0.5312979329486524, + "grad_norm": 2.105991840362549, + "learning_rate": 1.894535334482631e-05, + "loss": 0.7911, + "step": 10911 + }, + { + "epoch": 0.5313466267377596, + "grad_norm": 1.4101616144180298, + "learning_rate": 1.894220337883991e-05, + "loss": 0.8397, + "step": 10912 + }, + { + "epoch": 0.5313953205268668, + "grad_norm": 2.433493137359619, + "learning_rate": 1.8939053439166303e-05, + "loss": 0.8116, + "step": 10913 + }, + { + "epoch": 0.531444014315974, + "grad_norm": 1.4197885990142822, + "learning_rate": 1.8935903525883828e-05, + "loss": 0.9471, + "step": 10914 + }, + { + "epoch": 0.5314927081050812, + "grad_norm": 1.9592955112457275, + "learning_rate": 1.893275363907086e-05, + "loss": 0.7989, + "step": 10915 + }, + { + "epoch": 0.5315414018941884, + "grad_norm": 2.2887980937957764, + "learning_rate": 1.8929603778805735e-05, + "loss": 0.8912, + "step": 10916 + }, + { + "epoch": 0.5315900956832956, + "grad_norm": 1.6316734552383423, + "learning_rate": 1.8926453945166817e-05, + "loss": 0.8299, + "step": 10917 + }, + { + "epoch": 0.5316387894724028, + "grad_norm": 1.3705826997756958, + "learning_rate": 1.892330413823245e-05, + "loss": 0.8432, + "step": 10918 + }, + { + "epoch": 0.5316874832615099, + "grad_norm": 0.09169970452785492, + "learning_rate": 1.8920154358080996e-05, + "loss": 0.5742, + "step": 10919 + }, + { + "epoch": 0.5317361770506172, + "grad_norm": 1.3268166780471802, + "learning_rate": 1.89170046047908e-05, + "loss": 0.7657, + "step": 10920 + }, + { + "epoch": 0.5317848708397244, + "grad_norm": 3.492985725402832, + "learning_rate": 1.891385487844021e-05, + "loss": 0.7992, + "step": 10921 + }, + { + "epoch": 0.5318335646288316, + "grad_norm": 1.7508602142333984, + "learning_rate": 1.891070517910759e-05, + "loss": 0.8833, + "step": 10922 + }, + { + "epoch": 0.5318822584179388, + "grad_norm": 1.400809645652771, + "learning_rate": 1.8907555506871263e-05, + "loss": 0.8145, + "step": 10923 + }, + { + "epoch": 0.531930952207046, + "grad_norm": 1.3537216186523438, + "learning_rate": 1.8904405861809608e-05, + "loss": 0.8348, + "step": 10924 + }, + { + "epoch": 0.5319796459961532, + "grad_norm": 1.6819993257522583, + "learning_rate": 1.8901256244000943e-05, + "loss": 0.8907, + "step": 10925 + }, + { + "epoch": 0.5320283397852604, + "grad_norm": 1.7635865211486816, + "learning_rate": 1.8898106653523636e-05, + "loss": 0.8504, + "step": 10926 + }, + { + "epoch": 0.5320770335743676, + "grad_norm": 1.160488486289978, + "learning_rate": 1.889495709045602e-05, + "loss": 0.8477, + "step": 10927 + }, + { + "epoch": 0.5321257273634747, + "grad_norm": 1.4027087688446045, + "learning_rate": 1.889180755487645e-05, + "loss": 0.7539, + "step": 10928 + }, + { + "epoch": 0.532174421152582, + "grad_norm": 1.6691749095916748, + "learning_rate": 1.8888658046863276e-05, + "loss": 0.8242, + "step": 10929 + }, + { + "epoch": 0.5322231149416892, + "grad_norm": 1.8917561769485474, + "learning_rate": 1.8885508566494825e-05, + "loss": 0.8398, + "step": 10930 + }, + { + "epoch": 0.5322718087307964, + "grad_norm": 1.5918552875518799, + "learning_rate": 1.8882359113849456e-05, + "loss": 0.8726, + "step": 10931 + }, + { + "epoch": 0.5323205025199036, + "grad_norm": 1.5324591398239136, + "learning_rate": 1.8879209689005502e-05, + "loss": 0.8465, + "step": 10932 + }, + { + "epoch": 0.5323691963090108, + "grad_norm": 1.2983990907669067, + "learning_rate": 1.887606029204131e-05, + "loss": 0.8784, + "step": 10933 + }, + { + "epoch": 0.532417890098118, + "grad_norm": 1.187229871749878, + "learning_rate": 1.887291092303522e-05, + "loss": 0.8682, + "step": 10934 + }, + { + "epoch": 0.5324665838872252, + "grad_norm": 1.312143325805664, + "learning_rate": 1.886976158206558e-05, + "loss": 0.8388, + "step": 10935 + }, + { + "epoch": 0.5325152776763323, + "grad_norm": 1.37840735912323, + "learning_rate": 1.8866612269210716e-05, + "loss": 0.8274, + "step": 10936 + }, + { + "epoch": 0.5325639714654395, + "grad_norm": 1.4396088123321533, + "learning_rate": 1.8863462984548983e-05, + "loss": 0.8978, + "step": 10937 + }, + { + "epoch": 0.5326126652545468, + "grad_norm": 1.7230784893035889, + "learning_rate": 1.8860313728158702e-05, + "loss": 0.7512, + "step": 10938 + }, + { + "epoch": 0.532661359043654, + "grad_norm": 1.6955584287643433, + "learning_rate": 1.885716450011822e-05, + "loss": 0.8604, + "step": 10939 + }, + { + "epoch": 0.5327100528327612, + "grad_norm": 1.4525423049926758, + "learning_rate": 1.8854015300505884e-05, + "loss": 0.8405, + "step": 10940 + }, + { + "epoch": 0.5327587466218684, + "grad_norm": 1.3257145881652832, + "learning_rate": 1.8850866129400017e-05, + "loss": 0.8489, + "step": 10941 + }, + { + "epoch": 0.5328074404109756, + "grad_norm": 1.6458944082260132, + "learning_rate": 1.8847716986878964e-05, + "loss": 0.8159, + "step": 10942 + }, + { + "epoch": 0.5328561342000828, + "grad_norm": 1.2871791124343872, + "learning_rate": 1.8844567873021052e-05, + "loss": 0.7747, + "step": 10943 + }, + { + "epoch": 0.53290482798919, + "grad_norm": 1.6209136247634888, + "learning_rate": 1.8841418787904625e-05, + "loss": 0.8411, + "step": 10944 + }, + { + "epoch": 0.5329535217782971, + "grad_norm": 1.5691277980804443, + "learning_rate": 1.8838269731608005e-05, + "loss": 0.9363, + "step": 10945 + }, + { + "epoch": 0.5330022155674043, + "grad_norm": 1.4931453466415405, + "learning_rate": 1.8835120704209535e-05, + "loss": 0.8593, + "step": 10946 + }, + { + "epoch": 0.5330509093565116, + "grad_norm": 1.9103022813796997, + "learning_rate": 1.883197170578754e-05, + "loss": 0.8502, + "step": 10947 + }, + { + "epoch": 0.5330996031456188, + "grad_norm": 1.4239472150802612, + "learning_rate": 1.8828822736420357e-05, + "loss": 0.835, + "step": 10948 + }, + { + "epoch": 0.533148296934726, + "grad_norm": 1.4676449298858643, + "learning_rate": 1.8825673796186316e-05, + "loss": 0.8725, + "step": 10949 + }, + { + "epoch": 0.5331969907238332, + "grad_norm": 1.1217764616012573, + "learning_rate": 1.8822524885163745e-05, + "loss": 0.868, + "step": 10950 + }, + { + "epoch": 0.5332456845129404, + "grad_norm": 1.5454092025756836, + "learning_rate": 1.881937600343098e-05, + "loss": 0.7623, + "step": 10951 + }, + { + "epoch": 0.5332943783020476, + "grad_norm": 1.7486581802368164, + "learning_rate": 1.881622715106634e-05, + "loss": 0.7907, + "step": 10952 + }, + { + "epoch": 0.5333430720911547, + "grad_norm": 2.5344223976135254, + "learning_rate": 1.881307832814816e-05, + "loss": 0.8372, + "step": 10953 + }, + { + "epoch": 0.5333917658802619, + "grad_norm": 1.1387966871261597, + "learning_rate": 1.880992953475476e-05, + "loss": 0.9304, + "step": 10954 + }, + { + "epoch": 0.5334404596693691, + "grad_norm": 1.576140284538269, + "learning_rate": 1.8806780770964475e-05, + "loss": 0.8732, + "step": 10955 + }, + { + "epoch": 0.5334891534584764, + "grad_norm": 2.3849599361419678, + "learning_rate": 1.8803632036855625e-05, + "loss": 0.7342, + "step": 10956 + }, + { + "epoch": 0.5335378472475836, + "grad_norm": 1.458797812461853, + "learning_rate": 1.8800483332506542e-05, + "loss": 0.8452, + "step": 10957 + }, + { + "epoch": 0.5335865410366908, + "grad_norm": 1.6293089389801025, + "learning_rate": 1.879733465799554e-05, + "loss": 0.9242, + "step": 10958 + }, + { + "epoch": 0.533635234825798, + "grad_norm": 1.7297773361206055, + "learning_rate": 1.8794186013400944e-05, + "loss": 0.8146, + "step": 10959 + }, + { + "epoch": 0.5336839286149052, + "grad_norm": 1.201882004737854, + "learning_rate": 1.8791037398801094e-05, + "loss": 0.8058, + "step": 10960 + }, + { + "epoch": 0.5337326224040123, + "grad_norm": 3.1289076805114746, + "learning_rate": 1.8787888814274287e-05, + "loss": 0.741, + "step": 10961 + }, + { + "epoch": 0.5337813161931195, + "grad_norm": 0.09193667769432068, + "learning_rate": 1.8784740259898866e-05, + "loss": 0.577, + "step": 10962 + }, + { + "epoch": 0.5338300099822267, + "grad_norm": 3.1396799087524414, + "learning_rate": 1.8781591735753132e-05, + "loss": 0.8192, + "step": 10963 + }, + { + "epoch": 0.533878703771334, + "grad_norm": 1.5062350034713745, + "learning_rate": 1.8778443241915426e-05, + "loss": 0.8199, + "step": 10964 + }, + { + "epoch": 0.5339273975604412, + "grad_norm": 1.7580777406692505, + "learning_rate": 1.8775294778464044e-05, + "loss": 0.8626, + "step": 10965 + }, + { + "epoch": 0.5339760913495484, + "grad_norm": 1.5240892171859741, + "learning_rate": 1.8772146345477328e-05, + "loss": 0.8091, + "step": 10966 + }, + { + "epoch": 0.5340247851386556, + "grad_norm": 1.536004662513733, + "learning_rate": 1.8768997943033572e-05, + "loss": 0.8567, + "step": 10967 + }, + { + "epoch": 0.5340734789277628, + "grad_norm": 1.3427083492279053, + "learning_rate": 1.8765849571211108e-05, + "loss": 0.7859, + "step": 10968 + }, + { + "epoch": 0.53412217271687, + "grad_norm": 1.6238093376159668, + "learning_rate": 1.8762701230088252e-05, + "loss": 0.7873, + "step": 10969 + }, + { + "epoch": 0.5341708665059771, + "grad_norm": 1.7215192317962646, + "learning_rate": 1.875955291974331e-05, + "loss": 0.8085, + "step": 10970 + }, + { + "epoch": 0.5342195602950843, + "grad_norm": 1.4328922033309937, + "learning_rate": 1.875640464025461e-05, + "loss": 0.8499, + "step": 10971 + }, + { + "epoch": 0.5342682540841915, + "grad_norm": 1.6358247995376587, + "learning_rate": 1.875325639170045e-05, + "loss": 0.7513, + "step": 10972 + }, + { + "epoch": 0.5343169478732988, + "grad_norm": 1.744803786277771, + "learning_rate": 1.875010817415916e-05, + "loss": 0.9106, + "step": 10973 + }, + { + "epoch": 0.534365641662406, + "grad_norm": 1.2869789600372314, + "learning_rate": 1.8746959987709036e-05, + "loss": 0.8575, + "step": 10974 + }, + { + "epoch": 0.5344143354515132, + "grad_norm": 1.5744153261184692, + "learning_rate": 1.87438118324284e-05, + "loss": 0.8626, + "step": 10975 + }, + { + "epoch": 0.5344630292406204, + "grad_norm": 4.043846607208252, + "learning_rate": 1.874066370839556e-05, + "loss": 0.853, + "step": 10976 + }, + { + "epoch": 0.5345117230297276, + "grad_norm": 2.2644565105438232, + "learning_rate": 1.873751561568882e-05, + "loss": 0.8418, + "step": 10977 + }, + { + "epoch": 0.5345604168188347, + "grad_norm": 1.7296810150146484, + "learning_rate": 1.8734367554386494e-05, + "loss": 0.842, + "step": 10978 + }, + { + "epoch": 0.5346091106079419, + "grad_norm": 1.5237730741500854, + "learning_rate": 1.873121952456689e-05, + "loss": 0.9097, + "step": 10979 + }, + { + "epoch": 0.5346578043970491, + "grad_norm": 1.6314842700958252, + "learning_rate": 1.8728071526308328e-05, + "loss": 0.8331, + "step": 10980 + }, + { + "epoch": 0.5347064981861563, + "grad_norm": 0.09662467986345291, + "learning_rate": 1.872492355968909e-05, + "loss": 0.5992, + "step": 10981 + }, + { + "epoch": 0.5347551919752636, + "grad_norm": 2.469433307647705, + "learning_rate": 1.8721775624787503e-05, + "loss": 0.8676, + "step": 10982 + }, + { + "epoch": 0.5348038857643708, + "grad_norm": 1.8780500888824463, + "learning_rate": 1.8718627721681857e-05, + "loss": 0.899, + "step": 10983 + }, + { + "epoch": 0.534852579553478, + "grad_norm": 1.4560565948486328, + "learning_rate": 1.8715479850450468e-05, + "loss": 0.8354, + "step": 10984 + }, + { + "epoch": 0.5349012733425852, + "grad_norm": 1.4535802602767944, + "learning_rate": 1.8712332011171627e-05, + "loss": 0.8398, + "step": 10985 + }, + { + "epoch": 0.5349499671316924, + "grad_norm": 1.5877922773361206, + "learning_rate": 1.8709184203923656e-05, + "loss": 0.9061, + "step": 10986 + }, + { + "epoch": 0.5349986609207995, + "grad_norm": 1.6301558017730713, + "learning_rate": 1.8706036428784838e-05, + "loss": 0.8107, + "step": 10987 + }, + { + "epoch": 0.5350473547099067, + "grad_norm": 1.5319557189941406, + "learning_rate": 1.870288868583348e-05, + "loss": 0.8875, + "step": 10988 + }, + { + "epoch": 0.5350960484990139, + "grad_norm": 1.6833479404449463, + "learning_rate": 1.8699740975147893e-05, + "loss": 0.7975, + "step": 10989 + }, + { + "epoch": 0.5351447422881211, + "grad_norm": 2.111936569213867, + "learning_rate": 1.869659329680636e-05, + "loss": 0.9302, + "step": 10990 + }, + { + "epoch": 0.5351934360772284, + "grad_norm": 1.7457045316696167, + "learning_rate": 1.869344565088719e-05, + "loss": 0.8814, + "step": 10991 + }, + { + "epoch": 0.5352421298663356, + "grad_norm": 1.424877643585205, + "learning_rate": 1.8690298037468676e-05, + "loss": 0.7641, + "step": 10992 + }, + { + "epoch": 0.5352908236554428, + "grad_norm": 3.29880952835083, + "learning_rate": 1.8687150456629126e-05, + "loss": 0.8328, + "step": 10993 + }, + { + "epoch": 0.53533951744455, + "grad_norm": 1.367875576019287, + "learning_rate": 1.8684002908446822e-05, + "loss": 0.8835, + "step": 10994 + }, + { + "epoch": 0.5353882112336571, + "grad_norm": 1.9074681997299194, + "learning_rate": 1.868085539300007e-05, + "loss": 0.8046, + "step": 10995 + }, + { + "epoch": 0.5354369050227643, + "grad_norm": 1.4854283332824707, + "learning_rate": 1.8677707910367154e-05, + "loss": 0.8787, + "step": 10996 + }, + { + "epoch": 0.5354855988118715, + "grad_norm": 1.629097580909729, + "learning_rate": 1.867456046062638e-05, + "loss": 0.8559, + "step": 10997 + }, + { + "epoch": 0.5355342926009787, + "grad_norm": 0.10398740321397781, + "learning_rate": 1.8671413043856026e-05, + "loss": 0.6091, + "step": 10998 + }, + { + "epoch": 0.535582986390086, + "grad_norm": 0.09652465581893921, + "learning_rate": 1.8668265660134395e-05, + "loss": 0.6553, + "step": 10999 + }, + { + "epoch": 0.5356316801791932, + "grad_norm": 1.7983906269073486, + "learning_rate": 1.8665118309539793e-05, + "loss": 0.8822, + "step": 11000 + }, + { + "epoch": 0.5356803739683004, + "grad_norm": 9.756416320800781, + "learning_rate": 1.8661970992150477e-05, + "loss": 0.8184, + "step": 11001 + }, + { + "epoch": 0.5357290677574076, + "grad_norm": 1.4379509687423706, + "learning_rate": 1.865882370804477e-05, + "loss": 0.9237, + "step": 11002 + }, + { + "epoch": 0.5357777615465148, + "grad_norm": 1.4310065507888794, + "learning_rate": 1.865567645730093e-05, + "loss": 0.8348, + "step": 11003 + }, + { + "epoch": 0.5358264553356219, + "grad_norm": 1.9499619007110596, + "learning_rate": 1.865252923999727e-05, + "loss": 0.7947, + "step": 11004 + }, + { + "epoch": 0.5358751491247291, + "grad_norm": 1.4756412506103516, + "learning_rate": 1.864938205621207e-05, + "loss": 0.7894, + "step": 11005 + }, + { + "epoch": 0.5359238429138363, + "grad_norm": 1.7357522249221802, + "learning_rate": 1.864623490602361e-05, + "loss": 0.7691, + "step": 11006 + }, + { + "epoch": 0.5359725367029435, + "grad_norm": 1.4252077341079712, + "learning_rate": 1.8643087789510176e-05, + "loss": 0.7863, + "step": 11007 + }, + { + "epoch": 0.5360212304920507, + "grad_norm": 2.0273244380950928, + "learning_rate": 1.8639940706750058e-05, + "loss": 0.8393, + "step": 11008 + }, + { + "epoch": 0.536069924281158, + "grad_norm": 1.3969295024871826, + "learning_rate": 1.8636793657821547e-05, + "loss": 0.8852, + "step": 11009 + }, + { + "epoch": 0.5361186180702652, + "grad_norm": 2.295444965362549, + "learning_rate": 1.8633646642802912e-05, + "loss": 0.791, + "step": 11010 + }, + { + "epoch": 0.5361673118593724, + "grad_norm": 1.5393487215042114, + "learning_rate": 1.8630499661772447e-05, + "loss": 0.8695, + "step": 11011 + }, + { + "epoch": 0.5362160056484795, + "grad_norm": 1.591928243637085, + "learning_rate": 1.862735271480842e-05, + "loss": 0.8582, + "step": 11012 + }, + { + "epoch": 0.5362646994375867, + "grad_norm": 8.53598690032959, + "learning_rate": 1.8624205801989125e-05, + "loss": 0.7945, + "step": 11013 + }, + { + "epoch": 0.5363133932266939, + "grad_norm": 0.09628309309482574, + "learning_rate": 1.862105892339283e-05, + "loss": 0.5837, + "step": 11014 + }, + { + "epoch": 0.5363620870158011, + "grad_norm": 1.7449499368667603, + "learning_rate": 1.861791207909783e-05, + "loss": 0.8142, + "step": 11015 + }, + { + "epoch": 0.5364107808049083, + "grad_norm": 1.4296035766601562, + "learning_rate": 1.8614765269182386e-05, + "loss": 0.8388, + "step": 11016 + }, + { + "epoch": 0.5364594745940156, + "grad_norm": 1.4155009984970093, + "learning_rate": 1.8611618493724777e-05, + "loss": 0.813, + "step": 11017 + }, + { + "epoch": 0.5365081683831228, + "grad_norm": 2.064415454864502, + "learning_rate": 1.8608471752803297e-05, + "loss": 0.7791, + "step": 11018 + }, + { + "epoch": 0.53655686217223, + "grad_norm": 1.8463603258132935, + "learning_rate": 1.8605325046496198e-05, + "loss": 0.8101, + "step": 11019 + }, + { + "epoch": 0.5366055559613371, + "grad_norm": 3.9223437309265137, + "learning_rate": 1.8602178374881772e-05, + "loss": 0.8822, + "step": 11020 + }, + { + "epoch": 0.5366542497504443, + "grad_norm": 1.6633164882659912, + "learning_rate": 1.8599031738038283e-05, + "loss": 0.9187, + "step": 11021 + }, + { + "epoch": 0.5367029435395515, + "grad_norm": 1.410172462463379, + "learning_rate": 1.8595885136044013e-05, + "loss": 0.7722, + "step": 11022 + }, + { + "epoch": 0.5367516373286587, + "grad_norm": 1.7406336069107056, + "learning_rate": 1.859273856897722e-05, + "loss": 0.8361, + "step": 11023 + }, + { + "epoch": 0.5368003311177659, + "grad_norm": 2.6521401405334473, + "learning_rate": 1.8589592036916197e-05, + "loss": 0.831, + "step": 11024 + }, + { + "epoch": 0.5368490249068731, + "grad_norm": 2.497980833053589, + "learning_rate": 1.8586445539939186e-05, + "loss": 0.866, + "step": 11025 + }, + { + "epoch": 0.5368977186959804, + "grad_norm": 1.5011883974075317, + "learning_rate": 1.858329907812448e-05, + "loss": 0.8874, + "step": 11026 + }, + { + "epoch": 0.5369464124850876, + "grad_norm": 0.09012829512357712, + "learning_rate": 1.858015265155033e-05, + "loss": 0.621, + "step": 11027 + }, + { + "epoch": 0.5369951062741948, + "grad_norm": 1.6933190822601318, + "learning_rate": 1.857700626029502e-05, + "loss": 0.8176, + "step": 11028 + }, + { + "epoch": 0.5370438000633019, + "grad_norm": 1.4585812091827393, + "learning_rate": 1.8573859904436807e-05, + "loss": 0.8121, + "step": 11029 + }, + { + "epoch": 0.5370924938524091, + "grad_norm": 1.8285926580429077, + "learning_rate": 1.8570713584053954e-05, + "loss": 0.9184, + "step": 11030 + }, + { + "epoch": 0.5371411876415163, + "grad_norm": 1.5833916664123535, + "learning_rate": 1.856756729922474e-05, + "loss": 0.8559, + "step": 11031 + }, + { + "epoch": 0.5371898814306235, + "grad_norm": 1.4290568828582764, + "learning_rate": 1.856442105002741e-05, + "loss": 0.9102, + "step": 11032 + }, + { + "epoch": 0.5372385752197307, + "grad_norm": 1.3677228689193726, + "learning_rate": 1.8561274836540252e-05, + "loss": 0.814, + "step": 11033 + }, + { + "epoch": 0.5372872690088379, + "grad_norm": 2.032665252685547, + "learning_rate": 1.8558128658841502e-05, + "loss": 0.8016, + "step": 11034 + }, + { + "epoch": 0.5373359627979452, + "grad_norm": 1.4434994459152222, + "learning_rate": 1.8554982517009434e-05, + "loss": 0.8299, + "step": 11035 + }, + { + "epoch": 0.5373846565870524, + "grad_norm": 1.803471565246582, + "learning_rate": 1.8551836411122307e-05, + "loss": 0.8986, + "step": 11036 + }, + { + "epoch": 0.5374333503761595, + "grad_norm": 2.05079984664917, + "learning_rate": 1.8548690341258377e-05, + "loss": 0.7878, + "step": 11037 + }, + { + "epoch": 0.5374820441652667, + "grad_norm": 1.5026750564575195, + "learning_rate": 1.854554430749592e-05, + "loss": 0.8067, + "step": 11038 + }, + { + "epoch": 0.5375307379543739, + "grad_norm": 2.246305465698242, + "learning_rate": 1.8542398309913172e-05, + "loss": 0.8511, + "step": 11039 + }, + { + "epoch": 0.5375794317434811, + "grad_norm": 1.8223882913589478, + "learning_rate": 1.8539252348588406e-05, + "loss": 0.9038, + "step": 11040 + }, + { + "epoch": 0.5376281255325883, + "grad_norm": 1.2944283485412598, + "learning_rate": 1.8536106423599862e-05, + "loss": 0.8035, + "step": 11041 + }, + { + "epoch": 0.5376768193216955, + "grad_norm": 1.313876986503601, + "learning_rate": 1.853296053502581e-05, + "loss": 0.8391, + "step": 11042 + }, + { + "epoch": 0.5377255131108027, + "grad_norm": 1.544665813446045, + "learning_rate": 1.8529814682944493e-05, + "loss": 0.7955, + "step": 11043 + }, + { + "epoch": 0.53777420689991, + "grad_norm": 1.7248128652572632, + "learning_rate": 1.8526668867434182e-05, + "loss": 0.8028, + "step": 11044 + }, + { + "epoch": 0.5378229006890172, + "grad_norm": 2.124251365661621, + "learning_rate": 1.8523523088573103e-05, + "loss": 0.8514, + "step": 11045 + }, + { + "epoch": 0.5378715944781243, + "grad_norm": 1.4614349603652954, + "learning_rate": 1.852037734643953e-05, + "loss": 0.8882, + "step": 11046 + }, + { + "epoch": 0.5379202882672315, + "grad_norm": 3.2639949321746826, + "learning_rate": 1.8517231641111704e-05, + "loss": 0.8837, + "step": 11047 + }, + { + "epoch": 0.5379689820563387, + "grad_norm": 1.7898263931274414, + "learning_rate": 1.851408597266787e-05, + "loss": 0.7429, + "step": 11048 + }, + { + "epoch": 0.5380176758454459, + "grad_norm": 2.212322235107422, + "learning_rate": 1.851094034118629e-05, + "loss": 0.8584, + "step": 11049 + }, + { + "epoch": 0.5380663696345531, + "grad_norm": 1.3746938705444336, + "learning_rate": 1.85077947467452e-05, + "loss": 0.7514, + "step": 11050 + }, + { + "epoch": 0.5381150634236603, + "grad_norm": 1.3922510147094727, + "learning_rate": 1.8504649189422862e-05, + "loss": 0.7434, + "step": 11051 + }, + { + "epoch": 0.5381637572127675, + "grad_norm": 1.9857420921325684, + "learning_rate": 1.85015036692975e-05, + "loss": 0.7558, + "step": 11052 + }, + { + "epoch": 0.5382124510018748, + "grad_norm": 1.4043865203857422, + "learning_rate": 1.8498358186447382e-05, + "loss": 0.7281, + "step": 11053 + }, + { + "epoch": 0.5382611447909819, + "grad_norm": 1.2796436548233032, + "learning_rate": 1.849521274095073e-05, + "loss": 0.7791, + "step": 11054 + }, + { + "epoch": 0.5383098385800891, + "grad_norm": 1.6784731149673462, + "learning_rate": 1.8492067332885803e-05, + "loss": 0.956, + "step": 11055 + }, + { + "epoch": 0.5383585323691963, + "grad_norm": 1.526412010192871, + "learning_rate": 1.8488921962330834e-05, + "loss": 0.8282, + "step": 11056 + }, + { + "epoch": 0.5384072261583035, + "grad_norm": 1.4770036935806274, + "learning_rate": 1.8485776629364073e-05, + "loss": 0.8293, + "step": 11057 + }, + { + "epoch": 0.5384559199474107, + "grad_norm": 1.1327447891235352, + "learning_rate": 1.848263133406376e-05, + "loss": 0.8193, + "step": 11058 + }, + { + "epoch": 0.5385046137365179, + "grad_norm": 1.2267426252365112, + "learning_rate": 1.8479486076508125e-05, + "loss": 0.8115, + "step": 11059 + }, + { + "epoch": 0.5385533075256251, + "grad_norm": 1.4202088117599487, + "learning_rate": 1.847634085677542e-05, + "loss": 0.8526, + "step": 11060 + }, + { + "epoch": 0.5386020013147323, + "grad_norm": 1.2827214002609253, + "learning_rate": 1.8473195674943863e-05, + "loss": 0.8032, + "step": 11061 + }, + { + "epoch": 0.5386506951038396, + "grad_norm": 0.1002291813492775, + "learning_rate": 1.847005053109172e-05, + "loss": 0.7057, + "step": 11062 + }, + { + "epoch": 0.5386993888929467, + "grad_norm": 1.4391460418701172, + "learning_rate": 1.8466905425297196e-05, + "loss": 0.7622, + "step": 11063 + }, + { + "epoch": 0.5387480826820539, + "grad_norm": 1.6508227586746216, + "learning_rate": 1.8463760357638542e-05, + "loss": 0.7837, + "step": 11064 + }, + { + "epoch": 0.5387967764711611, + "grad_norm": 0.09083419293165207, + "learning_rate": 1.8460615328193986e-05, + "loss": 0.612, + "step": 11065 + }, + { + "epoch": 0.5388454702602683, + "grad_norm": 1.6164342164993286, + "learning_rate": 1.8457470337041774e-05, + "loss": 0.8613, + "step": 11066 + }, + { + "epoch": 0.5388941640493755, + "grad_norm": 1.680602788925171, + "learning_rate": 1.8454325384260117e-05, + "loss": 0.7845, + "step": 11067 + }, + { + "epoch": 0.5389428578384827, + "grad_norm": 1.5578008890151978, + "learning_rate": 1.8451180469927258e-05, + "loss": 0.7985, + "step": 11068 + }, + { + "epoch": 0.5389915516275899, + "grad_norm": 1.4517406225204468, + "learning_rate": 1.8448035594121438e-05, + "loss": 0.7923, + "step": 11069 + }, + { + "epoch": 0.5390402454166972, + "grad_norm": 1.7974483966827393, + "learning_rate": 1.8444890756920862e-05, + "loss": 0.9081, + "step": 11070 + }, + { + "epoch": 0.5390889392058043, + "grad_norm": 1.568825602531433, + "learning_rate": 1.8441745958403776e-05, + "loss": 0.8296, + "step": 11071 + }, + { + "epoch": 0.5391376329949115, + "grad_norm": 1.714536190032959, + "learning_rate": 1.8438601198648396e-05, + "loss": 0.8304, + "step": 11072 + }, + { + "epoch": 0.5391863267840187, + "grad_norm": 3.6031930446624756, + "learning_rate": 1.843545647773296e-05, + "loss": 0.7718, + "step": 11073 + }, + { + "epoch": 0.5392350205731259, + "grad_norm": 1.6792014837265015, + "learning_rate": 1.8432311795735683e-05, + "loss": 0.8775, + "step": 11074 + }, + { + "epoch": 0.5392837143622331, + "grad_norm": 1.2487739324569702, + "learning_rate": 1.84291671527348e-05, + "loss": 0.803, + "step": 11075 + }, + { + "epoch": 0.5393324081513403, + "grad_norm": 1.6890735626220703, + "learning_rate": 1.8426022548808516e-05, + "loss": 0.7652, + "step": 11076 + }, + { + "epoch": 0.5393811019404475, + "grad_norm": 1.4368340969085693, + "learning_rate": 1.8422877984035068e-05, + "loss": 0.8305, + "step": 11077 + }, + { + "epoch": 0.5394297957295547, + "grad_norm": 2.207240104675293, + "learning_rate": 1.8419733458492676e-05, + "loss": 0.8529, + "step": 11078 + }, + { + "epoch": 0.5394784895186618, + "grad_norm": 1.2883319854736328, + "learning_rate": 1.8416588972259555e-05, + "loss": 0.7724, + "step": 11079 + }, + { + "epoch": 0.539527183307769, + "grad_norm": 1.6697425842285156, + "learning_rate": 1.8413444525413937e-05, + "loss": 0.8166, + "step": 11080 + }, + { + "epoch": 0.5395758770968763, + "grad_norm": 1.7301465272903442, + "learning_rate": 1.841030011803402e-05, + "loss": 0.823, + "step": 11081 + }, + { + "epoch": 0.5396245708859835, + "grad_norm": 1.5588819980621338, + "learning_rate": 1.8407155750198046e-05, + "loss": 0.7708, + "step": 11082 + }, + { + "epoch": 0.5396732646750907, + "grad_norm": 1.571081280708313, + "learning_rate": 1.8404011421984203e-05, + "loss": 0.8987, + "step": 11083 + }, + { + "epoch": 0.5397219584641979, + "grad_norm": 1.6212648153305054, + "learning_rate": 1.840086713347073e-05, + "loss": 0.7687, + "step": 11084 + }, + { + "epoch": 0.5397706522533051, + "grad_norm": 2.168705701828003, + "learning_rate": 1.8397722884735826e-05, + "loss": 0.8211, + "step": 11085 + }, + { + "epoch": 0.5398193460424123, + "grad_norm": 1.4294040203094482, + "learning_rate": 1.839457867585772e-05, + "loss": 0.8293, + "step": 11086 + }, + { + "epoch": 0.5398680398315195, + "grad_norm": 1.7909680604934692, + "learning_rate": 1.839143450691461e-05, + "loss": 0.8192, + "step": 11087 + }, + { + "epoch": 0.5399167336206266, + "grad_norm": 1.400752067565918, + "learning_rate": 1.8388290377984713e-05, + "loss": 0.7595, + "step": 11088 + }, + { + "epoch": 0.5399654274097339, + "grad_norm": 2.0203092098236084, + "learning_rate": 1.8385146289146247e-05, + "loss": 0.7571, + "step": 11089 + }, + { + "epoch": 0.5400141211988411, + "grad_norm": 1.2660942077636719, + "learning_rate": 1.8382002240477407e-05, + "loss": 0.9006, + "step": 11090 + }, + { + "epoch": 0.5400628149879483, + "grad_norm": 2.1192305088043213, + "learning_rate": 1.8378858232056415e-05, + "loss": 0.8524, + "step": 11091 + }, + { + "epoch": 0.5401115087770555, + "grad_norm": 1.3147295713424683, + "learning_rate": 1.8375714263961463e-05, + "loss": 0.8157, + "step": 11092 + }, + { + "epoch": 0.5401602025661627, + "grad_norm": 2.0820558071136475, + "learning_rate": 1.8372570336270774e-05, + "loss": 0.9093, + "step": 11093 + }, + { + "epoch": 0.5402088963552699, + "grad_norm": 1.7798634767532349, + "learning_rate": 1.8369426449062542e-05, + "loss": 0.7376, + "step": 11094 + }, + { + "epoch": 0.5402575901443771, + "grad_norm": 1.4442343711853027, + "learning_rate": 1.8366282602414983e-05, + "loss": 0.8415, + "step": 11095 + }, + { + "epoch": 0.5403062839334842, + "grad_norm": 1.4316890239715576, + "learning_rate": 1.8363138796406287e-05, + "loss": 0.8137, + "step": 11096 + }, + { + "epoch": 0.5403549777225914, + "grad_norm": 1.5147817134857178, + "learning_rate": 1.8359995031114658e-05, + "loss": 0.8008, + "step": 11097 + }, + { + "epoch": 0.5404036715116987, + "grad_norm": 0.09303796291351318, + "learning_rate": 1.8356851306618317e-05, + "loss": 0.6167, + "step": 11098 + }, + { + "epoch": 0.5404523653008059, + "grad_norm": 1.443434238433838, + "learning_rate": 1.8353707622995432e-05, + "loss": 0.832, + "step": 11099 + }, + { + "epoch": 0.5405010590899131, + "grad_norm": 1.6020761728286743, + "learning_rate": 1.8350563980324233e-05, + "loss": 0.8503, + "step": 11100 + }, + { + "epoch": 0.5405497528790203, + "grad_norm": 1.6439526081085205, + "learning_rate": 1.8347420378682896e-05, + "loss": 0.8903, + "step": 11101 + }, + { + "epoch": 0.5405984466681275, + "grad_norm": 1.9264439344406128, + "learning_rate": 1.8344276818149636e-05, + "loss": 0.8193, + "step": 11102 + }, + { + "epoch": 0.5406471404572347, + "grad_norm": 2.657308340072632, + "learning_rate": 1.834113329880264e-05, + "loss": 0.8411, + "step": 11103 + }, + { + "epoch": 0.5406958342463419, + "grad_norm": 1.5230960845947266, + "learning_rate": 1.8337989820720105e-05, + "loss": 0.8029, + "step": 11104 + }, + { + "epoch": 0.540744528035449, + "grad_norm": 1.5402345657348633, + "learning_rate": 1.8334846383980217e-05, + "loss": 0.8216, + "step": 11105 + }, + { + "epoch": 0.5407932218245562, + "grad_norm": 3.1802985668182373, + "learning_rate": 1.833170298866118e-05, + "loss": 0.9166, + "step": 11106 + }, + { + "epoch": 0.5408419156136635, + "grad_norm": 2.585092067718506, + "learning_rate": 1.8328559634841186e-05, + "loss": 0.7408, + "step": 11107 + }, + { + "epoch": 0.5408906094027707, + "grad_norm": 2.424802541732788, + "learning_rate": 1.832541632259842e-05, + "loss": 0.8257, + "step": 11108 + }, + { + "epoch": 0.5409393031918779, + "grad_norm": 2.7245614528656006, + "learning_rate": 1.8322273052011084e-05, + "loss": 0.8768, + "step": 11109 + }, + { + "epoch": 0.5409879969809851, + "grad_norm": 1.6313598155975342, + "learning_rate": 1.831912982315735e-05, + "loss": 0.8133, + "step": 11110 + }, + { + "epoch": 0.5410366907700923, + "grad_norm": 1.6567401885986328, + "learning_rate": 1.8315986636115425e-05, + "loss": 0.7718, + "step": 11111 + }, + { + "epoch": 0.5410853845591995, + "grad_norm": 2.0743489265441895, + "learning_rate": 1.8312843490963474e-05, + "loss": 0.7922, + "step": 11112 + }, + { + "epoch": 0.5411340783483066, + "grad_norm": 1.9662775993347168, + "learning_rate": 1.83097003877797e-05, + "loss": 0.7386, + "step": 11113 + }, + { + "epoch": 0.5411827721374138, + "grad_norm": 1.2557127475738525, + "learning_rate": 1.8306557326642285e-05, + "loss": 0.8089, + "step": 11114 + }, + { + "epoch": 0.541231465926521, + "grad_norm": 1.4587368965148926, + "learning_rate": 1.830341430762941e-05, + "loss": 0.8683, + "step": 11115 + }, + { + "epoch": 0.5412801597156283, + "grad_norm": 1.5707789659500122, + "learning_rate": 1.8300271330819254e-05, + "loss": 0.9048, + "step": 11116 + }, + { + "epoch": 0.5413288535047355, + "grad_norm": 1.3934624195098877, + "learning_rate": 1.8297128396290004e-05, + "loss": 0.8327, + "step": 11117 + }, + { + "epoch": 0.5413775472938427, + "grad_norm": 1.4523353576660156, + "learning_rate": 1.829398550411985e-05, + "loss": 0.7801, + "step": 11118 + }, + { + "epoch": 0.5414262410829499, + "grad_norm": 1.819851279258728, + "learning_rate": 1.8290842654386955e-05, + "loss": 0.9238, + "step": 11119 + }, + { + "epoch": 0.5414749348720571, + "grad_norm": 1.7711237668991089, + "learning_rate": 1.8287699847169508e-05, + "loss": 0.8134, + "step": 11120 + }, + { + "epoch": 0.5415236286611642, + "grad_norm": 1.9645116329193115, + "learning_rate": 1.8284557082545678e-05, + "loss": 0.7437, + "step": 11121 + }, + { + "epoch": 0.5415723224502714, + "grad_norm": 1.6773512363433838, + "learning_rate": 1.828141436059365e-05, + "loss": 0.7116, + "step": 11122 + }, + { + "epoch": 0.5416210162393786, + "grad_norm": 1.3987665176391602, + "learning_rate": 1.8278271681391594e-05, + "loss": 0.8861, + "step": 11123 + }, + { + "epoch": 0.5416697100284859, + "grad_norm": 1.2096095085144043, + "learning_rate": 1.8275129045017697e-05, + "loss": 0.9268, + "step": 11124 + }, + { + "epoch": 0.5417184038175931, + "grad_norm": 1.2578256130218506, + "learning_rate": 1.827198645155011e-05, + "loss": 0.8523, + "step": 11125 + }, + { + "epoch": 0.5417670976067003, + "grad_norm": 1.4552767276763916, + "learning_rate": 1.826884390106702e-05, + "loss": 0.8289, + "step": 11126 + }, + { + "epoch": 0.5418157913958075, + "grad_norm": 1.731009840965271, + "learning_rate": 1.8265701393646605e-05, + "loss": 0.8844, + "step": 11127 + }, + { + "epoch": 0.5418644851849147, + "grad_norm": 1.5349171161651611, + "learning_rate": 1.8262558929367015e-05, + "loss": 0.8446, + "step": 11128 + }, + { + "epoch": 0.5419131789740219, + "grad_norm": 18.193485260009766, + "learning_rate": 1.8259416508306437e-05, + "loss": 0.7876, + "step": 11129 + }, + { + "epoch": 0.541961872763129, + "grad_norm": 1.5557960271835327, + "learning_rate": 1.8256274130543023e-05, + "loss": 0.8621, + "step": 11130 + }, + { + "epoch": 0.5420105665522362, + "grad_norm": 1.3906069993972778, + "learning_rate": 1.8253131796154962e-05, + "loss": 0.8793, + "step": 11131 + }, + { + "epoch": 0.5420592603413434, + "grad_norm": 2.076578378677368, + "learning_rate": 1.8249989505220397e-05, + "loss": 0.8894, + "step": 11132 + }, + { + "epoch": 0.5421079541304507, + "grad_norm": 1.6008111238479614, + "learning_rate": 1.824684725781751e-05, + "loss": 0.8987, + "step": 11133 + }, + { + "epoch": 0.5421566479195579, + "grad_norm": 1.620957374572754, + "learning_rate": 1.824370505402445e-05, + "loss": 0.8535, + "step": 11134 + }, + { + "epoch": 0.5422053417086651, + "grad_norm": 2.3224008083343506, + "learning_rate": 1.824056289391939e-05, + "loss": 0.9411, + "step": 11135 + }, + { + "epoch": 0.5422540354977723, + "grad_norm": 1.6327334642410278, + "learning_rate": 1.8237420777580485e-05, + "loss": 0.7971, + "step": 11136 + }, + { + "epoch": 0.5423027292868795, + "grad_norm": 1.5587832927703857, + "learning_rate": 1.8234278705085898e-05, + "loss": 0.9014, + "step": 11137 + }, + { + "epoch": 0.5423514230759866, + "grad_norm": 1.6566832065582275, + "learning_rate": 1.82311366765138e-05, + "loss": 0.9017, + "step": 11138 + }, + { + "epoch": 0.5424001168650938, + "grad_norm": 2.048626184463501, + "learning_rate": 1.822799469194233e-05, + "loss": 0.7792, + "step": 11139 + }, + { + "epoch": 0.542448810654201, + "grad_norm": 1.7550894021987915, + "learning_rate": 1.822485275144966e-05, + "loss": 0.8015, + "step": 11140 + }, + { + "epoch": 0.5424975044433082, + "grad_norm": 1.614999532699585, + "learning_rate": 1.8221710855113932e-05, + "loss": 0.8341, + "step": 11141 + }, + { + "epoch": 0.5425461982324155, + "grad_norm": 2.516221046447754, + "learning_rate": 1.8218569003013313e-05, + "loss": 0.7936, + "step": 11142 + }, + { + "epoch": 0.5425948920215227, + "grad_norm": 1.8498492240905762, + "learning_rate": 1.8215427195225952e-05, + "loss": 0.7623, + "step": 11143 + }, + { + "epoch": 0.5426435858106299, + "grad_norm": 1.530278205871582, + "learning_rate": 1.821228543183e-05, + "loss": 0.9002, + "step": 11144 + }, + { + "epoch": 0.5426922795997371, + "grad_norm": 1.3162109851837158, + "learning_rate": 1.820914371290361e-05, + "loss": 0.8019, + "step": 11145 + }, + { + "epoch": 0.5427409733888443, + "grad_norm": 1.3985563516616821, + "learning_rate": 1.8206002038524932e-05, + "loss": 0.851, + "step": 11146 + }, + { + "epoch": 0.5427896671779514, + "grad_norm": 2.0367140769958496, + "learning_rate": 1.8202860408772126e-05, + "loss": 0.8418, + "step": 11147 + }, + { + "epoch": 0.5428383609670586, + "grad_norm": 1.5876481533050537, + "learning_rate": 1.8199718823723326e-05, + "loss": 0.8529, + "step": 11148 + }, + { + "epoch": 0.5428870547561658, + "grad_norm": 1.9071444272994995, + "learning_rate": 1.8196577283456687e-05, + "loss": 0.8212, + "step": 11149 + }, + { + "epoch": 0.542935748545273, + "grad_norm": 2.1428980827331543, + "learning_rate": 1.8193435788050347e-05, + "loss": 0.8409, + "step": 11150 + }, + { + "epoch": 0.5429844423343803, + "grad_norm": 1.9024688005447388, + "learning_rate": 1.8190294337582458e-05, + "loss": 0.7924, + "step": 11151 + }, + { + "epoch": 0.5430331361234875, + "grad_norm": 2.2633495330810547, + "learning_rate": 1.8187152932131162e-05, + "loss": 0.822, + "step": 11152 + }, + { + "epoch": 0.5430818299125947, + "grad_norm": 2.7126715183258057, + "learning_rate": 1.818401157177461e-05, + "loss": 0.8625, + "step": 11153 + }, + { + "epoch": 0.5431305237017019, + "grad_norm": 1.79611337184906, + "learning_rate": 1.8180870256590927e-05, + "loss": 0.8373, + "step": 11154 + }, + { + "epoch": 0.543179217490809, + "grad_norm": 1.5017852783203125, + "learning_rate": 1.817772898665827e-05, + "loss": 0.7669, + "step": 11155 + }, + { + "epoch": 0.5432279112799162, + "grad_norm": 1.8659799098968506, + "learning_rate": 1.817458776205476e-05, + "loss": 0.8346, + "step": 11156 + }, + { + "epoch": 0.5432766050690234, + "grad_norm": 0.09100009500980377, + "learning_rate": 1.8171446582858546e-05, + "loss": 0.6212, + "step": 11157 + }, + { + "epoch": 0.5433252988581306, + "grad_norm": 1.8380036354064941, + "learning_rate": 1.8168305449147772e-05, + "loss": 0.7629, + "step": 11158 + }, + { + "epoch": 0.5433739926472378, + "grad_norm": 1.6417447328567505, + "learning_rate": 1.816516436100056e-05, + "loss": 0.8478, + "step": 11159 + }, + { + "epoch": 0.5434226864363451, + "grad_norm": 1.5260727405548096, + "learning_rate": 1.8162023318495067e-05, + "loss": 0.7992, + "step": 11160 + }, + { + "epoch": 0.5434713802254523, + "grad_norm": 1.5955924987792969, + "learning_rate": 1.8158882321709396e-05, + "loss": 0.8199, + "step": 11161 + }, + { + "epoch": 0.5435200740145595, + "grad_norm": 3.063042640686035, + "learning_rate": 1.8155741370721708e-05, + "loss": 0.7808, + "step": 11162 + }, + { + "epoch": 0.5435687678036667, + "grad_norm": 1.1860769987106323, + "learning_rate": 1.8152600465610108e-05, + "loss": 0.8568, + "step": 11163 + }, + { + "epoch": 0.5436174615927738, + "grad_norm": 1.4643478393554688, + "learning_rate": 1.8149459606452748e-05, + "loss": 0.9332, + "step": 11164 + }, + { + "epoch": 0.543666155381881, + "grad_norm": 2.4915738105773926, + "learning_rate": 1.8146318793327743e-05, + "loss": 0.8296, + "step": 11165 + }, + { + "epoch": 0.5437148491709882, + "grad_norm": 1.184104084968567, + "learning_rate": 1.8143178026313234e-05, + "loss": 0.7824, + "step": 11166 + }, + { + "epoch": 0.5437635429600954, + "grad_norm": 2.3204076290130615, + "learning_rate": 1.8140037305487335e-05, + "loss": 0.7951, + "step": 11167 + }, + { + "epoch": 0.5438122367492026, + "grad_norm": 1.6968125104904175, + "learning_rate": 1.813689663092818e-05, + "loss": 0.8355, + "step": 11168 + }, + { + "epoch": 0.5438609305383099, + "grad_norm": 1.9402049779891968, + "learning_rate": 1.8133756002713896e-05, + "loss": 0.8581, + "step": 11169 + }, + { + "epoch": 0.5439096243274171, + "grad_norm": 6.833459854125977, + "learning_rate": 1.8130615420922594e-05, + "loss": 0.901, + "step": 11170 + }, + { + "epoch": 0.5439583181165243, + "grad_norm": 1.6306302547454834, + "learning_rate": 1.8127474885632413e-05, + "loss": 0.8146, + "step": 11171 + }, + { + "epoch": 0.5440070119056314, + "grad_norm": 1.5522634983062744, + "learning_rate": 1.8124334396921453e-05, + "loss": 0.8216, + "step": 11172 + }, + { + "epoch": 0.5440557056947386, + "grad_norm": 1.4963834285736084, + "learning_rate": 1.8121193954867854e-05, + "loss": 0.706, + "step": 11173 + }, + { + "epoch": 0.5441043994838458, + "grad_norm": 1.4938900470733643, + "learning_rate": 1.811805355954972e-05, + "loss": 0.8642, + "step": 11174 + }, + { + "epoch": 0.544153093272953, + "grad_norm": 2.1917645931243896, + "learning_rate": 1.8114913211045188e-05, + "loss": 0.8259, + "step": 11175 + }, + { + "epoch": 0.5442017870620602, + "grad_norm": 1.5397870540618896, + "learning_rate": 1.8111772909432352e-05, + "loss": 0.9163, + "step": 11176 + }, + { + "epoch": 0.5442504808511675, + "grad_norm": 2.1096510887145996, + "learning_rate": 1.8108632654789332e-05, + "loss": 0.8072, + "step": 11177 + }, + { + "epoch": 0.5442991746402747, + "grad_norm": 2.9154481887817383, + "learning_rate": 1.810549244719426e-05, + "loss": 0.8102, + "step": 11178 + }, + { + "epoch": 0.5443478684293819, + "grad_norm": 3.023655652999878, + "learning_rate": 1.8102352286725227e-05, + "loss": 0.8946, + "step": 11179 + }, + { + "epoch": 0.544396562218489, + "grad_norm": 1.4030410051345825, + "learning_rate": 1.8099212173460358e-05, + "loss": 0.8093, + "step": 11180 + }, + { + "epoch": 0.5444452560075962, + "grad_norm": 1.8830690383911133, + "learning_rate": 1.8096072107477752e-05, + "loss": 0.8429, + "step": 11181 + }, + { + "epoch": 0.5444939497967034, + "grad_norm": 3.4093706607818604, + "learning_rate": 1.809293208885554e-05, + "loss": 0.8331, + "step": 11182 + }, + { + "epoch": 0.5445426435858106, + "grad_norm": 1.6337028741836548, + "learning_rate": 1.8089792117671805e-05, + "loss": 0.8046, + "step": 11183 + }, + { + "epoch": 0.5445913373749178, + "grad_norm": 2.287348747253418, + "learning_rate": 1.8086652194004673e-05, + "loss": 0.8167, + "step": 11184 + }, + { + "epoch": 0.544640031164025, + "grad_norm": 1.3549704551696777, + "learning_rate": 1.8083512317932235e-05, + "loss": 0.9044, + "step": 11185 + }, + { + "epoch": 0.5446887249531323, + "grad_norm": 1.6137733459472656, + "learning_rate": 1.8080372489532603e-05, + "loss": 0.7204, + "step": 11186 + }, + { + "epoch": 0.5447374187422395, + "grad_norm": 1.506106972694397, + "learning_rate": 1.8077232708883884e-05, + "loss": 0.8932, + "step": 11187 + }, + { + "epoch": 0.5447861125313467, + "grad_norm": 4.708857536315918, + "learning_rate": 1.807409297606417e-05, + "loss": 0.9261, + "step": 11188 + }, + { + "epoch": 0.5448348063204538, + "grad_norm": 1.792576789855957, + "learning_rate": 1.8070953291151582e-05, + "loss": 0.7979, + "step": 11189 + }, + { + "epoch": 0.544883500109561, + "grad_norm": 1.8267964124679565, + "learning_rate": 1.8067813654224195e-05, + "loss": 0.7791, + "step": 11190 + }, + { + "epoch": 0.5449321938986682, + "grad_norm": 1.719168782234192, + "learning_rate": 1.806467406536013e-05, + "loss": 0.8779, + "step": 11191 + }, + { + "epoch": 0.5449808876877754, + "grad_norm": 1.3709583282470703, + "learning_rate": 1.8061534524637465e-05, + "loss": 0.863, + "step": 11192 + }, + { + "epoch": 0.5450295814768826, + "grad_norm": 2.40859317779541, + "learning_rate": 1.805839503213431e-05, + "loss": 0.8916, + "step": 11193 + }, + { + "epoch": 0.5450782752659898, + "grad_norm": 2.142580986022949, + "learning_rate": 1.8055255587928747e-05, + "loss": 0.8887, + "step": 11194 + }, + { + "epoch": 0.545126969055097, + "grad_norm": 1.4019790887832642, + "learning_rate": 1.8052116192098887e-05, + "loss": 0.8946, + "step": 11195 + }, + { + "epoch": 0.5451756628442043, + "grad_norm": 1.4039400815963745, + "learning_rate": 1.8048976844722818e-05, + "loss": 0.8808, + "step": 11196 + }, + { + "epoch": 0.5452243566333114, + "grad_norm": 1.5863046646118164, + "learning_rate": 1.8045837545878616e-05, + "loss": 0.8282, + "step": 11197 + }, + { + "epoch": 0.5452730504224186, + "grad_norm": 2.640697479248047, + "learning_rate": 1.8042698295644396e-05, + "loss": 0.8021, + "step": 11198 + }, + { + "epoch": 0.5453217442115258, + "grad_norm": 1.7387531995773315, + "learning_rate": 1.8039559094098225e-05, + "loss": 0.8081, + "step": 11199 + }, + { + "epoch": 0.545370438000633, + "grad_norm": 1.9303101301193237, + "learning_rate": 1.8036419941318208e-05, + "loss": 0.7868, + "step": 11200 + }, + { + "epoch": 0.5454191317897402, + "grad_norm": 2.969308614730835, + "learning_rate": 1.8033280837382417e-05, + "loss": 0.7406, + "step": 11201 + }, + { + "epoch": 0.5454678255788474, + "grad_norm": 1.3348487615585327, + "learning_rate": 1.803014178236895e-05, + "loss": 0.8849, + "step": 11202 + }, + { + "epoch": 0.5455165193679546, + "grad_norm": 1.6615300178527832, + "learning_rate": 1.802700277635588e-05, + "loss": 0.8182, + "step": 11203 + }, + { + "epoch": 0.5455652131570619, + "grad_norm": 1.285965919494629, + "learning_rate": 1.80238638194213e-05, + "loss": 0.8092, + "step": 11204 + }, + { + "epoch": 0.5456139069461691, + "grad_norm": 1.644997000694275, + "learning_rate": 1.8020724911643283e-05, + "loss": 0.8002, + "step": 11205 + }, + { + "epoch": 0.5456626007352762, + "grad_norm": 1.6890108585357666, + "learning_rate": 1.8017586053099916e-05, + "loss": 0.8343, + "step": 11206 + }, + { + "epoch": 0.5457112945243834, + "grad_norm": 2.8111910820007324, + "learning_rate": 1.8014447243869283e-05, + "loss": 0.7431, + "step": 11207 + }, + { + "epoch": 0.5457599883134906, + "grad_norm": 1.5948593616485596, + "learning_rate": 1.8011308484029446e-05, + "loss": 0.8493, + "step": 11208 + }, + { + "epoch": 0.5458086821025978, + "grad_norm": 1.6249645948410034, + "learning_rate": 1.8008169773658498e-05, + "loss": 0.7633, + "step": 11209 + }, + { + "epoch": 0.545857375891705, + "grad_norm": 2.7984447479248047, + "learning_rate": 1.8005031112834505e-05, + "loss": 0.9142, + "step": 11210 + }, + { + "epoch": 0.5459060696808122, + "grad_norm": 2.945950746536255, + "learning_rate": 1.800189250163555e-05, + "loss": 0.739, + "step": 11211 + }, + { + "epoch": 0.5459547634699194, + "grad_norm": 1.6976561546325684, + "learning_rate": 1.7998753940139696e-05, + "loss": 0.8164, + "step": 11212 + }, + { + "epoch": 0.5460034572590267, + "grad_norm": 1.7035220861434937, + "learning_rate": 1.7995615428425028e-05, + "loss": 0.8652, + "step": 11213 + }, + { + "epoch": 0.5460521510481338, + "grad_norm": 1.9360153675079346, + "learning_rate": 1.79924769665696e-05, + "loss": 0.851, + "step": 11214 + }, + { + "epoch": 0.546100844837241, + "grad_norm": 1.8619238138198853, + "learning_rate": 1.7989338554651493e-05, + "loss": 0.8674, + "step": 11215 + }, + { + "epoch": 0.5461495386263482, + "grad_norm": 1.4074395895004272, + "learning_rate": 1.7986200192748775e-05, + "loss": 0.8353, + "step": 11216 + }, + { + "epoch": 0.5461982324154554, + "grad_norm": 1.7622785568237305, + "learning_rate": 1.7983061880939504e-05, + "loss": 0.936, + "step": 11217 + }, + { + "epoch": 0.5462469262045626, + "grad_norm": 1.50645112991333, + "learning_rate": 1.7979923619301763e-05, + "loss": 0.7725, + "step": 11218 + }, + { + "epoch": 0.5462956199936698, + "grad_norm": 1.637966513633728, + "learning_rate": 1.79767854079136e-05, + "loss": 0.8514, + "step": 11219 + }, + { + "epoch": 0.546344313782777, + "grad_norm": 1.8121333122253418, + "learning_rate": 1.797364724685309e-05, + "loss": 0.8734, + "step": 11220 + }, + { + "epoch": 0.5463930075718842, + "grad_norm": 1.7667628526687622, + "learning_rate": 1.7970509136198285e-05, + "loss": 0.7503, + "step": 11221 + }, + { + "epoch": 0.5464417013609915, + "grad_norm": 1.4365406036376953, + "learning_rate": 1.7967371076027252e-05, + "loss": 0.8508, + "step": 11222 + }, + { + "epoch": 0.5464903951500986, + "grad_norm": 2.7886388301849365, + "learning_rate": 1.796423306641805e-05, + "loss": 0.8842, + "step": 11223 + }, + { + "epoch": 0.5465390889392058, + "grad_norm": 1.7354345321655273, + "learning_rate": 1.796109510744873e-05, + "loss": 0.844, + "step": 11224 + }, + { + "epoch": 0.546587782728313, + "grad_norm": 1.962769627571106, + "learning_rate": 1.7957957199197357e-05, + "loss": 0.8632, + "step": 11225 + }, + { + "epoch": 0.5466364765174202, + "grad_norm": 2.6663436889648438, + "learning_rate": 1.7954819341741983e-05, + "loss": 0.8105, + "step": 11226 + }, + { + "epoch": 0.5466851703065274, + "grad_norm": 1.3460640907287598, + "learning_rate": 1.7951681535160675e-05, + "loss": 0.8179, + "step": 11227 + }, + { + "epoch": 0.5467338640956346, + "grad_norm": 1.4396307468414307, + "learning_rate": 1.7948543779531466e-05, + "loss": 0.7934, + "step": 11228 + }, + { + "epoch": 0.5467825578847418, + "grad_norm": 1.5163143873214722, + "learning_rate": 1.7945406074932423e-05, + "loss": 0.7263, + "step": 11229 + }, + { + "epoch": 0.546831251673849, + "grad_norm": 1.3784598112106323, + "learning_rate": 1.7942268421441587e-05, + "loss": 0.9132, + "step": 11230 + }, + { + "epoch": 0.5468799454629562, + "grad_norm": 1.3466057777404785, + "learning_rate": 1.7939130819137013e-05, + "loss": 0.7451, + "step": 11231 + }, + { + "epoch": 0.5469286392520634, + "grad_norm": 1.8028045892715454, + "learning_rate": 1.7935993268096743e-05, + "loss": 0.9133, + "step": 11232 + }, + { + "epoch": 0.5469773330411706, + "grad_norm": 1.3635499477386475, + "learning_rate": 1.793285576839884e-05, + "loss": 0.8764, + "step": 11233 + }, + { + "epoch": 0.5470260268302778, + "grad_norm": 1.2696764469146729, + "learning_rate": 1.7929718320121328e-05, + "loss": 0.8219, + "step": 11234 + }, + { + "epoch": 0.547074720619385, + "grad_norm": 1.8658944368362427, + "learning_rate": 1.792658092334226e-05, + "loss": 0.8552, + "step": 11235 + }, + { + "epoch": 0.5471234144084922, + "grad_norm": 1.2932844161987305, + "learning_rate": 1.792344357813969e-05, + "loss": 0.8578, + "step": 11236 + }, + { + "epoch": 0.5471721081975994, + "grad_norm": 2.519221782684326, + "learning_rate": 1.7920306284591644e-05, + "loss": 0.8073, + "step": 11237 + }, + { + "epoch": 0.5472208019867066, + "grad_norm": 3.161590099334717, + "learning_rate": 1.7917169042776174e-05, + "loss": 0.8059, + "step": 11238 + }, + { + "epoch": 0.5472694957758137, + "grad_norm": 1.2236024141311646, + "learning_rate": 1.7914031852771304e-05, + "loss": 0.8894, + "step": 11239 + }, + { + "epoch": 0.547318189564921, + "grad_norm": 1.4402711391448975, + "learning_rate": 1.7910894714655096e-05, + "loss": 0.8471, + "step": 11240 + }, + { + "epoch": 0.5473668833540282, + "grad_norm": 1.4039556980133057, + "learning_rate": 1.7907757628505563e-05, + "loss": 0.8195, + "step": 11241 + }, + { + "epoch": 0.5474155771431354, + "grad_norm": 2.724821090698242, + "learning_rate": 1.7904620594400755e-05, + "loss": 0.8479, + "step": 11242 + }, + { + "epoch": 0.5474642709322426, + "grad_norm": 1.1284000873565674, + "learning_rate": 1.7901483612418697e-05, + "loss": 0.7228, + "step": 11243 + }, + { + "epoch": 0.5475129647213498, + "grad_norm": 1.295093059539795, + "learning_rate": 1.7898346682637425e-05, + "loss": 0.8147, + "step": 11244 + }, + { + "epoch": 0.547561658510457, + "grad_norm": 1.8892546892166138, + "learning_rate": 1.7895209805134966e-05, + "loss": 0.7419, + "step": 11245 + }, + { + "epoch": 0.5476103522995642, + "grad_norm": 1.503868579864502, + "learning_rate": 1.7892072979989362e-05, + "loss": 0.8185, + "step": 11246 + }, + { + "epoch": 0.5476590460886714, + "grad_norm": 1.0489232540130615, + "learning_rate": 1.7888936207278638e-05, + "loss": 0.7873, + "step": 11247 + }, + { + "epoch": 0.5477077398777785, + "grad_norm": 1.2994005680084229, + "learning_rate": 1.788579948708081e-05, + "loss": 0.7659, + "step": 11248 + }, + { + "epoch": 0.5477564336668858, + "grad_norm": 1.2727330923080444, + "learning_rate": 1.788266281947392e-05, + "loss": 0.7994, + "step": 11249 + }, + { + "epoch": 0.547805127455993, + "grad_norm": 2.578963279724121, + "learning_rate": 1.7879526204535978e-05, + "loss": 0.8719, + "step": 11250 + }, + { + "epoch": 0.5478538212451002, + "grad_norm": 2.1081132888793945, + "learning_rate": 1.7876389642345024e-05, + "loss": 0.8276, + "step": 11251 + }, + { + "epoch": 0.5479025150342074, + "grad_norm": 2.1056666374206543, + "learning_rate": 1.7873253132979065e-05, + "loss": 0.8434, + "step": 11252 + }, + { + "epoch": 0.5479512088233146, + "grad_norm": 2.082577705383301, + "learning_rate": 1.7870116676516132e-05, + "loss": 0.78, + "step": 11253 + }, + { + "epoch": 0.5479999026124218, + "grad_norm": 1.5663081407546997, + "learning_rate": 1.7866980273034232e-05, + "loss": 0.77, + "step": 11254 + }, + { + "epoch": 0.548048596401529, + "grad_norm": 1.672661304473877, + "learning_rate": 1.7863843922611398e-05, + "loss": 0.8342, + "step": 11255 + }, + { + "epoch": 0.5480972901906361, + "grad_norm": 1.4600732326507568, + "learning_rate": 1.7860707625325647e-05, + "loss": 0.8092, + "step": 11256 + }, + { + "epoch": 0.5481459839797433, + "grad_norm": 1.7834172248840332, + "learning_rate": 1.7857571381254978e-05, + "loss": 0.851, + "step": 11257 + }, + { + "epoch": 0.5481946777688506, + "grad_norm": 1.8310050964355469, + "learning_rate": 1.7854435190477427e-05, + "loss": 0.8938, + "step": 11258 + }, + { + "epoch": 0.5482433715579578, + "grad_norm": 1.707336664199829, + "learning_rate": 1.785129905307099e-05, + "loss": 0.8313, + "step": 11259 + }, + { + "epoch": 0.548292065347065, + "grad_norm": 0.09780818223953247, + "learning_rate": 1.7848162969113687e-05, + "loss": 0.6298, + "step": 11260 + }, + { + "epoch": 0.5483407591361722, + "grad_norm": 2.519378900527954, + "learning_rate": 1.7845026938683522e-05, + "loss": 0.8748, + "step": 11261 + }, + { + "epoch": 0.5483894529252794, + "grad_norm": 1.7829902172088623, + "learning_rate": 1.7841890961858518e-05, + "loss": 0.7463, + "step": 11262 + }, + { + "epoch": 0.5484381467143866, + "grad_norm": 3.02247953414917, + "learning_rate": 1.7838755038716663e-05, + "loss": 0.779, + "step": 11263 + }, + { + "epoch": 0.5484868405034938, + "grad_norm": 1.3153762817382812, + "learning_rate": 1.7835619169335982e-05, + "loss": 0.8288, + "step": 11264 + }, + { + "epoch": 0.5485355342926009, + "grad_norm": 2.8506104946136475, + "learning_rate": 1.7832483353794464e-05, + "loss": 0.8229, + "step": 11265 + }, + { + "epoch": 0.5485842280817081, + "grad_norm": 1.2390167713165283, + "learning_rate": 1.782934759217012e-05, + "loss": 0.8408, + "step": 11266 + }, + { + "epoch": 0.5486329218708154, + "grad_norm": 2.322080612182617, + "learning_rate": 1.7826211884540958e-05, + "loss": 0.8145, + "step": 11267 + }, + { + "epoch": 0.5486816156599226, + "grad_norm": 1.2721531391143799, + "learning_rate": 1.7823076230984966e-05, + "loss": 0.8018, + "step": 11268 + }, + { + "epoch": 0.5487303094490298, + "grad_norm": 1.567777156829834, + "learning_rate": 1.781994063158016e-05, + "loss": 0.8548, + "step": 11269 + }, + { + "epoch": 0.548779003238137, + "grad_norm": 1.5364818572998047, + "learning_rate": 1.7816805086404525e-05, + "loss": 0.8162, + "step": 11270 + }, + { + "epoch": 0.5488276970272442, + "grad_norm": 1.2346258163452148, + "learning_rate": 1.781366959553607e-05, + "loss": 0.8892, + "step": 11271 + }, + { + "epoch": 0.5488763908163514, + "grad_norm": 2.3258800506591797, + "learning_rate": 1.7810534159052772e-05, + "loss": 0.9053, + "step": 11272 + }, + { + "epoch": 0.5489250846054585, + "grad_norm": 1.5546684265136719, + "learning_rate": 1.7807398777032643e-05, + "loss": 0.8629, + "step": 11273 + }, + { + "epoch": 0.5489737783945657, + "grad_norm": 1.8064342737197876, + "learning_rate": 1.7804263449553664e-05, + "loss": 0.8531, + "step": 11274 + }, + { + "epoch": 0.549022472183673, + "grad_norm": 2.3925578594207764, + "learning_rate": 1.7801128176693835e-05, + "loss": 0.7961, + "step": 11275 + }, + { + "epoch": 0.5490711659727802, + "grad_norm": 1.620774507522583, + "learning_rate": 1.7797992958531146e-05, + "loss": 0.8385, + "step": 11276 + }, + { + "epoch": 0.5491198597618874, + "grad_norm": 1.4478501081466675, + "learning_rate": 1.7794857795143577e-05, + "loss": 0.8935, + "step": 11277 + }, + { + "epoch": 0.5491685535509946, + "grad_norm": 2.4075629711151123, + "learning_rate": 1.779172268660913e-05, + "loss": 0.8505, + "step": 11278 + }, + { + "epoch": 0.5492172473401018, + "grad_norm": 2.227261781692505, + "learning_rate": 1.7788587633005772e-05, + "loss": 0.8085, + "step": 11279 + }, + { + "epoch": 0.549265941129209, + "grad_norm": 1.3355027437210083, + "learning_rate": 1.7785452634411506e-05, + "loss": 0.8695, + "step": 11280 + }, + { + "epoch": 0.5493146349183161, + "grad_norm": 2.124272108078003, + "learning_rate": 1.7782317690904305e-05, + "loss": 0.7423, + "step": 11281 + }, + { + "epoch": 0.5493633287074233, + "grad_norm": 1.6584887504577637, + "learning_rate": 1.7779182802562157e-05, + "loss": 0.9288, + "step": 11282 + }, + { + "epoch": 0.5494120224965305, + "grad_norm": 2.6845543384552, + "learning_rate": 1.777604796946303e-05, + "loss": 0.8191, + "step": 11283 + }, + { + "epoch": 0.5494607162856378, + "grad_norm": 1.520372986793518, + "learning_rate": 1.7772913191684924e-05, + "loss": 0.8305, + "step": 11284 + }, + { + "epoch": 0.549509410074745, + "grad_norm": 1.295257568359375, + "learning_rate": 1.7769778469305798e-05, + "loss": 0.883, + "step": 11285 + }, + { + "epoch": 0.5495581038638522, + "grad_norm": 2.016218900680542, + "learning_rate": 1.7766643802403633e-05, + "loss": 0.7882, + "step": 11286 + }, + { + "epoch": 0.5496067976529594, + "grad_norm": 5.082643508911133, + "learning_rate": 1.7763509191056418e-05, + "loss": 0.8547, + "step": 11287 + }, + { + "epoch": 0.5496554914420666, + "grad_norm": 1.6281929016113281, + "learning_rate": 1.776037463534211e-05, + "loss": 0.8524, + "step": 11288 + }, + { + "epoch": 0.5497041852311738, + "grad_norm": 1.8824915885925293, + "learning_rate": 1.7757240135338683e-05, + "loss": 0.7771, + "step": 11289 + }, + { + "epoch": 0.5497528790202809, + "grad_norm": 2.2010655403137207, + "learning_rate": 1.7754105691124116e-05, + "loss": 0.872, + "step": 11290 + }, + { + "epoch": 0.5498015728093881, + "grad_norm": 1.5529202222824097, + "learning_rate": 1.775097130277638e-05, + "loss": 0.8759, + "step": 11291 + }, + { + "epoch": 0.5498502665984953, + "grad_norm": 1.7456883192062378, + "learning_rate": 1.7747836970373428e-05, + "loss": 0.86, + "step": 11292 + }, + { + "epoch": 0.5498989603876026, + "grad_norm": 1.8728629350662231, + "learning_rate": 1.774470269399325e-05, + "loss": 0.7896, + "step": 11293 + }, + { + "epoch": 0.5499476541767098, + "grad_norm": 1.8930103778839111, + "learning_rate": 1.7741568473713783e-05, + "loss": 0.8325, + "step": 11294 + }, + { + "epoch": 0.549996347965817, + "grad_norm": 1.3582652807235718, + "learning_rate": 1.7738434309613008e-05, + "loss": 0.8181, + "step": 11295 + }, + { + "epoch": 0.5500450417549242, + "grad_norm": 1.4226126670837402, + "learning_rate": 1.7735300201768892e-05, + "loss": 0.8056, + "step": 11296 + }, + { + "epoch": 0.5500937355440314, + "grad_norm": 1.7413805723190308, + "learning_rate": 1.7732166150259386e-05, + "loss": 0.7808, + "step": 11297 + }, + { + "epoch": 0.5501424293331385, + "grad_norm": 7.351942539215088, + "learning_rate": 1.772903215516246e-05, + "loss": 0.8313, + "step": 11298 + }, + { + "epoch": 0.5501911231222457, + "grad_norm": 1.6056535243988037, + "learning_rate": 1.772589821655606e-05, + "loss": 0.8359, + "step": 11299 + }, + { + "epoch": 0.5502398169113529, + "grad_norm": 1.919842004776001, + "learning_rate": 1.7722764334518157e-05, + "loss": 0.7589, + "step": 11300 + }, + { + "epoch": 0.5502885107004601, + "grad_norm": 1.5813997983932495, + "learning_rate": 1.7719630509126687e-05, + "loss": 0.8368, + "step": 11301 + }, + { + "epoch": 0.5503372044895674, + "grad_norm": 1.8774868249893188, + "learning_rate": 1.7716496740459626e-05, + "loss": 0.8236, + "step": 11302 + }, + { + "epoch": 0.5503858982786746, + "grad_norm": 1.9626986980438232, + "learning_rate": 1.771336302859491e-05, + "loss": 0.9137, + "step": 11303 + }, + { + "epoch": 0.5504345920677818, + "grad_norm": 3.0170469284057617, + "learning_rate": 1.77102293736105e-05, + "loss": 0.9239, + "step": 11304 + }, + { + "epoch": 0.550483285856889, + "grad_norm": 1.8062278032302856, + "learning_rate": 1.7707095775584344e-05, + "loss": 0.9132, + "step": 11305 + }, + { + "epoch": 0.5505319796459962, + "grad_norm": 1.4370925426483154, + "learning_rate": 1.770396223459439e-05, + "loss": 0.7709, + "step": 11306 + }, + { + "epoch": 0.5505806734351033, + "grad_norm": 1.2830907106399536, + "learning_rate": 1.7700828750718587e-05, + "loss": 0.7588, + "step": 11307 + }, + { + "epoch": 0.5506293672242105, + "grad_norm": 1.557570219039917, + "learning_rate": 1.769769532403487e-05, + "loss": 0.7873, + "step": 11308 + }, + { + "epoch": 0.5506780610133177, + "grad_norm": 2.741532802581787, + "learning_rate": 1.7694561954621204e-05, + "loss": 0.773, + "step": 11309 + }, + { + "epoch": 0.550726754802425, + "grad_norm": 3.254258871078491, + "learning_rate": 1.769142864255551e-05, + "loss": 0.8905, + "step": 11310 + }, + { + "epoch": 0.5507754485915322, + "grad_norm": 1.949944257736206, + "learning_rate": 1.768829538791574e-05, + "loss": 0.7282, + "step": 11311 + }, + { + "epoch": 0.5508241423806394, + "grad_norm": 1.8170146942138672, + "learning_rate": 1.7685162190779834e-05, + "loss": 0.9018, + "step": 11312 + }, + { + "epoch": 0.5508728361697466, + "grad_norm": 1.7132583856582642, + "learning_rate": 1.7682029051225734e-05, + "loss": 0.7336, + "step": 11313 + }, + { + "epoch": 0.5509215299588538, + "grad_norm": 1.3053414821624756, + "learning_rate": 1.7678895969331363e-05, + "loss": 0.7624, + "step": 11314 + }, + { + "epoch": 0.5509702237479609, + "grad_norm": 1.5382375717163086, + "learning_rate": 1.767576294517467e-05, + "loss": 0.8545, + "step": 11315 + }, + { + "epoch": 0.5510189175370681, + "grad_norm": 1.2973169088363647, + "learning_rate": 1.767262997883359e-05, + "loss": 0.8384, + "step": 11316 + }, + { + "epoch": 0.5510676113261753, + "grad_norm": 1.6744881868362427, + "learning_rate": 1.766949707038605e-05, + "loss": 0.8517, + "step": 11317 + }, + { + "epoch": 0.5511163051152825, + "grad_norm": 1.441618800163269, + "learning_rate": 1.7666364219909983e-05, + "loss": 0.9419, + "step": 11318 + }, + { + "epoch": 0.5511649989043897, + "grad_norm": 1.4448018074035645, + "learning_rate": 1.7663231427483314e-05, + "loss": 0.8314, + "step": 11319 + }, + { + "epoch": 0.551213692693497, + "grad_norm": 0.09701560437679291, + "learning_rate": 1.7660098693183986e-05, + "loss": 0.6259, + "step": 11320 + }, + { + "epoch": 0.5512623864826042, + "grad_norm": 1.5231837034225464, + "learning_rate": 1.7656966017089907e-05, + "loss": 0.7816, + "step": 11321 + }, + { + "epoch": 0.5513110802717114, + "grad_norm": 1.5363692045211792, + "learning_rate": 1.7653833399279023e-05, + "loss": 0.7292, + "step": 11322 + }, + { + "epoch": 0.5513597740608186, + "grad_norm": 1.3526029586791992, + "learning_rate": 1.7650700839829237e-05, + "loss": 0.8788, + "step": 11323 + }, + { + "epoch": 0.5514084678499257, + "grad_norm": 1.8256574869155884, + "learning_rate": 1.764756833881848e-05, + "loss": 0.7627, + "step": 11324 + }, + { + "epoch": 0.5514571616390329, + "grad_norm": 1.4242401123046875, + "learning_rate": 1.764443589632468e-05, + "loss": 0.8975, + "step": 11325 + }, + { + "epoch": 0.5515058554281401, + "grad_norm": 2.173215866088867, + "learning_rate": 1.7641303512425746e-05, + "loss": 0.8213, + "step": 11326 + }, + { + "epoch": 0.5515545492172473, + "grad_norm": 6.053020000457764, + "learning_rate": 1.7638171187199615e-05, + "loss": 0.8699, + "step": 11327 + }, + { + "epoch": 0.5516032430063545, + "grad_norm": 1.8028533458709717, + "learning_rate": 1.763503892072418e-05, + "loss": 0.9104, + "step": 11328 + }, + { + "epoch": 0.5516519367954618, + "grad_norm": 1.6663806438446045, + "learning_rate": 1.7631906713077376e-05, + "loss": 0.7964, + "step": 11329 + }, + { + "epoch": 0.551700630584569, + "grad_norm": 7.523215293884277, + "learning_rate": 1.7628774564337102e-05, + "loss": 0.7903, + "step": 11330 + }, + { + "epoch": 0.5517493243736762, + "grad_norm": 1.6754283905029297, + "learning_rate": 1.7625642474581275e-05, + "loss": 0.8449, + "step": 11331 + }, + { + "epoch": 0.5517980181627833, + "grad_norm": 1.3640202283859253, + "learning_rate": 1.762251044388781e-05, + "loss": 0.7806, + "step": 11332 + }, + { + "epoch": 0.5518467119518905, + "grad_norm": 3.0374374389648438, + "learning_rate": 1.7619378472334624e-05, + "loss": 0.9054, + "step": 11333 + }, + { + "epoch": 0.5518954057409977, + "grad_norm": 1.497300624847412, + "learning_rate": 1.7616246559999602e-05, + "loss": 0.8258, + "step": 11334 + }, + { + "epoch": 0.5519440995301049, + "grad_norm": 1.966353416442871, + "learning_rate": 1.7613114706960665e-05, + "loss": 0.8022, + "step": 11335 + }, + { + "epoch": 0.5519927933192121, + "grad_norm": 6.042669773101807, + "learning_rate": 1.7609982913295728e-05, + "loss": 0.8546, + "step": 11336 + }, + { + "epoch": 0.5520414871083194, + "grad_norm": 1.6933996677398682, + "learning_rate": 1.7606851179082675e-05, + "loss": 0.8581, + "step": 11337 + }, + { + "epoch": 0.5520901808974266, + "grad_norm": 1.9850010871887207, + "learning_rate": 1.7603719504399427e-05, + "loss": 0.869, + "step": 11338 + }, + { + "epoch": 0.5521388746865338, + "grad_norm": 0.09038742631673813, + "learning_rate": 1.7600587889323862e-05, + "loss": 0.6103, + "step": 11339 + }, + { + "epoch": 0.5521875684756409, + "grad_norm": 2.049330711364746, + "learning_rate": 1.75974563339339e-05, + "loss": 0.8205, + "step": 11340 + }, + { + "epoch": 0.5522362622647481, + "grad_norm": 1.4002900123596191, + "learning_rate": 1.7594324838307425e-05, + "loss": 0.8766, + "step": 11341 + }, + { + "epoch": 0.5522849560538553, + "grad_norm": 1.5292410850524902, + "learning_rate": 1.7591193402522345e-05, + "loss": 0.8036, + "step": 11342 + }, + { + "epoch": 0.5523336498429625, + "grad_norm": 1.4191266298294067, + "learning_rate": 1.7588062026656545e-05, + "loss": 0.8687, + "step": 11343 + }, + { + "epoch": 0.5523823436320697, + "grad_norm": 1.3846673965454102, + "learning_rate": 1.7584930710787918e-05, + "loss": 0.8453, + "step": 11344 + }, + { + "epoch": 0.5524310374211769, + "grad_norm": 0.09960675984621048, + "learning_rate": 1.758179945499437e-05, + "loss": 0.6697, + "step": 11345 + }, + { + "epoch": 0.5524797312102842, + "grad_norm": 2.0525901317596436, + "learning_rate": 1.7578668259353774e-05, + "loss": 0.7563, + "step": 11346 + }, + { + "epoch": 0.5525284249993914, + "grad_norm": 1.6307604312896729, + "learning_rate": 1.757553712394403e-05, + "loss": 0.8066, + "step": 11347 + }, + { + "epoch": 0.5525771187884986, + "grad_norm": 1.4013618230819702, + "learning_rate": 1.7572406048843018e-05, + "loss": 0.8488, + "step": 11348 + }, + { + "epoch": 0.5526258125776057, + "grad_norm": 1.51724374294281, + "learning_rate": 1.756927503412863e-05, + "loss": 0.9466, + "step": 11349 + }, + { + "epoch": 0.5526745063667129, + "grad_norm": 1.8831167221069336, + "learning_rate": 1.7566144079878742e-05, + "loss": 0.68, + "step": 11350 + }, + { + "epoch": 0.5527232001558201, + "grad_norm": 1.6586363315582275, + "learning_rate": 1.7563013186171252e-05, + "loss": 0.7798, + "step": 11351 + }, + { + "epoch": 0.5527718939449273, + "grad_norm": 1.341090440750122, + "learning_rate": 1.755988235308402e-05, + "loss": 0.9475, + "step": 11352 + }, + { + "epoch": 0.5528205877340345, + "grad_norm": 1.6722002029418945, + "learning_rate": 1.7556751580694943e-05, + "loss": 0.8205, + "step": 11353 + }, + { + "epoch": 0.5528692815231417, + "grad_norm": 1.4766308069229126, + "learning_rate": 1.7553620869081888e-05, + "loss": 0.8637, + "step": 11354 + }, + { + "epoch": 0.552917975312249, + "grad_norm": 10.434549331665039, + "learning_rate": 1.7550490218322735e-05, + "loss": 0.8729, + "step": 11355 + }, + { + "epoch": 0.5529666691013562, + "grad_norm": 1.2994449138641357, + "learning_rate": 1.7547359628495372e-05, + "loss": 0.781, + "step": 11356 + }, + { + "epoch": 0.5530153628904633, + "grad_norm": 1.3318265676498413, + "learning_rate": 1.754422909967765e-05, + "loss": 0.8339, + "step": 11357 + }, + { + "epoch": 0.5530640566795705, + "grad_norm": 1.3091075420379639, + "learning_rate": 1.754109863194747e-05, + "loss": 0.9034, + "step": 11358 + }, + { + "epoch": 0.5531127504686777, + "grad_norm": 1.2018165588378906, + "learning_rate": 1.753796822538267e-05, + "loss": 0.8748, + "step": 11359 + }, + { + "epoch": 0.5531614442577849, + "grad_norm": 1.357970952987671, + "learning_rate": 1.753483788006114e-05, + "loss": 0.8713, + "step": 11360 + }, + { + "epoch": 0.5532101380468921, + "grad_norm": 1.960675835609436, + "learning_rate": 1.7531707596060744e-05, + "loss": 0.7856, + "step": 11361 + }, + { + "epoch": 0.5532588318359993, + "grad_norm": 2.3351833820343018, + "learning_rate": 1.7528577373459344e-05, + "loss": 0.7911, + "step": 11362 + }, + { + "epoch": 0.5533075256251065, + "grad_norm": 2.6425764560699463, + "learning_rate": 1.7525447212334804e-05, + "loss": 0.8305, + "step": 11363 + }, + { + "epoch": 0.5533562194142138, + "grad_norm": 3.5785224437713623, + "learning_rate": 1.752231711276499e-05, + "loss": 0.7745, + "step": 11364 + }, + { + "epoch": 0.553404913203321, + "grad_norm": 1.6453109979629517, + "learning_rate": 1.7519187074827776e-05, + "loss": 0.8147, + "step": 11365 + }, + { + "epoch": 0.5534536069924281, + "grad_norm": 2.3535985946655273, + "learning_rate": 1.7516057098600997e-05, + "loss": 0.7728, + "step": 11366 + }, + { + "epoch": 0.5535023007815353, + "grad_norm": 1.6268937587738037, + "learning_rate": 1.7512927184162537e-05, + "loss": 0.7601, + "step": 11367 + }, + { + "epoch": 0.5535509945706425, + "grad_norm": 1.349510908126831, + "learning_rate": 1.750979733159023e-05, + "loss": 0.8028, + "step": 11368 + }, + { + "epoch": 0.5535996883597497, + "grad_norm": 1.45510733127594, + "learning_rate": 1.7506667540961945e-05, + "loss": 0.7218, + "step": 11369 + }, + { + "epoch": 0.5536483821488569, + "grad_norm": 1.2097077369689941, + "learning_rate": 1.7503537812355524e-05, + "loss": 0.787, + "step": 11370 + }, + { + "epoch": 0.5536970759379641, + "grad_norm": 2.3959381580352783, + "learning_rate": 1.750040814584884e-05, + "loss": 0.8253, + "step": 11371 + }, + { + "epoch": 0.5537457697270713, + "grad_norm": 1.7895350456237793, + "learning_rate": 1.7497278541519723e-05, + "loss": 0.8568, + "step": 11372 + }, + { + "epoch": 0.5537944635161786, + "grad_norm": 1.2349129915237427, + "learning_rate": 1.7494148999446037e-05, + "loss": 0.9291, + "step": 11373 + }, + { + "epoch": 0.5538431573052857, + "grad_norm": 1.7013697624206543, + "learning_rate": 1.7491019519705618e-05, + "loss": 0.7803, + "step": 11374 + }, + { + "epoch": 0.5538918510943929, + "grad_norm": 1.4080371856689453, + "learning_rate": 1.7487890102376314e-05, + "loss": 0.7837, + "step": 11375 + }, + { + "epoch": 0.5539405448835001, + "grad_norm": 1.1868494749069214, + "learning_rate": 1.7484760747535978e-05, + "loss": 0.832, + "step": 11376 + }, + { + "epoch": 0.5539892386726073, + "grad_norm": 1.3122854232788086, + "learning_rate": 1.7481631455262442e-05, + "loss": 0.8844, + "step": 11377 + }, + { + "epoch": 0.5540379324617145, + "grad_norm": 1.3096823692321777, + "learning_rate": 1.7478502225633566e-05, + "loss": 0.8452, + "step": 11378 + }, + { + "epoch": 0.5540866262508217, + "grad_norm": 1.3763456344604492, + "learning_rate": 1.7475373058727164e-05, + "loss": 0.8376, + "step": 11379 + }, + { + "epoch": 0.5541353200399289, + "grad_norm": 1.3728363513946533, + "learning_rate": 1.7472243954621096e-05, + "loss": 0.7768, + "step": 11380 + }, + { + "epoch": 0.5541840138290361, + "grad_norm": 1.8388499021530151, + "learning_rate": 1.7469114913393185e-05, + "loss": 0.8027, + "step": 11381 + }, + { + "epoch": 0.5542327076181434, + "grad_norm": 1.38711416721344, + "learning_rate": 1.746598593512127e-05, + "loss": 0.8116, + "step": 11382 + }, + { + "epoch": 0.5542814014072505, + "grad_norm": 1.4148573875427246, + "learning_rate": 1.7462857019883188e-05, + "loss": 0.8214, + "step": 11383 + }, + { + "epoch": 0.5543300951963577, + "grad_norm": 2.2810418605804443, + "learning_rate": 1.7459728167756767e-05, + "loss": 0.8902, + "step": 11384 + }, + { + "epoch": 0.5543787889854649, + "grad_norm": 0.0950545221567154, + "learning_rate": 1.7456599378819848e-05, + "loss": 0.6461, + "step": 11385 + }, + { + "epoch": 0.5544274827745721, + "grad_norm": 1.4413400888442993, + "learning_rate": 1.745347065315024e-05, + "loss": 0.8714, + "step": 11386 + }, + { + "epoch": 0.5544761765636793, + "grad_norm": 1.7681657075881958, + "learning_rate": 1.7450341990825796e-05, + "loss": 0.9112, + "step": 11387 + }, + { + "epoch": 0.5545248703527865, + "grad_norm": 1.738194227218628, + "learning_rate": 1.7447213391924313e-05, + "loss": 0.758, + "step": 11388 + }, + { + "epoch": 0.5545735641418937, + "grad_norm": 2.2754735946655273, + "learning_rate": 1.7444084856523634e-05, + "loss": 0.8743, + "step": 11389 + }, + { + "epoch": 0.554622257931001, + "grad_norm": 1.3029022216796875, + "learning_rate": 1.744095638470158e-05, + "loss": 0.7752, + "step": 11390 + }, + { + "epoch": 0.554670951720108, + "grad_norm": 1.5589083433151245, + "learning_rate": 1.7437827976535972e-05, + "loss": 0.8632, + "step": 11391 + }, + { + "epoch": 0.5547196455092153, + "grad_norm": 1.129058837890625, + "learning_rate": 1.7434699632104617e-05, + "loss": 0.8471, + "step": 11392 + }, + { + "epoch": 0.5547683392983225, + "grad_norm": 1.563075065612793, + "learning_rate": 1.7431571351485342e-05, + "loss": 0.8623, + "step": 11393 + }, + { + "epoch": 0.5548170330874297, + "grad_norm": 1.5295941829681396, + "learning_rate": 1.7428443134755978e-05, + "loss": 0.7806, + "step": 11394 + }, + { + "epoch": 0.5548657268765369, + "grad_norm": 1.2126946449279785, + "learning_rate": 1.7425314981994314e-05, + "loss": 0.8253, + "step": 11395 + }, + { + "epoch": 0.5549144206656441, + "grad_norm": 1.2437515258789062, + "learning_rate": 1.742218689327818e-05, + "loss": 0.8832, + "step": 11396 + }, + { + "epoch": 0.5549631144547513, + "grad_norm": 1.4537005424499512, + "learning_rate": 1.741905886868538e-05, + "loss": 0.8247, + "step": 11397 + }, + { + "epoch": 0.5550118082438585, + "grad_norm": 1.4634594917297363, + "learning_rate": 1.7415930908293727e-05, + "loss": 0.8522, + "step": 11398 + }, + { + "epoch": 0.5550605020329656, + "grad_norm": 2.020716667175293, + "learning_rate": 1.741280301218102e-05, + "loss": 0.8025, + "step": 11399 + }, + { + "epoch": 0.5551091958220729, + "grad_norm": 1.6217643022537231, + "learning_rate": 1.7409675180425087e-05, + "loss": 0.7393, + "step": 11400 + }, + { + "epoch": 0.5551578896111801, + "grad_norm": 1.6189842224121094, + "learning_rate": 1.740654741310371e-05, + "loss": 0.8202, + "step": 11401 + }, + { + "epoch": 0.5552065834002873, + "grad_norm": 1.2776867151260376, + "learning_rate": 1.7403419710294713e-05, + "loss": 0.8501, + "step": 11402 + }, + { + "epoch": 0.5552552771893945, + "grad_norm": 1.6513086557388306, + "learning_rate": 1.7400292072075878e-05, + "loss": 0.7077, + "step": 11403 + }, + { + "epoch": 0.5553039709785017, + "grad_norm": 1.0559829473495483, + "learning_rate": 1.7397164498525014e-05, + "loss": 0.8349, + "step": 11404 + }, + { + "epoch": 0.5553526647676089, + "grad_norm": 1.3925715684890747, + "learning_rate": 1.7394036989719928e-05, + "loss": 0.8458, + "step": 11405 + }, + { + "epoch": 0.5554013585567161, + "grad_norm": 1.8193843364715576, + "learning_rate": 1.73909095457384e-05, + "loss": 0.7974, + "step": 11406 + }, + { + "epoch": 0.5554500523458233, + "grad_norm": 1.2315397262573242, + "learning_rate": 1.7387782166658246e-05, + "loss": 0.875, + "step": 11407 + }, + { + "epoch": 0.5554987461349304, + "grad_norm": 3.6781256198883057, + "learning_rate": 1.738465485255724e-05, + "loss": 0.8148, + "step": 11408 + }, + { + "epoch": 0.5555474399240377, + "grad_norm": 1.966368317604065, + "learning_rate": 1.7381527603513194e-05, + "loss": 0.8225, + "step": 11409 + }, + { + "epoch": 0.5555961337131449, + "grad_norm": 1.3879225254058838, + "learning_rate": 1.7378400419603872e-05, + "loss": 0.8523, + "step": 11410 + }, + { + "epoch": 0.5556448275022521, + "grad_norm": 1.3779304027557373, + "learning_rate": 1.7375273300907088e-05, + "loss": 0.824, + "step": 11411 + }, + { + "epoch": 0.5556935212913593, + "grad_norm": 1.3402646780014038, + "learning_rate": 1.7372146247500617e-05, + "loss": 0.7843, + "step": 11412 + }, + { + "epoch": 0.5557422150804665, + "grad_norm": 1.4503203630447388, + "learning_rate": 1.736901925946225e-05, + "loss": 0.8596, + "step": 11413 + }, + { + "epoch": 0.5557909088695737, + "grad_norm": 1.1601827144622803, + "learning_rate": 1.736589233686977e-05, + "loss": 0.8703, + "step": 11414 + }, + { + "epoch": 0.5558396026586809, + "grad_norm": 1.646611213684082, + "learning_rate": 1.7362765479800952e-05, + "loss": 0.7919, + "step": 11415 + }, + { + "epoch": 0.555888296447788, + "grad_norm": 0.09003371745347977, + "learning_rate": 1.7359638688333592e-05, + "loss": 0.6176, + "step": 11416 + }, + { + "epoch": 0.5559369902368952, + "grad_norm": 0.09040183573961258, + "learning_rate": 1.7356511962545455e-05, + "loss": 0.5939, + "step": 11417 + }, + { + "epoch": 0.5559856840260025, + "grad_norm": 2.113260269165039, + "learning_rate": 1.735338530251433e-05, + "loss": 0.7864, + "step": 11418 + }, + { + "epoch": 0.5560343778151097, + "grad_norm": 1.281728744506836, + "learning_rate": 1.7350258708317977e-05, + "loss": 0.8658, + "step": 11419 + }, + { + "epoch": 0.5560830716042169, + "grad_norm": 1.2556688785552979, + "learning_rate": 1.7347132180034186e-05, + "loss": 0.8295, + "step": 11420 + }, + { + "epoch": 0.5561317653933241, + "grad_norm": 1.285271406173706, + "learning_rate": 1.734400571774072e-05, + "loss": 0.8567, + "step": 11421 + }, + { + "epoch": 0.5561804591824313, + "grad_norm": 0.09077849239110947, + "learning_rate": 1.734087932151536e-05, + "loss": 0.6624, + "step": 11422 + }, + { + "epoch": 0.5562291529715385, + "grad_norm": 1.5570303201675415, + "learning_rate": 1.7337752991435867e-05, + "loss": 0.8955, + "step": 11423 + }, + { + "epoch": 0.5562778467606457, + "grad_norm": 1.9498486518859863, + "learning_rate": 1.7334626727580003e-05, + "loss": 0.8499, + "step": 11424 + }, + { + "epoch": 0.5563265405497528, + "grad_norm": 1.3135666847229004, + "learning_rate": 1.7331500530025556e-05, + "loss": 0.7334, + "step": 11425 + }, + { + "epoch": 0.55637523433886, + "grad_norm": 1.5572205781936646, + "learning_rate": 1.7328374398850268e-05, + "loss": 0.7753, + "step": 11426 + }, + { + "epoch": 0.5564239281279673, + "grad_norm": 1.4114359617233276, + "learning_rate": 1.7325248334131912e-05, + "loss": 0.8138, + "step": 11427 + }, + { + "epoch": 0.5564726219170745, + "grad_norm": 1.5892990827560425, + "learning_rate": 1.7322122335948244e-05, + "loss": 0.734, + "step": 11428 + }, + { + "epoch": 0.5565213157061817, + "grad_norm": 1.8396445512771606, + "learning_rate": 1.7318996404377036e-05, + "loss": 0.867, + "step": 11429 + }, + { + "epoch": 0.5565700094952889, + "grad_norm": 1.7270400524139404, + "learning_rate": 1.731587053949603e-05, + "loss": 0.9051, + "step": 11430 + }, + { + "epoch": 0.5566187032843961, + "grad_norm": 1.4220490455627441, + "learning_rate": 1.7312744741382993e-05, + "loss": 0.9204, + "step": 11431 + }, + { + "epoch": 0.5566673970735033, + "grad_norm": 1.4631870985031128, + "learning_rate": 1.7309619010115674e-05, + "loss": 0.6871, + "step": 11432 + }, + { + "epoch": 0.5567160908626104, + "grad_norm": 1.5361485481262207, + "learning_rate": 1.730649334577182e-05, + "loss": 0.809, + "step": 11433 + }, + { + "epoch": 0.5567647846517176, + "grad_norm": 1.3975725173950195, + "learning_rate": 1.7303367748429202e-05, + "loss": 0.8837, + "step": 11434 + }, + { + "epoch": 0.5568134784408248, + "grad_norm": 1.1682723760604858, + "learning_rate": 1.7300242218165545e-05, + "loss": 0.8729, + "step": 11435 + }, + { + "epoch": 0.5568621722299321, + "grad_norm": 1.4335788488388062, + "learning_rate": 1.729711675505862e-05, + "loss": 0.8604, + "step": 11436 + }, + { + "epoch": 0.5569108660190393, + "grad_norm": 1.3306844234466553, + "learning_rate": 1.7293991359186154e-05, + "loss": 0.7876, + "step": 11437 + }, + { + "epoch": 0.5569595598081465, + "grad_norm": 3.5695924758911133, + "learning_rate": 1.7290866030625914e-05, + "loss": 0.9491, + "step": 11438 + }, + { + "epoch": 0.5570082535972537, + "grad_norm": 1.929015040397644, + "learning_rate": 1.7287740769455612e-05, + "loss": 0.844, + "step": 11439 + }, + { + "epoch": 0.5570569473863609, + "grad_norm": 1.433064341545105, + "learning_rate": 1.7284615575753012e-05, + "loss": 0.8697, + "step": 11440 + }, + { + "epoch": 0.557105641175468, + "grad_norm": 1.996031403541565, + "learning_rate": 1.7281490449595842e-05, + "loss": 0.8417, + "step": 11441 + }, + { + "epoch": 0.5571543349645752, + "grad_norm": 1.4901224374771118, + "learning_rate": 1.727836539106186e-05, + "loss": 0.8253, + "step": 11442 + }, + { + "epoch": 0.5572030287536824, + "grad_norm": 1.7499386072158813, + "learning_rate": 1.727524040022877e-05, + "loss": 0.7816, + "step": 11443 + }, + { + "epoch": 0.5572517225427897, + "grad_norm": 1.4372299909591675, + "learning_rate": 1.7272115477174328e-05, + "loss": 0.8416, + "step": 11444 + }, + { + "epoch": 0.5573004163318969, + "grad_norm": 1.313410758972168, + "learning_rate": 1.726899062197627e-05, + "loss": 0.7998, + "step": 11445 + }, + { + "epoch": 0.5573491101210041, + "grad_norm": 1.6453546285629272, + "learning_rate": 1.726586583471231e-05, + "loss": 0.7446, + "step": 11446 + }, + { + "epoch": 0.5573978039101113, + "grad_norm": 1.8108407258987427, + "learning_rate": 1.7262741115460195e-05, + "loss": 0.8489, + "step": 11447 + }, + { + "epoch": 0.5574464976992185, + "grad_norm": 2.425584316253662, + "learning_rate": 1.7259616464297634e-05, + "loss": 0.8823, + "step": 11448 + }, + { + "epoch": 0.5574951914883257, + "grad_norm": 1.6213467121124268, + "learning_rate": 1.7256491881302374e-05, + "loss": 0.7709, + "step": 11449 + }, + { + "epoch": 0.5575438852774328, + "grad_norm": 1.7829527854919434, + "learning_rate": 1.725336736655212e-05, + "loss": 0.834, + "step": 11450 + }, + { + "epoch": 0.55759257906654, + "grad_norm": 1.4024066925048828, + "learning_rate": 1.725024292012461e-05, + "loss": 0.7745, + "step": 11451 + }, + { + "epoch": 0.5576412728556472, + "grad_norm": 0.0941503718495369, + "learning_rate": 1.724711854209755e-05, + "loss": 0.557, + "step": 11452 + }, + { + "epoch": 0.5576899666447545, + "grad_norm": 1.38213050365448, + "learning_rate": 1.724399423254867e-05, + "loss": 0.8733, + "step": 11453 + }, + { + "epoch": 0.5577386604338617, + "grad_norm": 1.3170208930969238, + "learning_rate": 1.7240869991555692e-05, + "loss": 0.9167, + "step": 11454 + }, + { + "epoch": 0.5577873542229689, + "grad_norm": 1.5378341674804688, + "learning_rate": 1.7237745819196316e-05, + "loss": 0.8232, + "step": 11455 + }, + { + "epoch": 0.5578360480120761, + "grad_norm": 1.5564638376235962, + "learning_rate": 1.723462171554827e-05, + "loss": 0.9166, + "step": 11456 + }, + { + "epoch": 0.5578847418011833, + "grad_norm": 1.097615361213684, + "learning_rate": 1.7231497680689254e-05, + "loss": 0.7598, + "step": 11457 + }, + { + "epoch": 0.5579334355902904, + "grad_norm": 1.5702396631240845, + "learning_rate": 1.7228373714696997e-05, + "loss": 0.7149, + "step": 11458 + }, + { + "epoch": 0.5579821293793976, + "grad_norm": 2.714134693145752, + "learning_rate": 1.7225249817649188e-05, + "loss": 0.8718, + "step": 11459 + }, + { + "epoch": 0.5580308231685048, + "grad_norm": 1.4292129278182983, + "learning_rate": 1.722212598962355e-05, + "loss": 0.8521, + "step": 11460 + }, + { + "epoch": 0.558079516957612, + "grad_norm": 1.460591197013855, + "learning_rate": 1.7219002230697773e-05, + "loss": 0.8685, + "step": 11461 + }, + { + "epoch": 0.5581282107467193, + "grad_norm": 1.444893479347229, + "learning_rate": 1.7215878540949574e-05, + "loss": 0.8719, + "step": 11462 + }, + { + "epoch": 0.5581769045358265, + "grad_norm": 1.6380324363708496, + "learning_rate": 1.7212754920456643e-05, + "loss": 0.8145, + "step": 11463 + }, + { + "epoch": 0.5582255983249337, + "grad_norm": 1.9543322324752808, + "learning_rate": 1.720963136929669e-05, + "loss": 0.83, + "step": 11464 + }, + { + "epoch": 0.5582742921140409, + "grad_norm": 1.5949046611785889, + "learning_rate": 1.720650788754742e-05, + "loss": 0.7933, + "step": 11465 + }, + { + "epoch": 0.5583229859031481, + "grad_norm": 1.2836486101150513, + "learning_rate": 1.7203384475286512e-05, + "loss": 0.8076, + "step": 11466 + }, + { + "epoch": 0.5583716796922552, + "grad_norm": 1.4734420776367188, + "learning_rate": 1.7200261132591677e-05, + "loss": 0.916, + "step": 11467 + }, + { + "epoch": 0.5584203734813624, + "grad_norm": 1.4442709684371948, + "learning_rate": 1.7197137859540594e-05, + "loss": 0.8772, + "step": 11468 + }, + { + "epoch": 0.5584690672704696, + "grad_norm": 1.6998330354690552, + "learning_rate": 1.719401465621097e-05, + "loss": 0.8405, + "step": 11469 + }, + { + "epoch": 0.5585177610595768, + "grad_norm": 1.9839752912521362, + "learning_rate": 1.719089152268048e-05, + "loss": 0.8546, + "step": 11470 + }, + { + "epoch": 0.5585664548486841, + "grad_norm": 1.5340181589126587, + "learning_rate": 1.7187768459026826e-05, + "loss": 0.7856, + "step": 11471 + }, + { + "epoch": 0.5586151486377913, + "grad_norm": 1.3654183149337769, + "learning_rate": 1.718464546532768e-05, + "loss": 0.7939, + "step": 11472 + }, + { + "epoch": 0.5586638424268985, + "grad_norm": 1.6432663202285767, + "learning_rate": 1.718152254166074e-05, + "loss": 0.9108, + "step": 11473 + }, + { + "epoch": 0.5587125362160057, + "grad_norm": 1.5132757425308228, + "learning_rate": 1.7178399688103686e-05, + "loss": 0.8293, + "step": 11474 + }, + { + "epoch": 0.5587612300051128, + "grad_norm": 1.9097312688827515, + "learning_rate": 1.717527690473419e-05, + "loss": 0.7484, + "step": 11475 + }, + { + "epoch": 0.55880992379422, + "grad_norm": 1.3699854612350464, + "learning_rate": 1.717215419162995e-05, + "loss": 0.8914, + "step": 11476 + }, + { + "epoch": 0.5588586175833272, + "grad_norm": 1.4931422472000122, + "learning_rate": 1.716903154886862e-05, + "loss": 0.8776, + "step": 11477 + }, + { + "epoch": 0.5589073113724344, + "grad_norm": 1.2549508810043335, + "learning_rate": 1.7165908976527896e-05, + "loss": 0.8275, + "step": 11478 + }, + { + "epoch": 0.5589560051615416, + "grad_norm": 1.7249661684036255, + "learning_rate": 1.716278647468544e-05, + "loss": 0.8938, + "step": 11479 + }, + { + "epoch": 0.5590046989506489, + "grad_norm": 1.3256638050079346, + "learning_rate": 1.7159664043418935e-05, + "loss": 0.7936, + "step": 11480 + }, + { + "epoch": 0.5590533927397561, + "grad_norm": 0.0953187644481659, + "learning_rate": 1.715654168280604e-05, + "loss": 0.6479, + "step": 11481 + }, + { + "epoch": 0.5591020865288633, + "grad_norm": 2.424954652786255, + "learning_rate": 1.7153419392924428e-05, + "loss": 0.8586, + "step": 11482 + }, + { + "epoch": 0.5591507803179705, + "grad_norm": 1.3175313472747803, + "learning_rate": 1.7150297173851777e-05, + "loss": 0.7024, + "step": 11483 + }, + { + "epoch": 0.5591994741070776, + "grad_norm": 1.4560694694519043, + "learning_rate": 1.7147175025665738e-05, + "loss": 0.8159, + "step": 11484 + }, + { + "epoch": 0.5592481678961848, + "grad_norm": 1.9023391008377075, + "learning_rate": 1.714405294844398e-05, + "loss": 0.808, + "step": 11485 + }, + { + "epoch": 0.559296861685292, + "grad_norm": 1.327272891998291, + "learning_rate": 1.7140930942264163e-05, + "loss": 0.9208, + "step": 11486 + }, + { + "epoch": 0.5593455554743992, + "grad_norm": 1.2933921813964844, + "learning_rate": 1.713780900720396e-05, + "loss": 0.9115, + "step": 11487 + }, + { + "epoch": 0.5593942492635064, + "grad_norm": 1.231693983078003, + "learning_rate": 1.7134687143341005e-05, + "loss": 0.903, + "step": 11488 + }, + { + "epoch": 0.5594429430526137, + "grad_norm": 1.4591941833496094, + "learning_rate": 1.713156535075298e-05, + "loss": 0.8158, + "step": 11489 + }, + { + "epoch": 0.5594916368417209, + "grad_norm": 2.2055795192718506, + "learning_rate": 1.7128443629517524e-05, + "loss": 0.8271, + "step": 11490 + }, + { + "epoch": 0.5595403306308281, + "grad_norm": 1.576546311378479, + "learning_rate": 1.7125321979712294e-05, + "loss": 0.8348, + "step": 11491 + }, + { + "epoch": 0.5595890244199352, + "grad_norm": 1.2650514841079712, + "learning_rate": 1.7122200401414938e-05, + "loss": 0.7111, + "step": 11492 + }, + { + "epoch": 0.5596377182090424, + "grad_norm": 1.1557544469833374, + "learning_rate": 1.711907889470311e-05, + "loss": 0.8456, + "step": 11493 + }, + { + "epoch": 0.5596864119981496, + "grad_norm": 1.254861831665039, + "learning_rate": 1.711595745965447e-05, + "loss": 0.77, + "step": 11494 + }, + { + "epoch": 0.5597351057872568, + "grad_norm": 1.6016110181808472, + "learning_rate": 1.7112836096346635e-05, + "loss": 0.8051, + "step": 11495 + }, + { + "epoch": 0.559783799576364, + "grad_norm": 1.6002277135849, + "learning_rate": 1.7109714804857278e-05, + "loss": 0.8001, + "step": 11496 + }, + { + "epoch": 0.5598324933654713, + "grad_norm": 1.2431241273880005, + "learning_rate": 1.710659358526402e-05, + "loss": 0.8114, + "step": 11497 + }, + { + "epoch": 0.5598811871545785, + "grad_norm": 1.4369921684265137, + "learning_rate": 1.7103472437644516e-05, + "loss": 0.7173, + "step": 11498 + }, + { + "epoch": 0.5599298809436857, + "grad_norm": 2.4050753116607666, + "learning_rate": 1.7100351362076396e-05, + "loss": 0.7549, + "step": 11499 + }, + { + "epoch": 0.5599785747327928, + "grad_norm": 2.5764002799987793, + "learning_rate": 1.7097230358637302e-05, + "loss": 0.7979, + "step": 11500 + }, + { + "epoch": 0.5600272685219, + "grad_norm": 1.695366621017456, + "learning_rate": 1.7094109427404864e-05, + "loss": 0.8422, + "step": 11501 + }, + { + "epoch": 0.5600759623110072, + "grad_norm": 1.6403735876083374, + "learning_rate": 1.7090988568456716e-05, + "loss": 0.8488, + "step": 11502 + }, + { + "epoch": 0.5601246561001144, + "grad_norm": 1.3613141775131226, + "learning_rate": 1.7087867781870505e-05, + "loss": 0.9155, + "step": 11503 + }, + { + "epoch": 0.5601733498892216, + "grad_norm": 1.1373826265335083, + "learning_rate": 1.7084747067723836e-05, + "loss": 0.7704, + "step": 11504 + }, + { + "epoch": 0.5602220436783288, + "grad_norm": 1.389530062675476, + "learning_rate": 1.708162642609436e-05, + "loss": 0.7694, + "step": 11505 + }, + { + "epoch": 0.560270737467436, + "grad_norm": 1.5250930786132812, + "learning_rate": 1.7078505857059686e-05, + "loss": 0.771, + "step": 11506 + }, + { + "epoch": 0.5603194312565433, + "grad_norm": 1.575729250907898, + "learning_rate": 1.707538536069745e-05, + "loss": 0.8328, + "step": 11507 + }, + { + "epoch": 0.5603681250456505, + "grad_norm": 1.3304271697998047, + "learning_rate": 1.7072264937085263e-05, + "loss": 0.8358, + "step": 11508 + }, + { + "epoch": 0.5604168188347576, + "grad_norm": 1.5260354280471802, + "learning_rate": 1.7069144586300762e-05, + "loss": 0.7619, + "step": 11509 + }, + { + "epoch": 0.5604655126238648, + "grad_norm": 1.4992451667785645, + "learning_rate": 1.7066024308421547e-05, + "loss": 0.8484, + "step": 11510 + }, + { + "epoch": 0.560514206412972, + "grad_norm": 1.5073192119598389, + "learning_rate": 1.7062904103525254e-05, + "loss": 0.767, + "step": 11511 + }, + { + "epoch": 0.5605629002020792, + "grad_norm": 1.6026904582977295, + "learning_rate": 1.7059783971689477e-05, + "loss": 0.7506, + "step": 11512 + }, + { + "epoch": 0.5606115939911864, + "grad_norm": 1.4428259134292603, + "learning_rate": 1.7056663912991848e-05, + "loss": 0.9068, + "step": 11513 + }, + { + "epoch": 0.5606602877802936, + "grad_norm": 1.8104676008224487, + "learning_rate": 1.7053543927509974e-05, + "loss": 0.7862, + "step": 11514 + }, + { + "epoch": 0.5607089815694009, + "grad_norm": 1.2332215309143066, + "learning_rate": 1.7050424015321458e-05, + "loss": 0.8572, + "step": 11515 + }, + { + "epoch": 0.5607576753585081, + "grad_norm": 1.4241206645965576, + "learning_rate": 1.7047304176503925e-05, + "loss": 0.749, + "step": 11516 + }, + { + "epoch": 0.5608063691476152, + "grad_norm": 1.3101696968078613, + "learning_rate": 1.7044184411134957e-05, + "loss": 0.8891, + "step": 11517 + }, + { + "epoch": 0.5608550629367224, + "grad_norm": 1.1559709310531616, + "learning_rate": 1.7041064719292183e-05, + "loss": 0.7627, + "step": 11518 + }, + { + "epoch": 0.5609037567258296, + "grad_norm": 1.873774528503418, + "learning_rate": 1.7037945101053187e-05, + "loss": 0.7911, + "step": 11519 + }, + { + "epoch": 0.5609524505149368, + "grad_norm": 1.4289606809616089, + "learning_rate": 1.7034825556495578e-05, + "loss": 0.7859, + "step": 11520 + }, + { + "epoch": 0.561001144304044, + "grad_norm": 1.4205368757247925, + "learning_rate": 1.703170608569695e-05, + "loss": 0.8648, + "step": 11521 + }, + { + "epoch": 0.5610498380931512, + "grad_norm": 1.206402063369751, + "learning_rate": 1.702858668873491e-05, + "loss": 0.8977, + "step": 11522 + }, + { + "epoch": 0.5610985318822584, + "grad_norm": 1.759165644645691, + "learning_rate": 1.7025467365687042e-05, + "loss": 0.8705, + "step": 11523 + }, + { + "epoch": 0.5611472256713657, + "grad_norm": 1.1434799432754517, + "learning_rate": 1.7022348116630944e-05, + "loss": 0.8051, + "step": 11524 + }, + { + "epoch": 0.5611959194604729, + "grad_norm": 0.09145248681306839, + "learning_rate": 1.7019228941644218e-05, + "loss": 0.5552, + "step": 11525 + }, + { + "epoch": 0.56124461324958, + "grad_norm": 1.4702945947647095, + "learning_rate": 1.7016109840804432e-05, + "loss": 0.7875, + "step": 11526 + }, + { + "epoch": 0.5612933070386872, + "grad_norm": 1.8144879341125488, + "learning_rate": 1.701299081418919e-05, + "loss": 0.7138, + "step": 11527 + }, + { + "epoch": 0.5613420008277944, + "grad_norm": 1.3682211637496948, + "learning_rate": 1.7009871861876077e-05, + "loss": 0.8322, + "step": 11528 + }, + { + "epoch": 0.5613906946169016, + "grad_norm": 1.7215157747268677, + "learning_rate": 1.7006752983942674e-05, + "loss": 0.8135, + "step": 11529 + }, + { + "epoch": 0.5614393884060088, + "grad_norm": 8.799454689025879, + "learning_rate": 1.7003634180466557e-05, + "loss": 0.8079, + "step": 11530 + }, + { + "epoch": 0.561488082195116, + "grad_norm": 1.3043208122253418, + "learning_rate": 1.7000515451525324e-05, + "loss": 0.8866, + "step": 11531 + }, + { + "epoch": 0.5615367759842232, + "grad_norm": 1.529564619064331, + "learning_rate": 1.699739679719653e-05, + "loss": 0.8331, + "step": 11532 + }, + { + "epoch": 0.5615854697733305, + "grad_norm": 0.110714852809906, + "learning_rate": 1.6994278217557767e-05, + "loss": 0.6283, + "step": 11533 + }, + { + "epoch": 0.5616341635624376, + "grad_norm": 1.1750727891921997, + "learning_rate": 1.6991159712686616e-05, + "loss": 0.8133, + "step": 11534 + }, + { + "epoch": 0.5616828573515448, + "grad_norm": 1.5283607244491577, + "learning_rate": 1.6988041282660633e-05, + "loss": 0.8896, + "step": 11535 + }, + { + "epoch": 0.561731551140652, + "grad_norm": 1.0856289863586426, + "learning_rate": 1.6984922927557402e-05, + "loss": 0.7821, + "step": 11536 + }, + { + "epoch": 0.5617802449297592, + "grad_norm": 1.5558139085769653, + "learning_rate": 1.698180464745448e-05, + "loss": 0.8343, + "step": 11537 + }, + { + "epoch": 0.5618289387188664, + "grad_norm": 1.7109099626541138, + "learning_rate": 1.6978686442429455e-05, + "loss": 0.8509, + "step": 11538 + }, + { + "epoch": 0.5618776325079736, + "grad_norm": 1.3059977293014526, + "learning_rate": 1.697556831255987e-05, + "loss": 0.7744, + "step": 11539 + }, + { + "epoch": 0.5619263262970808, + "grad_norm": 2.503718376159668, + "learning_rate": 1.697245025792331e-05, + "loss": 0.8198, + "step": 11540 + }, + { + "epoch": 0.561975020086188, + "grad_norm": 1.3002257347106934, + "learning_rate": 1.6969332278597316e-05, + "loss": 0.9805, + "step": 11541 + }, + { + "epoch": 0.5620237138752953, + "grad_norm": 2.4899415969848633, + "learning_rate": 1.696621437465946e-05, + "loss": 0.8585, + "step": 11542 + }, + { + "epoch": 0.5620724076644024, + "grad_norm": 2.2537384033203125, + "learning_rate": 1.6963096546187298e-05, + "loss": 0.9143, + "step": 11543 + }, + { + "epoch": 0.5621211014535096, + "grad_norm": 1.2403441667556763, + "learning_rate": 1.6959978793258386e-05, + "loss": 0.7401, + "step": 11544 + }, + { + "epoch": 0.5621697952426168, + "grad_norm": 0.09707986563444138, + "learning_rate": 1.6956861115950283e-05, + "loss": 0.6278, + "step": 11545 + }, + { + "epoch": 0.562218489031724, + "grad_norm": 2.373339891433716, + "learning_rate": 1.695374351434053e-05, + "loss": 0.806, + "step": 11546 + }, + { + "epoch": 0.5622671828208312, + "grad_norm": 1.3501007556915283, + "learning_rate": 1.6950625988506696e-05, + "loss": 0.9062, + "step": 11547 + }, + { + "epoch": 0.5623158766099384, + "grad_norm": 2.1049587726593018, + "learning_rate": 1.694750853852631e-05, + "loss": 0.7604, + "step": 11548 + }, + { + "epoch": 0.5623645703990456, + "grad_norm": 0.09481221437454224, + "learning_rate": 1.694439116447693e-05, + "loss": 0.6127, + "step": 11549 + }, + { + "epoch": 0.5624132641881529, + "grad_norm": 1.1418906450271606, + "learning_rate": 1.6941273866436093e-05, + "loss": 0.8304, + "step": 11550 + }, + { + "epoch": 0.56246195797726, + "grad_norm": 1.2063572406768799, + "learning_rate": 1.693815664448136e-05, + "loss": 0.7588, + "step": 11551 + }, + { + "epoch": 0.5625106517663672, + "grad_norm": 1.600392460823059, + "learning_rate": 1.6935039498690248e-05, + "loss": 0.9223, + "step": 11552 + }, + { + "epoch": 0.5625593455554744, + "grad_norm": 1.401814341545105, + "learning_rate": 1.6931922429140305e-05, + "loss": 0.7928, + "step": 11553 + }, + { + "epoch": 0.5626080393445816, + "grad_norm": 1.2708381414413452, + "learning_rate": 1.6928805435909084e-05, + "loss": 0.8783, + "step": 11554 + }, + { + "epoch": 0.5626567331336888, + "grad_norm": 1.2341232299804688, + "learning_rate": 1.69256885190741e-05, + "loss": 0.8331, + "step": 11555 + }, + { + "epoch": 0.562705426922796, + "grad_norm": 1.602657437324524, + "learning_rate": 1.69225716787129e-05, + "loss": 0.8176, + "step": 11556 + }, + { + "epoch": 0.5627541207119032, + "grad_norm": 1.3126003742218018, + "learning_rate": 1.6919454914903e-05, + "loss": 0.8153, + "step": 11557 + }, + { + "epoch": 0.5628028145010104, + "grad_norm": 1.304146409034729, + "learning_rate": 1.6916338227721948e-05, + "loss": 0.9403, + "step": 11558 + }, + { + "epoch": 0.5628515082901175, + "grad_norm": 2.704366683959961, + "learning_rate": 1.691322161724726e-05, + "loss": 0.8333, + "step": 11559 + }, + { + "epoch": 0.5629002020792248, + "grad_norm": 1.3873193264007568, + "learning_rate": 1.6910105083556473e-05, + "loss": 0.8655, + "step": 11560 + }, + { + "epoch": 0.562948895868332, + "grad_norm": 1.2641632556915283, + "learning_rate": 1.6906988626727094e-05, + "loss": 0.8372, + "step": 11561 + }, + { + "epoch": 0.5629975896574392, + "grad_norm": 1.4823806285858154, + "learning_rate": 1.6903872246836655e-05, + "loss": 0.819, + "step": 11562 + }, + { + "epoch": 0.5630462834465464, + "grad_norm": 0.10644130408763885, + "learning_rate": 1.6900755943962688e-05, + "loss": 0.6243, + "step": 11563 + }, + { + "epoch": 0.5630949772356536, + "grad_norm": 1.472641944885254, + "learning_rate": 1.6897639718182687e-05, + "loss": 0.827, + "step": 11564 + }, + { + "epoch": 0.5631436710247608, + "grad_norm": 0.09964686632156372, + "learning_rate": 1.689452356957419e-05, + "loss": 0.6395, + "step": 11565 + }, + { + "epoch": 0.563192364813868, + "grad_norm": 1.6600606441497803, + "learning_rate": 1.689140749821469e-05, + "loss": 0.8135, + "step": 11566 + }, + { + "epoch": 0.5632410586029752, + "grad_norm": 1.5759518146514893, + "learning_rate": 1.688829150418173e-05, + "loss": 0.7436, + "step": 11567 + }, + { + "epoch": 0.5632897523920823, + "grad_norm": 1.2781554460525513, + "learning_rate": 1.6885175587552787e-05, + "loss": 0.8348, + "step": 11568 + }, + { + "epoch": 0.5633384461811896, + "grad_norm": 0.095990389585495, + "learning_rate": 1.6882059748405397e-05, + "loss": 0.6145, + "step": 11569 + }, + { + "epoch": 0.5633871399702968, + "grad_norm": 1.3937904834747314, + "learning_rate": 1.6878943986817046e-05, + "loss": 0.8978, + "step": 11570 + }, + { + "epoch": 0.563435833759404, + "grad_norm": 1.8744888305664062, + "learning_rate": 1.6875828302865245e-05, + "loss": 0.8156, + "step": 11571 + }, + { + "epoch": 0.5634845275485112, + "grad_norm": 1.3260773420333862, + "learning_rate": 1.6872712696627507e-05, + "loss": 0.8151, + "step": 11572 + }, + { + "epoch": 0.5635332213376184, + "grad_norm": 1.8859922885894775, + "learning_rate": 1.686959716818132e-05, + "loss": 0.8011, + "step": 11573 + }, + { + "epoch": 0.5635819151267256, + "grad_norm": 1.9412193298339844, + "learning_rate": 1.68664817176042e-05, + "loss": 0.8543, + "step": 11574 + }, + { + "epoch": 0.5636306089158328, + "grad_norm": 1.3945505619049072, + "learning_rate": 1.686336634497362e-05, + "loss": 0.8449, + "step": 11575 + }, + { + "epoch": 0.5636793027049399, + "grad_norm": 2.4067296981811523, + "learning_rate": 1.68602510503671e-05, + "loss": 0.8418, + "step": 11576 + }, + { + "epoch": 0.5637279964940471, + "grad_norm": 1.590573787689209, + "learning_rate": 1.6857135833862114e-05, + "loss": 0.7768, + "step": 11577 + }, + { + "epoch": 0.5637766902831544, + "grad_norm": 2.77629017829895, + "learning_rate": 1.6854020695536164e-05, + "loss": 0.8289, + "step": 11578 + }, + { + "epoch": 0.5638253840722616, + "grad_norm": 2.0221805572509766, + "learning_rate": 1.685090563546673e-05, + "loss": 0.7336, + "step": 11579 + }, + { + "epoch": 0.5638740778613688, + "grad_norm": 1.4515079259872437, + "learning_rate": 1.6847790653731315e-05, + "loss": 0.8047, + "step": 11580 + }, + { + "epoch": 0.563922771650476, + "grad_norm": 1.3517484664916992, + "learning_rate": 1.6844675750407386e-05, + "loss": 0.8257, + "step": 11581 + }, + { + "epoch": 0.5639714654395832, + "grad_norm": 2.202712297439575, + "learning_rate": 1.6841560925572436e-05, + "loss": 0.8607, + "step": 11582 + }, + { + "epoch": 0.5640201592286904, + "grad_norm": 1.3378318548202515, + "learning_rate": 1.6838446179303956e-05, + "loss": 0.8156, + "step": 11583 + }, + { + "epoch": 0.5640688530177976, + "grad_norm": 2.447129011154175, + "learning_rate": 1.6835331511679406e-05, + "loss": 0.8835, + "step": 11584 + }, + { + "epoch": 0.5641175468069047, + "grad_norm": 2.2859721183776855, + "learning_rate": 1.683221692277628e-05, + "loss": 0.9061, + "step": 11585 + }, + { + "epoch": 0.564166240596012, + "grad_norm": 1.5352507829666138, + "learning_rate": 1.682910241267204e-05, + "loss": 0.8798, + "step": 11586 + }, + { + "epoch": 0.5642149343851192, + "grad_norm": 2.107348918914795, + "learning_rate": 1.682598798144417e-05, + "loss": 0.7977, + "step": 11587 + }, + { + "epoch": 0.5642636281742264, + "grad_norm": 1.1942024230957031, + "learning_rate": 1.6822873629170134e-05, + "loss": 0.8158, + "step": 11588 + }, + { + "epoch": 0.5643123219633336, + "grad_norm": 1.4699733257293701, + "learning_rate": 1.6819759355927414e-05, + "loss": 0.8779, + "step": 11589 + }, + { + "epoch": 0.5643610157524408, + "grad_norm": 1.4759634733200073, + "learning_rate": 1.6816645161793465e-05, + "loss": 0.8096, + "step": 11590 + }, + { + "epoch": 0.564409709541548, + "grad_norm": 1.3504503965377808, + "learning_rate": 1.6813531046845756e-05, + "loss": 0.8485, + "step": 11591 + }, + { + "epoch": 0.5644584033306552, + "grad_norm": 0.09452690184116364, + "learning_rate": 1.681041701116176e-05, + "loss": 0.6687, + "step": 11592 + }, + { + "epoch": 0.5645070971197623, + "grad_norm": 1.3089107275009155, + "learning_rate": 1.680730305481892e-05, + "loss": 0.8386, + "step": 11593 + }, + { + "epoch": 0.5645557909088695, + "grad_norm": 2.898815393447876, + "learning_rate": 1.6804189177894718e-05, + "loss": 0.8152, + "step": 11594 + }, + { + "epoch": 0.5646044846979767, + "grad_norm": 1.9823299646377563, + "learning_rate": 1.6801075380466592e-05, + "loss": 0.8274, + "step": 11595 + }, + { + "epoch": 0.564653178487084, + "grad_norm": 1.748526692390442, + "learning_rate": 1.6797961662612022e-05, + "loss": 0.8075, + "step": 11596 + }, + { + "epoch": 0.5647018722761912, + "grad_norm": 1.398854374885559, + "learning_rate": 1.6794848024408435e-05, + "loss": 0.7375, + "step": 11597 + }, + { + "epoch": 0.5647505660652984, + "grad_norm": 1.537460446357727, + "learning_rate": 1.6791734465933305e-05, + "loss": 0.9045, + "step": 11598 + }, + { + "epoch": 0.5647992598544056, + "grad_norm": 1.6232712268829346, + "learning_rate": 1.6788620987264065e-05, + "loss": 0.7833, + "step": 11599 + }, + { + "epoch": 0.5648479536435128, + "grad_norm": 1.2921897172927856, + "learning_rate": 1.6785507588478176e-05, + "loss": 0.8645, + "step": 11600 + }, + { + "epoch": 0.5648966474326199, + "grad_norm": 1.3746591806411743, + "learning_rate": 1.678239426965307e-05, + "loss": 0.8261, + "step": 11601 + }, + { + "epoch": 0.5649453412217271, + "grad_norm": 1.2017430067062378, + "learning_rate": 1.6779281030866204e-05, + "loss": 0.7877, + "step": 11602 + }, + { + "epoch": 0.5649940350108343, + "grad_norm": 1.4118587970733643, + "learning_rate": 1.6776167872195023e-05, + "loss": 0.7949, + "step": 11603 + }, + { + "epoch": 0.5650427287999416, + "grad_norm": 1.6803799867630005, + "learning_rate": 1.6773054793716954e-05, + "loss": 0.8556, + "step": 11604 + }, + { + "epoch": 0.5650914225890488, + "grad_norm": 1.7086634635925293, + "learning_rate": 1.6769941795509447e-05, + "loss": 0.8014, + "step": 11605 + }, + { + "epoch": 0.565140116378156, + "grad_norm": 1.6001813411712646, + "learning_rate": 1.6766828877649922e-05, + "loss": 0.8376, + "step": 11606 + }, + { + "epoch": 0.5651888101672632, + "grad_norm": 1.1679246425628662, + "learning_rate": 1.676371604021583e-05, + "loss": 0.8025, + "step": 11607 + }, + { + "epoch": 0.5652375039563704, + "grad_norm": 1.6309325695037842, + "learning_rate": 1.67606032832846e-05, + "loss": 0.842, + "step": 11608 + }, + { + "epoch": 0.5652861977454776, + "grad_norm": 1.5721057653427124, + "learning_rate": 1.6757490606933655e-05, + "loss": 0.8507, + "step": 11609 + }, + { + "epoch": 0.5653348915345847, + "grad_norm": 1.7782543897628784, + "learning_rate": 1.6754378011240425e-05, + "loss": 0.7925, + "step": 11610 + }, + { + "epoch": 0.5653835853236919, + "grad_norm": 2.1669416427612305, + "learning_rate": 1.6751265496282334e-05, + "loss": 0.7868, + "step": 11611 + }, + { + "epoch": 0.5654322791127991, + "grad_norm": 1.4010957479476929, + "learning_rate": 1.674815306213682e-05, + "loss": 0.7358, + "step": 11612 + }, + { + "epoch": 0.5654809729019064, + "grad_norm": 1.2927275896072388, + "learning_rate": 1.674504070888129e-05, + "loss": 0.866, + "step": 11613 + }, + { + "epoch": 0.5655296666910136, + "grad_norm": 0.09849932789802551, + "learning_rate": 1.6741928436593175e-05, + "loss": 0.7176, + "step": 11614 + }, + { + "epoch": 0.5655783604801208, + "grad_norm": 1.3736728429794312, + "learning_rate": 1.6738816245349876e-05, + "loss": 0.9435, + "step": 11615 + }, + { + "epoch": 0.565627054269228, + "grad_norm": 1.6641254425048828, + "learning_rate": 1.6735704135228828e-05, + "loss": 0.8737, + "step": 11616 + }, + { + "epoch": 0.5656757480583352, + "grad_norm": 1.6833558082580566, + "learning_rate": 1.673259210630743e-05, + "loss": 0.8211, + "step": 11617 + }, + { + "epoch": 0.5657244418474423, + "grad_norm": 1.4034321308135986, + "learning_rate": 1.6729480158663113e-05, + "loss": 0.8599, + "step": 11618 + }, + { + "epoch": 0.5657731356365495, + "grad_norm": 1.4586641788482666, + "learning_rate": 1.6726368292373265e-05, + "loss": 0.8816, + "step": 11619 + }, + { + "epoch": 0.5658218294256567, + "grad_norm": 1.6964607238769531, + "learning_rate": 1.6723256507515314e-05, + "loss": 0.8298, + "step": 11620 + }, + { + "epoch": 0.5658705232147639, + "grad_norm": 1.6358264684677124, + "learning_rate": 1.6720144804166646e-05, + "loss": 0.9246, + "step": 11621 + }, + { + "epoch": 0.5659192170038712, + "grad_norm": 3.288757801055908, + "learning_rate": 1.6717033182404676e-05, + "loss": 0.7374, + "step": 11622 + }, + { + "epoch": 0.5659679107929784, + "grad_norm": 2.6235175132751465, + "learning_rate": 1.671392164230681e-05, + "loss": 0.7155, + "step": 11623 + }, + { + "epoch": 0.5660166045820856, + "grad_norm": 0.09554146975278854, + "learning_rate": 1.6710810183950437e-05, + "loss": 0.5838, + "step": 11624 + }, + { + "epoch": 0.5660652983711928, + "grad_norm": 1.1808667182922363, + "learning_rate": 1.6707698807412968e-05, + "loss": 0.785, + "step": 11625 + }, + { + "epoch": 0.5661139921603, + "grad_norm": 1.7752537727355957, + "learning_rate": 1.6704587512771785e-05, + "loss": 0.7634, + "step": 11626 + }, + { + "epoch": 0.5661626859494071, + "grad_norm": 4.0056962966918945, + "learning_rate": 1.6701476300104295e-05, + "loss": 0.885, + "step": 11627 + }, + { + "epoch": 0.5662113797385143, + "grad_norm": 1.272214412689209, + "learning_rate": 1.6698365169487874e-05, + "loss": 0.8064, + "step": 11628 + }, + { + "epoch": 0.5662600735276215, + "grad_norm": 2.2355897426605225, + "learning_rate": 1.6695254120999927e-05, + "loss": 0.8232, + "step": 11629 + }, + { + "epoch": 0.5663087673167287, + "grad_norm": 2.380709648132324, + "learning_rate": 1.6692143154717832e-05, + "loss": 0.7985, + "step": 11630 + }, + { + "epoch": 0.566357461105836, + "grad_norm": 1.522728681564331, + "learning_rate": 1.6689032270718974e-05, + "loss": 0.6969, + "step": 11631 + }, + { + "epoch": 0.5664061548949432, + "grad_norm": 1.2053347826004028, + "learning_rate": 1.668592146908075e-05, + "loss": 0.7322, + "step": 11632 + }, + { + "epoch": 0.5664548486840504, + "grad_norm": 1.7145576477050781, + "learning_rate": 1.668281074988052e-05, + "loss": 0.846, + "step": 11633 + }, + { + "epoch": 0.5665035424731576, + "grad_norm": 1.6976280212402344, + "learning_rate": 1.6679700113195693e-05, + "loss": 0.8281, + "step": 11634 + }, + { + "epoch": 0.5665522362622647, + "grad_norm": 1.4227707386016846, + "learning_rate": 1.667658955910361e-05, + "loss": 0.8381, + "step": 11635 + }, + { + "epoch": 0.5666009300513719, + "grad_norm": 1.8151859045028687, + "learning_rate": 1.6673479087681674e-05, + "loss": 0.7728, + "step": 11636 + }, + { + "epoch": 0.5666496238404791, + "grad_norm": 1.6155630350112915, + "learning_rate": 1.667036869900725e-05, + "loss": 0.8369, + "step": 11637 + }, + { + "epoch": 0.5666983176295863, + "grad_norm": 0.09076511859893799, + "learning_rate": 1.6667258393157702e-05, + "loss": 0.5482, + "step": 11638 + }, + { + "epoch": 0.5667470114186935, + "grad_norm": 1.7372081279754639, + "learning_rate": 1.6664148170210407e-05, + "loss": 0.8522, + "step": 11639 + }, + { + "epoch": 0.5667957052078008, + "grad_norm": 1.371999979019165, + "learning_rate": 1.6661038030242735e-05, + "loss": 0.8472, + "step": 11640 + }, + { + "epoch": 0.566844398996908, + "grad_norm": 1.1164627075195312, + "learning_rate": 1.6657927973332044e-05, + "loss": 0.8537, + "step": 11641 + }, + { + "epoch": 0.5668930927860152, + "grad_norm": 1.5955039262771606, + "learning_rate": 1.665481799955569e-05, + "loss": 0.8281, + "step": 11642 + }, + { + "epoch": 0.5669417865751224, + "grad_norm": 2.402653694152832, + "learning_rate": 1.6651708108991057e-05, + "loss": 0.8765, + "step": 11643 + }, + { + "epoch": 0.5669904803642295, + "grad_norm": 1.607468605041504, + "learning_rate": 1.6648598301715483e-05, + "loss": 0.773, + "step": 11644 + }, + { + "epoch": 0.5670391741533367, + "grad_norm": 1.9130867719650269, + "learning_rate": 1.6645488577806336e-05, + "loss": 0.8903, + "step": 11645 + }, + { + "epoch": 0.5670878679424439, + "grad_norm": 2.2616353034973145, + "learning_rate": 1.664237893734096e-05, + "loss": 0.8409, + "step": 11646 + }, + { + "epoch": 0.5671365617315511, + "grad_norm": 0.09246904402971268, + "learning_rate": 1.6639269380396724e-05, + "loss": 0.6016, + "step": 11647 + }, + { + "epoch": 0.5671852555206583, + "grad_norm": 1.548318862915039, + "learning_rate": 1.663615990705096e-05, + "loss": 0.7259, + "step": 11648 + }, + { + "epoch": 0.5672339493097656, + "grad_norm": 1.6449837684631348, + "learning_rate": 1.663305051738103e-05, + "loss": 0.8687, + "step": 11649 + }, + { + "epoch": 0.5672826430988728, + "grad_norm": 1.5725821256637573, + "learning_rate": 1.6629941211464273e-05, + "loss": 0.7785, + "step": 11650 + }, + { + "epoch": 0.56733133688798, + "grad_norm": 1.5014266967773438, + "learning_rate": 1.662683198937803e-05, + "loss": 0.9404, + "step": 11651 + }, + { + "epoch": 0.5673800306770871, + "grad_norm": 0.08894295990467072, + "learning_rate": 1.6623722851199656e-05, + "loss": 0.6578, + "step": 11652 + }, + { + "epoch": 0.5674287244661943, + "grad_norm": 1.4574198722839355, + "learning_rate": 1.662061379700648e-05, + "loss": 0.8128, + "step": 11653 + }, + { + "epoch": 0.5674774182553015, + "grad_norm": 1.2464672327041626, + "learning_rate": 1.661750482687585e-05, + "loss": 0.839, + "step": 11654 + }, + { + "epoch": 0.5675261120444087, + "grad_norm": 1.4190080165863037, + "learning_rate": 1.661439594088509e-05, + "loss": 0.8461, + "step": 11655 + }, + { + "epoch": 0.5675748058335159, + "grad_norm": 1.347656488418579, + "learning_rate": 1.6611287139111547e-05, + "loss": 0.7788, + "step": 11656 + }, + { + "epoch": 0.5676234996226232, + "grad_norm": 0.09218060970306396, + "learning_rate": 1.6608178421632538e-05, + "loss": 0.5914, + "step": 11657 + }, + { + "epoch": 0.5676721934117304, + "grad_norm": 1.7050052881240845, + "learning_rate": 1.66050697885254e-05, + "loss": 0.9533, + "step": 11658 + }, + { + "epoch": 0.5677208872008376, + "grad_norm": 1.770135760307312, + "learning_rate": 1.660196123986746e-05, + "loss": 0.836, + "step": 11659 + }, + { + "epoch": 0.5677695809899447, + "grad_norm": 1.5475119352340698, + "learning_rate": 1.6598852775736053e-05, + "loss": 0.8093, + "step": 11660 + }, + { + "epoch": 0.5678182747790519, + "grad_norm": 1.3997397422790527, + "learning_rate": 1.6595744396208485e-05, + "loss": 0.8487, + "step": 11661 + }, + { + "epoch": 0.5678669685681591, + "grad_norm": 1.6483811140060425, + "learning_rate": 1.6592636101362084e-05, + "loss": 0.9169, + "step": 11662 + }, + { + "epoch": 0.5679156623572663, + "grad_norm": 2.8943161964416504, + "learning_rate": 1.658952789127418e-05, + "loss": 0.7826, + "step": 11663 + }, + { + "epoch": 0.5679643561463735, + "grad_norm": 1.2716916799545288, + "learning_rate": 1.6586419766022072e-05, + "loss": 0.7418, + "step": 11664 + }, + { + "epoch": 0.5680130499354807, + "grad_norm": 1.616446614265442, + "learning_rate": 1.6583311725683088e-05, + "loss": 0.9321, + "step": 11665 + }, + { + "epoch": 0.568061743724588, + "grad_norm": 1.3809380531311035, + "learning_rate": 1.6580203770334533e-05, + "loss": 0.8653, + "step": 11666 + }, + { + "epoch": 0.5681104375136952, + "grad_norm": 1.5840924978256226, + "learning_rate": 1.6577095900053724e-05, + "loss": 0.731, + "step": 11667 + }, + { + "epoch": 0.5681591313028024, + "grad_norm": 1.6735525131225586, + "learning_rate": 1.6573988114917964e-05, + "loss": 0.7327, + "step": 11668 + }, + { + "epoch": 0.5682078250919095, + "grad_norm": 1.8805370330810547, + "learning_rate": 1.6570880415004568e-05, + "loss": 0.8605, + "step": 11669 + }, + { + "epoch": 0.5682565188810167, + "grad_norm": 2.405919313430786, + "learning_rate": 1.6567772800390828e-05, + "loss": 0.7988, + "step": 11670 + }, + { + "epoch": 0.5683052126701239, + "grad_norm": 1.5224158763885498, + "learning_rate": 1.656466527115405e-05, + "loss": 0.8552, + "step": 11671 + }, + { + "epoch": 0.5683539064592311, + "grad_norm": 1.8051687479019165, + "learning_rate": 1.6561557827371544e-05, + "loss": 0.8122, + "step": 11672 + }, + { + "epoch": 0.5684026002483383, + "grad_norm": 1.3267143964767456, + "learning_rate": 1.6558450469120596e-05, + "loss": 0.8602, + "step": 11673 + }, + { + "epoch": 0.5684512940374455, + "grad_norm": 1.4096139669418335, + "learning_rate": 1.655534319647851e-05, + "loss": 0.6972, + "step": 11674 + }, + { + "epoch": 0.5684999878265528, + "grad_norm": 4.107852935791016, + "learning_rate": 1.655223600952257e-05, + "loss": 0.775, + "step": 11675 + }, + { + "epoch": 0.56854868161566, + "grad_norm": 1.1630672216415405, + "learning_rate": 1.654912890833008e-05, + "loss": 0.8069, + "step": 11676 + }, + { + "epoch": 0.5685973754047671, + "grad_norm": 1.7956664562225342, + "learning_rate": 1.654602189297832e-05, + "loss": 0.7927, + "step": 11677 + }, + { + "epoch": 0.5686460691938743, + "grad_norm": 2.1290640830993652, + "learning_rate": 1.6542914963544584e-05, + "loss": 0.8466, + "step": 11678 + }, + { + "epoch": 0.5686947629829815, + "grad_norm": 1.1431283950805664, + "learning_rate": 1.6539808120106147e-05, + "loss": 0.8428, + "step": 11679 + }, + { + "epoch": 0.5687434567720887, + "grad_norm": 1.3726563453674316, + "learning_rate": 1.65367013627403e-05, + "loss": 0.8606, + "step": 11680 + }, + { + "epoch": 0.5687921505611959, + "grad_norm": 1.9190927743911743, + "learning_rate": 1.6533594691524323e-05, + "loss": 0.8799, + "step": 11681 + }, + { + "epoch": 0.5688408443503031, + "grad_norm": 1.5100674629211426, + "learning_rate": 1.6530488106535495e-05, + "loss": 0.8369, + "step": 11682 + }, + { + "epoch": 0.5688895381394103, + "grad_norm": 2.6750028133392334, + "learning_rate": 1.6527381607851096e-05, + "loss": 0.8118, + "step": 11683 + }, + { + "epoch": 0.5689382319285176, + "grad_norm": 1.4819759130477905, + "learning_rate": 1.6524275195548386e-05, + "loss": 0.8041, + "step": 11684 + }, + { + "epoch": 0.5689869257176248, + "grad_norm": 1.496812343597412, + "learning_rate": 1.652116886970466e-05, + "loss": 0.7917, + "step": 11685 + }, + { + "epoch": 0.5690356195067319, + "grad_norm": 1.421414852142334, + "learning_rate": 1.6518062630397166e-05, + "loss": 0.7152, + "step": 11686 + }, + { + "epoch": 0.5690843132958391, + "grad_norm": 1.4190467596054077, + "learning_rate": 1.6514956477703187e-05, + "loss": 0.804, + "step": 11687 + }, + { + "epoch": 0.5691330070849463, + "grad_norm": 1.2813892364501953, + "learning_rate": 1.6511850411699978e-05, + "loss": 0.8125, + "step": 11688 + }, + { + "epoch": 0.5691817008740535, + "grad_norm": 2.2187466621398926, + "learning_rate": 1.6508744432464815e-05, + "loss": 0.8176, + "step": 11689 + }, + { + "epoch": 0.5692303946631607, + "grad_norm": 1.4579153060913086, + "learning_rate": 1.6505638540074947e-05, + "loss": 0.8204, + "step": 11690 + }, + { + "epoch": 0.5692790884522679, + "grad_norm": 1.44338059425354, + "learning_rate": 1.6502532734607637e-05, + "loss": 0.7936, + "step": 11691 + }, + { + "epoch": 0.5693277822413751, + "grad_norm": 6.900587558746338, + "learning_rate": 1.6499427016140152e-05, + "loss": 0.799, + "step": 11692 + }, + { + "epoch": 0.5693764760304824, + "grad_norm": 1.3316048383712769, + "learning_rate": 1.649632138474973e-05, + "loss": 0.763, + "step": 11693 + }, + { + "epoch": 0.5694251698195895, + "grad_norm": 1.7440844774246216, + "learning_rate": 1.6493215840513644e-05, + "loss": 0.8003, + "step": 11694 + }, + { + "epoch": 0.5694738636086967, + "grad_norm": 1.4780528545379639, + "learning_rate": 1.6490110383509126e-05, + "loss": 0.8424, + "step": 11695 + }, + { + "epoch": 0.5695225573978039, + "grad_norm": 1.96538507938385, + "learning_rate": 1.6487005013813434e-05, + "loss": 0.8958, + "step": 11696 + }, + { + "epoch": 0.5695712511869111, + "grad_norm": 6.907103538513184, + "learning_rate": 1.6483899731503807e-05, + "loss": 0.9718, + "step": 11697 + }, + { + "epoch": 0.5696199449760183, + "grad_norm": 2.854506015777588, + "learning_rate": 1.6480794536657507e-05, + "loss": 0.8446, + "step": 11698 + }, + { + "epoch": 0.5696686387651255, + "grad_norm": 1.2658604383468628, + "learning_rate": 1.6477689429351754e-05, + "loss": 0.81, + "step": 11699 + }, + { + "epoch": 0.5697173325542327, + "grad_norm": 1.9918725490570068, + "learning_rate": 1.6474584409663798e-05, + "loss": 0.7957, + "step": 11700 + }, + { + "epoch": 0.56976602634334, + "grad_norm": 1.47319757938385, + "learning_rate": 1.647147947767088e-05, + "loss": 0.9264, + "step": 11701 + }, + { + "epoch": 0.5698147201324472, + "grad_norm": 1.36324143409729, + "learning_rate": 1.646837463345023e-05, + "loss": 0.8375, + "step": 11702 + }, + { + "epoch": 0.5698634139215543, + "grad_norm": 2.504110813140869, + "learning_rate": 1.6465269877079085e-05, + "loss": 0.8094, + "step": 11703 + }, + { + "epoch": 0.5699121077106615, + "grad_norm": 2.192966938018799, + "learning_rate": 1.6462165208634668e-05, + "loss": 0.8287, + "step": 11704 + }, + { + "epoch": 0.5699608014997687, + "grad_norm": 1.6626477241516113, + "learning_rate": 1.6459060628194223e-05, + "loss": 0.8596, + "step": 11705 + }, + { + "epoch": 0.5700094952888759, + "grad_norm": 2.228633403778076, + "learning_rate": 1.645595613583496e-05, + "loss": 0.8442, + "step": 11706 + }, + { + "epoch": 0.5700581890779831, + "grad_norm": 1.1563620567321777, + "learning_rate": 1.645285173163412e-05, + "loss": 0.8827, + "step": 11707 + }, + { + "epoch": 0.5701068828670903, + "grad_norm": 1.398821234703064, + "learning_rate": 1.6449747415668908e-05, + "loss": 0.8285, + "step": 11708 + }, + { + "epoch": 0.5701555766561975, + "grad_norm": 4.980806827545166, + "learning_rate": 1.6446643188016558e-05, + "loss": 0.7972, + "step": 11709 + }, + { + "epoch": 0.5702042704453048, + "grad_norm": 1.5132287740707397, + "learning_rate": 1.6443539048754277e-05, + "loss": 0.8688, + "step": 11710 + }, + { + "epoch": 0.5702529642344119, + "grad_norm": 2.8004140853881836, + "learning_rate": 1.6440434997959286e-05, + "loss": 0.8953, + "step": 11711 + }, + { + "epoch": 0.5703016580235191, + "grad_norm": 1.807845115661621, + "learning_rate": 1.643733103570881e-05, + "loss": 0.8756, + "step": 11712 + }, + { + "epoch": 0.5703503518126263, + "grad_norm": 1.568616509437561, + "learning_rate": 1.6434227162080035e-05, + "loss": 0.7993, + "step": 11713 + }, + { + "epoch": 0.5703990456017335, + "grad_norm": 1.3652853965759277, + "learning_rate": 1.64311233771502e-05, + "loss": 0.7734, + "step": 11714 + }, + { + "epoch": 0.5704477393908407, + "grad_norm": 1.9158806800842285, + "learning_rate": 1.6428019680996483e-05, + "loss": 0.7726, + "step": 11715 + }, + { + "epoch": 0.5704964331799479, + "grad_norm": 1.4547137022018433, + "learning_rate": 1.6424916073696108e-05, + "loss": 0.8573, + "step": 11716 + }, + { + "epoch": 0.5705451269690551, + "grad_norm": 1.6335296630859375, + "learning_rate": 1.6421812555326268e-05, + "loss": 0.7622, + "step": 11717 + }, + { + "epoch": 0.5705938207581623, + "grad_norm": 2.0109078884124756, + "learning_rate": 1.6418709125964178e-05, + "loss": 0.8843, + "step": 11718 + }, + { + "epoch": 0.5706425145472694, + "grad_norm": 1.5242388248443604, + "learning_rate": 1.641560578568701e-05, + "loss": 0.7955, + "step": 11719 + }, + { + "epoch": 0.5706912083363767, + "grad_norm": 1.7687478065490723, + "learning_rate": 1.6412502534571978e-05, + "loss": 0.7512, + "step": 11720 + }, + { + "epoch": 0.5707399021254839, + "grad_norm": 1.7051929235458374, + "learning_rate": 1.6409399372696282e-05, + "loss": 0.8515, + "step": 11721 + }, + { + "epoch": 0.5707885959145911, + "grad_norm": 1.3720595836639404, + "learning_rate": 1.6406296300137094e-05, + "loss": 0.9025, + "step": 11722 + }, + { + "epoch": 0.5708372897036983, + "grad_norm": 2.5218505859375, + "learning_rate": 1.6403193316971625e-05, + "loss": 0.8141, + "step": 11723 + }, + { + "epoch": 0.5708859834928055, + "grad_norm": 2.191628932952881, + "learning_rate": 1.6400090423277036e-05, + "loss": 0.8936, + "step": 11724 + }, + { + "epoch": 0.5709346772819127, + "grad_norm": 1.4377739429473877, + "learning_rate": 1.6396987619130537e-05, + "loss": 0.8084, + "step": 11725 + }, + { + "epoch": 0.5709833710710199, + "grad_norm": 0.09292563796043396, + "learning_rate": 1.639388490460929e-05, + "loss": 0.5811, + "step": 11726 + }, + { + "epoch": 0.5710320648601271, + "grad_norm": 2.5052549839019775, + "learning_rate": 1.6390782279790496e-05, + "loss": 0.8242, + "step": 11727 + }, + { + "epoch": 0.5710807586492342, + "grad_norm": 1.5731056928634644, + "learning_rate": 1.638767974475131e-05, + "loss": 0.8118, + "step": 11728 + }, + { + "epoch": 0.5711294524383415, + "grad_norm": 1.3660554885864258, + "learning_rate": 1.6384577299568932e-05, + "loss": 0.824, + "step": 11729 + }, + { + "epoch": 0.5711781462274487, + "grad_norm": 1.7566356658935547, + "learning_rate": 1.6381474944320513e-05, + "loss": 0.7236, + "step": 11730 + }, + { + "epoch": 0.5712268400165559, + "grad_norm": 1.3478087186813354, + "learning_rate": 1.6378372679083237e-05, + "loss": 0.9021, + "step": 11731 + }, + { + "epoch": 0.5712755338056631, + "grad_norm": 1.5511138439178467, + "learning_rate": 1.6375270503934273e-05, + "loss": 0.8552, + "step": 11732 + }, + { + "epoch": 0.5713242275947703, + "grad_norm": 1.6298896074295044, + "learning_rate": 1.637216841895078e-05, + "loss": 0.7675, + "step": 11733 + }, + { + "epoch": 0.5713729213838775, + "grad_norm": 1.280774474143982, + "learning_rate": 1.636906642420994e-05, + "loss": 0.7366, + "step": 11734 + }, + { + "epoch": 0.5714216151729847, + "grad_norm": 1.6885720491409302, + "learning_rate": 1.6365964519788894e-05, + "loss": 0.7944, + "step": 11735 + }, + { + "epoch": 0.5714703089620918, + "grad_norm": 1.2339304685592651, + "learning_rate": 1.6362862705764818e-05, + "loss": 0.8274, + "step": 11736 + }, + { + "epoch": 0.571519002751199, + "grad_norm": 1.7986632585525513, + "learning_rate": 1.6359760982214858e-05, + "loss": 0.8276, + "step": 11737 + }, + { + "epoch": 0.5715676965403063, + "grad_norm": 1.4867433309555054, + "learning_rate": 1.635665934921618e-05, + "loss": 0.8614, + "step": 11738 + }, + { + "epoch": 0.5716163903294135, + "grad_norm": 2.016258716583252, + "learning_rate": 1.6353557806845928e-05, + "loss": 0.7878, + "step": 11739 + }, + { + "epoch": 0.5716650841185207, + "grad_norm": 0.09602691233158112, + "learning_rate": 1.6350456355181255e-05, + "loss": 0.5657, + "step": 11740 + }, + { + "epoch": 0.5717137779076279, + "grad_norm": 1.9706045389175415, + "learning_rate": 1.6347354994299328e-05, + "loss": 0.8822, + "step": 11741 + }, + { + "epoch": 0.5717624716967351, + "grad_norm": 3.0120043754577637, + "learning_rate": 1.6344253724277263e-05, + "loss": 0.788, + "step": 11742 + }, + { + "epoch": 0.5718111654858423, + "grad_norm": 1.7599694728851318, + "learning_rate": 1.6341152545192232e-05, + "loss": 0.7687, + "step": 11743 + }, + { + "epoch": 0.5718598592749495, + "grad_norm": 1.7689388990402222, + "learning_rate": 1.633805145712136e-05, + "loss": 0.8832, + "step": 11744 + }, + { + "epoch": 0.5719085530640566, + "grad_norm": 2.942410469055176, + "learning_rate": 1.633495046014179e-05, + "loss": 0.8993, + "step": 11745 + }, + { + "epoch": 0.5719572468531638, + "grad_norm": 1.72932767868042, + "learning_rate": 1.6331849554330662e-05, + "loss": 0.8363, + "step": 11746 + }, + { + "epoch": 0.5720059406422711, + "grad_norm": 1.69829261302948, + "learning_rate": 1.6328748739765115e-05, + "loss": 0.8073, + "step": 11747 + }, + { + "epoch": 0.5720546344313783, + "grad_norm": 1.6276484727859497, + "learning_rate": 1.6325648016522272e-05, + "loss": 0.7336, + "step": 11748 + }, + { + "epoch": 0.5721033282204855, + "grad_norm": 1.8185107707977295, + "learning_rate": 1.6322547384679275e-05, + "loss": 0.7921, + "step": 11749 + }, + { + "epoch": 0.5721520220095927, + "grad_norm": 1.6903486251831055, + "learning_rate": 1.631944684431324e-05, + "loss": 0.844, + "step": 11750 + }, + { + "epoch": 0.5722007157986999, + "grad_norm": 1.4971362352371216, + "learning_rate": 1.6316346395501304e-05, + "loss": 0.7888, + "step": 11751 + }, + { + "epoch": 0.5722494095878071, + "grad_norm": 0.0973396971821785, + "learning_rate": 1.631324603832059e-05, + "loss": 0.6656, + "step": 11752 + }, + { + "epoch": 0.5722981033769142, + "grad_norm": 3.8253226280212402, + "learning_rate": 1.6310145772848214e-05, + "loss": 0.7164, + "step": 11753 + }, + { + "epoch": 0.5723467971660214, + "grad_norm": 2.4323792457580566, + "learning_rate": 1.63070455991613e-05, + "loss": 0.8712, + "step": 11754 + }, + { + "epoch": 0.5723954909551286, + "grad_norm": 1.4191787242889404, + "learning_rate": 1.6303945517336958e-05, + "loss": 0.7931, + "step": 11755 + }, + { + "epoch": 0.5724441847442359, + "grad_norm": 1.5359667539596558, + "learning_rate": 1.6300845527452317e-05, + "loss": 0.8346, + "step": 11756 + }, + { + "epoch": 0.5724928785333431, + "grad_norm": 1.7025688886642456, + "learning_rate": 1.6297745629584473e-05, + "loss": 0.891, + "step": 11757 + }, + { + "epoch": 0.5725415723224503, + "grad_norm": 1.3549343347549438, + "learning_rate": 1.6294645823810554e-05, + "loss": 0.8637, + "step": 11758 + }, + { + "epoch": 0.5725902661115575, + "grad_norm": 1.5026179552078247, + "learning_rate": 1.6291546110207647e-05, + "loss": 0.8431, + "step": 11759 + }, + { + "epoch": 0.5726389599006647, + "grad_norm": 1.2810121774673462, + "learning_rate": 1.6288446488852872e-05, + "loss": 0.8144, + "step": 11760 + }, + { + "epoch": 0.5726876536897719, + "grad_norm": 1.9487446546554565, + "learning_rate": 1.628534695982333e-05, + "loss": 0.7833, + "step": 11761 + }, + { + "epoch": 0.572736347478879, + "grad_norm": 1.6301658153533936, + "learning_rate": 1.6282247523196117e-05, + "loss": 0.846, + "step": 11762 + }, + { + "epoch": 0.5727850412679862, + "grad_norm": 1.1808215379714966, + "learning_rate": 1.6279148179048348e-05, + "loss": 0.8074, + "step": 11763 + }, + { + "epoch": 0.5728337350570935, + "grad_norm": 2.425386667251587, + "learning_rate": 1.6276048927457092e-05, + "loss": 0.886, + "step": 11764 + }, + { + "epoch": 0.5728824288462007, + "grad_norm": 2.2944717407226562, + "learning_rate": 1.6272949768499473e-05, + "loss": 0.7909, + "step": 11765 + }, + { + "epoch": 0.5729311226353079, + "grad_norm": 1.9629853963851929, + "learning_rate": 1.6269850702252558e-05, + "loss": 0.8415, + "step": 11766 + }, + { + "epoch": 0.5729798164244151, + "grad_norm": 1.2274397611618042, + "learning_rate": 1.6266751728793454e-05, + "loss": 0.8795, + "step": 11767 + }, + { + "epoch": 0.5730285102135223, + "grad_norm": 1.4139865636825562, + "learning_rate": 1.626365284819923e-05, + "loss": 0.8632, + "step": 11768 + }, + { + "epoch": 0.5730772040026295, + "grad_norm": 1.3901835680007935, + "learning_rate": 1.6260554060546988e-05, + "loss": 0.7536, + "step": 11769 + }, + { + "epoch": 0.5731258977917366, + "grad_norm": 2.1510612964630127, + "learning_rate": 1.625745536591381e-05, + "loss": 0.846, + "step": 11770 + }, + { + "epoch": 0.5731745915808438, + "grad_norm": 1.1989006996154785, + "learning_rate": 1.6254356764376763e-05, + "loss": 0.7528, + "step": 11771 + }, + { + "epoch": 0.573223285369951, + "grad_norm": 1.540771484375, + "learning_rate": 1.6251258256012943e-05, + "loss": 0.8289, + "step": 11772 + }, + { + "epoch": 0.5732719791590583, + "grad_norm": 2.398927688598633, + "learning_rate": 1.6248159840899408e-05, + "loss": 0.8078, + "step": 11773 + }, + { + "epoch": 0.5733206729481655, + "grad_norm": 1.5389575958251953, + "learning_rate": 1.624506151911324e-05, + "loss": 0.7109, + "step": 11774 + }, + { + "epoch": 0.5733693667372727, + "grad_norm": 0.08793176710605621, + "learning_rate": 1.624196329073151e-05, + "loss": 0.5233, + "step": 11775 + }, + { + "epoch": 0.5734180605263799, + "grad_norm": 1.7920830249786377, + "learning_rate": 1.6238865155831286e-05, + "loss": 0.816, + "step": 11776 + }, + { + "epoch": 0.5734667543154871, + "grad_norm": 1.1594061851501465, + "learning_rate": 1.623576711448963e-05, + "loss": 0.8758, + "step": 11777 + }, + { + "epoch": 0.5735154481045942, + "grad_norm": 1.8576817512512207, + "learning_rate": 1.623266916678362e-05, + "loss": 0.7545, + "step": 11778 + }, + { + "epoch": 0.5735641418937014, + "grad_norm": 1.422524094581604, + "learning_rate": 1.62295713127903e-05, + "loss": 0.7917, + "step": 11779 + }, + { + "epoch": 0.5736128356828086, + "grad_norm": 1.56089186668396, + "learning_rate": 1.6226473552586735e-05, + "loss": 0.8368, + "step": 11780 + }, + { + "epoch": 0.5736615294719158, + "grad_norm": 1.7573161125183105, + "learning_rate": 1.6223375886249996e-05, + "loss": 0.7424, + "step": 11781 + }, + { + "epoch": 0.5737102232610231, + "grad_norm": 1.2320882081985474, + "learning_rate": 1.6220278313857113e-05, + "loss": 0.814, + "step": 11782 + }, + { + "epoch": 0.5737589170501303, + "grad_norm": 1.9120427370071411, + "learning_rate": 1.6217180835485157e-05, + "loss": 0.8048, + "step": 11783 + }, + { + "epoch": 0.5738076108392375, + "grad_norm": 2.023253917694092, + "learning_rate": 1.6214083451211173e-05, + "loss": 0.7358, + "step": 11784 + }, + { + "epoch": 0.5738563046283447, + "grad_norm": 1.470017671585083, + "learning_rate": 1.6210986161112212e-05, + "loss": 0.8233, + "step": 11785 + }, + { + "epoch": 0.5739049984174519, + "grad_norm": 2.188070297241211, + "learning_rate": 1.620788896526531e-05, + "loss": 0.7889, + "step": 11786 + }, + { + "epoch": 0.573953692206559, + "grad_norm": 1.9017977714538574, + "learning_rate": 1.6204791863747523e-05, + "loss": 0.8934, + "step": 11787 + }, + { + "epoch": 0.5740023859956662, + "grad_norm": 1.102634072303772, + "learning_rate": 1.6201694856635873e-05, + "loss": 0.8917, + "step": 11788 + }, + { + "epoch": 0.5740510797847734, + "grad_norm": 1.3645853996276855, + "learning_rate": 1.6198597944007414e-05, + "loss": 0.7932, + "step": 11789 + }, + { + "epoch": 0.5740997735738806, + "grad_norm": 2.1579558849334717, + "learning_rate": 1.619550112593918e-05, + "loss": 0.8139, + "step": 11790 + }, + { + "epoch": 0.5741484673629879, + "grad_norm": 1.5333375930786133, + "learning_rate": 1.61924044025082e-05, + "loss": 0.8514, + "step": 11791 + }, + { + "epoch": 0.5741971611520951, + "grad_norm": 1.291452169418335, + "learning_rate": 1.6189307773791516e-05, + "loss": 0.8803, + "step": 11792 + }, + { + "epoch": 0.5742458549412023, + "grad_norm": 1.7582197189331055, + "learning_rate": 1.618621123986614e-05, + "loss": 0.8668, + "step": 11793 + }, + { + "epoch": 0.5742945487303095, + "grad_norm": 1.4981672763824463, + "learning_rate": 1.6183114800809115e-05, + "loss": 0.7914, + "step": 11794 + }, + { + "epoch": 0.5743432425194166, + "grad_norm": 1.3285022974014282, + "learning_rate": 1.6180018456697445e-05, + "loss": 0.7152, + "step": 11795 + }, + { + "epoch": 0.5743919363085238, + "grad_norm": 2.0452170372009277, + "learning_rate": 1.6176922207608173e-05, + "loss": 0.7767, + "step": 11796 + }, + { + "epoch": 0.574440630097631, + "grad_norm": 1.4723395109176636, + "learning_rate": 1.6173826053618306e-05, + "loss": 0.8465, + "step": 11797 + }, + { + "epoch": 0.5744893238867382, + "grad_norm": 1.5452018976211548, + "learning_rate": 1.617072999480487e-05, + "loss": 0.839, + "step": 11798 + }, + { + "epoch": 0.5745380176758454, + "grad_norm": 1.5811049938201904, + "learning_rate": 1.6167634031244867e-05, + "loss": 0.8294, + "step": 11799 + }, + { + "epoch": 0.5745867114649527, + "grad_norm": 1.2244688272476196, + "learning_rate": 1.6164538163015317e-05, + "loss": 0.7755, + "step": 11800 + }, + { + "epoch": 0.5746354052540599, + "grad_norm": 1.7956085205078125, + "learning_rate": 1.616144239019324e-05, + "loss": 0.8328, + "step": 11801 + }, + { + "epoch": 0.5746840990431671, + "grad_norm": 1.3000024557113647, + "learning_rate": 1.6158346712855624e-05, + "loss": 0.8116, + "step": 11802 + }, + { + "epoch": 0.5747327928322743, + "grad_norm": 1.5389939546585083, + "learning_rate": 1.6155251131079494e-05, + "loss": 0.8159, + "step": 11803 + }, + { + "epoch": 0.5747814866213814, + "grad_norm": 1.2840049266815186, + "learning_rate": 1.615215564494183e-05, + "loss": 0.804, + "step": 11804 + }, + { + "epoch": 0.5748301804104886, + "grad_norm": 2.070992946624756, + "learning_rate": 1.6149060254519653e-05, + "loss": 0.7844, + "step": 11805 + }, + { + "epoch": 0.5748788741995958, + "grad_norm": 1.532251238822937, + "learning_rate": 1.614596495988995e-05, + "loss": 0.8053, + "step": 11806 + }, + { + "epoch": 0.574927567988703, + "grad_norm": 1.8308628797531128, + "learning_rate": 1.6142869761129725e-05, + "loss": 0.7366, + "step": 11807 + }, + { + "epoch": 0.5749762617778102, + "grad_norm": 1.4130221605300903, + "learning_rate": 1.6139774658315958e-05, + "loss": 0.8949, + "step": 11808 + }, + { + "epoch": 0.5750249555669175, + "grad_norm": 1.4866318702697754, + "learning_rate": 1.613667965152565e-05, + "loss": 0.8547, + "step": 11809 + }, + { + "epoch": 0.5750736493560247, + "grad_norm": 1.606611728668213, + "learning_rate": 1.6133584740835795e-05, + "loss": 0.8966, + "step": 11810 + }, + { + "epoch": 0.5751223431451319, + "grad_norm": 1.144804835319519, + "learning_rate": 1.6130489926323363e-05, + "loss": 0.8209, + "step": 11811 + }, + { + "epoch": 0.575171036934239, + "grad_norm": 0.09973306208848953, + "learning_rate": 1.6127395208065357e-05, + "loss": 0.5883, + "step": 11812 + }, + { + "epoch": 0.5752197307233462, + "grad_norm": 1.7205402851104736, + "learning_rate": 1.6124300586138737e-05, + "loss": 0.7535, + "step": 11813 + }, + { + "epoch": 0.5752684245124534, + "grad_norm": 1.595881462097168, + "learning_rate": 1.6121206060620505e-05, + "loss": 0.8289, + "step": 11814 + }, + { + "epoch": 0.5753171183015606, + "grad_norm": 1.300062656402588, + "learning_rate": 1.6118111631587618e-05, + "loss": 0.7599, + "step": 11815 + }, + { + "epoch": 0.5753658120906678, + "grad_norm": 3.026090145111084, + "learning_rate": 1.6115017299117065e-05, + "loss": 0.8621, + "step": 11816 + }, + { + "epoch": 0.575414505879775, + "grad_norm": 1.7524352073669434, + "learning_rate": 1.6111923063285803e-05, + "loss": 0.8567, + "step": 11817 + }, + { + "epoch": 0.5754631996688823, + "grad_norm": 1.7032991647720337, + "learning_rate": 1.6108828924170815e-05, + "loss": 0.724, + "step": 11818 + }, + { + "epoch": 0.5755118934579895, + "grad_norm": 1.9965088367462158, + "learning_rate": 1.6105734881849054e-05, + "loss": 0.8377, + "step": 11819 + }, + { + "epoch": 0.5755605872470966, + "grad_norm": 1.3475267887115479, + "learning_rate": 1.6102640936397498e-05, + "loss": 0.8694, + "step": 11820 + }, + { + "epoch": 0.5756092810362038, + "grad_norm": 1.3145865201950073, + "learning_rate": 1.609954708789311e-05, + "loss": 0.8859, + "step": 11821 + }, + { + "epoch": 0.575657974825311, + "grad_norm": 1.3335990905761719, + "learning_rate": 1.6096453336412835e-05, + "loss": 0.9096, + "step": 11822 + }, + { + "epoch": 0.5757066686144182, + "grad_norm": 1.601745843887329, + "learning_rate": 1.609335968203365e-05, + "loss": 0.8183, + "step": 11823 + }, + { + "epoch": 0.5757553624035254, + "grad_norm": 1.651978850364685, + "learning_rate": 1.6090266124832485e-05, + "loss": 0.856, + "step": 11824 + }, + { + "epoch": 0.5758040561926326, + "grad_norm": 1.2134637832641602, + "learning_rate": 1.6087172664886316e-05, + "loss": 0.8027, + "step": 11825 + }, + { + "epoch": 0.5758527499817399, + "grad_norm": 1.5595552921295166, + "learning_rate": 1.608407930227208e-05, + "loss": 0.821, + "step": 11826 + }, + { + "epoch": 0.5759014437708471, + "grad_norm": 1.5969246625900269, + "learning_rate": 1.6080986037066733e-05, + "loss": 0.8374, + "step": 11827 + }, + { + "epoch": 0.5759501375599543, + "grad_norm": 1.4081693887710571, + "learning_rate": 1.607789286934721e-05, + "loss": 0.8701, + "step": 11828 + }, + { + "epoch": 0.5759988313490614, + "grad_norm": 0.10069616883993149, + "learning_rate": 1.6074799799190458e-05, + "loss": 0.643, + "step": 11829 + }, + { + "epoch": 0.5760475251381686, + "grad_norm": 1.739760398864746, + "learning_rate": 1.6071706826673427e-05, + "loss": 0.77, + "step": 11830 + }, + { + "epoch": 0.5760962189272758, + "grad_norm": 1.7377440929412842, + "learning_rate": 1.6068613951873037e-05, + "loss": 0.8169, + "step": 11831 + }, + { + "epoch": 0.576144912716383, + "grad_norm": 1.7164289951324463, + "learning_rate": 1.6065521174866247e-05, + "loss": 0.7703, + "step": 11832 + }, + { + "epoch": 0.5761936065054902, + "grad_norm": 0.08848840743303299, + "learning_rate": 1.6062428495729963e-05, + "loss": 0.5872, + "step": 11833 + }, + { + "epoch": 0.5762423002945974, + "grad_norm": 1.948520302772522, + "learning_rate": 1.6059335914541137e-05, + "loss": 0.8862, + "step": 11834 + }, + { + "epoch": 0.5762909940837047, + "grad_norm": 1.5584447383880615, + "learning_rate": 1.6056243431376683e-05, + "loss": 0.8287, + "step": 11835 + }, + { + "epoch": 0.5763396878728119, + "grad_norm": 1.3367630243301392, + "learning_rate": 1.6053151046313542e-05, + "loss": 0.8641, + "step": 11836 + }, + { + "epoch": 0.576388381661919, + "grad_norm": 1.323722243309021, + "learning_rate": 1.605005875942862e-05, + "loss": 0.7775, + "step": 11837 + }, + { + "epoch": 0.5764370754510262, + "grad_norm": 1.273629903793335, + "learning_rate": 1.6046966570798855e-05, + "loss": 0.787, + "step": 11838 + }, + { + "epoch": 0.5764857692401334, + "grad_norm": 1.4313257932662964, + "learning_rate": 1.604387448050115e-05, + "loss": 0.8278, + "step": 11839 + }, + { + "epoch": 0.5765344630292406, + "grad_norm": 1.7671868801116943, + "learning_rate": 1.6040782488612427e-05, + "loss": 0.7247, + "step": 11840 + }, + { + "epoch": 0.5765831568183478, + "grad_norm": 3.0610289573669434, + "learning_rate": 1.6037690595209605e-05, + "loss": 0.7482, + "step": 11841 + }, + { + "epoch": 0.576631850607455, + "grad_norm": 2.0276949405670166, + "learning_rate": 1.603459880036959e-05, + "loss": 0.7712, + "step": 11842 + }, + { + "epoch": 0.5766805443965622, + "grad_norm": 3.064699172973633, + "learning_rate": 1.6031507104169294e-05, + "loss": 0.8147, + "step": 11843 + }, + { + "epoch": 0.5767292381856695, + "grad_norm": 1.3504588603973389, + "learning_rate": 1.6028415506685614e-05, + "loss": 0.7442, + "step": 11844 + }, + { + "epoch": 0.5767779319747767, + "grad_norm": 1.4617482423782349, + "learning_rate": 1.602532400799547e-05, + "loss": 0.8872, + "step": 11845 + }, + { + "epoch": 0.5768266257638838, + "grad_norm": 1.7290269136428833, + "learning_rate": 1.6022232608175743e-05, + "loss": 0.868, + "step": 11846 + }, + { + "epoch": 0.576875319552991, + "grad_norm": 1.587013602256775, + "learning_rate": 1.6019141307303346e-05, + "loss": 0.8747, + "step": 11847 + }, + { + "epoch": 0.5769240133420982, + "grad_norm": 1.142155408859253, + "learning_rate": 1.6016050105455172e-05, + "loss": 0.8838, + "step": 11848 + }, + { + "epoch": 0.5769727071312054, + "grad_norm": 1.4583640098571777, + "learning_rate": 1.6012959002708112e-05, + "loss": 0.8677, + "step": 11849 + }, + { + "epoch": 0.5770214009203126, + "grad_norm": 5.138436317443848, + "learning_rate": 1.6009867999139072e-05, + "loss": 0.8589, + "step": 11850 + }, + { + "epoch": 0.5770700947094198, + "grad_norm": 1.1522992849349976, + "learning_rate": 1.6006777094824916e-05, + "loss": 0.8548, + "step": 11851 + }, + { + "epoch": 0.577118788498527, + "grad_norm": 1.7470324039459229, + "learning_rate": 1.6003686289842554e-05, + "loss": 0.7554, + "step": 11852 + }, + { + "epoch": 0.5771674822876343, + "grad_norm": 1.7151224613189697, + "learning_rate": 1.6000595584268853e-05, + "loss": 0.7653, + "step": 11853 + }, + { + "epoch": 0.5772161760767414, + "grad_norm": 1.7662336826324463, + "learning_rate": 1.5997504978180707e-05, + "loss": 0.8465, + "step": 11854 + }, + { + "epoch": 0.5772648698658486, + "grad_norm": 1.2710238695144653, + "learning_rate": 1.5994414471654988e-05, + "loss": 0.7333, + "step": 11855 + }, + { + "epoch": 0.5773135636549558, + "grad_norm": 1.233204960823059, + "learning_rate": 1.599132406476857e-05, + "loss": 0.8778, + "step": 11856 + }, + { + "epoch": 0.577362257444063, + "grad_norm": 1.327086091041565, + "learning_rate": 1.5988233757598334e-05, + "loss": 0.801, + "step": 11857 + }, + { + "epoch": 0.5774109512331702, + "grad_norm": 5.3877644538879395, + "learning_rate": 1.5985143550221145e-05, + "loss": 0.8357, + "step": 11858 + }, + { + "epoch": 0.5774596450222774, + "grad_norm": 0.09816757589578629, + "learning_rate": 1.5982053442713885e-05, + "loss": 0.6651, + "step": 11859 + }, + { + "epoch": 0.5775083388113846, + "grad_norm": 1.9686185121536255, + "learning_rate": 1.5978963435153405e-05, + "loss": 0.8524, + "step": 11860 + }, + { + "epoch": 0.5775570326004918, + "grad_norm": 0.08784715086221695, + "learning_rate": 1.5975873527616582e-05, + "loss": 0.5731, + "step": 11861 + }, + { + "epoch": 0.5776057263895991, + "grad_norm": 0.09166672080755234, + "learning_rate": 1.5972783720180267e-05, + "loss": 0.5675, + "step": 11862 + }, + { + "epoch": 0.5776544201787062, + "grad_norm": 1.4869956970214844, + "learning_rate": 1.5969694012921326e-05, + "loss": 0.8455, + "step": 11863 + }, + { + "epoch": 0.5777031139678134, + "grad_norm": 1.7075825929641724, + "learning_rate": 1.596660440591661e-05, + "loss": 0.8599, + "step": 11864 + }, + { + "epoch": 0.5777518077569206, + "grad_norm": 2.1655914783477783, + "learning_rate": 1.5963514899242986e-05, + "loss": 0.8126, + "step": 11865 + }, + { + "epoch": 0.5778005015460278, + "grad_norm": 1.7709355354309082, + "learning_rate": 1.596042549297729e-05, + "loss": 0.7191, + "step": 11866 + }, + { + "epoch": 0.577849195335135, + "grad_norm": 1.4605132341384888, + "learning_rate": 1.5957336187196384e-05, + "loss": 0.9072, + "step": 11867 + }, + { + "epoch": 0.5778978891242422, + "grad_norm": 1.8076071739196777, + "learning_rate": 1.5954246981977103e-05, + "loss": 0.8922, + "step": 11868 + }, + { + "epoch": 0.5779465829133494, + "grad_norm": 1.5942920446395874, + "learning_rate": 1.5951157877396296e-05, + "loss": 0.9243, + "step": 11869 + }, + { + "epoch": 0.5779952767024567, + "grad_norm": 2.235006332397461, + "learning_rate": 1.5948068873530807e-05, + "loss": 0.8787, + "step": 11870 + }, + { + "epoch": 0.5780439704915638, + "grad_norm": 2.2262678146362305, + "learning_rate": 1.5944979970457472e-05, + "loss": 0.8577, + "step": 11871 + }, + { + "epoch": 0.578092664280671, + "grad_norm": 1.7391334772109985, + "learning_rate": 1.594189116825314e-05, + "loss": 0.7894, + "step": 11872 + }, + { + "epoch": 0.5781413580697782, + "grad_norm": 1.6395493745803833, + "learning_rate": 1.5938802466994627e-05, + "loss": 0.8047, + "step": 11873 + }, + { + "epoch": 0.5781900518588854, + "grad_norm": 4.880466461181641, + "learning_rate": 1.593571386675878e-05, + "loss": 0.7554, + "step": 11874 + }, + { + "epoch": 0.5782387456479926, + "grad_norm": 3.1161630153656006, + "learning_rate": 1.593262536762241e-05, + "loss": 0.83, + "step": 11875 + }, + { + "epoch": 0.5782874394370998, + "grad_norm": 1.7021795511245728, + "learning_rate": 1.5929536969662365e-05, + "loss": 0.9125, + "step": 11876 + }, + { + "epoch": 0.578336133226207, + "grad_norm": 1.7200677394866943, + "learning_rate": 1.5926448672955454e-05, + "loss": 0.8478, + "step": 11877 + }, + { + "epoch": 0.5783848270153142, + "grad_norm": 1.5112884044647217, + "learning_rate": 1.59233604775785e-05, + "loss": 0.8404, + "step": 11878 + }, + { + "epoch": 0.5784335208044213, + "grad_norm": 1.77664053440094, + "learning_rate": 1.5920272383608334e-05, + "loss": 0.8448, + "step": 11879 + }, + { + "epoch": 0.5784822145935286, + "grad_norm": 2.0961246490478516, + "learning_rate": 1.591718439112176e-05, + "loss": 0.8058, + "step": 11880 + }, + { + "epoch": 0.5785309083826358, + "grad_norm": 0.09403571486473083, + "learning_rate": 1.59140965001956e-05, + "loss": 0.596, + "step": 11881 + }, + { + "epoch": 0.578579602171743, + "grad_norm": 0.09682010859251022, + "learning_rate": 1.5911008710906658e-05, + "loss": 0.6698, + "step": 11882 + }, + { + "epoch": 0.5786282959608502, + "grad_norm": 1.4604685306549072, + "learning_rate": 1.590792102333175e-05, + "loss": 0.8442, + "step": 11883 + }, + { + "epoch": 0.5786769897499574, + "grad_norm": 1.8020176887512207, + "learning_rate": 1.590483343754768e-05, + "loss": 0.8061, + "step": 11884 + }, + { + "epoch": 0.5787256835390646, + "grad_norm": 0.0948135182261467, + "learning_rate": 1.590174595363125e-05, + "loss": 0.6252, + "step": 11885 + }, + { + "epoch": 0.5787743773281718, + "grad_norm": 1.21598482131958, + "learning_rate": 1.589865857165926e-05, + "loss": 0.9602, + "step": 11886 + }, + { + "epoch": 0.578823071117279, + "grad_norm": 2.7375025749206543, + "learning_rate": 1.589557129170852e-05, + "loss": 0.8322, + "step": 11887 + }, + { + "epoch": 0.5788717649063861, + "grad_norm": 1.6233830451965332, + "learning_rate": 1.589248411385581e-05, + "loss": 0.7615, + "step": 11888 + }, + { + "epoch": 0.5789204586954934, + "grad_norm": 0.0914435163140297, + "learning_rate": 1.5889397038177932e-05, + "loss": 0.5955, + "step": 11889 + }, + { + "epoch": 0.5789691524846006, + "grad_norm": 2.475602149963379, + "learning_rate": 1.588631006475169e-05, + "loss": 0.9627, + "step": 11890 + }, + { + "epoch": 0.5790178462737078, + "grad_norm": 1.5250039100646973, + "learning_rate": 1.5883223193653846e-05, + "loss": 0.8098, + "step": 11891 + }, + { + "epoch": 0.579066540062815, + "grad_norm": 1.427276611328125, + "learning_rate": 1.5880136424961206e-05, + "loss": 0.8031, + "step": 11892 + }, + { + "epoch": 0.5791152338519222, + "grad_norm": 3.931004524230957, + "learning_rate": 1.5877049758750543e-05, + "loss": 0.8232, + "step": 11893 + }, + { + "epoch": 0.5791639276410294, + "grad_norm": 1.4446769952774048, + "learning_rate": 1.5873963195098653e-05, + "loss": 0.7617, + "step": 11894 + }, + { + "epoch": 0.5792126214301366, + "grad_norm": 1.448462724685669, + "learning_rate": 1.5870876734082298e-05, + "loss": 0.8761, + "step": 11895 + }, + { + "epoch": 0.5792613152192437, + "grad_norm": 1.3439280986785889, + "learning_rate": 1.5867790375778266e-05, + "loss": 0.7928, + "step": 11896 + }, + { + "epoch": 0.579310009008351, + "grad_norm": 1.47232186794281, + "learning_rate": 1.5864704120263322e-05, + "loss": 0.7876, + "step": 11897 + }, + { + "epoch": 0.5793587027974582, + "grad_norm": 1.3968157768249512, + "learning_rate": 1.5861617967614234e-05, + "loss": 0.8422, + "step": 11898 + }, + { + "epoch": 0.5794073965865654, + "grad_norm": 1.5773670673370361, + "learning_rate": 1.585853191790778e-05, + "loss": 0.7611, + "step": 11899 + }, + { + "epoch": 0.5794560903756726, + "grad_norm": 5.338950157165527, + "learning_rate": 1.5855445971220724e-05, + "loss": 0.897, + "step": 11900 + }, + { + "epoch": 0.5795047841647798, + "grad_norm": 1.7188583612442017, + "learning_rate": 1.5852360127629835e-05, + "loss": 0.8155, + "step": 11901 + }, + { + "epoch": 0.579553477953887, + "grad_norm": 1.6604740619659424, + "learning_rate": 1.5849274387211852e-05, + "loss": 0.8154, + "step": 11902 + }, + { + "epoch": 0.5796021717429942, + "grad_norm": 1.5150458812713623, + "learning_rate": 1.584618875004356e-05, + "loss": 0.8273, + "step": 11903 + }, + { + "epoch": 0.5796508655321014, + "grad_norm": 1.5298407077789307, + "learning_rate": 1.5843103216201693e-05, + "loss": 0.8426, + "step": 11904 + }, + { + "epoch": 0.5796995593212085, + "grad_norm": 1.8549216985702515, + "learning_rate": 1.5840017785763016e-05, + "loss": 0.7732, + "step": 11905 + }, + { + "epoch": 0.5797482531103157, + "grad_norm": 1.7362029552459717, + "learning_rate": 1.583693245880427e-05, + "loss": 0.8757, + "step": 11906 + }, + { + "epoch": 0.579796946899423, + "grad_norm": 2.762906074523926, + "learning_rate": 1.583384723540222e-05, + "loss": 0.8194, + "step": 11907 + }, + { + "epoch": 0.5798456406885302, + "grad_norm": 0.0852174386382103, + "learning_rate": 1.5830762115633587e-05, + "loss": 0.6115, + "step": 11908 + }, + { + "epoch": 0.5798943344776374, + "grad_norm": 1.4401675462722778, + "learning_rate": 1.5827677099575128e-05, + "loss": 0.8172, + "step": 11909 + }, + { + "epoch": 0.5799430282667446, + "grad_norm": 1.3679766654968262, + "learning_rate": 1.5824592187303595e-05, + "loss": 0.8233, + "step": 11910 + }, + { + "epoch": 0.5799917220558518, + "grad_norm": 1.6564521789550781, + "learning_rate": 1.58215073788957e-05, + "loss": 0.8007, + "step": 11911 + }, + { + "epoch": 0.580040415844959, + "grad_norm": 1.5287482738494873, + "learning_rate": 1.5818422674428193e-05, + "loss": 0.7371, + "step": 11912 + }, + { + "epoch": 0.5800891096340661, + "grad_norm": 2.0663537979125977, + "learning_rate": 1.5815338073977803e-05, + "loss": 0.757, + "step": 11913 + }, + { + "epoch": 0.5801378034231733, + "grad_norm": 1.5522887706756592, + "learning_rate": 1.5812253577621263e-05, + "loss": 0.8325, + "step": 11914 + }, + { + "epoch": 0.5801864972122805, + "grad_norm": 1.3787603378295898, + "learning_rate": 1.580916918543529e-05, + "loss": 0.8134, + "step": 11915 + }, + { + "epoch": 0.5802351910013878, + "grad_norm": 0.09518615901470184, + "learning_rate": 1.5806084897496626e-05, + "loss": 0.594, + "step": 11916 + }, + { + "epoch": 0.580283884790495, + "grad_norm": 2.0096120834350586, + "learning_rate": 1.5803000713881974e-05, + "loss": 0.8634, + "step": 11917 + }, + { + "epoch": 0.5803325785796022, + "grad_norm": 1.7035495042800903, + "learning_rate": 1.579991663466806e-05, + "loss": 0.801, + "step": 11918 + }, + { + "epoch": 0.5803812723687094, + "grad_norm": 1.8591111898422241, + "learning_rate": 1.579683265993161e-05, + "loss": 0.8214, + "step": 11919 + }, + { + "epoch": 0.5804299661578166, + "grad_norm": 1.6353179216384888, + "learning_rate": 1.5793748789749324e-05, + "loss": 0.7756, + "step": 11920 + }, + { + "epoch": 0.5804786599469238, + "grad_norm": 1.3813307285308838, + "learning_rate": 1.5790665024197927e-05, + "loss": 0.7774, + "step": 11921 + }, + { + "epoch": 0.5805273537360309, + "grad_norm": 1.658668875694275, + "learning_rate": 1.5787581363354118e-05, + "loss": 0.9114, + "step": 11922 + }, + { + "epoch": 0.5805760475251381, + "grad_norm": 1.4410072565078735, + "learning_rate": 1.5784497807294613e-05, + "loss": 0.7614, + "step": 11923 + }, + { + "epoch": 0.5806247413142454, + "grad_norm": 1.7118074893951416, + "learning_rate": 1.57814143560961e-05, + "loss": 1.0034, + "step": 11924 + }, + { + "epoch": 0.5806734351033526, + "grad_norm": 1.7397514581680298, + "learning_rate": 1.57783310098353e-05, + "loss": 0.7734, + "step": 11925 + }, + { + "epoch": 0.5807221288924598, + "grad_norm": 2.386470317840576, + "learning_rate": 1.5775247768588887e-05, + "loss": 0.7398, + "step": 11926 + }, + { + "epoch": 0.580770822681567, + "grad_norm": 2.640557050704956, + "learning_rate": 1.577216463243358e-05, + "loss": 0.8135, + "step": 11927 + }, + { + "epoch": 0.5808195164706742, + "grad_norm": 1.4761366844177246, + "learning_rate": 1.5769081601446057e-05, + "loss": 0.7395, + "step": 11928 + }, + { + "epoch": 0.5808682102597814, + "grad_norm": 1.3891218900680542, + "learning_rate": 1.576599867570302e-05, + "loss": 0.7618, + "step": 11929 + }, + { + "epoch": 0.5809169040488885, + "grad_norm": 0.09659235924482346, + "learning_rate": 1.5762915855281153e-05, + "loss": 0.611, + "step": 11930 + }, + { + "epoch": 0.5809655978379957, + "grad_norm": 1.4504711627960205, + "learning_rate": 1.5759833140257134e-05, + "loss": 0.8826, + "step": 11931 + }, + { + "epoch": 0.5810142916271029, + "grad_norm": 3.053743839263916, + "learning_rate": 1.5756750530707664e-05, + "loss": 0.8387, + "step": 11932 + }, + { + "epoch": 0.5810629854162102, + "grad_norm": 1.5805026292800903, + "learning_rate": 1.5753668026709402e-05, + "loss": 0.8145, + "step": 11933 + }, + { + "epoch": 0.5811116792053174, + "grad_norm": 1.2671862840652466, + "learning_rate": 1.575058562833904e-05, + "loss": 0.8045, + "step": 11934 + }, + { + "epoch": 0.5811603729944246, + "grad_norm": 2.3190579414367676, + "learning_rate": 1.574750333567324e-05, + "loss": 0.8306, + "step": 11935 + }, + { + "epoch": 0.5812090667835318, + "grad_norm": 1.534942388534546, + "learning_rate": 1.5744421148788695e-05, + "loss": 0.7444, + "step": 11936 + }, + { + "epoch": 0.581257760572639, + "grad_norm": 0.08920088410377502, + "learning_rate": 1.574133906776205e-05, + "loss": 0.5235, + "step": 11937 + }, + { + "epoch": 0.5813064543617461, + "grad_norm": 0.09759102761745453, + "learning_rate": 1.5738257092669986e-05, + "loss": 0.6289, + "step": 11938 + }, + { + "epoch": 0.5813551481508533, + "grad_norm": 1.5369477272033691, + "learning_rate": 1.5735175223589173e-05, + "loss": 0.8165, + "step": 11939 + }, + { + "epoch": 0.5814038419399605, + "grad_norm": 1.7895487546920776, + "learning_rate": 1.5732093460596255e-05, + "loss": 0.8209, + "step": 11940 + }, + { + "epoch": 0.5814525357290677, + "grad_norm": 2.2484333515167236, + "learning_rate": 1.5729011803767912e-05, + "loss": 0.8306, + "step": 11941 + }, + { + "epoch": 0.581501229518175, + "grad_norm": 1.7892154455184937, + "learning_rate": 1.572593025318078e-05, + "loss": 0.9164, + "step": 11942 + }, + { + "epoch": 0.5815499233072822, + "grad_norm": 2.058931350708008, + "learning_rate": 1.5722848808911527e-05, + "loss": 0.927, + "step": 11943 + }, + { + "epoch": 0.5815986170963894, + "grad_norm": 1.413164496421814, + "learning_rate": 1.57197674710368e-05, + "loss": 0.7726, + "step": 11944 + }, + { + "epoch": 0.5816473108854966, + "grad_norm": 1.1896278858184814, + "learning_rate": 1.571668623963325e-05, + "loss": 0.8174, + "step": 11945 + }, + { + "epoch": 0.5816960046746038, + "grad_norm": 1.2801579236984253, + "learning_rate": 1.5713605114777516e-05, + "loss": 0.8486, + "step": 11946 + }, + { + "epoch": 0.5817446984637109, + "grad_norm": 1.5297974348068237, + "learning_rate": 1.5710524096546242e-05, + "loss": 0.816, + "step": 11947 + }, + { + "epoch": 0.5817933922528181, + "grad_norm": 1.6199688911437988, + "learning_rate": 1.570744318501608e-05, + "loss": 0.7659, + "step": 11948 + }, + { + "epoch": 0.5818420860419253, + "grad_norm": 1.4828020334243774, + "learning_rate": 1.5704362380263655e-05, + "loss": 0.7401, + "step": 11949 + }, + { + "epoch": 0.5818907798310325, + "grad_norm": 1.448065996170044, + "learning_rate": 1.5701281682365612e-05, + "loss": 0.7702, + "step": 11950 + }, + { + "epoch": 0.5819394736201398, + "grad_norm": 1.7698086500167847, + "learning_rate": 1.5698201091398573e-05, + "loss": 0.7654, + "step": 11951 + }, + { + "epoch": 0.581988167409247, + "grad_norm": 1.625110387802124, + "learning_rate": 1.5695120607439185e-05, + "loss": 0.7413, + "step": 11952 + }, + { + "epoch": 0.5820368611983542, + "grad_norm": 1.259809970855713, + "learning_rate": 1.5692040230564053e-05, + "loss": 0.7801, + "step": 11953 + }, + { + "epoch": 0.5820855549874614, + "grad_norm": 1.329598069190979, + "learning_rate": 1.5688959960849824e-05, + "loss": 0.891, + "step": 11954 + }, + { + "epoch": 0.5821342487765685, + "grad_norm": 2.00510573387146, + "learning_rate": 1.5685879798373097e-05, + "loss": 0.8048, + "step": 11955 + }, + { + "epoch": 0.5821829425656757, + "grad_norm": 0.0941867008805275, + "learning_rate": 1.5682799743210512e-05, + "loss": 0.6176, + "step": 11956 + }, + { + "epoch": 0.5822316363547829, + "grad_norm": 1.9569073915481567, + "learning_rate": 1.567971979543867e-05, + "loss": 0.8751, + "step": 11957 + }, + { + "epoch": 0.5822803301438901, + "grad_norm": 1.6281672716140747, + "learning_rate": 1.567663995513419e-05, + "loss": 0.9382, + "step": 11958 + }, + { + "epoch": 0.5823290239329973, + "grad_norm": 1.312619686126709, + "learning_rate": 1.5673560222373697e-05, + "loss": 0.8101, + "step": 11959 + }, + { + "epoch": 0.5823777177221046, + "grad_norm": 1.5302438735961914, + "learning_rate": 1.567048059723378e-05, + "loss": 0.8063, + "step": 11960 + }, + { + "epoch": 0.5824264115112118, + "grad_norm": 1.1772584915161133, + "learning_rate": 1.566740107979106e-05, + "loss": 0.8469, + "step": 11961 + }, + { + "epoch": 0.582475105300319, + "grad_norm": 1.5926369428634644, + "learning_rate": 1.5664321670122126e-05, + "loss": 0.778, + "step": 11962 + }, + { + "epoch": 0.5825237990894262, + "grad_norm": 1.3201998472213745, + "learning_rate": 1.566124236830359e-05, + "loss": 0.8299, + "step": 11963 + }, + { + "epoch": 0.5825724928785333, + "grad_norm": 3.531933069229126, + "learning_rate": 1.5658163174412037e-05, + "loss": 0.8129, + "step": 11964 + }, + { + "epoch": 0.5826211866676405, + "grad_norm": 1.7696022987365723, + "learning_rate": 1.5655084088524086e-05, + "loss": 0.831, + "step": 11965 + }, + { + "epoch": 0.5826698804567477, + "grad_norm": 2.222132444381714, + "learning_rate": 1.56520051107163e-05, + "loss": 0.7992, + "step": 11966 + }, + { + "epoch": 0.5827185742458549, + "grad_norm": 1.5597023963928223, + "learning_rate": 1.564892624106529e-05, + "loss": 0.8223, + "step": 11967 + }, + { + "epoch": 0.5827672680349621, + "grad_norm": 2.6878793239593506, + "learning_rate": 1.5645847479647636e-05, + "loss": 0.7742, + "step": 11968 + }, + { + "epoch": 0.5828159618240694, + "grad_norm": 1.7028024196624756, + "learning_rate": 1.564276882653992e-05, + "loss": 0.8985, + "step": 11969 + }, + { + "epoch": 0.5828646556131766, + "grad_norm": 0.0998753011226654, + "learning_rate": 1.5639690281818735e-05, + "loss": 0.6189, + "step": 11970 + }, + { + "epoch": 0.5829133494022838, + "grad_norm": 1.783908486366272, + "learning_rate": 1.5636611845560643e-05, + "loss": 0.7724, + "step": 11971 + }, + { + "epoch": 0.5829620431913909, + "grad_norm": 1.4376164674758911, + "learning_rate": 1.5633533517842234e-05, + "loss": 0.8343, + "step": 11972 + }, + { + "epoch": 0.5830107369804981, + "grad_norm": 2.04245662689209, + "learning_rate": 1.5630455298740073e-05, + "loss": 0.7949, + "step": 11973 + }, + { + "epoch": 0.5830594307696053, + "grad_norm": 2.122079849243164, + "learning_rate": 1.562737718833074e-05, + "loss": 0.8097, + "step": 11974 + }, + { + "epoch": 0.5831081245587125, + "grad_norm": 1.6138991117477417, + "learning_rate": 1.5624299186690794e-05, + "loss": 0.8624, + "step": 11975 + }, + { + "epoch": 0.5831568183478197, + "grad_norm": 1.3937246799468994, + "learning_rate": 1.562122129389681e-05, + "loss": 0.765, + "step": 11976 + }, + { + "epoch": 0.583205512136927, + "grad_norm": 1.51726496219635, + "learning_rate": 1.5618143510025337e-05, + "loss": 0.7845, + "step": 11977 + }, + { + "epoch": 0.5832542059260342, + "grad_norm": 1.432641863822937, + "learning_rate": 1.5615065835152944e-05, + "loss": 0.9158, + "step": 11978 + }, + { + "epoch": 0.5833028997151414, + "grad_norm": 2.0098884105682373, + "learning_rate": 1.5611988269356195e-05, + "loss": 0.8406, + "step": 11979 + }, + { + "epoch": 0.5833515935042485, + "grad_norm": 1.8638781309127808, + "learning_rate": 1.5608910812711633e-05, + "loss": 0.7774, + "step": 11980 + }, + { + "epoch": 0.5834002872933557, + "grad_norm": 1.5131797790527344, + "learning_rate": 1.560583346529582e-05, + "loss": 0.7483, + "step": 11981 + }, + { + "epoch": 0.5834489810824629, + "grad_norm": 1.7365446090698242, + "learning_rate": 1.5602756227185297e-05, + "loss": 0.8518, + "step": 11982 + }, + { + "epoch": 0.5834976748715701, + "grad_norm": 1.340298056602478, + "learning_rate": 1.5599679098456618e-05, + "loss": 0.9144, + "step": 11983 + }, + { + "epoch": 0.5835463686606773, + "grad_norm": 5.516584396362305, + "learning_rate": 1.5596602079186316e-05, + "loss": 0.8284, + "step": 11984 + }, + { + "epoch": 0.5835950624497845, + "grad_norm": 1.4972622394561768, + "learning_rate": 1.5593525169450944e-05, + "loss": 0.8683, + "step": 11985 + }, + { + "epoch": 0.5836437562388918, + "grad_norm": 1.3717784881591797, + "learning_rate": 1.5590448369327026e-05, + "loss": 0.756, + "step": 11986 + }, + { + "epoch": 0.583692450027999, + "grad_norm": 2.7666711807250977, + "learning_rate": 1.558737167889111e-05, + "loss": 0.7778, + "step": 11987 + }, + { + "epoch": 0.5837411438171062, + "grad_norm": 1.9158194065093994, + "learning_rate": 1.5584295098219734e-05, + "loss": 0.8102, + "step": 11988 + }, + { + "epoch": 0.5837898376062133, + "grad_norm": 1.4075403213500977, + "learning_rate": 1.5581218627389415e-05, + "loss": 0.77, + "step": 11989 + }, + { + "epoch": 0.5838385313953205, + "grad_norm": 2.0696816444396973, + "learning_rate": 1.557814226647669e-05, + "loss": 0.8217, + "step": 11990 + }, + { + "epoch": 0.5838872251844277, + "grad_norm": 1.630785346031189, + "learning_rate": 1.557506601555807e-05, + "loss": 0.7603, + "step": 11991 + }, + { + "epoch": 0.5839359189735349, + "grad_norm": 3.854872465133667, + "learning_rate": 1.5571989874710094e-05, + "loss": 0.7605, + "step": 11992 + }, + { + "epoch": 0.5839846127626421, + "grad_norm": 1.2827564477920532, + "learning_rate": 1.556891384400927e-05, + "loss": 0.7945, + "step": 11993 + }, + { + "epoch": 0.5840333065517493, + "grad_norm": 0.09637002646923065, + "learning_rate": 1.5565837923532122e-05, + "loss": 0.6421, + "step": 11994 + }, + { + "epoch": 0.5840820003408566, + "grad_norm": 2.567723512649536, + "learning_rate": 1.5562762113355155e-05, + "loss": 0.8515, + "step": 11995 + }, + { + "epoch": 0.5841306941299638, + "grad_norm": 1.9254587888717651, + "learning_rate": 1.5559686413554893e-05, + "loss": 0.8312, + "step": 11996 + }, + { + "epoch": 0.5841793879190709, + "grad_norm": 3.197915554046631, + "learning_rate": 1.5556610824207825e-05, + "loss": 0.811, + "step": 11997 + }, + { + "epoch": 0.5842280817081781, + "grad_norm": 1.8121026754379272, + "learning_rate": 1.5553535345390473e-05, + "loss": 0.9333, + "step": 11998 + }, + { + "epoch": 0.5842767754972853, + "grad_norm": 1.748781681060791, + "learning_rate": 1.555045997717934e-05, + "loss": 0.8075, + "step": 11999 + }, + { + "epoch": 0.5843254692863925, + "grad_norm": 1.5434985160827637, + "learning_rate": 1.554738471965091e-05, + "loss": 0.741, + "step": 12000 + }, + { + "epoch": 0.5843741630754997, + "grad_norm": 1.3340405225753784, + "learning_rate": 1.55443095728817e-05, + "loss": 0.8296, + "step": 12001 + }, + { + "epoch": 0.5844228568646069, + "grad_norm": 0.09966117888689041, + "learning_rate": 1.5541234536948188e-05, + "loss": 0.627, + "step": 12002 + }, + { + "epoch": 0.5844715506537141, + "grad_norm": 1.46700119972229, + "learning_rate": 1.5538159611926885e-05, + "loss": 0.9238, + "step": 12003 + }, + { + "epoch": 0.5845202444428214, + "grad_norm": 1.3528283834457397, + "learning_rate": 1.5535084797894255e-05, + "loss": 0.7588, + "step": 12004 + }, + { + "epoch": 0.5845689382319286, + "grad_norm": 1.291436791419983, + "learning_rate": 1.553201009492681e-05, + "loss": 0.8614, + "step": 12005 + }, + { + "epoch": 0.5846176320210357, + "grad_norm": 1.5245251655578613, + "learning_rate": 1.552893550310101e-05, + "loss": 0.8113, + "step": 12006 + }, + { + "epoch": 0.5846663258101429, + "grad_norm": 2.147026538848877, + "learning_rate": 1.5525861022493353e-05, + "loss": 0.795, + "step": 12007 + }, + { + "epoch": 0.5847150195992501, + "grad_norm": 0.09314194321632385, + "learning_rate": 1.552278665318031e-05, + "loss": 0.5678, + "step": 12008 + }, + { + "epoch": 0.5847637133883573, + "grad_norm": 1.8030214309692383, + "learning_rate": 1.5519712395238356e-05, + "loss": 0.8688, + "step": 12009 + }, + { + "epoch": 0.5848124071774645, + "grad_norm": 1.4484583139419556, + "learning_rate": 1.5516638248743968e-05, + "loss": 0.7004, + "step": 12010 + }, + { + "epoch": 0.5848611009665717, + "grad_norm": 1.4508434534072876, + "learning_rate": 1.5513564213773608e-05, + "loss": 0.8938, + "step": 12011 + }, + { + "epoch": 0.584909794755679, + "grad_norm": 1.2759554386138916, + "learning_rate": 1.5510490290403758e-05, + "loss": 0.7446, + "step": 12012 + }, + { + "epoch": 0.5849584885447862, + "grad_norm": 1.3610743284225464, + "learning_rate": 1.550741647871086e-05, + "loss": 0.7947, + "step": 12013 + }, + { + "epoch": 0.5850071823338933, + "grad_norm": 1.3619377613067627, + "learning_rate": 1.5504342778771387e-05, + "loss": 0.883, + "step": 12014 + }, + { + "epoch": 0.5850558761230005, + "grad_norm": 1.2681630849838257, + "learning_rate": 1.55012691906618e-05, + "loss": 0.7454, + "step": 12015 + }, + { + "epoch": 0.5851045699121077, + "grad_norm": 1.6356160640716553, + "learning_rate": 1.549819571445856e-05, + "loss": 0.7868, + "step": 12016 + }, + { + "epoch": 0.5851532637012149, + "grad_norm": 2.273369550704956, + "learning_rate": 1.54951223502381e-05, + "loss": 0.8076, + "step": 12017 + }, + { + "epoch": 0.5852019574903221, + "grad_norm": 0.09436429291963577, + "learning_rate": 1.5492049098076882e-05, + "loss": 0.5922, + "step": 12018 + }, + { + "epoch": 0.5852506512794293, + "grad_norm": 1.3502451181411743, + "learning_rate": 1.5488975958051362e-05, + "loss": 0.8601, + "step": 12019 + }, + { + "epoch": 0.5852993450685365, + "grad_norm": 1.7171766757965088, + "learning_rate": 1.5485902930237972e-05, + "loss": 0.7325, + "step": 12020 + }, + { + "epoch": 0.5853480388576437, + "grad_norm": 2.1913440227508545, + "learning_rate": 1.5482830014713158e-05, + "loss": 0.7985, + "step": 12021 + }, + { + "epoch": 0.585396732646751, + "grad_norm": 1.4180880784988403, + "learning_rate": 1.5479757211553363e-05, + "loss": 0.8414, + "step": 12022 + }, + { + "epoch": 0.5854454264358581, + "grad_norm": 1.3719041347503662, + "learning_rate": 1.5476684520835016e-05, + "loss": 0.8166, + "step": 12023 + }, + { + "epoch": 0.5854941202249653, + "grad_norm": 1.8643794059753418, + "learning_rate": 1.547361194263455e-05, + "loss": 0.8516, + "step": 12024 + }, + { + "epoch": 0.5855428140140725, + "grad_norm": 1.3244409561157227, + "learning_rate": 1.547053947702841e-05, + "loss": 0.8341, + "step": 12025 + }, + { + "epoch": 0.5855915078031797, + "grad_norm": 1.9471029043197632, + "learning_rate": 1.5467467124093004e-05, + "loss": 0.8639, + "step": 12026 + }, + { + "epoch": 0.5856402015922869, + "grad_norm": 1.29862380027771, + "learning_rate": 1.5464394883904767e-05, + "loss": 0.7266, + "step": 12027 + }, + { + "epoch": 0.5856888953813941, + "grad_norm": 1.435907006263733, + "learning_rate": 1.5461322756540132e-05, + "loss": 0.821, + "step": 12028 + }, + { + "epoch": 0.5857375891705013, + "grad_norm": 2.093661069869995, + "learning_rate": 1.5458250742075493e-05, + "loss": 0.8352, + "step": 12029 + }, + { + "epoch": 0.5857862829596086, + "grad_norm": 1.2345600128173828, + "learning_rate": 1.545517884058729e-05, + "loss": 0.8717, + "step": 12030 + }, + { + "epoch": 0.5858349767487157, + "grad_norm": 1.9210017919540405, + "learning_rate": 1.545210705215192e-05, + "loss": 0.9172, + "step": 12031 + }, + { + "epoch": 0.5858836705378229, + "grad_norm": 3.0525684356689453, + "learning_rate": 1.5449035376845812e-05, + "loss": 0.9154, + "step": 12032 + }, + { + "epoch": 0.5859323643269301, + "grad_norm": 2.0382792949676514, + "learning_rate": 1.544596381474536e-05, + "loss": 0.7621, + "step": 12033 + }, + { + "epoch": 0.5859810581160373, + "grad_norm": 1.577951431274414, + "learning_rate": 1.5442892365926974e-05, + "loss": 0.786, + "step": 12034 + }, + { + "epoch": 0.5860297519051445, + "grad_norm": 0.15698888897895813, + "learning_rate": 1.543982103046705e-05, + "loss": 0.5724, + "step": 12035 + }, + { + "epoch": 0.5860784456942517, + "grad_norm": 1.5286084413528442, + "learning_rate": 1.5436749808442e-05, + "loss": 0.7321, + "step": 12036 + }, + { + "epoch": 0.5861271394833589, + "grad_norm": 1.865623950958252, + "learning_rate": 1.5433678699928208e-05, + "loss": 0.8415, + "step": 12037 + }, + { + "epoch": 0.5861758332724661, + "grad_norm": 2.4280357360839844, + "learning_rate": 1.5430607705002073e-05, + "loss": 0.7192, + "step": 12038 + }, + { + "epoch": 0.5862245270615732, + "grad_norm": 1.556691288948059, + "learning_rate": 1.542753682374e-05, + "loss": 0.9111, + "step": 12039 + }, + { + "epoch": 0.5862732208506805, + "grad_norm": 1.505401849746704, + "learning_rate": 1.5424466056218357e-05, + "loss": 0.8855, + "step": 12040 + }, + { + "epoch": 0.5863219146397877, + "grad_norm": 1.5568242073059082, + "learning_rate": 1.5421395402513547e-05, + "loss": 0.8322, + "step": 12041 + }, + { + "epoch": 0.5863706084288949, + "grad_norm": 0.10085101425647736, + "learning_rate": 1.5418324862701932e-05, + "loss": 0.6906, + "step": 12042 + }, + { + "epoch": 0.5864193022180021, + "grad_norm": 2.626314640045166, + "learning_rate": 1.541525443685991e-05, + "loss": 0.7732, + "step": 12043 + }, + { + "epoch": 0.5864679960071093, + "grad_norm": 2.0424530506134033, + "learning_rate": 1.5412184125063846e-05, + "loss": 0.8082, + "step": 12044 + }, + { + "epoch": 0.5865166897962165, + "grad_norm": 1.779179573059082, + "learning_rate": 1.5409113927390133e-05, + "loss": 0.7813, + "step": 12045 + }, + { + "epoch": 0.5865653835853237, + "grad_norm": 1.4208929538726807, + "learning_rate": 1.5406043843915116e-05, + "loss": 0.8389, + "step": 12046 + }, + { + "epoch": 0.5866140773744309, + "grad_norm": 2.119708776473999, + "learning_rate": 1.5402973874715177e-05, + "loss": 0.8166, + "step": 12047 + }, + { + "epoch": 0.586662771163538, + "grad_norm": 1.8328042030334473, + "learning_rate": 1.5399904019866696e-05, + "loss": 0.7499, + "step": 12048 + }, + { + "epoch": 0.5867114649526453, + "grad_norm": 1.3758004903793335, + "learning_rate": 1.539683427944601e-05, + "loss": 0.8511, + "step": 12049 + }, + { + "epoch": 0.5867601587417525, + "grad_norm": 1.9767125844955444, + "learning_rate": 1.5393764653529493e-05, + "loss": 0.8151, + "step": 12050 + }, + { + "epoch": 0.5868088525308597, + "grad_norm": 1.4845852851867676, + "learning_rate": 1.5390695142193506e-05, + "loss": 0.87, + "step": 12051 + }, + { + "epoch": 0.5868575463199669, + "grad_norm": 2.0264289379119873, + "learning_rate": 1.5387625745514395e-05, + "loss": 0.8471, + "step": 12052 + }, + { + "epoch": 0.5869062401090741, + "grad_norm": 2.0625357627868652, + "learning_rate": 1.5384556463568506e-05, + "loss": 0.9026, + "step": 12053 + }, + { + "epoch": 0.5869549338981813, + "grad_norm": 1.5461039543151855, + "learning_rate": 1.538148729643221e-05, + "loss": 0.8438, + "step": 12054 + }, + { + "epoch": 0.5870036276872885, + "grad_norm": 1.325252652168274, + "learning_rate": 1.537841824418183e-05, + "loss": 0.892, + "step": 12055 + }, + { + "epoch": 0.5870523214763956, + "grad_norm": 1.450052261352539, + "learning_rate": 1.5375349306893714e-05, + "loss": 0.9124, + "step": 12056 + }, + { + "epoch": 0.5871010152655028, + "grad_norm": 1.7684983015060425, + "learning_rate": 1.5372280484644213e-05, + "loss": 0.8091, + "step": 12057 + }, + { + "epoch": 0.5871497090546101, + "grad_norm": 1.8316401243209839, + "learning_rate": 1.536921177750965e-05, + "loss": 0.8179, + "step": 12058 + }, + { + "epoch": 0.5871984028437173, + "grad_norm": 1.645141839981079, + "learning_rate": 1.5366143185566375e-05, + "loss": 0.8915, + "step": 12059 + }, + { + "epoch": 0.5872470966328245, + "grad_norm": 0.09319773316383362, + "learning_rate": 1.53630747088907e-05, + "loss": 0.6426, + "step": 12060 + }, + { + "epoch": 0.5872957904219317, + "grad_norm": 1.5751436948776245, + "learning_rate": 1.536000634755898e-05, + "loss": 0.8329, + "step": 12061 + }, + { + "epoch": 0.5873444842110389, + "grad_norm": 1.5541046857833862, + "learning_rate": 1.535693810164751e-05, + "loss": 0.8406, + "step": 12062 + }, + { + "epoch": 0.5873931780001461, + "grad_norm": 1.4752044677734375, + "learning_rate": 1.5353869971232637e-05, + "loss": 0.8244, + "step": 12063 + }, + { + "epoch": 0.5874418717892533, + "grad_norm": 1.4800752401351929, + "learning_rate": 1.5350801956390666e-05, + "loss": 0.8632, + "step": 12064 + }, + { + "epoch": 0.5874905655783604, + "grad_norm": 1.4629565477371216, + "learning_rate": 1.5347734057197924e-05, + "loss": 0.9089, + "step": 12065 + }, + { + "epoch": 0.5875392593674676, + "grad_norm": 1.8959236145019531, + "learning_rate": 1.5344666273730717e-05, + "loss": 0.8775, + "step": 12066 + }, + { + "epoch": 0.5875879531565749, + "grad_norm": 1.9457664489746094, + "learning_rate": 1.5341598606065362e-05, + "loss": 0.7687, + "step": 12067 + }, + { + "epoch": 0.5876366469456821, + "grad_norm": 2.091620445251465, + "learning_rate": 1.5338531054278173e-05, + "loss": 0.8191, + "step": 12068 + }, + { + "epoch": 0.5876853407347893, + "grad_norm": 2.0946297645568848, + "learning_rate": 1.533546361844544e-05, + "loss": 0.7262, + "step": 12069 + }, + { + "epoch": 0.5877340345238965, + "grad_norm": 2.5023486614227295, + "learning_rate": 1.5332396298643487e-05, + "loss": 0.6807, + "step": 12070 + }, + { + "epoch": 0.5877827283130037, + "grad_norm": 1.8696213960647583, + "learning_rate": 1.532932909494859e-05, + "loss": 0.8792, + "step": 12071 + }, + { + "epoch": 0.5878314221021109, + "grad_norm": 1.3941140174865723, + "learning_rate": 1.5326262007437063e-05, + "loss": 0.8328, + "step": 12072 + }, + { + "epoch": 0.587880115891218, + "grad_norm": 1.6984928846359253, + "learning_rate": 1.5323195036185192e-05, + "loss": 0.7998, + "step": 12073 + }, + { + "epoch": 0.5879288096803252, + "grad_norm": 1.4807299375534058, + "learning_rate": 1.5320128181269275e-05, + "loss": 0.8455, + "step": 12074 + }, + { + "epoch": 0.5879775034694324, + "grad_norm": 1.7821376323699951, + "learning_rate": 1.531706144276559e-05, + "loss": 0.8034, + "step": 12075 + }, + { + "epoch": 0.5880261972585397, + "grad_norm": 1.277234435081482, + "learning_rate": 1.531399482075043e-05, + "loss": 0.9159, + "step": 12076 + }, + { + "epoch": 0.5880748910476469, + "grad_norm": 1.6256303787231445, + "learning_rate": 1.5310928315300086e-05, + "loss": 0.7827, + "step": 12077 + }, + { + "epoch": 0.5881235848367541, + "grad_norm": 1.6842831373214722, + "learning_rate": 1.5307861926490817e-05, + "loss": 0.8063, + "step": 12078 + }, + { + "epoch": 0.5881722786258613, + "grad_norm": 1.3096829652786255, + "learning_rate": 1.5304795654398918e-05, + "loss": 0.8047, + "step": 12079 + }, + { + "epoch": 0.5882209724149685, + "grad_norm": 1.6541876792907715, + "learning_rate": 1.5301729499100644e-05, + "loss": 0.8084, + "step": 12080 + }, + { + "epoch": 0.5882696662040757, + "grad_norm": 2.049720525741577, + "learning_rate": 1.5298663460672287e-05, + "loss": 0.774, + "step": 12081 + }, + { + "epoch": 0.5883183599931828, + "grad_norm": 1.9709733724594116, + "learning_rate": 1.52955975391901e-05, + "loss": 0.9177, + "step": 12082 + }, + { + "epoch": 0.58836705378229, + "grad_norm": 2.6675546169281006, + "learning_rate": 1.529253173473036e-05, + "loss": 0.8781, + "step": 12083 + }, + { + "epoch": 0.5884157475713973, + "grad_norm": 1.68222177028656, + "learning_rate": 1.528946604736931e-05, + "loss": 0.89, + "step": 12084 + }, + { + "epoch": 0.5884644413605045, + "grad_norm": 1.9972667694091797, + "learning_rate": 1.5286400477183238e-05, + "loss": 0.7977, + "step": 12085 + }, + { + "epoch": 0.5885131351496117, + "grad_norm": 7.05013370513916, + "learning_rate": 1.528333502424837e-05, + "loss": 0.8385, + "step": 12086 + }, + { + "epoch": 0.5885618289387189, + "grad_norm": 1.3427759408950806, + "learning_rate": 1.5280269688640976e-05, + "loss": 0.8808, + "step": 12087 + }, + { + "epoch": 0.5886105227278261, + "grad_norm": 1.8461341857910156, + "learning_rate": 1.5277204470437306e-05, + "loss": 0.8569, + "step": 12088 + }, + { + "epoch": 0.5886592165169333, + "grad_norm": 1.317598581314087, + "learning_rate": 1.52741393697136e-05, + "loss": 0.7448, + "step": 12089 + }, + { + "epoch": 0.5887079103060404, + "grad_norm": 1.8358057737350464, + "learning_rate": 1.527107438654612e-05, + "loss": 0.8323, + "step": 12090 + }, + { + "epoch": 0.5887566040951476, + "grad_norm": 2.100451946258545, + "learning_rate": 1.5268009521011085e-05, + "loss": 0.7381, + "step": 12091 + }, + { + "epoch": 0.5888052978842548, + "grad_norm": 0.08934091031551361, + "learning_rate": 1.5264944773184756e-05, + "loss": 0.6188, + "step": 12092 + }, + { + "epoch": 0.588853991673362, + "grad_norm": 1.7311781644821167, + "learning_rate": 1.5261880143143346e-05, + "loss": 0.7623, + "step": 12093 + }, + { + "epoch": 0.5889026854624693, + "grad_norm": 3.2276532649993896, + "learning_rate": 1.5258815630963104e-05, + "loss": 0.9216, + "step": 12094 + }, + { + "epoch": 0.5889513792515765, + "grad_norm": 2.2872314453125, + "learning_rate": 1.5255751236720254e-05, + "loss": 0.8059, + "step": 12095 + }, + { + "epoch": 0.5890000730406837, + "grad_norm": 2.173434019088745, + "learning_rate": 1.525268696049102e-05, + "loss": 0.8178, + "step": 12096 + }, + { + "epoch": 0.5890487668297909, + "grad_norm": 2.5312609672546387, + "learning_rate": 1.5249622802351643e-05, + "loss": 0.8977, + "step": 12097 + }, + { + "epoch": 0.589097460618898, + "grad_norm": 1.336513876914978, + "learning_rate": 1.524655876237832e-05, + "loss": 0.7703, + "step": 12098 + }, + { + "epoch": 0.5891461544080052, + "grad_norm": 1.537858009338379, + "learning_rate": 1.5243494840647292e-05, + "loss": 0.7544, + "step": 12099 + }, + { + "epoch": 0.5891948481971124, + "grad_norm": 1.5756515264511108, + "learning_rate": 1.5240431037234756e-05, + "loss": 0.7838, + "step": 12100 + }, + { + "epoch": 0.5892435419862196, + "grad_norm": 1.437707543373108, + "learning_rate": 1.5237367352216934e-05, + "loss": 0.7303, + "step": 12101 + }, + { + "epoch": 0.5892922357753269, + "grad_norm": 2.11548113822937, + "learning_rate": 1.523430378567003e-05, + "loss": 0.8873, + "step": 12102 + }, + { + "epoch": 0.5893409295644341, + "grad_norm": 1.490708827972412, + "learning_rate": 1.5231240337670264e-05, + "loss": 0.8049, + "step": 12103 + }, + { + "epoch": 0.5893896233535413, + "grad_norm": 1.5297521352767944, + "learning_rate": 1.5228177008293818e-05, + "loss": 0.8763, + "step": 12104 + }, + { + "epoch": 0.5894383171426485, + "grad_norm": 2.030768394470215, + "learning_rate": 1.5225113797616913e-05, + "loss": 0.7416, + "step": 12105 + }, + { + "epoch": 0.5894870109317557, + "grad_norm": 1.918548822402954, + "learning_rate": 1.5222050705715732e-05, + "loss": 0.8109, + "step": 12106 + }, + { + "epoch": 0.5895357047208628, + "grad_norm": 1.5946550369262695, + "learning_rate": 1.5218987732666473e-05, + "loss": 0.7985, + "step": 12107 + }, + { + "epoch": 0.58958439850997, + "grad_norm": 1.8289707899093628, + "learning_rate": 1.5215924878545338e-05, + "loss": 0.7537, + "step": 12108 + }, + { + "epoch": 0.5896330922990772, + "grad_norm": 2.074456214904785, + "learning_rate": 1.52128621434285e-05, + "loss": 0.8763, + "step": 12109 + }, + { + "epoch": 0.5896817860881844, + "grad_norm": 1.4273827075958252, + "learning_rate": 1.5209799527392158e-05, + "loss": 0.8323, + "step": 12110 + }, + { + "epoch": 0.5897304798772917, + "grad_norm": 1.6302286386489868, + "learning_rate": 1.5206737030512484e-05, + "loss": 0.8205, + "step": 12111 + }, + { + "epoch": 0.5897791736663989, + "grad_norm": 1.9423468112945557, + "learning_rate": 1.5203674652865669e-05, + "loss": 0.8608, + "step": 12112 + }, + { + "epoch": 0.5898278674555061, + "grad_norm": 1.5662115812301636, + "learning_rate": 1.520061239452788e-05, + "loss": 0.8798, + "step": 12113 + }, + { + "epoch": 0.5898765612446133, + "grad_norm": 1.7878620624542236, + "learning_rate": 1.5197550255575304e-05, + "loss": 0.8771, + "step": 12114 + }, + { + "epoch": 0.5899252550337204, + "grad_norm": 1.8719900846481323, + "learning_rate": 1.5194488236084093e-05, + "loss": 0.8131, + "step": 12115 + }, + { + "epoch": 0.5899739488228276, + "grad_norm": 1.6013797521591187, + "learning_rate": 1.5191426336130427e-05, + "loss": 0.9019, + "step": 12116 + }, + { + "epoch": 0.5900226426119348, + "grad_norm": 1.798992395401001, + "learning_rate": 1.5188364555790472e-05, + "loss": 0.7636, + "step": 12117 + }, + { + "epoch": 0.590071336401042, + "grad_norm": 2.286541223526001, + "learning_rate": 1.5185302895140384e-05, + "loss": 0.7754, + "step": 12118 + }, + { + "epoch": 0.5901200301901492, + "grad_norm": 2.1594648361206055, + "learning_rate": 1.5182241354256333e-05, + "loss": 0.7772, + "step": 12119 + }, + { + "epoch": 0.5901687239792565, + "grad_norm": 1.415276050567627, + "learning_rate": 1.5179179933214458e-05, + "loss": 0.7732, + "step": 12120 + }, + { + "epoch": 0.5902174177683637, + "grad_norm": 1.7246720790863037, + "learning_rate": 1.5176118632090933e-05, + "loss": 0.7969, + "step": 12121 + }, + { + "epoch": 0.5902661115574709, + "grad_norm": 1.6097861528396606, + "learning_rate": 1.5173057450961887e-05, + "loss": 0.802, + "step": 12122 + }, + { + "epoch": 0.5903148053465781, + "grad_norm": 3.1040873527526855, + "learning_rate": 1.516999638990348e-05, + "loss": 0.8642, + "step": 12123 + }, + { + "epoch": 0.5903634991356852, + "grad_norm": 1.3975834846496582, + "learning_rate": 1.5166935448991851e-05, + "loss": 0.8349, + "step": 12124 + }, + { + "epoch": 0.5904121929247924, + "grad_norm": 2.140584945678711, + "learning_rate": 1.5163874628303153e-05, + "loss": 0.8538, + "step": 12125 + }, + { + "epoch": 0.5904608867138996, + "grad_norm": 1.557077407836914, + "learning_rate": 1.5160813927913502e-05, + "loss": 0.7384, + "step": 12126 + }, + { + "epoch": 0.5905095805030068, + "grad_norm": 1.540570855140686, + "learning_rate": 1.515775334789905e-05, + "loss": 0.865, + "step": 12127 + }, + { + "epoch": 0.590558274292114, + "grad_norm": 1.986472487449646, + "learning_rate": 1.5154692888335931e-05, + "loss": 0.8072, + "step": 12128 + }, + { + "epoch": 0.5906069680812213, + "grad_norm": 1.3651257753372192, + "learning_rate": 1.515163254930026e-05, + "loss": 0.8193, + "step": 12129 + }, + { + "epoch": 0.5906556618703285, + "grad_norm": 1.3871058225631714, + "learning_rate": 1.5148572330868177e-05, + "loss": 0.8557, + "step": 12130 + }, + { + "epoch": 0.5907043556594357, + "grad_norm": 1.3924294710159302, + "learning_rate": 1.5145512233115795e-05, + "loss": 0.925, + "step": 12131 + }, + { + "epoch": 0.5907530494485428, + "grad_norm": 1.8582432270050049, + "learning_rate": 1.5142452256119244e-05, + "loss": 0.7784, + "step": 12132 + }, + { + "epoch": 0.59080174323765, + "grad_norm": 1.9576327800750732, + "learning_rate": 1.5139392399954628e-05, + "loss": 0.8525, + "step": 12133 + }, + { + "epoch": 0.5908504370267572, + "grad_norm": 0.09284351021051407, + "learning_rate": 1.513633266469808e-05, + "loss": 0.5919, + "step": 12134 + }, + { + "epoch": 0.5908991308158644, + "grad_norm": 1.9502073526382446, + "learning_rate": 1.5133273050425695e-05, + "loss": 0.8771, + "step": 12135 + }, + { + "epoch": 0.5909478246049716, + "grad_norm": 1.3197300434112549, + "learning_rate": 1.513021355721358e-05, + "loss": 0.8255, + "step": 12136 + }, + { + "epoch": 0.5909965183940789, + "grad_norm": 1.7731186151504517, + "learning_rate": 1.512715418513786e-05, + "loss": 0.7981, + "step": 12137 + }, + { + "epoch": 0.5910452121831861, + "grad_norm": 1.8427481651306152, + "learning_rate": 1.5124094934274613e-05, + "loss": 0.8635, + "step": 12138 + }, + { + "epoch": 0.5910939059722933, + "grad_norm": 1.5107897520065308, + "learning_rate": 1.5121035804699955e-05, + "loss": 0.8905, + "step": 12139 + }, + { + "epoch": 0.5911425997614004, + "grad_norm": 1.6194419860839844, + "learning_rate": 1.511797679648997e-05, + "loss": 0.7797, + "step": 12140 + }, + { + "epoch": 0.5911912935505076, + "grad_norm": 1.280770182609558, + "learning_rate": 1.5114917909720768e-05, + "loss": 0.8633, + "step": 12141 + }, + { + "epoch": 0.5912399873396148, + "grad_norm": 1.6647838354110718, + "learning_rate": 1.511185914446842e-05, + "loss": 0.891, + "step": 12142 + }, + { + "epoch": 0.591288681128722, + "grad_norm": 1.7609518766403198, + "learning_rate": 1.5108800500809028e-05, + "loss": 0.7893, + "step": 12143 + }, + { + "epoch": 0.5913373749178292, + "grad_norm": 1.4391553401947021, + "learning_rate": 1.5105741978818662e-05, + "loss": 0.7977, + "step": 12144 + }, + { + "epoch": 0.5913860687069364, + "grad_norm": 2.3453972339630127, + "learning_rate": 1.5102683578573412e-05, + "loss": 0.9618, + "step": 12145 + }, + { + "epoch": 0.5914347624960437, + "grad_norm": 1.5074111223220825, + "learning_rate": 1.5099625300149354e-05, + "loss": 0.8156, + "step": 12146 + }, + { + "epoch": 0.5914834562851509, + "grad_norm": 2.598938226699829, + "learning_rate": 1.5096567143622563e-05, + "loss": 0.734, + "step": 12147 + }, + { + "epoch": 0.5915321500742581, + "grad_norm": 1.729411244392395, + "learning_rate": 1.509350910906912e-05, + "loss": 0.817, + "step": 12148 + }, + { + "epoch": 0.5915808438633652, + "grad_norm": 1.765565276145935, + "learning_rate": 1.5090451196565076e-05, + "loss": 0.7537, + "step": 12149 + }, + { + "epoch": 0.5916295376524724, + "grad_norm": 1.3828070163726807, + "learning_rate": 1.5087393406186515e-05, + "loss": 0.8194, + "step": 12150 + }, + { + "epoch": 0.5916782314415796, + "grad_norm": 83.00824737548828, + "learning_rate": 1.5084335738009481e-05, + "loss": 0.8302, + "step": 12151 + }, + { + "epoch": 0.5917269252306868, + "grad_norm": 1.5335215330123901, + "learning_rate": 1.508127819211005e-05, + "loss": 0.7631, + "step": 12152 + }, + { + "epoch": 0.591775619019794, + "grad_norm": 1.5131193399429321, + "learning_rate": 1.5078220768564266e-05, + "loss": 0.8474, + "step": 12153 + }, + { + "epoch": 0.5918243128089012, + "grad_norm": 1.6725012063980103, + "learning_rate": 1.50751634674482e-05, + "loss": 0.7279, + "step": 12154 + }, + { + "epoch": 0.5918730065980085, + "grad_norm": 1.9833190441131592, + "learning_rate": 1.5072106288837882e-05, + "loss": 0.8417, + "step": 12155 + }, + { + "epoch": 0.5919217003871157, + "grad_norm": 1.564141035079956, + "learning_rate": 1.506904923280937e-05, + "loss": 0.8577, + "step": 12156 + }, + { + "epoch": 0.5919703941762228, + "grad_norm": 1.9367915391921997, + "learning_rate": 1.5065992299438716e-05, + "loss": 0.8873, + "step": 12157 + }, + { + "epoch": 0.59201908796533, + "grad_norm": 1.191827416419983, + "learning_rate": 1.5062935488801945e-05, + "loss": 0.7962, + "step": 12158 + }, + { + "epoch": 0.5920677817544372, + "grad_norm": 2.089613199234009, + "learning_rate": 1.5059878800975107e-05, + "loss": 0.8533, + "step": 12159 + }, + { + "epoch": 0.5921164755435444, + "grad_norm": 2.2669482231140137, + "learning_rate": 1.5056822236034236e-05, + "loss": 0.8593, + "step": 12160 + }, + { + "epoch": 0.5921651693326516, + "grad_norm": 2.2496068477630615, + "learning_rate": 1.5053765794055362e-05, + "loss": 0.8396, + "step": 12161 + }, + { + "epoch": 0.5922138631217588, + "grad_norm": 1.478246808052063, + "learning_rate": 1.5050709475114508e-05, + "loss": 0.794, + "step": 12162 + }, + { + "epoch": 0.592262556910866, + "grad_norm": 1.5155055522918701, + "learning_rate": 1.504765327928772e-05, + "loss": 0.8367, + "step": 12163 + }, + { + "epoch": 0.5923112506999733, + "grad_norm": 1.5009421110153198, + "learning_rate": 1.5044597206650997e-05, + "loss": 0.798, + "step": 12164 + }, + { + "epoch": 0.5923599444890805, + "grad_norm": 1.4505956172943115, + "learning_rate": 1.5041541257280371e-05, + "loss": 0.8546, + "step": 12165 + }, + { + "epoch": 0.5924086382781876, + "grad_norm": 2.0204079151153564, + "learning_rate": 1.5038485431251867e-05, + "loss": 0.7915, + "step": 12166 + }, + { + "epoch": 0.5924573320672948, + "grad_norm": 1.8514230251312256, + "learning_rate": 1.5035429728641481e-05, + "loss": 0.8111, + "step": 12167 + }, + { + "epoch": 0.592506025856402, + "grad_norm": 2.3103208541870117, + "learning_rate": 1.503237414952524e-05, + "loss": 0.7489, + "step": 12168 + }, + { + "epoch": 0.5925547196455092, + "grad_norm": 1.7655092477798462, + "learning_rate": 1.5029318693979138e-05, + "loss": 0.8086, + "step": 12169 + }, + { + "epoch": 0.5926034134346164, + "grad_norm": 2.596191167831421, + "learning_rate": 1.5026263362079196e-05, + "loss": 0.8344, + "step": 12170 + }, + { + "epoch": 0.5926521072237236, + "grad_norm": 1.668359398841858, + "learning_rate": 1.5023208153901396e-05, + "loss": 0.8284, + "step": 12171 + }, + { + "epoch": 0.5927008010128308, + "grad_norm": 1.7701219320297241, + "learning_rate": 1.502015306952176e-05, + "loss": 0.7963, + "step": 12172 + }, + { + "epoch": 0.5927494948019381, + "grad_norm": 1.6715253591537476, + "learning_rate": 1.5017098109016258e-05, + "loss": 0.8294, + "step": 12173 + }, + { + "epoch": 0.5927981885910452, + "grad_norm": 1.622143268585205, + "learning_rate": 1.5014043272460899e-05, + "loss": 0.8774, + "step": 12174 + }, + { + "epoch": 0.5928468823801524, + "grad_norm": 2.153839349746704, + "learning_rate": 1.5010988559931665e-05, + "loss": 0.7948, + "step": 12175 + }, + { + "epoch": 0.5928955761692596, + "grad_norm": 2.0287840366363525, + "learning_rate": 1.5007933971504547e-05, + "loss": 0.8103, + "step": 12176 + }, + { + "epoch": 0.5929442699583668, + "grad_norm": 2.0653867721557617, + "learning_rate": 1.5004879507255532e-05, + "loss": 0.7685, + "step": 12177 + }, + { + "epoch": 0.592992963747474, + "grad_norm": 3.2898240089416504, + "learning_rate": 1.5001825167260588e-05, + "loss": 0.8259, + "step": 12178 + }, + { + "epoch": 0.5930416575365812, + "grad_norm": 1.5210965871810913, + "learning_rate": 1.4998770951595708e-05, + "loss": 0.7615, + "step": 12179 + }, + { + "epoch": 0.5930903513256884, + "grad_norm": 1.9368540048599243, + "learning_rate": 1.4995716860336846e-05, + "loss": 0.8308, + "step": 12180 + }, + { + "epoch": 0.5931390451147956, + "grad_norm": 1.8009495735168457, + "learning_rate": 1.4992662893559988e-05, + "loss": 0.8665, + "step": 12181 + }, + { + "epoch": 0.5931877389039029, + "grad_norm": 1.9997683763504028, + "learning_rate": 1.4989609051341094e-05, + "loss": 0.855, + "step": 12182 + }, + { + "epoch": 0.59323643269301, + "grad_norm": 1.7666418552398682, + "learning_rate": 1.4986555333756142e-05, + "loss": 0.961, + "step": 12183 + }, + { + "epoch": 0.5932851264821172, + "grad_norm": 2.240506649017334, + "learning_rate": 1.498350174088107e-05, + "loss": 0.8494, + "step": 12184 + }, + { + "epoch": 0.5933338202712244, + "grad_norm": 2.1081860065460205, + "learning_rate": 1.4980448272791854e-05, + "loss": 0.8506, + "step": 12185 + }, + { + "epoch": 0.5933825140603316, + "grad_norm": 1.6942331790924072, + "learning_rate": 1.4977394929564453e-05, + "loss": 0.8099, + "step": 12186 + }, + { + "epoch": 0.5934312078494388, + "grad_norm": 2.437922477722168, + "learning_rate": 1.49743417112748e-05, + "loss": 0.7917, + "step": 12187 + }, + { + "epoch": 0.593479901638546, + "grad_norm": 0.09321364015340805, + "learning_rate": 1.4971288617998863e-05, + "loss": 0.6126, + "step": 12188 + }, + { + "epoch": 0.5935285954276532, + "grad_norm": 2.5355353355407715, + "learning_rate": 1.4968235649812575e-05, + "loss": 0.8378, + "step": 12189 + }, + { + "epoch": 0.5935772892167605, + "grad_norm": 1.6083303689956665, + "learning_rate": 1.4965182806791885e-05, + "loss": 0.8156, + "step": 12190 + }, + { + "epoch": 0.5936259830058676, + "grad_norm": 3.2354624271392822, + "learning_rate": 1.496213008901273e-05, + "loss": 0.8615, + "step": 12191 + }, + { + "epoch": 0.5936746767949748, + "grad_norm": 3.102444648742676, + "learning_rate": 1.4959077496551055e-05, + "loss": 0.8402, + "step": 12192 + }, + { + "epoch": 0.593723370584082, + "grad_norm": 3.49312686920166, + "learning_rate": 1.4956025029482781e-05, + "loss": 0.863, + "step": 12193 + }, + { + "epoch": 0.5937720643731892, + "grad_norm": 1.6397625207901, + "learning_rate": 1.495297268788385e-05, + "loss": 0.7307, + "step": 12194 + }, + { + "epoch": 0.5938207581622964, + "grad_norm": 2.0152816772460938, + "learning_rate": 1.4949920471830174e-05, + "loss": 0.8568, + "step": 12195 + }, + { + "epoch": 0.5938694519514036, + "grad_norm": 1.6415574550628662, + "learning_rate": 1.494686838139769e-05, + "loss": 0.9365, + "step": 12196 + }, + { + "epoch": 0.5939181457405108, + "grad_norm": 1.6368768215179443, + "learning_rate": 1.494381641666232e-05, + "loss": 0.9055, + "step": 12197 + }, + { + "epoch": 0.593966839529618, + "grad_norm": 1.6018054485321045, + "learning_rate": 1.4940764577699973e-05, + "loss": 0.7858, + "step": 12198 + }, + { + "epoch": 0.5940155333187251, + "grad_norm": 1.6251673698425293, + "learning_rate": 1.4937712864586576e-05, + "loss": 0.8045, + "step": 12199 + }, + { + "epoch": 0.5940642271078324, + "grad_norm": 3.5733895301818848, + "learning_rate": 1.4934661277398025e-05, + "loss": 0.8727, + "step": 12200 + }, + { + "epoch": 0.5941129208969396, + "grad_norm": 2.1809020042419434, + "learning_rate": 1.4931609816210242e-05, + "loss": 0.8213, + "step": 12201 + }, + { + "epoch": 0.5941616146860468, + "grad_norm": 1.4167230129241943, + "learning_rate": 1.4928558481099122e-05, + "loss": 0.7915, + "step": 12202 + }, + { + "epoch": 0.594210308475154, + "grad_norm": 3.2135565280914307, + "learning_rate": 1.4925507272140575e-05, + "loss": 0.9134, + "step": 12203 + }, + { + "epoch": 0.5942590022642612, + "grad_norm": 2.258479356765747, + "learning_rate": 1.4922456189410495e-05, + "loss": 0.8002, + "step": 12204 + }, + { + "epoch": 0.5943076960533684, + "grad_norm": 2.063131093978882, + "learning_rate": 1.4919405232984778e-05, + "loss": 0.8417, + "step": 12205 + }, + { + "epoch": 0.5943563898424756, + "grad_norm": 2.8066060543060303, + "learning_rate": 1.491635440293933e-05, + "loss": 0.8002, + "step": 12206 + }, + { + "epoch": 0.5944050836315828, + "grad_norm": 1.899977207183838, + "learning_rate": 1.4913303699350019e-05, + "loss": 0.7573, + "step": 12207 + }, + { + "epoch": 0.5944537774206899, + "grad_norm": 1.9115304946899414, + "learning_rate": 1.491025312229275e-05, + "loss": 0.8828, + "step": 12208 + }, + { + "epoch": 0.5945024712097972, + "grad_norm": 2.0077645778656006, + "learning_rate": 1.4907202671843393e-05, + "loss": 0.7504, + "step": 12209 + }, + { + "epoch": 0.5945511649989044, + "grad_norm": 2.0434751510620117, + "learning_rate": 1.4904152348077836e-05, + "loss": 0.8808, + "step": 12210 + }, + { + "epoch": 0.5945998587880116, + "grad_norm": 1.71919846534729, + "learning_rate": 1.490110215107195e-05, + "loss": 0.7614, + "step": 12211 + }, + { + "epoch": 0.5946485525771188, + "grad_norm": 1.5369503498077393, + "learning_rate": 1.4898052080901622e-05, + "loss": 0.7872, + "step": 12212 + }, + { + "epoch": 0.594697246366226, + "grad_norm": 2.3302996158599854, + "learning_rate": 1.4895002137642704e-05, + "loss": 0.7838, + "step": 12213 + }, + { + "epoch": 0.5947459401553332, + "grad_norm": 1.579803228378296, + "learning_rate": 1.4891952321371084e-05, + "loss": 0.8602, + "step": 12214 + }, + { + "epoch": 0.5947946339444404, + "grad_norm": 2.530306816101074, + "learning_rate": 1.4888902632162606e-05, + "loss": 0.8677, + "step": 12215 + }, + { + "epoch": 0.5948433277335475, + "grad_norm": 1.4874348640441895, + "learning_rate": 1.4885853070093141e-05, + "loss": 0.9574, + "step": 12216 + }, + { + "epoch": 0.5948920215226547, + "grad_norm": 1.5320616960525513, + "learning_rate": 1.4882803635238556e-05, + "loss": 0.7744, + "step": 12217 + }, + { + "epoch": 0.594940715311762, + "grad_norm": 2.6882736682891846, + "learning_rate": 1.4879754327674687e-05, + "loss": 0.8896, + "step": 12218 + }, + { + "epoch": 0.5949894091008692, + "grad_norm": 1.9880503416061401, + "learning_rate": 1.48767051474774e-05, + "loss": 0.8574, + "step": 12219 + }, + { + "epoch": 0.5950381028899764, + "grad_norm": 1.6617804765701294, + "learning_rate": 1.487365609472254e-05, + "loss": 0.8362, + "step": 12220 + }, + { + "epoch": 0.5950867966790836, + "grad_norm": 2.1644816398620605, + "learning_rate": 1.4870607169485959e-05, + "loss": 0.8554, + "step": 12221 + }, + { + "epoch": 0.5951354904681908, + "grad_norm": 3.589402914047241, + "learning_rate": 1.4867558371843483e-05, + "loss": 0.854, + "step": 12222 + }, + { + "epoch": 0.595184184257298, + "grad_norm": 3.505842447280884, + "learning_rate": 1.4864509701870966e-05, + "loss": 0.8686, + "step": 12223 + }, + { + "epoch": 0.5952328780464052, + "grad_norm": 1.9930484294891357, + "learning_rate": 1.4861461159644233e-05, + "loss": 0.7904, + "step": 12224 + }, + { + "epoch": 0.5952815718355123, + "grad_norm": 1.6280146837234497, + "learning_rate": 1.4858412745239119e-05, + "loss": 0.8594, + "step": 12225 + }, + { + "epoch": 0.5953302656246195, + "grad_norm": 2.158982038497925, + "learning_rate": 1.4855364458731465e-05, + "loss": 0.7426, + "step": 12226 + }, + { + "epoch": 0.5953789594137268, + "grad_norm": 2.737079620361328, + "learning_rate": 1.4852316300197083e-05, + "loss": 0.803, + "step": 12227 + }, + { + "epoch": 0.595427653202834, + "grad_norm": 1.6948322057724, + "learning_rate": 1.4849268269711812e-05, + "loss": 0.832, + "step": 12228 + }, + { + "epoch": 0.5954763469919412, + "grad_norm": 0.08968853205442429, + "learning_rate": 1.4846220367351452e-05, + "loss": 0.6208, + "step": 12229 + }, + { + "epoch": 0.5955250407810484, + "grad_norm": 1.7231534719467163, + "learning_rate": 1.4843172593191839e-05, + "loss": 0.804, + "step": 12230 + }, + { + "epoch": 0.5955737345701556, + "grad_norm": 0.09669410437345505, + "learning_rate": 1.4840124947308771e-05, + "loss": 0.6149, + "step": 12231 + }, + { + "epoch": 0.5956224283592628, + "grad_norm": 4.176159858703613, + "learning_rate": 1.483707742977807e-05, + "loss": 0.8538, + "step": 12232 + }, + { + "epoch": 0.5956711221483699, + "grad_norm": 2.0198190212249756, + "learning_rate": 1.4834030040675531e-05, + "loss": 0.8713, + "step": 12233 + }, + { + "epoch": 0.5957198159374771, + "grad_norm": 2.1713931560516357, + "learning_rate": 1.483098278007697e-05, + "loss": 0.8715, + "step": 12234 + }, + { + "epoch": 0.5957685097265843, + "grad_norm": 2.217398166656494, + "learning_rate": 1.4827935648058189e-05, + "loss": 0.8141, + "step": 12235 + }, + { + "epoch": 0.5958172035156916, + "grad_norm": 1.5609126091003418, + "learning_rate": 1.4824888644694975e-05, + "loss": 0.8075, + "step": 12236 + }, + { + "epoch": 0.5958658973047988, + "grad_norm": 2.0019264221191406, + "learning_rate": 1.4821841770063133e-05, + "loss": 0.8955, + "step": 12237 + }, + { + "epoch": 0.595914591093906, + "grad_norm": 1.8145908117294312, + "learning_rate": 1.4818795024238442e-05, + "loss": 0.812, + "step": 12238 + }, + { + "epoch": 0.5959632848830132, + "grad_norm": 1.7708740234375, + "learning_rate": 1.4815748407296701e-05, + "loss": 0.8695, + "step": 12239 + }, + { + "epoch": 0.5960119786721204, + "grad_norm": 1.7763546705245972, + "learning_rate": 1.481270191931369e-05, + "loss": 0.8253, + "step": 12240 + }, + { + "epoch": 0.5960606724612276, + "grad_norm": 2.6265716552734375, + "learning_rate": 1.4809655560365195e-05, + "loss": 0.8103, + "step": 12241 + }, + { + "epoch": 0.5961093662503347, + "grad_norm": 1.7215005159378052, + "learning_rate": 1.4806609330526984e-05, + "loss": 0.8408, + "step": 12242 + }, + { + "epoch": 0.5961580600394419, + "grad_norm": 1.416457176208496, + "learning_rate": 1.4803563229874849e-05, + "loss": 0.8311, + "step": 12243 + }, + { + "epoch": 0.5962067538285492, + "grad_norm": 2.106900215148926, + "learning_rate": 1.4800517258484543e-05, + "loss": 0.8764, + "step": 12244 + }, + { + "epoch": 0.5962554476176564, + "grad_norm": 1.911096453666687, + "learning_rate": 1.4797471416431845e-05, + "loss": 0.9059, + "step": 12245 + }, + { + "epoch": 0.5963041414067636, + "grad_norm": 1.7418107986450195, + "learning_rate": 1.479442570379253e-05, + "loss": 0.7869, + "step": 12246 + }, + { + "epoch": 0.5963528351958708, + "grad_norm": 1.229740858078003, + "learning_rate": 1.4791380120642341e-05, + "loss": 0.7084, + "step": 12247 + }, + { + "epoch": 0.596401528984978, + "grad_norm": 1.5179027318954468, + "learning_rate": 1.4788334667057047e-05, + "loss": 0.8913, + "step": 12248 + }, + { + "epoch": 0.5964502227740852, + "grad_norm": 2.528493881225586, + "learning_rate": 1.4785289343112404e-05, + "loss": 0.8031, + "step": 12249 + }, + { + "epoch": 0.5964989165631923, + "grad_norm": 1.5754404067993164, + "learning_rate": 1.4782244148884171e-05, + "loss": 0.785, + "step": 12250 + }, + { + "epoch": 0.5965476103522995, + "grad_norm": 1.5746430158615112, + "learning_rate": 1.477919908444808e-05, + "loss": 0.8062, + "step": 12251 + }, + { + "epoch": 0.5965963041414067, + "grad_norm": 2.1975390911102295, + "learning_rate": 1.47761541498799e-05, + "loss": 0.8334, + "step": 12252 + }, + { + "epoch": 0.596644997930514, + "grad_norm": 2.263226270675659, + "learning_rate": 1.477310934525535e-05, + "loss": 0.7385, + "step": 12253 + }, + { + "epoch": 0.5966936917196212, + "grad_norm": 1.6481012105941772, + "learning_rate": 1.4770064670650182e-05, + "loss": 0.9096, + "step": 12254 + }, + { + "epoch": 0.5967423855087284, + "grad_norm": 1.7986974716186523, + "learning_rate": 1.4767020126140139e-05, + "loss": 0.8793, + "step": 12255 + }, + { + "epoch": 0.5967910792978356, + "grad_norm": 2.0469207763671875, + "learning_rate": 1.476397571180094e-05, + "loss": 0.7572, + "step": 12256 + }, + { + "epoch": 0.5968397730869428, + "grad_norm": 0.09228111058473587, + "learning_rate": 1.4760931427708332e-05, + "loss": 0.5941, + "step": 12257 + }, + { + "epoch": 0.5968884668760499, + "grad_norm": 1.6911253929138184, + "learning_rate": 1.4757887273938025e-05, + "loss": 0.8509, + "step": 12258 + }, + { + "epoch": 0.5969371606651571, + "grad_norm": 1.5476915836334229, + "learning_rate": 1.4754843250565758e-05, + "loss": 0.7682, + "step": 12259 + }, + { + "epoch": 0.5969858544542643, + "grad_norm": 1.4977115392684937, + "learning_rate": 1.4751799357667235e-05, + "loss": 0.8124, + "step": 12260 + }, + { + "epoch": 0.5970345482433715, + "grad_norm": 2.6511905193328857, + "learning_rate": 1.4748755595318185e-05, + "loss": 0.7846, + "step": 12261 + }, + { + "epoch": 0.5970832420324788, + "grad_norm": 0.08961812406778336, + "learning_rate": 1.4745711963594315e-05, + "loss": 0.6188, + "step": 12262 + }, + { + "epoch": 0.597131935821586, + "grad_norm": 1.6536833047866821, + "learning_rate": 1.4742668462571348e-05, + "loss": 0.7346, + "step": 12263 + }, + { + "epoch": 0.5971806296106932, + "grad_norm": 1.7552939653396606, + "learning_rate": 1.4739625092324972e-05, + "loss": 0.7583, + "step": 12264 + }, + { + "epoch": 0.5972293233998004, + "grad_norm": 1.6516356468200684, + "learning_rate": 1.4736581852930902e-05, + "loss": 0.823, + "step": 12265 + }, + { + "epoch": 0.5972780171889076, + "grad_norm": 2.71378493309021, + "learning_rate": 1.4733538744464846e-05, + "loss": 0.8227, + "step": 12266 + }, + { + "epoch": 0.5973267109780147, + "grad_norm": 2.1668999195098877, + "learning_rate": 1.4730495767002488e-05, + "loss": 0.8309, + "step": 12267 + }, + { + "epoch": 0.5973754047671219, + "grad_norm": 1.5300183296203613, + "learning_rate": 1.4727452920619532e-05, + "loss": 0.8465, + "step": 12268 + }, + { + "epoch": 0.5974240985562291, + "grad_norm": 1.7514939308166504, + "learning_rate": 1.4724410205391664e-05, + "loss": 0.84, + "step": 12269 + }, + { + "epoch": 0.5974727923453363, + "grad_norm": 1.8068101406097412, + "learning_rate": 1.4721367621394574e-05, + "loss": 0.8124, + "step": 12270 + }, + { + "epoch": 0.5975214861344436, + "grad_norm": 2.369777202606201, + "learning_rate": 1.471832516870394e-05, + "loss": 0.8151, + "step": 12271 + }, + { + "epoch": 0.5975701799235508, + "grad_norm": 2.4272706508636475, + "learning_rate": 1.471528284739546e-05, + "loss": 0.9311, + "step": 12272 + }, + { + "epoch": 0.597618873712658, + "grad_norm": 0.09124103933572769, + "learning_rate": 1.471224065754479e-05, + "loss": 0.5956, + "step": 12273 + }, + { + "epoch": 0.5976675675017652, + "grad_norm": 1.377081036567688, + "learning_rate": 1.4709198599227617e-05, + "loss": 0.8346, + "step": 12274 + }, + { + "epoch": 0.5977162612908723, + "grad_norm": 2.6494908332824707, + "learning_rate": 1.4706156672519618e-05, + "loss": 0.8583, + "step": 12275 + }, + { + "epoch": 0.5977649550799795, + "grad_norm": 2.111651659011841, + "learning_rate": 1.470311487749645e-05, + "loss": 0.8313, + "step": 12276 + }, + { + "epoch": 0.5978136488690867, + "grad_norm": 4.524930477142334, + "learning_rate": 1.4700073214233783e-05, + "loss": 0.8518, + "step": 12277 + }, + { + "epoch": 0.5978623426581939, + "grad_norm": 1.5984479188919067, + "learning_rate": 1.4697031682807275e-05, + "loss": 0.7855, + "step": 12278 + }, + { + "epoch": 0.5979110364473011, + "grad_norm": 2.008493661880493, + "learning_rate": 1.4693990283292596e-05, + "loss": 0.7972, + "step": 12279 + }, + { + "epoch": 0.5979597302364084, + "grad_norm": 0.10066793859004974, + "learning_rate": 1.4690949015765383e-05, + "loss": 0.5828, + "step": 12280 + }, + { + "epoch": 0.5980084240255156, + "grad_norm": 2.1964635848999023, + "learning_rate": 1.4687907880301306e-05, + "loss": 0.7964, + "step": 12281 + }, + { + "epoch": 0.5980571178146228, + "grad_norm": 1.8975632190704346, + "learning_rate": 1.4684866876975998e-05, + "loss": 0.7852, + "step": 12282 + }, + { + "epoch": 0.59810581160373, + "grad_norm": 1.738374948501587, + "learning_rate": 1.4681826005865114e-05, + "loss": 0.8045, + "step": 12283 + }, + { + "epoch": 0.5981545053928371, + "grad_norm": 2.3717844486236572, + "learning_rate": 1.4678785267044289e-05, + "loss": 0.8681, + "step": 12284 + }, + { + "epoch": 0.5982031991819443, + "grad_norm": 1.4286247491836548, + "learning_rate": 1.4675744660589164e-05, + "loss": 0.7781, + "step": 12285 + }, + { + "epoch": 0.5982518929710515, + "grad_norm": 1.4860533475875854, + "learning_rate": 1.4672704186575387e-05, + "loss": 0.8748, + "step": 12286 + }, + { + "epoch": 0.5983005867601587, + "grad_norm": 2.333505868911743, + "learning_rate": 1.466966384507857e-05, + "loss": 0.7406, + "step": 12287 + }, + { + "epoch": 0.598349280549266, + "grad_norm": 1.4053341150283813, + "learning_rate": 1.466662363617436e-05, + "loss": 0.8406, + "step": 12288 + }, + { + "epoch": 0.5983979743383732, + "grad_norm": 1.6166653633117676, + "learning_rate": 1.4663583559938363e-05, + "loss": 0.9384, + "step": 12289 + }, + { + "epoch": 0.5984466681274804, + "grad_norm": 2.3355016708374023, + "learning_rate": 1.4660543616446217e-05, + "loss": 0.8333, + "step": 12290 + }, + { + "epoch": 0.5984953619165876, + "grad_norm": 1.6658573150634766, + "learning_rate": 1.4657503805773532e-05, + "loss": 0.9124, + "step": 12291 + }, + { + "epoch": 0.5985440557056947, + "grad_norm": 1.674013614654541, + "learning_rate": 1.4654464127995933e-05, + "loss": 0.8147, + "step": 12292 + }, + { + "epoch": 0.5985927494948019, + "grad_norm": 1.7920647859573364, + "learning_rate": 1.4651424583189017e-05, + "loss": 0.8303, + "step": 12293 + }, + { + "epoch": 0.5986414432839091, + "grad_norm": 1.6769338846206665, + "learning_rate": 1.4648385171428405e-05, + "loss": 0.9339, + "step": 12294 + }, + { + "epoch": 0.5986901370730163, + "grad_norm": 2.348335027694702, + "learning_rate": 1.4645345892789708e-05, + "loss": 0.7971, + "step": 12295 + }, + { + "epoch": 0.5987388308621235, + "grad_norm": 2.0086727142333984, + "learning_rate": 1.4642306747348511e-05, + "loss": 0.8836, + "step": 12296 + }, + { + "epoch": 0.5987875246512308, + "grad_norm": 1.9445635080337524, + "learning_rate": 1.4639267735180427e-05, + "loss": 0.7812, + "step": 12297 + }, + { + "epoch": 0.598836218440338, + "grad_norm": 1.4684902429580688, + "learning_rate": 1.4636228856361045e-05, + "loss": 0.8128, + "step": 12298 + }, + { + "epoch": 0.5988849122294452, + "grad_norm": 1.7697829008102417, + "learning_rate": 1.463319011096596e-05, + "loss": 0.9091, + "step": 12299 + }, + { + "epoch": 0.5989336060185524, + "grad_norm": 1.5199682712554932, + "learning_rate": 1.4630151499070757e-05, + "loss": 0.8385, + "step": 12300 + }, + { + "epoch": 0.5989822998076595, + "grad_norm": 2.530388355255127, + "learning_rate": 1.4627113020751033e-05, + "loss": 0.8363, + "step": 12301 + }, + { + "epoch": 0.5990309935967667, + "grad_norm": 1.4399851560592651, + "learning_rate": 1.4624074676082356e-05, + "loss": 0.8366, + "step": 12302 + }, + { + "epoch": 0.5990796873858739, + "grad_norm": 1.92905592918396, + "learning_rate": 1.4621036465140318e-05, + "loss": 0.8828, + "step": 12303 + }, + { + "epoch": 0.5991283811749811, + "grad_norm": 1.6768471002578735, + "learning_rate": 1.4617998388000478e-05, + "loss": 0.8401, + "step": 12304 + }, + { + "epoch": 0.5991770749640883, + "grad_norm": 1.7119725942611694, + "learning_rate": 1.4614960444738422e-05, + "loss": 0.786, + "step": 12305 + }, + { + "epoch": 0.5992257687531956, + "grad_norm": 1.361045479774475, + "learning_rate": 1.4611922635429721e-05, + "loss": 0.7708, + "step": 12306 + }, + { + "epoch": 0.5992744625423028, + "grad_norm": 2.2353672981262207, + "learning_rate": 1.460888496014993e-05, + "loss": 0.7779, + "step": 12307 + }, + { + "epoch": 0.59932315633141, + "grad_norm": 1.6820435523986816, + "learning_rate": 1.4605847418974624e-05, + "loss": 0.7663, + "step": 12308 + }, + { + "epoch": 0.5993718501205171, + "grad_norm": 0.09609567373991013, + "learning_rate": 1.4602810011979347e-05, + "loss": 0.6293, + "step": 12309 + }, + { + "epoch": 0.5994205439096243, + "grad_norm": 2.008338212966919, + "learning_rate": 1.4599772739239672e-05, + "loss": 0.715, + "step": 12310 + }, + { + "epoch": 0.5994692376987315, + "grad_norm": 2.085118055343628, + "learning_rate": 1.4596735600831133e-05, + "loss": 0.8254, + "step": 12311 + }, + { + "epoch": 0.5995179314878387, + "grad_norm": 1.4380658864974976, + "learning_rate": 1.4593698596829296e-05, + "loss": 0.8753, + "step": 12312 + }, + { + "epoch": 0.5995666252769459, + "grad_norm": 1.5516114234924316, + "learning_rate": 1.4590661727309694e-05, + "loss": 0.8424, + "step": 12313 + }, + { + "epoch": 0.5996153190660531, + "grad_norm": 1.5193673372268677, + "learning_rate": 1.4587624992347875e-05, + "loss": 0.8121, + "step": 12314 + }, + { + "epoch": 0.5996640128551604, + "grad_norm": 1.5568941831588745, + "learning_rate": 1.4584588392019386e-05, + "loss": 0.8256, + "step": 12315 + }, + { + "epoch": 0.5997127066442676, + "grad_norm": 1.7135850191116333, + "learning_rate": 1.4581551926399743e-05, + "loss": 0.8532, + "step": 12316 + }, + { + "epoch": 0.5997614004333747, + "grad_norm": 2.0834219455718994, + "learning_rate": 1.4578515595564502e-05, + "loss": 0.8474, + "step": 12317 + }, + { + "epoch": 0.5998100942224819, + "grad_norm": 0.09476915001869202, + "learning_rate": 1.4575479399589169e-05, + "loss": 0.7077, + "step": 12318 + }, + { + "epoch": 0.5998587880115891, + "grad_norm": 2.422539710998535, + "learning_rate": 1.4572443338549286e-05, + "loss": 0.843, + "step": 12319 + }, + { + "epoch": 0.5999074818006963, + "grad_norm": 1.4984183311462402, + "learning_rate": 1.4569407412520367e-05, + "loss": 0.8234, + "step": 12320 + }, + { + "epoch": 0.5999561755898035, + "grad_norm": 18.807680130004883, + "learning_rate": 1.4566371621577941e-05, + "loss": 0.7557, + "step": 12321 + }, + { + "epoch": 0.6000048693789107, + "grad_norm": 1.7630223035812378, + "learning_rate": 1.4563335965797508e-05, + "loss": 0.8369, + "step": 12322 + }, + { + "epoch": 0.600053563168018, + "grad_norm": 1.5923619270324707, + "learning_rate": 1.4560300445254587e-05, + "loss": 0.7604, + "step": 12323 + }, + { + "epoch": 0.6001022569571252, + "grad_norm": 2.817920207977295, + "learning_rate": 1.4557265060024696e-05, + "loss": 0.7302, + "step": 12324 + }, + { + "epoch": 0.6001509507462324, + "grad_norm": 1.7593590021133423, + "learning_rate": 1.4554229810183329e-05, + "loss": 0.7841, + "step": 12325 + }, + { + "epoch": 0.6001996445353395, + "grad_norm": 1.37920343875885, + "learning_rate": 1.4551194695805996e-05, + "loss": 0.7422, + "step": 12326 + }, + { + "epoch": 0.6002483383244467, + "grad_norm": 1.400391697883606, + "learning_rate": 1.4548159716968187e-05, + "loss": 0.795, + "step": 12327 + }, + { + "epoch": 0.6002970321135539, + "grad_norm": 1.4517927169799805, + "learning_rate": 1.4545124873745405e-05, + "loss": 0.8095, + "step": 12328 + }, + { + "epoch": 0.6003457259026611, + "grad_norm": 1.516599178314209, + "learning_rate": 1.4542090166213135e-05, + "loss": 0.8751, + "step": 12329 + }, + { + "epoch": 0.6003944196917683, + "grad_norm": 1.5614460706710815, + "learning_rate": 1.4539055594446877e-05, + "loss": 0.7873, + "step": 12330 + }, + { + "epoch": 0.6004431134808755, + "grad_norm": 2.7353901863098145, + "learning_rate": 1.45360211585221e-05, + "loss": 0.8054, + "step": 12331 + }, + { + "epoch": 0.6004918072699827, + "grad_norm": 1.6237586736679077, + "learning_rate": 1.4532986858514304e-05, + "loss": 0.8323, + "step": 12332 + }, + { + "epoch": 0.60054050105909, + "grad_norm": 2.0205767154693604, + "learning_rate": 1.452995269449895e-05, + "loss": 0.7894, + "step": 12333 + }, + { + "epoch": 0.6005891948481971, + "grad_norm": 1.9507137537002563, + "learning_rate": 1.4526918666551523e-05, + "loss": 0.8167, + "step": 12334 + }, + { + "epoch": 0.6006378886373043, + "grad_norm": 2.112731695175171, + "learning_rate": 1.4523884774747497e-05, + "loss": 0.8465, + "step": 12335 + }, + { + "epoch": 0.6006865824264115, + "grad_norm": 1.345590591430664, + "learning_rate": 1.4520851019162333e-05, + "loss": 0.8135, + "step": 12336 + }, + { + "epoch": 0.6007352762155187, + "grad_norm": 1.6827328205108643, + "learning_rate": 1.4517817399871503e-05, + "loss": 0.7695, + "step": 12337 + }, + { + "epoch": 0.6007839700046259, + "grad_norm": 1.612619161605835, + "learning_rate": 1.451478391695046e-05, + "loss": 0.7662, + "step": 12338 + }, + { + "epoch": 0.6008326637937331, + "grad_norm": 1.5940518379211426, + "learning_rate": 1.4511750570474676e-05, + "loss": 0.7938, + "step": 12339 + }, + { + "epoch": 0.6008813575828403, + "grad_norm": 0.09444914013147354, + "learning_rate": 1.4508717360519588e-05, + "loss": 0.6201, + "step": 12340 + }, + { + "epoch": 0.6009300513719475, + "grad_norm": 2.179813861846924, + "learning_rate": 1.4505684287160661e-05, + "loss": 0.8516, + "step": 12341 + }, + { + "epoch": 0.6009787451610548, + "grad_norm": 1.4245043992996216, + "learning_rate": 1.4502651350473333e-05, + "loss": 0.895, + "step": 12342 + }, + { + "epoch": 0.6010274389501619, + "grad_norm": 1.7973136901855469, + "learning_rate": 1.4499618550533057e-05, + "loss": 0.7636, + "step": 12343 + }, + { + "epoch": 0.6010761327392691, + "grad_norm": 2.477226495742798, + "learning_rate": 1.4496585887415275e-05, + "loss": 0.7774, + "step": 12344 + }, + { + "epoch": 0.6011248265283763, + "grad_norm": 2.6214599609375, + "learning_rate": 1.4493553361195416e-05, + "loss": 0.8524, + "step": 12345 + }, + { + "epoch": 0.6011735203174835, + "grad_norm": 2.097470283508301, + "learning_rate": 1.4490520971948927e-05, + "loss": 0.8176, + "step": 12346 + }, + { + "epoch": 0.6012222141065907, + "grad_norm": 1.9116315841674805, + "learning_rate": 1.4487488719751221e-05, + "loss": 0.8774, + "step": 12347 + }, + { + "epoch": 0.6012709078956979, + "grad_norm": 1.9218803644180298, + "learning_rate": 1.4484456604677742e-05, + "loss": 0.8008, + "step": 12348 + }, + { + "epoch": 0.6013196016848051, + "grad_norm": 2.0475449562072754, + "learning_rate": 1.4481424626803902e-05, + "loss": 0.7859, + "step": 12349 + }, + { + "epoch": 0.6013682954739124, + "grad_norm": 2.9245502948760986, + "learning_rate": 1.4478392786205136e-05, + "loss": 0.8197, + "step": 12350 + }, + { + "epoch": 0.6014169892630195, + "grad_norm": 2.777575731277466, + "learning_rate": 1.4475361082956845e-05, + "loss": 0.7957, + "step": 12351 + }, + { + "epoch": 0.6014656830521267, + "grad_norm": 1.2666809558868408, + "learning_rate": 1.447232951713446e-05, + "loss": 0.9059, + "step": 12352 + }, + { + "epoch": 0.6015143768412339, + "grad_norm": 1.5855292081832886, + "learning_rate": 1.4469298088813373e-05, + "loss": 0.9377, + "step": 12353 + }, + { + "epoch": 0.6015630706303411, + "grad_norm": 2.0107240676879883, + "learning_rate": 1.4466266798069e-05, + "loss": 0.8609, + "step": 12354 + }, + { + "epoch": 0.6016117644194483, + "grad_norm": 0.09484801441431046, + "learning_rate": 1.4463235644976754e-05, + "loss": 0.5583, + "step": 12355 + }, + { + "epoch": 0.6016604582085555, + "grad_norm": 2.6602699756622314, + "learning_rate": 1.4460204629612018e-05, + "loss": 0.8337, + "step": 12356 + }, + { + "epoch": 0.6017091519976627, + "grad_norm": 3.820998191833496, + "learning_rate": 1.44571737520502e-05, + "loss": 0.8996, + "step": 12357 + }, + { + "epoch": 0.6017578457867699, + "grad_norm": 2.065303087234497, + "learning_rate": 1.4454143012366686e-05, + "loss": 0.74, + "step": 12358 + }, + { + "epoch": 0.601806539575877, + "grad_norm": 1.944452166557312, + "learning_rate": 1.4451112410636881e-05, + "loss": 0.7994, + "step": 12359 + }, + { + "epoch": 0.6018552333649843, + "grad_norm": 2.1758320331573486, + "learning_rate": 1.444808194693615e-05, + "loss": 0.8192, + "step": 12360 + }, + { + "epoch": 0.6019039271540915, + "grad_norm": 0.09257490187883377, + "learning_rate": 1.4445051621339895e-05, + "loss": 0.6054, + "step": 12361 + }, + { + "epoch": 0.6019526209431987, + "grad_norm": 2.1619935035705566, + "learning_rate": 1.444202143392348e-05, + "loss": 0.8444, + "step": 12362 + }, + { + "epoch": 0.6020013147323059, + "grad_norm": 1.8150478601455688, + "learning_rate": 1.443899138476229e-05, + "loss": 0.8097, + "step": 12363 + }, + { + "epoch": 0.6020500085214131, + "grad_norm": 1.9741270542144775, + "learning_rate": 1.4435961473931697e-05, + "loss": 0.7853, + "step": 12364 + }, + { + "epoch": 0.6020987023105203, + "grad_norm": 1.7669970989227295, + "learning_rate": 1.4432931701507066e-05, + "loss": 0.7775, + "step": 12365 + }, + { + "epoch": 0.6021473960996275, + "grad_norm": 1.361922025680542, + "learning_rate": 1.4429902067563779e-05, + "loss": 0.9122, + "step": 12366 + }, + { + "epoch": 0.6021960898887347, + "grad_norm": 1.770210862159729, + "learning_rate": 1.4426872572177174e-05, + "loss": 0.815, + "step": 12367 + }, + { + "epoch": 0.6022447836778418, + "grad_norm": 1.9457323551177979, + "learning_rate": 1.442384321542263e-05, + "loss": 0.7518, + "step": 12368 + }, + { + "epoch": 0.6022934774669491, + "grad_norm": 1.4736109972000122, + "learning_rate": 1.442081399737549e-05, + "loss": 0.8693, + "step": 12369 + }, + { + "epoch": 0.6023421712560563, + "grad_norm": 1.8256334066390991, + "learning_rate": 1.4417784918111108e-05, + "loss": 0.8311, + "step": 12370 + }, + { + "epoch": 0.6023908650451635, + "grad_norm": 4.112700462341309, + "learning_rate": 1.4414755977704835e-05, + "loss": 0.9527, + "step": 12371 + }, + { + "epoch": 0.6024395588342707, + "grad_norm": 1.5066057443618774, + "learning_rate": 1.4411727176232027e-05, + "loss": 0.8595, + "step": 12372 + }, + { + "epoch": 0.6024882526233779, + "grad_norm": 2.197594404220581, + "learning_rate": 1.4408698513768005e-05, + "loss": 0.8122, + "step": 12373 + }, + { + "epoch": 0.6025369464124851, + "grad_norm": 1.667728066444397, + "learning_rate": 1.4405669990388114e-05, + "loss": 0.8358, + "step": 12374 + }, + { + "epoch": 0.6025856402015923, + "grad_norm": 4.473073482513428, + "learning_rate": 1.4402641606167703e-05, + "loss": 0.8513, + "step": 12375 + }, + { + "epoch": 0.6026343339906994, + "grad_norm": 2.677582263946533, + "learning_rate": 1.4399613361182083e-05, + "loss": 0.7651, + "step": 12376 + }, + { + "epoch": 0.6026830277798066, + "grad_norm": 2.83198881149292, + "learning_rate": 1.4396585255506597e-05, + "loss": 0.7675, + "step": 12377 + }, + { + "epoch": 0.6027317215689139, + "grad_norm": 1.7682701349258423, + "learning_rate": 1.4393557289216564e-05, + "loss": 0.7799, + "step": 12378 + }, + { + "epoch": 0.6027804153580211, + "grad_norm": 1.4475293159484863, + "learning_rate": 1.4390529462387303e-05, + "loss": 0.8443, + "step": 12379 + }, + { + "epoch": 0.6028291091471283, + "grad_norm": 1.923649787902832, + "learning_rate": 1.4387501775094127e-05, + "loss": 0.7412, + "step": 12380 + }, + { + "epoch": 0.6028778029362355, + "grad_norm": 2.0300755500793457, + "learning_rate": 1.4384474227412368e-05, + "loss": 0.7989, + "step": 12381 + }, + { + "epoch": 0.6029264967253427, + "grad_norm": 1.9766128063201904, + "learning_rate": 1.4381446819417315e-05, + "loss": 0.8487, + "step": 12382 + }, + { + "epoch": 0.6029751905144499, + "grad_norm": 1.5200897455215454, + "learning_rate": 1.4378419551184286e-05, + "loss": 0.7791, + "step": 12383 + }, + { + "epoch": 0.6030238843035571, + "grad_norm": 1.5960115194320679, + "learning_rate": 1.437539242278859e-05, + "loss": 0.7868, + "step": 12384 + }, + { + "epoch": 0.6030725780926642, + "grad_norm": 0.0965719074010849, + "learning_rate": 1.4372365434305513e-05, + "loss": 0.5938, + "step": 12385 + }, + { + "epoch": 0.6031212718817714, + "grad_norm": 1.5666553974151611, + "learning_rate": 1.4369338585810362e-05, + "loss": 0.8104, + "step": 12386 + }, + { + "epoch": 0.6031699656708787, + "grad_norm": 3.4690709114074707, + "learning_rate": 1.4366311877378423e-05, + "loss": 0.7356, + "step": 12387 + }, + { + "epoch": 0.6032186594599859, + "grad_norm": 2.091592788696289, + "learning_rate": 1.4363285309085e-05, + "loss": 0.8052, + "step": 12388 + }, + { + "epoch": 0.6032673532490931, + "grad_norm": 1.8363885879516602, + "learning_rate": 1.4360258881005357e-05, + "loss": 0.8455, + "step": 12389 + }, + { + "epoch": 0.6033160470382003, + "grad_norm": 1.5858694314956665, + "learning_rate": 1.4357232593214798e-05, + "loss": 0.8531, + "step": 12390 + }, + { + "epoch": 0.6033647408273075, + "grad_norm": 1.6334631443023682, + "learning_rate": 1.4354206445788585e-05, + "loss": 0.742, + "step": 12391 + }, + { + "epoch": 0.6034134346164147, + "grad_norm": 1.3145724534988403, + "learning_rate": 1.4351180438802006e-05, + "loss": 0.8345, + "step": 12392 + }, + { + "epoch": 0.6034621284055218, + "grad_norm": 1.642411470413208, + "learning_rate": 1.4348154572330323e-05, + "loss": 0.8719, + "step": 12393 + }, + { + "epoch": 0.603510822194629, + "grad_norm": 1.3993268013000488, + "learning_rate": 1.4345128846448812e-05, + "loss": 0.8321, + "step": 12394 + }, + { + "epoch": 0.6035595159837362, + "grad_norm": 1.7355505228042603, + "learning_rate": 1.4342103261232744e-05, + "loss": 0.8652, + "step": 12395 + }, + { + "epoch": 0.6036082097728435, + "grad_norm": 1.9272900819778442, + "learning_rate": 1.4339077816757367e-05, + "loss": 0.7161, + "step": 12396 + }, + { + "epoch": 0.6036569035619507, + "grad_norm": 1.8828706741333008, + "learning_rate": 1.4336052513097953e-05, + "loss": 0.7831, + "step": 12397 + }, + { + "epoch": 0.6037055973510579, + "grad_norm": 1.3856284618377686, + "learning_rate": 1.433302735032974e-05, + "loss": 0.7429, + "step": 12398 + }, + { + "epoch": 0.6037542911401651, + "grad_norm": 2.462688446044922, + "learning_rate": 1.4330002328527992e-05, + "loss": 0.9141, + "step": 12399 + }, + { + "epoch": 0.6038029849292723, + "grad_norm": 0.09728936851024628, + "learning_rate": 1.4326977447767949e-05, + "loss": 0.5694, + "step": 12400 + }, + { + "epoch": 0.6038516787183795, + "grad_norm": 2.0158843994140625, + "learning_rate": 1.4323952708124869e-05, + "loss": 0.8525, + "step": 12401 + }, + { + "epoch": 0.6039003725074866, + "grad_norm": 1.5487241744995117, + "learning_rate": 1.4320928109673972e-05, + "loss": 0.8686, + "step": 12402 + }, + { + "epoch": 0.6039490662965938, + "grad_norm": 1.5861501693725586, + "learning_rate": 1.4317903652490508e-05, + "loss": 0.838, + "step": 12403 + }, + { + "epoch": 0.603997760085701, + "grad_norm": 2.199693202972412, + "learning_rate": 1.4314879336649716e-05, + "loss": 0.7569, + "step": 12404 + }, + { + "epoch": 0.6040464538748083, + "grad_norm": 1.3552736043930054, + "learning_rate": 1.431185516222681e-05, + "loss": 0.8069, + "step": 12405 + }, + { + "epoch": 0.6040951476639155, + "grad_norm": 0.08949628472328186, + "learning_rate": 1.4308831129297032e-05, + "loss": 0.5864, + "step": 12406 + }, + { + "epoch": 0.6041438414530227, + "grad_norm": 1.3935366868972778, + "learning_rate": 1.4305807237935599e-05, + "loss": 0.9069, + "step": 12407 + }, + { + "epoch": 0.6041925352421299, + "grad_norm": 1.3631824254989624, + "learning_rate": 1.4302783488217725e-05, + "loss": 0.7749, + "step": 12408 + }, + { + "epoch": 0.6042412290312371, + "grad_norm": 1.6868489980697632, + "learning_rate": 1.4299759880218634e-05, + "loss": 0.8375, + "step": 12409 + }, + { + "epoch": 0.6042899228203442, + "grad_norm": 1.963128685951233, + "learning_rate": 1.4296736414013538e-05, + "loss": 0.6265, + "step": 12410 + }, + { + "epoch": 0.6043386166094514, + "grad_norm": 1.468257188796997, + "learning_rate": 1.4293713089677639e-05, + "loss": 0.8587, + "step": 12411 + }, + { + "epoch": 0.6043873103985586, + "grad_norm": 1.622968316078186, + "learning_rate": 1.4290689907286153e-05, + "loss": 0.7881, + "step": 12412 + }, + { + "epoch": 0.6044360041876659, + "grad_norm": 1.4315965175628662, + "learning_rate": 1.428766686691427e-05, + "loss": 0.7525, + "step": 12413 + }, + { + "epoch": 0.6044846979767731, + "grad_norm": 1.841628909111023, + "learning_rate": 1.4284643968637193e-05, + "loss": 0.9238, + "step": 12414 + }, + { + "epoch": 0.6045333917658803, + "grad_norm": 1.6970148086547852, + "learning_rate": 1.4281621212530124e-05, + "loss": 0.9219, + "step": 12415 + }, + { + "epoch": 0.6045820855549875, + "grad_norm": 1.7991327047348022, + "learning_rate": 1.4278598598668242e-05, + "loss": 0.8634, + "step": 12416 + }, + { + "epoch": 0.6046307793440947, + "grad_norm": 1.3579373359680176, + "learning_rate": 1.4275576127126754e-05, + "loss": 0.8246, + "step": 12417 + }, + { + "epoch": 0.6046794731332018, + "grad_norm": 1.5524271726608276, + "learning_rate": 1.4272553797980821e-05, + "loss": 0.8533, + "step": 12418 + }, + { + "epoch": 0.604728166922309, + "grad_norm": 1.524187445640564, + "learning_rate": 1.4269531611305642e-05, + "loss": 0.8416, + "step": 12419 + }, + { + "epoch": 0.6047768607114162, + "grad_norm": 2.5020837783813477, + "learning_rate": 1.4266509567176378e-05, + "loss": 0.845, + "step": 12420 + }, + { + "epoch": 0.6048255545005234, + "grad_norm": 1.3517827987670898, + "learning_rate": 1.4263487665668217e-05, + "loss": 0.9254, + "step": 12421 + }, + { + "epoch": 0.6048742482896307, + "grad_norm": 1.3262314796447754, + "learning_rate": 1.426046590685632e-05, + "loss": 0.8024, + "step": 12422 + }, + { + "epoch": 0.6049229420787379, + "grad_norm": 1.8975650072097778, + "learning_rate": 1.4257444290815854e-05, + "loss": 0.7479, + "step": 12423 + }, + { + "epoch": 0.6049716358678451, + "grad_norm": 3.6378085613250732, + "learning_rate": 1.4254422817621995e-05, + "loss": 0.8591, + "step": 12424 + }, + { + "epoch": 0.6050203296569523, + "grad_norm": 1.4931813478469849, + "learning_rate": 1.4251401487349883e-05, + "loss": 0.82, + "step": 12425 + }, + { + "epoch": 0.6050690234460595, + "grad_norm": 2.970672369003296, + "learning_rate": 1.4248380300074694e-05, + "loss": 0.7609, + "step": 12426 + }, + { + "epoch": 0.6051177172351666, + "grad_norm": 1.6984121799468994, + "learning_rate": 1.424535925587156e-05, + "loss": 0.7723, + "step": 12427 + }, + { + "epoch": 0.6051664110242738, + "grad_norm": 1.602218747138977, + "learning_rate": 1.4242338354815642e-05, + "loss": 0.7783, + "step": 12428 + }, + { + "epoch": 0.605215104813381, + "grad_norm": 4.8395233154296875, + "learning_rate": 1.4239317596982078e-05, + "loss": 0.7796, + "step": 12429 + }, + { + "epoch": 0.6052637986024882, + "grad_norm": 1.3684899806976318, + "learning_rate": 1.4236296982446023e-05, + "loss": 0.8145, + "step": 12430 + }, + { + "epoch": 0.6053124923915955, + "grad_norm": 1.7798932790756226, + "learning_rate": 1.42332765112826e-05, + "loss": 0.6675, + "step": 12431 + }, + { + "epoch": 0.6053611861807027, + "grad_norm": 1.5152332782745361, + "learning_rate": 1.4230256183566949e-05, + "loss": 0.8009, + "step": 12432 + }, + { + "epoch": 0.6054098799698099, + "grad_norm": 1.7783621549606323, + "learning_rate": 1.4227235999374207e-05, + "loss": 0.8373, + "step": 12433 + }, + { + "epoch": 0.6054585737589171, + "grad_norm": 2.6895129680633545, + "learning_rate": 1.4224215958779488e-05, + "loss": 0.7288, + "step": 12434 + }, + { + "epoch": 0.6055072675480242, + "grad_norm": 1.352860450744629, + "learning_rate": 1.4221196061857933e-05, + "loss": 0.8439, + "step": 12435 + }, + { + "epoch": 0.6055559613371314, + "grad_norm": 3.7498021125793457, + "learning_rate": 1.4218176308684645e-05, + "loss": 0.8581, + "step": 12436 + }, + { + "epoch": 0.6056046551262386, + "grad_norm": 3.1427414417266846, + "learning_rate": 1.4215156699334754e-05, + "loss": 0.8321, + "step": 12437 + }, + { + "epoch": 0.6056533489153458, + "grad_norm": 2.265091896057129, + "learning_rate": 1.421213723388336e-05, + "loss": 0.7962, + "step": 12438 + }, + { + "epoch": 0.605702042704453, + "grad_norm": 1.6040730476379395, + "learning_rate": 1.4209117912405587e-05, + "loss": 0.8104, + "step": 12439 + }, + { + "epoch": 0.6057507364935603, + "grad_norm": 2.0636606216430664, + "learning_rate": 1.420609873497653e-05, + "loss": 0.7787, + "step": 12440 + }, + { + "epoch": 0.6057994302826675, + "grad_norm": 1.437659740447998, + "learning_rate": 1.4203079701671303e-05, + "loss": 0.7599, + "step": 12441 + }, + { + "epoch": 0.6058481240717747, + "grad_norm": 1.7469263076782227, + "learning_rate": 1.4200060812564988e-05, + "loss": 0.9114, + "step": 12442 + }, + { + "epoch": 0.6058968178608819, + "grad_norm": 1.3366445302963257, + "learning_rate": 1.4197042067732688e-05, + "loss": 0.8098, + "step": 12443 + }, + { + "epoch": 0.605945511649989, + "grad_norm": 1.7234632968902588, + "learning_rate": 1.41940234672495e-05, + "loss": 0.9339, + "step": 12444 + }, + { + "epoch": 0.6059942054390962, + "grad_norm": 10.38673210144043, + "learning_rate": 1.4191005011190503e-05, + "loss": 0.8274, + "step": 12445 + }, + { + "epoch": 0.6060428992282034, + "grad_norm": 1.8455071449279785, + "learning_rate": 1.4187986699630795e-05, + "loss": 0.7818, + "step": 12446 + }, + { + "epoch": 0.6060915930173106, + "grad_norm": 2.4050605297088623, + "learning_rate": 1.4184968532645442e-05, + "loss": 0.7774, + "step": 12447 + }, + { + "epoch": 0.6061402868064178, + "grad_norm": 1.541359543800354, + "learning_rate": 1.4181950510309533e-05, + "loss": 0.8521, + "step": 12448 + }, + { + "epoch": 0.6061889805955251, + "grad_norm": 2.5805184841156006, + "learning_rate": 1.4178932632698126e-05, + "loss": 0.8553, + "step": 12449 + }, + { + "epoch": 0.6062376743846323, + "grad_norm": 1.7118273973464966, + "learning_rate": 1.4175914899886307e-05, + "loss": 0.848, + "step": 12450 + }, + { + "epoch": 0.6062863681737395, + "grad_norm": 3.3584234714508057, + "learning_rate": 1.417289731194913e-05, + "loss": 0.7906, + "step": 12451 + }, + { + "epoch": 0.6063350619628466, + "grad_norm": 2.4539451599121094, + "learning_rate": 1.4169879868961665e-05, + "loss": 0.751, + "step": 12452 + }, + { + "epoch": 0.6063837557519538, + "grad_norm": 1.5800336599349976, + "learning_rate": 1.4166862570998978e-05, + "loss": 0.8002, + "step": 12453 + }, + { + "epoch": 0.606432449541061, + "grad_norm": 2.1106412410736084, + "learning_rate": 1.4163845418136106e-05, + "loss": 0.81, + "step": 12454 + }, + { + "epoch": 0.6064811433301682, + "grad_norm": 1.831357717514038, + "learning_rate": 1.4160828410448122e-05, + "loss": 0.8178, + "step": 12455 + }, + { + "epoch": 0.6065298371192754, + "grad_norm": 1.682170033454895, + "learning_rate": 1.4157811548010054e-05, + "loss": 0.7706, + "step": 12456 + }, + { + "epoch": 0.6065785309083827, + "grad_norm": 4.414748668670654, + "learning_rate": 1.4154794830896963e-05, + "loss": 0.8948, + "step": 12457 + }, + { + "epoch": 0.6066272246974899, + "grad_norm": 1.1847676038742065, + "learning_rate": 1.4151778259183875e-05, + "loss": 0.8147, + "step": 12458 + }, + { + "epoch": 0.6066759184865971, + "grad_norm": 1.8765565156936646, + "learning_rate": 1.4148761832945848e-05, + "loss": 0.8999, + "step": 12459 + }, + { + "epoch": 0.6067246122757043, + "grad_norm": 1.553231120109558, + "learning_rate": 1.4145745552257894e-05, + "loss": 0.8117, + "step": 12460 + }, + { + "epoch": 0.6067733060648114, + "grad_norm": 3.394803285598755, + "learning_rate": 1.4142729417195062e-05, + "loss": 0.8442, + "step": 12461 + }, + { + "epoch": 0.6068219998539186, + "grad_norm": 1.8927747011184692, + "learning_rate": 1.4139713427832362e-05, + "loss": 0.7824, + "step": 12462 + }, + { + "epoch": 0.6068706936430258, + "grad_norm": 1.8052412271499634, + "learning_rate": 1.4136697584244826e-05, + "loss": 0.786, + "step": 12463 + }, + { + "epoch": 0.606919387432133, + "grad_norm": 1.5278031826019287, + "learning_rate": 1.4133681886507477e-05, + "loss": 0.9751, + "step": 12464 + }, + { + "epoch": 0.6069680812212402, + "grad_norm": 1.7343952655792236, + "learning_rate": 1.4130666334695318e-05, + "loss": 0.7407, + "step": 12465 + }, + { + "epoch": 0.6070167750103475, + "grad_norm": 1.4679560661315918, + "learning_rate": 1.4127650928883374e-05, + "loss": 0.7987, + "step": 12466 + }, + { + "epoch": 0.6070654687994547, + "grad_norm": 1.5325976610183716, + "learning_rate": 1.4124635669146644e-05, + "loss": 0.8064, + "step": 12467 + }, + { + "epoch": 0.6071141625885619, + "grad_norm": 1.7512017488479614, + "learning_rate": 1.4121620555560149e-05, + "loss": 0.7878, + "step": 12468 + }, + { + "epoch": 0.607162856377669, + "grad_norm": 1.500403881072998, + "learning_rate": 1.4118605588198868e-05, + "loss": 0.7809, + "step": 12469 + }, + { + "epoch": 0.6072115501667762, + "grad_norm": 1.6274454593658447, + "learning_rate": 1.4115590767137817e-05, + "loss": 0.7888, + "step": 12470 + }, + { + "epoch": 0.6072602439558834, + "grad_norm": 1.7734484672546387, + "learning_rate": 1.4112576092451975e-05, + "loss": 0.8061, + "step": 12471 + }, + { + "epoch": 0.6073089377449906, + "grad_norm": 1.867743968963623, + "learning_rate": 1.4109561564216338e-05, + "loss": 0.7966, + "step": 12472 + }, + { + "epoch": 0.6073576315340978, + "grad_norm": 0.09628479182720184, + "learning_rate": 1.4106547182505898e-05, + "loss": 0.6403, + "step": 12473 + }, + { + "epoch": 0.607406325323205, + "grad_norm": 1.4058741331100464, + "learning_rate": 1.4103532947395632e-05, + "loss": 0.8493, + "step": 12474 + }, + { + "epoch": 0.6074550191123123, + "grad_norm": 2.0109426975250244, + "learning_rate": 1.410051885896053e-05, + "loss": 0.8539, + "step": 12475 + }, + { + "epoch": 0.6075037129014195, + "grad_norm": 1.9882837533950806, + "learning_rate": 1.4097504917275551e-05, + "loss": 0.7522, + "step": 12476 + }, + { + "epoch": 0.6075524066905266, + "grad_norm": 1.3893163204193115, + "learning_rate": 1.409449112241568e-05, + "loss": 0.7508, + "step": 12477 + }, + { + "epoch": 0.6076011004796338, + "grad_norm": 2.26940655708313, + "learning_rate": 1.4091477474455877e-05, + "loss": 0.7651, + "step": 12478 + }, + { + "epoch": 0.607649794268741, + "grad_norm": 2.08720326423645, + "learning_rate": 1.4088463973471113e-05, + "loss": 0.832, + "step": 12479 + }, + { + "epoch": 0.6076984880578482, + "grad_norm": 1.196083664894104, + "learning_rate": 1.4085450619536343e-05, + "loss": 0.9176, + "step": 12480 + }, + { + "epoch": 0.6077471818469554, + "grad_norm": 1.9427241086959839, + "learning_rate": 1.4082437412726535e-05, + "loss": 0.8945, + "step": 12481 + }, + { + "epoch": 0.6077958756360626, + "grad_norm": 1.5919418334960938, + "learning_rate": 1.4079424353116628e-05, + "loss": 0.8475, + "step": 12482 + }, + { + "epoch": 0.6078445694251698, + "grad_norm": 4.825352191925049, + "learning_rate": 1.4076411440781581e-05, + "loss": 0.8315, + "step": 12483 + }, + { + "epoch": 0.6078932632142771, + "grad_norm": 1.856233835220337, + "learning_rate": 1.4073398675796348e-05, + "loss": 0.7782, + "step": 12484 + }, + { + "epoch": 0.6079419570033843, + "grad_norm": 1.5898691415786743, + "learning_rate": 1.4070386058235851e-05, + "loss": 0.9191, + "step": 12485 + }, + { + "epoch": 0.6079906507924914, + "grad_norm": 2.097346782684326, + "learning_rate": 1.4067373588175053e-05, + "loss": 0.8184, + "step": 12486 + }, + { + "epoch": 0.6080393445815986, + "grad_norm": 1.2271074056625366, + "learning_rate": 1.4064361265688872e-05, + "loss": 0.8478, + "step": 12487 + }, + { + "epoch": 0.6080880383707058, + "grad_norm": 2.3155133724212646, + "learning_rate": 1.4061349090852248e-05, + "loss": 0.809, + "step": 12488 + }, + { + "epoch": 0.608136732159813, + "grad_norm": 1.5399956703186035, + "learning_rate": 1.4058337063740102e-05, + "loss": 0.86, + "step": 12489 + }, + { + "epoch": 0.6081854259489202, + "grad_norm": 1.4548341035842896, + "learning_rate": 1.4055325184427373e-05, + "loss": 0.7589, + "step": 12490 + }, + { + "epoch": 0.6082341197380274, + "grad_norm": 2.0438637733459473, + "learning_rate": 1.4052313452988961e-05, + "loss": 0.7804, + "step": 12491 + }, + { + "epoch": 0.6082828135271346, + "grad_norm": 2.614126443862915, + "learning_rate": 1.4049301869499794e-05, + "loss": 0.7228, + "step": 12492 + }, + { + "epoch": 0.6083315073162419, + "grad_norm": 2.8227193355560303, + "learning_rate": 1.4046290434034797e-05, + "loss": 0.8186, + "step": 12493 + }, + { + "epoch": 0.608380201105349, + "grad_norm": 1.8575432300567627, + "learning_rate": 1.4043279146668858e-05, + "loss": 0.8921, + "step": 12494 + }, + { + "epoch": 0.6084288948944562, + "grad_norm": 1.5278903245925903, + "learning_rate": 1.4040268007476898e-05, + "loss": 0.8121, + "step": 12495 + }, + { + "epoch": 0.6084775886835634, + "grad_norm": 1.4403331279754639, + "learning_rate": 1.403725701653381e-05, + "loss": 0.8436, + "step": 12496 + }, + { + "epoch": 0.6085262824726706, + "grad_norm": 1.4081292152404785, + "learning_rate": 1.4034246173914507e-05, + "loss": 0.8158, + "step": 12497 + }, + { + "epoch": 0.6085749762617778, + "grad_norm": 2.017263889312744, + "learning_rate": 1.4031235479693865e-05, + "loss": 0.8318, + "step": 12498 + }, + { + "epoch": 0.608623670050885, + "grad_norm": 1.8236063718795776, + "learning_rate": 1.4028224933946794e-05, + "loss": 0.8586, + "step": 12499 + }, + { + "epoch": 0.6086723638399922, + "grad_norm": 1.6658844947814941, + "learning_rate": 1.4025214536748162e-05, + "loss": 0.8208, + "step": 12500 + }, + { + "epoch": 0.6087210576290994, + "grad_norm": 1.5289415121078491, + "learning_rate": 1.4022204288172869e-05, + "loss": 0.9481, + "step": 12501 + }, + { + "epoch": 0.6087697514182067, + "grad_norm": 0.0880969688296318, + "learning_rate": 1.4019194188295787e-05, + "loss": 0.5794, + "step": 12502 + }, + { + "epoch": 0.6088184452073138, + "grad_norm": 1.6731916666030884, + "learning_rate": 1.401618423719179e-05, + "loss": 0.7921, + "step": 12503 + }, + { + "epoch": 0.608867138996421, + "grad_norm": 1.3674051761627197, + "learning_rate": 1.4013174434935767e-05, + "loss": 0.7037, + "step": 12504 + }, + { + "epoch": 0.6089158327855282, + "grad_norm": 1.6193268299102783, + "learning_rate": 1.4010164781602566e-05, + "loss": 0.7414, + "step": 12505 + }, + { + "epoch": 0.6089645265746354, + "grad_norm": 1.9311074018478394, + "learning_rate": 1.4007155277267073e-05, + "loss": 0.8767, + "step": 12506 + }, + { + "epoch": 0.6090132203637426, + "grad_norm": 1.5402143001556396, + "learning_rate": 1.4004145922004128e-05, + "loss": 0.7913, + "step": 12507 + }, + { + "epoch": 0.6090619141528498, + "grad_norm": 1.3408044576644897, + "learning_rate": 1.4001136715888605e-05, + "loss": 0.8412, + "step": 12508 + }, + { + "epoch": 0.609110607941957, + "grad_norm": 1.6679232120513916, + "learning_rate": 1.399812765899535e-05, + "loss": 0.8055, + "step": 12509 + }, + { + "epoch": 0.6091593017310643, + "grad_norm": 1.4199765920639038, + "learning_rate": 1.3995118751399224e-05, + "loss": 0.6771, + "step": 12510 + }, + { + "epoch": 0.6092079955201714, + "grad_norm": 1.8169621229171753, + "learning_rate": 1.3992109993175058e-05, + "loss": 0.8372, + "step": 12511 + }, + { + "epoch": 0.6092566893092786, + "grad_norm": 1.6975444555282593, + "learning_rate": 1.3989101384397703e-05, + "loss": 0.8704, + "step": 12512 + }, + { + "epoch": 0.6093053830983858, + "grad_norm": 3.357069253921509, + "learning_rate": 1.398609292514201e-05, + "loss": 0.7097, + "step": 12513 + }, + { + "epoch": 0.609354076887493, + "grad_norm": 1.8567746877670288, + "learning_rate": 1.3983084615482794e-05, + "loss": 0.8862, + "step": 12514 + }, + { + "epoch": 0.6094027706766002, + "grad_norm": 1.7505860328674316, + "learning_rate": 1.3980076455494903e-05, + "loss": 0.78, + "step": 12515 + }, + { + "epoch": 0.6094514644657074, + "grad_norm": 1.6775822639465332, + "learning_rate": 1.3977068445253155e-05, + "loss": 0.8633, + "step": 12516 + }, + { + "epoch": 0.6095001582548146, + "grad_norm": 1.5859200954437256, + "learning_rate": 1.397406058483238e-05, + "loss": 0.8534, + "step": 12517 + }, + { + "epoch": 0.6095488520439218, + "grad_norm": 1.6152092218399048, + "learning_rate": 1.3971052874307396e-05, + "loss": 0.7947, + "step": 12518 + }, + { + "epoch": 0.6095975458330289, + "grad_norm": 1.7825578451156616, + "learning_rate": 1.3968045313753025e-05, + "loss": 0.8266, + "step": 12519 + }, + { + "epoch": 0.6096462396221362, + "grad_norm": 0.0932086780667305, + "learning_rate": 1.3965037903244072e-05, + "loss": 0.6285, + "step": 12520 + }, + { + "epoch": 0.6096949334112434, + "grad_norm": 2.4603240489959717, + "learning_rate": 1.396203064285535e-05, + "loss": 0.8984, + "step": 12521 + }, + { + "epoch": 0.6097436272003506, + "grad_norm": 1.5319037437438965, + "learning_rate": 1.3959023532661676e-05, + "loss": 0.8536, + "step": 12522 + }, + { + "epoch": 0.6097923209894578, + "grad_norm": 1.67697012424469, + "learning_rate": 1.3956016572737831e-05, + "loss": 0.836, + "step": 12523 + }, + { + "epoch": 0.609841014778565, + "grad_norm": 1.6825549602508545, + "learning_rate": 1.3953009763158631e-05, + "loss": 0.8981, + "step": 12524 + }, + { + "epoch": 0.6098897085676722, + "grad_norm": 4.075835227966309, + "learning_rate": 1.3950003103998863e-05, + "loss": 0.8119, + "step": 12525 + }, + { + "epoch": 0.6099384023567794, + "grad_norm": 1.8320279121398926, + "learning_rate": 1.3946996595333324e-05, + "loss": 0.8125, + "step": 12526 + }, + { + "epoch": 0.6099870961458866, + "grad_norm": 4.53490686416626, + "learning_rate": 1.3943990237236791e-05, + "loss": 0.846, + "step": 12527 + }, + { + "epoch": 0.6100357899349937, + "grad_norm": 2.52449631690979, + "learning_rate": 1.3940984029784059e-05, + "loss": 0.8097, + "step": 12528 + }, + { + "epoch": 0.610084483724101, + "grad_norm": 1.9383010864257812, + "learning_rate": 1.3937977973049894e-05, + "loss": 0.9229, + "step": 12529 + }, + { + "epoch": 0.6101331775132082, + "grad_norm": 2.1079137325286865, + "learning_rate": 1.3934972067109085e-05, + "loss": 0.8372, + "step": 12530 + }, + { + "epoch": 0.6101818713023154, + "grad_norm": 1.9947624206542969, + "learning_rate": 1.3931966312036394e-05, + "loss": 0.803, + "step": 12531 + }, + { + "epoch": 0.6102305650914226, + "grad_norm": 2.723886013031006, + "learning_rate": 1.3928960707906592e-05, + "loss": 0.823, + "step": 12532 + }, + { + "epoch": 0.6102792588805298, + "grad_norm": 1.4471882581710815, + "learning_rate": 1.3925955254794457e-05, + "loss": 0.8503, + "step": 12533 + }, + { + "epoch": 0.610327952669637, + "grad_norm": 1.439642071723938, + "learning_rate": 1.3922949952774729e-05, + "loss": 0.8799, + "step": 12534 + }, + { + "epoch": 0.6103766464587442, + "grad_norm": 2.669076919555664, + "learning_rate": 1.3919944801922186e-05, + "loss": 0.9068, + "step": 12535 + }, + { + "epoch": 0.6104253402478513, + "grad_norm": 1.844169020652771, + "learning_rate": 1.3916939802311558e-05, + "loss": 0.8108, + "step": 12536 + }, + { + "epoch": 0.6104740340369585, + "grad_norm": 2.4728095531463623, + "learning_rate": 1.3913934954017613e-05, + "loss": 0.8161, + "step": 12537 + }, + { + "epoch": 0.6105227278260658, + "grad_norm": 2.4509971141815186, + "learning_rate": 1.3910930257115087e-05, + "loss": 0.7951, + "step": 12538 + }, + { + "epoch": 0.610571421615173, + "grad_norm": 4.464463233947754, + "learning_rate": 1.3907925711678733e-05, + "loss": 0.9008, + "step": 12539 + }, + { + "epoch": 0.6106201154042802, + "grad_norm": 4.249631881713867, + "learning_rate": 1.3904921317783276e-05, + "loss": 0.793, + "step": 12540 + }, + { + "epoch": 0.6106688091933874, + "grad_norm": 1.773122787475586, + "learning_rate": 1.3901917075503458e-05, + "loss": 0.891, + "step": 12541 + }, + { + "epoch": 0.6107175029824946, + "grad_norm": 4.277166366577148, + "learning_rate": 1.3898912984914015e-05, + "loss": 0.7935, + "step": 12542 + }, + { + "epoch": 0.6107661967716018, + "grad_norm": 1.633736491203308, + "learning_rate": 1.389590904608966e-05, + "loss": 0.8007, + "step": 12543 + }, + { + "epoch": 0.610814890560709, + "grad_norm": 1.9136277437210083, + "learning_rate": 1.3892905259105128e-05, + "loss": 0.8456, + "step": 12544 + }, + { + "epoch": 0.6108635843498161, + "grad_norm": 1.5882198810577393, + "learning_rate": 1.3889901624035134e-05, + "loss": 0.8526, + "step": 12545 + }, + { + "epoch": 0.6109122781389233, + "grad_norm": 0.09518910944461823, + "learning_rate": 1.3886898140954394e-05, + "loss": 0.5669, + "step": 12546 + }, + { + "epoch": 0.6109609719280306, + "grad_norm": 1.6980531215667725, + "learning_rate": 1.3883894809937618e-05, + "loss": 0.8945, + "step": 12547 + }, + { + "epoch": 0.6110096657171378, + "grad_norm": 1.7519769668579102, + "learning_rate": 1.3880891631059524e-05, + "loss": 0.8395, + "step": 12548 + }, + { + "epoch": 0.611058359506245, + "grad_norm": 2.1191368103027344, + "learning_rate": 1.3877888604394798e-05, + "loss": 0.8669, + "step": 12549 + }, + { + "epoch": 0.6111070532953522, + "grad_norm": 1.3335853815078735, + "learning_rate": 1.3874885730018162e-05, + "loss": 0.933, + "step": 12550 + }, + { + "epoch": 0.6111557470844594, + "grad_norm": 1.40567147731781, + "learning_rate": 1.3871883008004292e-05, + "loss": 0.8028, + "step": 12551 + }, + { + "epoch": 0.6112044408735666, + "grad_norm": 1.7358287572860718, + "learning_rate": 1.3868880438427891e-05, + "loss": 0.8621, + "step": 12552 + }, + { + "epoch": 0.6112531346626737, + "grad_norm": 2.7315995693206787, + "learning_rate": 1.3865878021363655e-05, + "loss": 0.8084, + "step": 12553 + }, + { + "epoch": 0.6113018284517809, + "grad_norm": 2.318020820617676, + "learning_rate": 1.3862875756886255e-05, + "loss": 0.6443, + "step": 12554 + }, + { + "epoch": 0.6113505222408882, + "grad_norm": 2.416433095932007, + "learning_rate": 1.385987364507039e-05, + "loss": 0.7766, + "step": 12555 + }, + { + "epoch": 0.6113992160299954, + "grad_norm": 2.1571788787841797, + "learning_rate": 1.3856871685990718e-05, + "loss": 0.7955, + "step": 12556 + }, + { + "epoch": 0.6114479098191026, + "grad_norm": 2.856052875518799, + "learning_rate": 1.3853869879721932e-05, + "loss": 0.8163, + "step": 12557 + }, + { + "epoch": 0.6114966036082098, + "grad_norm": 1.932163953781128, + "learning_rate": 1.3850868226338685e-05, + "loss": 0.8625, + "step": 12558 + }, + { + "epoch": 0.611545297397317, + "grad_norm": 2.049795389175415, + "learning_rate": 1.3847866725915657e-05, + "loss": 0.7972, + "step": 12559 + }, + { + "epoch": 0.6115939911864242, + "grad_norm": 1.7865198850631714, + "learning_rate": 1.3844865378527499e-05, + "loss": 0.8462, + "step": 12560 + }, + { + "epoch": 0.6116426849755314, + "grad_norm": 0.09668445587158203, + "learning_rate": 1.3841864184248879e-05, + "loss": 0.588, + "step": 12561 + }, + { + "epoch": 0.6116913787646385, + "grad_norm": 1.335359811782837, + "learning_rate": 1.3838863143154453e-05, + "loss": 0.7832, + "step": 12562 + }, + { + "epoch": 0.6117400725537457, + "grad_norm": 2.1777257919311523, + "learning_rate": 1.3835862255318863e-05, + "loss": 0.7628, + "step": 12563 + }, + { + "epoch": 0.611788766342853, + "grad_norm": 1.9450232982635498, + "learning_rate": 1.3832861520816767e-05, + "loss": 0.8272, + "step": 12564 + }, + { + "epoch": 0.6118374601319602, + "grad_norm": 0.09235995262861252, + "learning_rate": 1.3829860939722797e-05, + "loss": 0.5325, + "step": 12565 + }, + { + "epoch": 0.6118861539210674, + "grad_norm": 1.7583138942718506, + "learning_rate": 1.3826860512111604e-05, + "loss": 0.8448, + "step": 12566 + }, + { + "epoch": 0.6119348477101746, + "grad_norm": 1.6931896209716797, + "learning_rate": 1.382386023805781e-05, + "loss": 0.8415, + "step": 12567 + }, + { + "epoch": 0.6119835414992818, + "grad_norm": 1.6119074821472168, + "learning_rate": 1.3820860117636069e-05, + "loss": 0.8348, + "step": 12568 + }, + { + "epoch": 0.612032235288389, + "grad_norm": 2.161738872528076, + "learning_rate": 1.3817860150920982e-05, + "loss": 0.8579, + "step": 12569 + }, + { + "epoch": 0.6120809290774961, + "grad_norm": 1.9282994270324707, + "learning_rate": 1.3814860337987201e-05, + "loss": 0.9648, + "step": 12570 + }, + { + "epoch": 0.6121296228666033, + "grad_norm": 1.6068655252456665, + "learning_rate": 1.3811860678909321e-05, + "loss": 0.8301, + "step": 12571 + }, + { + "epoch": 0.6121783166557105, + "grad_norm": 1.9341202974319458, + "learning_rate": 1.3808861173761974e-05, + "loss": 0.8268, + "step": 12572 + }, + { + "epoch": 0.6122270104448178, + "grad_norm": 2.8390607833862305, + "learning_rate": 1.3805861822619777e-05, + "loss": 0.7984, + "step": 12573 + }, + { + "epoch": 0.612275704233925, + "grad_norm": 1.569549322128296, + "learning_rate": 1.3802862625557322e-05, + "loss": 0.7527, + "step": 12574 + }, + { + "epoch": 0.6123243980230322, + "grad_norm": 1.792353868484497, + "learning_rate": 1.379986358264923e-05, + "loss": 0.8646, + "step": 12575 + }, + { + "epoch": 0.6123730918121394, + "grad_norm": 1.5624895095825195, + "learning_rate": 1.379686469397009e-05, + "loss": 0.7465, + "step": 12576 + }, + { + "epoch": 0.6124217856012466, + "grad_norm": 1.2433286905288696, + "learning_rate": 1.379386595959452e-05, + "loss": 0.8042, + "step": 12577 + }, + { + "epoch": 0.6124704793903537, + "grad_norm": 1.8866803646087646, + "learning_rate": 1.3790867379597085e-05, + "loss": 0.9186, + "step": 12578 + }, + { + "epoch": 0.6125191731794609, + "grad_norm": 1.8467179536819458, + "learning_rate": 1.3787868954052404e-05, + "loss": 0.7752, + "step": 12579 + }, + { + "epoch": 0.6125678669685681, + "grad_norm": 2.0106945037841797, + "learning_rate": 1.3784870683035038e-05, + "loss": 0.8234, + "step": 12580 + }, + { + "epoch": 0.6126165607576753, + "grad_norm": 1.544244647026062, + "learning_rate": 1.378187256661958e-05, + "loss": 0.762, + "step": 12581 + }, + { + "epoch": 0.6126652545467826, + "grad_norm": 1.643659234046936, + "learning_rate": 1.3778874604880614e-05, + "loss": 0.8651, + "step": 12582 + }, + { + "epoch": 0.6127139483358898, + "grad_norm": 1.235331654548645, + "learning_rate": 1.3775876797892706e-05, + "loss": 0.8749, + "step": 12583 + }, + { + "epoch": 0.612762642124997, + "grad_norm": 2.2436306476593018, + "learning_rate": 1.377287914573044e-05, + "loss": 1.0033, + "step": 12584 + }, + { + "epoch": 0.6128113359141042, + "grad_norm": 2.386275291442871, + "learning_rate": 1.3769881648468361e-05, + "loss": 0.8598, + "step": 12585 + }, + { + "epoch": 0.6128600297032114, + "grad_norm": 0.10043513029813766, + "learning_rate": 1.3766884306181055e-05, + "loss": 0.5838, + "step": 12586 + }, + { + "epoch": 0.6129087234923185, + "grad_norm": 1.2996892929077148, + "learning_rate": 1.376388711894306e-05, + "loss": 0.7718, + "step": 12587 + }, + { + "epoch": 0.6129574172814257, + "grad_norm": 1.269775629043579, + "learning_rate": 1.3760890086828946e-05, + "loss": 0.7898, + "step": 12588 + }, + { + "epoch": 0.6130061110705329, + "grad_norm": 2.7573297023773193, + "learning_rate": 1.3757893209913258e-05, + "loss": 0.7155, + "step": 12589 + }, + { + "epoch": 0.6130548048596401, + "grad_norm": 1.5574891567230225, + "learning_rate": 1.375489648827055e-05, + "loss": 0.706, + "step": 12590 + }, + { + "epoch": 0.6131034986487474, + "grad_norm": 2.146038770675659, + "learning_rate": 1.3751899921975356e-05, + "loss": 0.913, + "step": 12591 + }, + { + "epoch": 0.6131521924378546, + "grad_norm": 3.680983066558838, + "learning_rate": 1.3748903511102221e-05, + "loss": 0.7822, + "step": 12592 + }, + { + "epoch": 0.6132008862269618, + "grad_norm": 2.3941924571990967, + "learning_rate": 1.3745907255725688e-05, + "loss": 0.889, + "step": 12593 + }, + { + "epoch": 0.613249580016069, + "grad_norm": 0.09033434838056564, + "learning_rate": 1.3742911155920275e-05, + "loss": 0.6282, + "step": 12594 + }, + { + "epoch": 0.6132982738051761, + "grad_norm": 1.406152367591858, + "learning_rate": 1.373991521176052e-05, + "loss": 0.8053, + "step": 12595 + }, + { + "epoch": 0.6133469675942833, + "grad_norm": 1.614616870880127, + "learning_rate": 1.3736919423320942e-05, + "loss": 0.8182, + "step": 12596 + }, + { + "epoch": 0.6133956613833905, + "grad_norm": 1.5945000648498535, + "learning_rate": 1.3733923790676071e-05, + "loss": 0.8366, + "step": 12597 + }, + { + "epoch": 0.6134443551724977, + "grad_norm": 1.4118098020553589, + "learning_rate": 1.3730928313900411e-05, + "loss": 0.6571, + "step": 12598 + }, + { + "epoch": 0.613493048961605, + "grad_norm": 0.0893913134932518, + "learning_rate": 1.3727932993068489e-05, + "loss": 0.6074, + "step": 12599 + }, + { + "epoch": 0.6135417427507122, + "grad_norm": 3.974210023880005, + "learning_rate": 1.3724937828254797e-05, + "loss": 0.8102, + "step": 12600 + }, + { + "epoch": 0.6135904365398194, + "grad_norm": 1.8392126560211182, + "learning_rate": 1.372194281953385e-05, + "loss": 0.7594, + "step": 12601 + }, + { + "epoch": 0.6136391303289266, + "grad_norm": 2.3019120693206787, + "learning_rate": 1.3718947966980153e-05, + "loss": 0.8721, + "step": 12602 + }, + { + "epoch": 0.6136878241180338, + "grad_norm": 1.3490082025527954, + "learning_rate": 1.3715953270668193e-05, + "loss": 0.7989, + "step": 12603 + }, + { + "epoch": 0.6137365179071409, + "grad_norm": 0.09576999396085739, + "learning_rate": 1.371295873067247e-05, + "loss": 0.6223, + "step": 12604 + }, + { + "epoch": 0.6137852116962481, + "grad_norm": 2.609104871749878, + "learning_rate": 1.3709964347067467e-05, + "loss": 0.8289, + "step": 12605 + }, + { + "epoch": 0.6138339054853553, + "grad_norm": 2.2463722229003906, + "learning_rate": 1.3706970119927686e-05, + "loss": 0.8664, + "step": 12606 + }, + { + "epoch": 0.6138825992744625, + "grad_norm": 9.40267276763916, + "learning_rate": 1.3703976049327586e-05, + "loss": 0.8719, + "step": 12607 + }, + { + "epoch": 0.6139312930635697, + "grad_norm": 1.563516616821289, + "learning_rate": 1.3700982135341664e-05, + "loss": 0.7532, + "step": 12608 + }, + { + "epoch": 0.613979986852677, + "grad_norm": 1.9881778955459595, + "learning_rate": 1.3697988378044378e-05, + "loss": 0.8282, + "step": 12609 + }, + { + "epoch": 0.6140286806417842, + "grad_norm": 1.596873164176941, + "learning_rate": 1.3694994777510205e-05, + "loss": 0.9045, + "step": 12610 + }, + { + "epoch": 0.6140773744308914, + "grad_norm": 2.338106393814087, + "learning_rate": 1.3692001333813618e-05, + "loss": 0.8196, + "step": 12611 + }, + { + "epoch": 0.6141260682199985, + "grad_norm": 1.654788613319397, + "learning_rate": 1.3689008047029066e-05, + "loss": 0.7998, + "step": 12612 + }, + { + "epoch": 0.6141747620091057, + "grad_norm": 1.8769783973693848, + "learning_rate": 1.3686014917231027e-05, + "loss": 0.8278, + "step": 12613 + }, + { + "epoch": 0.6142234557982129, + "grad_norm": 1.5737911462783813, + "learning_rate": 1.368302194449393e-05, + "loss": 0.7929, + "step": 12614 + }, + { + "epoch": 0.6142721495873201, + "grad_norm": 1.5680757761001587, + "learning_rate": 1.3680029128892249e-05, + "loss": 0.7686, + "step": 12615 + }, + { + "epoch": 0.6143208433764273, + "grad_norm": 1.3742659091949463, + "learning_rate": 1.3677036470500408e-05, + "loss": 0.7404, + "step": 12616 + }, + { + "epoch": 0.6143695371655346, + "grad_norm": 2.3724677562713623, + "learning_rate": 1.367404396939287e-05, + "loss": 0.8817, + "step": 12617 + }, + { + "epoch": 0.6144182309546418, + "grad_norm": 1.760157585144043, + "learning_rate": 1.3671051625644059e-05, + "loss": 0.8337, + "step": 12618 + }, + { + "epoch": 0.614466924743749, + "grad_norm": 1.3837436437606812, + "learning_rate": 1.3668059439328422e-05, + "loss": 0.7826, + "step": 12619 + }, + { + "epoch": 0.6145156185328562, + "grad_norm": 1.432826280593872, + "learning_rate": 1.3665067410520378e-05, + "loss": 0.7388, + "step": 12620 + }, + { + "epoch": 0.6145643123219633, + "grad_norm": 1.4386571645736694, + "learning_rate": 1.3662075539294359e-05, + "loss": 0.8549, + "step": 12621 + }, + { + "epoch": 0.6146130061110705, + "grad_norm": 1.7738354206085205, + "learning_rate": 1.3659083825724796e-05, + "loss": 0.9392, + "step": 12622 + }, + { + "epoch": 0.6146616999001777, + "grad_norm": 1.7554527521133423, + "learning_rate": 1.3656092269886094e-05, + "loss": 0.8951, + "step": 12623 + }, + { + "epoch": 0.6147103936892849, + "grad_norm": 2.7163543701171875, + "learning_rate": 1.3653100871852681e-05, + "loss": 0.8161, + "step": 12624 + }, + { + "epoch": 0.6147590874783921, + "grad_norm": 1.5135892629623413, + "learning_rate": 1.365010963169896e-05, + "loss": 0.9439, + "step": 12625 + }, + { + "epoch": 0.6148077812674994, + "grad_norm": 1.261088490486145, + "learning_rate": 1.3647118549499342e-05, + "loss": 0.852, + "step": 12626 + }, + { + "epoch": 0.6148564750566066, + "grad_norm": 1.4919402599334717, + "learning_rate": 1.3644127625328225e-05, + "loss": 0.7797, + "step": 12627 + }, + { + "epoch": 0.6149051688457138, + "grad_norm": 1.3490078449249268, + "learning_rate": 1.3641136859260021e-05, + "loss": 0.7553, + "step": 12628 + }, + { + "epoch": 0.6149538626348209, + "grad_norm": 1.4984822273254395, + "learning_rate": 1.3638146251369109e-05, + "loss": 0.956, + "step": 12629 + }, + { + "epoch": 0.6150025564239281, + "grad_norm": 13.250103950500488, + "learning_rate": 1.363515580172989e-05, + "loss": 0.7773, + "step": 12630 + }, + { + "epoch": 0.6150512502130353, + "grad_norm": 1.5867308378219604, + "learning_rate": 1.3632165510416756e-05, + "loss": 0.8317, + "step": 12631 + }, + { + "epoch": 0.6150999440021425, + "grad_norm": 2.000216484069824, + "learning_rate": 1.3629175377504078e-05, + "loss": 0.9093, + "step": 12632 + }, + { + "epoch": 0.6151486377912497, + "grad_norm": 1.9185147285461426, + "learning_rate": 1.3626185403066253e-05, + "loss": 0.816, + "step": 12633 + }, + { + "epoch": 0.6151973315803569, + "grad_norm": 1.3966950178146362, + "learning_rate": 1.3623195587177635e-05, + "loss": 0.9138, + "step": 12634 + }, + { + "epoch": 0.6152460253694642, + "grad_norm": 1.6298916339874268, + "learning_rate": 1.3620205929912622e-05, + "loss": 0.8983, + "step": 12635 + }, + { + "epoch": 0.6152947191585714, + "grad_norm": 1.6557304859161377, + "learning_rate": 1.3617216431345556e-05, + "loss": 0.8048, + "step": 12636 + }, + { + "epoch": 0.6153434129476785, + "grad_norm": 1.6189130544662476, + "learning_rate": 1.3614227091550824e-05, + "loss": 0.8789, + "step": 12637 + }, + { + "epoch": 0.6153921067367857, + "grad_norm": 2.5859177112579346, + "learning_rate": 1.3611237910602769e-05, + "loss": 0.8447, + "step": 12638 + }, + { + "epoch": 0.6154408005258929, + "grad_norm": 1.5055410861968994, + "learning_rate": 1.3608248888575755e-05, + "loss": 0.8046, + "step": 12639 + }, + { + "epoch": 0.6154894943150001, + "grad_norm": 1.480710744857788, + "learning_rate": 1.3605260025544129e-05, + "loss": 0.8684, + "step": 12640 + }, + { + "epoch": 0.6155381881041073, + "grad_norm": 0.09974682331085205, + "learning_rate": 1.3602271321582243e-05, + "loss": 0.6654, + "step": 12641 + }, + { + "epoch": 0.6155868818932145, + "grad_norm": 1.2705618143081665, + "learning_rate": 1.359928277676445e-05, + "loss": 0.8782, + "step": 12642 + }, + { + "epoch": 0.6156355756823217, + "grad_norm": 2.8210361003875732, + "learning_rate": 1.3596294391165073e-05, + "loss": 0.8263, + "step": 12643 + }, + { + "epoch": 0.615684269471429, + "grad_norm": 2.5679476261138916, + "learning_rate": 1.3593306164858464e-05, + "loss": 0.776, + "step": 12644 + }, + { + "epoch": 0.6157329632605362, + "grad_norm": 1.7053534984588623, + "learning_rate": 1.359031809791894e-05, + "loss": 0.797, + "step": 12645 + }, + { + "epoch": 0.6157816570496433, + "grad_norm": 1.565231442451477, + "learning_rate": 1.3587330190420845e-05, + "loss": 0.7613, + "step": 12646 + }, + { + "epoch": 0.6158303508387505, + "grad_norm": 1.5022839307785034, + "learning_rate": 1.3584342442438488e-05, + "loss": 0.8039, + "step": 12647 + }, + { + "epoch": 0.6158790446278577, + "grad_norm": 2.206557273864746, + "learning_rate": 1.3581354854046207e-05, + "loss": 0.8585, + "step": 12648 + }, + { + "epoch": 0.6159277384169649, + "grad_norm": 1.382250189781189, + "learning_rate": 1.3578367425318302e-05, + "loss": 0.8822, + "step": 12649 + }, + { + "epoch": 0.6159764322060721, + "grad_norm": 1.5799142122268677, + "learning_rate": 1.3575380156329093e-05, + "loss": 0.87, + "step": 12650 + }, + { + "epoch": 0.6160251259951793, + "grad_norm": 1.5579986572265625, + "learning_rate": 1.3572393047152894e-05, + "loss": 0.7654, + "step": 12651 + }, + { + "epoch": 0.6160738197842865, + "grad_norm": 1.520314335823059, + "learning_rate": 1.3569406097864e-05, + "loss": 0.8354, + "step": 12652 + }, + { + "epoch": 0.6161225135733938, + "grad_norm": 1.2661840915679932, + "learning_rate": 1.3566419308536718e-05, + "loss": 0.7639, + "step": 12653 + }, + { + "epoch": 0.6161712073625009, + "grad_norm": 0.09435755014419556, + "learning_rate": 1.3563432679245341e-05, + "loss": 0.5937, + "step": 12654 + }, + { + "epoch": 0.6162199011516081, + "grad_norm": 1.1749062538146973, + "learning_rate": 1.3560446210064165e-05, + "loss": 0.7757, + "step": 12655 + }, + { + "epoch": 0.6162685949407153, + "grad_norm": 1.401623249053955, + "learning_rate": 1.355745990106747e-05, + "loss": 0.8152, + "step": 12656 + }, + { + "epoch": 0.6163172887298225, + "grad_norm": 1.9968374967575073, + "learning_rate": 1.3554473752329558e-05, + "loss": 0.768, + "step": 12657 + }, + { + "epoch": 0.6163659825189297, + "grad_norm": 0.18711698055267334, + "learning_rate": 1.3551487763924688e-05, + "loss": 0.5881, + "step": 12658 + }, + { + "epoch": 0.6164146763080369, + "grad_norm": 2.520855665206909, + "learning_rate": 1.3548501935927158e-05, + "loss": 0.8848, + "step": 12659 + }, + { + "epoch": 0.6164633700971441, + "grad_norm": 1.8570678234100342, + "learning_rate": 1.3545516268411222e-05, + "loss": 0.8171, + "step": 12660 + }, + { + "epoch": 0.6165120638862513, + "grad_norm": 1.223624587059021, + "learning_rate": 1.354253076145116e-05, + "loss": 0.8257, + "step": 12661 + }, + { + "epoch": 0.6165607576753586, + "grad_norm": 1.7826579809188843, + "learning_rate": 1.3539545415121236e-05, + "loss": 0.7462, + "step": 12662 + }, + { + "epoch": 0.6166094514644657, + "grad_norm": 1.4481881856918335, + "learning_rate": 1.3536560229495704e-05, + "loss": 0.7934, + "step": 12663 + }, + { + "epoch": 0.6166581452535729, + "grad_norm": 2.954493999481201, + "learning_rate": 1.3533575204648832e-05, + "loss": 0.8373, + "step": 12664 + }, + { + "epoch": 0.6167068390426801, + "grad_norm": 1.521241307258606, + "learning_rate": 1.353059034065486e-05, + "loss": 0.8079, + "step": 12665 + }, + { + "epoch": 0.6167555328317873, + "grad_norm": 1.5453590154647827, + "learning_rate": 1.3527605637588055e-05, + "loss": 0.7565, + "step": 12666 + }, + { + "epoch": 0.6168042266208945, + "grad_norm": 1.5979052782058716, + "learning_rate": 1.3524621095522638e-05, + "loss": 0.8629, + "step": 12667 + }, + { + "epoch": 0.6168529204100017, + "grad_norm": 1.5030722618103027, + "learning_rate": 1.3521636714532864e-05, + "loss": 0.8444, + "step": 12668 + }, + { + "epoch": 0.6169016141991089, + "grad_norm": 1.6960049867630005, + "learning_rate": 1.3518652494692966e-05, + "loss": 0.7868, + "step": 12669 + }, + { + "epoch": 0.6169503079882162, + "grad_norm": 1.3338419198989868, + "learning_rate": 1.3515668436077179e-05, + "loss": 0.8087, + "step": 12670 + }, + { + "epoch": 0.6169990017773233, + "grad_norm": 1.5204341411590576, + "learning_rate": 1.3512684538759738e-05, + "loss": 0.7358, + "step": 12671 + }, + { + "epoch": 0.6170476955664305, + "grad_norm": 1.66019868850708, + "learning_rate": 1.3509700802814851e-05, + "loss": 0.7716, + "step": 12672 + }, + { + "epoch": 0.6170963893555377, + "grad_norm": 2.2745535373687744, + "learning_rate": 1.3506717228316757e-05, + "loss": 0.7963, + "step": 12673 + }, + { + "epoch": 0.6171450831446449, + "grad_norm": 1.410529613494873, + "learning_rate": 1.3503733815339657e-05, + "loss": 0.8708, + "step": 12674 + }, + { + "epoch": 0.6171937769337521, + "grad_norm": 2.922642230987549, + "learning_rate": 1.3500750563957775e-05, + "loss": 0.8261, + "step": 12675 + }, + { + "epoch": 0.6172424707228593, + "grad_norm": 0.09810113906860352, + "learning_rate": 1.3497767474245312e-05, + "loss": 0.7089, + "step": 12676 + }, + { + "epoch": 0.6172911645119665, + "grad_norm": 1.6797997951507568, + "learning_rate": 1.349478454627648e-05, + "loss": 0.8526, + "step": 12677 + }, + { + "epoch": 0.6173398583010737, + "grad_norm": 3.081801414489746, + "learning_rate": 1.349180178012547e-05, + "loss": 0.8701, + "step": 12678 + }, + { + "epoch": 0.6173885520901808, + "grad_norm": 1.4843738079071045, + "learning_rate": 1.3488819175866494e-05, + "loss": 0.8253, + "step": 12679 + }, + { + "epoch": 0.617437245879288, + "grad_norm": 1.494185447692871, + "learning_rate": 1.3485836733573723e-05, + "loss": 0.915, + "step": 12680 + }, + { + "epoch": 0.6174859396683953, + "grad_norm": 0.09385670721530914, + "learning_rate": 1.348285445332136e-05, + "loss": 0.5894, + "step": 12681 + }, + { + "epoch": 0.6175346334575025, + "grad_norm": 1.580436110496521, + "learning_rate": 1.3479872335183588e-05, + "loss": 0.9147, + "step": 12682 + }, + { + "epoch": 0.6175833272466097, + "grad_norm": 1.7291761636734009, + "learning_rate": 1.3476890379234588e-05, + "loss": 0.895, + "step": 12683 + }, + { + "epoch": 0.6176320210357169, + "grad_norm": 1.8521491289138794, + "learning_rate": 1.3473908585548533e-05, + "loss": 0.7566, + "step": 12684 + }, + { + "epoch": 0.6176807148248241, + "grad_norm": 1.6452070474624634, + "learning_rate": 1.3470926954199594e-05, + "loss": 0.7974, + "step": 12685 + }, + { + "epoch": 0.6177294086139313, + "grad_norm": 1.4069334268569946, + "learning_rate": 1.3467945485261953e-05, + "loss": 0.8431, + "step": 12686 + }, + { + "epoch": 0.6177781024030385, + "grad_norm": 1.599481463432312, + "learning_rate": 1.3464964178809753e-05, + "loss": 0.8526, + "step": 12687 + }, + { + "epoch": 0.6178267961921456, + "grad_norm": 1.3263887166976929, + "learning_rate": 1.3461983034917174e-05, + "loss": 0.9365, + "step": 12688 + }, + { + "epoch": 0.6178754899812529, + "grad_norm": 1.7379523515701294, + "learning_rate": 1.3459002053658355e-05, + "loss": 0.7689, + "step": 12689 + }, + { + "epoch": 0.6179241837703601, + "grad_norm": 7.084683895111084, + "learning_rate": 1.3456021235107453e-05, + "loss": 0.8023, + "step": 12690 + }, + { + "epoch": 0.6179728775594673, + "grad_norm": 1.5832184553146362, + "learning_rate": 1.3453040579338627e-05, + "loss": 0.8689, + "step": 12691 + }, + { + "epoch": 0.6180215713485745, + "grad_norm": 1.8969249725341797, + "learning_rate": 1.3450060086426006e-05, + "loss": 0.8302, + "step": 12692 + }, + { + "epoch": 0.6180702651376817, + "grad_norm": 1.4242278337478638, + "learning_rate": 1.3447079756443749e-05, + "loss": 0.8313, + "step": 12693 + }, + { + "epoch": 0.6181189589267889, + "grad_norm": 1.433611273765564, + "learning_rate": 1.344409958946597e-05, + "loss": 0.8158, + "step": 12694 + }, + { + "epoch": 0.6181676527158961, + "grad_norm": 1.7110555171966553, + "learning_rate": 1.3441119585566824e-05, + "loss": 0.7643, + "step": 12695 + }, + { + "epoch": 0.6182163465050032, + "grad_norm": 1.3365213871002197, + "learning_rate": 1.3438139744820414e-05, + "loss": 0.8224, + "step": 12696 + }, + { + "epoch": 0.6182650402941104, + "grad_norm": 0.09507939964532852, + "learning_rate": 1.3435160067300883e-05, + "loss": 0.6392, + "step": 12697 + }, + { + "epoch": 0.6183137340832177, + "grad_norm": 2.1094307899475098, + "learning_rate": 1.3432180553082337e-05, + "loss": 0.8214, + "step": 12698 + }, + { + "epoch": 0.6183624278723249, + "grad_norm": 5.855996608734131, + "learning_rate": 1.3429201202238909e-05, + "loss": 0.901, + "step": 12699 + }, + { + "epoch": 0.6184111216614321, + "grad_norm": 1.6574430465698242, + "learning_rate": 1.342622201484469e-05, + "loss": 0.7973, + "step": 12700 + }, + { + "epoch": 0.6184598154505393, + "grad_norm": 1.214752435684204, + "learning_rate": 1.34232429909738e-05, + "loss": 0.7994, + "step": 12701 + }, + { + "epoch": 0.6185085092396465, + "grad_norm": 1.236343264579773, + "learning_rate": 1.3420264130700347e-05, + "loss": 0.753, + "step": 12702 + }, + { + "epoch": 0.6185572030287537, + "grad_norm": 1.542936086654663, + "learning_rate": 1.3417285434098416e-05, + "loss": 0.8091, + "step": 12703 + }, + { + "epoch": 0.6186058968178609, + "grad_norm": 1.7080994844436646, + "learning_rate": 1.3414306901242112e-05, + "loss": 0.8902, + "step": 12704 + }, + { + "epoch": 0.618654590606968, + "grad_norm": 2.2450380325317383, + "learning_rate": 1.341132853220552e-05, + "loss": 0.866, + "step": 12705 + }, + { + "epoch": 0.6187032843960752, + "grad_norm": 1.6324455738067627, + "learning_rate": 1.3408350327062745e-05, + "loss": 0.7004, + "step": 12706 + }, + { + "epoch": 0.6187519781851825, + "grad_norm": 1.788133978843689, + "learning_rate": 1.3405372285887844e-05, + "loss": 0.765, + "step": 12707 + }, + { + "epoch": 0.6188006719742897, + "grad_norm": 1.7964181900024414, + "learning_rate": 1.3402394408754918e-05, + "loss": 0.7991, + "step": 12708 + }, + { + "epoch": 0.6188493657633969, + "grad_norm": 1.692314624786377, + "learning_rate": 1.3399416695738024e-05, + "loss": 0.8351, + "step": 12709 + }, + { + "epoch": 0.6188980595525041, + "grad_norm": 1.781483769416809, + "learning_rate": 1.3396439146911242e-05, + "loss": 0.7026, + "step": 12710 + }, + { + "epoch": 0.6189467533416113, + "grad_norm": 1.7687506675720215, + "learning_rate": 1.3393461762348648e-05, + "loss": 0.8277, + "step": 12711 + }, + { + "epoch": 0.6189954471307185, + "grad_norm": 1.645978331565857, + "learning_rate": 1.3390484542124284e-05, + "loss": 0.786, + "step": 12712 + }, + { + "epoch": 0.6190441409198256, + "grad_norm": 2.0406720638275146, + "learning_rate": 1.3387507486312226e-05, + "loss": 0.862, + "step": 12713 + }, + { + "epoch": 0.6190928347089328, + "grad_norm": 1.9454929828643799, + "learning_rate": 1.3384530594986516e-05, + "loss": 0.7668, + "step": 12714 + }, + { + "epoch": 0.61914152849804, + "grad_norm": 1.4738577604293823, + "learning_rate": 1.338155386822122e-05, + "loss": 0.7648, + "step": 12715 + }, + { + "epoch": 0.6191902222871473, + "grad_norm": 1.6677241325378418, + "learning_rate": 1.3378577306090366e-05, + "loss": 0.7985, + "step": 12716 + }, + { + "epoch": 0.6192389160762545, + "grad_norm": 1.3055555820465088, + "learning_rate": 1.3375600908668016e-05, + "loss": 0.8474, + "step": 12717 + }, + { + "epoch": 0.6192876098653617, + "grad_norm": 1.6942578554153442, + "learning_rate": 1.3372624676028188e-05, + "loss": 0.8151, + "step": 12718 + }, + { + "epoch": 0.6193363036544689, + "grad_norm": 2.874039649963379, + "learning_rate": 1.3369648608244923e-05, + "loss": 0.8793, + "step": 12719 + }, + { + "epoch": 0.6193849974435761, + "grad_norm": 2.0462841987609863, + "learning_rate": 1.3366672705392259e-05, + "loss": 0.804, + "step": 12720 + }, + { + "epoch": 0.6194336912326833, + "grad_norm": 2.380918264389038, + "learning_rate": 1.3363696967544214e-05, + "loss": 0.6938, + "step": 12721 + }, + { + "epoch": 0.6194823850217904, + "grad_norm": 2.571169853210449, + "learning_rate": 1.3360721394774818e-05, + "loss": 0.9071, + "step": 12722 + }, + { + "epoch": 0.6195310788108976, + "grad_norm": 1.2672415971755981, + "learning_rate": 1.3357745987158074e-05, + "loss": 0.8454, + "step": 12723 + }, + { + "epoch": 0.6195797726000049, + "grad_norm": 4.044585227966309, + "learning_rate": 1.3354770744768013e-05, + "loss": 0.8279, + "step": 12724 + }, + { + "epoch": 0.6196284663891121, + "grad_norm": 1.3984156847000122, + "learning_rate": 1.3351795667678628e-05, + "loss": 0.7669, + "step": 12725 + }, + { + "epoch": 0.6196771601782193, + "grad_norm": 2.10074520111084, + "learning_rate": 1.3348820755963935e-05, + "loss": 0.8373, + "step": 12726 + }, + { + "epoch": 0.6197258539673265, + "grad_norm": 1.5811307430267334, + "learning_rate": 1.3345846009697932e-05, + "loss": 0.8683, + "step": 12727 + }, + { + "epoch": 0.6197745477564337, + "grad_norm": 1.670799970626831, + "learning_rate": 1.334287142895462e-05, + "loss": 0.858, + "step": 12728 + }, + { + "epoch": 0.6198232415455409, + "grad_norm": 2.144251585006714, + "learning_rate": 1.3339897013807981e-05, + "loss": 0.7464, + "step": 12729 + }, + { + "epoch": 0.619871935334648, + "grad_norm": 1.2691000699996948, + "learning_rate": 1.3336922764332009e-05, + "loss": 0.8705, + "step": 12730 + }, + { + "epoch": 0.6199206291237552, + "grad_norm": 2.233297348022461, + "learning_rate": 1.3333948680600703e-05, + "loss": 0.7646, + "step": 12731 + }, + { + "epoch": 0.6199693229128624, + "grad_norm": 1.5143458843231201, + "learning_rate": 1.3330974762688022e-05, + "loss": 0.8643, + "step": 12732 + }, + { + "epoch": 0.6200180167019697, + "grad_norm": 1.5176066160202026, + "learning_rate": 1.3328001010667954e-05, + "loss": 0.7394, + "step": 12733 + }, + { + "epoch": 0.6200667104910769, + "grad_norm": 0.09348847717046738, + "learning_rate": 1.3325027424614468e-05, + "loss": 0.6524, + "step": 12734 + }, + { + "epoch": 0.6201154042801841, + "grad_norm": 1.7007962465286255, + "learning_rate": 1.3322054004601539e-05, + "loss": 0.8218, + "step": 12735 + }, + { + "epoch": 0.6201640980692913, + "grad_norm": 1.348199725151062, + "learning_rate": 1.3319080750703117e-05, + "loss": 0.8494, + "step": 12736 + }, + { + "epoch": 0.6202127918583985, + "grad_norm": 2.009018659591675, + "learning_rate": 1.3316107662993183e-05, + "loss": 0.7312, + "step": 12737 + }, + { + "epoch": 0.6202614856475056, + "grad_norm": 1.440550446510315, + "learning_rate": 1.3313134741545667e-05, + "loss": 0.7927, + "step": 12738 + }, + { + "epoch": 0.6203101794366128, + "grad_norm": 2.478874683380127, + "learning_rate": 1.3310161986434538e-05, + "loss": 0.7823, + "step": 12739 + }, + { + "epoch": 0.62035887322572, + "grad_norm": 1.7034447193145752, + "learning_rate": 1.3307189397733745e-05, + "loss": 0.8268, + "step": 12740 + }, + { + "epoch": 0.6204075670148272, + "grad_norm": 1.7376140356063843, + "learning_rate": 1.3304216975517216e-05, + "loss": 0.7801, + "step": 12741 + }, + { + "epoch": 0.6204562608039345, + "grad_norm": 1.427297830581665, + "learning_rate": 1.330124471985891e-05, + "loss": 0.7485, + "step": 12742 + }, + { + "epoch": 0.6205049545930417, + "grad_norm": 1.594024896621704, + "learning_rate": 1.3298272630832745e-05, + "loss": 0.7632, + "step": 12743 + }, + { + "epoch": 0.6205536483821489, + "grad_norm": 1.3998852968215942, + "learning_rate": 1.3295300708512668e-05, + "loss": 0.8303, + "step": 12744 + }, + { + "epoch": 0.6206023421712561, + "grad_norm": 1.711307406425476, + "learning_rate": 1.329232895297259e-05, + "loss": 0.708, + "step": 12745 + }, + { + "epoch": 0.6206510359603633, + "grad_norm": 1.1660034656524658, + "learning_rate": 1.3289357364286445e-05, + "loss": 0.8644, + "step": 12746 + }, + { + "epoch": 0.6206997297494704, + "grad_norm": 2.6712288856506348, + "learning_rate": 1.3286385942528144e-05, + "loss": 0.9009, + "step": 12747 + }, + { + "epoch": 0.6207484235385776, + "grad_norm": 0.09441959112882614, + "learning_rate": 1.3283414687771606e-05, + "loss": 0.6614, + "step": 12748 + }, + { + "epoch": 0.6207971173276848, + "grad_norm": 1.4377385377883911, + "learning_rate": 1.3280443600090736e-05, + "loss": 0.8578, + "step": 12749 + }, + { + "epoch": 0.620845811116792, + "grad_norm": 1.4520697593688965, + "learning_rate": 1.3277472679559446e-05, + "loss": 0.8241, + "step": 12750 + }, + { + "epoch": 0.6208945049058993, + "grad_norm": 2.6554319858551025, + "learning_rate": 1.3274501926251642e-05, + "loss": 0.8823, + "step": 12751 + }, + { + "epoch": 0.6209431986950065, + "grad_norm": 1.5445069074630737, + "learning_rate": 1.3271531340241208e-05, + "loss": 0.9291, + "step": 12752 + }, + { + "epoch": 0.6209918924841137, + "grad_norm": 1.7392866611480713, + "learning_rate": 1.3268560921602056e-05, + "loss": 0.8036, + "step": 12753 + }, + { + "epoch": 0.6210405862732209, + "grad_norm": 6.711293697357178, + "learning_rate": 1.3265590670408055e-05, + "loss": 0.8047, + "step": 12754 + }, + { + "epoch": 0.621089280062328, + "grad_norm": 1.5205111503601074, + "learning_rate": 1.3262620586733104e-05, + "loss": 0.7417, + "step": 12755 + }, + { + "epoch": 0.6211379738514352, + "grad_norm": 0.0980762243270874, + "learning_rate": 1.3259650670651076e-05, + "loss": 0.6379, + "step": 12756 + }, + { + "epoch": 0.6211866676405424, + "grad_norm": 1.5679161548614502, + "learning_rate": 1.3256680922235861e-05, + "loss": 0.7365, + "step": 12757 + }, + { + "epoch": 0.6212353614296496, + "grad_norm": 0.09088481962680817, + "learning_rate": 1.3253711341561313e-05, + "loss": 0.6348, + "step": 12758 + }, + { + "epoch": 0.6212840552187568, + "grad_norm": 1.3281517028808594, + "learning_rate": 1.3250741928701313e-05, + "loss": 0.7429, + "step": 12759 + }, + { + "epoch": 0.6213327490078641, + "grad_norm": 1.8039878606796265, + "learning_rate": 1.3247772683729729e-05, + "loss": 0.8196, + "step": 12760 + }, + { + "epoch": 0.6213814427969713, + "grad_norm": 0.10664180666208267, + "learning_rate": 1.3244803606720409e-05, + "loss": 0.6397, + "step": 12761 + }, + { + "epoch": 0.6214301365860785, + "grad_norm": 1.842419981956482, + "learning_rate": 1.3241834697747218e-05, + "loss": 0.7816, + "step": 12762 + }, + { + "epoch": 0.6214788303751857, + "grad_norm": 1.3255819082260132, + "learning_rate": 1.3238865956884004e-05, + "loss": 0.8362, + "step": 12763 + }, + { + "epoch": 0.6215275241642928, + "grad_norm": 2.2334768772125244, + "learning_rate": 1.3235897384204619e-05, + "loss": 0.7944, + "step": 12764 + }, + { + "epoch": 0.6215762179534, + "grad_norm": 1.4774218797683716, + "learning_rate": 1.3232928979782895e-05, + "loss": 0.7894, + "step": 12765 + }, + { + "epoch": 0.6216249117425072, + "grad_norm": 1.5772500038146973, + "learning_rate": 1.322996074369269e-05, + "loss": 0.8607, + "step": 12766 + }, + { + "epoch": 0.6216736055316144, + "grad_norm": 2.0824694633483887, + "learning_rate": 1.3226992676007822e-05, + "loss": 0.7687, + "step": 12767 + }, + { + "epoch": 0.6217222993207216, + "grad_norm": 3.0409321784973145, + "learning_rate": 1.3224024776802135e-05, + "loss": 0.8671, + "step": 12768 + }, + { + "epoch": 0.6217709931098289, + "grad_norm": 1.7277913093566895, + "learning_rate": 1.3221057046149443e-05, + "loss": 0.7397, + "step": 12769 + }, + { + "epoch": 0.6218196868989361, + "grad_norm": 1.8756592273712158, + "learning_rate": 1.3218089484123573e-05, + "loss": 0.8019, + "step": 12770 + }, + { + "epoch": 0.6218683806880433, + "grad_norm": 2.70459246635437, + "learning_rate": 1.321512209079835e-05, + "loss": 0.8507, + "step": 12771 + }, + { + "epoch": 0.6219170744771504, + "grad_norm": 1.6496447324752808, + "learning_rate": 1.3212154866247579e-05, + "loss": 0.7646, + "step": 12772 + }, + { + "epoch": 0.6219657682662576, + "grad_norm": 1.4052796363830566, + "learning_rate": 1.3209187810545085e-05, + "loss": 0.8882, + "step": 12773 + }, + { + "epoch": 0.6220144620553648, + "grad_norm": 1.7336204051971436, + "learning_rate": 1.3206220923764654e-05, + "loss": 0.7301, + "step": 12774 + }, + { + "epoch": 0.622063155844472, + "grad_norm": 1.4733299016952515, + "learning_rate": 1.3203254205980105e-05, + "loss": 0.8598, + "step": 12775 + }, + { + "epoch": 0.6221118496335792, + "grad_norm": 4.748133182525635, + "learning_rate": 1.3200287657265218e-05, + "loss": 0.8548, + "step": 12776 + }, + { + "epoch": 0.6221605434226865, + "grad_norm": 1.3064366579055786, + "learning_rate": 1.3197321277693803e-05, + "loss": 0.7952, + "step": 12777 + }, + { + "epoch": 0.6222092372117937, + "grad_norm": 1.6718246936798096, + "learning_rate": 1.3194355067339633e-05, + "loss": 0.8596, + "step": 12778 + }, + { + "epoch": 0.6222579310009009, + "grad_norm": 1.9209774732589722, + "learning_rate": 1.3191389026276504e-05, + "loss": 0.8434, + "step": 12779 + }, + { + "epoch": 0.6223066247900081, + "grad_norm": 1.8587888479232788, + "learning_rate": 1.31884231545782e-05, + "loss": 0.8705, + "step": 12780 + }, + { + "epoch": 0.6223553185791152, + "grad_norm": 1.608211636543274, + "learning_rate": 1.3185457452318486e-05, + "loss": 0.8333, + "step": 12781 + }, + { + "epoch": 0.6224040123682224, + "grad_norm": 1.806835651397705, + "learning_rate": 1.3182491919571145e-05, + "loss": 0.8039, + "step": 12782 + }, + { + "epoch": 0.6224527061573296, + "grad_norm": 1.9101148843765259, + "learning_rate": 1.3179526556409932e-05, + "loss": 0.8042, + "step": 12783 + }, + { + "epoch": 0.6225013999464368, + "grad_norm": 7.281325817108154, + "learning_rate": 1.3176561362908622e-05, + "loss": 0.7364, + "step": 12784 + }, + { + "epoch": 0.622550093735544, + "grad_norm": 1.60686194896698, + "learning_rate": 1.3173596339140963e-05, + "loss": 0.7588, + "step": 12785 + }, + { + "epoch": 0.6225987875246513, + "grad_norm": 1.868222713470459, + "learning_rate": 1.317063148518073e-05, + "loss": 0.8411, + "step": 12786 + }, + { + "epoch": 0.6226474813137585, + "grad_norm": 1.5061002969741821, + "learning_rate": 1.316766680110165e-05, + "loss": 0.8078, + "step": 12787 + }, + { + "epoch": 0.6226961751028657, + "grad_norm": 1.6014348268508911, + "learning_rate": 1.3164702286977492e-05, + "loss": 0.8194, + "step": 12788 + }, + { + "epoch": 0.6227448688919728, + "grad_norm": 2.000532627105713, + "learning_rate": 1.3161737942881978e-05, + "loss": 0.7191, + "step": 12789 + }, + { + "epoch": 0.62279356268108, + "grad_norm": 1.9379323720932007, + "learning_rate": 1.3158773768888855e-05, + "loss": 0.7812, + "step": 12790 + }, + { + "epoch": 0.6228422564701872, + "grad_norm": 2.3942630290985107, + "learning_rate": 1.3155809765071866e-05, + "loss": 0.924, + "step": 12791 + }, + { + "epoch": 0.6228909502592944, + "grad_norm": 2.5468344688415527, + "learning_rate": 1.315284593150473e-05, + "loss": 0.7853, + "step": 12792 + }, + { + "epoch": 0.6229396440484016, + "grad_norm": 1.4633896350860596, + "learning_rate": 1.3149882268261179e-05, + "loss": 0.8066, + "step": 12793 + }, + { + "epoch": 0.6229883378375088, + "grad_norm": 1.576634168624878, + "learning_rate": 1.3146918775414923e-05, + "loss": 0.7731, + "step": 12794 + }, + { + "epoch": 0.6230370316266161, + "grad_norm": 3.341169595718384, + "learning_rate": 1.3143955453039699e-05, + "loss": 0.8122, + "step": 12795 + }, + { + "epoch": 0.6230857254157233, + "grad_norm": 1.8204033374786377, + "learning_rate": 1.3140992301209198e-05, + "loss": 0.8207, + "step": 12796 + }, + { + "epoch": 0.6231344192048304, + "grad_norm": 1.4746471643447876, + "learning_rate": 1.3138029319997148e-05, + "loss": 0.8214, + "step": 12797 + }, + { + "epoch": 0.6231831129939376, + "grad_norm": 1.9041167497634888, + "learning_rate": 1.3135066509477234e-05, + "loss": 0.8694, + "step": 12798 + }, + { + "epoch": 0.6232318067830448, + "grad_norm": 1.5314185619354248, + "learning_rate": 1.313210386972317e-05, + "loss": 0.7574, + "step": 12799 + }, + { + "epoch": 0.623280500572152, + "grad_norm": 1.731155514717102, + "learning_rate": 1.312914140080865e-05, + "loss": 0.7402, + "step": 12800 + }, + { + "epoch": 0.6233291943612592, + "grad_norm": 2.0449798107147217, + "learning_rate": 1.3126179102807361e-05, + "loss": 0.8182, + "step": 12801 + }, + { + "epoch": 0.6233778881503664, + "grad_norm": 0.09935479611158371, + "learning_rate": 1.3123216975793001e-05, + "loss": 0.6358, + "step": 12802 + }, + { + "epoch": 0.6234265819394736, + "grad_norm": 1.3675280809402466, + "learning_rate": 1.3120255019839237e-05, + "loss": 0.8405, + "step": 12803 + }, + { + "epoch": 0.6234752757285809, + "grad_norm": 1.5992282629013062, + "learning_rate": 1.3117293235019765e-05, + "loss": 0.9219, + "step": 12804 + }, + { + "epoch": 0.6235239695176881, + "grad_norm": 1.4962986707687378, + "learning_rate": 1.3114331621408245e-05, + "loss": 0.777, + "step": 12805 + }, + { + "epoch": 0.6235726633067952, + "grad_norm": 1.8554381132125854, + "learning_rate": 1.3111370179078358e-05, + "loss": 0.8529, + "step": 12806 + }, + { + "epoch": 0.6236213570959024, + "grad_norm": 2.159278154373169, + "learning_rate": 1.310840890810376e-05, + "loss": 0.8624, + "step": 12807 + }, + { + "epoch": 0.6236700508850096, + "grad_norm": 2.1297221183776855, + "learning_rate": 1.3105447808558119e-05, + "loss": 0.9032, + "step": 12808 + }, + { + "epoch": 0.6237187446741168, + "grad_norm": 1.7528079748153687, + "learning_rate": 1.3102486880515101e-05, + "loss": 0.7781, + "step": 12809 + }, + { + "epoch": 0.623767438463224, + "grad_norm": 1.5514265298843384, + "learning_rate": 1.3099526124048343e-05, + "loss": 0.8261, + "step": 12810 + }, + { + "epoch": 0.6238161322523312, + "grad_norm": 1.696248173713684, + "learning_rate": 1.309656553923151e-05, + "loss": 0.7516, + "step": 12811 + }, + { + "epoch": 0.6238648260414384, + "grad_norm": 1.6414337158203125, + "learning_rate": 1.3093605126138229e-05, + "loss": 0.8088, + "step": 12812 + }, + { + "epoch": 0.6239135198305457, + "grad_norm": 1.466362714767456, + "learning_rate": 1.3090644884842157e-05, + "loss": 0.8698, + "step": 12813 + }, + { + "epoch": 0.6239622136196528, + "grad_norm": 1.4907346963882446, + "learning_rate": 1.3087684815416917e-05, + "loss": 0.7888, + "step": 12814 + }, + { + "epoch": 0.62401090740876, + "grad_norm": 1.8431179523468018, + "learning_rate": 1.3084724917936153e-05, + "loss": 0.7769, + "step": 12815 + }, + { + "epoch": 0.6240596011978672, + "grad_norm": 1.8157753944396973, + "learning_rate": 1.3081765192473484e-05, + "loss": 0.8305, + "step": 12816 + }, + { + "epoch": 0.6241082949869744, + "grad_norm": 2.0745599269866943, + "learning_rate": 1.3078805639102543e-05, + "loss": 0.8342, + "step": 12817 + }, + { + "epoch": 0.6241569887760816, + "grad_norm": 8.338811874389648, + "learning_rate": 1.3075846257896933e-05, + "loss": 0.878, + "step": 12818 + }, + { + "epoch": 0.6242056825651888, + "grad_norm": 1.4755431413650513, + "learning_rate": 1.3072887048930282e-05, + "loss": 0.7843, + "step": 12819 + }, + { + "epoch": 0.624254376354296, + "grad_norm": 1.3516243696212769, + "learning_rate": 1.3069928012276204e-05, + "loss": 0.852, + "step": 12820 + }, + { + "epoch": 0.6243030701434032, + "grad_norm": 1.4686484336853027, + "learning_rate": 1.306696914800829e-05, + "loss": 0.7937, + "step": 12821 + }, + { + "epoch": 0.6243517639325105, + "grad_norm": 1.5528055429458618, + "learning_rate": 1.3064010456200156e-05, + "loss": 0.8734, + "step": 12822 + }, + { + "epoch": 0.6244004577216176, + "grad_norm": 1.6284745931625366, + "learning_rate": 1.3061051936925386e-05, + "loss": 0.9039, + "step": 12823 + }, + { + "epoch": 0.6244491515107248, + "grad_norm": 2.25176739692688, + "learning_rate": 1.3058093590257595e-05, + "loss": 0.8111, + "step": 12824 + }, + { + "epoch": 0.624497845299832, + "grad_norm": 1.6948670148849487, + "learning_rate": 1.3055135416270348e-05, + "loss": 0.8957, + "step": 12825 + }, + { + "epoch": 0.6245465390889392, + "grad_norm": 2.3103997707366943, + "learning_rate": 1.3052177415037251e-05, + "loss": 0.8264, + "step": 12826 + }, + { + "epoch": 0.6245952328780464, + "grad_norm": 2.780423402786255, + "learning_rate": 1.3049219586631862e-05, + "loss": 0.8301, + "step": 12827 + }, + { + "epoch": 0.6246439266671536, + "grad_norm": 1.734366536140442, + "learning_rate": 1.3046261931127773e-05, + "loss": 0.8364, + "step": 12828 + }, + { + "epoch": 0.6246926204562608, + "grad_norm": 1.3669192790985107, + "learning_rate": 1.3043304448598554e-05, + "loss": 0.8323, + "step": 12829 + }, + { + "epoch": 0.624741314245368, + "grad_norm": 1.4624767303466797, + "learning_rate": 1.304034713911777e-05, + "loss": 0.7255, + "step": 12830 + }, + { + "epoch": 0.6247900080344752, + "grad_norm": 2.1769003868103027, + "learning_rate": 1.3037390002758992e-05, + "loss": 0.8672, + "step": 12831 + }, + { + "epoch": 0.6248387018235824, + "grad_norm": 2.527594804763794, + "learning_rate": 1.3034433039595763e-05, + "loss": 0.9155, + "step": 12832 + }, + { + "epoch": 0.6248873956126896, + "grad_norm": 2.055095911026001, + "learning_rate": 1.303147624970166e-05, + "loss": 0.7757, + "step": 12833 + }, + { + "epoch": 0.6249360894017968, + "grad_norm": 1.7782678604125977, + "learning_rate": 1.3028519633150207e-05, + "loss": 0.7532, + "step": 12834 + }, + { + "epoch": 0.624984783190904, + "grad_norm": 2.1617677211761475, + "learning_rate": 1.3025563190014974e-05, + "loss": 0.8378, + "step": 12835 + }, + { + "epoch": 0.6250334769800112, + "grad_norm": 2.0989909172058105, + "learning_rate": 1.3022606920369483e-05, + "loss": 0.9138, + "step": 12836 + }, + { + "epoch": 0.6250821707691184, + "grad_norm": 2.0062694549560547, + "learning_rate": 1.301965082428729e-05, + "loss": 0.8942, + "step": 12837 + }, + { + "epoch": 0.6251308645582256, + "grad_norm": 1.822447419166565, + "learning_rate": 1.3016694901841914e-05, + "loss": 0.8173, + "step": 12838 + }, + { + "epoch": 0.6251795583473327, + "grad_norm": 1.529346227645874, + "learning_rate": 1.3013739153106886e-05, + "loss": 0.8834, + "step": 12839 + }, + { + "epoch": 0.62522825213644, + "grad_norm": 1.5286283493041992, + "learning_rate": 1.3010783578155743e-05, + "loss": 0.7824, + "step": 12840 + }, + { + "epoch": 0.6252769459255472, + "grad_norm": 1.6771894693374634, + "learning_rate": 1.3007828177061988e-05, + "loss": 0.7808, + "step": 12841 + }, + { + "epoch": 0.6253256397146544, + "grad_norm": 2.0798702239990234, + "learning_rate": 1.3004872949899147e-05, + "loss": 0.834, + "step": 12842 + }, + { + "epoch": 0.6253743335037616, + "grad_norm": 1.4380121231079102, + "learning_rate": 1.3001917896740728e-05, + "loss": 0.7879, + "step": 12843 + }, + { + "epoch": 0.6254230272928688, + "grad_norm": 1.4606388807296753, + "learning_rate": 1.2998963017660245e-05, + "loss": 0.8517, + "step": 12844 + }, + { + "epoch": 0.625471721081976, + "grad_norm": 1.93768310546875, + "learning_rate": 1.2996008312731187e-05, + "loss": 0.782, + "step": 12845 + }, + { + "epoch": 0.6255204148710832, + "grad_norm": 2.453010320663452, + "learning_rate": 1.2993053782027069e-05, + "loss": 0.9254, + "step": 12846 + }, + { + "epoch": 0.6255691086601904, + "grad_norm": 2.030853271484375, + "learning_rate": 1.2990099425621373e-05, + "loss": 0.7719, + "step": 12847 + }, + { + "epoch": 0.6256178024492975, + "grad_norm": 1.741284966468811, + "learning_rate": 1.2987145243587586e-05, + "loss": 0.7517, + "step": 12848 + }, + { + "epoch": 0.6256664962384048, + "grad_norm": 1.5803983211517334, + "learning_rate": 1.2984191235999216e-05, + "loss": 0.8559, + "step": 12849 + }, + { + "epoch": 0.625715190027512, + "grad_norm": 0.09606607258319855, + "learning_rate": 1.2981237402929714e-05, + "loss": 0.6664, + "step": 12850 + }, + { + "epoch": 0.6257638838166192, + "grad_norm": 2.1276190280914307, + "learning_rate": 1.297828374445258e-05, + "loss": 0.8337, + "step": 12851 + }, + { + "epoch": 0.6258125776057264, + "grad_norm": 1.2570533752441406, + "learning_rate": 1.297533026064127e-05, + "loss": 0.7735, + "step": 12852 + }, + { + "epoch": 0.6258612713948336, + "grad_norm": 2.5466208457946777, + "learning_rate": 1.2972376951569272e-05, + "loss": 0.788, + "step": 12853 + }, + { + "epoch": 0.6259099651839408, + "grad_norm": 1.5079597234725952, + "learning_rate": 1.296942381731003e-05, + "loss": 0.891, + "step": 12854 + }, + { + "epoch": 0.625958658973048, + "grad_norm": 5.598006248474121, + "learning_rate": 1.2966470857937017e-05, + "loss": 0.8603, + "step": 12855 + }, + { + "epoch": 0.6260073527621551, + "grad_norm": 1.8160847425460815, + "learning_rate": 1.2963518073523678e-05, + "loss": 0.7879, + "step": 12856 + }, + { + "epoch": 0.6260560465512623, + "grad_norm": 1.907740831375122, + "learning_rate": 1.2960565464143469e-05, + "loss": 0.829, + "step": 12857 + }, + { + "epoch": 0.6261047403403696, + "grad_norm": 1.3939988613128662, + "learning_rate": 1.2957613029869833e-05, + "loss": 0.9522, + "step": 12858 + }, + { + "epoch": 0.6261534341294768, + "grad_norm": 1.7951488494873047, + "learning_rate": 1.2954660770776215e-05, + "loss": 0.8881, + "step": 12859 + }, + { + "epoch": 0.626202127918584, + "grad_norm": 1.4965966939926147, + "learning_rate": 1.2951708686936062e-05, + "loss": 0.7983, + "step": 12860 + }, + { + "epoch": 0.6262508217076912, + "grad_norm": 1.4069437980651855, + "learning_rate": 1.2948756778422789e-05, + "loss": 0.7247, + "step": 12861 + }, + { + "epoch": 0.6262995154967984, + "grad_norm": 1.6003364324569702, + "learning_rate": 1.2945805045309843e-05, + "loss": 0.6994, + "step": 12862 + }, + { + "epoch": 0.6263482092859056, + "grad_norm": 1.6353402137756348, + "learning_rate": 1.294285348767063e-05, + "loss": 0.7557, + "step": 12863 + }, + { + "epoch": 0.6263969030750128, + "grad_norm": 1.5948429107666016, + "learning_rate": 1.2939902105578583e-05, + "loss": 0.7552, + "step": 12864 + }, + { + "epoch": 0.6264455968641199, + "grad_norm": 1.4814350605010986, + "learning_rate": 1.2936950899107115e-05, + "loss": 0.7459, + "step": 12865 + }, + { + "epoch": 0.6264942906532271, + "grad_norm": 1.4074866771697998, + "learning_rate": 1.2933999868329643e-05, + "loss": 0.8107, + "step": 12866 + }, + { + "epoch": 0.6265429844423344, + "grad_norm": 1.8266384601593018, + "learning_rate": 1.2931049013319559e-05, + "loss": 0.8073, + "step": 12867 + }, + { + "epoch": 0.6265916782314416, + "grad_norm": 1.8041722774505615, + "learning_rate": 1.2928098334150278e-05, + "loss": 0.8972, + "step": 12868 + }, + { + "epoch": 0.6266403720205488, + "grad_norm": 0.09308463335037231, + "learning_rate": 1.2925147830895202e-05, + "loss": 0.6457, + "step": 12869 + }, + { + "epoch": 0.626689065809656, + "grad_norm": 1.6691951751708984, + "learning_rate": 1.2922197503627713e-05, + "loss": 0.8851, + "step": 12870 + }, + { + "epoch": 0.6267377595987632, + "grad_norm": 1.4465645551681519, + "learning_rate": 1.2919247352421209e-05, + "loss": 0.7436, + "step": 12871 + }, + { + "epoch": 0.6267864533878704, + "grad_norm": 1.797182559967041, + "learning_rate": 1.2916297377349072e-05, + "loss": 0.7448, + "step": 12872 + }, + { + "epoch": 0.6268351471769775, + "grad_norm": 2.2846004962921143, + "learning_rate": 1.2913347578484686e-05, + "loss": 0.8775, + "step": 12873 + }, + { + "epoch": 0.6268838409660847, + "grad_norm": 1.3696316480636597, + "learning_rate": 1.2910397955901417e-05, + "loss": 0.8011, + "step": 12874 + }, + { + "epoch": 0.626932534755192, + "grad_norm": 0.09229873865842819, + "learning_rate": 1.2907448509672656e-05, + "loss": 0.5834, + "step": 12875 + }, + { + "epoch": 0.6269812285442992, + "grad_norm": 3.0722742080688477, + "learning_rate": 1.290449923987175e-05, + "loss": 0.745, + "step": 12876 + }, + { + "epoch": 0.6270299223334064, + "grad_norm": 1.1514322757720947, + "learning_rate": 1.2901550146572079e-05, + "loss": 0.7848, + "step": 12877 + }, + { + "epoch": 0.6270786161225136, + "grad_norm": 0.09282399713993073, + "learning_rate": 1.2898601229846995e-05, + "loss": 0.6428, + "step": 12878 + }, + { + "epoch": 0.6271273099116208, + "grad_norm": 1.4247877597808838, + "learning_rate": 1.2895652489769845e-05, + "loss": 0.9069, + "step": 12879 + }, + { + "epoch": 0.627176003700728, + "grad_norm": 1.7046080827713013, + "learning_rate": 1.2892703926413996e-05, + "loss": 0.7646, + "step": 12880 + }, + { + "epoch": 0.6272246974898352, + "grad_norm": 1.7658780813217163, + "learning_rate": 1.288975553985278e-05, + "loss": 0.8245, + "step": 12881 + }, + { + "epoch": 0.6272733912789423, + "grad_norm": 1.2636985778808594, + "learning_rate": 1.2886807330159549e-05, + "loss": 0.8364, + "step": 12882 + }, + { + "epoch": 0.6273220850680495, + "grad_norm": 1.7571684122085571, + "learning_rate": 1.2883859297407627e-05, + "loss": 0.8041, + "step": 12883 + }, + { + "epoch": 0.6273707788571568, + "grad_norm": 1.2647184133529663, + "learning_rate": 1.2880911441670362e-05, + "loss": 0.7717, + "step": 12884 + }, + { + "epoch": 0.627419472646264, + "grad_norm": 1.4942620992660522, + "learning_rate": 1.2877963763021067e-05, + "loss": 0.8307, + "step": 12885 + }, + { + "epoch": 0.6274681664353712, + "grad_norm": 1.6624542474746704, + "learning_rate": 1.2875016261533076e-05, + "loss": 0.8454, + "step": 12886 + }, + { + "epoch": 0.6275168602244784, + "grad_norm": 1.7049640417099, + "learning_rate": 1.28720689372797e-05, + "loss": 0.7814, + "step": 12887 + }, + { + "epoch": 0.6275655540135856, + "grad_norm": 2.1300039291381836, + "learning_rate": 1.286912179033426e-05, + "loss": 0.7733, + "step": 12888 + }, + { + "epoch": 0.6276142478026928, + "grad_norm": 2.956967830657959, + "learning_rate": 1.2866174820770074e-05, + "loss": 0.8497, + "step": 12889 + }, + { + "epoch": 0.6276629415917999, + "grad_norm": 2.136904716491699, + "learning_rate": 1.2863228028660433e-05, + "loss": 0.824, + "step": 12890 + }, + { + "epoch": 0.6277116353809071, + "grad_norm": 2.465768337249756, + "learning_rate": 1.286028141407865e-05, + "loss": 0.7632, + "step": 12891 + }, + { + "epoch": 0.6277603291700143, + "grad_norm": 1.4653743505477905, + "learning_rate": 1.2857334977098013e-05, + "loss": 0.8505, + "step": 12892 + }, + { + "epoch": 0.6278090229591216, + "grad_norm": 2.2698564529418945, + "learning_rate": 1.2854388717791824e-05, + "loss": 0.7916, + "step": 12893 + }, + { + "epoch": 0.6278577167482288, + "grad_norm": 1.5520455837249756, + "learning_rate": 1.285144263623336e-05, + "loss": 0.8102, + "step": 12894 + }, + { + "epoch": 0.627906410537336, + "grad_norm": 10.34850788116455, + "learning_rate": 1.284849673249592e-05, + "loss": 0.8708, + "step": 12895 + }, + { + "epoch": 0.6279551043264432, + "grad_norm": 1.4049293994903564, + "learning_rate": 1.2845551006652772e-05, + "loss": 0.7727, + "step": 12896 + }, + { + "epoch": 0.6280037981155504, + "grad_norm": 1.3912417888641357, + "learning_rate": 1.2842605458777193e-05, + "loss": 0.7564, + "step": 12897 + }, + { + "epoch": 0.6280524919046575, + "grad_norm": 1.4815764427185059, + "learning_rate": 1.2839660088942463e-05, + "loss": 0.7989, + "step": 12898 + }, + { + "epoch": 0.6281011856937647, + "grad_norm": 1.9258084297180176, + "learning_rate": 1.2836714897221833e-05, + "loss": 0.8497, + "step": 12899 + }, + { + "epoch": 0.6281498794828719, + "grad_norm": 1.5374934673309326, + "learning_rate": 1.2833769883688578e-05, + "loss": 0.8735, + "step": 12900 + }, + { + "epoch": 0.6281985732719791, + "grad_norm": 2.1624536514282227, + "learning_rate": 1.2830825048415952e-05, + "loss": 0.9248, + "step": 12901 + }, + { + "epoch": 0.6282472670610864, + "grad_norm": 1.7653213739395142, + "learning_rate": 1.2827880391477203e-05, + "loss": 0.7014, + "step": 12902 + }, + { + "epoch": 0.6282959608501936, + "grad_norm": 1.7284916639328003, + "learning_rate": 1.2824935912945582e-05, + "loss": 0.7574, + "step": 12903 + }, + { + "epoch": 0.6283446546393008, + "grad_norm": 1.5931147336959839, + "learning_rate": 1.2821991612894342e-05, + "loss": 0.7811, + "step": 12904 + }, + { + "epoch": 0.628393348428408, + "grad_norm": 0.09528259932994843, + "learning_rate": 1.2819047491396705e-05, + "loss": 0.6102, + "step": 12905 + }, + { + "epoch": 0.6284420422175152, + "grad_norm": 1.444909691810608, + "learning_rate": 1.2816103548525927e-05, + "loss": 0.8604, + "step": 12906 + }, + { + "epoch": 0.6284907360066223, + "grad_norm": 1.647200584411621, + "learning_rate": 1.2813159784355219e-05, + "loss": 0.8577, + "step": 12907 + }, + { + "epoch": 0.6285394297957295, + "grad_norm": 1.4749608039855957, + "learning_rate": 1.2810216198957816e-05, + "loss": 0.7557, + "step": 12908 + }, + { + "epoch": 0.6285881235848367, + "grad_norm": 1.6001583337783813, + "learning_rate": 1.2807272792406946e-05, + "loss": 0.8846, + "step": 12909 + }, + { + "epoch": 0.628636817373944, + "grad_norm": 1.5047892332077026, + "learning_rate": 1.2804329564775814e-05, + "loss": 0.9118, + "step": 12910 + }, + { + "epoch": 0.6286855111630512, + "grad_norm": 1.8778460025787354, + "learning_rate": 1.2801386516137652e-05, + "loss": 0.8898, + "step": 12911 + }, + { + "epoch": 0.6287342049521584, + "grad_norm": 1.444115161895752, + "learning_rate": 1.2798443646565648e-05, + "loss": 0.7581, + "step": 12912 + }, + { + "epoch": 0.6287828987412656, + "grad_norm": 1.7575989961624146, + "learning_rate": 1.2795500956133022e-05, + "loss": 0.8159, + "step": 12913 + }, + { + "epoch": 0.6288315925303728, + "grad_norm": 1.445446491241455, + "learning_rate": 1.279255844491296e-05, + "loss": 0.7408, + "step": 12914 + }, + { + "epoch": 0.6288802863194799, + "grad_norm": 2.6373095512390137, + "learning_rate": 1.2789616112978665e-05, + "loss": 0.74, + "step": 12915 + }, + { + "epoch": 0.6289289801085871, + "grad_norm": 1.5019383430480957, + "learning_rate": 1.2786673960403327e-05, + "loss": 0.8004, + "step": 12916 + }, + { + "epoch": 0.6289776738976943, + "grad_norm": 2.4824588298797607, + "learning_rate": 1.2783731987260128e-05, + "loss": 0.8368, + "step": 12917 + }, + { + "epoch": 0.6290263676868015, + "grad_norm": 2.2929694652557373, + "learning_rate": 1.2780790193622261e-05, + "loss": 0.8396, + "step": 12918 + }, + { + "epoch": 0.6290750614759087, + "grad_norm": 1.6450824737548828, + "learning_rate": 1.2777848579562893e-05, + "loss": 0.7483, + "step": 12919 + }, + { + "epoch": 0.629123755265016, + "grad_norm": 2.253079414367676, + "learning_rate": 1.2774907145155204e-05, + "loss": 0.887, + "step": 12920 + }, + { + "epoch": 0.6291724490541232, + "grad_norm": 1.4589594602584839, + "learning_rate": 1.2771965890472351e-05, + "loss": 0.8326, + "step": 12921 + }, + { + "epoch": 0.6292211428432304, + "grad_norm": 1.4369486570358276, + "learning_rate": 1.276902481558751e-05, + "loss": 0.7956, + "step": 12922 + }, + { + "epoch": 0.6292698366323376, + "grad_norm": 1.5717793703079224, + "learning_rate": 1.2766083920573833e-05, + "loss": 0.7689, + "step": 12923 + }, + { + "epoch": 0.6293185304214447, + "grad_norm": 1.5590516328811646, + "learning_rate": 1.2763143205504487e-05, + "loss": 0.7863, + "step": 12924 + }, + { + "epoch": 0.6293672242105519, + "grad_norm": 2.2417781352996826, + "learning_rate": 1.2760202670452601e-05, + "loss": 0.8347, + "step": 12925 + }, + { + "epoch": 0.6294159179996591, + "grad_norm": 3.164165496826172, + "learning_rate": 1.2757262315491345e-05, + "loss": 0.8409, + "step": 12926 + }, + { + "epoch": 0.6294646117887663, + "grad_norm": 1.9879252910614014, + "learning_rate": 1.275432214069384e-05, + "loss": 0.8954, + "step": 12927 + }, + { + "epoch": 0.6295133055778735, + "grad_norm": 1.5346932411193848, + "learning_rate": 1.2751382146133233e-05, + "loss": 0.8768, + "step": 12928 + }, + { + "epoch": 0.6295619993669808, + "grad_norm": 1.344990849494934, + "learning_rate": 1.2748442331882662e-05, + "loss": 0.7986, + "step": 12929 + }, + { + "epoch": 0.629610693156088, + "grad_norm": 2.320791482925415, + "learning_rate": 1.2745502698015248e-05, + "loss": 0.7863, + "step": 12930 + }, + { + "epoch": 0.6296593869451952, + "grad_norm": 1.3786025047302246, + "learning_rate": 1.2742563244604114e-05, + "loss": 0.7453, + "step": 12931 + }, + { + "epoch": 0.6297080807343023, + "grad_norm": 1.7481110095977783, + "learning_rate": 1.2739623971722379e-05, + "loss": 0.8609, + "step": 12932 + }, + { + "epoch": 0.6297567745234095, + "grad_norm": 1.8138235807418823, + "learning_rate": 1.2736684879443166e-05, + "loss": 0.8429, + "step": 12933 + }, + { + "epoch": 0.6298054683125167, + "grad_norm": 1.8237165212631226, + "learning_rate": 1.2733745967839572e-05, + "loss": 0.8103, + "step": 12934 + }, + { + "epoch": 0.6298541621016239, + "grad_norm": 1.406486988067627, + "learning_rate": 1.2730807236984717e-05, + "loss": 0.806, + "step": 12935 + }, + { + "epoch": 0.6299028558907311, + "grad_norm": 6.776662826538086, + "learning_rate": 1.2727868686951687e-05, + "loss": 0.7799, + "step": 12936 + }, + { + "epoch": 0.6299515496798384, + "grad_norm": 1.856139898300171, + "learning_rate": 1.2724930317813588e-05, + "loss": 0.7888, + "step": 12937 + }, + { + "epoch": 0.6300002434689456, + "grad_norm": 1.5227105617523193, + "learning_rate": 1.2721992129643513e-05, + "loss": 0.878, + "step": 12938 + }, + { + "epoch": 0.6300489372580528, + "grad_norm": 2.257844924926758, + "learning_rate": 1.2719054122514543e-05, + "loss": 0.8639, + "step": 12939 + }, + { + "epoch": 0.63009763104716, + "grad_norm": 1.3157703876495361, + "learning_rate": 1.2716116296499774e-05, + "loss": 0.8283, + "step": 12940 + }, + { + "epoch": 0.6301463248362671, + "grad_norm": 1.5813498497009277, + "learning_rate": 1.2713178651672267e-05, + "loss": 0.8124, + "step": 12941 + }, + { + "epoch": 0.6301950186253743, + "grad_norm": 1.686444640159607, + "learning_rate": 1.2710241188105114e-05, + "loss": 0.851, + "step": 12942 + }, + { + "epoch": 0.6302437124144815, + "grad_norm": 1.2828298807144165, + "learning_rate": 1.270730390587137e-05, + "loss": 0.8076, + "step": 12943 + }, + { + "epoch": 0.6302924062035887, + "grad_norm": 2.507673740386963, + "learning_rate": 1.270436680504411e-05, + "loss": 0.8687, + "step": 12944 + }, + { + "epoch": 0.6303410999926959, + "grad_norm": 1.4987984895706177, + "learning_rate": 1.2701429885696386e-05, + "loss": 0.8437, + "step": 12945 + }, + { + "epoch": 0.6303897937818032, + "grad_norm": 1.6266660690307617, + "learning_rate": 1.2698493147901268e-05, + "loss": 0.8759, + "step": 12946 + }, + { + "epoch": 0.6304384875709104, + "grad_norm": 1.7699074745178223, + "learning_rate": 1.2695556591731788e-05, + "loss": 0.9611, + "step": 12947 + }, + { + "epoch": 0.6304871813600176, + "grad_norm": 1.6186234951019287, + "learning_rate": 1.2692620217261007e-05, + "loss": 0.8797, + "step": 12948 + }, + { + "epoch": 0.6305358751491247, + "grad_norm": 1.6769369840621948, + "learning_rate": 1.268968402456197e-05, + "loss": 0.8618, + "step": 12949 + }, + { + "epoch": 0.6305845689382319, + "grad_norm": 1.5697276592254639, + "learning_rate": 1.2686748013707702e-05, + "loss": 0.8137, + "step": 12950 + }, + { + "epoch": 0.6306332627273391, + "grad_norm": 2.257949113845825, + "learning_rate": 1.2683812184771249e-05, + "loss": 0.8373, + "step": 12951 + }, + { + "epoch": 0.6306819565164463, + "grad_norm": 1.7939770221710205, + "learning_rate": 1.268087653782563e-05, + "loss": 0.8569, + "step": 12952 + }, + { + "epoch": 0.6307306503055535, + "grad_norm": 1.5935211181640625, + "learning_rate": 1.2677941072943885e-05, + "loss": 0.9043, + "step": 12953 + }, + { + "epoch": 0.6307793440946607, + "grad_norm": 3.5003387928009033, + "learning_rate": 1.267500579019901e-05, + "loss": 0.821, + "step": 12954 + }, + { + "epoch": 0.630828037883768, + "grad_norm": 1.9675604104995728, + "learning_rate": 1.2672070689664046e-05, + "loss": 0.7775, + "step": 12955 + }, + { + "epoch": 0.6308767316728752, + "grad_norm": 2.0954062938690186, + "learning_rate": 1.2669135771411983e-05, + "loss": 0.7601, + "step": 12956 + }, + { + "epoch": 0.6309254254619823, + "grad_norm": 0.09939872473478317, + "learning_rate": 1.2666201035515834e-05, + "loss": 0.6382, + "step": 12957 + }, + { + "epoch": 0.6309741192510895, + "grad_norm": 1.5949527025222778, + "learning_rate": 1.2663266482048613e-05, + "loss": 0.8548, + "step": 12958 + }, + { + "epoch": 0.6310228130401967, + "grad_norm": 1.320169448852539, + "learning_rate": 1.26603321110833e-05, + "loss": 0.8058, + "step": 12959 + }, + { + "epoch": 0.6310715068293039, + "grad_norm": 1.4696629047393799, + "learning_rate": 1.2657397922692898e-05, + "loss": 0.8501, + "step": 12960 + }, + { + "epoch": 0.6311202006184111, + "grad_norm": 3.672945499420166, + "learning_rate": 1.2654463916950388e-05, + "loss": 0.7872, + "step": 12961 + }, + { + "epoch": 0.6311688944075183, + "grad_norm": 1.8738386631011963, + "learning_rate": 1.2651530093928765e-05, + "loss": 0.7568, + "step": 12962 + }, + { + "epoch": 0.6312175881966255, + "grad_norm": 1.4202723503112793, + "learning_rate": 1.2648596453700994e-05, + "loss": 0.8139, + "step": 12963 + }, + { + "epoch": 0.6312662819857328, + "grad_norm": 1.7845429182052612, + "learning_rate": 1.2645662996340064e-05, + "loss": 0.6745, + "step": 12964 + }, + { + "epoch": 0.63131497577484, + "grad_norm": 1.6741513013839722, + "learning_rate": 1.264272972191893e-05, + "loss": 0.8095, + "step": 12965 + }, + { + "epoch": 0.6313636695639471, + "grad_norm": 1.6938914060592651, + "learning_rate": 1.2639796630510568e-05, + "loss": 0.8633, + "step": 12966 + }, + { + "epoch": 0.6314123633530543, + "grad_norm": 2.184537887573242, + "learning_rate": 1.2636863722187932e-05, + "loss": 0.8399, + "step": 12967 + }, + { + "epoch": 0.6314610571421615, + "grad_norm": 1.6569448709487915, + "learning_rate": 1.2633930997023983e-05, + "loss": 0.8064, + "step": 12968 + }, + { + "epoch": 0.6315097509312687, + "grad_norm": 1.8227897882461548, + "learning_rate": 1.2630998455091681e-05, + "loss": 0.7823, + "step": 12969 + }, + { + "epoch": 0.6315584447203759, + "grad_norm": 1.7719765901565552, + "learning_rate": 1.2628066096463956e-05, + "loss": 0.8234, + "step": 12970 + }, + { + "epoch": 0.6316071385094831, + "grad_norm": 1.4867891073226929, + "learning_rate": 1.2625133921213766e-05, + "loss": 0.8762, + "step": 12971 + }, + { + "epoch": 0.6316558322985903, + "grad_norm": 1.572936773300171, + "learning_rate": 1.2622201929414034e-05, + "loss": 0.7909, + "step": 12972 + }, + { + "epoch": 0.6317045260876976, + "grad_norm": 1.6093076467514038, + "learning_rate": 1.2619270121137706e-05, + "loss": 0.8782, + "step": 12973 + }, + { + "epoch": 0.6317532198768047, + "grad_norm": 1.2399550676345825, + "learning_rate": 1.26163384964577e-05, + "loss": 0.8671, + "step": 12974 + }, + { + "epoch": 0.6318019136659119, + "grad_norm": 2.247265100479126, + "learning_rate": 1.261340705544696e-05, + "loss": 0.912, + "step": 12975 + }, + { + "epoch": 0.6318506074550191, + "grad_norm": 1.6311008930206299, + "learning_rate": 1.2610475798178383e-05, + "loss": 0.8149, + "step": 12976 + }, + { + "epoch": 0.6318993012441263, + "grad_norm": 1.635244607925415, + "learning_rate": 1.260754472472489e-05, + "loss": 0.8885, + "step": 12977 + }, + { + "epoch": 0.6319479950332335, + "grad_norm": 1.4653021097183228, + "learning_rate": 1.2604613835159412e-05, + "loss": 0.8644, + "step": 12978 + }, + { + "epoch": 0.6319966888223407, + "grad_norm": 1.8078086376190186, + "learning_rate": 1.2601683129554826e-05, + "loss": 0.8138, + "step": 12979 + }, + { + "epoch": 0.6320453826114479, + "grad_norm": 3.2673370838165283, + "learning_rate": 1.2598752607984053e-05, + "loss": 0.8599, + "step": 12980 + }, + { + "epoch": 0.6320940764005551, + "grad_norm": 1.4339938163757324, + "learning_rate": 1.2595822270519978e-05, + "loss": 0.852, + "step": 12981 + }, + { + "epoch": 0.6321427701896624, + "grad_norm": 2.3802101612091064, + "learning_rate": 1.2592892117235507e-05, + "loss": 0.7974, + "step": 12982 + }, + { + "epoch": 0.6321914639787695, + "grad_norm": 1.6260615587234497, + "learning_rate": 1.2589962148203512e-05, + "loss": 0.8417, + "step": 12983 + }, + { + "epoch": 0.6322401577678767, + "grad_norm": 1.4868191480636597, + "learning_rate": 1.2587032363496894e-05, + "loss": 0.8363, + "step": 12984 + }, + { + "epoch": 0.6322888515569839, + "grad_norm": 1.9122189283370972, + "learning_rate": 1.2584102763188513e-05, + "loss": 0.7938, + "step": 12985 + }, + { + "epoch": 0.6323375453460911, + "grad_norm": 2.1269140243530273, + "learning_rate": 1.258117334735125e-05, + "loss": 0.7257, + "step": 12986 + }, + { + "epoch": 0.6323862391351983, + "grad_norm": 1.3934592008590698, + "learning_rate": 1.2578244116057988e-05, + "loss": 0.9445, + "step": 12987 + }, + { + "epoch": 0.6324349329243055, + "grad_norm": 1.590018391609192, + "learning_rate": 1.257531506938157e-05, + "loss": 0.8606, + "step": 12988 + }, + { + "epoch": 0.6324836267134127, + "grad_norm": 1.7389469146728516, + "learning_rate": 1.257238620739487e-05, + "loss": 0.8107, + "step": 12989 + }, + { + "epoch": 0.63253232050252, + "grad_norm": 1.4091589450836182, + "learning_rate": 1.2569457530170737e-05, + "loss": 0.8351, + "step": 12990 + }, + { + "epoch": 0.632581014291627, + "grad_norm": 2.752511739730835, + "learning_rate": 1.2566529037782033e-05, + "loss": 0.7866, + "step": 12991 + }, + { + "epoch": 0.6326297080807343, + "grad_norm": 1.9556419849395752, + "learning_rate": 1.2563600730301591e-05, + "loss": 0.8012, + "step": 12992 + }, + { + "epoch": 0.6326784018698415, + "grad_norm": 1.5978448390960693, + "learning_rate": 1.2560672607802264e-05, + "loss": 0.81, + "step": 12993 + }, + { + "epoch": 0.6327270956589487, + "grad_norm": 2.002429485321045, + "learning_rate": 1.2557744670356878e-05, + "loss": 0.7699, + "step": 12994 + }, + { + "epoch": 0.6327757894480559, + "grad_norm": 1.272026777267456, + "learning_rate": 1.2554816918038277e-05, + "loss": 0.8602, + "step": 12995 + }, + { + "epoch": 0.6328244832371631, + "grad_norm": 1.39418625831604, + "learning_rate": 1.2551889350919278e-05, + "loss": 0.9313, + "step": 12996 + }, + { + "epoch": 0.6328731770262703, + "grad_norm": 1.730107069015503, + "learning_rate": 1.254896196907271e-05, + "loss": 0.8814, + "step": 12997 + }, + { + "epoch": 0.6329218708153775, + "grad_norm": 1.9058420658111572, + "learning_rate": 1.25460347725714e-05, + "loss": 0.7734, + "step": 12998 + }, + { + "epoch": 0.6329705646044848, + "grad_norm": 1.953193187713623, + "learning_rate": 1.2543107761488148e-05, + "loss": 0.8625, + "step": 12999 + }, + { + "epoch": 0.6330192583935919, + "grad_norm": 1.8597264289855957, + "learning_rate": 1.2540180935895778e-05, + "loss": 0.7845, + "step": 13000 + }, + { + "epoch": 0.6330679521826991, + "grad_norm": 1.7514972686767578, + "learning_rate": 1.2537254295867082e-05, + "loss": 0.8249, + "step": 13001 + }, + { + "epoch": 0.6331166459718063, + "grad_norm": 1.640696406364441, + "learning_rate": 1.2534327841474866e-05, + "loss": 0.7524, + "step": 13002 + }, + { + "epoch": 0.6331653397609135, + "grad_norm": 0.09408387541770935, + "learning_rate": 1.2531401572791923e-05, + "loss": 0.6718, + "step": 13003 + }, + { + "epoch": 0.6332140335500207, + "grad_norm": 2.1696619987487793, + "learning_rate": 1.2528475489891057e-05, + "loss": 0.7952, + "step": 13004 + }, + { + "epoch": 0.6332627273391279, + "grad_norm": 1.5632678270339966, + "learning_rate": 1.2525549592845034e-05, + "loss": 0.8247, + "step": 13005 + }, + { + "epoch": 0.6333114211282351, + "grad_norm": 0.0918244794011116, + "learning_rate": 1.2522623881726645e-05, + "loss": 0.6384, + "step": 13006 + }, + { + "epoch": 0.6333601149173423, + "grad_norm": 1.6473346948623657, + "learning_rate": 1.251969835660868e-05, + "loss": 0.7978, + "step": 13007 + }, + { + "epoch": 0.6334088087064494, + "grad_norm": 1.634647250175476, + "learning_rate": 1.251677301756389e-05, + "loss": 0.7397, + "step": 13008 + }, + { + "epoch": 0.6334575024955567, + "grad_norm": 1.684882402420044, + "learning_rate": 1.2513847864665058e-05, + "loss": 0.8718, + "step": 13009 + }, + { + "epoch": 0.6335061962846639, + "grad_norm": 0.09485817700624466, + "learning_rate": 1.2510922897984945e-05, + "loss": 0.6392, + "step": 13010 + }, + { + "epoch": 0.6335548900737711, + "grad_norm": 2.7099530696868896, + "learning_rate": 1.2507998117596305e-05, + "loss": 0.7557, + "step": 13011 + }, + { + "epoch": 0.6336035838628783, + "grad_norm": 1.3547589778900146, + "learning_rate": 1.2505073523571891e-05, + "loss": 0.8146, + "step": 13012 + }, + { + "epoch": 0.6336522776519855, + "grad_norm": 2.3203978538513184, + "learning_rate": 1.2502149115984468e-05, + "loss": 0.8772, + "step": 13013 + }, + { + "epoch": 0.6337009714410927, + "grad_norm": 1.9287503957748413, + "learning_rate": 1.2499224894906758e-05, + "loss": 0.7734, + "step": 13014 + }, + { + "epoch": 0.6337496652301999, + "grad_norm": 1.9128799438476562, + "learning_rate": 1.2496300860411517e-05, + "loss": 0.7648, + "step": 13015 + }, + { + "epoch": 0.633798359019307, + "grad_norm": 1.4033395051956177, + "learning_rate": 1.2493377012571477e-05, + "loss": 0.8843, + "step": 13016 + }, + { + "epoch": 0.6338470528084142, + "grad_norm": 1.5683115720748901, + "learning_rate": 1.2490453351459362e-05, + "loss": 0.8235, + "step": 13017 + }, + { + "epoch": 0.6338957465975215, + "grad_norm": 1.7185615301132202, + "learning_rate": 1.248752987714791e-05, + "loss": 0.8397, + "step": 13018 + }, + { + "epoch": 0.6339444403866287, + "grad_norm": 1.5605220794677734, + "learning_rate": 1.2484606589709836e-05, + "loss": 0.8229, + "step": 13019 + }, + { + "epoch": 0.6339931341757359, + "grad_norm": 1.6575016975402832, + "learning_rate": 1.2481683489217864e-05, + "loss": 0.8031, + "step": 13020 + }, + { + "epoch": 0.6340418279648431, + "grad_norm": 2.1506447792053223, + "learning_rate": 1.247876057574469e-05, + "loss": 0.8831, + "step": 13021 + }, + { + "epoch": 0.6340905217539503, + "grad_norm": 1.3649182319641113, + "learning_rate": 1.2475837849363046e-05, + "loss": 0.8176, + "step": 13022 + }, + { + "epoch": 0.6341392155430575, + "grad_norm": 1.5762546062469482, + "learning_rate": 1.247291531014561e-05, + "loss": 0.7834, + "step": 13023 + }, + { + "epoch": 0.6341879093321647, + "grad_norm": 1.4263185262680054, + "learning_rate": 1.2469992958165099e-05, + "loss": 0.8, + "step": 13024 + }, + { + "epoch": 0.6342366031212718, + "grad_norm": 1.9301671981811523, + "learning_rate": 1.246707079349419e-05, + "loss": 0.7908, + "step": 13025 + }, + { + "epoch": 0.634285296910379, + "grad_norm": 1.4783258438110352, + "learning_rate": 1.2464148816205586e-05, + "loss": 0.8661, + "step": 13026 + }, + { + "epoch": 0.6343339906994863, + "grad_norm": 2.0904457569122314, + "learning_rate": 1.2461227026371975e-05, + "loss": 0.6823, + "step": 13027 + }, + { + "epoch": 0.6343826844885935, + "grad_norm": 1.7127513885498047, + "learning_rate": 1.245830542406602e-05, + "loss": 0.8897, + "step": 13028 + }, + { + "epoch": 0.6344313782777007, + "grad_norm": 1.6780343055725098, + "learning_rate": 1.2455384009360414e-05, + "loss": 0.8736, + "step": 13029 + }, + { + "epoch": 0.6344800720668079, + "grad_norm": 2.0191967487335205, + "learning_rate": 1.245246278232781e-05, + "loss": 0.705, + "step": 13030 + }, + { + "epoch": 0.6345287658559151, + "grad_norm": 1.8460711240768433, + "learning_rate": 1.2449541743040887e-05, + "loss": 0.8884, + "step": 13031 + }, + { + "epoch": 0.6345774596450223, + "grad_norm": 1.489378809928894, + "learning_rate": 1.2446620891572296e-05, + "loss": 0.8553, + "step": 13032 + }, + { + "epoch": 0.6346261534341294, + "grad_norm": 1.6882892847061157, + "learning_rate": 1.244370022799471e-05, + "loss": 0.8147, + "step": 13033 + }, + { + "epoch": 0.6346748472232366, + "grad_norm": 1.6067851781845093, + "learning_rate": 1.244077975238076e-05, + "loss": 0.8599, + "step": 13034 + }, + { + "epoch": 0.6347235410123439, + "grad_norm": 1.6004420518875122, + "learning_rate": 1.243785946480311e-05, + "loss": 0.8226, + "step": 13035 + }, + { + "epoch": 0.6347722348014511, + "grad_norm": 1.6160343885421753, + "learning_rate": 1.2434939365334389e-05, + "loss": 0.7552, + "step": 13036 + }, + { + "epoch": 0.6348209285905583, + "grad_norm": 1.4048017263412476, + "learning_rate": 1.2432019454047242e-05, + "loss": 0.7503, + "step": 13037 + }, + { + "epoch": 0.6348696223796655, + "grad_norm": 1.541335940361023, + "learning_rate": 1.2429099731014302e-05, + "loss": 0.7532, + "step": 13038 + }, + { + "epoch": 0.6349183161687727, + "grad_norm": 1.3519392013549805, + "learning_rate": 1.24261801963082e-05, + "loss": 0.7156, + "step": 13039 + }, + { + "epoch": 0.6349670099578799, + "grad_norm": 2.4713611602783203, + "learning_rate": 1.2423260850001553e-05, + "loss": 0.8651, + "step": 13040 + }, + { + "epoch": 0.6350157037469871, + "grad_norm": 4.452530860900879, + "learning_rate": 1.242034169216698e-05, + "loss": 0.901, + "step": 13041 + }, + { + "epoch": 0.6350643975360942, + "grad_norm": 1.320082426071167, + "learning_rate": 1.2417422722877107e-05, + "loss": 0.7991, + "step": 13042 + }, + { + "epoch": 0.6351130913252014, + "grad_norm": 1.4174326658248901, + "learning_rate": 1.2414503942204528e-05, + "loss": 0.8393, + "step": 13043 + }, + { + "epoch": 0.6351617851143087, + "grad_norm": 1.4170713424682617, + "learning_rate": 1.2411585350221861e-05, + "loss": 0.8615, + "step": 13044 + }, + { + "epoch": 0.6352104789034159, + "grad_norm": 1.4303191900253296, + "learning_rate": 1.2408666947001696e-05, + "loss": 0.8918, + "step": 13045 + }, + { + "epoch": 0.6352591726925231, + "grad_norm": 1.160498857498169, + "learning_rate": 1.2405748732616626e-05, + "loss": 0.7421, + "step": 13046 + }, + { + "epoch": 0.6353078664816303, + "grad_norm": 1.7417964935302734, + "learning_rate": 1.2402830707139259e-05, + "loss": 0.6566, + "step": 13047 + }, + { + "epoch": 0.6353565602707375, + "grad_norm": 1.8847854137420654, + "learning_rate": 1.2399912870642164e-05, + "loss": 0.7552, + "step": 13048 + }, + { + "epoch": 0.6354052540598447, + "grad_norm": 1.454240322113037, + "learning_rate": 1.2396995223197938e-05, + "loss": 0.794, + "step": 13049 + }, + { + "epoch": 0.6354539478489518, + "grad_norm": 2.0038039684295654, + "learning_rate": 1.2394077764879143e-05, + "loss": 0.8991, + "step": 13050 + }, + { + "epoch": 0.635502641638059, + "grad_norm": 1.4737087488174438, + "learning_rate": 1.2391160495758362e-05, + "loss": 0.6689, + "step": 13051 + }, + { + "epoch": 0.6355513354271662, + "grad_norm": 2.209928035736084, + "learning_rate": 1.238824341590815e-05, + "loss": 0.8369, + "step": 13052 + }, + { + "epoch": 0.6356000292162735, + "grad_norm": 1.718831181526184, + "learning_rate": 1.2385326525401082e-05, + "loss": 0.7226, + "step": 13053 + }, + { + "epoch": 0.6356487230053807, + "grad_norm": 1.3473267555236816, + "learning_rate": 1.2382409824309706e-05, + "loss": 0.8261, + "step": 13054 + }, + { + "epoch": 0.6356974167944879, + "grad_norm": 5.78423547744751, + "learning_rate": 1.237949331270659e-05, + "loss": 0.8338, + "step": 13055 + }, + { + "epoch": 0.6357461105835951, + "grad_norm": 1.8051087856292725, + "learning_rate": 1.2376576990664262e-05, + "loss": 0.7032, + "step": 13056 + }, + { + "epoch": 0.6357948043727023, + "grad_norm": 1.7177828550338745, + "learning_rate": 1.2373660858255279e-05, + "loss": 0.7893, + "step": 13057 + }, + { + "epoch": 0.6358434981618094, + "grad_norm": 1.813191294670105, + "learning_rate": 1.2370744915552184e-05, + "loss": 0.9033, + "step": 13058 + }, + { + "epoch": 0.6358921919509166, + "grad_norm": 1.7170794010162354, + "learning_rate": 1.2367829162627497e-05, + "loss": 0.726, + "step": 13059 + }, + { + "epoch": 0.6359408857400238, + "grad_norm": 1.8845618963241577, + "learning_rate": 1.2364913599553759e-05, + "loss": 0.9355, + "step": 13060 + }, + { + "epoch": 0.635989579529131, + "grad_norm": 4.9612884521484375, + "learning_rate": 1.2361998226403488e-05, + "loss": 0.82, + "step": 13061 + }, + { + "epoch": 0.6360382733182383, + "grad_norm": 5.374853610992432, + "learning_rate": 1.2359083043249216e-05, + "loss": 0.8499, + "step": 13062 + }, + { + "epoch": 0.6360869671073455, + "grad_norm": 1.4777697324752808, + "learning_rate": 1.2356168050163442e-05, + "loss": 0.8315, + "step": 13063 + }, + { + "epoch": 0.6361356608964527, + "grad_norm": 1.6247138977050781, + "learning_rate": 1.235325324721869e-05, + "loss": 0.9512, + "step": 13064 + }, + { + "epoch": 0.6361843546855599, + "grad_norm": 1.5221526622772217, + "learning_rate": 1.2350338634487456e-05, + "loss": 0.7182, + "step": 13065 + }, + { + "epoch": 0.6362330484746671, + "grad_norm": 3.5184085369110107, + "learning_rate": 1.2347424212042246e-05, + "loss": 0.8214, + "step": 13066 + }, + { + "epoch": 0.6362817422637742, + "grad_norm": 1.8082220554351807, + "learning_rate": 1.2344509979955559e-05, + "loss": 0.8451, + "step": 13067 + }, + { + "epoch": 0.6363304360528814, + "grad_norm": 1.9699623584747314, + "learning_rate": 1.2341595938299884e-05, + "loss": 0.8401, + "step": 13068 + }, + { + "epoch": 0.6363791298419886, + "grad_norm": 1.5670382976531982, + "learning_rate": 1.2338682087147708e-05, + "loss": 0.9243, + "step": 13069 + }, + { + "epoch": 0.6364278236310958, + "grad_norm": 2.090705394744873, + "learning_rate": 1.233576842657151e-05, + "loss": 0.7485, + "step": 13070 + }, + { + "epoch": 0.6364765174202031, + "grad_norm": 1.6313502788543701, + "learning_rate": 1.233285495664378e-05, + "loss": 0.7907, + "step": 13071 + }, + { + "epoch": 0.6365252112093103, + "grad_norm": 1.3286161422729492, + "learning_rate": 1.2329941677436974e-05, + "loss": 0.774, + "step": 13072 + }, + { + "epoch": 0.6365739049984175, + "grad_norm": 1.447731614112854, + "learning_rate": 1.2327028589023575e-05, + "loss": 0.78, + "step": 13073 + }, + { + "epoch": 0.6366225987875247, + "grad_norm": 1.3818471431732178, + "learning_rate": 1.2324115691476033e-05, + "loss": 0.8671, + "step": 13074 + }, + { + "epoch": 0.6366712925766318, + "grad_norm": 1.5227049589157104, + "learning_rate": 1.2321202984866818e-05, + "loss": 0.7974, + "step": 13075 + }, + { + "epoch": 0.636719986365739, + "grad_norm": 0.09667690843343735, + "learning_rate": 1.231829046926837e-05, + "loss": 0.6335, + "step": 13076 + }, + { + "epoch": 0.6367686801548462, + "grad_norm": 1.5921847820281982, + "learning_rate": 1.2315378144753152e-05, + "loss": 0.8277, + "step": 13077 + }, + { + "epoch": 0.6368173739439534, + "grad_norm": 1.5299016237258911, + "learning_rate": 1.231246601139361e-05, + "loss": 0.8578, + "step": 13078 + }, + { + "epoch": 0.6368660677330606, + "grad_norm": 2.269716739654541, + "learning_rate": 1.2309554069262166e-05, + "loss": 0.8924, + "step": 13079 + }, + { + "epoch": 0.6369147615221679, + "grad_norm": 1.8844540119171143, + "learning_rate": 1.2306642318431277e-05, + "loss": 0.9284, + "step": 13080 + }, + { + "epoch": 0.6369634553112751, + "grad_norm": 1.4239966869354248, + "learning_rate": 1.2303730758973353e-05, + "loss": 0.8235, + "step": 13081 + }, + { + "epoch": 0.6370121491003823, + "grad_norm": 1.7845776081085205, + "learning_rate": 1.230081939096083e-05, + "loss": 0.8428, + "step": 13082 + }, + { + "epoch": 0.6370608428894895, + "grad_norm": 1.6966242790222168, + "learning_rate": 1.2297908214466125e-05, + "loss": 0.8115, + "step": 13083 + }, + { + "epoch": 0.6371095366785966, + "grad_norm": 1.7383028268814087, + "learning_rate": 1.2294997229561663e-05, + "loss": 0.8368, + "step": 13084 + }, + { + "epoch": 0.6371582304677038, + "grad_norm": 1.2839889526367188, + "learning_rate": 1.2292086436319841e-05, + "loss": 0.733, + "step": 13085 + }, + { + "epoch": 0.637206924256811, + "grad_norm": 1.640597939491272, + "learning_rate": 1.228917583481307e-05, + "loss": 0.8153, + "step": 13086 + }, + { + "epoch": 0.6372556180459182, + "grad_norm": 4.700300216674805, + "learning_rate": 1.2286265425113761e-05, + "loss": 0.7801, + "step": 13087 + }, + { + "epoch": 0.6373043118350254, + "grad_norm": 1.8924797773361206, + "learning_rate": 1.2283355207294294e-05, + "loss": 0.8362, + "step": 13088 + }, + { + "epoch": 0.6373530056241327, + "grad_norm": 1.5348910093307495, + "learning_rate": 1.2280445181427075e-05, + "loss": 0.8319, + "step": 13089 + }, + { + "epoch": 0.6374016994132399, + "grad_norm": 1.2874445915222168, + "learning_rate": 1.2277535347584481e-05, + "loss": 0.7483, + "step": 13090 + }, + { + "epoch": 0.6374503932023471, + "grad_norm": 1.4021316766738892, + "learning_rate": 1.2274625705838909e-05, + "loss": 0.7764, + "step": 13091 + }, + { + "epoch": 0.6374990869914542, + "grad_norm": 7.562536716461182, + "learning_rate": 1.2271716256262715e-05, + "loss": 0.8033, + "step": 13092 + }, + { + "epoch": 0.6375477807805614, + "grad_norm": 1.7064381837844849, + "learning_rate": 1.2268806998928296e-05, + "loss": 0.8349, + "step": 13093 + }, + { + "epoch": 0.6375964745696686, + "grad_norm": 1.6626038551330566, + "learning_rate": 1.2265897933908e-05, + "loss": 0.8715, + "step": 13094 + }, + { + "epoch": 0.6376451683587758, + "grad_norm": 0.10015548020601273, + "learning_rate": 1.2262989061274198e-05, + "loss": 0.5955, + "step": 13095 + }, + { + "epoch": 0.637693862147883, + "grad_norm": 1.4014077186584473, + "learning_rate": 1.2260080381099252e-05, + "loss": 0.857, + "step": 13096 + }, + { + "epoch": 0.6377425559369903, + "grad_norm": 1.462506890296936, + "learning_rate": 1.2257171893455509e-05, + "loss": 0.8455, + "step": 13097 + }, + { + "epoch": 0.6377912497260975, + "grad_norm": 2.0082225799560547, + "learning_rate": 1.2254263598415324e-05, + "loss": 0.8111, + "step": 13098 + }, + { + "epoch": 0.6378399435152047, + "grad_norm": 0.0922820046544075, + "learning_rate": 1.2251355496051032e-05, + "loss": 0.5785, + "step": 13099 + }, + { + "epoch": 0.6378886373043119, + "grad_norm": 2.322936534881592, + "learning_rate": 1.2248447586434989e-05, + "loss": 0.8505, + "step": 13100 + }, + { + "epoch": 0.637937331093419, + "grad_norm": 2.301658868789673, + "learning_rate": 1.224553986963951e-05, + "loss": 0.8696, + "step": 13101 + }, + { + "epoch": 0.6379860248825262, + "grad_norm": 2.221386432647705, + "learning_rate": 1.2242632345736943e-05, + "loss": 0.8822, + "step": 13102 + }, + { + "epoch": 0.6380347186716334, + "grad_norm": 0.08869868516921997, + "learning_rate": 1.2239725014799593e-05, + "loss": 0.5683, + "step": 13103 + }, + { + "epoch": 0.6380834124607406, + "grad_norm": 1.4807811975479126, + "learning_rate": 1.2236817876899796e-05, + "loss": 0.7992, + "step": 13104 + }, + { + "epoch": 0.6381321062498478, + "grad_norm": 2.2458128929138184, + "learning_rate": 1.223391093210986e-05, + "loss": 0.7956, + "step": 13105 + }, + { + "epoch": 0.638180800038955, + "grad_norm": 1.6301226615905762, + "learning_rate": 1.2231004180502097e-05, + "loss": 0.7453, + "step": 13106 + }, + { + "epoch": 0.6382294938280623, + "grad_norm": 1.4856908321380615, + "learning_rate": 1.222809762214882e-05, + "loss": 0.8576, + "step": 13107 + }, + { + "epoch": 0.6382781876171695, + "grad_norm": 1.7171059846878052, + "learning_rate": 1.2225191257122316e-05, + "loss": 0.8294, + "step": 13108 + }, + { + "epoch": 0.6383268814062766, + "grad_norm": 1.3938237428665161, + "learning_rate": 1.2222285085494897e-05, + "loss": 0.8094, + "step": 13109 + }, + { + "epoch": 0.6383755751953838, + "grad_norm": 0.09768743813037872, + "learning_rate": 1.2219379107338835e-05, + "loss": 0.5869, + "step": 13110 + }, + { + "epoch": 0.638424268984491, + "grad_norm": 2.051440477371216, + "learning_rate": 1.2216473322726434e-05, + "loss": 0.7673, + "step": 13111 + }, + { + "epoch": 0.6384729627735982, + "grad_norm": 0.0894286185503006, + "learning_rate": 1.2213567731729965e-05, + "loss": 0.561, + "step": 13112 + }, + { + "epoch": 0.6385216565627054, + "grad_norm": 1.652233362197876, + "learning_rate": 1.2210662334421714e-05, + "loss": 0.8398, + "step": 13113 + }, + { + "epoch": 0.6385703503518126, + "grad_norm": 1.5116288661956787, + "learning_rate": 1.2207757130873944e-05, + "loss": 0.7694, + "step": 13114 + }, + { + "epoch": 0.6386190441409199, + "grad_norm": 1.9844131469726562, + "learning_rate": 1.2204852121158921e-05, + "loss": 0.7915, + "step": 13115 + }, + { + "epoch": 0.6386677379300271, + "grad_norm": 1.5370643138885498, + "learning_rate": 1.220194730534892e-05, + "loss": 0.8515, + "step": 13116 + }, + { + "epoch": 0.6387164317191342, + "grad_norm": 1.344353199005127, + "learning_rate": 1.2199042683516187e-05, + "loss": 0.8889, + "step": 13117 + }, + { + "epoch": 0.6387651255082414, + "grad_norm": 1.4942046403884888, + "learning_rate": 1.2196138255732981e-05, + "loss": 0.8438, + "step": 13118 + }, + { + "epoch": 0.6388138192973486, + "grad_norm": 1.5323009490966797, + "learning_rate": 1.219323402207154e-05, + "loss": 0.7436, + "step": 13119 + }, + { + "epoch": 0.6388625130864558, + "grad_norm": 1.903167963027954, + "learning_rate": 1.2190329982604125e-05, + "loss": 0.8461, + "step": 13120 + }, + { + "epoch": 0.638911206875563, + "grad_norm": 1.919772982597351, + "learning_rate": 1.2187426137402958e-05, + "loss": 0.8434, + "step": 13121 + }, + { + "epoch": 0.6389599006646702, + "grad_norm": 1.5690672397613525, + "learning_rate": 1.2184522486540284e-05, + "loss": 0.8768, + "step": 13122 + }, + { + "epoch": 0.6390085944537774, + "grad_norm": 2.7245876789093018, + "learning_rate": 1.2181619030088319e-05, + "loss": 0.8308, + "step": 13123 + }, + { + "epoch": 0.6390572882428847, + "grad_norm": 1.7801494598388672, + "learning_rate": 1.2178715768119296e-05, + "loss": 0.8332, + "step": 13124 + }, + { + "epoch": 0.6391059820319919, + "grad_norm": 3.4575722217559814, + "learning_rate": 1.2175812700705431e-05, + "loss": 0.8094, + "step": 13125 + }, + { + "epoch": 0.639154675821099, + "grad_norm": 2.89005446434021, + "learning_rate": 1.2172909827918935e-05, + "loss": 0.7578, + "step": 13126 + }, + { + "epoch": 0.6392033696102062, + "grad_norm": 2.251772403717041, + "learning_rate": 1.2170007149832027e-05, + "loss": 0.7468, + "step": 13127 + }, + { + "epoch": 0.6392520633993134, + "grad_norm": 1.5438785552978516, + "learning_rate": 1.2167104666516897e-05, + "loss": 0.8578, + "step": 13128 + }, + { + "epoch": 0.6393007571884206, + "grad_norm": 2.4456441402435303, + "learning_rate": 1.2164202378045765e-05, + "loss": 0.8062, + "step": 13129 + }, + { + "epoch": 0.6393494509775278, + "grad_norm": 1.449489951133728, + "learning_rate": 1.2161300284490803e-05, + "loss": 0.8342, + "step": 13130 + }, + { + "epoch": 0.639398144766635, + "grad_norm": 4.773652076721191, + "learning_rate": 1.2158398385924218e-05, + "loss": 0.8474, + "step": 13131 + }, + { + "epoch": 0.6394468385557422, + "grad_norm": 1.2167558670043945, + "learning_rate": 1.2155496682418182e-05, + "loss": 0.7776, + "step": 13132 + }, + { + "epoch": 0.6394955323448495, + "grad_norm": 1.8379653692245483, + "learning_rate": 1.2152595174044883e-05, + "loss": 0.8109, + "step": 13133 + }, + { + "epoch": 0.6395442261339566, + "grad_norm": 2.0136334896087646, + "learning_rate": 1.214969386087649e-05, + "loss": 0.8643, + "step": 13134 + }, + { + "epoch": 0.6395929199230638, + "grad_norm": 1.6872406005859375, + "learning_rate": 1.214679274298518e-05, + "loss": 0.8152, + "step": 13135 + }, + { + "epoch": 0.639641613712171, + "grad_norm": 2.017303705215454, + "learning_rate": 1.2143891820443126e-05, + "loss": 0.8042, + "step": 13136 + }, + { + "epoch": 0.6396903075012782, + "grad_norm": 1.4281097650527954, + "learning_rate": 1.214099109332247e-05, + "loss": 0.8487, + "step": 13137 + }, + { + "epoch": 0.6397390012903854, + "grad_norm": 1.674707293510437, + "learning_rate": 1.2138090561695383e-05, + "loss": 0.8193, + "step": 13138 + }, + { + "epoch": 0.6397876950794926, + "grad_norm": 1.9863072633743286, + "learning_rate": 1.2135190225634007e-05, + "loss": 0.9235, + "step": 13139 + }, + { + "epoch": 0.6398363888685998, + "grad_norm": 1.8615491390228271, + "learning_rate": 1.213229008521049e-05, + "loss": 0.8089, + "step": 13140 + }, + { + "epoch": 0.639885082657707, + "grad_norm": 1.6506867408752441, + "learning_rate": 1.2129390140496973e-05, + "loss": 0.874, + "step": 13141 + }, + { + "epoch": 0.6399337764468143, + "grad_norm": 2.7492423057556152, + "learning_rate": 1.2126490391565602e-05, + "loss": 0.8393, + "step": 13142 + }, + { + "epoch": 0.6399824702359214, + "grad_norm": 1.1769241094589233, + "learning_rate": 1.212359083848849e-05, + "loss": 0.8481, + "step": 13143 + }, + { + "epoch": 0.6400311640250286, + "grad_norm": 3.0061304569244385, + "learning_rate": 1.2120691481337785e-05, + "loss": 0.8623, + "step": 13144 + }, + { + "epoch": 0.6400798578141358, + "grad_norm": 2.9925270080566406, + "learning_rate": 1.211779232018559e-05, + "loss": 0.8559, + "step": 13145 + }, + { + "epoch": 0.640128551603243, + "grad_norm": 1.7218778133392334, + "learning_rate": 1.2114893355104027e-05, + "loss": 0.8261, + "step": 13146 + }, + { + "epoch": 0.6401772453923502, + "grad_norm": 2.61789870262146, + "learning_rate": 1.2111994586165216e-05, + "loss": 0.7813, + "step": 13147 + }, + { + "epoch": 0.6402259391814574, + "grad_norm": 1.6333760023117065, + "learning_rate": 1.2109096013441259e-05, + "loss": 0.8645, + "step": 13148 + }, + { + "epoch": 0.6402746329705646, + "grad_norm": 0.09487617760896683, + "learning_rate": 1.210619763700426e-05, + "loss": 0.6611, + "step": 13149 + }, + { + "epoch": 0.6403233267596719, + "grad_norm": 1.9822667837142944, + "learning_rate": 1.2103299456926308e-05, + "loss": 0.8373, + "step": 13150 + }, + { + "epoch": 0.640372020548779, + "grad_norm": 1.954815149307251, + "learning_rate": 1.210040147327951e-05, + "loss": 0.8095, + "step": 13151 + }, + { + "epoch": 0.6404207143378862, + "grad_norm": 2.2469608783721924, + "learning_rate": 1.209750368613594e-05, + "loss": 0.8502, + "step": 13152 + }, + { + "epoch": 0.6404694081269934, + "grad_norm": 1.5978658199310303, + "learning_rate": 1.209460609556769e-05, + "loss": 0.7578, + "step": 13153 + }, + { + "epoch": 0.6405181019161006, + "grad_norm": 1.7311522960662842, + "learning_rate": 1.209170870164683e-05, + "loss": 0.7759, + "step": 13154 + }, + { + "epoch": 0.6405667957052078, + "grad_norm": 1.7080614566802979, + "learning_rate": 1.2088811504445434e-05, + "loss": 0.7762, + "step": 13155 + }, + { + "epoch": 0.640615489494315, + "grad_norm": 2.2442047595977783, + "learning_rate": 1.208591450403558e-05, + "loss": 0.8086, + "step": 13156 + }, + { + "epoch": 0.6406641832834222, + "grad_norm": 2.085887908935547, + "learning_rate": 1.208301770048932e-05, + "loss": 0.849, + "step": 13157 + }, + { + "epoch": 0.6407128770725294, + "grad_norm": 2.00968861579895, + "learning_rate": 1.2080121093878722e-05, + "loss": 0.7515, + "step": 13158 + }, + { + "epoch": 0.6407615708616367, + "grad_norm": 1.532415509223938, + "learning_rate": 1.207722468427583e-05, + "loss": 0.8094, + "step": 13159 + }, + { + "epoch": 0.6408102646507438, + "grad_norm": 1.4283517599105835, + "learning_rate": 1.2074328471752703e-05, + "loss": 0.7955, + "step": 13160 + }, + { + "epoch": 0.640858958439851, + "grad_norm": 1.5735650062561035, + "learning_rate": 1.2071432456381369e-05, + "loss": 0.8725, + "step": 13161 + }, + { + "epoch": 0.6409076522289582, + "grad_norm": 1.869931697845459, + "learning_rate": 1.206853663823388e-05, + "loss": 0.8109, + "step": 13162 + }, + { + "epoch": 0.6409563460180654, + "grad_norm": 1.3784044981002808, + "learning_rate": 1.2065641017382259e-05, + "loss": 0.8206, + "step": 13163 + }, + { + "epoch": 0.6410050398071726, + "grad_norm": 1.8601950407028198, + "learning_rate": 1.206274559389855e-05, + "loss": 0.7698, + "step": 13164 + }, + { + "epoch": 0.6410537335962798, + "grad_norm": 2.3000574111938477, + "learning_rate": 1.2059850367854762e-05, + "loss": 0.7639, + "step": 13165 + }, + { + "epoch": 0.641102427385387, + "grad_norm": 1.9527971744537354, + "learning_rate": 1.2056955339322918e-05, + "loss": 0.9315, + "step": 13166 + }, + { + "epoch": 0.6411511211744942, + "grad_norm": 2.073335647583008, + "learning_rate": 1.2054060508375046e-05, + "loss": 0.7763, + "step": 13167 + }, + { + "epoch": 0.6411998149636013, + "grad_norm": 1.4954490661621094, + "learning_rate": 1.2051165875083131e-05, + "loss": 0.8077, + "step": 13168 + }, + { + "epoch": 0.6412485087527086, + "grad_norm": 1.463761568069458, + "learning_rate": 1.2048271439519195e-05, + "loss": 0.7301, + "step": 13169 + }, + { + "epoch": 0.6412972025418158, + "grad_norm": 1.406756043434143, + "learning_rate": 1.2045377201755227e-05, + "loss": 0.7708, + "step": 13170 + }, + { + "epoch": 0.641345896330923, + "grad_norm": 1.7023876905441284, + "learning_rate": 1.2042483161863236e-05, + "loss": 0.8085, + "step": 13171 + }, + { + "epoch": 0.6413945901200302, + "grad_norm": 2.1553914546966553, + "learning_rate": 1.2039589319915192e-05, + "loss": 0.9215, + "step": 13172 + }, + { + "epoch": 0.6414432839091374, + "grad_norm": 1.802588939666748, + "learning_rate": 1.20366956759831e-05, + "loss": 0.8738, + "step": 13173 + }, + { + "epoch": 0.6414919776982446, + "grad_norm": 1.794388771057129, + "learning_rate": 1.203380223013892e-05, + "loss": 0.808, + "step": 13174 + }, + { + "epoch": 0.6415406714873518, + "grad_norm": 1.5287175178527832, + "learning_rate": 1.2030908982454637e-05, + "loss": 0.8663, + "step": 13175 + }, + { + "epoch": 0.6415893652764589, + "grad_norm": 1.8864761590957642, + "learning_rate": 1.2028015933002221e-05, + "loss": 0.7986, + "step": 13176 + }, + { + "epoch": 0.6416380590655661, + "grad_norm": 2.9581151008605957, + "learning_rate": 1.2025123081853638e-05, + "loss": 0.7622, + "step": 13177 + }, + { + "epoch": 0.6416867528546734, + "grad_norm": 1.6552683115005493, + "learning_rate": 1.2022230429080843e-05, + "loss": 0.7679, + "step": 13178 + }, + { + "epoch": 0.6417354466437806, + "grad_norm": 0.09614988416433334, + "learning_rate": 1.2019337974755792e-05, + "loss": 0.6542, + "step": 13179 + }, + { + "epoch": 0.6417841404328878, + "grad_norm": 2.5986146926879883, + "learning_rate": 1.2016445718950443e-05, + "loss": 0.7771, + "step": 13180 + }, + { + "epoch": 0.641832834221995, + "grad_norm": 1.7665290832519531, + "learning_rate": 1.2013553661736727e-05, + "loss": 0.8669, + "step": 13181 + }, + { + "epoch": 0.6418815280111022, + "grad_norm": 0.10054504871368408, + "learning_rate": 1.20106618031866e-05, + "loss": 0.6837, + "step": 13182 + }, + { + "epoch": 0.6419302218002094, + "grad_norm": 1.9994678497314453, + "learning_rate": 1.2007770143371982e-05, + "loss": 0.9147, + "step": 13183 + }, + { + "epoch": 0.6419789155893166, + "grad_norm": 1.641708493232727, + "learning_rate": 1.2004878682364808e-05, + "loss": 0.7337, + "step": 13184 + }, + { + "epoch": 0.6420276093784237, + "grad_norm": 1.6695858240127563, + "learning_rate": 1.2001987420237012e-05, + "loss": 0.9156, + "step": 13185 + }, + { + "epoch": 0.642076303167531, + "grad_norm": 2.7284092903137207, + "learning_rate": 1.1999096357060504e-05, + "loss": 0.7864, + "step": 13186 + }, + { + "epoch": 0.6421249969566382, + "grad_norm": 1.8464800119400024, + "learning_rate": 1.199620549290721e-05, + "loss": 0.7857, + "step": 13187 + }, + { + "epoch": 0.6421736907457454, + "grad_norm": 1.7249889373779297, + "learning_rate": 1.1993314827849027e-05, + "loss": 0.7959, + "step": 13188 + }, + { + "epoch": 0.6422223845348526, + "grad_norm": 1.6794307231903076, + "learning_rate": 1.1990424361957875e-05, + "loss": 0.9039, + "step": 13189 + }, + { + "epoch": 0.6422710783239598, + "grad_norm": 1.5128284692764282, + "learning_rate": 1.1987534095305641e-05, + "loss": 0.9041, + "step": 13190 + }, + { + "epoch": 0.642319772113067, + "grad_norm": 1.526984691619873, + "learning_rate": 1.198464402796423e-05, + "loss": 0.8697, + "step": 13191 + }, + { + "epoch": 0.6423684659021742, + "grad_norm": 0.09709173440933228, + "learning_rate": 1.1981754160005525e-05, + "loss": 0.6494, + "step": 13192 + }, + { + "epoch": 0.6424171596912813, + "grad_norm": 1.5993516445159912, + "learning_rate": 1.1978864491501425e-05, + "loss": 0.905, + "step": 13193 + }, + { + "epoch": 0.6424658534803885, + "grad_norm": 1.519407033920288, + "learning_rate": 1.1975975022523794e-05, + "loss": 0.7197, + "step": 13194 + }, + { + "epoch": 0.6425145472694958, + "grad_norm": 1.577402114868164, + "learning_rate": 1.1973085753144515e-05, + "loss": 0.8141, + "step": 13195 + }, + { + "epoch": 0.642563241058603, + "grad_norm": 1.1411426067352295, + "learning_rate": 1.1970196683435469e-05, + "loss": 0.7341, + "step": 13196 + }, + { + "epoch": 0.6426119348477102, + "grad_norm": 1.4406704902648926, + "learning_rate": 1.1967307813468505e-05, + "loss": 0.7941, + "step": 13197 + }, + { + "epoch": 0.6426606286368174, + "grad_norm": 1.4404702186584473, + "learning_rate": 1.1964419143315493e-05, + "loss": 0.8053, + "step": 13198 + }, + { + "epoch": 0.6427093224259246, + "grad_norm": 0.10448712855577469, + "learning_rate": 1.1961530673048288e-05, + "loss": 0.5911, + "step": 13199 + }, + { + "epoch": 0.6427580162150318, + "grad_norm": 1.2786370515823364, + "learning_rate": 1.1958642402738744e-05, + "loss": 0.7826, + "step": 13200 + }, + { + "epoch": 0.642806710004139, + "grad_norm": 2.527435064315796, + "learning_rate": 1.1955754332458698e-05, + "loss": 0.8322, + "step": 13201 + }, + { + "epoch": 0.6428554037932461, + "grad_norm": 1.8831006288528442, + "learning_rate": 1.1952866462280004e-05, + "loss": 0.8704, + "step": 13202 + }, + { + "epoch": 0.6429040975823533, + "grad_norm": 1.2964746952056885, + "learning_rate": 1.1949978792274482e-05, + "loss": 0.7595, + "step": 13203 + }, + { + "epoch": 0.6429527913714606, + "grad_norm": 2.466613292694092, + "learning_rate": 1.1947091322513971e-05, + "loss": 0.9151, + "step": 13204 + }, + { + "epoch": 0.6430014851605678, + "grad_norm": 1.5082608461380005, + "learning_rate": 1.1944204053070305e-05, + "loss": 0.8394, + "step": 13205 + }, + { + "epoch": 0.643050178949675, + "grad_norm": 1.5360162258148193, + "learning_rate": 1.1941316984015292e-05, + "loss": 0.8105, + "step": 13206 + }, + { + "epoch": 0.6430988727387822, + "grad_norm": 1.471465826034546, + "learning_rate": 1.1938430115420756e-05, + "loss": 0.7932, + "step": 13207 + }, + { + "epoch": 0.6431475665278894, + "grad_norm": 1.960453748703003, + "learning_rate": 1.1935543447358502e-05, + "loss": 0.8172, + "step": 13208 + }, + { + "epoch": 0.6431962603169966, + "grad_norm": 1.3204536437988281, + "learning_rate": 1.193265697990035e-05, + "loss": 0.9105, + "step": 13209 + }, + { + "epoch": 0.6432449541061037, + "grad_norm": 1.4394081830978394, + "learning_rate": 1.192977071311808e-05, + "loss": 0.8394, + "step": 13210 + }, + { + "epoch": 0.6432936478952109, + "grad_norm": 2.783690929412842, + "learning_rate": 1.1926884647083509e-05, + "loss": 0.842, + "step": 13211 + }, + { + "epoch": 0.6433423416843181, + "grad_norm": 1.3923879861831665, + "learning_rate": 1.1923998781868411e-05, + "loss": 0.7876, + "step": 13212 + }, + { + "epoch": 0.6433910354734254, + "grad_norm": 1.928536057472229, + "learning_rate": 1.1921113117544582e-05, + "loss": 0.7825, + "step": 13213 + }, + { + "epoch": 0.6434397292625326, + "grad_norm": 1.5291001796722412, + "learning_rate": 1.19182276541838e-05, + "loss": 0.794, + "step": 13214 + }, + { + "epoch": 0.6434884230516398, + "grad_norm": 1.6527042388916016, + "learning_rate": 1.191534239185784e-05, + "loss": 0.874, + "step": 13215 + }, + { + "epoch": 0.643537116840747, + "grad_norm": 2.60685396194458, + "learning_rate": 1.1912457330638483e-05, + "loss": 0.8017, + "step": 13216 + }, + { + "epoch": 0.6435858106298542, + "grad_norm": 1.4716613292694092, + "learning_rate": 1.1909572470597479e-05, + "loss": 0.7243, + "step": 13217 + }, + { + "epoch": 0.6436345044189613, + "grad_norm": 1.329402208328247, + "learning_rate": 1.1906687811806606e-05, + "loss": 0.7962, + "step": 13218 + }, + { + "epoch": 0.6436831982080685, + "grad_norm": 1.351170301437378, + "learning_rate": 1.1903803354337601e-05, + "loss": 0.8515, + "step": 13219 + }, + { + "epoch": 0.6437318919971757, + "grad_norm": 1.6237354278564453, + "learning_rate": 1.1900919098262234e-05, + "loss": 0.7048, + "step": 13220 + }, + { + "epoch": 0.6437805857862829, + "grad_norm": 1.3952534198760986, + "learning_rate": 1.1898035043652237e-05, + "loss": 0.8192, + "step": 13221 + }, + { + "epoch": 0.6438292795753902, + "grad_norm": 1.5951818227767944, + "learning_rate": 1.1895151190579366e-05, + "loss": 0.7582, + "step": 13222 + }, + { + "epoch": 0.6438779733644974, + "grad_norm": 1.3795958757400513, + "learning_rate": 1.1892267539115338e-05, + "loss": 0.8531, + "step": 13223 + }, + { + "epoch": 0.6439266671536046, + "grad_norm": 1.7659704685211182, + "learning_rate": 1.1889384089331894e-05, + "loss": 0.7888, + "step": 13224 + }, + { + "epoch": 0.6439753609427118, + "grad_norm": 0.10764630138874054, + "learning_rate": 1.1886500841300769e-05, + "loss": 0.5828, + "step": 13225 + }, + { + "epoch": 0.644024054731819, + "grad_norm": 3.3165838718414307, + "learning_rate": 1.1883617795093669e-05, + "loss": 0.9475, + "step": 13226 + }, + { + "epoch": 0.6440727485209261, + "grad_norm": 1.4260263442993164, + "learning_rate": 1.188073495078232e-05, + "loss": 0.7836, + "step": 13227 + }, + { + "epoch": 0.6441214423100333, + "grad_norm": 1.1043474674224854, + "learning_rate": 1.1877852308438424e-05, + "loss": 0.7935, + "step": 13228 + }, + { + "epoch": 0.6441701360991405, + "grad_norm": 1.4516180753707886, + "learning_rate": 1.1874969868133701e-05, + "loss": 0.7754, + "step": 13229 + }, + { + "epoch": 0.6442188298882477, + "grad_norm": 1.8196096420288086, + "learning_rate": 1.1872087629939837e-05, + "loss": 0.8463, + "step": 13230 + }, + { + "epoch": 0.644267523677355, + "grad_norm": 0.0914660170674324, + "learning_rate": 1.1869205593928539e-05, + "loss": 0.6383, + "step": 13231 + }, + { + "epoch": 0.6443162174664622, + "grad_norm": 1.7940702438354492, + "learning_rate": 1.1866323760171485e-05, + "loss": 0.7635, + "step": 13232 + }, + { + "epoch": 0.6443649112555694, + "grad_norm": 1.506637692451477, + "learning_rate": 1.1863442128740377e-05, + "loss": 0.8379, + "step": 13233 + }, + { + "epoch": 0.6444136050446766, + "grad_norm": 1.6169723272323608, + "learning_rate": 1.1860560699706887e-05, + "loss": 0.8747, + "step": 13234 + }, + { + "epoch": 0.6444622988337837, + "grad_norm": 1.439874291419983, + "learning_rate": 1.1857679473142685e-05, + "loss": 0.7801, + "step": 13235 + }, + { + "epoch": 0.6445109926228909, + "grad_norm": 2.1468193531036377, + "learning_rate": 1.1854798449119454e-05, + "loss": 0.7456, + "step": 13236 + }, + { + "epoch": 0.6445596864119981, + "grad_norm": 0.0960717424750328, + "learning_rate": 1.1851917627708848e-05, + "loss": 0.584, + "step": 13237 + }, + { + "epoch": 0.6446083802011053, + "grad_norm": 1.6555507183074951, + "learning_rate": 1.1849037008982546e-05, + "loss": 0.8201, + "step": 13238 + }, + { + "epoch": 0.6446570739902125, + "grad_norm": 1.2911031246185303, + "learning_rate": 1.1846156593012181e-05, + "loss": 0.8225, + "step": 13239 + }, + { + "epoch": 0.6447057677793198, + "grad_norm": 1.707776427268982, + "learning_rate": 1.1843276379869423e-05, + "loss": 0.8296, + "step": 13240 + }, + { + "epoch": 0.644754461568427, + "grad_norm": 2.3029444217681885, + "learning_rate": 1.1840396369625901e-05, + "loss": 0.8322, + "step": 13241 + }, + { + "epoch": 0.6448031553575342, + "grad_norm": 1.920235276222229, + "learning_rate": 1.1837516562353268e-05, + "loss": 0.9034, + "step": 13242 + }, + { + "epoch": 0.6448518491466414, + "grad_norm": 1.7753524780273438, + "learning_rate": 1.1834636958123152e-05, + "loss": 0.8748, + "step": 13243 + }, + { + "epoch": 0.6449005429357485, + "grad_norm": 1.3505414724349976, + "learning_rate": 1.1831757557007182e-05, + "loss": 0.9201, + "step": 13244 + }, + { + "epoch": 0.6449492367248557, + "grad_norm": 1.3702964782714844, + "learning_rate": 1.1828878359076996e-05, + "loss": 0.7333, + "step": 13245 + }, + { + "epoch": 0.6449979305139629, + "grad_norm": 1.4895384311676025, + "learning_rate": 1.1825999364404201e-05, + "loss": 0.8198, + "step": 13246 + }, + { + "epoch": 0.6450466243030701, + "grad_norm": 1.3797231912612915, + "learning_rate": 1.1823120573060423e-05, + "loss": 0.8354, + "step": 13247 + }, + { + "epoch": 0.6450953180921773, + "grad_norm": 1.8404115438461304, + "learning_rate": 1.1820241985117257e-05, + "loss": 0.7314, + "step": 13248 + }, + { + "epoch": 0.6451440118812846, + "grad_norm": 1.5260878801345825, + "learning_rate": 1.1817363600646323e-05, + "loss": 0.7319, + "step": 13249 + }, + { + "epoch": 0.6451927056703918, + "grad_norm": 1.5915167331695557, + "learning_rate": 1.1814485419719212e-05, + "loss": 0.7361, + "step": 13250 + }, + { + "epoch": 0.645241399459499, + "grad_norm": 1.535129427909851, + "learning_rate": 1.1811607442407529e-05, + "loss": 0.7989, + "step": 13251 + }, + { + "epoch": 0.6452900932486061, + "grad_norm": 1.1682652235031128, + "learning_rate": 1.1808729668782853e-05, + "loss": 0.7341, + "step": 13252 + }, + { + "epoch": 0.6453387870377133, + "grad_norm": 1.062943935394287, + "learning_rate": 1.1805852098916777e-05, + "loss": 0.7829, + "step": 13253 + }, + { + "epoch": 0.6453874808268205, + "grad_norm": 2.0658977031707764, + "learning_rate": 1.1802974732880872e-05, + "loss": 0.7367, + "step": 13254 + }, + { + "epoch": 0.6454361746159277, + "grad_norm": 1.7028671503067017, + "learning_rate": 1.1800097570746716e-05, + "loss": 0.8349, + "step": 13255 + }, + { + "epoch": 0.6454848684050349, + "grad_norm": 1.475492000579834, + "learning_rate": 1.1797220612585885e-05, + "loss": 0.8083, + "step": 13256 + }, + { + "epoch": 0.6455335621941422, + "grad_norm": 2.444688320159912, + "learning_rate": 1.1794343858469941e-05, + "loss": 0.7132, + "step": 13257 + }, + { + "epoch": 0.6455822559832494, + "grad_norm": 1.096661925315857, + "learning_rate": 1.1791467308470441e-05, + "loss": 0.8175, + "step": 13258 + }, + { + "epoch": 0.6456309497723566, + "grad_norm": 1.47635018825531, + "learning_rate": 1.1788590962658935e-05, + "loss": 0.8486, + "step": 13259 + }, + { + "epoch": 0.6456796435614638, + "grad_norm": 2.5725576877593994, + "learning_rate": 1.1785714821106984e-05, + "loss": 0.8012, + "step": 13260 + }, + { + "epoch": 0.6457283373505709, + "grad_norm": 1.6881734132766724, + "learning_rate": 1.178283888388612e-05, + "loss": 0.8622, + "step": 13261 + }, + { + "epoch": 0.6457770311396781, + "grad_norm": 1.3980056047439575, + "learning_rate": 1.1779963151067892e-05, + "loss": 0.872, + "step": 13262 + }, + { + "epoch": 0.6458257249287853, + "grad_norm": 1.5850661993026733, + "learning_rate": 1.177708762272383e-05, + "loss": 0.8902, + "step": 13263 + }, + { + "epoch": 0.6458744187178925, + "grad_norm": 0.09480250626802444, + "learning_rate": 1.1774212298925458e-05, + "loss": 0.5537, + "step": 13264 + }, + { + "epoch": 0.6459231125069997, + "grad_norm": 1.8457682132720947, + "learning_rate": 1.1771337179744311e-05, + "loss": 0.7671, + "step": 13265 + }, + { + "epoch": 0.645971806296107, + "grad_norm": 1.529435634613037, + "learning_rate": 1.1768462265251896e-05, + "loss": 0.9298, + "step": 13266 + }, + { + "epoch": 0.6460205000852142, + "grad_norm": 1.343915581703186, + "learning_rate": 1.1765587555519742e-05, + "loss": 0.8457, + "step": 13267 + }, + { + "epoch": 0.6460691938743214, + "grad_norm": 3.0192854404449463, + "learning_rate": 1.1762713050619338e-05, + "loss": 0.7933, + "step": 13268 + }, + { + "epoch": 0.6461178876634285, + "grad_norm": 1.6286945343017578, + "learning_rate": 1.175983875062221e-05, + "loss": 0.7471, + "step": 13269 + }, + { + "epoch": 0.6461665814525357, + "grad_norm": 2.9607341289520264, + "learning_rate": 1.1756964655599834e-05, + "loss": 0.7978, + "step": 13270 + }, + { + "epoch": 0.6462152752416429, + "grad_norm": 1.1800559759140015, + "learning_rate": 1.1754090765623719e-05, + "loss": 0.7904, + "step": 13271 + }, + { + "epoch": 0.6462639690307501, + "grad_norm": 1.6289478540420532, + "learning_rate": 1.1751217080765342e-05, + "loss": 0.9242, + "step": 13272 + }, + { + "epoch": 0.6463126628198573, + "grad_norm": 1.502854585647583, + "learning_rate": 1.1748343601096192e-05, + "loss": 0.8662, + "step": 13273 + }, + { + "epoch": 0.6463613566089645, + "grad_norm": 1.2912102937698364, + "learning_rate": 1.1745470326687757e-05, + "loss": 0.7777, + "step": 13274 + }, + { + "epoch": 0.6464100503980718, + "grad_norm": 1.6211357116699219, + "learning_rate": 1.1742597257611491e-05, + "loss": 0.771, + "step": 13275 + }, + { + "epoch": 0.646458744187179, + "grad_norm": 3.3165504932403564, + "learning_rate": 1.173972439393888e-05, + "loss": 0.804, + "step": 13276 + }, + { + "epoch": 0.6465074379762861, + "grad_norm": 3.1292121410369873, + "learning_rate": 1.173685173574137e-05, + "loss": 0.9241, + "step": 13277 + }, + { + "epoch": 0.6465561317653933, + "grad_norm": 1.768100380897522, + "learning_rate": 1.1733979283090431e-05, + "loss": 0.8238, + "step": 13278 + }, + { + "epoch": 0.6466048255545005, + "grad_norm": 1.4911494255065918, + "learning_rate": 1.1731107036057506e-05, + "loss": 0.8665, + "step": 13279 + }, + { + "epoch": 0.6466535193436077, + "grad_norm": 1.086344838142395, + "learning_rate": 1.1728234994714053e-05, + "loss": 0.8246, + "step": 13280 + }, + { + "epoch": 0.6467022131327149, + "grad_norm": 1.5046228170394897, + "learning_rate": 1.1725363159131504e-05, + "loss": 0.7442, + "step": 13281 + }, + { + "epoch": 0.6467509069218221, + "grad_norm": 1.9129966497421265, + "learning_rate": 1.172249152938131e-05, + "loss": 0.8854, + "step": 13282 + }, + { + "epoch": 0.6467996007109293, + "grad_norm": 2.3480536937713623, + "learning_rate": 1.1719620105534884e-05, + "loss": 0.8234, + "step": 13283 + }, + { + "epoch": 0.6468482945000366, + "grad_norm": 1.5109875202178955, + "learning_rate": 1.1716748887663663e-05, + "loss": 0.8995, + "step": 13284 + }, + { + "epoch": 0.6468969882891438, + "grad_norm": 1.71047043800354, + "learning_rate": 1.1713877875839073e-05, + "loss": 0.7396, + "step": 13285 + }, + { + "epoch": 0.6469456820782509, + "grad_norm": 1.7421993017196655, + "learning_rate": 1.1711007070132525e-05, + "loss": 0.7807, + "step": 13286 + }, + { + "epoch": 0.6469943758673581, + "grad_norm": 0.09792868793010712, + "learning_rate": 1.1708136470615431e-05, + "loss": 0.5841, + "step": 13287 + }, + { + "epoch": 0.6470430696564653, + "grad_norm": 1.5850414037704468, + "learning_rate": 1.1705266077359197e-05, + "loss": 0.8404, + "step": 13288 + }, + { + "epoch": 0.6470917634455725, + "grad_norm": 2.39613676071167, + "learning_rate": 1.1702395890435232e-05, + "loss": 0.8441, + "step": 13289 + }, + { + "epoch": 0.6471404572346797, + "grad_norm": 1.8222345113754272, + "learning_rate": 1.1699525909914917e-05, + "loss": 0.8703, + "step": 13290 + }, + { + "epoch": 0.6471891510237869, + "grad_norm": 1.6739369630813599, + "learning_rate": 1.1696656135869664e-05, + "loss": 0.7412, + "step": 13291 + }, + { + "epoch": 0.6472378448128941, + "grad_norm": 1.5333532094955444, + "learning_rate": 1.1693786568370835e-05, + "loss": 0.7779, + "step": 13292 + }, + { + "epoch": 0.6472865386020014, + "grad_norm": 1.7299175262451172, + "learning_rate": 1.1690917207489823e-05, + "loss": 0.827, + "step": 13293 + }, + { + "epoch": 0.6473352323911085, + "grad_norm": 1.7265758514404297, + "learning_rate": 1.1688048053298009e-05, + "loss": 0.8399, + "step": 13294 + }, + { + "epoch": 0.6473839261802157, + "grad_norm": 1.5593544244766235, + "learning_rate": 1.168517910586675e-05, + "loss": 0.8285, + "step": 13295 + }, + { + "epoch": 0.6474326199693229, + "grad_norm": 1.1374523639678955, + "learning_rate": 1.1682310365267431e-05, + "loss": 0.7924, + "step": 13296 + }, + { + "epoch": 0.6474813137584301, + "grad_norm": 1.6177071332931519, + "learning_rate": 1.1679441831571388e-05, + "loss": 0.8417, + "step": 13297 + }, + { + "epoch": 0.6475300075475373, + "grad_norm": 1.5985859632492065, + "learning_rate": 1.1676573504849996e-05, + "loss": 0.8942, + "step": 13298 + }, + { + "epoch": 0.6475787013366445, + "grad_norm": 1.4154342412948608, + "learning_rate": 1.1673705385174592e-05, + "loss": 0.7542, + "step": 13299 + }, + { + "epoch": 0.6476273951257517, + "grad_norm": 1.3859704732894897, + "learning_rate": 1.167083747261653e-05, + "loss": 0.7883, + "step": 13300 + }, + { + "epoch": 0.647676088914859, + "grad_norm": 1.392511248588562, + "learning_rate": 1.1667969767247142e-05, + "loss": 0.8341, + "step": 13301 + }, + { + "epoch": 0.6477247827039662, + "grad_norm": 1.5703920125961304, + "learning_rate": 1.166510226913777e-05, + "loss": 0.7852, + "step": 13302 + }, + { + "epoch": 0.6477734764930733, + "grad_norm": 2.4144976139068604, + "learning_rate": 1.1662234978359734e-05, + "loss": 0.8633, + "step": 13303 + }, + { + "epoch": 0.6478221702821805, + "grad_norm": 1.4179322719573975, + "learning_rate": 1.1659367894984359e-05, + "loss": 0.816, + "step": 13304 + }, + { + "epoch": 0.6478708640712877, + "grad_norm": 1.2138522863388062, + "learning_rate": 1.1656501019082975e-05, + "loss": 0.8736, + "step": 13305 + }, + { + "epoch": 0.6479195578603949, + "grad_norm": 1.433180570602417, + "learning_rate": 1.1653634350726888e-05, + "loss": 0.8517, + "step": 13306 + }, + { + "epoch": 0.6479682516495021, + "grad_norm": 2.1383955478668213, + "learning_rate": 1.165076788998741e-05, + "loss": 0.8505, + "step": 13307 + }, + { + "epoch": 0.6480169454386093, + "grad_norm": 1.5512099266052246, + "learning_rate": 1.1647901636935839e-05, + "loss": 0.7484, + "step": 13308 + }, + { + "epoch": 0.6480656392277165, + "grad_norm": 1.1011391878128052, + "learning_rate": 1.1645035591643476e-05, + "loss": 0.8083, + "step": 13309 + }, + { + "epoch": 0.6481143330168238, + "grad_norm": 0.09097953140735626, + "learning_rate": 1.1642169754181616e-05, + "loss": 0.5986, + "step": 13310 + }, + { + "epoch": 0.6481630268059309, + "grad_norm": 1.4742275476455688, + "learning_rate": 1.1639304124621547e-05, + "loss": 0.8767, + "step": 13311 + }, + { + "epoch": 0.6482117205950381, + "grad_norm": 1.7851667404174805, + "learning_rate": 1.163643870303454e-05, + "loss": 0.819, + "step": 13312 + }, + { + "epoch": 0.6482604143841453, + "grad_norm": 1.9943028688430786, + "learning_rate": 1.1633573489491891e-05, + "loss": 0.8054, + "step": 13313 + }, + { + "epoch": 0.6483091081732525, + "grad_norm": 1.5298959016799927, + "learning_rate": 1.1630708484064866e-05, + "loss": 0.7928, + "step": 13314 + }, + { + "epoch": 0.6483578019623597, + "grad_norm": 1.5086091756820679, + "learning_rate": 1.1627843686824729e-05, + "loss": 0.8893, + "step": 13315 + }, + { + "epoch": 0.6484064957514669, + "grad_norm": 1.4237099885940552, + "learning_rate": 1.1624979097842743e-05, + "loss": 0.8444, + "step": 13316 + }, + { + "epoch": 0.6484551895405741, + "grad_norm": 1.42634916305542, + "learning_rate": 1.1622114717190166e-05, + "loss": 0.7734, + "step": 13317 + }, + { + "epoch": 0.6485038833296813, + "grad_norm": 1.3156819343566895, + "learning_rate": 1.1619250544938249e-05, + "loss": 0.8975, + "step": 13318 + }, + { + "epoch": 0.6485525771187886, + "grad_norm": 0.09564193338155746, + "learning_rate": 1.1616386581158232e-05, + "loss": 0.5706, + "step": 13319 + }, + { + "epoch": 0.6486012709078957, + "grad_norm": 1.8639023303985596, + "learning_rate": 1.1613522825921378e-05, + "loss": 0.8127, + "step": 13320 + }, + { + "epoch": 0.6486499646970029, + "grad_norm": 1.4244807958602905, + "learning_rate": 1.1610659279298892e-05, + "loss": 0.8999, + "step": 13321 + }, + { + "epoch": 0.6486986584861101, + "grad_norm": 1.7927498817443848, + "learning_rate": 1.1607795941362042e-05, + "loss": 0.859, + "step": 13322 + }, + { + "epoch": 0.6487473522752173, + "grad_norm": 2.3109982013702393, + "learning_rate": 1.1604932812182011e-05, + "loss": 0.8093, + "step": 13323 + }, + { + "epoch": 0.6487960460643245, + "grad_norm": 1.529978632926941, + "learning_rate": 1.1602069891830052e-05, + "loss": 0.8314, + "step": 13324 + }, + { + "epoch": 0.6488447398534317, + "grad_norm": 1.6343636512756348, + "learning_rate": 1.159920718037737e-05, + "loss": 0.8678, + "step": 13325 + }, + { + "epoch": 0.6488934336425389, + "grad_norm": 1.6197534799575806, + "learning_rate": 1.1596344677895175e-05, + "loss": 0.8034, + "step": 13326 + }, + { + "epoch": 0.6489421274316461, + "grad_norm": 1.5438549518585205, + "learning_rate": 1.1593482384454676e-05, + "loss": 0.7421, + "step": 13327 + }, + { + "epoch": 0.6489908212207532, + "grad_norm": 1.5355278253555298, + "learning_rate": 1.1590620300127058e-05, + "loss": 0.7981, + "step": 13328 + }, + { + "epoch": 0.6490395150098605, + "grad_norm": 3.14259672164917, + "learning_rate": 1.1587758424983541e-05, + "loss": 0.7492, + "step": 13329 + }, + { + "epoch": 0.6490882087989677, + "grad_norm": 1.8402607440948486, + "learning_rate": 1.1584896759095288e-05, + "loss": 0.8137, + "step": 13330 + }, + { + "epoch": 0.6491369025880749, + "grad_norm": 2.03450870513916, + "learning_rate": 1.15820353025335e-05, + "loss": 0.9503, + "step": 13331 + }, + { + "epoch": 0.6491855963771821, + "grad_norm": 0.09482928365468979, + "learning_rate": 1.1579174055369353e-05, + "loss": 0.5904, + "step": 13332 + }, + { + "epoch": 0.6492342901662893, + "grad_norm": 1.0911287069320679, + "learning_rate": 1.1576313017674019e-05, + "loss": 0.7717, + "step": 13333 + }, + { + "epoch": 0.6492829839553965, + "grad_norm": 1.4624384641647339, + "learning_rate": 1.1573452189518667e-05, + "loss": 0.7382, + "step": 13334 + }, + { + "epoch": 0.6493316777445037, + "grad_norm": 1.7796032428741455, + "learning_rate": 1.1570591570974452e-05, + "loss": 0.8158, + "step": 13335 + }, + { + "epoch": 0.6493803715336108, + "grad_norm": 1.4578949213027954, + "learning_rate": 1.1567731162112556e-05, + "loss": 0.796, + "step": 13336 + }, + { + "epoch": 0.649429065322718, + "grad_norm": 1.954003095626831, + "learning_rate": 1.15648709630041e-05, + "loss": 0.7972, + "step": 13337 + }, + { + "epoch": 0.6494777591118253, + "grad_norm": 1.4152438640594482, + "learning_rate": 1.1562010973720263e-05, + "loss": 0.8368, + "step": 13338 + }, + { + "epoch": 0.6495264529009325, + "grad_norm": 1.5603692531585693, + "learning_rate": 1.1559151194332157e-05, + "loss": 0.755, + "step": 13339 + }, + { + "epoch": 0.6495751466900397, + "grad_norm": 1.578495979309082, + "learning_rate": 1.1556291624910944e-05, + "loss": 0.782, + "step": 13340 + }, + { + "epoch": 0.6496238404791469, + "grad_norm": 1.4706156253814697, + "learning_rate": 1.1553432265527747e-05, + "loss": 0.8195, + "step": 13341 + }, + { + "epoch": 0.6496725342682541, + "grad_norm": 1.230944037437439, + "learning_rate": 1.155057311625369e-05, + "loss": 0.792, + "step": 13342 + }, + { + "epoch": 0.6497212280573613, + "grad_norm": 2.1423964500427246, + "learning_rate": 1.1547714177159898e-05, + "loss": 0.7742, + "step": 13343 + }, + { + "epoch": 0.6497699218464685, + "grad_norm": 1.4385285377502441, + "learning_rate": 1.154485544831748e-05, + "loss": 0.9455, + "step": 13344 + }, + { + "epoch": 0.6498186156355756, + "grad_norm": 0.09250616282224655, + "learning_rate": 1.154199692979757e-05, + "loss": 0.561, + "step": 13345 + }, + { + "epoch": 0.6498673094246828, + "grad_norm": 1.7932261228561401, + "learning_rate": 1.1539138621671238e-05, + "loss": 0.8163, + "step": 13346 + }, + { + "epoch": 0.6499160032137901, + "grad_norm": 1.291102409362793, + "learning_rate": 1.1536280524009613e-05, + "loss": 0.795, + "step": 13347 + }, + { + "epoch": 0.6499646970028973, + "grad_norm": 2.2678208351135254, + "learning_rate": 1.1533422636883784e-05, + "loss": 0.7782, + "step": 13348 + }, + { + "epoch": 0.6500133907920045, + "grad_norm": 1.6354615688323975, + "learning_rate": 1.1530564960364838e-05, + "loss": 0.8165, + "step": 13349 + }, + { + "epoch": 0.6500620845811117, + "grad_norm": 1.4131828546524048, + "learning_rate": 1.1527707494523861e-05, + "loss": 0.84, + "step": 13350 + }, + { + "epoch": 0.6501107783702189, + "grad_norm": 1.6705485582351685, + "learning_rate": 1.1524850239431933e-05, + "loss": 0.7902, + "step": 13351 + }, + { + "epoch": 0.6501594721593261, + "grad_norm": 1.517472267150879, + "learning_rate": 1.1521993195160128e-05, + "loss": 0.8432, + "step": 13352 + }, + { + "epoch": 0.6502081659484332, + "grad_norm": 1.440024971961975, + "learning_rate": 1.1519136361779505e-05, + "loss": 0.8679, + "step": 13353 + }, + { + "epoch": 0.6502568597375404, + "grad_norm": 1.6446434259414673, + "learning_rate": 1.1516279739361149e-05, + "loss": 0.7783, + "step": 13354 + }, + { + "epoch": 0.6503055535266477, + "grad_norm": 1.4918750524520874, + "learning_rate": 1.1513423327976106e-05, + "loss": 0.7115, + "step": 13355 + }, + { + "epoch": 0.6503542473157549, + "grad_norm": 1.493786334991455, + "learning_rate": 1.151056712769543e-05, + "loss": 0.7386, + "step": 13356 + }, + { + "epoch": 0.6504029411048621, + "grad_norm": 1.2899986505508423, + "learning_rate": 1.1507711138590176e-05, + "loss": 0.6888, + "step": 13357 + }, + { + "epoch": 0.6504516348939693, + "grad_norm": 1.4162137508392334, + "learning_rate": 1.1504855360731377e-05, + "loss": 0.7972, + "step": 13358 + }, + { + "epoch": 0.6505003286830765, + "grad_norm": 1.2458412647247314, + "learning_rate": 1.1501999794190077e-05, + "loss": 0.8739, + "step": 13359 + }, + { + "epoch": 0.6505490224721837, + "grad_norm": 1.9998629093170166, + "learning_rate": 1.1499144439037306e-05, + "loss": 0.829, + "step": 13360 + }, + { + "epoch": 0.6505977162612909, + "grad_norm": 0.0930476114153862, + "learning_rate": 1.1496289295344094e-05, + "loss": 0.6038, + "step": 13361 + }, + { + "epoch": 0.650646410050398, + "grad_norm": 1.672989010810852, + "learning_rate": 1.1493434363181453e-05, + "loss": 0.9231, + "step": 13362 + }, + { + "epoch": 0.6506951038395052, + "grad_norm": 1.4254251718521118, + "learning_rate": 1.1490579642620415e-05, + "loss": 0.79, + "step": 13363 + }, + { + "epoch": 0.6507437976286125, + "grad_norm": 1.5803555250167847, + "learning_rate": 1.1487725133731984e-05, + "loss": 0.7939, + "step": 13364 + }, + { + "epoch": 0.6507924914177197, + "grad_norm": 2.3040640354156494, + "learning_rate": 1.1484870836587168e-05, + "loss": 0.7343, + "step": 13365 + }, + { + "epoch": 0.6508411852068269, + "grad_norm": 1.3147566318511963, + "learning_rate": 1.1482016751256965e-05, + "loss": 0.8259, + "step": 13366 + }, + { + "epoch": 0.6508898789959341, + "grad_norm": 1.6362411975860596, + "learning_rate": 1.147916287781237e-05, + "loss": 0.9056, + "step": 13367 + }, + { + "epoch": 0.6509385727850413, + "grad_norm": 2.005566358566284, + "learning_rate": 1.147630921632438e-05, + "loss": 0.8088, + "step": 13368 + }, + { + "epoch": 0.6509872665741485, + "grad_norm": 0.09815707802772522, + "learning_rate": 1.1473455766863972e-05, + "loss": 0.6218, + "step": 13369 + }, + { + "epoch": 0.6510359603632556, + "grad_norm": 1.1736818552017212, + "learning_rate": 1.1470602529502122e-05, + "loss": 0.7636, + "step": 13370 + }, + { + "epoch": 0.6510846541523628, + "grad_norm": 1.4003357887268066, + "learning_rate": 1.1467749504309828e-05, + "loss": 0.8469, + "step": 13371 + }, + { + "epoch": 0.65113334794147, + "grad_norm": 1.5060474872589111, + "learning_rate": 1.1464896691358027e-05, + "loss": 0.9177, + "step": 13372 + }, + { + "epoch": 0.6511820417305773, + "grad_norm": 1.8364074230194092, + "learning_rate": 1.1462044090717708e-05, + "loss": 0.8449, + "step": 13373 + }, + { + "epoch": 0.6512307355196845, + "grad_norm": 1.7388607263565063, + "learning_rate": 1.145919170245982e-05, + "loss": 0.726, + "step": 13374 + }, + { + "epoch": 0.6512794293087917, + "grad_norm": 2.5017435550689697, + "learning_rate": 1.1456339526655316e-05, + "loss": 0.794, + "step": 13375 + }, + { + "epoch": 0.6513281230978989, + "grad_norm": 1.6265190839767456, + "learning_rate": 1.1453487563375145e-05, + "loss": 0.9155, + "step": 13376 + }, + { + "epoch": 0.6513768168870061, + "grad_norm": 5.660342216491699, + "learning_rate": 1.1450635812690245e-05, + "loss": 0.7424, + "step": 13377 + }, + { + "epoch": 0.6514255106761132, + "grad_norm": 1.128314733505249, + "learning_rate": 1.144778427467157e-05, + "loss": 0.7567, + "step": 13378 + }, + { + "epoch": 0.6514742044652204, + "grad_norm": 1.7124919891357422, + "learning_rate": 1.1444932949390028e-05, + "loss": 0.8949, + "step": 13379 + }, + { + "epoch": 0.6515228982543276, + "grad_norm": 1.7283767461776733, + "learning_rate": 1.1442081836916575e-05, + "loss": 0.8324, + "step": 13380 + }, + { + "epoch": 0.6515715920434348, + "grad_norm": 1.3021972179412842, + "learning_rate": 1.1439230937322102e-05, + "loss": 0.7067, + "step": 13381 + }, + { + "epoch": 0.6516202858325421, + "grad_norm": 2.5107359886169434, + "learning_rate": 1.1436380250677544e-05, + "loss": 0.8267, + "step": 13382 + }, + { + "epoch": 0.6516689796216493, + "grad_norm": 1.3095835447311401, + "learning_rate": 1.1433529777053812e-05, + "loss": 0.7406, + "step": 13383 + }, + { + "epoch": 0.6517176734107565, + "grad_norm": 0.09423647820949554, + "learning_rate": 1.1430679516521806e-05, + "loss": 0.535, + "step": 13384 + }, + { + "epoch": 0.6517663671998637, + "grad_norm": 1.9717518091201782, + "learning_rate": 1.1427829469152427e-05, + "loss": 0.7932, + "step": 13385 + }, + { + "epoch": 0.6518150609889709, + "grad_norm": 1.6092451810836792, + "learning_rate": 1.1424979635016564e-05, + "loss": 0.8455, + "step": 13386 + }, + { + "epoch": 0.651863754778078, + "grad_norm": 1.3885321617126465, + "learning_rate": 1.1422130014185133e-05, + "loss": 0.8401, + "step": 13387 + }, + { + "epoch": 0.6519124485671852, + "grad_norm": 1.4008872509002686, + "learning_rate": 1.1419280606728981e-05, + "loss": 0.8427, + "step": 13388 + }, + { + "epoch": 0.6519611423562924, + "grad_norm": 1.3781163692474365, + "learning_rate": 1.1416431412719024e-05, + "loss": 0.8628, + "step": 13389 + }, + { + "epoch": 0.6520098361453996, + "grad_norm": 1.1487940549850464, + "learning_rate": 1.14135824322261e-05, + "loss": 0.8147, + "step": 13390 + }, + { + "epoch": 0.6520585299345069, + "grad_norm": 2.0188024044036865, + "learning_rate": 1.1410733665321108e-05, + "loss": 0.8816, + "step": 13391 + }, + { + "epoch": 0.6521072237236141, + "grad_norm": 1.495643138885498, + "learning_rate": 1.1407885112074897e-05, + "loss": 0.8759, + "step": 13392 + }, + { + "epoch": 0.6521559175127213, + "grad_norm": 1.7306851148605347, + "learning_rate": 1.1405036772558318e-05, + "loss": 0.8222, + "step": 13393 + }, + { + "epoch": 0.6522046113018285, + "grad_norm": 2.113344669342041, + "learning_rate": 1.1402188646842251e-05, + "loss": 0.8938, + "step": 13394 + }, + { + "epoch": 0.6522533050909356, + "grad_norm": 0.09363675862550735, + "learning_rate": 1.1399340734997508e-05, + "loss": 0.5803, + "step": 13395 + }, + { + "epoch": 0.6523019988800428, + "grad_norm": 1.4364426136016846, + "learning_rate": 1.1396493037094965e-05, + "loss": 0.8325, + "step": 13396 + }, + { + "epoch": 0.65235069266915, + "grad_norm": 1.6331143379211426, + "learning_rate": 1.1393645553205426e-05, + "loss": 0.7783, + "step": 13397 + }, + { + "epoch": 0.6523993864582572, + "grad_norm": 1.395543098449707, + "learning_rate": 1.1390798283399747e-05, + "loss": 0.851, + "step": 13398 + }, + { + "epoch": 0.6524480802473644, + "grad_norm": 1.914793610572815, + "learning_rate": 1.1387951227748744e-05, + "loss": 0.8746, + "step": 13399 + }, + { + "epoch": 0.6524967740364717, + "grad_norm": 1.5960140228271484, + "learning_rate": 1.138510438632324e-05, + "loss": 0.8065, + "step": 13400 + }, + { + "epoch": 0.6525454678255789, + "grad_norm": 1.972564935684204, + "learning_rate": 1.138225775919405e-05, + "loss": 0.9008, + "step": 13401 + }, + { + "epoch": 0.6525941616146861, + "grad_norm": 2.1278858184814453, + "learning_rate": 1.1379411346431976e-05, + "loss": 0.7141, + "step": 13402 + }, + { + "epoch": 0.6526428554037933, + "grad_norm": 2.3541250228881836, + "learning_rate": 1.137656514810785e-05, + "loss": 0.8683, + "step": 13403 + }, + { + "epoch": 0.6526915491929004, + "grad_norm": 2.854971170425415, + "learning_rate": 1.1373719164292431e-05, + "loss": 0.8379, + "step": 13404 + }, + { + "epoch": 0.6527402429820076, + "grad_norm": 1.5492371320724487, + "learning_rate": 1.1370873395056545e-05, + "loss": 0.716, + "step": 13405 + }, + { + "epoch": 0.6527889367711148, + "grad_norm": 1.6775552034378052, + "learning_rate": 1.136802784047097e-05, + "loss": 0.842, + "step": 13406 + }, + { + "epoch": 0.652837630560222, + "grad_norm": 1.766632080078125, + "learning_rate": 1.1365182500606489e-05, + "loss": 0.7847, + "step": 13407 + }, + { + "epoch": 0.6528863243493293, + "grad_norm": 1.4841516017913818, + "learning_rate": 1.136233737553388e-05, + "loss": 0.8039, + "step": 13408 + }, + { + "epoch": 0.6529350181384365, + "grad_norm": 1.3722562789916992, + "learning_rate": 1.1359492465323921e-05, + "loss": 0.8203, + "step": 13409 + }, + { + "epoch": 0.6529837119275437, + "grad_norm": 2.5579679012298584, + "learning_rate": 1.1356647770047369e-05, + "loss": 0.8582, + "step": 13410 + }, + { + "epoch": 0.6530324057166509, + "grad_norm": 2.687781810760498, + "learning_rate": 1.1353803289774992e-05, + "loss": 0.7952, + "step": 13411 + }, + { + "epoch": 0.653081099505758, + "grad_norm": 1.594957709312439, + "learning_rate": 1.135095902457755e-05, + "loss": 0.7854, + "step": 13412 + }, + { + "epoch": 0.6531297932948652, + "grad_norm": 1.7990931272506714, + "learning_rate": 1.1348114974525779e-05, + "loss": 0.783, + "step": 13413 + }, + { + "epoch": 0.6531784870839724, + "grad_norm": 1.8362395763397217, + "learning_rate": 1.1345271139690448e-05, + "loss": 0.9157, + "step": 13414 + }, + { + "epoch": 0.6532271808730796, + "grad_norm": 1.913454294204712, + "learning_rate": 1.1342427520142288e-05, + "loss": 0.8093, + "step": 13415 + }, + { + "epoch": 0.6532758746621868, + "grad_norm": 2.7921581268310547, + "learning_rate": 1.1339584115952028e-05, + "loss": 0.8209, + "step": 13416 + }, + { + "epoch": 0.653324568451294, + "grad_norm": 1.6107982397079468, + "learning_rate": 1.1336740927190406e-05, + "loss": 0.7272, + "step": 13417 + }, + { + "epoch": 0.6533732622404013, + "grad_norm": 1.712624430656433, + "learning_rate": 1.1333897953928141e-05, + "loss": 0.8622, + "step": 13418 + }, + { + "epoch": 0.6534219560295085, + "grad_norm": 1.412798285484314, + "learning_rate": 1.1331055196235956e-05, + "loss": 0.878, + "step": 13419 + }, + { + "epoch": 0.6534706498186157, + "grad_norm": 1.4351437091827393, + "learning_rate": 1.1328212654184562e-05, + "loss": 0.7527, + "step": 13420 + }, + { + "epoch": 0.6535193436077228, + "grad_norm": 1.3256425857543945, + "learning_rate": 1.1325370327844664e-05, + "loss": 0.8608, + "step": 13421 + }, + { + "epoch": 0.65356803739683, + "grad_norm": 1.3527419567108154, + "learning_rate": 1.1322528217286974e-05, + "loss": 0.851, + "step": 13422 + }, + { + "epoch": 0.6536167311859372, + "grad_norm": 2.0828187465667725, + "learning_rate": 1.1319686322582189e-05, + "loss": 0.8216, + "step": 13423 + }, + { + "epoch": 0.6536654249750444, + "grad_norm": 2.1764557361602783, + "learning_rate": 1.1316844643800997e-05, + "loss": 0.7762, + "step": 13424 + }, + { + "epoch": 0.6537141187641516, + "grad_norm": 1.2879149913787842, + "learning_rate": 1.1314003181014085e-05, + "loss": 0.8041, + "step": 13425 + }, + { + "epoch": 0.6537628125532589, + "grad_norm": 1.9567973613739014, + "learning_rate": 1.1311161934292138e-05, + "loss": 0.7208, + "step": 13426 + }, + { + "epoch": 0.6538115063423661, + "grad_norm": 1.505393147468567, + "learning_rate": 1.1308320903705829e-05, + "loss": 0.7882, + "step": 13427 + }, + { + "epoch": 0.6538602001314733, + "grad_norm": 1.8842747211456299, + "learning_rate": 1.1305480089325824e-05, + "loss": 0.874, + "step": 13428 + }, + { + "epoch": 0.6539088939205804, + "grad_norm": 1.502941608428955, + "learning_rate": 1.1302639491222806e-05, + "loss": 0.8402, + "step": 13429 + }, + { + "epoch": 0.6539575877096876, + "grad_norm": 1.6511752605438232, + "learning_rate": 1.1299799109467413e-05, + "loss": 0.8803, + "step": 13430 + }, + { + "epoch": 0.6540062814987948, + "grad_norm": 1.6056795120239258, + "learning_rate": 1.1296958944130323e-05, + "loss": 0.8186, + "step": 13431 + }, + { + "epoch": 0.654054975287902, + "grad_norm": 1.66426420211792, + "learning_rate": 1.1294118995282158e-05, + "loss": 0.857, + "step": 13432 + }, + { + "epoch": 0.6541036690770092, + "grad_norm": 7.366072654724121, + "learning_rate": 1.1291279262993582e-05, + "loss": 0.7779, + "step": 13433 + }, + { + "epoch": 0.6541523628661164, + "grad_norm": 1.8484853506088257, + "learning_rate": 1.1288439747335232e-05, + "loss": 0.7625, + "step": 13434 + }, + { + "epoch": 0.6542010566552237, + "grad_norm": 1.3024052381515503, + "learning_rate": 1.128560044837773e-05, + "loss": 0.7119, + "step": 13435 + }, + { + "epoch": 0.6542497504443309, + "grad_norm": 0.09738998115062714, + "learning_rate": 1.1282761366191723e-05, + "loss": 0.6177, + "step": 13436 + }, + { + "epoch": 0.654298444233438, + "grad_norm": 1.7401400804519653, + "learning_rate": 1.1279922500847809e-05, + "loss": 0.7694, + "step": 13437 + }, + { + "epoch": 0.6543471380225452, + "grad_norm": 4.564733028411865, + "learning_rate": 1.1277083852416632e-05, + "loss": 0.7999, + "step": 13438 + }, + { + "epoch": 0.6543958318116524, + "grad_norm": 1.6735730171203613, + "learning_rate": 1.1274245420968777e-05, + "loss": 0.7372, + "step": 13439 + }, + { + "epoch": 0.6544445256007596, + "grad_norm": 1.8758940696716309, + "learning_rate": 1.1271407206574869e-05, + "loss": 0.8662, + "step": 13440 + }, + { + "epoch": 0.6544932193898668, + "grad_norm": 1.681246042251587, + "learning_rate": 1.1268569209305501e-05, + "loss": 0.8396, + "step": 13441 + }, + { + "epoch": 0.654541913178974, + "grad_norm": 1.707251787185669, + "learning_rate": 1.126573142923127e-05, + "loss": 0.7834, + "step": 13442 + }, + { + "epoch": 0.6545906069680812, + "grad_norm": 1.7821645736694336, + "learning_rate": 1.1262893866422767e-05, + "loss": 0.7611, + "step": 13443 + }, + { + "epoch": 0.6546393007571885, + "grad_norm": 1.8193508386611938, + "learning_rate": 1.1260056520950569e-05, + "loss": 0.7758, + "step": 13444 + }, + { + "epoch": 0.6546879945462957, + "grad_norm": 1.5743331909179688, + "learning_rate": 1.1257219392885277e-05, + "loss": 0.7457, + "step": 13445 + }, + { + "epoch": 0.6547366883354028, + "grad_norm": 2.1901333332061768, + "learning_rate": 1.1254382482297434e-05, + "loss": 0.8058, + "step": 13446 + }, + { + "epoch": 0.65478538212451, + "grad_norm": 3.9815151691436768, + "learning_rate": 1.1251545789257638e-05, + "loss": 0.9198, + "step": 13447 + }, + { + "epoch": 0.6548340759136172, + "grad_norm": 1.5174684524536133, + "learning_rate": 1.1248709313836423e-05, + "loss": 0.7832, + "step": 13448 + }, + { + "epoch": 0.6548827697027244, + "grad_norm": 1.4491604566574097, + "learning_rate": 1.124587305610437e-05, + "loss": 0.7383, + "step": 13449 + }, + { + "epoch": 0.6549314634918316, + "grad_norm": 1.5029897689819336, + "learning_rate": 1.1243037016132026e-05, + "loss": 0.7762, + "step": 13450 + }, + { + "epoch": 0.6549801572809388, + "grad_norm": 1.3383703231811523, + "learning_rate": 1.1240201193989932e-05, + "loss": 0.8448, + "step": 13451 + }, + { + "epoch": 0.655028851070046, + "grad_norm": 1.482373595237732, + "learning_rate": 1.123736558974863e-05, + "loss": 0.8608, + "step": 13452 + }, + { + "epoch": 0.6550775448591533, + "grad_norm": 1.4639590978622437, + "learning_rate": 1.1234530203478653e-05, + "loss": 0.8257, + "step": 13453 + }, + { + "epoch": 0.6551262386482604, + "grad_norm": 1.3896732330322266, + "learning_rate": 1.1231695035250551e-05, + "loss": 0.7906, + "step": 13454 + }, + { + "epoch": 0.6551749324373676, + "grad_norm": 1.365552306175232, + "learning_rate": 1.1228860085134818e-05, + "loss": 0.7597, + "step": 13455 + }, + { + "epoch": 0.6552236262264748, + "grad_norm": 1.5071558952331543, + "learning_rate": 1.1226025353202e-05, + "loss": 0.8971, + "step": 13456 + }, + { + "epoch": 0.655272320015582, + "grad_norm": 1.6664669513702393, + "learning_rate": 1.12231908395226e-05, + "loss": 0.7785, + "step": 13457 + }, + { + "epoch": 0.6553210138046892, + "grad_norm": 1.59788179397583, + "learning_rate": 1.1220356544167128e-05, + "loss": 0.8302, + "step": 13458 + }, + { + "epoch": 0.6553697075937964, + "grad_norm": 1.797888994216919, + "learning_rate": 1.1217522467206088e-05, + "loss": 0.7233, + "step": 13459 + }, + { + "epoch": 0.6554184013829036, + "grad_norm": 1.6597943305969238, + "learning_rate": 1.1214688608709979e-05, + "loss": 0.8008, + "step": 13460 + }, + { + "epoch": 0.6554670951720108, + "grad_norm": 1.324705958366394, + "learning_rate": 1.1211854968749289e-05, + "loss": 0.7788, + "step": 13461 + }, + { + "epoch": 0.6555157889611181, + "grad_norm": 3.839571237564087, + "learning_rate": 1.1209021547394501e-05, + "loss": 0.7691, + "step": 13462 + }, + { + "epoch": 0.6555644827502252, + "grad_norm": 1.448570966720581, + "learning_rate": 1.1206188344716111e-05, + "loss": 0.8062, + "step": 13463 + }, + { + "epoch": 0.6556131765393324, + "grad_norm": 1.681318759918213, + "learning_rate": 1.1203355360784589e-05, + "loss": 0.7494, + "step": 13464 + }, + { + "epoch": 0.6556618703284396, + "grad_norm": 3.2474191188812256, + "learning_rate": 1.1200522595670402e-05, + "loss": 0.8347, + "step": 13465 + }, + { + "epoch": 0.6557105641175468, + "grad_norm": 1.35350501537323, + "learning_rate": 1.119769004944402e-05, + "loss": 0.8253, + "step": 13466 + }, + { + "epoch": 0.655759257906654, + "grad_norm": 1.248687982559204, + "learning_rate": 1.11948577221759e-05, + "loss": 0.858, + "step": 13467 + }, + { + "epoch": 0.6558079516957612, + "grad_norm": 1.227103590965271, + "learning_rate": 1.11920256139365e-05, + "loss": 0.8008, + "step": 13468 + }, + { + "epoch": 0.6558566454848684, + "grad_norm": 1.3289997577667236, + "learning_rate": 1.1189193724796261e-05, + "loss": 0.9276, + "step": 13469 + }, + { + "epoch": 0.6559053392739757, + "grad_norm": 1.5958471298217773, + "learning_rate": 1.1186362054825635e-05, + "loss": 0.8781, + "step": 13470 + }, + { + "epoch": 0.6559540330630828, + "grad_norm": 1.3197760581970215, + "learning_rate": 1.1183530604095046e-05, + "loss": 0.8251, + "step": 13471 + }, + { + "epoch": 0.65600272685219, + "grad_norm": 1.36883544921875, + "learning_rate": 1.1180699372674946e-05, + "loss": 0.9217, + "step": 13472 + }, + { + "epoch": 0.6560514206412972, + "grad_norm": 1.4033344984054565, + "learning_rate": 1.117786836063575e-05, + "loss": 0.8343, + "step": 13473 + }, + { + "epoch": 0.6561001144304044, + "grad_norm": 1.5854017734527588, + "learning_rate": 1.117503756804789e-05, + "loss": 0.8214, + "step": 13474 + }, + { + "epoch": 0.6561488082195116, + "grad_norm": 3.13655948638916, + "learning_rate": 1.1172206994981769e-05, + "loss": 0.8199, + "step": 13475 + }, + { + "epoch": 0.6561975020086188, + "grad_norm": 0.0889943540096283, + "learning_rate": 1.1169376641507807e-05, + "loss": 0.5873, + "step": 13476 + }, + { + "epoch": 0.656246195797726, + "grad_norm": 1.7785452604293823, + "learning_rate": 1.1166546507696405e-05, + "loss": 0.8073, + "step": 13477 + }, + { + "epoch": 0.6562948895868332, + "grad_norm": 1.2469254732131958, + "learning_rate": 1.1163716593617964e-05, + "loss": 0.8026, + "step": 13478 + }, + { + "epoch": 0.6563435833759405, + "grad_norm": 1.4006074666976929, + "learning_rate": 1.1160886899342871e-05, + "loss": 0.9156, + "step": 13479 + }, + { + "epoch": 0.6563922771650476, + "grad_norm": 1.7745299339294434, + "learning_rate": 1.1158057424941537e-05, + "loss": 0.8542, + "step": 13480 + }, + { + "epoch": 0.6564409709541548, + "grad_norm": 3.4267446994781494, + "learning_rate": 1.1155228170484316e-05, + "loss": 0.8794, + "step": 13481 + }, + { + "epoch": 0.656489664743262, + "grad_norm": 1.6830977201461792, + "learning_rate": 1.1152399136041607e-05, + "loss": 0.7017, + "step": 13482 + }, + { + "epoch": 0.6565383585323692, + "grad_norm": 1.5683095455169678, + "learning_rate": 1.1149570321683778e-05, + "loss": 0.7264, + "step": 13483 + }, + { + "epoch": 0.6565870523214764, + "grad_norm": 1.4913341999053955, + "learning_rate": 1.1146741727481194e-05, + "loss": 0.7791, + "step": 13484 + }, + { + "epoch": 0.6566357461105836, + "grad_norm": 1.727344036102295, + "learning_rate": 1.1143913353504217e-05, + "loss": 0.8087, + "step": 13485 + }, + { + "epoch": 0.6566844398996908, + "grad_norm": 1.5932518243789673, + "learning_rate": 1.1141085199823193e-05, + "loss": 0.8134, + "step": 13486 + }, + { + "epoch": 0.656733133688798, + "grad_norm": 1.4448610544204712, + "learning_rate": 1.11382572665085e-05, + "loss": 0.8528, + "step": 13487 + }, + { + "epoch": 0.6567818274779051, + "grad_norm": 1.4318394660949707, + "learning_rate": 1.1135429553630451e-05, + "loss": 0.7491, + "step": 13488 + }, + { + "epoch": 0.6568305212670124, + "grad_norm": 1.5379799604415894, + "learning_rate": 1.1132602061259413e-05, + "loss": 0.793, + "step": 13489 + }, + { + "epoch": 0.6568792150561196, + "grad_norm": 1.9374264478683472, + "learning_rate": 1.1129774789465693e-05, + "loss": 0.8092, + "step": 13490 + }, + { + "epoch": 0.6569279088452268, + "grad_norm": 1.7782008647918701, + "learning_rate": 1.1126947738319642e-05, + "loss": 0.8494, + "step": 13491 + }, + { + "epoch": 0.656976602634334, + "grad_norm": 1.4844970703125, + "learning_rate": 1.1124120907891574e-05, + "loss": 0.8928, + "step": 13492 + }, + { + "epoch": 0.6570252964234412, + "grad_norm": 1.664401650428772, + "learning_rate": 1.1121294298251808e-05, + "loss": 0.9245, + "step": 13493 + }, + { + "epoch": 0.6570739902125484, + "grad_norm": 1.8645944595336914, + "learning_rate": 1.1118467909470656e-05, + "loss": 0.8176, + "step": 13494 + }, + { + "epoch": 0.6571226840016556, + "grad_norm": 1.320418119430542, + "learning_rate": 1.1115641741618414e-05, + "loss": 0.8849, + "step": 13495 + }, + { + "epoch": 0.6571713777907627, + "grad_norm": 2.2447597980499268, + "learning_rate": 1.1112815794765415e-05, + "loss": 0.809, + "step": 13496 + }, + { + "epoch": 0.65722007157987, + "grad_norm": 1.3669116497039795, + "learning_rate": 1.1109990068981916e-05, + "loss": 0.7844, + "step": 13497 + }, + { + "epoch": 0.6572687653689772, + "grad_norm": 1.4339969158172607, + "learning_rate": 1.1107164564338238e-05, + "loss": 0.8403, + "step": 13498 + }, + { + "epoch": 0.6573174591580844, + "grad_norm": 1.3528375625610352, + "learning_rate": 1.1104339280904636e-05, + "loss": 0.9085, + "step": 13499 + }, + { + "epoch": 0.6573661529471916, + "grad_norm": 0.09000229835510254, + "learning_rate": 1.1101514218751417e-05, + "loss": 0.583, + "step": 13500 + }, + { + "epoch": 0.6574148467362988, + "grad_norm": 1.3199363946914673, + "learning_rate": 1.109868937794884e-05, + "loss": 0.8277, + "step": 13501 + }, + { + "epoch": 0.657463540525406, + "grad_norm": 1.538108229637146, + "learning_rate": 1.109586475856717e-05, + "loss": 0.7566, + "step": 13502 + }, + { + "epoch": 0.6575122343145132, + "grad_norm": 1.5141245126724243, + "learning_rate": 1.109304036067669e-05, + "loss": 0.7463, + "step": 13503 + }, + { + "epoch": 0.6575609281036204, + "grad_norm": 1.4753012657165527, + "learning_rate": 1.1090216184347629e-05, + "loss": 0.8122, + "step": 13504 + }, + { + "epoch": 0.6576096218927275, + "grad_norm": 1.311491847038269, + "learning_rate": 1.108739222965027e-05, + "loss": 0.8226, + "step": 13505 + }, + { + "epoch": 0.6576583156818347, + "grad_norm": 1.3667693138122559, + "learning_rate": 1.1084568496654822e-05, + "loss": 0.8302, + "step": 13506 + }, + { + "epoch": 0.657707009470942, + "grad_norm": 1.5069642066955566, + "learning_rate": 1.1081744985431556e-05, + "loss": 0.7413, + "step": 13507 + }, + { + "epoch": 0.6577557032600492, + "grad_norm": 1.8285346031188965, + "learning_rate": 1.1078921696050697e-05, + "loss": 0.7994, + "step": 13508 + }, + { + "epoch": 0.6578043970491564, + "grad_norm": 1.614973783493042, + "learning_rate": 1.1076098628582473e-05, + "loss": 0.7987, + "step": 13509 + }, + { + "epoch": 0.6578530908382636, + "grad_norm": 0.09515809267759323, + "learning_rate": 1.1073275783097111e-05, + "loss": 0.5825, + "step": 13510 + }, + { + "epoch": 0.6579017846273708, + "grad_norm": 1.5204999446868896, + "learning_rate": 1.1070453159664818e-05, + "loss": 0.8658, + "step": 13511 + }, + { + "epoch": 0.657950478416478, + "grad_norm": 1.2997881174087524, + "learning_rate": 1.1067630758355832e-05, + "loss": 0.8159, + "step": 13512 + }, + { + "epoch": 0.6579991722055851, + "grad_norm": 1.7072805166244507, + "learning_rate": 1.1064808579240331e-05, + "loss": 0.79, + "step": 13513 + }, + { + "epoch": 0.6580478659946923, + "grad_norm": 1.428363561630249, + "learning_rate": 1.106198662238854e-05, + "loss": 0.8837, + "step": 13514 + }, + { + "epoch": 0.6580965597837996, + "grad_norm": 1.5160547494888306, + "learning_rate": 1.1059164887870644e-05, + "loss": 0.8114, + "step": 13515 + }, + { + "epoch": 0.6581452535729068, + "grad_norm": 1.7567861080169678, + "learning_rate": 1.1056343375756836e-05, + "loss": 0.7844, + "step": 13516 + }, + { + "epoch": 0.658193947362014, + "grad_norm": 1.7939023971557617, + "learning_rate": 1.1053522086117305e-05, + "loss": 0.8071, + "step": 13517 + }, + { + "epoch": 0.6582426411511212, + "grad_norm": 1.2291984558105469, + "learning_rate": 1.1050701019022227e-05, + "loss": 0.822, + "step": 13518 + }, + { + "epoch": 0.6582913349402284, + "grad_norm": 0.10009729117155075, + "learning_rate": 1.104788017454178e-05, + "loss": 0.7153, + "step": 13519 + }, + { + "epoch": 0.6583400287293356, + "grad_norm": 2.1196272373199463, + "learning_rate": 1.1045059552746125e-05, + "loss": 0.7945, + "step": 13520 + }, + { + "epoch": 0.6583887225184428, + "grad_norm": 1.6702094078063965, + "learning_rate": 1.1042239153705432e-05, + "loss": 0.8654, + "step": 13521 + }, + { + "epoch": 0.6584374163075499, + "grad_norm": 3.070143461227417, + "learning_rate": 1.1039418977489849e-05, + "loss": 0.7943, + "step": 13522 + }, + { + "epoch": 0.6584861100966571, + "grad_norm": 1.9353657960891724, + "learning_rate": 1.1036599024169546e-05, + "loss": 0.7862, + "step": 13523 + }, + { + "epoch": 0.6585348038857644, + "grad_norm": 1.7647138833999634, + "learning_rate": 1.1033779293814657e-05, + "loss": 0.8563, + "step": 13524 + }, + { + "epoch": 0.6585834976748716, + "grad_norm": 0.11230297386646271, + "learning_rate": 1.1030959786495326e-05, + "loss": 0.6267, + "step": 13525 + }, + { + "epoch": 0.6586321914639788, + "grad_norm": 1.3981941938400269, + "learning_rate": 1.1028140502281688e-05, + "loss": 0.7447, + "step": 13526 + }, + { + "epoch": 0.658680885253086, + "grad_norm": 1.0682274103164673, + "learning_rate": 1.1025321441243872e-05, + "loss": 0.8006, + "step": 13527 + }, + { + "epoch": 0.6587295790421932, + "grad_norm": 2.4612269401550293, + "learning_rate": 1.1022502603452005e-05, + "loss": 0.8808, + "step": 13528 + }, + { + "epoch": 0.6587782728313004, + "grad_norm": 1.4117918014526367, + "learning_rate": 1.1019683988976203e-05, + "loss": 0.7348, + "step": 13529 + }, + { + "epoch": 0.6588269666204075, + "grad_norm": 2.018052339553833, + "learning_rate": 1.1016865597886575e-05, + "loss": 0.8784, + "step": 13530 + }, + { + "epoch": 0.6588756604095147, + "grad_norm": 1.7064684629440308, + "learning_rate": 1.101404743025324e-05, + "loss": 0.8218, + "step": 13531 + }, + { + "epoch": 0.6589243541986219, + "grad_norm": 1.4809099435806274, + "learning_rate": 1.1011229486146295e-05, + "loss": 0.813, + "step": 13532 + }, + { + "epoch": 0.6589730479877292, + "grad_norm": 1.7939963340759277, + "learning_rate": 1.1008411765635835e-05, + "loss": 0.726, + "step": 13533 + }, + { + "epoch": 0.6590217417768364, + "grad_norm": 1.2800486087799072, + "learning_rate": 1.1005594268791956e-05, + "loss": 0.7349, + "step": 13534 + }, + { + "epoch": 0.6590704355659436, + "grad_norm": 1.478934407234192, + "learning_rate": 1.1002776995684736e-05, + "loss": 0.8217, + "step": 13535 + }, + { + "epoch": 0.6591191293550508, + "grad_norm": 1.6272870302200317, + "learning_rate": 1.0999959946384258e-05, + "loss": 0.8384, + "step": 13536 + }, + { + "epoch": 0.659167823144158, + "grad_norm": 1.5419923067092896, + "learning_rate": 1.0997143120960592e-05, + "loss": 0.8039, + "step": 13537 + }, + { + "epoch": 0.6592165169332652, + "grad_norm": 1.8534657955169678, + "learning_rate": 1.0994326519483826e-05, + "loss": 0.8028, + "step": 13538 + }, + { + "epoch": 0.6592652107223723, + "grad_norm": 1.3909779787063599, + "learning_rate": 1.0991510142023993e-05, + "loss": 0.8153, + "step": 13539 + }, + { + "epoch": 0.6593139045114795, + "grad_norm": 1.739155650138855, + "learning_rate": 1.0988693988651182e-05, + "loss": 0.8308, + "step": 13540 + }, + { + "epoch": 0.6593625983005867, + "grad_norm": 1.7290271520614624, + "learning_rate": 1.0985878059435415e-05, + "loss": 0.8253, + "step": 13541 + }, + { + "epoch": 0.659411292089694, + "grad_norm": 1.532160758972168, + "learning_rate": 1.098306235444676e-05, + "loss": 0.8416, + "step": 13542 + }, + { + "epoch": 0.6594599858788012, + "grad_norm": 1.277081847190857, + "learning_rate": 1.0980246873755252e-05, + "loss": 0.88, + "step": 13543 + }, + { + "epoch": 0.6595086796679084, + "grad_norm": 1.5259352922439575, + "learning_rate": 1.0977431617430919e-05, + "loss": 0.8183, + "step": 13544 + }, + { + "epoch": 0.6595573734570156, + "grad_norm": 1.49606454372406, + "learning_rate": 1.0974616585543809e-05, + "loss": 0.8311, + "step": 13545 + }, + { + "epoch": 0.6596060672461228, + "grad_norm": 0.09389588981866837, + "learning_rate": 1.0971801778163922e-05, + "loss": 0.5951, + "step": 13546 + }, + { + "epoch": 0.6596547610352299, + "grad_norm": 1.585873007774353, + "learning_rate": 1.0968987195361302e-05, + "loss": 0.8524, + "step": 13547 + }, + { + "epoch": 0.6597034548243371, + "grad_norm": 2.1094181537628174, + "learning_rate": 1.0966172837205935e-05, + "loss": 0.7611, + "step": 13548 + }, + { + "epoch": 0.6597521486134443, + "grad_norm": 1.6315220594406128, + "learning_rate": 1.0963358703767849e-05, + "loss": 0.7956, + "step": 13549 + }, + { + "epoch": 0.6598008424025515, + "grad_norm": 1.7743841409683228, + "learning_rate": 1.096054479511704e-05, + "loss": 0.8544, + "step": 13550 + }, + { + "epoch": 0.6598495361916588, + "grad_norm": 1.60171639919281, + "learning_rate": 1.0957731111323504e-05, + "loss": 0.7669, + "step": 13551 + }, + { + "epoch": 0.659898229980766, + "grad_norm": 1.3736456632614136, + "learning_rate": 1.0954917652457228e-05, + "loss": 0.8307, + "step": 13552 + }, + { + "epoch": 0.6599469237698732, + "grad_norm": 1.2359169721603394, + "learning_rate": 1.0952104418588196e-05, + "loss": 0.8661, + "step": 13553 + }, + { + "epoch": 0.6599956175589804, + "grad_norm": 1.2277549505233765, + "learning_rate": 1.0949291409786407e-05, + "loss": 0.8162, + "step": 13554 + }, + { + "epoch": 0.6600443113480875, + "grad_norm": 1.8350012302398682, + "learning_rate": 1.0946478626121802e-05, + "loss": 0.793, + "step": 13555 + }, + { + "epoch": 0.6600930051371947, + "grad_norm": 1.5544599294662476, + "learning_rate": 1.0943666067664382e-05, + "loss": 0.7316, + "step": 13556 + }, + { + "epoch": 0.6601416989263019, + "grad_norm": 1.4214072227478027, + "learning_rate": 1.094085373448408e-05, + "loss": 0.8702, + "step": 13557 + }, + { + "epoch": 0.6601903927154091, + "grad_norm": 1.9329367876052856, + "learning_rate": 1.0938041626650875e-05, + "loss": 0.8446, + "step": 13558 + }, + { + "epoch": 0.6602390865045163, + "grad_norm": 1.424465537071228, + "learning_rate": 1.093522974423471e-05, + "loss": 0.7611, + "step": 13559 + }, + { + "epoch": 0.6602877802936236, + "grad_norm": 1.6151372194290161, + "learning_rate": 1.0932418087305524e-05, + "loss": 0.7999, + "step": 13560 + }, + { + "epoch": 0.6603364740827308, + "grad_norm": 2.1847169399261475, + "learning_rate": 1.0929606655933283e-05, + "loss": 0.8087, + "step": 13561 + }, + { + "epoch": 0.660385167871838, + "grad_norm": 2.4789042472839355, + "learning_rate": 1.0926795450187884e-05, + "loss": 0.8433, + "step": 13562 + }, + { + "epoch": 0.6604338616609452, + "grad_norm": 1.8047701120376587, + "learning_rate": 1.0923984470139293e-05, + "loss": 0.7341, + "step": 13563 + }, + { + "epoch": 0.6604825554500523, + "grad_norm": 1.398098111152649, + "learning_rate": 1.0921173715857397e-05, + "loss": 0.7241, + "step": 13564 + }, + { + "epoch": 0.6605312492391595, + "grad_norm": 1.3569716215133667, + "learning_rate": 1.0918363187412144e-05, + "loss": 0.7579, + "step": 13565 + }, + { + "epoch": 0.6605799430282667, + "grad_norm": 1.160143494606018, + "learning_rate": 1.0915552884873436e-05, + "loss": 0.7813, + "step": 13566 + }, + { + "epoch": 0.6606286368173739, + "grad_norm": 1.2174955606460571, + "learning_rate": 1.0912742808311174e-05, + "loss": 0.8314, + "step": 13567 + }, + { + "epoch": 0.6606773306064812, + "grad_norm": 1.4007434844970703, + "learning_rate": 1.0909932957795266e-05, + "loss": 0.7441, + "step": 13568 + }, + { + "epoch": 0.6607260243955884, + "grad_norm": 1.4027272462844849, + "learning_rate": 1.0907123333395604e-05, + "loss": 0.7213, + "step": 13569 + }, + { + "epoch": 0.6607747181846956, + "grad_norm": 1.152773380279541, + "learning_rate": 1.0904313935182078e-05, + "loss": 0.7616, + "step": 13570 + }, + { + "epoch": 0.6608234119738028, + "grad_norm": 1.3495006561279297, + "learning_rate": 1.0901504763224565e-05, + "loss": 0.8473, + "step": 13571 + }, + { + "epoch": 0.6608721057629099, + "grad_norm": 2.179903745651245, + "learning_rate": 1.0898695817592958e-05, + "loss": 0.805, + "step": 13572 + }, + { + "epoch": 0.6609207995520171, + "grad_norm": 1.3706246614456177, + "learning_rate": 1.0895887098357122e-05, + "loss": 0.8441, + "step": 13573 + }, + { + "epoch": 0.6609694933411243, + "grad_norm": 2.1153793334960938, + "learning_rate": 1.0893078605586921e-05, + "loss": 0.8156, + "step": 13574 + }, + { + "epoch": 0.6610181871302315, + "grad_norm": 1.528649091720581, + "learning_rate": 1.0890270339352225e-05, + "loss": 0.7409, + "step": 13575 + }, + { + "epoch": 0.6610668809193387, + "grad_norm": 1.9468629360198975, + "learning_rate": 1.0887462299722884e-05, + "loss": 0.7693, + "step": 13576 + }, + { + "epoch": 0.661115574708446, + "grad_norm": 1.1336451768875122, + "learning_rate": 1.0884654486768749e-05, + "loss": 0.8604, + "step": 13577 + }, + { + "epoch": 0.6611642684975532, + "grad_norm": 1.2950583696365356, + "learning_rate": 1.0881846900559665e-05, + "loss": 0.9003, + "step": 13578 + }, + { + "epoch": 0.6612129622866604, + "grad_norm": 2.6827168464660645, + "learning_rate": 1.0879039541165471e-05, + "loss": 0.8463, + "step": 13579 + }, + { + "epoch": 0.6612616560757676, + "grad_norm": 1.663776159286499, + "learning_rate": 1.0876232408655989e-05, + "loss": 0.7821, + "step": 13580 + }, + { + "epoch": 0.6613103498648747, + "grad_norm": 2.1741433143615723, + "learning_rate": 1.0873425503101069e-05, + "loss": 0.8019, + "step": 13581 + }, + { + "epoch": 0.6613590436539819, + "grad_norm": 1.3765273094177246, + "learning_rate": 1.0870618824570521e-05, + "loss": 0.7886, + "step": 13582 + }, + { + "epoch": 0.6614077374430891, + "grad_norm": 1.436867594718933, + "learning_rate": 1.0867812373134162e-05, + "loss": 0.7408, + "step": 13583 + }, + { + "epoch": 0.6614564312321963, + "grad_norm": 1.5040334463119507, + "learning_rate": 1.08650061488618e-05, + "loss": 0.7829, + "step": 13584 + }, + { + "epoch": 0.6615051250213035, + "grad_norm": 1.596977949142456, + "learning_rate": 1.0862200151823248e-05, + "loss": 0.8082, + "step": 13585 + }, + { + "epoch": 0.6615538188104108, + "grad_norm": 1.6716151237487793, + "learning_rate": 1.0859394382088295e-05, + "loss": 0.8962, + "step": 13586 + }, + { + "epoch": 0.661602512599518, + "grad_norm": 1.6929885149002075, + "learning_rate": 1.0856588839726745e-05, + "loss": 0.9052, + "step": 13587 + }, + { + "epoch": 0.6616512063886252, + "grad_norm": 1.7300125360488892, + "learning_rate": 1.085378352480837e-05, + "loss": 0.8158, + "step": 13588 + }, + { + "epoch": 0.6616999001777323, + "grad_norm": 1.561413288116455, + "learning_rate": 1.0850978437402984e-05, + "loss": 0.8598, + "step": 13589 + }, + { + "epoch": 0.6617485939668395, + "grad_norm": 1.4459784030914307, + "learning_rate": 1.0848173577580323e-05, + "loss": 0.8317, + "step": 13590 + }, + { + "epoch": 0.6617972877559467, + "grad_norm": 1.5332520008087158, + "learning_rate": 1.0845368945410188e-05, + "loss": 0.926, + "step": 13591 + }, + { + "epoch": 0.6618459815450539, + "grad_norm": 1.5341728925704956, + "learning_rate": 1.0842564540962335e-05, + "loss": 0.8321, + "step": 13592 + }, + { + "epoch": 0.6618946753341611, + "grad_norm": 2.054197311401367, + "learning_rate": 1.0839760364306526e-05, + "loss": 0.7969, + "step": 13593 + }, + { + "epoch": 0.6619433691232683, + "grad_norm": 1.5163425207138062, + "learning_rate": 1.083695641551251e-05, + "loss": 0.8131, + "step": 13594 + }, + { + "epoch": 0.6619920629123756, + "grad_norm": 1.3658310174942017, + "learning_rate": 1.0834152694650034e-05, + "loss": 0.8126, + "step": 13595 + }, + { + "epoch": 0.6620407567014828, + "grad_norm": 2.6258583068847656, + "learning_rate": 1.0831349201788859e-05, + "loss": 0.8705, + "step": 13596 + }, + { + "epoch": 0.6620894504905899, + "grad_norm": 2.3987748622894287, + "learning_rate": 1.0828545936998691e-05, + "loss": 0.7436, + "step": 13597 + }, + { + "epoch": 0.6621381442796971, + "grad_norm": 2.458054780960083, + "learning_rate": 1.08257429003493e-05, + "loss": 0.8523, + "step": 13598 + }, + { + "epoch": 0.6621868380688043, + "grad_norm": 1.408033013343811, + "learning_rate": 1.082294009191037e-05, + "loss": 0.8091, + "step": 13599 + }, + { + "epoch": 0.6622355318579115, + "grad_norm": 20.312578201293945, + "learning_rate": 1.0820137511751651e-05, + "loss": 0.7985, + "step": 13600 + }, + { + "epoch": 0.6622842256470187, + "grad_norm": 1.3151205778121948, + "learning_rate": 1.081733515994285e-05, + "loss": 0.7911, + "step": 13601 + }, + { + "epoch": 0.6623329194361259, + "grad_norm": 1.9369187355041504, + "learning_rate": 1.0814533036553674e-05, + "loss": 0.8113, + "step": 13602 + }, + { + "epoch": 0.6623816132252331, + "grad_norm": 1.5882296562194824, + "learning_rate": 1.0811731141653828e-05, + "loss": 0.823, + "step": 13603 + }, + { + "epoch": 0.6624303070143404, + "grad_norm": 2.3936522006988525, + "learning_rate": 1.0808929475312998e-05, + "loss": 0.8685, + "step": 13604 + }, + { + "epoch": 0.6624790008034476, + "grad_norm": 2.310962677001953, + "learning_rate": 1.0806128037600898e-05, + "loss": 0.7219, + "step": 13605 + }, + { + "epoch": 0.6625276945925547, + "grad_norm": 1.783806562423706, + "learning_rate": 1.0803326828587189e-05, + "loss": 0.7947, + "step": 13606 + }, + { + "epoch": 0.6625763883816619, + "grad_norm": 1.9300888776779175, + "learning_rate": 1.0800525848341579e-05, + "loss": 0.7546, + "step": 13607 + }, + { + "epoch": 0.6626250821707691, + "grad_norm": 1.4434181451797485, + "learning_rate": 1.0797725096933709e-05, + "loss": 0.8306, + "step": 13608 + }, + { + "epoch": 0.6626737759598763, + "grad_norm": 1.44609534740448, + "learning_rate": 1.0794924574433279e-05, + "loss": 0.8632, + "step": 13609 + }, + { + "epoch": 0.6627224697489835, + "grad_norm": 1.775308609008789, + "learning_rate": 1.0792124280909934e-05, + "loss": 0.8417, + "step": 13610 + }, + { + "epoch": 0.6627711635380907, + "grad_norm": 2.7991604804992676, + "learning_rate": 1.0789324216433333e-05, + "loss": 0.8561, + "step": 13611 + }, + { + "epoch": 0.662819857327198, + "grad_norm": 1.6668827533721924, + "learning_rate": 1.0786524381073145e-05, + "loss": 0.8043, + "step": 13612 + }, + { + "epoch": 0.6628685511163052, + "grad_norm": 1.887999176979065, + "learning_rate": 1.0783724774898992e-05, + "loss": 0.7643, + "step": 13613 + }, + { + "epoch": 0.6629172449054123, + "grad_norm": 1.6417487859725952, + "learning_rate": 1.0780925397980536e-05, + "loss": 0.7536, + "step": 13614 + }, + { + "epoch": 0.6629659386945195, + "grad_norm": 1.9357198476791382, + "learning_rate": 1.0778126250387388e-05, + "loss": 0.8863, + "step": 13615 + }, + { + "epoch": 0.6630146324836267, + "grad_norm": 1.264792799949646, + "learning_rate": 1.0775327332189195e-05, + "loss": 0.8706, + "step": 13616 + }, + { + "epoch": 0.6630633262727339, + "grad_norm": 1.9747389554977417, + "learning_rate": 1.077252864345558e-05, + "loss": 0.763, + "step": 13617 + }, + { + "epoch": 0.6631120200618411, + "grad_norm": 1.7396855354309082, + "learning_rate": 1.0769730184256154e-05, + "loss": 0.7974, + "step": 13618 + }, + { + "epoch": 0.6631607138509483, + "grad_norm": 1.5815109014511108, + "learning_rate": 1.0766931954660533e-05, + "loss": 0.8523, + "step": 13619 + }, + { + "epoch": 0.6632094076400555, + "grad_norm": 1.2051178216934204, + "learning_rate": 1.0764133954738314e-05, + "loss": 0.8235, + "step": 13620 + }, + { + "epoch": 0.6632581014291627, + "grad_norm": 1.452298641204834, + "learning_rate": 1.076133618455912e-05, + "loss": 0.8137, + "step": 13621 + }, + { + "epoch": 0.66330679521827, + "grad_norm": 1.415320634841919, + "learning_rate": 1.0758538644192513e-05, + "loss": 0.912, + "step": 13622 + }, + { + "epoch": 0.6633554890073771, + "grad_norm": 1.4670056104660034, + "learning_rate": 1.075574133370811e-05, + "loss": 0.8827, + "step": 13623 + }, + { + "epoch": 0.6634041827964843, + "grad_norm": 1.453357458114624, + "learning_rate": 1.0752944253175484e-05, + "loss": 0.8202, + "step": 13624 + }, + { + "epoch": 0.6634528765855915, + "grad_norm": 1.8702867031097412, + "learning_rate": 1.075014740266421e-05, + "loss": 0.9047, + "step": 13625 + }, + { + "epoch": 0.6635015703746987, + "grad_norm": 1.2558801174163818, + "learning_rate": 1.0747350782243866e-05, + "loss": 0.8018, + "step": 13626 + }, + { + "epoch": 0.6635502641638059, + "grad_norm": 9.428528785705566, + "learning_rate": 1.0744554391984015e-05, + "loss": 0.8088, + "step": 13627 + }, + { + "epoch": 0.6635989579529131, + "grad_norm": 1.512337565422058, + "learning_rate": 1.0741758231954216e-05, + "loss": 0.8346, + "step": 13628 + }, + { + "epoch": 0.6636476517420203, + "grad_norm": 1.4073702096939087, + "learning_rate": 1.0738962302224025e-05, + "loss": 0.8554, + "step": 13629 + }, + { + "epoch": 0.6636963455311276, + "grad_norm": 1.9125163555145264, + "learning_rate": 1.0736166602862982e-05, + "loss": 0.7361, + "step": 13630 + }, + { + "epoch": 0.6637450393202347, + "grad_norm": 1.3546068668365479, + "learning_rate": 1.073337113394065e-05, + "loss": 0.8434, + "step": 13631 + }, + { + "epoch": 0.6637937331093419, + "grad_norm": 1.626198172569275, + "learning_rate": 1.0730575895526556e-05, + "loss": 0.871, + "step": 13632 + }, + { + "epoch": 0.6638424268984491, + "grad_norm": 1.514470100402832, + "learning_rate": 1.0727780887690228e-05, + "loss": 0.7251, + "step": 13633 + }, + { + "epoch": 0.6638911206875563, + "grad_norm": 1.9435361623764038, + "learning_rate": 1.07249861105012e-05, + "loss": 0.8184, + "step": 13634 + }, + { + "epoch": 0.6639398144766635, + "grad_norm": 2.5709481239318848, + "learning_rate": 1.0722191564028985e-05, + "loss": 0.8323, + "step": 13635 + }, + { + "epoch": 0.6639885082657707, + "grad_norm": 1.5034129619598389, + "learning_rate": 1.0719397248343104e-05, + "loss": 0.7212, + "step": 13636 + }, + { + "epoch": 0.6640372020548779, + "grad_norm": 2.2121191024780273, + "learning_rate": 1.071660316351306e-05, + "loss": 0.7879, + "step": 13637 + }, + { + "epoch": 0.6640858958439851, + "grad_norm": 1.909746766090393, + "learning_rate": 1.071380930960836e-05, + "loss": 0.7937, + "step": 13638 + }, + { + "epoch": 0.6641345896330924, + "grad_norm": 1.679320216178894, + "learning_rate": 1.0711015686698491e-05, + "loss": 0.8225, + "step": 13639 + }, + { + "epoch": 0.6641832834221995, + "grad_norm": 1.2570265531539917, + "learning_rate": 1.0708222294852964e-05, + "loss": 0.7602, + "step": 13640 + }, + { + "epoch": 0.6642319772113067, + "grad_norm": 2.606630563735962, + "learning_rate": 1.0705429134141252e-05, + "loss": 0.8415, + "step": 13641 + }, + { + "epoch": 0.6642806710004139, + "grad_norm": 1.2046242952346802, + "learning_rate": 1.0702636204632836e-05, + "loss": 0.8458, + "step": 13642 + }, + { + "epoch": 0.6643293647895211, + "grad_norm": 1.7839242219924927, + "learning_rate": 1.0699843506397199e-05, + "loss": 0.8506, + "step": 13643 + }, + { + "epoch": 0.6643780585786283, + "grad_norm": 1.8592331409454346, + "learning_rate": 1.0697051039503798e-05, + "loss": 0.8825, + "step": 13644 + }, + { + "epoch": 0.6644267523677355, + "grad_norm": 1.560478687286377, + "learning_rate": 1.06942588040221e-05, + "loss": 0.8568, + "step": 13645 + }, + { + "epoch": 0.6644754461568427, + "grad_norm": 1.217721939086914, + "learning_rate": 1.0691466800021554e-05, + "loss": 0.7303, + "step": 13646 + }, + { + "epoch": 0.6645241399459499, + "grad_norm": 1.6236494779586792, + "learning_rate": 1.068867502757164e-05, + "loss": 0.8256, + "step": 13647 + }, + { + "epoch": 0.664572833735057, + "grad_norm": 1.6640952825546265, + "learning_rate": 1.0685883486741766e-05, + "loss": 0.8054, + "step": 13648 + }, + { + "epoch": 0.6646215275241643, + "grad_norm": 2.3028571605682373, + "learning_rate": 1.0683092177601397e-05, + "loss": 0.8929, + "step": 13649 + }, + { + "epoch": 0.6646702213132715, + "grad_norm": 1.9800646305084229, + "learning_rate": 1.0680301100219959e-05, + "loss": 0.7579, + "step": 13650 + }, + { + "epoch": 0.6647189151023787, + "grad_norm": 1.6612060070037842, + "learning_rate": 1.0677510254666883e-05, + "loss": 0.8539, + "step": 13651 + }, + { + "epoch": 0.6647676088914859, + "grad_norm": 1.4912508726119995, + "learning_rate": 1.0674719641011588e-05, + "loss": 0.7807, + "step": 13652 + }, + { + "epoch": 0.6648163026805931, + "grad_norm": 1.966243028640747, + "learning_rate": 1.0671929259323483e-05, + "loss": 0.8064, + "step": 13653 + }, + { + "epoch": 0.6648649964697003, + "grad_norm": 2.092095136642456, + "learning_rate": 1.0669139109672004e-05, + "loss": 0.8536, + "step": 13654 + }, + { + "epoch": 0.6649136902588075, + "grad_norm": 1.4482498168945312, + "learning_rate": 1.0666349192126528e-05, + "loss": 0.8282, + "step": 13655 + }, + { + "epoch": 0.6649623840479146, + "grad_norm": 1.5407748222351074, + "learning_rate": 1.066355950675648e-05, + "loss": 0.7677, + "step": 13656 + }, + { + "epoch": 0.6650110778370218, + "grad_norm": 1.9851553440093994, + "learning_rate": 1.0660770053631224e-05, + "loss": 0.6759, + "step": 13657 + }, + { + "epoch": 0.6650597716261291, + "grad_norm": 2.329209804534912, + "learning_rate": 1.0657980832820174e-05, + "loss": 0.799, + "step": 13658 + }, + { + "epoch": 0.6651084654152363, + "grad_norm": 1.8312040567398071, + "learning_rate": 1.0655191844392703e-05, + "loss": 0.8235, + "step": 13659 + }, + { + "epoch": 0.6651571592043435, + "grad_norm": 1.4148977994918823, + "learning_rate": 1.0652403088418186e-05, + "loss": 0.7619, + "step": 13660 + }, + { + "epoch": 0.6652058529934507, + "grad_norm": 2.0522303581237793, + "learning_rate": 1.0649614564965996e-05, + "loss": 0.7658, + "step": 13661 + }, + { + "epoch": 0.6652545467825579, + "grad_norm": 1.8026701211929321, + "learning_rate": 1.0646826274105483e-05, + "loss": 0.8961, + "step": 13662 + }, + { + "epoch": 0.6653032405716651, + "grad_norm": 1.5652213096618652, + "learning_rate": 1.0644038215906041e-05, + "loss": 0.811, + "step": 13663 + }, + { + "epoch": 0.6653519343607723, + "grad_norm": 1.2145122289657593, + "learning_rate": 1.0641250390436983e-05, + "loss": 0.7676, + "step": 13664 + }, + { + "epoch": 0.6654006281498794, + "grad_norm": 1.6860729455947876, + "learning_rate": 1.0638462797767693e-05, + "loss": 0.8522, + "step": 13665 + }, + { + "epoch": 0.6654493219389866, + "grad_norm": 1.6683505773544312, + "learning_rate": 1.0635675437967476e-05, + "loss": 0.7675, + "step": 13666 + }, + { + "epoch": 0.6654980157280939, + "grad_norm": 0.096294105052948, + "learning_rate": 1.0632888311105693e-05, + "loss": 0.5959, + "step": 13667 + }, + { + "epoch": 0.6655467095172011, + "grad_norm": 1.4469789266586304, + "learning_rate": 1.063010141725167e-05, + "loss": 0.7962, + "step": 13668 + }, + { + "epoch": 0.6655954033063083, + "grad_norm": 1.7110487222671509, + "learning_rate": 1.0627314756474721e-05, + "loss": 0.9362, + "step": 13669 + }, + { + "epoch": 0.6656440970954155, + "grad_norm": 1.5994925498962402, + "learning_rate": 1.0624528328844185e-05, + "loss": 0.8944, + "step": 13670 + }, + { + "epoch": 0.6656927908845227, + "grad_norm": 1.4150890111923218, + "learning_rate": 1.062174213442935e-05, + "loss": 0.8422, + "step": 13671 + }, + { + "epoch": 0.6657414846736299, + "grad_norm": 1.7223947048187256, + "learning_rate": 1.0618956173299547e-05, + "loss": 0.7907, + "step": 13672 + }, + { + "epoch": 0.665790178462737, + "grad_norm": 1.900475263595581, + "learning_rate": 1.061617044552405e-05, + "loss": 0.7505, + "step": 13673 + }, + { + "epoch": 0.6658388722518442, + "grad_norm": 0.09204129129648209, + "learning_rate": 1.0613384951172177e-05, + "loss": 0.5999, + "step": 13674 + }, + { + "epoch": 0.6658875660409515, + "grad_norm": 1.8962340354919434, + "learning_rate": 1.0610599690313207e-05, + "loss": 0.7613, + "step": 13675 + }, + { + "epoch": 0.6659362598300587, + "grad_norm": 2.294398069381714, + "learning_rate": 1.0607814663016427e-05, + "loss": 0.7471, + "step": 13676 + }, + { + "epoch": 0.6659849536191659, + "grad_norm": 2.2269413471221924, + "learning_rate": 1.0605029869351114e-05, + "loss": 0.8072, + "step": 13677 + }, + { + "epoch": 0.6660336474082731, + "grad_norm": 1.6680419445037842, + "learning_rate": 1.0602245309386539e-05, + "loss": 0.8143, + "step": 13678 + }, + { + "epoch": 0.6660823411973803, + "grad_norm": 1.64181649684906, + "learning_rate": 1.0599460983191969e-05, + "loss": 0.8624, + "step": 13679 + }, + { + "epoch": 0.6661310349864875, + "grad_norm": 1.427620768547058, + "learning_rate": 1.0596676890836657e-05, + "loss": 0.8708, + "step": 13680 + }, + { + "epoch": 0.6661797287755947, + "grad_norm": 1.8062546253204346, + "learning_rate": 1.0593893032389872e-05, + "loss": 0.7698, + "step": 13681 + }, + { + "epoch": 0.6662284225647018, + "grad_norm": 1.6179661750793457, + "learning_rate": 1.0591109407920856e-05, + "loss": 0.807, + "step": 13682 + }, + { + "epoch": 0.666277116353809, + "grad_norm": 1.7292166948318481, + "learning_rate": 1.058832601749885e-05, + "loss": 0.7582, + "step": 13683 + }, + { + "epoch": 0.6663258101429163, + "grad_norm": 0.0961562991142273, + "learning_rate": 1.0585542861193092e-05, + "loss": 0.5652, + "step": 13684 + }, + { + "epoch": 0.6663745039320235, + "grad_norm": 4.961952209472656, + "learning_rate": 1.0582759939072815e-05, + "loss": 0.8849, + "step": 13685 + }, + { + "epoch": 0.6664231977211307, + "grad_norm": 2.704657793045044, + "learning_rate": 1.0579977251207244e-05, + "loss": 0.9196, + "step": 13686 + }, + { + "epoch": 0.6664718915102379, + "grad_norm": 1.911566972732544, + "learning_rate": 1.0577194797665595e-05, + "loss": 0.8217, + "step": 13687 + }, + { + "epoch": 0.6665205852993451, + "grad_norm": 2.1719624996185303, + "learning_rate": 1.0574412578517089e-05, + "loss": 0.8326, + "step": 13688 + }, + { + "epoch": 0.6665692790884523, + "grad_norm": 1.6437205076217651, + "learning_rate": 1.0571630593830919e-05, + "loss": 0.8424, + "step": 13689 + }, + { + "epoch": 0.6666179728775594, + "grad_norm": 1.2134768962860107, + "learning_rate": 1.0568848843676306e-05, + "loss": 0.8369, + "step": 13690 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 2.2253713607788086, + "learning_rate": 1.0566067328122436e-05, + "loss": 0.8183, + "step": 13691 + }, + { + "epoch": 0.6667153604557738, + "grad_norm": 1.8183555603027344, + "learning_rate": 1.0563286047238503e-05, + "loss": 0.7603, + "step": 13692 + }, + { + "epoch": 0.666764054244881, + "grad_norm": 0.10130798816680908, + "learning_rate": 1.0560505001093691e-05, + "loss": 0.6404, + "step": 13693 + }, + { + "epoch": 0.6668127480339883, + "grad_norm": 2.3198015689849854, + "learning_rate": 1.0557724189757174e-05, + "loss": 0.7935, + "step": 13694 + }, + { + "epoch": 0.6668614418230955, + "grad_norm": 2.86323881149292, + "learning_rate": 1.0554943613298133e-05, + "loss": 0.8363, + "step": 13695 + }, + { + "epoch": 0.6669101356122027, + "grad_norm": 1.8627945184707642, + "learning_rate": 1.055216327178573e-05, + "loss": 0.8718, + "step": 13696 + }, + { + "epoch": 0.6669588294013099, + "grad_norm": 1.5935620069503784, + "learning_rate": 1.0549383165289116e-05, + "loss": 0.7855, + "step": 13697 + }, + { + "epoch": 0.6670075231904171, + "grad_norm": 1.5381660461425781, + "learning_rate": 1.0546603293877476e-05, + "loss": 0.8038, + "step": 13698 + }, + { + "epoch": 0.6670562169795242, + "grad_norm": 1.2440171241760254, + "learning_rate": 1.0543823657619924e-05, + "loss": 0.7879, + "step": 13699 + }, + { + "epoch": 0.6671049107686314, + "grad_norm": 1.519406795501709, + "learning_rate": 1.054104425658563e-05, + "loss": 0.8612, + "step": 13700 + }, + { + "epoch": 0.6671536045577386, + "grad_norm": 1.3396717309951782, + "learning_rate": 1.053826509084372e-05, + "loss": 0.8369, + "step": 13701 + }, + { + "epoch": 0.6672022983468459, + "grad_norm": 1.5352420806884766, + "learning_rate": 1.0535486160463334e-05, + "loss": 0.7246, + "step": 13702 + }, + { + "epoch": 0.6672509921359531, + "grad_norm": 1.5287920236587524, + "learning_rate": 1.0532707465513586e-05, + "loss": 0.8591, + "step": 13703 + }, + { + "epoch": 0.6672996859250603, + "grad_norm": 1.8738733530044556, + "learning_rate": 1.0529929006063599e-05, + "loss": 0.9159, + "step": 13704 + }, + { + "epoch": 0.6673483797141675, + "grad_norm": 2.2131001949310303, + "learning_rate": 1.0527150782182507e-05, + "loss": 0.7962, + "step": 13705 + }, + { + "epoch": 0.6673970735032747, + "grad_norm": 1.317618727684021, + "learning_rate": 1.0524372793939386e-05, + "loss": 0.8097, + "step": 13706 + }, + { + "epoch": 0.6674457672923818, + "grad_norm": 1.8299013376235962, + "learning_rate": 1.0521595041403373e-05, + "loss": 0.8665, + "step": 13707 + }, + { + "epoch": 0.667494461081489, + "grad_norm": 1.5734796524047852, + "learning_rate": 1.051881752464353e-05, + "loss": 0.6909, + "step": 13708 + }, + { + "epoch": 0.6675431548705962, + "grad_norm": 1.5230015516281128, + "learning_rate": 1.0516040243728974e-05, + "loss": 0.7907, + "step": 13709 + }, + { + "epoch": 0.6675918486597034, + "grad_norm": 1.5712661743164062, + "learning_rate": 1.051326319872878e-05, + "loss": 0.7864, + "step": 13710 + }, + { + "epoch": 0.6676405424488107, + "grad_norm": 1.956547737121582, + "learning_rate": 1.0510486389712033e-05, + "loss": 0.794, + "step": 13711 + }, + { + "epoch": 0.6676892362379179, + "grad_norm": 1.493441104888916, + "learning_rate": 1.0507709816747798e-05, + "loss": 0.7965, + "step": 13712 + }, + { + "epoch": 0.6677379300270251, + "grad_norm": 1.5820330381393433, + "learning_rate": 1.0504933479905141e-05, + "loss": 0.9062, + "step": 13713 + }, + { + "epoch": 0.6677866238161323, + "grad_norm": 2.354747772216797, + "learning_rate": 1.0502157379253146e-05, + "loss": 0.7644, + "step": 13714 + }, + { + "epoch": 0.6678353176052394, + "grad_norm": 0.0964067280292511, + "learning_rate": 1.0499381514860836e-05, + "loss": 0.6259, + "step": 13715 + }, + { + "epoch": 0.6678840113943466, + "grad_norm": 1.5275729894638062, + "learning_rate": 1.0496605886797293e-05, + "loss": 0.8344, + "step": 13716 + }, + { + "epoch": 0.6679327051834538, + "grad_norm": 1.5173954963684082, + "learning_rate": 1.0493830495131527e-05, + "loss": 0.8393, + "step": 13717 + }, + { + "epoch": 0.667981398972561, + "grad_norm": 1.3473212718963623, + "learning_rate": 1.0491055339932603e-05, + "loss": 0.8098, + "step": 13718 + }, + { + "epoch": 0.6680300927616682, + "grad_norm": 1.5887551307678223, + "learning_rate": 1.0488280421269545e-05, + "loss": 0.8545, + "step": 13719 + }, + { + "epoch": 0.6680787865507755, + "grad_norm": 1.5038902759552002, + "learning_rate": 1.048550573921137e-05, + "loss": 0.8729, + "step": 13720 + }, + { + "epoch": 0.6681274803398827, + "grad_norm": 1.8747100830078125, + "learning_rate": 1.0482731293827127e-05, + "loss": 0.8102, + "step": 13721 + }, + { + "epoch": 0.6681761741289899, + "grad_norm": 1.362676739692688, + "learning_rate": 1.047995708518579e-05, + "loss": 0.7252, + "step": 13722 + }, + { + "epoch": 0.6682248679180971, + "grad_norm": 3.0916311740875244, + "learning_rate": 1.0477183113356405e-05, + "loss": 0.8213, + "step": 13723 + }, + { + "epoch": 0.6682735617072042, + "grad_norm": 1.4411609172821045, + "learning_rate": 1.0474409378407946e-05, + "loss": 0.7652, + "step": 13724 + }, + { + "epoch": 0.6683222554963114, + "grad_norm": 1.8291547298431396, + "learning_rate": 1.0471635880409427e-05, + "loss": 0.8075, + "step": 13725 + }, + { + "epoch": 0.6683709492854186, + "grad_norm": 1.9004192352294922, + "learning_rate": 1.0468862619429836e-05, + "loss": 0.7767, + "step": 13726 + }, + { + "epoch": 0.6684196430745258, + "grad_norm": 2.118555784225464, + "learning_rate": 1.0466089595538157e-05, + "loss": 0.8453, + "step": 13727 + }, + { + "epoch": 0.668468336863633, + "grad_norm": 1.400716781616211, + "learning_rate": 1.0463316808803366e-05, + "loss": 0.8628, + "step": 13728 + }, + { + "epoch": 0.6685170306527403, + "grad_norm": 1.5999445915222168, + "learning_rate": 1.0460544259294432e-05, + "loss": 0.8255, + "step": 13729 + }, + { + "epoch": 0.6685657244418475, + "grad_norm": 1.2850457429885864, + "learning_rate": 1.0457771947080345e-05, + "loss": 0.898, + "step": 13730 + }, + { + "epoch": 0.6686144182309547, + "grad_norm": 1.4512295722961426, + "learning_rate": 1.0454999872230034e-05, + "loss": 0.7815, + "step": 13731 + }, + { + "epoch": 0.6686631120200618, + "grad_norm": 3.428253173828125, + "learning_rate": 1.0452228034812481e-05, + "loss": 0.7543, + "step": 13732 + }, + { + "epoch": 0.668711805809169, + "grad_norm": 1.3549528121948242, + "learning_rate": 1.0449456434896627e-05, + "loss": 0.8071, + "step": 13733 + }, + { + "epoch": 0.6687604995982762, + "grad_norm": 2.8751626014709473, + "learning_rate": 1.0446685072551413e-05, + "loss": 0.8557, + "step": 13734 + }, + { + "epoch": 0.6688091933873834, + "grad_norm": 1.8289707899093628, + "learning_rate": 1.0443913947845781e-05, + "loss": 0.8315, + "step": 13735 + }, + { + "epoch": 0.6688578871764906, + "grad_norm": 1.4584858417510986, + "learning_rate": 1.0441143060848658e-05, + "loss": 0.7767, + "step": 13736 + }, + { + "epoch": 0.6689065809655979, + "grad_norm": 1.6932867765426636, + "learning_rate": 1.0438372411628974e-05, + "loss": 0.8246, + "step": 13737 + }, + { + "epoch": 0.6689552747547051, + "grad_norm": 1.5357029438018799, + "learning_rate": 1.0435602000255649e-05, + "loss": 0.8981, + "step": 13738 + }, + { + "epoch": 0.6690039685438123, + "grad_norm": 2.541433334350586, + "learning_rate": 1.0432831826797587e-05, + "loss": 0.9283, + "step": 13739 + }, + { + "epoch": 0.6690526623329195, + "grad_norm": 1.787100911140442, + "learning_rate": 1.0430061891323714e-05, + "loss": 0.8636, + "step": 13740 + }, + { + "epoch": 0.6691013561220266, + "grad_norm": 2.8217899799346924, + "learning_rate": 1.0427292193902925e-05, + "loss": 0.8244, + "step": 13741 + }, + { + "epoch": 0.6691500499111338, + "grad_norm": 2.354607105255127, + "learning_rate": 1.0424522734604114e-05, + "loss": 0.7627, + "step": 13742 + }, + { + "epoch": 0.669198743700241, + "grad_norm": 1.3258891105651855, + "learning_rate": 1.0421753513496173e-05, + "loss": 0.7249, + "step": 13743 + }, + { + "epoch": 0.6692474374893482, + "grad_norm": 1.5467689037322998, + "learning_rate": 1.0418984530647989e-05, + "loss": 0.804, + "step": 13744 + }, + { + "epoch": 0.6692961312784554, + "grad_norm": 1.5483484268188477, + "learning_rate": 1.0416215786128437e-05, + "loss": 0.7636, + "step": 13745 + }, + { + "epoch": 0.6693448250675627, + "grad_norm": 1.764246940612793, + "learning_rate": 1.041344728000639e-05, + "loss": 0.8123, + "step": 13746 + }, + { + "epoch": 0.6693935188566699, + "grad_norm": 3.0627384185791016, + "learning_rate": 1.0410679012350717e-05, + "loss": 0.788, + "step": 13747 + }, + { + "epoch": 0.6694422126457771, + "grad_norm": 1.6727616786956787, + "learning_rate": 1.0407910983230271e-05, + "loss": 0.7809, + "step": 13748 + }, + { + "epoch": 0.6694909064348842, + "grad_norm": 2.9138236045837402, + "learning_rate": 1.040514319271392e-05, + "loss": 0.7572, + "step": 13749 + }, + { + "epoch": 0.6695396002239914, + "grad_norm": 1.761373519897461, + "learning_rate": 1.0402375640870507e-05, + "loss": 0.736, + "step": 13750 + }, + { + "epoch": 0.6695882940130986, + "grad_norm": 1.3818844556808472, + "learning_rate": 1.0399608327768877e-05, + "loss": 0.7771, + "step": 13751 + }, + { + "epoch": 0.6696369878022058, + "grad_norm": 1.288959264755249, + "learning_rate": 1.0396841253477863e-05, + "loss": 0.7876, + "step": 13752 + }, + { + "epoch": 0.669685681591313, + "grad_norm": 1.3279458284378052, + "learning_rate": 1.0394074418066298e-05, + "loss": 0.8927, + "step": 13753 + }, + { + "epoch": 0.6697343753804202, + "grad_norm": 2.4441778659820557, + "learning_rate": 1.0391307821603011e-05, + "loss": 0.7421, + "step": 13754 + }, + { + "epoch": 0.6697830691695275, + "grad_norm": 2.3943092823028564, + "learning_rate": 1.0388541464156807e-05, + "loss": 0.7938, + "step": 13755 + }, + { + "epoch": 0.6698317629586347, + "grad_norm": 1.2716834545135498, + "learning_rate": 1.0385775345796526e-05, + "loss": 0.7459, + "step": 13756 + }, + { + "epoch": 0.6698804567477418, + "grad_norm": 1.8458549976348877, + "learning_rate": 1.0383009466590948e-05, + "loss": 0.8278, + "step": 13757 + }, + { + "epoch": 0.669929150536849, + "grad_norm": 1.4727858304977417, + "learning_rate": 1.038024382660889e-05, + "loss": 0.7408, + "step": 13758 + }, + { + "epoch": 0.6699778443259562, + "grad_norm": 0.09516578167676926, + "learning_rate": 1.0377478425919145e-05, + "loss": 0.5957, + "step": 13759 + }, + { + "epoch": 0.6700265381150634, + "grad_norm": 1.564957857131958, + "learning_rate": 1.0374713264590502e-05, + "loss": 0.7901, + "step": 13760 + }, + { + "epoch": 0.6700752319041706, + "grad_norm": 2.010531425476074, + "learning_rate": 1.0371948342691744e-05, + "loss": 0.8571, + "step": 13761 + }, + { + "epoch": 0.6701239256932778, + "grad_norm": 0.10822664946317673, + "learning_rate": 1.0369183660291641e-05, + "loss": 0.6495, + "step": 13762 + }, + { + "epoch": 0.670172619482385, + "grad_norm": 1.3966779708862305, + "learning_rate": 1.036641921745899e-05, + "loss": 0.8666, + "step": 13763 + }, + { + "epoch": 0.6702213132714923, + "grad_norm": 1.2844703197479248, + "learning_rate": 1.0363655014262521e-05, + "loss": 0.7945, + "step": 13764 + }, + { + "epoch": 0.6702700070605995, + "grad_norm": 1.5478522777557373, + "learning_rate": 1.036089105077103e-05, + "loss": 0.8299, + "step": 13765 + }, + { + "epoch": 0.6703187008497066, + "grad_norm": 1.590954065322876, + "learning_rate": 1.0358127327053235e-05, + "loss": 0.9015, + "step": 13766 + }, + { + "epoch": 0.6703673946388138, + "grad_norm": 2.5400805473327637, + "learning_rate": 1.035536384317791e-05, + "loss": 0.8006, + "step": 13767 + }, + { + "epoch": 0.670416088427921, + "grad_norm": 0.11067240685224533, + "learning_rate": 1.0352600599213792e-05, + "loss": 0.6592, + "step": 13768 + }, + { + "epoch": 0.6704647822170282, + "grad_norm": 2.2798123359680176, + "learning_rate": 1.034983759522961e-05, + "loss": 0.9114, + "step": 13769 + }, + { + "epoch": 0.6705134760061354, + "grad_norm": 1.4613162279129028, + "learning_rate": 1.03470748312941e-05, + "loss": 0.8606, + "step": 13770 + }, + { + "epoch": 0.6705621697952426, + "grad_norm": 1.7972289323806763, + "learning_rate": 1.0344312307475972e-05, + "loss": 0.8229, + "step": 13771 + }, + { + "epoch": 0.6706108635843498, + "grad_norm": 1.569435715675354, + "learning_rate": 1.0341550023843973e-05, + "loss": 0.8525, + "step": 13772 + }, + { + "epoch": 0.6706595573734571, + "grad_norm": 1.3576854467391968, + "learning_rate": 1.0338787980466787e-05, + "loss": 0.7887, + "step": 13773 + }, + { + "epoch": 0.6707082511625642, + "grad_norm": 2.566817283630371, + "learning_rate": 1.0336026177413141e-05, + "loss": 0.7777, + "step": 13774 + }, + { + "epoch": 0.6707569449516714, + "grad_norm": 0.09991078823804855, + "learning_rate": 1.0333264614751712e-05, + "loss": 0.6633, + "step": 13775 + }, + { + "epoch": 0.6708056387407786, + "grad_norm": 1.5704847574234009, + "learning_rate": 1.0330503292551213e-05, + "loss": 0.7926, + "step": 13776 + }, + { + "epoch": 0.6708543325298858, + "grad_norm": 1.6096118688583374, + "learning_rate": 1.0327742210880327e-05, + "loss": 0.8202, + "step": 13777 + }, + { + "epoch": 0.670903026318993, + "grad_norm": 1.812748908996582, + "learning_rate": 1.0324981369807729e-05, + "loss": 0.8533, + "step": 13778 + }, + { + "epoch": 0.6709517201081002, + "grad_norm": 1.7842559814453125, + "learning_rate": 1.0322220769402116e-05, + "loss": 0.7633, + "step": 13779 + }, + { + "epoch": 0.6710004138972074, + "grad_norm": 1.3714556694030762, + "learning_rate": 1.0319460409732128e-05, + "loss": 0.7202, + "step": 13780 + }, + { + "epoch": 0.6710491076863146, + "grad_norm": 1.2382742166519165, + "learning_rate": 1.0316700290866461e-05, + "loss": 0.8307, + "step": 13781 + }, + { + "epoch": 0.6710978014754219, + "grad_norm": 1.496696949005127, + "learning_rate": 1.031394041287374e-05, + "loss": 0.8995, + "step": 13782 + }, + { + "epoch": 0.671146495264529, + "grad_norm": 2.049070119857788, + "learning_rate": 1.0311180775822644e-05, + "loss": 0.7516, + "step": 13783 + }, + { + "epoch": 0.6711951890536362, + "grad_norm": 1.871654748916626, + "learning_rate": 1.0308421379781808e-05, + "loss": 0.8707, + "step": 13784 + }, + { + "epoch": 0.6712438828427434, + "grad_norm": 1.3889999389648438, + "learning_rate": 1.0305662224819874e-05, + "loss": 0.7961, + "step": 13785 + }, + { + "epoch": 0.6712925766318506, + "grad_norm": 1.3233473300933838, + "learning_rate": 1.030290331100548e-05, + "loss": 0.8943, + "step": 13786 + }, + { + "epoch": 0.6713412704209578, + "grad_norm": 1.696364402770996, + "learning_rate": 1.0300144638407245e-05, + "loss": 0.8579, + "step": 13787 + }, + { + "epoch": 0.671389964210065, + "grad_norm": 1.393458366394043, + "learning_rate": 1.02973862070938e-05, + "loss": 0.7548, + "step": 13788 + }, + { + "epoch": 0.6714386579991722, + "grad_norm": 1.6949723958969116, + "learning_rate": 1.0294628017133748e-05, + "loss": 0.8749, + "step": 13789 + }, + { + "epoch": 0.6714873517882795, + "grad_norm": 1.5226253271102905, + "learning_rate": 1.0291870068595717e-05, + "loss": 0.9781, + "step": 13790 + }, + { + "epoch": 0.6715360455773866, + "grad_norm": 1.3972272872924805, + "learning_rate": 1.0289112361548303e-05, + "loss": 0.8241, + "step": 13791 + }, + { + "epoch": 0.6715847393664938, + "grad_norm": 1.53734290599823, + "learning_rate": 1.0286354896060105e-05, + "loss": 0.7904, + "step": 13792 + }, + { + "epoch": 0.671633433155601, + "grad_norm": 1.4434319734573364, + "learning_rate": 1.0283597672199717e-05, + "loss": 0.8677, + "step": 13793 + }, + { + "epoch": 0.6716821269447082, + "grad_norm": 1.465814232826233, + "learning_rate": 1.0280840690035718e-05, + "loss": 0.897, + "step": 13794 + }, + { + "epoch": 0.6717308207338154, + "grad_norm": 1.532888650894165, + "learning_rate": 1.0278083949636697e-05, + "loss": 0.8698, + "step": 13795 + }, + { + "epoch": 0.6717795145229226, + "grad_norm": 1.9124759435653687, + "learning_rate": 1.0275327451071221e-05, + "loss": 0.8974, + "step": 13796 + }, + { + "epoch": 0.6718282083120298, + "grad_norm": 1.4963438510894775, + "learning_rate": 1.0272571194407862e-05, + "loss": 0.9047, + "step": 13797 + }, + { + "epoch": 0.671876902101137, + "grad_norm": 1.4235442876815796, + "learning_rate": 1.0269815179715174e-05, + "loss": 0.8029, + "step": 13798 + }, + { + "epoch": 0.6719255958902443, + "grad_norm": 1.5456515550613403, + "learning_rate": 1.0267059407061732e-05, + "loss": 0.7727, + "step": 13799 + }, + { + "epoch": 0.6719742896793514, + "grad_norm": 1.3548177480697632, + "learning_rate": 1.0264303876516071e-05, + "loss": 0.7675, + "step": 13800 + }, + { + "epoch": 0.6720229834684586, + "grad_norm": 1.4143749475479126, + "learning_rate": 1.0261548588146739e-05, + "loss": 0.8581, + "step": 13801 + }, + { + "epoch": 0.6720716772575658, + "grad_norm": 1.7428077459335327, + "learning_rate": 1.0258793542022273e-05, + "loss": 0.7594, + "step": 13802 + }, + { + "epoch": 0.672120371046673, + "grad_norm": 1.4615918397903442, + "learning_rate": 1.0256038738211206e-05, + "loss": 0.8963, + "step": 13803 + }, + { + "epoch": 0.6721690648357802, + "grad_norm": 1.4174212217330933, + "learning_rate": 1.0253284176782068e-05, + "loss": 0.8454, + "step": 13804 + }, + { + "epoch": 0.6722177586248874, + "grad_norm": 1.671203374862671, + "learning_rate": 1.0250529857803372e-05, + "loss": 0.8399, + "step": 13805 + }, + { + "epoch": 0.6722664524139946, + "grad_norm": 2.02167010307312, + "learning_rate": 1.0247775781343625e-05, + "loss": 0.8599, + "step": 13806 + }, + { + "epoch": 0.6723151462031018, + "grad_norm": 1.3726544380187988, + "learning_rate": 1.0245021947471361e-05, + "loss": 0.8235, + "step": 13807 + }, + { + "epoch": 0.672363839992209, + "grad_norm": 1.787957787513733, + "learning_rate": 1.024226835625505e-05, + "loss": 0.769, + "step": 13808 + }, + { + "epoch": 0.6724125337813162, + "grad_norm": 1.656976342201233, + "learning_rate": 1.0239515007763213e-05, + "loss": 0.8595, + "step": 13809 + }, + { + "epoch": 0.6724612275704234, + "grad_norm": 4.4757399559021, + "learning_rate": 1.0236761902064327e-05, + "loss": 0.8203, + "step": 13810 + }, + { + "epoch": 0.6725099213595306, + "grad_norm": 1.617937684059143, + "learning_rate": 1.023400903922688e-05, + "loss": 0.8476, + "step": 13811 + }, + { + "epoch": 0.6725586151486378, + "grad_norm": 1.474249243736267, + "learning_rate": 1.023125641931935e-05, + "loss": 0.7942, + "step": 13812 + }, + { + "epoch": 0.672607308937745, + "grad_norm": 1.821291208267212, + "learning_rate": 1.0228504042410199e-05, + "loss": 0.9462, + "step": 13813 + }, + { + "epoch": 0.6726560027268522, + "grad_norm": 1.6001695394515991, + "learning_rate": 1.0225751908567916e-05, + "loss": 0.8016, + "step": 13814 + }, + { + "epoch": 0.6727046965159594, + "grad_norm": 1.8952670097351074, + "learning_rate": 1.0223000017860932e-05, + "loss": 0.8279, + "step": 13815 + }, + { + "epoch": 0.6727533903050665, + "grad_norm": 1.955309271812439, + "learning_rate": 1.0220248370357723e-05, + "loss": 0.8594, + "step": 13816 + }, + { + "epoch": 0.6728020840941737, + "grad_norm": 1.5655512809753418, + "learning_rate": 1.0217496966126719e-05, + "loss": 0.8015, + "step": 13817 + }, + { + "epoch": 0.672850777883281, + "grad_norm": 2.155637741088867, + "learning_rate": 1.0214745805236375e-05, + "loss": 0.7343, + "step": 13818 + }, + { + "epoch": 0.6728994716723882, + "grad_norm": 1.3189350366592407, + "learning_rate": 1.0211994887755122e-05, + "loss": 0.7643, + "step": 13819 + }, + { + "epoch": 0.6729481654614954, + "grad_norm": 1.5306236743927002, + "learning_rate": 1.0209244213751379e-05, + "loss": 0.8177, + "step": 13820 + }, + { + "epoch": 0.6729968592506026, + "grad_norm": 1.8199050426483154, + "learning_rate": 1.0206493783293596e-05, + "loss": 0.7737, + "step": 13821 + }, + { + "epoch": 0.6730455530397098, + "grad_norm": 2.2581517696380615, + "learning_rate": 1.0203743596450157e-05, + "loss": 0.8515, + "step": 13822 + }, + { + "epoch": 0.673094246828817, + "grad_norm": 2.2739615440368652, + "learning_rate": 1.0200993653289504e-05, + "loss": 0.7735, + "step": 13823 + }, + { + "epoch": 0.6731429406179242, + "grad_norm": 1.5344352722167969, + "learning_rate": 1.0198243953880014e-05, + "loss": 0.8336, + "step": 13824 + }, + { + "epoch": 0.6731916344070313, + "grad_norm": 1.5259373188018799, + "learning_rate": 1.0195494498290106e-05, + "loss": 0.8434, + "step": 13825 + }, + { + "epoch": 0.6732403281961385, + "grad_norm": 1.631820797920227, + "learning_rate": 1.0192745286588166e-05, + "loss": 0.8205, + "step": 13826 + }, + { + "epoch": 0.6732890219852458, + "grad_norm": 1.9425990581512451, + "learning_rate": 1.0189996318842581e-05, + "loss": 0.892, + "step": 13827 + }, + { + "epoch": 0.673337715774353, + "grad_norm": 1.7821460962295532, + "learning_rate": 1.0187247595121732e-05, + "loss": 0.7844, + "step": 13828 + }, + { + "epoch": 0.6733864095634602, + "grad_norm": 1.521937370300293, + "learning_rate": 1.0184499115493987e-05, + "loss": 0.8886, + "step": 13829 + }, + { + "epoch": 0.6734351033525674, + "grad_norm": 3.2650721073150635, + "learning_rate": 1.0181750880027735e-05, + "loss": 0.7613, + "step": 13830 + }, + { + "epoch": 0.6734837971416746, + "grad_norm": 0.0962076410651207, + "learning_rate": 1.0179002888791311e-05, + "loss": 0.6167, + "step": 13831 + }, + { + "epoch": 0.6735324909307818, + "grad_norm": 1.5840773582458496, + "learning_rate": 1.0176255141853102e-05, + "loss": 0.847, + "step": 13832 + }, + { + "epoch": 0.6735811847198889, + "grad_norm": 1.2255418300628662, + "learning_rate": 1.0173507639281426e-05, + "loss": 0.8162, + "step": 13833 + }, + { + "epoch": 0.6736298785089961, + "grad_norm": 1.3805047273635864, + "learning_rate": 1.0170760381144648e-05, + "loss": 0.8135, + "step": 13834 + }, + { + "epoch": 0.6736785722981034, + "grad_norm": 1.4841147661209106, + "learning_rate": 1.0168013367511106e-05, + "loss": 0.8391, + "step": 13835 + }, + { + "epoch": 0.6737272660872106, + "grad_norm": 1.5519717931747437, + "learning_rate": 1.0165266598449126e-05, + "loss": 0.7321, + "step": 13836 + }, + { + "epoch": 0.6737759598763178, + "grad_norm": 1.274694561958313, + "learning_rate": 1.0162520074027035e-05, + "loss": 0.8265, + "step": 13837 + }, + { + "epoch": 0.673824653665425, + "grad_norm": 1.4649080038070679, + "learning_rate": 1.0159773794313147e-05, + "loss": 0.7038, + "step": 13838 + }, + { + "epoch": 0.6738733474545322, + "grad_norm": 2.2038943767547607, + "learning_rate": 1.0157027759375801e-05, + "loss": 0.7889, + "step": 13839 + }, + { + "epoch": 0.6739220412436394, + "grad_norm": 3.276611804962158, + "learning_rate": 1.0154281969283271e-05, + "loss": 0.7883, + "step": 13840 + }, + { + "epoch": 0.6739707350327466, + "grad_norm": 1.4293221235275269, + "learning_rate": 1.0151536424103879e-05, + "loss": 0.8731, + "step": 13841 + }, + { + "epoch": 0.6740194288218537, + "grad_norm": 2.387014627456665, + "learning_rate": 1.014879112390592e-05, + "loss": 0.786, + "step": 13842 + }, + { + "epoch": 0.6740681226109609, + "grad_norm": 2.2118659019470215, + "learning_rate": 1.0146046068757677e-05, + "loss": 0.8535, + "step": 13843 + }, + { + "epoch": 0.6741168164000682, + "grad_norm": 1.406131625175476, + "learning_rate": 1.0143301258727438e-05, + "loss": 0.7648, + "step": 13844 + }, + { + "epoch": 0.6741655101891754, + "grad_norm": 0.09842150658369064, + "learning_rate": 1.0140556693883479e-05, + "loss": 0.6657, + "step": 13845 + }, + { + "epoch": 0.6742142039782826, + "grad_norm": 2.178806781768799, + "learning_rate": 1.0137812374294072e-05, + "loss": 0.8981, + "step": 13846 + }, + { + "epoch": 0.6742628977673898, + "grad_norm": 3.5174121856689453, + "learning_rate": 1.013506830002747e-05, + "loss": 0.7854, + "step": 13847 + }, + { + "epoch": 0.674311591556497, + "grad_norm": 4.734489917755127, + "learning_rate": 1.0132324471151955e-05, + "loss": 0.8609, + "step": 13848 + }, + { + "epoch": 0.6743602853456042, + "grad_norm": 1.9060765504837036, + "learning_rate": 1.0129580887735768e-05, + "loss": 0.8023, + "step": 13849 + }, + { + "epoch": 0.6744089791347113, + "grad_norm": 1.6420437097549438, + "learning_rate": 1.0126837549847157e-05, + "loss": 0.763, + "step": 13850 + }, + { + "epoch": 0.6744576729238185, + "grad_norm": 1.9814621210098267, + "learning_rate": 1.0124094457554361e-05, + "loss": 0.7625, + "step": 13851 + }, + { + "epoch": 0.6745063667129257, + "grad_norm": 1.6314327716827393, + "learning_rate": 1.0121351610925614e-05, + "loss": 0.877, + "step": 13852 + }, + { + "epoch": 0.674555060502033, + "grad_norm": 1.9933807849884033, + "learning_rate": 1.0118609010029151e-05, + "loss": 0.7754, + "step": 13853 + }, + { + "epoch": 0.6746037542911402, + "grad_norm": 1.60613214969635, + "learning_rate": 1.0115866654933184e-05, + "loss": 0.8022, + "step": 13854 + }, + { + "epoch": 0.6746524480802474, + "grad_norm": 1.4953248500823975, + "learning_rate": 1.0113124545705939e-05, + "loss": 0.7483, + "step": 13855 + }, + { + "epoch": 0.6747011418693546, + "grad_norm": 1.8287051916122437, + "learning_rate": 1.011038268241562e-05, + "loss": 0.7701, + "step": 13856 + }, + { + "epoch": 0.6747498356584618, + "grad_norm": 1.5944761037826538, + "learning_rate": 1.0107641065130428e-05, + "loss": 0.7031, + "step": 13857 + }, + { + "epoch": 0.674798529447569, + "grad_norm": 1.2511740922927856, + "learning_rate": 1.0104899693918573e-05, + "loss": 0.8252, + "step": 13858 + }, + { + "epoch": 0.6748472232366761, + "grad_norm": 1.756919503211975, + "learning_rate": 1.0102158568848242e-05, + "loss": 0.8722, + "step": 13859 + }, + { + "epoch": 0.6748959170257833, + "grad_norm": 4.302449703216553, + "learning_rate": 1.0099417689987616e-05, + "loss": 0.8014, + "step": 13860 + }, + { + "epoch": 0.6749446108148905, + "grad_norm": 1.9120920896530151, + "learning_rate": 1.009667705740488e-05, + "loss": 0.9441, + "step": 13861 + }, + { + "epoch": 0.6749933046039978, + "grad_norm": 2.2505760192871094, + "learning_rate": 1.0093936671168204e-05, + "loss": 0.7787, + "step": 13862 + }, + { + "epoch": 0.675041998393105, + "grad_norm": 2.007847309112549, + "learning_rate": 1.009119653134576e-05, + "loss": 0.8454, + "step": 13863 + }, + { + "epoch": 0.6750906921822122, + "grad_norm": 1.5710850954055786, + "learning_rate": 1.0088456638005692e-05, + "loss": 0.8662, + "step": 13864 + }, + { + "epoch": 0.6751393859713194, + "grad_norm": 1.5972743034362793, + "learning_rate": 1.0085716991216188e-05, + "loss": 0.8108, + "step": 13865 + }, + { + "epoch": 0.6751880797604266, + "grad_norm": 2.894023895263672, + "learning_rate": 1.0082977591045362e-05, + "loss": 0.7668, + "step": 13866 + }, + { + "epoch": 0.6752367735495337, + "grad_norm": 2.337378978729248, + "learning_rate": 1.008023843756138e-05, + "loss": 0.7973, + "step": 13867 + }, + { + "epoch": 0.6752854673386409, + "grad_norm": 1.8173811435699463, + "learning_rate": 1.0077499530832374e-05, + "loss": 0.7833, + "step": 13868 + }, + { + "epoch": 0.6753341611277481, + "grad_norm": 1.3076452016830444, + "learning_rate": 1.0074760870926465e-05, + "loss": 0.7573, + "step": 13869 + }, + { + "epoch": 0.6753828549168553, + "grad_norm": 1.273685336112976, + "learning_rate": 1.007202245791179e-05, + "loss": 0.7796, + "step": 13870 + }, + { + "epoch": 0.6754315487059626, + "grad_norm": 1.8508268594741821, + "learning_rate": 1.0069284291856452e-05, + "loss": 0.8343, + "step": 13871 + }, + { + "epoch": 0.6754802424950698, + "grad_norm": 2.6100449562072754, + "learning_rate": 1.0066546372828589e-05, + "loss": 0.7892, + "step": 13872 + }, + { + "epoch": 0.675528936284177, + "grad_norm": 1.9540882110595703, + "learning_rate": 1.0063808700896276e-05, + "loss": 0.7569, + "step": 13873 + }, + { + "epoch": 0.6755776300732842, + "grad_norm": 1.6864007711410522, + "learning_rate": 1.0061071276127642e-05, + "loss": 0.8051, + "step": 13874 + }, + { + "epoch": 0.6756263238623913, + "grad_norm": 2.249323606491089, + "learning_rate": 1.0058334098590747e-05, + "loss": 0.8217, + "step": 13875 + }, + { + "epoch": 0.6756750176514985, + "grad_norm": 5.466775894165039, + "learning_rate": 1.005559716835371e-05, + "loss": 0.8451, + "step": 13876 + }, + { + "epoch": 0.6757237114406057, + "grad_norm": 3.0404646396636963, + "learning_rate": 1.0052860485484598e-05, + "loss": 0.8277, + "step": 13877 + }, + { + "epoch": 0.6757724052297129, + "grad_norm": 1.670674204826355, + "learning_rate": 1.0050124050051491e-05, + "loss": 0.7356, + "step": 13878 + }, + { + "epoch": 0.6758210990188201, + "grad_norm": 1.3033361434936523, + "learning_rate": 1.0047387862122454e-05, + "loss": 0.8814, + "step": 13879 + }, + { + "epoch": 0.6758697928079274, + "grad_norm": 1.6875802278518677, + "learning_rate": 1.0044651921765544e-05, + "loss": 0.7374, + "step": 13880 + }, + { + "epoch": 0.6759184865970346, + "grad_norm": 1.6141735315322876, + "learning_rate": 1.0041916229048839e-05, + "loss": 0.818, + "step": 13881 + }, + { + "epoch": 0.6759671803861418, + "grad_norm": 1.5615136623382568, + "learning_rate": 1.0039180784040363e-05, + "loss": 0.8627, + "step": 13882 + }, + { + "epoch": 0.676015874175249, + "grad_norm": 1.497535228729248, + "learning_rate": 1.0036445586808186e-05, + "loss": 0.7977, + "step": 13883 + }, + { + "epoch": 0.6760645679643561, + "grad_norm": 2.2347137928009033, + "learning_rate": 1.0033710637420317e-05, + "loss": 0.9122, + "step": 13884 + }, + { + "epoch": 0.6761132617534633, + "grad_norm": 1.4742316007614136, + "learning_rate": 1.0030975935944816e-05, + "loss": 0.8518, + "step": 13885 + }, + { + "epoch": 0.6761619555425705, + "grad_norm": 1.60161554813385, + "learning_rate": 1.0028241482449696e-05, + "loss": 0.7607, + "step": 13886 + }, + { + "epoch": 0.6762106493316777, + "grad_norm": 1.6104345321655273, + "learning_rate": 1.002550727700297e-05, + "loss": 0.7458, + "step": 13887 + }, + { + "epoch": 0.676259343120785, + "grad_norm": 1.6369562149047852, + "learning_rate": 1.0022773319672677e-05, + "loss": 0.7347, + "step": 13888 + }, + { + "epoch": 0.6763080369098922, + "grad_norm": 1.7309730052947998, + "learning_rate": 1.002003961052679e-05, + "loss": 0.8254, + "step": 13889 + }, + { + "epoch": 0.6763567306989994, + "grad_norm": 1.3521404266357422, + "learning_rate": 1.0017306149633343e-05, + "loss": 0.76, + "step": 13890 + }, + { + "epoch": 0.6764054244881066, + "grad_norm": 1.859176754951477, + "learning_rate": 1.00145729370603e-05, + "loss": 0.7527, + "step": 13891 + }, + { + "epoch": 0.6764541182772137, + "grad_norm": 1.9603095054626465, + "learning_rate": 1.0011839972875676e-05, + "loss": 0.7575, + "step": 13892 + }, + { + "epoch": 0.6765028120663209, + "grad_norm": 1.2252018451690674, + "learning_rate": 1.000910725714744e-05, + "loss": 0.9124, + "step": 13893 + }, + { + "epoch": 0.6765515058554281, + "grad_norm": 1.6865360736846924, + "learning_rate": 1.0006374789943576e-05, + "loss": 0.8153, + "step": 13894 + }, + { + "epoch": 0.6766001996445353, + "grad_norm": 1.6928142309188843, + "learning_rate": 1.0003642571332048e-05, + "loss": 0.7645, + "step": 13895 + }, + { + "epoch": 0.6766488934336425, + "grad_norm": 4.700911521911621, + "learning_rate": 1.0000910601380823e-05, + "loss": 0.8411, + "step": 13896 + }, + { + "epoch": 0.6766975872227498, + "grad_norm": 1.8208621740341187, + "learning_rate": 9.998178880157858e-06, + "loss": 0.8218, + "step": 13897 + }, + { + "epoch": 0.676746281011857, + "grad_norm": 2.102090835571289, + "learning_rate": 9.995447407731096e-06, + "loss": 0.8515, + "step": 13898 + }, + { + "epoch": 0.6767949748009642, + "grad_norm": 0.09778116643428802, + "learning_rate": 9.992716184168502e-06, + "loss": 0.6177, + "step": 13899 + }, + { + "epoch": 0.6768436685900714, + "grad_norm": 1.5374231338500977, + "learning_rate": 9.989985209538004e-06, + "loss": 0.8401, + "step": 13900 + }, + { + "epoch": 0.6768923623791785, + "grad_norm": 1.4370684623718262, + "learning_rate": 9.987254483907538e-06, + "loss": 0.7763, + "step": 13901 + }, + { + "epoch": 0.6769410561682857, + "grad_norm": 2.389582395553589, + "learning_rate": 9.984524007345029e-06, + "loss": 0.8739, + "step": 13902 + }, + { + "epoch": 0.6769897499573929, + "grad_norm": 2.089604616165161, + "learning_rate": 9.981793779918396e-06, + "loss": 0.7957, + "step": 13903 + }, + { + "epoch": 0.6770384437465001, + "grad_norm": 2.811920404434204, + "learning_rate": 9.979063801695559e-06, + "loss": 0.8215, + "step": 13904 + }, + { + "epoch": 0.6770871375356073, + "grad_norm": 1.6950130462646484, + "learning_rate": 9.976334072744422e-06, + "loss": 0.8597, + "step": 13905 + }, + { + "epoch": 0.6771358313247146, + "grad_norm": 6.7133355140686035, + "learning_rate": 9.97360459313289e-06, + "loss": 0.9195, + "step": 13906 + }, + { + "epoch": 0.6771845251138218, + "grad_norm": 3.119227170944214, + "learning_rate": 9.970875362928852e-06, + "loss": 0.9451, + "step": 13907 + }, + { + "epoch": 0.677233218902929, + "grad_norm": 2.062800884246826, + "learning_rate": 9.968146382200208e-06, + "loss": 0.8179, + "step": 13908 + }, + { + "epoch": 0.6772819126920361, + "grad_norm": 1.5354747772216797, + "learning_rate": 9.96541765101484e-06, + "loss": 0.8659, + "step": 13909 + }, + { + "epoch": 0.6773306064811433, + "grad_norm": 1.2756657600402832, + "learning_rate": 9.962689169440622e-06, + "loss": 0.8195, + "step": 13910 + }, + { + "epoch": 0.6773793002702505, + "grad_norm": 1.9596502780914307, + "learning_rate": 9.959960937545426e-06, + "loss": 0.8373, + "step": 13911 + }, + { + "epoch": 0.6774279940593577, + "grad_norm": 1.5748707056045532, + "learning_rate": 9.957232955397116e-06, + "loss": 0.9012, + "step": 13912 + }, + { + "epoch": 0.6774766878484649, + "grad_norm": 2.1078693866729736, + "learning_rate": 9.954505223063553e-06, + "loss": 0.8037, + "step": 13913 + }, + { + "epoch": 0.6775253816375721, + "grad_norm": 1.4815561771392822, + "learning_rate": 9.951777740612589e-06, + "loss": 0.7866, + "step": 13914 + }, + { + "epoch": 0.6775740754266794, + "grad_norm": 1.9544955492019653, + "learning_rate": 9.949050508112062e-06, + "loss": 0.7869, + "step": 13915 + }, + { + "epoch": 0.6776227692157866, + "grad_norm": 1.62172269821167, + "learning_rate": 9.946323525629832e-06, + "loss": 0.7988, + "step": 13916 + }, + { + "epoch": 0.6776714630048937, + "grad_norm": 1.8153494596481323, + "learning_rate": 9.94359679323371e-06, + "loss": 0.802, + "step": 13917 + }, + { + "epoch": 0.6777201567940009, + "grad_norm": 1.6415904760360718, + "learning_rate": 9.940870310991541e-06, + "loss": 0.8239, + "step": 13918 + }, + { + "epoch": 0.6777688505831081, + "grad_norm": 1.9249401092529297, + "learning_rate": 9.938144078971139e-06, + "loss": 0.7997, + "step": 13919 + }, + { + "epoch": 0.6778175443722153, + "grad_norm": 1.5910648107528687, + "learning_rate": 9.935418097240322e-06, + "loss": 0.7817, + "step": 13920 + }, + { + "epoch": 0.6778662381613225, + "grad_norm": 1.4543330669403076, + "learning_rate": 9.932692365866897e-06, + "loss": 0.8278, + "step": 13921 + }, + { + "epoch": 0.6779149319504297, + "grad_norm": 1.4061362743377686, + "learning_rate": 9.929966884918661e-06, + "loss": 0.8003, + "step": 13922 + }, + { + "epoch": 0.677963625739537, + "grad_norm": 0.11389070004224777, + "learning_rate": 9.927241654463428e-06, + "loss": 0.6126, + "step": 13923 + }, + { + "epoch": 0.6780123195286442, + "grad_norm": 1.5574220418930054, + "learning_rate": 9.924516674568967e-06, + "loss": 0.8363, + "step": 13924 + }, + { + "epoch": 0.6780610133177514, + "grad_norm": 1.640734314918518, + "learning_rate": 9.921791945303085e-06, + "loss": 0.7782, + "step": 13925 + }, + { + "epoch": 0.6781097071068585, + "grad_norm": 1.6958751678466797, + "learning_rate": 9.919067466733532e-06, + "loss": 0.8297, + "step": 13926 + }, + { + "epoch": 0.6781584008959657, + "grad_norm": 2.8252336978912354, + "learning_rate": 9.916343238928104e-06, + "loss": 0.8289, + "step": 13927 + }, + { + "epoch": 0.6782070946850729, + "grad_norm": 1.3433048725128174, + "learning_rate": 9.913619261954554e-06, + "loss": 0.853, + "step": 13928 + }, + { + "epoch": 0.6782557884741801, + "grad_norm": 1.936867117881775, + "learning_rate": 9.91089553588064e-06, + "loss": 0.8078, + "step": 13929 + }, + { + "epoch": 0.6783044822632873, + "grad_norm": 2.009413480758667, + "learning_rate": 9.908172060774132e-06, + "loss": 0.8267, + "step": 13930 + }, + { + "epoch": 0.6783531760523945, + "grad_norm": 1.5542868375778198, + "learning_rate": 9.905448836702752e-06, + "loss": 0.7795, + "step": 13931 + }, + { + "epoch": 0.6784018698415017, + "grad_norm": 1.8550270795822144, + "learning_rate": 9.902725863734266e-06, + "loss": 0.7489, + "step": 13932 + }, + { + "epoch": 0.678450563630609, + "grad_norm": 2.8352370262145996, + "learning_rate": 9.900003141936381e-06, + "loss": 0.8322, + "step": 13933 + }, + { + "epoch": 0.6784992574197161, + "grad_norm": 1.6570497751235962, + "learning_rate": 9.897280671376845e-06, + "loss": 0.8427, + "step": 13934 + }, + { + "epoch": 0.6785479512088233, + "grad_norm": 1.5231164693832397, + "learning_rate": 9.894558452123373e-06, + "loss": 0.7763, + "step": 13935 + }, + { + "epoch": 0.6785966449979305, + "grad_norm": 0.09686020016670227, + "learning_rate": 9.891836484243682e-06, + "loss": 0.6725, + "step": 13936 + }, + { + "epoch": 0.6786453387870377, + "grad_norm": 0.10361858457326889, + "learning_rate": 9.889114767805482e-06, + "loss": 0.6594, + "step": 13937 + }, + { + "epoch": 0.6786940325761449, + "grad_norm": 1.5812572240829468, + "learning_rate": 9.886393302876463e-06, + "loss": 0.7659, + "step": 13938 + }, + { + "epoch": 0.6787427263652521, + "grad_norm": 1.8689347505569458, + "learning_rate": 9.883672089524347e-06, + "loss": 0.9544, + "step": 13939 + }, + { + "epoch": 0.6787914201543593, + "grad_norm": 2.5242953300476074, + "learning_rate": 9.880951127816797e-06, + "loss": 0.8583, + "step": 13940 + }, + { + "epoch": 0.6788401139434665, + "grad_norm": 1.5606493949890137, + "learning_rate": 9.878230417821526e-06, + "loss": 0.8049, + "step": 13941 + }, + { + "epoch": 0.6788888077325738, + "grad_norm": 1.997510552406311, + "learning_rate": 9.875509959606182e-06, + "loss": 0.7894, + "step": 13942 + }, + { + "epoch": 0.6789375015216809, + "grad_norm": 1.9020789861679077, + "learning_rate": 9.872789753238456e-06, + "loss": 0.8231, + "step": 13943 + }, + { + "epoch": 0.6789861953107881, + "grad_norm": 2.5039525032043457, + "learning_rate": 9.87006979878601e-06, + "loss": 0.8465, + "step": 13944 + }, + { + "epoch": 0.6790348890998953, + "grad_norm": 0.0929594412446022, + "learning_rate": 9.867350096316499e-06, + "loss": 0.6186, + "step": 13945 + }, + { + "epoch": 0.6790835828890025, + "grad_norm": 1.4283069372177124, + "learning_rate": 9.864630645897582e-06, + "loss": 0.8023, + "step": 13946 + }, + { + "epoch": 0.6791322766781097, + "grad_norm": 1.9377561807632446, + "learning_rate": 9.86191144759689e-06, + "loss": 0.8471, + "step": 13947 + }, + { + "epoch": 0.6791809704672169, + "grad_norm": 1.5276970863342285, + "learning_rate": 9.859192501482094e-06, + "loss": 0.8263, + "step": 13948 + }, + { + "epoch": 0.6792296642563241, + "grad_norm": 2.880037784576416, + "learning_rate": 9.856473807620793e-06, + "loss": 0.781, + "step": 13949 + }, + { + "epoch": 0.6792783580454314, + "grad_norm": 1.7724581956863403, + "learning_rate": 9.853755366080636e-06, + "loss": 0.7964, + "step": 13950 + }, + { + "epoch": 0.6793270518345385, + "grad_norm": 1.7082310914993286, + "learning_rate": 9.85103717692924e-06, + "loss": 0.7643, + "step": 13951 + }, + { + "epoch": 0.6793757456236457, + "grad_norm": 5.254205226898193, + "learning_rate": 9.848319240234221e-06, + "loss": 0.8219, + "step": 13952 + }, + { + "epoch": 0.6794244394127529, + "grad_norm": 1.563789963722229, + "learning_rate": 9.845601556063186e-06, + "loss": 0.8849, + "step": 13953 + }, + { + "epoch": 0.6794731332018601, + "grad_norm": 1.6031250953674316, + "learning_rate": 9.84288412448374e-06, + "loss": 0.8246, + "step": 13954 + }, + { + "epoch": 0.6795218269909673, + "grad_norm": 1.5291316509246826, + "learning_rate": 9.840166945563474e-06, + "loss": 0.8142, + "step": 13955 + }, + { + "epoch": 0.6795705207800745, + "grad_norm": 1.8405210971832275, + "learning_rate": 9.837450019369977e-06, + "loss": 0.8003, + "step": 13956 + }, + { + "epoch": 0.6796192145691817, + "grad_norm": 1.325943946838379, + "learning_rate": 9.834733345970844e-06, + "loss": 0.8727, + "step": 13957 + }, + { + "epoch": 0.6796679083582889, + "grad_norm": 1.77336585521698, + "learning_rate": 9.832016925433645e-06, + "loss": 0.8491, + "step": 13958 + }, + { + "epoch": 0.6797166021473962, + "grad_norm": 2.593461513519287, + "learning_rate": 9.829300757825952e-06, + "loss": 0.8418, + "step": 13959 + }, + { + "epoch": 0.6797652959365033, + "grad_norm": 1.700400710105896, + "learning_rate": 9.826584843215332e-06, + "loss": 0.8889, + "step": 13960 + }, + { + "epoch": 0.6798139897256105, + "grad_norm": 1.645120620727539, + "learning_rate": 9.823869181669339e-06, + "loss": 0.8255, + "step": 13961 + }, + { + "epoch": 0.6798626835147177, + "grad_norm": 1.3241212368011475, + "learning_rate": 9.82115377325553e-06, + "loss": 0.8501, + "step": 13962 + }, + { + "epoch": 0.6799113773038249, + "grad_norm": 1.2055392265319824, + "learning_rate": 9.818438618041449e-06, + "loss": 0.7916, + "step": 13963 + }, + { + "epoch": 0.6799600710929321, + "grad_norm": 1.5351470708847046, + "learning_rate": 9.815723716094635e-06, + "loss": 0.8407, + "step": 13964 + }, + { + "epoch": 0.6800087648820393, + "grad_norm": 1.8388932943344116, + "learning_rate": 9.813009067482619e-06, + "loss": 0.789, + "step": 13965 + }, + { + "epoch": 0.6800574586711465, + "grad_norm": 1.202439546585083, + "learning_rate": 9.810294672272928e-06, + "loss": 0.8845, + "step": 13966 + }, + { + "epoch": 0.6801061524602537, + "grad_norm": 1.2747286558151245, + "learning_rate": 9.807580530533092e-06, + "loss": 0.8847, + "step": 13967 + }, + { + "epoch": 0.6801548462493608, + "grad_norm": 2.273559808731079, + "learning_rate": 9.804866642330619e-06, + "loss": 0.7674, + "step": 13968 + }, + { + "epoch": 0.6802035400384681, + "grad_norm": 10.832539558410645, + "learning_rate": 9.80215300773302e-06, + "loss": 0.8537, + "step": 13969 + }, + { + "epoch": 0.6802522338275753, + "grad_norm": 1.860670804977417, + "learning_rate": 9.799439626807794e-06, + "loss": 0.8303, + "step": 13970 + }, + { + "epoch": 0.6803009276166825, + "grad_norm": 2.5927927494049072, + "learning_rate": 9.796726499622435e-06, + "loss": 0.8183, + "step": 13971 + }, + { + "epoch": 0.6803496214057897, + "grad_norm": 1.634026050567627, + "learning_rate": 9.794013626244437e-06, + "loss": 0.799, + "step": 13972 + }, + { + "epoch": 0.6803983151948969, + "grad_norm": 1.3243038654327393, + "learning_rate": 9.791301006741271e-06, + "loss": 0.76, + "step": 13973 + }, + { + "epoch": 0.6804470089840041, + "grad_norm": 1.8544284105300903, + "learning_rate": 9.788588641180441e-06, + "loss": 0.941, + "step": 13974 + }, + { + "epoch": 0.6804957027731113, + "grad_norm": 1.6136138439178467, + "learning_rate": 9.785876529629383e-06, + "loss": 0.8416, + "step": 13975 + }, + { + "epoch": 0.6805443965622184, + "grad_norm": 0.09161166846752167, + "learning_rate": 9.783164672155585e-06, + "loss": 0.6361, + "step": 13976 + }, + { + "epoch": 0.6805930903513256, + "grad_norm": 1.990671992301941, + "learning_rate": 9.7804530688265e-06, + "loss": 0.868, + "step": 13977 + }, + { + "epoch": 0.6806417841404329, + "grad_norm": 2.3404457569122314, + "learning_rate": 9.777741719709572e-06, + "loss": 0.7775, + "step": 13978 + }, + { + "epoch": 0.6806904779295401, + "grad_norm": 1.4866414070129395, + "learning_rate": 9.775030624872256e-06, + "loss": 0.8229, + "step": 13979 + }, + { + "epoch": 0.6807391717186473, + "grad_norm": 1.67893385887146, + "learning_rate": 9.772319784381974e-06, + "loss": 0.9169, + "step": 13980 + }, + { + "epoch": 0.6807878655077545, + "grad_norm": 1.7764127254486084, + "learning_rate": 9.769609198306185e-06, + "loss": 0.7952, + "step": 13981 + }, + { + "epoch": 0.6808365592968617, + "grad_norm": 1.4250408411026, + "learning_rate": 9.766898866712286e-06, + "loss": 0.8034, + "step": 13982 + }, + { + "epoch": 0.6808852530859689, + "grad_norm": 1.4396716356277466, + "learning_rate": 9.764188789667725e-06, + "loss": 0.7812, + "step": 13983 + }, + { + "epoch": 0.6809339468750761, + "grad_norm": 2.262427568435669, + "learning_rate": 9.761478967239888e-06, + "loss": 0.8427, + "step": 13984 + }, + { + "epoch": 0.6809826406641832, + "grad_norm": 1.746729850769043, + "learning_rate": 9.758769399496202e-06, + "loss": 0.7702, + "step": 13985 + }, + { + "epoch": 0.6810313344532904, + "grad_norm": 1.4241883754730225, + "learning_rate": 9.75606008650406e-06, + "loss": 0.8148, + "step": 13986 + }, + { + "epoch": 0.6810800282423977, + "grad_norm": 1.9507640600204468, + "learning_rate": 9.753351028330858e-06, + "loss": 0.8316, + "step": 13987 + }, + { + "epoch": 0.6811287220315049, + "grad_norm": 1.4408267736434937, + "learning_rate": 9.750642225043982e-06, + "loss": 0.8629, + "step": 13988 + }, + { + "epoch": 0.6811774158206121, + "grad_norm": 0.09940794855356216, + "learning_rate": 9.74793367671081e-06, + "loss": 0.6282, + "step": 13989 + }, + { + "epoch": 0.6812261096097193, + "grad_norm": 1.3013572692871094, + "learning_rate": 9.745225383398738e-06, + "loss": 0.8273, + "step": 13990 + }, + { + "epoch": 0.6812748033988265, + "grad_norm": 1.6855368614196777, + "learning_rate": 9.742517345175101e-06, + "loss": 0.9456, + "step": 13991 + }, + { + "epoch": 0.6813234971879337, + "grad_norm": 4.6658854484558105, + "learning_rate": 9.739809562107299e-06, + "loss": 0.8225, + "step": 13992 + }, + { + "epoch": 0.6813721909770408, + "grad_norm": 1.3867932558059692, + "learning_rate": 9.737102034262651e-06, + "loss": 0.7872, + "step": 13993 + }, + { + "epoch": 0.681420884766148, + "grad_norm": 5.765913486480713, + "learning_rate": 9.734394761708537e-06, + "loss": 0.8004, + "step": 13994 + }, + { + "epoch": 0.6814695785552553, + "grad_norm": 1.204138159751892, + "learning_rate": 9.731687744512285e-06, + "loss": 0.8878, + "step": 13995 + }, + { + "epoch": 0.6815182723443625, + "grad_norm": 0.10533195734024048, + "learning_rate": 9.728980982741232e-06, + "loss": 0.5628, + "step": 13996 + }, + { + "epoch": 0.6815669661334697, + "grad_norm": 1.9237605333328247, + "learning_rate": 9.726274476462729e-06, + "loss": 0.8396, + "step": 13997 + }, + { + "epoch": 0.6816156599225769, + "grad_norm": 1.4380922317504883, + "learning_rate": 9.723568225744069e-06, + "loss": 0.8331, + "step": 13998 + }, + { + "epoch": 0.6816643537116841, + "grad_norm": 1.4540776014328003, + "learning_rate": 9.7208622306526e-06, + "loss": 0.7714, + "step": 13999 + }, + { + "epoch": 0.6817130475007913, + "grad_norm": 1.5407953262329102, + "learning_rate": 9.718156491255609e-06, + "loss": 0.8797, + "step": 14000 + }, + { + "epoch": 0.6817617412898985, + "grad_norm": 2.3706138134002686, + "learning_rate": 9.715451007620418e-06, + "loss": 0.7752, + "step": 14001 + }, + { + "epoch": 0.6818104350790056, + "grad_norm": 1.9100136756896973, + "learning_rate": 9.71274577981432e-06, + "loss": 0.8331, + "step": 14002 + }, + { + "epoch": 0.6818591288681128, + "grad_norm": 0.08930676430463791, + "learning_rate": 9.710040807904609e-06, + "loss": 0.5677, + "step": 14003 + }, + { + "epoch": 0.68190782265722, + "grad_norm": 1.676056146621704, + "learning_rate": 9.707336091958572e-06, + "loss": 0.6918, + "step": 14004 + }, + { + "epoch": 0.6819565164463273, + "grad_norm": 2.343959331512451, + "learning_rate": 9.704631632043488e-06, + "loss": 0.8554, + "step": 14005 + }, + { + "epoch": 0.6820052102354345, + "grad_norm": 1.907478928565979, + "learning_rate": 9.701927428226629e-06, + "loss": 0.7408, + "step": 14006 + }, + { + "epoch": 0.6820539040245417, + "grad_norm": 1.4369862079620361, + "learning_rate": 9.699223480575259e-06, + "loss": 0.8526, + "step": 14007 + }, + { + "epoch": 0.6821025978136489, + "grad_norm": 3.0701582431793213, + "learning_rate": 9.696519789156651e-06, + "loss": 0.722, + "step": 14008 + }, + { + "epoch": 0.6821512916027561, + "grad_norm": 1.4578673839569092, + "learning_rate": 9.693816354038053e-06, + "loss": 0.802, + "step": 14009 + }, + { + "epoch": 0.6821999853918632, + "grad_norm": 1.2952769994735718, + "learning_rate": 9.691113175286712e-06, + "loss": 0.9186, + "step": 14010 + }, + { + "epoch": 0.6822486791809704, + "grad_norm": 2.1929516792297363, + "learning_rate": 9.68841025296987e-06, + "loss": 0.8928, + "step": 14011 + }, + { + "epoch": 0.6822973729700776, + "grad_norm": 1.3166533708572388, + "learning_rate": 9.685707587154764e-06, + "loss": 0.7952, + "step": 14012 + }, + { + "epoch": 0.6823460667591849, + "grad_norm": 2.0213115215301514, + "learning_rate": 9.683005177908624e-06, + "loss": 0.855, + "step": 14013 + }, + { + "epoch": 0.6823947605482921, + "grad_norm": 1.8659915924072266, + "learning_rate": 9.68030302529867e-06, + "loss": 0.743, + "step": 14014 + }, + { + "epoch": 0.6824434543373993, + "grad_norm": 1.7674647569656372, + "learning_rate": 9.677601129392109e-06, + "loss": 0.9552, + "step": 14015 + }, + { + "epoch": 0.6824921481265065, + "grad_norm": 0.09348313510417938, + "learning_rate": 9.674899490256172e-06, + "loss": 0.6319, + "step": 14016 + }, + { + "epoch": 0.6825408419156137, + "grad_norm": 0.09590327739715576, + "learning_rate": 9.67219810795805e-06, + "loss": 0.5538, + "step": 14017 + }, + { + "epoch": 0.6825895357047209, + "grad_norm": 1.3748154640197754, + "learning_rate": 9.669496982564941e-06, + "loss": 0.7902, + "step": 14018 + }, + { + "epoch": 0.682638229493828, + "grad_norm": 2.5088608264923096, + "learning_rate": 9.666796114144037e-06, + "loss": 0.8284, + "step": 14019 + }, + { + "epoch": 0.6826869232829352, + "grad_norm": 1.3666924238204956, + "learning_rate": 9.664095502762522e-06, + "loss": 0.789, + "step": 14020 + }, + { + "epoch": 0.6827356170720424, + "grad_norm": 1.3951627016067505, + "learning_rate": 9.661395148487576e-06, + "loss": 0.742, + "step": 14021 + }, + { + "epoch": 0.6827843108611497, + "grad_norm": 1.7111488580703735, + "learning_rate": 9.658695051386364e-06, + "loss": 0.7934, + "step": 14022 + }, + { + "epoch": 0.6828330046502569, + "grad_norm": 2.37874698638916, + "learning_rate": 9.655995211526058e-06, + "loss": 0.813, + "step": 14023 + }, + { + "epoch": 0.6828816984393641, + "grad_norm": 1.4956252574920654, + "learning_rate": 9.653295628973806e-06, + "loss": 0.8156, + "step": 14024 + }, + { + "epoch": 0.6829303922284713, + "grad_norm": 1.7311980724334717, + "learning_rate": 9.650596303796777e-06, + "loss": 0.8029, + "step": 14025 + }, + { + "epoch": 0.6829790860175785, + "grad_norm": 2.2143917083740234, + "learning_rate": 9.647897236062109e-06, + "loss": 0.7549, + "step": 14026 + }, + { + "epoch": 0.6830277798066856, + "grad_norm": 1.378391146659851, + "learning_rate": 9.645198425836939e-06, + "loss": 0.7991, + "step": 14027 + }, + { + "epoch": 0.6830764735957928, + "grad_norm": 5.400629997253418, + "learning_rate": 9.642499873188405e-06, + "loss": 0.7739, + "step": 14028 + }, + { + "epoch": 0.6831251673849, + "grad_norm": 3.2860093116760254, + "learning_rate": 9.639801578183629e-06, + "loss": 0.8824, + "step": 14029 + }, + { + "epoch": 0.6831738611740072, + "grad_norm": 1.4141838550567627, + "learning_rate": 9.637103540889734e-06, + "loss": 0.8508, + "step": 14030 + }, + { + "epoch": 0.6832225549631145, + "grad_norm": 1.6053388118743896, + "learning_rate": 9.634405761373826e-06, + "loss": 0.797, + "step": 14031 + }, + { + "epoch": 0.6832712487522217, + "grad_norm": 1.3772603273391724, + "learning_rate": 9.631708239703033e-06, + "loss": 0.7483, + "step": 14032 + }, + { + "epoch": 0.6833199425413289, + "grad_norm": 0.09630954265594482, + "learning_rate": 9.629010975944431e-06, + "loss": 0.6077, + "step": 14033 + }, + { + "epoch": 0.6833686363304361, + "grad_norm": 1.5251879692077637, + "learning_rate": 9.62631397016514e-06, + "loss": 0.8192, + "step": 14034 + }, + { + "epoch": 0.6834173301195432, + "grad_norm": 3.367656707763672, + "learning_rate": 9.623617222432218e-06, + "loss": 0.8317, + "step": 14035 + }, + { + "epoch": 0.6834660239086504, + "grad_norm": 1.6398208141326904, + "learning_rate": 9.620920732812773e-06, + "loss": 0.81, + "step": 14036 + }, + { + "epoch": 0.6835147176977576, + "grad_norm": 1.4663509130477905, + "learning_rate": 9.618224501373871e-06, + "loss": 0.809, + "step": 14037 + }, + { + "epoch": 0.6835634114868648, + "grad_norm": 1.7149831056594849, + "learning_rate": 9.615528528182574e-06, + "loss": 0.7864, + "step": 14038 + }, + { + "epoch": 0.683612105275972, + "grad_norm": 1.7276880741119385, + "learning_rate": 9.612832813305965e-06, + "loss": 0.8278, + "step": 14039 + }, + { + "epoch": 0.6836607990650793, + "grad_norm": 1.1076675653457642, + "learning_rate": 9.610137356811073e-06, + "loss": 0.8786, + "step": 14040 + }, + { + "epoch": 0.6837094928541865, + "grad_norm": 3.0276496410369873, + "learning_rate": 9.607442158764976e-06, + "loss": 0.7986, + "step": 14041 + }, + { + "epoch": 0.6837581866432937, + "grad_norm": 1.3487417697906494, + "learning_rate": 9.60474721923469e-06, + "loss": 0.8423, + "step": 14042 + }, + { + "epoch": 0.6838068804324009, + "grad_norm": 1.2253289222717285, + "learning_rate": 9.602052538287269e-06, + "loss": 0.7993, + "step": 14043 + }, + { + "epoch": 0.683855574221508, + "grad_norm": 1.9782885313034058, + "learning_rate": 9.599358115989742e-06, + "loss": 0.8375, + "step": 14044 + }, + { + "epoch": 0.6839042680106152, + "grad_norm": 1.8846893310546875, + "learning_rate": 9.59666395240913e-06, + "loss": 0.7083, + "step": 14045 + }, + { + "epoch": 0.6839529617997224, + "grad_norm": 3.017918109893799, + "learning_rate": 9.593970047612448e-06, + "loss": 0.8876, + "step": 14046 + }, + { + "epoch": 0.6840016555888296, + "grad_norm": 1.645016074180603, + "learning_rate": 9.591276401666705e-06, + "loss": 0.7399, + "step": 14047 + }, + { + "epoch": 0.6840503493779369, + "grad_norm": 1.804297685623169, + "learning_rate": 9.588583014638923e-06, + "loss": 0.8009, + "step": 14048 + }, + { + "epoch": 0.6840990431670441, + "grad_norm": 4.250003814697266, + "learning_rate": 9.585889886596077e-06, + "loss": 0.8608, + "step": 14049 + }, + { + "epoch": 0.6841477369561513, + "grad_norm": 1.6021100282669067, + "learning_rate": 9.583197017605182e-06, + "loss": 0.8366, + "step": 14050 + }, + { + "epoch": 0.6841964307452585, + "grad_norm": 1.9143606424331665, + "learning_rate": 9.580504407733196e-06, + "loss": 0.8363, + "step": 14051 + }, + { + "epoch": 0.6842451245343656, + "grad_norm": 1.2337565422058105, + "learning_rate": 9.57781205704712e-06, + "loss": 0.7919, + "step": 14052 + }, + { + "epoch": 0.6842938183234728, + "grad_norm": 1.5049396753311157, + "learning_rate": 9.575119965613919e-06, + "loss": 0.7947, + "step": 14053 + }, + { + "epoch": 0.68434251211258, + "grad_norm": 1.2783148288726807, + "learning_rate": 9.57242813350056e-06, + "loss": 0.8037, + "step": 14054 + }, + { + "epoch": 0.6843912059016872, + "grad_norm": 1.685831904411316, + "learning_rate": 9.569736560774001e-06, + "loss": 0.85, + "step": 14055 + }, + { + "epoch": 0.6844398996907944, + "grad_norm": 1.412318468093872, + "learning_rate": 9.56704524750119e-06, + "loss": 0.8769, + "step": 14056 + }, + { + "epoch": 0.6844885934799017, + "grad_norm": 2.1006572246551514, + "learning_rate": 9.564354193749095e-06, + "loss": 0.7592, + "step": 14057 + }, + { + "epoch": 0.6845372872690089, + "grad_norm": 1.9166016578674316, + "learning_rate": 9.561663399584624e-06, + "loss": 0.8077, + "step": 14058 + }, + { + "epoch": 0.6845859810581161, + "grad_norm": 2.666404962539673, + "learning_rate": 9.558972865074737e-06, + "loss": 0.7987, + "step": 14059 + }, + { + "epoch": 0.6846346748472233, + "grad_norm": 1.5830906629562378, + "learning_rate": 9.556282590286352e-06, + "loss": 0.7503, + "step": 14060 + }, + { + "epoch": 0.6846833686363304, + "grad_norm": 1.458791732788086, + "learning_rate": 9.553592575286388e-06, + "loss": 0.782, + "step": 14061 + }, + { + "epoch": 0.6847320624254376, + "grad_norm": 1.5788573026657104, + "learning_rate": 9.550902820141763e-06, + "loss": 0.7803, + "step": 14062 + }, + { + "epoch": 0.6847807562145448, + "grad_norm": 0.09320572763681412, + "learning_rate": 9.548213324919382e-06, + "loss": 0.6601, + "step": 14063 + }, + { + "epoch": 0.684829450003652, + "grad_norm": 1.4875680208206177, + "learning_rate": 9.545524089686147e-06, + "loss": 0.7494, + "step": 14064 + }, + { + "epoch": 0.6848781437927592, + "grad_norm": 1.6271686553955078, + "learning_rate": 9.542835114508946e-06, + "loss": 0.7899, + "step": 14065 + }, + { + "epoch": 0.6849268375818665, + "grad_norm": 2.12046217918396, + "learning_rate": 9.540146399454685e-06, + "loss": 0.8369, + "step": 14066 + }, + { + "epoch": 0.6849755313709737, + "grad_norm": 1.3832530975341797, + "learning_rate": 9.537457944590233e-06, + "loss": 0.8161, + "step": 14067 + }, + { + "epoch": 0.6850242251600809, + "grad_norm": 1.4841231107711792, + "learning_rate": 9.534769749982471e-06, + "loss": 0.8056, + "step": 14068 + }, + { + "epoch": 0.685072918949188, + "grad_norm": 1.9188188314437866, + "learning_rate": 9.532081815698264e-06, + "loss": 0.7984, + "step": 14069 + }, + { + "epoch": 0.6851216127382952, + "grad_norm": 1.74631667137146, + "learning_rate": 9.529394141804475e-06, + "loss": 0.8716, + "step": 14070 + }, + { + "epoch": 0.6851703065274024, + "grad_norm": 1.7960506677627563, + "learning_rate": 9.526706728367964e-06, + "loss": 0.7836, + "step": 14071 + }, + { + "epoch": 0.6852190003165096, + "grad_norm": 1.9793801307678223, + "learning_rate": 9.524019575455576e-06, + "loss": 0.8915, + "step": 14072 + }, + { + "epoch": 0.6852676941056168, + "grad_norm": 1.9961333274841309, + "learning_rate": 9.52133268313416e-06, + "loss": 0.848, + "step": 14073 + }, + { + "epoch": 0.685316387894724, + "grad_norm": 3.6061575412750244, + "learning_rate": 9.518646051470543e-06, + "loss": 0.9159, + "step": 14074 + }, + { + "epoch": 0.6853650816838313, + "grad_norm": 3.2932322025299072, + "learning_rate": 9.515959680531558e-06, + "loss": 0.8006, + "step": 14075 + }, + { + "epoch": 0.6854137754729385, + "grad_norm": 0.0910697728395462, + "learning_rate": 9.513273570384038e-06, + "loss": 0.6175, + "step": 14076 + }, + { + "epoch": 0.6854624692620456, + "grad_norm": 1.426216959953308, + "learning_rate": 9.510587721094796e-06, + "loss": 0.7763, + "step": 14077 + }, + { + "epoch": 0.6855111630511528, + "grad_norm": 1.6418869495391846, + "learning_rate": 9.507902132730637e-06, + "loss": 0.8368, + "step": 14078 + }, + { + "epoch": 0.68555985684026, + "grad_norm": 1.6238280534744263, + "learning_rate": 9.505216805358373e-06, + "loss": 0.7834, + "step": 14079 + }, + { + "epoch": 0.6856085506293672, + "grad_norm": 1.7381738424301147, + "learning_rate": 9.502531739044795e-06, + "loss": 0.7222, + "step": 14080 + }, + { + "epoch": 0.6856572444184744, + "grad_norm": 1.7509307861328125, + "learning_rate": 9.499846933856698e-06, + "loss": 0.8104, + "step": 14081 + }, + { + "epoch": 0.6857059382075816, + "grad_norm": 1.5589467287063599, + "learning_rate": 9.497162389860858e-06, + "loss": 0.8531, + "step": 14082 + }, + { + "epoch": 0.6857546319966888, + "grad_norm": 2.9131696224212646, + "learning_rate": 9.494478107124077e-06, + "loss": 0.7567, + "step": 14083 + }, + { + "epoch": 0.6858033257857961, + "grad_norm": 1.3538841009140015, + "learning_rate": 9.491794085713095e-06, + "loss": 0.7919, + "step": 14084 + }, + { + "epoch": 0.6858520195749033, + "grad_norm": 1.9477176666259766, + "learning_rate": 9.489110325694703e-06, + "loss": 0.8585, + "step": 14085 + }, + { + "epoch": 0.6859007133640104, + "grad_norm": 1.8277419805526733, + "learning_rate": 9.48642682713565e-06, + "loss": 0.7506, + "step": 14086 + }, + { + "epoch": 0.6859494071531176, + "grad_norm": 2.8622701168060303, + "learning_rate": 9.483743590102685e-06, + "loss": 0.8868, + "step": 14087 + }, + { + "epoch": 0.6859981009422248, + "grad_norm": 1.8440256118774414, + "learning_rate": 9.481060614662559e-06, + "loss": 0.9745, + "step": 14088 + }, + { + "epoch": 0.686046794731332, + "grad_norm": 1.6708265542984009, + "learning_rate": 9.478377900882e-06, + "loss": 0.8227, + "step": 14089 + }, + { + "epoch": 0.6860954885204392, + "grad_norm": 1.5931326150894165, + "learning_rate": 9.475695448827766e-06, + "loss": 0.8549, + "step": 14090 + }, + { + "epoch": 0.6861441823095464, + "grad_norm": 1.5365629196166992, + "learning_rate": 9.473013258566554e-06, + "loss": 0.8781, + "step": 14091 + }, + { + "epoch": 0.6861928760986536, + "grad_norm": 1.7972266674041748, + "learning_rate": 9.47033133016511e-06, + "loss": 0.8048, + "step": 14092 + }, + { + "epoch": 0.6862415698877609, + "grad_norm": 1.4338829517364502, + "learning_rate": 9.467649663690121e-06, + "loss": 0.7551, + "step": 14093 + }, + { + "epoch": 0.686290263676868, + "grad_norm": 1.4994865655899048, + "learning_rate": 9.464968259208314e-06, + "loss": 0.8633, + "step": 14094 + }, + { + "epoch": 0.6863389574659752, + "grad_norm": 20.881763458251953, + "learning_rate": 9.462287116786379e-06, + "loss": 0.7376, + "step": 14095 + }, + { + "epoch": 0.6863876512550824, + "grad_norm": 1.4484268426895142, + "learning_rate": 9.459606236491018e-06, + "loss": 0.7521, + "step": 14096 + }, + { + "epoch": 0.6864363450441896, + "grad_norm": 1.497251272201538, + "learning_rate": 9.456925618388908e-06, + "loss": 0.842, + "step": 14097 + }, + { + "epoch": 0.6864850388332968, + "grad_norm": 1.8202239274978638, + "learning_rate": 9.45424526254673e-06, + "loss": 0.829, + "step": 14098 + }, + { + "epoch": 0.686533732622404, + "grad_norm": 2.6193349361419678, + "learning_rate": 9.451565169031176e-06, + "loss": 0.7799, + "step": 14099 + }, + { + "epoch": 0.6865824264115112, + "grad_norm": 1.81663978099823, + "learning_rate": 9.448885337908886e-06, + "loss": 0.8443, + "step": 14100 + }, + { + "epoch": 0.6866311202006184, + "grad_norm": 1.7946566343307495, + "learning_rate": 9.44620576924655e-06, + "loss": 0.7958, + "step": 14101 + }, + { + "epoch": 0.6866798139897257, + "grad_norm": 0.10039789974689484, + "learning_rate": 9.443526463110795e-06, + "loss": 0.5913, + "step": 14102 + }, + { + "epoch": 0.6867285077788328, + "grad_norm": 1.8616101741790771, + "learning_rate": 9.440847419568287e-06, + "loss": 0.7549, + "step": 14103 + }, + { + "epoch": 0.68677720156794, + "grad_norm": 1.3536731004714966, + "learning_rate": 9.438168638685661e-06, + "loss": 0.8318, + "step": 14104 + }, + { + "epoch": 0.6868258953570472, + "grad_norm": 1.6532200574874878, + "learning_rate": 9.435490120529547e-06, + "loss": 0.8258, + "step": 14105 + }, + { + "epoch": 0.6868745891461544, + "grad_norm": 1.2955310344696045, + "learning_rate": 9.432811865166595e-06, + "loss": 0.8051, + "step": 14106 + }, + { + "epoch": 0.6869232829352616, + "grad_norm": 2.150632619857788, + "learning_rate": 9.430133872663397e-06, + "loss": 0.8108, + "step": 14107 + }, + { + "epoch": 0.6869719767243688, + "grad_norm": 1.287312388420105, + "learning_rate": 9.427456143086594e-06, + "loss": 0.8223, + "step": 14108 + }, + { + "epoch": 0.687020670513476, + "grad_norm": 2.0151710510253906, + "learning_rate": 9.424778676502773e-06, + "loss": 0.8032, + "step": 14109 + }, + { + "epoch": 0.6870693643025833, + "grad_norm": 1.3865492343902588, + "learning_rate": 9.422101472978551e-06, + "loss": 0.8213, + "step": 14110 + }, + { + "epoch": 0.6871180580916904, + "grad_norm": 1.8372679948806763, + "learning_rate": 9.419424532580521e-06, + "loss": 0.8318, + "step": 14111 + }, + { + "epoch": 0.6871667518807976, + "grad_norm": 2.282944679260254, + "learning_rate": 9.41674785537527e-06, + "loss": 0.8479, + "step": 14112 + }, + { + "epoch": 0.6872154456699048, + "grad_norm": 1.8612818717956543, + "learning_rate": 9.41407144142938e-06, + "loss": 0.9085, + "step": 14113 + }, + { + "epoch": 0.687264139459012, + "grad_norm": 2.2292046546936035, + "learning_rate": 9.411395290809429e-06, + "loss": 0.8239, + "step": 14114 + }, + { + "epoch": 0.6873128332481192, + "grad_norm": 2.0029637813568115, + "learning_rate": 9.408719403581986e-06, + "loss": 0.8863, + "step": 14115 + }, + { + "epoch": 0.6873615270372264, + "grad_norm": 1.421926498413086, + "learning_rate": 9.406043779813606e-06, + "loss": 0.8754, + "step": 14116 + }, + { + "epoch": 0.6874102208263336, + "grad_norm": 1.8381822109222412, + "learning_rate": 9.403368419570858e-06, + "loss": 0.8476, + "step": 14117 + }, + { + "epoch": 0.6874589146154408, + "grad_norm": 1.3826322555541992, + "learning_rate": 9.400693322920288e-06, + "loss": 0.7737, + "step": 14118 + }, + { + "epoch": 0.687507608404548, + "grad_norm": 1.9634405374526978, + "learning_rate": 9.39801848992844e-06, + "loss": 0.7925, + "step": 14119 + }, + { + "epoch": 0.6875563021936552, + "grad_norm": 5.943453311920166, + "learning_rate": 9.395343920661846e-06, + "loss": 0.838, + "step": 14120 + }, + { + "epoch": 0.6876049959827624, + "grad_norm": 8.297646522521973, + "learning_rate": 9.39266961518704e-06, + "loss": 0.7697, + "step": 14121 + }, + { + "epoch": 0.6876536897718696, + "grad_norm": 1.604314923286438, + "learning_rate": 9.389995573570545e-06, + "loss": 0.8201, + "step": 14122 + }, + { + "epoch": 0.6877023835609768, + "grad_norm": 1.4403644800186157, + "learning_rate": 9.387321795878879e-06, + "loss": 0.8961, + "step": 14123 + }, + { + "epoch": 0.687751077350084, + "grad_norm": 3.668498992919922, + "learning_rate": 9.384648282178541e-06, + "loss": 0.8347, + "step": 14124 + }, + { + "epoch": 0.6877997711391912, + "grad_norm": 1.5462181568145752, + "learning_rate": 9.381975032536053e-06, + "loss": 0.8664, + "step": 14125 + }, + { + "epoch": 0.6878484649282984, + "grad_norm": 1.3937193155288696, + "learning_rate": 9.379302047017903e-06, + "loss": 0.7939, + "step": 14126 + }, + { + "epoch": 0.6878971587174056, + "grad_norm": 2.2228195667266846, + "learning_rate": 9.376629325690586e-06, + "loss": 0.7801, + "step": 14127 + }, + { + "epoch": 0.6879458525065127, + "grad_norm": 1.610564947128296, + "learning_rate": 9.37395686862058e-06, + "loss": 0.8096, + "step": 14128 + }, + { + "epoch": 0.68799454629562, + "grad_norm": 1.6752711534500122, + "learning_rate": 9.371284675874366e-06, + "loss": 0.6919, + "step": 14129 + }, + { + "epoch": 0.6880432400847272, + "grad_norm": 1.4697940349578857, + "learning_rate": 9.368612747518416e-06, + "loss": 0.9107, + "step": 14130 + }, + { + "epoch": 0.6880919338738344, + "grad_norm": 2.498265027999878, + "learning_rate": 9.365941083619193e-06, + "loss": 0.8659, + "step": 14131 + }, + { + "epoch": 0.6881406276629416, + "grad_norm": 1.9573924541473389, + "learning_rate": 9.363269684243155e-06, + "loss": 0.8092, + "step": 14132 + }, + { + "epoch": 0.6881893214520488, + "grad_norm": 2.2889974117279053, + "learning_rate": 9.360598549456745e-06, + "loss": 0.8741, + "step": 14133 + }, + { + "epoch": 0.688238015241156, + "grad_norm": 2.3475937843322754, + "learning_rate": 9.357927679326427e-06, + "loss": 0.7752, + "step": 14134 + }, + { + "epoch": 0.6882867090302632, + "grad_norm": 2.7022693157196045, + "learning_rate": 9.355257073918625e-06, + "loss": 0.764, + "step": 14135 + }, + { + "epoch": 0.6883354028193703, + "grad_norm": 1.3111090660095215, + "learning_rate": 9.352586733299773e-06, + "loss": 0.789, + "step": 14136 + }, + { + "epoch": 0.6883840966084775, + "grad_norm": 1.9094568490982056, + "learning_rate": 9.3499166575363e-06, + "loss": 0.8037, + "step": 14137 + }, + { + "epoch": 0.6884327903975848, + "grad_norm": 2.09824800491333, + "learning_rate": 9.34724684669462e-06, + "loss": 0.8433, + "step": 14138 + }, + { + "epoch": 0.688481484186692, + "grad_norm": 1.621761441230774, + "learning_rate": 9.344577300841145e-06, + "loss": 0.9019, + "step": 14139 + }, + { + "epoch": 0.6885301779757992, + "grad_norm": 1.603165626525879, + "learning_rate": 9.341908020042275e-06, + "loss": 0.7992, + "step": 14140 + }, + { + "epoch": 0.6885788717649064, + "grad_norm": 1.47035813331604, + "learning_rate": 9.339239004364429e-06, + "loss": 0.7422, + "step": 14141 + }, + { + "epoch": 0.6886275655540136, + "grad_norm": 2.2313168048858643, + "learning_rate": 9.33657025387397e-06, + "loss": 0.8251, + "step": 14142 + }, + { + "epoch": 0.6886762593431208, + "grad_norm": 0.09269506484270096, + "learning_rate": 9.333901768637312e-06, + "loss": 0.6022, + "step": 14143 + }, + { + "epoch": 0.688724953132228, + "grad_norm": 1.5898128747940063, + "learning_rate": 9.331233548720806e-06, + "loss": 0.7689, + "step": 14144 + }, + { + "epoch": 0.6887736469213351, + "grad_norm": 2.6411664485931396, + "learning_rate": 9.328565594190846e-06, + "loss": 0.7267, + "step": 14145 + }, + { + "epoch": 0.6888223407104423, + "grad_norm": 1.8753248453140259, + "learning_rate": 9.32589790511379e-06, + "loss": 0.7971, + "step": 14146 + }, + { + "epoch": 0.6888710344995496, + "grad_norm": 1.4586800336837769, + "learning_rate": 9.323230481555989e-06, + "loss": 0.8081, + "step": 14147 + }, + { + "epoch": 0.6889197282886568, + "grad_norm": 0.09625795483589172, + "learning_rate": 9.320563323583819e-06, + "loss": 0.6072, + "step": 14148 + }, + { + "epoch": 0.688968422077764, + "grad_norm": 2.522278308868408, + "learning_rate": 9.317896431263593e-06, + "loss": 0.8107, + "step": 14149 + }, + { + "epoch": 0.6890171158668712, + "grad_norm": 4.176743984222412, + "learning_rate": 9.315229804661682e-06, + "loss": 0.7715, + "step": 14150 + }, + { + "epoch": 0.6890658096559784, + "grad_norm": 1.77591872215271, + "learning_rate": 9.312563443844392e-06, + "loss": 0.8153, + "step": 14151 + }, + { + "epoch": 0.6891145034450856, + "grad_norm": 2.2385711669921875, + "learning_rate": 9.309897348878065e-06, + "loss": 0.8554, + "step": 14152 + }, + { + "epoch": 0.6891631972341927, + "grad_norm": 1.4523134231567383, + "learning_rate": 9.307231519829018e-06, + "loss": 0.7633, + "step": 14153 + }, + { + "epoch": 0.6892118910232999, + "grad_norm": 2.2605013847351074, + "learning_rate": 9.304565956763563e-06, + "loss": 0.9054, + "step": 14154 + }, + { + "epoch": 0.6892605848124072, + "grad_norm": 2.457038164138794, + "learning_rate": 9.301900659748003e-06, + "loss": 0.7481, + "step": 14155 + }, + { + "epoch": 0.6893092786015144, + "grad_norm": 1.4069664478302002, + "learning_rate": 9.299235628848634e-06, + "loss": 0.7762, + "step": 14156 + }, + { + "epoch": 0.6893579723906216, + "grad_norm": 1.8746623992919922, + "learning_rate": 9.296570864131767e-06, + "loss": 0.7379, + "step": 14157 + }, + { + "epoch": 0.6894066661797288, + "grad_norm": 1.5398237705230713, + "learning_rate": 9.293906365663662e-06, + "loss": 0.8054, + "step": 14158 + }, + { + "epoch": 0.689455359968836, + "grad_norm": 1.6811074018478394, + "learning_rate": 9.291242133510626e-06, + "loss": 0.7258, + "step": 14159 + }, + { + "epoch": 0.6895040537579432, + "grad_norm": 2.253891706466675, + "learning_rate": 9.288578167738905e-06, + "loss": 0.8166, + "step": 14160 + }, + { + "epoch": 0.6895527475470504, + "grad_norm": 1.8630809783935547, + "learning_rate": 9.285914468414785e-06, + "loss": 0.7814, + "step": 14161 + }, + { + "epoch": 0.6896014413361575, + "grad_norm": 1.7428702116012573, + "learning_rate": 9.283251035604519e-06, + "loss": 0.7185, + "step": 14162 + }, + { + "epoch": 0.6896501351252647, + "grad_norm": 2.1496243476867676, + "learning_rate": 9.280587869374363e-06, + "loss": 0.8698, + "step": 14163 + }, + { + "epoch": 0.689698828914372, + "grad_norm": 1.6613662242889404, + "learning_rate": 9.277924969790558e-06, + "loss": 0.7483, + "step": 14164 + }, + { + "epoch": 0.6897475227034792, + "grad_norm": 1.5269520282745361, + "learning_rate": 9.275262336919342e-06, + "loss": 0.8305, + "step": 14165 + }, + { + "epoch": 0.6897962164925864, + "grad_norm": 1.4752854108810425, + "learning_rate": 9.272599970826967e-06, + "loss": 0.7622, + "step": 14166 + }, + { + "epoch": 0.6898449102816936, + "grad_norm": 1.155551552772522, + "learning_rate": 9.26993787157963e-06, + "loss": 0.8508, + "step": 14167 + }, + { + "epoch": 0.6898936040708008, + "grad_norm": 1.4923617839813232, + "learning_rate": 9.267276039243576e-06, + "loss": 0.8028, + "step": 14168 + }, + { + "epoch": 0.689942297859908, + "grad_norm": 2.085801839828491, + "learning_rate": 9.264614473885008e-06, + "loss": 0.8665, + "step": 14169 + }, + { + "epoch": 0.6899909916490151, + "grad_norm": 1.536767601966858, + "learning_rate": 9.261953175570135e-06, + "loss": 0.9632, + "step": 14170 + }, + { + "epoch": 0.6900396854381223, + "grad_norm": 1.2752453088760376, + "learning_rate": 9.259292144365156e-06, + "loss": 0.8664, + "step": 14171 + }, + { + "epoch": 0.6900883792272295, + "grad_norm": 2.1991331577301025, + "learning_rate": 9.256631380336262e-06, + "loss": 0.8624, + "step": 14172 + }, + { + "epoch": 0.6901370730163368, + "grad_norm": 2.79777193069458, + "learning_rate": 9.253970883549643e-06, + "loss": 0.9101, + "step": 14173 + }, + { + "epoch": 0.690185766805444, + "grad_norm": 1.7159473896026611, + "learning_rate": 9.25131065407147e-06, + "loss": 0.831, + "step": 14174 + }, + { + "epoch": 0.6902344605945512, + "grad_norm": 4.216012001037598, + "learning_rate": 9.248650691967933e-06, + "loss": 0.8107, + "step": 14175 + }, + { + "epoch": 0.6902831543836584, + "grad_norm": 1.588992953300476, + "learning_rate": 9.24599099730519e-06, + "loss": 0.7692, + "step": 14176 + }, + { + "epoch": 0.6903318481727656, + "grad_norm": 2.0070018768310547, + "learning_rate": 9.243331570149403e-06, + "loss": 0.8757, + "step": 14177 + }, + { + "epoch": 0.6903805419618728, + "grad_norm": 2.4525511264801025, + "learning_rate": 9.240672410566722e-06, + "loss": 0.8236, + "step": 14178 + }, + { + "epoch": 0.6904292357509799, + "grad_norm": 1.7586065530776978, + "learning_rate": 9.238013518623297e-06, + "loss": 0.8789, + "step": 14179 + }, + { + "epoch": 0.6904779295400871, + "grad_norm": 1.4888263940811157, + "learning_rate": 9.235354894385264e-06, + "loss": 0.7494, + "step": 14180 + }, + { + "epoch": 0.6905266233291943, + "grad_norm": 1.240660548210144, + "learning_rate": 9.232696537918764e-06, + "loss": 0.7521, + "step": 14181 + }, + { + "epoch": 0.6905753171183016, + "grad_norm": 0.09219805151224136, + "learning_rate": 9.230038449289915e-06, + "loss": 0.5747, + "step": 14182 + }, + { + "epoch": 0.6906240109074088, + "grad_norm": 1.5005784034729004, + "learning_rate": 9.227380628564843e-06, + "loss": 0.7327, + "step": 14183 + }, + { + "epoch": 0.690672704696516, + "grad_norm": 1.7866917848587036, + "learning_rate": 9.224723075809653e-06, + "loss": 0.845, + "step": 14184 + }, + { + "epoch": 0.6907213984856232, + "grad_norm": 1.552852749824524, + "learning_rate": 9.222065791090468e-06, + "loss": 0.7735, + "step": 14185 + }, + { + "epoch": 0.6907700922747304, + "grad_norm": 1.8680036067962646, + "learning_rate": 9.219408774473378e-06, + "loss": 0.7764, + "step": 14186 + }, + { + "epoch": 0.6908187860638375, + "grad_norm": 1.975543737411499, + "learning_rate": 9.216752026024477e-06, + "loss": 0.7411, + "step": 14187 + }, + { + "epoch": 0.6908674798529447, + "grad_norm": 1.6482722759246826, + "learning_rate": 9.214095545809854e-06, + "loss": 0.7582, + "step": 14188 + }, + { + "epoch": 0.6909161736420519, + "grad_norm": 2.0871758460998535, + "learning_rate": 9.211439333895586e-06, + "loss": 0.8425, + "step": 14189 + }, + { + "epoch": 0.6909648674311591, + "grad_norm": 1.8417898416519165, + "learning_rate": 9.208783390347748e-06, + "loss": 0.8141, + "step": 14190 + }, + { + "epoch": 0.6910135612202664, + "grad_norm": 1.6065682172775269, + "learning_rate": 9.206127715232402e-06, + "loss": 0.886, + "step": 14191 + }, + { + "epoch": 0.6910622550093736, + "grad_norm": 2.025986671447754, + "learning_rate": 9.203472308615627e-06, + "loss": 0.8542, + "step": 14192 + }, + { + "epoch": 0.6911109487984808, + "grad_norm": 1.6552106142044067, + "learning_rate": 9.200817170563448e-06, + "loss": 0.8138, + "step": 14193 + }, + { + "epoch": 0.691159642587588, + "grad_norm": 1.6228243112564087, + "learning_rate": 9.198162301141933e-06, + "loss": 0.8189, + "step": 14194 + }, + { + "epoch": 0.6912083363766951, + "grad_norm": 1.1687037944793701, + "learning_rate": 9.195507700417115e-06, + "loss": 0.772, + "step": 14195 + }, + { + "epoch": 0.6912570301658023, + "grad_norm": 1.515369176864624, + "learning_rate": 9.192853368455029e-06, + "loss": 0.8857, + "step": 14196 + }, + { + "epoch": 0.6913057239549095, + "grad_norm": 1.8445254564285278, + "learning_rate": 9.1901993053217e-06, + "loss": 0.8355, + "step": 14197 + }, + { + "epoch": 0.6913544177440167, + "grad_norm": 2.336887836456299, + "learning_rate": 9.187545511083142e-06, + "loss": 0.7533, + "step": 14198 + }, + { + "epoch": 0.691403111533124, + "grad_norm": 2.829479932785034, + "learning_rate": 9.18489198580539e-06, + "loss": 0.8528, + "step": 14199 + }, + { + "epoch": 0.6914518053222312, + "grad_norm": 1.689292550086975, + "learning_rate": 9.182238729554423e-06, + "loss": 0.8698, + "step": 14200 + }, + { + "epoch": 0.6915004991113384, + "grad_norm": 1.8869471549987793, + "learning_rate": 9.179585742396269e-06, + "loss": 0.775, + "step": 14201 + }, + { + "epoch": 0.6915491929004456, + "grad_norm": 0.09843654930591583, + "learning_rate": 9.176933024396887e-06, + "loss": 0.6262, + "step": 14202 + }, + { + "epoch": 0.6915978866895528, + "grad_norm": 1.5253006219863892, + "learning_rate": 9.174280575622295e-06, + "loss": 0.7208, + "step": 14203 + }, + { + "epoch": 0.6916465804786599, + "grad_norm": 1.5198642015457153, + "learning_rate": 9.171628396138458e-06, + "loss": 0.8646, + "step": 14204 + }, + { + "epoch": 0.6916952742677671, + "grad_norm": 1.9797611236572266, + "learning_rate": 9.168976486011355e-06, + "loss": 0.7826, + "step": 14205 + }, + { + "epoch": 0.6917439680568743, + "grad_norm": 1.282978892326355, + "learning_rate": 9.166324845306944e-06, + "loss": 0.8064, + "step": 14206 + }, + { + "epoch": 0.6917926618459815, + "grad_norm": 1.7415201663970947, + "learning_rate": 9.163673474091186e-06, + "loss": 0.8656, + "step": 14207 + }, + { + "epoch": 0.6918413556350888, + "grad_norm": 1.3921740055084229, + "learning_rate": 9.161022372430052e-06, + "loss": 0.8112, + "step": 14208 + }, + { + "epoch": 0.691890049424196, + "grad_norm": 1.6583127975463867, + "learning_rate": 9.158371540389464e-06, + "loss": 0.7896, + "step": 14209 + }, + { + "epoch": 0.6919387432133032, + "grad_norm": 1.806292176246643, + "learning_rate": 9.155720978035375e-06, + "loss": 0.8029, + "step": 14210 + }, + { + "epoch": 0.6919874370024104, + "grad_norm": 1.6545515060424805, + "learning_rate": 9.153070685433718e-06, + "loss": 0.8755, + "step": 14211 + }, + { + "epoch": 0.6920361307915175, + "grad_norm": 1.8760322332382202, + "learning_rate": 9.150420662650417e-06, + "loss": 0.8122, + "step": 14212 + }, + { + "epoch": 0.6920848245806247, + "grad_norm": 0.09330420196056366, + "learning_rate": 9.14777090975139e-06, + "loss": 0.5689, + "step": 14213 + }, + { + "epoch": 0.6921335183697319, + "grad_norm": 2.354362726211548, + "learning_rate": 9.145121426802544e-06, + "loss": 0.8969, + "step": 14214 + }, + { + "epoch": 0.6921822121588391, + "grad_norm": 1.402914047241211, + "learning_rate": 9.142472213869806e-06, + "loss": 0.8067, + "step": 14215 + }, + { + "epoch": 0.6922309059479463, + "grad_norm": 2.005167245864868, + "learning_rate": 9.139823271019048e-06, + "loss": 0.8586, + "step": 14216 + }, + { + "epoch": 0.6922795997370536, + "grad_norm": 1.3589544296264648, + "learning_rate": 9.13717459831619e-06, + "loss": 0.776, + "step": 14217 + }, + { + "epoch": 0.6923282935261608, + "grad_norm": 1.886674165725708, + "learning_rate": 9.134526195827092e-06, + "loss": 0.8501, + "step": 14218 + }, + { + "epoch": 0.692376987315268, + "grad_norm": 1.3955447673797607, + "learning_rate": 9.131878063617649e-06, + "loss": 0.7951, + "step": 14219 + }, + { + "epoch": 0.6924256811043752, + "grad_norm": 2.862412929534912, + "learning_rate": 9.129230201753733e-06, + "loss": 0.7427, + "step": 14220 + }, + { + "epoch": 0.6924743748934823, + "grad_norm": 1.4683432579040527, + "learning_rate": 9.126582610301207e-06, + "loss": 0.7933, + "step": 14221 + }, + { + "epoch": 0.6925230686825895, + "grad_norm": 1.9374562501907349, + "learning_rate": 9.12393528932593e-06, + "loss": 0.8309, + "step": 14222 + }, + { + "epoch": 0.6925717624716967, + "grad_norm": 1.4574956893920898, + "learning_rate": 9.121288238893746e-06, + "loss": 0.8121, + "step": 14223 + }, + { + "epoch": 0.6926204562608039, + "grad_norm": 1.4767049551010132, + "learning_rate": 9.118641459070522e-06, + "loss": 0.8226, + "step": 14224 + }, + { + "epoch": 0.6926691500499111, + "grad_norm": 4.604982852935791, + "learning_rate": 9.11599494992207e-06, + "loss": 0.811, + "step": 14225 + }, + { + "epoch": 0.6927178438390184, + "grad_norm": 1.30890691280365, + "learning_rate": 9.113348711514248e-06, + "loss": 0.7482, + "step": 14226 + }, + { + "epoch": 0.6927665376281256, + "grad_norm": 1.5280107259750366, + "learning_rate": 9.110702743912865e-06, + "loss": 0.8289, + "step": 14227 + }, + { + "epoch": 0.6928152314172328, + "grad_norm": 1.468881368637085, + "learning_rate": 9.108057047183744e-06, + "loss": 0.812, + "step": 14228 + }, + { + "epoch": 0.6928639252063399, + "grad_norm": 1.5758967399597168, + "learning_rate": 9.105411621392699e-06, + "loss": 0.8144, + "step": 14229 + }, + { + "epoch": 0.6929126189954471, + "grad_norm": 1.5955263376235962, + "learning_rate": 9.10276646660553e-06, + "loss": 0.77, + "step": 14230 + }, + { + "epoch": 0.6929613127845543, + "grad_norm": 1.533346176147461, + "learning_rate": 9.100121582888042e-06, + "loss": 0.7779, + "step": 14231 + }, + { + "epoch": 0.6930100065736615, + "grad_norm": 1.5856999158859253, + "learning_rate": 9.097476970306023e-06, + "loss": 0.811, + "step": 14232 + }, + { + "epoch": 0.6930587003627687, + "grad_norm": 1.5595698356628418, + "learning_rate": 9.09483262892525e-06, + "loss": 0.8374, + "step": 14233 + }, + { + "epoch": 0.6931073941518759, + "grad_norm": 1.7813968658447266, + "learning_rate": 9.092188558811516e-06, + "loss": 0.793, + "step": 14234 + }, + { + "epoch": 0.6931560879409832, + "grad_norm": 1.9155182838439941, + "learning_rate": 9.089544760030589e-06, + "loss": 0.7836, + "step": 14235 + }, + { + "epoch": 0.6932047817300904, + "grad_norm": 1.5405806303024292, + "learning_rate": 9.086901232648228e-06, + "loss": 0.8489, + "step": 14236 + }, + { + "epoch": 0.6932534755191976, + "grad_norm": 2.122133255004883, + "learning_rate": 9.084257976730195e-06, + "loss": 0.8567, + "step": 14237 + }, + { + "epoch": 0.6933021693083047, + "grad_norm": 1.2348246574401855, + "learning_rate": 9.08161499234224e-06, + "loss": 0.7652, + "step": 14238 + }, + { + "epoch": 0.6933508630974119, + "grad_norm": 2.516890287399292, + "learning_rate": 9.07897227955011e-06, + "loss": 0.7154, + "step": 14239 + }, + { + "epoch": 0.6933995568865191, + "grad_norm": 1.957584261894226, + "learning_rate": 9.076329838419538e-06, + "loss": 0.6806, + "step": 14240 + }, + { + "epoch": 0.6934482506756263, + "grad_norm": 1.7685246467590332, + "learning_rate": 9.073687669016256e-06, + "loss": 0.7849, + "step": 14241 + }, + { + "epoch": 0.6934969444647335, + "grad_norm": 1.7562556266784668, + "learning_rate": 9.071045771405985e-06, + "loss": 0.8472, + "step": 14242 + }, + { + "epoch": 0.6935456382538407, + "grad_norm": 1.3995862007141113, + "learning_rate": 9.068404145654453e-06, + "loss": 0.7688, + "step": 14243 + }, + { + "epoch": 0.693594332042948, + "grad_norm": 1.9882862567901611, + "learning_rate": 9.065762791827363e-06, + "loss": 0.7434, + "step": 14244 + }, + { + "epoch": 0.6936430258320552, + "grad_norm": 1.6221354007720947, + "learning_rate": 9.063121709990422e-06, + "loss": 0.8628, + "step": 14245 + }, + { + "epoch": 0.6936917196211623, + "grad_norm": 1.735932469367981, + "learning_rate": 9.060480900209326e-06, + "loss": 0.8601, + "step": 14246 + }, + { + "epoch": 0.6937404134102695, + "grad_norm": 1.8956103324890137, + "learning_rate": 9.057840362549764e-06, + "loss": 0.8146, + "step": 14247 + }, + { + "epoch": 0.6937891071993767, + "grad_norm": 0.09633410722017288, + "learning_rate": 9.05520009707742e-06, + "loss": 0.6432, + "step": 14248 + }, + { + "epoch": 0.6938378009884839, + "grad_norm": 1.4983938932418823, + "learning_rate": 9.052560103857967e-06, + "loss": 0.7642, + "step": 14249 + }, + { + "epoch": 0.6938864947775911, + "grad_norm": 2.896207332611084, + "learning_rate": 9.049920382957091e-06, + "loss": 0.8436, + "step": 14250 + }, + { + "epoch": 0.6939351885666983, + "grad_norm": 1.7791591882705688, + "learning_rate": 9.047280934440433e-06, + "loss": 0.9065, + "step": 14251 + }, + { + "epoch": 0.6939838823558055, + "grad_norm": 0.09545117616653442, + "learning_rate": 9.044641758373672e-06, + "loss": 0.6621, + "step": 14252 + }, + { + "epoch": 0.6940325761449128, + "grad_norm": 1.3933892250061035, + "learning_rate": 9.042002854822431e-06, + "loss": 0.8098, + "step": 14253 + }, + { + "epoch": 0.6940812699340199, + "grad_norm": 1.3846933841705322, + "learning_rate": 9.039364223852378e-06, + "loss": 0.8188, + "step": 14254 + }, + { + "epoch": 0.6941299637231271, + "grad_norm": 2.1196179389953613, + "learning_rate": 9.036725865529138e-06, + "loss": 0.7812, + "step": 14255 + }, + { + "epoch": 0.6941786575122343, + "grad_norm": 1.6327005624771118, + "learning_rate": 9.034087779918332e-06, + "loss": 0.8149, + "step": 14256 + }, + { + "epoch": 0.6942273513013415, + "grad_norm": 1.4476176500320435, + "learning_rate": 9.03144996708561e-06, + "loss": 0.7316, + "step": 14257 + }, + { + "epoch": 0.6942760450904487, + "grad_norm": 1.539554238319397, + "learning_rate": 9.028812427096557e-06, + "loss": 0.7867, + "step": 14258 + }, + { + "epoch": 0.6943247388795559, + "grad_norm": 1.9130494594573975, + "learning_rate": 9.026175160016806e-06, + "loss": 0.8783, + "step": 14259 + }, + { + "epoch": 0.6943734326686631, + "grad_norm": 2.3273487091064453, + "learning_rate": 9.023538165911936e-06, + "loss": 0.773, + "step": 14260 + }, + { + "epoch": 0.6944221264577704, + "grad_norm": 1.9946496486663818, + "learning_rate": 9.02090144484756e-06, + "loss": 0.8683, + "step": 14261 + }, + { + "epoch": 0.6944708202468776, + "grad_norm": 1.459261417388916, + "learning_rate": 9.018264996889265e-06, + "loss": 0.8504, + "step": 14262 + }, + { + "epoch": 0.6945195140359847, + "grad_norm": 1.2671654224395752, + "learning_rate": 9.015628822102627e-06, + "loss": 0.8808, + "step": 14263 + }, + { + "epoch": 0.6945682078250919, + "grad_norm": 1.289903998374939, + "learning_rate": 9.012992920553225e-06, + "loss": 0.8359, + "step": 14264 + }, + { + "epoch": 0.6946169016141991, + "grad_norm": 3.0983591079711914, + "learning_rate": 9.010357292306618e-06, + "loss": 0.7869, + "step": 14265 + }, + { + "epoch": 0.6946655954033063, + "grad_norm": 1.4780857563018799, + "learning_rate": 9.007721937428389e-06, + "loss": 0.7352, + "step": 14266 + }, + { + "epoch": 0.6947142891924135, + "grad_norm": 1.5294451713562012, + "learning_rate": 9.005086855984064e-06, + "loss": 0.8068, + "step": 14267 + }, + { + "epoch": 0.6947629829815207, + "grad_norm": 1.8559656143188477, + "learning_rate": 9.002452048039222e-06, + "loss": 0.737, + "step": 14268 + }, + { + "epoch": 0.6948116767706279, + "grad_norm": 1.9485408067703247, + "learning_rate": 8.999817513659375e-06, + "loss": 0.7586, + "step": 14269 + }, + { + "epoch": 0.6948603705597352, + "grad_norm": 1.2524406909942627, + "learning_rate": 8.997183252910075e-06, + "loss": 0.8306, + "step": 14270 + }, + { + "epoch": 0.6949090643488423, + "grad_norm": 1.8139301538467407, + "learning_rate": 8.994549265856844e-06, + "loss": 0.7887, + "step": 14271 + }, + { + "epoch": 0.6949577581379495, + "grad_norm": 2.1219139099121094, + "learning_rate": 8.991915552565206e-06, + "loss": 0.8153, + "step": 14272 + }, + { + "epoch": 0.6950064519270567, + "grad_norm": 2.1729493141174316, + "learning_rate": 8.989282113100672e-06, + "loss": 0.7654, + "step": 14273 + }, + { + "epoch": 0.6950551457161639, + "grad_norm": 1.6470146179199219, + "learning_rate": 8.986648947528742e-06, + "loss": 0.8379, + "step": 14274 + }, + { + "epoch": 0.6951038395052711, + "grad_norm": 1.3774747848510742, + "learning_rate": 8.984016055914936e-06, + "loss": 0.8497, + "step": 14275 + }, + { + "epoch": 0.6951525332943783, + "grad_norm": 1.6034342050552368, + "learning_rate": 8.981383438324722e-06, + "loss": 0.8636, + "step": 14276 + }, + { + "epoch": 0.6952012270834855, + "grad_norm": 1.750340461730957, + "learning_rate": 8.978751094823608e-06, + "loss": 0.8526, + "step": 14277 + }, + { + "epoch": 0.6952499208725927, + "grad_norm": 1.2377655506134033, + "learning_rate": 8.976119025477064e-06, + "loss": 0.9157, + "step": 14278 + }, + { + "epoch": 0.6952986146617, + "grad_norm": 1.4777562618255615, + "learning_rate": 8.973487230350563e-06, + "loss": 0.7136, + "step": 14279 + }, + { + "epoch": 0.6953473084508071, + "grad_norm": 1.3287158012390137, + "learning_rate": 8.970855709509577e-06, + "loss": 0.8536, + "step": 14280 + }, + { + "epoch": 0.6953960022399143, + "grad_norm": 1.2871211767196655, + "learning_rate": 8.968224463019557e-06, + "loss": 0.7942, + "step": 14281 + }, + { + "epoch": 0.6954446960290215, + "grad_norm": 1.3106375932693481, + "learning_rate": 8.965593490945961e-06, + "loss": 0.7437, + "step": 14282 + }, + { + "epoch": 0.6954933898181287, + "grad_norm": 1.7510682344436646, + "learning_rate": 8.962962793354225e-06, + "loss": 0.9006, + "step": 14283 + }, + { + "epoch": 0.6955420836072359, + "grad_norm": 2.0983755588531494, + "learning_rate": 8.960332370309801e-06, + "loss": 0.8405, + "step": 14284 + }, + { + "epoch": 0.6955907773963431, + "grad_norm": 2.3838703632354736, + "learning_rate": 8.95770222187812e-06, + "loss": 0.8099, + "step": 14285 + }, + { + "epoch": 0.6956394711854503, + "grad_norm": 1.554014801979065, + "learning_rate": 8.9550723481246e-06, + "loss": 0.8001, + "step": 14286 + }, + { + "epoch": 0.6956881649745575, + "grad_norm": 1.7779483795166016, + "learning_rate": 8.952442749114662e-06, + "loss": 0.9045, + "step": 14287 + }, + { + "epoch": 0.6957368587636646, + "grad_norm": 1.4897708892822266, + "learning_rate": 8.94981342491372e-06, + "loss": 0.7913, + "step": 14288 + }, + { + "epoch": 0.6957855525527719, + "grad_norm": 1.2906315326690674, + "learning_rate": 8.947184375587176e-06, + "loss": 0.8212, + "step": 14289 + }, + { + "epoch": 0.6958342463418791, + "grad_norm": 1.7006511688232422, + "learning_rate": 8.94455560120043e-06, + "loss": 0.8589, + "step": 14290 + }, + { + "epoch": 0.6958829401309863, + "grad_norm": 2.291593551635742, + "learning_rate": 8.941927101818868e-06, + "loss": 0.7301, + "step": 14291 + }, + { + "epoch": 0.6959316339200935, + "grad_norm": 1.6699436902999878, + "learning_rate": 8.93929887750788e-06, + "loss": 0.7327, + "step": 14292 + }, + { + "epoch": 0.6959803277092007, + "grad_norm": 1.8606539964675903, + "learning_rate": 8.936670928332833e-06, + "loss": 0.8791, + "step": 14293 + }, + { + "epoch": 0.6960290214983079, + "grad_norm": 1.551674485206604, + "learning_rate": 8.934043254359112e-06, + "loss": 0.875, + "step": 14294 + }, + { + "epoch": 0.6960777152874151, + "grad_norm": 1.3939547538757324, + "learning_rate": 8.931415855652075e-06, + "loss": 0.7502, + "step": 14295 + }, + { + "epoch": 0.6961264090765222, + "grad_norm": 1.5432276725769043, + "learning_rate": 8.928788732277077e-06, + "loss": 0.8048, + "step": 14296 + }, + { + "epoch": 0.6961751028656294, + "grad_norm": 2.1317219734191895, + "learning_rate": 8.926161884299469e-06, + "loss": 0.8776, + "step": 14297 + }, + { + "epoch": 0.6962237966547367, + "grad_norm": 1.363932728767395, + "learning_rate": 8.923535311784594e-06, + "loss": 0.8875, + "step": 14298 + }, + { + "epoch": 0.6962724904438439, + "grad_norm": 1.7193603515625, + "learning_rate": 8.920909014797787e-06, + "loss": 0.8805, + "step": 14299 + }, + { + "epoch": 0.6963211842329511, + "grad_norm": 1.5826603174209595, + "learning_rate": 8.918282993404372e-06, + "loss": 0.7251, + "step": 14300 + }, + { + "epoch": 0.6963698780220583, + "grad_norm": 1.7634248733520508, + "learning_rate": 8.915657247669694e-06, + "loss": 0.7739, + "step": 14301 + }, + { + "epoch": 0.6964185718111655, + "grad_norm": 2.225429058074951, + "learning_rate": 8.913031777659035e-06, + "loss": 0.7795, + "step": 14302 + }, + { + "epoch": 0.6964672656002727, + "grad_norm": 1.764600157737732, + "learning_rate": 8.910406583437732e-06, + "loss": 0.7596, + "step": 14303 + }, + { + "epoch": 0.6965159593893799, + "grad_norm": 0.09668822586536407, + "learning_rate": 8.907781665071074e-06, + "loss": 0.6341, + "step": 14304 + }, + { + "epoch": 0.696564653178487, + "grad_norm": 2.818997383117676, + "learning_rate": 8.905157022624358e-06, + "loss": 0.8106, + "step": 14305 + }, + { + "epoch": 0.6966133469675942, + "grad_norm": 2.3975489139556885, + "learning_rate": 8.902532656162874e-06, + "loss": 0.8056, + "step": 14306 + }, + { + "epoch": 0.6966620407567015, + "grad_norm": 1.3303909301757812, + "learning_rate": 8.899908565751894e-06, + "loss": 0.7632, + "step": 14307 + }, + { + "epoch": 0.6967107345458087, + "grad_norm": 1.5409702062606812, + "learning_rate": 8.897284751456716e-06, + "loss": 0.7613, + "step": 14308 + }, + { + "epoch": 0.6967594283349159, + "grad_norm": 1.459721565246582, + "learning_rate": 8.894661213342578e-06, + "loss": 0.8881, + "step": 14309 + }, + { + "epoch": 0.6968081221240231, + "grad_norm": 1.5092769861221313, + "learning_rate": 8.892037951474768e-06, + "loss": 0.8766, + "step": 14310 + }, + { + "epoch": 0.6968568159131303, + "grad_norm": 1.5106841325759888, + "learning_rate": 8.88941496591851e-06, + "loss": 0.7627, + "step": 14311 + }, + { + "epoch": 0.6969055097022375, + "grad_norm": 13.255807876586914, + "learning_rate": 8.886792256739079e-06, + "loss": 0.9054, + "step": 14312 + }, + { + "epoch": 0.6969542034913446, + "grad_norm": 1.4362126588821411, + "learning_rate": 8.884169824001702e-06, + "loss": 0.8491, + "step": 14313 + }, + { + "epoch": 0.6970028972804518, + "grad_norm": 3.0610463619232178, + "learning_rate": 8.881547667771609e-06, + "loss": 0.8211, + "step": 14314 + }, + { + "epoch": 0.697051591069559, + "grad_norm": 1.3235774040222168, + "learning_rate": 8.878925788114043e-06, + "loss": 0.8051, + "step": 14315 + }, + { + "epoch": 0.6971002848586663, + "grad_norm": 1.468875765800476, + "learning_rate": 8.876304185094198e-06, + "loss": 0.8141, + "step": 14316 + }, + { + "epoch": 0.6971489786477735, + "grad_norm": 1.5130436420440674, + "learning_rate": 8.873682858777317e-06, + "loss": 0.7302, + "step": 14317 + }, + { + "epoch": 0.6971976724368807, + "grad_norm": 1.571118712425232, + "learning_rate": 8.871061809228573e-06, + "loss": 0.7287, + "step": 14318 + }, + { + "epoch": 0.6972463662259879, + "grad_norm": 1.4839913845062256, + "learning_rate": 8.868441036513188e-06, + "loss": 0.7455, + "step": 14319 + }, + { + "epoch": 0.6972950600150951, + "grad_norm": 1.763283610343933, + "learning_rate": 8.86582054069635e-06, + "loss": 0.7964, + "step": 14320 + }, + { + "epoch": 0.6973437538042023, + "grad_norm": 1.8755344152450562, + "learning_rate": 8.86320032184324e-06, + "loss": 0.776, + "step": 14321 + }, + { + "epoch": 0.6973924475933094, + "grad_norm": 1.5651943683624268, + "learning_rate": 8.860580380019038e-06, + "loss": 0.871, + "step": 14322 + }, + { + "epoch": 0.6974411413824166, + "grad_norm": 1.5289651155471802, + "learning_rate": 8.857960715288905e-06, + "loss": 0.8798, + "step": 14323 + }, + { + "epoch": 0.6974898351715239, + "grad_norm": 1.314671516418457, + "learning_rate": 8.855341327718032e-06, + "loss": 0.7564, + "step": 14324 + }, + { + "epoch": 0.6975385289606311, + "grad_norm": 0.10056477785110474, + "learning_rate": 8.852722217371544e-06, + "loss": 0.5861, + "step": 14325 + }, + { + "epoch": 0.6975872227497383, + "grad_norm": 5.606205463409424, + "learning_rate": 8.850103384314622e-06, + "loss": 0.918, + "step": 14326 + }, + { + "epoch": 0.6976359165388455, + "grad_norm": 1.9839832782745361, + "learning_rate": 8.84748482861238e-06, + "loss": 0.8539, + "step": 14327 + }, + { + "epoch": 0.6976846103279527, + "grad_norm": 1.8236682415008545, + "learning_rate": 8.844866550329978e-06, + "loss": 0.8863, + "step": 14328 + }, + { + "epoch": 0.6977333041170599, + "grad_norm": 1.5077977180480957, + "learning_rate": 8.842248549532534e-06, + "loss": 0.9233, + "step": 14329 + }, + { + "epoch": 0.697781997906167, + "grad_norm": 2.3198208808898926, + "learning_rate": 8.83963082628518e-06, + "loss": 0.8809, + "step": 14330 + }, + { + "epoch": 0.6978306916952742, + "grad_norm": 1.7999317646026611, + "learning_rate": 8.837013380653023e-06, + "loss": 0.7734, + "step": 14331 + }, + { + "epoch": 0.6978793854843814, + "grad_norm": 1.5395574569702148, + "learning_rate": 8.834396212701171e-06, + "loss": 0.6916, + "step": 14332 + }, + { + "epoch": 0.6979280792734887, + "grad_norm": 2.777059316635132, + "learning_rate": 8.831779322494745e-06, + "loss": 0.7934, + "step": 14333 + }, + { + "epoch": 0.6979767730625959, + "grad_norm": 1.7321072816848755, + "learning_rate": 8.82916271009881e-06, + "loss": 0.8036, + "step": 14334 + }, + { + "epoch": 0.6980254668517031, + "grad_norm": 1.587384581565857, + "learning_rate": 8.826546375578481e-06, + "loss": 0.849, + "step": 14335 + }, + { + "epoch": 0.6980741606408103, + "grad_norm": 1.3985038995742798, + "learning_rate": 8.82393031899883e-06, + "loss": 0.8365, + "step": 14336 + }, + { + "epoch": 0.6981228544299175, + "grad_norm": 1.6194086074829102, + "learning_rate": 8.82131454042493e-06, + "loss": 0.8708, + "step": 14337 + }, + { + "epoch": 0.6981715482190247, + "grad_norm": 0.10106683522462845, + "learning_rate": 8.81869903992185e-06, + "loss": 0.5512, + "step": 14338 + }, + { + "epoch": 0.6982202420081318, + "grad_norm": 3.524479627609253, + "learning_rate": 8.816083817554653e-06, + "loss": 0.7366, + "step": 14339 + }, + { + "epoch": 0.698268935797239, + "grad_norm": 1.5818239450454712, + "learning_rate": 8.813468873388386e-06, + "loss": 0.8395, + "step": 14340 + }, + { + "epoch": 0.6983176295863462, + "grad_norm": 1.4680650234222412, + "learning_rate": 8.810854207488105e-06, + "loss": 0.7201, + "step": 14341 + }, + { + "epoch": 0.6983663233754535, + "grad_norm": 2.443215847015381, + "learning_rate": 8.808239819918838e-06, + "loss": 0.8213, + "step": 14342 + }, + { + "epoch": 0.6984150171645607, + "grad_norm": 1.5631486177444458, + "learning_rate": 8.805625710745632e-06, + "loss": 0.8135, + "step": 14343 + }, + { + "epoch": 0.6984637109536679, + "grad_norm": 0.10038130730390549, + "learning_rate": 8.803011880033507e-06, + "loss": 0.5731, + "step": 14344 + }, + { + "epoch": 0.6985124047427751, + "grad_norm": 5.030182361602783, + "learning_rate": 8.800398327847484e-06, + "loss": 0.8252, + "step": 14345 + }, + { + "epoch": 0.6985610985318823, + "grad_norm": 1.9175403118133545, + "learning_rate": 8.797785054252572e-06, + "loss": 0.8414, + "step": 14346 + }, + { + "epoch": 0.6986097923209894, + "grad_norm": 1.2405259609222412, + "learning_rate": 8.795172059313777e-06, + "loss": 0.7775, + "step": 14347 + }, + { + "epoch": 0.6986584861100966, + "grad_norm": 1.4901578426361084, + "learning_rate": 8.792559343096102e-06, + "loss": 0.7003, + "step": 14348 + }, + { + "epoch": 0.6987071798992038, + "grad_norm": 4.019256114959717, + "learning_rate": 8.789946905664532e-06, + "loss": 0.7784, + "step": 14349 + }, + { + "epoch": 0.698755873688311, + "grad_norm": 1.3700942993164062, + "learning_rate": 8.787334747084058e-06, + "loss": 0.7368, + "step": 14350 + }, + { + "epoch": 0.6988045674774183, + "grad_norm": 1.2947583198547363, + "learning_rate": 8.784722867419644e-06, + "loss": 0.7665, + "step": 14351 + }, + { + "epoch": 0.6988532612665255, + "grad_norm": 2.156639575958252, + "learning_rate": 8.78211126673628e-06, + "loss": 0.8106, + "step": 14352 + }, + { + "epoch": 0.6989019550556327, + "grad_norm": 1.2905620336532593, + "learning_rate": 8.779499945098922e-06, + "loss": 0.7922, + "step": 14353 + }, + { + "epoch": 0.6989506488447399, + "grad_norm": 1.6771111488342285, + "learning_rate": 8.776888902572524e-06, + "loss": 0.8784, + "step": 14354 + }, + { + "epoch": 0.698999342633847, + "grad_norm": 1.749566912651062, + "learning_rate": 8.774278139222038e-06, + "loss": 0.8606, + "step": 14355 + }, + { + "epoch": 0.6990480364229542, + "grad_norm": 1.4249892234802246, + "learning_rate": 8.771667655112411e-06, + "loss": 0.8449, + "step": 14356 + }, + { + "epoch": 0.6990967302120614, + "grad_norm": 0.09740000218153, + "learning_rate": 8.769057450308569e-06, + "loss": 0.5873, + "step": 14357 + }, + { + "epoch": 0.6991454240011686, + "grad_norm": 4.1096320152282715, + "learning_rate": 8.766447524875443e-06, + "loss": 0.7667, + "step": 14358 + }, + { + "epoch": 0.6991941177902758, + "grad_norm": 3.7430405616760254, + "learning_rate": 8.763837878877973e-06, + "loss": 0.772, + "step": 14359 + }, + { + "epoch": 0.6992428115793831, + "grad_norm": 2.4728941917419434, + "learning_rate": 8.761228512381046e-06, + "loss": 0.8042, + "step": 14360 + }, + { + "epoch": 0.6992915053684903, + "grad_norm": 2.4565699100494385, + "learning_rate": 8.758619425449597e-06, + "loss": 0.8219, + "step": 14361 + }, + { + "epoch": 0.6993401991575975, + "grad_norm": 1.589361548423767, + "learning_rate": 8.756010618148503e-06, + "loss": 0.7412, + "step": 14362 + }, + { + "epoch": 0.6993888929467047, + "grad_norm": 1.3331319093704224, + "learning_rate": 8.753402090542676e-06, + "loss": 0.6965, + "step": 14363 + }, + { + "epoch": 0.6994375867358118, + "grad_norm": 1.3167426586151123, + "learning_rate": 8.750793842696996e-06, + "loss": 0.7788, + "step": 14364 + }, + { + "epoch": 0.699486280524919, + "grad_norm": 1.5543068647384644, + "learning_rate": 8.748185874676341e-06, + "loss": 0.7944, + "step": 14365 + }, + { + "epoch": 0.6995349743140262, + "grad_norm": 2.2066073417663574, + "learning_rate": 8.7455781865456e-06, + "loss": 0.8425, + "step": 14366 + }, + { + "epoch": 0.6995836681031334, + "grad_norm": 1.514625072479248, + "learning_rate": 8.742970778369613e-06, + "loss": 0.8209, + "step": 14367 + }, + { + "epoch": 0.6996323618922407, + "grad_norm": 1.2224881649017334, + "learning_rate": 8.74036365021327e-06, + "loss": 0.7494, + "step": 14368 + }, + { + "epoch": 0.6996810556813479, + "grad_norm": 1.3595569133758545, + "learning_rate": 8.737756802141393e-06, + "loss": 0.7175, + "step": 14369 + }, + { + "epoch": 0.6997297494704551, + "grad_norm": 1.6147797107696533, + "learning_rate": 8.735150234218849e-06, + "loss": 0.78, + "step": 14370 + }, + { + "epoch": 0.6997784432595623, + "grad_norm": 4.009260654449463, + "learning_rate": 8.732543946510473e-06, + "loss": 0.8095, + "step": 14371 + }, + { + "epoch": 0.6998271370486694, + "grad_norm": 1.806593894958496, + "learning_rate": 8.72993793908109e-06, + "loss": 0.8497, + "step": 14372 + }, + { + "epoch": 0.6998758308377766, + "grad_norm": 1.3863768577575684, + "learning_rate": 8.72733221199553e-06, + "loss": 0.8024, + "step": 14373 + }, + { + "epoch": 0.6999245246268838, + "grad_norm": 1.3469069004058838, + "learning_rate": 8.7247267653186e-06, + "loss": 0.7223, + "step": 14374 + }, + { + "epoch": 0.699973218415991, + "grad_norm": 4.115614414215088, + "learning_rate": 8.722121599115136e-06, + "loss": 0.7425, + "step": 14375 + }, + { + "epoch": 0.7000219122050982, + "grad_norm": 1.457106590270996, + "learning_rate": 8.719516713449911e-06, + "loss": 0.8647, + "step": 14376 + }, + { + "epoch": 0.7000706059942055, + "grad_norm": 1.4219155311584473, + "learning_rate": 8.716912108387751e-06, + "loss": 0.761, + "step": 14377 + }, + { + "epoch": 0.7001192997833127, + "grad_norm": 1.3951659202575684, + "learning_rate": 8.714307783993416e-06, + "loss": 0.8086, + "step": 14378 + }, + { + "epoch": 0.7001679935724199, + "grad_norm": 1.9556419849395752, + "learning_rate": 8.711703740331712e-06, + "loss": 0.8804, + "step": 14379 + }, + { + "epoch": 0.7002166873615271, + "grad_norm": 1.3332139253616333, + "learning_rate": 8.709099977467405e-06, + "loss": 0.7403, + "step": 14380 + }, + { + "epoch": 0.7002653811506342, + "grad_norm": 1.9291107654571533, + "learning_rate": 8.706496495465266e-06, + "loss": 0.8197, + "step": 14381 + }, + { + "epoch": 0.7003140749397414, + "grad_norm": 1.3313370943069458, + "learning_rate": 8.703893294390057e-06, + "loss": 0.7982, + "step": 14382 + }, + { + "epoch": 0.7003627687288486, + "grad_norm": 1.3990774154663086, + "learning_rate": 8.701290374306526e-06, + "loss": 0.77, + "step": 14383 + }, + { + "epoch": 0.7004114625179558, + "grad_norm": 2.6463208198547363, + "learning_rate": 8.698687735279442e-06, + "loss": 0.8115, + "step": 14384 + }, + { + "epoch": 0.700460156307063, + "grad_norm": 1.3183343410491943, + "learning_rate": 8.696085377373514e-06, + "loss": 0.8788, + "step": 14385 + }, + { + "epoch": 0.7005088500961703, + "grad_norm": 1.3366502523422241, + "learning_rate": 8.6934833006535e-06, + "loss": 0.7768, + "step": 14386 + }, + { + "epoch": 0.7005575438852775, + "grad_norm": 1.8702774047851562, + "learning_rate": 8.69088150518412e-06, + "loss": 0.8432, + "step": 14387 + }, + { + "epoch": 0.7006062376743847, + "grad_norm": 1.6925069093704224, + "learning_rate": 8.688279991030095e-06, + "loss": 0.779, + "step": 14388 + }, + { + "epoch": 0.7006549314634918, + "grad_norm": 1.5757840871810913, + "learning_rate": 8.685678758256136e-06, + "loss": 0.8273, + "step": 14389 + }, + { + "epoch": 0.700703625252599, + "grad_norm": 4.918626308441162, + "learning_rate": 8.683077806926949e-06, + "loss": 0.8248, + "step": 14390 + }, + { + "epoch": 0.7007523190417062, + "grad_norm": 0.09801261872053146, + "learning_rate": 8.680477137107236e-06, + "loss": 0.5849, + "step": 14391 + }, + { + "epoch": 0.7008010128308134, + "grad_norm": 1.5079898834228516, + "learning_rate": 8.677876748861676e-06, + "loss": 0.8306, + "step": 14392 + }, + { + "epoch": 0.7008497066199206, + "grad_norm": 1.5553582906723022, + "learning_rate": 8.675276642254973e-06, + "loss": 0.7077, + "step": 14393 + }, + { + "epoch": 0.7008984004090278, + "grad_norm": 1.5481088161468506, + "learning_rate": 8.672676817351795e-06, + "loss": 0.7979, + "step": 14394 + }, + { + "epoch": 0.7009470941981351, + "grad_norm": 1.2557131052017212, + "learning_rate": 8.670077274216812e-06, + "loss": 0.7692, + "step": 14395 + }, + { + "epoch": 0.7009957879872423, + "grad_norm": 1.407269835472107, + "learning_rate": 8.667478012914694e-06, + "loss": 0.7577, + "step": 14396 + }, + { + "epoch": 0.7010444817763495, + "grad_norm": 1.0676498413085938, + "learning_rate": 8.664879033510092e-06, + "loss": 0.7468, + "step": 14397 + }, + { + "epoch": 0.7010931755654566, + "grad_norm": 2.00178861618042, + "learning_rate": 8.662280336067656e-06, + "loss": 0.8874, + "step": 14398 + }, + { + "epoch": 0.7011418693545638, + "grad_norm": 1.3701800107955933, + "learning_rate": 8.659681920652032e-06, + "loss": 0.7328, + "step": 14399 + }, + { + "epoch": 0.701190563143671, + "grad_norm": 2.521972417831421, + "learning_rate": 8.657083787327849e-06, + "loss": 0.8342, + "step": 14400 + }, + { + "epoch": 0.7012392569327782, + "grad_norm": 1.3670474290847778, + "learning_rate": 8.654485936159745e-06, + "loss": 0.807, + "step": 14401 + }, + { + "epoch": 0.7012879507218854, + "grad_norm": 1.3025588989257812, + "learning_rate": 8.65188836721234e-06, + "loss": 0.8147, + "step": 14402 + }, + { + "epoch": 0.7013366445109926, + "grad_norm": 1.50399911403656, + "learning_rate": 8.649291080550244e-06, + "loss": 0.8737, + "step": 14403 + }, + { + "epoch": 0.7013853383000999, + "grad_norm": 2.160628318786621, + "learning_rate": 8.64669407623807e-06, + "loss": 0.8695, + "step": 14404 + }, + { + "epoch": 0.7014340320892071, + "grad_norm": 2.0644543170928955, + "learning_rate": 8.644097354340414e-06, + "loss": 0.7922, + "step": 14405 + }, + { + "epoch": 0.7014827258783142, + "grad_norm": 1.2828730344772339, + "learning_rate": 8.641500914921872e-06, + "loss": 0.782, + "step": 14406 + }, + { + "epoch": 0.7015314196674214, + "grad_norm": 2.013392686843872, + "learning_rate": 8.63890475804703e-06, + "loss": 0.7262, + "step": 14407 + }, + { + "epoch": 0.7015801134565286, + "grad_norm": 1.527169942855835, + "learning_rate": 8.636308883780468e-06, + "loss": 0.8688, + "step": 14408 + }, + { + "epoch": 0.7016288072456358, + "grad_norm": 1.2904994487762451, + "learning_rate": 8.633713292186753e-06, + "loss": 0.7505, + "step": 14409 + }, + { + "epoch": 0.701677501034743, + "grad_norm": 2.9099087715148926, + "learning_rate": 8.63111798333047e-06, + "loss": 0.7867, + "step": 14410 + }, + { + "epoch": 0.7017261948238502, + "grad_norm": 1.3026492595672607, + "learning_rate": 8.62852295727615e-06, + "loss": 0.7476, + "step": 14411 + }, + { + "epoch": 0.7017748886129574, + "grad_norm": 1.7431236505508423, + "learning_rate": 8.625928214088363e-06, + "loss": 0.888, + "step": 14412 + }, + { + "epoch": 0.7018235824020647, + "grad_norm": 1.2352678775787354, + "learning_rate": 8.62333375383165e-06, + "loss": 0.8625, + "step": 14413 + }, + { + "epoch": 0.7018722761911718, + "grad_norm": 1.3378828763961792, + "learning_rate": 8.620739576570549e-06, + "loss": 0.785, + "step": 14414 + }, + { + "epoch": 0.701920969980279, + "grad_norm": 1.2694792747497559, + "learning_rate": 8.618145682369588e-06, + "loss": 0.8935, + "step": 14415 + }, + { + "epoch": 0.7019696637693862, + "grad_norm": 0.09417929500341415, + "learning_rate": 8.61555207129328e-06, + "loss": 0.5666, + "step": 14416 + }, + { + "epoch": 0.7020183575584934, + "grad_norm": 0.09555655717849731, + "learning_rate": 8.61295874340617e-06, + "loss": 0.5577, + "step": 14417 + }, + { + "epoch": 0.7020670513476006, + "grad_norm": 2.2058181762695312, + "learning_rate": 8.610365698772736e-06, + "loss": 0.9022, + "step": 14418 + }, + { + "epoch": 0.7021157451367078, + "grad_norm": 1.4082170724868774, + "learning_rate": 8.607772937457507e-06, + "loss": 0.8636, + "step": 14419 + }, + { + "epoch": 0.702164438925815, + "grad_norm": 1.8303166627883911, + "learning_rate": 8.60518045952495e-06, + "loss": 0.7631, + "step": 14420 + }, + { + "epoch": 0.7022131327149223, + "grad_norm": 3.5693893432617188, + "learning_rate": 8.602588265039574e-06, + "loss": 0.7807, + "step": 14421 + }, + { + "epoch": 0.7022618265040295, + "grad_norm": 1.7239940166473389, + "learning_rate": 8.599996354065857e-06, + "loss": 0.8095, + "step": 14422 + }, + { + "epoch": 0.7023105202931366, + "grad_norm": 1.2869594097137451, + "learning_rate": 8.597404726668259e-06, + "loss": 0.7599, + "step": 14423 + }, + { + "epoch": 0.7023592140822438, + "grad_norm": 1.4847965240478516, + "learning_rate": 8.594813382911276e-06, + "loss": 0.8046, + "step": 14424 + }, + { + "epoch": 0.702407907871351, + "grad_norm": 1.804052472114563, + "learning_rate": 8.592222322859334e-06, + "loss": 0.8041, + "step": 14425 + }, + { + "epoch": 0.7024566016604582, + "grad_norm": 1.6335617303848267, + "learning_rate": 8.589631546576915e-06, + "loss": 0.7898, + "step": 14426 + }, + { + "epoch": 0.7025052954495654, + "grad_norm": 0.09605948626995087, + "learning_rate": 8.58704105412844e-06, + "loss": 0.6286, + "step": 14427 + }, + { + "epoch": 0.7025539892386726, + "grad_norm": 1.6034152507781982, + "learning_rate": 8.584450845578365e-06, + "loss": 0.8538, + "step": 14428 + }, + { + "epoch": 0.7026026830277798, + "grad_norm": 1.9262608289718628, + "learning_rate": 8.581860920991116e-06, + "loss": 0.9285, + "step": 14429 + }, + { + "epoch": 0.702651376816887, + "grad_norm": 2.0833094120025635, + "learning_rate": 8.579271280431118e-06, + "loss": 0.8025, + "step": 14430 + }, + { + "epoch": 0.7027000706059942, + "grad_norm": 1.594736933708191, + "learning_rate": 8.576681923962789e-06, + "loss": 0.8489, + "step": 14431 + }, + { + "epoch": 0.7027487643951014, + "grad_norm": 1.6856969594955444, + "learning_rate": 8.574092851650528e-06, + "loss": 0.8238, + "step": 14432 + }, + { + "epoch": 0.7027974581842086, + "grad_norm": 1.8507704734802246, + "learning_rate": 8.571504063558769e-06, + "loss": 0.8866, + "step": 14433 + }, + { + "epoch": 0.7028461519733158, + "grad_norm": 1.4966517686843872, + "learning_rate": 8.568915559751868e-06, + "loss": 0.7794, + "step": 14434 + }, + { + "epoch": 0.702894845762423, + "grad_norm": 1.2324261665344238, + "learning_rate": 8.566327340294253e-06, + "loss": 0.7915, + "step": 14435 + }, + { + "epoch": 0.7029435395515302, + "grad_norm": 1.8022983074188232, + "learning_rate": 8.563739405250273e-06, + "loss": 0.721, + "step": 14436 + }, + { + "epoch": 0.7029922333406374, + "grad_norm": 1.3484760522842407, + "learning_rate": 8.561151754684324e-06, + "loss": 0.785, + "step": 14437 + }, + { + "epoch": 0.7030409271297446, + "grad_norm": 1.3226879835128784, + "learning_rate": 8.558564388660771e-06, + "loss": 0.7723, + "step": 14438 + }, + { + "epoch": 0.7030896209188519, + "grad_norm": 1.4111852645874023, + "learning_rate": 8.55597730724397e-06, + "loss": 0.8111, + "step": 14439 + }, + { + "epoch": 0.703138314707959, + "grad_norm": 1.540608525276184, + "learning_rate": 8.553390510498276e-06, + "loss": 0.9346, + "step": 14440 + }, + { + "epoch": 0.7031870084970662, + "grad_norm": 1.4805145263671875, + "learning_rate": 8.550803998488031e-06, + "loss": 0.9007, + "step": 14441 + }, + { + "epoch": 0.7032357022861734, + "grad_norm": 1.1657068729400635, + "learning_rate": 8.548217771277594e-06, + "loss": 0.8024, + "step": 14442 + }, + { + "epoch": 0.7032843960752806, + "grad_norm": 3.593456268310547, + "learning_rate": 8.545631828931272e-06, + "loss": 0.8126, + "step": 14443 + }, + { + "epoch": 0.7033330898643878, + "grad_norm": 2.4746897220611572, + "learning_rate": 8.543046171513408e-06, + "loss": 0.8993, + "step": 14444 + }, + { + "epoch": 0.703381783653495, + "grad_norm": 1.336751103401184, + "learning_rate": 8.540460799088312e-06, + "loss": 0.8376, + "step": 14445 + }, + { + "epoch": 0.7034304774426022, + "grad_norm": 1.5205512046813965, + "learning_rate": 8.537875711720303e-06, + "loss": 0.8044, + "step": 14446 + }, + { + "epoch": 0.7034791712317094, + "grad_norm": 1.4593394994735718, + "learning_rate": 8.535290909473679e-06, + "loss": 0.7733, + "step": 14447 + }, + { + "epoch": 0.7035278650208165, + "grad_norm": 3.9460649490356445, + "learning_rate": 8.53270639241274e-06, + "loss": 0.8758, + "step": 14448 + }, + { + "epoch": 0.7035765588099238, + "grad_norm": 1.2271615266799927, + "learning_rate": 8.530122160601772e-06, + "loss": 0.8181, + "step": 14449 + }, + { + "epoch": 0.703625252599031, + "grad_norm": 1.3102693557739258, + "learning_rate": 8.527538214105065e-06, + "loss": 0.8004, + "step": 14450 + }, + { + "epoch": 0.7036739463881382, + "grad_norm": 1.3396587371826172, + "learning_rate": 8.524954552986882e-06, + "loss": 0.7685, + "step": 14451 + }, + { + "epoch": 0.7037226401772454, + "grad_norm": 1.3739413022994995, + "learning_rate": 8.522371177311506e-06, + "loss": 0.8731, + "step": 14452 + }, + { + "epoch": 0.7037713339663526, + "grad_norm": 1.5507227182388306, + "learning_rate": 8.519788087143195e-06, + "loss": 0.8147, + "step": 14453 + }, + { + "epoch": 0.7038200277554598, + "grad_norm": 1.733337163925171, + "learning_rate": 8.517205282546204e-06, + "loss": 0.7932, + "step": 14454 + }, + { + "epoch": 0.703868721544567, + "grad_norm": 1.6083948612213135, + "learning_rate": 8.514622763584777e-06, + "loss": 0.7868, + "step": 14455 + }, + { + "epoch": 0.7039174153336741, + "grad_norm": 1.467294454574585, + "learning_rate": 8.512040530323156e-06, + "loss": 0.7437, + "step": 14456 + }, + { + "epoch": 0.7039661091227813, + "grad_norm": 1.8780897855758667, + "learning_rate": 8.509458582825574e-06, + "loss": 0.8122, + "step": 14457 + }, + { + "epoch": 0.7040148029118886, + "grad_norm": 1.6852235794067383, + "learning_rate": 8.506876921156258e-06, + "loss": 0.8964, + "step": 14458 + }, + { + "epoch": 0.7040634967009958, + "grad_norm": 2.3107852935791016, + "learning_rate": 8.504295545379424e-06, + "loss": 0.7841, + "step": 14459 + }, + { + "epoch": 0.704112190490103, + "grad_norm": 1.5863558053970337, + "learning_rate": 8.501714455559283e-06, + "loss": 0.8214, + "step": 14460 + }, + { + "epoch": 0.7041608842792102, + "grad_norm": 1.364975929260254, + "learning_rate": 8.499133651760047e-06, + "loss": 0.8136, + "step": 14461 + }, + { + "epoch": 0.7042095780683174, + "grad_norm": 1.5536081790924072, + "learning_rate": 8.496553134045912e-06, + "loss": 0.8331, + "step": 14462 + }, + { + "epoch": 0.7042582718574246, + "grad_norm": 1.5517995357513428, + "learning_rate": 8.493972902481066e-06, + "loss": 0.7891, + "step": 14463 + }, + { + "epoch": 0.7043069656465318, + "grad_norm": 1.3856658935546875, + "learning_rate": 8.491392957129695e-06, + "loss": 0.7627, + "step": 14464 + }, + { + "epoch": 0.7043556594356389, + "grad_norm": 3.103134870529175, + "learning_rate": 8.48881329805597e-06, + "loss": 0.873, + "step": 14465 + }, + { + "epoch": 0.7044043532247461, + "grad_norm": 1.3329006433486938, + "learning_rate": 8.486233925324066e-06, + "loss": 0.8203, + "step": 14466 + }, + { + "epoch": 0.7044530470138534, + "grad_norm": 2.2073848247528076, + "learning_rate": 8.483654838998137e-06, + "loss": 0.7837, + "step": 14467 + }, + { + "epoch": 0.7045017408029606, + "grad_norm": 0.09758725762367249, + "learning_rate": 8.481076039142355e-06, + "loss": 0.6231, + "step": 14468 + }, + { + "epoch": 0.7045504345920678, + "grad_norm": 1.5288740396499634, + "learning_rate": 8.478497525820846e-06, + "loss": 0.8203, + "step": 14469 + }, + { + "epoch": 0.704599128381175, + "grad_norm": 2.021665334701538, + "learning_rate": 8.475919299097772e-06, + "loss": 0.8256, + "step": 14470 + }, + { + "epoch": 0.7046478221702822, + "grad_norm": 1.2961430549621582, + "learning_rate": 8.473341359037244e-06, + "loss": 0.8339, + "step": 14471 + }, + { + "epoch": 0.7046965159593894, + "grad_norm": 1.4796544313430786, + "learning_rate": 8.470763705703409e-06, + "loss": 0.8186, + "step": 14472 + }, + { + "epoch": 0.7047452097484965, + "grad_norm": 1.1861300468444824, + "learning_rate": 8.468186339160376e-06, + "loss": 0.7691, + "step": 14473 + }, + { + "epoch": 0.7047939035376037, + "grad_norm": 1.3817017078399658, + "learning_rate": 8.465609259472252e-06, + "loss": 0.9329, + "step": 14474 + }, + { + "epoch": 0.704842597326711, + "grad_norm": 1.6056512594223022, + "learning_rate": 8.463032466703164e-06, + "loss": 0.8264, + "step": 14475 + }, + { + "epoch": 0.7048912911158182, + "grad_norm": 2.676093816757202, + "learning_rate": 8.460455960917183e-06, + "loss": 0.8691, + "step": 14476 + }, + { + "epoch": 0.7049399849049254, + "grad_norm": 1.7921172380447388, + "learning_rate": 8.457879742178425e-06, + "loss": 0.8115, + "step": 14477 + }, + { + "epoch": 0.7049886786940326, + "grad_norm": 2.20593523979187, + "learning_rate": 8.455303810550945e-06, + "loss": 0.7237, + "step": 14478 + }, + { + "epoch": 0.7050373724831398, + "grad_norm": 2.3919677734375, + "learning_rate": 8.452728166098844e-06, + "loss": 0.7162, + "step": 14479 + }, + { + "epoch": 0.705086066272247, + "grad_norm": 2.412135601043701, + "learning_rate": 8.450152808886183e-06, + "loss": 0.7794, + "step": 14480 + }, + { + "epoch": 0.7051347600613542, + "grad_norm": 0.10054010152816772, + "learning_rate": 8.447577738977024e-06, + "loss": 0.6356, + "step": 14481 + }, + { + "epoch": 0.7051834538504613, + "grad_norm": 1.61565101146698, + "learning_rate": 8.44500295643542e-06, + "loss": 0.6216, + "step": 14482 + }, + { + "epoch": 0.7052321476395685, + "grad_norm": 1.3497012853622437, + "learning_rate": 8.442428461325416e-06, + "loss": 0.745, + "step": 14483 + }, + { + "epoch": 0.7052808414286758, + "grad_norm": 1.3779399394989014, + "learning_rate": 8.43985425371107e-06, + "loss": 0.7719, + "step": 14484 + }, + { + "epoch": 0.705329535217783, + "grad_norm": 1.7900196313858032, + "learning_rate": 8.437280333656393e-06, + "loss": 0.8558, + "step": 14485 + }, + { + "epoch": 0.7053782290068902, + "grad_norm": 2.365211009979248, + "learning_rate": 8.434706701225433e-06, + "loss": 0.9111, + "step": 14486 + }, + { + "epoch": 0.7054269227959974, + "grad_norm": 1.3236548900604248, + "learning_rate": 8.432133356482183e-06, + "loss": 0.8191, + "step": 14487 + }, + { + "epoch": 0.7054756165851046, + "grad_norm": 1.4037622213363647, + "learning_rate": 8.42956029949068e-06, + "loss": 0.8047, + "step": 14488 + }, + { + "epoch": 0.7055243103742118, + "grad_norm": 2.2436108589172363, + "learning_rate": 8.426987530314921e-06, + "loss": 0.7182, + "step": 14489 + }, + { + "epoch": 0.7055730041633189, + "grad_norm": 2.0351386070251465, + "learning_rate": 8.424415049018899e-06, + "loss": 0.8503, + "step": 14490 + }, + { + "epoch": 0.7056216979524261, + "grad_norm": 1.377262830734253, + "learning_rate": 8.42184285566661e-06, + "loss": 0.8272, + "step": 14491 + }, + { + "epoch": 0.7056703917415333, + "grad_norm": 1.3132964372634888, + "learning_rate": 8.419270950322027e-06, + "loss": 0.8362, + "step": 14492 + }, + { + "epoch": 0.7057190855306406, + "grad_norm": 0.10355585813522339, + "learning_rate": 8.41669933304915e-06, + "loss": 0.7, + "step": 14493 + }, + { + "epoch": 0.7057677793197478, + "grad_norm": 2.1858396530151367, + "learning_rate": 8.41412800391192e-06, + "loss": 0.852, + "step": 14494 + }, + { + "epoch": 0.705816473108855, + "grad_norm": 1.5767234563827515, + "learning_rate": 8.41155696297432e-06, + "loss": 0.7829, + "step": 14495 + }, + { + "epoch": 0.7058651668979622, + "grad_norm": 1.511104702949524, + "learning_rate": 8.408986210300294e-06, + "loss": 0.7332, + "step": 14496 + }, + { + "epoch": 0.7059138606870694, + "grad_norm": 3.119859218597412, + "learning_rate": 8.406415745953792e-06, + "loss": 0.7354, + "step": 14497 + }, + { + "epoch": 0.7059625544761766, + "grad_norm": 1.3043626546859741, + "learning_rate": 8.40384556999876e-06, + "loss": 0.7896, + "step": 14498 + }, + { + "epoch": 0.7060112482652837, + "grad_norm": 1.7264785766601562, + "learning_rate": 8.401275682499124e-06, + "loss": 0.8599, + "step": 14499 + }, + { + "epoch": 0.7060599420543909, + "grad_norm": 1.7762746810913086, + "learning_rate": 8.398706083518815e-06, + "loss": 0.7925, + "step": 14500 + }, + { + "epoch": 0.7061086358434981, + "grad_norm": 1.9694812297821045, + "learning_rate": 8.396136773121741e-06, + "loss": 0.7725, + "step": 14501 + }, + { + "epoch": 0.7061573296326054, + "grad_norm": 1.8543951511383057, + "learning_rate": 8.39356775137183e-06, + "loss": 0.85, + "step": 14502 + }, + { + "epoch": 0.7062060234217126, + "grad_norm": 1.500380277633667, + "learning_rate": 8.39099901833298e-06, + "loss": 0.7821, + "step": 14503 + }, + { + "epoch": 0.7062547172108198, + "grad_norm": 1.3419761657714844, + "learning_rate": 8.388430574069087e-06, + "loss": 0.7827, + "step": 14504 + }, + { + "epoch": 0.706303410999927, + "grad_norm": 1.5829041004180908, + "learning_rate": 8.385862418644042e-06, + "loss": 0.7876, + "step": 14505 + }, + { + "epoch": 0.7063521047890342, + "grad_norm": 1.4794280529022217, + "learning_rate": 8.38329455212173e-06, + "loss": 0.8106, + "step": 14506 + }, + { + "epoch": 0.7064007985781413, + "grad_norm": 2.2676475048065186, + "learning_rate": 8.380726974566023e-06, + "loss": 0.7758, + "step": 14507 + }, + { + "epoch": 0.7064494923672485, + "grad_norm": 1.7790788412094116, + "learning_rate": 8.378159686040795e-06, + "loss": 0.8033, + "step": 14508 + }, + { + "epoch": 0.7064981861563557, + "grad_norm": 1.8854560852050781, + "learning_rate": 8.375592686609894e-06, + "loss": 0.8308, + "step": 14509 + }, + { + "epoch": 0.706546879945463, + "grad_norm": 0.10138044506311417, + "learning_rate": 8.373025976337193e-06, + "loss": 0.6166, + "step": 14510 + }, + { + "epoch": 0.7065955737345702, + "grad_norm": 1.4119024276733398, + "learning_rate": 8.370459555286531e-06, + "loss": 0.731, + "step": 14511 + }, + { + "epoch": 0.7066442675236774, + "grad_norm": 2.2330784797668457, + "learning_rate": 8.36789342352175e-06, + "loss": 0.796, + "step": 14512 + }, + { + "epoch": 0.7066929613127846, + "grad_norm": 1.3052278757095337, + "learning_rate": 8.365327581106677e-06, + "loss": 0.9136, + "step": 14513 + }, + { + "epoch": 0.7067416551018918, + "grad_norm": 1.297925353050232, + "learning_rate": 8.362762028105143e-06, + "loss": 0.7775, + "step": 14514 + }, + { + "epoch": 0.7067903488909989, + "grad_norm": 1.4109601974487305, + "learning_rate": 8.360196764580965e-06, + "loss": 0.7319, + "step": 14515 + }, + { + "epoch": 0.7068390426801061, + "grad_norm": 1.5861337184906006, + "learning_rate": 8.357631790597953e-06, + "loss": 0.8214, + "step": 14516 + }, + { + "epoch": 0.7068877364692133, + "grad_norm": 0.09683103859424591, + "learning_rate": 8.355067106219912e-06, + "loss": 0.6487, + "step": 14517 + }, + { + "epoch": 0.7069364302583205, + "grad_norm": 0.09777379035949707, + "learning_rate": 8.352502711510632e-06, + "loss": 0.622, + "step": 14518 + }, + { + "epoch": 0.7069851240474277, + "grad_norm": 2.1271684169769287, + "learning_rate": 8.349938606533922e-06, + "loss": 0.8635, + "step": 14519 + }, + { + "epoch": 0.707033817836535, + "grad_norm": 2.120854377746582, + "learning_rate": 8.347374791353538e-06, + "loss": 0.893, + "step": 14520 + }, + { + "epoch": 0.7070825116256422, + "grad_norm": 1.2503325939178467, + "learning_rate": 8.344811266033278e-06, + "loss": 0.7225, + "step": 14521 + }, + { + "epoch": 0.7071312054147494, + "grad_norm": 1.4106956720352173, + "learning_rate": 8.342248030636896e-06, + "loss": 0.9801, + "step": 14522 + }, + { + "epoch": 0.7071798992038566, + "grad_norm": 2.3944289684295654, + "learning_rate": 8.33968508522816e-06, + "loss": 0.7961, + "step": 14523 + }, + { + "epoch": 0.7072285929929637, + "grad_norm": 1.7533291578292847, + "learning_rate": 8.337122429870819e-06, + "loss": 0.7307, + "step": 14524 + }, + { + "epoch": 0.7072772867820709, + "grad_norm": 4.171407699584961, + "learning_rate": 8.334560064628614e-06, + "loss": 0.724, + "step": 14525 + }, + { + "epoch": 0.7073259805711781, + "grad_norm": 1.4847849607467651, + "learning_rate": 8.331997989565307e-06, + "loss": 0.7641, + "step": 14526 + }, + { + "epoch": 0.7073746743602853, + "grad_norm": 1.602866291999817, + "learning_rate": 8.329436204744596e-06, + "loss": 0.7802, + "step": 14527 + }, + { + "epoch": 0.7074233681493926, + "grad_norm": 1.3132853507995605, + "learning_rate": 8.326874710230242e-06, + "loss": 0.828, + "step": 14528 + }, + { + "epoch": 0.7074720619384998, + "grad_norm": 1.2521486282348633, + "learning_rate": 8.324313506085928e-06, + "loss": 0.8045, + "step": 14529 + }, + { + "epoch": 0.707520755727607, + "grad_norm": 1.2588963508605957, + "learning_rate": 8.321752592375388e-06, + "loss": 0.7919, + "step": 14530 + }, + { + "epoch": 0.7075694495167142, + "grad_norm": 1.8937733173370361, + "learning_rate": 8.319191969162315e-06, + "loss": 0.947, + "step": 14531 + }, + { + "epoch": 0.7076181433058213, + "grad_norm": 1.433227777481079, + "learning_rate": 8.3166316365104e-06, + "loss": 0.7967, + "step": 14532 + }, + { + "epoch": 0.7076668370949285, + "grad_norm": 1.7585426568984985, + "learning_rate": 8.314071594483353e-06, + "loss": 0.852, + "step": 14533 + }, + { + "epoch": 0.7077155308840357, + "grad_norm": 1.7646479606628418, + "learning_rate": 8.311511843144826e-06, + "loss": 0.8819, + "step": 14534 + }, + { + "epoch": 0.7077642246731429, + "grad_norm": 1.2243621349334717, + "learning_rate": 8.308952382558524e-06, + "loss": 0.7687, + "step": 14535 + }, + { + "epoch": 0.7078129184622501, + "grad_norm": 0.09997137635946274, + "learning_rate": 8.30639321278808e-06, + "loss": 0.6316, + "step": 14536 + }, + { + "epoch": 0.7078616122513574, + "grad_norm": 1.592830777168274, + "learning_rate": 8.30383433389718e-06, + "loss": 0.7613, + "step": 14537 + }, + { + "epoch": 0.7079103060404646, + "grad_norm": 2.3027007579803467, + "learning_rate": 8.301275745949468e-06, + "loss": 0.8395, + "step": 14538 + }, + { + "epoch": 0.7079589998295718, + "grad_norm": 2.3188023567199707, + "learning_rate": 8.298717449008586e-06, + "loss": 0.8013, + "step": 14539 + }, + { + "epoch": 0.708007693618679, + "grad_norm": 1.4814777374267578, + "learning_rate": 8.296159443138176e-06, + "loss": 0.8336, + "step": 14540 + }, + { + "epoch": 0.7080563874077861, + "grad_norm": 1.3685764074325562, + "learning_rate": 8.29360172840186e-06, + "loss": 0.7503, + "step": 14541 + }, + { + "epoch": 0.7081050811968933, + "grad_norm": 1.4709149599075317, + "learning_rate": 8.291044304863283e-06, + "loss": 0.84, + "step": 14542 + }, + { + "epoch": 0.7081537749860005, + "grad_norm": 1.60003662109375, + "learning_rate": 8.288487172586032e-06, + "loss": 0.8627, + "step": 14543 + }, + { + "epoch": 0.7082024687751077, + "grad_norm": 1.505763053894043, + "learning_rate": 8.285930331633745e-06, + "loss": 0.782, + "step": 14544 + }, + { + "epoch": 0.7082511625642149, + "grad_norm": 1.2524603605270386, + "learning_rate": 8.283373782069993e-06, + "loss": 0.9184, + "step": 14545 + }, + { + "epoch": 0.7082998563533222, + "grad_norm": 1.3054593801498413, + "learning_rate": 8.280817523958395e-06, + "loss": 0.8968, + "step": 14546 + }, + { + "epoch": 0.7083485501424294, + "grad_norm": 2.730182409286499, + "learning_rate": 8.278261557362528e-06, + "loss": 0.708, + "step": 14547 + }, + { + "epoch": 0.7083972439315366, + "grad_norm": 2.9176509380340576, + "learning_rate": 8.275705882345974e-06, + "loss": 0.7847, + "step": 14548 + }, + { + "epoch": 0.7084459377206437, + "grad_norm": 2.0937366485595703, + "learning_rate": 8.273150498972307e-06, + "loss": 0.8107, + "step": 14549 + }, + { + "epoch": 0.7084946315097509, + "grad_norm": 1.8794610500335693, + "learning_rate": 8.270595407305083e-06, + "loss": 0.7749, + "step": 14550 + }, + { + "epoch": 0.7085433252988581, + "grad_norm": 2.0128843784332275, + "learning_rate": 8.268040607407881e-06, + "loss": 0.8956, + "step": 14551 + }, + { + "epoch": 0.7085920190879653, + "grad_norm": 1.3015389442443848, + "learning_rate": 8.265486099344224e-06, + "loss": 0.7804, + "step": 14552 + }, + { + "epoch": 0.7086407128770725, + "grad_norm": 1.4411224126815796, + "learning_rate": 8.262931883177678e-06, + "loss": 0.7621, + "step": 14553 + }, + { + "epoch": 0.7086894066661797, + "grad_norm": 2.1590957641601562, + "learning_rate": 8.260377958971772e-06, + "loss": 0.7798, + "step": 14554 + }, + { + "epoch": 0.708738100455287, + "grad_norm": 2.000997543334961, + "learning_rate": 8.257824326790034e-06, + "loss": 0.8743, + "step": 14555 + }, + { + "epoch": 0.7087867942443942, + "grad_norm": 1.7835873365402222, + "learning_rate": 8.255270986695987e-06, + "loss": 0.7561, + "step": 14556 + }, + { + "epoch": 0.7088354880335014, + "grad_norm": 2.948357343673706, + "learning_rate": 8.252717938753144e-06, + "loss": 0.7974, + "step": 14557 + }, + { + "epoch": 0.7088841818226085, + "grad_norm": 1.3731838464736938, + "learning_rate": 8.250165183025014e-06, + "loss": 0.8505, + "step": 14558 + }, + { + "epoch": 0.7089328756117157, + "grad_norm": 1.5967071056365967, + "learning_rate": 8.247612719575095e-06, + "loss": 0.7755, + "step": 14559 + }, + { + "epoch": 0.7089815694008229, + "grad_norm": 1.7098443508148193, + "learning_rate": 8.245060548466877e-06, + "loss": 0.7461, + "step": 14560 + }, + { + "epoch": 0.7090302631899301, + "grad_norm": 1.403796911239624, + "learning_rate": 8.242508669763855e-06, + "loss": 0.9075, + "step": 14561 + }, + { + "epoch": 0.7090789569790373, + "grad_norm": 1.6159815788269043, + "learning_rate": 8.239957083529502e-06, + "loss": 0.7977, + "step": 14562 + }, + { + "epoch": 0.7091276507681445, + "grad_norm": 1.2381821870803833, + "learning_rate": 8.237405789827288e-06, + "loss": 0.8055, + "step": 14563 + }, + { + "epoch": 0.7091763445572518, + "grad_norm": 1.3736804723739624, + "learning_rate": 8.234854788720679e-06, + "loss": 0.8135, + "step": 14564 + }, + { + "epoch": 0.709225038346359, + "grad_norm": 1.2410725355148315, + "learning_rate": 8.232304080273128e-06, + "loss": 0.9059, + "step": 14565 + }, + { + "epoch": 0.7092737321354661, + "grad_norm": 1.5732275247573853, + "learning_rate": 8.229753664548088e-06, + "loss": 0.7574, + "step": 14566 + }, + { + "epoch": 0.7093224259245733, + "grad_norm": 1.730391025543213, + "learning_rate": 8.227203541608997e-06, + "loss": 0.9167, + "step": 14567 + }, + { + "epoch": 0.7093711197136805, + "grad_norm": 3.3918895721435547, + "learning_rate": 8.224653711519293e-06, + "loss": 0.8078, + "step": 14568 + }, + { + "epoch": 0.7094198135027877, + "grad_norm": 2.4420325756073, + "learning_rate": 8.222104174342394e-06, + "loss": 0.7451, + "step": 14569 + }, + { + "epoch": 0.7094685072918949, + "grad_norm": 1.2690659761428833, + "learning_rate": 8.219554930141734e-06, + "loss": 0.818, + "step": 14570 + }, + { + "epoch": 0.7095172010810021, + "grad_norm": 0.08989771455526352, + "learning_rate": 8.217005978980719e-06, + "loss": 0.5405, + "step": 14571 + }, + { + "epoch": 0.7095658948701093, + "grad_norm": 0.09419182687997818, + "learning_rate": 8.214457320922755e-06, + "loss": 0.616, + "step": 14572 + }, + { + "epoch": 0.7096145886592166, + "grad_norm": 1.9948272705078125, + "learning_rate": 8.211908956031238e-06, + "loss": 0.8672, + "step": 14573 + }, + { + "epoch": 0.7096632824483237, + "grad_norm": 1.608015537261963, + "learning_rate": 8.209360884369564e-06, + "loss": 0.9404, + "step": 14574 + }, + { + "epoch": 0.7097119762374309, + "grad_norm": 1.2921217679977417, + "learning_rate": 8.20681310600111e-06, + "loss": 0.8712, + "step": 14575 + }, + { + "epoch": 0.7097606700265381, + "grad_norm": 2.6128547191619873, + "learning_rate": 8.204265620989249e-06, + "loss": 0.8764, + "step": 14576 + }, + { + "epoch": 0.7098093638156453, + "grad_norm": 1.3285033702850342, + "learning_rate": 8.201718429397372e-06, + "loss": 0.7122, + "step": 14577 + }, + { + "epoch": 0.7098580576047525, + "grad_norm": 1.8819677829742432, + "learning_rate": 8.199171531288807e-06, + "loss": 0.8362, + "step": 14578 + }, + { + "epoch": 0.7099067513938597, + "grad_norm": 1.2336658239364624, + "learning_rate": 8.196624926726944e-06, + "loss": 0.826, + "step": 14579 + }, + { + "epoch": 0.7099554451829669, + "grad_norm": 2.309171199798584, + "learning_rate": 8.194078615775097e-06, + "loss": 0.7712, + "step": 14580 + }, + { + "epoch": 0.7100041389720742, + "grad_norm": 1.5225554704666138, + "learning_rate": 8.191532598496628e-06, + "loss": 0.9219, + "step": 14581 + }, + { + "epoch": 0.7100528327611814, + "grad_norm": 1.6989928483963013, + "learning_rate": 8.18898687495486e-06, + "loss": 0.8324, + "step": 14582 + }, + { + "epoch": 0.7101015265502885, + "grad_norm": 0.09435774385929108, + "learning_rate": 8.186441445213116e-06, + "loss": 0.6231, + "step": 14583 + }, + { + "epoch": 0.7101502203393957, + "grad_norm": 1.749855399131775, + "learning_rate": 8.183896309334732e-06, + "loss": 0.7696, + "step": 14584 + }, + { + "epoch": 0.7101989141285029, + "grad_norm": 1.3932324647903442, + "learning_rate": 8.181351467382989e-06, + "loss": 0.7567, + "step": 14585 + }, + { + "epoch": 0.7102476079176101, + "grad_norm": 1.4185081720352173, + "learning_rate": 8.17880691942122e-06, + "loss": 0.841, + "step": 14586 + }, + { + "epoch": 0.7102963017067173, + "grad_norm": 1.4969253540039062, + "learning_rate": 8.176262665512693e-06, + "loss": 0.8, + "step": 14587 + }, + { + "epoch": 0.7103449954958245, + "grad_norm": 1.7250796556472778, + "learning_rate": 8.17371870572072e-06, + "loss": 0.6817, + "step": 14588 + }, + { + "epoch": 0.7103936892849317, + "grad_norm": 1.6665459871292114, + "learning_rate": 8.17117504010857e-06, + "loss": 0.8463, + "step": 14589 + }, + { + "epoch": 0.710442383074039, + "grad_norm": 1.686794400215149, + "learning_rate": 8.16863166873952e-06, + "loss": 0.8465, + "step": 14590 + }, + { + "epoch": 0.710491076863146, + "grad_norm": 3.787116765975952, + "learning_rate": 8.166088591676836e-06, + "loss": 0.8364, + "step": 14591 + }, + { + "epoch": 0.7105397706522533, + "grad_norm": 2.8685476779937744, + "learning_rate": 8.163545808983769e-06, + "loss": 0.8516, + "step": 14592 + }, + { + "epoch": 0.7105884644413605, + "grad_norm": 1.2907650470733643, + "learning_rate": 8.161003320723593e-06, + "loss": 0.8777, + "step": 14593 + }, + { + "epoch": 0.7106371582304677, + "grad_norm": 1.471283197402954, + "learning_rate": 8.158461126959525e-06, + "loss": 0.8659, + "step": 14594 + }, + { + "epoch": 0.7106858520195749, + "grad_norm": 1.3001213073730469, + "learning_rate": 8.155919227754825e-06, + "loss": 0.8168, + "step": 14595 + }, + { + "epoch": 0.7107345458086821, + "grad_norm": 1.7211257219314575, + "learning_rate": 8.153377623172713e-06, + "loss": 0.8043, + "step": 14596 + }, + { + "epoch": 0.7107832395977893, + "grad_norm": 1.1635360717773438, + "learning_rate": 8.150836313276413e-06, + "loss": 0.7554, + "step": 14597 + }, + { + "epoch": 0.7108319333868965, + "grad_norm": 1.4509447813034058, + "learning_rate": 8.14829529812914e-06, + "loss": 0.8691, + "step": 14598 + }, + { + "epoch": 0.7108806271760038, + "grad_norm": 1.798536777496338, + "learning_rate": 8.145754577794094e-06, + "loss": 0.8415, + "step": 14599 + }, + { + "epoch": 0.7109293209651109, + "grad_norm": 1.4076850414276123, + "learning_rate": 8.1432141523345e-06, + "loss": 0.9598, + "step": 14600 + }, + { + "epoch": 0.7109780147542181, + "grad_norm": 1.61378014087677, + "learning_rate": 8.14067402181352e-06, + "loss": 0.7384, + "step": 14601 + }, + { + "epoch": 0.7110267085433253, + "grad_norm": 2.021801233291626, + "learning_rate": 8.138134186294369e-06, + "loss": 0.7789, + "step": 14602 + }, + { + "epoch": 0.7110754023324325, + "grad_norm": 1.1142038106918335, + "learning_rate": 8.135594645840197e-06, + "loss": 0.8416, + "step": 14603 + }, + { + "epoch": 0.7111240961215397, + "grad_norm": 1.288343906402588, + "learning_rate": 8.133055400514197e-06, + "loss": 0.7541, + "step": 14604 + }, + { + "epoch": 0.7111727899106469, + "grad_norm": 2.172279119491577, + "learning_rate": 8.130516450379527e-06, + "loss": 0.9372, + "step": 14605 + }, + { + "epoch": 0.7112214836997541, + "grad_norm": 1.4667694568634033, + "learning_rate": 8.12797779549934e-06, + "loss": 0.7897, + "step": 14606 + }, + { + "epoch": 0.7112701774888613, + "grad_norm": 2.1673789024353027, + "learning_rate": 8.12543943593679e-06, + "loss": 0.8776, + "step": 14607 + }, + { + "epoch": 0.7113188712779684, + "grad_norm": 2.001938581466675, + "learning_rate": 8.122901371755016e-06, + "loss": 0.9063, + "step": 14608 + }, + { + "epoch": 0.7113675650670757, + "grad_norm": 3.163273572921753, + "learning_rate": 8.120363603017155e-06, + "loss": 0.7635, + "step": 14609 + }, + { + "epoch": 0.7114162588561829, + "grad_norm": 1.5795010328292847, + "learning_rate": 8.117826129786322e-06, + "loss": 0.769, + "step": 14610 + }, + { + "epoch": 0.7114649526452901, + "grad_norm": 1.7375437021255493, + "learning_rate": 8.115288952125657e-06, + "loss": 0.8356, + "step": 14611 + }, + { + "epoch": 0.7115136464343973, + "grad_norm": 1.8372787237167358, + "learning_rate": 8.112752070098261e-06, + "loss": 0.8161, + "step": 14612 + }, + { + "epoch": 0.7115623402235045, + "grad_norm": 1.4738906621932983, + "learning_rate": 8.110215483767244e-06, + "loss": 0.8123, + "step": 14613 + }, + { + "epoch": 0.7116110340126117, + "grad_norm": 1.3696727752685547, + "learning_rate": 8.107679193195699e-06, + "loss": 0.9215, + "step": 14614 + }, + { + "epoch": 0.7116597278017189, + "grad_norm": 0.10036948323249817, + "learning_rate": 8.105143198446718e-06, + "loss": 0.6168, + "step": 14615 + }, + { + "epoch": 0.711708421590826, + "grad_norm": 1.5707296133041382, + "learning_rate": 8.102607499583387e-06, + "loss": 0.8198, + "step": 14616 + }, + { + "epoch": 0.7117571153799332, + "grad_norm": 1.49355947971344, + "learning_rate": 8.100072096668776e-06, + "loss": 0.8094, + "step": 14617 + }, + { + "epoch": 0.7118058091690405, + "grad_norm": 1.7708948850631714, + "learning_rate": 8.097536989765951e-06, + "loss": 0.7393, + "step": 14618 + }, + { + "epoch": 0.7118545029581477, + "grad_norm": 1.7269388437271118, + "learning_rate": 8.095002178937985e-06, + "loss": 0.7682, + "step": 14619 + }, + { + "epoch": 0.7119031967472549, + "grad_norm": 1.6651151180267334, + "learning_rate": 8.092467664247927e-06, + "loss": 0.8087, + "step": 14620 + }, + { + "epoch": 0.7119518905363621, + "grad_norm": 1.677448034286499, + "learning_rate": 8.089933445758819e-06, + "loss": 0.8363, + "step": 14621 + }, + { + "epoch": 0.7120005843254693, + "grad_norm": 1.4265176057815552, + "learning_rate": 8.087399523533706e-06, + "loss": 0.7971, + "step": 14622 + }, + { + "epoch": 0.7120492781145765, + "grad_norm": 2.462324380874634, + "learning_rate": 8.084865897635612e-06, + "loss": 0.8298, + "step": 14623 + }, + { + "epoch": 0.7120979719036837, + "grad_norm": 0.0939563512802124, + "learning_rate": 8.082332568127566e-06, + "loss": 0.607, + "step": 14624 + }, + { + "epoch": 0.7121466656927908, + "grad_norm": 1.5808526277542114, + "learning_rate": 8.079799535072585e-06, + "loss": 0.7678, + "step": 14625 + }, + { + "epoch": 0.712195359481898, + "grad_norm": 1.9953848123550415, + "learning_rate": 8.077266798533679e-06, + "loss": 0.7724, + "step": 14626 + }, + { + "epoch": 0.7122440532710053, + "grad_norm": 1.7595041990280151, + "learning_rate": 8.07473435857384e-06, + "loss": 0.7593, + "step": 14627 + }, + { + "epoch": 0.7122927470601125, + "grad_norm": 1.5240232944488525, + "learning_rate": 8.072202215256084e-06, + "loss": 0.789, + "step": 14628 + }, + { + "epoch": 0.7123414408492197, + "grad_norm": 1.7465940713882446, + "learning_rate": 8.069670368643372e-06, + "loss": 0.7727, + "step": 14629 + }, + { + "epoch": 0.7123901346383269, + "grad_norm": 1.708150029182434, + "learning_rate": 8.067138818798705e-06, + "loss": 0.8021, + "step": 14630 + }, + { + "epoch": 0.7124388284274341, + "grad_norm": 2.2332394123077393, + "learning_rate": 8.064607565785047e-06, + "loss": 0.8775, + "step": 14631 + }, + { + "epoch": 0.7124875222165413, + "grad_norm": 1.3323233127593994, + "learning_rate": 8.062076609665365e-06, + "loss": 0.8691, + "step": 14632 + }, + { + "epoch": 0.7125362160056484, + "grad_norm": 1.9692745208740234, + "learning_rate": 8.059545950502617e-06, + "loss": 0.8454, + "step": 14633 + }, + { + "epoch": 0.7125849097947556, + "grad_norm": 1.474156141281128, + "learning_rate": 8.057015588359743e-06, + "loss": 0.7765, + "step": 14634 + }, + { + "epoch": 0.7126336035838629, + "grad_norm": 1.5239263772964478, + "learning_rate": 8.054485523299709e-06, + "loss": 0.8307, + "step": 14635 + }, + { + "epoch": 0.7126822973729701, + "grad_norm": 1.6026527881622314, + "learning_rate": 8.051955755385423e-06, + "loss": 0.7524, + "step": 14636 + }, + { + "epoch": 0.7127309911620773, + "grad_norm": 1.823996663093567, + "learning_rate": 8.049426284679843e-06, + "loss": 0.818, + "step": 14637 + }, + { + "epoch": 0.7127796849511845, + "grad_norm": 1.402222990989685, + "learning_rate": 8.046897111245857e-06, + "loss": 0.8007, + "step": 14638 + }, + { + "epoch": 0.7128283787402917, + "grad_norm": 1.7379463911056519, + "learning_rate": 8.044368235146405e-06, + "loss": 0.8102, + "step": 14639 + }, + { + "epoch": 0.7128770725293989, + "grad_norm": 1.2276790142059326, + "learning_rate": 8.041839656444381e-06, + "loss": 0.7982, + "step": 14640 + }, + { + "epoch": 0.7129257663185061, + "grad_norm": 1.51030433177948, + "learning_rate": 8.039311375202678e-06, + "loss": 0.8062, + "step": 14641 + }, + { + "epoch": 0.7129744601076132, + "grad_norm": 1.3038442134857178, + "learning_rate": 8.036783391484211e-06, + "loss": 0.8293, + "step": 14642 + }, + { + "epoch": 0.7130231538967204, + "grad_norm": 0.09402713924646378, + "learning_rate": 8.034255705351834e-06, + "loss": 0.639, + "step": 14643 + }, + { + "epoch": 0.7130718476858277, + "grad_norm": 1.6130479574203491, + "learning_rate": 8.03172831686845e-06, + "loss": 0.8988, + "step": 14644 + }, + { + "epoch": 0.7131205414749349, + "grad_norm": 1.8225202560424805, + "learning_rate": 8.029201226096902e-06, + "loss": 0.7722, + "step": 14645 + }, + { + "epoch": 0.7131692352640421, + "grad_norm": 1.4219248294830322, + "learning_rate": 8.026674433100071e-06, + "loss": 0.8322, + "step": 14646 + }, + { + "epoch": 0.7132179290531493, + "grad_norm": 1.6560190916061401, + "learning_rate": 8.02414793794081e-06, + "loss": 0.7747, + "step": 14647 + }, + { + "epoch": 0.7132666228422565, + "grad_norm": 2.2694919109344482, + "learning_rate": 8.021621740681956e-06, + "loss": 0.7914, + "step": 14648 + }, + { + "epoch": 0.7133153166313637, + "grad_norm": 1.8017594814300537, + "learning_rate": 8.019095841386355e-06, + "loss": 0.8526, + "step": 14649 + }, + { + "epoch": 0.7133640104204708, + "grad_norm": 0.0956791415810585, + "learning_rate": 8.01657024011683e-06, + "loss": 0.6713, + "step": 14650 + }, + { + "epoch": 0.713412704209578, + "grad_norm": 1.5173146724700928, + "learning_rate": 8.014044936936227e-06, + "loss": 0.9014, + "step": 14651 + }, + { + "epoch": 0.7134613979986852, + "grad_norm": 2.06054949760437, + "learning_rate": 8.011519931907334e-06, + "loss": 0.7865, + "step": 14652 + }, + { + "epoch": 0.7135100917877925, + "grad_norm": 6.562020301818848, + "learning_rate": 8.00899522509299e-06, + "loss": 0.8873, + "step": 14653 + }, + { + "epoch": 0.7135587855768997, + "grad_norm": 1.4912612438201904, + "learning_rate": 8.006470816555972e-06, + "loss": 0.7113, + "step": 14654 + }, + { + "epoch": 0.7136074793660069, + "grad_norm": 1.523871660232544, + "learning_rate": 8.00394670635909e-06, + "loss": 0.7391, + "step": 14655 + }, + { + "epoch": 0.7136561731551141, + "grad_norm": 2.1217315196990967, + "learning_rate": 8.001422894565127e-06, + "loss": 0.8462, + "step": 14656 + }, + { + "epoch": 0.7137048669442213, + "grad_norm": 1.510993480682373, + "learning_rate": 7.998899381236864e-06, + "loss": 0.845, + "step": 14657 + }, + { + "epoch": 0.7137535607333285, + "grad_norm": 1.6621800661087036, + "learning_rate": 7.996376166437074e-06, + "loss": 0.9698, + "step": 14658 + }, + { + "epoch": 0.7138022545224356, + "grad_norm": 1.474586009979248, + "learning_rate": 7.993853250228515e-06, + "loss": 0.8, + "step": 14659 + }, + { + "epoch": 0.7138509483115428, + "grad_norm": 1.8955196142196655, + "learning_rate": 7.991330632673963e-06, + "loss": 0.8564, + "step": 14660 + }, + { + "epoch": 0.71389964210065, + "grad_norm": 1.7836956977844238, + "learning_rate": 7.988808313836143e-06, + "loss": 0.7536, + "step": 14661 + }, + { + "epoch": 0.7139483358897573, + "grad_norm": 1.430128812789917, + "learning_rate": 7.986286293777818e-06, + "loss": 0.8327, + "step": 14662 + }, + { + "epoch": 0.7139970296788645, + "grad_norm": 1.5811967849731445, + "learning_rate": 7.983764572561719e-06, + "loss": 0.7471, + "step": 14663 + }, + { + "epoch": 0.7140457234679717, + "grad_norm": 1.8443878889083862, + "learning_rate": 7.98124315025057e-06, + "loss": 0.7967, + "step": 14664 + }, + { + "epoch": 0.7140944172570789, + "grad_norm": 1.6114073991775513, + "learning_rate": 7.978722026907093e-06, + "loss": 0.883, + "step": 14665 + }, + { + "epoch": 0.7141431110461861, + "grad_norm": 1.6301703453063965, + "learning_rate": 7.976201202594002e-06, + "loss": 0.8528, + "step": 14666 + }, + { + "epoch": 0.7141918048352932, + "grad_norm": 2.246428966522217, + "learning_rate": 7.973680677374005e-06, + "loss": 0.8084, + "step": 14667 + }, + { + "epoch": 0.7142404986244004, + "grad_norm": 1.3385974168777466, + "learning_rate": 7.971160451309796e-06, + "loss": 0.8324, + "step": 14668 + }, + { + "epoch": 0.7142891924135076, + "grad_norm": 1.3449283838272095, + "learning_rate": 7.968640524464059e-06, + "loss": 0.7914, + "step": 14669 + }, + { + "epoch": 0.7143378862026148, + "grad_norm": 2.0296499729156494, + "learning_rate": 7.966120896899494e-06, + "loss": 0.7984, + "step": 14670 + }, + { + "epoch": 0.7143865799917221, + "grad_norm": 1.7307407855987549, + "learning_rate": 7.96360156867877e-06, + "loss": 0.862, + "step": 14671 + }, + { + "epoch": 0.7144352737808293, + "grad_norm": 0.09507117420434952, + "learning_rate": 7.961082539864553e-06, + "loss": 0.5829, + "step": 14672 + }, + { + "epoch": 0.7144839675699365, + "grad_norm": 1.551076054573059, + "learning_rate": 7.958563810519506e-06, + "loss": 0.776, + "step": 14673 + }, + { + "epoch": 0.7145326613590437, + "grad_norm": 1.4057329893112183, + "learning_rate": 7.956045380706281e-06, + "loss": 0.763, + "step": 14674 + }, + { + "epoch": 0.7145813551481508, + "grad_norm": 2.2707419395446777, + "learning_rate": 7.953527250487527e-06, + "loss": 0.7154, + "step": 14675 + }, + { + "epoch": 0.714630048937258, + "grad_norm": 1.5218311548233032, + "learning_rate": 7.95100941992588e-06, + "loss": 0.8989, + "step": 14676 + }, + { + "epoch": 0.7146787427263652, + "grad_norm": 1.39706552028656, + "learning_rate": 7.948491889083971e-06, + "loss": 0.8128, + "step": 14677 + }, + { + "epoch": 0.7147274365154724, + "grad_norm": 1.5965150594711304, + "learning_rate": 7.94597465802442e-06, + "loss": 0.837, + "step": 14678 + }, + { + "epoch": 0.7147761303045796, + "grad_norm": 3.2670538425445557, + "learning_rate": 7.943457726809856e-06, + "loss": 0.8118, + "step": 14679 + }, + { + "epoch": 0.7148248240936869, + "grad_norm": 1.2940555810928345, + "learning_rate": 7.94094109550288e-06, + "loss": 0.8245, + "step": 14680 + }, + { + "epoch": 0.7148735178827941, + "grad_norm": 2.307358980178833, + "learning_rate": 7.938424764166094e-06, + "loss": 0.7945, + "step": 14681 + }, + { + "epoch": 0.7149222116719013, + "grad_norm": 2.1374754905700684, + "learning_rate": 7.93590873286209e-06, + "loss": 0.8166, + "step": 14682 + }, + { + "epoch": 0.7149709054610085, + "grad_norm": 2.221426486968994, + "learning_rate": 7.933393001653456e-06, + "loss": 0.7172, + "step": 14683 + }, + { + "epoch": 0.7150195992501156, + "grad_norm": 1.4823791980743408, + "learning_rate": 7.930877570602774e-06, + "loss": 0.8307, + "step": 14684 + }, + { + "epoch": 0.7150682930392228, + "grad_norm": 0.09735453873872757, + "learning_rate": 7.928362439772604e-06, + "loss": 0.6039, + "step": 14685 + }, + { + "epoch": 0.71511698682833, + "grad_norm": 1.3701889514923096, + "learning_rate": 7.925847609225532e-06, + "loss": 0.8292, + "step": 14686 + }, + { + "epoch": 0.7151656806174372, + "grad_norm": 1.7029660940170288, + "learning_rate": 7.923333079024089e-06, + "loss": 0.7294, + "step": 14687 + }, + { + "epoch": 0.7152143744065445, + "grad_norm": 1.4298460483551025, + "learning_rate": 7.920818849230842e-06, + "loss": 0.8098, + "step": 14688 + }, + { + "epoch": 0.7152630681956517, + "grad_norm": 1.7990398406982422, + "learning_rate": 7.91830491990833e-06, + "loss": 0.8226, + "step": 14689 + }, + { + "epoch": 0.7153117619847589, + "grad_norm": 1.6863682270050049, + "learning_rate": 7.915791291119083e-06, + "loss": 0.8638, + "step": 14690 + }, + { + "epoch": 0.7153604557738661, + "grad_norm": 1.5544477701187134, + "learning_rate": 7.91327796292563e-06, + "loss": 0.8405, + "step": 14691 + }, + { + "epoch": 0.7154091495629732, + "grad_norm": 1.3888102769851685, + "learning_rate": 7.910764935390483e-06, + "loss": 0.7968, + "step": 14692 + }, + { + "epoch": 0.7154578433520804, + "grad_norm": 1.3824594020843506, + "learning_rate": 7.908252208576175e-06, + "loss": 0.8426, + "step": 14693 + }, + { + "epoch": 0.7155065371411876, + "grad_norm": 1.6610931158065796, + "learning_rate": 7.905739782545181e-06, + "loss": 0.8034, + "step": 14694 + }, + { + "epoch": 0.7155552309302948, + "grad_norm": 1.2693759202957153, + "learning_rate": 7.903227657360027e-06, + "loss": 0.8321, + "step": 14695 + }, + { + "epoch": 0.715603924719402, + "grad_norm": 2.0088582038879395, + "learning_rate": 7.900715833083172e-06, + "loss": 0.8125, + "step": 14696 + }, + { + "epoch": 0.7156526185085093, + "grad_norm": 1.6498790979385376, + "learning_rate": 7.898204309777123e-06, + "loss": 0.7835, + "step": 14697 + }, + { + "epoch": 0.7157013122976165, + "grad_norm": 1.8594783544540405, + "learning_rate": 7.895693087504343e-06, + "loss": 0.8875, + "step": 14698 + }, + { + "epoch": 0.7157500060867237, + "grad_norm": 1.4138950109481812, + "learning_rate": 7.893182166327298e-06, + "loss": 0.8352, + "step": 14699 + }, + { + "epoch": 0.7157986998758309, + "grad_norm": 1.3267264366149902, + "learning_rate": 7.890671546308461e-06, + "loss": 0.7906, + "step": 14700 + }, + { + "epoch": 0.715847393664938, + "grad_norm": 2.1598894596099854, + "learning_rate": 7.88816122751026e-06, + "loss": 0.8188, + "step": 14701 + }, + { + "epoch": 0.7158960874540452, + "grad_norm": 1.5547361373901367, + "learning_rate": 7.885651209995169e-06, + "loss": 0.7579, + "step": 14702 + }, + { + "epoch": 0.7159447812431524, + "grad_norm": 2.1387805938720703, + "learning_rate": 7.883141493825592e-06, + "loss": 0.7939, + "step": 14703 + }, + { + "epoch": 0.7159934750322596, + "grad_norm": 2.240804672241211, + "learning_rate": 7.880632079063983e-06, + "loss": 0.7719, + "step": 14704 + }, + { + "epoch": 0.7160421688213668, + "grad_norm": 1.5474704504013062, + "learning_rate": 7.878122965772757e-06, + "loss": 0.8168, + "step": 14705 + }, + { + "epoch": 0.716090862610474, + "grad_norm": 2.281012535095215, + "learning_rate": 7.875614154014327e-06, + "loss": 0.7662, + "step": 14706 + }, + { + "epoch": 0.7161395563995813, + "grad_norm": 3.973177909851074, + "learning_rate": 7.873105643851098e-06, + "loss": 0.8806, + "step": 14707 + }, + { + "epoch": 0.7161882501886885, + "grad_norm": 2.1631052494049072, + "learning_rate": 7.870597435345469e-06, + "loss": 0.7642, + "step": 14708 + }, + { + "epoch": 0.7162369439777956, + "grad_norm": 5.7054877281188965, + "learning_rate": 7.868089528559845e-06, + "loss": 0.835, + "step": 14709 + }, + { + "epoch": 0.7162856377669028, + "grad_norm": 1.4514691829681396, + "learning_rate": 7.865581923556587e-06, + "loss": 0.8807, + "step": 14710 + }, + { + "epoch": 0.71633433155601, + "grad_norm": 1.841692566871643, + "learning_rate": 7.863074620398101e-06, + "loss": 0.7772, + "step": 14711 + }, + { + "epoch": 0.7163830253451172, + "grad_norm": 1.6153286695480347, + "learning_rate": 7.860567619146724e-06, + "loss": 0.7837, + "step": 14712 + }, + { + "epoch": 0.7164317191342244, + "grad_norm": 1.447653889656067, + "learning_rate": 7.858060919864842e-06, + "loss": 0.7303, + "step": 14713 + }, + { + "epoch": 0.7164804129233316, + "grad_norm": 1.8613954782485962, + "learning_rate": 7.855554522614803e-06, + "loss": 0.8805, + "step": 14714 + }, + { + "epoch": 0.7165291067124389, + "grad_norm": 2.4543232917785645, + "learning_rate": 7.853048427458952e-06, + "loss": 0.7106, + "step": 14715 + }, + { + "epoch": 0.7165778005015461, + "grad_norm": 1.5646339654922485, + "learning_rate": 7.850542634459628e-06, + "loss": 0.8259, + "step": 14716 + }, + { + "epoch": 0.7166264942906533, + "grad_norm": 1.2505910396575928, + "learning_rate": 7.848037143679164e-06, + "loss": 0.7587, + "step": 14717 + }, + { + "epoch": 0.7166751880797604, + "grad_norm": 1.8711447715759277, + "learning_rate": 7.845531955179884e-06, + "loss": 0.855, + "step": 14718 + }, + { + "epoch": 0.7167238818688676, + "grad_norm": 2.212733030319214, + "learning_rate": 7.843027069024098e-06, + "loss": 0.8411, + "step": 14719 + }, + { + "epoch": 0.7167725756579748, + "grad_norm": 1.6703810691833496, + "learning_rate": 7.84052248527413e-06, + "loss": 0.8918, + "step": 14720 + }, + { + "epoch": 0.716821269447082, + "grad_norm": 1.865253210067749, + "learning_rate": 7.838018203992272e-06, + "loss": 0.8898, + "step": 14721 + }, + { + "epoch": 0.7168699632361892, + "grad_norm": 1.1916687488555908, + "learning_rate": 7.83551422524082e-06, + "loss": 0.7546, + "step": 14722 + }, + { + "epoch": 0.7169186570252964, + "grad_norm": 1.4743173122406006, + "learning_rate": 7.833010549082063e-06, + "loss": 0.8654, + "step": 14723 + }, + { + "epoch": 0.7169673508144037, + "grad_norm": 1.2769981622695923, + "learning_rate": 7.830507175578276e-06, + "loss": 0.8492, + "step": 14724 + }, + { + "epoch": 0.7170160446035109, + "grad_norm": 1.7487506866455078, + "learning_rate": 7.828004104791733e-06, + "loss": 0.789, + "step": 14725 + }, + { + "epoch": 0.717064738392618, + "grad_norm": 1.712796688079834, + "learning_rate": 7.825501336784697e-06, + "loss": 0.868, + "step": 14726 + }, + { + "epoch": 0.7171134321817252, + "grad_norm": 3.0431692600250244, + "learning_rate": 7.822998871619421e-06, + "loss": 0.7668, + "step": 14727 + }, + { + "epoch": 0.7171621259708324, + "grad_norm": 1.8003689050674438, + "learning_rate": 7.820496709358163e-06, + "loss": 0.9055, + "step": 14728 + }, + { + "epoch": 0.7172108197599396, + "grad_norm": 1.6140193939208984, + "learning_rate": 7.817994850063159e-06, + "loss": 0.8646, + "step": 14729 + }, + { + "epoch": 0.7172595135490468, + "grad_norm": 2.8369269371032715, + "learning_rate": 7.815493293796648e-06, + "loss": 0.8587, + "step": 14730 + }, + { + "epoch": 0.717308207338154, + "grad_norm": 1.8009421825408936, + "learning_rate": 7.812992040620848e-06, + "loss": 0.7497, + "step": 14731 + }, + { + "epoch": 0.7173569011272612, + "grad_norm": 1.5066101551055908, + "learning_rate": 7.810491090597985e-06, + "loss": 0.7857, + "step": 14732 + }, + { + "epoch": 0.7174055949163685, + "grad_norm": 2.293659210205078, + "learning_rate": 7.807990443790265e-06, + "loss": 0.7071, + "step": 14733 + }, + { + "epoch": 0.7174542887054756, + "grad_norm": 1.248969316482544, + "learning_rate": 7.805490100259896e-06, + "loss": 0.8468, + "step": 14734 + }, + { + "epoch": 0.7175029824945828, + "grad_norm": 1.510754942893982, + "learning_rate": 7.802990060069072e-06, + "loss": 0.8582, + "step": 14735 + }, + { + "epoch": 0.71755167628369, + "grad_norm": 1.6814765930175781, + "learning_rate": 7.800490323279974e-06, + "loss": 0.842, + "step": 14736 + }, + { + "epoch": 0.7176003700727972, + "grad_norm": 2.700834274291992, + "learning_rate": 7.797990889954807e-06, + "loss": 0.7859, + "step": 14737 + }, + { + "epoch": 0.7176490638619044, + "grad_norm": 1.7749415636062622, + "learning_rate": 7.795491760155716e-06, + "loss": 0.8458, + "step": 14738 + }, + { + "epoch": 0.7176977576510116, + "grad_norm": 1.4309217929840088, + "learning_rate": 7.792992933944887e-06, + "loss": 0.8295, + "step": 14739 + }, + { + "epoch": 0.7177464514401188, + "grad_norm": 1.8638005256652832, + "learning_rate": 7.790494411384471e-06, + "loss": 0.802, + "step": 14740 + }, + { + "epoch": 0.717795145229226, + "grad_norm": 1.5344312191009521, + "learning_rate": 7.78799619253662e-06, + "loss": 0.687, + "step": 14741 + }, + { + "epoch": 0.7178438390183333, + "grad_norm": 1.202386736869812, + "learning_rate": 7.785498277463477e-06, + "loss": 0.8743, + "step": 14742 + }, + { + "epoch": 0.7178925328074404, + "grad_norm": 1.411946177482605, + "learning_rate": 7.78300066622717e-06, + "loss": 0.8344, + "step": 14743 + }, + { + "epoch": 0.7179412265965476, + "grad_norm": 1.5031129121780396, + "learning_rate": 7.78050335888985e-06, + "loss": 0.7942, + "step": 14744 + }, + { + "epoch": 0.7179899203856548, + "grad_norm": 2.3366260528564453, + "learning_rate": 7.77800635551361e-06, + "loss": 0.7867, + "step": 14745 + }, + { + "epoch": 0.718038614174762, + "grad_norm": 1.4772521257400513, + "learning_rate": 7.77550965616059e-06, + "loss": 0.9113, + "step": 14746 + }, + { + "epoch": 0.7180873079638692, + "grad_norm": 1.9315053224563599, + "learning_rate": 7.773013260892867e-06, + "loss": 0.7798, + "step": 14747 + }, + { + "epoch": 0.7181360017529764, + "grad_norm": 1.9044822454452515, + "learning_rate": 7.770517169772563e-06, + "loss": 0.847, + "step": 14748 + }, + { + "epoch": 0.7181846955420836, + "grad_norm": 1.8661775588989258, + "learning_rate": 7.768021382861757e-06, + "loss": 0.7999, + "step": 14749 + }, + { + "epoch": 0.7182333893311909, + "grad_norm": 1.683090090751648, + "learning_rate": 7.765525900222527e-06, + "loss": 0.7996, + "step": 14750 + }, + { + "epoch": 0.718282083120298, + "grad_norm": 3.5016438961029053, + "learning_rate": 7.76303072191697e-06, + "loss": 0.7969, + "step": 14751 + }, + { + "epoch": 0.7183307769094052, + "grad_norm": 2.472923755645752, + "learning_rate": 7.760535848007129e-06, + "loss": 0.778, + "step": 14752 + }, + { + "epoch": 0.7183794706985124, + "grad_norm": 1.3083891868591309, + "learning_rate": 7.758041278555087e-06, + "loss": 0.834, + "step": 14753 + }, + { + "epoch": 0.7184281644876196, + "grad_norm": 2.362342119216919, + "learning_rate": 7.755547013622869e-06, + "loss": 0.7761, + "step": 14754 + }, + { + "epoch": 0.7184768582767268, + "grad_norm": 1.320317029953003, + "learning_rate": 7.753053053272542e-06, + "loss": 0.7527, + "step": 14755 + }, + { + "epoch": 0.718525552065834, + "grad_norm": 1.7247569561004639, + "learning_rate": 7.750559397566138e-06, + "loss": 0.8295, + "step": 14756 + }, + { + "epoch": 0.7185742458549412, + "grad_norm": 1.450761079788208, + "learning_rate": 7.748066046565685e-06, + "loss": 0.7801, + "step": 14757 + }, + { + "epoch": 0.7186229396440484, + "grad_norm": 1.3502641916275024, + "learning_rate": 7.745573000333208e-06, + "loss": 0.8594, + "step": 14758 + }, + { + "epoch": 0.7186716334331557, + "grad_norm": 1.5014435052871704, + "learning_rate": 7.743080258930714e-06, + "loss": 0.8469, + "step": 14759 + }, + { + "epoch": 0.7187203272222628, + "grad_norm": 1.3181854486465454, + "learning_rate": 7.740587822420228e-06, + "loss": 0.8243, + "step": 14760 + }, + { + "epoch": 0.71876902101137, + "grad_norm": 3.6239736080169678, + "learning_rate": 7.738095690863724e-06, + "loss": 0.8644, + "step": 14761 + }, + { + "epoch": 0.7188177148004772, + "grad_norm": 1.3947433233261108, + "learning_rate": 7.73560386432322e-06, + "loss": 0.7279, + "step": 14762 + }, + { + "epoch": 0.7188664085895844, + "grad_norm": 2.4113101959228516, + "learning_rate": 7.733112342860675e-06, + "loss": 0.9144, + "step": 14763 + }, + { + "epoch": 0.7189151023786916, + "grad_norm": 3.9184060096740723, + "learning_rate": 7.730621126538087e-06, + "loss": 0.8593, + "step": 14764 + }, + { + "epoch": 0.7189637961677988, + "grad_norm": 1.9033482074737549, + "learning_rate": 7.728130215417416e-06, + "loss": 0.8205, + "step": 14765 + }, + { + "epoch": 0.719012489956906, + "grad_norm": 1.5509347915649414, + "learning_rate": 7.725639609560624e-06, + "loss": 0.8519, + "step": 14766 + }, + { + "epoch": 0.7190611837460132, + "grad_norm": 1.5641196966171265, + "learning_rate": 7.723149309029669e-06, + "loss": 0.8509, + "step": 14767 + }, + { + "epoch": 0.7191098775351203, + "grad_norm": 1.3331711292266846, + "learning_rate": 7.720659313886485e-06, + "loss": 0.7667, + "step": 14768 + }, + { + "epoch": 0.7191585713242276, + "grad_norm": 1.5958129167556763, + "learning_rate": 7.718169624193035e-06, + "loss": 0.8157, + "step": 14769 + }, + { + "epoch": 0.7192072651133348, + "grad_norm": 1.950092077255249, + "learning_rate": 7.71568024001122e-06, + "loss": 0.7605, + "step": 14770 + }, + { + "epoch": 0.719255958902442, + "grad_norm": 1.3412041664123535, + "learning_rate": 7.713191161402988e-06, + "loss": 0.7693, + "step": 14771 + }, + { + "epoch": 0.7193046526915492, + "grad_norm": 1.5131779909133911, + "learning_rate": 7.710702388430244e-06, + "loss": 0.7675, + "step": 14772 + }, + { + "epoch": 0.7193533464806564, + "grad_norm": 2.721759796142578, + "learning_rate": 7.7082139211549e-06, + "loss": 0.768, + "step": 14773 + }, + { + "epoch": 0.7194020402697636, + "grad_norm": 2.0357887744903564, + "learning_rate": 7.705725759638856e-06, + "loss": 0.7738, + "step": 14774 + }, + { + "epoch": 0.7194507340588708, + "grad_norm": 1.7470141649246216, + "learning_rate": 7.703237903944006e-06, + "loss": 0.7945, + "step": 14775 + }, + { + "epoch": 0.719499427847978, + "grad_norm": 1.6780275106430054, + "learning_rate": 7.700750354132231e-06, + "loss": 0.8443, + "step": 14776 + }, + { + "epoch": 0.7195481216370851, + "grad_norm": 3.610607862472534, + "learning_rate": 7.698263110265413e-06, + "loss": 0.7319, + "step": 14777 + }, + { + "epoch": 0.7195968154261924, + "grad_norm": 2.105642318725586, + "learning_rate": 7.695776172405416e-06, + "loss": 0.8735, + "step": 14778 + }, + { + "epoch": 0.7196455092152996, + "grad_norm": 1.2323505878448486, + "learning_rate": 7.693289540614114e-06, + "loss": 0.7754, + "step": 14779 + }, + { + "epoch": 0.7196942030044068, + "grad_norm": 1.5136250257492065, + "learning_rate": 7.69080321495336e-06, + "loss": 0.8126, + "step": 14780 + }, + { + "epoch": 0.719742896793514, + "grad_norm": 1.3423106670379639, + "learning_rate": 7.688317195484996e-06, + "loss": 0.8242, + "step": 14781 + }, + { + "epoch": 0.7197915905826212, + "grad_norm": 2.0638043880462646, + "learning_rate": 7.685831482270863e-06, + "loss": 0.8343, + "step": 14782 + }, + { + "epoch": 0.7198402843717284, + "grad_norm": 1.4836370944976807, + "learning_rate": 7.683346075372797e-06, + "loss": 0.7941, + "step": 14783 + }, + { + "epoch": 0.7198889781608356, + "grad_norm": 1.4388655424118042, + "learning_rate": 7.68086097485262e-06, + "loss": 0.8596, + "step": 14784 + }, + { + "epoch": 0.7199376719499427, + "grad_norm": 2.3001246452331543, + "learning_rate": 7.67837618077214e-06, + "loss": 0.7666, + "step": 14785 + }, + { + "epoch": 0.71998636573905, + "grad_norm": 1.6639800071716309, + "learning_rate": 7.675891693193191e-06, + "loss": 0.823, + "step": 14786 + }, + { + "epoch": 0.7200350595281572, + "grad_norm": 2.9989047050476074, + "learning_rate": 7.67340751217755e-06, + "loss": 0.7675, + "step": 14787 + }, + { + "epoch": 0.7200837533172644, + "grad_norm": 1.6734302043914795, + "learning_rate": 7.670923637787023e-06, + "loss": 0.7773, + "step": 14788 + }, + { + "epoch": 0.7201324471063716, + "grad_norm": 1.364442229270935, + "learning_rate": 7.668440070083396e-06, + "loss": 0.9086, + "step": 14789 + }, + { + "epoch": 0.7201811408954788, + "grad_norm": 1.8086917400360107, + "learning_rate": 7.665956809128447e-06, + "loss": 0.7003, + "step": 14790 + }, + { + "epoch": 0.720229834684586, + "grad_norm": 1.7192394733428955, + "learning_rate": 7.663473854983947e-06, + "loss": 0.9311, + "step": 14791 + }, + { + "epoch": 0.7202785284736932, + "grad_norm": 1.1526738405227661, + "learning_rate": 7.66099120771166e-06, + "loss": 0.9294, + "step": 14792 + }, + { + "epoch": 0.7203272222628003, + "grad_norm": 1.3117001056671143, + "learning_rate": 7.65850886737334e-06, + "loss": 0.8301, + "step": 14793 + }, + { + "epoch": 0.7203759160519075, + "grad_norm": 1.9112827777862549, + "learning_rate": 7.656026834030733e-06, + "loss": 0.8213, + "step": 14794 + }, + { + "epoch": 0.7204246098410148, + "grad_norm": 1.4622315168380737, + "learning_rate": 7.653545107745595e-06, + "loss": 0.7797, + "step": 14795 + }, + { + "epoch": 0.720473303630122, + "grad_norm": 1.2411866188049316, + "learning_rate": 7.651063688579636e-06, + "loss": 0.7478, + "step": 14796 + }, + { + "epoch": 0.7205219974192292, + "grad_norm": 2.4888534545898438, + "learning_rate": 7.648582576594601e-06, + "loss": 0.8497, + "step": 14797 + }, + { + "epoch": 0.7205706912083364, + "grad_norm": 2.030369997024536, + "learning_rate": 7.646101771852199e-06, + "loss": 0.8047, + "step": 14798 + }, + { + "epoch": 0.7206193849974436, + "grad_norm": 1.3021965026855469, + "learning_rate": 7.643621274414141e-06, + "loss": 0.7422, + "step": 14799 + }, + { + "epoch": 0.7206680787865508, + "grad_norm": 1.9906128644943237, + "learning_rate": 7.641141084342134e-06, + "loss": 0.8452, + "step": 14800 + }, + { + "epoch": 0.720716772575658, + "grad_norm": 1.6494792699813843, + "learning_rate": 7.638661201697859e-06, + "loss": 0.8064, + "step": 14801 + }, + { + "epoch": 0.7207654663647651, + "grad_norm": 1.6347627639770508, + "learning_rate": 7.636181626543028e-06, + "loss": 0.6982, + "step": 14802 + }, + { + "epoch": 0.7208141601538723, + "grad_norm": 1.5024394989013672, + "learning_rate": 7.633702358939295e-06, + "loss": 0.8193, + "step": 14803 + }, + { + "epoch": 0.7208628539429796, + "grad_norm": 1.9187568426132202, + "learning_rate": 7.631223398948354e-06, + "loss": 0.7812, + "step": 14804 + }, + { + "epoch": 0.7209115477320868, + "grad_norm": 0.11145390570163727, + "learning_rate": 7.628744746631849e-06, + "loss": 0.6512, + "step": 14805 + }, + { + "epoch": 0.720960241521194, + "grad_norm": 1.6327745914459229, + "learning_rate": 7.62626640205145e-06, + "loss": 0.8212, + "step": 14806 + }, + { + "epoch": 0.7210089353103012, + "grad_norm": 1.635645866394043, + "learning_rate": 7.623788365268802e-06, + "loss": 0.8271, + "step": 14807 + }, + { + "epoch": 0.7210576290994084, + "grad_norm": 0.09599262475967407, + "learning_rate": 7.621310636345542e-06, + "loss": 0.5943, + "step": 14808 + }, + { + "epoch": 0.7211063228885156, + "grad_norm": 1.5058636665344238, + "learning_rate": 7.618833215343322e-06, + "loss": 0.9195, + "step": 14809 + }, + { + "epoch": 0.7211550166776227, + "grad_norm": 1.8862862586975098, + "learning_rate": 7.616356102323741e-06, + "loss": 0.8697, + "step": 14810 + }, + { + "epoch": 0.7212037104667299, + "grad_norm": 1.2970836162567139, + "learning_rate": 7.613879297348443e-06, + "loss": 0.7349, + "step": 14811 + }, + { + "epoch": 0.7212524042558371, + "grad_norm": 1.5078155994415283, + "learning_rate": 7.6114028004790154e-06, + "loss": 0.7756, + "step": 14812 + }, + { + "epoch": 0.7213010980449444, + "grad_norm": 1.5486055612564087, + "learning_rate": 7.608926611777077e-06, + "loss": 0.8565, + "step": 14813 + }, + { + "epoch": 0.7213497918340516, + "grad_norm": 1.4260698556900024, + "learning_rate": 7.60645073130422e-06, + "loss": 0.8475, + "step": 14814 + }, + { + "epoch": 0.7213984856231588, + "grad_norm": 1.4295768737792969, + "learning_rate": 7.603975159122033e-06, + "loss": 0.8488, + "step": 14815 + }, + { + "epoch": 0.721447179412266, + "grad_norm": 2.694523811340332, + "learning_rate": 7.60149989529209e-06, + "loss": 0.8193, + "step": 14816 + }, + { + "epoch": 0.7214958732013732, + "grad_norm": 2.7310798168182373, + "learning_rate": 7.599024939875965e-06, + "loss": 0.8507, + "step": 14817 + }, + { + "epoch": 0.7215445669904804, + "grad_norm": 1.6721718311309814, + "learning_rate": 7.596550292935236e-06, + "loss": 0.9495, + "step": 14818 + }, + { + "epoch": 0.7215932607795875, + "grad_norm": 1.7634365558624268, + "learning_rate": 7.594075954531437e-06, + "loss": 0.6903, + "step": 14819 + }, + { + "epoch": 0.7216419545686947, + "grad_norm": 1.6827003955841064, + "learning_rate": 7.5916019247261445e-06, + "loss": 0.8318, + "step": 14820 + }, + { + "epoch": 0.721690648357802, + "grad_norm": 1.3992369174957275, + "learning_rate": 7.589128203580869e-06, + "loss": 0.823, + "step": 14821 + }, + { + "epoch": 0.7217393421469092, + "grad_norm": 1.5765349864959717, + "learning_rate": 7.586654791157171e-06, + "loss": 0.8227, + "step": 14822 + }, + { + "epoch": 0.7217880359360164, + "grad_norm": 3.550936698913574, + "learning_rate": 7.584181687516565e-06, + "loss": 0.8146, + "step": 14823 + }, + { + "epoch": 0.7218367297251236, + "grad_norm": 1.796201467514038, + "learning_rate": 7.581708892720572e-06, + "loss": 0.7137, + "step": 14824 + }, + { + "epoch": 0.7218854235142308, + "grad_norm": 1.3338603973388672, + "learning_rate": 7.579236406830701e-06, + "loss": 0.7867, + "step": 14825 + }, + { + "epoch": 0.721934117303338, + "grad_norm": 2.056781053543091, + "learning_rate": 7.5767642299084595e-06, + "loss": 0.8721, + "step": 14826 + }, + { + "epoch": 0.7219828110924451, + "grad_norm": 1.5955703258514404, + "learning_rate": 7.574292362015339e-06, + "loss": 0.8765, + "step": 14827 + }, + { + "epoch": 0.7220315048815523, + "grad_norm": 1.2340675592422485, + "learning_rate": 7.5718208032128215e-06, + "loss": 0.8694, + "step": 14828 + }, + { + "epoch": 0.7220801986706595, + "grad_norm": 1.8760802745819092, + "learning_rate": 7.569349553562404e-06, + "loss": 0.8747, + "step": 14829 + }, + { + "epoch": 0.7221288924597667, + "grad_norm": 2.1510403156280518, + "learning_rate": 7.566878613125548e-06, + "loss": 0.8438, + "step": 14830 + }, + { + "epoch": 0.722177586248874, + "grad_norm": 1.5781627893447876, + "learning_rate": 7.564407981963719e-06, + "loss": 0.8665, + "step": 14831 + }, + { + "epoch": 0.7222262800379812, + "grad_norm": 3.378045082092285, + "learning_rate": 7.561937660138377e-06, + "loss": 0.8725, + "step": 14832 + }, + { + "epoch": 0.7222749738270884, + "grad_norm": 1.4037070274353027, + "learning_rate": 7.559467647710969e-06, + "loss": 0.764, + "step": 14833 + }, + { + "epoch": 0.7223236676161956, + "grad_norm": 1.8654452562332153, + "learning_rate": 7.5569979447429385e-06, + "loss": 0.7679, + "step": 14834 + }, + { + "epoch": 0.7223723614053027, + "grad_norm": 1.4282772541046143, + "learning_rate": 7.554528551295716e-06, + "loss": 0.8065, + "step": 14835 + }, + { + "epoch": 0.7224210551944099, + "grad_norm": 1.4146862030029297, + "learning_rate": 7.552059467430728e-06, + "loss": 0.7196, + "step": 14836 + }, + { + "epoch": 0.7224697489835171, + "grad_norm": 1.7251663208007812, + "learning_rate": 7.549590693209399e-06, + "loss": 0.8918, + "step": 14837 + }, + { + "epoch": 0.7225184427726243, + "grad_norm": 1.5580440759658813, + "learning_rate": 7.547122228693138e-06, + "loss": 0.8588, + "step": 14838 + }, + { + "epoch": 0.7225671365617315, + "grad_norm": 1.3266499042510986, + "learning_rate": 7.544654073943347e-06, + "loss": 0.8577, + "step": 14839 + }, + { + "epoch": 0.7226158303508388, + "grad_norm": 1.2899091243743896, + "learning_rate": 7.542186229021422e-06, + "loss": 0.8586, + "step": 14840 + }, + { + "epoch": 0.722664524139946, + "grad_norm": 1.9480241537094116, + "learning_rate": 7.539718693988749e-06, + "loss": 0.8078, + "step": 14841 + }, + { + "epoch": 0.7227132179290532, + "grad_norm": 1.649131417274475, + "learning_rate": 7.53725146890671e-06, + "loss": 0.7654, + "step": 14842 + }, + { + "epoch": 0.7227619117181604, + "grad_norm": 1.4027268886566162, + "learning_rate": 7.534784553836678e-06, + "loss": 0.7845, + "step": 14843 + }, + { + "epoch": 0.7228106055072675, + "grad_norm": 1.3094662427902222, + "learning_rate": 7.5323179488400155e-06, + "loss": 0.7915, + "step": 14844 + }, + { + "epoch": 0.7228592992963747, + "grad_norm": 1.780487298965454, + "learning_rate": 7.529851653978075e-06, + "loss": 0.9073, + "step": 14845 + }, + { + "epoch": 0.7229079930854819, + "grad_norm": 1.430214762687683, + "learning_rate": 7.527385669312226e-06, + "loss": 0.7583, + "step": 14846 + }, + { + "epoch": 0.7229566868745891, + "grad_norm": 1.3373193740844727, + "learning_rate": 7.524919994903779e-06, + "loss": 0.8322, + "step": 14847 + }, + { + "epoch": 0.7230053806636964, + "grad_norm": 1.639973759651184, + "learning_rate": 7.522454630814093e-06, + "loss": 0.734, + "step": 14848 + }, + { + "epoch": 0.7230540744528036, + "grad_norm": 1.3110140562057495, + "learning_rate": 7.519989577104487e-06, + "loss": 0.8273, + "step": 14849 + }, + { + "epoch": 0.7231027682419108, + "grad_norm": 1.3598320484161377, + "learning_rate": 7.517524833836276e-06, + "loss": 0.8583, + "step": 14850 + }, + { + "epoch": 0.723151462031018, + "grad_norm": 1.389695405960083, + "learning_rate": 7.515060401070775e-06, + "loss": 0.774, + "step": 14851 + }, + { + "epoch": 0.7232001558201251, + "grad_norm": 1.4382543563842773, + "learning_rate": 7.512596278869278e-06, + "loss": 0.7906, + "step": 14852 + }, + { + "epoch": 0.7232488496092323, + "grad_norm": 1.9328856468200684, + "learning_rate": 7.510132467293098e-06, + "loss": 0.9111, + "step": 14853 + }, + { + "epoch": 0.7232975433983395, + "grad_norm": 1.434834599494934, + "learning_rate": 7.507668966403501e-06, + "loss": 0.8031, + "step": 14854 + }, + { + "epoch": 0.7233462371874467, + "grad_norm": 3.3016445636749268, + "learning_rate": 7.50520577626179e-06, + "loss": 0.8169, + "step": 14855 + }, + { + "epoch": 0.7233949309765539, + "grad_norm": 1.89363694190979, + "learning_rate": 7.5027428969292095e-06, + "loss": 0.7286, + "step": 14856 + }, + { + "epoch": 0.7234436247656612, + "grad_norm": 2.368297576904297, + "learning_rate": 7.500280328467047e-06, + "loss": 0.7504, + "step": 14857 + }, + { + "epoch": 0.7234923185547684, + "grad_norm": 2.07541823387146, + "learning_rate": 7.497818070936551e-06, + "loss": 0.8639, + "step": 14858 + }, + { + "epoch": 0.7235410123438756, + "grad_norm": 1.3882780075073242, + "learning_rate": 7.495356124398965e-06, + "loss": 0.7256, + "step": 14859 + }, + { + "epoch": 0.7235897061329828, + "grad_norm": 1.2948145866394043, + "learning_rate": 7.492894488915547e-06, + "loss": 0.89, + "step": 14860 + }, + { + "epoch": 0.7236383999220899, + "grad_norm": 1.3394083976745605, + "learning_rate": 7.490433164547506e-06, + "loss": 0.8143, + "step": 14861 + }, + { + "epoch": 0.7236870937111971, + "grad_norm": 1.3887343406677246, + "learning_rate": 7.487972151356096e-06, + "loss": 0.7907, + "step": 14862 + }, + { + "epoch": 0.7237357875003043, + "grad_norm": 2.0340192317962646, + "learning_rate": 7.4855114494025025e-06, + "loss": 0.9081, + "step": 14863 + }, + { + "epoch": 0.7237844812894115, + "grad_norm": 1.2327394485473633, + "learning_rate": 7.483051058747961e-06, + "loss": 0.7603, + "step": 14864 + }, + { + "epoch": 0.7238331750785187, + "grad_norm": 1.46929132938385, + "learning_rate": 7.480590979453665e-06, + "loss": 0.9323, + "step": 14865 + }, + { + "epoch": 0.723881868867626, + "grad_norm": 1.390093445777893, + "learning_rate": 7.4781312115808125e-06, + "loss": 0.747, + "step": 14866 + }, + { + "epoch": 0.7239305626567332, + "grad_norm": 1.572824478149414, + "learning_rate": 7.475671755190585e-06, + "loss": 0.7966, + "step": 14867 + }, + { + "epoch": 0.7239792564458404, + "grad_norm": 1.4395509958267212, + "learning_rate": 7.47321261034416e-06, + "loss": 0.8689, + "step": 14868 + }, + { + "epoch": 0.7240279502349475, + "grad_norm": 1.3306339979171753, + "learning_rate": 7.4707537771027265e-06, + "loss": 0.8407, + "step": 14869 + }, + { + "epoch": 0.7240766440240547, + "grad_norm": 0.09498634189367294, + "learning_rate": 7.46829525552742e-06, + "loss": 0.5725, + "step": 14870 + }, + { + "epoch": 0.7241253378131619, + "grad_norm": 1.225325584411621, + "learning_rate": 7.465837045679427e-06, + "loss": 0.7882, + "step": 14871 + }, + { + "epoch": 0.7241740316022691, + "grad_norm": 1.5998070240020752, + "learning_rate": 7.463379147619865e-06, + "loss": 0.8321, + "step": 14872 + }, + { + "epoch": 0.7242227253913763, + "grad_norm": 1.6585185527801514, + "learning_rate": 7.460921561409898e-06, + "loss": 0.8268, + "step": 14873 + }, + { + "epoch": 0.7242714191804835, + "grad_norm": 15.678585052490234, + "learning_rate": 7.458464287110649e-06, + "loss": 0.8563, + "step": 14874 + }, + { + "epoch": 0.7243201129695908, + "grad_norm": 1.9735769033432007, + "learning_rate": 7.456007324783246e-06, + "loss": 0.8109, + "step": 14875 + }, + { + "epoch": 0.724368806758698, + "grad_norm": 2.61737322807312, + "learning_rate": 7.453550674488803e-06, + "loss": 0.8151, + "step": 14876 + }, + { + "epoch": 0.7244175005478052, + "grad_norm": 1.4428846836090088, + "learning_rate": 7.451094336288424e-06, + "loss": 0.8226, + "step": 14877 + }, + { + "epoch": 0.7244661943369123, + "grad_norm": 1.3099479675292969, + "learning_rate": 7.448638310243233e-06, + "loss": 0.7996, + "step": 14878 + }, + { + "epoch": 0.7245148881260195, + "grad_norm": 1.4778112173080444, + "learning_rate": 7.446182596414293e-06, + "loss": 0.8573, + "step": 14879 + }, + { + "epoch": 0.7245635819151267, + "grad_norm": 1.8036890029907227, + "learning_rate": 7.443727194862713e-06, + "loss": 0.8278, + "step": 14880 + }, + { + "epoch": 0.7246122757042339, + "grad_norm": 2.0071964263916016, + "learning_rate": 7.441272105649564e-06, + "loss": 0.7509, + "step": 14881 + }, + { + "epoch": 0.7246609694933411, + "grad_norm": 0.09118019789457321, + "learning_rate": 7.4388173288359165e-06, + "loss": 0.6077, + "step": 14882 + }, + { + "epoch": 0.7247096632824483, + "grad_norm": 2.5116970539093018, + "learning_rate": 7.436362864482834e-06, + "loss": 0.8484, + "step": 14883 + }, + { + "epoch": 0.7247583570715556, + "grad_norm": 2.026165246963501, + "learning_rate": 7.4339087126513695e-06, + "loss": 0.8525, + "step": 14884 + }, + { + "epoch": 0.7248070508606628, + "grad_norm": 1.4103952646255493, + "learning_rate": 7.431454873402573e-06, + "loss": 0.7888, + "step": 14885 + }, + { + "epoch": 0.7248557446497699, + "grad_norm": 1.261334776878357, + "learning_rate": 7.429001346797473e-06, + "loss": 0.8041, + "step": 14886 + }, + { + "epoch": 0.7249044384388771, + "grad_norm": 3.22599458694458, + "learning_rate": 7.42654813289712e-06, + "loss": 0.8628, + "step": 14887 + }, + { + "epoch": 0.7249531322279843, + "grad_norm": 1.722968578338623, + "learning_rate": 7.424095231762527e-06, + "loss": 0.757, + "step": 14888 + }, + { + "epoch": 0.7250018260170915, + "grad_norm": 1.41139554977417, + "learning_rate": 7.421642643454712e-06, + "loss": 0.7755, + "step": 14889 + }, + { + "epoch": 0.7250505198061987, + "grad_norm": 1.7162084579467773, + "learning_rate": 7.419190368034683e-06, + "loss": 0.8605, + "step": 14890 + }, + { + "epoch": 0.7250992135953059, + "grad_norm": 1.797309398651123, + "learning_rate": 7.416738405563442e-06, + "loss": 0.8548, + "step": 14891 + }, + { + "epoch": 0.7251479073844131, + "grad_norm": 2.798595905303955, + "learning_rate": 7.41428675610198e-06, + "loss": 0.8158, + "step": 14892 + }, + { + "epoch": 0.7251966011735204, + "grad_norm": 1.4007205963134766, + "learning_rate": 7.411835419711282e-06, + "loss": 0.7986, + "step": 14893 + }, + { + "epoch": 0.7252452949626275, + "grad_norm": 1.6571595668792725, + "learning_rate": 7.409384396452319e-06, + "loss": 0.7773, + "step": 14894 + }, + { + "epoch": 0.7252939887517347, + "grad_norm": 2.435126781463623, + "learning_rate": 7.40693368638608e-06, + "loss": 0.7867, + "step": 14895 + }, + { + "epoch": 0.7253426825408419, + "grad_norm": 1.630052089691162, + "learning_rate": 7.4044832895735004e-06, + "loss": 0.8947, + "step": 14896 + }, + { + "epoch": 0.7253913763299491, + "grad_norm": 1.264371395111084, + "learning_rate": 7.402033206075556e-06, + "loss": 0.8049, + "step": 14897 + }, + { + "epoch": 0.7254400701190563, + "grad_norm": 1.9647654294967651, + "learning_rate": 7.399583435953182e-06, + "loss": 0.7935, + "step": 14898 + }, + { + "epoch": 0.7254887639081635, + "grad_norm": 0.09289123862981796, + "learning_rate": 7.397133979267319e-06, + "loss": 0.6346, + "step": 14899 + }, + { + "epoch": 0.7255374576972707, + "grad_norm": 1.3567851781845093, + "learning_rate": 7.394684836078898e-06, + "loss": 0.7827, + "step": 14900 + }, + { + "epoch": 0.725586151486378, + "grad_norm": 1.6917433738708496, + "learning_rate": 7.392236006448843e-06, + "loss": 0.8445, + "step": 14901 + }, + { + "epoch": 0.7256348452754852, + "grad_norm": 1.9165822267532349, + "learning_rate": 7.389787490438063e-06, + "loss": 0.7544, + "step": 14902 + }, + { + "epoch": 0.7256835390645923, + "grad_norm": 1.9722651243209839, + "learning_rate": 7.3873392881074645e-06, + "loss": 0.8205, + "step": 14903 + }, + { + "epoch": 0.7257322328536995, + "grad_norm": 1.5737049579620361, + "learning_rate": 7.384891399517964e-06, + "loss": 0.7217, + "step": 14904 + }, + { + "epoch": 0.7257809266428067, + "grad_norm": 1.5765011310577393, + "learning_rate": 7.382443824730427e-06, + "loss": 0.8191, + "step": 14905 + }, + { + "epoch": 0.7258296204319139, + "grad_norm": 3.123566150665283, + "learning_rate": 7.379996563805758e-06, + "loss": 0.9158, + "step": 14906 + }, + { + "epoch": 0.7258783142210211, + "grad_norm": 3.028517961502075, + "learning_rate": 7.377549616804824e-06, + "loss": 0.7786, + "step": 14907 + }, + { + "epoch": 0.7259270080101283, + "grad_norm": 1.5807678699493408, + "learning_rate": 7.375102983788494e-06, + "loss": 0.8577, + "step": 14908 + }, + { + "epoch": 0.7259757017992355, + "grad_norm": 1.788793921470642, + "learning_rate": 7.372656664817628e-06, + "loss": 0.7644, + "step": 14909 + }, + { + "epoch": 0.7260243955883428, + "grad_norm": 1.4615191221237183, + "learning_rate": 7.370210659953072e-06, + "loss": 0.8683, + "step": 14910 + }, + { + "epoch": 0.7260730893774499, + "grad_norm": 1.4178099632263184, + "learning_rate": 7.367764969255693e-06, + "loss": 0.8992, + "step": 14911 + }, + { + "epoch": 0.7261217831665571, + "grad_norm": 2.093153953552246, + "learning_rate": 7.365319592786298e-06, + "loss": 0.827, + "step": 14912 + }, + { + "epoch": 0.7261704769556643, + "grad_norm": 1.9559882879257202, + "learning_rate": 7.362874530605741e-06, + "loss": 0.8206, + "step": 14913 + }, + { + "epoch": 0.7262191707447715, + "grad_norm": 1.3987089395523071, + "learning_rate": 7.3604297827748205e-06, + "loss": 0.8144, + "step": 14914 + }, + { + "epoch": 0.7262678645338787, + "grad_norm": 1.5396333932876587, + "learning_rate": 7.357985349354369e-06, + "loss": 0.8316, + "step": 14915 + }, + { + "epoch": 0.7263165583229859, + "grad_norm": 1.3266793489456177, + "learning_rate": 7.355541230405183e-06, + "loss": 0.7703, + "step": 14916 + }, + { + "epoch": 0.7263652521120931, + "grad_norm": 1.4733924865722656, + "learning_rate": 7.353097425988056e-06, + "loss": 0.7568, + "step": 14917 + }, + { + "epoch": 0.7264139459012003, + "grad_norm": 3.704263925552368, + "learning_rate": 7.350653936163799e-06, + "loss": 0.7659, + "step": 14918 + }, + { + "epoch": 0.7264626396903076, + "grad_norm": 2.2084548473358154, + "learning_rate": 7.348210760993162e-06, + "loss": 0.7976, + "step": 14919 + }, + { + "epoch": 0.7265113334794147, + "grad_norm": 1.382441520690918, + "learning_rate": 7.345767900536951e-06, + "loss": 0.8825, + "step": 14920 + }, + { + "epoch": 0.7265600272685219, + "grad_norm": 2.5393457412719727, + "learning_rate": 7.343325354855904e-06, + "loss": 0.8552, + "step": 14921 + }, + { + "epoch": 0.7266087210576291, + "grad_norm": 1.3219901323318481, + "learning_rate": 7.3408831240107985e-06, + "loss": 0.8577, + "step": 14922 + }, + { + "epoch": 0.7266574148467363, + "grad_norm": 1.292784571647644, + "learning_rate": 7.338441208062379e-06, + "loss": 0.8847, + "step": 14923 + }, + { + "epoch": 0.7267061086358435, + "grad_norm": 1.7086169719696045, + "learning_rate": 7.335999607071391e-06, + "loss": 0.7827, + "step": 14924 + }, + { + "epoch": 0.7267548024249507, + "grad_norm": 2.2970404624938965, + "learning_rate": 7.333558321098566e-06, + "loss": 0.8875, + "step": 14925 + }, + { + "epoch": 0.7268034962140579, + "grad_norm": 1.7504727840423584, + "learning_rate": 7.331117350204626e-06, + "loss": 0.8741, + "step": 14926 + }, + { + "epoch": 0.7268521900031651, + "grad_norm": 2.706451177597046, + "learning_rate": 7.32867669445031e-06, + "loss": 0.7462, + "step": 14927 + }, + { + "epoch": 0.7269008837922722, + "grad_norm": 2.7241077423095703, + "learning_rate": 7.3262363538963035e-06, + "loss": 0.7917, + "step": 14928 + }, + { + "epoch": 0.7269495775813795, + "grad_norm": 1.7306709289550781, + "learning_rate": 7.323796328603337e-06, + "loss": 0.8611, + "step": 14929 + }, + { + "epoch": 0.7269982713704867, + "grad_norm": 1.9419492483139038, + "learning_rate": 7.321356618632079e-06, + "loss": 0.8154, + "step": 14930 + }, + { + "epoch": 0.7270469651595939, + "grad_norm": 2.8213319778442383, + "learning_rate": 7.318917224043239e-06, + "loss": 0.7899, + "step": 14931 + }, + { + "epoch": 0.7270956589487011, + "grad_norm": 2.524745225906372, + "learning_rate": 7.316478144897488e-06, + "loss": 0.8704, + "step": 14932 + }, + { + "epoch": 0.7271443527378083, + "grad_norm": 1.5440688133239746, + "learning_rate": 7.3140393812555e-06, + "loss": 0.8308, + "step": 14933 + }, + { + "epoch": 0.7271930465269155, + "grad_norm": 2.037491798400879, + "learning_rate": 7.311600933177938e-06, + "loss": 0.8571, + "step": 14934 + }, + { + "epoch": 0.7272417403160227, + "grad_norm": 1.343928337097168, + "learning_rate": 7.30916280072546e-06, + "loss": 0.8893, + "step": 14935 + }, + { + "epoch": 0.72729043410513, + "grad_norm": 1.5630608797073364, + "learning_rate": 7.3067249839587155e-06, + "loss": 0.7849, + "step": 14936 + }, + { + "epoch": 0.727339127894237, + "grad_norm": 1.9118989706039429, + "learning_rate": 7.304287482938337e-06, + "loss": 0.7004, + "step": 14937 + }, + { + "epoch": 0.7273878216833443, + "grad_norm": 1.4558448791503906, + "learning_rate": 7.30185029772497e-06, + "loss": 0.8068, + "step": 14938 + }, + { + "epoch": 0.7274365154724515, + "grad_norm": 1.4386444091796875, + "learning_rate": 7.299413428379239e-06, + "loss": 0.8144, + "step": 14939 + }, + { + "epoch": 0.7274852092615587, + "grad_norm": 1.743067741394043, + "learning_rate": 7.296976874961752e-06, + "loss": 0.9082, + "step": 14940 + }, + { + "epoch": 0.7275339030506659, + "grad_norm": 1.2704148292541504, + "learning_rate": 7.294540637533128e-06, + "loss": 0.8386, + "step": 14941 + }, + { + "epoch": 0.7275825968397731, + "grad_norm": 1.6369305849075317, + "learning_rate": 7.292104716153961e-06, + "loss": 0.7326, + "step": 14942 + }, + { + "epoch": 0.7276312906288803, + "grad_norm": 1.6425193548202515, + "learning_rate": 7.289669110884847e-06, + "loss": 0.8407, + "step": 14943 + }, + { + "epoch": 0.7276799844179875, + "grad_norm": 1.6973395347595215, + "learning_rate": 7.2872338217863744e-06, + "loss": 0.8579, + "step": 14944 + }, + { + "epoch": 0.7277286782070946, + "grad_norm": 2.5732076168060303, + "learning_rate": 7.284798848919115e-06, + "loss": 0.7928, + "step": 14945 + }, + { + "epoch": 0.7277773719962018, + "grad_norm": 1.3815407752990723, + "learning_rate": 7.282364192343647e-06, + "loss": 0.8438, + "step": 14946 + }, + { + "epoch": 0.7278260657853091, + "grad_norm": 1.7769057750701904, + "learning_rate": 7.279929852120531e-06, + "loss": 0.7138, + "step": 14947 + }, + { + "epoch": 0.7278747595744163, + "grad_norm": 1.4274131059646606, + "learning_rate": 7.277495828310319e-06, + "loss": 0.7566, + "step": 14948 + }, + { + "epoch": 0.7279234533635235, + "grad_norm": 1.2672770023345947, + "learning_rate": 7.2750621209735575e-06, + "loss": 0.6916, + "step": 14949 + }, + { + "epoch": 0.7279721471526307, + "grad_norm": 1.730214238166809, + "learning_rate": 7.2726287301707875e-06, + "loss": 0.7858, + "step": 14950 + }, + { + "epoch": 0.7280208409417379, + "grad_norm": 1.5910388231277466, + "learning_rate": 7.270195655962537e-06, + "loss": 0.8716, + "step": 14951 + }, + { + "epoch": 0.7280695347308451, + "grad_norm": 1.264693021774292, + "learning_rate": 7.267762898409332e-06, + "loss": 0.7913, + "step": 14952 + }, + { + "epoch": 0.7281182285199522, + "grad_norm": 1.622951626777649, + "learning_rate": 7.265330457571684e-06, + "loss": 0.8136, + "step": 14953 + }, + { + "epoch": 0.7281669223090594, + "grad_norm": 1.8433270454406738, + "learning_rate": 7.262898333510095e-06, + "loss": 0.8537, + "step": 14954 + }, + { + "epoch": 0.7282156160981667, + "grad_norm": 1.4243559837341309, + "learning_rate": 7.260466526285086e-06, + "loss": 0.7858, + "step": 14955 + }, + { + "epoch": 0.7282643098872739, + "grad_norm": 2.180795192718506, + "learning_rate": 7.258035035957121e-06, + "loss": 0.8477, + "step": 14956 + }, + { + "epoch": 0.7283130036763811, + "grad_norm": 1.4238917827606201, + "learning_rate": 7.2556038625867e-06, + "loss": 0.8246, + "step": 14957 + }, + { + "epoch": 0.7283616974654883, + "grad_norm": 1.4582208395004272, + "learning_rate": 7.253173006234298e-06, + "loss": 0.7049, + "step": 14958 + }, + { + "epoch": 0.7284103912545955, + "grad_norm": 1.6496093273162842, + "learning_rate": 7.2507424669603765e-06, + "loss": 0.7861, + "step": 14959 + }, + { + "epoch": 0.7284590850437027, + "grad_norm": 1.375169038772583, + "learning_rate": 7.248312244825402e-06, + "loss": 0.8712, + "step": 14960 + }, + { + "epoch": 0.7285077788328099, + "grad_norm": 1.8359520435333252, + "learning_rate": 7.245882339889814e-06, + "loss": 0.8437, + "step": 14961 + }, + { + "epoch": 0.728556472621917, + "grad_norm": 1.7355960607528687, + "learning_rate": 7.24345275221408e-06, + "loss": 0.8289, + "step": 14962 + }, + { + "epoch": 0.7286051664110242, + "grad_norm": 1.3740555047988892, + "learning_rate": 7.241023481858609e-06, + "loss": 0.7115, + "step": 14963 + }, + { + "epoch": 0.7286538602001315, + "grad_norm": 1.4314332008361816, + "learning_rate": 7.2385945288838536e-06, + "loss": 0.8393, + "step": 14964 + }, + { + "epoch": 0.7287025539892387, + "grad_norm": 1.5908167362213135, + "learning_rate": 7.236165893350211e-06, + "loss": 0.7697, + "step": 14965 + }, + { + "epoch": 0.7287512477783459, + "grad_norm": 1.8691414594650269, + "learning_rate": 7.233737575318111e-06, + "loss": 0.8349, + "step": 14966 + }, + { + "epoch": 0.7287999415674531, + "grad_norm": 2.8436014652252197, + "learning_rate": 7.231309574847955e-06, + "loss": 0.7358, + "step": 14967 + }, + { + "epoch": 0.7288486353565603, + "grad_norm": 1.7180383205413818, + "learning_rate": 7.22888189200013e-06, + "loss": 0.7868, + "step": 14968 + }, + { + "epoch": 0.7288973291456675, + "grad_norm": 1.2967554330825806, + "learning_rate": 7.226454526835045e-06, + "loss": 0.8342, + "step": 14969 + }, + { + "epoch": 0.7289460229347746, + "grad_norm": 1.3776543140411377, + "learning_rate": 7.224027479413056e-06, + "loss": 0.7448, + "step": 14970 + }, + { + "epoch": 0.7289947167238818, + "grad_norm": 1.7917391061782837, + "learning_rate": 7.2216007497945615e-06, + "loss": 0.7139, + "step": 14971 + }, + { + "epoch": 0.729043410512989, + "grad_norm": 1.2940946817398071, + "learning_rate": 7.219174338039898e-06, + "loss": 0.7449, + "step": 14972 + }, + { + "epoch": 0.7290921043020963, + "grad_norm": 1.2259427309036255, + "learning_rate": 7.216748244209446e-06, + "loss": 0.8143, + "step": 14973 + }, + { + "epoch": 0.7291407980912035, + "grad_norm": 1.1277568340301514, + "learning_rate": 7.2143224683635485e-06, + "loss": 0.7505, + "step": 14974 + }, + { + "epoch": 0.7291894918803107, + "grad_norm": 1.498226523399353, + "learning_rate": 7.211897010562545e-06, + "loss": 0.8598, + "step": 14975 + }, + { + "epoch": 0.7292381856694179, + "grad_norm": 1.4315576553344727, + "learning_rate": 7.209471870866769e-06, + "loss": 0.8245, + "step": 14976 + }, + { + "epoch": 0.7292868794585251, + "grad_norm": 1.2298659086227417, + "learning_rate": 7.207047049336538e-06, + "loss": 0.8054, + "step": 14977 + }, + { + "epoch": 0.7293355732476323, + "grad_norm": 1.3882017135620117, + "learning_rate": 7.204622546032194e-06, + "loss": 0.8206, + "step": 14978 + }, + { + "epoch": 0.7293842670367394, + "grad_norm": 1.4457340240478516, + "learning_rate": 7.202198361014015e-06, + "loss": 0.7563, + "step": 14979 + }, + { + "epoch": 0.7294329608258466, + "grad_norm": 2.2774124145507812, + "learning_rate": 7.199774494342327e-06, + "loss": 0.8497, + "step": 14980 + }, + { + "epoch": 0.7294816546149538, + "grad_norm": 0.09484075754880905, + "learning_rate": 7.197350946077415e-06, + "loss": 0.5367, + "step": 14981 + }, + { + "epoch": 0.7295303484040611, + "grad_norm": 2.1036829948425293, + "learning_rate": 7.194927716279565e-06, + "loss": 0.7938, + "step": 14982 + }, + { + "epoch": 0.7295790421931683, + "grad_norm": 1.4612994194030762, + "learning_rate": 7.192504805009055e-06, + "loss": 0.8176, + "step": 14983 + }, + { + "epoch": 0.7296277359822755, + "grad_norm": 1.7518305778503418, + "learning_rate": 7.190082212326157e-06, + "loss": 0.9005, + "step": 14984 + }, + { + "epoch": 0.7296764297713827, + "grad_norm": 1.528085470199585, + "learning_rate": 7.187659938291132e-06, + "loss": 0.7798, + "step": 14985 + }, + { + "epoch": 0.7297251235604899, + "grad_norm": 2.728140115737915, + "learning_rate": 7.185237982964229e-06, + "loss": 0.8039, + "step": 14986 + }, + { + "epoch": 0.729773817349597, + "grad_norm": 1.4976308345794678, + "learning_rate": 7.18281634640571e-06, + "loss": 0.8572, + "step": 14987 + }, + { + "epoch": 0.7298225111387042, + "grad_norm": 1.8592790365219116, + "learning_rate": 7.180395028675791e-06, + "loss": 0.7423, + "step": 14988 + }, + { + "epoch": 0.7298712049278114, + "grad_norm": 1.7558653354644775, + "learning_rate": 7.17797402983472e-06, + "loss": 0.7885, + "step": 14989 + }, + { + "epoch": 0.7299198987169186, + "grad_norm": 2.8941428661346436, + "learning_rate": 7.175553349942717e-06, + "loss": 0.8773, + "step": 14990 + }, + { + "epoch": 0.7299685925060259, + "grad_norm": 1.3620774745941162, + "learning_rate": 7.1731329890599895e-06, + "loss": 0.8854, + "step": 14991 + }, + { + "epoch": 0.7300172862951331, + "grad_norm": 1.676720142364502, + "learning_rate": 7.170712947246752e-06, + "loss": 0.8806, + "step": 14992 + }, + { + "epoch": 0.7300659800842403, + "grad_norm": 1.552276611328125, + "learning_rate": 7.168293224563199e-06, + "loss": 0.8912, + "step": 14993 + }, + { + "epoch": 0.7301146738733475, + "grad_norm": 1.7701646089553833, + "learning_rate": 7.16587382106952e-06, + "loss": 0.7738, + "step": 14994 + }, + { + "epoch": 0.7301633676624546, + "grad_norm": 1.389944076538086, + "learning_rate": 7.163454736825895e-06, + "loss": 0.8284, + "step": 14995 + }, + { + "epoch": 0.7302120614515618, + "grad_norm": 1.2113746404647827, + "learning_rate": 7.161035971892509e-06, + "loss": 0.7551, + "step": 14996 + }, + { + "epoch": 0.730260755240669, + "grad_norm": 1.7829550504684448, + "learning_rate": 7.158617526329525e-06, + "loss": 0.8754, + "step": 14997 + }, + { + "epoch": 0.7303094490297762, + "grad_norm": 1.4768019914627075, + "learning_rate": 7.156199400197102e-06, + "loss": 0.7552, + "step": 14998 + }, + { + "epoch": 0.7303581428188834, + "grad_norm": 1.3851937055587769, + "learning_rate": 7.153781593555389e-06, + "loss": 0.8515, + "step": 14999 + }, + { + "epoch": 0.7304068366079907, + "grad_norm": 1.5576000213623047, + "learning_rate": 7.151364106464529e-06, + "loss": 0.8838, + "step": 15000 + }, + { + "epoch": 0.7304555303970979, + "grad_norm": 1.6230772733688354, + "learning_rate": 7.14894693898466e-06, + "loss": 0.8203, + "step": 15001 + }, + { + "epoch": 0.7305042241862051, + "grad_norm": 1.5797632932662964, + "learning_rate": 7.146530091175907e-06, + "loss": 0.8861, + "step": 15002 + }, + { + "epoch": 0.7305529179753123, + "grad_norm": 1.667803168296814, + "learning_rate": 7.144113563098383e-06, + "loss": 0.942, + "step": 15003 + }, + { + "epoch": 0.7306016117644194, + "grad_norm": 1.2657690048217773, + "learning_rate": 7.14169735481222e-06, + "loss": 0.783, + "step": 15004 + }, + { + "epoch": 0.7306503055535266, + "grad_norm": 2.0627083778381348, + "learning_rate": 7.139281466377492e-06, + "loss": 0.8598, + "step": 15005 + }, + { + "epoch": 0.7306989993426338, + "grad_norm": 1.4363294839859009, + "learning_rate": 7.13686589785432e-06, + "loss": 0.8183, + "step": 15006 + }, + { + "epoch": 0.730747693131741, + "grad_norm": 2.073975086212158, + "learning_rate": 7.13445064930278e-06, + "loss": 0.8229, + "step": 15007 + }, + { + "epoch": 0.7307963869208483, + "grad_norm": 1.3885220289230347, + "learning_rate": 7.1320357207829545e-06, + "loss": 0.8806, + "step": 15008 + }, + { + "epoch": 0.7308450807099555, + "grad_norm": 1.4155977964401245, + "learning_rate": 7.129621112354912e-06, + "loss": 0.7703, + "step": 15009 + }, + { + "epoch": 0.7308937744990627, + "grad_norm": 1.3075520992279053, + "learning_rate": 7.1272068240787185e-06, + "loss": 0.8201, + "step": 15010 + }, + { + "epoch": 0.7309424682881699, + "grad_norm": 1.6379157304763794, + "learning_rate": 7.124792856014429e-06, + "loss": 0.8533, + "step": 15011 + }, + { + "epoch": 0.730991162077277, + "grad_norm": 1.326612949371338, + "learning_rate": 7.122379208222086e-06, + "loss": 0.8772, + "step": 15012 + }, + { + "epoch": 0.7310398558663842, + "grad_norm": 1.5578696727752686, + "learning_rate": 7.1199658807617455e-06, + "loss": 0.8909, + "step": 15013 + }, + { + "epoch": 0.7310885496554914, + "grad_norm": 1.403019666671753, + "learning_rate": 7.117552873693414e-06, + "loss": 0.7872, + "step": 15014 + }, + { + "epoch": 0.7311372434445986, + "grad_norm": 1.7809882164001465, + "learning_rate": 7.1151401870771385e-06, + "loss": 0.8172, + "step": 15015 + }, + { + "epoch": 0.7311859372337058, + "grad_norm": 1.6371588706970215, + "learning_rate": 7.112727820972924e-06, + "loss": 0.853, + "step": 15016 + }, + { + "epoch": 0.731234631022813, + "grad_norm": 0.10256300866603851, + "learning_rate": 7.1103157754407795e-06, + "loss": 0.6059, + "step": 15017 + }, + { + "epoch": 0.7312833248119203, + "grad_norm": 1.4176099300384521, + "learning_rate": 7.107904050540706e-06, + "loss": 0.7753, + "step": 15018 + }, + { + "epoch": 0.7313320186010275, + "grad_norm": 1.8424426317214966, + "learning_rate": 7.10549264633269e-06, + "loss": 0.8083, + "step": 15019 + }, + { + "epoch": 0.7313807123901347, + "grad_norm": 1.621601939201355, + "learning_rate": 7.10308156287673e-06, + "loss": 0.8046, + "step": 15020 + }, + { + "epoch": 0.7314294061792418, + "grad_norm": 1.8500858545303345, + "learning_rate": 7.10067080023278e-06, + "loss": 0.8828, + "step": 15021 + }, + { + "epoch": 0.731478099968349, + "grad_norm": 1.5949301719665527, + "learning_rate": 7.098260358460836e-06, + "loss": 0.8041, + "step": 15022 + }, + { + "epoch": 0.7315267937574562, + "grad_norm": 0.0961434617638588, + "learning_rate": 7.095850237620825e-06, + "loss": 0.5744, + "step": 15023 + }, + { + "epoch": 0.7315754875465634, + "grad_norm": 1.7489850521087646, + "learning_rate": 7.093440437772725e-06, + "loss": 0.7482, + "step": 15024 + }, + { + "epoch": 0.7316241813356706, + "grad_norm": 1.6096303462982178, + "learning_rate": 7.091030958976468e-06, + "loss": 0.7483, + "step": 15025 + }, + { + "epoch": 0.7316728751247779, + "grad_norm": 1.4685988426208496, + "learning_rate": 7.088621801291988e-06, + "loss": 0.8442, + "step": 15026 + }, + { + "epoch": 0.7317215689138851, + "grad_norm": 3.121648073196411, + "learning_rate": 7.086212964779231e-06, + "loss": 0.8459, + "step": 15027 + }, + { + "epoch": 0.7317702627029923, + "grad_norm": 1.5160149335861206, + "learning_rate": 7.08380444949809e-06, + "loss": 0.7836, + "step": 15028 + }, + { + "epoch": 0.7318189564920994, + "grad_norm": 1.6577388048171997, + "learning_rate": 7.081396255508508e-06, + "loss": 0.8133, + "step": 15029 + }, + { + "epoch": 0.7318676502812066, + "grad_norm": 3.7903871536254883, + "learning_rate": 7.078988382870355e-06, + "loss": 0.8315, + "step": 15030 + }, + { + "epoch": 0.7319163440703138, + "grad_norm": 1.4617736339569092, + "learning_rate": 7.076580831643552e-06, + "loss": 0.7917, + "step": 15031 + }, + { + "epoch": 0.731965037859421, + "grad_norm": 1.1154453754425049, + "learning_rate": 7.074173601887977e-06, + "loss": 0.8006, + "step": 15032 + }, + { + "epoch": 0.7320137316485282, + "grad_norm": 1.376312255859375, + "learning_rate": 7.071766693663516e-06, + "loss": 0.8055, + "step": 15033 + }, + { + "epoch": 0.7320624254376354, + "grad_norm": 2.5022389888763428, + "learning_rate": 7.069360107030035e-06, + "loss": 0.8354, + "step": 15034 + }, + { + "epoch": 0.7321111192267427, + "grad_norm": 3.3672032356262207, + "learning_rate": 7.066953842047393e-06, + "loss": 0.7476, + "step": 15035 + }, + { + "epoch": 0.7321598130158499, + "grad_norm": 1.6313072443008423, + "learning_rate": 7.064547898775467e-06, + "loss": 0.7605, + "step": 15036 + }, + { + "epoch": 0.7322085068049571, + "grad_norm": 2.6413400173187256, + "learning_rate": 7.0621422772740775e-06, + "loss": 0.7953, + "step": 15037 + }, + { + "epoch": 0.7322572005940642, + "grad_norm": 1.4190428256988525, + "learning_rate": 7.0597369776030935e-06, + "loss": 0.7437, + "step": 15038 + }, + { + "epoch": 0.7323058943831714, + "grad_norm": 2.2345998287200928, + "learning_rate": 7.057331999822319e-06, + "loss": 0.8296, + "step": 15039 + }, + { + "epoch": 0.7323545881722786, + "grad_norm": 1.625112771987915, + "learning_rate": 7.054927343991596e-06, + "loss": 0.7966, + "step": 15040 + }, + { + "epoch": 0.7324032819613858, + "grad_norm": 1.5709959268569946, + "learning_rate": 7.052523010170736e-06, + "loss": 0.7466, + "step": 15041 + }, + { + "epoch": 0.732451975750493, + "grad_norm": 1.4891061782836914, + "learning_rate": 7.050118998419547e-06, + "loss": 0.8734, + "step": 15042 + }, + { + "epoch": 0.7325006695396002, + "grad_norm": 1.5672264099121094, + "learning_rate": 7.047715308797829e-06, + "loss": 0.7929, + "step": 15043 + }, + { + "epoch": 0.7325493633287075, + "grad_norm": 1.2560451030731201, + "learning_rate": 7.045311941365374e-06, + "loss": 0.7597, + "step": 15044 + }, + { + "epoch": 0.7325980571178147, + "grad_norm": 2.8652842044830322, + "learning_rate": 7.042908896181964e-06, + "loss": 0.7461, + "step": 15045 + }, + { + "epoch": 0.7326467509069218, + "grad_norm": 1.3439785242080688, + "learning_rate": 7.040506173307371e-06, + "loss": 0.7179, + "step": 15046 + }, + { + "epoch": 0.732695444696029, + "grad_norm": 2.8243966102600098, + "learning_rate": 7.038103772801375e-06, + "loss": 0.8274, + "step": 15047 + }, + { + "epoch": 0.7327441384851362, + "grad_norm": 2.0938849449157715, + "learning_rate": 7.0357016947237285e-06, + "loss": 0.8107, + "step": 15048 + }, + { + "epoch": 0.7327928322742434, + "grad_norm": 1.6997531652450562, + "learning_rate": 7.033299939134186e-06, + "loss": 0.761, + "step": 15049 + }, + { + "epoch": 0.7328415260633506, + "grad_norm": 1.658220648765564, + "learning_rate": 7.030898506092489e-06, + "loss": 0.7547, + "step": 15050 + }, + { + "epoch": 0.7328902198524578, + "grad_norm": 1.9814810752868652, + "learning_rate": 7.028497395658374e-06, + "loss": 0.7549, + "step": 15051 + }, + { + "epoch": 0.732938913641565, + "grad_norm": 1.4481678009033203, + "learning_rate": 7.02609660789157e-06, + "loss": 0.7536, + "step": 15052 + }, + { + "epoch": 0.7329876074306723, + "grad_norm": 1.5287857055664062, + "learning_rate": 7.023696142851795e-06, + "loss": 0.9059, + "step": 15053 + }, + { + "epoch": 0.7330363012197794, + "grad_norm": 1.7032066583633423, + "learning_rate": 7.021296000598754e-06, + "loss": 0.9049, + "step": 15054 + }, + { + "epoch": 0.7330849950088866, + "grad_norm": 1.3805856704711914, + "learning_rate": 7.0188961811921675e-06, + "loss": 0.7921, + "step": 15055 + }, + { + "epoch": 0.7331336887979938, + "grad_norm": 1.4833282232284546, + "learning_rate": 7.01649668469172e-06, + "loss": 0.7314, + "step": 15056 + }, + { + "epoch": 0.733182382587101, + "grad_norm": 3.1091575622558594, + "learning_rate": 7.014097511157101e-06, + "loss": 0.7315, + "step": 15057 + }, + { + "epoch": 0.7332310763762082, + "grad_norm": 1.4498625993728638, + "learning_rate": 7.011698660647992e-06, + "loss": 0.8125, + "step": 15058 + }, + { + "epoch": 0.7332797701653154, + "grad_norm": 1.6278339624404907, + "learning_rate": 7.009300133224062e-06, + "loss": 0.7938, + "step": 15059 + }, + { + "epoch": 0.7333284639544226, + "grad_norm": 1.6560320854187012, + "learning_rate": 7.006901928944975e-06, + "loss": 0.835, + "step": 15060 + }, + { + "epoch": 0.7333771577435299, + "grad_norm": 1.6416007280349731, + "learning_rate": 7.004504047870389e-06, + "loss": 0.8103, + "step": 15061 + }, + { + "epoch": 0.7334258515326371, + "grad_norm": 1.2218611240386963, + "learning_rate": 7.002106490059948e-06, + "loss": 0.7959, + "step": 15062 + }, + { + "epoch": 0.7334745453217442, + "grad_norm": 1.8673065900802612, + "learning_rate": 6.999709255573288e-06, + "loss": 0.8842, + "step": 15063 + }, + { + "epoch": 0.7335232391108514, + "grad_norm": 1.0979236364364624, + "learning_rate": 6.9973123444700506e-06, + "loss": 0.7894, + "step": 15064 + }, + { + "epoch": 0.7335719328999586, + "grad_norm": 1.6694334745407104, + "learning_rate": 6.994915756809855e-06, + "loss": 0.8967, + "step": 15065 + }, + { + "epoch": 0.7336206266890658, + "grad_norm": 1.3724876642227173, + "learning_rate": 6.9925194926523165e-06, + "loss": 0.8177, + "step": 15066 + }, + { + "epoch": 0.733669320478173, + "grad_norm": 1.363786220550537, + "learning_rate": 6.99012355205704e-06, + "loss": 0.9287, + "step": 15067 + }, + { + "epoch": 0.7337180142672802, + "grad_norm": 1.3518881797790527, + "learning_rate": 6.987727935083628e-06, + "loss": 0.805, + "step": 15068 + }, + { + "epoch": 0.7337667080563874, + "grad_norm": 1.8290810585021973, + "learning_rate": 6.985332641791669e-06, + "loss": 0.8144, + "step": 15069 + }, + { + "epoch": 0.7338154018454947, + "grad_norm": 1.5963163375854492, + "learning_rate": 6.982937672240742e-06, + "loss": 0.8525, + "step": 15070 + }, + { + "epoch": 0.7338640956346018, + "grad_norm": 1.636574387550354, + "learning_rate": 6.980543026490438e-06, + "loss": 0.8515, + "step": 15071 + }, + { + "epoch": 0.733912789423709, + "grad_norm": 1.3476555347442627, + "learning_rate": 6.9781487046003005e-06, + "loss": 0.8472, + "step": 15072 + }, + { + "epoch": 0.7339614832128162, + "grad_norm": 1.3505668640136719, + "learning_rate": 6.9757547066299155e-06, + "loss": 0.7837, + "step": 15073 + }, + { + "epoch": 0.7340101770019234, + "grad_norm": 1.522016167640686, + "learning_rate": 6.973361032638808e-06, + "loss": 0.7352, + "step": 15074 + }, + { + "epoch": 0.7340588707910306, + "grad_norm": 1.4981880187988281, + "learning_rate": 6.970967682686536e-06, + "loss": 0.8985, + "step": 15075 + }, + { + "epoch": 0.7341075645801378, + "grad_norm": 1.286413550376892, + "learning_rate": 6.968574656832632e-06, + "loss": 0.9006, + "step": 15076 + }, + { + "epoch": 0.734156258369245, + "grad_norm": 1.3294308185577393, + "learning_rate": 6.966181955136615e-06, + "loss": 0.8232, + "step": 15077 + }, + { + "epoch": 0.7342049521583522, + "grad_norm": 1.3050997257232666, + "learning_rate": 6.963789577658024e-06, + "loss": 0.8708, + "step": 15078 + }, + { + "epoch": 0.7342536459474595, + "grad_norm": 2.761880874633789, + "learning_rate": 6.961397524456344e-06, + "loss": 0.755, + "step": 15079 + }, + { + "epoch": 0.7343023397365666, + "grad_norm": 1.848829746246338, + "learning_rate": 6.959005795591101e-06, + "loss": 0.8577, + "step": 15080 + }, + { + "epoch": 0.7343510335256738, + "grad_norm": 1.5834318399429321, + "learning_rate": 6.956614391121763e-06, + "loss": 0.8802, + "step": 15081 + }, + { + "epoch": 0.734399727314781, + "grad_norm": 1.7346913814544678, + "learning_rate": 6.95422331110784e-06, + "loss": 0.8522, + "step": 15082 + }, + { + "epoch": 0.7344484211038882, + "grad_norm": 1.5936193466186523, + "learning_rate": 6.9518325556087995e-06, + "loss": 0.7917, + "step": 15083 + }, + { + "epoch": 0.7344971148929954, + "grad_norm": 1.6671442985534668, + "learning_rate": 6.949442124684109e-06, + "loss": 0.8589, + "step": 15084 + }, + { + "epoch": 0.7345458086821026, + "grad_norm": 1.5926754474639893, + "learning_rate": 6.94705201839325e-06, + "loss": 0.8203, + "step": 15085 + }, + { + "epoch": 0.7345945024712098, + "grad_norm": 0.09376106411218643, + "learning_rate": 6.944662236795647e-06, + "loss": 0.6091, + "step": 15086 + }, + { + "epoch": 0.734643196260317, + "grad_norm": 1.4152257442474365, + "learning_rate": 6.942272779950774e-06, + "loss": 0.9996, + "step": 15087 + }, + { + "epoch": 0.7346918900494241, + "grad_norm": 1.9746878147125244, + "learning_rate": 6.939883647918044e-06, + "loss": 0.6739, + "step": 15088 + }, + { + "epoch": 0.7347405838385314, + "grad_norm": 1.447128176689148, + "learning_rate": 6.937494840756906e-06, + "loss": 0.7463, + "step": 15089 + }, + { + "epoch": 0.7347892776276386, + "grad_norm": 1.8178359270095825, + "learning_rate": 6.935106358526775e-06, + "loss": 0.882, + "step": 15090 + }, + { + "epoch": 0.7348379714167458, + "grad_norm": 1.2163225412368774, + "learning_rate": 6.932718201287063e-06, + "loss": 0.8259, + "step": 15091 + }, + { + "epoch": 0.734886665205853, + "grad_norm": 1.3400813341140747, + "learning_rate": 6.930330369097178e-06, + "loss": 0.8223, + "step": 15092 + }, + { + "epoch": 0.7349353589949602, + "grad_norm": 2.733489513397217, + "learning_rate": 6.927942862016517e-06, + "loss": 0.8154, + "step": 15093 + }, + { + "epoch": 0.7349840527840674, + "grad_norm": 1.4130570888519287, + "learning_rate": 6.92555568010447e-06, + "loss": 0.8514, + "step": 15094 + }, + { + "epoch": 0.7350327465731746, + "grad_norm": 2.3130087852478027, + "learning_rate": 6.92316882342041e-06, + "loss": 0.8098, + "step": 15095 + }, + { + "epoch": 0.7350814403622818, + "grad_norm": 1.183667778968811, + "learning_rate": 6.920782292023729e-06, + "loss": 0.7818, + "step": 15096 + }, + { + "epoch": 0.735130134151389, + "grad_norm": 0.0991961658000946, + "learning_rate": 6.918396085973771e-06, + "loss": 0.6065, + "step": 15097 + }, + { + "epoch": 0.7351788279404962, + "grad_norm": 2.4624269008636475, + "learning_rate": 6.916010205329908e-06, + "loss": 0.798, + "step": 15098 + }, + { + "epoch": 0.7352275217296034, + "grad_norm": 1.3334462642669678, + "learning_rate": 6.913624650151485e-06, + "loss": 0.85, + "step": 15099 + }, + { + "epoch": 0.7352762155187106, + "grad_norm": 1.3996886014938354, + "learning_rate": 6.911239420497841e-06, + "loss": 0.7833, + "step": 15100 + }, + { + "epoch": 0.7353249093078178, + "grad_norm": 1.2504842281341553, + "learning_rate": 6.90885451642831e-06, + "loss": 0.7798, + "step": 15101 + }, + { + "epoch": 0.735373603096925, + "grad_norm": 1.5421626567840576, + "learning_rate": 6.906469938002216e-06, + "loss": 0.8531, + "step": 15102 + }, + { + "epoch": 0.7354222968860322, + "grad_norm": 1.7747769355773926, + "learning_rate": 6.904085685278878e-06, + "loss": 0.824, + "step": 15103 + }, + { + "epoch": 0.7354709906751394, + "grad_norm": 1.5693459510803223, + "learning_rate": 6.9017017583175935e-06, + "loss": 0.8015, + "step": 15104 + }, + { + "epoch": 0.7355196844642465, + "grad_norm": 1.7282284498214722, + "learning_rate": 6.899318157177681e-06, + "loss": 0.7405, + "step": 15105 + }, + { + "epoch": 0.7355683782533537, + "grad_norm": 0.09943830221891403, + "learning_rate": 6.896934881918422e-06, + "loss": 0.6055, + "step": 15106 + }, + { + "epoch": 0.735617072042461, + "grad_norm": 2.385322332382202, + "learning_rate": 6.894551932599105e-06, + "loss": 0.7945, + "step": 15107 + }, + { + "epoch": 0.7356657658315682, + "grad_norm": 1.4847382307052612, + "learning_rate": 6.892169309279002e-06, + "loss": 0.8458, + "step": 15108 + }, + { + "epoch": 0.7357144596206754, + "grad_norm": 1.493795394897461, + "learning_rate": 6.8897870120173815e-06, + "loss": 0.8419, + "step": 15109 + }, + { + "epoch": 0.7357631534097826, + "grad_norm": 1.6593918800354004, + "learning_rate": 6.887405040873505e-06, + "loss": 0.84, + "step": 15110 + }, + { + "epoch": 0.7358118471988898, + "grad_norm": 1.9140654802322388, + "learning_rate": 6.885023395906625e-06, + "loss": 0.8831, + "step": 15111 + }, + { + "epoch": 0.735860540987997, + "grad_norm": 1.627622127532959, + "learning_rate": 6.882642077175976e-06, + "loss": 0.8614, + "step": 15112 + }, + { + "epoch": 0.7359092347771041, + "grad_norm": 1.5495238304138184, + "learning_rate": 6.880261084740813e-06, + "loss": 0.9127, + "step": 15113 + }, + { + "epoch": 0.7359579285662113, + "grad_norm": 1.2574634552001953, + "learning_rate": 6.877880418660341e-06, + "loss": 0.6878, + "step": 15114 + }, + { + "epoch": 0.7360066223553186, + "grad_norm": 1.4145115613937378, + "learning_rate": 6.875500078993798e-06, + "loss": 0.8779, + "step": 15115 + }, + { + "epoch": 0.7360553161444258, + "grad_norm": 1.6785705089569092, + "learning_rate": 6.873120065800382e-06, + "loss": 0.8722, + "step": 15116 + }, + { + "epoch": 0.736104009933533, + "grad_norm": 1.2913724184036255, + "learning_rate": 6.870740379139307e-06, + "loss": 0.8034, + "step": 15117 + }, + { + "epoch": 0.7361527037226402, + "grad_norm": 1.3626089096069336, + "learning_rate": 6.868361019069758e-06, + "loss": 0.8685, + "step": 15118 + }, + { + "epoch": 0.7362013975117474, + "grad_norm": 1.2823981046676636, + "learning_rate": 6.865981985650929e-06, + "loss": 0.7977, + "step": 15119 + }, + { + "epoch": 0.7362500913008546, + "grad_norm": 2.7726988792419434, + "learning_rate": 6.863603278941993e-06, + "loss": 0.7818, + "step": 15120 + }, + { + "epoch": 0.7362987850899618, + "grad_norm": 1.2975106239318848, + "learning_rate": 6.861224899002117e-06, + "loss": 0.8469, + "step": 15121 + }, + { + "epoch": 0.7363474788790689, + "grad_norm": 1.7170135974884033, + "learning_rate": 6.858846845890483e-06, + "loss": 0.8451, + "step": 15122 + }, + { + "epoch": 0.7363961726681761, + "grad_norm": 1.363512635231018, + "learning_rate": 6.85646911966622e-06, + "loss": 0.8165, + "step": 15123 + }, + { + "epoch": 0.7364448664572834, + "grad_norm": 1.5553414821624756, + "learning_rate": 6.8540917203884914e-06, + "loss": 0.909, + "step": 15124 + }, + { + "epoch": 0.7364935602463906, + "grad_norm": 1.73583984375, + "learning_rate": 6.8517146481164295e-06, + "loss": 0.8123, + "step": 15125 + }, + { + "epoch": 0.7365422540354978, + "grad_norm": 1.233891248703003, + "learning_rate": 6.849337902909166e-06, + "loss": 0.7947, + "step": 15126 + }, + { + "epoch": 0.736590947824605, + "grad_norm": 1.4462571144104004, + "learning_rate": 6.84696148482582e-06, + "loss": 0.8301, + "step": 15127 + }, + { + "epoch": 0.7366396416137122, + "grad_norm": 2.55617618560791, + "learning_rate": 6.8445853939255e-06, + "loss": 0.7826, + "step": 15128 + }, + { + "epoch": 0.7366883354028194, + "grad_norm": 0.10380822420120239, + "learning_rate": 6.842209630267331e-06, + "loss": 0.5792, + "step": 15129 + }, + { + "epoch": 0.7367370291919265, + "grad_norm": 2.255093812942505, + "learning_rate": 6.839834193910384e-06, + "loss": 0.741, + "step": 15130 + }, + { + "epoch": 0.7367857229810337, + "grad_norm": 1.2003720998764038, + "learning_rate": 6.837459084913775e-06, + "loss": 0.7108, + "step": 15131 + }, + { + "epoch": 0.7368344167701409, + "grad_norm": 2.0622174739837646, + "learning_rate": 6.835084303336559e-06, + "loss": 0.8176, + "step": 15132 + }, + { + "epoch": 0.7368831105592482, + "grad_norm": 1.6197024583816528, + "learning_rate": 6.832709849237826e-06, + "loss": 0.765, + "step": 15133 + }, + { + "epoch": 0.7369318043483554, + "grad_norm": 1.5298383235931396, + "learning_rate": 6.830335722676636e-06, + "loss": 0.8494, + "step": 15134 + }, + { + "epoch": 0.7369804981374626, + "grad_norm": 1.4281765222549438, + "learning_rate": 6.827961923712041e-06, + "loss": 0.8641, + "step": 15135 + }, + { + "epoch": 0.7370291919265698, + "grad_norm": 2.1060237884521484, + "learning_rate": 6.825588452403105e-06, + "loss": 0.709, + "step": 15136 + }, + { + "epoch": 0.737077885715677, + "grad_norm": 1.3796908855438232, + "learning_rate": 6.823215308808846e-06, + "loss": 0.7891, + "step": 15137 + }, + { + "epoch": 0.7371265795047842, + "grad_norm": 2.2980151176452637, + "learning_rate": 6.82084249298832e-06, + "loss": 0.7902, + "step": 15138 + }, + { + "epoch": 0.7371752732938913, + "grad_norm": 1.4178566932678223, + "learning_rate": 6.818470005000524e-06, + "loss": 0.9378, + "step": 15139 + }, + { + "epoch": 0.7372239670829985, + "grad_norm": 1.458910346031189, + "learning_rate": 6.816097844904497e-06, + "loss": 0.8536, + "step": 15140 + }, + { + "epoch": 0.7372726608721057, + "grad_norm": 0.09599647670984268, + "learning_rate": 6.813726012759238e-06, + "loss": 0.5782, + "step": 15141 + }, + { + "epoch": 0.737321354661213, + "grad_norm": 1.5596572160720825, + "learning_rate": 6.811354508623747e-06, + "loss": 0.8535, + "step": 15142 + }, + { + "epoch": 0.7373700484503202, + "grad_norm": 2.750950813293457, + "learning_rate": 6.8089833325570136e-06, + "loss": 0.7453, + "step": 15143 + }, + { + "epoch": 0.7374187422394274, + "grad_norm": 1.637149691581726, + "learning_rate": 6.806612484618018e-06, + "loss": 0.8381, + "step": 15144 + }, + { + "epoch": 0.7374674360285346, + "grad_norm": 1.7031553983688354, + "learning_rate": 6.804241964865752e-06, + "loss": 0.894, + "step": 15145 + }, + { + "epoch": 0.7375161298176418, + "grad_norm": 1.3950587511062622, + "learning_rate": 6.801871773359156e-06, + "loss": 0.8585, + "step": 15146 + }, + { + "epoch": 0.7375648236067489, + "grad_norm": 1.3303653001785278, + "learning_rate": 6.799501910157218e-06, + "loss": 0.7162, + "step": 15147 + }, + { + "epoch": 0.7376135173958561, + "grad_norm": 1.3202120065689087, + "learning_rate": 6.797132375318858e-06, + "loss": 0.875, + "step": 15148 + }, + { + "epoch": 0.7376622111849633, + "grad_norm": 1.2116522789001465, + "learning_rate": 6.794763168903042e-06, + "loss": 0.7901, + "step": 15149 + }, + { + "epoch": 0.7377109049740705, + "grad_norm": 2.087561845779419, + "learning_rate": 6.7923942909686955e-06, + "loss": 0.7353, + "step": 15150 + }, + { + "epoch": 0.7377595987631778, + "grad_norm": 1.7643134593963623, + "learning_rate": 6.790025741574746e-06, + "loss": 0.844, + "step": 15151 + }, + { + "epoch": 0.737808292552285, + "grad_norm": 1.6275635957717896, + "learning_rate": 6.787657520780111e-06, + "loss": 0.8893, + "step": 15152 + }, + { + "epoch": 0.7378569863413922, + "grad_norm": 3.2157514095306396, + "learning_rate": 6.785289628643701e-06, + "loss": 0.8052, + "step": 15153 + }, + { + "epoch": 0.7379056801304994, + "grad_norm": 1.3980580568313599, + "learning_rate": 6.7829220652244135e-06, + "loss": 0.9321, + "step": 15154 + }, + { + "epoch": 0.7379543739196065, + "grad_norm": 1.5318342447280884, + "learning_rate": 6.780554830581139e-06, + "loss": 0.7468, + "step": 15155 + }, + { + "epoch": 0.7380030677087137, + "grad_norm": 2.0951852798461914, + "learning_rate": 6.7781879247727765e-06, + "loss": 0.9318, + "step": 15156 + }, + { + "epoch": 0.7380517614978209, + "grad_norm": 2.0089480876922607, + "learning_rate": 6.7758213478581935e-06, + "loss": 0.7506, + "step": 15157 + }, + { + "epoch": 0.7381004552869281, + "grad_norm": 1.3411835432052612, + "learning_rate": 6.77345509989626e-06, + "loss": 0.7299, + "step": 15158 + }, + { + "epoch": 0.7381491490760353, + "grad_norm": 1.2269260883331299, + "learning_rate": 6.771089180945838e-06, + "loss": 0.8644, + "step": 15159 + }, + { + "epoch": 0.7381978428651426, + "grad_norm": 1.575366497039795, + "learning_rate": 6.768723591065778e-06, + "loss": 0.8581, + "step": 15160 + }, + { + "epoch": 0.7382465366542498, + "grad_norm": 0.09651809930801392, + "learning_rate": 6.766358330314924e-06, + "loss": 0.6621, + "step": 15161 + }, + { + "epoch": 0.738295230443357, + "grad_norm": 1.4766439199447632, + "learning_rate": 6.763993398752115e-06, + "loss": 0.8164, + "step": 15162 + }, + { + "epoch": 0.7383439242324642, + "grad_norm": 1.7189527750015259, + "learning_rate": 6.761628796436168e-06, + "loss": 0.737, + "step": 15163 + }, + { + "epoch": 0.7383926180215713, + "grad_norm": 0.09594190865755081, + "learning_rate": 6.759264523425921e-06, + "loss": 0.6217, + "step": 15164 + }, + { + "epoch": 0.7384413118106785, + "grad_norm": 0.09887882322072983, + "learning_rate": 6.756900579780174e-06, + "loss": 0.6983, + "step": 15165 + }, + { + "epoch": 0.7384900055997857, + "grad_norm": 2.9190311431884766, + "learning_rate": 6.754536965557734e-06, + "loss": 0.8556, + "step": 15166 + }, + { + "epoch": 0.7385386993888929, + "grad_norm": 1.389835238456726, + "learning_rate": 6.752173680817396e-06, + "loss": 0.7763, + "step": 15167 + }, + { + "epoch": 0.7385873931780002, + "grad_norm": 1.599510669708252, + "learning_rate": 6.749810725617943e-06, + "loss": 0.8306, + "step": 15168 + }, + { + "epoch": 0.7386360869671074, + "grad_norm": 1.3130924701690674, + "learning_rate": 6.7474481000181565e-06, + "loss": 0.8375, + "step": 15169 + }, + { + "epoch": 0.7386847807562146, + "grad_norm": 4.424593448638916, + "learning_rate": 6.745085804076808e-06, + "loss": 0.8752, + "step": 15170 + }, + { + "epoch": 0.7387334745453218, + "grad_norm": 1.7512097358703613, + "learning_rate": 6.742723837852658e-06, + "loss": 0.8303, + "step": 15171 + }, + { + "epoch": 0.7387821683344289, + "grad_norm": 2.3568167686462402, + "learning_rate": 6.740362201404456e-06, + "loss": 0.9277, + "step": 15172 + }, + { + "epoch": 0.7388308621235361, + "grad_norm": 1.4407683610916138, + "learning_rate": 6.738000894790959e-06, + "loss": 0.8713, + "step": 15173 + }, + { + "epoch": 0.7388795559126433, + "grad_norm": 3.0455691814422607, + "learning_rate": 6.735639918070897e-06, + "loss": 0.8463, + "step": 15174 + }, + { + "epoch": 0.7389282497017505, + "grad_norm": 0.10357531905174255, + "learning_rate": 6.7332792713030034e-06, + "loss": 0.6309, + "step": 15175 + }, + { + "epoch": 0.7389769434908577, + "grad_norm": 1.4655777215957642, + "learning_rate": 6.730918954545998e-06, + "loss": 0.8526, + "step": 15176 + }, + { + "epoch": 0.739025637279965, + "grad_norm": 1.602845549583435, + "learning_rate": 6.728558967858592e-06, + "loss": 0.7096, + "step": 15177 + }, + { + "epoch": 0.7390743310690722, + "grad_norm": 1.5121066570281982, + "learning_rate": 6.726199311299493e-06, + "loss": 0.8291, + "step": 15178 + }, + { + "epoch": 0.7391230248581794, + "grad_norm": 1.5257153511047363, + "learning_rate": 6.723839984927389e-06, + "loss": 0.7372, + "step": 15179 + }, + { + "epoch": 0.7391717186472866, + "grad_norm": 1.4715334177017212, + "learning_rate": 6.72148098880099e-06, + "loss": 0.8815, + "step": 15180 + }, + { + "epoch": 0.7392204124363937, + "grad_norm": 0.0970592349767685, + "learning_rate": 6.719122322978948e-06, + "loss": 0.6295, + "step": 15181 + }, + { + "epoch": 0.7392691062255009, + "grad_norm": 1.840479850769043, + "learning_rate": 6.716763987519961e-06, + "loss": 0.7527, + "step": 15182 + }, + { + "epoch": 0.7393178000146081, + "grad_norm": 1.3880218267440796, + "learning_rate": 6.71440598248267e-06, + "loss": 0.8498, + "step": 15183 + }, + { + "epoch": 0.7393664938037153, + "grad_norm": 1.4457544088363647, + "learning_rate": 6.712048307925747e-06, + "loss": 0.8166, + "step": 15184 + }, + { + "epoch": 0.7394151875928225, + "grad_norm": 2.1338720321655273, + "learning_rate": 6.709690963907835e-06, + "loss": 0.8091, + "step": 15185 + }, + { + "epoch": 0.7394638813819298, + "grad_norm": 1.6412113904953003, + "learning_rate": 6.707333950487565e-06, + "loss": 0.8313, + "step": 15186 + }, + { + "epoch": 0.739512575171037, + "grad_norm": 2.3839104175567627, + "learning_rate": 6.704977267723587e-06, + "loss": 0.7897, + "step": 15187 + }, + { + "epoch": 0.7395612689601442, + "grad_norm": 2.1015360355377197, + "learning_rate": 6.702620915674498e-06, + "loss": 0.7611, + "step": 15188 + }, + { + "epoch": 0.7396099627492513, + "grad_norm": 1.307466983795166, + "learning_rate": 6.700264894398942e-06, + "loss": 0.8014, + "step": 15189 + }, + { + "epoch": 0.7396586565383585, + "grad_norm": 2.155945301055908, + "learning_rate": 6.697909203955495e-06, + "loss": 0.805, + "step": 15190 + }, + { + "epoch": 0.7397073503274657, + "grad_norm": 1.2921779155731201, + "learning_rate": 6.695553844402776e-06, + "loss": 0.8547, + "step": 15191 + }, + { + "epoch": 0.7397560441165729, + "grad_norm": 1.5659005641937256, + "learning_rate": 6.693198815799366e-06, + "loss": 0.7692, + "step": 15192 + }, + { + "epoch": 0.7398047379056801, + "grad_norm": 1.9421379566192627, + "learning_rate": 6.690844118203845e-06, + "loss": 0.9037, + "step": 15193 + }, + { + "epoch": 0.7398534316947873, + "grad_norm": 1.626828908920288, + "learning_rate": 6.6884897516748025e-06, + "loss": 0.7603, + "step": 15194 + }, + { + "epoch": 0.7399021254838946, + "grad_norm": 1.947208046913147, + "learning_rate": 6.686135716270776e-06, + "loss": 0.9545, + "step": 15195 + }, + { + "epoch": 0.7399508192730018, + "grad_norm": 1.6956839561462402, + "learning_rate": 6.6837820120503525e-06, + "loss": 0.8478, + "step": 15196 + }, + { + "epoch": 0.739999513062109, + "grad_norm": 1.718200922012329, + "learning_rate": 6.681428639072054e-06, + "loss": 0.75, + "step": 15197 + }, + { + "epoch": 0.7400482068512161, + "grad_norm": 1.29778254032135, + "learning_rate": 6.679075597394436e-06, + "loss": 0.8916, + "step": 15198 + }, + { + "epoch": 0.7400969006403233, + "grad_norm": 1.505684733390808, + "learning_rate": 6.6767228870760285e-06, + "loss": 0.9612, + "step": 15199 + }, + { + "epoch": 0.7401455944294305, + "grad_norm": 0.09812457859516144, + "learning_rate": 6.674370508175354e-06, + "loss": 0.6766, + "step": 15200 + }, + { + "epoch": 0.7401942882185377, + "grad_norm": 1.762584924697876, + "learning_rate": 6.672018460750929e-06, + "loss": 0.8515, + "step": 15201 + }, + { + "epoch": 0.7402429820076449, + "grad_norm": 1.5713529586791992, + "learning_rate": 6.669666744861258e-06, + "loss": 0.7968, + "step": 15202 + }, + { + "epoch": 0.7402916757967521, + "grad_norm": 1.6387439966201782, + "learning_rate": 6.667315360564843e-06, + "loss": 0.7714, + "step": 15203 + }, + { + "epoch": 0.7403403695858594, + "grad_norm": 0.09081576019525528, + "learning_rate": 6.664964307920167e-06, + "loss": 0.617, + "step": 15204 + }, + { + "epoch": 0.7403890633749666, + "grad_norm": 1.3407337665557861, + "learning_rate": 6.662613586985733e-06, + "loss": 0.8192, + "step": 15205 + }, + { + "epoch": 0.7404377571640737, + "grad_norm": 0.09495797753334045, + "learning_rate": 6.6602631978199895e-06, + "loss": 0.5672, + "step": 15206 + }, + { + "epoch": 0.7404864509531809, + "grad_norm": 1.8429646492004395, + "learning_rate": 6.657913140481422e-06, + "loss": 0.8772, + "step": 15207 + }, + { + "epoch": 0.7405351447422881, + "grad_norm": 1.5440518856048584, + "learning_rate": 6.655563415028481e-06, + "loss": 0.6941, + "step": 15208 + }, + { + "epoch": 0.7405838385313953, + "grad_norm": 0.10724714398384094, + "learning_rate": 6.653214021519616e-06, + "loss": 0.6365, + "step": 15209 + }, + { + "epoch": 0.7406325323205025, + "grad_norm": 1.5991424322128296, + "learning_rate": 6.6508649600132706e-06, + "loss": 0.8685, + "step": 15210 + }, + { + "epoch": 0.7406812261096097, + "grad_norm": 1.1898503303527832, + "learning_rate": 6.648516230567876e-06, + "loss": 0.8123, + "step": 15211 + }, + { + "epoch": 0.740729919898717, + "grad_norm": 1.346439003944397, + "learning_rate": 6.646167833241856e-06, + "loss": 0.8646, + "step": 15212 + }, + { + "epoch": 0.7407786136878242, + "grad_norm": 1.4730241298675537, + "learning_rate": 6.643819768093625e-06, + "loss": 0.7823, + "step": 15213 + }, + { + "epoch": 0.7408273074769313, + "grad_norm": 1.6637386083602905, + "learning_rate": 6.6414720351816e-06, + "loss": 0.8076, + "step": 15214 + }, + { + "epoch": 0.7408760012660385, + "grad_norm": 5.58461856842041, + "learning_rate": 6.6391246345641755e-06, + "loss": 0.7612, + "step": 15215 + }, + { + "epoch": 0.7409246950551457, + "grad_norm": 1.7138652801513672, + "learning_rate": 6.636777566299746e-06, + "loss": 0.7541, + "step": 15216 + }, + { + "epoch": 0.7409733888442529, + "grad_norm": 1.422767162322998, + "learning_rate": 6.634430830446689e-06, + "loss": 0.8267, + "step": 15217 + }, + { + "epoch": 0.7410220826333601, + "grad_norm": 1.3097388744354248, + "learning_rate": 6.632084427063388e-06, + "loss": 0.8684, + "step": 15218 + }, + { + "epoch": 0.7410707764224673, + "grad_norm": 1.2202622890472412, + "learning_rate": 6.629738356208204e-06, + "loss": 0.8547, + "step": 15219 + }, + { + "epoch": 0.7411194702115745, + "grad_norm": 2.4928925037384033, + "learning_rate": 6.627392617939497e-06, + "loss": 0.8302, + "step": 15220 + }, + { + "epoch": 0.7411681640006818, + "grad_norm": 1.6272947788238525, + "learning_rate": 6.625047212315612e-06, + "loss": 0.8249, + "step": 15221 + }, + { + "epoch": 0.741216857789789, + "grad_norm": 0.09505033493041992, + "learning_rate": 6.62270213939491e-06, + "loss": 0.5928, + "step": 15222 + }, + { + "epoch": 0.7412655515788961, + "grad_norm": 1.705988883972168, + "learning_rate": 6.6203573992357e-06, + "loss": 0.7468, + "step": 15223 + }, + { + "epoch": 0.7413142453680033, + "grad_norm": 1.6823772192001343, + "learning_rate": 6.618012991896323e-06, + "loss": 0.7749, + "step": 15224 + }, + { + "epoch": 0.7413629391571105, + "grad_norm": 1.6608860492706299, + "learning_rate": 6.615668917435096e-06, + "loss": 0.8554, + "step": 15225 + }, + { + "epoch": 0.7414116329462177, + "grad_norm": 1.366463303565979, + "learning_rate": 6.6133251759103235e-06, + "loss": 0.853, + "step": 15226 + }, + { + "epoch": 0.7414603267353249, + "grad_norm": 1.4551942348480225, + "learning_rate": 6.610981767380309e-06, + "loss": 0.8474, + "step": 15227 + }, + { + "epoch": 0.7415090205244321, + "grad_norm": 1.9774638414382935, + "learning_rate": 6.6086386919033416e-06, + "loss": 0.8449, + "step": 15228 + }, + { + "epoch": 0.7415577143135393, + "grad_norm": 1.948183536529541, + "learning_rate": 6.6062959495377086e-06, + "loss": 0.7823, + "step": 15229 + }, + { + "epoch": 0.7416064081026466, + "grad_norm": 1.553511142730713, + "learning_rate": 6.603953540341681e-06, + "loss": 0.8438, + "step": 15230 + }, + { + "epoch": 0.7416551018917537, + "grad_norm": 1.2642003297805786, + "learning_rate": 6.60161146437354e-06, + "loss": 0.8407, + "step": 15231 + }, + { + "epoch": 0.7417037956808609, + "grad_norm": 1.536937952041626, + "learning_rate": 6.5992697216915256e-06, + "loss": 0.8427, + "step": 15232 + }, + { + "epoch": 0.7417524894699681, + "grad_norm": 3.047426700592041, + "learning_rate": 6.596928312353903e-06, + "loss": 0.7815, + "step": 15233 + }, + { + "epoch": 0.7418011832590753, + "grad_norm": 1.754395604133606, + "learning_rate": 6.594587236418912e-06, + "loss": 0.8305, + "step": 15234 + }, + { + "epoch": 0.7418498770481825, + "grad_norm": 2.159893035888672, + "learning_rate": 6.592246493944785e-06, + "loss": 0.834, + "step": 15235 + }, + { + "epoch": 0.7418985708372897, + "grad_norm": 1.6021485328674316, + "learning_rate": 6.589906084989748e-06, + "loss": 0.7577, + "step": 15236 + }, + { + "epoch": 0.7419472646263969, + "grad_norm": 1.7879550457000732, + "learning_rate": 6.5875660096120144e-06, + "loss": 0.7377, + "step": 15237 + }, + { + "epoch": 0.7419959584155041, + "grad_norm": 1.3801989555358887, + "learning_rate": 6.585226267869813e-06, + "loss": 0.7929, + "step": 15238 + }, + { + "epoch": 0.7420446522046114, + "grad_norm": 1.7928297519683838, + "learning_rate": 6.582886859821316e-06, + "loss": 0.7334, + "step": 15239 + }, + { + "epoch": 0.7420933459937185, + "grad_norm": 1.4232909679412842, + "learning_rate": 6.580547785524747e-06, + "loss": 0.808, + "step": 15240 + }, + { + "epoch": 0.7421420397828257, + "grad_norm": 1.4237911701202393, + "learning_rate": 6.578209045038262e-06, + "loss": 0.8342, + "step": 15241 + }, + { + "epoch": 0.7421907335719329, + "grad_norm": 1.4739024639129639, + "learning_rate": 6.575870638420057e-06, + "loss": 0.7285, + "step": 15242 + }, + { + "epoch": 0.7422394273610401, + "grad_norm": 1.529294729232788, + "learning_rate": 6.5735325657282934e-06, + "loss": 0.7129, + "step": 15243 + }, + { + "epoch": 0.7422881211501473, + "grad_norm": 1.5452146530151367, + "learning_rate": 6.571194827021123e-06, + "loss": 0.8871, + "step": 15244 + }, + { + "epoch": 0.7423368149392545, + "grad_norm": 2.3963992595672607, + "learning_rate": 6.568857422356723e-06, + "loss": 0.8636, + "step": 15245 + }, + { + "epoch": 0.7423855087283617, + "grad_norm": 1.683173418045044, + "learning_rate": 6.566520351793202e-06, + "loss": 0.7806, + "step": 15246 + }, + { + "epoch": 0.7424342025174689, + "grad_norm": 1.4603949785232544, + "learning_rate": 6.564183615388728e-06, + "loss": 0.8155, + "step": 15247 + }, + { + "epoch": 0.742482896306576, + "grad_norm": 1.39560866355896, + "learning_rate": 6.561847213201398e-06, + "loss": 0.7869, + "step": 15248 + }, + { + "epoch": 0.7425315900956833, + "grad_norm": 1.5715216398239136, + "learning_rate": 6.55951114528935e-06, + "loss": 0.8281, + "step": 15249 + }, + { + "epoch": 0.7425802838847905, + "grad_norm": 1.701027750968933, + "learning_rate": 6.557175411710688e-06, + "loss": 0.8106, + "step": 15250 + }, + { + "epoch": 0.7426289776738977, + "grad_norm": 1.724426031112671, + "learning_rate": 6.5548400125235115e-06, + "loss": 0.7098, + "step": 15251 + }, + { + "epoch": 0.7426776714630049, + "grad_norm": 2.6187057495117188, + "learning_rate": 6.5525049477859185e-06, + "loss": 0.7565, + "step": 15252 + }, + { + "epoch": 0.7427263652521121, + "grad_norm": 0.0930788442492485, + "learning_rate": 6.5501702175559825e-06, + "loss": 0.6027, + "step": 15253 + }, + { + "epoch": 0.7427750590412193, + "grad_norm": 1.4710725545883179, + "learning_rate": 6.547835821891802e-06, + "loss": 0.7984, + "step": 15254 + }, + { + "epoch": 0.7428237528303265, + "grad_norm": 1.4823983907699585, + "learning_rate": 6.545501760851418e-06, + "loss": 0.7983, + "step": 15255 + }, + { + "epoch": 0.7428724466194337, + "grad_norm": 3.4336984157562256, + "learning_rate": 6.543168034492919e-06, + "loss": 0.8173, + "step": 15256 + }, + { + "epoch": 0.7429211404085408, + "grad_norm": 4.0848388671875, + "learning_rate": 6.54083464287433e-06, + "loss": 0.7699, + "step": 15257 + }, + { + "epoch": 0.7429698341976481, + "grad_norm": 1.5436524152755737, + "learning_rate": 6.538501586053709e-06, + "loss": 0.8048, + "step": 15258 + }, + { + "epoch": 0.7430185279867553, + "grad_norm": 0.09767592698335648, + "learning_rate": 6.536168864089092e-06, + "loss": 0.64, + "step": 15259 + }, + { + "epoch": 0.7430672217758625, + "grad_norm": 2.0349862575531006, + "learning_rate": 6.533836477038502e-06, + "loss": 0.7616, + "step": 15260 + }, + { + "epoch": 0.7431159155649697, + "grad_norm": 1.3289525508880615, + "learning_rate": 6.531504424959956e-06, + "loss": 0.7485, + "step": 15261 + }, + { + "epoch": 0.7431646093540769, + "grad_norm": 1.4793508052825928, + "learning_rate": 6.529172707911459e-06, + "loss": 0.717, + "step": 15262 + }, + { + "epoch": 0.7432133031431841, + "grad_norm": 1.3284541368484497, + "learning_rate": 6.526841325951032e-06, + "loss": 0.8448, + "step": 15263 + }, + { + "epoch": 0.7432619969322913, + "grad_norm": 3.0053205490112305, + "learning_rate": 6.524510279136645e-06, + "loss": 0.8468, + "step": 15264 + }, + { + "epoch": 0.7433106907213984, + "grad_norm": 1.6195058822631836, + "learning_rate": 6.522179567526296e-06, + "loss": 0.7362, + "step": 15265 + }, + { + "epoch": 0.7433593845105056, + "grad_norm": 1.7317125797271729, + "learning_rate": 6.5198491911779625e-06, + "loss": 0.8156, + "step": 15266 + }, + { + "epoch": 0.7434080782996129, + "grad_norm": 3.519240140914917, + "learning_rate": 6.517519150149607e-06, + "loss": 0.8855, + "step": 15267 + }, + { + "epoch": 0.7434567720887201, + "grad_norm": 1.7812058925628662, + "learning_rate": 6.515189444499192e-06, + "loss": 0.7136, + "step": 15268 + }, + { + "epoch": 0.7435054658778273, + "grad_norm": 1.5238317251205444, + "learning_rate": 6.5128600742846706e-06, + "loss": 0.7388, + "step": 15269 + }, + { + "epoch": 0.7435541596669345, + "grad_norm": 1.628377079963684, + "learning_rate": 6.510531039563985e-06, + "loss": 0.8404, + "step": 15270 + }, + { + "epoch": 0.7436028534560417, + "grad_norm": 1.8984180688858032, + "learning_rate": 6.508202340395069e-06, + "loss": 0.8437, + "step": 15271 + }, + { + "epoch": 0.7436515472451489, + "grad_norm": 1.6529841423034668, + "learning_rate": 6.505873976835844e-06, + "loss": 0.8306, + "step": 15272 + }, + { + "epoch": 0.743700241034256, + "grad_norm": 1.5825899839401245, + "learning_rate": 6.503545948944238e-06, + "loss": 0.8859, + "step": 15273 + }, + { + "epoch": 0.7437489348233632, + "grad_norm": 1.6326546669006348, + "learning_rate": 6.501218256778159e-06, + "loss": 0.7923, + "step": 15274 + }, + { + "epoch": 0.7437976286124705, + "grad_norm": 1.4158934354782104, + "learning_rate": 6.498890900395507e-06, + "loss": 0.7519, + "step": 15275 + }, + { + "epoch": 0.7438463224015777, + "grad_norm": 1.3365204334259033, + "learning_rate": 6.496563879854174e-06, + "loss": 0.8313, + "step": 15276 + }, + { + "epoch": 0.7438950161906849, + "grad_norm": 1.6571495532989502, + "learning_rate": 6.494237195212045e-06, + "loss": 0.8333, + "step": 15277 + }, + { + "epoch": 0.7439437099797921, + "grad_norm": 3.8605167865753174, + "learning_rate": 6.491910846526999e-06, + "loss": 0.7567, + "step": 15278 + }, + { + "epoch": 0.7439924037688993, + "grad_norm": 2.0403220653533936, + "learning_rate": 6.489584833856894e-06, + "loss": 0.9186, + "step": 15279 + }, + { + "epoch": 0.7440410975580065, + "grad_norm": 1.915982723236084, + "learning_rate": 6.4872591572596136e-06, + "loss": 0.9109, + "step": 15280 + }, + { + "epoch": 0.7440897913471137, + "grad_norm": 1.7222844362258911, + "learning_rate": 6.48493381679298e-06, + "loss": 0.7245, + "step": 15281 + }, + { + "epoch": 0.7441384851362208, + "grad_norm": 1.6683744192123413, + "learning_rate": 6.482608812514854e-06, + "loss": 0.8102, + "step": 15282 + }, + { + "epoch": 0.744187178925328, + "grad_norm": 0.09799496829509735, + "learning_rate": 6.4802841444830685e-06, + "loss": 0.6705, + "step": 15283 + }, + { + "epoch": 0.7442358727144353, + "grad_norm": 1.416803002357483, + "learning_rate": 6.4779598127554475e-06, + "loss": 0.7831, + "step": 15284 + }, + { + "epoch": 0.7442845665035425, + "grad_norm": 1.6227104663848877, + "learning_rate": 6.475635817389807e-06, + "loss": 0.7853, + "step": 15285 + }, + { + "epoch": 0.7443332602926497, + "grad_norm": 1.4481723308563232, + "learning_rate": 6.473312158443961e-06, + "loss": 0.808, + "step": 15286 + }, + { + "epoch": 0.7443819540817569, + "grad_norm": 1.395703911781311, + "learning_rate": 6.470988835975706e-06, + "loss": 0.8542, + "step": 15287 + }, + { + "epoch": 0.7444306478708641, + "grad_norm": 1.3688979148864746, + "learning_rate": 6.4686658500428345e-06, + "loss": 0.7829, + "step": 15288 + }, + { + "epoch": 0.7444793416599713, + "grad_norm": 1.6498332023620605, + "learning_rate": 6.466343200703144e-06, + "loss": 0.805, + "step": 15289 + }, + { + "epoch": 0.7445280354490784, + "grad_norm": 1.8914523124694824, + "learning_rate": 6.464020888014388e-06, + "loss": 0.921, + "step": 15290 + }, + { + "epoch": 0.7445767292381856, + "grad_norm": 1.3645529747009277, + "learning_rate": 6.46169891203436e-06, + "loss": 0.8035, + "step": 15291 + }, + { + "epoch": 0.7446254230272928, + "grad_norm": 1.4980583190917969, + "learning_rate": 6.45937727282079e-06, + "loss": 0.7315, + "step": 15292 + }, + { + "epoch": 0.7446741168164001, + "grad_norm": 1.8211250305175781, + "learning_rate": 6.457055970431456e-06, + "loss": 0.8583, + "step": 15293 + }, + { + "epoch": 0.7447228106055073, + "grad_norm": 2.487684726715088, + "learning_rate": 6.454735004924086e-06, + "loss": 0.889, + "step": 15294 + }, + { + "epoch": 0.7447715043946145, + "grad_norm": 1.2177656888961792, + "learning_rate": 6.452414376356413e-06, + "loss": 0.9078, + "step": 15295 + }, + { + "epoch": 0.7448201981837217, + "grad_norm": 1.2868170738220215, + "learning_rate": 6.45009408478618e-06, + "loss": 0.7018, + "step": 15296 + }, + { + "epoch": 0.7448688919728289, + "grad_norm": 1.5446745157241821, + "learning_rate": 6.447774130271078e-06, + "loss": 0.8003, + "step": 15297 + }, + { + "epoch": 0.7449175857619361, + "grad_norm": 1.7274991273880005, + "learning_rate": 6.4454545128688454e-06, + "loss": 0.7909, + "step": 15298 + }, + { + "epoch": 0.7449662795510432, + "grad_norm": 1.4439822435379028, + "learning_rate": 6.443135232637154e-06, + "loss": 0.8195, + "step": 15299 + }, + { + "epoch": 0.7450149733401504, + "grad_norm": 0.09678684175014496, + "learning_rate": 6.440816289633716e-06, + "loss": 0.585, + "step": 15300 + }, + { + "epoch": 0.7450636671292576, + "grad_norm": 1.5159695148468018, + "learning_rate": 6.43849768391621e-06, + "loss": 0.8745, + "step": 15301 + }, + { + "epoch": 0.7451123609183649, + "grad_norm": 1.461153268814087, + "learning_rate": 6.436179415542303e-06, + "loss": 0.758, + "step": 15302 + }, + { + "epoch": 0.7451610547074721, + "grad_norm": 1.7663804292678833, + "learning_rate": 6.433861484569681e-06, + "loss": 0.8449, + "step": 15303 + }, + { + "epoch": 0.7452097484965793, + "grad_norm": 1.1712745428085327, + "learning_rate": 6.43154389105598e-06, + "loss": 0.8528, + "step": 15304 + }, + { + "epoch": 0.7452584422856865, + "grad_norm": 1.561441421508789, + "learning_rate": 6.429226635058876e-06, + "loss": 0.7626, + "step": 15305 + }, + { + "epoch": 0.7453071360747937, + "grad_norm": 1.2685686349868774, + "learning_rate": 6.426909716635983e-06, + "loss": 0.7713, + "step": 15306 + }, + { + "epoch": 0.7453558298639008, + "grad_norm": 1.4916614294052124, + "learning_rate": 6.424593135844954e-06, + "loss": 0.8863, + "step": 15307 + }, + { + "epoch": 0.745404523653008, + "grad_norm": 0.1058351993560791, + "learning_rate": 6.422276892743411e-06, + "loss": 0.5909, + "step": 15308 + }, + { + "epoch": 0.7454532174421152, + "grad_norm": 1.6581196784973145, + "learning_rate": 6.419960987388966e-06, + "loss": 0.8072, + "step": 15309 + }, + { + "epoch": 0.7455019112312224, + "grad_norm": 1.3250142335891724, + "learning_rate": 6.417645419839229e-06, + "loss": 0.8107, + "step": 15310 + }, + { + "epoch": 0.7455506050203297, + "grad_norm": 1.5465790033340454, + "learning_rate": 6.4153301901518025e-06, + "loss": 0.8053, + "step": 15311 + }, + { + "epoch": 0.7455992988094369, + "grad_norm": 2.2650721073150635, + "learning_rate": 6.413015298384276e-06, + "loss": 0.821, + "step": 15312 + }, + { + "epoch": 0.7456479925985441, + "grad_norm": 1.444481611251831, + "learning_rate": 6.4107007445942246e-06, + "loss": 0.811, + "step": 15313 + }, + { + "epoch": 0.7456966863876513, + "grad_norm": 1.9577847719192505, + "learning_rate": 6.408386528839245e-06, + "loss": 0.7421, + "step": 15314 + }, + { + "epoch": 0.7457453801767584, + "grad_norm": 1.5506054162979126, + "learning_rate": 6.406072651176878e-06, + "loss": 0.7622, + "step": 15315 + }, + { + "epoch": 0.7457940739658656, + "grad_norm": 1.5926822423934937, + "learning_rate": 6.403759111664698e-06, + "loss": 0.8304, + "step": 15316 + }, + { + "epoch": 0.7458427677549728, + "grad_norm": 1.678230881690979, + "learning_rate": 6.401445910360251e-06, + "loss": 0.8909, + "step": 15317 + }, + { + "epoch": 0.74589146154408, + "grad_norm": 2.459247350692749, + "learning_rate": 6.399133047321076e-06, + "loss": 0.8003, + "step": 15318 + }, + { + "epoch": 0.7459401553331872, + "grad_norm": 1.5670287609100342, + "learning_rate": 6.396820522604708e-06, + "loss": 0.7925, + "step": 15319 + }, + { + "epoch": 0.7459888491222945, + "grad_norm": 1.4605019092559814, + "learning_rate": 6.394508336268668e-06, + "loss": 0.8323, + "step": 15320 + }, + { + "epoch": 0.7460375429114017, + "grad_norm": 2.0969860553741455, + "learning_rate": 6.392196488370475e-06, + "loss": 0.8162, + "step": 15321 + }, + { + "epoch": 0.7460862367005089, + "grad_norm": 1.8987094163894653, + "learning_rate": 6.389884978967627e-06, + "loss": 0.7609, + "step": 15322 + }, + { + "epoch": 0.7461349304896161, + "grad_norm": 1.7581568956375122, + "learning_rate": 6.387573808117637e-06, + "loss": 0.8326, + "step": 15323 + }, + { + "epoch": 0.7461836242787232, + "grad_norm": 1.4942325353622437, + "learning_rate": 6.3852629758779926e-06, + "loss": 0.8045, + "step": 15324 + }, + { + "epoch": 0.7462323180678304, + "grad_norm": 1.5132273435592651, + "learning_rate": 6.382952482306171e-06, + "loss": 0.8789, + "step": 15325 + }, + { + "epoch": 0.7462810118569376, + "grad_norm": 0.10280709713697433, + "learning_rate": 6.380642327459648e-06, + "loss": 0.5944, + "step": 15326 + }, + { + "epoch": 0.7463297056460448, + "grad_norm": 1.9546258449554443, + "learning_rate": 6.378332511395888e-06, + "loss": 0.7464, + "step": 15327 + }, + { + "epoch": 0.746378399435152, + "grad_norm": 1.6184152364730835, + "learning_rate": 6.376023034172352e-06, + "loss": 0.7566, + "step": 15328 + }, + { + "epoch": 0.7464270932242593, + "grad_norm": 1.3306944370269775, + "learning_rate": 6.373713895846483e-06, + "loss": 0.8709, + "step": 15329 + }, + { + "epoch": 0.7464757870133665, + "grad_norm": 1.804304599761963, + "learning_rate": 6.371405096475716e-06, + "loss": 0.7294, + "step": 15330 + }, + { + "epoch": 0.7465244808024737, + "grad_norm": 1.8876303434371948, + "learning_rate": 6.3690966361175044e-06, + "loss": 0.7407, + "step": 15331 + }, + { + "epoch": 0.7465731745915808, + "grad_norm": 1.5595299005508423, + "learning_rate": 6.366788514829243e-06, + "loss": 0.8969, + "step": 15332 + }, + { + "epoch": 0.746621868380688, + "grad_norm": 1.2653542757034302, + "learning_rate": 6.364480732668368e-06, + "loss": 0.7328, + "step": 15333 + }, + { + "epoch": 0.7466705621697952, + "grad_norm": 1.2966423034667969, + "learning_rate": 6.362173289692277e-06, + "loss": 0.7712, + "step": 15334 + }, + { + "epoch": 0.7467192559589024, + "grad_norm": 4.668359756469727, + "learning_rate": 6.359866185958367e-06, + "loss": 0.8071, + "step": 15335 + }, + { + "epoch": 0.7467679497480096, + "grad_norm": 1.7613530158996582, + "learning_rate": 6.357559421524033e-06, + "loss": 0.744, + "step": 15336 + }, + { + "epoch": 0.7468166435371169, + "grad_norm": 1.385816216468811, + "learning_rate": 6.35525299644665e-06, + "loss": 0.7577, + "step": 15337 + }, + { + "epoch": 0.7468653373262241, + "grad_norm": 1.480584979057312, + "learning_rate": 6.352946910783593e-06, + "loss": 0.849, + "step": 15338 + }, + { + "epoch": 0.7469140311153313, + "grad_norm": 0.09412849694490433, + "learning_rate": 6.35064116459222e-06, + "loss": 0.5839, + "step": 15339 + }, + { + "epoch": 0.7469627249044385, + "grad_norm": 2.3750336170196533, + "learning_rate": 6.348335757929904e-06, + "loss": 0.855, + "step": 15340 + }, + { + "epoch": 0.7470114186935456, + "grad_norm": 1.5743472576141357, + "learning_rate": 6.34603069085397e-06, + "loss": 0.7405, + "step": 15341 + }, + { + "epoch": 0.7470601124826528, + "grad_norm": 1.3460946083068848, + "learning_rate": 6.343725963421772e-06, + "loss": 0.8277, + "step": 15342 + }, + { + "epoch": 0.74710880627176, + "grad_norm": 1.4999854564666748, + "learning_rate": 6.341421575690636e-06, + "loss": 0.8336, + "step": 15343 + }, + { + "epoch": 0.7471575000608672, + "grad_norm": 1.5474945306777954, + "learning_rate": 6.339117527717886e-06, + "loss": 0.7962, + "step": 15344 + }, + { + "epoch": 0.7472061938499744, + "grad_norm": 1.7957452535629272, + "learning_rate": 6.33681381956083e-06, + "loss": 0.8462, + "step": 15345 + }, + { + "epoch": 0.7472548876390817, + "grad_norm": 0.09652215242385864, + "learning_rate": 6.334510451276772e-06, + "loss": 0.6739, + "step": 15346 + }, + { + "epoch": 0.7473035814281889, + "grad_norm": 1.421928882598877, + "learning_rate": 6.332207422923025e-06, + "loss": 0.7829, + "step": 15347 + }, + { + "epoch": 0.7473522752172961, + "grad_norm": 1.5535242557525635, + "learning_rate": 6.3299047345568485e-06, + "loss": 0.8126, + "step": 15348 + }, + { + "epoch": 0.7474009690064032, + "grad_norm": 1.2136023044586182, + "learning_rate": 6.327602386235554e-06, + "loss": 0.8372, + "step": 15349 + }, + { + "epoch": 0.7474496627955104, + "grad_norm": 1.3326737880706787, + "learning_rate": 6.325300378016384e-06, + "loss": 0.7628, + "step": 15350 + }, + { + "epoch": 0.7474983565846176, + "grad_norm": 2.6125998497009277, + "learning_rate": 6.32299870995662e-06, + "loss": 0.7774, + "step": 15351 + }, + { + "epoch": 0.7475470503737248, + "grad_norm": 2.2401158809661865, + "learning_rate": 6.3206973821135095e-06, + "loss": 0.9068, + "step": 15352 + }, + { + "epoch": 0.747595744162832, + "grad_norm": 1.8088455200195312, + "learning_rate": 6.318396394544293e-06, + "loss": 0.7228, + "step": 15353 + }, + { + "epoch": 0.7476444379519392, + "grad_norm": 1.4237699508666992, + "learning_rate": 6.316095747306226e-06, + "loss": 0.8301, + "step": 15354 + }, + { + "epoch": 0.7476931317410465, + "grad_norm": 1.5087376832962036, + "learning_rate": 6.313795440456512e-06, + "loss": 0.8063, + "step": 15355 + }, + { + "epoch": 0.7477418255301537, + "grad_norm": 1.465838074684143, + "learning_rate": 6.311495474052398e-06, + "loss": 0.715, + "step": 15356 + }, + { + "epoch": 0.7477905193192609, + "grad_norm": 1.9374454021453857, + "learning_rate": 6.309195848151068e-06, + "loss": 0.8562, + "step": 15357 + }, + { + "epoch": 0.747839213108368, + "grad_norm": 3.997532844543457, + "learning_rate": 6.3068965628097454e-06, + "loss": 0.8133, + "step": 15358 + }, + { + "epoch": 0.7478879068974752, + "grad_norm": 1.5539156198501587, + "learning_rate": 6.304597618085622e-06, + "loss": 0.7638, + "step": 15359 + }, + { + "epoch": 0.7479366006865824, + "grad_norm": 1.8893897533416748, + "learning_rate": 6.30229901403588e-06, + "loss": 0.8363, + "step": 15360 + }, + { + "epoch": 0.7479852944756896, + "grad_norm": 1.3675400018692017, + "learning_rate": 6.300000750717699e-06, + "loss": 0.8227, + "step": 15361 + }, + { + "epoch": 0.7480339882647968, + "grad_norm": 4.341843128204346, + "learning_rate": 6.297702828188242e-06, + "loss": 0.8524, + "step": 15362 + }, + { + "epoch": 0.748082682053904, + "grad_norm": 2.504498243331909, + "learning_rate": 6.295405246504689e-06, + "loss": 0.7806, + "step": 15363 + }, + { + "epoch": 0.7481313758430113, + "grad_norm": 1.55751633644104, + "learning_rate": 6.293108005724167e-06, + "loss": 0.7736, + "step": 15364 + }, + { + "epoch": 0.7481800696321185, + "grad_norm": 1.4664881229400635, + "learning_rate": 6.29081110590384e-06, + "loss": 0.8141, + "step": 15365 + }, + { + "epoch": 0.7482287634212256, + "grad_norm": 1.9470229148864746, + "learning_rate": 6.288514547100837e-06, + "loss": 0.8031, + "step": 15366 + }, + { + "epoch": 0.7482774572103328, + "grad_norm": 0.0917239785194397, + "learning_rate": 6.2862183293722845e-06, + "loss": 0.6155, + "step": 15367 + }, + { + "epoch": 0.74832615099944, + "grad_norm": 1.5646659135818481, + "learning_rate": 6.283922452775302e-06, + "loss": 0.8079, + "step": 15368 + }, + { + "epoch": 0.7483748447885472, + "grad_norm": 1.382994532585144, + "learning_rate": 6.2816269173669985e-06, + "loss": 0.7816, + "step": 15369 + }, + { + "epoch": 0.7484235385776544, + "grad_norm": 1.2936629056930542, + "learning_rate": 6.279331723204476e-06, + "loss": 0.8694, + "step": 15370 + }, + { + "epoch": 0.7484722323667616, + "grad_norm": 1.3599728345870972, + "learning_rate": 6.2770368703448215e-06, + "loss": 0.7451, + "step": 15371 + }, + { + "epoch": 0.7485209261558688, + "grad_norm": 1.5011271238327026, + "learning_rate": 6.274742358845141e-06, + "loss": 0.7573, + "step": 15372 + }, + { + "epoch": 0.7485696199449761, + "grad_norm": 1.4980891942977905, + "learning_rate": 6.272448188762479e-06, + "loss": 0.7723, + "step": 15373 + }, + { + "epoch": 0.7486183137340832, + "grad_norm": 1.7378559112548828, + "learning_rate": 6.270154360153928e-06, + "loss": 0.8299, + "step": 15374 + }, + { + "epoch": 0.7486670075231904, + "grad_norm": 1.5003570318222046, + "learning_rate": 6.267860873076541e-06, + "loss": 0.8193, + "step": 15375 + }, + { + "epoch": 0.7487157013122976, + "grad_norm": 1.9120142459869385, + "learning_rate": 6.265567727587365e-06, + "loss": 0.8138, + "step": 15376 + }, + { + "epoch": 0.7487643951014048, + "grad_norm": 1.6371757984161377, + "learning_rate": 6.2632749237434455e-06, + "loss": 0.8497, + "step": 15377 + }, + { + "epoch": 0.748813088890512, + "grad_norm": 1.890320897102356, + "learning_rate": 6.260982461601812e-06, + "loss": 0.8326, + "step": 15378 + }, + { + "epoch": 0.7488617826796192, + "grad_norm": 1.949021816253662, + "learning_rate": 6.258690341219495e-06, + "loss": 0.6909, + "step": 15379 + }, + { + "epoch": 0.7489104764687264, + "grad_norm": 1.8706437349319458, + "learning_rate": 6.2563985626535075e-06, + "loss": 0.8355, + "step": 15380 + }, + { + "epoch": 0.7489591702578337, + "grad_norm": 1.9404634237289429, + "learning_rate": 6.254107125960853e-06, + "loss": 0.7777, + "step": 15381 + }, + { + "epoch": 0.7490078640469409, + "grad_norm": 1.3442623615264893, + "learning_rate": 6.251816031198543e-06, + "loss": 0.7934, + "step": 15382 + }, + { + "epoch": 0.749056557836048, + "grad_norm": 1.37799870967865, + "learning_rate": 6.249525278423563e-06, + "loss": 0.7902, + "step": 15383 + }, + { + "epoch": 0.7491052516251552, + "grad_norm": 3.3942363262176514, + "learning_rate": 6.2472348676928954e-06, + "loss": 0.8049, + "step": 15384 + }, + { + "epoch": 0.7491539454142624, + "grad_norm": 1.2939120531082153, + "learning_rate": 6.244944799063513e-06, + "loss": 0.7862, + "step": 15385 + }, + { + "epoch": 0.7492026392033696, + "grad_norm": 1.926514744758606, + "learning_rate": 6.242655072592385e-06, + "loss": 0.6973, + "step": 15386 + }, + { + "epoch": 0.7492513329924768, + "grad_norm": 2.627422571182251, + "learning_rate": 6.240365688336465e-06, + "loss": 0.8628, + "step": 15387 + }, + { + "epoch": 0.749300026781584, + "grad_norm": 1.9321973323822021, + "learning_rate": 6.2380766463526976e-06, + "loss": 0.7566, + "step": 15388 + }, + { + "epoch": 0.7493487205706912, + "grad_norm": 2.050025701522827, + "learning_rate": 6.23578794669804e-06, + "loss": 0.7645, + "step": 15389 + }, + { + "epoch": 0.7493974143597985, + "grad_norm": 1.4309066534042358, + "learning_rate": 6.2334995894294015e-06, + "loss": 0.7994, + "step": 15390 + }, + { + "epoch": 0.7494461081489056, + "grad_norm": 1.6224249601364136, + "learning_rate": 6.23121157460372e-06, + "loss": 0.8813, + "step": 15391 + }, + { + "epoch": 0.7494948019380128, + "grad_norm": 1.2316700220108032, + "learning_rate": 6.228923902277908e-06, + "loss": 0.7892, + "step": 15392 + }, + { + "epoch": 0.74954349572712, + "grad_norm": 1.4257363080978394, + "learning_rate": 6.2266365725088685e-06, + "loss": 0.8338, + "step": 15393 + }, + { + "epoch": 0.7495921895162272, + "grad_norm": 2.025966167449951, + "learning_rate": 6.2243495853534995e-06, + "loss": 0.8182, + "step": 15394 + }, + { + "epoch": 0.7496408833053344, + "grad_norm": 1.8081670999526978, + "learning_rate": 6.2220629408686915e-06, + "loss": 0.7201, + "step": 15395 + }, + { + "epoch": 0.7496895770944416, + "grad_norm": 1.6093695163726807, + "learning_rate": 6.219776639111324e-06, + "loss": 0.8274, + "step": 15396 + }, + { + "epoch": 0.7497382708835488, + "grad_norm": 1.5550851821899414, + "learning_rate": 6.217490680138261e-06, + "loss": 0.7718, + "step": 15397 + }, + { + "epoch": 0.749786964672656, + "grad_norm": 1.8032615184783936, + "learning_rate": 6.215205064006389e-06, + "loss": 0.7771, + "step": 15398 + }, + { + "epoch": 0.7498356584617633, + "grad_norm": 2.2247848510742188, + "learning_rate": 6.212919790772533e-06, + "loss": 0.8144, + "step": 15399 + }, + { + "epoch": 0.7498843522508704, + "grad_norm": 1.9225122928619385, + "learning_rate": 6.21063486049357e-06, + "loss": 0.8226, + "step": 15400 + }, + { + "epoch": 0.7499330460399776, + "grad_norm": 1.410428524017334, + "learning_rate": 6.208350273226307e-06, + "loss": 0.8067, + "step": 15401 + }, + { + "epoch": 0.7499817398290848, + "grad_norm": 1.565788745880127, + "learning_rate": 6.2060660290275956e-06, + "loss": 0.7636, + "step": 15402 + }, + { + "epoch": 0.750030433618192, + "grad_norm": 1.1671924591064453, + "learning_rate": 6.203782127954248e-06, + "loss": 0.8033, + "step": 15403 + }, + { + "epoch": 0.7500791274072992, + "grad_norm": 1.8567277193069458, + "learning_rate": 6.201498570063074e-06, + "loss": 0.8773, + "step": 15404 + }, + { + "epoch": 0.7501278211964064, + "grad_norm": 1.8952523469924927, + "learning_rate": 6.19921535541089e-06, + "loss": 0.8348, + "step": 15405 + }, + { + "epoch": 0.7501765149855136, + "grad_norm": 1.5801841020584106, + "learning_rate": 6.196932484054474e-06, + "loss": 0.833, + "step": 15406 + }, + { + "epoch": 0.7502252087746208, + "grad_norm": 1.4330172538757324, + "learning_rate": 6.194649956050631e-06, + "loss": 0.8612, + "step": 15407 + }, + { + "epoch": 0.750273902563728, + "grad_norm": 1.3356103897094727, + "learning_rate": 6.192367771456118e-06, + "loss": 0.806, + "step": 15408 + }, + { + "epoch": 0.7503225963528352, + "grad_norm": 1.7899065017700195, + "learning_rate": 6.19008593032772e-06, + "loss": 0.7898, + "step": 15409 + }, + { + "epoch": 0.7503712901419424, + "grad_norm": 1.9585731029510498, + "learning_rate": 6.187804432722195e-06, + "loss": 0.8799, + "step": 15410 + }, + { + "epoch": 0.7504199839310496, + "grad_norm": 1.578926682472229, + "learning_rate": 6.1855232786962885e-06, + "loss": 0.7321, + "step": 15411 + }, + { + "epoch": 0.7504686777201568, + "grad_norm": 0.09723029285669327, + "learning_rate": 6.1832424683067604e-06, + "loss": 0.6145, + "step": 15412 + }, + { + "epoch": 0.750517371509264, + "grad_norm": 1.8662885427474976, + "learning_rate": 6.180962001610325e-06, + "loss": 0.7313, + "step": 15413 + }, + { + "epoch": 0.7505660652983712, + "grad_norm": 1.888512372970581, + "learning_rate": 6.178681878663733e-06, + "loss": 0.839, + "step": 15414 + }, + { + "epoch": 0.7506147590874784, + "grad_norm": 1.8232516050338745, + "learning_rate": 6.176402099523675e-06, + "loss": 0.8449, + "step": 15415 + }, + { + "epoch": 0.7506634528765856, + "grad_norm": 2.4725260734558105, + "learning_rate": 6.174122664246882e-06, + "loss": 0.8229, + "step": 15416 + }, + { + "epoch": 0.7507121466656927, + "grad_norm": 1.5188528299331665, + "learning_rate": 6.171843572890049e-06, + "loss": 0.7157, + "step": 15417 + }, + { + "epoch": 0.7507608404548, + "grad_norm": 1.1589833498001099, + "learning_rate": 6.169564825509866e-06, + "loss": 0.7967, + "step": 15418 + }, + { + "epoch": 0.7508095342439072, + "grad_norm": 1.584015965461731, + "learning_rate": 6.167286422163021e-06, + "loss": 0.7627, + "step": 15419 + }, + { + "epoch": 0.7508582280330144, + "grad_norm": 1.7699049711227417, + "learning_rate": 6.165008362906186e-06, + "loss": 0.8025, + "step": 15420 + }, + { + "epoch": 0.7509069218221216, + "grad_norm": 1.2368566989898682, + "learning_rate": 6.162730647796029e-06, + "loss": 0.7101, + "step": 15421 + }, + { + "epoch": 0.7509556156112288, + "grad_norm": 1.7743314504623413, + "learning_rate": 6.160453276889204e-06, + "loss": 0.7806, + "step": 15422 + }, + { + "epoch": 0.751004309400336, + "grad_norm": 1.8575313091278076, + "learning_rate": 6.158176250242378e-06, + "loss": 0.7987, + "step": 15423 + }, + { + "epoch": 0.7510530031894432, + "grad_norm": 1.8226451873779297, + "learning_rate": 6.155899567912167e-06, + "loss": 0.7657, + "step": 15424 + }, + { + "epoch": 0.7511016969785503, + "grad_norm": 1.7359836101531982, + "learning_rate": 6.153623229955224e-06, + "loss": 0.8452, + "step": 15425 + }, + { + "epoch": 0.7511503907676575, + "grad_norm": 5.739856719970703, + "learning_rate": 6.151347236428165e-06, + "loss": 0.7568, + "step": 15426 + }, + { + "epoch": 0.7511990845567648, + "grad_norm": 2.2338178157806396, + "learning_rate": 6.149071587387605e-06, + "loss": 0.8342, + "step": 15427 + }, + { + "epoch": 0.751247778345872, + "grad_norm": 6.034976005554199, + "learning_rate": 6.146796282890153e-06, + "loss": 0.8146, + "step": 15428 + }, + { + "epoch": 0.7512964721349792, + "grad_norm": 1.7068790197372437, + "learning_rate": 6.144521322992405e-06, + "loss": 0.7667, + "step": 15429 + }, + { + "epoch": 0.7513451659240864, + "grad_norm": 1.463252305984497, + "learning_rate": 6.142246707750954e-06, + "loss": 0.7541, + "step": 15430 + }, + { + "epoch": 0.7513938597131936, + "grad_norm": 1.895617127418518, + "learning_rate": 6.139972437222373e-06, + "loss": 0.7658, + "step": 15431 + }, + { + "epoch": 0.7514425535023008, + "grad_norm": 1.6840764284133911, + "learning_rate": 6.137698511463248e-06, + "loss": 0.7672, + "step": 15432 + }, + { + "epoch": 0.7514912472914079, + "grad_norm": 1.3227765560150146, + "learning_rate": 6.135424930530136e-06, + "loss": 0.8178, + "step": 15433 + }, + { + "epoch": 0.7515399410805151, + "grad_norm": 1.9123623371124268, + "learning_rate": 6.133151694479593e-06, + "loss": 0.831, + "step": 15434 + }, + { + "epoch": 0.7515886348696224, + "grad_norm": 1.5804673433303833, + "learning_rate": 6.130878803368166e-06, + "loss": 0.7209, + "step": 15435 + }, + { + "epoch": 0.7516373286587296, + "grad_norm": 1.3466933965682983, + "learning_rate": 6.1286062572523915e-06, + "loss": 0.8353, + "step": 15436 + }, + { + "epoch": 0.7516860224478368, + "grad_norm": 0.09263549745082855, + "learning_rate": 6.126334056188803e-06, + "loss": 0.6138, + "step": 15437 + }, + { + "epoch": 0.751734716236944, + "grad_norm": 1.7685600519180298, + "learning_rate": 6.1240622002339205e-06, + "loss": 0.873, + "step": 15438 + }, + { + "epoch": 0.7517834100260512, + "grad_norm": 1.452920913696289, + "learning_rate": 6.121790689444249e-06, + "loss": 0.775, + "step": 15439 + }, + { + "epoch": 0.7518321038151584, + "grad_norm": 1.4435648918151855, + "learning_rate": 6.119519523876305e-06, + "loss": 0.8363, + "step": 15440 + }, + { + "epoch": 0.7518807976042656, + "grad_norm": 1.8812307119369507, + "learning_rate": 6.11724870358658e-06, + "loss": 0.8301, + "step": 15441 + }, + { + "epoch": 0.7519294913933727, + "grad_norm": 1.3166899681091309, + "learning_rate": 6.114978228631558e-06, + "loss": 0.7807, + "step": 15442 + }, + { + "epoch": 0.7519781851824799, + "grad_norm": 1.9867194890975952, + "learning_rate": 6.112708099067719e-06, + "loss": 0.7571, + "step": 15443 + }, + { + "epoch": 0.7520268789715872, + "grad_norm": 1.9656956195831299, + "learning_rate": 6.110438314951532e-06, + "loss": 0.7879, + "step": 15444 + }, + { + "epoch": 0.7520755727606944, + "grad_norm": 1.7836365699768066, + "learning_rate": 6.108168876339457e-06, + "loss": 0.8579, + "step": 15445 + }, + { + "epoch": 0.7521242665498016, + "grad_norm": 1.670946717262268, + "learning_rate": 6.10589978328795e-06, + "loss": 0.8492, + "step": 15446 + }, + { + "epoch": 0.7521729603389088, + "grad_norm": 1.6937401294708252, + "learning_rate": 6.103631035853452e-06, + "loss": 0.903, + "step": 15447 + }, + { + "epoch": 0.752221654128016, + "grad_norm": 3.2812910079956055, + "learning_rate": 6.101362634092391e-06, + "loss": 0.8348, + "step": 15448 + }, + { + "epoch": 0.7522703479171232, + "grad_norm": 1.7272425889968872, + "learning_rate": 6.099094578061218e-06, + "loss": 0.7684, + "step": 15449 + }, + { + "epoch": 0.7523190417062303, + "grad_norm": 1.4758563041687012, + "learning_rate": 6.096826867816319e-06, + "loss": 0.825, + "step": 15450 + }, + { + "epoch": 0.7523677354953375, + "grad_norm": 13.545467376708984, + "learning_rate": 6.094559503414128e-06, + "loss": 0.8207, + "step": 15451 + }, + { + "epoch": 0.7524164292844447, + "grad_norm": 1.6799286603927612, + "learning_rate": 6.092292484911035e-06, + "loss": 0.7292, + "step": 15452 + }, + { + "epoch": 0.752465123073552, + "grad_norm": 1.5766870975494385, + "learning_rate": 6.090025812363436e-06, + "loss": 0.8562, + "step": 15453 + }, + { + "epoch": 0.7525138168626592, + "grad_norm": 1.6428829431533813, + "learning_rate": 6.087759485827711e-06, + "loss": 0.7384, + "step": 15454 + }, + { + "epoch": 0.7525625106517664, + "grad_norm": 2.076953649520874, + "learning_rate": 6.085493505360234e-06, + "loss": 0.8194, + "step": 15455 + }, + { + "epoch": 0.7526112044408736, + "grad_norm": 1.5409185886383057, + "learning_rate": 6.083227871017385e-06, + "loss": 0.8039, + "step": 15456 + }, + { + "epoch": 0.7526598982299808, + "grad_norm": 1.5205278396606445, + "learning_rate": 6.080962582855501e-06, + "loss": 0.797, + "step": 15457 + }, + { + "epoch": 0.752708592019088, + "grad_norm": 4.450541019439697, + "learning_rate": 6.0786976409309526e-06, + "loss": 0.7345, + "step": 15458 + }, + { + "epoch": 0.7527572858081951, + "grad_norm": 1.220745325088501, + "learning_rate": 6.076433045300058e-06, + "loss": 0.7883, + "step": 15459 + }, + { + "epoch": 0.7528059795973023, + "grad_norm": 1.5572112798690796, + "learning_rate": 6.074168796019167e-06, + "loss": 0.8664, + "step": 15460 + }, + { + "epoch": 0.7528546733864095, + "grad_norm": 2.443739414215088, + "learning_rate": 6.071904893144595e-06, + "loss": 0.8562, + "step": 15461 + }, + { + "epoch": 0.7529033671755168, + "grad_norm": 1.5968226194381714, + "learning_rate": 6.069641336732654e-06, + "loss": 0.747, + "step": 15462 + }, + { + "epoch": 0.752952060964624, + "grad_norm": 1.7699781656265259, + "learning_rate": 6.067378126839669e-06, + "loss": 0.8358, + "step": 15463 + }, + { + "epoch": 0.7530007547537312, + "grad_norm": 2.08642315864563, + "learning_rate": 6.06511526352191e-06, + "loss": 0.82, + "step": 15464 + }, + { + "epoch": 0.7530494485428384, + "grad_norm": 1.5481584072113037, + "learning_rate": 6.062852746835692e-06, + "loss": 0.7326, + "step": 15465 + }, + { + "epoch": 0.7530981423319456, + "grad_norm": 1.5264825820922852, + "learning_rate": 6.060590576837269e-06, + "loss": 0.8019, + "step": 15466 + }, + { + "epoch": 0.7531468361210527, + "grad_norm": 1.6539031267166138, + "learning_rate": 6.058328753582934e-06, + "loss": 0.7817, + "step": 15467 + }, + { + "epoch": 0.7531955299101599, + "grad_norm": 1.7882206439971924, + "learning_rate": 6.0560672771289406e-06, + "loss": 0.8742, + "step": 15468 + }, + { + "epoch": 0.7532442236992671, + "grad_norm": 1.6708265542984009, + "learning_rate": 6.053806147531547e-06, + "loss": 0.8378, + "step": 15469 + }, + { + "epoch": 0.7532929174883743, + "grad_norm": 1.6839607954025269, + "learning_rate": 6.051545364846996e-06, + "loss": 0.7757, + "step": 15470 + }, + { + "epoch": 0.7533416112774816, + "grad_norm": 1.5852001905441284, + "learning_rate": 6.0492849291315185e-06, + "loss": 0.8389, + "step": 15471 + }, + { + "epoch": 0.7533903050665888, + "grad_norm": 2.1599905490875244, + "learning_rate": 6.0470248404413645e-06, + "loss": 0.7468, + "step": 15472 + }, + { + "epoch": 0.753438998855696, + "grad_norm": 1.4721617698669434, + "learning_rate": 6.044765098832725e-06, + "loss": 0.8664, + "step": 15473 + }, + { + "epoch": 0.7534876926448032, + "grad_norm": 3.11031436920166, + "learning_rate": 6.042505704361836e-06, + "loss": 0.8964, + "step": 15474 + }, + { + "epoch": 0.7535363864339104, + "grad_norm": 1.158336877822876, + "learning_rate": 6.0402466570848875e-06, + "loss": 0.7843, + "step": 15475 + }, + { + "epoch": 0.7535850802230175, + "grad_norm": 2.268573045730591, + "learning_rate": 6.0379879570580775e-06, + "loss": 0.827, + "step": 15476 + }, + { + "epoch": 0.7536337740121247, + "grad_norm": 3.2472610473632812, + "learning_rate": 6.03572960433759e-06, + "loss": 0.7934, + "step": 15477 + }, + { + "epoch": 0.7536824678012319, + "grad_norm": 1.8292187452316284, + "learning_rate": 6.033471598979601e-06, + "loss": 0.8263, + "step": 15478 + }, + { + "epoch": 0.7537311615903391, + "grad_norm": 2.0906338691711426, + "learning_rate": 6.031213941040279e-06, + "loss": 0.7729, + "step": 15479 + }, + { + "epoch": 0.7537798553794464, + "grad_norm": 1.3435969352722168, + "learning_rate": 6.02895663057578e-06, + "loss": 0.7945, + "step": 15480 + }, + { + "epoch": 0.7538285491685536, + "grad_norm": 1.4208906888961792, + "learning_rate": 6.026699667642269e-06, + "loss": 0.8167, + "step": 15481 + }, + { + "epoch": 0.7538772429576608, + "grad_norm": 1.170949101448059, + "learning_rate": 6.024443052295865e-06, + "loss": 0.7778, + "step": 15482 + }, + { + "epoch": 0.753925936746768, + "grad_norm": 1.3813860416412354, + "learning_rate": 6.022186784592723e-06, + "loss": 0.8594, + "step": 15483 + }, + { + "epoch": 0.7539746305358751, + "grad_norm": 1.2817364931106567, + "learning_rate": 6.019930864588956e-06, + "loss": 0.7817, + "step": 15484 + }, + { + "epoch": 0.7540233243249823, + "grad_norm": 1.4966638088226318, + "learning_rate": 6.017675292340686e-06, + "loss": 0.7974, + "step": 15485 + }, + { + "epoch": 0.7540720181140895, + "grad_norm": 1.4226664304733276, + "learning_rate": 6.015420067904016e-06, + "loss": 0.8165, + "step": 15486 + }, + { + "epoch": 0.7541207119031967, + "grad_norm": 0.09450238198041916, + "learning_rate": 6.013165191335049e-06, + "loss": 0.5707, + "step": 15487 + }, + { + "epoch": 0.754169405692304, + "grad_norm": 2.063086986541748, + "learning_rate": 6.0109106626898705e-06, + "loss": 0.8624, + "step": 15488 + }, + { + "epoch": 0.7542180994814112, + "grad_norm": 1.8524330854415894, + "learning_rate": 6.0086564820245666e-06, + "loss": 0.7795, + "step": 15489 + }, + { + "epoch": 0.7542667932705184, + "grad_norm": 1.4630308151245117, + "learning_rate": 6.006402649395202e-06, + "loss": 0.8507, + "step": 15490 + }, + { + "epoch": 0.7543154870596256, + "grad_norm": 2.5815558433532715, + "learning_rate": 6.004149164857853e-06, + "loss": 0.8063, + "step": 15491 + }, + { + "epoch": 0.7543641808487327, + "grad_norm": 1.2814863920211792, + "learning_rate": 6.00189602846857e-06, + "loss": 0.8763, + "step": 15492 + }, + { + "epoch": 0.7544128746378399, + "grad_norm": 1.4592629671096802, + "learning_rate": 5.999643240283399e-06, + "loss": 0.8218, + "step": 15493 + }, + { + "epoch": 0.7544615684269471, + "grad_norm": 1.6384121179580688, + "learning_rate": 5.9973908003583805e-06, + "loss": 0.8356, + "step": 15494 + }, + { + "epoch": 0.7545102622160543, + "grad_norm": 1.6695055961608887, + "learning_rate": 5.995138708749542e-06, + "loss": 0.8217, + "step": 15495 + }, + { + "epoch": 0.7545589560051615, + "grad_norm": 1.708681344985962, + "learning_rate": 5.992886965512903e-06, + "loss": 0.744, + "step": 15496 + }, + { + "epoch": 0.7546076497942688, + "grad_norm": 1.5819272994995117, + "learning_rate": 5.990635570704473e-06, + "loss": 0.8231, + "step": 15497 + }, + { + "epoch": 0.754656343583376, + "grad_norm": 1.603238582611084, + "learning_rate": 5.988384524380273e-06, + "loss": 0.8179, + "step": 15498 + }, + { + "epoch": 0.7547050373724832, + "grad_norm": 1.7343238592147827, + "learning_rate": 5.986133826596275e-06, + "loss": 0.7036, + "step": 15499 + }, + { + "epoch": 0.7547537311615904, + "grad_norm": 2.4209649562835693, + "learning_rate": 5.983883477408479e-06, + "loss": 0.8632, + "step": 15500 + }, + { + "epoch": 0.7548024249506975, + "grad_norm": 1.7007195949554443, + "learning_rate": 5.981633476872861e-06, + "loss": 0.7839, + "step": 15501 + }, + { + "epoch": 0.7548511187398047, + "grad_norm": 1.6089261770248413, + "learning_rate": 5.979383825045388e-06, + "loss": 0.7664, + "step": 15502 + }, + { + "epoch": 0.7548998125289119, + "grad_norm": 2.148073196411133, + "learning_rate": 5.97713452198202e-06, + "loss": 0.8339, + "step": 15503 + }, + { + "epoch": 0.7549485063180191, + "grad_norm": 1.428356647491455, + "learning_rate": 5.974885567738711e-06, + "loss": 0.8058, + "step": 15504 + }, + { + "epoch": 0.7549972001071263, + "grad_norm": 1.9101109504699707, + "learning_rate": 5.9726369623714005e-06, + "loss": 0.8219, + "step": 15505 + }, + { + "epoch": 0.7550458938962336, + "grad_norm": 1.4236619472503662, + "learning_rate": 5.9703887059360185e-06, + "loss": 0.8847, + "step": 15506 + }, + { + "epoch": 0.7550945876853408, + "grad_norm": 1.9363484382629395, + "learning_rate": 5.968140798488511e-06, + "loss": 0.7603, + "step": 15507 + }, + { + "epoch": 0.755143281474448, + "grad_norm": 2.3825292587280273, + "learning_rate": 5.965893240084766e-06, + "loss": 0.8278, + "step": 15508 + }, + { + "epoch": 0.7551919752635551, + "grad_norm": 1.2361165285110474, + "learning_rate": 5.963646030780719e-06, + "loss": 0.678, + "step": 15509 + }, + { + "epoch": 0.7552406690526623, + "grad_norm": 1.5643351078033447, + "learning_rate": 5.961399170632245e-06, + "loss": 0.8224, + "step": 15510 + }, + { + "epoch": 0.7552893628417695, + "grad_norm": 1.4310286045074463, + "learning_rate": 5.959152659695253e-06, + "loss": 0.7238, + "step": 15511 + }, + { + "epoch": 0.7553380566308767, + "grad_norm": 2.295807123184204, + "learning_rate": 5.956906498025619e-06, + "loss": 0.8339, + "step": 15512 + }, + { + "epoch": 0.7553867504199839, + "grad_norm": 2.4781882762908936, + "learning_rate": 5.954660685679208e-06, + "loss": 0.8084, + "step": 15513 + }, + { + "epoch": 0.7554354442090911, + "grad_norm": 1.572701096534729, + "learning_rate": 5.952415222711907e-06, + "loss": 0.7077, + "step": 15514 + }, + { + "epoch": 0.7554841379981984, + "grad_norm": 0.10393436998128891, + "learning_rate": 5.950170109179545e-06, + "loss": 0.6363, + "step": 15515 + }, + { + "epoch": 0.7555328317873056, + "grad_norm": 1.572540283203125, + "learning_rate": 5.947925345137997e-06, + "loss": 0.8276, + "step": 15516 + }, + { + "epoch": 0.7555815255764128, + "grad_norm": 1.6663652658462524, + "learning_rate": 5.945680930643074e-06, + "loss": 0.8979, + "step": 15517 + }, + { + "epoch": 0.7556302193655199, + "grad_norm": 1.4841594696044922, + "learning_rate": 5.9434368657506244e-06, + "loss": 0.7643, + "step": 15518 + }, + { + "epoch": 0.7556789131546271, + "grad_norm": 1.3468077182769775, + "learning_rate": 5.941193150516465e-06, + "loss": 0.7929, + "step": 15519 + }, + { + "epoch": 0.7557276069437343, + "grad_norm": 2.047935724258423, + "learning_rate": 5.9389497849964015e-06, + "loss": 0.9007, + "step": 15520 + }, + { + "epoch": 0.7557763007328415, + "grad_norm": 1.5816859006881714, + "learning_rate": 5.936706769246257e-06, + "loss": 0.839, + "step": 15521 + }, + { + "epoch": 0.7558249945219487, + "grad_norm": 3.369624376296997, + "learning_rate": 5.9344641033218e-06, + "loss": 0.6969, + "step": 15522 + }, + { + "epoch": 0.755873688311056, + "grad_norm": 1.686108946800232, + "learning_rate": 5.932221787278844e-06, + "loss": 0.7293, + "step": 15523 + }, + { + "epoch": 0.7559223821001632, + "grad_norm": 2.000833749771118, + "learning_rate": 5.9299798211731415e-06, + "loss": 0.7358, + "step": 15524 + }, + { + "epoch": 0.7559710758892704, + "grad_norm": 1.761849045753479, + "learning_rate": 5.92773820506048e-06, + "loss": 0.8171, + "step": 15525 + }, + { + "epoch": 0.7560197696783775, + "grad_norm": 1.688148856163025, + "learning_rate": 5.925496938996613e-06, + "loss": 0.8731, + "step": 15526 + }, + { + "epoch": 0.7560684634674847, + "grad_norm": 2.326913595199585, + "learning_rate": 5.923256023037294e-06, + "loss": 0.8368, + "step": 15527 + }, + { + "epoch": 0.7561171572565919, + "grad_norm": 1.3915914297103882, + "learning_rate": 5.921015457238266e-06, + "loss": 0.8095, + "step": 15528 + }, + { + "epoch": 0.7561658510456991, + "grad_norm": 2.115247964859009, + "learning_rate": 5.918775241655259e-06, + "loss": 0.8536, + "step": 15529 + }, + { + "epoch": 0.7562145448348063, + "grad_norm": 1.7067930698394775, + "learning_rate": 5.916535376344003e-06, + "loss": 0.8088, + "step": 15530 + }, + { + "epoch": 0.7562632386239135, + "grad_norm": 1.6486691236495972, + "learning_rate": 5.914295861360209e-06, + "loss": 0.8159, + "step": 15531 + }, + { + "epoch": 0.7563119324130207, + "grad_norm": 1.1208609342575073, + "learning_rate": 5.912056696759601e-06, + "loss": 0.9351, + "step": 15532 + }, + { + "epoch": 0.756360626202128, + "grad_norm": 1.4896544218063354, + "learning_rate": 5.9098178825978545e-06, + "loss": 0.8783, + "step": 15533 + }, + { + "epoch": 0.7564093199912351, + "grad_norm": 1.1674134731292725, + "learning_rate": 5.9075794189306805e-06, + "loss": 0.7571, + "step": 15534 + }, + { + "epoch": 0.7564580137803423, + "grad_norm": 1.844745397567749, + "learning_rate": 5.9053413058137524e-06, + "loss": 0.7542, + "step": 15535 + }, + { + "epoch": 0.7565067075694495, + "grad_norm": 1.7218414545059204, + "learning_rate": 5.903103543302744e-06, + "loss": 0.7893, + "step": 15536 + }, + { + "epoch": 0.7565554013585567, + "grad_norm": 1.7070441246032715, + "learning_rate": 5.90086613145332e-06, + "loss": 0.7737, + "step": 15537 + }, + { + "epoch": 0.7566040951476639, + "grad_norm": 1.2666510343551636, + "learning_rate": 5.898629070321136e-06, + "loss": 0.8785, + "step": 15538 + }, + { + "epoch": 0.7566527889367711, + "grad_norm": 1.2808767557144165, + "learning_rate": 5.89639235996184e-06, + "loss": 0.8971, + "step": 15539 + }, + { + "epoch": 0.7567014827258783, + "grad_norm": 1.8592987060546875, + "learning_rate": 5.894156000431066e-06, + "loss": 0.7758, + "step": 15540 + }, + { + "epoch": 0.7567501765149856, + "grad_norm": 1.7237417697906494, + "learning_rate": 5.891919991784451e-06, + "loss": 0.7868, + "step": 15541 + }, + { + "epoch": 0.7567988703040928, + "grad_norm": 1.6856824159622192, + "learning_rate": 5.889684334077614e-06, + "loss": 0.7108, + "step": 15542 + }, + { + "epoch": 0.7568475640931999, + "grad_norm": 1.7133677005767822, + "learning_rate": 5.887449027366165e-06, + "loss": 0.7656, + "step": 15543 + }, + { + "epoch": 0.7568962578823071, + "grad_norm": 2.5719451904296875, + "learning_rate": 5.88521407170571e-06, + "loss": 0.8936, + "step": 15544 + }, + { + "epoch": 0.7569449516714143, + "grad_norm": 0.10049257427453995, + "learning_rate": 5.8829794671518395e-06, + "loss": 0.5715, + "step": 15545 + }, + { + "epoch": 0.7569936454605215, + "grad_norm": 1.6254122257232666, + "learning_rate": 5.880745213760142e-06, + "loss": 0.8052, + "step": 15546 + }, + { + "epoch": 0.7570423392496287, + "grad_norm": 1.640599012374878, + "learning_rate": 5.8785113115861965e-06, + "loss": 0.8078, + "step": 15547 + }, + { + "epoch": 0.7570910330387359, + "grad_norm": 3.6305394172668457, + "learning_rate": 5.876277760685563e-06, + "loss": 0.7662, + "step": 15548 + }, + { + "epoch": 0.7571397268278431, + "grad_norm": 1.2510963678359985, + "learning_rate": 5.874044561113812e-06, + "loss": 0.8389, + "step": 15549 + }, + { + "epoch": 0.7571884206169504, + "grad_norm": 1.5107266902923584, + "learning_rate": 5.871811712926492e-06, + "loss": 0.8291, + "step": 15550 + }, + { + "epoch": 0.7572371144060575, + "grad_norm": 1.5770868062973022, + "learning_rate": 5.869579216179145e-06, + "loss": 0.8332, + "step": 15551 + }, + { + "epoch": 0.7572858081951647, + "grad_norm": 1.3772178888320923, + "learning_rate": 5.867347070927301e-06, + "loss": 0.7919, + "step": 15552 + }, + { + "epoch": 0.7573345019842719, + "grad_norm": 1.7888590097427368, + "learning_rate": 5.86511527722649e-06, + "loss": 0.792, + "step": 15553 + }, + { + "epoch": 0.7573831957733791, + "grad_norm": 1.839539647102356, + "learning_rate": 5.862883835132223e-06, + "loss": 0.7801, + "step": 15554 + }, + { + "epoch": 0.7574318895624863, + "grad_norm": 1.8502542972564697, + "learning_rate": 5.86065274470001e-06, + "loss": 0.778, + "step": 15555 + }, + { + "epoch": 0.7574805833515935, + "grad_norm": 1.4131278991699219, + "learning_rate": 5.858422005985349e-06, + "loss": 0.8618, + "step": 15556 + }, + { + "epoch": 0.7575292771407007, + "grad_norm": 1.558354139328003, + "learning_rate": 5.856191619043725e-06, + "loss": 0.8389, + "step": 15557 + }, + { + "epoch": 0.7575779709298079, + "grad_norm": 2.3437063694000244, + "learning_rate": 5.853961583930634e-06, + "loss": 0.8391, + "step": 15558 + }, + { + "epoch": 0.7576266647189152, + "grad_norm": 1.3734935522079468, + "learning_rate": 5.851731900701529e-06, + "loss": 0.9123, + "step": 15559 + }, + { + "epoch": 0.7576753585080223, + "grad_norm": 1.54616117477417, + "learning_rate": 5.849502569411887e-06, + "loss": 0.8052, + "step": 15560 + }, + { + "epoch": 0.7577240522971295, + "grad_norm": 1.9277701377868652, + "learning_rate": 5.84727359011716e-06, + "loss": 0.7884, + "step": 15561 + }, + { + "epoch": 0.7577727460862367, + "grad_norm": 1.3591877222061157, + "learning_rate": 5.845044962872792e-06, + "loss": 0.8385, + "step": 15562 + }, + { + "epoch": 0.7578214398753439, + "grad_norm": 1.626243233680725, + "learning_rate": 5.842816687734221e-06, + "loss": 0.8375, + "step": 15563 + }, + { + "epoch": 0.7578701336644511, + "grad_norm": 1.4514405727386475, + "learning_rate": 5.840588764756871e-06, + "loss": 0.7694, + "step": 15564 + }, + { + "epoch": 0.7579188274535583, + "grad_norm": 1.681280493736267, + "learning_rate": 5.8383611939961785e-06, + "loss": 0.909, + "step": 15565 + }, + { + "epoch": 0.7579675212426655, + "grad_norm": 1.2267178297042847, + "learning_rate": 5.836133975507528e-06, + "loss": 0.8919, + "step": 15566 + }, + { + "epoch": 0.7580162150317727, + "grad_norm": 2.011291265487671, + "learning_rate": 5.8339071093463505e-06, + "loss": 0.7473, + "step": 15567 + }, + { + "epoch": 0.7580649088208798, + "grad_norm": 1.0982329845428467, + "learning_rate": 5.831680595568012e-06, + "loss": 0.7172, + "step": 15568 + }, + { + "epoch": 0.7581136026099871, + "grad_norm": 1.819079875946045, + "learning_rate": 5.829454434227919e-06, + "loss": 0.8689, + "step": 15569 + }, + { + "epoch": 0.7581622963990943, + "grad_norm": 1.3624166250228882, + "learning_rate": 5.827228625381436e-06, + "loss": 0.9013, + "step": 15570 + }, + { + "epoch": 0.7582109901882015, + "grad_norm": 1.4916599988937378, + "learning_rate": 5.825003169083929e-06, + "loss": 0.8346, + "step": 15571 + }, + { + "epoch": 0.7582596839773087, + "grad_norm": 1.5294336080551147, + "learning_rate": 5.822778065390771e-06, + "loss": 0.8625, + "step": 15572 + }, + { + "epoch": 0.7583083777664159, + "grad_norm": 1.8054956197738647, + "learning_rate": 5.820553314357291e-06, + "loss": 0.7922, + "step": 15573 + }, + { + "epoch": 0.7583570715555231, + "grad_norm": 1.5591249465942383, + "learning_rate": 5.818328916038851e-06, + "loss": 0.737, + "step": 15574 + }, + { + "epoch": 0.7584057653446303, + "grad_norm": 0.09428554773330688, + "learning_rate": 5.81610487049076e-06, + "loss": 0.5261, + "step": 15575 + }, + { + "epoch": 0.7584544591337375, + "grad_norm": 1.7516974210739136, + "learning_rate": 5.8138811777683615e-06, + "loss": 0.809, + "step": 15576 + }, + { + "epoch": 0.7585031529228446, + "grad_norm": 1.8659497499465942, + "learning_rate": 5.811657837926961e-06, + "loss": 0.854, + "step": 15577 + }, + { + "epoch": 0.7585518467119519, + "grad_norm": 1.166155219078064, + "learning_rate": 5.809434851021865e-06, + "loss": 0.8015, + "step": 15578 + }, + { + "epoch": 0.7586005405010591, + "grad_norm": 1.2885595560073853, + "learning_rate": 5.807212217108371e-06, + "loss": 0.8906, + "step": 15579 + }, + { + "epoch": 0.7586492342901663, + "grad_norm": 1.628416895866394, + "learning_rate": 5.80498993624176e-06, + "loss": 0.8766, + "step": 15580 + }, + { + "epoch": 0.7586979280792735, + "grad_norm": 1.3506618738174438, + "learning_rate": 5.802768008477333e-06, + "loss": 0.8014, + "step": 15581 + }, + { + "epoch": 0.7587466218683807, + "grad_norm": 1.331502079963684, + "learning_rate": 5.800546433870333e-06, + "loss": 0.8884, + "step": 15582 + }, + { + "epoch": 0.7587953156574879, + "grad_norm": 1.6007603406906128, + "learning_rate": 5.798325212476041e-06, + "loss": 0.7349, + "step": 15583 + }, + { + "epoch": 0.7588440094465951, + "grad_norm": 0.09210670739412308, + "learning_rate": 5.796104344349704e-06, + "loss": 0.5426, + "step": 15584 + }, + { + "epoch": 0.7588927032357022, + "grad_norm": 1.6894508600234985, + "learning_rate": 5.793883829546567e-06, + "loss": 0.732, + "step": 15585 + }, + { + "epoch": 0.7589413970248094, + "grad_norm": 2.0187618732452393, + "learning_rate": 5.791663668121865e-06, + "loss": 0.7437, + "step": 15586 + }, + { + "epoch": 0.7589900908139167, + "grad_norm": 1.5872493982315063, + "learning_rate": 5.789443860130825e-06, + "loss": 0.9007, + "step": 15587 + }, + { + "epoch": 0.7590387846030239, + "grad_norm": 1.291635513305664, + "learning_rate": 5.787224405628664e-06, + "loss": 0.7647, + "step": 15588 + }, + { + "epoch": 0.7590874783921311, + "grad_norm": 1.813177227973938, + "learning_rate": 5.785005304670585e-06, + "loss": 0.8164, + "step": 15589 + }, + { + "epoch": 0.7591361721812383, + "grad_norm": 1.5364965200424194, + "learning_rate": 5.78278655731181e-06, + "loss": 0.921, + "step": 15590 + }, + { + "epoch": 0.7591848659703455, + "grad_norm": 1.4881573915481567, + "learning_rate": 5.7805681636075005e-06, + "loss": 0.795, + "step": 15591 + }, + { + "epoch": 0.7592335597594527, + "grad_norm": 1.6060945987701416, + "learning_rate": 5.778350123612864e-06, + "loss": 0.7857, + "step": 15592 + }, + { + "epoch": 0.7592822535485598, + "grad_norm": 1.573770523071289, + "learning_rate": 5.776132437383062e-06, + "loss": 0.8109, + "step": 15593 + }, + { + "epoch": 0.759330947337667, + "grad_norm": 2.866950511932373, + "learning_rate": 5.7739151049732625e-06, + "loss": 0.7909, + "step": 15594 + }, + { + "epoch": 0.7593796411267743, + "grad_norm": 2.404465436935425, + "learning_rate": 5.771698126438623e-06, + "loss": 0.8897, + "step": 15595 + }, + { + "epoch": 0.7594283349158815, + "grad_norm": 1.4828988313674927, + "learning_rate": 5.769481501834289e-06, + "loss": 0.8806, + "step": 15596 + }, + { + "epoch": 0.7594770287049887, + "grad_norm": 1.2558759450912476, + "learning_rate": 5.7672652312154e-06, + "loss": 0.7988, + "step": 15597 + }, + { + "epoch": 0.7595257224940959, + "grad_norm": 2.331430435180664, + "learning_rate": 5.765049314637088e-06, + "loss": 0.7919, + "step": 15598 + }, + { + "epoch": 0.7595744162832031, + "grad_norm": 2.024811267852783, + "learning_rate": 5.762833752154464e-06, + "loss": 0.8157, + "step": 15599 + }, + { + "epoch": 0.7596231100723103, + "grad_norm": 0.10999195277690887, + "learning_rate": 5.760618543822656e-06, + "loss": 0.645, + "step": 15600 + }, + { + "epoch": 0.7596718038614175, + "grad_norm": 1.6377540826797485, + "learning_rate": 5.758403689696759e-06, + "loss": 0.8161, + "step": 15601 + }, + { + "epoch": 0.7597204976505246, + "grad_norm": 12.893399238586426, + "learning_rate": 5.756189189831869e-06, + "loss": 0.7843, + "step": 15602 + }, + { + "epoch": 0.7597691914396318, + "grad_norm": 1.5362001657485962, + "learning_rate": 5.753975044283071e-06, + "loss": 0.9028, + "step": 15603 + }, + { + "epoch": 0.759817885228739, + "grad_norm": 4.172022342681885, + "learning_rate": 5.751761253105443e-06, + "loss": 0.8143, + "step": 15604 + }, + { + "epoch": 0.7598665790178463, + "grad_norm": 0.09456391632556915, + "learning_rate": 5.749547816354053e-06, + "loss": 0.6143, + "step": 15605 + }, + { + "epoch": 0.7599152728069535, + "grad_norm": 1.6081064939498901, + "learning_rate": 5.747334734083956e-06, + "loss": 0.8465, + "step": 15606 + }, + { + "epoch": 0.7599639665960607, + "grad_norm": 1.951603889465332, + "learning_rate": 5.745122006350217e-06, + "loss": 0.8059, + "step": 15607 + }, + { + "epoch": 0.7600126603851679, + "grad_norm": 1.7359435558319092, + "learning_rate": 5.742909633207856e-06, + "loss": 0.7492, + "step": 15608 + }, + { + "epoch": 0.7600613541742751, + "grad_norm": 1.892486333847046, + "learning_rate": 5.740697614711923e-06, + "loss": 0.8063, + "step": 15609 + }, + { + "epoch": 0.7601100479633822, + "grad_norm": 1.5034949779510498, + "learning_rate": 5.738485950917439e-06, + "loss": 0.7985, + "step": 15610 + }, + { + "epoch": 0.7601587417524894, + "grad_norm": 1.5125925540924072, + "learning_rate": 5.736274641879416e-06, + "loss": 0.74, + "step": 15611 + }, + { + "epoch": 0.7602074355415966, + "grad_norm": 1.7143940925598145, + "learning_rate": 5.7340636876528625e-06, + "loss": 0.8392, + "step": 15612 + }, + { + "epoch": 0.7602561293307039, + "grad_norm": 2.899648427963257, + "learning_rate": 5.7318530882927755e-06, + "loss": 0.8405, + "step": 15613 + }, + { + "epoch": 0.7603048231198111, + "grad_norm": 1.6501774787902832, + "learning_rate": 5.7296428438541445e-06, + "loss": 0.7757, + "step": 15614 + }, + { + "epoch": 0.7603535169089183, + "grad_norm": 1.6881051063537598, + "learning_rate": 5.72743295439194e-06, + "loss": 0.7597, + "step": 15615 + }, + { + "epoch": 0.7604022106980255, + "grad_norm": 1.137714147567749, + "learning_rate": 5.7252234199611564e-06, + "loss": 0.7758, + "step": 15616 + }, + { + "epoch": 0.7604509044871327, + "grad_norm": 1.4879802465438843, + "learning_rate": 5.723014240616729e-06, + "loss": 0.8073, + "step": 15617 + }, + { + "epoch": 0.7604995982762399, + "grad_norm": 0.09445402771234512, + "learning_rate": 5.720805416413637e-06, + "loss": 0.6119, + "step": 15618 + }, + { + "epoch": 0.760548292065347, + "grad_norm": 1.9155462980270386, + "learning_rate": 5.7185969474068e-06, + "loss": 0.8437, + "step": 15619 + }, + { + "epoch": 0.7605969858544542, + "grad_norm": 5.165453910827637, + "learning_rate": 5.716388833651172e-06, + "loss": 0.876, + "step": 15620 + }, + { + "epoch": 0.7606456796435614, + "grad_norm": 1.797280192375183, + "learning_rate": 5.714181075201672e-06, + "loss": 0.7118, + "step": 15621 + }, + { + "epoch": 0.7606943734326687, + "grad_norm": 2.04296612739563, + "learning_rate": 5.711973672113218e-06, + "loss": 0.875, + "step": 15622 + }, + { + "epoch": 0.7607430672217759, + "grad_norm": 1.5257240533828735, + "learning_rate": 5.7097666244407316e-06, + "loss": 0.8095, + "step": 15623 + }, + { + "epoch": 0.7607917610108831, + "grad_norm": 1.453694224357605, + "learning_rate": 5.70755993223909e-06, + "loss": 0.84, + "step": 15624 + }, + { + "epoch": 0.7608404547999903, + "grad_norm": 0.09821323305368423, + "learning_rate": 5.7053535955632125e-06, + "loss": 0.5802, + "step": 15625 + }, + { + "epoch": 0.7608891485890975, + "grad_norm": 1.6412044763565063, + "learning_rate": 5.703147614467954e-06, + "loss": 0.7767, + "step": 15626 + }, + { + "epoch": 0.7609378423782046, + "grad_norm": 1.4052176475524902, + "learning_rate": 5.700941989008211e-06, + "loss": 0.8163, + "step": 15627 + }, + { + "epoch": 0.7609865361673118, + "grad_norm": 2.201585531234741, + "learning_rate": 5.698736719238838e-06, + "loss": 0.7994, + "step": 15628 + }, + { + "epoch": 0.761035229956419, + "grad_norm": 1.6831532716751099, + "learning_rate": 5.696531805214684e-06, + "loss": 0.8636, + "step": 15629 + }, + { + "epoch": 0.7610839237455262, + "grad_norm": 1.7487685680389404, + "learning_rate": 5.694327246990621e-06, + "loss": 0.8576, + "step": 15630 + }, + { + "epoch": 0.7611326175346335, + "grad_norm": 2.2427167892456055, + "learning_rate": 5.692123044621461e-06, + "loss": 0.8016, + "step": 15631 + }, + { + "epoch": 0.7611813113237407, + "grad_norm": 1.3896197080612183, + "learning_rate": 5.689919198162055e-06, + "loss": 0.7978, + "step": 15632 + }, + { + "epoch": 0.7612300051128479, + "grad_norm": 1.8007304668426514, + "learning_rate": 5.6877157076672e-06, + "loss": 0.7738, + "step": 15633 + }, + { + "epoch": 0.7612786989019551, + "grad_norm": 1.3237760066986084, + "learning_rate": 5.685512573191729e-06, + "loss": 0.6688, + "step": 15634 + }, + { + "epoch": 0.7613273926910623, + "grad_norm": 1.6457949876785278, + "learning_rate": 5.683309794790439e-06, + "loss": 0.8181, + "step": 15635 + }, + { + "epoch": 0.7613760864801694, + "grad_norm": 2.7577505111694336, + "learning_rate": 5.681107372518124e-06, + "loss": 0.8588, + "step": 15636 + }, + { + "epoch": 0.7614247802692766, + "grad_norm": 1.9197014570236206, + "learning_rate": 5.678905306429567e-06, + "loss": 0.7689, + "step": 15637 + }, + { + "epoch": 0.7614734740583838, + "grad_norm": 2.2455310821533203, + "learning_rate": 5.67670359657954e-06, + "loss": 0.802, + "step": 15638 + }, + { + "epoch": 0.761522167847491, + "grad_norm": 1.596343994140625, + "learning_rate": 5.67450224302283e-06, + "loss": 0.7737, + "step": 15639 + }, + { + "epoch": 0.7615708616365983, + "grad_norm": 1.3701199293136597, + "learning_rate": 5.672301245814169e-06, + "loss": 0.8311, + "step": 15640 + }, + { + "epoch": 0.7616195554257055, + "grad_norm": 1.4512284994125366, + "learning_rate": 5.670100605008335e-06, + "loss": 0.7837, + "step": 15641 + }, + { + "epoch": 0.7616682492148127, + "grad_norm": 1.5280705690383911, + "learning_rate": 5.667900320660043e-06, + "loss": 0.8477, + "step": 15642 + }, + { + "epoch": 0.7617169430039199, + "grad_norm": 1.588181495666504, + "learning_rate": 5.665700392824041e-06, + "loss": 0.8764, + "step": 15643 + }, + { + "epoch": 0.761765636793027, + "grad_norm": 1.5283458232879639, + "learning_rate": 5.66350082155505e-06, + "loss": 0.7945, + "step": 15644 + }, + { + "epoch": 0.7618143305821342, + "grad_norm": 1.5752748250961304, + "learning_rate": 5.6613016069077834e-06, + "loss": 0.8781, + "step": 15645 + }, + { + "epoch": 0.7618630243712414, + "grad_norm": 1.613717794418335, + "learning_rate": 5.659102748936946e-06, + "loss": 0.7915, + "step": 15646 + }, + { + "epoch": 0.7619117181603486, + "grad_norm": 1.4661340713500977, + "learning_rate": 5.656904247697237e-06, + "loss": 0.8237, + "step": 15647 + }, + { + "epoch": 0.7619604119494559, + "grad_norm": 1.5089484453201294, + "learning_rate": 5.654706103243342e-06, + "loss": 0.9113, + "step": 15648 + }, + { + "epoch": 0.7620091057385631, + "grad_norm": 2.3165204524993896, + "learning_rate": 5.6525083156299345e-06, + "loss": 0.8942, + "step": 15649 + }, + { + "epoch": 0.7620577995276703, + "grad_norm": 2.9318082332611084, + "learning_rate": 5.650310884911696e-06, + "loss": 0.7615, + "step": 15650 + }, + { + "epoch": 0.7621064933167775, + "grad_norm": 2.0484437942504883, + "learning_rate": 5.648113811143283e-06, + "loss": 0.9197, + "step": 15651 + }, + { + "epoch": 0.7621551871058846, + "grad_norm": 0.09842539578676224, + "learning_rate": 5.645917094379347e-06, + "loss": 0.5988, + "step": 15652 + }, + { + "epoch": 0.7622038808949918, + "grad_norm": 1.4060100317001343, + "learning_rate": 5.643720734674534e-06, + "loss": 0.765, + "step": 15653 + }, + { + "epoch": 0.762252574684099, + "grad_norm": 1.9684902429580688, + "learning_rate": 5.641524732083474e-06, + "loss": 0.8332, + "step": 15654 + }, + { + "epoch": 0.7623012684732062, + "grad_norm": 1.556267261505127, + "learning_rate": 5.639329086660796e-06, + "loss": 0.7311, + "step": 15655 + }, + { + "epoch": 0.7623499622623134, + "grad_norm": 1.3114484548568726, + "learning_rate": 5.637133798461119e-06, + "loss": 0.8573, + "step": 15656 + }, + { + "epoch": 0.7623986560514207, + "grad_norm": 1.5325701236724854, + "learning_rate": 5.63493886753904e-06, + "loss": 0.8962, + "step": 15657 + }, + { + "epoch": 0.7624473498405279, + "grad_norm": 1.4207267761230469, + "learning_rate": 5.632744293949173e-06, + "loss": 0.7328, + "step": 15658 + }, + { + "epoch": 0.7624960436296351, + "grad_norm": 1.5349825620651245, + "learning_rate": 5.6305500777461e-06, + "loss": 0.831, + "step": 15659 + }, + { + "epoch": 0.7625447374187423, + "grad_norm": 1.5506043434143066, + "learning_rate": 5.628356218984405e-06, + "loss": 0.8346, + "step": 15660 + }, + { + "epoch": 0.7625934312078494, + "grad_norm": 1.252446174621582, + "learning_rate": 5.62616271771866e-06, + "loss": 0.8803, + "step": 15661 + }, + { + "epoch": 0.7626421249969566, + "grad_norm": 1.3637396097183228, + "learning_rate": 5.623969574003427e-06, + "loss": 0.8673, + "step": 15662 + }, + { + "epoch": 0.7626908187860638, + "grad_norm": 8.88510513305664, + "learning_rate": 5.621776787893263e-06, + "loss": 0.8111, + "step": 15663 + }, + { + "epoch": 0.762739512575171, + "grad_norm": 1.6271400451660156, + "learning_rate": 5.619584359442707e-06, + "loss": 0.7475, + "step": 15664 + }, + { + "epoch": 0.7627882063642782, + "grad_norm": 3.7089309692382812, + "learning_rate": 5.617392288706312e-06, + "loss": 0.877, + "step": 15665 + }, + { + "epoch": 0.7628369001533855, + "grad_norm": 5.005794048309326, + "learning_rate": 5.6152005757385815e-06, + "loss": 0.8025, + "step": 15666 + }, + { + "epoch": 0.7628855939424927, + "grad_norm": 2.3062031269073486, + "learning_rate": 5.6130092205940635e-06, + "loss": 0.8898, + "step": 15667 + }, + { + "epoch": 0.7629342877315999, + "grad_norm": 1.4833929538726807, + "learning_rate": 5.610818223327239e-06, + "loss": 0.8266, + "step": 15668 + }, + { + "epoch": 0.762982981520707, + "grad_norm": 1.8675999641418457, + "learning_rate": 5.6086275839926275e-06, + "loss": 0.8111, + "step": 15669 + }, + { + "epoch": 0.7630316753098142, + "grad_norm": 1.3381417989730835, + "learning_rate": 5.60643730264472e-06, + "loss": 0.8966, + "step": 15670 + }, + { + "epoch": 0.7630803690989214, + "grad_norm": 1.6522120237350464, + "learning_rate": 5.604247379337995e-06, + "loss": 0.807, + "step": 15671 + }, + { + "epoch": 0.7631290628880286, + "grad_norm": 1.501939296722412, + "learning_rate": 5.602057814126931e-06, + "loss": 0.8664, + "step": 15672 + }, + { + "epoch": 0.7631777566771358, + "grad_norm": 1.9300456047058105, + "learning_rate": 5.5998686070659835e-06, + "loss": 0.8573, + "step": 15673 + }, + { + "epoch": 0.763226450466243, + "grad_norm": 2.4477319717407227, + "learning_rate": 5.597679758209631e-06, + "loss": 0.8961, + "step": 15674 + }, + { + "epoch": 0.7632751442553503, + "grad_norm": 1.1851123571395874, + "learning_rate": 5.5954912676122945e-06, + "loss": 0.6832, + "step": 15675 + }, + { + "epoch": 0.7633238380444575, + "grad_norm": 1.5977803468704224, + "learning_rate": 5.593303135328438e-06, + "loss": 0.8306, + "step": 15676 + }, + { + "epoch": 0.7633725318335647, + "grad_norm": 1.836065649986267, + "learning_rate": 5.591115361412469e-06, + "loss": 0.8265, + "step": 15677 + }, + { + "epoch": 0.7634212256226718, + "grad_norm": 1.3071568012237549, + "learning_rate": 5.588927945918823e-06, + "loss": 0.843, + "step": 15678 + }, + { + "epoch": 0.763469919411779, + "grad_norm": 1.4224095344543457, + "learning_rate": 5.586740888901909e-06, + "loss": 0.8758, + "step": 15679 + }, + { + "epoch": 0.7635186132008862, + "grad_norm": 3.541356325149536, + "learning_rate": 5.584554190416125e-06, + "loss": 0.8043, + "step": 15680 + }, + { + "epoch": 0.7635673069899934, + "grad_norm": 1.5114436149597168, + "learning_rate": 5.582367850515882e-06, + "loss": 0.7097, + "step": 15681 + }, + { + "epoch": 0.7636160007791006, + "grad_norm": 1.4132332801818848, + "learning_rate": 5.5801818692555386e-06, + "loss": 0.8016, + "step": 15682 + }, + { + "epoch": 0.7636646945682078, + "grad_norm": 1.2081090211868286, + "learning_rate": 5.577996246689501e-06, + "loss": 0.8065, + "step": 15683 + }, + { + "epoch": 0.7637133883573151, + "grad_norm": 2.059377670288086, + "learning_rate": 5.575810982872107e-06, + "loss": 0.7749, + "step": 15684 + }, + { + "epoch": 0.7637620821464223, + "grad_norm": 1.395056962966919, + "learning_rate": 5.573626077857737e-06, + "loss": 0.7582, + "step": 15685 + }, + { + "epoch": 0.7638107759355294, + "grad_norm": 1.5422630310058594, + "learning_rate": 5.571441531700734e-06, + "loss": 0.8431, + "step": 15686 + }, + { + "epoch": 0.7638594697246366, + "grad_norm": 2.167618989944458, + "learning_rate": 5.569257344455439e-06, + "loss": 0.8679, + "step": 15687 + }, + { + "epoch": 0.7639081635137438, + "grad_norm": 1.528340220451355, + "learning_rate": 5.5670735161761825e-06, + "loss": 0.7271, + "step": 15688 + }, + { + "epoch": 0.763956857302851, + "grad_norm": 1.7393913269042969, + "learning_rate": 5.564890046917282e-06, + "loss": 0.794, + "step": 15689 + }, + { + "epoch": 0.7640055510919582, + "grad_norm": 0.09502211213111877, + "learning_rate": 5.562706936733069e-06, + "loss": 0.5907, + "step": 15690 + }, + { + "epoch": 0.7640542448810654, + "grad_norm": 1.3014044761657715, + "learning_rate": 5.560524185677827e-06, + "loss": 0.8491, + "step": 15691 + }, + { + "epoch": 0.7641029386701726, + "grad_norm": 0.10033630579710007, + "learning_rate": 5.558341793805868e-06, + "loss": 0.6521, + "step": 15692 + }, + { + "epoch": 0.7641516324592799, + "grad_norm": 1.5872722864151, + "learning_rate": 5.556159761171474e-06, + "loss": 0.8074, + "step": 15693 + }, + { + "epoch": 0.764200326248387, + "grad_norm": 1.3291964530944824, + "learning_rate": 5.553978087828922e-06, + "loss": 0.7722, + "step": 15694 + }, + { + "epoch": 0.7642490200374942, + "grad_norm": 1.5611580610275269, + "learning_rate": 5.551796773832481e-06, + "loss": 0.7789, + "step": 15695 + }, + { + "epoch": 0.7642977138266014, + "grad_norm": 3.5176427364349365, + "learning_rate": 5.5496158192364155e-06, + "loss": 0.7132, + "step": 15696 + }, + { + "epoch": 0.7643464076157086, + "grad_norm": 1.4094858169555664, + "learning_rate": 5.547435224094972e-06, + "loss": 0.804, + "step": 15697 + }, + { + "epoch": 0.7643951014048158, + "grad_norm": 1.5251994132995605, + "learning_rate": 5.545254988462389e-06, + "loss": 0.7972, + "step": 15698 + }, + { + "epoch": 0.764443795193923, + "grad_norm": 1.949183464050293, + "learning_rate": 5.54307511239292e-06, + "loss": 0.7653, + "step": 15699 + }, + { + "epoch": 0.7644924889830302, + "grad_norm": 2.689875841140747, + "learning_rate": 5.5408955959407605e-06, + "loss": 0.8279, + "step": 15700 + }, + { + "epoch": 0.7645411827721375, + "grad_norm": 2.626765012741089, + "learning_rate": 5.53871643916015e-06, + "loss": 0.8529, + "step": 15701 + }, + { + "epoch": 0.7645898765612447, + "grad_norm": 1.3167078495025635, + "learning_rate": 5.5365376421052864e-06, + "loss": 0.8675, + "step": 15702 + }, + { + "epoch": 0.7646385703503518, + "grad_norm": 1.7013452053070068, + "learning_rate": 5.5343592048303664e-06, + "loss": 0.8038, + "step": 15703 + }, + { + "epoch": 0.764687264139459, + "grad_norm": 1.3403728008270264, + "learning_rate": 5.532181127389582e-06, + "loss": 0.8245, + "step": 15704 + }, + { + "epoch": 0.7647359579285662, + "grad_norm": 1.8044072389602661, + "learning_rate": 5.53000340983711e-06, + "loss": 0.8318, + "step": 15705 + }, + { + "epoch": 0.7647846517176734, + "grad_norm": 1.9973368644714355, + "learning_rate": 5.527826052227121e-06, + "loss": 0.7093, + "step": 15706 + }, + { + "epoch": 0.7648333455067806, + "grad_norm": 1.7119208574295044, + "learning_rate": 5.525649054613782e-06, + "loss": 0.7474, + "step": 15707 + }, + { + "epoch": 0.7648820392958878, + "grad_norm": 2.560875415802002, + "learning_rate": 5.523472417051234e-06, + "loss": 0.7506, + "step": 15708 + }, + { + "epoch": 0.764930733084995, + "grad_norm": 1.3914523124694824, + "learning_rate": 5.521296139593635e-06, + "loss": 0.7947, + "step": 15709 + }, + { + "epoch": 0.7649794268741023, + "grad_norm": 1.4664336442947388, + "learning_rate": 5.519120222295116e-06, + "loss": 0.7842, + "step": 15710 + }, + { + "epoch": 0.7650281206632094, + "grad_norm": 1.600180745124817, + "learning_rate": 5.516944665209803e-06, + "loss": 0.8146, + "step": 15711 + }, + { + "epoch": 0.7650768144523166, + "grad_norm": 1.505768060684204, + "learning_rate": 5.514769468391809e-06, + "loss": 0.8792, + "step": 15712 + }, + { + "epoch": 0.7651255082414238, + "grad_norm": 1.3671787977218628, + "learning_rate": 5.512594631895247e-06, + "loss": 0.7982, + "step": 15713 + }, + { + "epoch": 0.765174202030531, + "grad_norm": 2.8979289531707764, + "learning_rate": 5.5104201557742145e-06, + "loss": 0.8975, + "step": 15714 + }, + { + "epoch": 0.7652228958196382, + "grad_norm": 3.0310683250427246, + "learning_rate": 5.508246040082794e-06, + "loss": 0.8813, + "step": 15715 + }, + { + "epoch": 0.7652715896087454, + "grad_norm": 1.5283459424972534, + "learning_rate": 5.506072284875088e-06, + "loss": 0.8028, + "step": 15716 + }, + { + "epoch": 0.7653202833978526, + "grad_norm": 1.346467137336731, + "learning_rate": 5.503898890205144e-06, + "loss": 0.8606, + "step": 15717 + }, + { + "epoch": 0.7653689771869598, + "grad_norm": 1.4278522729873657, + "learning_rate": 5.501725856127042e-06, + "loss": 0.887, + "step": 15718 + }, + { + "epoch": 0.765417670976067, + "grad_norm": 1.7894837856292725, + "learning_rate": 5.49955318269483e-06, + "loss": 0.8804, + "step": 15719 + }, + { + "epoch": 0.7654663647651742, + "grad_norm": 1.2464076280593872, + "learning_rate": 5.497380869962556e-06, + "loss": 0.7363, + "step": 15720 + }, + { + "epoch": 0.7655150585542814, + "grad_norm": 1.6096246242523193, + "learning_rate": 5.495208917984256e-06, + "loss": 0.861, + "step": 15721 + }, + { + "epoch": 0.7655637523433886, + "grad_norm": 1.5076807737350464, + "learning_rate": 5.4930373268139545e-06, + "loss": 0.751, + "step": 15722 + }, + { + "epoch": 0.7656124461324958, + "grad_norm": 1.2114875316619873, + "learning_rate": 5.490866096505674e-06, + "loss": 0.8312, + "step": 15723 + }, + { + "epoch": 0.765661139921603, + "grad_norm": 1.3424108028411865, + "learning_rate": 5.488695227113416e-06, + "loss": 0.7459, + "step": 15724 + }, + { + "epoch": 0.7657098337107102, + "grad_norm": 1.872969388961792, + "learning_rate": 5.486524718691199e-06, + "loss": 0.839, + "step": 15725 + }, + { + "epoch": 0.7657585274998174, + "grad_norm": 2.227677345275879, + "learning_rate": 5.484354571292991e-06, + "loss": 0.8641, + "step": 15726 + }, + { + "epoch": 0.7658072212889246, + "grad_norm": 1.1831910610198975, + "learning_rate": 5.4821847849727924e-06, + "loss": 0.8528, + "step": 15727 + }, + { + "epoch": 0.7658559150780317, + "grad_norm": 1.685982346534729, + "learning_rate": 5.4800153597845714e-06, + "loss": 0.8179, + "step": 15728 + }, + { + "epoch": 0.765904608867139, + "grad_norm": 1.26901113986969, + "learning_rate": 5.477846295782292e-06, + "loss": 0.7722, + "step": 15729 + }, + { + "epoch": 0.7659533026562462, + "grad_norm": 1.686023473739624, + "learning_rate": 5.47567759301991e-06, + "loss": 0.7604, + "step": 15730 + }, + { + "epoch": 0.7660019964453534, + "grad_norm": 1.679702877998352, + "learning_rate": 5.473509251551365e-06, + "loss": 0.751, + "step": 15731 + }, + { + "epoch": 0.7660506902344606, + "grad_norm": 0.08987187594175339, + "learning_rate": 5.471341271430617e-06, + "loss": 0.5765, + "step": 15732 + }, + { + "epoch": 0.7660993840235678, + "grad_norm": 1.5355844497680664, + "learning_rate": 5.469173652711563e-06, + "loss": 0.7929, + "step": 15733 + }, + { + "epoch": 0.766148077812675, + "grad_norm": 1.6658267974853516, + "learning_rate": 5.467006395448156e-06, + "loss": 0.8566, + "step": 15734 + }, + { + "epoch": 0.7661967716017822, + "grad_norm": 1.3802990913391113, + "learning_rate": 5.464839499694275e-06, + "loss": 0.8763, + "step": 15735 + }, + { + "epoch": 0.7662454653908894, + "grad_norm": 1.5981812477111816, + "learning_rate": 5.4626729655038435e-06, + "loss": 0.8265, + "step": 15736 + }, + { + "epoch": 0.7662941591799965, + "grad_norm": 2.2315399646759033, + "learning_rate": 5.460506792930747e-06, + "loss": 0.8666, + "step": 15737 + }, + { + "epoch": 0.7663428529691038, + "grad_norm": 1.4794203042984009, + "learning_rate": 5.458340982028865e-06, + "loss": 0.8381, + "step": 15738 + }, + { + "epoch": 0.766391546758211, + "grad_norm": 1.5131868124008179, + "learning_rate": 5.456175532852086e-06, + "loss": 0.8135, + "step": 15739 + }, + { + "epoch": 0.7664402405473182, + "grad_norm": 1.5869101285934448, + "learning_rate": 5.4540104454542564e-06, + "loss": 0.7472, + "step": 15740 + }, + { + "epoch": 0.7664889343364254, + "grad_norm": 2.038896322250366, + "learning_rate": 5.451845719889257e-06, + "loss": 0.7557, + "step": 15741 + }, + { + "epoch": 0.7665376281255326, + "grad_norm": 1.6163500547409058, + "learning_rate": 5.449681356210907e-06, + "loss": 0.768, + "step": 15742 + }, + { + "epoch": 0.7665863219146398, + "grad_norm": 1.5990453958511353, + "learning_rate": 5.447517354473069e-06, + "loss": 0.8178, + "step": 15743 + }, + { + "epoch": 0.766635015703747, + "grad_norm": 1.5017560720443726, + "learning_rate": 5.44535371472956e-06, + "loss": 0.8288, + "step": 15744 + }, + { + "epoch": 0.7666837094928541, + "grad_norm": 2.144951820373535, + "learning_rate": 5.443190437034207e-06, + "loss": 0.8745, + "step": 15745 + }, + { + "epoch": 0.7667324032819613, + "grad_norm": 1.2187577486038208, + "learning_rate": 5.441027521440818e-06, + "loss": 0.742, + "step": 15746 + }, + { + "epoch": 0.7667810970710686, + "grad_norm": 3.1189279556274414, + "learning_rate": 5.4388649680031905e-06, + "loss": 0.7589, + "step": 15747 + }, + { + "epoch": 0.7668297908601758, + "grad_norm": 1.382947325706482, + "learning_rate": 5.4367027767751375e-06, + "loss": 0.769, + "step": 15748 + }, + { + "epoch": 0.766878484649283, + "grad_norm": 1.6090363264083862, + "learning_rate": 5.434540947810418e-06, + "loss": 0.7787, + "step": 15749 + }, + { + "epoch": 0.7669271784383902, + "grad_norm": 1.338210940361023, + "learning_rate": 5.432379481162828e-06, + "loss": 0.8284, + "step": 15750 + }, + { + "epoch": 0.7669758722274974, + "grad_norm": 1.7174259424209595, + "learning_rate": 5.430218376886125e-06, + "loss": 0.7972, + "step": 15751 + }, + { + "epoch": 0.7670245660166046, + "grad_norm": 0.09739702194929123, + "learning_rate": 5.42805763503407e-06, + "loss": 0.6788, + "step": 15752 + }, + { + "epoch": 0.7670732598057117, + "grad_norm": 1.3900741338729858, + "learning_rate": 5.425897255660409e-06, + "loss": 0.8575, + "step": 15753 + }, + { + "epoch": 0.7671219535948189, + "grad_norm": 1.5976775884628296, + "learning_rate": 5.42373723881888e-06, + "loss": 0.8527, + "step": 15754 + }, + { + "epoch": 0.7671706473839262, + "grad_norm": 1.390813946723938, + "learning_rate": 5.42157758456322e-06, + "loss": 0.7661, + "step": 15755 + }, + { + "epoch": 0.7672193411730334, + "grad_norm": 3.4059531688690186, + "learning_rate": 5.419418292947145e-06, + "loss": 0.7976, + "step": 15756 + }, + { + "epoch": 0.7672680349621406, + "grad_norm": 4.255560874938965, + "learning_rate": 5.41725936402437e-06, + "loss": 0.8066, + "step": 15757 + }, + { + "epoch": 0.7673167287512478, + "grad_norm": 1.607312798500061, + "learning_rate": 5.415100797848592e-06, + "loss": 0.8337, + "step": 15758 + }, + { + "epoch": 0.767365422540355, + "grad_norm": 1.6663131713867188, + "learning_rate": 5.4129425944735156e-06, + "loss": 0.9261, + "step": 15759 + }, + { + "epoch": 0.7674141163294622, + "grad_norm": 1.7098788022994995, + "learning_rate": 5.410784753952825e-06, + "loss": 0.945, + "step": 15760 + }, + { + "epoch": 0.7674628101185694, + "grad_norm": 1.2349718809127808, + "learning_rate": 5.408627276340193e-06, + "loss": 0.7822, + "step": 15761 + }, + { + "epoch": 0.7675115039076765, + "grad_norm": 1.5258665084838867, + "learning_rate": 5.406470161689285e-06, + "loss": 0.7765, + "step": 15762 + }, + { + "epoch": 0.7675601976967837, + "grad_norm": 1.7657670974731445, + "learning_rate": 5.404313410053766e-06, + "loss": 0.8368, + "step": 15763 + }, + { + "epoch": 0.767608891485891, + "grad_norm": 1.4396061897277832, + "learning_rate": 5.40215702148728e-06, + "loss": 0.8273, + "step": 15764 + }, + { + "epoch": 0.7676575852749982, + "grad_norm": 1.5649008750915527, + "learning_rate": 5.400000996043469e-06, + "loss": 0.874, + "step": 15765 + }, + { + "epoch": 0.7677062790641054, + "grad_norm": 1.8463383913040161, + "learning_rate": 5.3978453337759595e-06, + "loss": 0.8861, + "step": 15766 + }, + { + "epoch": 0.7677549728532126, + "grad_norm": 2.488781213760376, + "learning_rate": 5.395690034738383e-06, + "loss": 0.7932, + "step": 15767 + }, + { + "epoch": 0.7678036666423198, + "grad_norm": 1.3595257997512817, + "learning_rate": 5.393535098984351e-06, + "loss": 0.7731, + "step": 15768 + }, + { + "epoch": 0.767852360431427, + "grad_norm": 1.896445393562317, + "learning_rate": 5.3913805265674626e-06, + "loss": 0.6888, + "step": 15769 + }, + { + "epoch": 0.7679010542205341, + "grad_norm": 1.2917896509170532, + "learning_rate": 5.389226317541316e-06, + "loss": 0.8059, + "step": 15770 + }, + { + "epoch": 0.7679497480096413, + "grad_norm": 1.3083503246307373, + "learning_rate": 5.387072471959498e-06, + "loss": 0.8279, + "step": 15771 + }, + { + "epoch": 0.7679984417987485, + "grad_norm": 0.10163335502147675, + "learning_rate": 5.384918989875585e-06, + "loss": 0.586, + "step": 15772 + }, + { + "epoch": 0.7680471355878558, + "grad_norm": 1.4113563299179077, + "learning_rate": 5.382765871343136e-06, + "loss": 0.8384, + "step": 15773 + }, + { + "epoch": 0.768095829376963, + "grad_norm": 1.7569434642791748, + "learning_rate": 5.3806131164157334e-06, + "loss": 0.8073, + "step": 15774 + }, + { + "epoch": 0.7681445231660702, + "grad_norm": 1.4944095611572266, + "learning_rate": 5.378460725146899e-06, + "loss": 0.8122, + "step": 15775 + }, + { + "epoch": 0.7681932169551774, + "grad_norm": 2.262434720993042, + "learning_rate": 5.3763086975902e-06, + "loss": 0.8897, + "step": 15776 + }, + { + "epoch": 0.7682419107442846, + "grad_norm": 1.5994844436645508, + "learning_rate": 5.374157033799143e-06, + "loss": 0.8052, + "step": 15777 + }, + { + "epoch": 0.7682906045333918, + "grad_norm": 1.7903395891189575, + "learning_rate": 5.37200573382727e-06, + "loss": 0.7874, + "step": 15778 + }, + { + "epoch": 0.7683392983224989, + "grad_norm": 2.27245831489563, + "learning_rate": 5.369854797728089e-06, + "loss": 0.829, + "step": 15779 + }, + { + "epoch": 0.7683879921116061, + "grad_norm": 1.0974528789520264, + "learning_rate": 5.367704225555101e-06, + "loss": 0.7442, + "step": 15780 + }, + { + "epoch": 0.7684366859007133, + "grad_norm": 1.3542386293411255, + "learning_rate": 5.365554017361807e-06, + "loss": 0.8372, + "step": 15781 + }, + { + "epoch": 0.7684853796898206, + "grad_norm": 1.9679800271987915, + "learning_rate": 5.363404173201685e-06, + "loss": 0.9063, + "step": 15782 + }, + { + "epoch": 0.7685340734789278, + "grad_norm": 1.5649651288986206, + "learning_rate": 5.361254693128229e-06, + "loss": 0.7956, + "step": 15783 + }, + { + "epoch": 0.768582767268035, + "grad_norm": 1.98796546459198, + "learning_rate": 5.359105577194886e-06, + "loss": 0.8796, + "step": 15784 + }, + { + "epoch": 0.7686314610571422, + "grad_norm": 0.09550116956233978, + "learning_rate": 5.35695682545514e-06, + "loss": 0.6257, + "step": 15785 + }, + { + "epoch": 0.7686801548462494, + "grad_norm": 1.4168044328689575, + "learning_rate": 5.354808437962413e-06, + "loss": 0.7558, + "step": 15786 + }, + { + "epoch": 0.7687288486353565, + "grad_norm": 3.2100021839141846, + "learning_rate": 5.35266041477017e-06, + "loss": 0.7108, + "step": 15787 + }, + { + "epoch": 0.7687775424244637, + "grad_norm": 1.4199614524841309, + "learning_rate": 5.350512755931834e-06, + "loss": 0.8759, + "step": 15788 + }, + { + "epoch": 0.7688262362135709, + "grad_norm": 1.2375166416168213, + "learning_rate": 5.348365461500822e-06, + "loss": 0.7714, + "step": 15789 + }, + { + "epoch": 0.7688749300026781, + "grad_norm": 1.5109041929244995, + "learning_rate": 5.346218531530569e-06, + "loss": 0.7833, + "step": 15790 + }, + { + "epoch": 0.7689236237917854, + "grad_norm": 1.4278994798660278, + "learning_rate": 5.344071966074451e-06, + "loss": 0.7518, + "step": 15791 + }, + { + "epoch": 0.7689723175808926, + "grad_norm": 1.5485436916351318, + "learning_rate": 5.341925765185894e-06, + "loss": 0.8521, + "step": 15792 + }, + { + "epoch": 0.7690210113699998, + "grad_norm": 1.9726436138153076, + "learning_rate": 5.339779928918258e-06, + "loss": 0.699, + "step": 15793 + }, + { + "epoch": 0.769069705159107, + "grad_norm": 1.6568447351455688, + "learning_rate": 5.337634457324938e-06, + "loss": 0.938, + "step": 15794 + }, + { + "epoch": 0.7691183989482142, + "grad_norm": 1.5961850881576538, + "learning_rate": 5.335489350459297e-06, + "loss": 0.8191, + "step": 15795 + }, + { + "epoch": 0.7691670927373213, + "grad_norm": 1.7922953367233276, + "learning_rate": 5.3333446083746955e-06, + "loss": 0.7528, + "step": 15796 + }, + { + "epoch": 0.7692157865264285, + "grad_norm": 1.8415470123291016, + "learning_rate": 5.331200231124485e-06, + "loss": 0.8089, + "step": 15797 + }, + { + "epoch": 0.7692644803155357, + "grad_norm": 2.3184523582458496, + "learning_rate": 5.329056218762001e-06, + "loss": 0.8537, + "step": 15798 + }, + { + "epoch": 0.769313174104643, + "grad_norm": 1.3093624114990234, + "learning_rate": 5.3269125713405925e-06, + "loss": 0.807, + "step": 15799 + }, + { + "epoch": 0.7693618678937502, + "grad_norm": 0.09612581133842468, + "learning_rate": 5.324769288913558e-06, + "loss": 0.6466, + "step": 15800 + }, + { + "epoch": 0.7694105616828574, + "grad_norm": 2.023646831512451, + "learning_rate": 5.322626371534234e-06, + "loss": 0.7257, + "step": 15801 + }, + { + "epoch": 0.7694592554719646, + "grad_norm": 1.4607819318771362, + "learning_rate": 5.320483819255915e-06, + "loss": 0.9198, + "step": 15802 + }, + { + "epoch": 0.7695079492610718, + "grad_norm": 2.1601855754852295, + "learning_rate": 5.318341632131901e-06, + "loss": 0.8709, + "step": 15803 + }, + { + "epoch": 0.7695566430501789, + "grad_norm": 1.476304292678833, + "learning_rate": 5.316199810215477e-06, + "loss": 0.8196, + "step": 15804 + }, + { + "epoch": 0.7696053368392861, + "grad_norm": 2.835867404937744, + "learning_rate": 5.314058353559921e-06, + "loss": 0.8705, + "step": 15805 + }, + { + "epoch": 0.7696540306283933, + "grad_norm": 1.5945920944213867, + "learning_rate": 5.311917262218504e-06, + "loss": 0.7977, + "step": 15806 + }, + { + "epoch": 0.7697027244175005, + "grad_norm": 1.5304009914398193, + "learning_rate": 5.309776536244477e-06, + "loss": 0.843, + "step": 15807 + }, + { + "epoch": 0.7697514182066078, + "grad_norm": 1.37041175365448, + "learning_rate": 5.307636175691111e-06, + "loss": 0.713, + "step": 15808 + }, + { + "epoch": 0.769800111995715, + "grad_norm": 1.3559619188308716, + "learning_rate": 5.3054961806116226e-06, + "loss": 0.8392, + "step": 15809 + }, + { + "epoch": 0.7698488057848222, + "grad_norm": 1.307066559791565, + "learning_rate": 5.303356551059262e-06, + "loss": 0.8077, + "step": 15810 + }, + { + "epoch": 0.7698974995739294, + "grad_norm": 1.80239999294281, + "learning_rate": 5.301217287087246e-06, + "loss": 0.863, + "step": 15811 + }, + { + "epoch": 0.7699461933630365, + "grad_norm": 1.5681945085525513, + "learning_rate": 5.299078388748793e-06, + "loss": 0.7666, + "step": 15812 + }, + { + "epoch": 0.7699948871521437, + "grad_norm": 1.9884274005889893, + "learning_rate": 5.296939856097103e-06, + "loss": 0.8787, + "step": 15813 + }, + { + "epoch": 0.7700435809412509, + "grad_norm": 1.7545667886734009, + "learning_rate": 5.294801689185376e-06, + "loss": 0.7794, + "step": 15814 + }, + { + "epoch": 0.7700922747303581, + "grad_norm": 1.7762715816497803, + "learning_rate": 5.292663888066796e-06, + "loss": 0.8617, + "step": 15815 + }, + { + "epoch": 0.7701409685194653, + "grad_norm": 1.3949989080429077, + "learning_rate": 5.2905264527945375e-06, + "loss": 0.7995, + "step": 15816 + }, + { + "epoch": 0.7701896623085726, + "grad_norm": 2.217982053756714, + "learning_rate": 5.288389383421781e-06, + "loss": 0.7149, + "step": 15817 + }, + { + "epoch": 0.7702383560976798, + "grad_norm": 1.964914083480835, + "learning_rate": 5.286252680001682e-06, + "loss": 0.8389, + "step": 15818 + }, + { + "epoch": 0.770287049886787, + "grad_norm": 2.8331525325775146, + "learning_rate": 5.284116342587386e-06, + "loss": 0.8125, + "step": 15819 + }, + { + "epoch": 0.7703357436758942, + "grad_norm": 1.9729454517364502, + "learning_rate": 5.281980371232041e-06, + "loss": 0.7512, + "step": 15820 + }, + { + "epoch": 0.7703844374650013, + "grad_norm": 1.5840564966201782, + "learning_rate": 5.279844765988773e-06, + "loss": 0.7471, + "step": 15821 + }, + { + "epoch": 0.7704331312541085, + "grad_norm": 4.706479549407959, + "learning_rate": 5.277709526910711e-06, + "loss": 0.7869, + "step": 15822 + }, + { + "epoch": 0.7704818250432157, + "grad_norm": 1.872410535812378, + "learning_rate": 5.275574654050963e-06, + "loss": 0.8638, + "step": 15823 + }, + { + "epoch": 0.7705305188323229, + "grad_norm": 2.178443670272827, + "learning_rate": 5.273440147462636e-06, + "loss": 0.7789, + "step": 15824 + }, + { + "epoch": 0.7705792126214301, + "grad_norm": 1.7003940343856812, + "learning_rate": 5.271306007198838e-06, + "loss": 0.7925, + "step": 15825 + }, + { + "epoch": 0.7706279064105374, + "grad_norm": 1.8377203941345215, + "learning_rate": 5.269172233312634e-06, + "loss": 0.8319, + "step": 15826 + }, + { + "epoch": 0.7706766001996446, + "grad_norm": 1.5996347665786743, + "learning_rate": 5.2670388258571205e-06, + "loss": 0.7811, + "step": 15827 + }, + { + "epoch": 0.7707252939887518, + "grad_norm": 1.6175343990325928, + "learning_rate": 5.2649057848853595e-06, + "loss": 0.7941, + "step": 15828 + }, + { + "epoch": 0.7707739877778589, + "grad_norm": 4.3941779136657715, + "learning_rate": 5.26277311045041e-06, + "loss": 0.9084, + "step": 15829 + }, + { + "epoch": 0.7708226815669661, + "grad_norm": 1.4896810054779053, + "learning_rate": 5.260640802605323e-06, + "loss": 0.8557, + "step": 15830 + }, + { + "epoch": 0.7708713753560733, + "grad_norm": 1.4446995258331299, + "learning_rate": 5.258508861403142e-06, + "loss": 0.8817, + "step": 15831 + }, + { + "epoch": 0.7709200691451805, + "grad_norm": 0.10204360634088516, + "learning_rate": 5.256377286896896e-06, + "loss": 0.6076, + "step": 15832 + }, + { + "epoch": 0.7709687629342877, + "grad_norm": 1.7084922790527344, + "learning_rate": 5.2542460791396025e-06, + "loss": 0.8068, + "step": 15833 + }, + { + "epoch": 0.771017456723395, + "grad_norm": 1.612808346748352, + "learning_rate": 5.2521152381842945e-06, + "loss": 0.862, + "step": 15834 + }, + { + "epoch": 0.7710661505125022, + "grad_norm": 1.6527847051620483, + "learning_rate": 5.249984764083953e-06, + "loss": 0.7909, + "step": 15835 + }, + { + "epoch": 0.7711148443016094, + "grad_norm": 1.6979516744613647, + "learning_rate": 5.247854656891591e-06, + "loss": 0.7537, + "step": 15836 + }, + { + "epoch": 0.7711635380907166, + "grad_norm": 1.2304164171218872, + "learning_rate": 5.245724916660188e-06, + "loss": 0.8824, + "step": 15837 + }, + { + "epoch": 0.7712122318798237, + "grad_norm": 1.4393843412399292, + "learning_rate": 5.243595543442725e-06, + "loss": 0.8901, + "step": 15838 + }, + { + "epoch": 0.7712609256689309, + "grad_norm": 2.086289167404175, + "learning_rate": 5.241466537292168e-06, + "loss": 0.8641, + "step": 15839 + }, + { + "epoch": 0.7713096194580381, + "grad_norm": 2.218186855316162, + "learning_rate": 5.2393378982614715e-06, + "loss": 0.8326, + "step": 15840 + }, + { + "epoch": 0.7713583132471453, + "grad_norm": 1.869757056236267, + "learning_rate": 5.237209626403601e-06, + "loss": 0.8267, + "step": 15841 + }, + { + "epoch": 0.7714070070362525, + "grad_norm": 1.4333324432373047, + "learning_rate": 5.235081721771475e-06, + "loss": 0.7495, + "step": 15842 + }, + { + "epoch": 0.7714557008253597, + "grad_norm": 1.2445305585861206, + "learning_rate": 5.232954184418051e-06, + "loss": 0.8121, + "step": 15843 + }, + { + "epoch": 0.771504394614467, + "grad_norm": 1.4331289529800415, + "learning_rate": 5.230827014396225e-06, + "loss": 0.8671, + "step": 15844 + }, + { + "epoch": 0.7715530884035742, + "grad_norm": 1.3779096603393555, + "learning_rate": 5.2287002117589284e-06, + "loss": 0.8777, + "step": 15845 + }, + { + "epoch": 0.7716017821926813, + "grad_norm": 1.9489879608154297, + "learning_rate": 5.226573776559063e-06, + "loss": 0.8327, + "step": 15846 + }, + { + "epoch": 0.7716504759817885, + "grad_norm": 0.09565449506044388, + "learning_rate": 5.224447708849514e-06, + "loss": 0.6237, + "step": 15847 + }, + { + "epoch": 0.7716991697708957, + "grad_norm": 1.398077130317688, + "learning_rate": 5.222322008683187e-06, + "loss": 0.8356, + "step": 15848 + }, + { + "epoch": 0.7717478635600029, + "grad_norm": 2.430121898651123, + "learning_rate": 5.220196676112939e-06, + "loss": 0.7083, + "step": 15849 + }, + { + "epoch": 0.7717965573491101, + "grad_norm": 1.4729933738708496, + "learning_rate": 5.218071711191654e-06, + "loss": 0.84, + "step": 15850 + }, + { + "epoch": 0.7718452511382173, + "grad_norm": 1.2731472253799438, + "learning_rate": 5.215947113972173e-06, + "loss": 0.8396, + "step": 15851 + }, + { + "epoch": 0.7718939449273245, + "grad_norm": 1.2726720571517944, + "learning_rate": 5.213822884507362e-06, + "loss": 0.8585, + "step": 15852 + }, + { + "epoch": 0.7719426387164318, + "grad_norm": 1.5283796787261963, + "learning_rate": 5.211699022850054e-06, + "loss": 0.9026, + "step": 15853 + }, + { + "epoch": 0.7719913325055389, + "grad_norm": 1.6408814191818237, + "learning_rate": 5.2095755290530816e-06, + "loss": 0.8584, + "step": 15854 + }, + { + "epoch": 0.7720400262946461, + "grad_norm": 0.09250401705503464, + "learning_rate": 5.207452403169267e-06, + "loss": 0.6822, + "step": 15855 + }, + { + "epoch": 0.7720887200837533, + "grad_norm": 1.5877339839935303, + "learning_rate": 5.205329645251416e-06, + "loss": 0.802, + "step": 15856 + }, + { + "epoch": 0.7721374138728605, + "grad_norm": 2.754833459854126, + "learning_rate": 5.20320725535235e-06, + "loss": 0.8572, + "step": 15857 + }, + { + "epoch": 0.7721861076619677, + "grad_norm": 1.3905038833618164, + "learning_rate": 5.20108523352484e-06, + "loss": 0.8863, + "step": 15858 + }, + { + "epoch": 0.7722348014510749, + "grad_norm": 1.5343331098556519, + "learning_rate": 5.198963579821694e-06, + "loss": 0.7558, + "step": 15859 + }, + { + "epoch": 0.7722834952401821, + "grad_norm": 1.5520603656768799, + "learning_rate": 5.196842294295677e-06, + "loss": 0.8675, + "step": 15860 + }, + { + "epoch": 0.7723321890292894, + "grad_norm": 0.09783866256475449, + "learning_rate": 5.194721376999554e-06, + "loss": 0.6431, + "step": 15861 + }, + { + "epoch": 0.7723808828183966, + "grad_norm": 1.6857328414916992, + "learning_rate": 5.192600827986089e-06, + "loss": 0.8691, + "step": 15862 + }, + { + "epoch": 0.7724295766075037, + "grad_norm": 2.292560338973999, + "learning_rate": 5.19048064730803e-06, + "loss": 0.8711, + "step": 15863 + }, + { + "epoch": 0.7724782703966109, + "grad_norm": 0.09115327149629593, + "learning_rate": 5.1883608350181135e-06, + "loss": 0.5654, + "step": 15864 + }, + { + "epoch": 0.7725269641857181, + "grad_norm": 1.4666920900344849, + "learning_rate": 5.186241391169073e-06, + "loss": 0.7731, + "step": 15865 + }, + { + "epoch": 0.7725756579748253, + "grad_norm": 2.1044859886169434, + "learning_rate": 5.184122315813629e-06, + "loss": 0.9364, + "step": 15866 + }, + { + "epoch": 0.7726243517639325, + "grad_norm": 1.485639214515686, + "learning_rate": 5.182003609004485e-06, + "loss": 0.7569, + "step": 15867 + }, + { + "epoch": 0.7726730455530397, + "grad_norm": 1.792547583580017, + "learning_rate": 5.1798852707943605e-06, + "loss": 0.8592, + "step": 15868 + }, + { + "epoch": 0.7727217393421469, + "grad_norm": 1.5497195720672607, + "learning_rate": 5.177767301235939e-06, + "loss": 0.697, + "step": 15869 + }, + { + "epoch": 0.7727704331312542, + "grad_norm": 1.8534002304077148, + "learning_rate": 5.175649700381908e-06, + "loss": 0.7832, + "step": 15870 + }, + { + "epoch": 0.7728191269203613, + "grad_norm": 2.649522066116333, + "learning_rate": 5.173532468284943e-06, + "loss": 0.8778, + "step": 15871 + }, + { + "epoch": 0.7728678207094685, + "grad_norm": 1.3247047662734985, + "learning_rate": 5.1714156049977095e-06, + "loss": 0.8368, + "step": 15872 + }, + { + "epoch": 0.7729165144985757, + "grad_norm": 1.276432991027832, + "learning_rate": 5.169299110572863e-06, + "loss": 0.8362, + "step": 15873 + }, + { + "epoch": 0.7729652082876829, + "grad_norm": 1.8291670083999634, + "learning_rate": 5.167182985063055e-06, + "loss": 0.8907, + "step": 15874 + }, + { + "epoch": 0.7730139020767901, + "grad_norm": 2.7030563354492188, + "learning_rate": 5.165067228520917e-06, + "loss": 0.8274, + "step": 15875 + }, + { + "epoch": 0.7730625958658973, + "grad_norm": 1.6289750337600708, + "learning_rate": 5.162951840999091e-06, + "loss": 0.8254, + "step": 15876 + }, + { + "epoch": 0.7731112896550045, + "grad_norm": 1.3124966621398926, + "learning_rate": 5.160836822550188e-06, + "loss": 0.8254, + "step": 15877 + }, + { + "epoch": 0.7731599834441117, + "grad_norm": 1.253017783164978, + "learning_rate": 5.1587221732268245e-06, + "loss": 0.7523, + "step": 15878 + }, + { + "epoch": 0.773208677233219, + "grad_norm": 4.843581676483154, + "learning_rate": 5.1566078930816e-06, + "loss": 0.855, + "step": 15879 + }, + { + "epoch": 0.7732573710223261, + "grad_norm": 1.6915990114212036, + "learning_rate": 5.154493982167106e-06, + "loss": 0.7666, + "step": 15880 + }, + { + "epoch": 0.7733060648114333, + "grad_norm": 2.0095138549804688, + "learning_rate": 5.152380440535929e-06, + "loss": 0.719, + "step": 15881 + }, + { + "epoch": 0.7733547586005405, + "grad_norm": 1.4014618396759033, + "learning_rate": 5.150267268240636e-06, + "loss": 0.7969, + "step": 15882 + }, + { + "epoch": 0.7734034523896477, + "grad_norm": 1.6190403699874878, + "learning_rate": 5.14815446533381e-06, + "loss": 0.8543, + "step": 15883 + }, + { + "epoch": 0.7734521461787549, + "grad_norm": 1.5638530254364014, + "learning_rate": 5.1460420318679835e-06, + "loss": 0.8668, + "step": 15884 + }, + { + "epoch": 0.7735008399678621, + "grad_norm": 1.3070118427276611, + "learning_rate": 5.143929967895729e-06, + "loss": 0.7335, + "step": 15885 + }, + { + "epoch": 0.7735495337569693, + "grad_norm": 1.2090880870819092, + "learning_rate": 5.141818273469559e-06, + "loss": 0.822, + "step": 15886 + }, + { + "epoch": 0.7735982275460765, + "grad_norm": 8.07219409942627, + "learning_rate": 5.13970694864202e-06, + "loss": 0.7725, + "step": 15887 + }, + { + "epoch": 0.7736469213351836, + "grad_norm": 1.731504201889038, + "learning_rate": 5.137595993465625e-06, + "loss": 0.8118, + "step": 15888 + }, + { + "epoch": 0.7736956151242909, + "grad_norm": 1.3923444747924805, + "learning_rate": 5.135485407992886e-06, + "loss": 0.7675, + "step": 15889 + }, + { + "epoch": 0.7737443089133981, + "grad_norm": 1.9272658824920654, + "learning_rate": 5.133375192276302e-06, + "loss": 0.7477, + "step": 15890 + }, + { + "epoch": 0.7737930027025053, + "grad_norm": 1.5632731914520264, + "learning_rate": 5.131265346368359e-06, + "loss": 0.8219, + "step": 15891 + }, + { + "epoch": 0.7738416964916125, + "grad_norm": 1.4721490144729614, + "learning_rate": 5.1291558703215584e-06, + "loss": 0.8686, + "step": 15892 + }, + { + "epoch": 0.7738903902807197, + "grad_norm": 1.7867794036865234, + "learning_rate": 5.1270467641883505e-06, + "loss": 0.8228, + "step": 15893 + }, + { + "epoch": 0.7739390840698269, + "grad_norm": 1.36957585811615, + "learning_rate": 5.1249380280212226e-06, + "loss": 0.731, + "step": 15894 + }, + { + "epoch": 0.7739877778589341, + "grad_norm": 3.514930486679077, + "learning_rate": 5.122829661872603e-06, + "loss": 0.8147, + "step": 15895 + }, + { + "epoch": 0.7740364716480413, + "grad_norm": 1.1554702520370483, + "learning_rate": 5.120721665794961e-06, + "loss": 0.8369, + "step": 15896 + }, + { + "epoch": 0.7740851654371484, + "grad_norm": 1.4657320976257324, + "learning_rate": 5.118614039840721e-06, + "loss": 0.8316, + "step": 15897 + }, + { + "epoch": 0.7741338592262557, + "grad_norm": 1.42184579372406, + "learning_rate": 5.1165067840623095e-06, + "loss": 0.8407, + "step": 15898 + }, + { + "epoch": 0.7741825530153629, + "grad_norm": 1.743866205215454, + "learning_rate": 5.114399898512161e-06, + "loss": 0.7625, + "step": 15899 + }, + { + "epoch": 0.7742312468044701, + "grad_norm": 2.4729294776916504, + "learning_rate": 5.112293383242659e-06, + "loss": 0.7961, + "step": 15900 + }, + { + "epoch": 0.7742799405935773, + "grad_norm": 1.4090975522994995, + "learning_rate": 5.11018723830623e-06, + "loss": 0.7414, + "step": 15901 + }, + { + "epoch": 0.7743286343826845, + "grad_norm": 1.7429183721542358, + "learning_rate": 5.108081463755239e-06, + "loss": 0.776, + "step": 15902 + }, + { + "epoch": 0.7743773281717917, + "grad_norm": 4.203670978546143, + "learning_rate": 5.1059760596420836e-06, + "loss": 0.8079, + "step": 15903 + }, + { + "epoch": 0.7744260219608989, + "grad_norm": 1.615926742553711, + "learning_rate": 5.103871026019134e-06, + "loss": 0.8326, + "step": 15904 + }, + { + "epoch": 0.774474715750006, + "grad_norm": 1.4155107736587524, + "learning_rate": 5.1017663629387494e-06, + "loss": 0.8165, + "step": 15905 + }, + { + "epoch": 0.7745234095391133, + "grad_norm": 1.5283482074737549, + "learning_rate": 5.099662070453284e-06, + "loss": 0.826, + "step": 15906 + }, + { + "epoch": 0.7745721033282205, + "grad_norm": 1.8003478050231934, + "learning_rate": 5.097558148615078e-06, + "loss": 0.7682, + "step": 15907 + }, + { + "epoch": 0.7746207971173277, + "grad_norm": 1.2503139972686768, + "learning_rate": 5.095454597476486e-06, + "loss": 0.8184, + "step": 15908 + }, + { + "epoch": 0.7746694909064349, + "grad_norm": 3.602921485900879, + "learning_rate": 5.093351417089805e-06, + "loss": 0.8456, + "step": 15909 + }, + { + "epoch": 0.7747181846955421, + "grad_norm": 2.052959680557251, + "learning_rate": 5.091248607507373e-06, + "loss": 0.8079, + "step": 15910 + }, + { + "epoch": 0.7747668784846493, + "grad_norm": 1.7282538414001465, + "learning_rate": 5.089146168781494e-06, + "loss": 0.8052, + "step": 15911 + }, + { + "epoch": 0.7748155722737565, + "grad_norm": 1.244063377380371, + "learning_rate": 5.08704410096446e-06, + "loss": 0.7766, + "step": 15912 + }, + { + "epoch": 0.7748642660628636, + "grad_norm": 1.504854440689087, + "learning_rate": 5.084942404108566e-06, + "loss": 0.7622, + "step": 15913 + }, + { + "epoch": 0.7749129598519708, + "grad_norm": 1.2561613321304321, + "learning_rate": 5.082841078266088e-06, + "loss": 0.7887, + "step": 15914 + }, + { + "epoch": 0.774961653641078, + "grad_norm": 1.6020115613937378, + "learning_rate": 5.080740123489298e-06, + "loss": 0.7794, + "step": 15915 + }, + { + "epoch": 0.7750103474301853, + "grad_norm": 1.5477540493011475, + "learning_rate": 5.078639539830454e-06, + "loss": 0.7681, + "step": 15916 + }, + { + "epoch": 0.7750590412192925, + "grad_norm": 1.52933931350708, + "learning_rate": 5.076539327341823e-06, + "loss": 0.8624, + "step": 15917 + }, + { + "epoch": 0.7751077350083997, + "grad_norm": 1.5258028507232666, + "learning_rate": 5.074439486075624e-06, + "loss": 0.7633, + "step": 15918 + }, + { + "epoch": 0.7751564287975069, + "grad_norm": 1.5469985008239746, + "learning_rate": 5.0723400160841095e-06, + "loss": 0.8438, + "step": 15919 + }, + { + "epoch": 0.7752051225866141, + "grad_norm": 1.2546679973602295, + "learning_rate": 5.0702409174194976e-06, + "loss": 0.8227, + "step": 15920 + }, + { + "epoch": 0.7752538163757213, + "grad_norm": 1.4771322011947632, + "learning_rate": 5.068142190134006e-06, + "loss": 0.8372, + "step": 15921 + }, + { + "epoch": 0.7753025101648284, + "grad_norm": 1.9281538724899292, + "learning_rate": 5.066043834279839e-06, + "loss": 0.7666, + "step": 15922 + }, + { + "epoch": 0.7753512039539356, + "grad_norm": 1.8754560947418213, + "learning_rate": 5.063945849909191e-06, + "loss": 0.8343, + "step": 15923 + }, + { + "epoch": 0.7753998977430429, + "grad_norm": 1.344100832939148, + "learning_rate": 5.061848237074254e-06, + "loss": 0.7976, + "step": 15924 + }, + { + "epoch": 0.7754485915321501, + "grad_norm": 1.442928433418274, + "learning_rate": 5.059750995827198e-06, + "loss": 0.838, + "step": 15925 + }, + { + "epoch": 0.7754972853212573, + "grad_norm": 1.6722419261932373, + "learning_rate": 5.057654126220204e-06, + "loss": 0.9357, + "step": 15926 + }, + { + "epoch": 0.7755459791103645, + "grad_norm": 1.5920497179031372, + "learning_rate": 5.055557628305425e-06, + "loss": 0.8489, + "step": 15927 + }, + { + "epoch": 0.7755946728994717, + "grad_norm": 1.9493635892868042, + "learning_rate": 5.053461502135013e-06, + "loss": 0.8413, + "step": 15928 + }, + { + "epoch": 0.7756433666885789, + "grad_norm": 1.2362662553787231, + "learning_rate": 5.051365747761107e-06, + "loss": 0.8011, + "step": 15929 + }, + { + "epoch": 0.775692060477686, + "grad_norm": 1.458831787109375, + "learning_rate": 5.049270365235843e-06, + "loss": 0.766, + "step": 15930 + }, + { + "epoch": 0.7757407542667932, + "grad_norm": 2.4354658126831055, + "learning_rate": 5.0471753546113425e-06, + "loss": 0.7579, + "step": 15931 + }, + { + "epoch": 0.7757894480559004, + "grad_norm": 1.9514570236206055, + "learning_rate": 5.0450807159397185e-06, + "loss": 0.8226, + "step": 15932 + }, + { + "epoch": 0.7758381418450077, + "grad_norm": 2.1040923595428467, + "learning_rate": 5.042986449273068e-06, + "loss": 0.8656, + "step": 15933 + }, + { + "epoch": 0.7758868356341149, + "grad_norm": 1.531112790107727, + "learning_rate": 5.040892554663506e-06, + "loss": 0.8851, + "step": 15934 + }, + { + "epoch": 0.7759355294232221, + "grad_norm": 2.5308659076690674, + "learning_rate": 5.0387990321630935e-06, + "loss": 0.8473, + "step": 15935 + }, + { + "epoch": 0.7759842232123293, + "grad_norm": 1.9800267219543457, + "learning_rate": 5.036705881823924e-06, + "loss": 0.7768, + "step": 15936 + }, + { + "epoch": 0.7760329170014365, + "grad_norm": 1.2334487438201904, + "learning_rate": 5.034613103698061e-06, + "loss": 0.7982, + "step": 15937 + }, + { + "epoch": 0.7760816107905437, + "grad_norm": 3.8525781631469727, + "learning_rate": 5.032520697837562e-06, + "loss": 0.9139, + "step": 15938 + }, + { + "epoch": 0.7761303045796508, + "grad_norm": 0.106715627014637, + "learning_rate": 5.030428664294475e-06, + "loss": 0.6936, + "step": 15939 + }, + { + "epoch": 0.776178998368758, + "grad_norm": 1.2963762283325195, + "learning_rate": 5.02833700312084e-06, + "loss": 0.8733, + "step": 15940 + }, + { + "epoch": 0.7762276921578652, + "grad_norm": 2.0245392322540283, + "learning_rate": 5.026245714368687e-06, + "loss": 0.739, + "step": 15941 + }, + { + "epoch": 0.7762763859469725, + "grad_norm": 1.4730467796325684, + "learning_rate": 5.024154798090031e-06, + "loss": 0.7638, + "step": 15942 + }, + { + "epoch": 0.7763250797360797, + "grad_norm": 1.8925962448120117, + "learning_rate": 5.0220642543369005e-06, + "loss": 0.8726, + "step": 15943 + }, + { + "epoch": 0.7763737735251869, + "grad_norm": 2.0486533641815186, + "learning_rate": 5.019974083161276e-06, + "loss": 0.8504, + "step": 15944 + }, + { + "epoch": 0.7764224673142941, + "grad_norm": 1.817335844039917, + "learning_rate": 5.017884284615169e-06, + "loss": 0.8303, + "step": 15945 + }, + { + "epoch": 0.7764711611034013, + "grad_norm": 1.7899789810180664, + "learning_rate": 5.015794858750556e-06, + "loss": 0.8178, + "step": 15946 + }, + { + "epoch": 0.7765198548925084, + "grad_norm": 1.7695093154907227, + "learning_rate": 5.013705805619413e-06, + "loss": 0.8349, + "step": 15947 + }, + { + "epoch": 0.7765685486816156, + "grad_norm": 2.5167293548583984, + "learning_rate": 5.011617125273702e-06, + "loss": 0.8321, + "step": 15948 + }, + { + "epoch": 0.7766172424707228, + "grad_norm": 1.5243171453475952, + "learning_rate": 5.009528817765379e-06, + "loss": 0.8487, + "step": 15949 + }, + { + "epoch": 0.77666593625983, + "grad_norm": 2.109290838241577, + "learning_rate": 5.007440883146404e-06, + "loss": 0.8117, + "step": 15950 + }, + { + "epoch": 0.7767146300489373, + "grad_norm": 1.6893948316574097, + "learning_rate": 5.005353321468691e-06, + "loss": 0.8489, + "step": 15951 + }, + { + "epoch": 0.7767633238380445, + "grad_norm": 1.3394232988357544, + "learning_rate": 5.003266132784193e-06, + "loss": 0.8024, + "step": 15952 + }, + { + "epoch": 0.7768120176271517, + "grad_norm": 1.3595234155654907, + "learning_rate": 5.001179317144806e-06, + "loss": 0.7834, + "step": 15953 + }, + { + "epoch": 0.7768607114162589, + "grad_norm": 1.4161574840545654, + "learning_rate": 4.9990928746024584e-06, + "loss": 0.8005, + "step": 15954 + }, + { + "epoch": 0.7769094052053661, + "grad_norm": 1.6518737077713013, + "learning_rate": 4.997006805209042e-06, + "loss": 0.783, + "step": 15955 + }, + { + "epoch": 0.7769580989944732, + "grad_norm": 1.6418921947479248, + "learning_rate": 4.994921109016441e-06, + "loss": 0.9207, + "step": 15956 + }, + { + "epoch": 0.7770067927835804, + "grad_norm": 1.9388118982315063, + "learning_rate": 4.992835786076562e-06, + "loss": 0.8413, + "step": 15957 + }, + { + "epoch": 0.7770554865726876, + "grad_norm": 2.581500291824341, + "learning_rate": 4.990750836441247e-06, + "loss": 0.896, + "step": 15958 + }, + { + "epoch": 0.7771041803617948, + "grad_norm": 1.391671061515808, + "learning_rate": 4.988666260162385e-06, + "loss": 0.884, + "step": 15959 + }, + { + "epoch": 0.7771528741509021, + "grad_norm": 1.6627154350280762, + "learning_rate": 4.9865820572918065e-06, + "loss": 0.7691, + "step": 15960 + }, + { + "epoch": 0.7772015679400093, + "grad_norm": 1.9268064498901367, + "learning_rate": 4.984498227881376e-06, + "loss": 0.7966, + "step": 15961 + }, + { + "epoch": 0.7772502617291165, + "grad_norm": 1.66830313205719, + "learning_rate": 4.982414771982919e-06, + "loss": 0.7816, + "step": 15962 + }, + { + "epoch": 0.7772989555182237, + "grad_norm": 1.8820466995239258, + "learning_rate": 4.980331689648264e-06, + "loss": 0.834, + "step": 15963 + }, + { + "epoch": 0.7773476493073308, + "grad_norm": 1.5029940605163574, + "learning_rate": 4.978248980929228e-06, + "loss": 0.87, + "step": 15964 + }, + { + "epoch": 0.777396343096438, + "grad_norm": 1.9538853168487549, + "learning_rate": 4.976166645877613e-06, + "loss": 0.786, + "step": 15965 + }, + { + "epoch": 0.7774450368855452, + "grad_norm": 1.4490987062454224, + "learning_rate": 4.974084684545233e-06, + "loss": 0.8414, + "step": 15966 + }, + { + "epoch": 0.7774937306746524, + "grad_norm": 1.4737071990966797, + "learning_rate": 4.972003096983855e-06, + "loss": 0.8416, + "step": 15967 + }, + { + "epoch": 0.7775424244637597, + "grad_norm": 2.380703926086426, + "learning_rate": 4.9699218832452765e-06, + "loss": 0.7951, + "step": 15968 + }, + { + "epoch": 0.7775911182528669, + "grad_norm": 1.9436954259872437, + "learning_rate": 4.967841043381263e-06, + "loss": 0.7413, + "step": 15969 + }, + { + "epoch": 0.7776398120419741, + "grad_norm": 1.4758580923080444, + "learning_rate": 4.965760577443572e-06, + "loss": 0.7984, + "step": 15970 + }, + { + "epoch": 0.7776885058310813, + "grad_norm": 1.42436683177948, + "learning_rate": 4.963680485483957e-06, + "loss": 0.8939, + "step": 15971 + }, + { + "epoch": 0.7777371996201884, + "grad_norm": 1.7347553968429565, + "learning_rate": 4.961600767554162e-06, + "loss": 0.8187, + "step": 15972 + }, + { + "epoch": 0.7777858934092956, + "grad_norm": 1.24210786819458, + "learning_rate": 4.95952142370592e-06, + "loss": 0.8082, + "step": 15973 + }, + { + "epoch": 0.7778345871984028, + "grad_norm": 1.511508822441101, + "learning_rate": 4.957442453990953e-06, + "loss": 0.7816, + "step": 15974 + }, + { + "epoch": 0.77788328098751, + "grad_norm": 1.9773814678192139, + "learning_rate": 4.955363858460977e-06, + "loss": 0.86, + "step": 15975 + }, + { + "epoch": 0.7779319747766172, + "grad_norm": 2.13081693649292, + "learning_rate": 4.9532856371676905e-06, + "loss": 0.7987, + "step": 15976 + }, + { + "epoch": 0.7779806685657245, + "grad_norm": 1.4022620916366577, + "learning_rate": 4.9512077901627995e-06, + "loss": 0.7495, + "step": 15977 + }, + { + "epoch": 0.7780293623548317, + "grad_norm": 1.4209556579589844, + "learning_rate": 4.94913031749799e-06, + "loss": 0.8687, + "step": 15978 + }, + { + "epoch": 0.7780780561439389, + "grad_norm": 1.3987613916397095, + "learning_rate": 4.947053219224936e-06, + "loss": 0.8613, + "step": 15979 + }, + { + "epoch": 0.7781267499330461, + "grad_norm": 1.7900314331054688, + "learning_rate": 4.9449764953953036e-06, + "loss": 0.8068, + "step": 15980 + }, + { + "epoch": 0.7781754437221532, + "grad_norm": 1.3593651056289673, + "learning_rate": 4.942900146060754e-06, + "loss": 0.7854, + "step": 15981 + }, + { + "epoch": 0.7782241375112604, + "grad_norm": 4.817457675933838, + "learning_rate": 4.940824171272935e-06, + "loss": 0.8557, + "step": 15982 + }, + { + "epoch": 0.7782728313003676, + "grad_norm": 2.041865587234497, + "learning_rate": 4.938748571083491e-06, + "loss": 0.7052, + "step": 15983 + }, + { + "epoch": 0.7783215250894748, + "grad_norm": 1.1135073900222778, + "learning_rate": 4.936673345544041e-06, + "loss": 0.8616, + "step": 15984 + }, + { + "epoch": 0.778370218878582, + "grad_norm": 1.4744256734848022, + "learning_rate": 4.934598494706222e-06, + "loss": 0.8434, + "step": 15985 + }, + { + "epoch": 0.7784189126676893, + "grad_norm": 1.8531866073608398, + "learning_rate": 4.932524018621639e-06, + "loss": 0.8241, + "step": 15986 + }, + { + "epoch": 0.7784676064567965, + "grad_norm": 2.0274062156677246, + "learning_rate": 4.9304499173418955e-06, + "loss": 0.8637, + "step": 15987 + }, + { + "epoch": 0.7785163002459037, + "grad_norm": 2.060885190963745, + "learning_rate": 4.928376190918582e-06, + "loss": 0.8138, + "step": 15988 + }, + { + "epoch": 0.7785649940350108, + "grad_norm": 2.3304266929626465, + "learning_rate": 4.926302839403287e-06, + "loss": 0.7895, + "step": 15989 + }, + { + "epoch": 0.778613687824118, + "grad_norm": 2.1409924030303955, + "learning_rate": 4.924229862847583e-06, + "loss": 0.789, + "step": 15990 + }, + { + "epoch": 0.7786623816132252, + "grad_norm": 1.864681601524353, + "learning_rate": 4.9221572613030285e-06, + "loss": 0.7388, + "step": 15991 + }, + { + "epoch": 0.7787110754023324, + "grad_norm": 1.8542536497116089, + "learning_rate": 4.9200850348212e-06, + "loss": 0.7652, + "step": 15992 + }, + { + "epoch": 0.7787597691914396, + "grad_norm": 1.4843090772628784, + "learning_rate": 4.918013183453619e-06, + "loss": 0.8646, + "step": 15993 + }, + { + "epoch": 0.7788084629805468, + "grad_norm": 1.5753891468048096, + "learning_rate": 4.915941707251848e-06, + "loss": 0.8273, + "step": 15994 + }, + { + "epoch": 0.7788571567696541, + "grad_norm": 1.2460581064224243, + "learning_rate": 4.913870606267388e-06, + "loss": 0.8377, + "step": 15995 + }, + { + "epoch": 0.7789058505587613, + "grad_norm": 2.445312023162842, + "learning_rate": 4.91179988055178e-06, + "loss": 0.8568, + "step": 15996 + }, + { + "epoch": 0.7789545443478685, + "grad_norm": 1.5585761070251465, + "learning_rate": 4.9097295301565266e-06, + "loss": 0.8673, + "step": 15997 + }, + { + "epoch": 0.7790032381369756, + "grad_norm": 1.5026905536651611, + "learning_rate": 4.907659555133124e-06, + "loss": 0.7878, + "step": 15998 + }, + { + "epoch": 0.7790519319260828, + "grad_norm": 2.2434659004211426, + "learning_rate": 4.905589955533068e-06, + "loss": 0.8116, + "step": 15999 + }, + { + "epoch": 0.77910062571519, + "grad_norm": 1.5028126239776611, + "learning_rate": 4.903520731407832e-06, + "loss": 0.7833, + "step": 16000 + }, + { + "epoch": 0.7791493195042972, + "grad_norm": 1.2853118181228638, + "learning_rate": 4.901451882808906e-06, + "loss": 0.7376, + "step": 16001 + }, + { + "epoch": 0.7791980132934044, + "grad_norm": 1.4701428413391113, + "learning_rate": 4.899383409787728e-06, + "loss": 0.7829, + "step": 16002 + }, + { + "epoch": 0.7792467070825116, + "grad_norm": 1.376726508140564, + "learning_rate": 4.897315312395778e-06, + "loss": 0.7575, + "step": 16003 + }, + { + "epoch": 0.7792954008716189, + "grad_norm": 3.3788037300109863, + "learning_rate": 4.895247590684476e-06, + "loss": 0.8224, + "step": 16004 + }, + { + "epoch": 0.7793440946607261, + "grad_norm": 1.4115978479385376, + "learning_rate": 4.893180244705271e-06, + "loss": 0.8746, + "step": 16005 + }, + { + "epoch": 0.7793927884498332, + "grad_norm": 1.270323634147644, + "learning_rate": 4.8911132745095845e-06, + "loss": 0.8264, + "step": 16006 + }, + { + "epoch": 0.7794414822389404, + "grad_norm": 1.4575639963150024, + "learning_rate": 4.889046680148828e-06, + "loss": 0.7477, + "step": 16007 + }, + { + "epoch": 0.7794901760280476, + "grad_norm": 1.9915772676467896, + "learning_rate": 4.886980461674425e-06, + "loss": 0.8477, + "step": 16008 + }, + { + "epoch": 0.7795388698171548, + "grad_norm": 1.4652810096740723, + "learning_rate": 4.884914619137748e-06, + "loss": 0.8275, + "step": 16009 + }, + { + "epoch": 0.779587563606262, + "grad_norm": 1.8093987703323364, + "learning_rate": 4.882849152590212e-06, + "loss": 0.8177, + "step": 16010 + }, + { + "epoch": 0.7796362573953692, + "grad_norm": 1.2166916131973267, + "learning_rate": 4.880784062083168e-06, + "loss": 0.7898, + "step": 16011 + }, + { + "epoch": 0.7796849511844764, + "grad_norm": 2.0441434383392334, + "learning_rate": 4.878719347668006e-06, + "loss": 0.6846, + "step": 16012 + }, + { + "epoch": 0.7797336449735837, + "grad_norm": 0.09590110182762146, + "learning_rate": 4.8766550093960785e-06, + "loss": 0.6438, + "step": 16013 + }, + { + "epoch": 0.7797823387626909, + "grad_norm": 2.1406030654907227, + "learning_rate": 4.874591047318731e-06, + "loss": 0.7878, + "step": 16014 + }, + { + "epoch": 0.779831032551798, + "grad_norm": 2.2889506816864014, + "learning_rate": 4.87252746148732e-06, + "loss": 0.8756, + "step": 16015 + }, + { + "epoch": 0.7798797263409052, + "grad_norm": 1.1871652603149414, + "learning_rate": 4.870464251953158e-06, + "loss": 0.8193, + "step": 16016 + }, + { + "epoch": 0.7799284201300124, + "grad_norm": 1.640724778175354, + "learning_rate": 4.868401418767588e-06, + "loss": 0.7405, + "step": 16017 + }, + { + "epoch": 0.7799771139191196, + "grad_norm": 1.6823514699935913, + "learning_rate": 4.8663389619819e-06, + "loss": 0.8486, + "step": 16018 + }, + { + "epoch": 0.7800258077082268, + "grad_norm": 1.728858470916748, + "learning_rate": 4.864276881647418e-06, + "loss": 0.7893, + "step": 16019 + }, + { + "epoch": 0.780074501497334, + "grad_norm": 2.7409279346466064, + "learning_rate": 4.862215177815428e-06, + "loss": 0.7894, + "step": 16020 + }, + { + "epoch": 0.7801231952864413, + "grad_norm": 3.336388349533081, + "learning_rate": 4.860153850537215e-06, + "loss": 0.8675, + "step": 16021 + }, + { + "epoch": 0.7801718890755485, + "grad_norm": 1.5523916482925415, + "learning_rate": 4.858092899864056e-06, + "loss": 0.7236, + "step": 16022 + }, + { + "epoch": 0.7802205828646556, + "grad_norm": 1.4641609191894531, + "learning_rate": 4.856032325847215e-06, + "loss": 0.7509, + "step": 16023 + }, + { + "epoch": 0.7802692766537628, + "grad_norm": 1.6091985702514648, + "learning_rate": 4.853972128537954e-06, + "loss": 0.8807, + "step": 16024 + }, + { + "epoch": 0.78031797044287, + "grad_norm": 2.1262643337249756, + "learning_rate": 4.851912307987507e-06, + "loss": 0.7453, + "step": 16025 + }, + { + "epoch": 0.7803666642319772, + "grad_norm": 1.4239054918289185, + "learning_rate": 4.849852864247138e-06, + "loss": 0.776, + "step": 16026 + }, + { + "epoch": 0.7804153580210844, + "grad_norm": 0.11632698774337769, + "learning_rate": 4.8477937973680475e-06, + "loss": 0.6005, + "step": 16027 + }, + { + "epoch": 0.7804640518101916, + "grad_norm": 1.3538905382156372, + "learning_rate": 4.845735107401473e-06, + "loss": 0.8429, + "step": 16028 + }, + { + "epoch": 0.7805127455992988, + "grad_norm": 1.663636326789856, + "learning_rate": 4.84367679439862e-06, + "loss": 0.7025, + "step": 16029 + }, + { + "epoch": 0.780561439388406, + "grad_norm": 1.8950839042663574, + "learning_rate": 4.841618858410688e-06, + "loss": 0.7911, + "step": 16030 + }, + { + "epoch": 0.7806101331775132, + "grad_norm": 1.5563799142837524, + "learning_rate": 4.839561299488871e-06, + "loss": 0.804, + "step": 16031 + }, + { + "epoch": 0.7806588269666204, + "grad_norm": 1.4754198789596558, + "learning_rate": 4.837504117684346e-06, + "loss": 0.8724, + "step": 16032 + }, + { + "epoch": 0.7807075207557276, + "grad_norm": 1.9622105360031128, + "learning_rate": 4.83544731304829e-06, + "loss": 0.8636, + "step": 16033 + }, + { + "epoch": 0.7807562145448348, + "grad_norm": 2.7351696491241455, + "learning_rate": 4.833390885631859e-06, + "loss": 0.8456, + "step": 16034 + }, + { + "epoch": 0.780804908333942, + "grad_norm": 2.1713125705718994, + "learning_rate": 4.831334835486219e-06, + "loss": 0.7716, + "step": 16035 + }, + { + "epoch": 0.7808536021230492, + "grad_norm": 1.501898169517517, + "learning_rate": 4.829279162662506e-06, + "loss": 0.7192, + "step": 16036 + }, + { + "epoch": 0.7809022959121564, + "grad_norm": 1.3655221462249756, + "learning_rate": 4.827223867211859e-06, + "loss": 0.8181, + "step": 16037 + }, + { + "epoch": 0.7809509897012636, + "grad_norm": 1.0970816612243652, + "learning_rate": 4.825168949185402e-06, + "loss": 0.7928, + "step": 16038 + }, + { + "epoch": 0.7809996834903709, + "grad_norm": 1.9723601341247559, + "learning_rate": 4.823114408634248e-06, + "loss": 0.7959, + "step": 16039 + }, + { + "epoch": 0.781048377279478, + "grad_norm": 1.9278870820999146, + "learning_rate": 4.8210602456095105e-06, + "loss": 0.7455, + "step": 16040 + }, + { + "epoch": 0.7810970710685852, + "grad_norm": 1.369547963142395, + "learning_rate": 4.8190064601622814e-06, + "loss": 0.7569, + "step": 16041 + }, + { + "epoch": 0.7811457648576924, + "grad_norm": 2.2260451316833496, + "learning_rate": 4.816953052343645e-06, + "loss": 0.8476, + "step": 16042 + }, + { + "epoch": 0.7811944586467996, + "grad_norm": 1.551364541053772, + "learning_rate": 4.814900022204696e-06, + "loss": 0.8046, + "step": 16043 + }, + { + "epoch": 0.7812431524359068, + "grad_norm": 2.0627548694610596, + "learning_rate": 4.812847369796485e-06, + "loss": 0.8621, + "step": 16044 + }, + { + "epoch": 0.781291846225014, + "grad_norm": 2.2083818912506104, + "learning_rate": 4.8107950951700845e-06, + "loss": 0.8065, + "step": 16045 + }, + { + "epoch": 0.7813405400141212, + "grad_norm": 1.7703498601913452, + "learning_rate": 4.80874319837654e-06, + "loss": 0.861, + "step": 16046 + }, + { + "epoch": 0.7813892338032284, + "grad_norm": 2.044304370880127, + "learning_rate": 4.806691679466895e-06, + "loss": 0.788, + "step": 16047 + }, + { + "epoch": 0.7814379275923355, + "grad_norm": 1.1744509935379028, + "learning_rate": 4.80464053849218e-06, + "loss": 0.721, + "step": 16048 + }, + { + "epoch": 0.7814866213814428, + "grad_norm": 2.1055707931518555, + "learning_rate": 4.80258977550341e-06, + "loss": 0.7726, + "step": 16049 + }, + { + "epoch": 0.78153531517055, + "grad_norm": 3.234445095062256, + "learning_rate": 4.8005393905516175e-06, + "loss": 0.7907, + "step": 16050 + }, + { + "epoch": 0.7815840089596572, + "grad_norm": 2.065385341644287, + "learning_rate": 4.7984893836877815e-06, + "loss": 0.8087, + "step": 16051 + }, + { + "epoch": 0.7816327027487644, + "grad_norm": 1.7459352016448975, + "learning_rate": 4.7964397549629184e-06, + "loss": 0.8923, + "step": 16052 + }, + { + "epoch": 0.7816813965378716, + "grad_norm": 1.4948114156723022, + "learning_rate": 4.794390504427993e-06, + "loss": 0.6741, + "step": 16053 + }, + { + "epoch": 0.7817300903269788, + "grad_norm": 1.395918846130371, + "learning_rate": 4.792341632133995e-06, + "loss": 0.7994, + "step": 16054 + }, + { + "epoch": 0.781778784116086, + "grad_norm": 1.3379855155944824, + "learning_rate": 4.790293138131887e-06, + "loss": 0.8701, + "step": 16055 + }, + { + "epoch": 0.7818274779051932, + "grad_norm": 1.397669792175293, + "learning_rate": 4.788245022472622e-06, + "loss": 0.7895, + "step": 16056 + }, + { + "epoch": 0.7818761716943003, + "grad_norm": 2.033959150314331, + "learning_rate": 4.786197285207151e-06, + "loss": 0.7769, + "step": 16057 + }, + { + "epoch": 0.7819248654834076, + "grad_norm": 1.3092236518859863, + "learning_rate": 4.784149926386401e-06, + "loss": 0.8635, + "step": 16058 + }, + { + "epoch": 0.7819735592725148, + "grad_norm": 2.1178019046783447, + "learning_rate": 4.7821029460613225e-06, + "loss": 0.8769, + "step": 16059 + }, + { + "epoch": 0.782022253061622, + "grad_norm": 1.4630439281463623, + "learning_rate": 4.7800563442828084e-06, + "loss": 0.7617, + "step": 16060 + }, + { + "epoch": 0.7820709468507292, + "grad_norm": 1.5034137964248657, + "learning_rate": 4.778010121101793e-06, + "loss": 0.8277, + "step": 16061 + }, + { + "epoch": 0.7821196406398364, + "grad_norm": 1.366729974746704, + "learning_rate": 4.775964276569152e-06, + "loss": 0.8088, + "step": 16062 + }, + { + "epoch": 0.7821683344289436, + "grad_norm": 1.4557231664657593, + "learning_rate": 4.773918810735792e-06, + "loss": 0.7982, + "step": 16063 + }, + { + "epoch": 0.7822170282180508, + "grad_norm": 1.5292384624481201, + "learning_rate": 4.771873723652591e-06, + "loss": 0.7911, + "step": 16064 + }, + { + "epoch": 0.7822657220071579, + "grad_norm": 1.2091630697250366, + "learning_rate": 4.769829015370413e-06, + "loss": 0.8273, + "step": 16065 + }, + { + "epoch": 0.7823144157962652, + "grad_norm": 1.5842961072921753, + "learning_rate": 4.76778468594014e-06, + "loss": 0.7677, + "step": 16066 + }, + { + "epoch": 0.7823631095853724, + "grad_norm": 1.438258171081543, + "learning_rate": 4.765740735412598e-06, + "loss": 0.8561, + "step": 16067 + }, + { + "epoch": 0.7824118033744796, + "grad_norm": 1.6017688512802124, + "learning_rate": 4.763697163838659e-06, + "loss": 0.8463, + "step": 16068 + }, + { + "epoch": 0.7824604971635868, + "grad_norm": 1.5398585796356201, + "learning_rate": 4.761653971269129e-06, + "loss": 0.8357, + "step": 16069 + }, + { + "epoch": 0.782509190952694, + "grad_norm": 1.28672194480896, + "learning_rate": 4.759611157754851e-06, + "loss": 0.8037, + "step": 16070 + }, + { + "epoch": 0.7825578847418012, + "grad_norm": 1.386287808418274, + "learning_rate": 4.757568723346635e-06, + "loss": 0.7828, + "step": 16071 + }, + { + "epoch": 0.7826065785309084, + "grad_norm": 1.7960835695266724, + "learning_rate": 4.755526668095287e-06, + "loss": 0.8188, + "step": 16072 + }, + { + "epoch": 0.7826552723200155, + "grad_norm": 1.9379558563232422, + "learning_rate": 4.753484992051602e-06, + "loss": 0.7862, + "step": 16073 + }, + { + "epoch": 0.7827039661091227, + "grad_norm": 1.5317785739898682, + "learning_rate": 4.7514436952663645e-06, + "loss": 0.7356, + "step": 16074 + }, + { + "epoch": 0.78275265989823, + "grad_norm": 1.8150168657302856, + "learning_rate": 4.7494027777903665e-06, + "loss": 0.7916, + "step": 16075 + }, + { + "epoch": 0.7828013536873372, + "grad_norm": 1.4323441982269287, + "learning_rate": 4.747362239674351e-06, + "loss": 0.8267, + "step": 16076 + }, + { + "epoch": 0.7828500474764444, + "grad_norm": 2.341188669204712, + "learning_rate": 4.7453220809690995e-06, + "loss": 0.7305, + "step": 16077 + }, + { + "epoch": 0.7828987412655516, + "grad_norm": 1.2705031633377075, + "learning_rate": 4.743282301725349e-06, + "loss": 0.8388, + "step": 16078 + }, + { + "epoch": 0.7829474350546588, + "grad_norm": 2.0683929920196533, + "learning_rate": 4.741242901993846e-06, + "loss": 0.8407, + "step": 16079 + }, + { + "epoch": 0.782996128843766, + "grad_norm": 2.2599451541900635, + "learning_rate": 4.739203881825314e-06, + "loss": 0.8249, + "step": 16080 + }, + { + "epoch": 0.7830448226328732, + "grad_norm": 1.6603217124938965, + "learning_rate": 4.7371652412704785e-06, + "loss": 0.8549, + "step": 16081 + }, + { + "epoch": 0.7830935164219803, + "grad_norm": 2.0222883224487305, + "learning_rate": 4.735126980380047e-06, + "loss": 0.8805, + "step": 16082 + }, + { + "epoch": 0.7831422102110875, + "grad_norm": 1.6874743700027466, + "learning_rate": 4.733089099204726e-06, + "loss": 0.8259, + "step": 16083 + }, + { + "epoch": 0.7831909040001948, + "grad_norm": 0.10368698090314865, + "learning_rate": 4.731051597795205e-06, + "loss": 0.7264, + "step": 16084 + }, + { + "epoch": 0.783239597789302, + "grad_norm": 1.4011907577514648, + "learning_rate": 4.729014476202161e-06, + "loss": 0.8206, + "step": 16085 + }, + { + "epoch": 0.7832882915784092, + "grad_norm": 1.774021029472351, + "learning_rate": 4.726977734476281e-06, + "loss": 0.8461, + "step": 16086 + }, + { + "epoch": 0.7833369853675164, + "grad_norm": 1.7373312711715698, + "learning_rate": 4.724941372668222e-06, + "loss": 0.844, + "step": 16087 + }, + { + "epoch": 0.7833856791566236, + "grad_norm": 1.451598882675171, + "learning_rate": 4.722905390828638e-06, + "loss": 0.8109, + "step": 16088 + }, + { + "epoch": 0.7834343729457308, + "grad_norm": 1.9396272897720337, + "learning_rate": 4.720869789008178e-06, + "loss": 0.8157, + "step": 16089 + }, + { + "epoch": 0.7834830667348379, + "grad_norm": 1.7112337350845337, + "learning_rate": 4.718834567257473e-06, + "loss": 0.8319, + "step": 16090 + }, + { + "epoch": 0.7835317605239451, + "grad_norm": 1.2917391061782837, + "learning_rate": 4.7167997256271526e-06, + "loss": 0.8384, + "step": 16091 + }, + { + "epoch": 0.7835804543130523, + "grad_norm": 1.3729948997497559, + "learning_rate": 4.714765264167831e-06, + "loss": 0.8473, + "step": 16092 + }, + { + "epoch": 0.7836291481021596, + "grad_norm": 0.10544773936271667, + "learning_rate": 4.712731182930113e-06, + "loss": 0.5666, + "step": 16093 + }, + { + "epoch": 0.7836778418912668, + "grad_norm": 1.471976399421692, + "learning_rate": 4.7106974819646055e-06, + "loss": 0.8262, + "step": 16094 + }, + { + "epoch": 0.783726535680374, + "grad_norm": 1.581563949584961, + "learning_rate": 4.708664161321892e-06, + "loss": 0.7206, + "step": 16095 + }, + { + "epoch": 0.7837752294694812, + "grad_norm": 1.7558262348175049, + "learning_rate": 4.706631221052551e-06, + "loss": 0.7842, + "step": 16096 + }, + { + "epoch": 0.7838239232585884, + "grad_norm": 1.373153805732727, + "learning_rate": 4.704598661207154e-06, + "loss": 0.8212, + "step": 16097 + }, + { + "epoch": 0.7838726170476956, + "grad_norm": 1.8151081800460815, + "learning_rate": 4.702566481836262e-06, + "loss": 0.8738, + "step": 16098 + }, + { + "epoch": 0.7839213108368027, + "grad_norm": 1.6587616205215454, + "learning_rate": 4.700534682990418e-06, + "loss": 0.7919, + "step": 16099 + }, + { + "epoch": 0.7839700046259099, + "grad_norm": 3.967942237854004, + "learning_rate": 4.6985032647201686e-06, + "loss": 0.873, + "step": 16100 + }, + { + "epoch": 0.7840186984150171, + "grad_norm": 1.8646416664123535, + "learning_rate": 4.696472227076054e-06, + "loss": 0.7984, + "step": 16101 + }, + { + "epoch": 0.7840673922041244, + "grad_norm": 11.395590782165527, + "learning_rate": 4.694441570108576e-06, + "loss": 0.8862, + "step": 16102 + }, + { + "epoch": 0.7841160859932316, + "grad_norm": 1.202737808227539, + "learning_rate": 4.6924112938682684e-06, + "loss": 0.8683, + "step": 16103 + }, + { + "epoch": 0.7841647797823388, + "grad_norm": 1.4780298471450806, + "learning_rate": 4.690381398405623e-06, + "loss": 0.8319, + "step": 16104 + }, + { + "epoch": 0.784213473571446, + "grad_norm": 1.980873465538025, + "learning_rate": 4.688351883771136e-06, + "loss": 0.7545, + "step": 16105 + }, + { + "epoch": 0.7842621673605532, + "grad_norm": 1.9267152547836304, + "learning_rate": 4.686322750015293e-06, + "loss": 0.7795, + "step": 16106 + }, + { + "epoch": 0.7843108611496603, + "grad_norm": 1.515873908996582, + "learning_rate": 4.684293997188567e-06, + "loss": 0.852, + "step": 16107 + }, + { + "epoch": 0.7843595549387675, + "grad_norm": 1.262717843055725, + "learning_rate": 4.6822656253414265e-06, + "loss": 0.8477, + "step": 16108 + }, + { + "epoch": 0.7844082487278747, + "grad_norm": 1.27472722530365, + "learning_rate": 4.680237634524316e-06, + "loss": 0.8158, + "step": 16109 + }, + { + "epoch": 0.784456942516982, + "grad_norm": 1.9334602355957031, + "learning_rate": 4.678210024787706e-06, + "loss": 0.7668, + "step": 16110 + }, + { + "epoch": 0.7845056363060892, + "grad_norm": 1.477211356163025, + "learning_rate": 4.676182796182005e-06, + "loss": 0.7734, + "step": 16111 + }, + { + "epoch": 0.7845543300951964, + "grad_norm": 1.4530109167099, + "learning_rate": 4.674155948757666e-06, + "loss": 0.7267, + "step": 16112 + }, + { + "epoch": 0.7846030238843036, + "grad_norm": 2.0134873390197754, + "learning_rate": 4.672129482565084e-06, + "loss": 0.6597, + "step": 16113 + }, + { + "epoch": 0.7846517176734108, + "grad_norm": 1.8224300146102905, + "learning_rate": 4.6701033976546835e-06, + "loss": 0.8123, + "step": 16114 + }, + { + "epoch": 0.784700411462518, + "grad_norm": 1.9134095907211304, + "learning_rate": 4.66807769407686e-06, + "loss": 0.7619, + "step": 16115 + }, + { + "epoch": 0.7847491052516251, + "grad_norm": 1.1978031396865845, + "learning_rate": 4.666052371881995e-06, + "loss": 0.7841, + "step": 16116 + }, + { + "epoch": 0.7847977990407323, + "grad_norm": 1.7014498710632324, + "learning_rate": 4.664027431120488e-06, + "loss": 0.8165, + "step": 16117 + }, + { + "epoch": 0.7848464928298395, + "grad_norm": 1.285863995552063, + "learning_rate": 4.662002871842684e-06, + "loss": 0.8596, + "step": 16118 + }, + { + "epoch": 0.7848951866189467, + "grad_norm": 1.732079267501831, + "learning_rate": 4.65997869409897e-06, + "loss": 0.7867, + "step": 16119 + }, + { + "epoch": 0.784943880408054, + "grad_norm": 2.470242500305176, + "learning_rate": 4.6579548979396716e-06, + "loss": 0.9198, + "step": 16120 + }, + { + "epoch": 0.7849925741971612, + "grad_norm": 0.09206021577119827, + "learning_rate": 4.65593148341515e-06, + "loss": 0.5716, + "step": 16121 + }, + { + "epoch": 0.7850412679862684, + "grad_norm": 1.8304214477539062, + "learning_rate": 4.653908450575733e-06, + "loss": 0.8117, + "step": 16122 + }, + { + "epoch": 0.7850899617753756, + "grad_norm": 2.1082873344421387, + "learning_rate": 4.651885799471736e-06, + "loss": 0.7704, + "step": 16123 + }, + { + "epoch": 0.7851386555644827, + "grad_norm": 1.4627243280410767, + "learning_rate": 4.6498635301534915e-06, + "loss": 0.7911, + "step": 16124 + }, + { + "epoch": 0.7851873493535899, + "grad_norm": 2.603672504425049, + "learning_rate": 4.647841642671278e-06, + "loss": 0.7649, + "step": 16125 + }, + { + "epoch": 0.7852360431426971, + "grad_norm": 1.330031394958496, + "learning_rate": 4.645820137075417e-06, + "loss": 0.8295, + "step": 16126 + }, + { + "epoch": 0.7852847369318043, + "grad_norm": 1.3889939785003662, + "learning_rate": 4.643799013416166e-06, + "loss": 0.8607, + "step": 16127 + }, + { + "epoch": 0.7853334307209116, + "grad_norm": 1.1637095212936401, + "learning_rate": 4.641778271743822e-06, + "loss": 0.7857, + "step": 16128 + }, + { + "epoch": 0.7853821245100188, + "grad_norm": 1.5936683416366577, + "learning_rate": 4.639757912108642e-06, + "loss": 0.8175, + "step": 16129 + }, + { + "epoch": 0.785430818299126, + "grad_norm": 1.470688819885254, + "learning_rate": 4.637737934560886e-06, + "loss": 0.765, + "step": 16130 + }, + { + "epoch": 0.7854795120882332, + "grad_norm": 1.7984044551849365, + "learning_rate": 4.635718339150801e-06, + "loss": 0.7826, + "step": 16131 + }, + { + "epoch": 0.7855282058773403, + "grad_norm": 1.3840456008911133, + "learning_rate": 4.633699125928621e-06, + "loss": 0.7706, + "step": 16132 + }, + { + "epoch": 0.7855768996664475, + "grad_norm": 5.644026279449463, + "learning_rate": 4.631680294944578e-06, + "loss": 0.7334, + "step": 16133 + }, + { + "epoch": 0.7856255934555547, + "grad_norm": 1.3813176155090332, + "learning_rate": 4.629661846248881e-06, + "loss": 0.8687, + "step": 16134 + }, + { + "epoch": 0.7856742872446619, + "grad_norm": 1.4013127088546753, + "learning_rate": 4.62764377989176e-06, + "loss": 0.7833, + "step": 16135 + }, + { + "epoch": 0.7857229810337691, + "grad_norm": 1.6484675407409668, + "learning_rate": 4.62562609592339e-06, + "loss": 0.7427, + "step": 16136 + }, + { + "epoch": 0.7857716748228764, + "grad_norm": 1.3222625255584717, + "learning_rate": 4.623608794393977e-06, + "loss": 0.8975, + "step": 16137 + }, + { + "epoch": 0.7858203686119836, + "grad_norm": 1.3766974210739136, + "learning_rate": 4.621591875353699e-06, + "loss": 0.7749, + "step": 16138 + }, + { + "epoch": 0.7858690624010908, + "grad_norm": 1.7366987466812134, + "learning_rate": 4.619575338852724e-06, + "loss": 0.8753, + "step": 16139 + }, + { + "epoch": 0.785917756190198, + "grad_norm": 1.9219212532043457, + "learning_rate": 4.617559184941216e-06, + "loss": 0.8882, + "step": 16140 + }, + { + "epoch": 0.7859664499793051, + "grad_norm": 1.8460756540298462, + "learning_rate": 4.615543413669325e-06, + "loss": 0.7123, + "step": 16141 + }, + { + "epoch": 0.7860151437684123, + "grad_norm": 1.460209608078003, + "learning_rate": 4.613528025087193e-06, + "loss": 0.7824, + "step": 16142 + }, + { + "epoch": 0.7860638375575195, + "grad_norm": 1.7616156339645386, + "learning_rate": 4.611513019244951e-06, + "loss": 0.8821, + "step": 16143 + }, + { + "epoch": 0.7861125313466267, + "grad_norm": 1.351676344871521, + "learning_rate": 4.609498396192731e-06, + "loss": 0.7579, + "step": 16144 + }, + { + "epoch": 0.7861612251357339, + "grad_norm": 1.225120186805725, + "learning_rate": 4.6074841559806394e-06, + "loss": 0.7945, + "step": 16145 + }, + { + "epoch": 0.7862099189248412, + "grad_norm": 1.4938348531723022, + "learning_rate": 4.605470298658785e-06, + "loss": 0.8478, + "step": 16146 + }, + { + "epoch": 0.7862586127139484, + "grad_norm": 1.055780053138733, + "learning_rate": 4.6034568242772594e-06, + "loss": 0.7489, + "step": 16147 + }, + { + "epoch": 0.7863073065030556, + "grad_norm": 2.232797145843506, + "learning_rate": 4.60144373288615e-06, + "loss": 0.7907, + "step": 16148 + }, + { + "epoch": 0.7863560002921627, + "grad_norm": 1.750002384185791, + "learning_rate": 4.599431024535533e-06, + "loss": 0.85, + "step": 16149 + }, + { + "epoch": 0.7864046940812699, + "grad_norm": 2.4022998809814453, + "learning_rate": 4.597418699275471e-06, + "loss": 0.7689, + "step": 16150 + }, + { + "epoch": 0.7864533878703771, + "grad_norm": 1.6661654710769653, + "learning_rate": 4.595406757156018e-06, + "loss": 0.8511, + "step": 16151 + }, + { + "epoch": 0.7865020816594843, + "grad_norm": 1.8699960708618164, + "learning_rate": 4.5933951982272374e-06, + "loss": 0.8148, + "step": 16152 + }, + { + "epoch": 0.7865507754485915, + "grad_norm": 1.8063346147537231, + "learning_rate": 4.5913840225391446e-06, + "loss": 0.8428, + "step": 16153 + }, + { + "epoch": 0.7865994692376987, + "grad_norm": 1.820530652999878, + "learning_rate": 4.589373230141787e-06, + "loss": 0.7874, + "step": 16154 + }, + { + "epoch": 0.786648163026806, + "grad_norm": 1.4828426837921143, + "learning_rate": 4.587362821085173e-06, + "loss": 0.7909, + "step": 16155 + }, + { + "epoch": 0.7866968568159132, + "grad_norm": 2.2012643814086914, + "learning_rate": 4.585352795419315e-06, + "loss": 0.8856, + "step": 16156 + }, + { + "epoch": 0.7867455506050204, + "grad_norm": 1.7879823446273804, + "learning_rate": 4.583343153194213e-06, + "loss": 0.9243, + "step": 16157 + }, + { + "epoch": 0.7867942443941275, + "grad_norm": 1.1046178340911865, + "learning_rate": 4.581333894459847e-06, + "loss": 0.7337, + "step": 16158 + }, + { + "epoch": 0.7868429381832347, + "grad_norm": 2.2330033779144287, + "learning_rate": 4.5793250192662205e-06, + "loss": 0.8849, + "step": 16159 + }, + { + "epoch": 0.7868916319723419, + "grad_norm": 1.5597805976867676, + "learning_rate": 4.577316527663278e-06, + "loss": 0.8457, + "step": 16160 + }, + { + "epoch": 0.7869403257614491, + "grad_norm": 1.4577556848526, + "learning_rate": 4.575308419701003e-06, + "loss": 0.7726, + "step": 16161 + }, + { + "epoch": 0.7869890195505563, + "grad_norm": 2.460709571838379, + "learning_rate": 4.573300695429328e-06, + "loss": 0.7946, + "step": 16162 + }, + { + "epoch": 0.7870377133396635, + "grad_norm": 1.860421895980835, + "learning_rate": 4.57129335489821e-06, + "loss": 0.7934, + "step": 16163 + }, + { + "epoch": 0.7870864071287708, + "grad_norm": 1.6040468215942383, + "learning_rate": 4.569286398157575e-06, + "loss": 0.753, + "step": 16164 + }, + { + "epoch": 0.787135100917878, + "grad_norm": 2.524655342102051, + "learning_rate": 4.567279825257349e-06, + "loss": 0.8111, + "step": 16165 + }, + { + "epoch": 0.7871837947069851, + "grad_norm": 0.10702668875455856, + "learning_rate": 4.565273636247444e-06, + "loss": 0.6255, + "step": 16166 + }, + { + "epoch": 0.7872324884960923, + "grad_norm": 1.775205373764038, + "learning_rate": 4.56326783117776e-06, + "loss": 0.8317, + "step": 16167 + }, + { + "epoch": 0.7872811822851995, + "grad_norm": 2.5230588912963867, + "learning_rate": 4.561262410098206e-06, + "loss": 0.7751, + "step": 16168 + }, + { + "epoch": 0.7873298760743067, + "grad_norm": 1.3714951276779175, + "learning_rate": 4.559257373058645e-06, + "loss": 0.7452, + "step": 16169 + }, + { + "epoch": 0.7873785698634139, + "grad_norm": 1.8642226457595825, + "learning_rate": 4.5572527201089776e-06, + "loss": 0.8831, + "step": 16170 + }, + { + "epoch": 0.7874272636525211, + "grad_norm": 0.10457798093557358, + "learning_rate": 4.5552484512990455e-06, + "loss": 0.5899, + "step": 16171 + }, + { + "epoch": 0.7874759574416283, + "grad_norm": 1.3556653261184692, + "learning_rate": 4.553244566678723e-06, + "loss": 0.8189, + "step": 16172 + }, + { + "epoch": 0.7875246512307356, + "grad_norm": 1.9955190420150757, + "learning_rate": 4.551241066297849e-06, + "loss": 0.859, + "step": 16173 + }, + { + "epoch": 0.7875733450198428, + "grad_norm": 1.356223702430725, + "learning_rate": 4.549237950206258e-06, + "loss": 0.8503, + "step": 16174 + }, + { + "epoch": 0.7876220388089499, + "grad_norm": 1.8966692686080933, + "learning_rate": 4.547235218453789e-06, + "loss": 0.8438, + "step": 16175 + }, + { + "epoch": 0.7876707325980571, + "grad_norm": 1.4999529123306274, + "learning_rate": 4.545232871090246e-06, + "loss": 0.8078, + "step": 16176 + }, + { + "epoch": 0.7877194263871643, + "grad_norm": 1.210730791091919, + "learning_rate": 4.543230908165455e-06, + "loss": 0.8115, + "step": 16177 + }, + { + "epoch": 0.7877681201762715, + "grad_norm": 1.4120957851409912, + "learning_rate": 4.541229329729191e-06, + "loss": 0.8038, + "step": 16178 + }, + { + "epoch": 0.7878168139653787, + "grad_norm": 1.9473565816879272, + "learning_rate": 4.539228135831264e-06, + "loss": 0.7508, + "step": 16179 + }, + { + "epoch": 0.7878655077544859, + "grad_norm": 1.433297872543335, + "learning_rate": 4.537227326521445e-06, + "loss": 0.822, + "step": 16180 + }, + { + "epoch": 0.7879142015435932, + "grad_norm": 1.383910894393921, + "learning_rate": 4.535226901849507e-06, + "loss": 0.7765, + "step": 16181 + }, + { + "epoch": 0.7879628953327004, + "grad_norm": 0.10223900526762009, + "learning_rate": 4.533226861865212e-06, + "loss": 0.5751, + "step": 16182 + }, + { + "epoch": 0.7880115891218075, + "grad_norm": 1.2937264442443848, + "learning_rate": 4.531227206618301e-06, + "loss": 0.7532, + "step": 16183 + }, + { + "epoch": 0.7880602829109147, + "grad_norm": 2.0847153663635254, + "learning_rate": 4.529227936158535e-06, + "loss": 0.8361, + "step": 16184 + }, + { + "epoch": 0.7881089767000219, + "grad_norm": 1.4393484592437744, + "learning_rate": 4.5272290505356244e-06, + "loss": 0.8217, + "step": 16185 + }, + { + "epoch": 0.7881576704891291, + "grad_norm": 2.0308966636657715, + "learning_rate": 4.525230549799309e-06, + "loss": 0.8085, + "step": 16186 + }, + { + "epoch": 0.7882063642782363, + "grad_norm": 1.3164231777191162, + "learning_rate": 4.523232433999292e-06, + "loss": 0.8748, + "step": 16187 + }, + { + "epoch": 0.7882550580673435, + "grad_norm": 1.7097150087356567, + "learning_rate": 4.52123470318528e-06, + "loss": 0.7728, + "step": 16188 + }, + { + "epoch": 0.7883037518564507, + "grad_norm": 1.1626054048538208, + "learning_rate": 4.519237357406965e-06, + "loss": 0.7998, + "step": 16189 + }, + { + "epoch": 0.788352445645558, + "grad_norm": 1.5380395650863647, + "learning_rate": 4.517240396714034e-06, + "loss": 0.8581, + "step": 16190 + }, + { + "epoch": 0.788401139434665, + "grad_norm": 0.09534475952386856, + "learning_rate": 4.515243821156159e-06, + "loss": 0.715, + "step": 16191 + }, + { + "epoch": 0.7884498332237723, + "grad_norm": 1.0441452264785767, + "learning_rate": 4.513247630783006e-06, + "loss": 0.7958, + "step": 16192 + }, + { + "epoch": 0.7884985270128795, + "grad_norm": 1.4265329837799072, + "learning_rate": 4.511251825644229e-06, + "loss": 0.768, + "step": 16193 + }, + { + "epoch": 0.7885472208019867, + "grad_norm": 1.5100959539413452, + "learning_rate": 4.509256405789473e-06, + "loss": 0.7514, + "step": 16194 + }, + { + "epoch": 0.7885959145910939, + "grad_norm": 1.446142315864563, + "learning_rate": 4.507261371268379e-06, + "loss": 0.8627, + "step": 16195 + }, + { + "epoch": 0.7886446083802011, + "grad_norm": 14.135676383972168, + "learning_rate": 4.50526672213057e-06, + "loss": 0.8015, + "step": 16196 + }, + { + "epoch": 0.7886933021693083, + "grad_norm": 1.3991121053695679, + "learning_rate": 4.5032724584256675e-06, + "loss": 0.794, + "step": 16197 + }, + { + "epoch": 0.7887419959584155, + "grad_norm": 1.3079967498779297, + "learning_rate": 4.501278580203272e-06, + "loss": 0.8082, + "step": 16198 + }, + { + "epoch": 0.7887906897475228, + "grad_norm": 1.4575048685073853, + "learning_rate": 4.499285087512988e-06, + "loss": 0.8435, + "step": 16199 + }, + { + "epoch": 0.7888393835366299, + "grad_norm": 1.66218900680542, + "learning_rate": 4.497291980404399e-06, + "loss": 0.751, + "step": 16200 + }, + { + "epoch": 0.7888880773257371, + "grad_norm": 1.9981844425201416, + "learning_rate": 4.495299258927084e-06, + "loss": 0.8069, + "step": 16201 + }, + { + "epoch": 0.7889367711148443, + "grad_norm": 1.2872027158737183, + "learning_rate": 4.493306923130611e-06, + "loss": 0.7503, + "step": 16202 + }, + { + "epoch": 0.7889854649039515, + "grad_norm": 2.2519748210906982, + "learning_rate": 4.491314973064547e-06, + "loss": 0.8502, + "step": 16203 + }, + { + "epoch": 0.7890341586930587, + "grad_norm": 1.7444194555282593, + "learning_rate": 4.489323408778434e-06, + "loss": 0.8132, + "step": 16204 + }, + { + "epoch": 0.7890828524821659, + "grad_norm": 1.9195818901062012, + "learning_rate": 4.48733223032182e-06, + "loss": 0.7943, + "step": 16205 + }, + { + "epoch": 0.7891315462712731, + "grad_norm": 1.3874449729919434, + "learning_rate": 4.485341437744228e-06, + "loss": 0.6673, + "step": 16206 + }, + { + "epoch": 0.7891802400603803, + "grad_norm": 1.530142903327942, + "learning_rate": 4.4833510310951845e-06, + "loss": 0.8316, + "step": 16207 + }, + { + "epoch": 0.7892289338494874, + "grad_norm": 1.3671540021896362, + "learning_rate": 4.4813610104241964e-06, + "loss": 0.8042, + "step": 16208 + }, + { + "epoch": 0.7892776276385947, + "grad_norm": 2.1407296657562256, + "learning_rate": 4.4793713757807635e-06, + "loss": 0.793, + "step": 16209 + }, + { + "epoch": 0.7893263214277019, + "grad_norm": 1.565232753753662, + "learning_rate": 4.477382127214396e-06, + "loss": 0.78, + "step": 16210 + }, + { + "epoch": 0.7893750152168091, + "grad_norm": 2.256835699081421, + "learning_rate": 4.47539326477455e-06, + "loss": 0.8533, + "step": 16211 + }, + { + "epoch": 0.7894237090059163, + "grad_norm": 2.2283272743225098, + "learning_rate": 4.473404788510718e-06, + "loss": 0.8707, + "step": 16212 + }, + { + "epoch": 0.7894724027950235, + "grad_norm": 1.3909580707550049, + "learning_rate": 4.4714166984723575e-06, + "loss": 0.8103, + "step": 16213 + }, + { + "epoch": 0.7895210965841307, + "grad_norm": 1.588192343711853, + "learning_rate": 4.469428994708922e-06, + "loss": 0.8485, + "step": 16214 + }, + { + "epoch": 0.7895697903732379, + "grad_norm": 1.4479633569717407, + "learning_rate": 4.467441677269857e-06, + "loss": 0.7439, + "step": 16215 + }, + { + "epoch": 0.7896184841623451, + "grad_norm": 1.6215379238128662, + "learning_rate": 4.465454746204596e-06, + "loss": 0.7931, + "step": 16216 + }, + { + "epoch": 0.7896671779514522, + "grad_norm": 1.5434150695800781, + "learning_rate": 4.463468201562566e-06, + "loss": 0.8019, + "step": 16217 + }, + { + "epoch": 0.7897158717405595, + "grad_norm": 1.5523842573165894, + "learning_rate": 4.461482043393175e-06, + "loss": 0.7585, + "step": 16218 + }, + { + "epoch": 0.7897645655296667, + "grad_norm": 1.813261866569519, + "learning_rate": 4.459496271745845e-06, + "loss": 0.6895, + "step": 16219 + }, + { + "epoch": 0.7898132593187739, + "grad_norm": 1.5106960535049438, + "learning_rate": 4.457510886669951e-06, + "loss": 0.7216, + "step": 16220 + }, + { + "epoch": 0.7898619531078811, + "grad_norm": 1.4005181789398193, + "learning_rate": 4.455525888214902e-06, + "loss": 0.8929, + "step": 16221 + }, + { + "epoch": 0.7899106468969883, + "grad_norm": 2.0364036560058594, + "learning_rate": 4.453541276430053e-06, + "loss": 0.7825, + "step": 16222 + }, + { + "epoch": 0.7899593406860955, + "grad_norm": 1.158589482307434, + "learning_rate": 4.4515570513647875e-06, + "loss": 0.7444, + "step": 16223 + }, + { + "epoch": 0.7900080344752027, + "grad_norm": 1.3849245309829712, + "learning_rate": 4.449573213068459e-06, + "loss": 0.7944, + "step": 16224 + }, + { + "epoch": 0.7900567282643098, + "grad_norm": 0.10009552538394928, + "learning_rate": 4.44758976159041e-06, + "loss": 0.606, + "step": 16225 + }, + { + "epoch": 0.790105422053417, + "grad_norm": 1.3811830282211304, + "learning_rate": 4.445606696979992e-06, + "loss": 0.767, + "step": 16226 + }, + { + "epoch": 0.7901541158425243, + "grad_norm": 1.6494795083999634, + "learning_rate": 4.443624019286517e-06, + "loss": 0.8495, + "step": 16227 + }, + { + "epoch": 0.7902028096316315, + "grad_norm": 1.3176584243774414, + "learning_rate": 4.44164172855932e-06, + "loss": 0.8614, + "step": 16228 + }, + { + "epoch": 0.7902515034207387, + "grad_norm": 0.10565104335546494, + "learning_rate": 4.439659824847697e-06, + "loss": 0.5944, + "step": 16229 + }, + { + "epoch": 0.7903001972098459, + "grad_norm": 2.201462507247925, + "learning_rate": 4.437678308200959e-06, + "loss": 0.8631, + "step": 16230 + }, + { + "epoch": 0.7903488909989531, + "grad_norm": 1.3628971576690674, + "learning_rate": 4.43569717866839e-06, + "loss": 0.7325, + "step": 16231 + }, + { + "epoch": 0.7903975847880603, + "grad_norm": 1.8616639375686646, + "learning_rate": 4.433716436299269e-06, + "loss": 0.7869, + "step": 16232 + }, + { + "epoch": 0.7904462785771674, + "grad_norm": 1.5586274862289429, + "learning_rate": 4.431736081142881e-06, + "loss": 0.8294, + "step": 16233 + }, + { + "epoch": 0.7904949723662746, + "grad_norm": 1.5261794328689575, + "learning_rate": 4.429756113248467e-06, + "loss": 0.7286, + "step": 16234 + }, + { + "epoch": 0.7905436661553819, + "grad_norm": 1.547621488571167, + "learning_rate": 4.427776532665302e-06, + "loss": 0.7411, + "step": 16235 + }, + { + "epoch": 0.7905923599444891, + "grad_norm": 1.3210357427597046, + "learning_rate": 4.425797339442602e-06, + "loss": 0.6848, + "step": 16236 + }, + { + "epoch": 0.7906410537335963, + "grad_norm": 1.587538719177246, + "learning_rate": 4.42381853362962e-06, + "loss": 0.8429, + "step": 16237 + }, + { + "epoch": 0.7906897475227035, + "grad_norm": 1.2230349779129028, + "learning_rate": 4.421840115275571e-06, + "loss": 0.8136, + "step": 16238 + }, + { + "epoch": 0.7907384413118107, + "grad_norm": 1.780799150466919, + "learning_rate": 4.419862084429671e-06, + "loss": 0.8195, + "step": 16239 + }, + { + "epoch": 0.7907871351009179, + "grad_norm": 1.5294471979141235, + "learning_rate": 4.41788444114112e-06, + "loss": 0.8341, + "step": 16240 + }, + { + "epoch": 0.7908358288900251, + "grad_norm": 1.3445498943328857, + "learning_rate": 4.415907185459116e-06, + "loss": 0.7223, + "step": 16241 + }, + { + "epoch": 0.7908845226791322, + "grad_norm": 1.4468964338302612, + "learning_rate": 4.4139303174328414e-06, + "loss": 0.8297, + "step": 16242 + }, + { + "epoch": 0.7909332164682394, + "grad_norm": 6.057516574859619, + "learning_rate": 4.411953837111467e-06, + "loss": 0.8173, + "step": 16243 + }, + { + "epoch": 0.7909819102573467, + "grad_norm": 2.7273950576782227, + "learning_rate": 4.409977744544165e-06, + "loss": 0.7816, + "step": 16244 + }, + { + "epoch": 0.7910306040464539, + "grad_norm": 1.3954455852508545, + "learning_rate": 4.4080020397800905e-06, + "loss": 0.8249, + "step": 16245 + }, + { + "epoch": 0.7910792978355611, + "grad_norm": 1.6278454065322876, + "learning_rate": 4.406026722868386e-06, + "loss": 0.7642, + "step": 16246 + }, + { + "epoch": 0.7911279916246683, + "grad_norm": 2.3817286491394043, + "learning_rate": 4.404051793858188e-06, + "loss": 0.7637, + "step": 16247 + }, + { + "epoch": 0.7911766854137755, + "grad_norm": 2.04345703125, + "learning_rate": 4.402077252798625e-06, + "loss": 0.8748, + "step": 16248 + }, + { + "epoch": 0.7912253792028827, + "grad_norm": 0.10546647012233734, + "learning_rate": 4.400103099738811e-06, + "loss": 0.6075, + "step": 16249 + }, + { + "epoch": 0.7912740729919898, + "grad_norm": 1.449796199798584, + "learning_rate": 4.398129334727854e-06, + "loss": 0.8688, + "step": 16250 + }, + { + "epoch": 0.791322766781097, + "grad_norm": 0.101351797580719, + "learning_rate": 4.396155957814854e-06, + "loss": 0.5796, + "step": 16251 + }, + { + "epoch": 0.7913714605702042, + "grad_norm": 1.406025767326355, + "learning_rate": 4.3941829690488905e-06, + "loss": 0.783, + "step": 16252 + }, + { + "epoch": 0.7914201543593115, + "grad_norm": 1.3555097579956055, + "learning_rate": 4.392210368479055e-06, + "loss": 0.8656, + "step": 16253 + }, + { + "epoch": 0.7914688481484187, + "grad_norm": 1.6315124034881592, + "learning_rate": 4.39023815615441e-06, + "loss": 0.7355, + "step": 16254 + }, + { + "epoch": 0.7915175419375259, + "grad_norm": 1.7794597148895264, + "learning_rate": 4.3882663321240135e-06, + "loss": 0.8286, + "step": 16255 + }, + { + "epoch": 0.7915662357266331, + "grad_norm": 2.167416572570801, + "learning_rate": 4.386294896436916e-06, + "loss": 0.7363, + "step": 16256 + }, + { + "epoch": 0.7916149295157403, + "grad_norm": 1.6586111783981323, + "learning_rate": 4.384323849142156e-06, + "loss": 0.801, + "step": 16257 + }, + { + "epoch": 0.7916636233048475, + "grad_norm": 1.278931975364685, + "learning_rate": 4.382353190288764e-06, + "loss": 0.8108, + "step": 16258 + }, + { + "epoch": 0.7917123170939546, + "grad_norm": 1.3504014015197754, + "learning_rate": 4.3803829199257604e-06, + "loss": 0.9209, + "step": 16259 + }, + { + "epoch": 0.7917610108830618, + "grad_norm": 1.1526463031768799, + "learning_rate": 4.378413038102149e-06, + "loss": 0.813, + "step": 16260 + }, + { + "epoch": 0.791809704672169, + "grad_norm": 2.771109104156494, + "learning_rate": 4.376443544866951e-06, + "loss": 0.9148, + "step": 16261 + }, + { + "epoch": 0.7918583984612763, + "grad_norm": 1.3444817066192627, + "learning_rate": 4.374474440269132e-06, + "loss": 0.8274, + "step": 16262 + }, + { + "epoch": 0.7919070922503835, + "grad_norm": 1.2149485349655151, + "learning_rate": 4.372505724357692e-06, + "loss": 0.8276, + "step": 16263 + }, + { + "epoch": 0.7919557860394907, + "grad_norm": 1.982890248298645, + "learning_rate": 4.370537397181595e-06, + "loss": 0.7548, + "step": 16264 + }, + { + "epoch": 0.7920044798285979, + "grad_norm": 1.665473461151123, + "learning_rate": 4.368569458789809e-06, + "loss": 0.7422, + "step": 16265 + }, + { + "epoch": 0.7920531736177051, + "grad_norm": 0.09413517266511917, + "learning_rate": 4.3666019092312805e-06, + "loss": 0.5324, + "step": 16266 + }, + { + "epoch": 0.7921018674068122, + "grad_norm": 1.6375389099121094, + "learning_rate": 4.36463474855495e-06, + "loss": 0.7839, + "step": 16267 + }, + { + "epoch": 0.7921505611959194, + "grad_norm": 1.2809280157089233, + "learning_rate": 4.3626679768097665e-06, + "loss": 0.8615, + "step": 16268 + }, + { + "epoch": 0.7921992549850266, + "grad_norm": 1.6933132410049438, + "learning_rate": 4.360701594044632e-06, + "loss": 0.8668, + "step": 16269 + }, + { + "epoch": 0.7922479487741338, + "grad_norm": 1.5190666913986206, + "learning_rate": 4.358735600308482e-06, + "loss": 0.8239, + "step": 16270 + }, + { + "epoch": 0.7922966425632411, + "grad_norm": 1.842361330986023, + "learning_rate": 4.356769995650199e-06, + "loss": 0.8016, + "step": 16271 + }, + { + "epoch": 0.7923453363523483, + "grad_norm": 1.3959258794784546, + "learning_rate": 4.354804780118693e-06, + "loss": 0.8396, + "step": 16272 + }, + { + "epoch": 0.7923940301414555, + "grad_norm": 1.6526238918304443, + "learning_rate": 4.352839953762846e-06, + "loss": 0.8244, + "step": 16273 + }, + { + "epoch": 0.7924427239305627, + "grad_norm": 1.588592767715454, + "learning_rate": 4.350875516631532e-06, + "loss": 0.8515, + "step": 16274 + }, + { + "epoch": 0.7924914177196699, + "grad_norm": 1.7595831155776978, + "learning_rate": 4.348911468773615e-06, + "loss": 0.8005, + "step": 16275 + }, + { + "epoch": 0.792540111508777, + "grad_norm": 1.4715478420257568, + "learning_rate": 4.34694781023795e-06, + "loss": 0.7723, + "step": 16276 + }, + { + "epoch": 0.7925888052978842, + "grad_norm": 1.9189667701721191, + "learning_rate": 4.344984541073393e-06, + "loss": 0.81, + "step": 16277 + }, + { + "epoch": 0.7926374990869914, + "grad_norm": 1.5497511625289917, + "learning_rate": 4.3430216613287634e-06, + "loss": 0.9198, + "step": 16278 + }, + { + "epoch": 0.7926861928760986, + "grad_norm": 1.2917908430099487, + "learning_rate": 4.341059171052908e-06, + "loss": 0.8961, + "step": 16279 + }, + { + "epoch": 0.7927348866652059, + "grad_norm": 2.0083870887756348, + "learning_rate": 4.339097070294622e-06, + "loss": 0.8722, + "step": 16280 + }, + { + "epoch": 0.7927835804543131, + "grad_norm": 1.5120365619659424, + "learning_rate": 4.337135359102731e-06, + "loss": 0.8451, + "step": 16281 + }, + { + "epoch": 0.7928322742434203, + "grad_norm": 1.318495750427246, + "learning_rate": 4.335174037526026e-06, + "loss": 0.8232, + "step": 16282 + }, + { + "epoch": 0.7928809680325275, + "grad_norm": 9.033432006835938, + "learning_rate": 4.33321310561329e-06, + "loss": 0.8838, + "step": 16283 + }, + { + "epoch": 0.7929296618216346, + "grad_norm": 0.09365993738174438, + "learning_rate": 4.331252563413315e-06, + "loss": 0.6841, + "step": 16284 + }, + { + "epoch": 0.7929783556107418, + "grad_norm": 1.7433054447174072, + "learning_rate": 4.329292410974852e-06, + "loss": 0.8551, + "step": 16285 + }, + { + "epoch": 0.793027049399849, + "grad_norm": 1.4691499471664429, + "learning_rate": 4.32733264834668e-06, + "loss": 0.8722, + "step": 16286 + }, + { + "epoch": 0.7930757431889562, + "grad_norm": 1.1133168935775757, + "learning_rate": 4.325373275577525e-06, + "loss": 0.929, + "step": 16287 + }, + { + "epoch": 0.7931244369780635, + "grad_norm": 1.6514734029769897, + "learning_rate": 4.323414292716146e-06, + "loss": 0.7904, + "step": 16288 + }, + { + "epoch": 0.7931731307671707, + "grad_norm": 1.3241368532180786, + "learning_rate": 4.321455699811266e-06, + "loss": 0.8072, + "step": 16289 + }, + { + "epoch": 0.7932218245562779, + "grad_norm": 1.619417428970337, + "learning_rate": 4.319497496911604e-06, + "loss": 0.8394, + "step": 16290 + }, + { + "epoch": 0.7932705183453851, + "grad_norm": 2.2851967811584473, + "learning_rate": 4.3175396840658705e-06, + "loss": 0.855, + "step": 16291 + }, + { + "epoch": 0.7933192121344922, + "grad_norm": 3.6181442737579346, + "learning_rate": 4.315582261322764e-06, + "loss": 0.781, + "step": 16292 + }, + { + "epoch": 0.7933679059235994, + "grad_norm": 1.6998324394226074, + "learning_rate": 4.313625228730989e-06, + "loss": 0.7824, + "step": 16293 + }, + { + "epoch": 0.7934165997127066, + "grad_norm": 1.3734365701675415, + "learning_rate": 4.3116685863392036e-06, + "loss": 0.8206, + "step": 16294 + }, + { + "epoch": 0.7934652935018138, + "grad_norm": 1.19979727268219, + "learning_rate": 4.309712334196101e-06, + "loss": 0.7966, + "step": 16295 + }, + { + "epoch": 0.793513987290921, + "grad_norm": 1.6828973293304443, + "learning_rate": 4.307756472350333e-06, + "loss": 0.8107, + "step": 16296 + }, + { + "epoch": 0.7935626810800283, + "grad_norm": 1.5904805660247803, + "learning_rate": 4.305801000850554e-06, + "loss": 0.812, + "step": 16297 + }, + { + "epoch": 0.7936113748691355, + "grad_norm": 1.203047275543213, + "learning_rate": 4.303845919745406e-06, + "loss": 0.879, + "step": 16298 + }, + { + "epoch": 0.7936600686582427, + "grad_norm": 1.3028597831726074, + "learning_rate": 4.301891229083521e-06, + "loss": 0.7866, + "step": 16299 + }, + { + "epoch": 0.7937087624473499, + "grad_norm": 1.813090443611145, + "learning_rate": 4.299936928913526e-06, + "loss": 0.887, + "step": 16300 + }, + { + "epoch": 0.793757456236457, + "grad_norm": 1.500732421875, + "learning_rate": 4.2979830192840225e-06, + "loss": 0.7923, + "step": 16301 + }, + { + "epoch": 0.7938061500255642, + "grad_norm": 1.8945280313491821, + "learning_rate": 4.2960295002436345e-06, + "loss": 0.6907, + "step": 16302 + }, + { + "epoch": 0.7938548438146714, + "grad_norm": 2.0135631561279297, + "learning_rate": 4.294076371840936e-06, + "loss": 0.8436, + "step": 16303 + }, + { + "epoch": 0.7939035376037786, + "grad_norm": 1.5916463136672974, + "learning_rate": 4.292123634124523e-06, + "loss": 0.8387, + "step": 16304 + }, + { + "epoch": 0.7939522313928858, + "grad_norm": 2.466038227081299, + "learning_rate": 4.290171287142968e-06, + "loss": 0.818, + "step": 16305 + }, + { + "epoch": 0.7940009251819931, + "grad_norm": 0.09699895232915878, + "learning_rate": 4.288219330944834e-06, + "loss": 0.5692, + "step": 16306 + }, + { + "epoch": 0.7940496189711003, + "grad_norm": 1.91966712474823, + "learning_rate": 4.286267765578676e-06, + "loss": 0.74, + "step": 16307 + }, + { + "epoch": 0.7940983127602075, + "grad_norm": 3.934847116470337, + "learning_rate": 4.284316591093041e-06, + "loss": 0.837, + "step": 16308 + }, + { + "epoch": 0.7941470065493146, + "grad_norm": 1.520287275314331, + "learning_rate": 4.282365807536464e-06, + "loss": 0.8001, + "step": 16309 + }, + { + "epoch": 0.7941957003384218, + "grad_norm": 5.0406270027160645, + "learning_rate": 4.280415414957468e-06, + "loss": 0.7281, + "step": 16310 + }, + { + "epoch": 0.794244394127529, + "grad_norm": 1.5739588737487793, + "learning_rate": 4.278465413404569e-06, + "loss": 0.8183, + "step": 16311 + }, + { + "epoch": 0.7942930879166362, + "grad_norm": 1.6080620288848877, + "learning_rate": 4.276515802926282e-06, + "loss": 0.796, + "step": 16312 + }, + { + "epoch": 0.7943417817057434, + "grad_norm": 1.9324952363967896, + "learning_rate": 4.274566583571096e-06, + "loss": 0.831, + "step": 16313 + }, + { + "epoch": 0.7943904754948506, + "grad_norm": 1.3555189371109009, + "learning_rate": 4.272617755387503e-06, + "loss": 0.8662, + "step": 16314 + }, + { + "epoch": 0.7944391692839579, + "grad_norm": 1.5566542148590088, + "learning_rate": 4.270669318423974e-06, + "loss": 0.7431, + "step": 16315 + }, + { + "epoch": 0.7944878630730651, + "grad_norm": 1.991452693939209, + "learning_rate": 4.2687212727289815e-06, + "loss": 0.8218, + "step": 16316 + }, + { + "epoch": 0.7945365568621723, + "grad_norm": 1.8855758905410767, + "learning_rate": 4.266773618350981e-06, + "loss": 0.8686, + "step": 16317 + }, + { + "epoch": 0.7945852506512794, + "grad_norm": 1.8996676206588745, + "learning_rate": 4.264826355338416e-06, + "loss": 0.8003, + "step": 16318 + }, + { + "epoch": 0.7946339444403866, + "grad_norm": 1.5653080940246582, + "learning_rate": 4.2628794837397415e-06, + "loss": 0.8213, + "step": 16319 + }, + { + "epoch": 0.7946826382294938, + "grad_norm": 1.5770022869110107, + "learning_rate": 4.2609330036033624e-06, + "loss": 0.8791, + "step": 16320 + }, + { + "epoch": 0.794731332018601, + "grad_norm": 1.5138262510299683, + "learning_rate": 4.258986914977718e-06, + "loss": 0.8363, + "step": 16321 + }, + { + "epoch": 0.7947800258077082, + "grad_norm": 0.10169997066259384, + "learning_rate": 4.257041217911207e-06, + "loss": 0.6187, + "step": 16322 + }, + { + "epoch": 0.7948287195968154, + "grad_norm": 1.3616502285003662, + "learning_rate": 4.2550959124522314e-06, + "loss": 0.828, + "step": 16323 + }, + { + "epoch": 0.7948774133859227, + "grad_norm": 1.4446096420288086, + "learning_rate": 4.253150998649183e-06, + "loss": 0.7148, + "step": 16324 + }, + { + "epoch": 0.7949261071750299, + "grad_norm": 1.9447499513626099, + "learning_rate": 4.251206476550436e-06, + "loss": 0.6775, + "step": 16325 + }, + { + "epoch": 0.794974800964137, + "grad_norm": 0.1019030213356018, + "learning_rate": 4.249262346204366e-06, + "loss": 0.5261, + "step": 16326 + }, + { + "epoch": 0.7950234947532442, + "grad_norm": 2.1188011169433594, + "learning_rate": 4.247318607659323e-06, + "loss": 0.7437, + "step": 16327 + }, + { + "epoch": 0.7950721885423514, + "grad_norm": 1.3825722932815552, + "learning_rate": 4.24537526096368e-06, + "loss": 0.8553, + "step": 16328 + }, + { + "epoch": 0.7951208823314586, + "grad_norm": 1.6051430702209473, + "learning_rate": 4.243432306165751e-06, + "loss": 0.8068, + "step": 16329 + }, + { + "epoch": 0.7951695761205658, + "grad_norm": 1.5875699520111084, + "learning_rate": 4.241489743313885e-06, + "loss": 0.7987, + "step": 16330 + }, + { + "epoch": 0.795218269909673, + "grad_norm": 2.030244827270508, + "learning_rate": 4.239547572456399e-06, + "loss": 0.7661, + "step": 16331 + }, + { + "epoch": 0.7952669636987802, + "grad_norm": 1.6122246980667114, + "learning_rate": 4.237605793641606e-06, + "loss": 0.8315, + "step": 16332 + }, + { + "epoch": 0.7953156574878875, + "grad_norm": 1.2948896884918213, + "learning_rate": 4.235664406917803e-06, + "loss": 0.8682, + "step": 16333 + }, + { + "epoch": 0.7953643512769947, + "grad_norm": 1.6960728168487549, + "learning_rate": 4.233723412333283e-06, + "loss": 0.7337, + "step": 16334 + }, + { + "epoch": 0.7954130450661018, + "grad_norm": 1.792194128036499, + "learning_rate": 4.23178280993634e-06, + "loss": 0.82, + "step": 16335 + }, + { + "epoch": 0.795461738855209, + "grad_norm": 5.685561180114746, + "learning_rate": 4.229842599775227e-06, + "loss": 0.8476, + "step": 16336 + }, + { + "epoch": 0.7955104326443162, + "grad_norm": 1.6311157941818237, + "learning_rate": 4.2279027818982255e-06, + "loss": 0.7615, + "step": 16337 + }, + { + "epoch": 0.7955591264334234, + "grad_norm": 1.7238776683807373, + "learning_rate": 4.225963356353573e-06, + "loss": 0.7922, + "step": 16338 + }, + { + "epoch": 0.7956078202225306, + "grad_norm": 1.9533241987228394, + "learning_rate": 4.2240243231895236e-06, + "loss": 0.7294, + "step": 16339 + }, + { + "epoch": 0.7956565140116378, + "grad_norm": 2.1898717880249023, + "learning_rate": 4.222085682454306e-06, + "loss": 0.7956, + "step": 16340 + }, + { + "epoch": 0.795705207800745, + "grad_norm": 1.6883662939071655, + "learning_rate": 4.220147434196144e-06, + "loss": 0.8517, + "step": 16341 + }, + { + "epoch": 0.7957539015898523, + "grad_norm": 0.09712644666433334, + "learning_rate": 4.218209578463261e-06, + "loss": 0.571, + "step": 16342 + }, + { + "epoch": 0.7958025953789594, + "grad_norm": 2.4190781116485596, + "learning_rate": 4.2162721153038455e-06, + "loss": 0.748, + "step": 16343 + }, + { + "epoch": 0.7958512891680666, + "grad_norm": 1.5069323778152466, + "learning_rate": 4.214335044766109e-06, + "loss": 0.8284, + "step": 16344 + }, + { + "epoch": 0.7958999829571738, + "grad_norm": 1.6494945287704468, + "learning_rate": 4.212398366898218e-06, + "loss": 0.823, + "step": 16345 + }, + { + "epoch": 0.795948676746281, + "grad_norm": 1.8779540061950684, + "learning_rate": 4.210462081748361e-06, + "loss": 0.8407, + "step": 16346 + }, + { + "epoch": 0.7959973705353882, + "grad_norm": 1.8491473197937012, + "learning_rate": 4.2085261893647015e-06, + "loss": 0.8238, + "step": 16347 + }, + { + "epoch": 0.7960460643244954, + "grad_norm": 1.327507495880127, + "learning_rate": 4.206590689795391e-06, + "loss": 0.8458, + "step": 16348 + }, + { + "epoch": 0.7960947581136026, + "grad_norm": 1.5289689302444458, + "learning_rate": 4.204655583088577e-06, + "loss": 0.768, + "step": 16349 + }, + { + "epoch": 0.7961434519027099, + "grad_norm": 2.3559937477111816, + "learning_rate": 4.2027208692923956e-06, + "loss": 0.7305, + "step": 16350 + }, + { + "epoch": 0.796192145691817, + "grad_norm": 5.578014373779297, + "learning_rate": 4.200786548454976e-06, + "loss": 0.7357, + "step": 16351 + }, + { + "epoch": 0.7962408394809242, + "grad_norm": 2.847421646118164, + "learning_rate": 4.198852620624423e-06, + "loss": 0.678, + "step": 16352 + }, + { + "epoch": 0.7962895332700314, + "grad_norm": 2.080169916152954, + "learning_rate": 4.196919085848858e-06, + "loss": 0.7916, + "step": 16353 + }, + { + "epoch": 0.7963382270591386, + "grad_norm": 1.6666356325149536, + "learning_rate": 4.19498594417637e-06, + "loss": 0.8221, + "step": 16354 + }, + { + "epoch": 0.7963869208482458, + "grad_norm": 1.4788684844970703, + "learning_rate": 4.19305319565505e-06, + "loss": 0.6879, + "step": 16355 + }, + { + "epoch": 0.796435614637353, + "grad_norm": 1.4715362787246704, + "learning_rate": 4.191120840332972e-06, + "loss": 0.7536, + "step": 16356 + }, + { + "epoch": 0.7964843084264602, + "grad_norm": 1.6400128602981567, + "learning_rate": 4.1891888782582054e-06, + "loss": 0.8366, + "step": 16357 + }, + { + "epoch": 0.7965330022155674, + "grad_norm": 1.6980149745941162, + "learning_rate": 4.187257309478805e-06, + "loss": 0.8066, + "step": 16358 + }, + { + "epoch": 0.7965816960046747, + "grad_norm": 1.645297884941101, + "learning_rate": 4.185326134042822e-06, + "loss": 0.8259, + "step": 16359 + }, + { + "epoch": 0.7966303897937818, + "grad_norm": 1.5920768976211548, + "learning_rate": 4.183395351998294e-06, + "loss": 0.8134, + "step": 16360 + }, + { + "epoch": 0.796679083582889, + "grad_norm": 1.7849763631820679, + "learning_rate": 4.1814649633932424e-06, + "loss": 0.8354, + "step": 16361 + }, + { + "epoch": 0.7967277773719962, + "grad_norm": 1.2905458211898804, + "learning_rate": 4.179534968275698e-06, + "loss": 0.8166, + "step": 16362 + }, + { + "epoch": 0.7967764711611034, + "grad_norm": 1.8876171112060547, + "learning_rate": 4.177605366693662e-06, + "loss": 0.7504, + "step": 16363 + }, + { + "epoch": 0.7968251649502106, + "grad_norm": 1.5637930631637573, + "learning_rate": 4.1756761586951386e-06, + "loss": 0.7894, + "step": 16364 + }, + { + "epoch": 0.7968738587393178, + "grad_norm": 1.3092784881591797, + "learning_rate": 4.173747344328112e-06, + "loss": 0.823, + "step": 16365 + }, + { + "epoch": 0.796922552528425, + "grad_norm": 1.5365394353866577, + "learning_rate": 4.1718189236405625e-06, + "loss": 0.7969, + "step": 16366 + }, + { + "epoch": 0.7969712463175322, + "grad_norm": 1.5982712507247925, + "learning_rate": 4.169890896680459e-06, + "loss": 0.8433, + "step": 16367 + }, + { + "epoch": 0.7970199401066393, + "grad_norm": 1.7246235609054565, + "learning_rate": 4.167963263495764e-06, + "loss": 0.7275, + "step": 16368 + }, + { + "epoch": 0.7970686338957466, + "grad_norm": 2.1962008476257324, + "learning_rate": 4.166036024134423e-06, + "loss": 0.861, + "step": 16369 + }, + { + "epoch": 0.7971173276848538, + "grad_norm": 1.3305596113204956, + "learning_rate": 4.1641091786443885e-06, + "loss": 0.8767, + "step": 16370 + }, + { + "epoch": 0.797166021473961, + "grad_norm": 2.3826897144317627, + "learning_rate": 4.162182727073572e-06, + "loss": 0.8501, + "step": 16371 + }, + { + "epoch": 0.7972147152630682, + "grad_norm": 1.695317268371582, + "learning_rate": 4.1602566694699085e-06, + "loss": 0.7586, + "step": 16372 + }, + { + "epoch": 0.7972634090521754, + "grad_norm": 1.7267323732376099, + "learning_rate": 4.158331005881307e-06, + "loss": 0.8286, + "step": 16373 + }, + { + "epoch": 0.7973121028412826, + "grad_norm": 1.7557950019836426, + "learning_rate": 4.156405736355662e-06, + "loss": 0.8585, + "step": 16374 + }, + { + "epoch": 0.7973607966303898, + "grad_norm": 1.6586958169937134, + "learning_rate": 4.1544808609408726e-06, + "loss": 0.7058, + "step": 16375 + }, + { + "epoch": 0.797409490419497, + "grad_norm": 2.2215323448181152, + "learning_rate": 4.152556379684809e-06, + "loss": 0.7394, + "step": 16376 + }, + { + "epoch": 0.7974581842086041, + "grad_norm": 1.6851353645324707, + "learning_rate": 4.1506322926353615e-06, + "loss": 0.7874, + "step": 16377 + }, + { + "epoch": 0.7975068779977114, + "grad_norm": 1.7636685371398926, + "learning_rate": 4.1487085998403695e-06, + "loss": 0.7374, + "step": 16378 + }, + { + "epoch": 0.7975555717868186, + "grad_norm": 1.9692137241363525, + "learning_rate": 4.1467853013477085e-06, + "loss": 0.8628, + "step": 16379 + }, + { + "epoch": 0.7976042655759258, + "grad_norm": 2.1837146282196045, + "learning_rate": 4.144862397205199e-06, + "loss": 0.749, + "step": 16380 + }, + { + "epoch": 0.797652959365033, + "grad_norm": 1.31545090675354, + "learning_rate": 4.142939887460686e-06, + "loss": 0.7765, + "step": 16381 + }, + { + "epoch": 0.7977016531541402, + "grad_norm": 1.9527150392532349, + "learning_rate": 4.14101777216199e-06, + "loss": 0.8686, + "step": 16382 + }, + { + "epoch": 0.7977503469432474, + "grad_norm": 2.300551176071167, + "learning_rate": 4.139096051356921e-06, + "loss": 0.7511, + "step": 16383 + }, + { + "epoch": 0.7977990407323546, + "grad_norm": 1.6348819732666016, + "learning_rate": 4.137174725093287e-06, + "loss": 0.8045, + "step": 16384 + }, + { + "epoch": 0.7978477345214617, + "grad_norm": 2.3951921463012695, + "learning_rate": 4.13525379341887e-06, + "loss": 0.8816, + "step": 16385 + }, + { + "epoch": 0.797896428310569, + "grad_norm": 2.1233139038085938, + "learning_rate": 4.133333256381473e-06, + "loss": 0.8482, + "step": 16386 + }, + { + "epoch": 0.7979451220996762, + "grad_norm": 1.2548943758010864, + "learning_rate": 4.131413114028848e-06, + "loss": 0.8492, + "step": 16387 + }, + { + "epoch": 0.7979938158887834, + "grad_norm": 2.2676820755004883, + "learning_rate": 4.129493366408779e-06, + "loss": 0.7704, + "step": 16388 + }, + { + "epoch": 0.7980425096778906, + "grad_norm": 0.09819255769252777, + "learning_rate": 4.1275740135689975e-06, + "loss": 0.5908, + "step": 16389 + }, + { + "epoch": 0.7980912034669978, + "grad_norm": 0.0910535529255867, + "learning_rate": 4.1256550555572675e-06, + "loss": 0.627, + "step": 16390 + }, + { + "epoch": 0.798139897256105, + "grad_norm": 1.3493142127990723, + "learning_rate": 4.123736492421313e-06, + "loss": 0.8117, + "step": 16391 + }, + { + "epoch": 0.7981885910452122, + "grad_norm": 0.09167299419641495, + "learning_rate": 4.121818324208857e-06, + "loss": 0.5981, + "step": 16392 + }, + { + "epoch": 0.7982372848343193, + "grad_norm": 1.6140787601470947, + "learning_rate": 4.119900550967628e-06, + "loss": 0.8331, + "step": 16393 + }, + { + "epoch": 0.7982859786234265, + "grad_norm": 2.020632266998291, + "learning_rate": 4.11798317274531e-06, + "loss": 0.8468, + "step": 16394 + }, + { + "epoch": 0.7983346724125338, + "grad_norm": 2.677314281463623, + "learning_rate": 4.11606618958962e-06, + "loss": 0.9081, + "step": 16395 + }, + { + "epoch": 0.798383366201641, + "grad_norm": 0.0941208153963089, + "learning_rate": 4.114149601548221e-06, + "loss": 0.6224, + "step": 16396 + }, + { + "epoch": 0.7984320599907482, + "grad_norm": 2.397230625152588, + "learning_rate": 4.112233408668802e-06, + "loss": 0.7958, + "step": 16397 + }, + { + "epoch": 0.7984807537798554, + "grad_norm": 1.839945673942566, + "learning_rate": 4.110317610999028e-06, + "loss": 0.8671, + "step": 16398 + }, + { + "epoch": 0.7985294475689626, + "grad_norm": 1.9936046600341797, + "learning_rate": 4.108402208586552e-06, + "loss": 0.7409, + "step": 16399 + }, + { + "epoch": 0.7985781413580698, + "grad_norm": 1.2778183221817017, + "learning_rate": 4.106487201479019e-06, + "loss": 0.8379, + "step": 16400 + }, + { + "epoch": 0.798626835147177, + "grad_norm": 3.2655768394470215, + "learning_rate": 4.104572589724061e-06, + "loss": 0.6758, + "step": 16401 + }, + { + "epoch": 0.7986755289362841, + "grad_norm": 1.1106913089752197, + "learning_rate": 4.102658373369322e-06, + "loss": 0.8475, + "step": 16402 + }, + { + "epoch": 0.7987242227253913, + "grad_norm": 1.3462952375411987, + "learning_rate": 4.1007445524623904e-06, + "loss": 0.7983, + "step": 16403 + }, + { + "epoch": 0.7987729165144986, + "grad_norm": 1.2560312747955322, + "learning_rate": 4.098831127050895e-06, + "loss": 0.7667, + "step": 16404 + }, + { + "epoch": 0.7988216103036058, + "grad_norm": 1.5763075351715088, + "learning_rate": 4.096918097182428e-06, + "loss": 0.7289, + "step": 16405 + }, + { + "epoch": 0.798870304092713, + "grad_norm": 1.4981250762939453, + "learning_rate": 4.095005462904569e-06, + "loss": 0.6966, + "step": 16406 + }, + { + "epoch": 0.7989189978818202, + "grad_norm": 1.4626351594924927, + "learning_rate": 4.093093224264901e-06, + "loss": 0.8123, + "step": 16407 + }, + { + "epoch": 0.7989676916709274, + "grad_norm": 2.215494155883789, + "learning_rate": 4.091181381310989e-06, + "loss": 0.8845, + "step": 16408 + }, + { + "epoch": 0.7990163854600346, + "grad_norm": 1.3008272647857666, + "learning_rate": 4.0892699340903915e-06, + "loss": 0.8292, + "step": 16409 + }, + { + "epoch": 0.7990650792491417, + "grad_norm": 2.2146835327148438, + "learning_rate": 4.08735888265065e-06, + "loss": 0.7781, + "step": 16410 + }, + { + "epoch": 0.7991137730382489, + "grad_norm": 1.4569770097732544, + "learning_rate": 4.085448227039317e-06, + "loss": 0.8053, + "step": 16411 + }, + { + "epoch": 0.7991624668273561, + "grad_norm": 1.6381562948226929, + "learning_rate": 4.083537967303901e-06, + "loss": 0.8491, + "step": 16412 + }, + { + "epoch": 0.7992111606164634, + "grad_norm": 2.037870407104492, + "learning_rate": 4.081628103491932e-06, + "loss": 0.7344, + "step": 16413 + }, + { + "epoch": 0.7992598544055706, + "grad_norm": 1.3334428071975708, + "learning_rate": 4.079718635650918e-06, + "loss": 0.7945, + "step": 16414 + }, + { + "epoch": 0.7993085481946778, + "grad_norm": 1.4193564653396606, + "learning_rate": 4.077809563828352e-06, + "loss": 0.7511, + "step": 16415 + }, + { + "epoch": 0.799357241983785, + "grad_norm": 1.899490475654602, + "learning_rate": 4.075900888071727e-06, + "loss": 0.7792, + "step": 16416 + }, + { + "epoch": 0.7994059357728922, + "grad_norm": 1.6866233348846436, + "learning_rate": 4.073992608428519e-06, + "loss": 0.8275, + "step": 16417 + }, + { + "epoch": 0.7994546295619994, + "grad_norm": 2.1376640796661377, + "learning_rate": 4.072084724946195e-06, + "loss": 0.7226, + "step": 16418 + }, + { + "epoch": 0.7995033233511065, + "grad_norm": 1.7480700016021729, + "learning_rate": 4.070177237672217e-06, + "loss": 0.7682, + "step": 16419 + }, + { + "epoch": 0.7995520171402137, + "grad_norm": 1.4955437183380127, + "learning_rate": 4.068270146654028e-06, + "loss": 0.8701, + "step": 16420 + }, + { + "epoch": 0.799600710929321, + "grad_norm": 1.366058349609375, + "learning_rate": 4.066363451939077e-06, + "loss": 0.7953, + "step": 16421 + }, + { + "epoch": 0.7996494047184282, + "grad_norm": 1.812451958656311, + "learning_rate": 4.064457153574785e-06, + "loss": 0.7688, + "step": 16422 + }, + { + "epoch": 0.7996980985075354, + "grad_norm": 1.661295771598816, + "learning_rate": 4.062551251608577e-06, + "loss": 0.8506, + "step": 16423 + }, + { + "epoch": 0.7997467922966426, + "grad_norm": 1.1606942415237427, + "learning_rate": 4.060645746087861e-06, + "loss": 0.763, + "step": 16424 + }, + { + "epoch": 0.7997954860857498, + "grad_norm": 2.5953562259674072, + "learning_rate": 4.058740637060032e-06, + "loss": 0.8307, + "step": 16425 + }, + { + "epoch": 0.799844179874857, + "grad_norm": 1.5882844924926758, + "learning_rate": 4.056835924572487e-06, + "loss": 0.7899, + "step": 16426 + }, + { + "epoch": 0.7998928736639641, + "grad_norm": 1.6677074432373047, + "learning_rate": 4.054931608672594e-06, + "loss": 0.8343, + "step": 16427 + }, + { + "epoch": 0.7999415674530713, + "grad_norm": 2.141127824783325, + "learning_rate": 4.053027689407742e-06, + "loss": 0.7984, + "step": 16428 + }, + { + "epoch": 0.7999902612421785, + "grad_norm": 2.140002727508545, + "learning_rate": 4.051124166825269e-06, + "loss": 0.8, + "step": 16429 + }, + { + "epoch": 0.8000389550312857, + "grad_norm": 2.2429637908935547, + "learning_rate": 4.049221040972544e-06, + "loss": 0.8359, + "step": 16430 + }, + { + "epoch": 0.800087648820393, + "grad_norm": 1.78337824344635, + "learning_rate": 4.047318311896897e-06, + "loss": 0.7866, + "step": 16431 + }, + { + "epoch": 0.8001363426095002, + "grad_norm": 0.09191183000802994, + "learning_rate": 4.0454159796456635e-06, + "loss": 0.565, + "step": 16432 + }, + { + "epoch": 0.8001850363986074, + "grad_norm": 1.713208556175232, + "learning_rate": 4.0435140442661615e-06, + "loss": 0.8287, + "step": 16433 + }, + { + "epoch": 0.8002337301877146, + "grad_norm": 2.047745943069458, + "learning_rate": 4.041612505805699e-06, + "loss": 0.8306, + "step": 16434 + }, + { + "epoch": 0.8002824239768218, + "grad_norm": 2.074965715408325, + "learning_rate": 4.039711364311589e-06, + "loss": 0.8281, + "step": 16435 + }, + { + "epoch": 0.8003311177659289, + "grad_norm": 1.7651009559631348, + "learning_rate": 4.037810619831104e-06, + "loss": 0.7863, + "step": 16436 + }, + { + "epoch": 0.8003798115550361, + "grad_norm": 2.766165018081665, + "learning_rate": 4.035910272411545e-06, + "loss": 0.8757, + "step": 16437 + }, + { + "epoch": 0.8004285053441433, + "grad_norm": 1.5218243598937988, + "learning_rate": 4.034010322100164e-06, + "loss": 0.8365, + "step": 16438 + }, + { + "epoch": 0.8004771991332505, + "grad_norm": 1.3220372200012207, + "learning_rate": 4.032110768944235e-06, + "loss": 0.6748, + "step": 16439 + }, + { + "epoch": 0.8005258929223578, + "grad_norm": 1.5703624486923218, + "learning_rate": 4.030211612991011e-06, + "loss": 0.753, + "step": 16440 + }, + { + "epoch": 0.800574586711465, + "grad_norm": 1.5341178178787231, + "learning_rate": 4.0283128542877255e-06, + "loss": 0.7377, + "step": 16441 + }, + { + "epoch": 0.8006232805005722, + "grad_norm": 1.3759257793426514, + "learning_rate": 4.026414492881614e-06, + "loss": 0.7744, + "step": 16442 + }, + { + "epoch": 0.8006719742896794, + "grad_norm": 1.6118916273117065, + "learning_rate": 4.024516528819895e-06, + "loss": 0.8822, + "step": 16443 + }, + { + "epoch": 0.8007206680787865, + "grad_norm": 1.795719861984253, + "learning_rate": 4.022618962149794e-06, + "loss": 0.8571, + "step": 16444 + }, + { + "epoch": 0.8007693618678937, + "grad_norm": 1.447901725769043, + "learning_rate": 4.020721792918492e-06, + "loss": 0.8753, + "step": 16445 + }, + { + "epoch": 0.8008180556570009, + "grad_norm": 2.8424623012542725, + "learning_rate": 4.0188250211732026e-06, + "loss": 0.7838, + "step": 16446 + }, + { + "epoch": 0.8008667494461081, + "grad_norm": 1.9038844108581543, + "learning_rate": 4.016928646961089e-06, + "loss": 0.8569, + "step": 16447 + }, + { + "epoch": 0.8009154432352154, + "grad_norm": 2.817490577697754, + "learning_rate": 4.0150326703293355e-06, + "loss": 0.8086, + "step": 16448 + }, + { + "epoch": 0.8009641370243226, + "grad_norm": 2.0036702156066895, + "learning_rate": 4.013137091325104e-06, + "loss": 0.8399, + "step": 16449 + }, + { + "epoch": 0.8010128308134298, + "grad_norm": 1.2514617443084717, + "learning_rate": 4.011241909995538e-06, + "loss": 0.9123, + "step": 16450 + }, + { + "epoch": 0.801061524602537, + "grad_norm": 4.108359336853027, + "learning_rate": 4.0093471263878e-06, + "loss": 0.8491, + "step": 16451 + }, + { + "epoch": 0.8011102183916441, + "grad_norm": 1.2598462104797363, + "learning_rate": 4.007452740548996e-06, + "loss": 0.7834, + "step": 16452 + }, + { + "epoch": 0.8011589121807513, + "grad_norm": 1.3705931901931763, + "learning_rate": 4.005558752526277e-06, + "loss": 0.8245, + "step": 16453 + }, + { + "epoch": 0.8012076059698585, + "grad_norm": 1.580978274345398, + "learning_rate": 4.00366516236673e-06, + "loss": 0.8653, + "step": 16454 + }, + { + "epoch": 0.8012562997589657, + "grad_norm": 1.9205750226974487, + "learning_rate": 4.001771970117476e-06, + "loss": 0.7315, + "step": 16455 + }, + { + "epoch": 0.8013049935480729, + "grad_norm": 2.173184871673584, + "learning_rate": 3.999879175825605e-06, + "loss": 0.939, + "step": 16456 + }, + { + "epoch": 0.8013536873371802, + "grad_norm": 1.526615858078003, + "learning_rate": 3.997986779538196e-06, + "loss": 0.7859, + "step": 16457 + }, + { + "epoch": 0.8014023811262874, + "grad_norm": 1.3828710317611694, + "learning_rate": 3.9960947813023266e-06, + "loss": 0.8365, + "step": 16458 + }, + { + "epoch": 0.8014510749153946, + "grad_norm": 2.194490432739258, + "learning_rate": 3.994203181165057e-06, + "loss": 0.7574, + "step": 16459 + }, + { + "epoch": 0.8014997687045018, + "grad_norm": 1.8617836236953735, + "learning_rate": 3.992311979173445e-06, + "loss": 0.8382, + "step": 16460 + }, + { + "epoch": 0.8015484624936089, + "grad_norm": 1.3391660451889038, + "learning_rate": 3.9904211753745234e-06, + "loss": 0.7638, + "step": 16461 + }, + { + "epoch": 0.8015971562827161, + "grad_norm": 1.4697520732879639, + "learning_rate": 3.988530769815342e-06, + "loss": 0.8672, + "step": 16462 + }, + { + "epoch": 0.8016458500718233, + "grad_norm": 2.246147632598877, + "learning_rate": 3.986640762542917e-06, + "loss": 0.8142, + "step": 16463 + }, + { + "epoch": 0.8016945438609305, + "grad_norm": 1.311866283416748, + "learning_rate": 3.9847511536042645e-06, + "loss": 0.7611, + "step": 16464 + }, + { + "epoch": 0.8017432376500377, + "grad_norm": 1.6910288333892822, + "learning_rate": 3.982861943046385e-06, + "loss": 0.7943, + "step": 16465 + }, + { + "epoch": 0.801791931439145, + "grad_norm": 1.9447426795959473, + "learning_rate": 3.980973130916277e-06, + "loss": 0.8701, + "step": 16466 + }, + { + "epoch": 0.8018406252282522, + "grad_norm": 1.194968581199646, + "learning_rate": 3.979084717260922e-06, + "loss": 0.8262, + "step": 16467 + }, + { + "epoch": 0.8018893190173594, + "grad_norm": 1.5513650178909302, + "learning_rate": 3.977196702127295e-06, + "loss": 0.8127, + "step": 16468 + }, + { + "epoch": 0.8019380128064665, + "grad_norm": 1.6244240999221802, + "learning_rate": 3.975309085562362e-06, + "loss": 0.846, + "step": 16469 + }, + { + "epoch": 0.8019867065955737, + "grad_norm": 1.6552814245224, + "learning_rate": 3.973421867613072e-06, + "loss": 0.8479, + "step": 16470 + }, + { + "epoch": 0.8020354003846809, + "grad_norm": 2.776578664779663, + "learning_rate": 3.971535048326378e-06, + "loss": 0.8784, + "step": 16471 + }, + { + "epoch": 0.8020840941737881, + "grad_norm": 1.2691627740859985, + "learning_rate": 3.969648627749212e-06, + "loss": 0.9139, + "step": 16472 + }, + { + "epoch": 0.8021327879628953, + "grad_norm": 1.740665078163147, + "learning_rate": 3.967762605928496e-06, + "loss": 0.7903, + "step": 16473 + }, + { + "epoch": 0.8021814817520025, + "grad_norm": 1.331335425376892, + "learning_rate": 3.96587698291115e-06, + "loss": 0.822, + "step": 16474 + }, + { + "epoch": 0.8022301755411098, + "grad_norm": 1.460787296295166, + "learning_rate": 3.963991758744072e-06, + "loss": 0.7525, + "step": 16475 + }, + { + "epoch": 0.802278869330217, + "grad_norm": 1.2531590461730957, + "learning_rate": 3.962106933474166e-06, + "loss": 0.8319, + "step": 16476 + }, + { + "epoch": 0.8023275631193242, + "grad_norm": 1.9560856819152832, + "learning_rate": 3.960222507148308e-06, + "loss": 0.8467, + "step": 16477 + }, + { + "epoch": 0.8023762569084313, + "grad_norm": 3.0242080688476562, + "learning_rate": 3.9583384798133755e-06, + "loss": 0.8432, + "step": 16478 + }, + { + "epoch": 0.8024249506975385, + "grad_norm": 1.5171408653259277, + "learning_rate": 3.956454851516239e-06, + "loss": 0.8389, + "step": 16479 + }, + { + "epoch": 0.8024736444866457, + "grad_norm": 1.7309716939926147, + "learning_rate": 3.954571622303751e-06, + "loss": 0.7549, + "step": 16480 + }, + { + "epoch": 0.8025223382757529, + "grad_norm": 1.6459355354309082, + "learning_rate": 3.952688792222759e-06, + "loss": 0.847, + "step": 16481 + }, + { + "epoch": 0.8025710320648601, + "grad_norm": 3.948847532272339, + "learning_rate": 3.950806361320095e-06, + "loss": 0.811, + "step": 16482 + }, + { + "epoch": 0.8026197258539673, + "grad_norm": 1.7280662059783936, + "learning_rate": 3.948924329642585e-06, + "loss": 0.7947, + "step": 16483 + }, + { + "epoch": 0.8026684196430746, + "grad_norm": 3.0655150413513184, + "learning_rate": 3.947042697237047e-06, + "loss": 0.8828, + "step": 16484 + }, + { + "epoch": 0.8027171134321818, + "grad_norm": 2.088888645172119, + "learning_rate": 3.94516146415028e-06, + "loss": 0.7959, + "step": 16485 + }, + { + "epoch": 0.8027658072212889, + "grad_norm": 1.8296139240264893, + "learning_rate": 3.943280630429096e-06, + "loss": 0.7706, + "step": 16486 + }, + { + "epoch": 0.8028145010103961, + "grad_norm": 2.3299942016601562, + "learning_rate": 3.9414001961202596e-06, + "loss": 0.9226, + "step": 16487 + }, + { + "epoch": 0.8028631947995033, + "grad_norm": 1.4533157348632812, + "learning_rate": 3.939520161270569e-06, + "loss": 0.7623, + "step": 16488 + }, + { + "epoch": 0.8029118885886105, + "grad_norm": 1.2045155763626099, + "learning_rate": 3.937640525926767e-06, + "loss": 0.8683, + "step": 16489 + }, + { + "epoch": 0.8029605823777177, + "grad_norm": 1.2452802658081055, + "learning_rate": 3.9357612901356265e-06, + "loss": 0.7616, + "step": 16490 + }, + { + "epoch": 0.8030092761668249, + "grad_norm": 2.2069153785705566, + "learning_rate": 3.933882453943889e-06, + "loss": 0.8576, + "step": 16491 + }, + { + "epoch": 0.8030579699559321, + "grad_norm": 1.4065213203430176, + "learning_rate": 3.932004017398292e-06, + "loss": 0.8088, + "step": 16492 + }, + { + "epoch": 0.8031066637450394, + "grad_norm": 1.5200988054275513, + "learning_rate": 3.930125980545558e-06, + "loss": 0.8056, + "step": 16493 + }, + { + "epoch": 0.8031553575341466, + "grad_norm": 1.4913982152938843, + "learning_rate": 3.9282483434324015e-06, + "loss": 0.7511, + "step": 16494 + }, + { + "epoch": 0.8032040513232537, + "grad_norm": 1.9850597381591797, + "learning_rate": 3.926371106105542e-06, + "loss": 0.8158, + "step": 16495 + }, + { + "epoch": 0.8032527451123609, + "grad_norm": 1.6813056468963623, + "learning_rate": 3.924494268611656e-06, + "loss": 0.7232, + "step": 16496 + }, + { + "epoch": 0.8033014389014681, + "grad_norm": 1.690394401550293, + "learning_rate": 3.9226178309974525e-06, + "loss": 0.7439, + "step": 16497 + }, + { + "epoch": 0.8033501326905753, + "grad_norm": 2.719866991043091, + "learning_rate": 3.920741793309586e-06, + "loss": 0.8251, + "step": 16498 + }, + { + "epoch": 0.8033988264796825, + "grad_norm": 1.5493614673614502, + "learning_rate": 3.918866155594738e-06, + "loss": 0.7242, + "step": 16499 + }, + { + "epoch": 0.8034475202687897, + "grad_norm": 3.1346828937530518, + "learning_rate": 3.916990917899562e-06, + "loss": 0.8312, + "step": 16500 + }, + { + "epoch": 0.803496214057897, + "grad_norm": 1.7150872945785522, + "learning_rate": 3.9151160802706955e-06, + "loss": 0.7898, + "step": 16501 + }, + { + "epoch": 0.8035449078470042, + "grad_norm": 1.3849300146102905, + "learning_rate": 3.913241642754795e-06, + "loss": 0.8278, + "step": 16502 + }, + { + "epoch": 0.8035936016361113, + "grad_norm": 3.309234380722046, + "learning_rate": 3.911367605398464e-06, + "loss": 0.737, + "step": 16503 + }, + { + "epoch": 0.8036422954252185, + "grad_norm": 1.3189603090286255, + "learning_rate": 3.909493968248344e-06, + "loss": 0.8441, + "step": 16504 + }, + { + "epoch": 0.8036909892143257, + "grad_norm": 1.3463472127914429, + "learning_rate": 3.907620731351014e-06, + "loss": 0.754, + "step": 16505 + }, + { + "epoch": 0.8037396830034329, + "grad_norm": 2.292588949203491, + "learning_rate": 3.905747894753094e-06, + "loss": 0.8179, + "step": 16506 + }, + { + "epoch": 0.8037883767925401, + "grad_norm": 1.3534858226776123, + "learning_rate": 3.90387545850116e-06, + "loss": 0.7674, + "step": 16507 + }, + { + "epoch": 0.8038370705816473, + "grad_norm": 1.6547199487686157, + "learning_rate": 3.9020034226417935e-06, + "loss": 0.7984, + "step": 16508 + }, + { + "epoch": 0.8038857643707545, + "grad_norm": 2.0714261531829834, + "learning_rate": 3.90013178722156e-06, + "loss": 0.8878, + "step": 16509 + }, + { + "epoch": 0.8039344581598618, + "grad_norm": 2.2370998859405518, + "learning_rate": 3.89826055228701e-06, + "loss": 0.7422, + "step": 16510 + }, + { + "epoch": 0.8039831519489689, + "grad_norm": 1.71680748462677, + "learning_rate": 3.896389717884708e-06, + "loss": 0.7914, + "step": 16511 + }, + { + "epoch": 0.8040318457380761, + "grad_norm": 1.6332368850708008, + "learning_rate": 3.894519284061169e-06, + "loss": 0.789, + "step": 16512 + }, + { + "epoch": 0.8040805395271833, + "grad_norm": 1.6627559661865234, + "learning_rate": 3.892649250862936e-06, + "loss": 0.7685, + "step": 16513 + }, + { + "epoch": 0.8041292333162905, + "grad_norm": 3.056293487548828, + "learning_rate": 3.8907796183365224e-06, + "loss": 0.8081, + "step": 16514 + }, + { + "epoch": 0.8041779271053977, + "grad_norm": 1.3038747310638428, + "learning_rate": 3.888910386528433e-06, + "loss": 0.9179, + "step": 16515 + }, + { + "epoch": 0.8042266208945049, + "grad_norm": 1.4396727085113525, + "learning_rate": 3.887041555485169e-06, + "loss": 0.8236, + "step": 16516 + }, + { + "epoch": 0.8042753146836121, + "grad_norm": 1.401552677154541, + "learning_rate": 3.885173125253214e-06, + "loss": 0.7883, + "step": 16517 + }, + { + "epoch": 0.8043240084727193, + "grad_norm": 1.2965151071548462, + "learning_rate": 3.883305095879047e-06, + "loss": 0.7503, + "step": 16518 + }, + { + "epoch": 0.8043727022618266, + "grad_norm": 1.76886785030365, + "learning_rate": 3.881437467409131e-06, + "loss": 0.8459, + "step": 16519 + }, + { + "epoch": 0.8044213960509337, + "grad_norm": 1.7006460428237915, + "learning_rate": 3.879570239889936e-06, + "loss": 0.8762, + "step": 16520 + }, + { + "epoch": 0.8044700898400409, + "grad_norm": 1.3432525396347046, + "learning_rate": 3.877703413367892e-06, + "loss": 0.8874, + "step": 16521 + }, + { + "epoch": 0.8045187836291481, + "grad_norm": 1.265667200088501, + "learning_rate": 3.87583698788945e-06, + "loss": 0.7598, + "step": 16522 + }, + { + "epoch": 0.8045674774182553, + "grad_norm": 4.0387654304504395, + "learning_rate": 3.873970963501035e-06, + "loss": 0.8178, + "step": 16523 + }, + { + "epoch": 0.8046161712073625, + "grad_norm": 2.7093098163604736, + "learning_rate": 3.872105340249059e-06, + "loss": 0.8633, + "step": 16524 + }, + { + "epoch": 0.8046648649964697, + "grad_norm": 3.8226985931396484, + "learning_rate": 3.870240118179935e-06, + "loss": 0.7926, + "step": 16525 + }, + { + "epoch": 0.8047135587855769, + "grad_norm": 1.2550411224365234, + "learning_rate": 3.8683752973400614e-06, + "loss": 0.7363, + "step": 16526 + }, + { + "epoch": 0.8047622525746841, + "grad_norm": 1.9152474403381348, + "learning_rate": 3.86651087777582e-06, + "loss": 0.8336, + "step": 16527 + }, + { + "epoch": 0.8048109463637912, + "grad_norm": 1.8019427061080933, + "learning_rate": 3.864646859533592e-06, + "loss": 0.8149, + "step": 16528 + }, + { + "epoch": 0.8048596401528985, + "grad_norm": 1.7639824151992798, + "learning_rate": 3.86278324265974e-06, + "loss": 0.8014, + "step": 16529 + }, + { + "epoch": 0.8049083339420057, + "grad_norm": 2.155534505844116, + "learning_rate": 3.8609200272006296e-06, + "loss": 0.7257, + "step": 16530 + }, + { + "epoch": 0.8049570277311129, + "grad_norm": 1.5013915300369263, + "learning_rate": 3.859057213202608e-06, + "loss": 0.7419, + "step": 16531 + }, + { + "epoch": 0.8050057215202201, + "grad_norm": 1.4922255277633667, + "learning_rate": 3.857194800712007e-06, + "loss": 0.8163, + "step": 16532 + }, + { + "epoch": 0.8050544153093273, + "grad_norm": 2.889524221420288, + "learning_rate": 3.855332789775159e-06, + "loss": 0.8834, + "step": 16533 + }, + { + "epoch": 0.8051031090984345, + "grad_norm": 4.44390344619751, + "learning_rate": 3.85347118043838e-06, + "loss": 0.7866, + "step": 16534 + }, + { + "epoch": 0.8051518028875417, + "grad_norm": 1.9232240915298462, + "learning_rate": 3.851609972747976e-06, + "loss": 0.7327, + "step": 16535 + }, + { + "epoch": 0.805200496676649, + "grad_norm": 1.3891665935516357, + "learning_rate": 3.849749166750243e-06, + "loss": 0.8652, + "step": 16536 + }, + { + "epoch": 0.805249190465756, + "grad_norm": 1.240920066833496, + "learning_rate": 3.84788876249148e-06, + "loss": 0.8409, + "step": 16537 + }, + { + "epoch": 0.8052978842548633, + "grad_norm": 1.4756699800491333, + "learning_rate": 3.846028760017948e-06, + "loss": 0.8256, + "step": 16538 + }, + { + "epoch": 0.8053465780439705, + "grad_norm": 2.215282440185547, + "learning_rate": 3.84416915937593e-06, + "loss": 0.8649, + "step": 16539 + }, + { + "epoch": 0.8053952718330777, + "grad_norm": 1.3382513523101807, + "learning_rate": 3.842309960611674e-06, + "loss": 0.7597, + "step": 16540 + }, + { + "epoch": 0.8054439656221849, + "grad_norm": 1.2524245977401733, + "learning_rate": 3.840451163771433e-06, + "loss": 0.8399, + "step": 16541 + }, + { + "epoch": 0.8054926594112921, + "grad_norm": 1.6532886028289795, + "learning_rate": 3.838592768901441e-06, + "loss": 0.8614, + "step": 16542 + }, + { + "epoch": 0.8055413532003993, + "grad_norm": 1.4371943473815918, + "learning_rate": 3.836734776047926e-06, + "loss": 0.8218, + "step": 16543 + }, + { + "epoch": 0.8055900469895065, + "grad_norm": 2.8324785232543945, + "learning_rate": 3.8348771852571135e-06, + "loss": 0.8093, + "step": 16544 + }, + { + "epoch": 0.8056387407786136, + "grad_norm": 1.556731104850769, + "learning_rate": 3.833019996575196e-06, + "loss": 0.6982, + "step": 16545 + }, + { + "epoch": 0.8056874345677209, + "grad_norm": 3.4502477645874023, + "learning_rate": 3.83116321004839e-06, + "loss": 0.8376, + "step": 16546 + }, + { + "epoch": 0.8057361283568281, + "grad_norm": 1.4794011116027832, + "learning_rate": 3.829306825722865e-06, + "loss": 0.8374, + "step": 16547 + }, + { + "epoch": 0.8057848221459353, + "grad_norm": 1.9877904653549194, + "learning_rate": 3.827450843644809e-06, + "loss": 0.8253, + "step": 16548 + }, + { + "epoch": 0.8058335159350425, + "grad_norm": 1.580190658569336, + "learning_rate": 3.825595263860391e-06, + "loss": 0.8042, + "step": 16549 + }, + { + "epoch": 0.8058822097241497, + "grad_norm": 1.4667952060699463, + "learning_rate": 3.823740086415764e-06, + "loss": 0.802, + "step": 16550 + }, + { + "epoch": 0.8059309035132569, + "grad_norm": 1.8299527168273926, + "learning_rate": 3.8218853113570764e-06, + "loss": 0.8213, + "step": 16551 + }, + { + "epoch": 0.8059795973023641, + "grad_norm": 1.4656859636306763, + "learning_rate": 3.8200309387304635e-06, + "loss": 0.77, + "step": 16552 + }, + { + "epoch": 0.8060282910914713, + "grad_norm": 1.503087043762207, + "learning_rate": 3.818176968582065e-06, + "loss": 0.8601, + "step": 16553 + }, + { + "epoch": 0.8060769848805784, + "grad_norm": 0.09751293808221817, + "learning_rate": 3.816323400957979e-06, + "loss": 0.5847, + "step": 16554 + }, + { + "epoch": 0.8061256786696857, + "grad_norm": 1.4641169309616089, + "learning_rate": 3.8144702359043375e-06, + "loss": 0.8049, + "step": 16555 + }, + { + "epoch": 0.8061743724587929, + "grad_norm": 1.6156359910964966, + "learning_rate": 3.8126174734672106e-06, + "loss": 0.7434, + "step": 16556 + }, + { + "epoch": 0.8062230662479001, + "grad_norm": 1.6710764169692993, + "learning_rate": 3.810765113692707e-06, + "loss": 0.7969, + "step": 16557 + }, + { + "epoch": 0.8062717600370073, + "grad_norm": 1.259099006652832, + "learning_rate": 3.8089131566268966e-06, + "loss": 0.875, + "step": 16558 + }, + { + "epoch": 0.8063204538261145, + "grad_norm": 2.0614941120147705, + "learning_rate": 3.8070616023158423e-06, + "loss": 0.7874, + "step": 16559 + }, + { + "epoch": 0.8063691476152217, + "grad_norm": 1.2728853225708008, + "learning_rate": 3.8052104508056186e-06, + "loss": 0.7709, + "step": 16560 + }, + { + "epoch": 0.8064178414043289, + "grad_norm": 1.7060503959655762, + "learning_rate": 3.8033597021422507e-06, + "loss": 0.8751, + "step": 16561 + }, + { + "epoch": 0.806466535193436, + "grad_norm": 1.3187229633331299, + "learning_rate": 3.8015093563717975e-06, + "loss": 0.7472, + "step": 16562 + }, + { + "epoch": 0.8065152289825432, + "grad_norm": 1.793077826499939, + "learning_rate": 3.799659413540264e-06, + "loss": 0.7734, + "step": 16563 + }, + { + "epoch": 0.8065639227716505, + "grad_norm": 1.3888226747512817, + "learning_rate": 3.7978098736936874e-06, + "loss": 0.8499, + "step": 16564 + }, + { + "epoch": 0.8066126165607577, + "grad_norm": 1.9694899320602417, + "learning_rate": 3.795960736878066e-06, + "loss": 0.7739, + "step": 16565 + }, + { + "epoch": 0.8066613103498649, + "grad_norm": 3.7981574535369873, + "learning_rate": 3.7941120031393986e-06, + "loss": 0.7954, + "step": 16566 + }, + { + "epoch": 0.8067100041389721, + "grad_norm": 1.5756044387817383, + "learning_rate": 3.7922636725236726e-06, + "loss": 0.698, + "step": 16567 + }, + { + "epoch": 0.8067586979280793, + "grad_norm": 1.725472331047058, + "learning_rate": 3.7904157450768653e-06, + "loss": 0.8206, + "step": 16568 + }, + { + "epoch": 0.8068073917171865, + "grad_norm": 1.408003330230713, + "learning_rate": 3.788568220844946e-06, + "loss": 0.8436, + "step": 16569 + }, + { + "epoch": 0.8068560855062936, + "grad_norm": 1.6784071922302246, + "learning_rate": 3.7867210998738645e-06, + "loss": 0.9175, + "step": 16570 + }, + { + "epoch": 0.8069047792954008, + "grad_norm": 1.3829602003097534, + "learning_rate": 3.784874382209578e-06, + "loss": 0.9146, + "step": 16571 + }, + { + "epoch": 0.806953473084508, + "grad_norm": 2.4274682998657227, + "learning_rate": 3.7830280678980204e-06, + "loss": 0.7322, + "step": 16572 + }, + { + "epoch": 0.8070021668736153, + "grad_norm": 2.078671932220459, + "learning_rate": 3.781182156985117e-06, + "loss": 0.8253, + "step": 16573 + }, + { + "epoch": 0.8070508606627225, + "grad_norm": 1.4024103879928589, + "learning_rate": 3.7793366495167873e-06, + "loss": 0.8462, + "step": 16574 + }, + { + "epoch": 0.8070995544518297, + "grad_norm": 1.6807960271835327, + "learning_rate": 3.7774915455389382e-06, + "loss": 0.8529, + "step": 16575 + }, + { + "epoch": 0.8071482482409369, + "grad_norm": 1.58762788772583, + "learning_rate": 3.775646845097465e-06, + "loss": 0.807, + "step": 16576 + }, + { + "epoch": 0.8071969420300441, + "grad_norm": 1.8254356384277344, + "learning_rate": 3.773802548238257e-06, + "loss": 0.7728, + "step": 16577 + }, + { + "epoch": 0.8072456358191513, + "grad_norm": 1.3117636442184448, + "learning_rate": 3.7719586550071885e-06, + "loss": 0.747, + "step": 16578 + }, + { + "epoch": 0.8072943296082584, + "grad_norm": 2.3642241954803467, + "learning_rate": 3.7701151654501234e-06, + "loss": 0.925, + "step": 16579 + }, + { + "epoch": 0.8073430233973656, + "grad_norm": 1.9955211877822876, + "learning_rate": 3.768272079612929e-06, + "loss": 0.8361, + "step": 16580 + }, + { + "epoch": 0.8073917171864728, + "grad_norm": 3.139953374862671, + "learning_rate": 3.7664293975414445e-06, + "loss": 0.832, + "step": 16581 + }, + { + "epoch": 0.8074404109755801, + "grad_norm": 1.8739259243011475, + "learning_rate": 3.764587119281511e-06, + "loss": 0.8299, + "step": 16582 + }, + { + "epoch": 0.8074891047646873, + "grad_norm": 1.549308180809021, + "learning_rate": 3.7627452448789536e-06, + "loss": 0.9263, + "step": 16583 + }, + { + "epoch": 0.8075377985537945, + "grad_norm": 3.6940441131591797, + "learning_rate": 3.7609037743795874e-06, + "loss": 0.7801, + "step": 16584 + }, + { + "epoch": 0.8075864923429017, + "grad_norm": 2.1862001419067383, + "learning_rate": 3.7590627078292195e-06, + "loss": 0.8723, + "step": 16585 + }, + { + "epoch": 0.8076351861320089, + "grad_norm": 1.5278176069259644, + "learning_rate": 3.7572220452736497e-06, + "loss": 0.8492, + "step": 16586 + }, + { + "epoch": 0.807683879921116, + "grad_norm": 1.8320261240005493, + "learning_rate": 3.7553817867586564e-06, + "loss": 0.8981, + "step": 16587 + }, + { + "epoch": 0.8077325737102232, + "grad_norm": 1.568617820739746, + "learning_rate": 3.753541932330027e-06, + "loss": 0.7225, + "step": 16588 + }, + { + "epoch": 0.8077812674993304, + "grad_norm": 1.6212947368621826, + "learning_rate": 3.7517024820335234e-06, + "loss": 0.8051, + "step": 16589 + }, + { + "epoch": 0.8078299612884376, + "grad_norm": 1.2669731378555298, + "learning_rate": 3.7498634359149043e-06, + "loss": 0.8228, + "step": 16590 + }, + { + "epoch": 0.8078786550775449, + "grad_norm": 1.5491584539413452, + "learning_rate": 3.748024794019911e-06, + "loss": 0.7943, + "step": 16591 + }, + { + "epoch": 0.8079273488666521, + "grad_norm": 1.5593162775039673, + "learning_rate": 3.7461865563942847e-06, + "loss": 0.8407, + "step": 16592 + }, + { + "epoch": 0.8079760426557593, + "grad_norm": 1.371524691581726, + "learning_rate": 3.7443487230837484e-06, + "loss": 0.8581, + "step": 16593 + }, + { + "epoch": 0.8080247364448665, + "grad_norm": 1.4051545858383179, + "learning_rate": 3.742511294134015e-06, + "loss": 0.7211, + "step": 16594 + }, + { + "epoch": 0.8080734302339737, + "grad_norm": 0.09562782198190689, + "learning_rate": 3.740674269590805e-06, + "loss": 0.6037, + "step": 16595 + }, + { + "epoch": 0.8081221240230808, + "grad_norm": 2.245279312133789, + "learning_rate": 3.738837649499796e-06, + "loss": 0.8401, + "step": 16596 + }, + { + "epoch": 0.808170817812188, + "grad_norm": 4.495702266693115, + "learning_rate": 3.737001433906693e-06, + "loss": 0.7846, + "step": 16597 + }, + { + "epoch": 0.8082195116012952, + "grad_norm": 2.7390456199645996, + "learning_rate": 3.735165622857151e-06, + "loss": 0.7395, + "step": 16598 + }, + { + "epoch": 0.8082682053904024, + "grad_norm": 1.9879491329193115, + "learning_rate": 3.7333302163968534e-06, + "loss": 0.7806, + "step": 16599 + }, + { + "epoch": 0.8083168991795097, + "grad_norm": 2.131512403488159, + "learning_rate": 3.73149521457145e-06, + "loss": 0.856, + "step": 16600 + }, + { + "epoch": 0.8083655929686169, + "grad_norm": 1.7185295820236206, + "learning_rate": 3.7296606174265847e-06, + "loss": 0.859, + "step": 16601 + }, + { + "epoch": 0.8084142867577241, + "grad_norm": 2.5237770080566406, + "learning_rate": 3.7278264250078966e-06, + "loss": 0.7654, + "step": 16602 + }, + { + "epoch": 0.8084629805468313, + "grad_norm": 1.5426684617996216, + "learning_rate": 3.7259926373610044e-06, + "loss": 0.8102, + "step": 16603 + }, + { + "epoch": 0.8085116743359384, + "grad_norm": 1.2865740060806274, + "learning_rate": 3.7241592545315386e-06, + "loss": 0.734, + "step": 16604 + }, + { + "epoch": 0.8085603681250456, + "grad_norm": 1.662777304649353, + "learning_rate": 3.722326276565087e-06, + "loss": 0.8496, + "step": 16605 + }, + { + "epoch": 0.8086090619141528, + "grad_norm": 1.3074718713760376, + "learning_rate": 3.720493703507262e-06, + "loss": 0.8433, + "step": 16606 + }, + { + "epoch": 0.80865775570326, + "grad_norm": 1.634629487991333, + "learning_rate": 3.7186615354036294e-06, + "loss": 0.8362, + "step": 16607 + }, + { + "epoch": 0.8087064494923673, + "grad_norm": 1.8910577297210693, + "learning_rate": 3.7168297722997837e-06, + "loss": 0.8347, + "step": 16608 + }, + { + "epoch": 0.8087551432814745, + "grad_norm": 3.1926910877227783, + "learning_rate": 3.71499841424128e-06, + "loss": 0.7984, + "step": 16609 + }, + { + "epoch": 0.8088038370705817, + "grad_norm": 4.747060775756836, + "learning_rate": 3.7131674612736703e-06, + "loss": 0.9068, + "step": 16610 + }, + { + "epoch": 0.8088525308596889, + "grad_norm": 1.4038769006729126, + "learning_rate": 3.7113369134425158e-06, + "loss": 0.7871, + "step": 16611 + }, + { + "epoch": 0.808901224648796, + "grad_norm": 1.6271973848342896, + "learning_rate": 3.70950677079333e-06, + "loss": 0.7148, + "step": 16612 + }, + { + "epoch": 0.8089499184379032, + "grad_norm": 1.4623968601226807, + "learning_rate": 3.7076770333716595e-06, + "loss": 0.7373, + "step": 16613 + }, + { + "epoch": 0.8089986122270104, + "grad_norm": 1.5026483535766602, + "learning_rate": 3.705847701222998e-06, + "loss": 0.7589, + "step": 16614 + }, + { + "epoch": 0.8090473060161176, + "grad_norm": 3.6475489139556885, + "learning_rate": 3.7040187743928634e-06, + "loss": 0.8362, + "step": 16615 + }, + { + "epoch": 0.8090959998052248, + "grad_norm": 1.4019137620925903, + "learning_rate": 3.7021902529267494e-06, + "loss": 0.8696, + "step": 16616 + }, + { + "epoch": 0.809144693594332, + "grad_norm": 1.646217703819275, + "learning_rate": 3.700362136870139e-06, + "loss": 0.7661, + "step": 16617 + }, + { + "epoch": 0.8091933873834393, + "grad_norm": 1.6117336750030518, + "learning_rate": 3.6985344262685073e-06, + "loss": 0.7043, + "step": 16618 + }, + { + "epoch": 0.8092420811725465, + "grad_norm": 1.4817670583724976, + "learning_rate": 3.696707121167311e-06, + "loss": 0.7973, + "step": 16619 + }, + { + "epoch": 0.8092907749616537, + "grad_norm": 1.8665294647216797, + "learning_rate": 3.694880221612023e-06, + "loss": 0.8349, + "step": 16620 + }, + { + "epoch": 0.8093394687507608, + "grad_norm": 19.747936248779297, + "learning_rate": 3.693053727648066e-06, + "loss": 0.7537, + "step": 16621 + }, + { + "epoch": 0.809388162539868, + "grad_norm": 1.6593455076217651, + "learning_rate": 3.691227639320889e-06, + "loss": 0.8557, + "step": 16622 + }, + { + "epoch": 0.8094368563289752, + "grad_norm": 1.8560696840286255, + "learning_rate": 3.6894019566759133e-06, + "loss": 0.7906, + "step": 16623 + }, + { + "epoch": 0.8094855501180824, + "grad_norm": 1.7076518535614014, + "learning_rate": 3.6875766797585488e-06, + "loss": 0.8584, + "step": 16624 + }, + { + "epoch": 0.8095342439071896, + "grad_norm": 1.8134205341339111, + "learning_rate": 3.685751808614202e-06, + "loss": 0.8452, + "step": 16625 + }, + { + "epoch": 0.8095829376962969, + "grad_norm": 1.9583606719970703, + "learning_rate": 3.683927343288267e-06, + "loss": 0.7621, + "step": 16626 + }, + { + "epoch": 0.8096316314854041, + "grad_norm": 2.0293445587158203, + "learning_rate": 3.682103283826126e-06, + "loss": 0.8365, + "step": 16627 + }, + { + "epoch": 0.8096803252745113, + "grad_norm": 1.4152895212173462, + "learning_rate": 3.6802796302731492e-06, + "loss": 0.7246, + "step": 16628 + }, + { + "epoch": 0.8097290190636184, + "grad_norm": 4.545481204986572, + "learning_rate": 3.678456382674711e-06, + "loss": 0.8351, + "step": 16629 + }, + { + "epoch": 0.8097777128527256, + "grad_norm": 1.6012455224990845, + "learning_rate": 3.6766335410761577e-06, + "loss": 0.7759, + "step": 16630 + }, + { + "epoch": 0.8098264066418328, + "grad_norm": 1.8328050374984741, + "learning_rate": 3.674811105522833e-06, + "loss": 0.8716, + "step": 16631 + }, + { + "epoch": 0.80987510043094, + "grad_norm": 2.439303398132324, + "learning_rate": 3.6729890760600695e-06, + "loss": 0.7687, + "step": 16632 + }, + { + "epoch": 0.8099237942200472, + "grad_norm": 1.83119535446167, + "learning_rate": 3.671167452733193e-06, + "loss": 0.8333, + "step": 16633 + }, + { + "epoch": 0.8099724880091544, + "grad_norm": 2.0746946334838867, + "learning_rate": 3.669346235587514e-06, + "loss": 0.7468, + "step": 16634 + }, + { + "epoch": 0.8100211817982617, + "grad_norm": 1.6449898481369019, + "learning_rate": 3.667525424668337e-06, + "loss": 0.7277, + "step": 16635 + }, + { + "epoch": 0.8100698755873689, + "grad_norm": 1.5376720428466797, + "learning_rate": 3.6657050200209534e-06, + "loss": 0.698, + "step": 16636 + }, + { + "epoch": 0.8101185693764761, + "grad_norm": 1.7959132194519043, + "learning_rate": 3.6638850216906452e-06, + "loss": 0.8538, + "step": 16637 + }, + { + "epoch": 0.8101672631655832, + "grad_norm": 1.603394865989685, + "learning_rate": 3.6620654297226855e-06, + "loss": 0.847, + "step": 16638 + }, + { + "epoch": 0.8102159569546904, + "grad_norm": 2.139056444168091, + "learning_rate": 3.66024624416234e-06, + "loss": 0.8225, + "step": 16639 + }, + { + "epoch": 0.8102646507437976, + "grad_norm": 1.376718521118164, + "learning_rate": 3.658427465054859e-06, + "loss": 0.7835, + "step": 16640 + }, + { + "epoch": 0.8103133445329048, + "grad_norm": 1.3552768230438232, + "learning_rate": 3.6566090924454866e-06, + "loss": 0.759, + "step": 16641 + }, + { + "epoch": 0.810362038322012, + "grad_norm": 2.5861098766326904, + "learning_rate": 3.654791126379451e-06, + "loss": 0.8567, + "step": 16642 + }, + { + "epoch": 0.8104107321111192, + "grad_norm": 2.3841392993927, + "learning_rate": 3.6529735669019783e-06, + "loss": 0.7918, + "step": 16643 + }, + { + "epoch": 0.8104594259002265, + "grad_norm": 1.8992353677749634, + "learning_rate": 3.6511564140582767e-06, + "loss": 0.8615, + "step": 16644 + }, + { + "epoch": 0.8105081196893337, + "grad_norm": 1.3535934686660767, + "learning_rate": 3.6493396678935477e-06, + "loss": 0.8466, + "step": 16645 + }, + { + "epoch": 0.8105568134784408, + "grad_norm": 1.5630587339401245, + "learning_rate": 3.6475233284529934e-06, + "loss": 0.8374, + "step": 16646 + }, + { + "epoch": 0.810605507267548, + "grad_norm": 0.10219482332468033, + "learning_rate": 3.6457073957817766e-06, + "loss": 0.6111, + "step": 16647 + }, + { + "epoch": 0.8106542010566552, + "grad_norm": 1.3554712533950806, + "learning_rate": 3.6438918699250845e-06, + "loss": 0.7982, + "step": 16648 + }, + { + "epoch": 0.8107028948457624, + "grad_norm": 1.5620871782302856, + "learning_rate": 3.642076750928074e-06, + "loss": 0.7769, + "step": 16649 + }, + { + "epoch": 0.8107515886348696, + "grad_norm": 2.1570844650268555, + "learning_rate": 3.6402620388358977e-06, + "loss": 0.765, + "step": 16650 + }, + { + "epoch": 0.8108002824239768, + "grad_norm": 1.8679298162460327, + "learning_rate": 3.638447733693693e-06, + "loss": 0.8481, + "step": 16651 + }, + { + "epoch": 0.810848976213084, + "grad_norm": 2.1253724098205566, + "learning_rate": 3.636633835546588e-06, + "loss": 0.8097, + "step": 16652 + }, + { + "epoch": 0.8108976700021913, + "grad_norm": 2.2792367935180664, + "learning_rate": 3.6348203444397177e-06, + "loss": 0.8357, + "step": 16653 + }, + { + "epoch": 0.8109463637912985, + "grad_norm": 1.6830878257751465, + "learning_rate": 3.6330072604181732e-06, + "loss": 0.8091, + "step": 16654 + }, + { + "epoch": 0.8109950575804056, + "grad_norm": 2.3410749435424805, + "learning_rate": 3.6311945835270756e-06, + "loss": 0.8085, + "step": 16655 + }, + { + "epoch": 0.8110437513695128, + "grad_norm": 1.6340543031692505, + "learning_rate": 3.6293823138114957e-06, + "loss": 0.8139, + "step": 16656 + }, + { + "epoch": 0.81109244515862, + "grad_norm": 3.534209728240967, + "learning_rate": 3.627570451316529e-06, + "loss": 0.7556, + "step": 16657 + }, + { + "epoch": 0.8111411389477272, + "grad_norm": 1.6735352277755737, + "learning_rate": 3.6257589960872385e-06, + "loss": 0.7598, + "step": 16658 + }, + { + "epoch": 0.8111898327368344, + "grad_norm": 1.7185174226760864, + "learning_rate": 3.6239479481686866e-06, + "loss": 0.8722, + "step": 16659 + }, + { + "epoch": 0.8112385265259416, + "grad_norm": 2.1499176025390625, + "learning_rate": 3.622137307605924e-06, + "loss": 0.7663, + "step": 16660 + }, + { + "epoch": 0.8112872203150489, + "grad_norm": 3.7706799507141113, + "learning_rate": 3.6203270744439834e-06, + "loss": 0.832, + "step": 16661 + }, + { + "epoch": 0.8113359141041561, + "grad_norm": 1.2663207054138184, + "learning_rate": 3.6185172487279084e-06, + "loss": 0.8511, + "step": 16662 + }, + { + "epoch": 0.8113846078932632, + "grad_norm": 1.3310942649841309, + "learning_rate": 3.6167078305027037e-06, + "loss": 0.8645, + "step": 16663 + }, + { + "epoch": 0.8114333016823704, + "grad_norm": 1.821825623512268, + "learning_rate": 3.6148988198133926e-06, + "loss": 0.7969, + "step": 16664 + }, + { + "epoch": 0.8114819954714776, + "grad_norm": 1.3741472959518433, + "learning_rate": 3.6130902167049574e-06, + "loss": 0.7638, + "step": 16665 + }, + { + "epoch": 0.8115306892605848, + "grad_norm": 2.417151927947998, + "learning_rate": 3.611282021222404e-06, + "loss": 0.77, + "step": 16666 + }, + { + "epoch": 0.811579383049692, + "grad_norm": 1.9318879842758179, + "learning_rate": 3.6094742334107038e-06, + "loss": 0.889, + "step": 16667 + }, + { + "epoch": 0.8116280768387992, + "grad_norm": 1.4687607288360596, + "learning_rate": 3.607666853314822e-06, + "loss": 0.8335, + "step": 16668 + }, + { + "epoch": 0.8116767706279064, + "grad_norm": 1.7774537801742554, + "learning_rate": 3.605859880979732e-06, + "loss": 0.8269, + "step": 16669 + }, + { + "epoch": 0.8117254644170137, + "grad_norm": 1.3599839210510254, + "learning_rate": 3.60405331645036e-06, + "loss": 0.6591, + "step": 16670 + }, + { + "epoch": 0.8117741582061208, + "grad_norm": 1.644322156906128, + "learning_rate": 3.602247159771668e-06, + "loss": 0.8553, + "step": 16671 + }, + { + "epoch": 0.811822851995228, + "grad_norm": 0.09993227571249008, + "learning_rate": 3.6004414109885645e-06, + "loss": 0.6267, + "step": 16672 + }, + { + "epoch": 0.8118715457843352, + "grad_norm": 1.3715444803237915, + "learning_rate": 3.5986360701459776e-06, + "loss": 0.8139, + "step": 16673 + }, + { + "epoch": 0.8119202395734424, + "grad_norm": 3.3786439895629883, + "learning_rate": 3.5968311372888164e-06, + "loss": 0.782, + "step": 16674 + }, + { + "epoch": 0.8119689333625496, + "grad_norm": 1.45205819606781, + "learning_rate": 3.595026612461976e-06, + "loss": 0.8488, + "step": 16675 + }, + { + "epoch": 0.8120176271516568, + "grad_norm": 1.3376859426498413, + "learning_rate": 3.5932224957103444e-06, + "loss": 0.8288, + "step": 16676 + }, + { + "epoch": 0.812066320940764, + "grad_norm": 0.09622073918581009, + "learning_rate": 3.5914187870787932e-06, + "loss": 0.6295, + "step": 16677 + }, + { + "epoch": 0.8121150147298712, + "grad_norm": 1.3641172647476196, + "learning_rate": 3.589615486612208e-06, + "loss": 0.843, + "step": 16678 + }, + { + "epoch": 0.8121637085189785, + "grad_norm": 1.7793829441070557, + "learning_rate": 3.587812594355422e-06, + "loss": 0.7093, + "step": 16679 + }, + { + "epoch": 0.8122124023080856, + "grad_norm": 2.651474714279175, + "learning_rate": 3.5860101103532997e-06, + "loss": 0.7382, + "step": 16680 + }, + { + "epoch": 0.8122610960971928, + "grad_norm": 1.1500097513198853, + "learning_rate": 3.5842080346506713e-06, + "loss": 0.8497, + "step": 16681 + }, + { + "epoch": 0.8123097898863, + "grad_norm": 1.4175611734390259, + "learning_rate": 3.582406367292366e-06, + "loss": 0.8526, + "step": 16682 + }, + { + "epoch": 0.8123584836754072, + "grad_norm": 2.4723329544067383, + "learning_rate": 3.5806051083231985e-06, + "loss": 0.8064, + "step": 16683 + }, + { + "epoch": 0.8124071774645144, + "grad_norm": 1.3854399919509888, + "learning_rate": 3.5788042577879757e-06, + "loss": 0.7333, + "step": 16684 + }, + { + "epoch": 0.8124558712536216, + "grad_norm": 1.627768635749817, + "learning_rate": 3.577003815731497e-06, + "loss": 0.8549, + "step": 16685 + }, + { + "epoch": 0.8125045650427288, + "grad_norm": 1.5824451446533203, + "learning_rate": 3.575203782198542e-06, + "loss": 0.7943, + "step": 16686 + }, + { + "epoch": 0.812553258831836, + "grad_norm": 1.9729177951812744, + "learning_rate": 3.573404157233893e-06, + "loss": 0.798, + "step": 16687 + }, + { + "epoch": 0.8126019526209431, + "grad_norm": 1.6885486841201782, + "learning_rate": 3.571604940882307e-06, + "loss": 0.8202, + "step": 16688 + }, + { + "epoch": 0.8126506464100504, + "grad_norm": 1.3780906200408936, + "learning_rate": 3.5698061331885515e-06, + "loss": 0.7457, + "step": 16689 + }, + { + "epoch": 0.8126993401991576, + "grad_norm": 1.9439239501953125, + "learning_rate": 3.5680077341973674e-06, + "loss": 0.8524, + "step": 16690 + }, + { + "epoch": 0.8127480339882648, + "grad_norm": 1.6783417463302612, + "learning_rate": 3.566209743953486e-06, + "loss": 0.8371, + "step": 16691 + }, + { + "epoch": 0.812796727777372, + "grad_norm": 1.2084954977035522, + "learning_rate": 3.5644121625016383e-06, + "loss": 0.7571, + "step": 16692 + }, + { + "epoch": 0.8128454215664792, + "grad_norm": 1.5104012489318848, + "learning_rate": 3.562614989886537e-06, + "loss": 0.8787, + "step": 16693 + }, + { + "epoch": 0.8128941153555864, + "grad_norm": 1.5895051956176758, + "learning_rate": 3.5608182261528845e-06, + "loss": 0.7299, + "step": 16694 + }, + { + "epoch": 0.8129428091446936, + "grad_norm": 1.5632222890853882, + "learning_rate": 3.5590218713453785e-06, + "loss": 0.7496, + "step": 16695 + }, + { + "epoch": 0.8129915029338008, + "grad_norm": 1.7809507846832275, + "learning_rate": 3.5572259255086983e-06, + "loss": 0.8173, + "step": 16696 + }, + { + "epoch": 0.813040196722908, + "grad_norm": 1.4123201370239258, + "learning_rate": 3.5554303886875265e-06, + "loss": 0.7157, + "step": 16697 + }, + { + "epoch": 0.8130888905120152, + "grad_norm": 2.65553879737854, + "learning_rate": 3.5536352609265225e-06, + "loss": 0.9409, + "step": 16698 + }, + { + "epoch": 0.8131375843011224, + "grad_norm": 1.2016767263412476, + "learning_rate": 3.5518405422703415e-06, + "loss": 0.7287, + "step": 16699 + }, + { + "epoch": 0.8131862780902296, + "grad_norm": 1.3567230701446533, + "learning_rate": 3.5500462327636263e-06, + "loss": 0.8099, + "step": 16700 + }, + { + "epoch": 0.8132349718793368, + "grad_norm": 1.6273597478866577, + "learning_rate": 3.5482523324510123e-06, + "loss": 0.719, + "step": 16701 + }, + { + "epoch": 0.813283665668444, + "grad_norm": 1.5358244180679321, + "learning_rate": 3.5464588413771206e-06, + "loss": 0.8613, + "step": 16702 + }, + { + "epoch": 0.8133323594575512, + "grad_norm": 1.6179155111312866, + "learning_rate": 3.5446657595865607e-06, + "loss": 0.756, + "step": 16703 + }, + { + "epoch": 0.8133810532466584, + "grad_norm": 1.57139253616333, + "learning_rate": 3.5428730871239504e-06, + "loss": 0.7565, + "step": 16704 + }, + { + "epoch": 0.8134297470357655, + "grad_norm": 1.2629115581512451, + "learning_rate": 3.5410808240338624e-06, + "loss": 0.781, + "step": 16705 + }, + { + "epoch": 0.8134784408248728, + "grad_norm": 2.2524824142456055, + "learning_rate": 3.539288970360899e-06, + "loss": 0.7361, + "step": 16706 + }, + { + "epoch": 0.81352713461398, + "grad_norm": 3.9242138862609863, + "learning_rate": 3.5374975261496137e-06, + "loss": 0.8379, + "step": 16707 + }, + { + "epoch": 0.8135758284030872, + "grad_norm": 2.433277130126953, + "learning_rate": 3.5357064914445837e-06, + "loss": 0.7117, + "step": 16708 + }, + { + "epoch": 0.8136245221921944, + "grad_norm": 2.8694794178009033, + "learning_rate": 3.533915866290356e-06, + "loss": 0.7567, + "step": 16709 + }, + { + "epoch": 0.8136732159813016, + "grad_norm": 1.8748676776885986, + "learning_rate": 3.5321256507314728e-06, + "loss": 0.8015, + "step": 16710 + }, + { + "epoch": 0.8137219097704088, + "grad_norm": 1.5023221969604492, + "learning_rate": 3.5303358448124646e-06, + "loss": 0.8823, + "step": 16711 + }, + { + "epoch": 0.813770603559516, + "grad_norm": 2.4817087650299072, + "learning_rate": 3.5285464485778477e-06, + "loss": 0.7349, + "step": 16712 + }, + { + "epoch": 0.8138192973486232, + "grad_norm": 1.6100239753723145, + "learning_rate": 3.5267574620721523e-06, + "loss": 0.8713, + "step": 16713 + }, + { + "epoch": 0.8138679911377303, + "grad_norm": 1.779935359954834, + "learning_rate": 3.5249688853398544e-06, + "loss": 0.8637, + "step": 16714 + }, + { + "epoch": 0.8139166849268376, + "grad_norm": 2.3256218433380127, + "learning_rate": 3.5231807184254652e-06, + "loss": 0.8119, + "step": 16715 + }, + { + "epoch": 0.8139653787159448, + "grad_norm": 1.4399513006210327, + "learning_rate": 3.5213929613734577e-06, + "loss": 0.8049, + "step": 16716 + }, + { + "epoch": 0.814014072505052, + "grad_norm": 1.419607400894165, + "learning_rate": 3.5196056142283008e-06, + "loss": 0.9037, + "step": 16717 + }, + { + "epoch": 0.8140627662941592, + "grad_norm": 2.932673454284668, + "learning_rate": 3.5178186770344568e-06, + "loss": 0.8463, + "step": 16718 + }, + { + "epoch": 0.8141114600832664, + "grad_norm": 3.519376039505005, + "learning_rate": 3.5160321498363725e-06, + "loss": 0.7283, + "step": 16719 + }, + { + "epoch": 0.8141601538723736, + "grad_norm": 1.325451374053955, + "learning_rate": 3.514246032678501e-06, + "loss": 0.7567, + "step": 16720 + }, + { + "epoch": 0.8142088476614808, + "grad_norm": 2.5121231079101562, + "learning_rate": 3.5124603256052535e-06, + "loss": 0.9221, + "step": 16721 + }, + { + "epoch": 0.8142575414505879, + "grad_norm": 1.5278481245040894, + "learning_rate": 3.510675028661068e-06, + "loss": 0.7747, + "step": 16722 + }, + { + "epoch": 0.8143062352396951, + "grad_norm": 1.4895515441894531, + "learning_rate": 3.508890141890333e-06, + "loss": 0.7152, + "step": 16723 + }, + { + "epoch": 0.8143549290288024, + "grad_norm": 1.611420750617981, + "learning_rate": 3.5071056653374668e-06, + "loss": 0.7793, + "step": 16724 + }, + { + "epoch": 0.8144036228179096, + "grad_norm": 1.306671142578125, + "learning_rate": 3.505321599046849e-06, + "loss": 0.7829, + "step": 16725 + }, + { + "epoch": 0.8144523166070168, + "grad_norm": 1.2669893503189087, + "learning_rate": 3.5035379430628622e-06, + "loss": 0.8787, + "step": 16726 + }, + { + "epoch": 0.814501010396124, + "grad_norm": 1.8759864568710327, + "learning_rate": 3.501754697429871e-06, + "loss": 0.7931, + "step": 16727 + }, + { + "epoch": 0.8145497041852312, + "grad_norm": 3.1747753620147705, + "learning_rate": 3.499971862192233e-06, + "loss": 0.8962, + "step": 16728 + }, + { + "epoch": 0.8145983979743384, + "grad_norm": 2.1747093200683594, + "learning_rate": 3.498189437394308e-06, + "loss": 0.8359, + "step": 16729 + }, + { + "epoch": 0.8146470917634455, + "grad_norm": 1.263783574104309, + "learning_rate": 3.496407423080415e-06, + "loss": 0.8049, + "step": 16730 + }, + { + "epoch": 0.8146957855525527, + "grad_norm": 1.4061362743377686, + "learning_rate": 3.494625819294899e-06, + "loss": 0.795, + "step": 16731 + }, + { + "epoch": 0.8147444793416599, + "grad_norm": 1.3144049644470215, + "learning_rate": 3.492844626082068e-06, + "loss": 0.8473, + "step": 16732 + }, + { + "epoch": 0.8147931731307672, + "grad_norm": 1.956502079963684, + "learning_rate": 3.4910638434862356e-06, + "loss": 0.7949, + "step": 16733 + }, + { + "epoch": 0.8148418669198744, + "grad_norm": 1.2410725355148315, + "learning_rate": 3.489283471551692e-06, + "loss": 0.7212, + "step": 16734 + }, + { + "epoch": 0.8148905607089816, + "grad_norm": 1.2238259315490723, + "learning_rate": 3.4875035103227296e-06, + "loss": 0.8021, + "step": 16735 + }, + { + "epoch": 0.8149392544980888, + "grad_norm": 2.4488720893859863, + "learning_rate": 3.485723959843621e-06, + "loss": 0.7975, + "step": 16736 + }, + { + "epoch": 0.814987948287196, + "grad_norm": 1.5934321880340576, + "learning_rate": 3.4839448201586312e-06, + "loss": 0.8625, + "step": 16737 + }, + { + "epoch": 0.8150366420763032, + "grad_norm": 1.618086576461792, + "learning_rate": 3.4821660913120226e-06, + "loss": 0.8848, + "step": 16738 + }, + { + "epoch": 0.8150853358654103, + "grad_norm": 1.3686764240264893, + "learning_rate": 3.4803877733480418e-06, + "loss": 0.8417, + "step": 16739 + }, + { + "epoch": 0.8151340296545175, + "grad_norm": 1.200973391532898, + "learning_rate": 3.478609866310918e-06, + "loss": 0.7883, + "step": 16740 + }, + { + "epoch": 0.8151827234436247, + "grad_norm": 1.8323497772216797, + "learning_rate": 3.4768323702448825e-06, + "loss": 0.941, + "step": 16741 + }, + { + "epoch": 0.815231417232732, + "grad_norm": 1.8362703323364258, + "learning_rate": 3.4750552851941466e-06, + "loss": 0.8313, + "step": 16742 + }, + { + "epoch": 0.8152801110218392, + "grad_norm": 1.7242980003356934, + "learning_rate": 3.4732786112029193e-06, + "loss": 0.7844, + "step": 16743 + }, + { + "epoch": 0.8153288048109464, + "grad_norm": 0.09392024576663971, + "learning_rate": 3.471502348315392e-06, + "loss": 0.579, + "step": 16744 + }, + { + "epoch": 0.8153774986000536, + "grad_norm": 1.8531856536865234, + "learning_rate": 3.4697264965757516e-06, + "loss": 0.797, + "step": 16745 + }, + { + "epoch": 0.8154261923891608, + "grad_norm": 1.44844388961792, + "learning_rate": 3.4679510560281715e-06, + "loss": 0.7356, + "step": 16746 + }, + { + "epoch": 0.8154748861782679, + "grad_norm": 1.2961903810501099, + "learning_rate": 3.4661760267168097e-06, + "loss": 0.7969, + "step": 16747 + }, + { + "epoch": 0.8155235799673751, + "grad_norm": 1.23531174659729, + "learning_rate": 3.4644014086858335e-06, + "loss": 0.8219, + "step": 16748 + }, + { + "epoch": 0.8155722737564823, + "grad_norm": 1.7562413215637207, + "learning_rate": 3.4626272019793806e-06, + "loss": 0.8139, + "step": 16749 + }, + { + "epoch": 0.8156209675455895, + "grad_norm": 1.583361268043518, + "learning_rate": 3.460853406641582e-06, + "loss": 0.7832, + "step": 16750 + }, + { + "epoch": 0.8156696613346968, + "grad_norm": 1.925302505493164, + "learning_rate": 3.4590800227165634e-06, + "loss": 0.8263, + "step": 16751 + }, + { + "epoch": 0.815718355123804, + "grad_norm": 1.408042550086975, + "learning_rate": 3.457307050248437e-06, + "loss": 0.7789, + "step": 16752 + }, + { + "epoch": 0.8157670489129112, + "grad_norm": 1.9327501058578491, + "learning_rate": 3.455534489281307e-06, + "loss": 0.8702, + "step": 16753 + }, + { + "epoch": 0.8158157427020184, + "grad_norm": 1.6611725091934204, + "learning_rate": 3.453762339859259e-06, + "loss": 0.7457, + "step": 16754 + }, + { + "epoch": 0.8158644364911256, + "grad_norm": 9.773118019104004, + "learning_rate": 3.4519906020263916e-06, + "loss": 0.7827, + "step": 16755 + }, + { + "epoch": 0.8159131302802327, + "grad_norm": 1.6081386804580688, + "learning_rate": 3.450219275826758e-06, + "loss": 0.8027, + "step": 16756 + }, + { + "epoch": 0.8159618240693399, + "grad_norm": 1.714793086051941, + "learning_rate": 3.448448361304433e-06, + "loss": 0.8089, + "step": 16757 + }, + { + "epoch": 0.8160105178584471, + "grad_norm": 1.5304979085922241, + "learning_rate": 3.4466778585034644e-06, + "loss": 0.7825, + "step": 16758 + }, + { + "epoch": 0.8160592116475544, + "grad_norm": 1.5833314657211304, + "learning_rate": 3.444907767467893e-06, + "loss": 0.868, + "step": 16759 + }, + { + "epoch": 0.8161079054366616, + "grad_norm": 1.299363613128662, + "learning_rate": 3.443138088241751e-06, + "loss": 0.7545, + "step": 16760 + }, + { + "epoch": 0.8161565992257688, + "grad_norm": 1.4619417190551758, + "learning_rate": 3.441368820869053e-06, + "loss": 0.8355, + "step": 16761 + }, + { + "epoch": 0.816205293014876, + "grad_norm": 1.6750887632369995, + "learning_rate": 3.439599965393825e-06, + "loss": 0.8141, + "step": 16762 + }, + { + "epoch": 0.8162539868039832, + "grad_norm": 1.4332212209701538, + "learning_rate": 3.4378315218600487e-06, + "loss": 0.8707, + "step": 16763 + }, + { + "epoch": 0.8163026805930903, + "grad_norm": 2.4549002647399902, + "learning_rate": 3.4360634903117318e-06, + "loss": 0.8065, + "step": 16764 + }, + { + "epoch": 0.8163513743821975, + "grad_norm": 1.670078158378601, + "learning_rate": 3.4342958707928363e-06, + "loss": 0.8127, + "step": 16765 + }, + { + "epoch": 0.8164000681713047, + "grad_norm": 2.0044548511505127, + "learning_rate": 3.4325286633473475e-06, + "loss": 0.8564, + "step": 16766 + }, + { + "epoch": 0.8164487619604119, + "grad_norm": 1.8192802667617798, + "learning_rate": 3.4307618680192188e-06, + "loss": 0.834, + "step": 16767 + }, + { + "epoch": 0.8164974557495192, + "grad_norm": 1.7484941482543945, + "learning_rate": 3.4289954848524e-06, + "loss": 0.6953, + "step": 16768 + }, + { + "epoch": 0.8165461495386264, + "grad_norm": 1.3971607685089111, + "learning_rate": 3.4272295138908265e-06, + "loss": 0.8342, + "step": 16769 + }, + { + "epoch": 0.8165948433277336, + "grad_norm": 0.09742681682109833, + "learning_rate": 3.4254639551784274e-06, + "loss": 0.6046, + "step": 16770 + }, + { + "epoch": 0.8166435371168408, + "grad_norm": 0.09988843649625778, + "learning_rate": 3.423698808759133e-06, + "loss": 0.6427, + "step": 16771 + }, + { + "epoch": 0.8166922309059479, + "grad_norm": 2.3413078784942627, + "learning_rate": 3.4219340746768314e-06, + "loss": 0.8853, + "step": 16772 + }, + { + "epoch": 0.8167409246950551, + "grad_norm": 1.2752922773361206, + "learning_rate": 3.4201697529754417e-06, + "loss": 0.8349, + "step": 16773 + }, + { + "epoch": 0.8167896184841623, + "grad_norm": 0.09515116363763809, + "learning_rate": 3.418405843698831e-06, + "loss": 0.6195, + "step": 16774 + }, + { + "epoch": 0.8168383122732695, + "grad_norm": 0.09831391274929047, + "learning_rate": 3.416642346890893e-06, + "loss": 0.6768, + "step": 16775 + }, + { + "epoch": 0.8168870060623767, + "grad_norm": 2.249903440475464, + "learning_rate": 3.4148792625954875e-06, + "loss": 0.7477, + "step": 16776 + }, + { + "epoch": 0.816935699851484, + "grad_norm": 2.556896924972534, + "learning_rate": 3.4131165908564666e-06, + "loss": 0.8491, + "step": 16777 + }, + { + "epoch": 0.8169843936405912, + "grad_norm": 1.3411978483200073, + "learning_rate": 3.4113543317176932e-06, + "loss": 0.6881, + "step": 16778 + }, + { + "epoch": 0.8170330874296984, + "grad_norm": 1.6855566501617432, + "learning_rate": 3.4095924852229858e-06, + "loss": 0.8621, + "step": 16779 + }, + { + "epoch": 0.8170817812188056, + "grad_norm": 1.5819289684295654, + "learning_rate": 3.407831051416186e-06, + "loss": 0.8723, + "step": 16780 + }, + { + "epoch": 0.8171304750079127, + "grad_norm": 1.2483086585998535, + "learning_rate": 3.406070030341091e-06, + "loss": 0.8284, + "step": 16781 + }, + { + "epoch": 0.8171791687970199, + "grad_norm": 1.3964747190475464, + "learning_rate": 3.404309422041523e-06, + "loss": 0.7929, + "step": 16782 + }, + { + "epoch": 0.8172278625861271, + "grad_norm": 2.156625986099243, + "learning_rate": 3.4025492265612712e-06, + "loss": 0.8557, + "step": 16783 + }, + { + "epoch": 0.8172765563752343, + "grad_norm": 2.641101598739624, + "learning_rate": 3.400789443944119e-06, + "loss": 0.8446, + "step": 16784 + }, + { + "epoch": 0.8173252501643415, + "grad_norm": 1.8353043794631958, + "learning_rate": 3.3990300742338446e-06, + "loss": 0.6933, + "step": 16785 + }, + { + "epoch": 0.8173739439534488, + "grad_norm": 0.10741180926561356, + "learning_rate": 3.3972711174742057e-06, + "loss": 0.6898, + "step": 16786 + }, + { + "epoch": 0.817422637742556, + "grad_norm": 1.642367959022522, + "learning_rate": 3.395512573708972e-06, + "loss": 0.7517, + "step": 16787 + }, + { + "epoch": 0.8174713315316632, + "grad_norm": 1.6294262409210205, + "learning_rate": 3.3937544429818646e-06, + "loss": 0.8346, + "step": 16788 + }, + { + "epoch": 0.8175200253207703, + "grad_norm": 1.5172315835952759, + "learning_rate": 3.391996725336637e-06, + "loss": 0.8124, + "step": 16789 + }, + { + "epoch": 0.8175687191098775, + "grad_norm": 1.2134315967559814, + "learning_rate": 3.3902394208170052e-06, + "loss": 0.726, + "step": 16790 + }, + { + "epoch": 0.8176174128989847, + "grad_norm": 1.502686619758606, + "learning_rate": 3.388482529466681e-06, + "loss": 0.9154, + "step": 16791 + }, + { + "epoch": 0.8176661066880919, + "grad_norm": 1.3934427499771118, + "learning_rate": 3.3867260513293678e-06, + "loss": 0.819, + "step": 16792 + }, + { + "epoch": 0.8177148004771991, + "grad_norm": 1.6849511861801147, + "learning_rate": 3.3849699864487606e-06, + "loss": 0.7431, + "step": 16793 + }, + { + "epoch": 0.8177634942663063, + "grad_norm": 1.9801846742630005, + "learning_rate": 3.383214334868539e-06, + "loss": 0.8812, + "step": 16794 + }, + { + "epoch": 0.8178121880554136, + "grad_norm": 1.3283637762069702, + "learning_rate": 3.3814590966323757e-06, + "loss": 0.7746, + "step": 16795 + }, + { + "epoch": 0.8178608818445208, + "grad_norm": 1.9021161794662476, + "learning_rate": 3.3797042717839323e-06, + "loss": 0.8307, + "step": 16796 + }, + { + "epoch": 0.817909575633628, + "grad_norm": 1.921213150024414, + "learning_rate": 3.377949860366856e-06, + "loss": 0.9245, + "step": 16797 + }, + { + "epoch": 0.8179582694227351, + "grad_norm": 1.4839508533477783, + "learning_rate": 3.376195862424798e-06, + "loss": 0.7971, + "step": 16798 + }, + { + "epoch": 0.8180069632118423, + "grad_norm": 3.340622901916504, + "learning_rate": 3.374442278001382e-06, + "loss": 0.8684, + "step": 16799 + }, + { + "epoch": 0.8180556570009495, + "grad_norm": 1.6012450456619263, + "learning_rate": 3.3726891071402325e-06, + "loss": 0.8595, + "step": 16800 + }, + { + "epoch": 0.8181043507900567, + "grad_norm": 1.540403962135315, + "learning_rate": 3.3709363498849544e-06, + "loss": 0.8632, + "step": 16801 + }, + { + "epoch": 0.8181530445791639, + "grad_norm": 2.6618573665618896, + "learning_rate": 3.3691840062791538e-06, + "loss": 0.7723, + "step": 16802 + }, + { + "epoch": 0.8182017383682711, + "grad_norm": 1.4588943719863892, + "learning_rate": 3.367432076366417e-06, + "loss": 0.8772, + "step": 16803 + }, + { + "epoch": 0.8182504321573784, + "grad_norm": 1.297549843788147, + "learning_rate": 3.3656805601903232e-06, + "loss": 0.7691, + "step": 16804 + }, + { + "epoch": 0.8182991259464856, + "grad_norm": 1.1905382871627808, + "learning_rate": 3.363929457794437e-06, + "loss": 0.7865, + "step": 16805 + }, + { + "epoch": 0.8183478197355927, + "grad_norm": 1.4351788759231567, + "learning_rate": 3.3621787692223264e-06, + "loss": 0.7858, + "step": 16806 + }, + { + "epoch": 0.8183965135246999, + "grad_norm": 1.8936944007873535, + "learning_rate": 3.3604284945175357e-06, + "loss": 0.8553, + "step": 16807 + }, + { + "epoch": 0.8184452073138071, + "grad_norm": 1.5810613632202148, + "learning_rate": 3.358678633723604e-06, + "loss": 0.8286, + "step": 16808 + }, + { + "epoch": 0.8184939011029143, + "grad_norm": 2.350494384765625, + "learning_rate": 3.356929186884059e-06, + "loss": 0.8023, + "step": 16809 + }, + { + "epoch": 0.8185425948920215, + "grad_norm": 1.4738155603408813, + "learning_rate": 3.355180154042419e-06, + "loss": 0.79, + "step": 16810 + }, + { + "epoch": 0.8185912886811287, + "grad_norm": 1.4344894886016846, + "learning_rate": 3.353431535242189e-06, + "loss": 0.7709, + "step": 16811 + }, + { + "epoch": 0.818639982470236, + "grad_norm": 3.802644729614258, + "learning_rate": 3.3516833305268624e-06, + "loss": 0.8726, + "step": 16812 + }, + { + "epoch": 0.8186886762593432, + "grad_norm": 3.1630961894989014, + "learning_rate": 3.349935539939941e-06, + "loss": 0.826, + "step": 16813 + }, + { + "epoch": 0.8187373700484504, + "grad_norm": 1.6373043060302734, + "learning_rate": 3.3481881635248813e-06, + "loss": 0.798, + "step": 16814 + }, + { + "epoch": 0.8187860638375575, + "grad_norm": 1.5527386665344238, + "learning_rate": 3.34644120132517e-06, + "loss": 0.7573, + "step": 16815 + }, + { + "epoch": 0.8188347576266647, + "grad_norm": 1.9924569129943848, + "learning_rate": 3.3446946533842416e-06, + "loss": 0.8108, + "step": 16816 + }, + { + "epoch": 0.8188834514157719, + "grad_norm": 1.619947910308838, + "learning_rate": 3.342948519745557e-06, + "loss": 0.7988, + "step": 16817 + }, + { + "epoch": 0.8189321452048791, + "grad_norm": 1.8753842115402222, + "learning_rate": 3.3412028004525454e-06, + "loss": 0.8274, + "step": 16818 + }, + { + "epoch": 0.8189808389939863, + "grad_norm": 1.614707112312317, + "learning_rate": 3.3394574955486305e-06, + "loss": 0.8621, + "step": 16819 + }, + { + "epoch": 0.8190295327830935, + "grad_norm": 1.2782278060913086, + "learning_rate": 3.337712605077239e-06, + "loss": 0.8394, + "step": 16820 + }, + { + "epoch": 0.8190782265722008, + "grad_norm": 1.4756196737289429, + "learning_rate": 3.3359681290817546e-06, + "loss": 0.7722, + "step": 16821 + }, + { + "epoch": 0.819126920361308, + "grad_norm": 1.4481377601623535, + "learning_rate": 3.3342240676055916e-06, + "loss": 0.8496, + "step": 16822 + }, + { + "epoch": 0.8191756141504151, + "grad_norm": 1.573928952217102, + "learning_rate": 3.332480420692117e-06, + "loss": 0.7811, + "step": 16823 + }, + { + "epoch": 0.8192243079395223, + "grad_norm": 1.3863847255706787, + "learning_rate": 3.330737188384714e-06, + "loss": 0.8163, + "step": 16824 + }, + { + "epoch": 0.8192730017286295, + "grad_norm": 1.8668293952941895, + "learning_rate": 3.3289943707267457e-06, + "loss": 0.84, + "step": 16825 + }, + { + "epoch": 0.8193216955177367, + "grad_norm": 1.3171167373657227, + "learning_rate": 3.32725196776156e-06, + "loss": 0.7314, + "step": 16826 + }, + { + "epoch": 0.8193703893068439, + "grad_norm": 1.567111849784851, + "learning_rate": 3.3255099795325043e-06, + "loss": 0.8606, + "step": 16827 + }, + { + "epoch": 0.8194190830959511, + "grad_norm": 1.4218858480453491, + "learning_rate": 3.323768406082901e-06, + "loss": 0.8467, + "step": 16828 + }, + { + "epoch": 0.8194677768850583, + "grad_norm": 1.5172922611236572, + "learning_rate": 3.32202724745609e-06, + "loss": 0.7491, + "step": 16829 + }, + { + "epoch": 0.8195164706741656, + "grad_norm": 1.6433894634246826, + "learning_rate": 3.3202865036953624e-06, + "loss": 0.7622, + "step": 16830 + }, + { + "epoch": 0.8195651644632727, + "grad_norm": 1.3969780206680298, + "learning_rate": 3.3185461748440375e-06, + "loss": 0.9035, + "step": 16831 + }, + { + "epoch": 0.8196138582523799, + "grad_norm": 1.7364407777786255, + "learning_rate": 3.3168062609453887e-06, + "loss": 0.7253, + "step": 16832 + }, + { + "epoch": 0.8196625520414871, + "grad_norm": 0.09905947744846344, + "learning_rate": 3.3150667620427093e-06, + "loss": 0.6146, + "step": 16833 + }, + { + "epoch": 0.8197112458305943, + "grad_norm": 1.8684542179107666, + "learning_rate": 3.3133276781792656e-06, + "loss": 0.7557, + "step": 16834 + }, + { + "epoch": 0.8197599396197015, + "grad_norm": 2.9073429107666016, + "learning_rate": 3.3115890093983173e-06, + "loss": 0.7307, + "step": 16835 + }, + { + "epoch": 0.8198086334088087, + "grad_norm": 1.3431800603866577, + "learning_rate": 3.3098507557431135e-06, + "loss": 0.8081, + "step": 16836 + }, + { + "epoch": 0.8198573271979159, + "grad_norm": 1.4651544094085693, + "learning_rate": 3.3081129172568914e-06, + "loss": 0.8124, + "step": 16837 + }, + { + "epoch": 0.8199060209870231, + "grad_norm": 0.09679253399372101, + "learning_rate": 3.3063754939828896e-06, + "loss": 0.6438, + "step": 16838 + }, + { + "epoch": 0.8199547147761304, + "grad_norm": 1.3560904264450073, + "learning_rate": 3.304638485964311e-06, + "loss": 0.8132, + "step": 16839 + }, + { + "epoch": 0.8200034085652375, + "grad_norm": 1.7118171453475952, + "learning_rate": 3.3029018932443768e-06, + "loss": 0.8391, + "step": 16840 + }, + { + "epoch": 0.8200521023543447, + "grad_norm": 1.4208694696426392, + "learning_rate": 3.3011657158662814e-06, + "loss": 0.8319, + "step": 16841 + }, + { + "epoch": 0.8201007961434519, + "grad_norm": 1.7783770561218262, + "learning_rate": 3.29942995387321e-06, + "loss": 0.7474, + "step": 16842 + }, + { + "epoch": 0.8201494899325591, + "grad_norm": 1.3249164819717407, + "learning_rate": 3.2976946073083414e-06, + "loss": 0.7817, + "step": 16843 + }, + { + "epoch": 0.8201981837216663, + "grad_norm": 1.6357288360595703, + "learning_rate": 3.2959596762148416e-06, + "loss": 0.7728, + "step": 16844 + }, + { + "epoch": 0.8202468775107735, + "grad_norm": 1.6650009155273438, + "learning_rate": 3.294225160635869e-06, + "loss": 0.8223, + "step": 16845 + }, + { + "epoch": 0.8202955712998807, + "grad_norm": 1.4229681491851807, + "learning_rate": 3.2924910606145643e-06, + "loss": 0.8372, + "step": 16846 + }, + { + "epoch": 0.820344265088988, + "grad_norm": 1.43051278591156, + "learning_rate": 3.2907573761940716e-06, + "loss": 0.7633, + "step": 16847 + }, + { + "epoch": 0.820392958878095, + "grad_norm": 1.6238456964492798, + "learning_rate": 3.289024107417513e-06, + "loss": 0.8275, + "step": 16848 + }, + { + "epoch": 0.8204416526672023, + "grad_norm": 3.5773766040802, + "learning_rate": 3.2872912543280043e-06, + "loss": 0.8402, + "step": 16849 + }, + { + "epoch": 0.8204903464563095, + "grad_norm": 1.3568799495697021, + "learning_rate": 3.2855588169686483e-06, + "loss": 0.8126, + "step": 16850 + }, + { + "epoch": 0.8205390402454167, + "grad_norm": 2.0002615451812744, + "learning_rate": 3.283826795382541e-06, + "loss": 0.7727, + "step": 16851 + }, + { + "epoch": 0.8205877340345239, + "grad_norm": 1.6662315130233765, + "learning_rate": 3.282095189612766e-06, + "loss": 0.845, + "step": 16852 + }, + { + "epoch": 0.8206364278236311, + "grad_norm": 1.7210081815719604, + "learning_rate": 3.280363999702396e-06, + "loss": 0.8608, + "step": 16853 + }, + { + "epoch": 0.8206851216127383, + "grad_norm": 1.8312337398529053, + "learning_rate": 3.278633225694499e-06, + "loss": 0.8878, + "step": 16854 + }, + { + "epoch": 0.8207338154018455, + "grad_norm": 1.4641467332839966, + "learning_rate": 3.276902867632119e-06, + "loss": 0.769, + "step": 16855 + }, + { + "epoch": 0.8207825091909527, + "grad_norm": 1.741430640220642, + "learning_rate": 3.27517292555831e-06, + "loss": 0.897, + "step": 16856 + }, + { + "epoch": 0.8208312029800598, + "grad_norm": 1.515101671218872, + "learning_rate": 3.2734433995160987e-06, + "loss": 0.7853, + "step": 16857 + }, + { + "epoch": 0.8208798967691671, + "grad_norm": 1.5117074251174927, + "learning_rate": 3.271714289548509e-06, + "loss": 0.8752, + "step": 16858 + }, + { + "epoch": 0.8209285905582743, + "grad_norm": 0.09991801530122757, + "learning_rate": 3.2699855956985506e-06, + "loss": 0.6678, + "step": 16859 + }, + { + "epoch": 0.8209772843473815, + "grad_norm": 2.2818498611450195, + "learning_rate": 3.2682573180092247e-06, + "loss": 0.8247, + "step": 16860 + }, + { + "epoch": 0.8210259781364887, + "grad_norm": 1.4937514066696167, + "learning_rate": 3.266529456523526e-06, + "loss": 0.8498, + "step": 16861 + }, + { + "epoch": 0.8210746719255959, + "grad_norm": 1.6383030414581299, + "learning_rate": 3.2648020112844314e-06, + "loss": 0.8374, + "step": 16862 + }, + { + "epoch": 0.8211233657147031, + "grad_norm": 8.23947525024414, + "learning_rate": 3.263074982334908e-06, + "loss": 0.8587, + "step": 16863 + }, + { + "epoch": 0.8211720595038103, + "grad_norm": 1.4494338035583496, + "learning_rate": 3.2613483697179295e-06, + "loss": 0.82, + "step": 16864 + }, + { + "epoch": 0.8212207532929174, + "grad_norm": 1.320665717124939, + "learning_rate": 3.2596221734764268e-06, + "loss": 0.8528, + "step": 16865 + }, + { + "epoch": 0.8212694470820247, + "grad_norm": 1.860505223274231, + "learning_rate": 3.2578963936533547e-06, + "loss": 0.841, + "step": 16866 + }, + { + "epoch": 0.8213181408711319, + "grad_norm": 1.5031713247299194, + "learning_rate": 3.2561710302916373e-06, + "loss": 0.8289, + "step": 16867 + }, + { + "epoch": 0.8213668346602391, + "grad_norm": 1.5640438795089722, + "learning_rate": 3.2544460834341905e-06, + "loss": 0.7846, + "step": 16868 + }, + { + "epoch": 0.8214155284493463, + "grad_norm": 1.4164541959762573, + "learning_rate": 3.252721553123923e-06, + "loss": 0.8501, + "step": 16869 + }, + { + "epoch": 0.8214642222384535, + "grad_norm": 1.4162319898605347, + "learning_rate": 3.2509974394037315e-06, + "loss": 0.7221, + "step": 16870 + }, + { + "epoch": 0.8215129160275607, + "grad_norm": 1.6250518560409546, + "learning_rate": 3.2492737423165144e-06, + "loss": 0.7696, + "step": 16871 + }, + { + "epoch": 0.8215616098166679, + "grad_norm": 1.548196792602539, + "learning_rate": 3.247550461905131e-06, + "loss": 0.8439, + "step": 16872 + }, + { + "epoch": 0.8216103036057751, + "grad_norm": 1.6166552305221558, + "learning_rate": 3.2458275982124654e-06, + "loss": 0.8368, + "step": 16873 + }, + { + "epoch": 0.8216589973948822, + "grad_norm": 1.1430588960647583, + "learning_rate": 3.2441051512813582e-06, + "loss": 0.8788, + "step": 16874 + }, + { + "epoch": 0.8217076911839895, + "grad_norm": 2.599308729171753, + "learning_rate": 3.2423831211546663e-06, + "loss": 0.7915, + "step": 16875 + }, + { + "epoch": 0.8217563849730967, + "grad_norm": 1.4762489795684814, + "learning_rate": 3.2406615078752247e-06, + "loss": 0.815, + "step": 16876 + }, + { + "epoch": 0.8218050787622039, + "grad_norm": 1.2266321182250977, + "learning_rate": 3.238940311485854e-06, + "loss": 0.8303, + "step": 16877 + }, + { + "epoch": 0.8218537725513111, + "grad_norm": 1.178400993347168, + "learning_rate": 3.237219532029372e-06, + "loss": 0.7891, + "step": 16878 + }, + { + "epoch": 0.8219024663404183, + "grad_norm": 1.8089741468429565, + "learning_rate": 3.235499169548577e-06, + "loss": 0.8078, + "step": 16879 + }, + { + "epoch": 0.8219511601295255, + "grad_norm": 1.7140297889709473, + "learning_rate": 3.2337792240862774e-06, + "loss": 0.8318, + "step": 16880 + }, + { + "epoch": 0.8219998539186327, + "grad_norm": 1.5872321128845215, + "learning_rate": 3.232059695685241e-06, + "loss": 0.7512, + "step": 16881 + }, + { + "epoch": 0.8220485477077398, + "grad_norm": 2.0875954627990723, + "learning_rate": 3.2303405843882562e-06, + "loss": 0.8136, + "step": 16882 + }, + { + "epoch": 0.822097241496847, + "grad_norm": 1.30197274684906, + "learning_rate": 3.2286218902380708e-06, + "loss": 0.6867, + "step": 16883 + }, + { + "epoch": 0.8221459352859543, + "grad_norm": 1.4384725093841553, + "learning_rate": 3.226903613277448e-06, + "loss": 0.8679, + "step": 16884 + }, + { + "epoch": 0.8221946290750615, + "grad_norm": 1.802594542503357, + "learning_rate": 3.2251857535491293e-06, + "loss": 0.8019, + "step": 16885 + }, + { + "epoch": 0.8222433228641687, + "grad_norm": 1.6215819120407104, + "learning_rate": 3.2234683110958363e-06, + "loss": 0.8329, + "step": 16886 + }, + { + "epoch": 0.8222920166532759, + "grad_norm": 1.4715726375579834, + "learning_rate": 3.2217512859603105e-06, + "loss": 0.7953, + "step": 16887 + }, + { + "epoch": 0.8223407104423831, + "grad_norm": 1.161281704902649, + "learning_rate": 3.220034678185242e-06, + "loss": 0.7462, + "step": 16888 + }, + { + "epoch": 0.8223894042314903, + "grad_norm": 1.7480621337890625, + "learning_rate": 3.21831848781335e-06, + "loss": 0.7533, + "step": 16889 + }, + { + "epoch": 0.8224380980205974, + "grad_norm": 1.1463639736175537, + "learning_rate": 3.216602714887305e-06, + "loss": 0.8398, + "step": 16890 + }, + { + "epoch": 0.8224867918097046, + "grad_norm": 1.301794171333313, + "learning_rate": 3.214887359449803e-06, + "loss": 0.847, + "step": 16891 + }, + { + "epoch": 0.8225354855988118, + "grad_norm": 1.5727691650390625, + "learning_rate": 3.213172421543509e-06, + "loss": 0.7756, + "step": 16892 + }, + { + "epoch": 0.8225841793879191, + "grad_norm": 1.2863043546676636, + "learning_rate": 3.211457901211081e-06, + "loss": 0.7519, + "step": 16893 + }, + { + "epoch": 0.8226328731770263, + "grad_norm": 1.4322673082351685, + "learning_rate": 3.2097437984951685e-06, + "loss": 0.7982, + "step": 16894 + }, + { + "epoch": 0.8226815669661335, + "grad_norm": 1.6915301084518433, + "learning_rate": 3.208030113438407e-06, + "loss": 0.8341, + "step": 16895 + }, + { + "epoch": 0.8227302607552407, + "grad_norm": 0.10046298801898956, + "learning_rate": 3.206316846083435e-06, + "loss": 0.5908, + "step": 16896 + }, + { + "epoch": 0.8227789545443479, + "grad_norm": 1.3777810335159302, + "learning_rate": 3.2046039964728526e-06, + "loss": 0.8051, + "step": 16897 + }, + { + "epoch": 0.8228276483334551, + "grad_norm": 1.3972909450531006, + "learning_rate": 3.2028915646492844e-06, + "loss": 0.7875, + "step": 16898 + }, + { + "epoch": 0.8228763421225622, + "grad_norm": 1.8776631355285645, + "learning_rate": 3.2011795506553177e-06, + "loss": 0.7841, + "step": 16899 + }, + { + "epoch": 0.8229250359116694, + "grad_norm": 1.5398608446121216, + "learning_rate": 3.1994679545335414e-06, + "loss": 0.8624, + "step": 16900 + }, + { + "epoch": 0.8229737297007766, + "grad_norm": 1.4049240350723267, + "learning_rate": 3.1977567763265325e-06, + "loss": 0.7841, + "step": 16901 + }, + { + "epoch": 0.8230224234898839, + "grad_norm": 1.5537152290344238, + "learning_rate": 3.1960460160768547e-06, + "loss": 0.8186, + "step": 16902 + }, + { + "epoch": 0.8230711172789911, + "grad_norm": 1.8019129037857056, + "learning_rate": 3.1943356738270648e-06, + "loss": 0.7831, + "step": 16903 + }, + { + "epoch": 0.8231198110680983, + "grad_norm": 2.3507091999053955, + "learning_rate": 3.1926257496197086e-06, + "loss": 0.8449, + "step": 16904 + }, + { + "epoch": 0.8231685048572055, + "grad_norm": 1.9150416851043701, + "learning_rate": 3.190916243497317e-06, + "loss": 0.8367, + "step": 16905 + }, + { + "epoch": 0.8232171986463127, + "grad_norm": 1.7173752784729004, + "learning_rate": 3.189207155502414e-06, + "loss": 0.8225, + "step": 16906 + }, + { + "epoch": 0.8232658924354198, + "grad_norm": 1.5909388065338135, + "learning_rate": 3.1874984856775182e-06, + "loss": 0.7817, + "step": 16907 + }, + { + "epoch": 0.823314586224527, + "grad_norm": 1.3423452377319336, + "learning_rate": 3.1857902340651313e-06, + "loss": 0.8189, + "step": 16908 + }, + { + "epoch": 0.8233632800136342, + "grad_norm": 1.261653184890747, + "learning_rate": 3.1840824007077463e-06, + "loss": 0.7624, + "step": 16909 + }, + { + "epoch": 0.8234119738027414, + "grad_norm": 2.2700564861297607, + "learning_rate": 3.1823749856478447e-06, + "loss": 0.7805, + "step": 16910 + }, + { + "epoch": 0.8234606675918487, + "grad_norm": 1.8756804466247559, + "learning_rate": 3.180667988927899e-06, + "loss": 0.8665, + "step": 16911 + }, + { + "epoch": 0.8235093613809559, + "grad_norm": 1.4940910339355469, + "learning_rate": 3.178961410590369e-06, + "loss": 0.8277, + "step": 16912 + }, + { + "epoch": 0.8235580551700631, + "grad_norm": 2.5153675079345703, + "learning_rate": 3.1772552506777087e-06, + "loss": 0.8493, + "step": 16913 + }, + { + "epoch": 0.8236067489591703, + "grad_norm": 2.3809056282043457, + "learning_rate": 3.175549509232354e-06, + "loss": 0.8409, + "step": 16914 + }, + { + "epoch": 0.8236554427482775, + "grad_norm": 1.6528481245040894, + "learning_rate": 3.173844186296742e-06, + "loss": 0.8117, + "step": 16915 + }, + { + "epoch": 0.8237041365373846, + "grad_norm": 1.505719542503357, + "learning_rate": 3.1721392819132914e-06, + "loss": 0.7128, + "step": 16916 + }, + { + "epoch": 0.8237528303264918, + "grad_norm": 1.3413459062576294, + "learning_rate": 3.1704347961244086e-06, + "loss": 0.873, + "step": 16917 + }, + { + "epoch": 0.823801524115599, + "grad_norm": 1.7116504907608032, + "learning_rate": 3.168730728972498e-06, + "loss": 0.7371, + "step": 16918 + }, + { + "epoch": 0.8238502179047063, + "grad_norm": 1.6817078590393066, + "learning_rate": 3.1670270804999425e-06, + "loss": 0.8204, + "step": 16919 + }, + { + "epoch": 0.8238989116938135, + "grad_norm": 2.382786989212036, + "learning_rate": 3.1653238507491247e-06, + "loss": 0.8623, + "step": 16920 + }, + { + "epoch": 0.8239476054829207, + "grad_norm": 1.615989089012146, + "learning_rate": 3.1636210397624057e-06, + "loss": 0.8667, + "step": 16921 + }, + { + "epoch": 0.8239962992720279, + "grad_norm": 2.7292072772979736, + "learning_rate": 3.161918647582156e-06, + "loss": 0.8096, + "step": 16922 + }, + { + "epoch": 0.8240449930611351, + "grad_norm": 1.3215218782424927, + "learning_rate": 3.160216674250709e-06, + "loss": 0.8484, + "step": 16923 + }, + { + "epoch": 0.8240936868502422, + "grad_norm": 1.763674020767212, + "learning_rate": 3.1585151198104147e-06, + "loss": 0.8073, + "step": 16924 + }, + { + "epoch": 0.8241423806393494, + "grad_norm": 1.7345346212387085, + "learning_rate": 3.1568139843035837e-06, + "loss": 0.7646, + "step": 16925 + }, + { + "epoch": 0.8241910744284566, + "grad_norm": 2.0649402141571045, + "learning_rate": 3.155113267772545e-06, + "loss": 0.749, + "step": 16926 + }, + { + "epoch": 0.8242397682175638, + "grad_norm": 1.7005759477615356, + "learning_rate": 3.1534129702595995e-06, + "loss": 0.7724, + "step": 16927 + }, + { + "epoch": 0.824288462006671, + "grad_norm": 2.06632399559021, + "learning_rate": 3.151713091807038e-06, + "loss": 0.8541, + "step": 16928 + }, + { + "epoch": 0.8243371557957783, + "grad_norm": 0.09301094710826874, + "learning_rate": 3.1500136324571583e-06, + "loss": 0.5928, + "step": 16929 + }, + { + "epoch": 0.8243858495848855, + "grad_norm": 2.900749921798706, + "learning_rate": 3.148314592252217e-06, + "loss": 0.7555, + "step": 16930 + }, + { + "epoch": 0.8244345433739927, + "grad_norm": 1.641290545463562, + "learning_rate": 3.146615971234497e-06, + "loss": 0.7803, + "step": 16931 + }, + { + "epoch": 0.8244832371630998, + "grad_norm": 1.474417805671692, + "learning_rate": 3.144917769446232e-06, + "loss": 0.7251, + "step": 16932 + }, + { + "epoch": 0.824531930952207, + "grad_norm": 2.676776885986328, + "learning_rate": 3.1432199869296776e-06, + "loss": 0.8473, + "step": 16933 + }, + { + "epoch": 0.8245806247413142, + "grad_norm": 1.4775300025939941, + "learning_rate": 3.1415226237270625e-06, + "loss": 0.8416, + "step": 16934 + }, + { + "epoch": 0.8246293185304214, + "grad_norm": 1.948878288269043, + "learning_rate": 3.1398256798806103e-06, + "loss": 0.7957, + "step": 16935 + }, + { + "epoch": 0.8246780123195286, + "grad_norm": 1.2967344522476196, + "learning_rate": 3.1381291554325324e-06, + "loss": 0.7923, + "step": 16936 + }, + { + "epoch": 0.8247267061086359, + "grad_norm": 2.456948757171631, + "learning_rate": 3.136433050425023e-06, + "loss": 0.8428, + "step": 16937 + }, + { + "epoch": 0.8247753998977431, + "grad_norm": 1.3994184732437134, + "learning_rate": 3.1347373649002887e-06, + "loss": 0.7871, + "step": 16938 + }, + { + "epoch": 0.8248240936868503, + "grad_norm": 1.8522186279296875, + "learning_rate": 3.1330420989004917e-06, + "loss": 0.7825, + "step": 16939 + }, + { + "epoch": 0.8248727874759575, + "grad_norm": 3.712700843811035, + "learning_rate": 3.131347252467818e-06, + "loss": 0.8684, + "step": 16940 + }, + { + "epoch": 0.8249214812650646, + "grad_norm": 1.66623854637146, + "learning_rate": 3.1296528256444114e-06, + "loss": 0.7504, + "step": 16941 + }, + { + "epoch": 0.8249701750541718, + "grad_norm": 1.5035165548324585, + "learning_rate": 3.127958818472434e-06, + "loss": 0.8252, + "step": 16942 + }, + { + "epoch": 0.825018868843279, + "grad_norm": 0.09551005810499191, + "learning_rate": 3.126265230994019e-06, + "loss": 0.6046, + "step": 16943 + }, + { + "epoch": 0.8250675626323862, + "grad_norm": 1.6719260215759277, + "learning_rate": 3.1245720632512945e-06, + "loss": 0.8437, + "step": 16944 + }, + { + "epoch": 0.8251162564214934, + "grad_norm": 2.0571069717407227, + "learning_rate": 3.122879315286378e-06, + "loss": 0.7839, + "step": 16945 + }, + { + "epoch": 0.8251649502106007, + "grad_norm": 0.0968146026134491, + "learning_rate": 3.1211869871413713e-06, + "loss": 0.5742, + "step": 16946 + }, + { + "epoch": 0.8252136439997079, + "grad_norm": 1.39165198802948, + "learning_rate": 3.11949507885839e-06, + "loss": 0.8047, + "step": 16947 + }, + { + "epoch": 0.8252623377888151, + "grad_norm": 1.3710472583770752, + "learning_rate": 3.1178035904794957e-06, + "loss": 0.8385, + "step": 16948 + }, + { + "epoch": 0.8253110315779222, + "grad_norm": 1.7944823503494263, + "learning_rate": 3.1161125220467838e-06, + "loss": 0.8423, + "step": 16949 + }, + { + "epoch": 0.8253597253670294, + "grad_norm": 1.7590876817703247, + "learning_rate": 3.1144218736023093e-06, + "loss": 0.8602, + "step": 16950 + }, + { + "epoch": 0.8254084191561366, + "grad_norm": 1.7813231945037842, + "learning_rate": 3.1127316451881317e-06, + "loss": 0.8495, + "step": 16951 + }, + { + "epoch": 0.8254571129452438, + "grad_norm": 1.407041311264038, + "learning_rate": 3.1110418368462936e-06, + "loss": 0.8528, + "step": 16952 + }, + { + "epoch": 0.825505806734351, + "grad_norm": 2.2037227153778076, + "learning_rate": 3.109352448618832e-06, + "loss": 0.7704, + "step": 16953 + }, + { + "epoch": 0.8255545005234582, + "grad_norm": 8.952773094177246, + "learning_rate": 3.107663480547767e-06, + "loss": 0.7699, + "step": 16954 + }, + { + "epoch": 0.8256031943125655, + "grad_norm": 0.09421604871749878, + "learning_rate": 3.1059749326751065e-06, + "loss": 0.6011, + "step": 16955 + }, + { + "epoch": 0.8256518881016727, + "grad_norm": 1.540617823600769, + "learning_rate": 3.1042868050428666e-06, + "loss": 0.7634, + "step": 16956 + }, + { + "epoch": 0.8257005818907799, + "grad_norm": 2.094332456588745, + "learning_rate": 3.102599097693031e-06, + "loss": 0.7695, + "step": 16957 + }, + { + "epoch": 0.825749275679887, + "grad_norm": 1.4747401475906372, + "learning_rate": 3.100911810667586e-06, + "loss": 0.8322, + "step": 16958 + }, + { + "epoch": 0.8257979694689942, + "grad_norm": 1.5930967330932617, + "learning_rate": 3.099224944008499e-06, + "loss": 0.7842, + "step": 16959 + }, + { + "epoch": 0.8258466632581014, + "grad_norm": 1.3916147947311401, + "learning_rate": 3.0975384977577326e-06, + "loss": 0.8015, + "step": 16960 + }, + { + "epoch": 0.8258953570472086, + "grad_norm": 1.7778431177139282, + "learning_rate": 3.095852471957237e-06, + "loss": 0.8719, + "step": 16961 + }, + { + "epoch": 0.8259440508363158, + "grad_norm": 1.6182082891464233, + "learning_rate": 3.0941668666489533e-06, + "loss": 0.7717, + "step": 16962 + }, + { + "epoch": 0.825992744625423, + "grad_norm": 2.0080857276916504, + "learning_rate": 3.0924816818748084e-06, + "loss": 0.8981, + "step": 16963 + }, + { + "epoch": 0.8260414384145303, + "grad_norm": 2.5278332233428955, + "learning_rate": 3.0907969176767216e-06, + "loss": 0.8334, + "step": 16964 + }, + { + "epoch": 0.8260901322036375, + "grad_norm": 1.7679061889648438, + "learning_rate": 3.0891125740966044e-06, + "loss": 0.7883, + "step": 16965 + }, + { + "epoch": 0.8261388259927446, + "grad_norm": 1.5931284427642822, + "learning_rate": 3.0874286511763562e-06, + "loss": 0.8565, + "step": 16966 + }, + { + "epoch": 0.8261875197818518, + "grad_norm": 1.421053171157837, + "learning_rate": 3.0857451489578615e-06, + "loss": 0.8555, + "step": 16967 + }, + { + "epoch": 0.826236213570959, + "grad_norm": 1.4142534732818604, + "learning_rate": 3.084062067482998e-06, + "loss": 0.7962, + "step": 16968 + }, + { + "epoch": 0.8262849073600662, + "grad_norm": 1.664842128753662, + "learning_rate": 3.0823794067936343e-06, + "loss": 0.7948, + "step": 16969 + }, + { + "epoch": 0.8263336011491734, + "grad_norm": 1.5999947786331177, + "learning_rate": 3.0806971669316233e-06, + "loss": 0.7976, + "step": 16970 + }, + { + "epoch": 0.8263822949382806, + "grad_norm": 2.582221031188965, + "learning_rate": 3.079015347938814e-06, + "loss": 0.7982, + "step": 16971 + }, + { + "epoch": 0.8264309887273878, + "grad_norm": 1.5554991960525513, + "learning_rate": 3.0773339498570376e-06, + "loss": 0.8485, + "step": 16972 + }, + { + "epoch": 0.8264796825164951, + "grad_norm": 1.4329197406768799, + "learning_rate": 3.075652972728129e-06, + "loss": 0.849, + "step": 16973 + }, + { + "epoch": 0.8265283763056023, + "grad_norm": 0.09879352897405624, + "learning_rate": 3.0739724165938866e-06, + "loss": 0.6044, + "step": 16974 + }, + { + "epoch": 0.8265770700947094, + "grad_norm": 3.3464958667755127, + "learning_rate": 3.0722922814961297e-06, + "loss": 0.7883, + "step": 16975 + }, + { + "epoch": 0.8266257638838166, + "grad_norm": 1.4613261222839355, + "learning_rate": 3.070612567476645e-06, + "loss": 0.8918, + "step": 16976 + }, + { + "epoch": 0.8266744576729238, + "grad_norm": 2.1950652599334717, + "learning_rate": 3.068933274577217e-06, + "loss": 0.8209, + "step": 16977 + }, + { + "epoch": 0.826723151462031, + "grad_norm": 1.5879677534103394, + "learning_rate": 3.0672544028396166e-06, + "loss": 0.7657, + "step": 16978 + }, + { + "epoch": 0.8267718452511382, + "grad_norm": 1.655659794807434, + "learning_rate": 3.065575952305602e-06, + "loss": 0.8774, + "step": 16979 + }, + { + "epoch": 0.8268205390402454, + "grad_norm": 1.4360606670379639, + "learning_rate": 3.063897923016941e-06, + "loss": 0.8084, + "step": 16980 + }, + { + "epoch": 0.8268692328293527, + "grad_norm": 1.7190066576004028, + "learning_rate": 3.062220315015352e-06, + "loss": 0.8364, + "step": 16981 + }, + { + "epoch": 0.8269179266184599, + "grad_norm": 2.0376150608062744, + "learning_rate": 3.0605431283425858e-06, + "loss": 0.7543, + "step": 16982 + }, + { + "epoch": 0.826966620407567, + "grad_norm": 2.112281322479248, + "learning_rate": 3.0588663630403447e-06, + "loss": 0.7863, + "step": 16983 + }, + { + "epoch": 0.8270153141966742, + "grad_norm": 1.5521459579467773, + "learning_rate": 3.0571900191503535e-06, + "loss": 0.8332, + "step": 16984 + }, + { + "epoch": 0.8270640079857814, + "grad_norm": 1.3488272428512573, + "learning_rate": 3.0555140967143047e-06, + "loss": 0.8406, + "step": 16985 + }, + { + "epoch": 0.8271127017748886, + "grad_norm": 1.3619886636734009, + "learning_rate": 3.053838595773888e-06, + "loss": 0.7714, + "step": 16986 + }, + { + "epoch": 0.8271613955639958, + "grad_norm": 1.7581026554107666, + "learning_rate": 3.0521635163707806e-06, + "loss": 0.8446, + "step": 16987 + }, + { + "epoch": 0.827210089353103, + "grad_norm": 1.5021803379058838, + "learning_rate": 3.0504888585466453e-06, + "loss": 0.807, + "step": 16988 + }, + { + "epoch": 0.8272587831422102, + "grad_norm": 1.5956357717514038, + "learning_rate": 3.048814622343157e-06, + "loss": 0.843, + "step": 16989 + }, + { + "epoch": 0.8273074769313175, + "grad_norm": 1.4545999765396118, + "learning_rate": 3.047140807801938e-06, + "loss": 0.7693, + "step": 16990 + }, + { + "epoch": 0.8273561707204246, + "grad_norm": 1.1580172777175903, + "learning_rate": 3.045467414964649e-06, + "loss": 0.8001, + "step": 16991 + }, + { + "epoch": 0.8274048645095318, + "grad_norm": 2.000763416290283, + "learning_rate": 3.043794443872894e-06, + "loss": 0.7897, + "step": 16992 + }, + { + "epoch": 0.827453558298639, + "grad_norm": 1.6855326890945435, + "learning_rate": 3.042121894568304e-06, + "loss": 0.7291, + "step": 16993 + }, + { + "epoch": 0.8275022520877462, + "grad_norm": 1.2261070013046265, + "learning_rate": 3.0404497670924748e-06, + "loss": 0.7726, + "step": 16994 + }, + { + "epoch": 0.8275509458768534, + "grad_norm": 1.228554129600525, + "learning_rate": 3.038778061487002e-06, + "loss": 0.7376, + "step": 16995 + }, + { + "epoch": 0.8275996396659606, + "grad_norm": 3.1104962825775146, + "learning_rate": 3.0371067777934793e-06, + "loss": 0.7241, + "step": 16996 + }, + { + "epoch": 0.8276483334550678, + "grad_norm": 3.4036316871643066, + "learning_rate": 3.035435916053464e-06, + "loss": 0.7566, + "step": 16997 + }, + { + "epoch": 0.827697027244175, + "grad_norm": 1.395807147026062, + "learning_rate": 3.0337654763085366e-06, + "loss": 0.6867, + "step": 16998 + }, + { + "epoch": 0.8277457210332823, + "grad_norm": 1.464547872543335, + "learning_rate": 3.0320954586002283e-06, + "loss": 0.7195, + "step": 16999 + }, + { + "epoch": 0.8277944148223894, + "grad_norm": 1.7078737020492554, + "learning_rate": 3.030425862970099e-06, + "loss": 0.7515, + "step": 17000 + }, + { + "epoch": 0.8278431086114966, + "grad_norm": 1.482635498046875, + "learning_rate": 3.028756689459673e-06, + "loss": 0.7458, + "step": 17001 + }, + { + "epoch": 0.8278918024006038, + "grad_norm": 1.6678465604782104, + "learning_rate": 3.0270879381104733e-06, + "loss": 0.8038, + "step": 17002 + }, + { + "epoch": 0.827940496189711, + "grad_norm": 1.1620208024978638, + "learning_rate": 3.0254196089640063e-06, + "loss": 0.8207, + "step": 17003 + }, + { + "epoch": 0.8279891899788182, + "grad_norm": 1.2342203855514526, + "learning_rate": 3.0237517020617725e-06, + "loss": 0.7497, + "step": 17004 + }, + { + "epoch": 0.8280378837679254, + "grad_norm": 1.2406429052352905, + "learning_rate": 3.022084217445269e-06, + "loss": 0.7609, + "step": 17005 + }, + { + "epoch": 0.8280865775570326, + "grad_norm": 0.09788143634796143, + "learning_rate": 3.020417155155961e-06, + "loss": 0.5933, + "step": 17006 + }, + { + "epoch": 0.8281352713461398, + "grad_norm": 1.4176764488220215, + "learning_rate": 3.018750515235329e-06, + "loss": 0.8091, + "step": 17007 + }, + { + "epoch": 0.828183965135247, + "grad_norm": 2.0857458114624023, + "learning_rate": 3.017084297724826e-06, + "loss": 0.7505, + "step": 17008 + }, + { + "epoch": 0.8282326589243542, + "grad_norm": 2.5289063453674316, + "learning_rate": 3.0154185026658987e-06, + "loss": 0.7769, + "step": 17009 + }, + { + "epoch": 0.8282813527134614, + "grad_norm": 1.4180700778961182, + "learning_rate": 3.0137531300999858e-06, + "loss": 0.8579, + "step": 17010 + }, + { + "epoch": 0.8283300465025686, + "grad_norm": 1.5016781091690063, + "learning_rate": 3.012088180068513e-06, + "loss": 0.7458, + "step": 17011 + }, + { + "epoch": 0.8283787402916758, + "grad_norm": 2.9361002445220947, + "learning_rate": 3.010423652612895e-06, + "loss": 0.8046, + "step": 17012 + }, + { + "epoch": 0.828427434080783, + "grad_norm": 2.2907257080078125, + "learning_rate": 3.0087595477745356e-06, + "loss": 0.8875, + "step": 17013 + }, + { + "epoch": 0.8284761278698902, + "grad_norm": 1.53722083568573, + "learning_rate": 3.007095865594829e-06, + "loss": 0.7721, + "step": 17014 + }, + { + "epoch": 0.8285248216589974, + "grad_norm": 1.446419596672058, + "learning_rate": 3.0054326061151685e-06, + "loss": 0.7867, + "step": 17015 + }, + { + "epoch": 0.8285735154481046, + "grad_norm": 1.9721219539642334, + "learning_rate": 3.0037697693769185e-06, + "loss": 0.8342, + "step": 17016 + }, + { + "epoch": 0.8286222092372117, + "grad_norm": 1.50880765914917, + "learning_rate": 3.0021073554214465e-06, + "loss": 0.8959, + "step": 17017 + }, + { + "epoch": 0.828670903026319, + "grad_norm": 2.269272804260254, + "learning_rate": 3.000445364290103e-06, + "loss": 0.8544, + "step": 17018 + }, + { + "epoch": 0.8287195968154262, + "grad_norm": 1.5964548587799072, + "learning_rate": 2.9987837960242316e-06, + "loss": 0.7221, + "step": 17019 + }, + { + "epoch": 0.8287682906045334, + "grad_norm": 1.8490902185440063, + "learning_rate": 2.997122650665165e-06, + "loss": 0.8255, + "step": 17020 + }, + { + "epoch": 0.8288169843936406, + "grad_norm": 1.533542275428772, + "learning_rate": 2.9954619282542195e-06, + "loss": 0.7038, + "step": 17021 + }, + { + "epoch": 0.8288656781827478, + "grad_norm": 1.4287875890731812, + "learning_rate": 2.993801628832711e-06, + "loss": 0.8372, + "step": 17022 + }, + { + "epoch": 0.828914371971855, + "grad_norm": 1.6477978229522705, + "learning_rate": 2.9921417524419326e-06, + "loss": 0.8065, + "step": 17023 + }, + { + "epoch": 0.8289630657609622, + "grad_norm": 1.6952062845230103, + "learning_rate": 2.990482299123183e-06, + "loss": 0.8114, + "step": 17024 + }, + { + "epoch": 0.8290117595500693, + "grad_norm": 1.404176115989685, + "learning_rate": 2.9888232689177375e-06, + "loss": 0.8132, + "step": 17025 + }, + { + "epoch": 0.8290604533391766, + "grad_norm": 1.4815444946289062, + "learning_rate": 2.9871646618668637e-06, + "loss": 0.7946, + "step": 17026 + }, + { + "epoch": 0.8291091471282838, + "grad_norm": 2.177856206893921, + "learning_rate": 2.9855064780118217e-06, + "loss": 0.8222, + "step": 17027 + }, + { + "epoch": 0.829157840917391, + "grad_norm": 1.2662187814712524, + "learning_rate": 2.9838487173938556e-06, + "loss": 0.7168, + "step": 17028 + }, + { + "epoch": 0.8292065347064982, + "grad_norm": 1.7139225006103516, + "learning_rate": 2.9821913800542047e-06, + "loss": 0.7912, + "step": 17029 + }, + { + "epoch": 0.8292552284956054, + "grad_norm": 1.5965887308120728, + "learning_rate": 2.9805344660340905e-06, + "loss": 0.8863, + "step": 17030 + }, + { + "epoch": 0.8293039222847126, + "grad_norm": 1.7104476690292358, + "learning_rate": 2.9788779753747433e-06, + "loss": 0.7126, + "step": 17031 + }, + { + "epoch": 0.8293526160738198, + "grad_norm": 1.6209856271743774, + "learning_rate": 2.977221908117347e-06, + "loss": 0.8469, + "step": 17032 + }, + { + "epoch": 0.829401309862927, + "grad_norm": 1.3026494979858398, + "learning_rate": 2.975566264303118e-06, + "loss": 0.8643, + "step": 17033 + }, + { + "epoch": 0.8294500036520341, + "grad_norm": 1.9195233583450317, + "learning_rate": 2.973911043973221e-06, + "loss": 0.7484, + "step": 17034 + }, + { + "epoch": 0.8294986974411414, + "grad_norm": 1.9672907590866089, + "learning_rate": 2.9722562471688455e-06, + "loss": 0.7919, + "step": 17035 + }, + { + "epoch": 0.8295473912302486, + "grad_norm": 2.686784029006958, + "learning_rate": 2.970601873931147e-06, + "loss": 0.8417, + "step": 17036 + }, + { + "epoch": 0.8295960850193558, + "grad_norm": 2.045448064804077, + "learning_rate": 2.968947924301273e-06, + "loss": 0.8034, + "step": 17037 + }, + { + "epoch": 0.829644778808463, + "grad_norm": 1.8175840377807617, + "learning_rate": 2.967294398320384e-06, + "loss": 0.7108, + "step": 17038 + }, + { + "epoch": 0.8296934725975702, + "grad_norm": 1.4211581945419312, + "learning_rate": 2.965641296029591e-06, + "loss": 0.8073, + "step": 17039 + }, + { + "epoch": 0.8297421663866774, + "grad_norm": 1.6185237169265747, + "learning_rate": 2.9639886174700305e-06, + "loss": 0.8328, + "step": 17040 + }, + { + "epoch": 0.8297908601757846, + "grad_norm": 1.4265124797821045, + "learning_rate": 2.9623363626828004e-06, + "loss": 0.861, + "step": 17041 + }, + { + "epoch": 0.8298395539648917, + "grad_norm": 1.814664363861084, + "learning_rate": 2.9606845317090083e-06, + "loss": 0.8146, + "step": 17042 + }, + { + "epoch": 0.8298882477539989, + "grad_norm": 2.415390729904175, + "learning_rate": 2.9590331245897453e-06, + "loss": 0.9275, + "step": 17043 + }, + { + "epoch": 0.8299369415431062, + "grad_norm": 1.608482837677002, + "learning_rate": 2.9573821413660853e-06, + "loss": 0.9083, + "step": 17044 + }, + { + "epoch": 0.8299856353322134, + "grad_norm": 6.657153129577637, + "learning_rate": 2.9557315820790976e-06, + "loss": 0.8746, + "step": 17045 + }, + { + "epoch": 0.8300343291213206, + "grad_norm": 0.09017588943243027, + "learning_rate": 2.9540814467698365e-06, + "loss": 0.5708, + "step": 17046 + }, + { + "epoch": 0.8300830229104278, + "grad_norm": 1.6570496559143066, + "learning_rate": 2.9524317354793642e-06, + "loss": 0.734, + "step": 17047 + }, + { + "epoch": 0.830131716699535, + "grad_norm": 1.6343564987182617, + "learning_rate": 2.950782448248695e-06, + "loss": 0.9183, + "step": 17048 + }, + { + "epoch": 0.8301804104886422, + "grad_norm": 1.756378173828125, + "learning_rate": 2.9491335851188794e-06, + "loss": 0.8498, + "step": 17049 + }, + { + "epoch": 0.8302291042777493, + "grad_norm": 1.5870288610458374, + "learning_rate": 2.9474851461309084e-06, + "loss": 0.7282, + "step": 17050 + }, + { + "epoch": 0.8302777980668565, + "grad_norm": 1.9017845392227173, + "learning_rate": 2.945837131325806e-06, + "loss": 0.7731, + "step": 17051 + }, + { + "epoch": 0.8303264918559637, + "grad_norm": 1.4904766082763672, + "learning_rate": 2.9441895407445575e-06, + "loss": 0.852, + "step": 17052 + }, + { + "epoch": 0.830375185645071, + "grad_norm": 1.4079126119613647, + "learning_rate": 2.942542374428148e-06, + "loss": 0.8627, + "step": 17053 + }, + { + "epoch": 0.8304238794341782, + "grad_norm": 1.374457597732544, + "learning_rate": 2.9408956324175575e-06, + "loss": 0.8323, + "step": 17054 + }, + { + "epoch": 0.8304725732232854, + "grad_norm": 1.5689387321472168, + "learning_rate": 2.9392493147537383e-06, + "loss": 0.8784, + "step": 17055 + }, + { + "epoch": 0.8305212670123926, + "grad_norm": 1.5751906633377075, + "learning_rate": 2.937603421477655e-06, + "loss": 0.8216, + "step": 17056 + }, + { + "epoch": 0.8305699608014998, + "grad_norm": 4.603269577026367, + "learning_rate": 2.935957952630233e-06, + "loss": 0.8088, + "step": 17057 + }, + { + "epoch": 0.830618654590607, + "grad_norm": 1.6731595993041992, + "learning_rate": 2.934312908252417e-06, + "loss": 0.7656, + "step": 17058 + }, + { + "epoch": 0.8306673483797141, + "grad_norm": 1.7374411821365356, + "learning_rate": 2.932668288385123e-06, + "loss": 0.8247, + "step": 17059 + }, + { + "epoch": 0.8307160421688213, + "grad_norm": 1.6934508085250854, + "learning_rate": 2.9310240930692636e-06, + "loss": 0.8095, + "step": 17060 + }, + { + "epoch": 0.8307647359579285, + "grad_norm": 1.6065438985824585, + "learning_rate": 2.929380322345734e-06, + "loss": 0.7786, + "step": 17061 + }, + { + "epoch": 0.8308134297470358, + "grad_norm": 1.6733582019805908, + "learning_rate": 2.9277369762554264e-06, + "loss": 0.8369, + "step": 17062 + }, + { + "epoch": 0.830862123536143, + "grad_norm": 1.6379921436309814, + "learning_rate": 2.926094054839219e-06, + "loss": 0.9299, + "step": 17063 + }, + { + "epoch": 0.8309108173252502, + "grad_norm": 2.0460476875305176, + "learning_rate": 2.9244515581379727e-06, + "loss": 0.8682, + "step": 17064 + }, + { + "epoch": 0.8309595111143574, + "grad_norm": 1.7644147872924805, + "learning_rate": 2.922809486192557e-06, + "loss": 0.7495, + "step": 17065 + }, + { + "epoch": 0.8310082049034646, + "grad_norm": 1.4023096561431885, + "learning_rate": 2.9211678390438102e-06, + "loss": 0.8091, + "step": 17066 + }, + { + "epoch": 0.8310568986925717, + "grad_norm": 1.1061887741088867, + "learning_rate": 2.919526616732571e-06, + "loss": 0.7571, + "step": 17067 + }, + { + "epoch": 0.8311055924816789, + "grad_norm": 1.8254499435424805, + "learning_rate": 2.9178858192996662e-06, + "loss": 0.9019, + "step": 17068 + }, + { + "epoch": 0.8311542862707861, + "grad_norm": 1.6007574796676636, + "learning_rate": 2.916245446785908e-06, + "loss": 0.7246, + "step": 17069 + }, + { + "epoch": 0.8312029800598933, + "grad_norm": 2.173123836517334, + "learning_rate": 2.9146054992321016e-06, + "loss": 0.7795, + "step": 17070 + }, + { + "epoch": 0.8312516738490006, + "grad_norm": 2.7892744541168213, + "learning_rate": 2.9129659766790406e-06, + "loss": 0.8607, + "step": 17071 + }, + { + "epoch": 0.8313003676381078, + "grad_norm": 1.52589750289917, + "learning_rate": 2.9113268791675107e-06, + "loss": 0.799, + "step": 17072 + }, + { + "epoch": 0.831349061427215, + "grad_norm": 1.4820135831832886, + "learning_rate": 2.909688206738277e-06, + "loss": 0.7194, + "step": 17073 + }, + { + "epoch": 0.8313977552163222, + "grad_norm": 1.3757998943328857, + "learning_rate": 2.908049959432113e-06, + "loss": 0.7557, + "step": 17074 + }, + { + "epoch": 0.8314464490054294, + "grad_norm": 1.7091209888458252, + "learning_rate": 2.9064121372897646e-06, + "loss": 0.685, + "step": 17075 + }, + { + "epoch": 0.8314951427945365, + "grad_norm": 2.4716546535491943, + "learning_rate": 2.9047747403519722e-06, + "loss": 0.798, + "step": 17076 + }, + { + "epoch": 0.8315438365836437, + "grad_norm": 1.4458569288253784, + "learning_rate": 2.903137768659465e-06, + "loss": 0.8779, + "step": 17077 + }, + { + "epoch": 0.8315925303727509, + "grad_norm": 1.2869007587432861, + "learning_rate": 2.901501222252967e-06, + "loss": 0.8264, + "step": 17078 + }, + { + "epoch": 0.8316412241618582, + "grad_norm": 1.4846597909927368, + "learning_rate": 2.8998651011731847e-06, + "loss": 0.7365, + "step": 17079 + }, + { + "epoch": 0.8316899179509654, + "grad_norm": 1.785911202430725, + "learning_rate": 2.898229405460817e-06, + "loss": 0.7099, + "step": 17080 + }, + { + "epoch": 0.8317386117400726, + "grad_norm": 1.6268233060836792, + "learning_rate": 2.8965941351565473e-06, + "loss": 0.7781, + "step": 17081 + }, + { + "epoch": 0.8317873055291798, + "grad_norm": 1.4416799545288086, + "learning_rate": 2.894959290301067e-06, + "loss": 0.8184, + "step": 17082 + }, + { + "epoch": 0.831835999318287, + "grad_norm": 1.5080270767211914, + "learning_rate": 2.893324870935026e-06, + "loss": 0.8355, + "step": 17083 + }, + { + "epoch": 0.8318846931073941, + "grad_norm": 3.430671453475952, + "learning_rate": 2.8916908770990915e-06, + "loss": 0.8134, + "step": 17084 + }, + { + "epoch": 0.8319333868965013, + "grad_norm": 1.4658037424087524, + "learning_rate": 2.8900573088339067e-06, + "loss": 0.8331, + "step": 17085 + }, + { + "epoch": 0.8319820806856085, + "grad_norm": 0.09809961915016174, + "learning_rate": 2.8884241661801084e-06, + "loss": 0.5977, + "step": 17086 + }, + { + "epoch": 0.8320307744747157, + "grad_norm": 1.5591232776641846, + "learning_rate": 2.8867914491783168e-06, + "loss": 0.7658, + "step": 17087 + }, + { + "epoch": 0.832079468263823, + "grad_norm": 1.3028209209442139, + "learning_rate": 2.885159157869144e-06, + "loss": 0.7486, + "step": 17088 + }, + { + "epoch": 0.8321281620529302, + "grad_norm": 1.301520586013794, + "learning_rate": 2.8835272922932067e-06, + "loss": 0.7816, + "step": 17089 + }, + { + "epoch": 0.8321768558420374, + "grad_norm": 1.5479273796081543, + "learning_rate": 2.881895852491079e-06, + "loss": 0.7803, + "step": 17090 + }, + { + "epoch": 0.8322255496311446, + "grad_norm": 1.8816481828689575, + "learning_rate": 2.8802648385033636e-06, + "loss": 0.8221, + "step": 17091 + }, + { + "epoch": 0.8322742434202517, + "grad_norm": 0.09568001329898834, + "learning_rate": 2.878634250370611e-06, + "loss": 0.635, + "step": 17092 + }, + { + "epoch": 0.8323229372093589, + "grad_norm": 3.594651937484741, + "learning_rate": 2.8770040881333973e-06, + "loss": 0.7097, + "step": 17093 + }, + { + "epoch": 0.8323716309984661, + "grad_norm": 5.775457859039307, + "learning_rate": 2.8753743518322673e-06, + "loss": 0.8472, + "step": 17094 + }, + { + "epoch": 0.8324203247875733, + "grad_norm": 1.5878568887710571, + "learning_rate": 2.8737450415077627e-06, + "loss": 0.821, + "step": 17095 + }, + { + "epoch": 0.8324690185766805, + "grad_norm": 1.5076665878295898, + "learning_rate": 2.872116157200413e-06, + "loss": 0.6999, + "step": 17096 + }, + { + "epoch": 0.8325177123657878, + "grad_norm": 1.625774621963501, + "learning_rate": 2.870487698950728e-06, + "loss": 0.8412, + "step": 17097 + }, + { + "epoch": 0.832566406154895, + "grad_norm": 1.3476957082748413, + "learning_rate": 2.868859666799235e-06, + "loss": 0.8577, + "step": 17098 + }, + { + "epoch": 0.8326150999440022, + "grad_norm": 1.5707917213439941, + "learning_rate": 2.867232060786409e-06, + "loss": 0.7659, + "step": 17099 + }, + { + "epoch": 0.8326637937331094, + "grad_norm": 2.005687952041626, + "learning_rate": 2.865604880952757e-06, + "loss": 0.7533, + "step": 17100 + }, + { + "epoch": 0.8327124875222165, + "grad_norm": 1.2421373128890991, + "learning_rate": 2.8639781273387357e-06, + "loss": 0.7603, + "step": 17101 + }, + { + "epoch": 0.8327611813113237, + "grad_norm": 2.033521890640259, + "learning_rate": 2.8623517999848283e-06, + "loss": 0.8236, + "step": 17102 + }, + { + "epoch": 0.8328098751004309, + "grad_norm": 1.6383166313171387, + "learning_rate": 2.86072589893148e-06, + "loss": 0.7918, + "step": 17103 + }, + { + "epoch": 0.8328585688895381, + "grad_norm": 1.4829562902450562, + "learning_rate": 2.8591004242191343e-06, + "loss": 0.853, + "step": 17104 + }, + { + "epoch": 0.8329072626786453, + "grad_norm": 1.4856574535369873, + "learning_rate": 2.8574753758882368e-06, + "loss": 0.841, + "step": 17105 + }, + { + "epoch": 0.8329559564677526, + "grad_norm": 1.4349976778030396, + "learning_rate": 2.8558507539791926e-06, + "loss": 0.8073, + "step": 17106 + }, + { + "epoch": 0.8330046502568598, + "grad_norm": 1.7234292030334473, + "learning_rate": 2.8542265585324357e-06, + "loss": 0.8617, + "step": 17107 + }, + { + "epoch": 0.833053344045967, + "grad_norm": 2.3850202560424805, + "learning_rate": 2.8526027895883456e-06, + "loss": 0.7914, + "step": 17108 + }, + { + "epoch": 0.8331020378350741, + "grad_norm": 1.489466667175293, + "learning_rate": 2.8509794471873277e-06, + "loss": 0.722, + "step": 17109 + }, + { + "epoch": 0.8331507316241813, + "grad_norm": 1.356628656387329, + "learning_rate": 2.8493565313697622e-06, + "loss": 0.7543, + "step": 17110 + }, + { + "epoch": 0.8331994254132885, + "grad_norm": 0.10456059128046036, + "learning_rate": 2.847734042176016e-06, + "loss": 0.58, + "step": 17111 + }, + { + "epoch": 0.8332481192023957, + "grad_norm": 1.4782094955444336, + "learning_rate": 2.846111979646449e-06, + "loss": 0.7898, + "step": 17112 + }, + { + "epoch": 0.8332968129915029, + "grad_norm": 1.666529655456543, + "learning_rate": 2.8444903438214045e-06, + "loss": 0.8908, + "step": 17113 + }, + { + "epoch": 0.8333455067806101, + "grad_norm": 1.8450660705566406, + "learning_rate": 2.8428691347412372e-06, + "loss": 0.8569, + "step": 17114 + }, + { + "epoch": 0.8333942005697174, + "grad_norm": 2.0056707859039307, + "learning_rate": 2.8412483524462577e-06, + "loss": 0.8565, + "step": 17115 + }, + { + "epoch": 0.8334428943588246, + "grad_norm": 1.6425764560699463, + "learning_rate": 2.8396279969767906e-06, + "loss": 0.7445, + "step": 17116 + }, + { + "epoch": 0.8334915881479318, + "grad_norm": 1.6723637580871582, + "learning_rate": 2.8380080683731437e-06, + "loss": 0.7384, + "step": 17117 + }, + { + "epoch": 0.8335402819370389, + "grad_norm": 1.7392730712890625, + "learning_rate": 2.836388566675612e-06, + "loss": 0.7774, + "step": 17118 + }, + { + "epoch": 0.8335889757261461, + "grad_norm": 1.2360777854919434, + "learning_rate": 2.8347694919244784e-06, + "loss": 0.8365, + "step": 17119 + }, + { + "epoch": 0.8336376695152533, + "grad_norm": 2.7620208263397217, + "learning_rate": 2.8331508441600175e-06, + "loss": 0.8657, + "step": 17120 + }, + { + "epoch": 0.8336863633043605, + "grad_norm": 1.1952812671661377, + "learning_rate": 2.8315326234224947e-06, + "loss": 0.7395, + "step": 17121 + }, + { + "epoch": 0.8337350570934677, + "grad_norm": 1.774573564529419, + "learning_rate": 2.829914829752165e-06, + "loss": 0.7724, + "step": 17122 + }, + { + "epoch": 0.833783750882575, + "grad_norm": 2.090970277786255, + "learning_rate": 2.8282974631892647e-06, + "loss": 0.8495, + "step": 17123 + }, + { + "epoch": 0.8338324446716822, + "grad_norm": 1.5321623086929321, + "learning_rate": 2.8266805237740345e-06, + "loss": 0.8096, + "step": 17124 + }, + { + "epoch": 0.8338811384607894, + "grad_norm": 1.7129851579666138, + "learning_rate": 2.8250640115466923e-06, + "loss": 0.8856, + "step": 17125 + }, + { + "epoch": 0.8339298322498965, + "grad_norm": 1.9588185548782349, + "learning_rate": 2.823447926547449e-06, + "loss": 0.8217, + "step": 17126 + }, + { + "epoch": 0.8339785260390037, + "grad_norm": 1.7941440343856812, + "learning_rate": 2.8218322688165046e-06, + "loss": 0.8675, + "step": 17127 + }, + { + "epoch": 0.8340272198281109, + "grad_norm": 0.09460418671369553, + "learning_rate": 2.8202170383940485e-06, + "loss": 0.5797, + "step": 17128 + }, + { + "epoch": 0.8340759136172181, + "grad_norm": 2.164645195007324, + "learning_rate": 2.8186022353202604e-06, + "loss": 0.905, + "step": 17129 + }, + { + "epoch": 0.8341246074063253, + "grad_norm": 1.3179813623428345, + "learning_rate": 2.8169878596353073e-06, + "loss": 0.8832, + "step": 17130 + }, + { + "epoch": 0.8341733011954325, + "grad_norm": 1.5613423585891724, + "learning_rate": 2.8153739113793468e-06, + "loss": 0.8004, + "step": 17131 + }, + { + "epoch": 0.8342219949845397, + "grad_norm": 1.7363255023956299, + "learning_rate": 2.813760390592524e-06, + "loss": 0.8792, + "step": 17132 + }, + { + "epoch": 0.834270688773647, + "grad_norm": 1.2304600477218628, + "learning_rate": 2.8121472973149845e-06, + "loss": 0.8141, + "step": 17133 + }, + { + "epoch": 0.8343193825627542, + "grad_norm": 2.0309674739837646, + "learning_rate": 2.8105346315868456e-06, + "loss": 0.7685, + "step": 17134 + }, + { + "epoch": 0.8343680763518613, + "grad_norm": 1.153645634651184, + "learning_rate": 2.808922393448226e-06, + "loss": 0.7323, + "step": 17135 + }, + { + "epoch": 0.8344167701409685, + "grad_norm": 1.4430489540100098, + "learning_rate": 2.807310582939229e-06, + "loss": 0.8088, + "step": 17136 + }, + { + "epoch": 0.8344654639300757, + "grad_norm": 2.022550344467163, + "learning_rate": 2.8056992000999496e-06, + "loss": 0.7591, + "step": 17137 + }, + { + "epoch": 0.8345141577191829, + "grad_norm": 1.446917176246643, + "learning_rate": 2.8040882449704707e-06, + "loss": 0.7511, + "step": 17138 + }, + { + "epoch": 0.8345628515082901, + "grad_norm": 1.505232810974121, + "learning_rate": 2.802477717590859e-06, + "loss": 0.8458, + "step": 17139 + }, + { + "epoch": 0.8346115452973973, + "grad_norm": 2.9307525157928467, + "learning_rate": 2.8008676180011907e-06, + "loss": 0.8337, + "step": 17140 + }, + { + "epoch": 0.8346602390865046, + "grad_norm": 1.1800916194915771, + "learning_rate": 2.7992579462415003e-06, + "loss": 0.8031, + "step": 17141 + }, + { + "epoch": 0.8347089328756118, + "grad_norm": 1.4196959733963013, + "learning_rate": 2.7976487023518426e-06, + "loss": 0.8664, + "step": 17142 + }, + { + "epoch": 0.8347576266647189, + "grad_norm": 1.1401358842849731, + "learning_rate": 2.7960398863722393e-06, + "loss": 0.6716, + "step": 17143 + }, + { + "epoch": 0.8348063204538261, + "grad_norm": 1.470151424407959, + "learning_rate": 2.7944314983427133e-06, + "loss": 0.8038, + "step": 17144 + }, + { + "epoch": 0.8348550142429333, + "grad_norm": 1.2633724212646484, + "learning_rate": 2.792823538303273e-06, + "loss": 0.8231, + "step": 17145 + }, + { + "epoch": 0.8349037080320405, + "grad_norm": 1.3789279460906982, + "learning_rate": 2.791216006293911e-06, + "loss": 0.8964, + "step": 17146 + }, + { + "epoch": 0.8349524018211477, + "grad_norm": 1.306437373161316, + "learning_rate": 2.78960890235463e-06, + "loss": 0.7255, + "step": 17147 + }, + { + "epoch": 0.8350010956102549, + "grad_norm": 1.1853010654449463, + "learning_rate": 2.788002226525388e-06, + "loss": 0.7812, + "step": 17148 + }, + { + "epoch": 0.8350497893993621, + "grad_norm": 2.7149336338043213, + "learning_rate": 2.7863959788461683e-06, + "loss": 0.7, + "step": 17149 + }, + { + "epoch": 0.8350984831884694, + "grad_norm": 1.3765130043029785, + "learning_rate": 2.784790159356907e-06, + "loss": 0.7429, + "step": 17150 + }, + { + "epoch": 0.8351471769775765, + "grad_norm": 0.09064006805419922, + "learning_rate": 2.7831847680975664e-06, + "loss": 0.5824, + "step": 17151 + }, + { + "epoch": 0.8351958707666837, + "grad_norm": 1.5673747062683105, + "learning_rate": 2.7815798051080743e-06, + "loss": 0.7707, + "step": 17152 + }, + { + "epoch": 0.8352445645557909, + "grad_norm": 1.6343533992767334, + "learning_rate": 2.7799752704283547e-06, + "loss": 0.8263, + "step": 17153 + }, + { + "epoch": 0.8352932583448981, + "grad_norm": 2.165412425994873, + "learning_rate": 2.7783711640983214e-06, + "loss": 0.8342, + "step": 17154 + }, + { + "epoch": 0.8353419521340053, + "grad_norm": 1.515857458114624, + "learning_rate": 2.77676748615787e-06, + "loss": 0.9069, + "step": 17155 + }, + { + "epoch": 0.8353906459231125, + "grad_norm": 1.8189584016799927, + "learning_rate": 2.7751642366469057e-06, + "loss": 0.7615, + "step": 17156 + }, + { + "epoch": 0.8354393397122197, + "grad_norm": 1.2024517059326172, + "learning_rate": 2.7735614156052926e-06, + "loss": 0.7467, + "step": 17157 + }, + { + "epoch": 0.8354880335013269, + "grad_norm": 1.2623258829116821, + "learning_rate": 2.771959023072921e-06, + "loss": 0.7463, + "step": 17158 + }, + { + "epoch": 0.8355367272904342, + "grad_norm": 1.7149853706359863, + "learning_rate": 2.77035705908963e-06, + "loss": 0.8719, + "step": 17159 + }, + { + "epoch": 0.8355854210795413, + "grad_norm": 1.8348382711410522, + "learning_rate": 2.7687555236952813e-06, + "loss": 0.8047, + "step": 17160 + }, + { + "epoch": 0.8356341148686485, + "grad_norm": 1.5814955234527588, + "learning_rate": 2.7671544169297095e-06, + "loss": 0.8639, + "step": 17161 + }, + { + "epoch": 0.8356828086577557, + "grad_norm": 1.1480448246002197, + "learning_rate": 2.76555373883274e-06, + "loss": 0.804, + "step": 17162 + }, + { + "epoch": 0.8357315024468629, + "grad_norm": 2.1259076595306396, + "learning_rate": 2.763953489444202e-06, + "loss": 0.7784, + "step": 17163 + }, + { + "epoch": 0.8357801962359701, + "grad_norm": 1.554905652999878, + "learning_rate": 2.762353668803881e-06, + "loss": 0.8761, + "step": 17164 + }, + { + "epoch": 0.8358288900250773, + "grad_norm": 1.4538606405258179, + "learning_rate": 2.760754276951596e-06, + "loss": 0.7412, + "step": 17165 + }, + { + "epoch": 0.8358775838141845, + "grad_norm": 1.3996158838272095, + "learning_rate": 2.7591553139271087e-06, + "loss": 0.767, + "step": 17166 + }, + { + "epoch": 0.8359262776032917, + "grad_norm": 1.6350064277648926, + "learning_rate": 2.757556779770212e-06, + "loss": 0.7597, + "step": 17167 + }, + { + "epoch": 0.8359749713923988, + "grad_norm": 3.2750325202941895, + "learning_rate": 2.7559586745206603e-06, + "loss": 0.77, + "step": 17168 + }, + { + "epoch": 0.8360236651815061, + "grad_norm": 1.3216919898986816, + "learning_rate": 2.7543609982182084e-06, + "loss": 0.783, + "step": 17169 + }, + { + "epoch": 0.8360723589706133, + "grad_norm": 1.6733187437057495, + "learning_rate": 2.7527637509026006e-06, + "loss": 0.8152, + "step": 17170 + }, + { + "epoch": 0.8361210527597205, + "grad_norm": 1.375793695449829, + "learning_rate": 2.751166932613567e-06, + "loss": 0.8698, + "step": 17171 + }, + { + "epoch": 0.8361697465488277, + "grad_norm": 1.6257736682891846, + "learning_rate": 2.7495705433908293e-06, + "loss": 0.8431, + "step": 17172 + }, + { + "epoch": 0.8362184403379349, + "grad_norm": 2.0152299404144287, + "learning_rate": 2.747974583274091e-06, + "loss": 0.8076, + "step": 17173 + }, + { + "epoch": 0.8362671341270421, + "grad_norm": 1.7974460124969482, + "learning_rate": 2.746379052303063e-06, + "loss": 0.7594, + "step": 17174 + }, + { + "epoch": 0.8363158279161493, + "grad_norm": 1.667314887046814, + "learning_rate": 2.744783950517429e-06, + "loss": 0.7594, + "step": 17175 + }, + { + "epoch": 0.8363645217052565, + "grad_norm": 1.688961148262024, + "learning_rate": 2.743189277956868e-06, + "loss": 0.8194, + "step": 17176 + }, + { + "epoch": 0.8364132154943636, + "grad_norm": 1.667601227760315, + "learning_rate": 2.7415950346610467e-06, + "loss": 0.7516, + "step": 17177 + }, + { + "epoch": 0.8364619092834709, + "grad_norm": 1.5662260055541992, + "learning_rate": 2.740001220669624e-06, + "loss": 0.941, + "step": 17178 + }, + { + "epoch": 0.8365106030725781, + "grad_norm": 1.5551724433898926, + "learning_rate": 2.738407836022243e-06, + "loss": 0.8076, + "step": 17179 + }, + { + "epoch": 0.8365592968616853, + "grad_norm": 1.46538507938385, + "learning_rate": 2.7368148807585404e-06, + "loss": 0.8006, + "step": 17180 + }, + { + "epoch": 0.8366079906507925, + "grad_norm": 1.2960702180862427, + "learning_rate": 2.7352223549181436e-06, + "loss": 0.7828, + "step": 17181 + }, + { + "epoch": 0.8366566844398997, + "grad_norm": 2.9316623210906982, + "learning_rate": 2.7336302585406584e-06, + "loss": 0.8417, + "step": 17182 + }, + { + "epoch": 0.8367053782290069, + "grad_norm": 1.481342077255249, + "learning_rate": 2.7320385916657e-06, + "loss": 0.8008, + "step": 17183 + }, + { + "epoch": 0.8367540720181141, + "grad_norm": 1.4078712463378906, + "learning_rate": 2.730447354332857e-06, + "loss": 0.8613, + "step": 17184 + }, + { + "epoch": 0.8368027658072212, + "grad_norm": 2.263561487197876, + "learning_rate": 2.7288565465817083e-06, + "loss": 0.7509, + "step": 17185 + }, + { + "epoch": 0.8368514595963285, + "grad_norm": 1.2810732126235962, + "learning_rate": 2.7272661684518296e-06, + "loss": 0.8285, + "step": 17186 + }, + { + "epoch": 0.8369001533854357, + "grad_norm": 5.036071300506592, + "learning_rate": 2.7256762199827804e-06, + "loss": 0.8574, + "step": 17187 + }, + { + "epoch": 0.8369488471745429, + "grad_norm": 1.4731613397598267, + "learning_rate": 2.724086701214108e-06, + "loss": 0.812, + "step": 17188 + }, + { + "epoch": 0.8369975409636501, + "grad_norm": 1.5932776927947998, + "learning_rate": 2.7224976121853575e-06, + "loss": 0.7824, + "step": 17189 + }, + { + "epoch": 0.8370462347527573, + "grad_norm": 2.532567262649536, + "learning_rate": 2.7209089529360455e-06, + "loss": 0.901, + "step": 17190 + }, + { + "epoch": 0.8370949285418645, + "grad_norm": 1.2510755062103271, + "learning_rate": 2.7193207235057115e-06, + "loss": 0.8933, + "step": 17191 + }, + { + "epoch": 0.8371436223309717, + "grad_norm": 1.5403201580047607, + "learning_rate": 2.717732923933838e-06, + "loss": 0.7742, + "step": 17192 + }, + { + "epoch": 0.8371923161200789, + "grad_norm": 1.6032737493515015, + "learning_rate": 2.7161455542599413e-06, + "loss": 0.8195, + "step": 17193 + }, + { + "epoch": 0.837241009909186, + "grad_norm": 1.6582127809524536, + "learning_rate": 2.714558614523497e-06, + "loss": 0.6992, + "step": 17194 + }, + { + "epoch": 0.8372897036982933, + "grad_norm": 1.3144187927246094, + "learning_rate": 2.7129721047639845e-06, + "loss": 0.6522, + "step": 17195 + }, + { + "epoch": 0.8373383974874005, + "grad_norm": 1.4381459951400757, + "learning_rate": 2.711386025020868e-06, + "loss": 0.7847, + "step": 17196 + }, + { + "epoch": 0.8373870912765077, + "grad_norm": 2.6387813091278076, + "learning_rate": 2.7098003753335934e-06, + "loss": 0.7928, + "step": 17197 + }, + { + "epoch": 0.8374357850656149, + "grad_norm": 1.3808112144470215, + "learning_rate": 2.7082151557416224e-06, + "loss": 0.9151, + "step": 17198 + }, + { + "epoch": 0.8374844788547221, + "grad_norm": 4.498668670654297, + "learning_rate": 2.706630366284366e-06, + "loss": 0.8101, + "step": 17199 + }, + { + "epoch": 0.8375331726438293, + "grad_norm": 1.3653677701950073, + "learning_rate": 2.7050460070012643e-06, + "loss": 0.7486, + "step": 17200 + }, + { + "epoch": 0.8375818664329365, + "grad_norm": 1.7770591974258423, + "learning_rate": 2.7034620779317134e-06, + "loss": 0.7069, + "step": 17201 + }, + { + "epoch": 0.8376305602220436, + "grad_norm": 1.5118708610534668, + "learning_rate": 2.7018785791151226e-06, + "loss": 0.8855, + "step": 17202 + }, + { + "epoch": 0.8376792540111508, + "grad_norm": 1.3641397953033447, + "learning_rate": 2.7002955105908803e-06, + "loss": 0.8504, + "step": 17203 + }, + { + "epoch": 0.837727947800258, + "grad_norm": 2.0855820178985596, + "learning_rate": 2.698712872398359e-06, + "loss": 0.7569, + "step": 17204 + }, + { + "epoch": 0.8377766415893653, + "grad_norm": 2.554766893386841, + "learning_rate": 2.697130664576941e-06, + "loss": 0.7716, + "step": 17205 + }, + { + "epoch": 0.8378253353784725, + "grad_norm": 1.6746776103973389, + "learning_rate": 2.6955488871659683e-06, + "loss": 0.8419, + "step": 17206 + }, + { + "epoch": 0.8378740291675797, + "grad_norm": 1.3286834955215454, + "learning_rate": 2.6939675402048028e-06, + "loss": 0.8206, + "step": 17207 + }, + { + "epoch": 0.8379227229566869, + "grad_norm": 2.265589714050293, + "learning_rate": 2.6923866237327632e-06, + "loss": 0.7346, + "step": 17208 + }, + { + "epoch": 0.8379714167457941, + "grad_norm": 1.8486905097961426, + "learning_rate": 2.69080613778919e-06, + "loss": 0.849, + "step": 17209 + }, + { + "epoch": 0.8380201105349012, + "grad_norm": 1.961128830909729, + "learning_rate": 2.689226082413392e-06, + "loss": 0.734, + "step": 17210 + }, + { + "epoch": 0.8380688043240084, + "grad_norm": 1.6604464054107666, + "learning_rate": 2.6876464576446727e-06, + "loss": 0.8426, + "step": 17211 + }, + { + "epoch": 0.8381174981131156, + "grad_norm": 2.0825839042663574, + "learning_rate": 2.6860672635223275e-06, + "loss": 0.8504, + "step": 17212 + }, + { + "epoch": 0.8381661919022229, + "grad_norm": 0.10199809074401855, + "learning_rate": 2.6844885000856334e-06, + "loss": 0.6543, + "step": 17213 + }, + { + "epoch": 0.8382148856913301, + "grad_norm": 3.163611888885498, + "learning_rate": 2.6829101673738735e-06, + "loss": 0.7646, + "step": 17214 + }, + { + "epoch": 0.8382635794804373, + "grad_norm": 0.09718948602676392, + "learning_rate": 2.6813322654262953e-06, + "loss": 0.6468, + "step": 17215 + }, + { + "epoch": 0.8383122732695445, + "grad_norm": 1.5521917343139648, + "learning_rate": 2.6797547942821635e-06, + "loss": 0.7907, + "step": 17216 + }, + { + "epoch": 0.8383609670586517, + "grad_norm": 1.5962960720062256, + "learning_rate": 2.678177753980702e-06, + "loss": 0.8239, + "step": 17217 + }, + { + "epoch": 0.8384096608477589, + "grad_norm": 2.190551519393921, + "learning_rate": 2.676601144561153e-06, + "loss": 0.8581, + "step": 17218 + }, + { + "epoch": 0.838458354636866, + "grad_norm": 1.6555951833724976, + "learning_rate": 2.675024966062729e-06, + "loss": 0.7374, + "step": 17219 + }, + { + "epoch": 0.8385070484259732, + "grad_norm": 2.7203190326690674, + "learning_rate": 2.6734492185246396e-06, + "loss": 0.7969, + "step": 17220 + }, + { + "epoch": 0.8385557422150804, + "grad_norm": 1.5905405282974243, + "learning_rate": 2.6718739019860817e-06, + "loss": 0.8552, + "step": 17221 + }, + { + "epoch": 0.8386044360041877, + "grad_norm": 1.455318808555603, + "learning_rate": 2.6702990164862354e-06, + "loss": 0.8685, + "step": 17222 + }, + { + "epoch": 0.8386531297932949, + "grad_norm": 1.5805046558380127, + "learning_rate": 2.668724562064289e-06, + "loss": 0.8563, + "step": 17223 + }, + { + "epoch": 0.8387018235824021, + "grad_norm": 1.9025615453720093, + "learning_rate": 2.667150538759391e-06, + "loss": 0.7842, + "step": 17224 + }, + { + "epoch": 0.8387505173715093, + "grad_norm": 1.283958911895752, + "learning_rate": 2.665576946610706e-06, + "loss": 0.7792, + "step": 17225 + }, + { + "epoch": 0.8387992111606165, + "grad_norm": 2.049013376235962, + "learning_rate": 2.6640037856573788e-06, + "loss": 0.7879, + "step": 17226 + }, + { + "epoch": 0.8388479049497236, + "grad_norm": 1.5792884826660156, + "learning_rate": 2.6624310559385348e-06, + "loss": 0.7327, + "step": 17227 + }, + { + "epoch": 0.8388965987388308, + "grad_norm": 2.1600401401519775, + "learning_rate": 2.660858757493301e-06, + "loss": 0.8737, + "step": 17228 + }, + { + "epoch": 0.838945292527938, + "grad_norm": 7.666391372680664, + "learning_rate": 2.659286890360786e-06, + "loss": 0.7819, + "step": 17229 + }, + { + "epoch": 0.8389939863170452, + "grad_norm": 1.7849669456481934, + "learning_rate": 2.65771545458009e-06, + "loss": 0.7824, + "step": 17230 + }, + { + "epoch": 0.8390426801061525, + "grad_norm": 1.3391774892807007, + "learning_rate": 2.6561444501903033e-06, + "loss": 0.8601, + "step": 17231 + }, + { + "epoch": 0.8390913738952597, + "grad_norm": 1.2944049835205078, + "learning_rate": 2.6545738772305018e-06, + "loss": 0.7419, + "step": 17232 + }, + { + "epoch": 0.8391400676843669, + "grad_norm": 1.3749412298202515, + "learning_rate": 2.653003735739761e-06, + "loss": 0.815, + "step": 17233 + }, + { + "epoch": 0.8391887614734741, + "grad_norm": 3.559094190597534, + "learning_rate": 2.651434025757131e-06, + "loss": 0.7783, + "step": 17234 + }, + { + "epoch": 0.8392374552625813, + "grad_norm": 3.594409942626953, + "learning_rate": 2.6498647473216645e-06, + "loss": 0.7945, + "step": 17235 + }, + { + "epoch": 0.8392861490516884, + "grad_norm": 1.6254559755325317, + "learning_rate": 2.648295900472393e-06, + "loss": 0.6766, + "step": 17236 + }, + { + "epoch": 0.8393348428407956, + "grad_norm": 1.6391546726226807, + "learning_rate": 2.6467274852483438e-06, + "loss": 0.7862, + "step": 17237 + }, + { + "epoch": 0.8393835366299028, + "grad_norm": 1.616137981414795, + "learning_rate": 2.6451595016885303e-06, + "loss": 0.7874, + "step": 17238 + }, + { + "epoch": 0.83943223041901, + "grad_norm": 1.8268098831176758, + "learning_rate": 2.643591949831956e-06, + "loss": 0.7636, + "step": 17239 + }, + { + "epoch": 0.8394809242081173, + "grad_norm": 1.7380104064941406, + "learning_rate": 2.6420248297176152e-06, + "loss": 0.9025, + "step": 17240 + }, + { + "epoch": 0.8395296179972245, + "grad_norm": 2.387916326522827, + "learning_rate": 2.640458141384483e-06, + "loss": 0.7844, + "step": 17241 + }, + { + "epoch": 0.8395783117863317, + "grad_norm": 1.8564670085906982, + "learning_rate": 2.638891884871544e-06, + "loss": 0.8085, + "step": 17242 + }, + { + "epoch": 0.8396270055754389, + "grad_norm": 1.4333581924438477, + "learning_rate": 2.637326060217751e-06, + "loss": 0.8844, + "step": 17243 + }, + { + "epoch": 0.839675699364546, + "grad_norm": 4.309941291809082, + "learning_rate": 2.6357606674620573e-06, + "loss": 0.8496, + "step": 17244 + }, + { + "epoch": 0.8397243931536532, + "grad_norm": 1.409935474395752, + "learning_rate": 2.6341957066433986e-06, + "loss": 0.8789, + "step": 17245 + }, + { + "epoch": 0.8397730869427604, + "grad_norm": 1.2419589757919312, + "learning_rate": 2.632631177800704e-06, + "loss": 0.7855, + "step": 17246 + }, + { + "epoch": 0.8398217807318676, + "grad_norm": 1.9316229820251465, + "learning_rate": 2.6310670809728933e-06, + "loss": 0.795, + "step": 17247 + }, + { + "epoch": 0.8398704745209749, + "grad_norm": 2.285425901412964, + "learning_rate": 2.6295034161988686e-06, + "loss": 0.8341, + "step": 17248 + }, + { + "epoch": 0.8399191683100821, + "grad_norm": 2.6902034282684326, + "learning_rate": 2.627940183517539e-06, + "loss": 0.836, + "step": 17249 + }, + { + "epoch": 0.8399678620991893, + "grad_norm": 1.9428852796554565, + "learning_rate": 2.626377382967773e-06, + "loss": 0.9036, + "step": 17250 + }, + { + "epoch": 0.8400165558882965, + "grad_norm": 2.0131161212921143, + "learning_rate": 2.624815014588458e-06, + "loss": 0.8086, + "step": 17251 + }, + { + "epoch": 0.8400652496774037, + "grad_norm": 2.3991901874542236, + "learning_rate": 2.6232530784184527e-06, + "loss": 0.8031, + "step": 17252 + }, + { + "epoch": 0.8401139434665108, + "grad_norm": 1.4590588808059692, + "learning_rate": 2.621691574496612e-06, + "loss": 0.7922, + "step": 17253 + }, + { + "epoch": 0.840162637255618, + "grad_norm": 1.2498481273651123, + "learning_rate": 2.6201305028617773e-06, + "loss": 0.8555, + "step": 17254 + }, + { + "epoch": 0.8402113310447252, + "grad_norm": 1.535696268081665, + "learning_rate": 2.6185698635527778e-06, + "loss": 0.8493, + "step": 17255 + }, + { + "epoch": 0.8402600248338324, + "grad_norm": 0.10089097172021866, + "learning_rate": 2.6170096566084445e-06, + "loss": 0.6936, + "step": 17256 + }, + { + "epoch": 0.8403087186229397, + "grad_norm": 2.1233530044555664, + "learning_rate": 2.615449882067573e-06, + "loss": 0.7774, + "step": 17257 + }, + { + "epoch": 0.8403574124120469, + "grad_norm": 2.6535873413085938, + "learning_rate": 2.6138905399689817e-06, + "loss": 0.8139, + "step": 17258 + }, + { + "epoch": 0.8404061062011541, + "grad_norm": 1.5573583841323853, + "learning_rate": 2.6123316303514366e-06, + "loss": 0.78, + "step": 17259 + }, + { + "epoch": 0.8404547999902613, + "grad_norm": 1.5015544891357422, + "learning_rate": 2.6107731532537338e-06, + "loss": 0.7717, + "step": 17260 + }, + { + "epoch": 0.8405034937793684, + "grad_norm": 1.670945405960083, + "learning_rate": 2.6092151087146333e-06, + "loss": 0.7639, + "step": 17261 + }, + { + "epoch": 0.8405521875684756, + "grad_norm": 1.566240906715393, + "learning_rate": 2.607657496772893e-06, + "loss": 0.8056, + "step": 17262 + }, + { + "epoch": 0.8406008813575828, + "grad_norm": 1.5686322450637817, + "learning_rate": 2.6061003174672574e-06, + "loss": 0.9199, + "step": 17263 + }, + { + "epoch": 0.84064957514669, + "grad_norm": 1.2050373554229736, + "learning_rate": 2.6045435708364595e-06, + "loss": 0.7595, + "step": 17264 + }, + { + "epoch": 0.8406982689357972, + "grad_norm": 1.3437978029251099, + "learning_rate": 2.602987256919234e-06, + "loss": 0.7819, + "step": 17265 + }, + { + "epoch": 0.8407469627249045, + "grad_norm": 1.2045695781707764, + "learning_rate": 2.6014313757542796e-06, + "loss": 0.7676, + "step": 17266 + }, + { + "epoch": 0.8407956565140117, + "grad_norm": 1.2511268854141235, + "learning_rate": 2.599875927380313e-06, + "loss": 0.8171, + "step": 17267 + }, + { + "epoch": 0.8408443503031189, + "grad_norm": 1.5384888648986816, + "learning_rate": 2.5983209118360096e-06, + "loss": 0.8458, + "step": 17268 + }, + { + "epoch": 0.840893044092226, + "grad_norm": 2.1140286922454834, + "learning_rate": 2.5967663291600676e-06, + "loss": 0.7314, + "step": 17269 + }, + { + "epoch": 0.8409417378813332, + "grad_norm": 1.2629618644714355, + "learning_rate": 2.5952121793911467e-06, + "loss": 0.8012, + "step": 17270 + }, + { + "epoch": 0.8409904316704404, + "grad_norm": 2.0282235145568848, + "learning_rate": 2.5936584625679074e-06, + "loss": 0.8705, + "step": 17271 + }, + { + "epoch": 0.8410391254595476, + "grad_norm": 1.229695439338684, + "learning_rate": 2.5921051787290096e-06, + "loss": 0.7853, + "step": 17272 + }, + { + "epoch": 0.8410878192486548, + "grad_norm": 1.7718795537948608, + "learning_rate": 2.5905523279130716e-06, + "loss": 0.7534, + "step": 17273 + }, + { + "epoch": 0.841136513037762, + "grad_norm": 1.2886533737182617, + "learning_rate": 2.5889999101587424e-06, + "loss": 0.7329, + "step": 17274 + }, + { + "epoch": 0.8411852068268693, + "grad_norm": 1.5307387113571167, + "learning_rate": 2.5874479255046204e-06, + "loss": 0.789, + "step": 17275 + }, + { + "epoch": 0.8412339006159765, + "grad_norm": 2.284999132156372, + "learning_rate": 2.5858963739893205e-06, + "loss": 0.8527, + "step": 17276 + }, + { + "epoch": 0.8412825944050837, + "grad_norm": 1.3884329795837402, + "learning_rate": 2.5843452556514394e-06, + "loss": 0.7908, + "step": 17277 + }, + { + "epoch": 0.8413312881941908, + "grad_norm": 3.7928502559661865, + "learning_rate": 2.5827945705295545e-06, + "loss": 0.7346, + "step": 17278 + }, + { + "epoch": 0.841379981983298, + "grad_norm": 1.2674102783203125, + "learning_rate": 2.581244318662244e-06, + "loss": 0.7903, + "step": 17279 + }, + { + "epoch": 0.8414286757724052, + "grad_norm": 1.9726052284240723, + "learning_rate": 2.5796945000880703e-06, + "loss": 0.8457, + "step": 17280 + }, + { + "epoch": 0.8414773695615124, + "grad_norm": 1.6261295080184937, + "learning_rate": 2.5781451148455827e-06, + "loss": 0.8107, + "step": 17281 + }, + { + "epoch": 0.8415260633506196, + "grad_norm": 2.091729164123535, + "learning_rate": 2.5765961629733216e-06, + "loss": 0.7374, + "step": 17282 + }, + { + "epoch": 0.8415747571397268, + "grad_norm": 1.3865798711776733, + "learning_rate": 2.5750476445098206e-06, + "loss": 0.8412, + "step": 17283 + }, + { + "epoch": 0.8416234509288341, + "grad_norm": 1.5588438510894775, + "learning_rate": 2.573499559493602e-06, + "loss": 0.8679, + "step": 17284 + }, + { + "epoch": 0.8416721447179413, + "grad_norm": 1.6497926712036133, + "learning_rate": 2.571951907963168e-06, + "loss": 0.8586, + "step": 17285 + }, + { + "epoch": 0.8417208385070484, + "grad_norm": 1.5879340171813965, + "learning_rate": 2.57040468995702e-06, + "loss": 0.7381, + "step": 17286 + }, + { + "epoch": 0.8417695322961556, + "grad_norm": 1.3307316303253174, + "learning_rate": 2.5688579055136443e-06, + "loss": 0.8302, + "step": 17287 + }, + { + "epoch": 0.8418182260852628, + "grad_norm": 0.09741099923849106, + "learning_rate": 2.567311554671519e-06, + "loss": 0.5742, + "step": 17288 + }, + { + "epoch": 0.84186691987437, + "grad_norm": 1.7188035249710083, + "learning_rate": 2.5657656374691066e-06, + "loss": 0.8672, + "step": 17289 + }, + { + "epoch": 0.8419156136634772, + "grad_norm": 1.6607816219329834, + "learning_rate": 2.564220153944863e-06, + "loss": 0.7632, + "step": 17290 + }, + { + "epoch": 0.8419643074525844, + "grad_norm": 1.916124701499939, + "learning_rate": 2.562675104137229e-06, + "loss": 0.8603, + "step": 17291 + }, + { + "epoch": 0.8420130012416916, + "grad_norm": 1.5766040086746216, + "learning_rate": 2.5611304880846465e-06, + "loss": 0.7646, + "step": 17292 + }, + { + "epoch": 0.8420616950307989, + "grad_norm": 0.09497740119695663, + "learning_rate": 2.559586305825534e-06, + "loss": 0.5847, + "step": 17293 + }, + { + "epoch": 0.8421103888199061, + "grad_norm": 1.7633016109466553, + "learning_rate": 2.5580425573983016e-06, + "loss": 0.8367, + "step": 17294 + }, + { + "epoch": 0.8421590826090132, + "grad_norm": 1.4525489807128906, + "learning_rate": 2.5564992428413506e-06, + "loss": 0.7113, + "step": 17295 + }, + { + "epoch": 0.8422077763981204, + "grad_norm": 1.5012481212615967, + "learning_rate": 2.5549563621930706e-06, + "loss": 0.7965, + "step": 17296 + }, + { + "epoch": 0.8422564701872276, + "grad_norm": 1.234763264656067, + "learning_rate": 2.5534139154918426e-06, + "loss": 0.8683, + "step": 17297 + }, + { + "epoch": 0.8423051639763348, + "grad_norm": 1.8910768032073975, + "learning_rate": 2.551871902776035e-06, + "loss": 0.8411, + "step": 17298 + }, + { + "epoch": 0.842353857765442, + "grad_norm": 1.9034854173660278, + "learning_rate": 2.5503303240839984e-06, + "loss": 0.8988, + "step": 17299 + }, + { + "epoch": 0.8424025515545492, + "grad_norm": 1.4923756122589111, + "learning_rate": 2.5487891794540964e-06, + "loss": 0.8875, + "step": 17300 + }, + { + "epoch": 0.8424512453436565, + "grad_norm": 1.8739746809005737, + "learning_rate": 2.5472484689246457e-06, + "loss": 0.7882, + "step": 17301 + }, + { + "epoch": 0.8424999391327637, + "grad_norm": 1.4387884140014648, + "learning_rate": 2.5457081925339845e-06, + "loss": 0.8748, + "step": 17302 + }, + { + "epoch": 0.8425486329218708, + "grad_norm": 2.1663856506347656, + "learning_rate": 2.544168350320424e-06, + "loss": 0.9283, + "step": 17303 + }, + { + "epoch": 0.842597326710978, + "grad_norm": 1.189970850944519, + "learning_rate": 2.542628942322265e-06, + "loss": 0.8109, + "step": 17304 + }, + { + "epoch": 0.8426460205000852, + "grad_norm": 5.328514099121094, + "learning_rate": 2.541089968577806e-06, + "loss": 0.7511, + "step": 17305 + }, + { + "epoch": 0.8426947142891924, + "grad_norm": 1.6851404905319214, + "learning_rate": 2.53955142912532e-06, + "loss": 0.8623, + "step": 17306 + }, + { + "epoch": 0.8427434080782996, + "grad_norm": 1.760599136352539, + "learning_rate": 2.5380133240030922e-06, + "loss": 0.8839, + "step": 17307 + }, + { + "epoch": 0.8427921018674068, + "grad_norm": 1.469456672668457, + "learning_rate": 2.5364756532493685e-06, + "loss": 0.7598, + "step": 17308 + }, + { + "epoch": 0.842840795656514, + "grad_norm": 1.6491210460662842, + "learning_rate": 2.534938416902413e-06, + "loss": 0.8446, + "step": 17309 + }, + { + "epoch": 0.8428894894456213, + "grad_norm": 1.7238341569900513, + "learning_rate": 2.5334016150004457e-06, + "loss": 0.9208, + "step": 17310 + }, + { + "epoch": 0.8429381832347284, + "grad_norm": 1.3563166856765747, + "learning_rate": 2.531865247581713e-06, + "loss": 0.7952, + "step": 17311 + }, + { + "epoch": 0.8429868770238356, + "grad_norm": 1.4373880624771118, + "learning_rate": 2.5303293146844233e-06, + "loss": 0.779, + "step": 17312 + }, + { + "epoch": 0.8430355708129428, + "grad_norm": 1.966362476348877, + "learning_rate": 2.528793816346782e-06, + "loss": 0.8716, + "step": 17313 + }, + { + "epoch": 0.84308426460205, + "grad_norm": 1.6911485195159912, + "learning_rate": 2.527258752606996e-06, + "loss": 0.7867, + "step": 17314 + }, + { + "epoch": 0.8431329583911572, + "grad_norm": 1.2857836484909058, + "learning_rate": 2.5257241235032305e-06, + "loss": 0.8883, + "step": 17315 + }, + { + "epoch": 0.8431816521802644, + "grad_norm": 1.3623298406600952, + "learning_rate": 2.5241899290736814e-06, + "loss": 0.8198, + "step": 17316 + }, + { + "epoch": 0.8432303459693716, + "grad_norm": 1.330803394317627, + "learning_rate": 2.522656169356492e-06, + "loss": 0.7369, + "step": 17317 + }, + { + "epoch": 0.8432790397584788, + "grad_norm": 1.5628411769866943, + "learning_rate": 2.5211228443898293e-06, + "loss": 0.8089, + "step": 17318 + }, + { + "epoch": 0.8433277335475861, + "grad_norm": 1.391345500946045, + "learning_rate": 2.5195899542118274e-06, + "loss": 0.8073, + "step": 17319 + }, + { + "epoch": 0.8433764273366932, + "grad_norm": 2.1131503582000732, + "learning_rate": 2.518057498860622e-06, + "loss": 0.885, + "step": 17320 + }, + { + "epoch": 0.8434251211258004, + "grad_norm": 1.6875827312469482, + "learning_rate": 2.5165254783743274e-06, + "loss": 0.8862, + "step": 17321 + }, + { + "epoch": 0.8434738149149076, + "grad_norm": 2.385406732559204, + "learning_rate": 2.5149938927910533e-06, + "loss": 0.8409, + "step": 17322 + }, + { + "epoch": 0.8435225087040148, + "grad_norm": 1.150774598121643, + "learning_rate": 2.5134627421489087e-06, + "loss": 0.9039, + "step": 17323 + }, + { + "epoch": 0.843571202493122, + "grad_norm": 1.6059828996658325, + "learning_rate": 2.5119320264859635e-06, + "loss": 0.7976, + "step": 17324 + }, + { + "epoch": 0.8436198962822292, + "grad_norm": 1.660154938697815, + "learning_rate": 2.5104017458403118e-06, + "loss": 0.7656, + "step": 17325 + }, + { + "epoch": 0.8436685900713364, + "grad_norm": 1.932831883430481, + "learning_rate": 2.508871900250003e-06, + "loss": 0.7893, + "step": 17326 + }, + { + "epoch": 0.8437172838604436, + "grad_norm": 2.394554376602173, + "learning_rate": 2.5073424897531063e-06, + "loss": 0.7106, + "step": 17327 + }, + { + "epoch": 0.8437659776495507, + "grad_norm": 1.742538332939148, + "learning_rate": 2.5058135143876585e-06, + "loss": 0.8052, + "step": 17328 + }, + { + "epoch": 0.843814671438658, + "grad_norm": 1.5071160793304443, + "learning_rate": 2.5042849741916952e-06, + "loss": 0.8044, + "step": 17329 + }, + { + "epoch": 0.8438633652277652, + "grad_norm": 1.5303707122802734, + "learning_rate": 2.5027568692032376e-06, + "loss": 0.834, + "step": 17330 + }, + { + "epoch": 0.8439120590168724, + "grad_norm": 1.3322854042053223, + "learning_rate": 2.5012291994602934e-06, + "loss": 0.8284, + "step": 17331 + }, + { + "epoch": 0.8439607528059796, + "grad_norm": 1.1796178817749023, + "learning_rate": 2.499701965000878e-06, + "loss": 0.7097, + "step": 17332 + }, + { + "epoch": 0.8440094465950868, + "grad_norm": 1.52289879322052, + "learning_rate": 2.498175165862964e-06, + "loss": 0.7645, + "step": 17333 + }, + { + "epoch": 0.844058140384194, + "grad_norm": 1.5619596242904663, + "learning_rate": 2.4966488020845425e-06, + "loss": 0.7812, + "step": 17334 + }, + { + "epoch": 0.8441068341733012, + "grad_norm": 1.8761383295059204, + "learning_rate": 2.4951228737035772e-06, + "loss": 0.7169, + "step": 17335 + }, + { + "epoch": 0.8441555279624084, + "grad_norm": 1.519839882850647, + "learning_rate": 2.493597380758026e-06, + "loss": 0.7814, + "step": 17336 + }, + { + "epoch": 0.8442042217515155, + "grad_norm": 1.9235033988952637, + "learning_rate": 2.4920723232858367e-06, + "loss": 0.8328, + "step": 17337 + }, + { + "epoch": 0.8442529155406228, + "grad_norm": 1.5554295778274536, + "learning_rate": 2.490547701324946e-06, + "loss": 0.8687, + "step": 17338 + }, + { + "epoch": 0.84430160932973, + "grad_norm": 1.8039802312850952, + "learning_rate": 2.4890235149132758e-06, + "loss": 0.7798, + "step": 17339 + }, + { + "epoch": 0.8443503031188372, + "grad_norm": 1.3558099269866943, + "learning_rate": 2.487499764088739e-06, + "loss": 0.819, + "step": 17340 + }, + { + "epoch": 0.8443989969079444, + "grad_norm": 0.10464834421873093, + "learning_rate": 2.4859764488892445e-06, + "loss": 0.5832, + "step": 17341 + }, + { + "epoch": 0.8444476906970516, + "grad_norm": 1.4846009016036987, + "learning_rate": 2.484453569352685e-06, + "loss": 0.8288, + "step": 17342 + }, + { + "epoch": 0.8444963844861588, + "grad_norm": 0.0965462177991867, + "learning_rate": 2.482931125516941e-06, + "loss": 0.608, + "step": 17343 + }, + { + "epoch": 0.844545078275266, + "grad_norm": 1.6477371454238892, + "learning_rate": 2.4814091174198797e-06, + "loss": 0.778, + "step": 17344 + }, + { + "epoch": 0.8445937720643731, + "grad_norm": 0.09523101150989532, + "learning_rate": 2.4798875450993644e-06, + "loss": 0.6547, + "step": 17345 + }, + { + "epoch": 0.8446424658534804, + "grad_norm": 1.723440170288086, + "learning_rate": 2.4783664085932445e-06, + "loss": 0.7621, + "step": 17346 + }, + { + "epoch": 0.8446911596425876, + "grad_norm": 3.787276029586792, + "learning_rate": 2.476845707939357e-06, + "loss": 0.8243, + "step": 17347 + }, + { + "epoch": 0.8447398534316948, + "grad_norm": 2.0627965927124023, + "learning_rate": 2.4753254431755314e-06, + "loss": 0.8024, + "step": 17348 + }, + { + "epoch": 0.844788547220802, + "grad_norm": 1.3091893196105957, + "learning_rate": 2.4738056143395837e-06, + "loss": 0.8178, + "step": 17349 + }, + { + "epoch": 0.8448372410099092, + "grad_norm": 1.9410574436187744, + "learning_rate": 2.4722862214693133e-06, + "loss": 0.9076, + "step": 17350 + }, + { + "epoch": 0.8448859347990164, + "grad_norm": 2.9095306396484375, + "learning_rate": 2.470767264602527e-06, + "loss": 0.7112, + "step": 17351 + }, + { + "epoch": 0.8449346285881236, + "grad_norm": 1.7843493223190308, + "learning_rate": 2.469248743777002e-06, + "loss": 0.8434, + "step": 17352 + }, + { + "epoch": 0.8449833223772308, + "grad_norm": 1.4341734647750854, + "learning_rate": 2.467730659030514e-06, + "loss": 0.9105, + "step": 17353 + }, + { + "epoch": 0.8450320161663379, + "grad_norm": 1.4523723125457764, + "learning_rate": 2.4662130104008242e-06, + "loss": 0.7992, + "step": 17354 + }, + { + "epoch": 0.8450807099554452, + "grad_norm": 1.868180513381958, + "learning_rate": 2.4646957979256823e-06, + "loss": 0.7886, + "step": 17355 + }, + { + "epoch": 0.8451294037445524, + "grad_norm": 1.8572708368301392, + "learning_rate": 2.463179021642834e-06, + "loss": 0.8285, + "step": 17356 + }, + { + "epoch": 0.8451780975336596, + "grad_norm": 1.2520872354507446, + "learning_rate": 2.46166268159e-06, + "loss": 0.8179, + "step": 17357 + }, + { + "epoch": 0.8452267913227668, + "grad_norm": 0.09808409959077835, + "learning_rate": 2.460146777804917e-06, + "loss": 0.6416, + "step": 17358 + }, + { + "epoch": 0.845275485111874, + "grad_norm": 1.8045753240585327, + "learning_rate": 2.4586313103252724e-06, + "loss": 0.7249, + "step": 17359 + }, + { + "epoch": 0.8453241789009812, + "grad_norm": 1.7003992795944214, + "learning_rate": 2.457116279188776e-06, + "loss": 0.7895, + "step": 17360 + }, + { + "epoch": 0.8453728726900884, + "grad_norm": 1.3472861051559448, + "learning_rate": 2.4556016844331134e-06, + "loss": 0.817, + "step": 17361 + }, + { + "epoch": 0.8454215664791955, + "grad_norm": 1.248414397239685, + "learning_rate": 2.4540875260959585e-06, + "loss": 0.7943, + "step": 17362 + }, + { + "epoch": 0.8454702602683027, + "grad_norm": 1.0750555992126465, + "learning_rate": 2.4525738042149747e-06, + "loss": 0.8612, + "step": 17363 + }, + { + "epoch": 0.84551895405741, + "grad_norm": 1.5790280103683472, + "learning_rate": 2.451060518827815e-06, + "loss": 0.8368, + "step": 17364 + }, + { + "epoch": 0.8455676478465172, + "grad_norm": 1.5357805490493774, + "learning_rate": 2.449547669972132e-06, + "loss": 0.8363, + "step": 17365 + }, + { + "epoch": 0.8456163416356244, + "grad_norm": 1.600859522819519, + "learning_rate": 2.4480352576855436e-06, + "loss": 0.7524, + "step": 17366 + }, + { + "epoch": 0.8456650354247316, + "grad_norm": 2.316601514816284, + "learning_rate": 2.4465232820056863e-06, + "loss": 0.7628, + "step": 17367 + }, + { + "epoch": 0.8457137292138388, + "grad_norm": 1.3866767883300781, + "learning_rate": 2.445011742970156e-06, + "loss": 0.7979, + "step": 17368 + }, + { + "epoch": 0.845762423002946, + "grad_norm": 1.751624345779419, + "learning_rate": 2.4435006406165606e-06, + "loss": 0.7601, + "step": 17369 + }, + { + "epoch": 0.8458111167920531, + "grad_norm": 1.5443763732910156, + "learning_rate": 2.4419899749824904e-06, + "loss": 0.8695, + "step": 17370 + }, + { + "epoch": 0.8458598105811603, + "grad_norm": 1.4690399169921875, + "learning_rate": 2.440479746105517e-06, + "loss": 0.7908, + "step": 17371 + }, + { + "epoch": 0.8459085043702675, + "grad_norm": 1.1585968732833862, + "learning_rate": 2.4389699540232137e-06, + "loss": 0.774, + "step": 17372 + }, + { + "epoch": 0.8459571981593748, + "grad_norm": 1.5707920789718628, + "learning_rate": 2.4374605987731293e-06, + "loss": 0.8243, + "step": 17373 + }, + { + "epoch": 0.846005891948482, + "grad_norm": 1.6226762533187866, + "learning_rate": 2.4359516803928205e-06, + "loss": 0.8271, + "step": 17374 + }, + { + "epoch": 0.8460545857375892, + "grad_norm": 1.6145870685577393, + "learning_rate": 2.4344431989198093e-06, + "loss": 0.8364, + "step": 17375 + }, + { + "epoch": 0.8461032795266964, + "grad_norm": 4.650728702545166, + "learning_rate": 2.4329351543916315e-06, + "loss": 0.8647, + "step": 17376 + }, + { + "epoch": 0.8461519733158036, + "grad_norm": 2.124213218688965, + "learning_rate": 2.431427546845786e-06, + "loss": 0.8316, + "step": 17377 + }, + { + "epoch": 0.8462006671049108, + "grad_norm": 1.379731297492981, + "learning_rate": 2.4299203763197855e-06, + "loss": 0.7924, + "step": 17378 + }, + { + "epoch": 0.8462493608940179, + "grad_norm": 2.198552370071411, + "learning_rate": 2.4284136428511195e-06, + "loss": 0.8204, + "step": 17379 + }, + { + "epoch": 0.8462980546831251, + "grad_norm": 1.3973665237426758, + "learning_rate": 2.426907346477261e-06, + "loss": 0.7747, + "step": 17380 + }, + { + "epoch": 0.8463467484722323, + "grad_norm": 2.1309814453125, + "learning_rate": 2.4254014872356925e-06, + "loss": 0.8491, + "step": 17381 + }, + { + "epoch": 0.8463954422613396, + "grad_norm": 2.0551271438598633, + "learning_rate": 2.4238960651638554e-06, + "loss": 0.7918, + "step": 17382 + }, + { + "epoch": 0.8464441360504468, + "grad_norm": 3.7213187217712402, + "learning_rate": 2.422391080299216e-06, + "loss": 0.8059, + "step": 17383 + }, + { + "epoch": 0.846492829839554, + "grad_norm": 1.2826755046844482, + "learning_rate": 2.420886532679192e-06, + "loss": 0.7742, + "step": 17384 + }, + { + "epoch": 0.8465415236286612, + "grad_norm": 2.6626474857330322, + "learning_rate": 2.419382422341221e-06, + "loss": 0.7308, + "step": 17385 + }, + { + "epoch": 0.8465902174177684, + "grad_norm": 1.4948595762252808, + "learning_rate": 2.4178787493227175e-06, + "loss": 0.8082, + "step": 17386 + }, + { + "epoch": 0.8466389112068755, + "grad_norm": 1.8838263750076294, + "learning_rate": 2.4163755136610824e-06, + "loss": 0.8728, + "step": 17387 + }, + { + "epoch": 0.8466876049959827, + "grad_norm": 2.3805413246154785, + "learning_rate": 2.41487271539371e-06, + "loss": 0.6591, + "step": 17388 + }, + { + "epoch": 0.8467362987850899, + "grad_norm": 1.7061771154403687, + "learning_rate": 2.4133703545579824e-06, + "loss": 0.7774, + "step": 17389 + }, + { + "epoch": 0.8467849925741971, + "grad_norm": 1.404200553894043, + "learning_rate": 2.4118684311912686e-06, + "loss": 0.8059, + "step": 17390 + }, + { + "epoch": 0.8468336863633044, + "grad_norm": 2.258920907974243, + "learning_rate": 2.4103669453309287e-06, + "loss": 0.7852, + "step": 17391 + }, + { + "epoch": 0.8468823801524116, + "grad_norm": 1.3363052606582642, + "learning_rate": 2.4088658970143184e-06, + "loss": 0.9017, + "step": 17392 + }, + { + "epoch": 0.8469310739415188, + "grad_norm": 1.4400172233581543, + "learning_rate": 2.4073652862787735e-06, + "loss": 0.7886, + "step": 17393 + }, + { + "epoch": 0.846979767730626, + "grad_norm": 1.2530847787857056, + "learning_rate": 2.4058651131616208e-06, + "loss": 0.7111, + "step": 17394 + }, + { + "epoch": 0.8470284615197332, + "grad_norm": 1.5462359189987183, + "learning_rate": 2.4043653777001772e-06, + "loss": 0.8296, + "step": 17395 + }, + { + "epoch": 0.8470771553088403, + "grad_norm": 2.1058599948883057, + "learning_rate": 2.4028660799317496e-06, + "loss": 0.8662, + "step": 17396 + }, + { + "epoch": 0.8471258490979475, + "grad_norm": 1.3720256090164185, + "learning_rate": 2.4013672198936335e-06, + "loss": 0.8169, + "step": 17397 + }, + { + "epoch": 0.8471745428870547, + "grad_norm": 1.4940847158432007, + "learning_rate": 2.3998687976231104e-06, + "loss": 0.7697, + "step": 17398 + }, + { + "epoch": 0.847223236676162, + "grad_norm": 0.08956866711378098, + "learning_rate": 2.3983708131574534e-06, + "loss": 0.5969, + "step": 17399 + }, + { + "epoch": 0.8472719304652692, + "grad_norm": 1.6538751125335693, + "learning_rate": 2.39687326653393e-06, + "loss": 0.8065, + "step": 17400 + }, + { + "epoch": 0.8473206242543764, + "grad_norm": 1.9035452604293823, + "learning_rate": 2.3953761577897907e-06, + "loss": 0.8137, + "step": 17401 + }, + { + "epoch": 0.8473693180434836, + "grad_norm": 1.562791347503662, + "learning_rate": 2.393879486962274e-06, + "loss": 0.7949, + "step": 17402 + }, + { + "epoch": 0.8474180118325908, + "grad_norm": 1.7130693197250366, + "learning_rate": 2.3923832540886083e-06, + "loss": 0.8531, + "step": 17403 + }, + { + "epoch": 0.8474667056216979, + "grad_norm": 1.2799923419952393, + "learning_rate": 2.3908874592060172e-06, + "loss": 0.8528, + "step": 17404 + }, + { + "epoch": 0.8475153994108051, + "grad_norm": 1.7099400758743286, + "learning_rate": 2.389392102351704e-06, + "loss": 0.8538, + "step": 17405 + }, + { + "epoch": 0.8475640931999123, + "grad_norm": 1.4962817430496216, + "learning_rate": 2.3878971835628704e-06, + "loss": 0.8271, + "step": 17406 + }, + { + "epoch": 0.8476127869890195, + "grad_norm": 0.09594567865133286, + "learning_rate": 2.3864027028766977e-06, + "loss": 0.5746, + "step": 17407 + }, + { + "epoch": 0.8476614807781268, + "grad_norm": 2.277848720550537, + "learning_rate": 2.3849086603303605e-06, + "loss": 0.7558, + "step": 17408 + }, + { + "epoch": 0.847710174567234, + "grad_norm": 1.3358553647994995, + "learning_rate": 2.383415055961036e-06, + "loss": 0.8654, + "step": 17409 + }, + { + "epoch": 0.8477588683563412, + "grad_norm": 1.3490265607833862, + "learning_rate": 2.3819218898058584e-06, + "loss": 0.7289, + "step": 17410 + }, + { + "epoch": 0.8478075621454484, + "grad_norm": 1.5119463205337524, + "learning_rate": 2.380429161901985e-06, + "loss": 0.7977, + "step": 17411 + }, + { + "epoch": 0.8478562559345556, + "grad_norm": 1.4387447834014893, + "learning_rate": 2.3789368722865438e-06, + "loss": 0.8763, + "step": 17412 + }, + { + "epoch": 0.8479049497236627, + "grad_norm": 1.9222440719604492, + "learning_rate": 2.3774450209966517e-06, + "loss": 0.8042, + "step": 17413 + }, + { + "epoch": 0.8479536435127699, + "grad_norm": 1.365031123161316, + "learning_rate": 2.3759536080694233e-06, + "loss": 0.8601, + "step": 17414 + }, + { + "epoch": 0.8480023373018771, + "grad_norm": 3.407649517059326, + "learning_rate": 2.3744626335419518e-06, + "loss": 0.8382, + "step": 17415 + }, + { + "epoch": 0.8480510310909843, + "grad_norm": 1.4832427501678467, + "learning_rate": 2.372972097451336e-06, + "loss": 0.7544, + "step": 17416 + }, + { + "epoch": 0.8480997248800916, + "grad_norm": 1.582663655281067, + "learning_rate": 2.3714819998346393e-06, + "loss": 0.7979, + "step": 17417 + }, + { + "epoch": 0.8481484186691988, + "grad_norm": 1.2162953615188599, + "learning_rate": 2.3699923407289438e-06, + "loss": 0.7342, + "step": 17418 + }, + { + "epoch": 0.848197112458306, + "grad_norm": 4.4388837814331055, + "learning_rate": 2.3685031201712884e-06, + "loss": 0.8975, + "step": 17419 + }, + { + "epoch": 0.8482458062474132, + "grad_norm": 2.2793169021606445, + "learning_rate": 2.3670143381987276e-06, + "loss": 0.7793, + "step": 17420 + }, + { + "epoch": 0.8482945000365203, + "grad_norm": 1.4147801399230957, + "learning_rate": 2.3655259948482943e-06, + "loss": 0.8916, + "step": 17421 + }, + { + "epoch": 0.8483431938256275, + "grad_norm": 1.3247499465942383, + "learning_rate": 2.364038090157006e-06, + "loss": 0.8621, + "step": 17422 + }, + { + "epoch": 0.8483918876147347, + "grad_norm": 3.9010612964630127, + "learning_rate": 2.3625506241618854e-06, + "loss": 0.7846, + "step": 17423 + }, + { + "epoch": 0.8484405814038419, + "grad_norm": 0.0965176522731781, + "learning_rate": 2.3610635968999173e-06, + "loss": 0.5512, + "step": 17424 + }, + { + "epoch": 0.8484892751929491, + "grad_norm": 2.46096134185791, + "learning_rate": 2.3595770084081092e-06, + "loss": 0.9056, + "step": 17425 + }, + { + "epoch": 0.8485379689820564, + "grad_norm": 1.5188106298446655, + "learning_rate": 2.3580908587234254e-06, + "loss": 0.8126, + "step": 17426 + }, + { + "epoch": 0.8485866627711636, + "grad_norm": 1.9146517515182495, + "learning_rate": 2.3566051478828423e-06, + "loss": 0.7052, + "step": 17427 + }, + { + "epoch": 0.8486353565602708, + "grad_norm": 1.1540744304656982, + "learning_rate": 2.355119875923315e-06, + "loss": 0.9472, + "step": 17428 + }, + { + "epoch": 0.8486840503493779, + "grad_norm": 1.5374112129211426, + "learning_rate": 2.3536350428817876e-06, + "loss": 0.7254, + "step": 17429 + }, + { + "epoch": 0.8487327441384851, + "grad_norm": 1.6453399658203125, + "learning_rate": 2.3521506487951994e-06, + "loss": 0.8113, + "step": 17430 + }, + { + "epoch": 0.8487814379275923, + "grad_norm": 2.1343352794647217, + "learning_rate": 2.3506666937004696e-06, + "loss": 0.8062, + "step": 17431 + }, + { + "epoch": 0.8488301317166995, + "grad_norm": 1.8736169338226318, + "learning_rate": 2.349183177634522e-06, + "loss": 0.7966, + "step": 17432 + }, + { + "epoch": 0.8488788255058067, + "grad_norm": 1.8849660158157349, + "learning_rate": 2.3477001006342446e-06, + "loss": 0.7733, + "step": 17433 + }, + { + "epoch": 0.848927519294914, + "grad_norm": 1.590825080871582, + "learning_rate": 2.346217462736544e-06, + "loss": 0.7828, + "step": 17434 + }, + { + "epoch": 0.8489762130840212, + "grad_norm": 1.6156935691833496, + "learning_rate": 2.344735263978286e-06, + "loss": 0.7788, + "step": 17435 + }, + { + "epoch": 0.8490249068731284, + "grad_norm": 1.597231149673462, + "learning_rate": 2.343253504396352e-06, + "loss": 0.7442, + "step": 17436 + }, + { + "epoch": 0.8490736006622356, + "grad_norm": 1.7144113779067993, + "learning_rate": 2.3417721840275977e-06, + "loss": 0.8566, + "step": 17437 + }, + { + "epoch": 0.8491222944513427, + "grad_norm": 0.09786593168973923, + "learning_rate": 2.3402913029088704e-06, + "loss": 0.5917, + "step": 17438 + }, + { + "epoch": 0.8491709882404499, + "grad_norm": 1.5669081211090088, + "learning_rate": 2.3388108610770053e-06, + "loss": 0.8029, + "step": 17439 + }, + { + "epoch": 0.8492196820295571, + "grad_norm": 1.978973627090454, + "learning_rate": 2.337330858568827e-06, + "loss": 0.7651, + "step": 17440 + }, + { + "epoch": 0.8492683758186643, + "grad_norm": 1.5940518379211426, + "learning_rate": 2.335851295421163e-06, + "loss": 0.8478, + "step": 17441 + }, + { + "epoch": 0.8493170696077715, + "grad_norm": 0.09613358974456787, + "learning_rate": 2.3343721716708e-06, + "loss": 0.6226, + "step": 17442 + }, + { + "epoch": 0.8493657633968787, + "grad_norm": 1.6914210319519043, + "learning_rate": 2.3328934873545438e-06, + "loss": 0.7619, + "step": 17443 + }, + { + "epoch": 0.849414457185986, + "grad_norm": 2.060739278793335, + "learning_rate": 2.3314152425091742e-06, + "loss": 0.7592, + "step": 17444 + }, + { + "epoch": 0.8494631509750932, + "grad_norm": 1.7079380750656128, + "learning_rate": 2.329937437171459e-06, + "loss": 0.7767, + "step": 17445 + }, + { + "epoch": 0.8495118447642003, + "grad_norm": 2.1998040676116943, + "learning_rate": 2.3284600713781624e-06, + "loss": 0.7576, + "step": 17446 + }, + { + "epoch": 0.8495605385533075, + "grad_norm": 1.411438226699829, + "learning_rate": 2.326983145166033e-06, + "loss": 0.7154, + "step": 17447 + }, + { + "epoch": 0.8496092323424147, + "grad_norm": 1.6508328914642334, + "learning_rate": 2.32550665857181e-06, + "loss": 0.8185, + "step": 17448 + }, + { + "epoch": 0.8496579261315219, + "grad_norm": 3.7264933586120605, + "learning_rate": 2.3240306116322155e-06, + "loss": 0.7951, + "step": 17449 + }, + { + "epoch": 0.8497066199206291, + "grad_norm": 1.7679505348205566, + "learning_rate": 2.322555004383973e-06, + "loss": 0.7351, + "step": 17450 + }, + { + "epoch": 0.8497553137097363, + "grad_norm": 1.474295973777771, + "learning_rate": 2.3210798368637888e-06, + "loss": 0.6937, + "step": 17451 + }, + { + "epoch": 0.8498040074988435, + "grad_norm": 1.3358112573623657, + "learning_rate": 2.319605109108356e-06, + "loss": 0.7684, + "step": 17452 + }, + { + "epoch": 0.8498527012879508, + "grad_norm": 4.099311351776123, + "learning_rate": 2.3181308211543563e-06, + "loss": 0.8388, + "step": 17453 + }, + { + "epoch": 0.849901395077058, + "grad_norm": 2.8153414726257324, + "learning_rate": 2.316656973038467e-06, + "loss": 0.7214, + "step": 17454 + }, + { + "epoch": 0.8499500888661651, + "grad_norm": 1.2534589767456055, + "learning_rate": 2.315183564797345e-06, + "loss": 0.7838, + "step": 17455 + }, + { + "epoch": 0.8499987826552723, + "grad_norm": 1.3681161403656006, + "learning_rate": 2.3137105964676463e-06, + "loss": 0.7367, + "step": 17456 + }, + { + "epoch": 0.8500474764443795, + "grad_norm": 1.6148287057876587, + "learning_rate": 2.31223806808601e-06, + "loss": 0.8546, + "step": 17457 + }, + { + "epoch": 0.8500961702334867, + "grad_norm": 1.7142976522445679, + "learning_rate": 2.3107659796890625e-06, + "loss": 0.8153, + "step": 17458 + }, + { + "epoch": 0.8501448640225939, + "grad_norm": 2.417039394378662, + "learning_rate": 2.3092943313134208e-06, + "loss": 0.7163, + "step": 17459 + }, + { + "epoch": 0.8501935578117011, + "grad_norm": 1.9072942733764648, + "learning_rate": 2.3078231229956982e-06, + "loss": 0.8682, + "step": 17460 + }, + { + "epoch": 0.8502422516008084, + "grad_norm": 1.554800033569336, + "learning_rate": 2.3063523547724896e-06, + "loss": 0.862, + "step": 17461 + }, + { + "epoch": 0.8502909453899156, + "grad_norm": 1.4793943166732788, + "learning_rate": 2.3048820266803817e-06, + "loss": 0.6663, + "step": 17462 + }, + { + "epoch": 0.8503396391790227, + "grad_norm": 1.5226387977600098, + "learning_rate": 2.3034121387559426e-06, + "loss": 0.8915, + "step": 17463 + }, + { + "epoch": 0.8503883329681299, + "grad_norm": 1.3562873601913452, + "learning_rate": 2.3019426910357434e-06, + "loss": 0.8334, + "step": 17464 + }, + { + "epoch": 0.8504370267572371, + "grad_norm": 2.6007566452026367, + "learning_rate": 2.3004736835563323e-06, + "loss": 0.8518, + "step": 17465 + }, + { + "epoch": 0.8504857205463443, + "grad_norm": 1.468912959098816, + "learning_rate": 2.2990051163542472e-06, + "loss": 0.7588, + "step": 17466 + }, + { + "epoch": 0.8505344143354515, + "grad_norm": 1.6490283012390137, + "learning_rate": 2.2975369894660336e-06, + "loss": 0.7875, + "step": 17467 + }, + { + "epoch": 0.8505831081245587, + "grad_norm": 2.558110237121582, + "learning_rate": 2.296069302928192e-06, + "loss": 0.7493, + "step": 17468 + }, + { + "epoch": 0.8506318019136659, + "grad_norm": 1.3862584829330444, + "learning_rate": 2.294602056777244e-06, + "loss": 0.8804, + "step": 17469 + }, + { + "epoch": 0.8506804957027732, + "grad_norm": 1.2893905639648438, + "learning_rate": 2.293135251049685e-06, + "loss": 0.8028, + "step": 17470 + }, + { + "epoch": 0.8507291894918803, + "grad_norm": 1.2795546054840088, + "learning_rate": 2.2916688857820013e-06, + "loss": 0.7411, + "step": 17471 + }, + { + "epoch": 0.8507778832809875, + "grad_norm": 1.3761440515518188, + "learning_rate": 2.290202961010668e-06, + "loss": 0.798, + "step": 17472 + }, + { + "epoch": 0.8508265770700947, + "grad_norm": 1.8270834684371948, + "learning_rate": 2.288737476772147e-06, + "loss": 0.8521, + "step": 17473 + }, + { + "epoch": 0.8508752708592019, + "grad_norm": 1.4579368829727173, + "learning_rate": 2.287272433102905e-06, + "loss": 0.8179, + "step": 17474 + }, + { + "epoch": 0.8509239646483091, + "grad_norm": 2.083700180053711, + "learning_rate": 2.2858078300393684e-06, + "loss": 0.8297, + "step": 17475 + }, + { + "epoch": 0.8509726584374163, + "grad_norm": 1.6811753511428833, + "learning_rate": 2.284343667617983e-06, + "loss": 0.7927, + "step": 17476 + }, + { + "epoch": 0.8510213522265235, + "grad_norm": 1.5097311735153198, + "learning_rate": 2.2828799458751584e-06, + "loss": 0.8654, + "step": 17477 + }, + { + "epoch": 0.8510700460156307, + "grad_norm": 1.865555763244629, + "learning_rate": 2.2814166648473135e-06, + "loss": 0.8311, + "step": 17478 + }, + { + "epoch": 0.851118739804738, + "grad_norm": 1.5559594631195068, + "learning_rate": 2.2799538245708443e-06, + "loss": 0.8062, + "step": 17479 + }, + { + "epoch": 0.8511674335938451, + "grad_norm": 1.2894420623779297, + "learning_rate": 2.278491425082141e-06, + "loss": 0.7699, + "step": 17480 + }, + { + "epoch": 0.8512161273829523, + "grad_norm": 1.4531062841415405, + "learning_rate": 2.2770294664175774e-06, + "loss": 0.8478, + "step": 17481 + }, + { + "epoch": 0.8512648211720595, + "grad_norm": 1.2704404592514038, + "learning_rate": 2.2755679486135175e-06, + "loss": 0.7783, + "step": 17482 + }, + { + "epoch": 0.8513135149611667, + "grad_norm": 2.0238451957702637, + "learning_rate": 2.2741068717063298e-06, + "loss": 0.8038, + "step": 17483 + }, + { + "epoch": 0.8513622087502739, + "grad_norm": 1.3744683265686035, + "learning_rate": 2.272646235732343e-06, + "loss": 0.8621, + "step": 17484 + }, + { + "epoch": 0.8514109025393811, + "grad_norm": 1.5007493495941162, + "learning_rate": 2.2711860407279043e-06, + "loss": 0.769, + "step": 17485 + }, + { + "epoch": 0.8514595963284883, + "grad_norm": 1.5740981101989746, + "learning_rate": 2.269726286729321e-06, + "loss": 0.7449, + "step": 17486 + }, + { + "epoch": 0.8515082901175955, + "grad_norm": 1.5012753009796143, + "learning_rate": 2.268266973772917e-06, + "loss": 0.7778, + "step": 17487 + }, + { + "epoch": 0.8515569839067026, + "grad_norm": 1.8500137329101562, + "learning_rate": 2.2668081018949882e-06, + "loss": 0.7705, + "step": 17488 + }, + { + "epoch": 0.8516056776958099, + "grad_norm": 1.2557538747787476, + "learning_rate": 2.2653496711318225e-06, + "loss": 0.8796, + "step": 17489 + }, + { + "epoch": 0.8516543714849171, + "grad_norm": 1.4474024772644043, + "learning_rate": 2.2638916815197075e-06, + "loss": 0.8555, + "step": 17490 + }, + { + "epoch": 0.8517030652740243, + "grad_norm": 1.3801227807998657, + "learning_rate": 2.262434133094895e-06, + "loss": 0.8139, + "step": 17491 + }, + { + "epoch": 0.8517517590631315, + "grad_norm": 2.3792121410369873, + "learning_rate": 2.2609770258936604e-06, + "loss": 0.8537, + "step": 17492 + }, + { + "epoch": 0.8518004528522387, + "grad_norm": 1.8299965858459473, + "learning_rate": 2.259520359952232e-06, + "loss": 0.8119, + "step": 17493 + }, + { + "epoch": 0.8518491466413459, + "grad_norm": 1.6028133630752563, + "learning_rate": 2.2580641353068543e-06, + "loss": 0.8209, + "step": 17494 + }, + { + "epoch": 0.8518978404304531, + "grad_norm": 0.09904836863279343, + "learning_rate": 2.2566083519937497e-06, + "loss": 0.6029, + "step": 17495 + }, + { + "epoch": 0.8519465342195603, + "grad_norm": 2.81120228767395, + "learning_rate": 2.2551530100491313e-06, + "loss": 0.8503, + "step": 17496 + }, + { + "epoch": 0.8519952280086674, + "grad_norm": 1.6595661640167236, + "learning_rate": 2.253698109509197e-06, + "loss": 0.9182, + "step": 17497 + }, + { + "epoch": 0.8520439217977747, + "grad_norm": 2.306290626525879, + "learning_rate": 2.2522436504101418e-06, + "loss": 0.7855, + "step": 17498 + }, + { + "epoch": 0.8520926155868819, + "grad_norm": 1.1912190914154053, + "learning_rate": 2.250789632788146e-06, + "loss": 0.8673, + "step": 17499 + }, + { + "epoch": 0.8521413093759891, + "grad_norm": 1.728414535522461, + "learning_rate": 2.2493360566793697e-06, + "loss": 0.7718, + "step": 17500 + }, + { + "epoch": 0.8521900031650963, + "grad_norm": 0.09533074498176575, + "learning_rate": 2.2478829221199816e-06, + "loss": 0.5559, + "step": 17501 + }, + { + "epoch": 0.8522386969542035, + "grad_norm": 1.3580749034881592, + "learning_rate": 2.246430229146126e-06, + "loss": 0.7684, + "step": 17502 + }, + { + "epoch": 0.8522873907433107, + "grad_norm": 1.677199125289917, + "learning_rate": 2.2449779777939364e-06, + "loss": 0.8173, + "step": 17503 + }, + { + "epoch": 0.8523360845324179, + "grad_norm": 1.6713207960128784, + "learning_rate": 2.243526168099537e-06, + "loss": 0.7038, + "step": 17504 + }, + { + "epoch": 0.852384778321525, + "grad_norm": 1.3432121276855469, + "learning_rate": 2.2420748000990454e-06, + "loss": 0.8403, + "step": 17505 + }, + { + "epoch": 0.8524334721106323, + "grad_norm": 1.782184362411499, + "learning_rate": 2.2406238738285603e-06, + "loss": 0.8312, + "step": 17506 + }, + { + "epoch": 0.8524821658997395, + "grad_norm": 1.8658300638198853, + "learning_rate": 2.2391733893241765e-06, + "loss": 0.7904, + "step": 17507 + }, + { + "epoch": 0.8525308596888467, + "grad_norm": 1.537669062614441, + "learning_rate": 2.2377233466219695e-06, + "loss": 0.7551, + "step": 17508 + }, + { + "epoch": 0.8525795534779539, + "grad_norm": 1.414783239364624, + "learning_rate": 2.2362737457580196e-06, + "loss": 0.8182, + "step": 17509 + }, + { + "epoch": 0.8526282472670611, + "grad_norm": 1.5473467111587524, + "learning_rate": 2.234824586768378e-06, + "loss": 0.766, + "step": 17510 + }, + { + "epoch": 0.8526769410561683, + "grad_norm": 1.6456687450408936, + "learning_rate": 2.233375869689094e-06, + "loss": 0.8228, + "step": 17511 + }, + { + "epoch": 0.8527256348452755, + "grad_norm": 1.2201987504959106, + "learning_rate": 2.231927594556207e-06, + "loss": 0.7571, + "step": 17512 + }, + { + "epoch": 0.8527743286343827, + "grad_norm": 1.5591806173324585, + "learning_rate": 2.2304797614057417e-06, + "loss": 0.8466, + "step": 17513 + }, + { + "epoch": 0.8528230224234898, + "grad_norm": 1.5359455347061157, + "learning_rate": 2.2290323702737094e-06, + "loss": 0.8462, + "step": 17514 + }, + { + "epoch": 0.852871716212597, + "grad_norm": 1.65821373462677, + "learning_rate": 2.227585421196119e-06, + "loss": 0.7813, + "step": 17515 + }, + { + "epoch": 0.8529204100017043, + "grad_norm": 4.246715068817139, + "learning_rate": 2.226138914208962e-06, + "loss": 0.844, + "step": 17516 + }, + { + "epoch": 0.8529691037908115, + "grad_norm": 1.4912446737289429, + "learning_rate": 2.2246928493482155e-06, + "loss": 0.7964, + "step": 17517 + }, + { + "epoch": 0.8530177975799187, + "grad_norm": 1.5770831108093262, + "learning_rate": 2.2232472266498608e-06, + "loss": 0.7379, + "step": 17518 + }, + { + "epoch": 0.8530664913690259, + "grad_norm": 2.6073286533355713, + "learning_rate": 2.2218020461498504e-06, + "loss": 0.8771, + "step": 17519 + }, + { + "epoch": 0.8531151851581331, + "grad_norm": 3.51396107673645, + "learning_rate": 2.220357307884133e-06, + "loss": 0.7711, + "step": 17520 + }, + { + "epoch": 0.8531638789472403, + "grad_norm": 1.2450902462005615, + "learning_rate": 2.2189130118886527e-06, + "loss": 0.8758, + "step": 17521 + }, + { + "epoch": 0.8532125727363474, + "grad_norm": 0.09736131876707077, + "learning_rate": 2.2174691581993302e-06, + "loss": 0.5986, + "step": 17522 + }, + { + "epoch": 0.8532612665254546, + "grad_norm": 1.4368122816085815, + "learning_rate": 2.216025746852084e-06, + "loss": 0.8169, + "step": 17523 + }, + { + "epoch": 0.8533099603145619, + "grad_norm": 1.6932988166809082, + "learning_rate": 2.214582777882814e-06, + "loss": 0.8223, + "step": 17524 + }, + { + "epoch": 0.8533586541036691, + "grad_norm": 1.835731029510498, + "learning_rate": 2.2131402513274282e-06, + "loss": 0.8983, + "step": 17525 + }, + { + "epoch": 0.8534073478927763, + "grad_norm": 1.5143213272094727, + "learning_rate": 2.211698167221794e-06, + "loss": 0.7786, + "step": 17526 + }, + { + "epoch": 0.8534560416818835, + "grad_norm": 1.3433165550231934, + "learning_rate": 2.210256525601795e-06, + "loss": 0.8261, + "step": 17527 + }, + { + "epoch": 0.8535047354709907, + "grad_norm": 1.9817132949829102, + "learning_rate": 2.2088153265032797e-06, + "loss": 0.7539, + "step": 17528 + }, + { + "epoch": 0.8535534292600979, + "grad_norm": 1.6830164194107056, + "learning_rate": 2.2073745699621106e-06, + "loss": 0.7458, + "step": 17529 + }, + { + "epoch": 0.853602123049205, + "grad_norm": 1.8071026802062988, + "learning_rate": 2.2059342560141193e-06, + "loss": 0.7548, + "step": 17530 + }, + { + "epoch": 0.8536508168383122, + "grad_norm": 1.971217155456543, + "learning_rate": 2.2044943846951326e-06, + "loss": 0.8447, + "step": 17531 + }, + { + "epoch": 0.8536995106274194, + "grad_norm": 1.3451533317565918, + "learning_rate": 2.2030549560409797e-06, + "loss": 0.7339, + "step": 17532 + }, + { + "epoch": 0.8537482044165267, + "grad_norm": 1.553462028503418, + "learning_rate": 2.201615970087447e-06, + "loss": 0.8003, + "step": 17533 + }, + { + "epoch": 0.8537968982056339, + "grad_norm": 3.145501136779785, + "learning_rate": 2.200177426870349e-06, + "loss": 0.8556, + "step": 17534 + }, + { + "epoch": 0.8538455919947411, + "grad_norm": 3.250596284866333, + "learning_rate": 2.198739326425454e-06, + "loss": 0.7433, + "step": 17535 + }, + { + "epoch": 0.8538942857838483, + "grad_norm": 2.014683485031128, + "learning_rate": 2.197301668788543e-06, + "loss": 0.8714, + "step": 17536 + }, + { + "epoch": 0.8539429795729555, + "grad_norm": 1.4465023279190063, + "learning_rate": 2.1958644539953778e-06, + "loss": 0.8321, + "step": 17537 + }, + { + "epoch": 0.8539916733620627, + "grad_norm": 1.4870030879974365, + "learning_rate": 2.1944276820817077e-06, + "loss": 0.916, + "step": 17538 + }, + { + "epoch": 0.8540403671511698, + "grad_norm": 1.6877250671386719, + "learning_rate": 2.1929913530832713e-06, + "loss": 0.8461, + "step": 17539 + }, + { + "epoch": 0.854089060940277, + "grad_norm": 1.719367504119873, + "learning_rate": 2.1915554670357954e-06, + "loss": 0.7982, + "step": 17540 + }, + { + "epoch": 0.8541377547293842, + "grad_norm": 1.6887450218200684, + "learning_rate": 2.1901200239750085e-06, + "loss": 0.8553, + "step": 17541 + }, + { + "epoch": 0.8541864485184915, + "grad_norm": 1.9192100763320923, + "learning_rate": 2.188685023936601e-06, + "loss": 0.839, + "step": 17542 + }, + { + "epoch": 0.8542351423075987, + "grad_norm": 1.9037333726882935, + "learning_rate": 2.187250466956288e-06, + "loss": 0.7877, + "step": 17543 + }, + { + "epoch": 0.8542838360967059, + "grad_norm": 1.6214046478271484, + "learning_rate": 2.185816353069734e-06, + "loss": 0.7993, + "step": 17544 + }, + { + "epoch": 0.8543325298858131, + "grad_norm": 1.44711434841156, + "learning_rate": 2.184382682312627e-06, + "loss": 0.7505, + "step": 17545 + }, + { + "epoch": 0.8543812236749203, + "grad_norm": 2.6487343311309814, + "learning_rate": 2.1829494547206242e-06, + "loss": 0.8476, + "step": 17546 + }, + { + "epoch": 0.8544299174640274, + "grad_norm": 1.6094478368759155, + "learning_rate": 2.1815166703293776e-06, + "loss": 0.821, + "step": 17547 + }, + { + "epoch": 0.8544786112531346, + "grad_norm": 1.82554292678833, + "learning_rate": 2.1800843291745298e-06, + "loss": 0.7982, + "step": 17548 + }, + { + "epoch": 0.8545273050422418, + "grad_norm": 2.099719285964966, + "learning_rate": 2.1786524312917058e-06, + "loss": 0.7664, + "step": 17549 + }, + { + "epoch": 0.854575998831349, + "grad_norm": 2.480530261993408, + "learning_rate": 2.1772209767165343e-06, + "loss": 0.7965, + "step": 17550 + }, + { + "epoch": 0.8546246926204563, + "grad_norm": 1.3792158365249634, + "learning_rate": 2.1757899654846095e-06, + "loss": 0.7241, + "step": 17551 + }, + { + "epoch": 0.8546733864095635, + "grad_norm": 1.8708255290985107, + "learning_rate": 2.1743593976315358e-06, + "loss": 0.8987, + "step": 17552 + }, + { + "epoch": 0.8547220801986707, + "grad_norm": 6.57915735244751, + "learning_rate": 2.1729292731928984e-06, + "loss": 0.9158, + "step": 17553 + }, + { + "epoch": 0.8547707739877779, + "grad_norm": 3.120875358581543, + "learning_rate": 2.1714995922042714e-06, + "loss": 0.8797, + "step": 17554 + }, + { + "epoch": 0.8548194677768851, + "grad_norm": 1.7085283994674683, + "learning_rate": 2.170070354701217e-06, + "loss": 0.8315, + "step": 17555 + }, + { + "epoch": 0.8548681615659922, + "grad_norm": 2.2299067974090576, + "learning_rate": 2.168641560719289e-06, + "loss": 0.8099, + "step": 17556 + }, + { + "epoch": 0.8549168553550994, + "grad_norm": 1.2653579711914062, + "learning_rate": 2.167213210294028e-06, + "loss": 0.8116, + "step": 17557 + }, + { + "epoch": 0.8549655491442066, + "grad_norm": 1.6241453886032104, + "learning_rate": 2.165785303460959e-06, + "loss": 0.8849, + "step": 17558 + }, + { + "epoch": 0.8550142429333139, + "grad_norm": 1.1957467794418335, + "learning_rate": 2.1643578402556133e-06, + "loss": 0.7722, + "step": 17559 + }, + { + "epoch": 0.8550629367224211, + "grad_norm": 3.1503636837005615, + "learning_rate": 2.162930820713489e-06, + "loss": 0.8204, + "step": 17560 + }, + { + "epoch": 0.8551116305115283, + "grad_norm": 1.496609091758728, + "learning_rate": 2.1615042448700897e-06, + "loss": 0.8805, + "step": 17561 + }, + { + "epoch": 0.8551603243006355, + "grad_norm": 1.3741554021835327, + "learning_rate": 2.160078112760897e-06, + "loss": 0.8105, + "step": 17562 + }, + { + "epoch": 0.8552090180897427, + "grad_norm": 4.511055946350098, + "learning_rate": 2.158652424421388e-06, + "loss": 0.8422, + "step": 17563 + }, + { + "epoch": 0.8552577118788498, + "grad_norm": 1.126205325126648, + "learning_rate": 2.157227179887027e-06, + "loss": 0.7925, + "step": 17564 + }, + { + "epoch": 0.855306405667957, + "grad_norm": 2.4272947311401367, + "learning_rate": 2.155802379193268e-06, + "loss": 0.7843, + "step": 17565 + }, + { + "epoch": 0.8553550994570642, + "grad_norm": 1.7233238220214844, + "learning_rate": 2.1543780223755496e-06, + "loss": 0.7558, + "step": 17566 + }, + { + "epoch": 0.8554037932461714, + "grad_norm": 1.6104403734207153, + "learning_rate": 2.1529541094693073e-06, + "loss": 0.7243, + "step": 17567 + }, + { + "epoch": 0.8554524870352787, + "grad_norm": 2.5816731452941895, + "learning_rate": 2.1515306405099535e-06, + "loss": 0.8214, + "step": 17568 + }, + { + "epoch": 0.8555011808243859, + "grad_norm": 1.7867478132247925, + "learning_rate": 2.150107615532906e-06, + "loss": 0.7868, + "step": 17569 + }, + { + "epoch": 0.8555498746134931, + "grad_norm": 1.4756419658660889, + "learning_rate": 2.1486850345735586e-06, + "loss": 0.9083, + "step": 17570 + }, + { + "epoch": 0.8555985684026003, + "grad_norm": 1.6185574531555176, + "learning_rate": 2.1472628976672995e-06, + "loss": 0.7293, + "step": 17571 + }, + { + "epoch": 0.8556472621917075, + "grad_norm": 1.526969075202942, + "learning_rate": 2.1458412048495036e-06, + "loss": 0.9557, + "step": 17572 + }, + { + "epoch": 0.8556959559808146, + "grad_norm": 1.4313526153564453, + "learning_rate": 2.1444199561555344e-06, + "loss": 0.8681, + "step": 17573 + }, + { + "epoch": 0.8557446497699218, + "grad_norm": 2.2889351844787598, + "learning_rate": 2.142999151620748e-06, + "loss": 0.7386, + "step": 17574 + }, + { + "epoch": 0.855793343559029, + "grad_norm": 1.9133843183517456, + "learning_rate": 2.141578791280483e-06, + "loss": 0.7408, + "step": 17575 + }, + { + "epoch": 0.8558420373481362, + "grad_norm": 1.4308700561523438, + "learning_rate": 2.14015887517008e-06, + "loss": 0.7044, + "step": 17576 + }, + { + "epoch": 0.8558907311372435, + "grad_norm": 1.4128329753875732, + "learning_rate": 2.1387394033248455e-06, + "loss": 0.8421, + "step": 17577 + }, + { + "epoch": 0.8559394249263507, + "grad_norm": 1.5393037796020508, + "learning_rate": 2.137320375780101e-06, + "loss": 0.9109, + "step": 17578 + }, + { + "epoch": 0.8559881187154579, + "grad_norm": 6.017606258392334, + "learning_rate": 2.1359017925711423e-06, + "loss": 0.776, + "step": 17579 + }, + { + "epoch": 0.8560368125045651, + "grad_norm": 2.8769683837890625, + "learning_rate": 2.1344836537332548e-06, + "loss": 0.879, + "step": 17580 + }, + { + "epoch": 0.8560855062936722, + "grad_norm": 1.717894434928894, + "learning_rate": 2.1330659593017147e-06, + "loss": 0.7863, + "step": 17581 + }, + { + "epoch": 0.8561342000827794, + "grad_norm": 1.6176837682724, + "learning_rate": 2.1316487093117823e-06, + "loss": 0.863, + "step": 17582 + }, + { + "epoch": 0.8561828938718866, + "grad_norm": 2.4449539184570312, + "learning_rate": 2.1302319037987272e-06, + "loss": 0.818, + "step": 17583 + }, + { + "epoch": 0.8562315876609938, + "grad_norm": 2.501356601715088, + "learning_rate": 2.128815542797775e-06, + "loss": 0.7787, + "step": 17584 + }, + { + "epoch": 0.856280281450101, + "grad_norm": 2.0157968997955322, + "learning_rate": 2.127399626344173e-06, + "loss": 0.8233, + "step": 17585 + }, + { + "epoch": 0.8563289752392083, + "grad_norm": 1.7123721837997437, + "learning_rate": 2.1259841544731263e-06, + "loss": 0.8819, + "step": 17586 + }, + { + "epoch": 0.8563776690283155, + "grad_norm": 1.505206823348999, + "learning_rate": 2.1245691272198574e-06, + "loss": 0.749, + "step": 17587 + }, + { + "epoch": 0.8564263628174227, + "grad_norm": 1.8641678094863892, + "learning_rate": 2.1231545446195632e-06, + "loss": 0.8357, + "step": 17588 + }, + { + "epoch": 0.8564750566065298, + "grad_norm": 1.9399034976959229, + "learning_rate": 2.1217404067074286e-06, + "loss": 0.8095, + "step": 17589 + }, + { + "epoch": 0.856523750395637, + "grad_norm": 1.4799656867980957, + "learning_rate": 2.1203267135186324e-06, + "loss": 0.7366, + "step": 17590 + }, + { + "epoch": 0.8565724441847442, + "grad_norm": 1.3181902170181274, + "learning_rate": 2.1189134650883326e-06, + "loss": 0.7482, + "step": 17591 + }, + { + "epoch": 0.8566211379738514, + "grad_norm": 1.4942487478256226, + "learning_rate": 2.1175006614517013e-06, + "loss": 0.8401, + "step": 17592 + }, + { + "epoch": 0.8566698317629586, + "grad_norm": 1.2871474027633667, + "learning_rate": 2.1160883026438616e-06, + "loss": 0.7595, + "step": 17593 + }, + { + "epoch": 0.8567185255520658, + "grad_norm": 1.695306420326233, + "learning_rate": 2.114676388699961e-06, + "loss": 0.7779, + "step": 17594 + }, + { + "epoch": 0.8567672193411731, + "grad_norm": 1.2276493310928345, + "learning_rate": 2.1132649196551157e-06, + "loss": 0.845, + "step": 17595 + }, + { + "epoch": 0.8568159131302803, + "grad_norm": 1.7508456707000732, + "learning_rate": 2.111853895544438e-06, + "loss": 0.7791, + "step": 17596 + }, + { + "epoch": 0.8568646069193875, + "grad_norm": 2.8467299938201904, + "learning_rate": 2.1104433164030235e-06, + "loss": 0.8697, + "step": 17597 + }, + { + "epoch": 0.8569133007084946, + "grad_norm": 1.4801428318023682, + "learning_rate": 2.1090331822659602e-06, + "loss": 0.8259, + "step": 17598 + }, + { + "epoch": 0.8569619944976018, + "grad_norm": 1.4003673791885376, + "learning_rate": 2.1076234931683338e-06, + "loss": 0.714, + "step": 17599 + }, + { + "epoch": 0.857010688286709, + "grad_norm": 1.7417062520980835, + "learning_rate": 2.1062142491451977e-06, + "loss": 0.8416, + "step": 17600 + }, + { + "epoch": 0.8570593820758162, + "grad_norm": 1.585168719291687, + "learning_rate": 2.104805450231622e-06, + "loss": 0.8093, + "step": 17601 + }, + { + "epoch": 0.8571080758649234, + "grad_norm": 1.7516725063323975, + "learning_rate": 2.103397096462634e-06, + "loss": 0.7943, + "step": 17602 + }, + { + "epoch": 0.8571567696540306, + "grad_norm": 1.3195947408676147, + "learning_rate": 2.101989187873279e-06, + "loss": 0.7158, + "step": 17603 + }, + { + "epoch": 0.8572054634431379, + "grad_norm": 1.5928866863250732, + "learning_rate": 2.100581724498574e-06, + "loss": 0.7477, + "step": 17604 + }, + { + "epoch": 0.8572541572322451, + "grad_norm": 0.09174146503210068, + "learning_rate": 2.09917470637353e-06, + "loss": 0.543, + "step": 17605 + }, + { + "epoch": 0.8573028510213522, + "grad_norm": 1.2739661931991577, + "learning_rate": 2.097768133533149e-06, + "loss": 0.8354, + "step": 17606 + }, + { + "epoch": 0.8573515448104594, + "grad_norm": 8.034469604492188, + "learning_rate": 2.096362006012418e-06, + "loss": 0.7538, + "step": 17607 + }, + { + "epoch": 0.8574002385995666, + "grad_norm": 1.4772303104400635, + "learning_rate": 2.0949563238463133e-06, + "loss": 0.8651, + "step": 17608 + }, + { + "epoch": 0.8574489323886738, + "grad_norm": 1.4697266817092896, + "learning_rate": 2.0935510870697983e-06, + "loss": 0.7917, + "step": 17609 + }, + { + "epoch": 0.857497626177781, + "grad_norm": 1.7522952556610107, + "learning_rate": 2.0921462957178386e-06, + "loss": 0.8501, + "step": 17610 + }, + { + "epoch": 0.8575463199668882, + "grad_norm": 1.4748578071594238, + "learning_rate": 2.0907419498253724e-06, + "loss": 0.8319, + "step": 17611 + }, + { + "epoch": 0.8575950137559955, + "grad_norm": 1.218077301979065, + "learning_rate": 2.089338049427332e-06, + "loss": 0.8257, + "step": 17612 + }, + { + "epoch": 0.8576437075451027, + "grad_norm": 2.1653826236724854, + "learning_rate": 2.0879345945586405e-06, + "loss": 0.8638, + "step": 17613 + }, + { + "epoch": 0.8576924013342099, + "grad_norm": 1.630646824836731, + "learning_rate": 2.0865315852542102e-06, + "loss": 0.7885, + "step": 17614 + }, + { + "epoch": 0.857741095123317, + "grad_norm": 1.6087594032287598, + "learning_rate": 2.0851290215489374e-06, + "loss": 0.7984, + "step": 17615 + }, + { + "epoch": 0.8577897889124242, + "grad_norm": 1.7096984386444092, + "learning_rate": 2.0837269034777163e-06, + "loss": 0.8144, + "step": 17616 + }, + { + "epoch": 0.8578384827015314, + "grad_norm": 1.378279685974121, + "learning_rate": 2.0823252310754174e-06, + "loss": 0.7724, + "step": 17617 + }, + { + "epoch": 0.8578871764906386, + "grad_norm": 1.1843117475509644, + "learning_rate": 2.080924004376916e-06, + "loss": 0.7784, + "step": 17618 + }, + { + "epoch": 0.8579358702797458, + "grad_norm": 1.2754552364349365, + "learning_rate": 2.0795232234170636e-06, + "loss": 0.772, + "step": 17619 + }, + { + "epoch": 0.857984564068853, + "grad_norm": 1.490123987197876, + "learning_rate": 2.0781228882307047e-06, + "loss": 0.805, + "step": 17620 + }, + { + "epoch": 0.8580332578579603, + "grad_norm": 1.3225780725479126, + "learning_rate": 2.076722998852674e-06, + "loss": 0.8153, + "step": 17621 + }, + { + "epoch": 0.8580819516470675, + "grad_norm": 1.4855015277862549, + "learning_rate": 2.075323555317792e-06, + "loss": 0.7019, + "step": 17622 + }, + { + "epoch": 0.8581306454361746, + "grad_norm": 1.3919309377670288, + "learning_rate": 2.0739245576608693e-06, + "loss": 0.7734, + "step": 17623 + }, + { + "epoch": 0.8581793392252818, + "grad_norm": 1.4853605031967163, + "learning_rate": 2.0725260059167106e-06, + "loss": 0.9019, + "step": 17624 + }, + { + "epoch": 0.858228033014389, + "grad_norm": 4.355327129364014, + "learning_rate": 2.0711279001201003e-06, + "loss": 0.6852, + "step": 17625 + }, + { + "epoch": 0.8582767268034962, + "grad_norm": 0.09065678715705872, + "learning_rate": 2.069730240305814e-06, + "loss": 0.6176, + "step": 17626 + }, + { + "epoch": 0.8583254205926034, + "grad_norm": 1.236136555671692, + "learning_rate": 2.068333026508629e-06, + "loss": 0.8129, + "step": 17627 + }, + { + "epoch": 0.8583741143817106, + "grad_norm": 1.8480898141860962, + "learning_rate": 2.0669362587632924e-06, + "loss": 0.8453, + "step": 17628 + }, + { + "epoch": 0.8584228081708178, + "grad_norm": 1.5223280191421509, + "learning_rate": 2.0655399371045525e-06, + "loss": 0.8503, + "step": 17629 + }, + { + "epoch": 0.858471501959925, + "grad_norm": 1.5192488431930542, + "learning_rate": 2.0641440615671414e-06, + "loss": 0.7884, + "step": 17630 + }, + { + "epoch": 0.8585201957490322, + "grad_norm": 1.872924566268921, + "learning_rate": 2.0627486321857827e-06, + "loss": 0.8576, + "step": 17631 + }, + { + "epoch": 0.8585688895381394, + "grad_norm": 1.5639772415161133, + "learning_rate": 2.0613536489951878e-06, + "loss": 0.7426, + "step": 17632 + }, + { + "epoch": 0.8586175833272466, + "grad_norm": 1.759202003479004, + "learning_rate": 2.059959112030052e-06, + "loss": 0.866, + "step": 17633 + }, + { + "epoch": 0.8586662771163538, + "grad_norm": 1.5251097679138184, + "learning_rate": 2.058565021325076e-06, + "loss": 0.7625, + "step": 17634 + }, + { + "epoch": 0.858714970905461, + "grad_norm": 0.09426997601985931, + "learning_rate": 2.057171376914924e-06, + "loss": 0.6088, + "step": 17635 + }, + { + "epoch": 0.8587636646945682, + "grad_norm": 1.8065075874328613, + "learning_rate": 2.055778178834278e-06, + "loss": 0.7405, + "step": 17636 + }, + { + "epoch": 0.8588123584836754, + "grad_norm": 1.7178055047988892, + "learning_rate": 2.0543854271177778e-06, + "loss": 0.8304, + "step": 17637 + }, + { + "epoch": 0.8588610522727826, + "grad_norm": 1.7362622022628784, + "learning_rate": 2.05299312180008e-06, + "loss": 0.836, + "step": 17638 + }, + { + "epoch": 0.8589097460618899, + "grad_norm": 2.4434094429016113, + "learning_rate": 2.0516012629158142e-06, + "loss": 0.8336, + "step": 17639 + }, + { + "epoch": 0.858958439850997, + "grad_norm": 1.540870189666748, + "learning_rate": 2.050209850499598e-06, + "loss": 0.7845, + "step": 17640 + }, + { + "epoch": 0.8590071336401042, + "grad_norm": 1.7491247653961182, + "learning_rate": 2.048818884586059e-06, + "loss": 0.7938, + "step": 17641 + }, + { + "epoch": 0.8590558274292114, + "grad_norm": 1.6014103889465332, + "learning_rate": 2.0474283652097783e-06, + "loss": 0.7106, + "step": 17642 + }, + { + "epoch": 0.8591045212183186, + "grad_norm": 3.0571749210357666, + "learning_rate": 2.046038292405361e-06, + "loss": 0.8634, + "step": 17643 + }, + { + "epoch": 0.8591532150074258, + "grad_norm": 2.021425724029541, + "learning_rate": 2.044648666207372e-06, + "loss": 0.7872, + "step": 17644 + }, + { + "epoch": 0.859201908796533, + "grad_norm": 0.09781011193990707, + "learning_rate": 2.0432594866503885e-06, + "loss": 0.5813, + "step": 17645 + }, + { + "epoch": 0.8592506025856402, + "grad_norm": 1.3522988557815552, + "learning_rate": 2.0418707537689598e-06, + "loss": 0.7452, + "step": 17646 + }, + { + "epoch": 0.8592992963747474, + "grad_norm": 1.409502387046814, + "learning_rate": 2.0404824675976353e-06, + "loss": 0.8065, + "step": 17647 + }, + { + "epoch": 0.8593479901638545, + "grad_norm": 1.600699782371521, + "learning_rate": 2.039094628170948e-06, + "loss": 0.8098, + "step": 17648 + }, + { + "epoch": 0.8593966839529618, + "grad_norm": 1.7165496349334717, + "learning_rate": 2.037707235523414e-06, + "loss": 0.7817, + "step": 17649 + }, + { + "epoch": 0.859445377742069, + "grad_norm": 1.4936288595199585, + "learning_rate": 2.0363202896895573e-06, + "loss": 0.7763, + "step": 17650 + }, + { + "epoch": 0.8594940715311762, + "grad_norm": 1.563813328742981, + "learning_rate": 2.034933790703866e-06, + "loss": 0.8972, + "step": 17651 + }, + { + "epoch": 0.8595427653202834, + "grad_norm": 2.0972914695739746, + "learning_rate": 2.0335477386008405e-06, + "loss": 0.9019, + "step": 17652 + }, + { + "epoch": 0.8595914591093906, + "grad_norm": 1.7749885320663452, + "learning_rate": 2.0321621334149453e-06, + "loss": 0.7268, + "step": 17653 + }, + { + "epoch": 0.8596401528984978, + "grad_norm": 2.2606470584869385, + "learning_rate": 2.0307769751806595e-06, + "loss": 0.8878, + "step": 17654 + }, + { + "epoch": 0.859688846687605, + "grad_norm": 1.395462155342102, + "learning_rate": 2.0293922639324346e-06, + "loss": 0.8178, + "step": 17655 + }, + { + "epoch": 0.8597375404767122, + "grad_norm": 1.8216753005981445, + "learning_rate": 2.028007999704715e-06, + "loss": 0.8072, + "step": 17656 + }, + { + "epoch": 0.8597862342658193, + "grad_norm": 1.8941196203231812, + "learning_rate": 2.0266241825319376e-06, + "loss": 0.7219, + "step": 17657 + }, + { + "epoch": 0.8598349280549266, + "grad_norm": 1.7624602317810059, + "learning_rate": 2.0252408124485166e-06, + "loss": 0.8229, + "step": 17658 + }, + { + "epoch": 0.8598836218440338, + "grad_norm": 1.9865894317626953, + "learning_rate": 2.0238578894888784e-06, + "loss": 0.9011, + "step": 17659 + }, + { + "epoch": 0.859932315633141, + "grad_norm": 1.7234236001968384, + "learning_rate": 2.022475413687404e-06, + "loss": 0.8686, + "step": 17660 + }, + { + "epoch": 0.8599810094222482, + "grad_norm": 1.9144853353500366, + "learning_rate": 2.0210933850784984e-06, + "loss": 0.7779, + "step": 17661 + }, + { + "epoch": 0.8600297032113554, + "grad_norm": 1.396195650100708, + "learning_rate": 2.0197118036965335e-06, + "loss": 0.843, + "step": 17662 + }, + { + "epoch": 0.8600783970004626, + "grad_norm": 0.09467055648565292, + "learning_rate": 2.0183306695758765e-06, + "loss": 0.5949, + "step": 17663 + }, + { + "epoch": 0.8601270907895698, + "grad_norm": 1.3157222270965576, + "learning_rate": 2.016949982750884e-06, + "loss": 0.8225, + "step": 17664 + }, + { + "epoch": 0.8601757845786769, + "grad_norm": 1.6506102085113525, + "learning_rate": 2.0155697432559008e-06, + "loss": 0.882, + "step": 17665 + }, + { + "epoch": 0.8602244783677842, + "grad_norm": 1.9857096672058105, + "learning_rate": 2.0141899511252607e-06, + "loss": 0.8215, + "step": 17666 + }, + { + "epoch": 0.8602731721568914, + "grad_norm": 1.633717656135559, + "learning_rate": 2.0128106063932805e-06, + "loss": 0.7988, + "step": 17667 + }, + { + "epoch": 0.8603218659459986, + "grad_norm": 1.6179510354995728, + "learning_rate": 2.0114317090942803e-06, + "loss": 0.7937, + "step": 17668 + }, + { + "epoch": 0.8603705597351058, + "grad_norm": 1.2419118881225586, + "learning_rate": 2.0100532592625565e-06, + "loss": 0.7715, + "step": 17669 + }, + { + "epoch": 0.860419253524213, + "grad_norm": 2.881823778152466, + "learning_rate": 2.0086752569323973e-06, + "loss": 0.7449, + "step": 17670 + }, + { + "epoch": 0.8604679473133202, + "grad_norm": 1.4982738494873047, + "learning_rate": 2.0072977021380848e-06, + "loss": 0.8523, + "step": 17671 + }, + { + "epoch": 0.8605166411024274, + "grad_norm": 1.7529510259628296, + "learning_rate": 2.005920594913879e-06, + "loss": 0.8257, + "step": 17672 + }, + { + "epoch": 0.8605653348915346, + "grad_norm": 2.075948715209961, + "learning_rate": 2.004543935294041e-06, + "loss": 0.7286, + "step": 17673 + }, + { + "epoch": 0.8606140286806417, + "grad_norm": 1.8728258609771729, + "learning_rate": 2.003167723312811e-06, + "loss": 0.8051, + "step": 17674 + }, + { + "epoch": 0.860662722469749, + "grad_norm": 1.4712494611740112, + "learning_rate": 2.0017919590044264e-06, + "loss": 0.7946, + "step": 17675 + }, + { + "epoch": 0.8607114162588562, + "grad_norm": 1.487778902053833, + "learning_rate": 2.000416642403109e-06, + "loss": 0.7835, + "step": 17676 + }, + { + "epoch": 0.8607601100479634, + "grad_norm": 1.8744133710861206, + "learning_rate": 1.999041773543062e-06, + "loss": 0.7438, + "step": 17677 + }, + { + "epoch": 0.8608088038370706, + "grad_norm": 2.2457337379455566, + "learning_rate": 1.997667352458499e-06, + "loss": 0.7873, + "step": 17678 + }, + { + "epoch": 0.8608574976261778, + "grad_norm": 1.6901984214782715, + "learning_rate": 1.9962933791836004e-06, + "loss": 0.895, + "step": 17679 + }, + { + "epoch": 0.860906191415285, + "grad_norm": 1.1788814067840576, + "learning_rate": 1.994919853752546e-06, + "loss": 0.8835, + "step": 17680 + }, + { + "epoch": 0.8609548852043922, + "grad_norm": 1.558484435081482, + "learning_rate": 1.9935467761995e-06, + "loss": 0.7882, + "step": 17681 + }, + { + "epoch": 0.8610035789934993, + "grad_norm": 1.4456886053085327, + "learning_rate": 1.992174146558623e-06, + "loss": 0.7929, + "step": 17682 + }, + { + "epoch": 0.8610522727826065, + "grad_norm": 1.294973611831665, + "learning_rate": 1.9908019648640532e-06, + "loss": 0.8007, + "step": 17683 + }, + { + "epoch": 0.8611009665717138, + "grad_norm": 0.09793601185083389, + "learning_rate": 1.989430231149925e-06, + "loss": 0.5787, + "step": 17684 + }, + { + "epoch": 0.861149660360821, + "grad_norm": 2.0123209953308105, + "learning_rate": 1.9880589454503684e-06, + "loss": 0.7842, + "step": 17685 + }, + { + "epoch": 0.8611983541499282, + "grad_norm": 1.4819120168685913, + "learning_rate": 1.9866881077994794e-06, + "loss": 0.8702, + "step": 17686 + }, + { + "epoch": 0.8612470479390354, + "grad_norm": 1.4153293371200562, + "learning_rate": 1.98531771823137e-06, + "loss": 0.7568, + "step": 17687 + }, + { + "epoch": 0.8612957417281426, + "grad_norm": 1.2481549978256226, + "learning_rate": 1.983947776780126e-06, + "loss": 0.9448, + "step": 17688 + }, + { + "epoch": 0.8613444355172498, + "grad_norm": 1.9790101051330566, + "learning_rate": 1.9825782834798234e-06, + "loss": 0.7858, + "step": 17689 + }, + { + "epoch": 0.8613931293063569, + "grad_norm": 1.4668196439743042, + "learning_rate": 1.9812092383645274e-06, + "loss": 0.8788, + "step": 17690 + }, + { + "epoch": 0.8614418230954641, + "grad_norm": 3.282888889312744, + "learning_rate": 1.9798406414682936e-06, + "loss": 0.7782, + "step": 17691 + }, + { + "epoch": 0.8614905168845713, + "grad_norm": 1.345988154411316, + "learning_rate": 1.978472492825172e-06, + "loss": 0.7219, + "step": 17692 + }, + { + "epoch": 0.8615392106736786, + "grad_norm": 1.4318591356277466, + "learning_rate": 1.9771047924691844e-06, + "loss": 0.8829, + "step": 17693 + }, + { + "epoch": 0.8615879044627858, + "grad_norm": 0.10968749225139618, + "learning_rate": 1.975737540434364e-06, + "loss": 0.6057, + "step": 17694 + }, + { + "epoch": 0.861636598251893, + "grad_norm": 1.5443154573440552, + "learning_rate": 1.9743707367547096e-06, + "loss": 0.7184, + "step": 17695 + }, + { + "epoch": 0.8616852920410002, + "grad_norm": 1.403472900390625, + "learning_rate": 1.9730043814642297e-06, + "loss": 0.8608, + "step": 17696 + }, + { + "epoch": 0.8617339858301074, + "grad_norm": 1.2981460094451904, + "learning_rate": 1.9716384745969087e-06, + "loss": 0.718, + "step": 17697 + }, + { + "epoch": 0.8617826796192146, + "grad_norm": 1.7692457437515259, + "learning_rate": 1.970273016186721e-06, + "loss": 0.7945, + "step": 17698 + }, + { + "epoch": 0.8618313734083217, + "grad_norm": 1.7973004579544067, + "learning_rate": 1.9689080062676446e-06, + "loss": 0.8972, + "step": 17699 + }, + { + "epoch": 0.8618800671974289, + "grad_norm": 1.3213080167770386, + "learning_rate": 1.9675434448736186e-06, + "loss": 0.7812, + "step": 17700 + }, + { + "epoch": 0.8619287609865361, + "grad_norm": 2.4402689933776855, + "learning_rate": 1.966179332038598e-06, + "loss": 0.8135, + "step": 17701 + }, + { + "epoch": 0.8619774547756434, + "grad_norm": 3.1160309314727783, + "learning_rate": 1.964815667796507e-06, + "loss": 0.7484, + "step": 17702 + }, + { + "epoch": 0.8620261485647506, + "grad_norm": 1.5435154438018799, + "learning_rate": 1.9634524521812716e-06, + "loss": 0.9019, + "step": 17703 + }, + { + "epoch": 0.8620748423538578, + "grad_norm": 2.375962495803833, + "learning_rate": 1.9620896852268024e-06, + "loss": 0.8283, + "step": 17704 + }, + { + "epoch": 0.862123536142965, + "grad_norm": 2.293002128601074, + "learning_rate": 1.960727366966997e-06, + "loss": 0.7949, + "step": 17705 + }, + { + "epoch": 0.8621722299320722, + "grad_norm": 1.7128534317016602, + "learning_rate": 1.959365497435741e-06, + "loss": 0.898, + "step": 17706 + }, + { + "epoch": 0.8622209237211793, + "grad_norm": 2.0828824043273926, + "learning_rate": 1.9580040766669106e-06, + "loss": 0.7984, + "step": 17707 + }, + { + "epoch": 0.8622696175102865, + "grad_norm": 3.051051139831543, + "learning_rate": 1.9566431046943825e-06, + "loss": 0.8065, + "step": 17708 + }, + { + "epoch": 0.8623183112993937, + "grad_norm": 1.1956216096878052, + "learning_rate": 1.955282581551994e-06, + "loss": 0.8062, + "step": 17709 + }, + { + "epoch": 0.862367005088501, + "grad_norm": 0.09425701200962067, + "learning_rate": 1.9539225072736044e-06, + "loss": 0.6342, + "step": 17710 + }, + { + "epoch": 0.8624156988776082, + "grad_norm": 1.5180283784866333, + "learning_rate": 1.95256288189303e-06, + "loss": 0.8299, + "step": 17711 + }, + { + "epoch": 0.8624643926667154, + "grad_norm": 1.5175111293792725, + "learning_rate": 1.9512037054441026e-06, + "loss": 0.8206, + "step": 17712 + }, + { + "epoch": 0.8625130864558226, + "grad_norm": 3.1495919227600098, + "learning_rate": 1.949844977960629e-06, + "loss": 0.8266, + "step": 17713 + }, + { + "epoch": 0.8625617802449298, + "grad_norm": 1.7446389198303223, + "learning_rate": 1.9484866994764063e-06, + "loss": 0.7487, + "step": 17714 + }, + { + "epoch": 0.862610474034037, + "grad_norm": 1.1904196739196777, + "learning_rate": 1.9471288700252254e-06, + "loss": 0.891, + "step": 17715 + }, + { + "epoch": 0.8626591678231441, + "grad_norm": 0.09327910095453262, + "learning_rate": 1.945771489640853e-06, + "loss": 0.5785, + "step": 17716 + }, + { + "epoch": 0.8627078616122513, + "grad_norm": 1.5506982803344727, + "learning_rate": 1.9444145583570706e-06, + "loss": 0.8027, + "step": 17717 + }, + { + "epoch": 0.8627565554013585, + "grad_norm": 1.243544578552246, + "learning_rate": 1.9430580762076133e-06, + "loss": 0.8936, + "step": 17718 + }, + { + "epoch": 0.8628052491904658, + "grad_norm": 2.011457681655884, + "learning_rate": 1.941702043226239e-06, + "loss": 0.8766, + "step": 17719 + }, + { + "epoch": 0.862853942979573, + "grad_norm": 1.5859293937683105, + "learning_rate": 1.9403464594466713e-06, + "loss": 0.7698, + "step": 17720 + }, + { + "epoch": 0.8629026367686802, + "grad_norm": 1.2631161212921143, + "learning_rate": 1.9389913249026303e-06, + "loss": 0.8058, + "step": 17721 + }, + { + "epoch": 0.8629513305577874, + "grad_norm": 1.5895938873291016, + "learning_rate": 1.937636639627829e-06, + "loss": 0.8457, + "step": 17722 + }, + { + "epoch": 0.8630000243468946, + "grad_norm": 1.525078535079956, + "learning_rate": 1.9362824036559624e-06, + "loss": 0.9495, + "step": 17723 + }, + { + "epoch": 0.8630487181360017, + "grad_norm": 1.5136135816574097, + "learning_rate": 1.9349286170207195e-06, + "loss": 0.7732, + "step": 17724 + }, + { + "epoch": 0.8630974119251089, + "grad_norm": 2.3628270626068115, + "learning_rate": 1.9335752797557726e-06, + "loss": 0.9113, + "step": 17725 + }, + { + "epoch": 0.8631461057142161, + "grad_norm": 1.498780369758606, + "learning_rate": 1.9322223918947848e-06, + "loss": 0.8281, + "step": 17726 + }, + { + "epoch": 0.8631947995033233, + "grad_norm": 1.58011794090271, + "learning_rate": 1.9308699534714172e-06, + "loss": 0.8165, + "step": 17727 + }, + { + "epoch": 0.8632434932924306, + "grad_norm": 1.628528118133545, + "learning_rate": 1.929517964519305e-06, + "loss": 0.8144, + "step": 17728 + }, + { + "epoch": 0.8632921870815378, + "grad_norm": 1.3406862020492554, + "learning_rate": 1.928166425072082e-06, + "loss": 0.7117, + "step": 17729 + }, + { + "epoch": 0.863340880870645, + "grad_norm": 1.3678866624832153, + "learning_rate": 1.9268153351633657e-06, + "loss": 0.7992, + "step": 17730 + }, + { + "epoch": 0.8633895746597522, + "grad_norm": 1.1991560459136963, + "learning_rate": 1.9254646948267665e-06, + "loss": 0.8429, + "step": 17731 + }, + { + "epoch": 0.8634382684488594, + "grad_norm": 1.8071630001068115, + "learning_rate": 1.924114504095882e-06, + "loss": 0.8091, + "step": 17732 + }, + { + "epoch": 0.8634869622379665, + "grad_norm": 1.244965672492981, + "learning_rate": 1.922764763004297e-06, + "loss": 0.8488, + "step": 17733 + }, + { + "epoch": 0.8635356560270737, + "grad_norm": 1.4701483249664307, + "learning_rate": 1.921415471585586e-06, + "loss": 0.8422, + "step": 17734 + }, + { + "epoch": 0.8635843498161809, + "grad_norm": 1.828500747680664, + "learning_rate": 1.9200666298733096e-06, + "loss": 0.8547, + "step": 17735 + }, + { + "epoch": 0.8636330436052881, + "grad_norm": 1.825932502746582, + "learning_rate": 1.918718237901027e-06, + "loss": 0.7419, + "step": 17736 + }, + { + "epoch": 0.8636817373943954, + "grad_norm": 5.342960834503174, + "learning_rate": 1.917370295702279e-06, + "loss": 0.8599, + "step": 17737 + }, + { + "epoch": 0.8637304311835026, + "grad_norm": 1.5211080312728882, + "learning_rate": 1.916022803310593e-06, + "loss": 0.7423, + "step": 17738 + }, + { + "epoch": 0.8637791249726098, + "grad_norm": 14.857969284057617, + "learning_rate": 1.914675760759488e-06, + "loss": 0.9838, + "step": 17739 + }, + { + "epoch": 0.863827818761717, + "grad_norm": 1.3291956186294556, + "learning_rate": 1.913329168082474e-06, + "loss": 0.8463, + "step": 17740 + }, + { + "epoch": 0.8638765125508241, + "grad_norm": 1.8881667852401733, + "learning_rate": 1.9119830253130445e-06, + "loss": 0.9136, + "step": 17741 + }, + { + "epoch": 0.8639252063399313, + "grad_norm": 1.5158764123916626, + "learning_rate": 1.910637332484684e-06, + "loss": 0.7361, + "step": 17742 + }, + { + "epoch": 0.8639739001290385, + "grad_norm": 2.0705294609069824, + "learning_rate": 1.909292089630874e-06, + "loss": 0.8243, + "step": 17743 + }, + { + "epoch": 0.8640225939181457, + "grad_norm": 1.519837737083435, + "learning_rate": 1.9079472967850687e-06, + "loss": 0.8132, + "step": 17744 + }, + { + "epoch": 0.8640712877072529, + "grad_norm": 1.2431280612945557, + "learning_rate": 1.9066029539807296e-06, + "loss": 0.7234, + "step": 17745 + }, + { + "epoch": 0.8641199814963602, + "grad_norm": 1.282690405845642, + "learning_rate": 1.9052590612512834e-06, + "loss": 0.739, + "step": 17746 + }, + { + "epoch": 0.8641686752854674, + "grad_norm": 1.4064172506332397, + "learning_rate": 1.9039156186301722e-06, + "loss": 0.8565, + "step": 17747 + }, + { + "epoch": 0.8642173690745746, + "grad_norm": 1.5804071426391602, + "learning_rate": 1.9025726261508115e-06, + "loss": 0.8287, + "step": 17748 + }, + { + "epoch": 0.8642660628636817, + "grad_norm": 1.8066177368164062, + "learning_rate": 1.9012300838466012e-06, + "loss": 0.8373, + "step": 17749 + }, + { + "epoch": 0.8643147566527889, + "grad_norm": 1.899060606956482, + "learning_rate": 1.8998879917509506e-06, + "loss": 0.8448, + "step": 17750 + }, + { + "epoch": 0.8643634504418961, + "grad_norm": 1.6489691734313965, + "learning_rate": 1.89854634989723e-06, + "loss": 0.8098, + "step": 17751 + }, + { + "epoch": 0.8644121442310033, + "grad_norm": 1.6762630939483643, + "learning_rate": 1.8972051583188244e-06, + "loss": 0.8817, + "step": 17752 + }, + { + "epoch": 0.8644608380201105, + "grad_norm": 1.4937018156051636, + "learning_rate": 1.895864417049087e-06, + "loss": 0.8462, + "step": 17753 + }, + { + "epoch": 0.8645095318092177, + "grad_norm": 1.2992465496063232, + "learning_rate": 1.8945241261213731e-06, + "loss": 0.7869, + "step": 17754 + }, + { + "epoch": 0.864558225598325, + "grad_norm": 0.09267491102218628, + "learning_rate": 1.8931842855690253e-06, + "loss": 0.5735, + "step": 17755 + }, + { + "epoch": 0.8646069193874322, + "grad_norm": 2.51835560798645, + "learning_rate": 1.8918448954253676e-06, + "loss": 0.8441, + "step": 17756 + }, + { + "epoch": 0.8646556131765394, + "grad_norm": 1.3659919500350952, + "learning_rate": 1.8905059557237205e-06, + "loss": 0.7788, + "step": 17757 + }, + { + "epoch": 0.8647043069656465, + "grad_norm": 1.1467876434326172, + "learning_rate": 1.8891674664973837e-06, + "loss": 0.6886, + "step": 17758 + }, + { + "epoch": 0.8647530007547537, + "grad_norm": 1.5100245475769043, + "learning_rate": 1.887829427779666e-06, + "loss": 0.806, + "step": 17759 + }, + { + "epoch": 0.8648016945438609, + "grad_norm": 1.6206351518630981, + "learning_rate": 1.8864918396038345e-06, + "loss": 0.7789, + "step": 17760 + }, + { + "epoch": 0.8648503883329681, + "grad_norm": 2.0445990562438965, + "learning_rate": 1.885154702003178e-06, + "loss": 0.7857, + "step": 17761 + }, + { + "epoch": 0.8648990821220753, + "grad_norm": 1.4374561309814453, + "learning_rate": 1.8838180150109431e-06, + "loss": 0.8246, + "step": 17762 + }, + { + "epoch": 0.8649477759111825, + "grad_norm": 1.4297336339950562, + "learning_rate": 1.88248177866039e-06, + "loss": 0.7816, + "step": 17763 + }, + { + "epoch": 0.8649964697002898, + "grad_norm": 1.659514307975769, + "learning_rate": 1.8811459929847542e-06, + "loss": 0.8517, + "step": 17764 + }, + { + "epoch": 0.865045163489397, + "grad_norm": 1.8148143291473389, + "learning_rate": 1.8798106580172626e-06, + "loss": 0.7678, + "step": 17765 + }, + { + "epoch": 0.8650938572785041, + "grad_norm": 1.2789926528930664, + "learning_rate": 1.8784757737911351e-06, + "loss": 0.8526, + "step": 17766 + }, + { + "epoch": 0.8651425510676113, + "grad_norm": 2.010700225830078, + "learning_rate": 1.87714134033957e-06, + "loss": 0.7896, + "step": 17767 + }, + { + "epoch": 0.8651912448567185, + "grad_norm": 1.9540983438491821, + "learning_rate": 1.8758073576957736e-06, + "loss": 0.7742, + "step": 17768 + }, + { + "epoch": 0.8652399386458257, + "grad_norm": 2.64228892326355, + "learning_rate": 1.874473825892915e-06, + "loss": 0.7821, + "step": 17769 + }, + { + "epoch": 0.8652886324349329, + "grad_norm": 1.2115360498428345, + "learning_rate": 1.873140744964177e-06, + "loss": 0.8088, + "step": 17770 + }, + { + "epoch": 0.8653373262240401, + "grad_norm": 1.5245412588119507, + "learning_rate": 1.8718081149427147e-06, + "loss": 0.8094, + "step": 17771 + }, + { + "epoch": 0.8653860200131474, + "grad_norm": 1.4433565139770508, + "learning_rate": 1.8704759358616798e-06, + "loss": 0.8342, + "step": 17772 + }, + { + "epoch": 0.8654347138022546, + "grad_norm": 1.3884143829345703, + "learning_rate": 1.8691442077542076e-06, + "loss": 0.8646, + "step": 17773 + }, + { + "epoch": 0.8654834075913618, + "grad_norm": 1.4945780038833618, + "learning_rate": 1.8678129306534275e-06, + "loss": 0.7472, + "step": 17774 + }, + { + "epoch": 0.8655321013804689, + "grad_norm": 1.2830288410186768, + "learning_rate": 1.866482104592453e-06, + "loss": 0.8617, + "step": 17775 + }, + { + "epoch": 0.8655807951695761, + "grad_norm": 0.09928689897060394, + "learning_rate": 1.8651517296043865e-06, + "loss": 0.6776, + "step": 17776 + }, + { + "epoch": 0.8656294889586833, + "grad_norm": 1.3332529067993164, + "learning_rate": 1.8638218057223256e-06, + "loss": 0.8084, + "step": 17777 + }, + { + "epoch": 0.8656781827477905, + "grad_norm": 1.3703012466430664, + "learning_rate": 1.8624923329793531e-06, + "loss": 0.817, + "step": 17778 + }, + { + "epoch": 0.8657268765368977, + "grad_norm": 2.0809714794158936, + "learning_rate": 1.861163311408538e-06, + "loss": 0.8053, + "step": 17779 + }, + { + "epoch": 0.8657755703260049, + "grad_norm": 2.3133370876312256, + "learning_rate": 1.8598347410429385e-06, + "loss": 0.8211, + "step": 17780 + }, + { + "epoch": 0.8658242641151122, + "grad_norm": 1.9663746356964111, + "learning_rate": 1.8585066219156033e-06, + "loss": 0.7377, + "step": 17781 + }, + { + "epoch": 0.8658729579042194, + "grad_norm": 1.4071539640426636, + "learning_rate": 1.8571789540595708e-06, + "loss": 0.8054, + "step": 17782 + }, + { + "epoch": 0.8659216516933265, + "grad_norm": 1.362849473953247, + "learning_rate": 1.8558517375078655e-06, + "loss": 0.867, + "step": 17783 + }, + { + "epoch": 0.8659703454824337, + "grad_norm": 2.286353826522827, + "learning_rate": 1.8545249722934988e-06, + "loss": 0.8133, + "step": 17784 + }, + { + "epoch": 0.8660190392715409, + "grad_norm": 1.361587643623352, + "learning_rate": 1.8531986584494843e-06, + "loss": 0.8519, + "step": 17785 + }, + { + "epoch": 0.8660677330606481, + "grad_norm": 3.0549263954162598, + "learning_rate": 1.8518727960088024e-06, + "loss": 0.7984, + "step": 17786 + }, + { + "epoch": 0.8661164268497553, + "grad_norm": 1.8166484832763672, + "learning_rate": 1.850547385004442e-06, + "loss": 0.8054, + "step": 17787 + }, + { + "epoch": 0.8661651206388625, + "grad_norm": 1.8852958679199219, + "learning_rate": 1.8492224254693703e-06, + "loss": 0.8217, + "step": 17788 + }, + { + "epoch": 0.8662138144279697, + "grad_norm": 1.4599298238754272, + "learning_rate": 1.8478979174365452e-06, + "loss": 0.8168, + "step": 17789 + }, + { + "epoch": 0.866262508217077, + "grad_norm": 1.3930851221084595, + "learning_rate": 1.8465738609389162e-06, + "loss": 0.7106, + "step": 17790 + }, + { + "epoch": 0.8663112020061842, + "grad_norm": 1.4027143716812134, + "learning_rate": 1.8452502560094143e-06, + "loss": 0.8334, + "step": 17791 + }, + { + "epoch": 0.8663598957952913, + "grad_norm": 1.2335227727890015, + "learning_rate": 1.8439271026809714e-06, + "loss": 0.8828, + "step": 17792 + }, + { + "epoch": 0.8664085895843985, + "grad_norm": 1.6600441932678223, + "learning_rate": 1.8426044009864895e-06, + "loss": 0.7926, + "step": 17793 + }, + { + "epoch": 0.8664572833735057, + "grad_norm": 0.09866160899400711, + "learning_rate": 1.8412821509588897e-06, + "loss": 0.6812, + "step": 17794 + }, + { + "epoch": 0.8665059771626129, + "grad_norm": 2.877275228500366, + "learning_rate": 1.8399603526310429e-06, + "loss": 0.7427, + "step": 17795 + }, + { + "epoch": 0.8665546709517201, + "grad_norm": 1.4510927200317383, + "learning_rate": 1.8386390060358427e-06, + "loss": 0.8305, + "step": 17796 + }, + { + "epoch": 0.8666033647408273, + "grad_norm": 1.8048585653305054, + "learning_rate": 1.8373181112061523e-06, + "loss": 0.8655, + "step": 17797 + }, + { + "epoch": 0.8666520585299345, + "grad_norm": 1.9148612022399902, + "learning_rate": 1.8359976681748292e-06, + "loss": 0.7666, + "step": 17798 + }, + { + "epoch": 0.8667007523190418, + "grad_norm": 1.795774221420288, + "learning_rate": 1.834677676974721e-06, + "loss": 0.8689, + "step": 17799 + }, + { + "epoch": 0.8667494461081489, + "grad_norm": 1.3759746551513672, + "learning_rate": 1.8333581376386588e-06, + "loss": 0.8157, + "step": 17800 + }, + { + "epoch": 0.8667981398972561, + "grad_norm": 2.0489275455474854, + "learning_rate": 1.8320390501994767e-06, + "loss": 0.7449, + "step": 17801 + }, + { + "epoch": 0.8668468336863633, + "grad_norm": 1.359344244003296, + "learning_rate": 1.8307204146899704e-06, + "loss": 0.7047, + "step": 17802 + }, + { + "epoch": 0.8668955274754705, + "grad_norm": 1.5008646249771118, + "learning_rate": 1.8294022311429605e-06, + "loss": 0.7391, + "step": 17803 + }, + { + "epoch": 0.8669442212645777, + "grad_norm": 1.5054640769958496, + "learning_rate": 1.8280844995912183e-06, + "loss": 0.8245, + "step": 17804 + }, + { + "epoch": 0.8669929150536849, + "grad_norm": 1.6537877321243286, + "learning_rate": 1.8267672200675334e-06, + "loss": 0.8081, + "step": 17805 + }, + { + "epoch": 0.8670416088427921, + "grad_norm": 1.372012972831726, + "learning_rate": 1.8254503926046729e-06, + "loss": 0.8436, + "step": 17806 + }, + { + "epoch": 0.8670903026318993, + "grad_norm": 1.242270588874817, + "learning_rate": 1.824134017235386e-06, + "loss": 0.7488, + "step": 17807 + }, + { + "epoch": 0.8671389964210064, + "grad_norm": 1.710976243019104, + "learning_rate": 1.822818093992431e-06, + "loss": 0.7369, + "step": 17808 + }, + { + "epoch": 0.8671876902101137, + "grad_norm": 1.4907236099243164, + "learning_rate": 1.8215026229085242e-06, + "loss": 0.7825, + "step": 17809 + }, + { + "epoch": 0.8672363839992209, + "grad_norm": 1.820112705230713, + "learning_rate": 1.8201876040164056e-06, + "loss": 0.8711, + "step": 17810 + }, + { + "epoch": 0.8672850777883281, + "grad_norm": 1.202116847038269, + "learning_rate": 1.8188730373487717e-06, + "loss": 0.7618, + "step": 17811 + }, + { + "epoch": 0.8673337715774353, + "grad_norm": 2.709164619445801, + "learning_rate": 1.8175589229383294e-06, + "loss": 0.8474, + "step": 17812 + }, + { + "epoch": 0.8673824653665425, + "grad_norm": 1.5515071153640747, + "learning_rate": 1.8162452608177683e-06, + "loss": 0.8024, + "step": 17813 + }, + { + "epoch": 0.8674311591556497, + "grad_norm": 2.1526408195495605, + "learning_rate": 1.8149320510197643e-06, + "loss": 0.8222, + "step": 17814 + }, + { + "epoch": 0.8674798529447569, + "grad_norm": 1.3914397954940796, + "learning_rate": 1.8136192935769847e-06, + "loss": 0.7815, + "step": 17815 + }, + { + "epoch": 0.8675285467338641, + "grad_norm": 1.5559111833572388, + "learning_rate": 1.812306988522079e-06, + "loss": 0.8086, + "step": 17816 + }, + { + "epoch": 0.8675772405229712, + "grad_norm": 4.115781784057617, + "learning_rate": 1.8109951358877008e-06, + "loss": 0.7829, + "step": 17817 + }, + { + "epoch": 0.8676259343120785, + "grad_norm": 1.3765718936920166, + "learning_rate": 1.8096837357064712e-06, + "loss": 0.8127, + "step": 17818 + }, + { + "epoch": 0.8676746281011857, + "grad_norm": 1.2133921384811401, + "learning_rate": 1.8083727880110257e-06, + "loss": 0.7769, + "step": 17819 + }, + { + "epoch": 0.8677233218902929, + "grad_norm": 1.5066003799438477, + "learning_rate": 1.8070622928339565e-06, + "loss": 0.7906, + "step": 17820 + }, + { + "epoch": 0.8677720156794001, + "grad_norm": 1.3214709758758545, + "learning_rate": 1.8057522502078773e-06, + "loss": 0.7791, + "step": 17821 + }, + { + "epoch": 0.8678207094685073, + "grad_norm": 1.584867238998413, + "learning_rate": 1.8044426601653664e-06, + "loss": 0.7944, + "step": 17822 + }, + { + "epoch": 0.8678694032576145, + "grad_norm": 1.486480712890625, + "learning_rate": 1.8031335227390068e-06, + "loss": 0.8183, + "step": 17823 + }, + { + "epoch": 0.8679180970467217, + "grad_norm": 0.09387598186731339, + "learning_rate": 1.8018248379613568e-06, + "loss": 0.5517, + "step": 17824 + }, + { + "epoch": 0.8679667908358288, + "grad_norm": 0.09998574107885361, + "learning_rate": 1.8005166058649726e-06, + "loss": 0.652, + "step": 17825 + }, + { + "epoch": 0.868015484624936, + "grad_norm": 1.3744001388549805, + "learning_rate": 1.7992088264824038e-06, + "loss": 0.8339, + "step": 17826 + }, + { + "epoch": 0.8680641784140433, + "grad_norm": 1.4924119710922241, + "learning_rate": 1.7979014998461664e-06, + "loss": 0.8394, + "step": 17827 + }, + { + "epoch": 0.8681128722031505, + "grad_norm": 1.4632490873336792, + "learning_rate": 1.7965946259887923e-06, + "loss": 0.794, + "step": 17828 + }, + { + "epoch": 0.8681615659922577, + "grad_norm": 1.4993382692337036, + "learning_rate": 1.7952882049427866e-06, + "loss": 0.7066, + "step": 17829 + }, + { + "epoch": 0.8682102597813649, + "grad_norm": 1.6936416625976562, + "learning_rate": 1.7939822367406478e-06, + "loss": 0.8183, + "step": 17830 + }, + { + "epoch": 0.8682589535704721, + "grad_norm": 1.577105164527893, + "learning_rate": 1.792676721414861e-06, + "loss": 0.785, + "step": 17831 + }, + { + "epoch": 0.8683076473595793, + "grad_norm": 1.5325897932052612, + "learning_rate": 1.7913716589979001e-06, + "loss": 0.8547, + "step": 17832 + }, + { + "epoch": 0.8683563411486865, + "grad_norm": 0.09630583226680756, + "learning_rate": 1.7900670495222282e-06, + "loss": 0.6058, + "step": 17833 + }, + { + "epoch": 0.8684050349377936, + "grad_norm": 1.5628043413162231, + "learning_rate": 1.7887628930203016e-06, + "loss": 0.7381, + "step": 17834 + }, + { + "epoch": 0.8684537287269009, + "grad_norm": 2.013397455215454, + "learning_rate": 1.787459189524552e-06, + "loss": 0.8013, + "step": 17835 + }, + { + "epoch": 0.8685024225160081, + "grad_norm": 1.4852995872497559, + "learning_rate": 1.7861559390674199e-06, + "loss": 0.8235, + "step": 17836 + }, + { + "epoch": 0.8685511163051153, + "grad_norm": 1.5529778003692627, + "learning_rate": 1.7848531416813196e-06, + "loss": 0.7742, + "step": 17837 + }, + { + "epoch": 0.8685998100942225, + "grad_norm": 0.09794574975967407, + "learning_rate": 1.7835507973986587e-06, + "loss": 0.6239, + "step": 17838 + }, + { + "epoch": 0.8686485038833297, + "grad_norm": 1.1166800260543823, + "learning_rate": 1.7822489062518333e-06, + "loss": 0.7542, + "step": 17839 + }, + { + "epoch": 0.8686971976724369, + "grad_norm": 1.6363762617111206, + "learning_rate": 1.7809474682732264e-06, + "loss": 0.8531, + "step": 17840 + }, + { + "epoch": 0.8687458914615441, + "grad_norm": 0.09934067726135254, + "learning_rate": 1.7796464834952121e-06, + "loss": 0.6284, + "step": 17841 + }, + { + "epoch": 0.8687945852506512, + "grad_norm": 3.893376588821411, + "learning_rate": 1.7783459519501534e-06, + "loss": 0.7883, + "step": 17842 + }, + { + "epoch": 0.8688432790397584, + "grad_norm": 2.048215627670288, + "learning_rate": 1.7770458736704022e-06, + "loss": 0.793, + "step": 17843 + }, + { + "epoch": 0.8688919728288657, + "grad_norm": 1.295217752456665, + "learning_rate": 1.7757462486882903e-06, + "loss": 0.7636, + "step": 17844 + }, + { + "epoch": 0.8689406666179729, + "grad_norm": 1.5364645719528198, + "learning_rate": 1.7744470770361565e-06, + "loss": 0.7346, + "step": 17845 + }, + { + "epoch": 0.8689893604070801, + "grad_norm": 1.6874749660491943, + "learning_rate": 1.7731483587463128e-06, + "loss": 0.8917, + "step": 17846 + }, + { + "epoch": 0.8690380541961873, + "grad_norm": 1.460517406463623, + "learning_rate": 1.7718500938510664e-06, + "loss": 0.7981, + "step": 17847 + }, + { + "epoch": 0.8690867479852945, + "grad_norm": 1.7200454473495483, + "learning_rate": 1.7705522823827093e-06, + "loss": 0.8797, + "step": 17848 + }, + { + "epoch": 0.8691354417744017, + "grad_norm": 1.5476011037826538, + "learning_rate": 1.7692549243735292e-06, + "loss": 0.8306, + "step": 17849 + }, + { + "epoch": 0.8691841355635088, + "grad_norm": 1.4835726022720337, + "learning_rate": 1.7679580198557933e-06, + "loss": 0.7518, + "step": 17850 + }, + { + "epoch": 0.869232829352616, + "grad_norm": 1.393892526626587, + "learning_rate": 1.7666615688617606e-06, + "loss": 0.7955, + "step": 17851 + }, + { + "epoch": 0.8692815231417232, + "grad_norm": 0.10118526220321655, + "learning_rate": 1.7653655714236895e-06, + "loss": 0.6776, + "step": 17852 + }, + { + "epoch": 0.8693302169308305, + "grad_norm": 1.7678831815719604, + "learning_rate": 1.7640700275738076e-06, + "loss": 0.7749, + "step": 17853 + }, + { + "epoch": 0.8693789107199377, + "grad_norm": 1.9489330053329468, + "learning_rate": 1.7627749373443514e-06, + "loss": 0.7625, + "step": 17854 + }, + { + "epoch": 0.8694276045090449, + "grad_norm": 1.686409592628479, + "learning_rate": 1.7614803007675263e-06, + "loss": 0.7664, + "step": 17855 + }, + { + "epoch": 0.8694762982981521, + "grad_norm": 1.5129170417785645, + "learning_rate": 1.7601861178755441e-06, + "loss": 0.9536, + "step": 17856 + }, + { + "epoch": 0.8695249920872593, + "grad_norm": 2.0343408584594727, + "learning_rate": 1.7588923887005948e-06, + "loss": 0.8704, + "step": 17857 + }, + { + "epoch": 0.8695736858763665, + "grad_norm": 1.33561110496521, + "learning_rate": 1.7575991132748594e-06, + "loss": 0.8766, + "step": 17858 + }, + { + "epoch": 0.8696223796654736, + "grad_norm": 1.7036226987838745, + "learning_rate": 1.756306291630514e-06, + "loss": 0.8369, + "step": 17859 + }, + { + "epoch": 0.8696710734545808, + "grad_norm": 1.440208911895752, + "learning_rate": 1.7550139237997066e-06, + "loss": 0.7726, + "step": 17860 + }, + { + "epoch": 0.869719767243688, + "grad_norm": 1.9141499996185303, + "learning_rate": 1.7537220098145979e-06, + "loss": 0.8084, + "step": 17861 + }, + { + "epoch": 0.8697684610327953, + "grad_norm": 1.3572849035263062, + "learning_rate": 1.7524305497073114e-06, + "loss": 0.8927, + "step": 17862 + }, + { + "epoch": 0.8698171548219025, + "grad_norm": 0.10524780303239822, + "learning_rate": 1.7511395435099831e-06, + "loss": 0.6266, + "step": 17863 + }, + { + "epoch": 0.8698658486110097, + "grad_norm": 2.4666130542755127, + "learning_rate": 1.7498489912547211e-06, + "loss": 0.8867, + "step": 17864 + }, + { + "epoch": 0.8699145424001169, + "grad_norm": 1.434712290763855, + "learning_rate": 1.7485588929736307e-06, + "loss": 0.7906, + "step": 17865 + }, + { + "epoch": 0.8699632361892241, + "grad_norm": 1.7123972177505493, + "learning_rate": 1.7472692486987997e-06, + "loss": 0.8191, + "step": 17866 + }, + { + "epoch": 0.8700119299783312, + "grad_norm": 2.126833200454712, + "learning_rate": 1.7459800584623089e-06, + "loss": 0.8085, + "step": 17867 + }, + { + "epoch": 0.8700606237674384, + "grad_norm": 1.575976014137268, + "learning_rate": 1.7446913222962348e-06, + "loss": 0.8197, + "step": 17868 + }, + { + "epoch": 0.8701093175565456, + "grad_norm": 1.4743901491165161, + "learning_rate": 1.743403040232623e-06, + "loss": 0.8352, + "step": 17869 + }, + { + "epoch": 0.8701580113456528, + "grad_norm": 2.7521603107452393, + "learning_rate": 1.7421152123035302e-06, + "loss": 0.7545, + "step": 17870 + }, + { + "epoch": 0.8702067051347601, + "grad_norm": 1.3300838470458984, + "learning_rate": 1.7408278385409793e-06, + "loss": 0.8104, + "step": 17871 + }, + { + "epoch": 0.8702553989238673, + "grad_norm": 1.8092347383499146, + "learning_rate": 1.739540918977005e-06, + "loss": 0.8878, + "step": 17872 + }, + { + "epoch": 0.8703040927129745, + "grad_norm": 1.535353660583496, + "learning_rate": 1.7382544536436152e-06, + "loss": 0.9023, + "step": 17873 + }, + { + "epoch": 0.8703527865020817, + "grad_norm": 1.5933806896209717, + "learning_rate": 1.7369684425728106e-06, + "loss": 0.7862, + "step": 17874 + }, + { + "epoch": 0.8704014802911889, + "grad_norm": 1.3512134552001953, + "learning_rate": 1.7356828857965814e-06, + "loss": 0.8695, + "step": 17875 + }, + { + "epoch": 0.870450174080296, + "grad_norm": 1.3727562427520752, + "learning_rate": 1.7343977833469017e-06, + "loss": 0.7388, + "step": 17876 + }, + { + "epoch": 0.8704988678694032, + "grad_norm": 1.5686545372009277, + "learning_rate": 1.7331131352557483e-06, + "loss": 0.8571, + "step": 17877 + }, + { + "epoch": 0.8705475616585104, + "grad_norm": 1.7113466262817383, + "learning_rate": 1.7318289415550672e-06, + "loss": 0.8457, + "step": 17878 + }, + { + "epoch": 0.8705962554476177, + "grad_norm": 1.8679755926132202, + "learning_rate": 1.7305452022768077e-06, + "loss": 0.7793, + "step": 17879 + }, + { + "epoch": 0.8706449492367249, + "grad_norm": 2.8249006271362305, + "learning_rate": 1.7292619174529046e-06, + "loss": 0.9071, + "step": 17880 + }, + { + "epoch": 0.8706936430258321, + "grad_norm": 1.594911813735962, + "learning_rate": 1.7279790871152747e-06, + "loss": 0.8509, + "step": 17881 + }, + { + "epoch": 0.8707423368149393, + "grad_norm": 1.646100640296936, + "learning_rate": 1.7266967112958321e-06, + "loss": 0.8412, + "step": 17882 + }, + { + "epoch": 0.8707910306040465, + "grad_norm": 1.9071952104568481, + "learning_rate": 1.725414790026474e-06, + "loss": 0.7997, + "step": 17883 + }, + { + "epoch": 0.8708397243931536, + "grad_norm": 1.7729424238204956, + "learning_rate": 1.72413332333909e-06, + "loss": 0.8364, + "step": 17884 + }, + { + "epoch": 0.8708884181822608, + "grad_norm": 1.2504585981369019, + "learning_rate": 1.7228523112655526e-06, + "loss": 0.8101, + "step": 17885 + }, + { + "epoch": 0.870937111971368, + "grad_norm": 1.149112582206726, + "learning_rate": 1.721571753837734e-06, + "loss": 0.724, + "step": 17886 + }, + { + "epoch": 0.8709858057604752, + "grad_norm": 1.3345929384231567, + "learning_rate": 1.7202916510874845e-06, + "loss": 0.7872, + "step": 17887 + }, + { + "epoch": 0.8710344995495825, + "grad_norm": 1.8071717023849487, + "learning_rate": 1.7190120030466474e-06, + "loss": 0.8331, + "step": 17888 + }, + { + "epoch": 0.8710831933386897, + "grad_norm": 1.7633883953094482, + "learning_rate": 1.7177328097470526e-06, + "loss": 0.9288, + "step": 17889 + }, + { + "epoch": 0.8711318871277969, + "grad_norm": 1.9092473983764648, + "learning_rate": 1.7164540712205214e-06, + "loss": 0.8822, + "step": 17890 + }, + { + "epoch": 0.8711805809169041, + "grad_norm": 1.5265599489212036, + "learning_rate": 1.7151757874988618e-06, + "loss": 0.8176, + "step": 17891 + }, + { + "epoch": 0.8712292747060113, + "grad_norm": 1.737284541130066, + "learning_rate": 1.7138979586138704e-06, + "loss": 0.7201, + "step": 17892 + }, + { + "epoch": 0.8712779684951184, + "grad_norm": 1.745750069618225, + "learning_rate": 1.7126205845973332e-06, + "loss": 0.6773, + "step": 17893 + }, + { + "epoch": 0.8713266622842256, + "grad_norm": 1.8285186290740967, + "learning_rate": 1.7113436654810289e-06, + "loss": 0.8342, + "step": 17894 + }, + { + "epoch": 0.8713753560733328, + "grad_norm": 1.438493251800537, + "learning_rate": 1.710067201296719e-06, + "loss": 0.8057, + "step": 17895 + }, + { + "epoch": 0.87142404986244, + "grad_norm": 1.3739742040634155, + "learning_rate": 1.7087911920761558e-06, + "loss": 0.7904, + "step": 17896 + }, + { + "epoch": 0.8714727436515473, + "grad_norm": 1.4645029306411743, + "learning_rate": 1.7075156378510782e-06, + "loss": 0.8018, + "step": 17897 + }, + { + "epoch": 0.8715214374406545, + "grad_norm": 1.319581151008606, + "learning_rate": 1.7062405386532165e-06, + "loss": 0.794, + "step": 17898 + }, + { + "epoch": 0.8715701312297617, + "grad_norm": 0.10955887287855148, + "learning_rate": 1.70496589451429e-06, + "loss": 0.6055, + "step": 17899 + }, + { + "epoch": 0.8716188250188689, + "grad_norm": 1.8011198043823242, + "learning_rate": 1.703691705466004e-06, + "loss": 0.7717, + "step": 17900 + }, + { + "epoch": 0.871667518807976, + "grad_norm": 2.397902488708496, + "learning_rate": 1.7024179715400558e-06, + "loss": 0.7393, + "step": 17901 + }, + { + "epoch": 0.8717162125970832, + "grad_norm": 1.5797241926193237, + "learning_rate": 1.7011446927681263e-06, + "loss": 0.7666, + "step": 17902 + }, + { + "epoch": 0.8717649063861904, + "grad_norm": 1.8540884256362915, + "learning_rate": 1.699871869181897e-06, + "loss": 0.8513, + "step": 17903 + }, + { + "epoch": 0.8718136001752976, + "grad_norm": 0.11002042889595032, + "learning_rate": 1.6985995008130163e-06, + "loss": 0.629, + "step": 17904 + }, + { + "epoch": 0.8718622939644048, + "grad_norm": 1.6802237033843994, + "learning_rate": 1.6973275876931472e-06, + "loss": 0.7077, + "step": 17905 + }, + { + "epoch": 0.8719109877535121, + "grad_norm": 1.603552222251892, + "learning_rate": 1.6960561298539201e-06, + "loss": 0.7675, + "step": 17906 + }, + { + "epoch": 0.8719596815426193, + "grad_norm": 1.424705982208252, + "learning_rate": 1.6947851273269678e-06, + "loss": 0.812, + "step": 17907 + }, + { + "epoch": 0.8720083753317265, + "grad_norm": 1.3263381719589233, + "learning_rate": 1.6935145801439023e-06, + "loss": 0.8342, + "step": 17908 + }, + { + "epoch": 0.8720570691208336, + "grad_norm": 1.8028175830841064, + "learning_rate": 1.6922444883363298e-06, + "loss": 0.8005, + "step": 17909 + }, + { + "epoch": 0.8721057629099408, + "grad_norm": 1.5525504350662231, + "learning_rate": 1.6909748519358515e-06, + "loss": 0.8883, + "step": 17910 + }, + { + "epoch": 0.872154456699048, + "grad_norm": 1.4217305183410645, + "learning_rate": 1.6897056709740334e-06, + "loss": 0.7844, + "step": 17911 + }, + { + "epoch": 0.8722031504881552, + "grad_norm": 1.3307340145111084, + "learning_rate": 1.6884369454824656e-06, + "loss": 0.8418, + "step": 17912 + }, + { + "epoch": 0.8722518442772624, + "grad_norm": 0.09197398275136948, + "learning_rate": 1.6871686754926919e-06, + "loss": 0.5727, + "step": 17913 + }, + { + "epoch": 0.8723005380663696, + "grad_norm": 1.3422971963882446, + "learning_rate": 1.6859008610362692e-06, + "loss": 0.8303, + "step": 17914 + }, + { + "epoch": 0.8723492318554769, + "grad_norm": 1.9351410865783691, + "learning_rate": 1.6846335021447324e-06, + "loss": 0.8325, + "step": 17915 + }, + { + "epoch": 0.8723979256445841, + "grad_norm": 1.8826557397842407, + "learning_rate": 1.683366598849605e-06, + "loss": 0.8611, + "step": 17916 + }, + { + "epoch": 0.8724466194336913, + "grad_norm": 1.6489769220352173, + "learning_rate": 1.6821001511824087e-06, + "loss": 0.8068, + "step": 17917 + }, + { + "epoch": 0.8724953132227984, + "grad_norm": 1.3826180696487427, + "learning_rate": 1.6808341591746357e-06, + "loss": 0.8105, + "step": 17918 + }, + { + "epoch": 0.8725440070119056, + "grad_norm": 1.3142699003219604, + "learning_rate": 1.6795686228577901e-06, + "loss": 0.7777, + "step": 17919 + }, + { + "epoch": 0.8725927008010128, + "grad_norm": 1.5023528337478638, + "learning_rate": 1.6783035422633375e-06, + "loss": 0.7076, + "step": 17920 + }, + { + "epoch": 0.87264139459012, + "grad_norm": 1.8833141326904297, + "learning_rate": 1.6770389174227597e-06, + "loss": 0.9196, + "step": 17921 + }, + { + "epoch": 0.8726900883792272, + "grad_norm": 1.4073028564453125, + "learning_rate": 1.6757747483675068e-06, + "loss": 0.8424, + "step": 17922 + }, + { + "epoch": 0.8727387821683344, + "grad_norm": 1.994936466217041, + "learning_rate": 1.6745110351290295e-06, + "loss": 0.8369, + "step": 17923 + }, + { + "epoch": 0.8727874759574417, + "grad_norm": 1.5758934020996094, + "learning_rate": 1.6732477777387624e-06, + "loss": 0.7658, + "step": 17924 + }, + { + "epoch": 0.8728361697465489, + "grad_norm": 1.7587276697158813, + "learning_rate": 1.6719849762281227e-06, + "loss": 0.7148, + "step": 17925 + }, + { + "epoch": 0.872884863535656, + "grad_norm": 1.6579967737197876, + "learning_rate": 1.6707226306285341e-06, + "loss": 0.8194, + "step": 17926 + }, + { + "epoch": 0.8729335573247632, + "grad_norm": 1.7001219987869263, + "learning_rate": 1.669460740971387e-06, + "loss": 0.8448, + "step": 17927 + }, + { + "epoch": 0.8729822511138704, + "grad_norm": 1.6516765356063843, + "learning_rate": 1.6681993072880788e-06, + "loss": 0.8002, + "step": 17928 + }, + { + "epoch": 0.8730309449029776, + "grad_norm": 1.3760908842086792, + "learning_rate": 1.6669383296099794e-06, + "loss": 0.7953, + "step": 17929 + }, + { + "epoch": 0.8730796386920848, + "grad_norm": 1.8167204856872559, + "learning_rate": 1.6656778079684643e-06, + "loss": 0.8017, + "step": 17930 + }, + { + "epoch": 0.873128332481192, + "grad_norm": 2.1801700592041016, + "learning_rate": 1.6644177423948837e-06, + "loss": 0.9068, + "step": 17931 + }, + { + "epoch": 0.8731770262702993, + "grad_norm": 0.0959765613079071, + "learning_rate": 1.6631581329205859e-06, + "loss": 0.6236, + "step": 17932 + }, + { + "epoch": 0.8732257200594065, + "grad_norm": 1.802197813987732, + "learning_rate": 1.6618989795768992e-06, + "loss": 0.7249, + "step": 17933 + }, + { + "epoch": 0.8732744138485137, + "grad_norm": 1.380432367324829, + "learning_rate": 1.6606402823951451e-06, + "loss": 0.9235, + "step": 17934 + }, + { + "epoch": 0.8733231076376208, + "grad_norm": 2.0736026763916016, + "learning_rate": 1.659382041406643e-06, + "loss": 0.773, + "step": 17935 + }, + { + "epoch": 0.873371801426728, + "grad_norm": 1.3410451412200928, + "learning_rate": 1.6581242566426769e-06, + "loss": 0.8023, + "step": 17936 + }, + { + "epoch": 0.8734204952158352, + "grad_norm": 1.7162219285964966, + "learning_rate": 1.6568669281345485e-06, + "loss": 0.7287, + "step": 17937 + }, + { + "epoch": 0.8734691890049424, + "grad_norm": 2.081338405609131, + "learning_rate": 1.6556100559135236e-06, + "loss": 0.781, + "step": 17938 + }, + { + "epoch": 0.8735178827940496, + "grad_norm": 1.4401073455810547, + "learning_rate": 1.654353640010875e-06, + "loss": 0.8358, + "step": 17939 + }, + { + "epoch": 0.8735665765831568, + "grad_norm": 1.8145670890808105, + "learning_rate": 1.6530976804578492e-06, + "loss": 0.8054, + "step": 17940 + }, + { + "epoch": 0.873615270372264, + "grad_norm": 1.2524127960205078, + "learning_rate": 1.651842177285694e-06, + "loss": 0.8502, + "step": 17941 + }, + { + "epoch": 0.8736639641613713, + "grad_norm": 1.2395256757736206, + "learning_rate": 1.650587130525636e-06, + "loss": 0.7569, + "step": 17942 + }, + { + "epoch": 0.8737126579504784, + "grad_norm": 0.09749875962734222, + "learning_rate": 1.6493325402088967e-06, + "loss": 0.5804, + "step": 17943 + }, + { + "epoch": 0.8737613517395856, + "grad_norm": 1.6756727695465088, + "learning_rate": 1.6480784063666799e-06, + "loss": 0.7563, + "step": 17944 + }, + { + "epoch": 0.8738100455286928, + "grad_norm": 1.770960807800293, + "learning_rate": 1.6468247290301898e-06, + "loss": 0.7478, + "step": 17945 + }, + { + "epoch": 0.8738587393178, + "grad_norm": 1.695339560508728, + "learning_rate": 1.6455715082306078e-06, + "loss": 0.8011, + "step": 17946 + }, + { + "epoch": 0.8739074331069072, + "grad_norm": 1.6551035642623901, + "learning_rate": 1.6443187439991071e-06, + "loss": 0.8169, + "step": 17947 + }, + { + "epoch": 0.8739561268960144, + "grad_norm": 1.582154631614685, + "learning_rate": 1.6430664363668514e-06, + "loss": 0.8155, + "step": 17948 + }, + { + "epoch": 0.8740048206851216, + "grad_norm": 1.5786972045898438, + "learning_rate": 1.6418145853649937e-06, + "loss": 0.7938, + "step": 17949 + }, + { + "epoch": 0.8740535144742289, + "grad_norm": 1.6288964748382568, + "learning_rate": 1.6405631910246711e-06, + "loss": 0.7895, + "step": 17950 + }, + { + "epoch": 0.8741022082633361, + "grad_norm": 1.4122087955474854, + "learning_rate": 1.6393122533770122e-06, + "loss": 0.819, + "step": 17951 + }, + { + "epoch": 0.8741509020524432, + "grad_norm": 1.4757447242736816, + "learning_rate": 1.6380617724531345e-06, + "loss": 0.7896, + "step": 17952 + }, + { + "epoch": 0.8741995958415504, + "grad_norm": 1.7345703840255737, + "learning_rate": 1.6368117482841416e-06, + "loss": 0.8089, + "step": 17953 + }, + { + "epoch": 0.8742482896306576, + "grad_norm": 1.4236819744110107, + "learning_rate": 1.6355621809011358e-06, + "loss": 0.8396, + "step": 17954 + }, + { + "epoch": 0.8742969834197648, + "grad_norm": 3.1194710731506348, + "learning_rate": 1.634313070335194e-06, + "loss": 0.7489, + "step": 17955 + }, + { + "epoch": 0.874345677208872, + "grad_norm": 1.349110722541809, + "learning_rate": 1.6330644166173892e-06, + "loss": 0.7566, + "step": 17956 + }, + { + "epoch": 0.8743943709979792, + "grad_norm": 1.9263343811035156, + "learning_rate": 1.6318162197787812e-06, + "loss": 0.852, + "step": 17957 + }, + { + "epoch": 0.8744430647870864, + "grad_norm": 0.21895167231559753, + "learning_rate": 1.6305684798504207e-06, + "loss": 0.5991, + "step": 17958 + }, + { + "epoch": 0.8744917585761937, + "grad_norm": 1.8778102397918701, + "learning_rate": 1.6293211968633426e-06, + "loss": 0.7788, + "step": 17959 + }, + { + "epoch": 0.8745404523653008, + "grad_norm": 1.5254802703857422, + "learning_rate": 1.6280743708485713e-06, + "loss": 0.8822, + "step": 17960 + }, + { + "epoch": 0.874589146154408, + "grad_norm": 1.4397765398025513, + "learning_rate": 1.626828001837133e-06, + "loss": 0.7872, + "step": 17961 + }, + { + "epoch": 0.8746378399435152, + "grad_norm": 6.824172496795654, + "learning_rate": 1.6255820898600161e-06, + "loss": 0.8382, + "step": 17962 + }, + { + "epoch": 0.8746865337326224, + "grad_norm": 1.7871644496917725, + "learning_rate": 1.624336634948227e-06, + "loss": 0.7993, + "step": 17963 + }, + { + "epoch": 0.8747352275217296, + "grad_norm": 1.1546573638916016, + "learning_rate": 1.6230916371327322e-06, + "loss": 0.8032, + "step": 17964 + }, + { + "epoch": 0.8747839213108368, + "grad_norm": 1.865222454071045, + "learning_rate": 1.6218470964445132e-06, + "loss": 0.7712, + "step": 17965 + }, + { + "epoch": 0.874832615099944, + "grad_norm": 1.4235719442367554, + "learning_rate": 1.620603012914521e-06, + "loss": 0.8203, + "step": 17966 + }, + { + "epoch": 0.8748813088890512, + "grad_norm": 1.4558695554733276, + "learning_rate": 1.6193593865737024e-06, + "loss": 0.6671, + "step": 17967 + }, + { + "epoch": 0.8749300026781583, + "grad_norm": 1.4262800216674805, + "learning_rate": 1.618116217453003e-06, + "loss": 0.7895, + "step": 17968 + }, + { + "epoch": 0.8749786964672656, + "grad_norm": 1.6522045135498047, + "learning_rate": 1.6168735055833296e-06, + "loss": 0.8159, + "step": 17969 + }, + { + "epoch": 0.8750273902563728, + "grad_norm": 1.703042984008789, + "learning_rate": 1.615631250995613e-06, + "loss": 0.761, + "step": 17970 + }, + { + "epoch": 0.87507608404548, + "grad_norm": 1.4209150075912476, + "learning_rate": 1.6143894537207373e-06, + "loss": 0.7887, + "step": 17971 + }, + { + "epoch": 0.8751247778345872, + "grad_norm": 2.2343926429748535, + "learning_rate": 1.6131481137896043e-06, + "loss": 0.7462, + "step": 17972 + }, + { + "epoch": 0.8751734716236944, + "grad_norm": 2.2721211910247803, + "learning_rate": 1.6119072312330874e-06, + "loss": 0.904, + "step": 17973 + }, + { + "epoch": 0.8752221654128016, + "grad_norm": 1.7303446531295776, + "learning_rate": 1.6106668060820552e-06, + "loss": 0.7331, + "step": 17974 + }, + { + "epoch": 0.8752708592019088, + "grad_norm": 2.832933187484741, + "learning_rate": 1.6094268383673628e-06, + "loss": 0.7688, + "step": 17975 + }, + { + "epoch": 0.875319552991016, + "grad_norm": 1.2871222496032715, + "learning_rate": 1.6081873281198502e-06, + "loss": 0.7354, + "step": 17976 + }, + { + "epoch": 0.8753682467801231, + "grad_norm": 1.1284745931625366, + "learning_rate": 1.6069482753703636e-06, + "loss": 0.7242, + "step": 17977 + }, + { + "epoch": 0.8754169405692304, + "grad_norm": 1.2751529216766357, + "learning_rate": 1.6057096801497075e-06, + "loss": 0.9051, + "step": 17978 + }, + { + "epoch": 0.8754656343583376, + "grad_norm": 1.6577762365341187, + "learning_rate": 1.6044715424887037e-06, + "loss": 0.8523, + "step": 17979 + }, + { + "epoch": 0.8755143281474448, + "grad_norm": 1.3321077823638916, + "learning_rate": 1.6032338624181475e-06, + "loss": 0.7331, + "step": 17980 + }, + { + "epoch": 0.875563021936552, + "grad_norm": 1.780840277671814, + "learning_rate": 1.6019966399688258e-06, + "loss": 0.8416, + "step": 17981 + }, + { + "epoch": 0.8756117157256592, + "grad_norm": 1.4532129764556885, + "learning_rate": 1.600759875171516e-06, + "loss": 0.694, + "step": 17982 + }, + { + "epoch": 0.8756604095147664, + "grad_norm": 2.935027837753296, + "learning_rate": 1.599523568056982e-06, + "loss": 0.7492, + "step": 17983 + }, + { + "epoch": 0.8757091033038736, + "grad_norm": 1.8072768449783325, + "learning_rate": 1.5982877186559776e-06, + "loss": 0.7745, + "step": 17984 + }, + { + "epoch": 0.8757577970929807, + "grad_norm": 0.10124193876981735, + "learning_rate": 1.5970523269992399e-06, + "loss": 0.61, + "step": 17985 + }, + { + "epoch": 0.875806490882088, + "grad_norm": 1.6862316131591797, + "learning_rate": 1.5958173931175092e-06, + "loss": 0.7844, + "step": 17986 + }, + { + "epoch": 0.8758551846711952, + "grad_norm": 0.10183317214250565, + "learning_rate": 1.594582917041494e-06, + "loss": 0.6084, + "step": 17987 + }, + { + "epoch": 0.8759038784603024, + "grad_norm": 1.432591199874878, + "learning_rate": 1.5933488988019075e-06, + "loss": 0.8096, + "step": 17988 + }, + { + "epoch": 0.8759525722494096, + "grad_norm": 2.5503320693969727, + "learning_rate": 1.5921153384294475e-06, + "loss": 0.8591, + "step": 17989 + }, + { + "epoch": 0.8760012660385168, + "grad_norm": 1.4195399284362793, + "learning_rate": 1.590882235954796e-06, + "loss": 0.774, + "step": 17990 + }, + { + "epoch": 0.876049959827624, + "grad_norm": 1.690500259399414, + "learning_rate": 1.5896495914086286e-06, + "loss": 0.824, + "step": 17991 + }, + { + "epoch": 0.8760986536167312, + "grad_norm": 2.493603467941284, + "learning_rate": 1.588417404821605e-06, + "loss": 0.8948, + "step": 17992 + }, + { + "epoch": 0.8761473474058384, + "grad_norm": 1.360953450202942, + "learning_rate": 1.5871856762243788e-06, + "loss": 0.7481, + "step": 17993 + }, + { + "epoch": 0.8761960411949455, + "grad_norm": 7.9557647705078125, + "learning_rate": 1.5859544056475828e-06, + "loss": 0.8285, + "step": 17994 + }, + { + "epoch": 0.8762447349840528, + "grad_norm": 1.4357248544692993, + "learning_rate": 1.584723593121853e-06, + "loss": 0.7616, + "step": 17995 + }, + { + "epoch": 0.87629342877316, + "grad_norm": 2.103592872619629, + "learning_rate": 1.5834932386778023e-06, + "loss": 0.7564, + "step": 17996 + }, + { + "epoch": 0.8763421225622672, + "grad_norm": 1.8009886741638184, + "learning_rate": 1.5822633423460376e-06, + "loss": 0.8387, + "step": 17997 + }, + { + "epoch": 0.8763908163513744, + "grad_norm": 1.6272804737091064, + "learning_rate": 1.581033904157152e-06, + "loss": 0.7088, + "step": 17998 + }, + { + "epoch": 0.8764395101404816, + "grad_norm": 1.6631697416305542, + "learning_rate": 1.5798049241417278e-06, + "loss": 0.8839, + "step": 17999 + }, + { + "epoch": 0.8764882039295888, + "grad_norm": 1.210755467414856, + "learning_rate": 1.5785764023303341e-06, + "loss": 0.8543, + "step": 18000 + }, + { + "epoch": 0.876536897718696, + "grad_norm": 1.4830446243286133, + "learning_rate": 1.5773483387535327e-06, + "loss": 0.8447, + "step": 18001 + }, + { + "epoch": 0.8765855915078031, + "grad_norm": 1.7778575420379639, + "learning_rate": 1.5761207334418683e-06, + "loss": 0.8214, + "step": 18002 + }, + { + "epoch": 0.8766342852969103, + "grad_norm": 1.5306942462921143, + "learning_rate": 1.574893586425883e-06, + "loss": 0.8131, + "step": 18003 + }, + { + "epoch": 0.8766829790860176, + "grad_norm": 1.3248523473739624, + "learning_rate": 1.5736668977360991e-06, + "loss": 0.8287, + "step": 18004 + }, + { + "epoch": 0.8767316728751248, + "grad_norm": 1.670251488685608, + "learning_rate": 1.5724406674030325e-06, + "loss": 0.7999, + "step": 18005 + }, + { + "epoch": 0.876780366664232, + "grad_norm": 2.0918214321136475, + "learning_rate": 1.5712148954571827e-06, + "loss": 0.9492, + "step": 18006 + }, + { + "epoch": 0.8768290604533392, + "grad_norm": 1.8309741020202637, + "learning_rate": 1.5699895819290433e-06, + "loss": 0.8566, + "step": 18007 + }, + { + "epoch": 0.8768777542424464, + "grad_norm": 2.136864423751831, + "learning_rate": 1.5687647268490947e-06, + "loss": 0.8362, + "step": 18008 + }, + { + "epoch": 0.8769264480315536, + "grad_norm": 1.4588971138000488, + "learning_rate": 1.567540330247803e-06, + "loss": 0.7725, + "step": 18009 + }, + { + "epoch": 0.8769751418206607, + "grad_norm": 1.3249064683914185, + "learning_rate": 1.5663163921556267e-06, + "loss": 0.8569, + "step": 18010 + }, + { + "epoch": 0.8770238356097679, + "grad_norm": 1.3553590774536133, + "learning_rate": 1.5650929126030078e-06, + "loss": 0.7825, + "step": 18011 + }, + { + "epoch": 0.8770725293988751, + "grad_norm": 1.242375135421753, + "learning_rate": 1.5638698916203887e-06, + "loss": 0.7356, + "step": 18012 + }, + { + "epoch": 0.8771212231879824, + "grad_norm": 0.10336074978113174, + "learning_rate": 1.5626473292381805e-06, + "loss": 0.6251, + "step": 18013 + }, + { + "epoch": 0.8771699169770896, + "grad_norm": 1.4905799627304077, + "learning_rate": 1.5614252254868056e-06, + "loss": 0.8075, + "step": 18014 + }, + { + "epoch": 0.8772186107661968, + "grad_norm": 1.4966814517974854, + "learning_rate": 1.5602035803966598e-06, + "loss": 0.8231, + "step": 18015 + }, + { + "epoch": 0.877267304555304, + "grad_norm": 1.336255669593811, + "learning_rate": 1.5589823939981296e-06, + "loss": 0.8203, + "step": 18016 + }, + { + "epoch": 0.8773159983444112, + "grad_norm": 1.3163394927978516, + "learning_rate": 1.5577616663215955e-06, + "loss": 0.7671, + "step": 18017 + }, + { + "epoch": 0.8773646921335184, + "grad_norm": 1.3831881284713745, + "learning_rate": 1.5565413973974153e-06, + "loss": 0.7237, + "step": 18018 + }, + { + "epoch": 0.8774133859226255, + "grad_norm": 1.6988612413406372, + "learning_rate": 1.555321587255958e-06, + "loss": 0.7788, + "step": 18019 + }, + { + "epoch": 0.8774620797117327, + "grad_norm": 1.5216554403305054, + "learning_rate": 1.5541022359275526e-06, + "loss": 0.7846, + "step": 18020 + }, + { + "epoch": 0.87751077350084, + "grad_norm": 2.2147018909454346, + "learning_rate": 1.5528833434425394e-06, + "loss": 0.849, + "step": 18021 + }, + { + "epoch": 0.8775594672899472, + "grad_norm": 1.207949161529541, + "learning_rate": 1.551664909831232e-06, + "loss": 0.8628, + "step": 18022 + }, + { + "epoch": 0.8776081610790544, + "grad_norm": 1.7651020288467407, + "learning_rate": 1.5504469351239416e-06, + "loss": 0.8888, + "step": 18023 + }, + { + "epoch": 0.8776568548681616, + "grad_norm": 1.8041523694992065, + "learning_rate": 1.5492294193509683e-06, + "loss": 0.8835, + "step": 18024 + }, + { + "epoch": 0.8777055486572688, + "grad_norm": 1.2341489791870117, + "learning_rate": 1.5480123625425924e-06, + "loss": 0.8835, + "step": 18025 + }, + { + "epoch": 0.877754242446376, + "grad_norm": 1.5642732381820679, + "learning_rate": 1.5467957647290966e-06, + "loss": 0.7387, + "step": 18026 + }, + { + "epoch": 0.8778029362354831, + "grad_norm": 1.4536890983581543, + "learning_rate": 1.545579625940732e-06, + "loss": 0.7969, + "step": 18027 + }, + { + "epoch": 0.8778516300245903, + "grad_norm": 1.2423052787780762, + "learning_rate": 1.5443639462077654e-06, + "loss": 0.811, + "step": 18028 + }, + { + "epoch": 0.8779003238136975, + "grad_norm": 1.3309108018875122, + "learning_rate": 1.5431487255604194e-06, + "loss": 0.8144, + "step": 18029 + }, + { + "epoch": 0.8779490176028047, + "grad_norm": 2.5029029846191406, + "learning_rate": 1.5419339640289365e-06, + "loss": 0.8084, + "step": 18030 + }, + { + "epoch": 0.877997711391912, + "grad_norm": 2.030634880065918, + "learning_rate": 1.5407196616435282e-06, + "loss": 0.7652, + "step": 18031 + }, + { + "epoch": 0.8780464051810192, + "grad_norm": 1.8157151937484741, + "learning_rate": 1.5395058184344015e-06, + "loss": 0.7846, + "step": 18032 + }, + { + "epoch": 0.8780950989701264, + "grad_norm": 1.3734581470489502, + "learning_rate": 1.5382924344317497e-06, + "loss": 0.8035, + "step": 18033 + }, + { + "epoch": 0.8781437927592336, + "grad_norm": 0.10291777551174164, + "learning_rate": 1.5370795096657554e-06, + "loss": 0.668, + "step": 18034 + }, + { + "epoch": 0.8781924865483408, + "grad_norm": 1.3001971244812012, + "learning_rate": 1.535867044166599e-06, + "loss": 0.8138, + "step": 18035 + }, + { + "epoch": 0.8782411803374479, + "grad_norm": 1.4873018264770508, + "learning_rate": 1.5346550379644253e-06, + "loss": 0.7054, + "step": 18036 + }, + { + "epoch": 0.8782898741265551, + "grad_norm": 1.5295530557632446, + "learning_rate": 1.533443491089397e-06, + "loss": 0.761, + "step": 18037 + }, + { + "epoch": 0.8783385679156623, + "grad_norm": 1.471030592918396, + "learning_rate": 1.5322324035716408e-06, + "loss": 0.774, + "step": 18038 + }, + { + "epoch": 0.8783872617047696, + "grad_norm": 1.5999702215194702, + "learning_rate": 1.5310217754412882e-06, + "loss": 0.8184, + "step": 18039 + }, + { + "epoch": 0.8784359554938768, + "grad_norm": 1.9539310932159424, + "learning_rate": 1.5298116067284552e-06, + "loss": 0.8185, + "step": 18040 + }, + { + "epoch": 0.878484649282984, + "grad_norm": 1.4496514797210693, + "learning_rate": 1.5286018974632422e-06, + "loss": 0.8684, + "step": 18041 + }, + { + "epoch": 0.8785333430720912, + "grad_norm": 1.537773847579956, + "learning_rate": 1.5273926476757405e-06, + "loss": 0.7973, + "step": 18042 + }, + { + "epoch": 0.8785820368611984, + "grad_norm": 1.333335041999817, + "learning_rate": 1.5261838573960263e-06, + "loss": 0.7852, + "step": 18043 + }, + { + "epoch": 0.8786307306503055, + "grad_norm": 1.2852133512496948, + "learning_rate": 1.5249755266541821e-06, + "loss": 0.8261, + "step": 18044 + }, + { + "epoch": 0.8786794244394127, + "grad_norm": 2.0760796070098877, + "learning_rate": 1.5237676554802482e-06, + "loss": 0.748, + "step": 18045 + }, + { + "epoch": 0.8787281182285199, + "grad_norm": 1.8031057119369507, + "learning_rate": 1.522560243904283e-06, + "loss": 0.8792, + "step": 18046 + }, + { + "epoch": 0.8787768120176271, + "grad_norm": 1.3793045282363892, + "learning_rate": 1.5213532919563157e-06, + "loss": 0.8119, + "step": 18047 + }, + { + "epoch": 0.8788255058067344, + "grad_norm": 1.577323079109192, + "learning_rate": 1.5201467996663688e-06, + "loss": 0.7974, + "step": 18048 + }, + { + "epoch": 0.8788741995958416, + "grad_norm": 2.1276633739471436, + "learning_rate": 1.5189407670644585e-06, + "loss": 0.8342, + "step": 18049 + }, + { + "epoch": 0.8789228933849488, + "grad_norm": 1.4600509405136108, + "learning_rate": 1.5177351941805784e-06, + "loss": 0.8573, + "step": 18050 + }, + { + "epoch": 0.878971587174056, + "grad_norm": 1.3198479413986206, + "learning_rate": 1.5165300810447225e-06, + "loss": 0.75, + "step": 18051 + }, + { + "epoch": 0.8790202809631632, + "grad_norm": 1.9519259929656982, + "learning_rate": 1.5153254276868645e-06, + "loss": 0.8655, + "step": 18052 + }, + { + "epoch": 0.8790689747522703, + "grad_norm": 5.718244552612305, + "learning_rate": 1.5141212341369692e-06, + "loss": 0.7595, + "step": 18053 + }, + { + "epoch": 0.8791176685413775, + "grad_norm": 2.09686541557312, + "learning_rate": 1.5129175004249974e-06, + "loss": 0.8804, + "step": 18054 + }, + { + "epoch": 0.8791663623304847, + "grad_norm": 1.3464690446853638, + "learning_rate": 1.5117142265808872e-06, + "loss": 0.778, + "step": 18055 + }, + { + "epoch": 0.8792150561195919, + "grad_norm": 1.6298649311065674, + "learning_rate": 1.5105114126345722e-06, + "loss": 0.7587, + "step": 18056 + }, + { + "epoch": 0.8792637499086992, + "grad_norm": 0.09804892539978027, + "learning_rate": 1.509309058615973e-06, + "loss": 0.6254, + "step": 18057 + }, + { + "epoch": 0.8793124436978064, + "grad_norm": 0.09203719347715378, + "learning_rate": 1.508107164554995e-06, + "loss": 0.6031, + "step": 18058 + }, + { + "epoch": 0.8793611374869136, + "grad_norm": 1.6865566968917847, + "learning_rate": 1.5069057304815382e-06, + "loss": 0.828, + "step": 18059 + }, + { + "epoch": 0.8794098312760208, + "grad_norm": 1.4309476613998413, + "learning_rate": 1.5057047564254855e-06, + "loss": 0.8724, + "step": 18060 + }, + { + "epoch": 0.8794585250651279, + "grad_norm": 1.5622072219848633, + "learning_rate": 1.5045042424167155e-06, + "loss": 0.7568, + "step": 18061 + }, + { + "epoch": 0.8795072188542351, + "grad_norm": 1.3625917434692383, + "learning_rate": 1.5033041884850841e-06, + "loss": 0.8984, + "step": 18062 + }, + { + "epoch": 0.8795559126433423, + "grad_norm": 1.8984562158584595, + "learning_rate": 1.5021045946604518e-06, + "loss": 0.8267, + "step": 18063 + }, + { + "epoch": 0.8796046064324495, + "grad_norm": 1.322862148284912, + "learning_rate": 1.5009054609726547e-06, + "loss": 0.8154, + "step": 18064 + }, + { + "epoch": 0.8796533002215567, + "grad_norm": 1.5950913429260254, + "learning_rate": 1.4997067874515204e-06, + "loss": 0.8041, + "step": 18065 + }, + { + "epoch": 0.879701994010664, + "grad_norm": 1.6220256090164185, + "learning_rate": 1.4985085741268668e-06, + "loss": 0.8003, + "step": 18066 + }, + { + "epoch": 0.8797506877997712, + "grad_norm": 1.556114912033081, + "learning_rate": 1.497310821028497e-06, + "loss": 0.907, + "step": 18067 + }, + { + "epoch": 0.8797993815888784, + "grad_norm": 1.281740665435791, + "learning_rate": 1.4961135281862116e-06, + "loss": 0.7793, + "step": 18068 + }, + { + "epoch": 0.8798480753779855, + "grad_norm": 1.4313708543777466, + "learning_rate": 1.4949166956297823e-06, + "loss": 0.9033, + "step": 18069 + }, + { + "epoch": 0.8798967691670927, + "grad_norm": 1.7554125785827637, + "learning_rate": 1.4937203233889964e-06, + "loss": 0.8008, + "step": 18070 + }, + { + "epoch": 0.8799454629561999, + "grad_norm": 1.6472073793411255, + "learning_rate": 1.4925244114935966e-06, + "loss": 0.7618, + "step": 18071 + }, + { + "epoch": 0.8799941567453071, + "grad_norm": 1.2943283319473267, + "learning_rate": 1.491328959973346e-06, + "loss": 0.8484, + "step": 18072 + }, + { + "epoch": 0.8800428505344143, + "grad_norm": 1.768929362297058, + "learning_rate": 1.4901339688579697e-06, + "loss": 0.7714, + "step": 18073 + }, + { + "epoch": 0.8800915443235215, + "grad_norm": 1.6810270547866821, + "learning_rate": 1.4889394381772016e-06, + "loss": 0.8507, + "step": 18074 + }, + { + "epoch": 0.8801402381126288, + "grad_norm": 1.3271664381027222, + "learning_rate": 1.4877453679607534e-06, + "loss": 0.7575, + "step": 18075 + }, + { + "epoch": 0.880188931901736, + "grad_norm": 1.4469873905181885, + "learning_rate": 1.4865517582383237e-06, + "loss": 0.862, + "step": 18076 + }, + { + "epoch": 0.8802376256908432, + "grad_norm": 1.6748554706573486, + "learning_rate": 1.485358609039611e-06, + "loss": 0.8625, + "step": 18077 + }, + { + "epoch": 0.8802863194799503, + "grad_norm": 1.804993987083435, + "learning_rate": 1.484165920394287e-06, + "loss": 0.8198, + "step": 18078 + }, + { + "epoch": 0.8803350132690575, + "grad_norm": 1.4001007080078125, + "learning_rate": 1.4829736923320305e-06, + "loss": 0.9058, + "step": 18079 + }, + { + "epoch": 0.8803837070581647, + "grad_norm": 1.5941672325134277, + "learning_rate": 1.4817819248824838e-06, + "loss": 0.8717, + "step": 18080 + }, + { + "epoch": 0.8804324008472719, + "grad_norm": 1.30686354637146, + "learning_rate": 1.4805906180753039e-06, + "loss": 0.8009, + "step": 18081 + }, + { + "epoch": 0.8804810946363791, + "grad_norm": 1.4813429117202759, + "learning_rate": 1.4793997719401222e-06, + "loss": 0.7878, + "step": 18082 + }, + { + "epoch": 0.8805297884254863, + "grad_norm": 1.7440557479858398, + "learning_rate": 1.478209386506555e-06, + "loss": 0.9027, + "step": 18083 + }, + { + "epoch": 0.8805784822145936, + "grad_norm": 1.6300591230392456, + "learning_rate": 1.4770194618042256e-06, + "loss": 0.8082, + "step": 18084 + }, + { + "epoch": 0.8806271760037008, + "grad_norm": 1.4214754104614258, + "learning_rate": 1.475829997862719e-06, + "loss": 0.6762, + "step": 18085 + }, + { + "epoch": 0.8806758697928079, + "grad_norm": 0.1010330393910408, + "learning_rate": 1.4746409947116359e-06, + "loss": 0.6061, + "step": 18086 + }, + { + "epoch": 0.8807245635819151, + "grad_norm": 1.5546331405639648, + "learning_rate": 1.4734524523805417e-06, + "loss": 0.8396, + "step": 18087 + }, + { + "epoch": 0.8807732573710223, + "grad_norm": 1.2630367279052734, + "learning_rate": 1.4722643708990103e-06, + "loss": 0.8429, + "step": 18088 + }, + { + "epoch": 0.8808219511601295, + "grad_norm": 1.510703444480896, + "learning_rate": 1.4710767502965896e-06, + "loss": 0.7907, + "step": 18089 + }, + { + "epoch": 0.8808706449492367, + "grad_norm": 1.7812621593475342, + "learning_rate": 1.4698895906028266e-06, + "loss": 0.8383, + "step": 18090 + }, + { + "epoch": 0.8809193387383439, + "grad_norm": 1.0975871086120605, + "learning_rate": 1.468702891847249e-06, + "loss": 0.7526, + "step": 18091 + }, + { + "epoch": 0.8809680325274512, + "grad_norm": 1.5452419519424438, + "learning_rate": 1.4675166540593732e-06, + "loss": 0.8211, + "step": 18092 + }, + { + "epoch": 0.8810167263165584, + "grad_norm": 3.013510227203369, + "learning_rate": 1.466330877268718e-06, + "loss": 0.8074, + "step": 18093 + }, + { + "epoch": 0.8810654201056656, + "grad_norm": 0.09341615438461304, + "learning_rate": 1.4651455615047638e-06, + "loss": 0.5611, + "step": 18094 + }, + { + "epoch": 0.8811141138947727, + "grad_norm": 1.2928224802017212, + "learning_rate": 1.4639607067970096e-06, + "loss": 0.8467, + "step": 18095 + }, + { + "epoch": 0.8811628076838799, + "grad_norm": 0.09806736558675766, + "learning_rate": 1.4627763131749184e-06, + "loss": 0.658, + "step": 18096 + }, + { + "epoch": 0.8812115014729871, + "grad_norm": 1.8226072788238525, + "learning_rate": 1.46159238066796e-06, + "loss": 0.8515, + "step": 18097 + }, + { + "epoch": 0.8812601952620943, + "grad_norm": 1.84063720703125, + "learning_rate": 1.4604089093055795e-06, + "loss": 0.8832, + "step": 18098 + }, + { + "epoch": 0.8813088890512015, + "grad_norm": 1.3692265748977661, + "learning_rate": 1.4592258991172181e-06, + "loss": 0.8161, + "step": 18099 + }, + { + "epoch": 0.8813575828403087, + "grad_norm": 1.3912402391433716, + "learning_rate": 1.4580433501323033e-06, + "loss": 0.9081, + "step": 18100 + }, + { + "epoch": 0.881406276629416, + "grad_norm": 1.5354886054992676, + "learning_rate": 1.4568612623802514e-06, + "loss": 0.8256, + "step": 18101 + }, + { + "epoch": 0.8814549704185232, + "grad_norm": 1.595154047012329, + "learning_rate": 1.4556796358904656e-06, + "loss": 0.8399, + "step": 18102 + }, + { + "epoch": 0.8815036642076303, + "grad_norm": 1.6136375665664673, + "learning_rate": 1.4544984706923382e-06, + "loss": 0.8268, + "step": 18103 + }, + { + "epoch": 0.8815523579967375, + "grad_norm": 1.4774166345596313, + "learning_rate": 1.4533177668152542e-06, + "loss": 0.8137, + "step": 18104 + }, + { + "epoch": 0.8816010517858447, + "grad_norm": 2.792370557785034, + "learning_rate": 1.4521375242885816e-06, + "loss": 0.8561, + "step": 18105 + }, + { + "epoch": 0.8816497455749519, + "grad_norm": 1.6723273992538452, + "learning_rate": 1.4509577431416788e-06, + "loss": 0.8564, + "step": 18106 + }, + { + "epoch": 0.8816984393640591, + "grad_norm": 2.075160026550293, + "learning_rate": 1.4497784234038935e-06, + "loss": 0.7798, + "step": 18107 + }, + { + "epoch": 0.8817471331531663, + "grad_norm": 1.512872338294983, + "learning_rate": 1.4485995651045603e-06, + "loss": 0.7744, + "step": 18108 + }, + { + "epoch": 0.8817958269422735, + "grad_norm": 1.315940022468567, + "learning_rate": 1.4474211682730066e-06, + "loss": 0.8365, + "step": 18109 + }, + { + "epoch": 0.8818445207313808, + "grad_norm": 1.7937721014022827, + "learning_rate": 1.4462432329385423e-06, + "loss": 0.9143, + "step": 18110 + }, + { + "epoch": 0.881893214520488, + "grad_norm": 2.0420596599578857, + "learning_rate": 1.4450657591304663e-06, + "loss": 0.936, + "step": 18111 + }, + { + "epoch": 0.8819419083095951, + "grad_norm": 1.5663402080535889, + "learning_rate": 1.4438887468780726e-06, + "loss": 0.8367, + "step": 18112 + }, + { + "epoch": 0.8819906020987023, + "grad_norm": 1.757439374923706, + "learning_rate": 1.4427121962106405e-06, + "loss": 0.786, + "step": 18113 + }, + { + "epoch": 0.8820392958878095, + "grad_norm": 2.04262638092041, + "learning_rate": 1.441536107157433e-06, + "loss": 0.7179, + "step": 18114 + }, + { + "epoch": 0.8820879896769167, + "grad_norm": 1.5333397388458252, + "learning_rate": 1.4403604797477066e-06, + "loss": 0.8097, + "step": 18115 + }, + { + "epoch": 0.8821366834660239, + "grad_norm": 1.391111135482788, + "learning_rate": 1.4391853140107071e-06, + "loss": 0.9748, + "step": 18116 + }, + { + "epoch": 0.8821853772551311, + "grad_norm": 1.8236206769943237, + "learning_rate": 1.438010609975664e-06, + "loss": 0.7951, + "step": 18117 + }, + { + "epoch": 0.8822340710442383, + "grad_norm": 1.322818636894226, + "learning_rate": 1.4368363676717988e-06, + "loss": 0.7816, + "step": 18118 + }, + { + "epoch": 0.8822827648333456, + "grad_norm": 1.4535218477249146, + "learning_rate": 1.4356625871283236e-06, + "loss": 0.8436, + "step": 18119 + }, + { + "epoch": 0.8823314586224527, + "grad_norm": 2.2040414810180664, + "learning_rate": 1.4344892683744282e-06, + "loss": 0.8713, + "step": 18120 + }, + { + "epoch": 0.8823801524115599, + "grad_norm": 1.270887851715088, + "learning_rate": 1.4333164114393139e-06, + "loss": 0.7353, + "step": 18121 + }, + { + "epoch": 0.8824288462006671, + "grad_norm": 1.2319196462631226, + "learning_rate": 1.4321440163521394e-06, + "loss": 0.7396, + "step": 18122 + }, + { + "epoch": 0.8824775399897743, + "grad_norm": 1.7043684720993042, + "learning_rate": 1.4309720831420792e-06, + "loss": 0.7975, + "step": 18123 + }, + { + "epoch": 0.8825262337788815, + "grad_norm": 1.4651901721954346, + "learning_rate": 1.4298006118382812e-06, + "loss": 0.7475, + "step": 18124 + }, + { + "epoch": 0.8825749275679887, + "grad_norm": 2.072390079498291, + "learning_rate": 1.4286296024698864e-06, + "loss": 0.7615, + "step": 18125 + }, + { + "epoch": 0.8826236213570959, + "grad_norm": 1.4332302808761597, + "learning_rate": 1.4274590550660227e-06, + "loss": 0.8667, + "step": 18126 + }, + { + "epoch": 0.8826723151462031, + "grad_norm": 1.6406019926071167, + "learning_rate": 1.4262889696558046e-06, + "loss": 0.809, + "step": 18127 + }, + { + "epoch": 0.8827210089353102, + "grad_norm": 1.6042777299880981, + "learning_rate": 1.4251193462683487e-06, + "loss": 0.7879, + "step": 18128 + }, + { + "epoch": 0.8827697027244175, + "grad_norm": 1.4126265048980713, + "learning_rate": 1.4239501849327363e-06, + "loss": 0.8289, + "step": 18129 + }, + { + "epoch": 0.8828183965135247, + "grad_norm": 1.140778660774231, + "learning_rate": 1.422781485678064e-06, + "loss": 0.8522, + "step": 18130 + }, + { + "epoch": 0.8828670903026319, + "grad_norm": 1.5136969089508057, + "learning_rate": 1.421613248533389e-06, + "loss": 0.7776, + "step": 18131 + }, + { + "epoch": 0.8829157840917391, + "grad_norm": 1.5471484661102295, + "learning_rate": 1.420445473527783e-06, + "loss": 0.8425, + "step": 18132 + }, + { + "epoch": 0.8829644778808463, + "grad_norm": 1.9955902099609375, + "learning_rate": 1.4192781606902873e-06, + "loss": 0.7521, + "step": 18133 + }, + { + "epoch": 0.8830131716699535, + "grad_norm": 1.6591548919677734, + "learning_rate": 1.418111310049941e-06, + "loss": 0.8602, + "step": 18134 + }, + { + "epoch": 0.8830618654590607, + "grad_norm": 1.6791224479675293, + "learning_rate": 1.4169449216357744e-06, + "loss": 0.7981, + "step": 18135 + }, + { + "epoch": 0.883110559248168, + "grad_norm": 0.08957595378160477, + "learning_rate": 1.4157789954767908e-06, + "loss": 0.5777, + "step": 18136 + }, + { + "epoch": 0.883159253037275, + "grad_norm": 1.2611325979232788, + "learning_rate": 1.414613531602007e-06, + "loss": 0.8951, + "step": 18137 + }, + { + "epoch": 0.8832079468263823, + "grad_norm": 1.7998919486999512, + "learning_rate": 1.4134485300404e-06, + "loss": 0.8065, + "step": 18138 + }, + { + "epoch": 0.8832566406154895, + "grad_norm": 1.491191029548645, + "learning_rate": 1.4122839908209574e-06, + "loss": 0.8304, + "step": 18139 + }, + { + "epoch": 0.8833053344045967, + "grad_norm": 1.7420622110366821, + "learning_rate": 1.411119913972645e-06, + "loss": 0.8246, + "step": 18140 + }, + { + "epoch": 0.8833540281937039, + "grad_norm": 4.873826503753662, + "learning_rate": 1.4099562995244197e-06, + "loss": 0.7922, + "step": 18141 + }, + { + "epoch": 0.8834027219828111, + "grad_norm": 0.09821582585573196, + "learning_rate": 1.4087931475052274e-06, + "loss": 0.5763, + "step": 18142 + }, + { + "epoch": 0.8834514157719183, + "grad_norm": 2.6542611122131348, + "learning_rate": 1.407630457943996e-06, + "loss": 0.796, + "step": 18143 + }, + { + "epoch": 0.8835001095610255, + "grad_norm": 1.6517757177352905, + "learning_rate": 1.4064682308696597e-06, + "loss": 0.933, + "step": 18144 + }, + { + "epoch": 0.8835488033501326, + "grad_norm": 1.828331708908081, + "learning_rate": 1.405306466311116e-06, + "loss": 0.8045, + "step": 18145 + }, + { + "epoch": 0.8835974971392399, + "grad_norm": 1.7462294101715088, + "learning_rate": 1.4041451642972725e-06, + "loss": 0.8979, + "step": 18146 + }, + { + "epoch": 0.8836461909283471, + "grad_norm": 1.4335782527923584, + "learning_rate": 1.4029843248570108e-06, + "loss": 0.8347, + "step": 18147 + }, + { + "epoch": 0.8836948847174543, + "grad_norm": 1.360898494720459, + "learning_rate": 1.4018239480192098e-06, + "loss": 0.8028, + "step": 18148 + }, + { + "epoch": 0.8837435785065615, + "grad_norm": 1.580103874206543, + "learning_rate": 1.4006640338127354e-06, + "loss": 0.8224, + "step": 18149 + }, + { + "epoch": 0.8837922722956687, + "grad_norm": 2.064891815185547, + "learning_rate": 1.39950458226644e-06, + "loss": 0.7523, + "step": 18150 + }, + { + "epoch": 0.8838409660847759, + "grad_norm": 1.4270048141479492, + "learning_rate": 1.3983455934091628e-06, + "loss": 0.8602, + "step": 18151 + }, + { + "epoch": 0.8838896598738831, + "grad_norm": 1.7584110498428345, + "learning_rate": 1.3971870672697319e-06, + "loss": 0.7818, + "step": 18152 + }, + { + "epoch": 0.8839383536629903, + "grad_norm": 1.2539598941802979, + "learning_rate": 1.3960290038769753e-06, + "loss": 0.8758, + "step": 18153 + }, + { + "epoch": 0.8839870474520974, + "grad_norm": 1.567613959312439, + "learning_rate": 1.3948714032596856e-06, + "loss": 0.8275, + "step": 18154 + }, + { + "epoch": 0.8840357412412047, + "grad_norm": 1.6454733610153198, + "learning_rate": 1.3937142654466706e-06, + "loss": 0.8556, + "step": 18155 + }, + { + "epoch": 0.8840844350303119, + "grad_norm": 1.4703861474990845, + "learning_rate": 1.39255759046671e-06, + "loss": 0.821, + "step": 18156 + }, + { + "epoch": 0.8841331288194191, + "grad_norm": 1.1669563055038452, + "learning_rate": 1.3914013783485757e-06, + "loss": 0.7315, + "step": 18157 + }, + { + "epoch": 0.8841818226085263, + "grad_norm": 0.11218859255313873, + "learning_rate": 1.3902456291210299e-06, + "loss": 0.6106, + "step": 18158 + }, + { + "epoch": 0.8842305163976335, + "grad_norm": 3.3421225547790527, + "learning_rate": 1.3890903428128178e-06, + "loss": 0.8788, + "step": 18159 + }, + { + "epoch": 0.8842792101867407, + "grad_norm": 1.3259966373443604, + "learning_rate": 1.3879355194526833e-06, + "loss": 0.796, + "step": 18160 + }, + { + "epoch": 0.8843279039758479, + "grad_norm": 1.2890572547912598, + "learning_rate": 1.386781159069348e-06, + "loss": 0.8639, + "step": 18161 + }, + { + "epoch": 0.884376597764955, + "grad_norm": 1.3182202577590942, + "learning_rate": 1.3856272616915268e-06, + "loss": 0.7975, + "step": 18162 + }, + { + "epoch": 0.8844252915540622, + "grad_norm": 1.4830868244171143, + "learning_rate": 1.3844738273479275e-06, + "loss": 0.8167, + "step": 18163 + }, + { + "epoch": 0.8844739853431695, + "grad_norm": 1.6449228525161743, + "learning_rate": 1.3833208560672383e-06, + "loss": 0.818, + "step": 18164 + }, + { + "epoch": 0.8845226791322767, + "grad_norm": 1.3981095552444458, + "learning_rate": 1.3821683478781389e-06, + "loss": 0.8733, + "step": 18165 + }, + { + "epoch": 0.8845713729213839, + "grad_norm": 1.432329535484314, + "learning_rate": 1.3810163028093015e-06, + "loss": 0.8413, + "step": 18166 + }, + { + "epoch": 0.8846200667104911, + "grad_norm": 1.3082785606384277, + "learning_rate": 1.3798647208893813e-06, + "loss": 0.8072, + "step": 18167 + }, + { + "epoch": 0.8846687604995983, + "grad_norm": 1.8057698011398315, + "learning_rate": 1.3787136021470214e-06, + "loss": 0.8856, + "step": 18168 + }, + { + "epoch": 0.8847174542887055, + "grad_norm": 1.8947432041168213, + "learning_rate": 1.3775629466108575e-06, + "loss": 0.757, + "step": 18169 + }, + { + "epoch": 0.8847661480778126, + "grad_norm": 1.492681622505188, + "learning_rate": 1.3764127543095196e-06, + "loss": 0.8298, + "step": 18170 + }, + { + "epoch": 0.8848148418669198, + "grad_norm": 3.020841121673584, + "learning_rate": 1.3752630252716047e-06, + "loss": 0.8188, + "step": 18171 + }, + { + "epoch": 0.884863535656027, + "grad_norm": 1.5823894739151, + "learning_rate": 1.3741137595257236e-06, + "loss": 0.9471, + "step": 18172 + }, + { + "epoch": 0.8849122294451343, + "grad_norm": 1.1629250049591064, + "learning_rate": 1.3729649571004622e-06, + "loss": 0.8047, + "step": 18173 + }, + { + "epoch": 0.8849609232342415, + "grad_norm": 1.923032283782959, + "learning_rate": 1.3718166180243953e-06, + "loss": 0.8681, + "step": 18174 + }, + { + "epoch": 0.8850096170233487, + "grad_norm": 1.6354185342788696, + "learning_rate": 1.370668742326089e-06, + "loss": 0.7963, + "step": 18175 + }, + { + "epoch": 0.8850583108124559, + "grad_norm": 2.0782787799835205, + "learning_rate": 1.3695213300340959e-06, + "loss": 0.8163, + "step": 18176 + }, + { + "epoch": 0.8851070046015631, + "grad_norm": 2.1757118701934814, + "learning_rate": 1.368374381176958e-06, + "loss": 0.8628, + "step": 18177 + }, + { + "epoch": 0.8851556983906703, + "grad_norm": 1.608736276626587, + "learning_rate": 1.367227895783203e-06, + "loss": 0.7904, + "step": 18178 + }, + { + "epoch": 0.8852043921797774, + "grad_norm": 1.7506685256958008, + "learning_rate": 1.3660818738813597e-06, + "loss": 0.8281, + "step": 18179 + }, + { + "epoch": 0.8852530859688846, + "grad_norm": 1.7647663354873657, + "learning_rate": 1.3649363154999228e-06, + "loss": 0.7421, + "step": 18180 + }, + { + "epoch": 0.8853017797579918, + "grad_norm": 2.702244281768799, + "learning_rate": 1.3637912206673964e-06, + "loss": 0.8061, + "step": 18181 + }, + { + "epoch": 0.8853504735470991, + "grad_norm": 1.723414421081543, + "learning_rate": 1.362646589412262e-06, + "loss": 0.8454, + "step": 18182 + }, + { + "epoch": 0.8853991673362063, + "grad_norm": 3.546602964401245, + "learning_rate": 1.3615024217629946e-06, + "loss": 0.8031, + "step": 18183 + }, + { + "epoch": 0.8854478611253135, + "grad_norm": 1.2502177953720093, + "learning_rate": 1.3603587177480514e-06, + "loss": 0.7223, + "step": 18184 + }, + { + "epoch": 0.8854965549144207, + "grad_norm": 1.80810546875, + "learning_rate": 1.3592154773958833e-06, + "loss": 0.799, + "step": 18185 + }, + { + "epoch": 0.8855452487035279, + "grad_norm": 1.5016225576400757, + "learning_rate": 1.358072700734936e-06, + "loss": 0.8282, + "step": 18186 + }, + { + "epoch": 0.885593942492635, + "grad_norm": 1.4910237789154053, + "learning_rate": 1.3569303877936247e-06, + "loss": 0.8454, + "step": 18187 + }, + { + "epoch": 0.8856426362817422, + "grad_norm": 3.4070165157318115, + "learning_rate": 1.3557885386003756e-06, + "loss": 0.7702, + "step": 18188 + }, + { + "epoch": 0.8856913300708494, + "grad_norm": 1.6943798065185547, + "learning_rate": 1.3546471531835791e-06, + "loss": 0.7959, + "step": 18189 + }, + { + "epoch": 0.8857400238599566, + "grad_norm": 1.5403730869293213, + "learning_rate": 1.3535062315716374e-06, + "loss": 0.7787, + "step": 18190 + }, + { + "epoch": 0.8857887176490639, + "grad_norm": 2.1610188484191895, + "learning_rate": 1.3523657737929296e-06, + "loss": 0.8095, + "step": 18191 + }, + { + "epoch": 0.8858374114381711, + "grad_norm": 1.8029212951660156, + "learning_rate": 1.3512257798758199e-06, + "loss": 0.8055, + "step": 18192 + }, + { + "epoch": 0.8858861052272783, + "grad_norm": 1.5925898551940918, + "learning_rate": 1.3500862498486766e-06, + "loss": 0.7891, + "step": 18193 + }, + { + "epoch": 0.8859347990163855, + "grad_norm": 1.5021955966949463, + "learning_rate": 1.3489471837398304e-06, + "loss": 0.8488, + "step": 18194 + }, + { + "epoch": 0.8859834928054927, + "grad_norm": 0.09235314279794693, + "learning_rate": 1.3478085815776298e-06, + "loss": 0.568, + "step": 18195 + }, + { + "epoch": 0.8860321865945998, + "grad_norm": 1.8819870948791504, + "learning_rate": 1.3466704433903854e-06, + "loss": 0.83, + "step": 18196 + }, + { + "epoch": 0.886080880383707, + "grad_norm": 1.4608904123306274, + "learning_rate": 1.345532769206419e-06, + "loss": 0.7657, + "step": 18197 + }, + { + "epoch": 0.8861295741728142, + "grad_norm": 1.503463625907898, + "learning_rate": 1.3443955590540236e-06, + "loss": 0.7454, + "step": 18198 + }, + { + "epoch": 0.8861782679619215, + "grad_norm": 1.8667839765548706, + "learning_rate": 1.3432588129614898e-06, + "loss": 0.9092, + "step": 18199 + }, + { + "epoch": 0.8862269617510287, + "grad_norm": 1.4849977493286133, + "learning_rate": 1.3421225309570952e-06, + "loss": 0.7968, + "step": 18200 + }, + { + "epoch": 0.8862756555401359, + "grad_norm": 1.2332066297531128, + "learning_rate": 1.3409867130691013e-06, + "loss": 0.784, + "step": 18201 + }, + { + "epoch": 0.8863243493292431, + "grad_norm": 1.404525637626648, + "learning_rate": 1.33985135932577e-06, + "loss": 0.8511, + "step": 18202 + }, + { + "epoch": 0.8863730431183503, + "grad_norm": 1.340203881263733, + "learning_rate": 1.33871646975533e-06, + "loss": 0.8107, + "step": 18203 + }, + { + "epoch": 0.8864217369074574, + "grad_norm": 1.5389769077301025, + "learning_rate": 1.3375820443860276e-06, + "loss": 0.7923, + "step": 18204 + }, + { + "epoch": 0.8864704306965646, + "grad_norm": 2.391425132751465, + "learning_rate": 1.3364480832460646e-06, + "loss": 0.7614, + "step": 18205 + }, + { + "epoch": 0.8865191244856718, + "grad_norm": 2.2472546100616455, + "learning_rate": 1.335314586363663e-06, + "loss": 0.8232, + "step": 18206 + }, + { + "epoch": 0.886567818274779, + "grad_norm": 1.3012527227401733, + "learning_rate": 1.3341815537670112e-06, + "loss": 0.7722, + "step": 18207 + }, + { + "epoch": 0.8866165120638863, + "grad_norm": 1.5290865898132324, + "learning_rate": 1.3330489854842954e-06, + "loss": 0.8421, + "step": 18208 + }, + { + "epoch": 0.8866652058529935, + "grad_norm": 1.5724828243255615, + "learning_rate": 1.3319168815436868e-06, + "loss": 0.765, + "step": 18209 + }, + { + "epoch": 0.8867138996421007, + "grad_norm": 1.4066303968429565, + "learning_rate": 1.330785241973349e-06, + "loss": 0.7899, + "step": 18210 + }, + { + "epoch": 0.8867625934312079, + "grad_norm": 1.9074372053146362, + "learning_rate": 1.3296540668014312e-06, + "loss": 0.7925, + "step": 18211 + }, + { + "epoch": 0.8868112872203151, + "grad_norm": 1.565622091293335, + "learning_rate": 1.3285233560560663e-06, + "loss": 0.7967, + "step": 18212 + }, + { + "epoch": 0.8868599810094222, + "grad_norm": 1.9346855878829956, + "learning_rate": 1.3273931097653892e-06, + "loss": 0.7414, + "step": 18213 + }, + { + "epoch": 0.8869086747985294, + "grad_norm": 1.895526647567749, + "learning_rate": 1.326263327957511e-06, + "loss": 0.816, + "step": 18214 + }, + { + "epoch": 0.8869573685876366, + "grad_norm": 1.8419363498687744, + "learning_rate": 1.325134010660536e-06, + "loss": 0.7949, + "step": 18215 + }, + { + "epoch": 0.8870060623767438, + "grad_norm": 1.4671436548233032, + "learning_rate": 1.3240051579025548e-06, + "loss": 0.7199, + "step": 18216 + }, + { + "epoch": 0.887054756165851, + "grad_norm": 2.0292348861694336, + "learning_rate": 1.3228767697116474e-06, + "loss": 0.831, + "step": 18217 + }, + { + "epoch": 0.8871034499549583, + "grad_norm": 1.3699181079864502, + "learning_rate": 1.3217488461158844e-06, + "loss": 0.709, + "step": 18218 + }, + { + "epoch": 0.8871521437440655, + "grad_norm": 1.5546584129333496, + "learning_rate": 1.3206213871433193e-06, + "loss": 0.8312, + "step": 18219 + }, + { + "epoch": 0.8872008375331727, + "grad_norm": 1.4188727140426636, + "learning_rate": 1.3194943928220006e-06, + "loss": 0.8736, + "step": 18220 + }, + { + "epoch": 0.8872495313222798, + "grad_norm": 0.10213901847600937, + "learning_rate": 1.3183678631799635e-06, + "loss": 0.5818, + "step": 18221 + }, + { + "epoch": 0.887298225111387, + "grad_norm": 1.5565550327301025, + "learning_rate": 1.317241798245228e-06, + "loss": 0.824, + "step": 18222 + }, + { + "epoch": 0.8873469189004942, + "grad_norm": 1.494299054145813, + "learning_rate": 1.316116198045807e-06, + "loss": 0.8156, + "step": 18223 + }, + { + "epoch": 0.8873956126896014, + "grad_norm": 3.0622146129608154, + "learning_rate": 1.3149910626097008e-06, + "loss": 0.9076, + "step": 18224 + }, + { + "epoch": 0.8874443064787086, + "grad_norm": 1.3283053636550903, + "learning_rate": 1.3138663919648931e-06, + "loss": 0.7745, + "step": 18225 + }, + { + "epoch": 0.8874930002678159, + "grad_norm": 1.6037750244140625, + "learning_rate": 1.3127421861393641e-06, + "loss": 0.8091, + "step": 18226 + }, + { + "epoch": 0.8875416940569231, + "grad_norm": 0.10206739604473114, + "learning_rate": 1.3116184451610758e-06, + "loss": 0.591, + "step": 18227 + }, + { + "epoch": 0.8875903878460303, + "grad_norm": 1.1524431705474854, + "learning_rate": 1.3104951690579838e-06, + "loss": 0.8541, + "step": 18228 + }, + { + "epoch": 0.8876390816351374, + "grad_norm": 1.4502193927764893, + "learning_rate": 1.3093723578580252e-06, + "loss": 0.7926, + "step": 18229 + }, + { + "epoch": 0.8876877754242446, + "grad_norm": 2.4633402824401855, + "learning_rate": 1.3082500115891405e-06, + "loss": 0.6976, + "step": 18230 + }, + { + "epoch": 0.8877364692133518, + "grad_norm": 1.2806023359298706, + "learning_rate": 1.3071281302792339e-06, + "loss": 0.8385, + "step": 18231 + }, + { + "epoch": 0.887785163002459, + "grad_norm": 1.324896216392517, + "learning_rate": 1.3060067139562227e-06, + "loss": 0.7531, + "step": 18232 + }, + { + "epoch": 0.8878338567915662, + "grad_norm": 2.450906753540039, + "learning_rate": 1.3048857626479982e-06, + "loss": 0.8444, + "step": 18233 + }, + { + "epoch": 0.8878825505806734, + "grad_norm": 1.8441118001937866, + "learning_rate": 1.3037652763824448e-06, + "loss": 0.8382, + "step": 18234 + }, + { + "epoch": 0.8879312443697807, + "grad_norm": 1.8284335136413574, + "learning_rate": 1.3026452551874359e-06, + "loss": 0.7541, + "step": 18235 + }, + { + "epoch": 0.8879799381588879, + "grad_norm": 1.6840208768844604, + "learning_rate": 1.3015256990908264e-06, + "loss": 0.8247, + "step": 18236 + }, + { + "epoch": 0.8880286319479951, + "grad_norm": 1.5681568384170532, + "learning_rate": 1.300406608120477e-06, + "loss": 0.8931, + "step": 18237 + }, + { + "epoch": 0.8880773257371022, + "grad_norm": 1.3053816556930542, + "learning_rate": 1.2992879823042115e-06, + "loss": 0.7655, + "step": 18238 + }, + { + "epoch": 0.8881260195262094, + "grad_norm": 1.2872010469436646, + "learning_rate": 1.29816982166987e-06, + "loss": 0.7748, + "step": 18239 + }, + { + "epoch": 0.8881747133153166, + "grad_norm": 1.7378730773925781, + "learning_rate": 1.297052126245253e-06, + "loss": 0.7798, + "step": 18240 + }, + { + "epoch": 0.8882234071044238, + "grad_norm": 1.5138746500015259, + "learning_rate": 1.295934896058173e-06, + "loss": 0.8059, + "step": 18241 + }, + { + "epoch": 0.888272100893531, + "grad_norm": 1.5651026964187622, + "learning_rate": 1.294818131136417e-06, + "loss": 0.7247, + "step": 18242 + }, + { + "epoch": 0.8883207946826382, + "grad_norm": 1.85735285282135, + "learning_rate": 1.293701831507761e-06, + "loss": 0.7092, + "step": 18243 + }, + { + "epoch": 0.8883694884717455, + "grad_norm": 1.8489938974380493, + "learning_rate": 1.2925859971999866e-06, + "loss": 0.8394, + "step": 18244 + }, + { + "epoch": 0.8884181822608527, + "grad_norm": 3.0630297660827637, + "learning_rate": 1.2914706282408318e-06, + "loss": 0.7938, + "step": 18245 + }, + { + "epoch": 0.8884668760499598, + "grad_norm": 7.743725776672363, + "learning_rate": 1.290355724658059e-06, + "loss": 0.8103, + "step": 18246 + }, + { + "epoch": 0.888515569839067, + "grad_norm": 1.6947083473205566, + "learning_rate": 1.289241286479388e-06, + "loss": 0.8424, + "step": 18247 + }, + { + "epoch": 0.8885642636281742, + "grad_norm": 1.3924272060394287, + "learning_rate": 1.2881273137325478e-06, + "loss": 0.7458, + "step": 18248 + }, + { + "epoch": 0.8886129574172814, + "grad_norm": 1.9981272220611572, + "learning_rate": 1.2870138064452476e-06, + "loss": 0.7711, + "step": 18249 + }, + { + "epoch": 0.8886616512063886, + "grad_norm": 1.492876410484314, + "learning_rate": 1.2859007646451848e-06, + "loss": 0.9131, + "step": 18250 + }, + { + "epoch": 0.8887103449954958, + "grad_norm": 1.7558485269546509, + "learning_rate": 1.2847881883600465e-06, + "loss": 0.888, + "step": 18251 + }, + { + "epoch": 0.888759038784603, + "grad_norm": 1.4278247356414795, + "learning_rate": 1.283676077617504e-06, + "loss": 0.7893, + "step": 18252 + }, + { + "epoch": 0.8888077325737103, + "grad_norm": 1.4604591131210327, + "learning_rate": 1.2825644324452346e-06, + "loss": 0.8898, + "step": 18253 + }, + { + "epoch": 0.8888564263628175, + "grad_norm": 1.3162864446640015, + "learning_rate": 1.2814532528708722e-06, + "loss": 0.7711, + "step": 18254 + }, + { + "epoch": 0.8889051201519246, + "grad_norm": 1.287796139717102, + "learning_rate": 1.2803425389220748e-06, + "loss": 0.8632, + "step": 18255 + }, + { + "epoch": 0.8889538139410318, + "grad_norm": 1.5253247022628784, + "learning_rate": 1.2792322906264575e-06, + "loss": 0.7863, + "step": 18256 + }, + { + "epoch": 0.889002507730139, + "grad_norm": 1.9524624347686768, + "learning_rate": 1.2781225080116478e-06, + "loss": 0.8661, + "step": 18257 + }, + { + "epoch": 0.8890512015192462, + "grad_norm": 1.4319690465927124, + "learning_rate": 1.2770131911052452e-06, + "loss": 0.7721, + "step": 18258 + }, + { + "epoch": 0.8890998953083534, + "grad_norm": 1.809436559677124, + "learning_rate": 1.2759043399348481e-06, + "loss": 0.8168, + "step": 18259 + }, + { + "epoch": 0.8891485890974606, + "grad_norm": 1.5386980772018433, + "learning_rate": 1.2747959545280386e-06, + "loss": 0.8642, + "step": 18260 + }, + { + "epoch": 0.8891972828865679, + "grad_norm": 1.5609285831451416, + "learning_rate": 1.2736880349123836e-06, + "loss": 0.7993, + "step": 18261 + }, + { + "epoch": 0.8892459766756751, + "grad_norm": 1.4942350387573242, + "learning_rate": 1.27258058111545e-06, + "loss": 0.8373, + "step": 18262 + }, + { + "epoch": 0.8892946704647822, + "grad_norm": 2.152496814727783, + "learning_rate": 1.2714735931647782e-06, + "loss": 0.855, + "step": 18263 + }, + { + "epoch": 0.8893433642538894, + "grad_norm": 1.5711027383804321, + "learning_rate": 1.2703670710879102e-06, + "loss": 0.8017, + "step": 18264 + }, + { + "epoch": 0.8893920580429966, + "grad_norm": 2.1978955268859863, + "learning_rate": 1.2692610149123708e-06, + "loss": 0.9091, + "step": 18265 + }, + { + "epoch": 0.8894407518321038, + "grad_norm": 1.6135176420211792, + "learning_rate": 1.268155424665669e-06, + "loss": 0.7863, + "step": 18266 + }, + { + "epoch": 0.889489445621211, + "grad_norm": 1.5888793468475342, + "learning_rate": 1.2670503003753097e-06, + "loss": 0.8713, + "step": 18267 + }, + { + "epoch": 0.8895381394103182, + "grad_norm": 4.267625331878662, + "learning_rate": 1.2659456420687844e-06, + "loss": 0.7619, + "step": 18268 + }, + { + "epoch": 0.8895868331994254, + "grad_norm": 1.6346769332885742, + "learning_rate": 1.2648414497735662e-06, + "loss": 0.8199, + "step": 18269 + }, + { + "epoch": 0.8896355269885327, + "grad_norm": 1.3106602430343628, + "learning_rate": 1.2637377235171265e-06, + "loss": 0.8431, + "step": 18270 + }, + { + "epoch": 0.8896842207776399, + "grad_norm": 1.3868823051452637, + "learning_rate": 1.2626344633269172e-06, + "loss": 0.8644, + "step": 18271 + }, + { + "epoch": 0.889732914566747, + "grad_norm": 2.8012990951538086, + "learning_rate": 1.2615316692303848e-06, + "loss": 0.8689, + "step": 18272 + }, + { + "epoch": 0.8897816083558542, + "grad_norm": 1.6865270137786865, + "learning_rate": 1.260429341254963e-06, + "loss": 0.864, + "step": 18273 + }, + { + "epoch": 0.8898303021449614, + "grad_norm": 1.5051642656326294, + "learning_rate": 1.2593274794280674e-06, + "loss": 0.8683, + "step": 18274 + }, + { + "epoch": 0.8898789959340686, + "grad_norm": 1.79032301902771, + "learning_rate": 1.258226083777112e-06, + "loss": 0.7406, + "step": 18275 + }, + { + "epoch": 0.8899276897231758, + "grad_norm": 2.0428223609924316, + "learning_rate": 1.2571251543294905e-06, + "loss": 0.8942, + "step": 18276 + }, + { + "epoch": 0.889976383512283, + "grad_norm": 1.4850587844848633, + "learning_rate": 1.2560246911125895e-06, + "loss": 0.744, + "step": 18277 + }, + { + "epoch": 0.8900250773013902, + "grad_norm": 1.9368175268173218, + "learning_rate": 1.2549246941537808e-06, + "loss": 0.8323, + "step": 18278 + }, + { + "epoch": 0.8900737710904975, + "grad_norm": 1.5204678773880005, + "learning_rate": 1.2538251634804334e-06, + "loss": 0.8225, + "step": 18279 + }, + { + "epoch": 0.8901224648796046, + "grad_norm": 2.540412187576294, + "learning_rate": 1.2527260991198897e-06, + "loss": 0.8596, + "step": 18280 + }, + { + "epoch": 0.8901711586687118, + "grad_norm": 1.3666622638702393, + "learning_rate": 1.251627501099497e-06, + "loss": 0.8642, + "step": 18281 + }, + { + "epoch": 0.890219852457819, + "grad_norm": 1.3659098148345947, + "learning_rate": 1.2505293694465804e-06, + "loss": 0.8963, + "step": 18282 + }, + { + "epoch": 0.8902685462469262, + "grad_norm": 1.7710297107696533, + "learning_rate": 1.2494317041884552e-06, + "loss": 0.7645, + "step": 18283 + }, + { + "epoch": 0.8903172400360334, + "grad_norm": 1.4351786375045776, + "learning_rate": 1.2483345053524242e-06, + "loss": 0.7882, + "step": 18284 + }, + { + "epoch": 0.8903659338251406, + "grad_norm": 1.7584898471832275, + "learning_rate": 1.2472377729657837e-06, + "loss": 0.8076, + "step": 18285 + }, + { + "epoch": 0.8904146276142478, + "grad_norm": 1.2014073133468628, + "learning_rate": 1.2461415070558113e-06, + "loss": 0.8188, + "step": 18286 + }, + { + "epoch": 0.890463321403355, + "grad_norm": 1.4859648942947388, + "learning_rate": 1.2450457076497768e-06, + "loss": 0.843, + "step": 18287 + }, + { + "epoch": 0.8905120151924621, + "grad_norm": 2.6970107555389404, + "learning_rate": 1.2439503747749471e-06, + "loss": 0.7958, + "step": 18288 + }, + { + "epoch": 0.8905607089815694, + "grad_norm": 2.1762096881866455, + "learning_rate": 1.2428555084585559e-06, + "loss": 0.866, + "step": 18289 + }, + { + "epoch": 0.8906094027706766, + "grad_norm": 2.3508455753326416, + "learning_rate": 1.2417611087278457e-06, + "loss": 0.8473, + "step": 18290 + }, + { + "epoch": 0.8906580965597838, + "grad_norm": 1.4074941873550415, + "learning_rate": 1.2406671756100398e-06, + "loss": 0.7798, + "step": 18291 + }, + { + "epoch": 0.890706790348891, + "grad_norm": 2.015049457550049, + "learning_rate": 1.2395737091323489e-06, + "loss": 0.8436, + "step": 18292 + }, + { + "epoch": 0.8907554841379982, + "grad_norm": 0.09764742106199265, + "learning_rate": 1.238480709321972e-06, + "loss": 0.5728, + "step": 18293 + }, + { + "epoch": 0.8908041779271054, + "grad_norm": 1.8103142976760864, + "learning_rate": 1.2373881762060936e-06, + "loss": 0.801, + "step": 18294 + }, + { + "epoch": 0.8908528717162126, + "grad_norm": 1.4857802391052246, + "learning_rate": 1.2362961098119009e-06, + "loss": 0.8341, + "step": 18295 + }, + { + "epoch": 0.8909015655053198, + "grad_norm": 1.481896162033081, + "learning_rate": 1.2352045101665499e-06, + "loss": 0.7257, + "step": 18296 + }, + { + "epoch": 0.890950259294427, + "grad_norm": 2.052340030670166, + "learning_rate": 1.2341133772972013e-06, + "loss": 0.8259, + "step": 18297 + }, + { + "epoch": 0.8909989530835342, + "grad_norm": 1.1752777099609375, + "learning_rate": 1.2330227112309888e-06, + "loss": 0.7415, + "step": 18298 + }, + { + "epoch": 0.8910476468726414, + "grad_norm": 1.526946783065796, + "learning_rate": 1.2319325119950487e-06, + "loss": 0.7894, + "step": 18299 + }, + { + "epoch": 0.8910963406617486, + "grad_norm": 1.5158865451812744, + "learning_rate": 1.230842779616499e-06, + "loss": 0.6705, + "step": 18300 + }, + { + "epoch": 0.8911450344508558, + "grad_norm": 1.5513814687728882, + "learning_rate": 1.2297535141224448e-06, + "loss": 0.8059, + "step": 18301 + }, + { + "epoch": 0.891193728239963, + "grad_norm": 2.222818374633789, + "learning_rate": 1.2286647155399888e-06, + "loss": 0.7255, + "step": 18302 + }, + { + "epoch": 0.8912424220290702, + "grad_norm": 1.3179525136947632, + "learning_rate": 1.2275763838962029e-06, + "loss": 0.7965, + "step": 18303 + }, + { + "epoch": 0.8912911158181774, + "grad_norm": 1.636906623840332, + "learning_rate": 1.226488519218174e-06, + "loss": 0.8239, + "step": 18304 + }, + { + "epoch": 0.8913398096072845, + "grad_norm": 2.020521640777588, + "learning_rate": 1.2254011215329475e-06, + "loss": 0.8087, + "step": 18305 + }, + { + "epoch": 0.8913885033963918, + "grad_norm": 1.74432373046875, + "learning_rate": 1.2243141908675814e-06, + "loss": 0.7847, + "step": 18306 + }, + { + "epoch": 0.891437197185499, + "grad_norm": 2.9795773029327393, + "learning_rate": 1.2232277272491144e-06, + "loss": 0.9295, + "step": 18307 + }, + { + "epoch": 0.8914858909746062, + "grad_norm": 1.451079249382019, + "learning_rate": 1.222141730704567e-06, + "loss": 0.8707, + "step": 18308 + }, + { + "epoch": 0.8915345847637134, + "grad_norm": 1.3005037307739258, + "learning_rate": 1.2210562012609572e-06, + "loss": 0.6582, + "step": 18309 + }, + { + "epoch": 0.8915832785528206, + "grad_norm": 1.639013409614563, + "learning_rate": 1.219971138945284e-06, + "loss": 0.8832, + "step": 18310 + }, + { + "epoch": 0.8916319723419278, + "grad_norm": 3.05073881149292, + "learning_rate": 1.2188865437845476e-06, + "loss": 0.7657, + "step": 18311 + }, + { + "epoch": 0.891680666131035, + "grad_norm": 1.3755601644515991, + "learning_rate": 1.2178024158057155e-06, + "loss": 0.7632, + "step": 18312 + }, + { + "epoch": 0.8917293599201422, + "grad_norm": 1.4517745971679688, + "learning_rate": 1.216718755035764e-06, + "loss": 0.775, + "step": 18313 + }, + { + "epoch": 0.8917780537092493, + "grad_norm": 0.09723810106515884, + "learning_rate": 1.2156355615016423e-06, + "loss": 0.5917, + "step": 18314 + }, + { + "epoch": 0.8918267474983566, + "grad_norm": 2.0383236408233643, + "learning_rate": 1.2145528352303005e-06, + "loss": 0.7197, + "step": 18315 + }, + { + "epoch": 0.8918754412874638, + "grad_norm": 1.4298278093338013, + "learning_rate": 1.21347057624867e-06, + "loss": 0.8593, + "step": 18316 + }, + { + "epoch": 0.891924135076571, + "grad_norm": 1.2689340114593506, + "learning_rate": 1.2123887845836713e-06, + "loss": 0.8216, + "step": 18317 + }, + { + "epoch": 0.8919728288656782, + "grad_norm": 1.866870403289795, + "learning_rate": 1.2113074602622165e-06, + "loss": 0.7674, + "step": 18318 + }, + { + "epoch": 0.8920215226547854, + "grad_norm": 2.2934250831604004, + "learning_rate": 1.2102266033111998e-06, + "loss": 0.7813, + "step": 18319 + }, + { + "epoch": 0.8920702164438926, + "grad_norm": 1.5013610124588013, + "learning_rate": 1.2091462137575105e-06, + "loss": 0.8056, + "step": 18320 + }, + { + "epoch": 0.8921189102329998, + "grad_norm": 1.7246098518371582, + "learning_rate": 1.2080662916280183e-06, + "loss": 0.9037, + "step": 18321 + }, + { + "epoch": 0.8921676040221069, + "grad_norm": 5.781087875366211, + "learning_rate": 1.2069868369495952e-06, + "loss": 0.8169, + "step": 18322 + }, + { + "epoch": 0.8922162978112141, + "grad_norm": 1.40639066696167, + "learning_rate": 1.2059078497490862e-06, + "loss": 0.781, + "step": 18323 + }, + { + "epoch": 0.8922649916003214, + "grad_norm": 1.719556450843811, + "learning_rate": 1.2048293300533343e-06, + "loss": 0.7812, + "step": 18324 + }, + { + "epoch": 0.8923136853894286, + "grad_norm": 1.4197403192520142, + "learning_rate": 1.203751277889167e-06, + "loss": 0.7797, + "step": 18325 + }, + { + "epoch": 0.8923623791785358, + "grad_norm": 4.1268229484558105, + "learning_rate": 1.2026736932833983e-06, + "loss": 0.8304, + "step": 18326 + }, + { + "epoch": 0.892411072967643, + "grad_norm": 2.551119089126587, + "learning_rate": 1.2015965762628379e-06, + "loss": 0.8557, + "step": 18327 + }, + { + "epoch": 0.8924597667567502, + "grad_norm": 1.1964396238327026, + "learning_rate": 1.2005199268542756e-06, + "loss": 0.8762, + "step": 18328 + }, + { + "epoch": 0.8925084605458574, + "grad_norm": 2.4243695735931396, + "learning_rate": 1.1994437450844897e-06, + "loss": 0.7946, + "step": 18329 + }, + { + "epoch": 0.8925571543349645, + "grad_norm": 1.5505516529083252, + "learning_rate": 1.1983680309802593e-06, + "loss": 0.7774, + "step": 18330 + }, + { + "epoch": 0.8926058481240717, + "grad_norm": 1.4667214155197144, + "learning_rate": 1.197292784568338e-06, + "loss": 0.8064, + "step": 18331 + }, + { + "epoch": 0.892654541913179, + "grad_norm": 2.1086182594299316, + "learning_rate": 1.1962180058754736e-06, + "loss": 0.75, + "step": 18332 + }, + { + "epoch": 0.8927032357022862, + "grad_norm": 1.521160364151001, + "learning_rate": 1.1951436949284e-06, + "loss": 0.8186, + "step": 18333 + }, + { + "epoch": 0.8927519294913934, + "grad_norm": 1.3451581001281738, + "learning_rate": 1.1940698517538406e-06, + "loss": 0.8788, + "step": 18334 + }, + { + "epoch": 0.8928006232805006, + "grad_norm": 1.6946593523025513, + "learning_rate": 1.192996476378512e-06, + "loss": 0.8837, + "step": 18335 + }, + { + "epoch": 0.8928493170696078, + "grad_norm": 1.8313720226287842, + "learning_rate": 1.1919235688291075e-06, + "loss": 0.7563, + "step": 18336 + }, + { + "epoch": 0.892898010858715, + "grad_norm": 1.2566113471984863, + "learning_rate": 1.1908511291323199e-06, + "loss": 0.7924, + "step": 18337 + }, + { + "epoch": 0.8929467046478222, + "grad_norm": 1.4887595176696777, + "learning_rate": 1.1897791573148209e-06, + "loss": 0.8609, + "step": 18338 + }, + { + "epoch": 0.8929953984369293, + "grad_norm": 1.562363624572754, + "learning_rate": 1.1887076534032894e-06, + "loss": 0.8054, + "step": 18339 + }, + { + "epoch": 0.8930440922260365, + "grad_norm": 1.650117039680481, + "learning_rate": 1.1876366174243614e-06, + "loss": 0.8532, + "step": 18340 + }, + { + "epoch": 0.8930927860151437, + "grad_norm": 1.6541991233825684, + "learning_rate": 1.1865660494046937e-06, + "loss": 0.8738, + "step": 18341 + }, + { + "epoch": 0.893141479804251, + "grad_norm": 0.09318123012781143, + "learning_rate": 1.1854959493709095e-06, + "loss": 0.6356, + "step": 18342 + }, + { + "epoch": 0.8931901735933582, + "grad_norm": 1.1341955661773682, + "learning_rate": 1.1844263173496274e-06, + "loss": 0.7418, + "step": 18343 + }, + { + "epoch": 0.8932388673824654, + "grad_norm": 1.3504016399383545, + "learning_rate": 1.1833571533674593e-06, + "loss": 0.7962, + "step": 18344 + }, + { + "epoch": 0.8932875611715726, + "grad_norm": 2.360368490219116, + "learning_rate": 1.1822884574509907e-06, + "loss": 0.7912, + "step": 18345 + }, + { + "epoch": 0.8933362549606798, + "grad_norm": 4.240288257598877, + "learning_rate": 1.1812202296268226e-06, + "loss": 0.7913, + "step": 18346 + }, + { + "epoch": 0.8933849487497869, + "grad_norm": 2.618114709854126, + "learning_rate": 1.1801524699215073e-06, + "loss": 0.8479, + "step": 18347 + }, + { + "epoch": 0.8934336425388941, + "grad_norm": 1.5066264867782593, + "learning_rate": 1.179085178361623e-06, + "loss": 0.8765, + "step": 18348 + }, + { + "epoch": 0.8934823363280013, + "grad_norm": 1.5985875129699707, + "learning_rate": 1.1780183549737068e-06, + "loss": 0.8373, + "step": 18349 + }, + { + "epoch": 0.8935310301171085, + "grad_norm": 1.955818772315979, + "learning_rate": 1.1769519997843014e-06, + "loss": 0.7935, + "step": 18350 + }, + { + "epoch": 0.8935797239062158, + "grad_norm": 1.5068053007125854, + "learning_rate": 1.1758861128199305e-06, + "loss": 0.8127, + "step": 18351 + }, + { + "epoch": 0.893628417695323, + "grad_norm": 1.3067609071731567, + "learning_rate": 1.1748206941071061e-06, + "loss": 0.6905, + "step": 18352 + }, + { + "epoch": 0.8936771114844302, + "grad_norm": 3.215585947036743, + "learning_rate": 1.1737557436723402e-06, + "loss": 0.7529, + "step": 18353 + }, + { + "epoch": 0.8937258052735374, + "grad_norm": 3.106795072555542, + "learning_rate": 1.1726912615421093e-06, + "loss": 0.7982, + "step": 18354 + }, + { + "epoch": 0.8937744990626446, + "grad_norm": 0.08949983865022659, + "learning_rate": 1.171627247742908e-06, + "loss": 0.5461, + "step": 18355 + }, + { + "epoch": 0.8938231928517517, + "grad_norm": 2.0206844806671143, + "learning_rate": 1.1705637023011885e-06, + "loss": 0.7794, + "step": 18356 + }, + { + "epoch": 0.8938718866408589, + "grad_norm": 2.0705983638763428, + "learning_rate": 1.169500625243416e-06, + "loss": 0.8534, + "step": 18357 + }, + { + "epoch": 0.8939205804299661, + "grad_norm": 1.565242886543274, + "learning_rate": 1.1684380165960342e-06, + "loss": 0.9206, + "step": 18358 + }, + { + "epoch": 0.8939692742190734, + "grad_norm": 1.3682292699813843, + "learning_rate": 1.1673758763854726e-06, + "loss": 0.7445, + "step": 18359 + }, + { + "epoch": 0.8940179680081806, + "grad_norm": 1.3661236763000488, + "learning_rate": 1.1663142046381525e-06, + "loss": 0.7788, + "step": 18360 + }, + { + "epoch": 0.8940666617972878, + "grad_norm": 1.4354147911071777, + "learning_rate": 1.1652530013804819e-06, + "loss": 0.7816, + "step": 18361 + }, + { + "epoch": 0.894115355586395, + "grad_norm": 2.863908290863037, + "learning_rate": 1.164192266638866e-06, + "loss": 0.715, + "step": 18362 + }, + { + "epoch": 0.8941640493755022, + "grad_norm": 1.8622177839279175, + "learning_rate": 1.1631320004396774e-06, + "loss": 0.7734, + "step": 18363 + }, + { + "epoch": 0.8942127431646093, + "grad_norm": 2.7497358322143555, + "learning_rate": 1.1620722028093013e-06, + "loss": 0.7371, + "step": 18364 + }, + { + "epoch": 0.8942614369537165, + "grad_norm": 1.5336401462554932, + "learning_rate": 1.161012873774099e-06, + "loss": 0.8216, + "step": 18365 + }, + { + "epoch": 0.8943101307428237, + "grad_norm": 1.6564834117889404, + "learning_rate": 1.159954013360416e-06, + "loss": 0.7019, + "step": 18366 + }, + { + "epoch": 0.8943588245319309, + "grad_norm": 2.474954128265381, + "learning_rate": 1.1588956215945957e-06, + "loss": 0.7742, + "step": 18367 + }, + { + "epoch": 0.8944075183210382, + "grad_norm": 1.3840439319610596, + "learning_rate": 1.157837698502966e-06, + "loss": 0.8607, + "step": 18368 + }, + { + "epoch": 0.8944562121101454, + "grad_norm": 3.672541379928589, + "learning_rate": 1.1567802441118414e-06, + "loss": 0.7256, + "step": 18369 + }, + { + "epoch": 0.8945049058992526, + "grad_norm": 1.9426097869873047, + "learning_rate": 1.1557232584475208e-06, + "loss": 0.8007, + "step": 18370 + }, + { + "epoch": 0.8945535996883598, + "grad_norm": 1.5461875200271606, + "learning_rate": 1.15466674153631e-06, + "loss": 0.8714, + "step": 18371 + }, + { + "epoch": 0.894602293477467, + "grad_norm": 8.29548168182373, + "learning_rate": 1.1536106934044766e-06, + "loss": 0.8963, + "step": 18372 + }, + { + "epoch": 0.8946509872665741, + "grad_norm": 1.809525489807129, + "learning_rate": 1.1525551140782997e-06, + "loss": 0.7248, + "step": 18373 + }, + { + "epoch": 0.8946996810556813, + "grad_norm": 2.006801128387451, + "learning_rate": 1.1515000035840297e-06, + "loss": 0.7998, + "step": 18374 + }, + { + "epoch": 0.8947483748447885, + "grad_norm": 1.4207758903503418, + "learning_rate": 1.1504453619479183e-06, + "loss": 0.8288, + "step": 18375 + }, + { + "epoch": 0.8947970686338957, + "grad_norm": 1.2553181648254395, + "learning_rate": 1.1493911891961962e-06, + "loss": 0.817, + "step": 18376 + }, + { + "epoch": 0.894845762423003, + "grad_norm": 1.7633862495422363, + "learning_rate": 1.1483374853550866e-06, + "loss": 0.8146, + "step": 18377 + }, + { + "epoch": 0.8948944562121102, + "grad_norm": 1.4723045825958252, + "learning_rate": 1.147284250450802e-06, + "loss": 0.9051, + "step": 18378 + }, + { + "epoch": 0.8949431500012174, + "grad_norm": 2.067756414413452, + "learning_rate": 1.1462314845095346e-06, + "loss": 0.7832, + "step": 18379 + }, + { + "epoch": 0.8949918437903246, + "grad_norm": 1.460900902748108, + "learning_rate": 1.1451791875574837e-06, + "loss": 0.8323, + "step": 18380 + }, + { + "epoch": 0.8950405375794317, + "grad_norm": 2.083611249923706, + "learning_rate": 1.144127359620819e-06, + "loss": 0.8808, + "step": 18381 + }, + { + "epoch": 0.8950892313685389, + "grad_norm": 1.945175290107727, + "learning_rate": 1.143076000725707e-06, + "loss": 0.8336, + "step": 18382 + }, + { + "epoch": 0.8951379251576461, + "grad_norm": 1.65477454662323, + "learning_rate": 1.1420251108982971e-06, + "loss": 0.8585, + "step": 18383 + }, + { + "epoch": 0.8951866189467533, + "grad_norm": 2.8922290802001953, + "learning_rate": 1.1409746901647312e-06, + "loss": 0.7934, + "step": 18384 + }, + { + "epoch": 0.8952353127358605, + "grad_norm": 1.9690048694610596, + "learning_rate": 1.1399247385511413e-06, + "loss": 0.8051, + "step": 18385 + }, + { + "epoch": 0.8952840065249678, + "grad_norm": 1.591271996498108, + "learning_rate": 1.1388752560836424e-06, + "loss": 0.7989, + "step": 18386 + }, + { + "epoch": 0.895332700314075, + "grad_norm": 1.406182885169983, + "learning_rate": 1.137826242788338e-06, + "loss": 0.851, + "step": 18387 + }, + { + "epoch": 0.8953813941031822, + "grad_norm": 1.3467974662780762, + "learning_rate": 1.1367776986913314e-06, + "loss": 0.9442, + "step": 18388 + }, + { + "epoch": 0.8954300878922893, + "grad_norm": 1.8066896200180054, + "learning_rate": 1.1357296238186933e-06, + "loss": 0.7922, + "step": 18389 + }, + { + "epoch": 0.8954787816813965, + "grad_norm": 1.8789936304092407, + "learning_rate": 1.1346820181965046e-06, + "loss": 0.8097, + "step": 18390 + }, + { + "epoch": 0.8955274754705037, + "grad_norm": 1.4259164333343506, + "learning_rate": 1.1336348818508203e-06, + "loss": 0.8815, + "step": 18391 + }, + { + "epoch": 0.8955761692596109, + "grad_norm": 1.55315101146698, + "learning_rate": 1.1325882148076883e-06, + "loss": 0.7522, + "step": 18392 + }, + { + "epoch": 0.8956248630487181, + "grad_norm": 1.2840386629104614, + "learning_rate": 1.1315420170931435e-06, + "loss": 0.7699, + "step": 18393 + }, + { + "epoch": 0.8956735568378253, + "grad_norm": 1.5562989711761475, + "learning_rate": 1.1304962887332138e-06, + "loss": 0.9019, + "step": 18394 + }, + { + "epoch": 0.8957222506269326, + "grad_norm": 2.956080198287964, + "learning_rate": 1.1294510297539075e-06, + "loss": 0.74, + "step": 18395 + }, + { + "epoch": 0.8957709444160398, + "grad_norm": 1.7202117443084717, + "learning_rate": 1.1284062401812257e-06, + "loss": 0.8237, + "step": 18396 + }, + { + "epoch": 0.895819638205147, + "grad_norm": 1.3289401531219482, + "learning_rate": 1.1273619200411657e-06, + "loss": 0.7582, + "step": 18397 + }, + { + "epoch": 0.8958683319942541, + "grad_norm": 1.2715771198272705, + "learning_rate": 1.126318069359691e-06, + "loss": 0.7563, + "step": 18398 + }, + { + "epoch": 0.8959170257833613, + "grad_norm": 1.8509901762008667, + "learning_rate": 1.1252746881627785e-06, + "loss": 0.7998, + "step": 18399 + }, + { + "epoch": 0.8959657195724685, + "grad_norm": 1.5771549940109253, + "learning_rate": 1.1242317764763811e-06, + "loss": 0.811, + "step": 18400 + }, + { + "epoch": 0.8960144133615757, + "grad_norm": 3.162076473236084, + "learning_rate": 1.1231893343264378e-06, + "loss": 0.7632, + "step": 18401 + }, + { + "epoch": 0.8960631071506829, + "grad_norm": 1.2102562189102173, + "learning_rate": 1.1221473617388812e-06, + "loss": 0.8447, + "step": 18402 + }, + { + "epoch": 0.8961118009397901, + "grad_norm": 1.5824323892593384, + "learning_rate": 1.1211058587396262e-06, + "loss": 0.8153, + "step": 18403 + }, + { + "epoch": 0.8961604947288974, + "grad_norm": 1.2112462520599365, + "learning_rate": 1.1200648253545899e-06, + "loss": 0.7591, + "step": 18404 + }, + { + "epoch": 0.8962091885180046, + "grad_norm": 1.280637264251709, + "learning_rate": 1.1190242616096581e-06, + "loss": 0.837, + "step": 18405 + }, + { + "epoch": 0.8962578823071117, + "grad_norm": 1.5527310371398926, + "learning_rate": 1.1179841675307256e-06, + "loss": 0.7448, + "step": 18406 + }, + { + "epoch": 0.8963065760962189, + "grad_norm": 1.6726917028427124, + "learning_rate": 1.1169445431436498e-06, + "loss": 0.7845, + "step": 18407 + }, + { + "epoch": 0.8963552698853261, + "grad_norm": 1.2979087829589844, + "learning_rate": 1.1159053884743055e-06, + "loss": 0.7311, + "step": 18408 + }, + { + "epoch": 0.8964039636744333, + "grad_norm": 1.351333498954773, + "learning_rate": 1.1148667035485361e-06, + "loss": 0.7986, + "step": 18409 + }, + { + "epoch": 0.8964526574635405, + "grad_norm": 1.4179216623306274, + "learning_rate": 1.113828488392177e-06, + "loss": 0.8021, + "step": 18410 + }, + { + "epoch": 0.8965013512526477, + "grad_norm": 1.6458407640457153, + "learning_rate": 1.1127907430310602e-06, + "loss": 0.7398, + "step": 18411 + }, + { + "epoch": 0.896550045041755, + "grad_norm": 1.7014133930206299, + "learning_rate": 1.1117534674909924e-06, + "loss": 0.7455, + "step": 18412 + }, + { + "epoch": 0.8965987388308622, + "grad_norm": 1.7013596296310425, + "learning_rate": 1.110716661797784e-06, + "loss": 0.7118, + "step": 18413 + }, + { + "epoch": 0.8966474326199694, + "grad_norm": 2.5408549308776855, + "learning_rate": 1.1096803259772137e-06, + "loss": 0.811, + "step": 18414 + }, + { + "epoch": 0.8966961264090765, + "grad_norm": 2.282528877258301, + "learning_rate": 1.1086444600550705e-06, + "loss": 0.8348, + "step": 18415 + }, + { + "epoch": 0.8967448201981837, + "grad_norm": 1.3896567821502686, + "learning_rate": 1.1076090640571201e-06, + "loss": 0.791, + "step": 18416 + }, + { + "epoch": 0.8967935139872909, + "grad_norm": 1.5235826969146729, + "learning_rate": 1.1065741380091178e-06, + "loss": 0.8755, + "step": 18417 + }, + { + "epoch": 0.8968422077763981, + "grad_norm": 1.987815022468567, + "learning_rate": 1.1055396819368048e-06, + "loss": 0.7633, + "step": 18418 + }, + { + "epoch": 0.8968909015655053, + "grad_norm": 0.10095095634460449, + "learning_rate": 1.1045056958659118e-06, + "loss": 0.6432, + "step": 18419 + }, + { + "epoch": 0.8969395953546125, + "grad_norm": 1.3258919715881348, + "learning_rate": 1.1034721798221692e-06, + "loss": 0.8561, + "step": 18420 + }, + { + "epoch": 0.8969882891437198, + "grad_norm": 2.263075351715088, + "learning_rate": 1.1024391338312745e-06, + "loss": 0.727, + "step": 18421 + }, + { + "epoch": 0.897036982932827, + "grad_norm": 1.8170973062515259, + "learning_rate": 1.1014065579189337e-06, + "loss": 0.8558, + "step": 18422 + }, + { + "epoch": 0.8970856767219341, + "grad_norm": 1.4718176126480103, + "learning_rate": 1.1003744521108217e-06, + "loss": 0.7931, + "step": 18423 + }, + { + "epoch": 0.8971343705110413, + "grad_norm": 1.7671061754226685, + "learning_rate": 1.0993428164326203e-06, + "loss": 0.7617, + "step": 18424 + }, + { + "epoch": 0.8971830643001485, + "grad_norm": 1.8506392240524292, + "learning_rate": 1.098311650909991e-06, + "loss": 0.7756, + "step": 18425 + }, + { + "epoch": 0.8972317580892557, + "grad_norm": 1.3414078950881958, + "learning_rate": 1.0972809555685826e-06, + "loss": 0.7935, + "step": 18426 + }, + { + "epoch": 0.8972804518783629, + "grad_norm": 1.667130947113037, + "learning_rate": 1.0962507304340343e-06, + "loss": 0.7439, + "step": 18427 + }, + { + "epoch": 0.8973291456674701, + "grad_norm": 1.8342007398605347, + "learning_rate": 1.0952209755319721e-06, + "loss": 0.7612, + "step": 18428 + }, + { + "epoch": 0.8973778394565773, + "grad_norm": 1.5134855508804321, + "learning_rate": 1.0941916908880134e-06, + "loss": 0.7718, + "step": 18429 + }, + { + "epoch": 0.8974265332456846, + "grad_norm": 1.9177743196487427, + "learning_rate": 1.0931628765277557e-06, + "loss": 0.7895, + "step": 18430 + }, + { + "epoch": 0.8974752270347918, + "grad_norm": 1.6671578884124756, + "learning_rate": 1.0921345324767984e-06, + "loss": 0.8292, + "step": 18431 + }, + { + "epoch": 0.8975239208238989, + "grad_norm": 1.7185947895050049, + "learning_rate": 1.0911066587607188e-06, + "loss": 0.8721, + "step": 18432 + }, + { + "epoch": 0.8975726146130061, + "grad_norm": 1.4148364067077637, + "learning_rate": 1.0900792554050854e-06, + "loss": 0.7935, + "step": 18433 + }, + { + "epoch": 0.8976213084021133, + "grad_norm": 1.572749376296997, + "learning_rate": 1.0890523224354532e-06, + "loss": 0.7895, + "step": 18434 + }, + { + "epoch": 0.8976700021912205, + "grad_norm": 0.09876576066017151, + "learning_rate": 1.0880258598773708e-06, + "loss": 0.6049, + "step": 18435 + }, + { + "epoch": 0.8977186959803277, + "grad_norm": 1.6822644472122192, + "learning_rate": 1.0869998677563665e-06, + "loss": 0.8505, + "step": 18436 + }, + { + "epoch": 0.8977673897694349, + "grad_norm": 1.9275883436203003, + "learning_rate": 1.0859743460979665e-06, + "loss": 0.8307, + "step": 18437 + }, + { + "epoch": 0.8978160835585421, + "grad_norm": 1.4857959747314453, + "learning_rate": 1.0849492949276773e-06, + "loss": 0.767, + "step": 18438 + }, + { + "epoch": 0.8978647773476494, + "grad_norm": 1.9679291248321533, + "learning_rate": 1.0839247142710008e-06, + "loss": 0.8419, + "step": 18439 + }, + { + "epoch": 0.8979134711367565, + "grad_norm": 1.8904287815093994, + "learning_rate": 1.082900604153423e-06, + "loss": 0.7717, + "step": 18440 + }, + { + "epoch": 0.8979621649258637, + "grad_norm": 2.2218918800354004, + "learning_rate": 1.0818769646004168e-06, + "loss": 0.8726, + "step": 18441 + }, + { + "epoch": 0.8980108587149709, + "grad_norm": 1.2178317308425903, + "learning_rate": 1.0808537956374443e-06, + "loss": 0.8184, + "step": 18442 + }, + { + "epoch": 0.8980595525040781, + "grad_norm": 1.4327542781829834, + "learning_rate": 1.0798310972899606e-06, + "loss": 0.8231, + "step": 18443 + }, + { + "epoch": 0.8981082462931853, + "grad_norm": 2.8741533756256104, + "learning_rate": 1.078808869583403e-06, + "loss": 0.871, + "step": 18444 + }, + { + "epoch": 0.8981569400822925, + "grad_norm": 1.628772497177124, + "learning_rate": 1.0777871125432005e-06, + "loss": 0.8298, + "step": 18445 + }, + { + "epoch": 0.8982056338713997, + "grad_norm": 1.9676400423049927, + "learning_rate": 1.0767658261947677e-06, + "loss": 0.8187, + "step": 18446 + }, + { + "epoch": 0.898254327660507, + "grad_norm": 1.8147637844085693, + "learning_rate": 1.075745010563507e-06, + "loss": 0.8132, + "step": 18447 + }, + { + "epoch": 0.898303021449614, + "grad_norm": 1.4768390655517578, + "learning_rate": 1.0747246656748223e-06, + "loss": 0.869, + "step": 18448 + }, + { + "epoch": 0.8983517152387213, + "grad_norm": 3.85022234916687, + "learning_rate": 1.07370479155408e-06, + "loss": 0.7837, + "step": 18449 + }, + { + "epoch": 0.8984004090278285, + "grad_norm": 1.3322553634643555, + "learning_rate": 1.0726853882266597e-06, + "loss": 0.8794, + "step": 18450 + }, + { + "epoch": 0.8984491028169357, + "grad_norm": 1.270155668258667, + "learning_rate": 1.0716664557179168e-06, + "loss": 0.7296, + "step": 18451 + }, + { + "epoch": 0.8984977966060429, + "grad_norm": 1.821521282196045, + "learning_rate": 1.0706479940531956e-06, + "loss": 0.7283, + "step": 18452 + }, + { + "epoch": 0.8985464903951501, + "grad_norm": 4.592439651489258, + "learning_rate": 1.069630003257831e-06, + "loss": 0.787, + "step": 18453 + }, + { + "epoch": 0.8985951841842573, + "grad_norm": 1.8986796140670776, + "learning_rate": 1.0686124833571433e-06, + "loss": 0.6944, + "step": 18454 + }, + { + "epoch": 0.8986438779733645, + "grad_norm": 1.4501914978027344, + "learning_rate": 1.067595434376454e-06, + "loss": 0.7423, + "step": 18455 + }, + { + "epoch": 0.8986925717624717, + "grad_norm": 2.872303009033203, + "learning_rate": 1.0665788563410473e-06, + "loss": 0.8323, + "step": 18456 + }, + { + "epoch": 0.8987412655515788, + "grad_norm": 1.38197660446167, + "learning_rate": 1.0655627492762233e-06, + "loss": 0.7781, + "step": 18457 + }, + { + "epoch": 0.8987899593406861, + "grad_norm": 1.9554224014282227, + "learning_rate": 1.064547113207246e-06, + "loss": 0.7768, + "step": 18458 + }, + { + "epoch": 0.8988386531297933, + "grad_norm": 1.295678734779358, + "learning_rate": 1.063531948159391e-06, + "loss": 0.7357, + "step": 18459 + }, + { + "epoch": 0.8988873469189005, + "grad_norm": 1.7525827884674072, + "learning_rate": 1.0625172541579042e-06, + "loss": 0.814, + "step": 18460 + }, + { + "epoch": 0.8989360407080077, + "grad_norm": 2.970189332962036, + "learning_rate": 1.0615030312280238e-06, + "loss": 0.7719, + "step": 18461 + }, + { + "epoch": 0.8989847344971149, + "grad_norm": 1.7755911350250244, + "learning_rate": 1.0604892793949872e-06, + "loss": 0.8635, + "step": 18462 + }, + { + "epoch": 0.8990334282862221, + "grad_norm": 2.606520414352417, + "learning_rate": 1.059475998684003e-06, + "loss": 0.7305, + "step": 18463 + }, + { + "epoch": 0.8990821220753293, + "grad_norm": 1.3544411659240723, + "learning_rate": 1.0584631891202845e-06, + "loss": 0.8681, + "step": 18464 + }, + { + "epoch": 0.8991308158644364, + "grad_norm": 5.52534818649292, + "learning_rate": 1.057450850729016e-06, + "loss": 0.8076, + "step": 18465 + }, + { + "epoch": 0.8991795096535437, + "grad_norm": 1.4273792505264282, + "learning_rate": 1.0564389835353884e-06, + "loss": 0.8591, + "step": 18466 + }, + { + "epoch": 0.8992282034426509, + "grad_norm": 1.7351912260055542, + "learning_rate": 1.0554275875645658e-06, + "loss": 0.7838, + "step": 18467 + }, + { + "epoch": 0.8992768972317581, + "grad_norm": 1.4309414625167847, + "learning_rate": 1.0544166628417086e-06, + "loss": 0.7467, + "step": 18468 + }, + { + "epoch": 0.8993255910208653, + "grad_norm": 1.534334421157837, + "learning_rate": 1.0534062093919695e-06, + "loss": 0.7057, + "step": 18469 + }, + { + "epoch": 0.8993742848099725, + "grad_norm": 1.710700273513794, + "learning_rate": 1.0523962272404731e-06, + "loss": 0.7925, + "step": 18470 + }, + { + "epoch": 0.8994229785990797, + "grad_norm": 1.6333800554275513, + "learning_rate": 1.0513867164123525e-06, + "loss": 0.7513, + "step": 18471 + }, + { + "epoch": 0.8994716723881869, + "grad_norm": 1.2786154747009277, + "learning_rate": 1.0503776769327102e-06, + "loss": 0.779, + "step": 18472 + }, + { + "epoch": 0.8995203661772941, + "grad_norm": 2.1051599979400635, + "learning_rate": 1.0493691088266544e-06, + "loss": 0.8079, + "step": 18473 + }, + { + "epoch": 0.8995690599664012, + "grad_norm": 3.3031723499298096, + "learning_rate": 1.0483610121192678e-06, + "loss": 0.818, + "step": 18474 + }, + { + "epoch": 0.8996177537555085, + "grad_norm": 1.7451707124710083, + "learning_rate": 1.0473533868356323e-06, + "loss": 0.8508, + "step": 18475 + }, + { + "epoch": 0.8996664475446157, + "grad_norm": 5.60463809967041, + "learning_rate": 1.0463462330008057e-06, + "loss": 0.7455, + "step": 18476 + }, + { + "epoch": 0.8997151413337229, + "grad_norm": 3.823204517364502, + "learning_rate": 1.045339550639848e-06, + "loss": 0.8481, + "step": 18477 + }, + { + "epoch": 0.8997638351228301, + "grad_norm": 2.405985116958618, + "learning_rate": 1.0443333397777944e-06, + "loss": 0.7993, + "step": 18478 + }, + { + "epoch": 0.8998125289119373, + "grad_norm": 1.5955793857574463, + "learning_rate": 1.0433276004396764e-06, + "loss": 0.8144, + "step": 18479 + }, + { + "epoch": 0.8998612227010445, + "grad_norm": 0.0912172868847847, + "learning_rate": 1.0423223326505182e-06, + "loss": 0.5297, + "step": 18480 + }, + { + "epoch": 0.8999099164901517, + "grad_norm": 1.6127238273620605, + "learning_rate": 1.0413175364353135e-06, + "loss": 0.7237, + "step": 18481 + }, + { + "epoch": 0.8999586102792588, + "grad_norm": 0.09058994799852371, + "learning_rate": 1.0403132118190662e-06, + "loss": 0.5982, + "step": 18482 + }, + { + "epoch": 0.900007304068366, + "grad_norm": 0.09604323655366898, + "learning_rate": 1.0393093588267588e-06, + "loss": 0.6158, + "step": 18483 + }, + { + "epoch": 0.9000559978574733, + "grad_norm": 1.7735340595245361, + "learning_rate": 1.0383059774833581e-06, + "loss": 0.9515, + "step": 18484 + }, + { + "epoch": 0.9001046916465805, + "grad_norm": 1.4727431535720825, + "learning_rate": 1.0373030678138263e-06, + "loss": 0.7927, + "step": 18485 + }, + { + "epoch": 0.9001533854356877, + "grad_norm": 1.8156111240386963, + "learning_rate": 1.0363006298431078e-06, + "loss": 0.7272, + "step": 18486 + }, + { + "epoch": 0.9002020792247949, + "grad_norm": 1.563887119293213, + "learning_rate": 1.0352986635961405e-06, + "loss": 0.7672, + "step": 18487 + }, + { + "epoch": 0.9002507730139021, + "grad_norm": 1.3824735879898071, + "learning_rate": 1.0342971690978465e-06, + "loss": 0.8309, + "step": 18488 + }, + { + "epoch": 0.9002994668030093, + "grad_norm": 1.702791452407837, + "learning_rate": 1.0332961463731416e-06, + "loss": 0.7404, + "step": 18489 + }, + { + "epoch": 0.9003481605921165, + "grad_norm": 3.3677239418029785, + "learning_rate": 1.032295595446926e-06, + "loss": 0.8814, + "step": 18490 + }, + { + "epoch": 0.9003968543812236, + "grad_norm": 1.657511830329895, + "learning_rate": 1.031295516344084e-06, + "loss": 0.8883, + "step": 18491 + }, + { + "epoch": 0.9004455481703308, + "grad_norm": 1.4149609804153442, + "learning_rate": 1.0302959090894981e-06, + "loss": 0.7424, + "step": 18492 + }, + { + "epoch": 0.9004942419594381, + "grad_norm": 2.2299983501434326, + "learning_rate": 1.0292967737080307e-06, + "loss": 0.8402, + "step": 18493 + }, + { + "epoch": 0.9005429357485453, + "grad_norm": 2.266209125518799, + "learning_rate": 1.0282981102245348e-06, + "loss": 0.8741, + "step": 18494 + }, + { + "epoch": 0.9005916295376525, + "grad_norm": 3.2074878215789795, + "learning_rate": 1.0272999186638532e-06, + "loss": 0.7479, + "step": 18495 + }, + { + "epoch": 0.9006403233267597, + "grad_norm": 1.7097318172454834, + "learning_rate": 1.0263021990508127e-06, + "loss": 0.8003, + "step": 18496 + }, + { + "epoch": 0.9006890171158669, + "grad_norm": 1.53947913646698, + "learning_rate": 1.025304951410242e-06, + "loss": 0.8195, + "step": 18497 + }, + { + "epoch": 0.9007377109049741, + "grad_norm": 1.8117129802703857, + "learning_rate": 1.024308175766935e-06, + "loss": 0.6708, + "step": 18498 + }, + { + "epoch": 0.9007864046940812, + "grad_norm": 1.8380873203277588, + "learning_rate": 1.0233118721456936e-06, + "loss": 0.8903, + "step": 18499 + }, + { + "epoch": 0.9008350984831884, + "grad_norm": 1.5682510137557983, + "learning_rate": 1.0223160405713005e-06, + "loss": 0.8369, + "step": 18500 + }, + { + "epoch": 0.9008837922722956, + "grad_norm": 1.2286183834075928, + "learning_rate": 1.0213206810685273e-06, + "loss": 0.8885, + "step": 18501 + }, + { + "epoch": 0.9009324860614029, + "grad_norm": 1.6163790225982666, + "learning_rate": 1.0203257936621314e-06, + "loss": 0.8675, + "step": 18502 + }, + { + "epoch": 0.9009811798505101, + "grad_norm": 1.877798318862915, + "learning_rate": 1.0193313783768621e-06, + "loss": 0.8951, + "step": 18503 + }, + { + "epoch": 0.9010298736396173, + "grad_norm": 2.5754148960113525, + "learning_rate": 1.0183374352374553e-06, + "loss": 0.8059, + "step": 18504 + }, + { + "epoch": 0.9010785674287245, + "grad_norm": 1.581236481666565, + "learning_rate": 1.0173439642686312e-06, + "loss": 0.7885, + "step": 18505 + }, + { + "epoch": 0.9011272612178317, + "grad_norm": 1.3518977165222168, + "learning_rate": 1.0163509654951143e-06, + "loss": 0.8066, + "step": 18506 + }, + { + "epoch": 0.9011759550069388, + "grad_norm": 1.4088391065597534, + "learning_rate": 1.0153584389415915e-06, + "loss": 0.8928, + "step": 18507 + }, + { + "epoch": 0.901224648796046, + "grad_norm": 1.2010622024536133, + "learning_rate": 1.0143663846327611e-06, + "loss": 0.884, + "step": 18508 + }, + { + "epoch": 0.9012733425851532, + "grad_norm": 2.492219924926758, + "learning_rate": 1.0133748025932987e-06, + "loss": 0.8013, + "step": 18509 + }, + { + "epoch": 0.9013220363742604, + "grad_norm": 1.5464978218078613, + "learning_rate": 1.0123836928478691e-06, + "loss": 0.7346, + "step": 18510 + }, + { + "epoch": 0.9013707301633677, + "grad_norm": 1.6636948585510254, + "learning_rate": 1.011393055421126e-06, + "loss": 0.8814, + "step": 18511 + }, + { + "epoch": 0.9014194239524749, + "grad_norm": 1.599010705947876, + "learning_rate": 1.0104028903377095e-06, + "loss": 0.8268, + "step": 18512 + }, + { + "epoch": 0.9014681177415821, + "grad_norm": 1.9095454216003418, + "learning_rate": 1.0094131976222577e-06, + "loss": 0.8452, + "step": 18513 + }, + { + "epoch": 0.9015168115306893, + "grad_norm": 1.2944632768630981, + "learning_rate": 1.0084239772993798e-06, + "loss": 0.7717, + "step": 18514 + }, + { + "epoch": 0.9015655053197965, + "grad_norm": 2.2668044567108154, + "learning_rate": 1.0074352293936917e-06, + "loss": 0.7797, + "step": 18515 + }, + { + "epoch": 0.9016141991089036, + "grad_norm": 2.1834192276000977, + "learning_rate": 1.0064469539297783e-06, + "loss": 0.7861, + "step": 18516 + }, + { + "epoch": 0.9016628928980108, + "grad_norm": 2.528362274169922, + "learning_rate": 1.005459150932231e-06, + "loss": 0.7263, + "step": 18517 + }, + { + "epoch": 0.901711586687118, + "grad_norm": 1.5769459009170532, + "learning_rate": 1.0044718204256187e-06, + "loss": 0.8677, + "step": 18518 + }, + { + "epoch": 0.9017602804762253, + "grad_norm": 1.3177287578582764, + "learning_rate": 1.0034849624345e-06, + "loss": 0.8672, + "step": 18519 + }, + { + "epoch": 0.9018089742653325, + "grad_norm": 1.5294926166534424, + "learning_rate": 1.0024985769834305e-06, + "loss": 0.6946, + "step": 18520 + }, + { + "epoch": 0.9018576680544397, + "grad_norm": 1.4514780044555664, + "learning_rate": 1.0015126640969354e-06, + "loss": 0.7725, + "step": 18521 + }, + { + "epoch": 0.9019063618435469, + "grad_norm": 1.950048804283142, + "learning_rate": 1.00052722379955e-06, + "loss": 0.8424, + "step": 18522 + }, + { + "epoch": 0.9019550556326541, + "grad_norm": 3.7097787857055664, + "learning_rate": 9.995422561157774e-07, + "loss": 0.7971, + "step": 18523 + }, + { + "epoch": 0.9020037494217612, + "grad_norm": 3.141719102859497, + "learning_rate": 9.98557761070127e-07, + "loss": 0.6667, + "step": 18524 + }, + { + "epoch": 0.9020524432108684, + "grad_norm": 1.9400103092193604, + "learning_rate": 9.975737386870833e-07, + "loss": 0.7913, + "step": 18525 + }, + { + "epoch": 0.9021011369999756, + "grad_norm": 1.3741521835327148, + "learning_rate": 9.965901889911267e-07, + "loss": 0.8103, + "step": 18526 + }, + { + "epoch": 0.9021498307890828, + "grad_norm": 1.3996164798736572, + "learning_rate": 9.956071120067202e-07, + "loss": 0.8231, + "step": 18527 + }, + { + "epoch": 0.90219852457819, + "grad_norm": 1.239311933517456, + "learning_rate": 9.946245077583194e-07, + "loss": 0.821, + "step": 18528 + }, + { + "epoch": 0.9022472183672973, + "grad_norm": 1.3924676179885864, + "learning_rate": 9.936423762703696e-07, + "loss": 0.8361, + "step": 18529 + }, + { + "epoch": 0.9022959121564045, + "grad_norm": 0.09428662806749344, + "learning_rate": 9.926607175672954e-07, + "loss": 0.6203, + "step": 18530 + }, + { + "epoch": 0.9023446059455117, + "grad_norm": 0.1076309010386467, + "learning_rate": 9.916795316735218e-07, + "loss": 0.6091, + "step": 18531 + }, + { + "epoch": 0.9023932997346189, + "grad_norm": 1.3057658672332764, + "learning_rate": 9.906988186134493e-07, + "loss": 0.9274, + "step": 18532 + }, + { + "epoch": 0.902441993523726, + "grad_norm": 1.300139307975769, + "learning_rate": 9.897185784114783e-07, + "loss": 0.8089, + "step": 18533 + }, + { + "epoch": 0.9024906873128332, + "grad_norm": 1.5773669481277466, + "learning_rate": 9.887388110919916e-07, + "loss": 0.7961, + "step": 18534 + }, + { + "epoch": 0.9025393811019404, + "grad_norm": 1.5111550092697144, + "learning_rate": 9.877595166793609e-07, + "loss": 0.8234, + "step": 18535 + }, + { + "epoch": 0.9025880748910476, + "grad_norm": 2.388051986694336, + "learning_rate": 9.867806951979465e-07, + "loss": 0.8344, + "step": 18536 + }, + { + "epoch": 0.9026367686801549, + "grad_norm": 1.170751929283142, + "learning_rate": 9.858023466720933e-07, + "loss": 0.7522, + "step": 18537 + }, + { + "epoch": 0.9026854624692621, + "grad_norm": 2.0719785690307617, + "learning_rate": 9.848244711261423e-07, + "loss": 0.816, + "step": 18538 + }, + { + "epoch": 0.9027341562583693, + "grad_norm": 1.429911494255066, + "learning_rate": 9.838470685844136e-07, + "loss": 0.7943, + "step": 18539 + }, + { + "epoch": 0.9027828500474765, + "grad_norm": 1.6952078342437744, + "learning_rate": 9.828701390712235e-07, + "loss": 0.8112, + "step": 18540 + }, + { + "epoch": 0.9028315438365836, + "grad_norm": 2.458388328552246, + "learning_rate": 9.81893682610875e-07, + "loss": 0.9042, + "step": 18541 + }, + { + "epoch": 0.9028802376256908, + "grad_norm": 1.9356261491775513, + "learning_rate": 9.809176992276549e-07, + "loss": 0.8036, + "step": 18542 + }, + { + "epoch": 0.902928931414798, + "grad_norm": 1.465657353401184, + "learning_rate": 9.799421889458394e-07, + "loss": 0.8608, + "step": 18543 + }, + { + "epoch": 0.9029776252039052, + "grad_norm": 1.4583330154418945, + "learning_rate": 9.789671517896982e-07, + "loss": 0.855, + "step": 18544 + }, + { + "epoch": 0.9030263189930124, + "grad_norm": 2.5217132568359375, + "learning_rate": 9.77992587783483e-07, + "loss": 0.8005, + "step": 18545 + }, + { + "epoch": 0.9030750127821197, + "grad_norm": 1.4396318197250366, + "learning_rate": 9.770184969514362e-07, + "loss": 0.7266, + "step": 18546 + }, + { + "epoch": 0.9031237065712269, + "grad_norm": 1.4959863424301147, + "learning_rate": 9.760448793177857e-07, + "loss": 0.8658, + "step": 18547 + }, + { + "epoch": 0.9031724003603341, + "grad_norm": 1.562557578086853, + "learning_rate": 9.750717349067561e-07, + "loss": 0.739, + "step": 18548 + }, + { + "epoch": 0.9032210941494412, + "grad_norm": 1.295438528060913, + "learning_rate": 9.740990637425506e-07, + "loss": 0.7747, + "step": 18549 + }, + { + "epoch": 0.9032697879385484, + "grad_norm": 0.10045719146728516, + "learning_rate": 9.731268658493676e-07, + "loss": 0.6168, + "step": 18550 + }, + { + "epoch": 0.9033184817276556, + "grad_norm": 3.1902263164520264, + "learning_rate": 9.721551412513875e-07, + "loss": 0.7539, + "step": 18551 + }, + { + "epoch": 0.9033671755167628, + "grad_norm": 1.5042691230773926, + "learning_rate": 9.711838899727843e-07, + "loss": 0.8497, + "step": 18552 + }, + { + "epoch": 0.90341586930587, + "grad_norm": 2.019503593444824, + "learning_rate": 9.702131120377145e-07, + "loss": 0.7456, + "step": 18553 + }, + { + "epoch": 0.9034645630949772, + "grad_norm": 1.423892855644226, + "learning_rate": 9.69242807470332e-07, + "loss": 0.7438, + "step": 18554 + }, + { + "epoch": 0.9035132568840845, + "grad_norm": 1.4608635902404785, + "learning_rate": 9.682729762947663e-07, + "loss": 0.7952, + "step": 18555 + }, + { + "epoch": 0.9035619506731917, + "grad_norm": 1.6248342990875244, + "learning_rate": 9.673036185351449e-07, + "loss": 0.7927, + "step": 18556 + }, + { + "epoch": 0.9036106444622989, + "grad_norm": 1.3041484355926514, + "learning_rate": 9.663347342155838e-07, + "loss": 0.8064, + "step": 18557 + }, + { + "epoch": 0.903659338251406, + "grad_norm": 1.6291940212249756, + "learning_rate": 9.653663233601817e-07, + "loss": 0.8473, + "step": 18558 + }, + { + "epoch": 0.9037080320405132, + "grad_norm": 1.3185935020446777, + "learning_rate": 9.64398385993026e-07, + "loss": 0.7529, + "step": 18559 + }, + { + "epoch": 0.9037567258296204, + "grad_norm": 1.4623959064483643, + "learning_rate": 9.634309221381978e-07, + "loss": 0.8836, + "step": 18560 + }, + { + "epoch": 0.9038054196187276, + "grad_norm": 0.09711764752864838, + "learning_rate": 9.624639318197615e-07, + "loss": 0.5854, + "step": 18561 + }, + { + "epoch": 0.9038541134078348, + "grad_norm": 0.09781357645988464, + "learning_rate": 9.614974150617695e-07, + "loss": 0.6599, + "step": 18562 + }, + { + "epoch": 0.903902807196942, + "grad_norm": 1.4142707586288452, + "learning_rate": 9.605313718882647e-07, + "loss": 0.6574, + "step": 18563 + }, + { + "epoch": 0.9039515009860493, + "grad_norm": 1.9008678197860718, + "learning_rate": 9.595658023232813e-07, + "loss": 0.7869, + "step": 18564 + }, + { + "epoch": 0.9040001947751565, + "grad_norm": 1.3996224403381348, + "learning_rate": 9.586007063908287e-07, + "loss": 0.9037, + "step": 18565 + }, + { + "epoch": 0.9040488885642636, + "grad_norm": 1.6870126724243164, + "learning_rate": 9.576360841149256e-07, + "loss": 0.824, + "step": 18566 + }, + { + "epoch": 0.9040975823533708, + "grad_norm": 2.322068214416504, + "learning_rate": 9.566719355195576e-07, + "loss": 0.789, + "step": 18567 + }, + { + "epoch": 0.904146276142478, + "grad_norm": 1.656831979751587, + "learning_rate": 9.557082606287116e-07, + "loss": 0.7998, + "step": 18568 + }, + { + "epoch": 0.9041949699315852, + "grad_norm": 1.7504823207855225, + "learning_rate": 9.547450594663598e-07, + "loss": 0.791, + "step": 18569 + }, + { + "epoch": 0.9042436637206924, + "grad_norm": 1.5913333892822266, + "learning_rate": 9.537823320564587e-07, + "loss": 0.8605, + "step": 18570 + }, + { + "epoch": 0.9042923575097996, + "grad_norm": 1.4273767471313477, + "learning_rate": 9.528200784229646e-07, + "loss": 0.7499, + "step": 18571 + }, + { + "epoch": 0.9043410512989069, + "grad_norm": 1.810502290725708, + "learning_rate": 9.518582985898007e-07, + "loss": 0.7283, + "step": 18572 + }, + { + "epoch": 0.9043897450880141, + "grad_norm": 1.4889137744903564, + "learning_rate": 9.508969925809053e-07, + "loss": 0.7792, + "step": 18573 + }, + { + "epoch": 0.9044384388771213, + "grad_norm": 2.004857063293457, + "learning_rate": 9.499361604201796e-07, + "loss": 0.803, + "step": 18574 + }, + { + "epoch": 0.9044871326662284, + "grad_norm": 0.10022737085819244, + "learning_rate": 9.489758021315287e-07, + "loss": 0.615, + "step": 18575 + }, + { + "epoch": 0.9045358264553356, + "grad_norm": 1.3297486305236816, + "learning_rate": 9.480159177388426e-07, + "loss": 0.7854, + "step": 18576 + }, + { + "epoch": 0.9045845202444428, + "grad_norm": 2.339162826538086, + "learning_rate": 9.470565072659954e-07, + "loss": 0.855, + "step": 18577 + }, + { + "epoch": 0.90463321403355, + "grad_norm": 2.36008358001709, + "learning_rate": 9.460975707368614e-07, + "loss": 0.8167, + "step": 18578 + }, + { + "epoch": 0.9046819078226572, + "grad_norm": 1.619650959968567, + "learning_rate": 9.451391081752814e-07, + "loss": 0.8021, + "step": 18579 + }, + { + "epoch": 0.9047306016117644, + "grad_norm": 5.732860565185547, + "learning_rate": 9.441811196051076e-07, + "loss": 0.7137, + "step": 18580 + }, + { + "epoch": 0.9047792954008717, + "grad_norm": 1.8580189943313599, + "learning_rate": 9.432236050501609e-07, + "loss": 0.7878, + "step": 18581 + }, + { + "epoch": 0.9048279891899789, + "grad_norm": 2.32022762298584, + "learning_rate": 9.422665645342688e-07, + "loss": 0.8279, + "step": 18582 + }, + { + "epoch": 0.904876682979086, + "grad_norm": 1.6371678113937378, + "learning_rate": 9.413099980812323e-07, + "loss": 0.8449, + "step": 18583 + }, + { + "epoch": 0.9049253767681932, + "grad_norm": 2.468606948852539, + "learning_rate": 9.403539057148458e-07, + "loss": 0.8245, + "step": 18584 + }, + { + "epoch": 0.9049740705573004, + "grad_norm": 2.7427515983581543, + "learning_rate": 9.393982874588969e-07, + "loss": 0.8399, + "step": 18585 + }, + { + "epoch": 0.9050227643464076, + "grad_norm": 1.4185208082199097, + "learning_rate": 9.38443143337151e-07, + "loss": 0.8834, + "step": 18586 + }, + { + "epoch": 0.9050714581355148, + "grad_norm": 1.5722333192825317, + "learning_rate": 9.374884733733713e-07, + "loss": 0.7822, + "step": 18587 + }, + { + "epoch": 0.905120151924622, + "grad_norm": 1.583702564239502, + "learning_rate": 9.365342775913011e-07, + "loss": 0.7964, + "step": 18588 + }, + { + "epoch": 0.9051688457137292, + "grad_norm": 1.8506355285644531, + "learning_rate": 9.355805560146836e-07, + "loss": 0.7921, + "step": 18589 + }, + { + "epoch": 0.9052175395028365, + "grad_norm": 1.4987431764602661, + "learning_rate": 9.34627308667233e-07, + "loss": 0.8216, + "step": 18590 + }, + { + "epoch": 0.9052662332919437, + "grad_norm": 1.8151060342788696, + "learning_rate": 9.336745355726684e-07, + "loss": 0.7566, + "step": 18591 + }, + { + "epoch": 0.9053149270810508, + "grad_norm": 2.0068156719207764, + "learning_rate": 9.327222367546884e-07, + "loss": 0.8663, + "step": 18592 + }, + { + "epoch": 0.905363620870158, + "grad_norm": 1.3800185918807983, + "learning_rate": 9.317704122369808e-07, + "loss": 0.8336, + "step": 18593 + }, + { + "epoch": 0.9054123146592652, + "grad_norm": 1.4921988248825073, + "learning_rate": 9.308190620432245e-07, + "loss": 0.7737, + "step": 18594 + }, + { + "epoch": 0.9054610084483724, + "grad_norm": 3.483625888824463, + "learning_rate": 9.298681861970804e-07, + "loss": 0.7878, + "step": 18595 + }, + { + "epoch": 0.9055097022374796, + "grad_norm": 2.0292298793792725, + "learning_rate": 9.289177847222053e-07, + "loss": 0.7703, + "step": 18596 + }, + { + "epoch": 0.9055583960265868, + "grad_norm": 1.5511592626571655, + "learning_rate": 9.27967857642238e-07, + "loss": 0.8234, + "step": 18597 + }, + { + "epoch": 0.905607089815694, + "grad_norm": 1.314408779144287, + "learning_rate": 9.270184049808084e-07, + "loss": 0.7863, + "step": 18598 + }, + { + "epoch": 0.9056557836048013, + "grad_norm": 1.4755842685699463, + "learning_rate": 9.260694267615377e-07, + "loss": 0.7351, + "step": 18599 + }, + { + "epoch": 0.9057044773939084, + "grad_norm": 1.4206733703613281, + "learning_rate": 9.251209230080271e-07, + "loss": 0.859, + "step": 18600 + }, + { + "epoch": 0.9057531711830156, + "grad_norm": 1.4025295972824097, + "learning_rate": 9.241728937438755e-07, + "loss": 0.7178, + "step": 18601 + }, + { + "epoch": 0.9058018649721228, + "grad_norm": 1.5761734247207642, + "learning_rate": 9.232253389926593e-07, + "loss": 0.8621, + "step": 18602 + }, + { + "epoch": 0.90585055876123, + "grad_norm": 1.4259223937988281, + "learning_rate": 9.222782587779533e-07, + "loss": 0.8416, + "step": 18603 + }, + { + "epoch": 0.9058992525503372, + "grad_norm": 1.655966877937317, + "learning_rate": 9.213316531233163e-07, + "loss": 0.7114, + "step": 18604 + }, + { + "epoch": 0.9059479463394444, + "grad_norm": 0.09286128729581833, + "learning_rate": 9.203855220522895e-07, + "loss": 0.5728, + "step": 18605 + }, + { + "epoch": 0.9059966401285516, + "grad_norm": 1.6291900873184204, + "learning_rate": 9.194398655884185e-07, + "loss": 0.7292, + "step": 18606 + }, + { + "epoch": 0.9060453339176588, + "grad_norm": 0.09493766725063324, + "learning_rate": 9.184946837552156e-07, + "loss": 0.5858, + "step": 18607 + }, + { + "epoch": 0.906094027706766, + "grad_norm": 2.352036476135254, + "learning_rate": 9.175499765761997e-07, + "loss": 0.7783, + "step": 18608 + }, + { + "epoch": 0.9061427214958732, + "grad_norm": 1.311805009841919, + "learning_rate": 9.166057440748677e-07, + "loss": 0.758, + "step": 18609 + }, + { + "epoch": 0.9061914152849804, + "grad_norm": 1.400092363357544, + "learning_rate": 9.156619862747073e-07, + "loss": 0.8585, + "step": 18610 + }, + { + "epoch": 0.9062401090740876, + "grad_norm": 0.10067323595285416, + "learning_rate": 9.147187031991956e-07, + "loss": 0.6101, + "step": 18611 + }, + { + "epoch": 0.9062888028631948, + "grad_norm": 2.0133166313171387, + "learning_rate": 9.137758948717978e-07, + "loss": 0.8041, + "step": 18612 + }, + { + "epoch": 0.906337496652302, + "grad_norm": 1.6051379442214966, + "learning_rate": 9.128335613159622e-07, + "loss": 0.749, + "step": 18613 + }, + { + "epoch": 0.9063861904414092, + "grad_norm": 3.1515934467315674, + "learning_rate": 9.118917025551321e-07, + "loss": 0.7077, + "step": 18614 + }, + { + "epoch": 0.9064348842305164, + "grad_norm": 1.5924581289291382, + "learning_rate": 9.1095031861274e-07, + "loss": 0.7988, + "step": 18615 + }, + { + "epoch": 0.9064835780196236, + "grad_norm": 1.6847845315933228, + "learning_rate": 9.100094095121936e-07, + "loss": 0.7894, + "step": 18616 + }, + { + "epoch": 0.9065322718087307, + "grad_norm": 1.9292278289794922, + "learning_rate": 9.090689752769056e-07, + "loss": 0.8862, + "step": 18617 + }, + { + "epoch": 0.906580965597838, + "grad_norm": 1.2038949728012085, + "learning_rate": 9.081290159302703e-07, + "loss": 0.8097, + "step": 18618 + }, + { + "epoch": 0.9066296593869452, + "grad_norm": 0.10291563719511032, + "learning_rate": 9.071895314956647e-07, + "loss": 0.5782, + "step": 18619 + }, + { + "epoch": 0.9066783531760524, + "grad_norm": 10.404300689697266, + "learning_rate": 9.06250521996459e-07, + "loss": 0.8817, + "step": 18620 + }, + { + "epoch": 0.9067270469651596, + "grad_norm": 1.3058096170425415, + "learning_rate": 9.053119874560124e-07, + "loss": 0.8041, + "step": 18621 + }, + { + "epoch": 0.9067757407542668, + "grad_norm": 2.265897274017334, + "learning_rate": 9.043739278976749e-07, + "loss": 0.8845, + "step": 18622 + }, + { + "epoch": 0.906824434543374, + "grad_norm": 2.3961732387542725, + "learning_rate": 9.034363433447724e-07, + "loss": 0.7032, + "step": 18623 + }, + { + "epoch": 0.9068731283324812, + "grad_norm": 1.3547214269638062, + "learning_rate": 9.024992338206351e-07, + "loss": 0.7952, + "step": 18624 + }, + { + "epoch": 0.9069218221215883, + "grad_norm": 1.7703630924224854, + "learning_rate": 9.015625993485688e-07, + "loss": 0.6902, + "step": 18625 + }, + { + "epoch": 0.9069705159106956, + "grad_norm": 1.7259584665298462, + "learning_rate": 9.00626439951875e-07, + "loss": 0.7857, + "step": 18626 + }, + { + "epoch": 0.9070192096998028, + "grad_norm": 1.6919879913330078, + "learning_rate": 8.996907556538392e-07, + "loss": 0.7702, + "step": 18627 + }, + { + "epoch": 0.90706790348891, + "grad_norm": 1.7545902729034424, + "learning_rate": 8.987555464777364e-07, + "loss": 0.8696, + "step": 18628 + }, + { + "epoch": 0.9071165972780172, + "grad_norm": 1.3421783447265625, + "learning_rate": 8.978208124468346e-07, + "loss": 0.7411, + "step": 18629 + }, + { + "epoch": 0.9071652910671244, + "grad_norm": 1.1793246269226074, + "learning_rate": 8.968865535843774e-07, + "loss": 0.8604, + "step": 18630 + }, + { + "epoch": 0.9072139848562316, + "grad_norm": 1.7458739280700684, + "learning_rate": 8.95952769913615e-07, + "loss": 0.815, + "step": 18631 + }, + { + "epoch": 0.9072626786453388, + "grad_norm": 1.693932294845581, + "learning_rate": 8.950194614577623e-07, + "loss": 0.7316, + "step": 18632 + }, + { + "epoch": 0.907311372434446, + "grad_norm": 1.30810546875, + "learning_rate": 8.940866282400451e-07, + "loss": 0.7828, + "step": 18633 + }, + { + "epoch": 0.9073600662235531, + "grad_norm": 1.3169808387756348, + "learning_rate": 8.931542702836671e-07, + "loss": 0.7826, + "step": 18634 + }, + { + "epoch": 0.9074087600126604, + "grad_norm": 1.2719967365264893, + "learning_rate": 8.922223876118164e-07, + "loss": 0.8044, + "step": 18635 + }, + { + "epoch": 0.9074574538017676, + "grad_norm": 0.09946776926517487, + "learning_rate": 8.912909802476766e-07, + "loss": 0.604, + "step": 18636 + }, + { + "epoch": 0.9075061475908748, + "grad_norm": 2.428192615509033, + "learning_rate": 8.903600482144137e-07, + "loss": 0.764, + "step": 18637 + }, + { + "epoch": 0.907554841379982, + "grad_norm": 1.5728439092636108, + "learning_rate": 8.894295915351936e-07, + "loss": 0.7548, + "step": 18638 + }, + { + "epoch": 0.9076035351690892, + "grad_norm": 1.4796662330627441, + "learning_rate": 8.884996102331467e-07, + "loss": 0.8401, + "step": 18639 + }, + { + "epoch": 0.9076522289581964, + "grad_norm": 1.6491113901138306, + "learning_rate": 8.875701043314211e-07, + "loss": 0.7974, + "step": 18640 + }, + { + "epoch": 0.9077009227473036, + "grad_norm": 2.78056263923645, + "learning_rate": 8.866410738531273e-07, + "loss": 0.8354, + "step": 18641 + }, + { + "epoch": 0.9077496165364107, + "grad_norm": 1.2204092741012573, + "learning_rate": 8.857125188213822e-07, + "loss": 0.8194, + "step": 18642 + }, + { + "epoch": 0.9077983103255179, + "grad_norm": 1.7396240234375, + "learning_rate": 8.847844392592786e-07, + "loss": 0.7944, + "step": 18643 + }, + { + "epoch": 0.9078470041146252, + "grad_norm": 1.4234166145324707, + "learning_rate": 8.838568351899068e-07, + "loss": 0.7468, + "step": 18644 + }, + { + "epoch": 0.9078956979037324, + "grad_norm": 2.040588855743408, + "learning_rate": 8.829297066363396e-07, + "loss": 0.8807, + "step": 18645 + }, + { + "epoch": 0.9079443916928396, + "grad_norm": 1.5851126909255981, + "learning_rate": 8.820030536216384e-07, + "loss": 0.8577, + "step": 18646 + }, + { + "epoch": 0.9079930854819468, + "grad_norm": 1.6143568754196167, + "learning_rate": 8.810768761688537e-07, + "loss": 0.8882, + "step": 18647 + }, + { + "epoch": 0.908041779271054, + "grad_norm": 1.889597773551941, + "learning_rate": 8.801511743010227e-07, + "loss": 0.8234, + "step": 18648 + }, + { + "epoch": 0.9080904730601612, + "grad_norm": 1.4254094362258911, + "learning_rate": 8.79225948041178e-07, + "loss": 0.8175, + "step": 18649 + }, + { + "epoch": 0.9081391668492684, + "grad_norm": 1.736722707748413, + "learning_rate": 8.783011974123302e-07, + "loss": 0.8542, + "step": 18650 + }, + { + "epoch": 0.9081878606383755, + "grad_norm": 1.7921321392059326, + "learning_rate": 8.773769224374851e-07, + "loss": 0.8551, + "step": 18651 + }, + { + "epoch": 0.9082365544274827, + "grad_norm": 1.4277971982955933, + "learning_rate": 8.764531231396311e-07, + "loss": 0.7646, + "step": 18652 + }, + { + "epoch": 0.90828524821659, + "grad_norm": 1.9280345439910889, + "learning_rate": 8.755297995417478e-07, + "loss": 0.8264, + "step": 18653 + }, + { + "epoch": 0.9083339420056972, + "grad_norm": 2.0850722789764404, + "learning_rate": 8.746069516668076e-07, + "loss": 0.8407, + "step": 18654 + }, + { + "epoch": 0.9083826357948044, + "grad_norm": 1.4880911111831665, + "learning_rate": 8.736845795377613e-07, + "loss": 0.7788, + "step": 18655 + }, + { + "epoch": 0.9084313295839116, + "grad_norm": 1.5711854696273804, + "learning_rate": 8.727626831775526e-07, + "loss": 0.8372, + "step": 18656 + }, + { + "epoch": 0.9084800233730188, + "grad_norm": 1.5006603002548218, + "learning_rate": 8.718412626091188e-07, + "loss": 0.7521, + "step": 18657 + }, + { + "epoch": 0.908528717162126, + "grad_norm": 1.8499245643615723, + "learning_rate": 8.709203178553793e-07, + "loss": 0.8834, + "step": 18658 + }, + { + "epoch": 0.9085774109512331, + "grad_norm": 1.5502374172210693, + "learning_rate": 8.699998489392381e-07, + "loss": 0.8512, + "step": 18659 + }, + { + "epoch": 0.9086261047403403, + "grad_norm": 1.3401650190353394, + "learning_rate": 8.690798558835967e-07, + "loss": 0.8356, + "step": 18660 + }, + { + "epoch": 0.9086747985294475, + "grad_norm": 1.7399119138717651, + "learning_rate": 8.681603387113369e-07, + "loss": 0.7835, + "step": 18661 + }, + { + "epoch": 0.9087234923185548, + "grad_norm": 1.9600869417190552, + "learning_rate": 8.672412974453337e-07, + "loss": 0.8858, + "step": 18662 + }, + { + "epoch": 0.908772186107662, + "grad_norm": 1.4635553359985352, + "learning_rate": 8.663227321084466e-07, + "loss": 0.7371, + "step": 18663 + }, + { + "epoch": 0.9088208798967692, + "grad_norm": 1.4661054611206055, + "learning_rate": 8.654046427235286e-07, + "loss": 0.6855, + "step": 18664 + }, + { + "epoch": 0.9088695736858764, + "grad_norm": 1.8874719142913818, + "learning_rate": 8.644870293134123e-07, + "loss": 0.8151, + "step": 18665 + }, + { + "epoch": 0.9089182674749836, + "grad_norm": 1.7290159463882446, + "learning_rate": 8.635698919009261e-07, + "loss": 0.7974, + "step": 18666 + }, + { + "epoch": 0.9089669612640907, + "grad_norm": 7.920434951782227, + "learning_rate": 8.626532305088853e-07, + "loss": 0.8248, + "step": 18667 + }, + { + "epoch": 0.9090156550531979, + "grad_norm": 1.1616930961608887, + "learning_rate": 8.617370451600893e-07, + "loss": 0.6776, + "step": 18668 + }, + { + "epoch": 0.9090643488423051, + "grad_norm": 1.7343999147415161, + "learning_rate": 8.608213358773287e-07, + "loss": 0.8377, + "step": 18669 + }, + { + "epoch": 0.9091130426314123, + "grad_norm": 1.6633027791976929, + "learning_rate": 8.599061026833832e-07, + "loss": 0.8723, + "step": 18670 + }, + { + "epoch": 0.9091617364205196, + "grad_norm": 1.259464144706726, + "learning_rate": 8.58991345601019e-07, + "loss": 0.8285, + "step": 18671 + }, + { + "epoch": 0.9092104302096268, + "grad_norm": 1.7373528480529785, + "learning_rate": 8.580770646529868e-07, + "loss": 0.7516, + "step": 18672 + }, + { + "epoch": 0.909259123998734, + "grad_norm": 1.2424005270004272, + "learning_rate": 8.571632598620395e-07, + "loss": 0.6788, + "step": 18673 + }, + { + "epoch": 0.9093078177878412, + "grad_norm": 2.0000884532928467, + "learning_rate": 8.562499312508943e-07, + "loss": 0.8831, + "step": 18674 + }, + { + "epoch": 0.9093565115769484, + "grad_norm": 1.4192700386047363, + "learning_rate": 8.553370788422843e-07, + "loss": 0.8555, + "step": 18675 + }, + { + "epoch": 0.9094052053660555, + "grad_norm": 1.6549711227416992, + "learning_rate": 8.544247026589048e-07, + "loss": 0.7419, + "step": 18676 + }, + { + "epoch": 0.9094538991551627, + "grad_norm": 2.5627694129943848, + "learning_rate": 8.535128027234574e-07, + "loss": 0.7888, + "step": 18677 + }, + { + "epoch": 0.9095025929442699, + "grad_norm": 1.8037793636322021, + "learning_rate": 8.526013790586262e-07, + "loss": 0.8576, + "step": 18678 + }, + { + "epoch": 0.9095512867333772, + "grad_norm": 1.4674406051635742, + "learning_rate": 8.5169043168708e-07, + "loss": 0.7918, + "step": 18679 + }, + { + "epoch": 0.9095999805224844, + "grad_norm": 1.3680822849273682, + "learning_rate": 8.507799606314826e-07, + "loss": 0.8524, + "step": 18680 + }, + { + "epoch": 0.9096486743115916, + "grad_norm": 1.554783821105957, + "learning_rate": 8.49869965914476e-07, + "loss": 0.7752, + "step": 18681 + }, + { + "epoch": 0.9096973681006988, + "grad_norm": 1.7099099159240723, + "learning_rate": 8.489604475587043e-07, + "loss": 0.7678, + "step": 18682 + }, + { + "epoch": 0.909746061889806, + "grad_norm": 1.7903227806091309, + "learning_rate": 8.480514055867828e-07, + "loss": 0.7736, + "step": 18683 + }, + { + "epoch": 0.9097947556789131, + "grad_norm": 1.4883774518966675, + "learning_rate": 8.471428400213332e-07, + "loss": 0.7445, + "step": 18684 + }, + { + "epoch": 0.9098434494680203, + "grad_norm": 2.2568821907043457, + "learning_rate": 8.462347508849489e-07, + "loss": 0.8282, + "step": 18685 + }, + { + "epoch": 0.9098921432571275, + "grad_norm": 2.620553731918335, + "learning_rate": 8.453271382002204e-07, + "loss": 0.856, + "step": 18686 + }, + { + "epoch": 0.9099408370462347, + "grad_norm": 1.1700350046157837, + "learning_rate": 8.44420001989732e-07, + "loss": 0.7536, + "step": 18687 + }, + { + "epoch": 0.909989530835342, + "grad_norm": 2.03644061088562, + "learning_rate": 8.435133422760367e-07, + "loss": 0.768, + "step": 18688 + }, + { + "epoch": 0.9100382246244492, + "grad_norm": 1.278659462928772, + "learning_rate": 8.426071590816965e-07, + "loss": 0.8314, + "step": 18689 + }, + { + "epoch": 0.9100869184135564, + "grad_norm": 1.4392112493515015, + "learning_rate": 8.417014524292467e-07, + "loss": 0.7819, + "step": 18690 + }, + { + "epoch": 0.9101356122026636, + "grad_norm": 0.09932246804237366, + "learning_rate": 8.407962223412225e-07, + "loss": 0.5824, + "step": 18691 + }, + { + "epoch": 0.9101843059917708, + "grad_norm": 1.5426441431045532, + "learning_rate": 8.398914688401394e-07, + "loss": 0.7549, + "step": 18692 + }, + { + "epoch": 0.9102329997808779, + "grad_norm": 2.014039993286133, + "learning_rate": 8.389871919485037e-07, + "loss": 0.8213, + "step": 18693 + }, + { + "epoch": 0.9102816935699851, + "grad_norm": 1.4762061834335327, + "learning_rate": 8.380833916888065e-07, + "loss": 0.7597, + "step": 18694 + }, + { + "epoch": 0.9103303873590923, + "grad_norm": 1.325058102607727, + "learning_rate": 8.37180068083534e-07, + "loss": 0.7577, + "step": 18695 + }, + { + "epoch": 0.9103790811481995, + "grad_norm": 1.5442582368850708, + "learning_rate": 8.362772211551528e-07, + "loss": 0.7685, + "step": 18696 + }, + { + "epoch": 0.9104277749373068, + "grad_norm": 1.6154848337173462, + "learning_rate": 8.353748509261228e-07, + "loss": 0.8877, + "step": 18697 + }, + { + "epoch": 0.910476468726414, + "grad_norm": 1.6507502794265747, + "learning_rate": 8.344729574188948e-07, + "loss": 0.7783, + "step": 18698 + }, + { + "epoch": 0.9105251625155212, + "grad_norm": 1.553615927696228, + "learning_rate": 8.335715406558931e-07, + "loss": 0.7262, + "step": 18699 + }, + { + "epoch": 0.9105738563046284, + "grad_norm": 1.3900370597839355, + "learning_rate": 8.326706006595508e-07, + "loss": 0.8007, + "step": 18700 + }, + { + "epoch": 0.9106225500937355, + "grad_norm": 1.4283174276351929, + "learning_rate": 8.317701374522724e-07, + "loss": 0.7656, + "step": 18701 + }, + { + "epoch": 0.9106712438828427, + "grad_norm": 2.135072946548462, + "learning_rate": 8.30870151056462e-07, + "loss": 0.8351, + "step": 18702 + }, + { + "epoch": 0.9107199376719499, + "grad_norm": 2.0799858570098877, + "learning_rate": 8.299706414945041e-07, + "loss": 0.7622, + "step": 18703 + }, + { + "epoch": 0.9107686314610571, + "grad_norm": 1.6618939638137817, + "learning_rate": 8.29071608788774e-07, + "loss": 0.9114, + "step": 18704 + }, + { + "epoch": 0.9108173252501643, + "grad_norm": 1.8593205213546753, + "learning_rate": 8.281730529616361e-07, + "loss": 0.7647, + "step": 18705 + }, + { + "epoch": 0.9108660190392716, + "grad_norm": 3.400233268737793, + "learning_rate": 8.272749740354391e-07, + "loss": 0.8742, + "step": 18706 + }, + { + "epoch": 0.9109147128283788, + "grad_norm": 1.8172632455825806, + "learning_rate": 8.263773720325275e-07, + "loss": 0.783, + "step": 18707 + }, + { + "epoch": 0.910963406617486, + "grad_norm": 1.484621286392212, + "learning_rate": 8.254802469752255e-07, + "loss": 0.8149, + "step": 18708 + }, + { + "epoch": 0.9110121004065931, + "grad_norm": 1.3519973754882812, + "learning_rate": 8.24583598885853e-07, + "loss": 0.7584, + "step": 18709 + }, + { + "epoch": 0.9110607941957003, + "grad_norm": 1.3866734504699707, + "learning_rate": 8.236874277867102e-07, + "loss": 0.7849, + "step": 18710 + }, + { + "epoch": 0.9111094879848075, + "grad_norm": 1.9983675479888916, + "learning_rate": 8.227917337000902e-07, + "loss": 0.8623, + "step": 18711 + }, + { + "epoch": 0.9111581817739147, + "grad_norm": 1.3055775165557861, + "learning_rate": 8.218965166482773e-07, + "loss": 0.8259, + "step": 18712 + }, + { + "epoch": 0.9112068755630219, + "grad_norm": 1.3035855293273926, + "learning_rate": 8.210017766535338e-07, + "loss": 0.8567, + "step": 18713 + }, + { + "epoch": 0.9112555693521291, + "grad_norm": 2.6577095985412598, + "learning_rate": 8.201075137381198e-07, + "loss": 0.838, + "step": 18714 + }, + { + "epoch": 0.9113042631412364, + "grad_norm": 3.227055788040161, + "learning_rate": 8.19213727924284e-07, + "loss": 0.7635, + "step": 18715 + }, + { + "epoch": 0.9113529569303436, + "grad_norm": 1.3321367502212524, + "learning_rate": 8.183204192342509e-07, + "loss": 0.6952, + "step": 18716 + }, + { + "epoch": 0.9114016507194508, + "grad_norm": 1.2672897577285767, + "learning_rate": 8.174275876902493e-07, + "loss": 0.8328, + "step": 18717 + }, + { + "epoch": 0.9114503445085579, + "grad_norm": 1.8351895809173584, + "learning_rate": 8.165352333144838e-07, + "loss": 0.751, + "step": 18718 + }, + { + "epoch": 0.9114990382976651, + "grad_norm": 1.738842487335205, + "learning_rate": 8.156433561291543e-07, + "loss": 0.7774, + "step": 18719 + }, + { + "epoch": 0.9115477320867723, + "grad_norm": 1.3964308500289917, + "learning_rate": 8.147519561564454e-07, + "loss": 0.7097, + "step": 18720 + }, + { + "epoch": 0.9115964258758795, + "grad_norm": 1.231505036354065, + "learning_rate": 8.138610334185326e-07, + "loss": 0.8222, + "step": 18721 + }, + { + "epoch": 0.9116451196649867, + "grad_norm": 6.365245342254639, + "learning_rate": 8.129705879375738e-07, + "loss": 0.8173, + "step": 18722 + }, + { + "epoch": 0.911693813454094, + "grad_norm": 2.0745935440063477, + "learning_rate": 8.1208061973572e-07, + "loss": 0.7344, + "step": 18723 + }, + { + "epoch": 0.9117425072432012, + "grad_norm": 1.7864148616790771, + "learning_rate": 8.111911288351138e-07, + "loss": 0.7386, + "step": 18724 + }, + { + "epoch": 0.9117912010323084, + "grad_norm": 1.5517659187316895, + "learning_rate": 8.103021152578728e-07, + "loss": 0.8071, + "step": 18725 + }, + { + "epoch": 0.9118398948214155, + "grad_norm": 1.9394797086715698, + "learning_rate": 8.094135790261193e-07, + "loss": 0.7271, + "step": 18726 + }, + { + "epoch": 0.9118885886105227, + "grad_norm": 0.10063312202692032, + "learning_rate": 8.085255201619535e-07, + "loss": 0.6243, + "step": 18727 + }, + { + "epoch": 0.9119372823996299, + "grad_norm": 1.3382679224014282, + "learning_rate": 8.076379386874644e-07, + "loss": 0.863, + "step": 18728 + }, + { + "epoch": 0.9119859761887371, + "grad_norm": 1.370289921760559, + "learning_rate": 8.067508346247321e-07, + "loss": 0.8016, + "step": 18729 + }, + { + "epoch": 0.9120346699778443, + "grad_norm": 1.3313404321670532, + "learning_rate": 8.05864207995819e-07, + "loss": 0.7014, + "step": 18730 + }, + { + "epoch": 0.9120833637669515, + "grad_norm": 1.889163851737976, + "learning_rate": 8.049780588227895e-07, + "loss": 0.7797, + "step": 18731 + }, + { + "epoch": 0.9121320575560588, + "grad_norm": 2.1973540782928467, + "learning_rate": 8.040923871276774e-07, + "loss": 0.8527, + "step": 18732 + }, + { + "epoch": 0.912180751345166, + "grad_norm": 0.0952657163143158, + "learning_rate": 8.032071929325202e-07, + "loss": 0.6321, + "step": 18733 + }, + { + "epoch": 0.9122294451342732, + "grad_norm": 1.6840513944625854, + "learning_rate": 8.023224762593318e-07, + "loss": 0.7548, + "step": 18734 + }, + { + "epoch": 0.9122781389233803, + "grad_norm": 1.6580241918563843, + "learning_rate": 8.014382371301211e-07, + "loss": 0.9182, + "step": 18735 + }, + { + "epoch": 0.9123268327124875, + "grad_norm": 2.5453076362609863, + "learning_rate": 8.005544755668882e-07, + "loss": 0.8055, + "step": 18736 + }, + { + "epoch": 0.9123755265015947, + "grad_norm": 1.5478593111038208, + "learning_rate": 7.996711915916089e-07, + "loss": 0.7892, + "step": 18737 + }, + { + "epoch": 0.9124242202907019, + "grad_norm": 1.2878246307373047, + "learning_rate": 7.987883852262635e-07, + "loss": 0.7207, + "step": 18738 + }, + { + "epoch": 0.9124729140798091, + "grad_norm": 2.0876731872558594, + "learning_rate": 7.979060564928054e-07, + "loss": 0.7849, + "step": 18739 + }, + { + "epoch": 0.9125216078689163, + "grad_norm": 1.5469638109207153, + "learning_rate": 7.970242054131883e-07, + "loss": 0.7803, + "step": 18740 + }, + { + "epoch": 0.9125703016580236, + "grad_norm": 1.6601057052612305, + "learning_rate": 7.96142832009339e-07, + "loss": 0.7958, + "step": 18741 + }, + { + "epoch": 0.9126189954471308, + "grad_norm": 1.5268279314041138, + "learning_rate": 7.95261936303191e-07, + "loss": 0.7778, + "step": 18742 + }, + { + "epoch": 0.9126676892362379, + "grad_norm": 2.0880842208862305, + "learning_rate": 7.943815183166537e-07, + "loss": 0.6644, + "step": 18743 + }, + { + "epoch": 0.9127163830253451, + "grad_norm": 0.13224393129348755, + "learning_rate": 7.935015780716271e-07, + "loss": 0.734, + "step": 18744 + }, + { + "epoch": 0.9127650768144523, + "grad_norm": 1.4651241302490234, + "learning_rate": 7.926221155900005e-07, + "loss": 0.8141, + "step": 18745 + }, + { + "epoch": 0.9128137706035595, + "grad_norm": 0.09528760612010956, + "learning_rate": 7.917431308936474e-07, + "loss": 0.6529, + "step": 18746 + }, + { + "epoch": 0.9128624643926667, + "grad_norm": 1.7364530563354492, + "learning_rate": 7.908646240044393e-07, + "loss": 0.7704, + "step": 18747 + }, + { + "epoch": 0.9129111581817739, + "grad_norm": 1.5886868238449097, + "learning_rate": 7.899865949442209e-07, + "loss": 0.8266, + "step": 18748 + }, + { + "epoch": 0.9129598519708811, + "grad_norm": 1.8105568885803223, + "learning_rate": 7.891090437348392e-07, + "loss": 0.8191, + "step": 18749 + }, + { + "epoch": 0.9130085457599884, + "grad_norm": 1.3403371572494507, + "learning_rate": 7.882319703981233e-07, + "loss": 0.8497, + "step": 18750 + }, + { + "epoch": 0.9130572395490956, + "grad_norm": 0.09348539263010025, + "learning_rate": 7.873553749558871e-07, + "loss": 0.612, + "step": 18751 + }, + { + "epoch": 0.9131059333382027, + "grad_norm": 1.493367075920105, + "learning_rate": 7.864792574299396e-07, + "loss": 0.8575, + "step": 18752 + }, + { + "epoch": 0.9131546271273099, + "grad_norm": 1.5892711877822876, + "learning_rate": 7.856036178420701e-07, + "loss": 0.9291, + "step": 18753 + }, + { + "epoch": 0.9132033209164171, + "grad_norm": 1.9772748947143555, + "learning_rate": 7.847284562140633e-07, + "loss": 0.861, + "step": 18754 + }, + { + "epoch": 0.9132520147055243, + "grad_norm": 1.4565629959106445, + "learning_rate": 7.838537725676865e-07, + "loss": 0.8478, + "step": 18755 + }, + { + "epoch": 0.9133007084946315, + "grad_norm": 1.891264796257019, + "learning_rate": 7.82979566924702e-07, + "loss": 0.8216, + "step": 18756 + }, + { + "epoch": 0.9133494022837387, + "grad_norm": 1.4703994989395142, + "learning_rate": 7.821058393068504e-07, + "loss": 0.8901, + "step": 18757 + }, + { + "epoch": 0.9133980960728459, + "grad_norm": 1.5513447523117065, + "learning_rate": 7.812325897358697e-07, + "loss": 0.7581, + "step": 18758 + }, + { + "epoch": 0.9134467898619532, + "grad_norm": 2.004956007003784, + "learning_rate": 7.803598182334804e-07, + "loss": 0.7822, + "step": 18759 + }, + { + "epoch": 0.9134954836510603, + "grad_norm": 1.2876523733139038, + "learning_rate": 7.79487524821394e-07, + "loss": 0.9027, + "step": 18760 + }, + { + "epoch": 0.9135441774401675, + "grad_norm": 1.7739503383636475, + "learning_rate": 7.786157095213065e-07, + "loss": 0.8275, + "step": 18761 + }, + { + "epoch": 0.9135928712292747, + "grad_norm": 0.10158459842205048, + "learning_rate": 7.777443723549072e-07, + "loss": 0.6692, + "step": 18762 + }, + { + "epoch": 0.9136415650183819, + "grad_norm": 0.08955439180135727, + "learning_rate": 7.768735133438676e-07, + "loss": 0.5383, + "step": 18763 + }, + { + "epoch": 0.9136902588074891, + "grad_norm": 1.2986539602279663, + "learning_rate": 7.760031325098527e-07, + "loss": 0.7654, + "step": 18764 + }, + { + "epoch": 0.9137389525965963, + "grad_norm": 1.389570713043213, + "learning_rate": 7.751332298745096e-07, + "loss": 0.7938, + "step": 18765 + }, + { + "epoch": 0.9137876463857035, + "grad_norm": 1.4514628648757935, + "learning_rate": 7.742638054594853e-07, + "loss": 0.7874, + "step": 18766 + }, + { + "epoch": 0.9138363401748107, + "grad_norm": 1.5628585815429688, + "learning_rate": 7.733948592863982e-07, + "loss": 0.8399, + "step": 18767 + }, + { + "epoch": 0.9138850339639178, + "grad_norm": 1.5371736288070679, + "learning_rate": 7.725263913768688e-07, + "loss": 0.8232, + "step": 18768 + }, + { + "epoch": 0.9139337277530251, + "grad_norm": 1.2163625955581665, + "learning_rate": 7.716584017524998e-07, + "loss": 0.7268, + "step": 18769 + }, + { + "epoch": 0.9139824215421323, + "grad_norm": 1.4737396240234375, + "learning_rate": 7.707908904348782e-07, + "loss": 0.827, + "step": 18770 + }, + { + "epoch": 0.9140311153312395, + "grad_norm": 1.6837555170059204, + "learning_rate": 7.699238574455892e-07, + "loss": 0.8974, + "step": 18771 + }, + { + "epoch": 0.9140798091203467, + "grad_norm": 1.4643936157226562, + "learning_rate": 7.690573028061931e-07, + "loss": 0.8406, + "step": 18772 + }, + { + "epoch": 0.9141285029094539, + "grad_norm": 1.5586239099502563, + "learning_rate": 7.681912265382574e-07, + "loss": 0.8091, + "step": 18773 + }, + { + "epoch": 0.9141771966985611, + "grad_norm": 1.7585957050323486, + "learning_rate": 7.673256286633112e-07, + "loss": 0.8059, + "step": 18774 + }, + { + "epoch": 0.9142258904876683, + "grad_norm": 1.5533214807510376, + "learning_rate": 7.664605092028976e-07, + "loss": 0.835, + "step": 18775 + }, + { + "epoch": 0.9142745842767755, + "grad_norm": 1.6362078189849854, + "learning_rate": 7.655958681785303e-07, + "loss": 0.7038, + "step": 18776 + }, + { + "epoch": 0.9143232780658826, + "grad_norm": 1.584349274635315, + "learning_rate": 7.647317056117209e-07, + "loss": 0.7854, + "step": 18777 + }, + { + "epoch": 0.9143719718549899, + "grad_norm": 0.09515434503555298, + "learning_rate": 7.638680215239657e-07, + "loss": 0.6351, + "step": 18778 + }, + { + "epoch": 0.9144206656440971, + "grad_norm": 1.550887942314148, + "learning_rate": 7.630048159367453e-07, + "loss": 0.7613, + "step": 18779 + }, + { + "epoch": 0.9144693594332043, + "grad_norm": 1.212558627128601, + "learning_rate": 7.621420888715337e-07, + "loss": 0.7708, + "step": 18780 + }, + { + "epoch": 0.9145180532223115, + "grad_norm": 1.4115439653396606, + "learning_rate": 7.612798403497912e-07, + "loss": 0.7879, + "step": 18781 + }, + { + "epoch": 0.9145667470114187, + "grad_norm": 2.210472822189331, + "learning_rate": 7.604180703929698e-07, + "loss": 0.7679, + "step": 18782 + }, + { + "epoch": 0.9146154408005259, + "grad_norm": 2.580307722091675, + "learning_rate": 7.595567790224989e-07, + "loss": 0.8844, + "step": 18783 + }, + { + "epoch": 0.9146641345896331, + "grad_norm": 1.1683682203292847, + "learning_rate": 7.586959662598125e-07, + "loss": 0.7374, + "step": 18784 + }, + { + "epoch": 0.9147128283787402, + "grad_norm": 1.6126596927642822, + "learning_rate": 7.578356321263136e-07, + "loss": 0.8899, + "step": 18785 + }, + { + "epoch": 0.9147615221678475, + "grad_norm": 1.8281289339065552, + "learning_rate": 7.569757766434094e-07, + "loss": 0.8475, + "step": 18786 + }, + { + "epoch": 0.9148102159569547, + "grad_norm": 1.8623257875442505, + "learning_rate": 7.561163998324872e-07, + "loss": 0.8734, + "step": 18787 + }, + { + "epoch": 0.9148589097460619, + "grad_norm": 0.09517095237970352, + "learning_rate": 7.552575017149233e-07, + "loss": 0.6099, + "step": 18788 + }, + { + "epoch": 0.9149076035351691, + "grad_norm": 1.3758798837661743, + "learning_rate": 7.543990823120873e-07, + "loss": 0.8531, + "step": 18789 + }, + { + "epoch": 0.9149562973242763, + "grad_norm": 2.3288419246673584, + "learning_rate": 7.535411416453242e-07, + "loss": 0.7981, + "step": 18790 + }, + { + "epoch": 0.9150049911133835, + "grad_norm": 1.1567630767822266, + "learning_rate": 7.526836797359837e-07, + "loss": 0.8113, + "step": 18791 + }, + { + "epoch": 0.9150536849024907, + "grad_norm": 1.4822897911071777, + "learning_rate": 7.518266966053889e-07, + "loss": 0.79, + "step": 18792 + }, + { + "epoch": 0.9151023786915979, + "grad_norm": 2.9133718013763428, + "learning_rate": 7.509701922748602e-07, + "loss": 0.7359, + "step": 18793 + }, + { + "epoch": 0.915151072480705, + "grad_norm": 1.7838702201843262, + "learning_rate": 7.501141667657052e-07, + "loss": 0.8853, + "step": 18794 + }, + { + "epoch": 0.9151997662698123, + "grad_norm": 1.3935530185699463, + "learning_rate": 7.492586200992114e-07, + "loss": 0.8353, + "step": 18795 + }, + { + "epoch": 0.9152484600589195, + "grad_norm": 1.9634848833084106, + "learning_rate": 7.484035522966704e-07, + "loss": 0.9081, + "step": 18796 + }, + { + "epoch": 0.9152971538480267, + "grad_norm": 2.113450765609741, + "learning_rate": 7.475489633793409e-07, + "loss": 0.7762, + "step": 18797 + }, + { + "epoch": 0.9153458476371339, + "grad_norm": 2.8526713848114014, + "learning_rate": 7.466948533684925e-07, + "loss": 0.8242, + "step": 18798 + }, + { + "epoch": 0.9153945414262411, + "grad_norm": 2.035062074661255, + "learning_rate": 7.458412222853595e-07, + "loss": 0.835, + "step": 18799 + }, + { + "epoch": 0.9154432352153483, + "grad_norm": 1.633963942527771, + "learning_rate": 7.449880701511825e-07, + "loss": 0.7417, + "step": 18800 + }, + { + "epoch": 0.9154919290044555, + "grad_norm": 0.09710004925727844, + "learning_rate": 7.441353969871845e-07, + "loss": 0.6631, + "step": 18801 + }, + { + "epoch": 0.9155406227935626, + "grad_norm": 2.042987823486328, + "learning_rate": 7.432832028145732e-07, + "loss": 0.7306, + "step": 18802 + }, + { + "epoch": 0.9155893165826698, + "grad_norm": 1.3057482242584229, + "learning_rate": 7.424314876545491e-07, + "loss": 0.7978, + "step": 18803 + }, + { + "epoch": 0.9156380103717771, + "grad_norm": 1.6214430332183838, + "learning_rate": 7.415802515282977e-07, + "loss": 0.7569, + "step": 18804 + }, + { + "epoch": 0.9156867041608843, + "grad_norm": 1.1377179622650146, + "learning_rate": 7.407294944569931e-07, + "loss": 0.7975, + "step": 18805 + }, + { + "epoch": 0.9157353979499915, + "grad_norm": 1.9509941339492798, + "learning_rate": 7.398792164617941e-07, + "loss": 0.7978, + "step": 18806 + }, + { + "epoch": 0.9157840917390987, + "grad_norm": 2.3848116397857666, + "learning_rate": 7.390294175638635e-07, + "loss": 0.7634, + "step": 18807 + }, + { + "epoch": 0.9158327855282059, + "grad_norm": 1.441745400428772, + "learning_rate": 7.381800977843245e-07, + "loss": 0.8112, + "step": 18808 + }, + { + "epoch": 0.9158814793173131, + "grad_norm": 1.2577887773513794, + "learning_rate": 7.373312571443159e-07, + "loss": 0.7682, + "step": 18809 + }, + { + "epoch": 0.9159301731064203, + "grad_norm": 1.747678518295288, + "learning_rate": 7.364828956649495e-07, + "loss": 0.8077, + "step": 18810 + }, + { + "epoch": 0.9159788668955274, + "grad_norm": 1.5194133520126343, + "learning_rate": 7.356350133673263e-07, + "loss": 0.8582, + "step": 18811 + }, + { + "epoch": 0.9160275606846346, + "grad_norm": 1.5343817472457886, + "learning_rate": 7.347876102725382e-07, + "loss": 0.7468, + "step": 18812 + }, + { + "epoch": 0.9160762544737419, + "grad_norm": 1.5262458324432373, + "learning_rate": 7.33940686401664e-07, + "loss": 0.7019, + "step": 18813 + }, + { + "epoch": 0.9161249482628491, + "grad_norm": 1.7462469339370728, + "learning_rate": 7.330942417757736e-07, + "loss": 0.8026, + "step": 18814 + }, + { + "epoch": 0.9161736420519563, + "grad_norm": 1.5953351259231567, + "learning_rate": 7.322482764159189e-07, + "loss": 0.8083, + "step": 18815 + }, + { + "epoch": 0.9162223358410635, + "grad_norm": 1.494933843612671, + "learning_rate": 7.314027903431453e-07, + "loss": 0.6836, + "step": 18816 + }, + { + "epoch": 0.9162710296301707, + "grad_norm": 1.474304437637329, + "learning_rate": 7.30557783578485e-07, + "loss": 0.7289, + "step": 18817 + }, + { + "epoch": 0.9163197234192779, + "grad_norm": 1.8719996213912964, + "learning_rate": 7.297132561429565e-07, + "loss": 0.8725, + "step": 18818 + }, + { + "epoch": 0.916368417208385, + "grad_norm": 1.9013272523880005, + "learning_rate": 7.288692080575699e-07, + "loss": 0.7841, + "step": 18819 + }, + { + "epoch": 0.9164171109974922, + "grad_norm": 1.2138441801071167, + "learning_rate": 7.280256393433172e-07, + "loss": 0.8643, + "step": 18820 + }, + { + "epoch": 0.9164658047865994, + "grad_norm": 1.5400182008743286, + "learning_rate": 7.271825500211816e-07, + "loss": 0.7225, + "step": 18821 + }, + { + "epoch": 0.9165144985757067, + "grad_norm": 1.4502710103988647, + "learning_rate": 7.263399401121396e-07, + "loss": 0.8073, + "step": 18822 + }, + { + "epoch": 0.9165631923648139, + "grad_norm": 1.8697811365127563, + "learning_rate": 7.254978096371479e-07, + "loss": 0.8574, + "step": 18823 + }, + { + "epoch": 0.9166118861539211, + "grad_norm": 2.0048604011535645, + "learning_rate": 7.246561586171585e-07, + "loss": 0.7827, + "step": 18824 + }, + { + "epoch": 0.9166605799430283, + "grad_norm": 1.3045525550842285, + "learning_rate": 7.238149870730993e-07, + "loss": 0.8015, + "step": 18825 + }, + { + "epoch": 0.9167092737321355, + "grad_norm": 2.024179220199585, + "learning_rate": 7.229742950259022e-07, + "loss": 0.8695, + "step": 18826 + }, + { + "epoch": 0.9167579675212426, + "grad_norm": 1.5666760206222534, + "learning_rate": 7.221340824964796e-07, + "loss": 0.8346, + "step": 18827 + }, + { + "epoch": 0.9168066613103498, + "grad_norm": 2.222722291946411, + "learning_rate": 7.212943495057278e-07, + "loss": 0.7961, + "step": 18828 + }, + { + "epoch": 0.916855355099457, + "grad_norm": 1.7835406064987183, + "learning_rate": 7.204550960745349e-07, + "loss": 0.8257, + "step": 18829 + }, + { + "epoch": 0.9169040488885642, + "grad_norm": 1.239530324935913, + "learning_rate": 7.196163222237817e-07, + "loss": 0.7636, + "step": 18830 + }, + { + "epoch": 0.9169527426776715, + "grad_norm": 2.754143476486206, + "learning_rate": 7.187780279743295e-07, + "loss": 0.7857, + "step": 18831 + }, + { + "epoch": 0.9170014364667787, + "grad_norm": 1.372220516204834, + "learning_rate": 7.179402133470304e-07, + "loss": 0.8055, + "step": 18832 + }, + { + "epoch": 0.9170501302558859, + "grad_norm": 1.614424705505371, + "learning_rate": 7.171028783627298e-07, + "loss": 0.8035, + "step": 18833 + }, + { + "epoch": 0.9170988240449931, + "grad_norm": 1.8624703884124756, + "learning_rate": 7.162660230422513e-07, + "loss": 0.7959, + "step": 18834 + }, + { + "epoch": 0.9171475178341003, + "grad_norm": 2.2182722091674805, + "learning_rate": 7.154296474064138e-07, + "loss": 0.8709, + "step": 18835 + }, + { + "epoch": 0.9171962116232074, + "grad_norm": 1.797247290611267, + "learning_rate": 7.145937514760226e-07, + "loss": 0.7499, + "step": 18836 + }, + { + "epoch": 0.9172449054123146, + "grad_norm": 1.4928916692733765, + "learning_rate": 7.137583352718703e-07, + "loss": 0.7752, + "step": 18837 + }, + { + "epoch": 0.9172935992014218, + "grad_norm": 1.9248408079147339, + "learning_rate": 7.129233988147378e-07, + "loss": 0.7566, + "step": 18838 + }, + { + "epoch": 0.917342292990529, + "grad_norm": 1.2168253660202026, + "learning_rate": 7.120889421253907e-07, + "loss": 0.7817, + "step": 18839 + }, + { + "epoch": 0.9173909867796363, + "grad_norm": 2.2807705402374268, + "learning_rate": 7.11254965224597e-07, + "loss": 0.8175, + "step": 18840 + }, + { + "epoch": 0.9174396805687435, + "grad_norm": 1.5153077840805054, + "learning_rate": 7.104214681330867e-07, + "loss": 0.7621, + "step": 18841 + }, + { + "epoch": 0.9174883743578507, + "grad_norm": 1.5860515832901, + "learning_rate": 7.095884508716078e-07, + "loss": 0.8286, + "step": 18842 + }, + { + "epoch": 0.9175370681469579, + "grad_norm": 1.5145632028579712, + "learning_rate": 7.087559134608701e-07, + "loss": 0.8934, + "step": 18843 + }, + { + "epoch": 0.917585761936065, + "grad_norm": 1.4616129398345947, + "learning_rate": 7.079238559215907e-07, + "loss": 0.7704, + "step": 18844 + }, + { + "epoch": 0.9176344557251722, + "grad_norm": 2.1701059341430664, + "learning_rate": 7.070922782744638e-07, + "loss": 0.7799, + "step": 18845 + }, + { + "epoch": 0.9176831495142794, + "grad_norm": 4.8835272789001465, + "learning_rate": 7.062611805401754e-07, + "loss": 0.7371, + "step": 18846 + }, + { + "epoch": 0.9177318433033866, + "grad_norm": 1.5757571458816528, + "learning_rate": 7.05430562739402e-07, + "loss": 0.8201, + "step": 18847 + }, + { + "epoch": 0.9177805370924939, + "grad_norm": 1.7771797180175781, + "learning_rate": 7.046004248927963e-07, + "loss": 0.8701, + "step": 18848 + }, + { + "epoch": 0.9178292308816011, + "grad_norm": 2.0539379119873047, + "learning_rate": 7.037707670210214e-07, + "loss": 0.7592, + "step": 18849 + }, + { + "epoch": 0.9178779246707083, + "grad_norm": 2.051891326904297, + "learning_rate": 7.029415891447034e-07, + "loss": 0.7912, + "step": 18850 + }, + { + "epoch": 0.9179266184598155, + "grad_norm": 1.801501750946045, + "learning_rate": 7.021128912844743e-07, + "loss": 0.7093, + "step": 18851 + }, + { + "epoch": 0.9179753122489227, + "grad_norm": 1.436713457107544, + "learning_rate": 7.012846734609469e-07, + "loss": 0.7106, + "step": 18852 + }, + { + "epoch": 0.9180240060380298, + "grad_norm": 1.8268964290618896, + "learning_rate": 7.004569356947222e-07, + "loss": 0.7964, + "step": 18853 + }, + { + "epoch": 0.918072699827137, + "grad_norm": 1.3868108987808228, + "learning_rate": 6.996296780063905e-07, + "loss": 0.9591, + "step": 18854 + }, + { + "epoch": 0.9181213936162442, + "grad_norm": 1.6165635585784912, + "learning_rate": 6.988029004165287e-07, + "loss": 0.8352, + "step": 18855 + }, + { + "epoch": 0.9181700874053514, + "grad_norm": 2.3659660816192627, + "learning_rate": 6.979766029457092e-07, + "loss": 0.7867, + "step": 18856 + }, + { + "epoch": 0.9182187811944587, + "grad_norm": 1.6366461515426636, + "learning_rate": 6.971507856144777e-07, + "loss": 0.9246, + "step": 18857 + }, + { + "epoch": 0.9182674749835659, + "grad_norm": 2.6274447441101074, + "learning_rate": 6.963254484433801e-07, + "loss": 0.8423, + "step": 18858 + }, + { + "epoch": 0.9183161687726731, + "grad_norm": 1.5134867429733276, + "learning_rate": 6.955005914529489e-07, + "loss": 0.8062, + "step": 18859 + }, + { + "epoch": 0.9183648625617803, + "grad_norm": 1.7758736610412598, + "learning_rate": 6.946762146637009e-07, + "loss": 0.8338, + "step": 18860 + }, + { + "epoch": 0.9184135563508874, + "grad_norm": 2.1797983646392822, + "learning_rate": 6.93852318096142e-07, + "loss": 0.8495, + "step": 18861 + }, + { + "epoch": 0.9184622501399946, + "grad_norm": 1.7504184246063232, + "learning_rate": 6.930289017707669e-07, + "loss": 0.7936, + "step": 18862 + }, + { + "epoch": 0.9185109439291018, + "grad_norm": 1.446405053138733, + "learning_rate": 6.922059657080594e-07, + "loss": 0.8292, + "step": 18863 + }, + { + "epoch": 0.918559637718209, + "grad_norm": 1.8234606981277466, + "learning_rate": 6.913835099284849e-07, + "loss": 0.8525, + "step": 18864 + }, + { + "epoch": 0.9186083315073162, + "grad_norm": 1.373347282409668, + "learning_rate": 6.905615344525118e-07, + "loss": 0.7701, + "step": 18865 + }, + { + "epoch": 0.9186570252964235, + "grad_norm": 1.6242626905441284, + "learning_rate": 6.897400393005748e-07, + "loss": 0.7683, + "step": 18866 + }, + { + "epoch": 0.9187057190855307, + "grad_norm": 1.6053162813186646, + "learning_rate": 6.889190244931177e-07, + "loss": 0.7303, + "step": 18867 + }, + { + "epoch": 0.9187544128746379, + "grad_norm": 1.5711320638656616, + "learning_rate": 6.880984900505616e-07, + "loss": 0.82, + "step": 18868 + }, + { + "epoch": 0.918803106663745, + "grad_norm": 2.015650510787964, + "learning_rate": 6.872784359933171e-07, + "loss": 0.8246, + "step": 18869 + }, + { + "epoch": 0.9188518004528522, + "grad_norm": 1.440003514289856, + "learning_rate": 6.864588623417812e-07, + "loss": 0.8535, + "step": 18870 + }, + { + "epoch": 0.9189004942419594, + "grad_norm": 0.09482764452695847, + "learning_rate": 6.856397691163419e-07, + "loss": 0.5798, + "step": 18871 + }, + { + "epoch": 0.9189491880310666, + "grad_norm": 2.2754688262939453, + "learning_rate": 6.848211563373741e-07, + "loss": 0.9042, + "step": 18872 + }, + { + "epoch": 0.9189978818201738, + "grad_norm": 3.362546682357788, + "learning_rate": 6.840030240252416e-07, + "loss": 0.8751, + "step": 18873 + }, + { + "epoch": 0.919046575609281, + "grad_norm": 1.6037222146987915, + "learning_rate": 6.831853722002923e-07, + "loss": 0.7784, + "step": 18874 + }, + { + "epoch": 0.9190952693983883, + "grad_norm": 1.260252833366394, + "learning_rate": 6.823682008828702e-07, + "loss": 0.8262, + "step": 18875 + }, + { + "epoch": 0.9191439631874955, + "grad_norm": 1.5554194450378418, + "learning_rate": 6.815515100933012e-07, + "loss": 0.7531, + "step": 18876 + }, + { + "epoch": 0.9191926569766027, + "grad_norm": 1.2486534118652344, + "learning_rate": 6.807352998519001e-07, + "loss": 0.8278, + "step": 18877 + }, + { + "epoch": 0.9192413507657098, + "grad_norm": 1.3644071817398071, + "learning_rate": 6.799195701789685e-07, + "loss": 0.7419, + "step": 18878 + }, + { + "epoch": 0.919290044554817, + "grad_norm": 1.7158931493759155, + "learning_rate": 6.791043210948012e-07, + "loss": 0.7368, + "step": 18879 + }, + { + "epoch": 0.9193387383439242, + "grad_norm": 2.6812901496887207, + "learning_rate": 6.782895526196731e-07, + "loss": 0.7789, + "step": 18880 + }, + { + "epoch": 0.9193874321330314, + "grad_norm": 1.592007040977478, + "learning_rate": 6.774752647738547e-07, + "loss": 0.9119, + "step": 18881 + }, + { + "epoch": 0.9194361259221386, + "grad_norm": 2.1718568801879883, + "learning_rate": 6.766614575776032e-07, + "loss": 0.7466, + "step": 18882 + }, + { + "epoch": 0.9194848197112458, + "grad_norm": 1.388196349143982, + "learning_rate": 6.758481310511556e-07, + "loss": 0.8258, + "step": 18883 + }, + { + "epoch": 0.9195335135003531, + "grad_norm": 1.4014302492141724, + "learning_rate": 6.75035285214749e-07, + "loss": 0.8568, + "step": 18884 + }, + { + "epoch": 0.9195822072894603, + "grad_norm": 1.5361438989639282, + "learning_rate": 6.74222920088603e-07, + "loss": 0.7735, + "step": 18885 + }, + { + "epoch": 0.9196309010785674, + "grad_norm": 1.4435079097747803, + "learning_rate": 6.734110356929235e-07, + "loss": 0.8063, + "step": 18886 + }, + { + "epoch": 0.9196795948676746, + "grad_norm": 1.3607829809188843, + "learning_rate": 6.725996320479078e-07, + "loss": 0.7975, + "step": 18887 + }, + { + "epoch": 0.9197282886567818, + "grad_norm": 1.7213490009307861, + "learning_rate": 6.717887091737373e-07, + "loss": 0.8474, + "step": 18888 + }, + { + "epoch": 0.919776982445889, + "grad_norm": 1.8106547594070435, + "learning_rate": 6.709782670905851e-07, + "loss": 0.8456, + "step": 18889 + }, + { + "epoch": 0.9198256762349962, + "grad_norm": 1.3525466918945312, + "learning_rate": 6.701683058186082e-07, + "loss": 0.8102, + "step": 18890 + }, + { + "epoch": 0.9198743700241034, + "grad_norm": 1.6443865299224854, + "learning_rate": 6.693588253779615e-07, + "loss": 0.9506, + "step": 18891 + }, + { + "epoch": 0.9199230638132107, + "grad_norm": 2.1067121028900146, + "learning_rate": 6.685498257887734e-07, + "loss": 0.8536, + "step": 18892 + }, + { + "epoch": 0.9199717576023179, + "grad_norm": 1.7827714681625366, + "learning_rate": 6.677413070711747e-07, + "loss": 0.8849, + "step": 18893 + }, + { + "epoch": 0.9200204513914251, + "grad_norm": 1.829897403717041, + "learning_rate": 6.66933269245269e-07, + "loss": 0.7934, + "step": 18894 + }, + { + "epoch": 0.9200691451805322, + "grad_norm": 1.3434897661209106, + "learning_rate": 6.661257123311626e-07, + "loss": 0.9526, + "step": 18895 + }, + { + "epoch": 0.9201178389696394, + "grad_norm": 1.8379747867584229, + "learning_rate": 6.653186363489416e-07, + "loss": 0.8253, + "step": 18896 + }, + { + "epoch": 0.9201665327587466, + "grad_norm": 1.815522313117981, + "learning_rate": 6.645120413186812e-07, + "loss": 0.8433, + "step": 18897 + }, + { + "epoch": 0.9202152265478538, + "grad_norm": 2.226550817489624, + "learning_rate": 6.637059272604518e-07, + "loss": 0.7232, + "step": 18898 + }, + { + "epoch": 0.920263920336961, + "grad_norm": 1.5519627332687378, + "learning_rate": 6.629002941942952e-07, + "loss": 0.9074, + "step": 18899 + }, + { + "epoch": 0.9203126141260682, + "grad_norm": 1.8719106912612915, + "learning_rate": 6.620951421402599e-07, + "loss": 0.8638, + "step": 18900 + }, + { + "epoch": 0.9203613079151755, + "grad_norm": 2.286088705062866, + "learning_rate": 6.612904711183699e-07, + "loss": 0.7278, + "step": 18901 + }, + { + "epoch": 0.9204100017042827, + "grad_norm": 1.3756698369979858, + "learning_rate": 6.604862811486423e-07, + "loss": 0.7904, + "step": 18902 + }, + { + "epoch": 0.9204586954933898, + "grad_norm": 1.3523284196853638, + "learning_rate": 6.596825722510813e-07, + "loss": 0.7614, + "step": 18903 + }, + { + "epoch": 0.920507389282497, + "grad_norm": 1.5943248271942139, + "learning_rate": 6.588793444456798e-07, + "loss": 0.7888, + "step": 18904 + }, + { + "epoch": 0.9205560830716042, + "grad_norm": 2.080289602279663, + "learning_rate": 6.580765977524217e-07, + "loss": 0.905, + "step": 18905 + }, + { + "epoch": 0.9206047768607114, + "grad_norm": 1.5857847929000854, + "learning_rate": 6.572743321912667e-07, + "loss": 0.8392, + "step": 18906 + }, + { + "epoch": 0.9206534706498186, + "grad_norm": 1.6790859699249268, + "learning_rate": 6.56472547782181e-07, + "loss": 0.8044, + "step": 18907 + }, + { + "epoch": 0.9207021644389258, + "grad_norm": 1.2081202268600464, + "learning_rate": 6.556712445450997e-07, + "loss": 0.7221, + "step": 18908 + }, + { + "epoch": 0.920750858228033, + "grad_norm": 1.2601897716522217, + "learning_rate": 6.548704224999625e-07, + "loss": 0.8543, + "step": 18909 + }, + { + "epoch": 0.9207995520171403, + "grad_norm": 1.4584827423095703, + "learning_rate": 6.540700816666867e-07, + "loss": 0.8796, + "step": 18910 + }, + { + "epoch": 0.9208482458062475, + "grad_norm": 2.3905208110809326, + "learning_rate": 6.532702220651832e-07, + "loss": 0.7417, + "step": 18911 + }, + { + "epoch": 0.9208969395953546, + "grad_norm": 1.4370710849761963, + "learning_rate": 6.524708437153448e-07, + "loss": 0.6833, + "step": 18912 + }, + { + "epoch": 0.9209456333844618, + "grad_norm": 1.89602530002594, + "learning_rate": 6.516719466370603e-07, + "loss": 0.6426, + "step": 18913 + }, + { + "epoch": 0.920994327173569, + "grad_norm": 2.467705726623535, + "learning_rate": 6.508735308501979e-07, + "loss": 0.8627, + "step": 18914 + }, + { + "epoch": 0.9210430209626762, + "grad_norm": 1.5528990030288696, + "learning_rate": 6.500755963746197e-07, + "loss": 0.7017, + "step": 18915 + }, + { + "epoch": 0.9210917147517834, + "grad_norm": 1.665027379989624, + "learning_rate": 6.492781432301809e-07, + "loss": 0.8369, + "step": 18916 + }, + { + "epoch": 0.9211404085408906, + "grad_norm": 0.09630202502012253, + "learning_rate": 6.484811714367056e-07, + "loss": 0.6891, + "step": 18917 + }, + { + "epoch": 0.9211891023299978, + "grad_norm": 1.9655756950378418, + "learning_rate": 6.476846810140292e-07, + "loss": 0.8416, + "step": 18918 + }, + { + "epoch": 0.9212377961191051, + "grad_norm": 3.6649484634399414, + "learning_rate": 6.4688867198196e-07, + "loss": 0.8517, + "step": 18919 + }, + { + "epoch": 0.9212864899082122, + "grad_norm": 1.353318452835083, + "learning_rate": 6.460931443603002e-07, + "loss": 0.8488, + "step": 18920 + }, + { + "epoch": 0.9213351836973194, + "grad_norm": 1.7415417432785034, + "learning_rate": 6.452980981688384e-07, + "loss": 0.816, + "step": 18921 + }, + { + "epoch": 0.9213838774864266, + "grad_norm": 1.4873583316802979, + "learning_rate": 6.44503533427352e-07, + "loss": 0.7221, + "step": 18922 + }, + { + "epoch": 0.9214325712755338, + "grad_norm": 1.5047072172164917, + "learning_rate": 6.43709450155603e-07, + "loss": 0.8093, + "step": 18923 + }, + { + "epoch": 0.921481265064641, + "grad_norm": 1.3210551738739014, + "learning_rate": 6.429158483733466e-07, + "loss": 0.8748, + "step": 18924 + }, + { + "epoch": 0.9215299588537482, + "grad_norm": 1.6960992813110352, + "learning_rate": 6.421227281003228e-07, + "loss": 0.7124, + "step": 18925 + }, + { + "epoch": 0.9215786526428554, + "grad_norm": 1.3276593685150146, + "learning_rate": 6.413300893562624e-07, + "loss": 0.8226, + "step": 18926 + }, + { + "epoch": 0.9216273464319626, + "grad_norm": 1.4421350955963135, + "learning_rate": 6.405379321608807e-07, + "loss": 0.8455, + "step": 18927 + }, + { + "epoch": 0.9216760402210697, + "grad_norm": 1.6624372005462646, + "learning_rate": 6.397462565338841e-07, + "loss": 0.8325, + "step": 18928 + }, + { + "epoch": 0.921724734010177, + "grad_norm": 1.7114930152893066, + "learning_rate": 6.389550624949637e-07, + "loss": 0.8357, + "step": 18929 + }, + { + "epoch": 0.9217734277992842, + "grad_norm": 1.5046859979629517, + "learning_rate": 6.381643500638013e-07, + "loss": 0.8521, + "step": 18930 + }, + { + "epoch": 0.9218221215883914, + "grad_norm": 1.7806901931762695, + "learning_rate": 6.373741192600658e-07, + "loss": 0.867, + "step": 18931 + }, + { + "epoch": 0.9218708153774986, + "grad_norm": 1.5423977375030518, + "learning_rate": 6.365843701034125e-07, + "loss": 0.8331, + "step": 18932 + }, + { + "epoch": 0.9219195091666058, + "grad_norm": 1.9010300636291504, + "learning_rate": 6.357951026134901e-07, + "loss": 0.8686, + "step": 18933 + }, + { + "epoch": 0.921968202955713, + "grad_norm": 2.0927529335021973, + "learning_rate": 6.350063168099318e-07, + "loss": 0.8035, + "step": 18934 + }, + { + "epoch": 0.9220168967448202, + "grad_norm": 1.6678555011749268, + "learning_rate": 6.342180127123554e-07, + "loss": 0.9149, + "step": 18935 + }, + { + "epoch": 0.9220655905339274, + "grad_norm": 1.5374605655670166, + "learning_rate": 6.334301903403739e-07, + "loss": 0.7551, + "step": 18936 + }, + { + "epoch": 0.9221142843230345, + "grad_norm": 0.09693726152181625, + "learning_rate": 6.326428497135806e-07, + "loss": 0.6079, + "step": 18937 + }, + { + "epoch": 0.9221629781121418, + "grad_norm": 1.398549199104309, + "learning_rate": 6.31855990851562e-07, + "loss": 0.7867, + "step": 18938 + }, + { + "epoch": 0.922211671901249, + "grad_norm": 1.5483503341674805, + "learning_rate": 6.310696137738914e-07, + "loss": 0.8139, + "step": 18939 + }, + { + "epoch": 0.9222603656903562, + "grad_norm": 1.4535154104232788, + "learning_rate": 6.30283718500131e-07, + "loss": 0.7282, + "step": 18940 + }, + { + "epoch": 0.9223090594794634, + "grad_norm": 1.5247596502304077, + "learning_rate": 6.294983050498249e-07, + "loss": 0.772, + "step": 18941 + }, + { + "epoch": 0.9223577532685706, + "grad_norm": 1.2957477569580078, + "learning_rate": 6.287133734425221e-07, + "loss": 0.8321, + "step": 18942 + }, + { + "epoch": 0.9224064470576778, + "grad_norm": 1.9004400968551636, + "learning_rate": 6.279289236977337e-07, + "loss": 0.7642, + "step": 18943 + }, + { + "epoch": 0.922455140846785, + "grad_norm": 1.796096682548523, + "learning_rate": 6.271449558349818e-07, + "loss": 0.8364, + "step": 18944 + }, + { + "epoch": 0.9225038346358921, + "grad_norm": 1.6577675342559814, + "learning_rate": 6.263614698737663e-07, + "loss": 0.8437, + "step": 18945 + }, + { + "epoch": 0.9225525284249994, + "grad_norm": 1.6021991968154907, + "learning_rate": 6.255784658335761e-07, + "loss": 0.8109, + "step": 18946 + }, + { + "epoch": 0.9226012222141066, + "grad_norm": 1.339699387550354, + "learning_rate": 6.247959437338869e-07, + "loss": 0.8197, + "step": 18947 + }, + { + "epoch": 0.9226499160032138, + "grad_norm": 1.7561267614364624, + "learning_rate": 6.24013903594165e-07, + "loss": 0.7775, + "step": 18948 + }, + { + "epoch": 0.922698609792321, + "grad_norm": 1.899684190750122, + "learning_rate": 6.232323454338662e-07, + "loss": 0.7507, + "step": 18949 + }, + { + "epoch": 0.9227473035814282, + "grad_norm": 0.09641146659851074, + "learning_rate": 6.22451269272426e-07, + "loss": 0.6165, + "step": 18950 + }, + { + "epoch": 0.9227959973705354, + "grad_norm": 1.2264962196350098, + "learning_rate": 6.2167067512928e-07, + "loss": 0.7636, + "step": 18951 + }, + { + "epoch": 0.9228446911596426, + "grad_norm": 4.508517265319824, + "learning_rate": 6.208905630238416e-07, + "loss": 0.8025, + "step": 18952 + }, + { + "epoch": 0.9228933849487498, + "grad_norm": 1.5284504890441895, + "learning_rate": 6.201109329755173e-07, + "loss": 0.7975, + "step": 18953 + }, + { + "epoch": 0.9229420787378569, + "grad_norm": 1.3238736391067505, + "learning_rate": 6.193317850037006e-07, + "loss": 0.8168, + "step": 18954 + }, + { + "epoch": 0.9229907725269642, + "grad_norm": 2.2806458473205566, + "learning_rate": 6.185531191277716e-07, + "loss": 0.785, + "step": 18955 + }, + { + "epoch": 0.9230394663160714, + "grad_norm": 2.296795129776001, + "learning_rate": 6.177749353671036e-07, + "loss": 0.7642, + "step": 18956 + }, + { + "epoch": 0.9230881601051786, + "grad_norm": 1.467846155166626, + "learning_rate": 6.16997233741048e-07, + "loss": 0.7449, + "step": 18957 + }, + { + "epoch": 0.9231368538942858, + "grad_norm": 0.09204351156949997, + "learning_rate": 6.162200142689578e-07, + "loss": 0.5539, + "step": 18958 + }, + { + "epoch": 0.923185547683393, + "grad_norm": 1.3328745365142822, + "learning_rate": 6.154432769701579e-07, + "loss": 0.8842, + "step": 18959 + }, + { + "epoch": 0.9232342414725002, + "grad_norm": 1.663926362991333, + "learning_rate": 6.146670218639772e-07, + "loss": 0.7913, + "step": 18960 + }, + { + "epoch": 0.9232829352616074, + "grad_norm": 1.9718097448349, + "learning_rate": 6.138912489697201e-07, + "loss": 0.7953, + "step": 18961 + }, + { + "epoch": 0.9233316290507145, + "grad_norm": 1.3731739521026611, + "learning_rate": 6.131159583066848e-07, + "loss": 0.7851, + "step": 18962 + }, + { + "epoch": 0.9233803228398217, + "grad_norm": 1.2463024854660034, + "learning_rate": 6.123411498941601e-07, + "loss": 0.7622, + "step": 18963 + }, + { + "epoch": 0.923429016628929, + "grad_norm": 1.7239744663238525, + "learning_rate": 6.115668237514127e-07, + "loss": 0.893, + "step": 18964 + }, + { + "epoch": 0.9234777104180362, + "grad_norm": 1.615440845489502, + "learning_rate": 6.107929798977141e-07, + "loss": 0.8345, + "step": 18965 + }, + { + "epoch": 0.9235264042071434, + "grad_norm": 1.3146332502365112, + "learning_rate": 6.100196183523044e-07, + "loss": 0.7956, + "step": 18966 + }, + { + "epoch": 0.9235750979962506, + "grad_norm": 1.2424194812774658, + "learning_rate": 6.092467391344259e-07, + "loss": 0.8063, + "step": 18967 + }, + { + "epoch": 0.9236237917853578, + "grad_norm": 1.7183786630630493, + "learning_rate": 6.08474342263301e-07, + "loss": 0.8404, + "step": 18968 + }, + { + "epoch": 0.923672485574465, + "grad_norm": 1.5131800174713135, + "learning_rate": 6.077024277581478e-07, + "loss": 0.8712, + "step": 18969 + }, + { + "epoch": 0.9237211793635722, + "grad_norm": 1.6954137086868286, + "learning_rate": 6.06930995638162e-07, + "loss": 0.8714, + "step": 18970 + }, + { + "epoch": 0.9237698731526793, + "grad_norm": 1.7399866580963135, + "learning_rate": 6.061600459225392e-07, + "loss": 0.8089, + "step": 18971 + }, + { + "epoch": 0.9238185669417865, + "grad_norm": 1.9054900407791138, + "learning_rate": 6.053895786304509e-07, + "loss": 0.83, + "step": 18972 + }, + { + "epoch": 0.9238672607308938, + "grad_norm": 1.4261972904205322, + "learning_rate": 6.046195937810639e-07, + "loss": 0.7559, + "step": 18973 + }, + { + "epoch": 0.923915954520001, + "grad_norm": 1.7854782342910767, + "learning_rate": 6.038500913935363e-07, + "loss": 0.8341, + "step": 18974 + }, + { + "epoch": 0.9239646483091082, + "grad_norm": 1.512309193611145, + "learning_rate": 6.030810714870039e-07, + "loss": 0.804, + "step": 18975 + }, + { + "epoch": 0.9240133420982154, + "grad_norm": 2.457477569580078, + "learning_rate": 6.02312534080598e-07, + "loss": 0.7109, + "step": 18976 + }, + { + "epoch": 0.9240620358873226, + "grad_norm": 1.4060159921646118, + "learning_rate": 6.015444791934388e-07, + "loss": 0.8303, + "step": 18977 + }, + { + "epoch": 0.9241107296764298, + "grad_norm": 0.10528040677309036, + "learning_rate": 6.007769068446267e-07, + "loss": 0.6711, + "step": 18978 + }, + { + "epoch": 0.9241594234655369, + "grad_norm": 1.9641082286834717, + "learning_rate": 6.000098170532597e-07, + "loss": 0.7868, + "step": 18979 + }, + { + "epoch": 0.9242081172546441, + "grad_norm": 0.09155318886041641, + "learning_rate": 5.99243209838416e-07, + "loss": 0.5696, + "step": 18980 + }, + { + "epoch": 0.9242568110437513, + "grad_norm": 1.4979071617126465, + "learning_rate": 5.98477085219169e-07, + "loss": 0.8666, + "step": 18981 + }, + { + "epoch": 0.9243055048328586, + "grad_norm": 1.8539845943450928, + "learning_rate": 5.977114432145703e-07, + "loss": 0.8523, + "step": 18982 + }, + { + "epoch": 0.9243541986219658, + "grad_norm": 1.807068109512329, + "learning_rate": 5.969462838436691e-07, + "loss": 0.8086, + "step": 18983 + }, + { + "epoch": 0.924402892411073, + "grad_norm": 1.939275860786438, + "learning_rate": 5.961816071254989e-07, + "loss": 0.7902, + "step": 18984 + }, + { + "epoch": 0.9244515862001802, + "grad_norm": 1.8086508512496948, + "learning_rate": 5.954174130790802e-07, + "loss": 0.7901, + "step": 18985 + }, + { + "epoch": 0.9245002799892874, + "grad_norm": 2.186006546020508, + "learning_rate": 5.946537017234222e-07, + "loss": 0.764, + "step": 18986 + }, + { + "epoch": 0.9245489737783945, + "grad_norm": 2.2236063480377197, + "learning_rate": 5.93890473077523e-07, + "loss": 0.8943, + "step": 18987 + }, + { + "epoch": 0.9245976675675017, + "grad_norm": 2.2064719200134277, + "learning_rate": 5.931277271603697e-07, + "loss": 0.7586, + "step": 18988 + }, + { + "epoch": 0.9246463613566089, + "grad_norm": 0.0915125235915184, + "learning_rate": 5.923654639909316e-07, + "loss": 0.665, + "step": 18989 + }, + { + "epoch": 0.9246950551457161, + "grad_norm": 1.8421131372451782, + "learning_rate": 5.916036835881689e-07, + "loss": 0.7663, + "step": 18990 + }, + { + "epoch": 0.9247437489348234, + "grad_norm": 1.6215282678604126, + "learning_rate": 5.9084238597104e-07, + "loss": 0.8116, + "step": 18991 + }, + { + "epoch": 0.9247924427239306, + "grad_norm": 1.7364031076431274, + "learning_rate": 5.900815711584717e-07, + "loss": 0.8004, + "step": 18992 + }, + { + "epoch": 0.9248411365130378, + "grad_norm": 1.4685330390930176, + "learning_rate": 5.893212391693981e-07, + "loss": 0.7782, + "step": 18993 + }, + { + "epoch": 0.924889830302145, + "grad_norm": 1.3887181282043457, + "learning_rate": 5.885613900227261e-07, + "loss": 0.719, + "step": 18994 + }, + { + "epoch": 0.9249385240912522, + "grad_norm": 1.548978567123413, + "learning_rate": 5.878020237373605e-07, + "loss": 0.8214, + "step": 18995 + }, + { + "epoch": 0.9249872178803593, + "grad_norm": 1.6357851028442383, + "learning_rate": 5.870431403321886e-07, + "loss": 0.7904, + "step": 18996 + }, + { + "epoch": 0.9250359116694665, + "grad_norm": 1.978860855102539, + "learning_rate": 5.862847398260885e-07, + "loss": 0.7546, + "step": 18997 + }, + { + "epoch": 0.9250846054585737, + "grad_norm": 1.7631243467330933, + "learning_rate": 5.855268222379274e-07, + "loss": 0.7768, + "step": 18998 + }, + { + "epoch": 0.925133299247681, + "grad_norm": 1.423521637916565, + "learning_rate": 5.847693875865523e-07, + "loss": 0.842, + "step": 18999 + }, + { + "epoch": 0.9251819930367882, + "grad_norm": 1.3337117433547974, + "learning_rate": 5.840124358908151e-07, + "loss": 0.8039, + "step": 19000 + }, + { + "epoch": 0.9252306868258954, + "grad_norm": 1.438773274421692, + "learning_rate": 5.832559671695337e-07, + "loss": 0.7503, + "step": 19001 + }, + { + "epoch": 0.9252793806150026, + "grad_norm": 2.3920273780822754, + "learning_rate": 5.824999814415355e-07, + "loss": 0.7881, + "step": 19002 + }, + { + "epoch": 0.9253280744041098, + "grad_norm": 1.5863980054855347, + "learning_rate": 5.817444787256143e-07, + "loss": 0.7605, + "step": 19003 + }, + { + "epoch": 0.9253767681932169, + "grad_norm": 1.4141017198562622, + "learning_rate": 5.809894590405729e-07, + "loss": 0.6961, + "step": 19004 + }, + { + "epoch": 0.9254254619823241, + "grad_norm": 1.9399075508117676, + "learning_rate": 5.802349224051896e-07, + "loss": 0.7398, + "step": 19005 + }, + { + "epoch": 0.9254741557714313, + "grad_norm": 1.2819029092788696, + "learning_rate": 5.794808688382314e-07, + "loss": 0.8064, + "step": 19006 + }, + { + "epoch": 0.9255228495605385, + "grad_norm": 1.8490982055664062, + "learning_rate": 5.787272983584613e-07, + "loss": 0.7833, + "step": 19007 + }, + { + "epoch": 0.9255715433496458, + "grad_norm": 1.426418662071228, + "learning_rate": 5.779742109846176e-07, + "loss": 0.8343, + "step": 19008 + }, + { + "epoch": 0.925620237138753, + "grad_norm": 1.4664653539657593, + "learning_rate": 5.772216067354386e-07, + "loss": 0.898, + "step": 19009 + }, + { + "epoch": 0.9256689309278602, + "grad_norm": 1.5416334867477417, + "learning_rate": 5.764694856296382e-07, + "loss": 0.6441, + "step": 19010 + }, + { + "epoch": 0.9257176247169674, + "grad_norm": 1.8995187282562256, + "learning_rate": 5.757178476859327e-07, + "loss": 0.8466, + "step": 19011 + }, + { + "epoch": 0.9257663185060746, + "grad_norm": 5.216369152069092, + "learning_rate": 5.74966692923018e-07, + "loss": 0.8114, + "step": 19012 + }, + { + "epoch": 0.9258150122951817, + "grad_norm": 1.7817329168319702, + "learning_rate": 5.742160213595748e-07, + "loss": 0.7313, + "step": 19013 + }, + { + "epoch": 0.9258637060842889, + "grad_norm": 1.54389488697052, + "learning_rate": 5.73465833014284e-07, + "loss": 0.8674, + "step": 19014 + }, + { + "epoch": 0.9259123998733961, + "grad_norm": 2.2637693881988525, + "learning_rate": 5.727161279057991e-07, + "loss": 0.8385, + "step": 19015 + }, + { + "epoch": 0.9259610936625033, + "grad_norm": 1.2824904918670654, + "learning_rate": 5.719669060527744e-07, + "loss": 0.9235, + "step": 19016 + }, + { + "epoch": 0.9260097874516106, + "grad_norm": 2.41269850730896, + "learning_rate": 5.712181674738393e-07, + "loss": 0.78, + "step": 19017 + }, + { + "epoch": 0.9260584812407178, + "grad_norm": 1.5613734722137451, + "learning_rate": 5.704699121876256e-07, + "loss": 0.8194, + "step": 19018 + }, + { + "epoch": 0.926107175029825, + "grad_norm": 1.5727084875106812, + "learning_rate": 5.697221402127473e-07, + "loss": 0.7565, + "step": 19019 + }, + { + "epoch": 0.9261558688189322, + "grad_norm": 2.3947949409484863, + "learning_rate": 5.689748515678006e-07, + "loss": 0.8829, + "step": 19020 + }, + { + "epoch": 0.9262045626080393, + "grad_norm": 4.254486083984375, + "learning_rate": 5.682280462713752e-07, + "loss": 0.7499, + "step": 19021 + }, + { + "epoch": 0.9262532563971465, + "grad_norm": 2.0645370483398438, + "learning_rate": 5.674817243420494e-07, + "loss": 0.7541, + "step": 19022 + }, + { + "epoch": 0.9263019501862537, + "grad_norm": 1.5643274784088135, + "learning_rate": 5.667358857983884e-07, + "loss": 0.763, + "step": 19023 + }, + { + "epoch": 0.9263506439753609, + "grad_norm": 1.87839937210083, + "learning_rate": 5.659905306589398e-07, + "loss": 0.8183, + "step": 19024 + }, + { + "epoch": 0.9263993377644681, + "grad_norm": 1.258697748184204, + "learning_rate": 5.65245658942255e-07, + "loss": 0.7449, + "step": 19025 + }, + { + "epoch": 0.9264480315535754, + "grad_norm": 1.3451135158538818, + "learning_rate": 5.645012706668528e-07, + "loss": 0.7654, + "step": 19026 + }, + { + "epoch": 0.9264967253426826, + "grad_norm": 1.4923334121704102, + "learning_rate": 5.637573658512541e-07, + "loss": 0.7158, + "step": 19027 + }, + { + "epoch": 0.9265454191317898, + "grad_norm": 1.335249662399292, + "learning_rate": 5.630139445139637e-07, + "loss": 0.7917, + "step": 19028 + }, + { + "epoch": 0.926594112920897, + "grad_norm": 1.379948616027832, + "learning_rate": 5.622710066734737e-07, + "loss": 0.7425, + "step": 19029 + }, + { + "epoch": 0.9266428067100041, + "grad_norm": 1.5766949653625488, + "learning_rate": 5.615285523482627e-07, + "loss": 0.7903, + "step": 19030 + }, + { + "epoch": 0.9266915004991113, + "grad_norm": 1.6170475482940674, + "learning_rate": 5.607865815568048e-07, + "loss": 0.8803, + "step": 19031 + }, + { + "epoch": 0.9267401942882185, + "grad_norm": 2.387622594833374, + "learning_rate": 5.600450943175495e-07, + "loss": 0.7573, + "step": 19032 + }, + { + "epoch": 0.9267888880773257, + "grad_norm": 2.8019628524780273, + "learning_rate": 5.593040906489444e-07, + "loss": 0.926, + "step": 19033 + }, + { + "epoch": 0.926837581866433, + "grad_norm": 1.384696364402771, + "learning_rate": 5.585635705694259e-07, + "loss": 0.8704, + "step": 19034 + }, + { + "epoch": 0.9268862756555402, + "grad_norm": 1.7857410907745361, + "learning_rate": 5.578235340974103e-07, + "loss": 0.7307, + "step": 19035 + }, + { + "epoch": 0.9269349694446474, + "grad_norm": 1.3711187839508057, + "learning_rate": 5.570839812513073e-07, + "loss": 0.6814, + "step": 19036 + }, + { + "epoch": 0.9269836632337546, + "grad_norm": 1.6830720901489258, + "learning_rate": 5.563449120495112e-07, + "loss": 0.7326, + "step": 19037 + }, + { + "epoch": 0.9270323570228617, + "grad_norm": 2.8178162574768066, + "learning_rate": 5.556063265104095e-07, + "loss": 0.8201, + "step": 19038 + }, + { + "epoch": 0.9270810508119689, + "grad_norm": 1.9626363515853882, + "learning_rate": 5.54868224652374e-07, + "loss": 0.7953, + "step": 19039 + }, + { + "epoch": 0.9271297446010761, + "grad_norm": 1.3399312496185303, + "learning_rate": 5.541306064937635e-07, + "loss": 0.7837, + "step": 19040 + }, + { + "epoch": 0.9271784383901833, + "grad_norm": 1.5042792558670044, + "learning_rate": 5.533934720529255e-07, + "loss": 0.7588, + "step": 19041 + }, + { + "epoch": 0.9272271321792905, + "grad_norm": 1.365006446838379, + "learning_rate": 5.526568213481986e-07, + "loss": 0.8575, + "step": 19042 + }, + { + "epoch": 0.9272758259683977, + "grad_norm": 2.7702882289886475, + "learning_rate": 5.519206543979084e-07, + "loss": 0.7606, + "step": 19043 + }, + { + "epoch": 0.927324519757505, + "grad_norm": 1.7161989212036133, + "learning_rate": 5.511849712203643e-07, + "loss": 0.7764, + "step": 19044 + }, + { + "epoch": 0.9273732135466122, + "grad_norm": 1.9391533136367798, + "learning_rate": 5.504497718338675e-07, + "loss": 0.785, + "step": 19045 + }, + { + "epoch": 0.9274219073357193, + "grad_norm": 1.8179678916931152, + "learning_rate": 5.497150562567055e-07, + "loss": 0.8201, + "step": 19046 + }, + { + "epoch": 0.9274706011248265, + "grad_norm": 1.3658366203308105, + "learning_rate": 5.489808245071548e-07, + "loss": 0.8867, + "step": 19047 + }, + { + "epoch": 0.9275192949139337, + "grad_norm": 1.7597441673278809, + "learning_rate": 5.482470766034787e-07, + "loss": 0.9093, + "step": 19048 + }, + { + "epoch": 0.9275679887030409, + "grad_norm": 1.8017841577529907, + "learning_rate": 5.475138125639334e-07, + "loss": 0.7962, + "step": 19049 + }, + { + "epoch": 0.9276166824921481, + "grad_norm": 1.3169325590133667, + "learning_rate": 5.467810324067513e-07, + "loss": 0.8164, + "step": 19050 + }, + { + "epoch": 0.9276653762812553, + "grad_norm": 1.4863189458847046, + "learning_rate": 5.460487361501687e-07, + "loss": 0.7449, + "step": 19051 + }, + { + "epoch": 0.9277140700703626, + "grad_norm": 1.7926177978515625, + "learning_rate": 5.453169238123934e-07, + "loss": 0.7799, + "step": 19052 + }, + { + "epoch": 0.9277627638594698, + "grad_norm": 1.8047597408294678, + "learning_rate": 5.445855954116352e-07, + "loss": 0.7986, + "step": 19053 + }, + { + "epoch": 0.927811457648577, + "grad_norm": 1.629587173461914, + "learning_rate": 5.438547509660864e-07, + "loss": 0.7919, + "step": 19054 + }, + { + "epoch": 0.9278601514376841, + "grad_norm": 1.3040624856948853, + "learning_rate": 5.431243904939232e-07, + "loss": 0.7983, + "step": 19055 + }, + { + "epoch": 0.9279088452267913, + "grad_norm": 1.8738385438919067, + "learning_rate": 5.423945140133135e-07, + "loss": 0.8207, + "step": 19056 + }, + { + "epoch": 0.9279575390158985, + "grad_norm": 0.09644210338592529, + "learning_rate": 5.41665121542414e-07, + "loss": 0.5951, + "step": 19057 + }, + { + "epoch": 0.9280062328050057, + "grad_norm": 0.100394606590271, + "learning_rate": 5.409362130993723e-07, + "loss": 0.6035, + "step": 19058 + }, + { + "epoch": 0.9280549265941129, + "grad_norm": 1.2280925512313843, + "learning_rate": 5.402077887023116e-07, + "loss": 0.8887, + "step": 19059 + }, + { + "epoch": 0.9281036203832201, + "grad_norm": 1.6604713201522827, + "learning_rate": 5.39479848369362e-07, + "loss": 0.8399, + "step": 19060 + }, + { + "epoch": 0.9281523141723274, + "grad_norm": 1.460077166557312, + "learning_rate": 5.3875239211862e-07, + "loss": 0.8581, + "step": 19061 + }, + { + "epoch": 0.9282010079614346, + "grad_norm": 0.09513316303491592, + "learning_rate": 5.380254199681889e-07, + "loss": 0.602, + "step": 19062 + }, + { + "epoch": 0.9282497017505417, + "grad_norm": 1.1938378810882568, + "learning_rate": 5.372989319361521e-07, + "loss": 0.8262, + "step": 19063 + }, + { + "epoch": 0.9282983955396489, + "grad_norm": 1.7655543088912964, + "learning_rate": 5.365729280405751e-07, + "loss": 0.831, + "step": 19064 + }, + { + "epoch": 0.9283470893287561, + "grad_norm": 1.4220869541168213, + "learning_rate": 5.358474082995235e-07, + "loss": 0.6747, + "step": 19065 + }, + { + "epoch": 0.9283957831178633, + "grad_norm": 1.5546388626098633, + "learning_rate": 5.351223727310406e-07, + "loss": 0.8522, + "step": 19066 + }, + { + "epoch": 0.9284444769069705, + "grad_norm": 2.7094409465789795, + "learning_rate": 5.343978213531676e-07, + "loss": 0.6869, + "step": 19067 + }, + { + "epoch": 0.9284931706960777, + "grad_norm": 1.3923054933547974, + "learning_rate": 5.336737541839188e-07, + "loss": 0.8027, + "step": 19068 + }, + { + "epoch": 0.9285418644851849, + "grad_norm": 2.195291042327881, + "learning_rate": 5.329501712413132e-07, + "loss": 0.8347, + "step": 19069 + }, + { + "epoch": 0.9285905582742922, + "grad_norm": 2.2525315284729004, + "learning_rate": 5.322270725433454e-07, + "loss": 0.8465, + "step": 19070 + }, + { + "epoch": 0.9286392520633994, + "grad_norm": 1.2616111040115356, + "learning_rate": 5.315044581080054e-07, + "loss": 0.8065, + "step": 19071 + }, + { + "epoch": 0.9286879458525065, + "grad_norm": 2.4509549140930176, + "learning_rate": 5.307823279532653e-07, + "loss": 0.9457, + "step": 19072 + }, + { + "epoch": 0.9287366396416137, + "grad_norm": 2.502174139022827, + "learning_rate": 5.300606820970888e-07, + "loss": 0.8864, + "step": 19073 + }, + { + "epoch": 0.9287853334307209, + "grad_norm": 1.5282076597213745, + "learning_rate": 5.293395205574325e-07, + "loss": 0.8574, + "step": 19074 + }, + { + "epoch": 0.9288340272198281, + "grad_norm": 1.7340246438980103, + "learning_rate": 5.286188433522288e-07, + "loss": 0.7392, + "step": 19075 + }, + { + "epoch": 0.9288827210089353, + "grad_norm": 1.251158356666565, + "learning_rate": 5.278986504994055e-07, + "loss": 0.8741, + "step": 19076 + }, + { + "epoch": 0.9289314147980425, + "grad_norm": 1.9839575290679932, + "learning_rate": 5.271789420168793e-07, + "loss": 0.7852, + "step": 19077 + }, + { + "epoch": 0.9289801085871497, + "grad_norm": 1.6286280155181885, + "learning_rate": 5.264597179225539e-07, + "loss": 0.8145, + "step": 19078 + }, + { + "epoch": 0.929028802376257, + "grad_norm": 2.8301761150360107, + "learning_rate": 5.25740978234317e-07, + "loss": 0.7423, + "step": 19079 + }, + { + "epoch": 0.9290774961653641, + "grad_norm": 2.592059373855591, + "learning_rate": 5.2502272297005e-07, + "loss": 0.7833, + "step": 19080 + }, + { + "epoch": 0.9291261899544713, + "grad_norm": 2.6632277965545654, + "learning_rate": 5.243049521476184e-07, + "loss": 0.8499, + "step": 19081 + }, + { + "epoch": 0.9291748837435785, + "grad_norm": 1.520700454711914, + "learning_rate": 5.235876657848748e-07, + "loss": 0.9137, + "step": 19082 + }, + { + "epoch": 0.9292235775326857, + "grad_norm": 1.4074487686157227, + "learning_rate": 5.228708638996671e-07, + "loss": 0.8655, + "step": 19083 + }, + { + "epoch": 0.9292722713217929, + "grad_norm": 4.41301155090332, + "learning_rate": 5.221545465098188e-07, + "loss": 0.8281, + "step": 19084 + }, + { + "epoch": 0.9293209651109001, + "grad_norm": 1.5989078283309937, + "learning_rate": 5.214387136331533e-07, + "loss": 0.839, + "step": 19085 + }, + { + "epoch": 0.9293696589000073, + "grad_norm": 1.7334917783737183, + "learning_rate": 5.207233652874766e-07, + "loss": 0.7528, + "step": 19086 + }, + { + "epoch": 0.9294183526891145, + "grad_norm": 1.3469706773757935, + "learning_rate": 5.200085014905809e-07, + "loss": 0.8048, + "step": 19087 + }, + { + "epoch": 0.9294670464782216, + "grad_norm": 1.4242773056030273, + "learning_rate": 5.192941222602522e-07, + "loss": 0.7907, + "step": 19088 + }, + { + "epoch": 0.9295157402673289, + "grad_norm": 0.09902531653642654, + "learning_rate": 5.185802276142582e-07, + "loss": 0.6614, + "step": 19089 + }, + { + "epoch": 0.9295644340564361, + "grad_norm": 1.404863953590393, + "learning_rate": 5.17866817570356e-07, + "loss": 0.7656, + "step": 19090 + }, + { + "epoch": 0.9296131278455433, + "grad_norm": 1.6330695152282715, + "learning_rate": 5.171538921462937e-07, + "loss": 0.717, + "step": 19091 + }, + { + "epoch": 0.9296618216346505, + "grad_norm": 0.09550058841705322, + "learning_rate": 5.164414513598016e-07, + "loss": 0.5521, + "step": 19092 + }, + { + "epoch": 0.9297105154237577, + "grad_norm": 1.3144185543060303, + "learning_rate": 5.157294952286074e-07, + "loss": 0.8427, + "step": 19093 + }, + { + "epoch": 0.9297592092128649, + "grad_norm": 1.3305612802505493, + "learning_rate": 5.150180237704194e-07, + "loss": 0.8513, + "step": 19094 + }, + { + "epoch": 0.9298079030019721, + "grad_norm": 1.5737861394882202, + "learning_rate": 5.143070370029347e-07, + "loss": 0.8164, + "step": 19095 + }, + { + "epoch": 0.9298565967910793, + "grad_norm": 2.8366665840148926, + "learning_rate": 5.135965349438365e-07, + "loss": 0.9099, + "step": 19096 + }, + { + "epoch": 0.9299052905801864, + "grad_norm": 1.536397099494934, + "learning_rate": 5.128865176108045e-07, + "loss": 0.8211, + "step": 19097 + }, + { + "epoch": 0.9299539843692937, + "grad_norm": 1.345298409461975, + "learning_rate": 5.121769850214953e-07, + "loss": 0.8734, + "step": 19098 + }, + { + "epoch": 0.9300026781584009, + "grad_norm": 1.1741443872451782, + "learning_rate": 5.114679371935594e-07, + "loss": 0.848, + "step": 19099 + }, + { + "epoch": 0.9300513719475081, + "grad_norm": 1.418501377105713, + "learning_rate": 5.107593741446381e-07, + "loss": 0.8217, + "step": 19100 + }, + { + "epoch": 0.9301000657366153, + "grad_norm": 1.8036329746246338, + "learning_rate": 5.100512958923509e-07, + "loss": 0.8456, + "step": 19101 + }, + { + "epoch": 0.9301487595257225, + "grad_norm": 1.579931378364563, + "learning_rate": 5.093437024543169e-07, + "loss": 0.7625, + "step": 19102 + }, + { + "epoch": 0.9301974533148297, + "grad_norm": 1.6369003057479858, + "learning_rate": 5.086365938481352e-07, + "loss": 0.8632, + "step": 19103 + }, + { + "epoch": 0.9302461471039369, + "grad_norm": 1.6014471054077148, + "learning_rate": 5.079299700913965e-07, + "loss": 0.9524, + "step": 19104 + }, + { + "epoch": 0.930294840893044, + "grad_norm": 1.3529123067855835, + "learning_rate": 5.072238312016753e-07, + "loss": 0.842, + "step": 19105 + }, + { + "epoch": 0.9303435346821513, + "grad_norm": 1.421613097190857, + "learning_rate": 5.065181771965399e-07, + "loss": 0.7735, + "step": 19106 + }, + { + "epoch": 0.9303922284712585, + "grad_norm": 1.6686410903930664, + "learning_rate": 5.058130080935409e-07, + "loss": 0.7803, + "step": 19107 + }, + { + "epoch": 0.9304409222603657, + "grad_norm": 1.880601167678833, + "learning_rate": 5.051083239102194e-07, + "loss": 0.8048, + "step": 19108 + }, + { + "epoch": 0.9304896160494729, + "grad_norm": 2.963239908218384, + "learning_rate": 5.044041246641107e-07, + "loss": 0.7207, + "step": 19109 + }, + { + "epoch": 0.9305383098385801, + "grad_norm": 1.7179539203643799, + "learning_rate": 5.037004103727228e-07, + "loss": 0.8118, + "step": 19110 + }, + { + "epoch": 0.9305870036276873, + "grad_norm": 2.784395217895508, + "learning_rate": 5.029971810535683e-07, + "loss": 0.7511, + "step": 19111 + }, + { + "epoch": 0.9306356974167945, + "grad_norm": 1.3121808767318726, + "learning_rate": 5.022944367241311e-07, + "loss": 0.8059, + "step": 19112 + }, + { + "epoch": 0.9306843912059017, + "grad_norm": 1.7096694707870483, + "learning_rate": 5.015921774019017e-07, + "loss": 0.7921, + "step": 19113 + }, + { + "epoch": 0.9307330849950088, + "grad_norm": 1.3848447799682617, + "learning_rate": 5.008904031043438e-07, + "loss": 0.7549, + "step": 19114 + }, + { + "epoch": 0.930781778784116, + "grad_norm": 2.363588571548462, + "learning_rate": 5.001891138489123e-07, + "loss": 0.8443, + "step": 19115 + }, + { + "epoch": 0.9308304725732233, + "grad_norm": 1.5842695236206055, + "learning_rate": 4.994883096530601e-07, + "loss": 0.8317, + "step": 19116 + }, + { + "epoch": 0.9308791663623305, + "grad_norm": 1.832973837852478, + "learning_rate": 4.987879905342086e-07, + "loss": 0.7689, + "step": 19117 + }, + { + "epoch": 0.9309278601514377, + "grad_norm": 1.803096890449524, + "learning_rate": 4.980881565097906e-07, + "loss": 0.806, + "step": 19118 + }, + { + "epoch": 0.9309765539405449, + "grad_norm": 1.6685816049575806, + "learning_rate": 4.97388807597201e-07, + "loss": 0.785, + "step": 19119 + }, + { + "epoch": 0.9310252477296521, + "grad_norm": 1.5743662118911743, + "learning_rate": 4.966899438138462e-07, + "loss": 0.799, + "step": 19120 + }, + { + "epoch": 0.9310739415187593, + "grad_norm": 2.0263872146606445, + "learning_rate": 4.959915651771075e-07, + "loss": 0.7362, + "step": 19121 + }, + { + "epoch": 0.9311226353078664, + "grad_norm": 1.6563961505889893, + "learning_rate": 4.952936717043533e-07, + "loss": 0.7923, + "step": 19122 + }, + { + "epoch": 0.9311713290969736, + "grad_norm": 1.6063865423202515, + "learning_rate": 4.94596263412952e-07, + "loss": 0.7553, + "step": 19123 + }, + { + "epoch": 0.9312200228860809, + "grad_norm": 1.3711727857589722, + "learning_rate": 4.938993403202452e-07, + "loss": 0.9427, + "step": 19124 + }, + { + "epoch": 0.9312687166751881, + "grad_norm": 1.6526297330856323, + "learning_rate": 4.932029024435725e-07, + "loss": 0.8043, + "step": 19125 + }, + { + "epoch": 0.9313174104642953, + "grad_norm": 1.3085914850234985, + "learning_rate": 4.925069498002533e-07, + "loss": 0.8567, + "step": 19126 + }, + { + "epoch": 0.9313661042534025, + "grad_norm": 1.265252947807312, + "learning_rate": 4.918114824076026e-07, + "loss": 0.7925, + "step": 19127 + }, + { + "epoch": 0.9314147980425097, + "grad_norm": 1.359268069267273, + "learning_rate": 4.911165002829221e-07, + "loss": 0.8121, + "step": 19128 + }, + { + "epoch": 0.9314634918316169, + "grad_norm": 1.8782436847686768, + "learning_rate": 4.904220034434958e-07, + "loss": 0.8282, + "step": 19129 + }, + { + "epoch": 0.9315121856207241, + "grad_norm": 2.2579824924468994, + "learning_rate": 4.897279919066011e-07, + "loss": 0.889, + "step": 19130 + }, + { + "epoch": 0.9315608794098312, + "grad_norm": 2.2865166664123535, + "learning_rate": 4.890344656895018e-07, + "loss": 0.9549, + "step": 19131 + }, + { + "epoch": 0.9316095731989384, + "grad_norm": 1.483113169670105, + "learning_rate": 4.883414248094509e-07, + "loss": 0.7714, + "step": 19132 + }, + { + "epoch": 0.9316582669880457, + "grad_norm": 0.09547014534473419, + "learning_rate": 4.876488692836812e-07, + "loss": 0.6355, + "step": 19133 + }, + { + "epoch": 0.9317069607771529, + "grad_norm": 2.0508956909179688, + "learning_rate": 4.8695679912943e-07, + "loss": 0.8599, + "step": 19134 + }, + { + "epoch": 0.9317556545662601, + "grad_norm": 1.7304697036743164, + "learning_rate": 4.862652143639057e-07, + "loss": 0.7483, + "step": 19135 + }, + { + "epoch": 0.9318043483553673, + "grad_norm": 1.699317455291748, + "learning_rate": 4.855741150043125e-07, + "loss": 0.848, + "step": 19136 + }, + { + "epoch": 0.9318530421444745, + "grad_norm": 2.0794804096221924, + "learning_rate": 4.848835010678432e-07, + "loss": 0.8204, + "step": 19137 + }, + { + "epoch": 0.9319017359335817, + "grad_norm": 1.6582497358322144, + "learning_rate": 4.841933725716774e-07, + "loss": 0.8524, + "step": 19138 + }, + { + "epoch": 0.9319504297226888, + "grad_norm": 1.8247177600860596, + "learning_rate": 4.835037295329792e-07, + "loss": 0.7196, + "step": 19139 + }, + { + "epoch": 0.931999123511796, + "grad_norm": 1.2036858797073364, + "learning_rate": 4.828145719689059e-07, + "loss": 0.8046, + "step": 19140 + }, + { + "epoch": 0.9320478173009032, + "grad_norm": 1.1402825117111206, + "learning_rate": 4.821258998965995e-07, + "loss": 0.8172, + "step": 19141 + }, + { + "epoch": 0.9320965110900105, + "grad_norm": 1.7026745080947876, + "learning_rate": 4.814377133331882e-07, + "loss": 0.7153, + "step": 19142 + }, + { + "epoch": 0.9321452048791177, + "grad_norm": 1.5355772972106934, + "learning_rate": 4.807500122957965e-07, + "loss": 0.7471, + "step": 19143 + }, + { + "epoch": 0.9321938986682249, + "grad_norm": 1.5298336744308472, + "learning_rate": 4.800627968015281e-07, + "loss": 0.8067, + "step": 19144 + }, + { + "epoch": 0.9322425924573321, + "grad_norm": 2.615734577178955, + "learning_rate": 4.793760668674763e-07, + "loss": 0.8379, + "step": 19145 + }, + { + "epoch": 0.9322912862464393, + "grad_norm": 1.4145252704620361, + "learning_rate": 4.786898225107251e-07, + "loss": 0.8023, + "step": 19146 + }, + { + "epoch": 0.9323399800355464, + "grad_norm": 1.9575625658035278, + "learning_rate": 4.780040637483452e-07, + "loss": 0.7543, + "step": 19147 + }, + { + "epoch": 0.9323886738246536, + "grad_norm": 1.8281856775283813, + "learning_rate": 4.773187905973942e-07, + "loss": 0.8654, + "step": 19148 + }, + { + "epoch": 0.9324373676137608, + "grad_norm": 1.665236234664917, + "learning_rate": 4.766340030749184e-07, + "loss": 0.7573, + "step": 19149 + }, + { + "epoch": 0.932486061402868, + "grad_norm": 1.7456648349761963, + "learning_rate": 4.759497011979486e-07, + "loss": 0.8816, + "step": 19150 + }, + { + "epoch": 0.9325347551919753, + "grad_norm": 1.1047126054763794, + "learning_rate": 4.7526588498351123e-07, + "loss": 0.7336, + "step": 19151 + }, + { + "epoch": 0.9325834489810825, + "grad_norm": 1.3069522380828857, + "learning_rate": 4.7458255444861714e-07, + "loss": 0.7458, + "step": 19152 + }, + { + "epoch": 0.9326321427701897, + "grad_norm": 1.745952844619751, + "learning_rate": 4.7389970961026154e-07, + "loss": 0.7916, + "step": 19153 + }, + { + "epoch": 0.9326808365592969, + "grad_norm": 1.6595497131347656, + "learning_rate": 4.7321735048543095e-07, + "loss": 0.7742, + "step": 19154 + }, + { + "epoch": 0.9327295303484041, + "grad_norm": 1.28412663936615, + "learning_rate": 4.725354770911006e-07, + "loss": 0.8013, + "step": 19155 + }, + { + "epoch": 0.9327782241375112, + "grad_norm": 1.4968419075012207, + "learning_rate": 4.7185408944422806e-07, + "loss": 0.8315, + "step": 19156 + }, + { + "epoch": 0.9328269179266184, + "grad_norm": 1.400496244430542, + "learning_rate": 4.7117318756176646e-07, + "loss": 0.7961, + "step": 19157 + }, + { + "epoch": 0.9328756117157256, + "grad_norm": 1.8168017864227295, + "learning_rate": 4.7049277146065333e-07, + "loss": 0.7831, + "step": 19158 + }, + { + "epoch": 0.9329243055048329, + "grad_norm": 5.731535911560059, + "learning_rate": 4.698128411578107e-07, + "loss": 0.7776, + "step": 19159 + }, + { + "epoch": 0.9329729992939401, + "grad_norm": 1.2708123922348022, + "learning_rate": 4.6913339667015834e-07, + "loss": 0.8613, + "step": 19160 + }, + { + "epoch": 0.9330216930830473, + "grad_norm": 2.6304874420166016, + "learning_rate": 4.6845443801458945e-07, + "loss": 0.8068, + "step": 19161 + }, + { + "epoch": 0.9330703868721545, + "grad_norm": 1.3016479015350342, + "learning_rate": 4.677759652079994e-07, + "loss": 0.8001, + "step": 19162 + }, + { + "epoch": 0.9331190806612617, + "grad_norm": 1.3359609842300415, + "learning_rate": 4.6709797826726353e-07, + "loss": 0.7027, + "step": 19163 + }, + { + "epoch": 0.9331677744503688, + "grad_norm": 1.4539308547973633, + "learning_rate": 4.664204772092462e-07, + "loss": 0.8616, + "step": 19164 + }, + { + "epoch": 0.933216468239476, + "grad_norm": 1.5079177618026733, + "learning_rate": 4.6574346205079836e-07, + "loss": 0.8308, + "step": 19165 + }, + { + "epoch": 0.9332651620285832, + "grad_norm": 1.6474553346633911, + "learning_rate": 4.6506693280876427e-07, + "loss": 0.8835, + "step": 19166 + }, + { + "epoch": 0.9333138558176904, + "grad_norm": 1.0804660320281982, + "learning_rate": 4.6439088949997267e-07, + "loss": 0.8616, + "step": 19167 + }, + { + "epoch": 0.9333625496067977, + "grad_norm": 2.2096123695373535, + "learning_rate": 4.6371533214123685e-07, + "loss": 0.8301, + "step": 19168 + }, + { + "epoch": 0.9334112433959049, + "grad_norm": 1.5881322622299194, + "learning_rate": 4.630402607493656e-07, + "loss": 0.8074, + "step": 19169 + }, + { + "epoch": 0.9334599371850121, + "grad_norm": 1.6585209369659424, + "learning_rate": 4.623656753411476e-07, + "loss": 0.812, + "step": 19170 + }, + { + "epoch": 0.9335086309741193, + "grad_norm": 1.7115669250488281, + "learning_rate": 4.6169157593336514e-07, + "loss": 0.8109, + "step": 19171 + }, + { + "epoch": 0.9335573247632265, + "grad_norm": 7.536828517913818, + "learning_rate": 4.6101796254278464e-07, + "loss": 0.7901, + "step": 19172 + }, + { + "epoch": 0.9336060185523336, + "grad_norm": 1.8962225914001465, + "learning_rate": 4.603448351861639e-07, + "loss": 0.7853, + "step": 19173 + }, + { + "epoch": 0.9336547123414408, + "grad_norm": 1.461332082748413, + "learning_rate": 4.5967219388025173e-07, + "loss": 0.854, + "step": 19174 + }, + { + "epoch": 0.933703406130548, + "grad_norm": 1.5737252235412598, + "learning_rate": 4.5900003864177034e-07, + "loss": 0.8566, + "step": 19175 + }, + { + "epoch": 0.9337520999196552, + "grad_norm": 1.619176983833313, + "learning_rate": 4.583283694874463e-07, + "loss": 0.8462, + "step": 19176 + }, + { + "epoch": 0.9338007937087625, + "grad_norm": 1.5141663551330566, + "learning_rate": 4.5765718643398405e-07, + "loss": 0.7884, + "step": 19177 + }, + { + "epoch": 0.9338494874978697, + "grad_norm": 1.1803569793701172, + "learning_rate": 4.569864894980836e-07, + "loss": 0.828, + "step": 19178 + }, + { + "epoch": 0.9338981812869769, + "grad_norm": 0.09388472139835358, + "learning_rate": 4.5631627869642705e-07, + "loss": 0.5919, + "step": 19179 + }, + { + "epoch": 0.9339468750760841, + "grad_norm": 2.2825424671173096, + "learning_rate": 4.5564655404568337e-07, + "loss": 0.7641, + "step": 19180 + }, + { + "epoch": 0.9339955688651912, + "grad_norm": 1.8363817930221558, + "learning_rate": 4.549773155625126e-07, + "loss": 0.8511, + "step": 19181 + }, + { + "epoch": 0.9340442626542984, + "grad_norm": 1.8359462022781372, + "learning_rate": 4.5430856326356354e-07, + "loss": 0.7774, + "step": 19182 + }, + { + "epoch": 0.9340929564434056, + "grad_norm": 1.4413156509399414, + "learning_rate": 4.5364029716547185e-07, + "loss": 0.7974, + "step": 19183 + }, + { + "epoch": 0.9341416502325128, + "grad_norm": 1.6693363189697266, + "learning_rate": 4.529725172848598e-07, + "loss": 0.7551, + "step": 19184 + }, + { + "epoch": 0.93419034402162, + "grad_norm": 1.3999055624008179, + "learning_rate": 4.523052236383385e-07, + "loss": 0.7785, + "step": 19185 + }, + { + "epoch": 0.9342390378107273, + "grad_norm": 1.3554078340530396, + "learning_rate": 4.516384162425058e-07, + "loss": 0.729, + "step": 19186 + }, + { + "epoch": 0.9342877315998345, + "grad_norm": 1.2915934324264526, + "learning_rate": 4.5097209511395293e-07, + "loss": 0.8922, + "step": 19187 + }, + { + "epoch": 0.9343364253889417, + "grad_norm": 1.2430423498153687, + "learning_rate": 4.5030626026924874e-07, + "loss": 0.7834, + "step": 19188 + }, + { + "epoch": 0.9343851191780489, + "grad_norm": 1.6368639469146729, + "learning_rate": 4.4964091172496006e-07, + "loss": 0.7923, + "step": 19189 + }, + { + "epoch": 0.934433812967156, + "grad_norm": 1.56759512424469, + "learning_rate": 4.4897604949763583e-07, + "loss": 0.7478, + "step": 19190 + }, + { + "epoch": 0.9344825067562632, + "grad_norm": 1.2781587839126587, + "learning_rate": 4.48311673603814e-07, + "loss": 0.7672, + "step": 19191 + }, + { + "epoch": 0.9345312005453704, + "grad_norm": 1.1528602838516235, + "learning_rate": 4.4764778406002575e-07, + "loss": 0.782, + "step": 19192 + }, + { + "epoch": 0.9345798943344776, + "grad_norm": 1.34832763671875, + "learning_rate": 4.469843808827756e-07, + "loss": 0.7695, + "step": 19193 + }, + { + "epoch": 0.9346285881235848, + "grad_norm": 1.4656256437301636, + "learning_rate": 4.4632146408857715e-07, + "loss": 0.8581, + "step": 19194 + }, + { + "epoch": 0.9346772819126921, + "grad_norm": 2.307734966278076, + "learning_rate": 4.456590336939126e-07, + "loss": 0.8497, + "step": 19195 + }, + { + "epoch": 0.9347259757017993, + "grad_norm": 1.4007326364517212, + "learning_rate": 4.4499708971526226e-07, + "loss": 0.826, + "step": 19196 + }, + { + "epoch": 0.9347746694909065, + "grad_norm": 0.1158127561211586, + "learning_rate": 4.443356321690928e-07, + "loss": 0.584, + "step": 19197 + }, + { + "epoch": 0.9348233632800136, + "grad_norm": 1.5329596996307373, + "learning_rate": 4.4367466107185785e-07, + "loss": 0.7862, + "step": 19198 + }, + { + "epoch": 0.9348720570691208, + "grad_norm": 1.5710804462432861, + "learning_rate": 4.430141764399998e-07, + "loss": 0.7746, + "step": 19199 + }, + { + "epoch": 0.934920750858228, + "grad_norm": 2.2939038276672363, + "learning_rate": 4.423541782899454e-07, + "loss": 0.8174, + "step": 19200 + }, + { + "epoch": 0.9349694446473352, + "grad_norm": 1.9165576696395874, + "learning_rate": 4.4169466663811277e-07, + "loss": 0.731, + "step": 19201 + }, + { + "epoch": 0.9350181384364424, + "grad_norm": 2.433065414428711, + "learning_rate": 4.4103564150091093e-07, + "loss": 0.8027, + "step": 19202 + }, + { + "epoch": 0.9350668322255496, + "grad_norm": 2.5543806552886963, + "learning_rate": 4.403771028947312e-07, + "loss": 0.8138, + "step": 19203 + }, + { + "epoch": 0.9351155260146569, + "grad_norm": 1.4699697494506836, + "learning_rate": 4.397190508359539e-07, + "loss": 0.8437, + "step": 19204 + }, + { + "epoch": 0.9351642198037641, + "grad_norm": 2.093251943588257, + "learning_rate": 4.390614853409481e-07, + "loss": 0.8584, + "step": 19205 + }, + { + "epoch": 0.9352129135928712, + "grad_norm": 1.977463722229004, + "learning_rate": 4.3840440642607176e-07, + "loss": 0.7648, + "step": 19206 + }, + { + "epoch": 0.9352616073819784, + "grad_norm": 1.5130747556686401, + "learning_rate": 4.3774781410766966e-07, + "loss": 0.8197, + "step": 19207 + }, + { + "epoch": 0.9353103011710856, + "grad_norm": 2.3607897758483887, + "learning_rate": 4.3709170840207093e-07, + "loss": 0.8852, + "step": 19208 + }, + { + "epoch": 0.9353589949601928, + "grad_norm": 1.8088513612747192, + "learning_rate": 4.3643608932560477e-07, + "loss": 0.8954, + "step": 19209 + }, + { + "epoch": 0.9354076887493, + "grad_norm": 1.6098108291625977, + "learning_rate": 4.3578095689457143e-07, + "loss": 0.6864, + "step": 19210 + }, + { + "epoch": 0.9354563825384072, + "grad_norm": 0.0902467668056488, + "learning_rate": 4.3512631112527127e-07, + "loss": 0.5728, + "step": 19211 + }, + { + "epoch": 0.9355050763275145, + "grad_norm": 0.09563497453927994, + "learning_rate": 4.3447215203398897e-07, + "loss": 0.6457, + "step": 19212 + }, + { + "epoch": 0.9355537701166217, + "grad_norm": 1.698571801185608, + "learning_rate": 4.3381847963699375e-07, + "loss": 0.8298, + "step": 19213 + }, + { + "epoch": 0.9356024639057289, + "grad_norm": 1.898245096206665, + "learning_rate": 4.331652939505504e-07, + "loss": 0.9343, + "step": 19214 + }, + { + "epoch": 0.935651157694836, + "grad_norm": 1.4427670240402222, + "learning_rate": 4.325125949909015e-07, + "loss": 0.7999, + "step": 19215 + }, + { + "epoch": 0.9356998514839432, + "grad_norm": 2.0414040088653564, + "learning_rate": 4.3186038277428956e-07, + "loss": 0.8125, + "step": 19216 + }, + { + "epoch": 0.9357485452730504, + "grad_norm": 1.666648030281067, + "learning_rate": 4.312086573169305e-07, + "loss": 0.7624, + "step": 19217 + }, + { + "epoch": 0.9357972390621576, + "grad_norm": 1.4305709600448608, + "learning_rate": 4.3055741863504476e-07, + "loss": 0.7315, + "step": 19218 + }, + { + "epoch": 0.9358459328512648, + "grad_norm": 2.010131359100342, + "learning_rate": 4.2990666674482375e-07, + "loss": 0.7805, + "step": 19219 + }, + { + "epoch": 0.935894626640372, + "grad_norm": 1.530569314956665, + "learning_rate": 4.2925640166245895e-07, + "loss": 0.8027, + "step": 19220 + }, + { + "epoch": 0.9359433204294793, + "grad_norm": 2.662001609802246, + "learning_rate": 4.2860662340412863e-07, + "loss": 0.7691, + "step": 19221 + }, + { + "epoch": 0.9359920142185865, + "grad_norm": 1.9707149267196655, + "learning_rate": 4.2795733198599085e-07, + "loss": 0.7821, + "step": 19222 + }, + { + "epoch": 0.9360407080076936, + "grad_norm": 1.8684289455413818, + "learning_rate": 4.273085274241995e-07, + "loss": 0.7984, + "step": 19223 + }, + { + "epoch": 0.9360894017968008, + "grad_norm": 1.7896157503128052, + "learning_rate": 4.2666020973489044e-07, + "loss": 0.7313, + "step": 19224 + }, + { + "epoch": 0.936138095585908, + "grad_norm": 4.622921943664551, + "learning_rate": 4.2601237893419744e-07, + "loss": 0.7894, + "step": 19225 + }, + { + "epoch": 0.9361867893750152, + "grad_norm": 2.401402473449707, + "learning_rate": 4.253650350382277e-07, + "loss": 0.7429, + "step": 19226 + }, + { + "epoch": 0.9362354831641224, + "grad_norm": 1.7558668851852417, + "learning_rate": 4.247181780630927e-07, + "loss": 0.7981, + "step": 19227 + }, + { + "epoch": 0.9362841769532296, + "grad_norm": 2.324441909790039, + "learning_rate": 4.240718080248707e-07, + "loss": 0.839, + "step": 19228 + }, + { + "epoch": 0.9363328707423368, + "grad_norm": 1.572561264038086, + "learning_rate": 4.234259249396511e-07, + "loss": 0.7843, + "step": 19229 + }, + { + "epoch": 0.936381564531444, + "grad_norm": 1.290973424911499, + "learning_rate": 4.2278052882349876e-07, + "loss": 0.8812, + "step": 19230 + }, + { + "epoch": 0.9364302583205513, + "grad_norm": 2.0021986961364746, + "learning_rate": 4.2213561969246085e-07, + "loss": 0.8788, + "step": 19231 + }, + { + "epoch": 0.9364789521096584, + "grad_norm": 1.5374977588653564, + "learning_rate": 4.2149119756258906e-07, + "loss": 0.7373, + "step": 19232 + }, + { + "epoch": 0.9365276458987656, + "grad_norm": 1.4134494066238403, + "learning_rate": 4.208472624499038e-07, + "loss": 0.786, + "step": 19233 + }, + { + "epoch": 0.9365763396878728, + "grad_norm": 1.4561748504638672, + "learning_rate": 4.2020381437043454e-07, + "loss": 0.9279, + "step": 19234 + }, + { + "epoch": 0.93662503347698, + "grad_norm": 1.5941071510314941, + "learning_rate": 4.1956085334017516e-07, + "loss": 0.6815, + "step": 19235 + }, + { + "epoch": 0.9366737272660872, + "grad_norm": 1.851564645767212, + "learning_rate": 4.1891837937512616e-07, + "loss": 0.8688, + "step": 19236 + }, + { + "epoch": 0.9367224210551944, + "grad_norm": 1.2718466520309448, + "learning_rate": 4.1827639249126804e-07, + "loss": 0.8359, + "step": 19237 + }, + { + "epoch": 0.9367711148443016, + "grad_norm": 2.0100090503692627, + "learning_rate": 4.1763489270457257e-07, + "loss": 0.8976, + "step": 19238 + }, + { + "epoch": 0.9368198086334089, + "grad_norm": 1.2039258480072021, + "learning_rate": 4.1699388003099137e-07, + "loss": 0.8563, + "step": 19239 + }, + { + "epoch": 0.936868502422516, + "grad_norm": 1.75050950050354, + "learning_rate": 4.1635335448647176e-07, + "loss": 0.8411, + "step": 19240 + }, + { + "epoch": 0.9369171962116232, + "grad_norm": 1.383000373840332, + "learning_rate": 4.157133160869542e-07, + "loss": 0.8109, + "step": 19241 + }, + { + "epoch": 0.9369658900007304, + "grad_norm": 2.3823816776275635, + "learning_rate": 4.15073764848346e-07, + "loss": 0.7875, + "step": 19242 + }, + { + "epoch": 0.9370145837898376, + "grad_norm": 1.7097564935684204, + "learning_rate": 4.144347007865679e-07, + "loss": 0.8331, + "step": 19243 + }, + { + "epoch": 0.9370632775789448, + "grad_norm": 1.8816649913787842, + "learning_rate": 4.1379612391751147e-07, + "loss": 0.8091, + "step": 19244 + }, + { + "epoch": 0.937111971368052, + "grad_norm": 1.462380051612854, + "learning_rate": 4.13158034257064e-07, + "loss": 0.81, + "step": 19245 + }, + { + "epoch": 0.9371606651571592, + "grad_norm": 2.049668788909912, + "learning_rate": 4.125204318210951e-07, + "loss": 0.8854, + "step": 19246 + }, + { + "epoch": 0.9372093589462664, + "grad_norm": 1.6832841634750366, + "learning_rate": 4.1188331662546545e-07, + "loss": 0.7254, + "step": 19247 + }, + { + "epoch": 0.9372580527353735, + "grad_norm": 1.7106534242630005, + "learning_rate": 4.1124668868602446e-07, + "loss": 0.885, + "step": 19248 + }, + { + "epoch": 0.9373067465244808, + "grad_norm": 1.244473934173584, + "learning_rate": 4.1061054801860844e-07, + "loss": 0.8279, + "step": 19249 + }, + { + "epoch": 0.937355440313588, + "grad_norm": 1.3354828357696533, + "learning_rate": 4.099748946390425e-07, + "loss": 0.7975, + "step": 19250 + }, + { + "epoch": 0.9374041341026952, + "grad_norm": 1.3690513372421265, + "learning_rate": 4.09339728563134e-07, + "loss": 0.8886, + "step": 19251 + }, + { + "epoch": 0.9374528278918024, + "grad_norm": 0.10013285279273987, + "learning_rate": 4.08705049806688e-07, + "loss": 0.6293, + "step": 19252 + }, + { + "epoch": 0.9375015216809096, + "grad_norm": 1.408753514289856, + "learning_rate": 4.0807085838548974e-07, + "loss": 0.8779, + "step": 19253 + }, + { + "epoch": 0.9375502154700168, + "grad_norm": 1.4575108289718628, + "learning_rate": 4.074371543153155e-07, + "loss": 0.9173, + "step": 19254 + }, + { + "epoch": 0.937598909259124, + "grad_norm": 1.6041226387023926, + "learning_rate": 4.0680393761192816e-07, + "loss": 0.8165, + "step": 19255 + }, + { + "epoch": 0.9376476030482312, + "grad_norm": 1.202807068824768, + "learning_rate": 4.0617120829107957e-07, + "loss": 0.7819, + "step": 19256 + }, + { + "epoch": 0.9376962968373384, + "grad_norm": 2.349897623062134, + "learning_rate": 4.055389663685083e-07, + "loss": 0.8507, + "step": 19257 + }, + { + "epoch": 0.9377449906264456, + "grad_norm": 1.5760976076126099, + "learning_rate": 4.0490721185994177e-07, + "loss": 0.7824, + "step": 19258 + }, + { + "epoch": 0.9377936844155528, + "grad_norm": 2.5055668354034424, + "learning_rate": 4.04275944781094e-07, + "loss": 0.8102, + "step": 19259 + }, + { + "epoch": 0.93784237820466, + "grad_norm": 2.508979320526123, + "learning_rate": 4.0364516514767027e-07, + "loss": 0.6947, + "step": 19260 + }, + { + "epoch": 0.9378910719937672, + "grad_norm": 1.614451289176941, + "learning_rate": 4.0301487297536024e-07, + "loss": 0.7663, + "step": 19261 + }, + { + "epoch": 0.9379397657828744, + "grad_norm": 1.7757201194763184, + "learning_rate": 4.023850682798425e-07, + "loss": 0.8268, + "step": 19262 + }, + { + "epoch": 0.9379884595719816, + "grad_norm": 2.1592698097229004, + "learning_rate": 4.017557510767822e-07, + "loss": 0.8269, + "step": 19263 + }, + { + "epoch": 0.9380371533610888, + "grad_norm": 1.697921633720398, + "learning_rate": 4.011269213818336e-07, + "loss": 0.7927, + "step": 19264 + }, + { + "epoch": 0.9380858471501959, + "grad_norm": 1.5047943592071533, + "learning_rate": 4.0049857921063973e-07, + "loss": 0.7517, + "step": 19265 + }, + { + "epoch": 0.9381345409393032, + "grad_norm": 1.2723866701126099, + "learning_rate": 3.9987072457883026e-07, + "loss": 0.8358, + "step": 19266 + }, + { + "epoch": 0.9381832347284104, + "grad_norm": 1.4150880575180054, + "learning_rate": 3.99243357502026e-07, + "loss": 0.8164, + "step": 19267 + }, + { + "epoch": 0.9382319285175176, + "grad_norm": 1.354881763458252, + "learning_rate": 3.9861647799582793e-07, + "loss": 0.8087, + "step": 19268 + }, + { + "epoch": 0.9382806223066248, + "grad_norm": 2.865961790084839, + "learning_rate": 3.9799008607583457e-07, + "loss": 0.7942, + "step": 19269 + }, + { + "epoch": 0.938329316095732, + "grad_norm": 1.8126816749572754, + "learning_rate": 3.9736418175762235e-07, + "loss": 0.7599, + "step": 19270 + }, + { + "epoch": 0.9383780098848392, + "grad_norm": 1.2144358158111572, + "learning_rate": 3.9673876505676557e-07, + "loss": 0.7982, + "step": 19271 + }, + { + "epoch": 0.9384267036739464, + "grad_norm": 1.7979300022125244, + "learning_rate": 3.961138359888184e-07, + "loss": 0.8102, + "step": 19272 + }, + { + "epoch": 0.9384753974630536, + "grad_norm": 1.507705807685852, + "learning_rate": 3.9548939456932613e-07, + "loss": 0.8359, + "step": 19273 + }, + { + "epoch": 0.9385240912521607, + "grad_norm": 1.794519066810608, + "learning_rate": 3.9486544081382525e-07, + "loss": 0.7848, + "step": 19274 + }, + { + "epoch": 0.938572785041268, + "grad_norm": 1.3534770011901855, + "learning_rate": 3.942419747378301e-07, + "loss": 0.836, + "step": 19275 + }, + { + "epoch": 0.9386214788303752, + "grad_norm": 1.3935312032699585, + "learning_rate": 3.9361899635685706e-07, + "loss": 0.9146, + "step": 19276 + }, + { + "epoch": 0.9386701726194824, + "grad_norm": 0.0994759202003479, + "learning_rate": 3.9299650568639824e-07, + "loss": 0.598, + "step": 19277 + }, + { + "epoch": 0.9387188664085896, + "grad_norm": 1.327812910079956, + "learning_rate": 3.92374502741939e-07, + "loss": 0.8335, + "step": 19278 + }, + { + "epoch": 0.9387675601976968, + "grad_norm": 1.5300897359848022, + "learning_rate": 3.9175298753895143e-07, + "loss": 0.8598, + "step": 19279 + }, + { + "epoch": 0.938816253986804, + "grad_norm": 2.4827535152435303, + "learning_rate": 3.911319600928942e-07, + "loss": 0.7285, + "step": 19280 + }, + { + "epoch": 0.9388649477759112, + "grad_norm": 2.04657244682312, + "learning_rate": 3.905114204192195e-07, + "loss": 0.8253, + "step": 19281 + }, + { + "epoch": 0.9389136415650183, + "grad_norm": 2.6246559619903564, + "learning_rate": 3.898913685333594e-07, + "loss": 0.8636, + "step": 19282 + }, + { + "epoch": 0.9389623353541255, + "grad_norm": 6.9330973625183105, + "learning_rate": 3.8927180445074374e-07, + "loss": 0.8651, + "step": 19283 + }, + { + "epoch": 0.9390110291432328, + "grad_norm": 1.9865643978118896, + "learning_rate": 3.886527281867758e-07, + "loss": 0.8702, + "step": 19284 + }, + { + "epoch": 0.93905972293234, + "grad_norm": 1.2748912572860718, + "learning_rate": 3.88034139756861e-07, + "loss": 0.8353, + "step": 19285 + }, + { + "epoch": 0.9391084167214472, + "grad_norm": 2.1047589778900146, + "learning_rate": 3.874160391763848e-07, + "loss": 0.8917, + "step": 19286 + }, + { + "epoch": 0.9391571105105544, + "grad_norm": 1.6283729076385498, + "learning_rate": 3.8679842646072166e-07, + "loss": 0.8459, + "step": 19287 + }, + { + "epoch": 0.9392058042996616, + "grad_norm": 1.7360039949417114, + "learning_rate": 3.8618130162523916e-07, + "loss": 0.8134, + "step": 19288 + }, + { + "epoch": 0.9392544980887688, + "grad_norm": 1.3919473886489868, + "learning_rate": 3.8556466468528284e-07, + "loss": 0.8421, + "step": 19289 + }, + { + "epoch": 0.939303191877876, + "grad_norm": 1.9545931816101074, + "learning_rate": 3.8494851565619386e-07, + "loss": 0.8037, + "step": 19290 + }, + { + "epoch": 0.9393518856669831, + "grad_norm": 2.0420544147491455, + "learning_rate": 3.843328545532976e-07, + "loss": 0.6534, + "step": 19291 + }, + { + "epoch": 0.9394005794560903, + "grad_norm": 8.754813194274902, + "learning_rate": 3.83717681391913e-07, + "loss": 0.871, + "step": 19292 + }, + { + "epoch": 0.9394492732451976, + "grad_norm": 1.2861101627349854, + "learning_rate": 3.831029961873367e-07, + "loss": 0.8492, + "step": 19293 + }, + { + "epoch": 0.9394979670343048, + "grad_norm": 1.4411360025405884, + "learning_rate": 3.824887989548631e-07, + "loss": 0.7992, + "step": 19294 + }, + { + "epoch": 0.939546660823412, + "grad_norm": 0.09027823060750961, + "learning_rate": 3.8187508970977115e-07, + "loss": 0.6189, + "step": 19295 + }, + { + "epoch": 0.9395953546125192, + "grad_norm": 1.5553467273712158, + "learning_rate": 3.812618684673242e-07, + "loss": 0.8504, + "step": 19296 + }, + { + "epoch": 0.9396440484016264, + "grad_norm": 1.3582029342651367, + "learning_rate": 3.806491352427766e-07, + "loss": 0.8519, + "step": 19297 + }, + { + "epoch": 0.9396927421907336, + "grad_norm": 2.1602227687835693, + "learning_rate": 3.800368900513696e-07, + "loss": 0.8455, + "step": 19298 + }, + { + "epoch": 0.9397414359798407, + "grad_norm": 3.2674007415771484, + "learning_rate": 3.794251329083354e-07, + "loss": 0.8082, + "step": 19299 + }, + { + "epoch": 0.9397901297689479, + "grad_norm": 1.4876031875610352, + "learning_rate": 3.788138638288863e-07, + "loss": 0.8442, + "step": 19300 + }, + { + "epoch": 0.9398388235580551, + "grad_norm": 3.1288068294525146, + "learning_rate": 3.782030828282368e-07, + "loss": 0.899, + "step": 19301 + }, + { + "epoch": 0.9398875173471624, + "grad_norm": 1.8946834802627563, + "learning_rate": 3.7759278992157035e-07, + "loss": 0.8205, + "step": 19302 + }, + { + "epoch": 0.9399362111362696, + "grad_norm": 2.1491281986236572, + "learning_rate": 3.769829851240747e-07, + "loss": 0.8173, + "step": 19303 + }, + { + "epoch": 0.9399849049253768, + "grad_norm": 2.072713851928711, + "learning_rate": 3.763736684509156e-07, + "loss": 0.8229, + "step": 19304 + }, + { + "epoch": 0.940033598714484, + "grad_norm": 1.8591763973236084, + "learning_rate": 3.75764839917252e-07, + "loss": 0.8224, + "step": 19305 + }, + { + "epoch": 0.9400822925035912, + "grad_norm": 1.5988675355911255, + "learning_rate": 3.751564995382251e-07, + "loss": 0.8891, + "step": 19306 + }, + { + "epoch": 0.9401309862926983, + "grad_norm": 0.09422627091407776, + "learning_rate": 3.7454864732897165e-07, + "loss": 0.6068, + "step": 19307 + }, + { + "epoch": 0.9401796800818055, + "grad_norm": 1.7964000701904297, + "learning_rate": 3.7394128330460853e-07, + "loss": 0.7677, + "step": 19308 + }, + { + "epoch": 0.9402283738709127, + "grad_norm": 1.5449622869491577, + "learning_rate": 3.7333440748024584e-07, + "loss": 0.8147, + "step": 19309 + }, + { + "epoch": 0.94027706766002, + "grad_norm": 1.1602914333343506, + "learning_rate": 3.727280198709782e-07, + "loss": 0.6369, + "step": 19310 + }, + { + "epoch": 0.9403257614491272, + "grad_norm": 2.282987594604492, + "learning_rate": 3.7212212049189346e-07, + "loss": 0.905, + "step": 19311 + }, + { + "epoch": 0.9403744552382344, + "grad_norm": 1.9291293621063232, + "learning_rate": 3.7151670935805963e-07, + "loss": 0.8556, + "step": 19312 + }, + { + "epoch": 0.9404231490273416, + "grad_norm": 2.2692415714263916, + "learning_rate": 3.709117864845357e-07, + "loss": 0.8476, + "step": 19313 + }, + { + "epoch": 0.9404718428164488, + "grad_norm": 1.597043514251709, + "learning_rate": 3.7030735188637203e-07, + "loss": 0.7358, + "step": 19314 + }, + { + "epoch": 0.940520536605556, + "grad_norm": 4.540280818939209, + "learning_rate": 3.6970340557860307e-07, + "loss": 0.8929, + "step": 19315 + }, + { + "epoch": 0.9405692303946631, + "grad_norm": 1.4848132133483887, + "learning_rate": 3.6909994757625023e-07, + "loss": 0.807, + "step": 19316 + }, + { + "epoch": 0.9406179241837703, + "grad_norm": 2.036484718322754, + "learning_rate": 3.684969778943259e-07, + "loss": 0.7575, + "step": 19317 + }, + { + "epoch": 0.9406666179728775, + "grad_norm": 1.7926849126815796, + "learning_rate": 3.678944965478315e-07, + "loss": 0.7386, + "step": 19318 + }, + { + "epoch": 0.9407153117619848, + "grad_norm": 1.3823410272598267, + "learning_rate": 3.672925035517483e-07, + "loss": 0.7772, + "step": 19319 + }, + { + "epoch": 0.940764005551092, + "grad_norm": 1.7025229930877686, + "learning_rate": 3.666909989210576e-07, + "loss": 0.7938, + "step": 19320 + }, + { + "epoch": 0.9408126993401992, + "grad_norm": 1.5556409358978271, + "learning_rate": 3.660899826707165e-07, + "loss": 0.8012, + "step": 19321 + }, + { + "epoch": 0.9408613931293064, + "grad_norm": 2.0837457180023193, + "learning_rate": 3.6548945481567733e-07, + "loss": 0.7771, + "step": 19322 + }, + { + "epoch": 0.9409100869184136, + "grad_norm": 1.8797228336334229, + "learning_rate": 3.648894153708793e-07, + "loss": 0.8937, + "step": 19323 + }, + { + "epoch": 0.9409587807075207, + "grad_norm": 1.78376042842865, + "learning_rate": 3.64289864351246e-07, + "loss": 0.8986, + "step": 19324 + }, + { + "epoch": 0.9410074744966279, + "grad_norm": 1.5338118076324463, + "learning_rate": 3.6369080177169227e-07, + "loss": 0.8127, + "step": 19325 + }, + { + "epoch": 0.9410561682857351, + "grad_norm": 1.1902437210083008, + "learning_rate": 3.630922276471194e-07, + "loss": 0.8341, + "step": 19326 + }, + { + "epoch": 0.9411048620748423, + "grad_norm": 1.2677876949310303, + "learning_rate": 3.624941419924222e-07, + "loss": 0.9209, + "step": 19327 + }, + { + "epoch": 0.9411535558639496, + "grad_norm": 1.8134983777999878, + "learning_rate": 3.618965448224687e-07, + "loss": 0.7968, + "step": 19328 + }, + { + "epoch": 0.9412022496530568, + "grad_norm": 1.5365052223205566, + "learning_rate": 3.6129943615213157e-07, + "loss": 0.7306, + "step": 19329 + }, + { + "epoch": 0.941250943442164, + "grad_norm": 1.4391618967056274, + "learning_rate": 3.6070281599626114e-07, + "loss": 0.8175, + "step": 19330 + }, + { + "epoch": 0.9412996372312712, + "grad_norm": 2.3136889934539795, + "learning_rate": 3.6010668436969875e-07, + "loss": 0.7765, + "step": 19331 + }, + { + "epoch": 0.9413483310203784, + "grad_norm": 4.65984582901001, + "learning_rate": 3.5951104128727486e-07, + "loss": 0.7525, + "step": 19332 + }, + { + "epoch": 0.9413970248094855, + "grad_norm": 0.09303305298089981, + "learning_rate": 3.5891588676379983e-07, + "loss": 0.6376, + "step": 19333 + }, + { + "epoch": 0.9414457185985927, + "grad_norm": 1.9415645599365234, + "learning_rate": 3.583212208140885e-07, + "loss": 0.8565, + "step": 19334 + }, + { + "epoch": 0.9414944123876999, + "grad_norm": 1.6643497943878174, + "learning_rate": 3.5772704345292454e-07, + "loss": 0.7946, + "step": 19335 + }, + { + "epoch": 0.9415431061768071, + "grad_norm": 2.079927921295166, + "learning_rate": 3.5713335469509394e-07, + "loss": 0.7625, + "step": 19336 + }, + { + "epoch": 0.9415917999659144, + "grad_norm": 1.6380348205566406, + "learning_rate": 3.565401545553582e-07, + "loss": 0.7492, + "step": 19337 + }, + { + "epoch": 0.9416404937550216, + "grad_norm": 1.1247206926345825, + "learning_rate": 3.5594744304848104e-07, + "loss": 0.7866, + "step": 19338 + }, + { + "epoch": 0.9416891875441288, + "grad_norm": 1.5048136711120605, + "learning_rate": 3.5535522018919963e-07, + "loss": 0.8837, + "step": 19339 + }, + { + "epoch": 0.941737881333236, + "grad_norm": 1.5520399808883667, + "learning_rate": 3.547634859922489e-07, + "loss": 0.7681, + "step": 19340 + }, + { + "epoch": 0.9417865751223431, + "grad_norm": 1.3047422170639038, + "learning_rate": 3.54172240472348e-07, + "loss": 0.703, + "step": 19341 + }, + { + "epoch": 0.9418352689114503, + "grad_norm": 1.2425438165664673, + "learning_rate": 3.5358148364420307e-07, + "loss": 0.7762, + "step": 19342 + }, + { + "epoch": 0.9418839627005575, + "grad_norm": 1.412361979484558, + "learning_rate": 3.5299121552251345e-07, + "loss": 0.8127, + "step": 19343 + }, + { + "epoch": 0.9419326564896647, + "grad_norm": 1.4485268592834473, + "learning_rate": 3.524014361219541e-07, + "loss": 0.7666, + "step": 19344 + }, + { + "epoch": 0.941981350278772, + "grad_norm": 3.8066468238830566, + "learning_rate": 3.518121454572021e-07, + "loss": 0.7855, + "step": 19345 + }, + { + "epoch": 0.9420300440678792, + "grad_norm": 1.860327124595642, + "learning_rate": 3.5122334354291466e-07, + "loss": 0.8371, + "step": 19346 + }, + { + "epoch": 0.9420787378569864, + "grad_norm": 1.2073838710784912, + "learning_rate": 3.506350303937378e-07, + "loss": 0.7827, + "step": 19347 + }, + { + "epoch": 0.9421274316460936, + "grad_norm": 1.7385902404785156, + "learning_rate": 3.5004720602430653e-07, + "loss": 0.7651, + "step": 19348 + }, + { + "epoch": 0.9421761254352008, + "grad_norm": 1.5753191709518433, + "learning_rate": 3.4945987044924024e-07, + "loss": 0.7744, + "step": 19349 + }, + { + "epoch": 0.9422248192243079, + "grad_norm": 1.3657512664794922, + "learning_rate": 3.4887302368315614e-07, + "loss": 0.8572, + "step": 19350 + }, + { + "epoch": 0.9422735130134151, + "grad_norm": 1.7394124269485474, + "learning_rate": 3.482866657406425e-07, + "loss": 0.7366, + "step": 19351 + }, + { + "epoch": 0.9423222068025223, + "grad_norm": 1.4511293172836304, + "learning_rate": 3.477007966362922e-07, + "loss": 0.7297, + "step": 19352 + }, + { + "epoch": 0.9423709005916295, + "grad_norm": 1.2148483991622925, + "learning_rate": 3.4711541638467794e-07, + "loss": 0.8386, + "step": 19353 + }, + { + "epoch": 0.9424195943807367, + "grad_norm": 2.6884851455688477, + "learning_rate": 3.46530525000357e-07, + "loss": 0.7502, + "step": 19354 + }, + { + "epoch": 0.942468288169844, + "grad_norm": 1.348971962928772, + "learning_rate": 3.4594612249788217e-07, + "loss": 0.8884, + "step": 19355 + }, + { + "epoch": 0.9425169819589512, + "grad_norm": 1.2593088150024414, + "learning_rate": 3.453622088917885e-07, + "loss": 0.9003, + "step": 19356 + }, + { + "epoch": 0.9425656757480584, + "grad_norm": 1.9074885845184326, + "learning_rate": 3.4477878419660437e-07, + "loss": 0.8099, + "step": 19357 + }, + { + "epoch": 0.9426143695371655, + "grad_norm": 1.6329994201660156, + "learning_rate": 3.4419584842683816e-07, + "loss": 0.9082, + "step": 19358 + }, + { + "epoch": 0.9426630633262727, + "grad_norm": 1.6908105611801147, + "learning_rate": 3.4361340159699384e-07, + "loss": 0.8378, + "step": 19359 + }, + { + "epoch": 0.9427117571153799, + "grad_norm": 1.505330204963684, + "learning_rate": 3.430314437215554e-07, + "loss": 0.807, + "step": 19360 + }, + { + "epoch": 0.9427604509044871, + "grad_norm": 2.0681521892547607, + "learning_rate": 3.424499748150045e-07, + "loss": 0.8575, + "step": 19361 + }, + { + "epoch": 0.9428091446935943, + "grad_norm": 1.7874690294265747, + "learning_rate": 3.418689948918008e-07, + "loss": 0.7656, + "step": 19362 + }, + { + "epoch": 0.9428578384827015, + "grad_norm": 2.18337082862854, + "learning_rate": 3.4128850396639935e-07, + "loss": 0.7602, + "step": 19363 + }, + { + "epoch": 0.9429065322718088, + "grad_norm": 1.3460583686828613, + "learning_rate": 3.407085020532397e-07, + "loss": 0.7939, + "step": 19364 + }, + { + "epoch": 0.942955226060916, + "grad_norm": 1.8610845804214478, + "learning_rate": 3.4012898916674807e-07, + "loss": 0.823, + "step": 19365 + }, + { + "epoch": 0.9430039198500231, + "grad_norm": 1.5347132682800293, + "learning_rate": 3.3954996532133966e-07, + "loss": 0.8118, + "step": 19366 + }, + { + "epoch": 0.9430526136391303, + "grad_norm": 2.4521942138671875, + "learning_rate": 3.3897143053142066e-07, + "loss": 0.6889, + "step": 19367 + }, + { + "epoch": 0.9431013074282375, + "grad_norm": 1.6759389638900757, + "learning_rate": 3.383933848113752e-07, + "loss": 0.8511, + "step": 19368 + }, + { + "epoch": 0.9431500012173447, + "grad_norm": 1.8106740713119507, + "learning_rate": 3.378158281755917e-07, + "loss": 0.8937, + "step": 19369 + }, + { + "epoch": 0.9431986950064519, + "grad_norm": 1.653836965560913, + "learning_rate": 3.3723876063842977e-07, + "loss": 0.7076, + "step": 19370 + }, + { + "epoch": 0.9432473887955591, + "grad_norm": 1.5105725526809692, + "learning_rate": 3.3666218221424685e-07, + "loss": 0.7761, + "step": 19371 + }, + { + "epoch": 0.9432960825846664, + "grad_norm": 1.8313084840774536, + "learning_rate": 3.360860929173848e-07, + "loss": 0.7377, + "step": 19372 + }, + { + "epoch": 0.9433447763737736, + "grad_norm": 1.4623942375183105, + "learning_rate": 3.355104927621766e-07, + "loss": 0.7242, + "step": 19373 + }, + { + "epoch": 0.9433934701628808, + "grad_norm": 2.12093186378479, + "learning_rate": 3.3493538176293526e-07, + "loss": 0.8853, + "step": 19374 + }, + { + "epoch": 0.9434421639519879, + "grad_norm": 1.5030182600021362, + "learning_rate": 3.343607599339671e-07, + "loss": 0.7795, + "step": 19375 + }, + { + "epoch": 0.9434908577410951, + "grad_norm": 1.6179752349853516, + "learning_rate": 3.33786627289574e-07, + "loss": 0.778, + "step": 19376 + }, + { + "epoch": 0.9435395515302023, + "grad_norm": 1.5493035316467285, + "learning_rate": 3.332129838440268e-07, + "loss": 0.8054, + "step": 19377 + }, + { + "epoch": 0.9435882453193095, + "grad_norm": 1.5198419094085693, + "learning_rate": 3.326398296116029e-07, + "loss": 0.7812, + "step": 19378 + }, + { + "epoch": 0.9436369391084167, + "grad_norm": 1.707608699798584, + "learning_rate": 3.320671646065532e-07, + "loss": 0.7076, + "step": 19379 + }, + { + "epoch": 0.9436856328975239, + "grad_norm": 2.143897533416748, + "learning_rate": 3.314949888431285e-07, + "loss": 0.814, + "step": 19380 + }, + { + "epoch": 0.9437343266866312, + "grad_norm": 8.551348686218262, + "learning_rate": 3.3092330233555736e-07, + "loss": 0.6872, + "step": 19381 + }, + { + "epoch": 0.9437830204757384, + "grad_norm": 1.3295247554779053, + "learning_rate": 3.30352105098064e-07, + "loss": 0.8052, + "step": 19382 + }, + { + "epoch": 0.9438317142648455, + "grad_norm": 1.8030128479003906, + "learning_rate": 3.2978139714485493e-07, + "loss": 0.8657, + "step": 19383 + }, + { + "epoch": 0.9438804080539527, + "grad_norm": 1.7606521844863892, + "learning_rate": 3.292111784901253e-07, + "loss": 0.8093, + "step": 19384 + }, + { + "epoch": 0.9439291018430599, + "grad_norm": 1.4604198932647705, + "learning_rate": 3.2864144914806604e-07, + "loss": 0.8703, + "step": 19385 + }, + { + "epoch": 0.9439777956321671, + "grad_norm": 1.5924456119537354, + "learning_rate": 3.280722091328392e-07, + "loss": 0.6882, + "step": 19386 + }, + { + "epoch": 0.9440264894212743, + "grad_norm": 1.5759186744689941, + "learning_rate": 3.2750345845861343e-07, + "loss": 0.8904, + "step": 19387 + }, + { + "epoch": 0.9440751832103815, + "grad_norm": 1.4172922372817993, + "learning_rate": 3.269351971395307e-07, + "loss": 0.7887, + "step": 19388 + }, + { + "epoch": 0.9441238769994887, + "grad_norm": 1.2952300310134888, + "learning_rate": 3.2636742518972866e-07, + "loss": 0.8096, + "step": 19389 + }, + { + "epoch": 0.944172570788596, + "grad_norm": 1.785457968711853, + "learning_rate": 3.258001426233315e-07, + "loss": 0.8491, + "step": 19390 + }, + { + "epoch": 0.9442212645777032, + "grad_norm": 1.589662790298462, + "learning_rate": 3.25233349454448e-07, + "loss": 0.8557, + "step": 19391 + }, + { + "epoch": 0.9442699583668103, + "grad_norm": 1.677251935005188, + "learning_rate": 3.246670456971823e-07, + "loss": 0.7952, + "step": 19392 + }, + { + "epoch": 0.9443186521559175, + "grad_norm": 1.2481597661972046, + "learning_rate": 3.241012313656122e-07, + "loss": 0.8072, + "step": 19393 + }, + { + "epoch": 0.9443673459450247, + "grad_norm": 1.7575095891952515, + "learning_rate": 3.2353590647382417e-07, + "loss": 0.8378, + "step": 19394 + }, + { + "epoch": 0.9444160397341319, + "grad_norm": 1.837786316871643, + "learning_rate": 3.229710710358691e-07, + "loss": 0.7455, + "step": 19395 + }, + { + "epoch": 0.9444647335232391, + "grad_norm": 1.5905841588974, + "learning_rate": 3.2240672506580696e-07, + "loss": 0.761, + "step": 19396 + }, + { + "epoch": 0.9445134273123463, + "grad_norm": 2.116879463195801, + "learning_rate": 3.218428685776687e-07, + "loss": 0.7743, + "step": 19397 + }, + { + "epoch": 0.9445621211014535, + "grad_norm": 1.573020100593567, + "learning_rate": 3.212795015854853e-07, + "loss": 0.7964, + "step": 19398 + }, + { + "epoch": 0.9446108148905608, + "grad_norm": 1.4585033655166626, + "learning_rate": 3.207166241032678e-07, + "loss": 0.8203, + "step": 19399 + }, + { + "epoch": 0.9446595086796679, + "grad_norm": 1.710442304611206, + "learning_rate": 3.2015423614501604e-07, + "loss": 0.8265, + "step": 19400 + }, + { + "epoch": 0.9447082024687751, + "grad_norm": 2.002929925918579, + "learning_rate": 3.195923377247279e-07, + "loss": 0.7963, + "step": 19401 + }, + { + "epoch": 0.9447568962578823, + "grad_norm": 1.7219778299331665, + "learning_rate": 3.190309288563698e-07, + "loss": 0.8553, + "step": 19402 + }, + { + "epoch": 0.9448055900469895, + "grad_norm": 1.293986439704895, + "learning_rate": 3.1847000955391285e-07, + "loss": 0.7215, + "step": 19403 + }, + { + "epoch": 0.9448542838360967, + "grad_norm": 1.4670323133468628, + "learning_rate": 3.179095798313103e-07, + "loss": 0.7576, + "step": 19404 + }, + { + "epoch": 0.9449029776252039, + "grad_norm": 1.7202950716018677, + "learning_rate": 3.173496397025e-07, + "loss": 0.869, + "step": 19405 + }, + { + "epoch": 0.9449516714143111, + "grad_norm": 1.572493553161621, + "learning_rate": 3.1679018918141293e-07, + "loss": 0.7979, + "step": 19406 + }, + { + "epoch": 0.9450003652034183, + "grad_norm": 1.7027868032455444, + "learning_rate": 3.1623122828196464e-07, + "loss": 0.7695, + "step": 19407 + }, + { + "epoch": 0.9450490589925254, + "grad_norm": 1.4396542310714722, + "learning_rate": 3.1567275701805733e-07, + "loss": 0.7811, + "step": 19408 + }, + { + "epoch": 0.9450977527816327, + "grad_norm": 1.5061702728271484, + "learning_rate": 3.151147754035866e-07, + "loss": 0.7924, + "step": 19409 + }, + { + "epoch": 0.9451464465707399, + "grad_norm": 1.8078067302703857, + "learning_rate": 3.145572834524324e-07, + "loss": 0.7377, + "step": 19410 + }, + { + "epoch": 0.9451951403598471, + "grad_norm": 1.4037703275680542, + "learning_rate": 3.14000281178457e-07, + "loss": 0.829, + "step": 19411 + }, + { + "epoch": 0.9452438341489543, + "grad_norm": 1.264643907546997, + "learning_rate": 3.134437685955205e-07, + "loss": 0.7432, + "step": 19412 + }, + { + "epoch": 0.9452925279380615, + "grad_norm": 1.5421102046966553, + "learning_rate": 3.1288774571746503e-07, + "loss": 0.7502, + "step": 19413 + }, + { + "epoch": 0.9453412217271687, + "grad_norm": 1.586248755455017, + "learning_rate": 3.1233221255812406e-07, + "loss": 0.7773, + "step": 19414 + }, + { + "epoch": 0.9453899155162759, + "grad_norm": 1.8500486612319946, + "learning_rate": 3.117771691313132e-07, + "loss": 0.9254, + "step": 19415 + }, + { + "epoch": 0.9454386093053831, + "grad_norm": 1.3898779153823853, + "learning_rate": 3.1122261545083907e-07, + "loss": 0.8001, + "step": 19416 + }, + { + "epoch": 0.9454873030944903, + "grad_norm": 4.930172443389893, + "learning_rate": 3.106685515304997e-07, + "loss": 0.8151, + "step": 19417 + }, + { + "epoch": 0.9455359968835975, + "grad_norm": 1.1815283298492432, + "learning_rate": 3.101149773840728e-07, + "loss": 0.7564, + "step": 19418 + }, + { + "epoch": 0.9455846906727047, + "grad_norm": 2.056912899017334, + "learning_rate": 3.095618930253319e-07, + "loss": 0.8135, + "step": 19419 + }, + { + "epoch": 0.9456333844618119, + "grad_norm": 2.4773659706115723, + "learning_rate": 3.0900929846803486e-07, + "loss": 0.807, + "step": 19420 + }, + { + "epoch": 0.9456820782509191, + "grad_norm": 1.1731712818145752, + "learning_rate": 3.084571937259284e-07, + "loss": 0.8061, + "step": 19421 + }, + { + "epoch": 0.9457307720400263, + "grad_norm": 1.5074862241744995, + "learning_rate": 3.079055788127416e-07, + "loss": 0.8234, + "step": 19422 + }, + { + "epoch": 0.9457794658291335, + "grad_norm": 1.4192382097244263, + "learning_rate": 3.073544537422013e-07, + "loss": 0.8425, + "step": 19423 + }, + { + "epoch": 0.9458281596182407, + "grad_norm": 1.4456241130828857, + "learning_rate": 3.0680381852801424e-07, + "loss": 0.7289, + "step": 19424 + }, + { + "epoch": 0.9458768534073478, + "grad_norm": 1.1512446403503418, + "learning_rate": 3.062536731838761e-07, + "loss": 0.8055, + "step": 19425 + }, + { + "epoch": 0.945925547196455, + "grad_norm": 4.45573616027832, + "learning_rate": 3.057040177234738e-07, + "loss": 0.7638, + "step": 19426 + }, + { + "epoch": 0.9459742409855623, + "grad_norm": 1.4834872484207153, + "learning_rate": 3.05154852160483e-07, + "loss": 0.6985, + "step": 19427 + }, + { + "epoch": 0.9460229347746695, + "grad_norm": 1.4692018032073975, + "learning_rate": 3.0460617650855506e-07, + "loss": 0.8005, + "step": 19428 + }, + { + "epoch": 0.9460716285637767, + "grad_norm": 2.5954298973083496, + "learning_rate": 3.040579907813479e-07, + "loss": 0.8015, + "step": 19429 + }, + { + "epoch": 0.9461203223528839, + "grad_norm": 0.09593882411718369, + "learning_rate": 3.035102949924951e-07, + "loss": 0.6163, + "step": 19430 + }, + { + "epoch": 0.9461690161419911, + "grad_norm": 2.6463124752044678, + "learning_rate": 3.029630891556168e-07, + "loss": 0.789, + "step": 19431 + }, + { + "epoch": 0.9462177099310983, + "grad_norm": 2.0935821533203125, + "learning_rate": 3.0241637328433107e-07, + "loss": 0.7406, + "step": 19432 + }, + { + "epoch": 0.9462664037202055, + "grad_norm": 1.4537615776062012, + "learning_rate": 3.018701473922292e-07, + "loss": 0.8414, + "step": 19433 + }, + { + "epoch": 0.9463150975093126, + "grad_norm": 1.4236282110214233, + "learning_rate": 3.01324411492907e-07, + "loss": 0.7329, + "step": 19434 + }, + { + "epoch": 0.9463637912984199, + "grad_norm": 1.3278487920761108, + "learning_rate": 3.007791655999337e-07, + "loss": 0.7852, + "step": 19435 + }, + { + "epoch": 0.9464124850875271, + "grad_norm": 1.6824606657028198, + "learning_rate": 3.0023440972687835e-07, + "loss": 0.7824, + "step": 19436 + }, + { + "epoch": 0.9464611788766343, + "grad_norm": 1.5063318014144897, + "learning_rate": 2.996901438872857e-07, + "loss": 0.7046, + "step": 19437 + }, + { + "epoch": 0.9465098726657415, + "grad_norm": 1.7671293020248413, + "learning_rate": 2.99146368094696e-07, + "loss": 0.7728, + "step": 19438 + }, + { + "epoch": 0.9465585664548487, + "grad_norm": 1.499320149421692, + "learning_rate": 2.9860308236263847e-07, + "loss": 0.8351, + "step": 19439 + }, + { + "epoch": 0.9466072602439559, + "grad_norm": 1.3957228660583496, + "learning_rate": 2.9806028670462674e-07, + "loss": 0.832, + "step": 19440 + }, + { + "epoch": 0.9466559540330631, + "grad_norm": 1.5010093450546265, + "learning_rate": 2.9751798113415886e-07, + "loss": 0.7538, + "step": 19441 + }, + { + "epoch": 0.9467046478221702, + "grad_norm": 1.5269114971160889, + "learning_rate": 2.9697616566472855e-07, + "loss": 0.8464, + "step": 19442 + }, + { + "epoch": 0.9467533416112774, + "grad_norm": 0.09199230372905731, + "learning_rate": 2.9643484030981384e-07, + "loss": 0.5593, + "step": 19443 + }, + { + "epoch": 0.9468020354003847, + "grad_norm": 1.6783448457717896, + "learning_rate": 2.9589400508287736e-07, + "loss": 0.8311, + "step": 19444 + }, + { + "epoch": 0.9468507291894919, + "grad_norm": 1.2647995948791504, + "learning_rate": 2.95353659997375e-07, + "loss": 0.7693, + "step": 19445 + }, + { + "epoch": 0.9468994229785991, + "grad_norm": 1.4788017272949219, + "learning_rate": 2.9481380506674484e-07, + "loss": 0.7683, + "step": 19446 + }, + { + "epoch": 0.9469481167677063, + "grad_norm": 3.8933334350585938, + "learning_rate": 2.9427444030442064e-07, + "loss": 0.8139, + "step": 19447 + }, + { + "epoch": 0.9469968105568135, + "grad_norm": 1.1665432453155518, + "learning_rate": 2.937355657238161e-07, + "loss": 0.7606, + "step": 19448 + }, + { + "epoch": 0.9470455043459207, + "grad_norm": 1.9313346147537231, + "learning_rate": 2.9319718133833386e-07, + "loss": 0.7427, + "step": 19449 + }, + { + "epoch": 0.9470941981350279, + "grad_norm": 2.3397297859191895, + "learning_rate": 2.9265928716137204e-07, + "loss": 0.8248, + "step": 19450 + }, + { + "epoch": 0.947142891924135, + "grad_norm": 1.7288260459899902, + "learning_rate": 2.921218832063044e-07, + "loss": 0.787, + "step": 19451 + }, + { + "epoch": 0.9471915857132422, + "grad_norm": 2.4214346408843994, + "learning_rate": 2.915849694865025e-07, + "loss": 0.7728, + "step": 19452 + }, + { + "epoch": 0.9472402795023495, + "grad_norm": 1.6501790285110474, + "learning_rate": 2.9104854601532227e-07, + "loss": 0.8214, + "step": 19453 + }, + { + "epoch": 0.9472889732914567, + "grad_norm": 1.6789695024490356, + "learning_rate": 2.9051261280610423e-07, + "loss": 0.7094, + "step": 19454 + }, + { + "epoch": 0.9473376670805639, + "grad_norm": 1.732480764389038, + "learning_rate": 2.899771698721843e-07, + "loss": 0.8134, + "step": 19455 + }, + { + "epoch": 0.9473863608696711, + "grad_norm": 2.2501895427703857, + "learning_rate": 2.894422172268785e-07, + "loss": 0.7919, + "step": 19456 + }, + { + "epoch": 0.9474350546587783, + "grad_norm": 1.5397790670394897, + "learning_rate": 2.889077548834962e-07, + "loss": 0.7904, + "step": 19457 + }, + { + "epoch": 0.9474837484478855, + "grad_norm": 1.7703522443771362, + "learning_rate": 2.883737828553268e-07, + "loss": 0.7795, + "step": 19458 + }, + { + "epoch": 0.9475324422369926, + "grad_norm": 2.7932565212249756, + "learning_rate": 2.878403011556619e-07, + "loss": 0.7714, + "step": 19459 + }, + { + "epoch": 0.9475811360260998, + "grad_norm": 1.286899447441101, + "learning_rate": 2.873073097977641e-07, + "loss": 0.8505, + "step": 19460 + }, + { + "epoch": 0.947629829815207, + "grad_norm": 1.6695846319198608, + "learning_rate": 2.8677480879489404e-07, + "loss": 0.9244, + "step": 19461 + }, + { + "epoch": 0.9476785236043143, + "grad_norm": 1.866632103919983, + "learning_rate": 2.8624279816029885e-07, + "loss": 0.834, + "step": 19462 + }, + { + "epoch": 0.9477272173934215, + "grad_norm": 3.0922021865844727, + "learning_rate": 2.857112779072102e-07, + "loss": 0.8346, + "step": 19463 + }, + { + "epoch": 0.9477759111825287, + "grad_norm": 1.5852456092834473, + "learning_rate": 2.85180248048853e-07, + "loss": 0.7624, + "step": 19464 + }, + { + "epoch": 0.9478246049716359, + "grad_norm": 1.7434337139129639, + "learning_rate": 2.846497085984323e-07, + "loss": 0.7463, + "step": 19465 + }, + { + "epoch": 0.9478732987607431, + "grad_norm": 1.6117198467254639, + "learning_rate": 2.8411965956914866e-07, + "loss": 0.7403, + "step": 19466 + }, + { + "epoch": 0.9479219925498502, + "grad_norm": 1.443908929824829, + "learning_rate": 2.8359010097418705e-07, + "loss": 0.8376, + "step": 19467 + }, + { + "epoch": 0.9479706863389574, + "grad_norm": 1.841007947921753, + "learning_rate": 2.8306103282671917e-07, + "loss": 0.7888, + "step": 19468 + }, + { + "epoch": 0.9480193801280646, + "grad_norm": 1.3353697061538696, + "learning_rate": 2.8253245513990336e-07, + "loss": 0.8247, + "step": 19469 + }, + { + "epoch": 0.9480680739171718, + "grad_norm": 1.2868032455444336, + "learning_rate": 2.8200436792689357e-07, + "loss": 0.8875, + "step": 19470 + }, + { + "epoch": 0.9481167677062791, + "grad_norm": 1.408158302307129, + "learning_rate": 2.814767712008215e-07, + "loss": 0.8718, + "step": 19471 + }, + { + "epoch": 0.9481654614953863, + "grad_norm": 1.490302562713623, + "learning_rate": 2.8094966497481224e-07, + "loss": 0.7918, + "step": 19472 + }, + { + "epoch": 0.9482141552844935, + "grad_norm": 1.8774762153625488, + "learning_rate": 2.8042304926197973e-07, + "loss": 0.8608, + "step": 19473 + }, + { + "epoch": 0.9482628490736007, + "grad_norm": 1.6094419956207275, + "learning_rate": 2.7989692407542015e-07, + "loss": 0.805, + "step": 19474 + }, + { + "epoch": 0.9483115428627079, + "grad_norm": 1.9572036266326904, + "learning_rate": 2.79371289428223e-07, + "loss": 0.8368, + "step": 19475 + }, + { + "epoch": 0.948360236651815, + "grad_norm": 1.4484362602233887, + "learning_rate": 2.7884614533346457e-07, + "loss": 0.8146, + "step": 19476 + }, + { + "epoch": 0.9484089304409222, + "grad_norm": 1.7632725238800049, + "learning_rate": 2.783214918042032e-07, + "loss": 0.6826, + "step": 19477 + }, + { + "epoch": 0.9484576242300294, + "grad_norm": 1.9318279027938843, + "learning_rate": 2.777973288534952e-07, + "loss": 0.8645, + "step": 19478 + }, + { + "epoch": 0.9485063180191367, + "grad_norm": 1.5173497200012207, + "learning_rate": 2.772736564943768e-07, + "loss": 0.7883, + "step": 19479 + }, + { + "epoch": 0.9485550118082439, + "grad_norm": 1.3503658771514893, + "learning_rate": 2.76750474739873e-07, + "loss": 0.8299, + "step": 19480 + }, + { + "epoch": 0.9486037055973511, + "grad_norm": 1.3582234382629395, + "learning_rate": 2.7622778360300253e-07, + "loss": 0.7515, + "step": 19481 + }, + { + "epoch": 0.9486523993864583, + "grad_norm": 0.10080870985984802, + "learning_rate": 2.7570558309676143e-07, + "loss": 0.6038, + "step": 19482 + }, + { + "epoch": 0.9487010931755655, + "grad_norm": 2.6085407733917236, + "learning_rate": 2.7518387323414386e-07, + "loss": 0.8171, + "step": 19483 + }, + { + "epoch": 0.9487497869646726, + "grad_norm": 1.5410727262496948, + "learning_rate": 2.7466265402812387e-07, + "loss": 0.7125, + "step": 19484 + }, + { + "epoch": 0.9487984807537798, + "grad_norm": 1.7046537399291992, + "learning_rate": 2.7414192549167106e-07, + "loss": 0.7851, + "step": 19485 + }, + { + "epoch": 0.948847174542887, + "grad_norm": 0.09545926004648209, + "learning_rate": 2.736216876377351e-07, + "loss": 0.62, + "step": 19486 + }, + { + "epoch": 0.9488958683319942, + "grad_norm": 1.81297767162323, + "learning_rate": 2.73101940479259e-07, + "loss": 0.7832, + "step": 19487 + }, + { + "epoch": 0.9489445621211015, + "grad_norm": 1.6425288915634155, + "learning_rate": 2.725826840291701e-07, + "loss": 0.7347, + "step": 19488 + }, + { + "epoch": 0.9489932559102087, + "grad_norm": 1.7662984132766724, + "learning_rate": 2.720639183003848e-07, + "loss": 0.8445, + "step": 19489 + }, + { + "epoch": 0.9490419496993159, + "grad_norm": 4.378505706787109, + "learning_rate": 2.715456433058106e-07, + "loss": 0.7898, + "step": 19490 + }, + { + "epoch": 0.9490906434884231, + "grad_norm": 1.3549243211746216, + "learning_rate": 2.710278590583348e-07, + "loss": 0.6311, + "step": 19491 + }, + { + "epoch": 0.9491393372775303, + "grad_norm": 1.4970998764038086, + "learning_rate": 2.7051056557084286e-07, + "loss": 0.8143, + "step": 19492 + }, + { + "epoch": 0.9491880310666374, + "grad_norm": 2.1125802993774414, + "learning_rate": 2.699937628561955e-07, + "loss": 0.7566, + "step": 19493 + }, + { + "epoch": 0.9492367248557446, + "grad_norm": 1.6215850114822388, + "learning_rate": 2.6947745092725577e-07, + "loss": 0.8095, + "step": 19494 + }, + { + "epoch": 0.9492854186448518, + "grad_norm": 1.7201451063156128, + "learning_rate": 2.6896162979686004e-07, + "loss": 0.8167, + "step": 19495 + }, + { + "epoch": 0.949334112433959, + "grad_norm": 1.7922985553741455, + "learning_rate": 2.6844629947784693e-07, + "loss": 0.7649, + "step": 19496 + }, + { + "epoch": 0.9493828062230663, + "grad_norm": 2.4069032669067383, + "learning_rate": 2.6793145998302847e-07, + "loss": 0.7879, + "step": 19497 + }, + { + "epoch": 0.9494315000121735, + "grad_norm": 1.9408519268035889, + "learning_rate": 2.6741711132521443e-07, + "loss": 0.7084, + "step": 19498 + }, + { + "epoch": 0.9494801938012807, + "grad_norm": 0.10505741089582443, + "learning_rate": 2.6690325351719894e-07, + "loss": 0.5916, + "step": 19499 + }, + { + "epoch": 0.9495288875903879, + "grad_norm": 6.621381759643555, + "learning_rate": 2.663898865717629e-07, + "loss": 0.8121, + "step": 19500 + }, + { + "epoch": 0.949577581379495, + "grad_norm": 0.09608332812786102, + "learning_rate": 2.6587701050168057e-07, + "loss": 0.6631, + "step": 19501 + }, + { + "epoch": 0.9496262751686022, + "grad_norm": 1.5979118347167969, + "learning_rate": 2.65364625319704e-07, + "loss": 0.875, + "step": 19502 + }, + { + "epoch": 0.9496749689577094, + "grad_norm": 2.2780444622039795, + "learning_rate": 2.648527310385851e-07, + "loss": 0.8049, + "step": 19503 + }, + { + "epoch": 0.9497236627468166, + "grad_norm": 2.1809003353118896, + "learning_rate": 2.6434132767104935e-07, + "loss": 0.8545, + "step": 19504 + }, + { + "epoch": 0.9497723565359238, + "grad_norm": 1.4443105459213257, + "learning_rate": 2.6383041522982653e-07, + "loss": 0.8021, + "step": 19505 + }, + { + "epoch": 0.9498210503250311, + "grad_norm": 1.4764140844345093, + "learning_rate": 2.6331999372761986e-07, + "loss": 0.8493, + "step": 19506 + }, + { + "epoch": 0.9498697441141383, + "grad_norm": 2.128690481185913, + "learning_rate": 2.628100631771258e-07, + "loss": 0.9543, + "step": 19507 + }, + { + "epoch": 0.9499184379032455, + "grad_norm": 1.1967864036560059, + "learning_rate": 2.6230062359103635e-07, + "loss": 0.8005, + "step": 19508 + }, + { + "epoch": 0.9499671316923527, + "grad_norm": 1.918337345123291, + "learning_rate": 2.617916749820126e-07, + "loss": 0.845, + "step": 19509 + }, + { + "epoch": 0.9500158254814598, + "grad_norm": 1.451135277748108, + "learning_rate": 2.6128321736272666e-07, + "loss": 0.7504, + "step": 19510 + }, + { + "epoch": 0.950064519270567, + "grad_norm": 1.6818331480026245, + "learning_rate": 2.60775250745815e-07, + "loss": 0.7804, + "step": 19511 + }, + { + "epoch": 0.9501132130596742, + "grad_norm": 2.7978053092956543, + "learning_rate": 2.6026777514392087e-07, + "loss": 0.7671, + "step": 19512 + }, + { + "epoch": 0.9501619068487814, + "grad_norm": 1.9908292293548584, + "learning_rate": 2.597607905696653e-07, + "loss": 0.7955, + "step": 19513 + }, + { + "epoch": 0.9502106006378886, + "grad_norm": 1.5780044794082642, + "learning_rate": 2.592542970356604e-07, + "loss": 0.8235, + "step": 19514 + }, + { + "epoch": 0.9502592944269959, + "grad_norm": 1.4068900346755981, + "learning_rate": 2.58748294554505e-07, + "loss": 0.8675, + "step": 19515 + }, + { + "epoch": 0.9503079882161031, + "grad_norm": 0.09699293226003647, + "learning_rate": 2.582427831387868e-07, + "loss": 0.5815, + "step": 19516 + }, + { + "epoch": 0.9503566820052103, + "grad_norm": 1.9846762418746948, + "learning_rate": 2.57737762801078e-07, + "loss": 0.8809, + "step": 19517 + }, + { + "epoch": 0.9504053757943174, + "grad_norm": 1.7753022909164429, + "learning_rate": 2.572332335539396e-07, + "loss": 0.7398, + "step": 19518 + }, + { + "epoch": 0.9504540695834246, + "grad_norm": 0.09311448782682419, + "learning_rate": 2.5672919540992825e-07, + "loss": 0.5961, + "step": 19519 + }, + { + "epoch": 0.9505027633725318, + "grad_norm": 1.681788682937622, + "learning_rate": 2.5622564838157617e-07, + "loss": 0.7739, + "step": 19520 + }, + { + "epoch": 0.950551457161639, + "grad_norm": 1.3649265766143799, + "learning_rate": 2.557225924814133e-07, + "loss": 0.8834, + "step": 19521 + }, + { + "epoch": 0.9506001509507462, + "grad_norm": 1.6909297704696655, + "learning_rate": 2.5522002772194966e-07, + "loss": 0.8885, + "step": 19522 + }, + { + "epoch": 0.9506488447398534, + "grad_norm": 1.3728042840957642, + "learning_rate": 2.5471795411568854e-07, + "loss": 0.8293, + "step": 19523 + }, + { + "epoch": 0.9506975385289607, + "grad_norm": 0.11067768931388855, + "learning_rate": 2.5421637167512e-07, + "loss": 0.5942, + "step": 19524 + }, + { + "epoch": 0.9507462323180679, + "grad_norm": 1.431104063987732, + "learning_rate": 2.5371528041271855e-07, + "loss": 0.8182, + "step": 19525 + }, + { + "epoch": 0.950794926107175, + "grad_norm": 1.8840793371200562, + "learning_rate": 2.5321468034094967e-07, + "loss": 0.8054, + "step": 19526 + }, + { + "epoch": 0.9508436198962822, + "grad_norm": 1.632792353630066, + "learning_rate": 2.5271457147226566e-07, + "loss": 0.7962, + "step": 19527 + }, + { + "epoch": 0.9508923136853894, + "grad_norm": 2.1141810417175293, + "learning_rate": 2.522149538191077e-07, + "loss": 0.869, + "step": 19528 + }, + { + "epoch": 0.9509410074744966, + "grad_norm": 2.564633846282959, + "learning_rate": 2.5171582739390133e-07, + "loss": 0.7982, + "step": 19529 + }, + { + "epoch": 0.9509897012636038, + "grad_norm": 1.4975762367248535, + "learning_rate": 2.512171922090678e-07, + "loss": 0.8643, + "step": 19530 + }, + { + "epoch": 0.951038395052711, + "grad_norm": 1.3384212255477905, + "learning_rate": 2.507190482770061e-07, + "loss": 0.7592, + "step": 19531 + }, + { + "epoch": 0.9510870888418183, + "grad_norm": 1.9147285223007202, + "learning_rate": 2.502213956101063e-07, + "loss": 0.7739, + "step": 19532 + }, + { + "epoch": 0.9511357826309255, + "grad_norm": 1.3775990009307861, + "learning_rate": 2.49724234220754e-07, + "loss": 0.7424, + "step": 19533 + }, + { + "epoch": 0.9511844764200327, + "grad_norm": 5.78030252456665, + "learning_rate": 2.492275641213082e-07, + "loss": 0.8593, + "step": 19534 + }, + { + "epoch": 0.9512331702091398, + "grad_norm": 1.8205064535140991, + "learning_rate": 2.487313853241302e-07, + "loss": 0.7902, + "step": 19535 + }, + { + "epoch": 0.951281863998247, + "grad_norm": 2.5769588947296143, + "learning_rate": 2.48235697841559e-07, + "loss": 0.7743, + "step": 19536 + }, + { + "epoch": 0.9513305577873542, + "grad_norm": 1.4083408117294312, + "learning_rate": 2.4774050168592467e-07, + "loss": 0.8159, + "step": 19537 + }, + { + "epoch": 0.9513792515764614, + "grad_norm": 1.406715989112854, + "learning_rate": 2.472457968695485e-07, + "loss": 0.8547, + "step": 19538 + }, + { + "epoch": 0.9514279453655686, + "grad_norm": 2.94822359085083, + "learning_rate": 2.467515834047318e-07, + "loss": 0.8269, + "step": 19539 + }, + { + "epoch": 0.9514766391546758, + "grad_norm": 1.45737886428833, + "learning_rate": 2.462578613037714e-07, + "loss": 0.8255, + "step": 19540 + }, + { + "epoch": 0.951525332943783, + "grad_norm": 1.626414179801941, + "learning_rate": 2.457646305789485e-07, + "loss": 0.8324, + "step": 19541 + }, + { + "epoch": 0.9515740267328903, + "grad_norm": 1.42860746383667, + "learning_rate": 2.4527189124253116e-07, + "loss": 0.8633, + "step": 19542 + }, + { + "epoch": 0.9516227205219974, + "grad_norm": 8.243858337402344, + "learning_rate": 2.4477964330677837e-07, + "loss": 0.7762, + "step": 19543 + }, + { + "epoch": 0.9516714143111046, + "grad_norm": 1.393053412437439, + "learning_rate": 2.4428788678392934e-07, + "loss": 0.8111, + "step": 19544 + }, + { + "epoch": 0.9517201081002118, + "grad_norm": 1.6223456859588623, + "learning_rate": 2.437966216862231e-07, + "loss": 0.7403, + "step": 19545 + }, + { + "epoch": 0.951768801889319, + "grad_norm": 2.3324410915374756, + "learning_rate": 2.4330584802587654e-07, + "loss": 0.8454, + "step": 19546 + }, + { + "epoch": 0.9518174956784262, + "grad_norm": 1.5406278371810913, + "learning_rate": 2.428155658150977e-07, + "loss": 0.9052, + "step": 19547 + }, + { + "epoch": 0.9518661894675334, + "grad_norm": 1.82416832447052, + "learning_rate": 2.4232577506608345e-07, + "loss": 0.8218, + "step": 19548 + }, + { + "epoch": 0.9519148832566406, + "grad_norm": 3.3151724338531494, + "learning_rate": 2.418364757910174e-07, + "loss": 0.7836, + "step": 19549 + }, + { + "epoch": 0.9519635770457479, + "grad_norm": 0.0948411226272583, + "learning_rate": 2.413476680020721e-07, + "loss": 0.5114, + "step": 19550 + }, + { + "epoch": 0.9520122708348551, + "grad_norm": 1.6124626398086548, + "learning_rate": 2.408593517114022e-07, + "loss": 0.7782, + "step": 19551 + }, + { + "epoch": 0.9520609646239622, + "grad_norm": 1.5090545415878296, + "learning_rate": 2.4037152693116016e-07, + "loss": 0.7537, + "step": 19552 + }, + { + "epoch": 0.9521096584130694, + "grad_norm": 1.9882643222808838, + "learning_rate": 2.3988419367347417e-07, + "loss": 0.8022, + "step": 19553 + }, + { + "epoch": 0.9521583522021766, + "grad_norm": 0.09592173248529434, + "learning_rate": 2.393973519504744e-07, + "loss": 0.5377, + "step": 19554 + }, + { + "epoch": 0.9522070459912838, + "grad_norm": 1.6491611003875732, + "learning_rate": 2.389110017742646e-07, + "loss": 0.739, + "step": 19555 + }, + { + "epoch": 0.952255739780391, + "grad_norm": 1.254874587059021, + "learning_rate": 2.384251431569462e-07, + "loss": 0.8972, + "step": 19556 + }, + { + "epoch": 0.9523044335694982, + "grad_norm": 1.5636773109436035, + "learning_rate": 2.3793977611060504e-07, + "loss": 0.8132, + "step": 19557 + }, + { + "epoch": 0.9523531273586054, + "grad_norm": 1.5128363370895386, + "learning_rate": 2.3745490064731146e-07, + "loss": 0.8054, + "step": 19558 + }, + { + "epoch": 0.9524018211477127, + "grad_norm": 3.083580493927002, + "learning_rate": 2.369705167791314e-07, + "loss": 0.8349, + "step": 19559 + }, + { + "epoch": 0.9524505149368198, + "grad_norm": 2.5112624168395996, + "learning_rate": 2.3648662451811076e-07, + "loss": 0.7663, + "step": 19560 + }, + { + "epoch": 0.952499208725927, + "grad_norm": 1.3224016427993774, + "learning_rate": 2.3600322387628882e-07, + "loss": 0.6767, + "step": 19561 + }, + { + "epoch": 0.9525479025150342, + "grad_norm": 1.4576177597045898, + "learning_rate": 2.3552031486568706e-07, + "loss": 0.7788, + "step": 19562 + }, + { + "epoch": 0.9525965963041414, + "grad_norm": 0.09338930249214172, + "learning_rate": 2.3503789749831807e-07, + "loss": 0.6125, + "step": 19563 + }, + { + "epoch": 0.9526452900932486, + "grad_norm": 2.0254178047180176, + "learning_rate": 2.3455597178618562e-07, + "loss": 0.8163, + "step": 19564 + }, + { + "epoch": 0.9526939838823558, + "grad_norm": 0.09620533138513565, + "learning_rate": 2.3407453774127564e-07, + "loss": 0.7004, + "step": 19565 + }, + { + "epoch": 0.952742677671463, + "grad_norm": 1.8919968605041504, + "learning_rate": 2.3359359537556303e-07, + "loss": 0.862, + "step": 19566 + }, + { + "epoch": 0.9527913714605702, + "grad_norm": 1.8767919540405273, + "learning_rate": 2.3311314470101153e-07, + "loss": 0.7616, + "step": 19567 + }, + { + "epoch": 0.9528400652496773, + "grad_norm": 1.7177754640579224, + "learning_rate": 2.3263318572957605e-07, + "loss": 0.7013, + "step": 19568 + }, + { + "epoch": 0.9528887590387846, + "grad_norm": 1.321758508682251, + "learning_rate": 2.3215371847318703e-07, + "loss": 0.6807, + "step": 19569 + }, + { + "epoch": 0.9529374528278918, + "grad_norm": 2.0204529762268066, + "learning_rate": 2.3167474294377934e-07, + "loss": 0.8564, + "step": 19570 + }, + { + "epoch": 0.952986146616999, + "grad_norm": 1.3692225217819214, + "learning_rate": 2.311962591532657e-07, + "loss": 0.8403, + "step": 19571 + }, + { + "epoch": 0.9530348404061062, + "grad_norm": 1.5343223810195923, + "learning_rate": 2.3071826711354774e-07, + "loss": 0.819, + "step": 19572 + }, + { + "epoch": 0.9530835341952134, + "grad_norm": 1.6718910932540894, + "learning_rate": 2.3024076683651365e-07, + "loss": 0.7986, + "step": 19573 + }, + { + "epoch": 0.9531322279843206, + "grad_norm": 1.2787449359893799, + "learning_rate": 2.2976375833404509e-07, + "loss": 0.8454, + "step": 19574 + }, + { + "epoch": 0.9531809217734278, + "grad_norm": 1.3122013807296753, + "learning_rate": 2.2928724161800364e-07, + "loss": 0.8385, + "step": 19575 + }, + { + "epoch": 0.953229615562535, + "grad_norm": 1.185128092765808, + "learning_rate": 2.2881121670024432e-07, + "loss": 0.7615, + "step": 19576 + }, + { + "epoch": 0.9532783093516422, + "grad_norm": 1.5257165431976318, + "learning_rate": 2.2833568359261093e-07, + "loss": 0.8405, + "step": 19577 + }, + { + "epoch": 0.9533270031407494, + "grad_norm": 1.3256525993347168, + "learning_rate": 2.278606423069274e-07, + "loss": 0.8542, + "step": 19578 + }, + { + "epoch": 0.9533756969298566, + "grad_norm": 1.661234974861145, + "learning_rate": 2.2738609285501534e-07, + "loss": 0.763, + "step": 19579 + }, + { + "epoch": 0.9534243907189638, + "grad_norm": 1.5280267000198364, + "learning_rate": 2.2691203524867644e-07, + "loss": 0.8123, + "step": 19580 + }, + { + "epoch": 0.953473084508071, + "grad_norm": 2.32302188873291, + "learning_rate": 2.264384694997035e-07, + "loss": 0.8932, + "step": 19581 + }, + { + "epoch": 0.9535217782971782, + "grad_norm": 1.5521485805511475, + "learning_rate": 2.2596539561987597e-07, + "loss": 0.86, + "step": 19582 + }, + { + "epoch": 0.9535704720862854, + "grad_norm": 1.751682996749878, + "learning_rate": 2.254928136209622e-07, + "loss": 0.762, + "step": 19583 + }, + { + "epoch": 0.9536191658753926, + "grad_norm": 1.7503905296325684, + "learning_rate": 2.250207235147195e-07, + "loss": 0.8578, + "step": 19584 + }, + { + "epoch": 0.9536678596644997, + "grad_norm": 2.1948421001434326, + "learning_rate": 2.245491253128873e-07, + "loss": 0.7974, + "step": 19585 + }, + { + "epoch": 0.953716553453607, + "grad_norm": 1.732733130455017, + "learning_rate": 2.240780190271985e-07, + "loss": 0.8751, + "step": 19586 + }, + { + "epoch": 0.9537652472427142, + "grad_norm": 1.3582144975662231, + "learning_rate": 2.2360740466937259e-07, + "loss": 0.664, + "step": 19587 + }, + { + "epoch": 0.9538139410318214, + "grad_norm": 1.6075793504714966, + "learning_rate": 2.2313728225111574e-07, + "loss": 0.7387, + "step": 19588 + }, + { + "epoch": 0.9538626348209286, + "grad_norm": 2.373680591583252, + "learning_rate": 2.2266765178412087e-07, + "loss": 0.8138, + "step": 19589 + }, + { + "epoch": 0.9539113286100358, + "grad_norm": 1.6119415760040283, + "learning_rate": 2.2219851328007414e-07, + "loss": 0.7629, + "step": 19590 + }, + { + "epoch": 0.953960022399143, + "grad_norm": 1.3929164409637451, + "learning_rate": 2.217298667506418e-07, + "loss": 0.7546, + "step": 19591 + }, + { + "epoch": 0.9540087161882502, + "grad_norm": 2.1906888484954834, + "learning_rate": 2.2126171220748117e-07, + "loss": 0.842, + "step": 19592 + }, + { + "epoch": 0.9540574099773574, + "grad_norm": 1.611383318901062, + "learning_rate": 2.207940496622385e-07, + "loss": 0.8278, + "step": 19593 + }, + { + "epoch": 0.9541061037664645, + "grad_norm": 1.4682154655456543, + "learning_rate": 2.2032687912654894e-07, + "loss": 0.8202, + "step": 19594 + }, + { + "epoch": 0.9541547975555718, + "grad_norm": 2.1513917446136475, + "learning_rate": 2.1986020061202985e-07, + "loss": 0.7326, + "step": 19595 + }, + { + "epoch": 0.954203491344679, + "grad_norm": 1.9286212921142578, + "learning_rate": 2.1939401413029417e-07, + "loss": 0.8539, + "step": 19596 + }, + { + "epoch": 0.9542521851337862, + "grad_norm": 1.659414291381836, + "learning_rate": 2.1892831969293482e-07, + "loss": 0.8811, + "step": 19597 + }, + { + "epoch": 0.9543008789228934, + "grad_norm": 1.3427636623382568, + "learning_rate": 2.184631173115359e-07, + "loss": 0.8816, + "step": 19598 + }, + { + "epoch": 0.9543495727120006, + "grad_norm": 1.8904918432235718, + "learning_rate": 2.179984069976726e-07, + "loss": 0.7711, + "step": 19599 + }, + { + "epoch": 0.9543982665011078, + "grad_norm": 2.8473994731903076, + "learning_rate": 2.1753418876290012e-07, + "loss": 0.8181, + "step": 19600 + }, + { + "epoch": 0.954446960290215, + "grad_norm": 1.275994062423706, + "learning_rate": 2.1707046261877142e-07, + "loss": 0.7734, + "step": 19601 + }, + { + "epoch": 0.9544956540793221, + "grad_norm": 1.4333808422088623, + "learning_rate": 2.1660722857681505e-07, + "loss": 0.7389, + "step": 19602 + }, + { + "epoch": 0.9545443478684293, + "grad_norm": 1.6109172105789185, + "learning_rate": 2.161444866485618e-07, + "loss": 0.7575, + "step": 19603 + }, + { + "epoch": 0.9545930416575366, + "grad_norm": 1.2688865661621094, + "learning_rate": 2.156822368455136e-07, + "loss": 0.7558, + "step": 19604 + }, + { + "epoch": 0.9546417354466438, + "grad_norm": 2.0039103031158447, + "learning_rate": 2.1522047917917898e-07, + "loss": 0.732, + "step": 19605 + }, + { + "epoch": 0.954690429235751, + "grad_norm": 1.4116723537445068, + "learning_rate": 2.1475921366103325e-07, + "loss": 0.7338, + "step": 19606 + }, + { + "epoch": 0.9547391230248582, + "grad_norm": 1.6702255010604858, + "learning_rate": 2.1429844030256053e-07, + "loss": 0.8675, + "step": 19607 + }, + { + "epoch": 0.9547878168139654, + "grad_norm": 2.449873447418213, + "learning_rate": 2.138381591152161e-07, + "loss": 0.7891, + "step": 19608 + }, + { + "epoch": 0.9548365106030726, + "grad_norm": 1.5919917821884155, + "learning_rate": 2.133783701104508e-07, + "loss": 0.7925, + "step": 19609 + }, + { + "epoch": 0.9548852043921798, + "grad_norm": 1.787973403930664, + "learning_rate": 2.129190732997044e-07, + "loss": 0.8739, + "step": 19610 + }, + { + "epoch": 0.9549338981812869, + "grad_norm": 1.9496374130249023, + "learning_rate": 2.1246026869439883e-07, + "loss": 0.7568, + "step": 19611 + }, + { + "epoch": 0.9549825919703941, + "grad_norm": 2.3018598556518555, + "learning_rate": 2.1200195630594943e-07, + "loss": 0.8282, + "step": 19612 + }, + { + "epoch": 0.9550312857595014, + "grad_norm": 1.4446276426315308, + "learning_rate": 2.1154413614575375e-07, + "loss": 0.6107, + "step": 19613 + }, + { + "epoch": 0.9550799795486086, + "grad_norm": 1.4121235609054565, + "learning_rate": 2.1108680822520267e-07, + "loss": 0.7422, + "step": 19614 + }, + { + "epoch": 0.9551286733377158, + "grad_norm": 1.5671391487121582, + "learning_rate": 2.106299725556715e-07, + "loss": 0.8783, + "step": 19615 + }, + { + "epoch": 0.955177367126823, + "grad_norm": 1.969377875328064, + "learning_rate": 2.1017362914852457e-07, + "loss": 0.8173, + "step": 19616 + }, + { + "epoch": 0.9552260609159302, + "grad_norm": 1.4176194667816162, + "learning_rate": 2.0971777801511272e-07, + "loss": 0.8547, + "step": 19617 + }, + { + "epoch": 0.9552747547050374, + "grad_norm": 1.242920994758606, + "learning_rate": 2.092624191667758e-07, + "loss": 0.8738, + "step": 19618 + }, + { + "epoch": 0.9553234484941445, + "grad_norm": 1.5249239206314087, + "learning_rate": 2.0880755261484254e-07, + "loss": 0.9438, + "step": 19619 + }, + { + "epoch": 0.9553721422832517, + "grad_norm": 1.4084495306015015, + "learning_rate": 2.0835317837062165e-07, + "loss": 0.7262, + "step": 19620 + }, + { + "epoch": 0.955420836072359, + "grad_norm": 1.69271719455719, + "learning_rate": 2.0789929644542185e-07, + "loss": 0.8185, + "step": 19621 + }, + { + "epoch": 0.9554695298614662, + "grad_norm": 1.5110093355178833, + "learning_rate": 2.074459068505319e-07, + "loss": 0.8224, + "step": 19622 + }, + { + "epoch": 0.9555182236505734, + "grad_norm": 1.7155262231826782, + "learning_rate": 2.0699300959722944e-07, + "loss": 0.7825, + "step": 19623 + }, + { + "epoch": 0.9555669174396806, + "grad_norm": 1.2614591121673584, + "learning_rate": 2.0654060469678104e-07, + "loss": 0.8276, + "step": 19624 + }, + { + "epoch": 0.9556156112287878, + "grad_norm": 2.639373540878296, + "learning_rate": 2.060886921604399e-07, + "loss": 0.9409, + "step": 19625 + }, + { + "epoch": 0.955664305017895, + "grad_norm": 2.170246124267578, + "learning_rate": 2.0563727199944817e-07, + "loss": 0.7973, + "step": 19626 + }, + { + "epoch": 0.9557129988070021, + "grad_norm": 2.353018045425415, + "learning_rate": 2.0518634422503235e-07, + "loss": 0.808, + "step": 19627 + }, + { + "epoch": 0.9557616925961093, + "grad_norm": 1.437027931213379, + "learning_rate": 2.0473590884841022e-07, + "loss": 0.8104, + "step": 19628 + }, + { + "epoch": 0.9558103863852165, + "grad_norm": 1.4299813508987427, + "learning_rate": 2.042859658807883e-07, + "loss": 0.7697, + "step": 19629 + }, + { + "epoch": 0.9558590801743237, + "grad_norm": 1.1362160444259644, + "learning_rate": 2.0383651533335992e-07, + "loss": 0.7704, + "step": 19630 + }, + { + "epoch": 0.955907773963431, + "grad_norm": 0.10176494717597961, + "learning_rate": 2.0338755721730053e-07, + "loss": 0.6433, + "step": 19631 + }, + { + "epoch": 0.9559564677525382, + "grad_norm": 1.2625188827514648, + "learning_rate": 2.0293909154378123e-07, + "loss": 0.7804, + "step": 19632 + }, + { + "epoch": 0.9560051615416454, + "grad_norm": 1.3696422576904297, + "learning_rate": 2.024911183239575e-07, + "loss": 0.8185, + "step": 19633 + }, + { + "epoch": 0.9560538553307526, + "grad_norm": 1.328450083732605, + "learning_rate": 2.020436375689716e-07, + "loss": 0.839, + "step": 19634 + }, + { + "epoch": 0.9561025491198598, + "grad_norm": 6.363387584686279, + "learning_rate": 2.0159664928995682e-07, + "loss": 0.8213, + "step": 19635 + }, + { + "epoch": 0.9561512429089669, + "grad_norm": 1.662678837776184, + "learning_rate": 2.011501534980287e-07, + "loss": 0.774, + "step": 19636 + }, + { + "epoch": 0.9561999366980741, + "grad_norm": 1.2769416570663452, + "learning_rate": 2.0070415020429612e-07, + "loss": 0.8516, + "step": 19637 + }, + { + "epoch": 0.9562486304871813, + "grad_norm": 1.3803273439407349, + "learning_rate": 2.0025863941985468e-07, + "loss": 0.8619, + "step": 19638 + }, + { + "epoch": 0.9562973242762886, + "grad_norm": 1.7401198148727417, + "learning_rate": 1.998136211557822e-07, + "loss": 0.7754, + "step": 19639 + }, + { + "epoch": 0.9563460180653958, + "grad_norm": 2.242654323577881, + "learning_rate": 1.993690954231542e-07, + "loss": 0.7299, + "step": 19640 + }, + { + "epoch": 0.956394711854503, + "grad_norm": 0.09628309309482574, + "learning_rate": 1.989250622330219e-07, + "loss": 0.6448, + "step": 19641 + }, + { + "epoch": 0.9564434056436102, + "grad_norm": 1.2207754850387573, + "learning_rate": 1.9848152159643642e-07, + "loss": 0.7543, + "step": 19642 + }, + { + "epoch": 0.9564920994327174, + "grad_norm": 1.9567995071411133, + "learning_rate": 1.9803847352442895e-07, + "loss": 0.8636, + "step": 19643 + }, + { + "epoch": 0.9565407932218245, + "grad_norm": 1.6958383321762085, + "learning_rate": 1.9759591802801737e-07, + "loss": 0.9179, + "step": 19644 + }, + { + "epoch": 0.9565894870109317, + "grad_norm": 8.90377426147461, + "learning_rate": 1.9715385511821506e-07, + "loss": 0.7524, + "step": 19645 + }, + { + "epoch": 0.9566381808000389, + "grad_norm": 1.6034821271896362, + "learning_rate": 1.9671228480601545e-07, + "loss": 0.7686, + "step": 19646 + }, + { + "epoch": 0.9566868745891461, + "grad_norm": 2.0460100173950195, + "learning_rate": 1.9627120710240311e-07, + "loss": 0.8098, + "step": 19647 + }, + { + "epoch": 0.9567355683782534, + "grad_norm": 1.8395590782165527, + "learning_rate": 1.9583062201835145e-07, + "loss": 0.7577, + "step": 19648 + }, + { + "epoch": 0.9567842621673606, + "grad_norm": 1.485779881477356, + "learning_rate": 1.953905295648162e-07, + "loss": 0.8098, + "step": 19649 + }, + { + "epoch": 0.9568329559564678, + "grad_norm": 1.455988883972168, + "learning_rate": 1.9495092975275076e-07, + "loss": 0.6753, + "step": 19650 + }, + { + "epoch": 0.956881649745575, + "grad_norm": 1.4898681640625, + "learning_rate": 1.94511822593082e-07, + "loss": 0.7968, + "step": 19651 + }, + { + "epoch": 0.9569303435346822, + "grad_norm": 1.332727074623108, + "learning_rate": 1.9407320809674112e-07, + "loss": 0.774, + "step": 19652 + }, + { + "epoch": 0.9569790373237893, + "grad_norm": 1.5822155475616455, + "learning_rate": 1.9363508627463502e-07, + "loss": 0.8065, + "step": 19653 + }, + { + "epoch": 0.9570277311128965, + "grad_norm": 1.938822865486145, + "learning_rate": 1.931974571376616e-07, + "loss": 0.6973, + "step": 19654 + }, + { + "epoch": 0.9570764249020037, + "grad_norm": 1.462809681892395, + "learning_rate": 1.9276032069670547e-07, + "loss": 0.827, + "step": 19655 + }, + { + "epoch": 0.9571251186911109, + "grad_norm": 1.3904160261154175, + "learning_rate": 1.9232367696264464e-07, + "loss": 0.8848, + "step": 19656 + }, + { + "epoch": 0.9571738124802182, + "grad_norm": 1.2150540351867676, + "learning_rate": 1.9188752594633708e-07, + "loss": 0.7762, + "step": 19657 + }, + { + "epoch": 0.9572225062693254, + "grad_norm": 2.0361835956573486, + "learning_rate": 1.914518676586341e-07, + "loss": 0.8291, + "step": 19658 + }, + { + "epoch": 0.9572712000584326, + "grad_norm": 1.8594231605529785, + "learning_rate": 1.910167021103715e-07, + "loss": 0.777, + "step": 19659 + }, + { + "epoch": 0.9573198938475398, + "grad_norm": 1.729986548423767, + "learning_rate": 1.9058202931237167e-07, + "loss": 0.9251, + "step": 19660 + }, + { + "epoch": 0.9573685876366469, + "grad_norm": 2.0502490997314453, + "learning_rate": 1.9014784927545493e-07, + "loss": 0.8189, + "step": 19661 + }, + { + "epoch": 0.9574172814257541, + "grad_norm": 2.4401636123657227, + "learning_rate": 1.8971416201041258e-07, + "loss": 0.8906, + "step": 19662 + }, + { + "epoch": 0.9574659752148613, + "grad_norm": 1.6588139533996582, + "learning_rate": 1.8928096752804047e-07, + "loss": 0.8267, + "step": 19663 + }, + { + "epoch": 0.9575146690039685, + "grad_norm": 0.09768784046173096, + "learning_rate": 1.8884826583910777e-07, + "loss": 0.574, + "step": 19664 + }, + { + "epoch": 0.9575633627930757, + "grad_norm": 1.3896276950836182, + "learning_rate": 1.8841605695438135e-07, + "loss": 0.8833, + "step": 19665 + }, + { + "epoch": 0.957612056582183, + "grad_norm": 1.2100234031677246, + "learning_rate": 1.8798434088461272e-07, + "loss": 0.835, + "step": 19666 + }, + { + "epoch": 0.9576607503712902, + "grad_norm": 2.0962917804718018, + "learning_rate": 1.8755311764053985e-07, + "loss": 0.8802, + "step": 19667 + }, + { + "epoch": 0.9577094441603974, + "grad_norm": 1.6554696559906006, + "learning_rate": 1.8712238723288978e-07, + "loss": 0.8618, + "step": 19668 + }, + { + "epoch": 0.9577581379495046, + "grad_norm": 1.561836838722229, + "learning_rate": 1.8669214967237616e-07, + "loss": 0.9435, + "step": 19669 + }, + { + "epoch": 0.9578068317386117, + "grad_norm": 1.4618921279907227, + "learning_rate": 1.862624049697015e-07, + "loss": 0.8451, + "step": 19670 + }, + { + "epoch": 0.9578555255277189, + "grad_norm": 1.7440563440322876, + "learning_rate": 1.8583315313555727e-07, + "loss": 0.8774, + "step": 19671 + }, + { + "epoch": 0.9579042193168261, + "grad_norm": 1.368517279624939, + "learning_rate": 1.8540439418061938e-07, + "loss": 0.7909, + "step": 19672 + }, + { + "epoch": 0.9579529131059333, + "grad_norm": 1.4235835075378418, + "learning_rate": 1.8497612811555265e-07, + "loss": 0.8598, + "step": 19673 + }, + { + "epoch": 0.9580016068950405, + "grad_norm": 0.09984328597784042, + "learning_rate": 1.8454835495101297e-07, + "loss": 0.6549, + "step": 19674 + }, + { + "epoch": 0.9580503006841478, + "grad_norm": 1.666149377822876, + "learning_rate": 1.841210746976385e-07, + "loss": 0.826, + "step": 19675 + }, + { + "epoch": 0.958098994473255, + "grad_norm": 1.2574700117111206, + "learning_rate": 1.8369428736605854e-07, + "loss": 0.8096, + "step": 19676 + }, + { + "epoch": 0.9581476882623622, + "grad_norm": 1.4618738889694214, + "learning_rate": 1.8326799296689345e-07, + "loss": 0.7804, + "step": 19677 + }, + { + "epoch": 0.9581963820514693, + "grad_norm": 1.4859707355499268, + "learning_rate": 1.8284219151073924e-07, + "loss": 0.7791, + "step": 19678 + }, + { + "epoch": 0.9582450758405765, + "grad_norm": 1.730338215827942, + "learning_rate": 1.8241688300819627e-07, + "loss": 0.8466, + "step": 19679 + }, + { + "epoch": 0.9582937696296837, + "grad_norm": 1.6351768970489502, + "learning_rate": 1.8199206746983833e-07, + "loss": 0.7338, + "step": 19680 + }, + { + "epoch": 0.9583424634187909, + "grad_norm": 1.4127368927001953, + "learning_rate": 1.8156774490623475e-07, + "loss": 0.8697, + "step": 19681 + }, + { + "epoch": 0.9583911572078981, + "grad_norm": 2.4689431190490723, + "learning_rate": 1.811439153279393e-07, + "loss": 0.7266, + "step": 19682 + }, + { + "epoch": 0.9584398509970053, + "grad_norm": 1.6606327295303345, + "learning_rate": 1.8072057874549908e-07, + "loss": 0.8715, + "step": 19683 + }, + { + "epoch": 0.9584885447861126, + "grad_norm": 1.7625534534454346, + "learning_rate": 1.8029773516943905e-07, + "loss": 0.8278, + "step": 19684 + }, + { + "epoch": 0.9585372385752198, + "grad_norm": 1.7061129808425903, + "learning_rate": 1.7987538461028187e-07, + "loss": 0.8136, + "step": 19685 + }, + { + "epoch": 0.9585859323643269, + "grad_norm": 2.284059762954712, + "learning_rate": 1.7945352707853024e-07, + "loss": 0.8246, + "step": 19686 + }, + { + "epoch": 0.9586346261534341, + "grad_norm": 1.4217796325683594, + "learning_rate": 1.7903216258467803e-07, + "loss": 0.8311, + "step": 19687 + }, + { + "epoch": 0.9586833199425413, + "grad_norm": 1.6880053281784058, + "learning_rate": 1.7861129113920794e-07, + "loss": 0.8236, + "step": 19688 + }, + { + "epoch": 0.9587320137316485, + "grad_norm": 1.8160641193389893, + "learning_rate": 1.781909127525916e-07, + "loss": 0.9019, + "step": 19689 + }, + { + "epoch": 0.9587807075207557, + "grad_norm": 1.871330976486206, + "learning_rate": 1.7777102743528285e-07, + "loss": 0.8331, + "step": 19690 + }, + { + "epoch": 0.9588294013098629, + "grad_norm": 1.8807921409606934, + "learning_rate": 1.773516351977267e-07, + "loss": 0.8103, + "step": 19691 + }, + { + "epoch": 0.9588780950989702, + "grad_norm": 1.7783865928649902, + "learning_rate": 1.76932736050357e-07, + "loss": 0.9102, + "step": 19692 + }, + { + "epoch": 0.9589267888880774, + "grad_norm": 1.5371240377426147, + "learning_rate": 1.7651433000359208e-07, + "loss": 0.7175, + "step": 19693 + }, + { + "epoch": 0.9589754826771846, + "grad_norm": 1.5688807964324951, + "learning_rate": 1.7609641706784143e-07, + "loss": 0.816, + "step": 19694 + }, + { + "epoch": 0.9590241764662917, + "grad_norm": 12.480988502502441, + "learning_rate": 1.756789972534967e-07, + "loss": 0.855, + "step": 19695 + }, + { + "epoch": 0.9590728702553989, + "grad_norm": 1.4964001178741455, + "learning_rate": 1.7526207057094736e-07, + "loss": 0.7509, + "step": 19696 + }, + { + "epoch": 0.9591215640445061, + "grad_norm": 1.440631628036499, + "learning_rate": 1.7484563703056067e-07, + "loss": 0.8167, + "step": 19697 + }, + { + "epoch": 0.9591702578336133, + "grad_norm": 1.6139483451843262, + "learning_rate": 1.7442969664269728e-07, + "loss": 0.7857, + "step": 19698 + }, + { + "epoch": 0.9592189516227205, + "grad_norm": 1.6033564805984497, + "learning_rate": 1.7401424941770216e-07, + "loss": 0.7609, + "step": 19699 + }, + { + "epoch": 0.9592676454118277, + "grad_norm": 2.1285693645477295, + "learning_rate": 1.7359929536590937e-07, + "loss": 0.8211, + "step": 19700 + }, + { + "epoch": 0.959316339200935, + "grad_norm": 2.7024173736572266, + "learning_rate": 1.731848344976439e-07, + "loss": 0.8108, + "step": 19701 + }, + { + "epoch": 0.9593650329900422, + "grad_norm": 1.5826700925827026, + "learning_rate": 1.7277086682321088e-07, + "loss": 0.79, + "step": 19702 + }, + { + "epoch": 0.9594137267791493, + "grad_norm": 1.617127537727356, + "learning_rate": 1.7235739235291316e-07, + "loss": 0.656, + "step": 19703 + }, + { + "epoch": 0.9594624205682565, + "grad_norm": 1.3824712038040161, + "learning_rate": 1.719444110970314e-07, + "loss": 0.8704, + "step": 19704 + }, + { + "epoch": 0.9595111143573637, + "grad_norm": 1.5614327192306519, + "learning_rate": 1.7153192306583965e-07, + "loss": 0.7317, + "step": 19705 + }, + { + "epoch": 0.9595598081464709, + "grad_norm": 2.0708189010620117, + "learning_rate": 1.7111992826960078e-07, + "loss": 0.8453, + "step": 19706 + }, + { + "epoch": 0.9596085019355781, + "grad_norm": 1.5494602918624878, + "learning_rate": 1.7070842671855992e-07, + "loss": 0.7774, + "step": 19707 + }, + { + "epoch": 0.9596571957246853, + "grad_norm": 1.5368952751159668, + "learning_rate": 1.7029741842295776e-07, + "loss": 0.6705, + "step": 19708 + }, + { + "epoch": 0.9597058895137925, + "grad_norm": 1.1870558261871338, + "learning_rate": 1.6988690339301284e-07, + "loss": 0.8205, + "step": 19709 + }, + { + "epoch": 0.9597545833028998, + "grad_norm": 1.1717838048934937, + "learning_rate": 1.6947688163893915e-07, + "loss": 0.8077, + "step": 19710 + }, + { + "epoch": 0.959803277092007, + "grad_norm": 1.6153525114059448, + "learning_rate": 1.6906735317093524e-07, + "loss": 0.8475, + "step": 19711 + }, + { + "epoch": 0.9598519708811141, + "grad_norm": 2.220216989517212, + "learning_rate": 1.6865831799919298e-07, + "loss": 0.8464, + "step": 19712 + }, + { + "epoch": 0.9599006646702213, + "grad_norm": 0.10297135263681412, + "learning_rate": 1.6824977613387972e-07, + "loss": 0.6116, + "step": 19713 + }, + { + "epoch": 0.9599493584593285, + "grad_norm": 2.1278979778289795, + "learning_rate": 1.6784172758516294e-07, + "loss": 0.7939, + "step": 19714 + }, + { + "epoch": 0.9599980522484357, + "grad_norm": 1.513061285018921, + "learning_rate": 1.6743417236319005e-07, + "loss": 0.7902, + "step": 19715 + }, + { + "epoch": 0.9600467460375429, + "grad_norm": 2.2446775436401367, + "learning_rate": 1.6702711047810182e-07, + "loss": 0.7722, + "step": 19716 + }, + { + "epoch": 0.9600954398266501, + "grad_norm": 1.373486042022705, + "learning_rate": 1.6662054194002353e-07, + "loss": 0.8355, + "step": 19717 + }, + { + "epoch": 0.9601441336157573, + "grad_norm": 2.58329176902771, + "learning_rate": 1.6621446675906482e-07, + "loss": 0.8577, + "step": 19718 + }, + { + "epoch": 0.9601928274048646, + "grad_norm": 1.20598566532135, + "learning_rate": 1.65808884945331e-07, + "loss": 0.8182, + "step": 19719 + }, + { + "epoch": 0.9602415211939717, + "grad_norm": 1.4971868991851807, + "learning_rate": 1.6540379650890948e-07, + "loss": 0.9124, + "step": 19720 + }, + { + "epoch": 0.9602902149830789, + "grad_norm": 2.3123207092285156, + "learning_rate": 1.649992014598789e-07, + "loss": 0.7923, + "step": 19721 + }, + { + "epoch": 0.9603389087721861, + "grad_norm": 1.5120717287063599, + "learning_rate": 1.645950998082979e-07, + "loss": 0.7807, + "step": 19722 + }, + { + "epoch": 0.9603876025612933, + "grad_norm": 3.9598398208618164, + "learning_rate": 1.6419149156422508e-07, + "loss": 0.7702, + "step": 19723 + }, + { + "epoch": 0.9604362963504005, + "grad_norm": 1.3534985780715942, + "learning_rate": 1.6378837673769465e-07, + "loss": 0.8915, + "step": 19724 + }, + { + "epoch": 0.9604849901395077, + "grad_norm": 1.817176342010498, + "learning_rate": 1.6338575533873858e-07, + "loss": 0.7416, + "step": 19725 + }, + { + "epoch": 0.9605336839286149, + "grad_norm": 1.4054877758026123, + "learning_rate": 1.6298362737737326e-07, + "loss": 0.8773, + "step": 19726 + }, + { + "epoch": 0.9605823777177221, + "grad_norm": 1.8395780324935913, + "learning_rate": 1.625819928635952e-07, + "loss": 0.8632, + "step": 19727 + }, + { + "epoch": 0.9606310715068294, + "grad_norm": 1.6193596124649048, + "learning_rate": 1.6218085180740084e-07, + "loss": 0.7878, + "step": 19728 + }, + { + "epoch": 0.9606797652959365, + "grad_norm": 1.8927586078643799, + "learning_rate": 1.617802042187644e-07, + "loss": 0.8047, + "step": 19729 + }, + { + "epoch": 0.9607284590850437, + "grad_norm": 1.6803741455078125, + "learning_rate": 1.6138005010765567e-07, + "loss": 0.7916, + "step": 19730 + }, + { + "epoch": 0.9607771528741509, + "grad_norm": 1.5684231519699097, + "learning_rate": 1.6098038948402672e-07, + "loss": 0.7326, + "step": 19731 + }, + { + "epoch": 0.9608258466632581, + "grad_norm": 1.4227595329284668, + "learning_rate": 1.6058122235781847e-07, + "loss": 0.7588, + "step": 19732 + }, + { + "epoch": 0.9608745404523653, + "grad_norm": 0.09490308165550232, + "learning_rate": 1.60182548738963e-07, + "loss": 0.617, + "step": 19733 + }, + { + "epoch": 0.9609232342414725, + "grad_norm": 1.5348304510116577, + "learning_rate": 1.5978436863737457e-07, + "loss": 0.9081, + "step": 19734 + }, + { + "epoch": 0.9609719280305797, + "grad_norm": 1.4925328493118286, + "learning_rate": 1.593866820629586e-07, + "loss": 0.825, + "step": 19735 + }, + { + "epoch": 0.961020621819687, + "grad_norm": 18.900161743164062, + "learning_rate": 1.589894890256094e-07, + "loss": 0.7626, + "step": 19736 + }, + { + "epoch": 0.961069315608794, + "grad_norm": 1.7079862356185913, + "learning_rate": 1.585927895352035e-07, + "loss": 0.7383, + "step": 19737 + }, + { + "epoch": 0.9611180093979013, + "grad_norm": 1.338118314743042, + "learning_rate": 1.5819658360161306e-07, + "loss": 0.8136, + "step": 19738 + }, + { + "epoch": 0.9611667031870085, + "grad_norm": 1.3737192153930664, + "learning_rate": 1.5780087123469013e-07, + "loss": 0.8114, + "step": 19739 + }, + { + "epoch": 0.9612153969761157, + "grad_norm": 1.7936983108520508, + "learning_rate": 1.574056524442802e-07, + "loss": 0.7854, + "step": 19740 + }, + { + "epoch": 0.9612640907652229, + "grad_norm": 1.8964533805847168, + "learning_rate": 1.570109272402154e-07, + "loss": 0.8622, + "step": 19741 + }, + { + "epoch": 0.9613127845543301, + "grad_norm": 4.862509250640869, + "learning_rate": 1.566166956323123e-07, + "loss": 0.84, + "step": 19742 + }, + { + "epoch": 0.9613614783434373, + "grad_norm": 1.526647925376892, + "learning_rate": 1.5622295763037644e-07, + "loss": 0.789, + "step": 19743 + }, + { + "epoch": 0.9614101721325445, + "grad_norm": 1.3838932514190674, + "learning_rate": 1.5582971324420437e-07, + "loss": 0.8009, + "step": 19744 + }, + { + "epoch": 0.9614588659216516, + "grad_norm": 1.873157024383545, + "learning_rate": 1.554369624835772e-07, + "loss": 0.8124, + "step": 19745 + }, + { + "epoch": 0.9615075597107589, + "grad_norm": 1.316441297531128, + "learning_rate": 1.550447053582671e-07, + "loss": 0.8071, + "step": 19746 + }, + { + "epoch": 0.9615562534998661, + "grad_norm": 1.4068554639816284, + "learning_rate": 1.5465294187802848e-07, + "loss": 0.7964, + "step": 19747 + }, + { + "epoch": 0.9616049472889733, + "grad_norm": 1.2987103462219238, + "learning_rate": 1.542616720526069e-07, + "loss": 0.8808, + "step": 19748 + }, + { + "epoch": 0.9616536410780805, + "grad_norm": 1.5469341278076172, + "learning_rate": 1.5387089589173453e-07, + "loss": 0.9448, + "step": 19749 + }, + { + "epoch": 0.9617023348671877, + "grad_norm": 1.538896918296814, + "learning_rate": 1.5348061340513477e-07, + "loss": 0.772, + "step": 19750 + }, + { + "epoch": 0.9617510286562949, + "grad_norm": 1.8615988492965698, + "learning_rate": 1.5309082460251312e-07, + "loss": 0.8631, + "step": 19751 + }, + { + "epoch": 0.9617997224454021, + "grad_norm": 1.6060329675674438, + "learning_rate": 1.5270152949356853e-07, + "loss": 0.8905, + "step": 19752 + }, + { + "epoch": 0.9618484162345093, + "grad_norm": 1.7902979850769043, + "learning_rate": 1.5231272808797992e-07, + "loss": 0.8828, + "step": 19753 + }, + { + "epoch": 0.9618971100236164, + "grad_norm": 1.428863525390625, + "learning_rate": 1.519244203954262e-07, + "loss": 0.7932, + "step": 19754 + }, + { + "epoch": 0.9619458038127237, + "grad_norm": 1.4818320274353027, + "learning_rate": 1.5153660642555744e-07, + "loss": 0.8296, + "step": 19755 + }, + { + "epoch": 0.9619944976018309, + "grad_norm": 2.872596263885498, + "learning_rate": 1.5114928618802816e-07, + "loss": 0.8149, + "step": 19756 + }, + { + "epoch": 0.9620431913909381, + "grad_norm": 1.4892048835754395, + "learning_rate": 1.5076245969247062e-07, + "loss": 0.8405, + "step": 19757 + }, + { + "epoch": 0.9620918851800453, + "grad_norm": 2.2917022705078125, + "learning_rate": 1.5037612694850602e-07, + "loss": 0.8208, + "step": 19758 + }, + { + "epoch": 0.9621405789691525, + "grad_norm": 1.677984595298767, + "learning_rate": 1.4999028796574445e-07, + "loss": 0.8265, + "step": 19759 + }, + { + "epoch": 0.9621892727582597, + "grad_norm": 1.7423745393753052, + "learning_rate": 1.496049427537827e-07, + "loss": 0.7718, + "step": 19760 + }, + { + "epoch": 0.9622379665473669, + "grad_norm": 3.9414665699005127, + "learning_rate": 1.4922009132221082e-07, + "loss": 0.7949, + "step": 19761 + }, + { + "epoch": 0.962286660336474, + "grad_norm": 2.191279888153076, + "learning_rate": 1.4883573368059678e-07, + "loss": 0.8205, + "step": 19762 + }, + { + "epoch": 0.9623353541255812, + "grad_norm": 1.4399908781051636, + "learning_rate": 1.4845186983850845e-07, + "loss": 0.7236, + "step": 19763 + }, + { + "epoch": 0.9623840479146885, + "grad_norm": 0.09234465658664703, + "learning_rate": 1.4806849980548488e-07, + "loss": 0.6325, + "step": 19764 + }, + { + "epoch": 0.9624327417037957, + "grad_norm": 1.6249381303787231, + "learning_rate": 1.4768562359106952e-07, + "loss": 0.7622, + "step": 19765 + }, + { + "epoch": 0.9624814354929029, + "grad_norm": 1.734422206878662, + "learning_rate": 1.4730324120478367e-07, + "loss": 0.765, + "step": 19766 + }, + { + "epoch": 0.9625301292820101, + "grad_norm": 1.1303718090057373, + "learning_rate": 1.4692135265613973e-07, + "loss": 0.8323, + "step": 19767 + }, + { + "epoch": 0.9625788230711173, + "grad_norm": 2.0865869522094727, + "learning_rate": 1.4653995795463672e-07, + "loss": 0.8721, + "step": 19768 + }, + { + "epoch": 0.9626275168602245, + "grad_norm": 0.09561175107955933, + "learning_rate": 1.4615905710976263e-07, + "loss": 0.64, + "step": 19769 + }, + { + "epoch": 0.9626762106493317, + "grad_norm": 1.7886817455291748, + "learning_rate": 1.4577865013099436e-07, + "loss": 0.8371, + "step": 19770 + }, + { + "epoch": 0.9627249044384388, + "grad_norm": 1.3799879550933838, + "learning_rate": 1.4539873702779096e-07, + "loss": 0.8087, + "step": 19771 + }, + { + "epoch": 0.962773598227546, + "grad_norm": 3.1362462043762207, + "learning_rate": 1.4501931780960489e-07, + "loss": 0.7345, + "step": 19772 + }, + { + "epoch": 0.9628222920166533, + "grad_norm": 1.9385004043579102, + "learning_rate": 1.446403924858708e-07, + "loss": 0.7937, + "step": 19773 + }, + { + "epoch": 0.9628709858057605, + "grad_norm": 1.9564642906188965, + "learning_rate": 1.4426196106601898e-07, + "loss": 0.7601, + "step": 19774 + }, + { + "epoch": 0.9629196795948677, + "grad_norm": 1.4795995950698853, + "learning_rate": 1.4388402355946184e-07, + "loss": 0.7952, + "step": 19775 + }, + { + "epoch": 0.9629683733839749, + "grad_norm": 1.631246566772461, + "learning_rate": 1.435065799756008e-07, + "loss": 0.8896, + "step": 19776 + }, + { + "epoch": 0.9630170671730821, + "grad_norm": 1.131637692451477, + "learning_rate": 1.4312963032382387e-07, + "loss": 0.7941, + "step": 19777 + }, + { + "epoch": 0.9630657609621893, + "grad_norm": 1.3837038278579712, + "learning_rate": 1.427531746135058e-07, + "loss": 0.7088, + "step": 19778 + }, + { + "epoch": 0.9631144547512964, + "grad_norm": 1.4521409273147583, + "learning_rate": 1.4237721285401462e-07, + "loss": 0.8853, + "step": 19779 + }, + { + "epoch": 0.9631631485404036, + "grad_norm": 2.421323537826538, + "learning_rate": 1.4200174505470065e-07, + "loss": 0.7974, + "step": 19780 + }, + { + "epoch": 0.9632118423295108, + "grad_norm": 1.191614031791687, + "learning_rate": 1.416267712249031e-07, + "loss": 0.8011, + "step": 19781 + }, + { + "epoch": 0.9632605361186181, + "grad_norm": 1.4911142587661743, + "learning_rate": 1.4125229137395003e-07, + "loss": 0.8482, + "step": 19782 + }, + { + "epoch": 0.9633092299077253, + "grad_norm": 1.5380158424377441, + "learning_rate": 1.4087830551115844e-07, + "loss": 0.8677, + "step": 19783 + }, + { + "epoch": 0.9633579236968325, + "grad_norm": 8.305586814880371, + "learning_rate": 1.4050481364582978e-07, + "loss": 0.7908, + "step": 19784 + }, + { + "epoch": 0.9634066174859397, + "grad_norm": 1.3714256286621094, + "learning_rate": 1.401318157872522e-07, + "loss": 0.7685, + "step": 19785 + }, + { + "epoch": 0.9634553112750469, + "grad_norm": 1.669858455657959, + "learning_rate": 1.3975931194470936e-07, + "loss": 0.84, + "step": 19786 + }, + { + "epoch": 0.963504005064154, + "grad_norm": 1.7841262817382812, + "learning_rate": 1.3938730212746055e-07, + "loss": 0.808, + "step": 19787 + }, + { + "epoch": 0.9635526988532612, + "grad_norm": 2.8282711505889893, + "learning_rate": 1.390157863447672e-07, + "loss": 0.7582, + "step": 19788 + }, + { + "epoch": 0.9636013926423684, + "grad_norm": 0.1297502964735031, + "learning_rate": 1.386447646058664e-07, + "loss": 0.6643, + "step": 19789 + }, + { + "epoch": 0.9636500864314756, + "grad_norm": 1.7581974267959595, + "learning_rate": 1.3827423691998633e-07, + "loss": 0.6776, + "step": 19790 + }, + { + "epoch": 0.9636987802205829, + "grad_norm": 1.8363666534423828, + "learning_rate": 1.3790420329634624e-07, + "loss": 0.7053, + "step": 19791 + }, + { + "epoch": 0.9637474740096901, + "grad_norm": 1.403473973274231, + "learning_rate": 1.3753466374414993e-07, + "loss": 0.8201, + "step": 19792 + }, + { + "epoch": 0.9637961677987973, + "grad_norm": 1.549116849899292, + "learning_rate": 1.3716561827259002e-07, + "loss": 0.7372, + "step": 19793 + }, + { + "epoch": 0.9638448615879045, + "grad_norm": 1.4648323059082031, + "learning_rate": 1.367970668908458e-07, + "loss": 0.7719, + "step": 19794 + }, + { + "epoch": 0.9638935553770117, + "grad_norm": 1.7890101671218872, + "learning_rate": 1.364290096080878e-07, + "loss": 0.8155, + "step": 19795 + }, + { + "epoch": 0.9639422491661188, + "grad_norm": 1.436012625694275, + "learning_rate": 1.360614464334664e-07, + "loss": 0.706, + "step": 19796 + }, + { + "epoch": 0.963990942955226, + "grad_norm": 1.6102901697158813, + "learning_rate": 1.356943773761299e-07, + "loss": 0.7164, + "step": 19797 + }, + { + "epoch": 0.9640396367443332, + "grad_norm": 1.574842095375061, + "learning_rate": 1.353278024452065e-07, + "loss": 0.8864, + "step": 19798 + }, + { + "epoch": 0.9640883305334405, + "grad_norm": 1.3204501867294312, + "learning_rate": 1.3496172164981557e-07, + "loss": 0.7174, + "step": 19799 + }, + { + "epoch": 0.9641370243225477, + "grad_norm": 1.5134960412979126, + "learning_rate": 1.34596134999061e-07, + "loss": 0.9378, + "step": 19800 + }, + { + "epoch": 0.9641857181116549, + "grad_norm": 1.4381465911865234, + "learning_rate": 1.342310425020399e-07, + "loss": 0.7523, + "step": 19801 + }, + { + "epoch": 0.9642344119007621, + "grad_norm": 1.5388214588165283, + "learning_rate": 1.338664441678339e-07, + "loss": 0.7838, + "step": 19802 + }, + { + "epoch": 0.9642831056898693, + "grad_norm": 1.5513349771499634, + "learning_rate": 1.3350234000551131e-07, + "loss": 0.7831, + "step": 19803 + }, + { + "epoch": 0.9643317994789764, + "grad_norm": 1.4184088706970215, + "learning_rate": 1.3313873002412714e-07, + "loss": 0.8957, + "step": 19804 + }, + { + "epoch": 0.9643804932680836, + "grad_norm": 1.3662335872650146, + "learning_rate": 1.3277561423272966e-07, + "loss": 0.8503, + "step": 19805 + }, + { + "epoch": 0.9644291870571908, + "grad_norm": 1.6413918733596802, + "learning_rate": 1.3241299264035167e-07, + "loss": 0.8048, + "step": 19806 + }, + { + "epoch": 0.964477880846298, + "grad_norm": 1.8084497451782227, + "learning_rate": 1.320508652560104e-07, + "loss": 0.8088, + "step": 19807 + }, + { + "epoch": 0.9645265746354053, + "grad_norm": 1.756653904914856, + "learning_rate": 1.3168923208871643e-07, + "loss": 0.8676, + "step": 19808 + }, + { + "epoch": 0.9645752684245125, + "grad_norm": 2.692776918411255, + "learning_rate": 1.3132809314746252e-07, + "loss": 0.9196, + "step": 19809 + }, + { + "epoch": 0.9646239622136197, + "grad_norm": 1.7421780824661255, + "learning_rate": 1.309674484412349e-07, + "loss": 0.7733, + "step": 19810 + }, + { + "epoch": 0.9646726560027269, + "grad_norm": 1.466072678565979, + "learning_rate": 1.3060729797900406e-07, + "loss": 0.8459, + "step": 19811 + }, + { + "epoch": 0.9647213497918341, + "grad_norm": 1.2442338466644287, + "learning_rate": 1.3024764176972738e-07, + "loss": 0.7676, + "step": 19812 + }, + { + "epoch": 0.9647700435809412, + "grad_norm": 1.4579476118087769, + "learning_rate": 1.298884798223532e-07, + "loss": 0.8424, + "step": 19813 + }, + { + "epoch": 0.9648187373700484, + "grad_norm": 1.4248120784759521, + "learning_rate": 1.295298121458144e-07, + "loss": 0.7789, + "step": 19814 + }, + { + "epoch": 0.9648674311591556, + "grad_norm": 0.09417710453271866, + "learning_rate": 1.2917163874903271e-07, + "loss": 0.6469, + "step": 19815 + }, + { + "epoch": 0.9649161249482628, + "grad_norm": 1.5796313285827637, + "learning_rate": 1.288139596409188e-07, + "loss": 0.7739, + "step": 19816 + }, + { + "epoch": 0.9649648187373701, + "grad_norm": 1.6424962282180786, + "learning_rate": 1.2845677483037e-07, + "loss": 0.7878, + "step": 19817 + }, + { + "epoch": 0.9650135125264773, + "grad_norm": 1.7245312929153442, + "learning_rate": 1.281000843262681e-07, + "loss": 0.8181, + "step": 19818 + }, + { + "epoch": 0.9650622063155845, + "grad_norm": 1.2695105075836182, + "learning_rate": 1.277438881374926e-07, + "loss": 0.8645, + "step": 19819 + }, + { + "epoch": 0.9651109001046917, + "grad_norm": 1.4580001831054688, + "learning_rate": 1.273881862728965e-07, + "loss": 0.8811, + "step": 19820 + }, + { + "epoch": 0.9651595938937988, + "grad_norm": 2.243037223815918, + "learning_rate": 1.2703297874133268e-07, + "loss": 0.8543, + "step": 19821 + }, + { + "epoch": 0.965208287682906, + "grad_norm": 1.690464973449707, + "learning_rate": 1.266782655516341e-07, + "loss": 0.7959, + "step": 19822 + }, + { + "epoch": 0.9652569814720132, + "grad_norm": 1.3068454265594482, + "learning_rate": 1.26324046712627e-07, + "loss": 0.7767, + "step": 19823 + }, + { + "epoch": 0.9653056752611204, + "grad_norm": 1.723027229309082, + "learning_rate": 1.2597032223312211e-07, + "loss": 0.8535, + "step": 19824 + }, + { + "epoch": 0.9653543690502276, + "grad_norm": 1.5391862392425537, + "learning_rate": 1.2561709212191464e-07, + "loss": 0.8241, + "step": 19825 + }, + { + "epoch": 0.9654030628393349, + "grad_norm": 0.09129940718412399, + "learning_rate": 1.2526435638779532e-07, + "loss": 0.5702, + "step": 19826 + }, + { + "epoch": 0.9654517566284421, + "grad_norm": 1.5429039001464844, + "learning_rate": 1.2491211503953716e-07, + "loss": 0.7429, + "step": 19827 + }, + { + "epoch": 0.9655004504175493, + "grad_norm": 1.2326421737670898, + "learning_rate": 1.245603680859042e-07, + "loss": 0.8752, + "step": 19828 + }, + { + "epoch": 0.9655491442066565, + "grad_norm": 1.563673973083496, + "learning_rate": 1.2420911553564285e-07, + "loss": 0.7399, + "step": 19829 + }, + { + "epoch": 0.9655978379957636, + "grad_norm": 1.8890661001205444, + "learning_rate": 1.2385835739749276e-07, + "loss": 0.7981, + "step": 19830 + }, + { + "epoch": 0.9656465317848708, + "grad_norm": 2.0695993900299072, + "learning_rate": 1.235080936801758e-07, + "loss": 0.8297, + "step": 19831 + }, + { + "epoch": 0.965695225573978, + "grad_norm": 2.574842691421509, + "learning_rate": 1.231583243924095e-07, + "loss": 0.8988, + "step": 19832 + }, + { + "epoch": 0.9657439193630852, + "grad_norm": 1.4481230974197388, + "learning_rate": 1.228090495428913e-07, + "loss": 0.7774, + "step": 19833 + }, + { + "epoch": 0.9657926131521924, + "grad_norm": 2.108649730682373, + "learning_rate": 1.2246026914030984e-07, + "loss": 0.8219, + "step": 19834 + }, + { + "epoch": 0.9658413069412997, + "grad_norm": 1.8247203826904297, + "learning_rate": 1.2211198319334262e-07, + "loss": 0.8127, + "step": 19835 + }, + { + "epoch": 0.9658900007304069, + "grad_norm": 2.9765470027923584, + "learning_rate": 1.2176419171065157e-07, + "loss": 0.7826, + "step": 19836 + }, + { + "epoch": 0.9659386945195141, + "grad_norm": 1.5406169891357422, + "learning_rate": 1.214168947008898e-07, + "loss": 0.8132, + "step": 19837 + }, + { + "epoch": 0.9659873883086212, + "grad_norm": 1.4777511358261108, + "learning_rate": 1.2107009217269483e-07, + "loss": 0.8654, + "step": 19838 + }, + { + "epoch": 0.9660360820977284, + "grad_norm": 1.3727283477783203, + "learning_rate": 1.207237841346931e-07, + "loss": 0.8, + "step": 19839 + }, + { + "epoch": 0.9660847758868356, + "grad_norm": 1.5540233850479126, + "learning_rate": 1.2037797059549994e-07, + "loss": 0.759, + "step": 19840 + }, + { + "epoch": 0.9661334696759428, + "grad_norm": 1.5935007333755493, + "learning_rate": 1.2003265156371735e-07, + "loss": 0.7549, + "step": 19841 + }, + { + "epoch": 0.96618216346505, + "grad_norm": 2.2087209224700928, + "learning_rate": 1.1968782704793403e-07, + "loss": 0.8489, + "step": 19842 + }, + { + "epoch": 0.9662308572541572, + "grad_norm": 1.3088278770446777, + "learning_rate": 1.1934349705673198e-07, + "loss": 0.7365, + "step": 19843 + }, + { + "epoch": 0.9662795510432645, + "grad_norm": 1.7817806005477905, + "learning_rate": 1.1899966159866883e-07, + "loss": 0.7539, + "step": 19844 + }, + { + "epoch": 0.9663282448323717, + "grad_norm": 1.6712453365325928, + "learning_rate": 1.1865632068230437e-07, + "loss": 0.8586, + "step": 19845 + }, + { + "epoch": 0.9663769386214788, + "grad_norm": 2.8534562587738037, + "learning_rate": 1.1831347431617623e-07, + "loss": 0.8113, + "step": 19846 + }, + { + "epoch": 0.966425632410586, + "grad_norm": 1.963322639465332, + "learning_rate": 1.1797112250881315e-07, + "loss": 0.8016, + "step": 19847 + }, + { + "epoch": 0.9664743261996932, + "grad_norm": 1.4138233661651611, + "learning_rate": 1.1762926526873053e-07, + "loss": 0.8491, + "step": 19848 + }, + { + "epoch": 0.9665230199888004, + "grad_norm": 1.632961630821228, + "learning_rate": 1.1728790260443268e-07, + "loss": 0.7348, + "step": 19849 + }, + { + "epoch": 0.9665717137779076, + "grad_norm": 1.412172555923462, + "learning_rate": 1.169470345244128e-07, + "loss": 0.8202, + "step": 19850 + }, + { + "epoch": 0.9666204075670148, + "grad_norm": 3.1130189895629883, + "learning_rate": 1.1660666103714635e-07, + "loss": 0.9141, + "step": 19851 + }, + { + "epoch": 0.966669101356122, + "grad_norm": 1.4128434658050537, + "learning_rate": 1.1626678215110432e-07, + "loss": 0.8045, + "step": 19852 + }, + { + "epoch": 0.9667177951452293, + "grad_norm": 6.6307196617126465, + "learning_rate": 1.1592739787473773e-07, + "loss": 0.7852, + "step": 19853 + }, + { + "epoch": 0.9667664889343365, + "grad_norm": 3.5554375648498535, + "learning_rate": 1.1558850821648871e-07, + "loss": 0.7864, + "step": 19854 + }, + { + "epoch": 0.9668151827234436, + "grad_norm": 1.5921906232833862, + "learning_rate": 1.1525011318479051e-07, + "loss": 0.766, + "step": 19855 + }, + { + "epoch": 0.9668638765125508, + "grad_norm": 2.3355512619018555, + "learning_rate": 1.1491221278805864e-07, + "loss": 0.7927, + "step": 19856 + }, + { + "epoch": 0.966912570301658, + "grad_norm": 2.424527883529663, + "learning_rate": 1.1457480703469748e-07, + "loss": 0.8335, + "step": 19857 + }, + { + "epoch": 0.9669612640907652, + "grad_norm": 1.388128399848938, + "learning_rate": 1.142378959331003e-07, + "loss": 0.8034, + "step": 19858 + }, + { + "epoch": 0.9670099578798724, + "grad_norm": 1.309025526046753, + "learning_rate": 1.1390147949165154e-07, + "loss": 0.8002, + "step": 19859 + }, + { + "epoch": 0.9670586516689796, + "grad_norm": 1.8897374868392944, + "learning_rate": 1.1356555771871337e-07, + "loss": 0.6945, + "step": 19860 + }, + { + "epoch": 0.9671073454580869, + "grad_norm": 1.6647188663482666, + "learning_rate": 1.1323013062264799e-07, + "loss": 0.8144, + "step": 19861 + }, + { + "epoch": 0.9671560392471941, + "grad_norm": 2.3978865146636963, + "learning_rate": 1.1289519821179317e-07, + "loss": 0.7498, + "step": 19862 + }, + { + "epoch": 0.9672047330363012, + "grad_norm": 1.5509053468704224, + "learning_rate": 1.1256076049448672e-07, + "loss": 0.8619, + "step": 19863 + }, + { + "epoch": 0.9672534268254084, + "grad_norm": 1.3813097476959229, + "learning_rate": 1.1222681747904196e-07, + "loss": 0.8397, + "step": 19864 + }, + { + "epoch": 0.9673021206145156, + "grad_norm": 1.3136818408966064, + "learning_rate": 1.1189336917377002e-07, + "loss": 0.7245, + "step": 19865 + }, + { + "epoch": 0.9673508144036228, + "grad_norm": 1.3033345937728882, + "learning_rate": 1.1156041558696429e-07, + "loss": 0.7879, + "step": 19866 + }, + { + "epoch": 0.96739950819273, + "grad_norm": 1.7702662944793701, + "learning_rate": 1.1122795672690478e-07, + "loss": 0.7687, + "step": 19867 + }, + { + "epoch": 0.9674482019818372, + "grad_norm": 1.8572719097137451, + "learning_rate": 1.108959926018649e-07, + "loss": 0.7047, + "step": 19868 + }, + { + "epoch": 0.9674968957709444, + "grad_norm": 1.590294599533081, + "learning_rate": 1.1056452322010026e-07, + "loss": 0.8613, + "step": 19869 + }, + { + "epoch": 0.9675455895600517, + "grad_norm": 1.7390884160995483, + "learning_rate": 1.1023354858985758e-07, + "loss": 0.8148, + "step": 19870 + }, + { + "epoch": 0.9675942833491589, + "grad_norm": 1.6960420608520508, + "learning_rate": 1.0990306871936585e-07, + "loss": 0.6958, + "step": 19871 + }, + { + "epoch": 0.967642977138266, + "grad_norm": 2.7017579078674316, + "learning_rate": 1.0957308361685182e-07, + "loss": 0.7532, + "step": 19872 + }, + { + "epoch": 0.9676916709273732, + "grad_norm": 1.3830747604370117, + "learning_rate": 1.0924359329051781e-07, + "loss": 0.8045, + "step": 19873 + }, + { + "epoch": 0.9677403647164804, + "grad_norm": 1.6310405731201172, + "learning_rate": 1.0891459774856616e-07, + "loss": 0.7146, + "step": 19874 + }, + { + "epoch": 0.9677890585055876, + "grad_norm": 1.5925699472427368, + "learning_rate": 1.0858609699917478e-07, + "loss": 0.7307, + "step": 19875 + }, + { + "epoch": 0.9678377522946948, + "grad_norm": 2.533127546310425, + "learning_rate": 1.0825809105051932e-07, + "loss": 0.856, + "step": 19876 + }, + { + "epoch": 0.967886446083802, + "grad_norm": 1.518141508102417, + "learning_rate": 1.0793057991075772e-07, + "loss": 0.8067, + "step": 19877 + }, + { + "epoch": 0.9679351398729092, + "grad_norm": 1.6804747581481934, + "learning_rate": 1.0760356358803459e-07, + "loss": 0.7791, + "step": 19878 + }, + { + "epoch": 0.9679838336620165, + "grad_norm": 0.10224813967943192, + "learning_rate": 1.0727704209049006e-07, + "loss": 0.6649, + "step": 19879 + }, + { + "epoch": 0.9680325274511236, + "grad_norm": 1.4383306503295898, + "learning_rate": 1.0695101542623987e-07, + "loss": 0.8573, + "step": 19880 + }, + { + "epoch": 0.9680812212402308, + "grad_norm": 1.4403088092803955, + "learning_rate": 1.0662548360339974e-07, + "loss": 0.8432, + "step": 19881 + }, + { + "epoch": 0.968129915029338, + "grad_norm": 1.577873945236206, + "learning_rate": 1.06300446630061e-07, + "loss": 0.7665, + "step": 19882 + }, + { + "epoch": 0.9681786088184452, + "grad_norm": 1.2816197872161865, + "learning_rate": 1.0597590451431494e-07, + "loss": 0.788, + "step": 19883 + }, + { + "epoch": 0.9682273026075524, + "grad_norm": 1.3399653434753418, + "learning_rate": 1.0565185726423288e-07, + "loss": 0.8315, + "step": 19884 + }, + { + "epoch": 0.9682759963966596, + "grad_norm": 1.2802674770355225, + "learning_rate": 1.0532830488787061e-07, + "loss": 0.7913, + "step": 19885 + }, + { + "epoch": 0.9683246901857668, + "grad_norm": 2.4393770694732666, + "learning_rate": 1.0500524739328611e-07, + "loss": 0.7304, + "step": 19886 + }, + { + "epoch": 0.968373383974874, + "grad_norm": 1.8034539222717285, + "learning_rate": 1.046826847885063e-07, + "loss": 0.8095, + "step": 19887 + }, + { + "epoch": 0.9684220777639813, + "grad_norm": 1.5238518714904785, + "learning_rate": 1.0436061708156032e-07, + "loss": 0.774, + "step": 19888 + }, + { + "epoch": 0.9684707715530884, + "grad_norm": 2.1187894344329834, + "learning_rate": 1.040390442804573e-07, + "loss": 0.7664, + "step": 19889 + }, + { + "epoch": 0.9685194653421956, + "grad_norm": 2.0202789306640625, + "learning_rate": 1.0371796639319754e-07, + "loss": 0.7288, + "step": 19890 + }, + { + "epoch": 0.9685681591313028, + "grad_norm": 1.5233250856399536, + "learning_rate": 1.0339738342776573e-07, + "loss": 0.8788, + "step": 19891 + }, + { + "epoch": 0.96861685292041, + "grad_norm": 1.7219675779342651, + "learning_rate": 1.0307729539213995e-07, + "loss": 0.762, + "step": 19892 + }, + { + "epoch": 0.9686655467095172, + "grad_norm": 2.202564239501953, + "learning_rate": 1.027577022942805e-07, + "loss": 0.8341, + "step": 19893 + }, + { + "epoch": 0.9687142404986244, + "grad_norm": 0.0945887491106987, + "learning_rate": 1.0243860414213658e-07, + "loss": 0.6271, + "step": 19894 + }, + { + "epoch": 0.9687629342877316, + "grad_norm": 1.3545585870742798, + "learning_rate": 1.021200009436485e-07, + "loss": 0.8599, + "step": 19895 + }, + { + "epoch": 0.9688116280768388, + "grad_norm": 1.6561444997787476, + "learning_rate": 1.0180189270673657e-07, + "loss": 0.8624, + "step": 19896 + }, + { + "epoch": 0.968860321865946, + "grad_norm": 1.3802670240402222, + "learning_rate": 1.0148427943931894e-07, + "loss": 0.7475, + "step": 19897 + }, + { + "epoch": 0.9689090156550532, + "grad_norm": 1.497931718826294, + "learning_rate": 1.011671611492937e-07, + "loss": 0.7166, + "step": 19898 + }, + { + "epoch": 0.9689577094441604, + "grad_norm": 2.457632303237915, + "learning_rate": 1.0085053784454791e-07, + "loss": 0.8182, + "step": 19899 + }, + { + "epoch": 0.9690064032332676, + "grad_norm": 1.991144061088562, + "learning_rate": 1.005344095329619e-07, + "loss": 0.7991, + "step": 19900 + }, + { + "epoch": 0.9690550970223748, + "grad_norm": 1.4044783115386963, + "learning_rate": 1.0021877622239606e-07, + "loss": 0.8092, + "step": 19901 + }, + { + "epoch": 0.969103790811482, + "grad_norm": 2.2777044773101807, + "learning_rate": 9.99036379207019e-08, + "loss": 0.8731, + "step": 19902 + }, + { + "epoch": 0.9691524846005892, + "grad_norm": 1.39576256275177, + "learning_rate": 9.958899463571981e-08, + "loss": 0.8584, + "step": 19903 + }, + { + "epoch": 0.9692011783896964, + "grad_norm": 1.8456014394760132, + "learning_rate": 9.927484637527684e-08, + "loss": 0.8619, + "step": 19904 + }, + { + "epoch": 0.9692498721788035, + "grad_norm": 1.493796467781067, + "learning_rate": 9.896119314718455e-08, + "loss": 0.8347, + "step": 19905 + }, + { + "epoch": 0.9692985659679108, + "grad_norm": 2.0093564987182617, + "learning_rate": 9.86480349592478e-08, + "loss": 0.7726, + "step": 19906 + }, + { + "epoch": 0.969347259757018, + "grad_norm": 1.882132887840271, + "learning_rate": 9.83353718192559e-08, + "loss": 0.7731, + "step": 19907 + }, + { + "epoch": 0.9693959535461252, + "grad_norm": 1.8248937129974365, + "learning_rate": 9.802320373498486e-08, + "loss": 0.7452, + "step": 19908 + }, + { + "epoch": 0.9694446473352324, + "grad_norm": 1.894714117050171, + "learning_rate": 9.771153071420403e-08, + "loss": 0.8311, + "step": 19909 + }, + { + "epoch": 0.9694933411243396, + "grad_norm": 1.3569844961166382, + "learning_rate": 9.740035276466053e-08, + "loss": 0.7914, + "step": 19910 + }, + { + "epoch": 0.9695420349134468, + "grad_norm": 1.7776588201522827, + "learning_rate": 9.70896698940993e-08, + "loss": 0.7462, + "step": 19911 + }, + { + "epoch": 0.969590728702554, + "grad_norm": 1.4909237623214722, + "learning_rate": 9.677948211024746e-08, + "loss": 0.831, + "step": 19912 + }, + { + "epoch": 0.9696394224916612, + "grad_norm": 1.4271085262298584, + "learning_rate": 9.646978942082107e-08, + "loss": 0.7653, + "step": 19913 + }, + { + "epoch": 0.9696881162807683, + "grad_norm": 1.7385042905807495, + "learning_rate": 9.616059183352289e-08, + "loss": 0.7372, + "step": 19914 + }, + { + "epoch": 0.9697368100698756, + "grad_norm": 1.2537109851837158, + "learning_rate": 9.58518893560445e-08, + "loss": 0.7869, + "step": 19915 + }, + { + "epoch": 0.9697855038589828, + "grad_norm": 1.474947452545166, + "learning_rate": 9.554368199606423e-08, + "loss": 0.814, + "step": 19916 + }, + { + "epoch": 0.96983419764809, + "grad_norm": 2.0932862758636475, + "learning_rate": 9.523596976125149e-08, + "loss": 0.7039, + "step": 19917 + }, + { + "epoch": 0.9698828914371972, + "grad_norm": 1.4213262796401978, + "learning_rate": 9.492875265925794e-08, + "loss": 0.8261, + "step": 19918 + }, + { + "epoch": 0.9699315852263044, + "grad_norm": 1.1836466789245605, + "learning_rate": 9.462203069772636e-08, + "loss": 0.7862, + "step": 19919 + }, + { + "epoch": 0.9699802790154116, + "grad_norm": 4.193950176239014, + "learning_rate": 9.431580388428618e-08, + "loss": 0.7167, + "step": 19920 + }, + { + "epoch": 0.9700289728045188, + "grad_norm": 1.2474143505096436, + "learning_rate": 9.4010072226558e-08, + "loss": 0.8521, + "step": 19921 + }, + { + "epoch": 0.9700776665936259, + "grad_norm": 1.5972520112991333, + "learning_rate": 9.370483573214239e-08, + "loss": 0.6951, + "step": 19922 + }, + { + "epoch": 0.9701263603827331, + "grad_norm": 1.8227307796478271, + "learning_rate": 9.34000944086333e-08, + "loss": 0.7472, + "step": 19923 + }, + { + "epoch": 0.9701750541718404, + "grad_norm": 1.5199975967407227, + "learning_rate": 9.309584826361351e-08, + "loss": 0.8284, + "step": 19924 + }, + { + "epoch": 0.9702237479609476, + "grad_norm": 2.486783266067505, + "learning_rate": 9.279209730464811e-08, + "loss": 0.8403, + "step": 19925 + }, + { + "epoch": 0.9702724417500548, + "grad_norm": 1.1780345439910889, + "learning_rate": 9.24888415392955e-08, + "loss": 0.8349, + "step": 19926 + }, + { + "epoch": 0.970321135539162, + "grad_norm": 1.6167911291122437, + "learning_rate": 9.21860809750963e-08, + "loss": 0.7697, + "step": 19927 + }, + { + "epoch": 0.9703698293282692, + "grad_norm": 1.8180502653121948, + "learning_rate": 9.188381561958448e-08, + "loss": 0.8088, + "step": 19928 + }, + { + "epoch": 0.9704185231173764, + "grad_norm": 2.0473294258117676, + "learning_rate": 9.158204548027849e-08, + "loss": 0.7704, + "step": 19929 + }, + { + "epoch": 0.9704672169064836, + "grad_norm": 1.3211876153945923, + "learning_rate": 9.128077056468343e-08, + "loss": 0.7256, + "step": 19930 + }, + { + "epoch": 0.9705159106955907, + "grad_norm": 2.1039936542510986, + "learning_rate": 9.097999088029552e-08, + "loss": 0.8347, + "step": 19931 + }, + { + "epoch": 0.970564604484698, + "grad_norm": 2.839390993118286, + "learning_rate": 9.067970643459322e-08, + "loss": 0.7918, + "step": 19932 + }, + { + "epoch": 0.9706132982738052, + "grad_norm": 2.419611930847168, + "learning_rate": 9.037991723505058e-08, + "loss": 0.83, + "step": 19933 + }, + { + "epoch": 0.9706619920629124, + "grad_norm": 1.302895426750183, + "learning_rate": 9.008062328912159e-08, + "loss": 0.8081, + "step": 19934 + }, + { + "epoch": 0.9707106858520196, + "grad_norm": 1.795399785041809, + "learning_rate": 8.978182460425366e-08, + "loss": 0.8258, + "step": 19935 + }, + { + "epoch": 0.9707593796411268, + "grad_norm": 2.2160069942474365, + "learning_rate": 8.94835211878764e-08, + "loss": 0.7116, + "step": 19936 + }, + { + "epoch": 0.970808073430234, + "grad_norm": 1.809118390083313, + "learning_rate": 8.918571304741497e-08, + "loss": 0.8174, + "step": 19937 + }, + { + "epoch": 0.9708567672193412, + "grad_norm": 2.024428606033325, + "learning_rate": 8.888840019027234e-08, + "loss": 0.8045, + "step": 19938 + }, + { + "epoch": 0.9709054610084483, + "grad_norm": 1.4087356328964233, + "learning_rate": 8.859158262384704e-08, + "loss": 0.923, + "step": 19939 + }, + { + "epoch": 0.9709541547975555, + "grad_norm": 2.0219693183898926, + "learning_rate": 8.829526035551983e-08, + "loss": 0.8375, + "step": 19940 + }, + { + "epoch": 0.9710028485866627, + "grad_norm": 1.4802777767181396, + "learning_rate": 8.799943339266481e-08, + "loss": 0.7545, + "step": 19941 + }, + { + "epoch": 0.97105154237577, + "grad_norm": 1.3349089622497559, + "learning_rate": 8.770410174263833e-08, + "loss": 0.802, + "step": 19942 + }, + { + "epoch": 0.9711002361648772, + "grad_norm": 1.7488033771514893, + "learning_rate": 8.740926541278783e-08, + "loss": 0.8611, + "step": 19943 + }, + { + "epoch": 0.9711489299539844, + "grad_norm": 1.227127194404602, + "learning_rate": 8.711492441044967e-08, + "loss": 0.759, + "step": 19944 + }, + { + "epoch": 0.9711976237430916, + "grad_norm": 1.8086925745010376, + "learning_rate": 8.682107874294021e-08, + "loss": 0.8297, + "step": 19945 + }, + { + "epoch": 0.9712463175321988, + "grad_norm": 1.9845080375671387, + "learning_rate": 8.652772841757585e-08, + "loss": 0.7985, + "step": 19946 + }, + { + "epoch": 0.9712950113213059, + "grad_norm": 1.7791924476623535, + "learning_rate": 8.623487344164627e-08, + "loss": 0.8451, + "step": 19947 + }, + { + "epoch": 0.9713437051104131, + "grad_norm": 1.625084400177002, + "learning_rate": 8.594251382244124e-08, + "loss": 0.7969, + "step": 19948 + }, + { + "epoch": 0.9713923988995203, + "grad_norm": 1.7481844425201416, + "learning_rate": 8.565064956723267e-08, + "loss": 0.8152, + "step": 19949 + }, + { + "epoch": 0.9714410926886275, + "grad_norm": 2.1576905250549316, + "learning_rate": 8.535928068327926e-08, + "loss": 0.8469, + "step": 19950 + }, + { + "epoch": 0.9714897864777348, + "grad_norm": 1.5827486515045166, + "learning_rate": 8.506840717782849e-08, + "loss": 0.7524, + "step": 19951 + }, + { + "epoch": 0.971538480266842, + "grad_norm": 2.0132155418395996, + "learning_rate": 8.477802905811905e-08, + "loss": 0.8076, + "step": 19952 + }, + { + "epoch": 0.9715871740559492, + "grad_norm": 1.4396778345108032, + "learning_rate": 8.448814633137182e-08, + "loss": 0.7873, + "step": 19953 + }, + { + "epoch": 0.9716358678450564, + "grad_norm": 1.3950554132461548, + "learning_rate": 8.419875900479435e-08, + "loss": 0.8673, + "step": 19954 + }, + { + "epoch": 0.9716845616341636, + "grad_norm": 1.73624849319458, + "learning_rate": 8.390986708559201e-08, + "loss": 0.8577, + "step": 19955 + }, + { + "epoch": 0.9717332554232707, + "grad_norm": 1.7579714059829712, + "learning_rate": 8.362147058094572e-08, + "loss": 0.7753, + "step": 19956 + }, + { + "epoch": 0.9717819492123779, + "grad_norm": 1.306052803993225, + "learning_rate": 8.333356949803196e-08, + "loss": 0.7828, + "step": 19957 + }, + { + "epoch": 0.9718306430014851, + "grad_norm": 2.5154030323028564, + "learning_rate": 8.304616384401166e-08, + "loss": 0.8747, + "step": 19958 + }, + { + "epoch": 0.9718793367905924, + "grad_norm": 2.7517077922821045, + "learning_rate": 8.275925362603465e-08, + "loss": 0.9279, + "step": 19959 + }, + { + "epoch": 0.9719280305796996, + "grad_norm": 1.7662265300750732, + "learning_rate": 8.247283885123747e-08, + "loss": 0.7886, + "step": 19960 + }, + { + "epoch": 0.9719767243688068, + "grad_norm": 1.4874811172485352, + "learning_rate": 8.21869195267433e-08, + "loss": 0.763, + "step": 19961 + }, + { + "epoch": 0.972025418157914, + "grad_norm": 1.4988179206848145, + "learning_rate": 8.190149565966421e-08, + "loss": 0.7396, + "step": 19962 + }, + { + "epoch": 0.9720741119470212, + "grad_norm": 1.4944664239883423, + "learning_rate": 8.161656725710343e-08, + "loss": 0.8651, + "step": 19963 + }, + { + "epoch": 0.9721228057361283, + "grad_norm": 2.2276782989501953, + "learning_rate": 8.13321343261464e-08, + "loss": 0.7387, + "step": 19964 + }, + { + "epoch": 0.9721714995252355, + "grad_norm": 1.7382615804672241, + "learning_rate": 8.104819687386745e-08, + "loss": 0.8266, + "step": 19965 + }, + { + "epoch": 0.9722201933143427, + "grad_norm": 1.8333455324172974, + "learning_rate": 8.076475490733205e-08, + "loss": 0.8067, + "step": 19966 + }, + { + "epoch": 0.9722688871034499, + "grad_norm": 1.7375997304916382, + "learning_rate": 8.04818084335901e-08, + "loss": 0.8587, + "step": 19967 + }, + { + "epoch": 0.9723175808925572, + "grad_norm": 1.9241830110549927, + "learning_rate": 8.019935745967822e-08, + "loss": 0.8293, + "step": 19968 + }, + { + "epoch": 0.9723662746816644, + "grad_norm": 0.09225708991289139, + "learning_rate": 7.99174019926241e-08, + "loss": 0.5736, + "step": 19969 + }, + { + "epoch": 0.9724149684707716, + "grad_norm": 1.588198184967041, + "learning_rate": 7.963594203943992e-08, + "loss": 0.7545, + "step": 19970 + }, + { + "epoch": 0.9724636622598788, + "grad_norm": 1.4645187854766846, + "learning_rate": 7.935497760712896e-08, + "loss": 0.774, + "step": 19971 + }, + { + "epoch": 0.972512356048986, + "grad_norm": 1.5681458711624146, + "learning_rate": 7.907450870267896e-08, + "loss": 0.8133, + "step": 19972 + }, + { + "epoch": 0.9725610498380931, + "grad_norm": 1.6220152378082275, + "learning_rate": 7.879453533306658e-08, + "loss": 0.8499, + "step": 19973 + }, + { + "epoch": 0.9726097436272003, + "grad_norm": 1.715732455253601, + "learning_rate": 7.851505750525734e-08, + "loss": 0.8448, + "step": 19974 + }, + { + "epoch": 0.9726584374163075, + "grad_norm": 1.857300043106079, + "learning_rate": 7.823607522620124e-08, + "loss": 0.8372, + "step": 19975 + }, + { + "epoch": 0.9727071312054147, + "grad_norm": 1.6439735889434814, + "learning_rate": 7.79575885028394e-08, + "loss": 0.7589, + "step": 19976 + }, + { + "epoch": 0.972755824994522, + "grad_norm": 2.362433671951294, + "learning_rate": 7.767959734209963e-08, + "loss": 0.7749, + "step": 19977 + }, + { + "epoch": 0.9728045187836292, + "grad_norm": 1.3367682695388794, + "learning_rate": 7.740210175089413e-08, + "loss": 0.8038, + "step": 19978 + }, + { + "epoch": 0.9728532125727364, + "grad_norm": 1.3555058240890503, + "learning_rate": 7.712510173613074e-08, + "loss": 0.8519, + "step": 19979 + }, + { + "epoch": 0.9729019063618436, + "grad_norm": 1.3881893157958984, + "learning_rate": 7.684859730469507e-08, + "loss": 0.7321, + "step": 19980 + }, + { + "epoch": 0.9729506001509507, + "grad_norm": 1.7275840044021606, + "learning_rate": 7.657258846346827e-08, + "loss": 0.7313, + "step": 19981 + }, + { + "epoch": 0.9729992939400579, + "grad_norm": 1.4216313362121582, + "learning_rate": 7.629707521931374e-08, + "loss": 0.8176, + "step": 19982 + }, + { + "epoch": 0.9730479877291651, + "grad_norm": 1.6030898094177246, + "learning_rate": 7.602205757908821e-08, + "loss": 0.8368, + "step": 19983 + }, + { + "epoch": 0.9730966815182723, + "grad_norm": 3.666517496109009, + "learning_rate": 7.574753554962843e-08, + "loss": 0.7658, + "step": 19984 + }, + { + "epoch": 0.9731453753073795, + "grad_norm": 1.2940185070037842, + "learning_rate": 7.547350913776674e-08, + "loss": 0.8647, + "step": 19985 + }, + { + "epoch": 0.9731940690964868, + "grad_norm": 1.3109625577926636, + "learning_rate": 7.519997835031767e-08, + "loss": 0.8153, + "step": 19986 + }, + { + "epoch": 0.973242762885594, + "grad_norm": 1.5596219301223755, + "learning_rate": 7.492694319408467e-08, + "loss": 0.8206, + "step": 19987 + }, + { + "epoch": 0.9732914566747012, + "grad_norm": 1.416897177696228, + "learning_rate": 7.46544036758623e-08, + "loss": 0.8223, + "step": 19988 + }, + { + "epoch": 0.9733401504638084, + "grad_norm": 1.584509253501892, + "learning_rate": 7.438235980242736e-08, + "loss": 0.8089, + "step": 19989 + }, + { + "epoch": 0.9733888442529155, + "grad_norm": 1.7819418907165527, + "learning_rate": 7.411081158054778e-08, + "loss": 0.8505, + "step": 19990 + }, + { + "epoch": 0.9734375380420227, + "grad_norm": 1.3038558959960938, + "learning_rate": 7.383975901697816e-08, + "loss": 0.7464, + "step": 19991 + }, + { + "epoch": 0.9734862318311299, + "grad_norm": 1.9781404733657837, + "learning_rate": 7.356920211846197e-08, + "loss": 0.8284, + "step": 19992 + }, + { + "epoch": 0.9735349256202371, + "grad_norm": 1.5146713256835938, + "learning_rate": 7.32991408917294e-08, + "loss": 0.8913, + "step": 19993 + }, + { + "epoch": 0.9735836194093443, + "grad_norm": 1.8095006942749023, + "learning_rate": 7.30295753434973e-08, + "loss": 0.8274, + "step": 19994 + }, + { + "epoch": 0.9736323131984516, + "grad_norm": 2.1770031452178955, + "learning_rate": 7.276050548047142e-08, + "loss": 0.7534, + "step": 19995 + }, + { + "epoch": 0.9736810069875588, + "grad_norm": 1.4838229417800903, + "learning_rate": 7.249193130934417e-08, + "loss": 0.8584, + "step": 19996 + }, + { + "epoch": 0.973729700776666, + "grad_norm": 1.8088414669036865, + "learning_rate": 7.222385283679689e-08, + "loss": 0.7974, + "step": 19997 + }, + { + "epoch": 0.9737783945657731, + "grad_norm": 0.10085171461105347, + "learning_rate": 7.19562700694998e-08, + "loss": 0.6179, + "step": 19998 + }, + { + "epoch": 0.9738270883548803, + "grad_norm": 1.7629497051239014, + "learning_rate": 7.168918301410533e-08, + "loss": 0.805, + "step": 19999 + }, + { + "epoch": 0.9738757821439875, + "grad_norm": 0.10196483135223389, + "learning_rate": 7.142259167726151e-08, + "loss": 0.6155, + "step": 20000 + }, + { + "epoch": 0.9739244759330947, + "grad_norm": 1.587172031402588, + "learning_rate": 7.115649606559638e-08, + "loss": 0.7973, + "step": 20001 + }, + { + "epoch": 0.9739731697222019, + "grad_norm": 3.526590347290039, + "learning_rate": 7.089089618573131e-08, + "loss": 0.8007, + "step": 20002 + }, + { + "epoch": 0.9740218635113091, + "grad_norm": 1.567044734954834, + "learning_rate": 7.062579204426989e-08, + "loss": 0.7181, + "step": 20003 + }, + { + "epoch": 0.9740705573004164, + "grad_norm": 1.2601853609085083, + "learning_rate": 7.036118364781131e-08, + "loss": 0.7258, + "step": 20004 + }, + { + "epoch": 0.9741192510895236, + "grad_norm": 2.594513416290283, + "learning_rate": 7.009707100293472e-08, + "loss": 0.8013, + "step": 20005 + }, + { + "epoch": 0.9741679448786307, + "grad_norm": 1.8998335599899292, + "learning_rate": 6.983345411621045e-08, + "loss": 0.8405, + "step": 20006 + }, + { + "epoch": 0.9742166386677379, + "grad_norm": 1.694817066192627, + "learning_rate": 6.957033299419769e-08, + "loss": 0.8449, + "step": 20007 + }, + { + "epoch": 0.9742653324568451, + "grad_norm": 1.703195571899414, + "learning_rate": 6.930770764343786e-08, + "loss": 0.8734, + "step": 20008 + }, + { + "epoch": 0.9743140262459523, + "grad_norm": 1.4014209508895874, + "learning_rate": 6.904557807046797e-08, + "loss": 0.8078, + "step": 20009 + }, + { + "epoch": 0.9743627200350595, + "grad_norm": 3.9368135929107666, + "learning_rate": 6.878394428180501e-08, + "loss": 0.7889, + "step": 20010 + }, + { + "epoch": 0.9744114138241667, + "grad_norm": 1.4566196203231812, + "learning_rate": 6.852280628395935e-08, + "loss": 0.8074, + "step": 20011 + }, + { + "epoch": 0.974460107613274, + "grad_norm": 1.5404659509658813, + "learning_rate": 6.826216408342579e-08, + "loss": 0.7814, + "step": 20012 + }, + { + "epoch": 0.9745088014023812, + "grad_norm": 1.5164597034454346, + "learning_rate": 6.800201768668802e-08, + "loss": 0.8095, + "step": 20013 + }, + { + "epoch": 0.9745574951914884, + "grad_norm": 2.3051140308380127, + "learning_rate": 6.774236710021864e-08, + "loss": 0.8437, + "step": 20014 + }, + { + "epoch": 0.9746061889805955, + "grad_norm": 1.5508393049240112, + "learning_rate": 6.748321233047472e-08, + "loss": 0.8125, + "step": 20015 + }, + { + "epoch": 0.9746548827697027, + "grad_norm": 2.054513931274414, + "learning_rate": 6.722455338390221e-08, + "loss": 0.8106, + "step": 20016 + }, + { + "epoch": 0.9747035765588099, + "grad_norm": 2.3364338874816895, + "learning_rate": 6.696639026693819e-08, + "loss": 0.7681, + "step": 20017 + }, + { + "epoch": 0.9747522703479171, + "grad_norm": 1.5359811782836914, + "learning_rate": 6.670872298600195e-08, + "loss": 0.8769, + "step": 20018 + }, + { + "epoch": 0.9748009641370243, + "grad_norm": 2.606391429901123, + "learning_rate": 6.645155154750393e-08, + "loss": 0.819, + "step": 20019 + }, + { + "epoch": 0.9748496579261315, + "grad_norm": 2.423414707183838, + "learning_rate": 6.619487595783902e-08, + "loss": 0.8151, + "step": 20020 + }, + { + "epoch": 0.9748983517152388, + "grad_norm": 1.128117322921753, + "learning_rate": 6.593869622339544e-08, + "loss": 0.8859, + "step": 20021 + }, + { + "epoch": 0.974947045504346, + "grad_norm": 1.1847012042999268, + "learning_rate": 6.568301235054364e-08, + "loss": 0.8591, + "step": 20022 + }, + { + "epoch": 0.9749957392934531, + "grad_norm": 1.298608422279358, + "learning_rate": 6.542782434564743e-08, + "loss": 0.8164, + "step": 20023 + }, + { + "epoch": 0.9750444330825603, + "grad_norm": 2.04569673538208, + "learning_rate": 6.517313221504839e-08, + "loss": 0.7347, + "step": 20024 + }, + { + "epoch": 0.9750931268716675, + "grad_norm": 1.7098286151885986, + "learning_rate": 6.491893596508592e-08, + "loss": 0.7517, + "step": 20025 + }, + { + "epoch": 0.9751418206607747, + "grad_norm": 1.430586338043213, + "learning_rate": 6.466523560208383e-08, + "loss": 0.7995, + "step": 20026 + }, + { + "epoch": 0.9751905144498819, + "grad_norm": 1.3630369901657104, + "learning_rate": 6.44120311323504e-08, + "loss": 0.8229, + "step": 20027 + }, + { + "epoch": 0.9752392082389891, + "grad_norm": 1.4258142709732056, + "learning_rate": 6.415932256218505e-08, + "loss": 0.7955, + "step": 20028 + }, + { + "epoch": 0.9752879020280963, + "grad_norm": 1.4820059537887573, + "learning_rate": 6.390710989787385e-08, + "loss": 0.8743, + "step": 20029 + }, + { + "epoch": 0.9753365958172036, + "grad_norm": 1.3110510110855103, + "learning_rate": 6.365539314569403e-08, + "loss": 0.7029, + "step": 20030 + }, + { + "epoch": 0.9753852896063108, + "grad_norm": 1.6746851205825806, + "learning_rate": 6.340417231190055e-08, + "loss": 0.8108, + "step": 20031 + }, + { + "epoch": 0.9754339833954179, + "grad_norm": 1.407719373703003, + "learning_rate": 6.315344740274842e-08, + "loss": 0.8229, + "step": 20032 + }, + { + "epoch": 0.9754826771845251, + "grad_norm": 2.0822858810424805, + "learning_rate": 6.290321842447045e-08, + "loss": 0.7584, + "step": 20033 + }, + { + "epoch": 0.9755313709736323, + "grad_norm": 1.4681978225708008, + "learning_rate": 6.265348538329274e-08, + "loss": 0.7633, + "step": 20034 + }, + { + "epoch": 0.9755800647627395, + "grad_norm": 2.1515979766845703, + "learning_rate": 6.240424828542813e-08, + "loss": 0.7396, + "step": 20035 + }, + { + "epoch": 0.9756287585518467, + "grad_norm": 1.3229528665542603, + "learning_rate": 6.215550713707607e-08, + "loss": 0.7686, + "step": 20036 + }, + { + "epoch": 0.9756774523409539, + "grad_norm": 1.5630437135696411, + "learning_rate": 6.190726194442276e-08, + "loss": 0.7376, + "step": 20037 + }, + { + "epoch": 0.9757261461300611, + "grad_norm": 1.4801843166351318, + "learning_rate": 6.165951271364546e-08, + "loss": 0.7521, + "step": 20038 + }, + { + "epoch": 0.9757748399191684, + "grad_norm": 1.6009103059768677, + "learning_rate": 6.14122594509059e-08, + "loss": 0.7647, + "step": 20039 + }, + { + "epoch": 0.9758235337082755, + "grad_norm": 1.494309425354004, + "learning_rate": 6.116550216235251e-08, + "loss": 0.8873, + "step": 20040 + }, + { + "epoch": 0.9758722274973827, + "grad_norm": 1.381597638130188, + "learning_rate": 6.091924085412481e-08, + "loss": 0.7562, + "step": 20041 + }, + { + "epoch": 0.9759209212864899, + "grad_norm": 1.271303415298462, + "learning_rate": 6.067347553235125e-08, + "loss": 0.7655, + "step": 20042 + }, + { + "epoch": 0.9759696150755971, + "grad_norm": 1.7501380443572998, + "learning_rate": 6.042820620314027e-08, + "loss": 0.7789, + "step": 20043 + }, + { + "epoch": 0.9760183088647043, + "grad_norm": 1.4726972579956055, + "learning_rate": 6.018343287259809e-08, + "loss": 0.8142, + "step": 20044 + }, + { + "epoch": 0.9760670026538115, + "grad_norm": 1.6672427654266357, + "learning_rate": 5.993915554680873e-08, + "loss": 0.7397, + "step": 20045 + }, + { + "epoch": 0.9761156964429187, + "grad_norm": 1.5654535293579102, + "learning_rate": 5.969537423185401e-08, + "loss": 0.7367, + "step": 20046 + }, + { + "epoch": 0.976164390232026, + "grad_norm": 1.8645776510238647, + "learning_rate": 5.94520889337935e-08, + "loss": 0.7333, + "step": 20047 + }, + { + "epoch": 0.9762130840211332, + "grad_norm": 1.900786280632019, + "learning_rate": 5.920929965868017e-08, + "loss": 0.8804, + "step": 20048 + }, + { + "epoch": 0.9762617778102403, + "grad_norm": 2.534559488296509, + "learning_rate": 5.8967006412553594e-08, + "loss": 0.6649, + "step": 20049 + }, + { + "epoch": 0.9763104715993475, + "grad_norm": 1.7036930322647095, + "learning_rate": 5.8725209201440094e-08, + "loss": 0.7419, + "step": 20050 + }, + { + "epoch": 0.9763591653884547, + "grad_norm": 1.6924983263015747, + "learning_rate": 5.848390803135706e-08, + "loss": 0.8534, + "step": 20051 + }, + { + "epoch": 0.9764078591775619, + "grad_norm": 1.7523809671401978, + "learning_rate": 5.824310290830193e-08, + "loss": 0.8313, + "step": 20052 + }, + { + "epoch": 0.9764565529666691, + "grad_norm": 1.3698078393936157, + "learning_rate": 5.800279383827212e-08, + "loss": 0.8279, + "step": 20053 + }, + { + "epoch": 0.9765052467557763, + "grad_norm": 1.4103037118911743, + "learning_rate": 5.7762980827238415e-08, + "loss": 0.9207, + "step": 20054 + }, + { + "epoch": 0.9765539405448835, + "grad_norm": 1.3242738246917725, + "learning_rate": 5.752366388116937e-08, + "loss": 0.7914, + "step": 20055 + }, + { + "epoch": 0.9766026343339907, + "grad_norm": 2.498176336288452, + "learning_rate": 5.728484300601578e-08, + "loss": 0.7852, + "step": 20056 + }, + { + "epoch": 0.9766513281230979, + "grad_norm": 2.528581380844116, + "learning_rate": 5.704651820772178e-08, + "loss": 0.819, + "step": 20057 + }, + { + "epoch": 0.9767000219122051, + "grad_norm": 1.5277252197265625, + "learning_rate": 5.6808689492213744e-08, + "loss": 0.7596, + "step": 20058 + }, + { + "epoch": 0.9767487157013123, + "grad_norm": 1.156567931175232, + "learning_rate": 5.6571356865409153e-08, + "loss": 0.841, + "step": 20059 + }, + { + "epoch": 0.9767974094904195, + "grad_norm": 1.463792324066162, + "learning_rate": 5.6334520333209964e-08, + "loss": 0.7967, + "step": 20060 + }, + { + "epoch": 0.9768461032795267, + "grad_norm": 1.4755162000656128, + "learning_rate": 5.609817990150701e-08, + "loss": 0.8077, + "step": 20061 + }, + { + "epoch": 0.9768947970686339, + "grad_norm": 5.9559407234191895, + "learning_rate": 5.586233557618004e-08, + "loss": 0.7108, + "step": 20062 + }, + { + "epoch": 0.9769434908577411, + "grad_norm": 1.633418083190918, + "learning_rate": 5.562698736309546e-08, + "loss": 0.8175, + "step": 20063 + }, + { + "epoch": 0.9769921846468483, + "grad_norm": 2.358376979827881, + "learning_rate": 5.53921352681086e-08, + "loss": 0.7318, + "step": 20064 + }, + { + "epoch": 0.9770408784359554, + "grad_norm": 1.9492950439453125, + "learning_rate": 5.515777929706145e-08, + "loss": 0.6733, + "step": 20065 + }, + { + "epoch": 0.9770895722250627, + "grad_norm": 1.3535187244415283, + "learning_rate": 5.492391945578268e-08, + "loss": 0.8113, + "step": 20066 + }, + { + "epoch": 0.9771382660141699, + "grad_norm": 1.727384090423584, + "learning_rate": 5.469055575008986e-08, + "loss": 0.8594, + "step": 20067 + }, + { + "epoch": 0.9771869598032771, + "grad_norm": 3.4219048023223877, + "learning_rate": 5.445768818578945e-08, + "loss": 0.7459, + "step": 20068 + }, + { + "epoch": 0.9772356535923843, + "grad_norm": 2.6419947147369385, + "learning_rate": 5.4225316768670156e-08, + "loss": 0.7914, + "step": 20069 + }, + { + "epoch": 0.9772843473814915, + "grad_norm": 1.3809735774993896, + "learning_rate": 5.399344150451624e-08, + "loss": 0.8102, + "step": 20070 + }, + { + "epoch": 0.9773330411705987, + "grad_norm": 1.7006690502166748, + "learning_rate": 5.3762062399094206e-08, + "loss": 0.7999, + "step": 20071 + }, + { + "epoch": 0.9773817349597059, + "grad_norm": 2.9804868698120117, + "learning_rate": 5.353117945815722e-08, + "loss": 0.806, + "step": 20072 + }, + { + "epoch": 0.9774304287488131, + "grad_norm": 1.9558192491531372, + "learning_rate": 5.3300792687454026e-08, + "loss": 0.7842, + "step": 20073 + }, + { + "epoch": 0.9774791225379202, + "grad_norm": 1.4249606132507324, + "learning_rate": 5.307090209271115e-08, + "loss": 0.8295, + "step": 20074 + }, + { + "epoch": 0.9775278163270275, + "grad_norm": 1.504094123840332, + "learning_rate": 5.284150767964624e-08, + "loss": 0.8446, + "step": 20075 + }, + { + "epoch": 0.9775765101161347, + "grad_norm": 1.61328125, + "learning_rate": 5.2612609453970286e-08, + "loss": 0.7782, + "step": 20076 + }, + { + "epoch": 0.9776252039052419, + "grad_norm": 1.369533658027649, + "learning_rate": 5.238420742137429e-08, + "loss": 0.8168, + "step": 20077 + }, + { + "epoch": 0.9776738976943491, + "grad_norm": 1.4277434349060059, + "learning_rate": 5.215630158753815e-08, + "loss": 0.8587, + "step": 20078 + }, + { + "epoch": 0.9777225914834563, + "grad_norm": 1.4594653844833374, + "learning_rate": 5.192889195813289e-08, + "loss": 0.7523, + "step": 20079 + }, + { + "epoch": 0.9777712852725635, + "grad_norm": 1.4321796894073486, + "learning_rate": 5.17019785388162e-08, + "loss": 0.7658, + "step": 20080 + }, + { + "epoch": 0.9778199790616707, + "grad_norm": 1.414616584777832, + "learning_rate": 5.147556133523024e-08, + "loss": 0.7748, + "step": 20081 + }, + { + "epoch": 0.9778686728507778, + "grad_norm": 3.286982297897339, + "learning_rate": 5.12496403530105e-08, + "loss": 0.7947, + "step": 20082 + }, + { + "epoch": 0.977917366639885, + "grad_norm": 1.83515202999115, + "learning_rate": 5.102421559777471e-08, + "loss": 0.7965, + "step": 20083 + }, + { + "epoch": 0.9779660604289923, + "grad_norm": 1.8816466331481934, + "learning_rate": 5.0799287075129486e-08, + "loss": 0.8072, + "step": 20084 + }, + { + "epoch": 0.9780147542180995, + "grad_norm": 1.2160090208053589, + "learning_rate": 5.057485479067037e-08, + "loss": 0.7951, + "step": 20085 + }, + { + "epoch": 0.9780634480072067, + "grad_norm": 2.124485969543457, + "learning_rate": 5.035091874998177e-08, + "loss": 0.7484, + "step": 20086 + }, + { + "epoch": 0.9781121417963139, + "grad_norm": 1.2453759908676147, + "learning_rate": 5.012747895863257e-08, + "loss": 0.7834, + "step": 20087 + }, + { + "epoch": 0.9781608355854211, + "grad_norm": 1.2353922128677368, + "learning_rate": 4.990453542218055e-08, + "loss": 0.7829, + "step": 20088 + }, + { + "epoch": 0.9782095293745283, + "grad_norm": 2.0155129432678223, + "learning_rate": 4.9682088146172366e-08, + "loss": 0.7445, + "step": 20089 + }, + { + "epoch": 0.9782582231636355, + "grad_norm": 1.433760643005371, + "learning_rate": 4.9460137136143614e-08, + "loss": 0.8382, + "step": 20090 + }, + { + "epoch": 0.9783069169527426, + "grad_norm": 1.7303833961486816, + "learning_rate": 4.923868239760987e-08, + "loss": 0.7637, + "step": 20091 + }, + { + "epoch": 0.9783556107418498, + "grad_norm": 1.7676888704299927, + "learning_rate": 4.90177239360845e-08, + "loss": 0.8876, + "step": 20092 + }, + { + "epoch": 0.9784043045309571, + "grad_norm": 1.8390082120895386, + "learning_rate": 4.8797261757063117e-08, + "loss": 0.7984, + "step": 20093 + }, + { + "epoch": 0.9784529983200643, + "grad_norm": 1.336366057395935, + "learning_rate": 4.8577295866027995e-08, + "loss": 0.8162, + "step": 20094 + }, + { + "epoch": 0.9785016921091715, + "grad_norm": 1.981202483177185, + "learning_rate": 4.835782626845032e-08, + "loss": 0.7548, + "step": 20095 + }, + { + "epoch": 0.9785503858982787, + "grad_norm": 1.348197340965271, + "learning_rate": 4.813885296979237e-08, + "loss": 0.8403, + "step": 20096 + }, + { + "epoch": 0.9785990796873859, + "grad_norm": 1.681677222251892, + "learning_rate": 4.79203759754987e-08, + "loss": 0.8416, + "step": 20097 + }, + { + "epoch": 0.9786477734764931, + "grad_norm": 1.5508488416671753, + "learning_rate": 4.7702395291004955e-08, + "loss": 0.8146, + "step": 20098 + }, + { + "epoch": 0.9786964672656002, + "grad_norm": 2.76422119140625, + "learning_rate": 4.7484910921735684e-08, + "loss": 0.8355, + "step": 20099 + }, + { + "epoch": 0.9787451610547074, + "grad_norm": 1.4278771877288818, + "learning_rate": 4.726792287309545e-08, + "loss": 0.7735, + "step": 20100 + }, + { + "epoch": 0.9787938548438146, + "grad_norm": 1.6293339729309082, + "learning_rate": 4.7051431150486604e-08, + "loss": 0.8575, + "step": 20101 + }, + { + "epoch": 0.9788425486329219, + "grad_norm": 1.3478425741195679, + "learning_rate": 4.68354357592915e-08, + "loss": 0.8066, + "step": 20102 + }, + { + "epoch": 0.9788912424220291, + "grad_norm": 3.112182140350342, + "learning_rate": 4.661993670488363e-08, + "loss": 0.8125, + "step": 20103 + }, + { + "epoch": 0.9789399362111363, + "grad_norm": 1.6090279817581177, + "learning_rate": 4.6404933992627574e-08, + "loss": 0.8236, + "step": 20104 + }, + { + "epoch": 0.9789886300002435, + "grad_norm": 3.744558572769165, + "learning_rate": 4.619042762786352e-08, + "loss": 0.8806, + "step": 20105 + }, + { + "epoch": 0.9790373237893507, + "grad_norm": 1.440844178199768, + "learning_rate": 4.597641761593607e-08, + "loss": 0.8696, + "step": 20106 + }, + { + "epoch": 0.9790860175784578, + "grad_norm": 1.3403664827346802, + "learning_rate": 4.576290396216099e-08, + "loss": 0.7968, + "step": 20107 + }, + { + "epoch": 0.979134711367565, + "grad_norm": 1.6075671911239624, + "learning_rate": 4.5549886671854005e-08, + "loss": 0.8587, + "step": 20108 + }, + { + "epoch": 0.9791834051566722, + "grad_norm": 0.09906001389026642, + "learning_rate": 4.533736575031533e-08, + "loss": 0.6998, + "step": 20109 + }, + { + "epoch": 0.9792320989457795, + "grad_norm": 1.5529404878616333, + "learning_rate": 4.512534120282741e-08, + "loss": 0.7868, + "step": 20110 + }, + { + "epoch": 0.9792807927348867, + "grad_norm": 1.902390480041504, + "learning_rate": 4.491381303466602e-08, + "loss": 0.7889, + "step": 20111 + }, + { + "epoch": 0.9793294865239939, + "grad_norm": 1.6841323375701904, + "learning_rate": 4.4702781251091396e-08, + "loss": 0.774, + "step": 20112 + }, + { + "epoch": 0.9793781803131011, + "grad_norm": 1.5251188278198242, + "learning_rate": 4.449224585735712e-08, + "loss": 0.8459, + "step": 20113 + }, + { + "epoch": 0.9794268741022083, + "grad_norm": 1.311059832572937, + "learning_rate": 4.4282206858696775e-08, + "loss": 0.7064, + "step": 20114 + }, + { + "epoch": 0.9794755678913155, + "grad_norm": 1.4805078506469727, + "learning_rate": 4.407266426033507e-08, + "loss": 0.8626, + "step": 20115 + }, + { + "epoch": 0.9795242616804226, + "grad_norm": 1.4482272863388062, + "learning_rate": 4.386361806748562e-08, + "loss": 0.8035, + "step": 20116 + }, + { + "epoch": 0.9795729554695298, + "grad_norm": 1.7466236352920532, + "learning_rate": 4.365506828534871e-08, + "loss": 0.8577, + "step": 20117 + }, + { + "epoch": 0.979621649258637, + "grad_norm": 1.4127044677734375, + "learning_rate": 4.34470149191113e-08, + "loss": 0.8366, + "step": 20118 + }, + { + "epoch": 0.9796703430477443, + "grad_norm": 1.6167439222335815, + "learning_rate": 4.323945797394924e-08, + "loss": 0.8229, + "step": 20119 + }, + { + "epoch": 0.9797190368368515, + "grad_norm": 1.943199872970581, + "learning_rate": 4.30323974550273e-08, + "loss": 0.7933, + "step": 20120 + }, + { + "epoch": 0.9797677306259587, + "grad_norm": 0.09517655521631241, + "learning_rate": 4.2825833367490246e-08, + "loss": 0.6018, + "step": 20121 + }, + { + "epoch": 0.9798164244150659, + "grad_norm": 2.196712017059326, + "learning_rate": 4.261976571648285e-08, + "loss": 0.811, + "step": 20122 + }, + { + "epoch": 0.9798651182041731, + "grad_norm": 1.7805471420288086, + "learning_rate": 4.241419450712769e-08, + "loss": 0.7326, + "step": 20123 + }, + { + "epoch": 0.9799138119932802, + "grad_norm": 2.3672258853912354, + "learning_rate": 4.220911974453845e-08, + "loss": 0.8509, + "step": 20124 + }, + { + "epoch": 0.9799625057823874, + "grad_norm": 1.6719989776611328, + "learning_rate": 4.200454143381771e-08, + "loss": 0.7971, + "step": 20125 + }, + { + "epoch": 0.9800111995714946, + "grad_norm": 1.7553675174713135, + "learning_rate": 4.1800459580052524e-08, + "loss": 0.7534, + "step": 20126 + }, + { + "epoch": 0.9800598933606018, + "grad_norm": 1.436247706413269, + "learning_rate": 4.1596874188323255e-08, + "loss": 0.8787, + "step": 20127 + }, + { + "epoch": 0.980108587149709, + "grad_norm": 1.4177522659301758, + "learning_rate": 4.1393785263688094e-08, + "loss": 0.8699, + "step": 20128 + }, + { + "epoch": 0.9801572809388163, + "grad_norm": 1.437509298324585, + "learning_rate": 4.1191192811205204e-08, + "loss": 0.8351, + "step": 20129 + }, + { + "epoch": 0.9802059747279235, + "grad_norm": 1.367400884628296, + "learning_rate": 4.0989096835910566e-08, + "loss": 0.7626, + "step": 20130 + }, + { + "epoch": 0.9802546685170307, + "grad_norm": 1.3345917463302612, + "learning_rate": 4.078749734282905e-08, + "loss": 0.9218, + "step": 20131 + }, + { + "epoch": 0.9803033623061379, + "grad_norm": 1.3889412879943848, + "learning_rate": 4.058639433698108e-08, + "loss": 0.8345, + "step": 20132 + }, + { + "epoch": 0.980352056095245, + "grad_norm": 0.09862914681434631, + "learning_rate": 4.038578782336711e-08, + "loss": 0.6179, + "step": 20133 + }, + { + "epoch": 0.9804007498843522, + "grad_norm": 1.2189379930496216, + "learning_rate": 4.0185677806974244e-08, + "loss": 0.7684, + "step": 20134 + }, + { + "epoch": 0.9804494436734594, + "grad_norm": 2.3561012744903564, + "learning_rate": 3.9986064292785176e-08, + "loss": 0.7814, + "step": 20135 + }, + { + "epoch": 0.9804981374625666, + "grad_norm": 1.3857553005218506, + "learning_rate": 3.978694728576038e-08, + "loss": 0.7539, + "step": 20136 + }, + { + "epoch": 0.9805468312516739, + "grad_norm": 2.3668594360351562, + "learning_rate": 3.958832679085589e-08, + "loss": 0.8313, + "step": 20137 + }, + { + "epoch": 0.9805955250407811, + "grad_norm": 1.495819091796875, + "learning_rate": 3.93902028130122e-08, + "loss": 0.8343, + "step": 20138 + }, + { + "epoch": 0.9806442188298883, + "grad_norm": 3.225999593734741, + "learning_rate": 3.9192575357156484e-08, + "loss": 0.8032, + "step": 20139 + }, + { + "epoch": 0.9806929126189955, + "grad_norm": 1.72428297996521, + "learning_rate": 3.8995444428204796e-08, + "loss": 0.7714, + "step": 20140 + }, + { + "epoch": 0.9807416064081026, + "grad_norm": 1.8669018745422363, + "learning_rate": 3.8798810031062115e-08, + "loss": 0.8472, + "step": 20141 + }, + { + "epoch": 0.9807903001972098, + "grad_norm": 1.5790603160858154, + "learning_rate": 3.860267217062008e-08, + "loss": 0.8554, + "step": 20142 + }, + { + "epoch": 0.980838993986317, + "grad_norm": 2.3078396320343018, + "learning_rate": 3.840703085175479e-08, + "loss": 0.7871, + "step": 20143 + }, + { + "epoch": 0.9808876877754242, + "grad_norm": 2.0424346923828125, + "learning_rate": 3.821188607933568e-08, + "loss": 0.898, + "step": 20144 + }, + { + "epoch": 0.9809363815645314, + "grad_norm": 1.282684564590454, + "learning_rate": 3.8017237858214426e-08, + "loss": 0.8667, + "step": 20145 + }, + { + "epoch": 0.9809850753536387, + "grad_norm": 2.020249843597412, + "learning_rate": 3.782308619323605e-08, + "loss": 0.8195, + "step": 20146 + }, + { + "epoch": 0.9810337691427459, + "grad_norm": 1.4444899559020996, + "learning_rate": 3.762943108922779e-08, + "loss": 0.7629, + "step": 20147 + }, + { + "epoch": 0.9810824629318531, + "grad_norm": 1.8110677003860474, + "learning_rate": 3.743627255101023e-08, + "loss": 0.7348, + "step": 20148 + }, + { + "epoch": 0.9811311567209603, + "grad_norm": 1.817643165588379, + "learning_rate": 3.7243610583381774e-08, + "loss": 0.7105, + "step": 20149 + }, + { + "epoch": 0.9811798505100674, + "grad_norm": 1.5493112802505493, + "learning_rate": 3.705144519114079e-08, + "loss": 0.7079, + "step": 20150 + }, + { + "epoch": 0.9812285442991746, + "grad_norm": 1.3198981285095215, + "learning_rate": 3.685977637906346e-08, + "loss": 0.8127, + "step": 20151 + }, + { + "epoch": 0.9812772380882818, + "grad_norm": 1.7298353910446167, + "learning_rate": 3.666860415192153e-08, + "loss": 0.8002, + "step": 20152 + }, + { + "epoch": 0.981325931877389, + "grad_norm": 6.367903709411621, + "learning_rate": 3.647792851446674e-08, + "loss": 0.7838, + "step": 20153 + }, + { + "epoch": 0.9813746256664962, + "grad_norm": 1.6369456052780151, + "learning_rate": 3.62877494714442e-08, + "loss": 0.8184, + "step": 20154 + }, + { + "epoch": 0.9814233194556035, + "grad_norm": 2.0308339595794678, + "learning_rate": 3.609806702758345e-08, + "loss": 0.7457, + "step": 20155 + }, + { + "epoch": 0.9814720132447107, + "grad_norm": 1.9891654253005981, + "learning_rate": 3.590888118760294e-08, + "loss": 0.7468, + "step": 20156 + }, + { + "epoch": 0.9815207070338179, + "grad_norm": 1.5164330005645752, + "learning_rate": 3.5720191956212235e-08, + "loss": 0.8153, + "step": 20157 + }, + { + "epoch": 0.981569400822925, + "grad_norm": 1.8479194641113281, + "learning_rate": 3.55319993380987e-08, + "loss": 0.8379, + "step": 20158 + }, + { + "epoch": 0.9816180946120322, + "grad_norm": 4.145127296447754, + "learning_rate": 3.534430333794969e-08, + "loss": 0.7764, + "step": 20159 + }, + { + "epoch": 0.9816667884011394, + "grad_norm": 1.7685996294021606, + "learning_rate": 3.5157103960430375e-08, + "loss": 0.7584, + "step": 20160 + }, + { + "epoch": 0.9817154821902466, + "grad_norm": 1.3655411005020142, + "learning_rate": 3.4970401210197016e-08, + "loss": 0.7922, + "step": 20161 + }, + { + "epoch": 0.9817641759793538, + "grad_norm": 1.5064682960510254, + "learning_rate": 3.478419509189701e-08, + "loss": 0.8147, + "step": 20162 + }, + { + "epoch": 0.981812869768461, + "grad_norm": 1.5401909351348877, + "learning_rate": 3.4598485610159994e-08, + "loss": 0.7588, + "step": 20163 + }, + { + "epoch": 0.9818615635575683, + "grad_norm": 1.6621276140213013, + "learning_rate": 3.441327276960671e-08, + "loss": 0.7963, + "step": 20164 + }, + { + "epoch": 0.9819102573466755, + "grad_norm": 3.927792549133301, + "learning_rate": 3.422855657484236e-08, + "loss": 0.8405, + "step": 20165 + }, + { + "epoch": 0.9819589511357826, + "grad_norm": 1.701019048690796, + "learning_rate": 3.4044337030463284e-08, + "loss": 0.7765, + "step": 20166 + }, + { + "epoch": 0.9820076449248898, + "grad_norm": 4.9008331298828125, + "learning_rate": 3.386061414105246e-08, + "loss": 0.76, + "step": 20167 + }, + { + "epoch": 0.982056338713997, + "grad_norm": 2.613865852355957, + "learning_rate": 3.367738791117736e-08, + "loss": 0.8068, + "step": 20168 + }, + { + "epoch": 0.9821050325031042, + "grad_norm": 1.7237224578857422, + "learning_rate": 3.3494658345401e-08, + "loss": 0.7215, + "step": 20169 + }, + { + "epoch": 0.9821537262922114, + "grad_norm": 1.4436661005020142, + "learning_rate": 3.3312425448261966e-08, + "loss": 0.7106, + "step": 20170 + }, + { + "epoch": 0.9822024200813186, + "grad_norm": 2.1816699504852295, + "learning_rate": 3.313068922430107e-08, + "loss": 0.8235, + "step": 20171 + }, + { + "epoch": 0.9822511138704259, + "grad_norm": 1.4699465036392212, + "learning_rate": 3.294944967803027e-08, + "loss": 0.9084, + "step": 20172 + }, + { + "epoch": 0.9822998076595331, + "grad_norm": 1.8914154767990112, + "learning_rate": 3.276870681396593e-08, + "loss": 0.7884, + "step": 20173 + }, + { + "epoch": 0.9823485014486403, + "grad_norm": 1.3659318685531616, + "learning_rate": 3.258846063659782e-08, + "loss": 0.8011, + "step": 20174 + }, + { + "epoch": 0.9823971952377474, + "grad_norm": 1.6851264238357544, + "learning_rate": 3.2408711150413444e-08, + "loss": 0.7935, + "step": 20175 + }, + { + "epoch": 0.9824458890268546, + "grad_norm": 1.823243260383606, + "learning_rate": 3.222945835988478e-08, + "loss": 0.8655, + "step": 20176 + }, + { + "epoch": 0.9824945828159618, + "grad_norm": 2.0582704544067383, + "learning_rate": 3.205070226946605e-08, + "loss": 0.8044, + "step": 20177 + }, + { + "epoch": 0.982543276605069, + "grad_norm": 1.4438860416412354, + "learning_rate": 3.1872442883609244e-08, + "loss": 0.8154, + "step": 20178 + }, + { + "epoch": 0.9825919703941762, + "grad_norm": 1.8472416400909424, + "learning_rate": 3.1694680206744157e-08, + "loss": 0.7963, + "step": 20179 + }, + { + "epoch": 0.9826406641832834, + "grad_norm": 1.171567678451538, + "learning_rate": 3.151741424329613e-08, + "loss": 0.8343, + "step": 20180 + }, + { + "epoch": 0.9826893579723907, + "grad_norm": 1.2739211320877075, + "learning_rate": 3.134064499767053e-08, + "loss": 0.848, + "step": 20181 + }, + { + "epoch": 0.9827380517614979, + "grad_norm": 1.5280488729476929, + "learning_rate": 3.116437247426829e-08, + "loss": 0.7953, + "step": 20182 + }, + { + "epoch": 0.982786745550605, + "grad_norm": 1.576506495475769, + "learning_rate": 3.098859667747478e-08, + "loss": 0.8352, + "step": 20183 + }, + { + "epoch": 0.9828354393397122, + "grad_norm": 2.5901594161987305, + "learning_rate": 3.081331761165762e-08, + "loss": 0.7198, + "step": 20184 + }, + { + "epoch": 0.9828841331288194, + "grad_norm": 1.809240460395813, + "learning_rate": 3.0638535281182216e-08, + "loss": 0.9416, + "step": 20185 + }, + { + "epoch": 0.9829328269179266, + "grad_norm": 0.10155586898326874, + "learning_rate": 3.046424969039397e-08, + "loss": 0.559, + "step": 20186 + }, + { + "epoch": 0.9829815207070338, + "grad_norm": 1.5342652797698975, + "learning_rate": 3.029046084362719e-08, + "loss": 0.7816, + "step": 20187 + }, + { + "epoch": 0.983030214496141, + "grad_norm": 1.4205857515335083, + "learning_rate": 3.011716874520509e-08, + "loss": 0.7505, + "step": 20188 + }, + { + "epoch": 0.9830789082852482, + "grad_norm": 1.6064294576644897, + "learning_rate": 2.9944373399439784e-08, + "loss": 0.756, + "step": 20189 + }, + { + "epoch": 0.9831276020743555, + "grad_norm": 1.7599866390228271, + "learning_rate": 2.9772074810630046e-08, + "loss": 0.7627, + "step": 20190 + }, + { + "epoch": 0.9831762958634627, + "grad_norm": 1.3451426029205322, + "learning_rate": 2.960027298306134e-08, + "loss": 0.8359, + "step": 20191 + }, + { + "epoch": 0.9832249896525698, + "grad_norm": 1.6352139711380005, + "learning_rate": 2.94289679210058e-08, + "loss": 0.9146, + "step": 20192 + }, + { + "epoch": 0.983273683441677, + "grad_norm": 1.483892798423767, + "learning_rate": 2.925815962872447e-08, + "loss": 0.8907, + "step": 20193 + }, + { + "epoch": 0.9833223772307842, + "grad_norm": 1.7119839191436768, + "learning_rate": 2.9087848110469498e-08, + "loss": 0.8905, + "step": 20194 + }, + { + "epoch": 0.9833710710198914, + "grad_norm": 1.697126030921936, + "learning_rate": 2.8918033370473052e-08, + "loss": 0.7421, + "step": 20195 + }, + { + "epoch": 0.9834197648089986, + "grad_norm": 1.7680416107177734, + "learning_rate": 2.874871541296065e-08, + "loss": 0.8051, + "step": 20196 + }, + { + "epoch": 0.9834684585981058, + "grad_norm": 1.7876921892166138, + "learning_rate": 2.857989424214891e-08, + "loss": 0.7612, + "step": 20197 + }, + { + "epoch": 0.983517152387213, + "grad_norm": 1.9488255977630615, + "learning_rate": 2.8411569862230037e-08, + "loss": 0.7902, + "step": 20198 + }, + { + "epoch": 0.9835658461763203, + "grad_norm": 1.3543100357055664, + "learning_rate": 2.8243742277394014e-08, + "loss": 0.8467, + "step": 20199 + }, + { + "epoch": 0.9836145399654274, + "grad_norm": 2.88209867477417, + "learning_rate": 2.8076411491817503e-08, + "loss": 0.787, + "step": 20200 + }, + { + "epoch": 0.9836632337545346, + "grad_norm": 2.069535732269287, + "learning_rate": 2.790957750965939e-08, + "loss": 0.7281, + "step": 20201 + }, + { + "epoch": 0.9837119275436418, + "grad_norm": 1.9484403133392334, + "learning_rate": 2.774324033507414e-08, + "loss": 0.8231, + "step": 20202 + }, + { + "epoch": 0.983760621332749, + "grad_norm": 1.4586175680160522, + "learning_rate": 2.7577399972193997e-08, + "loss": 0.7345, + "step": 20203 + }, + { + "epoch": 0.9838093151218562, + "grad_norm": 1.3442105054855347, + "learning_rate": 2.7412056425148993e-08, + "loss": 0.7502, + "step": 20204 + }, + { + "epoch": 0.9838580089109634, + "grad_norm": 1.4889825582504272, + "learning_rate": 2.7247209698049172e-08, + "loss": 0.828, + "step": 20205 + }, + { + "epoch": 0.9839067027000706, + "grad_norm": 1.745758056640625, + "learning_rate": 2.7082859794997916e-08, + "loss": 0.7329, + "step": 20206 + }, + { + "epoch": 0.9839553964891778, + "grad_norm": 1.3347278833389282, + "learning_rate": 2.6919006720078633e-08, + "loss": 0.7623, + "step": 20207 + }, + { + "epoch": 0.9840040902782851, + "grad_norm": 2.317385196685791, + "learning_rate": 2.6755650477372495e-08, + "loss": 0.8417, + "step": 20208 + }, + { + "epoch": 0.9840527840673922, + "grad_norm": 1.4246623516082764, + "learning_rate": 2.6592791070938485e-08, + "loss": 0.7648, + "step": 20209 + }, + { + "epoch": 0.9841014778564994, + "grad_norm": 2.21638560295105, + "learning_rate": 2.643042850483113e-08, + "loss": 0.6949, + "step": 20210 + }, + { + "epoch": 0.9841501716456066, + "grad_norm": 1.6566498279571533, + "learning_rate": 2.6268562783089425e-08, + "loss": 0.8125, + "step": 20211 + }, + { + "epoch": 0.9841988654347138, + "grad_norm": 1.5477488040924072, + "learning_rate": 2.6107193909734596e-08, + "loss": 0.8249, + "step": 20212 + }, + { + "epoch": 0.984247559223821, + "grad_norm": 2.3276963233947754, + "learning_rate": 2.5946321888787872e-08, + "loss": 0.8107, + "step": 20213 + }, + { + "epoch": 0.9842962530129282, + "grad_norm": 1.7745578289031982, + "learning_rate": 2.5785946724246058e-08, + "loss": 0.6539, + "step": 20214 + }, + { + "epoch": 0.9843449468020354, + "grad_norm": 2.8704164028167725, + "learning_rate": 2.5626068420101514e-08, + "loss": 0.83, + "step": 20215 + }, + { + "epoch": 0.9843936405911426, + "grad_norm": 1.4745402336120605, + "learning_rate": 2.5466686980328836e-08, + "loss": 0.8848, + "step": 20216 + }, + { + "epoch": 0.9844423343802498, + "grad_norm": 1.3992035388946533, + "learning_rate": 2.5307802408891524e-08, + "loss": 0.7812, + "step": 20217 + }, + { + "epoch": 0.984491028169357, + "grad_norm": 2.041459560394287, + "learning_rate": 2.5149414709746412e-08, + "loss": 0.8095, + "step": 20218 + }, + { + "epoch": 0.9845397219584642, + "grad_norm": 1.6589924097061157, + "learning_rate": 2.499152388683035e-08, + "loss": 0.7971, + "step": 20219 + }, + { + "epoch": 0.9845884157475714, + "grad_norm": 1.4039160013198853, + "learning_rate": 2.4834129944071306e-08, + "loss": 0.8972, + "step": 20220 + }, + { + "epoch": 0.9846371095366786, + "grad_norm": 1.2766542434692383, + "learning_rate": 2.4677232885383927e-08, + "loss": 0.8498, + "step": 20221 + }, + { + "epoch": 0.9846858033257858, + "grad_norm": 1.3413289785385132, + "learning_rate": 2.4520832714671762e-08, + "loss": 0.7928, + "step": 20222 + }, + { + "epoch": 0.984734497114893, + "grad_norm": 1.5995080471038818, + "learning_rate": 2.4364929435825024e-08, + "loss": 0.8338, + "step": 20223 + }, + { + "epoch": 0.9847831909040002, + "grad_norm": 1.5441213846206665, + "learning_rate": 2.4209523052722838e-08, + "loss": 0.8833, + "step": 20224 + }, + { + "epoch": 0.9848318846931073, + "grad_norm": 2.2362279891967773, + "learning_rate": 2.405461356922878e-08, + "loss": 0.8635, + "step": 20225 + }, + { + "epoch": 0.9848805784822146, + "grad_norm": 2.09452486038208, + "learning_rate": 2.390020098919976e-08, + "loss": 0.8414, + "step": 20226 + }, + { + "epoch": 0.9849292722713218, + "grad_norm": 1.3264796733856201, + "learning_rate": 2.3746285316472716e-08, + "loss": 0.8356, + "step": 20227 + }, + { + "epoch": 0.984977966060429, + "grad_norm": 1.624629020690918, + "learning_rate": 2.3592866554877915e-08, + "loss": 0.784, + "step": 20228 + }, + { + "epoch": 0.9850266598495362, + "grad_norm": 1.8212589025497437, + "learning_rate": 2.3439944708234518e-08, + "loss": 0.773, + "step": 20229 + }, + { + "epoch": 0.9850753536386434, + "grad_norm": 1.3507102727890015, + "learning_rate": 2.3287519780341718e-08, + "loss": 0.6984, + "step": 20230 + }, + { + "epoch": 0.9851240474277506, + "grad_norm": 1.5168856382369995, + "learning_rate": 2.313559177499203e-08, + "loss": 0.8306, + "step": 20231 + }, + { + "epoch": 0.9851727412168578, + "grad_norm": 1.6364177465438843, + "learning_rate": 2.2984160695969092e-08, + "loss": 0.7955, + "step": 20232 + }, + { + "epoch": 0.985221435005965, + "grad_norm": 1.1910359859466553, + "learning_rate": 2.2833226547034348e-08, + "loss": 0.8193, + "step": 20233 + }, + { + "epoch": 0.9852701287950721, + "grad_norm": 5.161966800689697, + "learning_rate": 2.2682789331944788e-08, + "loss": 0.9259, + "step": 20234 + }, + { + "epoch": 0.9853188225841794, + "grad_norm": 1.2228994369506836, + "learning_rate": 2.2532849054444084e-08, + "loss": 0.7743, + "step": 20235 + }, + { + "epoch": 0.9853675163732866, + "grad_norm": 15.198105812072754, + "learning_rate": 2.2383405718258144e-08, + "loss": 0.8292, + "step": 20236 + }, + { + "epoch": 0.9854162101623938, + "grad_norm": 1.554700255393982, + "learning_rate": 2.2234459327108437e-08, + "loss": 0.6876, + "step": 20237 + }, + { + "epoch": 0.985464903951501, + "grad_norm": 0.09984151273965836, + "learning_rate": 2.2086009884696446e-08, + "loss": 0.6838, + "step": 20238 + }, + { + "epoch": 0.9855135977406082, + "grad_norm": 1.4938002824783325, + "learning_rate": 2.1938057394716995e-08, + "loss": 0.8096, + "step": 20239 + }, + { + "epoch": 0.9855622915297154, + "grad_norm": 3.111569404602051, + "learning_rate": 2.179060186084936e-08, + "loss": 0.8701, + "step": 20240 + }, + { + "epoch": 0.9856109853188226, + "grad_norm": 0.09866586327552795, + "learning_rate": 2.1643643286761718e-08, + "loss": 0.6333, + "step": 20241 + }, + { + "epoch": 0.9856596791079297, + "grad_norm": 5.503349781036377, + "learning_rate": 2.1497181676111144e-08, + "loss": 0.8245, + "step": 20242 + }, + { + "epoch": 0.9857083728970369, + "grad_norm": 1.7499898672103882, + "learning_rate": 2.1351217032539174e-08, + "loss": 0.8453, + "step": 20243 + }, + { + "epoch": 0.9857570666861442, + "grad_norm": 1.8390743732452393, + "learning_rate": 2.120574935967845e-08, + "loss": 0.7424, + "step": 20244 + }, + { + "epoch": 0.9858057604752514, + "grad_norm": 1.4082962274551392, + "learning_rate": 2.1060778661143865e-08, + "loss": 0.8828, + "step": 20245 + }, + { + "epoch": 0.9858544542643586, + "grad_norm": 1.5032371282577515, + "learning_rate": 2.0916304940545862e-08, + "loss": 0.7163, + "step": 20246 + }, + { + "epoch": 0.9859031480534658, + "grad_norm": 1.5696650743484497, + "learning_rate": 2.07723282014749e-08, + "loss": 0.7475, + "step": 20247 + }, + { + "epoch": 0.985951841842573, + "grad_norm": 1.3742594718933105, + "learning_rate": 2.0628848447517003e-08, + "loss": 0.8117, + "step": 20248 + }, + { + "epoch": 0.9860005356316802, + "grad_norm": 1.4031286239624023, + "learning_rate": 2.0485865682235983e-08, + "loss": 0.846, + "step": 20249 + }, + { + "epoch": 0.9860492294207874, + "grad_norm": 0.09647572040557861, + "learning_rate": 2.0343379909189e-08, + "loss": 0.5849, + "step": 20250 + }, + { + "epoch": 0.9860979232098945, + "grad_norm": 0.10048630833625793, + "learning_rate": 2.020139113192432e-08, + "loss": 0.5523, + "step": 20251 + }, + { + "epoch": 0.9861466169990017, + "grad_norm": 2.5201456546783447, + "learning_rate": 2.0059899353972455e-08, + "loss": 0.8659, + "step": 20252 + }, + { + "epoch": 0.986195310788109, + "grad_norm": 1.7160558700561523, + "learning_rate": 1.9918904578850594e-08, + "loss": 0.8086, + "step": 20253 + }, + { + "epoch": 0.9862440045772162, + "grad_norm": 1.805366039276123, + "learning_rate": 1.977840681006704e-08, + "loss": 0.9342, + "step": 20254 + }, + { + "epoch": 0.9862926983663234, + "grad_norm": 1.8277100324630737, + "learning_rate": 1.9638406051118998e-08, + "loss": 0.8006, + "step": 20255 + }, + { + "epoch": 0.9863413921554306, + "grad_norm": 1.6846978664398193, + "learning_rate": 1.949890230548812e-08, + "loss": 0.836, + "step": 20256 + }, + { + "epoch": 0.9863900859445378, + "grad_norm": 0.096703439950943, + "learning_rate": 1.935989557664275e-08, + "loss": 0.5102, + "step": 20257 + }, + { + "epoch": 0.986438779733645, + "grad_norm": 1.4957438707351685, + "learning_rate": 1.922138586804012e-08, + "loss": 0.8036, + "step": 20258 + }, + { + "epoch": 0.9864874735227521, + "grad_norm": 3.131661891937256, + "learning_rate": 1.9083373183130806e-08, + "loss": 0.8044, + "step": 20259 + }, + { + "epoch": 0.9865361673118593, + "grad_norm": 1.936506748199463, + "learning_rate": 1.8945857525340948e-08, + "loss": 0.8307, + "step": 20260 + }, + { + "epoch": 0.9865848611009665, + "grad_norm": 1.6143617630004883, + "learning_rate": 1.8808838898096704e-08, + "loss": 0.7826, + "step": 20261 + }, + { + "epoch": 0.9866335548900738, + "grad_norm": 1.7598376274108887, + "learning_rate": 1.8672317304804232e-08, + "loss": 0.7937, + "step": 20262 + }, + { + "epoch": 0.986682248679181, + "grad_norm": 1.4689202308654785, + "learning_rate": 1.8536292748858598e-08, + "loss": 0.734, + "step": 20263 + }, + { + "epoch": 0.9867309424682882, + "grad_norm": 1.958375334739685, + "learning_rate": 1.840076523364598e-08, + "loss": 0.7291, + "step": 20264 + }, + { + "epoch": 0.9867796362573954, + "grad_norm": 1.2428613901138306, + "learning_rate": 1.8265734762534794e-08, + "loss": 0.8712, + "step": 20265 + }, + { + "epoch": 0.9868283300465026, + "grad_norm": 1.477565050125122, + "learning_rate": 1.8131201338886795e-08, + "loss": 0.8232, + "step": 20266 + }, + { + "epoch": 0.9868770238356098, + "grad_norm": 2.548666000366211, + "learning_rate": 1.7997164966045978e-08, + "loss": 0.8403, + "step": 20267 + }, + { + "epoch": 0.9869257176247169, + "grad_norm": 1.8739322423934937, + "learning_rate": 1.7863625647347448e-08, + "loss": 0.7658, + "step": 20268 + }, + { + "epoch": 0.9869744114138241, + "grad_norm": 2.8100340366363525, + "learning_rate": 1.7730583386113e-08, + "loss": 0.9348, + "step": 20269 + }, + { + "epoch": 0.9870231052029314, + "grad_norm": 1.749466061592102, + "learning_rate": 1.759803818565331e-08, + "loss": 0.7445, + "step": 20270 + }, + { + "epoch": 0.9870717989920386, + "grad_norm": 1.2544009685516357, + "learning_rate": 1.7465990049263525e-08, + "loss": 0.7889, + "step": 20271 + }, + { + "epoch": 0.9871204927811458, + "grad_norm": 1.779765248298645, + "learning_rate": 1.7334438980227686e-08, + "loss": 0.8335, + "step": 20272 + }, + { + "epoch": 0.987169186570253, + "grad_norm": 1.3547416925430298, + "learning_rate": 1.7203384981823168e-08, + "loss": 0.7285, + "step": 20273 + }, + { + "epoch": 0.9872178803593602, + "grad_norm": 1.3066751956939697, + "learning_rate": 1.7072828057302925e-08, + "loss": 0.7318, + "step": 20274 + }, + { + "epoch": 0.9872665741484674, + "grad_norm": 2.7978272438049316, + "learning_rate": 1.6942768209919914e-08, + "loss": 0.8496, + "step": 20275 + }, + { + "epoch": 0.9873152679375745, + "grad_norm": 1.4059499502182007, + "learning_rate": 1.6813205442907098e-08, + "loss": 0.7896, + "step": 20276 + }, + { + "epoch": 0.9873639617266817, + "grad_norm": 1.4290910959243774, + "learning_rate": 1.6684139759488572e-08, + "loss": 0.757, + "step": 20277 + }, + { + "epoch": 0.9874126555157889, + "grad_norm": 1.3740662336349487, + "learning_rate": 1.6555571162872873e-08, + "loss": 0.6529, + "step": 20278 + }, + { + "epoch": 0.9874613493048962, + "grad_norm": 1.5290755033493042, + "learning_rate": 1.642749965625967e-08, + "loss": 0.8592, + "step": 20279 + }, + { + "epoch": 0.9875100430940034, + "grad_norm": 1.9757568836212158, + "learning_rate": 1.6299925242835303e-08, + "loss": 0.8001, + "step": 20280 + }, + { + "epoch": 0.9875587368831106, + "grad_norm": 2.2640626430511475, + "learning_rate": 1.6172847925770562e-08, + "loss": 0.8725, + "step": 20281 + }, + { + "epoch": 0.9876074306722178, + "grad_norm": 1.3052560091018677, + "learning_rate": 1.604626770822959e-08, + "loss": 0.8101, + "step": 20282 + }, + { + "epoch": 0.987656124461325, + "grad_norm": 2.0635569095611572, + "learning_rate": 1.5920184593360978e-08, + "loss": 0.8593, + "step": 20283 + }, + { + "epoch": 0.9877048182504321, + "grad_norm": 1.2453279495239258, + "learning_rate": 1.579459858429777e-08, + "loss": 0.7795, + "step": 20284 + }, + { + "epoch": 0.9877535120395393, + "grad_norm": 1.1741983890533447, + "learning_rate": 1.5669509684166362e-08, + "loss": 0.7917, + "step": 20285 + }, + { + "epoch": 0.9878022058286465, + "grad_norm": 2.185452699661255, + "learning_rate": 1.5544917896079816e-08, + "loss": 0.8851, + "step": 20286 + }, + { + "epoch": 0.9878508996177537, + "grad_norm": 1.5185375213623047, + "learning_rate": 1.542082322313343e-08, + "loss": 0.8089, + "step": 20287 + }, + { + "epoch": 0.987899593406861, + "grad_norm": 1.469693660736084, + "learning_rate": 1.529722566841585e-08, + "loss": 0.8149, + "step": 20288 + }, + { + "epoch": 0.9879482871959682, + "grad_norm": 3.7862887382507324, + "learning_rate": 1.517412523500461e-08, + "loss": 0.816, + "step": 20289 + }, + { + "epoch": 0.9879969809850754, + "grad_norm": 0.1016521230340004, + "learning_rate": 1.5051521925957268e-08, + "loss": 0.5706, + "step": 20290 + }, + { + "epoch": 0.9880456747741826, + "grad_norm": 1.488385796546936, + "learning_rate": 1.4929415744324714e-08, + "loss": 0.7782, + "step": 20291 + }, + { + "epoch": 0.9880943685632898, + "grad_norm": 1.7818540334701538, + "learning_rate": 1.4807806693146743e-08, + "loss": 0.8077, + "step": 20292 + }, + { + "epoch": 0.9881430623523969, + "grad_norm": 2.7937114238739014, + "learning_rate": 1.4686694775445377e-08, + "loss": 0.8599, + "step": 20293 + }, + { + "epoch": 0.9881917561415041, + "grad_norm": 1.5663126707077026, + "learning_rate": 1.4566079994235982e-08, + "loss": 0.8402, + "step": 20294 + }, + { + "epoch": 0.9882404499306113, + "grad_norm": 1.2240569591522217, + "learning_rate": 1.4445962352516162e-08, + "loss": 0.8455, + "step": 20295 + }, + { + "epoch": 0.9882891437197185, + "grad_norm": 1.54116952419281, + "learning_rate": 1.4326341853276859e-08, + "loss": 0.899, + "step": 20296 + }, + { + "epoch": 0.9883378375088258, + "grad_norm": 1.2951422929763794, + "learning_rate": 1.4207218499491249e-08, + "loss": 0.8543, + "step": 20297 + }, + { + "epoch": 0.988386531297933, + "grad_norm": 1.3766580820083618, + "learning_rate": 1.4088592294123627e-08, + "loss": 0.903, + "step": 20298 + }, + { + "epoch": 0.9884352250870402, + "grad_norm": 1.4827829599380493, + "learning_rate": 1.3970463240122745e-08, + "loss": 0.8729, + "step": 20299 + }, + { + "epoch": 0.9884839188761474, + "grad_norm": 2.4736084938049316, + "learning_rate": 1.3852831340430695e-08, + "loss": 0.8162, + "step": 20300 + }, + { + "epoch": 0.9885326126652545, + "grad_norm": 1.8920427560806274, + "learning_rate": 1.3735696597971803e-08, + "loss": 0.7677, + "step": 20301 + }, + { + "epoch": 0.9885813064543617, + "grad_norm": 1.63438081741333, + "learning_rate": 1.3619059015659297e-08, + "loss": 0.7854, + "step": 20302 + }, + { + "epoch": 0.9886300002434689, + "grad_norm": 3.1962804794311523, + "learning_rate": 1.3502918596395298e-08, + "loss": 0.8227, + "step": 20303 + }, + { + "epoch": 0.9886786940325761, + "grad_norm": 1.5643478631973267, + "learning_rate": 1.3387275343068607e-08, + "loss": 0.7729, + "step": 20304 + }, + { + "epoch": 0.9887273878216833, + "grad_norm": 1.7326006889343262, + "learning_rate": 1.32721292585547e-08, + "loss": 0.8098, + "step": 20305 + }, + { + "epoch": 0.9887760816107906, + "grad_norm": 2.086500883102417, + "learning_rate": 1.3157480345717954e-08, + "loss": 0.7528, + "step": 20306 + }, + { + "epoch": 0.9888247753998978, + "grad_norm": 1.3713268041610718, + "learning_rate": 1.3043328607413863e-08, + "loss": 0.7743, + "step": 20307 + }, + { + "epoch": 0.988873469189005, + "grad_norm": 1.7606439590454102, + "learning_rate": 1.2929674046477936e-08, + "loss": 0.8073, + "step": 20308 + }, + { + "epoch": 0.9889221629781122, + "grad_norm": 1.6180020570755005, + "learning_rate": 1.281651666573902e-08, + "loss": 0.816, + "step": 20309 + }, + { + "epoch": 0.9889708567672193, + "grad_norm": 1.4663784503936768, + "learning_rate": 1.2703856468010422e-08, + "loss": 0.8187, + "step": 20310 + }, + { + "epoch": 0.9890195505563265, + "grad_norm": 1.610964059829712, + "learning_rate": 1.2591693456098786e-08, + "loss": 0.799, + "step": 20311 + }, + { + "epoch": 0.9890682443454337, + "grad_norm": 1.5602176189422607, + "learning_rate": 1.248002763278855e-08, + "loss": 0.7916, + "step": 20312 + }, + { + "epoch": 0.9891169381345409, + "grad_norm": 1.5194294452667236, + "learning_rate": 1.2368859000859711e-08, + "loss": 0.7723, + "step": 20313 + }, + { + "epoch": 0.9891656319236481, + "grad_norm": 1.9238300323486328, + "learning_rate": 1.2258187563078949e-08, + "loss": 0.8541, + "step": 20314 + }, + { + "epoch": 0.9892143257127554, + "grad_norm": 1.8065626621246338, + "learning_rate": 1.2148013322199615e-08, + "loss": 0.7857, + "step": 20315 + }, + { + "epoch": 0.9892630195018626, + "grad_norm": 1.231766939163208, + "learning_rate": 1.2038336280957296e-08, + "loss": 0.8883, + "step": 20316 + }, + { + "epoch": 0.9893117132909698, + "grad_norm": 1.6186165809631348, + "learning_rate": 1.1929156442087586e-08, + "loss": 0.7872, + "step": 20317 + }, + { + "epoch": 0.9893604070800769, + "grad_norm": 1.5612646341323853, + "learning_rate": 1.1820473808299427e-08, + "loss": 0.7722, + "step": 20318 + }, + { + "epoch": 0.9894091008691841, + "grad_norm": 1.7947200536727905, + "learning_rate": 1.1712288382301761e-08, + "loss": 0.8064, + "step": 20319 + }, + { + "epoch": 0.9894577946582913, + "grad_norm": 1.476685881614685, + "learning_rate": 1.1604600166783554e-08, + "loss": 0.8086, + "step": 20320 + }, + { + "epoch": 0.9895064884473985, + "grad_norm": 1.2683175802230835, + "learning_rate": 1.1497409164420437e-08, + "loss": 0.7217, + "step": 20321 + }, + { + "epoch": 0.9895551822365057, + "grad_norm": 1.4647436141967773, + "learning_rate": 1.139071537788361e-08, + "loss": 0.6881, + "step": 20322 + }, + { + "epoch": 0.989603876025613, + "grad_norm": 1.4822202920913696, + "learning_rate": 1.1284518809826506e-08, + "loss": 0.8033, + "step": 20323 + }, + { + "epoch": 0.9896525698147202, + "grad_norm": 1.720218300819397, + "learning_rate": 1.1178819462887014e-08, + "loss": 0.7736, + "step": 20324 + }, + { + "epoch": 0.9897012636038274, + "grad_norm": 2.4635891914367676, + "learning_rate": 1.1073617339698584e-08, + "loss": 0.7873, + "step": 20325 + }, + { + "epoch": 0.9897499573929345, + "grad_norm": 2.9204046726226807, + "learning_rate": 1.09689124428769e-08, + "loss": 0.7551, + "step": 20326 + }, + { + "epoch": 0.9897986511820417, + "grad_norm": 0.09509321302175522, + "learning_rate": 1.0864704775024326e-08, + "loss": 0.5793, + "step": 20327 + }, + { + "epoch": 0.9898473449711489, + "grad_norm": 1.8237206935882568, + "learning_rate": 1.0760994338736563e-08, + "loss": 0.7292, + "step": 20328 + }, + { + "epoch": 0.9898960387602561, + "grad_norm": 2.051652193069458, + "learning_rate": 1.0657781136589328e-08, + "loss": 0.812, + "step": 20329 + }, + { + "epoch": 0.9899447325493633, + "grad_norm": 1.4100449085235596, + "learning_rate": 1.05550651711539e-08, + "loss": 0.8048, + "step": 20330 + }, + { + "epoch": 0.9899934263384705, + "grad_norm": 0.09653452038764954, + "learning_rate": 1.0452846444983789e-08, + "loss": 0.6083, + "step": 20331 + }, + { + "epoch": 0.9900421201275778, + "grad_norm": 1.3934451341629028, + "learning_rate": 1.0351124960621407e-08, + "loss": 0.7274, + "step": 20332 + }, + { + "epoch": 0.990090813916685, + "grad_norm": 1.6729135513305664, + "learning_rate": 1.024990072059584e-08, + "loss": 0.916, + "step": 20333 + }, + { + "epoch": 0.9901395077057922, + "grad_norm": 1.0435858964920044, + "learning_rate": 1.0149173727429517e-08, + "loss": 0.8556, + "step": 20334 + }, + { + "epoch": 0.9901882014948993, + "grad_norm": 1.4176242351531982, + "learning_rate": 1.0048943983622661e-08, + "loss": 0.7275, + "step": 20335 + }, + { + "epoch": 0.9902368952840065, + "grad_norm": 1.6511561870574951, + "learning_rate": 9.949211491673272e-09, + "loss": 0.7699, + "step": 20336 + }, + { + "epoch": 0.9902855890731137, + "grad_norm": 1.768966794013977, + "learning_rate": 9.849976254057148e-09, + "loss": 0.7855, + "step": 20337 + }, + { + "epoch": 0.9903342828622209, + "grad_norm": 1.531880497932434, + "learning_rate": 9.751238273247865e-09, + "loss": 0.8221, + "step": 20338 + }, + { + "epoch": 0.9903829766513281, + "grad_norm": 1.3705822229385376, + "learning_rate": 9.652997551699016e-09, + "loss": 0.7397, + "step": 20339 + }, + { + "epoch": 0.9904316704404353, + "grad_norm": 1.4396536350250244, + "learning_rate": 9.555254091853095e-09, + "loss": 0.7017, + "step": 20340 + }, + { + "epoch": 0.9904803642295426, + "grad_norm": 1.5045921802520752, + "learning_rate": 9.458007896143706e-09, + "loss": 0.7421, + "step": 20341 + }, + { + "epoch": 0.9905290580186498, + "grad_norm": 1.80135977268219, + "learning_rate": 9.361258966988918e-09, + "loss": 0.7966, + "step": 20342 + }, + { + "epoch": 0.9905777518077569, + "grad_norm": 1.5433683395385742, + "learning_rate": 9.265007306795692e-09, + "loss": 0.8421, + "step": 20343 + }, + { + "epoch": 0.9906264455968641, + "grad_norm": 2.9437756538391113, + "learning_rate": 9.169252917957671e-09, + "loss": 0.8233, + "step": 20344 + }, + { + "epoch": 0.9906751393859713, + "grad_norm": 1.7212189435958862, + "learning_rate": 9.073995802857394e-09, + "loss": 0.7973, + "step": 20345 + }, + { + "epoch": 0.9907238331750785, + "grad_norm": 1.9946242570877075, + "learning_rate": 8.979235963864075e-09, + "loss": 0.797, + "step": 20346 + }, + { + "epoch": 0.9907725269641857, + "grad_norm": 1.7758678197860718, + "learning_rate": 8.884973403333607e-09, + "loss": 0.9011, + "step": 20347 + }, + { + "epoch": 0.9908212207532929, + "grad_norm": 2.219632863998413, + "learning_rate": 8.791208123613005e-09, + "loss": 0.7826, + "step": 20348 + }, + { + "epoch": 0.9908699145424001, + "grad_norm": 1.067765474319458, + "learning_rate": 8.697940127035954e-09, + "loss": 0.7667, + "step": 20349 + }, + { + "epoch": 0.9909186083315074, + "grad_norm": 1.223124623298645, + "learning_rate": 8.605169415918379e-09, + "loss": 0.8226, + "step": 20350 + }, + { + "epoch": 0.9909673021206146, + "grad_norm": 1.6068439483642578, + "learning_rate": 8.512895992569547e-09, + "loss": 0.8727, + "step": 20351 + }, + { + "epoch": 0.9910159959097217, + "grad_norm": 1.2296500205993652, + "learning_rate": 8.42111985928762e-09, + "loss": 0.8026, + "step": 20352 + }, + { + "epoch": 0.9910646896988289, + "grad_norm": 1.6743898391723633, + "learning_rate": 8.32984101835077e-09, + "loss": 0.8429, + "step": 20353 + }, + { + "epoch": 0.9911133834879361, + "grad_norm": 1.2522377967834473, + "learning_rate": 8.23905947203274e-09, + "loss": 0.8403, + "step": 20354 + }, + { + "epoch": 0.9911620772770433, + "grad_norm": 2.1175403594970703, + "learning_rate": 8.1487752225895e-09, + "loss": 0.8471, + "step": 20355 + }, + { + "epoch": 0.9912107710661505, + "grad_norm": 1.3373501300811768, + "learning_rate": 8.058988272270362e-09, + "loss": 0.8019, + "step": 20356 + }, + { + "epoch": 0.9912594648552577, + "grad_norm": 1.429031252861023, + "learning_rate": 7.969698623306877e-09, + "loss": 0.7831, + "step": 20357 + }, + { + "epoch": 0.991308158644365, + "grad_norm": 1.6389011144638062, + "learning_rate": 7.880906277917267e-09, + "loss": 0.8739, + "step": 20358 + }, + { + "epoch": 0.9913568524334722, + "grad_norm": 1.4738860130310059, + "learning_rate": 7.792611238315318e-09, + "loss": 0.8112, + "step": 20359 + }, + { + "epoch": 0.9914055462225793, + "grad_norm": 1.5545151233673096, + "learning_rate": 7.704813506694831e-09, + "loss": 0.8446, + "step": 20360 + }, + { + "epoch": 0.9914542400116865, + "grad_norm": 1.5197546482086182, + "learning_rate": 7.617513085238503e-09, + "loss": 0.8383, + "step": 20361 + }, + { + "epoch": 0.9915029338007937, + "grad_norm": 1.8792890310287476, + "learning_rate": 7.530709976120154e-09, + "loss": 0.7689, + "step": 20362 + }, + { + "epoch": 0.9915516275899009, + "grad_norm": 1.4984997510910034, + "learning_rate": 7.444404181498055e-09, + "loss": 0.7779, + "step": 20363 + }, + { + "epoch": 0.9916003213790081, + "grad_norm": 1.918583869934082, + "learning_rate": 7.358595703519378e-09, + "loss": 0.7476, + "step": 20364 + }, + { + "epoch": 0.9916490151681153, + "grad_norm": 1.7500512599945068, + "learning_rate": 7.273284544317971e-09, + "loss": 0.7995, + "step": 20365 + }, + { + "epoch": 0.9916977089572225, + "grad_norm": 1.7753764390945435, + "learning_rate": 7.188470706018802e-09, + "loss": 0.8466, + "step": 20366 + }, + { + "epoch": 0.9917464027463297, + "grad_norm": 1.5476632118225098, + "learning_rate": 7.104154190726853e-09, + "loss": 0.8102, + "step": 20367 + }, + { + "epoch": 0.991795096535437, + "grad_norm": 4.234926700592041, + "learning_rate": 7.020335000542666e-09, + "loss": 0.8802, + "step": 20368 + }, + { + "epoch": 0.9918437903245441, + "grad_norm": 3.432387590408325, + "learning_rate": 6.937013137551241e-09, + "loss": 0.7795, + "step": 20369 + }, + { + "epoch": 0.9918924841136513, + "grad_norm": 1.6889573335647583, + "learning_rate": 6.854188603822032e-09, + "loss": 0.7316, + "step": 20370 + }, + { + "epoch": 0.9919411779027585, + "grad_norm": 1.4347668886184692, + "learning_rate": 6.771861401420055e-09, + "loss": 0.8573, + "step": 20371 + }, + { + "epoch": 0.9919898716918657, + "grad_norm": 1.1488889455795288, + "learning_rate": 6.69003153239034e-09, + "loss": 0.7563, + "step": 20372 + }, + { + "epoch": 0.9920385654809729, + "grad_norm": 1.4039055109024048, + "learning_rate": 6.608698998769037e-09, + "loss": 0.8341, + "step": 20373 + }, + { + "epoch": 0.9920872592700801, + "grad_norm": 1.5077077150344849, + "learning_rate": 6.527863802578971e-09, + "loss": 0.7928, + "step": 20374 + }, + { + "epoch": 0.9921359530591873, + "grad_norm": 1.5345282554626465, + "learning_rate": 6.447525945831867e-09, + "loss": 0.7434, + "step": 20375 + }, + { + "epoch": 0.9921846468482945, + "grad_norm": 1.2324882745742798, + "learning_rate": 6.367685430526127e-09, + "loss": 0.8345, + "step": 20376 + }, + { + "epoch": 0.9922333406374017, + "grad_norm": 1.870823860168457, + "learning_rate": 6.288342258646829e-09, + "loss": 0.8067, + "step": 20377 + }, + { + "epoch": 0.9922820344265089, + "grad_norm": 1.4833303689956665, + "learning_rate": 6.209496432165729e-09, + "loss": 0.8013, + "step": 20378 + }, + { + "epoch": 0.9923307282156161, + "grad_norm": 0.10165119171142578, + "learning_rate": 6.131147953047922e-09, + "loss": 0.6276, + "step": 20379 + }, + { + "epoch": 0.9923794220047233, + "grad_norm": 2.162480115890503, + "learning_rate": 6.0532968232407394e-09, + "loss": 0.7952, + "step": 20380 + }, + { + "epoch": 0.9924281157938305, + "grad_norm": 1.3937456607818604, + "learning_rate": 5.975943044682631e-09, + "loss": 0.8985, + "step": 20381 + }, + { + "epoch": 0.9924768095829377, + "grad_norm": 1.6822532415390015, + "learning_rate": 5.8990866192942805e-09, + "loss": 0.7326, + "step": 20382 + }, + { + "epoch": 0.9925255033720449, + "grad_norm": 1.8039332628250122, + "learning_rate": 5.822727548987495e-09, + "loss": 0.785, + "step": 20383 + }, + { + "epoch": 0.9925741971611521, + "grad_norm": 1.6746854782104492, + "learning_rate": 5.7468658356651944e-09, + "loss": 0.8259, + "step": 20384 + }, + { + "epoch": 0.9926228909502592, + "grad_norm": 1.3456635475158691, + "learning_rate": 5.671501481212538e-09, + "loss": 0.7825, + "step": 20385 + }, + { + "epoch": 0.9926715847393665, + "grad_norm": 1.3398228883743286, + "learning_rate": 5.596634487503583e-09, + "loss": 0.7978, + "step": 20386 + }, + { + "epoch": 0.9927202785284737, + "grad_norm": 2.308222770690918, + "learning_rate": 5.522264856403503e-09, + "loss": 0.8626, + "step": 20387 + }, + { + "epoch": 0.9927689723175809, + "grad_norm": 3.2134647369384766, + "learning_rate": 5.4483925897574895e-09, + "loss": 0.7099, + "step": 20388 + }, + { + "epoch": 0.9928176661066881, + "grad_norm": 2.094191074371338, + "learning_rate": 5.375017689406292e-09, + "loss": 0.8307, + "step": 20389 + }, + { + "epoch": 0.9928663598957953, + "grad_norm": 1.5643310546875, + "learning_rate": 5.302140157175118e-09, + "loss": 0.7979, + "step": 20390 + }, + { + "epoch": 0.9929150536849025, + "grad_norm": 1.75249183177948, + "learning_rate": 5.229759994875849e-09, + "loss": 0.8122, + "step": 20391 + }, + { + "epoch": 0.9929637474740097, + "grad_norm": 1.252511739730835, + "learning_rate": 5.15787720430927e-09, + "loss": 0.796, + "step": 20392 + }, + { + "epoch": 0.9930124412631169, + "grad_norm": 3.58796763420105, + "learning_rate": 5.086491787265058e-09, + "loss": 0.723, + "step": 20393 + }, + { + "epoch": 0.993061135052224, + "grad_norm": 1.4879248142242432, + "learning_rate": 5.01560374551735e-09, + "loss": 0.8088, + "step": 20394 + }, + { + "epoch": 0.9931098288413313, + "grad_norm": 1.5830234289169312, + "learning_rate": 4.945213080826961e-09, + "loss": 0.9097, + "step": 20395 + }, + { + "epoch": 0.9931585226304385, + "grad_norm": 1.5230571031570435, + "learning_rate": 4.875319794950262e-09, + "loss": 0.8001, + "step": 20396 + }, + { + "epoch": 0.9932072164195457, + "grad_norm": 10.388947486877441, + "learning_rate": 4.805923889621422e-09, + "loss": 0.8677, + "step": 20397 + }, + { + "epoch": 0.9932559102086529, + "grad_norm": 1.408797264099121, + "learning_rate": 4.737025366567949e-09, + "loss": 0.843, + "step": 20398 + }, + { + "epoch": 0.9933046039977601, + "grad_norm": 1.7351689338684082, + "learning_rate": 4.6686242275062465e-09, + "loss": 0.822, + "step": 20399 + }, + { + "epoch": 0.9933532977868673, + "grad_norm": 2.5573127269744873, + "learning_rate": 4.600720474132736e-09, + "loss": 0.8522, + "step": 20400 + }, + { + "epoch": 0.9934019915759745, + "grad_norm": 1.4677388668060303, + "learning_rate": 4.533314108141618e-09, + "loss": 0.8019, + "step": 20401 + }, + { + "epoch": 0.9934506853650816, + "grad_norm": 1.2425470352172852, + "learning_rate": 4.466405131207108e-09, + "loss": 0.8317, + "step": 20402 + }, + { + "epoch": 0.9934993791541888, + "grad_norm": 3.427415370941162, + "learning_rate": 4.399993544992321e-09, + "loss": 0.8171, + "step": 20403 + }, + { + "epoch": 0.9935480729432961, + "grad_norm": 2.968503475189209, + "learning_rate": 4.334079351153708e-09, + "loss": 0.8709, + "step": 20404 + }, + { + "epoch": 0.9935967667324033, + "grad_norm": 2.8601248264312744, + "learning_rate": 4.26866255132552e-09, + "loss": 0.8423, + "step": 20405 + }, + { + "epoch": 0.9936454605215105, + "grad_norm": 0.09816156327724457, + "learning_rate": 4.203743147137562e-09, + "loss": 0.595, + "step": 20406 + }, + { + "epoch": 0.9936941543106177, + "grad_norm": 1.3059648275375366, + "learning_rate": 4.13932114020632e-09, + "loss": 0.887, + "step": 20407 + }, + { + "epoch": 0.9937428480997249, + "grad_norm": 2.812143087387085, + "learning_rate": 4.075396532130516e-09, + "loss": 0.8748, + "step": 20408 + }, + { + "epoch": 0.9937915418888321, + "grad_norm": 1.7205649614334106, + "learning_rate": 4.011969324504428e-09, + "loss": 0.7601, + "step": 20409 + }, + { + "epoch": 0.9938402356779393, + "grad_norm": 1.5136933326721191, + "learning_rate": 3.949039518902353e-09, + "loss": 0.793, + "step": 20410 + }, + { + "epoch": 0.9938889294670464, + "grad_norm": 6.947484016418457, + "learning_rate": 3.886607116891927e-09, + "loss": 0.7442, + "step": 20411 + }, + { + "epoch": 0.9939376232561536, + "grad_norm": 1.973712682723999, + "learning_rate": 3.824672120025241e-09, + "loss": 0.7397, + "step": 20412 + }, + { + "epoch": 0.9939863170452609, + "grad_norm": 1.6956398487091064, + "learning_rate": 3.763234529843285e-09, + "loss": 0.823, + "step": 20413 + }, + { + "epoch": 0.9940350108343681, + "grad_norm": 1.9335594177246094, + "learning_rate": 3.7022943478715044e-09, + "loss": 0.8115, + "step": 20414 + }, + { + "epoch": 0.9940837046234753, + "grad_norm": 2.0606601238250732, + "learning_rate": 3.641851575630906e-09, + "loss": 0.8628, + "step": 20415 + }, + { + "epoch": 0.9941323984125825, + "grad_norm": 2.0895626544952393, + "learning_rate": 3.5819062146225102e-09, + "loss": 0.7333, + "step": 20416 + }, + { + "epoch": 0.9941810922016897, + "grad_norm": 1.5853631496429443, + "learning_rate": 3.5224582663362374e-09, + "loss": 0.8263, + "step": 20417 + }, + { + "epoch": 0.9942297859907969, + "grad_norm": 1.2923656702041626, + "learning_rate": 3.463507732253124e-09, + "loss": 0.7917, + "step": 20418 + }, + { + "epoch": 0.994278479779904, + "grad_norm": 1.4310293197631836, + "learning_rate": 3.4050546138386652e-09, + "loss": 0.7828, + "step": 20419 + }, + { + "epoch": 0.9943271735690112, + "grad_norm": 1.458120584487915, + "learning_rate": 3.3470989125450326e-09, + "loss": 0.8303, + "step": 20420 + }, + { + "epoch": 0.9943758673581184, + "grad_norm": 1.4402996301651, + "learning_rate": 3.289640629815516e-09, + "loss": 0.7954, + "step": 20421 + }, + { + "epoch": 0.9944245611472257, + "grad_norm": 1.5339831113815308, + "learning_rate": 3.232679767082303e-09, + "loss": 0.8793, + "step": 20422 + }, + { + "epoch": 0.9944732549363329, + "grad_norm": 2.0713236331939697, + "learning_rate": 3.176216325755377e-09, + "loss": 0.7383, + "step": 20423 + }, + { + "epoch": 0.9945219487254401, + "grad_norm": 1.478041410446167, + "learning_rate": 3.1202503072469413e-09, + "loss": 0.9132, + "step": 20424 + }, + { + "epoch": 0.9945706425145473, + "grad_norm": 1.7346758842468262, + "learning_rate": 3.064781712942555e-09, + "loss": 0.8089, + "step": 20425 + }, + { + "epoch": 0.9946193363036545, + "grad_norm": 1.6731011867523193, + "learning_rate": 3.009810544227776e-09, + "loss": 0.8077, + "step": 20426 + }, + { + "epoch": 0.9946680300927617, + "grad_norm": 1.8479087352752686, + "learning_rate": 2.955336802465958e-09, + "loss": 0.837, + "step": 20427 + }, + { + "epoch": 0.9947167238818688, + "grad_norm": 1.8044989109039307, + "learning_rate": 2.901360489013794e-09, + "loss": 0.8124, + "step": 20428 + }, + { + "epoch": 0.994765417670976, + "grad_norm": 1.6043308973312378, + "learning_rate": 2.8478816052124327e-09, + "loss": 0.7806, + "step": 20429 + }, + { + "epoch": 0.9948141114600833, + "grad_norm": 0.09642400592565536, + "learning_rate": 2.7949001523941423e-09, + "loss": 0.628, + "step": 20430 + }, + { + "epoch": 0.9948628052491905, + "grad_norm": 1.1784937381744385, + "learning_rate": 2.742416131875647e-09, + "loss": 0.7215, + "step": 20431 + }, + { + "epoch": 0.9949114990382977, + "grad_norm": 1.7170219421386719, + "learning_rate": 2.6904295449625694e-09, + "loss": 0.838, + "step": 20432 + }, + { + "epoch": 0.9949601928274049, + "grad_norm": 2.1363155841827393, + "learning_rate": 2.6389403929494294e-09, + "loss": 0.865, + "step": 20433 + }, + { + "epoch": 0.9950088866165121, + "grad_norm": 1.3803044557571411, + "learning_rate": 2.587948677115204e-09, + "loss": 0.8241, + "step": 20434 + }, + { + "epoch": 0.9950575804056193, + "grad_norm": 1.4777116775512695, + "learning_rate": 2.5374543987277676e-09, + "loss": 0.8091, + "step": 20435 + }, + { + "epoch": 0.9951062741947264, + "grad_norm": 1.7932190895080566, + "learning_rate": 2.487457559046114e-09, + "loss": 0.7819, + "step": 20436 + }, + { + "epoch": 0.9951549679838336, + "grad_norm": 2.420210123062134, + "learning_rate": 2.4379581593114708e-09, + "loss": 0.8156, + "step": 20437 + }, + { + "epoch": 0.9952036617729408, + "grad_norm": 2.299093723297119, + "learning_rate": 2.3889562007561874e-09, + "loss": 0.8305, + "step": 20438 + }, + { + "epoch": 0.995252355562048, + "grad_norm": 1.3243873119354248, + "learning_rate": 2.3404516845992873e-09, + "loss": 0.7855, + "step": 20439 + }, + { + "epoch": 0.9953010493511553, + "grad_norm": 1.7075669765472412, + "learning_rate": 2.292444612046474e-09, + "loss": 0.8389, + "step": 20440 + }, + { + "epoch": 0.9953497431402625, + "grad_norm": 0.0941699743270874, + "learning_rate": 2.2449349842923464e-09, + "loss": 0.6184, + "step": 20441 + }, + { + "epoch": 0.9953984369293697, + "grad_norm": 1.999394416809082, + "learning_rate": 2.1979228025204026e-09, + "loss": 0.8302, + "step": 20442 + }, + { + "epoch": 0.9954471307184769, + "grad_norm": 2.3881006240844727, + "learning_rate": 2.1514080678963767e-09, + "loss": 0.7759, + "step": 20443 + }, + { + "epoch": 0.995495824507584, + "grad_norm": 2.2628164291381836, + "learning_rate": 2.105390781579342e-09, + "loss": 0.8298, + "step": 20444 + }, + { + "epoch": 0.9955445182966912, + "grad_norm": 2.2966954708099365, + "learning_rate": 2.0598709447150477e-09, + "loss": 0.736, + "step": 20445 + }, + { + "epoch": 0.9955932120857984, + "grad_norm": 1.4647809267044067, + "learning_rate": 2.0148485584359224e-09, + "loss": 0.8058, + "step": 20446 + }, + { + "epoch": 0.9956419058749056, + "grad_norm": 1.6175220012664795, + "learning_rate": 1.9703236238588495e-09, + "loss": 0.7933, + "step": 20447 + }, + { + "epoch": 0.9956905996640129, + "grad_norm": 1.7969624996185303, + "learning_rate": 1.926296142094053e-09, + "loss": 0.7539, + "step": 20448 + }, + { + "epoch": 0.9957392934531201, + "grad_norm": 1.6362786293029785, + "learning_rate": 1.882766114236212e-09, + "loss": 0.7492, + "step": 20449 + }, + { + "epoch": 0.9957879872422273, + "grad_norm": 1.4620037078857422, + "learning_rate": 1.839733541366684e-09, + "loss": 0.8938, + "step": 20450 + }, + { + "epoch": 0.9958366810313345, + "grad_norm": 1.375854730606079, + "learning_rate": 1.7971984245579444e-09, + "loss": 0.7457, + "step": 20451 + }, + { + "epoch": 0.9958853748204417, + "grad_norm": 1.1050828695297241, + "learning_rate": 1.755160764869146e-09, + "loss": 0.888, + "step": 20452 + }, + { + "epoch": 0.9959340686095488, + "grad_norm": 1.1974934339523315, + "learning_rate": 1.713620563341678e-09, + "loss": 0.8638, + "step": 20453 + }, + { + "epoch": 0.995982762398656, + "grad_norm": 2.336393356323242, + "learning_rate": 1.6725778210124888e-09, + "loss": 0.8019, + "step": 20454 + }, + { + "epoch": 0.9960314561877632, + "grad_norm": 1.3204333782196045, + "learning_rate": 1.632032538900763e-09, + "loss": 0.7662, + "step": 20455 + }, + { + "epoch": 0.9960801499768704, + "grad_norm": 1.2643461227416992, + "learning_rate": 1.5919847180145832e-09, + "loss": 0.8426, + "step": 20456 + }, + { + "epoch": 0.9961288437659777, + "grad_norm": 1.831673502922058, + "learning_rate": 1.55243435935315e-09, + "loss": 0.7807, + "step": 20457 + }, + { + "epoch": 0.9961775375550849, + "grad_norm": 2.2970526218414307, + "learning_rate": 1.51338146389568e-09, + "loss": 0.8512, + "step": 20458 + }, + { + "epoch": 0.9962262313441921, + "grad_norm": 1.4489104747772217, + "learning_rate": 1.4748260326191699e-09, + "loss": 0.7258, + "step": 20459 + }, + { + "epoch": 0.9962749251332993, + "grad_norm": 1.781006097793579, + "learning_rate": 1.4367680664784112e-09, + "loss": 0.7691, + "step": 20460 + }, + { + "epoch": 0.9963236189224064, + "grad_norm": 1.8371450901031494, + "learning_rate": 1.3992075664215343e-09, + "loss": 0.7564, + "step": 20461 + }, + { + "epoch": 0.9963723127115136, + "grad_norm": 1.5817526578903198, + "learning_rate": 1.362144533383347e-09, + "loss": 0.7617, + "step": 20462 + }, + { + "epoch": 0.9964210065006208, + "grad_norm": 2.80859112739563, + "learning_rate": 1.3255789682831143e-09, + "loss": 0.72, + "step": 20463 + }, + { + "epoch": 0.996469700289728, + "grad_norm": 1.7544599771499634, + "learning_rate": 1.2895108720356598e-09, + "loss": 0.9008, + "step": 20464 + }, + { + "epoch": 0.9965183940788352, + "grad_norm": 1.1769126653671265, + "learning_rate": 1.2539402455336025e-09, + "loss": 0.8905, + "step": 20465 + }, + { + "epoch": 0.9965670878679425, + "grad_norm": 1.334270715713501, + "learning_rate": 1.2188670896629008e-09, + "loss": 0.7804, + "step": 20466 + }, + { + "epoch": 0.9966157816570497, + "grad_norm": 1.3120884895324707, + "learning_rate": 1.1842914052961896e-09, + "loss": 0.7891, + "step": 20467 + }, + { + "epoch": 0.9966644754461569, + "grad_norm": 2.1369481086730957, + "learning_rate": 1.150213193292782e-09, + "loss": 0.8563, + "step": 20468 + }, + { + "epoch": 0.9967131692352641, + "grad_norm": 1.3725736141204834, + "learning_rate": 1.1166324545031082e-09, + "loss": 0.8206, + "step": 20469 + }, + { + "epoch": 0.9967618630243712, + "grad_norm": 3.520355463027954, + "learning_rate": 1.0835491897598361e-09, + "loss": 0.7546, + "step": 20470 + }, + { + "epoch": 0.9968105568134784, + "grad_norm": 1.3242409229278564, + "learning_rate": 1.0509633998867508e-09, + "loss": 0.7512, + "step": 20471 + }, + { + "epoch": 0.9968592506025856, + "grad_norm": 1.366606593132019, + "learning_rate": 1.0188750856943152e-09, + "loss": 0.7849, + "step": 20472 + }, + { + "epoch": 0.9969079443916928, + "grad_norm": 1.3459552526474, + "learning_rate": 9.872842479796695e-10, + "loss": 0.788, + "step": 20473 + }, + { + "epoch": 0.9969566381808, + "grad_norm": 1.8471828699111938, + "learning_rate": 9.56190887531072e-10, + "loss": 0.7175, + "step": 20474 + }, + { + "epoch": 0.9970053319699073, + "grad_norm": 1.5686882734298706, + "learning_rate": 9.255950051212381e-10, + "loss": 0.8578, + "step": 20475 + }, + { + "epoch": 0.9970540257590145, + "grad_norm": 1.3542686700820923, + "learning_rate": 8.954966015095601e-10, + "loss": 0.74, + "step": 20476 + }, + { + "epoch": 0.9971027195481217, + "grad_norm": 1.5920079946517944, + "learning_rate": 8.658956774465488e-10, + "loss": 0.805, + "step": 20477 + }, + { + "epoch": 0.9971514133372288, + "grad_norm": 0.09529022872447968, + "learning_rate": 8.367922336671718e-10, + "loss": 0.5615, + "step": 20478 + }, + { + "epoch": 0.997200107126336, + "grad_norm": 1.6502200365066528, + "learning_rate": 8.081862708975153e-10, + "loss": 0.8259, + "step": 20479 + }, + { + "epoch": 0.9972488009154432, + "grad_norm": 1.744956612586975, + "learning_rate": 7.800777898459012e-10, + "loss": 0.8007, + "step": 20480 + }, + { + "epoch": 0.9972974947045504, + "grad_norm": 1.7363755702972412, + "learning_rate": 7.524667912139905e-10, + "loss": 0.7823, + "step": 20481 + }, + { + "epoch": 0.9973461884936576, + "grad_norm": 1.86581289768219, + "learning_rate": 7.253532756879012e-10, + "loss": 0.7636, + "step": 20482 + }, + { + "epoch": 0.9973948822827648, + "grad_norm": 3.049661874771118, + "learning_rate": 6.987372439426488e-10, + "loss": 0.7776, + "step": 20483 + }, + { + "epoch": 0.9974435760718721, + "grad_norm": 1.2681320905685425, + "learning_rate": 6.726186966377057e-10, + "loss": 0.8839, + "step": 20484 + }, + { + "epoch": 0.9974922698609793, + "grad_norm": 1.2231049537658691, + "learning_rate": 6.469976344281037e-10, + "loss": 0.8382, + "step": 20485 + }, + { + "epoch": 0.9975409636500864, + "grad_norm": 1.8145203590393066, + "learning_rate": 6.218740579444493e-10, + "loss": 0.8179, + "step": 20486 + }, + { + "epoch": 0.9975896574391936, + "grad_norm": 4.743533611297607, + "learning_rate": 5.972479678173493e-10, + "loss": 0.7563, + "step": 20487 + }, + { + "epoch": 0.9976383512283008, + "grad_norm": 1.284605622291565, + "learning_rate": 5.731193646574262e-10, + "loss": 0.7757, + "step": 20488 + }, + { + "epoch": 0.997687045017408, + "grad_norm": 2.0966575145721436, + "learning_rate": 5.494882490642006e-10, + "loss": 0.7137, + "step": 20489 + }, + { + "epoch": 0.9977357388065152, + "grad_norm": 1.696340560913086, + "learning_rate": 5.263546216238702e-10, + "loss": 0.8241, + "step": 20490 + }, + { + "epoch": 0.9977844325956224, + "grad_norm": 1.484473705291748, + "learning_rate": 5.037184829159714e-10, + "loss": 0.7467, + "step": 20491 + }, + { + "epoch": 0.9978331263847297, + "grad_norm": 2.778918743133545, + "learning_rate": 4.81579833502277e-10, + "loss": 0.7715, + "step": 20492 + }, + { + "epoch": 0.9978818201738369, + "grad_norm": 1.5955588817596436, + "learning_rate": 4.5993867393123726e-10, + "loss": 0.7273, + "step": 20493 + }, + { + "epoch": 0.9979305139629441, + "grad_norm": 1.4246857166290283, + "learning_rate": 4.3879500474242056e-10, + "loss": 0.7816, + "step": 20494 + }, + { + "epoch": 0.9979792077520512, + "grad_norm": 2.7271666526794434, + "learning_rate": 4.18148826464293e-10, + "loss": 0.83, + "step": 20495 + }, + { + "epoch": 0.9980279015411584, + "grad_norm": 1.2788269519805908, + "learning_rate": 3.980001396075572e-10, + "loss": 0.838, + "step": 20496 + }, + { + "epoch": 0.9980765953302656, + "grad_norm": 1.425683617591858, + "learning_rate": 3.783489446718136e-10, + "loss": 0.7317, + "step": 20497 + }, + { + "epoch": 0.9981252891193728, + "grad_norm": 1.707741141319275, + "learning_rate": 3.5919524215000113e-10, + "loss": 0.8608, + "step": 20498 + }, + { + "epoch": 0.99817398290848, + "grad_norm": 2.7725226879119873, + "learning_rate": 3.4053903251729524e-10, + "loss": 0.8505, + "step": 20499 + }, + { + "epoch": 0.9982226766975872, + "grad_norm": 0.09095977246761322, + "learning_rate": 3.2238031623554876e-10, + "loss": 0.5201, + "step": 20500 + }, + { + "epoch": 0.9982713704866945, + "grad_norm": 0.1273185908794403, + "learning_rate": 3.0471909375773267e-10, + "loss": 0.6735, + "step": 20501 + }, + { + "epoch": 0.9983200642758017, + "grad_norm": 1.2251548767089844, + "learning_rate": 2.8755536552571574e-10, + "loss": 0.7856, + "step": 20502 + }, + { + "epoch": 0.9983687580649088, + "grad_norm": 1.5976675748825073, + "learning_rate": 2.708891319636031e-10, + "loss": 0.8238, + "step": 20503 + }, + { + "epoch": 0.998417451854016, + "grad_norm": 4.140344142913818, + "learning_rate": 2.5472039348439783e-10, + "loss": 0.8307, + "step": 20504 + }, + { + "epoch": 0.9984661456431232, + "grad_norm": 2.542854070663452, + "learning_rate": 2.390491504944414e-10, + "loss": 0.7759, + "step": 20505 + }, + { + "epoch": 0.9985148394322304, + "grad_norm": 1.5455398559570312, + "learning_rate": 2.2387540338231207e-10, + "loss": 0.9366, + "step": 20506 + }, + { + "epoch": 0.9985635332213376, + "grad_norm": 1.9902361631393433, + "learning_rate": 2.0919915252326506e-10, + "loss": 0.8331, + "step": 20507 + }, + { + "epoch": 0.9986122270104448, + "grad_norm": 1.474120020866394, + "learning_rate": 1.9502039828589448e-10, + "loss": 0.8842, + "step": 20508 + }, + { + "epoch": 0.998660920799552, + "grad_norm": 2.3226075172424316, + "learning_rate": 1.8133914101881035e-10, + "loss": 0.7914, + "step": 20509 + }, + { + "epoch": 0.9987096145886593, + "grad_norm": 1.4094780683517456, + "learning_rate": 1.681553810661818e-10, + "loss": 0.8791, + "step": 20510 + }, + { + "epoch": 0.9987583083777665, + "grad_norm": 2.6936018466949463, + "learning_rate": 1.5546911875219395e-10, + "loss": 0.742, + "step": 20511 + }, + { + "epoch": 0.9988070021668736, + "grad_norm": 1.8449478149414062, + "learning_rate": 1.4328035439659105e-10, + "loss": 0.7612, + "step": 20512 + }, + { + "epoch": 0.9988556959559808, + "grad_norm": 1.6226811408996582, + "learning_rate": 1.315890882991333e-10, + "loss": 0.8669, + "step": 20513 + }, + { + "epoch": 0.998904389745088, + "grad_norm": 4.360873699188232, + "learning_rate": 1.2039532075069916e-10, + "loss": 0.7637, + "step": 20514 + }, + { + "epoch": 0.9989530835341952, + "grad_norm": 2.186707019805908, + "learning_rate": 1.0969905203328523e-10, + "loss": 0.8205, + "step": 20515 + }, + { + "epoch": 0.9990017773233024, + "grad_norm": 2.738386869430542, + "learning_rate": 9.950028240890419e-11, + "loss": 0.8434, + "step": 20516 + }, + { + "epoch": 0.9990504711124096, + "grad_norm": 1.3402783870697021, + "learning_rate": 8.979901213512776e-11, + "loss": 0.8242, + "step": 20517 + }, + { + "epoch": 0.9990991649015168, + "grad_norm": 1.5917699337005615, + "learning_rate": 8.059524144954367e-11, + "loss": 0.8782, + "step": 20518 + }, + { + "epoch": 0.9991478586906241, + "grad_norm": 0.09736618399620056, + "learning_rate": 7.18889705830783e-11, + "loss": 0.6881, + "step": 20519 + }, + { + "epoch": 0.9991965524797312, + "grad_norm": 2.439640760421753, + "learning_rate": 6.368019975111494e-11, + "loss": 0.8515, + "step": 20520 + }, + { + "epoch": 0.9992452462688384, + "grad_norm": 1.3413864374160767, + "learning_rate": 5.596892915793461e-11, + "loss": 0.733, + "step": 20521 + }, + { + "epoch": 0.9992939400579456, + "grad_norm": 1.2610198259353638, + "learning_rate": 4.875515899893657e-11, + "loss": 0.7058, + "step": 20522 + }, + { + "epoch": 0.9993426338470528, + "grad_norm": 1.7922636270523071, + "learning_rate": 4.20388894473156e-11, + "loss": 0.8381, + "step": 20523 + }, + { + "epoch": 0.99939132763616, + "grad_norm": 1.4602751731872559, + "learning_rate": 3.58201206762665e-11, + "loss": 0.75, + "step": 20524 + }, + { + "epoch": 0.9994400214252672, + "grad_norm": 1.4417095184326172, + "learning_rate": 3.00988528367796e-11, + "loss": 0.7239, + "step": 20525 + }, + { + "epoch": 0.9994887152143744, + "grad_norm": 0.10340210795402527, + "learning_rate": 2.487508607096345e-11, + "loss": 0.6674, + "step": 20526 + }, + { + "epoch": 0.9995374090034816, + "grad_norm": 1.4207572937011719, + "learning_rate": 2.014882051204481e-11, + "loss": 0.7413, + "step": 20527 + }, + { + "epoch": 0.9995861027925889, + "grad_norm": 1.7379670143127441, + "learning_rate": 1.5920056273266426e-11, + "loss": 0.7532, + "step": 20528 + }, + { + "epoch": 0.999634796581696, + "grad_norm": 1.7162657976150513, + "learning_rate": 1.218879346343016e-11, + "loss": 0.7363, + "step": 20529 + }, + { + "epoch": 0.9996834903708032, + "grad_norm": 1.7339277267456055, + "learning_rate": 8.955032173574296e-12, + "loss": 0.7849, + "step": 20530 + }, + { + "epoch": 0.9997321841599104, + "grad_norm": 1.4073690176010132, + "learning_rate": 6.2187724836348936e-12, + "loss": 0.9097, + "step": 20531 + }, + { + "epoch": 0.9997808779490176, + "grad_norm": 1.7520190477371216, + "learning_rate": 3.980014464666226e-12, + "loss": 0.8732, + "step": 20532 + }, + { + "epoch": 0.9998295717381248, + "grad_norm": 1.5096455812454224, + "learning_rate": 2.2387581677385528e-12, + "loss": 0.8647, + "step": 20533 + }, + { + "epoch": 0.999878265527232, + "grad_norm": 1.666581153869629, + "learning_rate": 9.950036417016861e-13, + "loss": 0.8544, + "step": 20534 + }, + { + "epoch": 0.9999269593163392, + "grad_norm": 1.2763856649398804, + "learning_rate": 2.487509109805331e-13, + "loss": 0.869, + "step": 20535 + }, + { + "epoch": 0.9999756531054464, + "grad_norm": 1.5385262966156006, + "learning_rate": 0.0, + "loss": 0.7809, + "step": 20536 + }, + { + "epoch": 0.9999756531054464, + "step": 20536, + "total_flos": 5.0650379989689565e+19, + "train_loss": 0.8431105286515683, + "train_runtime": 205233.6339, + "train_samples_per_second": 12.808, + "train_steps_per_second": 0.1 + } + ], + "logging_steps": 1.0, + "max_steps": 20536, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 5.0650379989689565e+19, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}